原理:通过执行 JavaScript 获取页面的可滚动高度,先抓取当前已加载范围内的数据,再滚动到页面底部以触发加载更多内容;如此循环,直到可滚动高度不再变化或达到最大滚动次数为止。
这里获取可滚动高度用的是 document.documentElement.scrollHeight,而不是 document.body.scrollHeight:使用 body 时,这个返回值一直是 0。
代码如下:
# Scrape video links from a YouTube search-results page that loads more
# results as the page is scrolled: collect what is currently loaded, scroll to
# the bottom, and repeat until the scrollable height stops growing or the
# scroll limit is reached.
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

SEARCH_URL = "https://www.youtube.com/results?search_query=python"
# documentElement is used instead of body for the scrollable height.
SCROLL_HEIGHT_JS = "return document.documentElement.scrollHeight"
SCROLL_TO_BOTTOM_JS = "window.scrollTo(0, document.documentElement.scrollHeight);"
MAX_SCROLLS = 10  # upper bound on scroll rounds
SCROLL_PAUSE_SECONDS = 0.5  # pause after each scroll so new results can load
ELEMENT_WAIT_SECONDS = 5  # how long to wait for result elements to appear


def _collect_links(driver, links):
    """Record the href of every element with id ``video-title``.

    Args:
        driver: The Selenium WebDriver showing the results page.
        links: Dict used as an insertion-ordered set; each new href is added
            as a key, so links seen in earlier rounds are not duplicated.
    """
    try:
        # find_elements returns an empty (falsy) list when nothing matches,
        # so until() keeps polling until elements appear or the wait expires.
        elements = WebDriverWait(driver, timeout=ELEMENT_WAIT_SECONDS).until(
            lambda d: d.find_elements(by=By.ID, value="video-title")
        )
    except TimeoutException as exc:
        # Best effort: report the timeout and carry on with what we have.
        print(exc)
        return

    for element in elements:
        href = element.get_attribute("href")
        # get_attribute returns None when the element has no href; skip those.
        if href:
            links[href] = None


def main():
    """Open the search page, scroll to load results, and print unique links."""
    chrome_options = Options()
    # NOTE(review): Chrome switches are normally spelled like
    # "--disable-infobars"; the original string is kept unchanged here --
    # confirm whether it has any effect.
    chrome_options.add_argument('disable_infobars')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.set_window_position(0, 0)
        driver.set_window_size(1024, 768)
        driver.get(SEARCH_URL)

        last_height = driver.execute_script(SCROLL_HEIGHT_JS)
        print("scrollHeight0=" + str(last_height))

        links = {}
        # Exactly MAX_SCROLLS rounds (the previous counter allowed one extra).
        for scroll_count in range(1, MAX_SCROLLS + 1):
            _collect_links(driver, links)

            driver.execute_script(SCROLL_TO_BOTTOM_JS)
            time.sleep(SCROLL_PAUSE_SECONDS)

            new_height = driver.execute_script(SCROLL_HEIGHT_JS)
            print("scrollHeight1=" + str(new_height))
            if new_height == last_height:
                # Height did not grow: nothing new was loaded, so stop.
                break
            last_height = new_height
            print("finish " + str(scroll_count) + " time")
        else:
            # The scroll limit ended the loop right after a scroll that loaded
            # new content; collect once more so that content is not lost.
            _collect_links(driver, links)

        unique_links = list(links)
        print(len(unique_links))
        print(unique_links)
    finally:
        # quit() ends the whole browser session (close() only closes the
        # current window), and runs even if scraping raised.
        driver.quit()


if __name__ == "__main__":
    main()