今天的影片內容為爬取Google Play網頁版應用程式的評論
並將前幾天爬取AJAX網頁(Dcard)的程式碼稍微進行改良
此外也引入了重要的try-except陳述句,讓程式即使跳出錯誤也能夠繼續執行下去
Check it out!
以下為影片中有使用到的程式碼
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import bs4, time
# Scrape reviewer nicknames and star ratings from a Google Play review page.
#
# The script presses PAGE_DOWN a user-chosen number of times. After each
# press it re-parses the whole DOM and prints every review whose nickname
# has not been printed before.
#
# NOTE(review): `executable_path=` and `find_element_by_tag_name` are
# Selenium 3-era APIs; verify they exist in the installed Selenium version.
page = int(input("請輸入頁面向下捲動次數:"))
dirverPath = 'C:\\spider\\chromedriver.exe'
browser = webdriver.Chrome(executable_path=dirverPath)
url = 'https://play.google.com/store/apps/details?id=com.facebook.katana&hl=zh_TW&gl=US&showAllReviews=true'
number = 0            # running index of the reviews printed so far
counter = 0           # number of PAGE_DOWN presses done so far
post_title = []       # nicknames in first-seen order (printed at the end)
seen_titles = set()   # the same nicknames, for O(1) duplicate checks
try:
    browser.get(url)
    while page > counter:
        move = browser.find_element_by_tag_name('body')
        move.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.25)  # pause before reading the page source
        objsoup = bs4.BeautifulSoup(browser.page_source, 'lxml')
        articles = objsoup.find_all('div', jscontroller='H6eOGe')
        for article in articles:
            title = article.find('span', class_='X43Kjb')  # reviewer nickname
            rank = article.find('div', role='img')         # star rating
            if title is None or rank is None:
                # Card without a nickname or rating element: skip it instead
                # of crashing. It is not recorded, so a later pass can still
                # pick it up.
                continue
            nickname = title.text
            if nickname in seen_titles:
                # Reviews are de-duplicated by nickname only, so two
                # different reviewers sharing a nickname are reported once.
                continue
            number += 1
            seen_titles.add(nickname)
            post_title.append(nickname)
            print("留言編號:", number)
            print("留言暱稱:", nickname)
            print("評價:", rank.get('aria-label'))  # rating is the aria-label attribute
            print("=" * 100)
        counter += 1
finally:
    # Always shut the browser and its chromedriver process down, even when
    # scraping fails part-way.
    browser.quit()
print(post_title)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import bs4, time
def click_text(obj):
    """Click the "顯示更多內容" ("show more") span if one can be found.

    This is a best-effort helper: when the span is missing or cannot be
    clicked, the failure is swallowed so the scraping loop keeps running.
    Only ``Exception`` subclasses are swallowed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so the script can still be stopped.

    The XPath starts with ``//``, so the lookup covers the whole document
    rather than only the subtree below ``obj``.

    Args:
        obj: Object exposing ``find_element_by_xpath`` (the callers in this
            file pass the page's ``body`` element).

    Returns:
        True if an element was found and clicked, False otherwise.
    """
    try:
        obj.find_element_by_xpath("//span[contains(text(),'顯示更多內容')]").click()
    except Exception:
        return False
    return True
# Scrape reviewer nicknames and star ratings from a Google Play review page,
# calling click_text() after every scroll step.
#
# The script presses PAGE_DOWN a user-chosen number of times. After each
# press it re-parses the whole DOM and prints every review whose nickname
# has not been printed before.
#
# NOTE(review): `executable_path=` and `find_element_by_tag_name` are
# Selenium 3-era APIs; verify they exist in the installed Selenium version.
page = int(input("請輸入頁面向下捲動次數:"))
dirverPath = 'C:\\spider\\chromedriver.exe'
browser = webdriver.Chrome(executable_path=dirverPath)
url = 'https://play.google.com/store/apps/details?id=com.facebook.katana&hl=zh_TW&gl=US&showAllReviews=true'
number = 0            # running index of the reviews printed so far
counter = 0           # number of PAGE_DOWN presses done so far
post_title = []       # nicknames in first-seen order (printed at the end)
seen_titles = set()   # the same nicknames, for O(1) duplicate checks
try:
    browser.get(url)
    while page > counter:
        move = browser.find_element_by_tag_name('body')
        move.send_keys(Keys.PAGE_DOWN)
        click_text(move)  # best-effort click; failures are ignored
        time.sleep(0.5)   # pause before reading the page source
        objsoup = bs4.BeautifulSoup(browser.page_source, 'lxml')
        articles = objsoup.find_all('div', jscontroller='H6eOGe')
        for article in articles:
            title = article.find('span', class_='X43Kjb')  # reviewer nickname
            rank = article.find('div', role='img')         # star rating
            if title is None or rank is None:
                # Card without a nickname or rating element: skip it instead
                # of crashing. It is not recorded, so a later pass can still
                # pick it up.
                continue
            nickname = title.text
            if nickname in seen_titles:
                # Reviews are de-duplicated by nickname only, so two
                # different reviewers sharing a nickname are reported once.
                continue
            number += 1
            seen_titles.add(nickname)
            post_title.append(nickname)
            print("留言編號:", number)
            print("留言暱稱:", nickname)
            print("評價:", rank.get('aria-label'))  # rating is the aria-label attribute
            print("=" * 100)
        counter += 1
finally:
    # Always shut the browser and its chromedriver process down, even when
    # scraping fails part-way.
    browser.quit()
print(post_title)
# Demonstrate an unhandled error: adding an int and a str raises TypeError,
# so the script stops on the next line and the final print is never reached.
print(5 + "Hello")
print("列印成功!")
# Same failing statement as the previous demo, now guarded by try/except so
# that execution continues past the error.
try:
    print(5 + "Hello")  # attempted statement: int + str raises TypeError
except TypeError:
    # Runs only when the statement above fails. Catching TypeError (rather
    # than a bare except) keeps unrelated errors and Ctrl+C visible.
    # NOTE(review): the message reads "syntax error", but the failure is a
    # runtime TypeError; the string is kept as shown in the video.
    print("語法錯誤!")
print("列印成功!")
================================================分隔線(6/15更新)
最近Google Play的網頁有進行更新,以下為更新後的程式碼~
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import bs4, time
def click_text(obj):
    """Click the "查看所有評論" ("see all reviews") span if one can be found.

    This is a best-effort helper: when the span is missing or cannot be
    clicked, the failure is swallowed so the script keeps running. Only
    ``Exception`` subclasses are swallowed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so the script can still be stopped.

    The XPath starts with ``//``, so the lookup covers the whole document
    rather than only the subtree below ``obj``.

    Args:
        obj: Object exposing ``find_element_by_xpath`` (the caller in this
            file passes the page's ``body`` element).

    Returns:
        True if an element was found and clicked, False otherwise.
    """
    try:
        obj.find_element_by_xpath("//span[contains(text(),'查看所有評論')]").click()
    except Exception:
        return False
    return True
def click_yes(obj):
    """Click the "是" ("Yes") button inside the reviews pop-up window.

    Added after the Google Play page update. Author's explanation: once the
    "查看所有評論" link has been clicked a window pops up, and PAGE_DOWN or
    other keyboard commands have no effect until something inside that
    window has been clicked. The author chose the "是" button next to
    "這則評論對你有幫助嗎?" ("Was this review helpful?"); after that click
    the keyboard commands work. The site may ask you to log in along the
    way, which does not prevent scraping the page content.

    This is a best-effort helper: when the button is missing or cannot be
    clicked, the failure is swallowed so the script keeps running. Only
    ``Exception`` subclasses are swallowed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so the script can still be stopped.

    The XPath is an absolute path from ``<body>``, so it depends on the
    exact page layout and ignores which element ``obj`` is.

    Args:
        obj: Object exposing ``find_element_by_xpath`` (the caller in this
            file passes the page's ``body`` element).

    Returns:
        True if the button was found and clicked, False otherwise.
    """
    try:
        obj.find_element_by_xpath("//body/div[4]/div[2]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/footer[1]/div[2]/div[1]/div[1]/div[2]/span[2]").click()
    except Exception:
        return False
    return True
# Updated scraper for the redesigned Google Play page: open the reviews
# pop-up, click inside it, then press PAGE_DOWN a user-chosen number of
# times. After each press the whole DOM is re-parsed and every review whose
# nickname has not been printed before is printed.
#
# NOTE(review): `executable_path=` and `find_element_by_tag_name` are
# Selenium 3-era APIs; verify they exist in the installed Selenium version.
page = int(input("請輸入頁面向下捲動次數:"))
dirverPath = 'C:\\spider\\chromedriver.exe'
browser = webdriver.Chrome(executable_path=dirverPath)
url = 'https://play.google.com/store/apps/details?id=com.facebook.katana&hl=zh_TW&gl=US&showAllReviews=true'
number = 0            # running index of the reviews printed so far
counter = 0           # number of PAGE_DOWN presses done so far
post_title = []       # nicknames in first-seen order (printed at the end)
seen_titles = set()   # the same nicknames, for O(1) duplicate checks
try:
    browser.get(url)
    move = browser.find_element_by_tag_name('body')
    click_text(move)
    # Author's note: keep this pause at 1 second or more, otherwise the
    # click on "是" below runs before the pop-up window has opened.
    time.sleep(1)
    click_yes(move)
    time.sleep(1)
    while page > counter:
        # NOTE(review): unlike the earlier versions of this script there is
        # no pause between the key press and reading the page source;
        # confirm that is intended.
        move.send_keys(Keys.PAGE_DOWN)
        objsoup = bs4.BeautifulSoup(browser.page_source, 'lxml')
        articles = objsoup.find_all('div', class_='RHo1pe')  # class changed with the page update
        for article in articles:
            title = article.find('div', class_='X5PpBb')  # reviewer nickname; class also changed
            rank = article.find('div', role='img')        # star rating
            if title is None or rank is None:
                # Card without a nickname or rating element: skip it instead
                # of crashing. It is not recorded, so a later pass can still
                # pick it up.
                continue
            nickname = title.text
            if nickname in seen_titles:
                # Reviews are de-duplicated by nickname only, so two
                # different reviewers sharing a nickname are reported once.
                continue
            number += 1
            seen_titles.add(nickname)
            post_title.append(nickname)
            print("留言編號:", number)
            print("留言暱稱:", nickname)
            print("評價:", rank.get('aria-label'))  # rating is the aria-label attribute
            print("=" * 100)
        counter += 1
finally:
    # Always shut the browser and its chromedriver process down, even when
    # scraping fails part-way.
    browser.quit()
print(post_title)
本篇影片及程式碼僅提供研究使用,請勿大量惡意地爬取資料造成對方網頁的負擔呦!
如果在影片中有說得不太清楚或錯誤的地方,歡迎留言告訴我,謝謝您的指教。