因為找不到合適的中文數據集,改成一樣使用 IMDb 電影評論做接續,下面我們以爬蟲取得電影評論,供後續使用。
我們來逐步解釋它的運作:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import csv
import re
初始化參數:
- `url`: 這是 IMDb 電影評論頁面的網址,我們要在這個頁面上進行評論的爬取。
- `csv_filename`: 我們指定了一個 CSV 檔案的名稱,這個檔案將用於儲存我們爬取到的評論數據。
- `max_reviews`: 我們設定最多要爬取的評論數量,預設是 100。

爬取評論數據:
- 呼叫 `scraper.scrape_reviews()` 方法開始爬取評論數據。
- 向評論頁面的網址(`url`)發送請求並解析頁面內容。
- 開啟 CSV 檔案(`csv_filename`)來儲存我們要爬取的數據,並設定 CSV 檔案的列標題。
- 若已爬取的評論數量達到上限(`max_reviews`),則停止爬取。

處理分頁:
- 若頁面上還有「載入更多」的資料,就利用其中的分頁金鑰繼續請求下一頁的評論。
最終將評論數據爬取並存儲在一個 CSV 檔案中,以便後續使用這些數據。
class IMDbReviewsScraper:
    """Scrape user reviews from an IMDb reviews page into a CSV file.

    The first request goes to ``url``. Further pages are fetched through the
    "load more" AJAX endpoint advertised by the page (``data-ajaxurl``),
    using the pagination key (``data-key``) found on each response.

    Args:
        url: Address of the IMDb reviews page to start from.
        csv_filename: Path of the CSV file to (over)write.
        max_reviews: Maximum number of review rows to write.
    """

    # Header row written at the top of the CSV file.
    CSV_HEADER = ["Title", "Author", "Date", "Up Vote", "Total Vote", "Rating", "Review"]

    # Seconds to wait for a response; without a timeout a stalled
    # connection would block the scraper forever.
    REQUEST_TIMEOUT = 30

    # A run of digits that may contain thousands separators, e.g. "1,234".
    _VOTE_NUMBER_RE = re.compile(r"\d[\d,]*")

    def __init__(self, url, csv_filename, max_reviews=100):
        self.base_url = "https://www.imdb.com/"
        self.key = ""
        self.url = url
        self.csv_filename = csv_filename
        self.max_reviews = max_reviews

    def scrape_reviews(self) -> None:
        """Fetch reviews page by page and write them to ``self.csv_filename``.

        Stops when ``self.max_reviews`` rows have been written or when a
        response carries no further pagination key. Prints each requested URL
        and, at the end, the number of rows saved.
        """
        saved = 0
        with open(self.csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(self.CSV_HEADER)

            page_url = self.url
            # Built once from the first page that advertises "load more".
            # Kept local so repeated calls never accumulate onto base_url.
            ajax_endpoint = None

            while page_url is not None and saved < self.max_reviews:
                print("url = ", page_url)
                soup = self._fetch_page(page_url)

                # Iterate individual review entries (not the enclosing
                # ".lister-list" container) on every page, the first included.
                for item in soup.select(".lister-item-content"):
                    csv_writer.writerow(self._parse_review(item))
                    saved += 1
                    if saved >= self.max_reviews:
                        break

                page_url = None
                load_more = soup.select(".load-more-data")
                if load_more:
                    marker = load_more[0]
                    if ajax_endpoint is None and marker.get('data-ajaxurl'):
                        # urljoin avoids the doubled slash produced by plain
                        # string concatenation with a root-relative path.
                        ajax_endpoint = urljoin(self.base_url, marker['data-ajaxurl'])
                    next_key = marker.get('data-key')
                    if ajax_endpoint is not None and next_key:
                        self.key = next_key
                        page_url = ajax_endpoint + "?ref_=undefined&paginationKey=" + self.key

        print(f'{saved} reviews saved to {self.csv_filename}.')

    def _fetch_page(self, page_url):
        """Download ``page_url`` and return it parsed with the lxml parser."""
        res = requests.get(page_url, timeout=self.REQUEST_TIMEOUT)
        res.encoding = 'utf-8'
        return BeautifulSoup(res.text, "lxml")

    @classmethod
    def _parse_review(cls, item):
        """Turn one review element into a CSV row matching ``CSV_HEADER``."""
        upvote, totalvote = cls._parse_votes(cls._first_text(item, ".text-muted"))
        rating_spans = item.select("span.rating-other-user-rating > span")
        # Only a two-span match (the score followed by another span) is
        # treated as a rating; anything else is recorded as "no rating".
        rating = rating_spans[0].text if len(rating_spans) == 2 else ""
        return [
            cls._first_text(item, ".title"),
            cls._first_text(item, ".display-name-link"),
            cls._first_text(item, ".review-date"),
            upvote,
            totalvote,
            rating,
            cls._first_text(item, ".text"),
        ]

    @staticmethod
    def _first_text(item, selector):
        """Return the text of the first match for ``selector``, or ``""``."""
        found = item.select(selector)
        return found[0].text if found else ""

    @classmethod
    def _parse_votes(cls, votetext):
        """Extract the first two numbers from ``votetext`` as digit strings.

        Thousands separators are removed, so "1,234 out of 2,000" yields
        ``("1234", "2000")``. Missing numbers are returned as ``""``.
        """
        numbers = [n.replace(",", "") for n in cls._VOTE_NUMBER_RE.findall(votetext)]
        numbers += [""] * (2 - len(numbers))
        return numbers[0], numbers[1]
# Run the scraper: collect up to `max_reviews` reviews from the IMDb page at
# `url` and store them in `csv_filename`.
url = "https://www.imdb.com/title/tt0120731/reviews?ref_=tt_ql_3"
csv_filename = "movie_reviews.csv"
max_reviews = 100
scraper = IMDbReviewsScraper(
    url=url,
    csv_filename=csv_filename,
    max_reviews=max_reviews,
)
scraper.scrape_reviews()