import requests
from bs4 import BeautifulSoup
import csv
import json
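
# Scraper for PTT (https://www.ptt.cc) board index pages: walks the article
# list of a board, extracts each article's title, link, image URLs, and
# tables, then saves the results to CSV and JSON files.
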
def get_articles(url, articles_data):
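    """Scrape one PTT index page: collect each article's title, link,
    images, and tables into articles_data, and return the parsed page
    so the caller can locate the pagination links."""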
    response = requests.get(url, cookies={'over18': '1'})  # the over18 cookie passes PTT's age-confirmation gate
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('div', class_='r-ent')
        for article in articles:
            title_tag = article.find('div', class_='title').find('a')
            if title_tag:  # deleted posts have a title div with no <a> tag
                title = title_tag.text.strip()
                link = title_tag['href']
                print(f'Title: {title}, Link: https://www.ptt.cc{link}')
                images, tables = get_article_content(f'https://www.ptt.cc{link}')
                articles_data.append({
                    'title': title,
                    'link': f'https://www.ptt.cc{link}',
                    'images': images,
                    'tables': tables
                })
        return soup
    else:
        print(f'Failed to fetch {url}, status code: {response.status_code}')
        return None

def get_article_content(article_url):
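    """Fetch a single article page and return (image URLs, table data)."""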
    response = requests.get(article_url, cookies={'over18': '1'})
    images = []
    tables = []
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract image URLs (PTT articles embed images as plain links)
        link_tags = soup.find_all('a', href=True)
        for tag in link_tags:
            if tag['href'].endswith(('.jpg', '.png', '.gif')):
                print(f'Image URL: {tag["href"]}')
                images.append(tag['href'])
        # Extract table data row by row
        for table in soup.find_all('table'):
            table_data = []
            for row in table.find_all('tr'):
                cols = [cell.text.strip() for cell in row.find_all('td')]
                table_data.append(cols)
            tables.append(table_data)
    else:
        print(f'Failed to fetch {article_url}, status code: {response.status_code}')
    return images, tables

def get_next_page(soup):
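    """Return the relative URL of the next index page to crawl."""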
    paging_div = soup.find('div', class_='btn-group btn-group-paging')
    # PTT index numbers grow toward the newest page, so the second button
    # ('‹ 上頁', i.e. previous page) links to the older articles
    next_page_link = paging_div.find_all('a')[1]['href']
    return next_page_link

def save_to_csv(data, filename):
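    """Write the scraped records to a CSV file, one row per article."""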
    if not data:  # avoid an IndexError when no articles were collected
        return
    keys = data[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)  # list fields (images, tables) are written as their Python repr

def save_to_json(data, filename):
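    """Write the scraped records to a JSON file, preserving non-ASCII text."""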
    with open(filename, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file, ensure_ascii=False, indent=4)

base_url = 'https://www.ptt.cc'
board = '/bbs/Gossiping/index.html'
page_url = base_url + board
articles_data = []
# Crawl the article lists of the first two pages and process their content
for _ in range(2):  # assume we want to crawl two pages
    soup = get_articles(page_url, articles_data)
    if soup:
        next_page = get_next_page(soup)
        page_url = base_url + next_page
    else:
        break

# Save the data to CSV and JSON files
save_to_csv(articles_data, 'ptt_articles.csv')
save_to_json(articles_data, 'ptt_articles.json')