import requests
from bs4 import BeautifulSoup
def get_articles(url):
    """Fetch one board index page, print its articles, and return the parsed page.

    Each ``div.r-ent`` entry on the page is printed as a title plus an
    absolute link (or ``No link`` when the entry carries no anchor).

    Args:
        url: Absolute URL of the board index page to download.

    Returns:
        The ``BeautifulSoup`` document for the page, or ``None`` when the
        request fails or the server answers with a status other than 200.
    """
    try:
        # The over18 cookie is needed to get past the age-confirmation gate.
        # The timeout keeps a stalled connection from hanging the crawl.
        response = requests.get(url, cookies={'over18': '1'}, timeout=10)
    except requests.RequestException as exc:
        print(f'無法訪問 {url}, 錯誤: {exc}')
        return None

    if response.status_code != 200:
        print(f'無法訪問 {url}, 狀態碼: {response.status_code}')
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    for article in soup.find_all('div', class_='r-ent'):
        title_div = article.find('div', class_='title')
        if title_div is None:
            # Nothing to report for an entry without a title block.
            continue
        title = title_div.text.strip()
        # Look for the anchor inside the title only, so an unrelated link
        # elsewhere in the entry is never mistaken for the article link.
        anchor = title_div.find('a')
        href = anchor.get('href') if anchor is not None else None
        # Only prefix the host when there is a real path; otherwise the
        # placeholder would be glued onto the host name.
        link = f'https://www.ptt.cc{href}' if href else 'No link'
        print(f'標題: {title}, 連結: {link}')
    return soup
def get_next_page(soup):
    """Return the href of the next page to crawl, taken from the paging bar.

    The link used is the second anchor inside the
    ``div.btn-group.btn-group-paging`` element. The original comment
    describes it as the "previous page" link, so despite the function name
    the crawl moves to the previous index page.

    Args:
        soup: Parsed page exposing ``find``; normally the value returned by
            ``get_articles``.

    Returns:
        The site-relative href string, or ``None`` when the paging bar is
        missing, has fewer than two anchors, or the anchor has no ``href``.
    """
    paging_div = soup.find('div', class_='btn-group btn-group-paging')
    if paging_div is None:
        return None
    paging_links = paging_div.find_all('a')
    if len(paging_links) < 2:
        return None
    # .get() instead of ['href']: an anchor without an href yields None
    # rather than raising KeyError.
    return paging_links[1].get('href')
base_url = 'https://www.ptt.cc'
board = '/bbs/Gossiping/index.html'
page_url = base_url + board
# Crawl the article lists of two consecutive index pages, starting from the
# board's index.html and following the paging link after each page.
for _ in range(2):
    soup = get_articles(page_url)
    if not soup:
        # The page could not be fetched; there is nothing to follow.
        break
    try:
        next_page = get_next_page(soup)
    except (AttributeError, IndexError, KeyError):
        # Paging bar missing or its link has no href: treat as "no more pages".
        next_page = None
    if not next_page:
        break
    page_url = base_url + next_page