2024 iThome 鐵人賽

Python Self-Study Series, Part 5

DAY 5: Extending the Crawler and Saving the Data

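Building on DAY 4's image scraping, this post extends the crawler: it walks a PTT board's article list across multiple pages, pulls image links and table data out of each article, and saves the results to CSV and JSON files.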
import requests
from bs4 import BeautifulSoup
import csv
import json

def get_articles(url, articles_data):
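    # fetch one index page of the board; collect each article's title, link, images, and tables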
    response = requests.get(url, cookies={'over18': '1'})  # the over18 cookie is required to pass PTT's age-confirmation page
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('div', class_='r-ent')
        for article in articles:
            title_tag = article.find('div', class_='title').find('a')
            if title_tag:
                title = title_tag.text.strip()
                link = title_tag['href']
                print(f'Title: {title}, Link: https://www.ptt.cc{link}')
                images, tables = get_article_content(f'https://www.ptt.cc{link}')
                articles_data.append({
                    'title': title,
                    'link': f'https://www.ptt.cc{link}',
                    'images': images,
                    'tables': tables
                })
        return soup
    else:
        print(f'Could not fetch {url}, status code: {response.status_code}')
        return None

def get_article_content(article_url):
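    # fetch a single article page and extract its image links and <table> data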
    response = requests.get(article_url, cookies={'over18': '1'})
    images = []
    tables = []
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # extract image URLs: PTT posts link images as plain <a> tags
        link_tags = soup.find_all('a', href=True)
        for link in link_tags:
            if link['href'].endswith(('.jpg', '.png', '.gif')):
                print(f'Image URL: {link["href"]}')
                images.append(link['href'])
        
        # extract table data from any <table> tags in the article
        table_tags = soup.find_all('table')
        for table in table_tags:
            rows = table.find_all('tr')
            table_data = []
            for row in rows:
                cols = row.find_all('td')
                cols = [ele.text.strip() for ele in cols]
                table_data.append(cols)
            tables.append(table_data)
    else:
        print(f'Could not fetch {article_url}, status code: {response.status_code}')
    return images, tables

def get_next_page(soup):
    paging_div = soup.find('div', class_='btn-group btn-group-paging')
    # button index 1 is the "‹ 上頁" (previous page) link; PTT's index.html is
    # the newest page, so crawling onward means following it to older pages
    next_page_link = paging_div.find_all('a')[1]['href']
    return next_page_link

def save_to_csv(data, filename):
    if not data:  # nothing was scraped, so there is nothing to write
        return
    keys = data[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

def save_to_json(data, filename):
    with open(filename, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file, ensure_ascii=False, indent=4)

base_url = 'https://www.ptt.cc'
board = '/bbs/Gossiping/index.html'
page_url = base_url + board

articles_data = []

# crawl the first two pages of the article list and process their contents
for _ in range(2):  # two pages as an example; adjust the range to crawl more
    soup = get_articles(page_url, articles_data)
    if soup:
        next_page = get_next_page(soup)
        page_url = base_url + next_page
    else:
        break
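# optional: pausing briefly between page requests (e.g. time.sleep(1) after
# importing time) would make the crawler gentler on PTT's servers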

# save the collected data to CSV and JSON files
save_to_csv(articles_data, 'ptt_articles.csv')
save_to_json(articles_data, 'ptt_articles.json')
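
For reference, each record appended to articles_data has this shape; the values below are illustrative placeholders rather than real scraped data:

example_record = {
    'title': 'example article title',
    'link': 'https://www.ptt.cc/bbs/Gossiping/M.0000000000.A.000.html',
    'images': ['https://i.imgur.com/xxxx.jpg'],        # image URLs found in the post
    'tables': [[['r1c1', 'r1c2'], ['r2c1', 'r2c2']]],  # one row-list per <table>
}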

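One caveat with save_to_csv: csv.DictWriter writes the nested images and tables lists as their Python repr strings, which are awkward to parse back later. If the CSV needs to stay machine-readable, one option is to serialize those fields to JSON strings first. A minimal sketch under that assumption (save_to_csv_flat is a hypothetical helper, not part of the script above):

import csv
import json

def save_to_csv_flat(data, filename):
    # store the nested lists as JSON strings so each CSV cell stays parseable
    if not data:
        return
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=data[0].keys())
        writer.writeheader()
        for row in data:
            writer.writerow({
                'title': row['title'],
                'link': row['link'],
                'images': json.dumps(row['images'], ensure_ascii=False),
                'tables': json.dumps(row['tables'], ensure_ascii=False),
            })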

Previous
DAY 4: Scraping Images from Articles
Next
DAY 6: Introducing SQLite and Storing the Data in an SQLite Database
Series
Python Self-Study 30