Hi everyone, I'm new to this and have recently been learning web scraping.
My current goal is:
to scrape articles from PTT (the NBA board) and extract the push count, date, author, and title.
Here is my code:
import requests
from bs4 import BeautifulSoup

url = "https://www.ptt.cc/bbs/NBA/index.html"

def get_NBA(url):
    r = requests.get(url)
    sp = BeautifulSoup(r.text, "html.parser")
    rs = sp.select('.r-ent')  # one .r-ent block per article
    for item in rs:
        print('Date:', item.select('.date')[0].text)
        print('Pushes:', item.select('.nrec')[0].text)
        print('Author:', item.select('.author')[0].text)
        print('Title:', item.select('.title')[0].text)
        print('==============================================')

for page in range(0, 4):
    r = requests.get(url)
    sp = BeautifulSoup(r.text, "html.parser")
    btn = sp.select('div.btn-group > a')
    up_page_href = btn[3]['href']  # btn[3] is the '‹ 上頁' (previous page) link
    next_page_url = 'https://www.ptt.cc' + up_page_href
    url = next_page_url
    get_NBA(url=url)
Next, I'd like to write the results to a text (txt) file. I've searched and tried several approaches, but they all failed. Could I ask how to solve this?
Thanks in advance, everyone!
There is an existing PTT crawler that can output JSON directly:
https://github.com/jwlin/ptt-web-crawler
If your goal is to learn, though, you should try to work out the output code yourself.
https://ithelp.ithome.com.tw/articles/10161708
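If you do want to write the JSON output yourself, here's a minimal sketch of the idea (the function name, file name, and field names are my own placeholders, not part of ptt-web-crawler):

import json
import requests
from bs4 import BeautifulSoup

def dump_page_to_json(url, path='nba.json'):
    # scrape one index page and dump the four fields as a JSON list
    r = requests.get(url)
    sp = BeautifulSoup(r.text, 'html.parser')
    records = []
    for item in sp.select('.r-ent'):
        records.append({
            'date': item.select('.date')[0].text,
            'nrec': item.select('.nrec')[0].text,
            'author': item.select('.author')[0].text,
            'title': item.select('.title')[0].text.strip(),
        })
    # ensure_ascii=False keeps Chinese titles human-readable in the file
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

dump_page_to_json("https://www.ptt.cc/bbs/NBA/index.html")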
Roughly like this:
import requests
from bs4 import BeautifulSoup

url = "https://www.ptt.cc/bbs/NBA/index.html"

def get_NBA(url):
    r = requests.get(url)
    sp = BeautifulSoup(r.text, "html.parser")
    rs = sp.select('.r-ent')
    # "with" closes the file automatically
    # "a+" opens for reading and appending: the file is created if it
    # doesn't exist, and writes go to the end if it does
    with open('text.txt', 'a+', encoding='utf-8') as f:  # encoding avoids UnicodeEncodeError on Windows
        for item in rs:
            f.write(item.select('.date')[0].text)
            f.write(item.select('.nrec')[0].text)
            f.write(item.select('.author')[0].text)
            f.write(item.select('.title')[0].text.strip())
            f.write('\n')  # separate articles, otherwise everything runs together

for page in range(0, 4):
    r = requests.get(url)
    sp = BeautifulSoup(r.text, "html.parser")
    btn = sp.select('div.btn-group > a')
    up_page_href = btn[3]['href']  # '‹ 上頁' (previous page) link
    next_page_url = 'https://www.ptt.cc' + up_page_href
    url = next_page_url
    get_NBA(url=url)
For the details of open() and with, ask the almighty Google.
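For quick reference, two common patterns (the file name is arbitrary):

# 'w' truncates/creates the file each run; 'a' appends to it;
# pass encoding='utf-8' so Chinese text is written correctly
with open('out.txt', 'w', encoding='utf-8') as f:
    f.write('first line\n')

with open('out.txt', 'a', encoding='utf-8') as f:
    f.write('appended on a later run\n')
# in both cases the file is closed automatically when the with-block exits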
I also happen to be writing a PTT crawler lately.
Here's my code, for mutual reference.
The code below grabs the latest ten pages of a board's index.
import re
import time
import sys
import requests
import pandas as pd
from bs4 import BeautifulSoup

class GetPttdata:
    header = {
        'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0'
    }

    def __init__(self, input_url):
        self.url = input_url
        # derive the board name from the input URL instead of hard-coding one board
        self.board = re.findall(r'/bbs/(.+?)/index', input_url)[0]
        # instance attributes: class-level lists would be shared across instances
        self.href_list = list()
        self.author_list = list()
        self.date_list = list()
        self.title_list = list()
        self.url_list = list()
        html = requests.get(self.url, headers=self.header)
        self.soup = BeautifulSoup(html.text, 'lxml')

    def get_index(self):
        # the '‹ 上頁' (previous page) button is the only paging link whose
        # href contains a page number, e.g. /bbs/NBA/index6523.html
        atags = self.soup.find_all('a', {'class': 'btn wide'})
        for atag in atags:
            pat = re.findall(r'\d{2,}', atag.get('href'))
            if pat:
                return pat

    def get_soup(self):
        soups = list()
        b = self.get_index()
        try:
            # b[0] + 1 is the newest page number; walk back ten pages
            for page in range(int(b[0]) + 1, int(b[0]) - 9, -1):
                html = requests.get(
                    'https://www.ptt.cc/bbs/' + self.board + '/index' + str(page) + '.html',
                    headers=self.header
                )
                if html.status_code == 200:
                    sp = BeautifulSoup(html.text, 'lxml')
                    soups.append(sp)
                    time.sleep(3)  # pause between requests to be polite to the server
                else:
                    print('Page could not be read!')
                    html.raise_for_status()
            return soups
        except Exception as e:
            print(e)
            sys.exit()

    def get_data(self):
        for sp in self.get_soup():
            title_data = sp.select('.title')
            author_data = sp.select('.author')
            date_data = sp.find_all('div', {'class': 'date'})
            self.author_list += [_.text for _ in author_data]
            self.date_list += [_.text for _ in date_data]
            for _ in title_data:
                pat = re.sub(r'\s', '', _.text)  # strip all whitespace from the title
                self.title_list.append(pat)
            self.href_list = [_.find('a') for _ in title_data]
            for url in self.href_list:
                if url is None:  # deleted articles have no <a> tag in the title block
                    self.url_list.append(0)
                else:
                    href = url.get('href')
                    self.url_list.append('https://www.ptt.cc' + href)

    def to_exc(self):
        datas = [
            self.title_list,
            self.author_list,
            self.date_list,
            self.url_list
        ]
        dexs = ['title', 'author', 'date', 'href']
        df = pd.DataFrame(datas, index=dexs)
        df_chr = df.stack()  # long (field, article-number) layout
        df_chr.to_excel('pttdata.xlsx')
print('Enter the URL of the PTT board you want to scrape!')
print('URL format: https://www.ptt.cc/bbs/<board name>/index.html')
url = input('URL: ')
pat = re.compile(r'https://www\.ptt\.cc/bbs/.+/index\.html')
pat_str = pat.findall(url)
if pat_str:
    print('Parsing URL......')
    pttobj = GetPttdata(url)
    print('Fetching and organizing data......')
    pttobj.get_data()  # get_data() calls get_soup()/get_index() itself, so no separate calls are needed
    print('Exporting data......')
    pttobj.to_exc()
    print("Data has been exported to spreadsheet 'pttdata.xlsx'")
else:
    print('Invalid URL format')
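A side note on to_exc(): stack() writes a long (field, article-number) Series to the spreadsheet. If you would rather have one row per article, here is a minimal alternative sketch (my own variation, assuming the four lists have equal length, which holds here because every .r-ent block carries all four fields):

import pandas as pd

df = pd.DataFrame({
    'title': pttobj.title_list,
    'author': pttobj.author_list,
    'date': pttobj.date_list,
    'href': pttobj.url_list,
})
df.to_excel('pttdata.xlsx', index=False)  # one article per row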