在練習作業的時候出現了一個問題,要抓的兩個各自有80頁和41頁,可是在跑到39頁時就都會出現IndexError: list index out of range。
1.看了39頁和40頁以後的網址、原始碼,看起來沒有變?
2.showkind跑完問題是指到showpage,可是print showpage、res都可以印出東西。
想請問接下來要哪邊檢查?先謝謝大家
def showkind(url,kind):
html=requests.get(url,headers=headers).text
sp=BeautifulSoup(html,"lxml")
try:
pages=int(sp.select(".cnt_page span")[0].text)
print("Total:",pages,"pages")
for page in range(1,pages+1):
pageurl=url+"&page="+str(page).strip()
print("第",page,"頁",pageurl)
showpage(pageurl,kind)
except:
showpage(url,kind)
def showpage(url,kind):
html=requests.get(url,headers=headers).text
sp=BeautifulSoup(html,"lxml")
res=sp.find_all("div",{"class":"mod type02_m047_m054_wrap clearfix"})[0] #mod_a clearfix / mod type02_m012 clearfix
items=res.select(".item")
n=0
for item in items:
msg=item.select('.msg')[0]
title=msg.select('a')[0].text
author=msg.select('a')[1].text
publish=msg.select('a')[2].text
date=msg.find('span').text.split(":")[-1]
onsale=item.select('.price .set2')[0].text
content=item.select('.txt_cont')[0].text.replace(" ","").strip()
listdata=[kind,title,author,publish,date,onsale,content]
list1.append(listdata)
n=n+1
print("n=",n)
def twobyte(kindno):
kindnostr="01020"+str(kindno)
return kindnostr
import requests
from bs4 import BeautifulSoup
import openpyxl
from time import sleep
list1=[]
kindno=1
homeurl="https://www.books.com.tw/web/books_bmidm_0102/?o=1&v=1"
mode="/?o=1&v=1"
url="https://www.books.com.tw/web/sys_bbotm/books/"
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
html=requests.get(homeurl,headers=headers).text
sp=BeautifulSoup(html,"lxml")
res=sp.find("ul",class_="sub")
hrefs=res.select("a")
kindno=int(input("請輸入要下載的分類:"))
if 0 < kindno <= len(hrefs):
kind=hrefs[kindno-1].text
print("下載的分類編號:{} 分類名稱:{}".format(kindno,kind))
kindurl=url+twobyte(kindno)+mode
print(kindurl)
showkind(kindurl,kind)
print("資料寫入中,請稍等")
workbook=openpyxl.Workbook()
sheet=workbook.worksheets[0]
listtitle=["分類","書名","作者","出版社","出版日期","優惠價","內容"]
sheet.append(listtitle)
for item1 in list1:
sheet.append(item1)
sleep(0.1)
workbook.save("book-1228.xlsx")
else:
print("分類不存在")
print("資料儲存完畢")
IndexError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_2940/4062216474.py in showkind(url, kind)
9 print("第",page,"頁",pageurl)
---> 10 showpage(pageurl,kind)
11 except:
~\AppData\Local\Temp/ipykernel_2940/4062216474.py in showpage(url, kind)
16 sp=BeautifulSoup(html,"lxml")
---> 17 res=sp.find_all("div",{"class":"mod type02_m047_m054_wrap clearfix"})[0] #mod_a clearfix / mod type02_m012 clearfix
18 items=res.select(".item")
IndexError: list index out of range
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_2940/4062216474.py in <module>
60 kindurl=url+twobyte(kindno)+mode
61 print(kindurl)
---> 62 showkind(kindurl,kind)
63
64 print("資料寫入中,請稍等")
~\AppData\Local\Temp/ipykernel_2940/4062216474.py in showkind(url, kind)
10 showpage(pageurl,kind)
11 except:
---> 12 showpage(url,kind)
13
14 def showpage(url,kind):
~\AppData\Local\Temp/ipykernel_2940/4062216474.py in showpage(url, kind)
15 html=requests.get(url,headers=headers).text
16 sp=BeautifulSoup(html,"lxml")
---> 17 res=sp.find_all("div",{"class":"mod type02_m047_m054_wrap clearfix"})[0] #mod_a clearfix / mod type02_m012 clearfix
18 items=res.select(".item")
19 n=0
IndexError: list index out of range
你這應該單純就是 query 次數太頻繁
被鎖爬蟲而已
單純看
html=requests.get(url,headers=headers)
print(html)
就會發現前面都是「Response<200>」
但是到 page39 都會出現「Response<484>」
就代表 Query 頻率可能超過網站限制
所以直接
try:
res=sp.find_all("div",{"class":"mod type02_m047_m054_wrap clearfix"})[0]
items=res.select(".item")
except Exception as e:
print("資料輸出錯誤,休息 60 秒再繼續...")
sleep(60) # ************加這行************
html=requests.get(url,headers=headers).text
sp=BeautifulSoup(html,"lxml")
res=sp.find_all("div",{"class":"mod type02_m047_m054_wrap clearfix"})[0]
items=res.select(".item")
就好
秒數要自己試一下,每種網站冷卻時間都不一樣