[已解決]Python抓網頁抓到特定頁數後會出現IndexError: list index out of range

python

neu1ing 2021-12-31 02:03:27 ‧ 2114 瀏覽

分享至

在練習作業的時候出現了一個問題，要抓的兩個各自有80頁和41頁，可是在跑到39頁時就都會出現IndexError: list index out of range。
1.看了39頁和40頁以後的網址、原始碼，看起來沒有變？
2.showkind跑完問題是指到showpage，可是print showpage、res都可以印出東西。
想請問接下來要哪邊檢查？先謝謝大家


def showkind(url,kind):
    html=requests.get(url,headers=headers).text
    sp=BeautifulSoup(html,"lxml")
    try:
        pages=int(sp.select(".cnt_page span")[0].text)
        print("Total:",pages,"pages")
        for page in range(1,pages+1):
            pageurl=url+"&page="+str(page).strip()
            print("第",page,"頁",pageurl)
            showpage(pageurl,kind)
    except:
        showpage(url,kind)

def showpage(url,kind):
    html=requests.get(url,headers=headers).text
    sp=BeautifulSoup(html,"lxml")
    res=sp.find_all("div",{"class":"mod type02_m047_m054_wrap clearfix"})[0] #mod_a clearfix / mod type02_m012 clearfix
    items=res.select(".item")
    n=0
    for item in items:
        msg=item.select('.msg')[0] 
        title=msg.select('a')[0].text
        author=msg.select('a')[1].text
        publish=msg.select('a')[2].text
        date=msg.find('span').text.split("：")[-1]
        onsale=item.select('.price .set2')[0].text
        content=item.select('.txt_cont')[0].text.replace(" ","").strip()
        listdata=[kind,title,author,publish,date,onsale,content]
        list1.append(listdata)
        n=n+1
        print("n=",n)
        


def twobyte(kindno):
    kindnostr="01020"+str(kindno)
    return kindnostr


import requests
from bs4 import BeautifulSoup
import openpyxl
from time import sleep

list1=[]
kindno=1
homeurl="https://www.books.com.tw/web/books_bmidm_0102/?o=1&v=1"
mode="/?o=1&v=1"
url="https://www.books.com.tw/web/sys_bbotm/books/"
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
html=requests.get(homeurl,headers=headers).text
sp=BeautifulSoup(html,"lxml")

res=sp.find("ul",class_="sub")
hrefs=res.select("a")
kindno=int(input("請輸入要下載的分類:"))
if 0 < kindno <= len(hrefs):
    kind=hrefs[kindno-1].text
    print("下載的分類編號:{} 分類名稱：{}".format(kindno,kind))
    kindurl=url+twobyte(kindno)+mode
    print(kindurl)
    showkind(kindurl,kind)
    
    print("資料寫入中，請稍等")
    workbook=openpyxl.Workbook()
    sheet=workbook.worksheets[0]
    listtitle=["分類","書名","作者","出版社","出版日期","優惠價","內容"]
    sheet.append(listtitle)
    for item1 in list1:
        sheet.append(item1)
        sleep(0.1)
    workbook.save("book-1228.xlsx")
else:
    print("分類不存在")
print("資料儲存完畢")

IndexError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_2940/4062216474.py in showkind(url, kind)
      9             print("第",page,"頁",pageurl)
---> 10             showpage(pageurl,kind)
     11     except:

~\AppData\Local\Temp/ipykernel_2940/4062216474.py in showpage(url, kind)
     16     sp=BeautifulSoup(html,"lxml")
---> 17     res=sp.find_all("div",{"class":"mod type02_m047_m054_wrap clearfix"})[0] #mod_a clearfix / mod type02_m012 clearfix
     18     items=res.select(".item")

IndexError: list index out of range

During handling of the above exception, another exception occurred:

IndexError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_2940/4062216474.py in <module>
     60     kindurl=url+twobyte(kindno)+mode
     61     print(kindurl)
---> 62     showkind(kindurl,kind)
     63 
     64     print("資料寫入中，請稍等")

~\AppData\Local\Temp/ipykernel_2940/4062216474.py in showkind(url, kind)
     10             showpage(pageurl,kind)
     11     except:
---> 12         showpage(url,kind)
     13 
     14 def showpage(url,kind):

~\AppData\Local\Temp/ipykernel_2940/4062216474.py in showpage(url, kind)
     15     html=requests.get(url,headers=headers).text
     16     sp=BeautifulSoup(html,"lxml")
---> 17     res=sp.find_all("div",{"class":"mod type02_m047_m054_wrap clearfix"})[0] #mod_a clearfix / mod type02_m012 clearfix
     18     items=res.select(".item")
     19     n=0

IndexError: list index out of range

登入發表討論

直播研討會

{{ item.channelVendor }} {{ item.webinarstarted }} |

直播中

1 個回答

huahualiu

iT邦新手 2 級 ‧ 2021-12-31 10:29:48

最佳解答

你這應該單純就是 query 次數太頻繁
被鎖爬蟲而已

單純看

html=requests.get(url,headers=headers)
print(html)

就會發現前面都是「Response<200>」
但是到 page39 都會出現「Response<484>」
就代表 Query 頻率可能超過網站限制

所以直接

try:
        res=sp.find_all("div",{"class":"mod type02_m047_m054_wrap clearfix"})[0] 
        items=res.select(".item")
except Exception as e:
        print("資料輸出錯誤，休息 60 秒再繼續...")
        sleep(60) # ************加這行************
        html=requests.get(url,headers=headers).text
        sp=BeautifulSoup(html,"lxml")
        res=sp.find_all("div",{"class":"mod type02_m047_m054_wrap clearfix"})[0] 
        items=res.select(".item")

就好

秒數要自己試一下，每種網站冷卻時間都不一樣