我想要用python讀取柏克萊暢銷榜與試讀榜的所有書籍資料
用DataFrame儲存,5個欄位
類型 booktype 暢銷榜/試讀榜
圖 picture
書名 bookname
作者 author
價格 price
兩個榜的url可用一個清單儲存 https://www.books.com.tw/web/sys_cebtopb/cebook https://www.books.com.tw/web/sys_cebtryb/cebook
圖要下載, 檔名可以用序號重新命名
我東抄西抄,只組合出這樣
import requests
from IPython.display import Image
from IPython.display import Image, display
from bs4 import BeautifulSoup
import pandas as pd
def downloadimg(imgurl, img_name):
urlcontent = requests.get(imgurl)
with open(img_name,'wb') as file:
file.write(urlcontent.content)
file.flush()
file.close()
print('已儲存' + img_name)
url="https://www.books.com.tw/web/sys_cebtryb/cebook"
myrequest = requests.get(url)
#print(myrequest.content)
soup=BeautifulSoup(myrequest.content,"html.parser")
#print(soup)
firstImg = soup.find('img') #取得第1個<img>
print('firstImg:', firstImg)
print('--------------------')
#先跑出柏克萊是讀榜資料=====================================
dvdlist = soup.find_all('img',{'class':'cover'})
count = 0
imglist = []
namelist = []
#print(dvdlist)
#跑出dvdlist跑出dvdlist=================================================
for idx in range(len(dvdlist)):
count += 1
print(count)
print(dvdlist[idx])
print('圖-->', dvdlist[idx].get('src'))
imglist.append(dvdlist[idx].get('src'))
print('書名-->', dvdlist[idx].get('alt'))
namelist.append(dvdlist[idx].get('alt'))
print('作者-->', dvdlist[idx].get('b'))#這兩段跑不成功
namelist.append(dvdlist[idx].get('b'))#這兩段跑不成功
#跑出booklist跑出 另外用來抓作者=================================================
url="https://www.books.com.tw/web/sys_cebtryb/cebook"
myrequest = requests.get(url)
#print(myrequest.content)
soup=BeautifulSoup(myrequest.content,"html.parser")
booklist=soup.find_all('a',{'href':'cover'})# 我是抓錯屬性嗎?
print(booklist)
display(Image(pic))
#上面這段也跑不成功
想要把圖用
只到這裡就卡好久 ~ 看看有沒有高手可以教一下 感謝
修改如下,試試看吧
#先跑出柏克萊是讀榜資料=====================================
dvdlist = soup.find_all('div',{'class':'type02_bd-a'})
piclist = soup.find_all('img',{'class':'cover'})
count = 0
imglist = []
namelist = []
#print(dvdlist)
#跑出dvdlist跑出dvdlist=================================================
for idx in range(len(dvdlist)):
count += 1
print(count)
print('圖-->', piclist[idx].get('src'))
print('書名-->', dvdlist[idx].h4.text.strip())
print('作者-->', dvdlist[idx].li.text.strip().replace("作者:",""))