iT邦幫忙

1

python 爬蟲 如何爬canvas上的資料

爬蟲目的網站(首頁/現貨行情/蛋/養雞協會雞蛋交易行情/行情圖):http://www.foodchina.com.tw/model/marketing/AnaChartNew.aspx?id=51&ChkID=297&Page=0&Type=1&cn=False

該網站需要先利用篩選器選擇你要的資料,
(類別:全選,價格:全選,區間:日,2018年1月31日前31天走勢圖)

我目前利用

from selenium import webdriver
from selenium.webdriver.support.ui import Select

開啟該網站,並下指令去操作篩選器,
目前完整程式碼:

from selenium import webdriver
from selenium.webdriver.support.ui import Select


# Drive the chart page's filter form with Selenium: tick the category/item
# checkboxes, choose the "day" interval, pick 2018-01-31 as the end date and
# press the query button so the page renders the chart.

# `By` locators replace the find_element(s)_by_* helpers, which were removed in
# Selenium 4.3; find_element(s)(By.X, ...) works on both Selenium 3 and 4.
from selenium.webdriver.common.by import By

# Path to the locally installed chromedriver executable.
# NOTE(review): newer Selenium 4 releases no longer accept the driver path as a
# positional argument (a Service object is expected instead) -- confirm against
# the Selenium version installed before upgrading.
dr = webdriver.Chrome('C:/Users/User/chromedriver')

# Open the chart page.
dr.get('http://www.foodchina.com.tw/model/marketing/AnaChartNew.aspx?id=51&ChkID=297&Page=0&Type=1&cn=False')

# Click every checkbox on the page (category filter).
for checkbox in dr.find_elements(By.CSS_SELECTOR, 'input[type=checkbox]'):
    checkbox.click()

# Click every checkbox whose markup carries checked="checked" (item filter).
# NOTE(review): presumably these are the boxes that were ticked by default and
# were just un-ticked by the loop above, so this pass ticks them again --
# verify against the live page. The order of the two loops is kept as-is.
for checkbox_price in dr.find_elements(By.CSS_SELECTOR, 'input[checked=checked]'):
    checkbox_price.click()

# Select the radio button with value RB1 (interval: day).
for radio in dr.find_elements(By.CSS_SELECTOR, 'input[value=RB1]'):
    radio.click()

# The three date drop-downs share this control-ID prefix.
id_prefix = 'ctl00_ctl00_cpl_MainContent_cpl_BasicMainContent_'

# Year
select_year = Select(dr.find_element(By.ID, id_prefix + 'ddl_Year1'))
select_year.select_by_value('2018')
# Month
select_month = Select(dr.find_element(By.ID, id_prefix + 'ddl_Month1'))
select_month.select_by_value('1')
# Day
select_day = Select(dr.find_element(By.ID, id_prefix + 'ddl_Day'))
select_day.select_by_value('31')

# Press the query button (its value attribute is the label text).
for button in dr.find_elements(By.CSS_SELECTOR, 'input[value=查詢]'):
    button.click()

執行程式後,會開啟 Chrome,並依照指令篩選出我需要的資料,
結果跑出canvas圖,
https://ithelp.ithome.com.tw/upload/images/20210826/20136579iK1SnTMkpG.png
但是我想要爬梳的是canvas圖上的折點數值,
值包在http://www.foodchina.com.tw/model/ajax/getChartData.ashx 內,
我先簡單撰寫爬蟲程式,但爬取的結果卻沒有抓到 data 裡的 y 值,
目前爬梳程式碼:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the chart data endpoint and print the response body.
#
# The endpoint only returns data when the request carries the cookie issued by
# the chart page, so both requests go through one Session. The original code
# also built `headers` but never sent them.
url = 'http://www.foodchina.com.tw/model/ajax/getChartData.ashx'

# Page that has to be visited first so the server hands out the session cookie.
chart_page_url = 'http://www.foodchina.com.tw/model/marketing/AnaChartNew.aspx?id=51&ChkID=297&Page=0&Type=1&cn=False'

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}

with requests.Session() as session:
    # Send the browser-like user agent with every request of this session.
    session.headers.update(headers)

    # Load the chart page so the session holds the cookie the endpoint needs.
    # NOTE(review): this presumably yields the page's default chart only; to
    # get a filtered chart the filter form (with its __VIEWSTATE fields) has
    # to be posted to this page before requesting the data -- confirm.
    page_resp = session.get(chart_page_url, timeout=30)
    page_resp.raise_for_status()

    resp = session.post(url, timeout=30)
    resp.raise_for_status()

# Wrap the response text in a BeautifulSoup object, as before.
soup = BeautifulSoup(resp.text, 'html.parser')

print(soup)

想詢問我可以如何修改我的程式碼,
才可以爬到我要的值呢?謝謝

淺水員 iT邦高手 3 級 ‧ 2021-08-26 23:06:53 檢舉
getChartData 需要之前的 cookie 才拿得到資料
也許可以考慮直接跟第一個程式放在一起
當第一個程式拿到 session id 後
繼續給接下來的動作使用
skyksl066 iT邦新手 4 級 ‧ 2021-08-27 20:46:32 檢舉
一進那個網站他就console.log給你了阿...
object=>data=>datasets=>0=>data=>你要的資料
我有一個建議的方法你可以試試看
網頁上這種 JS 圖表要用 selenium 裡面的 execute_script 方法去拿

1 個回答

0
blanksoul12
iT邦新手 4 級 ‧ 2021-08-28 10:57:53

excel 做法

' Downloads the chart data and writes its x/y points to the active sheet,
' starting at A1 (x in column A, y in column B).
'
' Steps, all on one WinHttpRequest object:
'   1. GET the chart page and read the hidden ASP.NET form fields.
'   2. POST the filter form back to the same page (first category, first
'      item, "day" interval, today's date).
'   3. POST to getChartData.ashx and parse the returned object.
Sub test()

Dim strText As String, strJSON As String, msg_string As String
Dim VIEWSTATE As String, VIEWSTATEGENERATOR As String, EVENTVALIDATION As String
Dim objJSON As Object, detail As Object, data_set As Object
Dim info As Variant, detail_data As Variant
Dim arr() As Variant
Dim aaa As Long, i As Long

With CreateObject("WinHttp.WinHttpRequest.5.1")
    ' Step 1: load the page to obtain the hidden form fields.
    .Open "GET", "http://www.foodchina.com.tw/model/marketing/AnaChartNew.aspx?id=51&ChkID=297&Page=0&Type=1&cn=False", False
    .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
    .Send
    strText = .responsetext
    ' Cut each hidden field's value out of the HTML and URL-encode it.
    VIEWSTATE = encodeURI(CStr(Split(Split(strText, "__VIEWSTATE"" value=""")(1), """ />")(0)))
    VIEWSTATEGENERATOR = encodeURI(CStr(Split(Split(strText, "__VIEWSTATEGENERATOR"" value=""")(1), """ />")(0)))
    EVENTVALIDATION = encodeURI(CStr(Split(Split(strText, "__EVENTVALIDATION"" value=""")(1), """ />")(0)))
    
    ' Step 2: post the filter form back to the same page.
    .Open "POST", "http://www.foodchina.com.tw/model/marketing/AnaChartNew.aspx?id=51&ChkID=297&Page=0&Type=1&cn=False", False
    .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
    
    msg_string = "ctl00_ctl00_ToolkitScriptManager1_HiddenField="
    msg_string = msg_string & "&__EVENTTARGET="
    msg_string = msg_string & "&__EVENTARGUMENT="
    msg_string = msg_string & "&__VIEWSTATE=" & VIEWSTATE
    msg_string = msg_string & "&__VIEWSTATEGENERATOR=" & VIEWSTATEGENERATOR
    msg_string = msg_string & "&__SCROLLPOSITIONX=0"
    msg_string = msg_string & "&__SCROLLPOSITIONY=288"
    msg_string = msg_string & "&__EVENTVALIDATION=" & EVENTVALIDATION
    ' Category checkbox index follows %24: %240 = North, %241 = Changhua, ...
    msg_string = msg_string & "&ctl00%24ctl00%24cpl_MainContent%24cpl_BasicMainContent%24cblCols%240=on"
    msg_string = msg_string & "&ctl00%24ctl00%24cpl_MainContent%24cpl_BasicMainContent%24cblRows%240=on"
    msg_string = msg_string & "&ctl00%24ctl00%24cpl_MainContent%24cpl_BasicMainContent%24DataType=RB1"
    ' The date fields are filled with today's date.
    msg_string = msg_string & "&ctl00%24ctl00%24cpl_MainContent%24cpl_BasicMainContent%24ddl_Year1=" & Format(Date, "yyyy")
    msg_string = msg_string & "&ctl00%24ctl00%24cpl_MainContent%24cpl_BasicMainContent%24ddl_Month1=" & Format(Date, "m")
    msg_string = msg_string & "&ctl00%24ctl00%24cpl_MainContent%24cpl_BasicMainContent%24ddl_Day=" & Format(Date, "d")
    msg_string = msg_string & "&ctl00%24ctl00%24cpl_MainContent%24cpl_BasicMainContent%24ddl_Year2=" & Format(Date, "yyyy")
    msg_string = msg_string & "&ctl00%24ctl00%24cpl_MainContent%24cpl_BasicMainContent%24ddl_Month2=" & Format(Date, "m")
    msg_string = msg_string & "&ctl00%24ctl00%24cpl_MainContent%24cpl_BasicMainContent%24queryYearList=1"
    ' Submit button; the value is the URL-encoded button label.
    msg_string = msg_string & "&ctl00%24ctl00%24cpl_MainContent%24cpl_BasicMainContent%24btnSummit=%E6%9F%A5%E8%A9%A2"
    
    .Send msg_string
    
    ' Step 3: request the chart data with the same request object.
    .Open "POST", "http://www.foodchina.com.tw/model/ajax/getChartData.ashx", False
    .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
    .Send
    strJSON = .responsetext
    With CreateObject("msscriptcontrol.scriptcontrol")
        .Language = "JavaScript"
        ' Let the script engine parse the response by assigning it to a variable.
        ' NOTE(review): this executes the server's response as script code.
        .AddCode "var mydata =" & strJSON
        Set objJSON = .CodeObject
        Set detail = CallByName(CallByName(objJSON, "mydata", VbGet), "data", VbGet)
        ' The number of "{" in the response is used as an upper bound for the
        ' number of points, so the array can be sized before filling it.
        aaa = UBound(Split(strJSON, "{"))
        ReDim arr(1 To aaa, 1 To 2)
        i = 1
        For Each info In CallByName(detail, "datasets", VbGet)
            Set data_set = CallByName(info, "data", VbGet)
            For Each detail_data In data_set
                arr(i, 1) = CallByName(detail_data, "x", VbGet)
                arr(i, 2) = CallByName(detail_data, "y", VbGet)
                i = i + 1
            Next
        Next
        ' Write the sheet once, after all datasets are collected (this used to
        ' run again for every dataset). Skipped when no point was read.
        If i > 1 Then
            [a1].Resize(UBound(arr), 2) = arr
        End If
    End With
End With
End Sub
' Returns strText percent-encoded with JavaScript's encodeURIComponent.
'
' The text is handed to the script engine as an argument instead of being
' spliced into a quoted script string, so apostrophes, backslashes and line
' breaks in strText can no longer break or alter the evaluated script.
Function encodeURI(strText) As String
    With CreateObject("msscriptcontrol.scriptcontrol")
        .Language = "JavaScript"
        .AddCode "function encodeComponent(s) { return encodeURIComponent(s); }"
        ' & "" coerces the argument to a string, as the concatenation did before.
        encodeURI = .Run("encodeComponent", strText & "")
    End With
End Function

我要發表回答

立即登入回答