Beginner Python web scraper

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
import json,os,pprint,time
from bs4 import BeautifulSoup
import requests
# Browser launch options
options = webdriver.ChromeOptions()
# options.add_argument("--headless")              # run headless, without opening a visible browser window
options.add_argument("--start-maximized")         # maximize the window
options.add_argument("--incognito")               # open in incognito mode
options.add_argument("--disable-popup-blocking ") # disable popup blocking

# Use the Chrome WebDriver
driver = webdriver.Chrome(options = options)
listData=[]

url='https://www.gutenberg.org/browse/languages/zh'

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
}


folderPath='Gutenberg'                    
if not os.path.exists(folderPath):
    os.makedirs(folderPath)
def visit():
    driver.get(url)
def getMainLinks():
    mainElement=driver.find_elements(By.CSS_SELECTOR,'ul>li.pgdbetext > a')
    for a in mainElement:
 
        listData.append({
            "link":a.get_attribute("href"),
            "title":a.get_attribute('innerText')
        })
        #pprint(listData)
def getSubLinks():
    for i in range(len(listData)):
        if"sub" not in listData[i]:
            listData[i]["sub"]=[]
            
        driver.get(listData[i]["link"])
        try:
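            # NOTE: "webDriverWait" below is a typo; the imported class is WebDriverWait (capital W)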
            webDriverWait(driver,3).until(
            EC.presence_of_element_located(
               (By.CSS_SELECTOR,'a[class="link"][title="Download"]')
            )
            )
            
            mainElement=driver.find_element_located(By.CSS_SELECTOR,'a[class="link"][title="Download"]')
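            # NOTE: find_element_located is not a Selenium method; find_elements(...) is needed here to get an iterable list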

            for a in mainElement:
                listData[i]["sub"].append({
                "sub_title":a.get_attribute('innerText'),
                "sub_link":a.get_attribute("href")
            })
            
        except TimeoutException as e:
            continue
def saveJson():
    fp=open("gutenberg.json","w",encoding="utf-8")
    fp.write(json.dumps(listData,ensure_ascii=False))
    fp.close()
    
def writeTxt():
    listContent=[]
    
    fp=open("gutenberg.json","r",encoding="utf-8")
    strJson=fp.read()
    fp.close()
    
    listResult=json.loads(strJson)

    for i in range(len(listResult)):
        for j in range(len(listResult)[i]["sub"]):
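            # BUG: len(listResult)[i]["sub"] indexes into an int; it should be len(listResult[i]["sub"]) (see the answer below, point 1)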
            driver.get(listResult[i]['sub'][j]['sub_link'])
            div=diver.find_element(By.CSS_SELECTOR,'body#html>pre')
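            # NOTE: "diver" above is a typo for driver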
            strContent=div.get_attribute('innerText')
            strContent=strContent.replace(" ","")
            strContent=strContent.replace("\r","")
            strContent=strContent.replace("\n","")
            
            fileName=f"{listResult[i]['title']}_{listResult[i]['sub'][j]['sub_title']}.txt"
            
            fp=open(f"{folderPath}/{fileName}","w",encoding="utf-8")
            fp.write(strContent)
            fp.close()
            
            listContent.append(strContent)
            fp=open("train.json","w",encoding="utf-8")
            fp.write(json.dumps(listContent,ensure_ascii=False))
            fp.close()
            
def close():
    driver.quit()
if __name__ == "__main__":
    visit()
    getMainLinks()
    getSubLinks()
    saveJson()
    writeTxt()
    close()

Could someone tell me why nothing gets scraped into the txt files?

淺水員 iT邦大師 6 級 ‧ 2022-04-16 16:58:22
I just took a look: this site has no anti-scraping measures, so you should be able to grab the data without selenium.
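
For instance, here is a minimal sketch of that approach (my own illustration, not code from the thread), assuming the listing page keeps the li.pgdbetext > a structure used above and that its hrefs are relative:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://www.gutenberg.org/browse/languages/zh'
resp = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
resp.raise_for_status()

soup = BeautifulSoup(resp.text, 'html.parser')
# Same selector idea as getMainLinks(), but without driving a browser
books = [
    {"title": a.get_text(strip=True), "link": urljoin(url, a.get('href'))}
    for a in soup.select('li.pgdbetext > a')
]
print(len(books), books[:2])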
1 Answer

熊熊工程師
iT邦研究生 2 級 ‧ 2022-04-17 02:02:28
I went ahead and adjusted your code a bit. There were roughly these issues:

  1. In write_txt(), the range in the second level of the nested loop was changed to for j in range(len(list_result[i]['sub'])):
  2. Also in write_txt(): the current error says the specified element could not be found, and it's hard to divine from your CSS selector what you were trying to grab, so fixing that part of the scraper is the place to start (a sketch of one possible fix follows after the code).
  3. If you copy this code over and run it, it will probably complain that the webdriver_manager package is missing; just install it with pip install webdriver_manager.
  4. Some variable names didn't follow Python naming conventions, so I renamed them as well; otherwise PyCharm keeps underlining them, which gets a bit annoying.
  5. The code below is only an adjustment of what you pasted; there are surely better ways to write it.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from time import sleep
import json
import os
from bs4 import BeautifulSoup
import requests

# Browser launch options
options = webdriver.ChromeOptions()
# options.add_argument("--headless")              # run headless, without opening a visible browser window
options.add_argument("--start-maximized")  # maximize the window
options.add_argument("--incognito")  # open in incognito mode
options.add_argument("--disable-popup-blocking ")  # disable popup blocking

# Use the Chrome WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
listData = []

url = 'https://www.gutenberg.org/browse/languages/zh'

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}

folderPath = 'Gutenberg'
if not os.path.exists(folderPath):
    os.makedirs(folderPath)


def visit():
    driver.get(url)


def get_main_links():
    main_element = driver.find_elements(By.CSS_SELECTOR, 'ul>li.pgdbetext > a')

    for a in main_element:
        listData.append({
            "link": a.get_attribute("href"),
            "title": a.get_attribute('innerText')
        })


def get_sub_links():
    for i in range(len(listData)):
        if "sub" not in listData[i]:
            listData[i]["sub"] = []

        driver.get(listData[i]["link"])
        try:
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'a[class="link"][title="Download"]')
                )
            )

            main_element = driver.find_elements(By.CSS_SELECTOR, 'a[class="link"][title="Download"]')

            for a in main_element:
                listData[i]["sub"].append({
                    "sub_title": a.get_attribute('innerText'),
                    "sub_link": a.get_attribute("href")
                })

        except TimeoutException as e:
            continue


def save_json():
    fp = open("gutenberg.json", "w", encoding="utf-8")
    fp.write(json.dumps(listData, ensure_ascii=False))
    fp.close()


def write_txt():
    list_content = []

    fp = open("gutenberg.json", "r", encoding="utf-8")
    str_json = fp.read()
    fp.close()

    list_result = json.loads(str_json)

    for i in range(len(list_result)):
        for j in range(len(list_result[i]['sub'])):
            driver.get(list_result[i]['sub'][j]['sub_link'])
            div = driver.find_element(By.CSS_SELECTOR, 'body#html>pre')
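            # NOTE: this selector is what still raises the "element not found" error (point 2 above);
            # it needs to be replaced once the intended target element is identified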
            str_content = div.get_attribute('innerText')
            str_content = str_content.replace(" ", "")
            str_content = str_content.replace("\r", "")
            str_content = str_content.replace("\n", "")

            file_name = f"{list_result[i]['title']}_{list_result[i]['sub'][j]['sub_title']}.txt"

            fp = open(f"{folderPath}/{file_name}", "w", encoding="utf-8")
            fp.write(str_content)
            fp.close()

            list_content.append(str_content)
            fp = open("train.json", "w", encoding="utf-8")
            fp.write(json.dumps(list_content, ensure_ascii=False))
            fp.close()


def close():
    driver.quit()


if __name__ == "__main__":
    try:
        visit()
        get_main_links()
        get_sub_links()
        save_json()
        write_txt()
    finally:
        close()
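
A hedged guess at point 2: if a sub_link resolves to a plain-text (.txt) file, Chrome renders the content as a bare <pre> inside an otherwise empty body, with no id for body#html>pre to hang on, so that selector can never match. A minimal sketch under that assumption:

from selenium.webdriver.common.by import By

def read_plain_text(driver, txt_url):
    # Assumes txt_url serves text/plain, which the browser wraps as
    # <html><body><pre>...</pre></body></html> with no ids to select on.
    driver.get(txt_url)
    return driver.find_element(By.TAG_NAME, 'pre').get_attribute('innerText')

If a sub_link points at an .epub or .zip instead, the browser downloads the file and there is no DOM to read, so it would be safer to first filter the collected sub_link values down to the plain-text ones.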
