from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
import json,os,pprint,time
from bs4 import BeautifulSoup
import requests
# Browser launch options
options = webdriver.ChromeOptions()
# options.add_argument("--headless") # run in the background without opening a visible browser window
options.add_argument("--start-maximized") # maximize the window
options.add_argument("--incognito") # open in incognito mode
options.add_argument("--disable-popup-blocking ") # disable pop-up blocking
# Use the Chrome WebDriver
driver = webdriver.Chrome(options = options)
listData=[]
url='https://www.gutenberg.org/browse/languages/zh'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
}
folderPath='Gutenberg'
if not os.path.exists(folderPath):
os.makedirs(folderPath)
def visit():
driver.get(url)
def getMainLinks():
mainElement=driver.find_elements(By.CSS_SELECTOR,'ul>li.pgdbetext > a')
for a in mainElement:
listData.append({
"link":a.get_attribute("href"),
"title":a.get_attribute('innerText')
})
#pprint(listData)
def getSubLinks():
for i in range(len(listData)):
if"sub" not in listData[i]:
listData[i]["sub"]=[]
driver.get(listData[i]["link"])
try:
webDriverWait(driver,3).until(
EC.presence_of_element_located(
(By.CSS_SELECTOR,'a[class="link"][title="Download"]')
)
)
mainElement=driver.find_element_located(By.CSS_SELECTOR,'a[class="link"][title="Download"]')
for a in mainElement:
listData[i]["sub"].append({
"sub_title":a.get_attribute('innerText'),
"sub_link":a.get_attribute("href")
})
except TimeoutException as e:
continue
def saveJson():
fp=open("gutenberg.json","w",encoding="utf-8")
fp.write(json.dumps(listData,ensure_ascii=False))
fp.close()
def writeTxt():
listContent=[]
fp=open("gutenberg.json","r",encoding="utf-8")
strJson=fp.read()
fp.close()
listResult=json.loads(strJson)
for i in range(len(listResult)):
for j in range(len(listResult)[i]["sub"]):
driver.get(listResult[i]['sub'][j]['sub_link'])
div=diver.find_element(By.CSS_SELECTOR,'body#html>pre')
strContent=div.get_attribute('innerText')
strContent=strContent.replace(" ","")
strContent=strContent.replace("\r","")
strContent=strContent.replace("\n","")
fileName=f"{listResult[i]['title']}_{listResult[i]['sub'][j]['sub_title']}.txt"
fp=open(f"{folderPath}/{fileName}","w",encoding="utf-8")
fp.write(strContent)
fp.close()
listContent.append(strContent)
fp=open("train.json","w",encoding="utf-8")
fp.write(json.dumps(listContent,ensure_ascii=False))
fp.close()
def close():
driver.quit()
if __name__ == "__main__":
visit()
getMainLinks()
getSubLinks()
saveJson()
writeTxt()
close()
May I ask why nothing ever gets scraped into the txt files?

I went ahead and tweaked your code a bit; there were roughly these problems:

1. webDriverWait should be WebDriverWait, and driver.find_element_located() is not a Selenium method; since you loop over the result, you want driver.find_elements().
2. In writeTxt the inner loop indexes the wrong object: len(listResult)[i]["sub"] should be
   for j in range(len(list_result[i]['sub'])):
3. diver.find_element(...) is a typo for driver.find_element(...).

The first one already crashes getSubLinks (a NameError is not a TimeoutException, so your except never catches it), which means gutenberg.json is never saved and writeTxt never runs; that is why the Gutenberg folder stays empty. I also switched to webdriver_manager so you do not have to manage chromedriver by hand:

pip install webdriver_manager

Just install that and run the adjusted version below.

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from time import sleep
import json
import os
from bs4 import BeautifulSoup
import requests
# Browser launch options
options = webdriver.ChromeOptions()
# options.add_argument("--headless") # run in the background without opening a visible browser window
options.add_argument("--start-maximized")  # maximize the window
options.add_argument("--incognito")  # open in incognito mode
options.add_argument("--disable-popup-blocking ")  # disable pop-up blocking
# Use the Chrome WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
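# ChromeDriverManager().install() downloads a chromedriver build that matches the local Chrome and returns its path, so no manual driver download is needed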
listData = []
url = 'https://www.gutenberg.org/browse/languages/zh'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
folderPath = 'Gutenberg'
if not os.path.exists(folderPath):
os.makedirs(folderPath)
def visit():
driver.get(url)
def get_main_links():
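    # Collect the link and title of every book listed on the Chinese-language index page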
main_element = driver.find_elements(By.CSS_SELECTOR, 'ul>li.pgdbetext > a')
for a in main_element:
listData.append({
"link": a.get_attribute("href"),
"title": a.get_attribute('innerText')
})
def get_sub_links():
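    # For each book page, wait for the download links to appear and record their text and href under "sub"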
for i in range(len(listData)):
if "sub" not in listData[i]:
listData[i]["sub"] = []
driver.get(listData[i]["link"])
try:
WebDriverWait(driver, 3).until(
EC.presence_of_element_located(
(By.CSS_SELECTOR, 'a[class="link"][title="Download"]')
)
)
main_element = driver.find_elements(By.CSS_SELECTOR, 'a[class="link"][title="Download"]')
for a in main_element:
listData[i]["sub"].append({
"sub_title": a.get_attribute('innerText'),
"sub_link": a.get_attribute("href")
})
except TimeoutException as e:
continue
def save_json():
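    # Dump the collected metadata to gutenberg.json so write_txt can run without re-crawling the index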
fp = open("gutenberg.json", "w", encoding="utf-8")
fp.write(json.dumps(listData, ensure_ascii=False))
fp.close()
def write_txt():
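    # Re-open gutenberg.json, visit every sub_link, strip whitespace from the page text,
    # then write one txt file per download plus a combined train.json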
list_content = []
fp = open("gutenberg.json", "r", encoding="utf-8")
str_json = fp.read()
fp.close()
list_result = json.loads(str_json)
for i in range(len(list_result)):
for j in range(len(list_result[i]['sub'])):
driver.get(list_result[i]['sub'][j]['sub_link'])
div = driver.find_element(By.CSS_SELECTOR, 'body#html>pre')
str_content = div.get_attribute('innerText')
str_content = str_content.replace(" ", "")
str_content = str_content.replace("\r", "")
str_content = str_content.replace("\n", "")
file_name = f"{list_result[i]['title']}_{list_result[i]['sub'][j]['sub_title']}.txt"
fp = open(f"{folderPath}/{file_name}", "w", encoding="utf-8")
fp.write(str_content)
fp.close()
list_content.append(str_content)
fp = open("train.json", "w", encoding="utf-8")
fp.write(json.dumps(list_content, ensure_ascii=False))
fp.close()
def close():
driver.quit()
if __name__ == "__main__":
try:
visit()
get_main_links()
get_sub_links()
save_json()
write_txt()
finally:
close()
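One more side note: requests, headers, and BeautifulSoup are imported but never used. If the 'body#html>pre' selector in write_txt is the part that still fails for you, you could fetch each sub_link with requests instead of driving the browser. The sketch below drops into the same file and reuses the json/requests/headers/folderPath already defined above; it assumes each Download link points at a plain-text resource (as Gutenberg's .txt downloads do), and the function name write_txt_requests is made up here, not part of your original script:

def write_txt_requests():
    # Sketch only: assumes gutenberg.json already exists and each sub_link returns plain text.
    list_content = []
    with open("gutenberg.json", "r", encoding="utf-8") as fp:
        list_result = json.load(fp)
    for item in list_result:
        for sub in item.get("sub", []):
            resp = requests.get(sub["sub_link"], headers=headers)
            resp.encoding = "utf-8"  # force UTF-8 decoding; Gutenberg's modern .txt downloads are UTF-8
            str_content = resp.text.replace(" ", "").replace("\r", "").replace("\n", "")
            file_name = f"{item['title']}_{sub['sub_title']}.txt"
            with open(f"{folderPath}/{file_name}", "w", encoding="utf-8") as fp:
                fp.write(str_content)
            list_content.append(str_content)
    with open("train.json", "w", encoding="utf-8") as fp:
        json.dump(list_content, fp, ensure_ascii=False)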