印出記憶體位置而不是印出值怎解決

python list iterable list_iterator object 網路爬蟲

Eddie_code 2023-01-16 00:23:52 ‧ 1366 瀏覽

分享至

我寫了一個抓小說的程式，單次執行沒問題，
一個一個呼叫也沒問題，
ex:
getNovel(1234)
getNovel(4567)
但是如果
for chapter in chapters:
getNovel(int(chapter))
下面 books 這行就會印出記憶體位置（物件的 repr）而不是值，
ex: <list_iterator object at 0x7f5aef6c0a30>
**books=bs.find('div',{'id':'tbchapterlist'}).children**
請問要怎麼改謝謝
＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝
已解決：去掉 .children 就正常了。
＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import os
# Fetch the chapter-list page for book number `articleNum` and save it as
# ./novels/<book title>.html.
# NOTE(review): indentation was lost when this snippet was pasted; the code
# lines are kept exactly as posted.
def getNovel(articleNum):
url='https://tw.hjwzw.com/Book/Chapter/%d'%articleNum
html = urlopen(url)
bs = BeautifulSoup(html.read(),'html.parser')
title=bs.find("head").title.get_text()
# Length of the book title: index of the first '/' in the page title
length=title.find('/')
# The regex below cannot skip non-word characters, so strip them first
if '，' in title:
title=title.replace('，','')
length=length-1
if '：' in title:
title=title.replace('：','')
length=length-1
if '!' in title:
title=title.replace('!','')
length=length-1
if '?' in title:
title=title.replace('?','')
length=length-1
# Drop the text after the book title; the regex repeat count is set dynamically
regex01=r'\w{%s}(?=/)'%length
title=re.match(regex01,title)
# The saved content starts with the book title
str='%s'%title[0]
# Locate the chapter-list div in the DOM
# NOTE(review): this is the line the question is about. `.children` gives an
# iterator (shown as <list_iterator object ...> when printed), and an iterator
# has no .select(); the thread's resolution is to drop `.children`.
** books=bs.find('div',{'id':'tbchapterlist'}).children**
# Rewrite the links as absolute URLs
for a in books.select('a'):
a['href']="https://tw.hjwzw.com"+a['href']
# Collect all chapters
for book in books:
str=str+'%s'%book
# Store under the "novels" subfolder; create it if it does not exist
folderPath = os.path.join(os.getcwd(), 'novels')
if not os.path.exists(folderPath):
os.makedirs(folderPath)
# Write the file; the file name is the book title
path='./novels/%s.html'%title[0]
fileobj =open(path, 'wt',encoding='UTF-8')
print(str,file=fileobj)
fileobj.close()
# Book: "Battle Through the Heavens"
getNovel(1642)

登入發表討論

1 個回答

JamesDoge

iT邦高手 1 級 ‧ 2023-01-17 06:21:58

請問要怎麼改

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import os

def _book_title_from_page_title(page_title):
    """Return the book name taken from the page ``<title>`` text.

    The book name is the text before the first ``/`` in the page title.
    Every non-word character (e.g. ``，``, ``：``, ``!``, ``?``) is removed so
    the result can be used directly as a file name.

    Raises:
        ValueError: if nothing usable is left after cleaning.
    """
    head, _, _ = page_title.partition('/')
    # \w is Unicode-aware for str patterns, so CJK characters are kept.
    book_title = re.sub(r'\W', '', head)
    if not book_title:
        raise ValueError('cannot extract a book title from %r' % page_title)
    return book_title


def getNovel(articleNum):
    """Download the chapter list of one book and save it as an HTML file.

    The page ``https://tw.hjwzw.com/Book/Chapter/<articleNum>`` is fetched,
    the links inside its ``<div id="tbchapterlist">`` are rewritten to
    absolute URLs, and the book title followed by the div's children is
    written to ``novels/<book title>.html`` under the current working
    directory (the folder is created when missing).

    Args:
        articleNum: integer book id used in the chapter-list URL.

    Raises:
        ValueError: if the page has no ``<title>``, no usable book title, or
            no ``tbchapterlist`` div.
    """
    # Local import so the block does not depend on a new file-level import.
    from urllib.parse import urljoin

    # Build the chapter-list URL for this book.
    url = 'https://tw.hjwzw.com/Book/Chapter/%d' % articleNum
    # Fetch and parse the page; the context manager closes the connection.
    with urlopen(url) as response:
        bs = BeautifulSoup(response.read(), 'html.parser')

    # Extract the book title from the page <title>.
    head = bs.find("head")
    if head is None or head.title is None:
        raise ValueError('no <title> found at %s' % url)
    title = _book_title_from_page_title(head.title.get_text())

    # Keep the Tag itself (not `.children`, which is only an iterator and has
    # no .select()).
    books = bs.find('div', {'id': 'tbchapterlist'})
    if books is None:
        raise ValueError('no chapter list (div#tbchapterlist) at %s' % url)

    # Rewrite links as absolute URLs; anchors without href are skipped.
    for a in books.select('a[href]'):
        a['href'] = urljoin(url, a['href'])

    # The content is the book title followed by every child node of the div.
    content = title + ''.join('%s' % child for child in books)

    # Store under the "novels" subfolder, creating it when it does not exist.
    folderPath = os.path.join(os.getcwd(), 'novels')
    os.makedirs(folderPath, exist_ok=True)

    # The file is named after the book title only (not the whole content).
    with open(os.path.join(folderPath, '%s.html' % title), 'w', encoding='utf-8') as f:
        f.write(content)


if __name__ == "__main__":
    # Test run: kept outside getNovel so the function does not call itself.
    chapters = [1234, 4567]
    for chapter in chapters:
        getNovel(int(chapter))