您好:
參考書本範例,並有修改一些小問題
pttbeauty.json是有擷取資料
但images內沒有圖檔
請問,是圖檔有權限,還是還有哪邊設定需要再處理(範本較舊)
謝謝
import scrapy
from Ch9_4a.items import BeautyItem
from datetime import datetime
class PptbeautySpider(scrapy.Spider):
    """Crawl the PTT Beauty board and collect post metadata + imgur image links.

    Image URLs are put into ``file_urls`` so Scrapy's built-in FilesPipeline
    downloads them into ``FILES_STORE``.  Note that imgur redirects plain
    ``http://`` links to ``https://``, so the scheme is normalized here and
    ``MEDIA_ALLOW_REDIRECTS = True`` should be set in settings.py — otherwise
    FilesPipeline raises FileException in ``media_failed`` and no file is saved.
    """

    name = "pttbeauty"
    # imgur hosts are included so offsite filtering (a downloader middleware in
    # newer Scrapy versions, which also sees media requests) does not drop the
    # image downloads issued by FilesPipeline.
    allowed_domains = ["ptt.cc", "imgur.com", "i.imgur.com"]
    start_urls = ["https://www.ptt.cc/bbs/Beauty/index.html"]

    def __init__(self, *args, **kwargs):
        # Forward arguments so Scrapy can still configure the base Spider.
        super().__init__(*args, **kwargs)
        self.max_pages = 2      # maximum number of index pages to crawl
        self.num_of_pages = 0   # index pages crawled so far

    def parse(self, response):
        """Parse one board index page: follow each post, then the previous page."""
        for href in response.css(".r-ent > div.title > a::attr(href)"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_post)
        self.num_of_pages += 1
        # Stop once the configured page limit is reached.
        if self.num_of_pages < self.max_pages:
            # PTT lists newest posts first, so crawling continues via "上頁".
            prev_page = response.xpath(
                '//div[@id="action-bar-container"]'
                '//a[contains(text(), "上頁")]/@href')
            if prev_page:  # is there a previous page?
                url = response.urljoin(prev_page[0].extract())
                yield scrapy.Request(url, callback=self.parse)
            else:
                print("已經是最後一頁, 總共頁數: ", self.num_of_pages)
        else:
            print("已經到達最大頁數: ", self.max_pages)

    def parse_post(self, response):
        """Parse a single post page into a BeautyItem."""
        item = BeautyItem()
        item["author"] = response.css(
            ".article-metaline:nth-child(1) .article-meta-value::text"
        ).extract_first()
        item["title"] = response.css(
            ".article-metaline-right+ .article-metaline .article-meta-value::text"
        ).extract_first()
        datetime_str = response.css(
            ".article-metaline+ .article-metaline .article-meta-value::text"
        ).extract_first()
        # Guard against deleted/partial posts whose meta lines are missing;
        # the original crashed with TypeError on None here.
        item["date"] = (
            datetime.strptime(datetime_str, "%a %b %d %H:%M:%S %Y")
            if datetime_str else None
        )
        score = 0
        num_of_pushes = 0
        comments = response.xpath('//div[@class="push"]')
        for comment in comments:
            push = comment.css("span.push-tag::text").extract_first() or ""
            if "推" in push:
                score += 1
                num_of_pushes += 1
            elif "噓" in push:
                score -= 1
        item["score"] = score
        item["pushes"] = num_of_pushes
        item["comments"] = len(comments)
        item["url"] = response.url
        img_urls = response.xpath(
            '//a[contains(@href, "imgur.com")]/@href').extract()
        # BUG FIX: the old filter kept only ".jpg" links, silently dropping
        # ".jpeg"/".png"/".gif"; and http:// imgur links are 301-redirected to
        # https://, which makes FilesPipeline fail — normalize the scheme.
        img_urls = [
            url.replace("http://", "https://", 1)
            for url in img_urls
            if url.lower().endswith((".jpg", ".jpeg", ".png", ".gif"))
        ]
        item["images"] = len(img_urls)
        item["file_urls"] = img_urls  # consumed by FilesPipeline
        yield item
items
import scrapy
class BeautyItem(scrapy.Item):
    """Container for one scraped PTT Beauty post.

    ``file_urls`` is the conventional field consumed by Scrapy's FilesPipeline
    to download the listed URLs.
    """

    title = scrapy.Field()      # post title
    author = scrapy.Field()     # author id (and nickname)
    date = scrapy.Field()       # posting datetime
    score = scrapy.Field()      # pushes minus boos
    pushes = scrapy.Field()     # number of push (推) comments
    comments = scrapy.Field()   # total number of comments
    url = scrapy.Field()        # post URL
    images = scrapy.Field()     # count of image links found
    file_urls = scrapy.Field()  # image URLs for FilesPipeline
pipelines.py
from itemadapter import ItemAdapter
class Ch94APipeline:
    """Default project pipeline; passes every item through unchanged."""

    def process_item(self, item, spider):
        # No transformation needed here — image downloading is handled by the
        # built-in FilesPipeline enabled in settings.py.
        return item
settings.py
# Scrapy settings for the Ch9_4a PTT-Beauty crawler.
BOT_NAME = 'Ch9_4a'

SPIDER_MODULES = ['Ch9_4a.spiders']
NEWSPIDER_MODULE = 'Ch9_4a.spiders'

# Export scraped items as pretty-printed UTF-8 JSON.
FEEDS = {
    'pttbeauty.json': {
        'format': 'json',
        'encoding': 'utf8',
        'store_empty': False,
        'fields': None,
        'indent': 4,
    },
}

# Download every URL listed in item["file_urls"] via the built-in FilesPipeline.
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1
}
FILES_STORE = 'images'  # relative to the directory scrapy is run from

# BUG FIX: imgur answers plain http:// image links with a redirect; without
# this setting FilesPipeline treats the redirect as a failure (FileException
# raised in media_failed) and the images directory stays empty.
MEDIA_ALLOW_REDIRECTS = True

# Be polite to ptt.cc: one request at a time, 5 s apart.
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 5
pttbeauty.json
其中一段
{
"author": "tactics2100 (Ose)",
"title": "[正妹] 岡田さりお 開球",
"date": "2024-12-07 15:34:37",
"score": 46,
"pushes": 53,
"comments": 90,
"url": "https://www.ptt.cc/bbs/Beauty/M.1733556879.A.F3E.html",
"images": 2,
"file_urls": [
"http://i.imgur.com/SzEH8BS.jpg",
"http://i.imgur.com/KTiZOUy.jpg"
]
}
且在專案底下,也有自動產生images 資料夾
但就是沒有圖片
不知道是否跟這些 EXCEPTION 有關
--- <exception caught here> ---
C:\ProgramData\anaconda3\Lib\site-packages\twisted\internet\defer.py:1075:_runCallbacks
C:\ProgramData\anaconda3\Lib\site-packages\scrapy\pipelines\files.py:459:media_failed
]
NoneType: None
2024-12-09 22:39:02 [scrapy.pipelines.media] ERROR: [Failure instance: Traceback: <class 'scrapy.pipelines.files.FileException'>:
C:\ProgramData\anaconda3\Lib\site-packages\twisted\internet\defer.py:533:addCallbacks
C:\ProgramData\anaconda3\Lib\site-packages\twisted\internet\defer.py:1075:_runCallbacks
C:\ProgramData\anaconda3\Lib\site-packages\scrapy\pipelines\media.py:197:_check_media_to_download
C:\ProgramData\anaconda3\Lib\site-packages\twisted\internet\defer.py:533:addCallbacks
可以看看 "C:\images"
有沒有
或是把 FILES_STORE = 'images'
裡面的路徑改成相對或絕對路徑看看
"./images"
"D:\images"
應該是沒找到資料夾
您好:這之前都試過,就是沒圖片下載
且都有產生images資料夾,但裡面就是沒有檔案
#FILES_STORE = './images'
FILES_STORE = 'D:\images'
#FILES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)),'images')
allowed_domains = ["ptt.cc"]
加入 i.imgur.com 看看
或是不要設
您好:
試過
allowed_domains = ["ptt.cc"]
allowed_domains = ["ptt.cc","i.imgur.com"]
allowed_domains = []
但 都有建立images,但沒有圖片下載
FILES_STORE = 'path/to/your/files'
MEDIA_ALLOW_REDIRECTS = True # 若下載鏈接有重定向
另外我實際試
http://i.imgur.com/SzEH8BS.jpg
點了網址會變成
https://imgur.com/SzEH8BS
從這去看圖片網址是 .jpeg
也許是格式差異
https://i.imgur.com/SzEH8BS.jpeg