Scrapy: main spider won't run after setting up a downloader middleware

As the title says.

settings.py

DOWNLOADER_MIDDLEWARES = {
    'get_proxy.middlewares.ProxyMiddleware': 740,
}

ITEM_PIPELINES = {
    'get_proxy.pipelines.GetProxyPipeline': 300,
    'get_proxy.pipelines.MongoPipeline': 400
}

middlewares.py

from scrapy import signals
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from collections import defaultdict
from scrapy.exceptions import NotConfigured
import json
import random
from fake_useragent import UserAgent

class ProxyMiddleware(object):

    def process_request(self, request, spider):
        # Attach a proxy to the request object.
        # Runs before every page the spider fetches.
        proxy = self.get_random_proxy()
        print('Request ip: ' + proxy)
        request.meta['http_proxy'] = proxy

    def get_random_proxy(self):
        # Read a random proxy from the pool.
        import pymongo
        client = pymongo.MongoClient(host='localhost', port=27017)
        db = client.ip_proxy_pool  # database
        collection = db.pool  # collection
        get_num = 0
        # count() works on PyMongo 3.x; count_documents() is the modern equivalent.
        count = collection.find({'scheme': {'$in': ['https']}}).count()
        print('Proxies remaining:', count)
        # ----- TODO: when no IP can be obtained, the IP-scraping program should be started -----
        while get_num == 0:
            result = collection.aggregate([{'$sample': {'size': 1}}])
            for i in result:
                proxy = i['proxy']
                if self.check_proxy(proxy):
                    get_num += 1
                    return proxy
                else:
                    self.del_proxy(proxy)

    def check_proxy(self, proxy):
        from bs4 import BeautifulSoup
        import requests
        import random
        proxy = proxy.split('://')[1]
        proxies = {
            'http': 'http://' + proxy,
            'https': 'https://' + proxy,
        }
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        showmyip = 'https://www.showmyip.gr/'
        try:
            htmlfile = requests.get(showmyip, headers=headers, proxies=proxies, timeout=random.randint(4, 6))
            objSoup = BeautifulSoup(htmlfile.text, 'lxml')
            objTag = objSoup.find('div', 'starter-template container')
            myip = objTag.find('div', 'col-md-6 col-sm-12').find('h2').find('span').text.strip()
            print('ShowMyIP:', myip)
            if myip in proxy:
                print('Proxy works')
                return True
            else:
                print('Proxy failed')
                return False
        except requests.exceptions.RequestException as e:
            # RequestException also covers read timeouts, which ConnectionError alone would miss.
            print('Error', e.args)
            return False

    def del_proxy(self, proxy):
        import pymongo
        client = pymongo.MongoClient(host='localhost', port=27017)
        db = client.ip_proxy_pool  # database
        collection = db.pool  # collection
        # remove() is deprecated in newer PyMongo; delete_many() is the modern call.
        collection.remove({'proxy': proxy})
        print('Deleted:', proxy)

The main spider is proxy_example.py:

import scrapy


class ProxyExampleSpider(scrapy.Spider):
    name = 'proxy_example'
    allowed_domains = ['www.us-proxy.org']
    start_urls = ['http://www.us-proxy.org/']

    # def start_requests(self):
    #     yield scrapy.Request('http://www.us-proxy.org/', dont_filter=True)

    def parse(self, response):
        ...

After running scrapy crawl proxy_example, only the middleware code gets executed and the spider itself never runs. What could be causing this?

1 Answer

Rex Chien ‧ 2019-12-15 23:45:24

According to the official Downloader Middlewares documentation, process_request() should have a return value: either None, a Response or Request object, or it should raise IgnoreRequest (you can also refer to an article I wrote earlier).

If this middleware is going to keep using the original request instance, simply returning None is enough.
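As a minimal sketch (assuming, as in the code above, that the pool stores full proxy URLs such as 'https://1.2.3.4:8080'), the method could look like this. Note that Scrapy's built-in HttpProxyMiddleware reads request.meta['proxy'], not request.meta['http_proxy'], so the key used in the original code never actually applies the proxy:

def process_request(self, request, spider):
    # Attach a random proxy before each request is downloaded.
    proxy = self.get_random_proxy()
    # Scrapy's HttpProxyMiddleware honours the 'proxy' meta key;
    # 'http_proxy' is silently ignored.
    request.meta['proxy'] = proxy
    # Returning None lets the request continue through the remaining
    # downloader middlewares and on to the downloader.
    return None

Python returns None implicitly anyway, but making it explicit documents that the middleware intends to pass the original request along unchanged.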
