As the title says.
settings.py
DOWNLOADER_MIDDLEWARES = {
    'get_proxy.middlewares.ProxyMiddleware': 740,
}

ITEM_PIPELINES = {
    'get_proxy.pipelines.GetProxyPipeline': 300,
    'get_proxy.pipelines.MongoPipeline': 400,
}
middlewares.py
from scrapy import signals
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from collections import defaultdict
from scrapy.exceptions import NotConfigured
import json
import random
from fake_useragent import UserAgent
class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # attach a proxy to the request object
        # runs before the spider fetches each page
        proxy = self.get_random_proxy()
        print('Request ip:' + proxy)
        request.meta['http_proxy'] = proxy

    def get_random_proxy(self):
        # read a random proxy from MongoDB
        import pymongo
        client = pymongo.MongoClient(host='localhost', port=27017)
        db = client.ip_proxy_pool  # database
        collection = db.pool  # collection
        get_num = 0
        count = collection.find({'scheme': {'$in': ['https']}}).count()
        print('Proxies left:', count)
        # ----- TODO: when no proxy can be fetched, the proxy-scraping job needs to be triggered here -----
        while get_num == 0:
            result = collection.aggregate([{'$sample': {'size': 1}}])
            for i in result:
                proxy = i['proxy']
                run = self.check_proxy(proxy)
                if run == True:
                    get_num += 1
                    return proxy
                else:
                    self.del_proxy(proxy)
    def check_proxy(self, proxy):
        # verify the proxy by fetching an IP-echo page through it
        from bs4 import BeautifulSoup
        import requests
        import random
        proxy = proxy.split('://')[1]
        proxies = {
            'http': 'http://' + proxy,
            'https': 'https://' + proxy,
        }
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        showmyip = 'https://www.showmyip.gr/'
        try:
            htmlfile = requests.get(showmyip, headers=headers, proxies=proxies, timeout=random.randint(4, 6))
            objSoup = BeautifulSoup(htmlfile.text, 'lxml')
            objTag = objSoup.find('div', 'starter-template container')
            myip = objTag.find('div', 'col-md-6 col-sm-12').find('h2').find('span').text.strip()
            print('ShowMyIP:', myip)
            if myip in proxy:
                print('Proxy works')
                return True
            else:
                print('Proxy failed')
                return False
        except requests.exceptions.ConnectionError as e:
            print('Error', e.args)
            return False
    def del_proxy(self, proxy):
        # remove a dead proxy from the pool
        import pymongo
        client = pymongo.MongoClient(host='localhost', port=27017)
        db = client.ip_proxy_pool  # database
        collection = db.pool  # collection
        collection.remove({'proxy': proxy})
        print('Deleted:', proxy)
The main program is proxy_example.py:
class ProxyExampleSpider(scrapy.Spider):
    name = 'proxy_example'
    allowed_domains = ['www.us-proxy.org']
    start_urls = ['http://www.us-proxy.org/']

    # def start_requests(self):
    #     yield scrapy.Request('http://www.us-proxy.org/', dont_filter=True)

    def parse(self, response):
        ...
After running scrapy crawl proxy_example, only the code in the middleware gets executed. What could be causing this? QQ
According to the official Downloader Middlewares documentation, process_request should return a value (you can also refer to an article I wrote earlier).
If this middleware keeps using the original request instance, returning None is all you need.
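Here is a minimal sketch of the fixed method, assuming the rest of your middleware stays the same. It also writes the proxy to request.meta['proxy'] instead of request.meta['http_proxy'], since 'proxy' is the key Scrapy's built-in HttpProxyMiddleware actually reads:

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        proxy = self.get_random_proxy()
        print('Request ip:' + proxy)
        # Scrapy's HttpProxyMiddleware looks for the 'proxy' key;
        # a 'http_proxy' key in meta is simply ignored
        request.meta['proxy'] = proxy
        # returning None tells the engine to continue processing this
        # request through the remaining middlewares and the downloader,
        # so the response eventually reaches your spider's parse()
        return None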