proxy_example.py
import json

import scrapy
from bs4 import BeautifulSoup


class ProxyExampleSpider(scrapy.Spider):
    name = 'proxy_example'
    allowed_domains = ['www.us-proxy.org']
    start_urls = ['http://www.us-proxy.org/']

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        trs = soup.select("#proxylisttable tr")
        for tr in trs:
            tds = tr.select("td")
            if len(tds) > 6:
                ip = tds[0].text
                port = tds[1].text
                anonymity = tds[4].text
                ifScheme = tds[6].text
                if ifScheme == 'yes':
                    scheme = 'https'
                else:
                    scheme = 'http'
                proxy = "%s://%s:%s" % (scheme, ip, port)
                # 'proxy', 'dont_retry' and 'download_timeout' are meta keys
                # handled by Scrapy's built-in downloader middlewares; the
                # '_proxy_*' keys just carry data along to the callback.
                meta = {
                    'port': port,
                    'proxy': proxy,
                    'dont_retry': True,
                    'download_timeout': 5,
                    '_proxy_scheme': scheme,
                    '_proxy_ip': ip,
                }
                print(meta)
                yield scrapy.Request('https://httpbin.org/ip',
                                     callback=self.proxy_check_available,
                                     meta=meta, dont_filter=True)

    def proxy_check_available(self, response):
        # A working proxy reports its own IP as the request's origin.
        proxy_ip = response.meta['_proxy_ip']
        if proxy_ip == json.loads(response.text)['origin']:
            yield {
                'scheme': response.meta['_proxy_scheme'],
                'proxy': response.meta['proxy'],
                'port': response.meta['port'],
            }
Then run:
scrapy crawl proxy_example -o proxy.json
However, the result of the second run does not overwrite the previous one;
it is appended below the original contents of the file, causing an error.
How can I fix this?
I haven't used scrapy.
Here is a requests-html version for your reference:
from requests_html import HTMLSession
import json

url = 'http://www.us-proxy.org/'

# Start the file as an empty JSON array so every run begins from scratch.
with open('result.json', 'w') as resultFile:
    json.dump([], resultFile)

r = HTMLSession().get(url)
rows = r.html.find('#proxylisttable tbody tr')
for row in rows:
    td = row.find('td')
    meta = {
        'port': td[1].text,
        'proxy': '{}://{}:{}'.format('https' if td[6].text == 'Yes' else 'http',
                                     td[0].text, td[1].text),
        # 'dont_retry': True,
        # 'download_timeout': 5,
        '_proxy_scheme': 'https' if td[6].text == 'Yes' else 'http',
        '_proxy_ip': td[0].text,
    }
    print(meta)
    try:
        # Send a test request through the proxy; if it works, httpbin
        # reports the proxy's IP as the origin.
        demo = HTMLSession().request('GET', 'http://httpbin.org/ip',
                                     proxies={'http': meta['proxy']}, timeout=5)
        if meta['_proxy_ip'] in demo.text:
            print(demo.text)
            # Read the current results, append the new proxy, write back.
            with open('result.json', 'r') as resultFile:
                datas = json.load(resultFile)
            datas.append(meta)
            with open('result.json', 'w') as resultFile:
                json.dump(datas, resultFile)
    except Exception:
        print('fail')
Result:
[
{
"port": "8080",
"proxy": "http://167.71.104.163:8080",
"_proxy_scheme": "http",
"_proxy_ip": "167.71.104.163"
},
{
"port": "80",
"proxy": "http://45.144.240.142:80",
"_proxy_scheme": "http",
"_proxy_ip": "45.144.240.142"
},
{
"port": "3129",
"proxy": "http://107.179.75.2:3129",
"_proxy_scheme": "http",
"_proxy_ip": "107.179.75.2"
},
{
"port": "3121",
"proxy": "http://64.137.110.52:3121",
"_proxy_scheme": "http",
"_proxy_ip": "64.137.110.52"
},
{
"port": "3121",
"proxy": "http://64.137.110.168:3121",
"_proxy_scheme": "http",
"_proxy_ip": "64.137.110.168"
},
{
"port": "8080",
"proxy": "http://208.118.229.134:8080",
"_proxy_scheme": "http",
"_proxy_ip": "208.118.229.134"
}
]
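A small design note on the snippet above: since all the results fit in memory anyway, a write-once variant avoids the read-append-write cycle per proxy and makes each run overwrite the file by construction. This is just a sketch under the same assumptions (same site, same hypothetical result.json filename):

from requests_html import HTMLSession
import json

results = []
r = HTMLSession().get('http://www.us-proxy.org/')
for row in r.html.find('#proxylisttable tbody tr'):
    td = row.find('td')
    scheme = 'https' if td[6].text == 'Yes' else 'http'
    proxy = '{}://{}:{}'.format(scheme, td[0].text, td[1].text)
    try:
        demo = HTMLSession().get('http://httpbin.org/ip',
                                 proxies={'http': proxy}, timeout=5)
        if td[0].text in demo.text:
            results.append({'port': td[1].text, 'proxy': proxy,
                            '_proxy_scheme': scheme, '_proxy_ip': td[0].text})
    except Exception:
        pass

# A single write at the end: each run replaces result.json entirely.
with open('result.json', 'w') as f:
    json.dump(results, f)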
I haven't used scrapy either,
but isn't this kind of problem simple enough to handle?
From your result you can tell the output is definitely not overwriting; it is appending.
So first check whether there is an overwrite-style option.
Even if there isn't one, you are running this from the command line anyway, so just delete the file before issuing the command. See the commands below.
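Concretely (reusing the spider name from the question): deleting the output file before the crawl works on any version, and if your Scrapy is 2.4 or newer there is also a capital -O option that overwrites the output file instead of appending to it:

rm -f proxy.json && scrapy crawl proxy_example -o proxy.json
scrapy crawl proxy_example -O proxy.json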