# -*- coding: utf-8 -*-
import time
import random
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from pprint import pprint
class TencentSpider(CrawlSpider):
    """Crawl 1111.com.tw job search results for the keyword 'python'.

    Follows pagination links and prints the title, detail URL and company
    of every job card on each result page.
    """

    name = '1111'
    start_urls = ['https://www.1111.com.tw/job-bank/job-index.asp?ks=python&page=1']

    # Link extractor for the page-number list at the bottom of each result page.
    page_list = LinkExtractor(restrict_xpaths=('//ul[@class="pagination"]/li'))

    rules = (
        Rule(page_list, callback='parse_item', follow=True),
    )

    # NOTE(review): remember to set ROBOTSTXT_OBEY = False in settings.py
    # NOTE(review): remember to keep this disabled >>>>> # allowed_domains = ['1111.com']

    def parse_item(self, response):
        """Parse one result page and print each job's title, URL and company.

        :param response: scrapy Response for a paginated search-result page.
        """
        # Crude random throttle between pages; DOWNLOAD_DELAY in settings
        # would be the idiomatic Scrapy way to do this.
        time.sleep(random.choice([1.1, 1.2, 1.5, 2.1, 1.8]))
        print('現在位置,', response.url)

        # One <div class="jbInfoin"> per job card.
        for card in response.xpath('//div[@class="jbInfoin"]'):
            title = card.xpath('./h3/a/@title').extract_first()
            href = card.xpath('./h3/a/@href').extract_first()
            # Company name; the @title attribute also carries category/address text.
            company = card.xpath('./h4/a/@title').extract_first()
            # Skip malformed cards instead of crashing the whole page parse
            # (the original indexed [0] and raised IndexError on missing nodes).
            if title is None or href is None or company is None:
                continue
            title = title.strip()
            # @href values are protocol-relative ("//www...."); prefix the scheme.
            href = 'https:' + href
            print(title, href, company)