# -*- coding: utf-8 -*-
import time
import random
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from pprint import pprint
class TencentSpider(CrawlSpider):
    """Crawl 1111.com.tw job search results for the keyword 'python'.

    Follows pagination links and prints the title, detail URL and company
    of every job card on each result page.
    """

    name = '1111'
    start_urls = ['https://www.1111.com.tw/job-bank/job-index.asp?ks=python&page=1']

    # Link extractor for the page-number list at the bottom of each result page.
    page_list = LinkExtractor(restrict_xpaths=('//ul[@class="pagination"]/li'))

    rules = (
        Rule(page_list, callback='parse_item', follow=True),
    )

    # NOTE(review): remember to set ROBOTSTXT_OBEY = False in settings.py
    # NOTE(review): remember to keep this disabled >>>>> # allowed_domains = ['1111.com']

    def parse_item(self, response):
        """Parse one result page and print each job's title, URL and company.

        :param response: scrapy Response for a paginated search-result page.
        """
        # Crude random throttle between pages; DOWNLOAD_DELAY in settings
        # would be the idiomatic Scrapy way to do this.
        time.sleep(random.choice([1.1, 1.2, 1.5, 2.1, 1.8]))
        print('現在位置,', response.url)

        # One <div class="jbInfoin"> per job card.
        for card in response.xpath('//div[@class="jbInfoin"]'):
            title = card.xpath('./h3/a/@title').extract_first()
            href = card.xpath('./h3/a/@href').extract_first()
            # Company name; the @title attribute also carries category/address text.
            company = card.xpath('./h4/a/@title').extract_first()
            # Skip malformed cards instead of crashing the whole page parse
            # (the original indexed [0] and raised IndexError on missing nodes).
            if title is None or href is None or company is None:
                continue
            title = title.strip()
            # @href values are protocol-relative ("//www...."); prefix the scheme.
            href = 'https:' + href
            print(title, href, company)