iT邦幫忙

0

python爬蟲程式 請指點程式有何可以改進

小弟接觸python約4個月 ,非資工相關科系
首次發問
想請教各路大大
這樣的程式碼有甚麼建議修改的地方嗎?
感謝大大
想加多執行緒在程式內 ,還不知道要怎麼放進程式內 ,請再麻煩指點迷津
感謝
'''

import requests
from http import cookiejar
from lxml import etree
import os
import json
import multiprocessing as mp
import re

#轉換網頁Html
# Fetch a page and parse it into an lxml HTML tree
def lxml_html(url):
    """Download *url* and return the parsed ``lxml`` HTML element tree.

    Raises requests.HTTPError on a non-2xx response and
    requests.Timeout when the server does not answer in time.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
        , 'referer': 'https://c-h-c.com.tw/shop/'}

    # NOTE(review): the original passed a fresh, empty CookieJar on every
    # call, which is a no-op — requests does not persist cookies across
    # plain requests.get() calls anyway. Use requests.Session() if the
    # site ever needs cookie continuity.
    res = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    res.raise_for_status()
    # res.apparent_encoding could be used here if the site's charset
    # header were wrong; it looks correct for this site.
    return etree.HTML(res.text)

#建立指定路徑資料夾
# Create (if needed) the per-topic output folder
def set_folders(key):
    """Ensure ``./CHCCFolders/<key>/`` exists and return that path.

    *key* is expected to be an already-sanitized folder name
    (the caller strips non-word characters).
    """
    resource_path = r'./CHCCFolders/' + key + '/'
    # exist_ok=True avoids the check-then-create race of the original
    # os.path.exists() + os.makedirs() pair.
    os.makedirs(resource_path, exist_ok=True)
    return resource_path


# 轉json檔寫入指定路徑資料夾內
# Write one product dict as a UTF-8 JSON file in the topic folder
def dump_json_file(query_dict, file_name, resource_path):
    """Serialize *query_dict* to ``<resource_path>/<file_name>.json``.

    ensure_ascii=False keeps the Chinese text human-readable in the file.
    """
    # os.path.join avoids the doubled slash the original produced
    # (resource_path already ends with '/').
    target = os.path.join(resource_path, '{}.json'.format(file_name))
    with open(target, 'w', encoding='utf-8') as outfile:
        json.dump(query_dict, outfile, ensure_ascii=False)
    print('dump the data successfully')

#取得網頁所有的主題網址
# Collect every topic in the shop mega-menu and the product URLs it lists
def GetUrls():
    """Scrape the shop front page and return the product URLs per topic.

    Returns a list of dicts shaped
    ``{'title': <menu topic name>, 'url': [<product page url>, ...]}``.
    """
    html = lxml_html('https://c-h-c.com.tw/shop/')
    menu = '//li[@class="mega-menu-column mega-menu-columns-2-of-12"]/ul/li/a'
    topic_urls = html.xpath(menu + '/@href')
    topic_titles = html.xpath(menu + '/text()')

    # The original built an intermediate CHClist with an unused 'id' and
    # then looped over it again; zip() does it in one pass.
    content_all = []
    for title, topic_url in zip(topic_titles, topic_urls):
        topic_html = lxml_html(topic_url)
        product_urls = topic_html.xpath('//div[@class="un-product-thumbnail"]/a/@href')
        content_all.append({'title': title, 'url': product_urls})
    return content_all


#取得主題網頁後所有商品
def _first(items, default=None):
    """Return the first element of *items*, or *default* when it is empty."""
    return items[0] if items else default


# Visit every product page of every topic and dump one JSON file per product
def Get_ALL(content_all):
    """Scrape each product page listed in *content_all* and write it as JSON.

    Bug fix vs. the original: optional fields (Store, Brand, Introduction,
    Specification) are reset per product. Before, a swallowed IndexError
    left the *previous* product's value in the variable, so stale data
    leaked into the next JSON file; the bare ``except`` is gone too.
    """
    for topic in content_all:
        # Strip non-word characters so the title is a safe folder name.
        title = re.sub(r'\W', '', topic['title'])
        path = set_folders(title)
        for url in topic['url']:
            page = lxml_html(url)

            # Product name, sanitized the same way as the folder name.
            name = re.sub(r'\W', '', page.xpath('//h1[@class="product_title entry-title"]/text()')[0])

            # Sale price lives inside <ins>; fall back to the regular price.
            price = page.xpath('//p/ins/span[@class="woocommerce-Price-amount amount"]/text()')
            if not price:
                price = page.xpath('//p/span[@class="woocommerce-Price-amount amount"]/text()')

            # Short description: first <p> looks like the store line, second
            # the brand line (matches the original's [0]/[1] indexing).
            short_desc = page.xpath(
                '//div[@class="summary entry-summary"]/div[@class="woo-short-description"]/p/text()')
            store = short_desc[0].replace('\n', '') if len(short_desc) > 0 else []
            brand = short_desc[1].replace('\n', '') if len(short_desc) > 1 else []

            intro_h = _first(page.xpath('//div[@class="vc_tta-panel-body"]//div[@class="wpb_wrapper"]/h2/text()'))
            intro_p = _first(page.xpath('//div[@class="vc_tta-panel-body"]//div[@class="wpb_wrapper"]/p/text()'))
            introduction = [intro_h, intro_p] if intro_h is not None and intro_p is not None else []

            spec_h = _first(page.xpath('//div[@class="wpb_wrapper"]/h4/text()'))
            spec_p = _first(page.xpath('//div[@class="boxEditCont"]/p/text()'))
            specification = [spec_h, spec_p] if spec_h is not None and spec_p is not None else []

            # Main product photo; '' instead of a crash when it is missing.
            product_img = _first(page.xpath('//a[@class="photoswipe"]/img/@src'), default='')
            layout_img = page.xpath('//div[@class="wpb_wrapper"]//img/@src')

            content_json = {
                'title': title, 'Name': name, 'Price': price, 'Url': url,
                'Store': store, 'Brand': brand,
                'Introduction': introduction, 'Specification': specification,
                'Product': product_img, 'Layout': layout_img,
            }
            print(content_json)
            dump_json_file(content_json, name, path)

def main():
    """Entry point: gather every topic's product URLs, then scrape them all."""
    Get_ALL(GetUrls())


if __name__ == '__main__':
    main()

'''

想要挑戰自己的話 : 如果是想要爬完某個List,看你電腦是幾線程,可以一個執行緒各爬一個網站,接著用遞迴甚至非同步把一開始的網址迭代完。假如"沒有事先的List",可以幾個線程負責搜尋網頁,其他負責爬,應該會比較快。如果你做的是平行處理的話,可以改成用GPU進行運算。

1 個回答

我要發表回答

立即登入回答