iT邦幫忙

0

python爬蟲程式 請指點程式有何可以改進

小弟接觸python約4個月 ,非資工相關科系
首次發問
想請教各路大大
這樣的程式碼有甚麼建議修改的地方嗎?
感謝大大
想加多執行緒在程式內 ,還不知道要怎麼放進程式內 ,請再麻煩指點迷津
感謝
'''

import requests
from http import cookiejar
from lxml import etree
import os
import json
import multiprocessing as mp
import re

#轉換網頁Html
# Fetch a page and parse it into an lxml HTML tree
def lxml_html(url):
    """Download *url* and return the parsed ``lxml`` HTML element tree.

    Raises requests.HTTPError on a non-2xx response and
    requests.Timeout when the server does not answer in time.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
        , 'referer': 'https://c-h-c.com.tw/shop/'}

    # NOTE(review): the original passed a fresh, empty CookieJar on every
    # call, which is a no-op — requests does not persist cookies across
    # plain requests.get() calls anyway. Use requests.Session() if the
    # site ever needs cookie continuity.
    res = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    res.raise_for_status()
    # res.apparent_encoding could be used here if the site's charset
    # header were wrong; it looks correct for this site.
    return etree.HTML(res.text)

#建立指定路徑資料夾
# Create (if needed) the per-topic output folder
def set_folders(key):
    """Ensure ``./CHCCFolders/<key>/`` exists and return that path.

    *key* is expected to be an already-sanitized folder name
    (the caller strips non-word characters).
    """
    resource_path = r'./CHCCFolders/' + key + '/'
    # exist_ok=True avoids the check-then-create race of the original
    # os.path.exists() + os.makedirs() pair.
    os.makedirs(resource_path, exist_ok=True)
    return resource_path


# 轉json檔寫入指定路徑資料夾內
# Write one product dict as a UTF-8 JSON file in the topic folder
def dump_json_file(query_dict, file_name, resource_path):
    """Serialize *query_dict* to ``<resource_path>/<file_name>.json``.

    ensure_ascii=False keeps the Chinese text human-readable in the file.
    """
    # os.path.join avoids the doubled slash the original produced
    # (resource_path already ends with '/').
    target = os.path.join(resource_path, '{}.json'.format(file_name))
    with open(target, 'w', encoding='utf-8') as outfile:
        json.dump(query_dict, outfile, ensure_ascii=False)
    print('dump the data successfully')

#取得網頁所有的主題網址
# Collect every topic in the shop mega-menu and the product URLs it lists
def GetUrls():
    """Scrape the shop front page and return the product URLs per topic.

    Returns a list of dicts shaped
    ``{'title': <menu topic name>, 'url': [<product page url>, ...]}``.
    """
    html = lxml_html('https://c-h-c.com.tw/shop/')
    menu = '//li[@class="mega-menu-column mega-menu-columns-2-of-12"]/ul/li/a'
    topic_urls = html.xpath(menu + '/@href')
    topic_titles = html.xpath(menu + '/text()')

    # The original built an intermediate CHClist with an unused 'id' and
    # then looped over it again; zip() does it in one pass.
    content_all = []
    for title, topic_url in zip(topic_titles, topic_urls):
        topic_html = lxml_html(topic_url)
        product_urls = topic_html.xpath('//div[@class="un-product-thumbnail"]/a/@href')
        content_all.append({'title': title, 'url': product_urls})
    return content_all


#取得主題網頁後所有商品
def _first(items, default=None):
    """Return the first element of *items*, or *default* when it is empty."""
    return items[0] if items else default


# Visit every product page of every topic and dump one JSON file per product
def Get_ALL(content_all):
    """Scrape each product page listed in *content_all* and write it as JSON.

    Bug fix vs. the original: optional fields (Store, Brand, Introduction,
    Specification) are reset per product. Before, a swallowed IndexError
    left the *previous* product's value in the variable, so stale data
    leaked into the next JSON file; the bare ``except`` is gone too.
    """
    for topic in content_all:
        # Strip non-word characters so the title is a safe folder name.
        title = re.sub(r'\W', '', topic['title'])
        path = set_folders(title)
        for url in topic['url']:
            page = lxml_html(url)

            # Product name, sanitized the same way as the folder name.
            name = re.sub(r'\W', '', page.xpath('//h1[@class="product_title entry-title"]/text()')[0])

            # Sale price lives inside <ins>; fall back to the regular price.
            price = page.xpath('//p/ins/span[@class="woocommerce-Price-amount amount"]/text()')
            if not price:
                price = page.xpath('//p/span[@class="woocommerce-Price-amount amount"]/text()')

            # Short description: first <p> looks like the store line, second
            # the brand line (matches the original's [0]/[1] indexing).
            short_desc = page.xpath(
                '//div[@class="summary entry-summary"]/div[@class="woo-short-description"]/p/text()')
            store = short_desc[0].replace('\n', '') if len(short_desc) > 0 else []
            brand = short_desc[1].replace('\n', '') if len(short_desc) > 1 else []

            intro_h = _first(page.xpath('//div[@class="vc_tta-panel-body"]//div[@class="wpb_wrapper"]/h2/text()'))
            intro_p = _first(page.xpath('//div[@class="vc_tta-panel-body"]//div[@class="wpb_wrapper"]/p/text()'))
            introduction = [intro_h, intro_p] if intro_h is not None and intro_p is not None else []

            spec_h = _first(page.xpath('//div[@class="wpb_wrapper"]/h4/text()'))
            spec_p = _first(page.xpath('//div[@class="boxEditCont"]/p/text()'))
            specification = [spec_h, spec_p] if spec_h is not None and spec_p is not None else []

            # Main product photo; '' instead of a crash when it is missing.
            product_img = _first(page.xpath('//a[@class="photoswipe"]/img/@src'), default='')
            layout_img = page.xpath('//div[@class="wpb_wrapper"]//img/@src')

            content_json = {
                'title': title, 'Name': name, 'Price': price, 'Url': url,
                'Store': store, 'Brand': brand,
                'Introduction': introduction, 'Specification': specification,
                'Product': product_img, 'Layout': layout_img,
            }
            print(content_json)
            dump_json_file(content_json, name, path)

def main():
    """Entry point: gather every topic's product URLs, then scrape them all."""
    Get_ALL(GetUrls())


if __name__ == '__main__':
    main()

'''

想要挑戰自己的話 : 如果是想要爬完某個List,看你電腦是幾線程,可以一個執行緒各爬一個網站,接著用遞迴甚至非同步把一開始的網址迭代完。假如"沒有事先的List",可以幾個線程負責搜尋網頁,其他負責爬,應該會比較快。如果你做的是平行處理的話,可以改成用GPU進行運算。

1 個回答

我要發表回答

立即登入回答