Python爬蟲練習 Instagram follow(ing/er) list crawler

python python爬蟲 instagram

sn0wl3r0ker 2022-08-13 23:01:56 ‧ 3934 瀏覽

分享至

github網址

學習筆記

複習 Python 爬蟲寫法
比對兩筆資料(difflib)寫法
pickle儲存session
try except寫報錯方法
路徑、讀寫文件、生成Excel(openpyxl->workbook)寫法
待辦：更新用法說明、補上 README.md

main.py

# from pprint import pprint
from bs4 import BeautifulSoup
import requests, pickle
import re
from config import username,password,headers,url,ajax_url,p_url,path
from datetime import datetime
from openpyxl import Workbook
import compare
import os, sys

def create_folder(path):
    """Create the directory *path*, including missing parents, unless it already exists."""
    # exist_ok=True turns the call into a no-op for an existing directory;
    # a non-directory entry at *path* still raises FileExistsError.
    os.makedirs(path, exist_ok=True)

def ask_excel(ask_option):
    """Ask a yes/no question on the console until a valid answer is given.

    ask_option is interpolated into the prompt text.
    Returns 1 for 'y', 'Y' or 'yes', and 0 for 'n', 'N', 'no' or an empty
    line (just pressing ENTER). Any other answer prints a hint and re-prompts.
    """
    affirmative = ('y', 'Y', 'yes')
    negative = ('n', 'N', 'no', '')
    while True:
        answer = input(f'Do u want {ask_option}? y/n [n]: ')
        if answer in affirmative:
            return 1
        if answer in negative:
            return 0
        print('plz enter y or n or ENTER!!!')

def do_excel(uid,date,opt_title,option,root_json,path):
    """Write the user list in root_json['users'] to an .xlsx workbook.

    The file is saved as path + '<uid><date><opt_title[option]>.xlsx'.
    The first row is a header; each following row holds the '@'-prefixed
    username, the full name and the profile picture URL with '.jpg' appended.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.append(['username', 'full_name', 'profile_pic'])
    for user in root_json['users']:
        sheet.append([
            '@' + user['username'],
            user['full_name'],
            user['profile_pic_url'] + '.jpg',
        ])
    workbook.save(path + f'{uid}{date}{opt_title[option]}.xlsx')

def do_txt(uid,date,opt_title,option,root_json,path):
    """Write the user list in root_json['users'] to a UTF-8 text file.

    The file is saved as path + '<uid><date><opt_title[option]>.txt'. Each user
    becomes one line holding the repr of the tuple ('@username', full_name);
    the final line is a 'Total: N records!' footer. The record count is also
    printed to the console.
    """
    out_name = path + f'{uid}{date}{opt_title[option]}.txt'
    total = 0
    with open(out_name, 'w+', encoding='utf-8') as out:
        # enumerate(start=1) leaves `total` equal to the number of users written.
        for total, user in enumerate(root_json['users'], start=1):
            record = ('@' + user['username'], user['full_name'])
            out.write(str(record) + '\n')
        out.write(f'Total: {total} records!')
    print(f'Got {total} records!!!')

def main():
    """Interactively fetch a following/followers list and save it to disk.

    Steps:
      1. Prompt for the target account id, the list type and a maximum count.
      2. Log in with the credentials from config.py, or reuse the pickled
         cookies stored in '<path><username>session.pkl'.
      3. Look up the target's numeric user id and request the list.
      4. Write the result as .txt (always) and .xlsx (optional), then
         optionally diff the new .txt against an older one via compare.py.

    Exits through sys.exit() when the user id lookup or the list request
    fails; the cached session file is removed on a failed user id lookup so
    the next run logs in again.
    """
    now = datetime.now()
    date = now.strftime("%Y%m%d-%H%M")
    timestamp = int(now.timestamp())
    payload = {
        'username': f'{username}',
        'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{timestamp}:{password}',
        'queryParams': {},
        'optIntoOneTap': 'false'
    }
    print(f'If target is private account, you have to follow it first!!!')
    while True:
        # Strip stray whitespace so a blank-looking answer is rejected too.
        uid = input('Enter id: ').strip()
        if uid == '':
            print(f'Do not leave blank!!!')
        else:
            break

    # Accepted answers -> API path segment; an empty answer defaults to 'following'.
    opt_list = {'':'following','1':'following','2':'followers','following':'following','followers':'followers'}
    while True:
        option = input('following[1]/followers[2] [1]: ')
        if option in opt_list:
            option = opt_list[option]
            break
        print(f'enter 1 or 2 or following or followers!!!')

    try:
        fcount = int(input('Enter max num of following/followers [2000]: '))
    except ValueError:
        # Empty or non-numeric answer: fall back to the advertised default.
        fcount = 2000

    # Short suffixes used in the output file names.
    opt_title = {
        'following': 'fwi',
        'followers': 'fwr',}
    ask = ask_excel('Excel file(will have profile pic)')

    session_file = f'{path}{username}session.pkl'
    with requests.session() as session:
        if not os.path.exists(session_file):
            print('Getting sessions')
            # The first GET yields the initial cookies and the CSRF token.
            res = session.get(url)
            tokens = re.findall(r"csrf_token\":\"(.*?)\"",res.text)
            if not tokens:
                print(f'error while fetching csrf token from login page, plz try again later!!!')
                sys.exit()
            csrf = tokens[0]
            cookies = res.cookies
            cookies['csrf'] = csrf
            headers['x-csrftoken'] = csrf
            # Log in with those cookies/token, then persist the session cookies.
            # NOTE(review): the login response is not inspected, so cookies get
            # cached even when the login was rejected — confirm whether the
            # response body should be checked here.
            session.post(ajax_url, data=payload, headers=headers, cookies=cookies)
            with open(session_file, 'wb') as f:
                pickle.dump(session.cookies, f)
        else:
            print('Reloading sessions and updating cookies')
            with open(session_file, 'rb') as f:
                # SECURITY: pickle.load can execute arbitrary code; only load a
                # session file that this script wrote itself.
                session.cookies.update(pickle.load(f))
            # RequestsCookieJar.update() returns None; the restored cookies
            # live on the session, so no per-request cookies are passed.
            cookies = None
            headers['x-csrftoken'] = session.cookies['csrftoken']
        headers['Referer'] = f'https://www.instagram.com/{uid}/following/'

        fsi = session.get(p_url+uid,cookies=cookies,headers=headers)

        # The second "id" match is used as the target's numeric user id.
        # '236' in that position or in the last match is treated as a failed
        # lookup, as is a response with fewer than two matches.
        ids = re.findall(r"id\":\"(.*?)\"",fsi.text)
        if len(ids) < 2 or '236' in (ids[1], ids[-1]):
            # Drop the cached session so the next run performs a fresh login.
            if os.path.exists(session_file):
                os.remove(session_file)
            print(f'error while checking userid')
            print(f'1.plz check the target username(no @)!!!')
            print(f'2.Make sure u set the right USERNAME and PASSWORD in *config.py* file!!!')
            print(f'3.your account might block by instagram server, plz try again later or change your ip!!')
            sys.exit()
        friendid = ids[1]
        print(f"userid:{friendid}")

        # Query parameters: 'count' requests up to fcount entries in a single
        # call instead of paging through the list with 'max_id'.
        params = {
        'count': fcount,
        'max_id': '',
        'search_surface': 'follow_list_page'}
        response = session.get(f'https://i.instagram.com/api/v1/friendships/{friendid}/{option}/', params=params, cookies=cookies, headers=headers)
        try:
            root_json = response.json()
        except ValueError as jsonError:
            # Every JSON decode error raised by response.json() is a ValueError
            # subclass, and requests.exceptions.JSONDecodeError is absent from
            # older requests releases.
            print(f'Error when processing json file: {jsonError}')
            print(f'1.Make sure u set the right USERNAME and PASSWORD in *config.py* file!!!')
            print(f'2.your account might block by instagram server, plz try again later or change your ip!!')
            sys.exit()

    # do_excel/do_txt index root_json['users']; stop with a readable message
    # instead of a KeyError traceback when the reply carries no user list.
    if not isinstance(root_json, dict) or 'users' not in root_json:
        print(f'Error: no "users" list in the server response')
        print(f'1.Make sure u set the right USERNAME and PASSWORD in *config.py* file!!!')
        print(f'2.your account might block by instagram server, plz try again later or change your ip!!')
        sys.exit()

    if ask == 1:
        try:
            do_excel(uid,date,opt_title,option,root_json,path)
        except IOError as error:
            print(f'Error when generate Excel file:{error}')
    do_txt(uid,date,opt_title,option,root_json,path)
    ask2 = ask_excel('compare with old file')
    if ask2 == 1:
        try:
            f1 = path+input(f'Enter first filename(older file): ')+'.txt'
            f2 = path+f'{uid}{date}{opt_title[option]}'+'.txt'
            compare.compare_file(f1, f2)
        except IOError as error:
            print(f'Error when generate compared.txt file:{error}')
            print(f'Make sure u have the file existed and enter the right filename(without .txt)!!!')
            sys.exit()

# Script entry point: make sure the data folder exists, refuse to run with the
# placeholder credentials from config.py, then start the crawler.
if __name__ == '__main__':
    create_folder(path)
    if username == 'USERNAME or EMAIL':
        print(f'plz go config.py to set your USERNAME and PASSWORD')
        # sys.exit() flushes stdio buffers on the way out; os._exit() does
        # not, so the hint above could be lost when stdout is piped.
        sys.exit(0)
    main()

compare.py

import sys
import difflib
import os
from config import path


def create_folder(path):
    """Ensure that the directory *path* exists, creating parent directories as needed."""
    if os.path.isdir(path):
        return
    os.makedirs(path)

# Read a UTF-8 text file and return its content as a list of lines.
def read_file(file_name):
    """Return the lines of *file_name* without their line terminators.

    The file is decoded as UTF-8. If it cannot be opened or read (IOError),
    the error is printed and the program exits via sys.exit().
    """
    try:
        # The context manager closes the handle even if read() raises.
        with open(file_name, 'r', encoding="utf-8") as file_handle:
            return file_handle.read().splitlines()  # split the content into lines
    except IOError as error:
        print('Read file Error: {0}'.format(error))
        sys.exit()


# Compare two text files; save the changed lines as text and a full diff as HTML.
def compare_file(file1_name, file2_name):
    """Diff two text files and store the result under the configured data path.

    file1_name is the older file and file2_name the newer one. Added and
    removed lines (difflib.Differ lines starting with '+' or '-') are written
    to path + 'compared.txt' and printed; when there are none, a notice is
    printed instead and compared.txt is left untouched. A side-by-side HTML
    diff is always written to path + 'result.html'.

    Exits via sys.exit() when either file name is empty; read_file() handles
    unreadable files. Write errors are reported on the console and otherwise
    ignored.
    """
    if file1_name == "" or file2_name == "":
        print('文件路徑不能為空: file1_name的路徑為: {0}, file2_name的路徑為: {1} .'.format(file1_name, file2_name))
        sys.exit()
    text1_lines = read_file(file1_name)
    text2_lines = read_file(file2_name)
    print_list = ['+','-']
    no_print_list = ['!']
    d = difflib.Differ()
    # Keep only added/removed lines and skip any line ending in '!' —
    # presumably to ignore a 'Total: N records!' footer; verify against the
    # files being compared.
    diff_print = [a for a in d.compare(text1_lines, text2_lines) if a[0] in print_list and a[-1] not in no_print_list ]
    if diff_print:
        print('record of compare will be storage in compared.txt and result.html !')
        try:
            with open(path+'compared.txt', 'w', encoding="utf-8") as result_file:
                result_file.write('\n'.join(diff_print))
            print('\n'.join(diff_print))
        except IOError as error:
            # The message now names the file that is actually written (compared.txt).
            print('寫入compared.txt文件時發生錯誤:{0}'.format(error))

    else:
        print(f"It's all same as old file")
    diff = difflib.HtmlDiff()  # create the HtmlDiff object
    result = diff.make_file(text1_lines, text2_lines)  # make_file returns the comparison as a complete HTML document
    # Save the HTML result to result.html.
    try:
        with open(path+'result.html', 'w', encoding="utf-8") as result_file:
            result_file.write(result)
    except IOError as error:
        print('寫入html文件錯誤:{0}'.format(error))


# Standalone use: ask for two file names (without the .txt extension) located
# in the data folder and compare them.
if __name__ == '__main__':
    create_folder(path)
    older_file = ''.join([path, input(f'Enter first filename(older file): '), '.txt'])
    newer_file = ''.join([path, input(f'Enter second filename(newer file): '), '.txt'])
    compare_file(older_file, newer_file)

config.py

# Instagram credentials used for the login request.
# The placeholder username below is what main.py checks for before running.
# NOTE: these are stored in plain text; keep this file out of version control.
username = 'USERNAME or EMAIL'
password = 'PASSWORD'

# Endpoints: login page, login AJAX endpoint, and profile-info lookup (the
# target username is appended to p_url).
url = 'https://www.instagram.com/accounts/login/'
ajax_url = 'https://www.instagram.com/accounts/login/ajax/'
p_url = 'https://i.instagram.com/api/v1/users/web_profile_info/?username='

# Folder (with trailing slash) where session and output files are stored.
path = './data/'

# HTTP headers sent with every request.
headers = {
    'authority': 'www.instagram.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-TW,zh;q=0.9',
    'dnt': '1',
    'sec-ch-prefers-color-scheme': 'dark',
    'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'sec-gpc': '1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Instagram 105.0.0.11.118 (iPhone11,8; iOS 12_3_1; en_US; en-US; scale=2.00; 828x1792; 165586599)',
    'viewport-width': '1707',
    'X-Requested-With': 'XMLHttpRequest',
}