[Day5] 爬蟲收集音檔(+ yt-dlp) - 3 - iT 邦幫忙::一起幫忙解決難題，拯救 IT 人的一天

2024 iThome 鐵人賽

DAY 5

自我挑戰組

菜鳥AI工程師給碩班學弟妹的挑戰系列第 5 篇

[Day5] 爬蟲收集音檔(+ yt-dlp) - 3

16th鐵人賽

jeremylee

2024-08-09 09:09:37

323 瀏覽

分享至

前情提要: 昨天已經把所有的DJ名稱和url儲存到mp3_urls.txt，今天我們將透過多個thread來並行下載，增加下載的速度。

1. 下載mp3

這裡流程很簡單:

開啟txt: 讀取每個DJ name跟url
透過ThreadPoolExecutor建立多個worker thread: 由executor把download_file這個function分派給各個thread並行執行
download_file: 先建立資料夾用於存放音檔，requests get獲取音檔，with open儲存音檔

import os
import requests
from concurrent.futures import ThreadPoolExecutor
import random
import time
# 創建資料夾，如果不存在
def create_folder(name):
    """Create directory ``name`` (including missing parents) if it is absent.

    ``exist_ok=True`` makes the call idempotent and removes the
    check-then-create race of an ``exists()`` / ``makedirs()`` pair: this
    function is called from several download threads at once, and two of them
    targeting the same folder could otherwise raise ``FileExistsError``.

    Args:
        name: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created (for example ``name``
            already exists as a regular file, or permission is denied).
    """
    os.makedirs(name, exist_ok=True)

# 下載文件並存儲到相應資料夾
def download_file(name_url_pair, timeout=30):
    """Download one audio file into a folder named after its DJ.

    Failures (network errors, HTTP error statuses, file-system errors) are
    printed and the function returns normally, so one bad URL does not stop
    the remaining downloads.

    Args:
        name_url_pair: A ``(name, url)`` pair. Spaces in ``name`` are replaced
            with underscores to form the folder name, and the last
            ``/``-separated segment of ``url`` is used as the file name.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block a worker thread forever.
    """
    name, url = name_url_pair
    folder_name = name.replace(" ", "_")
    file_name = os.path.join(folder_name, url.split("/")[-1])
    try:
        create_folder(folder_name)
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses as failures instead of saving the error
        # page to disk as if it were audio.
        response.raise_for_status()
        with open(file_name, 'wb') as file:
            file.write(response.content)
    except (requests.RequestException, OSError) as exc:
        print(f"Failed to download {url}: {exc}")

    # Pause after every attempt, successful or not; presumably this throttles
    # the request rate towards the server.
    time.sleep(3)

# 讀取txt並解析內容
def read_txt(file_path):
    """Read the DJ-name / URL list and return it as ``[name, url]`` pairs.

    Each useful line of the file has the format ``DJ_name\\turl\\n``. Blank
    lines and lines that do not contain exactly two tab-separated fields are
    skipped, because they cannot be unpacked into a ``(name, url)`` pair by
    the downloader.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of two-element lists ``[name, url]`` in file order, with
        surrounding whitespace removed from both fields.
    """
    name_url_pairs = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = [field.strip() for field in line.strip().split('\t')]
            # Keep only well-formed "name<TAB>url" records with non-empty parts.
            if len(fields) == 2 and all(fields):
                name_url_pairs.append(fields)
    return name_url_pairs


def main(txt_file):
    """Download every file listed in ``txt_file`` using a pool of threads.

    Args:
        txt_file: Path of the text file holding one ``name<TAB>url`` record
            per line, as parsed by ``read_txt``.
    """
    name_url_pairs = read_txt(txt_file)
    # Run the downloads in parallel on 8 worker threads.
    with ThreadPoolExecutor(max_workers=8) as executor:
        submitted = [
            (pair, executor.submit(download_file, pair))
            for pair in name_url_pairs
        ]
        # Inspect every future so that an unexpected exception raised inside a
        # worker is reported instead of being dropped; the other downloads
        # keep running either way.
        for pair, future in submitted:
            error = future.exception()
            if error is not None:
                print(f"Download task failed for {pair!r}: {error!r}")

if __name__ == "__main__":
    txt_file = "mp3_urls.txt"  # replace with the path to your own txt file
    main(txt_file)

2. yt-dlp

另外來介紹一下yt-dlp這套工具，它可以用於下載youtube影片，下載速度比youtube-dl還要快，也可以利用這個套件來蒐集yt的音檔。

基本上想要下載yt的影片只需以下指令，其他還包含指定畫質(720p, 480p…)，檔案類型(mp4, webm, …)，這些都可以上網找尋相關指令或者問gpt。

pip install yt-dlp # 直接pip安裝
yt-dlp -x <yt_url> # 單下載那頁url的音檔，如果想下載影片不用-x

那如果我想要下載整個playlist怎麼辦呢? 這裡提供用多進程(multi-process)下載的範例:

from multiprocessing import Pool
from functools import partial
from tqdm import tqdm
import yt_dlp
import os
import time

def download(index_row, output_dir):
    """Download the audio of one video and convert it to 16 kHz mono WAV.

    Args:
        index_row: An ``(index, video_url)`` pair. ``index`` is zero-padded to
            four digits and used as the prefix of the output file name.
        output_dir: Directory in which the output file is written.

    Returns:
        ``"Processed: <url>"`` on success, or ``"Failed: <url> (<error>)"``
        when the download raised, so the caller can tell the two apart.
    """
    index, video_url = index_row

    # Download options for yt-dlp.
    ydl_opts = {
        'quiet' : True,
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(output_dir, f'{index:04d}_%(title)s.%(ext)s'),
        'postprocessors': [{  # post-process with ffmpeg, converting to wav
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '0',
        }],
        'postprocessor_args': [
            '-ar', '16000',  # sample rate 16kHz
            '-ac', '1',      # downmix to a single channel
        ],
        # 'ffmpeg_location': 'ffmpeg' # on Windows, set this to the path of the ffmpeg binary
    }

    with yt_dlp.YoutubeDL(ydl_opts, auto_init = 'no_verbose_header') as ydl:
        try:
            ydl.download([video_url])
        except Exception as exc:
            # Best effort: one broken video must not abort the whole playlist.
            # Catching Exception (not a bare except) still lets
            # KeyboardInterrupt / SystemExit stop the worker.
            status = f"Failed: {video_url} ({exc})"
        else:
            status = f"Processed: {video_url}"

    # Rest for 3 seconds between downloads.
    time.sleep(3)
    return status

def fetch_video_urls_from_playlist(playlist_url):
    """Return the URLs of the downloadable videos in a playlist.

    The playlist is listed with ``extract_flat`` so that only metadata is
    fetched. Entries whose title marks them as private or deleted are left
    out because downloading them fails.

    Args:
        playlist_url: URL of the playlist to list.

    Returns:
        A list of video URLs. The list is empty (or partial) when extraction
        fails; the error is printed rather than raised.
    """
    # Placeholder titles reported for videos that cannot be downloaded.
    unavailable_titles = ('[Private video]', '[Deleted video]')
    ydl_opts = {
        'quiet': True,
        'extract_flat': True,
        'skip_download': True
    }
    video_urls = []
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            playlist_info = ydl.extract_info(playlist_url, download = False)
            entries = (playlist_info or {}).get('entries') or []
            for entry in entries:
                # Skip empty entries and ones lacking a usable URL instead of
                # letting a KeyError abort the rest of the playlist.
                if not entry or not entry.get('url'):
                    continue
                if entry.get('title') in unavailable_titles:
                    continue
                video_urls.append(entry['url'])
        except Exception as exc:
            print(f"Failed to list playlist {playlist_url}: {exc}")
    return video_urls

def main():
    """Download the audio of every playlist listed in ``playlist_urls.txt``.

    The file holds one playlist URL per line; blank lines are ignored. The
    audio of the N-th playlist (counting from 0) is written to the directory
    ``yt_list_NNN``, and its videos are downloaded by a pool of worker
    processes while a progress bar is shown.
    """
    # playlist_urls.txt holds the playlist URLs to download, one per line.
    with open('playlist_urls.txt', 'r', encoding = 'utf-8') as f_i:
        playlist_urls = [line.strip() for line in f_i if line.strip()]

    num_processes = 8  # number of parallel worker processes
    for idx, playlist_url in enumerate(playlist_urls):
        output_dir = f'yt_list_{idx:03d}'
        # Create the output directory if it does not exist yet.
        os.makedirs(output_dir, exist_ok = True)

        # Collect the URL of every video in the playlist.
        video_urls = fetch_video_urls_from_playlist(playlist_url)
        if not video_urls:
            # Nothing to download; do not start worker processes for nothing.
            continue

        # Download the audio files with multiple processes.
        download_trim = partial(download, output_dir = output_dir)
        # The context manager terminates the workers if anything below
        # raises, so the pool cannot leak.
        with Pool(num_processes) as p:
            with tqdm(total = len(video_urls), leave = False) as pbar:
                for _ in p.imap_unordered(download_trim, enumerate(video_urls)):
                    pbar.update()

            # Normal path: let the workers finish and exit cleanly.
            p.close()
            p.join()

if __name__ == "__main__":
    # Run the playlist downloader only when this file is executed as a script.
    main()

上面剛好兩個範例講到multi-thread跟multi-process，可以再去了解兩者的差異。基本上應用在單純下載時兩者效果差不多，因為瓶頸在於網路頻寬；但如果是用於大量計算，由於Python的GIL讓多個thread無法同時執行Python程式碼，通常multi-process速度會快很多。

今天就先到這囉~ 明天開始應該就會開始講pytorch相關，爬蟲就先告一個段落囉~