前情提要: 昨天已經把所有的DJ名稱和url儲存到mp3_urls.txt,今天我們將透過多個thread來並行下載,增加下載的速度。
這裡流程很簡單:
import os
import requests
from concurrent.futures import ThreadPoolExecutor
import random
import time
def create_folder(name):
    """Create the directory ``name`` (and any missing parents).

    Safe to call repeatedly and from several threads at once: an already
    existing directory is not an error.

    Args:
        name: Path of the directory to create.

    Raises:
        FileExistsError: If ``name`` exists but is not a directory.
        OSError: If the directory cannot be created for another reason.
    """
    # exist_ok=True avoids the check-then-create race between worker threads
    # that want the same folder at the same time.
    os.makedirs(name, exist_ok=True)
def download_file(name_url_pair):
    """Download one file into a folder named after its DJ.

    The folder name is the DJ name with spaces replaced by underscores and
    the file name is the last path segment of the URL. The body is streamed
    to a temporary ``.part`` file and renamed only after a complete,
    successful transfer, so an interrupted download never leaves a
    truncated file that looks valid.

    Args:
        name_url_pair: A two-item iterable ``(name, url)``.

    Returns:
        None. Failures are reported on stdout instead of being raised, so
        one bad entry cannot stop the other downloads.
    """
    # Reject malformed entries (e.g. produced by a blank line in the list).
    try:
        name, url = name_url_pair
    except (TypeError, ValueError):
        print(f"Skipping malformed entry: {name_url_pair!r}")
        return
    if not name or not url:
        print(f"Skipping incomplete entry: {name_url_pair!r}")
        return

    base_name = url.split("/")[-1]
    if not base_name:
        # URL ends with '/', so there is nothing to use as a file name.
        print(f"Skipping {url}: cannot derive a file name")
        return

    folder_name = name.replace(" ", "_")
    file_name = os.path.join(folder_name, base_name)
    part_name = file_name + ".part"

    try:
        create_folder(folder_name)
    except OSError as exc:
        # Another worker may have created the folder first; only give up
        # when it really is not there.
        if not os.path.isdir(folder_name):
            print(f"Cannot create folder {folder_name}: {exc}")
            return

    try:
        # timeout keeps a stalled server from blocking this worker forever;
        # stream=True avoids holding the whole file in memory.
        with requests.get(url, stream=True, timeout=30) as response:
            # Do not save HTTP error pages as if they were audio.
            response.raise_for_status()
            with open(part_name, "wb") as file:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    file.write(chunk)
        os.replace(part_name, file_name)
    except (requests.RequestException, OSError) as exc:
        print(f"Failed to download {url}: {exc}")
        if os.path.exists(part_name):
            os.remove(part_name)
    finally:
        # Pause between requests to be gentle on the server.
        time.sleep(3)
def read_txt(file_path):
    """Read the DJ list and return its ``[name, url]`` pairs.

    Each useful line has the form ``DJ_name<TAB>url``. Blank lines are
    ignored; lines that do not contain exactly two non-empty tab-separated
    fields are reported on stdout and skipped, so every returned item can
    be unpacked as ``name, url``.

    Args:
        file_path: Path of the UTF-8 encoded text file.

    Returns:
        A list of two-element lists ``[name, url]`` in file order.
    """
    name_url_pairs = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            fields = stripped.split('\t')
            if len(fields) != 2 or not all(fields):
                print(
                    f"Skipping malformed line {line_number} "
                    f"in {file_path}: {stripped!r}"
                )
                continue
            name_url_pairs.append(fields)
    return name_url_pairs
def main(txt_file, max_workers=8):
    """Download every entry listed in ``txt_file`` using a thread pool.

    Args:
        txt_file: Path of the tab-separated ``name<TAB>url`` list.
        max_workers: Number of download threads to run in parallel.
    """
    name_url_pairs = read_txt(txt_file)
    # Run the downloads in parallel on several threads.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        submitted = [
            (pair, executor.submit(download_file, pair))
            for pair in name_url_pairs
        ]
        # Inspect every future: otherwise an exception raised inside a
        # worker would be dropped without any trace.
        for pair, future in submitted:
            error = future.exception()
            if error is not None:
                print(f"Download task for {pair!r} crashed: {error!r}")
if __name__ == "__main__":
    # Replace with the path of your own txt file.
    main("mp3_urls.txt")
另外來介紹一下yt-dlp這個套件,它可以用來下載youtube影片,下載速度比youtube-dl還要快,也可以利用這個套件來蒐集yt的音檔。
基本上想要下載yt的影片只需以下指令,其他還包含指定畫質(720p, 480p…),檔案類型(mp4, webm, …),這些都可以上網找尋相關指令或者問gpt。
pip install yt-dlp # 直接pip安裝
yt-dlp -x <yt_url> # 只下載該頁url的音檔;如果想下載影片,不加-x即可
那如果我想要下載整個playlist怎麼辦呢? 這裡提供一個用多進程(multi-process)下載的範例:
from multiprocessing import Pool
from functools import partial
from tqdm import tqdm
import yt_dlp
import os
import time
def download(index_row, output_dir):
    """Download one video's audio track and convert it to 16 kHz mono WAV.

    Args:
        index_row: An ``(index, video_url)`` pair; the index becomes a
            zero-padded prefix of the output file name.
        output_dir: Directory that receives the converted file.

    Returns:
        ``"Processed: <url>"`` on success, or ``"Failed: <url> (<reason>)"``
        when the download or conversion raised. Errors are never propagated,
        so one broken video cannot stop the rest of the pool.
    """
    index, video_url = index_row
    # yt-dlp download options.
    ydl_opts = {
        'quiet': True,
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(output_dir, f'{index:04d}_%(title)s.%(ext)s'),
        'postprocessors': [{  # post-process with ffmpeg: convert straight to wav
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '0',
        }],
        'postprocessor_args': [
            '-ar', '16000',  # sample rate 16 kHz
            '-ac', '1',      # downmix to a single channel
        ],
        # 'ffmpeg_location': 'ffmpeg'  # on Windows, set this to the path of the ffmpeg binary
    }
    result = f"Processed: {video_url}"
    try:
        with yt_dlp.YoutubeDL(ydl_opts, auto_init='no_verbose_header') as ydl:
            ydl.download([video_url])
    except Exception as exc:
        # Worker boundary: report the failure and keep going. Unlike a bare
        # ``except``, this lets KeyboardInterrupt/SystemExit through.
        print(f"Failed to download {video_url}: {exc}")
        result = f"Failed: {video_url} ({exc})"
    finally:
        # Rest for 3 seconds between videos.
        time.sleep(3)
    return result
def fetch_video_urls_from_playlist(playlist_url):
    """Return the URLs of the downloadable videos in a playlist.

    Only flat playlist metadata is requested; nothing is downloaded.
    Entries whose title marks them as private or deleted are left out
    because downloading them would fail.

    Args:
        playlist_url: URL of the playlist to inspect.

    Returns:
        A list of video URLs in playlist order. The list is empty when the
        playlist cannot be read; the reason is printed to stdout.
    """
    ydl_opts = {
        'quiet': True,
        'extract_flat': True,
        'skip_download': True,
    }
    # Placeholder titles used for videos that cannot be downloaded.
    unavailable_titles = {'[Private video]', '[Deleted video]'}

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            playlist_info = ydl.extract_info(playlist_url, download=False)
            # Materialize inside the context in case entries are lazy.
            entries = list((playlist_info or {}).get('entries') or [])
    except Exception as exc:
        print(f"Failed to read playlist {playlist_url}: {exc}")
        return []

    video_urls = []
    for entry in entries:
        # Tolerate missing entries or keys so one odd item does not
        # discard the remainder of the playlist.
        if not entry:
            continue
        url = entry.get('url')
        if not url or entry.get('title') in unavailable_titles:
            continue
        video_urls.append(url)
    return video_urls
def main():
    """Download the audio of every playlist listed in ``playlist_urls.txt``.

    The file holds one playlist URL per line. The playlist on line ``idx``
    (0-based) is saved to the directory ``yt_list_<idx:03d>``; blank lines
    are skipped but still count towards the index. Videos are downloaded in
    parallel by a pool of worker processes.
    """
    # playlist_urls.txt lists the playlist URLs to download.
    with open('playlist_urls.txt', 'r', encoding='utf-8') as f_i:
        playlist_urls = [line.strip() for line in f_i]

    num_processes = 8  # number of parallel worker processes

    # One pool serves every playlist; the context manager guarantees the
    # workers are cleaned up even if something below raises.
    with Pool(num_processes) as pool:
        for idx, playlist_url in enumerate(playlist_urls):
            if not playlist_url:
                continue
            output_dir = f'yt_list_{idx:03d}'
            # Create the output directory.
            os.makedirs(output_dir, exist_ok=True)
            # Collect every video URL of the playlist.
            video_urls = fetch_video_urls_from_playlist(playlist_url)
            if not video_urls:
                continue
            # Download the audio files with multiple processes.
            download_trim = partial(download, output_dir=output_dir)
            with tqdm(total=len(video_urls), leave=False) as pbar:
                for _ in pool.imap_unordered(download_trim, enumerate(video_urls)):
                    pbar.update()
        # All results are consumed by now; shut the workers down gracefully.
        pool.close()
        pool.join()
# Run only when executed as a script; with multiprocessing this guard also
# keeps worker processes that re-import the module from calling main() again.
if __name__ == "__main__":
    main()
上面兩個範例剛好分別用到multi-thread跟multi-process,有興趣的話可以再深入了解兩者的差異。基本上應用在單純下載時兩者是差不多的,因為瓶頸在於網路頻寬;但如果是用於大量計算,由於Python的GIL會讓多個thread無法同時執行運算,通常multi-process速度會快很多。
今天就先到這囉~ 明天開始應該就會開始講pytorch相關,爬蟲就先告一個段落囉~