連續寫了好多天的文章,好想放個假出去旅行。開始上網查訂房資訊,查完幾個選項之後貼給同行朋友加權評估一下,最後存進試算表裡就完成!這總沒有資料工程或是程式的戲了吧?
且慢,我們把流程拆解一下:
1. 上網查訂房資訊(資料收集)
2. 貼給同行朋友加權評估(資料轉換)
3. 把結果存進試算表(資料匯入)
赫然發現這就是資料收集 ⮕ 資料轉換 ⮕ 資料匯入的過程,活生生就是個 ETL 啊!看來怎麼樣都逃不開資料工程的魔爪呀。我們進入了第二階段-軟體工程,首先想到的就是撰寫程式碼來實現這段 data pipeline。
沒辦法,為了找到理想的飯店,先快速寫一段程式在 `main.py` 上,把前述的流程都做完吧!
import requests
import csv
def get_place_count(location, keyword):
    """Return how many places the Nearby Search response lists for a keyword.

    Args:
        location: Search centre, passed straight into the ``location`` query
            parameter (the callers in this script use "lat,lng" strings).
        keyword: Search term placed in the ``keyword`` query parameter.

    Returns:
        The length of the ``results`` array in the JSON response, or 0 when
        the response has no ``results`` key.
    """
    api_key = "YOUR_GOOGLE_MAPS_API_KEY"
    endpoint = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    # The search radius is fixed at 1000 in this quick first draft.
    query_string = f"location={location}&radius=1000&keyword={keyword}&key={api_key}"
    payload = requests.get(f"{endpoint}?{query_string}").json()
    return len(payload.get("results", []))
# Coordinates of every candidate lodging
locations = [
    "25.0330,121.5654",  # location 1
    "25.0320,121.5655",  # location 2
    "25.0340,121.5656",  # location 3
    "25.0350,121.5657",  # location 4
    "25.0360,121.5658",  # location 5
    "25.0370,121.5659",  # location 6
    "25.0380,121.5660",  # location 7
    "25.0390,121.5661",  # location 8
    "25.0400,121.5662",  # location 9
    "25.0410,121.5663",  # location 10
]

# One [location, weighted score] row per lodging
scores = []

# Query every category: restaurants, cafes, convenience stores,
# supermarkets and MRT stations
for location in locations:
    restaurants = get_place_count(location, "restaurant")
    cafes = get_place_count(location, "cafe")
    convenience_stores = get_place_count(location, "convenience store")
    supermarkets = get_place_count(location, "supermarket")
    mrt_stations = get_place_count(location, "MRT station")

    # Weighted sum of the five counts for this lodging
    total = (
        restaurants * 1.5
        + cafes * 1.2
        + convenience_stores * 1.1
        + supermarkets * 1.3
        + mrt_stations * 1.4
    )

    scores.append([location, total])
    print(f"{location} 綜合分數:{total}")

# Dump the results to a CSV file
with open("location_scores.csv", mode="w", newline="") as csv_file:
    out = csv.writer(csv_file)
    out.writerow(["Location", "Score"])  # header row
    out.writerows(scores)  # one row per lodging

print("資料已寫入 location_scores.csv")
這段程式碼雖然能夠很快幫我們獲得所需資料,但 `get_place_count` 被重複呼叫了五次,API 金鑰、地點與權重也都直接寫死在程式裡,這使得程式碼冗長且難以修改。這種程式碼我們稱之為 Spaghetti code,像義大利麵一樣,很多不同功能交纏在一起,稍微修改一處就可能讓其他部分有崩壞的風險。在需求變多時,會非常難以維護和擴展。
面對這樣交纏的程式碼,我們有個抽絲剝繭的方式叫做重構 (refactoring),把程式碼重新組構,至少涵蓋模組化及錯誤處理。這過程也是需要花點心力的,還好 2024 年的今天我們有 AI 小幫手 ChatGPT 可以幫忙,我開口問他:『如果我要模組化,而且每個函式之間不要耦合,你會怎麼重構?』
他給了我以下檔案:
import csv
from pathlib import Path

import requests
def fetch_places_count(api_key, location, keyword, radius=1000, timeout=10):
    """Return the number of places Google Maps Nearby Search lists near a point.

    Args:
        api_key: Google Maps API key sent as the ``key`` parameter.
        location: Search centre sent as the ``location`` parameter (the
            callers in this file pass "lat,lng" strings).
        keyword: Search term sent as the ``keyword`` parameter.
        radius: Search radius sent as the ``radius`` parameter.
        timeout: Seconds to wait for the API before giving up, so a stalled
            connection cannot hang the whole run.

    Returns:
        The length of the ``results`` array in the response, or 0 when the
        request fails, the body is not JSON, or the API reports an error
        status.
    """
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    # Let requests build and URL-encode the query string (keywords such as
    # "convenience store" contain spaces).
    params = {
        "location": location,
        "radius": radius,
        "keyword": keyword,
        "key": api_key,
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()  # raise on HTTP error responses
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # error is not a RequestException subclass.
        print(f"API 請求錯誤: {e}")
        return 0

    # The API can answer HTTP 200 while reporting a failure (for example a
    # rejected key) in the body; surface that instead of counting 0 places.
    status = payload.get("status", "OK")
    if status not in ("OK", "ZERO_RESULTS"):
        print(f"API 請求錯誤: {status} {payload.get('error_message', '')}")
        return 0

    # NOTE(review): Nearby Search responses are paginated, so this counts
    # only the results on the first page — confirm that cap is acceptable.
    return len(payload.get("results", []))
def calculate_scores(counts, weights):
    """Return the weighted sum of ``counts`` using the matching ``weights``.

    Args:
        counts: Iterable of numeric counts, one per category.
        weights: Iterable of numeric weights, in the same order as ``counts``.

    Returns:
        The sum of ``count * weight`` over all pairs; 0 when both are empty.

    Raises:
        ValueError: If ``counts`` and ``weights`` differ in length. ``zip``
            would otherwise drop the extras silently and produce a score
            that ignores some categories.
    """
    # Materialise first so generators can be length-checked too.
    counts = list(counts)
    weights = list(weights)
    if len(counts) != len(weights):
        raise ValueError(
            f"counts and weights must have the same length "
            f"(got {len(counts)} and {len(weights)})"
        )

    total_score = 0
    for count, weight in zip(counts, weights):
        total_score += count * weight
    return total_score
def save_to_csv(file_name, header, data):
    """Write a header row followed by the data rows to a CSV file.

    Args:
        file_name: Destination path. Missing parent directories are created,
            and an existing file is overwritten.
        header: Sequence of column titles written as the first row.
        data: Iterable of rows, each a sequence of cell values.
    """
    target = Path(file_name)
    # The output may point into a sub-directory that does not exist yet.
    target.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so non-ASCII cells do not depend on the platform
    # default.
    with target.open(mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(data)
    print(f"資料已成功寫入 {file_name}")
def main():
    """Score every candidate location by nearby amenities and export a CSV.

    For each location this fetches one place count per category, combines
    the counts into a weighted total, and finally writes one row per
    location to ``location_scores.csv``.
    """
    # API key and input data
    api_key = "YOUR_GOOGLE_MAPS_API_KEY"
    locations = [
        "25.0330,121.5654",
        "25.0320,121.5655",
        "25.0340,121.5656",
        "25.0350,121.5657",
        "25.0360,121.5658",
        "25.0370,121.5659",
        "25.0380,121.5660",
        "25.0390,121.5661",
        "25.0400,121.5662",
        "25.0410,121.5663",
    ]
    categories = [
        "restaurant",
        "cafe",
        "convenience store",
        "supermarket",
        "MRT station",
    ]
    weights = [1.5, 1.2, 1.1, 1.3, 1.4]

    # One output row per location
    rows = []
    for spot in locations:
        # Extract
        per_category = [
            fetch_places_count(api_key, spot, kind) for kind in categories
        ]

        # Transform
        overall = calculate_scores(per_category, weights)
        rows.append([spot, *per_category, overall])

    # Load
    columns = ["Location"]
    columns += [
        "Restaurants",
        "Cafes",
        "Convenience Stores",
        "Supermarkets",
        "MRT Stations",
    ]
    columns.append("Total Score")
    save_to_csv("location_scores.csv", columns, rows)


if __name__ == "__main__":
    main()
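可以看到 ETL 流程的每一步都被拆成一個函式 (function):
- Extract:`fetch_places_count` 負責呼叫 API 取得各類別的數量
- Transform:`calculate_scores` 負責依權重計算綜合分數
- Load:`save_to_csv` 負責把結果寫入 CSV 檔案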
如果我們要把程式碼打包,放在一個程式庫 (repository) 裡,需要按照各段程式碼的功能歸建檔案。我們再次尋求厲害小幫手 ChatGPT 的協助:『若我要將程式碼拆放在不同資料夾,讓這三個功能都可以被重複使用,你會怎麼設計整個 repository 的架構?』
它又娓娓道來以下內容:
project-root/
│
├── etl/
│ ├── __init__.py
│ ├── extract.py # 取得資料的邏輯
│ ├── transform.py # 資料轉換和計算的邏輯
│ └── load.py # 將資料寫入檔案的邏輯
│
├── pipeline/
│ ├── __init__.py
│ └── process.py # ETL 流程控制模組
│
├── config/
│ └── settings.py # API key 和其他參數
│
├── data/
│ └── location_scores.csv # 儲存結果的 CSV
│
├── tests/
│ ├── __init__.py
│ └── test_etl.py # 測試程式
│
├── main.py # 主程式,主要負責調用模組
├── requirements.txt # 套件管理
└── README.md # 說明文件
ETL 的邏輯被歸類在 `etl/` 資料夾下,函式內容與前述相同,就不再重談。主程式 `main.py` 只負責調用流程控制模組。
`main.py`
from pipeline.process import run_etl_pipeline
from config.settings import LOCATIONS, CATEGORIES, WEIGHTS, OUTPUT_FILE
def main():
    """Run the ETL pipeline with the values configured in ``config.settings``."""
    run_etl_pipeline(
        locations=LOCATIONS,
        categories=CATEGORIES,
        weights=WEIGHTS,
        output_file=OUTPUT_FILE,
    )


if __name__ == "__main__":
    main()
pipeline/process.py
from etl.extract import fetch_places_count
from etl.transform import calculate_score
from etl.load import save_to_csv
from config.settings import API_KEY
# Column titles for the categories this project ships with. Any other
# category falls back to its own name, so the header always has one column
# per configured category.
_COLUMN_LABELS = {
    "restaurant": "Restaurants",
    "cafe": "Cafes",
    "convenience store": "Convenience Stores",
    "supermarket": "Supermarkets",
    "MRT station": "MRT Stations",
}


def run_etl_pipeline(locations, categories, weights, output_file):
    """Score every location and write the resulting table to ``output_file``.

    Args:
        locations: Iterable of locations to score.
        categories: Sequence of category keywords; one CSV column is emitted
            per category, in this order.
        weights: Weights forwarded to ``calculate_score`` together with
            ``categories``.
        output_file: Path handed to ``save_to_csv``.
    """
    results = []
    for location in locations:
        # NOTE(review): calculate_score comes from etl/transform.py, which is
        # not shown here. The row layout below assumes it returns a list of
        # the per-category counts followed by the total score — confirm, and
        # check the name against the calculate_scores(counts, weights) helper.
        scores = calculate_score(
            API_KEY, location, categories, weights, fetch_places_count
        )
        results.append([location] + scores)

    # Build the header from `categories` rather than hard-coding five titles,
    # so adding or removing a category keeps header and rows aligned.
    header = (
        ["Location"]
        + [_COLUMN_LABELS.get(category, category) for category in categories]
        + ["Total Score"]
    )
    save_to_csv(output_file, header, results)
這個檔案將邏輯從主程式抽離,專注於 ETL 流程控制。
config/settings.py
import os

# Prefer the key from the GOOGLE_MAPS_API_KEY environment variable so the
# real secret never has to be committed. The original placeholder string is
# kept as the fallback when the variable is unset.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "GOOGLE_MAPS_API_KEY")

# NOTE(review): presumably the Nearby Search radius; nothing in the visible
# pipeline code reads this value yet — confirm where it should be passed.
SEARCH_RADIUS = 1000

# Locations and scoring parameters
LOCATIONS = [
    "25.0330,121.5654",
    "25.0320,121.5655",
    "25.0340,121.5656",
    "25.0350,121.5657",
    "25.0360,121.5658",
    "25.0370,121.5659",
    "25.0380,121.5660",
    "25.0390,121.5661",
    "25.0400,121.5662",
    "25.0410,121.5663",
]

# CATEGORIES and WEIGHTS are paired by position: WEIGHTS[i] applies to
# CATEGORIES[i], so keep both lists the same length.
CATEGORIES = [
    "restaurant",
    "cafe",
    "convenience store",
    "supermarket",
    "MRT station",
]
WEIGHTS = [1.5, 1.2, 1.1, 1.3, 1.4]

# Output file
OUTPUT_FILE = "data/location_scores.csv"
最後是參數集中擺放的部分,讓流程歸流程、邏輯歸邏輯,參數的調整統一在此進行即可。打造好這個架構後,想要改變搜尋地點的數量、考慮的鄰近設施類別以及權重都可以快速地調整,不必在程式碼海裡苦苦搜尋。
重構的好處顯而易見。
資料工程師與軟體工程師,有什麼不一樣?
再想一次這個問題,這次我會說:『軟體工程師負責打造好的應用程式給產品顧客;資料工程師則致力於打造好的應用程式,讓資料的流轉與儲存更加順利。』其實非常相似吧!差別就是應用情境與使用者不同而已。
專案手把手教學《生活機能好?有多好?》(上)[2021,簡書廷撰]
專案手把手教學《生活機能好?有多好?》(下)[2021,簡書廷撰]