因為我之前都是用手打的資料數,通常都是打1500或3000,但我覺得這樣很麻煩,因為只要動到抓的資料範圍,就會需要更改我的測試視窗跟訓練視窗,所以我改了抓資料的方式:從自行輸入資料數量改成輸入想從哪天開始抓,然後又增加了一個函式幫我運算 Fold 數,盡量讓它能保持在7個左右。
這是更改過後的抓資料的程式
# Approximate bar length in minutes for each timeframe unit suffix
# ("M" = month, approximated as 30 days; only used for a size estimate).
_TIMEFRAME_UNIT_MINUTES = {"m": 1, "h": 60, "d": 1440, "w": 10080, "M": 43200}


def _bars_per_hour(timeframe):
    """Return how many candles of ``timeframe`` (e.g. '15m', '4h') fit in one hour."""
    count, unit = timeframe[:-1], timeframe[-1:]
    if (unit not in _TIMEFRAME_UNIT_MINUTES
            or not count.isdecimal() or int(count) == 0):
        raise ValueError(f"不支援的 timeframe: {timeframe!r}")
    return 60 / (int(count) * _TIMEFRAME_UNIT_MINUTES[unit])


def fetch_crypto_data(symbol="BTC/USDT", timeframe="1h",
                      start_date=None, force_reload=True):
    """Fetch OHLCV candles from Binance starting at ``start_date``.

    The expected number of candles is estimated from ``start_date`` and
    ``timeframe`` (plus a 10% buffer) and used as an upper bound for the
    paginated download. The result is written to ``<symbol>_latest.csv``.

    Args:
        symbol: Market symbol; '/' is replaced by '_' in the cache filename.
        timeframe: Candle size such as '15m', '1h', '4h', '1d'.
        start_date: First moment to fetch, anything ``pd.to_datetime``
            accepts. Naive values are interpreted as UTC. Required unless
            the cache is used.
        force_reload: When False and the cache file exists, the cached CSV
            is returned as-is.

    Returns:
        DataFrame with columns timestamp, open, high, low, close, volume,
        de-duplicated and sorted by timestamp.

    Raises:
        ValueError: If ``start_date`` is missing or ``timeframe`` cannot be
            parsed (previously an unknown timeframe was silently treated as
            '1h', which could truncate the download).
    """
    filename = f"{symbol.replace('/', '_')}_latest.csv"

    # Cached path.
    # NOTE: the cache is keyed by symbol only, so it is returned regardless of
    # the requested start_date / timeframe.
    if os.path.exists(filename) and not force_reload:
        print(f"✅ 讀取快取檔案 {filename}")
        df = pd.read_csv(filename)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        print(f"📅 Data range: {df['timestamp'].min()} → {df['timestamp'].max()}")
        return df

    # Validate inputs before any network access.
    if start_date is None:
        raise ValueError("請提供 start_date,例如 '2025-07-01'")
    bars_per_hour = _bars_per_hour(timeframe)

    # Work in naive UTC throughout so the subtraction below never mixes
    # aware and naive values.
    start_dt = pd.to_datetime(start_date)
    if start_dt.tzinfo is not None:
        start_dt = start_dt.tz_convert("UTC").tz_localize(None)
    now = pd.Timestamp.now(tz="UTC").tz_localize(None)

    # Estimated candle count, with a 10% buffer.
    delta_hours = int((now - start_dt).total_seconds() / 3600)
    total_limit = int(delta_hours * bars_per_hour * 1.1)

    print(f"📡 從 Binance 抓取 {symbol} {timeframe} 資料中...")
    print(f"⏱️ 時間範圍: {start_date} → {now.strftime('%Y-%m-%d %H:%M:%S')} (約 {total_limit} 根K線)\n")

    exchange = ccxt.binance({"enableRateLimit": True})
    exchange.load_markets()

    all_data = []
    since = int(start_dt.timestamp() * 1000)  # exchange expects epoch milliseconds
    limit = 1000
    fetched = 0
    while True:
        ohlcv = exchange.fetch_ohlcv(symbol, timeframe=timeframe, since=since, limit=limit)
        if not ohlcv:
            break
        all_data.extend(ohlcv)
        fetched += len(ohlcv)
        # Continue right after the last candle received.
        since = ohlcv[-1][0] + 1
        print(f"   → 已抓取 {fetched} 筆")
        # A short page means we reached the newest candle; total_limit is a
        # safety bound on the overall download size.
        if len(ohlcv) < limit or fetched >= total_limit:
            break

    df = pd.DataFrame(all_data, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df = df.drop_duplicates(subset='timestamp').sort_values('timestamp').reset_index(drop=True)
    df.to_csv(filename, index=False)
    print(f"✅ 資料已儲存至 {filename}")
    print(f"📅 Data range: {df['timestamp'].min()} → {df['timestamp'].max()} (共 {len(df)} 筆)")
    return df
這是計算Walk-forward 參數
def calculate_walk_forward_params(total_data_len, target_folds=7, fixed_test_window=300, fixed_step=300, min_train_window=500):
    """Pick the training-window length that yields ``target_folds`` folds.

    With a fixed test window and step, the data length needed for F folds is
    ``train + test + (F - 1) * step``. When enough data is available the
    training window absorbs all remaining rows so exactly ``target_folds``
    folds fit; otherwise the minimum training window is used and the number
    of folds that still fit is reported (0 if not even one fits).

    Args:
        total_data_len: Number of usable rows.
        target_folds: Desired number of walk-forward folds.
        fixed_test_window: Rows in each test window.
        fixed_step: Rows the window advances between folds.
        min_train_window: Smallest acceptable training window.

    Returns:
        Tuple ``(train_window, fixed_test_window, fixed_step, actual_folds)``.
    """
    span_after_first_fold = (target_folds - 1) * fixed_step
    required_len = min_train_window + fixed_test_window + span_after_first_fold

    if total_data_len < required_len:
        print(f"❌ 警告:數據量 {total_data_len} 過少,無法達到 {target_folds} 個 Fold。將使用最低訓練窗口。")
        train_window = min_train_window
        # Folds that fit with the minimum training window; never negative.
        leftover = total_data_len - min_train_window - fixed_test_window
        actual_folds = max(0, leftover // fixed_step + 1)
    else:
        # total_data_len >= required_len guarantees train_window >= min_train_window,
        # so no further clamping is needed here.
        train_window = total_data_len - fixed_test_window - span_after_first_fold
        actual_folds = target_folds

    print("📊 Walk-Forward 參數設定:")
    print(f"   -> 目標 Fold 數: {target_folds} 個")
    print(f"   -> 實際 Fold 數: {actual_folds} 個")
    print(f"   -> TRAIN_WINDOW: {train_window}")
    print(f"   -> TEST_WINDOW: {fixed_test_window}")
    print(f"   -> STEP: {fixed_step}")
    return train_window, fixed_test_window, fixed_step, actual_folds
算TOTAL_LIMIT
def calc_total_limit(start_date, timeframe="1h"):
    """Estimate how many candles exist between ``start_date`` and now (UTC).

    Args:
        start_date: Start day as a 'YYYY-MM-DD' string, interpreted as UTC.
        timeframe: One of '15m', '30m', '1h', '4h', '1d'.

    Returns:
        Estimated candle count including a 10% buffer; 0 when ``start_date``
        lies in the future.

    Raises:
        ValueError: If ``start_date`` is not 'YYYY-MM-DD' or ``timeframe``
            is not supported.
    """
    # Local import: only `datetime` is known to be imported at file level.
    from datetime import timezone

    # Length of one candle in hours for each supported timeframe.
    hours_per_bar = {"15m": 0.25, "30m": 0.5, "1h": 1, "4h": 4, "1d": 24}

    start = datetime.strptime(start_date, "%Y-%m-%d")
    if timeframe not in hours_per_bar:
        raise ValueError("不支援的 timeframe,請使用 '15m', '30m', '1h', '4h' 或 '1d'。")

    # Aware "now" converted to naive UTC so it can be subtracted from the
    # naive start (datetime.utcnow() is deprecated).
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    delta_hours = (now - start).total_seconds() / 3600
    total_limit = int(delta_hours / hours_per_bar[timeframe])

    # Keep a small buffer against boundary effects; never return a negative count.
    return max(0, int(total_limit * 1.1))
主程式更改
if __name__ == "__main__":
    # Data range configuration.
    START_DATE = "2025-06-01"  # first day to fetch from
    TIMEFRAME = "1h"

    # Estimated candle count derived from the start date (used for the banner below).
    TOTAL_LIMIT = calc_total_limit(START_DATE, timeframe=TIMEFRAME)

    # Walk-forward defaults: test window and step are fixed, so the training
    # window is the quantity that adapts to the amount of data.
    TARGET_FOLDS = 7
    FIXED_TEST_WINDOW = 300
    FIXED_STEP = 300
    RETURN_THRESHOLD = 0.003  # a 0.3% rise is required for Target=1

    print(f"===== 抓取與處理資料 (總筆數: {TOTAL_LIMIT}) =====")

    # force_reload=True so a stale cache never limits the amount of data.
    # TIMEFRAME is passed through so the fetch matches the estimate above
    # (it was previously hard-coded to "1h").
    df_raw = fetch_crypto_data(
        symbol="BTC/USDT",
        timeframe=TIMEFRAME,
        start_date=START_DATE,
        force_reload=True
    )

    # Add technical indicators and build the ML dataset.
    df_ind = add_indicators(df_raw)
    X, y, df = prepare_ml_data(df_ind, return_threshold=RETURN_THRESHOLD)

    # Derive walk-forward parameters from the number of usable rows.
    FINAL_DATA_LEN = len(X)
    TRAIN_WINDOW, TEST_WINDOW, STEP, ACTUAL_FOLDS = calculate_walk_forward_params(
        total_data_len=FINAL_DATA_LEN,
        target_folds=TARGET_FOLDS,
        fixed_test_window=FIXED_TEST_WINDOW,
        fixed_step=FIXED_STEP
    )

    if ACTUAL_FOLDS < 1:
        print("\n❌ 錯誤:數據量嚴重不足,無法執行 Walk-forward 訓練。請將 START_DATE 設置得更早。")
    else:
        print("\n===== 開始 Sliding-Window Walk-forward 訓練 =====")
        # Run training with the computed window sizes.
        model, last_test_index, y_true, y_pred, folds = walk_forward_train(
            X, y, df,
            train_window=TRAIN_WINDOW,
            test_window=TEST_WINDOW,
            step=STEP
        )