import pandas as pd

# Keep STOCK_ID as strings so any leading zeros in the ticker codes survive
etf50_df = pd.read_csv("data/ETF50.csv", dtype={"STOCK_ID": str})
etf50_id = etf50_df.loc[:, "STOCK_ID"]
# Load daily OHLCV data for the selected stock (stock_index is defined earlier)
data_df = load_stock(stock_index, start_year=2011, end_year=2021)
data_df
From our earlier experiments we know that, rather than feeding in the raw values directly, taking the differences between values gives better results, since removing the trend makes the series closer to stationary. Finally, we apply Z-score normalization so that scale differences between features do not skew the output.
norm_df = data_df.copy()  # work on a copy so the raw data stays intact
# Express Open/High/Low as offsets from the same day's Close
norm_df.loc[:, "Open"] = norm_df["Open"] - norm_df["Close"]
norm_df.loc[:, "High"] = norm_df["High"] - norm_df["Close"]
norm_df.loc[:, "Low"] = norm_df["Low"] - norm_df["Close"]
# Close becomes the day-over-day change (this must come after the lines above)
norm_df.loc[:, "Close"] = norm_df["Close"] - norm_df["Close"].shift(1)
# Z-score normalize every column, then drop the NaN row created by shift(1)
norm_df = (norm_df - norm_df.mean()) / norm_df.std()
norm_df = norm_df.dropna()
norm_df
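As a quick sanity check (a minimal sketch over the norm_df built above), every column should now have roughly zero mean and unit standard deviation:

# After Z-score normalization all features sit on a comparable scale
print(norm_df.mean().round(3))
print(norm_df.std().round(3))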
As always, remember not to let future data leak in. (Strictly speaking, the Z-score mean and standard deviation above should also be estimated from the training period only; computing them over the whole history is itself a mild form of lookahead.)
train_mask = (norm_df.index >= "2014-01-01") & (norm_df.index <= "2018-12-31")
test_mask = (norm_df.index >= "2019-01-01") & (norm_df.index <= "2019-12-31")
X_train = norm_df[train_mask]
# TimeseriesGenerator (used below) already pairs each input window with the
# target one step after the window ends, so no extra shift(-1) is needed here
y_train = norm_df["Close"][train_mask]
X_test = norm_df[test_mask]
y_test = norm_df["Close"][test_mask]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
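To confirm the split really is chronological (a small check using only the frames defined above), we can assert that the last training date precedes the first test date:

# Guard against lookahead: training data must end before test data begins
assert X_train.index.max() < X_test.index.min()
print(X_train.index.max(), "->", X_test.index.min())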
Four years ago we would have had to write a pile of pandas or NumPy code to reshape the data into the time-series format an LSTM expects; in 2021 we can simply use an off-the-shelf helper, Keras's TimeseriesGenerator.
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

watch_days = 10
# Each batch has shape (batch, time steps, features) = (128, 10, 5).
# Pass plain NumPy arrays: the generator indexes by integer position,
# which the date-indexed DataFrame/Series would not support.
data_gen = TimeseriesGenerator(
    X_train.values, y_train.values, length=watch_days, sampling_rate=1, batch_size=128
)
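As a quick check (a minimal sketch; it only assumes the data_gen built above), pulling the first batch out of the generator confirms the shape matches (batch, watch_days, features):

# TimeseriesGenerator is an indexable Sequence; data_gen[0] is the first batch
x_batch, y_batch = data_gen[0]
print(x_batch.shape)  # expected: (128, 10, 5)
print(y_batch.shape)  # expected: (128,)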
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(100, activation="relu", input_shape=(watch_days, 5)))
model.add(Dense(1))
model.compile(optimizer="adam", loss="mse")
# fit model: fit_generator is deprecated in TF2; model.fit accepts the
# generator directly and walks through every batch each epoch by default
model.fit(data_gen, epochs=500, verbose=0)
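The test set defined earlier has not been used yet; here is a minimal sketch of how it could be evaluated with the same generator setup (the test_gen name is mine, not from the original):

# Build a matching generator over the test period and report its MSE
test_gen = TimeseriesGenerator(
    X_test.values, y_test.values, length=watch_days, sampling_rate=1, batch_size=128
)
print(model.evaluate(test_gen, verbose=0))  # test-set mean squared error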