In the previous section we went through the details of RL and implemented the whole pipeline from scratch. In this section we will learn how to build a stock trading bot with the reinforcement learning library Stable Baselines3 and the financial reinforcement learning library FinRL. We will compare several common RL algorithms, such as DQN, PPO, and A2C, and analyze their performance differences experimentally. We will also plot buy/sell points, annualized returns, and reward curves to judge the models visually. Since three models are compared today, please see today's Colab for the detailed run results; here I only include the code so the post doesn't get too cluttered.
!pip install stable-baselines3[extra]
!pip install finrl
!pip install yfinance
!pip install pandas numpy matplotlib
!pip install alpaca-trade-api
!pip install exchange_calendars
!pip install stockstats
!pip install wrds
For FinRL:
## install required packages
!pip install swig
!pip install wrds
!pip install pyportfolioopt
## install finrl library
!pip install -q condacolab
import condacolab
condacolab.install()
!apt-get update -y -qq && apt-get install -y -qq cmake libopenmpi-dev python3-dev zlib1g-dev libgl1-mesa-glx swig
!pip install git+https://github.com/AI4Finance-Foundation/FinRL.git
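Note that condacolab.install() restarts the Colab runtime, so re-run the cells of this install block once it comes back. After that, a quick import confirms everything is in place (a simple check of my own, versions will vary):
import stable_baselines3
import finrl
print("stable-baselines3", stable_baselines3.__version__)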
For TA-Lib:
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
!tar -xzvf ta-lib-0.4.0-src.tar.gz
%cd ta-lib
!./configure --prefix=/usr
!make
!make install
!pip install Ta-Lib
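A quick check that the native library and the Python wrapper built correctly (the exact count varies with the TA-Lib version):
import talib
print(len(talib.get_functions()), "TA-Lib indicators available")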
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
We will use yfinance to download historical data for Apple (AAPL).
data = yf.download('AAPL', start='2015-01-01', end='2021-01-01', auto_adjust=False)  # keep the raw 'Adj Close' column
# Newer yfinance releases may return a (field, ticker) MultiIndex even for one ticker; flatten it if so
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)
data.reset_index(inplace=True)
data = data[['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]
In reinforcement learning we can add technical indicators as state features to improve the agent's decision-making.
import talib
# Add technical indicators as extra state features
data['MA5'] = talib.SMA(data['Close'], timeperiod=5)
data['MA10'] = talib.SMA(data['Close'], timeperiod=10)
data['RSI'] = talib.RSI(data['Close'], timeperiod=14)
data['MACD'], data['MACD_signal'], data['MACD_hist'] = talib.MACD(data['Close'])
data['tic'] = 'AAPL'
# Backfill the leading NaNs introduced by the indicator lookback windows
data.bfill(inplace=True)
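If building TA-Lib is a hassle, the indicator columns can be approximated with plain pandas. This is a rough sketch of my own (a rolling-mean RSI rather than TA-Lib's Wilder smoothing, and the standard 12/26/9 EMAs for MACD), so values will differ slightly from the TA-Lib ones; it replaces only the indicator calculations above:
data['MA5'] = data['Close'].rolling(5).mean()
data['MA10'] = data['Close'].rolling(10).mean()
delta = data['Close'].diff()
gain = delta.clip(lower=0).rolling(14).mean()
loss = (-delta.clip(upper=0)).rolling(14).mean()
data['RSI'] = 100 - 100 / (1 + gain / loss)  # rolling-mean approximation of RSI
ema12 = data['Close'].ewm(span=12, adjust=False).mean()
ema26 = data['Close'].ewm(span=26, adjust=False).mean()
data['MACD'] = ema12 - ema26
data['MACD_signal'] = data['MACD'].ewm(span=9, adjust=False).mean()
data['MACD_hist'] = data['MACD'] - data['MACD_signal']
data.bfill(inplace=True)  # backfill the lookback NaNs again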
stock_dimension = 1
tech_indicator_list = ['MA5', 'MA10', 'RSI', 'MACD', 'MACD_signal', 'MACD_hist']
# StockTradingEnv's state is [cash] + [close price, shares held] per stock + one entry per indicator per stock
state_space = 1 + 2 * stock_dimension + stock_dimension * len(tech_indicator_list)  # = 9 here
action_space = stock_dimension
# Sort the data by date
data.sort_values('Date', inplace=True)
# Define the environment parameters
env_kwargs = {
    "stock_dim": stock_dimension,
    "hmax": 100,                                  # maximum number of shares per trade
    "initial_amount": 100000,                     # initial capital
    "num_stock_shares": [1] * stock_dimension,    # shares held at the start (recent FinRL versions expect per-stock lists)
    "buy_cost_pct": [0.001] * stock_dimension,    # buy transaction cost per stock
    "sell_cost_pct": [0.001] * stock_dimension,   # sell transaction cost per stock
    "state_space": state_space,
    "action_space": action_space,
    "reward_scaling": 1e-4,
    "tech_indicator_list": tech_indicator_list,
    "print_verbosity": 0,
}
# Rename columns to the lowercase names FinRL expects
data.rename(columns={'Date': 'date', 'Open': 'open', 'High': 'high', 'Low': 'low',
                     'Close': 'close', 'Adj Close': 'adj_close', 'Volume': 'volume'}, inplace=True)
# Split into training (80%) and test (20%) sets
train = data.iloc[:int(0.8*len(data))].copy().reset_index(drop=True)
test = data.iloc[int(0.8*len(data)):].copy().reset_index(drop=True)
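FinRL also ships a data_split helper that splits by date and re-indexes the frame by trading day, which is the pattern its official tutorials use; a sketch with hypothetical names (train_alt/test_alt) and a cut-off date chosen to roughly match the 80/20 split above, assuming the current import path:
from finrl.meta.preprocessor.preprocessors import data_split
train_alt = data_split(data, '2015-01-01', '2019-10-01')
test_alt = data_split(data, '2019-10-01', '2021-01-01')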
# Create the training environment
train_env = StockTradingEnv(df=train, **env_kwargs)
train_env = DummyVecEnv([lambda: train_env])
# Create the test environment
test_env = StockTradingEnv(df=test, **env_kwargs)
test_env = DummyVecEnv([lambda: test_env])
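Before training, a quick smoke test confirms the environment resets and steps cleanly and that the observation width matches state_space (the random action here is only for probing, not a trading decision):
# Quick smoke test of the vectorized test environment
obs = test_env.reset()
print("observation shape:", obs.shape)  # expected (1, state_space)
obs, reward, done, info = test_env.step([test_env.action_space.sample()])
print("one random step -> reward:", reward, "done:", done)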
We will compare three common reinforcement learning algorithms: DQN, PPO, and A2C. One caveat: Stable Baselines3's DQN only supports discrete action spaces, while FinRL's StockTradingEnv exposes a continuous Box action in [-1, 1] per stock, so the DQN runs below go through the small discretizing wrapper sketched next; PPO and A2C work on the environment as-is.
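DiscretizeActions below is a hypothetical helper of mine, not part of FinRL or Stable Baselines3; it assumes the installed FinRL is built on gymnasium (swap the import for gym on older stacks):
import gymnasium as gym

class DiscretizeActions(gym.ActionWrapper):
    """Map a 3-way discrete action onto the env's continuous [-1, 1] action."""
    def __init__(self, env):
        super().__init__(env)
        self.action_space = gym.spaces.Discrete(3)  # 0 = sell, 1 = hold, 2 = buy
    def action(self, act):
        # -1 sells up to hmax shares, 0 holds, +1 buys up to hmax shares
        return np.array([float(act) - 1.0], dtype=np.float32)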
def train_agent(env, model_class, model_name, timesteps=10000):
model = model_class('MlpPolicy', env, verbose=0)
model.learn(total_timesteps=timesteps)
model.save(f"{model_name}_model")
return model
# DQN needs the discretized action space (the DiscretizeActions sketch above);
# PPO and A2C train directly on the continuous environments.
dqn_train_env = DummyVecEnv([lambda: DiscretizeActions(StockTradingEnv(df=train, **env_kwargs))])
dqn_test_env = DummyVecEnv([lambda: DiscretizeActions(StockTradingEnv(df=test, **env_kwargs))])
dqn_model = train_agent(dqn_train_env, DQN, "DQN", timesteps=10000)
ppo_model = train_agent(train_env, PPO, "PPO", timesteps=10000)
a2c_model = train_agent(train_env, A2C, "A2C", timesteps=10000)
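The evaluate_policy helper imported earlier gives a quick mean-episode-reward estimate before the more detailed evaluation below; one episode is enough since the test episode has a fixed length, and the Monitor warning it may print can be ignored for this rough check. For example, for PPO:
mean_reward, std_reward = evaluate_policy(ppo_model, test_env, n_eval_episodes=1, deterministic=True)
print(f"PPO mean episode reward: {mean_reward:.4f}")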
def evaluate_agent(env, model):
    obs = env.reset()
    done = False
    total_reward = 0
    rewards = []
    net_worths = []
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        done = bool(done[0])
        total_reward += reward[0]
        rewards.append(reward[0])
        if not done:
            # StockTradingEnv records running total assets (cash + holdings) in asset_memory;
            # skip the final step because DummyVecEnv auto-resets the env once the episode ends
            net_worths.append(env.envs[0].unwrapped.asset_memory[-1])
    return total_reward, rewards, net_worths
dqn_total_reward, dqn_rewards, dqn_net_worths = evaluate_agent(dqn_test_env, dqn_model)
ppo_total_reward, ppo_rewards, ppo_net_worths = evaluate_agent(test_env, ppo_model)
a2c_total_reward, a2c_rewards, a2c_net_worths = evaluate_agent(test_env, a2c_model)
print(f"DQN Total Reward: {dqn_total_reward}")
print(f"PPO Total Reward: {ppo_total_reward}")
print(f"A2C Total Reward: {a2c_total_reward}")
plt.figure(figsize=(12,6))
plt.plot(dqn_net_worths, label='DQN')
plt.plot(ppo_net_worths, label='PPO')
plt.plot(a2c_net_worths, label='A2C')
plt.title('Agent Net Worth Over Time')
plt.xlabel('Time Step')
plt.ylabel('Net Worth')
plt.legend()
plt.show()
plt.figure(figsize=(12,6))
plt.plot(np.cumsum(dqn_rewards), label='DQN')
plt.plot(np.cumsum(ppo_rewards), label='PPO')
plt.plot(np.cumsum(a2c_rewards), label='A2C')
plt.title('Cumulative Reward Over Time')
plt.xlabel('Time Step')
plt.ylabel('Cumulative Reward')
plt.legend()
plt.show()
def calculate_annual_return(net_worths, initial_amount, days):
total_return = (net_worths[-1] - initial_amount) / initial_amount
annual_return = ((1 + total_return) ** (252 / days)) - 1
return annual_return
test_days = len(test)
dqn_annual_return = calculate_annual_return(dqn_net_worths, env_kwargs['initial_amount'], test_days)
ppo_annual_return = calculate_annual_return(ppo_net_worths, env_kwargs['initial_amount'], test_days)
a2c_annual_return = calculate_annual_return(a2c_net_worths, env_kwargs['initial_amount'], test_days)
print(f"DQN Annual Return: {dqn_annual_return:.2%}")
print(f"PPO Annual Return: {ppo_annual_return:.2%}")
print(f"A2C Annual Return: {a2c_annual_return:.2%}")
def plot_actions(data, actions):
    # `data` is already the test split with lowercase column names
    data = data.reset_index(drop=True)
    buy = []
    sell = []
    for i in range(min(len(actions), len(data))):
        a = actions[i]
        if a > 0:    # net buy
            buy.append(data['close'].iloc[i])
            sell.append(np.nan)
        elif a < 0:  # net sell
            buy.append(np.nan)
            sell.append(data['close'].iloc[i])
        else:        # hold
            buy.append(np.nan)
            sell.append(np.nan)
    plt.figure(figsize=(12,6))
    plt.plot(data['close'], label='Close Price')
    plt.scatter(range(len(buy)), buy, label='Buy', marker='^', color='g')
    plt.scatter(range(len(sell)), sell, label='Sell', marker='v', color='r')
    plt.title('Trading Actions')
    plt.xlabel('Time Step')
    plt.ylabel('Price')
    plt.legend()
    plt.show()
def get_actions(env, model):
    obs = env.reset()
    done = False
    actions = []
    while not done:
        action, _states = model.predict(obs)
        a = np.ravel(action)[0]
        # DQN's discrete actions {0, 1, 2} map back to a signed direction {-1, 0, +1};
        # PPO/A2C actions are already signed floats in [-1, 1]
        if np.issubdtype(np.asarray(a).dtype, np.integer):
            a = float(a) - 1.0
        actions.append(float(a))
        obs, reward, done, info = env.step(action)
        done = bool(done[0])
    return actions
dqn_actions = get_actions(dqn_test_env, dqn_model)
plot_actions(test, dqn_actions)
ppo_actions = get_actions(test_env, ppo_model)
plot_actions(test, ppo_actions)
a2c_actions = get_actions(test_env, a2c_model)
plot_actions(test, a2c_actions)
1. https://stable-baselines.readthedocs.io/en/master/
2. https://finrl.readthedocs.io/en/latest/index.html