Hi everyone, I'm MaoMao.
Today is Day 23.
Time to implement CartPole~ ヽ(✿゚▽゚)ノ
import random
import numpy as np
import tensorflow as tf
from collections import deque
import tensorflow.keras.layers as layers
import tensorflow.keras.optimizers as optim
Import the required packages.
def __init__(self, num_actions, num_features, learning_rate=0.02,
reward_decay=0.95, e_greedy=0.95, replace_target_iter=500,
memory_size=5000, batch_size=32, e_greedy_increment=None,
output_graph=False, memory_neg_p = 0.5):
#super(DQN_Model, self).__init__()
# -----------------define parameters-----------------
self.num_actions = num_actions
self.num_features = num_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
# how many learning steps between each replacement of the frozen target parameters
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
self.learn_step_counter = 0
self.memory_neg = deque(maxlen = int(self.memory_size*memory_neg_p))
self.memory_pos = deque(maxlen = self.memory_size - int(self.memory_size*memory_neg_p))
self.memory_neg_p = memory_neg_p
# -----------------Call build_network-----------------
self.__build_network()
Here we set up the network-related parameters. Since eval_net and target_net share exactly the same architecture, one function is enough: both of them just call it instead of duplicating the same code.
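For CartPole the state has 4 features and there are 2 actions (push left / push right), so the agent would presumably be created along these lines. The class name DQN_Model is taken from the commented-out super() call above; the call itself is my own sketch, not shown in the original post:
# hypothetical instantiation for CartPole: 4 state features, 2 actions
agent = DQN_Model(num_actions=2, num_features=4)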
def __define_network(self, name):
inputs = layers.Input(shape=(self.num_features,))
x = layers.Dense(64, activation='sigmoid')(inputs)
x = layers.Dense(64, activation='sigmoid')(x)
return inputs, layers.Dense(self.num_actions, name=name+'_output')(x)
Here we define the network layers and their parameters.
def __build_network(self):
# holds the newest parameters
eval_inputs, self.q_eval = self.__define_network('Eval')
# holds the frozen parameters
target_inputs, self.q_next = self.__define_network('Target')
# build target_net; note the target layer outputs q_next, not q_target
self.targetNet = tf.keras.Model(target_inputs, self.q_next)
# build eval_net
self.evalNet = tf.keras.Model(eval_inputs, self.q_eval, name = 'DQN_Eval_Net')
rmsprop = optim.RMSprop(learning_rate=self.lr)
self.targetNet.compile(loss='mean_squared_error', optimizer=rmsprop, metrics=['accuracy'])
self.evalNet.compile(loss='mean_squared_error', optimizer=rmsprop, metrics=['accuracy'])
It is in this function that eval_net and target_net are actually built, and compile() is used to define the loss function (loss), the optimizer (optimizer), and the evaluation metric (metrics).
def target_replacement(self):
self.targetNet.set_weights(self.evalNet.get_weights())
print("=================Parameters have changed=================")
This is needed because target_net updates its own weights by copying them over from eval_net.
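If you want to convince yourself that the copy really happened, a quick sanity check could look like this (my own addition, not part of the original code; agent is assumed to be an instance of the class above):
# hypothetical check: every weight tensor of the two networks should now be identical
for w_target, w_eval in zip(agent.targetNet.get_weights(), agent.evalNet.get_weights()):
    assert np.array_equal(w_target, w_eval)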
def store_transition(self, s, a, r, s_, terminal):
if terminal:
self.memory_neg.append((s, a, r, s_, terminal))
else:
self.memory_pos.append((s, a, r, s_, terminal))
Here the current transition is stored, and the separate memory_neg and memory_pos buffers exist to deal with the shape of CartPole's reward.
In CartPole the reward simply increases by 1 for every frame the cart survives, and a failure just ends the episode immediately, so the reward is never negative; the episode also ends once it reaches 200 frames.
So transitions are routed into the negative or positive list according to the value of terminal.
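As a rough sketch of how this would be called from the environment loop (the env and agent names are my assumption, using the older gym step() API that returns four values):
# hypothetical usage: store one transition per environment step
s_, r, done, _ = env.step(a)                # next state, reward, terminal flag, info
agent.store_transition(s, a, r, s_, done)   # the terminal flag decides which buffer it lands in
s = s_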
def replay_transition(self):
batch_mem_pos = random.sample(self.memory_pos, self.batch_size-int(self.batch_size*self.memory_neg_p))
batch_mem_neg = random.sample(self.memory_neg, int(self.batch_size*self.memory_neg_p))
batch_mem = batch_mem_pos+batch_mem_neg
s = np.array([d[0] for d in batch_mem])
a = np.array([d[1] for d in batch_mem], dtype=np.int32)
r = np.array([d[2] for d in batch_mem])
s_ = np.array([d[3] for d in batch_mem])
t = np.array([d[4] for d in batch_mem])
return s, a, r, s_, t
Here we randomly sample past experiences, drawing from the negative and positive lists separately and then merging the two into this round's replay batch.
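One small caveat: random.sample() raises a ValueError if a buffer holds fewer samples than requested, so a stricter guard than the combined-size check in learn() further down could look at each buffer on its own. A possible sketch (my own suggestion, not in the original code):
# possible per-buffer check before replaying (inside learn())
neg_size = int(self.batch_size * self.memory_neg_p)
pos_size = self.batch_size - neg_size
if len(self.memory_neg) >= neg_size and len(self.memory_pos) >= pos_size:
    s, eval_act, reward, s_, t = self.replay_transition()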
def choose_action(self, obs):
if np.random.uniform() < self.epsilon:
action_dist = self.evalNet.predict(obs)
action = np.argmax(action_dist)
else:
action = np.random.randint(0, self.num_actions)
return action
Actions are selected with the epsilon-greedy method. Note that here epsilon is the probability of exploiting: with probability epsilon the action with the highest predicted Q-value is chosen, and with probability 1 - epsilon a random action is taken instead.
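One thing to watch out for: evalNet.predict() expects a batch dimension, so the observation presumably gets reshaped before being passed in, along the lines of this sketch (the env and agent names are my assumption):
# hypothetical call site: add a batch dimension of 1 before predicting
obs = env.reset()                                  # shape (num_features,) in the older gym API
action = agent.choose_action(obs[np.newaxis, :])   # shape becomes (1, num_features)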
def learn(self):
# -----------------Replace_Target_weights-----------------
if self.learn_step_counter % self.replace_target_iter == 0:
self.target_replacement()
# -----------------Sample_Memory-----------------
if len(self.memory_neg)+len(self.memory_pos) > self.batch_size:
s, eval_act, reward, s_, t = self.replay_transition()
q_next, q_eval = self.targetNet.predict(s_), self.evalNet.predict(s)
q_target = q_eval.copy()
batch_index = np.arange(self.batch_size, dtype=np.int32)
q_target[batch_index, eval_act] = reward + self.gamma * np.max(q_next, axis=1)
q_target[t, eval_act[t]] = reward[t]
if self.learn_step_counter % self.replace_target_iter == 0:
print(np.append(q_eval,q_target, axis=1))
self.evalNet.fit(s, q_target, epochs=10, verbose=False)
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
This is where the functions above come together: the weight replacement and the experience replay, plus training eval_net and updating epsilon.
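In other words, the target computed in learn() is the standard Q-learning target: for a non-terminal transition it is the reward plus the discounted best Q-value of the next state as estimated by the frozen target network, and for a terminal transition it is just the reward. Restating the two assignment lines above:
# q_target[a] = r + gamma * max(q_next)   for non-terminal transitions
# q_target[a] = r                         for terminal transitions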
That wraps up building the neural network for today; tomorrow we'll actually run CartPole and look at the results 0(:3 )~ ('、3_ヽ)_
See you all tomorrow!