Reinforcement learning lets an intelligent agent learn how to make decisions by interacting with its environment, with the goal of maximizing a reward signal. Typically the agent takes a series of actions, observes the resulting state of the environment, and then adjusts its policy according to the reward signal.
Q-learning maintains a table of action values, Q(state, action), and updates it after every step with the following rule:

Q(state, action) = (1 - learning_rate) * Q(state, action) + learning_rate * (reward + discount_factor * max(Q(next_state, all_actions)))
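Expanding and regrouping the right-hand side gives the equivalent temporal-difference form shown below, which is the version implemented in the code that follows (here \alpha stands for learning_rate and \gamma for discount_factor):

Q(s, a) \leftarrow Q(s, a) + \alpha \bigl[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \bigr]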
import numpy as np
# Define a simplified maze map
# 0 = empty cell, 1 = obstacle, 2 = goal
maze = np.array([
    [0, 0, 0],
    [1, 1, 0],
    [0, 0, 2]
])
# Define the Q-table, initialized to zeros
# Each maze cell is one state, identified by its flattened (row-major) index
num_states = maze.size
num_actions = 4  # four actions: up, down, left, right
q_table = np.zeros((num_states, num_actions))
# Define the parameters
# Learning rate
learning_rate = 0.5
# Weighs the importance of future rewards, usually between 0 and 1
discount_factor = 0.9
# Probability of taking a random exploratory action in the ε-greedy policy
exploration_prob = 0.2
# Total number of training episodes for Q-learning
num_episodes = 100
# Define the state-transition function
def get_next_state(state, action):
    # A state is the flattened (row-major) index of a maze cell
    row, col = divmod(state, maze.shape[1])
    if action == 0:    # up
        new_row, new_col = max(row - 1, 0), col
    elif action == 1:  # down
        new_row, new_col = min(row + 1, maze.shape[0] - 1), col
    elif action == 2:  # left
        new_row, new_col = row, max(col - 1, 0)
    else:              # right
        new_row, new_col = row, min(col + 1, maze.shape[1] - 1)
    # Obstacles (value 1) are impassable: stay in the current cell
    if maze[new_row, new_col] == 1:
        new_row, new_col = row, col
    return new_row * maze.shape[1] + new_col
# Simplified Q-learning main loop
for episode in range(num_episodes):
    state = 0  # initial state: the top-left cell
    done = False
    while not done:
        # ε-greedy policy
        if np.random.rand() < exploration_prob:
            action = np.random.randint(num_actions)  # choose a random action
        else:
            action = np.argmax(q_table[state])  # choose the action with the highest Q-value
        next_state = get_next_state(state, action)
        # Reaching the goal cell yields a reward of 10; every other step yields -1
        reward = 10 if maze.flat[next_state] == 2 else -1
        # Q-value update
        q_table[state, action] = q_table[state, action] + learning_rate * (
            reward + discount_factor *
            np.max(q_table[next_state]) - q_table[state, action]
        )
        state = next_state
        if maze.flat[state] == 2:  # goal state reached
            done = True
# Test the learned policy
state = 0  # return to the starting state
path = [state]
while maze.flat[state] != 2:
    action = np.argmax(q_table[state])
    next_state = get_next_state(state, action)
    path.append(next_state)
    state = next_state
print("Optimal path:", path)
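Since each entry in path is a flattened state index, it can help to map the indices back to (row, col) grid coordinates when reading the output. The short follow-up snippet below is not part of the original example; it assumes the maze and path variables defined above are still in scope:

# Convert each flattened state index back to (row, col) grid coordinates
coords = [divmod(s, maze.shape[1]) for s in path]
print("Optimal path as grid coordinates:", coords)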