Q-Learning is a value-based reinforcement learning algorithm. Its knowledge lives in a Q-table: every action encountered during learning is stored there, and when acting the agent normally picks the action its past learning rates highest, but, governed by the e_greedy setting, it can also pick an action at random to explore.

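Concretely, in the code below the Q-table is a plain nested dict: each state (the agent's canvas coordinates, stringified) maps every action allowed there to a learned weight. A minimal sketch of one entry, with made-up weights:

# One hypothetical Q-table row after a few episodes: the key is the start
# cell's canvas coordinates; the weights are illustrative, not real output.
qTable = {
    '[40.0, 0.0, 80.0, 40.0]': {'下': 0.012, '左': 0.0, '右': -0.003},
}

# Greedy play just takes the action with the highest weight in the row
state = '[40.0, 0.0, 80.0, 40.0]'
print(max(qTable[state], key=qTable[state].get))  # '下'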
Q-Learning's update formula for a Q-table entry:

Q(s, a) ← Q(s, a) + α · [ r + γ · max_{a'} Q(s', a') - Q(s, a) ]

where α is the learning rate, γ the reward-decay factor, r the immediate reward, and s' the state reached by taking action a in state s.


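With the defaults used later (learning rate α = 0.01, reward decay γ = 0.9), a single update is easy to trace by hand; the Q values here are illustrative:

lr, gamma = 0.01, 0.9   # defaults of the Qlearning class below
old_q = 0.0             # Q(s, a) before the update (illustrative)
award = 1               # reward for stepping onto the goal
next_max_q = 0.0        # max Q of the next state; the goal ends the episode
target = award + gamma * next_max_q
new_q = old_q + lr * (target - old_q)
print(new_q)            # 0.01: the goal's reward slowly seeps back into the table
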
Q-Learning's computation steps, chained together in the loop sketched after this list:

1. Determine which actions are available at the current position;

2. Choose one of the allowed actions;

3. Perform the chosen action and collect its reward;

4. Update the weight (Q value) of that action for the position it was taken from.

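In skeleton form, one episode chains the four steps like this (a minimal sketch, assuming game and ql are the GameWorld and Qlearning instances built below):

finish = False
while not finish:
    allow, oldPosition = game.allowActions()                  # step 1
    action = ql.chooseAction(oldPosition, allow)              # step 2
    finish, newPosition, award = game.updatePosition(action)  # step 3
    ql.learn(oldPosition, newPosition, action[0], award)      # step 4

The Test class at the end of the post runs exactly this loop.
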
The game interface:

import time
import tkinter as tk

class GameWorld(tk.Tk, object):

    def __init__(self, unit_pixels=40, grid_height=6, grid_width=5):
        super(GameWorld, self).__init__()
        self.action = ['上', '下', '左', '右']  # up, down, left, right
        self.title('Maze')
        # The geometry string is 'widthxheight', so the width must come from grid_width
        self.geometry('{0}x{1}'.format(grid_width * unit_pixels, grid_height * unit_pixels))
        self.unit_pixels = unit_pixels
        self.grid_height = grid_height
        self.grid_width = grid_width
        self.buildMap()

    def buildMap(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=self.grid_height * self.unit_pixels,
                                width=self.grid_width * self.unit_pixels)

        # Vertical grid lines
        for x in range(0, self.grid_width * self.unit_pixels, self.unit_pixels):
            x0, y0, x1, y1 = x, 0, x, self.grid_height * self.unit_pixels
            self.canvas.create_line(x0, y0, x1, y1)

        # Horizontal grid lines
        for y in range(0, self.grid_height * self.unit_pixels, self.unit_pixels):
            x0, y0, x1, y1 = 0, y, self.grid_width * self.unit_pixels, y
            self.canvas.create_line(x0, y0, x1, y1)

        # Draw the items on the map: obstacles (black), trap (grey),
        # goal (yellow) and the agent's starting cell (red)
        self.obstacle1 = self.canvas.create_rectangle(0, self.unit_pixels * 3,
                                                      self.unit_pixels, self.unit_pixels * 4, fill='black')

        self.obstacle2 = self.canvas.create_rectangle(self.unit_pixels, self.unit_pixels * 2,
                                                      self.unit_pixels * 2, self.unit_pixels * 3, fill='black')

        self.obstacle3 = self.canvas.create_rectangle(self.unit_pixels * 2, self.unit_pixels * 1,
                                                      self.unit_pixels * 3, self.unit_pixels * 2, fill='black')

        self.obstacle4 = self.canvas.create_rectangle(self.unit_pixels * 4, self.unit_pixels * 1,
                                                      self.unit_pixels * 5, self.unit_pixels * 2, fill='black')

        self.obstacle5 = self.canvas.create_rectangle(self.unit_pixels * 3, self.unit_pixels * 4,
                                                      self.unit_pixels * 4, self.unit_pixels * 5, fill='black')

        self.trap = self.canvas.create_rectangle(self.unit_pixels * 1, self.unit_pixels * 4,
                                                 self.unit_pixels * 2, self.unit_pixels * 5, fill='grey')

        self.end = self.canvas.create_oval(self.unit_pixels * 3, self.unit_pixels * 5,
                                           self.unit_pixels * 4, self.unit_pixels * 6, fill='yellow')

        self.position = self.canvas.create_rectangle(self.unit_pixels, 0,
                                                     self.unit_pixels * 2, self.unit_pixels, fill='red')
        self.canvas.pack()
        self.update()

    def reset(self):
        '''Reset: put everything the episode changed back where it started.'''
        self.canvas.delete(self.position)
        self.position = self.canvas.create_rectangle(self.unit_pixels, 0,
                                                     self.unit_pixels * 2, self.unit_pixels, fill='red')
        self.update()

    def allowActions(self):
        '''Step 1: which actions are available at the current position.'''
        allow = []
        for i in self.action:
            a_s = self.actionState(i)
            if a_s is not None:
                allow.append(a_s)

        return allow, self.canvas.coords(self.position)

    def obstacle(self, x, y):
        '''Return True if moving by (x, y) pixels does NOT land on an obstacle.'''
        position = self.canvas.coords(self.position)
        newP = [position[0] + x, position[1] + y, position[2] + x, position[3] + y]
        if newP == self.canvas.coords(self.obstacle1): return False
        if newP == self.canvas.coords(self.obstacle2): return False
        if newP == self.canvas.coords(self.obstacle3): return False
        if newP == self.canvas.coords(self.obstacle4): return False
        if newP == self.canvas.coords(self.obstacle5): return False
        return True

    def actionState(self, action):
        '''Return (action, pixel offset) if the action is legal here, else None.'''
        position = self.canvas.coords(self.position)

        if action == '上':    # up
            if position[1] - self.unit_pixels >= 0 and self.obstacle(0, -self.unit_pixels):
                return '上', [0, -self.unit_pixels]
        elif action == '下':  # down
            if position[1] + self.unit_pixels < self.grid_height * self.unit_pixels and \
                    self.obstacle(0, self.unit_pixels):
                return '下', [0, self.unit_pixels]
        elif action == '左':  # left
            if position[0] - self.unit_pixels >= 0 and self.obstacle(-self.unit_pixels, 0):
                return '左', [-self.unit_pixels, 0]
        elif action == '右':  # right
            if position[0] + self.unit_pixels < self.grid_width * self.unit_pixels and \
                    self.obstacle(self.unit_pixels, 0):
                return '右', [self.unit_pixels, 0]

    def updatePosition(self, action):
        '''Steps 2-3: apply the chosen action, then reward the move.'''
        self.canvas.move(self.position, action[1][0], action[1][1])  # move the agent
        newPosition = self.canvas.coords(self.position)              # state after the move
        self.update()
        time.sleep(0.05)
        # Reward: +1 for the goal, -1 for the trap (both end the episode), 0 otherwise
        if newPosition == self.canvas.coords(self.end):
            return True, newPosition, 1
        elif newPosition == self.canvas.coords(self.trap):
            return True, newPosition, -1
        else:
            return False, newPosition, 0

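Before wiring in the learner, the environment can be sanity-checked on its own; a quick sketch, assuming the default 40-pixel cells:

world = GameWorld()
allow, position = world.allowActions()
print(position)               # [40.0, 0.0, 80.0, 40.0]: the red start cell
print([a[0] for a in allow])  # ['下', '左', '右']: moving up would leave the grid
world.mainloop()              # keep the window open for inspection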

The Q-learning algorithm:

import numpy as np

class Qlearning:
    def __init__(self, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.lr = learning_rate    # learning rate
        self.gamma = reward_decay  # reward decay
        self.epsilon = e_greedy    # probability of exploiting the best known action
        self.qTable = {}           # state -> {action name: weight}

    def chooseAction(self, oldPosition, allowedActions):
        '''Step 2: pick one of the actions allowed at this position.'''
        strop = str(oldPosition)
        optionalChoices = self.qTable.get(strop, None)
        if optionalChoices is None:
            # First visit to this state: create its row with all weights at zero
            current = {}
            for s in allowedActions:
                current[s[0]] = 0.0

            self.qTable[strop] = current

        if np.random.random() > self.epsilon:
            # Explore: with probability 1 - epsilon pick a random allowed action
            action = allowedActions[np.random.choice(len(allowedActions))]
        else:
            # Exploit: pick uniformly among the allowed actions with the highest weight
            actionWeight = self.qTable[strop]
            allowState = []
            maxAW = max(actionWeight.values())
            for s in allowedActions:
                if actionWeight.get(s[0], 0.0) == maxAW:
                    allowState.append(s)

            action = allowState[np.random.choice(len(allowState))]

        return action

    def learn(self, oldPosition, newPosition, action, award):
        '''Step 4: update the weight of the action just taken.'''
        strOp = str(oldPosition)
        strNp = str(newPosition)
        maxAward = award
        if strNp in self.qTable:
            # Bootstrap: the reward plus the discounted best weight of the next state
            maxAward = award + self.gamma * max(self.qTable[strNp].values())

        # Q-Learning update formula; the commented-out line is the equivalent
        # (1 - lr) * old + lr * target form
        # self.qTable[strOp][action] = (1 - self.lr) * self.qTable[strOp].get(action, 0) + self.lr * maxAward
        self.qTable[strOp][action] += self.lr * (maxAward - self.qTable[strOp].get(action, 0))

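Driving the class by hand shows the table being filled in; a small sketch with toy position labels rather than real canvas coordinates:

ql = Qlearning(e_greedy=1.0)                  # always exploit, so the demo has no exploration
allowed = [('右', [40, 0]), ('下', [0, 40])]  # toy allowed-action list
action = ql.chooseAction('s0', allowed)       # first visit creates the row with zero weights
ql.learn('s0', 's1', action[0], 1)            # pretend the move earned a reward of 1
print(ql.qTable)                              # e.g. {'s0': {'右': 0.01, '下': 0.0}}; ties break at random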

Training:

from src.tkd.GameWorld import GameWorld
from src.tkd.Qlearning import Qlearning

class Test:

    def run(self, game, ql, i):
        '''Run one training episode until the agent reaches the goal or the trap.'''
        step = 0
        finish = False
        while not finish:
            allow, oldPosition = game.allowActions()                  # step 1: available actions
            action = ql.chooseAction(oldPosition, allow)              # step 2: pick one
            finish, newPosition, award = game.updatePosition(action)  # step 3: move and collect the reward
            ql.learn(oldPosition, newPosition, action[0], award)      # step 4: update the Q table
            step += 1

        print('Episode %d finished in %d steps' % (i, step))

        game.reset()

if __name__ == '__main__':

    game = GameWorld()
    qlearning = Qlearning()
    test = Test()
    for i in range(50):
        test.run(game, qlearning, i)