Q-Learning is a value-based reinforcement learning algorithm. Its knowledge lives in a Q-table: every action encountered during learning is stored there, and when acting the agent normally picks the action its past learning rates highest, but, governed by the e_greedy setting, it can also pick an action at random to explore.

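Concretely, in the code below the Q-table is a plain nested dict: each state (the agent's canvas coordinates, stringified) maps every action allowed there to a learned weight. A minimal sketch of one entry, with made-up weights:

# One hypothetical Q-table row after a few episodes: the key is the start
# cell's canvas coordinates; the weights are illustrative, not real output.
qTable = {
    '[40.0, 0.0, 80.0, 40.0]': {'下': 0.012, '左': 0.0, '右': -0.003},
}

# Greedy play just takes the action with the highest weight in the row
state = '[40.0, 0.0, 80.0, 40.0]'
print(max(qTable[state], key=qTable[state].get))  # '下'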
Q-Learning's update formula for a Q-table entry:

Q(s, a) ← Q(s, a) + α · [ r + γ · max_{a'} Q(s', a') - Q(s, a) ]

where α is the learning rate, γ the reward-decay factor, r the immediate reward, and s' the state reached by taking action a in state s.


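With the defaults used later (learning rate α = 0.01, reward decay γ = 0.9), a single update is easy to trace by hand; the Q values here are illustrative:

lr, gamma = 0.01, 0.9   # defaults of the Qlearning class below
old_q = 0.0             # Q(s, a) before the update (illustrative)
award = 1               # reward for stepping onto the goal
next_max_q = 0.0        # max Q of the next state; the goal ends the episode
target = award + gamma * next_max_q
new_q = old_q + lr * (target - old_q)
print(new_q)            # 0.01: the goal's reward slowly seeps back into the table
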
Q-Learning's computation steps, chained together in the loop sketched after this list:

1. Determine which actions are available at the current position;

2. Choose one of the allowed actions;

3. Perform the chosen action and collect its reward;

4. Update the weight (Q value) of that action for the position it was taken from.

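In skeleton form, one episode chains the four steps like this (a minimal sketch, assuming game and ql are the GameWorld and Qlearning instances built below):

finish = False
while not finish:
    allow, oldPosition = game.allowActions()                  # step 1
    action = ql.chooseAction(oldPosition, allow)              # step 2
    finish, newPosition, award = game.updatePosition(action)  # step 3
    ql.learn(oldPosition, newPosition, action[0], award)      # step 4

The Test class at the end of the post runs exactly this loop.
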
The game interface:

import time
import tkinter as tk

class GameWorld(tk.Tk, object):

    def __init__(self, unit_pixels=40, grid_height=6, grid_width=5):
        super(GameWorld, self).__init__()
        self.action = ['上', '下', '左', '右']  # up, down, left, right
        self.title('Maze')
        # The geometry string is 'widthxheight', so the width must come from grid_width
        self.geometry('{0}x{1}'.format(grid_width * unit_pixels, grid_height * unit_pixels))
        self.unit_pixels = unit_pixels
        self.grid_height = grid_height
        self.grid_width = grid_width
        self.buildMap()

    def buildMap(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=self.grid_height * self.unit_pixels,
                                width=self.grid_width * self.unit_pixels)

        # Vertical grid lines
        for x in range(0, self.grid_width * self.unit_pixels, self.unit_pixels):
            x0, y0, x1, y1 = x, 0, x, self.grid_height * self.unit_pixels
            self.canvas.create_line(x0, y0, x1, y1)

        # Horizontal grid lines
        for y in range(0, self.grid_height * self.unit_pixels, self.unit_pixels):
            x0, y0, x1, y1 = 0, y, self.grid_width * self.unit_pixels, y
            self.canvas.create_line(x0, y0, x1, y1)

        # Draw the items on the map: obstacles (black), trap (grey),
        # goal (yellow) and the agent's starting cell (red)
        self.obstacle1 = self.canvas.create_rectangle(0, self.unit_pixels * 3,
                                                      self.unit_pixels, self.unit_pixels * 4, fill='black')

        self.obstacle2 = self.canvas.create_rectangle(self.unit_pixels, self.unit_pixels * 2,
                                                      self.unit_pixels * 2, self.unit_pixels * 3, fill='black')

        self.obstacle3 = self.canvas.create_rectangle(self.unit_pixels * 2, self.unit_pixels * 1,
                                                      self.unit_pixels * 3, self.unit_pixels * 2, fill='black')

        self.obstacle4 = self.canvas.create_rectangle(self.unit_pixels * 4, self.unit_pixels * 1,
                                                      self.unit_pixels * 5, self.unit_pixels * 2, fill='black')

        self.obstacle5 = self.canvas.create_rectangle(self.unit_pixels * 3, self.unit_pixels * 4,
                                                      self.unit_pixels * 4, self.unit_pixels * 5, fill='black')

        self.trap = self.canvas.create_rectangle(self.unit_pixels * 1, self.unit_pixels * 4,
                                                 self.unit_pixels * 2, self.unit_pixels * 5, fill='grey')

        self.end = self.canvas.create_oval(self.unit_pixels * 3, self.unit_pixels * 5,
                                           self.unit_pixels * 4, self.unit_pixels * 6, fill='yellow')

        self.position = self.canvas.create_rectangle(self.unit_pixels, 0,
                                                     self.unit_pixels * 2, self.unit_pixels, fill='red')
        self.canvas.pack()
        self.update()

    def reset(self):
        '''Reset: put everything the episode changed back where it started.'''
        self.canvas.delete(self.position)
        self.position = self.canvas.create_rectangle(self.unit_pixels, 0,
                                                     self.unit_pixels * 2, self.unit_pixels, fill='red')
        self.update()

    def allowActions(self):
        '''Step 1: which actions are available at the current position.'''
        allow = []
        for i in self.action:
            a_s = self.actionState(i)
            if a_s is not None:
                allow.append(a_s)

        return allow, self.canvas.coords(self.position)

    def obstacle(self, x, y):
        '''Return True if moving by (x, y) pixels does NOT land on an obstacle.'''
        position = self.canvas.coords(self.position)
        newP = [position[0] + x, position[1] + y, position[2] + x, position[3] + y]
        if newP == self.canvas.coords(self.obstacle1): return False
        if newP == self.canvas.coords(self.obstacle2): return False
        if newP == self.canvas.coords(self.obstacle3): return False
        if newP == self.canvas.coords(self.obstacle4): return False
        if newP == self.canvas.coords(self.obstacle5): return False
        return True

    def actionState(self, action):
        '''Return (action, pixel offset) if the action is legal here, else None.'''
        position = self.canvas.coords(self.position)

        if action == '上':    # up
            if position[1] - self.unit_pixels >= 0 and self.obstacle(0, -self.unit_pixels):
                return '上', [0, -self.unit_pixels]
        elif action == '下':  # down
            if position[1] + self.unit_pixels < self.grid_height * self.unit_pixels and \
                    self.obstacle(0, self.unit_pixels):
                return '下', [0, self.unit_pixels]
        elif action == '左':  # left
            if position[0] - self.unit_pixels >= 0 and self.obstacle(-self.unit_pixels, 0):
                return '左', [-self.unit_pixels, 0]
        elif action == '右':  # right
            if position[0] + self.unit_pixels < self.grid_width * self.unit_pixels and \
                    self.obstacle(self.unit_pixels, 0):
                return '右', [self.unit_pixels, 0]

    def updatePosition(self, action):
        '''Steps 2-3: apply the chosen action, then reward the move.'''
        self.canvas.move(self.position, action[1][0], action[1][1])  # move the agent
        newPosition = self.canvas.coords(self.position)              # state after the move
        self.update()
        time.sleep(0.05)
        # Reward: +1 for the goal, -1 for the trap (both end the episode), 0 otherwise
        if newPosition == self.canvas.coords(self.end):
            return True, newPosition, 1
        elif newPosition == self.canvas.coords(self.trap):
            return True, newPosition, -1
        else:
            return False, newPosition, 0

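Before wiring in the learner, the environment can be sanity-checked on its own; a quick sketch, assuming the default 40-pixel cells:

world = GameWorld()
allow, position = world.allowActions()
print(position)               # [40.0, 0.0, 80.0, 40.0]: the red start cell
print([a[0] for a in allow])  # ['下', '左', '右']: moving up would leave the grid
world.mainloop()              # keep the window open for inspection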

The Q-learning algorithm:

import numpy as np

class Qlearning:
    def __init__(self, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.lr = learning_rate    # learning rate
        self.gamma = reward_decay  # reward decay
        self.epsilon = e_greedy    # probability of exploiting the best known action
        self.qTable = {}           # state -> {action name: weight}

    def chooseAction(self, oldPosition, allowedActions):
        '''Step 2: pick one of the actions allowed at this position.'''
        strop = str(oldPosition)
        optionalChoices = self.qTable.get(strop, None)
        if optionalChoices is None:
            # First visit to this state: create its row with all weights at zero
            current = {}
            for s in allowedActions:
                current[s[0]] = 0.0

            self.qTable[strop] = current

        if np.random.random() > self.epsilon:
            # Explore: with probability 1 - epsilon pick a random allowed action
            action = allowedActions[np.random.choice(len(allowedActions))]
        else:
            # Exploit: pick uniformly among the allowed actions with the highest weight
            actionWeight = self.qTable[strop]
            allowState = []
            maxAW = max(actionWeight.values())
            for s in allowedActions:
                if actionWeight.get(s[0], 0.0) == maxAW:
                    allowState.append(s)

            action = allowState[np.random.choice(len(allowState))]

        return action

    def learn(self, oldPosition, newPosition, action, award):
        '''Step 4: update the weight of the action just taken.'''
        strOp = str(oldPosition)
        strNp = str(newPosition)
        maxAward = award
        if strNp in self.qTable:
            # Bootstrap: the reward plus the discounted best weight of the next state
            maxAward = award + self.gamma * max(self.qTable[strNp].values())

        # Q-Learning update formula; the commented-out line is the equivalent
        # (1 - lr) * old + lr * target form
        # self.qTable[strOp][action] = (1 - self.lr) * self.qTable[strOp].get(action, 0) + self.lr * maxAward
        self.qTable[strOp][action] += self.lr * (maxAward - self.qTable[strOp].get(action, 0))

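Driving the class by hand shows the table being filled in; a small sketch with toy position labels rather than real canvas coordinates:

ql = Qlearning(e_greedy=1.0)                  # always exploit, so the demo has no exploration
allowed = [('右', [40, 0]), ('下', [0, 40])]  # toy allowed-action list
action = ql.chooseAction('s0', allowed)       # first visit creates the row with zero weights
ql.learn('s0', 's1', action[0], 1)            # pretend the move earned a reward of 1
print(ql.qTable)                              # e.g. {'s0': {'右': 0.01, '下': 0.0}}; ties break at random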

Training:

from src.tkd.GameWorld import GameWorld
from src.tkd.Qlearning import Qlearning

class Test:

    def run(self, game, ql, i):
        '''Run one training episode until the agent reaches the goal or the trap.'''
        step = 0
        finish = False
        while not finish:
            allow, oldPosition = game.allowActions()                  # step 1: available actions
            action = ql.chooseAction(oldPosition, allow)              # step 2: pick one
            finish, newPosition, award = game.updatePosition(action)  # step 3: move and collect the reward
            ql.learn(oldPosition, newPosition, action[0], award)      # step 4: update the Q table
            step += 1

        print('Episode %d finished in %d steps' % (i, step))

        game.reset()

if __name__ == '__main__':

    game = GameWorld()
    qlearning = Qlearning()
    test = Test()
    for i in range(50):
        test.run(game, qlearning, i)