根据模型的数学原理进行简单的代码自我复现以及使用测试,仅作自我学习用。模型原理此处不作过多赘述,仅罗列自己将要使用到的部分公式。

如文中或代码有错误或是不足之处,还望能不吝指正。

Softmax是logistic回归的一个“一般形式”。基本思想就是在线性回归的基础上,使用一个单调可微的函数,使得预测值变为1个趋近于0/1的“预测概率”,与真实的标记y联系起来。在Logistics回归中,这个函数为

GPT自回归 自回归模型代码_python

其中

GPT自回归 自回归模型代码_ico_02


而softmax将其一般化为多分类任务,单调可微的函数为

GPT自回归 自回归模型代码_python_03


将预测的概率与真实值的标签分布的距离作为损失值,这种损失被称为交叉熵函数

GPT自回归 自回归模型代码_python_04

对其进行梯度下降,可以得到

GPT自回归 自回归模型代码_GPT自回归_05

。其中

GPT自回归 自回归模型代码_GPT自回归_06

为学习率,

GPT自回归 自回归模型代码_机器学习_07

为以每次使用到的样本个数。

实际在sklearn中,是可以采取不同的优化策略的,如牛顿法、共轭梯度下降、梯度下降等方法。

在sklearn中,Logistics回归是具有"l1正则"、"l2正则"以及"弹性网络"的参数的,但是在我自己实现的正则化下,对于没有进行过特殊处理的数据,并没有明显的改善过拟合情况。所以此处不再进行试验。

import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import  PolynomialFeatures
from sklearn.utils import shuffle

class SoftMax:
    def pre_processing(self,data,polynomial_degree=0,sinusoid_degree=0,normalize_data=True):
        """
        数据预处理
        polynomial_degree:多项式处理,1为仅添加偏置
        sinusoid_degree:正弦处理次数
        normalize_data:是否标准化
        """
        if polynomial_degree>=1:#多项式
            poly = PolynomialFeatures(polynomial_degree)
            data = poly.fit_transform(data)
            
        if sinusoid_degree>0:#sin函数非线性变换
            sinusoid = np.empty((data.shape[0],0))
            for i in range(1,sinusoid_degree+1):
                sinusoid_feature = np.sin(i*data)
                sinusoid = np.concatenate((sinusoid,sinusoid_feature),axis=1)
            data = sinusoid
        if normalize_data:
            data = (data-self.feature_mean)/self.feature_std
        return data
    
    def __init__(self,data,labels,polynomial_degree=0,sinusoid_degree=0,normalize_data=True,threshold=1e-9):
        """
        data:特征矩阵
        labels:分类类别
        polynomial_degree:多项式处理,1为仅添加偏置
        sinusoid_degree:正弦处理次数
        normalize_data:是否标准化
        threshold:阈值,损失函数变化小于阈值时停止训练
        """
        self.normalize_data=normalize_data
        self.feature_mean = np.mean(data)
        self.feature_std = np.std(data)
        self.threshold = threshold #权重变化阈值
        self.data = self.pre_processing(data,polynomial_degree,sinusoid_degree,normalize_data)
        self.labels = labels
        self.label_unique = np.unique(self.labels)
        self.label_num = len(self.label_unique)
        self.one_hot_label = pd.get_dummies(self.labels)
        self.one_hot_label = self.one_hot_label.values
        self.polynomial_degree=polynomial_degree
        self.sinusoid_degree=sinusoid_degree
        feature_num = self.data.shape[1]
        self.weight = np.random.random((feature_num,self.label_num)) #初始化w矩阵
        
    
        
    def train(self,alpha=0.1,num_iterations = 1000,gradient_m=128,get_weight_history = False,lambda_l1 = 0,lambda_l2 = 0):
        """
        alpha:学习率
        num_iterations:梯度/坐标下降做大次数
        gradient_m:每次迭代使用样本数量,-1时为批量梯度下降(BGD)
        get_weight_history:是否返回权重的训练历史信息
        """
        cost_history = self.gradient_descent(alpha,num_iterations,gradient_m,get_weight_history,lambda_l1,lambda_l2)
        return self.weight,cost_history
    
    def gradient_descent(self,alpha,num_iterations,gradient_m,get_weight_history,lambda_l1,lambda_l2):
        """
        梯度下降过程,参数同train()中的一样
        alpha:学习率
        num_iterations:梯度/坐标下降做大次数
        gradient_m:每次迭代使用样本数量,-1时为批量梯度下降(BGD)
        get_weight_history:是否返回权重的训练历史信息
        """
        if gradient_m==-1 or self.data.shape[0]<128:
            gradient_m = self.data.shape[0]
        cost_history=[]
        weight_history = []
        for i in range(num_iterations):
            self.gradient_step(alpha,gradient_m,lambda_l1,lambda_l2)
            last_cost = self.cost_function(self.data,self.one_hot_label,gradient_m,lambda_l1,lambda_l2)
            cost_history.append(last_cost)
            if get_weight_history:
                weight_history.append(self.weight)
            if len(cost_history)>2 and abs(cost_history[-1]-cost_history[-2])<self.threshold:
                break
        if get_weight_history:
            return [cost_history,weight_history]
        return cost_history
        
    def cost_function(self,data,labels,example_num,lambda_l1,lambda_l2):
        """
        data:特征矩阵
        labels:分类类别
        example_num:每次迭代使用样本数量,-1时为批量梯度下降(BGD)
        """
        y_hat = SoftMax.hypothesis(data,self.weight,self.label_num)
        cost = 0
        for i in range(self.label_num):
            cost+=(1/2)*(np.dot(np.log(y_hat[:,i].T),labels[:,i])/example_num+lambda_l2*np.dot(self.weight[:,i].T,self.weight[:,i])+lambda_l1*np.linalg.norm(self.weight[:,i]))
        return -cost
    
    def gradient_step(self,alpha,example_num,lambda_l1,lambda_l2):
        """
        一次梯度下降
        参数同cost_function
        data:特征矩阵
        labels:分类类别
        example_num:每次迭代使用样本数量,-1时为批量梯度下降(BGD)
        """
        idx = random.sample(range(self.data.shape[0]),example_num)
        data = self.data[idx]
        prediction = SoftMax.hypothesis(data,self.weight,self.label_num)
        delta = prediction-self.one_hot_label[idx]
        weight = self.weight
        for i in range(self.label_num):
            for w in range(weight.shape[0]):
                if lambda_l1>0:
                    rk = np.dot(delta[:,i].T,data[:,w])
                    if rk<-lambda_l1:
                        lambda_l1 = -lambda_l1
                    weight[w][i] = alpha*(1/example_num)*(np.dot(delta[:,i].T,data[:,w])-lambda_l1)/(np.linalg.norm(data[:,w],2)**2)
                else:
                    weight[w][i] -= (alpha*(1/example_num)*np.dot(delta[:,i].T,data[:,w])-lambda_l2*weight[w][i])
        #weight -= alpha*(1/example_num)*(np.dot(delta.T,data)).T
        self.weight = weight
    @staticmethod
    def hypothesis(data,weight,label_num):
        """
        计算因变量
        data:特征矩阵
        labels:分类类别
        label_num:类别总数
        """
        addition = sum([np.exp(np.dot(data,weight[:,i])) for i in range(label_num)])
        predictions = np.array([np.exp(np.dot(data,weight[:,i])) for i in range(label_num)])/addition
        return predictions.T
    
    def get_cost(self,data,labels):
        """
        计算损失值
        data:特征矩阵
        labels:分类类别
        """
        data = self.pre_processing(data)
        return self.cost_function(data,labels,data.shape[0])
    
    def predict(self,data):
        """
        从新数据(测试集)上预测
        data:特征矩阵
        """
        data = self.pre_processing(data,self.polynomial_degree,self.sinusoid_degree,self.normalize_data)
        predictions = SoftMax.hypothesis(data,self.weight,self.label_num)
        return predictions

    def predict_label(self,data):
        """
        从新数据(测试集)上预测标签,同时返回一个是否预测正确的list
        data:特征矩阵
        """
        data = self.pre_processing(data,self.polynomial_degree,self.sinusoid_degree,self.normalize_data)
        pred_proba = SoftMax.hypothesis(data,self.weight,self.label_num)
        predictions = [sf.label_unique[np.argmax(i)] for i in pred_proba]
        return predictions

    def score(self,data,label):
        """
        计算(在传入的测试集上)的正确率
        data:(测试集上的)特征矩阵
        label:(测试机上的)分类类别
        """
        predictions = self.predict_label(data)
        is_true = predictions==label
        return sum(is_true)/len(is_true)

 在鸢尾花数据集上测试自己的Softmax回归效果,达到了预期的分类目标。

df = pd.read_csv("iris.data",header = None)
X = df.iloc[:,0:4]
Y = df.iloc[:,4]

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X,Y)
sf = SoftMax(x_train.values,y_train.values)
weight,cost = sf.train(alpha=0.1,num_iterations=5000)
print("测试集上的准确率为:",sf.score(x_test,y_test))

"""
测试集上的准确率为: 0.9210526315789473
"""

接下来,仅用花瓣的特征作为softmax的特征,以此在二维平面上画出决策边界。

from matplotlib import pyplot as plt
import matplotlib as mpl
mpl.rcParams["font.family"]="SimHei"

X = df.iloc[:,2:4]
Y = df.iloc[:,4]
x_train,x_test,y_train,y_test = train_test_split(X,Y)
sf = SoftMax(x_train.values,y_train.values)
weight,cost = sf.train(alpha=0.1,num_iterations=5000)

X_contour = np.array([np.linspace(np.min(X[2]),np.max(X[2]),df.shape[0])
                    ,np.linspace(np.min(X[3]),np.max(X[3]),df.shape[0])])

Z_setosa = np.zeros((df.shape[0],df.shape[0]))
Z_versicolor = np.zeros((df.shape[0],df.shape[0]))
Z_virginica = np.zeros((df.shape[0],df.shape[0]))
for idx_1,x1 in enumerate(X_contour[0]):
    for idx_2,x2 in enumerate(X_contour[1]):
        prediction = sf.predict_label([np.array([x1,x2])])[0]
        if prediction == "Iris-setosa":
            Z_setosa[idx_1][idx_2]=1
        elif prediction=="Iris-versicolor":
            Z_versicolor[idx_1][idx_2]=1
        else:
            Z_virginica[idx_1][idx_2]=1

setosa_idx = [i for i in range(len(Y)) if Y[i]=="Iris-setosa"]
versicolor_idx = [i for i in range(len(Y)) if Y[i]=="Iris-versicolor"]
virginica_idx = [i for i in range(len(Y)) if Y[i]=="Iris-virginica"]
plt.scatter(X[2][setosa_idx],X[3][setosa_idx],c='blue',label = 'setosa')
plt.scatter(X[2][versicolor_idx],X[3][versicolor_idx],c='green',label = 'versicolor')
plt.scatter(X[2][virginica_idx],X[3][virginica_idx],c='red',label = "virginica")
plt.contour(X_contour[0],X_contour[1],Z_setosa)
plt.contour(X_contour[0],X_contour[1],Z_versicolor)
plt.contour(X_contour[0],X_contour[1],Z_virginica)
plt.legend()
plt.title("决策边界")
plt.show()

GPT自回归 自回归模型代码_python_08

由于在测试过程中,没有发现L1惩罚项和L2惩罚项比原始的无偏估计准确,故此处不再演示ambda_l1 和lambda_l2这2个参数的相关操作了。