This is a simple self-implementation and test of the model based on its mathematical principles, for self-study only. The theory behind the model is not covered in detail here; only the formulas that will actually be used are listed.
If there are any mistakes or shortcomings in the text or code, corrections are very welcome.
Softmax regression is the "general form" of logistic regression. The basic idea is to take linear regression and pass it through a monotone, differentiable function so that the prediction becomes a "predicted probability" pushed towards 0/1, which can then be linked to the true label y. In logistic regression this function is

$$\hat{y}=\sigma(z)=\frac{1}{1+e^{-z}},$$

where $z=w^{T}x+b$.

Softmax generalizes this to multi-class tasks; the monotone, differentiable function becomes

$$\hat{y}_{k}=P(y=k\mid x)=\frac{e^{w_{k}^{T}x}}{\sum_{j=1}^{K}e^{w_{j}^{T}x}},\qquad k=1,\dots,K.$$

The distance between the predicted probabilities and the distribution of the true labels is used as the loss; this loss is called the cross-entropy:

$$J(W)=-\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}y_{k}^{(i)}\log\hat{y}_{k}^{(i)}.$$

Running gradient descent on it gives the update

$$w_{k}\leftarrow w_{k}-\frac{\alpha}{m}\sum_{i=1}^{m}\left(\hat{y}_{k}^{(i)}-y_{k}^{(i)}\right)x^{(i)},$$

where $\alpha$ is the learning rate and $m$ is the number of samples used in each step.
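To make these formulas concrete, here is a minimal NumPy sanity check (my own illustration, not part of the implementation below; the toy values are made up). It computes the softmax probabilities, the cross-entropy loss, and then performs one gradient-descent update.
import numpy as np
# Toy batch: m = 2 samples, 2 features, K = 3 classes (illustrative values only)
X = np.array([[1.0, 2.0],
              [0.5, -1.0]])           # shape (m, n)
Y = np.array([[1, 0, 0],
              [0, 0, 1]])             # one-hot labels, shape (m, K)
W = np.zeros((2, 3))                  # weight matrix, shape (n, K)
alpha = 0.1
# Softmax: y_hat[i, k] = exp(x_i . w_k) / sum_j exp(x_i . w_j)
scores = X @ W
scores -= scores.max(axis=1, keepdims=True)   # subtract the row max for numerical stability
y_hat = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
# Cross-entropy: J = -(1/m) * sum_i sum_k y[i,k] * log(y_hat[i,k])
loss = -(Y * np.log(y_hat)).sum() / X.shape[0]
print(loss)   # with all-zero weights this equals log(3), about 1.0986
# One gradient-descent step: W <- W - (alpha/m) * X^T (y_hat - Y)
W -= alpha / X.shape[0] * X.T @ (y_hat - Y)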
In practice, sklearn can use different optimization strategies for this, such as Newton-type methods, conjugate gradient, and (stochastic) gradient descent.
In sklearn, logistic regression also exposes "l1", "l2" and "elasticnet" penalty options. With my own implementation of regularization, however, there was no clear improvement in overfitting on data that had not been specially processed, so no experiments with it are shown here.
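For reference, here is a minimal sketch of how the solver and penalty are selected in sklearn (this snippet is my own illustration, not part of the original code; iris is used only as placeholder data):
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
# lbfgs (a quasi-Newton method) with the default l2 penalty
clf_l2 = LogisticRegression(solver="lbfgs", penalty="l2", C=1.0, max_iter=1000).fit(X, y)
# saga also supports the l1 and elasticnet penalties
clf_l1 = LogisticRegression(solver="saga", penalty="l1", C=1.0, max_iter=5000).fit(X, y)
print(clf_l2.score(X, y), clf_l1.score(X, y))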
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils import shuffle
class SoftMax:
def pre_processing(self,data,polynomial_degree=0,sinusoid_degree=0,normalize_data=True):
"""
数据预处理
polynomial_degree:多项式处理,1为仅添加偏置
sinusoid_degree:正弦处理次数
normalize_data:是否标准化
"""
        if polynomial_degree>=1:  # polynomial feature expansion
poly = PolynomialFeatures(polynomial_degree)
data = poly.fit_transform(data)
        if sinusoid_degree>0:  # non-linear transform with sin
sinusoid = np.empty((data.shape[0],0))
for i in range(1,sinusoid_degree+1):
sinusoid_feature = np.sin(i*data)
sinusoid = np.concatenate((sinusoid,sinusoid_feature),axis=1)
data = sinusoid
if normalize_data:
data = (data-self.feature_mean)/self.feature_std
return data
def __init__(self,data,labels,polynomial_degree=0,sinusoid_degree=0,normalize_data=True,threshold=1e-9):
"""
data:特征矩阵
labels:分类类别
polynomial_degree:多项式处理,1为仅添加偏置
sinusoid_degree:正弦处理次数
normalize_data:是否标准化
threshold:阈值,损失函数变化小于阈值时停止训练
"""
self.normalize_data=normalize_data
        self.feature_mean = np.mean(data)  # scalar mean/std over the whole raw training matrix
        self.feature_std = np.std(data)
        self.threshold = threshold  # stop training when the change in the loss falls below this value
self.data = self.pre_processing(data,polynomial_degree,sinusoid_degree,normalize_data)
self.labels = labels
self.label_unique = np.unique(self.labels)
self.label_num = len(self.label_unique)
self.one_hot_label = pd.get_dummies(self.labels)
self.one_hot_label = self.one_hot_label.values
self.polynomial_degree=polynomial_degree
self.sinusoid_degree=sinusoid_degree
feature_num = self.data.shape[1]
        self.weight = np.random.random((feature_num,self.label_num))  # randomly initialize the weight matrix (n_features x n_classes)
def train(self,alpha=0.1,num_iterations = 1000,gradient_m=128,get_weight_history = False,lambda_l1 = 0,lambda_l2 = 0):
"""
alpha:学习率
num_iterations:梯度/坐标下降做大次数
gradient_m:每次迭代使用样本数量,-1时为批量梯度下降(BGD)
get_weight_history:是否返回权重的训练历史信息
"""
cost_history = self.gradient_descent(alpha,num_iterations,gradient_m,get_weight_history,lambda_l1,lambda_l2)
return self.weight,cost_history
def gradient_descent(self,alpha,num_iterations,gradient_m,get_weight_history,lambda_l1,lambda_l2):
"""
梯度下降过程,参数同train()中的一样
alpha:学习率
num_iterations:梯度/坐标下降做大次数
gradient_m:每次迭代使用样本数量,-1时为批量梯度下降(BGD)
get_weight_history:是否返回权重的训练历史信息
"""
        # fall back to full-batch gradient descent when requested or when the dataset is smaller than the batch size
        if gradient_m==-1 or self.data.shape[0]<gradient_m:
            gradient_m = self.data.shape[0]
cost_history=[]
weight_history = []
for i in range(num_iterations):
self.gradient_step(alpha,gradient_m,lambda_l1,lambda_l2)
            last_cost = self.cost_function(self.data,self.one_hot_label,self.data.shape[0],lambda_l1,lambda_l2)
cost_history.append(last_cost)
if get_weight_history:
                weight_history.append(self.weight.copy())  # copy so the history is not overwritten by later updates
if len(cost_history)>2 and abs(cost_history[-1]-cost_history[-2])<self.threshold:
break
if get_weight_history:
return [cost_history,weight_history]
return cost_history
def cost_function(self,data,labels,example_num,lambda_l1,lambda_l2):
"""
data:特征矩阵
labels:分类类别
example_num:每次迭代使用样本数量,-1时为批量梯度下降(BGD)
"""
y_hat = SoftMax.hypothesis(data,self.weight,self.label_num)
cost = 0
for i in range(self.label_num):
cost+=(1/2)*(np.dot(np.log(y_hat[:,i].T),labels[:,i])/example_num+lambda_l2*np.dot(self.weight[:,i].T,self.weight[:,i])+lambda_l1*np.linalg.norm(self.weight[:,i]))
return -cost
def gradient_step(self,alpha,example_num,lambda_l1,lambda_l2):
"""
一次梯度下降
参数同cost_function
data:特征矩阵
labels:分类类别
example_num:每次迭代使用样本数量,-1时为批量梯度下降(BGD)
"""
idx = random.sample(range(self.data.shape[0]),example_num)
data = self.data[idx]
prediction = SoftMax.hypothesis(data,self.weight,self.label_num)
delta = prediction-self.one_hot_label[idx]
weight = self.weight
for i in range(self.label_num):
for w in range(weight.shape[0]):
if lambda_l1>0:
rk = np.dot(delta[:,i].T,data[:,w])
if rk<-lambda_l1:
lambda_l1 = -lambda_l1
weight[w][i] = alpha*(1/example_num)*(np.dot(delta[:,i].T,data[:,w])-lambda_l1)/(np.linalg.norm(data[:,w],2)**2)
else:
weight[w][i] -= (alpha*(1/example_num)*np.dot(delta[:,i].T,data[:,w])-lambda_l2*weight[w][i])
#weight -= alpha*(1/example_num)*(np.dot(delta.T,data)).T
self.weight = weight
@staticmethod
def hypothesis(data,weight,label_num):
"""
计算因变量
data:特征矩阵
labels:分类类别
label_num:类别总数
"""
addition = sum([np.exp(np.dot(data,weight[:,i])) for i in range(label_num)])
predictions = np.array([np.exp(np.dot(data,weight[:,i])) for i in range(label_num)])/addition
return predictions.T
def get_cost(self,data,labels):
"""
计算损失值
data:特征矩阵
labels:分类类别
"""
data = self.pre_processing(data)
return self.cost_function(data,labels,data.shape[0])
def predict(self,data):
"""
从新数据(测试集)上预测
data:特征矩阵
"""
data = self.pre_processing(data,self.polynomial_degree,self.sinusoid_degree,self.normalize_data)
predictions = SoftMax.hypothesis(data,self.weight,self.label_num)
return predictions
def predict_label(self,data):
"""
从新数据(测试集)上预测标签,同时返回一个是否预测正确的list
data:特征矩阵
"""
data = self.pre_processing(data,self.polynomial_degree,self.sinusoid_degree,self.normalize_data)
pred_proba = SoftMax.hypothesis(data,self.weight,self.label_num)
        predictions = [self.label_unique[np.argmax(i)] for i in pred_proba]
return predictions
def score(self,data,label):
"""
计算(在传入的测试集上)的正确率
data:(测试集上的)特征矩阵
label:(测试机上的)分类类别
"""
predictions = self.predict_label(data)
is_true = predictions==label
return sum(is_true)/len(is_true)
Testing my own Softmax regression on the iris dataset, it reaches the expected classification performance.
df = pd.read_csv("iris.data",header = None)
X = df.iloc[:,0:4]
Y = df.iloc[:,4]
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y)
sf = SoftMax(x_train.values,y_train.values)
weight,cost = sf.train(alpha=0.1,num_iterations=5000)
print("测试集上的准确率为:",sf.score(x_test,y_test))
"""
Accuracy on the test set: 0.9210526315789473
"""
Next, only the petal features are used as the Softmax features, so that the decision boundary can be drawn in a two-dimensional plane.
from matplotlib import pyplot as plt
import matplotlib as mpl
mpl.rcParams["font.family"]="SimHei"
X = df.iloc[:,2:4]
Y = df.iloc[:,4]
x_train,x_test,y_train,y_test = train_test_split(X,Y)
sf = SoftMax(x_train.values,y_train.values)
weight,cost = sf.train(alpha=0.1,num_iterations=5000)
X_contour = np.array([np.linspace(np.min(X[2]),np.max(X[2]),df.shape[0])
,np.linspace(np.min(X[3]),np.max(X[3]),df.shape[0])])
Z_setosa = np.zeros((df.shape[0],df.shape[0]))
Z_versicolor = np.zeros((df.shape[0],df.shape[0]))
Z_virginica = np.zeros((df.shape[0],df.shape[0]))
for idx_1,x1 in enumerate(X_contour[0]):
    for idx_2,x2 in enumerate(X_contour[1]):
        prediction = sf.predict_label([np.array([x1,x2])])[0]
        # plt.contour expects Z[row,col] = Z[y-index,x-index], so index with (idx_2,idx_1)
        if prediction == "Iris-setosa":
            Z_setosa[idx_2][idx_1]=1
        elif prediction=="Iris-versicolor":
            Z_versicolor[idx_2][idx_1]=1
        else:
            Z_virginica[idx_2][idx_1]=1
setosa_idx = [i for i in range(len(Y)) if Y[i]=="Iris-setosa"]
versicolor_idx = [i for i in range(len(Y)) if Y[i]=="Iris-versicolor"]
virginica_idx = [i for i in range(len(Y)) if Y[i]=="Iris-virginica"]
plt.scatter(X[2][setosa_idx],X[3][setosa_idx],c='blue',label = 'setosa')
plt.scatter(X[2][versicolor_idx],X[3][versicolor_idx],c='green',label = 'versicolor')
plt.scatter(X[2][virginica_idx],X[3][virginica_idx],c='red',label = "virginica")
plt.contour(X_contour[0],X_contour[1],Z_setosa)
plt.contour(X_contour[0],X_contour[1],Z_versicolor)
plt.contour(X_contour[0],X_contour[1],Z_virginica)
plt.legend()
plt.title("决策边界")
plt.show()
Since during testing the L1 and L2 penalties were not found to be more accurate than the plain, unregularized fit, the lambda_l1 and lambda_l2 parameters are not demonstrated here.