一、概述
主成分分析法(PCA)主要应用于数据降维。其思想是使用较少的变量来取代原先较多的变量,以实现节省数据量的效果。需要指出,若原始变量之间互相正交,即线性无关,则主成分分析法没有效果。
二、原理
假定有n个样本,每个样本有p个变量描述,则所有数据构成了一个n*p阶的矩阵X
X = [[dat1],
[dat2],
.....
[datn]]
但我们希望通过q个变量来描述这些数据(q<p),最简单地,可以取之前p个变量的线性组合,记为Z。对于n中的第i个数据,有
Z[i,1] = a[1,1]*x[i,1] + a[1,2]*x[i,2] +...+ a[1,p]*x[i,p]
Z[i,2] = a[2,1]*x[i,1] + a[2,2]*x[i,2] +...+ a[2,p]*x[i,p]
...
Z[i,q] = a[q,1]*x[i,1] + a[q,2]*x[i,2] +...+ a[q,p]*x[i,p]
现在要做的就是确定所有a的值,方便起见,将它们记为矩阵A。A为q*p阶。
要确定A,有以下原则:
(1)对于第i个数据而言,变换后描述其特征的向量Z中各元素两两线性无关。
(2)Z1是x1,x2,…,xp的一切线性组合中方差最大者;Z2是与Z1不相关的x1,x2,…,xp的一切线性组合中方差最大者;Zq是与Z1,…,Zq-1不相关的x1,x2,…,xp的一切线性组合中方差最大者;
根据上述原则确定的矩阵A,即可将数据X从p维空间降维到q维空间,实现降维。
三、步骤
(1)对所有样本进行中心化
x[i] = x[i] - (1/m)*np.sum(x[i])
(2)计算样本协方差矩阵C,C为p*p维矩阵
c = np.cov(x.T)
(3)对协方差矩阵C进行特征值分解,得到特征值T和其对应特征向量矩阵W。
t,m = np.linalg.eig(self.cov_mat)
(4)将特征值T和特征向量矩阵M中每一个特征向量以T为索引按从大到小排序。
(5)计算主成分贡献率及累计贡献率。主成分贡献率即每一特征值除以全部特征值之和,累计贡献率即前n个特征值之和除以全部特征值之和。
contribution rate = t[i]/np.sum(t)
accumulated contribution rate = np.sum(t[:n])/np.sum(t)
(6)从M中取出前q个特征向量并将其标准化,组成qp矩阵W。
(7)将数据矩阵X与W的转置W‘相乘,得到nq维输出样本矩阵Y。
一般来说,n的确定需要依据累计贡献率确定。一般选取累计贡献率大于85%的前几个特征值。
四、python代码实现
这里使用64维的手写数字图像作为数据集。
以下代码实现数据集的下载
# import necessary packages
import numpy as np
import pandas as pd
from sklearn.externals import joblib
#download data sets
digits_train = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra', header=None)
digits_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes', header=None)
data_train = digits_train.values
data_test = digits_test.values
#save datas
joblib.dump(data_train,'data_train.pkl')
joblib.dump(data_test,'data_test.pkl')
joblib.dump(digits_train,'digits_train.pkl')
joblib.dump(digits_test,'digits_test.pkl')
以下代码载入数据并进行计算,结果保存至result.csv文件中。这里将维度降至20。
#import necessary packages
import numpy as np
import pandas as pd
from scipy import io
from matplotlib import pyplot as plt
from sklearn.externals import joblib
#define PCA class
class PCA:
'''
The type of input data should be np.array.
As the form of [[dat1],[dat2],...]
'''
def __init__(self,data):
self.inp = data
self.inp_centralized = np.array(self.inp.shape,dtype="float64")
self.inp_mat = np.mat(self.centralize(),dtype='float64')
self.cov_mat = None
self.eig_value = None
self.eig_vectors = None
self.eig_elements = []
self.eig_elements_used = []
self.eig_elements_normalized = []
self.matrix_w = None
self.contribution_rate = []
self.contribution_rate_accumulated = []
self.res = None
#centralize the data
def centralize(self):
tem = []
for i in range(self.inp.shape[0]):
tem_dat = self.inp[i] - np.mean(self.inp[i])
tem.append(tem_dat)
self.inp_centralized = np.array(tem,dtype="float64")
return self.inp_centralized
#caulate eigenvalues and eigenvectors
def eig(self,n):
self.cov_mat = np.cov(self.inp_mat.T)
self.eig_value,self.eig_vectors = np.linalg.eig(self.cov_mat)
for i in range(self.eig_value.size):
tem = eig_element(self.eig_value[i],self.eig_vectors[i])
self.eig_elements.append(tem)
self.eig_elements.sort(key=lambda element:element.value,reverse=True) #sort
for i in range(n):
self.eig_elements_used.append(self.eig_elements[i])
self.caulate_contribution_rate()
#caulate the contribution rate
def caulate_contribution_rate(self):
s = np.sum(self.eig_value)
for i in self.eig_value:
self.contribution_rate.append(i/s)
for i in range(self.eig_value.size-1):
tem_eig_value = self.eig_value[0:i+1]
r = np.sum(tem_eig_value)/s
self.contribution_rate_accumulated.append(r)
print("\nThe list of contribution rate:")
print(self.contribution_rate)
print("\nThe list of accumulated contribution rate:")
print(self.contribution_rate_accumulated)
n = 0
for i in self.contribution_rate_accumulated:
if (i > 0.85):
print("\nThe number of features should above " + str(n+1) + " .")
break
n = n + 1
def MaxMinNormalization(self,x):
x = (x - np.min(x)) / (np.max(x) - np.min(x))
return x
#normalize the used eigenvectors
def normalize(self):
for i in self.eig_elements_used:
tem_vec = self.MaxMinNormalization(i.vector)
tem = eig_element(i.value,tem_vec)
self.eig_elements_normalized.append(tem)
#caulate processed data
def caulate_new_samples(self):
w = []
for i in self.eig_elements_normalized:
w.append(i.vector)
self.matrix_w = np.mat(w,dtype="float64")
self.res = self.matrix_w * np.mat(self.inp,dtype="float64").T
self.res = np.array(self.res.T)
return self.res
#using this function to complete whole process
def caulate(self,n):
self.eig(n)
self.normalize()
res = self.caulate_new_samples()
return res
#define class of eigenvalues and eigenvectors
class eig_element:
def __init__(self,val,vec):
self.value = val
self.vector = vec
def main():
#load data
ary_data_train = joblib.load('data_train.pkl')
print("The raw data is:")
print(ary_data_train)
np.savetxt('raw_data.csv', ary_data_train, delimiter = ',')
#process data
train = PCA(ary_data_train)
res = train.caulate(20)
print("\nThe result of data is:")
print(res)
io.savemat("result.mat",{"array":res})
np.savetxt('result.csv', res, delimiter = ',')
if __name__ == '__main__':
main()