用类封装起来,以后使用起来将方便多了。

import numpy as np
from sklearn import preprocessing 

from sklearn.ensemble import RandomForestClassifier 

import matplotlib.pyplot as plt 



class MYRandomForestClassifier:
    """Convenience wrapper bundling a scikit-learn estimator with a data set.

    The wrapper reads a comma-separated text file, label-encodes every
    column into integers, and offers helpers to fit the estimator and to plot
    validation and learning curves.  Throughout the class the last column of
    ``dataset`` is used as the target and all other columns as features.
    """

    def __init__(self, Modul=RandomForestClassifier, dataset=None):
        """Store the estimator and the data set.

        Args:
            Modul: An estimator instance, or an estimator class.  A class is
                instantiated with its default arguments so that the default
                value of this parameter yields a usable estimator.
            dataset: 2-D numeric array whose last column is the target.  When
                omitted, a 3x3 uninitialised placeholder array is created per
                instance (it is expected to be replaced by ``ReadFile``).
        """
        # Estimator instance used by the Fit*/Plot* methods.
        self.modul = self._as_estimator(Modul)
        # Encoded data set; features are [:, :-1], target is [:, -1].
        self.dataset = np.empty((3, 3)) if dataset is None else dataset
        # One fitted LabelEncoder per column, filled in by ReadFile.
        self.label_encoder = []

    @staticmethod
    def _as_estimator(Modul):
        """Return an estimator instance, instantiating *Modul* if it is a class."""
        return Modul() if isinstance(Modul, type) else Modul

    def ReadFile(self, input_file):
        """Load a comma-separated file and label-encode each column.

        Every column is treated as categorical text and mapped to integer
        codes with its own ``LabelEncoder``.  The encoded integer matrix is
        stored in ``self.dataset`` and the fitted encoders (in column order)
        in ``self.label_encoder``.

        Args:
            input_file: Path of the text file to read.

        Raises:
            ValueError: If the file holds no records, or if the records do
                not all have the same number of fields.
        """
        rows = []
        with open(input_file, 'r') as f:
            for line in f:
                # Strip only the line terminator; slicing off the last
                # character would truncate a final record that has no
                # trailing newline.
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue  # ignore blank lines (e.g. at the end of file)
                rows.append(line.split(','))

        if not rows:
            raise ValueError("no data rows found in %r" % (input_file,))
        width = len(rows[0])
        if any(len(row) != width for row in rows):
            raise ValueError(
                "rows in %r do not all have %d fields" % (input_file, width))

        X = np.array(rows)

        # Convert string data to numerical data, one encoder per column.
        encoders = []
        X_encoded = np.empty(X.shape, dtype=int)
        for i in range(X.shape[1]):
            encoder = preprocessing.LabelEncoder()
            X_encoded[:, i] = encoder.fit_transform(X[:, i])
            encoders.append(encoder)

        self.label_encoder = encoders
        self.dataset = X_encoded

    def SetModule(self, Modul):
        """Replace the wrapped estimator (instance, or class to instantiate)."""
        self.modul = self._as_estimator(Modul)

    def FitModule(self):
        """Fit the estimator on the whole data set and print its CV accuracy.

        The estimator is first fitted on all rows, then scored with 3-fold
        cross-validation using the 'accuracy' metric; the mean score is
        printed as a percentage.
        """
        from sklearn import model_selection

        X = self.dataset[:, :-1]
        y = self.dataset[:, -1]
        self.modul.fit(X, y)

        accuracy = model_selection.cross_val_score(
            self.modul, X, y, scoring='accuracy', cv=3)
        print("Accuracy of the classifier: " + str(round(100*accuracy.mean(), 2)) + "%")

    def PlotModule(self, parameter_grid, args):
        """Plot the mean training score of a validation curve.

        Args:
            parameter_grid: Sequence of values to try for the hyper-parameter.
            args: Name of the estimator hyper-parameter being varied
                (for example ``"n_estimators"`` or ``"max_depth"``).
        """
        from sklearn.model_selection import validation_curve

        # param_name/param_range are passed by keyword: recent scikit-learn
        # releases reject them as positional arguments.
        train_scores, validation_scores = validation_curve(
            self.modul, self.dataset[:, :-1], self.dataset[:, -1],
            param_name=args, param_range=parameter_grid, cv=5)

        # Plot the curve
        plt.figure()
        plt.plot(parameter_grid, 100*np.average(train_scores, axis=1), color='black')
        plt.title('Training curve')
        # Label the axis with the parameter actually swept; the historical
        # label is kept for the n_estimators case.
        plt.xlabel('Number of estimators' if args == 'n_estimators' else args)
        plt.ylabel('Accuracy')
        plt.show()

    def PlotLearingCurve(self, parameter_grid):
        """Print learning-curve scores and plot the mean training score.

        Args:
            parameter_grid: Training-set sizes forwarded to
                ``learning_curve`` as ``train_sizes``.
        """
        # sklearn.learning_curve was removed from scikit-learn; the function
        # lives in sklearn.model_selection, which this class already uses.
        from sklearn.model_selection import learning_curve

        train_sizes, train_scores, validation_scores = learning_curve(
            self.modul, self.dataset[:, :-1], self.dataset[:, -1],
            train_sizes=parameter_grid, cv=5)

        print( "\n##### LEARNING CURVES #####")
        print("\nTraining scores:\n", train_scores)
        print("\nValidation scores:\n", validation_scores)

        # Plot the curve against the sizes learning_curve actually used, so
        # the x values always line up with the rows of train_scores.
        plt.figure()
        plt.plot(train_sizes, 100*np.average(train_scores, axis=1), color='black')
        plt.title('Learning curve')
        plt.xlabel('Number of training samples')
        plt.ylabel('Accuracy')
        plt.show()

# Baseline model: fit on the loaded data and report cross-validated accuracy.
params = dict(n_estimators=200, max_depth=8, random_state=7)
classifier = RandomForestClassifier(**params)
MY = MYRandomForestClassifier(Modul=classifier)
MY.ReadFile("d:\\car.data.txt")
MY.FitModule()

# Validation curve over the number of trees, with the depth fixed at 4.
MY.SetModule(RandomForestClassifier(max_depth=4, random_state=7))
parameter_grid = np.linspace(25, 200, num=8).astype(int)
MY.PlotModule(parameter_grid, "n_estimators")

# Validation curve over the tree depth, with 20 trees.
MY.SetModule(RandomForestClassifier(n_estimators=20, random_state=7))
parameter_grid = np.linspace(2, 10, num=5).astype(int)
MY.PlotModule(parameter_grid, "max_depth")

# Learning curve for a default forest at four training-set sizes.
MY.SetModule(RandomForestClassifier(random_state=7))
parameter_grid = np.array((200, 500, 800, 1100))
MY.PlotLearingCurve(parameter_grid)