Data source:
Reference: 《机器学习Python实战》 by 魏贞原 (Wei Zhenyuan)
Purpose of this post: review
Tool: Geany
# Import libraries
from pandas import read_csv                      # read the data
from pandas import set_option                    # set display precision for printed data
from pandas.plotting import scatter_matrix       # scatter-plot matrix
import numpy as np
import matplotlib.pyplot as plt                  # plotting
from sklearn.preprocessing import Normalizer     # preprocessing: normalization (unit norm)
from sklearn.preprocessing import StandardScaler # preprocessing: standardization
from sklearn.preprocessing import MinMaxScaler   # preprocessing: rescaling
from sklearn.model_selection import train_test_split # split the dataset
from sklearn.model_selection import cross_val_score  # score algorithm accuracy
from sklearn.model_selection import KFold             # cross-validation
from sklearn.model_selection import GridSearchCV      # hyperparameter tuning: grid search
from sklearn.linear_model import LinearRegression     # linear regression
from sklearn.linear_model import Lasso                # lasso regression
from sklearn.linear_model import ElasticNet           # elastic-net regression
from sklearn.linear_model import LogisticRegression   # logistic regression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis    # linear discriminant analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis # quadratic discriminant analysis
from sklearn.tree import DecisionTreeRegressor        # decision-tree regression
from sklearn.tree import DecisionTreeClassifier       # decision-tree classification
from sklearn.neighbors import KNeighborsRegressor     # KNN regression
from sklearn.neighbors import KNeighborsClassifier    # KNN classification
from sklearn.naive_bayes import GaussianNB            # Gaussian naive Bayes classifier
from sklearn.svm import SVR                           # support vector machine, regression
from sklearn.svm import SVC                           # support vector machine, classification
from sklearn.pipeline import Pipeline                 # automates the whole workflow from data transforms to model evaluation
from sklearn.ensemble import RandomForestRegressor    # random forest regression
from sklearn.ensemble import RandomForestClassifier   # random forest classification
from sklearn.ensemble import GradientBoostingRegressor  # gradient boosting regression
from sklearn.ensemble import GradientBoostingClassifier # gradient boosting classification
from sklearn.ensemble import ExtraTreesRegressor      # extra-trees regression
from sklearn.ensemble import ExtraTreesClassifier     # extra-trees classification
from sklearn.ensemble import AdaBoostRegressor        # AdaBoost regression
from sklearn.ensemble import AdaBoostClassifier       # AdaBoost classification
from sklearn.metrics import mean_squared_error        # mean squared error
from sklearn.metrics import accuracy_score            # classification accuracy
from sklearn.metrics import confusion_matrix          # confusion matrix
from sklearn.metrics import classification_report     # classification report
# Load the data
filename = 'wine.csv'
data = read_csv(filename, header=None, delimiter=',')

# Understand the data
print(data.shape)
#print(data.dtypes)
#print(data.corr(method='pearson'))
#print(data.describe())
#print(data.groupby(0).size())
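set_option is imported at the top but never used; a minimal sketch of how it could trim the output of describe() (assuming a recent pandas, where the option is spelled 'display.precision'; older releases used 'precision'):

# Hedged sketch: cap printed precision before inspecting summary statistics.
set_option('display.precision', 2)
print(data.describe())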
# Data visualization: histograms, density plots, scatter matrix, correlation matrix
# Histograms
#data.hist()
#plt.show()

# Density plots
#data.plot(kind='density', subplots=True, layout=(4,4), sharex=False, sharey=False)
#plt.show()

# Scatter-plot matrix
#scatter_matrix(data)
#plt.show()

# Correlation matrix plot
#fig = plt.figure()
#ax = fig.add_subplot(111)
#cax = ax.matshow(data.corr(), vmin=-1, vmax=1)
#fig.colorbar(cax)
#plt.show()

# Data preprocessing: rescaling, normalization, standardization, binarization
array = data.values
X = array[:, 1:14].astype(float)   # 13 features in columns 1-13
Y = array[:, 0]                    # class label in column 0
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
X_m = scaler.transform(X)          # rescaled to [0, 1]
scaler = Normalizer().fit(X)
X_n = scaler.transform(X)          # each row scaled to unit norm
scaler = StandardScaler().fit(X)
X_s = scaler.transform(X)          # zero mean, unit variance

# Split the dataset (note: the scaled variants must be split from X_m/X_n/X_s,
# not from the raw X, otherwise the transforms have no effect)
validation_size = 0.2
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
X_m_train, X_m_test, Y_m_train, Y_m_test = train_test_split(X_m, Y, test_size=validation_size, random_state=seed)
X_n_train, X_n_test, Y_n_train, Y_n_test = train_test_split(X_n, Y, test_size=validation_size, random_state=seed)
X_s_train, X_s_test, Y_s_train, Y_s_test = train_test_split(X_s, Y, test_size=validation_size, random_state=seed)

# Model selection (this is a classification problem)
# Nonlinear models: KNN, SVC, CART, GaussianNB
# Linear/discriminant models: LDA, QDA (the regression models LR, Lasso, ElasticNet stay commented out)
models = {}
models['KNN'] = KNeighborsClassifier()
models['SVM'] = SVC()
models['CART'] = DecisionTreeClassifier()
models['GN'] = GaussianNB()
#models['LR'] = LinearRegression()
#models['Lasso'] = Lasso()
#models['EN'] = ElasticNet()
models['LDA'] = LinearDiscriminantAnalysis()
models['QDA'] = QuadraticDiscriminantAnalysis()

# Evaluate the models with 10-fold cross-validation
scoring = 'accuracy'
num_folds = 10
seed = 7
results = []
for key in models:
    # recent scikit-learn requires shuffle=True whenever random_state is set
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(models[key], X_train, Y_train, scoring=scoring, cv=kfold)
    results.append(cv_results)
    print('%s %f (%f)' % (key, cv_results.mean(), cv_results.std()))

results_m = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results_m = cross_val_score(models[key], X_m_train, Y_m_train, scoring=scoring, cv=kfold)
    results_m.append(cv_results_m)
    print('rescaled: %s %f (%f)' % (key, cv_results_m.mean(), cv_results_m.std()))

results_n = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results_n = cross_val_score(models[key], X_n_train, Y_n_train, scoring=scoring, cv=kfold)
    results_n.append(cv_results_n)
    print('normalized: %s %f (%f)' % (key, cv_results_n.mean(), cv_results_n.std()))

results_s = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results_s = cross_val_score(models[key], X_s_train, Y_s_train, scoring=scoring, cv=kfold)
    results_s.append(cv_results_s)
    print('standardized: %s %f (%f)' % (key, cv_results_s.mean(), cv_results_s.std()))
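One caveat worth flagging: the scalers above are fit on the full X before the split, so information from the held-out rows leaks into the training folds. Pipeline is imported at the top but never used; a minimal sketch (reusing the scoring, num_folds, and seed settings above) of how it could fold standardization into cross-validation so each fold is scaled independently:

# Hedged sketch: scale inside each CV fold instead of ahead of time.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('lda', LinearDiscriminantAnalysis())])
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
cv_pipe = cross_val_score(pipe, X_train, Y_train, scoring=scoring, cv=kfold)
print('pipeline LDA: %f (%f)' % (cv_pipe.mean(), cv_pipe.std()))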
# Box plot comparison of the algorithms
#fig = plt.figure()
#ax = fig.add_subplot(111)
#fig.suptitle('Algorithm Comparison')
#plt.boxplot(results)
#ax.set_xticklabels(models.keys())
#plt.show()

# Algorithm tuning: improve LinearDiscriminantAnalysis by tuning its parameters
param_grid = {'solver':['svd', 'lsqr', 'eigen']}
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, params in cv_results:
    print('%f (%f) with %r' % (mean, std, params))

# Ensemble methods
# bagging: random forest, extra trees
# boosting: AdaBoost, gradient boosting
ensembles = {}
ensembles['RF'] = RandomForestClassifier()
ensembles['ET'] = ExtraTreesClassifier()
ensembles['ADA'] = AdaBoostClassifier()
ensembles['GBM'] = GradientBoostingClassifier()
results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(ensembles[key], X_train, Y_train, scoring=scoring, cv=kfold)
    results.append(cv_results)
    print('%s %f (%f)' % (key, cv_results.mean(), cv_results.std()))

# Tune the GBM ensemble
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(X=X_train, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, params in cv_results:
    print('%f (%f) with %r' % (mean, std, params))

# Train the final model
model = LinearDiscriminantAnalysis(solver='svd')
model.fit(X=X_train, y=Y_train)

# Evaluate the final model on the held-out test set
predictions = model.predict(X_test)
print(accuracy_score(Y_test, predictions))
print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions))
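If the tuned model is worth keeping, it could be persisted for later reuse. A minimal sketch using joblib (an extra dependency, not part of the script above; the filename is hypothetical):

# Hedged sketch: persist and reload the final model, assuming joblib is installed.
from joblib import dump, load
dump(model, 'lda_wine.joblib')        # 'lda_wine.joblib' is a made-up filename
restored = load('lda_wine.joblib')
print(accuracy_score(Y_test, restored.predict(X_test)))  # should match the score above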