Python數據分析:實用向( 三 )

# ===========================================================================
# Imports (section "Model training / import modules" in the article)
# ===========================================================================
# Hoisted to the top so that every name is bound before its first use; the
# plotting snippets below originally ran before any import statement.
import math
import warnings

# NOTE(review): `sns` (seaborn) is never imported explicitly in this excerpt;
# presumably it is provided by pyforest's lazy star import -- confirm.
from pyforest import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.cluster import KMeans  # K-Means clustering
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier  # random forest
from sklearn.ensemble import VotingClassifier  # voting ensemble
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.metrics import (  # classification report and scores
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.metrics import confusion_matrix  # confusion matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import silhouette_score  # silhouette coefficient (k-means quality)
from sklearn.model_selection import GridSearchCV  # grid search with cross-validation
from sklearn.naive_bayes import GaussianNB  # naive Bayes
from sklearn.neighbors import KNeighborsClassifier  # k-nearest neighbours
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC  # support vector machines
from sklearn.tree import DecisionTreeClassifier  # decision tree
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")  # silence library warnings


# ===========================================================================
# Plotting
# ===========================================================================

# --- Plot preparation: make Chinese text and symbols render ----------------
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei: a CJK-capable sans-serif font
plt.rcParams['axes.unicode_minus'] = False  # fix symbols (minus sign) that fail to render
sns.set(font='SimHei', font_scale=0.8)  # make Seaborn render Chinese text

# --- Plot preparation: background style ------------------------------------
# NOTE(review): applying a style sheet after the font settings above may reset
# them (the pie-chart section re-applies the font settings) -- confirm the
# intended order.
plt.style.use('classic')
plt.rc("figure", facecolor="white")  # remove the grey figure border

# --- Box plots: Fare by Embarked/Pclass, training vs. test set -------------
# `train` and `test` are DataFrames defined outside this excerpt.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(16, 12), ncols=2)
ax1 = sns.boxplot(x="Embarked", y="Fare", hue="Pclass", data=train, ax=ax[0])
ax2 = sns.boxplot(x="Embarked", y="Fare", hue="Pclass", data=test, ax=ax[1])
ax1.set_title("Training Set", fontsize=18)
ax2.set_title('Test Set', fontsize=18)
fig.show()

# --- Exploded pie chart -----------------------------------------------------
# `data` is a DataFrame defined outside this excerpt.
churn_value = data['cvr_group_high'].value_counts()
plt.figure(figsize=(7, 7))
plt.pie(
    churn_value,
    labels=['一般客戶', '高價值客戶'],
    colors=["#75bbfd", "#00ffff"],
    explode=(0.05, 0),  # pull the first wedge slightly out of the pie
    autopct='%1.1f%%',
    shadow=False,
)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.title("高價值客戶占比23.4%")
# Optionally persist the figure: plt.savefig('pie.png', dpi=300)

# --- Correlation-coefficient heat map ---------------------------------------
corr = data.corr()  # computed once and reused below
# `np.bool` was removed from NumPy; the builtin `bool` is the equivalent dtype.
mask = np.zeros_like(corr, dtype=bool)
# To hide the upper triangle, enable the next line and pass `mask=mask` below:
# mask[np.triu_indices_from(mask)] = True
plt.subplots(figsize=(15, 12))
sns.heatmap(
    corr,
    annot=True,
    cmap='RdBu',  # use "RdBu_r" to reverse the colour bar
    linewidths=.9,
    linecolor='gray',
    fmt='.2g',
    center=0,
    square=True,
)
plt.title("Correlations Among Features", y=1.03, fontsize=20, pad=40)  # correlation matrix
plt.savefig('cor.png', dpi=300)
plt.show()

# --- Kernel density estimate -------------------------------------------------
fig = plt.figure(figsize=(15, 8))
# Two equivalent ways of selecting the rows to plot are shown below; pick
# whichever suits you.
# The two groups get distinct legend labels (both were 'high' before, which
# made the legend ambiguous).
ax = sns.kdeplot(
    data.client[data.cvr_group_high == 0],
    color='gray', shade=True, label='low',
)
ax = sns.kdeplot(
    data.loc[(data['cvr_group_high'] == 1), 'client'],
    color='g', shade=True, label='high',
)
plt.title('client - low vs high', fontsize=25, pad=40)
plt.ylabel("Frequency of cvr", fontsize=15, labelpad=20)
plt.xlabel("Client", fontsize=15, labelpad=20)
# Convert the x ticks into words for better understanding.
labels = ['H5', 'android', 'ios', 'pc', 'wap']
plt.xticks(sorted(data.client.unique()), labels)
plt.legend()


# ===========================================================================
# Model training
# ===========================================================================

def plot_predictions(test, predicted):
    """Plot real values against predicted values on the current figure.

    Args:
        test: Sequence of real target values.
        predicted: Sequence of predicted values, same length as ``test``.
    """
    # x positions are 1-based: 1 .. len(test).
    x = np.arange(0, len(test)) + 1
    plt.plot(x, test, label='Real')
    plt.plot(x, predicted, color='darkOrange', linestyle='--', label='Predicted')
    plt.ylabel('count')
    plt.legend()


def mse_loss(y_true, y_pred):
    """Return half of the mean squared error between two arrays.

    The sum of squared differences is divided by the number of samples
    (``y_true.shape[0]``) and then by 2.

    Args:
        y_true: Array-like with a ``shape`` attribute holding the real values.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.
    """
    return np.sum(np.power(y_true - y_pred, 2)) / y_true.shape[0] / 2


def return_rmse(test, predicted):
    """Return the root mean squared error between ``test`` and ``predicted``.

    Computed directly from the squared differences rather than from
    ``mse_loss``, because ``mse_loss`` returns *half* the MSE and its square
    root would understate the RMSE by a factor of sqrt(2).

    Args:
        test: Array-like with a ``shape`` attribute holding the real values.
        predicted: Array-like of predictions, broadcastable against ``test``.
    """
    squared_error = np.power(test - predicted, 2)
    return math.sqrt(np.sum(squared_error) / test.shape[0])


# (display name, estimator) pairs that are trained and compared below.
Classifiers = [
    ["Random Forest", RandomForestClassifier()],
    ["Support Vector Machine", SVC()],
    ["LogisticRegression", LogisticRegression()],
    ["KNN", KNeighborsClassifier(n_neighbors=5)],
    ["Naive Bayes", GaussianNB()],
    ["Decision Tree", DecisionTreeClassifier()],
    ["AdaBoostClassifier", AdaBoostClassifier()],
    ["GradientBoostingClassifier", GradientBoostingClassifier()],
    ["XGB", XGBClassifier()],
]

# --- Build the training set --------------------------------------------------
# Drop the two listed non-feature columns and the target column; the target
# ('購買意愿') becomes y.
X = train.drop(['目標客戶編號', '品牌類型', '購買意愿'], axis=1)
t = X  # reference to the feature frame before the float conversion
headers = X.columns
X = X.astype(float)
y = train["購買意愿"]

# --- Train and evaluate every classifier -------------------------------------
# NOTE(review): X_train, X_test, y_train and y_test are not defined anywhere in
# this excerpt -- presumably a train/test split of X and y; confirm.
Classify_result = []
names = []
prediction = []
for name, classifier in Classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    recall = recall_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    f1score = f1_score(y_test, y_pred, average='macro')
    rmse = return_rmse(y_test, y_pred)

    # One single-column frame per classifier: recall, precision, F1, RMSE.
    class_eva = pd.DataFrame([recall, precision, f1score, rmse])
    Classify_result.append(class_eva)
    names.append(pd.Series(name))

    y_pred = pd.Series(y_pred)
    prediction.append(y_pred)

    # NOTE(review): one figure per classifier is assumed; the collapsed source
    # does not show whether plt.show() belonged inside the loop -- confirm.
    plot_predictions(y_test, y_pred)
    # Optionally persist the figure: plt.savefig('seven1.png', dpi=300)
    plt.show()

推薦閱讀