Python數據分析:實用向

文件處理
導包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

添加鏡像
https://mirrors.tuna.tsinghua.edu.cn/
https://developer.aliyun.com/mirror/
http://mirrors.163.com/ubuntu/
https://mirrors.ustc.edu.cn/
http://mirrors.zju.edu.cn/
http://mirrors.sohu.com/
http://ftp.sjtu.edu.cn/
http://mirror.bjtu.edu.cn/
http://mirror.bjtu.edu.cn/
語法
其中 http/https 是可選的
! pip install xxx -i https://mirrors.tuna.tsinghua.edu.cn/
導入文件
excel
data = pd.read_excel(r"C:\Users\ranxi\Desktop\附錄1 目標客戶體驗數據.xlsx", sheet_name='data')
data.head()
csv
data = pd.read_csv()
EDA報告
#生成報告
import pandas_profiling
data.profile_report()
#輸出報告文件
pfr = pandas_profiling.ProfileReport(data)
pfr.to_file('report.html')
dataframe導出excel文件
data.to_excel('data.xlsx')
數據處理
數據篩選
分類均值展示
cvr_summary = data.groupby("cvr_group_high")
cvr_summary.mean().reset_index()
標簽編碼
print("client","--" ,data.client.unique())
from sklearn.preprocessing import LabelEncoder
data.client = LabelEncoder().fit_transform(data.client)
print("client","--" ,data.client.unique())
交叉比例表
pd.crosstab(data['invited_is'],data["cvr_group_high"],normalize=0)
計算分布比例
def percent_value_counts(df, feature):
    """This function takes in a dataframe and a column and finds the percentage of the value_counts"""
    percent = pd.DataFrame(round(df.loc[:,feature].value_counts(dropna=False, normalize=True)*100,2))
    ## creating a df with th
    total = pd.DataFrame(df.loc[:,feature].value_counts(dropna=False))
    ## concating percent and total dataframe
    total.columns = ["Total"]
    percent.columns = ['Percent']
    return pd.concat([total, percent], axis = 1)
percent_value_counts(data, "B7")
多列apply函數
with_N['B7'] = with_N.apply(lambda x: child_estimator(x['B6'], x['B5']), axis=1)
卡方檢驗
#分組間確實是有顯著性差異 , 頻數比較的結論才有可信度,故需進行“卡方檢驗”
from scipy.stats import chi2_contingency
#統計分析 卡方檢驗
#自定義卡方檢驗函數
def KF(x):
    df1=pd.crosstab(data2['購買意愿'],data2[x])
    li1=list(df1.iloc[0,:])
    li2=list(df1.iloc[1,:])
    kf_data=np.array([li1,li2])
    kf=chi2_contingency(kf_data)
    if kf[1]<0.05:
        print('購買意愿 by {} 的卡方臨界值是{:.2f},小于0.05,表明{}組間有顯著性差異,可進行【交叉分析】'.format(x,kf[1],x),'\n')
    else:
        print('購買意愿 by {} 的卡方臨界值是{:.2f} , 大于0.05,表明{}組間無顯著性差異,不可進行交叉分析'.format(x,kf[1],x),'\n')
#對 kf_var進行卡方檢驗
print('kf_var的卡方檢驗結果如下:','\n')
print(list(map(KF, kf_var)))
條件篩選
specific=data[(data['a1']>100)|(data['a2']>100)|(data['a3']>100)|(data['a4']>100)|(data['a5']>100)|(data['a6']>100)|(data['a7']>100)|(data['a8']>100)]
specific
specific=data[(data['']>x)|&()]
data[data.Cabin=='N']
map函數分組
def hour_group_fun(hour):
    x = ''
    if 0<=hour<8:
        x=1
    elif 8<=hour<16:
        x=2
    else:
        x=3
    return x
## Applying function to the column.
police['hour_group'] =police['hour'].map(hour_group_fun)
apply多列賦值
with_N['B7'] = with_N.apply(lambda x: child_estimator(x['B6'], x['B5']), axis=1)
這是一個分布比例函數
def percent_value_counts(df, feature):
    """This function takes in a dataframe and a column and finds the percentage of the value_counts"""
    percent = pd.DataFrame(round(df.loc[:,feature].value_counts(dropna=False, normalize=True)*100,2))
    ## creating a df with th
    total = pd.DataFrame(df.loc[:,feature].value_counts(dropna=False))
    ## concating percent and total dataframe
    total.columns = ["Total"]
    percent.columns = ['Percent']
    return pd.concat([total, percent], axis = 1)
特征工程
時間數據處理
police['date'] = pd.to_datetime(police['接警日期'],errors='coerce')
police['year'] =police['date'].dt.year.fillna(0).astype("int")#轉化提取年
police['month'] = police['date'].dt.month.fillna(0).astype("int")#轉化提取月
police['day'] = police['date'].dt.day.fillna(0).astype("int")#轉化提取天
police['dates'] = police['month'].map(str) + '-' + police['day'].map(str) #轉化獲取月-日
police['time'] = pd.to_datetime(police['接警時間點'],errors='coerce').dt.time
police['hour'] = pd.to_datetime(police['接警時間點'],errors='coerce').dt.hour.fillna(0).astype("int")#轉化提取小時
SMOTE過抽樣
from imblearn.over_sampling import SMOTE
model_smote=SMOTE()
X,y=model_smote.fit_resample(X,y)
X=pd.DataFrame(X,columns=t.columns)
#分拆數據集:訓練集 和 測試集
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
print('過抽樣數據特征:', X.shape,'訓練數據特征:',X_train.shape,'測試數據特征:',X_test.shape)
print('過抽樣后數據標簽:', y.shape,'訓練數據標簽:',y_train.shape,'測試數據標簽:',y_test.shape)

推薦閱讀