Python數據分析：實用向 _生活百科

文件處理

導包

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

添加鏡像

https://mirrors.tuna.tsinghua.edu.cn/
https://developer.aliyun.com/mirror/
http://mirrors.163.com/ubuntu/
https://mirrors.ustc.edu.cn/
http://mirrors.zju.edu.cn/
http://mirrors.sohu.com/
http://ftp.sjtu.edu.cn/
http://mirror.bjtu.edu.cn/
http://mirror.bjtu.edu.cn/

語法

其中http和https是可選的
! pip install xxx -i https://mirrors.tuna.tsinghua.edu.cn/

導入文件

excel

# Load the sheet named 'data' from the Excel workbook into a DataFrame.
data = pd.read_excel(r"C:\Users\ranxi\Desktop\附錄1 目標客戶體驗數據.xlsx", sheet_name='data')
# Preview the first rows (shown when this is the last expression of a notebook cell).
data.head()

csv

data = pd.read_csv()

EDA報告

# Generate the profiling report.
import pandas_profiling

data.profile_report()

# Write the report to an HTML file.
pfr = pandas_profiling.ProfileReport(data)
pfr.to_file('report.html')

dataframe導出excel文件

data.to_excel('data.xlsx')

數據處理

數據篩選

分類均值展示

cvr_summary = data.groupby("cvr_group_high")
cvr_summary.mean().reset_index()

標簽編碼

from sklearn.preprocessing import LabelEncoder

# Show the distinct client values before encoding.
print("client", "--", data.client.unique())
# Replace each distinct client value with an integer label.
data.client = LabelEncoder().fit_transform(data.client)
# Show the distinct values again, now encoded.
print("client", "--", data.client.unique())

交叉比例表

pd.crosstab(data['invited_is'],data["cvr_group_high"],normalize=0)

計算分布比例

def percent_value_counts(df, feature):
    """Return the count and percentage share of each value in a column.

    Args:
        df: DataFrame that contains the column.
        feature: Label of the column to summarise.

    Returns:
        A DataFrame indexed by the distinct values of ``df[feature]``
        (missing values included) with two columns: ``Total`` (number of
        occurrences) and ``Percent`` (share of all rows in percent, rounded
        to two decimals).
    """
    column = df.loc[:, feature]
    # dropna=False keeps missing values as a category of their own.
    percent = pd.DataFrame(
        round(column.value_counts(dropna=False, normalize=True) * 100, 2)
    )
    # Build a frame with the raw counts.
    total = pd.DataFrame(column.value_counts(dropna=False))
    # Name the columns, then place counts and percentages side by side.
    total.columns = ["Total"]
    percent.columns = ['Percent']
    return pd.concat([total, percent], axis=1)


percent_value_counts(data, "B7")

多列apply函數

with_N['B7'] = with_N.apply(lambda x: child_estimator(x['B6'], x['B5']), axis=1)

卡方檢驗

# Frequency comparisons are only trustworthy when the groups really differ
# significantly, so a chi-square test is run first.
from scipy.stats import chi2_contingency


# Statistical analysis: custom chi-square test function.
def KF(x):
    """Run a chi-square test of 購買意愿 against column ``x`` and print the verdict.

    Args:
        x: Label of the column in ``data2`` to cross-tabulate with
            ``data2['購買意愿']``.

    Returns:
        None. The outcome is printed.
    """
    df1 = pd.crosstab(data2['購買意愿'], data2[x])
    # Only the first two rows of the contingency table are tested.
    li1 = list(df1.iloc[0, :])
    li2 = list(df1.iloc[1, :])
    kf_data = np.array([li1, li2])
    kf = chi2_contingency(kf_data)
    # NOTE(review): kf[1] is the p-value returned by chi2_contingency, although
    # the printed messages label it as the chi-square critical value.
    if kf[1] < 0.05:
        print('購買意愿 by {} 的卡方臨界值是{:.2f}，小于0.05，表明{}組間有顯著性差異,可進行【交叉分析】'.format(x, kf[1], x), '\n')
    else:
        print('購買意愿 by {} 的卡方臨界值是{:.2f} ， 大于0.05，表明{}組間無顯著性差異,不可進行交叉分析'.format(x, kf[1], x), '\n')


# Run the chi-square test for every variable in kf_var.
print('kf_var的卡方檢驗結果如下:', '\n')
# KF prints its own result and returns None, so call it in a plain loop
# instead of printing a list of None values.
for var in kf_var:
    KF(var)

條件篩選

# Keep the rows where at least one of the columns a1..a8 exceeds 100.
specific = data[
    (data[['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8']] > 100).any(axis=1)
]
specific

specific=data[(data['']>x)|&()]

data[data.Cabin=='N']

map函數分組

def hour_group_fun(hour):
    """Map an hour value to one of three groups.

    Args:
        hour: Hour value to classify.

    Returns:
        1 when ``0 <= hour < 8``, 2 when ``8 <= hour < 16``, and 3 for any
        other value.
    """
    if 0 <= hour < 8:
        return 1
    if 8 <= hour < 16:
        return 2
    return 3


# Apply the function to the column.
police['hour_group'] = police['hour'].map(hour_group_fun)

apply多列賦值

with_N['B7'] = with_N.apply(lambda x: child_estimator(x['B6'], x['B5']), axis=1)

這是一個分布比例函數

def percent_value_counts(df, feature):
    """Return the count and percentage share of each value in a column.

    Args:
        df: DataFrame that contains the column.
        feature: Label of the column to summarise.

    Returns:
        A DataFrame indexed by the distinct values of ``df[feature]``
        (missing values included) with two columns: ``Total`` (number of
        occurrences) and ``Percent`` (share of all rows in percent, rounded
        to two decimals).
    """
    column = df.loc[:, feature]
    # dropna=False keeps missing values as a category of their own.
    percent = pd.DataFrame(
        round(column.value_counts(dropna=False, normalize=True) * 100, 2)
    )
    # Build a frame with the raw counts.
    total = pd.DataFrame(column.value_counts(dropna=False))
    # Name the columns, then place counts and percentages side by side.
    total.columns = ["Total"]
    percent.columns = ['Percent']
    return pd.concat([total, percent], axis=1)

特征工程

時間數據處理

# Parse the report date; values that cannot be parsed become NaT.
police['date'] = pd.to_datetime(police['接警日期'], errors='coerce')
# Extract year, month and day as integers; missing dates become 0.
police['year'] = police['date'].dt.year.fillna(0).astype("int")
police['month'] = police['date'].dt.month.fillna(0).astype("int")
police['day'] = police['date'].dt.day.fillna(0).astype("int")
# Build a "month-day" string.
police['dates'] = police['month'].map(str) + '-' + police['day'].map(str)
# Parse the report time once and reuse it for both derived columns.
report_time = pd.to_datetime(police['接警時間點'], errors='coerce')
police['time'] = report_time.dt.time
# Extract the hour as an integer; missing times become 0.
police['hour'] = report_time.dt.hour.fillna(0).astype("int")

SMOTE過抽樣

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Oversample with SMOTE.
model_smote = SMOTE()
X, y = model_smote.fit_resample(X, y)
# NOTE(review): `t` is not defined in this snippet; presumably the original
# feature DataFrame whose column labels are reused here. Confirm.
X = pd.DataFrame(X, columns=t.columns)
# Split the data set into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print('過抽樣數據特征：', X.shape, '訓練數據特征：', X_train.shape, '測試數據特征：', X_test.shape)
print('過抽樣后數據標簽：', y.shape, '訓練數據標簽：', y_train.shape, '測試數據標簽：', y_test.shape)
上一頁
1
2
3
4
下一頁
		  	

    
    




    
    
    


推薦閱讀

           
                  
              
                  飛機托運酒水規定2022 國內飛機能帶酒嗎？ 
                
                   
                
              
            

                  
              
                  光遇12月3日復刻旅行先祖兌換物品要多少蠟燭,高手進階 
                
                   
                
              
            

                  
              
                  今日查詢價格3011元，今日查詢價格3011元寶？ 
                
                   
                
              
            

                  
              
                  淡豆豉煮水喝有什么作用與功效 淡豆豉煮水喝有什么作用 
                
                   
                
              
            

                  
              
                  交通事故索賠需要準備哪些誤工費證據 
                
                   
                
              
            

                  
              
                  vivox21耗電快怎么辦 vivox21耗電快怎么解決 
                
                   
                
              
            

                  
              
                  報考的c1考了科目一可以改成c2嗎 報的是c1,考了科一,想換成c2可以不 
                
                   
                
              
            

                  
              
                  當一個女人不愛你了會有什么表現 
                
                   
                
              
            

                  
              
                  職場女強人的星座女巨蟹女 
                
                   
                
              
            

                  
              
                  平凡的世界讀書心得初二優秀作文700字 
                
                   
                
              
            

                  
              
                  2021年正月初六出生的寶寶名字如何取,簡約大氣旺生肖 
                
                   
                
              
            

                  
              
                  2006年屬狗的是什么命五行屬什么 
                
                   
                
              
            

                  
              
                  關于閃長巖簡述 閃長巖 
                
                   
                
              
            

                  
              
                  李廣的生平事跡 李廣人物介紹 
                
                   
                
              
            

                  
              
                  造夢西游4手機版龍幣怎么得到 
                
                   
                
              
            

                  
              
                  保衛蘿卜深海16攻略 保衛蘿卜深海攻略2關 
                
                   
                
              
            

                  
              
                  四個木念什么 四個木念什么呀 
                
                   
                
              
            

                  
              
                  知名女星李羲兒車禍縫54針！一臉血跡躺倒在地，車子翻轉掉落田溝 ... 
                
                   
                
              
            

                  
              
                  關于跑跑姜餅人簡述 跑跑姜餅人 
                
                   
                
              
            

                  
              
                  宋媽后來怎么樣了 宋媽后來怎樣了 
                
                   
                
              
            

          

python ROS2時間同步 

圖文 Python 嵌入式打包 

其三 Gitea 1.18 功能前瞻：增強文本預覽效果、繼續擴展軟件包注冊中心、增強工單實用功能、完善了用戶邀請機制和SEO 

3 Python全棧工程師之從網頁搭建入門到Flask全棧項目實戰 - 入門Flask微框架 

跟我學Python圖像處理丨圖像特效處理：毛玻璃、浮雕和油漆特效 

四 Selenium4.0+Python3系列 - 常見元素操作（含鼠標鍵盤事件） 

二、python基本數據類型 

python3使用mutagen進行音頻元數據處理 

Python 根據兩個字段排序 中文排序 漢字排序 升序 降序 

錘子手機怎么樣，功能都實用嗎（錘子手機怎么進入工程模式)