pandas分析各国家交易情况
生活随笔
收集整理的這篇文章主要介紹了
pandas分析各国家交易情况
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
數據集來源: https://archive.ics.uci.edu/ml/datasets/Online%20Retail
# coding: utf-8
"""Analyse per-country transactions of the UCI "Online Retail" data set.

The script cleans the raw Excel export (dropping incomplete and duplicated
rows), caches the cleaned data as CSV, and then draws three charts:
customers per country, revenue per country, and the monthly transaction
trend for a handful of countries.
"""

import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

RAW_DATA_FILE = './data/online_retail.xlsx'
CLN_DATA_FILE = './output/cln_online_retail.csv'

# Identifier columns are read as strings from BOTH the raw Excel file and the
# cached CSV, so that e.g. ``InvoiceNo.str.startswith('C')`` behaves the same
# no matter which of the two files the data came from.
ID_COLUMN_DTYPES = {'InvoiceNo': str, 'StockCode': str, 'CustomerID': str}


def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it is missing."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _save_and_show(fig, path):
    """Tighten the layout of *fig*, save it to *path*, display it, close it."""
    fig.tight_layout()
    _ensure_parent_dir(path)
    fig.savefig(path)
    plt.show()
    # With a non-blocking backend ``plt.show()`` returns immediately and the
    # figure stays open; close it so the next chart starts from a clean slate.
    plt.close(fig)


def inspect_data(data_df):
    """Print basic information about the data set.

    Args:
        data_df: DataFrame to inspect.
    """
    print('數據集基本信息')
    # ``info()`` writes to stdout itself and returns None, so it must not be
    # wrapped in ``print`` (that would emit a stray "None").
    data_df.info()

    print('\n數據集統計信息')
    print(data_df.describe())

    print('\n數據集預覽')
    print(data_df.head())


def clean_data(data_df):
    """Drop rows with missing values and duplicated rows, then cache the result.

    The cleaned frame is written to ``CLN_DATA_FILE`` as UTF-8 CSV.

    Args:
        data_df: raw DataFrame.

    Returns:
        The cleaned DataFrame.
    """
    # Step 1: drop rows that contain any missing value.
    non_empty_data_df = data_df.dropna()
    n_empty = len(data_df) - len(non_empty_data_df)

    # Step 2: drop duplicated rows. The duplicate count is measured against
    # the frame from step 1, otherwise the empty rows would be counted twice.
    cln_data_df = non_empty_data_df.drop_duplicates()
    n_duplicates = len(non_empty_data_df) - len(cln_data_df)

    print('原始數據共有{}條記錄,清洗后的數據共有{}條有效記錄。(其中空記錄有{}條,重復記錄有{}條。)'.format(
        len(data_df), len(cln_data_df), n_empty, n_duplicates))

    # Persist the cleaned data so later runs can skip the cleaning step.
    _ensure_parent_dir(CLN_DATA_FILE)
    cln_data_df.to_csv(CLN_DATA_FILE, index=False, encoding='utf-8')

    return cln_data_df


def show_customer_stats(data_df):
    """Plot the number of distinct customers per country.

    Args:
        data_df: cleaned DataFrame with ``CustomerID`` and ``Country`` columns.
    """
    # One row per customer, then count how many customers each country has.
    customer_per_country = (
        data_df.drop_duplicates(['CustomerID'])['Country'].value_counts()
    )

    # 'United Kingdom' dwarfs every other country, so it is left out to keep
    # the remaining bars readable. Transposing yields one column per country,
    # which seaborn treats as wide-form data (one bar per column).
    is_not_uk = customer_per_country.index != 'United Kingdom'
    customer_per_country_df = customer_per_country[is_not_uk].to_frame().T

    fig, ax = plt.subplots()
    sns.barplot(data=customer_per_country_df, ax=ax)
    # Country names are long; rotate them so they do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=90)
    ax.set_xlabel('Country')
    ax.set_ylabel('#Customers')
    _save_and_show(fig, './output/customer_per_country.png')


def show_total_cost_stats(data_df):
    """Plot the total transaction amount per country.

    Args:
        data_df: cleaned DataFrame with ``InvoiceNo``, ``Country``,
            ``UnitPrice`` and ``Quantity`` columns.
    """
    # Invoice numbers starting with 'C' are filtered out as cancelled
    # transactions, and 'United Kingdom' is excluded as in the customer chart.
    # ``astype(str)`` keeps the filter working even if the column was parsed
    # as numbers.
    is_cancelled = data_df['InvoiceNo'].astype(str).str.startswith('C')
    is_uk = data_df['Country'] == 'United Kingdom'
    valid_data_df = data_df[~is_cancelled & ~is_uk].copy()

    valid_data_df['TotalCost'] = (
        valid_data_df['UnitPrice'] * valid_data_df['Quantity']
    )
    cost_per_country = valid_data_df.groupby('Country')['TotalCost'].sum()

    fig, ax = plt.subplots()
    cost_per_country.sort_values(ascending=False).plot(kind='bar', ax=ax)
    ax.set_ylabel('Total Cost')
    _save_and_show(fig, './output/cost_per_country.png')


def show_trend_by_country(data_df):
    """Plot the monthly number of transaction records for selected countries.

    Two charts are produced: a stacked bar chart and a heat map.

    Args:
        data_df: cleaned DataFrame with ``Country``, ``InvoiceDate`` and
            ``StockCode`` columns.
    """
    countries = ['Germany', 'France', 'Spain', 'Belgium', 'Switzerland']
    data_df = data_df[data_df['Country'].isin(countries)].copy()

    # Truncate every invoice timestamp to its calendar month.
    data_df['InvoiceDate'] = pd.to_datetime(data_df['InvoiceDate'])
    data_df['InvoiceYearMonth'] = data_df['InvoiceDate'].dt.to_period('M')

    month_country_count = (
        data_df.groupby(['InvoiceYearMonth', 'Country'])['StockCode'].count()
    )
    print(month_country_count.head())

    # Pivot the countries into columns: rows are months, columns countries.
    month_country_count_df = month_country_count.unstack().sort_index()
    print(month_country_count_df.head())

    # Stacked bar chart.
    fig, ax = plt.subplots()
    month_country_count_df.plot(kind='bar', stacked=True, rot=45, ax=ax)
    ax.set_xlabel('Month')
    ax.set_ylabel('#Transaction')
    _save_and_show(fig, './output/country_trend_stacked_bar.png')

    # Heat map (countries on the y axis, months on the x axis).
    fig, ax = plt.subplots()
    sns.heatmap(month_country_count_df.T, ax=ax)
    ax.set_xlabel('Month')
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.setp(ax.get_yticklabels(), rotation=0)
    _save_and_show(fig, './output/country_trend_heatmap.png')


def main():
    """Clean the data (or load the cached copy) and run the three analyses."""
    if not os.path.exists(CLN_DATA_FILE):
        # No cached copy yet: load the raw Excel file and clean it.
        raw_data_df = pd.read_excel(RAW_DATA_FILE, dtype=ID_COLUMN_DTYPES)

        inspect_data(raw_data_df)
        cln_data_df = clean_data(raw_data_df)
    else:
        print('讀取已清洗的數據')
        # Use the same dtypes as the Excel path so both branches hand an
        # identically typed frame to the analysis functions.
        cln_data_df = pd.read_csv(CLN_DATA_FILE, dtype=ID_COLUMN_DTYPES)

    # 1. Number of customers per country.
    show_customer_stats(cln_data_df)

    # 2. Transaction amount per country.
    show_total_cost_stats(cln_data_df)

    # 3. Transaction trend per country.
    show_trend_by_country(cln_data_df)


if __name__ == '__main__':
    main()

# Figure caption from the article: 各國家的客戶數 (number of customers per country)
各國家的成交額
各國家交易記錄的趨勢
堆疊柱狀圖
熱力圖
?
總結
以上是生活随笔為你收集整理的pandas分析各国家交易情况的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 使用matlab程序,基于标准卡标定感压
- 下一篇: 图像拼接1 特征提取