pandas分析各国家交易情况
生活随笔
收集整理的這篇文章主要介紹了
pandas分析各国家交易情况
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
數據集來源: https://archive.ics.uci.edu/ml/datasets/Online%20Retail
# coding: utf-8
# Per-country analysis of the UCI "Online Retail" dataset: clean the raw
# spreadsheet once, cache the result as CSV, then plot customer counts,
# total transaction value and monthly transaction trends by country.
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

RAW_DATA_FILE = './data/online_retail.xlsx'
CLN_DATA_FILE = './output/cln_online_retail.csv'

# Identifier columns must stay strings on every load path: InvoiceNo carries
# a 'C' prefix for cancelled invoices, and letting pandas infer numeric types
# would make the cached CSV behave differently from the Excel source.
ID_COLUMN_DTYPES = {'InvoiceNo': str, 'StockCode': str, 'CustomerID': str}


def _ensure_parent_dir(file_path):
    """Create the directory that will contain *file_path* if it is missing."""
    parent = os.path.dirname(file_path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def inspect_data(data_df):
    """Print basic information about the dataset.

    Args:
        data_df: DataFrame to inspect.
    """
    print('數據集基本信息')
    print(data_df.info())

    print('\n數據集統計信息')
    print(data_df.describe())

    print('\n數據集預覽')
    print(data_df.head())


def clean_data(data_df):
    """Drop rows with missing values, then drop duplicate rows.

    The cleaned frame is also written to ``CLN_DATA_FILE``.

    Args:
        data_df: raw DataFrame.

    Returns:
        The cleaned DataFrame.
    """
    n_total = data_df.shape[0]

    # Rows that survive after removing records with any missing value.
    non_empty_data_df = data_df.dropna()
    n_empty = n_total - non_empty_data_df.shape[0]

    # Rows that survive after removing exact duplicates.
    cln_data_df = non_empty_data_df.drop_duplicates()
    # Count duplicates against the non-empty frame; measuring against the raw
    # frame would count the dropped empty rows a second time.
    n_duplicates = non_empty_data_df.shape[0] - cln_data_df.shape[0]

    print('原始數據共有{}條記錄,清洗后的數據共有{}條有效記錄。(其中空記錄有{}條,重復記錄有{}條。)'.format(
        n_total, cln_data_df.shape[0], n_empty, n_duplicates))

    # Persist the cleaned result so later runs can skip the cleaning step.
    _ensure_parent_dir(CLN_DATA_FILE)
    cln_data_df.to_csv(CLN_DATA_FILE, index=False, encoding='utf-8')

    return cln_data_df


def show_customer_stats(data_df):
    """Plot the number of distinct customers per country.

    Args:
        data_df: DataFrame with 'CustomerID' and 'Country' columns.
    """
    # Keep one row per customer, then count customers per country.
    customer_per_country = (
        data_df.drop_duplicates(['CustomerID'])['Country'].value_counts()
    )

    # 'United Kingdom' dominates the data, so only other countries are shown.
    is_not_uk = customer_per_country.index != 'United Kingdom'
    customer_per_country_df = customer_per_country[is_not_uk].to_frame().T

    sns.barplot(data=customer_per_country_df)
    # Rotate the x tick labels by 90 degrees so country names do not overlap.
    plt.xticks(rotation=90)
    plt.xlabel('Country')
    plt.ylabel('#Customers')
    plt.tight_layout()

    out_path = './output/customer_per_country.png'
    _ensure_parent_dir(out_path)
    plt.savefig(out_path)
    plt.show()


def show_total_cost_stats(data_df):
    """Plot the total transaction value per country.

    Args:
        data_df: DataFrame with 'InvoiceNo', 'Country', 'UnitPrice' and
            'Quantity' columns.
    """
    # Exclude cancelled transactions (InvoiceNo starting with 'C') and
    # 'United Kingdom'. astype(str) keeps the prefix test valid even when the
    # column was loaded with a non-string dtype.
    not_cancelled = ~data_df['InvoiceNo'].astype(str).str.startswith('C')
    not_uk = data_df['Country'] != 'United Kingdom'
    valid_data_df = data_df[not_cancelled & not_uk].copy()

    valid_data_df['TotalCost'] = (
        valid_data_df['UnitPrice'] * valid_data_df['Quantity']
    )
    cost_per_country = valid_data_df.groupby('Country')['TotalCost'].sum()

    cost_per_country.sort_values(ascending=False).plot(kind='bar')
    plt.ylabel('Total Cost')
    plt.tight_layout()

    out_path = './output/cost_per_country.png'
    _ensure_parent_dir(out_path)
    plt.savefig(out_path)
    plt.show()


def show_trend_by_country(data_df):
    """Plot monthly transaction-record counts for a fixed set of countries.

    Produces a stacked bar chart and a heat map.

    Args:
        data_df: DataFrame with 'Country', 'InvoiceDate' and 'StockCode'
            columns.
    """
    countries = ['Germany', 'France', 'Spain', 'Belgium', 'Switzerland']
    data_df = data_df[data_df['Country'].isin(countries)].copy()

    data_df['InvoiceDate'] = pd.to_datetime(data_df['InvoiceDate'])
    # Zero-padded 'YYYY-MM' key: sorts chronologically as text and parses back
    # with an explicit format instead of relying on format inference.
    data_df['InvoiceYearMonth'] = data_df['InvoiceDate'].dt.strftime('%Y-%m')

    month_country_count = (
        data_df.groupby(['InvoiceYearMonth', 'Country'])['StockCode'].count()
    )
    print(month_country_count.head())

    # Pivot countries into columns: one row per month, one column per country.
    month_country_count_df = month_country_count.unstack()
    print(month_country_count_df.head())

    # Use month-resolution periods on the axis.
    month_country_count_df.index = pd.to_datetime(
        month_country_count_df.index, format='%Y-%m').to_period('M')
    print(month_country_count_df.head())

    month_country_count_df = month_country_count_df.sort_index()

    # Stacked bar chart.
    month_country_count_df.plot(kind='bar', stacked=True, rot=45)
    plt.xlabel('Month')
    plt.ylabel('#Transaction')
    plt.tight_layout()
    bar_path = './output/country_trend_stacked_bar.png'
    _ensure_parent_dir(bar_path)
    plt.savefig(bar_path)
    plt.show()

    # Heat map (countries on the y axis, months on the x axis).
    sns.heatmap(month_country_count_df.T)
    plt.xlabel('Month')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    heatmap_path = './output/country_trend_heatmap.png'
    _ensure_parent_dir(heatmap_path)
    plt.savefig(heatmap_path)
    plt.show()


def main():
    """Load (or build) the cleaned dataset and run the three analyses."""
    if not os.path.exists(CLN_DATA_FILE):
        # No cleaned dataset cached yet: load the raw data and clean it.
        raw_data_df = pd.read_excel(RAW_DATA_FILE, dtype=ID_COLUMN_DTYPES)

        inspect_data(raw_data_df)

        cln_data_df = clean_data(raw_data_df)
    else:
        print('讀取已清洗的數據')
        # Same dtypes as the Excel path so both branches yield identical
        # column types.
        cln_data_df = pd.read_csv(CLN_DATA_FILE, dtype=ID_COLUMN_DTYPES)

    # 1. Compare the number of customers per country.
    show_customer_stats(cln_data_df)

    # 2. Compare the total transaction value per country.
    show_total_cost_stats(cln_data_df)

    # 3. Track the trend of transaction records per country.
    show_trend_by_country(cln_data_df)


if __name__ == '__main__':
    main()

# Page caption that followed the listing: "各國家的客戶數"
# (number of customers per country).
各國家的成交額
各國家交易記錄的趨勢
堆疊柱狀圖
熱力圖
?
總結
以上是生活随笔為你收集整理的pandas分析各国家交易情况的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 使用matlab程序,基于标准卡标定感压
- 下一篇: 图像拼接1 特征提取