【算法竞赛学习】资金流入流出预测-挑战Baseline_数据探索与分析1
生活随笔
收集整理的這篇文章主要介紹了
【算法竞赛学习】资金流入流出预测-挑战Baseline_数据探索与分析1
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
賽題簡介
螞蟻金服擁有上億會員并且業務場景中每天都涉及大量的資金流入和流出,面對如此龐大的用戶群,資金管理壓力會非常大。在既保證資金流動性風險最小,又滿足日常業務運轉的情況下,精準地預測資金的流入流出情況變得尤為重要。此屆大賽以《資金流入流出預測》為題,期望參賽者能夠通過對例如余額寶用戶的申購贖回數據的把握,精準預測未來每日的資金流入流出情況。對貨幣基金而言,資金流入意味著申購行為,資金流出為贖回行為。
賽題與數據
競賽中使用的數據主要包含四個部分,分別為用戶基本信息數據、用戶申購贖回數據、收益率表和銀行間拆借利率表。數據說明見:https://tianchi.aliyun.com/competition/entrance/231573/information
官方Baseline
數據探索與分析
import datetime
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Silence library warnings so the exploratory output stays readable.
warnings.filterwarnings('ignore')

# Directory that holds the competition CSV files.
dataset_path = 'Data/'

# Load the user balance table.
data_balance = pd.read_csv(dataset_path + 'user_balance_table.csv')

# Parse report_date (format %Y%m%d) and derive calendar columns from it.
data_balance['date'] = pd.to_datetime(data_balance['report_date'], format="%Y%m%d")
data_balance['day'] = data_balance['date'].dt.day
data_balance['month'] = data_balance['date'].dt.month
data_balance['year'] = data_balance['date'].dt.year
# Series.dt.week was removed in pandas 2.0; isocalendar().week gives the same
# ISO week number. Cast to a plain int so later arithmetic/merges behave as before.
data_balance['week'] = data_balance['date'].dt.isocalendar().week.astype(int)
data_balance['weekday'] = data_balance['date'].dt.weekday

# ---- Section: time series analysis ----
def _month_rows(frame, year, month):
    """Return the rows of ``frame`` whose 'date' lies in the given calendar month."""
    dates = frame['date']
    return frame[(dates.dt.year == year) & (dates.dt.month == month)]


def _plot_purchase_redeem(frame):
    """Plot purchase and redeem totals of ``frame`` on the current axes, with a legend."""
    plt.plot(frame['date'], frame['total_purchase_amt'], label='purchase')
    plt.plot(frame['date'], frame['total_redeem_amt'], label='redeem')
    plt.legend()


# Aggregate to one row per date: the summed purchase and redeem amounts.
# Several columns must be selected from a GroupBy with a list; the bare-tuple
# form used previously was removed in pandas 2.0.
total_balance = (
    data_balance.groupby(['date'])[['total_purchase_amt', 'total_redeem_amt']]
    .sum()
    .reset_index()
)

# Placeholder rows (NaN targets) for the prediction period 2014-09-01..2014-09-30.
# Per the original note: about 30k users were sampled, and some of them first
# appear in September 2014, i.e. only in the test period.
testdata = pd.DataFrame(
    {
        'date': pd.date_range('2014-09-01', '2014-09-30', freq='D'),
        'total_purchase_amt': np.nan,
        'total_redeem_amt': np.nan,
    }
)

# Append the placeholder rows below the observed days.
total_balance = pd.concat([total_balance, testdata], axis=0)

# Calendar columns for the daily table.
total_balance['day'] = total_balance['date'].dt.day
total_balance['month'] = total_balance['date'].dt.month
total_balance['year'] = total_balance['date'].dt.year
# Series.dt.week was removed in pandas 2.0; this is the same ISO week number.
total_balance['week'] = total_balance['date'].dt.isocalendar().week.astype(int)
total_balance['weekday'] = total_balance['date'].dt.weekday

# Daily purchase/redeem totals over the whole period.
fig = plt.figure(figsize=(20, 6))
plt.plot(total_balance['date'], total_balance['total_purchase_amt'], label='purchase')
plt.plot(total_balance['date'], total_balance['total_redeem_amt'], label='redeem')
plt.legend(loc='best')
plt.title("The lineplot of total amount of Purchase and Redeem from July.13 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# Same series restricted to 2014-04-01 onwards.
total_balance_1 = total_balance[total_balance['date'] >= datetime.datetime(2014, 4, 1)]
fig = plt.figure(figsize=(20, 6))
# The lines are labelled so that the legend is not empty.
_plot_purchase_redeem(total_balance_1)
plt.title("The lineplot of total amount of Purchase and Redeem from April.14 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# One panel per month. total_balance_2 has no upper bound, so it also holds the
# September placeholder rows (NaN, hence not drawn).
total_balance_2 = total_balance[total_balance['date'] >= datetime.datetime(2014, 8, 1)]
total_balance_3 = _month_rows(total_balance, 2014, 7)
total_balance_4 = _month_rows(total_balance, 2014, 6)
total_balance_5 = _month_rows(total_balance, 2014, 5)

fig = plt.figure(figsize=(15, 15))
monthly_frames = [total_balance_2, total_balance_3, total_balance_4, total_balance_5]
for panel, month_frame in enumerate(monthly_frames, start=1):
    plt.subplot(4, 1, panel)
    if panel == 1:
        plt.title("The time series of total amount of Purchase and Redeem for August, July, June, May respectively")
    _plot_purchase_redeem(month_frame)
# Axis labels go on the last (bottom) panel only.
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# August and September 2013, one panel each.
total_balance_last8 = _month_rows(total_balance, 2013, 8)
total_balance_last9 = _month_rows(total_balance, 2013, 9)

fig = plt.figure(figsize=(15, 9))
plt.subplot(2, 1, 1)
_plot_purchase_redeem(total_balance_last8)
plt.subplot(2, 1, 2)
_plot_purchase_redeem(total_balance_last9)
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# ---- Section: weekday feature analysis ----
from sklearn.preprocessing import OneHotEncoder

# Per-weekday distribution (violin) next to the overall distribution of the
# daily totals from April 2014 onwards.
# The scatter_kws / line_kws arguments previously passed to violinplot belong
# to regplot; violinplot forwards unknown keyword arguments to matplotlib, where
# they fail, so they are no longer passed.
plt.figure(figsize=(10, 10))
plt.subplot(2, 2, 1)
plt.title('The distrubution of total purchase')
sns.violinplot(x='weekday', y='total_purchase_amt', data=total_balance_1)
plt.subplot(2, 2, 2)
plt.title('The distrubution of total purchase')
sns.distplot(total_balance_1['total_purchase_amt'].dropna())
plt.subplot(2, 2, 3)
plt.title('The distrubution of total redeem')
sns.violinplot(x='weekday', y='total_redeem_amt', data=total_balance_1)
plt.subplot(2, 2, 4)
plt.title('The distrubution of total redeem')
sns.distplot(total_balance_1['total_redeem_amt'].dropna())

# Mean (not median) purchase/redeem amount per weekday.
week_sta = (
    total_balance_1[['total_purchase_amt', 'total_redeem_amt', 'weekday']]
    .groupby('weekday', as_index=False)
    .mean()
)

# Bar charts of the per-weekday means.
plt.figure(figsize=(12, 5))
ax = plt.subplot(1, 2, 1)
plt.title('The barplot of average total purchase with each weekday')
ax = sns.barplot(x="weekday", y="total_purchase_amt", data=week_sta, label='Purchase')
ax.legend()
ax = plt.subplot(1, 2, 2)
plt.title('The barplot of average total redeem with each weekday')
ax = sns.barplot(x="weekday", y="total_redeem_amt", data=week_sta, label='Redeem')
ax.legend()

# Box plots of the daily totals per weekday.
plt.figure(figsize=(12, 5))
ax = plt.subplot(1, 2, 1)
plt.title('The boxplot of total purchase with each weekday')
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=total_balance_1)
ax = plt.subplot(1, 2, 2)
plt.title('The boxplot of total redeem with each weekday')
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=total_balance_1)

# One-hot encode the weekday.
encoder = OneHotEncoder()
# total_balance carries a duplicated index after the earlier concat; a fresh
# 0..n-1 index is required for the column-wise concat below. drop=True avoids
# adding a meaningless 'index' column and makes re-running this step safe.
total_balance = total_balance.reset_index(drop=True)
week_feature = encoder.fit_transform(
    np.array(total_balance['weekday']).reshape(-1, 1)
).toarray()
onehot_columns = ['weekday_onehot' + str(i) for i in range(week_feature.shape[1])]
week_feature = pd.DataFrame(week_feature, columns=onehot_columns)

# Columns: the two targets, weekday_onehot0..weekday_onehotN, then the date.
feature = pd.concat(
    [
        total_balance[['total_purchase_amt', 'total_redeem_amt']],
        week_feature,
        total_balance[['date']],
    ],
    axis=1,
)

# Spearman correlation between each target and the one-hot weekday columns.
# The two axes are created up front: calling plt.subplot() on top of a figure
# made by plt.subplots() leaves an extra empty Axes behind on current matplotlib.
f, axes = plt.subplots(1, 2, figsize=(15, 8))
purchase_columns = [x for x in feature.columns if x not in ['total_redeem_amt', 'date']]
redeem_columns = [x for x in feature.columns if x not in ['total_purchase_amt', 'date']]
axes[0].set_title('The spearman coleration between total purchase and each weekday')
sns.heatmap(
    feature[purchase_columns].corr('spearman'),
    linewidths=0.1, vmax=0.2, vmin=-0.2, ax=axes[0],
)
axes[1].set_title('The spearman coleration between total redeem and each weekday')
sns.heatmap(
    feature[redeem_columns].corr('spearman'),
    linewidths=0.1, vmax=0.2, vmin=-0.2, ax=axes[1],
)

# ---- Section: monthly feature analysis ----
def _month_rows(frame, year, month):
    """Return the rows of ``frame`` whose 'date' lies in the given calendar month."""
    dates = frame['date']
    return frame[(dates.dt.year == year) & (dates.dt.month == month)]


def _kde_per_month(column, months):
    """Draw one KDE curve of ``column`` per (year, month) pair on the current axes."""
    for year, month in months:
        sns.kdeplot(
            _month_rows(total_balance, year, month)[column],
            label=str(year % 100) + 'Y,' + str(month) + 'M',
        )
    # Recent seaborn does not draw a legend for label= automatically.
    plt.legend()


# Months covered by the per-month density plots: July 2013 .. August 2014.
# December 2013 used to be skipped only because the old "first day of month i+1"
# bound cannot express month 13; selecting by year/month removes that gap.
all_months = [(2013, m) for m in range(7, 13)] + [(2014, m) for m in range(1, 9)]

# Kernel density estimate of the daily purchase total, one curve per month.
# A KDE is a non-parametric estimate of the underlying density and gives a quick
# visual impression of how each month's values are distributed.
plt.figure(figsize=(15, 10))
plt.title('The Probability Density of total purchase amount in Each Month')
plt.ylabel('Probability')
plt.xlabel('Amount')
_kde_per_month('total_purchase_amt', all_months)

# Same for the daily redeem total.
plt.figure(figsize=(15, 10))
plt.title('The Probability Density of total redeem amount in Each Month')
plt.ylabel('Probability')
plt.xlabel('Amount')
_kde_per_month('total_redeem_amt', all_months)

# May..August 2014. The slices previously covered April..July while the labels
# and title said May..August; the slices now match the labels.
total_balance_last_2 = _month_rows(total_balance, 2014, 8)
total_balance_last_3 = _month_rows(total_balance, 2014, 7)
total_balance_last_4 = _month_rows(total_balance, 2014, 6)
total_balance_last_5 = _month_rows(total_balance, 2014, 5)

plt.figure(figsize=(12, 10))
ax = plt.subplot(2, 1, 1)
plt.title('The Probability Density of total purchase and redeem amount from May.14 to August.14')
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_last_2['total_purchase_amt'], label='August')
ax = sns.kdeplot(total_balance_last_3['total_purchase_amt'], label='July')
ax = sns.kdeplot(total_balance_last_4['total_purchase_amt'], label='June')
ax = sns.kdeplot(total_balance_last_5['total_purchase_amt'], color='Black', label='May')
ax.legend()
ax = plt.subplot(2, 1, 2)
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_last_2['total_redeem_amt'], label='August')
ax = sns.kdeplot(total_balance_last_3['total_redeem_amt'], label='July')
ax = sns.kdeplot(total_balance_last_4['total_redeem_amt'], label='June')
ax = sns.kdeplot(total_balance_last_5['total_redeem_amt'], color='Black', label='May')
ax.legend()

# July..October 2013.
total_balance_last_7 = _month_rows(total_balance, 2013, 7)
total_balance_last_8 = _month_rows(total_balance, 2013, 8)
total_balance_last_9 = _month_rows(total_balance, 2013, 9)
total_balance_last_10 = _month_rows(total_balance, 2013, 10)

# NOTE(review): the title says Aug..Sep, but July is drawn in both panels and
# October only in the redeem panel -- confirm which months are intended.
plt.figure(figsize=(12, 10))
ax = plt.subplot(2, 1, 1)
plt.title('The Probability Density of total purchase and redeem amount from Aug.13 to Sep.13')
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_last_8['total_purchase_amt'], label='August')
ax = sns.kdeplot(total_balance_last_7['total_purchase_amt'], label='July')
ax = sns.kdeplot(total_balance_last_9['total_purchase_amt'], color='Red', label='September')
ax.legend()
ax = plt.subplot(2, 1, 2)
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_last_8['total_redeem_amt'], label='August')
ax = sns.kdeplot(total_balance_last_7['total_redeem_amt'], label='July')
ax = sns.kdeplot(total_balance_last_9['total_redeem_amt'], color='Red', label='September')
# This curve is the October 2013 slice; it used to be labelled 'Novermber'.
ax = sns.kdeplot(total_balance_last_10['total_redeem_amt'], color='Black', label='October')
ax.legend()

# ---- Section: day-of-month feature analysis ----
def _bar_with_line(frame, column, label, ax=None):
    """Draw ``column`` per 'day' as bars plus a line through the bar tops.

    ``frame`` must hold one row per day, ordered by day. Returns the axes used.
    """
    ax = sns.barplot(x="day", y=column, data=frame, label=label, ax=ax)
    # barplot puts its bars at categorical positions 0..n-1, so the line is drawn
    # against those positions; plotting it against the numeric day values (1..31)
    # shifts it one slot to the right of the bars.
    ax.plot(np.arange(len(frame)), frame[column].to_numpy(), label=label)
    return ax


# Mean purchase/redeem amount per day of month for the slice starting 2014-08-01.
day_sta = (
    total_balance_2[['total_purchase_amt', 'total_redeem_amt', 'day']]
    .groupby('day', as_index=False)
    .mean()
)

# Purchase per day of month. Each chart gets its own figure; without this the
# redeem chart was drawn on top of the purchase chart.
plt.figure()
ax = _bar_with_line(day_sta, 'total_purchase_amt', 'Purchase')
ax.legend()
plt.title("The total Purchase in Aug.14")

# Redeem per day of month.
plt.figure()
ax = _bar_with_line(day_sta, 'total_redeem_amt', 'Redeem')
ax.legend()
plt.title("The total Redeem in Aug.14")

# Same two charts for September 2013, side by side.
plt.figure(figsize=(15, 5))
day_sta = (
    total_balance_last_9[['total_purchase_amt', 'total_redeem_amt', 'day']]
    .groupby('day', as_index=False)
    .mean()
)
plt.subplot(1, 2, 1)
plt.title("The total Purchase in Sep.13")
ax = _bar_with_line(day_sta, 'total_purchase_amt', 'Purchase')
plt.subplot(1, 2, 2)
plt.title("The total Redeem in Sep.13")
bx = _bar_with_line(day_sta, 'total_redeem_amt', 'Redeem')
bx.legend()
我們發現,去年9月的數據具有非常有限的星期特征
9月份有一些奇怪的日子。
從熱圖中我們發現,第4周的周六的數據非常奇怪,第12周的周二也是
5月4日是勞動節過后的第一天
6月25日贖出很多但是購買很少
對于節假日的分析
def _date_window(frame, start, end):
    """Return the rows of ``frame`` with ``start <= date < end`` (``datetime.date`` bounds)."""
    days = frame['date'].dt.date
    return frame[(days >= start) & (days < end)]


def _grouped_holiday_bars(holiday_values, normal_values, holiday_label, normal_label):
    """Show a grouped bar chart comparing two value lists across the four periods."""
    total_width, n = 0.8, 2
    width = total_width / n
    # Shift the group so that the pair of bars is centred on each tick.
    x = np.arange(len(holiday_values)) - (total_width - width) / 2
    plt.bar(x, holiday_values, width=width, label=holiday_label)
    plt.bar(x + width, normal_values, width=width, label=normal_label)
    plt.xticks(x + width / 2, ('QingMing', 'Labour', 'DuanWu', '618'))
    plt.legend()
    plt.show()


# Holiday periods in 2014 (end date exclusive).
qingming = _date_window(total_balance, datetime.date(2014, 4, 5), datetime.date(2014, 4, 8))
labour = _date_window(total_balance, datetime.date(2014, 5, 1), datetime.date(2014, 5, 4))
duanwu = _date_window(total_balance, datetime.date(2014, 5, 31), datetime.date(2014, 6, 3))
data618 = _date_window(total_balance, datetime.date(2014, 6, 10), datetime.date(2014, 6, 20))

# Mean purchase and redeem amount of each period next to the overall mean
# since April 2014 (total_balance_1).
periods = [qingming, labour, duanwu, data618, total_balance_1]
fig = plt.figure()
index_list = ['QM', 'Labour', 'DW', '618', 'Mean']
label_list = [np.mean(period['total_purchase_amt']) for period in periods]
plt.bar(index_list, label_list, label="Purchase")
# The redeem bars use distinct tick names (trailing dot) so that they are placed
# beside, not on top of, the purchase bars.
index_list = ['QM.', 'Labour.', 'DW.', '618.', 'Mean.']
label_list = [np.mean(period['total_redeem_amt']) for period in periods]
plt.bar(index_list, label_list, label="Redeem")
plt.title("The average of different holiday")
plt.ylabel("Amount")
plt.legend()
plt.show()

# Holiday vs. ordinary purchase level. The figures are hard-coded; presumably
# they were computed offline for the matching weekdays -- TODO confirm source.
_grouped_holiday_bars(
    [176250006, 167825284, 162844282, 321591063],
    [225337516, 241859315, 225337516, 307635449],
    'Holiday_Purchase',
    'Normal_Purchase',
)

# Holiday vs. ordinary redeem level (hard-coded in the same way).
_grouped_holiday_bars(
    [159914308, 154717620, 154366940, 291016763],
    [235439685, 240364238, 235439685, 313310347],
    'Holiday_Redeem',
    'Normal_Redeem',
)

# ---- Section: analysis of the days around each holiday ----
def _date_window(frame, start, end):
    """Return the rows of ``frame`` with ``start <= date < end`` (``datetime.date`` bounds)."""
    days = frame['date'].dt.date
    return frame[(days >= start) & (days < end)]


def _plot_holiday_window(around, holiday, title):
    """Plot purchase/redeem lines for ``around`` and mark the ``holiday`` days as points.

    A new figure is opened for every call so that successive holidays do not
    pile up on the same axes. Returns the axes that were drawn on.
    """
    fig, ax = plt.subplots()
    sns.lineplot(x="date", y="total_purchase_amt", data=around, label='Purchase', ax=ax)
    sns.lineplot(x="date", y="total_redeem_amt", data=around, label='Redeem', ax=ax)
    sns.scatterplot(x="date", y="total_purchase_amt", data=holiday, ax=ax)
    sns.scatterplot(x="date", y="total_redeem_amt", data=holiday, ax=ax)
    ax.set_title(title)
    ax.legend()
    return ax


# Qingming holiday and the surrounding days.
qingming_around = _date_window(total_balance, datetime.date(2014, 4, 1), datetime.date(2014, 4, 13))
ax = _plot_holiday_window(qingming_around, qingming, "The data around Qingming Holiday")

# Labour Day holiday and the surrounding days.
labour_around = _date_window(total_balance, datetime.date(2014, 4, 25), datetime.date(2014, 5, 10))
ax = _plot_holiday_window(labour_around, labour, "The data around Labour holiday")

# Duanwu holiday and the surrounding days.
duanwu_around = _date_window(total_balance, datetime.date(2014, 5, 25), datetime.date(2014, 6, 7))
ax = _plot_holiday_window(duanwu_around, duanwu, "The data around Duanwu Holiday")

# Mid-Autumn holiday 2013 and the surrounding days.
zhongqiu = _date_window(total_balance, datetime.date(2013, 9, 19), datetime.date(2013, 9, 22))
zhongqiu_around = _date_window(total_balance, datetime.date(2013, 9, 14), datetime.date(2013, 9, 28))
ax = _plot_holiday_window(zhongqiu_around, zhongqiu, "The data around MiddleAutumn Holiday(in 2013)")

# ---- Section: outlier analysis ----
def _month_rows(frame, year, month):
    """Return the rows of ``frame`` whose 'date' lies in the given calendar month."""
    dates = frame['date']
    return frame[(dates.dt.year == year) & (dates.dt.month == month)]


def _since_april_2014(frame):
    """Return the rows of ``frame`` dated 2014-04-01 or later."""
    return frame[frame['date'] >= datetime.datetime(2014, 4, 1)]


def _kde_april_to_august(frame, column, title):
    """Draw one KDE curve of ``column`` per month, April..August 2014, on the current axes."""
    for month in range(4, 9):
        sns.kdeplot(_month_rows(frame, 2014, month)[column], label='14Y,' + str(month) + 'M')
    # Recent seaborn does not draw a legend for label= automatically.
    plt.legend()
    plt.title(title)


# Box plot of every individual purchase record to expose extreme values.
# x= is given explicitly so the box stays horizontal on every seaborn version.
plt.figure()
sns.boxplot(x=data_balance['total_purchase_amt'])
plt.title("The abnormal value of total purchase")

# Records of user 14592, largest redeem amounts first. The original note ties
# this user to the ~2e8 purchase. Printed because a bare expression shows
# nothing when the file runs as a script.
print(
    data_balance[data_balance['user_id'] == 14592]
    .sort_values(by='total_redeem_amt', axis=0, ascending=False)
    .head()
)

# Daily purchase totals for 2013-11-01..2013-11-09, the days around the 2e8 deal
# (per the original note).
e2 = total_balance[
    (total_balance['date'] >= datetime.datetime(2013, 11, 1))
    & (total_balance['date'] < datetime.datetime(2013, 11, 10))
]
plt.figure()
ax = sns.barplot(x="day", y="total_purchase_amt", data=e2, label='2E')
# Bars sit at categorical positions 0..n-1, so the line uses the same positions.
ax.plot(np.arange(len(e2)), e2['total_purchase_amt'].to_numpy(), label='2E')
# NOTE(review): the title mentions a red bar, but no bar is coloured differently.
plt.title("The influence of the big deal with 200 million purchasing(Red Bar)")
ax.legend()

# Per-day maximum and per-day sum of the individual records, computed once.
daily_max_purchase = data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).max()
daily_max_redeem = data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).max()
daily_sum_purchase = data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).sum()
daily_sum_redeem = data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).sum()

# Largest single record of each day.
plt.figure(figsize=(20, 6))
ax = sns.lineplot(x="date", y="total_purchase_amt", data=daily_max_purchase, label='MAX_PURCHASE')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=daily_max_redeem, label='MAX_REDEEM')
plt.title("The Biggest deal happend in each day")

# Largest single record of each day together with the daily totals.
plt.figure(figsize=(20, 6))
ax = sns.lineplot(x="date", y="total_purchase_amt", data=daily_max_purchase, label='MAX_PURCHASE')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=daily_max_redeem, label='MAX_REDEEM')
ax = sns.lineplot(x="date", y="total_purchase_amt", data=daily_sum_purchase, label='TOTAL_PURCHASE')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=daily_sum_redeem, label='TOTAL_REDEEM')

# Number of records per month whose purchase or redeem amount exceeds 10,000,000.
is_super_big = (
    (data_balance['total_purchase_amt'] > 10000000)
    | (data_balance['total_redeem_amt'] > 10000000)
)
big_frequancy = (
    data_balance[is_super_big][['month', 'year', 'user_id']]
    .groupby(['year', 'month'], as_index=False)
    .count()
)
# Zero-padded "year.month" label; the former float year + month/100 rendered
# October as e.g. 2013.1.
big_frequancy['i'] = (
    big_frequancy['year'].astype(str) + '.' + big_frequancy['month'].astype(str).str.zfill(2)
)
plt.figure()
ax = sns.barplot(x="i", y="user_id", data=big_frequancy)
# The threshold above is 10 million; the title used to say 100 million.
plt.title("The frequency of super big deal(larger than 10 million) in each month")

# Flag records above 1,000,000 as "big" (1) versus "small" (0).
data_balance['big_purchase'] = (data_balance['total_purchase_amt'] > 1000000).astype(int)
data_balance['big_redeem'] = (data_balance['total_redeem_amt'] > 1000000).astype(int)

# Daily totals of big and small records.
big_purchase = data_balance[data_balance['big_purchase'] == 1].groupby(['date'], as_index=False)['total_purchase_amt'].sum()
small_purchase = data_balance[data_balance['big_purchase'] == 0].groupby(['date'], as_index=False)['total_purchase_amt'].sum()
big_redeem = data_balance[data_balance['big_redeem'] == 1].groupby(['date'], as_index=False)['total_redeem_amt'].sum()
small_redeem = data_balance[data_balance['big_redeem'] == 0].groupby(['date'], as_index=False)['total_redeem_amt'].sum()

# Time series of the big and small daily totals.
fig = plt.figure(figsize=(20, 6))
plt.plot(big_purchase['date'], big_purchase['total_purchase_amt'], label='big_purchase')
plt.plot(big_redeem['date'], big_redeem['total_redeem_amt'], label='big_redeem')
plt.plot(small_purchase['date'], small_purchase['total_purchase_amt'], label='small_purchase')
plt.plot(small_redeem['date'], small_redeem['total_redeem_amt'], label='small_redeem')
plt.legend(loc='best')
plt.title("The time series of big deal of Purchase and Redeem from July.13 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# Monthly density estimates (April..August 2014) of the big and small totals.
plt.figure(figsize=(12, 10))
plt.subplot(2, 2, 1)
_kde_april_to_august(big_purchase, 'total_purchase_amt', 'BIG PURCHASE')
plt.subplot(2, 2, 2)
_kde_april_to_august(small_purchase, 'total_purchase_amt', 'SMALL PURCHASE')
plt.subplot(2, 2, 3)
_kde_april_to_august(big_redeem, 'total_redeem_amt', 'BIG REDEEM')
plt.subplot(2, 2, 4)
_kde_april_to_august(small_redeem, 'total_redeem_amt', 'SMALL REDEEM')

# Weekday column for the four daily tables.
big_purchase['weekday'] = big_purchase['date'].dt.weekday
small_purchase['weekday'] = small_purchase['date'].dt.weekday
big_redeem['weekday'] = big_redeem['date'].dt.weekday
small_redeem['weekday'] = small_redeem['date'].dt.weekday

# Per-weekday box plots of the big and small totals since April 2014.
plt.figure(figsize=(12, 10))
ax = plt.subplot(2, 2, 1)
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=_since_april_2014(big_purchase))
plt.title('BIG PURCHASE')
ax = plt.subplot(2, 2, 2)
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=_since_april_2014(big_redeem))
plt.title('BIG REDEEM')
ax = plt.subplot(2, 2, 3)
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=_since_april_2014(small_purchase))
plt.title('SMALL PURCHASE')
ax = plt.subplot(2, 2, 4)
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=_since_april_2014(small_redeem))
plt.title('SMALL REDEEM')

# ---- Section: other variables in the user balance table ----
# Keep only the records dated after 2014-04-01.
# NOTE(review): this bound is strict (>), whereas the daily table elsewhere is
# cut with >= 2014-04-01 -- confirm whether April 1st should be included.
data_balance_1 = data_balance[data_balance['date'] > datetime.datetime(2014, 4, 1)]

# Columns of the user balance table whose pairwise correlation is inspected.
feature = [
    'total_purchase_amt', 'total_redeem_amt', 'report_date',
    'tBalance', 'yBalance',
    'direct_purchase_amt', 'purchase_bal_amt', 'purchase_bank_amt',
    'consume_amt', 'transfer_amt', 'tftobal_amt', 'tftocard_amt',
    'share_amt',
]

# Open a fresh figure; otherwise the heatmap lands in whichever axes was
# current before this section ran.
plt.figure()
sns.heatmap(data_balance_1[feature].corr(), linewidths=0.05)
plt.title("The coleration between each feature in User_Balance_Table")

# ---- Section: bank (SHIBOR) and Alipay interest rates ----
def _load_rate_table(filename):
    """Read a rate CSV from ``dataset_path`` and add calendar columns.

    The ``mfd_date`` column is renamed to ``date`` and parsed with %Y%m%d.
    Returns ``(frame, rate_columns)`` where ``rate_columns`` are all columns of
    the file other than the date.
    """
    frame = pd.read_csv(dataset_path + filename)
    frame = frame.rename(columns={'mfd_date': 'date'})
    rate_columns = [x for x in frame.columns if x not in ['date']]
    frame['date'] = pd.to_datetime(frame['date'], format="%Y%m%d")
    frame['day'] = frame['date'].dt.day
    frame['month'] = frame['date'].dt.month
    frame['year'] = frame['date'].dt.year
    # Series.dt.week was removed in pandas 2.0; this is the same ISO week number.
    frame['week'] = frame['date'].dt.isocalendar().week.astype(int)
    frame['weekday'] = frame['date'].dt.weekday
    return frame, rate_columns


def _plot_bank_corr(key_column, targets, titles):
    """Draw two correlation heatmaps between the bank rates and target columns.

    ``key_column`` is a column of ``bank`` holding the date on which each rate
    row should be matched; ``targets`` is a pair of ``(frame, column)`` whose
    frames are joined on their own 'date' column.
    """
    plt.figure(figsize=(12, 4))
    # Slots 1 and 3 of a 1x3 grid are used; the middle slot is left empty.
    for slot, (frame, column), title in zip((1, 3), targets, titles):
        plt.subplot(1, 3, slot)
        plt.title(title)
        merged = pd.merge(
            bank[[key_column] + bank_features], frame,
            left_on=key_column, right_on='date',
        )
        sns.heatmap(merged[[column] + bank_features].corr(), linewidths=0.05)


# Bank rates and Alipay yield tables with calendar columns.
bank, bank_features = _load_rate_table("mfd_bank_shibor.csv")
share, share_features = _load_rate_table('mfd_day_share_interest.csv')

# Previous-day bank rate vs. the daily totals: each rate row is matched to the
# following calendar day.
bank['last_date'] = bank['date'] + datetime.timedelta(days=1)
_plot_bank_corr(
    'last_date',
    [(total_balance, 'total_purchase_amt'), (total_balance, 'total_redeem_amt')],
    [
        "The coleration between each lastday bank rate and total purchase",
        "The coleration between each lastday bank rate and total redeem",
    ],
)

# Previous-week bank rate vs. the daily totals. Matching on (week + 1, weekday)
# ignores the year, so weeks of 2013 and 2014 with the same number were mixed
# and the step from the last week of a year to week 1 never matched; joining on
# date + 7 days pairs each rate with the same weekday one week later.
bank['last_week'] = bank['week'] + 1  # kept for backward compatibility
bank['last_week_date'] = bank['date'] + datetime.timedelta(days=7)
_plot_bank_corr(
    'last_week_date',
    [(total_balance, 'total_purchase_amt'), (total_balance, 'total_redeem_amt')],
    [
        "The coleration between each last week bank rate and total purchase",
        "The coleration between each last week bank rate and total redeem",
    ],
)

# Previous-day bank rate vs. the small-record daily totals.
_plot_bank_corr(
    'last_date',
    [(small_purchase, 'total_purchase_amt'), (small_redeem, 'total_redeem_amt')],
    [
        "The coleration of Small Rate purchase",
        "The coleration of Small Rate redeem",
    ],
)

# Previous-day bank rate vs. the big-record daily totals.
_plot_bank_corr(
    'last_date',
    [(big_purchase, 'total_purchase_amt'), (big_redeem, 'total_redeem_amt')],
    [
        "The coleration of Big Rate purchase",
        "The coleration of Big Rate redeem",
    ],
)
利率信息
def _plot_share_corr(key_column, targets, titles=(None, None)):
    """Draw two correlation heatmaps between the Alipay yield columns and targets.

    ``key_column`` is a column of ``share`` holding the date on which each row
    should be matched; ``targets`` is a pair of ``(frame, column)`` whose frames
    are joined on their own 'date' column. ``titles`` may contain ``None``.
    """
    plt.figure(figsize=(12, 4))
    # Slots 1 and 3 of a 1x3 grid are used; the middle slot is left empty.
    for slot, (frame, column), title in zip((1, 3), targets, titles):
        plt.subplot(1, 3, slot)
        if title is not None:
            plt.title(title)
        merged = pd.merge(
            share[[key_column] + share_features], frame,
            left_on=key_column, right_on='date',
        )
        # vmin=0 clamps the colour scale at zero, so all negative correlations
        # share the lowest colour.
        sns.heatmap(merged[[column] + share_features].corr(), linewidths=0.05, vmin=0)


def _plot_share_rate_against(column, label):
    """Plot the first Alipay yield column and a daily total on twin y-axes."""
    fig, ax1 = plt.subplots(figsize=(15, 5))
    # Only the first yield column is drawn.
    for rate_column in share_features[:1]:
        ax1.plot(share['date'], share[rate_column], 'b', label=rate_column)
    ax1.legend()
    ax2 = ax1.twinx()
    ax2.plot(total_balance['date'], total_balance[column], 'g', label=label)
    ax2.legend(loc=2)
    plt.show()


# Previous-day Alipay yield vs. the daily totals: each row is matched to the
# following calendar day.
share['last_date'] = share['date'] + datetime.timedelta(days=1)
_plot_share_corr(
    'last_date',
    [(total_balance, 'total_purchase_amt'), (total_balance, 'total_redeem_amt')],
)

# Previous-week Alipay yield vs. the daily totals. Matching on (week + 1,
# weekday) ignores the year and breaks at the year boundary, so the join uses
# date + 7 days (same weekday, one week later) instead.
share['last_week'] = share['week'] + 1  # kept for backward compatibility
share['last_week_date'] = share['date'] + datetime.timedelta(days=7)
_plot_share_corr(
    'last_week_date',
    [(total_balance, 'total_purchase_amt'), (total_balance, 'total_redeem_amt')],
)

# Alipay yield against the daily purchase total over time.
_plot_share_rate_against('total_purchase_amt', "Total purchase")

# Alipay yield against the daily redeem total over time.
_plot_share_rate_against('total_redeem_amt', "Total redeem")

# Previous-day Alipay yield vs. the small-record daily totals.
_plot_share_corr(
    'last_date',
    [(small_purchase, 'total_purchase_amt'), (small_redeem, 'total_redeem_amt')],
    ("SMALL PURCHASE", "SMALL REDEEM"),
)

# Previous-day Alipay yield vs. the big-record daily totals.
_plot_share_corr(
    'last_date',
    [(big_purchase, 'total_purchase_amt'), (big_redeem, 'total_redeem_amt')],
    ("BIG PURCHASE", "BIG REDEEM"),
)
對利率的分析可以總結以下幾點:
根據上述分析,可以發現以下幾點會對購買贖回行為有影響:
總結
以上是生活随笔為你收集整理的【算法竞赛学习】资金流入流出预测-挑战Baseline_数据探索与分析1的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Oracle 函数大全
- 下一篇: 【算法竞赛学习】资金流入流出预测-挑战B