# Combining data, part 1: element-wise addition versus DataFrame.join.
#
# DataFrame.join is a left join on the row index by default: every row of
# the calling frame is kept, and the other frame's columns are attached
# where the row labels match (NaN elsewhere).
import numpy as np
import pandas as pd

# --- Case 1: t2's row labels ('A', 'B') are a subset of t1's ----------------
t1 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list('ABC'), columns=list('DEFG'))
t2 = pd.DataFrame(np.arange(10).reshape(2, 5), index=list('AB'), columns=list('VWXYZ'))

print(t1)
#    D  E   F   G
# A  0  1   2   3
# B  4  5   6   7
# C  8  9  10  11

print(t2)
#    V  W  X  Y  Z
# A  0  1  2  3  4
# B  5  6  7  8  9

# `+` aligns on BOTH row and column labels; the two frames share no column
# label, so every cell of the result is NaN.
print(t1 + t2)
#     D   E   F   G   V   W   X   Y   Z
# A NaN NaN NaN NaN NaN NaN NaN NaN NaN
# B NaN NaN NaN NaN NaN NaN NaN NaN NaN
# C NaN NaN NaN NaN NaN NaN NaN NaN NaN

# join with t1 as the base: rows are matched on identical row labels.
print(t1.join(t2))
#    D  E   F   G    V    W    X    Y    Z
# A  0  1   2   3  0.0  1.0  2.0  3.0  4.0
# B  4  5   6   7  5.0  6.0  7.0  8.0  9.0
# C  8  9  10  11  NaN  NaN  NaN  NaN  NaN

# join with t2 as the base: only t2's rows ('A', 'B') are kept.
print(t2.join(t1))
#    V  W  X  Y  Z  D  E  F  G
# A  0  1  2  3  4  0  1  2  3
# B  5  6  7  8  9  4  5  6  7

# --- Case 2: the row labels overlap only partially ('A', 'C') --------------
t3 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list('ABC'), columns=list('DEFG'))
t4 = pd.DataFrame(np.arange(10).reshape(2, 5), index=list('AC'), columns=list('VWXYZ'))

print(t3)
#    D  E   F   G
# A  0  1   2   3
# B  4  5   6   7
# C  8  9  10  11

print(t4)
#    V  W  X  Y  Z
# A  0  1  2  3  4
# C  5  6  7  8  9

# join with t3 as the base: row 'B' has no partner in t4, so it gets NaN.
print(t3.join(t4))
#    D  E   F   G    V    W    X    Y    Z
# A  0  1   2   3  0.0  1.0  2.0  3.0  4.0
# B  4  5   6   7  NaN  NaN  NaN  NaN  NaN
# C  8  9  10  11  5.0  6.0  7.0  8.0  9.0

# join with t4 as the base: only rows 'A' and 'C' are kept.
print(t4.join(t3))
#    V  W  X  Y  Z  D  E   F   G
# A  0  1  2  3  4  0  1   2   3
# C  5  6  7  8  9  8  9  10  11

# --- Case 3: no row label in common ----------------------------------------
t5 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list('ABC'), columns=list('DEFG'))
t6 = pd.DataFrame(np.arange(10).reshape(2, 5), index=list('DE'), columns=list('VWXYZ'))

# join with t5 as the base: nothing matches, so all attached columns are NaN.
print(t5.join(t6))
#    D  E   F   G   V   W   X   Y   Z
# A  0  1   2   3 NaN NaN NaN NaN NaN
# B  4  5   6   7 NaN NaN NaN NaN NaN
# C  8  9  10  11 NaN NaN NaN NaN NaN

# join with t6 as the base: nothing matches, so all attached columns are NaN.
print(t6.join(t5))
#    V  W  X  Y  Z   D   E   F   G
# D  0  1  2  3  4 NaN NaN NaN NaN
# E  5  6  7  8  9 NaN NaN NaN NaN
示例2
# Combining data, part 2: DataFrame.merge.
#
# With no arguments, merge performs an inner join on the column labels the
# two frames have in common (here 'D'): a result row is produced for every
# pair of rows whose key values are equal, the key column appears once, and
# the result gets a fresh 0..n-1 row index.
import numpy as np
import pandas as pd

t1 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list('ABC'), columns=list('DEFG'))
t2 = pd.DataFrame(np.arange(10).reshape(2, 5), index=list('AB'), columns=list('DWXYZ'))

print(t1)
#    D  E   F   G
# A  0  1   2   3
# B  4  5   6   7
# C  8  9  10  11

print(t2)
#    D  W  X  Y  Z
# A  0  1  2  3  4
# B  5  6  7  8  9

# `+` aligns on row and column labels; only column 'D' exists in both frames.
print(t1 + t2)
#      D   E   F   G   W   X   Y   Z
# A  0.0 NaN NaN NaN NaN NaN NaN NaN
# B  9.0 NaN NaN NaN NaN NaN NaN NaN
# C  NaN NaN NaN NaN NaN NaN NaN NaN

# Only D == 0 occurs in both frames, so the inner join yields one row.
print(t1.merge(t2))
#    D  E  F  G  W  X  Y  Z
# 0  0  1  2  3  1  2  3  4

# Make a second key value (5) common to both frames.
t1.iloc[2, 0] = 5
print(t1)
#    D  E   F   G
# A  0  1   2   3
# B  4  5   6   7
# C  5  9  10  11

print(t2)  # unchanged

# t1 on the left: its columns come first; rows with D == 0 and D == 5 match.
print(t1.merge(t2))
#    D  E   F   G  W  X  Y  Z
# 0  0  1   2   3  1  2  3  4
# 1  5  9  10  11  6  7  8  9

# t2 on the left: same matches, t2's columns come first.
print(t2.merge(t1))
#    D  W  X  Y  Z  E   F   G
# 0  0  1  2  3  4  1   2   3
# 1  5  6  7  8  9  9  10  11

# No key value in common any more -> the inner join is empty.
t1.iloc[[0, 1, 2], 0] = 1
print(t1)
#    D  E   F   G
# A  1  1   2   3
# B  1  5   6   7
# C  1  9  10  11

print(t2)  # unchanged

print(t2.merge(t1))
# Empty DataFrame
# Columns: [D, W, X, Y, Z, E, F, G]
# Index: []

# Every t1 row now has D == 0, so t2's single D == 0 row matches three times.
t1.iloc[[0, 1, 2], 0] = 0
print(t1)
#    D  E   F   G
# A  0  1   2   3
# B  0  5   6   7
# C  0  9  10  11

print(t2)  # unchanged

print(t2.merge(t1))
#    D  W  X  Y  Z  E   F   G
# 0  0  1  2  3  4  1   2   3
# 1  0  1  2  3  4  5   6   7
# 2  0  1  2  3  4  9  10  11

# how='left' keeps every row of t2; rows without a partner in t1 are padded
# with NaN (which turns the attached integer columns into floats).
print(t2.merge(t1, how='left'))
#    D  W  X  Y  Z    E     F     G
# 0  0  1  2  3  4  1.0   2.0   3.0
# 1  0  1  2  3  4  5.0   6.0   7.0
# 2  0  1  2  3  4  9.0  10.0  11.0
# 3  5  6  7  8  9  NaN   NaN   NaN

# how='outer' keeps the keys of both frames; on='D' names the key explicitly.
print(t2.merge(t1, on='D', how='outer'))
#    D  W  X  Y  Z    E     F     G
# 0  0  1  2  3  4  1.0   2.0   3.0
# 1  0  1  2  3  4  5.0   6.0   7.0
# 2  0  1  2  3  4  9.0  10.0  11.0
# 3  5  6  7  8  9  NaN   NaN   NaN

# --- Joining on differently named key columns ------------------------------
t3 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list('abc'), columns=list('MNOP'))
t4 = pd.DataFrame(np.arange(10).reshape(2, 5), index=list('de'), columns=list('MWXYZ'))

print(t3)
#    M  N   O   P
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11

print(t4)
#    M  W  X  Y  Z
# d  0  1  2  3  4
# e  5  6  7  8  9

# Match t3['N'] against t4['Z'] (only the value 9 is shared). The column 'M'
# exists in both frames but is not a key, so it is disambiguated with the
# default suffixes '_x' (left) and '_y' (right).
print(t3.merge(t4, left_on='N', right_on='Z'))
#    M_x  N   O   P  M_y  W  X  Y  Z
# 0    8  9  10  11    5  6  7  8  9
# Grouping and aggregation with DataFrame.groupby.
import pandas as pd
import numpy as np

dict_obj = {
    'key1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
    'key2': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'data1': np.arange(8),
    'data2': np.arange(8),
}
df = pd.DataFrame(dict_obj)
print(df)
#   key1   key2  data1  data2
# 0    a    one      0      0
# 1    b    one      1      1
# 2    a    two      2      2
# 3    b  three      3      3
# 4    a    two      4      4
# 5    b    two      5      5
# 6    a    one      6      6
# 7    a  three      7      7

# groupby returns a lazy DataFrameGroupBy object; printing it only shows its
# repr, e.g. <pandas.core.groupby.generic.DataFrameGroupBy object at 0x...>
grouped = df.groupby(by="key1")
print(grouped)

# Iterating the groupby object yields (group key, sub-DataFrame) tuples.
for group in grouped:
    print(group)
# ('a',   key1   key2  data1  data2
# 0    a    one      0      0
# 2    a    two      2      2
# 4    a    two      4      4
# 6    a    one      6      6
# 7    a  three      7      7)
# ('b',   key1   key2  data1  data2
# 1    b    one      1      1
# 3    b  three      3      3
# 5    b    two      5      5)

# Mean per group. key2 holds strings, so it must be excluded explicitly:
# since pandas 2.0 non-numeric columns are no longer dropped silently and
# mean() would raise a TypeError without numeric_only=True.
key1_mean = grouped.mean(numeric_only=True)
print(key1_mean)
#       data1  data2
# key1
# a       3.8    3.8
# b       3.0    3.0

# Sum per group. Without numeric_only=True recent pandas versions would
# concatenate the key2 strings instead of ignoring that column.
key1_sum = grouped.sum(numeric_only=True)
print(key1_sum)
#       data1  data2
# key1
# a        19     19
# b         9      9

# Number of non-null records per group and column (works for any dtype).
key1_count = grouped.count()
print(key1_count)
#       key2  data1  data2
# key1
# a        5      5      5
# b        3      3      3
# Row indexes and composite (multi-level) indexes.
import numpy as np
import pandas as pd

t1 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list('ABC'), columns=list('WXYZ'))
print(t1)
#    W  X   Y   Z
# A  0  1   2   3
# B  4  5   6   7
# C  8  9  10  11

print(t1.index)
# Index(['A', 'B', 'C'], dtype='object')

# reindex does not rename rows: it selects rows by label, and labels that do
# not exist in the frame produce all-NaN rows.
print(t1.reindex(['a', 'e']))
#     W   X   Y   Z
# a NaN NaN NaN NaN
# e NaN NaN NaN NaN

print(t1.reindex(['A', 'e']))
#      W    X    Y    Z
# A  0.0  1.0  2.0  3.0
# e  NaN  NaN  NaN  NaN

# Use a column as the row index. By default the column is removed from the data.
print(t1.set_index('W'))
#    X   Y   Z
# W
# 0  1   2   3
# 4  5   6   7
# 8  9  10  11

# drop=False keeps 'W' as a regular column as well.
print(t1.set_index('W', drop=False))
#    W  X   Y   Z
# W
# 0  0  1   2   3
# 4  4  5   6   7
# 8  8  9  10  11

# Unique values of an index: frame.set_index(col).index.unique().
# The repr depends on the pandas version: pandas < 2.0 prints
# Int64Index([0, 4, 8], dtype='int64', name='W'), newer versions print
# Index([0, 4, 8], dtype='int64', name='W').
print(t1.set_index('W').index)
print(t1.set_index('W').index.unique())

# Introduce a duplicate in 'W' to see unique() collapse it.
t1.loc['B', 'W'] = 8
print(t1)
#    W  X   Y   Z
# A  0  1   2   3
# B  8  5   6   7
# C  8  9  10  11

print(t1.set_index('W').index.unique())
# values [0, 8], name='W'

# Passing a list of columns builds a two-level (composite) index.
t2 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list('ABC'), columns=list('WXYZ'))
print(t2)
#    W  X   Y   Z
# A  0  1   2   3
# B  4  5   6   7
# C  8  9  10  11

print(t2.set_index(['W', 'X']))
#       Y   Z
# W X
# 0 1   2   3
# 4 5   6   7
# 8 9  10  11

# The result is still a DataFrame; only its index became a MultiIndex.
print(type(t2.set_index(['W', 'X'])))
# <class 'pandas.core.frame.DataFrame'>
Series復合索引
a = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1), 'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'], 'd': list("hjklmno")})
設置c,d為索引
# Composite index on a Series: use columns 'c' (outer) and 'd' (inner) as index.
import numpy as np
import pandas as pd

a = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': list("hjklmno"),
})
print(type(a))
# <class 'pandas.core.frame.DataFrame'>
print(a)
#    a  b    c  d
# 0  0  7  one  h
# 1  1  6  one  j
# 2  2  5  one  k
# 3  3  4  two  l
# 4  4  3  two  m
# 5  5  2  two  n
# 6  6  1  two  o

b = a.set_index(['c', 'd'])
print(b)
#        a  b
# c   d
# one h  0  7
#     j  1  6
#     k  2  5
# two l  3  4
#     m  4  3
#     n  5  2
#     o  6  1

# Selecting an outer-level label returns the sub-frame indexed by the inner level.
print(b.loc['one'])
#    a  b
# d
# h  0  7
# j  1  6
# k  2  5

# A second .loc on the inner level returns a single row as a Series.
print(b.loc['one'].loc['j'])
# a    1
# b    6
# Name: j, dtype: int64

print(b.loc['one'].loc['j']['a'])
# 1

# Taking one column of the multi-indexed frame gives a multi-indexed Series.
c = b['a']
print(type(c))
# <class 'pandas.core.series.Series'>
print(c)
# c    d
# one  h    0
#      j    1
#      k    2
# two  l    3
#      m    4
#      n    5
#      o    6
# Name: a, dtype: int64

# Two equivalent lookups: level by level, or with both labels at once.
print(c['one']['j'])
# 1
print(c['one', 'j'])
# 1
設置d,c為索引
# Composite index with the levels reversed: 'd' is the outer level, 'c' the inner.
import numpy as np
import pandas as pd

a = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': list("hjklmno"),
})
print(type(a))
# <class 'pandas.core.frame.DataFrame'>
print(a)
#    a  b    c  d
# 0  0  7  one  h
# 1  1  6  one  j
# 2  2  5  one  k
# 3  3  4  two  l
# 4  4  3  two  m
# 5  5  2  two  n
# 6  6  1  two  o

b = a.set_index(['d', 'c'])
print(b)
#        a  b
# d c
# h one  0  7
# j one  1  6
# k one  2  5
# l two  3  4
# m two  4  3
# n two  5  2
# o two  6  1

# Select by the outer level ('d'); the result is indexed by the inner level ('c').
print(b.loc['j'])
#      a  b
# c
# one  1  6

print(b.loc['j'].loc['one'])
# a    1
# b    6
# Name: one, dtype: int64

# Two equivalent ways to pick a single cell from the selected sub-frame.
print(b.loc['j'].loc['one']['a'])
# 1
print(b.loc['j'].loc['one', 'a'])
# 1

# swaplevel exchanges the two index levels ('c' becomes the outer level);
# the row order itself is unchanged.
print(b.swaplevel())
#        a  b
# c   d
# one h  0  7
#     j  1  6
#     k  2  5
# two l  3  4
#     m  4  3
#     n  5  2
#     o  6  1
DataFrame復合索引
# Composite index on a DataFrame: looking up a cell from the outer or the
# inner index level.
import numpy as np
import pandas as pd

a = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': list("hjklmno"),
})
print(type(a))
# <class 'pandas.core.frame.DataFrame'>
print(a)
#    a  b    c  d
# 0  0  7  one  h
# 1  1  6  one  j
# 2  2  5  one  k
# 3  3  4  two  l
# 4  4  3  two  m
# 5  5  2  two  n
# 6  6  1  two  o

b = a.set_index(['c', 'd'])
print(b)
#        a  b
# c   d
# one h  0  7
#     j  1  6
#     k  2  5
# two l  3  4
#     m  4  3
#     n  5  2
#     o  6  1

# Start from the outer level ('c'), then pick the inner label and the column.
print(b.loc['one'].loc['j', 'b'])
# 6

# Start from the inner level: swap the levels first so that 'd' is outermost.
print(b.swaplevel().loc['j'].loc['one', 'b'])
# 6
5.練習
1.使用matplotlib呈現出店鋪總數排名前10的國家
# Exercise 1: use matplotlib to show the 10 countries with the most stores.
# Tools: groupby + sort_values.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 25600 entries, 0 to 25599
# Data columns (total 13 columns):
#  #   Column          Non-Null Count  Dtype
# ---  ------          --------------  -----
#  0   Brand           25600 non-null  object
#  1   Store Number    25600 non-null  object
#  2   Store Name      25600 non-null  object
#  3   Ownership Type  25600 non-null  object
#  4   Street Address  25598 non-null  object
#  5   City            25585 non-null  object
#  6   State/Province  25600 non-null  object
#  7   Country         25600 non-null  object
#  8   Postcode        24078 non-null  object
#  9   Phone Number    18739 non-null  object
#  10  Timezone        25600 non-null  object
#  11  Longitude       25599 non-null  float64
#  12  Latitude        25599 non-null  float64
# dtypes: float64(2), object(11)
# memory usage: 2.5+ MB

# count() gives the number of non-null values per column for each country.
data = df.groupby(by='Country').count()
print(data.head(3))
#          Brand  Store Number  Store Name  ...  Timezone  Longitude  Latitude
# Country                                   ...
# AD           1             1           1  ...         1          1         1
# AE         144           144         144  ...       144        144       144
# AR         108           108         108  ...       108        108       108
# [3 rows x 12 columns]

# 'Brand' has no missing values (see info above), so its count equals the
# number of stores. Sort descending and keep the first 10 countries.
data_sort = data.sort_values(by='Brand', ascending=False).head(10)
print(data_sort['Brand'])
# Equivalent one-liner:
#   df.groupby(by='Country').count()['Brand'].sort_values(ascending=False)[0:10]
#
# Country
# US    13608
# CN     2734
# CA     1468
# JP     1237
# KR      993
# GB      901
# MX      579
# TW      394
# TR      326
# PH      298
# Name: Brand, dtype: int64

x = data_sort['Brand'].index
y = data_sort['Brand'].values

# Figure size in inches and resolution.
plt.figure(figsize=(15, 8), dpi=80)
# Bar chart: one bar per country.
plt.bar(x, y)
plt.show()
2.使用matplotlib呈現出中國每個城市的店鋪數量
# Exercise 2: use matplotlib to show the number of stores per city in China.
# NOTE: only the 30 cities with the most stores are plotted, not every city.
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib

# The city names are Chinese, so select a font that contains CJK glyphs.
font = {'family': 'SimHei', 'weight': 'bold', 'size': 12}
matplotlib.rc("font", **font)

file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)
# df.info() reports 25600 rows and 13 columns: Brand, Store Number,
# Store Name, Ownership Type, Street Address, City, State/Province, Country,
# Postcode, Phone Number, Timezone (all object), Longitude and Latitude
# (float64). City has 25585 non-null values, Brand has 25600.

# Keep only the stores located in China.
df = df[df['Country'] == 'CN']
# print(df.head(3))
#           Brand  Store Number  ...  Longitude  Latitude
# 2091  Starbucks  22901-225145  ...     116.32     39.90
# 2092  Starbucks  32320-116537  ...     116.32     39.97
# 2093  Starbucks  32447-132306  ...     116.47     39.95
# [3 rows x 13 columns]

# Stores per city (non-null 'Brand' count), largest first, top 30 only.
data = df.groupby(by='City').count()['Brand'].sort_values(ascending=False).head(30)
x = data.index
y = data.values

# Figure size in inches and resolution.
plt.figure(figsize=(15, 8), dpi=80)
# Bar chart: one bar per city.
plt.bar(x, y)
# Rotate the x tick labels so the city names do not overlap.
plt.xticks(rotation=45)
plt.show()
之前所學習的DatetimeIndex可以理解為時間戳,那么現在我們要學習的PeriodIndex可以理解為時間段:
periods = pd.PeriodIndex(year=df["year"], month=df["month"], day=df["day"], hour=df["hour"], freq="H")
那么如果給這個時間段降采樣呢?
data = df.set_index(periods).resample("10D").mean()
# Air-quality data is available for five cities (Beijing, Shanghai,
# Guangzhou, Shenzhen and Shenyang); the task is to plot how PM2.5 changes
# over time. This script handles the Beijing file.
import pandas as pd
from matplotlib import pyplot as plt

df = pd.read_csv('./PM2.5/BeijingPM20100101_20151231.csv')
# df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 52584 entries, 0 to 52583
# Data columns (total 18 columns):
#  #   Column           Non-Null Count  Dtype
# ---  ------           --------------  -----
#  0   No               52584 non-null  int64
#  1   year             52584 non-null  int64
#  2   month            52584 non-null  int64
#  3   day              52584 non-null  int64
#  4   hour             52584 non-null  int64
#  5   season           52584 non-null  int64
#  6   PM_Dongsi        25052 non-null  float64
#  7   PM_Dongsihuan    20508 non-null  float64
#  8   PM_Nongzhanguan  24931 non-null  float64
#  9   PM_US Post       50387 non-null  float64
#  10  DEWP             52579 non-null  float64
#  11  HUMI             52245 non-null  float64
#  12  PRES             52245 non-null  float64
#  13  TEMP             52579 non-null  float64
#  14  cbwd             52579 non-null  object
#  15  Iws              52579 non-null  float64
#  16  precipitation    52100 non-null  float64
#  17  Iprec            52100 non-null  float64
# dtypes: float64(11), int64(6), object(1)
# memory usage: 7.2+ MB

pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 18)
# print(df.head())
# The first rows are the hourly records of 2010-01-01 00:00 .. 04:00; all
# PM_* readings are NaN there, while the weather columns (DEWP, HUMI, PRES,
# TEMP, cbwd, Iws, precipitation, Iprec) are populated.

# Build a pandas period index from the separate year/month/day/hour columns.
# pandas >= 2.2 deprecates passing the fields to the PeriodIndex constructor
# (and the "H" frequency alias) in favour of PeriodIndex.from_fields and "h";
# older versions only offer the constructor form.
period_fields = dict(year=df["year"], month=df["month"], day=df["day"], hour=df["hour"])
if hasattr(pd.PeriodIndex, "from_fields"):
    periods = pd.PeriodIndex.from_fields(**period_fields, freq="h")
else:
    periods = pd.PeriodIndex(**period_fields, freq="H")
# print(periods)
# PeriodIndex(['2010-01-01 00:00', '2010-01-01 01:00', '2010-01-01 02:00',
#              '2010-01-01 03:00', '2010-01-01 04:00', '2010-01-01 05:00',
#              '2010-01-01 06:00', '2010-01-01 07:00', '2010-01-01 08:00',
#              '2010-01-01 09:00',
#              ...
#              '2015-12-31 14:00', '2015-12-31 15:00', '2015-12-31 16:00',
#              '2015-12-31 17:00', '2015-12-31 18:00', '2015-12-31 19:00',
#              '2015-12-31 20:00', '2015-12-31 21:00', '2015-12-31 22:00',
#              '2015-12-31 23:00'],
#             dtype='period[H]', length=52584, freq='H')

df['datetime'] = periods
df.set_index('datetime', inplace=True)
# print(df.head())
# Same rows as before, now labelled 2010-01-01 00:00 .. 2010-01-01 04:00 by
# the 'datetime' period index.

# Down-sample to 10-day means. The text column 'cbwd' cannot be averaged and
# makes mean() raise on pandas >= 2.0, so only numeric columns are resampled.
# NOTE(review): pandas >= 2.2 emits a FutureWarning when resampling on a
# PeriodIndex; converting with df.index.to_timestamp() would avoid it.
df = df.select_dtypes(include='number').resample('10D').mean()

# Drop the bins for which no 'PM_US Post' reading exists.
data = df['PM_US Post'].dropna()
# print(data)
# NOTE: the sample output recorded in the original notes (hourly values from
# 2010-01-01 23:00 to 2015-12-31 23:00, "Freq: H, Length: 50387") was taken
# WITHOUT the resample step above; with it, `data` holds one mean value per
# 10-day bin.

x = data.index
y = data.values
plt.figure(figsize=(15, 8), dpi=80)
# Plot against positions 0..n-1 and label every 10th position with its period.
plt.plot(range(len(x)), y)
plt.xticks(range(0, len(x), 10), list(x)[::10], rotation=45)
plt.show()