pandas基础知识
一,创建Series
# Section 1 example: build a Series from a plain Python list.
import pandas as pd

countries = ['中國(guó)', '美國(guó)', '澳大利亞']
# No index is given, so the Series gets the default integer index.
countries_s = pd.Series(countries)
print(type(countries_s))
print(countries_s)
print(countries_s.values)

# Section 2: naming the index
# Section 2 example: build a Series from a dict and name its index and data.
import pandas as pd

country_dicts = {'CH': '中國(guó)', 'US': '美國(guó)', 'AU': '澳大利亞'}
# The dict keys become the index labels.
country_dict_s = pd.Series(country_dicts)
# Give the index a name.
country_dict_s.index.name = 'index'
# Give the data (the Series itself) a name.
country_dict_s.name = 'Country'

print(country_dict_s)
print(country_dict_s.values)
print(country_dict_s.index)

# Section 3: pd.DataFrame
# Section 3 example: build a DataFrame from several Series (one per row).
import pandas as pd

country1 = pd.Series({'Name': '中國(guó)', 'Language': 'Chinese',
                      'Area': '9.597M km2', 'Happiness Rank': 79})
country2 = pd.Series({'Name': '美國(guó)', 'Language': 'English (US)',
                      'Area': '9.834M km2', 'Happiness Rank': 14})
country3 = pd.Series({'Name': '澳大利亞', 'Language': 'English (AU)',
                      'Area': '7.692M km2', 'Happiness Rank': 9})

df = pd.DataFrame([country1, country2, country3])
print(df)
print('df.values=', df.values)
print(type(df['Area']))
print('area values=', df['Area'].values)
print(df[['Name', 'Area']])
print(df[['Name', 'Area']].values)

# When manipulating the underlying values, work on a copy; otherwise the
# original data would be modified as well.
rank = df['Happiness Rank'].values.copy()
rank += 2
# The frame still shows the unmodified ranks because `rank` is a copy.
print(df['Happiness Rank'].values)

# Section 4: adding a column
# Section 4 example: add new columns to an existing DataFrame.
import pandas as pd

country_1 = pd.Series({'Name': '中國(guó)', 'Language': '漢語(yǔ)', 'Area': '11111'})
country_2 = pd.Series({'Name': '美國(guó)', 'Language': '英語(yǔ)', 'Area': '222'})
country_3 = pd.Series({'Name': '澳大利亞', 'Language': '英語(yǔ)', 'Area': '333'})

df = pd.DataFrame([country_1, country_2, country_3], index=['CH', 'US', 'AU'])
print(df)

# Add a column by column label: a scalar is broadcast to every row ...
df['location'] = '地球'
print(df)
# ... while a list supplies one value per row.
df['region'] = ['亞洲', '北美洲', '大洋洲']
print(df)

# Section 5: transpose and drop
# Section 5 example: transpose a DataFrame and drop rows.
import pandas as pd

country_1 = pd.Series({'Name': '中國(guó)', 'Language': '漢語(yǔ)', 'Area': '11111'})
country_2 = pd.Series({'Name': '美國(guó)', 'Language': '英語(yǔ)', 'Area': '222'})
country_3 = pd.Series({'Name': '澳大利亞', 'Language': '英語(yǔ)', 'Area': '333'})

df = pd.DataFrame([country_1, country_2, country_3], index=['CH', 'US', 'AU'])
print(df)

# Swap rows and columns.
print('====================================')
print(df.T)

# Drop a row by label.
print('====================================')
print(df.drop(['CH']))
print('====================================')
# Note: drop() returns a new frame; the original data is unchanged.
print(df)

# Section 6: reading a CSV with index_col
# Section 6 example: read selected columns of a CSV, with and without index_col.
import pandas as pd

# index_col picks the column used as the row index;
# usecols restricts which columns are read.
reprot_2016_df = pd.read_csv(
    './2016.csv',
    index_col='Country',
    usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'],
)
# Preview the data.
print(reprot_2016_df.head())
print(reprot_2016_df.values[:2, :])

# Same columns, but 'Country' stays a regular column this time.
reprot_2016_df = pd.read_csv(
    './2016.csv',
    usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'],
)
# Preview the data.
print('==============================================')
print(reprot_2016_df.head())
print(reprot_2016_df.values[:2, :])

print('==============================================')
print(reprot_2016_df[['Region', 'Happiness Rank']].values[:2, :])

# A second way of reading a CSV
# Read a results CSV and collect probabilities keyed by a name derived from
# the file name column.
import pandas as pd

df_xc = pd.read_csv('../submit/submit_LF2551924C021_1007_xc.csv')  # defect results
print('len(df_xc)=', len(df_xc))

newdict = {}
# NOTE(review): the original indentation was lost in the source; the nesting
# below (debug prints for the first row only, grouping and `break` at loop
# level) is a reconstruction - confirm against the author's intent.
for index, row in df_xc.iterrows():
    if index < 1:
        # Name is built from fields 2..5 of the underscore-separated filename.
        name = '_'.join(row.filename.split('_')[2:6])
        print('===================')
        print('row')
        print(row)
        print('====================')
        print('name=', name)
    # Group the probabilities by name.
    newdict.setdefault(name, []).append(row.probability)
    break

# Section 7: pd.query
# Section 7 example: filter rows with DataFrame.query expressions.
from numpy.random import randn
from pandas import DataFrame
import pandas as pd  # the snippet uses `pd.DataFrame`, so `pd` must be imported

df = pd.DataFrame(randn(5, 2), columns=list('ab'))
print(df)
# Column names can be referenced directly inside the query string.
print(df.query('a > b'))
print(df.query('a > 0.2'))

# Section 8: renaming columns
# Section 8 example: rename DataFrame columns in place.
import pandas as pd

reprot_2016_df = pd.read_csv(
    './2016.csv',
    usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'],
)
# Preview the data.
print('==============================================')
print(reprot_2016_df.head())

# Map old column names to new ones; inplace=True modifies the frame itself.
reprot_2016_df.rename(
    columns={'Country': '國(guó)家', 'Region': '地區(qū)',
             'Happiness Rank': '排名', 'Happiness Score': '幸福指數(shù)'},
    inplace=True,
)
print('==============================================')
print(reprot_2016_df.head())

# Section 9: filtering
# Section 9 example: filter rows with boolean masks.
import pandas as pd

reprot_2016_df = pd.read_csv(
    './2016.csv',
    usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'],
)
# Preview the data.
print('==============================================')
print(reprot_2016_df.head())

print('==============================================')
# Single condition.
df = reprot_2016_df[reprot_2016_df['Country'] == 'Denmark']
print(df.head())

print('==============================================')
# Combined conditions: each comparison is parenthesised and joined with `&`.
only_western_europe_10 = reprot_2016_df[
    (reprot_2016_df['Region'] == 'Western Europe')
    & (reprot_2016_df['Happiness Rank'] > 10)
]
print(only_western_europe_10.head())

# Section 10: handling NaN values
# Section 10 example: inspect, filter, fill and drop missing values.
import pandas as pd

log_df = pd.read_csv('./data/log.csv')
print(log_df.head())

print('===============查看head是否有空值=========================')
# Check whether the first rows contain missing values.
print(log_df.head().isnull())

print('===============取出volume不為空的數(shù)據(jù)=========================')
# Keep only the rows whose 'volume' is not missing.
print(log_df[log_df['volume'].notnull()])

# Use ('time', 'user') as the index.
log_df.set_index(['time', 'user'], inplace=True)
print(log_df)

# Sort by the index.
print('===============按照index排序=========================')
log_df.sort_index(inplace=True)
print(log_df)

print('================將nan替換為0========================')
# Replace NaN with 0 (returns a new frame).
print(log_df.fillna(0))

print('================丟掉nan值========================')
# Drop the rows that contain NaN (returns a new frame).
print(log_df.dropna())

# Section 11: handling duplicates
# Section 11 example: detect and remove duplicated rows.
import pandas as pd

data = pd.DataFrame({
    'k1': ['one', 'two'] * 2 + ['two'],
    'k2': [1, 3, 3, 4, 4],
})
print(data)

print('===============判斷是否重復(fù)=========================')
# True for every row that repeats an earlier row.
print(data.duplicated())

print('===============去除重復(fù)數(shù)據(jù)=========================')
# Drop fully duplicated rows.
print(data.drop_duplicates())

print('===============去除指定列的重復(fù)數(shù)據(jù)=========================')
# Drop rows that are duplicated with respect to column 'k2' only.
print(data.drop_duplicates(['k2']))

# Section 12: merging data
# Section 12 example: merge two frames on a shared key column.
import pandas as pd

staff_df = pd.DataFrame([
    {'姓名': '張三', '部門(mén)': '研發(fā)部'},
    {'姓名': '李四', '部門(mén)': '財(cái)務(wù)部'},
    {'姓名': '趙六', '部門(mén)': '市場(chǎng)部'},
])
student_df = pd.DataFrame([
    {'姓名': '張三', '專業(yè)': '計(jì)算機(jī)'},
    {'姓名': '李四', '專業(yè)': '會(huì)計(jì)'},
    {'姓名': '王五', '專業(yè)': '市場(chǎng)營(yíng)銷(xiāo)'},
])

print(staff_df)
print()
print(student_df)

print('===============數(shù)據(jù)合并有NAN==================')
# Outer join keeps every key from both sides; unmatched cells become NaN.
print(pd.merge(staff_df, student_df, how='outer', on='姓名'))

print('===============數(shù)據(jù)合并無(wú)NAN==================')
# Inner join keeps only the keys present on both sides.
print(pd.merge(staff_df, student_df, how='inner', on='姓名'))

# Section 13: binning
# Section 13 example: bin numeric values with pd.cut.
import pandas as pd

# Age data.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Bin edges.
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(cats)

print('================獲取分箱編碼================')
# Integer code of the bin each value fell into.
print(cats.codes)

print('===========統(tǒng)計(jì)箱中元素的個(gè)數(shù)=============')
# Count the elements per bin. The top-level pd.value_counts() is deprecated,
# so go through a Series instead.
print(pd.Series(cats).value_counts())

print('===========帶標(biāo)簽的分箱=============')
# Binning with human-readable labels, one per bin.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats = pd.cut(ages, bins, labels=group_names)
print(cats)

# Section 14: plotting
# Section 14 example: the built-in DataFrame plotting helpers.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Fixed seed so the random walks are reproducible.
np.random.seed(100)
df = pd.DataFrame(
    {
        'A': np.random.randn(365).cumsum(0),
        'B': np.random.randn(365).cumsum(0) + 20,
        'C': np.random.randn(365).cumsum(0) - 20,
    },
    index=pd.date_range('2017/1/1', periods=365),
)
print(df.head())

# Line plot of every column.
df.plot()
plt.show()

# Scatter plot of column A against column B.
df.plot(x='A', y='B', kind='scatter')
plt.show()

# Colour (c) and marker size (s) are both driven by column 'B'.
ax = df.plot(x='A', y='B', kind='scatter', c='B', s=df['B'], colormap='viridis')
# Use the same scale on both axes.
ax.set_aspect('equal')
plt.show()

df.plot(kind='box')
plt.show()

# Histogram and KDE are drawn as two separate figures, shown together.
df.plot(kind='hist', alpha=0.7)
df.plot(kind='kde')
plt.show()

# Section 15: groupby
# Section 15 example (part 1): group-wise means.
import pandas as pd

df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'c'],
    'key2': ['one', 'two', 'one', 'two'],
    'data1': [1, 2, 3, 4],
    'data2': [2, 3, 4, 5],
})
print(df)

print('====================')
# Group a single column by another column.
grouped = df['data1'].groupby(df['key1'])
print(grouped.mean())

print('====================')
# Group by two keys at once.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
print(means)

print('====================')
# numeric_only=True skips the string column 'key2'; without it recent pandas
# versions raise because a mean of strings is undefined.
print(df.groupby('key1').mean(numeric_only=True))

# Section 15 example (part 2): filter, derive a column, aggregate and plot.
import matplotlib.pyplot as plt
import seaborn as sns

countries = ['Germany', 'UK', 'CH', 'JP', 'Switzerland']
data = pd.DataFrame({
    'InvoiceNo': ['c12', '24', '34', '3', '4', '5', '6'],
    'price': [2, 1, 1, 2, 3, 4, 3],
    'quantity': [3, 2, 2, 1, 4, 5, 4],
    'country': ['UK', 'UK', 'UK', 'UK', 'CH', 'JP', 'CH'],
})
print(data)

# Keep only the countries of interest.
data = data[data['country'].isin(countries)].copy()
# An invoice number starting with 'c' marks a cancelled transaction.
cond1 = ~data['InvoiceNo'].str.startswith('c')
cond2 = data['country'] != 'UK'
data2 = data[cond1 & cond2].copy()
print('===============================================')
print(data2)

data2['total_cost'] = data2['price'] * data2['quantity']
print(data2)

print('===============================================')
cost_per_country = data2.groupby('country')['total_cost'].sum()
print(cost_per_country)
print('===============================================')
print(cost_per_country.to_frame())

# Visualise the result: transposing gives one column (one bar) per country.
sns.barplot(data=cost_per_country.to_frame().T)
# Alternative: cost_per_country.sort_values(ascending=False).plot(kind='bar')
plt.xticks(rotation=90)
plt.xlabel('Country')
plt.ylabel('costs')
plt.tight_layout()
plt.show()

# Section 16: apply, used for per-column min-max normalisation
# Section 16 example (part 1): min-max normalise every column with apply.
import pandas as pd

a = pd.Series({'v1': 2, 'v2': 3})
b = pd.Series({'v1': 5, 'v2': 10})
c = pd.Series({'v1': 4, 'v2': 6})
# Named `all_df` rather than `all` so the builtin all() is not shadowed.
all_df = pd.DataFrame([a, b, c])


def scale_minmax(col):
    """Rescale *col* linearly so its minimum maps to 0 and its maximum to 1."""
    return (col - col.min()) / (col.max() - col.min())


print('================')
print(all_df)
# axis=0 passes each column to scale_minmax in turn.
all_df = all_df.apply(scale_minmax, axis=0)
print('================')
print(all_df)

# Section 16 example (part 2): turn a space-separated string column into
# lists of floats.
Img1 = pd.Series({'ID': '1.jpg', 'Detection': '311 707 472 842'})
Img2 = pd.Series({'ID': '2.jpg', 'Detection': '311 707 472 842'})
Img3 = pd.Series({'ID': '3.jpg', 'Detection': '311 707 472 842'})
df = pd.DataFrame([Img1, Img2, Img3])
print('========================')
print(df)
print(df.iloc[:, 0])

print('=========================')


def pre_data(df):
    """Replace each 'Detection' string with a list of floats, in place.

    The column is addressed by name: positional access (`x[0]`) picks the
    'ID' column with the column order built above, and float('1.jpg') fails.
    """
    df['Detection'] = df['Detection'].map(
        lambda text: [float(part) for part in text.split(' ')]
    )


pre_data(df)
print(df)

# The same conversion on a single string.
a = '1 2 3 4'
print([float(i) for i in a.split(' ')])

# Section 17: map, useful for building categorical features
示例1:
# Example 1: translate the values of a Series through a lookup dict.
import pandas as pd

x = pd.Series(['A', 'B', 'C'], index=['one', 'two', 'three'])
y = {'A': 1, 'B': 2, 'C': 3}
# map() replaces each value with y[value]; the index is kept.
z = x.map(y)
print(x)
print(z)

# Example 2:
# Binarise the probability into a 'res' column: below 0.2 -> 0, otherwise 1.
# NOTE(review): `df_yj` is not defined in this snippet; it is presumably a
# DataFrame with a 'probability' column loaded earlier - confirm.
df_yj['res'] = df_yj['probability'].map(lambda x: 0 if x < 0.2 else 1)

# Write the result file so that it can be submitted directly.
df_yj.to_csv("../submit/LF2551924C021_1007_result_yj_0_1.csv", index=False)

# Section 18: writing a CSV, method 1
# Writing a CSV, method 1: build the frame from a dict of arrays.
import numpy as np  # the snippet uses np.array, so numpy must be imported
import pandas as pd

c = {}
a = np.array([1])
b = np.array(['1 2 3 4'])
c['ID'] = a
c['Detection'] = b
a_df = pd.DataFrame(c)
# `columns` fixes which columns are written and in what order.
a_df.to_csv('test16.csv', index=False, columns=['ID', 'Detection'])

# Writing a CSV, method 2
# Writing a CSV, method 2: stack 1-D arrays as columns, name them on write.
import numpy as np  # the snippet uses np.array / np.hstack
import pandas as pd

a = np.array([1, 2, 3, 4])
b = np.array([3, 4, 5, 6])
# reshape(-1, 1) turns each vector into a column before stacking side by side.
a_df = pd.DataFrame(np.hstack([a.reshape(-1, 1), b.reshape(-1, 1)]))
# The frame has default integer column labels; `header` supplies the names
# written to the file.
a_df.to_csv('1.csv', index=False, header=['a', 'b'])

# Writing a CSV, method 3
# Writing a CSV, method 3: encode string labels as integers with map().
import pandas as pd

label_warp = {'normal': 0, 'defect': 1}
img_path = ['a', 'b', 'c']
label = ['normal', 'defect', 'normal']
label_file = pd.DataFrame({'img_path': img_path, 'label': label})
print(label_file)

# Note: this rebinds `label_file` to the encoded 'label' Series only.
label_file = label_file['label'].map(label_warp)
print(label_file)

# Writing an Excel file
# Write a frame to an Excel sheet without index or header row.
# NOTE(review): `res` is not defined in this snippet - confirm where it is
# built. Recent pandas versions can no longer write the legacy '.xls'
# format; '.xlsx' may be required - verify against the installed version.
df = pd.DataFrame(res)
df.to_excel('./yunjiang_test3.xls', index=False, header=None)

# Section 19: adding a header to a CSV that has none. Note: when reading a
# file without a header row, header must be set to None.
# Section 19 example: read a header-less CSV and write it back with a header.
import pandas as pd

csv_path = './train_only.csv'
# NOTE: header=None, otherwise the first data row would be used as the header.
df = pd.read_csv(csv_path, header=None)
print(df.shape)
df_value = df.values

# Rebuild the frame with explicit column names and save it.
df = pd.DataFrame(
    df_value,
    columns=['name', 'xmin', 'ymin', 'xmax', 'ymax', 'class'],
)
df.to_csv('train_xml.csv', index=False)

# Section 20: loc, iloc, ix. loc indexes rows by label, iloc indexes rows by
# integer position, ix indexed by either (a mix of loc and iloc).
# Section 20 example: label-based (loc) and position-based (iloc) indexing.
import pandas as pd

data = [[1, 2, 3], [4, 5, 6]]
index = ['a', 'b']  # row labels
columns = ['c', 'd', 'e']  # column labels
df = pd.DataFrame(data, index=index, columns=columns)  # build the frame
print(df)

print('===============')
# loc: select a row by its label.
print(df.loc['a'])

# iloc: select a row by its integer position.
print('=================')
print(df.iloc[0])

# ix accepted either a label or a position, but it has been removed from
# pandas; use iloc for positions and loc for labels instead.
print('=================')
print(df.iloc[0])
print(df.loc['a'])

print('=================')
# Column selection: by label with loc, by position with iloc.
print(df.loc[:, ['c']])
print(df.iloc[:, [0]])

# Section 21: value_counts()
可以用来统计每一类的个数
# Section 21 example: count the samples per class with value_counts().
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import pandas as pd  # the snippet uses pd.DataFrame, so pandas must be imported

# Load the data.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
print('df.shape=', df.shape)
df['label'] = iris.target
# Number of rows for each distinct label.
print(df['label'].value_counts())

# Section 22: reading the name column of a CSV with pandas
# Section 22 example: two ways to pull the first column of a header-less CSV
# into a NumPy array.
import numpy as np
import pandas as pd

# Way 1: convert the whole frame to an array, then slice column 0.
names = np.array(pd.read_csv('./hunhe.csv', header=None))[:, 0]
print(names)

# Way 2: slice column 0 from .values, then flatten to one dimension.
name_df = np.array(pd.read_csv('./hunhe.csv', header=None).values[:, 0]).reshape(-1)
print(name_df)

# Section 23: pd.concat
# Section 23 example: stack two frames vertically with pd.concat.
import pandas as pd

df1 = pd.DataFrame([['a', 1], ['b', 2]], columns=['letter', 'number'])
print(df1)
df2 = pd.DataFrame([['c', 1], ['d', 2]], columns=['letter', 'number'])
print(df2)

# Rows of df2 are appended below df1; each frame keeps its own index labels,
# so the result index repeats 0, 1.
df = pd.concat([df1, df2])
print(df)

# Section 24: Categorical
# Section 24 example: inspect the categories and codes of a Categorical.
import pandas as pd

my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
print('=====================')
print(my_categories)

# The distinct category labels.
print('======================')
print(my_categories.categories)

# The integer code of each element (its position within `categories`).
print('======================')
print(my_categories.codes)

# Section 25: speeding up pandas with modin
pip install modin[ray]
# Section 25 example: time the same CSV load with pandas and with modin.
import time  # the snippet calls time.time(), so `time` must be imported

### Read in the data with Pandas
import pandas as pd

s = time.time()
df = pd.read_csv("esea_master_dmg_demos.part1.csv")
e = time.time()
print("Pandas Loading Time = {}".format(e-s))

### Read in the data with Modin
# Rebinding `pd` to modin.pandas makes the identical call below run on modin.
import modin.pandas as pd

s = time.time()
df = pd.read_csv("esea_master_dmg_demos.part1.csv")
e = time.time()
print("Modin Loading Time = {}".format(e-s))

# Section 26: extracting the rows that belong to each category value of a
# CSV column
import numpy as np import pandas as pddef gini(nums):probs = [nums.count(i)/len(nums) for i in set(nums)]gini = sum([p*(1-p) for p in probs])return ginidef split_dataframe(data, col):'''function: split pandas dataframe to sub-df based on data and column.input: dataframe, column name.output: a dict of splited dataframe.'''# unique value of columnunique_values = data[col].unique()# print('==unique_values:', unique_values)# empty dict of dataframeresult_dict = {elem: pd.DataFrame for elem in unique_values}# split dataframe based on column valuefor key in result_dict.keys():result_dict[key] = data[:][data[col] == key]return result_dictdef test_split_dataframe():df = pd.read_csv('./example_data.csv', dtype={'windy': 'str'})res = split_dataframe(df, 'temp')print('=res:', res.keys())print("=====res['mild']:\n", res['mild']) if __name__ == '__main__':test_split_dataframe()excel數(shù)據(jù)(注意excel數(shù)據(jù)排版沒(méi)對(duì)齊):?
humility outlook temp windy play
high sunny hot FALSE no
high sunny hot TRUE no
high overcast hot FALSE yes
high rainy mild FALSE yes
normal rainy cool FALSE yes
normal rainy cool TRUE no
normal overcast cool TRUE yes
high sunny mild FALSE no
normal sunny cool FALSE yes
normal rainy mild FALSE yes
normal sunny mild TRUE yes
high overcast mild TRUE yes
normal overcast hot FALSE yes
high rainy mild TRUE no

输出结果:
总结
以上是生活随笔为你收集整理的pandas基础知识的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: SIFT算法中概念简单解释
- 下一篇: CSAPP--信息的表示与处理