當前位置：首頁 > 人文社科 > 生活经验 >内容正文

生活经验

数据挖掘-matplotlib、numpy、pandas（二）

發布時間：2023/11/27 生活经验 40 豆豆

生活随笔收集整理的這篇文章主要介紹了数据挖掘-matplotlib、numpy、pandas（二），小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

文章目錄

Numpy
Pandas

Numpy

# Numerical computing library
import numpy as np

score = np.array([
    [80, 81, 82, 83, 84],
    [78, 96, 95, 94, 93],
    [85, 83, 86, 81, 89],
    [78, 75, 71, 73, 76],
    [96, 94, 93, 97, 91],
])
print(score.size)
print(type(score))

# ndarray attributes:
#   shape, ndim, size,
#   dtype, itemsize
# If no type is given when an ndarray is created, the defaults are
#   integers: int64 (the platform's default integer type)
#   floats:   float64

# Basic operations
# 1. Ways of creating arrays

# Arrays of zeros and ones (the shape may be a list or a tuple)
print(np.zeros([5, 4]))
print(np.ones((5, 4)))
# [[0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]]
# [[1. 1. 1. 1.]
#  [1. 1. 1. 1.]
#  [1. 1. 1. 1.]
#  [1. 1. 1. 1.]
#  [1. 1. 1. 1.]]

# Create arrays from an existing array
data1 = np.array(score)
data2 = np.copy(score)
print(data2)

# Create an array over a fixed range
x = np.linspace(0, 10, 11)
print(x)
# [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]

# Create a random array
x1 = np.random.uniform(-1, 1, 100000)
print(x1)
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 8), dpi=80)
# Histogram of x1 using 1000 bins
plt.hist(x1, 1000)
# Normal distribution (mean 1.75, standard deviation 0.1, 100000 samples).
# x2 must be defined before it is plotted; in the scraped text this
# assignment had been swallowed by the preceding comment.
x2 = np.random.normal(1.75, 0.1, 100000)
plt.hist(x2, 1000)
plt.savefig("shujutupian9.png")
plt.show()

# Example: randomly generate the daily changes of 8 stocks
# over 2 weeks of trading days
import numpy as np

x = np.random.normal(loc=0, scale=1, size=(8, 10))
x1 = x[0, 0:3]  # row 0, the three values in columns 0, 1 and 2
print(x)
print(x1)
print(x.shape)  # (8, 10)

# Changing the shape
#   ndarray.reshape(shape)  returns a new ndarray; the original is not changed
#   ndarray.resize(shape)   returns nothing; changes the original ndarray
#   ndarray.T               transpose: rows become columns and vice versa
# x2=x.resize((10,8))#None
x2 = x.T
print(x2.shape)  # (10, 8)
# x2.astype(bytes)
# The original printed the bound method `x2.tostring` without calling it;
# tobytes() is the current name of that (deprecated) method.
print(x2.tobytes())

# Changing the type
#   ndarray.astype(type)
#   serialising an ndarray for local storage

# Logical operations
x3 = x > 0.5
print(x3)
# [[ True  True False False False False False  True False False]
#  [False False False False False False False False False  True]
#  [False False False False False False False  True False  True]
#  [False False  True  True False False  True False False False]
#  [ True False  True False  True False False False  True False]
#  [ True False False False False  True False False False False]
#  [False False False False  True False  True False False False]
#  [False False False  True False  True False False False False]]
print(x[x > 0.7])
# Returns every value greater than 0.7
# [1.05730176 1.9653545  0.96310443 0.79025379 1.20871883 1.13577481
#  1.27004206 0.70143188 0.84627271 1.18769169 2.53638775 0.74574605
#  0.96702949 0.90181432 1.78217411]
x[x > 0.7] = 1.1  # replace every value greater than 0.7 with 1.1
print(x[x > 0.7])
# [1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1
#  1.1 1.1 1.1 1.1 1.1 1.1 1.1]

# General predicate functions
#   np.all()  behaves like &&
#   np.any()  behaves like ||
print(np.all(x[0:2, 0:5] > 0))  # False
print(np.any(x[0:2, 0:5] > 0))  # True

# Ternary operator
#   np.where
print(np.where(x[:4, :4] > 0, 2, -2))
# 2 where the value is greater than 0, otherwise -2
# [[ 2 -2 -2 -2]
#  [-2 -2  2  2]
#  [-2  2 -2 -2]
#  [-2 -2 -2  2]]
print(np.where(np.logical_or(x[:4, :4] > 0, x[:4, :4] < -1), 2, 1))
print(np.where(np.logical_and(x[:4, :4] > 0, x[:4, :4] < 2), 2, 1))
# [[[2 2 2 1][2 1 1 2][2 2 2 1][1 1 2 2]][[2 2 1 2][1 2 2 1][2 1 2 2][2 2 1 1]]]

# Statistics
#   Statistical functions:
#   max, min, mean, median, var, std
#   np.argmax  position of the maximum
#   np.argmin  position of the minimum
print(np.max(x, axis=0))  # maximum of each column
print(np.max(x, axis=1))  # maximum of each row
# [0.64386042 1.1        1.1        1.1        1.1        1.1
#  1.1        1.1        0.41923517 1.1       ]
# [1.1        1.1        1.1        1.1        1.1        0.55096057
#  1.1        1.1       ]

# Concatenating arrays
a = x[:2, :4]
b = x[4:6, :4]
# hstack takes ONE sequence of arrays; np.hstack(a, b) raises TypeError.
print(np.hstack((a, b)))

# Arithmetic between arrays
import numpy as np

x = np.array([[1, 2, 3, 4, 5, 6], [6, 5, 4, 9, 8, 7]])
# Array and scalar
print(x + 1)
# [[ 2  3  4  5  6  7]
#  [ 7  6  5 10  9  8]]

# Array and array
# Broadcasting
#   ...n-th dim * ... * 2nd dim * 1st dim (dimensions are matched from right to left)
#   Each dimension must have the same size in both arrays, or one of them must be 1.
#   An array with fewer dimensions is padded with leading 1s.
#   e.g.  9*7*1*5  with  7*5*5

# Matrix operations
#   matrix: a two-dimensional array
# Store the matrix in an ndarray
data = np.array([[80, 86], [86, 79], [79, 98], [85, 97], [65, 94], [90, 85]])
# np.mat was removed in NumPy 2.0; np.asmatrix is its replacement.
data_mat = np.asmatrix([[80, 86], [86, 79], [79, 98], [85, 97], [65, 94], [90, 85]])
print(type(data_mat))  # <class 'numpy.matrix'>
print(type(data))  # <class 'numpy.ndarray'>
x2 = np.asmatrix([[0.3], [0.7]])
# x2 is a matrix, so `*` here is a matrix product, like matmul and dot below.
print(data * x2)
print(np.matmul(data, x2))
print(np.dot(data, x2))
# [[84.2]
#  [81.1]
#  [92.3]
#  [93.4]
#  [85.3]
#  [86.5]]

# Merging
# Concatenating arrays
x = np.random.normal(loc=0, scale=1, size=(8, 10))
print(x)
a = x[:2, :4]
b = x[4:6, :4]
# np.hstack(a,b)
print(np.hstack((a, b)))  # join column-wise
# [[ 1.98184198 -0.34973153  1.13257694  1.41434373 -0.46409822 -1.8277629
#    1.67666373 -0.33325928]
#  [ 0.01002333  0.68042899 -1.07555942  0.35112946  1.12268772  0.0929585
#    0.28756724  0.78774338]]
print(np.concatenate((a, b), axis=1))
# [[ 1.98184198 -0.34973153  1.13257694  1.41434373 -0.46409822 -1.8277629
#    1.67666373 -0.33325928]
#  [ 0.01002333  0.68042899 -1.07555942  0.35112946  1.12268772  0.0929585
#    0.28756724  0.78774338]]
print(np.concatenate((a, b), axis=0))  # join row-wise
# [[ 1.98184198 -0.34973153  1.13257694  1.41434373]
#  [ 0.01002333  0.68042899 -1.07555942  0.35112946]
#  [-0.46409822 -1.8277629   1.67666373 -0.33325928]
#  [ 1.12268772  0.0929585   0.28756724  0.78774338]]
print(np.vstack((a, b)))
# [[ 1.98184198 -0.34973153  1.13257694  1.41434373]
#  [ 0.01002333  0.68042899 -1.07555942  0.35112946]
#  [-0.46409822 -1.8277629   1.67666373 -0.33325928]
#  [ 1.12268772  0.0929585   0.28756724  0.78774338]]

Pandas

# Topics:
#   core data structures
#   basic operations
#   arithmetic
#   plotting
#   saving and loading files

# Pandas = panel + data + analysis
import numpy as np
import pandas as pd

x = np.random.normal(0, 1, (10, 5))
x2 = pd.DataFrame(x)
print(x2)

# Add a row index (stock names) and column labels (business days)
stock = ["股票{}".format(i) for i in range(10)]
date = pd.date_range(start="20210814", periods=5, freq="B")
x3 = pd.DataFrame(x, index=stock, columns=date)
print(x3)
#        2021-08-16  2021-08-17  2021-08-18  2021-08-19  2021-08-20
# 股票0   -0.269843    0.494288   -0.492346   -1.541906   -2.246752
# 股票1    0.571190   -0.248227   -1.274043    0.465642   -0.311196
# 股票2    0.405165    0.052852   -0.191542    0.393773    1.592916
# 股票3    2.066134    0.547554   -0.999797   -0.339771   -0.893654

# values        only the data
# T             transpose
# data.head(3)  only the first 3 rows
# data.tail(2)  only the last 2 rows

# DataFrame index settings
df = pd.DataFrame({'month': [1, 4, 7, 10], 'year': [2012, 2014, 2013, 2014], 'sale': [55, 40, 84, 31]})
print(df)
#    month  year  sale
# 0      1  2012    55
# 1      4  2014    40
# 2      7  2013    84
# 3     10  2014    31
x7 = df.set_index("month", drop=True)
print(df.set_index("month", drop=True))
#        year  sale
# month
# 1      2012    55
# 4      2014    40
# 7      2013    84
# 10     2014    31

# Using several columns as the index
print(df.set_index(["year", "month"]))
#             sale
# year month
# 2012 1        55
# 2014 4        40
# 2013 7        84
# 2014 10       31
x4 = df.set_index(["year", "month"])
print(df.set_index(["year", "month"]).index)  # several index columns are passed as a list
# MultiIndex([(2012,  1),
#             (2014,  4),
#             (2013,  7),
#             (2014, 10)],
#            names=['year', 'month'])

# MultiIndex and panel
print(x4.index.names)
# ['year', 'month']
print(x4.index.levels)
# [[2012, 2013, 2014], [1, 4, 7, 10]]

# Series: has a row index only
#   attributes: index, values
#   methods
x5 = pd.Series(np.arange(3, 10, 2), index=["a", "b", "c", 'd'])  # range with a step
print(x5)
# a    3
# b    5
# c    7
# d    9
# dtype: int32
x6 = pd.Series({'red': 100, 'blue': 200, 'green': 300, 'yellow': 400})
print(x6)
# Created from a dict
# red       100
# blue      200
# green     300
# yellow    400
# dtype: int64

# Basic data operations
#    month  year  sale
# 0      1  2012    55
# 1      4  2014    40
# 2      7  2013    84
# 3     10  2014    31
print(df["year"][1])  # with plain [] the column comes first, then the row
# 2014
print(df.loc[1]['year'])
# 2014
print(df.loc[1, "year"])
# 2014
print(df.iloc[1, 1])  # access purely by position
# 2014

# Assignment
# df.year=100

# Sorting
w1 = df.sort_values(by="year")
print(w1)
#    month  year  sale
# 0      1  2012    55
# 2      7  2013    84
# 1      4  2014    40
# 3     10  2014    31
w2 = df.sort_values(by="month", ascending=False)
print(w2)
# Sorted from largest to smallest
#    month  year  sale
# 3     10  2014    31
# 2      7  2013    84
# 1      4  2014    40
# 0      1  2012    55

# DataFrame operations
#     Arithmetic
print(df["year"] + 3)
# 0    2015
# 1    2017
# 2    2016
# 3    2017
# Name: year, dtype: int64
print((df - 10) > -2)  # 10 is subtracted from every value
#    month  year  sale
# 0  False  True  True
# 1  False  True  True
# 2  False  True  True
# 3   True  True  True
print((df["year"] > 2013) & (df["month"] < 7))
# 0    False
# 1     True
# 2    False
# 3    False
# dtype: bool
print(df[(df["year"] > 2013) & (df["month"] < 7)])
# Returns the rows that satisfy the condition
#    month  year  sale
# 1      4  2014    40

#     Logical functions
print(df.query("year>2013&month<7"))
# .query() selects the matching rows from a condition written as a string
#    month  year  sale
# 1      4  2014    40
print(df["year"].isin([2012, 2013]))
# 0     True
# 1    False
# 2     True
# 3    False
# Name: year, dtype: bool
print(df[df["year"].isin([2012, 2013])])  # the line above only tests each row; indexing df with that mask keeps the True rows and drops the False ones
#    month  year  sale
# 0      1  2012    55
# 2      7  2013    84

# Statistics and custom operations
print(df.describe())  # common summary statistics
#            month         year       sale
# count   4.000000     4.000000   4.000000
# mean    5.500000  2013.250000  52.500000
# std     3.872983     0.957427  23.216374
# min     1.000000  2012.000000  31.000000
# 25%     3.250000  2012.750000  37.750000
# 50%     5.500000  2013.500000  47.500000
# 75%     7.750000  2014.000000  62.250000
# max    10.000000  2014.000000  84.000000
print(df.max())
# month      10
# year     2014
# sale       84
# dtype: int64
# Same as the last row of describe() above
print(df.idxmax())  # index at which each of the maxima above occurs
# month    3
# year     1
# sale     2
# dtype: int64

# Cumulative statistics functions
# Running total of the "sale" column, taken in index order, then plotted.
# plot() returns the matplotlib Axes, and that object is what gets printed.
running_total = df["sale"].sort_index().cumsum()
axes = running_total.plot()
print(axes)
# AxesSubplot(0.125,0.11;0.775x0.77)

總結

以上是生活随笔為你收集整理的数据挖掘-matplotlib、numpy、pandas（二）的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：数据挖掘-matplotlib、nump
下一篇：概率论与数理统计--第三章