import scipy.stats as st
import pandas as pd
import regex as re
# Disable truncation when DataFrames are printed: show every column and row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Absolute path of the input CSV that the rest of the script works on.
path1 = "/home/kesci/input/liver_df9751/結構化數據訓練營.csv"
# NOTE(review): commented-out code here used to reference a companion
# test-set file ("結構化數據訓練營測試集.csv"); it is never loaded by this
# script — confirm whether it is still needed.
data = pd.read_csv(path1)
# Clean up the raw CSV headers.
# For each header, in priority order:
#   1. if it contains a Unicode line separator (U+2028) followed by text,
#      keep only the non-newline text that follows it;
#   2. otherwise, if it contains a newline followed by text, keep only the
#      non-newline text that follows it;
#   3. otherwise keep the header unchanged.
col_names = list(data.columns)
col = []
for raw_name in col_names:
    for pattern in (r"\u2028(.+)", r"\n(.+)"):
        hit = re.search(pattern, raw_name)
        if hit is not None:
            # Group 1 of the leftmost match is the text after the separator.
            col.append(hit.group(1))
            break
    else:
        # Neither separator matched: the header is already clean.
        col.append(raw_name)

# Rename the DataFrame columns to the cleaned names.
data.columns = col
# Columns whose missing values are replaced by the column median
# ('體重' = body weight, '年齡' = age, plus 'ALF').
# NOTE(review): 'ALF' is later passed to KNeighborsClassifier.fit as the
# target; median-imputing a label may produce a value that is not one of the
# existing classes — confirm this is intended.
feature1 = ['體重', '年齡', 'ALF']
for column in feature1:
    series = data[column]
    median_value = series.median()
    data[column] = series.fillna(median_value)
    # Dump the filled column so the result can be inspected by eye.
    print(data[column].values)
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix: one [體重, 年齡] (body weight, age) row per record, as a
# plain list of two-element lists.
# The rows are taken positionally in one step. The previous per-cell lookup
# `data[col][i]` treated the loop counter as an index *label*, which only
# works while the DataFrame still has its default 0..n-1 index, and it paid
# for a Python-level Series lookup on every cell.
a_zi = data[['體重', '年齡']].values.tolist()

# 5-nearest-neighbour classifier predicting 'ALF' from the two features.
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(a_zi, data['ALF'])
# Count how many rows of a_zi the fitted classifier labels correctly.
# a_zi is the same data the classifier was fitted on, so this is a
# training-set score rather than a held-out estimate.
# All rows are predicted in a single call instead of one predict() call per
# row, and the true labels are compared positionally so the result does not
# depend on the DataFrame's index labels.
predicted = neigh.predict(a_zi)
actual = data['ALF'].values
cnt = int((predicted == actual).sum())

# Output: total rows, correct predictions, incorrect predictions.
print(len(a_zi), cnt, len(a_zi) - cnt)