生活随笔
收集整理的這篇文章主要介紹了
今日代码(200624)--缺失值处理
小編覺得挺不錯(cuò)的,現(xiàn)在分享給大家,幫大家做個(gè)參考.
代碼記錄
缺失值處理
某個(gè)比賽中數(shù)據(jù)的缺失值處理,但是缺的很有規(guī)則,填補(bǔ)起來很有邏輯,比較清爽。
#導(dǎo)包
library(VIM)
library(psych)
library(lattice)
library(mice)
library(MASS)#讀取數(shù)據(jù)
getwd()
setwd("C:/Users/goatbishop/Desktop/data")
car_srv_train <- read.csv("car_srv_train.csv", header = T, stringsAsFactors = F)
car_info_train <- read.csv("car_info_train.csv", header = T, stringsAsFactors = F)#簡單查看數(shù)據(jù)
head(car_srv_train)
head(car_info_train)
dim(car_srv_train)
dim(car_info_train)#合并數(shù)據(jù)
intersect(names(car_srv_train), names(car_info_train))new_car <- merge(car_srv_train, car_info_train, "CUST_ID")
new_car2 <- merge(car_srv_train, car_info_train, "CUST_ID", all.y = T) #與all = T的合并結(jié)果相同dim(new_car)
dim(new_car2)#根據(jù)觀察,有些客戶沒有回廠,我們把回廠次數(shù)以及回廠支出總費(fèi)用加入到判斷是否會(huì)流失的指標(biāo)中
backFactoryFreq <- table(car_srv_train$CUST_ID)
length(backFactoryFreq)
backFactoryDf <- as.data.frame(backFactoryFreq)
colnames(backFactoryDf) <- c("CUST_ID", "Freq")backFactoryCost <- tapply(car_srv_train$ACTUAL_AMOUNT, car_srv_train$CUST_ID, sum)
dim(backFactoryCost)
class(backFactoryCost)
backFactoryDf2 <- as.data.frame(backFactoryCost)
backFactoryDf2$CUST_ID <- row.names(backFactoryDf2)backFactoryDf <- merge(backFactoryDf, backFactoryDf2, "CUST_ID",all = T)
dim(backFactoryDf)new_car_info_train <- merge(car_info_train, backFactoryDf, "CUST_ID", all = T)#數(shù)據(jù)預(yù)處理str(new_car_info_train)
summary(new_car_info_train)
head(new_car_info_train)
#性別設(shè)為factor(無缺失值)
new_car_info_train$CUST_SEX <- factor(new_car_info_train$CUST_SEX)
#年齡中有475個(gè)缺失值(占比較小,可以考慮全部刪掉,也可考慮填補(bǔ)等等,待定)#婚姻狀況(缺失值較多為39038且已婚人群占所能調(diào)查到的大多數(shù),未婚占比非常小)
#且最高頻數(shù)和次高頻數(shù)的比值高達(dá)93,考慮刪除該變量
head(new_car_info_train$CUST_MARRY)
length(new_car_info_train$CUST_MARRY[which(new_car_info_train$CUST_MARRY == "")])
new_car_info_train$CUST_MARRY[which(new_car_info_train$CUST_MARRY == "")] <- NA
new_car_info_train$CUST_MARRY <- factor(new_car_info_train$CUST_MARRY)#車主性質(zhì)設(shè)為factor
new_car_info_train$BUYERPART <- factor(new_car_info_train$BUYERPART)#車型代碼設(shè)為factor
new_car_info_train$CAR_MODEL <- factor(new_car_info_train$CAR_MODEL)
table(new_car_info_train$CAR_MODEL)#車型顏色先把""空串設(shè)置為NA
#有21312個(gè)缺失值,好吧
head(new_car_info_train$CAR_COLOR)
length(new_car_info_train$CAR_COLOR[which(new_car_info_train$CAR_COLOR == "")]) #21312
new_car_info_train$CAR_COLOR[which(new_car_info_train$CAR_COLOR == "")] <- NA
new_car_info_train$CAR_COLOR <- factor(new_car_info_train$CAR_COLOR)#是否貸款買車設(shè)為factor
new_car_info_train$IS_LOAN <- factor(new_car_info_train$IS_LOAN)#貸款期限存在缺失值,5607
new_car_info_train$LOAN_PERIED <- factor(new_car_info_train$LOAN_PERIED)
#我們看到貸款金額的缺失值和貸款期限的缺失值一樣多,都為5607,
#所以,是否有由于客戶并沒有貸款,所以沒有填此項(xiàng)的可能
#也就是說是由于變量自身原因,而不是缺失值在樣本中隨機(jī)分布的原因
#我們看到IS_LOAD變量值為0的樣品有5607個(gè)和缺失值數(shù)目一樣,這證明了我們的猜想
#我們對(duì)其進(jìn)行人為填補(bǔ),設(shè)置LOAN_PERIED種類為0,LOAN_AMOUNT金額為0
#https://stackoverflow.com/questions/8229904/r-concatenating-two-factors
temp <- as.character(new_car_info_train$LOAN_PERIED)
temp[is.na(temp)] <- "0"
new_car_info_train$LOAN_PERIED <- factor(temp)
new_car_info_train$LOAN_AMOUNT[is.na(new_car_info_train$LOAN_AMOUNT)] <- 0#新車投保是否在4s店設(shè)為factor,缺失值為8151
new_car_info_train$F_INSORNOT <- factor(new_car_info_train$F_INSORNOT)#購買4種保險(xiǎn)的缺失值一樣多,這可能由于同一個(gè)客戶4項(xiàng)都沒有填寫,未填寫原因不明#是否流失設(shè)為factor,無缺失值
new_car_info_train$IS_LOST <- factor(new_car_info_train$IS_LOST)#因?yàn)槲捶祻S的客戶,4S店沒有記錄,所以對(duì)于返廠頻率和總花費(fèi)的缺失值我們均設(shè)置為0
new_car_info_train$Freq[is.na(new_car_info_train$Freq)] <- 0
new_car_info_train$backFactoryCost[is.na(new_car_info_train$backFactoryCost)] <- 0#繪制缺失值圖
aggr(new_car_info_train, prop = F, numbers = T)#通過繪制缺失值圖觀察到,對(duì)于購買4項(xiàng)保險(xiǎn)缺失的觀測,新車投保是否在4s店變量也存在缺失
#且,新車投保是否在4s店沒有缺失的變量全部都是1,也就是說,一部分缺失的原因,可能是由于
#沒有在4S店投保,因此后面的4項(xiàng)保險(xiǎn)也沒有寫
#通過ALL_BUYINS_N變量中,沒有缺失值的部分全都投保,我們可以推測出來
#對(duì)于這類我們?nèi)吭O(shè)置其是否在4S店投保為0,4項(xiàng)的次數(shù)也都設(shè)施為0
#而對(duì)于在4S店購買保險(xiǎn)總次數(shù)>0,或者購買4S店專修險(xiǎn)的次數(shù)>0的觀測,我們設(shè)置
#其是否在4S店投保為1temp <- as.character(new_car_info_train$F_INSORNOT)
temp[is.na(new_car_info_train$ALL_BUYINS_N)] <- "0"
temp_ALL <- new_car_info_train$ALL_BUYINS_N
temp_DLRSI <- new_car_info_train$DLRSI_CNTfor (i in c(1:length(temp))) {if (is.na(temp[i])) {if (temp_ALL[i] > 0 | temp_DLRSI[i] > 0) {temp[i] <- "1"}}
}new_car_info_train$F_INSORNOT <- factor(temp)
#F_INSORNOT此時(shí)無缺失值new_car_info_train$ALL_BUYINS_N[is.na(new_car_info_train$ALL_BUYINS_N)] <- 0
new_car_info_train$DLRSI_CNT[is.na(new_car_info_train$DLRSI_CNT)] <- 0
new_car_info_train$GLASSBUYSEPARATE_CNT[is.na(new_car_info_train$GLASSBUYSEPARATE_CNT)] <- 0
new_car_info_train$SII_CNT[is.na(new_car_info_train$SII_CNT)] <- 0#刪除變量,刪除用戶ID和婚否
new_car_info_train2 <- new_car_info_train[, -c(1, 4)]#繪制缺失值圖
aggr(new_car_info_train2, prop = F, numbers = T)summary(new_car_info_train2)
dim(new_car_info_train2)
#我們刪除缺失的年齡觀測
new_car_info_train2 <- new_car_info_train2[!is.na(new_car_info_train2$CUST_AGE), ]
dim(new_car_info_train2)table(new_car_info_train$IS_LOST) #流失占比0.2293882
table(new_car_info_train2$IS_LOST)#流失占比0.2289921
#基本沒有什么變動(dòng),表明刪除的一些年齡觀測對(duì)建模沒有顯著影響#對(duì)IS_LOST與CAR_COLOR變量進(jìn)行列聯(lián)表檢驗(yàn)
testDf2 <- new_car_info_train2[!is.na(new_car_info_train2$CAR_COLOR), c("CAR_COLOR", "IS_LOST")]
chisq.test(testDf2$CAR_COLOR, testDf2$IS_LOST)
table(testDf2)
#雖然列聯(lián)表檢驗(yàn)拒絕兩者相互獨(dú)立的原假設(shè),但是,這可能是由于顏色因子的水平過多
#從常理上來說顏色和流失沒有太大關(guān)系,我們先將其刪除(強(qiáng)行解釋)
#之后可以嘗試用加入顏色變量進(jìn)行建模
new_car_info_train3 <- new_car_info_train2[, -5]#繪制缺失值圖
aggr(new_car_info_train3, prop = F, numbers = T)summary(new_car_info_train3)#目前已經(jīng)沒有缺失值了##Logistic回歸new_car_info_train3$IS_LOST <- as.character(new_car_info_train3$IS_LOST)
table(new_car_info_train3$IS_LOST)
new_car_info_train3$IS_LOST <- new_car_info_train3$IS_LOST == 1
#換成TRUE或者FALSE
#new_car_info_train3$IS_LOST <- factor(new_car_info_train3$IS_LOST, levels = c(0, 1), labels = c("NO", "Yes"))lm1 <- glm(IS_LOST ~ ., data = new_car_info_train3, family = binomial())
summary(lm1) #45851
#利用AIC準(zhǔn)則進(jìn)行逐步回歸
stepAIC(lm1)#雖然也好像AIC也沒減少多少(45851),但是,還是利用逐步回歸后的模型
lm2 <- glm(IS_LOST ~ CUST_AGE + BUYERPART + CAR_MODEL + CAR_AGE + CAR_PRICE + LOAN_PERIED + F_INSORNOT + ALL_BUYINS_N + GLASSBUYSEPARATE_CNT + Freq, data = new_car_info_train3, family = binomial())summary(lm2)predCar <- predict(lm2, type = "response")
summary(predCar)
#我們將數(shù)據(jù)分為5個(gè)等級(jí)其中前兩個(gè)等級(jí)極有可能流失他的概率為80~100%, 50 ~80%
#其余3個(gè)等級(jí)流失危險(xiǎn)度逐漸降低為0~10%, 10~30%, 30~50%temp <- predCarfor (i in c(1:length(predCar))) {num = temp[i]if (num > 0.8) {temp[i] <- 5} else if (num <= 0.8 & num > 0.5) {temp[i] <- 4}else if (num <= 0.5 & num > 0.3) {temp[i] <- 3} else if (num <= 0.3 & num > 0.1) {temp[i] <- 2} else {temp[i] <- 1}
}table(temp)
new_car_info_train3$prob <- factor(temp, levels = c(1, 2, 3, 4, 5), ordered = T)
summary(new_car_info_train3)
write.csv(new_car_info_train3, "new_car_info_train3_0624.csv")#訓(xùn)練集正確率計(jì)算
temp <- ifelse(predCar > 0.5, 1, 0)
table(temp)
sum(temp == new_car_info_train3$IS_LOST)/length(temp)
#預(yù)測正確率78.3%有待改進(jìn)
還不錯(cuò)吧。
總結(jié)
以上是生活随笔為你收集整理的今日代码(200624)--缺失值处理的全部內(nèi)容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網(wǎng)站內(nèi)容還不錯(cuò),歡迎將生活随笔推薦給好友。