12 Working with missing values
12.1
## Some sample data
set.seed(0)
dat <- matrix(1:100, 10, 10)
dat[sample(1:100, 10)] <- NA
dat <- data.frame(dat)
12.2 Summaryize the missing values
12.2.1 Summarize the missing values using mice
package
The mice
package provides a nice function md.pattern() to get a better understanding of the pattern of missing data.
#install.packages("mice")
library(mice)
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
md.pattern(dat)
## X1 X5 X8 X2 X3 X4 X7 X10 X6 X9
## 4 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 0 0 2
## 1 1 1 1 1 1 1 1 0 0 1 2
## 1 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 0 0 1 1 1 1 2
## 1 1 1 1 0 1 1 1 1 1 0 2
## 0 0 0 1 1 1 1 1 2 3 10
12.2.2 Summarize the missing values using VIM
packages
#install.packages("VIM")
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
aggr(dat, numbers = TRUE, prop = c(TRUE, FALSE))
aggr(t(dat), numbers = TRUE, prop = c(TRUE, FALSE))
aggr_plot <- aggr(dat, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(dat), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## X9 0.3
## X6 0.2
## X2 0.1
## X3 0.1
## X4 0.1
## X7 0.1
## X10 0.1
## X1 0.0
## X5 0.0
## X8 0.0
tdat = t(dat)
aggr_plot <- aggr(tdat, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(tdat), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## V6 0.2
## V7 0.2
## V8 0.2
## V10 0.2
## V1 0.1
## V5 0.1
## V2 0.0
## V3 0.0
## V4 0.0
## V9 0.0
12.3 Delete columns/rows with more that x% missing
## Remove columns with more than 50% NA
dat_file = dat[, -which(colMeans(is.na(dat)) > 0.5)]
12.4 Imputing the data
12.4.1 Imputing the data with row-wise mean
dat.imp = dat
k <- which(is.na(dat.imp), arr.ind=TRUE)
dat.imp[k] <- rowMeans(dat.imp, na.rm=TRUE)[k[,1]]
12.4.2 Imputing the data with row-wise mean using mice
tempDat <- mice(as.matrix(dat), method = "mean")
##
## iter imp variable
## 1 1
## 1 2
## 1 3
## 1 4
## 1 5
## 2 1
## 2 2
## 2 3
## 2 4
## 2 5
## 3 1
## 3 2
## 3 3
## 3 4
## 3 5
## 4 1
## 4 2
## 4 3
## 4 4
## 4 5
## 5 1
## 5 2
## 5 3
## 5 4
## 5 5
## Warning: Number of logged events: 9
summary(tempDat)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## "" "" "" "" "" "" "" "" "" ""
## PredictorMatrix:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## X1 0 0 0 0 0 0 0 0 0 0
## X2 0 0 0 0 0 0 0 0 0 0
## X3 0 0 0 0 0 0 0 0 0 0
## X4 0 0 0 0 0 0 0 0 0 0
## X5 1 0 0 0 0 0 0 0 0 0
## X6 0 0 0 0 0 0 0 0 0 0
## Number of logged events: 9
## it im dep meth out
## 1 0 0 collinear X5
## 2 0 0 collinear X8
## 3 0 0 collinear X2
## 4 0 0 collinear X3
## 5 0 0 collinear X4
## 6 0 0 collinear X7
impDat<- complete(tempDat)
impDat
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1 11 21 31 41 51 NA 71 81 91
## 2 2 12 22 32 42 52 62 72 82 92
## 3 3 13 23 33 43 53 63 73 83 93
## 4 4 14 24 34 44 54 64 74 84 94
## 5 5 15 25 35 45 55 65 75 NA 95
## 6 6 16 26 36 46 NA 66 76 86 NA
## 7 7 17 NA NA 47 57 67 77 87 97
## 8 8 18 28 38 48 NA 68 78 NA 98
## 9 9 19 29 39 49 59 69 79 89 99
## 10 10 NA 30 40 50 60 70 80 NA 100
12.5 Reference
Imputing Missing Data with R; MICE package: https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
Delete columns/rows with more that x% missing : https://stackoverflow.com/questions/31848156/delete-columns-rows-with-more-that-x-missing
- Tutorial on 5 Powerful R Packages used for imputing missing values
- https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/