Chapter 21 data.table

21.1 Split data.table into chunks in a list

data.table provides its own `split` method, which is faster and more flexible than the default one. Be aware that processing a list of data.tables is generally much slower than operating on a single data.table by group with the `by` argument; see the data.table documentation for details.

library(data.table)

# Fix the seed so the random column and the row shuffle are reproducible.
set.seed(123)

# Twelve rows: three recycled letter keys (x1, x2, x3) and one random value y.
dt <- data.table(
  x1 = rep(letters[1:2], times = 6),
  x2 = rep(letters[3:5], times = 4),
  x3 = rep(letters[5:8], times = 3),
  y = rnorm(12)
)

# Shuffle the rows so the groups are no longer in their construction order.
dt <- dt[sample(nrow(dt))]

# Plain data.frame copy of the shuffled table, printed for reference.
df <- as.data.frame(dt)
df

##    x1 x2 x3           y
## 1   a  e  g  1.55870831
## 2   b  d  h -1.26506123
## 3   b  c  f -0.44566197
## 4   a  c  g  0.46091621
## 5   b  d  f -0.23017749
## 6   a  c  e -0.56047565
## 7   b  e  f  1.71506499
## 8   b  e  h  0.35981383
## 9   b  c  h  0.07050839
## 10  a  d  g  1.22408180
## 11  a  e  e -0.68685285
## 12  a  d  e  0.12928774

21.1.1 Nested list using the `flatten` argument

# Split on the x1/x2 combinations. flatten = TRUE is the default and is spelled
# out here: it yields one flat list whose names join the key values ("a.e").
new_list <- split(
  dt,
  by = c("x1", "x2"),
  flatten = TRUE
)
new_list

## $a.e
##    x1 x2 x3          y
## 1:  a  e  g  1.5587083
## 2:  a  e  e -0.6868529
## 
## $b.d
##    x1 x2 x3          y
## 1:  b  d  h -1.2650612
## 2:  b  d  f -0.2301775
## 
## $b.c
##    x1 x2 x3           y
## 1:  b  c  f -0.44566197
## 2:  b  c  h  0.07050839
## 
## $a.c
##    x1 x2 x3          y
## 1:  a  c  g  0.4609162
## 2:  a  c  e -0.5604756
## 
## $b.e
##    x1 x2 x3         y
## 1:  b  e  f 1.7150650
## 2:  b  e  h 0.3598138
## 
## $a.d
##    x1 x2 x3         y
## 1:  a  d  g 1.2240818
## 2:  a  d  e 0.1292877

# Same split, but flatten = FALSE nests the result: the outer list is keyed by
# x1 and each inner list is keyed by x2.
new_list <- split(
  dt,
  by = c("x1", "x2"),
  flatten = FALSE
)
new_list

## $a
## $a$e
##    x1 x2 x3          y
## 1:  a  e  g  1.5587083
## 2:  a  e  e -0.6868529
## 
## $a$c
##    x1 x2 x3          y
## 1:  a  c  g  0.4609162
## 2:  a  c  e -0.5604756
## 
## $a$d
##    x1 x2 x3         y
## 1:  a  d  g 1.2240818
## 2:  a  d  e 0.1292877
## 
## 
## $b
## $b$d
##    x1 x2 x3          y
## 1:  b  d  h -1.2650612
## 2:  b  d  f -0.2301775
## 
## $b$c
##    x1 x2 x3           y
## 1:  b  c  f -0.44566197
## 2:  b  c  h  0.07050839
## 
## $b$e
##    x1 x2 x3         y
## 1:  b  e  f 1.7150650
## 2:  b  e  h 0.3598138

21.1.2 Example

# Small example table: eight genes alternately assigned to two groups.
dt_example <- data.table(
  group = rep(c("group1", "group2"), times = 4),
  gene = c(letters[1:4], letters[3:6])
)
# Print the table that was just created (the original printed `dt` instead).
dt_example

##     group gene
## 1: group1    a
## 2: group2    b
## 3: group1    c
## 4: group2    d
## 5: group1    c
## 6: group2    d
## 7: group1    e
## 8: group2    f

21.1.2.1 Create a matrix from data.table

library(UpSetR)

# Drop the column "z" if it exists, then split the table by group.
# setdiff() is used instead of -which(names(...) == "z"): when no column
# matches, which() returns integer(0), and a negated empty index selects
# nothing, which made the original split return an empty list.
keep_cols <- setdiff(names(dt_example), "z")
list_group <- split(
  dt_example[, keep_cols, with = FALSE],
  by = "group",
  drop = TRUE
)
list_group

## $group1
##     group gene
## 1: group1    a
## 2: group1    c
## 3: group1    c
## 4: group1    e
## 
## $group2
##     group gene
## 1: group2    b
## 2: group2    d
## 3: group2    d
## 4: group2    f