I love the tidy syntax of dplyr and the ultimate speed of data.table.
Why not take the both? That is why I have started the work of tidyfst,
bridging the tidy syntax and computation performance via translation.
This tool is especially friendly for dplyr users who want to learn some
data.table, but data.table could also benefit from it (more or
less).
A great comparison of data.table and dplyr was displayed
at https://atrebas.github.io/post/2019-03-03-datatable-dplyr/
(thanks to Atrebas). I love this tutorial very much because it dig
rather deep into many features from both packages. Here I’ll try to
implement all operations from that tutorial, and the potential users
could find why they would prefer tidyfst for some (if not most)
tasks.
The below examples have all been checked with tidyfst.
Now let’s begin.
library(tidyfst)
set.seed(1L)
## Create a data table
DF <- data.table(V1 = rep(c(1L, 2L), 5)[-10],
V2 = 1:9,
V3 = c(0.5, 1.0, 1.5),
V4 = rep(LETTERS[1:3], 3))
copy(DF) -> DT
class(DF)
#> [1] "data.table" "data.frame"
DF
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 2 2 1.0 B
#> 3: 1 3 1.5 C
#> 4: 2 4 0.5 A
#> 5: 1 5 1.0 B
#> 6: 2 6 1.5 C
#> 7: 1 7 0.5 A
#> 8: 2 8 1.0 B
#> 9: 1 9 1.5 C
### Filter rows using indices
slice_dt(DF, 3:4)
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 3 1.5 C
#> 2: 2 4 0.5 A
### Discard rows using negative indices
slice_dt(DF, -(3:7))
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 2 2 1.0 B
#> 3: 2 8 1.0 B
#> 4: 1 9 1.5 C
### Filter rows using a logical expression
filter_dt(DF, V2 > 5)
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 2 6 1.5 C
#> 2: 1 7 0.5 A
#> 3: 2 8 1.0 B
#> 4: 1 9 1.5 C
filter_dt(DF, V4 %in% c("A", "C"))
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 1 3 1.5 C
#> 3: 2 4 0.5 A
#> 4: 2 6 1.5 C
#> 5: 1 7 0.5 A
#> 6: 1 9 1.5 C
filter_dt(DF, V4 %chin% c("A", "C")) # fast %in% for character
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 1 3 1.5 C
#> 3: 2 4 0.5 A
#> 4: 2 6 1.5 C
#> 5: 1 7 0.5 A
#> 6: 1 9 1.5 C
### Filter rows using multiple conditions
filter_dt(DF, V1 == 1, V4 == "A")
#> [1] TRUE FALSE FALSE TRUE FALSE
# equals to
filter_dt(DF, V1 == 1 & V4 == "A")
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 1 7 0.5 A
### Filter unique rows
distinct_dt(DF) # unique(DF)
#> Indices: <V4>, <V1>, <V4__V1>
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 2 2 1.0 B
#> 3: 1 3 1.5 C
#> 4: 2 4 0.5 A
#> 5: 1 5 1.0 B
#> 6: 2 6 1.5 C
#> 7: 1 7 0.5 A
#> 8: 2 8 1.0 B
#> 9: 1 9 1.5 C
distinct_dt(DF, V1,V4)
#> V1 V4
#> <int> <char>
#> 1: 1 A
#> 2: 2 B
#> 3: 1 C
#> 4: 2 A
#> 5: 1 B
#> 6: 2 C
### Discard rows with missing values
drop_na_dt(DF) # na.omit(DF)
#> Indices: <V4>, <V1>, <V4__V1>
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 2 2 1.0 B
#> 3: 1 3 1.5 C
#> 4: 2 4 0.5 A
#> 5: 1 5 1.0 B
#> 6: 2 6 1.5 C
#> 7: 1 7 0.5 A
#> 8: 2 8 1.0 B
#> 9: 1 9 1.5 C
### Other filters
sample_n_dt(DF, 3) # n random rows
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 9 1.5 C
#> 2: 2 4 0.5 A
#> 3: 1 7 0.5 A
sample_frac_dt(DF, 0.5) # fraction of random rows
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 2 2 1.0 B
#> 3: 1 5 1.0 B
#> 4: 1 3 1.5 C
slice_max_dt(DF, V1,1) # top n entries (includes equals)
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 2 2 1.0 B
#> 2: 2 4 0.5 A
#> 3: 2 6 1.5 C
#> 4: 2 8 1.0 B
filter_dt(DT,V4 %like% "^B")
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 2 2 1 B
#> 2: 1 5 1 B
#> 3: 2 8 1 B
filter_dt(DT,V2 %between% c(3, 5))
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 3 1.5 C
#> 2: 2 4 0.5 A
#> 3: 1 5 1.0 B
filter_dt(DT,between(V2, 3, 5, incbounds = FALSE))
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 2 4 0.5 A
filter_dt(DT,V2 %inrange% list(-1:1, 1:3)) # see also ?inrange
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 2 2 1.0 B
#> 3: 1 3 1.5 C
### Sort rows by column
arrange_dt(DF, V3)
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 2 4 0.5 A
#> 3: 1 7 0.5 A
#> 4: 2 2 1.0 B
#> 5: 1 5 1.0 B
#> 6: 2 8 1.0 B
#> 7: 1 3 1.5 C
#> 8: 2 6 1.5 C
#> 9: 1 9 1.5 C
### Sort rows in decreasing order
arrange_dt(DF, -V3)
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 3 1.5 C
#> 2: 2 6 1.5 C
#> 3: 1 9 1.5 C
#> 4: 2 2 1.0 B
#> 5: 1 5 1.0 B
#> 6: 2 8 1.0 B
#> 7: 1 1 0.5 A
#> 8: 2 4 0.5 A
#> 9: 1 7 0.5 A
### Sort rows based on several columns
arrange_dt(DF, V1, -V2)
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 9 1.5 C
#> 2: 1 7 0.5 A
#> 3: 1 5 1.0 B
#> 4: 1 3 1.5 C
#> 5: 1 1 0.5 A
#> 6: 2 8 1.0 B
#> 7: 2 6 1.5 C
#> 8: 2 4 0.5 A
#> 9: 2 2 1.0 B
### Select one column using an index (not recommended)
pull_dt(DT,3) # returns a vector
#> [1] 0.5 1.0 1.5 0.5 1.0 1.5 0.5 1.0 1.5
select_dt(DT,3) # returns a data.table
#> V3
#> <num>
#> 1: 0.5
#> 2: 1.0
#> 3: 1.5
#> 4: 0.5
#> 5: 1.0
#> 6: 1.5
#> 7: 0.5
#> 8: 1.0
#> 9: 1.5
### Select one column using column name
select_dt(DF, V2) # returns a data.table
#> V2
#> <int>
#> 1: 1
#> 2: 2
#> 3: 3
#> 4: 4
#> 5: 5
#> 6: 6
#> 7: 7
#> 8: 8
#> 9: 9
pull_dt(DF, V2) # returns a vector
#> [1] 1 2 3 4 5 6 7 8 9
### Select several columns
select_dt(DF, V2, V3, V4)
#> V2 V3 V4
#> <int> <num> <char>
#> 1: 1 0.5 A
#> 2: 2 1.0 B
#> 3: 3 1.5 C
#> 4: 4 0.5 A
#> 5: 5 1.0 B
#> 6: 6 1.5 C
#> 7: 7 0.5 A
#> 8: 8 1.0 B
#> 9: 9 1.5 C
select_dt(DF, V2:V4) # select columns between V2 and V4
#> V2 V3 V4
#> <int> <num> <char>
#> 1: 1 0.5 A
#> 2: 2 1.0 B
#> 3: 3 1.5 C
#> 4: 4 0.5 A
#> 5: 5 1.0 B
#> 6: 6 1.5 C
#> 7: 7 0.5 A
#> 8: 8 1.0 B
#> 9: 9 1.5 C
### Exclude columns
select_dt(DF, -V2, -V3)
#> Indices: <V4>, <V1>, <V4__V1>
#> V1 V4
#> <int> <char>
#> 1: 1 A
#> 2: 2 B
#> 3: 1 C
#> 4: 2 A
#> 5: 1 B
#> 6: 2 C
#> 7: 1 A
#> 8: 2 B
#> 9: 1 C
### Select/Exclude columns using a character vector
cols <- c("V2", "V3")
select_dt(DF,cols = cols)
#> V2 V3
#> <int> <num>
#> 1: 1 0.5
#> 2: 2 1.0
#> 3: 3 1.5
#> 4: 4 0.5
#> 5: 5 1.0
#> 6: 6 1.5
#> 7: 7 0.5
#> 8: 8 1.0
#> 9: 9 1.5
select_dt(DF,cols = cols,negate = TRUE)
#> Indices: <V4>, <V1>, <V4__V1>
#> V1 V4
#> <int> <char>
#> 1: 1 A
#> 2: 2 B
#> 3: 1 C
#> 4: 2 A
#> 5: 1 B
#> 6: 2 C
#> 7: 1 A
#> 8: 2 B
#> 9: 1 C
### Other selections
select_dt(DF, cols = paste0("V", 1:2))
#> Index: <V1>
#> V1 V2
#> <int> <int>
#> 1: 1 1
#> 2: 2 2
#> 3: 1 3
#> 4: 2 4
#> 5: 1 5
#> 6: 2 6
#> 7: 1 7
#> 8: 2 8
#> 9: 1 9
relocate_dt(DF, V4) # reorder columns
#> Indices: <V4>, <V1>, <V4__V1>
#> V4 V1 V2 V3
#> <char> <int> <int> <num>
#> 1: A 1 1 0.5
#> 2: B 2 2 1.0
#> 3: C 1 3 1.5
#> 4: A 2 4 0.5
#> 5: B 1 5 1.0
#> 6: C 2 6 1.5
#> 7: A 1 7 0.5
#> 8: B 2 8 1.0
#> 9: C 1 9 1.5
select_dt(DF, "V")
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 2 2 1.0 B
#> 3: 1 3 1.5 C
#> 4: 2 4 0.5 A
#> 5: 1 5 1.0 B
#> 6: 2 6 1.5 C
#> 7: 1 7 0.5 A
#> 8: 2 8 1.0 B
#> 9: 1 9 1.5 C
select_dt(DF, "3$")
#> V3
#> <num>
#> 1: 0.5
#> 2: 1.0
#> 3: 1.5
#> 4: 0.5
#> 5: 1.0
#> 6: 1.5
#> 7: 0.5
#> 8: 1.0
#> 9: 1.5
select_dt(DF, ".2")
#> V2
#> <int>
#> 1: 1
#> 2: 2
#> 3: 3
#> 4: 4
#> 5: 5
#> 6: 6
#> 7: 7
#> 8: 8
#> 9: 9
select_dt(DF, "V1")
#> V1
#> <int>
#> 1: 1
#> 2: 2
#> 3: 1
#> 4: 2
#> 5: 1
#> 6: 2
#> 7: 1
#> 8: 2
#> 9: 1
select_dt(DF, -"^V2")
#> V1 V3 V4
#> <int> <num> <char>
#> 1: 1 0.5 A
#> 2: 2 1.0 B
#> 3: 1 1.5 C
#> 4: 2 0.5 A
#> 5: 1 1.0 B
#> 6: 2 1.5 C
#> 7: 1 0.5 A
#> 8: 2 1.0 B
#> 9: 1 1.5 C
# remove variables using "-" prior to function
### Summarise one column
summarise_dt(DF, sum(V1)) # returns a data.table
#> V1
#> <int>
#> 1: 13
summarise_dt(DF, sumV1 = sum(V1)) # returns a data.table
#> sumV1
#> <int>
#> 1: 13
### Summarise several columns
summarise_dt(DF, sum(V1), sd(V3))
#> V1 V2
#> <int> <num>
#> 1: 13 0.4330127
### Summarise several columns and assign column names
DF %>%
summarise_dt(sumv1 = sum(V1),
sdv3 = sd(V3))
#> sumv1 sdv3
#> <int> <num>
#> 1: 13 0.4330127
### Summarise a subset of rows
DT[1:4, sum(V1)]
#> [1] 6
DF %>%
slice_dt(1:4) %>%
summarise_dt(sum(V1))
#> V1
#> <int>
#> 1: 6
### Misc
summarise_dt(DF, nth(V3,1))
#> V1
#> <num>
#> 1: 0.5
summarise_dt(DF, nth(V3,-1))
#> V1
#> <num>
#> 1: 1.5
summarise_dt(DF, nth(V3, 5))
#> V1
#> <num>
#> 1: 1
summarise_dt(DF, uniqueN(V4))
#> V1
#> <int>
#> 1: 3
uniqueN(DF)
#> [1] 9
### By group
# not recommended
DF %>%
group_dt(
by = V4,
summarise_dt(sumV2 = sum(V2))
)
#> V4 sumV2
#> <char> <int>
#> 1: A 12
#> 2: B 15
#> 3: C 18
# recommended
DF %>%
summarise_dt(sumV2 = sum(V2),by = V4)
#> V4 sumV2
#> <char> <int>
#> 1: A 12
#> 2: B 15
#> 3: C 18
### By several groups
DF %>%
summarise_dt(sumV2 = sum(V2),by = .(V1,V4))
#> V1 V4 sumV2
#> <int> <char> <int>
#> 1: 1 A 8
#> 2: 2 B 10
#> 3: 1 C 12
#> 4: 2 A 4
#> 5: 1 B 5
#> 6: 2 C 6
### Calling function in by
DF %>%
summarise_dt(sumV2 = sum(V2),by = tolower(V4))
#> tolower sumV2
#> <char> <int>
#> 1: a 12
#> 2: b 15
#> 3: c 18
### Assigning column name in by
DF %>%
summarise_dt(sumV2 = sum(V2),by = .(abc = tolower(V4)))
#> abc sumV2
#> <char> <int>
#> 1: a 12
#> 2: b 15
#> 3: c 18
### Using a condition in by
DF %>%
summarise_dt(sumV2 = sum(V2),by = V4 == "A")
#> V4 sumV2
#> <lgcl> <int>
#> 1: TRUE 12
#> 2: FALSE 33
### By on a subset of rows
DF %>%
slice_dt(1:5) %>%
summarise_dt(sumV1 = sum(V1),by = V4)
#> V4 sumV1
#> <char> <int>
#> 1: A 3
#> 2: B 3
#> 3: C 1
### Count number of observations for each group
count_dt(DF, V4)
#> V4 n
#> <char> <int>
#> 1: A 3
#> 2: B 3
#> 3: C 3
### Add a column with number of observations for each group
add_count_dt(DF, V1)
#> Indices: <V4>, <V1>, <V4__V1>
#> V1 V2 V3 V4 n
#> <int> <int> <num> <char> <int>
#> 1: 1 1 0.5 A 5
#> 2: 2 2 1.0 B 4
#> 3: 1 3 1.5 C 5
#> 4: 2 4 0.5 A 4
#> 5: 1 5 1.0 B 5
#> 6: 2 6 1.5 C 4
#> 7: 1 7 0.5 A 5
#> 8: 2 8 1.0 B 4
#> 9: 1 9 1.5 C 5
### Retrieve the first/last/nth observation for each group
DF %>% summarise_dt(by = V4,nth(V2,1))
#> V4 V1
#> <char> <int>
#> 1: A 1
#> 2: B 2
#> 3: C 3
DF %>% summarise_dt(by = V4,nth(V2,-1))
#> V4 V1
#> <char> <int>
#> 1: A 7
#> 2: B 8
#> 3: C 9
DF %>% summarise_dt(by = V4,nth(V2,2))
#> V4 V1
#> <char> <int>
#> 1: A 4
#> 2: B 5
#> 3: C 6
### Summarise all the columns
summarise_vars(DT,.func = max)
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 2 9 1.5 C
### Summarise several columns
summarise_vars(DT,c("V1", "V2"),mean)
#> V1 V2
#> <num> <num>
#> 1: 1.444444 5
### Summarise several columns by group
DT %>%
summarise_vars(c("V1", "V2"),mean,by = V4)
#> V4 V1 V2
#> <char> <num> <num>
#> 1: A 1.333333 4
#> 2: B 1.666667 5
#> 3: C 1.333333 6
## using patterns (regex)
DT %>%
summarise_vars("V1|V2",mean,by = V4)
#> V4 V1 V2
#> <char> <num> <num>
#> 1: A 1.333333 4
#> 2: B 1.666667 5
#> 3: C 1.333333 6
## Summarise with more than one function by group
# when you can't find a way, you can always use `in_dt` to use data.table
DT %>%
in_dt(, by = V4,
c(lapply(.SD, sum),
lapply(.SD, mean)))
#> V4 V1 V2 V3 V1 V2 V3
#> <char> <int> <int> <num> <num> <num> <num>
#> 1: A 4 12 1.5 1.333333 4 0.5
#> 2: B 5 15 3.0 1.666667 5 1.0
#> 3: C 4 18 4.5 1.333333 6 1.5
### Summarise using a condition
summarise_vars(DF, is.numeric, mean)
#> V1 V2 V3
#> <num> <num> <num>
#> 1: 1.444444 5 1
### Modify all the columns
mutate_vars(DF, .func = rev)
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 9 1.5 C
#> 2: 2 8 1.0 B
#> 3: 1 7 0.5 A
#> 4: 2 6 1.5 C
#> 5: 1 5 1.0 B
#> 6: 2 4 0.5 A
#> 7: 1 3 1.5 C
#> 8: 2 2 1.0 B
#> 9: 1 1 0.5 A
### Modify several columns (dropping the others)
DF %>%
select_dt(cols = c("V1", "V2")) %>%
mutate_vars(.func = sqrt)
#> V1 V2
#> <num> <num>
#> 1: 1.000000 1.000000
#> 2: 1.414214 1.414214
#> 3: 1.000000 1.732051
#> 4: 1.414214 2.000000
#> 5: 1.000000 2.236068
#> 6: 1.414214 2.449490
#> 7: 1.000000 2.645751
#> 8: 1.414214 2.828427
#> 9: 1.000000 3.000000
DF %>%
select_dt(-V4) %>%
mutate_vars(.func = exp)
#> V1 V2 V3
#> <num> <num> <num>
#> 1: 2.718282 2.718282 1.648721
#> 2: 7.389056 7.389056 2.718282
#> 3: 2.718282 20.085537 4.481689
#> 4: 7.389056 54.598150 1.648721
#> 5: 2.718282 148.413159 2.718282
#> 6: 7.389056 403.428793 4.481689
#> 7: 2.718282 1096.633158 1.648721
#> 8: 7.389056 2980.957987 2.718282
#> 9: 2.718282 8103.083928 4.481689
### Modify several columns (keeping the others)
DF %>%
mutate_vars(c("V1", "V2"), sqrt)
#> Index: <V4>
#> V1 V2 V3 V4
#> <num> <num> <num> <char>
#> 1: 1.000000 1.000000 0.5 A
#> 2: 1.414214 1.414214 1.0 B
#> 3: 1.000000 1.732051 1.5 C
#> 4: 1.414214 2.000000 0.5 A
#> 5: 1.000000 2.236068 1.0 B
#> 6: 1.414214 2.449490 1.5 C
#> 7: 1.000000 2.645751 0.5 A
#> 8: 1.414214 2.828427 1.0 B
#> 9: 1.000000 3.000000 1.5 C
DF %>%
mutate_vars(-"V4", exp)
#> Index: <V4>
#> V1 V2 V3 V4
#> <num> <num> <num> <char>
#> 1: 2.718282 2.718282 1.648721 A
#> 2: 7.389056 7.389056 2.718282 B
#> 3: 2.718282 20.085537 4.481689 C
#> 4: 7.389056 54.598150 1.648721 A
#> 5: 2.718282 148.413159 2.718282 B
#> 6: 7.389056 403.428793 4.481689 C
#> 7: 2.718282 1096.633158 1.648721 A
#> 8: 7.389056 2980.957987 2.718282 B
#> 9: 2.718282 8103.083928 4.481689 C
### Modify columns using a condition (dropping the others)
select_dt(DT,is.numeric)
#> V1 V2 V3
#> <int> <int> <num>
#> 1: 1 1 0.5
#> 2: 2 2 1.0
#> 3: 1 3 1.5
#> 4: 2 4 0.5
#> 5: 1 5 1.0
#> 6: 2 6 1.5
#> 7: 1 7 0.5
#> 8: 2 8 1.0
#> 9: 1 9 1.5
### Modify columns using a condition (keeping the others)
mutate_vars(DT,is.numeric,as.integer)
#> V1 V2 V3 V4
#> <int> <int> <int> <char>
#> 1: 1 1 0 A
#> 2: 2 2 1 B
#> 3: 1 3 1 C
#> 4: 2 4 0 A
#> 5: 1 5 1 B
#> 6: 2 6 1 C
#> 7: 1 7 0 A
#> 8: 2 8 1 B
#> 9: 1 9 1 C
### Use a complex expression
DF %>%
group_dt(
by = V4,
slice_dt(1:2) %>%
transmute_dt(V1 = V1,
V2 = "X")
)
#> V4 V1 V2
#> <char> <int> <char>
#> 1: A 1 X
#> 2: A 2 X
#> 3: B 2 X
#> 4: B 1 X
#> 5: C 1 X
#> 6: C 2 X
### Use multiple expressions (with DT[,{j}])
DT %>%
in_dt(,{
print(V1) # comments here!
print(summary(V1))
x <- V1 + sum(V2)
.(A = 1:.N, B = x) # last list returned as a data.table
}
)
#> [1] 1 2 1 2 1 2 1 2 1
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1.000 1.000 1.000 1.444 2.000 2.000
#> A B
#> <int> <int>
#> 1: 1 46
#> 2: 2 47
#> 3: 3 46
#> 4: 4 47
#> 5: 5 46
#> 6: 6 47
#> 7: 7 46
#> 8: 8 47
#> 9: 9 46
### Select first/last/… row by group
DT %>%
group_dt(
by = V4,
head(1)
)
#> V4 V1 V2 V3
#> <char> <int> <int> <num>
#> 1: A 1 1 0.5
#> 2: B 2 2 1.0
#> 3: C 1 3 1.5
DT %>%
group_dt(
by = V4,
tail(2)
)
#> V4 V1 V2 V3
#> <char> <int> <int> <num>
#> 1: A 2 4 0.5
#> 2: A 1 7 0.5
#> 3: B 1 5 1.0
#> 4: B 2 8 1.0
#> 5: C 2 6 1.5
#> 6: C 1 9 1.5
DT %>%
group_dt(
by = V4,
slice_dt(1,.N)
)
#> V4 V1 V2 V3
#> <char> <int> <int> <num>
#> 1: A 1 1 0.5
#> 2: A 1 7 0.5
#> 3: B 2 2 1.0
#> 4: B 2 8 1.0
#> 5: C 1 3 1.5
#> 6: C 1 9 1.5
### Select rows using a nested query
DF %>%
group_dt(
by = V4,
arrange_dt(V2) %>%
slice_dt(1)
)
#> V4 V1 V2 V3
#> <char> <int> <int> <num>
#> 1: A 1 1 0.5
#> 2: B 2 2 1.0
#> 3: C 1 3 1.5
### Add a group counter column
DT %>%
mutate_dt(Grp = .GRP,by = .(V4, V1))
#> V1 V2 V3 V4 Grp
#> <int> <int> <num> <char> <int>
#> 1: 1 1 0.5 A 1
#> 2: 2 2 1.0 B 2
#> 3: 1 3 1.5 C 3
#> 4: 2 4 0.5 A 4
#> 5: 1 5 1.0 B 5
#> 6: 2 6 1.5 C 6
#> 7: 1 7 0.5 A 1
#> 8: 2 8 1.0 B 2
#> 9: 1 9 1.5 C 3
### Get row number of first (and last) observation by group
DT %>% summarise_dt(I = .I,by = V4)
#> V4 I
#> <char> <int>
#> 1: A 1
#> 2: A 4
#> 3: A 7
#> 4: B 2
#> 5: B 5
#> 6: B 8
#> 7: C 3
#> 8: C 6
#> 9: C 9
DT %>% summarise_dt(I = .I[1],by = V4)
#> V4 I
#> <char> <int>
#> 1: A 1
#> 2: B 2
#> 3: C 3
DT %>% summarise_dt(I = .I[c(1,.N)],by = V4)
#> V4 I
#> <char> <int>
#> 1: A 1
#> 2: A 7
#> 3: B 2
#> 4: B 8
#> 5: C 3
#> 6: C 9
### Handle list-columns by group
DT %>%
select_dt(V1,V4) %>%
chop_dt(V1) # return V1 as a list
#> V4 V1
#> <char> <list>
#> 1: A 1,2,1
#> 2: B 2,1,2
#> 3: C 1,2,1
DT %>% nest_dt(V4) # subsets of the data
#> V4 ndt
#> <char> <list>
#> 1: A <data.table[3x3]>
#> 2: B <data.table[3x3]>
#> 3: C <data.table[3x3]>
### Grouping sets (multiple by at once)
# use data.table directly, tidyfst does not provide new methods for it yet
data.table::rollup(DT,
.(SumV2 = sum(V2)),
by = c("V1", "V4"))
#> V1 V4 SumV2
#> <int> <char> <int>
#> 1: 1 A 8
#> 2: 2 B 10
#> 3: 1 C 12
#> 4: 2 A 4
#> 5: 1 B 5
#> 6: 2 C 6
#> 7: 1 <NA> 25
#> 8: 2 <NA> 20
#> 9: NA <NA> 45
data.table::rollup(DT,
.(SumV2 = sum(V2), .N),
by = c("V1", "V4"),
id = TRUE)
#> grouping V1 V4 SumV2 N
#> <int> <int> <char> <int> <int>
#> 1: 0 1 A 8 2
#> 2: 0 2 B 10 2
#> 3: 0 1 C 12 2
#> 4: 0 2 A 4 1
#> 5: 0 1 B 5 1
#> 6: 0 2 C 6 1
#> 7: 1 1 <NA> 25 5
#> 8: 1 2 <NA> 20 4
#> 9: 3 NA <NA> 45 9
data.table::cube(DT,
.(SumV2 = sum(V2), .N),
by = c("V1", "V4"),
id = TRUE)
#> grouping V1 V4 SumV2 N
#> <int> <int> <char> <int> <int>
#> 1: 0 1 A 8 2
#> 2: 0 2 B 10 2
#> 3: 0 1 C 12 2
#> 4: 0 2 A 4 1
#> 5: 0 1 B 5 1
#> 6: 0 2 C 6 1
#> 7: 1 1 <NA> 25 5
#> 8: 1 2 <NA> 20 4
#> 9: 2 NA A 12 3
#> 10: 2 NA B 15 3
#> 11: 2 NA C 18 3
#> 12: 3 NA <NA> 45 9
data.table::groupingsets(DT,
.(SumV2 = sum(V2), .N),
by = c("V1", "V4"),
sets = list("V1", c("V1", "V4")),
id = TRUE)
#> grouping V1 V4 SumV2 N
#> <int> <int> <char> <int> <int>
#> 1: 1 1 <NA> 25 5
#> 2: 1 2 <NA> 20 4
#> 3: 0 1 A 8 2
#> 4: 0 2 B 10 2
#> 5: 0 1 C 12 2
#> 6: 0 2 A 4 1
#> 7: 0 1 B 5 1
#> 8: 0 2 C 6 1
tidyfst exports data.table::fread
and
data.table::fwrite
directly.
### Write data to a csv file
fwrite(DT, "DT.csv")
### Write data to a tab-delimited file
fwrite(DT, "DT.txt", sep = "\t")
### Write list-column data to a csv file
fwrite(setDT(list(0, list(1:5))), "DT2.csv")
#
### Read a csv / tab-delimited file
fread("DT.csv")
# fread("DT.csv", verbose = TRUE) # full details
fread("DT.txt", sep = "\t")
### Read a csv file selecting / droping columns
fread("DT.csv", select = c("V1", "V4"))
fread("DT.csv", drop = "V4")
# NA
### Read and rbind several files
rbindlist(lapply(c("DT.csv", "DT.csv"), fread))
# c("DT.csv", "DT.csv") %>% lapply(fread) %>% rbindlist
### Melt data (from wide to long)
fsetequal(DT,DF)
#> [1] TRUE
mDT = DT %>% longer_dt(V3,V4)
mDF = DF %>% longer_dt(-"V1|V2")
fsetequal(mDT,mDF)
#> [1] TRUE
mDT
#> V3 V4 name value
#> <num> <char> <fctr> <int>
#> 1: 0.5 A V1 1
#> 2: 1.0 B V1 2
#> 3: 1.5 C V1 1
#> 4: 0.5 A V1 2
#> 5: 1.0 B V1 1
#> 6: 1.5 C V1 2
#> 7: 0.5 A V1 1
#> 8: 1.0 B V1 2
#> 9: 1.5 C V1 1
#> 10: 0.5 A V2 1
#> 11: 1.0 B V2 2
#> 12: 1.5 C V2 3
#> 13: 0.5 A V2 4
#> 14: 1.0 B V2 5
#> 15: 1.5 C V2 6
#> 16: 0.5 A V2 7
#> 17: 1.0 B V2 8
#> 18: 1.5 C V2 9
### Cast data (from long to wide)
mDT %>%
wider_dt(V4,name = "name",value = "value")
#> Warning in dcast.data.table(dt, V4 ~ name, fun.aggregate = NULL, value.var =
#> value, : 'fun.aggregate' is NULL, but found duplicate row/column combinations,
#> so defaulting to length(). That is, the variables [V4, name] used in 'formula'
#> do not uniquely identify rows in the input 'data'. In such cases,
#> 'fun.aggregate' is used to derive a single representative value for each
#> combination in the output data.table, for example by summing or averaging
#> (fun.aggregate=sum or fun.aggregate=mean, respectively). Check the resulting
#> table for values larger than 1 to see which combinations were not unique. See
#> ?dcast.data.table for more details.
#> Key: <V4>
#> V4 V1 V2
#> <char> <int> <int>
#> 1: A 3 3
#> 2: B 3 3
#> 3: C 3 3
# below is a special case and could only be done in tidyfst
mDT %>%
wider_dt(V4,name = "name",value = "value",fun = list)
#> V4 V1 V2
#> <char> <int> <int>
#> 1: A 1 1
#> 2: A 2 4
#> 3: A 1 7
#> 4: B 2 2
#> 5: B 1 5
#> 6: B 2 8
#> 7: C 1 3
#> 8: C 2 6
#> 9: C 1 9
mDT %>%
wider_dt(V4,name = "name",value = "value",fun = sum)
#> Key: <V4>
#> V4 V1 V2
#> <char> <int> <int>
#> 1: A 4 12
#> 2: B 5 15
#> 3: C 4 18
### Split
split(DT, by = "V4")
#> $A
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 1 0.5 A
#> 2: 2 4 0.5 A
#> 3: 1 7 0.5 A
#>
#> $B
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 2 2 1 B
#> 2: 1 5 1 B
#> 3: 2 8 1 B
#>
#> $C
#> V1 V2 V3 V4
#> <int> <int> <num> <char>
#> 1: 1 3 1.5 C
#> 2: 2 6 1.5 C
#> 3: 1 9 1.5 C
x <- data.table(Id = c("A", "B", "C", "C"),
X1 = c(1L, 3L, 5L, 7L),
XY = c("x2", "x4", "x6", "x8"),
key = "Id")
y <- data.table(Id = c("A", "B", "B", "D"),
Y1 = c(1L, 3L, 5L, 7L),
XY = c("y1", "y3", "y5", "y7"),
key = "Id")
### left join
left_join_dt(x, y, by = "Id")
#> Key: <Id>
#> Id X1 XY.x Y1 XY.y
#> <char> <int> <char> <int> <char>
#> 1: A 1 x2 1 y1
#> 2: B 3 x4 3 y3
#> 3: B 3 x4 5 y5
#> 4: C 5 x6 NA <NA>
#> 5: C 7 x8 NA <NA>
### right join
right_join_dt(x, y, by = "Id")
#> Key: <Id>
#> Id X1 XY.x Y1 XY.y
#> <char> <int> <char> <int> <char>
#> 1: A 1 x2 1 y1
#> 2: B 3 x4 3 y3
#> 3: B 3 x4 5 y5
#> 4: D NA <NA> 7 y7
### inner join
inner_join_dt(x, y, by = "Id")
#> Key: <Id>
#> Id X1 XY.x Y1 XY.y
#> <char> <int> <char> <int> <char>
#> 1: A 1 x2 1 y1
#> 2: B 3 x4 3 y3
#> 3: B 3 x4 5 y5
### full join
full_join_dt(x, y, by = "Id")
#> Key: <Id>
#> Id X1 XY.x Y1 XY.y
#> <char> <int> <char> <int> <char>
#> 1: A 1 x2 1 y1
#> 2: B 3 x4 3 y3
#> 3: B 3 x4 5 y5
#> 4: C 5 x6 NA <NA>
#> 5: C 7 x8 NA <NA>
#> 6: D NA <NA> 7 y7
### semi join
semi_join_dt(x, y, by = "Id")
#> Key: <Id>
#> Id X1 XY
#> <char> <int> <char>
#> 1: A 1 x2
#> 2: B 3 x4
### anti join
anti_join_dt(x, y, by = "Id")
#> Key: <Id>
#> Id X1 XY
#> <char> <int> <char>
#> 1: C 5 x6
#> 2: C 7 x8
x <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)
### Bind rows
rbind(x, y)
#> V1
#> <int>
#> 1: 1
#> 2: 2
#> 3: 3
#> 4: 4
#> 5: 5
#> 6: 6
rbind(x, z, fill = TRUE)
#> V1 V2
#> <int> <int>
#> 1: 1 NA
#> 2: 2 NA
#> 3: 3 NA
#> 4: 7 0
#> 5: 8 0
#> 6: 9 0
### Bind rows using a list
rbindlist(list(x, y), idcol = TRUE)
#> .id V1
#> <int> <int>
#> 1: 1 1
#> 2: 1 2
#> 3: 1 3
#> 4: 2 4
#> 5: 2 5
#> 6: 2 6
### Bind columns
cbind(x, y)
#> V1 V1
#> <int> <int>
#> 1: 1 4
#> 2: 2 5
#> 3: 3 6
x <- data.table(c(1, 2, 2, 3, 3))
y <- data.table(c(2, 2, 3, 4, 4))
### Intersection
fintersect(x, y)
#> V1
#> <num>
#> 1: 2
#> 2: 3
fintersect(x, y, all = TRUE)
#> V1
#> <num>
#> 1: 2
#> 2: 2
#> 3: 3
### Difference
fsetdiff(x, y)
#> V1
#> <num>
#> 1: 1
fsetdiff(x, y, all = TRUE)
#> V1
#> <num>
#> 1: 1
#> 2: 3
### Union
funion(x, y)
#> V1
#> <num>
#> 1: 1
#> 2: 2
#> 3: 3
#> 4: 4
funion(x, y, all = TRUE)
#> V1
#> <num>
#> 1: 1
#> 2: 2
#> 3: 2
#> 4: 3
#> 5: 3
#> 6: 2
#> 7: 2
#> 8: 3
#> 9: 4
#> 10: 4
### Equality
fsetequal(x, x[order(-V1),])
#> [1] TRUE
all.equal(x, x) # S3 method
#> [1] TRUE
setequal(x, x[order(-V1),])
#> [1] TRUE
To break all these codes through, tidyfst has improved bit by bit. If you are using tidyfst frequently, you’ll find that while it enjoys a tidy syntax, it is more like you are using data.table in another style. Compared with many other packages with similar goals, tidyfst sticks to many principles of data.table (and is more like data.table in many ways). However, the ultimate goal is still clear: providing users with state-of-the-art data manipulation tools with least pain. Therefore, keep it simple and make it fast. Enjoy tidyfst~