Package provides pipe-style interface for data.table
package. It preserves all data.table features without significant impact
on performance. let
and take
functions are
simplified interfaces for most common data manipulation tasks.
rows(mtcars, am==0)
columns(mtcars, mpg, vs:carb)
take(mtcars, mean_mpg = mean(mpg), by = am)
take_all(mtcars, mean, by = am)
take(mtcars, mpg, hp, fun = mean, by = am)
by
argument:
take_all(mtcars, mean)
%>%
to chain several
operations: mtcars %>%
let(mpg_hp = mpg/hp) %>%
take(mean(mpg_hp), by = am)
mtcars %>%
let(new_var = 42,
new_var2 = new_var*hp) %>%
head()
let(mtcars, am = NULL) %>% head()
iris %>%
let_all(
scaled = (.x - mean(.x))/sd(.x),
by = Species) %>%
head()
iris %>%
take_all(
mean = if(startsWith(.name, "Sepal")) mean(.x),
median = if(startsWith(.name, "Petal")) median(.x),
by = Species
)
:=
: new_var = "my_var"
old_var = "mpg"
mtcars %>%
let((new_var) := get(old_var)*2) %>%
head()
# or,
expr = quote(mean(cyl))
mtcars %>%
let((new_var) := eval(expr)) %>%
head()
# the same with `take`
by_var = "vs,am"
take(mtcars, (new_var) := eval(expr), by = by_var)
query_if
function translates its arguments one-to-one to
[.data.table
method. Additionally there are some
conveniences such as automatic data.frame
conversion to
data.table
.
Let’s make datasets for lookups:
workers = fread("
name company
Nick Acme
John Ajax
Daniela Ajax
")
positions = fread("
name position
John designer
Daniela engineer
Cathie manager
")
# xlookup
workers = let(workers,
position = xlookup(name, positions$name, positions$position)
)
# vlookup
# by default we search in the first column and return values from second column
workers = let(workers,
position = vlookup(name, positions, no_match = "Not found")
)
# the same
workers = let(workers,
position = vlookup(name, positions,
result_column = "position",
no_match = "Not found") # or, result_column = 2
)
head(workers)
## name company position
## 1: Nick Acme Not found
## 2: John Ajax designer
## 3: Daniela Ajax engineer
We will use for demonstartion well-known mtcars
dataset
and some examples from dplyr
package.
library(maditr)
data(mtcars)
# Newly created variables are available immediately
mtcars %>%
let(
cyl2 = cyl * 2,
cyl4 = cyl2 * 2
) %>% head()
## mpg cyl disp hp drat wt qsec vs am gear carb cyl2 cyl4
## 1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 12 24
## 2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 12 24
## 3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 8 16
## 4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 12 24
## 5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 16 32
## 6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 12 24
# You can also use let() to remove variables and
# modify existing variables
mtcars %>%
let(
mpg = NULL,
disp = disp * 0.0163871 # convert to litres
) %>% head()
## cyl disp hp drat wt qsec vs am gear carb
## 1: 6 2.621936 110 3.90 2.620 16.46 0 1 4 4
## 2: 6 2.621936 110 3.90 2.875 17.02 0 1 4 4
## 3: 4 1.769807 93 3.85 2.320 18.61 1 1 4 1
## 4: 6 4.227872 110 3.08 3.215 19.44 1 0 3 1
## 5: 8 5.899356 175 3.15 3.440 17.02 0 0 3 2
## 6: 6 3.687098 105 2.76 3.460 20.22 1 0 3 1
# window functions are useful for grouped computations
mtcars %>%
let(rank = rank(-mpg, ties.method = "min"),
by = cyl) %>%
head()
## mpg cyl disp hp drat wt qsec vs am gear carb rank
## 1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 2
## 2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 2
## 3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 8
## 4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 1
## 5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 2
## 6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 6
# You can drop variables by setting them to NULL
mtcars %>%
let(cyl = NULL) %>%
head()
## mpg disp hp drat wt qsec vs am gear carb
## 1: 21.0 160 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 160 110 3.90 2.875 17.02 0 1 4 4
## 3: 22.8 108 93 3.85 2.320 18.61 1 1 4 1
## 4: 21.4 258 110 3.08 3.215 19.44 1 0 3 1
## 5: 18.7 360 175 3.15 3.440 17.02 0 0 3 2
## 6: 18.1 225 105 2.76 3.460 20.22 1 0 3 1
# keeps all existing variables
mtcars %>%
let(displ_l = disp / 61.0237) %>%
head()
## mpg cyl disp hp drat wt qsec vs am gear carb displ_l
## 1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 2.621932
## 2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 2.621932
## 3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 1.769804
## 4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 4.227866
## 5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 5.899347
## 6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 3.687092
# keeps only the variables you create
mtcars %>%
take(displ_l = disp / 61.0237) %>%
head()
## displ_l
## 1: 2.621932
## 2: 2.621932
## 3: 1.769804
## 4: 4.227866
## 5: 5.899347
## 6: 3.687092
# can refer to both contextual variables and variable names:
var = 100
mtcars %>%
let(cyl = cyl * var) %>%
head()
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 600 160 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 600 160 110 3.90 2.875 17.02 0 1 4 4
## 3: 22.8 400 108 93 3.85 2.320 18.61 1 1 4 1
## 4: 21.4 600 258 110 3.08 3.215 19.44 1 0 3 1
## 5: 18.7 800 360 175 3.15 3.440 17.02 0 0 3 2
## 6: 18.1 600 225 105 2.76 3.460 20.22 1 0 3 1
# select rows
mtcars %>%
rows(am==0) %>%
head()
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 2: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 3: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 4: 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 5: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 6: 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
# select rows with compound condition
mtcars %>%
rows(am==0 & mpg>mean(mpg))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 2: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 3: 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 4: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
# select columns
mtcars %>%
columns(vs:carb, cyl)
## vs am gear carb cyl
## Mazda RX4 0 1 4 4 6
## Mazda RX4 Wag 0 1 4 4 6
## Datsun 710 1 1 4 1 4
## Hornet 4 Drive 1 0 3 1 6
## Hornet Sportabout 0 0 3 2 8
## Valiant 1 0 3 1 6
## Duster 360 0 0 3 4 8
## Merc 240D 1 0 4 2 4
## Merc 230 1 0 4 2 4
## Merc 280 1 0 4 4 6
## Merc 280C 1 0 4 4 6
## Merc 450SE 0 0 3 3 8
## Merc 450SL 0 0 3 3 8
## Merc 450SLC 0 0 3 3 8
## Cadillac Fleetwood 0 0 3 4 8
## Lincoln Continental 0 0 3 4 8
## Chrysler Imperial 0 0 3 4 8
## Fiat 128 1 1 4 1 4
## Honda Civic 1 1 4 2 4
## Toyota Corolla 1 1 4 1 4
## Toyota Corona 1 0 3 1 4
## Dodge Challenger 0 0 3 2 8
## AMC Javelin 0 0 3 2 8
## Camaro Z28 0 0 3 4 8
## Pontiac Firebird 0 0 3 2 8
## Fiat X1-9 1 1 4 1 4
## Porsche 914-2 0 1 5 2 4
## Lotus Europa 1 1 5 2 4
## Ford Pantera L 0 1 5 4 8
## Ferrari Dino 0 1 5 6 6
## Maserati Bora 0 1 5 8 8
## Volvo 142E 1 1 4 2 4
mtcars %>%
columns(-am, -cyl)
## mpg disp hp drat wt qsec vs gear carb
## Mazda RX4 21.0 160.0 110 3.90 2.620 16.46 0 4 4
## Mazda RX4 Wag 21.0 160.0 110 3.90 2.875 17.02 0 4 4
## Datsun 710 22.8 108.0 93 3.85 2.320 18.61 1 4 1
## Hornet 4 Drive 21.4 258.0 110 3.08 3.215 19.44 1 3 1
## Hornet Sportabout 18.7 360.0 175 3.15 3.440 17.02 0 3 2
## Valiant 18.1 225.0 105 2.76 3.460 20.22 1 3 1
## Duster 360 14.3 360.0 245 3.21 3.570 15.84 0 3 4
## Merc 240D 24.4 146.7 62 3.69 3.190 20.00 1 4 2
## Merc 230 22.8 140.8 95 3.92 3.150 22.90 1 4 2
## Merc 280 19.2 167.6 123 3.92 3.440 18.30 1 4 4
## Merc 280C 17.8 167.6 123 3.92 3.440 18.90 1 4 4
## Merc 450SE 16.4 275.8 180 3.07 4.070 17.40 0 3 3
## Merc 450SL 17.3 275.8 180 3.07 3.730 17.60 0 3 3
## Merc 450SLC 15.2 275.8 180 3.07 3.780 18.00 0 3 3
## Cadillac Fleetwood 10.4 472.0 205 2.93 5.250 17.98 0 3 4
## Lincoln Continental 10.4 460.0 215 3.00 5.424 17.82 0 3 4
## Chrysler Imperial 14.7 440.0 230 3.23 5.345 17.42 0 3 4
## Fiat 128 32.4 78.7 66 4.08 2.200 19.47 1 4 1
## Honda Civic 30.4 75.7 52 4.93 1.615 18.52 1 4 2
## Toyota Corolla 33.9 71.1 65 4.22 1.835 19.90 1 4 1
## Toyota Corona 21.5 120.1 97 3.70 2.465 20.01 1 3 1
## Dodge Challenger 15.5 318.0 150 2.76 3.520 16.87 0 3 2
## AMC Javelin 15.2 304.0 150 3.15 3.435 17.30 0 3 2
## Camaro Z28 13.3 350.0 245 3.73 3.840 15.41 0 3 4
## Pontiac Firebird 19.2 400.0 175 3.08 3.845 17.05 0 3 2
## Fiat X1-9 27.3 79.0 66 4.08 1.935 18.90 1 4 1
## Porsche 914-2 26.0 120.3 91 4.43 2.140 16.70 0 5 2
## Lotus Europa 30.4 95.1 113 3.77 1.513 16.90 1 5 2
## Ford Pantera L 15.8 351.0 264 4.22 3.170 14.50 0 5 4
## Ferrari Dino 19.7 145.0 175 3.62 2.770 15.50 0 5 6
## Maserati Bora 15.0 301.0 335 3.54 3.570 14.60 0 5 8
## Volvo 142E 21.4 121.0 109 4.11 2.780 18.60 1 4 2
# regular expression pattern
columns(iris, "^Petal") %>% head() # variables which start from 'Petal'
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
columns(iris, "Width$") %>% head() # variables which end with 'Width'
## Sepal.Width Petal.Width
## 1 3.5 0.2
## 2 3.0 0.2
## 3 3.2 0.2
## 4 3.1 0.2
## 5 3.6 0.2
## 6 3.9 0.4
# move Species variable to the front
# pattern "^." matches all variables
columns(iris, Species, "^.") %>% head()
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.1 3.5 1.4 0.2
## 2 setosa 4.9 3.0 1.4 0.2
## 3 setosa 4.7 3.2 1.3 0.2
## 4 setosa 4.6 3.1 1.5 0.2
## 5 setosa 5.0 3.6 1.4 0.2
## 6 setosa 5.4 3.9 1.7 0.4
# pattern "^.*al" means "contains 'al'"
columns(iris, "^.*al") %>% head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
# numeric indexing - all variables except Species
columns(iris, 1:4) %>% head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
# A 'take' with summary functions applied without 'by' argument returns an aggregated data
mtcars %>%
take(mean = mean(disp), n = .N)
## mean n
## 1: 230.7219 32
# Usually, you'll want to group first
mtcars %>%
take(mean = mean(disp), n = .N, by = am)
## am mean n
## 1: 1 143.5308 13
## 2: 0 290.3789 19
# grouping by multiple variables
mtcars %>%
take(mean = mean(disp), n = .N, by = list(am, vs))
## am vs mean n
## 1: 1 0 206.2167 6
## 2: 1 1 89.8000 7
## 3: 0 1 175.1143 7
## 4: 0 0 357.6167 12
# You can group by expressions:
mtcars %>%
take_all(
mean,
by = list(vsam = vs + am)
)
## vsam mpg cyl disp hp drat wt qsec
## 1: 1 20.28462 5.692308 189.4692 138.46154 3.738462 3.038846 18.04231
## 2: 2 28.37143 4.000000 89.8000 80.57143 4.148571 2.028286 18.70000
## 3: 0 15.05000 8.000000 357.6167 194.16667 3.120833 4.104083 17.14250
## gear carb
## 1: 4.076923 3.307692
## 2: 4.142857 1.428571
## 3: 3.000000 3.083333
# modify all non-grouping variables in-place
mtcars %>%
let_all((.x - mean(.x))/sd(.x), by = am) %>%
head()
## mpg cyl disp hp drat wt qsec
## 1: -0.5501185 0.5945745 0.1888587 -0.2004008 -0.4120299 0.3387459 -0.5021316
## 2: -0.5501185 0.5945745 0.1888587 -0.2004008 -0.4120299 0.7520483 -0.1896942
## 3: -0.2582189 -0.6936702 -0.4074443 -0.4026317 -0.5493732 -0.1474922 0.6974050
## 4: 1.1091990 -0.6133196 -0.2938955 -0.9323843 -0.5259081 -0.7124963 0.7176592
## 5: 0.4049674 0.6814663 0.6319326 0.2733692 -0.3474750 -0.4230701 -0.6641654
## 6: 0.2484716 -0.6133196 -0.5934281 -1.0251346 -1.3416023 -0.3973433 1.1630407
## vs am gear carb
## 1: -1.0377490 1 -0.7595545 0.4944600
## 2: -1.0377490 1 -0.7595545 0.4944600
## 3: 0.8894992 1 -0.7595545 -0.8829642
## 4: 1.2743862 0 -0.5026247 -1.5141438
## 5: -0.7433919 0 -0.5026247 -0.6423641
## 6: 1.2743862 0 -0.5026247 -1.5141438
# modify all non-grouping variables to new variables
mtcars %>%
let_all(scaled = (.x - mean(.x))/sd(.x), by = am) %>%
head()
## mpg cyl disp hp drat wt qsec vs am gear carb mpg_scaled cyl_scaled
## 1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 -0.5501185 0.5945745
## 2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 -0.5501185 0.5945745
## 3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 -0.2582189 -0.6936702
## 4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 1.1091990 -0.6133196
## 5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 0.4049674 0.6814663
## 6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 0.2484716 -0.6133196
## disp_scaled hp_scaled drat_scaled wt_scaled qsec_scaled vs_scaled
## 1: 0.1888587 -0.2004008 -0.4120299 0.3387459 -0.5021316 -1.0377490
## 2: 0.1888587 -0.2004008 -0.4120299 0.7520483 -0.1896942 -1.0377490
## 3: -0.4074443 -0.4026317 -0.5493732 -0.1474922 0.6974050 0.8894992
## 4: -0.2938955 -0.9323843 -0.5259081 -0.7124963 0.7176592 1.2743862
## 5: 0.6319326 0.2733692 -0.3474750 -0.4230701 -0.6641654 -0.7433919
## 6: -0.5934281 -1.0251346 -1.3416023 -0.3973433 1.1630407 1.2743862
## gear_scaled carb_scaled
## 1: -0.7595545 0.4944600
## 2: -0.7595545 0.4944600
## 3: -0.7595545 -0.8829642
## 4: -0.5026247 -1.5141438
## 5: -0.5026247 -0.6423641
## 6: -0.5026247 -1.5141438
# conditionally modify all variables
iris %>%
let_all(mean = if(is.numeric(.x)) mean(.x)) %>%
head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Length_mean
## 1: 5.1 3.5 1.4 0.2 setosa 5.843333
## 2: 4.9 3.0 1.4 0.2 setosa 5.843333
## 3: 4.7 3.2 1.3 0.2 setosa 5.843333
## 4: 4.6 3.1 1.5 0.2 setosa 5.843333
## 5: 5.0 3.6 1.4 0.2 setosa 5.843333
## 6: 5.4 3.9 1.7 0.4 setosa 5.843333
## Sepal.Width_mean Petal.Length_mean Petal.Width_mean
## 1: 3.057333 3.758 1.199333
## 2: 3.057333 3.758 1.199333
## 3: 3.057333 3.758 1.199333
## 4: 3.057333 3.758 1.199333
## 5: 3.057333 3.758 1.199333
## 6: 3.057333 3.758 1.199333
# modify all variables conditionally on name
iris %>%
let_all(
mean = if(startsWith(.name, "Sepal")) mean(.x),
median = if(startsWith(.name, "Petal")) median(.x),
by = Species
) %>%
head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Length_mean
## 1: 5.1 3.5 1.4 0.2 setosa 5.006
## 2: 4.9 3.0 1.4 0.2 setosa 5.006
## 3: 4.7 3.2 1.3 0.2 setosa 5.006
## 4: 4.6 3.1 1.5 0.2 setosa 5.006
## 5: 5.0 3.6 1.4 0.2 setosa 5.006
## 6: 5.4 3.9 1.7 0.4 setosa 5.006
## Sepal.Width_mean Petal.Length_median Petal.Width_median
## 1: 3.428 1.5 0.2
## 2: 3.428 1.5 0.2
## 3: 3.428 1.5 0.2
## 4: 3.428 1.5 0.2
## 5: 3.428 1.5 0.2
## 6: 3.428 1.5 0.2
# aggregation with 'take_all'
mtcars %>%
take_all(mean = mean(.x), sd = sd(.x), n = .N, by = am)
## am mpg_mean cyl_mean disp_mean hp_mean drat_mean wt_mean qsec_mean
## 1: 1 24.39231 5.076923 143.5308 126.8462 4.050000 2.411000 17.36000
## 2: 0 17.14737 6.947368 290.3789 160.2632 3.286316 3.768895 18.18316
## vs_mean gear_mean carb_mean mpg_sd cyl_sd disp_sd hp_sd drat_sd
## 1: 0.5384615 4.384615 2.923077 6.166504 1.552500 87.20399 84.06232 0.3640513
## 2: 0.3684211 3.210526 2.736842 3.833966 1.544657 110.17165 53.90820 0.3923039
## wt_sd qsec_sd vs_sd gear_sd carb_sd mpg_n cyl_n disp_n hp_n
## 1: 0.6169816 1.792359 0.5188745 0.5063697 2.177978 13 13 13 13
## 2: 0.7774001 1.751308 0.4955946 0.4188539 1.147079 19 19 19 19
## drat_n wt_n qsec_n vs_n gear_n carb_n
## 1: 13 13 13 13 13 13
## 2: 19 19 19 19 19 19
# conditionally aggregate all variables
iris %>%
take_all(mean = if(is.numeric(.x)) mean(.x))
## Sepal.Length_mean Sepal.Width_mean Petal.Length_mean Petal.Width_mean
## 1: 5.843333 3.057333 3.758 1.199333
# aggregate all variables conditionally on name
iris %>%
take_all(
mean = if(startsWith(.name, "Sepal")) mean(.x),
median = if(startsWith(.name, "Petal")) median(.x),
by = Species
)
## Species Sepal.Length_mean Sepal.Width_mean Petal.Length_median
## 1: setosa 5.006 3.428 1.50
## 2: versicolor 5.936 2.770 4.35
## 3: virginica 6.588 2.974 5.55
## Petal.Width_median
## 1: 0.2
## 2: 1.3
## 3: 2.0
# parametric evaluation:
var = quote(mean(cyl))
mtcars %>%
let(mean_cyl = eval(var)) %>%
head()
## mpg cyl disp hp drat wt qsec vs am gear carb mean_cyl
## 1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 6.1875
## 2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 6.1875
## 3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 6.1875
## 4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 6.1875
## 5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 6.1875
## 6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 6.1875
take(mtcars, eval(var))
## eval(var)
## 1: 6.1875
# all together
new_var = "mean_cyl"
mtcars %>%
let((new_var) := eval(var)) %>%
head()
## mpg cyl disp hp drat wt qsec vs am gear carb mean_cyl
## 1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 6.1875
## 2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 6.1875
## 3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 6.1875
## 4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 6.1875
## 5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 6.1875
## 6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 6.1875
take(mtcars, (new_var) := eval(var))
## mean_cyl
## 1: 6.1875
You can use ‘columns’ inside expression in the ‘take’/‘let’. ‘columns’ will be replaced with data.table with selected columns. In ‘let’ in the expressions with ‘:=’, ‘cols’ or ‘%to%’ can be placed in the left part of the expression. It is usefull for multiple assignment. There are four ways of column selection:
a{1:3}
will be
transformed to the names ‘a1’, ‘a2’, ‘a3’. ‘cols’ is just a shortcut for
‘columns’.# range selection
iris %>%
let(
avg = rowMeans(Sepal.Length %to% Petal.Width)
) %>%
head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species avg
## 1: 5.1 3.5 1.4 0.2 setosa 2.550
## 2: 4.9 3.0 1.4 0.2 setosa 2.375
## 3: 4.7 3.2 1.3 0.2 setosa 2.350
## 4: 4.6 3.1 1.5 0.2 setosa 2.350
## 5: 5.0 3.6 1.4 0.2 setosa 2.550
## 6: 5.4 3.9 1.7 0.4 setosa 2.850
# multiassignment
iris %>%
let(
# starts with Sepal or Petal
multipled1 %to% multipled4 := cols("^(Sepal|Petal)")*2
) %>%
head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species multipled1
## 1: 5.1 3.5 1.4 0.2 setosa 10.2
## 2: 4.9 3.0 1.4 0.2 setosa 9.8
## 3: 4.7 3.2 1.3 0.2 setosa 9.4
## 4: 4.6 3.1 1.5 0.2 setosa 9.2
## 5: 5.0 3.6 1.4 0.2 setosa 10.0
## 6: 5.4 3.9 1.7 0.4 setosa 10.8
## multipled2 multipled3 multipled4
## 1: 7.0 2.8 0.4
## 2: 6.0 2.8 0.4
## 3: 6.4 2.6 0.4
## 4: 6.2 3.0 0.4
## 5: 7.2 2.8 0.4
## 6: 7.8 3.4 0.8
mtcars %>%
let(
# text expansion
cols("scaled_{names(mtcars)}") := lapply(cols("{names(mtcars)}"), scale)
) %>%
head()
## mpg cyl disp hp drat wt qsec vs am gear carb scaled_mpg scaled_cyl
## 1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 0.1508848 -0.1049878
## 2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 0.1508848 -0.1049878
## 3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 0.4495434 -1.2248578
## 4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 0.2172534 -0.1049878
## 5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 -0.2307345 1.0148821
## 6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 -0.3302874 -0.1049878
## scaled_disp scaled_hp scaled_drat scaled_wt scaled_qsec scaled_vs
## 1: -0.57061982 -0.5350928 0.5675137 -0.610399567 -0.7771651 -0.8680278
## 2: -0.57061982 -0.5350928 0.5675137 -0.349785269 -0.4637808 -0.8680278
## 3: -0.99018209 -0.7830405 0.4739996 -0.917004624 0.4260068 1.1160357
## 4: 0.22009369 -0.5350928 -0.9661175 -0.002299538 0.8904872 1.1160357
## 5: 1.04308123 0.4129422 -0.8351978 0.227654255 -0.4637808 -0.8680278
## 6: -0.04616698 -0.6080186 -1.5646078 0.248094592 1.3269868 1.1160357
## scaled_am scaled_gear scaled_carb
## 1: 1.1899014 0.4235542 0.7352031
## 2: 1.1899014 0.4235542 0.7352031
## 3: 1.1899014 0.4235542 -1.1221521
## 4: -0.8141431 -0.9318192 -1.1221521
## 5: -0.8141431 -0.9318192 -0.5030337
## 6: -0.8141431 -0.9318192 -1.1221521
# range selection in 'by'
# selection of range + additional column
mtcars %>%
take(
res = sum(cols(mpg, disp %to% drat)),
by = vs %to% gear
)
## vs am gear res
## 1: 0 1 4 589.80
## 2: 1 1 4 1177.97
## 3: 1 0 3 985.64
## 4: 0 0 3 6839.45
## 5: 1 0 4 1125.35
## 6: 0 1 5 1874.61
## 7: 1 1 5 242.27
Here we use the same datasets as with lookups:
workers = fread("
name company
Nick Acme
John Ajax
Daniela Ajax
")
positions = fread("
name position
John designer
Daniela engineer
Cathie manager
")
workers
## name company
## 1: Nick Acme
## 2: John Ajax
## 3: Daniela Ajax
positions
## name position
## 1: John designer
## 2: Daniela engineer
## 3: Cathie manager
Different kinds of joins:
workers %>% dt_inner_join(positions)
## dt_inner_join: joining, by = "name"
## name company position
## 1: John Ajax designer
## 2: Daniela Ajax engineer
workers %>% dt_left_join(positions)
## dt_left_join: joining, by = "name"
## name company position
## 1: Nick Acme <NA>
## 2: John Ajax designer
## 3: Daniela Ajax engineer
workers %>% dt_right_join(positions)
## dt_right_join: joining, by = "name"
## name company position
## 1: John Ajax designer
## 2: Daniela Ajax engineer
## 3: Cathie <NA> manager
workers %>% dt_full_join(positions)
## dt_full_join: joining, by = "name"
## name company position
## 1: Nick Acme <NA>
## 2: John Ajax designer
## 3: Daniela Ajax engineer
## 4: Cathie <NA> manager
# filtering joins
workers %>% dt_anti_join(positions)
## dt_anti_join: joining, by = "name"
## name company
## 1: Nick Acme
workers %>% dt_semi_join(positions)
## dt_semi_join: joining, by = "name"
## name company
## 1: John Ajax
## 2: Daniela Ajax
To suppress the message, supply by
argument:
workers %>% dt_left_join(positions, by = "name")
Use a named by
if the join variables have different
names:
positions2 = setNames(positions, c("worker", "position")) # rename first column in 'positions'
workers %>% dt_inner_join(positions2, by = c("name" = "worker"))
There are a small subset of ‘dplyr’ verbs to work with data.table.
Note that there is no group_by
verb - use by or keyby
argument when needed.
dt_mutate
adds new variables or modify existing
variables. If data is data.table then it modifies in-place.dt_summarize
computes summary statistics. Splits the
data into subsets, computes summary statistics for each, and returns the
result in the “data.table” form.dt_summarize_all
the same as dt_summarize
but work over all non-grouping variables.dt_filter
Selects rows/cases where conditions are true.
Rows where the condition evaluates to NA are dropped.dt_select
Selects column/variables from the data set.
Range of variables are supported, e. g. vs:carb
. Characters
which start with ^
or end with \$
considered
as Perl-style regular expression patterns. For example,
'^Petal'
returns all variables started with ‘Petal’.
'Width\$'
returns all variables which end with ‘Width’.
Pattern ^.
matches all variables and pattern
'^.*my_str'
is equivalent to contains
"my_str"
. See examples.dt_arrange
sorts dataset by variable(-s). Use ‘-’ to
sort in desending order. If data is data.table then it modifies
in-place.The same examples with ‘dplyr’-verbs:
# examples from 'dplyr'
# newly created variables are available immediately
mtcars %>%
dt_mutate(
cyl2 = cyl * 2,
cyl4 = cyl2 * 2
) %>%
head()
## mpg cyl disp hp drat wt qsec vs am gear carb cyl2 cyl4
## 1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 12 24
## 2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 12 24
## 3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 8 16
## 4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 12 24
## 5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 16 32
## 6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 12 24
# you can also use dt_mutate() to remove variables and
# modify existing variables
mtcars %>%
dt_mutate(
mpg = NULL,
disp = disp * 0.0163871 # convert to litres
) %>%
head()
## cyl disp hp drat wt qsec vs am gear carb
## 1: 6 2.621936 110 3.90 2.620 16.46 0 1 4 4
## 2: 6 2.621936 110 3.90 2.875 17.02 0 1 4 4
## 3: 4 1.769807 93 3.85 2.320 18.61 1 1 4 1
## 4: 6 4.227872 110 3.08 3.215 19.44 1 0 3 1
## 5: 8 5.899356 175 3.15 3.440 17.02 0 0 3 2
## 6: 6 3.687098 105 2.76 3.460 20.22 1 0 3 1
# window functions are useful for grouped mutates
mtcars %>%
dt_mutate(
rank = rank(-mpg, ties.method = "min"),
keyby = cyl) %>%
print()
## mpg cyl disp hp drat wt qsec vs am gear carb rank
## 1: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 8
## 2: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 7
## 3: 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 8
## 4: 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 2
## 5: 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 3
## 6: 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 1
## 7: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 10
## 8: 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 5
## 9: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 6
## 10: 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 3
## 11: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 11
## 12: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 2
## 13: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 2
## 14: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 1
## 15: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 6
## 16: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 5
## 17: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 7
## 18: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 4
## 19: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 2
## 20: 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 11
## 21: 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 4
## 22: 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 3
## 23: 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 7
## 24: 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 13
## 25: 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 13
## 26: 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 10
## 27: 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 6
## 28: 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 7
## 29: 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 12
## 30: 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 1
## 31: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 5
## 32: 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 9
## mpg cyl disp hp drat wt qsec vs am gear carb rank
# You can drop variables by setting them to NULL
mtcars %>% dt_mutate(cyl = NULL) %>% head()
## mpg disp hp drat wt qsec vs am gear carb
## 1: 21.0 160 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 160 110 3.90 2.875 17.02 0 1 4 4
## 3: 22.8 108 93 3.85 2.320 18.61 1 1 4 1
## 4: 21.4 258 110 3.08 3.215 19.44 1 0 3 1
## 5: 18.7 360 175 3.15 3.440 17.02 0 0 3 2
## 6: 18.1 225 105 2.76 3.460 20.22 1 0 3 1
# A summary applied without by returns a single row
mtcars %>%
dt_summarise(mean = mean(disp), n = .N)
## mean n
## 1: 230.7219 32
# Usually, you'll want to group first
mtcars %>%
dt_summarise(mean = mean(disp), n = .N, by = cyl)
## cyl mean n
## 1: 6 183.3143 7
## 2: 4 105.1364 11
## 3: 8 353.1000 14
# Multiple 'by' - variables
mtcars %>%
dt_summarise(cyl_n = .N, by = list(cyl, vs))
## cyl vs cyl_n
## 1: 6 0 3
## 2: 4 1 10
## 3: 6 1 4
## 4: 8 0 14
## 5: 4 0 1
# Newly created summaries immediately
# doesn't overwrite existing variables
mtcars %>%
dt_summarise(disp = mean(disp),
sd = sd(disp),
by = cyl)
## cyl disp sd
## 1: 6 183.3143 41.56246
## 2: 4 105.1364 26.87159
## 3: 8 353.1000 67.77132
# You can group by expressions:
mtcars %>%
dt_summarise_all(mean, by = list(vsam = vs + am))
## vsam mpg cyl disp hp drat wt qsec
## 1: 1 20.28462 5.692308 189.4692 138.46154 3.738462 3.038846 18.04231
## 2: 2 28.37143 4.000000 89.8000 80.57143 4.148571 2.028286 18.70000
## 3: 0 15.05000 8.000000 357.6167 194.16667 3.120833 4.104083 17.14250
## gear carb
## 1: 4.076923 3.307692
## 2: 4.142857 1.428571
## 3: 3.000000 3.083333
# filter by condition
mtcars %>%
dt_filter(am==0)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 2: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 3: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 4: 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 5: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 6: 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 7: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 8: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 9: 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 10: 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 11: 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 12: 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 13: 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 14: 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 15: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 16: 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 17: 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 18: 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 19: 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
# filter by compound condition
mtcars %>%
dt_filter(am==0, mpg>mean(mpg))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 2: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 3: 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 4: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
# select
mtcars %>%
dt_select(vs:carb, cyl) %>%
head()
## vs am gear carb cyl
## Mazda RX4 0 1 4 4 6
## Mazda RX4 Wag 0 1 4 4 6
## Datsun 710 1 1 4 1 4
## Hornet 4 Drive 1 0 3 1 6
## Hornet Sportabout 0 0 3 2 8
## Valiant 1 0 3 1 6
mtcars %>%
dt_select(-am, -cyl) %>%
head()
## mpg disp hp drat wt qsec vs gear carb
## Mazda RX4 21.0 160 110 3.90 2.620 16.46 0 4 4
## Mazda RX4 Wag 21.0 160 110 3.90 2.875 17.02 0 4 4
## Datsun 710 22.8 108 93 3.85 2.320 18.61 1 4 1
## Hornet 4 Drive 21.4 258 110 3.08 3.215 19.44 1 3 1
## Hornet Sportabout 18.7 360 175 3.15 3.440 17.02 0 3 2
## Valiant 18.1 225 105 2.76 3.460 20.22 1 3 1
# regular expression pattern
dt_select(iris, "^Petal") %>% head() # variables which start from 'Petal'
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
dt_select(iris, "Width$") %>% head() # variables which end with 'Width'
## Sepal.Width Petal.Width
## 1 3.5 0.2
## 2 3.0 0.2
## 3 3.2 0.2
## 4 3.1 0.2
## 5 3.6 0.2
## 6 3.9 0.4
# move Species variable to the front
# pattern "^." matches all variables
dt_select(iris, Species, "^.") %>% head()
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.1 3.5 1.4 0.2
## 2 setosa 4.9 3.0 1.4 0.2
## 3 setosa 4.7 3.2 1.3 0.2
## 4 setosa 4.6 3.1 1.5 0.2
## 5 setosa 5.0 3.6 1.4 0.2
## 6 setosa 5.4 3.9 1.7 0.4
# pattern "^.*al" means "contains 'al'"
dt_select(iris, "^.*al") %>% head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
dt_select(iris, 1:4) %>% head() # numeric indexing - all variables except Species
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
# sorting
dt_arrange(mtcars, cyl, disp)
dt_arrange(mtcars, -disp)