Compatibility with dplyr

The linelist philosophy is to prevent you from accidentally losing valuable data, while otherwise remaining totally transparent and not interfering with your workflow.

One popular ecosystem for data science workflows is the tidyverse, and we go the extra mile to ensure that linelist is compatible with it. All dplyr verbs are thoroughly tested in the tests/test-compat-dplyr.R file.

library(linelist)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

data("measles_hagelloch_1861", package = "outbreaks")

x <- make_linelist(
  measles_hagelloch_1861,
  id = "case_ID",
  date_onset = "date_of_prodrome",
  age = "age",
  gender = "gender"
)

head(x)
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1       1       45       1861-11-21   1861-11-25          <NA>   7      f
#> 2       2       45       1861-11-23   1861-11-27          <NA>   6      f
#> 3       3      172       1861-11-28   1861-12-02          <NA>   4      f
#> 4       4      180       1861-11-27   1861-11-28          <NA>  13      m
#> 5       5       45       1861-11-22   1861-11-27          <NA>   8      f
#> 6       6      180       1861-11-26   1861-11-29          <NA>  12      m
#>   family_ID class complications x_loc y_loc
#> 1        41     1           yes 142.5 100.0
#> 2        41     1           yes 142.5 100.0
#> 3        41     0           yes 142.5 100.0
#> 4        61     2           yes 165.0 102.5
#> 5        42     1           yes 145.0 120.0
#> 6        42     2           yes 145.0 120.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

Verbs operating on rows

linelist does not modify anything regarding the behaviour for row-operations. As such, it is fully compatible with dplyr verbs operating on rows out-of-the-box. You can see in the following examples that linelist does not produce any errors, warnings or messages and its tags are conserved through dplyr operations on rows.

dplyr::arrange()

x %>%
  arrange(case_ID) %>%
  head()
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1       1       45       1861-11-21   1861-11-25          <NA>   7      f
#> 2       2       45       1861-11-23   1861-11-27          <NA>   6      f
#> 3       3      172       1861-11-28   1861-12-02          <NA>   4      f
#> 4       4      180       1861-11-27   1861-11-28          <NA>  13      m
#> 5       5       45       1861-11-22   1861-11-27          <NA>   8      f
#> 6       6      180       1861-11-26   1861-11-29          <NA>  12      m
#>   family_ID class complications x_loc y_loc
#> 1        41     1           yes 142.5 100.0
#> 2        41     1           yes 142.5 100.0
#> 3        41     0           yes 142.5 100.0
#> 4        61     2           yes 165.0 102.5
#> 5        42     1           yes 145.0 120.0
#> 6        42     2           yes 145.0 120.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

dplyr::distinct()

x %>%
  distinct() %>%
  head()
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1       1       45       1861-11-21   1861-11-25          <NA>   7      f
#> 2       2       45       1861-11-23   1861-11-27          <NA>   6      f
#> 3       3      172       1861-11-28   1861-12-02          <NA>   4      f
#> 4       4      180       1861-11-27   1861-11-28          <NA>  13      m
#> 5       5       45       1861-11-22   1861-11-27          <NA>   8      f
#> 6       6      180       1861-11-26   1861-11-29          <NA>  12      m
#>   family_ID class complications x_loc y_loc
#> 1        41     1           yes 142.5 100.0
#> 2        41     1           yes 142.5 100.0
#> 3        41     0           yes 142.5 100.0
#> 4        61     2           yes 165.0 102.5
#> 5        42     1           yes 145.0 120.0
#> 6        42     2           yes 145.0 120.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

dplyr::filter()

x %>%
  filter(age >= 10) %>%
  head()
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1       4      180       1861-11-27   1861-11-28          <NA>  13      m
#> 2       6      180       1861-11-26   1861-11-29          <NA>  12      m
#> 3       8       45       1861-11-21   1861-11-26          <NA>  10      m
#> 4       9      182       1861-11-26   1861-11-30          <NA>  13      m
#> 5      11      182       1861-11-25   1861-11-30          <NA>  11      f
#> 6      13       12       1861-11-30   1861-12-05          <NA>  13      m
#>   family_ID class complications x_loc y_loc
#> 1        61     2           yes 165.0 102.5
#> 2        42     2           yes 145.0 120.0
#> 3        44     1           yes  97.5 155.0
#> 4        44     2           yes  97.5 155.0
#> 5        27     2           yes 270.0 135.0
#> 6        32     2           yes 195.0  27.5
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

dplyr::slice()

x %>%
  slice(5:10)
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1       5       45       1861-11-22   1861-11-27          <NA>   8      f
#> 2       6      180       1861-11-26   1861-11-29          <NA>  12      m
#> 3       7       42       1861-11-24   1861-11-28          <NA>   6      m
#> 4       8       45       1861-11-21   1861-11-26          <NA>  10      m
#> 5       9      182       1861-11-26   1861-11-30          <NA>  13      m
#> 6      10       45       1861-11-21   1861-11-25          <NA>   7      f
#>   family_ID class complications x_loc y_loc
#> 1        42     1           yes 145.0 120.0
#> 2        42     2           yes 145.0 120.0
#> 3        26     0           yes 272.5 147.5
#> 4        44     1           yes  97.5 155.0
#> 5        44     2           yes  97.5 155.0
#> 6        29     1           yes 240.0  75.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

x %>%
  slice_head(n = 5)
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1       1       45       1861-11-21   1861-11-25          <NA>   7      f
#> 2       2       45       1861-11-23   1861-11-27          <NA>   6      f
#> 3       3      172       1861-11-28   1861-12-02          <NA>   4      f
#> 4       4      180       1861-11-27   1861-11-28          <NA>  13      m
#> 5       5       45       1861-11-22   1861-11-27          <NA>   8      f
#>   family_ID class complications x_loc y_loc
#> 1        41     1           yes 142.5 100.0
#> 2        41     1           yes 142.5 100.0
#> 3        41     0           yes 142.5 100.0
#> 4        61     2           yes 165.0 102.5
#> 5        42     1           yes 145.0 120.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

x %>%
  slice_tail(n = 5)
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1     184       NA       1861-10-30   1861-11-06          <NA>  13   <NA>
#> 2     185       82       1861-12-03   1861-12-07          <NA>   3      m
#> 3     186       45       1861-11-22   1861-11-26          <NA>   6   <NA>
#> 4     187       82       1861-12-07   1861-12-11          <NA>   0      m
#> 5     188      175       1861-11-23   1861-11-27          <NA>   1   <NA>
#>   family_ID class complications x_loc y_loc
#> 1        51     2           yes 182.5 200.0
#> 2        21     0           yes 205.0 182.5
#> 3        57     0           yes 212.5  90.0
#> 4        21     0           yes 205.0 182.5
#> 5        57     0           yes 212.5  90.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

x %>%
  slice_min(age, n = 3)
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1     113       31       1861-12-04   1861-12-07          <NA>   0      f
#> 2     119      116       1861-12-01   1861-12-08          <NA>   0      f
#> 3     147       18       1861-12-03   1861-12-07          <NA>   0      f
#> 4     150      148       1861-12-11   1861-12-15          <NA>   0      m
#> 5     160       68       1861-12-12   1861-12-13          <NA>   0      f
#> 6     167      110       1861-12-14   1861-12-18          <NA>   0      m
#> 7     171      169       1861-12-15   1861-12-17          <NA>   0      m
#> 8     176      146       1861-12-11   1861-12-15          <NA>   0   <NA>
#> 9     187       82       1861-12-07   1861-12-11          <NA>   0      m
#>   family_ID class complications x_loc y_loc
#> 1        15     0           yes 125.0 187.5
#> 2        40     0           yes 127.5 147.5
#> 3        13     0           yes  72.5 152.5
#> 4        19     0           yes 255.0 230.0
#> 5        16     0           yes 165.0 192.5
#> 6        49     0           yes 175.0 140.0
#> 7        38     0           yes 132.5  80.0
#> 8        64     0           yes  72.5 152.5
#> 9        21     0           yes 205.0 182.5
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

x %>%
  slice_max(age, n = 3)
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1      16      181       1861-11-21   1861-11-25          <NA>  15      f
#> 2      62       11       1861-12-02   1861-12-06          <NA>  14      m
#> 3     117      116       1861-12-02   1861-12-06          <NA>  14      m
#>   family_ID class complications x_loc y_loc
#> 1        43     2           yes 172.5 172.5
#> 2         8     2           yes 270.0 102.5
#> 3        40     2           yes 127.5 147.5
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

x %>%
  slice_sample(n = 5)
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1      38       17       1861-11-29   1861-12-05          <NA>  12   <NA>
#> 2     156       45       1861-11-22   1861-11-24          <NA>   8      m
#> 3     119      116       1861-12-01   1861-12-08          <NA>   0      f
#> 4     120        7       1861-12-06   1861-12-08          <NA>   1      m
#> 5      24       22       1861-11-30   1861-12-04          <NA>   4      m
#>   family_ID class complications x_loc y_loc
#> 1        54     2           yes 280.0 192.5
#> 2        31     1           yes 182.5  55.0
#> 3        40     0           yes 127.5 147.5
#> 4        26     0           yes 272.5 147.5
#> 5        35     0           yes 167.5   5.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

Verbs operating on columns

During operations on columns, linelist will:

  • stay invisible and conserve tags if no tagged column is affected by the operation
  • trigger lost_tags_action() if tagged columns are affected by the operation

dplyr::mutate() ✓ (partial)

Compatibility with dplyr::mutate() is incomplete: simple renames that do not actually modify the column don’t update the tags. In this scenario, users should use dplyr::rename() instead.

Although dplyr::mutate() is not able to leverage the full power of linelist tags, linelist objects behave as expected, the same way a data.frame would:

# In place modification doesn't lose tags
x %>%
  mutate(age = as.integer(age)) %>%
  head()
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1       1       45       1861-11-21   1861-11-25          <NA>   7      f
#> 2       2       45       1861-11-23   1861-11-27          <NA>   6      f
#> 3       3      172       1861-11-28   1861-12-02          <NA>   4      f
#> 4       4      180       1861-11-27   1861-11-28          <NA>  13      m
#> 5       5       45       1861-11-22   1861-11-27          <NA>   8      f
#> 6       6      180       1861-11-26   1861-11-29          <NA>  12      m
#>   family_ID class complications x_loc y_loc
#> 1        41     1           yes 142.5 100.0
#> 2        41     1           yes 142.5 100.0
#> 3        41     0           yes 142.5 100.0
#> 4        61     2           yes 165.0 102.5
#> 5        42     1           yes 145.0 120.0
#> 6        42     2           yes 145.0 120.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

# New columns don't affect existing tags
x %>%
  mutate(major = age >= 18) %>%
  head()
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1       1       45       1861-11-21   1861-11-25          <NA>   7      f
#> 2       2       45       1861-11-23   1861-11-27          <NA>   6      f
#> 3       3      172       1861-11-28   1861-12-02          <NA>   4      f
#> 4       4      180       1861-11-27   1861-11-28          <NA>  13      m
#> 5       5       45       1861-11-22   1861-11-27          <NA>   8      f
#> 6       6      180       1861-11-26   1861-11-29          <NA>  12      m
#>   family_ID class complications x_loc y_loc major
#> 1        41     1           yes 142.5 100.0 FALSE
#> 2        41     1           yes 142.5 100.0 FALSE
#> 3        41     0           yes 142.5 100.0 FALSE
#> 4        61     2           yes 165.0 102.5 FALSE
#> 5        42     1           yes 145.0 120.0 FALSE
#> 6        42     2           yes 145.0 120.0 FALSE
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

# .keep = "unused" generates the expected tag loss conditions
x %>%
  mutate(edad = age, .keep = "unused") %>%
  head()
#> Warning: The following tags have lost their variable:
#>  age:age
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death gender family_ID
#> 1       1       45       1861-11-21   1861-11-25          <NA>      f        41
#> 2       2       45       1861-11-23   1861-11-27          <NA>      f        41
#> 3       3      172       1861-11-28   1861-12-02          <NA>      f        41
#> 4       4      180       1861-11-27   1861-11-28          <NA>      m        61
#> 5       5       45       1861-11-22   1861-11-27          <NA>      f        42
#> 6       6      180       1861-11-26   1861-11-29          <NA>      m        42
#>   class complications x_loc y_loc edad
#> 1     1           yes 142.5 100.0    7
#> 2     1           yes 142.5 100.0    6
#> 3     0           yes 142.5 100.0    4
#> 4     2           yes 165.0 102.5   13
#> 5     1           yes 145.0 120.0    8
#> 6     2           yes 145.0 120.0   12
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender

dplyr::pull()

dplyr::pull() returns a vector, which results, as expected, in the loss of the linelist class and tags:

x %>%
  pull(age)
#>   [1]  7  6  4 13  8 12  6 10 13  7 11  7 13 13  8 15 10  2 11 10 13 10  7  4 12
#>  [26]  7  5 10 13 11  9  7  7 11 13 11 13 12 10 13 12  4  2 10  7 13 11  3 10  6
#>  [51]  4 13  6  4 11  8  3  9 10  2  5 14 12  7  2  5 11  2  1 13 10 10 11 10 13
#>  [76]  2  8 11  5 12 12  8 10  6  5  3 12 10  3 11  4  2  8  4  1  2 10  3  5 12
#> [101]  7 12 12  5  3  4 12  6  6  3 12 10  0 13 11  8 14  2  0  1 10  1  1  3  2
#> [126]  5  1  5  4 12  1 11  2 13  2 13 10 11 13  2  4  5 11  2  8  4  0 13  4  0
#> [151]  2  4 10  6 13  8  4  3  2  0  6  6  1  3  2  1  0  1  4 10  0  3  6  3  2
#> [176]  0  8  4  1 10 10 13  4 13  3  6  0  1

dplyr::relocate()

x %>%
  relocate(date_of_prodrome, .before = 1) %>%
  head()
#> 
#> // linelist object
#>   date_of_prodrome case_ID infector date_of_rash date_of_death age gender
#> 1       1861-11-21       1       45   1861-11-25          <NA>   7      f
#> 2       1861-11-23       2       45   1861-11-27          <NA>   6      f
#> 3       1861-11-28       3      172   1861-12-02          <NA>   4      f
#> 4       1861-11-27       4      180   1861-11-28          <NA>  13      m
#> 5       1861-11-22       5       45   1861-11-27          <NA>   8      f
#> 6       1861-11-26       6      180   1861-11-29          <NA>  12      m
#>   family_ID class complications x_loc y_loc
#> 1        41     1           yes 142.5 100.0
#> 2        41     1           yes 142.5 100.0
#> 3        41     0           yes 142.5 100.0
#> 4        61     2           yes 165.0 102.5
#> 5        42     1           yes 145.0 120.0
#> 6        42     2           yes 145.0 120.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

dplyr::rename() & dplyr::rename_with()

dplyr::rename() is fully compatible out-of-the-box with linelist, meaning that tags will be updated at the same time that columns are renamed. This is possible because it uses names<-() under the hood, for which linelist provides a custom names<-.linelist() method:

x %>%
  rename(edad = age) %>%
  head()
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death edad gender
#> 1       1       45       1861-11-21   1861-11-25          <NA>    7      f
#> 2       2       45       1861-11-23   1861-11-27          <NA>    6      f
#> 3       3      172       1861-11-28   1861-12-02          <NA>    4      f
#> 4       4      180       1861-11-27   1861-11-28          <NA>   13      m
#> 5       5       45       1861-11-22   1861-11-27          <NA>    8      f
#> 6       6      180       1861-11-26   1861-11-29          <NA>   12      m
#>   family_ID class complications x_loc y_loc
#> 1        41     1           yes 142.5 100.0
#> 2        41     1           yes 142.5 100.0
#> 3        41     0           yes 142.5 100.0
#> 4        61     2           yes 165.0 102.5
#> 5        42     1           yes 145.0 120.0
#> 6        42     2           yes 145.0 120.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:edad

x %>%
  rename_with(toupper) %>%
  head()
#> 
#> // linelist object
#>   CASE_ID INFECTOR DATE_OF_PRODROME DATE_OF_RASH DATE_OF_DEATH AGE GENDER
#> 1       1       45       1861-11-21   1861-11-25          <NA>   7      f
#> 2       2       45       1861-11-23   1861-11-27          <NA>   6      f
#> 3       3      172       1861-11-28   1861-12-02          <NA>   4      f
#> 4       4      180       1861-11-27   1861-11-28          <NA>  13      m
#> 5       5       45       1861-11-22   1861-11-27          <NA>   8      f
#> 6       6      180       1861-11-26   1861-11-29          <NA>  12      m
#>   FAMILY_ID CLASS COMPLICATIONS X_LOC Y_LOC
#> 1        41     1           yes 142.5 100.0
#> 2        41     1           yes 142.5 100.0
#> 3        41     0           yes 142.5 100.0
#> 4        61     2           yes 165.0 102.5
#> 5        42     1           yes 145.0 120.0
#> 6        42     2           yes 145.0 120.0
#> 
#> // tags: id:CASE_ID, date_onset:DATE_OF_PRODROME, gender:GENDER, age:AGE

dplyr::select()

dplyr::select() is fully compatible with linelist, including when columns are renamed in a select():

# Works fine
x %>%
  select(case_ID, date_of_prodrome, gender, age) %>%
  head()
#> 
#> // linelist object
#>   case_ID date_of_prodrome gender age
#> 1       1       1861-11-21      f   7
#> 2       2       1861-11-23      f   6
#> 3       3       1861-11-28      f   4
#> 4       4       1861-11-27      m  13
#> 5       5       1861-11-22      f   8
#> 6       6       1861-11-26      m  12
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

# Tags are updated!
x %>%
  select(case_ID, date_of_prodrome, gender, edad = age) %>%
  head()
#> 
#> // linelist object
#>   case_ID date_of_prodrome gender edad
#> 1       1       1861-11-21      f    7
#> 2       2       1861-11-23      f    6
#> 3       3       1861-11-28      f    4
#> 4       4       1861-11-27      m   13
#> 5       5       1861-11-22      f    8
#> 6       6       1861-11-26      m   12
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:edad

Verbs operating on groups ✘

Groups are not yet supported. Applying any verb operating on groups to a linelist will silently convert it back to a data.frame or tibble.

Verbs operating on data.frames

dplyr::bind_rows()

dim(x)
#> [1] 188  12

dim(bind_rows(x, x))
#> [1] 376  12

dplyr::bind_cols()

bind_cols() is currently incompatible with linelist:

  • Tags from the second element are lost
  • Warnings are produced about lost tags, even for tags that are not actually lost

bind_cols(
  suppressWarnings(select(x, case_ID, date_of_prodrome)),
  suppressWarnings(select(x, age, gender))
) %>%
  head()
#> Warning: The following tags have lost their variable:
#>  id:case_ID, date_onset:date_of_prodrome
#> Warning: The following tags have lost their variable:
#>  gender:gender, age:age
#> 
#> // linelist object
#>   case_ID date_of_prodrome age gender
#> 1       1       1861-11-21   7      f
#> 2       2       1861-11-23   6      f
#> 3       3       1861-11-28   4      f
#> 4       4       1861-11-27  13      m
#> 5       5       1861-11-22   8      f
#> 6       6       1861-11-26  12      m
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome

Joins ✘

Joins are currently not compatible with linelist as tags from the second element are silently dropped.

full_join(
  suppressWarnings(select(x, case_ID, date_of_prodrome)),
  suppressWarnings(select(x, case_ID, age, gender))
) %>%
  head()
#> Joining with `by = join_by(case_ID)`
#> 
#> // linelist object
#>   case_ID date_of_prodrome age gender
#> 1       1       1861-11-21   7      f
#> 2       2       1861-11-23   6      f
#> 3       3       1861-11-28   4      f
#> 4       4       1861-11-27  13      m
#> 5       5       1861-11-22   8      f
#> 6       6       1861-11-26  12      m
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome

Verbs operating on multiple columns

dplyr::pick()

pick() makes tidyselect functions work in usually tidyselect-incompatible functions, such as:

x %>%
  dplyr::arrange(dplyr::pick(ends_with("loc"))) %>%
  head()
#> 
#> // linelist object
#>   case_ID infector date_of_prodrome date_of_rash date_of_death age gender
#> 1      26       45       1861-11-22   1861-11-27          <NA>   7      m
#> 2      28      180       1861-11-25   1861-11-30          <NA>  10      f
#> 3     146      172       1861-12-01   1861-12-07          <NA>   4      f
#> 4     147       18       1861-12-03   1861-12-07          <NA>   0      f
#> 5     176      146       1861-12-11   1861-12-15          <NA>   0   <NA>
#> 6     115       16       1861-12-01   1861-12-07          <NA>  11      f
#>   family_ID class complications x_loc y_loc
#> 1        67     1           yes   7.5  37.5
#> 2        65     2           yes  15.0  47.5
#> 3        13     0           yes  72.5 152.5
#> 4        13     0           yes  72.5 152.5
#> 5        64     0           yes  72.5 152.5
#> 6        66     2           yes  75.0  20.0
#> 
#> // tags: id:case_ID, date_onset:date_of_prodrome, gender:gender, age:age

As such, we could expect it to work with linelist’s custom tidyselect-like function, has_tag(), but this is not the case because pick() currently strips out all attributes, including the linelist class and all tags. This unclassing is documented in ?pick:

pick() returns a data frame containing the selected columns for the current group.