bookclub-advr

DSLC Advanced R Book Club
git clone https://git.eamoncaddigan.net/bookclub-advr.git
Log | Files | Refs | README | LICENSE

04.qmd (6283B)


      1 ---
      2 engine: knitr
      3 title: Subsetting
      4 ---
      5 
      6 ## Learning objectives:
      7 
      8 - Select multiple elements from a vector with `[`
      9 - Select single elements from a vector with `[[` and `$`
     10 - Assign to subsets of vectors
     11 - Use subsetting to expand data
     12 
     13 # Selecting multiple elements
     14 
     15 ## 1. Positive integers return elements at specified positions
     16 
     17 ```{r}
     18 #| label: positive_int
     19 x <- c(1.1, 2.2, 3.3, 4.4) # decimal = original position
     20 x
     21 x[c(4, 1)]
     22 x[c(1, 1, 1)]
     23 x[c(1.9999)]
     24 ```
     25 
     26 Reals *truncate* to integers.
     27 
     28 ```{r}
     29 #| label: positive_real
     30 x[c(1.0001, 1.9999)]
     31 ```
     32 
     33 ## 2. Negative integers remove specified elements
     34 
     35 ```{r}
     36 #| label: negative_int
     37 x[-c(1, 3)] # same as x[c(-1, -3)] or x[c(2, 4)]
     38 ```
     39 
     40 ## 2b. Mixing negative and positive integers throws an error
     41 
     42 ```{r}
     43 #| label: mixed_int
     44 #| error: true
     45 x[c(-1, 3)]
     46 ```
     47 
     48 ## 2c. Zeros are ignored when mixed with other integers
     49 
     50 ```{r}
     51 #| label: negative_int_zero
     52 x[c(-1, 0)]
     53 x[c(-1, 0, 0, 0, 0, 0 ,0 ,0)]
     54 x[c(1, 0, 2, 0, 3, 0)]
     55 ```
     56 
     57 
     58 ## 3. Logical vectors select specified elements
     59 
     60 ```{r}
     61 #| label: logical_vec
     62 x[c(TRUE, TRUE, FALSE, TRUE)]
     63 x[x < 3]
     64 
     65 cond <- x > 2.5
     66 x[cond]
     67 ```
     68 
     69 ## 3b. Shorter logical indices are recycled to the length of the vector
     70 
     71 ```{r}
     72 #| label: recycling
     73 x[FALSE]
     74 x[TRUE]
     75 x[c(FALSE, TRUE)] # equivalent to: x[c(FALSE, TRUE, FALSE, TRUE)]
     76 ```
     77 
     78 - In `x[y]`, recycling is easy to understand if `x` or `y` has length 1; best to avoid it for other lengths
     79 
     80 ## 3c. NA index returns NA
     81 
     82 ```{r}
     83 #| label: missing_index
     84 x[c(NA, TRUE, NA, TRUE)]
     85 ```
     86 ## 3d. Extra TRUE index returns NA
     87 
     88 ```{r}
     89 #| label: extra_index
     90 x[c(FALSE, TRUE, TRUE, TRUE, TRUE, TRUE)]
     91 x[1:5]
     92 ```
     93 
     94 ## 4. Indexing with nothing returns original vector
     95 
     96 ```{r nothing}
     97 x[]
     98 ```
     99 
    100 ## 5. Indexing with just 0 returns a 0-length vector (of the same type)
    101 
    102 ```{r zero}
    103 x[0]
    104 letters[0]
    105 ```
    106 
    107 ## 6. Indexing with a character vector returns elements of a named vector
    108 
    109 ```{r character}
    110 (y <- setNames(x, letters[1:4]))
    111 y[c("d", "b", "a")]
    112 y[c("a", "a", "a")]
    113 ```
    114 
    115 ## 6b. Names must be exact for `[`
    116 
    117 ```{r}
    118 #| label: exact_names
    119 z <- c(abc = 1, def = 2)
    120 z
    121 z[c("a", "d")]
    122 ```
    123 
    124 ## Subsetting a list with `[` returns a list
    125 
    126 ```{r}
    127 #| label: list_subset_basics
    128 my_list <- list(a = c(TRUE, FALSE), b = letters[5:15], c = 100:108)
    129 my_list
    130 my_list[c("a", "b")] # `[` keeps the container: the result is still a list
    131 ```
    132 
    133 ## Lists use same rules for `[`
    134 
    135 ```{r} 
    136 #| label: list_subset_multiple
    137 my_list[2:3]
    138 my_list[c(TRUE, FALSE, TRUE)]
    139 ```
    140 
    141 ## Matrices & arrays take multidimensional indices
    142 
    143 ```{r}
    144 #| label: array_subset
    145 a <- matrix(1:9, nrow = 3)
    146 a
    147 a[1:2, 2:3] # rows, columns
    148 ```
    149 
    150 ## Matrices & arrays can accept character, logical, etc
    151 
    152 ```{r}
    153 #| label: array_named
    154 colnames(a) <- c("A", "B", "C")
    155 a[c(TRUE, TRUE, FALSE), c("B", "A")] # a[1:2, 2:1]
    156 ```
    157 
    158 ## Matrices & arrays are also vectors
    159 
    160 ```{r}
    161 #| label: array_vector
    162 vals <- outer(1:5, 1:5, FUN = "paste", sep = ",") # All chr combos of 1:5
    163 vals
    164 vals[c(4, 15)]
    165 a[a > 5]
    166 ```
    167 
    168 ## Data frames subset list-like with single index
    169 
    170 ```{r}
    171 #| label: df_subset1
    172 df <- data.frame(x = 1:3, y = 3:1, z = letters[1:3])
    173 df[1:2]
    174 df[c("x", "z")]
    175 ```
    176 
    177 ## Data frames subset matrix-like with multiple indices
    178 
    179 ```{r}
    180 df[1:2, c("x", "z")] # rows, columns
    181 df[df$x == 2, ] # matching rows, all columns
    182 df[, c("x", "z")] # all rows; same columns as df[c("x", "z")]
    183 ```
    184 
    185 ## Subsetting a tibble with `[` returns a tibble
    186 
    187 ```{r}
    188 tbl <- tibble::as_tibble(df)
    189 df[, 1] # data frame: a single column is simplified to a vector
    190 df[, 1, drop = FALSE] # drop = FALSE keeps the result a data frame
    191 tbl[, 1] # tibble: a single column stays a tibble
    192 ```
    193 
    194 # Selecting a single element
    195 
    196 ## `[[` selects a single element
    197 
    198 :::: columns
    199 
    200 ::: column
    201 ```{r}
    202 x <- list(1:3, "a", 4:6)
    203 x[1]
    204 class(x[1])
    205 x[[1]]
    206 class(x[[1]])
    207 x[[1]][[1]]
    208 ```
    209 :::
    210 
    211 ::: column
    212 
    213 ![](images/subsetting/hadley-tweet.png)
    214 :::
    215 
    216 ::::
    217 
    218 ## `$` is shorthand for `[[..., exact = FALSE]]`
    219 
    220 ```{r}
    221 #| label: dollar_subset
    222 #| warning: true
    223 x <- list(abc = 1)
    224 x$abc
    225 x$a
    226 x[["a"]]
    227 x[["a", exact = FALSE]]
    228 
    229 options(warnPartialMatchDollar = TRUE)
    230 x$a
    231 ```
    232 
    233 ## Behavior for missing-ish indices is inconsistent
    234 
    235 ```{r}
    236 #| label: missingish_indices
    237 #| error: true
    238 a <- c(a = 1L, b = 2L)
    239 lst <- list(a = 1:2)
    240 
    241 # Errors:
    242 # a[[NULL]]
    243 # lst[[NULL]]
    244 # a[[5]]
    245 # lst[[5]]
    246 # a[["c"]]
    247 # a[[NA]]
    248 
    249 lst[["c"]]
    250 lst[[NA]]
    251 ```
    252 
    253 ## `purrr::pluck()` and `purrr::chuck()` provide consistent wrappers
    254 
    255 - `purrr::pluck()` always returns `NULL` (or `.default`) when a (non-`NULL`) index is missing
    256 - `purrr::chuck()` always throws an error
    257 
    258 ```{r}
    259 purrr::pluck(a, 5)
    260 purrr::pluck(a, "c")
    261 purrr::pluck(lst, 5)
    262 purrr::pluck(lst, "c")
    263 ```
    264 
    265 ## S4 has two additional subsetting operators
    266 
    267 - `@` is equivalent to `$` (but throws an error if the slot does not exist)
    268 - `slot()` equivalent to `[[`
    269 
    270 More in Chapter 15
    271 
    272 # Subsetting and assignment
    273 
    274 ## Can assign to position with `[`
    275 
    276 ```{r}
    277 x <- 1:5
    278 x[1:2] <- c(101, 102)
    279 x
    280 x[1:3] <- 1:2
    281 x
    282 ```
    283 
    284 ## Remove list component with `NULL`
    285 
    286 ```{r}
    287 x <- list(a = 1, b = 2)
    288 x[["b"]] <- NULL
    289 x
    290 ```
    291 
    292 ## Use `list(NULL)` to add `NULL`
    293 
    294 ```{r}
    295 x <- list(a = 1, b = 2)
    296 x["b"] <- list(NULL) # `[<-` stores a literal NULL; `[[<-` would store list(NULL)
    297 x
    298 ```
    299 
    300 ## Subset with nothing to retain shape
    301 
    302 ```{r}
    303 df <- data.frame(a = 1:3, b = 1:3)
    304 df[] <- "a"
    305 df
    306 df <- "a"
    307 df
    308 ```
    309 
    310 # Applications
    311 
    312 ## Use a lookup vector and character subsetting to translate values
    313 
    314 ```{r}
    315 x <- c("b", "g", "x", "g", "g", "b")
    316 lookup <- c(b = "blue", g = "green", x = NA)
    317 lookup[x]
    318 unname(lookup[x])
    319 ```
    320 
    321 ## Use a lookup table to generate rows of data
    322 
    323 ```{r}
    324 info <- data.frame(
    325   code = c("b", "g", "x"),
    326   color = c("blue", "green", NA),
    327   other_thing = 3:1
    328 )
    329 match(x, info$code) # For each element of x, its position in info$code
    330 info[match(x, info$code), ] # One row of info per element of x
    331 ```
    332 
    333 ## Sort with `order()`
    334 
    335 ```{r}
    336 x <- c("b", "c", "a")
    337 order(x)
    338 x[order(x)]
    339 
    340 df <- data.frame(b = 3:1, a = 1:3)
    341 df[order(df$b), ]
    342 df[, order(names(df))]
    343 ```
    344 
    345 ## Expand counts
    346 
    347 ```{r}
    348 df <- data.frame(x = c(2, 4, 1), y = c(9, 11, 6), n = c(3, 5, 1))
    349 rep(seq_len(nrow(df)), df$n) # row index i repeated df$n[i] times
    350 df[rep(seq_len(nrow(df)), df$n), ] # seq_len() is also safe for zero-row input
    351 ```
    352 
    353 ## Ran out of time to make slides for
    354 
    355 Ideally a future cohort should expand these:
    356 
    357 - Remove df columns with `setdiff()`
    358 - Logically subset rows `df[df$col > 5, ]`
    359 - The next slide about `which()`
    360 
    361 ## Boolean algebra versus sets (logical and integer)
    362 
    363 - `which()` gives the indices of the `TRUE` elements of a Boolean vector
    364 
    365 ```{r, eval=FALSE}
    366 (x1 <- 1:10 %% 2 == 0) # 1-10 divisible by 2
    367 #  [1] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    368 (x2 <- which(x1))
    369 # [1]  2  4  6  8 10
    370 (y1 <- 1:10 %% 5 == 0) # 1-10 divisible by 5
    371 #  [1] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
    372 (y2 <- which(y1))
    373 # [1]  5 10
    374 x1 & y1
    375 # [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
    376 ```