04.qmd (6283B)
---
engine: knitr
title: Subsetting
---

## Learning objectives:

- Select multiple elements from a vector with `[`
- Select single elements from a vector with `[[` and `$`
- Assign to subsets of vectors
- Use subsetting to expand data

# Selecting multiple elements

## 1. Positive integers return elements at specified positions

```{r}
#| label: positive_int
x <- c(1.1, 2.2, 3.3, 4.4) # decimal = original position
x
x[c(4, 1)]
x[c(1, 1, 1)]
x[c(1.9999)]
```

Reals *truncate* to integers.

```{r}
#| label: positive_real
x[c(1.0001, 1.9999)]
```

## 2. Negative integers remove specified elements

```{r}
#| label: negative_int
x[-c(1, 3)] # same as x[c(-1, -3)] or x[c(2, 4)]
```

## 2b. Mixing negative and positive integers throws an error

```{r}
#| label: mixed_int
#| error: true
x[c(-1, 3)]
```

## 2c. Zeros are ignored when mixed with other integers

```{r}
#| label: negative_int_zero
x[c(-1, 0)]
x[c(-1, 0, 0, 0, 0, 0, 0, 0)]
x[c(1, 0, 2, 0, 3, 0)]
```

## 3. Logical vectors select specified elements

```{r}
#| label: logical_vec
x[c(TRUE, TRUE, FALSE, TRUE)]
x[x < 3]

cond <- x > 2.5
x[cond]
```

## 3b. Shorter vectors are recycled to the longer length

```{r}
#| label: recycling
x[FALSE]
x[TRUE]
x[c(FALSE, TRUE)] # equivalent to: x[c(FALSE, TRUE, FALSE, TRUE)]
```

- In `x[y]`, recycling is easy to understand when `x` or `y` has length 1; it is best to avoid recycling for other lengths

## 3c. NA index returns NA

```{r}
#| label: missing_index
x[c(NA, TRUE, NA, TRUE)]
```

## 3d. Extra TRUE index returns NA

```{r}
#| label: extra_index
x[c(FALSE, TRUE, TRUE, TRUE, TRUE, TRUE)]
x[1:5] # an out-of-bounds integer index also returns NA
```

## 4. Indexing with nothing returns original vector

```{r nothing}
x[]
```

## 5. Indexing with just 0 returns 0-length vector (with class)

```{r zero}
x[0]
letters[0]
```

## 6. Indexing with character vector returns elements of a named vector

```{r character}
(y <- setNames(x, letters[1:4]))
y[c("d", "b", "a")]
y[c("a", "a", "a")]
```

## 6b. Names must be exact for `[`

```{r}
#| label: exact_names
z <- c(abc = 1, def = 2)
z
z[c("a", "d")] # no partial matching with `[`: both are NA
```

## Subsetting a list with `[` returns a list

```{r}
#| label: list_subset_basics
my_list <- list(a = c(TRUE, FALSE), b = letters[5:15], c = 100:108)
my_list
my_list[c("a", "b")]
```

## Lists use same rules for `[`

```{r}
#| label: list_subset_multiple
my_list[2:3]
my_list[c(TRUE, FALSE, TRUE)]
```

## Matrices & arrays take multidimensional indices

```{r}
#| label: array_subset
a <- matrix(1:9, nrow = 3)
a
a[1:2, 2:3] # rows, columns
```

## Matrices & arrays can accept character, logical, etc

```{r}
#| label: array_named
colnames(a) <- c("A", "B", "C")
a[c(TRUE, TRUE, FALSE), c("B", "A")] # a[1:2, 2:1]
```

## Matrices & arrays are also vectors

```{r}
#| label: array_vector
vals <- outer(1:5, 1:5, FUN = "paste", sep = ",") # All chr combos of 1:5
vals
vals[c(4, 15)] # column-major positions: [4, 1] and [5, 3]
a[a > 5]
```

## Data frames subset list-like with single index

```{r}
#| label: df_subset1
df <- data.frame(x = 1:3, y = 3:1, z = letters[1:3])
df[1:2]
df[c("x", "z")]
```

## Data frames subset matrix-like with multiple indices

```{r}
df[1:2, c("x", "z")] # rows, columns
df[df$x == 2, ] # matching rows, all columns
df[, c("x", "z")] # equivalent to df[c("x", "z")] (no comma)
```

## Subsetting a tibble with `[` returns a tibble

```{r}
tbl <- tibble::as_tibble(df)
df[, 1] # data frame: a single column drops to a vector
df[, 1, drop = FALSE] # drop = FALSE keeps a data frame; prevents errors
tbl[, 1] # tibble: stays a tibble
```

# Selecting a single element

## `[[` selects a single element

:::: columns

::: column
```{r}
x <- list(1:3, "a", 4:6)
x[1]
class(x[1])
x[[1]]
class(x[[1]])
x[[1]][[1]]
```
:::

::: column


:::

::::

## `$` is shorthand for `[[..., exact = FALSE]]`

```{r}
#| label: dollar_subset
#| warning: true
x <- list(abc = 1)
x$abc
x$a # partial match on "abc"
x[["a"]] # exact match required: NULL
x[["a", exact = FALSE]]

options(warnPartialMatchDollar = TRUE)
x$a
```

## Behavior for missing-ish indices is inconsistent

```{r}
#| label: missingish_indices
#| error: true
a <- c(a = 1L, b = 2L)
lst <- list(a = 1:2)

# Errors:
# a[[NULL]]
# lst[[NULL]]
# a[[5]]
# lst[[5]]
# a[["c"]]
# a[[NA]]

lst[["c"]]
lst[[NA]]
```

## `purrr::pluck()` and `purrr::chuck()` provide consistent wrappers

- `purrr::pluck()` always returns `NULL` (or `.default`) for missing elements (with a non-`NULL` index)
- `purrr::chuck()` always throws an error

```{r}
purrr::pluck(a, 5)
purrr::pluck(a, "c")
purrr::pluck(lst, 5)
purrr::pluck(lst, "c")
```

## S4 has two additional subsetting operators

- `@` is equivalent to `$` (but errors if the slot does not exist)
- `slot()` is equivalent to `[[`

More in Chapter 15

# Subsetting and assignment

## Can assign to position with `[`

```{r}
x <- 1:5
x[1:2] <- c(101, 102)
x
x[1:3] <- 1:2 # length mismatch: the value is recycled, with a warning
x
```

## Remove list component with `NULL`

```{r}
x <- list(a = 1, b = 2)
x[["b"]] <- NULL
x
```

## Use `list(NULL)` to add `NULL`

```{r}
x <- list(a = 1, b = 2)
# Use `[` here, not `[[`: `x[["b"]] <- list(NULL)` would store a list
# containing NULL instead of a literal NULL element.
x["b"] <- list(NULL)
x
```

## Subset with nothing to retain shape

```{r}
df <- data.frame(a = 1:3, b = 1:3)
df[] <- "a" # fills every column; df is still a data frame
df
df <- "a" # replaces df itself with a character vector
df
```

# Applications

## Use a lookup vector and recycling rules to translate values

```{r}
x <- c("b", "g", "x", "g", "g", "b")
lookup <- c(b = "blue", g = "green", x = NA)
lookup[x]
unname(lookup[x])
```

## Use a lookup table to generate rows of data

```{r}
info <- data.frame(
  code = c("b", "g", "x"),
  color = c("blue", "green", NA),
  other_thing = 3:1
)
match(x, info$code) # position in info$code of each element of x
info[match(x, info$code), ]
```

## Sort with `order()`

```{r}
x <- c("b", "c", "a")
order(x)
x[order(x)]

df <- data.frame(b = 3:1, a = 1:3)
df[order(df$b), ]
df[, order(names(df))]
```

## Expand counts

```{r}
df <- data.frame(x = c(2, 4, 1), y = c(9, 11, 6), n = c(3, 5, 1))
# seq_len() rather than 1:nrow(df), so a zero-row df yields no indices
rep(seq_len(nrow(df)), df$n)
df[rep(seq_len(nrow(df)), df$n), ]
```

## Ran out of time to make slides for

Ideally a future cohort should expand these:

- Remove df columns with `setdiff()`
- Logically subset rows `df[df$col > 5, ]`
- The next slide about `which()`

## Boolean algebra versus sets (logical and integer)

- `which()` gives the indices of a Boolean vector

```{r, eval=FALSE}
(x1 <- 1:10 %% 2 == 0) # 1-10 divisible by 2
# [1] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
(x2 <- which(x1))
# [1] 2 4 6 8 10
(y1 <- 1:10 %% 5 == 0) # 1-10 divisible by 5
# [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
(y2 <- which(y1))
# [1] 5 10
x1 & y1
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
```