commit e1e92bd624d2d21fd9a45015f37ef37e5ea11db2
parent 554edae3b96e09cbeb3c1a9a2f76974fc76e1cef
Author: Jo Hardin <hardin47@users.noreply.github.com>
Date:   Wed, 24 Jul 2024 06:17:33 -0700
edits to chapter 9 for Friday's cohort 9 presentation (#65)
Diffstat:
| M | 09_Functionals.Rmd |  |  | 224 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------- | 
1 file changed, 171 insertions(+), 53 deletions(-)
diff --git a/09_Functionals.Rmd b/09_Functionals.Rmd
@@ -67,7 +67,9 @@ out
 
 - the length of the input == the length of the output
 
+- `map()` is more flexible, with additional arguments allowed
 
+- `map()` has a host of extensions
 
 
 
@@ -97,7 +99,7 @@ map_lgl(.x=c(1, NA, 3), .f=is.na)
 
  **Anonymous functions** 
 ```{r}
-map_dbl(.x=mtcars, .f=function(x) mean(x, na.rm = TRUE)) %>% 
+map_dbl(.x=mtcars, .f=function(x) mean(x, na.rm = TRUE)) |> 
   head()
 ```
 
@@ -112,6 +114,33 @@ map_dbl(.x=mtcars,  .f=~mean(.x, na.rm = TRUE))
 map_dbl(.x=mtcars, .f=mean, na.rm = TRUE)
 ```
 
+- what happens when we try a handful of variants of the task at hand?  (how many unique values are there for each variable?)
+
+Note that `.x` is the **name** of the first argument in `map()` (`.f` is the name of the second argument).
+
+```{r}
+#| error: true
+
+# the task
+map_dbl(mtcars, function(x) length(unique(x)))
+map_dbl(mtcars, ~length(unique(.x)))
+map_dbl(mtcars, ~length(unique(..1)))
+map_dbl(mtcars, ~length(unique(.)))
+
+# not the task
+map_dbl(mtcars, length)
+map_dbl(mtcars, length(unique))
+map_dbl(mtcars, 1)
+
+#error
+map_dbl(mtcars, length(unique()))
+map_dbl(mtcars, ~length(unique(x)))
+
+
+```
+
+
+
 ## Modify {-}
 
 Sometimes we might want the output to be the same as the input, then in that case we can use the modify function rather than map
@@ -124,17 +153,27 @@ map(df, .f=~.x*3)
 modify(.x=df,.f=~.x*3)
 ```
 
+Note that `modify()` always returns the same type of output (which is not necessarily true with `map()`).  Additionally, `modify()` does not actually change the value of `df`.
+
+```{r}
+df
+```
+
+
 ## `purrr` style
 
 ```{r}
-mtcars %>% 
-  map(head, 20) %>% # pull first 20 of each column
-  map_dbl(mean) %>% # mean of each vector
+mtcars |> 
+  map(head, 20) |> # pull first 20 of each column
+  map_dbl(mean) |> # mean of each vector
   head()
 ```
 
 An example from `tidytuesday`
 ```{r, eval=FALSE}
+#| warning: false
+#| message: false
+
 tt <- tidytuesdayR::tt_load("2020-06-30")
 
 # filter data & exclude columns with lost of nulls
@@ -142,15 +181,13 @@ list_df <-
   map(
     .x = tt[1:3], 
     .f = 
-      ~ .x %>% 
-      filter(issue <= 152 | issue > 200) %>% 
-      mutate(timeframe = ifelse(issue <= 152, "first 5 years", "last 5 years")) %>% 
-      select_if(~mean(is.na(.)) < 0.2) 
+      ~ .x |> 
+      filter(issue <= 152 | issue > 200) |> 
+      mutate(timeframe = ifelse(issue <= 152, "first 5 years", "last 5 years")) |> 
+      select_if(~mean(is.na(.x)) < 0.2) 
   )
 
 
-
-
 # write to global environment
 iwalk(
   .x = list_df,
@@ -192,15 +229,16 @@ map2_dbl(
 ## The benefit of using the map over apply family of function {-}
 - It is written in C
 - It preserves names
-- We always know the return values
-- We can apply function into multiple input value
-- We can pass in some additional arguments to the function
+- We always know the return value type
+- We can apply the function for multiple input values
+- We can pass additional arguments into the function
 
 
 ## `walk()` {-}
 
 
-- We use walk when we want to call a function for it side effect rather than it return value, like generating plots, `write.csv()` or `ggsave()`, `map()` will print more info than you may want
+- We use `walk()` when we want to call a function for it side effect(s) rather than its return value, like generating plots, `write.csv()`, or `ggsave()`. If you don't want a return value, `map()` will print more info than you may want.
+
 
 ```{r}
 map(1:3, ~cat(.x, "\n"))
@@ -212,30 +250,43 @@ walk(1:3, ~cat(.x, "\n"))
 ```
 
 
-We can use pwalk to save a list of plot to disk
+We can use `pwalk()` to save a list of plot to disk.  Note that the "p" in `pwalk()` means that we have more than 1 (or 2) variables to pipe into the function.  Also note that the name of the first argument in all of the "p" functions is now `.l` (instead of `.x`).
+
 ```{r}
-plots <- mtcars %>% 
-  split(.$cyl) %>% 
-  map(~ggplot(.,aes(mpg,wt))+geom_point())
+plots <- mtcars |>  
+  split(mtcars$cyl) |>  
+  map(~ggplot(.x, aes(mpg,wt)) +
+        geom_point())
 
-paths <- stringr::str_c(names(plots),'.png')
+paths <- stringr::str_c(names(plots), '.png')
 
-pwalk(.l=list(paths,plots),.f=ggsave,path=tempdir())
+pwalk(.l = list(paths,plots), .f = ggsave, path = tempdir())
+pmap(.l = list(paths,plots), .f = ggsave, path = tempdir())
   
-
 ```
 
 - walk, walk2 and pwalk all invisibly return .x the first argument. This makes them suitable for use in the middle of pipelines.
 
+- note: I don't think that it is "`.x`" (or "`.l`") that they are returning invisibly.  But I'm not sure what it is.  Hadley says:
+
+> purrr provides the walk family of functions that ignore the return values of the `.f` and instead return `.x` invisibly.
+
+But not in the first `cat()` example, it is the `NULL` values that get returned invisibly (those aren't the same as `.x`).
 
 ## `imap()` {-}
 
-- `imap()` is like `map2()`except that `.y` is derived from `names(.x)` if named or `seq_along(.x)` if not
+- `imap()` is like `map2()`except that `.y` is derived from `names(.x)` if named or `seq_along(.x)` if not.
 
 - These two produce the same result
 
 ```{r}
-imap_chr(.x = mtcars, .f = ~ paste(.y, "has a mean of", round(mean(.x), 1))) %>% 
+imap_chr(.x = mtcars, 
+         .f = ~ paste(.y, "has a mean of", round(mean(.x), 1))) |> 
+head()
+
+map2_chr(.x = mtcars, 
+         .y = names(mtcars),
+         .f = ~ paste(.y, "has a mean of", round(mean(.x), 1))) |> 
 head()
 ```
 
@@ -264,15 +315,30 @@ list(
   n = 1:3, 
   min = 10 ^ (0:2), 
   max = 10 ^ (1:3)
-) %>% 
+) |> 
+pmap(runif)
+```
+
+- I like to use `expand_grid()` when I want all possible parameter combinations.
+
+```{r}
+expand_grid(n = 1:3,
+            min = 10 ^ (0:1),
+            max = 10 ^ (1:2))
+
+expand_grid(n = 1:3,
+            min = 10 ^ (0:1),
+            max = 10 ^ (1:2)) |> 
 pmap(runif)
 ```
 
+
+
 ## `reduce()` family
 
-The reduce() function is a powerful functional that allows you to abstract away from a sequence of functions that are applied in a fixed direction.
+The `reduce()` function is a powerful functional that allows you to abstract away from a sequence of functions that are applied in a fixed direction.
 
-reduce takes a vector as its first argument, a function as its second argument, and an optional .init argument, it will then apply this function repeatedly to a list until there is only a single element left.
+`reduce()` takes a vector as its first argument, a function as its second argument, and an optional `.init` argument last.  It will then apply the function repeatedly to the vector until there is only a single element left.
 
 ```{r,echo=FALSE,warning=FALSE,message=FALSE}
 knitr::include_graphics(path = 'images/reduce-init.png')
@@ -281,22 +347,20 @@ knitr::include_graphics(path = 'images/reduce-init.png')
 
 Let me really quickly demonstrate `reduce()` in action.
 
-Say you wanted to add up the numbers 1 through 5, but only using the plus operator +. You could do something like this
+Say you wanted to add up the numbers 1 through 5 using only the plus operator `+`. You could do something like:
 
 ```{r}
 1 + 2 + 3 + 4 + 5
 
 ```
 
-Which is the same thing as this:
+Which is the same as:
 
 ```{r}
-set.seed(1234)
-
 reduce(1:5, `+`)
 ```
 
-And if you want the start value to be something that’s not the first argument of the vector, pass that to the .init argument:
+And if you want the start value to be something that is not the first argument of the vector, pass that value to the .init argument:
 
 ```{r}
 
@@ -307,22 +371,23 @@ identical(
 
 ```
 
-## ggplot2 Example with reduce {-}
+## ggplot2 example with reduce {-}
 
 ```{r}
 ggplot(mtcars, aes(hp, mpg)) + 
-  geom_point(size = 8, alpha = .5) +
-  geom_point(size = 4, alpha = .5) +
-  geom_point(size = 2, alpha = .5)
+  geom_point(size = 8, alpha = .5, color = "yellow") +
+  geom_point(size = 4, alpha = .5, color = "red") +
+  geom_point(size = 2, alpha = .5, color = "blue")
 
 ```
 
-Let us use the reduce `function`
+Let us use the `reduce()` function.  Note that `reduce2()` takes two arguments, but the first value (`..1`) is given by the `.init` value.
 
 ```{r}
-reduce(
+reduce2(
   c(8, 4, 2),
-  ~ .x + geom_point(size = .y, alpha = .5),
+  c("yellow", "red", "blue"),
+  ~ ..1 + geom_point(size = ..2, alpha = .5, color = ..3),
   .init = ggplot(mtcars, aes(hp, mpg))
 )
 
@@ -333,6 +398,7 @@ df <- list(age=tibble(name='john',age=30),
     sex=tibble(name=c('john','mary'),sex=c('M','F'),
     trt=tibble(name='Mary',treatment='A')))
 
+df
 
 df |> reduce(.f = full_join)
 
@@ -340,11 +406,21 @@ reduce(.x = df,.f = full_join)
 ```
 
 - to see all intermediate steps, use **accumulate()**
+
 ```{r}
 set.seed(1234)
 accumulate(1:5, `+`)
 ```
 
+```{r}
+accumulate2(
+  c(8, 4, 2),
+  c("yellow", "red", "blue"),
+  ~ ..1 + geom_point(size = ..2, alpha = .5, color = ..3),
+  .init = ggplot(mtcars, aes(hp, mpg))
+)
+```
+
 
 ## Not covered: `map_df*()` variants {-}
 
@@ -352,17 +428,21 @@ accumulate(1:5, `+`)
 
 - `map_dfc()` = column bind the results
 
+- Note that `map_dfr()` has been superseded by `map() |> list_rbind()` and `map_dfc()` has been superseded by `map() |> list_cbind()`
+
 ```{r}
 col_stats <- function(n) {
-  head(mtcars, n) %>% 
-    summarise_all(mean) %>% 
-    mutate_all(floor) %>% 
+  head(mtcars, n) |> 
+    summarise_all(mean) |> 
+    mutate_all(floor) |> 
     mutate(n = paste("N =", n))
 }
 
 map((1:2) * 10, col_stats)
 
 map_dfr((1:2) * 10, col_stats)
+
+map((1:2) * 10, col_stats) |> list_rbind()
 ```
 
 ---
@@ -371,13 +451,20 @@ map_dfr((1:2) * 10, col_stats)
 
 - `pluck()` will pull a single element from a list
 
+I like the example from the book because the starting object is not particularly easy to work with (as many JSON objects might not be).
+
 ```{r}
 my_list <- list(
-  1:3, 
-  10 + (1:5), 
-  20 + (1:10)
+  list(-1, x = 1, y = c(2), z = "a"),
+  list(-2, x = 4, y = c(5, 6), z = "b"),
+  list(-3, x = 8, y = c(9, 10, 11))
 )
+my_list
+```
 
+Notice that the "first element" means something different in standard `pluck()` versus `map`ped `pluck()`.
+
+```{r}
 pluck(my_list, 1)
 
 map(my_list, pluck, 1)
@@ -385,10 +472,39 @@ map(my_list, pluck, 1)
 map_dbl(my_list, pluck, 1)
 ```
 
+The `map()` functions also have shortcuts for extracting elements from vectors (powered by `purrr::pluck()`).
+
+```{r}
+#| error: true
+
+# Select by name
+map_dbl(my_list, "x")
+
+# Or by position
+map_dbl(my_list, 1)
+
+# Or by both
+map_dbl(my_list, list("y", 1))
+
+# You'll get an error if you try to retrieve an inside item that doesn't have 
+# a consistent format and you want a numeric output
+map_dbl(my_list, list("y"))
+
+
+# You'll get an error if a component doesn't exist:
+map_chr(my_list, "z")
+#> Error: Result 3 must be a single string, not NULL of length 0
+
+# Unless you supply a .default value
+map_chr(my_list, "z", .default = NA)
+#> [1] "a" "b" NA
+```
+
+
 
 ## Not covered: `flatten()` {-}
 
-- `flatten()` will turn a list of lists into a simpler vector
+- `flatten()` will turn a list of lists into a simpler vector.
 
 ```{r}
 my_list <-
@@ -397,11 +513,13 @@ my_list <-
     b = list(1:3)
   )
 
+my_list
+
 map_if(my_list, is.list, pluck)
   
 map_if(my_list, is.list, flatten_int)
 
-map_if(my_list, is.list, flatten_int) %>% 
+map_if(my_list, is.list, flatten_int) |> 
   flatten_int()
 ```
 
@@ -409,27 +527,27 @@ map_if(my_list, is.list, flatten_int) %>%
 
 ## Safely {-}
 
-safely is an adverb, it takes a function (a verb) and returns a modified version. In this case, the modified function will never throw an error. Instead it always returns a list with two elements.
+`safely()` is an adverb.  It takes a function (a verb) and returns a modified version. In this case, the modified function will never throw an error. Instead it always returns a list with two elements.
 
-- Result is the original result. If there is an error this will be NULL
+- `result` is the original result. If there is an error this will be NULL
 
-- Error is an error object. If the operation was successful this will be NULL.
+- `error` is an error object. If the operation was successful the "`error`" will be NULL.
 
 ```{r}
-A <- list(1,10,"a")
+A <- list(1, 10, "a")
 
-map(.x = A,.f = safely(log))
+map(.x = A, .f = safely(log))
   
 ```
 
 ## Possibly {-}
 
-Possibly always succeeds. It is simpler than safely, because you can give it a default value to return when there is an error.
+`possibly()` always succeeds. It is simpler than `safely()`, because you can give it a default value to return when there is an error.
 
 ```{r}
 A <- list(1,10,"a")
 
- map_dbl(.x =A,.f = possibly(log,otherwise = NA_real_) )
+map_dbl(.x = A, .f = possibly(log, otherwise = NA_real_) )
 
 ```