moma-collection

The Museum of Modern Art (MoMA) collection data.
git clone https://git.eamoncaddigan.net/moma-collection.git
Log | Files | Refs | README | LICENSE

gatherNamesForSecondTry.R (1224B)


      # Build the second-pass lookup list for names that got no gender estimate.
      #
      # After running grabGenderData, there are a bunch of common (worldwide)
      # names that have no gender estimate. This picks those out, strips the
      # country (iso3166 = "none") so that they may be queried generally, and
      # writes them to a separate file, names_to_genderize_2.csv.
      #
      # Reads:  names_to_genderize.csv, names_with_genders.csv
      # Writes: names_to_genderize_2.csv

      # Attach dplyr explicitly: %>% and the verbs used below (filter, select,
      # distinct, anti_join, mutate) come from it. Without it, `filter` would
      # resolve to stats::filter and `%>%` would be undefined.
      library(dplyr)

      # Names that were already submitted without a country on the first pass.
      artistData <- read.csv("names_to_genderize.csv", stringsAsFactors = FALSE)
      artistData.noCountry <- filter(artistData, iso3166 == "none")

      # Okay. NOW, go through and look up the names that have countries but
      # didn't get results, this time stripping away the country.
      namesWithGenders <- read.csv("names_with_genders.csv",
                                   stringsAsFactors = FALSE)
      namesWithGenders.missing <- namesWithGenders %>%
        filter(is.na(gender), country_id != "none") %>%
        select(name) %>%
        distinct()

      # Report the candidate count before and after dropping names that were
      # already queried without a country (no point asking for those again).
      print(nrow(namesWithGenders.missing))
      namesWithGenders.missing <- anti_join(namesWithGenders.missing,
                                            artistData.noCountry,
                                            by = c("name" = "first_name"))
      print(nrow(namesWithGenders.missing))

      # Match the format of artistData: columns first_name and iso3166.
      namesWithGenders.missing <- namesWithGenders.missing %>%
        mutate(iso3166 = "none") %>%
        select(first_name = name, iso3166)

      write.csv(namesWithGenders.missing, "names_to_genderize_2.csv",
                row.names = FALSE)