gatherNamesForSecondTry.R (1224B)
# After running grabGenderData, there are a bunch of common (worldwide) names
# that have no gender estimate. This script picks those out and writes them to
# a second list of names to genderize ("names_to_genderize_2.csv") with the
# country stripped out (iso3166 = "none") so that they may be queried generally.
#
# Inputs (read from the working directory):
#   names_to_genderize.csv  - uses columns `first_name` and `iso3166`
#   names_with_genders.csv  - uses columns `name`, `gender` and `country_id`
# Output:
#   names_to_genderize_2.csv - columns `first_name`, `iso3166`

# The verbs below (filter, select, distinct, mutate, anti_join, %>%) are dplyr's.
# Attach it explicitly: without it, `filter` resolves to stats::filter and the
# script dies with a misleading error in a fresh session.
library(dplyr)

# Names that were already submitted without a country the first time around.
artistData <- read.csv("names_to_genderize.csv", stringsAsFactors = FALSE)
artistData.noCountry <- filter(artistData, iso3166 == "none")

# Okay. NOW, go through and look up the names that have countries but didn't get
# results, this time stripping away the country.
namesWithGenders <- read.csv("names_with_genders.csv", stringsAsFactors = FALSE)
namesWithGenders.missing <- namesWithGenders %>%
  filter(is.na(gender), country_id != "none") %>%
  select(name) %>%
  distinct()

# Report the candidate count before and after dropping names that were already
# queried without a country, so the effect of the anti-join is visible.
print(nrow(namesWithGenders.missing))
namesWithGenders.missing <- anti_join(
  namesWithGenders.missing,
  artistData.noCountry,
  by = c("name" = "first_name")
)
print(nrow(namesWithGenders.missing))

# Match the format of artistData
namesWithGenders.missing <- namesWithGenders.missing %>%
  mutate(iso3166 = "none") %>%
  select(first_name = name, iso3166)

write.csv(
  namesWithGenders.missing,
  "names_to_genderize_2.csv",
  row.names = FALSE
)