grabGenderData.R (4614B)
# Adds gender data to a list of first names by querying the genderize.io API.
#
# Inputs (read from the working directory):
#   * inputCSV                - names still to be genderized; must contain the
#                               columns `first_name` and `iso3166`.
#   * names_with_genders.csv  - cache of earlier lookups; must contain the
#                               columns `name` and `country_id`.
#   * countries.json          - its second element is used as the vector of
#                               country codes accepted as `country_id`.
# Output:
#   * names_with_genders.csv  - the cache, extended with the new lookups.

# library() stops immediately when a package is missing; require() would only
# return FALSE and let the script fail later with a less obvious error.
library(jsonlite)
library(httr)
library(dplyr)

genderizeKey <- NA
inputCSV <- "names_to_genderize.csv"


# Helper function grabs gender info for a vector of names -----------------

# Returns NA instead of NULL when a missing list element is requested,
# otherwise returns the element itself.
#
# listName:    a list or data.frame.
# elementName: name of the element/column to extract.
getListElement <- function(listName, elementName) {
  listElement <- listName[[elementName]]
  if (is.null(listElement)) {
    listElement <- NA
  }
  listElement
}

# Looks up a vector of first names with a single genderize.io request.
#
# nameVector:  character vector of first names to look up.
# countryCode: optional country code sent as `country_id`; NA or "none" means
#              the lookup is not restricted to a country.
# apiKey:      optional API key sent as `apikey`; NA sends no key.
#
# Returns a data.frame with the columns name, gender, country_id, probability
# and count (country_id is "none" when the response carries no country), or
# NULL when the request did not come back with HTTP status 200.
lookupNames <- function(nameVector, countryCode = NA, apiKey = NA) {
  # Nothing to ask for: hand back an empty result rather than sending a
  # malformed "name[]=" query. NULL is reserved for failed requests.
  if (length(nameVector) == 0) {
    return(data.frame(name = character(0),
                      gender = character(0),
                      country_id = character(0),
                      probability = numeric(0),
                      count = integer(0),
                      stringsAsFactors = FALSE))
  }

  # Construct the query. The query string is passed to GET() verbatim, so the
  # names have to be percent-encoded here; otherwise names containing spaces,
  # "&", "=" or non-ASCII characters would corrupt the request.
  encodedNames <- vapply(enc2utf8(as.character(nameVector)),
                         utils::URLencode,
                         character(1),
                         reserved = TRUE,
                         USE.NAMES = FALSE)
  query <- paste0("name[", seq_along(encodedNames), "]=", encodedNames,
                  collapse = "&")
  # Scalar condition, so use the short-circuiting && rather than &
  if (!is.na(countryCode) && countryCode != "none") {
    query <- paste0(query, "&country_id=", countryCode)
  }
  if (!is.na(apiKey)) {
    query <- paste0(query, "&apikey=", apiKey)
  }

  # Run it!
  queryResult <- GET("https://api.genderize.io", query = query)
  if (status_code(queryResult) == 200) {
    parsedResponse <- fromJSON(content(queryResult, as = "text",
                                       encoding = "UTF-8"))
    # Make sure this is a data.frame with the correct columns. This works
    # whether fromJSON returned a list (the response to one name) or a
    # data.frame (the response to several); absent fields become NA.
    responseDF <- data.frame(
      name = getListElement(parsedResponse, "name"),
      gender = getListElement(parsedResponse, "gender"),
      country_id = getListElement(parsedResponse, "country_id"),
      probability = getListElement(parsedResponse, "probability"),
      count = getListElement(parsedResponse, "count"),
      stringsAsFactors = FALSE
    )
    # Use the same "none" marker the artist data uses for "no country"
    responseDF <- mutate(responseDF,
                         country_id = ifelse(is.na(country_id),
                                             "none",
                                             country_id))
  } else {
    cat(paste("\n!!!! http returned status code:",
              status_code(queryResult),
              "!!!! message:",
              http_status(queryResult)$message,
              "!!!! error:",
              content(queryResult)$error,
              sep = "\n"))
    if (status_code(queryResult) == 429) {
      cat('\n!!!! number of available requests exhaused')
    }
    responseDF <- NULL
  }
  responseDF
}


# Read in the name and existing gender info -------------------------------

# Load the genderize.io supported countries
genderizeCountries <- fromJSON("countries.json")[[2]]

# Read in the DF of artist data; countries that are not in the supported list
# are collapsed to "none" so they are looked up without a country restriction.
artistData <- read.csv(inputCSV, stringsAsFactors = FALSE)
artistData <- artistData %>%
  mutate(iso3166 = ifelse(iso3166 %in% genderizeCountries, iso3166, "none")) %>%
  arrange(iso3166, first_name)

# Read in the gender data we have so we don't keep querying the same people
genderData <- read.csv("names_with_genders.csv", stringsAsFactors = FALSE)
genderData <- mutate(genderData, looked_up = TRUE)
artistData <- artistData %>%
  left_join(genderData,
            by = c("iso3166" = "country_id", "first_name" = "name")) %>%
  mutate(looked_up = ifelse(is.na(looked_up), FALSE, looked_up))
# Take the new column off of genderData
genderData <- select(genderData, -looked_up)


# Create a list of queries and run them -----------------------------------

# Can only query up to 10 names at a time
maxNamesPerQuery <- 10

# Keep only the artists that still need a lookup
artistData <- filter(artistData, !looked_up)

# Several artists can share a first name within a country; ask for each
# (country, name) pair once so requests are not wasted and the cache does not
# pick up duplicate rows. Missing or empty names cannot be looked up at all.
namesToQuery <- unique(artistData[, c("iso3166", "first_name"), drop = FALSE])
namesToQuery <- namesToQuery[!is.na(namesToQuery$first_name) &
                               namesToQuery$first_name != "", , drop = FALSE]

# Break the name/country combos into a list of query chunks, each one a
# list(countryCode, namesForThatCountry) of at most maxNamesPerQuery names.
queryChunks <- list()
for (country in unique(namesToQuery$iso3166)) {
  countrysNames <- namesToQuery$first_name[namesToQuery$iso3166 == country]
  chunkIndex <- ceiling(seq_along(countrysNames) / maxNamesPerQuery)
  for (nameChunk in unname(split(countrysNames, chunkIndex))) {
    queryChunks[[length(queryChunks) + 1]] <- list(country, nameChunk)
  }
}

# Now query all the chunks and store a list of DFs of their results
responseList <- list(genderData) # Start with what we already have
for (chunk in queryChunks) {
  responseDF <- lookupNames(chunk[[2]], chunk[[1]], genderizeKey)
  if (is.null(responseDF)) {
    # A failed request (e.g. quota used up) ends the run; whatever was
    # collected so far is still written out below.
    break
  }
  responseList[[length(responseList) + 1]] <- responseDF
}

# Combine the list into a single DF and write to a file
namesWithGenders <- do.call(rbind, responseList)
write.csv(namesWithGenders, "names_with_genders.csv", row.names = FALSE)