guessGender.R (5526B)
1 # Code for using the genderize.io API to guess names' genders. 2 3 # Helper functions -------------------------------------------------------- 4 5 #' Check country and language code. 6 #' 7 #' Makes sure that no more than one of countryCode or languageCode is *not* NA 8 #' (i.e., they can both be NA, or one can be NA). Also ensures that any code 9 #' specified is recognized by genderize.io. 10 #' @keywords internal 11 checkLanguageCountryCodes <- function(languageCode, countryCode) { 12 checkCodeInVector <- function(code, codeVector) { 13 return(match(tolower(code), tolower(codeVector), nomatch = 0) > 0) 14 } 15 16 # Very ugly control flow here. 17 if (!is.na(countryCode)) { 18 if (!checkCodeInVector(countryCode, genderizeCountries)) { 19 stop("Country code not in list") 20 } 21 if (!is.na(languageCode)) { 22 stop("Only one of countryCode or languageCode can be specified") 23 } 24 } 25 if (!is.na(languageCode)) { 26 if (!checkCodeInVector(languageCode, genderizeLanguages)) { 27 stop("Language code not in list") 28 } 29 } 30 } 31 32 #' Get an element from a list 33 #' 34 #' Helper function that returns NA instead of NULL when a missing list element 35 #' is requested, otherwise returns the element itself. 36 #' @keywords internal 37 getListElement <- function(listName, elementName) { 38 if (!is.null(listName[[elementName]])) { 39 listElement <- listName[[elementName]] 40 } else { 41 listElement <- NA 42 } 43 return(listElement) 44 } 45 46 47 # API functions ----------------------------------------------------------- 48 49 #' Look up a vector of names on genderize.io. 50 #' 51 #' This function actually implements the genderize.io API. Can only query 10 52 #' names at a time. 53 #' @inheritParams guessGender 54 #' @keywords internal 55 lookupNameVectorGenderize <- function(nameVector, 56 countryCode = NA, languageCode = NA, apiKey = NA) { 57 # Make sure that no more than 10 names were passed 58 if (length(nameVector) > 10) { 59 stop("This only accepts 10 or fewer names") 60 } 61 checkLanguageCountryCodes(languageCode, countryCode) 62 63 # Construct the query 64 query <- paste("name[", seq_along(nameVector), "]=", nameVector, 65 sep = "", 66 collapse = "&") 67 if (!is.na(countryCode)) { 68 query <- paste(query, "&country_id=", countryCode, sep = "") 69 } 70 if (!is.na(languageCode)) { 71 query <- paste(query, "&language_id=", languageCode, sep = "") 72 } 73 if (!is.na(apiKey)) { 74 query <- paste(query, "&apikey=", apiKey, sep = "") 75 } 76 77 # Run it! 78 # XXX - setting ssl_verifypeer to FALSE is probably really bad. Whatev. 79 queryResult <- httr::GET("https://api.genderize.io", query = query, 80 httr::config(ssl_verifypeer = FALSE)) 81 if (httr::status_code(queryResult) == 200) { 82 responseFromJSON <- jsonlite::fromJSON(httr::content(queryResult, as = "text")) 83 # Make sure this is a data.frame with the correct columns. I bet fromJSON 84 # can do this for me but I don't know how. This code works whether fromJSON 85 # returned a list (the response to one name) or a data.frame (the response 86 # to several). 87 responseDF <- data.frame(name = getListElement(responseFromJSON, "name"), 88 gender = getListElement(responseFromJSON, "gender"), 89 country_id = getListElement(responseFromJSON, "country_id"), 90 language_id = getListElement(responseFromJSON, "language_id"), 91 probability = getListElement(responseFromJSON, "probability"), 92 count = getListElement(responseFromJSON, "count"), 93 stringsAsFactors = FALSE) 94 95 } else { 96 cat(paste("\n!!!! http returned status code:", 97 httr::status_code(queryResult), 98 "!!!! message:", 99 httr::http_status(queryResult)$message, 100 "!!!! error:", 101 httr::content(queryResult)$error, 102 sep="\n")) 103 if (httr::status_code(queryResult) == 429){ 104 cat('\n!!!! number of available requests exhaused') 105 } 106 responseDF <- NULL 107 } 108 return(responseDF) 109 } 110 111 #' Guess names' genders 112 #' 113 #' This function uses the genderize.io API to supply estimates of the gender one 114 #' or more names. 115 #' @param nameVector A vector containing one or more names to look up. 116 #' @param countryCode An optional ISO 3166-1 alpha-2 country code. 117 #' @param languageCode An optional ISO 639-1 language code. Only one of 118 #' countryCode or languageCode can be specified. 119 #' @param apiKey An optional API key for genderize.io. 120 #' @export 121 #' @examples 122 #' guessGender(c("Natalie", "Liam", "Eamon"), countryCode = "US") 123 guessGender <- function(nameVector, 124 countryCode = NA, languageCode = NA, apiKey = NA) { 125 checkLanguageCountryCodes(languageCode, countryCode) 126 127 # genderize.io only handles 10 names at a time. Create a list of vectors, each 128 # with no more than 10 names. 129 queryList <- list() 130 while(length(nameVector) > 10) { 131 queryList[[length(queryList)+1]] <- nameVector[1:10] 132 nameVector <- nameVector[11:length(nameVector)] 133 } 134 queryList[[length(queryList)+1]] <- nameVector 135 136 # Run the queries 137 responseList <- list() 138 for (i in seq_along(queryList)) { 139 responseDF <- lookupNameVectorGenderize(queryList[[i]], 140 countryCode, languageCode, apiKey) 141 if (is.null(responseDF)) { 142 break 143 } else { 144 responseList[[length(responseList)+1]] <- responseDF 145 } 146 } 147 148 return(do.call(rbind, responseList)) 149 }