GenderGuesser

An R package for using the genderize.io API to guess the gender of names.
git clone https://git.eamoncaddigan.net/GenderGuesser.git
Log | Files | Refs | README | LICENSE

guessGender.R (5526B)


      1 # Code for using the genderize.io API to guess names' genders.
      2 
      3 # Helper functions --------------------------------------------------------
      4 
      5 #' Check country and language code.
      6 #' 
      7 #' Makes sure that no more than one of countryCode or languageCode is *not* NA 
      8 #' (i.e., they can both be NA, or one can be NA). Also ensures that any code
      9 #' specified is recognized by genderize.io.
     10 #' @keywords internal
     11 checkLanguageCountryCodes <- function(languageCode, countryCode) {
     12   checkCodeInVector <- function(code, codeVector) {
     13     return(match(tolower(code), tolower(codeVector), nomatch = 0) > 0)
     14   }
     15   
     16   # Very ugly control flow here. 
     17   if (!is.na(countryCode)) {
     18     if (!checkCodeInVector(countryCode, genderizeCountries)) {
     19       stop("Country code not in list")
     20     } 
     21     if (!is.na(languageCode)) {
     22       stop("Only one of countryCode or languageCode can be specified")
     23     }
     24   }
     25   if (!is.na(languageCode)) {
     26     if (!checkCodeInVector(languageCode, genderizeLanguages)) {
     27       stop("Language code not in list")
     28     }
     29   }
     30 }
     31 
     32 #' Get an element from a list
     33 #'
     34 #' Helper function that returns NA instead of NULL when a missing list element
     35 #' is requested, otherwise returns the element itself.
     36 #' @keywords internal
     37 getListElement <- function(listName, elementName) {
     38   if (!is.null(listName[[elementName]])) {
     39     listElement <- listName[[elementName]]
     40   } else {
     41     listElement <- NA
     42   }
     43   return(listElement)
     44 }
     45 
     46 
     47 # API functions -----------------------------------------------------------
     48 
     49 #' Look up a vector of names on genderize.io.
     50 #'
     51 #' This function actually implements the genderize.io API. Can only query 10
     52 #' names at a time.
     53 #' @inheritParams guessGender
     54 #' @keywords internal
     55 lookupNameVectorGenderize <- function(nameVector, 
     56                                       countryCode = NA, languageCode = NA, apiKey = NA) {
     57   # Make sure that no more than 10 names were passed
     58   if (length(nameVector) > 10) {
     59     stop("This only accepts 10 or fewer names")
     60   }
     61   checkLanguageCountryCodes(languageCode, countryCode)
     62 
     63   # Construct the query
     64   query <- paste("name[", seq_along(nameVector), "]=", nameVector,
     65                  sep = "",
     66                  collapse = "&")
     67   if (!is.na(countryCode)) {
     68     query <- paste(query, "&country_id=", countryCode, sep = "")
     69   }
     70   if (!is.na(languageCode)) {
     71     query <- paste(query, "&language_id=", languageCode, sep = "")
     72   }
     73   if (!is.na(apiKey)) {
     74     query <- paste(query, "&apikey=", apiKey, sep = "")
     75   }
     76 
     77   # Run it!
     78   # XXX - setting ssl_verifypeer to FALSE is probably really bad. Whatev.
     79   queryResult <- httr::GET("https://api.genderize.io", query = query,
     80                            httr::config(ssl_verifypeer = FALSE))
     81   if (httr::status_code(queryResult) == 200) {
     82     responseFromJSON <- jsonlite::fromJSON(httr::content(queryResult, as = "text"))
     83     # Make sure this is a data.frame with the correct columns. I bet fromJSON
     84     # can do this for me but I don't know how. This code works whether fromJSON
     85     # returned a list (the response to one name) or a data.frame (the response
     86     # to several).
     87     responseDF <- data.frame(name = getListElement(responseFromJSON, "name"),
     88                              gender = getListElement(responseFromJSON, "gender"),
     89                              country_id = getListElement(responseFromJSON, "country_id"),
     90                              language_id = getListElement(responseFromJSON, "language_id"),
     91                              probability = getListElement(responseFromJSON, "probability"),
     92                              count = getListElement(responseFromJSON, "count"),
     93                              stringsAsFactors = FALSE)
     94 
     95   } else {
     96     cat(paste("\n!!!! http returned status code:",
     97               httr::status_code(queryResult),
     98               "!!!! message:",
     99               httr::http_status(queryResult)$message,
    100               "!!!! error:",
    101               httr::content(queryResult)$error,
    102               sep="\n"))
    103     if (httr::status_code(queryResult) == 429){
    104       cat('\n!!!! number of available requests exhaused')
    105     }
    106     responseDF <- NULL
    107   }
    108   return(responseDF)
    109 }
    110 
    111 #' Guess names' genders
    112 #'
    113 #' This function uses the genderize.io API to supply estimates of the gender one
    114 #' or more names.
    115 #' @param nameVector A vector containing one or more names to look up.
    116 #' @param countryCode An optional ISO 3166-1 alpha-2 country code.
    117 #' @param languageCode An optional ISO 639-1 language code. Only one of
    118 #'   countryCode or languageCode can be specified.
    119 #' @param apiKey An optional API key for genderize.io.
    120 #' @export
    121 #' @examples
    122 #' guessGender(c("Natalie", "Liam", "Eamon"), countryCode = "US")
    123 guessGender <- function(nameVector, 
    124                         countryCode = NA, languageCode = NA, apiKey = NA) {
    125   checkLanguageCountryCodes(languageCode, countryCode)
    126 
    127   # genderize.io only handles 10 names at a time. Create a list of vectors, each
    128   # with no more than 10 names.
    129   queryList <- list()
    130   while(length(nameVector) > 10) {
    131     queryList[[length(queryList)+1]] <- nameVector[1:10]
    132     nameVector <- nameVector[11:length(nameVector)]
    133   }
    134   queryList[[length(queryList)+1]] <- nameVector
    135 
    136   # Run the queries
    137   responseList <- list()
    138   for (i in seq_along(queryList)) {
    139     responseDF <- lookupNameVectorGenderize(queryList[[i]], 
    140                                             countryCode, languageCode, apiKey)
    141     if (is.null(responseDF)) {
    142       break
    143     } else {
    144       responseList[[length(responseList)+1]] <- responseDF
    145     }
    146   }
    147 
    148   return(do.call(rbind, responseList))
    149 }