GenderGuesser

An R package for using the genderize.io API to guess the gender of names.
git clone https://git.eamoncaddigan.net/GenderGuesser.git
Log | Files | Refs | README | LICENSE

commit 63c2ca7b47a596908e410b60709ab05739af945c
parent 937cfd726987a34eb5f8451cb211d75549232202
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date:   Sun,  9 Aug 2015 00:14:50 -0400

Ugh I don't know why this bullshit isn't working FML and R especially. Ugh!

Diffstat:
MDESCRIPTION | 5+++--
AR/guessGender.R | 128+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 131 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION @@ -2,7 +2,8 @@ Package: GenderGuesser Title: Guess the gender of a name (one line, title case) Version: 0.0.0.9000 Authors@R: person("Eamon", "Caddigan", email = "eamon.caddigan@gmail.com", role = c("aut", "cre")) -Description: Currently, this package uses the genderize.io API to guess the gender of one or more names. +Description: Currently uses the genderize.io API to guess the gender of one or more names. Depends: R (>= 3.2.1) -License: GNU General Public License v2.0 +Imports: jsonlite, httr +License: GPL-3 LazyData: true diff --git a/R/guessGender.R b/R/guessGender.R @@ -0,0 +1,128 @@ +# Contains the function lookupGenderForNames(), which returns a data.frame of +# gender info from genderize.io given a character vector of first names. + +#' Check country and language code. +#' +#' Makes sure that no more than one of countryCode or languageCode is *not* NA +#' (i.e., they can both be NA, or one can be NA). +checkLanguageCountryCodes <- function(countryCode, languageCode) { + # TODO: Check code validity + if (sum(is.na(c(countryCode, languageCode))) < 1) { + stop("Only one of countryCode or languageCode can be passed") + } +} + +#' Get an element from a list +#' +#' Helper function that returns NA instead of NULL when a missing list element +#' is requested, otherwise returns the element itself. +#' @keywords internal +getListElement <- function(listName, elementName) { + if (match(elementName, listName, nomatch = 0) > 0) { + listElement <- listName[[elementName]] + } else { + listElement <- NA + } + return(listElement) +} + +#' Look up a vector of names. +#' +#' This function actually implements the genderize.io API. Can only query 10 +#' names at a time. +#' @param nameVector A vector containing one or more names to look up. +#' @param countryCode An optional ISO 3166-1 alpha-2 country code. +#' @param languageCode An optional ISO 639-1 language code. Only one of +#' countryCode or languageCode can be specified. +#' @param apiKey An optional API key for genderize.io. +#' @keywords internal +lookupNameVector <- function(nameVector, countryCode = NA, languageCode = NA, apiKey = NA) { + # Make sure that no more than 10 names were passed + if (length(nameVector) > 10) { + stop("This only accepts 10 or fewer names") + } + checkLanguageCountryCodes(countryCode, languageCode) + + # Construct the query + query <- paste("name[", seq_along(nameVector), "]=", nameVector, + sep="", + collapse="&") + if (!is.na(countryCode)) { + query <- paste(query, "&country_id=", countryCode, sep="") + } + if (!is.na(languageCode)) { + query <- paste(query, "&language_id=", languageCode, sep="") + } + if (!is.na(apiKey)) { + query <- paste(query, "&apikey=", apiKey, sep="") + } + + # Run it! + queryResult <- httr::GET("https://api.genderize.io", query = query) + if (httr::status_code(queryResult) == 200) { + responseDF <- jsonlite::fromJSON(httr::content(queryResult, as="text")) + # Make sure this is a data.frame with the correct columns. I bet fromJSON + # can do this for me but I don't know how. This code works whether fromJSON + # returned a list (the response to one name) or a data.frame (the response + # to several). + responseDF <- data.frame(name = getListElement(responseDF, "name"), + gender = getListElement(responseDF, "gender"), + country_id = getListElement(responseDF, "country_id"), + language_id = getListElement(responseDF, "language_id"), + probability = getListElement(responseDF, "probability"), + count = getListElement(responseDF, "count"), + stringsAsFactors = FALSE) + + } else { + cat(paste("\n!!!! http returned status code:", + httr::status_code(queryResult), + "!!!! message:", + httr::http_status(queryResult)$message, + "!!!! error:", + httr::content(queryResult)$error, + sep="\n")) + if (httr::status_code(queryResult) == 429){ + cat('\n!!!! number of available requests exhaused') + } + responseDF <- NULL + } + return(responseDF) +} + +#' Guess names' genders +#' +#' This function uses the genderize.io API to supply estimates of the gender one +#' or more names. +#' @param nameVector A vector containing one or more names to look up. +#' @param countryCode An optional ISO 3166-1 alpha-2 country code. +#' @param languageCode An optional ISO 639-1 language code. Only one of +#' countryCode or languageCode can be specified. +#' @param apiKey An optional API key for genderize.io. +#' @export +#' @examples +#' guessGender(c("Eamon", "Sean"), countryCode = "US") +guessGender <- function(nameVector, countryCode = NA, languageCode = NA, apiKey = NA) { + checkLanguageCountryCodes(countryCode, languageCode) + + # genderize.io only handles 10 names at a time. Create a list of vectors, each + # with no more than 10 names. + queryList <- list() + while(length(nameVector) > 10) { + queryList[[length(queryList)+1]] <- nameVector[1:10] + nameVector <- nameVector[11:length(nameVector)] + } + queryList[[length(queryList)+1]] <- nameVector + + # Run the queries + responseList <- list() + for (i in seq_along(queryList)) { + responseDF <- lookupNameVector(queryList[[i]], countryCode, apiKey) + if (is.null(responseDF)) { + break + } else { + responseList[[length(responseList)+1]] <- responseDF + } + } + + return(do.call(rbind, responseList)) +}