commit 63c2ca7b47a596908e410b60709ab05739af945c
parent 937cfd726987a34eb5f8451cb211d75549232202
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date: Sun, 9 Aug 2015 00:14:50 -0400
Ugh I don't know why this bullshit isn't working FML and R especially. Ugh!
Diffstat:
2 files changed, 131 insertions(+), 2 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,7 +2,8 @@ Package: GenderGuesser
Title: Guess the gender of a name (one line, title case)
Version: 0.0.0.9000
Authors@R: person("Eamon", "Caddigan", email = "eamon.caddigan@gmail.com", role = c("aut", "cre"))
-Description: Currently, this package uses the genderize.io API to guess the gender of one or more names.
+Description: Currently uses the genderize.io API to guess the gender of one or more names.
Depends: R (>= 3.2.1)
-License: GNU General Public License v2.0
+Imports: jsonlite, httr
+License: GPL-3
LazyData: true
diff --git a/R/guessGender.R b/R/guessGender.R
@@ -0,0 +1,128 @@
+# Contains the function lookupGenderForNames(), which returns a data.frame of
+# gender info from genderize.io given a character vector of first names.
+
+#' Check country and language code.
+#'
+#' Makes sure that no more than one of countryCode or languageCode is *not* NA
+#' (i.e., they can both be NA, or one can be NA).
+checkLanguageCountryCodes <- function(countryCode, languageCode) {
+ # TODO: Check code validity
+ if (sum(is.na(c(countryCode, languageCode))) < 1) {
+ stop("Only one of countryCode or languageCode can be passed")
+ }
+}
+
+#' Get an element from a list
+#'
+#' Helper function that returns NA instead of NULL when a missing list element
+#' is requested, otherwise returns the element itself.
+#' @keywords internal
+getListElement <- function(listName, elementName) {
+ if (match(elementName, listName, nomatch = 0) > 0) {
+ listElement <- listName[[elementName]]
+ } else {
+ listElement <- NA
+ }
+ return(listElement)
+}
+
+#' Look up a vector of names.
+#'
+#' This function actually implements the genderize.io API. Can only query 10
+#' names at a time.
+#' @param nameVector A vector containing one or more names to look up.
+#' @param countryCode An optional ISO 3166-1 alpha-2 country code.
+#' @param languageCode An optional ISO 639-1 language code. Only one of
+#' countryCode or languageCode can be specified.
+#' @param apiKey An optional API key for genderize.io.
+#' @keywords internal
+lookupNameVector <- function(nameVector, countryCode = NA, languageCode = NA, apiKey = NA) {
+ # Make sure that no more than 10 names were passed
+ if (length(nameVector) > 10) {
+ stop("This only accepts 10 or fewer names")
+ }
+ checkLanguageCountryCodes(countryCode, languageCode)
+
+ # Construct the query
+ query <- paste("name[", seq_along(nameVector), "]=", nameVector,
+ sep="",
+ collapse="&")
+ if (!is.na(countryCode)) {
+ query <- paste(query, "&country_id=", countryCode, sep="")
+ }
+ if (!is.na(languageCode)) {
+ query <- paste(query, "&language_id=", languageCode, sep="")
+ }
+ if (!is.na(apiKey)) {
+ query <- paste(query, "&apikey=", apiKey, sep="")
+ }
+
+ # Run it!
+ queryResult <- httr::GET("https://api.genderize.io", query = query)
+ if (httr::status_code(queryResult) == 200) {
+ responseDF <- jsonlite::fromJSON(httr::content(queryResult, as="text"))
+ # Make sure this is a data.frame with the correct columns. I bet fromJSON
+ # can do this for me but I don't know how. This code works whether fromJSON
+ # returned a list (the response to one name) or a data.frame (the response
+ # to several).
+ responseDF <- data.frame(name = getListElement(responseDF, "name"),
+ gender = getListElement(responseDF, "gender"),
+ country_id = getListElement(responseDF, "country_id"),
+ language_id = getListElement(responseDF, "language_id"),
+ probability = getListElement(responseDF, "probability"),
+ count = getListElement(responseDF, "count"),
+ stringsAsFactors = FALSE)
+
+ } else {
+ cat(paste("\n!!!! http returned status code:",
+ httr::status_code(queryResult),
+ "!!!! message:",
+ httr::http_status(queryResult)$message,
+ "!!!! error:",
+ httr::content(queryResult)$error,
+ sep="\n"))
+ if (httr::status_code(queryResult) == 429){
+ cat('\n!!!! number of available requests exhaused')
+ }
+ responseDF <- NULL
+ }
+ return(responseDF)
+}
+
+#' Guess names' genders
+#'
+#' This function uses the genderize.io API to supply estimates of the gender one
+#' or more names.
+#' @param nameVector A vector containing one or more names to look up.
+#' @param countryCode An optional ISO 3166-1 alpha-2 country code.
+#' @param languageCode An optional ISO 639-1 language code. Only one of
+#' countryCode or languageCode can be specified.
+#' @param apiKey An optional API key for genderize.io.
+#' @export
+#' @examples
+#' guessGender(c("Eamon", "Sean"), countryCode = "US")
+guessGender <- function(nameVector, countryCode = NA, languageCode = NA, apiKey = NA) {
+ checkLanguageCountryCodes(countryCode, languageCode)
+
+ # genderize.io only handles 10 names at a time. Create a list of vectors, each
+ # with no more than 10 names.
+ queryList <- list()
+ while(length(nameVector) > 10) {
+ queryList[[length(queryList)+1]] <- nameVector[1:10]
+ nameVector <- nameVector[11:length(nameVector)]
+ }
+ queryList[[length(queryList)+1]] <- nameVector
+
+ # Run the queries
+ responseList <- list()
+ for (i in seq_along(queryList)) {
+ responseDF <- lookupNameVector(queryList[[i]], countryCode, apiKey)
+ if (is.null(responseDF)) {
+ break
+ } else {
+ responseList[[length(responseList)+1]] <- responseDF
+ }
+ }
+
+ return(do.call(rbind, responseList))
+}