commit 209c319eec63382eb282d1ce62ef3a9e543432ff
parent 69500b8a56b1c922591a4c669b330f585796d5b1
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date: Thu, 30 Jul 2015 12:50:51 -0400
.
Diffstat:
1 file changed, 41 insertions(+), 17 deletions(-)
diff --git a/genderize/grabGenderData.R b/genderize/grabGenderData.R
@@ -6,12 +6,22 @@ require(dplyr)
# Helper function grabs gender info for a vector of names -----------------
+# Fetch listName[[elementName]], returning NA instead of NULL when that element
+# is missing; otherwise the element itself is returned unchanged.
+getListElement <- function(listName, elementName) {
+ listElement <- NA # fallback value, kept when the lookup below yields NULL
+ if (!is.null(listName[[elementName]])) { # NULL here means no such element
+ listElement <- listName[[elementName]]
+ }
+ return(listElement)
+}
+
lookupNames <- function(nameVector, countryCode) {
# Construct the query
query <- paste("name[", seq_along(nameVector), "]=", nameVector,
sep="",
collapse="&")
- if (!is.na(foo) & (foo != "none")) {
+ if (!is.na(countryCode) & (countryCode != "none")) {
query <- paste(query, "&country_id=", countryCode, sep="")
}
@@ -19,7 +29,18 @@ lookupNames <- function(nameVector, countryCode) {
queryResult <- GET("https://api.genderize.io", query = query)
if (status_code(queryResult) == 200) {
responseDF <- fromJSON(content(queryResult, as="text"))
- # TODO, make sure this is a data.frame with the correct columns.
+ # Make sure this is a data.frame with the correct columns. I bet fromJSON
+ # can do this for me but I don't know how. This code works whether fromJSON
+ # returned a list (the response to one name) or a data.frame (the response
+ # to several).
+ responseDF <- data.frame(name = getListElement(responseDF, "name"),
+ gender = getListElement(responseDF, "gender"),
+ country_id = getListElement(responseDF, "country_id"),
+ probability = getListElement(responseDF, "probability"),
+ count = getListElement(responseDF, "count"),
+ stringsAsFactors = FALSE)
+ responseDF <- mutate(responseDF,
+ country_id = ifelse(is.na(country_id), "none", country_id))
} else {
cat(paste("\n!!!! http returned status code:",
@@ -27,7 +48,7 @@ lookupNames <- function(nameVector, countryCode) {
"!!!! message:",
http_status(queryResult)$message,
"!!!! error:",
- http_content(queryResult)$error,
+ content(queryResult)$error,
sep="\n"))
if (status_code(queryResult) == 429){
cat('\n!!!! number of available requests exhaused')
@@ -51,12 +72,13 @@ artistData <- artistData %>%
# Read in the gender data we have so we don't keep querying the same people
genderData <- read.csv("names_with_genders.csv", stringsAsFactors = FALSE)
-genderData <- genderData %>%
- mutate(looked_up = TRUE)
+genderData <- mutate(genderData, looked_up = TRUE)
artistData <- artistData %>%
left_join(genderData,
by = c("iso3166" = "country_id", "first_name" = "name")) %>%
mutate(looked_up = ifelse(is.na(looked_up), FALSE, looked_up))
+# Take the new column off of genderData
+genderData <- select(genderData, -looked_up)
# Create a list of queries and run them -----------------------------------
@@ -74,15 +96,17 @@ for (c in seq_along(countriesConsidered)) {
queryChunks[[length(queryChunks)+1]] <- list(countriesConsidered[c], countrysNames)
}
-# # Now query all the chunks
-# responseList <- list()
-# for (i in seq_along(queryChunks)) {
-# responseDF <- lookupNames(queryChunks[[i]][[2]], queryChunks[[i]][[1]])
-# if (is.null(responseDF)) {
-# break
-# } else {
-# responseList[[length(responseList)+1]] <- responseDF
-# }
-# }
-# namesWithGenders <- do.call(rbind, responseList)
-# write.csv(namesWithGenders, "names_with_genders.csv", row.names = FALSE)
+# Query each chunk in order and collect one data.frame of results per chunk
+responseList <- list(genderData) # Seed with the lookups read from the CSV
+for (i in seq_along(queryChunks)) {
+ responseDF <- lookupNames(queryChunks[[i]][[2]], queryChunks[[i]][[1]]) # chunk = list(country, names)
+ if (is.null(responseDF)) {
+ break # NOTE(review): presumably lookupNames returns NULL on a failed request - confirm
+ } else {
+ responseList[[length(responseList)+1]] <- responseDF # append this chunk's rows
+ }
+}
+
+# Row-bind old and new results into one DF and write it back to the same CSV
+namesWithGenders <- do.call(rbind, responseList)
+write.csv(namesWithGenders, "names_with_genders.csv", row.names = FALSE)