. - moma-collection - The Museum of Modern Art (MoMA) collection data.

commit 209c319eec63382eb282d1ce62ef3a9e543432ff
parent 69500b8a56b1c922591a4c669b330f585796d5b1
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date:   Thu, 30 Jul 2015 12:50:51 -0400

.

Diffstat:
M genderize/grabGenderData.R  | 58 +++++++++++++++++++++++++++++++++++++++++-----------------

1 file changed, 41 insertions(+), 17 deletions(-)
diff --git a/genderize/grabGenderData.R b/genderize/grabGenderData.R
@@ -6,12 +6,22 @@ require(dplyr)
 
 # Helper function grabs gender info for a vector of names -----------------
 
+# Returns NA instead of NULL when a missing list element is requested, otherwise
+# returns the element itself.
+getListElement <- function(listName, elementName) {
+  listElement <- NA
+  if (!is.null(listName[[elementName]])) {
+    listElement <- listName[[elementName]]
+  }
+  return(listElement)
+}
+
 lookupNames <- function(nameVector, countryCode) {
   # Construct the query
   query <- paste("name[", seq_along(nameVector), "]=", nameVector, 
                  sep="", 
                  collapse="&")
-  if (!is.na(foo) & (foo != "none")) {
+  if (!is.na(countryCode) & (countryCode != "none")) {
     query <- paste(query, "&country_id=", countryCode, sep="")
   }
   
@@ -19,7 +29,18 @@ lookupNames <- function(nameVector, countryCode) {
   queryResult <- GET("https://api.genderize.io", query = query)
   if (status_code(queryResult) == 200) {
     responseDF <- fromJSON(content(queryResult, as="text"))
-    # TODO, make sure this is a data.frame with the correct columns.
+    # Normalize the response into a data.frame with a fixed set of columns; any
+    # field missing from the response becomes NA. fromJSON may be able to do
+    # this itself. The code works whether fromJSON returned a list (the response
+    # to one name) or a data.frame (the response to several).
+    responseDF <- data.frame(name = getListElement(responseDF, "name"),
+                             gender = getListElement(responseDF, "gender"),
+                             country_id = getListElement(responseDF, "country_id"),
+                             probability = getListElement(responseDF, "probability"),
+                             count = getListElement(responseDF, "count"),
+                             stringsAsFactors = FALSE)
+    responseDF <- mutate(responseDF, 
+                         country_id = ifelse(is.na(country_id), "none", country_id))
     
   } else {
     cat(paste("\n!!!! http returned status code:",
@@ -27,7 +48,7 @@ lookupNames <- function(nameVector, countryCode) {
               "!!!! message:",
               http_status(queryResult)$message,
               "!!!! error:",
-              http_content(queryResult)$error,
+              content(queryResult)$error,
               sep="\n"))
     if (status_code(queryResult) == 429){
       cat('\n!!!! number of available requests exhaused')
@@ -51,12 +72,13 @@ artistData <- artistData %>%
 
 # Read in the gender data we have so we don't keep querying the same people
 genderData <- read.csv("names_with_genders.csv", stringsAsFactors = FALSE)
-genderData <- genderData %>%
-  mutate(looked_up = TRUE)
+genderData <- mutate(genderData, looked_up = TRUE)
 artistData <- artistData %>%
   left_join(genderData, 
             by = c("iso3166" = "country_id", "first_name" = "name")) %>%
   mutate(looked_up = ifelse(is.na(looked_up), FALSE, looked_up))
+# Drop the temporary looked_up column from genderData now that the join is done
+genderData <- select(genderData, -looked_up)
 
 # Create a list of queries and run them -----------------------------------
 
@@ -74,15 +96,17 @@ for (c in seq_along(countriesConsidered)) {
   queryChunks[[length(queryChunks)+1]] <- list(countriesConsidered[c], countrysNames)
 }
 
-# # Now query all the chunks
-# responseList <- list()
-# for (i in seq_along(queryChunks)) {
-#   responseDF <- lookupNames(queryChunks[[i]][[2]], queryChunks[[i]][[1]])
-#   if (is.null(responseDF)) {
-#     break
-#   } else {
-#     responseList[[length(responseList)+1]] <- responseDF
-#   }
-# }
-# namesWithGenders <- do.call(rbind, responseList)
-# write.csv(namesWithGenders, "names_with_genders.csv", row.names = FALSE)
+# Now query all the chunks and store a list of DFs of their results
+responseList <- list(genderData) # Start with what we already have
+for (i in seq_along(queryChunks)) {
+    responseDF <- lookupNames(queryChunks[[i]][[2]], queryChunks[[i]][[1]])
+  if (is.null(responseDF)) {
+    break
+  } else {
+    responseList[[length(responseList)+1]] <- responseDF
+  }
+}
+
+# Combine the list into a single DF and write to a file
+namesWithGenders <- do.call(rbind, responseList)
+write.csv(namesWithGenders, "names_with_genders.csv", row.names = FALSE)

	moma-collection The Museum of Modern Art (MoMA) collection data.
	git clone https://git.eamoncaddigan.net/moma-collection.git
	Log \| Files \| Refs \| README \| LICENSE