moma-collection

The Museum of Modern Art (MoMA) collection data.
git clone https://git.eamoncaddigan.net/moma-collection.git
Log | Files | Refs | README | LICENSE

commit 209c319eec63382eb282d1ce62ef3a9e543432ff
parent 69500b8a56b1c922591a4c669b330f585796d5b1
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date:   Thu, 30 Jul 2015 12:50:51 -0400

.

Diffstat:
M genderize/grabGenderData.R | 58 +++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/genderize/grabGenderData.R b/genderize/grabGenderData.R @@ -6,12 +6,22 @@ require(dplyr) # Helper function grabs gender info for a vector of names ----------------- +# Returns NA instead of NULL when a missing list element is requested, otherwise +# returns the element itself. +getListElement <- function(listName, elementName) { + listElement <- NA + if (!is.null(listName[[elementName]])) { + listElement <- listName[[elementName]] + } + return(listElement) +} + lookupNames <- function(nameVector, countryCode) { # Construct the query query <- paste("name[", seq_along(nameVector), "]=", nameVector, sep="", collapse="&") - if (!is.na(foo) & (foo != "none")) { + if (!is.na(countryCode) & (countryCode != "none")) { query <- paste(query, "&country_id=", countryCode, sep="") } @@ -19,7 +29,18 @@ lookupNames <- function(nameVector, countryCode) { queryResult <- GET("https://api.genderize.io", query = query) if (status_code(queryResult) == 200) { responseDF <- fromJSON(content(queryResult, as="text")) - # TODO, make sure this is a data.frame with the correct columns. + # Make sure this is a data.frame with the correct columns. I bet fromJSON + # can do this for me but I don't know how. This code works whether fromJSON + # returned a list (the response to one name) or a data.frame (the response + # to several). + responseDF <- data.frame(name = getListElement(responseDF, "name"), + gender = getListElement(responseDF, "gender"), + country_id = getListElement(responseDF, "country_id"), + probability = getListElement(responseDF, "probability"), + count = getListElement(responseDF, "count"), + stringsAsFactors = FALSE) + responseDF <- mutate(responseDF, + country_id = ifelse(is.na(country_id), "none", country_id)) } else { cat(paste("\n!!!! http returned status code:", @@ -27,7 +48,7 @@ lookupNames <- function(nameVector, countryCode) { "!!!! message:", http_status(queryResult)$message, "!!!! 
error:", - http_content(queryResult)$error, + content(queryResult)$error, sep="\n")) if (status_code(queryResult) == 429){ cat('\n!!!! number of available requests exhaused') @@ -51,12 +72,13 @@ artistData <- artistData %>% # Read in the gender data we have so we don't keep querying the same people genderData <- read.csv("names_with_genders.csv", stringsAsFactors = FALSE) -genderData <- genderData %>% - mutate(looked_up = TRUE) +genderData <- mutate(genderData, looked_up = TRUE) artistData <- artistData %>% left_join(genderData, by = c("iso3166" = "country_id", "first_name" = "name")) %>% mutate(looked_up = ifelse(is.na(looked_up), FALSE, looked_up)) +# Take the new column off of genderData +genderData <- select(genderData, -looked_up) # Create a list of queries and run them ----------------------------------- @@ -74,15 +96,17 @@ for (c in seq_along(countriesConsidered)) { queryChunks[[length(queryChunks)+1]] <- list(countriesConsidered[c], countrysNames) } -# # Now query all the chunks -# responseList <- list() -# for (i in seq_along(queryChunks)) { -# responseDF <- lookupNames(queryChunks[[i]][[2]], queryChunks[[i]][[1]]) -# if (is.null(responseDF)) { -# break -# } else { -# responseList[[length(responseList)+1]] <- responseDF -# } -# } -# namesWithGenders <- do.call(rbind, responseList) -# write.csv(namesWithGenders, "names_with_genders.csv", row.names = FALSE) +# Now query all the chunks and store a list of DFs of their results +responseList <- list(genderData) # Start with what we already have +for (i in seq_along(queryChunks)) { + responseDF <- lookupNames(queryChunks[[i]][[2]], queryChunks[[i]][[1]]) + if (is.null(responseDF)) { + break + } else { + responseList[[length(responseList)+1]] <- responseDF + } +} + +# Combine the list into a single DF and write to a file +namesWithGenders <- do.call(rbind, responseList) +write.csv(namesWithGenders, "names_with_genders.csv", row.names = FALSE)