moma-collection

The Museum of Modern Art (MoMA) collection data.
git clone https://git.eamoncaddigan.net/moma-collection.git
Log | Files | Refs | README | LICENSE

grabGenderData.R (4614B)


# Adds gender data to the artists' first names using the genderize.io API
      2 
      3 require(jsonlite)
      4 require(httr)
      5 require(dplyr)
      6 
      7 genderizeKey <- NA
      8 inputCSV <- "names_to_genderize.csv"
      9 
     10 
     11 # Helper function grabs gender info for a vector of names -----------------
     12 
     13 # Returns NA instead of NULL when a missing list element is requested, otherwise
     14 # returns the element itself.
     15 getListElement <- function(listName, elementName) {
     16   listElement <- NA
     17   if (!is.null(listName[[elementName]])) {
     18     listElement <- listName[[elementName]]
     19   }
     20   return(listElement)
     21 }
     22 
     23 lookupNames <- function(nameVector, countryCode = NA, apiKey = NA) {
     24   # Construct the query
     25   query <- paste("name[", seq_along(nameVector), "]=", nameVector, 
     26                  sep="", 
     27                  collapse="&")
     28   if (!is.na(countryCode) & (countryCode != "none")) {
     29     query <- paste(query, "&country_id=", countryCode, sep="")
     30   }
     31   if (!is.na(apiKey)) {
     32     query <- paste(query, "&apikey=", apiKey, sep="")
     33   }
     34   
     35   # Run it!
     36   queryResult <- GET("https://api.genderize.io", query = query)
     37   if (status_code(queryResult) == 200) {
     38     responseDF <- fromJSON(content(queryResult, as="text"))
     39     # Make sure this is a data.frame with the correct columns. I bet fromJSON 
     40     # can do this for me but I don't know how. This code works whether fromJSON 
     41     # returned a list (the response to one name) or a data.frame (the response
     42     # to several).
     43     responseDF <- data.frame(name = getListElement(responseDF, "name"),
     44                              gender = getListElement(responseDF, "gender"),
     45                              country_id = getListElement(responseDF, "country_id"),
     46                              probability = getListElement(responseDF, "probability"),
     47                              count = getListElement(responseDF, "count"),
     48                              stringsAsFactors = FALSE)
     49     responseDF <- mutate(responseDF, 
     50                          country_id = ifelse(is.na(country_id), "none", country_id))
     51     
     52   } else {
     53     cat(paste("\n!!!! http returned status code:",
     54               status_code(queryResult),
     55               "!!!! message:",
     56               http_status(queryResult)$message,
     57               "!!!! error:",
     58               content(queryResult)$error,
     59               sep="\n"))
     60     if (status_code(queryResult) == 429){
     61       cat('\n!!!! number of available requests exhaused')
     62     }
     63     responseDF <- NULL
     64   }
     65   return(responseDF)
     66 }
     67 
     68 
     69 # Read in the name and existing gender info -------------------------------
     70 
     71 # Load the genderize.io supported countries
     72 genderizeCountries <- fromJSON("countries.json")[[2]]
     73 
     74 # Read in the DF of artist data
     75 artistData <- read.csv(inputCSV, stringsAsFactors = FALSE)
     76 artistData <- artistData %>%
     77   mutate(iso3166 = ifelse(iso3166 %in% genderizeCountries, iso3166, "none")) %>%
     78   arrange(iso3166, first_name)
     79 
     80 # Read in the gender data we have so we don't keep querying the same people
     81 genderData <- read.csv("names_with_genders.csv", stringsAsFactors = FALSE)
     82 genderData <- mutate(genderData, looked_up = TRUE)
     83 artistData <- artistData %>%
     84   left_join(genderData, 
     85             by = c("iso3166" = "country_id", "first_name" = "name")) %>%
     86   mutate(looked_up = ifelse(is.na(looked_up), FALSE, looked_up))
     87 # Take the new column off of genderData
     88 genderData <- select(genderData, -looked_up)
     89 
     90 # Create a list of queries and run them -----------------------------------
     91 
     92 # Break the data frame of name/country combos into a list of query chunks
     93 artistData <- filter(artistData, !looked_up)
     94 queryChunks = list()
     95 countriesConsidered <- unique(artistData$iso3166)
     96 for (c in seq_along(countriesConsidered)) {
     97   countrysNames <- artistData$first_name[artistData$iso3166 == countriesConsidered[c]]
     98   # Can only query up to 10 names at a time
     99   while(length(countrysNames) > 10) {
    100     queryChunks[[length(queryChunks)+1]] <- list(countriesConsidered[c], countrysNames[1:10])
    101     countrysNames <- countrysNames[11:length(countrysNames)]
    102   }
    103   queryChunks[[length(queryChunks)+1]] <- list(countriesConsidered[c], countrysNames)
    104 }
    105 
    106 # Now query all the chunks and store a list of DFs of their results
    107 responseList <- list(genderData) # Start with what we already have
    108 for (i in seq_along(queryChunks)) {
    109     responseDF <- lookupNames(queryChunks[[i]][[2]], queryChunks[[i]][[1]],
    110                               genderizeKey)
    111   if (is.null(responseDF)) {
    112     break
    113   } else {
    114     responseList[[length(responseList)+1]] <- responseDF
    115   }
    116 }
    117 
    118 # Combine the list into a single DF and write to a file
    119 namesWithGenders <- do.call(rbind, responseList)
    120 write.csv(namesWithGenders, "names_with_genders.csv", row.names = FALSE)