moma-collection

The Museum of Modern Art (MoMA) collection data.
git clone https://git.eamoncaddigan.net/moma-collection.git
Log | Files | Refs | README | LICENSE

commit c5f39b0680ce455ebe92da5ad67a25dd4c82ad4f
parent 26ee723380d3524f2caeb5d68175b0375983894a
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date:   Sun, 14 Feb 2016 21:39:26 -0500

Working on a big rewrite.

Diffstat:
MartistInfo.R | 308++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 234 insertions(+), 74 deletions(-)

diff --git a/artistInfo.R b/artistInfo.R @@ -1,82 +1,242 @@ -# A request! https://twitter.com/mkurlandsky/status/626363245675286528 - -require(dplyr) -require(tidyr) -require(ggplot2) -require(RSQLite) - -# Read the artist information out of the collection table ----------------- - -artists <- read.csv("Artworks.csv", stringsAsFactors = FALSE) %>% - select(Artist, ArtistBio) %>% - distinct() %>% - # Remove entries with missing bios - filter(ArtistBio != "", - !grepl("unknown", ArtistBio, ignore.case = TRUE), - grepl("[[:alpha:]]+", ArtistBio), - !grepl("unknown", Artist, ignore.case = TRUE)) %>% - # Try to remove anything with multiple artists - filter(!grepl(" and ", Artist), - !grepl(",", Artist)) %>% - # Not sure why dups are making it through. Unicode? +# This code reads in MoMA's Artworks.csv and creates a SQLite DB with two tables: +# * artworks: information about the pieces in MoMA's collection +# * artists: information about the people who made them + +library(dplyr) +library(purrr) +library(tidyr) +library(RSQLite) + +# Read in the info -------------------------------------------------------- + +momaData <- read.csv("Artworks.csv", stringsAsFactors = FALSE) +# Convert the venerable CamelCase column names to lowely snake_case. +colnames(momaData) <- colnames(momaData) %>% + # Sorry museum branding guidelines + gsub("MoMA", "moma", .) %>% + gsub("(([a-z])([A-Z]))", "\\2_\\3", .) %>% + tolower() + +# Create a linking table to handle the artist-object reationship ---------- + +artistsObjects <- momaData %>% + select(object_id, artist_bio, artist) %>% + # Split the artist and artist_bio columns into lists. + mutate(artist_bios_list = strsplit(artist_bio, "(?<=\\))[[:blank:]]*(?=\\()", + perl = TRUE), + num_bios = map_int(artist_bios_list, length), + artist_list = strsplit(artist, ",[[:blank:]]*"), + num_artists = map_int(artist_list, length)) %>% + # XXX: This currently removes 5952 records. + filter(num_artists > 0, + num_bios == num_artists) + +# Create duplicate rows for each object with multiple artists. +artistsObjects <- artistsObjects[rep(seq(nrow(artistsObjects)), + artistsObjects$num_artists), ] + +# Select one artist/artist_bio entry per row, and then create a unique artist_id +# for each unique artist/artist_bio entry. This also gets rid of the parens +# around the bio. +artistsObjects <- artistsObjects %>% + group_by(object_id) %>% + mutate(artist_index = row_number(num_artists)) %>% + ungroup() %>% + mutate(artist = map2_chr(artist_list, artist_index, ~ .x[.y]), + artist_bio = sub("^\\((.*)\\)$", "\\1", + map2_chr(artist_bios_list, artist_index, ~ .x[.y])), + artist_id = min_rank(paste(artist, artist_bio))) + + +# Normalize the DB into artist/object/and linking tables ------------------ + +# Create the artists table +artists <- artistsObjects %>% + select(artist_id, artist, artist_bio) %>% distinct() +# Drop all of the redundant info from the linking table. +artistsObjects <- artistsObjects %>% + select(object_id, artist_id) + +# Drop all of the redundant info from the objects table. +artworks <- momaData %>% + select(-artist, -artist_bio) + # Extract birth nation and nationality from the artist bio ---------------- -artists <- artists %>% - mutate(birth_nation = sub("U\\.S\\.A\\.", "United States", ArtistBio), - birth_nation = ifelse(grepl("born [[:alpha:]]+", birth_nation), - sub(".*born ([[:alpha:][:space:]]*).*", "\\1", birth_nation), - NA), - birth_nation = ifelse(birth_nation %in% c("c", "ca"), NA, birth_nation), - birth_nation = sub("\\W*$", "", birth_nation), - birth_nation = ifelse(birth_nation %in% c("US", "USA"), "United States", birth_nation), - nationality = sub("^[^[:alpha:]]([[:alpha:][:space:]]*).*", "\\1", ArtistBio), - nationality = sub("\\W*$", "", nationality), - nationality = sub(" and .*", "", nationality), - birth_nationality = ifelse(!is.na(birth_nation), birth_nation, nationality), - first_name = sub("([[:alpha:]]*).*", "\\1", Artist)) - - -# Add country codes to each artist ---------------------------------------- - -nationalitiesToCodes <- read.csv("countries/nationalities_codes.csv", stringsAsFactors = FALSE) -artists <- artists %>% - # Add the country code if there is one - left_join(nationalitiesToCodes, by = c("birth_nationality" = "nationality")) - -# Find unique first name / country code pairs for genderizing -artist.firstNames <- artists %>% - select(first_name, iso3166) %>% - distinct() %>% - arrange(iso3166, first_name) %>% - # NAs won't work if we go to CSV and read them back in! - mutate(iso3166 = ifelse(is.na(iso3166), "none", iso3166)) - -if (!file.exists("genderize/names_to_genderize.csv")) { - write.csv(artist.firstNames, "genderize/names_to_genderize.csv", - row.names = FALSE) -} else { - nameGenders <- read.csv("genderize/names_with_genders.csv", - stringsAsFactors = FALSE) %>% - select(name, gender, country_id) +# Not 100% happy with my approach here, but it uses a lot of functional +# programming so there's that. + +# This function takes a pattern and other options, and returns a function that +# will apply that pattern to a string and use those other options to extract +# relevant info. +makePatternMatcher <- function(pattern, + nationalityIndex = NA, + birthNationIndex = NA, + birthYearIndex = NA, + deathYearIndex = NA, + isEntityDefault = FALSE) { + # THESE functions return a data.frame containing the extracted artist_bio + # information + patternMatcher <- function(artistBio) { + m <- regexec(pattern, artistBio) + matches <- regmatches(artistBio, m) + + isMatch <- map_lgl(matches, ~length(.x) > 0) + nationality <- map_chr(matches, ~ifelse(length(.x) < nationalityIndex+1, NA, + .x[nationalityIndex+1])) + birthNation <- map_chr(matches, ~ifelse(length(.x) < birthNationIndex+1, NA, + .x[birthNationIndex+1])) + birthYear <- map_int(matches, ~ifelse(length(.x) < birthYearIndex+1, NA, + as.integer(.x[birthYearIndex+1]))) + deathYear <- map_int(matches, ~ifelse(length(.x) < deathYearIndex+1, NA, + as.integer(.x[deathYearIndex+1]))) + isEntity <- isEntityDefault + + return(data_frame(is_match = isMatch, + nationality, + birth_nation = birthNation, + birth_year = birthYear, + death_year = deathYear, + is_entity = isEntity)) + } - # Add genders to the artists - artists <- artists %>% - # First, pretend we have no country info and get the genders that way - mutate(no_iso3166 = "none") %>% - left_join(nameGenders, by = c("first_name" = "name", "no_iso3166" = "country_id")) %>% - rename(no_country_gender = gender) %>% - # Now get the genders using the country info - left_join(nameGenders, by = c("first_name" = "name", "iso3166" = "country_id")) %>% - # Fill in missing gender info using the no-country gender info - mutate(gender = ifelse(is.na(gender), no_country_gender, gender)) %>% - # Drop the dummy columns - select(-no_iso3166, -no_country_gender) + return(patternMatcher) +} + +patternFuns <- list() +# Nationality, born Country YYYY +# Nationality, born Country (now Other) YYYY +patternFuns[[1]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+) ([[:digit:]]{4})$", + nationalityIndex = 1, birthNationIndex = 2, birthYearIndex = 3) +# Nationality, born Country. YYYY-YYYY +patternFuns[[2]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+)([[:digit:]]{4})[[:punct:]]([[:digit:]]{4})$", + nationalityIndex = 1, birthNationIndex = 2, birthYearIndex = 3, deathYearIndex = 4) +# Nationality, born YYYY +patternFuns[[3]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), b(?:\\.|orn) ([[:digit:]]{4})$", + nationalityIndex = 1, birthYearIndex = 2) +# Nationality, YYYY-YYYY +patternFuns[[4]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*),[[:space:]]*([[:digit:]]{4})[[:punct:]]+([[:digit:]]{4})$", + nationalityIndex = 1, birthYearIndex = 2, deathYearIndex = 3) +# Nationality, born Country +patternFuns[[5]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+)$", + nationalityIndex = 1, birthNationIndex = 2) +# Nationality +patternFuns[[6]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*)$", + nationalityIndex = 1) +# Nationality, (established|est.|founded) YYYY +patternFuns[[7]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), (?:(?:established|est\\.)|founded) ([[:digit:]]{4})$", + nationalityIndex = 1, birthYearIndex = 2, isEntityDefault = TRUE) +# est. YYYY +patternFuns[[8]] <- makePatternMatcher("^est(?:\\.|ablished) ([[:digit:]]{4})$", + birthYearIndex = 1, isEntityDefault = TRUE) +# Nationality, died YYYY +patternFuns[[9]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), died ([[:digit:]]{4})$", + nationalityIndex = 1, deathYearIndex = 2) +# born YYYY +patternFuns[[10]] <- makePatternMatcher("^born ([[:digit:]]{4})$", + birthYearIndex = 1) +# YYYY-YYYY +patternFuns[[11]] <- makePatternMatcher("^([[:digit:]]{4})[[:punct:]]+([[:digit:]]{4})$", + birthYearIndex = 1, deathYearIndex = 2) +# Nationality, YYYY-? +patternFuns[[12]] <- makePatternMatcher("^([[:alpha:]]+), ([[:digit:]]{4})[[:punct:]]*$", + nationalityIndex = 1, birthYearIndex = 2) +# Nationality, est. YYYY-YYYY +patternFuns[[13]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), (?:(?:established|est\\.)|founded) ([[:digit:]]{4})[[:punct:]]([[:digit:]]{4})$", + nationalityIndex = 1, birthYearIndex = 2, deathYearIndex = 3, isEntityDefault = TRUE) + + +# Prepare to add the new biographical columns +commonCols <- c("nationality", "birth_nation", + "birth_year", "death_year", "is_entity") +for (c in commonCols) { + artists[[c]] <- NA +} + +# Apply the pattern-matching functions to the artist_bios. This keeps track of +# which rows have previously matched so that the patternFuns aren't each run on +# all of the data. +matchFalses <- rep(FALSE, length(artists$artist_bio)) +matchedAlready <- matchFalses +for (patternFun in patternFuns) { + # Apply patternFun to extract the biographical information for bios that + # haven't been matched already. + bioData <- patternFun(artists$artist_bio[!matchedAlready]) + + # Where are new matches? + matchedHere <- matchFalses + matchedHere[!matchedAlready] <- bioData$is_match + + # Fill in the data for the new matches. + artists[matchedHere, commonCols] <- bioData[bioData$is_match, commonCols] - # Alright. All this CSV stuff is getting out of hand. Time for a RDB. :/ - momaDB <- dbConnect(RSQLite::SQLite(), "momaDB.sqlite") - dbWriteTable(momaDB, "artists", artists) - dbDisconnect(momaDB) + # Update the list of matched rows. + matchedAlready <- matchedAlready | matchedHere } + +# Clean up nationality and birth_nation +artists <- artists %>% + mutate(nationality = ifelse(grepl("unknown", nationality, ignore.case = TRUE), + NA, nationality), + nationality = sub(".*\\(now ([[:alpha:]][[:alpha:][:punct:][:space:]]*)\\).*", + "\\1", nationality), + nationality = sub(" \\((?:then|former) [[:alpha:]][[:alpha:][:punct:][:space:]]*\\)", + "", nationality), + nationality = sub(", n\\.[[:space:]]?d\\.$", "", nationality), + nationality = ifelse(nationality == "USA", "American", nationality), + nationality = ifelse(nationality == "UK", "British", nationality), + nationality = sub(", active.*", "", nationality), + birth_nation = ifelse(grepl("^ca?\\.?$", birth_nation), NA, birth_nation), + birth_nation = sub(".*\\(now ([[:alpha:]][[:alpha:][:punct:][:space:]]*)\\).*", + "\\1", birth_nation), + birth_nation = sub("[^[:alpha:]]*$", "", birth_nation), + birth_nation = ifelse(grepl("U\\.?S\\.?A\\.?", birth_nation), + "United States", birth_nation), + birth_nation = sub(" \\(.*", "", birth_nation), + birth_nation = sub("^ ?in ", "", birth_nation), + birth_nation = sub(", born.*", "", birth_nation)) + +# # Add country codes to each artist ---------------------------------------- +# +# nationalitiesToCodes <- read.csv("countries/nationalities_codes.csv", stringsAsFactors = FALSE) +# artists <- artists %>% +# # Add the country code if there is one +# left_join(nationalitiesToCodes, by = c("birth_nationality" = "nationality")) +# +# # Find unique first name / country code pairs for genderizing +# artist.firstNames <- artists %>% +# select(first_name, iso3166) %>% +# distinct() %>% +# arrange(iso3166, first_name) %>% +# # NAs won't work if we go to CSV and read them back in! +# mutate(iso3166 = ifelse(is.na(iso3166), "none", iso3166)) +# +# if (!file.exists("genderize/names_to_genderize.csv")) { +# write.csv(artist.firstNames, "genderize/names_to_genderize.csv", +# row.names = FALSE) +# } else { +# nameGenders <- read.csv("genderize/names_with_genders.csv", +# stringsAsFactors = FALSE) %>% +# select(name, gender, country_id) +# +# # Add genders to the artists +# artists <- artists %>% +# # First, pretend we have no country info and get the genders that way +# mutate(no_iso3166 = "none") %>% +# left_join(nameGenders, by = c("first_name" = "name", "no_iso3166" = "country_id")) %>% +# rename(no_country_gender = gender) %>% +# # Now get the genders using the country info +# left_join(nameGenders, by = c("first_name" = "name", "iso3166" = "country_id")) %>% +# # Fill in missing gender info using the no-country gender info +# mutate(gender = ifelse(is.na(gender), no_country_gender, gender)) %>% +# # Drop the dummy columns +# select(-no_iso3166, -no_country_gender) +# +# # Alright. All this CSV stuff is getting out of hand. Time for a RDB. :/ +# momaDB <- dbConnect(RSQLite::SQLite(), "momaDB.sqlite") +# dbWriteTable(momaDB, "artists", artists) +# dbDisconnect(momaDB) +# }