Working on a big rewrite. - moma-collection - The Museum of Modern Art (MoMA) collection data.

commit c5f39b0680ce455ebe92da5ad67a25dd4c82ad4f
parent 26ee723380d3524f2caeb5d68175b0375983894a
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date:   Sun, 14 Feb 2016 21:39:26 -0500

Working on a big rewrite.

Diffstat:
M artistInfo.R  | 308 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------

1 file changed, 234 insertions(+), 74 deletions(-)
diff --git a/artistInfo.R b/artistInfo.R
@@ -1,82 +1,242 @@
-# A request! https://twitter.com/mkurlandsky/status/626363245675286528
-
-require(dplyr)
-require(tidyr)
-require(ggplot2)
-require(RSQLite)
-
-# Read the artist information out of the collection table -----------------
-
-artists <- read.csv("Artworks.csv", stringsAsFactors = FALSE) %>%
-  select(Artist, ArtistBio) %>%
-  distinct() %>%
-  # Remove entries with missing bios
-  filter(ArtistBio != "",
-         !grepl("unknown", ArtistBio, ignore.case = TRUE),
-         grepl("[[:alpha:]]+", ArtistBio),
-         !grepl("unknown", Artist, ignore.case = TRUE)) %>%
-  # Try to remove anything with multiple artists
-  filter(!grepl(" and ", Artist), 
-         !grepl(",", Artist)) %>%
-  # Not sure why dups are making it through. Unicode?
+# This code reads in MoMA's Artworks.csv and creates a SQLite DB with two tables:
+# * artworks: information about the pieces in MoMA's collection
+# * artists: information about the people who made them
+
+library(dplyr)
+library(purrr)
+library(tidyr)
+library(RSQLite)
+
+# Read in the info --------------------------------------------------------
+
+momaData <- read.csv("Artworks.csv", stringsAsFactors = FALSE)
+# Convert the venerable CamelCase column names to lowely snake_case. 
+colnames(momaData) <- colnames(momaData) %>% 
+  # Sorry museum branding guidelines
+  gsub("MoMA", "moma", .) %>% 
+  gsub("(([a-z])([A-Z]))", "\\2_\\3", .) %>% 
+  tolower()
+
+# Create a linking table to handle the artist-object reationship ----------
+
+artistsObjects <- momaData %>%
+  select(object_id, artist_bio, artist) %>%
+  # Split the artist and artist_bio columns into lists.
+  mutate(artist_bios_list = strsplit(artist_bio, "(?<=\\))[[:blank:]]*(?=\\()", 
+                                     perl = TRUE),
+         num_bios = map_int(artist_bios_list, length),
+         artist_list = strsplit(artist, ",[[:blank:]]*"),
+         num_artists = map_int(artist_list, length)) %>%
+  # XXX: This currently removes 5952 records.
+  filter(num_artists > 0,
+         num_bios == num_artists)
+
+# Create duplicate rows for each object with multiple artists.
+artistsObjects <- artistsObjects[rep(seq(nrow(artistsObjects)), 
+                                     artistsObjects$num_artists), ]
+
+# Select one artist/artist_bio entry per row, and then create a unique artist_id
+# for each unique artist/artist_bio entry. This also gets rid of the parens 
+# around the bio.
+artistsObjects <- artistsObjects %>% 
+  group_by(object_id) %>%
+  mutate(artist_index = row_number(num_artists)) %>%
+  ungroup() %>%
+  mutate(artist = map2_chr(artist_list, artist_index, ~ .x[.y]),
+         artist_bio = sub("^\\((.*)\\)$", "\\1",
+                          map2_chr(artist_bios_list, artist_index, ~ .x[.y])),
+         artist_id = min_rank(paste(artist, artist_bio)))
+
+
+# Normalize the DB into artist/object/and linking tables ------------------
+
+# Create the artists table
+artists <- artistsObjects %>%
+  select(artist_id, artist, artist_bio) %>%
   distinct()
 
+# Drop all of the redundant info from the linking table.
+artistsObjects <- artistsObjects %>%
+  select(object_id, artist_id)
+
+# Drop all of the redundant info from the objects table.
+artworks <- momaData %>%
+  select(-artist, -artist_bio)
+
 
 # Extract birth nation and nationality from the artist bio ----------------
 
-artists <- artists %>%   
-  mutate(birth_nation = sub("U\\.S\\.A\\.", "United States", ArtistBio),
-         birth_nation = ifelse(grepl("born [[:alpha:]]+", birth_nation),
-                               sub(".*born ([[:alpha:][:space:]]*).*", "\\1", birth_nation),
-                               NA),
-         birth_nation = ifelse(birth_nation %in% c("c", "ca"), NA, birth_nation),
-         birth_nation = sub("\\W*$", "", birth_nation),
-         birth_nation = ifelse(birth_nation %in% c("US", "USA"), "United States", birth_nation),
-         nationality = sub("^[^[:alpha:]]([[:alpha:][:space:]]*).*", "\\1", ArtistBio),
-         nationality = sub("\\W*$", "", nationality),
-         nationality = sub(" and .*", "", nationality),
-         birth_nationality = ifelse(!is.na(birth_nation), birth_nation, nationality),
-         first_name = sub("([[:alpha:]]*).*", "\\1", Artist))
-
-
-# Add country codes to each artist ----------------------------------------
-
-nationalitiesToCodes <- read.csv("countries/nationalities_codes.csv", stringsAsFactors = FALSE)
-artists <- artists %>%
-  # Add the country code if there is one
-  left_join(nationalitiesToCodes, by = c("birth_nationality" = "nationality"))
-
-# Find unique first name / country code pairs for genderizing
-artist.firstNames <- artists %>%
-  select(first_name, iso3166) %>%
-  distinct() %>%
-  arrange(iso3166, first_name) %>%
-  # NAs won't work if we go to CSV and read them back in!
-  mutate(iso3166 = ifelse(is.na(iso3166), "none", iso3166))
-
-if (!file.exists("genderize/names_to_genderize.csv")) {
-  write.csv(artist.firstNames, "genderize/names_to_genderize.csv", 
-            row.names = FALSE)
-} else {
-  nameGenders <- read.csv("genderize/names_with_genders.csv", 
-                          stringsAsFactors = FALSE) %>%
-    select(name, gender, country_id)
+# Not 100% happy with my approach here, but it uses a lot of functional
+# programming so there's that.
+
+# This function takes a pattern and other options, and returns a function that
+# will apply that pattern to a string and use those other options to extract
+# relevant info.
+makePatternMatcher <- function(pattern, 
+                               nationalityIndex = NA, 
+                               birthNationIndex = NA, 
+                               birthYearIndex = NA, 
+                               deathYearIndex = NA,
+                               isEntityDefault = FALSE) {
+  # THESE functions return a data.frame containing the extracted artist_bio
+  # information
+  patternMatcher <- function(artistBio) {
+    m <- regexec(pattern, artistBio)
+    matches <- regmatches(artistBio, m)
+    
+    isMatch     <- map_lgl(matches, ~length(.x) > 0)
+    nationality <- map_chr(matches, ~ifelse(length(.x) < nationalityIndex+1, NA, 
+                                            .x[nationalityIndex+1]))
+    birthNation <- map_chr(matches, ~ifelse(length(.x) < birthNationIndex+1, NA, 
+                                            .x[birthNationIndex+1]))
+    birthYear   <- map_int(matches, ~ifelse(length(.x) < birthYearIndex+1, NA, 
+                                            as.integer(.x[birthYearIndex+1])))
+    deathYear   <- map_int(matches, ~ifelse(length(.x) < deathYearIndex+1, NA, 
+                                            as.integer(.x[deathYearIndex+1])))
+    isEntity    <- isEntityDefault
+    
+    return(data_frame(is_match = isMatch, 
+                      nationality, 
+                      birth_nation = birthNation, 
+                      birth_year = birthYear, 
+                      death_year = deathYear, 
+                      is_entity = isEntity))
+  }
   
-  # Add genders to the artists
-  artists <- artists %>%
-    # First, pretend we have no country info and get the genders that way
-    mutate(no_iso3166 = "none") %>%
-    left_join(nameGenders, by = c("first_name" = "name", "no_iso3166" = "country_id")) %>%
-    rename(no_country_gender = gender) %>%
-    # Now get the genders using the country info
-    left_join(nameGenders, by = c("first_name" = "name", "iso3166" = "country_id")) %>%
-    # Fill in missing gender info using the no-country gender info
-    mutate(gender = ifelse(is.na(gender), no_country_gender, gender)) %>%
-    # Drop the dummy columns
-    select(-no_iso3166, -no_country_gender)
+  return(patternMatcher)
+}
+
+patternFuns <- list()
+# Nationality, born Country YYYY
+# Nationality, born Country (now Other) YYYY
+patternFuns[[1]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+) ([[:digit:]]{4})$",
+                                       nationalityIndex = 1, birthNationIndex = 2, birthYearIndex = 3)
+# Nationality, born Country. YYYY-YYYY
+patternFuns[[2]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+)([[:digit:]]{4})[[:punct:]]([[:digit:]]{4})$",
+                                       nationalityIndex = 1, birthNationIndex = 2, birthYearIndex = 3, deathYearIndex = 4)
+# Nationality, born YYYY
+patternFuns[[3]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), b(?:\\.|orn) ([[:digit:]]{4})$",
+                                       nationalityIndex = 1, birthYearIndex = 2)
+# Nationality, YYYY-YYYY
+patternFuns[[4]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*),[[:space:]]*([[:digit:]]{4})[[:punct:]]+([[:digit:]]{4})$",
+                                       nationalityIndex = 1, birthYearIndex = 2, deathYearIndex = 3)
+# Nationality, born Country
+patternFuns[[5]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+)$",
+                                       nationalityIndex = 1, birthNationIndex = 2)
+# Nationality
+patternFuns[[6]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*)$",
+                                       nationalityIndex = 1)
+# Nationality, (established|est.|founded) YYYY
+patternFuns[[7]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), (?:(?:established|est\\.)|founded) ([[:digit:]]{4})$",
+                                       nationalityIndex = 1, birthYearIndex = 2, isEntityDefault = TRUE)
+# est. YYYY
+patternFuns[[8]] <- makePatternMatcher("^est(?:\\.|ablished) ([[:digit:]]{4})$",
+                                       birthYearIndex = 1, isEntityDefault = TRUE)
+# Nationality, died YYYY
+patternFuns[[9]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), died ([[:digit:]]{4})$",
+                                       nationalityIndex = 1, deathYearIndex = 2)
+# born YYYY
+patternFuns[[10]] <- makePatternMatcher("^born ([[:digit:]]{4})$",
+                                       birthYearIndex = 1)
+# YYYY-YYYY
+patternFuns[[11]] <- makePatternMatcher("^([[:digit:]]{4})[[:punct:]]+([[:digit:]]{4})$",
+                                        birthYearIndex = 1, deathYearIndex = 2)
+# Nationality, YYYY-?
+patternFuns[[12]] <- makePatternMatcher("^([[:alpha:]]+), ([[:digit:]]{4})[[:punct:]]*$",
+                                        nationalityIndex = 1, birthYearIndex = 2)
+# Nationality, est. YYYY-YYYY
+patternFuns[[13]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), (?:(?:established|est\\.)|founded) ([[:digit:]]{4})[[:punct:]]([[:digit:]]{4})$",
+                                         nationalityIndex = 1, birthYearIndex = 2, deathYearIndex = 3, isEntityDefault = TRUE)
+
+
+# Prepare to add the new biographical columns
+commonCols <- c("nationality", "birth_nation", 
+                "birth_year", "death_year", "is_entity")
+for (c in commonCols) {
+  artists[[c]] <- NA
+}
+
+# Apply the pattern-matching functions to the artist_bios. This keeps track of
+# which rows have previously matched so that the patternFuns aren't each run on
+# all of the data.
+matchFalses <- rep(FALSE, length(artists$artist_bio))
+matchedAlready <- matchFalses
+for (patternFun in patternFuns) {
+  # Apply patternFun to extract the biographical information for bios that
+  # haven't been matched already.
+  bioData <- patternFun(artists$artist_bio[!matchedAlready])
+  
+  # Where are new matches?
+  matchedHere <- matchFalses
+  matchedHere[!matchedAlready] <- bioData$is_match
+  
+  # Fill in the data for the new matches.
+  artists[matchedHere, commonCols] <- bioData[bioData$is_match, commonCols]
   
-  # Alright. All this CSV stuff is getting out of hand. Time for a RDB. :/
-  momaDB <- dbConnect(RSQLite::SQLite(), "momaDB.sqlite")
-  dbWriteTable(momaDB, "artists", artists)
-  dbDisconnect(momaDB)
+  # Update the list of matched rows.
+  matchedAlready <- matchedAlready | matchedHere
 }
+
+# Clean up nationality and birth_nation
+artists <- artists %>% 
+  mutate(nationality = ifelse(grepl("unknown", nationality, ignore.case = TRUE),
+                              NA, nationality),
+         nationality = sub(".*\\(now ([[:alpha:]][[:alpha:][:punct:][:space:]]*)\\).*", 
+                           "\\1", nationality),
+         nationality = sub(" \\((?:then|former) [[:alpha:]][[:alpha:][:punct:][:space:]]*\\)",
+                           "", nationality),
+         nationality = sub(", n\\.[[:space:]]?d\\.$", "", nationality),
+         nationality = ifelse(nationality == "USA", "American", nationality),
+         nationality = ifelse(nationality == "UK", "British", nationality),
+         nationality = sub(", active.*", "", nationality),
+         birth_nation = ifelse(grepl("^ca?\\.?$", birth_nation), NA, birth_nation),
+         birth_nation = sub(".*\\(now ([[:alpha:]][[:alpha:][:punct:][:space:]]*)\\).*", 
+                            "\\1", birth_nation),
+         birth_nation = sub("[^[:alpha:]]*$", "", birth_nation),
+         birth_nation = ifelse(grepl("U\\.?S\\.?A\\.?", birth_nation), 
+                               "United States", birth_nation),
+         birth_nation = sub(" \\(.*", "", birth_nation),
+         birth_nation = sub("^ ?in ", "", birth_nation),
+         birth_nation = sub(", born.*", "", birth_nation))
+
+# # Add country codes to each artist ----------------------------------------
+# 
+# nationalitiesToCodes <- read.csv("countries/nationalities_codes.csv", stringsAsFactors = FALSE)
+# artists <- artists %>%
+#   # Add the country code if there is one
+#   left_join(nationalitiesToCodes, by = c("birth_nationality" = "nationality"))
+# 
+# # Find unique first name / country code pairs for genderizing
+# artist.firstNames <- artists %>%
+#   select(first_name, iso3166) %>%
+#   distinct() %>%
+#   arrange(iso3166, first_name) %>%
+#   # NAs won't work if we go to CSV and read them back in!
+#   mutate(iso3166 = ifelse(is.na(iso3166), "none", iso3166))
+# 
+# if (!file.exists("genderize/names_to_genderize.csv")) {
+#   write.csv(artist.firstNames, "genderize/names_to_genderize.csv", 
+#             row.names = FALSE)
+# } else {
+#   nameGenders <- read.csv("genderize/names_with_genders.csv", 
+#                           stringsAsFactors = FALSE) %>%
+#     select(name, gender, country_id)
+#   
+#   # Add genders to the artists
+#   artists <- artists %>%
+#     # First, pretend we have no country info and get the genders that way
+#     mutate(no_iso3166 = "none") %>%
+#     left_join(nameGenders, by = c("first_name" = "name", "no_iso3166" = "country_id")) %>%
+#     rename(no_country_gender = gender) %>%
+#     # Now get the genders using the country info
+#     left_join(nameGenders, by = c("first_name" = "name", "iso3166" = "country_id")) %>%
+#     # Fill in missing gender info using the no-country gender info
+#     mutate(gender = ifelse(is.na(gender), no_country_gender, gender)) %>%
+#     # Drop the dummy columns
+#     select(-no_iso3166, -no_country_gender)
+#   
+#   # Alright. All this CSV stuff is getting out of hand. Time for a RDB. :/
+#   momaDB <- dbConnect(RSQLite::SQLite(), "momaDB.sqlite")
+#   dbWriteTable(momaDB, "artists", artists)
+#   dbDisconnect(momaDB)
+# }

	moma-collection The Museum of Modern Art (MoMA) collection data.
	git clone https://git.eamoncaddigan.net/moma-collection.git
	Log \| Files \| Refs \| README \| LICENSE