moma-collection

The Museum of Modern Art (MoMA) collection data.
git clone https://git.eamoncaddigan.net/moma-collection.git
Log | Files | Refs | README | LICENSE

artistInfo.R (11690B)


      1 # This code reads in MoMA's Artworks.csv and creates a SQLite DB with two tables:
      2 # * artworks: information about the pieces in MoMA's collection
      3 # * artists: information about the people who made them
      4 
      5 library(dplyr)
      6 library(purrr)
      7 library(tidyr)
      8 library(RSQLite)
      9 
     10 # Read in the info --------------------------------------------------------
     11 
     12 momaData <- read.csv("Artworks.csv", stringsAsFactors = FALSE)
     13 # Convert the venerable CamelCase column names to lowely snake_case. 
     14 colnames(momaData) <- colnames(momaData) %>% 
     15   # Sorry museum branding guidelines
     16   gsub("MoMA", "moma", .) %>% 
     17   gsub("(([a-z])([A-Z]))", "\\2_\\3", .) %>% 
     18   tolower()
     19 
     20 # Create a linking table to handle the artist-object reationship ----------
     21 
     22 artistsObjects <- momaData %>%
     23   select(object_id, artist_bio, artist) %>%
     24   # Split the artist and artist_bio columns into lists.
     25   mutate(artist_bios_list = strsplit(artist_bio, "(?<=\\))[[:blank:]]*(?=\\()", 
     26                                      perl = TRUE),
     27          num_bios = map_int(artist_bios_list, length),
     28          artist_list = strsplit(artist, ",[[:blank:]]*"),
     29          num_artists = map_int(artist_list, length)) %>%
     30   # XXX: This currently removes 5952 records.
     31   filter(num_artists > 0,
     32          num_bios == num_artists)
     33 
     34 # Create duplicate rows for each object with multiple artists.
     35 artistsObjects <- artistsObjects[rep(seq(nrow(artistsObjects)), 
     36                                      artistsObjects$num_artists), ]
     37 
     38 # Select one artist/artist_bio entry per row, and then create a unique artist_id
     39 # for each unique artist/artist_bio entry. This also gets rid of the parens 
     40 # around the bio.
     41 artistsObjects <- artistsObjects %>% 
     42   group_by(object_id) %>%
     43   mutate(artist_index = row_number(num_artists)) %>%
     44   ungroup() %>%
     45   mutate(artist = map2_chr(artist_list, artist_index, ~ .x[.y]),
     46          artist_bio = sub("^\\((.*)\\)$", "\\1",
     47                           map2_chr(artist_bios_list, artist_index, ~ .x[.y])),
     48          artist_id = min_rank(paste(artist, artist_bio)))
     49 
     50 
     51 # Normalize the DB into artist/object/and linking tables ------------------
     52 
     53 # Create the artists table
     54 artists <- artistsObjects %>%
     55   select(artist_id, artist, artist_bio) %>%
     56   distinct()
     57 
     58 # Drop all of the redundant info from the linking table.
     59 artistsObjects <- artistsObjects %>%
     60   select(object_id, artist_id)
     61 
     62 # Drop all of the redundant info from the objects table.
     63 artworks <- momaData %>%
     64   select(-artist, -artist_bio)
     65 
     66 
     67 # Extract birth nation and nationality from the artist bio ----------------
     68 
     69 # Not 100% happy with my approach here, but it uses a lot of functional
     70 # programming so there's that.
     71 
     72 # This function takes a pattern and other options, and returns a function that
     73 # will apply that pattern to a string and use those other options to extract
     74 # relevant info.
     75 makePatternMatcher <- function(pattern, 
     76                                nationalityIndex = NA, 
     77                                birthNationIndex = NA, 
     78                                birthYearIndex = NA, 
     79                                deathYearIndex = NA,
     80                                isEntityDefault = FALSE) {
     81   # THESE functions return a data.frame containing the extracted artist_bio
     82   # information
     83   patternMatcher <- function(artistBio) {
     84     m <- regexec(pattern, artistBio)
     85     matches <- regmatches(artistBio, m)
     86     
     87     isMatch     <- map_lgl(matches, ~length(.x) > 0)
     88     nationality <- map_chr(matches, ~ifelse(length(.x) < nationalityIndex+1, NA, 
     89                                             .x[nationalityIndex+1]))
     90     birthNation <- map_chr(matches, ~ifelse(length(.x) < birthNationIndex+1, NA, 
     91                                             .x[birthNationIndex+1]))
     92     birthYear   <- map_int(matches, ~ifelse(length(.x) < birthYearIndex+1, NA, 
     93                                             as.integer(.x[birthYearIndex+1])))
     94     deathYear   <- map_int(matches, ~ifelse(length(.x) < deathYearIndex+1, NA, 
     95                                             as.integer(.x[deathYearIndex+1])))
     96     isEntity    <- isEntityDefault
     97     
     98     return(data_frame(is_match = isMatch, 
     99                       nationality, 
    100                       birth_nation = birthNation, 
    101                       birth_year = birthYear, 
    102                       death_year = deathYear, 
    103                       is_entity = isEntity))
    104   }
    105   
    106   return(patternMatcher)
    107 }
    108 
    109 patternFuns <- list()
    110 # Nationality, born Country YYYY
    111 # Nationality, born Country (now Other) YYYY
    112 patternFuns[[1]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+) ([[:digit:]]{4})$",
    113                                        nationalityIndex = 1, birthNationIndex = 2, birthYearIndex = 3)
    114 # Nationality, born Country. YYYY-YYYY
    115 patternFuns[[2]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+)([[:digit:]]{4})[[:punct:]]([[:digit:]]{4})$",
    116                                        nationalityIndex = 1, birthNationIndex = 2, birthYearIndex = 3, deathYearIndex = 4)
    117 # Nationality, born YYYY
    118 patternFuns[[3]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), b(?:\\.|orn) ([[:digit:]]{4})$",
    119                                        nationalityIndex = 1, birthYearIndex = 2)
    120 # Nationality, YYYY-YYYY
    121 patternFuns[[4]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*),[[:space:]]*([[:digit:]]{4})[[:punct:]]+([[:digit:]]{4})$",
    122                                        nationalityIndex = 1, birthYearIndex = 2, deathYearIndex = 3)
    123 # Nationality, born Country
    124 patternFuns[[5]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+)$",
    125                                        nationalityIndex = 1, birthNationIndex = 2)
    126 # Nationality
    127 patternFuns[[6]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*)$",
    128                                        nationalityIndex = 1)
    129 # Nationality, (established|est.|founded) YYYY
    130 patternFuns[[7]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), (?:(?:established|est\\.)|founded) ([[:digit:]]{4})$",
    131                                        nationalityIndex = 1, birthYearIndex = 2, isEntityDefault = TRUE)
    132 # est. YYYY
    133 patternFuns[[8]] <- makePatternMatcher("^est(?:\\.|ablished) ([[:digit:]]{4})$",
    134                                        birthYearIndex = 1, isEntityDefault = TRUE)
    135 # Nationality, died YYYY
    136 patternFuns[[9]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), died ([[:digit:]]{4})$",
    137                                        nationalityIndex = 1, deathYearIndex = 2)
    138 # born YYYY
    139 patternFuns[[10]] <- makePatternMatcher("^born ([[:digit:]]{4})$",
    140                                        birthYearIndex = 1)
    141 # YYYY-YYYY
    142 patternFuns[[11]] <- makePatternMatcher("^([[:digit:]]{4})[[:punct:]]+([[:digit:]]{4})$",
    143                                         birthYearIndex = 1, deathYearIndex = 2)
    144 # Nationality, YYYY-?
    145 patternFuns[[12]] <- makePatternMatcher("^([[:alpha:]]+), ([[:digit:]]{4})[[:punct:]]*$",
    146                                         nationalityIndex = 1, birthYearIndex = 2)
    147 # Nationality, est. YYYY-YYYY
    148 patternFuns[[13]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), (?:(?:established|est\\.)|founded) ([[:digit:]]{4})[[:punct:]]([[:digit:]]{4})$",
    149                                          nationalityIndex = 1, birthYearIndex = 2, deathYearIndex = 3, isEntityDefault = TRUE)
    150 
    151 
    152 # Prepare to add the new biographical columns
    153 commonCols <- c("nationality", "birth_nation", 
    154                 "birth_year", "death_year", "is_entity")
    155 for (c in commonCols) {
    156   artists[[c]] <- NA
    157 }
    158 
    159 # Apply the pattern-matching functions to the artist_bios. This keeps track of
    160 # which rows have previously matched so that the patternFuns aren't each run on
    161 # all of the data.
    162 matchFalses <- rep(FALSE, length(artists$artist_bio))
    163 matchedAlready <- matchFalses
    164 for (patternFun in patternFuns) {
    165   # Apply patternFun to extract the biographical information for bios that
    166   # haven't been matched already.
    167   bioData <- patternFun(artists$artist_bio[!matchedAlready])
    168   
    169   # Where are new matches?
    170   matchedHere <- matchFalses
    171   matchedHere[!matchedAlready] <- bioData$is_match
    172   
    173   # Fill in the data for the new matches.
    174   artists[matchedHere, commonCols] <- bioData[bioData$is_match, commonCols]
    175   
    176   # Update the list of matched rows.
    177   matchedAlready <- matchedAlready | matchedHere
    178 }
    179 
    180 # Clean up nationality and birth_nation
    181 artists <- artists %>% 
    182   mutate(nationality = ifelse(grepl("unknown", nationality, ignore.case = TRUE),
    183                               NA, nationality),
    184          nationality = sub(".*\\(now ([[:alpha:]][[:alpha:][:punct:][:space:]]*)\\).*", 
    185                            "\\1", nationality),
    186          nationality = sub(" \\((?:then|former) [[:alpha:]][[:alpha:][:punct:][:space:]]*\\)",
    187                            "", nationality),
    188          nationality = sub(", n\\.[[:space:]]?d\\.$", "", nationality),
    189          nationality = ifelse(nationality == "USA", "American", nationality),
    190          nationality = ifelse(nationality == "UK", "British", nationality),
    191          nationality = sub(", active.*", "", nationality),
    192          birth_nation = ifelse(grepl("^ca?\\.?$", birth_nation), NA, birth_nation),
    193          birth_nation = sub(".*\\(now ([[:alpha:]][[:alpha:][:punct:][:space:]]*)\\).*", 
    194                             "\\1", birth_nation),
    195          birth_nation = sub("[^[:alpha:]]*$", "", birth_nation),
    196          birth_nation = ifelse(grepl("U\\.?S\\.?A\\.?", birth_nation), 
    197                                "United States", birth_nation),
    198          birth_nation = sub(" \\(.*", "", birth_nation),
    199          birth_nation = sub("^ ?in ", "", birth_nation),
    200          birth_nation = sub(", born.*", "", birth_nation))
    201 
    202 # # Add country codes to each artist ----------------------------------------
    203 # 
    204 # nationalitiesToCodes <- read.csv("countries/nationalities_codes.csv", stringsAsFactors = FALSE)
    205 # artists <- artists %>%
    206 #   # Add the country code if there is one
    207 #   left_join(nationalitiesToCodes, by = c("birth_nationality" = "nationality"))
    208 # 
    209 # # Find unique first name / country code pairs for genderizing
    210 # artist.firstNames <- artists %>%
    211 #   select(first_name, iso3166) %>%
    212 #   distinct() %>%
    213 #   arrange(iso3166, first_name) %>%
    214 #   # NAs won't work if we go to CSV and read them back in!
    215 #   mutate(iso3166 = ifelse(is.na(iso3166), "none", iso3166))
    216 # 
    217 # if (!file.exists("genderize/names_to_genderize.csv")) {
    218 #   write.csv(artist.firstNames, "genderize/names_to_genderize.csv", 
    219 #             row.names = FALSE)
    220 # } else {
    221 #   nameGenders <- read.csv("genderize/names_with_genders.csv", 
    222 #                           stringsAsFactors = FALSE) %>%
    223 #     select(name, gender, country_id)
    224 #   
    225 #   # Add genders to the artists
    226 #   artists <- artists %>%
    227 #     # First, pretend we have no country info and get the genders that way
    228 #     mutate(no_iso3166 = "none") %>%
    229 #     left_join(nameGenders, by = c("first_name" = "name", "no_iso3166" = "country_id")) %>%
    230 #     rename(no_country_gender = gender) %>%
    231 #     # Now get the genders using the country info
    232 #     left_join(nameGenders, by = c("first_name" = "name", "iso3166" = "country_id")) %>%
    233 #     # Fill in missing gender info using the no-country gender info
    234 #     mutate(gender = ifelse(is.na(gender), no_country_gender, gender)) %>%
    235 #     # Drop the dummy columns
    236 #     select(-no_iso3166, -no_country_gender)
    237 #   
    238 #   # Alright. All this CSV stuff is getting out of hand. Time for a RDB. :/
    239 #   momaDB <- dbConnect(RSQLite::SQLite(), "momaDB.sqlite")
    240 #   dbWriteTable(momaDB, "artists", artists)
    241 #   dbDisconnect(momaDB)
    242 # }