artistInfo.R (11690B)
1 # This code reads in MoMA's Artworks.csv and creates a SQLite DB with two tables: 2 # * artworks: information about the pieces in MoMA's collection 3 # * artists: information about the people who made them 4 5 library(dplyr) 6 library(purrr) 7 library(tidyr) 8 library(RSQLite) 9 10 # Read in the info -------------------------------------------------------- 11 12 momaData <- read.csv("Artworks.csv", stringsAsFactors = FALSE) 13 # Convert the venerable CamelCase column names to lowely snake_case. 14 colnames(momaData) <- colnames(momaData) %>% 15 # Sorry museum branding guidelines 16 gsub("MoMA", "moma", .) %>% 17 gsub("(([a-z])([A-Z]))", "\\2_\\3", .) %>% 18 tolower() 19 20 # Create a linking table to handle the artist-object reationship ---------- 21 22 artistsObjects <- momaData %>% 23 select(object_id, artist_bio, artist) %>% 24 # Split the artist and artist_bio columns into lists. 25 mutate(artist_bios_list = strsplit(artist_bio, "(?<=\\))[[:blank:]]*(?=\\()", 26 perl = TRUE), 27 num_bios = map_int(artist_bios_list, length), 28 artist_list = strsplit(artist, ",[[:blank:]]*"), 29 num_artists = map_int(artist_list, length)) %>% 30 # XXX: This currently removes 5952 records. 31 filter(num_artists > 0, 32 num_bios == num_artists) 33 34 # Create duplicate rows for each object with multiple artists. 35 artistsObjects <- artistsObjects[rep(seq(nrow(artistsObjects)), 36 artistsObjects$num_artists), ] 37 38 # Select one artist/artist_bio entry per row, and then create a unique artist_id 39 # for each unique artist/artist_bio entry. This also gets rid of the parens 40 # around the bio. 41 artistsObjects <- artistsObjects %>% 42 group_by(object_id) %>% 43 mutate(artist_index = row_number(num_artists)) %>% 44 ungroup() %>% 45 mutate(artist = map2_chr(artist_list, artist_index, ~ .x[.y]), 46 artist_bio = sub("^\\((.*)\\)$", "\\1", 47 map2_chr(artist_bios_list, artist_index, ~ .x[.y])), 48 artist_id = min_rank(paste(artist, artist_bio))) 49 50 51 # Normalize the DB into artist/object/and linking tables ------------------ 52 53 # Create the artists table 54 artists <- artistsObjects %>% 55 select(artist_id, artist, artist_bio) %>% 56 distinct() 57 58 # Drop all of the redundant info from the linking table. 59 artistsObjects <- artistsObjects %>% 60 select(object_id, artist_id) 61 62 # Drop all of the redundant info from the objects table. 63 artworks <- momaData %>% 64 select(-artist, -artist_bio) 65 66 67 # Extract birth nation and nationality from the artist bio ---------------- 68 69 # Not 100% happy with my approach here, but it uses a lot of functional 70 # programming so there's that. 71 72 # This function takes a pattern and other options, and returns a function that 73 # will apply that pattern to a string and use those other options to extract 74 # relevant info. 75 makePatternMatcher <- function(pattern, 76 nationalityIndex = NA, 77 birthNationIndex = NA, 78 birthYearIndex = NA, 79 deathYearIndex = NA, 80 isEntityDefault = FALSE) { 81 # THESE functions return a data.frame containing the extracted artist_bio 82 # information 83 patternMatcher <- function(artistBio) { 84 m <- regexec(pattern, artistBio) 85 matches <- regmatches(artistBio, m) 86 87 isMatch <- map_lgl(matches, ~length(.x) > 0) 88 nationality <- map_chr(matches, ~ifelse(length(.x) < nationalityIndex+1, NA, 89 .x[nationalityIndex+1])) 90 birthNation <- map_chr(matches, ~ifelse(length(.x) < birthNationIndex+1, NA, 91 .x[birthNationIndex+1])) 92 birthYear <- map_int(matches, ~ifelse(length(.x) < birthYearIndex+1, NA, 93 as.integer(.x[birthYearIndex+1]))) 94 deathYear <- map_int(matches, ~ifelse(length(.x) < deathYearIndex+1, NA, 95 as.integer(.x[deathYearIndex+1]))) 96 isEntity <- isEntityDefault 97 98 return(data_frame(is_match = isMatch, 99 nationality, 100 birth_nation = birthNation, 101 birth_year = birthYear, 102 death_year = deathYear, 103 is_entity = isEntity)) 104 } 105 106 return(patternMatcher) 107 } 108 109 patternFuns <- list() 110 # Nationality, born Country YYYY 111 # Nationality, born Country (now Other) YYYY 112 patternFuns[[1]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+) ([[:digit:]]{4})$", 113 nationalityIndex = 1, birthNationIndex = 2, birthYearIndex = 3) 114 # Nationality, born Country. YYYY-YYYY 115 patternFuns[[2]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+)([[:digit:]]{4})[[:punct:]]([[:digit:]]{4})$", 116 nationalityIndex = 1, birthNationIndex = 2, birthYearIndex = 3, deathYearIndex = 4) 117 # Nationality, born YYYY 118 patternFuns[[3]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), b(?:\\.|orn) ([[:digit:]]{4})$", 119 nationalityIndex = 1, birthYearIndex = 2) 120 # Nationality, YYYY-YYYY 121 patternFuns[[4]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*),[[:space:]]*([[:digit:]]{4})[[:punct:]]+([[:digit:]]{4})$", 122 nationalityIndex = 1, birthYearIndex = 2, deathYearIndex = 3) 123 # Nationality, born Country 124 patternFuns[[5]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), born ([[:alpha:][:punct:][:space:]]+)$", 125 nationalityIndex = 1, birthNationIndex = 2) 126 # Nationality 127 patternFuns[[6]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*)$", 128 nationalityIndex = 1) 129 # Nationality, (established|est.|founded) YYYY 130 patternFuns[[7]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), (?:(?:established|est\\.)|founded) ([[:digit:]]{4})$", 131 nationalityIndex = 1, birthYearIndex = 2, isEntityDefault = TRUE) 132 # est. YYYY 133 patternFuns[[8]] <- makePatternMatcher("^est(?:\\.|ablished) ([[:digit:]]{4})$", 134 birthYearIndex = 1, isEntityDefault = TRUE) 135 # Nationality, died YYYY 136 patternFuns[[9]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), died ([[:digit:]]{4})$", 137 nationalityIndex = 1, deathYearIndex = 2) 138 # born YYYY 139 patternFuns[[10]] <- makePatternMatcher("^born ([[:digit:]]{4})$", 140 birthYearIndex = 1) 141 # YYYY-YYYY 142 patternFuns[[11]] <- makePatternMatcher("^([[:digit:]]{4})[[:punct:]]+([[:digit:]]{4})$", 143 birthYearIndex = 1, deathYearIndex = 2) 144 # Nationality, YYYY-? 145 patternFuns[[12]] <- makePatternMatcher("^([[:alpha:]]+), ([[:digit:]]{4})[[:punct:]]*$", 146 nationalityIndex = 1, birthYearIndex = 2) 147 # Nationality, est. YYYY-YYYY 148 patternFuns[[13]] <- makePatternMatcher("^([[:alpha:]]+[[:blank:][:alpha:][:punct:]]*), (?:(?:established|est\\.)|founded) ([[:digit:]]{4})[[:punct:]]([[:digit:]]{4})$", 149 nationalityIndex = 1, birthYearIndex = 2, deathYearIndex = 3, isEntityDefault = TRUE) 150 151 152 # Prepare to add the new biographical columns 153 commonCols <- c("nationality", "birth_nation", 154 "birth_year", "death_year", "is_entity") 155 for (c in commonCols) { 156 artists[[c]] <- NA 157 } 158 159 # Apply the pattern-matching functions to the artist_bios. This keeps track of 160 # which rows have previously matched so that the patternFuns aren't each run on 161 # all of the data. 162 matchFalses <- rep(FALSE, length(artists$artist_bio)) 163 matchedAlready <- matchFalses 164 for (patternFun in patternFuns) { 165 # Apply patternFun to extract the biographical information for bios that 166 # haven't been matched already. 167 bioData <- patternFun(artists$artist_bio[!matchedAlready]) 168 169 # Where are new matches? 170 matchedHere <- matchFalses 171 matchedHere[!matchedAlready] <- bioData$is_match 172 173 # Fill in the data for the new matches. 174 artists[matchedHere, commonCols] <- bioData[bioData$is_match, commonCols] 175 176 # Update the list of matched rows. 177 matchedAlready <- matchedAlready | matchedHere 178 } 179 180 # Clean up nationality and birth_nation 181 artists <- artists %>% 182 mutate(nationality = ifelse(grepl("unknown", nationality, ignore.case = TRUE), 183 NA, nationality), 184 nationality = sub(".*\\(now ([[:alpha:]][[:alpha:][:punct:][:space:]]*)\\).*", 185 "\\1", nationality), 186 nationality = sub(" \\((?:then|former) [[:alpha:]][[:alpha:][:punct:][:space:]]*\\)", 187 "", nationality), 188 nationality = sub(", n\\.[[:space:]]?d\\.$", "", nationality), 189 nationality = ifelse(nationality == "USA", "American", nationality), 190 nationality = ifelse(nationality == "UK", "British", nationality), 191 nationality = sub(", active.*", "", nationality), 192 birth_nation = ifelse(grepl("^ca?\\.?$", birth_nation), NA, birth_nation), 193 birth_nation = sub(".*\\(now ([[:alpha:]][[:alpha:][:punct:][:space:]]*)\\).*", 194 "\\1", birth_nation), 195 birth_nation = sub("[^[:alpha:]]*$", "", birth_nation), 196 birth_nation = ifelse(grepl("U\\.?S\\.?A\\.?", birth_nation), 197 "United States", birth_nation), 198 birth_nation = sub(" \\(.*", "", birth_nation), 199 birth_nation = sub("^ ?in ", "", birth_nation), 200 birth_nation = sub(", born.*", "", birth_nation)) 201 202 # # Add country codes to each artist ---------------------------------------- 203 # 204 # nationalitiesToCodes <- read.csv("countries/nationalities_codes.csv", stringsAsFactors = FALSE) 205 # artists <- artists %>% 206 # # Add the country code if there is one 207 # left_join(nationalitiesToCodes, by = c("birth_nationality" = "nationality")) 208 # 209 # # Find unique first name / country code pairs for genderizing 210 # artist.firstNames <- artists %>% 211 # select(first_name, iso3166) %>% 212 # distinct() %>% 213 # arrange(iso3166, first_name) %>% 214 # # NAs won't work if we go to CSV and read them back in! 215 # mutate(iso3166 = ifelse(is.na(iso3166), "none", iso3166)) 216 # 217 # if (!file.exists("genderize/names_to_genderize.csv")) { 218 # write.csv(artist.firstNames, "genderize/names_to_genderize.csv", 219 # row.names = FALSE) 220 # } else { 221 # nameGenders <- read.csv("genderize/names_with_genders.csv", 222 # stringsAsFactors = FALSE) %>% 223 # select(name, gender, country_id) 224 # 225 # # Add genders to the artists 226 # artists <- artists %>% 227 # # First, pretend we have no country info and get the genders that way 228 # mutate(no_iso3166 = "none") %>% 229 # left_join(nameGenders, by = c("first_name" = "name", "no_iso3166" = "country_id")) %>% 230 # rename(no_country_gender = gender) %>% 231 # # Now get the genders using the country info 232 # left_join(nameGenders, by = c("first_name" = "name", "iso3166" = "country_id")) %>% 233 # # Fill in missing gender info using the no-country gender info 234 # mutate(gender = ifelse(is.na(gender), no_country_gender, gender)) %>% 235 # # Drop the dummy columns 236 # select(-no_iso3166, -no_country_gender) 237 # 238 # # Alright. All this CSV stuff is getting out of hand. Time for a RDB. :/ 239 # momaDB <- dbConnect(RSQLite::SQLite(), "momaDB.sqlite") 240 # dbWriteTable(momaDB, "artists", artists) 241 # dbDisconnect(momaDB) 242 # }