moma-collection

The Museum of Modern Art (MoMA) collection data.
git clone https://git.eamoncaddigan.net/moma-collection.git
Log | Files | Refs | README | LICENSE

commit 9eae7d0c57683d95d0e932be0b3ae095214d6c4b
parent fe35210acef56f66103efb4424d13f34b05b3de1
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date:   Thu, 30 Jul 2015 15:35:46 -0400

Artist info (including nationality and estimated gender) is collected and written in a SQLite DB. I've also thrown the artworks table in there. Movin' up.

Diffstat:
M artistInfo.R | 45 +++++++++++++++++++++++++++++++++++++--------
A momaDB.sqlite | 0
2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/artistInfo.R b/artistInfo.R
@@ -3,6 +3,7 @@
 require(dplyr)
 require(tidyr)
 require(ggplot2)
+require(RSQLite)
 
 
 # Read the artist information out of the collection table -----------------
@@ -16,15 +17,17 @@ artists <- read.csv("Artworks.csv", stringsAsFactors = FALSE) %>%
          !grepl("unknown", Artist, ignore.case = TRUE)) %>%
   # Try to remove anything with multiple artists
   filter(!grepl(" and ", Artist),
-         !grepl(",", Artist))
+         !grepl(",", Artist)) %>%
+  # Not sure why dups are making it through. Unicode?
+  distinct()
 
 
 # Extract birth nation and nationality from the artist bio ----------------
 
 artists <- artists %>%
-  mutate(ArtistBio = sub("U\\.S\\.A\\.", "United States", ArtistBio),
-         birth_nation = ifelse(grepl("born [[:alpha:]]+", ArtistBio),
-                               sub(".*born ([[:alpha:][:space:]]*).*", "\\1", ArtistBio),
+  mutate(birth_nation = sub("U\\.S\\.A\\.", "United States", ArtistBio),
+         birth_nation = ifelse(grepl("born [[:alpha:]]+", birth_nation),
+                               sub(".*born ([[:alpha:][:space:]]*).*", "\\1", birth_nation),
                                NA),
          birth_nation = ifelse(birth_nation %in% c("c", "ca"), NA, birth_nation),
          birth_nation = sub("\\W*$", "", birth_nation),
@@ -32,7 +35,7 @@ artists <- artists %>%
          nationality = sub("^[^[:alpha:]]([[:alpha:][:space:]]*).*", "\\1", ArtistBio),
          nationality = sub("\\W*$", "", nationality),
          nationality = sub(" and .*", "", nationality),
-         nationality = ifelse(!is.na(birth_nation), birth_nation, nationality),
+         birth_nationality = ifelse(!is.na(birth_nation), birth_nation, nationality),
          first_name = sub("([[:alpha:]]*).*", "\\1", Artist))
 
 
@@ -41,7 +44,7 @@ artists <- artists %>%
 nationalitiesToCodes <- read.csv("countries/nationalities_codes.csv", stringsAsFactors = FALSE)
 artists <- artists %>%
   # Add the country code if there is one
-  left_join(nationalitiesToCodes, by = "nationality")
+  left_join(nationalitiesToCodes, by = c("birth_nationality" = "nationality"))
 
 # Find unique first name / country code pairs for genderizing
 artist.firstNames <- artists %>%
@@ -50,5 +53,31 @@ artist.firstNames <- artists %>%
   arrange(iso3166, first_name) %>%
   # NAs won't work if we go to CSV and read them back in!
   mutate(iso3166 = ifelse(is.na(iso3166), "none", iso3166))
-print(nrow(artist.firstNames))
-write.csv(artist.firstNames, "names_to_genderize.csv", row.names = FALSE)
+
+if (!file.exists("genderize/names_to_genderize.csv")) {
+  write.csv(artist.firstNames, "genderize/names_to_genderize.csv",
+            row.names = FALSE)
+} else {
+  nameGenders <- read.csv("genderize/names_with_genders.csv",
+                          stringsAsFactors = FALSE) %>%
+    select(name, gender, country_id)
+
+  # Add genders to the artists
+  artists <- artists %>%
+    # First, pretend we have no country info and get the genders that way
+    mutate(no_iso3166 = "none") %>%
+    left_join(nameGenders, by = c("first_name" = "name", "no_iso3166" = "country_id")) %>%
+    rename(no_country_gender = gender) %>%
+    # Now get the genders using the country info
+    left_join(nameGenders, by = c("first_name" = "name", "iso3166" = "country_id")) %>%
+    # Fill in missing gender info using the no-country gender info
+    mutate(gender = ifelse(is.na(gender), no_country_gender, gender)) %>%
+    # Drop the dummy columns
+    select(-no_iso3166, -no_country_gender)
+
+  # Alright. All this CSV stuff is getting out of hand. Time for a RDB. :/
+  momaDB <- dbConnect(RSQLite::SQLite(), "momaDB.sqlite")
+  dbWriteTable(momaDB, "artists", artists)
+  dbDisconnect(momaDB)
+  unlink("momaDB.sqlite")
+}
diff --git a/momaDB.sqlite b/momaDB.sqlite
Binary files differ.