Artist info (including nationality and estimated gender) is collected and written in a SQLite DB. I've also thrown the artworks table in there. Movin' up. - moma-collection - The Museum of Modern Art (MoMA) collection data.

commit 9eae7d0c57683d95d0e932be0b3ae095214d6c4b
parent fe35210acef56f66103efb4424d13f34b05b3de1
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date:   Thu, 30 Jul 2015 15:35:46 -0400

Artist info (including nationality and estimated gender) is collected and written in a SQLite DB. I've also thrown the artworks table in there. Movin' up.

Diffstat:
M artistInfo.R  | 45 +++++++++++++++++++++++++++++++++++++--------
A momaDB.sqlite  | 0

2 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/artistInfo.R b/artistInfo.R
@@ -3,6 +3,7 @@
 require(dplyr)
 require(tidyr)
 require(ggplot2)
+require(RSQLite)
 
 # Read the artist information out of the collection table -----------------
 
@@ -16,15 +17,17 @@ artists <- read.csv("Artworks.csv", stringsAsFactors = FALSE) %>%
          !grepl("unknown", Artist, ignore.case = TRUE)) %>%
   # Try to remove anything with multiple artists
   filter(!grepl(" and ", Artist), 
-         !grepl(",", Artist))
+         !grepl(",", Artist)) %>%
+  # Not sure why dups are making it through. Unicode?
+  distinct()
 
 
 # Extract birth nation and nationality from the artist bio ----------------
 
 artists <- artists %>%   
-  mutate(ArtistBio = sub("U\\.S\\.A\\.", "United States", ArtistBio),
-         birth_nation = ifelse(grepl("born [[:alpha:]]+", ArtistBio),
-                               sub(".*born ([[:alpha:][:space:]]*).*", "\\1", ArtistBio),
+  mutate(birth_nation = sub("U\\.S\\.A\\.", "United States", ArtistBio),
+         birth_nation = ifelse(grepl("born [[:alpha:]]+", birth_nation),
+                               sub(".*born ([[:alpha:][:space:]]*).*", "\\1", birth_nation),
                                NA),
          birth_nation = ifelse(birth_nation %in% c("c", "ca"), NA, birth_nation),
          birth_nation = sub("\\W*$", "", birth_nation),
@@ -32,7 +35,7 @@ artists <- artists %>%
          nationality = sub("^[^[:alpha:]]([[:alpha:][:space:]]*).*", "\\1", ArtistBio),
          nationality = sub("\\W*$", "", nationality),
          nationality = sub(" and .*", "", nationality),
-         nationality = ifelse(!is.na(birth_nation), birth_nation, nationality),
+         birth_nationality = ifelse(!is.na(birth_nation), birth_nation, nationality),
          first_name = sub("([[:alpha:]]*).*", "\\1", Artist))
 
 
@@ -41,7 +44,7 @@ artists <- artists %>%
 nationalitiesToCodes <- read.csv("countries/nationalities_codes.csv", stringsAsFactors = FALSE)
 artists <- artists %>%
   # Add the country code if there is one
-  left_join(nationalitiesToCodes, by = "nationality")
+  left_join(nationalitiesToCodes, by = c("birth_nationality" = "nationality"))
 
 # Find unique first name / country code pairs for genderizing
 artist.firstNames <- artists %>%
@@ -50,5 +53,31 @@ artist.firstNames <- artists %>%
   arrange(iso3166, first_name) %>%
   # NAs won't work if we go to CSV and read them back in!
   mutate(iso3166 = ifelse(is.na(iso3166), "none", iso3166))
-print(nrow(artist.firstNames))
-write.csv(artist.firstNames, "names_to_genderize.csv", row.names = FALSE)
+
+if (!file.exists("genderize/names_to_genderize.csv")) {
+  write.csv(artist.firstNames, "genderize/names_to_genderize.csv", 
+            row.names = FALSE)
+} else {
+  nameGenders <- read.csv("genderize/names_with_genders.csv", 
+                          stringsAsFactors = FALSE) %>%
+    select(name, gender, country_id)
+  
+  # Add genders to the artists
+  artists <- artists %>%
+    # First, pretend we have no country info and get the genders that way
+    mutate(no_iso3166 = "none") %>%
+    left_join(nameGenders, by = c("first_name" = "name", "no_iso3166" = "country_id")) %>%
+    rename(no_country_gender = gender) %>%
+    # Now get the genders using the country info
+    left_join(nameGenders, by = c("first_name" = "name", "iso3166" = "country_id")) %>%
+    # Fill in missing gender info using the no-country gender info
+    mutate(gender = ifelse(is.na(gender), no_country_gender, gender)) %>%
+    # Drop the dummy columns
+    select(-no_iso3166, -no_country_gender)
+  
+  # Alright. All this CSV stuff is getting out of hand. Time for an RDB. :/
+  momaDB <- dbConnect(RSQLite::SQLite(), "momaDB.sqlite")
+  dbWriteTable(momaDB, "artists", artists)
+  dbDisconnect(momaDB)
+  unlink("momaDB.sqlite")
+}
diff --git a/momaDB.sqlite b/momaDB.sqlite
Binary files differ.

	moma-collection The Museum of Modern Art (MoMA) collection data.
	git clone https://git.eamoncaddigan.net/moma-collection.git
	Log | Files | Refs | README | LICENSE

M artistInfo.R  | 45 +++++++++++++++++++++++++++++++++++++--------
A momaDB.sqlite  | 0