commit 9eae7d0c57683d95d0e932be0b3ae095214d6c4b
parent fe35210acef56f66103efb4424d13f34b05b3de1
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date: Thu, 30 Jul 2015 15:35:46 -0400
Artist info (including nationality and estimated gender) is collected and written in a SQLite DB. I've also thrown the artworks table in there. Movin' up.
Diffstat:
2 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/artistInfo.R b/artistInfo.R
@@ -3,6 +3,7 @@
require(dplyr)
require(tidyr)
require(ggplot2)
+require(RSQLite)
# Read the artist information out of the collection table -----------------
@@ -16,15 +17,17 @@ artists <- read.csv("Artworks.csv", stringsAsFactors = FALSE) %>%
!grepl("unknown", Artist, ignore.case = TRUE)) %>%
# Try to remove anything with multiple artists
filter(!grepl(" and ", Artist),
- !grepl(",", Artist))
+ !grepl(",", Artist)) %>%
+ # Not sure why dups are making it through. Unicode?
+ distinct()
# Extract birth nation and nationality from the artist bio ----------------
artists <- artists %>%
- mutate(ArtistBio = sub("U\\.S\\.A\\.", "United States", ArtistBio),
- birth_nation = ifelse(grepl("born [[:alpha:]]+", ArtistBio),
- sub(".*born ([[:alpha:][:space:]]*).*", "\\1", ArtistBio),
+ mutate(birth_nation = sub("U\\.S\\.A\\.", "United States", ArtistBio),
+ birth_nation = ifelse(grepl("born [[:alpha:]]+", birth_nation),
+ sub(".*born ([[:alpha:][:space:]]*).*", "\\1", birth_nation),
NA),
birth_nation = ifelse(birth_nation %in% c("c", "ca"), NA, birth_nation),
birth_nation = sub("\\W*$", "", birth_nation),
@@ -32,7 +35,7 @@ artists <- artists %>%
nationality = sub("^[^[:alpha:]]([[:alpha:][:space:]]*).*", "\\1", ArtistBio),
nationality = sub("\\W*$", "", nationality),
nationality = sub(" and .*", "", nationality),
- nationality = ifelse(!is.na(birth_nation), birth_nation, nationality),
+ birth_nationality = ifelse(!is.na(birth_nation), birth_nation, nationality),
first_name = sub("([[:alpha:]]*).*", "\\1", Artist))
@@ -41,7 +44,7 @@ artists <- artists %>%
nationalitiesToCodes <- read.csv("countries/nationalities_codes.csv", stringsAsFactors = FALSE)
artists <- artists %>%
# Add the country code if there is one
- left_join(nationalitiesToCodes, by = "nationality")
+ left_join(nationalitiesToCodes, by = c("birth_nationality" = "nationality"))
# Find unique first name / country code pairs for genderizing
artist.firstNames <- artists %>%
@@ -50,5 +53,31 @@ artist.firstNames <- artists %>%
arrange(iso3166, first_name) %>%
# NAs won't work if we go to CSV and read them back in!
mutate(iso3166 = ifelse(is.na(iso3166), "none", iso3166))
-print(nrow(artist.firstNames))
-write.csv(artist.firstNames, "names_to_genderize.csv", row.names = FALSE)
+
+# Either export the names to genderize, or merge genders in and save to SQLite
+if (!file.exists("genderize/names_to_genderize.csv")) {
+  write.csv(artist.firstNames, "genderize/names_to_genderize.csv",
+            row.names = FALSE)
+} else {
+  nameGenders <- read.csv("genderize/names_with_genders.csv",
+                          stringsAsFactors = FALSE) %>%
+    select(name, gender, country_id)
+
+  # Add genders to the artists
+  artists <- artists %>%
+    # First, pretend we have no country info and get the genders that way
+    mutate(no_iso3166 = "none") %>%
+    left_join(nameGenders, by = c("first_name" = "name", "no_iso3166" = "country_id")) %>%
+    rename(no_country_gender = gender) %>%
+    # Now get the genders using the country info
+    left_join(nameGenders, by = c("first_name" = "name", "iso3166" = "country_id")) %>%
+    # Fill in missing gender info using the no-country gender info
+    mutate(gender = ifelse(is.na(gender), no_country_gender, gender)) %>%
+    # Drop the dummy columns
+    select(-no_iso3166, -no_country_gender)
+  # Alright. All this CSV stuff is getting out of hand. Time for a RDB. :/
+  # Keep the DB file on disk; overwrite "artists" so re-runs do not fail.
+  momaDB <- dbConnect(RSQLite::SQLite(), "momaDB.sqlite")
+  dbWriteTable(momaDB, "artists", artists, overwrite = TRUE)
+  dbDisconnect(momaDB)
+}
diff --git a/momaDB.sqlite b/momaDB.sqlite
Binary files differ.