scrapeCountryInfo.R (3306B)
# Here's some code to pull in the tables I want for mapping countries to
# adjectivals and their ISO 3166-1 alpha-2 codes

# For now, the CSVs this writes still need a little manual touch-up. It's not
# much data, though.

require(rvest)
require(dplyr)
require(tidyr)   # for gather()

# Fetch a page and return the first table matched by the XPath as a data frame
getTableFromWeb <- function(url, xpath) {
  tableList <- url %>%
    read_html() %>%
    html_nodes(xpath=xpath) %>%
    html_table(fill=TRUE)
  return(tableList[[1]])
}


# Countries and their adjectival forms ------------------------------------

countriesToAdjectivals <- getTableFromWeb("https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations",
                                          "//*[@id=\"mw-content-text\"]/table[1]")

colnames(countriesToAdjectivals) <- sub(" ", "_",
                                        tolower(colnames(countriesToAdjectivals)))

countriesToAdjectivals <- countriesToAdjectivals[2:nrow(countriesToAdjectivals),] %>%
  select(country_name, adjectivals) %>%
  # Get rid of the wikipedia cruft
  mutate(across(everything(), ~ gsub("\\[.*\\]", "", .x))) %>%
  # For later splitting of adjectivals
  mutate(adjectivals = sub(" or ", ", ", adjectivals)) %>%
  # Rearrange the country name into its natural order
  mutate(natural_country_name = sub("([[:alpha:]]*), ([[:alpha:]].*)",
                                    "\\2 \\1",
                                    country_name))

# Split the comma-separated adjectivals into one column per adjectival
splitAdjectivals <- strsplit(countriesToAdjectivals[["adjectivals"]], ",[[:space:]]*")
for (i in seq_len(max(vapply(splitAdjectivals, length, 1)))) {
  countriesToAdjectivals[[paste("adjectival", i, sep="_")]] <- vapply(splitAdjectivals, function(x) { x[i] }, "")
}


# Countries and ISO 3166-1 alpha-2 codes ----------------------------------

countriesToCodes <- getTableFromWeb("https://en.wikipedia.org/wiki/ISO_3166-2",
                                    "//*[@id=\"mw-content-text\"]/table[1]")

colnames(countriesToCodes) <- c("iso3166", "country_name", "subdivisions")
countriesToCodes <- countriesToCodes %>%
  # Get rid of the wikipedia cruft
  mutate(across(everything(), ~ gsub("\\[.*\\]", "", .x))) %>%
  # Rearrange the country name into its natural order
  mutate(natural_country_name = sub("([[:alpha:]]*), ([[:alpha:]].*)",
                                    "\\2 \\1",
                                    country_name))


# Do a full join on the tables so I can tidy up by hand -------------------

countriesCodesAdjectivals <- countriesToCodes %>%
  select(natural_country_name, iso3166) %>%
  full_join(countriesToAdjectivals, by="natural_country_name")

# Only write the joined table if it hasn't been written (and hand-tidied) yet,
# so re-running the script doesn't clobber the manual edits
if (!file.exists("countries_codes_adjectivals.csv")) {
  write.csv(countriesCodesAdjectivals, "countries_codes_adjectivals.csv", row.names = FALSE)
}


# Map nationalities to codes ----------------------------------------------

countriesCodesAdjectivals <- read.csv("countries_codes_adjectivals.csv")

nationalitiesToCodes <- countriesCodesAdjectivals %>%
  select(-adjectivals) %>%
  gather("adjectival_number", "adjectival", starts_with("adjectival_")) %>%
  filter(!is.na(adjectival)) %>%
  select(natural_country_name, adjectival, iso3166) %>%
  gather("type", "nationality", natural_country_name, adjectival) %>%
  select(nationality, iso3166) %>%
  distinct()
write.csv(nationalitiesToCodes, "nationalities_codes.csv", row.names = FALSE)
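

# Example usage (a rough sketch, not part of the scrape itself): once
# nationalities_codes.csv exists it can serve as a lookup from free-text
# nationality strings to ISO codes. The `mentions` data frame below is
# made-up illustration data.
if (file.exists("nationalities_codes.csv")) {
  lookup <- read.csv("nationalities_codes.csv", stringsAsFactors = FALSE)
  mentions <- data.frame(nationality = c("French", "Germany", "Dutch"),
                         stringsAsFactors = FALSE)
  # Left join keeps every mention, with an NA code for anything unmatched
  print(left_join(mentions, lookup, by = "nationality"))
}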