scrapeCountryInfo.R (3306B)
# Here's some code to pull in the tables I want for mapping countries to
# adjectivals and their ISO 3166-1 alpha-2 codes

# For now, the CSVs this writes still need a little manual touch-up. It's not
# much data, though.

require(rvest)
require(dplyr)
require(tidyr)   # for gather()

# Fetch a page and return the first table matched by the XPath as a data frame
getTableFromWeb <- function(url, xpath) {
  tableList <- url %>%
    read_html() %>%
    html_nodes(xpath=xpath) %>%
    html_table(fill=TRUE)
  return(tableList[[1]])
}


# Countries and their adjectival forms ------------------------------------

countriesToAdjectivals <- getTableFromWeb("https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations",
                                          "//*[@id=\"mw-content-text\"]/table[1]")

colnames(countriesToAdjectivals) <- sub(" ", "_",
                                        tolower(colnames(countriesToAdjectivals)))

countriesToAdjectivals <- countriesToAdjectivals[2:nrow(countriesToAdjectivals),] %>%
  select(country_name, adjectivals) %>%
  # Get rid of the wikipedia cruft
  mutate(across(everything(), ~ gsub("\\[.*\\]", "", .x))) %>%
  # For later splitting of adjectivals
  mutate(adjectivals = sub(" or ", ", ", adjectivals)) %>%
  # Rearrange the country name into its natural order
  mutate(natural_country_name = sub("([[:alpha:]]*), ([[:alpha:]].*)",
                                    "\\2 \\1",
                                    country_name))

# Split the comma-separated adjectivals into one column per adjectival
splitAdjectivals <- strsplit(countriesToAdjectivals[["adjectivals"]], ",[[:space:]]*")
for (i in seq_len(max(vapply(splitAdjectivals, length, 1)))) {
  countriesToAdjectivals[[paste("adjectival", i, sep="_")]] <- vapply(splitAdjectivals, function(x) { x[i] }, "")
}


# Countries and ISO 3166-1 alpha-2 codes ----------------------------------

countriesToCodes <- getTableFromWeb("https://en.wikipedia.org/wiki/ISO_3166-2",
                                    "//*[@id=\"mw-content-text\"]/table[1]")

colnames(countriesToCodes) <- c("iso3166", "country_name", "subdivisions")
countriesToCodes <- countriesToCodes %>%
  # Get rid of the wikipedia cruft
  mutate(across(everything(), ~ gsub("\\[.*\\]", "", .x))) %>%
  # Rearrange the country name into its natural order
  mutate(natural_country_name = sub("([[:alpha:]]*), ([[:alpha:]].*)",
                                    "\\2 \\1",
                                    country_name))


# Do a full join on the tables so I can tidy up by hand -------------------

countriesCodesAdjectivals <- countriesToCodes %>%
  select(natural_country_name, iso3166) %>%
  full_join(countriesToAdjectivals, by="natural_country_name")

# Only write the joined table if it hasn't been written (and hand-tidied) yet,
# so re-running the script doesn't clobber the manual edits
if (!file.exists("countries_codes_adjectivals.csv")) {
  write.csv(countriesCodesAdjectivals, "countries_codes_adjectivals.csv", row.names = FALSE)
}


# Map nationalities to codes ----------------------------------------------

countriesCodesAdjectivals <- read.csv("countries_codes_adjectivals.csv")

nationalitiesToCodes <- countriesCodesAdjectivals %>%
  select(-adjectivals) %>%
  gather("adjectival_number", "adjectival", starts_with("adjectival_")) %>%
  filter(!is.na(adjectival)) %>%
  select(natural_country_name, adjectival, iso3166) %>%
  gather("type", "nationality", natural_country_name, adjectival) %>%
  select(nationality, iso3166) %>%
  distinct()
write.csv(nationalitiesToCodes, "nationalities_codes.csv", row.names = FALSE)
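

# Example usage (a rough sketch, not part of the scrape itself): once
# nationalities_codes.csv exists it can serve as a lookup from free-text
# nationality strings to ISO codes. The `mentions` data frame below is
# made-up illustration data.
if (file.exists("nationalities_codes.csv")) {
  lookup <- read.csv("nationalities_codes.csv", stringsAsFactors = FALSE)
  mentions <- data.frame(nationality = c("French", "Germany", "Dutch"),
                         stringsAsFactors = FALSE)
  # Left join keeps every mention, with an NA code for anything unmatched
  print(left_join(mentions, lookup, by = "nationality"))
}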