Intro

This is the script that scrapes and assembles the data used by the package. While both the script and the data are included, this script has a very high chance of breaking, given the likelihood of websites or URLs changing, locales differing, etc. If you’d like to assemble the data differently or are just curious how it was assembled, though, this script will be of use.

A function to refresh data is not included in the package because this script depends on web structures that may vary, requires a lot of dependencies (see below) and is irrelevant to the use of the package. If you do choose to run it, keep in mind that it scrapes a lot of sources; please be courteous and don’t re-scrape more times than necessary. Due to the number and size of sources, it will take a while to run, anyway, and so is probably best run by chunk. While structured as an RMarkdown notebook for ease of documentation and structure, the HTML file it knits is not useful and is thus not included.

Setup

Packages required:

Aside from rvest and tidyverse, most could be fairly easily refactored into an alternative.

library(rvest)
Loading required package: xml2
library(tidyverse)
── Attaching packages ──────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
✔ tibble  1.4.2          ✔ dplyr   0.7.4     
✔ tidyr   0.8.0          ✔ stringr 1.3.0     
✔ readr   1.2.0          ✔ forcats 0.3.0     
── Conflicts ─────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()         masks stats::filter()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ dplyr::lag()            masks stats::lag()
✖ purrr::pluck()          masks rvest::pluck()

Scraping

CIA World Factbook

# CIA World Factbook country names
# Scrapes the Factbook country-name field page into one row per table row,
# with one column per (normalised) name label.
country_names <- 'https://www.cia.gov/library/publications/the-world-factbook/fields/2142.html' %>%
    read_html() %>%
    # only table rows that carry an id attribute
    html_nodes('table tr[id]') %>%
    # list of each row's <td> nodes, named by the row's upper-cased id
    { setNames(map(., html_nodes, 'td'), toupper(html_attr(., 'id'))) } %>%
    # per row: the first cell is the country; in the second cell the <strong>
    # nodes supply the labels (key) and the bare text nodes supply the values.
    # `.id = 'gec'` stores the list names (the row ids) in a `gec` column --
    # presumably these ids are GEC codes, given the column name; confirm.
    map_df(~data_frame(
        country = html_text(.x[1]),
        key = map(.x[2], html_nodes, css = 'strong') %>% map(html_text),
        value = map(.x[2], html_nodes, xpath = 'text()') %>%
            map(html_text, trim = TRUE) %>%
            map(discard, `==`, '')
    ), .id = 'gec') %>%
    # expand the key/value list-columns
    unnest() %>%
    # Normalise the labels so they collapse into a small set of columns:
    #   - strip a trailing colon (plus whitespace) and any backticks
    #   - correct the misspelled 'etymolgy' label
    #   - 'official'/'Papiamentu' -> 'local', 'English' -> 'conventional'
    #   - 'Dutch' -> 'conventional' for Curacao, 'local' everywhere else
    # and overwrite the scraped country text with 'Australia' where gec is 'AS'.
    mutate(key = gsub(':\\s+$|`', '', key),
           key = gsub('etymolgy', 'etymology', key),
           key = gsub('official|Papiamentu', 'local', key),
           key = gsub('English', 'conventional', key),
           key = ifelse(country == 'Curacao',
                        gsub('Dutch', 'conventional', key),
                        gsub('Dutch', 'local', key)),
           country = ifelse(gec == 'AS', 'Australia', country)) %>%
    # one column per distinct key
    spread(key, value) %>% 
    # blank out whatever was scraped as Australia's abbreviation
    mutate(abbreviation = ifelse(country == 'Australia', NA, abbreviation))


# CIA World Factbook country codes
# Reads the Appendix D list and returns one row per list item with the
# code columns named below.
country_codes <- read_html('https://www.cia.gov/library/publications/the-world-factbook/appendix/appendix-d.html') %>%
    html_nodes('ul#GetAppendix_D li') %>%
    # per list item: trimmed text of its <td> cells, minus the 3rd and 10th
    map(~ html_text(html_nodes(.x, css = 'td'), trim = TRUE)[-c(3, 10)]) %>%
    # flip from one vector per item to one vector per cell position
    transpose() %>%
    simplify_all() %>%
    setNames(c('country', 'gec', 'iso2c', 'iso3c', 'iso3n', 'stanag', 'tld', 'comment')) %>%
    as_data_frame() %>%
    # '-' in any column, and '' in `comment`, become NA
    mutate_all(na_if, y = '-') %>%
    mutate(comment = na_if(comment, ''))

National Geospatial-Intelligence Agency

# Crosswalk spreadsheet downloaded from the NGA GeoNames server
# index page: http://geonames.nga.mil/gns/html/countrycodes.html
# file:       http://geonames.nga.mil/gns/html/docs/GENC_ED3U5_GEC_XWALK.xlsx
genc <- 'http://geonames.nga.mil/gns/html/docs/GENC_ED3U5_GEC_XWALK.xlsx' %>%
    # skip the first two rows; read '--' as NA; return a tibble
    rio::import(skip = 2, setclass = 'tbl_df', na = '--') %>%
    # '[None]' is a second missing-value marker
    modify(~ na_if(.x, '[None]'))

Wikipedia

# Wikipedia country codes
# Follows every "Country codes:" link on the category page, reads the tables
# on each linked page, and assembles one row per table (presumably one table
# per country -- confirm) with one column per code type.
w <- 'https://en.wikipedia.org/wiki/Category:Lists_of_country_codes' %>%
    read_html() %>%
    html_nodes('a[title*="Country codes:"]') %>%
    html_attr('href') %>%
    # prefix each href with the site root to get a full URL
    paste0('https://en.wikipedia.org', .) %>%
    map(read_html) %>%
    # on each page, the tables that directly follow an <h2>
    map(html_nodes, 'h2 + table') %>%
    # for every table on every page, its <td> cells
    modify_depth(2, html_nodes, 'td') %>%
    # per table: key = text of each cell's direct <a> and text children,
    # pasted together and trimmed; value = text of the <p> nodes in the cells.
    # `.id = 'row'` records which table of the page each pair came from.
    map(map_df, ~list(
        key = map(.x, html_nodes, xpath = 'a|text()') %>%
            map(html_text) %>%
            map_chr(paste, collapse = '') %>%
            trimws(),
        value = html_nodes(.x, 'p') %>% html_text()
    ), .id = 'row') %>%
    # widen each page to one row per table, then stack all pages
    map_df(spread, key, value) %>%
    select(-row) %>%
    # both the em dash and the plain hyphen mark a missing value
    mutate_all(na_if, y = '—') %>% modify(na_if, '-') %>%
    # multi-line cells become comma-separated
    modify(~gsub('\\n', ', ', .x)) %>%
    # NOTE(review): positional rename -- relies on the columns produced above
    # coming out in exactly this order and number (20); confirm after re-scraping.
    setNames(c('tld', 'calling', 'mcc', 'gec', 'gs1_gtin', 'icao_aircraft', 'icao_airport', 'ioc', 'iso2c', 'iso3c', 'iso3n', 'itu_callsign', 'itu', 'itu_maritime', 'license_plate', 'marc', 'stanag', 'nato2c', 'undp', 'wmo'))
# # Wikipedia FIFA codes - unused
# fifa <- 'https://en.wikipedia.org/wiki/List_of_FIFA_country_codes' %>%
#     read_html() %>%
#     html_nodes('table.wikitable') %>%
#     map(html_table, fill = TRUE) %>%
#     .[-7:-9] %>%    # remove obsolete codes tables
#     reduce(full_join) %>%
#     mutate(FIFA = coalesce(Code, FIFA)) %>%
#     select(-Code, -Confederation) %>%
#     mutate_all(funs(gsub('\\[.*\\]', '', .))) %>% mutate_all(na_if, y = '') %>%
#     setNames(tolower(names(.))) %>% rename(iso3c = iso)

ITU

Unused except for corrections of Wikipedia inaccuracies.

# First table on the ITU page, without its fifth column; the remaining
# columns are renamed by position and `tld` is lower-cased.
itu <- read_html('https://www.itu.int/online/mm/scripts/gensel8') %>%
    html_node('table') %>%
    html_table(header = TRUE) %>%
    select(-5) %>%
    set_names(c('en_iso', 'itu_region', 'itu', 'tld', 'itu_language')) %>%
    mutate(tld = tolower(tld))

UN ISO regions

# Every table on the UNSD M49 overview page, stacked into one tibble.
# Each table's rows are tagged with a `language` value taken from the table's
# id attribute: the 'downloadTable' part is removed and the rest lower-cased.
unstats <- read_html('https://unstats.un.org/unsd/methodology/m49/overview/') %>%
    html_nodes('table') %>%
    map_df(function(node) {
        node %>%
            html_table(header = TRUE) %>%
            mutate(language = tolower(sub('downloadTable', '',
                                          html_attr(node, 'id'))))
    }) %>%
    tbl_df()

OKFN

This data seems to have moved here.

# country-codes CSV from the `datasets/country-codes` GitHub repository
okfn <- 'https://github.com/datasets/country-codes/raw/master/data/country-codes.csv' %>%
    read_csv()

Unicode CLDR

# http://unicode.org/copyright.html
# http://unicode.org/repos/cldr/trunk/unicode-license.txt

# modern
# Locale directory names, taken from the link text of the repository's
# GitHub directory listing (depends on GitHub's page markup).
langs_modern <- 'https://github.com/unicode-cldr/cldr-localenames-modern/tree/master/main' %>%
    read_html() %>%
    html_nodes('.content span a') %>% html_text()

# Territory names for every locale: a `code` column plus one column per locale.
unicode_modern <- langs_modern %>%
    # `.` is passed explicitly below, so the pipe does not also insert it as
    # the first argument: the result is the vector of territories.json URLs,
    # named by locale.
    set_names(
        paste0('https://github.com/unicode-cldr/cldr-localenames-modern/raw/master/main/',
               ., '/territories.json'),
        .) %>%
    map(jsonlite::fromJSON) %>%
    # positional path into each parsed document
    # NOTE(review): c(1, 1, 2, 1) presumably lands on the territories map --
    # confirm against the current JSON layout.
    map(c(1, 1, 2, 1)) %>%
    simplify_all() %>%
    # per locale: two-column frame of element names (`code`) and values,
    # the value column being named after the locale
    imap(~set_names(data_frame(names(.x), .x),
                    c('code', .y))) %>%
    # merge all locales on `code`, keeping codes absent from some locales
    reduce(full_join, by = 'code')

# # unused
# lang_codes <- 'https://github.com/unicode-cldr/cldr-localenames-modern/raw/master/main/en-US-POSIX/languages.json' %>%
#     jsonlite::fromJSON() %>%
#     map(c(1,2,1)) %>% .[[1]] %>%
#     simplify()
# 
# lang_code_df <- data_frame(language = lang_codes,
#                            code = names(lang_codes))

# CLDR code mappings: one row per entry of the mappings list, keyed by the
# entry name (stored in `iso2c`).
unicode_codes <- jsonlite::fromJSON('https://github.com/unicode-cldr/cldr-core/raw/master/supplemental/codeMappings.json') %>%
    pluck(1, 2) %>%
    bind_rows(.id = 'iso2c') %>%
    # positional rename of the id column plus the four mapped fields
    set_names(c('iso2c', 'iso3n', 'iso3c', 'gec', 'tld')) %>%
    select(-tld) %>%
    # where no alpha-3 code is given, fall back to the entry key ...
    mutate(iso3c = coalesce(iso3c, iso2c)) %>%
    # ... and blank any key longer than two characters in `iso2c`
    mutate(iso2c = ifelse(nchar(iso2c) > 2, NA, iso2c))

Cleaning

# NOTE(review): `con` is not defined in the visible part of this script, and
# the recorded output below shows this call failing. Presumably `con` should
# be an opened connection to the saved `codelist` data -- confirm and define
# it before this line.
load(con)   # `codelist`
Error in load(con) : 
  the input does not start with a magic number compatible with loading from a connection

Joining

# CIA: code rows that have a GEC, plus the names table (natural join on all
# shared columns).
cia <- country_codes_c %>%
    drop_na(gec) %>%
    full_join(country_names_c)

# US government sources: GENC rows that have a GEC, plus CIA, matched on GEC.
# For ISO codes present on both sides the left-hand (GENC) value wins.
usg <- genc_c %>%
    drop_na(gec) %>%
    full_join(cia, by = 'gec') %>%
    mutate(
        iso3c = coalesce(iso3c.x, iso3c.y),
        iso2c = coalesce(iso2c.x, iso2c.y),
        iso3n = coalesce(iso3n.x, iso3n.y)
    ) %>%
    # drop every column whose name contains a dot (the .x/.y leftovers)
    select(-matches('\\.'))

# OKFN + UN Statistics (natural join on all shared columns)
ok_un <- full_join(okfn_c, unstats_c)

# ... + US government sources, matched on the ISO numeric code
us_ok_un <- ok_un %>%
    full_join(usg, by = 'iso3n') %>%
    mutate(
        iso3c = coalesce(iso3c.x, iso3c.y),
        iso2c = coalesce(iso2c.x, iso2c.y),
        gec = coalesce(gec.x, gec.y),
        tld = coalesce(tld.x, tld.y)
    ) %>%
    select(-matches('\\.'))

# ... + Wikipedia, matched on the ISO alpha-2 code
us_ok_un_w <- us_ok_un %>%
    full_join(w_c, by = 'iso2c') %>%
    mutate(
        iso3c = coalesce(iso3c.x, iso3c.y),
        iso3n = coalesce(iso3n.x, iso3n.y),
        gec = coalesce(gec.x, gec.y),
        marc = coalesce(marc.x, marc.y),
        # wmo = coalesce(wmo.x, wmo.y),    # w/okfn codes differ, and no official list to verify
        ioc = coalesce(ioc.x, ioc.y),
        stanag = coalesce(stanag.x, stanag.y),
        tld = coalesce(tld.x, tld.y)
    ) %>%
    select(-matches('\\.'))

# Unicode: every row of the localized-names table, with code mappings attached
# where the alpha-2 code matches.
unicode <- drop_na(unicode_codes, iso2c) %>%
    right_join(unicode_modern_c, by = 'iso2c') %>%
    mutate(iso3n = coalesce(iso3n.x, iso3n.y)) %>%
    select(-matches('\\.')) %>%
    # hard-coded numeric codes for CP, DG and XK; XK's alpha-3 code is blanked
    mutate(
        iso3n = case_when(
            iso2c == 'CP' ~ '905',
            iso2c == 'DG' ~ '908',
            iso2c == 'XK' ~ '901',
            TRUE ~ iso3n
        ),
        iso3c = ifelse(iso2c == 'XK', NA, iso3c)
    )

# Everything together, then the countrycode data attached by natural left join.
countries <- unicode %>%
    full_join(us_ok_un_w, by = c('iso3n', 'iso2c', 'alt')) %>%
    mutate(
        iso3c = coalesce(iso3c.x, iso3c.y),
        gec = coalesce(gec.x, gec.y)
    ) %>%
    select(-matches('\\.')) %>%
    left_join(countrycode_data_c)
# check code duplication
# For every column name of `us_ok_un_w`, count the rows of `countries`
# (restricted to rows where `alt` is NA) whose value in that column is
# non-missing and occurs in more than one row. Returns an integer vector
# named by column; 0 means the column has no repeated values.
us_ok_un_w %>% names() %>% 
    map_int(~ countries %>% 
                filter(is.na(alt)) %>% 
                # group by the column under test (`.x` is its name)
                group_by_at(.x) %>% 
                # keep groups with more than one row, i.e. repeated values
                filter(n() > 1) %>% 
                select(gec, en, iso3c, iso2c, iso3n) %>% 
                # rows sharing NA also form a group of size > 1; drop them.
                # Here `.` is the piped data frame and `.x` the column name.
                .[!is.na(.[[.x]]),] %>% 
                nrow()) %>% 
    set_names(names(us_ok_un_w))
                     ar_iso                      cn_iso 
                          8                           8 
                     en_iso                      es_iso 
                          8                           8 
                     fr_iso                      ru_iso 
                          8                           8 
                      edgar                        fifa 
                          0                           2 
                       gaul                  iso4217_3c 
                          0                         104 
               iso4217_name                  iso4217_3n 
                        107                         104 
             is_independent                     capital 
                        245                           2 
                  continent              un_region_code 
                        208                         255 
          un_subregion_code un_intermediate_region_code 
                        255                         115 
                        m49                         ldc 
                          8                         256 
                       lldc                        sids 
                        256                         256 
               is_developed                       ar_un 
                        255                           8 
  ar_un_intermediate_region                ar_un_region 
                        115                         255 
            ar_un_subregion                       en_un 
                        255                           8 
  en_un_intermediate_region                en_un_region 
                        115                         255 
            en_un_subregion                       es_un 
                        255                           8 
  es_un_intermediate_region                es_un_region 
                        115                         255 
            es_un_subregion                       fr_un 
                        255                           8 
  fr_un_intermediate_region                fr_un_region 
                        115                         255 
            fr_un_subregion                       ru_un 
                        255                           8 
  ru_un_intermediate_region                ru_un_region 
                        115                         255 
            ru_un_subregion                       zh_un 
                        255                           8 
  zh_un_intermediate_region                zh_un_region 
                        115                         255 
            zh_un_subregion         en_cia_abbreviation 
                        255                           0 
                        alt                      en_cia 
                          0                           0 
               en_cia_local                       iso2c 
                          0                           0 
                    calling                         mcc 
                         26                          20 
              icao_aircraft                icao_airport 
                         39                          25 
               itu_callsign                         itu 
                          3                           0 
               itu_maritime               license_plate 
                          0                           9 
                     nato2c                        undp 
                          0                           0 
                      iso3c                       iso3n 
                          0                           0 
                        gec                        marc 
                          0                           7 
                        ioc                      stanag 
                         20                          22 
                        tld 
                          5 

Documenting

# Column names of the assembled data, saved separately as well as used below.
countries_colnames <- names(countries)
# `codes`: one row of documentation per column of `countries`.
# Each column name is hyphenated and run through the IETF language-tag parser
# to derive its tag components; a display name and notes are then attached
# from the hand-written lookups below.
codes <- data_frame(column = countries_colnames, 
                 code = gsub('_', '-', column)) %>% 
    # Parse each hyphenated name as a language tag. `safely()` keeps parse
    # failures from aborting the run; `c(1, 1)` then pulls the first element
    # of the `result` part. Names that produce nothing get a lone NA Language.
    # Each remaining component is collapsed to a comma-separated string
    # ('' -> NA), and everything from '=' onwards is stripped from the
    # component names.
    mutate(expansion = map(code, safely(NLP::parse_IETF_language_tag), expand = TRUE), 
           expansion = map(expansion, c(1, 1)), 
           expansion = map(expansion, ~if(length(.x) == 0) {c(Language = NA_character_)} else .x), 
           expansion = map(expansion, map_df, ~suppressWarnings(na_if(toString(na.omit(.x)), ''))), 
           expansion = map(expansion, ~set_names(.x, sub('=.*', '', names(.x))))) %>% 
    # spread the parsed components out into their own columns
    unnest() %>% 
    # Manual overrides of the parse: `alt`, `mcc` and `tld` get no Language
    # (presumably because the parser reads them as language codes -- confirm);
    # Variant/Extension are set by hand for the POSIX, ISO and CIA columns.
    # NOTE(review): `grepl('_un', column)` in `name` also matches columns such
    # as `en_un`, not only the `*_un_region`-style ones -- confirm the
    # "Geoscheme region name" label is intended for those.
    # NOTE(review): 'offical' and 'superceded' in the `notes` strings look
    # like typos; left unchanged here because these strings ship in the data.
    mutate(Language = case_when(column == 'alt' ~ NA_character_,
                                column == 'mcc' ~ NA_character_,
                                column == 'tld' ~ NA_character_,
                                TRUE ~ Language),
           Variant = ifelse(column == 'en_us_posix', 'POSIX', Variant),
           Variant = ifelse(grepl('_iso', column), 'ISO', Variant),
           Variant = ifelse(grepl('_cia', column), 'CIA World Factbook', Variant),
           Extension = ifelse(grepl('_cia_', column), 
                              sub('en_cia_', '', column), Extension),
           name = case_when(
               column == 'iso2c' ~ 'ISO 3166-1 Alpha-2 code',
               column == 'iso3c' ~ 'ISO 3166-1 Alpha-3 code',
               column == 'iso3n' ~ 'ISO 3166-1 numeric code',
               column == 'en_iso' ~ 'ISO English name',
               column == 'fr_iso' ~ 'ISO French name',
               column == 'gec' ~ 'Geopolitical Entities and Codes',
               column == 'fifa' ~ 'FIFA (Fédération Internationale de Football Association) code',
               column == 'gaul' ~ 'Global Administrative Unit Layers from the Food and Agriculture Organization (FAO) code',
               column == 'iso4217_3c' ~ 'ISO 4217 3-character currency code',
               column == 'iso4217_name' ~ 'ISO 4217 currency name',
               column == 'iso4217_3n' ~ 'ISO 4217 numeric currency code',
               column == 'is_independent' ~ 'Country sovereignty status from the CIA World Factbook',
               column == 'capital' ~ 'Capital city',
               column == 'edgar' ~ 'EDGAR country code from the SEC',
               column == 'en_cia' ~ 'Country names from the CIA World Factbook',
               column == 'en_cia_local' ~ 'Local country names from the CIA World Factbook',
               column == 'en_cia_abbreviation' ~ 'Commonly used country abbreviations from the CIA World Factbook.',
               column == 'mcc' ~ 'International Telecommunication Union (ITU) Telecommunication Standardization Sector (ITU-T) E.212 Mobile Country Code',
               column == 'itu_callsign' ~ 'International Telecommunication Union (ITU) callsign prefixes for radio and television stations',
               column == 'itu' ~ 'International Telecommunication Union (ITU) 1-3 character country code',
               column == 'itu_maritime' ~ 'International Telecommunication Union (ITU) Maritime Identification Digits',
               column == 'license_plate' ~ 'Motor vehicle licence plate country code',
               column == 'stanag' ~ 'North Atlantic Treaty Organization (NATO/OTAN) STANAG 1059 Letter Codes for Geographical Entities',
               column == 'nato2c' ~ 'North Atlantic Treaty Organization (NATO/OTAN) 2-letter code.',
               column == 'undp' ~ 'United Nations Development Programme (UNDP) country code',
               column == 'marc' ~ 'MAchine-Readable Cataloging (MARC) codes from the US Library of Congress',
               column == 'calling' ~ 'International Telecommunication Union (ITU) Telecommunication Standardization Sector (ITU-T) E.164 international telephone calling code',
               column == 'ioc' ~ 'International Olympic Committee country code',
               column == 'tld' ~ 'Internet Assigned Numbers Authority (IANA) country code top-level domain',
               column == 'm49' ~ 'United Nations Statistics Division (UNSD) M.49 area code',
               column == 'ldc' ~ 'United Nations (UN) Least Developed Countries',
               column == 'lldc' ~ 'United Nations (UN) Land Locked Developing Countries',
               column == 'sids' ~ 'United Nations (UN) Small Island Developing States',
               column == 'is_developed' ~ 'United Nations (UN) development status',
               grepl('_un', column) ~ 'United Nations (UN) Geoscheme region name',
               TRUE ~ NA_character_
           ),
           notes = case_when(
               column == 'gec' ~ 'Formerly FIPS Pub 10-4, which was withdrawn by NIST in 2008. Maintained until 2014 by the National Geospatial-Intelligence Agency (NGA), after which it was frozen and superceded by GENC, the US government profile of ISO 3166.',
               column == 'en_iso' ~ 'Does not include Taiwan, which is not a member of the UN. A few use uncommon offical forms, including North and South Korea and Bolivia.',
               column == 'fr_iso' ~ 'Does not include Taiwan, which is not a member of the UN. A few use uncommon offical forms, including North and South Korea and Bolivia.',
               column == 'en_cia' ~ 'Many short and long forms. Uses "Burma" instead of "Myanmar".',
               column == 'en_cia_local' ~ 'Many short and long forms. Includes alternatives inline. Transliterates to Latin script.',
               column == 'en_cia_abbreviation' ~ 'Only included where commonly used. Includes alternatives inline.',
               column == 'mcc' ~ 'Includes ranges.',
               column == 'itu_callsign' ~ 'Includes ranges.',
               column == 'itu_maritime' ~ 'Includes ranges.',
               column == 'nato2c' ~ 'Officially deprecated in favor of STANAG 1059 (see "stanag").',
               column == 'tld' ~ 'Includes leading period.',
               grepl('_un_region', column) ~ 'Continent',
               grepl('regex', column) ~ 'Regex used for `parse_country`.',
               TRUE ~ NA_character_
           )) %>% 
    # normalise the column names of this table
    janitor::clean_names() %>% 
    arrange(column) %>% 
    # positional reorder: columns 8-9 (presumably `name` and `notes`, the two
    # added last) move up behind the first two -- confirm the column count if
    # the parser output changes
    select(1:2, 8:9, 3:7)

Saving

# Transliterate every column of `codes` to ASCII, to avoid the CRAN warning
# about non-ASCII characters in data.
codes <- modify(codes, ~ stringi::stri_trans_general(.x, 'Latin-ASCII'))

# `countries` and its column names are stored as internal package data ...
devtools::use_data(
    countries, countries_colnames,
    internal = TRUE, overwrite = TRUE
)
# ... while `codes` is saved as a regular dataset.
devtools::use_data(codes, overwrite = TRUE)

Licensing

Data is licensed according to its source, most of which are in the public domain. Exceptions include

