Python: Get countries by continent from Wikipedia

Problem:

You need a list of countries, ordered by continent, under a liberal license.

Solution:

One possibility is to use the data from Wikipedia and parse the information from the Wikitext format. For example, the English Wikipedia article ‘List of sovereign states and dependent territories by continent’ provides a good source.

The script provided here downloads and parses that format without any external dependencies (Python core libraries only). Python 2.7+ is required. The country list is encoded as Unicode.

Update 1.1 (2015-12-14): Upgraded to Python 3, added file headers.
Update 1.2 (2017-02-19): Fixed the parser for the changed format.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract a list of countries, grouped by continent, from Wikipedia
"""
__copyright__ = "Copyright (c) 2015 Uli Köhler"
__license__ = "Apache License v2.0"
__version__ = "1.2"

try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
import re

def downloadWikipediaCountryList():
    """
    Download the Wikipedia country list in wikitext format.

    Returns the raw wikitext of the article, decoded as UTF-8 and split
    into a list of lines (split on "\\n" only). Any error raised by
    urlopen (e.g. no network connectivity) propagates to the caller.
    """
    # Imported locally so this function does not depend on a new file-level import.
    # closing() is used instead of a bare "with urlopen(...)" because the
    # Python 2 urllib2 response object is not a context manager.
    from contextlib import closing
    url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_continent?action=raw"
    with closing(urllib2.urlopen(url)) as response:
        wikitext = response.read().decode("utf-8")
    return wikitext.split("\n")

def splitListAtIndices(theList, indices):
    """
    Split a list into consecutive chunks at a given set of indices.

    The first chunk runs from the start of the list up to the first index,
    each following chunk runs from one index up to the next, and the last
    chunk runs from the final index to the end of the list. The result
    therefore always contains len(indices) + 1 chunks.

    indices may be any iterable of integers (list, tuple, generator, ...);
    it is expected to be in ascending order.

    >>> splitListAtIndices([1,3,5,7,9], [0, 3])
    [[], [1, 3, 5], [7, 9]]
    >>> splitListAtIndices([1,3,5,7,9], (2,))
    [[1, 3], [5, 7, 9]]
    >>> splitListAtIndices([1,3,5,7,9], [])
    [[1, 3, 5, 7, 9]]
    """
    # Materialize once: the cut points are used twice below, and list
    # concatenation would fail for tuples or one-shot iterators.
    cutPoints = list(indices)
    # Pair every start offset with the following cut point; None as the
    # final stop makes the last slice run to the end of the list.
    return [theList[start:stop]
            for start, stop in zip([0] + cutPoints, cutPoints + [None])]

def extractContinents(wikitextLines):
    """
    Group wikitext lines by the level-2 heading ("== Name ==") they follow.

    Returns a dict mapping each heading name to the list of lines that
    belong to it, starting with the heading line itself and ending just
    before the next level-2 heading. Lines before the first heading are
    dropped. If the same heading name occurs twice, the later section
    replaces the earlier one.

    Heading names may consist of several words (e.g. "North America").
    Deeper headings such as "=== Name ===" do not start a new section.
    Every level-2 heading is treated as a section, whether or not it
    actually names a continent.

    >>> extractContinents(["x", "== North America ==", "a"])
    {'North America': ['== North America ==', 'a']}
    """
    # One or more whitespace-separated words between the "==" markers.
    continentRegex = re.compile(r"==\s*(\w+(?:\s+\w+)*)\s*==")
    linesByContinent = {}
    currentSection = None  # None until the first heading has been seen
    for line in wikitextLines:
        headingMatch = continentRegex.match(line)
        if headingMatch:
            # A heading opens a fresh section that includes the heading line
            currentSection = [line]
            linesByContinent[headingMatch.group(1)] = currentSection
        elif currentSection is not None:
            currentSection.append(line)
    return linesByContinent

# Matches a table cell whose content is bold/italic ('' or '''), optionally
# wrapped in a [[wiki link]]. Group 1 is the text between the quote markers.
countryRegex = re.compile(r"\| '+\[?\[?([^\]]+)\]?\]?'+")
def findCountry(line):
    """
    Extract the country name from a wikitext table line.

    Returns the name if the line starts with a quoted (bold/italic) table
    cell, or None if the line does not match, the cell contains
    "De facto", or no name is left after cleanup. For piped links
    ("[[Target|Label]]") the label after the pipe is returned.

    >>> findCountry("| '''[[Germany]]'''")
    'Germany'
    >>> findCountry("| '''Germany'''")
    'Germany'
    >>> findCountry("| '''[[Georgia (country)|Georgia]]'''")
    'Georgia'
    >>> findCountry("| [[Berlin]]")
    """
    match = countryRegex.match(line)
    if not match:
        return None
    country = match.group(1)
    if "De facto" in country:
        return None
    if "|" in country:
        # Piped link: keep only the display text after the pipe
        country = country.partition("|")[2]
    # Without [[...]] the greedy capture group also swallows all but the
    # last of the closing quote markers, so strip them off the name.
    country = country.rstrip("'")
    return country or None

def extractCountries(linesByContinent):
    """
    Map each continent to the list of country names found in its lines.

    linesByContinent is a dict of continent name -> list of wikitext lines.
    Every line is passed through findCountry exactly once; lines for which
    it returns nothing (None or an empty name) are skipped. The order of
    the countries follows the order of the lines.
    """
    return {continent: [country for country in map(findCountry, continentLines) if country]
            for continent, continentLines in linesByContinent.items()}


def getCountriesFromWikipedia():
    """
    Download the Wikipedia article and return its countries by continent.

    Chains the three processing steps: download the wikitext lines, group
    them by continent heading, then extract the country names per group.
    The return value is whatever extractCountries produces for that input.
    """
    return extractCountries(
        extractContinents(
            downloadWikipediaCountryList()))

if __name__ == "__main__":
    # Self-test: run the doctests embedded in this module's docstrings
    import doctest
    doctest.testmod()
    # Download, parse and print the country list
    countriesByContinent = getCountriesFromWikipedia()
    print(countriesByContinent)

If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow

Buy me a coffee