Python: Get countries by continent from Wikipedia
Problem:
You need a list of countries, ordered by continent, under a liberal license.
Solution
One possibility is to use the data from Wikipedia and parse it from the wikitext format. For example, the English Wikipedia page ‘List of sovereign states and dependent territories by continent’ is a good source.
The script provided here downloads and parses that format without any external dependencies (Python core libraries only). Python 2.7+ is required. The country list is encoded as Unicode.
Update 1.1 (2015-12-14): Upgraded to Python 3 and added file headers. Update 1.2 (2017-02-19): Fixed parsing after the page format changed.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract a list of countries, grouped by continent, from Wikipedia
"""
__copyright__ = "Copyright (c) 2015 Uli Köhler"
__license__ = "Apache License v2.0"
__version__ = "1.2"
try:
import urllib.request as urllib2
except ImportError:
import urllib2
import re
def downloadWikipediaCountryList():
    """Download the Wikipedia country list in wikitext format.

    Returns:
        list: The raw wikitext of the page, decoded as UTF-8 and split
        into lines on newline characters.
    """
    # Request the raw wikitext over HTTPS rather than plain HTTP.
    url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_continent?action=raw"
    response = urllib2.urlopen(url)
    try:
        wikitext = response.read().decode("utf-8")
    finally:
        # Close explicitly instead of using `with`, because this file also
        # supports the Python 2 urllib2 module as a fallback import.
        response.close()
    return wikitext.split("\n")
def splitListAtIndices(theList, indices):
    """
    Cut a list into consecutive slices at the given positions.

    The first slice always starts at position 0 and the last slice runs to
    the end of the list, so ``len(indices) + 1`` slices are returned.

    >>> splitListAtIndices([1,3,5,7,9], [0, 3])
    [[], [1, 3, 5], [7, 9]]
    """
    # Pair every start position with the following cut position; the final
    # slice is open-ended (None).
    starts = [0] + indices
    stops = indices + [None]
    pieces = []
    for begin, end in zip(starts, stops):
        pieces.append(theList[begin:end])
    return pieces
def extractContinents(wikitextLines):
    """Group wikitext lines by the level-2 heading they appear under.

    Args:
        wikitextLines: List of wikitext lines.

    Returns:
        dict: Maps each heading title (e.g. "Africa", "North America") to the
        list of lines from that heading line (inclusive) up to, but not
        including, the next matched heading. Lines before the first heading
        are dropped. If a title occurs more than once, the last occurrence
        wins.
    """
    # Raw string avoids invalid-escape warnings. The title may consist of
    # several whitespace-separated words so that headings such as
    # "== North America ==" are recognised too. Deeper headings ("===X===")
    # do not match because "=" is not a word character.
    continentRegex = re.compile(r"==\s*(\w+(?:\s+\w+)*)\s*==")
    # Collect (line index, title) pairs, matching each line only once.
    headings = []
    for idx, line in enumerate(wikitextLines):
        match = continentRegex.match(line)
        if match:
            headings.append((idx, match.group(1)))
    linesByContinent = {}
    for pos, (start, title) in enumerate(headings):
        # A section ends where the next heading starts; the last one runs
        # to the end of the input.
        end = headings[pos + 1][0] if pos + 1 < len(headings) else None
        linesByContinent[title] = wikitextLines[start:end]
    return linesByContinent
# Matches a table cell whose content is wrapped in quote markup, optionally
# inside a [[wiki link]]. Raw string so the regex escapes are not interpreted
# as (invalid) string escape sequences.
countryRegex = re.compile(r"\| '+\[?\[?([^\]]+)\]?\]?'+")


def findCountry(line):
    """
    Extract the country name from a wikitext table line.

    Returns the name as a string, or None if the line does not start with a
    quote-emphasised table cell or if the entry contains "De facto". For
    piped links ("[[target|label]]") the text after the first "|" is
    returned.

    >>> findCountry("| '''[[Germany]]'''")
    'Germany'
    >>> findCountry("| [[Berlin]]")
    >>> findCountry("| '''Germany'''")
    'Germany'
    """
    match = countryRegex.match(line)
    if not match:
        return None
    # For unlinked names the greedy capture group also swallows part of the
    # closing quote markup; strip it so "'''Germany'''" yields "Germany".
    country = match.group(1).rstrip("'")
    if not country:
        return None
    if "De facto" in country:
        return None
    if "|" in country:
        # Piped link: keep the text after the first pipe.
        return country.partition("|")[2]
    return country
def extractCountries(linesByContinent):
    """Map each continent to the country names found in its lines.

    Args:
        linesByContinent: Dict mapping a continent name to a list of
            wikitext lines.

    Returns:
        dict: Same keys; each value is the list of truthy results of
        findCountry() for that continent's lines, in line order.
    """
    countriesByContinent = {}
    for continent, continentLines in linesByContinent.items():
        # Parse every line exactly once, then drop lines without a country.
        parsed = (findCountry(line) for line in continentLines)
        countriesByContinent[continent] = [name for name in parsed if name]
    return countriesByContinent
def getCountriesFromWikipedia():
    """Download the Wikipedia page and return its countries per continent.

    Chains the three steps of this module: download the wikitext lines,
    group them by continent heading, and extract the country names.
    """
    return extractCountries(
        extractContinents(
            downloadWikipediaCountryList()
        )
    )
if __name__ == "__main__":
    # Self-test: run the doctests embedded in this module's docstrings.
    import doctest
    doctest.testmod()
    # Fetch the data and print the resulting continent -> countries mapping.
    countriesByContinent = getCountriesFromWikipedia()
    print(countriesByContinent)