# Python: Get countries by continent from Wikipedia

### Problem:

You need a list of countries, ordered by continent, under a liberal license.

### Solution:

One possibility is to take the data from Wikipedia and parse it out of the Wikitext format. For example, the English Wikipedia article ‘List of sovereign states and dependent territories by continent’ is a good source.

The script provided here downloads and parses that format without any external dependencies (Python core libraries only). Python 2.7+ is required. The country list is encoded as Unicode.

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract a list of countries, grouped by continent, from Wikipedia
"""
__version__ = "1.2"

try:
import urllib.request as urllib2
except ImportError:
import urllib2
import re

url = "http://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_continent?action=raw"
return wikitext.split("\n")

def splitListAtIndices(theList, indices):
    """
    Split a list at a given set of indices.

    Every index marks the start of a new chunk. The first chunk always runs
    from the beginning of the list up to the first index, so it is empty when
    the first index is 0.

    :param theList: The sequence to split (anything that supports slicing).
    :param indices: Iterable of split positions. They are expected to be in
        ascending order; this is not checked.
    :return: A list with len(indices) + 1 slices of theList.

    >>> splitListAtIndices([1,3,5,7,9], [0, 3])
    [[], [1, 3, 5], [7, 9]]
    >>> splitListAtIndices([1,3,5,7,9], (0, 3))
    [[], [1, 3, 5], [7, 9]]
    >>> splitListAtIndices([1,3,5,7,9], [])
    [[1, 3, 5, 7, 9]]
    """
    # Materialize once: the concatenations below need a real list, and an
    # iterator must not be consumed twice.
    indices = list(indices)
    # Pair each start with the following stop. The trailing None makes the
    # last slice run to the end of the list.
    return [theList[i:j] for i, j in zip([0] + indices, indices + [None])]

def extractContinents(wikitextLines):
    """
    Group wikitext lines by the level-2 heading ("== Name ==") they follow.

    Every line that starts with a level-2 heading opens a new section; the
    heading line itself is the first element of that section. Lines before the
    first heading are dropped. Deeper headings ("=== Name ===") do not start a
    new section. If a heading name occurs more than once, the last section
    with that name wins.

    Note that every level-2 heading is treated as a section, not only those
    that name a continent.

    :param wikitextLines: Iterable of wikitext lines.
    :return: dict mapping heading name -> list of lines in that section.

    >>> extractContinents(["x", "== North America ==", "a", "==Asia==", "b"])
    ... == {"North America": ["== North America ==", "a"], "Asia": ["==Asia==", "b"]}
    True
    """
    # [^=]+? instead of \w+ so that multi-word headings such as
    # "North America" are recognized. Excluding "=" keeps "===Sub===" from
    # matching, because the name would have to start with "=".
    continentRegex = re.compile(r"==\s*([^=]+?)\s*==")
    linesByContinent = {}
    currentLines = None  # Stays None until the first heading is seen
    for line in wikitextLines:
        match = continentRegex.match(line)
        if match:
            currentLines = [line]
            linesByContinent[match.group(1)] = currentLines
        elif currentLines is not None:
            currentLines.append(line)
    return linesByContinent

# Matches a table cell that starts with bold/italic quote markup, e.g.
# "| '''[[Germany]]'''". Group 1 is the text inside the optional link brackets.
# Raw string: the pattern contains backslashes that are not valid Python
# string escapes.
countryRegex = re.compile(r"\| '+\[?\[?([^\]]+)\]?\]?'+")


def findCountry(line):
    """
    Extract the country name from a single wikitext table line.

    For piped links ("[[Target|Label]]") the label after the "|" is returned.
    Entries containing "De facto" are skipped (presumably they are not country
    names -- confirm against the article).

    :param line: One line of wikitext.
    :return: The country name, or None if the line does not start with a
        quote-emphasized table cell or is a "De facto" entry.

    >>> findCountry("| '''[[Germany]]'''")
    'Germany'
    >>> findCountry("| [[Berlin]]")
    >>> findCountry("| '''[[Georgia (country)|Georgia]]'''")
    'Georgia'
    >>> findCountry("| '''Germany'''")
    'Germany'
    """
    match = countryRegex.match(line)
    if not match:
        return None
    country = match.group(1)
    if "De facto" in country:
        return None
    if "|" in country:
        # Piped link: keep the display label, not the link target
        country = country.partition("|")[2]
    # Without link brackets the greedy group swallows all but one of the
    # closing quote characters; strip them so no markup leaks into the name.
    return country.rstrip("'")

def extractCountries(linesByContinent):
    """
    Map each continent to the country names found in its wikitext lines.

    :param linesByContinent: dict mapping continent name -> list of wikitext
        lines.
    :return: dict mapping continent name -> list of country names in line
        order. Lines for which findCountry() returns None or an empty string
        are left out.
    """
    countriesByContinent = {}
    for continent, continentLines in linesByContinent.items():
        # Parse every line exactly once, then drop the lines without a country
        candidates = [findCountry(line) for line in continentLines]
        countriesByContinent[continent] = [name for name in candidates if name]
    return countriesByContinent

def getCountriesFromWikipedia():