#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Extract the current list of named entities from the HTML5 “standard” in CSV.

Depends on the PyQuery library.  Note that this doesn’t do any
caching, so you probably want to save its output.

The script runs under both Python 2 and Python 3.
"""
import csv
import io
import re
import sys
import urllib

try:
    # Python 3: urlopen lives in urllib.request and raises HTTPError itself
    # for error statuses instead of handing back the error response.
    from urllib.request import urlopen as _urlopen
    from urllib.error import HTTPError as _UrllibHTTPError
    _HTTP_ERRORS = (_UrllibHTTPError,)
except ImportError:
    # Python 2: urllib.urlopen returns the response whatever its status,
    # so there is nothing to catch (an empty tuple matches no exception).
    _urlopen = urllib.urlopen
    _HTTP_ERRORS = ()

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns a Unicode string

url = "https://developers.whatwg.org/named-character-references.html"

# Matches one code point description such as "U+0233E"; group 1 is the hex.
_CODE_POINT_RE = re.compile(r'U\+([0-9A-F]+)')


def main(argv):
    """Write all named entities to standard output as UTF-8 CSV.

    ``argv`` is a ``sys.argv``-style list.  If it has a second element,
    that element is used as the URL to fetch; otherwise the module-level
    ``url`` is used.
    """
    csv.register_dialect('excel_without_cr', lineterminator="\n")
    source = argv[1] if len(argv) > 1 else url

    if sys.version_info[0] < 3:
        # Python 2's csv module writes byte strings, so feed it UTF-8 bytes.
        rows = get_entities_utf8(source)
        csv.writer(sys.stdout, 'excel_without_cr').writerows(rows)
        return

    # Python 3's csv module writes text; wrap the binary stdout so the
    # output is UTF-8 regardless of the locale, with no newline translation.
    sys.stdout.flush()
    out = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', newline='')
    try:
        csv.writer(out, 'excel_without_cr').writerows(get_entities(source))
        out.flush()
    finally:
        # Detach rather than close, so sys.stdout stays usable afterwards.
        out.detach()


def get_entities_utf8(from_url=None):
    """Yield (name, utf-8-string) pairs for all named entities from the web.

    ``from_url`` defaults to the module-level ``url``.
    """
    return ((name, s.encode('utf-8')) for name, s in get_entities(from_url))


def get_entities(from_url=None):
    """Yield (name, unicodestring) pairs for all named entities from the web.

    ``from_url`` defaults to the module-level ``url``.
    """
    return parse(get(from_url or url))


def parse(data):
    """Yield (name, unicodestring) pairs for all named entities from input data.

    ``data`` is markup handed to PyQuery.  Every ``tr`` inside the element
    with id ``named-character-references-table`` that has more than one
    ``td`` produces one pair: the stripped text of the first cell prefixed
    with ``&``, and the second cell's text converted by ``encode``.
    """
    # Imported here so the helpers that do not need PyQuery stay importable
    # when the third-party package is not installed.
    import pyquery

    pq = pyquery.PyQuery
    document = pq(data)
    for row in document('#named-character-references-table tr'):
        cells = pq(row)('td')
        if len(cells) > 1:
            yield '&' + pq(cells[0]).text().strip(), encode(pq(cells[1]).text())


def encode(character_descriptions):
    """Convert a string like 'U+0233E U+00333' into a Python Unicode string.

    Text that does not match the ``U+<HEX>`` pattern is ignored, so input
    with no code point descriptions yields the empty string.
    """
    return u''.join(_encode(character_descriptions))


def _encode(character_descriptions):
    """Yield one Unicode character per ``U+<HEX>`` description in the input."""
    for description in _CODE_POINT_RE.finditer(character_descriptions):
        yield _unichr(int(description.group(1), 16))


def get(url):
    """Return the response entity from an HTTP request or raise an HttpError.

    A response whose status code is ``None`` is accepted as successful.
    HttpError is raised with ``(status_code, body)`` for any other status
    than 200.
    """
    try:
        response = _urlopen(url)
    except _HTTP_ERRORS as err:
        # Python 3 raises for error statuses; translate to this module's
        # exception so callers see the same error type on both versions.
        try:
            body = err.read()
        finally:
            err.close()
        raise HttpError(err.code, body)

    try:
        if response.code is not None and response.code != 200:
            raise HttpError(response.code, response.read())
        return response.read()
    finally:
        response.close()


class HttpError(Exception):
    """Raised by get() if there is an HTTP error.

    Constructed with the status code and the response body.
    """


if __name__ == '__main__':
    main(sys.argv)