marksman_escape 0.1.2

HTML escape and HTML unescape strings
Documentation
import html.entities
import itertools
import collections
import math

class TrieBuilder:
    """Mutable trie node used to generate the source text of a static trie.

    Words are inserted one character at a time with :meth:`add_word`;
    :meth:`build` then renders the node and all of its descendants as a
    nested ``Trie(&[...], index, start)`` expression.
    """

    def __init__(self):
        # Value stored for the word that ends exactly at this node, or None
        # when no inserted word ends here.
        self.value = None
        # Child nodes keyed by the next character; created on first access.
        self.suffixes = collections.defaultdict(TrieBuilder)

    def add_word(self, word, value):
        """Insert *word* below this node and attach *value* to its last node.

        An empty *word* stores *value* on this node itself, overwriting any
        value stored earlier.
        """
        if not word:
            self.value = value
        else:
            letter, rest = word[0], word[1:]
            self.suffixes[letter].add_word(rest, value)

    def build(self):
        """Render this node and its descendants as a ``Trie(...)`` expression.

        The children are laid out as a dense array covering the code points
        from the smallest to the largest child character; slots without a
        child are rendered as ``None``.  The second field is the index that
        the module-level ``indexes`` mapping holds for this node's value, or
        ``0xffff`` when the value is not in that mapping.  The third field is
        the code point of the first array slot (``0xff`` for a node without
        children).

        Relies on the module-level names ``indexes`` and ``null_or_trie``.
        """
        if self.suffixes:
            code_points = [ord(letter) for letter in self.suffixes]
            suffix_ascii_start = min(code_points)
            suffix_ascii_end = max(code_points)
            # 0 marks an unused slot; null_or_trie turns it into 'None'.
            children = [0] * (suffix_ascii_end - suffix_ascii_start + 1)
        else:
            # Leaf node: empty child array with a placeholder start offset.
            suffix_ascii_start = 0xff
            children = []
        for k, v in self.suffixes.items():
            children[ord(k) - suffix_ascii_start] = v.build()
        return "Trie(&[{}], {}, {})".format(", ".join(map(null_or_trie, children)),
                                           indexes.get(self.value, 0xffff),
                                           suffix_ascii_start)

def null_or_trie(e):
    """Render one child slot: ``'None'`` for an empty slot, else ``'Some(&...)'``.

    A slot counts as empty when it compares equal to ``0``; anything else is
    formatted into the ``Some(&{})`` wrapper.
    """
    return 'None' if e == 0 else 'Some(&{})'.format(e)

def to_escapes(w):
    """Return the UTF-8 bytes of *w* as a run of ``\\xNN`` escapes.

    Each byte becomes a backslash, ``x`` and two uppercase hex digits.
    """
    encoded = bytes(w, 'utf-8')
    render_byte = "\\x{:02X}".format
    return ''.join(map(render_byte, encoded))


if __name__ == "__main__":
    # Script entry point: writes unescape_named_gen.rs with the longest
    # reference length, the table of decoded byte strings and the trie that
    # maps entity names to positions in that table.
    entities = html.entities.html5
    with open("unescape_named_gen.rs", "w") as f:
        f.write("// This file is autogenerated by generate.py, do not modify\n\n")

        # LONGEST_NAMED_REFERENCE generation
        longest = max(len(name) for name in entities)
        f.write("/// Length of the longest supported/known character reference name.\n")
        f.write("pub const LONGEST_NAMED_REFERENCE : usize = {};\n\n".format(longest))

        # DECODED and TRIE generation
        trie = TrieBuilder()
        # Read by TrieBuilder.build(): decoded string -> its position in
        # DECODED.  Names sharing a decoded string end up sharing the position
        # of the last such name in sorted order.
        indexes = {}
        letters = []
        f.write("pub static DECODED: [&'static [u8]; {}] = [\n".format(len(entities)))
        for position, name in enumerate(sorted(entities)):
            decoded = entities[name]
            indexes[decoded] = position
            letters.append("b\"{}\"".format(to_escapes(decoded)))
            trie.add_word(name, decoded)

        # Six literals per output line.
        rows = []
        for start in range(0, len(letters), 6):
            rows.append(', '.join(letters[start:start + 6]))
        f.write(",\n".join(rows))
        f.write('];\n\n')
        f.write("pub static TRIE: Trie = {};".format(trie.build()))