import html.entities
import itertools
import collections
import math
class TrieBuilder:
    """Mutable character trie that can render itself as a ``Trie(...)`` expression.

    Each node stores an optional value (set when a word ends exactly at this
    node) and a mapping from the next character to the child node.
    """

    def __init__(self):
        # Value of the word ending at this node; None when no word ends here.
        self.value = None
        # Child nodes keyed by next character; missing children are created
        # on first access.
        self.suffixes = collections.defaultdict(TrieBuilder)

    def add_word(self, word, value):
        """Insert ``word`` into the trie rooted at this node.

        Args:
            word: Sequence of single characters spelling the key. An empty
                word stores ``value`` on this node itself.
            value: Payload recorded on the node where ``word`` ends; a later
                insertion of the same word overwrites it.
        """
        # Walk down iteratively instead of recursing on ``word[1:]``: this
        # avoids one string copy and one stack frame per character.
        node = self
        for letter in word:
            node = node.suffixes[letter]
        node.value = value

    def build(self):
        """Return this node and its descendants as a ``Trie(&[...], i, s)`` string.

        The child array is dense: it spans the lowest to the highest child
        character code, with ``None`` in slots that have no child. ``i`` is
        the entry of the module-level ``indexes`` mapping for this node's
        value, or ``0xffff`` when the value is not a key of that mapping.
        ``s`` is the character code of the first array slot, or ``0xff``
        when the node has no children.

        Relies on the module-level names ``indexes`` and ``null_or_trie``.
        """
        if self.suffixes:
            codes = [ord(letter) for letter in self.suffixes]
            suffix_ascii_start = min(codes)
            suffix_ascii_end = max(codes)
            # 0 marks an empty slot; null_or_trie renders it as 'None'.
            children = [0] * (suffix_ascii_end - suffix_ascii_start + 1)
        else:
            # Leaf node: an explicit emptiness check replaces the former bare
            # ``except:``, which also swallowed unrelated errors.
            suffix_ascii_start = 0xff
            children = []
        for letter, child in self.suffixes.items():
            children[ord(letter) - suffix_ascii_start] = child.build()
        return "Trie(&[{}], {}, {})".format(", ".join(map(null_or_trie, children)),
                                            indexes.get(self.value, 0xffff),
                                            suffix_ascii_start)
def null_or_trie(e):
    """Render one child slot: ``'None'`` for the empty marker ``0``, else ``Some(&...)``."""
    return 'None' if e == 0 else 'Some(&{})'.format(e)
def to_escapes(w):
    """Return the UTF-8 bytes of ``w`` as a run of ``\\xHH`` escapes (uppercase hex)."""
    encoded = bytes(w, 'utf-8')
    # Iterating bytes yields ints, which the format spec renders as two hex digits.
    return ''.join(map("\\x{:02X}".format, encoded))
if __name__ == "__main__":
    # Generate unescape_named_gen.rs from Python's table of HTML5 named
    # character references: a length constant, the DECODED byte-string
    # array, and the TRIE lookup structure.
    with open("unescape_named_gen.rs", "w") as out:
        entities = html.entities.html5
        out.write("// This file is autogenerated by generate.py, do not modify\n\n")

        out.write("/// Length of the longest supported/known character reference name.\n")
        longest = max(len(name) for name in entities)
        out.write("pub const LONGEST_NAMED_REFERENCE : usize = {};\n\n".format(longest))

        out.write("pub static DECODED: [&'static [u8]; {}] = [\n".format(len(entities)))

        # Maps each decoded string to its position in DECODED. When several
        # names share a decoded string, the last position wins.
        # Must stay a module-level name: TrieBuilder.build() reads it.
        indexes = {}
        trie = TrieBuilder()
        literals = []
        for position, name in enumerate(sorted(entities)):
            decoded = entities[name]
            indexes[decoded] = position
            literals.append("b\"{}\"".format(to_escapes(decoded)))
            trie.add_word(name, decoded)

        # Lay the byte-string literals out six per line.
        rows = []
        for start in range(0, len(literals), 6):
            rows.append(', '.join(literals[start:start + 6]))
        out.write(",\n".join(rows))
        out.write('];\n\n')

        out.write("pub static TRIE: Trie = {};".format(trie.build()))