read-fonts 0.39.2

Reading OpenType font files.
Documentation
from pathlib import Path
import struct

SCRIPT_DIR = Path(__file__).resolve().parent

def generate():
    glyphlist = Path(SCRIPT_DIR.joinpath("../data/glyphlist.txt")).read_text(encoding="utf-8")
    lines = glyphlist.split("\n")
    singles = []
    multis = []
    max_name_len = 0
    for line in lines:
        if line:
            if line.startswith("#"):
                continue
            fields = line.split(';')
            subfields = fields[1].split(" ")
            name = fields[0]
            max_name_len = max(max_name_len, len(name))
            if len(subfields) == 1:
                singles.append((fields[0], int(fields[1], 16)))
            else:
                multis.append((fields[0], ["0x" + x for x in subfields]))
    trie = StringNode("", 0)
    for pair in singles:
        trie.add(pair[0], pair[1])
    trie = trie.optimize()
    trie_len = trie.locate(0)
    trie_array = trie.store(b"")

    buf = ""
    buf += "// THIS FILE IS AUTOGENERATED.\n"
    buf += "// Any changes to this file will be overwritten.\n"
    buf += "// Use ../scripts/gen_agl.py to regenerate.\n\n"
    buf += "/// Maximum length of an AGL glyph name.\n"
    buf += "pub const MAX_NAME_LEN: usize = {};\n\n".format(max_name_len + 1)
    # Emit trie of single codepoint mappings
    buf += "#[rustfmt::skip]\n"
    buf += "static AGL_TRIE: [u8; {}] = [\n".format(len(trie_array))
    buf += "    "
    line_count = 0
    for value in trie_array:
        if line_count == 32:
            buf += "\n    "
            line_count = 0
        if line_count > 0:
            buf += " "
        line_count += 1
        buf += str(value).rjust(3, ' ') + ","
    buf += "\n];\n\n"
    # Emit multi codepoint mappings as array sorted by glyph name
    multis = sorted(multis, key=lambda tup: tup[0])
    buf += "#[rustfmt::skip]\n"
    buf += "static AGL_MULTIS: [(&str, &[u16]); {}] = [\n".format(len(multis))
    for value in multis:
        buf += "    (\"{}\", &[{}]),\n".format(value[0], ", ".join(value[1]))
    buf += "];\n"
    return buf

# The following type for constructing a compressed trie borrowed from 
# FreeType at <https://gitlab.freedesktop.org/freetype/freetype/-/blob/master/src/tools/glnames.py?ref_type=heads#L5049>

# We now store the Adobe Glyph List in compressed form.  The list is put
# into a data structure called `trie' (because it has a tree-like
# appearance).  Consider, for example, that you want to store the
# following name mapping:
#
#   A        => 1
#   Aacute   => 6
#   Abalon   => 2
#   Abstract => 4
#
# It is possible to store the entries as follows.
#
#   A => 1
#   |
#   +-acute => 6
#   |
#   +-b
#     |
#     +-alon => 2
#     |
#     +-stract => 4
#
# We see that each node in the trie has:
#
# - one or more `letters'
# - an optional value
# - zero or more child nodes
#
# The first step is to call
#
#   root = StringNode( "", 0 )
#   for word in map.values():
#     root.add( word, map[word] )
#
# which creates a large trie where each node has only one children.
#
# Executing
#
#   root = root.optimize()
#
# optimizes the trie by merging the letters of successive nodes whenever
# possible.
#
# Each node of the trie is stored as follows.
#
# - First the node's letter, according to the following scheme.  We
#   use the fact that in the AGL no name contains character codes > 127.
#
#     name         bitsize     description
#     ----------------------------------------------------------------
#     notlast            1     Set to 1 if this is not the last letter
#                              in the word.
#     ascii              7     The letter's ASCII value.
#
# - The letter is followed by a children count and the value of the
#   current key (if any).  Again we can do some optimization because all
#   AGL entries are from the BMP; this means that 16 bits are sufficient
#   to store its Unicode values.  Additionally, no node has more than
#   127 children.
#
#     name         bitsize     description
#     -----------------------------------------
#     hasvalue           1     Set to 1 if a 16-bit Unicode value follows.
#     num_children       7     Number of children.  Can be 0 only if
#                              `hasvalue' is set to 1.
#     value             16     Optional Unicode value.
#
# - A node is finished by a list of 16bit absolute offsets to the
#   children, which must be sorted in increasing order of their first
#   letter.
#
# For simplicity, all 16bit quantities are stored in big-endian order.
#
# The root node has first letter = 0, and no value.
#
class StringNode:
    def __init__(self, letter, value):
        self.letter = letter
        self.value = value
        self.children = {}

    def __cmp__(self, other):
        return ord(self.letter[0]) - ord(other.letter[0])

    def __lt__(self, other):
        return self.letter[0] < other.letter[0]

    def add(self, word, value):
        if len(word) == 0:
            self.value = value
            return

        letter = word[0]
        word = word[1:]

        if letter in self.children:
            child = self.children[letter]
        else:
            child = StringNode(letter, 0)
            self.children[letter] = child

        child.add(word, value)

    def optimize(self):
        # optimize all children first
        children = list(self.children.values())
        self.children = {}

        for child in children:
            self.children[child.letter[0]] = child.optimize()

        # don't optimize if there's a value,
        # if we don't have any child or if we
        # have more than one child
        if (self.value != 0) or (not children) or len(children) > 1:
            return self

        child = children[0]

        self.letter += child.letter
        self.value = child.value
        self.children = child.children

        return self

    def dump_debug(self, write, margin):
        # this is used during debugging
        line = margin + "+-"
        if len(self.letter) == 0:
            line += "<NOLETTER>"
        else:
            line += self.letter

        if self.value:
            line += " => " + repr(self.value)

        write(line + "\n")

        if self.children:
            margin += "| "
            for child in self.children.values():
                child.dump_debug(write, margin)

    def locate(self, index):
        self.index = index
        if len(self.letter) > 0:
            index += len(self.letter) + 1
        else:
            index += 2

        if self.value != 0:
            index += 2

        children = list(self.children.values())
        children.sort()

        index += 2 * len(children)
        for child in children:
            index = child.locate(index)

        return index

    def store(self, storage):
        # write the letters
        length = len(self.letter)
        if length == 0:
            storage += struct.pack("B", 0)
        else:
            for n in range(length):
                val = ord(self.letter[n])
                if n < length - 1:
                    val += 128
                storage += struct.pack("B", val)

        # write the count
        children = list(self.children.values())
        children.sort()

        count = len(children)

        if self.value != 0:
            storage += struct.pack("!BH", count + 128, self.value)
        else:
            storage += struct.pack("B", count)

        for child in children:
            storage += struct.pack("!H", child.index)

        for child in children:
            storage = child.store(storage)

        return storage


if __name__ == "__main__":    
    data = generate()
    Path(SCRIPT_DIR.joinpath("../data/generated/generated_agl.rs")).write_text(data, encoding="utf-8")