trivet 3.1.0

The trivet Parser Library
Documentation
#!/usr/bin/env python

# Trivet
# Copyright (c) 2025 by Stacy Prowell.  All rights reserved.
# https://gitlab.com/binary-tools/trivet

"""
Read the XML Unicode database file and construct a Rust version for use
by Trivet.  This converts the XML into a literal data structure in Rust.

The generated Rust is a constant array of (&str, char) pairs named UCD.
The strings come from the names (na attribute) and aliases (alias
attributes of contained name-alias elements).

Obtain the Unicode database from https://unicode.org/ucd/.  The desired
file is the flat complete set, and is named ucd.all.flat.xml.

This only needs to be run if the Unicode database needs to be updated.
Place the ucd.all.flat.xml file in the same folder as this script (etc)
and then run from this folder with:

    $ python3 build_unicode_db.py > ucd.rs

Check the output to make sure it looks reasonable, then move it into place
in the src/parse/strings folder.  If you are in the etc folder this command
should do it.

    $ mv ucd.rs ../src/strings/ucd.rs
"""

import defusedxml.ElementTree as ET  # type: ignore


# XML namespace prefix carried by the elements of the UCD XML file.
_UCD_NS = "{http://www.unicode.org/ns/2003/ucd/1.0}"


def _collect_names(root) -> dict:
    """Map each character name and alias under *root* to a Rust char escape.

    Only ``char`` elements that carry a ``cp`` attribute are considered.
    Empty names and empty aliases are ignored.  When the same string occurs
    more than once, the last occurrence wins.

    Args:
        root: Root element of the parsed UCD XML document.

    Returns:
        A dict from name/alias to the text ``\\u{XXXX}`` for its code point,
        in the order the strings were first encountered.
    """
    ucd = {}
    for character in root.iter(f"{_UCD_NS}char"):
        code_point = character.attrib.get("cp")
        if code_point is None:
            # No single code point on this element, so nothing to record.
            continue
        escape = f"\\u{{{code_point}}}"

        name = character.attrib.get("na", "")
        if name:
            # In the UCD XML format a "#" in the name stands for the code
            # point (e.g. "CJK UNIFIED IDEOGRAPH-#").  Without expanding it,
            # all such characters would share one key and overwrite each
            # other, leaving a single wrong entry.
            ucd[name.replace("#", code_point)] = escape

        for alias_element in character.iter(f"{_UCD_NS}name-alias"):
            alias = alias_element.attrib.get("alias", "")
            if alias:
                ucd[alias] = escape
    return ucd


def main() -> None:
    """Entry point when run from the prompt.

    Reads ``ucd.all.flat.xml`` from the current directory and prints the
    generated Rust source (the ``UCD`` array of name/char pairs) to standard
    output.
    """
    # Open and read the Unicode XML data file.
    tree = ET.parse("ucd.all.flat.xml")
    ucd = _collect_names(tree.getroot())

    # Now construct the Rust file.
    print("// trivet")
    print("// copyright")
    print("")
    print("//! Provide the Unicode database.")
    print("")
    print("/// The Unicode database.")
    print(
        """///
/// This is generated from the complete database, which can be obtained from
/// [unicode.org](https://unicode.org/ucd/).  It is an array of pairs, with
/// each pair consisting of a Unicode name or alias and the corresponding code
/// point.
"""
    )
    print("pub const UCD: &[(&str, char)] = &[")
    for name, escape in ucd.items():
        # Names are upper-cased on output.
        print(f"    (\"{name.upper()}\", '{escape}'),")
    print("];")


# Generate the Rust source only when executed as a script, not on import.
if __name__ == "__main__":
    main()