unicode-casefold 0.2.0

Unicode-aware case folding.
Documentation
#!/usr/bin/env python3
"""
This script generates the data tables from Unicode Consortium case
folding data.

See ``README.md`` for instructions on how to use this script.
"""


import re
import sys


def parse_version(lines):
    """Parse a triple containing the Unicode standard version.

    Consumes the first item of the ``lines`` iterator, which must be the
    ``# CaseFolding-X.Y.Z.txt`` header line, and returns the three version
    components as a tuple of strings, e.g. ``('9', '0', '0')``.

    Raises ``ValueError`` if the input is empty or if the first line is not
    a recognisable header.
    """
    # Use a default so that empty input does not leak a bare StopIteration.
    header = next(lines, None)
    if header is None:
        raise ValueError('empty input: expected a CaseFolding header line')
    match = re.match(r'# CaseFolding-(\d+)\.(\d+)\.(\d+)\.txt', header)
    if match is None:
        # Without this check the failure would surface as an AttributeError
        # on ``None``, which says nothing about the actual problem.
        raise ValueError(
            'unrecognised header line {!r}: '
            'expected "# CaseFolding-X.Y.Z.txt"'.format(header))
    return match.groups()


def strip_comments(line):
    """Return ``line`` truncated just before its first '#' character.

    A line without any '#' is returned unchanged.
    """
    hash_at = line.find('#')
    if hash_at < 0:
        return line
    return line[:hash_at]


def parse_tables(lines):
    """Build the per-status case folding tables from the data lines.

    Each non-blank line (after comment stripping) is expected to hold four
    ';'-separated fields: code point, status, mapping and an empty trailer.

    Returns a dict keyed by status letter ('C', 'F', 'S', 'T'). Each value
    maps a source code point to its folded form: a list of code points for
    status 'F', a single code point for every other status.

    Raises ``Exception`` if the Turkic ('T') entries differ from the two
    mappings hard-coded below.
    """
    tables = {letter: {} for letter in ('C', 'F', 'S', 'T')}
    for raw in lines:
        entry = strip_comments(raw).strip()
        if entry:
            source, status, target, _trailer = entry.split(';')
            source = int(source, 16)
            status = status.strip()
            # Full ('F') foldings are the only ones that may produce several
            # code points, so they alone are stored as lists.
            if status != 'F':
                folded = int(target, 16)
            else:
                folded = [int(point, 16) for point in target.split()]
            tables[status][source] = folded
    expected_turkic = {0x0049: 0x0131, 0x0130: 0x0069}
    if tables['T'] != expected_turkic:
        raise Exception('Turkic tables have changed -- please update code')
    return tables


def render(codepoint):
    """Return a Rust char literal for ``codepoint`` as a lowercase-hex unicode escape."""
    hex_digits = format(codepoint, 'x')
    return "'\\u{" + hex_digits + "}'"


def main(lines):
    """Read case folding data from ``lines`` and print the Rust tables.

    ``lines`` must be an iterator over the lines of the data file: the first
    line is consumed as the version header, the remainder as table entries.
    The generated Rust source is written to standard output.

    Raises ``Exception`` if a full mapping is not one to three code points
    long.
    """
    print('// NOTE: the following code was generated by `scripts/generate.py`; do not edit directly')

    version_parts = parse_version(lines)
    version_item = """
/// The version of [Unicode](http://www.unicode.org/) that this version
/// of `unicode-casefold` is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = ({}, {}, {});
"""
    print(version_item.format(*version_parts))

    print("""
#[derive(Copy, Clone, Debug)]
pub enum Buffer {
    Zero,
    One(char),
    Two(char, char),
}
""")

    tables = parse_tables(lines)
    common, full, simple = tables['C'], tables['F'], tables['S']

    print("""/// Common mappings shared by both the full and simple mappings.""")
    print("""pub static COMMON_TABLE: &'static [(char, char)] = &[""")
    for code in sorted(common):
        print("""    ({}, {}),""".format(render(code), render(common[code])))
    print("""];""")

    print()

    # The first folded char is stored directly; the rest go into a Buffer
    # variant selected by the total length of the mapping.
    variant_templates = {
        1: 'Buffer::Zero',
        2: 'Buffer::One({})',
        3: 'Buffer::Two({}, {})',
    }

    print("""/// Full mappings, which cause strings to grow in length.""")
    print("""pub static FULL_TABLE: &'static [(char, (char, Buffer))] = &[""")
    for code in sorted(full):
        mapping = full[code]
        template = variant_templates.get(len(mapping))
        if template is None:
            raise Exception('code {} maps to a string of length {}'.format(code, len(mapping)))
        trailing = [render(point) for point in mapping[1:]]
        variant = template.format(*trailing)
        print("""    ({}, ({}, {})),""".format(render(code), render(mapping[0]), variant))
    print("""];""")

    print()

    print("""/// Simple mappings, which differ from those in the `FULL_TABLE`.""")
    print("""pub static SIMPLE_TABLE: &'static [(char, char)] = &[""")
    for code in sorted(simple):
        print("""    ({}, {}),""".format(render(code), render(simple[code])))
    print("""];""")


# Script entry point: pass standard input (the case folding data) to main(),
# which prints the generated Rust source.
if __name__ == '__main__':
    main(sys.stdin)