# rustybuzz 0.3.0

#!/usr/bin/env python3

import urllib.request
import os

URL = 'https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt'
FILE_NAME = 'UnicodeData.txt'


def hex_to_char_rs(c):
    """Return the Rust `char` literal for the hexadecimal codepoint *c*.

    For example, ``'0041'`` becomes ``'\\u{0041}'`` (quotes included).
    """
    return "'\\u{{{}}}'".format(c)


# Download the Unicode data file once and reuse the local copy afterwards.
# The transfer goes to a temporary name and is renamed only after it finished,
# so an interrupted download is never mistaken for a complete file by the
# `os.path.exists` check on the next run.
if not os.path.exists(FILE_NAME):
    partial_name = FILE_NAME + '.part'
    urllib.request.urlretrieve(URL, partial_name)
    os.replace(partial_name, FILE_NAME)


# Fixed preamble of the generated Rust module, up to and including the
# opening line of `DECOMPOSITION_TABLE`. Empty strings are blank lines.
_header_lines = (
    '// WARNING: this file was generated by ../scripts/gen-unicode-norm-table.py',
    '',
    '//! This module provides Unicode tables for canonical (de)composition.',
    '//!',
    '//! The current implementation is not the fastest one. Just good enough.',
    '',
    '#[allow(dead_code)]',
    'pub const UNICODE_VERSION: (u8, u8, u8) = (13, 0, 0);',
    '',
    '// Rust support `Option<char>` layout optimization, so it will take only 4 bytes.',
    'pub const DECOMPOSITION_TABLE: &[(char, char, Option<char>)] = &[',
)
for _header_line in _header_lines:
    print(_header_line)

# Emit one `DECOMPOSITION_TABLE` row per canonical decomposition found in the
# data file and remember the two-codepoint ones in `compose_data` as
# `[first, second, composed]` hex strings for the composition table.
compose_data = []
with open(FILE_NAME, encoding='utf-8') as f:
    for line in f:
        parts = line.split(';')
        # The sixth `;`-separated field holds the decomposition mapping;
        # it is empty for codepoints that do not decompose.
        decomposition = parts[5]
        if not decomposition:
            continue

        # Skip codepoints with compatibility formatting tags
        # since we care only about canonical mapping.
        if decomposition.startswith('<'):
            continue

        # Print the decomposition table as is, since `UnicodeData` is already sorted.

        c = parts[0]
        mapping = decomposition.split(' ')
        if len(mapping) == 2:
            print(f"    ({hex_to_char_rs(c)}, {hex_to_char_rs(mapping[0])}, Some({hex_to_char_rs(mapping[1])})),")

            # Remember only codepoints that should be decomposed into two codepoints.
            compose_data.append([mapping[0], mapping[1], c])
        elif len(mapping) == 1:
            print(f'    ({hex_to_char_rs(c)}, {hex_to_char_rs(mapping[0])}, None),')
        else:
            # Raising a plain string is itself a `TypeError` in Python 3 and
            # hides the real problem, so raise a proper exception instead.
            raise ValueError(
                f'invalid unicode data: U+{c} maps to {len(mapping)} codepoints')

print('];')
print()


print('// The first value is `a << 32 | b`.')
print('// Sorted by the first value.')
print('pub const COMPOSITION_TABLE: &[(u64, char)] = &[')

# Pack each remembered (first, second) codepoint pair into a single integer
# key, keep the composed codepoint next to it, and order the rows by key.
pairs = sorted(
    (
        ((int(first, 16) << 32) | int(second, 16), composed)
        for first, second, composed in compose_data
    ),
    key=lambda entry: entry[0],
)

# Every packed key must occur exactly once in the table.
needles = {key for key, _ in pairs}
assert len(pairs) == len(needles)

for key, composed in pairs:
    print(f'    ({key}, {hex_to_char_rs(composed)}),')

print('];')