rustybuzz 0.3.0

An incremental harfbuzz port to Rust.
Documentation
#!/usr/bin/env python

"""
Generator of the function to prohibit certain vowel sequences.

It creates ``preprocess_text_vowel_constraints``, which inserts dotted
circles into sequences prohibited by the USE script development spec.

Based on harfbuzz/src/gen-vowel-constraints.py
"""

import collections
import io
import os
import urllib.request

# Download the Unicode script assignments once; later runs reuse the local copy.
if not os.path.exists('Scripts.txt'):
    urllib.request.urlretrieve('https://unicode.org/Public/12.0.0/ucd/Scripts.txt', 'Scripts.txt')

# Build two lookup tables from Scripts.txt:
#   scripts:      code point -> script name
#   script_order: script name -> first code point of the first range seen
#                 for that script (used later to order the generated arms)
with io.open('Scripts.txt', encoding='utf-8') as f:
    # The first two lines of the file are consumed separately from the data.
    scripts_header = [f.readline() for _ in range(2)]
    scripts = {}
    script_order = {}
    for raw_line in f:
        # Everything after '#' is a comment.
        payload = raw_line.partition('#')[0]
        fields = [part.strip() for part in payload.split(';')]
        if len(fields) == 1:
            # No ';' separator: blank or comment-only line.
            continue
        # The first field is either a single code point or "START..END".
        bounds = fields[0].split('..')
        start = int(bounds[0], 16)
        end = start if len(bounds) == 1 else int(bounds[1], 16)
        script = fields[1]
        scripts.update(dict.fromkeys(range(start, end + 1), script))
        script_order.setdefault(script, start)


class ConstraintSet(object):
    """A trie-like collection of prohibited code point sequences.

    The set starts out holding a single sequence and grows into a nested
    mapping as sequences with diverging code points are added.
    Converting the set to a string yields the Rust matching code for it.

    Args:
        constraint (List[int]): The initial prohibited code point sequence.

    """

    def __init__(self, constraint):
        # ``_c`` is either a list or a dict:
        #   * list -- one prohibited sequence of code points (an empty
        #     list means everything leading here already matched);
        #   * dict -- maps a leading code point to the ConstraintSet of
        #     the sequences that may follow it.
        self._c = constraint

    def add(self, constraint):
        """Add a constraint to this set.

        An empty ``constraint`` is ignored.  While the set still holds a
        single list, a new sequence that is a prefix of the stored one
        replaces it, and a sequence that merely extends the stored one
        is dropped; any other sequence turns the set into a dict keyed
        by first code point.
        """
        if not constraint:
            return
        head, tail = constraint[0], constraint[1:]
        if isinstance(self._c, list):
            stored = self._c
            if stored[:len(constraint)] == constraint:
                # New sequence is a prefix of the stored one: keep the shorter.
                self._c = constraint
            elif constraint[:len(stored)] != stored:
                # Sequences diverge: switch to the branching representation.
                self._c = {stored[0]: ConstraintSet(stored[1:])}
        if not isinstance(self._c, dict):
            return
        branch = self._c.get(head)
        if branch is None:
            self._c[head] = ConstraintSet(tail)
        else:
            branch.add(tail)

    def _sequence_code(self, index):
        """Render the list form of the set at buffer offset ``index``."""
        seq = self._c
        unsupported = 'Cannot use `matched` for this constraint; the general case has not been implemented'
        if not seq:
            assert index == 2, unsupported
            return 'matched = true;\n'
        if len(seq) == 1:
            assert index == 1, unsupported
            return 'matched = 0x{:04X} == buffer.cur({}).codepoint;\n'.format(seq[0], index)

        pieces = ['if 0x{:04X} == buffer.cur({}).codepoint &&\n'.format(seq[0], index)]
        if index:
            # The bounds check is only emitted past the first position.
            pieces.append('buffer.idx + {} < buffer.len() &&\n'.format(index + 1))
        final = len(seq) - 1
        pieces.extend(
            '0x{:04X} == buffer.cur({}).codepoint{}\n'.format(
                seq[offset], index + offset, '' if offset == final else ' &&')
            for offset in range(1, len(seq)))
        pieces.append('{\n')
        pieces.extend(['buffer.next_glyph();\n'] * (index + 1))
        pieces.append('output_dotted_circle(buffer);\n')
        pieces.append('}\n')
        return ''.join(pieces)

    def _match_code(self, index, depth):
        """Render the dict form of the set as a ``match`` on offset ``index``."""
        # Group leading code points whose sub-constraints render identically
        # so that they can share one match arm.
        grouped = collections.defaultdict(set)
        for cp, branch in sorted(self._c.items(), key=lambda item: item[0]):
            grouped[branch.__str__(index + 1, depth + 2)].add(cp)
        # Arms are ordered by their smallest label.
        arms = sorted(grouped.items(), key=lambda arm: min(arm[1]))

        pieces = ['match buffer.cur({}).codepoint {{\n'.format(index)]
        for body, labels in arms:
            ordered = sorted(labels)
            closing = len(ordered) - 1
            for pos, cp in enumerate(ordered):
                # A line break follows every fourth label.
                line_end = '\n' if pos % 4 == 3 else ''
                if pos == closing:
                    pieces.append(' 0x{:04X} => {{ {}'.format(cp, line_end))
                else:
                    pieces.append(' 0x{:04X} | {}'.format(cp, line_end))
            pieces.append(body)
            pieces.append('}')
        if arms:
            # Catch-all arm, emitted once after the last real arm.
            pieces.append('_ => {}')
        pieces.append('}\n')
        return ''.join(pieces)

    def __str__(self, index=0, depth=4):
        """Return the Rust code that checks this set of constraints.

        Args:
            index (int): Offset from the current buffer position that the
                first code point of this set is compared against.
            depth (int): Forwarded (increased by 2) to nested sets; it
                does not affect the generated text.

        Raises:
            AssertionError: If an empty or single-element sequence occurs
                at an offset for which no code generation exists.
        """
        if isinstance(self._c, list):
            return self._sequence_code(index)
        return self._match_code(index, depth)


# Prohibited sequences, grouped by the script of their first code point:
# script name -> ConstraintSet.
constraints = {}
with io.open('ms-use/IndicShapingInvalidCluster.txt', encoding='utf-8') as f:
    # The header is every line up to (not including) the first line that
    # consists solely of '#'.
    constraints_header = []
    while True:
        line = f.readline()
        if not line:
            # readline() returns '' at end of file; stop here instead of
            # spinning forever when the '#' terminator line is missing.
            break
        line = line.strip()
        if line == '#':
            break
        constraints_header.append(line)
    for line in f:
        # Drop trailing comments.
        j = line.find('#')
        if j >= 0:
            line = line[:j]
        # The first ';'-separated field holds whitespace-separated hex
        # code points making up one prohibited sequence.
        constraint = [int(cp, 16) for cp in line.split(';')[0].split()]
        if not constraint:
            continue
        assert 2 <= len(constraint), 'Prohibited sequence is too short: {}'.format(constraint)
        script = scripts[constraint[0]]
        if script in constraints:
            constraints[script].add(constraint)
        else:
            constraints[script] = ConstraintSet(constraint)

# Checked once after the whole file has been read, so that an input without
# any prohibited sequence is actually reported.
assert constraints, 'No constraints found'

# Emit the generated Rust source on standard output: first the fixed
# prelude (imports, dotted-circle helpers and the start of
# `preprocess_text_vowel_constraints`), then one match arm per script,
# then the fixed epilogue.
print('// WARNING: this file was generated by ../scripts/gen-vowel-constraints.py')
print()
print('use crate::buffer::{Buffer, BufferFlags};')
print('use crate::script;')
print()
print('fn output_dotted_circle(buffer: &mut Buffer) {')
print('    buffer.output_glyph(0x25CC);')
print('    {')
print('        let out_idx = buffer.out_len() - 1;')
print('        buffer.out_info_mut()[out_idx].reset_continuation();')
print('    }')
print('}')
print()
print('fn output_with_dotted_circle(buffer: &mut Buffer) {')
print('    output_dotted_circle(buffer);')
print('    buffer.next_glyph();')
print('}')
print()
print('pub fn preprocess_text_vowel_constraints(buffer: &mut Buffer) {')
print('    if buffer.flags.contains(BufferFlags::DO_NOT_INSERT_DOTTED_CIRCLE) {')
print('        return;')
print('    }')
print()
print('    // UGLY UGLY UGLY business of adding dotted-circle in the middle of')
print('    // vowel-sequences that look like another vowel.  Data for each script')
print('    // collected from the USE script development spec.')
print('    //')
print('    // https://github.com/harfbuzz/harfbuzz/issues/1019')
print('    let mut processed = false;')
print('    buffer.clear_output();')
print('    match buffer.props.script {')

# One arm per script, ordered by the first code point recorded for the
# script in `script_order`.  The loop variable deliberately has its own name
# so that the `constraints` dict being iterated is not rebound.
for script, constraint_set in sorted(constraints.items(), key=lambda s_c: script_order[s_c[0]]):
    print('        Some(script::{}) => {{'.format(script.upper()))
    print('            buffer.idx = 0;')
    print('            while buffer.idx + 1 < buffer.len() {')
    print('                let mut matched = false;')
    # The ConstraintSet renders its own text, newline-terminated.
    print(str(constraint_set), end='')
    print('                buffer.next_glyph();')
    print('                if matched { output_with_dotted_circle(buffer); }')
    # NOTE(review): the indentation of the closing lines below is irregular;
    # presumably the generated file is reformatted afterwards -- confirm.
    print('      }')
    print('      processed = true;')
    print('      }')
    print()

print('        _ => {}')
print('    }')
print('    if processed {')
print('        if buffer.idx < buffer.len() {')
print('            buffer.next_glyph();')
print('        }')
print('        buffer.swap_buffers();')
print('     }')
print('}')
print()