xi-unicode 0.3.0

Unicode utilities useful for text editing, including a line breaking iterator.
Documentation
#!/usr/bin/env python
# Copyright 2016 The xi-editor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Utility for distilling Unicode data into properties which can be
# efficiently queried.

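# Usage: python tools/mk_tables.py datadir > src/tables.rs
# datadir should point to a directory of Unicode data files; LineBreak.txt is
# the file this script reads.
# An optional second argument, --tests or --tests-str, prints sample Rust
# assertions for the linebreak property lookups.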

import os
import sys
import random

# Line break class names. The position of a name in this list is the numeric
# value used for that class in the generated tables.
linebreak_assignments = [
    'XX', 'AI', 'AL', 'B2', 'BA', 'BB', 'BK', 'CB', 'CL', 'CM', 'CR',
    'EX', 'GL', 'HY', 'ID', 'IN', 'IS', 'LF', 'NS', 'NU', 'OP', 'PO',
    'PR', 'QU', 'SA', 'SG', 'SP', 'SY', 'ZW', 'NL', 'WJ', 'H2', 'H3',
    'JL', 'JT', 'JV', 'CP', 'CJ', 'HL', 'RI', 'EB', 'EM', 'ZWJ',
]

# Reverse lookup: class name -> numeric value.
inv_lb_assigments = {name: idx for idx, name in enumerate(linebreak_assignments)}

def gen_data(data, width=80):
    """Print integers to stdout as comma-terminated items, wrapped into lines.

    Every output line starts with a four-space indent and items on the same
    line are separated by one space. The final (possibly empty) line is
    always printed.

    data: iterable of integers (each is formatted with %d).
    width: target line width; the current line is flushed before an item
        that would push it past this value.
    """
    line = ''
    for val in data:
        new = '%d,' % val
        if len(line) + 1 + len(new) > width:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(line)
            line = ''
        prefix = ' ' if line else '    '
        line += prefix + new
    print(line)

def gen_table(name, t, data, width=80):
    """Print a Rust `pub const` array declaration holding `data` to stdout.

    name: identifier of the Rust constant.
    t: Rust element type of the array (for example 'u8').
    data: sequence of integers; its length becomes the array length.
    width: line width forwarded to gen_data for wrapping the values.
    """
    print('')
    print('#[rustfmt::skip]')
    print('pub const %s: [%s; %d] = [' % (name, t, len(data)))
    # Forward width so the parameter actually controls the wrapping.
    gen_data(data, width)
    print('];')

def compute_trie(rawdata, chunksize):
    """Split `rawdata` into fixed-size chunks and deduplicate identical chunks.

    rawdata: sequence of values (sliceable).
    chunksize: number of entries per chunk.

    Returns (root, child_data): `root` has one entry per chunk, giving the
    index of that chunk among the unique chunks in order of first
    appearance; `child_data` is the concatenation of the unique chunks.
    Trailing entries that do not fill a whole chunk are ignored.
    """
    root = []
    childmap = {}
    child_data = []
    # Floor division keeps the chunk count an integer on Python 3 as well.
    for i in range(len(rawdata) // chunksize):
        data = rawdata[i * chunksize: (i + 1) * chunksize]
        child = tuple(data)
        if child not in childmap:
            childmap[child] = len(childmap)
            child_data.extend(data)
        root.append(childmap[child])
    return (root, child_data)

def compute_trie2(rawdata, midsize, leafsize):
    """Build a two-level trie by applying compute_trie twice.

    The raw data is first chunked into leaves of `leafsize` entries; the
    resulting list of leaf indices is then chunked again into mid nodes of
    `midsize` entries.

    Returns (root, midnodes, leaves).
    """
    leaf_level = compute_trie(rawdata, leafsize)
    mid_level = compute_trie(leaf_level[0], midsize)
    return (mid_level[0], mid_level[1], leaf_level[1])

def load_unicode_props(datadir, fn):
    """Read a Unicode property file and return a per-code-point class table.

    As a side effect, the text of the first comment line of the file is
    echoed to stdout inside a "// This file autogenerated from ..." line.

    datadir: directory containing the data file.
    fn: file name inside datadir (for example 'LineBreak.txt').

    Returns a list of 0x110000 integers indexed by code point, each the
    position of the code point's class in linebreak_assignments. Code points
    not listed in the file get class 'XX'.

    Raises KeyError if the file names a class that is missing from
    linebreak_assignments.
    """
    lb = ['XX'] * 0x110000
    did_notice = False

    # The context manager closes the file even if parsing fails.
    with open(os.path.join(datadir, fn)) as f:
        for line in f:
            if line.startswith('#'):
                if not did_notice:
                    orig = line.split('#', 1)[1].strip()
                    print("// This file autogenerated from %s by mk_tables.py" % orig)
                    did_notice = True
                # Comment lines never carry data.
                continue
            # Drop any trailing comment, then split "range;class". Stripping
            # each field accepts both "0041;AL" and "0041 ; AL" layouts.
            fields = line.split('#', 1)[0].split(';')
            if len(fields) != 2:
                continue
            t = fields[0].strip().split('..')
            lo = int(t[0], 16)
            hi = int(t[-1], 16) + 1  # ranges in the file are inclusive
            value = fields[1].strip()
            for cp in range(lo, hi):
                lb[cp] = value

    numeric_lb = [inv_lb_assigments[prop] for prop in lb]
    return numeric_lb

def mk_linebreak_props(datadir):
    """Print the linebreak property lookup tables as Rust constants.

    Loads LineBreak.txt from `datadir` and emits one flat table plus two
    tries, split by the UTF-8 encoded length of the code point.
    """
    props = load_unicode_props(datadir, 'LineBreak.txt')

    # Code points below U+0800 (1- and 2-byte UTF-8): direct lookup table.
    gen_table('LINEBREAK_1_2', 'u8', props[:0x800])

    # U+0800..U+FFFF (3-byte UTF-8): single-level trie over 64-entry chunks.
    bmp_root, bmp_children = compute_trie(props[0x800:0x10000], 0x40)
    # 32 filler entries cover the 0x800 code points served by the flat table
    # (0x800 / 0x40 == 32) -- presumably so the root can be indexed from the
    # code point directly; confirm against the Rust lookup code.
    gen_table('LINEBREAK_3_ROOT', 'u8', [255] * 32 + bmp_root)
    gen_table('LINEBREAK_3_CHILD', 'u8', bmp_children)

    # U+10000 and above (4-byte UTF-8): two-level trie, 64 entries per level.
    astral_root, astral_mid, astral_leaves = compute_trie2(props[0x10000:], 0x40, 0x40)
    # 16 filler entries cover the first 0x10000 code points
    # (0x10000 / (0x40 * 0x40) == 16).
    gen_table('LINEBREAK_4_ROOT', 'u8', [255] * 16 + astral_root)
    gen_table('LINEBREAK_4_MID', 'u8', astral_mid)
    gen_table('LINEBREAK_4_LEAVES', 'u8', astral_leaves)

def mk_tables(datadir):
    """Print the complete generated Rust source to stdout.

    Emits the license header and module doc comment, then the linebreak
    property tables, then the line-break state machine.

    datadir: directory containing LineBreak.txt.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("""// Copyright 2016 The xi-editor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Raw trie data for linebreak property lookup.
""")
    mk_linebreak_props(datadir)
    mk_lb_rules()

def mk_tests(datadir, do_str=False):
    """Print randomly sampled Rust assert_eq! lines for the property lookup.

    For each of the four UTF-8 length classes, 32 code points are sampled at
    random; surrogates are skipped, so fewer lines may be printed for the
    3-byte range.

    datadir: directory containing LineBreak.txt.
    do_str: when true, emit assertions for linebreak_property_str (which
        also checks the encoded length); otherwise for linebreak_property.
    """
    numeric_lb = load_unicode_props(datadir, 'LineBreak.txt')
    # One range per UTF-8 encoded length: 1, 2, 3 and 4 bytes.
    ranges = (range(0x80), range(0x80, 0x800), range(0x800, 0x10000), range(0x10000, 0x110000))
    for cplen, r in enumerate(ranges, 1):
        for cp in sorted(random.sample(r, 32)):
            if 0xD800 <= cp < 0xE000:
                continue  # invalid codepoint
            lb_prop = numeric_lb[cp]
            if do_str:
                print('        assert_eq!((%d, %d), linebreak_property_str(&"\\u{%04X}", 0));' % (lb_prop, cplen, cp))
            else:
                print('        assert_eq!(%2d, linebreak_property(\'\\u{%04X}\'));' % (lb_prop, cp))

def update(table, left, right, new):
    """Record `new` for every left|right class pair that has no entry yet.

    Existing entries are never overwritten, so the first rule to set a pair
    wins.

    table: dict keyed by 'LEFT|RIGHT' strings; modified in place.
    left, right: a single class name or an iterable of class names.
    new: value to store for each missing pair.
    """
    # isinstance (not an exact type comparison) so any string, including a
    # str subclass, is treated as one class name rather than iterated by
    # character.
    if isinstance(left, str):
        left = [left]
    if isinstance(right, str):
        right = [right]
    for l in left:
        for r in right:
            table.setdefault(l + '|' + r, new)

def update_both(table1, table2, left, right, new):
    """Apply the same update() call to two tables, table1 first."""
    for table in (table1, table2):
        update(table, left, right, new)

def resolve_ambig(orig):
    """Map an ambiguous line break class to the class used by the rules.

    AI, SG, XX and SA resolve to AL and CJ resolves to NS (rule LB1); every
    other value is returned unchanged.
    """
    # LB1
    if orig in ('AI', 'SG', 'XX'):
        return 'AL'
    elif orig == 'SA':
        # Exact comparison: `orig in 'SA'` would be a substring test that
        # also matches '', 'S' and 'A'.
        # TODO: need to incorporate this into property lookup
        return 'AL'
    elif orig == 'CJ':
        return 'NS'
    else:
        return orig

def mk_lb_rules():
    """Build the UAX #14 pair tables and print the line-break state machine.

    Two dicts keyed by 'LEFT|RIGHT' are filled rule by rule: `t` for directly
    adjacent classes and `ts` for classes separated by one or more SP.
    update() never overwrites an entry, so rules listed earlier take
    precedence. Values are 'x' (no break), '_' (break allowed) and '!'
    (mandatory break).

    The tables are then turned into a state machine with one row per state
    and one column per class of the next character, printed to stdout as the
    Rust constants N_LINEBREAK_CATEGORIES and LINEBREAK_STATE_MACHINE.
    """
    # Rules derived from UAX #14

    t = {}
    ts = {}  # transitions for when there is one or more SP
    Any = linebreak_assignments + ['HL+HY', 'HL+BA', 'RI+RI']
    # LB1: todo (affects South East Asian scripts)

    # LB2: handled in code
    # LB3: handled in code

    # LB4
    update(t, 'BK', Any, '!')

    # LB5
    update(t, 'CR', 'LF', 'x')
    update(t, 'CR', Any, '!')
    update(t, 'LF', Any, '!')
    update(t, 'NL', Any, '!')

    # LB6
    update_both(t, ts, Any, ['BK', 'CR', 'LF', 'NL'], 'x')

    # LB7
    update_both(t, ts, Any, ['SP'], 'x')
    update_both(t, ts, Any, ['ZW'], 'x')

    # LB8
    update_both(t, ts, 'ZW', Any, '_')

    # LB8a
    update(t, 'ZWJ', ['ID', 'EB', 'EM'], 'x')

    # LB9: handled in state machine construction
    # LB10: handled in state machine construction

    # LB11:
    update_both(t, ts, Any, 'WJ', 'x')
    update(t, 'WJ', Any, 'x')

    # LB12:
    update(t, 'GL', Any, 'x')

    # LB12a:
    excl = set(linebreak_assignments) - set(('SP', 'BA', 'HY'))
    update(t, excl, 'GL', 'x')

    # LB13:
    update_both(t, ts, Any, 'CL', 'x')
    update_both(t, ts, Any, 'CP', 'x')
    update_both(t, ts, Any, 'EX', 'x')
    update_both(t, ts, Any, 'IS', 'x')
    update_both(t, ts, Any, 'SY', 'x')

    # LB14
    update_both(t, ts, 'OP', Any, 'x')

    # LB15
    update_both(t, ts, 'QU', 'OP', 'x')

    # LB16
    update_both(t, ts, ['CL', 'CP'], 'NS', 'x')

    # LB17
    update_both(t, ts, 'B2', 'B2', 'x')

    # LB18
    update(t, 'SP', Any, '_')
    update(ts, Any, Any, '_')  # note deviation from literal transcription

    # LB19
    update_both(t, ts, Any, 'QU', 'x')
    update(t, 'QU', Any, 'x')

    # LB20
    update_both(t, ts, Any, 'CB', '_')
    update(t, 'CB', Any, '_')

    # LB21
    update_both(t, ts, Any, 'BA', 'x')
    update_both(t, ts, Any, 'HY', 'x')
    update_both(t, ts, Any, 'NS', 'x')
    update(t, 'BB', Any, 'x')

    # LB21a: special states reached in state machine
    update(t, ['HL+HY', 'HL+BA'], Any, 'x')

    # LB21b:
    update(t, 'SY', 'HL', 'x')

    # LB22:
    update(t, ['AL', 'HL'], 'IN', 'x')
    update(t, 'EX', 'IN', 'x')
    update(t, ['ID', 'EB', 'EM'], 'IN', 'x')
    update(t, 'IN', 'IN', 'x')
    update(t, 'NU', 'IN', 'x')

    # LB23:
    update(t, ['AL', 'HL'], 'NU', 'x')
    update(t, 'NU', ['AL', 'HL'], 'x')

    # LB23a:
    update(t, 'PR', ['ID', 'EB', 'EM'], 'x')
    update(t, ['ID', 'EB', 'EM'], 'PO', 'x')

    # LB24:
    update(t, 'PR', ['AL', 'HL'], 'x')
    update(t, 'PO', ['AL', 'HL'], 'x')
    update(t, 'AL', ['PR', 'PO'], 'x')
    update(t, 'HL', ['PR', 'PO'], 'x')

    # LB25:
    update(t, 'CL', 'PO', 'x')
    update(t, 'CP', 'PO', 'x')
    # CL x PR; the right-hand class must be 'PR' ('PP' is not a line break
    # class, so the pair would otherwise fall through to LB31).
    update(t, 'CL', 'PR', 'x')
    update(t, 'CP', 'PR', 'x')
    update(t, 'NU', 'PO', 'x')
    update(t, 'NU', 'PR', 'x')
    update(t, 'PO', 'OP', 'x')
    update(t, 'PO', 'NU', 'x')
    update(t, 'PR', 'OP', 'x')
    update(t, 'PR', 'NU', 'x')
    update(t, 'HY', 'NU', 'x')
    update(t, 'IS', 'NU', 'x')
    update(t, 'NU', 'NU', 'x')
    update(t, 'SY', 'NU', 'x')

    # LB26:
    update(t, 'JL', ['JL', 'JV', 'H2', 'H3'], 'x')
    update(t, ['JV', 'H2'], ['JV', 'JT'], 'x')
    update(t, ['JT', 'H3'], 'JT', 'x')

    # LB27:
    update(t, ['JL', 'JV', 'JT', 'H2', 'H3'], 'IN', 'x')
    update(t, ['JL', 'JV', 'JT', 'H2', 'H3'], 'PO', 'x')
    update(t, 'PR', ['JL', 'JV', 'JT', 'H2', 'H3'], 'x')

    # LB28:
    update(t, ['AL', 'HL'], ['AL', 'HL'], 'x')

    # LB29:
    update(t, 'IS', ['AL', 'HL'], 'x')

    # LB30:
    update(t, ['AL', 'HL', 'NU'], 'OP', 'x')
    update(t, 'CP', ['AL', 'HL', 'NU'], 'x')

    # LB30a:
    update(t, 'RI', 'RI', 'x')
    update(t, Any, 'RI+RI', '_')
    update(t, 'RI+RI', Any, '_')

    # LB30b:
    update(t, 'EB', 'EM', 'x')

    # LB31:
    update_both(t, ts, Any, Any, '_')

    # state machine construction
    # states [0..43) correspond to LB class of previous ch
    # state 43 is 'HL+HY'
    # state 44 is 'HL+BA'
    # state 45 is 'RI+RI'
    # states [46..89) correspond to LB class (SP+)
    # result is new state on bottom (LB of right ch), + 0x80 if break + 0x40 if hard
    # (note that only states 0..43 need be represented if break)
    n = len(linebreak_assignments)
    nspecial = 3
    nstates = n * 2 + nspecial
    sm = []
    for i in range(nstates):
        sm.append([-1] * n)
    bk_to_flags = {'x': 0, '_': 0x80, '!': 0xc0}
    for left in range(n + nspecial):
        L = Any[left]
        if L == 'CM':
            L = 'AL'  # handling for LB10
        L = resolve_ambig(L)
        for right in range(n):
            R = linebreak_assignments[right]
            R = resolve_ambig(R)
            r_with_cm = right
            l_with_cm = left
            if R in ['CM', 'ZWJ'] and L in ['BK', 'CR', 'LF', 'NL', 'SP', 'ZW']:
                # handling for LB10
                r_with_cm = 2  # AL
                bk = t[L + '|' + 'AL']
            elif L == 'ZWJ' and R not in ['ID', 'EB', 'EM']:
                # handling for LB10
                l_with_cm = 2  # AL
                bk = t['AL' + '|' + R]
            else:
                bk = t[L + '|' + R]
            flags = bk_to_flags[bk]
            if flags == 0 and R == 'SP':
                if left < n:
                    # move to the matching SP+ state for this left class
                    state = left + n + nspecial
                else:
                    state = left
            elif flags == 0 and L == 'HL' and R == 'HY':
                # special state for LB21a
                state = n
            elif flags == 0 and L == 'HL' and R == 'BA':
                # special state for LB21a
                state = n + 1
            elif R in ['CM', 'ZWJ'] and L not in ['BK', 'CR', 'LF', 'NL', 'SP', 'ZW']:
                # handling for LB9
                state = l_with_cm
            elif flags == 0 and R == 'RI' and L == 'RI':
                # handling for LB30a
                state = n + 2
            else:
                state = flags + r_with_cm
            sm[left][right] = state

            if left < n:
                # SP+ states
                bk = ts[L + '|' + R]
                flags = bk_to_flags[bk]
                sm[left + n + nspecial][right] = flags + r_with_cm
    nunique = len(set(str(line) for line in sm))
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('// %d unique states' % nunique)
    print('pub const N_LINEBREAK_CATEGORIES: usize = %d;' % n)
    print('')
    print('#[rustfmt::skip]')
    print('pub const LINEBREAK_STATE_MACHINE: [u8; %d] = [' % (nstates * n))
    # TODO: dedup
    for state in range(nstates):
        if state < n + nspecial:
            statename = Any[state]
        else:
            statename = 'SP+ ' + Any[state - (n + nspecial)]
        print('    // state %d: %s' % (state, statename))
        gen_data(sm[state])
    print('];')

def main():
    """Command-line entry point.

    argv[1] is the Unicode data directory. An optional argv[2] selects the
    output: '--tests' prints assertions for linebreak_property, '--tests-str'
    prints assertions for linebreak_property_str; anything else prints the
    generated tables.
    """
    if len(sys.argv) < 2:
        # The message goes to stderr, keeping redirected stdout clean.
        sys.exit('Usage: python tools/mk_tables.py datadir [--tests|--tests-str]')
    datadir = sys.argv[1]
    mode = sys.argv[2] if len(sys.argv) == 3 else None
    # The three outputs are mutually exclusive; a plain `if` on the second
    # test would let '--tests' fall into the else branch and print the
    # tables as well.
    if mode == '--tests':
        mk_tests(datadir)
    elif mode == '--tests-str':
        mk_tests(datadir, True)
    else:
        mk_tables(datadir)


if __name__ == '__main__':
    main()