unicode-segmentation 1.3.0

This crate provides Grapheme Cluster, Word and Sentence boundaries according to Unicode Standard Annex #29 rules.
Documentation
#!/usr/bin/env python
# -*- coding: utf-8
#
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - auxiliary/GraphemeBreakTest.txt
# - auxiliary/WordBreakTest.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
from __future__ import print_function

import unicode, re, os, fileinput

def load_test_data(f, optsplit=[]):
    testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

    unicode.fetch(f)
    data = []
    for line in fileinput.input(os.path.basename(f)):
        # lines that include a test start with the ÷ character
        if len(line) < 2 or not line.startswith('÷'):
            continue

        m = testRe1.match(line)
        if not m:
            print("error: no match on line where test was expected: %s" % line)
            continue

        # process the characters in this test case
        chars = process_split_string(m.group(1))
        # skip test case if it contains invalid characters (viz., surrogates)
        if not chars:
            continue

        # now process test cases
        (chars, info) = process_split_info(m.group(2), chars, optsplit)

        # make sure that we have break info for each break!
        assert len(chars) - 1 == len(info)

        data.append((chars, info))

    return data

def process_split_info(s, c, o):
    outcs = []
    outis = []
    workcs = c.pop(0)

    # are we on a × or a ÷?
    isX = False
    if s.startswith('×'):
        isX = True

    # find each instance of '(÷|×) [x.y] '
    while s:
        # find the currently considered rule number
        sInd = s.index('[') + 1
        eInd = s.index(']')

        # if it's '× [a.b]' where 'a.b' is in o, then
        # we consider it a split even though it's not
        # marked as one
        # if it's ÷ then it's always a split
        if not isX or s[sInd:eInd] in o:
            outis.append(s[sInd:eInd])
            outcs.append(workcs)
            workcs = c.pop(0)
        else:
            workcs.extend(c.pop(0))

        idx = 1
        while idx < len(s):
            if s[idx:].startswith('×'):
                isX = True
                break
            if s[idx:].startswith('÷'):
                isX = False
                break
            idx += 1
        s = s[idx:]

    outcs.append(workcs)
    return (outcs, outis)

def process_split_string(s):
    outls = []
    workls = []

    inls = s.split()

    for i in inls:
        if i == '÷' or i == '×':
            outls.append(workls)
            workls = []
            continue

        ival = int(i,16)

        if unicode.is_surrogate(ival):
            return []

        workls.append(ival)

    if workls:
        outls.append(workls)

    return outls

def showfun(x):
    outstr = '("'
    for c in x[0]:
        outstr += "\\u{%x}" % c
    outstr += '",&['
    xfirst = True
    for xx in x[1:]:
        if not xfirst:
            outstr += '],&['
        xfirst = False
        sfirst = True
        for sp in xx:
            if not sfirst:
                outstr += ','
            sfirst = False
            outstr += '"'
            for c in sp:
                outstr += "\\u{%x}" % c
            outstr += '"'
    outstr += '])'
    return outstr

def create_grapheme_data(f):
    # rules 9.1 and 9.2 are for extended graphemes only
    optsplits = ['9.1','9.2']
    d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)

    test_same = []
    test_diff = []

    for (c, i) in d:
        allchars = [cn for s in c for cn in s]
        extgraphs = []
        extwork = []

        extwork.extend(c[0])
        for n in range(0,len(i)):
            if i[n] in optsplits:
                extwork.extend(c[n+1])
            else:
                extgraphs.append(extwork)
                extwork = []
                extwork.extend(c[n+1])

        # these are the extended grapheme clusters
        extgraphs.append(extwork)

        if extgraphs == c:
            test_same.append((allchars, c))
        else:
            test_diff.append((allchars, extgraphs, c))

    stype = "&'static [(&'static str, &'static [&'static str])]"
    dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
    f.write("    // official Unicode test data\n")
    f.write("    // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
    unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
    unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)

def create_words_data(f):
    d = load_test_data("auxiliary/WordBreakTest.txt")

    test = []

    for (c, i) in d:
        allchars = [cn for s in c for cn in s]
        test.append((allchars, c))

    wtype = "&'static [(&'static str, &'static [&'static str])]"
    f.write("    // official Unicode test data\n")
    f.write("    // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt\n")
    unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
    d = load_test_data("auxiliary/SentenceBreakTest.txt")

    test = []

    for (c, i) in d:
        allchars = [cn for s in c for cn in s]
        test.append((allchars, c))

    wtype = "&'static [(&'static str, &'static [&'static str])]"
    f.write("    // official Unicode test data\n")
    f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n")
    unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

if __name__ == "__main__":
    with open("testdata.rs", "w") as rf:
        rf.write(unicode.preamble)
        create_grapheme_data(rf)
        create_words_data(rf)
        create_sentence_data(rf)