feruca 0.6.0

An implementation of the Unicode Collation Algorithm
Documentation
use crate::cea_utils::{
    ccc_sequence_ok, get_subset, get_table_multis, get_table_singles, handle_implicit_weights,
    handle_low_weights, push_weights, remove_pulled,
};
use crate::consts::{LOW, LOW_CLDR, NEED_THREE, NEED_TWO};
use crate::{Collator, Tailoring};
use tinyvec::ArrayVec;
use unicode_canonical_combining_class::get_canonical_combining_class_u32 as get_ccc;

pub fn generate_cea(char_vals: &mut Vec<u32>, collator: &Collator) -> Vec<ArrayVec<[u16; 4]>> {
    let mut cea: Vec<ArrayVec<[u16; 4]>> = Vec::new();

    let cldr = collator.tailoring != Tailoring::Ducet;
    let shifting = collator.shifting;

    let low = if cldr { &LOW_CLDR } else { &LOW };
    let singles = get_table_singles(collator.tailoring);
    let multis = get_table_multis(collator.tailoring);

    let mut input_length = char_vals.len();
    let mut left: usize = 0;
    let mut last_variable = false;

    // We spend essentially the entire function in this loop
    'outer: while left < input_length {
        let left_val = char_vals[left];

        //
        // OUTCOME 0
        //
        // The code point was low, so we could draw from a small map that associates one u32 with
        // one Weights struct. Then push the weights, shifting if necessary. This is the path that
        // catches (most) ASCII characters present in not-completely-ASCII strings.
        //
        if left_val < 183 && left_val != 108 && left_val != 76 {
            let weights = low[&left_val]; // Guaranteed to succeed

            handle_low_weights(shifting, weights, &mut last_variable, &mut cea);

            // Increment and continue outer loop
            left += 1;
            continue;
        }

        // At this point, we aren't dealing with a low code point

        // Set lookahead depending on left_val. We need 3 in a few cases; 2 in several dozen cases;
        // and 1 otherwise.
        let lookahead: usize = match left_val {
            x if NEED_THREE.contains(&x) => 3,
            x if NEED_TWO.contains(&x) => 2,
            _ => 1,
        };

        // If lookahead is 1, or if this is the last item in the vec, we'll take an easier path
        let check_multi = lookahead > 1 && (input_length - left > 1);

        if !check_multi {
            //
            // OUTCOME 1
            //
            // We only had to check for a single code point, and found it, so we can push the
            // weights and continue. This is a relatively fast path.
            //
            if let Some(row) = singles.get(&left_val) {
                push_weights(row, shifting, &mut last_variable, &mut cea);

                // Increment and continue outer loop
                left += 1;
                continue;
            }

            //
            // OUTCOME 2
            //
            // We checked for a single code point and didn't find it. That means it's unlisted. We
            // then calculate implicit weights, push them, and move on. I used to think there were
            // multiple paths to the "implicit weights" case, but it seems not.
            //
            handle_implicit_weights(left_val, shifting, &mut cea);

            // Increment and continue outer loop
            left += 1;
            continue;
        }

        // Here we consider multi-code-point matches, if possible

        // Don't look past the end of the vec
        let mut right = if left + lookahead > input_length {
            input_length
        } else {
            left + lookahead
        };

        while right > left {
            // If right - left == 1 (which cannot be the case in the first iteration), attempts to
            // find a slice have failed. So look for one code point, in the singles map
            if right - left == 1 {
                // If we found it, we do still need to check for discontiguous matches
                if let Some(row) = singles.get(&left_val) {
                    // Determine how much further right to look
                    let mut max_right = match right {
                        r if r + 2 < input_length => r + 2,
                        r if r + 1 < input_length => r + 1,
                        _ => right, // Skip the loop below; there will be no discontiguous match
                    };

                    let mut try_two = (max_right - right == 2) && cldr;

                    'inner: while max_right > right {
                        // Make sure the sequence of CCC values is kosher
                        let interest_cohort = &char_vals[right..=max_right];

                        if !ccc_sequence_ok(interest_cohort) {
                            // Can also forget about try_two in this case
                            try_two = false;
                            max_right -= 1;
                            continue 'inner;
                        }

                        // Having made it this far, we can test a new subset, adding later char(s)
                        let new_subset = get_subset(try_two, left_val, char_vals, max_right);

                        //
                        // OUTCOME 3
                        //
                        // We found a discontiguous match for one code point. This is a bad path,
                        // since it implies that we: checked for multiple code points; didn't find
                        // them; fell back to check for the initial code point; found it; checked
                        // for discontiguous matches; and found one. Anyway, push the weights...
                        //
                        if let Some(new_row) = multis.get(&new_subset) {
                            push_weights(new_row, shifting, &mut last_variable, &mut cea);

                            // Remove the pulled char(s)
                            remove_pulled(char_vals, max_right, &mut input_length, try_two);

                            // Increment and continue outer loop
                            left += 1;
                            continue 'outer;
                        }

                        // If we tried for two, don't decrement max_right yet
                        // Inner loop will re-run
                        if try_two {
                            try_two = false;
                        } else {
                            // Otherwise decrement max_right; inner loop may re-run, or finish
                            max_right -= 1;
                        }
                    }

                    //
                    // OUTCOME 4
                    //
                    // We checked for multiple code points; failed to find them; fell back to check
                    // for the initial code point; found it; checked for discontiguous matches; and
                    // did not find any. This is a really bad path. Push the weights...
                    //
                    push_weights(row, shifting, &mut last_variable, &mut cea);

                    // Increment and continue outer loop
                    left += 1;
                    continue 'outer;
                }

                // Reaching this point would imply that we looked for multiple code points; failed
                // to find anything; fell back to search for the left code point; and didn't find
                // that, either. So in theory, we would be dealing with an unlisted code point and
                // proceeding to calculate implicit weights. But that's impossible, isn't it? If we
                // started this path by checking for multiples, that means we had one of the code
                // points in NEED_THREE or NEED_TWO -- all of which are listed in the tables. I
                // think this is actually unreachable; and my testing bears that out.

                // no-op
            }

            // At this point, we're trying to find a slice; this comes "before" the section above
            let subset = &char_vals[left..right];

            if let Some(row) = multis.get(subset) {
                // If we found it, we may need to check for a discontiguous match.
                // But that's only if we matched on a set of two code points; and we'll only skip
                // over one to find a possible third.
                let mut try_discont = subset.len() == 2 && (right + 1 < input_length);

                while try_discont {
                    // Need to make sure the sequence of CCCs is kosher
                    let ccc_a = get_ccc(char_vals[right]) as u8;
                    let ccc_b = get_ccc(char_vals[right + 1]) as u8;

                    if ccc_a == 0 || ccc_a >= ccc_b {
                        // Bail -- no discontiguous match
                        try_discont = false;
                        continue;
                    }

                    // Having made it this far, we can test a new subset, adding the later char.
                    // Again, this only happens if we found a match of two code points and want to
                    // add a third; so we can be oddly specific.
                    let new_subset = ArrayVec::from([subset[0], subset[1], char_vals[right + 1]]);

                    //
                    // OUTCOME 5
                    //
                    // We checked for multiple code points; found something; went on to check for a
                    // discontiguous match; and found one. For a complicated case, this is a good
                    // path. Push the weights...
                    //
                    if let Some(new_row) = multis.get(&new_subset) {
                        push_weights(new_row, shifting, &mut last_variable, &mut cea);

                        // Remove the pulled char
                        remove_pulled(char_vals, right + 1, &mut input_length, false);

                        // Increment and continue outer loop
                        left += right - left;
                        continue 'outer;
                    }

                    // The loop will not run again -- no discontiguous match
                    try_discont = false;
                }

                //
                // OUTCOME 6
                //
                // We checked for multiple code points; found something; checked for discontiguous
                // matches; and did not find any. This is an ok path? Push the weights...
                //
                push_weights(row, shifting, &mut last_variable, &mut cea);

                // Increment and continue outer loop
                left += right - left;
                continue 'outer;
            }

            // Shorten slice to try again
            right -= 1;
        }

        // This is another unreachable point. All possible cases for the outer loop have been
        // handled. There's no need to increment.
    }

    // Return!
    cea
}