1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
//! This crate provides a basic implementation of the Unicode Collation Algorithm. There is really
//! just one function, `collate`, and a few options that can be passed to it. (The
//! `collate_no_tiebreak` function is a variation whose behavior is a bit more strict.) Despite the
//! bare-bones API, this implementation conforms to the standard and allows for the use of the CLDR
//! root collation order; so it may indeed be useful, even in this early stage of development.

#![warn(clippy::pedantic, clippy::cargo)]
#![allow(clippy::module_name_repetitions)]
#![deny(missing_docs)]

use bstr::{ByteSlice, B};
use serde::Deserialize;
use std::cmp::Ordering;

mod ascii;
use ascii::{all_ascii, compare_ascii};

mod cea;
use cea::generate_cea;

mod cea_utils;
mod consts;

mod first_weight;
use first_weight::{get_first_primary, safe_first_chars};

mod normalize;
use normalize::make_nfd;

mod prefix;
use prefix::trim_prefix;

mod sort_key;
use sort_key::compare_incremental;

//
// Structs and enums
//

/// This struct specifies the options to be passed to the `collate` function. You can choose between
/// two tables (DUCET and CLDR root), and between two approaches to the handling of variable-weight
/// characters ("non-ignorable" and "shifted"). The default, and a good starting point for Unicode
/// collation, is to use the CLDR table with the "shifted" approach.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
pub struct CollationOptions {
    /// The table of weights to be used (currently either DUCET or CLDR)
    pub keys_source: KeysSource,
    /// The approach to handling variable-weight characters ("non-ignorable" or "shifted"). For our
    /// purposes, `shifting` is either true (recommended) or false.
    pub shifting: bool,
}

impl Default for CollationOptions {
    fn default() -> Self {
        Self {
            keys_source: KeysSource::Cldr,
            shifting: true,
        }
    }
}

/// This enum provides for a choice of which table of character weights to use.
#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd, Hash, Debug)]
pub enum KeysSource {
    /// The table associated with the CLDR root collation order (recommended)
    Cldr,
    /// The default table for the Unicode Collation Algorithm
    Ducet,
}

#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default, Deserialize)]
struct Weights {
    variable: bool,
    primary: u16,
    secondary: u16,
    tertiary: u16,
}

//
// Public functions
//

/// This is the main public function in the library. It accepts as arguments two string references
/// or byte slices, and a `CollationOptions` struct. It returns an `Ordering` value. This is
/// designed to be used in conjunction with the `sort_by` function in the standard library. Simple
/// usage might look like the following...
///
/// ```
/// use feruca::{collate, CollationOptions};
///
/// let mut names = ["Peng", "Peña", "Ernie", "Émile"];
/// names.sort_by(|a, b| collate(a, b, CollationOptions::default()));
///
/// let expected = ["Émile", "Ernie", "Peña", "Peng"];
/// assert_eq!(names, expected);
/// ```
///
/// Significantly, in the event that two strings are ordered equally per the Unicode Collation
/// Algorithm, this function will use byte-value comparison (i.e., the traditional, naïve way of
/// sorting strings) as a tiebreaker. While this is probably appropriate in most cases, it can be
/// avoided by using the `collate_no_tiebreak` function.
#[must_use]
pub fn collate<T: AsRef<[u8]> + Eq + Ord>(a: &T, b: &T, opt: CollationOptions) -> Ordering {
    // Early out; equal is equal
    if a == b {
        return Ordering::Equal;
    }

    // Turn both into Vecs of u32 code points, while validating UTF-8
    let mut a_chars: Vec<u32> = B(a).chars().map(|c| c as u32).collect();
    let mut b_chars: Vec<u32> = B(b).chars().map(|c| c as u32).collect();

    // Check if both are entirely alphanumeric ASCII
    let easy = all_ascii(&a_chars, &b_chars);
    if easy {
        return compare_ascii(a_chars, b_chars);
    }

    // Normalize to NFD
    make_nfd(&mut a_chars);
    make_nfd(&mut b_chars);

    // I think it's worth offering an out here, too, in case two strings decompose to the same.
    // If we went forward and generated sort keys, they would be equal, and we would end up at the
    // tiebreaker, anyway.
    if a_chars == b_chars {
        // Tiebreaker
        return a.cmp(b);
    }

    // Check for a shared prefix that might be safe to trim
    let cldr = opt.keys_source == KeysSource::Cldr;
    trim_prefix(&mut a_chars, &mut b_chars, cldr);

    // After prefix trimming, one of the Vecs may be empty (but not both!)
    if a_chars.is_empty() || b_chars.is_empty() {
        return a_chars.cmp(&b_chars);
    }

    // One last chance for an early out: if the opening code points of the two Vecs are different,
    // and neither requires checking for a multi-code-point sequence, then we can try comparing
    // their first primary weights. If those are different, and both non-zero, it's decisive.
    if safe_first_chars(&a_chars, &b_chars) {
        let a_first_primary = get_first_primary(a_chars[0], opt);
        let b_first_primary = get_first_primary(b_chars[0], opt);

        if a_first_primary != b_first_primary && a_first_primary != 0 && b_first_primary != 0 {
            return a_first_primary.cmp(&b_first_primary);
        }
    }

    // Otherwise we move forward with full collation element arrays
    let a_cea = generate_cea(&mut a_chars, opt);
    let b_cea = generate_cea(&mut b_chars, opt);

    // Sort keys are processed incrementally, until they yield a result
    let comparison = compare_incremental(&a_cea, &b_cea, opt.shifting);

    if comparison == Ordering::Equal {
        // Tiebreaker
        return a.cmp(b);
    }

    comparison
}

/// This is a variation on the `collate` function, to which it is almost identical. The difference
/// is that, in the event that two strings are ordered equally per the Unicode Collation Algorithm,
/// this function will not attempt to "break the tie" by using byte-value comparison.
#[must_use]
pub fn collate_no_tiebreak<T: AsRef<[u8]> + Eq + Ord>(
    a: &T,
    b: &T,
    opt: CollationOptions,
) -> Ordering {
    if a == b {
        return Ordering::Equal;
    }

    let mut a_chars: Vec<u32> = B(a).chars().map(|c| c as u32).collect();
    let mut b_chars: Vec<u32> = B(b).chars().map(|c| c as u32).collect();

    let easy = all_ascii(&a_chars, &b_chars);
    if easy {
        return compare_ascii(a_chars, b_chars);
    }

    make_nfd(&mut a_chars);
    make_nfd(&mut b_chars);

    if a_chars == b_chars {
        return Ordering::Equal;
    }

    let cldr = opt.keys_source == KeysSource::Cldr;
    trim_prefix(&mut a_chars, &mut b_chars, cldr);

    if a_chars.is_empty() || b_chars.is_empty() {
        return a_chars.cmp(&b_chars);
    }

    if safe_first_chars(&a_chars, &b_chars) {
        let a_first_primary = get_first_primary(a_chars[0], opt);
        let b_first_primary = get_first_primary(b_chars[0], opt);

        if a_first_primary != b_first_primary && a_first_primary != 0 && b_first_primary != 0 {
            return a_first_primary.cmp(&b_first_primary);
        }
    }

    let a_cea = generate_cea(&mut a_chars, opt);
    let b_cea = generate_cea(&mut b_chars, opt);

    compare_incremental(&a_cea, &b_cea, opt.shifting)
}