1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
//! This crate provides a basic implementation of the Unicode Collation Algorithm. There is really
//! just one function, `collate`, and a few options that can be passed to it. (The
//! `collate_no_tiebreak` function is a variation whose behavior is a bit more strict.) Despite the
//! bare-bones API, this implementation conforms to the standard and allows for the use of the CLDR
//! root collation order; so it may indeed be useful, even in this early stage of development.
#![warn(clippy::pedantic, clippy::cargo)]
#![allow(clippy::module_name_repetitions)]
#![deny(missing_docs)]
use bstr::{ByteSlice, B};
use serde::Deserialize;
use std::cmp::Ordering;
mod ascii;
use ascii::{all_ascii, compare_ascii};
mod cea;
use cea::generate_cea;
mod cea_utils;
mod consts;
mod first_weight;
use first_weight::{get_first_primary, safe_first_chars};
mod normalize;
use normalize::make_nfd;
mod prefix;
use prefix::trim_prefix;
mod sort_key;
use sort_key::compare_incremental;
//
// Structs and enums
//
/// This struct specifies the options to be passed to the `collate` function. You can choose between
/// two tables (DUCET and CLDR root), and between two approaches to the handling of variable-weight
/// characters ("non-ignorable" and "shifted"). The default, and a good starting point for Unicode
/// collation, is to use the CLDR table with the "shifted" approach.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
pub struct CollationOptions {
/// The table of weights to be used (currently either DUCET or CLDR)
pub keys_source: KeysSource,
/// The approach to handling variable-weight characters ("non-ignorable" or "shifted"). For our
/// purposes, `shifting` is either true (recommended) or false.
pub shifting: bool,
}
impl Default for CollationOptions {
fn default() -> Self {
Self {
keys_source: KeysSource::Cldr,
shifting: true,
}
}
}
/// This enum provides for a choice of which table of character weights to use.
#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd, Hash, Debug)]
pub enum KeysSource {
/// The table associated with the CLDR root collation order (recommended)
Cldr,
/// The default table for the Unicode Collation Algorithm
Ducet,
}
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default, Deserialize)]
struct Weights {
variable: bool,
primary: u16,
secondary: u16,
tertiary: u16,
}
//
// Public functions
//
/// This is the main public function in the library. It accepts as arguments two string references
/// or byte slices, and a `CollationOptions` struct. It returns an `Ordering` value. This is
/// designed to be used in conjunction with the `sort_by` function in the standard library. Simple
/// usage might look like the following...
///
/// ```
/// use feruca::{collate, CollationOptions};
///
/// let mut names = ["Peng", "Peña", "Ernie", "Émile"];
/// names.sort_by(|a, b| collate(a, b, CollationOptions::default()));
///
/// let expected = ["Émile", "Ernie", "Peña", "Peng"];
/// assert_eq!(names, expected);
/// ```
///
/// Significantly, in the event that two strings are ordered equally per the Unicode Collation
/// Algorithm, this function will use byte-value comparison (i.e., the traditional, naïve way of
/// sorting strings) as a tiebreaker. While this is probably appropriate in most cases, it can be
/// avoided by using the `collate_no_tiebreak` function.
#[must_use]
pub fn collate<T: AsRef<[u8]> + Eq + Ord>(a: &T, b: &T, opt: CollationOptions) -> Ordering {
// Early out; equal is equal
if a == b {
return Ordering::Equal;
}
// Turn both into Vecs of u32 code points, while validating UTF-8
let mut a_chars: Vec<u32> = B(a).chars().map(|c| c as u32).collect();
let mut b_chars: Vec<u32> = B(b).chars().map(|c| c as u32).collect();
// Check if both are entirely alphanumeric ASCII
let easy = all_ascii(&a_chars, &b_chars);
if easy {
return compare_ascii(a_chars, b_chars);
}
// Normalize to NFD
make_nfd(&mut a_chars);
make_nfd(&mut b_chars);
// I think it's worth offering an out here, too, in case two strings decompose to the same.
// If we went forward and generated sort keys, they would be equal, and we would end up at the
// tiebreaker, anyway.
if a_chars == b_chars {
// Tiebreaker
return a.cmp(b);
}
// Check for a shared prefix that might be safe to trim
let cldr = opt.keys_source == KeysSource::Cldr;
trim_prefix(&mut a_chars, &mut b_chars, cldr);
// After prefix trimming, one of the Vecs may be empty (but not both!)
if a_chars.is_empty() || b_chars.is_empty() {
return a_chars.cmp(&b_chars);
}
// One last chance for an early out: if the opening code points of the two Vecs are different,
// and neither requires checking for a multi-code-point sequence, then we can try comparing
// their first primary weights. If those are different, and both non-zero, it's decisive.
if safe_first_chars(&a_chars, &b_chars) {
let a_first_primary = get_first_primary(a_chars[0], opt);
let b_first_primary = get_first_primary(b_chars[0], opt);
if a_first_primary != b_first_primary && a_first_primary != 0 && b_first_primary != 0 {
return a_first_primary.cmp(&b_first_primary);
}
}
// Otherwise we move forward with full collation element arrays
let a_cea = generate_cea(&mut a_chars, opt);
let b_cea = generate_cea(&mut b_chars, opt);
// Sort keys are processed incrementally, until they yield a result
let comparison = compare_incremental(&a_cea, &b_cea, opt.shifting);
if comparison == Ordering::Equal {
// Tiebreaker
return a.cmp(b);
}
comparison
}
/// This is a variation on the `collate` function, to which it is almost identical. The difference
/// is that, in the event that two strings are ordered equally per the Unicode Collation Algorithm,
/// this function will not attempt to "break the tie" by using byte-value comparison.
#[must_use]
pub fn collate_no_tiebreak<T: AsRef<[u8]> + Eq + Ord>(
a: &T,
b: &T,
opt: CollationOptions,
) -> Ordering {
if a == b {
return Ordering::Equal;
}
let mut a_chars: Vec<u32> = B(a).chars().map(|c| c as u32).collect();
let mut b_chars: Vec<u32> = B(b).chars().map(|c| c as u32).collect();
let easy = all_ascii(&a_chars, &b_chars);
if easy {
return compare_ascii(a_chars, b_chars);
}
make_nfd(&mut a_chars);
make_nfd(&mut b_chars);
if a_chars == b_chars {
return Ordering::Equal;
}
let cldr = opt.keys_source == KeysSource::Cldr;
trim_prefix(&mut a_chars, &mut b_chars, cldr);
if a_chars.is_empty() || b_chars.is_empty() {
return a_chars.cmp(&b_chars);
}
if safe_first_chars(&a_chars, &b_chars) {
let a_first_primary = get_first_primary(a_chars[0], opt);
let b_first_primary = get_first_primary(b_chars[0], opt);
if a_first_primary != b_first_primary && a_first_primary != 0 && b_first_primary != 0 {
return a_first_primary.cmp(&b_first_primary);
}
}
let a_cea = generate_cea(&mut a_chars, opt);
let b_cea = generate_cea(&mut b_chars, opt);
compare_incremental(&a_cea, &b_cea, opt.shifting)
}