1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
//! This crate provides a basic implementation of the Unicode Collation Algorithm. There is really
//! just one function, `collate`, and a few options that can be passed to it. (The
//! `collate_no_tiebreak` function is a variation whose behavior is a bit more strict.) Despite the
//! bare-bones API, this implementation conforms to the standard and allows for the use of the CLDR
//! root collation order; so it may indeed be useful, even in this early stage of development.
#![warn(clippy::pedantic, clippy::cargo)]
#![allow(clippy::module_name_repetitions)]
#![deny(missing_docs)]
use bstr::{ByteSlice, B};
use serde::Deserialize;
use std::cmp::Ordering;
mod ascii;
use ascii::{all_ascii, compare_ascii};
mod cea;
mod cea_utils;
mod consts;
mod normalize;
use normalize::make_nfd;
mod prefix;
use prefix::trim_prefix;
mod sort_key;
use sort_key::nfd_to_sk;
//
// Structs and enums
//
/// This struct specifies the options to be passed to the `collate` function. You can choose between
/// two tables (DUCET and CLDR root), and between two approaches to the handling of variable-weight
/// characters ("non-ignorable" and "shifted"). The default, and a good starting point for Unicode
/// collation, is to use the CLDR table with the "shifted" approach.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
pub struct CollationOptions {
/// The table of weights to be used (currently either DUCET or CLDR)
pub keys_source: KeysSource,
/// The approach to handling variable-weight characters ("non-ignorable" or "shifted"). For our
/// purposes, `shifting` is either true (recommended) or false.
pub shifting: bool,
}
impl Default for CollationOptions {
fn default() -> Self {
Self {
keys_source: KeysSource::Cldr,
shifting: true,
}
}
}
/// This enum provides for a choice of which table of character weights to use.
#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd, Hash, Debug)]
pub enum KeysSource {
/// The table associated with the CLDR root collation order (recommended)
Cldr,
/// The default table for the Unicode Collation Algorithm
Ducet,
}
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default, Deserialize)]
struct Weights {
variable: bool,
primary: u16,
secondary: u16,
tertiary: u16,
}
//
// Public functions
//
/// This is the main public function in the library. It accepts as arguments two string references
/// or byte slices, and a `CollationOptions` struct. It returns an `Ordering` value. This is
/// designed to be used in conjunction with the `sort_by` function in the standard library. Simple
/// usage might look like the following...
///
/// ```
/// use feruca::{collate, CollationOptions};
///
/// let mut names = ["Peng", "Peña", "Ernie", "Émile"];
/// names.sort_by(|a, b| collate(a, b, CollationOptions::default()));
///
/// let expected = ["Émile", "Ernie", "Peña", "Peng"];
/// assert_eq!(names, expected);
/// ```
///
/// Significantly, in the event that two strings are ordered equally per the Unicode Collation
/// Algorithm, this function will use byte-value comparison (i.e., the traditional, naïve way of
/// sorting strings) as a tiebreaker. While this is probably appropriate in most cases, it can be
/// avoided by using the `collate_no_tiebreak` function.
#[must_use]
pub fn collate<T: AsRef<[u8]> + Eq + Ord>(a: &T, b: &T, opt: CollationOptions) -> Ordering {
// Early out
if a == b {
return Ordering::Equal;
}
let mut a_chars: Vec<u32> = B(a).chars().map(|c| c as u32).collect();
let mut b_chars: Vec<u32> = B(b).chars().map(|c| c as u32).collect();
let easy = all_ascii(&a_chars, &b_chars);
if easy {
return compare_ascii(a_chars, b_chars);
}
make_nfd(&mut a_chars);
make_nfd(&mut b_chars);
// I think it's worth offering an out here, too, in case two strings decompose to the same.
// If we went forward and generated sort keys, they would be equal, and we would end up at the
// tiebreaker, anyway.
if a_chars == b_chars {
// Tiebreaker
return a.cmp(b);
}
let cldr = opt.keys_source == KeysSource::Cldr;
trim_prefix(&mut a_chars, &mut b_chars, cldr);
let a_sort_key = nfd_to_sk(&mut a_chars, opt);
let b_sort_key = nfd_to_sk(&mut b_chars, opt);
let comparison = a_sort_key.cmp(&b_sort_key);
if comparison == Ordering::Equal {
// Tiebreaker
return a.cmp(b);
}
comparison
}
/// This is a variation on the `collate` function, to which it is almost identical. The difference
/// is that, in the event that two strings are ordered equally per the Unicode Collation Algorithm,
/// this function will not attempt to "break the tie" by using byte-value comparison.
#[must_use]
pub fn collate_no_tiebreak<T: AsRef<[u8]> + Eq + Ord>(
a: &T,
b: &T,
opt: CollationOptions,
) -> Ordering {
if a == b {
return Ordering::Equal;
}
let mut a_chars: Vec<u32> = B(a).chars().map(|c| c as u32).collect();
let mut b_chars: Vec<u32> = B(b).chars().map(|c| c as u32).collect();
let easy = all_ascii(&a_chars, &b_chars);
if easy {
return compare_ascii(a_chars, b_chars);
}
make_nfd(&mut a_chars);
make_nfd(&mut b_chars);
if a_chars == b_chars {
return Ordering::Equal;
}
let cldr = opt.keys_source == KeysSource::Cldr;
trim_prefix(&mut a_chars, &mut b_chars, cldr);
let a_sort_key = nfd_to_sk(&mut a_chars, opt);
let b_sort_key = nfd_to_sk(&mut b_chars, opt);
a_sort_key.cmp(&b_sort_key)
}