use smallvec::SmallVec;
const INLINE_KEY_LEN: usize = 8;
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Mapped<T> {
Yes(T),
No,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct CharToken<T> {
pub data: Mapped<T>,
pub character: char,
}
pub type SortKey<T> = SmallVec<[CharToken<T>; INLINE_KEY_LEN]>;
pub trait Collator {
type Data: Ord + Clone;
fn data_for(&self, ch: char) -> Option<Self::Data>;
fn phrase_data(&self, _phrase: &str) -> Option<Vec<Self::Data>> {
None
}
}
pub fn sort_key_of<C: Collator>(collator: &C, value: &str) -> SortKey<C::Data> {
if let Some(phrase) = collator.phrase_data(value) {
return value
.chars()
.zip(phrase)
.map(|(character, data)| CharToken {
data: Mapped::Yes(data),
character,
})
.collect();
}
value
.chars()
.map(|character| CharToken {
data: collator
.data_for(character)
.map(Mapped::Yes)
.unwrap_or(Mapped::No),
character,
})
.collect()
}
pub fn sort_strings_with<C: Collator>(input: Vec<String>, collator: &C) -> Vec<String> {
let mut with_keys: Vec<(SortKey<C::Data>, usize, String)> = input
.into_iter()
.enumerate()
.map(|(index, item)| {
let key = sort_key_of(collator, &item);
(key, index, item)
})
.collect();
with_keys.sort_unstable_by(|a, b| a.0.cmp(&b.0).then_with(|| a.1.cmp(&b.1)));
with_keys.into_iter().map(|(_, _, item)| item).collect()
}
pub fn sort_indices_with<C: Collator>(input: &[String], collator: &C) -> Vec<usize> {
let mut with_keys: Vec<(SortKey<C::Data>, usize)> = input
.iter()
.enumerate()
.map(|(index, item)| (sort_key_of(collator, item), index))
.collect();
with_keys.sort_unstable_by(|a, b| a.0.cmp(&b.0).then_with(|| a.1.cmp(&b.1)));
with_keys.into_iter().map(|(_, index)| index).collect()
}
#[derive(Debug, Clone)]
pub enum AnyCollator {
#[cfg(feature = "collator-pinyin")]
Pinyin(crate::pinyin::PinyinCollator),
#[cfg(feature = "collator-strokes")]
Strokes(crate::stroke::StrokesCollator),
#[cfg(feature = "collator-jyutping")]
Jyutping(crate::jyutping::JyutpingCollator),
#[cfg(feature = "collator-zhuyin")]
Zhuyin(crate::zhuyin::ZhuyinCollator),
#[cfg(feature = "collator-radical")]
Radical(crate::radical::RadicalCollator),
}
impl AnyCollator {
#[cfg(feature = "collator-pinyin")]
pub fn pinyin() -> Self {
Self::Pinyin(crate::pinyin::PinyinCollator::new())
}
#[cfg(feature = "collator-pinyin")]
pub fn pinyin_with_override(
overrides: crate::r#override::PinyinOverride,
) -> crate::error::Result<Self> {
Ok(Self::Pinyin(
crate::pinyin::PinyinCollator::with_override(overrides)?,
))
}
#[cfg(feature = "collator-strokes")]
pub fn strokes() -> Self {
Self::Strokes(crate::stroke::StrokesCollator)
}
#[cfg(feature = "collator-jyutping")]
pub fn jyutping() -> Self {
Self::Jyutping(crate::jyutping::JyutpingCollator::new())
}
#[cfg(feature = "collator-jyutping")]
pub fn jyutping_with_override(
overrides: crate::r#override::JyutpingOverride,
) -> crate::error::Result<Self> {
Ok(Self::Jyutping(
crate::jyutping::JyutpingCollator::with_override(overrides)?,
))
}
#[cfg(feature = "collator-zhuyin")]
pub fn zhuyin() -> Self {
Self::Zhuyin(crate::zhuyin::ZhuyinCollator::new())
}
#[cfg(feature = "collator-radical")]
pub fn radical() -> Self {
Self::Radical(crate::radical::RadicalCollator::new())
}
pub fn sort(&self, input: Vec<String>) -> Vec<String> {
match self {
#[cfg(feature = "collator-pinyin")]
Self::Pinyin(c) => sort_strings_with(input, c),
#[cfg(feature = "collator-strokes")]
Self::Strokes(c) => sort_strings_with(input, c),
#[cfg(feature = "collator-jyutping")]
Self::Jyutping(c) => sort_strings_with(input, c),
#[cfg(feature = "collator-zhuyin")]
Self::Zhuyin(c) => sort_strings_with(input, c),
#[cfg(feature = "collator-radical")]
Self::Radical(c) => sort_strings_with(input, c),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
struct AsciiLetterCollator;
impl Collator for AsciiLetterCollator {
type Data = u8;
fn data_for(&self, ch: char) -> Option<u8> {
if ch.is_ascii_alphabetic() {
Some(ch.to_ascii_lowercase() as u8)
} else {
None
}
}
}
#[test]
fn mapped_orders_yes_before_no() {
assert!(Mapped::Yes(0u32) < Mapped::No);
assert!(Mapped::Yes(0u32) < Mapped::Yes(1u32));
}
#[test]
fn sort_strings_with_uses_collator_data_then_character_tiebreak() {
let sorted = sort_strings_with(
vec!["banana".into(), "Apple".into(), "apple".into()],
&AsciiLetterCollator,
);
assert_eq!(sorted, vec!["Apple", "apple", "banana"]);
}
#[test]
fn sort_strings_with_places_unmapped_characters_after_mapped_ones() {
let sorted = sort_strings_with(
vec!["123".into(), "abc".into(), "1".into()],
&AsciiLetterCollator,
);
assert_eq!(sorted, vec!["abc", "1", "123"]);
}
#[test]
fn sort_strings_with_is_stable_for_duplicate_inputs() {
let input = vec!["alpha".into(), "alpha".into(), "alpha".into()];
let sorted = sort_strings_with(input.clone(), &AsciiLetterCollator);
assert_eq!(sorted, input);
}
#[test]
fn sort_strings_with_uses_phrase_data_when_provided() {
struct PhraseOnly;
impl Collator for PhraseOnly {
type Data = u8;
fn data_for(&self, _: char) -> Option<u8> {
None
}
fn phrase_data(&self, phrase: &str) -> Option<Vec<u8>> {
if phrase == "z-first" { Some(vec![0]) } else { None }
}
}
let sorted = sort_strings_with(
vec!["a-default".into(), "z-first".into()],
&PhraseOnly,
);
assert_eq!(sorted, vec!["z-first", "a-default"]);
}
#[test]
fn sort_indices_preserves_input_order_for_duplicates() {
let input: Vec<String> = vec!["dup".into(), "dup".into(), "dup".into()];
let indices = sort_indices_with(&input, &AsciiLetterCollator);
assert_eq!(indices, vec![0, 1, 2]);
}
#[test]
fn sort_indices_breaks_ties_by_input_order() {
let input: Vec<String> = (0..12).map(|_| "x".to_string()).collect();
let indices = sort_indices_with(&input, &AsciiLetterCollator);
assert_eq!(indices, (0..12).collect::<Vec<_>>());
}
}
#[cfg(test)]
mod proptests {
use super::*;
use proptest::prelude::*;
struct AsciiLetterCollator;
impl Collator for AsciiLetterCollator {
type Data = u8;
fn data_for(&self, ch: char) -> Option<u8> {
ch.is_ascii_alphabetic().then(|| ch.to_ascii_lowercase() as u8)
}
}
fn small_vec_of_strings() -> impl Strategy<Value = Vec<String>> {
prop::collection::vec(".*", 0..50)
}
proptest! {
#[test]
fn sort_is_idempotent(items in small_vec_of_strings()) {
let once = sort_strings_with(items.clone(), &AsciiLetterCollator);
let twice = sort_strings_with(once.clone(), &AsciiLetterCollator);
prop_assert_eq!(once, twice);
}
#[test]
fn sort_is_a_permutation(items in small_vec_of_strings()) {
let sorted = sort_strings_with(items.clone(), &AsciiLetterCollator);
prop_assert_eq!(sorted.len(), items.len());
let mut sorted_alpha = sorted.clone();
let mut input_alpha = items.clone();
sorted_alpha.sort();
input_alpha.sort();
prop_assert_eq!(sorted_alpha, input_alpha);
}
#[test]
fn sort_key_total_order_holds(
a in ".*", b in ".*", c in ".*",
) {
let ka = sort_key_of(&AsciiLetterCollator, &a);
let kb = sort_key_of(&AsciiLetterCollator, &b);
let kc = sort_key_of(&AsciiLetterCollator, &c);
prop_assert_eq!(ka.cmp(&ka), std::cmp::Ordering::Equal);
prop_assert_eq!(ka.cmp(&kb).reverse(), kb.cmp(&ka));
if ka <= kb && kb <= kc {
prop_assert!(ka <= kc);
}
}
#[test]
fn pinyin_collator_sort_is_a_permutation(items in small_vec_of_strings()) {
let collator = crate::pinyin::PinyinCollator::new();
let sorted = sort_strings_with(items.clone(), &collator);
prop_assert_eq!(sorted.len(), items.len());
}
#[test]
fn strokes_collator_sort_is_a_permutation(items in small_vec_of_strings()) {
let collator = crate::stroke::StrokesCollator;
let sorted = sort_strings_with(items.clone(), &collator);
prop_assert_eq!(sorted.len(), items.len());
}
#[test]
fn sort_indices_is_a_permutation(items in small_vec_of_strings()) {
let indices = sort_indices_with(&items, &AsciiLetterCollator);
prop_assert_eq!(indices.len(), items.len());
let mut sorted_indices = indices.clone();
sorted_indices.sort();
prop_assert_eq!(sorted_indices, (0..items.len()).collect::<Vec<_>>());
}
#[test]
fn sort_breaks_ties_by_input_order(items in small_vec_of_strings()) {
let indices = sort_indices_with(&items, &AsciiLetterCollator);
for window in indices.windows(2) {
let a_idx = window[0];
let b_idx = window[1];
let a_key = sort_key_of(&AsciiLetterCollator, &items[a_idx]);
let b_key = sort_key_of(&AsciiLetterCollator, &items[b_idx]);
match a_key.cmp(&b_key) {
std::cmp::Ordering::Less => {}
std::cmp::Ordering::Equal => prop_assert!(
a_idx < b_idx,
"stability: equal-key items must preserve input order; \
got [{a_idx}] before [{b_idx}]"
),
std::cmp::Ordering::Greater => prop_assert!(
false,
"ordering broken at indices {a_idx} -> {b_idx}"
),
}
}
}
}
}