use super::generated::collation as gen;
use super::normalize::{canonical_combining_class as ccc, nfd};
use alloc::vec::Vec;
use core::cmp::Ordering;
#[inline]
fn primary(ce: u64) -> u16 {
((ce >> 32) & 0xFFFF) as u16
}
#[inline]
fn secondary(ce: u64) -> u16 {
((ce >> 16) & 0xFFFF) as u16
}
#[inline]
fn tertiary(ce: u64) -> u16 {
(ce & 0xFFFF) as u16
}
#[inline]
fn is_variable(ce: u64) -> bool {
(ce >> 48) & 1 != 0
}
#[inline]
fn pack(p: u32, s: u32, t: u32) -> u64 {
(p as u64) << 32 | (s as u64) << 16 | t as u64
}
#[inline]
fn sub_weight(ce: u64) -> u16 {
((ce >> 49) & 0x7FFF) as u16
}
#[inline]
fn pack_tailored(base: u32, sub: u32, s: u32, t: u32) -> u64 {
((sub as u64) << 49) | (base as u64) << 32 | (s as u64) << 16 | t as u64
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AlternateHandling {
NonIgnorable,
Shifted,
}
const IMPLICIT_RANGES: &[(u32, u32, u32, u32)] = &[
(0x17000, 0x187FF, 0xFB00, 0x17000),
(0x18800, 0x18AFF, 0xFB01, 0x18800),
(0x18D00, 0x18D7F, 0xFB00, 0x17000),
(0x18D80, 0x18DFF, 0xFB01, 0x18800),
(0x1B170, 0x1B2FF, 0xFB02, 0x1B170),
(0x18B00, 0x18CFF, 0xFB03, 0x18B00),
];
fn push_implicit(cp: u32, out: &mut Vec<u64>) {
let (aaaa, bbbb) = implicit_primaries(cp);
out.push(pack(aaaa, 0x0020, 0x0002));
out.push(pack(bbbb, 0x0000, 0x0000));
}
fn implicit_primaries(cp: u32) -> (u32, u32) {
for &(first, last, base, origin) in IMPLICIT_RANGES {
if cp >= first && cp <= last {
return (base, (cp - origin) | 0x8000);
}
}
let base = if gen::unified_ideograph(cp) {
if (0x4E00..=0x9FFF).contains(&cp) || (0xF900..=0xFAFF).contains(&cp) {
0xFB40
} else {
0xFB80
}
} else {
0xFBC0
};
(base + (cp >> 15), (cp & 0x7FFF) | 0x8000)
}
fn lookup_contraction(first: u32, suffix: &[char]) -> Option<&'static [u64]> {
for (suf, ces) in gen::contractions(first)? {
if *suf == suffix {
return Some(ces);
}
}
None
}
fn collation_elements(cv: Vec<char>) -> Vec<u64> {
let mut cea = Vec::new();
each_collation_element(&cv, |ces, opt, _start| match ces {
Some(ces) => cea.extend_from_slice(ces),
None => push_implicit(opt, &mut cea),
});
cea
}
fn collation_elements_tagged(cv: Vec<char>) -> (Vec<u64>, Vec<usize>) {
let mut cea = Vec::new();
let mut src = Vec::new();
each_collation_element(&cv, |ces, opt, start| match ces {
Some(ces) => {
for &ce in ces {
cea.push(ce);
src.push(start);
}
}
None => {
let before = cea.len();
push_implicit(opt, &mut cea);
for _ in before..cea.len() {
src.push(start);
}
}
});
(cea, src)
}
fn each_collation_element<F: FnMut(Option<&'static [u64]>, u32, usize)>(cv: &[char], mut emit: F) {
let mut consumed = alloc::vec![false; cv.len()];
let mut suffix: Vec<char> = Vec::new(); let mut i = 0;
while i < cv.len() {
if consumed[i] {
i += 1;
continue;
}
let s0 = cv[i] as u32;
let mut end = i + 1;
let mut matched: Option<&'static [u64]> = gen::ce_singles(s0);
suffix.clear();
let mut max_suf = 0usize;
if let Some(entries) = gen::contractions(s0) {
for (suf, ces) in entries {
if suf.len() > max_suf {
max_suf = suf.len();
}
let stop = i + 1 + suf.len();
if stop <= cv.len() && cv[i + 1..stop] == **suf {
matched = Some(ces);
suffix.clear();
suffix.extend_from_slice(suf);
end = stop;
break;
}
}
}
if max_suf > suffix.len() {
loop {
let mut last_ccc = 0u8;
let mut j = end;
let mut hit: Option<usize> = None;
while j < cv.len() {
if consumed[j] {
j += 1;
continue;
}
let cc = ccc(cv[j]);
if cc == 0 {
break; }
if last_ccc < cc {
suffix.push(cv[j]);
if let Some(ces) = lookup_contraction(s0, &suffix) {
matched = Some(ces);
hit = Some(j); break;
}
suffix.pop();
last_ccc = cc;
} else {
break; }
j += 1;
}
match hit {
Some(j) => {
consumed[j] = true;
if suffix.len() >= max_suf {
break; }
}
None => break,
}
}
}
match matched {
Some(ces) => emit(Some(ces), 0, i),
None => emit(None, s0, i),
}
i = end;
}
}
fn emit_number(digits: &[char], cea: &mut Vec<u64>) {
let marker = primary(collation_elements(alloc::vec!['0'])[0]) as u32;
let first_sig = digits
.iter()
.position(|&c| c != '0')
.unwrap_or(digits.len() - 1);
let sig = &digits[first_sig..];
cea.push(pack(marker, 0, 0));
cea.push(pack(sig.len() as u32 + 1, 0, 0));
for &d in sig {
cea.push(pack((d as u32 - '0' as u32) + 1, 0, 0));
}
}
fn collation_elements_numeric(cv: Vec<char>) -> Vec<u64> {
let mut cea = Vec::new();
let mut i = 0;
while i < cv.len() {
if cv[i].is_ascii_digit() {
let start = i;
while i < cv.len() && cv[i].is_ascii_digit() {
i += 1;
}
emit_number(&cv[start..i], &mut cea);
} else {
let start = i;
while i < cv.len() && !cv[i].is_ascii_digit() {
i += 1;
}
cea.extend(collation_elements(cv[start..i].to_vec()));
}
}
cea
}
fn primaries(s: &str) -> Vec<u16> {
collation_elements(nfd(s.chars()).collect())
.into_iter()
.map(primary)
.filter(|&p| p != 0)
.collect()
}
#[must_use]
pub fn find(text: &str, pattern: &str) -> Option<core::ops::Range<usize>> {
let pat = primaries(pattern);
if pat.is_empty() {
return Some(0..0);
}
let need = pat.len();
let mut nfd_buf: Vec<char> = Vec::new();
let mut start_of: Vec<usize> = Vec::new();
let mut end_of: Vec<usize> = Vec::new();
for (b, c) in text.char_indices() {
for d in nfd(core::iter::once(c)) {
nfd_buf.push(d);
start_of.push(b);
end_of.push(b + c.len_utf8());
}
}
let (cea, src) = collation_elements_tagged(nfd_buf);
let mut prim: Vec<u16> = Vec::new();
let mut prim_start: Vec<usize> = Vec::new();
let mut prim_end: Vec<usize> = Vec::new();
let mut group_starts: alloc::collections::BTreeSet<usize> = alloc::collections::BTreeSet::new();
for (idx, &ce) in cea.iter().enumerate() {
group_starts.insert(start_of[src[idx]]);
let p = primary(ce);
if p != 0 {
prim.push(p);
prim_start.push(start_of[src[idx]]);
prim_end.push(end_of[src[idx]]);
}
}
let mut k = 0usize; for (a, _) in text.char_indices() {
while k < prim_start.len() && prim_start[k] < a {
k += 1;
}
if group_starts.contains(&a) {
if k + need <= prim.len() && prim[k..k + need] == pat[..] {
let b = prim_end[k + need - 1];
if primaries(&text[a..b]) == pat {
return Some(a..b);
}
}
} else if let Some((win, b)) = window_decision(&text[a..], need) {
if win == pat {
return Some(a..a + b);
}
}
}
None
}
fn window_decision(s: &str, need: usize) -> Option<(Vec<u16>, usize)> {
for (b, _) in s
.char_indices()
.skip(1)
.chain(core::iter::once((s.len(), '\0')))
{
let pr = primaries(&s[..b]);
if pr.len() >= need {
return Some((pr, b));
}
}
None
}
#[must_use]
pub fn contains(text: &str, pattern: &str) -> bool {
find(text, pattern).is_some()
}
fn first_primary(tail: &Tailoring, s: &str) -> u32 {
let key = tail.sort_key(s);
let base = key.first().copied().unwrap_or(0) as u32;
if base == 0 {
return 0;
}
let sub = key.get(1).copied().unwrap_or(0) as u32;
(base << 16) | sub
}
#[must_use]
pub fn index_labels(lang: &str) -> Vec<alloc::string::String> {
use alloc::string::ToString;
let extra: &[&str] = match lang.split(['-', '_']).next().unwrap_or(lang) {
"sv" | "fi" => &["Å", "Ä", "Ö"],
"da" | "nb" | "nn" | "no" => &["Æ", "Ø", "Å"],
"is" => &["Þ", "Æ", "Ö"],
"es" | "gl" => &["Ñ"],
"et" => &["Š", "Ž", "Õ", "Ä", "Ö", "Ü"],
"cs" | "sk" => &["Č", "Ch", "Ř", "Š", "Ž"],
"pl" => &["Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż"],
"hu" => &["Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Sz", "Ty", "Zs"],
"tr" | "az" => &["Ç", "Ğ", "Ö", "Ş", "Ü"],
"ro" => &["Ă", "Â", "Î", "Ș", "Ț"],
"sq" => &[
"Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh",
],
"cy" => &["Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th"],
_ => &[],
};
let tail = Tailoring::for_locale(lang).unwrap_or_else(Tailoring::identity);
let mut labels: Vec<alloc::string::String> = ('A'..='Z')
.map(|c| c.to_string())
.chain(extra.iter().map(|s| s.to_string()))
.collect();
labels.sort_by_key(|l| first_primary(&tail, l));
labels
}
#[must_use]
pub fn index_bucket(lang: &str, s: &str) -> alloc::string::String {
use alloc::string::ToString;
let tail = Tailoring::for_locale(lang).unwrap_or_else(Tailoring::identity);
let sp = first_primary(&tail, s);
if sp == 0 {
return "#".to_string(); }
let labels = index_labels(lang);
let mut chosen: Option<usize> = None;
for (i, label) in labels.iter().enumerate() {
if first_primary(&tail, label) <= sp {
chosen = Some(i);
} else {
break;
}
}
match chosen {
Some(i) if i == labels.len() - 1 && first_primary(&tail, &labels[i]) < sp => {
"#".to_string()
}
Some(i) => labels[i].clone(),
None => "#".to_string(),
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Strength {
Primary,
Secondary,
Tertiary,
Quaternary,
}
fn build_sort_key(cea: &[u64], alternate: AlternateHandling, strength: Strength) -> Vec<u16> {
let mut key = Vec::new();
match alternate {
AlternateHandling::NonIgnorable => {
for &ce in cea {
let p = primary(ce);
if p != 0 {
key.push(p);
}
}
if strength == Strength::Primary {
return key;
}
key.push(0);
for &ce in cea {
let s = secondary(ce);
if s != 0 {
key.push(s);
}
}
if strength == Strength::Secondary {
return key;
}
key.push(0);
for &ce in cea {
let t = tertiary(ce);
if t != 0 {
key.push(t);
}
}
}
AlternateHandling::Shifted => {
let mut rows: Vec<(u16, u16, u16, u16)> = Vec::with_capacity(cea.len());
let mut after_variable = false;
for &ce in cea {
let (p, s, t) = (primary(ce), secondary(ce), tertiary(ce));
if is_variable(ce) && p != 0 {
rows.push((0, 0, 0, p)); after_variable = true;
} else if p == 0 && s == 0 && t == 0 {
rows.push((0, 0, 0, 0)); } else if p == 0 {
if after_variable {
rows.push((0, 0, 0, 0));
} else {
rows.push((0, s, t, 0xFFFF));
}
} else {
rows.push((p, s, t, 0xFFFF));
after_variable = false;
}
}
for &(p, ..) in &rows {
if p != 0 {
key.push(p);
}
}
if strength == Strength::Primary {
return key;
}
key.push(0);
for &(_, s, ..) in &rows {
if s != 0 {
key.push(s);
}
}
if strength == Strength::Secondary {
return key;
}
key.push(0);
for &(_, _, t, _) in &rows {
if t != 0 {
key.push(t);
}
}
if strength == Strength::Tertiary {
return key;
}
key.push(0);
for &(.., q) in &rows {
if q != 0 {
key.push(q);
}
}
}
}
key
}
#[derive(Debug, Clone, Copy)]
pub struct Collator {
alternate: AlternateHandling,
strength: Strength,
numeric: bool,
}
impl Default for Collator {
fn default() -> Self {
Collator {
alternate: AlternateHandling::Shifted,
strength: Strength::Tertiary,
numeric: false,
}
}
}
impl Collator {
#[must_use]
pub fn new(alternate: AlternateHandling) -> Self {
Collator {
alternate,
strength: Strength::Tertiary,
numeric: false,
}
}
#[must_use]
pub fn with_strength(mut self, strength: Strength) -> Self {
self.strength = strength;
self
}
#[must_use]
pub fn with_numeric(mut self, numeric: bool) -> Self {
self.numeric = numeric;
self
}
#[must_use]
pub fn sort_key(&self, s: &str) -> Vec<u16> {
let cv: Vec<char> = nfd(s.chars()).collect();
let cea = if self.numeric {
collation_elements_numeric(cv)
} else {
collation_elements(cv)
};
build_sort_key(&cea, self.alternate, self.strength)
}
#[must_use]
pub fn compare(&self, a: &str, b: &str) -> Ordering {
self.sort_key(a).cmp(&self.sort_key(b))
}
}
#[must_use]
pub fn compare(a: &str, b: &str) -> Ordering {
Collator::default().compare(a, b)
}
#[must_use]
pub fn sort_key(s: &str) -> Vec<u16> {
Collator::default().sort_key(s)
}
pub struct Tailoring {
entries: Vec<(Vec<char>, Vec<u64>)>,
}
impl Tailoring {
#[must_use]
pub fn for_locale(lang: &str) -> Option<Tailoring> {
let full = lang.replace('_', "-").to_ascii_lowercase();
let primary = full.split('-').next().unwrap_or(&full);
for key in [full.as_str(), primary] {
if let Some(rule) = crate::cldr::collation_rule(key) {
if let Some(t) = Tailoring::parse(rule) {
return Some(t);
}
}
}
let lc = primary;
let rules = match lc {
"sv" | "fi" => "&z < å < ä < ö", "da" | "nb" | "nn" | "no" => "&z < æ < ø < å", "is" => "&y < ð < þ < æ < ö", "et" => "&s < š < z < ž < õ < ä < ö < ü", "de" => "&ae = ä &oe = ö &ue = ü &ss = ß", "pl" => "&a < ą &c < ć &e < ę &l < ł &n < ń &o < ó &s < ś &z < ź < ż", "cs" | "sk" => "&c < č &h < ch &r < ř &s < š &z < ž", "tr" | "az" => "&c < ç &g < ğ &h < ı &i < i̇ &o < ö &s < ş &u < ü", "lv" => "&c < č &g < ģ &i < ī &k < ķ &l < ļ &n < ņ &s < š &z < ž", "lt" => "&c < č &s < š &z < ž", "hr" | "sr" | "bs" => "&c < č < ć &d < dž < đ &l < lj &n < nj &s < š &z < ž", "es" => "&n < ñ", "hu" => "&c < cs &d < dz < dzs &g < gy &l < ly &n < ny &s < sz &t < ty &z < zs",
"ro" => "&a < ă < â &i < î &s < ș &t < ț", "sq" => "&c < ç &d < dh &e < ë &g < gj &l < ll &n < nj &r < rr &s < sh &t < th &x < xh &z < zh", "uk" => "&г < ґ &е < є &и < і < ї", "vi" => "&a < ă < â &d < đ &e < ê &o < ô < ơ &u < ư", "cy" => "&c < ch &d < dd &f < ff &g < ng &l < ll &p < ph &r < rh &t < th",
"fil" | "tl" => "&n < ñ < ng", "fo" => "&a < á &d < ð &i < í &o < ó &u < ú &y < ý &z < æ < ø", "kl" => "&z < æ < ø < å", "gl" => "&n < ñ", "ga" => "&a < á &e < é &i < í &o < ó &u < ú", "ha" => "&b < ɓ &d < ɗ &k < ƙ &s < sh &t < ts &y < ƴ", _ => return None,
};
Tailoring::parse(rules)
}
#[must_use]
pub fn parse(rules: &str) -> Option<Tailoring> {
let chars: Vec<char> = rules.chars().filter(|c| !c.is_whitespace()).collect();
let mut entries: Vec<(Vec<char>, Vec<u64>)> = Vec::new();
let mut anchor: Vec<char> = Vec::new();
let mut anchor_primary = 0u32;
let (mut p_off, mut s_off, mut t_off) = (0u32, 0u32, 0u32);
let mut i = 0;
while i < chars.len() {
match chars[i] {
'&' => {
i += 1;
let start = i;
while i < chars.len() && !matches!(chars[i], '<' | '=' | '&') {
i += 1;
}
anchor = chars[start..i].to_vec();
anchor_primary = primary(*collation_elements(anchor.clone()).first()?) as u32;
(p_off, s_off, t_off) = (0, 0, 0);
}
'<' | '=' => {
let mut level = 0u32;
while i < chars.len() && (chars[i] == '<' || chars[i] == '=') {
if chars[i] == '<' {
level += 1;
}
i += 1;
}
let tstart = i;
while i < chars.len() && !matches!(chars[i], '<' | '=' | '&') {
i += 1;
}
let target = &chars[tstart..i];
if target.is_empty() || anchor_primary == 0 {
return None;
}
if level == 0 {
Self::push_expansion(&mut entries, target, &anchor);
} else {
match level {
1 => (p_off, s_off, t_off) = (p_off + 1, 0, 0),
2 => (s_off, t_off) = (s_off + 1, 0),
_ => t_off += 1,
}
Self::push_letter(
&mut entries,
target,
anchor_primary,
p_off,
s_off,
t_off,
);
}
}
_ => i += 1, }
}
if entries.is_empty() {
return None;
}
entries.sort_by_key(|e| core::cmp::Reverse(e.0.len()));
Some(Tailoring { entries })
}
fn case_variants(target: &[char]) -> Vec<(Vec<char>, u32)> {
let upper_all: Vec<char> = target.iter().map(|&c| upper(c)).collect();
let mut v = alloc::vec![(target.to_vec(), 0x0002u32), (upper_all, 0x0008u32)];
if target.len() > 1 {
let mut title = target.to_vec();
title[0] = upper(target[0]);
v.push((title, 0x0008));
}
v
}
fn push_letter(
entries: &mut Vec<(Vec<char>, Vec<u64>)>,
target: &[char],
base: u32,
sub: u32,
s_off: u32,
t_off: u32,
) {
for (form, case_t) in Self::case_variants(target) {
let seq: Vec<char> = nfd(form.into_iter()).collect();
if !seq.is_empty() {
let ce = pack_tailored(base, sub, 0x0020 + s_off, case_t + t_off);
entries.push((seq, alloc::vec![ce]));
}
}
}
fn push_expansion(entries: &mut Vec<(Vec<char>, Vec<u64>)>, target: &[char], anchor: &[char]) {
let forms = [
(target.to_vec(), anchor.to_vec()),
(
target.iter().map(|&c| upper(c)).collect::<Vec<_>>(),
anchor.iter().map(|&c| upper(c)).collect::<Vec<_>>(),
),
];
for (t_form, a_form) in forms {
let seq: Vec<char> = nfd(t_form.into_iter()).collect();
let ces = collation_elements(nfd(a_form.into_iter()).collect());
if !seq.is_empty() && !ces.is_empty() {
entries.push((seq, ces));
}
}
}
fn match_at(&self, rest: &[char]) -> Option<(usize, &[u64])> {
for (seq, ces) in &self.entries {
if rest.len() >= seq.len() && rest[..seq.len()] == seq[..] {
return Some((seq.len(), ces));
}
}
None
}
#[must_use]
pub fn sort_key(&self, s: &str) -> Vec<u16> {
let cv: Vec<char> = nfd(s.chars()).collect();
let mut cea = Vec::new();
let mut buf: Vec<char> = Vec::new();
let mut i = 0;
while i < cv.len() {
if let Some((len, ces)) = self.match_at(&cv[i..]) {
if !buf.is_empty() {
cea.extend(collation_elements(core::mem::take(&mut buf)));
}
cea.extend_from_slice(ces);
i += len;
} else {
buf.push(cv[i]);
i += 1;
}
}
if !buf.is_empty() {
cea.extend(collation_elements(buf));
}
build_tailored_sort_key(&cea)
}
#[must_use]
pub fn compare(&self, a: &str, b: &str) -> Ordering {
self.sort_key(a).cmp(&self.sort_key(b))
}
#[must_use]
pub fn identity() -> Tailoring {
Tailoring {
entries: Vec::new(),
}
}
}
fn build_tailored_sort_key(cea: &[u64]) -> Vec<u16> {
let mut rows: Vec<(u16, u16, u16, u16, u16)> = Vec::with_capacity(cea.len());
let mut after_variable = false;
for &ce in cea {
let (p, sub, s, t) = (primary(ce), sub_weight(ce), secondary(ce), tertiary(ce));
if is_variable(ce) && p != 0 {
rows.push((0, 0, 0, 0, p)); after_variable = true;
} else if p == 0 && s == 0 && t == 0 {
rows.push((0, 0, 0, 0, 0)); } else if p == 0 {
if after_variable {
rows.push((0, 0, 0, 0, 0));
} else {
rows.push((0, 0, s, t, 0xFFFF));
}
} else {
rows.push((p, sub, s, t, 0xFFFF));
after_variable = false;
}
}
let mut key = Vec::new();
for &(p, sub, ..) in &rows {
if p != 0 {
key.push(p);
key.push(sub);
}
}
key.push(0);
for &(_, _, s, ..) in &rows {
if s != 0 {
key.push(s);
}
}
key.push(0);
for &(_, _, _, t, _) in &rows {
if t != 0 {
key.push(t);
}
}
key.push(0);
for &(.., q) in &rows {
if q != 0 {
key.push(q);
}
}
key
}
fn upper(c: char) -> char {
super::case::to_uppercase(c).next().unwrap_or(c)
}
#[cfg(test)]
mod dos_fix_tests {
use super::*;
use alloc::string::String;
fn find_reference(text: &str, pattern: &str) -> Option<core::ops::Range<usize>> {
let pat = primaries(pattern);
if pat.is_empty() {
return Some(0..0);
}
let bounds: Vec<usize> = text
.char_indices()
.map(|(i, _)| i)
.chain(core::iter::once(text.len()))
.collect();
for a in 0..bounds.len() - 1 {
for b in a + 1..bounds.len() {
let pr = primaries(&text[bounds[a]..bounds[b]]);
if pr.len() < pat.len() {
continue;
}
if pr == pat {
return Some(bounds[a]..bounds[b]);
}
break;
}
}
None
}
fn collation_elements_reference(mut cv: Vec<char>) -> Vec<u64> {
let mut cea = Vec::new();
let mut i = 0;
while i < cv.len() {
let s0 = cv[i] as u32;
let mut end = i + 1;
let mut matched: Option<&'static [u64]> = gen::ce_singles(s0);
let mut suffix: Vec<char> = Vec::new();
if let Some(entries) = gen::contractions(s0) {
for (suf, ces) in entries {
let stop = i + 1 + suf.len();
if stop <= cv.len() && cv[i + 1..stop] == **suf {
matched = Some(ces);
suffix = suf.to_vec();
end = stop;
break;
}
}
}
loop {
let mut last_ccc = 0u8;
let mut j = end;
let mut hit = None;
while j < cv.len() {
let cc = ccc(cv[j]);
if cc == 0 {
break;
}
if last_ccc < cc {
let mut trial = suffix.clone();
trial.push(cv[j]);
if let Some(ces) = lookup_contraction(s0, &trial) {
hit = Some((j, ces, trial));
break;
}
last_ccc = cc;
} else {
break;
}
j += 1;
}
match hit {
Some((j, ces, trial)) => {
matched = Some(ces);
suffix = trial;
cv.remove(j);
}
None => break,
}
}
match matched {
Some(ces) => cea.extend_from_slice(ces),
None => push_implicit(s0, &mut cea),
}
i = end;
}
cea
}
struct Rng(u64);
impl Rng {
fn next(&mut self) -> u64 {
let mut x = self.0;
x ^= x << 13;
x ^= x >> 7;
x ^= x << 17;
self.0 = x;
x
}
fn pick<'a, T>(&mut self, xs: &'a [T]) -> &'a T {
&xs[(self.next() as usize) % xs.len()]
}
}
const ALPHABET: &[char] = &[
'a', 'b', 'c', 'e', 'h', 'l', 'z', 'A', 'C', 'H', 'L', 'ñ', 'Ñ', 'å', 'Ç', 'ç', '·',
'\u{0301}', '\u{0300}', '\u{0327}', '\u{0323}', '\u{0308}', 'é', 'É', '0', '1', '2', '中',
' ', '!', '\u{00C6}', '\u{0153}',
];
fn random_string(rng: &mut Rng, len: usize) -> String {
(0..len).map(|_| *rng.pick(ALPHABET)).collect()
}
#[test]
fn find_matches_reference_fuzz() {
let mut rng = Rng(0x9E3779B97F4A7C15);
for _ in 0..4000 {
let tlen = (rng.next() as usize) % 14;
let plen = 1 + (rng.next() as usize) % 4;
let text = random_string(&mut rng, tlen);
let pat = random_string(&mut rng, plen);
assert_eq!(
find(&text, &pat),
find_reference(&text, &pat),
"find mismatch: text={text:?} pat={pat:?}"
);
}
for (t, p) in [
("l·a", "la"),
("L·", "l"),
("e\u{0301}", "e"),
("\u{0301}e", "e"),
(" café", "cafe"),
("ñ", "n"),
("中文", "中"),
("aaa", "aa"),
("", "x"),
("x", ""),
] {
assert_eq!(
find(t, p),
find_reference(t, p),
"case text={t:?} pat={p:?}"
);
}
}
#[test]
fn collation_elements_matches_reference_fuzz() {
let mut rng = Rng(0xD1B54A32D192ED03);
for _ in 0..4000 {
let len = (rng.next() as usize) % 16;
let s = random_string(&mut rng, len);
let cv: Vec<char> = nfd(s.chars()).collect();
assert_eq!(
collation_elements(cv.clone()),
collation_elements_reference(cv),
"CE mismatch: s={s:?}"
);
}
}
#[test]
fn perf_smoke_large_nonmatching_find() {
let text: String = "a".repeat(300_000);
assert_eq!(find(&text, "qzx"), None);
assert_eq!(find(&text, "aaa"), Some(0..3));
}
#[test]
fn perf_smoke_long_combining_run() {
let mut s = String::from("e");
for _ in 0..200_000 {
s.push('\u{0301}');
}
let key = sort_key(&s);
assert!(!key.is_empty());
assert_eq!(find(&s, "z"), None);
}
}