use super::generated::collation as gen;
use super::normalize::{canonical_combining_class as ccc, nfd};
use alloc::vec::Vec;
use core::cmp::Ordering;
#[inline]
fn primary(ce: u64) -> u16 {
((ce >> 32) & 0xFFFF) as u16
}
#[inline]
fn secondary(ce: u64) -> u16 {
((ce >> 16) & 0xFFFF) as u16
}
#[inline]
fn tertiary(ce: u64) -> u16 {
(ce & 0xFFFF) as u16
}
#[inline]
fn is_variable(ce: u64) -> bool {
(ce >> 48) & 1 != 0
}
#[inline]
fn pack(p: u32, s: u32, t: u32) -> u64 {
(p as u64) << 32 | (s as u64) << 16 | t as u64
}
#[inline]
fn sub_weight(ce: u64) -> u16 {
((ce >> 49) & 0x7FFF) as u16
}
#[inline]
fn pack_tailored(base: u32, sub: u32, s: u32, t: u32) -> u64 {
((sub as u64) << 49) | (base as u64) << 32 | (s as u64) << 16 | t as u64
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AlternateHandling {
NonIgnorable,
Shifted,
}
const IMPLICIT_RANGES: &[(u32, u32, u32, u32)] = &[
(0x17000, 0x187FF, 0xFB00, 0x17000),
(0x18800, 0x18AFF, 0xFB01, 0x18800),
(0x18D00, 0x18D7F, 0xFB00, 0x17000),
(0x18D80, 0x18DFF, 0xFB01, 0x18800),
(0x1B170, 0x1B2FF, 0xFB02, 0x1B170),
(0x18B00, 0x18CFF, 0xFB03, 0x18B00),
];
fn push_implicit(cp: u32, out: &mut Vec<u64>) {
let (aaaa, bbbb) = implicit_primaries(cp);
out.push(pack(aaaa, 0x0020, 0x0002));
out.push(pack(bbbb, 0x0000, 0x0000));
}
fn implicit_primaries(cp: u32) -> (u32, u32) {
for &(first, last, base, origin) in IMPLICIT_RANGES {
if cp >= first && cp <= last {
return (base, (cp - origin) | 0x8000);
}
}
let base = if gen::unified_ideograph(cp) {
if (0x4E00..=0x9FFF).contains(&cp) || (0xF900..=0xFAFF).contains(&cp) {
0xFB40
} else {
0xFB80
}
} else {
0xFBC0
};
(base + (cp >> 15), (cp & 0x7FFF) | 0x8000)
}
fn lookup_contraction(first: u32, suffix: &[char]) -> Option<&'static [u64]> {
for (suf, ces) in gen::contractions(first)? {
if *suf == suffix {
return Some(ces);
}
}
None
}
fn collation_elements(mut cv: Vec<char>) -> Vec<u64> {
let mut cea = Vec::new();
let mut i = 0;
while i < cv.len() {
let s0 = cv[i] as u32;
let mut end = i + 1;
let mut matched: Option<&'static [u64]> = gen::ce_singles(s0);
let mut suffix: Vec<char> = Vec::new();
if let Some(entries) = gen::contractions(s0) {
for (suf, ces) in entries {
let stop = i + 1 + suf.len();
if stop <= cv.len() && cv[i + 1..stop] == **suf {
matched = Some(ces);
suffix = suf.to_vec();
end = stop;
break;
}
}
}
loop {
let mut last_ccc = 0u8;
let mut j = end;
let mut hit = None;
while j < cv.len() {
let cc = ccc(cv[j]);
if cc == 0 {
break; }
if last_ccc < cc {
let mut trial = suffix.clone();
trial.push(cv[j]);
if let Some(ces) = lookup_contraction(s0, &trial) {
hit = Some((j, ces, trial));
break;
}
last_ccc = cc;
} else {
break; }
j += 1;
}
match hit {
Some((j, ces, trial)) => {
matched = Some(ces);
suffix = trial;
cv.remove(j);
}
None => break,
}
}
match matched {
Some(ces) => cea.extend_from_slice(ces),
None => push_implicit(s0, &mut cea),
}
i = end;
}
cea
}
fn emit_number(digits: &[char], cea: &mut Vec<u64>) {
let marker = primary(collation_elements(alloc::vec!['0'])[0]) as u32;
let first_sig = digits
.iter()
.position(|&c| c != '0')
.unwrap_or(digits.len() - 1);
let sig = &digits[first_sig..];
cea.push(pack(marker, 0, 0));
cea.push(pack(sig.len() as u32 + 1, 0, 0));
for &d in sig {
cea.push(pack((d as u32 - '0' as u32) + 1, 0, 0));
}
}
fn collation_elements_numeric(cv: Vec<char>) -> Vec<u64> {
let mut cea = Vec::new();
let mut i = 0;
while i < cv.len() {
if cv[i].is_ascii_digit() {
let start = i;
while i < cv.len() && cv[i].is_ascii_digit() {
i += 1;
}
emit_number(&cv[start..i], &mut cea);
} else {
let start = i;
while i < cv.len() && !cv[i].is_ascii_digit() {
i += 1;
}
cea.extend(collation_elements(cv[start..i].to_vec()));
}
}
cea
}
fn primaries(s: &str) -> Vec<u16> {
collation_elements(nfd(s.chars()).collect())
.into_iter()
.map(primary)
.filter(|&p| p != 0)
.collect()
}
#[must_use]
pub fn find(text: &str, pattern: &str) -> Option<core::ops::Range<usize>> {
let pat = primaries(pattern);
if pat.is_empty() {
return Some(0..0);
}
let bounds: Vec<usize> = text
.char_indices()
.map(|(i, _)| i)
.chain(core::iter::once(text.len()))
.collect();
for a in 0..bounds.len() - 1 {
for b in a + 1..bounds.len() {
let pr = primaries(&text[bounds[a]..bounds[b]]);
if pr.len() < pat.len() {
continue; }
if pr == pat {
return Some(bounds[a]..bounds[b]);
}
break; }
}
None
}
#[must_use]
pub fn contains(text: &str, pattern: &str) -> bool {
find(text, pattern).is_some()
}
fn first_primary(tail: &Tailoring, s: &str) -> u32 {
let key = tail.sort_key(s);
let base = key.first().copied().unwrap_or(0) as u32;
if base == 0 {
return 0;
}
let sub = key.get(1).copied().unwrap_or(0) as u32;
(base << 16) | sub
}
#[must_use]
pub fn index_labels(lang: &str) -> Vec<alloc::string::String> {
use alloc::string::ToString;
let extra: &[&str] = match lang.split(['-', '_']).next().unwrap_or(lang) {
"sv" | "fi" => &["Å", "Ä", "Ö"],
"da" | "nb" | "nn" | "no" => &["Æ", "Ø", "Å"],
"is" => &["Þ", "Æ", "Ö"],
"es" | "gl" => &["Ñ"],
"et" => &["Š", "Ž", "Õ", "Ä", "Ö", "Ü"],
"cs" | "sk" => &["Č", "Ch", "Ř", "Š", "Ž"],
"pl" => &["Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż"],
"hu" => &["Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Sz", "Ty", "Zs"],
"tr" | "az" => &["Ç", "Ğ", "Ö", "Ş", "Ü"],
"ro" => &["Ă", "Â", "Î", "Ș", "Ț"],
"sq" => &[
"Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh",
],
"cy" => &["Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th"],
_ => &[],
};
let tail = Tailoring::for_locale(lang).unwrap_or_else(Tailoring::identity);
let mut labels: Vec<alloc::string::String> = ('A'..='Z')
.map(|c| c.to_string())
.chain(extra.iter().map(|s| s.to_string()))
.collect();
labels.sort_by_key(|l| first_primary(&tail, l));
labels
}
#[must_use]
pub fn index_bucket(lang: &str, s: &str) -> alloc::string::String {
use alloc::string::ToString;
let tail = Tailoring::for_locale(lang).unwrap_or_else(Tailoring::identity);
let sp = first_primary(&tail, s);
if sp == 0 {
return "#".to_string(); }
let labels = index_labels(lang);
let mut chosen: Option<usize> = None;
for (i, label) in labels.iter().enumerate() {
if first_primary(&tail, label) <= sp {
chosen = Some(i);
} else {
break;
}
}
match chosen {
Some(i) if i == labels.len() - 1 && first_primary(&tail, &labels[i]) < sp => {
"#".to_string()
}
Some(i) => labels[i].clone(),
None => "#".to_string(),
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Strength {
Primary,
Secondary,
Tertiary,
Quaternary,
}
fn build_sort_key(cea: &[u64], alternate: AlternateHandling, strength: Strength) -> Vec<u16> {
let mut key = Vec::new();
match alternate {
AlternateHandling::NonIgnorable => {
for &ce in cea {
let p = primary(ce);
if p != 0 {
key.push(p);
}
}
if strength == Strength::Primary {
return key;
}
key.push(0);
for &ce in cea {
let s = secondary(ce);
if s != 0 {
key.push(s);
}
}
if strength == Strength::Secondary {
return key;
}
key.push(0);
for &ce in cea {
let t = tertiary(ce);
if t != 0 {
key.push(t);
}
}
}
AlternateHandling::Shifted => {
let mut rows: Vec<(u16, u16, u16, u16)> = Vec::with_capacity(cea.len());
let mut after_variable = false;
for &ce in cea {
let (p, s, t) = (primary(ce), secondary(ce), tertiary(ce));
if is_variable(ce) && p != 0 {
rows.push((0, 0, 0, p)); after_variable = true;
} else if p == 0 && s == 0 && t == 0 {
rows.push((0, 0, 0, 0)); } else if p == 0 {
if after_variable {
rows.push((0, 0, 0, 0));
} else {
rows.push((0, s, t, 0xFFFF));
}
} else {
rows.push((p, s, t, 0xFFFF));
after_variable = false;
}
}
for &(p, ..) in &rows {
if p != 0 {
key.push(p);
}
}
if strength == Strength::Primary {
return key;
}
key.push(0);
for &(_, s, ..) in &rows {
if s != 0 {
key.push(s);
}
}
if strength == Strength::Secondary {
return key;
}
key.push(0);
for &(_, _, t, _) in &rows {
if t != 0 {
key.push(t);
}
}
if strength == Strength::Tertiary {
return key;
}
key.push(0);
for &(.., q) in &rows {
if q != 0 {
key.push(q);
}
}
}
}
key
}
#[derive(Debug, Clone, Copy)]
pub struct Collator {
alternate: AlternateHandling,
strength: Strength,
numeric: bool,
}
impl Default for Collator {
fn default() -> Self {
Collator {
alternate: AlternateHandling::Shifted,
strength: Strength::Tertiary,
numeric: false,
}
}
}
impl Collator {
#[must_use]
pub fn new(alternate: AlternateHandling) -> Self {
Collator {
alternate,
strength: Strength::Tertiary,
numeric: false,
}
}
#[must_use]
pub fn with_strength(mut self, strength: Strength) -> Self {
self.strength = strength;
self
}
#[must_use]
pub fn with_numeric(mut self, numeric: bool) -> Self {
self.numeric = numeric;
self
}
#[must_use]
pub fn sort_key(&self, s: &str) -> Vec<u16> {
let cv: Vec<char> = nfd(s.chars()).collect();
let cea = if self.numeric {
collation_elements_numeric(cv)
} else {
collation_elements(cv)
};
build_sort_key(&cea, self.alternate, self.strength)
}
#[must_use]
pub fn compare(&self, a: &str, b: &str) -> Ordering {
self.sort_key(a).cmp(&self.sort_key(b))
}
}
#[must_use]
pub fn compare(a: &str, b: &str) -> Ordering {
Collator::default().compare(a, b)
}
#[must_use]
pub fn sort_key(s: &str) -> Vec<u16> {
Collator::default().sort_key(s)
}
pub struct Tailoring {
entries: Vec<(Vec<char>, Vec<u64>)>,
}
impl Tailoring {
#[must_use]
pub fn for_locale(lang: &str) -> Option<Tailoring> {
let full = lang.replace('_', "-").to_ascii_lowercase();
let primary = full.split('-').next().unwrap_or(&full);
for key in [full.as_str(), primary] {
if let Some(rule) = crate::cldr::collation_rule(key) {
if let Some(t) = Tailoring::parse(rule) {
return Some(t);
}
}
}
let lc = primary;
let rules = match lc {
"sv" | "fi" => "&z < å < ä < ö", "da" | "nb" | "nn" | "no" => "&z < æ < ø < å", "is" => "&y < ð < þ < æ < ö", "et" => "&s < š < z < ž < õ < ä < ö < ü", "de" => "&ae = ä &oe = ö &ue = ü &ss = ß", "pl" => "&a < ą &c < ć &e < ę &l < ł &n < ń &o < ó &s < ś &z < ź < ż", "cs" | "sk" => "&c < č &h < ch &r < ř &s < š &z < ž", "tr" | "az" => "&c < ç &g < ğ &h < ı &i < i̇ &o < ö &s < ş &u < ü", "lv" => "&c < č &g < ģ &i < ī &k < ķ &l < ļ &n < ņ &s < š &z < ž", "lt" => "&c < č &s < š &z < ž", "hr" | "sr" | "bs" => "&c < č < ć &d < dž < đ &l < lj &n < nj &s < š &z < ž", "es" => "&n < ñ", "hu" => "&c < cs &d < dz < dzs &g < gy &l < ly &n < ny &s < sz &t < ty &z < zs",
"ro" => "&a < ă < â &i < î &s < ș &t < ț", "sq" => "&c < ç &d < dh &e < ë &g < gj &l < ll &n < nj &r < rr &s < sh &t < th &x < xh &z < zh", "uk" => "&г < ґ &е < є &и < і < ї", "vi" => "&a < ă < â &d < đ &e < ê &o < ô < ơ &u < ư", "cy" => "&c < ch &d < dd &f < ff &g < ng &l < ll &p < ph &r < rh &t < th",
"fil" | "tl" => "&n < ñ < ng", "fo" => "&a < á &d < ð &i < í &o < ó &u < ú &y < ý &z < æ < ø", "kl" => "&z < æ < ø < å", "gl" => "&n < ñ", "ga" => "&a < á &e < é &i < í &o < ó &u < ú", "ha" => "&b < ɓ &d < ɗ &k < ƙ &s < sh &t < ts &y < ƴ", _ => return None,
};
Tailoring::parse(rules)
}
#[must_use]
pub fn parse(rules: &str) -> Option<Tailoring> {
let chars: Vec<char> = rules.chars().filter(|c| !c.is_whitespace()).collect();
let mut entries: Vec<(Vec<char>, Vec<u64>)> = Vec::new();
let mut anchor: Vec<char> = Vec::new();
let mut anchor_primary = 0u32;
let (mut p_off, mut s_off, mut t_off) = (0u32, 0u32, 0u32);
let mut i = 0;
while i < chars.len() {
match chars[i] {
'&' => {
i += 1;
let start = i;
while i < chars.len() && !matches!(chars[i], '<' | '=' | '&') {
i += 1;
}
anchor = chars[start..i].to_vec();
anchor_primary = primary(*collation_elements(anchor.clone()).first()?) as u32;
(p_off, s_off, t_off) = (0, 0, 0);
}
'<' | '=' => {
let mut level = 0u32;
while i < chars.len() && (chars[i] == '<' || chars[i] == '=') {
if chars[i] == '<' {
level += 1;
}
i += 1;
}
let tstart = i;
while i < chars.len() && !matches!(chars[i], '<' | '=' | '&') {
i += 1;
}
let target = &chars[tstart..i];
if target.is_empty() || anchor_primary == 0 {
return None;
}
if level == 0 {
Self::push_expansion(&mut entries, target, &anchor);
} else {
match level {
1 => (p_off, s_off, t_off) = (p_off + 1, 0, 0),
2 => (s_off, t_off) = (s_off + 1, 0),
_ => t_off += 1,
}
Self::push_letter(
&mut entries,
target,
anchor_primary,
p_off,
s_off,
t_off,
);
}
}
_ => i += 1, }
}
if entries.is_empty() {
return None;
}
entries.sort_by_key(|e| core::cmp::Reverse(e.0.len()));
Some(Tailoring { entries })
}
fn case_variants(target: &[char]) -> Vec<(Vec<char>, u32)> {
let upper_all: Vec<char> = target.iter().map(|&c| upper(c)).collect();
let mut v = alloc::vec![(target.to_vec(), 0x0002u32), (upper_all, 0x0008u32)];
if target.len() > 1 {
let mut title = target.to_vec();
title[0] = upper(target[0]);
v.push((title, 0x0008));
}
v
}
fn push_letter(
entries: &mut Vec<(Vec<char>, Vec<u64>)>,
target: &[char],
base: u32,
sub: u32,
s_off: u32,
t_off: u32,
) {
for (form, case_t) in Self::case_variants(target) {
let seq: Vec<char> = nfd(form.into_iter()).collect();
if !seq.is_empty() {
let ce = pack_tailored(base, sub, 0x0020 + s_off, case_t + t_off);
entries.push((seq, alloc::vec![ce]));
}
}
}
fn push_expansion(entries: &mut Vec<(Vec<char>, Vec<u64>)>, target: &[char], anchor: &[char]) {
let forms = [
(target.to_vec(), anchor.to_vec()),
(
target.iter().map(|&c| upper(c)).collect::<Vec<_>>(),
anchor.iter().map(|&c| upper(c)).collect::<Vec<_>>(),
),
];
for (t_form, a_form) in forms {
let seq: Vec<char> = nfd(t_form.into_iter()).collect();
let ces = collation_elements(nfd(a_form.into_iter()).collect());
if !seq.is_empty() && !ces.is_empty() {
entries.push((seq, ces));
}
}
}
fn match_at(&self, rest: &[char]) -> Option<(usize, &[u64])> {
for (seq, ces) in &self.entries {
if rest.len() >= seq.len() && rest[..seq.len()] == seq[..] {
return Some((seq.len(), ces));
}
}
None
}
#[must_use]
pub fn sort_key(&self, s: &str) -> Vec<u16> {
let cv: Vec<char> = nfd(s.chars()).collect();
let mut cea = Vec::new();
let mut buf: Vec<char> = Vec::new();
let mut i = 0;
while i < cv.len() {
if let Some((len, ces)) = self.match_at(&cv[i..]) {
if !buf.is_empty() {
cea.extend(collation_elements(core::mem::take(&mut buf)));
}
cea.extend_from_slice(ces);
i += len;
} else {
buf.push(cv[i]);
i += 1;
}
}
if !buf.is_empty() {
cea.extend(collation_elements(buf));
}
build_tailored_sort_key(&cea)
}
#[must_use]
pub fn compare(&self, a: &str, b: &str) -> Ordering {
self.sort_key(a).cmp(&self.sort_key(b))
}
#[must_use]
pub fn identity() -> Tailoring {
Tailoring {
entries: Vec::new(),
}
}
}
fn build_tailored_sort_key(cea: &[u64]) -> Vec<u16> {
let mut rows: Vec<(u16, u16, u16, u16, u16)> = Vec::with_capacity(cea.len());
let mut after_variable = false;
for &ce in cea {
let (p, sub, s, t) = (primary(ce), sub_weight(ce), secondary(ce), tertiary(ce));
if is_variable(ce) && p != 0 {
rows.push((0, 0, 0, 0, p)); after_variable = true;
} else if p == 0 && s == 0 && t == 0 {
rows.push((0, 0, 0, 0, 0)); } else if p == 0 {
if after_variable {
rows.push((0, 0, 0, 0, 0));
} else {
rows.push((0, 0, s, t, 0xFFFF));
}
} else {
rows.push((p, sub, s, t, 0xFFFF));
after_variable = false;
}
}
let mut key = Vec::new();
for &(p, sub, ..) in &rows {
if p != 0 {
key.push(p);
key.push(sub);
}
}
key.push(0);
for &(_, _, s, ..) in &rows {
if s != 0 {
key.push(s);
}
}
key.push(0);
for &(_, _, _, t, _) in &rows {
if t != 0 {
key.push(t);
}
}
key.push(0);
for &(.., q) in &rows {
if q != 0 {
key.push(q);
}
}
key
}
fn upper(c: char) -> char {
super::case::to_uppercase(c).next().unwrap_or(c)
}