#![expect(clippy::indexing_slicing, clippy::unwrap_used)]
mod hardcoded;
mod replaceable;
use crate::transliterate::provider::{FunctionCall, Rule, RuleULE, SimpleId, VarTable};
use crate::transliterate::provider::{RuleBasedTransliterator, Segment, TransliteratorRulesV1};
use crate::transliterate::transliterator::hardcoded::Case;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::fmt::Debug;
use core::ops::Range;
use icu_casemap::provider::CaseMapV1;
use icu_casemap::CaseMapper;
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
use icu_locale::LanguageIdentifier;
use icu_locale_core::Locale;
use icu_normalizer::provider::*;
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
use icu_provider::prelude::*;
use litemap::LiteMap;
use replaceable::*;
use zerofrom::ZeroFrom;
use zerovec::vecs::Index32;
use zerovec::VarZeroSlice;
/// Alias for [`CodePointInversionList`], the code point set type this module names a filter.
type Filter<'a> = CodePointInversionList<'a>;
/// A transliterator implemented by the caller.
///
/// Instances are handed to this module through the `lookup` callbacks of the
/// `try_new_with_override*` constructors, which return `Box<dyn CustomTransliterator>`.
pub trait CustomTransliterator: Debug {
/// Produces replacement text for `input`.
///
/// Within this file the method is invoked with the replaceable's `as_str()` as `input` and its
/// `allowed_range()` as `range`; the returned string then replaces the modifiable part of the
/// text.
fn transliterate(&self, input: &str, range: Range<usize>) -> String;
}
/// The kinds of transliterator that can be stored in an [`Env`] and invoked by ID.
#[derive(Debug)]
enum InternalTransliterator {
// A rule-based transliterator backed by a loaded rules payload.
RuleBased(DataPayload<TransliteratorRulesV1>),
// Normalization through a composing normalizer (used for `any-nfc` and `any-nfkc`).
Composing(ComposingNormalizer),
// Normalization through a decomposing normalizer (used for `any-nfd` and `any-nfkd`).
Decomposing(DecomposingNormalizer),
// One of the hardcoded hex transliterators (`any-hex/*`).
Hex(hardcoded::HexTransliterator),
// Lowercasing through the case mapper (`any-lower`).
Lower(CaseMapper),
// Uppercasing through the case mapper (`any-upper`).
Upper(CaseMapper),
// Leaves the text untouched (`any-null`).
Null,
// Replaces the modifiable text with the empty string (`any-remove`).
Remove,
// A caller-supplied transliterator.
Dyn(Box<dyn CustomTransliterator>),
}
impl InternalTransliterator {
    /// Applies this transliterator to the modifiable part of `rep`.
    ///
    /// `env` is only consulted by the rule-based variant, which forwards it to the rules so they
    /// can call other transliterators. The normalizer and case-mapping variants write back only
    /// when the underlying operation actually produced a new (owned) string.
    fn transliterate(&self, mut rep: Replaceable, env: &Env) {
        match self {
            // Nothing to do.
            Self::Null => (),
            Self::Remove => rep.replace_modifiable_with_str(""),
            Self::Hex(hex) => hex.transliterate(rep),
            Self::RuleBased(payload) => payload.get().transliterate(rep, env),
            Self::Dyn(custom) => {
                let output = custom.transliterate(rep.as_str(), rep.allowed_range());
                rep.replace_modifiable_with_str(&output)
            }
            Self::Composing(composer) => {
                match composer.as_borrowed().normalize(rep.as_str_modifiable()) {
                    Cow::Owned(normalized) => rep.replace_modifiable_with_str(&normalized),
                    // Already normalized: keep the text as is.
                    Cow::Borrowed(_) => (),
                }
            }
            Self::Decomposing(decomposer) => {
                match decomposer.as_borrowed().normalize(rep.as_str_modifiable()) {
                    Cow::Owned(normalized) => rep.replace_modifiable_with_str(&normalized),
                    // Already normalized: keep the text as is.
                    Cow::Borrowed(_) => (),
                }
            }
            Self::Lower(mapper) => {
                let lowered = mapper
                    .as_borrowed()
                    .lowercase_to_string(rep.as_str_modifiable(), &LanguageIdentifier::UNKNOWN);
                match lowered {
                    Cow::Owned(text) => rep.replace_modifiable_with_str(&text),
                    Cow::Borrowed(_) => (),
                }
            }
            Self::Upper(mapper) => {
                let raised = mapper
                    .as_borrowed()
                    .uppercase_to_string(rep.as_str_modifiable(), &LanguageIdentifier::UNKNOWN);
                match raised {
                    Cow::Owned(text) => rep.replace_modifiable_with_str(&text),
                    Cow::Borrowed(_) => (),
                }
            }
        }
    }
}
/// Maps transliterator IDs to the loaded transliterators that can be invoked under that ID.
type Env = LiteMap<String, InternalTransliterator>;
/// A ready-to-use transliterator: a rules payload plus the environment holding every
/// transliterator the rules depend on.
#[derive(Debug)]
pub struct Transliterator {
// The top-level rule-based transliterator that `transliterate` runs.
transliterator: DataPayload<TransliteratorRulesV1>,
// The dependencies, keyed by ID, that the rules may call into.
env: Env,
}
/// Builder that assembles a [`Transliterator`] step by step from compiled data.
///
/// Each chaining method appends a step to the rules payload and, where needed, loads the
/// corresponding dependency into the environment; `build` validates the result.
#[derive(Debug)]
#[cfg(feature = "compiled_data")]
pub struct TransliteratorBuilder {
// Dependencies loaded so far, keyed by ID.
env: Env,
// The rules payload being assembled.
transliterator: DataPayload<TransliteratorRulesV1>,
}
#[cfg(feature = "compiled_data")]
impl Default for TransliteratorBuilder {
    /// Creates an empty builder.
    ///
    /// The environment starts out with the two data-free transliterators `any-null` and
    /// `any-remove`; the rules payload is an invisible transliterator with an all-accepting
    /// filter and no ID or rule groups.
    fn default() -> Self {
        let mut env: Env = LiteMap::new();
        env.insert(String::from("any-null"), InternalTransliterator::Null);
        env.insert(String::from("any-remove"), InternalTransliterator::Remove);

        let empty_rules = RuleBasedTransliterator {
            filter: CodePointInversionList::all(),
            visibility: false,
            variable_table: Default::default(),
            id_group_list: Default::default(),
            rule_group_list: Default::default(),
        };

        Self {
            transliterator: DataPayload::from_owned(empty_rules),
            env,
        }
    }
}
#[cfg(feature = "compiled_data")]
impl TransliteratorBuilder {
pub fn from_rules(rules: &'static RuleBasedTransliterator<'static>) -> Self {
Self {
transliterator: DataPayload::from_static_ref(rules),
..Default::default()
}
}
/// Appends a rule that replaces whatever `matcher` matches with `replacer`.
///
/// A `matcher` of size zero leaves the builder unchanged.
///
/// # Panics
///
/// Panics if the computed variable code point is not a valid `char`.
pub fn replace(
mut self,
matcher: CodePointInversionListAndStringList<'static>,
replacer: String,
) -> Self {
if matcher.size() == 0 {
return self;
}
self.transliterator.with_mut(move |r| {
let rule_group_list = r.rule_group_list.make_mut();
// Extend the last rule group if there is one; otherwise start a fresh group.
let mut group = if rule_group_list.is_empty() {
Default::default()
} else {
let g = rule_group_list
.get(rule_group_list.len() - 1)
.unwrap()
.as_varzerovec()
.into_owned();
// The group is taken out here and pushed back below with the new rule appended.
rule_group_list.remove(rule_group_list.len() - 1);
g
};
// The rule's key is a single encoded variable character that refers to the unicode set
// pushed below: `VarTable::BASE` plus the set's index among the unicode sets.
// NOTE(review): `VarTable::lookup` places unicode sets after compounds, quantifiers and
// segments, so this offset presumably assumes those tables are empty — confirm for builders
// created through `from_rules`.
group.make_mut().push(&Rule {
key: Cow::Owned(String::from(
char::from_u32(
VarTable::BASE as u32 + r.variable_table.unicode_sets.len() as u32,
)
.unwrap(),
)),
replacer: Cow::Owned(replacer),
ante: Cow::Borrowed(""),
post: Cow::Borrowed(""),
});
// NOTE(review): when the rule group list was empty, no matching ID group is pushed here,
// while `RuleBasedTransliterator::transliterate` zips both lists — verify that the two lists
// stay the same length on this path.
rule_group_list.push(&group);
r.variable_table.unicode_sets.make_mut().push(&matcher);
});
self
}
/// Chains an `any-nfc` step restricted to `filter` and makes sure the NFC normalizer is loaded.
///
/// An empty `filter` leaves the builder unchanged.
pub fn nfc(mut self, filter: CodePointInversionList<'static>) -> Self {
    if !filter.is_empty() {
        self.chain(filter, Cow::Borrowed("any-nfc"));
        self = self.load_nfc();
    }
    self
}
/// Chains an `any-nfkc` step restricted to `filter` and makes sure the NFKC normalizer is loaded.
///
/// An empty `filter` leaves the builder unchanged.
pub fn nfkc(mut self, filter: CodePointInversionList<'static>) -> Self {
    if !filter.is_empty() {
        self.chain(filter, Cow::Borrowed("any-nfkc"));
        self = self.load_nfkc();
    }
    self
}
/// Chains an `any-nfd` step restricted to `filter` and makes sure the NFD normalizer is loaded.
///
/// An empty `filter` leaves the builder unchanged.
pub fn nfd(mut self, filter: CodePointInversionList<'static>) -> Self {
    if !filter.is_empty() {
        self.chain(filter, Cow::Borrowed("any-nfd"));
        self = self.load_nfd();
    }
    self
}
/// Chains an `any-nfkd` step restricted to `filter` and makes sure the NFKD normalizer is loaded.
///
/// An empty `filter` leaves the builder unchanged.
pub fn nfkd(mut self, filter: CodePointInversionList<'static>) -> Self {
    if !filter.is_empty() {
        self.chain(filter, Cow::Borrowed("any-nfkd"));
        self = self.load_nfkd();
    }
    self
}
/// Chains an `any-lower` step restricted to `filter` and makes sure the case mappers are loaded.
///
/// An empty `filter` leaves the builder unchanged.
pub fn lower(mut self, filter: CodePointInversionList<'static>) -> Self {
    if !filter.is_empty() {
        self.chain(filter, Cow::Borrowed("any-lower"));
        self = self.load_casing();
    }
    self
}
/// Chains an `any-upper` step restricted to `filter` and makes sure the case mappers are loaded.
///
/// An empty `filter` leaves the builder unchanged.
pub fn upper(mut self, filter: CodePointInversionList<'static>) -> Self {
    if !filter.is_empty() {
        self.chain(filter, Cow::Borrowed("any-upper"));
        self = self.load_casing();
    }
    self
}
/// Chains an `any-remove` step restricted to `filter`.
///
/// An empty `filter` leaves the builder unchanged. Nothing is loaded here; the step refers to
/// the `any-remove` ID only.
pub fn remove(mut self, filter: CodePointInversionList<'static>) -> Self {
    if !filter.is_empty() {
        self.chain(filter, Cow::Borrowed("any-remove"));
    }
    self
}
/// Appends an empty ID group together with an empty rule group to the rules payload.
pub fn null(mut self) -> Self {
self.transliterator.with_mut(|r| {
// Both lists get one new (empty) entry each.
r.id_group_list
.make_mut()
.push::<&[SimpleId]>(&[].as_slice());
r.rule_group_list.make_mut().push::<&[Rule]>(&[].as_slice());
});
self
}
/// Chains a call to another rule-based transliterator, restricted to `filter`.
///
/// The callee is registered in the environment under an ID derived from the current number of
/// environment entries, and a step referring to that ID is appended. An empty `filter` leaves
/// the builder unchanged.
pub fn call(
    mut self,
    rules: &'static RuleBasedTransliterator<'static>,
    filter: CodePointInversionList<'static>,
) -> Self {
    if !filter.is_empty() {
        let id = self.env.len().to_string();
        let callee = InternalTransliterator::RuleBased(DataPayload::from_static_ref(rules));
        self.env.insert(id.clone(), callee);
        self.chain(filter, Cow::Owned(id));
    }
    self
}
/// Appends one step to the rules payload: an ID group holding the single `SimpleId`
/// `{ filter, id }`, followed by an empty rule group.
///
/// This only records the reference to `id`; callers are responsible for loading the
/// transliterator with that ID into the environment.
fn chain(&mut self, filter: CodePointInversionList<'static>, id: Cow<'static, str>) {
self.transliterator.with_mut(|r| {
r.id_group_list
.make_mut()
.push(&[SimpleId { filter, id }].as_slice());
// Every ID group is paired with a rule group, here an empty one.
r.rule_group_list.make_mut().push::<&[Rule]>(&[].as_slice());
});
}
pub fn build(self) -> Result<Transliterator, DataError> {
for dep in self.transliterator.get().deps() {
if !self.env.contains_key(&*dep) {
return Err(DataError::custom("dependency not loaded").with_display_context(&dep));
}
}
for (_, dep) in &self.env {
if let InternalTransliterator::RuleBased(rbt) = dep {
for dep in rbt.get().deps() {
if !self.env.contains_key(&*dep) {
return Err(
DataError::custom("dependency not loaded").with_display_context(&dep)
);
}
}
}
}
Ok(Transliterator {
transliterator: self.transliterator,
env: self.env,
})
}
/// Registers the NFC normalizer under `any-nfc`, unless that ID is already present.
pub fn load_nfc(mut self) -> Self {
    if self.env.contains_key("any-nfc") {
        return self;
    }
    let normalizer = ComposingNormalizer::new_nfc().static_to_owned();
    self.env.insert(
        String::from("any-nfc"),
        InternalTransliterator::Composing(normalizer),
    );
    self
}
/// Registers the NFKC normalizer under `any-nfkc`, unless that ID is already present.
pub fn load_nfkc(mut self) -> Self {
    if self.env.contains_key("any-nfkc") {
        return self;
    }
    let normalizer = ComposingNormalizer::new_nfkc().static_to_owned();
    self.env.insert(
        String::from("any-nfkc"),
        InternalTransliterator::Composing(normalizer),
    );
    self
}
/// Registers the NFD normalizer under `any-nfd`, unless that ID is already present.
pub fn load_nfd(mut self) -> Self {
    if self.env.contains_key("any-nfd") {
        return self;
    }
    let normalizer = DecomposingNormalizer::new_nfd().static_to_owned();
    self.env.insert(
        String::from("any-nfd"),
        InternalTransliterator::Decomposing(normalizer),
    );
    self
}
/// Registers the NFKD normalizer under `any-nfkd`, unless that ID is already present.
pub fn load_nfkd(mut self) -> Self {
    if self.env.contains_key("any-nfkd") {
        return self;
    }
    let normalizer = DecomposingNormalizer::new_nfkd().static_to_owned();
    self.env.insert(
        String::from("any-nfkd"),
        InternalTransliterator::Decomposing(normalizer),
    );
    self
}
/// Registers the case mappers under `any-lower` and `any-upper`.
///
/// Both entries are inserted together; the presence of `any-lower` alone decides whether
/// anything is done.
pub fn load_casing(mut self) -> Self {
    if self.env.contains_key("any-lower") {
        return self;
    }
    let lower = InternalTransliterator::Lower(CaseMapper::new().static_to_owned());
    self.env.insert(String::from("any-lower"), lower);
    let upper = InternalTransliterator::Upper(CaseMapper::new().static_to_owned());
    self.env.insert(String::from("any-upper"), upper);
    self
}
}
impl Transliterator {
/// Creates a [`Transliterator`] for `locale` using the compiled (`Baked`) data of the
/// transliterator, normalizer and case-mapping providers.
///
/// # Errors
///
/// Forwards any `DataError` from [`Self::try_new_unstable`].
#[cfg(feature = "compiled_data")]
#[allow(unused_qualifications)]
pub fn try_new(locale: &Locale) -> Result<Self, DataError> {
Self::try_new_unstable(
&crate::provider::Baked,
&icu_normalizer::provider::Baked,
&icu_casemap::provider::Baked,
locale,
)
}
// Buffer-provider variant of `try_new`: the same `provider` is wrapped with
// `as_deserializing()` three times, once each for transliterator, normalizer and case-mapping
// data. The public documentation is generated by the `#[doc = ...]` attribute below.
#[cfg(feature = "serde")]
#[doc = icu_provider::gen_buffer_unstable_docs!(BUFFER, Self::try_new)]
pub fn try_new_with_buffer_provider(
provider: &(impl BufferProvider + ?Sized),
locale: &Locale,
) -> Result<Self, DataError> {
Self::try_new_unstable(
&provider.as_deserializing(),
&provider.as_deserializing(),
&provider.as_deserializing(),
locale,
)
}
// Provider-generic variant of `try_new`. Delegates to
// `internal_try_new_with_override_unstable` without an override callback. The public
// documentation is generated by the `#[doc = ...]` attribute below.
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
pub fn try_new_unstable<PT, PN, PC>(
transliterator_provider: &PT,
normalizer_provider: &PN,
casemap_provider: &PC,
locale: &Locale,
) -> Result<Self, DataError>
where
PT: DataProvider<TransliteratorRulesV1> + ?Sized,
PC: DataProvider<CaseMapV1> + ?Sized,
PN: DataProvider<NormalizerNfdDataV1>
+ DataProvider<NormalizerNfkdDataV1>
+ DataProvider<NormalizerNfdTablesV1>
+ DataProvider<NormalizerNfkdTablesV1>
+ DataProvider<NormalizerNfcV1>
+ ?Sized,
{
Self::internal_try_new_with_override_unstable(
locale,
// No override: the typed `None` only pins the callback type parameter.
None::<&fn(&Locale) -> Option<Result<Box<dyn CustomTransliterator>, DataError>>>,
transliterator_provider,
normalizer_provider,
casemap_provider,
)
}
/// Creates a [`Transliterator`] for `locale` from compiled data, consulting `lookup` for
/// caller-supplied replacements of dependencies.
///
/// `lookup` receives the locale of a dependency and may return `None` (no override),
/// `Some(Ok(_))` with a custom transliterator, or `Some(Err(_))` to abort with an error.
///
/// # Errors
///
/// Forwards any `DataError` from [`Self::try_new_with_override_unstable`].
#[cfg(feature = "compiled_data")]
#[allow(unused_qualifications)]
pub fn try_new_with_override<F>(locale: &Locale, lookup: F) -> Result<Self, DataError>
where
F: Fn(&Locale) -> Option<Result<Box<dyn CustomTransliterator>, DataError>>,
{
Self::try_new_with_override_unstable(
&crate::provider::Baked,
&icu_normalizer::provider::Baked,
&icu_casemap::provider::Baked,
locale,
lookup,
)
}
// Buffer-provider variant of `try_new_with_override`: the same `provider` is wrapped with
// `as_deserializing()` for all three data sources. The public documentation is generated by
// the `#[doc = ...]` attribute below.
#[cfg(feature = "serde")]
#[doc = icu_provider::gen_buffer_unstable_docs!(BUFFER, Self::try_new_with_override)]
pub fn try_new_with_override_with_buffer_provider<F>(
provider: &(impl BufferProvider + ?Sized),
locale: &Locale,
lookup: F,
) -> Result<Self, DataError>
where
F: Fn(&Locale) -> Option<Result<Box<dyn CustomTransliterator>, DataError>>,
{
Self::try_new_with_override_unstable(
&provider.as_deserializing(),
&provider.as_deserializing(),
&provider.as_deserializing(),
locale,
lookup,
)
}
// Provider-generic variant of `try_new_with_override`. Delegates to
// `internal_try_new_with_override_unstable`, passing the callback by reference. The public
// documentation is generated by the `#[doc = ...]` attribute below.
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::try_new_with_override)]
pub fn try_new_with_override_unstable<PT, PN, PC, F>(
transliterator_provider: &PT,
normalizer_provider: &PN,
casemap_provider: &PC,
locale: &Locale,
lookup: F,
) -> Result<Transliterator, DataError>
where
PT: DataProvider<TransliteratorRulesV1> + ?Sized,
PC: DataProvider<CaseMapV1> + ?Sized,
PN: DataProvider<NormalizerNfdDataV1>
+ DataProvider<NormalizerNfkdDataV1>
+ DataProvider<NormalizerNfdTablesV1>
+ DataProvider<NormalizerNfkdTablesV1>
+ DataProvider<NormalizerNfcV1>
+ ?Sized,
F: Fn(&Locale) -> Option<Result<Box<dyn CustomTransliterator>, DataError>>,
{
Self::internal_try_new_with_override_unstable(
locale,
Some(&lookup),
transliterator_provider,
normalizer_provider,
casemap_provider,
)
}
/// Shared implementation behind all public constructors.
///
/// Loads the rules for `locale` (and, recursively, their dependencies) into a fresh
/// environment and bundles both into a [`Transliterator`]. `lookup`, when present, is offered
/// each dependency before the data provider is asked.
///
/// # Errors
///
/// Returns any `DataError` produced by `load_rbt`, including the "internal only transliterator"
/// error for top-level rules whose `visibility` is `false`.
///
/// # Panics
///
/// Panics if the lowercased locale string is rejected by `DataMarkerAttributes::try_from_str`.
fn internal_try_new_with_override_unstable<PN, PT, PC, F>(
locale: &Locale,
lookup: Option<&F>,
transliterator_provider: &PT,
normalizer_provider: &PN,
casemap_provider: &PC,
) -> Result<Transliterator, DataError>
where
PT: DataProvider<TransliteratorRulesV1> + ?Sized,
PC: DataProvider<CaseMapV1> + ?Sized,
PN: DataProvider<NormalizerNfdDataV1>
+ DataProvider<NormalizerNfkdDataV1>
+ DataProvider<NormalizerNfdTablesV1>
+ DataProvider<NormalizerNfkdTablesV1>
+ DataProvider<NormalizerNfcV1>
+ ?Sized,
F: Fn(&Locale) -> Option<Result<Box<dyn CustomTransliterator>, DataError>>,
{
let mut env = LiteMap::new();
let transliterator = Transliterator::load_rbt(
// The data is looked up by the ASCII-lowercased locale string.
#[expect(clippy::unwrap_used)] DataMarkerAttributes::try_from_str(&locale.to_string().to_ascii_lowercase()).unwrap(),
lookup,
transliterator_provider,
normalizer_provider,
casemap_provider,
// `allow_internal = false`: the top-level transliterator must be visible.
false,
&mut env,
)?;
Ok(Transliterator {
transliterator,
env,
})
}
/// Loads the rule-based transliterator identified by `marker_attributes` and makes sure all of
/// its dependencies are present in `env`.
///
/// For every dependency not yet in `env`, the following sources are tried in order:
/// 1. `load_special` (the IDs handled without rule data),
/// 2. the `lookup` callback, if one was given and the dependency parses as a `Locale`,
/// 3. a recursive `load_rbt` call with `allow_internal = true`.
///
/// # Errors
///
/// Returns the provider's error if loading fails, a custom "internal only transliterator" error
/// if `allow_internal` is `false` and the loaded rules have `visibility == false`, and any error
/// produced while loading a dependency.
///
/// # Panics
///
/// Panics if a lowercased dependency ID is rejected by `DataMarkerAttributes::try_from_str`.
fn load_rbt<PT, PN, PC, F>(
marker_attributes: &DataMarkerAttributes,
lookup: Option<&F>,
transliterator_provider: &PT,
normalizer_provider: &PN,
casemap_provider: &PC,
allow_internal: bool,
env: &mut LiteMap<String, InternalTransliterator>,
) -> Result<DataPayload<TransliteratorRulesV1>, DataError>
where
PT: DataProvider<TransliteratorRulesV1> + ?Sized,
PC: DataProvider<CaseMapV1> + ?Sized,
PN: DataProvider<NormalizerNfdDataV1>
+ DataProvider<NormalizerNfkdDataV1>
+ DataProvider<NormalizerNfdTablesV1>
+ DataProvider<NormalizerNfkdTablesV1>
+ DataProvider<NormalizerNfcV1>
+ ?Sized,
F: Fn(&Locale) -> Option<Result<Box<dyn CustomTransliterator>, DataError>>,
{
let req = DataRequest {
id: DataIdentifierBorrowed::for_marker_attributes(marker_attributes),
..Default::default()
};
let transliterator = transliterator_provider.load(req)?.payload;
if !allow_internal && !transliterator.get().visibility {
return Err(DataError::custom("internal only transliterator"));
}
// Register a `Null` placeholder under this ID before walking the dependencies, so that the
// `contains_key` check below treats this transliterator as already present.
// NOTE(review): presumably this guards against endless recursion on cyclic dependencies — confirm.
env.insert(marker_attributes.to_string(), InternalTransliterator::Null);
for dep in transliterator.get().deps() {
if !env.contains_key(&*dep) {
let internal_t =
Transliterator::load_special(&dep, normalizer_provider, casemap_provider)
// `?` inside the closure yields `None` when there is no callback, the dependency
// does not parse as a locale, or the callback declines.
.or_else(|| Some(lookup?(&dep.parse().ok()?)?.map(InternalTransliterator::Dyn)))
.unwrap_or_else(|| {
Transliterator::load_rbt(
#[expect(clippy::unwrap_used)] DataMarkerAttributes::try_from_str(&dep.to_ascii_lowercase()).unwrap(),
lookup,
transliterator_provider,
normalizer_provider,
casemap_provider,
// Dependencies may be internal-only transliterators.
true,
env,
).map(InternalTransliterator::RuleBased)
})?;
env.insert(dep.to_string(), internal_t);
}
}
Ok(transliterator)
}
/// Resolves the IDs that are implemented without rule data.
///
/// Returns `None` if `special` is not one of the IDs listed in the `match` below. Otherwise
/// returns `Some` with either the constructed transliterator or the `DataError` raised while
/// constructing a normalizer or case mapper from the given providers.
fn load_special<PN, PD>(
special: &str,
normalizer_provider: &PN,
casemapper_provider: &PD,
) -> Option<Result<InternalTransliterator, DataError>>
where
PN: ?Sized
+ DataProvider<NormalizerNfdDataV1>
+ DataProvider<NormalizerNfkdDataV1>
+ DataProvider<NormalizerNfdTablesV1>
+ DataProvider<NormalizerNfkdTablesV1>
+ DataProvider<NormalizerNfcV1>,
PD: ?Sized + DataProvider<CaseMapV1>,
{
match special {
// Normalizers.
"any-nfc" => Some(
ComposingNormalizer::try_new_nfc_unstable(normalizer_provider)
.map(InternalTransliterator::Composing),
),
"any-nfkc" => Some(
ComposingNormalizer::try_new_nfkc_unstable(normalizer_provider)
.map(InternalTransliterator::Composing),
),
"any-nfd" => Some(
DecomposingNormalizer::try_new_nfd_unstable(normalizer_provider)
.map(InternalTransliterator::Decomposing),
),
"any-nfkd" => Some(
DecomposingNormalizer::try_new_nfkd_unstable(normalizer_provider)
.map(InternalTransliterator::Decomposing),
),
// Case mapping: both IDs construct a `CaseMapper`, wrapped in different variants.
"any-lower" => Some(
CaseMapper::try_new_unstable(casemapper_provider)
.map(InternalTransliterator::Lower),
),
"any-upper" => Some(
CaseMapper::try_new_unstable(casemapper_provider)
.map(InternalTransliterator::Upper),
),
// Transliterators that need no data.
"any-null" => Some(Ok(InternalTransliterator::Null)),
"any-remove" => Some(Ok(InternalTransliterator::Remove)),
// Hex escapes. The arguments to `HexTransliterator::new` appear to be prefix, suffix,
// minimum digit count and digit case — see `hardcoded` for the authoritative meaning.
"any-hex/unicode" => Some(Ok(InternalTransliterator::Hex(
hardcoded::HexTransliterator::new("U+", "", 4, Case::Upper),
))),
"any-hex/rust" => Some(Ok(InternalTransliterator::Hex(
hardcoded::HexTransliterator::new("\\u{", "}", 2, Case::Lower),
))),
"any-hex/xml" => Some(Ok(InternalTransliterator::Hex(
hardcoded::HexTransliterator::new("&#x", ";", 1, Case::Upper),
))),
"any-hex/perl" => Some(Ok(InternalTransliterator::Hex(
hardcoded::HexTransliterator::new("\\x{", "}", 1, Case::Upper),
))),
"any-hex/plain" => Some(Ok(InternalTransliterator::Hex(
hardcoded::HexTransliterator::new("", "", 4, Case::Upper),
))),
_ => None,
}
}
/// Transliterates `input` and returns the result.
///
/// The input string is moved into an internal buffer that the rules then edit in place.
pub fn transliterate(&self, input: String) -> String {
    let mut buffer = TransliteratorBuffer::from_string(input);
    self.transliterator
        .get()
        .transliterate(Replaceable::new(&mut buffer), &self.env);
    buffer.into_string()
}
}
impl RuleBasedTransliterator<'_> {
    /// Runs this transliterator over `rep`.
    ///
    /// For every run of `rep` selected by the global filter, the ID groups and rule groups are
    /// applied pairwise and in order: first all IDs of a group, then the rules of the matching
    /// rule group.
    fn transliterate(&self, mut rep: Replaceable, env: &Env) {
        rep.for_each_run(&self.filter, |run| {
            let paired_groups = self.id_group_list.iter().zip(self.rule_group_list.iter());
            for (ids, rules) in paired_groups {
                for encoded_id in ids.iter() {
                    SimpleId::zero_from(encoded_id).transliterate(run.child(), env);
                }
                RuleGroup::from(rules).transliterate(run.child(), &self.variable_table, env);
            }
        });
    }
}
impl SimpleId<'_> {
    /// Applies the transliterator registered under this ID to every run of `rep` selected by
    /// this ID's filter.
    ///
    /// # Panics
    ///
    /// Panics if `env` has no entry for this ID.
    fn transliterate(&self, mut rep: Replaceable, env: &Env) {
        let key: &str = self.id.as_ref();
        let target = env.get(key).unwrap();
        rep.for_each_run(&self.filter, |run| target.transliterate(run.child(), env))
    }
}
/// A borrowed view of one group of encoded rules.
struct RuleGroup<'a> {
    rules: &'a VarZeroSlice<RuleULE, Index32>,
}

impl<'a> RuleGroup<'a> {
    /// Wraps the given encoded rules.
    fn from(rules: &'a VarZeroSlice<RuleULE, Index32>) -> Self {
        RuleGroup { rules }
    }

    /// Applies the group's rules to `rep` until `rep` reports that it is finished.
    ///
    /// At each position the rules are tried in order and the first one that matches is applied,
    /// after which matching restarts from the first rule. If no rule matches, the cursor is
    /// stepped forward. An empty group does nothing.
    fn transliterate(&self, mut rep: Replaceable, vt: &VarTable, env: &Env) {
        if self.rules.is_empty() {
            return;
        }
        while !rep.is_finished() {
            let mut applied = false;
            for encoded in self.rules.iter() {
                let rule: Rule = Rule::zero_from(encoded);
                if let Some((data, matched)) = rule.matches(rep.start_match(), vt) {
                    rule.apply(matched.finish_match(), data, vt, env);
                    applied = true;
                    break;
                }
            }
            if !applied {
                rep.step_cursor();
            }
        }
    }
}
impl Rule<'_> {
    /// Writes this rule's replacement into `dest`, using the segments captured in `data`.
    fn apply(&self, mut dest: Insertable, data: MatchData, vt: &VarTable, env: &Env) {
        dest.apply_size_hint(estimate_replacement_size(&self.replacer, &data, vt));
        replace_str_with_specials(&self.replacer, &mut dest, &data, vt, env);
    }

    /// Tries to match this rule at the matcher's current position.
    ///
    /// The ante context is checked first, then the key, then (after `finish_key`) the post
    /// context. Returns the captured match data together with the advanced matcher on success,
    /// and `None` as soon as one of the three parts fails.
    fn matches<'r1, 'r2>(
        &self,
        mut matcher: RepMatcher<'r1, 'r2, false>,
        vt: &VarTable,
    ) -> Option<(MatchData, RepMatcher<'r1, 'r2, true>)> {
        let mut match_data = MatchData::new();
        let ante_and_key_ok = self.ante_matches(&mut matcher, &mut match_data, vt)
            && self.key_matches(&mut matcher, &mut match_data, vt);
        if !ante_and_key_ok {
            return None;
        }
        let mut matcher = matcher.finish_key();
        if self.post_matches(&mut matcher, &mut match_data, vt) {
            Some((match_data, matcher))
        } else {
            None
        }
    }

    /// Matches the ante context in reverse direction; an empty ante context always matches.
    fn ante_matches(
        &self,
        matcher: &mut impl Utf8Matcher<Reverse>,
        match_data: &mut MatchData,
        vt: &VarTable,
    ) -> bool {
        self.ante.is_empty() || rev_match_str_with_specials(&self.ante, matcher, match_data, vt)
    }

    /// Matches the post context in forward direction; an empty post context always matches.
    fn post_matches(
        &self,
        matcher: &mut impl Utf8Matcher<Forward>,
        match_data: &mut MatchData,
        vt: &VarTable,
    ) -> bool {
        self.post.is_empty() || match_str_with_specials(&self.post, matcher, match_data, vt)
    }

    /// Matches the key in forward direction; an empty key always matches.
    fn key_matches(
        &self,
        matcher: &mut impl Utf8Matcher<Forward>,
        match_data: &mut MatchData,
        vt: &VarTable,
    ) -> bool {
        self.key.is_empty() || match_str_with_specials(&self.key, matcher, match_data, vt)
    }
}
/// Returns the byte index of the first char of `s` that lies in `VarTable::ENCODE_RANGE`, or
/// `None` if there is no such char.
fn find_special(s: &str) -> Option<usize> {
    for (idx, c) in s.char_indices() {
        if VarTable::ENCODE_RANGE.contains(&c) {
            return Some(idx);
        }
    }
    None
}
/// Returns the byte index just past the last char of `s` that lies in
/// `VarTable::ENCODE_RANGE`, or `None` if there is no such char.
fn rev_find_special(s: &str) -> Option<usize> {
    for (idx, c) in s.char_indices().rev() {
        if VarTable::ENCODE_RANGE.contains(&c) {
            return Some(idx + c.len_utf8());
        }
    }
    None
}
/// Estimates the byte length of the text that `replacement` expands to.
///
/// Plain chars count with their UTF-8 length; each encoded special contributes the estimate of
/// the replacer it refers to. An encoded special without a replacer trips a debug assertion and
/// contributes nothing.
fn estimate_replacement_size(replacement: &str, data: &MatchData, vt: &VarTable) -> usize {
    // Without any special the replacement is inserted verbatim.
    let Some(first_special) = find_special(replacement) else {
        return replacement.len();
    };

    // Everything before the first special is plain text.
    let mut total = first_special;
    for rep_c in replacement[first_special..].chars() {
        if VarTable::ENCODE_RANGE.contains(&rep_c) {
            match vt.lookup_replacer(rep_c) {
                Some(replacer) => total += replacer.estimate_size(data, vt),
                None => debug_assert!(false, "invalid encoded special {rep_c:?}"),
            }
        } else {
            total += rep_c.len_utf8();
        }
    }
    total
}
/// Writes `replacement` into `dest`, expanding encoded specials on the way.
///
/// Plain chars are pushed as they are; each encoded special is delegated to the replacer it
/// refers to. An encoded special without a replacer trips a debug assertion and is skipped.
fn replace_str_with_specials(
    replacement: &str,
    dest: &mut Insertable,
    data: &MatchData,
    vt: &VarTable,
    env: &Env,
) {
    // Without any special the whole replacement is pushed in one go.
    let Some(first_special) = find_special(replacement) else {
        dest.push_str(replacement);
        return;
    };

    let (plain_prefix, tail) = replacement.split_at(first_special);
    dest.push_str(plain_prefix);

    for rep_c in tail.chars() {
        if VarTable::ENCODE_RANGE.contains(&rep_c) {
            match vt.lookup_replacer(rep_c) {
                Some(replacer) => replacer.replace(dest, data, vt, env),
                None => debug_assert!(false, "invalid encoded special {rep_c:?}"),
            }
        } else {
            dest.push(rep_c);
        }
    }
}
/// Matches `query` against the input in forward direction, consuming what it matches.
///
/// Plain chars must match literally; each encoded special is delegated to the matcher it refers
/// to. An encoded special without a matcher trips a debug assertion and is treated as matched.
/// Returns `false` at the first part that fails.
fn match_str_with_specials(
    query: &str,
    matcher: &mut impl Utf8Matcher<Forward>,
    match_data: &mut MatchData,
    vt: &VarTable,
) -> bool {
    // Without any special the query is one literal string.
    let Some(first_special) = find_special(query) else {
        return matcher.match_and_consume_str(query);
    };

    let (plain_prefix, tail) = query.split_at(first_special);
    if !matcher.match_and_consume_str(plain_prefix) {
        return false;
    }

    for query_c in tail.chars() {
        let matched = if VarTable::ENCODE_RANGE.contains(&query_c) {
            match vt.lookup_matcher(query_c) {
                Some(special) => special.matches(matcher, match_data, vt),
                None => {
                    debug_assert!(false, "invalid encoded special {query_c:?}");
                    true
                }
            }
        } else {
            matcher.match_and_consume_char(query_c)
        };
        if !matched {
            return false;
        }
    }
    true
}
/// Matches `query` against the input in reverse direction, consuming what it matches.
///
/// The query is processed from its end: first the plain text after the last encoded special,
/// then the remaining chars in reverse order. Plain chars must match literally; each encoded
/// special is delegated to the matcher it refers to. An encoded special without a matcher trips
/// a debug assertion and is treated as matched. Returns `false` at the first part that fails.
fn rev_match_str_with_specials(
    query: &str,
    matcher: &mut impl Utf8Matcher<Reverse>,
    match_data: &mut MatchData,
    vt: &VarTable,
) -> bool {
    // Without any special the query is one literal string.
    let Some(after_last_special) = rev_find_special(query) else {
        return matcher.match_and_consume_str(query);
    };

    let (head, plain_suffix) = query.split_at(after_last_special);
    if !matcher.match_and_consume_str(plain_suffix) {
        return false;
    }

    for query_c in head.chars().rev() {
        let matched = if VarTable::ENCODE_RANGE.contains(&query_c) {
            match vt.lookup_matcher(query_c) {
                Some(special) => special.rev_matches(matcher, match_data, vt),
                None => {
                    debug_assert!(false, "invalid encoded special {query_c:?}");
                    true
                }
            }
        } else {
            matcher.match_and_consume_char(query_c)
        };
        if !matched {
            return false;
        }
    }
    true
}
/// The segments captured while matching a rule, addressed by segment index.
#[derive(Debug)]
struct MatchData {
    segments: Vec<String>,
}

impl MatchData {
    /// Creates match data without any captured segment.
    fn new() -> Self {
        MatchData {
            segments: Vec::new(),
        }
    }

    /// Stores `s` as the content of segment `i`, overwriting a previous capture.
    ///
    /// If `i` is past the end, the gap up to `i` is filled with empty strings.
    fn update_segment(&mut self, i: usize, s: String) {
        match self.segments.get_mut(i) {
            Some(slot) => *slot = s,
            None => {
                self.segments.resize_with(i, String::new);
                self.segments.push(s);
            }
        }
    }

    /// Returns the content of segment `i`, or the empty string if it was never captured.
    fn get_segment(&self, i: usize) -> &str {
        self.segments.get(i).map_or("", String::as_str)
    }
}
/// How often a quantified sub-pattern may match.
enum QuantifierKind {
// Between 0 and 1 matches.
ZeroOrOne,
// 0 or more matches.
ZeroOrMore,
// 1 or more matches.
OneOrMore,
}
/// The variable table elements that can take part in matching (the source side of a rule).
enum SpecialMatcher<'a> {
// A nested query string that may itself contain encoded specials.
Compound(&'a str),
// A query string that is matched repeatedly according to the quantifier kind.
Quantifier(QuantifierKind, &'a str),
// A capturing segment; the matched text is stored under the segment's index.
Segment(Segment<'a>),
// A set of code points and strings.
UnicodeSet(CodePointInversionListAndStringList<'a>),
// Matches via the matcher's `match_start_anchor`.
AnchorStart,
// Matches via the matcher's `match_end_anchor`.
AnchorEnd,
}
impl SpecialMatcher<'_> {
/// Matches this special against the input in forward direction.
///
/// On success the matcher has consumed the matched input and `true` is returned. Segments
/// record their matched text in `match_data`.
///
/// # Panics
///
/// Panics in the segment case if `str_range` returns `None` for the matched cursor range.
fn matches(
&self,
matcher: &mut impl Utf8Matcher<Forward>,
match_data: &mut MatchData,
vt: &VarTable,
) -> bool {
match self {
Self::Compound(query) => match_str_with_specials(query, matcher, match_data, vt),
Self::UnicodeSet(set) => {
// No input left: only the empty string or an anchor can still match.
if matcher.is_empty() {
if set.contains_str("") {
return true;
}
// A set containing "\u{FFFF}" is allowed to match either anchor here.
if set.contains_str("\u{FFFF}") {
if matcher.match_end_anchor() {
return true;
}
if matcher.match_start_anchor() {
return true;
}
}
return false;
}
// Strings take precedence over single code points; the longest matching string wins.
let mut max_str_match: Option<usize> = None;
for s in set.strings().iter() {
if matcher.match_str(s) {
max_str_match = max_str_match.map(|m| m.max(s.len())).or(Some(s.len()));
continue;
}
// Stop scanning once a string starts with a char greater than the next input char.
// NOTE(review): this early exit presumably relies on `set.strings()` being sorted — confirm.
match (s.chars().next(), matcher.next_char()) {
(Some(s_c), Some(input_c)) if s_c > input_c => break,
_ => (),
}
}
if let Some(max) = max_str_match {
return matcher.consume(max);
}
// No string matched: fall back to a single code point.
if let Some(input_c) = matcher.next_char() {
if set.contains(input_c) {
return matcher.consume(input_c.len_utf8());
}
}
false
}
Self::AnchorEnd => matcher.match_end_anchor(),
Self::AnchorStart => matcher.match_start_anchor(),
Self::Segment(segment) => {
// Remember where the segment starts so the matched text can be captured afterwards.
let start = matcher.cursor();
if !match_str_with_specials(&segment.content, matcher, match_data, vt) {
return false;
}
let end = matcher.cursor();
let matched = matcher.str_range(start..end).unwrap();
match_data.update_segment(segment.idx as usize, matched.to_string());
true
}
Self::Quantifier(kind, query) => {
let (min_matches, max_matches) = match kind {
QuantifierKind::ZeroOrOne => (0, 1),
QuantifierKind::ZeroOrMore => (0, usize::MAX),
QuantifierKind::OneOrMore => (1, usize::MAX),
};
let mut matches = 0;
while matches < max_matches {
let pre_cursor = matcher.cursor();
if !match_str_with_specials(query, matcher, match_data, vt) {
break;
}
let post_cursor = matcher.cursor();
matches += 1;
// A match that did not move the cursor would repeat forever; count it once and stop.
if pre_cursor == post_cursor {
break;
}
}
matches >= min_matches
}
}
}
/// Matches this special against the input in reverse direction.
///
/// Mirrors `matches`, using the reverse matching helpers. On success the matcher has consumed
/// the matched input and `true` is returned. Segments record their matched text in `match_data`.
///
/// # Panics
///
/// Panics in the segment case if `str_range` returns `None` for the matched cursor range.
fn rev_matches(
&self,
matcher: &mut impl Utf8Matcher<Reverse>,
match_data: &mut MatchData,
vt: &VarTable,
) -> bool {
match self {
Self::Compound(query) => rev_match_str_with_specials(query, matcher, match_data, vt),
Self::UnicodeSet(set) => {
// No input left: only the empty string or an anchor can still match.
if matcher.is_empty() {
if set.contains_str("") {
return true;
}
// A set containing "\u{FFFF}" is allowed to match either anchor here.
if set.contains_str("\u{FFFF}") {
if matcher.match_end_anchor() {
return true;
}
if matcher.match_start_anchor() {
return true;
}
}
return false;
}
// Strings take precedence over single code points; the longest matching string wins.
// Unlike the forward case, all strings are checked without an early exit.
let max_str_match = set
.strings()
.iter()
.filter(|s| matcher.match_str(s))
.map(str::len)
.max();
if let Some(max) = max_str_match {
return matcher.consume(max);
}
// No string matched: fall back to a single code point.
if let Some(input_c) = matcher.next_char() {
if set.contains(input_c) {
return matcher.consume(input_c.len_utf8());
}
}
false
}
Self::AnchorEnd => matcher.match_end_anchor(),
Self::AnchorStart => matcher.match_start_anchor(),
Self::Segment(segment) => {
// In reverse direction the cursor before matching marks the end of the segment.
let end = matcher.cursor();
if !rev_match_str_with_specials(&segment.content, matcher, match_data, vt) {
return false;
}
let start = matcher.cursor();
let matched = &matcher.str_range(start..end).unwrap();
match_data.update_segment(segment.idx as usize, matched.to_string());
true
}
Self::Quantifier(kind, query) => {
let (min_matches, max_matches) = match kind {
QuantifierKind::ZeroOrOne => (0, 1),
QuantifierKind::ZeroOrMore => (0, usize::MAX),
QuantifierKind::OneOrMore => (1, usize::MAX),
};
let mut matches = 0;
while matches < max_matches {
let pre_cursor = matcher.cursor();
if !rev_match_str_with_specials(query, matcher, match_data, vt) {
break;
}
let post_cursor = matcher.cursor();
matches += 1;
// A match that did not move the cursor would repeat forever; count it once and stop.
if pre_cursor == post_cursor {
break;
}
}
matches >= min_matches
}
}
}
}
/// The variable table elements that can take part in replacing (the target side of a rule).
enum SpecialReplacer<'a> {
// A nested replacement string that may itself contain encoded specials.
Compound(&'a str),
// A call to another transliterator, applied to the expanded argument.
FunctionCall(FunctionCall<'a>),
// Inserts the text captured for the segment with the given index.
BackReference(u16),
// Sets the cursor offset relative to the end of the replacement (count carried in the variant).
LeftPlaceholderCursor(u16),
// Sets the cursor offset relative to the start of the replacement (count carried in the variant).
RightPlaceholderCursor(u16),
// Sets the cursor offset to the current position in the replacement.
PureCursor,
}
impl SpecialReplacer<'_> {
    /// Estimates how many bytes this replacer contributes to the replacement.
    ///
    /// Cursors contribute nothing, a back reference contributes the length of the captured
    /// segment, and a function call is estimated by the size of its (expanded) argument.
    fn estimate_size(&self, data: &MatchData, vt: &VarTable) -> usize {
        match self {
            Self::PureCursor
            | Self::LeftPlaceholderCursor(_)
            | Self::RightPlaceholderCursor(_) => 0,
            Self::BackReference(segment) => data.get_segment(usize::from(*segment)).len(),
            Self::Compound(nested) => estimate_replacement_size(nested, data, vt),
            Self::FunctionCall(call) => estimate_replacement_size(&call.arg, data, vt),
        }
    }

    /// Writes this replacer's contribution into `dest`.
    fn replace(&self, dest: &mut Insertable, data: &MatchData, vt: &VarTable, env: &Env) {
        match self {
            Self::BackReference(segment) => {
                dest.push_str(data.get_segment(usize::from(*segment)));
            }
            Self::Compound(nested) => replace_str_with_specials(nested, dest, data, vt, env),
            Self::FunctionCall(call) => {
                // Expand the argument first, then transliterate exactly the inserted range.
                let mut argument_range = dest.start_replaceable_adapter();
                replace_str_with_specials(&call.arg, &mut argument_range, data, vt, env);
                call.translit
                    .transliterate(argument_range.as_replaceable().child(), env);
            }
            Self::PureCursor => dest.set_offset_to_here(),
            Self::LeftPlaceholderCursor(count) => {
                dest.set_offset_to_chars_off_end(*count);
            }
            Self::RightPlaceholderCursor(count) => {
                debug_assert_eq!(
                    dest.curr_replacement_len(),
                    0,
                    "pre-start cursor not the first replacement"
                );
                dest.set_offset_to_chars_off_start(*count);
            }
        }
    }
}
/// Any element an encoded special can resolve to in the variable table.
///
/// `into_matcher` and `into_replacer` narrow an element down to the subset that is valid on
/// the respective side of a rule.
enum VarTableElement<'a> {
// Valid as both matcher and replacer.
Compound(&'a str),
// Matcher-only elements.
Quantifier(QuantifierKind, &'a str),
Segment(Segment<'a>),
UnicodeSet(CodePointInversionListAndStringList<'a>),
// Replacer-only elements.
FunctionCall(FunctionCall<'a>),
BackReference(u16),
// Matcher-only anchors.
AnchorStart,
AnchorEnd,
// Replacer-only cursors.
LeftPlaceholderCursor(u16),
RightPlaceholderCursor(u16),
PureCursor,
}
impl<'a> VarTableElement<'a> {
    /// Converts this element into a replacer, or returns `None` if it is only valid for
    /// matching.
    fn into_replacer(self) -> Option<SpecialReplacer<'a>> {
        match self {
            Self::Compound(elt) => Some(SpecialReplacer::Compound(elt)),
            Self::FunctionCall(elt) => Some(SpecialReplacer::FunctionCall(elt)),
            Self::BackReference(elt) => Some(SpecialReplacer::BackReference(elt)),
            Self::LeftPlaceholderCursor(elt) => Some(SpecialReplacer::LeftPlaceholderCursor(elt)),
            Self::RightPlaceholderCursor(elt) => {
                Some(SpecialReplacer::RightPlaceholderCursor(elt))
            }
            Self::PureCursor => Some(SpecialReplacer::PureCursor),
            Self::Quantifier(..)
            | Self::Segment(_)
            | Self::UnicodeSet(_)
            | Self::AnchorStart
            | Self::AnchorEnd => None,
        }
    }

    /// Converts this element into a matcher, or returns `None` if it is only valid for
    /// replacing.
    fn into_matcher(self) -> Option<SpecialMatcher<'a>> {
        match self {
            Self::Compound(elt) => Some(SpecialMatcher::Compound(elt)),
            Self::Quantifier(kind, elt) => Some(SpecialMatcher::Quantifier(kind, elt)),
            Self::Segment(elt) => Some(SpecialMatcher::Segment(elt)),
            Self::UnicodeSet(elt) => Some(SpecialMatcher::UnicodeSet(elt)),
            Self::AnchorEnd => Some(SpecialMatcher::AnchorEnd),
            Self::AnchorStart => Some(SpecialMatcher::AnchorStart),
            Self::FunctionCall(_)
            | Self::BackReference(_)
            | Self::LeftPlaceholderCursor(_)
            | Self::RightPlaceholderCursor(_)
            | Self::PureCursor => None,
        }
    }
}
impl<'a> VarTable<'a> {
/// Resolves an encoded special char to the variable table element it refers to.
///
/// The three reserved chars map directly to the pure cursor and the two anchors. Chars in
/// `BASE..=MAX_DYNAMIC` are treated as an index (offset from `BASE`) into the concatenation of
/// the tables, in this order: compounds, optional quantifiers, Kleene-star quantifiers,
/// Kleene-plus quantifiers, segments, unicode sets, function calls, left placeholder cursors,
/// right placeholder cursors. Any index beyond all of those is a back reference whose number is
/// the remaining offset. Every other char yields `None`.
fn lookup(&'a self, query: char) -> Option<VarTableElement<'a>> {
match query {
Self::BASE..=Self::MAX_DYNAMIC => {}
Self::RESERVED_PURE_CURSOR => return Some(VarTableElement::PureCursor),
Self::RESERVED_ANCHOR_END => return Some(VarTableElement::AnchorEnd),
Self::RESERVED_ANCHOR_START => return Some(VarTableElement::AnchorStart),
_ => return None,
};
let idx = query as u32 - Self::BASE as u32;
// `idx` is reduced by the length of each table that it does not fall into.
let mut idx = idx as usize;
let mut next_base = self.compounds.len();
if idx < next_base {
return Some(VarTableElement::Compound(&self.compounds[idx]));
}
idx -= next_base;
next_base = self.quantifiers_opt.len();
if idx < next_base {
return Some(VarTableElement::Quantifier(
QuantifierKind::ZeroOrOne,
&self.quantifiers_opt[idx],
));
}
idx -= next_base;
next_base = self.quantifiers_kleene.len();
if idx < next_base {
return Some(VarTableElement::Quantifier(
QuantifierKind::ZeroOrMore,
&self.quantifiers_kleene[idx],
));
}
idx -= next_base;
next_base = self.quantifiers_kleene_plus.len();
if idx < next_base {
return Some(VarTableElement::Quantifier(
QuantifierKind::OneOrMore,
&self.quantifiers_kleene_plus[idx],
));
}
idx -= next_base;
next_base = self.segments.len();
if idx < next_base {
return Some(VarTableElement::Segment(Segment::zero_from(
&self.segments[idx],
)));
}
idx -= next_base;
next_base = self.unicode_sets.len();
if idx < next_base {
return Some(VarTableElement::UnicodeSet(
CodePointInversionListAndStringList::zero_from(&self.unicode_sets[idx]),
));
}
idx -= next_base;
next_base = self.function_calls.len();
if idx < next_base {
return Some(VarTableElement::FunctionCall(FunctionCall::zero_from(
&self.function_calls[idx],
)));
}
idx -= next_base;
next_base = self.max_left_placeholder_count as usize;
if idx < next_base {
// Placeholder counts are 1-based: offset 0 stands for one placeholder.
return Some(VarTableElement::LeftPlaceholderCursor(idx as u16 + 1));
}
idx -= next_base;
next_base = self.max_right_placeholder_count as usize;
if idx < next_base {
return Some(VarTableElement::RightPlaceholderCursor(idx as u16 + 1));
}
idx -= next_base;
// Everything left over is a back reference.
// NOTE(review): `idx as u16` truncates silently; presumably the encodable range keeps `idx`
// within `u16` — confirm against `BASE`/`MAX_DYNAMIC`.
Some(VarTableElement::BackReference(idx as u16))
}
fn lookup_matcher(&'a self, query: char) -> Option<SpecialMatcher<'a>> {
let elt = self.lookup(query)?;
elt.into_matcher()
}
fn lookup_replacer(&'a self, query: char) -> Option<SpecialReplacer<'a>> {
let elt = self.lookup(query)?;
elt.into_replacer()
}
}
#[cfg(test)]
mod tests {
#![allow(unused_qualifications)]
use super::*;
use crate::transliterate::RuleCollection;
// Runs the rules from `EmptyMatches.txt` over a table of (input, expected output) pairs.
#[test]
fn test_empty_matches() {
let cases = [
("ax", "amatch"),
("a", "a"),
("a1", "amatch1"),
("b", "b"),
("b1", "bmatch1"),
];
let mut collection = RuleCollection::default();
collection.register_source(
&"und-x-test".parse().unwrap(),
include_str!("../../../tests/transliterate/data/transforms/EmptyMatches.txt").into(),
[],
false,
true,
);
let t = Transliterator::try_new_unstable(
&collection.as_provider(),
&icu_normalizer::provider::Baked,
&icu_casemap::provider::Baked,
&"und-x-test".parse().unwrap(),
)
.unwrap();
for (input, output) in cases {
assert_eq!(t.transliterate(input.to_string()), output);
}
}
// Registers two rule sources (`RecursiveRoot.txt` and `RecursiveA.txt`, the latter with the
// alias "Test-Test/RecursiveSuiteA") and checks the output of the root transliterator.
#[test]
fn test_recursive_suite() {
let mut collection = RuleCollection::default();
collection.register_source(
&"und-x-root".parse().unwrap(),
include_str!("../../../tests/transliterate/data/transforms/RecursiveRoot.txt").into(),
[],
false,
true,
);
collection.register_source(
&"und-x-rec".parse().unwrap(),
include_str!("../../../tests/transliterate/data/transforms/RecursiveA.txt").into(),
["Test-Test/RecursiveSuiteA"],
false,
true,
);
let t = Transliterator::try_new_unstable(
&collection.as_provider(),
&icu_normalizer::provider::Baked,
&icu_casemap::provider::Baked,
&"und-x-root".parse().unwrap(),
)
.unwrap();
let input = "XXXabcXXXdXXe";
let output = "XXXXXXaWORKEDcXXXXXXdXXXXXe";
assert_eq!(t.transliterate(input.to_string()), output);
}
// Runs the rules from `CursorFilters.txt` and checks that "xa" becomes "xb".
#[test]
fn test_cursor_placeholders_filters() {
let mut collection = RuleCollection::default();
collection.register_source(
&"und-x-test".parse().unwrap(),
include_str!("../../../tests/transliterate/data/transforms/CursorFilters.txt").into(),
[],
false,
true,
);
let t = Transliterator::try_new_unstable(
&collection.as_provider(),
&icu_normalizer::provider::Baked,
&icu_casemap::provider::Baked,
&"und-x-test".parse().unwrap(),
)
.unwrap();
let input = "xa";
let output = "xb";
assert_eq!(t.transliterate(input.to_string()), output);
}
// Runs the rules from `Functionality.txt` over one input and checks the full output.
#[test]
fn test_functionality() {
let mut collection = RuleCollection::default();
collection.register_source(
&"und-x-test".parse().unwrap(),
include_str!("../../../tests/transliterate/data/transforms/Functionality.txt").into(),
[],
false,
true,
);
let t = Transliterator::try_new_unstable(
&collection.as_provider(),
&icu_normalizer::provider::Baked,
&icu_casemap::provider::Baked,
&"und-x-test".parse().unwrap(),
)
.unwrap();
let input = "abädefghijkl!";
let output = "FIfiunremovedtbxyzftbxyzxyzXYZjkT!";
assert_eq!(t.transliterate(input.to_string()), output);
}
// Checks the compiled-data `de-t-de-d0-ascii` transliterator on German text, including a
// decomposed umlaut (`a\u{0308}`) and the ligature `ꜵ`.
#[test]
fn test_de_ascii() {
let t = Transliterator::try_new(&"de-t-de-d0-ascii".parse().unwrap()).unwrap();
let input =
"Über ältere Lügner lästern ist sehr a\u{0308}rgerlich. Ja, SEHR ÄRGERLICH! - ꜵ";
let output =
"Ueber aeltere Luegner laestern ist sehr aergerlich. Ja, SEHR AERGERLICH! - ao";
assert_eq!(t.transliterate(input.to_string()), output);
}
// Overrides the `und-t-und-latn-d0-ascii` dependency of `de-t-de-d0-ascii` with a custom
// transliterator and checks that its replacement ("maoam" for `ꜵ`) shows up in the output.
#[test]
fn test_override() {
#[derive(Debug)]
struct MaoamTranslit;
impl CustomTransliterator for MaoamTranslit {
fn transliterate(&self, input: &str, range: Range<usize>) -> String {
// Only the given range of the input is transformed and returned.
let input = &input[range];
input.replace('ꜵ', "maoam")
}
}
let want_locale = "und-t-und-latn-d0-ascii".parse().unwrap();
let t =
Transliterator::try_new_with_override(&"de-t-de-d0-ascii".parse().unwrap(), |locale| {
locale
.eq(&want_locale)
.then_some(Ok(Box::new(MaoamTranslit)))
})
.unwrap();
let input = "Ich liebe ꜵ über alles";
let output = "Ich liebe maoam ueber alles";
assert_eq!(t.transliterate(input.to_string()), output);
}
// Checks that a precomposed `ä` and a decomposed `a\u{0308}` both come out as plain `a`.
#[test]
fn test_nfc_nfd() {
let t = Transliterator::try_new(&"und-t-und-latn-d0-ascii".parse().unwrap()).unwrap();
let input = "äa\u{0308}";
let output = "aa";
assert_eq!(t.transliterate(input.to_string()), output);
}
// Checks the `Hex/Rust` transliterator: every char becomes a lowercase `\u{..}` escape.
#[test]
fn test_hex_rust() {
let mut collection = RuleCollection::default();
collection.register_source(
&"und-x-test".parse().unwrap(),
"::Hex/Rust;".into(),
[],
false,
true,
);
let t = Transliterator::try_new_unstable(
&collection.as_provider(),
&icu_normalizer::provider::Baked,
&icu_casemap::provider::Baked,
&"und-x-test".parse().unwrap(),
)
.unwrap();
let input = "\0äa\u{10FFFF}❤!";
let output = r"\u{00}\u{e4}\u{61}\u{10ffff}\u{2764}\u{21}";
assert_eq!(t.transliterate(input.to_string()), output);
}
// Checks the `Hex/Unicode` transliterator: every char becomes an uppercase `U+XXXX` escape.
#[test]
fn test_hex_unicode() {
let mut collection = RuleCollection::default();
collection.register_source(
&"und-x-test".parse().unwrap(),
"::Hex/Unicode;".into(),
[],
false,
true,
);
let t = Transliterator::try_new_unstable(
&collection.as_provider(),
&icu_normalizer::provider::Baked,
&icu_casemap::provider::Baked,
&"und-x-test".parse().unwrap(),
)
.unwrap();
let input = "\0äa\u{10FFFF}❤!";
let output = "U+0000U+00E4U+0061U+10FFFFU+2764U+0021";
assert_eq!(t.transliterate(input.to_string()), output);
}
// Checks the compiled-data `und-Hira-t-und-kana` transliterator on mixed kana input.
#[test]
fn test_katakana_hiragana() {
let t = Transliterator::try_new(&"und-Hira-t-und-kana".parse().unwrap()).unwrap();
let input = "ウィキペディアへようこそ";
let output = "うぃきぺでぃあへようこそ";
assert_eq!(t.transliterate(input.to_string()), output);
}
}