use std::borrow::Cow;
use std::collections::HashMap;
use unicode_normalization::UnicodeNormalization;
use crate::tables;
use crate::unicode_ranges as ur;
use crate::ErrorMode;
use crate::limits::{MAX_CAPACITY_HINT, MAX_REPLACEMENT_OUTPUT_BYTES};
/// Runs `tables::apply_replacements` over `text` with
/// [`MAX_REPLACEMENT_OUTPUT_BYTES`] as the output cap.
///
/// # Errors
/// When the table call returns `Err(size)`, a warning is emitted through
/// `tl_warn!` and the size is wrapped in
/// [`crate::ErrorRepr::ReplacementOutputTooLarge`] together with the cap.
pub(crate) fn apply_replacements_bounded(text: &str) -> Result<Cow<'_, str>, crate::ErrorRepr> {
    match tables::apply_replacements(text, MAX_REPLACEMENT_OUTPUT_BYTES) {
        Ok(replaced) => Ok(replaced),
        Err(size) => {
            tl_warn!("replacement output too large: size={size} max={MAX_REPLACEMENT_OUTPUT_BYTES}");
            Err(crate::ErrorRepr::ReplacementOutputTooLarge {
                size,
                max: MAX_REPLACEMENT_OUTPUT_BYTES,
            })
        }
    }
}
/// Checks that `lang`, when given, is either `"auto"` or a code accepted by
/// `tables::is_valid_lang`.
///
/// # Errors
/// Returns [`crate::ErrorRepr::UnknownLang`] carrying the rejected code, an
/// optional "did you mean" hint from `crate::utils::closest_match`, and the
/// comma-separated list returned by `tables::list_langs`.
pub(crate) fn validate_lang(lang: Option<&str>) -> Result<(), crate::ErrorRepr> {
    let code = match lang {
        Some(code) if code != "auto" && !tables::is_valid_lang(code) => code,
        // Absent, "auto", or a known code: nothing to report.
        _ => return Ok(()),
    };

    let valid = tables::list_langs();
    // The hint search also considers a few codes beyond the listed ones.
    let candidates = valid
        .iter()
        .map(String::as_str)
        .chain(["auto", "nb", "nn", "da"]);
    let suggestion = match crate::utils::closest_match(code, candidates) {
        Some(s) => format!(" (did you mean '{s}'?)"),
        None => String::new(),
    };

    Err(crate::ErrorRepr::UnknownLang {
        got: code.to_owned(),
        suggestion,
        valid: valid.join(", "),
    })
}
/// Coarse script category tracked while transliterating, used to decide
/// where a separating space is inserted between adjacent CJK output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ScriptClass {
/// Nothing classified yet; the initial `prev_class` in `transliterate_run`.
None,
/// Code points accepted by `is_cjk_ideograph`.
Ideograph,
/// Code points accepted by `is_hangul`.
Hangul,
/// Code points accepted by `is_kana`.
Kana,
/// Assigned after an ASCII run and after a successful NFKC fallback.
Latin,
/// Code points accepted by `is_indic`.
Indic,
/// Everything else, including characters that could not be mapped.
Other,
}
/// Transliterates `text` and fails on the first character that cannot be
/// transliterated.
///
/// The work is done in a single pass in `ErrorMode::Ignore` with
/// `stop_on_first` enabled, so processing stops at the first offender.
///
/// # Errors
/// Returns [`crate::ErrorRepr::Untranslatable`] with the offending character
/// and its byte offset in `text`.
pub(crate) fn transliterate_strict(
    text: &str,
    lang: Option<&str>,
    strict_iso9: bool,
    gost7034: bool,
    tones: bool,
) -> Result<String, crate::ErrorRepr> {
    let mut offenders: Vec<(char, usize)> = Vec::new();
    let converted = transliterate_impl_inner(
        text,
        lang,
        ErrorMode::Ignore,
        "",
        strict_iso9,
        gost7034,
        tones,
        Some(&mut offenders),
        true,
    );
    match offenders.first() {
        Some(&(ch, byte_offset)) => Err(crate::ErrorRepr::Untranslatable { ch, byte_offset }),
        None => Ok(converted.into_owned()),
    }
}
/// Rejects argument combinations that cannot be used together.
///
/// Checks run in this order and the first failure wins:
/// 1. `lang` together with `target`;
/// 2. `context` together with `target`, `tones`, or `errors == "strict"`;
/// 3. with `target` set, any forward-only parameter that differs from its
///    default (`errors != "replace"`, `replace_with != "[?]"`, or a set
///    `strict_iso9` / `gost7034` / `tones` flag).
///
/// # Errors
/// Returns the matching [`crate::ErrorRepr`] variant; for case 3 the
/// offending parameter names are sorted and joined with `", "`.
pub(crate) fn validate_transliterate_args(
    lang: Option<&str>,
    target: Option<&str>,
    errors: &str,
    replace_with: &str,
    strict_iso9: bool,
    gost7034: bool,
    tones: bool,
    context: bool,
) -> Result<(), crate::ErrorRepr> {
    let has_target = target.is_some();

    if has_target && lang.is_some() {
        return Err(crate::ErrorRepr::LangTargetExclusive);
    }
    if context {
        if has_target {
            return Err(crate::ErrorRepr::ContextTargetExclusive);
        }
        if tones {
            return Err(crate::ErrorRepr::TonesWithContext);
        }
        if errors == "strict" {
            return Err(crate::ErrorRepr::StrictWithContext);
        }
    }
    if !has_target {
        return Ok(());
    }

    // Each entry pairs "was this parameter moved off its default?" with its name.
    let overrides = [
        (errors != "replace", "errors"),
        (replace_with != "[?]", "replace_with"),
        (strict_iso9, "strict_iso9"),
        (gost7034, "gost7034"),
        (tones, "tones"),
    ];
    let mut offending: Vec<&str> = overrides
        .iter()
        .filter(|(is_set, _)| *is_set)
        .map(|&(_, name)| name)
        .collect();
    if offending.is_empty() {
        return Ok(());
    }
    offending.sort_unstable();
    Err(crate::ErrorRepr::ForwardOnlyWithTarget {
        names: offending.join(", "),
    })
}
/// Context-aware transliteration backed by a per-language dictionary.
///
/// `lang == Some("he")` uses the Hebrew dictionary; `Some("fa")` tries the
/// Persian dictionary and falls back to the Arabic one when the Persian
/// lookup yields `Ok(None)`; every other value uses the Arabic dictionary.
/// Registered replacements are applied to `text` before the dictionary pass,
/// and each word is handed to [`transliterate_impl`] with tones disabled.
///
/// # Errors
/// * `MutuallyExclusiveBare` when both `strict_iso9` and `gost7034` are set.
/// * Whatever `validate_lang`, `ErrorMode::parse`, or
///   `apply_replacements_bounded` return.
/// * `ContextDictNotFound` / `ContextDictCorrupt` when the dictionary lookup
///   yields `Ok(None)` / `Err(_)`.
pub(crate) fn transliterate_context(
    text: &str,
    lang: Option<&str>,
    errors: &str,
    replace_with: &str,
    strict_iso9: bool,
    gost7034: bool,
) -> Result<String, crate::ErrorRepr> {
    if strict_iso9 && gost7034 {
        return Err(crate::ErrorRepr::MutuallyExclusiveBare);
    }
    validate_lang(lang)?;
    let error_mode = ErrorMode::parse(errors)?;
    let replaced = apply_replacements_bounded(text)?;
    let text: &str = replaced.as_ref();

    let lookup: Result<Option<&'static crate::context::ContextDict>, &'static str> = match lang {
        Some("he") => crate::context::get_hebrew_dict(),
        Some("fa") => match crate::context::get_persian_dict() {
            // No Persian dictionary available: use the Arabic one instead.
            Ok(None) => crate::context::get_arabic_dict(),
            Ok(Some(persian)) => Ok(Some(persian)),
            Err(reason) => Err(reason),
        },
        _ => crate::context::get_arabic_dict(),
    };
    // Human-readable name used only in the error variants below.
    let lang_name = match lang {
        Some("he") => "Hebrew",
        Some("fa") => "Arabic/Persian",
        _ => "Arabic",
    };

    let dict = match lookup {
        Ok(Some(dict)) => dict,
        Ok(None) => {
            return Err(crate::ErrorRepr::ContextDictNotFound {
                lang: lang_name.to_owned(),
            });
        }
        Err(corrupt_msg) => {
            return Err(crate::ErrorRepr::ContextDictCorrupt {
                lang: lang_name.to_owned(),
                reason: corrupt_msg.to_owned(),
            });
        }
    };

    Ok(crate::context::transliterate_context(
        text,
        lang,
        dict,
        |word, word_lang| {
            transliterate_impl(
                word,
                word_lang,
                error_mode,
                replace_with,
                strict_iso9,
                gost7034,
                false,
            )
            .into_owned()
        },
    ))
}
/// Transliterates `text` without collecting untranslatable characters.
///
/// Thin wrapper around `transliterate_impl_inner` (no collector, no early
/// stop). With the `log` feature enabled and debug logging active for
/// `crate::obs::TARGET`, it also emits one debug record with input/output
/// sizes, the sanitised language code, the flags, and the elapsed time.
pub(crate) fn transliterate_impl<'a>(
    text: &'a str,
    lang: Option<&str>,
    error_mode: ErrorMode,
    replace_with: &str,
    strict_iso9: bool,
    gost7034: bool,
    tones: bool,
) -> Cow<'a, str> {
    // Read the clock only when the debug record will actually be written.
    #[cfg(feature = "log")]
    let timer = if log::log_enabled!(target: crate::obs::TARGET, log::Level::Debug) {
        Some(std::time::Instant::now())
    } else {
        None
    };

    let converted = transliterate_impl_inner(
        text,
        lang,
        error_mode,
        replace_with,
        strict_iso9,
        gost7034,
        tones,
        None,
        false,
    );

    #[cfg(feature = "log")]
    if let Some(timer) = timer {
        // The language code is caller-supplied, so it is sanitised before logging.
        let lang_log = lang
            .map(|code| crate::log_injection::strip_log_injection_str(code, "\u{FFFD}", true));
        tl_debug!(
            "transliterate: in_bytes={} out_bytes={} lang={lang_log:?} mode={error_mode:?} \
             iso9={strict_iso9} gost={gost7034} tones={tones} borrowed={} dur_us={}",
            text.len(),
            converted.len(),
            matches!(converted, Cow::Borrowed(_)),
            timer.elapsed().as_micros(),
        );
    }

    converted
}
/// Lists every character of `text` that cannot be transliterated, paired
/// with its byte offset.
///
/// Runs the full pipeline in `ErrorMode::Ignore` without stopping early and
/// discards the transliterated text; only the collected offenders are kept.
pub(crate) fn find_untranslatable_impl(
    text: &str,
    lang: Option<&str>,
    strict_iso9: bool,
    gost7034: bool,
    tones: bool,
) -> Vec<(char, usize)> {
    let mut offenders = Vec::new();
    drop(transliterate_impl_inner(
        text,
        lang,
        ErrorMode::Ignore,
        "",
        strict_iso9,
        gost7034,
        tones,
        Some(&mut offenders),
        false,
    ));
    offenders
}
/// Shared driver behind every transliteration entry point.
///
/// * Pure-ASCII input is returned borrowed and untouched.
/// * `lang == Some("auto")` is replaced by whatever
///   `crate::scripts::resolve_auto_lang` returns for `text`.
/// * The per-character lookup is chosen by flag, `strict_iso9` taking
///   precedence over `gost7034`; with neither set, the built-in language
///   map, then the registered language table, then the default table are
///   consulted in that order.
///
/// `untranslatable`, when supplied, receives `(char, byte_offset)` pairs;
/// `stop_on_first` ends the run at the first such character.
#[allow(clippy::too_many_arguments)]
fn transliterate_impl_inner<'a>(
    text: &'a str,
    lang: Option<&str>,
    error_mode: ErrorMode,
    replace_with: &str,
    strict_iso9: bool,
    gost7034: bool,
    tones: bool,
    untranslatable: Option<&mut Vec<(char, usize)>>,
    stop_on_first: bool,
) -> Cow<'a, str> {
    if text.is_ascii() {
        return Cow::Borrowed(text);
    }

    // Keeps the auto-detected code alive for as long as `lang` borrows it.
    let wants_auto = lang == Some("auto");
    let detected: Option<String> = if wants_auto {
        crate::scripts::resolve_auto_lang(text)
    } else {
        None
    };
    let lang = if wants_auto { detected.as_deref() } else { lang };

    // Each arm passes a differently-typed closure, so the call is repeated.
    match (strict_iso9, gost7034) {
        (true, _) => transliterate_run(
            text,
            |ch| {
                tables::lookup_iso9(ch)
                    .map(Cow::Borrowed)
                    .or_else(|| default_lookup(ch, tones))
            },
            error_mode,
            replace_with,
            lang,
            strict_iso9,
            gost7034,
            tones,
            untranslatable,
            stop_on_first,
        ),
        (false, true) => transliterate_run(
            text,
            |ch| {
                tables::lookup_gost7034(ch)
                    .map(Cow::Borrowed)
                    .or_else(|| default_lookup(ch, tones))
            },
            error_mode,
            replace_with,
            lang,
            strict_iso9,
            gost7034,
            tones,
            untranslatable,
            stop_on_first,
        ),
        (false, false) => {
            let builtin_lang_map = lang.and_then(tables::resolve_lang_map);
            transliterate_run(
                text,
                |ch| {
                    builtin_lang_map
                        .and_then(|m| m.get(&ch).copied().map(Cow::Borrowed))
                        .or_else(|| lang.and_then(|l| tables::lookup_registered(l, ch)))
                        .or_else(|| default_lookup(ch, tones))
                },
                error_mode,
                replace_with,
                lang,
                strict_iso9,
                gost7034,
                tones,
                untranslatable,
                stop_on_first,
            )
        }
    }
}
/// Looks `ch` up in the default table, using the toned variant of the table
/// when `tones` is set.
#[inline]
fn default_lookup(ch: char, tones: bool) -> Option<Cow<'static, str>> {
    let hit = if tones {
        tables::lookup_default_toned(ch)
    } else {
        tables::lookup_default(ch)
    };
    hit.map(Cow::Borrowed)
}
/// Returns the index of the first byte in `bytes` with its high bit set, or
/// `bytes.len()` when every byte is ASCII.
///
/// Whole 8-byte words are tested at once against a high-bit mask; the word
/// containing the first hit (and any tail shorter than a word) is then
/// scanned byte by byte.
#[inline]
fn first_non_ascii(bytes: &[u8]) -> usize {
    const CHUNK: usize = 8;
    const HIGH_BITS: u64 = 0x8080_8080_8080_8080;

    // Number of leading 8-byte words that are entirely ASCII.
    let clean_words = bytes
        .chunks_exact(CHUNK)
        .take_while(|word| {
            let packed = u64::from_ne_bytes(
                (*word)
                    .try_into()
                    .expect("slice is exactly CHUNK bytes"),
            );
            packed & HIGH_BITS == 0
        })
        .count();

    let scanned = clean_words * CHUNK;
    let tail_ascii = bytes[scanned..]
        .iter()
        .take_while(|&&b| b < 0x80)
        .count();
    scanned + tail_ascii
}
/// Core transliteration loop: walks `text`, copies ASCII runs verbatim and
/// maps every other character through `lookup`.
///
/// State carried across iterations:
/// * `prev_class` / `last_appended` decide whether a space is inserted at
///   script boundaries (see `needs_cjk_space`);
/// * `last_was_indic_consonant` lets a following virama or dependent vowel
///   remove a trailing `a` from the output.
///
/// Characters without a usable mapping go to `handle_unmapped`; when
/// `stop_on_first` is set the loop ends at the first one it reports as
/// genuinely untranslatable. The result is always `Cow::Owned`.
#[allow(clippy::too_many_arguments)]
fn transliterate_run<'a, F>(
text: &'a str,
lookup: F,
error_mode: ErrorMode,
replace_with: &str,
lang: Option<&str>,
strict_iso9: bool,
gost7034: bool,
tones: bool,
mut untranslatable: Option<&mut Vec<(char, usize)>>,
stop_on_first: bool,
) -> Cow<'a, str>
where
F: Fn(char) -> Option<Cow<'static, str>>,
{
let mut result = String::with_capacity(estimate_capacity(text));
let mut prev_class = ScriptClass::None;
let mut last_appended: Option<char> = None;
let mut last_was_indic_consonant = false;
let bytes = text.as_bytes();
let len = bytes.len();
let mut i = 0;
while i < len {
// Fast path: copy a whole run of ASCII bytes in one go.
if bytes[i] < 0x80 {
let run_end = i + first_non_ascii(&bytes[i..]);
let run = &text[i..run_end];
last_was_indic_consonant = false;
let first = bytes[i] as char;
// Separate ASCII alphanumerics from preceding ideograph/Hangul output,
// but only when the output currently ends in an alphanumeric.
if matches!(prev_class, ScriptClass::Ideograph | ScriptClass::Hangul)
&& first.is_ascii_alphanumeric()
{
if let Some(last) = last_appended {
if last.is_alphanumeric() {
result.push(' ');
}
}
}
result.push_str(run);
// The run is non-empty (bytes[i] is ASCII), so run_end - 1 >= i.
last_appended = Some(bytes[run_end - 1] as char);
prev_class = ScriptClass::Latin;
i = run_end;
continue;
}
let ch = text[i..]
.chars()
.next()
.expect("i is at a char boundary inside the string");
let byte_offset = i;
i += ch.len_utf8();
let char_class = classify_char(ch);
let is_cjk = matches!(
char_class,
ScriptClass::Ideograph | ScriptClass::Hangul | ScriptClass::Kana
);
let mut mapped: Option<Cow<'static, str>> = lookup(ch);
if char_class == ScriptClass::Indic {
let role = indic_char_role(ch as u32);
match role {
// A virama or dependent vowel right after a consonant drops the
// trailing 'a' that the consonant's mapping left in the output.
IndicRole::Virama | IndicRole::DependentVowel if last_was_indic_consonant => {
if result.ends_with('a') {
result.pop();
}
last_was_indic_consonant = false;
// An unmapped sign is given an empty mapping rather than
// being left as `None`.
if mapped.is_none() {
mapped = Some(Cow::Borrowed(""));
}
}
IndicRole::Consonant => {
last_was_indic_consonant = true;
}
_ => {
last_was_indic_consonant = false;
}
}
} else {
last_was_indic_consonant = false;
}
// An empty mapping counts as mapped except in Preserve mode, where the
// character is routed to `handle_unmapped` instead.
let is_mapped = match mapped.as_deref() {
Some(s) if !s.is_empty() => true,
Some(_) => error_mode != ErrorMode::Preserve, None => false,
};
if is_mapped {
let s = mapped.as_deref().unwrap();
// Insert a space at a CJK boundary when the output ends in an alphanumeric.
if is_cjk && prev_class != ScriptClass::None && needs_cjk_space(prev_class, char_class)
{
if let Some(last) = last_appended {
if last.is_alphanumeric() {
result.push(' ');
last_appended = Some(' ');
}
}
}
result.push_str(s);
// An empty mapping leaves `last_appended` untouched.
if let Some(c) = s.chars().next_back() {
last_appended = Some(c);
}
prev_class = char_class;
} else {
let genuinely_untranslatable = handle_unmapped(
ch,
byte_offset,
&mut result,
&mut last_appended,
&mut prev_class,
untranslatable.as_deref_mut(),
lang,
error_mode,
replace_with,
strict_iso9,
gost7034,
tones,
);
if stop_on_first && genuinely_untranslatable {
break;
}
}
}
Cow::Owned(result)
}
/// Slow path for a character that has no usable mapping.
///
/// First tries the NFKC form of `ch`: when it differs from `ch` and
/// transliterating it yields non-empty text, that text is appended,
/// `prev_class` becomes `Latin`, and `false` is returned.
///
/// Otherwise the character is recorded in `untranslatable` (when supplied),
/// handled according to `error_mode`, `prev_class` becomes `Other`, and
/// `true` is returned.
///
/// `last_appended` is only updated when something was actually appended, so
/// `Replace` with an empty `replace_with` keeps the spacing context exactly
/// like `Ignore` does.
#[cold]
#[inline(never)]
#[allow(clippy::too_many_arguments)]
fn handle_unmapped(
    ch: char,
    byte_offset: usize,
    result: &mut String,
    last_appended: &mut Option<char>,
    prev_class: &mut ScriptClass,
    untranslatable: Option<&mut Vec<(char, usize)>>,
    lang: Option<&str>,
    error_mode: ErrorMode,
    replace_with: &str,
    strict_iso9: bool,
    gost7034: bool,
    tones: bool,
) -> bool {
    // Decompose into a stack buffer first; fall back to the heap only when
    // the NFKC form does not fit.
    const NFKC_STACK_BYTES: usize = 80;
    let mut buf = [0u8; NFKC_STACK_BYTES];
    let mut blen = 0usize;
    let mut overflow = false;
    for d in ch.nfkc() {
        if blen + d.len_utf8() > buf.len() {
            overflow = true;
            break;
        }
        blen += d.encode_utf8(&mut buf[blen..]).len();
    }
    let heap: String;
    let decomposed: &str = if overflow {
        heap = ch.nfkc().collect();
        heap.as_str()
    } else {
        std::str::from_utf8(&buf[..blen]).expect("nfkc output is valid utf8")
    };

    // Recurse only when NFKC changed the character, otherwise the same
    // lookup would simply fail again.
    let nfkc_unchanged = decomposed.len() == ch.len_utf8() && decomposed.starts_with(ch);
    if !nfkc_unchanged {
        let sub = transliterate_impl(
            decomposed,
            lang,
            error_mode,
            replace_with,
            strict_iso9,
            gost7034,
            tones,
        );
        if !sub.is_empty() {
            result.push_str(&sub);
            *last_appended = sub.chars().next_back();
            *prev_class = ScriptClass::Latin;
            return false;
        }
    }

    if let Some(collector) = untranslatable {
        collector.push((ch, byte_offset));
    }
    match error_mode {
        ErrorMode::Replace => {
            result.push_str(replace_with);
            // An empty replacement appends nothing, so the previous last
            // character must stay in place for later spacing decisions.
            if let Some(c) = replace_with.chars().next_back() {
                *last_appended = Some(c);
            }
        }
        ErrorMode::Ignore => {}
        ErrorMode::Preserve => {
            result.push(ch);
            *last_appended = Some(ch);
        }
    }
    *prev_class = ScriptClass::Other;
    true
}
/// Number of leading characters inspected when estimating output size.
const CAPACITY_SAMPLE_CHARS: usize = 256;
/// Extra output bytes budgeted per input byte, scaled by the sampled share
/// of expanding characters.
const CJK_EXTRA_BYTES_PER_BYTE: usize = 3;

/// Estimates an initial capacity for the output buffer.
///
/// Samples up to [`CAPACITY_SAMPLE_CHARS`] leading characters and counts
/// those in `ur::CJK_CAPACITY_RANGE`, `ur::HANGUL_SYLLABLES`, or
/// `ur::CJK_COMPAT`. With none found the hint is `text.len()`; otherwise it
/// grows by `text.len() * CJK_EXTRA_BYTES_PER_BYTE * expanding / sampled`.
/// The result never exceeds `MAX_CAPACITY_HINT`.
fn estimate_capacity(text: &str) -> usize {
    let (sampled, expanding) = text.chars().take(CAPACITY_SAMPLE_CHARS).fold(
        (0usize, 0usize),
        |(seen, wide), c| {
            let cp = c as u32;
            let expands = ur::CJK_CAPACITY_RANGE.contains(&cp)
                || ur::HANGUL_SYLLABLES.contains(&cp)
                || ur::CJK_COMPAT.contains(&cp);
            (seen + 1, wide + usize::from(expands))
        },
    );

    let base = text.len();
    if expanding == 0 {
        return base.min(MAX_CAPACITY_HINT);
    }

    // Saturating arithmetic keeps huge inputs from overflowing before the clamp.
    let scaled = base
        .saturating_mul(CJK_EXTRA_BYTES_PER_BYTE)
        .saturating_mul(expanding);
    let extra = scaled / sampled.max(1);
    base.saturating_add(extra).min(MAX_CAPACITY_HINT)
}
/// Lowest code point `classify_char` looks up; anything below it is `ScriptClass::Other`.
const CLASSIFY_LO: u32 = *ur::INDIC.start();
/// Highest code point `classify_char` looks up; anything above it is `ScriptClass::Other`.
const CLASSIFY_HI: u32 = *ur::KATAKANA_HALFWIDTH.end();
/// Classification of one 256-code-point page, stored in `BLOCK_CLASS`.
#[derive(Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
enum BlockClass {
/// Every code point of the page is outside the classified ranges.
Other,
/// Every code point of the page is an ideograph.
Ideograph,
/// Every code point of the page is Hangul.
Hangul,
/// Every code point of the page is kana.
Kana,
/// Every code point of the page is Indic.
Indic,
/// The page holds more than one class; `classify_char` falls back to
/// `classify_char_slow` for it.
Mixed,
}
/// `const`-evaluable membership test: is `cp` inside the inclusive range `r`?
const fn in_r(cp: u32, r: &std::ops::RangeInclusive<u32>) -> bool {
    let lo = *r.start();
    let hi = *r.end();
    lo <= cp && cp <= hi
}
/// `const` classification of a single code point, used to build
/// `BLOCK_CLASS` at compile time. Never returns `BlockClass::Mixed`.
const fn class_of_cp_const(cp: u32) -> BlockClass {
    let ideograph =
        in_r(cp, &ur::CJK_UNIFIED) || in_r(cp, &ur::CJK_EXT_A) || in_r(cp, &ur::CJK_COMPAT);
    if ideograph {
        return BlockClass::Ideograph;
    }

    let hangul = in_r(cp, &ur::HANGUL_SYLLABLES) || in_r(cp, &ur::HANGUL_COMPAT_JAMO);
    if hangul {
        return BlockClass::Hangul;
    }

    let kana = in_r(cp, &ur::HIRAGANA)
        || in_r(cp, &ur::KATAKANA)
        || in_r(cp, &ur::KATAKANA_HALFWIDTH);
    if kana {
        return BlockClass::Kana;
    }

    let indic = in_r(cp, &ur::INDIC)
        || in_r(cp, &ur::TIBETAN)
        || in_r(cp, &ur::MYANMAR)
        || in_r(cp, &ur::KHMER)
        || in_r(cp, &ur::BALINESE)
        || in_r(cp, &ur::JAVANESE)
        || in_r(cp, &ur::SUNDANESE)
        || in_r(cp, &ur::TAI_THAM)
        || in_r(cp, &ur::CHAM)
        || in_r(cp, &ur::BATAK)
        || in_r(cp, &ur::BUGINESE)
        || in_r(cp, &ur::TAGALOG)
        || in_r(cp, &ur::HANUNOO)
        || in_r(cp, &ur::BUHID)
        || in_r(cp, &ur::TAGBANWA)
        || in_r(cp, &ur::MEETEI_MAYEK)
        || in_r(cp, &ur::MEETEI_MAYEK_EXT);
    if indic {
        return BlockClass::Indic;
    }

    BlockClass::Other
}
/// Compile-time table indexed by `cp >> 8`: the class shared by all 256 code
/// points of that page, or `BlockClass::Mixed` when they do not all agree.
const BLOCK_CLASS: [BlockClass; 256] = {
    let mut table = [BlockClass::Other; 256];
    let mut page = 0usize;
    while page < 256 {
        let base = (page as u32) << 8;
        let head = class_of_cp_const(base);
        // Compare every other code point of the page with the first one,
        // stopping at the first disagreement.
        let mut uniform = true;
        let mut low = 1u32;
        while uniform && low < 256 {
            uniform = class_of_cp_const(base | low) as u8 == head as u8;
            low += 1;
        }
        table[page] = if uniform { head } else { BlockClass::Mixed };
        page += 1;
    }
    table
};
const _: () = {
macro_rules! within {
($r:expr) => {
assert!(*$r.start() >= CLASSIFY_LO && *$r.end() <= CLASSIFY_HI);
};
}
within!(ur::CJK_UNIFIED);
within!(ur::CJK_EXT_A);
within!(ur::CJK_COMPAT);
within!(ur::HANGUL_SYLLABLES);
within!(ur::HANGUL_COMPAT_JAMO);
within!(ur::HIRAGANA);
within!(ur::KATAKANA);
within!(ur::KATAKANA_HALFWIDTH);
within!(ur::INDIC);
within!(ur::TIBETAN);
within!(ur::MYANMAR);
within!(ur::KHMER);
within!(ur::BALINESE);
within!(ur::JAVANESE);
within!(ur::SUNDANESE);
within!(ur::TAI_THAM);
within!(ur::CHAM);
within!(ur::BATAK);
within!(ur::BUGINESE);
within!(ur::TAGALOG);
within!(ur::HANUNOO);
within!(ur::BUHID);
within!(ur::TAGBANWA);
within!(ur::MEETEI_MAYEK);
within!(ur::MEETEI_MAYEK_EXT);
};
/// Classifies `ch` with one table lookup per 256-code-point page.
///
/// Code points outside `CLASSIFY_LO..=CLASSIFY_HI` are `Other`; pages marked
/// `Mixed` are resolved by `classify_char_slow`.
#[inline]
fn classify_char(ch: char) -> ScriptClass {
    let cp = ch as u32;
    let in_window = (CLASSIFY_LO..=CLASSIFY_HI).contains(&cp);
    if !in_window {
        return ScriptClass::Other;
    }
    let page = (cp >> 8) as usize;
    match BLOCK_CLASS[page] {
        BlockClass::Mixed => classify_char_slow(ch),
        BlockClass::Other => ScriptClass::Other,
        BlockClass::Indic => ScriptClass::Indic,
        BlockClass::Kana => ScriptClass::Kana,
        BlockClass::Hangul => ScriptClass::Hangul,
        BlockClass::Ideograph => ScriptClass::Ideograph,
    }
}
/// Classifies `ch` by testing the range predicates one after another:
/// ideograph, then Hangul, then kana, then Indic, else `Other`.
#[inline]
fn classify_char_slow(ch: char) -> ScriptClass {
    if is_cjk_ideograph(ch) {
        return ScriptClass::Ideograph;
    }
    if is_hangul(ch) {
        return ScriptClass::Hangul;
    }
    if is_kana(ch) {
        return ScriptClass::Kana;
    }
    if is_indic(ch) {
        return ScriptClass::Indic;
    }
    ScriptClass::Other
}
#[inline]
fn needs_cjk_space(prev: ScriptClass, curr: ScriptClass) -> bool {
use ScriptClass::{Hangul, Ideograph, Indic, Kana, Latin, Other};
matches!(
(prev, curr),
(Ideograph | Kana | Hangul, Ideograph | Hangul)
| (Ideograph | Hangul, Kana)
| (Latin | Other | Indic, Ideograph | Hangul | Kana)
)
}
/// True when `ch` lies in `ur::CJK_UNIFIED`, `ur::CJK_EXT_A`, or `ur::CJK_COMPAT`.
#[inline]
fn is_cjk_ideograph(ch: char) -> bool {
    let cp = ch as u32;
    [&ur::CJK_UNIFIED, &ur::CJK_EXT_A, &ur::CJK_COMPAT]
        .iter()
        .any(|range| range.contains(&cp))
}
/// True when `ch` lies in `ur::HANGUL_SYLLABLES` or `ur::HANGUL_COMPAT_JAMO`.
#[inline]
fn is_hangul(ch: char) -> bool {
    let cp = ch as u32;
    if ur::HANGUL_SYLLABLES.contains(&cp) {
        return true;
    }
    ur::HANGUL_COMPAT_JAMO.contains(&cp)
}
/// True when `ch` lies in any of the ranges this module groups as "Indic"
/// (`ur::INDIC` plus the listed South and Southeast Asian script ranges).
#[inline]
fn is_indic(ch: char) -> bool {
    let cp = ch as u32;
    // Same ranges, same order as the compile-time classifier.
    let ranges = [
        &ur::INDIC,
        &ur::TIBETAN,
        &ur::MYANMAR,
        &ur::KHMER,
        &ur::BALINESE,
        &ur::JAVANESE,
        &ur::SUNDANESE,
        &ur::TAI_THAM,
        &ur::CHAM,
        &ur::BATAK,
        &ur::BUGINESE,
        &ur::TAGALOG,
        &ur::HANUNOO,
        &ur::BUHID,
        &ur::TAGBANWA,
        &ur::MEETEI_MAYEK,
        &ur::MEETEI_MAYEK_EXT,
    ];
    ranges.iter().any(|range| range.contains(&cp))
}
/// Role a code point plays inside an Indic-style syllable, as reported by
/// `indic_char_role` and the per-script `*_char_role` functions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndicRole {
/// No special role.
None,
/// A consonant; a following virama or dependent vowel removes a trailing
/// `a` from the transliterated output.
Consonant,
/// A dependent vowel sign.
DependentVowel,
/// A virama (vowel killer).
Virama,
}
/// Returns the [`IndicRole`] of code point `cp`.
///
/// Dispatches on the 256-code-point page (`cp >> 8`) for the pages handled
/// directly here and defers every other page to `indic_char_role_chain`.
#[inline]
pub fn indic_char_role(cp: u32) -> IndicRole {
    let page = cp >> 8;
    if (0x09..=0x0C).contains(&page) {
        return core_indic_role(cp);
    }
    match page {
        0x0F => tibetan_char_role(cp),
        0x10 => myanmar_char_role(cp),
        0xA9 => javanese_char_role(cp),
        0xAB => meetei_mayek_char_role(cp),
        _ => indic_char_role_chain(cp),
    }
}
/// Role lookup shared by the scripts laid out in 128-code-point blocks: only
/// the offset inside the block (`cp & 0x7F`) is inspected.
#[inline]
fn core_indic_role(cp: u32) -> IndicRole {
    let offset = cp & 0x7F;
    if offset == 0x4D {
        IndicRole::Virama
    } else if matches!(offset, 0x15..=0x39 | 0x58..=0x5F) {
        IndicRole::Consonant
    } else if (0x3E..=0x4C).contains(&offset) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Range-by-range role lookup; the general path behind `indic_char_role`.
///
/// Script ranges are tried in a fixed order and the first match decides.
/// Code points matching none of them are handled by `core_indic_role` when
/// they fall in `0x0900..=0x0D7F`, and are `IndicRole::None` otherwise.
#[inline]
fn indic_char_role_chain(cp: u32) -> IndicRole {
    // Scripts with literal bounds.
    match cp {
        0x0D80..=0x0DFF => return sinhala_char_role(cp),
        0x0F00..=0x0FFF => return tibetan_char_role(cp),
        0x1000..=0x109F => return myanmar_char_role(cp),
        0x1780..=0x17FF => return khmer_char_role(cp),
        0x1B00..=0x1B7F => return balinese_char_role(cp),
        0xA980..=0xA9DF => return javanese_char_role(cp),
        _ => {}
    }

    // Scripts whose bounds come from `unicode_ranges`.
    if ur::SUNDANESE.contains(&cp) {
        sundanese_char_role(cp)
    } else if ur::TAI_THAM.contains(&cp) {
        tai_tham_char_role(cp)
    } else if ur::CHAM.contains(&cp) {
        cham_char_role(cp)
    } else if ur::BATAK.contains(&cp) {
        batak_char_role(cp)
    } else if ur::BUGINESE.contains(&cp) {
        buginese_char_role(cp)
    } else if ur::TAGALOG.contains(&cp) {
        tagalog_char_role(cp)
    } else if ur::HANUNOO.contains(&cp) {
        hanunoo_char_role(cp)
    } else if ur::BUHID.contains(&cp) {
        buhid_char_role(cp)
    } else if ur::TAGBANWA.contains(&cp) {
        tagbanwa_char_role(cp)
    } else if ur::MEETEI_MAYEK.contains(&cp) || ur::MEETEI_MAYEK_EXT.contains(&cp) {
        meetei_mayek_char_role(cp)
    } else if (0x0900..=0x0D7F).contains(&cp) {
        core_indic_role(cp)
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Sinhala rules: `0x0D9A..=0x0DC6` consonant,
/// `0x0DCF..=0x0DDF` and `0x0DF2..=0x0DF3` dependent vowel, `0x0DCA` virama.
#[inline]
pub fn sinhala_char_role(cp: u32) -> IndicRole {
    if cp == 0x0DCA {
        IndicRole::Virama
    } else if (0x0D9A..=0x0DC6).contains(&cp) {
        IndicRole::Consonant
    } else if matches!(cp, 0x0DCF..=0x0DDF | 0x0DF2..=0x0DF3) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Tibetan rules: `0x0F40..=0x0F69` and
/// `0x0F90..=0x0FBC` consonant, `0x0F71..=0x0F7D` dependent vowel,
/// `0x0F84` virama.
#[inline]
pub fn tibetan_char_role(cp: u32) -> IndicRole {
    if cp == 0x0F84 {
        IndicRole::Virama
    } else if matches!(cp, 0x0F40..=0x0F69 | 0x0F90..=0x0FBC) {
        IndicRole::Consonant
    } else if (0x0F71..=0x0F7D).contains(&cp) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Myanmar rules: `0x1000..=0x1021` consonant,
/// `0x102B..=0x1035` and `0x103B..=0x103E` dependent vowel, `0x1039` and
/// `0x103A` virama.
#[inline]
pub fn myanmar_char_role(cp: u32) -> IndicRole {
    if matches!(cp, 0x1039 | 0x103A) {
        IndicRole::Virama
    } else if (0x1000..=0x1021).contains(&cp) {
        IndicRole::Consonant
    } else if matches!(cp, 0x102B..=0x1035 | 0x103B..=0x103E) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Khmer rules: `0x1780..=0x17A2` consonant,
/// `0x17B6..=0x17C5` dependent vowel, `0x17D2` virama.
#[inline]
pub fn khmer_char_role(cp: u32) -> IndicRole {
    if cp == 0x17D2 {
        IndicRole::Virama
    } else if (0x1780..=0x17A2).contains(&cp) {
        IndicRole::Consonant
    } else if (0x17B6..=0x17C5).contains(&cp) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Balinese rules: `0x1B13..=0x1B33` consonant,
/// `0x1B35..=0x1B43` dependent vowel, `0x1B44` virama.
#[inline]
pub fn balinese_char_role(cp: u32) -> IndicRole {
    if cp == 0x1B44 {
        IndicRole::Virama
    } else if (0x1B13..=0x1B33).contains(&cp) {
        IndicRole::Consonant
    } else if (0x1B35..=0x1B43).contains(&cp) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Javanese rules.
///
/// * `0xA98F..=0xA9B2` consonant. The range starts at U+A98F JAVANESE
///   LETTER KA, the first consonant of the block, so that a following
///   virama or vowel sign strips KA's inherent vowel like it does for every
///   other consonant.
/// * `0xA9B4..=0xA9BC` dependent vowel.
/// * `0xA9C0` virama.
#[inline]
pub fn javanese_char_role(cp: u32) -> IndicRole {
    match cp {
        0xA98F..=0xA9B2 => IndicRole::Consonant,
        0xA9B4..=0xA9BC => IndicRole::DependentVowel,
        0xA9C0 => IndicRole::Virama,
        _ => IndicRole::None,
    }
}
/// Role of `cp` under the Sundanese rules.
///
/// * `0x1B8A..=0x1BA0` consonant.
/// * `0x1BA1..=0x1BA9` dependent vowel.
/// * `0x1BAA` (SUNDANESE SIGN PAMAAEH, the visible vowel killer) and
///   `0x1BAB` (SUNDANESE SIGN VIRAMA) virama. Both suppress the inherent
///   vowel of the preceding consonant, so both must strip it.
#[inline]
pub fn sundanese_char_role(cp: u32) -> IndicRole {
    match cp {
        0x1B8A..=0x1BA0 => IndicRole::Consonant,
        0x1BA1..=0x1BA9 => IndicRole::DependentVowel,
        0x1BAA | 0x1BAB => IndicRole::Virama,
        _ => IndicRole::None,
    }
}
/// Role of `cp` under the Tai Tham rules: `0x1A20..=0x1A54` consonant,
/// `0x1A55..=0x1A5E` and `0x1A61..=0x1A72` dependent vowel, `0x1A60` virama.
#[inline]
pub fn tai_tham_char_role(cp: u32) -> IndicRole {
    if cp == 0x1A60 {
        IndicRole::Virama
    } else if (0x1A20..=0x1A54).contains(&cp) {
        IndicRole::Consonant
    } else if matches!(cp, 0x1A55..=0x1A5E | 0x1A61..=0x1A72) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Cham rules: `0xAA00..=0xAA28` consonant,
/// `0xAA29..=0xAA36` dependent vowel, `0xAA4D` virama.
#[inline]
pub fn cham_char_role(cp: u32) -> IndicRole {
    if cp == 0xAA4D {
        IndicRole::Virama
    } else if (0xAA00..=0xAA28).contains(&cp) {
        IndicRole::Consonant
    } else if (0xAA29..=0xAA36).contains(&cp) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Batak rules: `0x1BC0..=0x1BE3` consonant,
/// `0x1BE7..=0x1BEE` dependent vowel, `0x1BF2` and `0x1BF3` virama.
#[inline]
pub fn batak_char_role(cp: u32) -> IndicRole {
    if matches!(cp, 0x1BF2 | 0x1BF3) {
        IndicRole::Virama
    } else if (0x1BC0..=0x1BE3).contains(&cp) {
        IndicRole::Consonant
    } else if (0x1BE7..=0x1BEE).contains(&cp) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Buginese rules: `0x1A00..=0x1A16` consonant,
/// `0x1A17..=0x1A1B` dependent vowel. No code point is reported as a virama.
#[inline]
pub fn buginese_char_role(cp: u32) -> IndicRole {
    if (0x1A00..=0x1A16).contains(&cp) {
        IndicRole::Consonant
    } else if (0x1A17..=0x1A1B).contains(&cp) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Tagalog rules: `0x1703..=0x1711` and `0x171F`
/// consonant, `0x1712` and `0x1713` dependent vowel, `0x1714` virama.
#[inline]
pub fn tagalog_char_role(cp: u32) -> IndicRole {
    if cp == 0x1714 {
        IndicRole::Virama
    } else if matches!(cp, 0x1703..=0x1711 | 0x171F) {
        IndicRole::Consonant
    } else if matches!(cp, 0x1712 | 0x1713) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Hanunoo rules: `0x1723..=0x1731` consonant,
/// `0x1732` and `0x1733` dependent vowel, `0x1734` virama.
#[inline]
pub fn hanunoo_char_role(cp: u32) -> IndicRole {
    if cp == 0x1734 {
        IndicRole::Virama
    } else if (0x1723..=0x1731).contains(&cp) {
        IndicRole::Consonant
    } else if matches!(cp, 0x1732 | 0x1733) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// Role of `cp` under the Buhid rules.
///
/// * `0x1743..=0x1751` consonant.
/// * `0x1752` (BUHID VOWEL SIGN I) and `0x1753` (BUHID VOWEL SIGN U)
///   dependent vowel.
///
/// The Buhid block encodes no virama, so none is reported.
#[inline]
pub fn buhid_char_role(cp: u32) -> IndicRole {
    match cp {
        0x1743..=0x1751 => IndicRole::Consonant,
        0x1752 | 0x1753 => IndicRole::DependentVowel,
        _ => IndicRole::None,
    }
}
/// Role of `cp` under the Tagbanwa rules.
///
/// * `0x1763..=0x1770` consonant.
/// * `0x1772` (TAGBANWA VOWEL SIGN I) and `0x1773` (TAGBANWA VOWEL SIGN U)
///   dependent vowel.
///
/// The Tagbanwa block encodes no virama, so none is reported.
#[inline]
pub fn tagbanwa_char_role(cp: u32) -> IndicRole {
    match cp {
        0x1763..=0x1770 => IndicRole::Consonant,
        0x1772 | 0x1773 => IndicRole::DependentVowel,
        _ => IndicRole::None,
    }
}
/// Role of `cp` under the Meetei Mayek rules: `0xABC0..=0xABE2` consonant,
/// `0xABE3..=0xABEA` dependent vowel, `0xABED` virama.
#[inline]
pub fn meetei_mayek_char_role(cp: u32) -> IndicRole {
    if cp == 0xABED {
        IndicRole::Virama
    } else if (0xABC0..=0xABE2).contains(&cp) {
        IndicRole::Consonant
    } else if (0xABE3..=0xABEA).contains(&cp) {
        IndicRole::DependentVowel
    } else {
        IndicRole::None
    }
}
/// True when `ch` lies in `ur::HIRAGANA`, `ur::KATAKANA`, or
/// `ur::KATAKANA_HALFWIDTH`.
#[inline]
fn is_kana(ch: char) -> bool {
    let cp = ch as u32;
    [&ur::HIRAGANA, &ur::KATAKANA, &ur::KATAKANA_HALFWIDTH]
        .iter()
        .any(|range| range.contains(&cp))
}
/// Returns a new `String` holding `text` with combining marks removed.
///
/// Allocating convenience wrapper around [`strip_accents_into`].
pub(crate) fn strip_accents(text: &str) -> String {
    let mut stripped = String::new();
    strip_accents_into(text, &mut stripped);
    stripped
}
/// Like [`strip_accents`], but borrows the input when stripping would not
/// change it.
///
/// The input is returned borrowed only when it is pure ASCII, or when it is
/// already in NFC *and* its NFD form contains no combining mark. The NFC
/// check matters: `strip_accents` always re-composes its output, so input
/// such as U+2126 OHM SIGN or decomposed Hangul jamo has no combining marks
/// yet is still rewritten, and returning it borrowed would make the two
/// functions disagree.
pub(crate) fn strip_accents_cow(text: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;
    use unicode_normalization::UnicodeNormalization;
    if text.is_ascii() {
        return Cow::Borrowed(text);
    }
    // For NFC input without marks, NFD -> filter -> NFC is the identity.
    let unchanged = unicode_normalization::is_nfc(text)
        && !text
            .nfd()
            .any(unicode_normalization::char::is_combining_mark);
    if unchanged {
        return Cow::Borrowed(text);
    }
    Cow::Owned(strip_accents(text))
}
/// Writes `text` with combining marks removed into `out`, replacing whatever
/// `out` held before.
///
/// ASCII input is copied as is. Otherwise the text is decomposed (NFD),
/// combining marks are dropped, and the remainder is re-composed (NFC).
pub fn strip_accents_into(text: &str, out: &mut String) {
    use unicode_normalization::UnicodeNormalization;
    out.clear();
    if text.is_ascii() {
        out.push_str(text);
    } else {
        let without_marks = text
            .nfd()
            .filter(|c| !unicode_normalization::char::is_combining_mark(*c));
        out.extend(without_marks.nfc());
    }
}
/// Guard for mutating operations on the registration tables.
///
/// # Errors
/// Returns [`crate::ErrorRepr::Sealed`] naming `op` when
/// `tables::registrations_sealed()` reports that registrations are sealed.
pub(crate) fn check_not_sealed(op: &str) -> Result<(), crate::ErrorRepr> {
    if tables::registrations_sealed() {
        Err(crate::ErrorRepr::Sealed { op: op.to_owned() })
    } else {
        Ok(())
    }
}
/// Registers (or replaces) the custom mapping table for language `code`.
///
/// # Errors
/// * `Sealed` when registrations are sealed.
/// * `RegisterLangLimit` when the table count has reached
///   `tables::MAX_REGISTERED_LANGS` and `code` is not already registered.
/// * `RegisterLangBadKeys` listing, debug-formatted and comma-separated, the
///   keys `tables::register_lang` rejected.
pub(crate) fn register_lang(
    code: &str,
    mappings: HashMap<String, String>,
) -> Result<(), crate::ErrorRepr> {
    check_not_sealed("register_lang")?;

    // At the cap, only re-registering an existing code is still allowed.
    let at_capacity = tables::registered_lang_count() >= tables::MAX_REGISTERED_LANGS;
    if at_capacity && !tables::has_registered_lang(code) {
        return Err(crate::ErrorRepr::RegisterLangLimit {
            max: tables::MAX_REGISTERED_LANGS,
        });
    }

    match tables::register_lang(code, mappings) {
        Ok(()) => Ok(()),
        Err(bad_keys) => {
            let quoted: Vec<String> = bad_keys.iter().map(|k| format!("{k:?}")).collect();
            Err(crate::ErrorRepr::RegisterLangBadKeys {
                keys: quoted.join(", "),
            })
        }
    }
}
/// Adds `replacements` to the global replacement table.
///
/// # Errors
/// * `Sealed` when registrations are sealed.
/// * `RegisterReplacementsLimit` carrying `tables::MAX_REPLACEMENTS` and the
///   value `tables::register_replacements` returned in its `Err`.
pub(crate) fn register_replacements(
    replacements: HashMap<String, String>,
) -> Result<(), crate::ErrorRepr> {
    check_not_sealed("register_replacements")?;
    match tables::register_replacements(replacements) {
        Ok(()) => Ok(()),
        Err(projected) => Err(crate::ErrorRepr::RegisterReplacementsLimit {
            max: tables::MAX_REPLACEMENTS,
            projected,
        }),
    }
}
/// Removes the replacement registered under `key` and passes through the
/// boolean returned by `tables::remove_replacement`.
///
/// # Errors
/// Returns `Sealed` when registrations are sealed; the table is not touched
/// in that case.
pub(crate) fn remove_replacement(key: &str) -> Result<bool, crate::ErrorRepr> {
    check_not_sealed("remove_replacement").map(|()| tables::remove_replacement(key))
}
/// Removes every registered replacement.
///
/// # Errors
/// Returns `Sealed` when registrations are sealed; the table is not touched
/// in that case.
pub(crate) fn clear_replacements() -> Result<(), crate::ErrorRepr> {
    check_not_sealed("clear_replacements").map(|()| {
        tables::clear_replacements();
    })
}
// Unit and property tests for the transliteration core.
#[cfg(test)]
mod tests {
use super::*;
// `transliterate_strict` must return the same text as an Ignore-mode run
// when nothing is untranslatable, and otherwise report the same first
// offender as `find_untranslatable_impl`.
#[test]
fn strict_single_pass_matches_reference() {
assert_eq!(
transliterate_strict("café Москва", None, false, false, false).unwrap(),
transliterate_impl(
"café Москва",
None,
ErrorMode::Ignore,
"",
false,
false,
false
)
.into_owned()
);
// Guards against the loop below passing without ever reaching the error arm.
let mut hit_error_branch = false;
for s in [
"a\u{E000}b\u{E000}",
"café\u{E000}",
"\u{1F980}x",
"abc",
"Привет",
] {
let reference = find_untranslatable_impl(s, None, false, false, false)
.into_iter()
.next();
match (
transliterate_strict(s, None, false, false, false),
reference,
) {
(Err(crate::ErrorRepr::Untranslatable { ch, byte_offset }), Some((ech, eoff))) => {
assert_eq!(
(ch, byte_offset),
(ech, eoff),
"strict first-untranslatable mismatch for {s:?}"
);
hit_error_branch = true;
}
// Both agree that nothing is untranslatable; any other pairing is a bug.
(Ok(_), None) => {} (got, exp) => panic!("strict/reference disagree for {s:?}: {got:?} vs {exp:?}"),
}
}
assert!(
hit_error_branch,
"test never exercised the strict Error::Untranslatable branch"
);
}
// Exhaustive check: the page-table fast path agrees with the range chain
// for every valid `char`.
#[test]
fn classify_char_matches_slow() {
for cp in 0u32..=0x10_FFFF {
if let Some(ch) = char::from_u32(cp) {
assert_eq!(
classify_char(ch),
classify_char_slow(ch),
"classify_char disagrees with chain at U+{cp:04X}"
);
}
}
}
// Exhaustive check: the page dispatch in `indic_char_role` agrees with
// `indic_char_role_chain` for every code point value.
#[test]
fn indic_role_matches_chain() {
for cp in 0u32..=0x10_FFFF {
assert_eq!(
indic_char_role(cp),
indic_char_role_chain(cp),
"indic_char_role disagrees with chain at U+{cp:04X}"
);
}
}
// Forwards its arguments unchanged to `validate_transliterate_args`.
fn validate(
lang: Option<&str>,
target: Option<&str>,
errors: &str,
replace_with: &str,
strict_iso9: bool,
gost7034: bool,
tones: bool,
context: bool,
) -> Result<(), crate::ErrorRepr> {
validate_transliterate_args(
lang,
target,
errors,
replace_with,
strict_iso9,
gost7034,
tones,
context,
)
}
// Renders the error of a failed validation; panics when `r` is `Ok`.
fn err_msg(r: Result<(), crate::ErrorRepr>) -> String {
r.unwrap_err().to_string()
}
// Default arguments are accepted on their own, with only `lang`, and with
// only `target`.
#[test]
fn validate_accepts_defaults() {
assert!(validate(None, None, "replace", "[?]", false, false, false, false).is_ok());
assert!(validate(
Some("ru"),
None,
"replace",
"[?]",
false,
false,
false,
false
)
.is_ok());
assert!(validate(
None,
Some("ru"),
"replace",
"[?]",
false,
false,
false,
false
)
.is_ok());
}
#[test]
fn validate_rejects_lang_and_target() {
let r = validate(
Some("ru"),
Some("ru"),
"replace",
"[?]",
false,
false,
false,
false,
);
assert!(err_msg(r).contains("'lang' and 'target' are mutually exclusive"));
}
#[test]
fn validate_rejects_context_with_target() {
let r = validate(
None,
Some("ar"),
"replace",
"[?]",
false,
false,
false,
true,
);
assert!(err_msg(r).contains("'context' and 'target' are mutually exclusive"));
}
#[test]
fn validate_rejects_tones_with_context() {
let r = validate(None, None, "replace", "[?]", false, false, true, true);
assert!(err_msg(r).contains("'tones' cannot be used with 'context'"));
}
#[test]
fn validate_rejects_strict_with_context() {
let r = validate(None, None, "strict", "[?]", false, false, false, true);
assert!(err_msg(r).contains("errors='strict' cannot be used with 'context'"));
}
// The offending parameter names must be reported in sorted order.
#[test]
fn validate_rejects_forward_only_with_target_sorted() {
let r = validate(None, Some("ru"), "ignore", "[?]", false, true, true, false);
let msg = err_msg(r);
assert!(msg.contains(
"forward-only parameters (errors, gost7034, tones) cannot be used with 'target'"
));
}
// A non-default `replace_with` alone is enough to be rejected with `target`.
#[test]
fn validate_rejects_replace_with_override_with_target() {
let r = validate(None, Some("ru"), "replace", "X", false, false, false, false);
assert!(err_msg(r)
.contains("forward-only parameters (replace_with) cannot be used with 'target'"));
}
#[test]
fn test_ascii_passthrough() {
let result = transliterate_impl(
"hello",
None,
ErrorMode::Replace,
"[?]",
false,
false,
false,
);
assert_eq!(result, "hello");
}
// Sanity check of the standard-library predicate the ASCII fast path relies on.
#[test]
fn test_is_ascii() {
assert!("hello".is_ascii());
assert!(!"héllo".is_ascii());
}
// Property-based tests, 1000 generated cases each.
mod proptest_properties {
use super::*;
use proptest::prelude::*;
proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))]
// Ignore mode must never leave a non-ASCII character in the output.
#[test]
fn transliterate_ignore_is_ascii(s in "\\PC*") {
let result = transliterate_impl(&s, None, ErrorMode::Ignore, "", false, false, false);
prop_assert!(
result.is_ascii(),
"Non-ASCII in Ignore output: {:?}",
result.chars().filter(|c: &char| !c.is_ascii()).collect::<Vec<_>>()
);
}
// Preserve mode on non-empty input without whitespace or marks yields non-empty output.
#[test]
fn transliterate_preserve_nonempty(s in "[^\\s\\p{M}]{1,50}") {
let result = transliterate_impl(&s, None, ErrorMode::Preserve, "", false, false, false);
prop_assert!(!result.is_empty());
}
// Stripping accents twice gives the same result as stripping once.
#[test]
fn strip_accents_idempotent(s in "\\PC*") {
let once = strip_accents(&s);
let twice = strip_accents(&once);
prop_assert_eq!(&once, &twice);
}
// The output of `strip_accents` is always in NFC.
#[test]
fn strip_accents_output_is_nfc(s in "\\PC*") {
let result = strip_accents(&s);
prop_assert!(
unicode_normalization::is_nfc(&result),
"strip_accents output not NFC"
);
}
// ASCII letters, digits, and spaces pass through unchanged.
#[test]
fn transliterate_ascii_passthrough(s in "[a-zA-Z0-9 ]{0,100}") {
let result = transliterate_impl(&s, None, ErrorMode::Replace, "[?]", false, false, false);
prop_assert_eq!(&result, &s);
}
}
}
}