pub mod case_folding_data;
mod confusables_data;
pub mod emoji_data;
pub mod hangul;
mod hanzi_pinyin;
mod transliteration;
use std::borrow::Cow;
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::RwLock;
use std::sync::LazyLock;
use crate::unicode_ranges as ur;
/// Number of precomposed Hangul syllables held in the packed romanization
/// table: code points `0xAC00 + i` for every `i` in `0..HANGUL_SYLLABLE_COUNT`.
const HANGUL_SYLLABLE_COUNT: usize = 11_172;
/// Packed romanizations for every Hangul syllable: one concatenated string
/// plus an offset table.
///
/// The romanization of syllable `i` (code point `0xAC00 + i`) is
/// `blob[offsets[i]..offsets[i + 1]]`.
struct HangulRomanizations {
// All romanizations concatenated in code point order.
blob: String,
// Byte offsets into `blob`; begins with 0 and has one entry more than there are syllables.
offsets: Vec<u32>,
}
/// Process-wide cache of the packed Hangul table; filled on first use by
/// `hangul_romanizations`.
static HANGUL_ROMANIZATIONS: std::sync::OnceLock<HangulRomanizations> = std::sync::OnceLock::new();
/// Returns the packed Hangul romanization table, building it on first call.
///
/// Every syllable from `0xAC00` upward (`HANGUL_SYLLABLE_COUNT` of them) is
/// romanized once via `hangul::romanize_hangul`; a syllable for which that
/// returns `None` contributes an empty slice. After each syllable the current
/// blob length is appended to `offsets`, so consecutive offsets delimit one
/// romanization.
///
/// # Panics
///
/// Panics if a code point in the range is not a valid `char` or if the blob
/// length does not fit in `u32`.
fn hangul_romanizations() -> &'static HangulRomanizations {
    HANGUL_ROMANIZATIONS.get_or_init(|| {
        let mut offsets = Vec::with_capacity(HANGUL_SYLLABLE_COUNT + 1);
        let mut blob = String::with_capacity(HANGUL_SYLLABLE_COUNT * 7);
        // Leading sentinel so that syllable `i` spans offsets[i]..offsets[i + 1].
        offsets.push(0u32);

        let first = 0xAC00u32;
        let past_last = first + HANGUL_SYLLABLE_COUNT as u32;
        for cp in first..past_last {
            let syllable =
                char::from_u32(cp).expect("all Hangul syllable codepoints are valid");
            let roman = hangul::romanize_hangul(syllable).unwrap_or_default();
            blob.push_str(&roman);
            let end = u32::try_from(blob.len()).expect("Hangul blob fits in u32");
            offsets.push(end);
        }

        HangulRomanizations { blob, offsets }
    })
}
/// User-registered language tables: language code -> (character -> replacement).
/// Written by `register_lang`, read by the lookup and listing functions.
static LANG_TABLES: LazyLock<RwLock<HashMap<String, HashMap<char, String>>>> =
LazyLock::new(|| RwLock::new(HashMap::new()));
/// Global string replacements: key -> replacement text. This map is the source
/// from which the replacement automaton is rebuilt after every mutation.
static GLOBAL_REPLACEMENTS: LazyLock<RwLock<HashMap<String, String>>> =
LazyLock::new(|| RwLock::new(HashMap::new()));
/// Compiled form of the global replacement table.
struct ReplacementAutomaton {
// Aho-Corasick matcher built over the non-empty replacement keys.
ac: aho_corasick::AhoCorasick,
// Replacement text indexed by pattern id (same order as the keys given to `ac`).
values: Vec<String>,
}
/// Current replacement automaton, or `None` when there are no non-empty keys.
/// Overwritten by `rebuild_replacement_automaton`; read by `apply_replacements`.
static GLOBAL_REPLACEMENTS_AC: LazyLock<RwLock<Option<ReplacementAutomaton>>> =
LazyLock::new(|| RwLock::new(None));
/// Compiles `map` into a leftmost-longest Aho-Corasick automaton.
///
/// Empty keys are ignored. The remaining keys are sorted so that pattern ids
/// are assigned deterministically, and `values[id]` holds the replacement for
/// the pattern with that id. Returns `None` when no non-empty key exists.
///
/// # Panics
///
/// Panics if the automaton cannot be built from the keys.
fn build_replacement_automaton(map: &HashMap<String, String>) -> Option<ReplacementAutomaton> {
    let mut entries: Vec<(&String, &String)> =
        map.iter().filter(|(key, _)| !key.is_empty()).collect();
    if entries.is_empty() {
        return None;
    }
    // Keys are unique, so an unstable sort still yields a single fixed order.
    entries.sort_unstable_by(|a, b| a.0.cmp(b.0));

    let ac = aho_corasick::AhoCorasick::builder()
        .match_kind(aho_corasick::MatchKind::LeftmostLongest)
        .build(entries.iter().map(|&(key, _)| key.as_str()))
        .expect("replacement keys are valid aho-corasick patterns");
    let values = entries.iter().map(|&(_, value)| value.clone()).collect();

    Some(ReplacementAutomaton { ac, values })
}
/// Recompiles the automaton from `map` and publishes it in
/// `GLOBAL_REPLACEMENTS_AC`.
///
/// The automaton is built before the write lock is taken, so the lock is held
/// only for the assignment.
fn rebuild_replacement_automaton(map: &HashMap<String, String>) {
    let fresh = build_replacement_automaton(map);
    *crate::recover_lock(GLOBAL_REPLACEMENTS_AC.write(), "GLOBAL_REPLACEMENTS_AC") = fresh;
}
/// Fast-path flag checked by `apply_replacements` before taking any lock;
/// updated by the replacement mutators to reflect whether the table is non-empty.
static HAS_REPLACEMENTS: AtomicBool = AtomicBool::new(false);
/// Fast-path flag checked by `lookup_registered`; set by `register_lang`.
static HAS_REGISTERED_LANGS: AtomicBool = AtomicBool::new(false);
/// Set by `seal_registrations`; nothing in the visible code resets it.
static REGISTRATIONS_SEALED: AtomicBool = AtomicBool::new(false);
/// Marks registrations as sealed by setting `REGISTRATIONS_SEALED`, then logs it.
///
/// NOTE(review): the registration functions in this file do not consult the
/// flag themselves; presumably callers check `registrations_sealed()` first — confirm.
pub(crate) fn seal_registrations() {
REGISTRATIONS_SEALED.store(true, Ordering::Release);
tl_info!("registrations sealed");
}
/// Returns `true` once `seal_registrations` has been called.
pub(crate) fn registrations_sealed() -> bool {
REGISTRATIONS_SEALED.load(Ordering::Acquire)
}
pub use crate::limits::{MAX_REGISTERED_LANGS, MAX_REPLACEMENTS};
/// Returns the number of user-registered language tables.
pub fn registered_lang_count() -> usize {
crate::recover_lock(LANG_TABLES.read(), "LANG_TABLES").len()
}
/// Returns `true` if a user-registered table exists for exactly `code`.
pub fn has_registered_lang(code: &str) -> bool {
crate::recover_lock(LANG_TABLES.read(), "LANG_TABLES").contains_key(code)
}
/// Extra language codes accepted by `is_valid_lang` in addition to `BUILTIN_LANGS`.
/// These are not added to the output of `list_langs`.
// NOTE(review): "da" is also present in BUILTIN_LANGS, so its entry here is
// redundant for `is_valid_lang` — confirm whether that is intentional.
const LANG_ALIASES: &[&str] = &["nb", "nn", "da"];
/// Returns `true` if `code` is a built-in language, a recognised alias, or a
/// user-registered language.
///
/// The checks run cheapest first: only the last one takes the `LANG_TABLES`
/// read lock.
pub fn is_valid_lang(code: &str) -> bool {
    if BUILTIN_LANGS.binary_search(&code).is_ok() {
        return true;
    }
    if LANG_ALIASES.contains(&code) {
        return true;
    }
    has_registered_lang(code)
}
/// Language codes with built-in support.
///
/// Must stay sorted and free of duplicates: membership is tested with
/// `binary_search`. Entries are laid out one initial letter per row.
#[rustfmt::skip]
const BUILTIN_LANGS: &[&str] = &[
    "am", "ar", "as",
    "ban", "bax", "bg", "bn", "bo", "bug",
    "ca", "chr", "cjm", "cop", "cs", "cy",
    "da", "de", "dv",
    "el", "es", "et",
    "fa", "fi", "fr",
    "ga", "gu",
    "he", "hi", "hr", "hu", "hy",
    "is", "it",
    "ja", "ja-kunrei", "jv",
    "ka", "khb", "km", "kn", "ko",
    "lis", "lo", "lt", "lv",
    "ml", "mn", "mni", "mr", "mt", "my",
    "ne", "nl", "no", "nod", "nqo",
    "or",
    "pa", "pl", "pt",
    "ro", "ru",
    "sa", "sat", "si", "sk", "sl", "sq", "sr", "su", "sv", "syr",
    "ta", "tdd", "te", "th", "tl", "tr", "tzm",
    "uk",
    "vai", "vi",
    "zh",
];
/// Looks up the default (toneless) transliteration of `ch`.
///
/// Characters in the CJK ranges are tried against the Hanzi pinyin table and
/// characters in the Hangul syllable / compatibility jamo ranges against the
/// Hangul tables. Whatever those miss, and every other character, falls back
/// to the general transliteration table.
#[inline]
pub fn lookup_default(ch: char) -> Option<&'static str> {
    let cp = ch as u32;
    let script_specific = if ur::CJK_EXT_A.contains(&cp)
        || ur::CJK_UNIFIED.contains(&cp)
        || ur::CJK_COMPAT.contains(&cp)
    {
        hanzi_pinyin::lookup_hanzi(ch)
    } else if ur::HANGUL_SYLLABLES.contains(&cp) || ur::HANGUL_COMPAT_JAMO.contains(&cp) {
        lookup_hangul_static(ch)
    } else {
        None
    };
    script_specific.or_else(|| transliteration::lookup(ch))
}
/// Same as `lookup_default`, except that CJK characters are resolved through
/// `hanzi_pinyin::lookup_hanzi_toned` instead of the toneless table.
///
/// Hangul handling and the fallback to the general transliteration table are
/// unchanged.
#[inline]
pub fn lookup_default_toned(ch: char) -> Option<&'static str> {
    let cp = ch as u32;

    let in_cjk =
        ur::CJK_EXT_A.contains(&cp) || ur::CJK_UNIFIED.contains(&cp) || ur::CJK_COMPAT.contains(&cp);
    if in_cjk {
        return match hanzi_pinyin::lookup_hanzi_toned(ch) {
            Some(pinyin) => Some(pinyin),
            None => transliteration::lookup(ch),
        };
    }

    let in_hangul = ur::HANGUL_SYLLABLES.contains(&cp) || ur::HANGUL_COMPAT_JAMO.contains(&cp);
    if in_hangul {
        return match lookup_hangul_static(ch) {
            Some(roman) => Some(roman),
            None => transliteration::lookup(ch),
        };
    }

    transliteration::lookup(ch)
}
/// Romanizes a Hangul character without allocating.
///
/// Syllables in `0xAC00..=0xD7A3` are sliced out of the packed table; every
/// other character is handed to `hangul::lookup_compat_jamo`. Returns `None`
/// if the packed table has no valid slice for the syllable.
fn lookup_hangul_static(ch: char) -> Option<&'static str> {
    let code = ch as u32;
    if !(0xAC00..=0xD7A3).contains(&code) {
        return hangul::lookup_compat_jamo(ch);
    }

    let table = hangul_romanizations();
    let idx = (code - 0xAC00) as usize;
    // Consecutive offsets delimit the romanization of syllable `idx`.
    let start = table.offsets.get(idx).copied()? as usize;
    let end = table.offsets.get(idx + 1).copied()? as usize;
    table.blob.get(start..end)
}
/// Looks up `ch` via `transliteration::lookup_iso9` (presumably the ISO 9
/// table, going by the name — the table itself is not visible here).
#[inline]
pub fn lookup_iso9(ch: char) -> Option<&'static str> {
transliteration::lookup_iso9(ch)
}
/// Looks up `ch` via `transliteration::lookup_gost7034` (presumably the
/// GOST 7.0.34 table, going by the name — the table itself is not visible here).
#[inline]
pub fn lookup_gost7034(ch: char) -> Option<&'static str> {
transliteration::lookup_gost7034(ch)
}
/// Looks up the language-specific transliteration of `ch` for `lang`.
///
/// Built-in tables win and are returned as `Cow::Borrowed`; if they have no
/// entry, the user-registered table for `lang` is consulted and a hit is
/// returned as `Cow::Owned`.
pub fn lookup_lang(lang: &str, ch: char) -> Option<Cow<'static, str>> {
    transliteration::lookup_lang(lang, ch)
        .map(Cow::Borrowed)
        .or_else(|| lookup_registered(lang, ch))
}
/// Returns the built-in map for `lang` as resolved by
/// `transliteration::resolve_lang_map`, or `None` if it reports none.
/// User-registered tables are not consulted.
#[inline]
pub fn resolve_lang_map(lang: &str) -> Option<&'static phf::Map<char, &'static str>> {
transliteration::resolve_lang_map(lang)
}
/// Looks up `ch` in the user-registered table for `lang`.
///
/// Returns `None` without touching the lock while `HAS_REGISTERED_LANGS` is
/// unset. A hit is cloned out of the table, so the result is always
/// `Cow::Owned`.
#[inline]
pub fn lookup_registered(lang: &str, ch: char) -> Option<Cow<'static, str>> {
    if HAS_REGISTERED_LANGS.load(Ordering::Acquire) {
        let tables = crate::recover_lock(LANG_TABLES.read(), "LANG_TABLES");
        let replacement = tables.get(lang)?.get(&ch)?;
        Some(Cow::Owned(replacement.clone()))
    } else {
        None
    }
}
/// Looks up the confusable mapping of `ch` for `target_script` by delegating
/// to `confusables_data::lookup`.
#[inline]
pub fn lookup_confusable(ch: char, target_script: &str) -> Option<&'static str> {
confusables_data::lookup(ch, target_script)
}
/// Returns the whole confusables map for `target_script` as resolved by
/// `confusables_data::resolve_map`, or `None` if it reports none.
#[inline]
pub fn resolve_confusable_map(
target_script: &str,
) -> Option<&'static phf::Map<char, &'static str>> {
confusables_data::resolve_map(target_script)
}
/// Returns every built-in language code plus every user-registered code that
/// is not already built in, sorted ascending.
///
/// Codes from `LANG_ALIASES` are not included.
pub fn list_langs() -> Vec<String> {
    let mut langs: Vec<String> = BUILTIN_LANGS.iter().map(|s| (*s).to_string()).collect();

    let registered = crate::recover_lock(LANG_TABLES.read(), "LANG_TABLES");
    langs.extend(
        registered
            .keys()
            .filter(|code| BUILTIN_LANGS.binary_search(&code.as_str()).is_err())
            .cloned(),
    );

    langs.sort();
    langs
}
/// Registers (or replaces) the character table for language `code`.
///
/// Every key in `mappings` must consist of exactly one `char`. If any key is
/// empty or longer, nothing is registered and all offending keys are returned
/// in `Err`. On success the table for `code` is overwritten and
/// `HAS_REGISTERED_LANGS` is updated.
///
/// NOTE(review): neither `MAX_REGISTERED_LANGS` nor the sealed flag is checked
/// here; presumably the caller enforces both — confirm.
pub(crate) fn register_lang(
    code: &str,
    mappings: HashMap<String, String>,
) -> Result<(), Vec<String>> {
    let mut rejected: Vec<String> = Vec::new();
    let mut char_map = HashMap::new();

    for (key, value) in mappings {
        let mut it = key.chars();
        // Exactly one char: the first `next` yields it, the second yields nothing.
        if let (Some(only), None) = (it.next(), it.next()) {
            char_map.insert(only, value);
        } else {
            rejected.push(key);
        }
    }

    if !rejected.is_empty() {
        tl_warn!(
            "register_lang: rejected non-single-char keys count={}",
            rejected.len()
        );
        return Err(rejected);
    }

    // Only the log line needs the count, so it is compiled in with logging.
    #[cfg(feature = "log")]
    let mapping_count = char_map.len();

    let mut table = crate::recover_lock(LANG_TABLES.write(), "LANG_TABLES");
    table.insert(code.to_owned(), char_map);
    HAS_REGISTERED_LANGS.store(!table.is_empty(), Ordering::Release);
    tl_info!("register_lang: code={code:?} mappings={mapping_count}");
    Ok(())
}
/// Merges `replacements` into the global replacement table.
///
/// Existing keys are overwritten. If the merge would push the table past
/// `MAX_REPLACEMENTS` entries, nothing is changed and the projected size is
/// returned in `Err`. On success the automaton is rebuilt and
/// `HAS_REPLACEMENTS` is updated while the table's write lock is still held.
pub(crate) fn register_replacements(replacements: HashMap<String, String>) -> Result<(), usize> {
    let mut table = crate::recover_lock(GLOBAL_REPLACEMENTS.write(), "GLOBAL_REPLACEMENTS");

    // Only keys not already present grow the table.
    let mut projected = table.len();
    for key in replacements.keys() {
        if !table.contains_key(key) {
            projected += 1;
        }
    }
    if projected > MAX_REPLACEMENTS {
        tl_warn!(
            "register_replacements: limit exceeded projected={projected} max={MAX_REPLACEMENTS}"
        );
        return Err(projected);
    }

    table.extend(replacements);
    rebuild_replacement_automaton(&table);
    HAS_REPLACEMENTS.store(!table.is_empty(), Ordering::Release);
    tl_info!("register_replacements: total={}", table.len());
    Ok(())
}
/// Removes the global replacement registered under `key`.
///
/// Returns `true` if an entry was removed and `false` if `key` was not
/// present. The automaton is rebuilt and `HAS_REPLACEMENTS` refreshed only
/// when the table actually changed; a miss leaves all shared state untouched
/// and skips the automaton rebuild.
pub(crate) fn remove_replacement(key: &str) -> bool {
    let mut table = crate::recover_lock(GLOBAL_REPLACEMENTS.write(), "GLOBAL_REPLACEMENTS");
    if table.remove(key).is_none() {
        // Table unchanged: the mutators in this module keep the automaton and
        // the flag in sync with it under this same write lock, so there is
        // nothing to refresh.
        return false;
    }
    rebuild_replacement_automaton(&table);
    HAS_REPLACEMENTS.store(!table.is_empty(), Ordering::Release);
    true
}
/// Removes every global replacement.
///
/// The table is emptied, the automaton is rebuilt from the now-empty table,
/// and `HAS_REPLACEMENTS` is cleared, all under the table's write lock.
pub(crate) fn clear_replacements() {
let mut table = crate::recover_lock(GLOBAL_REPLACEMENTS.write(), "GLOBAL_REPLACEMENTS");
table.clear();
rebuild_replacement_automaton(&table);
HAS_REPLACEMENTS.store(false, Ordering::Release);
}
/// Applies the global replacements to `text`.
///
/// Returns `text` borrowed when no replacements are registered or none
/// match. Otherwise returns the rewritten string.
///
/// # Errors
///
/// Returns `Err(len)` when the output grows beyond `max_len` bytes, where
/// `len` is the output length at the point the limit was exceeded.
pub fn apply_replacements(text: &str, max_len: usize) -> Result<Cow<'_, str>, usize> {
    // Cheap atomic check first so the common "nothing registered" case takes no lock.
    if HAS_REPLACEMENTS.load(Ordering::Acquire) {
        let slot = crate::recover_lock(GLOBAL_REPLACEMENTS_AC.read(), "GLOBAL_REPLACEMENTS_AC");
        if let Some(automaton) = slot.as_ref() {
            return replace_with_automaton(text, automaton, max_len);
        }
    }
    Ok(Cow::Borrowed(text))
}
/// Rewrites `text` by substituting every automaton match with its replacement.
///
/// Matches come from a single left-to-right pass over `text`; replacement
/// output is never rescanned. No allocation happens when nothing matches, in
/// which case `text` is returned borrowed.
///
/// # Errors
///
/// Returns `Err(len)` as soon as the output exceeds `max_len` bytes, checked
/// after each substitution and once more after the trailing text is appended.
fn replace_with_automaton<'a>(
    text: &'a str,
    automaton: &ReplacementAutomaton,
    max_len: usize,
) -> Result<Cow<'a, str>, usize> {
    let mut hits = automaton.ac.find_iter(text).peekable();
    if hits.peek().is_none() {
        return Ok(Cow::Borrowed(text));
    }

    let mut rebuilt = String::with_capacity(text.len());
    // End of the last match, i.e. start of the text not yet copied.
    let mut copied_to = 0;
    for hit in hits {
        rebuilt.push_str(&text[copied_to..hit.start()]);
        rebuilt.push_str(&automaton.values[hit.pattern().as_usize()]);
        if rebuilt.len() > max_len {
            return Err(rebuilt.len());
        }
        copied_to = hit.end();
    }

    rebuilt.push_str(&text[copied_to..]);
    if rebuilt.len() > max_len {
        return Err(rebuilt.len());
    }
    Ok(Cow::Owned(rebuilt))
}
/// Test-only reference implementation of leftmost-longest replacement.
///
/// At each position the distinct non-empty key lengths are tried from longest
/// to shortest; the first slice found in `table` is replaced and scanning
/// resumes after it. With no match the scan advances by one character.
/// Returns `text` borrowed when nothing was replaced.
///
/// # Errors
///
/// Returns `Err(len)` when the output exceeds `max_len` bytes, checked after
/// each substitution and after the trailing text is appended.
#[cfg(test)]
fn replace_longest_match<'a>(
    text: &'a str,
    table: &HashMap<String, String>,
    max_len: usize,
) -> Result<Cow<'a, str>, usize> {
    // Distinct non-zero key lengths, longest first.
    let mut lengths: Vec<usize> = table.keys().map(String::len).filter(|&n| n != 0).collect();
    lengths.sort_unstable_by_key(|&n| std::cmp::Reverse(n));
    lengths.dedup();
    if lengths.is_empty() {
        return Ok(Cow::Borrowed(text));
    }

    let mut rewritten: Option<String> = None;
    let mut copied_to = 0;
    let mut pos = 0;
    while pos < text.len() {
        let hit = lengths.iter().find_map(|&len| {
            let end = pos + len;
            // `get` yields None when `end` is past the text or splits a character.
            let candidate = text.get(pos..end)?;
            table.get(candidate).map(|rep| (end, rep))
        });

        match hit {
            Some((end, rep)) => {
                let buf = rewritten.get_or_insert_with(|| String::with_capacity(text.len()));
                buf.push_str(&text[copied_to..pos]);
                buf.push_str(rep);
                if buf.len() > max_len {
                    return Err(buf.len());
                }
                pos = end;
                copied_to = end;
            }
            None => {
                // `pos` is in bounds and on a char boundary, so a char always exists.
                let ch = text[pos..].chars().next().unwrap();
                pos += ch.len_utf8();
            }
        }
    }

    let Some(mut buf) = rewritten else {
        return Ok(Cow::Borrowed(text));
    };
    buf.push_str(&text[copied_to..]);
    if buf.len() > max_len {
        return Err(buf.len());
    }
    Ok(Cow::Owned(buf))
}
/// Looks up `ch` in `emoji_data::EMOJI_SINGLE` and returns the mapped string,
/// or `None` when the character has no entry.
#[inline]
pub fn lookup_emoji_single(ch: char) -> Option<&'static str> {
emoji_data::EMOJI_SINGLE.get(&ch).copied()
}
/// Test-only lookup of a multi-code-point emoji sequence by its
/// `emoji_data::EMOJI_MULTI` key.
#[cfg(test)]
pub fn lookup_emoji_multi(key: &str) -> Option<&'static str> {
emoji_data::EMOJI_MULTI.get(key).copied()
}
/// Finds the longest multi-code-point emoji sequence that prefixes `window`.
///
/// Walks the emoji trie one code point at a time, starting at node 0, and
/// stops at the first code point with no outgoing edge. A node's value is
/// recorded as a candidate only when at least two code points have been
/// consumed and the last one is not ZWJ, VS15 or VS16. Returns the value of
/// the last such candidate together with the number of code points consumed,
/// or `None` if there was none.
pub fn match_emoji_sequence(window: &[char]) -> Option<(&'static str, usize)> {
    use emoji_data::{
        EMOJI_MULTI_TRIE_EDGE_CP as EDGE_CP, EMOJI_MULTI_TRIE_EDGE_START as EDGE_START,
        EMOJI_MULTI_TRIE_EDGE_TARGET as EDGE_TARGET, EMOJI_MULTI_TRIE_NODE_VALUE as NODE_VALUE,
        EMOJI_MULTI_TRIE_VALUES as VALUES,
    };
    const ZWJ: u32 = 0x200D;
    const VS15: u32 = 0xFE0E;
    const VS16: u32 = 0xFE0F;

    let mut longest: Option<(&'static str, usize)> = None;
    let mut state = 0usize;
    for (pos, cp) in window.iter().map(|&c| c as u32).enumerate() {
        // Outgoing edges of `state` occupy EDGE_CP[lo..hi], sorted by code point.
        let lo = EDGE_START[state] as usize;
        let hi = EDGE_START[state + 1] as usize;
        let Ok(edge) = EDGE_CP[lo..hi].binary_search(&cp) else {
            break;
        };
        state = EDGE_TARGET[lo + edge] as usize;

        // Never end a match on the first code point or on a joiner/selector.
        if pos == 0 || matches!(cp, ZWJ | VS15 | VS16) {
            continue;
        }
        let value_idx = NODE_VALUE[state];
        // u32::MAX marks a node that carries no value.
        if value_idx != u32::MAX {
            longest = Some((VALUES[value_idx as usize], pos + 1));
        }
    }
    longest
}
/// Returns `true` if `ch` is contained in `emoji_data::EMOJI_MULTI_STARTERS`.
#[inline]
pub fn is_emoji_multi_starter(ch: char) -> bool {
emoji_data::EMOJI_MULTI_STARTERS.contains(&ch)
}
/// Returns `emoji_data::MAX_EMOJI_SEQ_LEN`, the upper bound on the number of
/// code points in a multi-code-point emoji sequence.
#[inline]
pub const fn max_emoji_seq_len() -> usize {
emoji_data::MAX_EMOJI_SEQ_LEN
}
// Unit tests for the table lookups, the replacement engine and language registration.
#[cfg(test)]
mod tests {
use super::*;
// The packed blob/offset table must agree with `romanize_hangul` for every syllable.
#[test]
fn packed_hangul_matches_romanize_hangul() {
for i in 0..HANGUL_SYLLABLE_COUNT as u32 {
let ch = char::from_u32(0xAC00 + i).unwrap();
let expected = hangul::romanize_hangul(ch).unwrap();
assert_eq!(
lookup_hangul_static(ch),
Some(expected.as_str()),
"packed Hangul lookup diverged at U+{:04X}",
0xAC00 + i
);
}
}
// `is_valid_lang` and `list_langs` binary-search BUILTIN_LANGS, so it must be strictly ascending.
#[test]
fn builtin_langs_is_sorted() {
assert!(
BUILTIN_LANGS.windows(2).all(|w| w[0] < w[1]),
"BUILTIN_LANGS must be sorted and unique for binary_search"
);
}
// Builds an owned replacement table from string-literal pairs.
fn tbl(pairs: &[(&str, &str)]) -> HashMap<String, String> {
pairs
.iter()
.map(|(k, v)| ((*k).to_string(), (*v).to_string()))
.collect()
}
// Runs the reference scanner with no size limit.
fn rlm<'a>(text: &'a str, t: &HashMap<String, String>) -> Cow<'a, str> {
replace_longest_match(text, t, usize::MAX).expect("no size limit")
}
// Differential test: the automaton must agree with the reference scanner on
// every table/input pair, both unlimited and with a 4-byte output cap.
#[test]
fn automaton_matches_longest_scan() {
let tables = [
tbl(&[("ab", "X"), ("abc", "Y")]),
tbl(&[("a", "b"), ("b", "c")]),
tbl(&[("@", "(at)"), ("\u{3a9}", "OMEGA")]),
tbl(&[("aa", "1"), ("a", "2"), ("aaa", "3")]),
tbl(&[("", "skip"), ("x", "Y")]),
tbl(&[("\u{5317}\u{4eac}", "beijing"), ("\u{5317}", "north")]),
tbl(&[("the", "T"), ("he", "H"), ("t", "_")]),
];
let inputs = [
"",
"abcd",
"abx",
"aaaa",
"aaaaa",
"ab",
"x\u{5317}\u{4eac}y\u{5317}z",
"the theatre",
"@a@",
"\u{3a9}\u{3a9}",
"no match here",
"aaabaaa",
"ababab",
];
for t in &tables {
let automaton = build_replacement_automaton(t);
for inp in inputs {
let reference = replace_longest_match(inp, t, usize::MAX).expect("oracle");
let got = match automaton.as_ref() {
Some(a) => replace_with_automaton(inp, a, usize::MAX).expect("automaton"),
None => Cow::Borrowed(inp),
};
assert_eq!(got, reference, "automaton != scan for input {inp:?}");
assert_eq!(
automaton
.as_ref()
.map_or(Ok(Cow::Borrowed(inp)), |a| replace_with_automaton(
inp, a, 4
)),
replace_longest_match(inp, t, 4),
"size-cap disagreement for input {inp:?}"
);
}
}
}
// Single-character keys, ASCII and non-ASCII.
#[test]
fn test_replace_longest_match_basic() {
let t = tbl(&[("@", "(at)"), ("Ω", "OMEGA")]);
assert_eq!(rlm("a@b", &t), "a(at)b");
assert_eq!(rlm("xΩy", &t), "xOMEGAy");
}
// When one key is a prefix of another, the longer key wins where it matches.
#[test]
fn test_replace_longest_match_prefers_longest() {
let t = tbl(&[("ab", "X"), ("abc", "Y")]);
assert_eq!(rlm("abcd", &t), "Yd");
assert_eq!(rlm("abx", &t), "Xx");
}
// Replacement output is not rescanned: "a" -> "b" must not continue on to "c".
#[test]
fn test_replace_longest_match_no_cascade() {
let t = tbl(&[("a", "b"), ("b", "c")]);
assert_eq!(rlm("a", &t), "b");
assert_eq!(rlm("aa", &t), "bb");
}
// No match means no allocation: the input comes back borrowed.
#[test]
fn test_replace_longest_match_borrows_on_no_match() {
let t = tbl(&[("zzz", "Q")]);
assert!(matches!(rlm("hello", &t), Cow::Borrowed(_)));
}
// An empty table is a no-op, and an empty key is ignored.
#[test]
fn test_replace_longest_match_empty_and_zero_len_key() {
assert!(matches!(rlm("hi", &HashMap::new()), Cow::Borrowed(_)));
let t = tbl(&[("", "X"), ("a", "Z")]);
assert_eq!(rlm("ba", &t), "bZ");
}
// Key lengths that would split a multi-byte character must be skipped, not panic.
#[test]
fn test_replace_longest_match_multibyte_boundary_safe() {
let t = tbl(&[("é", "e"), ("好", "hao")]);
assert_eq!(rlm("café 好", &t), "cafe hao");
let t2 = tbl(&[("\u{00A9}", "(c)")]);
assert_eq!(rlm("\u{2605}", &t2), "\u{2605}");
}
// The size cap only applies once something was replaced; unmatched input is
// returned borrowed even when it is longer than the cap.
#[test]
fn test_replace_longest_match_size_cap() {
let big = "X".repeat(100);
let t = tbl(&[("a", big.as_str())]);
assert!(replace_longest_match("aaaa", &t, 50).is_err());
assert_eq!(replace_longest_match("a", &t, 1000).unwrap(), big);
assert!(matches!(
replace_longest_match("zzz", &t, 1).unwrap(),
Cow::Borrowed(_)
));
}
// Plain ASCII letters have no entry in the default table.
#[test]
fn test_lookup_default_ascii() {
assert!(lookup_default('a').is_none());
assert!(lookup_default('Z').is_none());
}
#[test]
fn test_lookup_default_latin_extended() {
assert_eq!(lookup_default('é'), Some("e"));
assert_eq!(lookup_default('ñ'), Some("n"));
}
// CJK characters resolve through the toneless pinyin table.
#[test]
fn test_lookup_default_hanzi() {
assert_eq!(lookup_default('北'), Some("bei"));
assert_eq!(lookup_default('京'), Some("jing"));
}
#[test]
fn test_lookup_default_hangul() {
let result = lookup_default('한');
assert!(result.is_some());
assert_eq!(result.unwrap(), "han");
}
// Repeated lookups hit the same cached table and return the same slice.
#[test]
fn test_hangul_cache_consistency() {
let first = lookup_hangul_static('가');
let second = lookup_hangul_static('가');
assert_eq!(first, second);
assert_eq!(first.unwrap(), "ga");
}
// U+20000 is outside every range checked by `lookup_default` and is expected to be unmapped.
#[test]
fn test_lookup_default_unmapped() {
let ch = char::from_u32(0x20000).unwrap();
assert!(lookup_default(ch).is_none());
}
// Cyrillic U+0430 is a confusable of Latin "a".
#[test]
fn test_lookup_confusable() {
let result = lookup_confusable('\u{0430}', "latin");
assert_eq!(result, Some("a"));
}
#[test]
fn test_lookup_confusable_non_latin_target() {
assert!(lookup_confusable('\u{0430}', "cyrillic").is_none());
}
#[test]
fn test_list_langs_contains_builtins() {
let langs = list_langs();
assert!(langs.contains(&"de".to_owned()));
assert!(langs.contains(&"ja".to_owned()));
assert!(langs.contains(&"zh".to_owned()));
assert!(langs.len() >= BUILTIN_LANGS.len());
}
// Every `translit_lang_<code>.tsv` data file must be listed in BUILTIN_LANGS and
// reachable through `lookup_lang`, checked using the first entry of each file.
#[test]
fn test_lang_override_tables_are_registered_and_dispatched() {
let data_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
.join("src")
.join("tables")
.join("data");
let mut checked = 0usize;
for entry in std::fs::read_dir(&data_dir).expect("read data dir") {
let fname = entry.unwrap().file_name().into_string().unwrap();
let Some(stem) = fname
.strip_prefix("translit_lang_")
.and_then(|s| s.strip_suffix(".tsv"))
else {
continue;
};
// File stems use '_' where the language code uses '-'.
let code = stem.replace('_', "-"); assert!(
BUILTIN_LANGS.contains(&code.as_str()),
"translit_lang_{stem}.tsv exists but '{code}' is not in BUILTIN_LANGS"
);
let content = std::fs::read_to_string(data_dir.join(&fname)).unwrap();
// First line that is neither blank nor a '#' comment.
let first = content
.lines()
.map(str::trim_start)
.find(|l| !l.is_empty() && !l.starts_with('#'))
.expect("override file has at least one entry");
// The first tab-separated column is the code point in hex.
let hex = first.split('\t').next().unwrap().trim();
let cp = u32::from_str_radix(hex, 16).expect("valid hex codepoint");
let ch = char::from_u32(cp).expect("valid codepoint");
assert!(
lookup_lang(&code, ch).is_some(),
"lookup_lang(\"{code}\", U+{cp:04X}) is None — translit_lang_{stem}.tsv not dispatched"
);
checked += 1;
}
// Guards against the loop silently matching no files.
assert!(
checked >= 20,
"expected ≥20 override tables, checked {checked}"
);
}
#[test]
fn test_list_langs_sorted() {
let langs = list_langs();
let mut sorted = langs.clone();
sorted.sort();
assert_eq!(langs, sorted);
}
#[test]
fn test_emoji_single_lookup() {
let result = lookup_emoji_single('\u{1F600}');
assert!(result.is_some());
}
#[test]
fn test_max_emoji_seq_len_positive() {
assert!(max_emoji_seq_len() > 0);
}
// MAX_EMOJI_SEQ_LEN must equal the longest EMOJI_MULTI key, measured as the
// number of '_'-separated segments in the key.
#[test]
fn test_max_emoji_seq_len_covers_all_sequences() {
let limit = emoji_data::MAX_EMOJI_SEQ_LEN;
let mut max_found = 0usize;
for (key, _) in emoji_data::EMOJI_MULTI.entries() {
let cp_count = key.split('_').count();
if cp_count > max_found {
max_found = cp_count;
}
assert!(
cp_count <= limit,
"Emoji sequence {key} has {cp_count} codepoints, exceeds MAX_EMOJI_SEQ_LEN={limit}"
);
}
assert_eq!(
max_found, limit,
"MAX_EMOJI_SEQ_LEN={limit} but longest sequence is {max_found} — consider tightening"
);
}
// The registration tests below use distinct "_test_*" codes because they all
// write to the shared global LANG_TABLES.
#[test]
fn test_register_lang_lookup() {
let mut mappings = HashMap::new();
mappings.insert("Ü".to_owned(), "Ue".to_owned());
register_lang("_test_cow_lookup", mappings).unwrap();
let first = lookup_lang("_test_cow_lookup", 'Ü');
let second = lookup_lang("_test_cow_lookup", 'Ü');
assert_eq!(first.as_deref(), Some("Ue"));
assert_eq!(second.as_deref(), Some("Ue"));
}
// A key with more than one char is rejected and reported back.
#[test]
fn test_register_lang_rejects_multi_char_key() {
let mut mappings = HashMap::new();
mappings.insert("AB".to_owned(), "ab".to_owned());
let result = register_lang("_test_bad_key", mappings);
assert!(result.is_err());
let bad = result.unwrap_err();
assert_eq!(bad, vec!["AB".to_owned()]);
}
#[test]
fn test_register_lang_rejects_empty_key() {
let mut mappings = HashMap::new();
mappings.insert(String::new(), "x".to_owned());
let result = register_lang("_test_empty_key", mappings);
assert!(result.is_err());
}
// Registering the same code again replaces the previous table.
#[test]
fn test_register_lang_invalidates_on_reregister() {
let mut m1 = HashMap::new();
m1.insert("Ö".to_owned(), "Oe".to_owned());
register_lang("_test_inval2", m1).unwrap();
let first = lookup_lang("_test_inval2", 'Ö');
assert_eq!(first.as_deref(), Some("Oe"));
let mut m2 = HashMap::new();
m2.insert("Ö".to_owned(), "O".to_owned());
register_lang("_test_inval2", m2).unwrap();
let second = lookup_lang("_test_inval2", 'Ö');
assert_eq!(second.as_deref(), Some("O"));
}
// NOTE(review): this passes vacuously if `lookup_lang("de", 'ü')` returns None;
// consider asserting that the result is Some.
#[test]
fn test_lookup_lang_builtin_is_borrowed() {
let result = lookup_lang("de", 'ü');
if let Some(cow) = result {
assert!(
matches!(cow, Cow::Borrowed(_)),
"built-in PHF result should be Cow::Borrowed"
);
}
}
// User-registered hits are cloned out of the table, hence Cow::Owned.
#[test]
fn test_lookup_lang_user_registered_is_owned() {
let mut m = HashMap::new();
m.insert("X".to_owned(), "ex".to_owned());
register_lang("_test_owned", m).unwrap();
let result = lookup_lang("_test_owned", 'X');
if let Some(cow) = result {
assert!(
matches!(cow, Cow::Owned(_)),
"user-registered result should be Cow::Owned"
);
} else {
panic!("expected Some from registered lang");
}
}
}