use crate::tables;
/// Checks that `target_script` names a script this module accepts.
///
/// The comparison is exact and case-sensitive: only the literal strings
/// `"latin"` and `"cyrillic"` are accepted.
///
/// # Errors
///
/// Returns [`crate::ErrorRepr::InvalidTargetScript`], carrying an owned copy
/// of the rejected name, for any other value (including the empty string).
fn validate_target_script(target_script: &str) -> Result<(), crate::ErrorRepr> {
    if matches!(target_script, "latin" | "cyrillic") {
        return Ok(());
    }
    Err(crate::ErrorRepr::InvalidTargetScript {
        got: String::from(target_script),
    })
}
/// Returns a freshly allocated copy of `text` with confusable characters
/// replaced for `target_script`.
///
/// This is a convenience wrapper around [`normalize_confusables_into`] that
/// supplies a new, empty `String` as the output buffer.
///
/// # Errors
///
/// Propagates any error from [`normalize_confusables_into`] unchanged.
pub(crate) fn normalize_confusables(
    text: &str,
    target_script: &str,
) -> Result<String, crate::ErrorRepr> {
    let mut normalized = String::new();
    normalize_confusables_into(text, target_script, &mut normalized).map(|()| normalized)
}
/// Replaces confusable characters in `text`, allocating only when needed.
///
/// Returns `Cow::Borrowed(text)` when no character of `text` has an entry in
/// the table resolved for `target_script` (or when no table is resolved at
/// all). Otherwise returns `Cow::Owned` holding `text` with every character
/// that has an entry replaced by its mapped string.
///
/// # Errors
///
/// Returns the error from `validate_target_script` when `target_script` is
/// not an accepted script name.
pub(crate) fn normalize_confusables_cow<'a>(
    text: &'a str,
    target_script: &str,
) -> Result<std::borrow::Cow<'a, str>, crate::ErrorRepr> {
    use std::borrow::Cow;

    validate_target_script(target_script)?;
    let table = tables::resolve_confusable_map(target_script);
    // Single lookup used both for locating the first hit and for rebuilding.
    let lookup = |c: char| table.and_then(|m| m.get(&c).copied());

    // Byte offset of the first character that needs replacing, if any.
    let Some(first_hit) = text.find(|c: char| lookup(c).is_some()) else {
        return Ok(Cow::Borrowed(text));
    };

    // Everything before the first hit is copied verbatim; the remainder
    // (starting with the hit itself) is rebuilt one character at a time.
    let (untouched, remainder) = text.split_at(first_hit);
    let mut rebuilt = String::with_capacity(text.len());
    rebuilt.push_str(untouched);
    for c in remainder.chars() {
        if let Some(substitute) = lookup(c) {
            rebuilt.push_str(substitute);
        } else {
            rebuilt.push(c);
        }
    }
    Ok(Cow::Owned(rebuilt))
}
/// Writes `text`, with confusable characters replaced for `target_script`,
/// into the caller-supplied buffer `out`.
///
/// On success any previous contents of `out` are discarded and replaced by
/// the normalized text. Characters without an entry in the resolved table are
/// copied through unchanged; if no table is resolved, `text` is copied as-is.
///
/// # Errors
///
/// Returns the error from `validate_target_script` when `target_script` is
/// not an accepted script name. Validation happens before `out` is touched,
/// so the buffer keeps its previous contents in that case.
pub(crate) fn normalize_confusables_into(
    text: &str,
    target_script: &str,
    out: &mut String,
) -> Result<(), crate::ErrorRepr> {
    validate_target_script(target_script)?;

    out.clear();
    out.reserve(text.len());

    match tables::resolve_confusable_map(target_script) {
        Some(table) => {
            for c in text.chars() {
                if let Some(substitute) = table.get(&c).copied() {
                    out.push_str(substitute);
                } else {
                    out.push(c);
                }
            }
        }
        // No table for this script: nothing can be replaced.
        None => out.push_str(text),
    }
    Ok(())
}
/// Reports whether `text` contains at least one character that has an entry
/// in the confusable table resolved for `target_script`.
///
/// Returns `Ok(false)` for empty input and when no table is resolved.
///
/// # Errors
///
/// Returns the error from `validate_target_script` when `target_script` is
/// not an accepted script name.
pub(crate) fn is_confusable(text: &str, target_script: &str) -> Result<bool, crate::ErrorRepr> {
    validate_target_script(target_script)?;
    let table = tables::resolve_confusable_map(target_script);
    Ok(table.is_some_and(|m| text.chars().any(|c| m.contains_key(&c))))
}
#[cfg(test)]
mod tests {
    //! Unit and property tests for the confusable-normalization helpers.

    use super::*;

    /// U+0430 (Cyrillic small letter a) is rewritten to Latin `a`.
    #[test]
    fn test_normalize_confusables_cyrillic() {
        let result = normalize_confusables("\u{0430}", "latin").unwrap();
        assert_eq!(result, "a");
    }

    /// Plain ASCII text is returned unchanged.
    #[test]
    fn test_normalize_confusables_passthrough() {
        let result = normalize_confusables("hello", "latin").unwrap();
        assert_eq!(result, "hello");
    }

    #[test]
    fn test_normalize_confusables_empty() {
        let result = normalize_confusables("", "latin").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_is_confusable_true() {
        assert!(is_confusable("\u{0430}", "latin").unwrap());
    }

    #[test]
    fn test_is_confusable_false() {
        assert!(!is_confusable("hello", "latin").unwrap());
    }

    #[test]
    fn test_is_confusable_empty() {
        assert!(!is_confusable("", "latin").unwrap());
    }

    #[test]
    fn test_validate_target_script_latin_ok() {
        assert!(validate_target_script("latin").is_ok());
    }

    #[test]
    fn test_validate_target_script_cyrillic_ok() {
        assert!(validate_target_script("cyrillic").is_ok());
    }

    /// Unknown, empty, and differently-cased names are all rejected.
    #[test]
    fn test_validate_target_script_invalid() {
        assert!(validate_target_script("greek").is_err());
        assert!(validate_target_script("").is_err());
        assert!(validate_target_script("Latin").is_err());
        assert!(validate_target_script("Cyrillic").is_err());
    }

    /// Every public entry point rejects an unknown script, not just the
    /// private validator.
    #[test]
    fn test_invalid_target_script_rejected_by_all_entry_points() {
        let mut buf = String::new();
        assert!(normalize_confusables("a", "greek").is_err());
        assert!(normalize_confusables_cow("a", "greek").is_err());
        assert!(normalize_confusables_into("a", "greek", &mut buf).is_err());
        assert!(is_confusable("a", "greek").is_err());
    }

    /// U+0435 and U+043E (Cyrillic small ie / o) embedded in Latin text.
    #[test]
    fn test_normalize_confusables_mixed_long() {
        let input = "h\u{0435}ll\u{043E} w\u{043E}rld";
        let result = normalize_confusables(input, "latin").unwrap();
        assert_eq!(result, "hello world");
    }

    /// Precomposed U+00E9 passes through unchanged.
    #[test]
    fn test_normalize_confusables_nfc_vs_nfd() {
        let nfc = "\u{00e9}";
        let result = normalize_confusables(nfc, "latin").unwrap();
        assert_eq!(result, nfc);
    }

    /// The `Cow` variant must not allocate when nothing is replaced.
    #[test]
    fn test_normalize_confusables_cow_borrows_when_unchanged() {
        let result = normalize_confusables_cow("hello", "latin").unwrap();
        assert!(matches!(result, std::borrow::Cow::Borrowed("hello")));
    }

    /// The `Cow` variant keeps the clean prefix and rewrites the rest.
    #[test]
    fn test_normalize_confusables_cow_replaces_mid_string() {
        let result = normalize_confusables_cow("h\u{0435}ll\u{043E}", "latin").unwrap();
        assert!(matches!(result, std::borrow::Cow::Owned(_)));
        assert_eq!(result, "hello");
    }

    /// The buffer variant discards whatever the buffer held before.
    #[test]
    fn test_normalize_confusables_into_overwrites_buffer() {
        let mut buf = String::from("stale");
        normalize_confusables_into("\u{0430}", "latin", &mut buf).unwrap();
        assert_eq!(buf, "a");
    }

    mod proptest_properties {
        use super::*;
        use proptest::prelude::*;

        // `\PC*` generates strings of characters outside Unicode general
        // category C (control, format, unassigned, ...).
        proptest! {
            #![proptest_config(ProptestConfig::with_cases(1000))]

            #[test]
            fn normalize_confusables_idempotent(s in "\\PC*") {
                let once = normalize_confusables(&s, "latin").unwrap();
                let twice = normalize_confusables(&once, "latin").unwrap();
                prop_assert_eq!(&once, &twice,
                    "normalize_confusables is not idempotent on: {:?}", s);
            }

            #[test]
            fn normalized_is_not_confusable(s in "\\PC*") {
                let normalized = normalize_confusables(&s, "latin").unwrap();
                let still_confusable = is_confusable(&normalized, "latin").unwrap();
                prop_assert!(!still_confusable,
                    "is_confusable returned true after normalize_confusables on: {:?} → {:?}",
                    s, normalized);
            }

            #[test]
            fn normalize_confusables_never_drops_chars(s in "\\PC*") {
                let result = normalize_confusables(&s, "latin").unwrap();
                prop_assert!(
                    result.chars().count() >= s.chars().count(),
                    "normalize_confusables dropped chars: {:?} ({} chars) → {:?} ({} chars)",
                    s, s.chars().count(), result, result.chars().count()
                );
            }

            // Primarily a no-panic check; the explicit re-validation makes
            // the test assert something instead of discarding the result.
            #[test]
            fn normalize_confusables_valid_utf8(s in "\\PC*") {
                let result = normalize_confusables(&s, "latin").unwrap();
                prop_assert!(std::str::from_utf8(result.as_bytes()).is_ok());
            }

            // The borrowing and allocating variants must always agree.
            #[test]
            fn normalize_confusables_cow_matches_owned(s in "\\PC*") {
                let owned = normalize_confusables(&s, "latin").unwrap();
                let cow = normalize_confusables_cow(&s, "latin").unwrap();
                prop_assert_eq!(cow.as_ref(), owned.as_str());
            }
        }
    }
}