/// Zero-width non-joiner (U+200C), the character handled by `normalize_zwnj`.
const ZWNJ: char = '\u{200C}';
/// Removes misplaced zero-width non-joiners (U+200C) from `text`.
///
/// Every maximal run of one or more ZWNJ characters is treated as a single
/// candidate. The candidate is kept (as exactly one ZWNJ) only when the
/// nearest non-ZWNJ character on *both* sides is accepted by
/// `is_arabic_letter`. In every other case — at the start or end of the
/// input, or next to whitespace, Latin text, etc. — the whole run is dropped.
///
/// All non-ZWNJ characters are copied to the result unchanged and in order.
pub fn normalize_zwnj(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    // Whether the most recent non-ZWNJ character qualifies as a joining
    // neighbour. Starts `false` so a leading ZWNJ run is always dropped.
    let mut prev_is_arabic = false;
    // Set while we are inside (or just past) a ZWNJ run whose fate is not yet
    // known because the following real character has not been seen.
    let mut pending_zwnj = false;

    for c in text.chars() {
        if c == ZWNJ {
            // Defer the decision; consecutive ZWNJs collapse into one flag.
            pending_zwnj = true;
            continue;
        }

        let cur_is_arabic = is_arabic_letter(c);
        if pending_zwnj && prev_is_arabic && cur_is_arabic {
            out.push(ZWNJ);
        }
        pending_zwnj = false;

        out.push(c);
        prev_is_arabic = cur_is_arabic;
    }

    // A run still pending here has no right-hand neighbour and is discarded.
    out
}
/// Returns `true` if `c` is a letter of the basic Arabic block
/// (U+0600–U+06FF) or a combining mark from that block.
///
/// Combining marks (harakat, Quranic annotation marks) are accepted because
/// they attach to the preceding letter, so a character sequence such as
/// "letter + vowel mark" must still count as a letter for neighbour checks.
///
/// Punctuation (e.g. U+060C ARABIC COMMA, U+061F ARABIC QUESTION MARK),
/// Arabic-Indic and Extended Arabic-Indic digits, currency/symbol characters
/// and format controls in the same block are *not* letters and return `false`.
/// Characters outside U+0600–U+06FF always return `false`.
#[inline]
fn is_arabic_letter(c: char) -> bool {
    matches!(
        c,
        // Letters (Unicode general categories Lo / Lm).
        '\u{0620}'..='\u{064A}'
            | '\u{066E}'..='\u{066F}'
            | '\u{0671}'..='\u{06D3}'
            | '\u{06D5}'
            | '\u{06E5}'..='\u{06E6}'
            | '\u{06EE}'..='\u{06EF}'
            | '\u{06FA}'..='\u{06FC}'
            | '\u{06FF}'
            // Combining marks (Mn) that belong to the preceding letter.
            | '\u{0610}'..='\u{061A}'
            | '\u{064B}'..='\u{065F}'
            | '\u{0670}'
            | '\u{06D6}'..='\u{06DC}'
            | '\u{06DF}'..='\u{06E4}'
            | '\u{06E7}'..='\u{06E8}'
            | '\u{06EA}'..='\u{06ED}'
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A ZWNJ sitting between two Persian letters must survive untouched.
    #[test]
    fn keeps_valid_zwnj() {
        let input = "می\u{200C}روم";
        let output = normalize_zwnj(input);
        assert_eq!(output, input);
    }

    /// A ZWNJ touching a space on either side is removed.
    #[test]
    fn drops_zwnj_next_to_space() {
        let after_space = normalize_zwnj("سلام \u{200C}دنیا");
        assert_eq!(after_space, "سلام دنیا");

        let before_space = normalize_zwnj("سلام\u{200C} دنیا");
        assert_eq!(before_space, "سلام دنیا");
    }

    /// ZWNJs at the very start or end of the input have only one neighbour
    /// and are removed.
    #[test]
    fn drops_leading_trailing_zwnj() {
        let output = normalize_zwnj("\u{200C}سلام\u{200C}");
        assert_eq!(output, "سلام");
    }

    /// A run of several ZWNJs between letters collapses to a single one.
    #[test]
    fn collapses_consecutive_zwnj() {
        let doubled = "می\u{200C}\u{200C}رود";
        let output = normalize_zwnj(doubled);
        assert_eq!(output, "می\u{200C}رود");
    }

    /// A ZWNJ between a Persian letter and a Latin letter is removed.
    #[test]
    fn drops_zwnj_next_to_latin() {
        let output = normalize_zwnj("ایران\u{200C}Air");
        assert_eq!(output, "ایرانAir");
    }
}