/// Zero-width non-joiner (U+200C): the separator this file inserts between a
/// word stem and a detached prefix or suffix.
const ZWNJ: &str = "\u{200C}";
/// Inserts zero-width non-joiners into every word of `text` that matches one
/// of the prefix/suffix rules implemented by `apply`.
///
/// The input is scanned once. Characters for which `is_word_break` returns
/// `true` are copied to the output unchanged; every maximal run of other
/// characters is treated as one word and replaced by `apply(word)`.
///
/// Returns a newly allocated `String`; `text` itself is not modified.
#[must_use]
pub fn insert(text: &str) -> String {
    // Reserve one eighth of extra room for the separators that get added.
    let mut out = String::with_capacity(text.len() + text.len() / 8);
    // Byte offset at which the word currently being scanned begins.
    let mut word_start = 0;

    for (idx, c) in text.char_indices() {
        if !is_word_break(c) {
            continue;
        }
        // A non-empty word ended just before this break character.
        if word_start < idx {
            out.push_str(&apply(&text[word_start..idx]));
        }
        out.push(c);
        word_start = idx + c.len_utf8();
    }

    // Flush a trailing word that is not followed by a break character.
    if word_start < text.len() {
        out.push_str(&apply(&text[word_start..]));
    }
    out
}
/// Returns `true` when `c` separates words: any Unicode whitespace, any ASCII
/// punctuation, or one of the explicitly listed non-ASCII punctuation marks.
#[inline]
fn is_word_break(c: char) -> bool {
    match c {
        // Arabic-script comma, semicolon and question mark, plus both guillemets.
        '،' | '؛' | '؟' | '«' | '»' => true,
        _ => c.is_ascii_punctuation() || c.is_whitespace(),
    }
}
/// Runs every ZWNJ rule over a single word and returns the result.
///
/// A word that already contains a ZWNJ is returned unchanged. Otherwise each
/// prefix in `VERB_PREFIXES` is passed to `insert_after_verb_prefix`, then
/// each suffix in `SUFFIXES` is passed to `insert_before_suffix`, in the
/// listed order; every rule operates on the output of the previous one.
fn apply(word: &str) -> String {
    const VERB_PREFIXES: [&str; 2] = ["نمی", "می"];
    const SUFFIXES: [&str; 8] = ["های", "ها", "مان", "تان", "شان", "ام", "ات", "اش"];

    if word.contains(ZWNJ) {
        return word.to_owned();
    }

    let prefixed = VERB_PREFIXES
        .iter()
        .fold(word.to_owned(), |acc, prefix| {
            insert_after_verb_prefix(&acc, prefix)
        });

    SUFFIXES
        .iter()
        .fold(prefixed, |acc, suffix| insert_before_suffix(&acc, suffix))
}
fn insert_after_verb_prefix(word: &str, prefix: &str) -> String {
let chars: Vec<char> = word.chars().collect();
let prefix_chars: Vec<char> = prefix.chars().collect();
if chars.len() < prefix_chars.len() + 3 {
return word.to_owned();
}
if !chars[..prefix_chars.len()]
.iter()
.zip(prefix_chars.iter())
.all(|(a, b)| a == b)
{
return word.to_owned();
}
let next = chars[prefix_chars.len()];
if !is_persian_letter(next) {
return word.to_owned();
}
let head: String = chars[..prefix_chars.len()].iter().collect();
let tail: String = chars[prefix_chars.len()..].iter().collect();
format!("{head}{ZWNJ}{tail}")
}
fn insert_before_suffix(word: &str, suffix: &str) -> String {
let chars: Vec<char> = word.chars().collect();
let suffix_chars: Vec<char> = suffix.chars().collect();
if chars.len() < suffix_chars.len() + 3 {
return word.to_owned();
}
if !chars[chars.len() - suffix_chars.len()..]
.iter()
.zip(suffix_chars.iter())
.all(|(a, b)| a == b)
{
return word.to_owned();
}
let stem_end = chars.len() - suffix_chars.len();
if !is_persian_letter(chars[stem_end - 1]) {
return word.to_owned();
}
let stem: String = chars[..stem_end].iter().collect();
let suf: String = chars[stem_end..].iter().collect();
format!("{stem}{ZWNJ}{suf}")
}
/// Returns `true` when `c` lies in the code point range U+0621..=U+06FF.
///
/// This is a plain range test, so besides letters it also accepts the other
/// code points in that range (for example Arabic-Indic digits and combining
/// marks).
#[inline]
fn is_persian_letter(c: char) -> bool {
    matches!(c, '\u{0621}'..='\u{06FF}')
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `insert(input)` produces exactly `expected`.
    #[track_caller]
    fn check(input: &str, expected: &str) {
        assert_eq!(insert(input), expected);
    }

    /// A ZWNJ is placed after the two-letter verb prefix.
    #[test]
    fn inserts_after_mi_prefix() {
        check("میروم", "می\u{200C}روم");
    }

    /// A ZWNJ is placed after the three-letter verb prefix.
    #[test]
    fn inserts_after_nemi_prefix() {
        check("نمیدانم", "نمی\u{200C}دانم");
    }

    /// A ZWNJ is placed before the two-letter suffix.
    #[test]
    fn inserts_before_ha() {
        check("کتابها", "کتاب\u{200C}ها");
    }

    /// A ZWNJ is placed before the three-letter suffix.
    #[test]
    fn inserts_before_haye() {
        check("کتابهای", "کتاب\u{200C}های");
    }

    /// A ZWNJ is placed before a possessive suffix.
    #[test]
    fn inserts_before_possessive_am() {
        check("کتابام", "کتاب\u{200C}ام");
    }

    /// A word that already contains a ZWNJ is left untouched.
    #[test]
    fn skips_when_already_has_zwnj() {
        check("می\u{200C}روم", "می\u{200C}روم");
    }

    /// Spaces and the trailing full stop pass through unchanged.
    #[test]
    fn preserves_whitespace_and_punct() {
        check("میروم به خانه.", "می\u{200C}روم به خانه.");
    }

    /// Words below the minimum length are returned as-is.
    #[test]
    fn skips_too_short_words() {
        check("ما", "ما");
    }
}