/// Collapses whitespace runs to a single ASCII space and strips them from both ends.
///
/// Each maximal run of characters accepted by `is_collapsible_space` that sits
/// between two other characters is replaced by exactly one `' '`. Runs at the
/// very start or end of `text` produce nothing. Every character the helper
/// rejects is copied through unchanged.
///
/// Returns a newly allocated `String`; the input is not modified.
pub fn normalize_spaces(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    // True while a whitespace run has been seen but its separator not yet written.
    let mut gap_pending = false;

    for ch in text.chars() {
        if is_collapsible_space(ch) {
            gap_pending = true;
            continue;
        }
        // Emit the separator lazily: a leading run (nothing written yet) and a
        // trailing run (no following character) therefore never reach the output.
        if gap_pending && !result.is_empty() {
            result.push(' ');
        }
        gap_pending = false;
        result.push(ch);
    }

    result
}
/// Shortens runs of three or more identical characters down to two.
///
/// The text is scanned as consecutive runs of the same `char`. A run longer
/// than two whose character passes `is_reducible` is written as exactly two
/// copies; every other run is written at its original length.
///
/// Returns a newly allocated `String`; the input is not modified.
pub fn reduce_repetitions(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut remaining = text.chars().peekable();

    while let Some(ch) = remaining.next() {
        // Consume the rest of the run that starts with `ch`.
        let mut run_len: usize = 1;
        while remaining.peek() == Some(&ch) {
            remaining.next();
            run_len += 1;
        }

        let copies = if run_len > 2 && is_reducible(ch) {
            2
        } else {
            run_len
        };
        result.extend(std::iter::repeat(ch).take(copies));
    }

    result
}
/// Reports whether `c` is one of the horizontal or invisible spacing characters
/// that space normalization treats as whitespace.
///
/// Line-break characters are not in the set.
#[inline]
fn is_collapsible_space(c: char) -> bool {
    const COLLAPSIBLE: [char; 6] = [
        ' ',        // U+0020 SPACE
        '\t',       // U+0009 CHARACTER TABULATION
        '\u{00A0}', // NO-BREAK SPACE
        '\u{200B}', // ZERO WIDTH SPACE
        '\u{FEFF}', // ZERO WIDTH NO-BREAK SPACE (byte order mark)
        '\u{3000}', // IDEOGRAPHIC SPACE
    ];
    COLLAPSIBLE.contains(&c)
}
/// Reports whether a long run of `c` may be shortened.
///
/// Line feed, carriage return, and any character for which
/// `char::is_numeric` is true are not reducible; everything else is.
#[inline]
fn is_reducible(c: char) -> bool {
    match c {
        '\n' | '\r' => false,
        other => !other.is_numeric(),
    }
}
#[cfg(test)]
mod tests {
    // Unit tests for `normalize_spaces` and `reduce_repetitions`.
    //
    // Whitespace runs are written with `\u{20}` escapes on purpose: literal
    // runs of spaces are easily collapsed by editors or formatters, which would
    // turn these cases into no-op checks.
    use super::*;

    #[test]
    fn collapses_multiple_spaces() {
        assert_eq!(
            normalize_spaces("سلام\u{20}\u{20}\u{20}دنیا"),
            "سلام دنیا"
        );
    }

    #[test]
    fn leaves_single_spaces_alone() {
        assert_eq!(normalize_spaces("سلام دنیا"), "سلام دنیا");
    }

    #[test]
    fn trims_edges() {
        assert_eq!(normalize_spaces(" سلام "), "سلام");
        assert_eq!(
            normalize_spaces("\u{20}\u{20}سلام\u{20}\u{20}"),
            "سلام"
        );
    }

    #[test]
    fn collapses_tab_nbsp() {
        assert_eq!(normalize_spaces("a\t\u{00A0}b"), "a b");
    }

    #[test]
    fn collapses_zero_width_and_ideographic_spaces() {
        assert_eq!(normalize_spaces("a\u{200B}\u{FEFF}\u{3000}b"), "a b");
    }

    #[test]
    fn empty_and_blank_input_normalize_to_empty() {
        assert_eq!(normalize_spaces(""), "");
        assert_eq!(normalize_spaces("\u{20}\t\u{00A0}"), "");
    }

    #[test]
    fn normalize_keeps_newlines() {
        // Newlines are not collapsible, so the spaces around them survive.
        assert_eq!(normalize_spaces("a \n b"), "a \n b");
    }

    #[test]
    fn reduces_letter_run() {
        assert_eq!(reduce_repetitions("خیییییلی"), "خییلی");
    }

    #[test]
    fn reduces_punctuation_run() {
        assert_eq!(reduce_repetitions("!!!!!!"), "!!");
    }

    #[test]
    fn keeps_two_repetitions() {
        assert_eq!(reduce_repetitions("آآ"), "آآ");
    }

    #[test]
    fn reduces_run_of_exactly_three() {
        assert_eq!(reduce_repetitions("aaab"), "aab");
    }

    #[test]
    fn preserves_newlines() {
        assert_eq!(reduce_repetitions("a\n\n\nb"), "a\n\n\nb");
    }

    #[test]
    fn preserves_carriage_returns() {
        assert_eq!(reduce_repetitions("a\r\r\rb"), "a\r\r\rb");
    }

    #[test]
    fn preserves_digit_runs() {
        // Numeric characters are never shortened, so numbers keep their value.
        assert_eq!(reduce_repetitions("1000000"), "1000000");
        assert_eq!(
            reduce_repetitions("\u{06F1}\u{06F0}\u{06F0}\u{06F0}"),
            "\u{06F1}\u{06F0}\u{06F0}\u{06F0}"
        );
    }

    #[test]
    fn reduce_handles_empty_input() {
        assert_eq!(reduce_repetitions(""), "");
    }
}