/// Returns a freshly allocated copy of `text` with its whitespace collapsed.
///
/// Convenience wrapper around [`collapse_whitespace_into`]: it creates the
/// output buffer (pre-sized to `text.len()` bytes), forwards `strip_control`
/// and `strip_zero_width` unchanged, and returns the filled buffer.
pub(crate) fn collapse_whitespace(
    text: &str,
    strip_control: bool,
    strip_zero_width: bool,
) -> String {
    let mut collapsed = String::with_capacity(text.len());
    collapse_whitespace_into(text, strip_control, strip_zero_width, &mut collapsed);
    collapsed
}
/// Normalizes the whitespace of `text`, writing the outcome into `result`.
///
/// `result` is cleared first, so its previous contents are discarded; room for
/// `text.len()` bytes is then reserved so the loop below does not reallocate.
///
/// Each character is classified in this order:
///
/// 1. Zero-width characters (as decided by [`is_zero_width`]) are dropped when
///    `strip_zero_width` is `true` and copied verbatim otherwise.
/// 2. Control characters other than `'\n'` and `'\t'` are dropped when
///    `strip_control` is `true` and copied verbatim otherwise. Because this
///    check comes before the whitespace check, `'\r'` is handled here as a
///    control character rather than as whitespace.
/// 3. Whitespace is collapsed: a run becomes a single `' '`, and whitespace
///    seen before any ordinary character is skipped entirely.
/// 4. Every other character is copied verbatim.
///
/// Characters copied verbatim by rules 1 and 2 leave the whitespace state
/// untouched: they neither end a whitespace run nor count as content for the
/// purpose of skipping leading whitespace.
///
/// After the scan, if the output ends with `' '`, that one space is removed.
pub(crate) fn collapse_whitespace_into(
    text: &str,
    strip_control: bool,
    strip_zero_width: bool,
    result: &mut String,
) {
    result.clear();
    result.reserve(text.len());

    // `has_content`: an ordinary (rule 4) character has been emitted.
    // `space_emitted`: the most recent rule 3/4 action was emitting a space.
    let mut has_content = false;
    let mut space_emitted = false;

    for ch in text.chars() {
        // `Some(keep)` marks a character governed by one of the strip flags;
        // `None` means it takes part in whitespace collapsing.
        let verbatim = if is_zero_width(ch) {
            Some(!strip_zero_width)
        } else if ch.is_control() && !matches!(ch, '\n' | '\t') {
            Some(!strip_control)
        } else {
            None
        };

        match verbatim {
            Some(true) => result.push(ch),
            Some(false) => {}
            None if ch.is_whitespace() => {
                // Emit at most one space per run, and none before content.
                if has_content && !space_emitted {
                    result.push(' ');
                    space_emitted = true;
                }
            }
            None => {
                result.push(ch);
                has_content = true;
                space_emitted = false;
            }
        }
    }

    // A run at the very end of the input leaves one dangling space behind.
    if result.ends_with(' ') {
        result.pop();
    }
}
/// Returns a new `String` holding `text` as filtered by
/// [`strip_control_chars_into`].
///
/// Allocating convenience wrapper: the buffer starts out empty and is filled
/// by the `_into` variant.
pub(crate) fn strip_control_chars(text: &str) -> String {
    let mut cleaned = String::new();
    strip_control_chars_into(text, &mut cleaned);
    cleaned
}
/// Copies `text` into `out`, leaving out control characters.
///
/// `out` is cleared first, so its previous contents are discarded. A character
/// is kept when it is `'\n'`, `'\t'`, or anything for which
/// [`char::is_control`] is `false`; every other control character (including
/// `'\r'`) is dropped.
pub(crate) fn strip_control_chars_into(text: &str, out: &mut String) {
    out.clear();
    for ch in text.chars() {
        // Newline and tab are the only control characters allowed through.
        if matches!(ch, '\n' | '\t') || !ch.is_control() {
            out.push(ch);
        }
    }
}
/// Returns a new `String` holding `text` as filtered by
/// [`strip_zero_width_chars_into`].
///
/// Allocating convenience wrapper: the buffer starts out empty and is filled
/// by the `_into` variant.
pub(crate) fn strip_zero_width_chars(text: &str) -> String {
    let mut visible = String::new();
    strip_zero_width_chars_into(text, &mut visible);
    visible
}
/// Copies `text` into `out`, leaving out every character for which
/// [`is_zero_width`] returns `true`.
///
/// `out` is cleared first, so its previous contents are discarded. Pure-ASCII
/// input is copied in one step without inspecting individual characters.
pub(crate) fn strip_zero_width_chars_into(text: &str, out: &mut String) {
    out.clear();
    if !text.is_ascii() {
        // Only non-ASCII input is filtered character by character.
        for ch in text.chars() {
            if !is_zero_width(ch) {
                out.push(ch);
            }
        }
    } else {
        out.push_str(text);
    }
}
/// Reports whether `ch` is one of the ten code points this module treats as
/// zero-width:
///
/// * U+200B..=U+200D
/// * U+2060..=U+2064
/// * U+FEFF
/// * U+180E
///
/// No ASCII character is in this set.
pub(crate) fn is_zero_width(ch: char) -> bool {
    matches!(
        ch,
        '\u{200B}'..='\u{200D}' | '\u{2060}'..='\u{2064}' | '\u{FEFF}' | '\u{180E}'
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Already-normalized text is returned unchanged, and mixed whitespace
    /// runs (including leading and trailing ones) collapse to single spaces.
    #[test]
    fn test_collapse_whitespace() {
        assert_eq!(
            collapse_whitespace("hello world", true, true),
            "hello world"
        );
        assert_eq!(
            collapse_whitespace("\thello \n\t world\n", true, true),
            "hello world"
        );
    }

    /// A zero-width space inside a word is removed when stripping is enabled.
    #[test]
    fn test_strip_zero_width() {
        assert_eq!(collapse_whitespace("he\u{200B}llo", true, true), "hello");
    }

    /// U+2061..=U+2064 are each removed when zero-width stripping is enabled.
    #[test]
    fn test_strip_invisible_math_operators() {
        assert_eq!(collapse_whitespace("a\u{2061}b", true, true), "ab");
        assert_eq!(collapse_whitespace("a\u{2062}b", true, true), "ab");
        assert_eq!(collapse_whitespace("a\u{2063}b", true, true), "ab");
        assert_eq!(collapse_whitespace("a\u{2064}b", true, true), "ab");
    }

    /// Every code point accepted by `is_zero_width` is stripped in one pass.
    #[test]
    fn test_strip_all_zero_width_chars() {
        let all_zw = "\u{200B}\u{200C}\u{200D}\u{FEFF}\u{2060}\u{180E}\
                      \u{2061}\u{2062}\u{2063}\u{2064}";
        assert_eq!(
            collapse_whitespace(&format!("x{all_zw}y"), true, true),
            "xy"
        );
        // Guards the fixture itself: it must list all ten code points.
        assert_eq!(all_zw.chars().count(), 10);
    }

    /// `strip_zero_width_chars_into` copies ASCII input without filtering,
    /// which is only sound if no ASCII character is zero-width.
    #[test]
    fn is_zero_width_has_no_ascii() {
        for c in 0u8..0x80 {
            assert!(
                !is_zero_width(c as char),
                "ASCII {c:#04x} must not be zero-width"
            );
        }
    }

    /// NUL is a control character and is dropped when `strip_control` is set.
    #[test]
    fn test_nul_stripped_with_control() {
        assert_eq!(collapse_whitespace("a\x00b", true, true), "ab");
    }

    /// With `strip_control` off, NUL is copied through untouched.
    #[test]
    fn test_nul_preserved_without_control() {
        assert_eq!(collapse_whitespace("a\x00b", false, true), "a\x00b");
    }

    /// With `strip_zero_width` off, zero-width characters are copied through.
    #[test]
    fn test_zero_width_preserved_when_disabled() {
        assert_eq!(collapse_whitespace("a\u{2061}b", true, false), "a\u{2061}b");
    }

    mod proptest_properties {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(1000))]

            // Normalizing an already-normalized string changes nothing.
            #[test]
            fn collapse_whitespace_idempotent(s in "\\PC*") {
                let once = collapse_whitespace(&s, true, true);
                let twice = collapse_whitespace(&once, true, true);
                prop_assert_eq!(&once, &twice);
            }

            // All whitespace is emitted as ' ', so checking the first and
            // last byte for b' ' covers leading and trailing whitespace.
            #[test]
            fn no_leading_trailing_whitespace(s in "\\PC*") {
                let result = collapse_whitespace(&s, true, true);
                if !result.is_empty() {
                    prop_assert_ne!(result.as_bytes()[0], b' ');
                    prop_assert_ne!(result.as_bytes()[result.len() - 1], b' ');
                }
            }

            // The needle must be TWO spaces: a single space between words is
            // valid output. It is spelled with escapes so that its length
            // stays unambiguous.
            #[test]
            fn no_consecutive_spaces(s in "\\PC*") {
                let result = collapse_whitespace(&s, true, true);
                prop_assert!(!result.contains("\x20\x20"), "double space in: {result:?}");
            }

            // Input with no whitespace, control, or zero-width characters
            // passes through unchanged.
            #[test]
            fn alphanumeric_passthrough(s in "[a-zA-Z0-9]{1,50}") {
                let result = collapse_whitespace(&s, true, true);
                prop_assert_eq!(&result, &s);
            }
        }
    }
}