use unicode_normalization::UnicodeNormalization;
pub fn nfc(text: &str) -> String {
text.nfc().collect()
}
pub fn nfd(text: &str) -> String {
text.nfd().collect()
}
pub fn nfkc(text: &str) -> String {
text.nfkc().collect()
}
pub fn nfkd(text: &str) -> String {
text.nfkd().collect()
}
pub fn normalize_newlines(text: &str) -> String {
let mut out = String::with_capacity(text.len());
let mut chars = text.chars().peekable();
while let Some(c) = chars.next() {
if c == '\r' {
if chars.peek() == Some(&'\n') {
let _ = chars.next();
}
out.push('\n');
} else {
out.push(c);
}
}
out
}
pub fn normalize_newlines_into(text: &str, out: &mut String) {
out.clear();
out.reserve(text.len());
let mut chars = text.chars().peekable();
while let Some(c) = chars.next() {
if c == '\r' {
if chars.peek() == Some(&'\n') {
let _ = chars.next();
}
out.push('\n');
} else {
out.push(c);
}
}
}
pub fn trim_lines_preserve_spaces(text: &str) -> String {
let normalized = normalize_newlines(text);
normalized
.lines()
.map(|l| l.trim())
.filter(|l| !l.is_empty())
.collect::<Vec<_>>()
.join("\n")
}
pub fn remove_zero_width(text: &str) -> String {
text.chars()
.filter(|&c| {
!matches!(
c,
'\u{200B}' | '\u{200C}' | '\u{200D}' | '\u{2060}' | '\u{FEFF}'
)
})
.collect()
}
pub fn remove_zero_width_into(text: &str, out: &mut String) {
out.clear();
out.reserve(text.len());
out.extend(text.chars().filter(|&c| {
!matches!(
c,
'\u{200B}' | '\u{200C}' | '\u{200D}' | '\u{2060}' | '\u{FEFF}'
)
}));
}
#[must_use]
pub fn contains_zero_width(text: &str) -> bool {
text.chars().any(|c| {
matches!(
c,
'\u{200B}' | '\u{200C}' | '\u{200D}' | '\u{2060}' | '\u{FEFF}'
)
})
}
#[must_use]
pub fn zero_width_with_offsets(text: &str) -> Vec<(usize, char)> {
text.chars()
.enumerate()
.filter_map(|(i, c)| {
if matches!(
c,
'\u{200B}' | '\u{200C}' | '\u{200D}' | '\u{2060}' | '\u{FEFF}'
) {
Some((i, c))
} else {
None
}
})
.collect()
}
pub fn remove_bidi_controls(text: &str) -> String {
text.chars()
.filter(|&c| {
!matches!(
c,
'\u{202A}'
| '\u{202B}'
| '\u{202C}'
| '\u{202D}'
| '\u{202E}'
| '\u{2066}'
| '\u{2067}'
| '\u{2068}'
| '\u{2069}'
| '\u{200E}'
| '\u{200F}'
| '\u{061C}'
)
})
.collect()
}
pub fn remove_bidi_controls_into(text: &str, out: &mut String) {
out.clear();
out.reserve(text.len());
out.extend(text.chars().filter(|&c| {
!matches!(
c,
'\u{202A}'
| '\u{202B}'
| '\u{202C}'
| '\u{202D}'
| '\u{202E}'
| '\u{2066}'
| '\u{2067}'
| '\u{2068}'
| '\u{2069}'
| '\u{200E}'
| '\u{200F}'
| '\u{061C}'
)
}));
}
#[must_use]
pub fn contains_bidi_controls(text: &str) -> bool {
text.chars().any(|c| {
matches!(
c,
'\u{202A}'
| '\u{202B}'
| '\u{202C}'
| '\u{202D}'
| '\u{202E}'
| '\u{2066}'
| '\u{2067}'
| '\u{2068}'
| '\u{2069}'
| '\u{200E}'
| '\u{200F}'
| '\u{061C}'
)
})
}
#[must_use]
pub fn bidi_controls_with_offsets(text: &str) -> Vec<(usize, char)> {
text.chars()
.enumerate()
.filter_map(|(i, c)| {
if matches!(
c,
'\u{202A}'
| '\u{202B}'
| '\u{202C}'
| '\u{202D}'
| '\u{202E}'
| '\u{2066}'
| '\u{2067}'
| '\u{2068}'
| '\u{2069}'
| '\u{200E}'
| '\u{200F}'
| '\u{061C}'
) {
Some((i, c))
} else {
None
}
})
.collect()
}
pub fn collapse_whitespace(text: &str) -> String {
let mut out = String::with_capacity(text.len());
let mut in_ws = true; for c in text.chars() {
if c.is_whitespace() {
in_ws = true;
continue;
}
if in_ws && !out.is_empty() {
out.push(' ');
}
in_ws = false;
out.push(c);
}
out
}
pub fn collapse_whitespace_into(text: &str, out: &mut String) {
out.clear();
out.reserve(text.len());
let mut in_ws = true; for c in text.chars() {
if c.is_whitespace() {
in_ws = true;
continue;
}
if in_ws && !out.is_empty() {
out.push(' ');
}
in_ws = false;
out.push(c);
}
}
#[cfg(test)]
mod tests {
use super::*;
use proptest::prelude::*;
#[test]
fn test_nfc() {
let decomposed = "a\u{0308}";
let normalized = nfc(decomposed);
assert_eq!(normalized, "ä");
}
#[test]
fn test_normalize_newlines() {
let text = "Line 1\r\nLine 2\rLine 3\nLine 4";
let normalized = normalize_newlines(text);
assert!(!normalized.contains("\r\n"));
assert!(!normalized.contains('\r'));
assert!(normalized.contains('\n'));
}
#[test]
fn test_normalize_newlines_into_matches() {
let text = "Line 1\r\nLine 2\rLine 3\nLine 4";
let expected = normalize_newlines(text);
let mut out = String::new();
normalize_newlines_into(text, &mut out);
assert_eq!(out, expected);
}
#[test]
fn test_trim_lines_preserve_spaces() {
let text = " Hello World \r\n\r\n 東京 \n\n Müller ";
let out = trim_lines_preserve_spaces(text);
assert_eq!(out, "Hello World\n東京\nMüller");
}
#[test]
fn test_remove_zero_width() {
let text = "a\u{200b}b\u{200c}c\u{200d}d\u{2060}e\u{feff}f";
assert!(contains_zero_width(text));
assert_eq!(
zero_width_with_offsets(text),
vec![
(1, '\u{200B}'),
(3, '\u{200C}'),
(5, '\u{200D}'),
(7, '\u{2060}'),
(9, '\u{FEFF}')
]
);
assert_eq!(remove_zero_width(text), "abcdef");
assert!(!contains_zero_width(&remove_zero_width(text)));
let mut out = String::new();
remove_zero_width_into(text, &mut out);
assert_eq!(out, "abcdef");
}
#[test]
fn test_remove_bidi_controls() {
let text = "a\u{202e}\u{2066}b\u{2069}\u{202c}\u{200f}c";
assert!(contains_bidi_controls(text));
assert_eq!(
bidi_controls_with_offsets(text),
vec![
(1, '\u{202E}'),
(2, '\u{2066}'),
(4, '\u{2069}'),
(5, '\u{202C}'),
(6, '\u{200F}')
]
);
assert_eq!(remove_bidi_controls(text), "abc");
assert!(!contains_bidi_controls(&remove_bidi_controls(text)));
let mut out = String::new();
remove_bidi_controls_into(text, &mut out);
assert_eq!(out, "abc");
}
#[test]
fn test_remove_bidi_controls_includes_alm() {
let text = "a\u{061c}b";
assert!(contains_bidi_controls(text));
assert_eq!(bidi_controls_with_offsets(text), vec![(1, '\u{061C}')]);
assert_eq!(remove_bidi_controls(text), "ab");
}
#[test]
fn test_collapse_whitespace() {
let text = " hello\tworld \n 東京 \r\n Müller ";
let collapsed = collapse_whitespace(text);
assert_eq!(collapsed, "hello world 東京 Müller");
}
#[test]
fn test_collapse_whitespace_into_matches() {
let text = " hello\tworld \n 東京 \r\n Müller ";
let expected = collapse_whitespace(text);
let mut out = String::new();
collapse_whitespace_into(text, &mut out);
assert_eq!(out, expected);
}
proptest! {
#[test]
fn prop_remove_zero_width_removes_all_targets(s in ".*") {
let out = remove_zero_width(&s);
prop_assert!(!out.contains('\u{200B}'), "ZWSP (U+200B) not removed");
prop_assert!(!out.contains('\u{200C}'), "ZWNJ (U+200C) not removed");
prop_assert!(!out.contains('\u{200D}'), "ZWJ (U+200D) not removed");
prop_assert!(!out.contains('\u{2060}'), "WORD JOINER (U+2060) not removed");
prop_assert!(!out.contains('\u{FEFF}'), "BOM (U+FEFF) not removed");
}
#[test]
fn prop_collapse_whitespace_has_no_runs(s in ".*") {
let out = collapse_whitespace(&s);
prop_assert!(!out.starts_with(char::is_whitespace), "leading whitespace present");
prop_assert!(!out.ends_with(char::is_whitespace), "trailing whitespace present");
prop_assert!(!out.contains(" "), "double-space present");
prop_assert!(!out.contains('\n'), "newline present");
prop_assert!(!out.contains('\t'), "tab present");
prop_assert!(!out.contains('\r'), "CR present");
}
#[test]
fn prop_normalize_newlines_into_equivalent(s in ".*") {
let expected = normalize_newlines(&s);
let mut out = String::new();
normalize_newlines_into(&s, &mut out);
prop_assert_eq!(out, expected);
}
#[test]
fn prop_collapse_whitespace_into_equivalent(s in ".*") {
let expected = collapse_whitespace(&s);
let mut out = String::new();
collapse_whitespace_into(&s, &mut out);
prop_assert_eq!(out, expected);
}
}
}