#[cfg(feature = "text-processing")]
use unicode_general_category::{GeneralCategory, get_general_category};
#[cfg(feature = "text-processing")]
use unicode_normalization::UnicodeNormalization;
/// Characters that `text_clean` treats as line breaks and rewrites to `'\n'`.
#[cfg(feature = "text-processing")]
const NEWLINES: &[char] = &[
    '\n',       // U+000A LINE FEED
    '\x0B',     // U+000B LINE TABULATION (vertical tab)
    '\x0C',     // U+000C FORM FEED
    '\r',       // U+000D CARRIAGE RETURN
    '\u{85}',   // U+0085 NEXT LINE
    '\u{2028}', // U+2028 LINE SEPARATOR
    '\u{2029}', // U+2029 PARAGRAPH SEPARATOR
];
/// Returns `true` when `c` belongs to one of the Unicode "C" (other) general
/// categories: control, format, unassigned, private use or surrogate.
#[cfg(feature = "text-processing")]
fn is_c_category(c: char) -> bool {
    let category = get_general_category(c);
    matches!(
        category,
        GeneralCategory::Control
            | GeneralCategory::Format
            | GeneralCategory::Surrogate
            | GeneralCategory::PrivateUse
            | GeneralCategory::Unassigned
    )
}
/// Returns `true` when `c` belongs to a Unicode "C" (other), "M" (mark) or
/// "P" (punctuation) general category.
///
/// `text_collapse` uses this predicate to decide which characters to drop.
#[cfg(feature = "text-processing")]
fn is_cmp_category(c: char) -> bool {
    // Look the category up once and test it against each major class in turn.
    let category = get_general_category(c);

    let is_other = matches!(
        category,
        GeneralCategory::Control
            | GeneralCategory::Format
            | GeneralCategory::Unassigned
            | GeneralCategory::PrivateUse
            | GeneralCategory::Surrogate
    );
    let is_mark = matches!(
        category,
        GeneralCategory::NonspacingMark
            | GeneralCategory::SpacingMark
            | GeneralCategory::EnclosingMark
    );
    let is_punctuation = matches!(
        category,
        GeneralCategory::ConnectorPunctuation
            | GeneralCategory::DashPunctuation
            | GeneralCategory::OpenPunctuation
            | GeneralCategory::ClosePunctuation
            | GeneralCategory::InitialPunctuation
            | GeneralCategory::FinalPunctuation
            | GeneralCategory::OtherPunctuation
    );

    is_other || is_mark || is_punctuation
}
/// Normalizes and tidies `text`.
///
/// Steps, in order:
/// 1. Apply Unicode NFKC normalization.
/// 2. Rewrite every character listed in `NEWLINES` to `'\n'`, treating a
///    `"\r\n"` pair as a single line break, and drop every other character
///    for which `is_c_category` is true (this includes tabs).
/// 3. Collapse runs of consecutive blank lines (empty or whitespace-only)
///    so that only the first line of each run is kept.
/// 4. Trim leading and trailing whitespace from the whole result.
#[cfg(feature = "text-processing")]
pub fn text_clean(text: &str) -> String {
    let normalized: String = text.nfkc().collect();

    // Pass 1: unify line breaks and strip "C" category characters.
    let mut unified = String::with_capacity(normalized.len());
    let mut pending = normalized.chars().peekable();
    while let Some(ch) = pending.next() {
        if NEWLINES.contains(&ch) {
            if ch == '\r' {
                // Swallow the LF of a CRLF pair so it yields one newline only.
                let _ = pending.next_if_eq(&'\n');
            }
            unified.push('\n');
            continue;
        }
        if !is_c_category(ch) {
            unified.push(ch);
        }
    }

    // Pass 2: keep a blank line only when the line before it was not blank.
    let mut previous_blank = false;
    let kept: Vec<&str> = unified
        .split('\n')
        .filter(|line| {
            let blank = line.trim().is_empty();
            let keep = !(blank && previous_blank);
            previous_blank = blank;
            keep
        })
        .collect();

    kept.join("\n").trim().to_string()
}
/// Flattens `text` onto a single line.
///
/// The input is split on Unicode whitespace (which includes line breaks) and
/// the resulting words are joined with exactly one ASCII space, so leading and
/// trailing whitespace disappears and inner runs collapse to a single space.
pub fn text_remove_newlines(text: &str) -> String {
    let mut joined = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first one.
        if !joined.is_empty() {
            joined.push(' ');
        }
        joined.push_str(word);
    }
    joined
}
/// Truncates `text` to at most `nbytes` bytes of UTF-8 and trims whitespace.
///
/// If the byte limit falls inside a multi-byte character, that character is
/// dropped entirely so the result is always valid UTF-8. Leading and trailing
/// whitespace is removed after truncation, and also when no truncation was
/// necessary.
pub fn text_trim(text: &str, nbytes: usize) -> String {
    // Start at the byte limit (or the end of the text) and step back to the
    // nearest character boundary; index 0 is always a boundary.
    let mut cut = nbytes.min(text.len());
    while !text.is_char_boundary(cut) {
        cut -= 1;
    }
    text[..cut].trim().to_string()
}
/// Reduces `text` to a compact form for comparison.
///
/// The text is NFD-decomposed and lowercased, every whitespace character and
/// every character matched by `is_cmp_category` (Unicode "C", "M" and "P"
/// categories) is removed, and the remainder is NFKC-normalized.
#[cfg(feature = "text-processing")]
pub fn text_collapse(text: &str) -> String {
    let decomposed_lower = text.nfd().collect::<String>().to_lowercase();
    decomposed_lower
        .chars()
        .filter(|ch| !(ch.is_whitespace() || is_cmp_category(*ch)))
        .nfkc()
        .collect()
}
/// Hashes `data` with BLAKE3 and returns the digest as a lowercase hex string
/// prefixed with the two bytes `0x1e 0x20`.
///
/// NOTE(review): the prefix presumably follows the multihash convention
/// (`0x1e` = BLAKE3 code, `0x20` = 32-byte digest length) — confirm against
/// the multicodec table.
pub(crate) fn multi_hash_blake3(data: &[u8]) -> String {
    const PREFIX: [u8; 2] = [0x1e, 0x20];

    let digest = blake3::hash(data);
    let digest_bytes = digest.as_bytes();

    let mut multihash = Vec::with_capacity(PREFIX.len() + digest_bytes.len());
    multihash.extend_from_slice(&PREFIX);
    multihash.extend_from_slice(digest_bytes);
    hex::encode(multihash)
}
/// Unit tests for the text utilities and the BLAKE3 multihash helper.
///
/// Tests for functions behind the `text-processing` feature carry the same
/// feature gate so the module still builds when the feature is disabled.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- text_clean ----

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_nfkc_normalization() {
        // U+210D DOUBLE-STRUCK CAPITAL H folds to a plain `H` under NFKC.
        assert!(text_clean("ℍ").contains('H'));
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_removes_control_chars() {
        // A tab is a control character and not a newline, so it is dropped.
        assert_eq!(text_clean("hello\tworld"), "helloworld");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_preserves_newlines() {
        assert_eq!(text_clean("hello\nworld"), "hello\nworld");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_collapses_empty_lines() {
        assert_eq!(text_clean("a\n\n\nb"), "a\n\nb");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_strips_whitespace() {
        // Use runs of spaces on both sides so stripping is actually exercised.
        assert_eq!(text_clean("  hello  "), "hello");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_handles_crlf() {
        // A CRLF pair must become one newline, not two.
        assert_eq!(text_clean("a\r\nb"), "a\nb");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_empty() {
        assert_eq!(text_clean(""), "");
    }

    // ---- text_remove_newlines ----

    #[test]
    fn test_text_remove_newlines() {
        assert_eq!(text_remove_newlines("hello\nworld"), "hello world");
    }

    #[test]
    fn test_text_remove_newlines_collapses_spaces() {
        // Runs of several spaces (and mixed whitespace) must shrink to one space.
        assert_eq!(text_remove_newlines("a  b   c"), "a b c");
        assert_eq!(text_remove_newlines("a \t b\n\nc"), "a b c");
    }

    // ---- text_trim ----

    #[test]
    fn test_text_trim_no_truncation() {
        assert_eq!(text_trim("hello", 10), "hello");
    }

    #[test]
    fn test_text_trim_exact() {
        assert_eq!(text_trim("hello", 5), "hello");
    }

    #[test]
    fn test_text_trim_truncates() {
        assert_eq!(text_trim("hello world", 5), "hello");
    }

    #[test]
    fn test_text_trim_unicode_boundary() {
        // `é` is two bytes in UTF-8; a one-byte limit cannot keep half of it.
        assert_eq!(text_trim("é", 1), "");
    }

    #[test]
    fn test_text_trim_strips() {
        // Whitespace is stripped when nothing is truncated ...
        assert_eq!(text_trim("hello ", 6), "hello");
        // ... and also when truncation leaves trailing whitespace behind.
        assert_eq!(text_trim("hello world", 6), "hello");
    }

    // ---- text_collapse ----

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_basic() {
        assert_eq!(text_collapse("Hello World"), "helloworld");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_strips_accents() {
        assert_eq!(text_collapse("café"), "cafe");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_strips_punctuation() {
        assert_eq!(text_collapse("hello, world!"), "helloworld");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_empty() {
        assert_eq!(text_collapse(""), "");
    }

    // ---- multi_hash_blake3 ----

    #[test]
    fn test_multi_hash_blake3_empty() {
        assert_eq!(
            multi_hash_blake3(b""),
            "1e20af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
        );
    }

    #[test]
    fn test_multi_hash_blake3_hello_world() {
        assert_eq!(
            multi_hash_blake3(b"hello world"),
            "1e20d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"
        );
    }
}