// html_cleaning/text.rs

//! Text processing utilities.
//!
//! Generic functions for analyzing and normalizing text content.

/// Check if text contains meaningful alphanumeric content.
///
/// Returns `true` if text contains at least one alphanumeric character, as
/// defined by [`char::is_alphanumeric`] (so non-ASCII letters and digits count
/// too). Empty, whitespace-only, and punctuation-only input returns `false`.
///
/// # Examples
///
/// ```
/// use html_cleaning::text;
///
/// assert!(text::has_content("Hello"));
/// assert!(text::has_content("  123  "));
/// assert!(!text::has_content("  ...  "));
/// assert!(!text::has_content(""));
/// ```
#[must_use]
pub fn has_content(text: &str) -> bool {
    text.chars().any(char::is_alphanumeric)
}

/// Check if text contains only whitespace.
///
/// Whitespace is defined by [`char::is_whitespace`]. The empty string has no
/// non-whitespace characters, so it also returns `true`.
///
/// # Examples
///
/// ```
/// use html_cleaning::text;
///
/// assert!(text::is_whitespace_only("   "));
/// assert!(text::is_whitespace_only("\t\n"));
/// assert!(text::is_whitespace_only(""));
/// assert!(!text::is_whitespace_only(" a "));
/// ```
#[must_use]
pub fn is_whitespace_only(text: &str) -> bool {
    text.chars().all(char::is_whitespace)
}

/// Normalize whitespace in text.
///
/// - Trims leading/trailing whitespace
/// - Collapses every run of whitespace characters (spaces, tabs, newlines, ...)
///   to a single ASCII space
///
/// Returns an empty string when the input is empty or whitespace-only.
///
/// # Examples
///
/// ```
/// use html_cleaning::text;
///
/// assert_eq!(text::normalize("  hello   world  "), "hello world");
/// assert_eq!(text::normalize("a\n\nb"), "a b");
/// ```
#[must_use]
pub fn normalize(text: &str) -> String {
    // Each separator run (>= 1 byte) shrinks to one space, so the result is
    // never longer than the input and a single allocation is enough.
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}

/// Count words in text.
///
/// Words are defined as whitespace-separated sequences, so punctuation
/// attached to a word does not create an extra word. Empty or
/// whitespace-only input yields `0`.
///
/// # Examples
///
/// ```
/// use html_cleaning::text;
///
/// assert_eq!(text::word_count("hello world"), 2);
/// assert_eq!(text::word_count(""), 0);
/// ```
#[must_use]
pub fn word_count(text: &str) -> usize {
    text.split_whitespace().count()
}

/// Count sentences in text (approximate).
///
/// Counts sentence-ending punctuation (`.`, `!`, `?`). Every occurrence is
/// counted individually, so an ellipsis (`...`) contributes three and a
/// decimal point such as in `3.14` contributes one. Text without any of
/// these characters yields `0`.
///
/// # Examples
///
/// ```
/// use html_cleaning::text;
///
/// assert_eq!(text::sentence_count("Hello. World!"), 2);
/// assert_eq!(text::sentence_count("No punctuation"), 0);
/// ```
#[must_use]
pub fn sentence_count(text: &str) -> usize {
    text.chars()
        .filter(|c| matches!(c, '.' | '!' | '?'))
        .count()
}

/// Clean text for fuzzy comparison.
///
/// - Removes punctuation (every character that is neither alphanumeric nor
///   whitespace is dropped, not replaced, so `"well-known"` becomes `"wellknown"`)
/// - Converts to lowercase
/// - Normalizes whitespace (trimmed, runs collapsed to a single space)
///
/// # Examples
///
/// ```
/// use html_cleaning::text;
///
/// assert_eq!(text::clean_for_comparison("Hello, World!"), "hello world");
/// ```
#[must_use]
pub fn clean_for_comparison(text: &str) -> String {
    // Step 1: keep only alphanumeric and whitespace characters.
    let mut kept = String::with_capacity(text.len());
    kept.extend(
        text.chars()
            .filter(|c| c.is_alphanumeric() || c.is_whitespace()),
    );

    // Step 2: lowercase the filtered string as a whole. `str::to_lowercase`
    // is used (rather than per-char lowercasing) to keep its string-level
    // casing rules.
    let lowered = kept.to_lowercase();

    // Step 3: collapse whitespace runs; the output cannot exceed `lowered`.
    let mut cleaned = String::with_capacity(lowered.len());
    for word in lowered.split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}

// Unit tests for the text helpers: each test pins typical inputs and the
// empty / whitespace-only edge cases for one function.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_has_content() {
        assert!(has_content("Hello"));
        assert!(has_content("123"));
        assert!(has_content("  a  "));
        assert!(!has_content(""));
        assert!(!has_content("   "));
        assert!(!has_content("..."));
        assert!(!has_content("!@#$%"));
    }

    #[test]
    fn test_is_whitespace_only() {
        // The empty string has no non-whitespace characters.
        assert!(is_whitespace_only(""));
        assert!(is_whitespace_only("   "));
        assert!(is_whitespace_only("\t\n\r"));
        assert!(!is_whitespace_only("a"));
        assert!(!is_whitespace_only(" a "));
    }

    #[test]
    fn test_normalize() {
        assert_eq!(normalize("hello"), "hello");
        assert_eq!(normalize("  hello  "), "hello");
        assert_eq!(normalize("hello   world"), "hello world");
        assert_eq!(normalize("  a  b  c  "), "a b c");
        assert_eq!(normalize(""), "");
        assert_eq!(normalize("   "), "");
        // Newlines and tabs should collapse to single space
        assert_eq!(normalize("a\n\nb"), "a b");
        assert_eq!(normalize("a\t\tb"), "a b");
        assert_eq!(normalize("hello\n\n\nworld"), "hello world");
    }

    #[test]
    fn test_word_count() {
        assert_eq!(word_count("hello world"), 2);
        assert_eq!(word_count("one"), 1);
        assert_eq!(word_count(""), 0);
        assert_eq!(word_count("   "), 0);
        assert_eq!(word_count("a b c d e"), 5);
    }

    #[test]
    fn test_sentence_count() {
        assert_eq!(sentence_count("Hello."), 1);
        assert_eq!(sentence_count("Hello. World!"), 2);
        assert_eq!(sentence_count("What? Really! Yes."), 3);
        assert_eq!(sentence_count("No punctuation"), 0);
        // Every terminator counts, so an ellipsis counts three times.
        assert_eq!(sentence_count("Wait..."), 3);
    }

    #[test]
    fn test_clean_for_comparison() {
        assert_eq!(clean_for_comparison("Hello, World!"), "hello world");
        assert_eq!(clean_for_comparison("Test123"), "test123");
        assert_eq!(clean_for_comparison("  UPPER  lower  "), "upper lower");
        assert_eq!(clean_for_comparison(""), "");
    }
}