ass_core/utils/utf8/
normalization.rs

//! Text normalization utilities for ASS subtitle processing.
//!
//! Provides helpers for normalizing text content, including line endings,
//! whitespace handling, and other text cleanup operations commonly needed
//! when processing ASS subtitle files from various sources and platforms.
//!
//! # Examples
//!
//! ```rust
//! use ass_core::utils::utf8::{normalize_line_endings, normalize_whitespace};
//!
//! let input = "Line 1\r\nLine 2\rLine 3\n";
//! let normalized = normalize_line_endings(input);
//! assert_eq!(normalized, "Line 1\nLine 2\nLine 3\n");
//!
//! let spaced = normalize_whitespace("Hello\u{00A0}World", false);
//! assert_eq!(spaced, "Hello World");
//! ```
16
17use alloc::{string::String, vec::Vec};
18
/// Rewrite every line terminator in `text` as a single line feed (`\n`).
///
/// Windows terminators (`\r\n`) and classic Mac terminators (a lone `\r`)
/// are both replaced by `\n`; existing `\n` characters are left untouched.
/// All other characters are copied through unchanged.
///
/// # Arguments
///
/// * `text` - Input text that may mix `\r\n`, `\r` and `\n` terminators
///
/// # Returns
///
/// A newly allocated `String` in which `\n` is the only line terminator
///
/// # Examples
///
/// ```rust
/// # use ass_core::utils::utf8::normalize_line_endings;
/// let input = "Line 1\r\nLine 2\rLine 3\n";
/// let normalized = normalize_line_endings(input);
/// assert_eq!(normalized, "Line 1\nLine 2\nLine 3\n");
/// ```
#[must_use]
pub fn normalize_line_endings(text: &str) -> String {
    let mut unified = String::with_capacity(text.len());
    let mut pending = text.chars().peekable();

    while let Some(ch) = pending.next() {
        if ch == '\r' {
            // A carriage return always produces exactly one line feed; when
            // it is the first half of a CRLF pair, swallow the paired `\n`
            // so the pair does not turn into two line feeds.
            if pending.peek() == Some(&'\n') {
                pending.next();
            }
            unified.push('\n');
        } else {
            unified.push(ch);
        }
    }

    unified
}
45
46/// Normalize whitespace characters for consistent processing
47///
48/// Converts various Unicode whitespace characters to standard spaces
49/// and optionally collapses multiple consecutive whitespace characters.
50///
51/// # Arguments
52///
53/// * `text` - Input text with potentially mixed whitespace
54/// * `collapse_multiple` - Whether to collapse multiple spaces into one
55///
56/// # Returns
57///
58/// String with normalized whitespace
59#[must_use]
60pub fn normalize_whitespace(text: &str, collapse_multiple: bool) -> String {
61    let mut result = text
62        .chars()
63        .map(|c| {
64            if c.is_whitespace() && c != '\n' && c != '\t' {
65                ' ' // Convert all whitespace except newlines and tabs to space
66            } else {
67                c
68            }
69        })
70        .collect::<String>();
71
72    if collapse_multiple {
73        result = collapse_consecutive_spaces(&result);
74    }
75
76    result
77}
78
/// Strip control characters that are not needed for text layout.
///
/// Any character reported as a control character by `char::is_control` is
/// dropped, except for the line feed (`\n`), the horizontal tab (`\t`) and
/// the carriage return (`\r`), which are kept. All non-control characters
/// pass through unchanged and in their original order.
///
/// # Arguments
///
/// * `text` - Input text that may contain control characters
///
/// # Returns
///
/// A newly allocated `String` without the removed control characters
#[must_use]
pub fn remove_control_chars(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            // Structural characters are control characters too, but are kept.
            '\n' | '\t' | '\r' => cleaned.push(ch),
            // Every other control character is silently discarded.
            other if other.is_control() => {}
            other => cleaned.push(other),
        }
    }

    cleaned
}
101
/// Trim whitespace from the start and end of each line
///
/// Removes leading and trailing whitespace from every line while
/// preserving the line structure, including a final line terminator when
/// the input has one. Useful for cleaning up formatted text that may have
/// inconsistent indentation.
///
/// Lines are split the way `str::lines` splits them (on `\n`, with an
/// optional preceding `\r`) and are always re-joined with `\n`, so `\r\n`
/// terminators come out as `\n`.
///
/// # Arguments
///
/// * `text` - Input text with potentially inconsistent line formatting
///
/// # Returns
///
/// String with trimmed lines, separated by `\n`
#[must_use]
pub fn trim_lines(text: &str) -> String {
    let trimmed: Vec<&str> = text.lines().map(str::trim).collect();
    let mut result = trimmed.join("\n");

    // `str::lines` does not yield an empty item for a trailing terminator,
    // so joining alone would silently drop the final newline of the input.
    if text.ends_with('\n') {
        result.push('\n');
    }

    result
}
122
/// Collapse runs of ASCII spaces into a single space
///
/// Internal helper: whenever a space (`' '`, U+0020) directly follows
/// another space in `text`, the later one is dropped. Only U+0020 is
/// affected; newlines, tabs and every other character are copied through
/// unchanged, including when they repeat.
fn collapse_consecutive_spaces(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    let mut previous: Option<char> = None;

    for ch in text.chars() {
        // A space is redundant only when the character right before it in
        // the input was also a space.
        let redundant_space = ch == ' ' && previous == Some(' ');
        if !redundant_space {
            collapsed.push(ch);
        }
        previous = Some(ch);
    }

    collapsed
}
145
#[cfg(test)]
mod tests {
    use super::*;

    /// CRLF terminators become single line feeds.
    #[test]
    fn normalize_line_endings_windows() {
        assert_eq!(
            normalize_line_endings("Line 1\r\nLine 2\r\nLine 3"),
            "Line 1\nLine 2\nLine 3"
        );
    }

    /// Lone carriage returns become line feeds.
    #[test]
    fn normalize_line_endings_mac() {
        assert_eq!(
            normalize_line_endings("Line 1\rLine 2\rLine 3"),
            "Line 1\nLine 2\nLine 3"
        );
    }

    /// All three terminator styles may appear in the same input.
    #[test]
    fn normalize_line_endings_mixed() {
        assert_eq!(
            normalize_line_endings("Line 1\r\nLine 2\rLine 3\n"),
            "Line 1\nLine 2\nLine 3\n"
        );
    }

    /// Input that already uses line feeds is returned unchanged.
    #[test]
    fn normalize_line_endings_unix() {
        let unix_text = "Line 1\nLine 2\nLine 3\n";
        assert_eq!(normalize_line_endings(unix_text), unix_text);
    }

    /// Non-breaking space (U+00A0) and en quad (U+2000) become plain spaces.
    #[test]
    fn normalize_whitespace_basic() {
        assert_eq!(
            normalize_whitespace("Hello\u{00A0}World\u{2000}Test", false),
            "Hello World Test"
        );
    }

    /// Tabs and newlines survive whitespace normalization.
    #[test]
    fn normalize_whitespace_preserve_structure() {
        let structured = "Hello\tWorld\nNext Line";
        assert_eq!(normalize_whitespace(structured, false), structured);
    }

    /// With collapsing enabled, runs of spaces shrink to one space.
    #[test]
    fn normalize_whitespace_collapse() {
        assert_eq!(
            normalize_whitespace("Hello    World   Test", true),
            "Hello World Test"
        );
    }

    /// With collapsing disabled, runs of spaces are kept as they are.
    #[test]
    fn normalize_whitespace_no_collapse() {
        let padded = "Hello    World   Test";
        assert_eq!(normalize_whitespace(padded, false), padded);
    }

    /// NUL and unit-separator control characters are removed.
    #[test]
    fn remove_control_chars_basic() {
        assert_eq!(
            remove_control_chars("Hello\x00World\x1FTest"),
            "HelloWorldTest"
        );
    }

    /// Tab, line feed and carriage return are kept.
    #[test]
    fn remove_control_chars_preserve_essential() {
        let essential = "Hello\tWorld\nNext\rLine";
        assert_eq!(remove_control_chars(essential), essential);
    }

    /// Leading and trailing spaces and tabs are stripped from each line.
    #[test]
    fn trim_lines_basic() {
        assert_eq!(
            trim_lines("  Line 1  \n\t Line 2 \t\n   Line 3   "),
            "Line 1\nLine 2\nLine 3"
        );
    }

    /// A whitespace-only line becomes an empty line rather than vanishing.
    #[test]
    fn trim_lines_empty_lines() {
        assert_eq!(trim_lines("Line 1\n   \nLine 3"), "Line 1\n\nLine 3");
    }

    /// Runs of spaces shrink to a single space.
    #[test]
    fn collapse_consecutive_spaces_basic() {
        assert_eq!(
            collapse_consecutive_spaces("Hello    World   Test"),
            "Hello World Test"
        );
    }

    /// Repeated tabs and newlines are not collapsed.
    #[test]
    fn collapse_consecutive_spaces_preserve_other() {
        let repeated = "Hello\t\tWorld\n\nTest";
        assert_eq!(collapse_consecutive_spaces(repeated), repeated);
    }

    /// The helpers compose: line endings, then trimming, then whitespace.
    #[test]
    fn normalization_chain() {
        let unix = normalize_line_endings("  Line 1  \r\n\t Line 2 \t\r   Line 3   ");
        let tidy = trim_lines(&unix);
        assert_eq!(normalize_whitespace(&tidy, true), "Line 1\nLine 2\nLine 3");
    }
}
256}