Skip to main content

saorsa_core/
text.rs

//! Text preprocessing — tab expansion, control character filtering, and
//! truncation / display-width helpers.
2
/// Configuration for text preprocessing.
///
/// Derives `PartialEq`/`Eq` so that configurations can be compared directly
/// (for example in `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextConfig {
    /// Tab stop width (default: 8).
    pub tab_width: u8,
}
9
10impl Default for TextConfig {
11    fn default() -> Self {
12        Self { tab_width: 8 }
13    }
14}
15
16impl TextConfig {
17    /// Create a text config with the given tab width.
18    pub fn new(tab_width: u8) -> Self {
19        Self { tab_width }
20    }
21}
22
/// Replace every tab in `text` with spaces up to the next tab stop.
///
/// Tab stops sit at every multiple of `tab_width` columns. The column counter
/// advances by one per `char` (not per byte) and restarts at zero after each
/// `'\n'`.
///
/// A `tab_width` of 0 means there are no tab stops; tabs are then dropped
/// from the output entirely.
pub fn expand_tabs(text: &str, tab_width: u8) -> String {
    let stop = usize::from(tab_width);
    if stop == 0 {
        return text.chars().filter(|&c| c != '\t').collect();
    }

    let mut out = String::with_capacity(text.len());
    let mut col = 0usize;

    for c in text.chars() {
        match c {
            '\t' => {
                // Distance from the current column to the next multiple of `stop`;
                // always in 1..=stop, so a tab on a stop advances a full width.
                let pad = stop - col % stop;
                out.extend(std::iter::repeat(' ').take(pad));
                col += pad;
            }
            '\n' => {
                out.push('\n');
                col = 0;
            }
            other => {
                out.push(other);
                col += 1;
            }
        }
    }

    out
}
56
/// Strip control characters from `text`.
///
/// Removes the C0 controls (U+0000–U+001F), DEL (U+007F) and the C1 controls
/// (U+0080–U+009F). Tab and newline are the only exceptions: they carry
/// layout meaning and are passed through unchanged.
pub fn filter_control_chars(text: &str) -> String {
    let mut kept = String::with_capacity(text.len());
    kept.extend(
        text.chars()
            // `char::is_control` matches Unicode category Cc, which is exactly
            // C0 + DEL + C1.
            .filter(|&c| matches!(c, '\t' | '\n') || !c.is_control()),
    );
    kept
}
87
88/// Preprocess text: expand tabs then filter control characters.
89///
90/// This is a convenience function that first expands tabs to spaces
91/// according to the given configuration, then strips control characters.
92pub fn preprocess(text: &str, config: &TextConfig) -> String {
93    let expanded = expand_tabs(text, config.tab_width);
94    filter_control_chars(&expanded)
95}
96
/// Cut `text` down to at most `max_bytes` bytes without splitting a character.
///
/// When the whole string already fits it is returned as-is. Otherwise the
/// result is the longest prefix whose byte length does not exceed `max_bytes`
/// and that ends on a UTF-8 character boundary.
pub fn truncate_to_char_boundary(text: &str, max_bytes: usize) -> &str {
    if max_bytes >= text.len() {
        return text;
    }
    // Scan downwards from the byte limit for the nearest boundary. Offset 0 is
    // always a boundary, so the search cannot come back empty.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0);
    &text[..cut]
}
113
114/// Calculate the display width of text in terminal cells.
115///
116/// Uses the `unicode-width` crate to account for double-width characters
117/// (CJK, emoji, etc.). Returns the width clamped to `u16::MAX`.
118pub fn string_display_width(text: &str) -> u16 {
119    use unicode_width::UnicodeWidthStr;
120    let width = UnicodeWidthStr::width(text);
121    if width > u16::MAX as usize {
122        u16::MAX
123    } else {
124        width as u16
125    }
126}
127
128/// Truncate a string to fit within a maximum display width.
129///
130/// Iterates over characters, accumulating their display widths until the
131/// limit is reached. Returns a substring that fits within `max_width`
132/// terminal cells without splitting any characters.
133pub fn truncate_to_display_width(text: &str, max_width: usize) -> &str {
134    use unicode_width::UnicodeWidthChar;
135    let mut width = 0usize;
136    for (byte_idx, ch) in text.char_indices() {
137        let ch_width = UnicodeWidthChar::width(ch).unwrap_or(0);
138        if width + ch_width > max_width {
139            return &text[..byte_idx];
140        }
141        width += ch_width;
142    }
143    text
144}
145
#[cfg(test)]
mod tests {
    //! Unit tests for the text preprocessing helpers.
    //!
    //! Grouped by function: byte truncation, display width, display-width
    //! truncation, tab expansion, control-character filtering, and the
    //! combined `preprocess` pipeline.

    use super::*;

    // --- truncate_to_char_boundary -------------------------------------

    #[test]
    fn truncate_at_char_boundary_ascii() {
        let text = "Hello World";
        assert_eq!(truncate_to_char_boundary(text, 5), "Hello");
    }

    #[test]
    fn truncate_at_char_boundary_emoji() {
        // Emoji is 4 bytes; truncating at 7 bytes must not split the emoji
        let text = "Hello \u{1F600} World";
        let result = truncate_to_char_boundary(text, 7);
        // "Hello " is 6 bytes, emoji is 4 bytes, so 7 bytes truncates before emoji
        assert_eq!(result, "Hello ");
    }

    #[test]
    fn truncate_at_char_boundary_cjk() {
        // CJK chars are 3 bytes each
        let text = "\u{4F60}\u{597D}\u{4E16}\u{754C}"; // 你好世界
        let result = truncate_to_char_boundary(text, 7);
        // 3+3=6 fits, 3+3+3=9 doesn't fit in 7 bytes
        assert_eq!(result, "\u{4F60}\u{597D}");
    }

    #[test]
    fn truncate_at_char_boundary_empty() {
        assert_eq!(truncate_to_char_boundary("", 5), "");
    }

    #[test]
    fn truncate_at_char_boundary_zero_limit() {
        assert_eq!(truncate_to_char_boundary("Hello", 0), "");
    }

    #[test]
    fn truncate_at_char_boundary_larger_limit() {
        let text = "Hi";
        assert_eq!(truncate_to_char_boundary(text, 100), "Hi");
    }

    #[test]
    fn truncate_at_char_boundary_exact_fit() {
        // A limit equal to the byte length returns the input untouched
        assert_eq!(truncate_to_char_boundary("Hello", 5), "Hello");
    }

    // --- string_display_width ------------------------------------------

    #[test]
    fn display_width_ascii() {
        assert_eq!(string_display_width("Hello"), 5);
    }

    #[test]
    fn display_width_emoji() {
        // Emoji typically has width 2
        assert_eq!(string_display_width("\u{1F600}"), 2);
    }

    #[test]
    fn display_width_cjk() {
        // Each CJK char has width 2
        assert_eq!(string_display_width("\u{4F60}\u{597D}"), 4);
    }

    #[test]
    fn display_width_empty() {
        assert_eq!(string_display_width(""), 0);
    }

    #[test]
    fn display_width_mixed() {
        // "Hi " = 3, emoji = 2 → 5
        assert_eq!(string_display_width("Hi \u{1F600}"), 5);
    }

    #[test]
    fn display_width_clamps_to_u16_max() {
        // One cell per 'a'; one more than u16::MAX must saturate, not wrap
        let text = "a".repeat(usize::from(u16::MAX) + 1);
        assert_eq!(string_display_width(&text), u16::MAX);
    }

    // --- truncate_to_display_width -------------------------------------

    #[test]
    fn truncate_to_display_width_ascii() {
        assert_eq!(truncate_to_display_width("Hello World", 5), "Hello");
    }

    #[test]
    fn truncate_to_display_width_cjk() {
        // Each CJK is width 2; max_width 5 fits 2 chars (4 width), not 3 (6 width)
        let text = "\u{4F60}\u{597D}\u{4E16}"; // 你好世
        assert_eq!(truncate_to_display_width(text, 5), "\u{4F60}\u{597D}");
    }

    #[test]
    fn truncate_to_display_width_emoji() {
        // "Hi " is width 3, emoji is width 2 → total 5; max 4 stops before emoji
        assert_eq!(truncate_to_display_width("Hi \u{1F600}", 4), "Hi ");
    }

    #[test]
    fn truncate_to_display_width_zero_limit() {
        // No budget at all: even a single-cell character does not fit
        assert_eq!(truncate_to_display_width("Hello", 0), "");
    }

    #[test]
    fn truncate_to_display_width_fits_unchanged() {
        assert_eq!(truncate_to_display_width("Hi", 10), "Hi");
        assert_eq!(truncate_to_display_width("", 0), "");
    }

    // --- expand_tabs -----------------------------------------------------

    #[test]
    fn expand_tabs_single_tab_at_position_zero() {
        // Tab at position 0, width 8 → 8 spaces
        let result = expand_tabs("\t", 8);
        assert_eq!(result, "        ");
        assert_eq!(result.len(), 8);
    }

    #[test]
    fn expand_tabs_after_three_chars() {
        // "abc" (3 chars) then tab → 5 spaces to reach column 8
        let result = expand_tabs("abc\t", 8);
        assert_eq!(result, "abc     ");
        assert_eq!(result.len(), 8);
    }

    #[test]
    fn expand_tabs_after_eight_chars() {
        // "abcdefgh" (8 chars) then tab → 8 spaces to reach column 16
        let result = expand_tabs("abcdefgh\t", 8);
        assert_eq!(result, "abcdefgh        ");
        assert_eq!(result.len(), 16);
    }

    #[test]
    fn expand_tabs_no_tabs_unchanged() {
        let result = expand_tabs("hello world", 8);
        assert_eq!(result, "hello world");
    }

    #[test]
    fn expand_tabs_custom_width_four() {
        // Tab at position 0, width 4 → 4 spaces
        let result = expand_tabs("\t", 4);
        assert_eq!(result, "    ");
        assert_eq!(result.len(), 4);

        // "ab" (2 chars) then tab → 2 spaces to reach column 4
        let result2 = expand_tabs("ab\t", 4);
        assert_eq!(result2, "ab  ");
        assert_eq!(result2.len(), 4);
    }

    #[test]
    fn expand_tabs_multiple_tabs() {
        // Two tabs in a row at position 0, width 4:
        // First tab: 4 spaces (col 0→4), second tab: 4 spaces (col 4→8)
        let result = expand_tabs("\t\t", 4);
        assert_eq!(result, "        ");
        assert_eq!(result.len(), 8);
    }

    #[test]
    fn expand_tabs_with_newline_resets_column() {
        // "abc\n\t" — after newline, column resets to 0, so tab expands to 4 spaces
        let result = expand_tabs("abc\n\t", 4);
        assert_eq!(result, "abc\n    ");
    }

    #[test]
    fn expand_tabs_zero_width_removes_tabs() {
        // Documented special case: width 0 drops tabs instead of expanding them
        assert_eq!(expand_tabs("a\tb\t", 0), "ab");
        assert_eq!(expand_tabs("\t\t", 0), "");
    }

    // --- filter_control_chars --------------------------------------------

    #[test]
    fn filter_control_chars_removes_null() {
        let result = filter_control_chars("hello\x00world");
        assert_eq!(result, "helloworld");
    }

    #[test]
    fn filter_control_chars_removes_bell() {
        let result = filter_control_chars("hello\x07world");
        assert_eq!(result, "helloworld");
    }

    #[test]
    fn filter_control_chars_removes_del() {
        // DEL (0x7F) sits outside the C0 range but is still a control character
        let result = filter_control_chars("hello\x7Fworld");
        assert_eq!(result, "helloworld");
    }

    #[test]
    fn filter_control_chars_removes_carriage_return() {
        // Only tab and newline are preserved; CR is stripped like any other C0
        let result = filter_control_chars("hello\r\nworld");
        assert_eq!(result, "hello\nworld");
    }

    #[test]
    fn filter_control_chars_preserves_tab_and_newline() {
        let result = filter_control_chars("hello\tworld\n");
        assert_eq!(result, "hello\tworld\n");
    }

    #[test]
    fn filter_control_chars_clean_text_unchanged() {
        let result = filter_control_chars("Hello, World! 123");
        assert_eq!(result, "Hello, World! 123");
    }

    #[test]
    fn filter_control_chars_removes_c1_range() {
        // U+0080 through U+009F are C1 control characters
        let text = format!("hello{}world", '\u{0085}'); // NEL
        let result = filter_control_chars(&text);
        assert_eq!(result, "helloworld");
    }

    #[test]
    fn filter_control_chars_c1_range_boundaries() {
        // First (U+0080) and last (U+009F) C1 controls are removed;
        // U+00A0 (no-break space) is just past the range and must survive
        let result = filter_control_chars("\u{0080}a\u{009F}b\u{00A0}");
        assert_eq!(result, "ab\u{00A0}");
    }

    // --- preprocess / TextConfig -----------------------------------------

    #[test]
    fn preprocess_combines_tab_expansion_and_filtering() {
        let config = TextConfig::new(4);
        // Tab should be expanded, bell should be removed
        let result = preprocess("a\tb\x07c", &config);
        // "a" at col 0, tab expands to 3 spaces (to col 4), then "b", then bell removed, then "c"
        assert_eq!(result, "a   bc");
    }

    #[test]
    fn empty_string_handling() {
        assert_eq!(expand_tabs("", 8), "");
        assert_eq!(filter_control_chars(""), "");
        let config = TextConfig::default();
        assert_eq!(preprocess("", &config), "");
    }

    #[test]
    fn text_config_default_tab_width_eight() {
        let config = TextConfig::default();
        assert_eq!(config.tab_width, 8);
    }

    #[test]
    fn text_config_new_stores_tab_width() {
        assert_eq!(TextConfig::new(4).tab_width, 4);
        assert_eq!(TextConfig::new(0).tab_width, 0);
    }
}