Skip to main content

marco_core/logic/
utf8.rs

1//! UTF-8 Input Sanitization and Validation
2//!
3//! This module provides defensive UTF-8 handling for all text input sources
4//! (keyboard, clipboard, files). It ensures that invalid UTF-8 sequences are
5//! safely handled and Unicode text is normalized before reaching the parser layer.
6//!
7//! # Architecture
8//! ```text
9//! Raw input (keyboard, clipboard, file)
10//!        │
11//!        ▼
12//! [UTF-8 Validation → Unicode Normalization → Control Char Filter]  ← This module
13//!        │
14//!        ▼
15//! Parser (nom, Markdown)
16//!        │
17//!        ▼
18//! Renderer (SourceView5 + WebKit6)
19//! ```
20//!
21//! # Strategy
22//! 1. **Validate** - Check if input is valid UTF-8
23//! 2. **Sanitize** - Replace invalid sequences with � (U+FFFD)
24//! 3. **Normalize** - Apply Unicode NFC normalization (canonical composition)
25//! 4. **Filter** - Remove control characters (except \n, \r, \t)
26//! 5. **Standardize** - Normalize line endings to \n
27//!
28//! # Unicode Normalization
29//!
30//! **Why NFC (Canonical Composition)?**
31//!
32//! Unicode allows multiple representations of visually identical text:
33//! - Precomposed form: `é` (U+00E9, single character)
34//! - Decomposed form: `e` + `´` (U+0065 + U+0301, two characters)
35//!
36//! Without normalization:
37//! - Parser may treat `café` and `café` as different strings
38//! - Emphasis markers like `*café*` might fail if `é` is decomposed
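//!
//! Note that NFC only unifies canonically equivalent sequences. Look-alike
//! characters such as em dashes (—, U+2014) and hyphens (-, U+002D) are not
//! equivalent and stay distinct both before and after normalization.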
40//!
41//! NFC normalization ensures:
42//! - Canonically equivalent forms are unified
43//! - Multi-script text is stable for tokenization
44//! - Parser results are deterministic across platforms
45//!
46//! # Examples
47//! ```
48//! use marco_core::logic::utf8::{sanitize_input, InputSource};
49//!
50//! // From keyboard input
51//! let raw_bytes = b"Hello World";
52//! let safe_text = sanitize_input(raw_bytes, InputSource::Keyboard);
53//!
54//! // From clipboard
55//! let clipboard_bytes = b"Hello \xF0\x28\x8C\x28 World"; // invalid UTF-8
56//! let safe_text = sanitize_input(clipboard_bytes, InputSource::Clipboard);
57//!
58//! // From file
59//! let file_bytes = b"Line1\r\nLine2\r\n";
60//! let safe_text = sanitize_input(file_bytes, InputSource::File);
61//! ```
62
63use std::borrow::Cow;
64use unicode_normalization::UnicodeNormalization;
65
/// Origin of a piece of input text (used for logging/diagnostics)
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InputSource {
    /// Text typed directly on the keyboard
    Keyboard,
    /// Text pasted from the clipboard
    Clipboard,
    /// Text loaded from a file
    File,
    /// Text received over the network or from an API
    Network,
    /// Any other, or unidentified, origin
    Unknown,
}

impl std::fmt::Display for InputSource {
    /// Writes the lowercase name of the source, e.g. `keyboard`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            InputSource::Keyboard => "keyboard",
            InputSource::Clipboard => "clipboard",
            InputSource::File => "file",
            InputSource::Network => "network",
            InputSource::Unknown => "unknown",
        };
        f.write_str(label)
    }
}
92
/// Counters describing what a single sanitization pass changed
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SanitizeStats {
    /// Byte length of the raw input
    pub original_bytes: usize,
    /// Byte length of the sanitized output (replacements/removals may change it)
    pub sanitized_bytes: usize,
    /// How many invalid UTF-8 sequences were replaced
    pub invalid_sequences: usize,
    /// How many null bytes were dropped
    pub null_bytes_removed: usize,
    /// How many control characters were dropped
    pub control_chars_removed: usize,
    /// How many line endings were rewritten
    pub line_endings_normalized: usize,
    /// `true` when Unicode NFC normalization altered the text
    pub unicode_normalized: bool,
    /// `true` when the raw input was already valid UTF-8
    pub was_valid: bool,
}

impl SanitizeStats {
    /// Report whether the input needed any repair.
    ///
    /// NFC normalization alone is not treated as an issue.
    pub fn had_issues(&self) -> bool {
        let repair_counters = [
            self.invalid_sequences,
            self.null_bytes_removed,
            self.control_chars_removed,
            self.line_endings_normalized,
        ];
        !self.was_valid || repair_counters.iter().any(|&count| count > 0)
    }

    /// Build a one-line, human-readable description of the changes.
    pub fn summary(&self) -> String {
        if !(self.had_issues() || self.unicode_normalized) {
            return String::from("Input was clean UTF-8");
        }

        // One optional fragment per kind of change, in reporting order.
        let fragments: Vec<String> = vec![
            (self.invalid_sequences > 0)
                .then(|| format!("{} invalid UTF-8 sequences", self.invalid_sequences)),
            (self.null_bytes_removed > 0)
                .then(|| format!("{} null bytes", self.null_bytes_removed)),
            (self.control_chars_removed > 0)
                .then(|| format!("{} control chars", self.control_chars_removed)),
            (self.line_endings_normalized > 0)
                .then(|| format!("{} line endings", self.line_endings_normalized)),
            self.unicode_normalized
                .then(|| String::from("Unicode NFC normalized")),
        ]
        .into_iter()
        .flatten()
        .collect();

        match fragments.is_empty() {
            true => String::from("Input was clean"),
            false => format!("Sanitized: {}", fragments.join(", ")),
        }
    }
}
157
158/// Sanitize raw bytes into safe UTF-8 string
159///
160/// This is the main entry point for all text input. It:
161/// 1. Replaces invalid UTF-8 with � (U+FFFD REPLACEMENT CHARACTER)
162/// 2. Removes null bytes (security risk)
163/// 3. Normalizes line endings to \n
164///
165/// # Examples
166/// ```
167/// use marco_core::logic::utf8::{sanitize_input, InputSource};
168///
169/// let raw = b"Hello \xF0\x28\x8C\x28 World"; // Invalid UTF-8
170/// let safe = sanitize_input(raw, InputSource::Clipboard);
171/// assert!(safe.contains('�')); // Replacement character
172/// ```
173pub fn sanitize_input(bytes: &[u8], source: InputSource) -> String {
174    let (sanitized, _stats) = sanitize_input_with_stats(bytes, source);
175    sanitized
176}
177
178/// Sanitize raw bytes and return statistics
179///
180/// Same as `sanitize_input()` but also returns detailed statistics
181/// about what was sanitized.
182///
183/// # Examples
184/// ```
185/// use marco_core::logic::utf8::{sanitize_input_with_stats, InputSource};
186///
187/// let raw = b"Hello \xF0\x28\x8C\x28 World";
188/// let (safe, stats) = sanitize_input_with_stats(raw, InputSource::File);
189/// assert!(stats.had_issues());
190/// println!("{}", stats.summary());
191/// ```
192pub fn sanitize_input_with_stats(bytes: &[u8], _source: InputSource) -> (String, SanitizeStats) {
193    let original_bytes = bytes.len();
194
195    // Step 1: Convert to UTF-8, replacing invalid sequences
196    let (utf8_str, invalid_sequences) = match std::str::from_utf8(bytes) {
197        Ok(s) => (Cow::Borrowed(s), 0),
198        Err(_) => {
199            // Use String::from_utf8_lossy which replaces invalid sequences with �
200            let lossy = String::from_utf8_lossy(bytes);
201            let invalid_count = lossy.matches('�').count();
202            (lossy, invalid_count)
203        }
204    };
205
206    let was_valid = invalid_sequences == 0;
207
208    // Step 2: Apply Unicode NFC normalization (canonical composition)
209    // This ensures that canonically equivalent forms are unified:
210    // - Precomposed vs decomposed characters (é vs e + ´)
211    // - Multi-script text stability
212    // - Deterministic parser results
213    let normalized_unicode: String = utf8_str.nfc().collect();
214    let unicode_normalized =
215        normalized_unicode.len() != utf8_str.len() || normalized_unicode != utf8_str.as_ref();
216
217    // Step 3: Remove null bytes (security risk)
218    let (no_nulls, null_bytes_removed) = if normalized_unicode.contains('\0') {
219        let filtered: String = normalized_unicode.chars().filter(|&c| c != '\0').collect();
220        let removed = normalized_unicode.len() - filtered.len();
221        (filtered, removed)
222    } else {
223        (normalized_unicode, 0)
224    };
225
226    // Step 4: Filter control characters (except \n, \r, \t)
227    // This prevents rendering anomalies and potential injection exploits
228    let original_len = no_nulls.len();
229    let filtered: String = no_nulls
230        .chars()
231        .filter(|&c| !c.is_control() || matches!(c, '\n' | '\r' | '\t'))
232        .collect();
233    let control_chars_removed = original_len - filtered.len();
234
235    // Step 5: Normalize line endings (\r\n → \n, \r → \n)
236    let (normalized, line_endings_normalized) = normalize_line_endings(&filtered);
237
238    let sanitized_bytes = normalized.len();
239
240    let stats = SanitizeStats {
241        original_bytes,
242        sanitized_bytes,
243        invalid_sequences,
244        null_bytes_removed,
245        control_chars_removed,
246        line_endings_normalized,
247        unicode_normalized,
248        was_valid,
249    };
250
251    // Log if issues were found (in production, use proper logging)
252    if stats.had_issues() {
253        #[cfg(debug_assertions)]
254        log::debug!("[UTF-8 Sanitizer] {}", stats.summary());
255    }
256
257    (normalized.into_owned(), stats)
258}
259
/// Rewrite every line ending as a Unix-style `\n`
///
/// Handles both:
/// - `\r\n` (Windows) → `\n`
/// - lone `\r` (Old Mac) → `\n`
///
/// Returns the text together with the number of line endings rewritten.
/// Input without any `\r` is handed back borrowed, with a count of 0.
fn normalize_line_endings(s: &str) -> (Cow<'_, str>, usize) {
    let first_cr = match s.find('\r') {
        Some(pos) => pos,
        None => return (Cow::Borrowed(s), 0),
    };

    // Everything before the first '\r' can be copied verbatim.
    let mut out = String::with_capacity(s.len());
    out.push_str(&s[..first_cr]);

    let mut rewritten = 0;
    let mut rest = s[first_cr..].chars().peekable();
    while let Some(ch) = rest.next() {
        if ch != '\r' {
            out.push(ch);
            continue;
        }
        // A '\n' directly after '\r' belongs to the same line ending.
        if rest.peek() == Some(&'\n') {
            rest.next();
        }
        out.push('\n');
        rewritten += 1;
    }

    (Cow::Owned(out), rewritten)
}
277
/// Tell whether a byte index falls on a UTF-8 character boundary
///
/// Handy before slicing a string at a computed byte offset: slicing anywhere
/// other than a boundary panics.
///
/// # Examples
/// ```
/// use marco_core::logic::utf8::is_char_boundary;
///
/// let text = "Hello — World"; // Em dash is 3 bytes
/// assert!(is_char_boundary(text, 6)); // After "Hello "
/// assert!(!is_char_boundary(text, 7)); // Inside em dash
/// assert!(is_char_boundary(text, 9)); // After em dash
/// ```
pub fn is_char_boundary(s: &str, index: usize) -> bool {
    str::is_char_boundary(s, index)
}
295
/// Locate the closest char boundary at or before a byte position
///
/// An `index` that already sits on a boundary is returned unchanged; an
/// `index` inside a character yields the byte offset where that character
/// starts. Any `index` at or past the end of the string yields `s.len()`.
///
/// # Examples
/// ```
/// use marco_core::logic::utf8::find_prev_boundary;
///
/// let text = "Hello — World"; // Em dash is 3 bytes
/// assert_eq!(find_prev_boundary(text, 8), 6); // Inside dash → start of dash
/// assert_eq!(find_prev_boundary(text, 9), 9); // Already on boundary
/// ```
pub fn find_prev_boundary(s: &str, index: usize) -> usize {
    let start = index.min(s.len());
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=start)
        .rev()
        .find(|&candidate| s.is_char_boundary(candidate))
        .unwrap_or(0)
}
320
/// Locate the closest char boundary at or after a byte position
///
/// An `index` that already sits on a boundary is returned unchanged; an
/// `index` inside a character yields the byte offset just past that
/// character. Any `index` at or past the end of the string yields `s.len()`.
///
/// # Examples
/// ```
/// use marco_core::logic::utf8::find_next_boundary;
///
/// let text = "Hello — World"; // Em dash is 3 bytes
/// assert_eq!(find_next_boundary(text, 7), 9); // Inside dash → end of dash
/// assert_eq!(find_next_boundary(text, 6), 6); // Already on boundary
/// ```
pub fn find_next_boundary(s: &str, index: usize) -> usize {
    let end = s.len();
    let start = index.min(end);
    // `end` itself is always a boundary, so the search cannot come up empty.
    (start..=end)
        .find(|&candidate| s.is_char_boundary(candidate))
        .unwrap_or(end)
}
345
/// Byte length of the character that starts at a given byte position
///
/// Yields 0 when `index` is not on a character boundary (including any index
/// past the end) or when no character starts there (end of string).
///
/// # Examples
/// ```
/// use marco_core::logic::utf8::char_byte_length;
///
/// let text = "Hello — World";
/// assert_eq!(char_byte_length(text, 0), 1); // 'H' = 1 byte
/// assert_eq!(char_byte_length(text, 6), 3); // '—' = 3 bytes
/// ```
pub fn char_byte_length(s: &str, index: usize) -> usize {
    // `get` yields None for out-of-range or non-boundary offsets.
    s.get(index..)
        .and_then(|tail| tail.chars().next())
        .map_or(0, char::len_utf8)
}
365
/// Safe substring extraction by character count (not bytes!)
///
/// Unlike Rust's `&str[start..end]` which uses byte indices, this function
/// takes character positions (`char_start` inclusive, `char_end` exclusive)
/// and always slices at valid boundaries.
///
/// The function never panics:
/// - positions past the end of the string are clamped to the end
/// - an empty or reversed range (`char_start >= char_end`) yields `""`
///
/// # Examples
/// ```
/// use marco_core::logic::utf8::substring_by_chars;
///
/// let text = "Hello — World"; // Em dash is 3 bytes
/// assert_eq!(substring_by_chars(text, 0, 5), "Hello");
/// assert_eq!(substring_by_chars(text, 6, 7), "—"); // Single character
/// assert_eq!(substring_by_chars(text, 8, 13), "World");
/// assert_eq!(substring_by_chars(text, 5, 2), ""); // Reversed range
/// ```
pub fn substring_by_chars(s: &str, char_start: usize, char_end: usize) -> &str {
    // A reversed range would otherwise produce byte_start > byte_end and
    // panic when slicing.
    if char_start >= char_end {
        return "";
    }

    // Walk the string once, picking up both byte offsets from one iterator.
    let mut offsets = s.char_indices().map(|(byte_pos, _)| byte_pos);

    let byte_start = match offsets.nth(char_start) {
        Some(pos) => pos,
        None => return "", // start lies at or beyond the end of the string
    };

    // `nth(char_start)` consumed chars 0..=char_start, so the char at
    // `char_end` is `char_end - char_start - 1` items further along.
    let byte_end = offsets
        .nth(char_end - char_start - 1)
        .unwrap_or(s.len());

    &s[byte_start..byte_end]
}
395
/// Unit tests for the sanitization pipeline and the char-boundary helpers.
///
/// Non-ASCII characters whose exact code points matter are written as
/// `\u{...}` escapes so the tests do not depend on how this source file
/// itself is encoded or normalized.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_valid_utf8() {
        let input = b"Hello, World!";
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);
        assert_eq!(result, "Hello, World!");
        assert!(!stats.had_issues());
        assert!(stats.was_valid);
        assert_eq!(stats.invalid_sequences, 0);
        assert_eq!(stats.original_bytes, input.len());
        assert_eq!(stats.sanitized_bytes, result.len());
    }

    #[test]
    fn test_invalid_utf8_replaced() {
        // Invalid UTF-8 sequence
        let input = b"Hello \xF0\x28\x8C\x28 World";
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Clipboard);
        assert!(result.contains('\u{FFFD}'));
        assert!(stats.had_issues());
        assert!(!stats.was_valid);
        assert!(stats.invalid_sequences > 0);
    }

    #[test]
    fn test_null_bytes_removed() {
        let input = b"Hello\x00World\x00";
        let (result, stats) = sanitize_input_with_stats(input, InputSource::File);
        assert_eq!(result, "HelloWorld");
        assert!(stats.had_issues());
        assert_eq!(stats.null_bytes_removed, 2);
    }

    #[test]
    fn test_line_ending_normalization_crlf() {
        let input = b"Line1\r\nLine2\r\nLine3";
        let (result, stats) = sanitize_input_with_stats(input, InputSource::File);
        assert_eq!(result, "Line1\nLine2\nLine3");
        assert!(stats.had_issues());
        assert_eq!(stats.line_endings_normalized, 2);
    }

    #[test]
    fn test_line_ending_normalization_cr() {
        let input = b"Line1\rLine2\rLine3";
        let (result, stats) = sanitize_input_with_stats(input, InputSource::File);
        assert_eq!(result, "Line1\nLine2\nLine3");
        assert!(stats.had_issues());
        assert_eq!(stats.line_endings_normalized, 2);
    }

    #[test]
    fn test_em_dash_char_boundary() {
        let text = "Hello \u{2014} World"; // Em dash (U+2014) is 3 bytes in UTF-8

        // Check boundaries around em dash
        assert!(is_char_boundary(text, 6)); // After "Hello "
        assert!(is_char_boundary(text, 9)); // After em dash (bytes 6-8)
        assert!(!is_char_boundary(text, 7)); // Inside em dash
        assert!(!is_char_boundary(text, 8)); // Inside em dash
    }

    #[test]
    fn test_find_boundaries() {
        let text = "Hello \u{2014} World";

        // Find previous boundary from inside em dash
        assert_eq!(find_prev_boundary(text, 7), 6); // Inside → start
        assert_eq!(find_prev_boundary(text, 6), 6); // Already on boundary

        // Find next boundary from inside em dash
        assert_eq!(find_next_boundary(text, 7), 9); // Inside → end
        assert_eq!(find_next_boundary(text, 9), 9); // Already on boundary

        // Positions past the end clamp to the string length
        assert_eq!(find_prev_boundary(text, 100), text.len());
        assert_eq!(find_next_boundary(text, 100), text.len());
    }

    #[test]
    fn test_char_byte_length() {
        let text = "Hello \u{2014} World \u{1F600}"; // Em dash = 3 bytes, emoji = 4 bytes

        assert_eq!(char_byte_length(text, 0), 1); // 'H' = 1 byte
        assert_eq!(char_byte_length(text, 6), 3); // em dash = 3 bytes
        assert_eq!(char_byte_length(text, 16), 4); // emoji = 4 bytes
        assert_eq!(char_byte_length(text, 7), 0); // Not on a boundary
        assert_eq!(char_byte_length(text, text.len()), 0); // End of string
    }

    #[test]
    fn test_substring_by_chars() {
        let text = "Hello \u{2014} World"; // 13 characters, but more bytes

        assert_eq!(substring_by_chars(text, 0, 5), "Hello");
        assert_eq!(substring_by_chars(text, 6, 7), "\u{2014}");
        assert_eq!(substring_by_chars(text, 8, 13), "World");
    }

    #[test]
    fn test_emoji_handling() {
        let input = "Hello 😀 World 🎉".as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);
        assert_eq!(result, "Hello 😀 World 🎉");
        assert!(!stats.had_issues());
    }

    #[test]
    fn test_cjk_characters() {
        let input = "こんにちは世界".as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);
        assert_eq!(result, "こんにちは世界");
        assert!(!stats.had_issues());
    }

    #[test]
    fn test_unicode_nfc_normalization_precomposed() {
        // Test that decomposed form is normalized to precomposed form
        // Decomposed: e (U+0065) + combining acute (U+0301) → Precomposed: é (U+00E9)
        let decomposed = "cafe\u{0301}";
        let input = decomposed.as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);

        // Should be normalized to precomposed form
        assert_eq!(result, "caf\u{00E9}");
        assert!(stats.unicode_normalized);
        // Normalization on its own is not reported as an issue
        assert!(!stats.had_issues());
    }

    #[test]
    fn test_unicode_nfc_already_normalized() {
        // Text already in NFC form must pass through unchanged
        let input = "caf\u{00E9}".as_bytes(); // Already precomposed
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);

        assert_eq!(result, "caf\u{00E9}");
        // Nothing changed, so the flag must stay false
        assert!(!stats.unicode_normalized);
        assert!(!stats.had_issues());
    }

    #[test]
    fn test_em_dash_preserved() {
        // Em dash (U+2014) should be preserved, not confused with hyphen (U+002D)
        let input = "Native performance \u{2014} no login".as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);

        assert_eq!(result, "Native performance \u{2014} no login");
        assert!(result.contains('\u{2014}')); // Em dash preserved
        assert!(!stats.unicode_normalized);

        // Check character codes - find the em dash
        let em_dash_char = result.chars().find(|&c| c == '\u{2014}').unwrap();
        assert_eq!(em_dash_char as u32, 0x2014); // Verify it's the em dash
    }

    #[test]
    fn test_hyphen_vs_em_dash() {
        // Test that hyphens and em dashes are distinct after normalization
        let input = "hyphen - and em dash \u{2014}".as_bytes();
        let (result, _stats) = sanitize_input_with_stats(input, InputSource::Keyboard);

        assert_eq!(result, "hyphen - and em dash \u{2014}");

        // Count each type
        let hyphen_count = result.matches('-').count();
        let em_dash_count = result.matches('\u{2014}').count();

        assert_eq!(hyphen_count, 1);
        assert_eq!(em_dash_count, 1);
    }

    #[test]
    fn test_control_characters_filtered() {
        // Control characters (except \n, \r, \t) should be removed
        let input = "Hello\x01\x02World\nNew\tLine\r\n".as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);

        // \x01 and \x02 are removed; \n, \t, \r are kept (then \r\n is normalized)
        assert_eq!(result, "HelloWorld\nNew\tLine\n");
        assert_eq!(stats.control_chars_removed, 2);
        assert_eq!(stats.line_endings_normalized, 1);
    }

    #[test]
    fn test_complex_markdown_with_em_dashes() {
        // Real-world test with markdown containing em dashes
        let input =
            "- **Bold** \u{2014} description\n- *Italic* \u{2014} another item".as_bytes();
        let (result, _stats) = sanitize_input_with_stats(input, InputSource::File);

        // Should preserve markdown structure and em dashes
        assert!(result.contains("**Bold** \u{2014} description"));
        assert!(result.contains("*Italic* \u{2014} another item"));
        assert_eq!(result.matches('\u{2014}').count(), 2);
    }

    #[test]
    fn test_mixed_multibyte() {
        // Mix of 1, 2, 3, 4 byte UTF-8 characters
        let input = "ASCII Caf\u{00E9} 日本語 😀".as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::File);
        assert_eq!(result, "ASCII Caf\u{00E9} 日本語 😀");
        assert!(!stats.had_issues());
    }

    #[test]
    fn test_stats_summary() {
        let input = b"Hello\x00\xF0\x28World\r\n";
        let (_result, stats) = sanitize_input_with_stats(input, InputSource::Clipboard);

        assert!(stats.had_issues());
        let summary = stats.summary();
        assert!(summary.contains("Sanitized"));
        assert!(summary.contains("1 null bytes"));
        assert!(summary.contains("1 line endings"));
    }

    #[test]
    fn test_clean_summary() {
        let (_result, stats) = sanitize_input_with_stats(b"plain", InputSource::Unknown);
        assert_eq!(stats.summary(), "Input was clean UTF-8");
    }
}