marco_core/logic/utf8.rs
1//! UTF-8 Input Sanitization and Validation
2//!
3//! This module provides defensive UTF-8 handling for all text input sources
4//! (keyboard, clipboard, files). It ensures that invalid UTF-8 sequences are
5//! safely handled and Unicode text is normalized before reaching the parser layer.
6//!
7//! # Architecture
8//! ```text
9//! Raw input (keyboard, clipboard, file)
10//! │
11//! ▼
12//! [UTF-8 Validation → Unicode Normalization → Control Char Filter] ← This module
13//! │
14//! ▼
15//! Parser (nom, Markdown)
16//! │
17//! ▼
18//! Renderer (SourceView5 + WebKit6)
19//! ```
20//!
21//! # Strategy
22//! 1. **Validate** - Check if input is valid UTF-8
23//! 2. **Sanitize** - Replace invalid sequences with � (U+FFFD)
24//! 3. **Normalize** - Apply Unicode NFC normalization (canonical composition)
25//! 4. **Filter** - Remove control characters (except \n, \r, \t)
26//! 5. **Standardize** - Normalize line endings to \n
27//!
28//! # Unicode Normalization
29//!
30//! **Why NFC (Canonical Composition)?**
31//!
32//! Unicode allows multiple representations of visually identical text:
33//! - Precomposed form: `é` (U+00E9, single character)
34//! - Decomposed form: `e` + `´` (U+0065 + U+0301, two characters)
35//!
//! Without normalization:
//! - Parser may treat precomposed `café` (with U+00E9) and decomposed
//!   `cafe` + U+0301 as different strings
//! - Emphasis markers like `*café*` might fail if `é` is decomposed
//!
//! NFC normalization ensures:
//! - Canonically equivalent forms are unified
//! - Multi-script text is stable for tokenization
//! - Parser results are deterministic across platforms
//!
//! NFC only unifies canonically equivalent sequences. Characters that merely
//! look similar, such as em dashes (—, U+2014) and hyphens (-, U+002D), are
//! not canonically equivalent and stay distinct after normalization.
45//!
46//! # Examples
47//! ```
48//! use marco_core::logic::utf8::{sanitize_input, InputSource};
49//!
50//! // From keyboard input
51//! let raw_bytes = b"Hello World";
52//! let safe_text = sanitize_input(raw_bytes, InputSource::Keyboard);
53//!
54//! // From clipboard
55//! let clipboard_bytes = b"Hello \xF0\x28\x8C\x28 World"; // invalid UTF-8
56//! let safe_text = sanitize_input(clipboard_bytes, InputSource::Clipboard);
57//!
58//! // From file
59//! let file_bytes = b"Line1\r\nLine2\r\n";
60//! let safe_text = sanitize_input(file_bytes, InputSource::File);
61//! ```
62
63use std::borrow::Cow;
64use unicode_normalization::UnicodeNormalization;
65
/// Identifies where a piece of input text originated.
///
/// Intended for logging and diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InputSource {
    /// Text typed directly on the keyboard
    Keyboard,
    /// Text pasted from the clipboard
    Clipboard,
    /// Text loaded from a file
    File,
    /// Text received over the network or from an API
    Network,
    /// Any other or unidentified origin
    Unknown,
}

impl std::fmt::Display for InputSource {
    /// Writes the lowercase name of the source (for example `keyboard`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            InputSource::Keyboard => "keyboard",
            InputSource::Clipboard => "clipboard",
            InputSource::File => "file",
            InputSource::Network => "network",
            InputSource::Unknown => "unknown",
        };
        f.write_str(label)
    }
}
92
/// Counters describing what a single sanitization pass did to its input
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SanitizeStats {
    /// Byte length of the raw input
    pub original_bytes: usize,
    /// Byte length of the sanitized output (may differ from the input)
    pub sanitized_bytes: usize,
    /// How many invalid UTF-8 sequences were replaced
    pub invalid_sequences: usize,
    /// How many null bytes were removed
    pub null_bytes_removed: usize,
    /// How many control characters were removed
    pub control_chars_removed: usize,
    /// How many line endings were normalized
    pub line_endings_normalized: usize,
    /// Whether Unicode NFC normalization changed the text
    pub unicode_normalized: bool,
    /// Whether the input was already valid UTF-8
    pub was_valid: bool,
}

impl SanitizeStats {
    /// Report whether the input needed any repair.
    ///
    /// True when the input was not valid UTF-8 or when any of the
    /// replacement/removal counters is non-zero. NFC normalization on its own
    /// does not count as an issue.
    pub fn had_issues(&self) -> bool {
        let counters = [
            self.invalid_sequences,
            self.null_bytes_removed,
            self.control_chars_removed,
            self.line_endings_normalized,
        ];
        !self.was_valid || counters.iter().any(|&count| count > 0)
    }

    /// Build a one-line, human-readable description of the pass.
    pub fn summary(&self) -> String {
        if !(self.had_issues() || self.unicode_normalized) {
            return String::from("Input was clean UTF-8");
        }

        // One optional fragment per counter, in reporting order; absent
        // fragments are dropped by `flatten`.
        let parts: Vec<String> = vec![
            (self.invalid_sequences > 0).then(|| {
                format!(
                    "{} invalid UTF-8 sequences",
                    self.invalid_sequences
                )
            }),
            (self.null_bytes_removed > 0)
                .then(|| format!("{} null bytes", self.null_bytes_removed)),
            (self.control_chars_removed > 0)
                .then(|| format!("{} control chars", self.control_chars_removed)),
            (self.line_endings_normalized > 0)
                .then(|| format!("{} line endings", self.line_endings_normalized)),
            self.unicode_normalized
                .then(|| "Unicode NFC normalized".to_string()),
        ]
        .into_iter()
        .flatten()
        .collect();

        match parts.is_empty() {
            true => String::from("Input was clean"),
            false => format!("Sanitized: {}", parts.join(", ")),
        }
    }
}
157
158/// Sanitize raw bytes into safe UTF-8 string
159///
160/// This is the main entry point for all text input. It:
161/// 1. Replaces invalid UTF-8 with � (U+FFFD REPLACEMENT CHARACTER)
162/// 2. Removes null bytes (security risk)
163/// 3. Normalizes line endings to \n
164///
165/// # Examples
166/// ```
167/// use marco_core::logic::utf8::{sanitize_input, InputSource};
168///
169/// let raw = b"Hello \xF0\x28\x8C\x28 World"; // Invalid UTF-8
170/// let safe = sanitize_input(raw, InputSource::Clipboard);
171/// assert!(safe.contains('�')); // Replacement character
172/// ```
173pub fn sanitize_input(bytes: &[u8], source: InputSource) -> String {
174 let (sanitized, _stats) = sanitize_input_with_stats(bytes, source);
175 sanitized
176}
177
178/// Sanitize raw bytes and return statistics
179///
180/// Same as `sanitize_input()` but also returns detailed statistics
181/// about what was sanitized.
182///
183/// # Examples
184/// ```
185/// use marco_core::logic::utf8::{sanitize_input_with_stats, InputSource};
186///
187/// let raw = b"Hello \xF0\x28\x8C\x28 World";
188/// let (safe, stats) = sanitize_input_with_stats(raw, InputSource::File);
189/// assert!(stats.had_issues());
190/// println!("{}", stats.summary());
191/// ```
192pub fn sanitize_input_with_stats(bytes: &[u8], _source: InputSource) -> (String, SanitizeStats) {
193 let original_bytes = bytes.len();
194
195 // Step 1: Convert to UTF-8, replacing invalid sequences
196 let (utf8_str, invalid_sequences) = match std::str::from_utf8(bytes) {
197 Ok(s) => (Cow::Borrowed(s), 0),
198 Err(_) => {
199 // Use String::from_utf8_lossy which replaces invalid sequences with �
200 let lossy = String::from_utf8_lossy(bytes);
201 let invalid_count = lossy.matches('�').count();
202 (lossy, invalid_count)
203 }
204 };
205
206 let was_valid = invalid_sequences == 0;
207
208 // Step 2: Apply Unicode NFC normalization (canonical composition)
209 // This ensures that canonically equivalent forms are unified:
210 // - Precomposed vs decomposed characters (é vs e + ´)
211 // - Multi-script text stability
212 // - Deterministic parser results
213 let normalized_unicode: String = utf8_str.nfc().collect();
214 let unicode_normalized =
215 normalized_unicode.len() != utf8_str.len() || normalized_unicode != utf8_str.as_ref();
216
217 // Step 3: Remove null bytes (security risk)
218 let (no_nulls, null_bytes_removed) = if normalized_unicode.contains('\0') {
219 let filtered: String = normalized_unicode.chars().filter(|&c| c != '\0').collect();
220 let removed = normalized_unicode.len() - filtered.len();
221 (filtered, removed)
222 } else {
223 (normalized_unicode, 0)
224 };
225
226 // Step 4: Filter control characters (except \n, \r, \t)
227 // This prevents rendering anomalies and potential injection exploits
228 let original_len = no_nulls.len();
229 let filtered: String = no_nulls
230 .chars()
231 .filter(|&c| !c.is_control() || matches!(c, '\n' | '\r' | '\t'))
232 .collect();
233 let control_chars_removed = original_len - filtered.len();
234
235 // Step 5: Normalize line endings (\r\n → \n, \r → \n)
236 let (normalized, line_endings_normalized) = normalize_line_endings(&filtered);
237
238 let sanitized_bytes = normalized.len();
239
240 let stats = SanitizeStats {
241 original_bytes,
242 sanitized_bytes,
243 invalid_sequences,
244 null_bytes_removed,
245 control_chars_removed,
246 line_endings_normalized,
247 unicode_normalized,
248 was_valid,
249 };
250
251 // Log if issues were found (in production, use proper logging)
252 if stats.had_issues() {
253 #[cfg(debug_assertions)]
254 log::debug!("[UTF-8 Sanitizer] {}", stats.summary());
255 }
256
257 (normalized.into_owned(), stats)
258}
259
/// Rewrite every line ending as a Unix-style \n
///
/// Handles both:
/// - \r\n (Windows) → \n
/// - \r (Old Mac) → \n
///
/// Returns the text together with the number of \r characters found in the
/// input. Text without any \r is returned borrowed, with a count of 0.
fn normalize_line_endings(s: &str) -> (Cow<'_, str>, usize) {
    if s.find('\r').is_none() {
        return (Cow::Borrowed(s), 0);
    }

    let mut out = String::with_capacity(s.len());
    let mut cr_seen = 0;
    let mut chars = s.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '\r' {
            out.push(ch);
            continue;
        }

        // Every \r becomes exactly one \n; a directly following \n belongs
        // to the same line ending and is swallowed.
        cr_seen += 1;
        out.push('\n');
        if chars.peek() == Some(&'\n') {
            chars.next();
        }
    }

    (Cow::Owned(out), cr_seen)
}
277
/// Report whether a byte index falls on a UTF-8 character boundary
///
/// Handy before slicing a string at a computed byte position: slicing is only
/// valid at boundaries. Indices past the end of the string are never
/// boundaries; the index equal to the string length is.
///
/// # Examples
/// ```
/// use marco_core::logic::utf8::is_char_boundary;
///
/// let text = "Hello — World"; // Em dash is 3 bytes
/// assert!(is_char_boundary(text, 6)); // After "Hello "
/// assert!(!is_char_boundary(text, 7)); // Inside em dash
/// assert!(is_char_boundary(text, 9)); // After em dash
/// ```
pub fn is_char_boundary(s: &str, index: usize) -> bool {
    str::is_char_boundary(s, index)
}
295
/// Locate the closest char boundary at or before a byte position
///
/// An `index` that already sits on a boundary is returned unchanged;
/// otherwise the result is the start of the character containing `index`.
/// Positions at or beyond the end of the string yield `s.len()`.
///
/// # Examples
/// ```
/// use marco_core::logic::utf8::find_prev_boundary;
///
/// let text = "Hello — World"; // Em dash is 3 bytes
/// assert_eq!(find_prev_boundary(text, 8), 6); // Inside dash → start of dash
/// assert_eq!(find_prev_boundary(text, 9), 9); // Already on boundary
/// ```
pub fn find_prev_boundary(s: &str, index: usize) -> usize {
    // Clamp first: the end of the string is itself a boundary.
    let start = index.min(s.len());

    // Walk backwards; byte 0 is always a boundary, so the search always hits.
    (0..=start)
        .rev()
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(0)
}
320
/// Locate the closest char boundary at or after a byte position
///
/// An `index` that already sits on a boundary is returned unchanged;
/// otherwise the result is the start of the following character. Positions
/// at or beyond the end of the string yield `s.len()`.
///
/// # Examples
/// ```
/// use marco_core::logic::utf8::find_next_boundary;
///
/// let text = "Hello — World"; // Em dash is 3 bytes
/// assert_eq!(find_next_boundary(text, 7), 9); // Inside dash → end of dash
/// assert_eq!(find_next_boundary(text, 6), 6); // Already on boundary
/// ```
pub fn find_next_boundary(s: &str, index: usize) -> usize {
    let len = s.len();

    // An empty range (index >= len) or a search that runs off the end both
    // resolve to the end of the string.
    (index..len)
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(len)
}
345
/// Byte length of the character that starts at a given byte position
///
/// Yields 0 when `index` is not on a character boundary, or when no
/// character starts there (at or past the end of the string).
///
/// # Examples
/// ```
/// use marco_core::logic::utf8::char_byte_length;
///
/// let text = "Hello — World";
/// assert_eq!(char_byte_length(text, 0), 1); // 'H' = 1 byte
/// assert_eq!(char_byte_length(text, 6), 3); // '—' = 3 bytes
/// ```
pub fn char_byte_length(s: &str, index: usize) -> usize {
    // `get` yields `None` for out-of-range or mid-character indices, which
    // covers the boundary check without a separate test.
    s.get(index..)
        .and_then(|tail| tail.chars().next())
        .map_or(0, char::len_utf8)
}
365
/// Safe substring extraction by character count (not bytes!)
///
/// Unlike Rust's `&str[start..end]` which uses byte indices, this function
/// takes character positions and ensures slicing at valid boundaries.
///
/// The range is half-open: the character at `char_end` is not included.
/// Positions past the end of the string are clamped to the end, and an empty
/// or inverted range (`char_end <= char_start`) yields an empty string
/// instead of panicking.
///
/// # Examples
/// ```
/// use marco_core::logic::utf8::substring_by_chars;
///
/// let text = "Hello — World"; // Em dash is 3 bytes
/// assert_eq!(substring_by_chars(text, 0, 5), "Hello");
/// assert_eq!(substring_by_chars(text, 6, 7), "—"); // Single character
/// assert_eq!(substring_by_chars(text, 8, 13), "World");
/// assert_eq!(substring_by_chars(text, 7, 3), ""); // Inverted range
/// ```
pub fn substring_by_chars(s: &str, char_start: usize, char_end: usize) -> &str {
    // Byte offset of each character start, consumed in a single pass.
    let mut offsets = s.char_indices().map(|(offset, _)| offset);

    let byte_start = offsets.nth(char_start).unwrap_or(s.len());

    // Slicing with start > end would panic, so empty and inverted ranges
    // short-circuit to an empty slice anchored at the start position.
    if char_end <= char_start {
        return &s[byte_start..byte_start];
    }

    // The iterator now sits just past `char_start`, so the character at
    // `char_end` is `char_end - char_start - 1` items further on.
    let byte_end = offsets
        .nth(char_end - char_start - 1)
        .unwrap_or(s.len());

    &s[byte_start..byte_end]
}
395
// Unit tests for the sanitization pipeline and the char-boundary helpers.
// Several fixtures rely on exact Unicode literals (em dash, emoji, CJK,
// precomposed é), so the literals must be kept byte-for-byte.
#[cfg(test)]
mod tests {
    use super::*;

    // Plain ASCII passes through untouched and reports no issues.
    #[test]
    fn test_valid_utf8() {
        let input = b"Hello, World!";
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);
        assert_eq!(result, "Hello, World!");
        assert!(!stats.had_issues());
        assert_eq!(stats.invalid_sequences, 0);
    }

    // Invalid bytes are replaced with U+FFFD and counted.
    #[test]
    fn test_invalid_utf8_replaced() {
        // Invalid UTF-8 sequence
        let input = b"Hello \xF0\x28\x8C\x28 World";
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Clipboard);
        assert!(result.contains('�'));
        assert!(stats.had_issues());
        assert!(stats.invalid_sequences > 0);
    }

    // Every NUL byte is stripped and counted individually.
    #[test]
    fn test_null_bytes_removed() {
        let input = b"Hello\x00World\x00";
        let (result, stats) = sanitize_input_with_stats(input, InputSource::File);
        assert_eq!(result, "HelloWorld");
        assert!(stats.had_issues());
        assert_eq!(stats.null_bytes_removed, 2);
    }

    // Windows line endings (\r\n) collapse to \n.
    #[test]
    fn test_line_ending_normalization_crlf() {
        let input = b"Line1\r\nLine2\r\nLine3";
        let (result, stats) = sanitize_input_with_stats(input, InputSource::File);
        assert_eq!(result, "Line1\nLine2\nLine3");
        assert!(stats.had_issues());
        assert!(stats.line_endings_normalized > 0);
    }

    // Lone carriage returns (old Mac style) become \n.
    #[test]
    fn test_line_ending_normalization_cr() {
        let input = b"Line1\rLine2\rLine3";
        let (result, stats) = sanitize_input_with_stats(input, InputSource::File);
        assert_eq!(result, "Line1\nLine2\nLine3");
        assert!(stats.had_issues());
    }

    // Byte offsets inside the 3-byte em dash are not boundaries.
    #[test]
    fn test_em_dash_char_boundary() {
        let text = "Hello — World"; // Em dash (U+2014) is 3 bytes in UTF-8

        // Check boundaries around em dash
        assert!(is_char_boundary(text, 6)); // After "Hello "
        assert!(is_char_boundary(text, 9)); // After em dash (bytes 6-8)
        assert!(!is_char_boundary(text, 7)); // Inside em dash
        assert!(!is_char_boundary(text, 8)); // Inside em dash
    }

    // Boundary search snaps to the start or end of the em dash.
    #[test]
    fn test_find_boundaries() {
        let text = "Hello — World";

        // Find previous boundary from inside em dash
        assert_eq!(find_prev_boundary(text, 7), 6); // Inside → start
        assert_eq!(find_prev_boundary(text, 6), 6); // Already on boundary

        // Find next boundary from inside em dash
        assert_eq!(find_next_boundary(text, 7), 9); // Inside → end
        assert_eq!(find_next_boundary(text, 9), 9); // Already on boundary
    }

    // Character widths of 1, 3 and 4 bytes are reported correctly.
    #[test]
    fn test_char_byte_length() {
        let text = "Hello — World 😀"; // Em dash = 3 bytes, emoji = 4 bytes

        assert_eq!(char_byte_length(text, 0), 1); // 'H' = 1 byte
        assert_eq!(char_byte_length(text, 6), 3); // '—' = 3 bytes
        assert_eq!(char_byte_length(text, 16), 4); // '😀' = 4 bytes
    }

    // Substrings are addressed by character position, not byte offset.
    #[test]
    fn test_substring_by_chars() {
        let text = "Hello — World"; // 13 characters, but more bytes

        assert_eq!(substring_by_chars(text, 0, 5), "Hello");
        assert_eq!(substring_by_chars(text, 6, 7), "—");
        assert_eq!(substring_by_chars(text, 8, 13), "World");
    }

    // 4-byte emoji survive sanitization unchanged.
    #[test]
    fn test_emoji_handling() {
        let input = "Hello 😀 World 🎉".as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);
        assert_eq!(result, "Hello 😀 World 🎉");
        assert!(!stats.had_issues());
    }

    // 3-byte CJK text survives sanitization unchanged.
    #[test]
    fn test_cjk_characters() {
        let input = "こんにちは世界".as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);
        assert_eq!(result, "こんにちは世界");
        assert!(!stats.had_issues());
    }

    // Decomposed input is composed by NFC and flagged in the stats.
    #[test]
    fn test_unicode_nfc_normalization_precomposed() {
        // Test that decomposed form is normalized to precomposed form
        // Decomposed: e (U+0065) + combining acute (U+0301) → Precomposed: é (U+00E9)
        let decomposed = "cafe\u{0301}"; // café with decomposed é
        let input = decomposed.as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);

        // Should be normalized to precomposed form
        assert_eq!(result, "café"); // café with precomposed é (U+00E9)
        assert!(stats.unicode_normalized);
    }

    // Input that is already NFC comes back unchanged.
    #[test]
    fn test_unicode_nfc_already_normalized() {
        // Text already in NFC form should not be changed
        let input = "café".as_bytes(); // Already precomposed
        let (result, _stats) = sanitize_input_with_stats(input, InputSource::Keyboard);

        assert_eq!(result, "café");
        // Note: `unicode_normalized` is not asserted here. It is set only when
        // NFC changes the text, so it is expected to be false as long as the
        // literal above is stored in precomposed form in this file.
    }

    // NFC must not touch the em dash.
    #[test]
    fn test_em_dash_preserved() {
        // Em dash (—, U+2014) should be preserved, not confused with hyphen (-, U+002D)
        let input = "Native performance — no login".as_bytes();
        let (result, _stats) = sanitize_input_with_stats(input, InputSource::Keyboard);

        assert_eq!(result, "Native performance — no login");
        assert!(result.contains('—')); // Em dash preserved

        // Check character codes - find the em dash
        let em_dash_char = result.chars().find(|&c| c == '—').unwrap();
        assert_eq!(em_dash_char as u32, 0x2014); // Verify it's the em dash
    }

    // Hyphen and em dash remain two different characters.
    #[test]
    fn test_hyphen_vs_em_dash() {
        // Test that hyphens and em dashes are distinct after normalization
        let input = "hyphen - and em dash —".as_bytes();
        let (result, _stats) = sanitize_input_with_stats(input, InputSource::Keyboard);

        assert_eq!(result, "hyphen - and em dash —");

        // Count each type
        let hyphen_count = result.matches('-').count();
        let em_dash_count = result.matches('—').count();

        assert_eq!(hyphen_count, 1);
        assert_eq!(em_dash_count, 1);
    }

    // C0 controls are dropped while \n, \t and \r are kept for later steps.
    #[test]
    fn test_control_characters_filtered() {
        // Control characters (except \n, \r, \t) should be removed
        let input = "Hello\x01\x02World\nNew\tLine\r\n".as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::Keyboard);

        // \x01 and \x02 should be removed, but \n, \t, \r should be preserved (then normalized)
        assert_eq!(result, "HelloWorld\nNew\tLine\n");
        assert!(stats.control_chars_removed > 0);
    }

    // Markdown markers and em dashes coexist after sanitization.
    #[test]
    fn test_complex_markdown_with_em_dashes() {
        // Real-world test with markdown containing em dashes
        let input = "- **Bold** — description\n- *Italic* — another item".as_bytes();
        let (result, _stats) = sanitize_input_with_stats(input, InputSource::File);

        // Should preserve markdown structure and em dashes
        assert!(result.contains("**Bold** — description"));
        assert!(result.contains("*Italic* — another item"));
        assert_eq!(result.matches('—').count(), 2);
    }

    // Mixed-width text (1 to 4 bytes per character) is left intact.
    #[test]
    fn test_mixed_multibyte() {
        // Mix of 1, 2, 3, 4 byte UTF-8 characters
        let input = "ASCII Café 日本語 😀".as_bytes();
        let (result, stats) = sanitize_input_with_stats(input, InputSource::File);
        assert_eq!(result, "ASCII Café 日本語 😀");
        assert!(!stats.had_issues());
    }

    // An input with several kinds of problems yields a "Sanitized" summary.
    #[test]
    fn test_stats_summary() {
        let input = b"Hello\x00\xF0\x28World\r\n";
        let (_result, stats) = sanitize_input_with_stats(input, InputSource::Clipboard);

        assert!(stats.had_issues());
        let summary = stats.summary();
        assert!(summary.contains("Sanitized"));
    }
}