// xchecker_utils/canonicalization.rs

1use anyhow::{Context, Result};
2use blake3::Hasher;
3use serde::Serialize;
4
5use crate::error::XCheckerError;
6use crate::types::FileType;
7
8/// Emit a value as JCS-canonical JSON (RFC 8785).
9///
10/// This is the standard way to emit JSON for receipts, status, doctor outputs,
11/// and any other JSON contracts. JCS ensures deterministic output regardless
12/// of field ordering in the source struct.
13///
14/// # Example
15///
16/// ```rust,no_run
17/// use xchecker_utils::canonicalization::emit_jcs;
18/// use serde::Serialize;
19///
20/// #[derive(Serialize)]
21/// struct MyOutput {
22///     name: String,
23///     value: i32,
24/// }
25///
26/// let output = MyOutput { name: "test".into(), value: 42 };
27/// let json = emit_jcs(&output).expect("serialization should succeed");
28/// println!("{}", json);
29/// ```
30pub fn emit_jcs<T: Serialize>(value: &T) -> Result<String> {
31    let json_value =
32        serde_json::to_value(value).with_context(|| "Failed to serialize value to JSON")?;
33    let json_bytes = serde_json_canonicalizer::to_vec(&json_value)
34        .with_context(|| "Failed to canonicalize JSON using JCS")?;
35    String::from_utf8(json_bytes).with_context(|| "JCS output contained invalid UTF-8")
36}
37
// Canonicalization version constants
/// Version tag for the YAML canonicalization algorithm.
#[allow(dead_code)] // Reserved for future content-addressed storage
pub const CANON_VERSION_YAML: &str = "yaml-v1";
/// Version tag for the Markdown normalization algorithm.
#[allow(dead_code)] // Reserved for future content-addressed storage
pub const CANON_VERSION_MD: &str = "md-v1";
/// Combined version string (the YAML and Markdown tags, comma-separated) that
/// `Canonicalizer::new` stores and `Canonicalizer::version` returns.
pub const CANON_VERSION: &str = "yaml-v1,md-v1";
/// Backend identifier returned by `Canonicalizer::backend`.
pub const CANONICALIZATION_BACKEND: &str = "jcs-rfc8785"; // for YAML hashing
45
/// Provides deterministic canonicalization and hashing of content.
///
/// Implements explicit v1 algorithms for YAML and Markdown canonicalization.
/// The type is a cheap value object, so it derives `Debug`, `Clone` and
/// equality to be usable in logs, tests and larger derived structs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Canonicalizer {
    /// Canonicalization version string handed back by `version()`.
    version: String,
}
51
52impl Canonicalizer {
53    /// Create a new canonicalizer with the current version
54    #[must_use]
55    pub fn new() -> Self {
56        Self {
57            version: CANON_VERSION.to_string(),
58        }
59    }
60
61    /// Get the canonicalization version string
62    #[must_use]
63    pub fn version(&self) -> &str {
64        &self.version
65    }
66
67    /// Get the canonicalization backend identifier
68    #[must_use]
69    pub const fn backend(&self) -> &'static str {
70        CANONICALIZATION_BACKEND
71    }
72
73    /// Canonicalize YAML content (v1 algorithm)
74    /// Uses JCS (JSON Canonicalization Scheme) approach:
75    /// 1. Parse YAML → convert to JSON with deterministic maps (`BTreeMap`)
76    /// 2. Keep human-readable YAML on disk (normalized: LF, trim trailing spaces, final newline)
77    /// 3. For hashing: use JCS canonicalization of the JSON representation
78    ///
79    ///    Reserved for future content-addressed verification
80    #[allow(dead_code)]
81    pub fn canonicalize_yaml(&self, content: &str) -> Result<String> {
82        // Parse YAML structure
83        let yaml_value: serde_yaml::Value =
84            serde_yaml::from_str(content).with_context(|| "Failed to parse YAML content")?;
85
86        // Emit normalized YAML for human readability (stored on disk)
87        let mut output = serde_yaml::to_string(&yaml_value)
88            .with_context(|| "Failed to serialize YAML content")?;
89
90        // Normalize line endings and ensure final newline
91        output = self.normalize_line_endings(&output);
92        if !output.ends_with('\n') {
93            output.push('\n');
94        }
95
96        // Remove trailing spaces from each line
97        let lines: Vec<&str> = output.lines().collect();
98        let cleaned_lines: Vec<String> = lines
99            .iter()
100            .map(|line| line.trim_end().to_string())
101            .collect();
102
103        Ok(cleaned_lines.join("\n") + "\n")
104    }
105
106    /// Normalize Markdown content (v1 algorithm)
107    /// Explicit rules:
108    /// 1. Normalize \n, trim trailing spaces, collapse trailing blank lines to 1
109    /// 2. Fence normalization: ``` with language tag preserved
110    /// 3. Final newline enforced
111    /// 4. Normalize heading underlines to # style
112    /// 5. Stable ordering where structure allows
113    pub fn normalize_markdown(&self, content: &str) -> Result<String> {
114        let mut normalized = self.normalize_line_endings(content);
115
116        // Trim trailing spaces from all lines
117        let lines: Vec<&str> = normalized.lines().collect();
118        let mut cleaned_lines: Vec<String> = lines
119            .iter()
120            .map(|line| line.trim_end().to_string())
121            .collect();
122
123        // Normalize fenced code blocks to ``` format
124        for line in &mut cleaned_lines {
125            if line.starts_with("~~~") {
126                // Convert ~~~ to ``` while preserving language tag
127                let lang_tag = line.trim_start_matches('~').trim();
128                if lang_tag.is_empty() {
129                    *line = "```".to_string();
130                } else {
131                    *line = format!("```{lang_tag}");
132                }
133            }
134        }
135
136        normalized = cleaned_lines.join("\n");
137
138        // Collapse multiple trailing blank lines to exactly 1
139        while normalized.ends_with("\n\n\n") {
140            normalized = normalized.trim_end_matches('\n').to_string() + "\n\n";
141        }
142
143        // Ensure file ends with exactly one newline
144        normalized = normalized.trim_end_matches('\n').to_string() + "\n";
145
146        Ok(normalized)
147    }
148
149    /// Normalize plain text content
150    #[must_use]
151    pub fn normalize_text(&self, content: &str) -> String {
152        self.normalize_line_endings(content)
153    }
154
155    /// Compute BLAKE3 hash of canonicalized content with `FileType` dispatch
156    /// For YAML: uses JCS (RFC 8785) canonicalization of JSON representation
157    /// For Markdown: uses v1 normalization rules
158    /// For Text: uses basic line ending normalization
159    pub fn hash_canonicalized(&self, content: &str, file_type: FileType) -> Result<String> {
160        let hash_input = match file_type {
161            FileType::Yaml => {
162                // For YAML, use JCS approach: parse → JSON → canonical JSON → hash
163                let yaml_value: serde_yaml::Value = serde_yaml::from_str(content)
164                    .with_context(|| "Failed to parse YAML content for hashing")?;
165
166                // Convert to JSON Value (BTreeMap ensures deterministic ordering)
167                let json_value: serde_json::Value =
168                    serde_yaml::from_str(&serde_yaml::to_string(&yaml_value)?)
169                        .with_context(|| "Failed to convert YAML to JSON for hashing")?;
170
171                // Use JCS canonicalization for deterministic JSON
172                serde_json_canonicalizer::to_vec(&json_value)
173                    .map(|bytes| String::from_utf8(bytes).unwrap())
174                    .with_context(|| "Failed to canonicalize JSON using JCS")?
175            }
176            FileType::Markdown => self.normalize_markdown(content)?,
177            FileType::Text => self.normalize_text(content),
178        };
179
180        let mut hasher = Hasher::new();
181        hasher.update(hash_input.as_bytes());
182        Ok(hasher.finalize().to_hex().to_string())
183    }
184
185    /// Compute BLAKE3 hash of canonicalized content with error context
186    pub fn hash_canonicalized_with_context(
187        &self,
188        content: &str,
189        file_type: FileType,
190        phase: &str,
191    ) -> Result<String, XCheckerError> {
192        self.hash_canonicalized(content, file_type).map_err(|e| {
193            XCheckerError::CanonicalizationFailed {
194                phase: phase.to_string(),
195                reason: e.to_string(),
196            }
197        })
198    }
199
200    /// Normalize line endings to \n only
201    fn normalize_line_endings(&self, content: &str) -> String {
202        content.replace("\r\n", "\n").replace('\r', "\n")
203    }
204}
205
206impl Default for Canonicalizer {
207    fn default() -> Self {
208        Self::new()
209    }
210}
211
212#[cfg(test)]
213mod tests {
214    use super::*;
215
216    #[test]
217    fn test_yaml_canonicalization() {
218        let canonicalizer = Canonicalizer::new();
219
220        // Test basic YAML canonicalization
221        let yaml_content = r"
222name: test
223version: 1.0
224dependencies:
225  - dep1
226  - dep2
227config:
228  debug: true
229  port: 8080
230";
231
232        let result = canonicalizer.canonicalize_yaml(yaml_content);
233        assert!(result.is_ok());
234
235        let canonicalized = result.unwrap();
236        assert!(canonicalized.ends_with('\n'));
237        assert!(!canonicalized.contains('\r'));
238
239        // Test that reordered YAML produces same result
240        let reordered_yaml = r"
241version: 1.0
242name: test
243config:
244  port: 8080
245  debug: true
246dependencies:
247  - dep1
248  - dep2
249";
250
251        let reordered_result = canonicalizer.canonicalize_yaml(reordered_yaml);
252        assert!(reordered_result.is_ok());
253
254        // Note: serde_yaml may not guarantee key ordering, but structure should be preserved
255        let reordered_canonicalized = reordered_result.unwrap();
256        assert!(reordered_canonicalized.ends_with('\n'));
257        assert!(!reordered_canonicalized.contains('\r'));
258    }
259
260    #[test]
261    fn test_markdown_normalization() {
262        let canonicalizer = Canonicalizer::new();
263
264        let markdown_content =
265            "# Title\r\n\r\nSome content with trailing spaces   \r\n\r\n\r\n\r\n";
266        let result = canonicalizer.normalize_markdown(markdown_content);
267        assert!(result.is_ok());
268
269        let normalized = result.unwrap();
270        assert_eq!(normalized, "# Title\n\nSome content with trailing spaces\n");
271        assert!(!normalized.contains('\r'));
272        assert!(!normalized.contains("   \n")); // No trailing spaces
273        assert!(!normalized.ends_with("\n\n\n")); // No multiple trailing newlines
274    }
275
276    #[test]
277    fn test_text_normalization() {
278        let canonicalizer = Canonicalizer::new();
279
280        let text_content = "line1\r\nline2\rline3\n";
281        let normalized = canonicalizer.normalize_text(text_content);
282
283        assert_eq!(normalized, "line1\nline2\nline3\n");
284        assert!(!normalized.contains('\r'));
285    }
286
287    #[test]
288    fn test_hash_consistency() {
289        let canonicalizer = Canonicalizer::new();
290
291        let content = "test content\nwith newlines";
292        let hash1 = canonicalizer
293            .hash_canonicalized(content, FileType::Text)
294            .unwrap();
295        let hash2 = canonicalizer
296            .hash_canonicalized(content, FileType::Text)
297            .unwrap();
298
299        // Same content should produce same hash
300        assert_eq!(hash1, hash2);
301
302        // Different line endings should produce same hash after normalization
303        let content_crlf = "test content\r\nwith newlines";
304        let hash3 = canonicalizer
305            .hash_canonicalized(content_crlf, FileType::Text)
306            .unwrap();
307        assert_eq!(hash1, hash3);
308    }
309
310    #[test]
311    fn test_yaml_hash_determinism() {
312        let canonicalizer = Canonicalizer::new();
313
314        let yaml1 = r"
315name: test
316version: 1.0
317";
318
319        let yaml2 = r"
320version: 1.0
321name: test
322";
323
324        let hash1 = canonicalizer
325            .hash_canonicalized(yaml1, FileType::Yaml)
326            .unwrap();
327        let hash2 = canonicalizer
328            .hash_canonicalized(yaml2, FileType::Yaml)
329            .unwrap();
330
331        // With JCS canonicalization, reordered YAML should produce same hash
332        assert_eq!(hash1, hash2);
333        assert!(!hash1.is_empty());
334        assert!(!hash2.is_empty());
335    }
336
337    #[test]
338    fn test_markdown_hash_determinism() {
339        let canonicalizer = Canonicalizer::new();
340
341        let md1 = "# Title\n\nContent with trailing spaces   \n\n\n";
342        let md2 = "# Title\r\n\r\nContent with trailing spaces\r\n";
343
344        let hash1 = canonicalizer
345            .hash_canonicalized(md1, FileType::Markdown)
346            .unwrap();
347        let hash2 = canonicalizer
348            .hash_canonicalized(md2, FileType::Markdown)
349            .unwrap();
350
351        // Should produce same hash after normalization
352        assert_eq!(hash1, hash2);
353    }
354
355    #[test]
356    fn test_invalid_yaml() {
357        let canonicalizer = Canonicalizer::new();
358
359        let invalid_yaml = "invalid: yaml: content: [unclosed";
360        let result = canonicalizer.canonicalize_yaml(invalid_yaml);
361
362        assert!(result.is_err());
363    }
364
365    #[test]
366    fn test_version_string() {
367        let canonicalizer = Canonicalizer::new();
368        assert_eq!(canonicalizer.version(), "yaml-v1,md-v1");
369    }
370
371    #[test]
372    fn test_backend_string() {
373        let canonicalizer = Canonicalizer::new();
374        assert_eq!(canonicalizer.backend(), "jcs-rfc8785");
375    }
376
377    #[test]
378    fn test_markdown_fence_normalization() {
379        let canonicalizer = Canonicalizer::new();
380
381        let markdown_with_tildes = r#"# Title
382
383Some content
384
385~~~rust
386fn main() {
387    println!("Hello");
388}
389~~~
390
391More content
392"#;
393
394        let result = canonicalizer.normalize_markdown(markdown_with_tildes);
395        assert!(result.is_ok());
396
397        let normalized = result.unwrap();
398        assert!(normalized.contains("```rust"));
399        assert!(!normalized.contains("~~~"));
400        assert!(normalized.ends_with('\n'));
401        assert!(!normalized.ends_with("\n\n"));
402    }
403
404    #[test]
405    fn test_yaml_jcs_canonicalization() {
406        let canonicalizer = Canonicalizer::new();
407
408        // Test simpler YAML with different key ordering (arrays preserve order)
409        let yaml1 = r#"
410config:
411  database:
412    host: localhost
413    port: 5432
414  cache:
415    enabled: true
416    ttl: 300
417name: test
418version: "1.0"
419"#;
420
421        let yaml2 = r#"
422version: "1.0"
423name: test
424config:
425  cache:
426    ttl: 300
427    enabled: true
428  database:
429    port: 5432
430    host: localhost
431"#;
432
433        let hash1 = canonicalizer
434            .hash_canonicalized(yaml1, FileType::Yaml)
435            .unwrap();
436        let hash2 = canonicalizer
437            .hash_canonicalized(yaml2, FileType::Yaml)
438            .unwrap();
439
440        // JCS should ensure identical hashes for structurally equivalent YAML (same keys, different order)
441        assert_eq!(hash1, hash2);
442    }
443
444    #[test]
445    fn test_canonicalization_constants() {
446        assert_eq!(CANON_VERSION_YAML, "yaml-v1");
447        assert_eq!(CANON_VERSION_MD, "md-v1");
448        assert_eq!(CANON_VERSION, "yaml-v1,md-v1");
449        assert_eq!(CANONICALIZATION_BACKEND, "jcs-rfc8785");
450    }
451
452    // Test fixtures with intentionally reordered YAML/Markdown
453
454    #[test]
455    fn test_yaml_reordered_fixtures() {
456        let canonicalizer = Canonicalizer::new();
457
458        // Fixture 1: Complex nested structure with different key orders
459        let yaml_fixture_1a = r#"
460metadata:
461  name: "test-project"
462  version: "1.0.0"
463  authors:
464    - "Alice"
465    - "Bob"
466dependencies:
467  runtime:
468    serde: "1.0"
469    tokio: "1.0"
470  dev:
471    criterion: "0.4"
472config:
473  database:
474    host: "localhost"
475    port: 5432
476    ssl: true
477  logging:
478    level: "info"
479    format: "json"
480"#;
481
482        let yaml_fixture_1b = r#"
483config:
484  logging:
485    format: "json"
486    level: "info"
487  database:
488    ssl: true
489    port: 5432
490    host: "localhost"
491dependencies:
492  dev:
493    criterion: "0.4"
494  runtime:
495    tokio: "1.0"
496    serde: "1.0"
497metadata:
498  authors:
499    - "Alice"
500    - "Bob"
501  version: "1.0.0"
502  name: "test-project"
503"#;
504
505        let hash_1a = canonicalizer
506            .hash_canonicalized(yaml_fixture_1a, FileType::Yaml)
507            .unwrap();
508        let hash_1b = canonicalizer
509            .hash_canonicalized(yaml_fixture_1b, FileType::Yaml)
510            .unwrap();
511
512        // Same content different formatting ⇒ identical hash
513        assert_eq!(
514            hash_1a, hash_1b,
515            "Reordered YAML should produce identical hashes"
516        );
517
518        // Fixture 2: Different whitespace and line endings
519        let yaml_fixture_2a = "name: test\nversion: 1.0\ndebug: true";
520        let yaml_fixture_2b = "name:   test   \r\nversion:  1.0  \r\ndebug:    true   \r\n";
521
522        let hash_2a = canonicalizer
523            .hash_canonicalized(yaml_fixture_2a, FileType::Yaml)
524            .unwrap();
525        let hash_2b = canonicalizer
526            .hash_canonicalized(yaml_fixture_2b, FileType::Yaml)
527            .unwrap();
528
529        assert_eq!(
530            hash_2a, hash_2b,
531            "Different whitespace should produce identical hashes"
532        );
533    }
534
535    #[test]
536    fn test_markdown_reordered_fixtures() {
537        let canonicalizer = Canonicalizer::new();
538
539        // Fixture 1: Different fence styles and trailing spaces
540        let md_fixture_1a = r#"# Project Title
541
542## Overview
543
544This is a test project.
545
546```rust
547fn main() {
548    println!("Hello");
549}
550```
551
552## Features
553
554- Feature 1
555- Feature 2
556
557"#;
558
559        let md_fixture_1b = r#"# Project Title   
560
561## Overview   
562
563This is a test project.   
564
565~~~rust
566fn main() {
567    println!("Hello");
568}
569~~~
570
571## Features   
572
573- Feature 1   
574- Feature 2   
575
576
577
578"#;
579
580        let hash_1a = canonicalizer
581            .hash_canonicalized(md_fixture_1a, FileType::Markdown)
582            .unwrap();
583        let hash_1b = canonicalizer
584            .hash_canonicalized(md_fixture_1b, FileType::Markdown)
585            .unwrap();
586
587        // Same content different formatting ⇒ identical hash
588        assert_eq!(
589            hash_1a, hash_1b,
590            "Different markdown formatting should produce identical hashes"
591        );
592
593        // Fixture 2: Different line endings and trailing newlines
594        let md_fixture_2a = "# Title\n\nContent\n";
595        let md_fixture_2b = "# Title\r\n\r\nContent\r\n\r\n\r\n";
596
597        let hash_2a = canonicalizer
598            .hash_canonicalized(md_fixture_2a, FileType::Markdown)
599            .unwrap();
600        let hash_2b = canonicalizer
601            .hash_canonicalized(md_fixture_2b, FileType::Markdown)
602            .unwrap();
603
604        assert_eq!(
605            hash_2a, hash_2b,
606            "Different line endings should produce identical hashes"
607        );
608    }
609
610    #[test]
611    fn test_structure_determinism_independent_of_formatting() {
612        let canonicalizer = Canonicalizer::new();
613
614        // Test that structure is preserved regardless of formatting
615        let yaml_minimal = "a: 1\nb: 2";
616        let yaml_verbose = r"
617# Comment
618a:    1    # inline comment
619# Another comment
620b:    2    # another inline comment
621";
622
623        // Parse both to verify they have the same structure
624        let parsed_minimal: serde_yaml::Value = serde_yaml::from_str(yaml_minimal).unwrap();
625        let parsed_verbose: serde_yaml::Value = serde_yaml::from_str(yaml_verbose).unwrap();
626
627        // Verify structure is identical
628        assert_eq!(
629            parsed_minimal, parsed_verbose,
630            "Parsed structures should be identical"
631        );
632
633        // Verify hashes are identical
634        let hash_minimal = canonicalizer
635            .hash_canonicalized(yaml_minimal, FileType::Yaml)
636            .unwrap();
637        let hash_verbose = canonicalizer
638            .hash_canonicalized(yaml_verbose, FileType::Yaml)
639            .unwrap();
640
641        assert_eq!(
642            hash_minimal, hash_verbose,
643            "Structure determinism should be independent of formatting"
644        );
645    }
646
647    #[test]
648    fn test_malformed_input_error_handling() {
649        let canonicalizer = Canonicalizer::new();
650
651        // Test malformed YAML
652        let malformed_yaml_cases = [
653            "invalid: yaml: content: [unclosed",
654            "key: 'unclosed string",
655            "- item\n- [unclosed array",
656            "key: {unclosed: object",
657            "---\n...\n---\ninvalid multiple docs",
658        ];
659
660        for (i, malformed_yaml) in malformed_yaml_cases.iter().enumerate() {
661            let result = canonicalizer.hash_canonicalized(malformed_yaml, FileType::Yaml);
662            assert!(
663                result.is_err(),
664                "Malformed YAML case {i} should return error: {malformed_yaml}"
665            );
666
667            // Verify error message is helpful
668            let error_msg = result.unwrap_err().to_string();
669            assert!(
670                error_msg.contains("Failed to parse YAML") || error_msg.contains("YAML"),
671                "Error message should mention YAML parsing: {error_msg}"
672            );
673        }
674
675        // Test that canonicalize_yaml also handles errors properly
676        for (i, malformed_yaml) in malformed_yaml_cases.iter().enumerate() {
677            let result = canonicalizer.canonicalize_yaml(malformed_yaml);
678            assert!(
679                result.is_err(),
680                "canonicalize_yaml case {i} should return error: {malformed_yaml}"
681            );
682        }
683
684        // Test that markdown normalization is more forgiving (should not fail on most inputs)
685        let markdown_inputs = vec![
686            "# Valid markdown",
687            "Invalid markdown without proper structure",
688            "```\ncode without language\n```",
689            "~~~\ncode with tildes\n~~~",
690        ];
691
692        for markdown_input in markdown_inputs {
693            let result = canonicalizer.normalize_markdown(markdown_input);
694            assert!(
695                result.is_ok(),
696                "Markdown normalization should be forgiving: {markdown_input}"
697            );
698        }
699    }
700
701    #[test]
702    fn test_canonicalization_with_context_error_handling() {
703        let canonicalizer = Canonicalizer::new();
704
705        // Test hash_canonicalized_with_context method
706        let malformed_yaml = "invalid: yaml: [unclosed";
707        let result = canonicalizer.hash_canonicalized_with_context(
708            malformed_yaml,
709            FileType::Yaml,
710            "TEST_PHASE",
711        );
712
713        assert!(result.is_err());
714
715        // Verify it returns XCheckerError::CanonicalizationFailed
716        match result.unwrap_err() {
717            XCheckerError::CanonicalizationFailed { phase, reason } => {
718                assert_eq!(phase, "TEST_PHASE");
719                assert!(reason.contains("Failed to parse YAML"));
720            }
721            other => panic!("Expected CanonicalizationFailed, got: {other:?}"),
722        }
723    }
724
725    // ===== Edge Case Tests (Task 9.7) =====
726
727    #[test]
728    fn test_canonicalization_with_empty_content() {
729        let canonicalizer = Canonicalizer::new();
730
731        // Test empty YAML
732        let empty_yaml = "";
733        let result = canonicalizer.canonicalize_yaml(empty_yaml);
734        assert!(result.is_ok());
735        let canonicalized = result.unwrap();
736        // Empty YAML parses as null in serde_yaml, which serializes to "null\n"
737        assert_eq!(canonicalized, "null\n");
738
739        // Test empty Markdown
740        let empty_md = "";
741        let result = canonicalizer.normalize_markdown(empty_md);
742        assert!(result.is_ok());
743        let normalized = result.unwrap();
744        // Empty markdown should produce a newline
745        assert_eq!(normalized, "\n");
746
747        // Test empty text
748        let empty_text = "";
749        let normalized_text = canonicalizer.normalize_text(empty_text);
750        assert_eq!(normalized_text, "");
751
752        // Test hash of empty content
753        let hash_result = canonicalizer.hash_canonicalized(empty_text, FileType::Text);
754        assert!(hash_result.is_ok());
755        let hash = hash_result.unwrap();
756        assert!(!hash.is_empty());
757        assert_eq!(hash.len(), 64); // BLAKE3 produces 64-char hex
758    }
759
760    #[test]
761    fn test_canonicalization_with_special_characters() {
762        let canonicalizer = Canonicalizer::new();
763
764        // Test YAML with special characters
765        let yaml_with_special = r#"
766name: "test-with-special-chars: @#$%^&*()"
767description: "Line with\ttabs and\nnewlines"
768unicode: "Hello 世界 🌍"
769quotes: 'single "quotes" inside'
770"#;
771
772        let result = canonicalizer.canonicalize_yaml(yaml_with_special);
773        assert!(result.is_ok());
774        let canonicalized = result.unwrap();
775        assert!(canonicalized.ends_with('\n'));
776        assert!(!canonicalized.contains('\r'));
777
778        // Test Markdown with special characters
779        let md_with_special = r"# Title with @#$%
780
781Content with **bold** and *italic* and `code`.
782
783- List item with special: <>[]{}
784- Unicode: 你好 مرحبا Здравствуйте
785
786```rust
787fn test() { /* comment */ }
788```
789";
790
791        let result = canonicalizer.normalize_markdown(md_with_special);
792        assert!(result.is_ok());
793        let normalized = result.unwrap();
794        assert!(normalized.ends_with('\n'));
795        assert!(!normalized.contains('\r'));
796        assert!(normalized.contains("你好"));
797        assert!(normalized.contains("مرحبا"));
798
799        // Test hash stability with special characters
800        let hash1 = canonicalizer
801            .hash_canonicalized(md_with_special, FileType::Markdown)
802            .unwrap();
803        let hash2 = canonicalizer
804            .hash_canonicalized(md_with_special, FileType::Markdown)
805            .unwrap();
806        assert_eq!(hash1, hash2);
807    }
808
809    #[test]
810    fn test_canonicalization_with_unicode() {
811        let canonicalizer = Canonicalizer::new();
812
813        // Test YAML with various Unicode scripts
814        let yaml_unicode = r#"
815chinese: "中文测试"
816arabic: "اختبار عربي"
817russian: "Русский тест"
818emoji: "🚀 🌟 ✨"
819mixed: "Hello 世界 🌍"
820"#;
821
822        let result = canonicalizer.canonicalize_yaml(yaml_unicode);
823        assert!(result.is_ok());
824        let canonicalized = result.unwrap();
825        assert!(canonicalized.contains("中文测试"));
826        assert!(canonicalized.contains("اختبار عربي"));
827        assert!(canonicalized.contains("Русский тест"));
828        assert!(canonicalized.contains("🚀"));
829
830        // Test Markdown with Unicode
831        let md_unicode = r"# Unicode Test 测试
832
833## Section with العربية
834
835Content with Русский and 日本語.
836
837- 中文
838- العربية  
839- Русский
840- 日本語
841
842Emoji: 🎉 🎊 🎈
843";
844
845        let result = canonicalizer.normalize_markdown(md_unicode);
846        assert!(result.is_ok());
847        let normalized = result.unwrap();
848        assert!(normalized.contains("测试"));
849        assert!(normalized.contains("العربية"));
850        assert!(normalized.contains("Русский"));
851        assert!(normalized.contains("日本語"));
852        assert!(normalized.contains("🎉"));
853
854        // Test hash determinism with Unicode
855        let hash1 = canonicalizer
856            .hash_canonicalized(yaml_unicode, FileType::Yaml)
857            .unwrap();
858        let hash2 = canonicalizer
859            .hash_canonicalized(yaml_unicode, FileType::Yaml)
860            .unwrap();
861        assert_eq!(hash1, hash2);
862
863        // Test that different Unicode content produces different hashes
864        let yaml_unicode_2 = r#"
865chinese: "不同的中文"
866arabic: "مختلف عربي"
867"#;
868        let hash3 = canonicalizer
869            .hash_canonicalized(yaml_unicode_2, FileType::Yaml)
870            .unwrap();
871        assert_ne!(hash1, hash3);
872    }
873
874    #[test]
875    fn test_canonicalization_with_whitespace_only() {
876        let canonicalizer = Canonicalizer::new();
877
878        // Test YAML with only whitespace (spaces only - tabs can cause parse errors)
879        let whitespace_yaml = "   \n   \n   ";
880        let result = canonicalizer.canonicalize_yaml(whitespace_yaml);
881        assert!(result.is_ok());
882        let canonicalized = result.unwrap();
883        // Whitespace-only YAML parses as null in serde_yaml
884        assert_eq!(canonicalized, "null\n");
885
886        // Test YAML with tabs (should fail to parse)
887        let yaml_with_tabs = "   \n\t\n   ";
888        let result = canonicalizer.canonicalize_yaml(yaml_with_tabs);
889        assert!(
890            result.is_err(),
891            "YAML with tabs at start of line should fail to parse"
892        );
893
894        // Test Markdown with only whitespace
895        let whitespace_md = "   \n\t\n   ";
896        let result = canonicalizer.normalize_markdown(whitespace_md);
897        assert!(result.is_ok());
898        let normalized = result.unwrap();
899        // Should collapse to single newline
900        assert_eq!(normalized, "\n");
901
902        // Test text with only whitespace
903        let whitespace_text = "   \n\t\n   ";
904        let normalized_text = canonicalizer.normalize_text(whitespace_text);
905        assert_eq!(normalized_text, "   \n\t\n   ");
906    }
907
908    #[test]
909    fn test_canonicalization_with_very_long_lines() {
910        let canonicalizer = Canonicalizer::new();
911
912        // Test YAML with very long line
913        let long_value = "a".repeat(10000);
914        let yaml_long = format!("key: \"{long_value}\"");
915        let result = canonicalizer.canonicalize_yaml(&yaml_long);
916        assert!(result.is_ok());
917
918        // Test Markdown with very long line
919        let md_long = format!("# Title\n\n{}\n", "x".repeat(10000));
920        let result = canonicalizer.normalize_markdown(&md_long);
921        assert!(result.is_ok());
922        let normalized = result.unwrap();
923        assert!(normalized.contains(&"x".repeat(10000)));
924    }
925
#[test]
fn test_canonicalization_with_mixed_line_endings() {
    let canon = Canonicalizer::new();

    // YAML input mixing CRLF, LF and a lone trailing CR: the output must
    // contain no carriage return and must still mention every key.
    let yaml_input = "key1: value1\r\nkey2: value2\nkey3: value3\r";
    let yaml_outcome = canon.canonicalize_yaml(yaml_input);
    assert!(yaml_outcome.is_ok());
    let yaml_out = yaml_outcome.unwrap();
    assert!(!yaml_out.contains('\r'));
    for key in ["key1", "key2", "key3"] {
        assert!(yaml_out.contains(key));
    }

    // Markdown input with the same mix of endings: the exact normalized
    // text is pinned, with LF endings only.
    let md_input = "# Title\r\n\r\nContent\nMore content\r";
    let md_outcome = canon.normalize_markdown(md_input);
    assert!(md_outcome.is_ok());
    let md_out = md_outcome.unwrap();
    assert!(!md_out.contains('\r'));
    assert_eq!(md_out, "# Title\n\nContent\nMore content\n");
}
948
949    // ===== Edge Case Tests for Task 9.7 =====
950
#[test]
fn test_canonicalization_empty_content() {
    let canon = Canonicalizer::new();
    let nothing = "";

    // YAML: the empty document must be accepted; the canonical form either
    // is a lone newline or mentions `null`.
    let yaml_outcome = canon.canonicalize_yaml(nothing);
    assert!(yaml_outcome.is_ok());
    let yaml_out = yaml_outcome.unwrap();
    assert!(yaml_out == "\n" || yaml_out.contains("null"));

    // Markdown: empty input normalizes to a single newline.
    let md_outcome = canon.normalize_markdown(nothing);
    assert!(md_outcome.is_ok());
    assert_eq!(md_outcome.unwrap(), "\n");

    // Plain text: empty input stays empty.
    assert_eq!(canon.normalize_text(nothing), "");
}
974
#[test]
fn test_canonicalization_special_characters() {
    let canon = Canonicalizer::new();

    // YAML values containing punctuation, escaped quotes and backslashes.
    let yaml_input = r#"
name: "test@#$%^&*()"
value: "quotes\"and'apostrophes"
path: "C:\\Windows\\System32"
"#;
    let yaml_outcome = canon.canonicalize_yaml(yaml_input);
    assert!(yaml_outcome.is_ok());
    let yaml_out = yaml_outcome.unwrap();
    assert!(yaml_out.contains("test@#$%^&*()"));
    assert!(!yaml_out.contains('\r'));

    // Markdown: punctuation and HTML-significant characters must survive
    // normalization.
    let md_input = "# Title with @#$%\n\nContent with <>&\"'\n";
    let md_outcome = canon.normalize_markdown(md_input);
    assert!(md_outcome.is_ok());
    let md_out = md_outcome.unwrap();
    for fragment in ["@#$%", "<>&\"'"] {
        assert!(md_out.contains(fragment));
    }
}
999
#[test]
fn test_canonicalization_unicode() {
    let canon = Canonicalizer::new();

    // YAML: CJK, emoji and Arabic values must all be found in the output.
    let yaml_input = r#"
name: "Hello 世界 🌍"
emoji: "🚀 ✨ 🎉"
chinese: "中文测试"
arabic: "مرحبا"
"#;
    let yaml_outcome = canon.canonicalize_yaml(yaml_input);
    assert!(yaml_outcome.is_ok());
    let yaml_out = yaml_outcome.unwrap();
    for needle in ["世界", "🌍", "🚀", "中文测试", "مرحبا"] {
        assert!(yaml_out.contains(needle));
    }

    // Markdown: CJK heading, emoji and Cyrillic text must all be found in
    // the normalized output.
    let md_input = "# 标题 Title\n\nContent with émojis: 😀 🎨 ✅\n\nРусский текст\n";
    let md_outcome = canon.normalize_markdown(md_input);
    assert!(md_outcome.is_ok());
    let md_out = md_outcome.unwrap();
    for needle in ["标题", "😀", "Русский"] {
        assert!(md_out.contains(needle));
    }

    // Hashing the same Unicode text twice must give the same digest, and
    // the digest is expected to be 64 characters long (BLAKE3 hex).
    let sample = "Hello 世界 🌍";
    let digest = || canon.hash_canonicalized(sample, FileType::Text).unwrap();
    let first = digest();
    let second = digest();
    assert_eq!(first, second);
    assert_eq!(first.len(), 64);
}
1040
#[test]
fn test_canonicalization_whitespace_edge_cases() {
    let canon = Canonicalizer::new();

    // YAML with padded values and tabs: the output must contain neither a
    // double space directly before a newline nor any tab character.
    let yaml_input = "name:   test   \nvalue:  \t  data  \t\n";
    let yaml_outcome = canon.canonicalize_yaml(yaml_input);
    assert!(yaml_outcome.is_ok());
    let yaml_out = yaml_outcome.unwrap();
    assert!(!yaml_out.contains("  \n"));
    assert!(!yaml_out.contains('\t'));

    // Markdown with trailing spaces and surplus blank lines at the end: the
    // output must not keep the three-space run before a newline, must not
    // end with three consecutive newlines, and must end with a newline.
    let md_input = "# Title   \n\nParagraph with trailing spaces   \n\n\n\n";
    let md_outcome = canon.normalize_markdown(md_input);
    assert!(md_outcome.is_ok());
    let md_out = md_outcome.unwrap();
    assert!(!md_out.contains("   \n"));
    assert!(!md_out.ends_with("\n\n\n"));
    assert!(md_out.ends_with('\n'));
}
1062
#[test]
fn test_hash_with_empty_content() {
    let canon = Canonicalizer::new();

    // Empty text: two hashing passes must agree and yield a 64-character digest.
    let text_hash = || canon.hash_canonicalized("", FileType::Text).unwrap();
    let first = text_hash();
    let second = text_hash();
    assert_eq!(first, second);
    assert_eq!(first.len(), 64);

    // Empty YAML: two hashing passes must agree as well.
    let yaml_hash = || canon.hash_canonicalized("", FileType::Yaml).unwrap();
    let third = yaml_hash();
    let fourth = yaml_hash();
    assert_eq!(third, fourth);
}
1086
#[test]
fn test_invalid_yaml_handling() {
    let canon = Canonicalizer::new();

    // Hashing must fail for each malformed document: a mapping whose brace
    // is never closed, and a document whose opening bracket is never closed.
    for broken in ["{ unclosed bracket", "---\n[invalid"] {
        let outcome = canon.hash_canonicalized(broken, FileType::Yaml);
        assert!(outcome.is_err());
    }
}
1101}