1mod config;
29mod document;
30mod error;
31mod hash;
32mod pipeline;
33mod token;
34mod whitespace;
35
36pub use crate::config::CanonicalizeConfig;
37pub use crate::document::CanonicalizedDocument;
38pub use crate::error::CanonicalError;
39pub use crate::hash::{hash_canonical_bytes, hash_text};
40pub use crate::pipeline::canonicalize;
41pub use crate::token::{tokenize, Token};
42pub use crate::whitespace::collapse_whitespace;
43
#[cfg(test)]
mod tests {
    use super::*;

    /// With the default configuration, mixed-case input containing a newline and
    /// surrounding whitespace is expected to come out lowercased with single-space
    /// separators, and the document must echo back its id, version and config.
    #[test]
    fn basic_canonicalize_default() {
        let cfg = CanonicalizeConfig::default();
        let out = canonicalize("doc-basic", " HAcllo\nWORLD! This is UCFP. ", &cfg)
            .expect("canonicalization succeeds");

        assert_eq!(out.canonical_text, "hacllo world! this is ucfp.");
        assert_eq!(out.doc_id, "doc-basic");
        assert_eq!(out.canonical_version, cfg.version);
        assert_eq!(out.config, cfg);

        // (token text, start, end) — the offsets line up with the byte positions of
        // each word inside the expected canonical text above.
        let expected_tokens = [
            ("hacllo", 0usize, 6usize),
            ("world!", 7, 13),
            ("this", 14, 18),
            ("is", 19, 21),
            ("ucfp.", 22, 27),
        ];
        // Check the length first: `zip` would silently stop at the shorter side.
        assert_eq!(out.tokens.len(), expected_tokens.len());
        for (actual, (text, start, end)) in out.tokens.iter().zip(expected_tokens) {
            assert_eq!(actual.text, text);
            assert_eq!(actual.start, start);
            assert_eq!(actual.end, end);
        }

        // The stored digest must equal a fresh hash of the version plus canonical bytes.
        let recomputed =
            hash_canonical_bytes(out.canonical_version, out.canonical_text.as_bytes());
        assert_eq!(out.sha256_hex, recomputed);
    }

    /// With `strip_punctuation` enabled, punctuation is expected to disappear from
    /// the canonical text and to split tokens (e.g. `It's` becomes `it` and `s`).
    #[test]
    fn strip_punctuation_canonicalize() {
        let cfg = CanonicalizeConfig {
            strip_punctuation: true,
            ..CanonicalizeConfig::default()
        };
        let out = canonicalize("doc-strip", "Hello, world! It's UCFP: 100% fun.", &cfg)
            .expect("canonicalization succeeds");

        assert_eq!(out.canonical_text, "hello world it s ucfp 100 fun");

        // Borrow the token texts rather than cloning them for the comparison.
        let token_texts: Vec<&str> = out.tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(
            token_texts,
            ["hello", "world", "it", "s", "ucfp", "100", "fun"]
        );
    }

    /// The precomposed `é` (U+00E9) and `e` followed by a combining acute accent
    /// (U+0301) must canonicalize to the same text and the same hash.
    #[test]
    fn unicode_equivalence_nfkc() {
        let cfg = CanonicalizeConfig::default();

        let doc_a = canonicalize("doc-a", "Caf\u{00E9}", &cfg).expect("canonical composed");
        let doc_b = canonicalize("doc-b", "Cafe\u{0301}", &cfg).expect("canonical decomposed");

        assert_eq!(doc_a.canonical_text, doc_b.canonical_text);
        assert_eq!(doc_a.sha256_hex, doc_b.sha256_hex);
    }

    /// Token offsets are compared against `str::len` values (byte lengths), so a
    /// code point outside the BMP (U+10348) must advance the offsets by its full
    /// UTF-8 width.
    #[test]
    fn token_offsets_stable_for_non_bmp() {
        let cfg = CanonicalizeConfig::default();
        let doc =
            canonicalize("doc-token", " a\u{10348}b c ", &cfg).expect("canonicalization succeeds");

        // First token; the second one starts after a single separating space.
        let head = "a\u{10348}b";
        let second_start = head.len() + " ".len();
        let expected = vec![
            Token {
                text: head.to_string(),
                start: 0,
                end: head.len(),
            },
            Token {
                text: "c".to_string(),
                start: second_start,
                end: second_start + "c".len(),
            },
        ];
        assert_eq!(doc.tokens, expected);
    }

    /// Hashing the same text twice must give equal results, including for empty,
    /// CJK and emoji input.
    #[test]
    fn hash_text_determinism() {
        for text in ["", "hello world", "こんにちは世界", "emoji \u{1f600}"] {
            let first = hash_text(text);
            let second = hash_text(text);
            assert_eq!(first, second);
        }
    }

    /// Whitespace-only input must be rejected with `CanonicalError::EmptyInput`.
    #[test]
    fn empty_input_rejected() {
        let cfg = CanonicalizeConfig::default();
        let outcome = canonicalize("empty-doc", " ", &cfg);
        assert!(matches!(outcome, Err(CanonicalError::EmptyInput)));
    }

    /// An empty document id must be rejected with `CanonicalError::MissingDocId`.
    #[test]
    fn missing_doc_id_rejected() {
        let cfg = CanonicalizeConfig::default();
        let outcome = canonicalize("", "content", &cfg);
        assert!(matches!(outcome, Err(CanonicalError::MissingDocId)));
    }

    /// With `normalize_unicode` turned off, the combining accent must survive
    /// untouched while the text is still lowercased.
    #[test]
    fn disable_unicode_normalization() {
        let cfg = CanonicalizeConfig {
            normalize_unicode: false,
            ..CanonicalizeConfig::default()
        };
        let doc = canonicalize("doc-raw", "Cafe\u{0301}", &cfg).expect("canonicalization succeeds");
        assert_eq!(doc.canonical_text, "cafe\u{0301}");
    }

    /// A configuration whose `version` is 0 must be rejected as
    /// `CanonicalError::InvalidConfig`.
    #[test]
    fn invalid_config_version_rejected() {
        let cfg = CanonicalizeConfig {
            version: 0,
            ..CanonicalizeConfig::default()
        };
        let outcome = canonicalize("doc-invalid", "content", &cfg);
        assert!(matches!(outcome, Err(CanonicalError::InvalidConfig(_))));
    }

    /// Two configurations that differ only in `version` must produce the same
    /// canonical text but different hashes.
    #[test]
    fn canonical_hash_includes_version() {
        let cfg_v1 = CanonicalizeConfig::default();
        let cfg_v2 = CanonicalizeConfig {
            version: cfg_v1.version + 1,
            ..CanonicalizeConfig::default()
        };

        let doc_v1 = canonicalize("doc", "Same text", &cfg_v1).expect("v1");
        let doc_v2 = canonicalize("doc", "Same text", &cfg_v2).expect("v2");

        assert_eq!(doc_v1.canonical_text, doc_v2.canonical_text);
        assert_ne!(doc_v1.sha256_hex, doc_v2.sha256_hex);
    }
}