Skip to main content

canonical/
lib.rs

//! UCFP canonical text layer.
//!
//! This module normalizes text into a deterministic, versioned format. Downstream
//! stages (perceptual, semantic, index) can rely on this for stable identity.
//!
//! ## What we do
//!
//! - Unicode normalization (NFKC by default, configurable)
//! - Casing and punctuation handling (lowercase, optional stripping)
//! - Whitespace normalization (collapses to single spaces)
//! - Tokenization with byte offsets for downstream accuracy
//! - Versioned hashes so you can tell which canonicalization was used
//!
//! ## Pure function guarantee
//!
//! No I/O, no clock calls, no OS/locale dependence. Given the same text
//! and config, you get the same result on any machine.
//!
//! ## Invariants worth knowing
//!
//! - Input should be trusted UTF-8 (usually from ingest stage)
//! - We don't re-validate ingest constraints here
//! - Output depends only on text + config
//! - Hash = SHA-256(version || 0x00 || canonical_text)
//!
//! Bottom line: same input + same config = same output forever.
27
28mod config;
29mod document;
30mod error;
31mod hash;
32mod pipeline;
33mod token;
34mod whitespace;
35
36pub use crate::config::CanonicalizeConfig;
37pub use crate::document::CanonicalizedDocument;
38pub use crate::error::CanonicalError;
39pub use crate::hash::{hash_canonical_bytes, hash_text};
40pub use crate::pipeline::canonicalize;
41pub use crate::token::{tokenize, Token};
42pub use crate::whitespace::collapse_whitespace;
43
#[cfg(test)]
mod tests {
    use super::*;

    /// With the default config, mixed-case input with irregular whitespace
    /// yields lowercase, single-spaced text, and every token's offsets index
    /// into that canonical text.
    #[test]
    fn basic_canonicalize_default() {
        let input = "  HAcllo\nWORLD!  This is   UCFP. ";
        let cfg = CanonicalizeConfig::default();
        let out = canonicalize("doc-basic", input, &cfg).expect("canonicalization succeeds");

        assert_eq!(out.canonical_text, "hacllo world! this is ucfp.");
        assert_eq!(out.doc_id, "doc-basic");
        assert_eq!(out.canonical_version, cfg.version);
        assert_eq!(out.config, cfg);

        // Each entry is (text, start, end).
        let expected_tokens = [
            ("hacllo", 0usize, 6usize),
            ("world!", 7, 13),
            ("this", 14, 18),
            ("is", 19, 21),
            ("ucfp.", 22, 27),
        ];
        // `zip` stops at the shorter side, so check the lengths first to make
        // sure a missing or extra token cannot slip through unnoticed.
        assert_eq!(out.tokens.len(), expected_tokens.len());
        for (token, (text, start, end)) in out.tokens.iter().zip(expected_tokens) {
            assert_eq!(token.text, text);
            assert_eq!(token.start, start);
            assert_eq!(token.end, end);
        }

        // The stored digest must match one recomputed from the document's own
        // version and canonical text.
        let expected_hash =
            hash_canonical_bytes(out.canonical_version, out.canonical_text.as_bytes());
        assert_eq!(out.sha256_hex, expected_hash);
    }

    /// With `strip_punctuation` enabled, punctuation separates words rather
    /// than being deleted in place ("It's" becomes "it s").
    #[test]
    fn strip_punctuation_canonicalize() {
        let input = "Hello, world! It's UCFP: 100% fun.";
        let cfg = CanonicalizeConfig {
            strip_punctuation: true,
            ..Default::default()
        };
        let out = canonicalize("doc-strip", input, &cfg).expect("canonicalization succeeds");
        assert_eq!(out.canonical_text, "hello world it s ucfp 100 fun");

        // Borrow the token text instead of cloning each `String`.
        let token_texts: Vec<&str> = out.tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(
            token_texts,
            ["hello", "world", "it", "s", "ucfp", "100", "fun"]
        );
    }

    /// Precomposed and decomposed spellings of the same text canonicalize to
    /// identical text and therefore identical hashes.
    #[test]
    fn unicode_equivalence_nfkc() {
        let composed = "Caf\u{00E9}";
        let decomposed = "Cafe\u{0301}";
        let cfg = CanonicalizeConfig::default();

        let doc_a = canonicalize("doc-a", composed, &cfg).expect("canonical composed");
        let doc_b = canonicalize("doc-b", decomposed, &cfg).expect("canonical decomposed");

        assert_eq!(doc_a.canonical_text, doc_b.canonical_text);
        assert_eq!(doc_a.sha256_hex, doc_b.sha256_hex);
    }

    /// Token offsets are byte positions, so a character outside the BMP
    /// (four bytes in UTF-8) moves later offsets by its encoded length.
    #[test]
    fn token_offsets_stable_for_non_bmp() {
        let cfg = CanonicalizeConfig::default();
        let doc =
            canonicalize("doc-token", " a\u{10348}b  c ", &cfg).expect("canonicalization succeeds");

        let first = "a\u{10348}b";
        // "c" follows `first` after exactly one single-byte space.
        let second_start = first.len() + 1;
        let expected = vec![
            Token {
                text: first.to_string(),
                start: 0,
                end: first.len(),
            },
            Token {
                text: "c".to_string(),
                start: second_start,
                end: second_start + "c".len(),
            },
        ];
        assert_eq!(doc.tokens, expected);
    }

    /// Hashing the same text twice gives the same digest, including for empty
    /// and non-ASCII input.
    #[test]
    fn hash_text_determinism() {
        let texts = ["", "hello world", "こんにちは世界", "emoji \u{1f600}"];

        for text in texts {
            let hash_once = hash_text(text);
            let hash_twice = hash_text(text);
            assert_eq!(hash_once, hash_twice);
        }
    }

    /// Whitespace-only input is rejected with `EmptyInput`.
    #[test]
    fn empty_input_rejected() {
        let cfg = CanonicalizeConfig::default();
        let res = canonicalize("empty-doc", "   ", &cfg);
        assert!(matches!(res, Err(CanonicalError::EmptyInput)));
    }

    /// An empty document id is rejected with `MissingDocId`.
    #[test]
    fn missing_doc_id_rejected() {
        let cfg = CanonicalizeConfig::default();
        let res = canonicalize("", "content", &cfg);
        assert!(matches!(res, Err(CanonicalError::MissingDocId)));
    }

    /// With `normalize_unicode` off, a decomposed sequence is left decomposed;
    /// lowercasing is still applied.
    #[test]
    fn disable_unicode_normalization() {
        let cfg = CanonicalizeConfig {
            normalize_unicode: false,
            ..Default::default()
        };
        let doc = canonicalize("doc-raw", "Cafe\u{0301}", &cfg).expect("canonicalization succeeds");
        assert_eq!(doc.canonical_text, "cafe\u{0301}");
    }

    /// A config with version 0 is rejected with `InvalidConfig`.
    #[test]
    fn invalid_config_version_rejected() {
        let cfg = CanonicalizeConfig {
            version: 0,
            ..Default::default()
        };
        let res = canonicalize("doc-invalid", "content", &cfg);
        assert!(matches!(res, Err(CanonicalError::InvalidConfig(_))));
    }

    /// Two configs that differ only in version produce the same canonical
    /// text but different hashes.
    #[test]
    fn canonical_hash_includes_version() {
        let cfg_v1 = CanonicalizeConfig::default();
        let cfg_v2 = CanonicalizeConfig {
            version: cfg_v1.version + 1,
            ..CanonicalizeConfig::default()
        };

        let doc_v1 = canonicalize("doc", "Same text", &cfg_v1).expect("v1");
        let doc_v2 = canonicalize("doc", "Same text", &cfg_v2).expect("v2");

        assert_eq!(doc_v1.canonical_text, doc_v2.canonical_text);
        assert_ne!(doc_v1.sha256_hex, doc_v2.sha256_hex);
    }
}