canon_core/
chunk.rs

1//! Chunk node representing a text segment from a document
2//!
3//! Per CP-011: Chunks use byte offsets and lengths for precise positioning.
4//! Per CP-001: Chunk ID is STABLE - does not include text content.
5
6use crate::text::normalize;
7use serde::{Deserialize, Serialize};
8use uuid::Uuid;
9
10/// A chunk of text extracted from a document
11///
12/// Documents are split into overlapping chunks for embedding.
13/// Each chunk tracks its position within the source document.
14///
15/// Per CP-011: Uses byte-based offsets (not character-based) for accurate
16/// slicing back to original document content.
17///
18/// Per CP-001: Chunk ID is STABLE - ID = hash(doc_id + sequence) only.
19/// This ensures re-chunking with different parameters produces the same IDs.
20#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
21pub struct Chunk {
22    /// Unique identifier for this chunk (BLAKE3-16 of doc_id + sequence) - STABLE
23    pub id: Uuid,
24
25    /// Parent document ID
26    pub doc_id: Uuid,
27
28    /// The actual text content (canonicalized)
29    pub text: String,
30
31    /// Byte offset within the source document (u64 for large files)
32    pub byte_offset: u64,
33
34    /// Length of this chunk in bytes (u64 for large files)
35    pub byte_length: u64,
36
37    /// Sequence number within the document (0-indexed)
38    pub sequence: u32,
39
40    /// Hash of the canonicalized text content for verification
41    pub text_hash: [u8; 32],
42}
43
44impl Chunk {
45    /// Create a new chunk with automatic ID generation.
46    ///
47    /// Per CP-001: Chunk ID is STABLE - does NOT include text.
48    /// This ensures re-chunking with different parameters produces same IDs.
49    /// Content is verified via text_hash field.
50    pub fn new(doc_id: Uuid, text: String, byte_offset: u64, sequence: u32) -> Self {
51        // Per CP-003: Canonicalize text before hashing for determinism
52        let canonical_text = normalize(&text);
53        let text_hash = *blake3::hash(canonical_text.as_bytes()).as_bytes();
54
55        // Per CP-001: ID = hash(doc_id + sequence) - STABLE, does NOT include text
56        let id_bytes = crate::id::generate_composite_id(&[
57            doc_id.as_bytes(),
58            &sequence.to_le_bytes(),
59        ]);
60        let id = Uuid::from_bytes(id_bytes);
61
62        let byte_length = text.len() as u64;
63
64        Self {
65            id,
66            doc_id,
67            text: canonical_text, // Store canonicalized text
68            byte_offset,
69            byte_length,
70            sequence,
71            text_hash,
72        }
73    }
74
75    /// Create a chunk from already-canonicalized text (for internal use)
76    #[doc(hidden)]
77    pub fn from_canonical(doc_id: Uuid, text: String, byte_offset: u64, sequence: u32) -> Self {
78        let text_hash = *blake3::hash(text.as_bytes()).as_bytes();
79
80        // Per CP-001: ID = hash(doc_id + sequence) - STABLE
81        let id_bytes = crate::id::generate_composite_id(&[
82            doc_id.as_bytes(),
83            &sequence.to_le_bytes(),
84        ]);
85        let id = Uuid::from_bytes(id_bytes);
86
87        let byte_length = text.len() as u64;
88
89        Self {
90            id,
91            doc_id,
92            text,
93            byte_offset,
94            byte_length,
95            sequence,
96            text_hash,
97        }
98    }
99
100    /// Get the text hash as a hex string
101    pub fn text_hash_hex(&self) -> String {
102        self.text_hash
103            .iter()
104            .map(|b| format!("{:02x}", b))
105            .collect()
106    }
107
108    /// Approximate token count (rough estimate: 4 chars per token)
109    pub fn approx_tokens(&self) -> usize {
110        self.text.len() / 4
111    }
112}
113
114#[cfg(test)]
115mod tests {
116    use super::*;
117
118    #[test]
119    fn test_chunk_creation() {
120        let doc_id = Uuid::new_v4();
121        let chunk = Chunk::new(
122            doc_id,
123            "This is a test chunk.".to_string(),
124            0,
125            0,
126        );
127
128        assert_eq!(chunk.doc_id, doc_id);
129        assert_eq!(chunk.byte_offset, 0);
130        assert_eq!(chunk.sequence, 0);
131    }
132
133    #[test]
134    fn test_chunk_id_stable() {
135        let doc_id = Uuid::nil();
136        let text = "Test text";
137
138        // Same doc_id + sequence = same chunk ID regardless of text content
139        let chunk1 = Chunk::new(doc_id, text.to_string(), 0, 1);
140        let chunk2 = Chunk::new(doc_id, "Different text".to_string(), 0, 1);
141
142        assert_eq!(chunk1.id, chunk2.id);
143
144        // Different sequence = different ID
145        let chunk3 = Chunk::new(doc_id, text.to_string(), 0, 2);
146        assert_ne!(chunk1.id, chunk3.id);
147    }
148
149    #[test]
150    fn test_chunk_id_determinism() {
151        let doc_id = Uuid::nil();
152        let text = "Test text";
153        let seq = 1;
154
155        let chunk1 = Chunk::new(doc_id, text.to_string(), 0, seq);
156        let chunk2 = Chunk::new(doc_id, text.to_string(), 0, seq);
157
158        assert_eq!(chunk1.id, chunk2.id);
159    }
160
161    #[test]
162    fn test_text_canonicalized() {
163        let doc_id = Uuid::nil();
164
165        // Text with different whitespace should canonicalize to same
166        let chunk1 = Chunk::new(doc_id, "Hello   \nWorld".to_string(), 0, 0);
167        let chunk2 = Chunk::new(doc_id, "Hello\nWorld".to_string(), 0, 0);
168
169        // Text is canonicalized
170        assert_eq!(chunk1.text, "Hello\nWorld\n");
171        assert_eq!(chunk1.text, chunk2.text);
172
173        // But ID is still stable (not based on text)
174        assert_eq!(chunk1.id, chunk2.id);
175    }
176
177    #[test]
178    fn test_approx_tokens() {
179        let chunk = Chunk::new(
180            Uuid::new_v4(),
181            "A".repeat(400), // ~100 tokens
182            0,
183            0,
184        );
185
186        assert_eq!(chunk.approx_tokens(), 100);
187    }
188
189    #[test]
190    fn test_byte_offset() {
191        let doc_id = Uuid::new_v4();
192        let chunk = Chunk::new(doc_id, "Test".to_string(), 14, 1);
193
194        assert_eq!(chunk.byte_offset, 14);
195    }
196
197    // Additional tests for comprehensive coverage
198
199    #[test]
200    fn test_chunk_id_different_sequence_different_id() {
201        // Same content, different sequence = different ID
202        let doc_id = Uuid::nil();
203
204        let chunk1 = Chunk::new(doc_id, "Same text".to_string(), 0, 0);
205        let chunk2 = Chunk::new(doc_id, "Same text".to_string(), 0, 1);
206        let chunk3 = Chunk::new(doc_id, "Same text".to_string(), 0, 2);
207
208        // All should have different IDs
209        assert_ne!(chunk1.id, chunk2.id);
210        assert_ne!(chunk2.id, chunk3.id);
211        assert_ne!(chunk1.id, chunk3.id);
212    }
213
214    #[test]
215    fn test_chunk_text_hash_computation() {
216        // Test that text_hash is computed from canonicalized text
217        let doc_id = Uuid::nil();
218        let text = "Test text for hashing";
219
220        let chunk = Chunk::new(doc_id, text.to_string(), 0, 0);
221
222        // Compute expected hash: BLAKE3 of canonicalized text
223        let canonical = normalize(text);
224        let expected_hash = *blake3::hash(canonical.as_bytes()).as_bytes();
225
226        assert_eq!(chunk.text_hash, expected_hash);
227    }
228
229    #[test]
230    fn test_chunk_byte_offset_validation() {
231        // Test various byte offset values
232        let doc_id = Uuid::nil();
233
234        // Zero offset
235        let chunk0 = Chunk::new(doc_id, "test".to_string(), 0, 0);
236        assert_eq!(chunk0.byte_offset, 0);
237
238        // Large offset for large files
239        let chunk_large = Chunk::new(doc_id, "test".to_string(), 1_000_000, 0);
240        assert_eq!(chunk_large.byte_offset, 1_000_000);
241    }
242
243    #[test]
244    fn test_chunk_sequence_ordering() {
245        // Test sequence numbers are 0-indexed
246        let doc_id = Uuid::nil();
247
248        let chunk0 = Chunk::new(doc_id, "first".to_string(), 0, 0);
249        let chunk1 = Chunk::new(doc_id, "second".to_string(), 10, 1);
250        let chunk2 = Chunk::new(doc_id, "third".to_string(), 20, 2);
251
252        assert_eq!(chunk0.sequence, 0);
253        assert_eq!(chunk1.sequence, 1);
254        assert_eq!(chunk2.sequence, 2);
255    }
256
257    #[test]
258    fn test_chunk_canonical_bytes_format() {
259        // Verify serialization format includes all fields
260        let doc_id = Uuid::nil();
261        let chunk = Chunk::new(doc_id, "Test content".to_string(), 0, 0);
262
263        // Verify all required fields exist and are valid
264        assert_eq!(chunk.id.as_bytes().len(), 16);
265        assert_eq!(chunk.doc_id.as_bytes().len(), 16);
266        assert_eq!(chunk.text_hash.len(), 32);
267        assert!(chunk.sequence >= 0);
268    }
269
270    #[test]
271    fn test_chunk_overlap_semantics() {
272        // Test byte_offset and byte_length for overlap detection
273        let doc_id = Uuid::nil();
274        let text = "Hello World";
275
276        // First chunk
277        let chunk1 = Chunk::new(doc_id, text.to_string(), 0, 0);
278        // Second chunk starting at offset (simulating overlap)
279        let chunk2 = Chunk::new(doc_id, text.to_string(), 5, 1);
280
281        // Both should exist with different offsets
282        assert_eq!(chunk1.byte_offset, 0);
283        assert_eq!(chunk2.byte_offset, 5);
284
285        // byte_length should match text length
286        assert_eq!(chunk1.byte_length, text.len() as u64);
287    }
288
289    #[test]
290    fn test_chunk_text_validation_utf8() {
291        // Test that only valid UTF-8 is accepted
292        let doc_id = Uuid::nil();
293
294        // Valid UTF-8 strings
295        let valid_texts = vec![
296            "Hello, World!",
297            "Unicode: cafe with accent: cafe",
298            "Emoji: hello world",
299            "",
300            "Multiple\nlines\nhere",
301        ];
302
303        for text in valid_texts {
304            let chunk = Chunk::new(doc_id, text.to_string(), 0, 0);
305            assert!(chunk.text.chars().next().is_some() || chunk.text.is_empty());
306        }
307    }
308
309    #[test]
310    fn test_chunk_empty_text_rejected() {
311        // Test handling of empty chunk text
312        let doc_id = Uuid::nil();
313
314        // Empty text should still create a chunk (with empty text)
315        let chunk = Chunk::new(doc_id, "".to_string(), 0, 0);
316
317        // The chunk should exist but with empty text
318        assert_eq!(chunk.text, "");
319        assert_eq!(chunk.byte_length, 0);
320    }
321
322    #[test]
323    fn test_chunk_text_hash_hex() {
324        // Test text_hash_hex() method
325        let doc_id = Uuid::nil();
326        let chunk = Chunk::new(doc_id, "Test text".to_string(), 0, 0);
327
328        let hex = chunk.text_hash_hex();
329
330        // Should be 64 characters (32 bytes * 2 hex chars)
331        assert_eq!(hex.len(), 64);
332
333        // Should only contain hex characters
334        assert!(hex.chars().all(|c| c.is_ascii_hexdigit()));
335    }
336
337    #[test]
338    fn test_chunk_from_canonical() {
339        // Test from_canonical constructor
340        let doc_id = Uuid::nil();
341        let text = "Already canonicalized text".to_string();
342
343        let chunk = Chunk::from_canonical(doc_id, text.clone(), 100, 5);
344
345        assert_eq!(chunk.text, text);
346        assert_eq!(chunk.byte_offset, 100);
347        assert_eq!(chunk.sequence, 5);
348        assert_eq!(chunk.doc_id, doc_id);
349    }
350}
canon_core/chunk.rs

canon_core/
chunk.rs