1use crate::text::normalize;
7use serde::{Deserialize, Serialize};
8use uuid::Uuid;
9
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Chunk {
    /// Deterministic chunk id, derived from `(doc_id, sequence)` only —
    /// the text does NOT participate (see `Chunk::new`).
    pub id: Uuid,

    /// Id of the document this chunk was cut from.
    pub doc_id: Uuid,

    /// Canonicalized chunk text (output of `crate::text::normalize`).
    pub text: String,

    /// Byte offset of this chunk within the source document.
    pub byte_offset: u64,

    /// Length in bytes of the ORIGINAL (pre-normalization) text span when
    /// built via `Chunk::new`; of the canonical text via `from_canonical`.
    pub byte_length: u64,

    /// Zero-based position of this chunk within its document.
    pub sequence: u32,

    /// BLAKE3 hash of the canonical text bytes.
    pub text_hash: [u8; 32],
}
43
44impl Chunk {
45 pub fn new(doc_id: Uuid, text: String, byte_offset: u64, sequence: u32) -> Self {
51 let canonical_text = normalize(&text);
53 let text_hash = *blake3::hash(canonical_text.as_bytes()).as_bytes();
54
55 let id_bytes = crate::id::generate_composite_id(&[
57 doc_id.as_bytes(),
58 &sequence.to_le_bytes(),
59 ]);
60 let id = Uuid::from_bytes(id_bytes);
61
62 let byte_length = text.len() as u64;
63
64 Self {
65 id,
66 doc_id,
67 text: canonical_text, byte_offset,
69 byte_length,
70 sequence,
71 text_hash,
72 }
73 }
74
75 #[doc(hidden)]
77 pub fn from_canonical(doc_id: Uuid, text: String, byte_offset: u64, sequence: u32) -> Self {
78 let text_hash = *blake3::hash(text.as_bytes()).as_bytes();
79
80 let id_bytes = crate::id::generate_composite_id(&[
82 doc_id.as_bytes(),
83 &sequence.to_le_bytes(),
84 ]);
85 let id = Uuid::from_bytes(id_bytes);
86
87 let byte_length = text.len() as u64;
88
89 Self {
90 id,
91 doc_id,
92 text,
93 byte_offset,
94 byte_length,
95 sequence,
96 text_hash,
97 }
98 }
99
100 pub fn text_hash_hex(&self) -> String {
102 self.text_hash
103 .iter()
104 .map(|b| format!("{:02x}", b))
105 .collect()
106 }
107
108 pub fn approx_tokens(&self) -> usize {
110 self.text.len() / 4
111 }
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_chunk_creation() {
        let doc_id = Uuid::new_v4();
        let chunk = Chunk::new(
            doc_id,
            "This is a test chunk.".to_string(),
            0,
            0,
        );

        assert_eq!(chunk.doc_id, doc_id);
        assert_eq!(chunk.byte_offset, 0);
        assert_eq!(chunk.sequence, 0);
        // byte_length records the pre-normalization input length.
        assert_eq!(chunk.byte_length, "This is a test chunk.".len() as u64);
    }

    // The chunk id must depend only on (doc_id, sequence) — not the text.
    #[test]
    fn test_chunk_id_stable() {
        let doc_id = Uuid::nil();
        let text = "Test text";

        let chunk1 = Chunk::new(doc_id, text.to_string(), 0, 1);
        let chunk2 = Chunk::new(doc_id, "Different text".to_string(), 0, 1);

        assert_eq!(chunk1.id, chunk2.id);

        let chunk3 = Chunk::new(doc_id, text.to_string(), 0, 2);
        assert_ne!(chunk1.id, chunk3.id);
    }

    #[test]
    fn test_chunk_id_determinism() {
        let doc_id = Uuid::nil();
        let text = "Test text";
        let seq = 1;

        let chunk1 = Chunk::new(doc_id, text.to_string(), 0, seq);
        let chunk2 = Chunk::new(doc_id, text.to_string(), 0, seq);

        assert_eq!(chunk1.id, chunk2.id);
    }

    #[test]
    fn test_text_canonicalized() {
        let doc_id = Uuid::nil();

        let chunk1 = Chunk::new(doc_id, "Hello \nWorld".to_string(), 0, 0);
        let chunk2 = Chunk::new(doc_id, "Hello\nWorld".to_string(), 0, 0);

        assert_eq!(chunk1.text, "Hello\nWorld\n");
        assert_eq!(chunk1.text, chunk2.text);

        assert_eq!(chunk1.id, chunk2.id);
    }

    #[test]
    fn test_approx_tokens() {
        let chunk = Chunk::new(
            Uuid::new_v4(),
            "A".repeat(400),
            0,
            0,
        );

        assert_eq!(chunk.approx_tokens(), 100);
    }

    #[test]
    fn test_byte_offset() {
        let doc_id = Uuid::new_v4();
        let chunk = Chunk::new(doc_id, "Test".to_string(), 14, 1);

        assert_eq!(chunk.byte_offset, 14);
    }

    #[test]
    fn test_chunk_id_different_sequence_different_id() {
        let doc_id = Uuid::nil();

        let chunk1 = Chunk::new(doc_id, "Same text".to_string(), 0, 0);
        let chunk2 = Chunk::new(doc_id, "Same text".to_string(), 0, 1);
        let chunk3 = Chunk::new(doc_id, "Same text".to_string(), 0, 2);

        assert_ne!(chunk1.id, chunk2.id);
        assert_ne!(chunk2.id, chunk3.id);
        assert_ne!(chunk1.id, chunk3.id);
    }

    #[test]
    fn test_chunk_text_hash_computation() {
        let doc_id = Uuid::nil();
        let text = "Test text for hashing";

        let chunk = Chunk::new(doc_id, text.to_string(), 0, 0);

        // The hash must cover the CANONICAL text, not the raw input.
        let canonical = normalize(text);
        let expected_hash = *blake3::hash(canonical.as_bytes()).as_bytes();

        assert_eq!(chunk.text_hash, expected_hash);
    }

    #[test]
    fn test_chunk_byte_offset_validation() {
        let doc_id = Uuid::nil();

        let chunk0 = Chunk::new(doc_id, "test".to_string(), 0, 0);
        assert_eq!(chunk0.byte_offset, 0);

        let chunk_large = Chunk::new(doc_id, "test".to_string(), 1_000_000, 0);
        assert_eq!(chunk_large.byte_offset, 1_000_000);
    }

    #[test]
    fn test_chunk_sequence_ordering() {
        let doc_id = Uuid::nil();

        let chunk0 = Chunk::new(doc_id, "first".to_string(), 0, 0);
        let chunk1 = Chunk::new(doc_id, "second".to_string(), 10, 1);
        let chunk2 = Chunk::new(doc_id, "third".to_string(), 20, 2);

        assert_eq!(chunk0.sequence, 0);
        assert_eq!(chunk1.sequence, 1);
        assert_eq!(chunk2.sequence, 2);
    }

    #[test]
    fn test_chunk_canonical_bytes_format() {
        let doc_id = Uuid::nil();
        let chunk = Chunk::new(doc_id, "Test content".to_string(), 0, 0);

        assert_eq!(chunk.id.as_bytes().len(), 16);
        assert_eq!(chunk.doc_id.as_bytes().len(), 16);
        assert_eq!(chunk.text_hash.len(), 32);
        // NOTE: `sequence >= 0` was removed — it is always true for u32
        // and triggers the `unused_comparisons` warning.
    }

    #[test]
    fn test_chunk_overlap_semantics() {
        let doc_id = Uuid::nil();
        let text = "Hello World";

        let chunk1 = Chunk::new(doc_id, text.to_string(), 0, 0);
        let chunk2 = Chunk::new(doc_id, text.to_string(), 5, 1);

        assert_eq!(chunk1.byte_offset, 0);
        assert_eq!(chunk2.byte_offset, 5);

        // byte_length is the ORIGINAL input length, not the canonical length.
        assert_eq!(chunk1.byte_length, text.len() as u64);
    }

    #[test]
    fn test_chunk_text_validation_utf8() {
        let doc_id = Uuid::nil();

        let valid_texts = vec![
            "Hello, World!",
            "Unicode: cafe with accent: cafe",
            "Emoji: hello world",
            "",
            "Multiple\nlines\nhere",
        ];

        for text in valid_texts {
            let chunk = Chunk::new(doc_id, text.to_string(), 0, 0);
            // Smoke check only: construction succeeded and the stored
            // text is valid (possibly empty) UTF-8.
            assert!(chunk.text.chars().next().is_some() || chunk.text.is_empty());
        }
    }

    // Empty text is accepted and produces an empty canonical chunk;
    // rejection (if desired) would be a caller-side policy.
    #[test]
    fn test_chunk_empty_text_accepted() {
        let doc_id = Uuid::nil();

        let chunk = Chunk::new(doc_id, "".to_string(), 0, 0);

        assert_eq!(chunk.text, "");
        assert_eq!(chunk.byte_length, 0);
    }

    #[test]
    fn test_chunk_text_hash_hex() {
        let doc_id = Uuid::nil();
        let chunk = Chunk::new(doc_id, "Test text".to_string(), 0, 0);

        let hex = chunk.text_hash_hex();

        // 32 bytes -> 64 hex characters.
        assert_eq!(hex.len(), 64);

        assert!(hex.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn test_chunk_from_canonical() {
        let doc_id = Uuid::nil();
        let text = "Already canonicalized text".to_string();

        let chunk = Chunk::from_canonical(doc_id, text.clone(), 100, 5);

        // from_canonical stores the text verbatim — no normalization.
        assert_eq!(chunk.text, text);
        assert_eq!(chunk.byte_offset, 100);
        assert_eq!(chunk.sequence, 5);
        assert_eq!(chunk.doc_id, doc_id);
    }
}