1use super::fingerprint::ChunkerConfig;
7
#[derive(Debug, Clone)]
pub struct SemanticChunker {
    // Target maximum chunk length, in bytes (chunks may be rounded down to a
    // char boundary, so they never exceed this).
    chunk_size: usize,
    // Number of bytes by which consecutive chunks overlap.
    chunk_overlap: usize,
    // Split-point candidates in priority order: earlier entries win even when
    // a later separator matches closer to the size limit.
    separators: Vec<String>,
}
21
22impl SemanticChunker {
23 pub fn new(chunk_size: usize, chunk_overlap: usize, separators: Vec<String>) -> Self {
25 Self { chunk_size, chunk_overlap, separators }
26 }
27
28 pub fn from_config(config: &ChunkerConfig) -> Self {
30 Self {
31 chunk_size: config.chunk_size,
32 chunk_overlap: config.chunk_overlap,
33 separators: Self::default_separators(),
34 }
35 }
36
37 fn default_separators() -> Vec<String> {
39 vec![
40 "\n## ".to_string(), "\n### ".to_string(), "\n#### ".to_string(), "\nfn ".to_string(), "\npub fn ".to_string(), "\nimpl ".to_string(), "\nstruct ".to_string(), "\nenum ".to_string(), "\nmod ".to_string(), "\n```".to_string(), "\n\n".to_string(), "\n".to_string(), " ".to_string(), ]
54 }
55
56 pub fn split(&self, text: &str) -> Vec<Chunk> {
58 let mut chunks = Vec::new();
59 let mut current_pos = 0;
60
61 while current_pos < text.len() {
62 let (chunk_text, end_pos) = self.extract_chunk(text, current_pos);
63
64 if !chunk_text.trim().is_empty() {
65 let start_line = text[..current_pos].matches('\n').count() + 1;
66 let end_line = start_line + chunk_text.matches('\n').count();
67
68 chunks.push(Chunk {
69 content: chunk_text.to_string(),
70 start_offset: current_pos,
71 end_offset: end_pos,
72 start_line,
73 end_line,
74 });
75 }
76
77 let advance = if end_pos - current_pos > self.chunk_overlap {
79 end_pos - current_pos - self.chunk_overlap
80 } else {
81 end_pos - current_pos
82 };
83
84 let new_pos = current_pos + advance.max(1);
86 current_pos = Self::find_next_char_boundary(text, new_pos);
87 }
88
89 chunks
90 }
91
92 fn extract_chunk(&self, text: &str, start: usize) -> (String, usize) {
94 let remaining = &text[start..];
95 let target_end = Self::find_char_boundary(text, (start + self.chunk_size).min(text.len()));
96
97 if start + remaining.len() <= target_end {
99 return (remaining.to_string(), text.len());
100 }
101
102 let search_region = &text[start..target_end];
104
105 for separator in &self.separators {
106 if let Some(pos) = search_region.rfind(separator.as_str()) {
107 if pos > 0 {
108 let end = start + pos + separator.len();
110 return (text[start..end].to_string(), end);
111 }
112 }
113 }
114
115 (text[start..target_end].to_string(), target_end)
117 }
118
119 fn find_char_boundary(text: &str, pos: usize) -> usize {
121 if pos >= text.len() {
122 return text.len();
123 }
124 let mut boundary = pos;
126 while boundary > 0 && !text.is_char_boundary(boundary) {
127 boundary -= 1;
128 }
129 boundary
130 }
131
132 fn find_next_char_boundary(text: &str, pos: usize) -> usize {
134 if pos >= text.len() {
135 return text.len();
136 }
137 let mut boundary = pos;
139 while boundary < text.len() && !text.is_char_boundary(boundary) {
140 boundary += 1;
141 }
142 boundary
143 }
144
145 pub fn config(&self) -> ChunkerConfig {
147 let sep_refs: Vec<&str> = self.separators.iter().map(|s| s.as_str()).collect();
148 ChunkerConfig::new(self.chunk_size, self.chunk_overlap, &sep_refs)
149 }
150}
151
152impl Default for SemanticChunker {
153 fn default() -> Self {
154 Self { chunk_size: 512, chunk_overlap: 64, separators: Self::default_separators() }
155 }
156}
157
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// The chunk's text.
    pub content: String,
    /// Byte offset of the chunk's first byte in the source text.
    pub start_offset: usize,
    /// Exclusive byte offset of the chunk's end in the source text.
    pub end_offset: usize,
    /// 1-based line on which the chunk starts.
    pub start_line: usize,
    /// 1-based line on which the chunk ends.
    pub end_line: usize,
}

impl Chunk {
    /// Deterministic, non-cryptographic 32-byte digest of `content`.
    ///
    /// Folds the bytes with 64-bit FNV-1a, then widens the 8-byte state into
    /// 32 bytes by writing `state + i` (little-endian) into word `i`. Only
    /// 64 bits of entropy — suitable for change detection, not security.
    pub fn content_hash(&self) -> [u8; 32] {
        // 64-bit FNV-1a: offset basis 0xcbf29ce484222325, prime 0x100000001b3.
        let mut state: u64 = 0xcbf2_9ce4_8422_2325;
        for &byte in self.content.as_bytes() {
            state = (state ^ u64::from(byte)).wrapping_mul(0x0100_0000_01b3);
        }

        let mut digest = [0u8; 32];
        for (word, out) in digest.chunks_exact_mut(8).enumerate() {
            out.copy_from_slice(&state.wrapping_add(word as u64).to_le_bytes());
        }
        digest
    }
}
190
#[cfg(test)]
mod tests {
    use super::*;

    // Default construction exposes the documented 512/64 parameters and a
    // non-empty separator list.
    #[test]
    fn test_chunker_default() {
        let chunker = SemanticChunker::default();
        assert_eq!(chunker.chunk_size, 512);
        assert_eq!(chunker.chunk_overlap, 64);
        assert!(!chunker.separators.is_empty());
    }

    // Text shorter than chunk_size comes back as one chunk with 1-based lines.
    #[test]
    fn test_split_short_text() {
        let chunker = SemanticChunker::default();
        let text = "Short text";
        let chunks = chunker.split(text);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Short text");
        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 1);
    }

    // With a "\n## " separator the document splits on section headers.
    // NOTE: this asserts >= 2 chunks even though the whole text fits in one
    // 100-byte window — it relies on the overlap stepping emitting a trailing
    // chunk, so that behavior must be preserved by `split`.
    #[test]
    fn test_split_markdown_headers() {
        let chunker = SemanticChunker::new(100, 10, vec!["\n## ".to_string()]);
        let text = "# Title\n\nIntro paragraph.\n\n## Section 1\n\nContent 1.\n\n## Section 2\n\nContent 2.";

        let chunks = chunker.split(text);

        assert!(chunks.len() >= 2);
    }

    // Rust-ish source split on function boundaries; every emitted chunk must
    // carry content (whitespace-only chunks are dropped by `split`).
    #[test]
    fn test_split_rust_code() {
        let chunker = SemanticChunker::new(200, 20, vec!["\nfn ".to_string(), "\n\n".to_string()]);
        let text = r#"
fn foo() {
    println!("foo");
}

fn bar() {
    println!("bar");
}

fn baz() {
    println!("baz");
}
"#;

        let chunks = chunker.split(text);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    // Chunks should stay near the configured size; the +20 slack allows for
    // separator inclusion at the split point.
    #[test]
    fn test_chunk_overlap() {
        let chunker = SemanticChunker::new(50, 10, vec![" ".to_string()]);
        let text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11 word12";

        let chunks = chunker.split(text);

        if chunks.len() > 1 {
            for chunk in &chunks {
                assert!(chunk.content.len() <= chunker.chunk_size + 20);
            }
        }
    }

    // Line tracking starts at 1 for the first chunk.
    #[test]
    fn test_chunk_line_tracking() {
        let chunker = SemanticChunker::new(50, 5, vec!["\n".to_string()]);
        let text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";

        let chunks = chunker.split(text);

        assert_eq!(chunks[0].start_line, 1);
    }

    // content_hash depends only on `content`, not on offsets or line numbers.
    #[test]
    fn test_chunk_content_hash_deterministic() {
        let chunk1 = Chunk {
            content: "test content".to_string(),
            start_offset: 0,
            end_offset: 12,
            start_line: 1,
            end_line: 1,
        };
        let chunk2 = Chunk {
            content: "test content".to_string(),
            start_offset: 100,
            end_offset: 112,
            start_line: 5,
            end_line: 5,
        };

        assert_eq!(chunk1.content_hash(), chunk2.content_hash());
    }

    // Different content must produce different hashes.
    #[test]
    fn test_chunk_content_hash_different() {
        let chunk1 = Chunk {
            content: "content 1".to_string(),
            start_offset: 0,
            end_offset: 9,
            start_line: 1,
            end_line: 1,
        };
        let chunk2 = Chunk {
            content: "content 2".to_string(),
            start_offset: 0,
            end_offset: 9,
            start_line: 1,
            end_line: 1,
        };

        assert_ne!(chunk1.content_hash(), chunk2.content_hash());
    }

    // Empty input yields no chunks.
    #[test]
    fn test_empty_text() {
        let chunker = SemanticChunker::default();
        let chunks = chunker.split("");
        assert!(chunks.is_empty());
    }

    // Whitespace-only input yields no chunks (trim-empty chunks are dropped).
    #[test]
    fn test_whitespace_only() {
        let chunker = SemanticChunker::default();
        let chunks = chunker.split(" \n\n \t ");
        assert!(chunks.is_empty());
    }

    // config() preserves size/overlap. Separators are not asserted here —
    // ChunkerConfig's representation of them is opaque to this module.
    #[test]
    fn test_config_round_trip() {
        let chunker = SemanticChunker::new(256, 32, vec!["\n".to_string()]);
        let config = chunker.config();

        assert_eq!(config.chunk_size, 256);
        assert_eq!(config.chunk_overlap, 32);
    }

    // A multi-section document produces several chunks, none of them blank.
    #[test]
    fn test_large_document_chunking() {
        let chunker = SemanticChunker::new(100, 20, SemanticChunker::default_separators());

        let mut text = String::new();
        for i in 0..50 {
            text.push_str(&format!(
                "\n## Section {}\n\nThis is paragraph {} with some content.\n",
                i, i
            ));
        }

        let chunks = chunker.split(&text);

        assert!(chunks.len() > 1);

        for chunk in &chunks {
            assert!(!chunk.content.trim().is_empty());
        }
    }

    // Property-based tests (proptest crate).
    mod proptests {
        use super::*;
        use proptest::prelude::*;

        fn chunk_size_strategy() -> impl Strategy<Value = usize> {
            32usize..=1024
        }

        // NOTE(review): currently unused — no property test draws the overlap
        // from this strategy; either wire it into prop_chunk_size_respected
        // or remove it.
        fn overlap_strategy(chunk_size: usize) -> impl Strategy<Value = usize> {
            0..=(chunk_size / 2)
        }

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(50))]

            // Every chunk's trimmed content must appear verbatim in the input.
            #[test]
            fn prop_chunking_preserves_content(text in ".{0,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);

                for chunk in &chunks {
                    prop_assert!(text.contains(chunk.content.trim()) || chunk.content.trim().is_empty());
                }
            }

            // Line numbers are 1-based and non-decreasing within a chunk.
            #[test]
            fn prop_chunk_lines_valid(text in ".{1,500}\n.{1,500}\n.{1,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);

                for chunk in &chunks {
                    prop_assert!(chunk.start_line <= chunk.end_line,
                        "start_line {} > end_line {}", chunk.start_line, chunk.end_line);
                    prop_assert!(chunk.start_line >= 1,
                        "start_line {} should be >= 1", chunk.start_line);
                }
            }

            // Byte offsets are ordered and stay inside the input.
            #[test]
            fn prop_chunk_offsets_valid(text in ".{10,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);

                for chunk in &chunks {
                    prop_assert!(chunk.start_offset <= chunk.end_offset,
                        "start_offset {} > end_offset {}", chunk.start_offset, chunk.end_offset);
                    prop_assert!(chunk.end_offset <= text.len(),
                        "end_offset {} > text.len() {}", chunk.end_offset, text.len());
                }
            }

            // Hashing the same chunk twice yields the same digest.
            #[test]
            fn prop_content_hash_deterministic(content in ".{1,100}") {
                let chunk = Chunk {
                    content: content.clone(),
                    start_offset: 0,
                    end_offset: content.len(),
                    start_line: 1,
                    end_line: 1,
                };

                let hash1 = chunk.content_hash();
                let hash2 = chunk.content_hash();
                prop_assert_eq!(hash1, hash2);
            }

            // Disjoint alphabets ([a-z] vs [A-Z]) guarantee distinct content,
            // so the digests must differ (the guard is belt-and-braces).
            #[test]
            fn prop_content_hash_different(
                content1 in "[a-z]{5,20}",
                content2 in "[A-Z]{5,20}"
            ) {
                if content1 != content2 {
                    let chunk1 = Chunk {
                        content: content1.clone(),
                        start_offset: 0,
                        end_offset: content1.len(),
                        start_line: 1,
                        end_line: 1,
                    };
                    let chunk2 = Chunk {
                        content: content2.clone(),
                        start_offset: 0,
                        end_offset: content2.len(),
                        start_line: 1,
                        end_line: 1,
                    };

                    prop_assert_ne!(chunk1.content_hash(), chunk2.content_hash());
                }
            }

            // Chunk length stays bounded relative to the configured size.
            // The 2x bound is generous — extract_chunk never exceeds one
            // chunk_size window — but keeps the property robust.
            #[test]
            fn prop_chunk_size_respected(
                chunk_size in chunk_size_strategy(),
                text_len in 100usize..2000
            ) {
                let overlap = chunk_size / 4;
                let chunker = SemanticChunker::new(chunk_size, overlap, vec![" ".to_string()]);

                // Synthetic input: a space every 10th byte, 'a' elsewhere.
                let text: String = (0..text_len).map(|i| if i % 10 == 0 { ' ' } else { 'a' }).collect();
                let chunks = chunker.split(&text);

                for chunk in &chunks {
                    prop_assert!(chunk.content.len() <= chunk_size * 2,
                        "chunk len {} > 2 * chunk_size {}", chunk.content.len(), chunk_size * 2);
                }
            }
        }
    }
}