1use crate::error::Result;
2use crate::types::{Chunk, Message};
3use crate::utils::{HashUtils, TextProcessor, TokenCounter, TextPart, TextPartKind};
4use uuid::Uuid;
5
/// Tunables for splitting message text into chunks.
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Token budget a single chunk should not exceed.
    pub target_tokens: i32,
    /// Trailing context carried into the following chunk. The text splitter
    /// treats this as a byte count; the code splitter derives a line count
    /// from it (`overlap / 20`, capped at 3 lines).
    pub overlap: i32,
}
12
13impl Default for ChunkingConfig {
14 fn default() -> Self {
15 Self {
16 target_tokens: 320,
17 overlap: 64,
18 }
19 }
20}
21
/// Splits `Message` text into `Chunk`s, chunking prose and fenced code
/// blocks with separate strategies.
pub struct ChunkingService {
    // Size/overlap settings shared by all splitting paths.
    config: ChunkingConfig,
}
26
27impl ChunkingService {
28 pub fn new(config: ChunkingConfig) -> Self {
30 Self { config }
31 }
32
33 pub fn chunk_message(&self, message: &Message) -> Result<Vec<Chunk>> {
35 let normalized_text = TextProcessor::normalize_text(&message.text);
37
38 let parts = TextProcessor::extract_code_fences(&normalized_text);
40
41 let mut chunks = Vec::new();
43 for part in parts {
44 let part_chunks = self.create_chunks_from_part(
45 &message.id,
46 &message.session_id,
47 &part,
48 )?;
49 chunks.extend(part_chunks);
50 }
51
52 Ok(chunks)
53 }
54
55 fn create_chunks_from_part(
57 &self,
58 message_id: &Uuid,
59 session_id: &str,
60 part: &TextPart,
61 ) -> Result<Vec<Chunk>> {
62 let tokens = TokenCounter::count_tokens(&part.content);
63 let mut chunks = Vec::new();
64
65 if tokens <= self.config.target_tokens {
66 let chunk_id = HashUtils::short_hash(&format!("{}-{}-{}", message_id, part.start, part.end));
68
69 chunks.push(Chunk {
70 id: chunk_id,
71 message_id: *message_id,
72 session_id: session_id.to_string(),
73 offset_start: part.start,
74 offset_end: part.end,
75 kind: match part.kind {
76 TextPartKind::Text => "text".to_string(),
77 TextPartKind::Code => "code".to_string(),
78 },
79 text: part.content.clone(),
80 tokens,
81 });
82 } else {
83 match part.kind {
85 TextPartKind::Text => {
86 chunks.extend(self.split_text_part(message_id, session_id, part)?);
87 }
88 TextPartKind::Code => {
89 chunks.extend(self.split_code_part(message_id, session_id, part)?);
90 }
91 }
92 }
93
94 Ok(chunks)
95 }
96
97 fn split_text_part(
99 &self,
100 message_id: &Uuid,
101 session_id: &str,
102 part: &TextPart,
103 ) -> Result<Vec<Chunk>> {
104 let sentences = TextProcessor::split_sentences(&part.content);
105 let mut chunks = Vec::new();
106 let mut current_chunk = String::new();
107 let mut current_start = part.start;
108 let mut current_tokens = 0;
109
110 for sentence in sentences {
111 let sentence_tokens = TokenCounter::count_tokens(&sentence);
112
113 if current_tokens + sentence_tokens > self.config.target_tokens && !current_chunk.is_empty() {
114 let chunk_end = current_start + current_chunk.len();
116 let chunk_id = HashUtils::short_hash(&format!("{}-{}-{}", message_id, current_start, chunk_end));
117
118 chunks.push(Chunk {
119 id: chunk_id,
120 message_id: *message_id,
121 session_id: session_id.to_string(),
122 offset_start: current_start,
123 offset_end: chunk_end,
124 kind: "text".to_string(),
125 text: current_chunk.trim().to_string(),
126 tokens: current_tokens,
127 });
128
129 let overlap_text = if current_chunk.len() > self.config.overlap as usize {
131 current_chunk[current_chunk.len() - self.config.overlap as usize..].to_string()
132 } else {
133 current_chunk.clone()
134 };
135
136 current_chunk = format!("{} {}", overlap_text, sentence);
137 current_start = chunk_end - overlap_text.len();
138 current_tokens = TokenCounter::count_tokens(¤t_chunk);
139 } else {
140 if !current_chunk.is_empty() {
141 current_chunk.push(' ');
142 }
143 current_chunk.push_str(&sentence);
144 current_tokens += sentence_tokens;
145 }
146 }
147
148 if !current_chunk.trim().is_empty() {
150 let chunk_end = current_start + current_chunk.len();
151 let chunk_id = HashUtils::short_hash(&format!("{}-{}-{}", message_id, current_start, chunk_end));
152
153 chunks.push(Chunk {
154 id: chunk_id,
155 message_id: *message_id,
156 session_id: session_id.to_string(),
157 offset_start: current_start,
158 offset_end: chunk_end,
159 kind: "text".to_string(),
160 text: current_chunk.trim().to_string(),
161 tokens: current_tokens,
162 });
163 }
164
165 Ok(chunks)
166 }
167
168 fn split_code_part(
170 &self,
171 message_id: &Uuid,
172 session_id: &str,
173 part: &TextPart,
174 ) -> Result<Vec<Chunk>> {
175 let lines: Vec<&str> = part.content.split('\n').collect();
176 let mut chunks = Vec::new();
177 let mut current_chunk = String::new();
178 let mut current_start = part.start;
179 let mut current_tokens = 0;
180 let mut line_offset = 0;
181
182 for (i, line) in lines.iter().enumerate() {
183 let line_with_newline = if i < lines.len() - 1 {
184 format!("{}\n", line)
185 } else {
186 line.to_string()
187 };
188 let line_tokens = TokenCounter::count_tokens(&line_with_newline);
189
190 if current_tokens + line_tokens > self.config.target_tokens && !current_chunk.is_empty() {
191 let chunk_end = current_start + current_chunk.len();
193 let chunk_id = HashUtils::short_hash(&format!("{}-{}-{}", message_id, current_start, chunk_end));
194
195 chunks.push(Chunk {
196 id: chunk_id,
197 message_id: *message_id,
198 session_id: session_id.to_string(),
199 offset_start: current_start,
200 offset_end: chunk_end,
201 kind: "code".to_string(),
202 text: current_chunk.clone(),
203 tokens: current_tokens,
204 });
205
206 let overlap_lines = std::cmp::min(3, self.config.overlap / 20);
208 let start_idx = std::cmp::max(0, i as i32 - overlap_lines) as usize;
209 let overlap_text = lines[start_idx..i].join("\n");
210
211 let line_len = line_with_newline.len(); current_chunk = if overlap_text.is_empty() {
214 line_with_newline
215 } else {
216 format!("{}\n{}", overlap_text, line_with_newline)
217 };
218
219 current_start = part.start + line_offset - overlap_text.len();
220 current_tokens = TokenCounter::count_tokens(¤t_chunk);
221 line_offset += line_len;
222 } else {
223 line_offset += line_with_newline.len();
224 current_chunk.push_str(&line_with_newline);
225 current_tokens += line_tokens;
226 }
227 }
228
229 if !current_chunk.trim().is_empty() {
231 let chunk_end = current_start + current_chunk.len();
232 let chunk_id = HashUtils::short_hash(&format!("{}-{}-{}", message_id, current_start, chunk_end));
233
234 chunks.push(Chunk {
235 id: chunk_id,
236 message_id: *message_id,
237 session_id: session_id.to_string(),
238 offset_start: current_start,
239 offset_end: chunk_end,
240 kind: "code".to_string(),
241 text: current_chunk,
242 tokens: current_tokens,
243 });
244 }
245
246 Ok(chunks)
247 }
248}
249
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Builds a minimal user-role `Message` around `text` for chunking tests.
    fn create_test_message(text: &str) -> Message {
        Message {
            id: Uuid::new_v4(),
            session_id: "test-session".to_string(),
            turn: 1,
            role: "user".to_string(),
            text: text.to_string(),
            ts: Utc::now(),
            meta: None,
        }
    }

    #[test]
    fn test_simple_chunking() {
        let config = ChunkingConfig::default();
        let service = ChunkingService::new(config);

        let message = create_test_message("This is a simple test message.");
        let chunks = service.chunk_message(&message).unwrap();

        // A short message fits in one verbatim text chunk.
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, "text");
        assert_eq!(chunks[0].text, "This is a simple test message.");
    }

    #[test]
    fn test_code_fence_detection() {
        let config = ChunkingConfig::default();
        let service = ChunkingService::new(config);

        let message = create_test_message("Here's some code:\n```rust\nfn main() {\n println!(\"Hello\");\n}\n```\nThat was the code.");
        let chunks = service.chunk_message(&message).unwrap();

        // Expect text / code / text around the fenced block.
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].kind, "text");
        assert_eq!(chunks[1].kind, "code");
        assert_eq!(chunks[2].kind, "text");
    }

    #[test]
    fn test_long_text_splitting() {
        let config = ChunkingConfig {
            target_tokens: 10,
            overlap: 2,
        };
        let service = ChunkingService::new(config);

        let long_text = "This is the first sentence. This is the second sentence. This is the third sentence. This is the fourth sentence.";
        let message = create_test_message(long_text);
        let chunks = service.chunk_message(&message).unwrap();

        // A tiny token budget forces the text to split into several chunks.
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| c.kind == "text"));
    }

    #[test]
    fn test_token_counting_accuracy() {
        let short_text = "hello";
        let medium_text = "hello world";
        let long_text = "This is a longer text with multiple words and punctuation!";

        assert_eq!(TokenCounter::count_tokens(short_text), 1);
        assert!(TokenCounter::count_tokens(medium_text) >= 2);
        assert!(TokenCounter::count_tokens(long_text) > TokenCounter::count_tokens(medium_text));
    }

    #[test]
    fn test_chunking_configuration() {
        let small_config = ChunkingConfig {
            target_tokens: 5,
            overlap: 1,
        };

        let large_config = ChunkingConfig {
            target_tokens: 100,
            overlap: 10,
        };

        let small_service = ChunkingService::new(small_config);
        let large_service = ChunkingService::new(large_config);

        let text = "This is a test message with several words that should be chunked differently based on configuration.";
        let message = create_test_message(text);

        let small_chunks = small_service.chunk_message(&message).unwrap();
        let large_chunks = large_service.chunk_message(&message).unwrap();

        // A smaller budget can only produce the same number of chunks or more.
        assert!(small_chunks.len() >= large_chunks.len());

        for chunk in &small_chunks {
            assert!(!chunk.id.is_empty());
            assert_eq!(chunk.message_id, message.id);
            assert_eq!(chunk.session_id, message.session_id);
            assert!(chunk.tokens > 0);
        }
    }

    #[test]
    fn test_chunking_overlap_behavior() {
        let config = ChunkingConfig {
            target_tokens: 10,
            overlap: 3,
        };
        let service = ChunkingService::new(config);

        let text = "First sentence here. Second sentence here. Third sentence here. Fourth sentence here.";
        let message = create_test_message(text);
        let chunks = service.chunk_message(&message).unwrap();

        if chunks.len() > 1 {
            for i in 1..chunks.len() {
                assert!(chunks[i].offset_start < chunks[i].offset_end);
                // Consecutive chunks should share an overlapping byte range.
                assert!(chunks[i - 1].offset_end > chunks[i].offset_start);
            }
        }
    }

    #[test]
    fn test_chunking_edge_cases() {
        let service = ChunkingService::new(ChunkingConfig::default());

        let empty_message = create_test_message("");
        let empty_chunks = service.chunk_message(&empty_message).unwrap();
        assert_eq!(empty_chunks.len(), 1);

        let whitespace_message = create_test_message(" \n\t ");
        let whitespace_chunks = service.chunk_message(&whitespace_message).unwrap();
        assert_eq!(whitespace_chunks.len(), 1);

        let single_word_message = create_test_message("hello");
        let single_word_chunks = service.chunk_message(&single_word_message).unwrap();
        assert_eq!(single_word_chunks.len(), 1);
        assert_eq!(single_word_chunks[0].text, "hello");

        // A single unbreakable 1000-char word must still produce output.
        let long_word = "a".repeat(1000);
        let long_word_message = create_test_message(&long_word);
        let long_word_chunks = service.chunk_message(&long_word_message).unwrap();
        assert!(!long_word_chunks.is_empty());
        assert!(long_word_chunks[0].text.len() <= 1000);
    }

    #[test]
    fn test_mixed_content_chunking() {
        let service = ChunkingService::new(ChunkingConfig::default());

        let mixed_content = r#"
This is regular text content.

```python
def hello_world():
    print("Hello, World!")
    return "success"
```

And this is more text after the code block.

```javascript
function greet(name) {
    return `Hello, ${name}!`;
}
```

Final text content here.
    "#;

        let message = create_test_message(mixed_content);
        let chunks = service.chunk_message(&message).unwrap();

        assert!(!chunks.is_empty());

        let kinds: Vec<String> = chunks.iter().map(|c| c.kind.clone()).collect();
        let unique_kinds: std::collections::HashSet<String> = kinds.into_iter().collect();

        assert!(unique_kinds.contains("text"));

        for chunk in &chunks {
            assert!(chunk.offset_start < chunk.offset_end);
            assert!(chunk.offset_end <= mixed_content.len());
        }
    }

    #[test]
    fn test_token_counter_edge_cases() {
        assert_eq!(TokenCounter::count_tokens(""), 0);

        // Whitespace-only input counts as zero tokens.
        assert_eq!(TokenCounter::count_tokens(" "), 0);
        assert_eq!(TokenCounter::count_tokens("\n\t"), 0);

        assert!(TokenCounter::count_tokens("!!!") > 0);
        assert!(TokenCounter::count_tokens("...") > 0);

        assert_eq!(TokenCounter::count_tokens("123"), 1);
        assert_eq!(TokenCounter::count_tokens("123 456"), 3);
        assert_eq!(TokenCounter::count_tokens("abc123"), 1);
        assert_eq!(TokenCounter::count_tokens("test123 demo456"), 3);

        assert!(TokenCounter::count_tokens("@#$%") > 0);
        assert!(TokenCounter::count_tokens("email@domain.com") > 0);

        // Counting is deterministic and handles non-ASCII input.
        assert_eq!(TokenCounter::count_tokens("hello"), TokenCounter::count_tokens("hello"));
        assert!(TokenCounter::count_tokens("测试") > 0);
        assert!(TokenCounter::count_tokens("🌍🚀") > 0);
    }

    #[test]
    fn test_chunk_validation() {
        let service = ChunkingService::new(ChunkingConfig::default());
        let message = create_test_message("Test message with multiple sentences. Each should be properly chunked.");
        let chunks = service.chunk_message(&message).unwrap();

        for chunk in &chunks {
            assert!(!chunk.id.is_empty());
            assert_eq!(chunk.message_id, message.id);
            assert_eq!(chunk.session_id, message.session_id);
            assert!(!chunk.text.is_empty());
            assert!(chunk.tokens > 0);
            assert!(chunk.offset_start < chunk.offset_end);

            // Offsets must map back into the original message text.
            let expected_text = message.text[chunk.offset_start..chunk.offset_end].trim();
            assert!(!expected_text.is_empty());
        }
    }

    #[test]
    fn test_chunking_service_consistency() {
        let service = ChunkingService::new(ChunkingConfig::default());
        let text = "Consistent test message for chunking.";
        let message = create_test_message(text);

        let chunks1 = service.chunk_message(&message).unwrap();
        let chunks2 = service.chunk_message(&message).unwrap();

        // Chunking the same message twice must be fully deterministic.
        assert_eq!(chunks1.len(), chunks2.len());

        for (c1, c2) in chunks1.iter().zip(chunks2.iter()) {
            assert_eq!(c1.text, c2.text);
            assert_eq!(c1.kind, c2.kind);
            assert_eq!(c1.offset_start, c2.offset_start);
            assert_eq!(c1.offset_end, c2.offset_end);
            assert_eq!(c1.tokens, c2.tokens);
        }
    }

    #[test]
    fn test_chunking_config_clone_and_debug() {
        let config = ChunkingConfig {
            target_tokens: 50,
            overlap: 5,
        };

        let cloned_config = config.clone();
        assert_eq!(config.target_tokens, cloned_config.target_tokens);
        assert_eq!(config.overlap, cloned_config.overlap);

        let debug_str = format!("{:?}", config);
        assert!(debug_str.contains("ChunkingConfig"));
        assert!(debug_str.contains("target_tokens"));
        assert!(debug_str.contains("overlap"));
    }

    #[test]
    fn test_large_code_chunk_splitting() {
        let config = ChunkingConfig {
            target_tokens: 50,
            overlap: 10,
        };
        let service = ChunkingService::new(config);

        let large_code = r#"
```python
# This is a large code block that should be split into multiple chunks
def complex_function(param1, param2, param3):
    """
    This is a complex function with many lines
    that should exceed the token limit and force chunking
    """
    # First part of the function
    result = []
    for i in range(param1):
        if i % 2 == 0:
            result.append(i * param2)
        else:
            result.append(i + param3)

    # Second part of the function
    processed_result = []
    for item in result:
        if item > 100:
            processed_result.append(item / 2)
        elif item < 10:
            processed_result.append(item * 3)
        else:
            processed_result.append(item)

    # Third part of the function
    final_result = []
    for i, item in enumerate(processed_result):
        if i % 3 == 0:
            final_result.append(item + 1)
        elif i % 3 == 1:
            final_result.append(item - 1)
        else:
            final_result.append(item)

    return final_result

def another_function():
    return "This is another function"

class TestClass:
    def __init__(self):
        self.value = 42

    def method1(self):
        return self.value * 2

    def method2(self):
        return self.value / 2
```
    "#;

        let message = create_test_message(large_code);
        let chunks = service.chunk_message(&message).unwrap();

        assert!(chunks.len() > 1);

        let code_chunks: Vec<_> = chunks.iter().filter(|c| c.kind == "code").collect();
        assert!(!code_chunks.is_empty());

        // Chunks must be emitted in document order, each with real content.
        for window in chunks.windows(2) {
            let chunk1 = &window[0];
            let chunk2 = &window[1];

            assert!(chunk1.offset_end <= chunk2.offset_end);
            assert!(chunk1.tokens > 0);
            assert!(chunk2.tokens > 0);
        }
    }

    #[test]
    fn test_overlap_functionality_detailed() {
        let config = ChunkingConfig {
            target_tokens: 30,
            overlap: 15,
        };
        let service = ChunkingService::new(config);

        let text = "First sentence here. Second sentence follows. Third sentence continues. Fourth sentence extends. Fifth sentence concludes. Sixth sentence adds more. Seventh sentence finishes.";
        let message = create_test_message(text);
        let chunks = service.chunk_message(&message).unwrap();

        if chunks.len() > 1 {
            for i in 0..chunks.len() - 1 {
                let chunk1_text = &chunks[i].text;
                let chunk2_text = &chunks[i + 1].text;

                let chunk1_words: std::collections::HashSet<&str> = chunk1_text.split_whitespace().collect();
                let chunk2_words: std::collections::HashSet<&str> = chunk2_text.split_whitespace().collect();
                let _intersection: Vec<_> = chunk1_words.intersection(&chunk2_words).collect();

                assert!(!chunk1_text.is_empty());
                assert!(!chunk2_text.is_empty());
            }
        }
    }

    #[test]
    fn test_mixed_code_and_text_complex() {
        let service = ChunkingService::new(ChunkingConfig::default());

        let complex_content = r#"
This is introductory text before the code.

```javascript
function processData(data) {
    // Process the input data
    return data.map(item => {
        return {
            id: item.id,
            value: item.value * 2,
            processed: true
        };
    });
}

const config = {
    timeout: 5000,
    retries: 3,
    debug: true
};
```

Here is explanatory text between code blocks.

```python
import json
import time

def load_config(filename):
    with open(filename, 'r') as f:
        return json.load(f)

def process_file(input_file, output_file):
    data = load_config(input_file)
    processed = []

    for item in data:
        time.sleep(0.1) # Simulate processing
        processed.append({
            'original': item,
            'timestamp': time.time()
        })

    with open(output_file, 'w') as f:
        json.dump(processed, f, indent=2)
```

And this is concluding text after all the code.
    "#;

        let message = create_test_message(complex_content);
        let chunks = service.chunk_message(&message).unwrap();

        let text_chunks: Vec<_> = chunks.iter().filter(|c| c.kind == "text").collect();
        let code_chunks: Vec<_> = chunks.iter().filter(|c| c.kind == "code").collect();

        assert!(!text_chunks.is_empty());
        assert!(!code_chunks.is_empty());

        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
            assert!(chunk.offset_start < chunk.offset_end);
            assert!(chunk.offset_end <= complex_content.len());

            // The recorded offsets must recover the chunk's own text.
            let chunk_from_original = &complex_content[chunk.offset_start..chunk.offset_end];
            assert!(chunk_from_original.contains(chunk.text.trim()));
        }
    }

    #[test]
    fn test_very_small_target_tokens() {
        let config = ChunkingConfig {
            target_tokens: 5,
            overlap: 2,
        };
        let service = ChunkingService::new(config);

        let text = "One two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty twentyone twentytwo twentythree twentyfour twentyfive.";
        let message = create_test_message(text);
        let chunks = service.chunk_message(&message).unwrap();

        assert!(chunks.len() >= 1);

        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
            assert!(chunk.tokens > 0);
        }
    }

    #[test]
    fn test_zero_overlap_configuration() {
        let config = ChunkingConfig {
            target_tokens: 20,
            overlap: 0,
        };
        let service = ChunkingService::new(config);

        let text = "First chunk content here. Second chunk content follows. Third chunk content continues.";
        let message = create_test_message(text);
        let chunks = service.chunk_message(&message).unwrap();

        if chunks.len() > 1 {
            for i in 0..chunks.len() - 1 {
                let chunk1_end = chunks[i].offset_end;
                let chunk2_start = chunks[i + 1].offset_start;

                // With zero overlap, chunks must not share bytes.
                assert!(chunk2_start >= chunk1_end);
            }
        }
    }

    #[test]
    fn test_single_word_chunks() {
        let config = ChunkingConfig {
            target_tokens: 1,
            overlap: 0,
        };
        let service = ChunkingService::new(config);

        let text = "alpha beta gamma delta epsilon";
        let message = create_test_message(text);
        let chunks = service.chunk_message(&message).unwrap();

        assert!(chunks.len() >= 3);

        for chunk in &chunks {
            assert!(chunk.text.split_whitespace().count() <= 2);
            assert!(!chunk.text.is_empty());
        }
    }

    #[test]
    fn test_empty_code_blocks() {
        let service = ChunkingService::new(ChunkingConfig::default());

        let content_with_empty_code = r#"
Text before empty code block.

```python
# Just a comment, no actual code
```

Text after empty code block.

```javascript
// Another empty block
// Just comments
```

Final text.
    "#;

        let message = create_test_message(content_with_empty_code);
        let chunks = service.chunk_message(&message).unwrap();

        assert!(!chunks.is_empty());

        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
            assert!(chunk.tokens > 0);
            assert!(chunk.offset_start < chunk.offset_end);
        }
    }

    #[test]
    fn test_maximum_overlap_edge_case() {
        let config = ChunkingConfig {
            target_tokens: 20,
            overlap: 100,
        };
        let service = ChunkingService::new(config);

        let text = "This is a test of maximum overlap configuration which should still work properly.";
        let message = create_test_message(text);
        let chunks = service.chunk_message(&message).unwrap();

        // Overlap larger than the target budget must not break chunking.
        assert!(!chunks.is_empty());

        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
            assert!(chunk.tokens > 0);
            assert!(chunk.offset_start < chunk.offset_end);
        }
    }

    #[test]
    fn test_code_block_line_splitting() {
        let config = ChunkingConfig {
            target_tokens: 25,
            overlap: 3,
        };
        let service = ChunkingService::new(config);

        let code_content = r#"
```rust
fn main() {
    println!("Line 1");
    println!("Line 2");
    println!("Line 3");
    println!("Line 4");
    println!("Line 5");
    println!("Line 6");
    println!("Line 7");
    println!("Line 8");
    println!("Line 9");
    println!("Line 10");
    let x = 42;
    let y = x * 2;
    let z = y + 1;
    println!("Result: {}", z);
}
```
    "#;

        let message = create_test_message(code_content);
        let chunks = service.chunk_message(&message).unwrap();

        let code_chunks: Vec<_> = chunks.iter().filter(|c| c.kind == "code").collect();

        if code_chunks.len() > 1 {
            for chunk in &code_chunks {
                assert!(chunk.text.contains('\n') || !chunk.text.trim().is_empty());
                assert!(chunk.tokens > 0);
            }
        }
    }

    #[test]
    fn test_chunk_id_uniqueness() {
        let service = ChunkingService::new(ChunkingConfig::default());

        let text = "Unique test content for ID generation. Each chunk should have a unique identifier.";
        let message = create_test_message(text);
        let chunks = service.chunk_message(&message).unwrap();

        let ids: Vec<String> = chunks.iter().map(|c| c.id.clone()).collect();
        let unique_ids: std::collections::HashSet<String> = ids.iter().cloned().collect();

        // No two chunks of the same message may share an id.
        assert_eq!(ids.len(), unique_ids.len());

        for id in &ids {
            assert!(!id.is_empty());
        }
    }

    #[test]
    fn test_token_counting_complex_content() {
        assert!(TokenCounter::count_tokens("simple text") > 0);
        assert!(TokenCounter::count_tokens("function(param1, param2)") > 0);
        assert!(TokenCounter::count_tokens("multi-line\ncontent\nwith\nbreaks") > 0);
        assert!(TokenCounter::count_tokens(" whitespace around ") > 0);
        assert!(TokenCounter::count_tokens("symbols!@#$%^&*()+={}[]|\\:;\"'<>?,./") > 0);

        // Counting the same content twice yields the same result.
        let content = "consistent content for testing";
        let count1 = TokenCounter::count_tokens(content);
        let count2 = TokenCounter::count_tokens(content);
        assert_eq!(count1, count2);

        let count_a = TokenCounter::count_tokens("short");
        let count_b = TokenCounter::count_tokens("much longer text content with many more words");
        assert!(count_b > count_a);
    }

    #[test]
    fn test_chunking_boundaries_accuracy() {
        let service = ChunkingService::new(ChunkingConfig::default());

        let original_text = "Boundary test. First sentence. Second sentence here. Third sentence follows. Final sentence.";
        let message = create_test_message(original_text);
        let chunks = service.chunk_message(&message).unwrap();

        for chunk in &chunks {
            assert!(chunk.offset_start < chunk.offset_end);
            assert!(chunk.offset_end <= original_text.len());

            let extracted = &original_text[chunk.offset_start..chunk.offset_end];

            // The recorded offsets must recover the chunk's own text.
            assert!(extracted.contains(chunk.text.trim()));

            assert!(chunk.offset_start < original_text.len());
            assert!(chunk.offset_end > chunk.offset_start);
        }
    }
}