// synapse_core/processor.rs
/// Stateless text chunker that splits text on whitespace boundaries.
///
/// The type carries no data; it only serves as a receiver for the chunking
/// methods, so values are free to copy and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextProcessor;
3
4impl Default for TextProcessor {
5 fn default() -> Self {
6 Self::new()
7 }
8}
9
10impl TextProcessor {
11 pub fn new() -> Self {
12 Self
13 }
14
15 /// Split text into recursive chunks with overlap
16 pub fn chunk_text(&self, text: &str, max_chars: usize, overlap: usize) -> Vec<String> {
17 let mut chunks = Vec::new();
18 // Simple approach: Split by whitespace to preserve words
19 let words: Vec<&str> = text.split_inclusive(char::is_whitespace).collect();
20
21 let mut current_chunk = String::new();
22 let mut current_len = 0;
23 let mut current_words: Vec<&str> = Vec::new();
24
25 for word in words {
26 if current_len + word.len() > max_chars {
27 if !current_chunk.is_empty() {
28 chunks.push(current_chunk.trim().to_string());
29 }
30
31 // Handle overlap
32 let mut overlap_words = Vec::new();
33 let mut overlap_len = 0;
34
35 // Backtrack to capture overlap context
36 for w in current_words.iter().rev() {
37 if overlap_len + w.len() <= overlap {
38 overlap_words.push(*w);
39 overlap_len += w.len();
40 } else {
41 break;
42 }
43 }
44 overlap_words.reverse();
45
46 current_chunk = overlap_words.concat();
47 current_len = overlap_len;
48 current_words = overlap_words;
49 }
50
51 current_chunk.push_str(word);
52 current_len += word.len();
53 current_words.push(word);
54 }
55
56 if !current_chunk.is_empty() {
57 chunks.push(current_chunk.trim().to_string());
58 }
59
60 chunks
61 }
62}
63
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks word-preserving chunking with a small overlap on ASCII text.
    #[test]
    fn test_chunk_text_with_overlap() {
        let processor = TextProcessor::new();
        let text = "one two three four five six seven eight nine ten";

        // The text is split *after* each whitespace character, so the pieces
        // are "one ", "two ", "three ", "four ", ... (trailing space attached).
        //
        // With max_chars = 15 and overlap = 6:
        //   "one two three " is 14 chars; adding "four " (5) would exceed 15,
        //   so chunk 1 is "one two three" once trimmed.
        //   Walking backwards, "three " (6) fits the overlap budget while
        //   "two " (4 more) does not, so chunk 2 starts with "three ".
        //   The same rule repeats for every following chunk.
        let chunks = processor.chunk_text(text, 15, 6);

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0], "one two three");
        assert!(chunks[1].contains("three")); // overlap carried "three" forward
        assert_eq!(
            chunks,
            vec![
                "one two three",
                "three four",
                "four five six",
                "six seven",
                "seven eight",
                "eight nine ten",
            ]
        );
    }
}
110}