// synapse_core/processor.rs

/// Simple word-based chunker for text processing.
///
/// The type carries no state; it only groups the chunking routines.
#[derive(Debug, Clone, Copy, Default)]
pub struct TextProcessor;

impl TextProcessor {
    /// Creates a new processor.
    pub fn new() -> Self {
        Self
    }

    /// Splits `text` into word-aligned chunks with trailing-context overlap.
    ///
    /// The text is tokenised with `split_inclusive(char::is_whitespace)`, so
    /// every token is a word together with the single whitespace character
    /// that follows it. Tokens are appended to the current chunk until adding
    /// the next one would push the chunk past `max_chars`; the chunk is then
    /// emitted and the next chunk is seeded with the trailing tokens of the
    /// previous one.
    ///
    /// # Parameters
    /// - `text`: the input to split.
    /// - `max_chars`: chunk size limit in Unicode scalar values (`char`s),
    ///   not bytes. The whitespace attached to each token counts toward the
    ///   limit even though emitted chunks are trimmed.
    /// - `overlap`: maximum number of characters carried over from the end of
    ///   one chunk to the start of the next. Only whole tokens are carried.
    ///   The carry-over is additionally capped so that it still fits next to
    ///   the incoming token within `max_chars`.
    ///
    /// # Returns
    /// The chunks in input order, each trimmed of surrounding whitespace.
    /// Chunks that would be empty after trimming are skipped. Words are never
    /// split, so a single token longer than `max_chars` is emitted as its own
    /// oversized chunk.
    pub fn chunk_text(&self, text: &str, max_chars: usize, overlap: usize) -> Vec<String> {
        let mut chunks = Vec::new();
        // Tokens of the chunk under construction, paired with their length in
        // chars so the backtracking pass does not have to recount them.
        let mut current_words: Vec<(&str, usize)> = Vec::new();
        let mut current_len = 0usize;

        for word in text.split_inclusive(char::is_whitespace) {
            // Count chars rather than bytes: the limit is expressed in chars.
            let word_len = word.chars().count();

            if current_len + word_len > max_chars {
                Self::push_chunk(&mut chunks, &current_words);

                // Never carry over more than still fits beside the incoming
                // word. Without this cap a large `overlap` re-seeds each chunk
                // with (nearly) the whole previous one and chunks keep growing.
                let budget = overlap.min(max_chars.saturating_sub(word_len));

                // Walk backwards to find the longest run of trailing tokens
                // that fits in the overlap budget.
                let mut overlap_len = 0;
                let mut keep_from = current_words.len();
                for (idx, &(_, len)) in current_words.iter().enumerate().rev() {
                    if overlap_len + len > budget {
                        break;
                    }
                    overlap_len += len;
                    keep_from = idx;
                }

                current_words = current_words.split_off(keep_from);
                current_len = overlap_len;
            }

            current_words.push((word, word_len));
            current_len += word_len;
        }

        Self::push_chunk(&mut chunks, &current_words);
        chunks
    }

    /// Joins `words`, trims the result, and appends it to `chunks` unless it
    /// is empty.
    fn push_chunk(chunks: &mut Vec<String>, words: &[(&str, usize)]) {
        let joined: String = words.iter().map(|&(w, _)| w).collect();
        let trimmed = joined.trim();
        if !trimmed.is_empty() {
            chunks.push(trimmed.to_string());
        }
    }
}
63
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks chunk boundaries and the overlap carried between chunks.
    #[test]
    fn test_chunk_text_with_overlap() {
        let processor = TextProcessor::new();
        let text = "one two three four five six seven eight nine ten";

        // `chunk_text` tokenises with `split_inclusive`, so each token keeps
        // its trailing space: "one ", "two ", "three ", "four ", ...
        //
        // With max_chars = 15 and overlap = 6:
        //   "one two three " is 14 chars; adding "four " (5) gives 19 > 15,
        //   so chunk 1 is "one two three" after trimming.
        //   Backtracking for overlap: "three " (6) fits in 6; adding "two "
        //   would make 10 > 6, so only "three " is carried over.
        //   "three four " is 11 chars; adding "five " (5) gives 16 > 15,
        //   so chunk 2 is "three four", and "four " (5) is carried over.
        // The same pattern continues to the end of the text.
        let chunks = processor.chunk_text(text, 15, 6);

        assert_eq!(
            chunks,
            vec![
                "one two three",
                "three four",
                "four five six",
                "six seven",
                "seven eight",
                "eight nine ten",
            ]
        );
    }
}