// synapse_core/processor.rs

/// Simple semantic chunker for text processing.
///
/// The type is a stateless unit struct: it carries no configuration, so every
/// instance behaves identically. Chunking limits are supplied per call to
/// [`TextProcessor::chunk_text`].
#[derive(Debug, Clone, Copy, Default)]
pub struct TextProcessor;
3
4impl TextProcessor {
5    pub fn new() -> Self {
6        Self
7    }
8
9    /// Split text into recursive chunks
10    /// Simple implementation: split by double newline (paragraphs), then by newline, then by length
11    pub fn chunk_text(&self, text: &str, max_chars: usize) -> Vec<String> {
12        let mut chunks = Vec::new();
13        
14        // 1. Split by paragraphs
15        let paragraphs: Vec<&str> = text.split("\n\n").collect();
16        
17        for p in paragraphs {
18            if p.len() <= max_chars {
19                if !p.trim().is_empty() {
20                    chunks.push(p.to_string());
21                }
22            } else {
23                // 2. Split by sentences (naive period check) or newlines
24                let lines: Vec<&str> = p.split('\n').collect();
25                let mut current_chunk = String::new();
26                
27                for line in lines {
28                    if current_chunk.len() + line.len() + 1 > max_chars {
29                        if !current_chunk.is_empty() {
30                            chunks.push(current_chunk.clone());
31                            current_chunk.clear();
32                        }
33                        // If line itself is too long, we truncate/force split (simplification)
34                        if line.len() > max_chars {
35                            chunks.push(line[..max_chars].to_string());
36                             // Drop remainder for simplicity in this MVP
37                        } else {
38                            current_chunk = line.to_string();
39                        }
40                    } else {
41                        if !current_chunk.is_empty() {
42                            current_chunk.push('\n');
43                        }
44                        current_chunk.push_str(line);
45                    }
46                }
47                if !current_chunk.is_empty() {
48                    chunks.push(current_chunk);
49                }
50            }
51        }
52        
53        chunks
54    }
55}