1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
/// Stateless text chunker that splits on whitespace boundaries.
///
/// Words are never cut in half: the input is divided after each whitespace
/// character and whole pieces are packed greedily into chunks.
#[derive(Debug, Clone, Copy)]
pub struct TextProcessor;

impl Default for TextProcessor {
    fn default() -> Self {
        Self::new()
    }
}

impl TextProcessor {
    /// Creates a new processor. The type carries no state.
    pub fn new() -> Self {
        Self
    }

    /// Splits `text` into whitespace-trimmed chunks of at most `max_chars`
    /// characters, repeating up to `overlap` trailing characters of each chunk
    /// at the start of the next one.
    ///
    /// # Parameters
    /// - `text`: the input to split.
    /// - `max_chars`: chunk budget in Unicode scalar values (`char`s), not
    ///   bytes. The budget is checked before trimming, so the whitespace that
    ///   terminates a word counts towards it.
    /// - `overlap`: maximum number of characters (whole words only) carried
    ///   over from the end of one chunk into the next.
    ///
    /// # Returns
    /// The chunks in input order. Chunks that would consist only of
    /// whitespace are omitted, so empty or blank input yields an empty vector.
    ///
    /// # Notes
    /// - A single word longer than `max_chars` is emitted as its own oversized
    ///   chunk rather than being split.
    /// - The overlap is reduced when needed so that it still fits together
    ///   with the next word inside `max_chars`. This keeps chunks within the
    ///   budget and guarantees that each chunk advances through the input,
    ///   even when `overlap >= max_chars`.
    pub fn chunk_text(&self, text: &str, max_chars: usize, overlap: usize) -> Vec<String> {
        let mut chunks = Vec::new();
        // Pieces of the chunk under construction, each paired with its length
        // in chars so the length is computed only once per piece.
        let mut current: Vec<(&str, usize)> = Vec::new();
        let mut current_len = 0usize;

        // `split_inclusive` keeps the delimiting whitespace attached to the
        // preceding word, so concatenating pieces reproduces the input.
        for word in text.split_inclusive(char::is_whitespace) {
            let word_len = word.chars().count();

            if current_len + word_len > max_chars {
                Self::push_chunk(&mut chunks, &current);

                // Walk backwards over the finished chunk to find how many
                // trailing pieces fit in the overlap budget while still
                // leaving room for the incoming word.
                let mut keep_from = current.len();
                let mut overlap_len = 0usize;
                for &(_, piece_len) in current.iter().rev() {
                    let candidate = overlap_len + piece_len;
                    if candidate > overlap || candidate + word_len > max_chars {
                        break;
                    }
                    overlap_len = candidate;
                    keep_from -= 1;
                }

                // Drop everything before the overlap; the remainder seeds the
                // next chunk.
                current.drain(..keep_from);
                current_len = overlap_len;
            }

            current.push((word, word_len));
            current_len += word_len;
        }

        Self::push_chunk(&mut chunks, &current);
        chunks
    }

    /// Joins `pieces`, trims surrounding whitespace and appends the result to
    /// `chunks` unless nothing but whitespace remains.
    fn push_chunk(chunks: &mut Vec<String>, pieces: &[(&str, usize)]) {
        let joined: String = pieces.iter().map(|&(piece, _)| piece).collect();
        let trimmed = joined.trim();
        if !trimmed.is_empty() {
            chunks.push(trimmed.to_string());
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks chunk boundaries and the overlap carried between chunks.
    #[test]
    fn test_chunk_text_with_overlap() {
        let processor = TextProcessor::new();
        let text = "one two three four five six seven eight nine ten";

        // The chunker splits after each whitespace character, so the pieces
        // are "one ", "two ", "three ", "four ", ... (trailing space included).
        //
        // max_chars = 15: "one two three " is 14 long; adding "four " (5)
        // would overflow, so the first chunk is "one two three" after trimming.
        //
        // overlap = 6: walking backwards, "three " (6) fits the overlap budget
        // but adding "two " (4) would not, so the second chunk starts with
        // "three " and the same rule repeats for every later boundary.
        let chunks = processor.chunk_text(text, 15, 6);

        assert_eq!(
            chunks,
            [
                "one two three",
                "three four",
                "four five six",
                "six seven",
                "seven eight",
                "eight nine ten",
            ]
        );
    }

    /// Empty input has no words, so no chunks are produced.
    #[test]
    fn test_chunk_text_empty_input() {
        let processor = TextProcessor::new();
        assert!(processor.chunk_text("", 15, 6).is_empty());
    }
}