1use cp_core::{Chunk, Result};
4use uuid::Uuid;
5
/// Tuning parameters for [`Chunker`].
///
/// Both values are counted in `char`s (Unicode scalar values), not bytes,
/// because the chunker works on a `Vec<char>` view of the input text.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Size of the window, in `char`s, that is examined for each chunk.
    /// The chunk ends at the best break point found inside that window,
    /// or at the window's end when no break point exists.
    pub chunk_size: usize,
    /// Number of `char`s the next chunk steps back into the previous one.
    /// The step back is applied only when the previous chunk is longer
    /// than `overlap`; otherwise the next chunk starts where the previous
    /// one ended.
    pub overlap: usize,
}
14
15impl Default for ChunkConfig {
16 fn default() -> Self {
17 Self {
18 chunk_size: 1000, overlap: 200, }
21 }
22}
23
/// Splits text into a sequence of chunks, preferring to cut at natural
/// boundaries (headings, paragraph breaks, sentence ends, whitespace).
pub struct Chunker {
    /// Window size and overlap used by every call to `chunk`.
    config: ChunkConfig,
}
28
29impl Default for Chunker {
30 fn default() -> Self {
31 Self::new(ChunkConfig::default())
32 }
33}
34
35impl Chunker {
36 pub fn new(config: ChunkConfig) -> Self {
38 Self { config }
39 }
40
41 pub fn chunk(&self, doc_id: Uuid, text: &str) -> Result<Vec<Chunk>> {
43 let mut chunks = Vec::new();
44 let chars: Vec<char> = text.chars().collect();
45 let total_len = chars.len();
46
47 if total_len == 0 {
48 return Ok(chunks);
49 }
50
51 let mut offset = 0usize;
52 let mut seq = 0u32;
53
54 while offset < total_len {
55 let end = (offset + self.config.chunk_size).min(total_len);
57
58 let chunk_end = self.find_break_point(&chars, offset, end, total_len);
60
61 let chunk_text: String = chars[offset..chunk_end].iter().collect();
63 let chunk_text = chunk_text.trim().to_string();
64
65 if !chunk_text.is_empty() {
66 chunks.push(Chunk::new(
67 doc_id,
68 chunk_text,
69 offset as u64,
70 seq,
71 ));
72 seq += 1;
73 }
74
75 if chunk_end >= total_len {
77 break;
78 }
79
80 offset = if chunk_end > offset + self.config.overlap {
83 chunk_end - self.config.overlap
84 } else {
85 chunk_end
86 };
87 }
88
89 Ok(chunks)
90 }
91
92 fn find_break_point(
94 &self,
95 chars: &[char],
96 start: usize,
97 target_end: usize,
98 total_len: usize,
99 ) -> usize {
100 if target_end >= total_len {
101 return total_len;
102 }
103
104 for i in (start..target_end).rev() {
107 if chars[i] == '\n' && i + 1 < total_len && chars[i + 1] == '#' {
108 return i + 1; }
110 }
111
112 for i in (start..target_end).rev() {
114 if chars[i] == '\n' && i + 1 < total_len && chars[i + 1] == '\n' {
115 return i + 2;
116 }
117 }
118
119 for i in (start..target_end).rev() {
121 if (chars[i] == '.' || chars[i] == '!' || chars[i] == '?')
122 && i + 1 < total_len
123 && chars[i + 1].is_whitespace()
124 {
125 return i + 1;
126 }
127 }
128
129 for i in (start..target_end).rev() {
131 if chars[i].is_whitespace() {
132 return i + 1;
133 }
134 }
135
136 target_end
138 }
139}
140
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input produces no chunks at all.
    #[test]
    fn test_empty_text() {
        let result = Chunker::default().chunk(Uuid::new_v4(), "").unwrap();
        assert!(result.is_empty());
    }

    /// Text shorter than the window comes back as a single chunk.
    #[test]
    fn test_short_text() {
        let doc_id = Uuid::new_v4();
        let result = Chunker::default().chunk(doc_id, "Short text.").unwrap();

        assert_eq!(result.len(), 1);
        // NOTE(review): the chunker trims the text before handing it to
        // `Chunk::new`, so the trailing "\n" expected here presumably comes
        // from `Chunk::new` itself (not visible in this file) — confirm.
        assert_eq!(result[0].text, "Short text.\n");
    }

    /// Input longer than the window is split, and sequence numbers count
    /// up from zero without gaps.
    #[test]
    fn test_long_text_chunking() {
        let config = ChunkConfig {
            chunk_size: 100,
            overlap: 20,
        };
        let input = "A".repeat(250);

        let result = Chunker::new(config).chunk(Uuid::new_v4(), &input).unwrap();

        assert!(result.len() > 1);
        for (expected_seq, chunk) in (0u32..).zip(result.iter()) {
            assert_eq!(chunk.sequence, expected_seq);
        }
    }

    /// Sentence-separated input with a small window yields several chunks.
    #[test]
    fn test_sentence_boundary() {
        let config = ChunkConfig {
            chunk_size: 20,
            overlap: 5,
        };
        let input = "First sentence. Second sentence. Third sentence.";

        let result = Chunker::new(config).chunk(Uuid::new_v4(), input).unwrap();

        assert!(result.len() > 1);
    }
}