Skip to main content

cp_parser/
chunker.rs

1//! Text chunking for embedding preparation
2
3use cp_core::{Chunk, Result};
4use uuid::Uuid;
5
/// Configuration for chunking.
///
/// Both values are counted in characters (`char`s), not bytes or tokens.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkConfig {
    /// Target chunk size in characters.
    ///
    /// The chunker usually cuts a little earlier than this, at a natural
    /// break (Markdown header, paragraph, sentence end or whitespace).
    pub chunk_size: usize,
    /// Overlap between consecutive chunks in characters.
    ///
    /// The overlap is only applied after a chunk that is longer than this
    /// value; after a shorter chunk the next one starts right where it ended.
    pub overlap: usize,
}
14
15impl Default for ChunkConfig {
16    fn default() -> Self {
17        Self {
18            chunk_size: 1000, // ~250 tokens
19            overlap: 200,    // ~50 tokens
20        }
21    }
22}
23
24/// Chunker for splitting text into overlapping segments
25pub struct Chunker {
26    config: ChunkConfig,
27}
28
29impl Default for Chunker {
30    fn default() -> Self {
31        Self::new(ChunkConfig::default())
32    }
33}
34
35impl Chunker {
36    /// Create a new chunker with the given config
37    pub fn new(config: ChunkConfig) -> Self {
38        Self { config }
39    }
40
41    /// Split text into overlapping chunks
42    pub fn chunk(&self, doc_id: Uuid, text: &str) -> Result<Vec<Chunk>> {
43        let mut chunks = Vec::new();
44        let chars: Vec<char> = text.chars().collect();
45        let total_len = chars.len();
46
47        if total_len == 0 {
48            return Ok(chunks);
49        }
50
51        let mut offset = 0usize;
52        let mut seq = 0u32;
53
54        while offset < total_len {
55            // Calculate chunk end
56            let end = (offset + self.config.chunk_size).min(total_len);
57
58            // Try to find a good break point (end of sentence/paragraph)
59            let chunk_end = self.find_break_point(&chars, offset, end, total_len);
60
61            // Extract chunk text
62            let chunk_text: String = chars[offset..chunk_end].iter().collect();
63            let chunk_text = chunk_text.trim().to_string();
64
65            if !chunk_text.is_empty() {
66                chunks.push(Chunk::new(
67                    doc_id,
68                    chunk_text,
69                    offset as u64,
70                    seq,
71                ));
72                seq += 1;
73            }
74
75            // Move offset with overlap
76            if chunk_end >= total_len {
77                break;
78            }
79
80            // Only overlap if the chunk is larger than the overlap amount.
81            // Otherwise, we'd be jumping backwards or staying still.
82            offset = if chunk_end > offset + self.config.overlap {
83                chunk_end - self.config.overlap
84            } else {
85                chunk_end
86            };
87        }
88
89        Ok(chunks)
90    }
91
92    /// Find a good break point near the target end
93    fn find_break_point(
94        &self,
95        chars: &[char],
96        start: usize,
97        target_end: usize,
98        total_len: usize,
99    ) -> usize {
100        if target_end >= total_len {
101            return total_len;
102        }
103
104        // Look for header start (Markdown) - highest priority
105        // We want to break BEFORE the header, so the header starts the next chunk
106        for i in (start..target_end).rev() {
107            if chars[i] == '\n' && i + 1 < total_len && chars[i + 1] == '#' {
108                return i + 1; // Include newline in previous chunk, header starts next
109            }
110        }
111
112        // Look for paragraph break first
113        for i in (start..target_end).rev() {
114            if chars[i] == '\n' && i + 1 < total_len && chars[i + 1] == '\n' {
115                return i + 2;
116            }
117        }
118
119        // Look for sentence end
120        for i in (start..target_end).rev() {
121            if (chars[i] == '.' || chars[i] == '!' || chars[i] == '?')
122                && i + 1 < total_len
123                && chars[i + 1].is_whitespace()
124            {
125                return i + 1;
126            }
127        }
128
129        // Look for any whitespace
130        for i in (start..target_end).rev() {
131            if chars[i].is_whitespace() {
132                return i + 1;
133            }
134        }
135
136        // Fall back to hard cut
137        target_end
138    }
139}
140
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a chunker with explicit size/overlap settings.
    fn chunker_with(chunk_size: usize, overlap: usize) -> Chunker {
        Chunker::new(ChunkConfig {
            chunk_size,
            overlap,
        })
    }

    /// Empty input yields no chunks at all.
    #[test]
    fn test_empty_text() {
        let produced = Chunker::default().chunk(Uuid::new_v4(), "").unwrap();
        assert!(produced.is_empty());
    }

    /// Text shorter than one chunk comes back as a single chunk.
    #[test]
    fn test_short_text() {
        let produced = Chunker::default()
            .chunk(Uuid::new_v4(), "Short text.")
            .unwrap();
        assert_eq!(produced.len(), 1);
        // NOTE(review): the chunker trims its text, so the trailing newline
        // expected here presumably comes from `Chunk::new` — confirm.
        assert_eq!(produced[0].text, "Short text.\n");
    }

    /// Text without any break points is hard-cut into several chunks whose
    /// sequence numbers count up from zero.
    #[test]
    fn test_long_text_chunking() {
        let input = "A".repeat(250);
        let produced = chunker_with(100, 20)
            .chunk(Uuid::new_v4(), &input)
            .unwrap();

        assert!(produced.len() > 1);

        // Sequence numbers must match the position in the result.
        for (expected_seq, chunk) in (0u32..).zip(&produced) {
            assert_eq!(chunk.sequence, expected_seq);
        }
    }

    /// A chunk size smaller than the text forces a split between sentences.
    #[test]
    fn test_sentence_boundary() {
        let input = "First sentence. Second sentence. Third sentence.";
        // chunk_size 20 is small enough to force a split.
        let produced = chunker_with(20, 5)
            .chunk(Uuid::new_v4(), input)
            .unwrap();

        // The text is longer than chunk_size, so more than one chunk results.
        assert!(produced.len() > 1);
    }
}