Skip to main content

cp_parser/
chunker.rs

1//! Text chunking for embedding preparation
2
3use cp_core::{CPError, Chunk, Result};
4use text_splitter::{ChunkConfig as TSChunkConfig, MarkdownSplitter};
5use uuid::Uuid;
6
/// Configuration for chunking.
///
/// Both values are measured in characters. [`Chunker::chunk`] hands them to the
/// underlying text splitter and reports an error if the splitter rejects the
/// combination (text-splitter requires the overlap to be smaller than the
/// chunk size).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkConfig {
    /// Target chunk size in characters
    pub chunk_size: usize,
    /// Overlap between adjacent chunks in characters
    pub overlap: usize,
}
15
16impl Default for ChunkConfig {
17    fn default() -> Self {
18        Self {
19            chunk_size: 1000, // ~250 tokens
20            overlap: 200,     // ~50 tokens
21        }
22    }
23}
24
25/// Chunker for splitting text into overlapping segments
26pub struct Chunker {
27    config: ChunkConfig,
28}
29
30impl Default for Chunker {
31    fn default() -> Self {
32        Self::new(ChunkConfig::default())
33    }
34}
35
36impl Chunker {
37    /// Create a new chunker with the given config
38    pub fn new(config: ChunkConfig) -> Self {
39        Self { config }
40    }
41
42    /// Split text into overlapping chunks
43    pub fn chunk(&self, doc_id: Uuid, text: &str) -> Result<Vec<Chunk>> {
44        if text.is_empty() {
45            return Ok(Vec::new());
46        }
47
48        let ts_config = TSChunkConfig::new(self.config.chunk_size)
49            .with_overlap(self.config.overlap)
50            .map_err(|e| CPError::Parse(format!("Invalid chunk config: {e}")))?
51            .with_trim(true);
52        let splitter = MarkdownSplitter::new(ts_config);
53
54        let chunks: Vec<Chunk> = splitter
55            .chunk_indices(text)
56            .enumerate()
57            .map(|(seq, (byte_offset, chunk_text))| {
58                Chunk::new(doc_id, chunk_text, byte_offset as u64, seq as u32)
59            })
60            .collect();
61
62        Ok(chunks)
63    }
64}
65
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a chunker with the given chunk size and overlap (in characters).
    fn chunker_with(chunk_size: usize, overlap: usize) -> Chunker {
        Chunker::new(ChunkConfig {
            chunk_size,
            overlap,
        })
    }

    /// Assert that chunks are numbered 0, 1, 2, ... in output order.
    fn assert_sequential(chunks: &[Chunk]) {
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.sequence, i as u32);
        }
    }

    #[test]
    fn test_empty_text() {
        let chunker = Chunker::default();
        let chunks = chunker.chunk(Uuid::new_v4(), "").unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_short_text() {
        let chunker = Chunker::default();
        let chunks = chunker.chunk(Uuid::new_v4(), "Short text.").unwrap();
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn test_long_text_chunking() {
        let chunker = chunker_with(100, 20);

        let text = "A".repeat(250);
        let chunks = chunker.chunk(Uuid::new_v4(), &text).unwrap();

        assert!(chunks.len() > 1);
        assert_sequential(&chunks);
    }

    #[test]
    fn test_sentence_boundary() {
        let chunker = chunker_with(20, 5);

        let text = "First sentence. Second sentence. Third sentence.";
        let chunks = chunker.chunk(Uuid::new_v4(), text).unwrap();

        // Should produce multiple chunks (text is longer than chunk_size)
        assert!(chunks.len() > 1);
    }

    #[test]
    fn test_byte_offsets_valid() {
        let chunker = chunker_with(50, 10);

        let text = "# Heading\n\nFirst paragraph with some text.\n\n## Subheading\n\nSecond paragraph with more text here.";
        let chunks = chunker.chunk(Uuid::new_v4(), text).unwrap();

        for chunk in &chunks {
            let offset = chunk.byte_offset as usize;
            // The byte offset should be within the original text
            assert!(
                offset <= text.len(),
                "byte_offset {} exceeds text len {}",
                offset,
                text.len()
            );
        }
    }

    #[test]
    fn test_overlap_shared_text() {
        let chunker = chunker_with(30, 10);

        let text = "Word one. Word two. Word three. Word four. Word five. Word six.";
        let chunks = chunker.chunk(Uuid::new_v4(), text).unwrap();

        // The text is more than twice the chunk size, so a single chunk would
        // mean splitting is broken. Assert this instead of silently skipping
        // the checks below.
        assert!(
            chunks.len() >= 2,
            "expected multiple chunks, got {}",
            chunks.len()
        );

        // The text-splitter handles overlap at semantic boundaries, so this
        // test does not inspect the shared content itself; it only verifies
        // that the chunks carry valid sequence numbers.
        assert_sequential(&chunks);
    }
}