1use cp_core::{CPError, Chunk, Result};
4use text_splitter::{ChunkConfig as TSChunkConfig, MarkdownSplitter};
5use uuid::Uuid;
6
/// Settings that control how text is split into chunks.
///
/// Both values are handed unchanged to the `text_splitter` chunk
/// configuration built in `Chunker::chunk`, so they are measured in whatever
/// unit that configuration's sizer uses (characters, for the default sizer).
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Maximum size of a single chunk. Defaults to `1000`.
    pub chunk_size: usize,
    /// Amount of text shared between consecutive chunks. Defaults to `200`.
    ///
    /// The value is validated by `text_splitter` when chunking starts; an
    /// overlap it rejects surfaces as an error from `Chunker::chunk`.
    pub overlap: usize,
}

impl Default for ChunkConfig {
    /// Returns the stock configuration: chunks of up to 1000 units with an
    /// overlap of 200 units.
    fn default() -> Self {
        const DEFAULT_CHUNK_SIZE: usize = 1000;
        const DEFAULT_OVERLAP: usize = 200;

        Self {
            chunk_size: DEFAULT_CHUNK_SIZE,
            overlap: DEFAULT_OVERLAP,
        }
    }
}
24
25pub struct Chunker {
27 config: ChunkConfig,
28}
29
30impl Default for Chunker {
31 fn default() -> Self {
32 Self::new(ChunkConfig::default())
33 }
34}
35
36impl Chunker {
37 pub fn new(config: ChunkConfig) -> Self {
39 Self { config }
40 }
41
42 pub fn chunk(&self, doc_id: Uuid, text: &str) -> Result<Vec<Chunk>> {
44 if text.is_empty() {
45 return Ok(Vec::new());
46 }
47
48 let ts_config = TSChunkConfig::new(self.config.chunk_size)
49 .with_overlap(self.config.overlap)
50 .map_err(|e| CPError::Parse(format!("Invalid chunk config: {e}")))?
51 .with_trim(true);
52 let splitter = MarkdownSplitter::new(ts_config);
53
54 let chunks: Vec<Chunk> = splitter
55 .chunk_indices(text)
56 .enumerate()
57 .map(|(seq, (byte_offset, chunk_text))| {
58 Chunk::new(doc_id, chunk_text, byte_offset as u64, seq as u32)
59 })
60 .collect();
61
62 Ok(chunks)
63 }
64}
65
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a chunker with an explicit chunk size and overlap.
    fn chunker_with(chunk_size: usize, overlap: usize) -> Chunker {
        Chunker::new(ChunkConfig {
            chunk_size,
            overlap,
        })
    }

    /// Asserts that each chunk's `sequence` equals its position in `chunks`.
    fn assert_sequential(chunks: &[Chunk]) {
        for (position, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.sequence, position as u32);
        }
    }

    /// Empty input produces no chunks.
    #[test]
    fn test_empty_text() {
        let chunks = Chunker::default().chunk(Uuid::new_v4(), "").unwrap();
        assert!(chunks.is_empty());
    }

    /// Text well below the default chunk size stays in a single chunk.
    #[test]
    fn test_short_text() {
        let chunks = Chunker::default()
            .chunk(Uuid::new_v4(), "Short text.")
            .unwrap();
        assert_eq!(chunks.len(), 1);
    }

    /// Text longer than the chunk size is split, and the pieces are numbered
    /// consecutively from zero.
    #[test]
    fn test_long_text_chunking() {
        let input = "A".repeat(250);
        let chunks = chunker_with(100, 20)
            .chunk(Uuid::new_v4(), &input)
            .unwrap();

        assert!(chunks.len() > 1);
        assert_sequential(&chunks);
    }

    /// Several sentences that exceed the chunk size yield more than one chunk.
    ///
    /// Only the chunk count is checked here, not where the splits fall.
    #[test]
    fn test_sentence_boundary() {
        let input = "First sentence. Second sentence. Third sentence.";
        let chunks = chunker_with(20, 5).chunk(Uuid::new_v4(), input).unwrap();

        assert!(chunks.len() > 1);
    }

    /// Every reported byte offset lies within the source text.
    #[test]
    fn test_byte_offsets_valid() {
        let text = "# Heading\n\nFirst paragraph with some text.\n\n## Subheading\n\nSecond paragraph with more text here.";
        let chunks = chunker_with(50, 10).chunk(Uuid::new_v4(), text).unwrap();

        chunks
            .iter()
            .map(|chunk| chunk.byte_offset as usize)
            .for_each(|offset| {
                assert!(
                    offset <= text.len(),
                    "byte_offset {} exceeds text len {}",
                    offset,
                    text.len()
                );
            });
    }

    /// With overlap enabled, chunks are still numbered consecutively.
    ///
    /// Despite its name, this test only checks sequence numbers (and only when
    /// at least two chunks were produced); it does not inspect shared text.
    #[test]
    fn test_overlap_shared_text() {
        let input = "Word one. Word two. Word three. Word four. Word five. Word six.";
        let chunks = chunker_with(30, 10).chunk(Uuid::new_v4(), input).unwrap();

        if chunks.len() >= 2 {
            assert_sequential(&chunks);
        }
    }
}