1use cognee_utils::NAMESPACE_OID;
8use uuid::Uuid;
9
10use crate::chunk_by_sentence::chunk_by_sentence;
11use crate::cut_type::CutType;
12use crate::token_counter::TokenCounter;
13
14#[derive(Debug, Clone)]
16pub struct ParagraphChunk<'a> {
17 pub text: &'a str,
19 pub chunk_size: usize,
21 pub chunk_id: Uuid,
23 pub paragraph_ids: Vec<Uuid>,
25 pub chunk_index: usize,
27 pub cut_type: CutType,
29}
30
31#[allow(
39 clippy::expect_used,
40 reason = "chunk_start invariants are upheld by the accumulation logic above each emit branch"
41)]
42pub fn chunk_by_paragraph<'a, C: TokenCounter>(
43 data: &'a str,
44 max_chunk_size: usize,
45 batch_paragraphs: bool,
46 counter: &C,
47) -> Vec<ParagraphChunk<'a>> {
48 let sentences = chunk_by_sentence(data, Some(max_chunk_size), counter);
49 let mut result = Vec::new();
50 let mut chunk_index: usize = 0;
51 let mut paragraph_ids: Vec<Uuid> = Vec::new();
52 let mut last_cut_type = CutType::SentenceCut;
53 let mut current_chunk_size: usize = 0;
54 let mut chunk_start: Option<usize> = None;
56 let mut chunk_end: usize = 0;
57
58 for sentence in &sentences {
59 let sent_start = sentence.text.as_ptr() as usize - data.as_ptr() as usize;
60 let sent_end = sent_start + sentence.text.len();
61
62 if current_chunk_size > 0 && (current_chunk_size + sentence.size > max_chunk_size) {
64 let text = &data[chunk_start.expect("chunk_start is Some because current_chunk_size > 0 only after a sentence was accumulated")..chunk_end];
65 result.push(ParagraphChunk {
66 text,
67 chunk_size: current_chunk_size,
68 chunk_id: Uuid::new_v5(&NAMESPACE_OID, text.as_bytes()),
69 paragraph_ids: std::mem::take(&mut paragraph_ids),
70 chunk_index,
71 cut_type: last_cut_type.clone(),
72 });
73 current_chunk_size = 0;
74 chunk_start = None;
75 chunk_index += 1;
76 }
77
78 paragraph_ids.push(sentence.paragraph_id);
79 if chunk_start.is_none() {
80 chunk_start = Some(sent_start);
81 }
82 chunk_end = sent_end;
83 current_chunk_size += sentence.size;
84
85 if !batch_paragraphs
87 && matches!(
88 sentence.cut_type,
89 CutType::ParagraphEnd | CutType::SentenceCut
90 )
91 {
92 let text = &data[chunk_start.expect(
93 "chunk_start is Some because it was set above before this emit branch is reached",
94 )..chunk_end];
95 result.push(ParagraphChunk {
96 text,
97 chunk_size: current_chunk_size,
98 chunk_id: Uuid::new_v5(&NAMESPACE_OID, text.as_bytes()),
99 paragraph_ids: std::mem::take(&mut paragraph_ids),
100 chunk_index,
101 cut_type: sentence.cut_type.clone(),
102 });
103 current_chunk_size = 0;
104 chunk_start = None;
105 chunk_index += 1;
106 }
107
108 last_cut_type = sentence.cut_type.clone();
109 }
110
111 if let Some(start) = chunk_start {
113 let final_cut_type = if last_cut_type == CutType::Word {
114 CutType::SentenceCut
115 } else {
116 last_cut_type
117 };
118 let text = &data[start..chunk_end];
119 result.push(ParagraphChunk {
120 chunk_id: Uuid::new_v5(&NAMESPACE_OID, text.as_bytes()),
121 text,
122 chunk_size: current_chunk_size,
123 paragraph_ids,
124 chunk_index,
125 cut_type: final_cut_type,
126 });
127 }
128
129 result
130}
131
/// Unit tests for `chunk_by_paragraph`, driven by the word-based counter.
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    reason = "test code — panics are acceptable failures"
)]
mod tests {
    use super::*;
    use crate::token_counter::WordCounter;

    /// Three-paragraph fixture used by the ground-truth tests, *without* a
    /// terminating period so each test can decide how the text ends.
    const GROUND_TRUTH_BODY: &str = "The quick brown fox jumps over the lazy dog. It was a sunny day.\n\
        The rain in Spain falls mainly on the plain. A stitch in time saves nine. An apple a day keeps the doctor away.\n\
        To be or not to be that is the question. All that glitters is not gold. Actions speak louder than words. The pen is mightier than the sword. Knowledge is power above all else";

    /// Runs the chunker in batch mode with the word counter.
    fn batched(text: &str, max_chunk_size: usize) -> Vec<ParagraphChunk<'_>> {
        chunk_by_paragraph(text, max_chunk_size, true, &WordCounter)
    }

    /// Empty input produces no chunks at all.
    #[test]
    fn empty_input() {
        assert!(batched("", 10).is_empty());
    }

    /// A short text that fits the limit comes back as a single chunk.
    #[test]
    fn single_short_paragraph() {
        let chunks = batched("Hello world.", 100);
        assert_eq!(chunks.len(), 1);

        let only = &chunks[0];
        assert_eq!(only.text, "Hello world.");
        assert_eq!(only.chunk_size, 2);
        assert_eq!(only.chunk_index, 0);
    }

    /// In batch mode several sentences under the limit merge into one chunk.
    #[test]
    fn batch_mode_accumulates() {
        let chunks = batched("First sentence. Second sentence. Third sentence.", 100);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].chunk_size, 6);
    }

    /// Exceeding the limit in batch mode splits the text into indexed chunks.
    #[test]
    fn batch_mode_overflow() {
        let chunks = batched("One two. Three four. Five six.", 3);
        assert!(chunks.len() >= 2);
        assert_eq!(chunks[0].chunk_index, 0);
        assert_eq!(chunks[1].chunk_index, 1);
    }

    /// With batching disabled, a paragraph break closes the current chunk.
    #[test]
    fn non_batch_mode_yields_at_paragraph() {
        let chunks =
            chunk_by_paragraph("First paragraph.\nSecond paragraph.", 100, false, &WordCounter);
        assert!(chunks.len() >= 2);
    }

    /// Chunk indices count up from zero without gaps.
    #[test]
    fn sequential_chunk_indices() {
        let chunks = batched("A. B. C. D. E.", 2);
        for (position, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, position);
        }
    }

    /// Chunking the same text twice yields the same id for the first chunk.
    #[test]
    fn deterministic_ids() {
        let text = "Hello world. Foo bar.";
        let first_run = batched(text, 100);
        let second_run = batched(text, 100);
        assert_eq!(first_run[0].chunk_id, second_run[0].chunk_id);
    }

    /// Fixture ending in a period: sizes respect the limit, the last chunk is
    /// a sentence end, and indices are sequential.
    #[test]
    fn ground_truth_whole_text() {
        let input = format!("{GROUND_TRUTH_BODY}.");
        let chunks = batched(&input, 12);

        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );

        for (i, chunk) in chunks.iter().enumerate() {
            assert!(
                chunk.chunk_size <= 12,
                "chunk {i} has size {} > 12",
                chunk.chunk_size
            );
        }

        let last = chunks.last().unwrap();
        assert_eq!(last.cut_type, CutType::SentenceEnd);

        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, i, "chunk_index mismatch at {i}");
        }
    }

    /// Fixture without a final period: the last chunk is reported as a
    /// sentence cut.
    #[test]
    fn ground_truth_cut_text() {
        let chunks = batched(GROUND_TRUTH_BODY, 12);

        assert!(chunks.len() >= 2, "expected at least 2 chunks");

        let last = chunks.last().unwrap();
        assert_eq!(
            last.cut_type,
            CutType::SentenceCut,
            "last chunk should be SentenceCut when text doesn't end with punctuation"
        );
    }
}