Skip to main content

cognee_chunking/
chunk_by_paragraph.rs

1//! Paragraph-level text chunker.
2//!
3//! Batches sentences into paragraph-sized chunks, respecting a maximum token
4//! count. Supports both batch mode (accumulate across paragraphs) and
5//! non-batch mode (yield at each paragraph boundary).
6
7use cognee_utils::NAMESPACE_OID;
8use uuid::Uuid;
9
10use crate::chunk_by_sentence::chunk_by_sentence;
11use crate::cut_type::CutType;
12use crate::token_counter::TokenCounter;
13
14/// A paragraph-level chunk with metadata. Borrows text from the input.
15#[derive(Debug, Clone)]
16pub struct ParagraphChunk<'a> {
17    /// The accumulated text, borrowed from the input.
18    pub text: &'a str,
19    /// Token count of the chunk.
20    pub chunk_size: usize,
21    /// Deterministic ID: uuid5(NAMESPACE_OID, text).
22    pub chunk_id: Uuid,
23    /// Paragraph IDs from the sentences that compose this chunk.
24    pub paragraph_ids: Vec<Uuid>,
25    /// Sequential chunk index.
26    pub chunk_index: usize,
27    /// How the chunk boundary was determined.
28    pub cut_type: CutType,
29}
30
31/// Chunks text by paragraph, optionally batching across paragraph boundaries.
32///
33/// - `data`: input text
34/// - `max_chunk_size`: maximum token count per chunk
35/// - `batch_paragraphs`: if true, accumulates sentences across paragraph
36///   boundaries until overflow. If false, yields at each paragraph boundary.
37/// - `counter`: token counter implementation
38#[allow(
39    clippy::expect_used,
40    reason = "chunk_start invariants are upheld by the accumulation logic above each emit branch"
41)]
42pub fn chunk_by_paragraph<'a, C: TokenCounter>(
43    data: &'a str,
44    max_chunk_size: usize,
45    batch_paragraphs: bool,
46    counter: &C,
47) -> Vec<ParagraphChunk<'a>> {
48    let sentences = chunk_by_sentence(data, Some(max_chunk_size), counter);
49    let mut result = Vec::new();
50    let mut chunk_index: usize = 0;
51    let mut paragraph_ids: Vec<Uuid> = Vec::new();
52    let mut last_cut_type = CutType::SentenceCut;
53    let mut current_chunk_size: usize = 0;
54    // Track the byte range of the current chunk in `data`.
55    let mut chunk_start: Option<usize> = None;
56    let mut chunk_end: usize = 0;
57
58    for sentence in &sentences {
59        let sent_start = sentence.text.as_ptr() as usize - data.as_ptr() as usize;
60        let sent_end = sent_start + sentence.text.len();
61
62        // Overflow: yield current chunk and start fresh
63        if current_chunk_size > 0 && (current_chunk_size + sentence.size > max_chunk_size) {
64            let text = &data[chunk_start.expect("chunk_start is Some because current_chunk_size > 0 only after a sentence was accumulated")..chunk_end];
65            result.push(ParagraphChunk {
66                text,
67                chunk_size: current_chunk_size,
68                chunk_id: Uuid::new_v5(&NAMESPACE_OID, text.as_bytes()),
69                paragraph_ids: std::mem::take(&mut paragraph_ids),
70                chunk_index,
71                cut_type: last_cut_type.clone(),
72            });
73            current_chunk_size = 0;
74            chunk_start = None;
75            chunk_index += 1;
76        }
77
78        paragraph_ids.push(sentence.paragraph_id);
79        if chunk_start.is_none() {
80            chunk_start = Some(sent_start);
81        }
82        chunk_end = sent_end;
83        current_chunk_size += sentence.size;
84
85        // Non-batch mode: yield at paragraph boundaries
86        if !batch_paragraphs
87            && matches!(
88                sentence.cut_type,
89                CutType::ParagraphEnd | CutType::SentenceCut
90            )
91        {
92            let text = &data[chunk_start.expect(
93                "chunk_start is Some because it was set above before this emit branch is reached",
94            )..chunk_end];
95            result.push(ParagraphChunk {
96                text,
97                chunk_size: current_chunk_size,
98                chunk_id: Uuid::new_v5(&NAMESPACE_OID, text.as_bytes()),
99                paragraph_ids: std::mem::take(&mut paragraph_ids),
100                chunk_index,
101                cut_type: sentence.cut_type.clone(),
102            });
103            current_chunk_size = 0;
104            chunk_start = None;
105            chunk_index += 1;
106        }
107
108        last_cut_type = sentence.cut_type.clone();
109    }
110
111    // Yield remaining text
112    if let Some(start) = chunk_start {
113        let final_cut_type = if last_cut_type == CutType::Word {
114            CutType::SentenceCut
115        } else {
116            last_cut_type
117        };
118        let text = &data[start..chunk_end];
119        result.push(ParagraphChunk {
120            chunk_id: Uuid::new_v5(&NAMESPACE_OID, text.as_bytes()),
121            text,
122            chunk_size: current_chunk_size,
123            paragraph_ids,
124            chunk_index,
125            cut_type: final_cut_type,
126        });
127    }
128
129    result
130}
131
/// Unit tests for `chunk_by_paragraph`, all driven by the word-based counter.
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    reason = "test code — panics are acceptable failures"
)]
mod tests {
    use super::*;
    use crate::cut_type::CutType;
    use crate::token_counter::WordCounter;

    /// Three paragraphs of short sentences; the text ends with a full stop.
    const TEXT_ENDING_WITH_PERIOD: &str = concat!(
        "The quick brown fox jumps over the lazy dog. It was a sunny day.\n",
        "The rain in Spain falls mainly on the plain. A stitch in time saves nine. An apple a day keeps the doctor away.\n",
        "To be or not to be that is the question. All that glitters is not gold. Actions speak louder than words. The pen is mightier than the sword. Knowledge is power above all else.",
    );

    /// Same three paragraphs, but the final sentence lacks its closing full stop.
    const TEXT_WITHOUT_FINAL_PERIOD: &str = concat!(
        "The quick brown fox jumps over the lazy dog. It was a sunny day.\n",
        "The rain in Spain falls mainly on the plain. A stitch in time saves nine. An apple a day keeps the doctor away.\n",
        "To be or not to be that is the question. All that glitters is not gold. Actions speak louder than words. The pen is mightier than the sword. Knowledge is power above all else",
    );

    /// An empty input produces no chunks.
    #[test]
    fn empty_input() {
        let chunks = chunk_by_paragraph("", 10, true, &WordCounter);
        assert!(chunks.is_empty());
    }

    /// A single sentence well under the budget comes back as one chunk.
    #[test]
    fn single_short_paragraph() {
        let produced = chunk_by_paragraph("Hello world.", 100, true, &WordCounter);
        assert_eq!(produced.len(), 1);

        let only = &produced[0];
        assert_eq!(only.text, "Hello world.");
        assert_eq!(only.chunk_size, 2);
        assert_eq!(only.chunk_index, 0);
    }

    /// With a generous budget, batch mode merges every sentence into one chunk.
    #[test]
    fn batch_mode_accumulates() {
        let produced = chunk_by_paragraph(
            "First sentence. Second sentence. Third sentence.",
            100,
            true,
            &WordCounter,
        );
        assert_eq!(produced.len(), 1);
        assert_eq!(produced[0].chunk_size, 6);
    }

    /// A budget of 3 words holds one 2-word sentence; the next one (2+2=4>3)
    /// must start a new chunk.
    #[test]
    fn batch_mode_overflow() {
        let chunks = chunk_by_paragraph("One two. Three four. Five six.", 3, true, &WordCounter);
        assert!(chunks.len() >= 2);
        assert_eq!(chunks[0].chunk_index, 0);
        assert_eq!(chunks[1].chunk_index, 1);
    }

    /// Non-batch mode closes a chunk at each paragraph boundary.
    #[test]
    fn non_batch_mode_yields_at_paragraph() {
        let chunks = chunk_by_paragraph(
            "First paragraph.\nSecond paragraph.",
            100,
            false,
            &WordCounter,
        );
        assert!(chunks.len() >= 2);
    }

    /// Chunk indices count up from zero without gaps.
    #[test]
    fn sequential_chunk_indices() {
        let produced = chunk_by_paragraph("A. B. C. D. E.", 2, true, &WordCounter);
        for (expected, chunk) in (0usize..).zip(&produced) {
            assert_eq!(chunk.chunk_index, expected);
        }
    }

    /// Chunking the same text twice yields the same ID for the first chunk.
    #[test]
    fn deterministic_ids() {
        let source = "Hello world. Foo bar.";
        let first_run = chunk_by_paragraph(source, 100, true, &WordCounter);
        let second_run = chunk_by_paragraph(source, 100, true, &WordCounter);
        assert_eq!(first_run[0].chunk_id, second_run[0].chunk_id);
    }

    /// Multi-paragraph text ending in "." with a 12-word budget in batch mode.
    #[test]
    fn ground_truth_whole_text() {
        let chunks = chunk_by_paragraph(TEXT_ENDING_WITH_PERIOD, 12, true, &WordCounter);

        // The text holds far more than 12 words, so overflow must split it.
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );

        // No chunk may exceed the 12-word budget.
        for (i, chunk) in chunks.iter().enumerate() {
            assert!(
                chunk.chunk_size <= 12,
                "chunk {i} has size {} > 12",
                chunk.chunk_size
            );
        }

        // The text ends with sentence-ending punctuation ".".
        let last = chunks.last().unwrap();
        assert_eq!(last.cut_type, CutType::SentenceEnd);

        // Indices follow output order.
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, i, "chunk_index mismatch at {i}");
        }
    }

    /// Same text without the final ".": the last chunk is a sentence cut.
    #[test]
    fn ground_truth_cut_text() {
        let chunks = chunk_by_paragraph(TEXT_WITHOUT_FINAL_PERIOD, 12, true, &WordCounter);

        assert!(chunks.len() >= 2, "expected at least 2 chunks");

        let last = chunks.last().unwrap();
        assert_eq!(
            last.cut_type,
            CutType::SentenceCut,
            "last chunk should be SentenceCut when text doesn't end with punctuation"
        );
    }
}