chonkier 0.0.2

🦛 Chonkie, now in Rust 🦀: No-nonsense, ultra-fast, ultra-light chunking library
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
/*
    Sentence Chunker chunks the text based on the sentence boundaries,
    which makes it useful for chunking text without sub-token splits.
*/

use std::cmp::{max, min};

use rayon::prelude::*;
use crate::tokenizer::CharacterTokenizer;
use crate::tokenizer::Tokenizer;
use crate::types::{Sentence, SentenceChunk};

/// Determines how delimiters are included in the chunked sentences
pub enum DelimiterHandling {
    /// Include delimiters with the previous sentence
    Previous,
    /// Include delimiters with the next sentence
    Next,
    /// Do not include delimiters in either sentence
    None,
}

impl Default for DelimiterHandling {
    fn default() -> Self {
        DelimiterHandling::Previous
    }
}

/// SentenceChunker splits text into chunks based on sentence boundaries and token limits
pub struct SentenceChunker<T: Tokenizer> {
    /// The tokenizer to use for counting tokens
    pub tokenizer: T,
    /// Maximum number of tokens per chunk
    pub chunk_size: usize,
    /// Number of tokens to overlap between chunks
    pub chunk_overlap: usize,
    /// Minimum number of sentences per chunk
    pub min_sentences_per_chunk: usize,
    /// Minimum number of characters per sentence
    pub min_characters_per_sentence: usize,
    /// Whether to use approximate token counting
    pub approximate: bool,
    /// Delimiters to split sentences on
    pub delimiters: Vec<String>,
    /// How to handle delimiters in sentences
    pub include_delim: DelimiterHandling,
    /// Separator used internally for sentence splitting
    separator: String,
    /// Average characters per token (for estimation)
    chars_per_token: f32,
}

impl<T: Tokenizer> SentenceChunker<T> {
    // Constructor for SentenceChunker
    pub fn new<S: Into<String>>(
        tokenizer: T,
        chunk_size: usize,
        chunk_overlap: usize,
        min_sentences_per_chunk: usize,
        min_characters_per_sentence: usize,
        approximate: bool,
        delimiters: Vec<S>,
        include_delim: DelimiterHandling,
    ) -> Result<Self, &'static str> {
        // Convert the delimiters into a vector of strings
        let delimiters = delimiters
            .into_iter()
            .map(|s| s.into())
            .collect::<Vec<String>>();

        // chunk_size is usize, which ensures it can hold a non-negative integer value.
        // chunk_size must be greater than 0.
        if chunk_size == 0 {
            return Err("chunk_size must be greater than 0");
        }
        // chunk_overlap must be less than chunk_size and greater than or equal to 0.
        if chunk_overlap >= chunk_size {
            return Err(
                "chunk_overlap must be less than chunk_size and greater than or equal to 0",
            );
        }
        if min_sentences_per_chunk == 0 {
            return Err("min_sentences_per_chunk must be greater than 0");
        }
        if min_characters_per_sentence == 0 {
            return Err("min_characters_per_sentence must be greater than 0");
        }
        if delimiters.is_empty() {
            return Err("delimiters must not be empty");
        }

        Ok(Self {
            tokenizer,
            chunk_size,
            chunk_overlap,
            min_sentences_per_chunk,
            min_characters_per_sentence,
            approximate,
            delimiters,
            include_delim,
            separator: "🦛".to_string(),
            chars_per_token: 6.0,
        })
    }

    /// Split text into sentences based on delimiters
    fn split_sentences(&self, text: &str) -> Vec<String> {
        let mut t = text.to_string();

        // Replace delimiters with separator based on include_delim setting
        for delim in &self.delimiters {
            match self.include_delim {
                DelimiterHandling::Previous => {
                    t = t.replace(delim, &format!("{}{}", delim, self.separator));
                }
                DelimiterHandling::Next => {
                    t = t.replace(delim, &format!("{}{}", self.separator, delim));
                }
                DelimiterHandling::None => {
                    t = t.replace(delim, &self.separator);
                }
            }
        }

        // Initial split
        let splits: Vec<String> = t
            .split(&self.separator)
            .filter(|s| !s.is_empty())
            .map(|s| s.to_string())
            .collect();

        // Combine short splits with previous sentence
        let mut current = String::new();
        let mut sentences = Vec::new();

        for s in splits {
            // If the split is short, add to current
            if s.len() < self.min_characters_per_sentence {
                current.push_str(&s);
            } else if !current.is_empty() {
                // If we have accumulated text and the new split is long enough
                current.push_str(&s);
                sentences.push(current.clone());
                current.clear();
            } else {
                // If the split is long enough and we don't have accumulated text
                sentences.push(s);
            }

            // If the current sentence is long enough, add it to sentences
            if current.len() >= self.min_characters_per_sentence {
                sentences.push(current.clone());
                current.clear();
            }
        }

        // If there is remaining text, add it to sentences
        if !current.is_empty() {
            sentences.push(current);
        }

        sentences
    }

    /// Estimate token counts based on character length
    fn estimate_token_counts(&self, texts: &[String]) -> Vec<usize> {
        texts
            .iter()
            .map(|text| max(1, (text.len() as f32 / self.chars_per_token) as usize))
            .collect()
    }

    /// Prepare sentences with token counts and positions
    fn prepare_sentences(&self, text: &str) -> Vec<Sentence> {
        // Split text into sentences
        let sentence_texts = self.split_sentences(text);
        if sentence_texts.is_empty() {
            return Vec::new();
        }

        // Calculate positions
        let mut positions = Vec::with_capacity(sentence_texts.len());
        let mut current_pos = 0;

        for sent in &sentence_texts {
            positions.push(current_pos);
            current_pos += sent.len();
        }

        // Calculate token counts
        let token_counts = if self.approximate {
            self.estimate_token_counts(&sentence_texts)
        } else {
            // For accurate counting, we'd encode each sentence
            sentence_texts
                .iter()
                .map(|sent| self.tokenizer.count_tokens(sent))
                .collect()
        };

        // Create sentence objects
        sentence_texts
            .iter()
            .zip(positions.iter())
            .zip(token_counts.iter())
            .map(|((sent, &pos), &count)| Sentence::new(sent, pos, pos + sent.len(), count))
            .collect()
    }

    /// Calculate feedback value to adjust token estimates
    fn get_feedback(&self, estimate: usize, actual: usize) -> f32 {
        let estimate = max(1, estimate) as f32;
        let actual = max(1, actual) as f32;
        (1.0 - ((estimate - actual) / estimate)).max(0.01)
    }

    /// Create a chunk from a list of sentences
    fn create_chunk(&self, sentences: &[Sentence]) -> SentenceChunk {
        let chunk_text: String = sentences
            .iter()
            .map(|sentence| sentence.text.clone())
            .collect();
        let token_count: usize = sentences.iter().map(|s| s.token_count).sum();

        SentenceChunk::new(
            &chunk_text,
            sentences.first().unwrap().start_index,
            sentences.last().unwrap().end_index,
            token_count,
            Some(sentences.to_vec()),
        )
    }

    // Split text into chunks based on sentences and token limits
    pub fn chunk(&self, text: &str) -> Vec<SentenceChunk> {
        if text.trim().is_empty() {
            return Vec::new();
        }

        // Get prepared sentences with token counts
        let sentences = self.prepare_sentences(text);
        if sentences.is_empty() {
            return Vec::new();
        }

        // Pre-calculate cumulative token counts
        let mut token_sums = Vec::with_capacity(sentences.len() + 1);
        token_sums.push(0);
        let mut sum = 0;
        for sentence in &sentences {
            sum += sentence.token_count;
            token_sums.push(sum);
        }

        let mut chunks = Vec::new();
        let mut feedback = 1.0;
        let mut pos = 0;

        while pos < sentences.len() {
            // Apply feedback to token sums
            let adjusted_token_sums: Vec<usize> = token_sums
                .iter()
                .map(|&sum| (sum as f32 * feedback) as usize)
                .collect();

            // Use binary search to find split point
            let target_tokens = adjusted_token_sums[pos] + self.chunk_size;
            let mut split_idx = match adjusted_token_sums.binary_search(&target_tokens) {
                Ok(idx) => idx,
                Err(idx) => idx.saturating_sub(1),
            };

            // Ensure we include at least one sentence beyond pos
            split_idx = max(split_idx, pos + 1);
            split_idx = min(split_idx, sentences.len());

            // Handle minimum sentences requirement
            if split_idx - pos < self.min_sentences_per_chunk {
                if pos + self.min_sentences_per_chunk <= sentences.len() {
                    split_idx = pos + self.min_sentences_per_chunk;
                } else {
                    eprintln!(
                        "Warning: Minimum sentences per chunk as {} could not be met for all chunks. \
                        Last chunk of the text will have only {} sentences. \
                        Consider increasing the chunk_size or decreasing the min_sentences_per_chunk.",
                        self.min_sentences_per_chunk,
                        sentences.len() - pos
                    );
                    split_idx = sentences.len();
                }
            }

            // Get estimated token count
            let estimate = adjusted_token_sums[split_idx] - adjusted_token_sums[pos];

            // Get candidate sentences and verify actual token count
            let chunk_sentences = &sentences[pos..split_idx];
            let chunk_text: String = chunk_sentences.iter().map(|s| s.text.clone()).collect();
            let actual = if self.approximate {
                self.tokenizer.encode(&chunk_text).len()
            } else {
                // We already have accurate counts
                chunk_sentences.iter().map(|s| s.token_count).sum()
            };

            // Update feedback for the next iteration
            feedback = self.get_feedback(estimate, actual);

            // Back off one sentence at a time if we exceeded chunk size
            let mut final_split_idx = split_idx;
            let mut final_chunk_sentences = chunk_sentences.to_vec();
            let mut final_actual = actual;

            while final_actual > self.chunk_size
                && final_chunk_sentences.len() > self.min_sentences_per_chunk
            {
                final_split_idx -= 1;
                final_chunk_sentences = sentences[pos..final_split_idx].to_vec();
                let final_chunk_text: String = final_chunk_sentences
                    .iter()
                    .map(|s| s.text.clone())
                    .collect();
                final_actual = if self.approximate {
                    self.tokenizer.encode(&final_chunk_text).len()
                } else {
                    final_chunk_sentences.iter().map(|s| s.token_count).sum()
                };
            }

            chunks.push(self.create_chunk(&final_chunk_sentences));

            // Calculate next position with overlap
            if self.chunk_overlap > 0 && final_split_idx < sentences.len() {
                // Calculate how many sentences we need for overlap
                let mut overlap_tokens = 0;
                let mut overlap_idx = final_split_idx - 1;

                while overlap_idx > pos && overlap_tokens < self.chunk_overlap {
                    let sent = &sentences[overlap_idx];
                    let next_tokens = overlap_tokens + sent.token_count + 1; // +1 for space
                    if next_tokens > self.chunk_overlap {
                        break;
                    }
                    overlap_tokens = next_tokens;
                    overlap_idx -= 1;
                }

                // Move position to after the overlap
                pos = overlap_idx + 1;
            } else {
                pos = final_split_idx;
            }
        }

        chunks
    }

    /// Chunk multiple texts in a batch
    pub fn chunk_batch(&self, texts: &Vec<String>) -> Vec<Vec<SentenceChunk>> {
        texts.par_iter().map(|text| self.chunk(text)).collect()
    }
}

impl Default for SentenceChunker<CharacterTokenizer> {
    fn default() -> Self {
        let tokenizer = CharacterTokenizer::new();
        Self::new(
            tokenizer,
            512,                         // chunk_size
            0,                           // chunk_overlap
            1,                           // min_sentences_per_chunk
            12,                          // min_characters_per_sentence
            true,                        // approximate
            vec![".", "!", "?", "\n"],   // delimiters
            DelimiterHandling::Previous, // include_delim
        )
        .expect("Default parameters should be valid")
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer::CharacterTokenizer;

    #[test]
    fn test_split_sentences() {
        let tokenizer = CharacterTokenizer::new();
        let chunker = SentenceChunker::new(
            tokenizer,
            20,
            0,
            1,
            5,
            true,
            vec![".", "!", "?", "\n"],
            DelimiterHandling::Previous,
        )
        .unwrap(); // TODO: Figure out a better way to handle this than unwrap

        let text: &str = "Hello, world! This is a test. How are you?";
        let sentences = chunker.split_sentences(text);

        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello, world!");
        assert_eq!(sentences[1], " This is a test.");
        assert_eq!(sentences[2], " How are you?");
    }

    #[test]
    fn test_chunk() {
        let tokenizer = CharacterTokenizer::new();
        let chunker = SentenceChunker::new(
            tokenizer,
            20,
            0,
            1,
            5,
            true,
            vec![".", "!", "?"],
            DelimiterHandling::Previous,
        )
        .unwrap(); // TODO: Figure out a better way to handle this than unwrap

        let text: &str = "Hello, world! This is a test. How are you? I am fine, thank you!";
        let chunks = chunker.chunk(&text.to_string());

        // With a small chunk size, we should get multiple chunks
        assert!(chunks.len() > 1);

        // Verify the chunks cover the entire text
        let combined: String = chunks.iter().map(|c| c.text.clone()).collect();
        assert_eq!(combined, text);
    }

    #[test]
    fn test_min_sentences_per_chunk() {
        let tokenizer = CharacterTokenizer::new();
        let chunker = SentenceChunker::new(
            tokenizer,
            100,
            0,
            2, // Require at least 2 sentences per chunk
            5,
            true,
            vec![".", "!", "?", "\n"],
            DelimiterHandling::Previous,
        )
        .unwrap();

        let text = "Short. Another short. And another. One more.";
        let chunks = chunker.chunk(&text.to_string());

        // With min_sentences_per_chunk=2, we should get at most 2 chunks from 4 sentences
        assert!(chunks.len() <= 2);

        // Each chunk should have at least 2 sentences
        for chunk in &chunks {
            assert!(chunk.sentences.as_ref().unwrap().len() >= 2);
        }
    }
}