//! text-splitter 0.30.1
//!
//! Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python.
use tiktoken_rs::CoreBPE;

use crate::ChunkSizer;

impl ChunkSizer for CoreBPE {
    /// Determines the size of a chunk as its token count.
    ///
    /// The text is tokenized with `encode_ordinary` (which treats the input
    /// as plain text, without interpreting special tokens), and the length
    /// of the resulting token sequence is returned.
    fn size(&self, chunk: &str) -> usize {
        let tokens = self.encode_ordinary(chunk);
        tokens.len()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use tiktoken_rs::cl100k_base;

    /// `ChunkSizer::size` should report the number of BPE tokens produced by
    /// `encode_ordinary`, not the character or byte length of the input.
    /// "An apple a" is 10 characters but only 3 cl100k_base tokens.
    #[test]
    fn returns_size() {
        let tokenizer = cl100k_base().unwrap();
        let size = tokenizer.size("An apple a");
        assert_eq!(size, 3);
    }
}