use crate::ChunkValidator;

/// Generic interface for tokenizers to calculate the number of tokens in a given text.
pub trait TokenCount {
    /// Returns the number of tokens in a given text after tokenization.
    fn token_count(&self, text: &str) -> usize;
}

impl<T> ChunkValidator for T
where
    T: TokenCount,
{
    /// Determine whether the given chunk still fits within the specified max
    /// chunk size, as measured in tokens.
    ///
    /// ```
    /// use text_splitter::ChunkValidator;
    /// use tokenizers::Tokenizer;
    ///
    /// let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap();
    /// assert!(tokenizer.validate_chunk("hello", 10));
    /// ```
    fn validate_chunk(&self, chunk: &str, chunk_size: usize) -> bool {
        self.token_count(chunk) <= chunk_size
    }
}
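
// A minimal sketch (not from the original source): `WhitespaceCounter` is a
// hypothetical tokenizer that treats whitespace-separated words as tokens. It
// demonstrates that any `TokenCount` implementor gets `validate_chunk` for
// free from the blanket `ChunkValidator` impl above.
#[cfg(test)]
mod token_count_sketch {
    use super::{ChunkValidator, TokenCount};

    struct WhitespaceCounter;

    impl TokenCount for WhitespaceCounter {
        /// Count whitespace-separated words as "tokens".
        fn token_count(&self, text: &str) -> usize {
            text.split_whitespace().count()
        }
    }

    #[test]
    fn validate_chunk_uses_token_count() {
        // Two words fit within a budget of two tokens...
        assert!(WhitespaceCounter.validate_chunk("hello world", 2));
        // ...but three words exceed it.
        assert!(!WhitespaceCounter.validate_chunk("one two three", 2));
    }
}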