// cognis_core/tokenizer.rs
//! Pluggable token counting trait.
//!
//! Lives in `cognis-core` so non-RAG code can budget tokens without
//! pulling in `cognis-rag`. Concrete tokenizer crates (tiktoken-rs, HF
//! tokenizers, etc.) are integrated by user code via the trait.

7/// Counts tokens in a piece of text.
/// Counts tokens in a piece of text.
///
/// Implementors must be `Send + Sync` so a single tokenizer instance can
/// be shared across threads (e.g. behind an `Arc`) when budgeting
/// concurrently.
pub trait Tokenizer: Send + Sync {
    /// Number of tokens this tokenizer would produce for `text`.
    fn count(&self, text: &str) -> usize;
}
13/// Trivial char-as-token implementation. Conservative upper bound on
14/// real tokenizer counts; useful as a default for budgeting.
15#[derive(Debug, Default, Clone, Copy)]
16pub struct CharTokenizer;
17
18impl Tokenizer for CharTokenizer {
19    fn count(&self, text: &str) -> usize {
20        text.chars().count()
21    }
22}
24/// Closure-backed tokenizer.
25pub struct FnTokenizer<F: Fn(&str) -> usize + Send + Sync>(pub F);
26
27impl<F: Fn(&str) -> usize + Send + Sync> Tokenizer for FnTokenizer<F> {
28    fn count(&self, text: &str) -> usize {
29        (self.0)(text)
30    }
31}
#[cfg(test)]
mod tests {
    use super::*;

    /// CharTokenizer yields one token per Unicode scalar.
    #[test]
    fn char_tokenizer_counts_chars() {
        let tok = CharTokenizer;
        assert_eq!(tok.count("hello"), 5);
    }

    /// FnTokenizer delegates counting to the wrapped closure.
    #[test]
    fn fn_tokenizer_invokes_closure() {
        let word_counter = FnTokenizer(|s: &str| s.split_whitespace().count());
        assert_eq!(word_counter.count("hello rust world"), 3);
    }
}