Skip to main content

sqlite_graphrag/
tokenizer.rs

1//! Token-count utilities for embedding input sizing.
2//!
//! v1.0.76: the `tokenizers` crate was removed. Token counts are now
//! approximated from whitespace-split word counts, scaled by the
//! `WORDS_TO_TOKENS_NUMERATOR / WORDS_TO_TOKENS_DENOMINATOR` ratio (`3/2`,
//! i.e. 1.5 tokens per word, rounded up; conservative for English + the
//! multilingual-e5 prefix that the LLM headless invocation prepends).
7//!
//! The word-based count is an estimate for passages of any length, rounded
//! up so that it errs toward over-chunking. That is sufficient for the
//! chunking decision in `src/embedder.rs::embed_passages_controlled`; use
//! `count_tokens` when an exact cl100k_base count is needed.
11
12use crate::errors::AppError;
13
/// Numerator of the approximate tokens-per-word ratio (`3/2` = 1.5).
///
/// The multilingual-e5 family uses SentencePiece tokenisation, which yields
/// ~1.33 tokens per English word and slightly less for code. We round up to
/// 1.5 to keep the chunking decision conservative (better to over-chunk than
/// to overflow the LLM context window).
///
/// The ratio is stored as an integer fraction so that the estimate can be
/// computed entirely in `usize` arithmetic.
const WORDS_TO_TOKENS_NUMERATOR: usize = 3;
/// Denominator of the approximate tokens-per-word ratio; paired with
/// `WORDS_TO_TOKENS_NUMERATOR`.
const WORDS_TO_TOKENS_DENOMINATOR: usize = 2;
21
22/// Returns the approximate token count for `text` when prefixed with
23/// `prefix` (e.g. `passage:` for `embed_passage`).
24pub fn count_passage_tokens(text: &str) -> Result<usize, AppError> {
25    Ok(approx_tokens(&format!(
26        "{}{}",
27        crate::constants::PASSAGE_PREFIX,
28        text
29    )))
30}
31
32/// Returns the byte-offset pairs `(start, end)` for each whitespace-delimited
33/// word in `text`. The tokenizers crate used to return true sub-word offsets;
34/// the LLM headless path doesn't need that granularity, so we return word
35/// boundaries.
36pub fn passage_token_offsets(text: &str) -> Result<Vec<(usize, usize)>, AppError> {
37    let mut offsets = Vec::new();
38    let mut start = None;
39    for (i, c) in text.char_indices() {
40        if c.is_whitespace() {
41            if let Some(s) = start.take() {
42                if i > s {
43                    offsets.push((s, i));
44                }
45            }
46        } else if start.is_none() {
47            start = Some(i);
48        }
49    }
50    if let Some(s) = start {
51        if text.len() > s {
52            offsets.push((s, text.len()));
53        }
54    }
55    Ok(offsets)
56}
57
58/// Returns the model's max input length. Since we no longer have a
59/// tokenizer config, this returns the constant from `constants.rs`.
60/// Operators that need a different ceiling should set
61/// `SQLITE_GRAPHRAG_EMBEDDING_MAX_TOKENS` in the environment.
62pub fn get_model_max_length() -> usize {
63    crate::constants::EMBEDDING_MAX_TOKENS
64}
65
66/// Returns the exact cl100k_base (OpenAI tiktoken) token count of `text`.
67///
68/// This is a deliberately conservative proxy for the
69/// `qwen/qwen3-embedding-8b` tokenizer used by the OpenRouter embedding
70/// backend: cl100k_base generally emits at least as many tokens as Qwen's
71/// BPE for the same input, so a count comfortably under the model's
72/// ~32K-token effective ceiling guarantees the input fits Qwen's window.
73///
74/// Unlike `approx_tokens`, this is exact for arbitrary input. It uses the
75/// process-wide cached BPE singleton, so repeated calls do not re-initialise
76/// the tokenizer.
77pub fn count_tokens(text: &str) -> usize {
78    tiktoken_rs::cl100k_base_singleton()
79        .encode_ordinary(text)
80        .len()
81}
82
83fn approx_tokens(text: &str) -> usize {
84    let words = text.split_whitespace().count();
85    // Round up to avoid under-chunking.
86    let num = words.saturating_mul(WORDS_TO_TOKENS_NUMERATOR);
87    let (tokens, rem) = (
88        num / WORDS_TO_TOKENS_DENOMINATOR,
89        num % WORDS_TO_TOKENS_DENOMINATOR,
90    );
91    if rem == 0 {
92        tokens
93    } else {
94        tokens + 1
95    }
96}
97
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_string_has_zero_tokens() {
        assert_eq!(approx_tokens(""), 0);
        assert_eq!(approx_tokens("   \n\t  "), 0);
    }

    #[test]
    fn single_word_rounds_up() {
        // 1 word * 3 / 2 = 1.5 → 2 tokens
        assert_eq!(approx_tokens("hello"), 2);
    }

    #[test]
    fn three_words_round_up() {
        // 3 * 3 / 2 = 4.5 → 5 tokens
        assert_eq!(approx_tokens("a b c"), 5);
    }

    #[test]
    fn four_words_rounds_to_six() {
        // 4 * 3 / 2 = 6 exactly
        assert_eq!(approx_tokens("the quick brown fox"), 6);
    }

    #[test]
    fn passage_offsets_skip_whitespace() {
        let offsets = passage_token_offsets("hello world foo").unwrap();
        assert_eq!(offsets, vec![(0, 5), (6, 11), (12, 15)]);
    }

    #[test]
    fn passage_offsets_handle_leading_and_trailing_whitespace() {
        let offsets = passage_token_offsets("  hello  ").unwrap();
        assert_eq!(offsets, vec![(2, 7)]);
    }

    #[test]
    fn passage_offsets_are_empty_for_blank_input() {
        assert!(passage_token_offsets("").unwrap().is_empty());
        assert!(passage_token_offsets(" \n\t ").unwrap().is_empty());
    }

    #[test]
    fn passage_offsets_are_byte_indices_for_multibyte_text() {
        // 'é' and 'ö' each occupy two bytes in UTF-8.
        let offsets = passage_token_offsets("héllo wörld").unwrap();
        assert_eq!(offsets, vec![(0, 6), (7, 13)]);
    }

    #[test]
    fn count_passage_tokens_matches_approx_tokens() {
        // The expected value implies the prefix counts as one extra word:
        // (1 + 3) * 3 / 2 = 6.
        assert_eq!(count_passage_tokens("rust sqlite graphrag").unwrap(), 6);
    }

    #[test]
    fn count_passage_tokens_includes_prefix_for_short_inputs() {
        // (1 + 4) * 3 / 2 = 7.5 → 8
        assert_eq!(count_passage_tokens("teste fix real 4").unwrap(), 8);
    }

    #[test]
    fn count_passage_tokens_matches_embedding_when_text_already_has_prefix() {
        // The prefix is prepended unconditionally, so it is counted twice:
        // (1 + 5) * 3 / 2 = 9.
        assert_eq!(
            count_passage_tokens("passage: teste fix real 5").unwrap(),
            9
        );
    }

    #[test]
    fn count_tokens_matches_known_cl100k_counts() {
        // "hello world" is exactly 2 cl100k_base tokens; empty string is 0.
        assert_eq!(count_tokens("hello world"), 2);
        assert_eq!(count_tokens(""), 0);
    }
}