sqlite_graphrag/tokenizer.rs
1//! Token-count utilities for embedding input sizing.
2//!
//! v1.0.76: the `tokenizers` crate was removed. Token counts are now
//! approximated from whitespace-split word counts, scaled by the
//! `WORDS_TO_TOKENS_NUMERATOR / WORDS_TO_TOKENS_DENOMINATOR` factor
//! (`3 / 2`, i.e. 1.5 tokens per word, rounded up; conservative for English +
//! the multilingual-e5 prefix that the LLM headless invocation prepends).
7//!
//! Because it is derived from word counts, the result is an estimate for
//! passages of any length, not an exact tokenizer count. It is still
//! useful for the chunking decision in `src/embedder.rs::embed_passages_controlled`.
//! For an exact cl100k_base count, use `count_tokens`.
11
12use crate::errors::AppError;
13
/// Approximate tokens-per-word ratio, stored as the integer fraction
/// `WORDS_TO_TOKENS_NUMERATOR / WORDS_TO_TOKENS_DENOMINATOR` (3 / 2 = 1.5)
/// so the estimate can be computed without floating point.
///
/// The multilingual-e5 family uses SentencePiece tokenisation, which yields
/// ~1.33 tokens per English word and slightly less for code. We round up to
/// 1.5 to keep the chunking decision conservative (better to over-chunk than
/// to overflow the LLM context window).
const WORDS_TO_TOKENS_NUMERATOR: usize = 3;
/// Denominator of the tokens-per-word fraction; see `WORDS_TO_TOKENS_NUMERATOR`.
const WORDS_TO_TOKENS_DENOMINATOR: usize = 2;
21
22/// Returns the approximate token count for `text` when prefixed with
23/// `prefix` (e.g. `passage:` for `embed_passage`).
24pub fn count_passage_tokens(text: &str) -> Result<usize, AppError> {
25 Ok(approx_tokens(&format!(
26 "{}{}",
27 crate::constants::PASSAGE_PREFIX,
28 text
29 )))
30}
31
32/// Returns the byte-offset pairs `(start, end)` for each whitespace-delimited
33/// word in `text`. The tokenizers crate used to return true sub-word offsets;
34/// the LLM headless path doesn't need that granularity, so we return word
35/// boundaries.
36pub fn passage_token_offsets(text: &str) -> Result<Vec<(usize, usize)>, AppError> {
37 let mut offsets = Vec::new();
38 let mut start = None;
39 for (i, c) in text.char_indices() {
40 if c.is_whitespace() {
41 if let Some(s) = start.take() {
42 if i > s {
43 offsets.push((s, i));
44 }
45 }
46 } else if start.is_none() {
47 start = Some(i);
48 }
49 }
50 if let Some(s) = start {
51 if text.len() > s {
52 offsets.push((s, text.len()));
53 }
54 }
55 Ok(offsets)
56}
57
58/// Returns the model's max input length. Since we no longer have a
59/// tokenizer config, this returns the constant from `constants.rs`.
60/// Operators that need a different ceiling should set
61/// `SQLITE_GRAPHRAG_EMBEDDING_MAX_TOKENS` in the environment.
62pub fn get_model_max_length() -> usize {
63 crate::constants::EMBEDDING_MAX_TOKENS
64}
65
66/// Returns the exact cl100k_base (OpenAI tiktoken) token count of `text`.
67///
68/// This is a deliberately conservative proxy for the
69/// `qwen/qwen3-embedding-8b` tokenizer used by the OpenRouter embedding
70/// backend: cl100k_base generally emits at least as many tokens as Qwen's
71/// BPE for the same input, so a count comfortably under the model's
72/// ~32K-token effective ceiling guarantees the input fits Qwen's window.
73///
74/// Unlike `approx_tokens`, this is exact for arbitrary input. It uses the
75/// process-wide cached BPE singleton, so repeated calls do not re-initialise
76/// the tokenizer.
77pub fn count_tokens(text: &str) -> usize {
78 tiktoken_rs::cl100k_base_singleton()
79 .encode_ordinary(text)
80 .len()
81}
82
83fn approx_tokens(text: &str) -> usize {
84 let words = text.split_whitespace().count();
85 // Round up to avoid under-chunking.
86 let num = words.saturating_mul(WORDS_TO_TOKENS_NUMERATOR);
87 let (tokens, rem) = (
88 num / WORDS_TO_TOKENS_DENOMINATOR,
89 num % WORDS_TO_TOKENS_DENOMINATOR,
90 );
91 if rem == 0 {
92 tokens
93 } else {
94 tokens + 1
95 }
96}
97
#[cfg(test)]
mod tests {
    use super::*;

    // Inputs with no words (empty or whitespace-only) estimate to zero tokens.
    #[test]
    fn empty_string_has_zero_tokens() {
        assert_eq!(approx_tokens(""), 0);
        assert_eq!(approx_tokens(" \n\t "), 0);
    }

    #[test]
    fn single_word_rounds_up() {
        // 1 word * 3 / 2 = 1.5 → 2 tokens
        assert_eq!(approx_tokens("hello"), 2);
    }

    #[test]
    fn four_words_rounds_to_six() {
        // 4 * 3 / 2 = 6 exactly
        assert_eq!(approx_tokens("the quick brown fox"), 6);
    }

    // Each range covers one word; the separating spaces belong to no range.
    #[test]
    fn passage_offsets_skip_whitespace() {
        let offsets = passage_token_offsets("hello world foo").unwrap();
        assert_eq!(offsets, vec![(0, 5), (6, 11), (12, 15)]);
    }

    #[test]
    fn passage_offsets_handle_leading_and_trailing_whitespace() {
        // Two spaces on each side of the word: it must start at byte 2 and
        // end (exclusively) at byte 7. With a single space on each side the
        // range would be (1, 6) and this assertion would fail.
        let offsets = passage_token_offsets("  hello  ").unwrap();
        assert_eq!(offsets, vec![(2, 7)]);
    }

    // The expected values in the three `count_passage_tokens` tests below
    // assume `PASSAGE_PREFIX` contributes exactly one whitespace-delimited
    // word (TODO confirm against `constants.rs`).
    #[test]
    fn count_passage_tokens_matches_approx_tokens() {
        // prefix word + 3 words = 4 words → 6 tokens
        assert_eq!(count_passage_tokens("rust sqlite graphrag").unwrap(), 6);
    }

    #[test]
    fn count_passage_tokens_includes_prefix_for_short_inputs() {
        // prefix word + 4 words = 5 words → 7.5 → 8 tokens
        assert_eq!(count_passage_tokens("teste fix real 4").unwrap(), 8);
    }

    #[test]
    fn count_passage_tokens_matches_embedding_when_text_already_has_prefix() {
        // The prefix is prepended even though the text already carries it:
        // prefix word + 5 words = 6 words → 9 tokens
        assert_eq!(
            count_passage_tokens("passage: teste fix real 5").unwrap(),
            9
        );
    }

    #[test]
    fn count_tokens_matches_known_cl100k_counts() {
        // "hello world" is exactly 2 cl100k_base tokens; empty string is 0.
        assert_eq!(count_tokens("hello world"), 2);
        assert_eq!(count_tokens(""), 0);
    }
}