// ck_embed/tokenizer.rs

/// Heuristic token estimator for code and natural-language text.
///
/// This is a rough approximation used when the actual model tokenizer is
/// unavailable; counts are derived from characters-per-token ratios.
pub struct TokenEstimator;

impl TokenEstimator {
    /// Estimate the token count for `text`.
    ///
    /// Based on empirical analysis of code and text tokenization:
    /// - Code: ~4.2 characters per token
    /// - Mixed content: ~4.4 characters per token
    /// - Natural language: ~4.8 characters per token
    pub fn estimate_tokens(text: &str) -> usize {
        if text.is_empty() {
            return 0;
        }

        let char_count = text.chars().count();

        // Classify the content: a higher density of code indicators per line
        // implies symbol-heavy tokenization (fewer characters per token).
        let total_lines = text.lines().count().max(1);
        let code_density = Self::count_code_indicators(text) as f32 / total_lines as f32;

        let chars_per_token = match code_density {
            d if d > 0.3 => 4.2, // predominantly code
            d if d > 0.1 => 4.4, // mixed code and prose
            _ => 4.8,            // predominantly natural language
        };

        // Round up so we never under-report against a hard model limit.
        (char_count as f32 / chars_per_token).ceil() as usize
    }

    /// Whether the estimated token count of `text` exceeds `max_tokens`.
    pub fn exceeds_limit(text: &str, max_tokens: usize) -> bool {
        Self::estimate_tokens(text) > max_tokens
    }

    /// Maximum input tokens accepted by a given embedding model.
    ///
    /// Unknown model names fall back to the 8192-token Nomic limit.
    pub fn get_model_limit(model_name: &str) -> usize {
        match model_name {
            "BAAI/bge-small-en-v1.5"
            | "sentence-transformers/all-MiniLM-L6-v2"
            | "BAAI/bge-base-en-v1.5"
            | "BAAI/bge-large-en-v1.5" => 512,
            "nomic-embed-text-v1" | "nomic-embed-text-v1.5" | "jina-embeddings-v2-base-code" => {
                8192
            }
            _ => 8192, // default to the Nomic limit
        }
    }

    /// Tally code-specific indicators (braces, statement semicolons, function
    /// keywords, arrows/paths, visibility modifiers) across the lines of
    /// `text` to help classify it as code vs. natural language.
    ///
    /// A single line may contribute several indicators, so the total can
    /// exceed the line count.
    fn count_code_indicators(text: &str) -> usize {
        text.lines()
            .map(str::trim)
            // Blank lines and line comments carry no classification signal.
            .filter(|line| !line.is_empty() && !line.starts_with("//") && !line.starts_with('#'))
            .map(|line| {
                let mut hits = 0;
                // Block delimiters.
                if line.contains('{') || line.contains('}') {
                    hits += 1;
                }
                // Statement terminator (a trailing '.' suggests prose instead).
                if line.contains(';') && !line.ends_with('.') {
                    hits += 1;
                }
                // Function-definition keywords across common languages.
                if ["fn ", "def ", "function ", "func "]
                    .iter()
                    .any(|kw| line.contains(kw))
                {
                    hits += 1;
                }
                // Return-type arrows, closures/lambdas, and path separators.
                if line.contains("->") || line.contains("=>") || line.contains("::") {
                    hits += 1;
                }
                // Visibility modifiers at the start of a line.
                if ["pub ", "private ", "public "]
                    .iter()
                    .any(|kw| line.starts_with(kw))
                {
                    hits += 1;
                }
                hits
            })
            .sum()
    }
}
100
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_estimate_tokens_empty() {
        // The empty string must map to zero tokens.
        assert_eq!(0, TokenEstimator::estimate_tokens(""));
    }

    #[test]
    fn test_estimate_tokens_simple() {
        let tokens = TokenEstimator::estimate_tokens("Hello, world!");
        // Roughly 3 tokens expected; allow slack for the heuristic.
        assert!(tokens >= 2 && tokens <= 4, "Got {} tokens", tokens);
    }

    #[test]
    fn test_estimate_tokens_code() {
        let code = r#"
fn main() {
    println!("Hello, world!");
    let x = 42;
    return x;
}
"#;
        // Symbol-heavy code should land in this band.
        let tokens = TokenEstimator::estimate_tokens(code);
        assert!(tokens >= 15 && tokens <= 25, "Got {} tokens", tokens);
    }

    #[test]
    fn test_exceeds_limit() {
        assert!(!TokenEstimator::exceeds_limit("short text", 100));

        // ~1000 characters comfortably exceeds a 100-token budget.
        let long_text = "word ".repeat(200);
        assert!(TokenEstimator::exceeds_limit(&long_text, 100));
    }

    #[test]
    fn test_model_limits() {
        // (model name, expected limit); unknown models default to 8192.
        let cases = [
            ("BAAI/bge-small-en-v1.5", 512),
            ("nomic-embed-text-v1.5", 8192),
            ("unknown-model", 8192),
        ];
        for &(model, expected) in &cases {
            assert_eq!(TokenEstimator::get_model_limit(model), expected);
        }
    }

    #[test]
    fn test_code_detection() {
        let code = r#"
pub fn calculate(x: i32) -> i32 {
    let result = x * 2;
    return result;
}
"#;
        let text = r#"
This is a paragraph about programming.
It contains some discussion of functions and variables.
But it's written in natural language.
"#;
        let code_tokens = TokenEstimator::estimate_tokens(code);
        let text_tokens = TokenEstimator::estimate_tokens(text);

        // Code should show an equal-or-denser tokens-per-character ratio
        // thanks to symbols and short identifiers.
        let code_ratio = code_tokens as f32 / code.chars().count() as f32;
        let text_ratio = text_tokens as f32 / text.chars().count() as f32;

        assert!(
            code_ratio >= text_ratio * 0.8,
            "Code ratio {} should be similar to or higher than text ratio {}",
            code_ratio,
            text_ratio
        );
    }
}