/// Estimates token counts for text using a character-based heuristic.
pub struct TokenEstimator;

impl TokenEstimator {
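    /// Estimate the number of tokens in `text`.
    ///
    /// Divides the character count by a chars-per-token ratio chosen from the
    /// text's code density, so code-heavy input yields a higher estimate than
    /// plain prose of the same length.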
    pub fn estimate_tokens(text: &str) -> usize {
        if text.is_empty() {
            return 0;
        }

        let char_count = text.chars().count();

        // Weight the chars-per-token ratio by how code-like the text looks:
        // code tends to produce more tokens per character than prose.
        let code_indicators = Self::count_code_indicators(text);
        let total_lines = text.lines().count().max(1);
        let code_density = code_indicators as f32 / total_lines as f32;

        let chars_per_token = if code_density > 0.3 {
            4.2
        } else if code_density > 0.1 {
            4.4
        } else {
            4.8
        };

        (char_count as f32 / chars_per_token).ceil() as usize
    }

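    /// Return `true` if the estimated token count of `text` exceeds `max_tokens`.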
    pub fn exceeds_limit(text: &str, max_tokens: usize) -> bool {
        Self::estimate_tokens(text) > max_tokens
    }

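    /// Return the maximum input token length for a known embedding model,
    /// falling back to 8192 for unrecognized model names.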
    pub fn get_model_limit(model_name: &str) -> usize {
        match model_name {
            "BAAI/bge-small-en-v1.5" => 512,
            "sentence-transformers/all-MiniLM-L6-v2" => 512,
            "nomic-embed-text-v1" => 8192,
            "nomic-embed-text-v1.5" => 8192,
            "jina-embeddings-v2-base-code" => 8192,
            "BAAI/bge-base-en-v1.5" => 512,
            "BAAI/bge-large-en-v1.5" => 512,
            _ => 8192,
        }
    }

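    /// Count code-like patterns (braces, statement terminators, function
    /// keywords, arrows and path separators, visibility modifiers) across
    /// the lines of `text`.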
    fn count_code_indicators(text: &str) -> usize {
        let mut count = 0;

        for line in text.lines() {
            let trimmed = line.trim();

            // Skip blank lines and comment lines.
            if trimmed.is_empty() || trimmed.starts_with("//") || trimmed.starts_with('#') {
                continue;
            }

            // Braces delimiting blocks.
            if trimmed.contains('{') || trimmed.contains('}') {
                count += 1;
            }
            // Statement terminators (lines ending with a period look like prose).
            if trimmed.contains(';') && !trimmed.ends_with('.') {
                count += 1;
            }
            // Function definitions across common languages.
            if trimmed.contains("fn ")
                || trimmed.contains("def ")
                || trimmed.contains("function ")
                || trimmed.contains("func ")
            {
                count += 1;
            }
            // Arrows (`->`, `=>`) and path separators (`::`).
            if trimmed.contains("->") || trimmed.contains("=>") || trimmed.contains("::") {
                count += 1;
            }
            // Visibility modifiers at the start of a line.
            if trimmed.starts_with("pub ")
                || trimmed.starts_with("private ")
                || trimmed.starts_with("public ")
            {
                count += 1;
            }
        }

        count
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_estimate_tokens_empty() {
        assert_eq!(TokenEstimator::estimate_tokens(""), 0);
    }

    #[test]
    fn test_estimate_tokens_simple() {
        let text = "Hello, world!";
        let tokens = TokenEstimator::estimate_tokens(text);
        assert!((2..=4).contains(&tokens), "Got {} tokens", tokens);
    }

    #[test]
    fn test_estimate_tokens_code() {
        let code = r#"
fn main() {
    println!("Hello, world!");
    let x = 42;
    return x;
}
"#;
        let tokens = TokenEstimator::estimate_tokens(code);
        assert!((15..=25).contains(&tokens), "Got {} tokens", tokens);
    }

    #[test]
    fn test_exceeds_limit() {
        assert!(!TokenEstimator::exceeds_limit("short text", 100));

        let long_text = "word ".repeat(200);
        assert!(TokenEstimator::exceeds_limit(&long_text, 100));
    }

    #[test]
    fn test_model_limits() {
        assert_eq!(
            TokenEstimator::get_model_limit("BAAI/bge-small-en-v1.5"),
            512
        );
        assert_eq!(
            TokenEstimator::get_model_limit("nomic-embed-text-v1.5"),
            8192
        );
        assert_eq!(TokenEstimator::get_model_limit("unknown-model"), 8192);
    }

    #[test]
    fn test_code_detection() {
        let code = r#"
pub fn calculate(x: i32) -> i32 {
    let result = x * 2;
    return result;
}
"#;
        let tokens = TokenEstimator::estimate_tokens(code);

        let text = r#"
This is a paragraph about programming.
It contains some discussion of functions and variables.
But it's written in natural language.
"#;
        let text_tokens = TokenEstimator::estimate_tokens(text);

        // Code should be estimated at a comparable or higher
        // tokens-per-character ratio than prose.
        let code_ratio = tokens as f32 / code.chars().count() as f32;
        let text_ratio = text_tokens as f32 / text.chars().count() as f32;

        assert!(
            code_ratio >= text_ratio * 0.8,
            "Code ratio {} should be similar to or higher than text ratio {}",
            code_ratio,
            text_ratio
        );
    }
}