1use std::sync::Arc;
2
/// Shared callback that maps a text slice to an estimated token count.
///
/// The closure is stored behind an [`Arc`] and is bounded by `Send + Sync`, so a single
/// estimator can be cloned cheaply and handed to multiple threads.
pub type TokenEstimator = Arc<dyn Fn(&str) -> usize + Send + Sync>;
6
/// Newtype around a [`TokenEstimator`].
///
/// The bare trait object carries no `Debug` bound; this wrapper has a hand-written `Debug`
/// implementation instead. Cloning the wrapper clones the inner `Arc` handle, so both copies
/// refer to the same underlying closure.
#[derive(Clone)]
pub struct TokenEstimatorWrapper(pub TokenEstimator);
10
11impl std::fmt::Debug for TokenEstimatorWrapper {
12 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
13 write!(f, "TokenEstimatorWrapper")
14 }
15}
16
17pub fn default_token_estimator() -> TokenEstimator {
24 Arc::new(|text: &str| {
25 let total: usize = text.chars().count();
26 if total == 0 {
27 return 0;
28 }
29 let english = text.chars().filter(|c| c.is_ascii_alphabetic()).count();
30 let cjk = text
31 .chars()
32 .filter(|c| matches!(c, '\u{4e00}'..='\u{9fff}' | '\u{3400}'..='\u{4dbf}'))
33 .count();
34 let other = total - english - cjk;
35(english / 4).max(1) + (cjk * 2 / 3).max(1) + (other / 4).max(1)
36 })
37}
38
#[cfg(test)]
mod tests {
    use super::*;

    /// The empty string must estimate to exactly zero tokens.
    #[test]
    fn default_estimator_handles_empty_string() {
        let estimate = default_token_estimator();
        assert_eq!(estimate(""), 0);
    }

    /// A short English sentence yields a positive estimate below 15.
    #[test]
    fn default_estimator_english_text() {
        let estimate = default_token_estimator();
        let sentence = "This is a test of English text";
        let count = estimate(sentence);
        assert!(count > 0);
        assert!(count < 15);
    }

    /// Pure CJK input yields a positive estimate.
    #[test]
    fn default_estimator_cjk_text() {
        let estimate = default_token_estimator();
        let sentence = "这是一个测试";
        let count = estimate(sentence);
        assert!(count > 0);
    }
}