use std::sync::Arc;
/// Reference-counted callback that takes a `&str` and returns a `usize`
/// (by its name, an estimated token count for that text).
///
/// The `Send + Sync` bounds let a single estimator be cloned cheaply via
/// [`Arc`] and shared between threads.
pub type TokenEstimator = Arc<dyn Fn(&str) -> usize + Send + Sync>;
/// Newtype around a [`TokenEstimator`] that adds a [`std::fmt::Debug`]
/// implementation, so the estimator can live inside types that derive `Debug`.
///
/// Cloning the wrapper clones the inner `Arc`, not the closure itself.
#[derive(Clone)]
pub struct TokenEstimatorWrapper(pub TokenEstimator);

// Implemented by hand: the wrapped `dyn Fn` trait object has no `Debug`
// impl to derive from, so only the type name is printed.
impl std::fmt::Debug for TokenEstimatorWrapper {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        formatter.write_str("TokenEstimatorWrapper")
    }
}
/// Builds the default heuristic [`TokenEstimator`].
///
/// Every character of the input is assigned to exactly one class, and each
/// class is converted to tokens with its own ratio:
///
/// * ASCII letters: one token per 4 characters;
/// * CJK ideographs in `U+4E00..=U+9FFF` or `U+3400..=U+4DBF`: two tokens
///   per 3 characters;
/// * everything else (digits, whitespace, punctuation, other scripts): one
///   token per 4 characters.
///
/// A class that occurs in the text contributes at least one token, while a
/// class that does not occur contributes nothing. The empty string therefore
/// estimates to `0`, and a non-empty string always estimates to at least `1`.
pub fn default_token_estimator() -> TokenEstimator {
    /// Applies the one-token floor only when the class is actually present,
    /// so absent classes do not inflate the estimate.
    fn class_tokens(char_count: usize, raw_estimate: usize) -> usize {
        if char_count == 0 {
            0
        } else {
            raw_estimate.max(1)
        }
    }

    Arc::new(|text: &str| {
        let mut english = 0usize;
        let mut cjk = 0usize;
        let mut other = 0usize;

        // Single pass: classify each character once instead of re-scanning
        // the string for every class.
        for c in text.chars() {
            if c.is_ascii_alphabetic() {
                english += 1;
            } else if matches!(c, '\u{4e00}'..='\u{9fff}' | '\u{3400}'..='\u{4dbf}') {
                cjk += 1;
            } else {
                other += 1;
            }
        }

        class_tokens(english, english / 4)
            + class_tokens(cjk, cjk * 2 / 3)
            + class_tokens(other, other / 4)
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The empty string must estimate to exactly zero tokens.
    #[test]
    fn default_estimator_handles_empty_string() {
        let estimate = default_token_estimator();
        assert_eq!(estimate(""), 0);
    }

    /// A short English sentence yields a positive, small estimate.
    #[test]
    fn default_estimator_english_text() {
        let estimate = default_token_estimator();
        let tokens = estimate("This is a test of English text");
        assert!(tokens > 0);
        assert!(tokens < 15);
    }

    /// Text made only of CJK ideographs yields a positive estimate.
    #[test]
    fn default_estimator_cjk_text() {
        let estimate = default_token_estimator();
        let tokens = estimate("这是一个测试");
        assert!(tokens > 0);
    }
}