/// Estimated characters per token for any character not matched by a
/// script-specific range in [`char_cpt`].
const DEFAULT_CPT: f32 = 4.0;

/// Token weight contributed by each (non-control) whitespace character.
const WS_WEIGHT: f32 = 0.25;

/// Returns the estimated number of characters that make up one token for the
/// script `ch` belongs to.
///
/// The estimator adds `1 / char_cpt(ch)` to its running total for every
/// non-whitespace character, so a smaller value means the character is
/// "more expensive" in tokens.
fn char_cpt(ch: char) -> f32 {
    match u32::from(ch) {
        // Emoticons, Misc Symbols & Pictographs, Transport & Map, Misc Symbols.
        0x1F600..=0x1F64F | 0x1F300..=0x1F5FF | 0x1F680..=0x1F6FF | 0x2600..=0x26FF => 1.0,
        // Hiragana + Katakana, CJK Unified Ideographs, Hangul Syllables.
        0x3040..=0x30FF | 0x4E00..=0x9FFF | 0xAC00..=0xD7AF => 1.5,
        // Greek, Cyrillic, Hebrew, Arabic, Devanagari, Thai.
        0x0370..=0x03FF
        | 0x0400..=0x04FF
        | 0x0590..=0x05FF
        | 0x0600..=0x06FF
        | 0x0900..=0x097F
        | 0x0E00..=0x0E7F => 2.0,
        _ => DEFAULT_CPT,
    }
}

/// Estimates how many tokens `text` occupies, using per-script heuristics.
///
/// Each character contributes a fractional weight:
/// * ASCII control characters (including `\n`, `\r` and `\t`) contribute
///   nothing, because they are filtered out before the whitespace check;
/// * any other whitespace character contributes [`WS_WEIGHT`];
/// * every remaining character contributes `1 / char_cpt(ch)`.
///
/// The summed weight is rounded to the nearest integer (halves round away
/// from zero). Empty input yields `0`.
pub fn estimate_tokens(text: &str) -> usize {
    // Accumulate in f64: an f32 running total loses the fractional weights
    // once it grows large (1/1.5 is not exactly representable, and increments
    // of 0.25 are rounded away entirely above 2^22), which made estimates for
    // long inputs drift or saturate.
    let total_weight: f64 = text
        .chars()
        .filter(|ch| !ch.is_ascii_control())
        .map(|ch| {
            if ch.is_whitespace() {
                f64::from(WS_WEIGHT)
            } else {
                1.0 / f64::from(char_cpt(ch))
            }
        })
        .sum();

    // The weight is never negative or NaN, and a float-to-int `as` cast
    // saturates, so this conversion cannot wrap.
    total_weight.round() as usize
}
/// Estimates the token count of `text` with a lower bound.
///
/// Returns whichever is larger: the value produced by `estimate_tokens(text)`
/// or `min`.
pub fn estimate_tokens_min(text: &str, min: usize) -> usize {
    let estimate = estimate_tokens(text);
    if estimate < min {
        min
    } else {
        estimate
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input must estimate to zero tokens.
    #[test]
    fn empty_string() {
        assert_eq!(estimate_tokens(""), 0);
    }

    /// A short English sentence lands in a plausible band.
    #[test]
    fn ascii_text() {
        let tokens = estimate_tokens("Hello, world! This is a test.");
        assert!((4..15).contains(&tokens), "got {tokens}");
    }

    /// Japanese text (kana + ideographs) lands in a plausible band.
    #[test]
    fn cjk_text() {
        let tokens = estimate_tokens("こんにちは世界");
        assert!((3..10).contains(&tokens), "got {tokens}");
    }

    /// Text mixing Latin, Japanese and Arabic yields a non-zero estimate.
    #[test]
    fn mixed_scripts() {
        let tokens = estimate_tokens("Hello こんにちは مرحبا");
        assert!(tokens >= 1);
    }

    /// Three emoji estimate to at least two tokens.
    #[test]
    fn emoji() {
        let tokens = estimate_tokens("🎉🚀👍");
        assert!(tokens > 1, "got {tokens}");
    }

    /// The `min` argument wins when the estimate is smaller.
    #[test]
    fn min_clamp() {
        assert_eq!(estimate_tokens_min("", 5), 5);
    }

    /// Repeating the same phrase must increase the estimate.
    #[test]
    fn long_text_proportional() {
        let short = estimate_tokens("Hello world");
        let long = estimate_tokens("Hello world Hello world Hello world");
        assert!(short < long, "long={long} should be > short={short}");
    }

    /// Cyrillic text lands in a plausible band.
    #[test]
    fn cyrillic_text() {
        let tokens = estimate_tokens("Привет мир");
        assert!((3..10).contains(&tokens), "got {tokens}");
    }

    /// Greek text lands in a plausible band.
    #[test]
    fn greek_text() {
        let tokens = estimate_tokens("Γεια σου κόσμε");
        assert!((1..10).contains(&tokens), "got {tokens}");
    }

    /// Hebrew text lands in a plausible band.
    #[test]
    fn hebrew_text() {
        let tokens = estimate_tokens("שלום עולם");
        assert!((1..10).contains(&tokens), "got {tokens}");
    }

    /// Interleaving spaces between letters must not collapse the estimate.
    #[test]
    fn whitespace_contributes_tokens() {
        let no_space = estimate_tokens("abcdef");
        let with_space = estimate_tokens("a b c d e f");
        assert!(
            no_space / 2 < with_space,
            "with_space={with_space} should not be negligible vs no_space={no_space}"
        );
    }
}