/// Reports whether `c` lies in one of the code-point ranges that the token
/// estimate treats as CJK text.
///
/// The accepted ranges (inclusive) are:
/// - U+2E80..=U+9FFF: CJK Radicals Supplement through CJK Unified Ideographs,
///   which also spans CJK punctuation and the kana blocks.
/// - U+F900..=U+FAFF: CJK Compatibility Ideographs.
/// - U+FF00..=U+FFEF: Halfwidth and Fullwidth Forms.
/// - U+20000..=U+2FA1F: the supplementary ideograph range.
///
/// Code points outside these ranges, including Hangul syllables
/// (U+AC00..=U+D7AF), return `false`.
fn is_cjk(c: char) -> bool {
    // Inclusive (first, last) code-point bounds.
    const RANGES: [(u32, u32); 4] = [
        (0x2E80, 0x9FFF),
        (0xF900, 0xFAFF),
        (0xFF00, 0xFFEF),
        (0x20000, 0x2FA1F),
    ];
    let code_point = u32::from(c);
    RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&code_point))
}
/// Returns a rough token-count estimate for `text`.
///
/// Every character accepted by `is_cjk` counts as one token. All remaining
/// characters are charged at four characters per token, rounded up. An empty
/// string yields `0`.
///
/// All arithmetic saturates at `u32::MAX`, so arbitrarily long inputs never
/// panic or wrap around.
pub fn approx_token_count(text: &str) -> u32 {
    let (cjk, other) = text.chars().fold((0u32, 0u32), |(cjk, other), c| {
        if is_cjk(c) {
            (cjk.saturating_add(1), other)
        } else {
            (cjk, other.saturating_add(1))
        }
    });
    // Ceiling division written without `other + 3`, so it stays exact even
    // when `other` is at or near `u32::MAX`.
    let other_tokens = other / 4 + u32::from(other % 4 != 0);
    cjk.saturating_add(other_tokens)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Non-CJK text is charged one token per four characters, rounded up.
    #[test]
    fn linear() {
        assert_eq!(approx_token_count(""), 0);
        assert_eq!(approx_token_count("abcd"), 1);
        assert_eq!(approx_token_count("abcde"), 2);
        assert_eq!(approx_token_count(&"x".repeat(400)), 100);
    }

    /// Characters in the CJK ranges are charged one token each.
    #[test]
    fn cjk_counts_one_token_per_char() {
        assert_eq!(approx_token_count("中文"), 2);
        assert_eq!(approx_token_count("abcd中文"), 3);
        // Fullwidth comma (U+FF0C) and ideographic full stop (U+3002), written
        // as escapes so they cannot be confused with ASCII punctuation.
        assert_eq!(approx_token_count("\u{FF0C}\u{3002}"), 2);
        // Five fullwidth commas give 5 only when each one is counted as its
        // own token; charging them at four per token would give 2.
        assert_eq!(approx_token_count(&"\u{FF0C}".repeat(5)), 5);
        // Mixed punctuation: the ASCII comma rounds up to one token and the
        // ideographic full stop adds one more.
        assert_eq!(approx_token_count(",\u{3002}"), 2);
    }
}