/// Returns the shared, lazily initialized cl100k_base tokenizer.
fn get_tokenizer() -> &'static bpe_openai::Tokenizer {
    bpe_openai::cl100k_base()
}
/// Counts the number of cl100k_base tokens in `content`.
#[must_use]
pub fn count_tokens(content: &str) -> usize {
    get_tokenizer().count(content)
}
/// Formats a token count for display, e.g. `999`, `1.5k`, `2.0M`.
///
/// Counts below 1,000 are printed verbatim; larger counts are scaled to
/// thousands (`k`) or millions (`M`) with one decimal place, so a value
/// just under a boundary can round up (999,999 becomes `1000.0k`).
#[must_use]
pub fn format_token_count(count: usize) -> String {
    if count >= 1_000_000 {
        format!("{:.1}M", count as f64 / 1_000_000.0)
    } else if count >= 1_000 {
        format!("{:.1}k", count as f64 / 1_000.0)
    } else {
        count.to_string()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_count_tokens_empty() {
        assert_eq!(count_tokens(""), 0);
    }

    #[test]
    fn test_count_tokens_simple() {
        let count = count_tokens("Hello, world!");
        assert!(count > 0);
        assert!(count < 10);
    }

    #[test]
    fn test_count_tokens_longer() {
        let content = "This is a longer piece of text that should result in more tokens. \
                       The quick brown fox jumps over the lazy dog.";
        let count = count_tokens(content);
        assert!(count > 10);
    }

    #[test]
    fn test_format_token_count_small() {
        assert_eq!(format_token_count(0), "0");
        assert_eq!(format_token_count(1), "1");
        assert_eq!(format_token_count(999), "999");
    }

    #[test]
    fn test_format_token_count_thousands() {
        assert_eq!(format_token_count(1000), "1.0k");
        assert_eq!(format_token_count(1500), "1.5k");
        assert_eq!(format_token_count(12500), "12.5k");
        assert_eq!(format_token_count(999_999), "1000.0k");
    }

    #[test]
    fn test_format_token_count_millions() {
        assert_eq!(format_token_count(1_000_000), "1.0M");
        assert_eq!(format_token_count(1_500_000), "1.5M");
        assert_eq!(format_token_count(10_000_000), "10.0M");
    }
}