matrixcode_core/
tokenizer.rs1use once_cell::sync::Lazy;
7use std::sync::Arc;
8use tiktoken_rs::CoreBPE;
9
10static BPE: Lazy<Arc<CoreBPE>> = Lazy::new(|| {
12 Arc::new(tiktoken_rs::cl100k_base().expect("Failed to initialize tokenizer"))
13});
14
15pub fn count_tokens(text: &str) -> u32 {
17 BPE.encode_with_special_tokens(text).len() as u32
18}
19
/// Returns the fixed per-message overhead used by this module, which is `6`.
///
/// NOTE(review): given the name and the neighbouring `count_tokens`, the unit
/// is presumably tokens added per chat message for framing — confirm against
/// the callers and the target model's message format.
pub fn message_overhead() -> u32 {
    const OVERHEAD: u32 = 6;
    OVERHEAD
}
29
#[cfg(test)]
mod tests {
    use super::*;

    /// A short English sentence produces a small, non-zero token count.
    #[test]
    fn test_count_tokens_simple() {
        let n = count_tokens("Hello, world!");
        assert!(n > 0);
        assert!((3..=5).contains(&n));
    }

    /// Chinese text with full-width punctuation needs at least five tokens.
    #[test]
    fn test_count_tokens_chinese() {
        let n = count_tokens("你好,世界!");
        assert!(n > 0);
        assert!(n >= 5);
    }

    /// A tiny multi-line Rust snippet needs at least ten tokens.
    #[test]
    fn test_count_tokens_code() {
        let snippet = r#"
fn main() {
    println!("Hello");
}
"#;
        let n = count_tokens(snippet);
        assert!(n > 0);
        assert!(n >= 10, "Code should use at least 10 tokens, got {}", n);
    }

    /// The per-message overhead is the fixed value 6.
    #[test]
    fn test_message_overhead() {
        assert_eq!(message_overhead(), 6);
    }

    /// Pins exact counts for a handful of inputs, including the empty string
    /// and a single space, plus a lower bound for Chinese text.
    #[test]
    fn test_token_counting_accuracy() {
        // Exact counts for short ASCII inputs.
        assert_eq!(count_tokens("Hello, world!"), 4);
        assert_eq!(count_tokens("Hello"), 1);
        assert_eq!(count_tokens("12345"), 2);

        // Only a lower bound is asserted for Chinese text.
        let chinese_count = count_tokens("你好世界");
        assert!(
            chinese_count >= 4,
            "Chinese text should use at least 4 tokens, got {}",
            chinese_count
        );

        // Edge cases: nothing at all, and whitespace only.
        assert_eq!(count_tokens(""), 0);
        assert_eq!(count_tokens(" "), 1);
    }
}