probe_code/search/search_tokens.rs
1use std::sync::OnceLock;
2use tiktoken_rs::{p50k_base, CoreBPE};
3
4/// Returns a reference to the tiktoken tokenizer
5pub fn get_tokenizer() -> &'static CoreBPE {
6 static TOKENIZER: OnceLock<CoreBPE> = OnceLock::new();
7 TOKENIZER.get_or_init(|| p50k_base().expect("Failed to initialize tiktoken tokenizer"))
8}
9
10/// Helper function to count tokens in a string using tiktoken (same tokenizer as GPT models)
11pub fn count_tokens(text: &str) -> usize {
12 let tokenizer = get_tokenizer();
13 tokenizer.encode_with_special_tokens(text).len()
14}