probe_code/search/
search_tokens.rs

1use std::sync::OnceLock;
2use tiktoken_rs::{p50k_base, CoreBPE};
3
4/// Returns a reference to the tiktoken tokenizer
5pub fn get_tokenizer() -> &'static CoreBPE {
6    static TOKENIZER: OnceLock<CoreBPE> = OnceLock::new();
7    TOKENIZER.get_or_init(|| p50k_base().expect("Failed to initialize tiktoken tokenizer"))
8}
9
10/// Helper function to count tokens in a string using tiktoken (same tokenizer as GPT models)
11pub fn count_tokens(text: &str) -> usize {
12    let tokenizer = get_tokenizer();
13    tokenizer.encode_with_special_tokens(text).len()
14}