rusty_commit/utils/token.rs


use anyhow::{Context, Result};
use once_cell::sync::Lazy;
use tiktoken_rs::cl100k_base;

// Build the cl100k_base tokenizer once and reuse it; loading the BPE ranks is
// comparatively expensive, so the Result is cached in a lazy static.
static TOKENIZER: Lazy<Result<tiktoken_rs::CoreBPE>> =
    Lazy::new(|| cl100k_base().context("Failed to load tokenizer"));

/// Estimates the number of tokens in the given text using the OpenAI tokenizer.
/// This uses the cl100k_base encoding, which is used by GPT-3.5 and GPT-4 models.
///
/// # Examples
///
/// ```ignore
/// use rusty_commit::utils::token::estimate_tokens;
///
/// let text = "Hello, world!";
/// let tokens = estimate_tokens(text).unwrap();
/// assert!(tokens > 0);
/// ```
pub fn estimate_tokens(text: &str) -> Result<usize> {
    // The cached Result can only be borrowed, so a load failure is re-wrapped
    // into a fresh error before propagating it.
    let bpe = TOKENIZER.as_ref().map_err(|e| anyhow::anyhow!("{}", e))?;
    let tokens = bpe.encode_with_special_tokens(text);
    Ok(tokens.len())
}
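
// A minimal test sketch (not part of the original file): it exercises
// `estimate_tokens` under the assumption that the cl100k_base tokenizer loads
// successfully in the test environment. The expected counts reflect general
// properties of the cl100k_base encoding, not exact contract values.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn short_text_yields_a_small_nonzero_count() {
        // "Hello, world!" tokenizes to a handful of cl100k_base tokens.
        let n = estimate_tokens("Hello, world!").expect("tokenizer should load");
        assert!(n > 0 && n < 10);
    }

    #[test]
    fn empty_input_yields_zero_tokens() {
        // An empty string produces no tokens.
        assert_eq!(estimate_tokens("").unwrap(), 0);
    }
}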