alith_prompt/
prompt_tokenizer.rs

1use std::sync::Arc;
2
/// A tokenizer that can be plugged into the prompt management system.
///
/// Implementors must provide both operations: turning text into token IDs
/// ([`tokenize`](PromptTokenizer::tokenize)) and reporting how many tokens a
/// piece of text occupies ([`count_tokens`](PromptTokenizer::count_tokens)).
///
/// The `Send + Sync` supertraits mean every implementor, and every
/// `dyn PromptTokenizer` trait object, can be shared across threads.
pub trait PromptTokenizer: Send + Sync {
    /// Converts a text string into a sequence of token IDs.
    ///
    /// The input is tokenized according to the implementor's own vocabulary
    /// and rules; this trait places no constraint on the ID values themselves.
    ///
    /// # Arguments
    ///
    /// * `input` - The text to tokenize.
    ///
    /// # Returns
    ///
    /// The token IDs for `input`, as a `Vec<u32>`.
    fn tokenize(&self, input: &str) -> Vec<u32>;

    /// Counts the number of tokens in a text string.
    ///
    /// Intended to report the number of tokens that tokenizing `input` would
    /// produce. Implementors may compute it more cheaply than by calling
    /// [`tokenize`](PromptTokenizer::tokenize) and measuring the result.
    ///
    /// # Arguments
    ///
    /// * `input` - The text whose tokens should be counted.
    ///
    /// # Returns
    ///
    /// The number of tokens in `input`, as a `u32`.
    fn count_tokens(&self, input: &str) -> u32;
}
39
40impl PromptTokenizer for Arc<dyn PromptTokenizer> {
41    fn tokenize(&self, input: &str) -> Vec<u32> {
42        (**self).tokenize(input)
43    }
44
45    fn count_tokens(&self, input: &str) -> u32 {
46        (**self).count_tokens(input)
47    }
48}