alith_prompt/prompt_tokenizer.rs
1use std::sync::Arc;
2
/// A tokenizer that can be plugged into the prompt management system.
///
/// Implementors must supply both operations below: turning text into token
/// IDs and counting the tokens in a piece of text. The `Send + Sync`
/// supertrait bounds mean every implementor can be shared across threads.
pub trait PromptTokenizer: Send + Sync {
    /// Converts a text string into a sequence of token IDs.
    ///
    /// The input is tokenized according to the tokenizer's own vocabulary and
    /// rules, and the resulting IDs are returned in order.
    ///
    /// # Arguments
    ///
    /// * `input` - The text string to tokenize
    ///
    /// # Returns
    ///
    /// A vector of token IDs (`u32`) representing the tokenized input.
    fn tokenize(&self, input: &str) -> Vec<u32>;

    /// Counts the number of tokens in a text string.
    ///
    /// The result is the number of tokens that tokenizing `input` would
    /// produce. Implementations may compute it more cheaply than calling
    /// [`tokenize`](PromptTokenizer::tokenize) and measuring the result.
    ///
    /// # Arguments
    ///
    /// * `input` - The text string to count tokens for
    ///
    /// # Returns
    ///
    /// The number of tokens in the input text, as a `u32`.
    fn count_tokens(&self, input: &str) -> u32;
}
39
40impl PromptTokenizer for Arc<dyn PromptTokenizer> {
41 fn tokenize(&self, input: &str) -> Vec<u32> {
42 (**self).tokenize(input)
43 }
44
45 fn count_tokens(&self, input: &str) -> u32 {
46 (**self).count_tokens(input)
47 }
48}