Skip to main content

llm_tokenizer/
lib.rs

1use std::sync::Arc;
2
3use anyhow::Result;
4
5pub mod cache;
6pub mod factory;
7pub mod hub;
8pub mod mock;
9pub mod registry;
10pub mod sequence;
11pub mod stop;
12pub mod stream;
13pub mod traits;
14
15pub mod chat_template;
16pub mod huggingface;
17pub mod tiktoken;
18
19#[cfg(test)]
20mod tests;
21
22// Re-export types used outside this module
23pub use cache::{CacheConfig, CacheStats, CachedTokenizer, L0Cache, L1Cache, TokenizerFingerprint};
24pub use chat_template::ChatTemplateState;
25pub use factory::{
26    create_tokenizer, create_tokenizer_from_file, create_tokenizer_with_chat_template,
27    TokenizerType,
28};
29pub use huggingface::HuggingFaceTokenizer;
30pub use mock::MockTokenizer;
31pub use registry::{LoadError, LoadOutcome, TokenizerRegistry};
32pub use sequence::Sequence;
33pub use stop::{SequenceDecoderOutput, StopSequenceConfig, StopSequenceDecoder};
34pub use stream::DecodeStream;
35pub use tiktoken::{TiktokenModel, TiktokenTokenizer};
36pub use traits::{
37    Decoder, Encoder, Encoding, SpecialTokens, TokenIdType, Tokenizer as TokenizerTrait,
38};
39
40/// Main tokenizer wrapper that provides a unified interface for different tokenizer implementations
41#[derive(Clone)]
42pub struct Tokenizer(Arc<dyn traits::Tokenizer>);
43
44impl Tokenizer {
45    /// Create a tokenizer from a file path
46    pub fn from_file(file_path: &str) -> Result<Tokenizer> {
47        Ok(Tokenizer(create_tokenizer_from_file(file_path)?))
48    }
49
50    /// Create a tokenizer from a file path with an optional chat template
51    pub fn from_file_with_chat_template(
52        file_path: &str,
53        chat_template_path: Option<&str>,
54    ) -> Result<Tokenizer> {
55        Ok(Tokenizer(create_tokenizer_with_chat_template(
56            file_path,
57            chat_template_path,
58        )?))
59    }
60
61    /// Create a tokenizer from an Arc<dyn Tokenizer>
62    pub fn from_arc(tokenizer: Arc<dyn traits::Tokenizer>) -> Self {
63        Tokenizer(tokenizer)
64    }
65
66    /// Create a stateful sequence object for decoding token_ids into text
67    pub fn decode_stream(
68        &self,
69        prompt_token_ids: &[u32],
70        skip_special_tokens: bool,
71    ) -> DecodeStream {
72        DecodeStream::new(self.0.clone(), prompt_token_ids, skip_special_tokens)
73    }
74
75    /// Direct encode method
76    ///
77    /// Set `add_special_tokens` to `true` for embeddings (to add BOS/EOS tokens configured in tokenizer_config.json),
78    /// or `false` for chat completion (where the chat template handles special tokens).
79    pub fn encode(&self, input: &str, add_special_tokens: bool) -> Result<Encoding> {
80        self.0.encode(input, add_special_tokens)
81    }
82
83    /// Direct batch encode method
84    ///
85    /// Set `add_special_tokens` to `true` for embeddings (to add BOS/EOS tokens configured in tokenizer_config.json),
86    /// or `false` for chat completion (where the chat template handles special tokens).
87    pub fn encode_batch(&self, inputs: &[&str], add_special_tokens: bool) -> Result<Vec<Encoding>> {
88        self.0.encode_batch(inputs, add_special_tokens)
89    }
90
91    /// Direct decode method
92    pub fn decode(&self, token_ids: &[u32], skip_special_tokens: bool) -> Result<String> {
93        self.0.decode(token_ids, skip_special_tokens)
94    }
95
96    /// Get vocabulary size
97    pub fn vocab_size(&self) -> usize {
98        self.0.vocab_size()
99    }
100
101    /// Get special tokens
102    pub fn get_special_tokens(&self) -> &SpecialTokens {
103        self.0.get_special_tokens()
104    }
105
106    /// Convert token string to ID
107    pub fn token_to_id(&self, token: &str) -> Option<u32> {
108        self.0.token_to_id(token)
109    }
110
111    /// Convert ID to token string
112    pub fn id_to_token(&self, id: u32) -> Option<String> {
113        self.0.id_to_token(id)
114    }
115}
116
117impl From<Arc<dyn traits::Tokenizer>> for Tokenizer {
118    fn from(tokenizer: Arc<dyn traits::Tokenizer>) -> Self {
119        Tokenizer(tokenizer)
120    }
121}