Skip to main content

llm_tokenizer/
lib.rs

1use std::sync::Arc;
2
3use anyhow::Result;
4
5pub mod cache;
6pub mod encoders;
7pub mod eos;
8pub mod factory;
9pub mod hub;
10pub mod mock;
11pub mod registry;
12pub mod sequence;
13pub mod stop;
14pub mod stream;
15pub mod traits;
16
17pub mod chat_template;
18pub mod huggingface;
19mod kimi_k2_tokenizer;
20pub mod tiktoken;
21
22#[cfg(test)]
23mod tests;
24
25// Re-export types used outside this module
26pub use cache::{CacheConfig, CacheStats, CachedTokenizer, L0Cache, L1Cache, TokenizerFingerprint};
27pub use chat_template::ChatTemplateState;
28pub use factory::{
29    create_tokenizer, create_tokenizer_from_file, create_tokenizer_with_chat_template,
30    TokenizerType,
31};
32pub use huggingface::HuggingFaceTokenizer;
33pub use mock::MockTokenizer;
34pub use registry::{LoadError, LoadOutcome, TokenizerRegistry};
35pub use sequence::Sequence;
36pub use stop::{SequenceDecoderOutput, StopSequenceConfig, StopSequenceDecoder};
37pub use stream::DecodeStream;
38pub use tiktoken::{TiktokenModel, TiktokenTokenizer};
39pub use traits::{
40    Decoder, Encoder, Encoding, SpecialTokens, TokenIdType, Tokenizer as TokenizerTrait,
41};
42
43/// Main tokenizer wrapper that provides a unified interface for different tokenizer implementations
44#[derive(Clone)]
45pub struct Tokenizer(Arc<dyn traits::Tokenizer>);
46
47impl Tokenizer {
48    /// Create a tokenizer from a file path
49    pub fn from_file(file_path: &str) -> Result<Tokenizer> {
50        Ok(Tokenizer(create_tokenizer_from_file(file_path)?))
51    }
52
53    /// Create a tokenizer from a file path with an optional chat template
54    pub fn from_file_with_chat_template(
55        file_path: &str,
56        chat_template_path: Option<&str>,
57    ) -> Result<Tokenizer> {
58        Ok(Tokenizer(create_tokenizer_with_chat_template(
59            file_path,
60            chat_template_path,
61        )?))
62    }
63
64    /// Create a tokenizer from an Arc<dyn Tokenizer>
65    pub fn from_arc(tokenizer: Arc<dyn traits::Tokenizer>) -> Self {
66        Tokenizer(tokenizer)
67    }
68
69    /// Create a stateful sequence object for decoding token_ids into text
70    pub fn decode_stream(
71        &self,
72        prompt_token_ids: &[u32],
73        skip_special_tokens: bool,
74    ) -> DecodeStream {
75        DecodeStream::new(self.0.clone(), prompt_token_ids, skip_special_tokens)
76    }
77
78    /// Direct encode method
79    ///
80    /// Set `add_special_tokens` to `true` for embeddings (to add BOS/EOS tokens configured in tokenizer_config.json),
81    /// or `false` for chat completion (where the chat template handles special tokens).
82    pub fn encode(&self, input: &str, add_special_tokens: bool) -> Result<Encoding> {
83        self.0.encode(input, add_special_tokens)
84    }
85
86    /// Direct batch encode method
87    ///
88    /// Set `add_special_tokens` to `true` for embeddings (to add BOS/EOS tokens configured in tokenizer_config.json),
89    /// or `false` for chat completion (where the chat template handles special tokens).
90    pub fn encode_batch(&self, inputs: &[&str], add_special_tokens: bool) -> Result<Vec<Encoding>> {
91        self.0.encode_batch(inputs, add_special_tokens)
92    }
93
94    /// Direct decode method
95    pub fn decode(&self, token_ids: &[u32], skip_special_tokens: bool) -> Result<String> {
96        self.0.decode(token_ids, skip_special_tokens)
97    }
98
99    /// Get vocabulary size
100    pub fn vocab_size(&self) -> usize {
101        self.0.vocab_size()
102    }
103
104    /// Get special tokens
105    pub fn get_special_tokens(&self) -> &SpecialTokens {
106        self.0.get_special_tokens()
107    }
108
109    /// Convert token string to ID
110    pub fn token_to_id(&self, token: &str) -> Option<u32> {
111        self.0.token_to_id(token)
112    }
113
114    /// Convert ID to token string
115    pub fn id_to_token(&self, id: u32) -> Option<String> {
116        self.0.id_to_token(id)
117    }
118}
119
120impl From<Arc<dyn traits::Tokenizer>> for Tokenizer {
121    fn from(tokenizer: Arc<dyn traits::Tokenizer>) -> Self {
122        Tokenizer(tokenizer)
123    }
124}