1use std::sync::Arc;
2
3use anyhow::Result;
4
5pub mod cache;
6pub mod encoders;
7pub mod eos;
8pub mod factory;
9pub mod hub;
10pub mod mock;
11pub mod registry;
12pub mod sequence;
13pub mod stop;
14pub mod stream;
15pub mod traits;
16
17pub mod chat_template;
18pub mod huggingface;
19mod kimi_k2_tokenizer;
20pub mod tiktoken;
21
22#[cfg(test)]
23mod tests;
24
25pub use cache::{CacheConfig, CacheStats, CachedTokenizer, L0Cache, L1Cache, TokenizerFingerprint};
27pub use chat_template::ChatTemplateState;
28pub use factory::{
29 create_tokenizer, create_tokenizer_from_file, create_tokenizer_with_chat_template,
30 TokenizerType,
31};
32pub use huggingface::HuggingFaceTokenizer;
33pub use mock::MockTokenizer;
34pub use registry::{LoadError, LoadOutcome, TokenizerRegistry};
35pub use sequence::Sequence;
36pub use stop::{SequenceDecoderOutput, StopSequenceConfig, StopSequenceDecoder};
37pub use stream::DecodeStream;
38pub use tiktoken::{TiktokenModel, TiktokenTokenizer};
39pub use traits::{
40 Decoder, Encoder, Encoding, SpecialTokens, TokenIdType, Tokenizer as TokenizerTrait,
41};
42
/// Shared handle to a tokenizer implementation.
///
/// This is a newtype around `Arc<dyn traits::Tokenizer>`, so the concrete
/// tokenizer is chosen at construction time and accessed through dynamic
/// dispatch. Because the only field is an `Arc`, the derived `Clone` copies
/// the pointer rather than the underlying tokenizer.
#[derive(Clone)]
pub struct Tokenizer(Arc<dyn traits::Tokenizer>);
46
47impl Tokenizer {
48 pub fn from_file(file_path: &str) -> Result<Tokenizer> {
50 Ok(Tokenizer(create_tokenizer_from_file(file_path)?))
51 }
52
53 pub fn from_file_with_chat_template(
55 file_path: &str,
56 chat_template_path: Option<&str>,
57 ) -> Result<Tokenizer> {
58 Ok(Tokenizer(create_tokenizer_with_chat_template(
59 file_path,
60 chat_template_path,
61 )?))
62 }
63
64 pub fn from_arc(tokenizer: Arc<dyn traits::Tokenizer>) -> Self {
66 Tokenizer(tokenizer)
67 }
68
69 pub fn decode_stream(
71 &self,
72 prompt_token_ids: &[u32],
73 skip_special_tokens: bool,
74 ) -> DecodeStream {
75 DecodeStream::new(self.0.clone(), prompt_token_ids, skip_special_tokens)
76 }
77
78 pub fn encode(&self, input: &str, add_special_tokens: bool) -> Result<Encoding> {
83 self.0.encode(input, add_special_tokens)
84 }
85
86 pub fn encode_batch(&self, inputs: &[&str], add_special_tokens: bool) -> Result<Vec<Encoding>> {
91 self.0.encode_batch(inputs, add_special_tokens)
92 }
93
94 pub fn decode(&self, token_ids: &[u32], skip_special_tokens: bool) -> Result<String> {
96 self.0.decode(token_ids, skip_special_tokens)
97 }
98
99 pub fn vocab_size(&self) -> usize {
101 self.0.vocab_size()
102 }
103
104 pub fn get_special_tokens(&self) -> &SpecialTokens {
106 self.0.get_special_tokens()
107 }
108
109 pub fn token_to_id(&self, token: &str) -> Option<u32> {
111 self.0.token_to_id(token)
112 }
113
114 pub fn id_to_token(&self, id: u32) -> Option<String> {
116 self.0.id_to_token(id)
117 }
118}
119
120impl From<Arc<dyn traits::Tokenizer>> for Tokenizer {
121 fn from(tokenizer: Arc<dyn traits::Tokenizer>) -> Self {
122 Tokenizer(tokenizer)
123 }
124}