dynamo_llm/tokenizers/
hf.rs1use tokenizers::tokenizer::Tokenizer as HfTokenizer;
5
6use super::{
7 Encoding, Error, Result, TokenIdType,
8 traits::{Decoder, Encoder, Tokenizer},
9};
10
11pub struct HuggingFaceTokenizer {
12 tokenizer: HfTokenizer,
13}
14
15impl HuggingFaceTokenizer {
16 pub fn from_file(model_name: &str) -> Result<Self> {
17 let tokenizer = HfTokenizer::from_file(model_name)
18 .map_err(|err| Error::msg(format!("Error loading tokenizer: {}", err)))?;
19
20 Ok(HuggingFaceTokenizer { tokenizer })
21 }
22
23 pub fn from_tokenizer(tokenizer: HfTokenizer) -> Self {
24 HuggingFaceTokenizer { tokenizer }
25 }
26}
27
28impl Encoder for HuggingFaceTokenizer {
29 fn encode(&self, input: &str) -> Result<Encoding> {
30 let encoding = self
32 .tokenizer
33 .encode(input, false)
34 .map_err(|err| Error::msg(format!("Error tokenizing input: {err}")))?;
35
36 Ok(Encoding::Hf(Box::new(encoding)))
37 }
38
39 fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
40 let hf_encodings = self
41 .tokenizer
42 .encode_batch(inputs.to_vec(), false)
43 .map_err(|err| Error::msg(format!("Error batch tokenizing input: {err}")))?;
44
45 let encodings = hf_encodings
46 .into_iter()
47 .map(|enc| Encoding::Hf(Box::new(enc)))
48 .collect();
49
50 Ok(encodings)
51 }
52}
53
54impl Decoder for HuggingFaceTokenizer {
55 fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String> {
56 let text = self
58 .tokenizer
59 .decode(token_ids, skip_special_tokens)
60 .map_err(|err| Error::msg(format!("Error de-tokenizing input: {err}")))?;
61
62 Ok(text)
63 }
64}
65
66impl Tokenizer for HuggingFaceTokenizer {}
67
68impl From<HfTokenizer> for HuggingFaceTokenizer {
69 fn from(tokenizer: HfTokenizer) -> Self {
70 HuggingFaceTokenizer { tokenizer }
71 }
72}