use tokenizers::tokenizer::Tokenizer as HfTokenizer;

use super::{
    traits::{Decoder, Encoder, Tokenizer},
    Encoding, Error, Result, TokenIdType,
};
23pub struct HuggingFaceTokenizer {
24 tokenizer: HfTokenizer,
25}
26
27impl HuggingFaceTokenizer {
28 pub fn from_file(model_name: &str) -> Result<Self> {
29 let tokenizer = HfTokenizer::from_file(model_name)
30 .map_err(|err| Error::msg(format!("Error loading tokenizer: {}", err)))?;
31
32 Ok(HuggingFaceTokenizer { tokenizer })
33 }
34
35 pub fn from_tokenizer(tokenizer: HfTokenizer) -> Self {
36 HuggingFaceTokenizer { tokenizer }
37 }
38}
39
40impl Encoder for HuggingFaceTokenizer {
41 fn encode(&self, input: &str) -> Result<Encoding> {
42 let encoding = self
43 .tokenizer
44 .encode(input, false)
45 .map_err(|err| Error::msg(format!("Error encoding input: {}", err)))?;
46
47 let token_ids = encoding.get_ids().to_vec();
48 let tokens = encoding.get_tokens().to_vec();
49 let spans = encoding.get_offsets().to_vec();
50
51 Ok(Encoding {
52 token_ids,
53 tokens,
54 spans,
55 })
56 }
57}
58
59impl Decoder for HuggingFaceTokenizer {
60 fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String> {
61 let text = self
62 .tokenizer
63 .decode(token_ids, skip_special_tokens)
64 .map_err(|err| Error::msg(format!("Error decoding input: {}", err)))?;
65
66 Ok(text)
67 }
68}
69
70impl Tokenizer for HuggingFaceTokenizer {}
71
72impl From<HfTokenizer> for HuggingFaceTokenizer {
73 fn from(tokenizer: HfTokenizer) -> Self {
74 HuggingFaceTokenizer { tokenizer }
75 }
76}