1use hf_hub::api::sync::Api;
7use tokenizers::Tokenizer;
8
9pub fn load_tokenizer(model_repo: &str) -> crate::Result<Tokenizer> {
17 let api = Api::new().map_err(|e| crate::Error::Download(e.to_string()))?;
18 let repo = api.model(model_repo.to_string());
19 let tokenizer_path = repo
20 .get("tokenizer.json")
21 .map_err(|e| crate::Error::Download(e.to_string()))?;
22 Tokenizer::from_file(tokenizer_path).map_err(|e| crate::Error::Tokenization(e.to_string()))
23}
24
25pub fn tokenize_query(
34 text: &str,
35 tokenizer: &tokenizers::Tokenizer,
36 model_max_tokens: usize,
37) -> crate::Result<crate::backend::Encoding> {
38 let encoding = tokenizer
39 .encode(text, true)
40 .map_err(|e| crate::Error::Tokenization(e.to_string()))?;
41
42 let len = encoding.get_ids().len().min(model_max_tokens);
43 Ok(crate::backend::Encoding {
44 input_ids: encoding.get_ids()[..len]
45 .iter()
46 .map(|&x| i64::from(x))
47 .collect(),
48 attention_mask: encoding.get_attention_mask()[..len]
49 .iter()
50 .map(|&x| i64::from(x))
51 .collect(),
52 token_type_ids: encoding.get_type_ids()[..len]
53 .iter()
54 .map(|&x| i64::from(x))
55 .collect(),
56 })
57}