use hf_hub::api::sync::Api;
use tokenizers::Tokenizer;
pub fn load_tokenizer(model_repo: &str) -> crate::Result<Tokenizer> {
let api = Api::new().map_err(|e| crate::Error::Download(e.to_string()))?;
let repo = api.model(model_repo.to_string());
let tokenizer_path = repo
.get("tokenizer.json")
.map_err(|e| crate::Error::Download(e.to_string()))?;
Tokenizer::from_file(tokenizer_path).map_err(|e| crate::Error::Tokenization(e.to_string()))
}
pub fn tokenize_query(
text: &str,
tokenizer: &tokenizers::Tokenizer,
model_max_tokens: usize,
) -> crate::Result<crate::backend::Encoding> {
let encoding = tokenizer
.encode(text, true)
.map_err(|e| crate::Error::Tokenization(e.to_string()))?;
let len = encoding.get_ids().len().min(model_max_tokens);
Ok(crate::backend::Encoding {
input_ids: encoding.get_ids()[..len]
.iter()
.map(|&x| i64::from(x))
.collect(),
attention_mask: encoding.get_attention_mask()[..len]
.iter()
.map(|&x| i64::from(x))
.collect(),
token_type_ids: encoding.get_type_ids()[..len]
.iter()
.map(|&x| i64::from(x))
.collect(),
})
}