#[cfg(any(
    feature = "default-small",
    feature = "default-medium",
    feature = "default-large"
))]
use {
    lz4_flex::block::compress_prepend_size,
    std::{
        collections::HashMap,
        env, fs,
        path::{Path, PathBuf},
    },
};
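// Build script entry point: pre-compress the bundled BPE vocabulary selected
// by the enabled `default-*` feature. If more than one feature is enabled,
// each corresponding vocabulary is processed in turn.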
fn main() {
    #[cfg(feature = "default-small")]
    process_vocab("multi.wiki.bpe.vs100000.vocab");
    #[cfg(feature = "default-medium")]
    process_vocab("multi.wiki.bpe.vs320000.vocab");
    #[cfg(feature = "default-large")]
    process_vocab("multi.wiki.bpe.vs1000000.vocab");
}
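// Load a vocabulary from the crate-relative `vocab/` directory, serialize it
// with bincode, LZ4-compress it (the uncompressed size is prepended so
// decompression can pre-allocate), and write the artifact to OUT_DIR. How the
// artifact is consumed is up to the rest of the crate, typically by embedding
// it at compile time (e.g. via include_bytes!).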
#[cfg(any(
    feature = "default-small",
    feature = "default-medium",
    feature = "default-large"
))]
fn process_vocab(name: &str) {
    let vocab_path = PathBuf::from(format!("vocab/{}", name));
    let tokens: HashMap<String, isize> = load_vocab_hashmap(&vocab_path);
    let serialized = bincode::serialize(&tokens).expect("failed to bincode-serialize vocabulary");
    let compressed = compress_prepend_size(&serialized);
    // OUT_DIR is always set by Cargo when running a build script.
    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR not set by Cargo"));
    let output_path = out_dir.join(format!("{}.hashmap.bincode.lz4", name));
    fs::write(&output_path, compressed)
        .unwrap_or_else(|e| panic!("failed to write {}: {}", output_path.display(), e));
}
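// Parse a tab-separated vocabulary file into token -> score pairs. Each line
// is expected to hold exactly one `<token>\t<score>` pair with an integer
// score, e.g. `▁the\t-3` (values illustrative; the `▁` prefix is the usual
// SentencePiece word-boundary marker). Malformed lines abort the build.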
#[cfg(any(
    feature = "default-small",
    feature = "default-medium",
    feature = "default-large"
))]
fn load_vocab_hashmap(path: &Path) -> HashMap<String, isize> {
    let content = fs::read_to_string(path)
        .unwrap_or_else(|e| panic!("failed to read {}: {}", path.display(), e));
    let mut tokens = HashMap::new();
    for line in content.lines() {
        let (token, score_str) = match line.split_once('\t') {
            Some(pair) => pair,
            None => panic!("Invalid line in vocabulary file {}: {}", path.display(), line),
        };
        let score = match score_str.parse::<isize>() {
            Ok(score) => score,
            Err(_) => panic!("Invalid score in vocabulary file {}: {}", path.display(), line),
        };
        tokens.insert(token.to_string(), score);
    }
    tokens
}
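// A minimal sketch of how the generated artifact might be consumed at runtime.
// It assumes the crate decompresses with lz4_flex's `decompress_size_prepended`
// (the counterpart of `compress_prepend_size`) and deserializes with the same
// bincode version; the small-vocabulary file name shown is illustrative:
//
//     use std::collections::HashMap;
//
//     let bytes: &[u8] = include_bytes!(concat!(
//         env!("OUT_DIR"),
//         "/multi.wiki.bpe.vs100000.vocab.hashmap.bincode.lz4"
//     ));
//     let raw = lz4_flex::block::decompress_size_prepended(bytes)
//         .expect("valid LZ4 block with prepended size");
//     let tokens: HashMap<String, isize> =
//         bincode::deserialize(&raw).expect("bincode-encoded HashMap");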