use std::{
fs::File,
io::{self, BufReader, Write},
time::Instant,
};
use pllm::{Config, GgufFile, Tokenizer, Weights, LLM};
/// Loads the model stored at `testdata/gemma2b`, runs inference on a fixed
/// prompt, streams the generated tokens to stdout and finally prints the
/// measured generation throughput.
///
/// # Panics
///
/// Panics with a descriptive message if the model file cannot be opened or
/// parsed, if the weights cannot be loaded, if inference cannot be started,
/// if a token fails to generate, or if stdout cannot be flushed.
fn main() {
    let f = File::open("testdata/gemma2b").expect("failed to open model file testdata/gemma2b");
    let reader = BufReader::new(f);
    let mut gf = GgufFile::from_reader(reader).expect("failed to parse GGUF file");

    let config = Config::from_gguf(&gf).expect("failed to read model config from GGUF");
    // Debug-printing only needs a borrow; no clone required.
    println!("{:?}", config);

    let tokenizer = Tokenizer::from_gguf(&gf).expect("failed to read tokenizer from GGUF");
    let mut weights = Weights::new(config.clone());
    weights
        .load_from_gguf(&mut gf, config.clone())
        .expect("failed to load weights from GGUF");

    let iterator = LLM::new(config, tokenizer, weights)
        .inference("why the sky is blue?".to_string(), 0.8)
        .expect("failed to start inference");

    let mut token_count: usize = 0;
    let start = Instant::now();
    for t in iterator {
        print!("{}", t.expect("failed to generate token"));
        // Flush after every token so the output streams instead of waiting
        // for a newline to be written.
        io::stdout().flush().expect("failed to flush stdout");
        token_count += 1;
    }

    println!(
        "\ntoken/s: {}\n",
        tokens_per_second(token_count, start.elapsed())
    );
}

/// Computes the generation rate in tokens per second.
///
/// One token is excluded from the count, as the original measurement did
/// (presumably to discount the first token -- TODO confirm the intent).
/// The subtraction saturates at zero, so an empty generation reports `0.0`
/// instead of a negative rate, and a zero-length `elapsed` reports `0.0`
/// instead of `inf`/`NaN`.
fn tokens_per_second(token_count: usize, elapsed: std::time::Duration) -> f64 {
    let counted = token_count.saturating_sub(1) as f64;
    // `as_secs_f64` keeps sub-millisecond precision, unlike `as_millis`,
    // which truncates short runs down to zero.
    let secs = elapsed.as_secs_f64();
    if secs > 0.0 {
        counted / secs
    } else {
        0.0
    }
}