use std::fs::File;
use std::io::{Read, Seek};
use std::path::{Path, PathBuf};
use candle_core::quantized::gguf_file;
use candle_core::{DType, Device, Result as CandleResult, Tensor};
use candle_transformers::models::{quantized_llama, quantized_qwen3, quantized_qwen3_moe};
/// Model architecture families that a GGUF file can be routed to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GgufArch {
    /// Selected for the metadata string `qwen3`.
    Qwen3,
    /// Selected for the metadata string `qwen3moe`.
    Qwen3Moe,
    /// Selected for the metadata strings `llama`, `qwen2`, `mistral` and `tinyllama`.
    Llama,
}

impl GgufArch {
    /// Maps an architecture metadata string to its [`GgufArch`].
    ///
    /// The comparison is exact (case-sensitive, no trimming). Returns `None`
    /// for any string that is not one of the six recognised names.
    fn from_metadata_string(s: &str) -> Option<Self> {
        // NOTE(review): `qwen2`, `mistral` and `tinyllama` all share the
        // `Llama` variant — confirm the llama loader accepts files that carry
        // those architecture names.
        const KNOWN: [(&str, GgufArch); 6] = [
            ("qwen3", GgufArch::Qwen3),
            ("qwen3moe", GgufArch::Qwen3Moe),
            ("llama", GgufArch::Llama),
            ("qwen2", GgufArch::Llama),
            ("mistral", GgufArch::Llama),
            ("tinyllama", GgufArch::Llama),
        ];
        KNOWN
            .iter()
            .find_map(|&(name, arch)| if name == s { Some(arch) } else { None })
    }
}
/// A loaded quantized model, with one variant per supported loader.
///
/// Each variant owns the model weights object produced by the corresponding
/// `candle_transformers` quantized module.
pub enum GgufRuntime {
    /// Weights loaded through `quantized_qwen3`.
    Qwen3(quantized_qwen3::ModelWeights),
    /// Weights loaded through `quantized_qwen3_moe`.
    Qwen3Moe(quantized_qwen3_moe::GGUFQWenMoE),
    /// Weights loaded through `quantized_llama`.
    Llama(quantized_llama::ModelWeights),
}
impl GgufRuntime {
pub fn open(
path: impl AsRef<Path>,
device: &Device,
dtype_for_moe: DType,
) -> CandleResult<Self> {
let path = path.as_ref().to_path_buf();
let mut file = open_file(&path)?;
let content = gguf_file::Content::read(&mut file).map_err(|e| {
candle_core::Error::Msg(format!(
"failed to parse GGUF header at {}: {e}",
path.display()
))
})?;
let arch_str = content
.metadata
.get("general.architecture")
.ok_or_else(|| candle_core::Error::Msg("GGUF missing general.architecture".into()))?
.to_string()
.map_err(|e| candle_core::Error::Msg(format!("general.architecture: {e}")))?
.clone();
let arch = GgufArch::from_metadata_string(&arch_str).ok_or_else(|| {
candle_core::Error::Msg(format!(
"GGUF arch '{arch_str}' unsupported by GgufRuntime — \
expected qwen3, qwen3moe, llama, qwen2, mistral, or tinyllama"
))
})?;
let runtime = match arch {
GgufArch::Qwen3 => {
let m = quantized_qwen3::ModelWeights::from_gguf(content, &mut file, device)?;
GgufRuntime::Qwen3(m)
}
GgufArch::Qwen3Moe => {
let m = quantized_qwen3_moe::GGUFQWenMoE::from_gguf(
content,
&mut file,
device,
dtype_for_moe,
)?;
GgufRuntime::Qwen3Moe(m)
}
GgufArch::Llama => {
let m = quantized_llama::ModelWeights::from_gguf(content, &mut file, device)?;
GgufRuntime::Llama(m)
}
};
Ok(runtime)
}
pub fn detect_arch(path: impl AsRef<Path>) -> CandleResult<GgufArch> {
let path = path.as_ref().to_path_buf();
let mut file = open_file(&path)?;
let content = gguf_file::Content::read(&mut file)?;
let s = content
.metadata
.get("general.architecture")
.ok_or_else(|| candle_core::Error::Msg("GGUF missing general.architecture".into()))?
.to_string()
.map_err(|e| candle_core::Error::Msg(format!("{e}")))?
.clone();
GgufArch::from_metadata_string(&s)
.ok_or_else(|| candle_core::Error::Msg(format!("unsupported arch '{s}'")))
}
pub fn forward(&mut self, input: &Tensor, offset: usize) -> CandleResult<Tensor> {
match self {
Self::Qwen3(m) => m.forward(input, offset),
Self::Qwen3Moe(m) => m.forward(input, offset),
Self::Llama(m) => m.forward(input, offset),
}
}
pub fn reset_kv(&mut self) -> CandleResult<()> {
match self {
Self::Qwen3(m) => {
m.clear_kv_cache();
Ok(())
}
Self::Qwen3Moe(_) | Self::Llama(_) => Err(candle_core::Error::Msg(
"reset_kv unsupported for this arch — re-open the GGUF for a fresh cache".into(),
)),
}
}
pub fn arch(&self) -> GgufArch {
match self {
Self::Qwen3(_) => GgufArch::Qwen3,
Self::Qwen3Moe(_) => GgufArch::Qwen3Moe,
Self::Llama(_) => GgufArch::Llama,
}
}
}
/// Opens the file at `path` for reading.
///
/// Takes `&Path` rather than `&PathBuf` so any path-like borrow can be passed;
/// existing `&PathBuf` arguments still work through deref coercion.
///
/// # Errors
///
/// Converts the I/O error into a `candle_core::Error::Msg` that names the
/// path which could not be opened.
fn open_file(path: &Path) -> CandleResult<File> {
    File::open(path).map_err(|e| {
        candle_core::Error::Msg(format!("failed to open GGUF '{}': {e}", path.display()))
    })
}
/// No-op that is generic over any `Read + Seek` type.
///
/// The body only names `PhantomData<R>` and discards it; nothing is read,
/// sought or allocated.
/// NOTE(review): presumably kept so the `Read`/`Seek` imports stay referenced
/// — confirm it is still needed.
// Never called from this file, hence the lint suppression.
#[allow(dead_code)]
fn _force_link<R>()
where
    R: Read + Seek,
{
    let _: std::marker::PhantomData<R> = std::marker::PhantomData;
}