car-inference 0.6.0

Local model inference for CAR — Candle backend with Qwen3 models
//! Model registry — tracks available Qwen3 models, handles download-on-first-use.
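//!
//! A minimal usage sketch (the crate and module paths below are assumed;
//! adjust to wherever `ModelRegistry` is actually exported):
//!
//! ```no_run
//! # async fn demo() -> Result<(), car_inference::InferenceError> {
//! use car_inference::registry::ModelRegistry;
//!
//! let registry = ModelRegistry::new("/var/lib/car/models".into());
//!
//! // Inspect the catalog and see which models are already on disk.
//! for m in registry.list_models() {
//!     println!("{:<24} {:>6} MB  downloaded: {}", m.name, m.quantized_size_mb, m.downloaded);
//! }
//!
//! // Download-on-first-use: fetches weights and tokenizer only if missing.
//! // Lookup is case-insensitive, so the lowercase name resolves too.
//! let model_dir = registry.ensure_model("qwen3-0.6b").await?;
//! println!("model ready at {}", model_dir.display());
//! # Ok(())
//! # }
//! ```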

use std::path::{Path, PathBuf};

use serde::{Deserialize, Serialize};
use tracing::info;

use crate::InferenceError;

/// Role a model is suited for.
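///
/// Serialized in `snake_case`, so e.g. `ModelRole::Expert` round-trips as
/// the string `"expert"`.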
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ModelRole {
    /// Fast classification, embedding, routing.
    Small,
    /// Code reasoning, skill repair, policy evaluation.
    Medium,
    /// Full reasoning, complex generation.
    Large,
    /// Maximum quality via MoE (3B active / 30B total).
    Expert,
    /// Dedicated embedding model (semantic similarity, retrieval).
    Embedding,
}

/// Metadata about a model in the registry.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelInfo {
    /// Catalog name, e.g. "Qwen3-0.6B".
    pub name: String,
    /// HuggingFace repo holding the quantized GGUF weights.
    pub hf_repo: String,
    /// Weights filename within `hf_repo`.
    pub hf_filename: String,
    /// HuggingFace repo holding `tokenizer.json`.
    pub tokenizer_repo: String,
    /// Role this model is suited for.
    pub role: ModelRole,
    /// Human-readable parameter count, e.g. "0.6B". Owned `String` rather
    /// than `&'static str` so the struct can derive `Deserialize`.
    pub param_count: String,
    /// Approximate on-disk size of the quantized weights, in MB.
    pub quantized_size_mb: u64,
    /// Whether the weights are already present locally.
    pub downloaded: bool,
}

/// Registry of available models and their local paths.
pub struct ModelRegistry {
    models_dir: PathBuf,
    catalog: Vec<ModelSpec>,
}

/// Compile-time catalog entry; `list_models` resolves it to a `ModelInfo`
/// after checking what is on disk.
struct ModelSpec {
    name: &'static str,
    hf_repo: &'static str,
    hf_filename: &'static str,
    tokenizer_repo: &'static str,
    role: ModelRole,
    param_count: &'static str,
    quantized_size_mb: u64,
}

impl ModelRegistry {
    pub fn new(models_dir: PathBuf) -> Self {
        Self {
            models_dir,
            catalog: builtin_catalog(),
        }
    }

    /// List all models with their download status.
    pub fn list_models(&self) -> Vec<ModelInfo> {
        self.catalog
            .iter()
            .map(|spec| {
                let local_path = self.models_dir.join(spec.name).join("model.gguf");
                ModelInfo {
                    name: spec.name.to_string(),
                    hf_repo: spec.hf_repo.to_string(),
                    hf_filename: spec.hf_filename.to_string(),
                    tokenizer_repo: spec.tokenizer_repo.to_string(),
                    role: spec.role,
                    param_count: spec.param_count.to_string(),
                    quantized_size_mb: spec.quantized_size_mb,
                    downloaded: local_path.exists(),
                }
            })
            .collect()
    }

    /// Find a catalog entry by name (case-insensitive).
    fn find_spec(&self, name: &str) -> Option<&ModelSpec> {
        self.catalog
            .iter()
            .find(|s| s.name.eq_ignore_ascii_case(name))
    }

    /// Ensure a model is downloaded, returning its local directory path.
    pub async fn ensure_model(&self, name: &str) -> Result<PathBuf, InferenceError> {
        let spec = self
            .find_spec(name)
            .ok_or_else(|| InferenceError::ModelNotFound(name.to_string()))?;

        let model_dir = self.models_dir.join(spec.name);
        let model_path = model_dir.join("model.gguf");
        let tokenizer_path = model_dir.join("tokenizer.json");

        if model_path.exists() && tokenizer_path.exists() {
            return Ok(model_dir);
        }

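        // Synchronous fs calls here are cheap (existence checks + mkdir);
        // the long-running work is the awaited downloads below.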
        std::fs::create_dir_all(&model_dir)?;

        // Download model weights
        if !model_path.exists() {
            info!(
                model = spec.name,
                repo = spec.hf_repo,
                "downloading model weights"
            );
            download_file(spec.hf_repo, spec.hf_filename, &model_path).await?;
        }

        // Download tokenizer
        if !tokenizer_path.exists() {
            info!(
                model = spec.name,
                repo = spec.tokenizer_repo,
                "downloading tokenizer"
            );
            download_file(spec.tokenizer_repo, "tokenizer.json", &tokenizer_path).await?;
        }

        Ok(model_dir)
    }

    /// Remove a downloaded model.
    pub fn remove_model(&self, name: &str) -> Result<(), InferenceError> {
        let _spec = self
            .find_spec(name)
            .ok_or_else(|| InferenceError::ModelNotFound(name.to_string()))?;

        let model_dir = self.models_dir.join(name);
        if model_dir.exists() {
            std::fs::remove_dir_all(&model_dir)?;
            info!(model = name, "removed model");
        }
        Ok(())
    }
}

/// Download a single file from a HuggingFace repo.
async fn download_file(repo: &str, filename: &str, dest: &Path) -> Result<(), InferenceError> {
    let api = hf_hub::api::tokio::Api::new()
        .map_err(|e| InferenceError::DownloadFailed(e.to_string()))?;

    let model_repo = api.model(repo.to_string());
    let path = model_repo
        .get(filename)
        .await
        .map_err(|e| InferenceError::DownloadFailed(format!("{filename}: {e}")))?;

    // hf-hub caches to its own dir; symlink or copy to our location
    if dest.exists() {
        return Ok(());
    }

    // Try symlink first, fall back to copy
    #[cfg(unix)]
    {
        if std::os::unix::fs::symlink(&path, dest).is_ok() {
            return Ok(());
        }
    }

    std::fs::copy(&path, dest)
        .map_err(|e| InferenceError::DownloadFailed(format!("copy to {}: {e}", dest.display())))?;
    Ok(())
}

/// Built-in catalog of Qwen3 models.
fn builtin_catalog() -> Vec<ModelSpec> {
    vec![
        ModelSpec {
            name: "Qwen3-Embedding-0.6B",
            hf_repo: "Qwen/Qwen3-Embedding-0.6B-GGUF",
            hf_filename: "Qwen3-Embedding-0.6B-Q8_0.gguf",
            tokenizer_repo: "Qwen/Qwen3-Embedding-0.6B",
            role: ModelRole::Embedding,
            param_count: "0.6B",
            quantized_size_mb: 639,
        },
        ModelSpec {
            name: "Qwen3-0.6B",
            hf_repo: "Qwen/Qwen3-0.6B-GGUF",
            hf_filename: "Qwen3-0.6B-Q8_0.gguf",
            tokenizer_repo: "Qwen/Qwen3-0.6B",
            role: ModelRole::Small,
            param_count: "0.6B",
            quantized_size_mb: 650,
        },
        ModelSpec {
            name: "Qwen3-1.7B",
            hf_repo: "Qwen/Qwen3-1.7B-GGUF",
            hf_filename: "Qwen3-1.7B-Q8_0.gguf",
            tokenizer_repo: "Qwen/Qwen3-1.7B",
            role: ModelRole::Medium,
            param_count: "1.7B",
            quantized_size_mb: 1800,
        },
        ModelSpec {
            name: "Qwen3-4B",
            hf_repo: "Qwen/Qwen3-4B-GGUF",
            hf_filename: "Qwen3-4B-Q4_K_M.gguf",
            tokenizer_repo: "Qwen/Qwen3-4B",
            role: ModelRole::Medium,
            param_count: "4B",
            quantized_size_mb: 2500,
        },
        ModelSpec {
            name: "Qwen3-8B",
            hf_repo: "Qwen/Qwen3-8B-GGUF",
            hf_filename: "Qwen3-8B-Q4_K_M.gguf",
            tokenizer_repo: "Qwen/Qwen3-8B",
            role: ModelRole::Large,
            param_count: "8B",
            quantized_size_mb: 4900,
        },
        ModelSpec {
            name: "Qwen3-30B-A3B",
            hf_repo: "Qwen/Qwen3-30B-A3B-GGUF",
            hf_filename: "Qwen3-30B-A3B-Q4_K_M.gguf",
            tokenizer_repo: "Qwen/Qwen3-30B-A3B",
            role: ModelRole::Expert,
            param_count: "30B (3B active)",
            quantized_size_mb: 17000,
        },
    ]
}
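
// Sanity checks over the built-in catalog. A minimal sketch: these tests
// exercise only catalog lookup and listing, so they need no network access
// or downloaded weights.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lookup_is_case_insensitive() {
        let registry = ModelRegistry::new(std::env::temp_dir().join("car-models-test"));
        for info in registry.list_models() {
            assert!(registry.find_spec(&info.name.to_uppercase()).is_some());
        }
    }

    #[test]
    fn catalog_names_are_unique() {
        let names: std::collections::HashSet<_> =
            builtin_catalog().iter().map(|s| s.name).collect();
        assert_eq!(names.len(), builtin_catalog().len());
    }
}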