// nornir 0.4.27 - Docs.rs

//! Generative-LLM abstraction (EPIC #39) — ONE [`Generator`] trait, THREE real
//! implementations (`candle`, `mistralrs`, `onnx`) behind a [`generator`]
//! factory, plus a `mock` for tests and an off-by-default `ollama` HTTP client.
//!
//! # Why a trait
//! The H5 bake-off ([`super::agent_model_runs`]) pits a prompt against a set of
//! local models and records, per model, the answer + its token economics. Until
//! now the only real driver was [`OllamaCaller`](super::agent_model_runs) — an
//! HTTP shell-out to a running ollama daemon. EPIC #39 turns the model call into
//! a first-class interface with several in-process, pure-Rust (or ORT) backends
//! so nornir can generate without ollama. ollama is demoted to one OFF-by-default
//! convenience client; it is NOT one of the three backends.
//!
//! # The interface
//! ```ignore
//! pub trait Backend { fn id(&self) -> &str; fn available(&self) -> bool; }
//! pub trait Generator: Backend + Send + Sync {
//!     fn complete(&self, req: &GenRequest) -> Result<GenAnswer>;
//! }
//! ```
//! [`GenRequest`] carries the prompt + sampling knobs; [`GenAnswer`] carries the
//! completion + the exact fields the bake-off's `ModelAnswer` needs, so a
//! [`GenAnswer`] slots straight into an `agent_model_runs` row (see
//! [`GenAnswer::into_model_answer`]).
//!
//! # The factory
//! [`generator(spec)`](generator) parses a `<backend>:<model>` spec and returns
//! a `Box<dyn Generator>`. Each backend arm is `#[cfg(feature = …)]`-gated, so a
//! lean default build only carries `mock` (+ `ollama` when `gen-ollama` is on);
//! an unrecognized backend, or one whose feature is off, yields a clear error.
//! Backends runtime-probe their lib/model in [`Backend::available`], so a
//! compiled-but-unconfigured backend constructs fine and simply reports
//! `available() == false` until its model is present.
//!
//! Selectable from the CLI via `--gen <spec>` or `$NORNIR_GEN_BACKEND`
//! ([`spec_from_env`]) — the CLI-parity LAW.

use anyhow::{anyhow, Result};

use super::agent_model_runs::{ModelAnswer, ModelCaller};

#[cfg(feature = "gen-candle")]
pub mod candle;
#[cfg(feature = "gen-mistralrs")]
pub mod mistralrs;
#[cfg(feature = "gen-onnx")]
pub mod onnx;
#[cfg(feature = "gen-ollama")]
pub mod ollama;

/// Name of the environment variable read by [`spec_from_env`]: the
/// `$NORNIR_GEN_BACKEND` fallback for the CLI's `--gen` flag (CLI parity).
pub const GEN_BACKEND_ENV: &str = "NORNIR_GEN_BACKEND";

/// One generation request: the prompt + sampling knobs. Mirrors the inputs every
/// backend needs; backends ignore knobs they don't support (and say so in their
/// docs) rather than erroring.
///
/// Build one with [`GenRequest::new`] and refine it with the chainable `with_*`
/// methods. Each of those consumes the request and returns the updated value,
/// so the result must be used — hence the `#[must_use]` markers.
#[derive(Debug, Clone, PartialEq)]
pub struct GenRequest {
    /// The user prompt to complete.
    pub prompt: String,
    /// Optional system prompt (role/instructions). `None` ⇒ no system prompt.
    pub system: Option<String>,
    /// Hard cap on generated tokens.
    pub max_tokens: usize,
    /// Sampling temperature (`0.0` ⇒ greedy/deterministic).
    pub temperature: f32,
    /// Stop sequences: generation halts when any is produced.
    pub stop: Vec<String>,
}

impl GenRequest {
    /// A minimal request: just a prompt, greedy decode (`temperature == 0.0`),
    /// a 256-token cap, no system prompt and no stop sequences.
    #[must_use]
    pub fn new(prompt: impl Into<String>) -> Self {
        GenRequest {
            prompt: prompt.into(),
            system: None,
            max_tokens: 256,
            temperature: 0.0,
            stop: Vec::new(),
        }
    }

    /// Set the system prompt, replacing any previous one. Chainable.
    #[must_use = "builder methods return the updated request instead of mutating in place"]
    pub fn with_system(mut self, system: impl Into<String>) -> Self {
        self.system = Some(system.into());
        self
    }

    /// Set the token cap. Chainable.
    #[must_use = "builder methods return the updated request instead of mutating in place"]
    pub fn with_max_tokens(mut self, n: usize) -> Self {
        self.max_tokens = n;
        self
    }

    /// Set the sampling temperature. Chainable.
    #[must_use = "builder methods return the updated request instead of mutating in place"]
    pub fn with_temperature(mut self, t: f32) -> Self {
        self.temperature = t;
        self
    }

    /// Append one stop sequence (earlier ones are kept, in insertion order).
    /// Chainable.
    #[must_use = "builder methods return the updated request instead of mutating in place"]
    pub fn with_stop(mut self, s: impl Into<String>) -> Self {
        self.stop.push(s.into());
        self
    }
}

/// One generation answer + its token economics — the fields the bake-off's
/// `ModelAnswer` (and thus an `agent_model_runs` row) needs, so a [`GenAnswer`]
/// slots straight in via [`into_model_answer`](GenAnswer::into_model_answer).
#[derive(Debug, Clone, PartialEq)]
pub struct GenAnswer {
    /// The model's completion (becomes `ModelAnswer::output`).
    pub text: String,
    /// Prompt token count.
    pub tokens_in: i64,
    /// Completion token count.
    pub tokens_out: i64,
    /// Decode throughput (completion tokens per second). `0.0` ⇒ the bake-off
    /// derives it from `tokens_out / (latency_ms/1000)`.
    pub tokens_per_s: f64,
    /// Wall-clock duration of the call, milliseconds.
    pub latency_ms: f64,
}

impl GenAnswer {
    /// Convert to the bake-off's [`ModelAnswer`] (a local generation is free and
    /// makes no MCP calls; the quality `score` is left `0.0` for a judge to fill).
    pub fn into_model_answer(self) -> ModelAnswer {
        ModelAnswer {
            output: self.text,
            latency_ms: self.latency_ms,
            tokens_in: self.tokens_in,
            tokens_out: self.tokens_out,
            tokens_per_s: self.tokens_per_s,
            score: 0.0,
            cost_usd: 0.0,
            mcp_tool_calls: 0,
        }
    }
}

/// Backend identity + a runtime availability probe. Split out from [`Generator`]
/// (which has it as a supertrait) so a caller can list/probe backends without
/// driving generation.
pub trait Backend {
    /// The backend's spec id (e.g. `"candle:qwen2-0.5b"`, `"mock"`). For the
    /// mock backend built by [`generator`], a `mock:<id>` spec yields just
    /// `<id>`.
    fn id(&self) -> &str;
    /// Is this backend usable *right now*? Runtime-probes the underlying lib +
    /// model presence — a compiled backend with no model downloaded reports
    /// `false` instead of erroring on `complete`.
    fn available(&self) -> bool;
}

/// The ONE generative-LLM interface. A [`Generator`] turns a [`GenRequest`] into
/// a [`GenAnswer`]. `Send + Sync` so the server/bake-off can share one across
/// threads. Every generator is also a [`Backend`], so `id()` and `available()`
/// can be called on it (including through `Box<dyn Generator>`).
pub trait Generator: Backend + Send + Sync {
    /// Complete `req`, returning the answer + economics, or an error (a missing
    /// model, a load failure, …). Callers that drive a bake-off translate an
    /// error into a red `ok=false` row rather than aborting.
    ///
    /// Takes `&self`: implementations needing mutable state must manage it
    /// internally.
    fn complete(&self, req: &GenRequest) -> Result<GenAnswer>;
}

/// Adapt any [`Generator`] to the bake-off's [`ModelCaller`] so the existing
/// `run_bakeoff` pipeline can drive the new backends unchanged. The `prompt`
/// becomes a default [`GenRequest`]; `agent`/`model` are recorded by the row
/// (the generator already knows which model it loaded).
pub struct GeneratorCaller<G: Generator> {
    gen: G,
}

impl<G: Generator> GeneratorCaller<G> {
    pub fn new(gen: G) -> Self {
        Self { gen }
    }
}

impl<G: Generator> ModelCaller for GeneratorCaller<G> {
    fn call(&self, _agent: &str, _model: &str, prompt: &str) -> Result<ModelAnswer> {
        let answer = self.gen.complete(&GenRequest::new(prompt))?;
        Ok(answer.into_model_answer())
    }
}

/// The boxed-trait-object form of [`GeneratorCaller`] — bridges a
/// `Box<dyn Generator>` (the factory's return type) to the bake-off's
/// [`ModelCaller`] so the CLI can drive any selected backend through the
/// existing `run_bakeoff_matrix` pipeline.
pub struct BoxGeneratorCaller {
    gen: Box<dyn Generator>,
}

impl BoxGeneratorCaller {
    pub fn new(gen: Box<dyn Generator>) -> Self {
        Self { gen }
    }
    /// The wrapped generator's id (for CLI display).
    pub fn id(&self) -> &str {
        self.gen.id()
    }
    /// Probe the wrapped generator's availability (for CLI display).
    pub fn available(&self) -> bool {
        self.gen.available()
    }
}

impl ModelCaller for BoxGeneratorCaller {
    fn call(&self, _agent: &str, _model: &str, prompt: &str) -> Result<ModelAnswer> {
        let answer = self.gen.complete(&GenRequest::new(prompt))?;
        Ok(answer.into_model_answer())
    }
}

/// Read the generator spec from `$NORNIR_GEN_BACKEND`.
///
/// Returns `Some(value)` — exactly as stored, untrimmed — when the variable is
/// set, valid Unicode and not blank. Returns `None` when it is unset, not
/// valid Unicode, or only whitespace; the CLI then falls back to its own
/// default. When the `--gen` flag is also given, the flag wins.
pub fn spec_from_env() -> Option<String> {
    match std::env::var(GEN_BACKEND_ENV) {
        Ok(raw) if !raw.trim().is_empty() => Some(raw),
        _ => None,
    }
}

/// Factory for generative-LLM backends: turn a `<backend>:<model>` spec into a
/// boxed [`Generator`].
///
/// The spec is trimmed, then split at the FIRST `:` (so the model part may
/// itself contain colons); both halves are trimmed again. A spec without a
/// colon is treated as a bare backend name with an empty model.
///
/// Recognized backends:
/// - `candle:<model>`    — needs feature `gen-candle`
/// - `mistralrs:<model>` — needs feature `gen-mistralrs`
/// - `onnx:<model>`      — needs feature `gen-onnx`
/// - `ollama:<model>`    — needs feature `gen-ollama` (NOT one of the 3)
/// - `mock` / `mock:<id>` — always compiled in; the generator's id is `<id>`,
///   or `"mock"` when no id is given.
///
/// # Errors
/// - A recognized backend whose feature is off: a "not compiled in" error
///   naming the feature to rebuild with.
/// - Any other backend name: an "unknown generator backend" error.
/// - Whatever the `candle` / `mistralrs` / `onnx` constructor reports for the
///   given model.
///
/// A successfully built generator may still answer `available() == false` if
/// its model or library is missing at runtime.
pub fn generator(spec: &str) -> Result<Box<dyn Generator>> {
    let spec = spec.trim();
    let (backend, model) = spec
        .split_once(':')
        .map_or((spec, ""), |(name, rest)| (name.trim(), rest.trim()));

    match backend {
        "mock" => {
            // A bare `mock` (or `mock:`) falls back to the id "mock".
            let id = if model.is_empty() { "mock" } else { model };
            Ok(Box::new(MockGenerator::new(id)))
        }

        "candle" => {
            #[cfg(feature = "gen-candle")]
            {
                Ok(Box::new(candle::CandleGenerator::new(model)?))
            }
            #[cfg(not(feature = "gen-candle"))]
            {
                Err(not_compiled("candle", "gen-candle"))
            }
        }

        "mistralrs" => {
            #[cfg(feature = "gen-mistralrs")]
            {
                Ok(Box::new(mistralrs::MistralRsGenerator::new(model)?))
            }
            #[cfg(not(feature = "gen-mistralrs"))]
            {
                Err(not_compiled("mistralrs", "gen-mistralrs"))
            }
        }

        "onnx" => {
            #[cfg(feature = "gen-onnx")]
            {
                Ok(Box::new(onnx::OnnxGenerator::new(model)?))
            }
            #[cfg(not(feature = "gen-onnx"))]
            {
                Err(not_compiled("onnx", "gen-onnx"))
            }
        }

        "ollama" => {
            #[cfg(feature = "gen-ollama")]
            {
                Ok(Box::new(ollama::OllamaGenerator::new(model, None)))
            }
            #[cfg(not(feature = "gen-ollama"))]
            {
                Err(not_compiled("ollama", "gen-ollama"))
            }
        }

        other => Err(anyhow!(
            "unknown generator backend `{other}` in spec `{spec}` — \
             expected one of candle:<m> | mistralrs:<m> | onnx:<m> | ollama:<m> | mock"
        )),
    }
}

/// The error a factory arm returns when the named backend's feature is off.
#[allow(dead_code)] // unused only in a build where every gen feature is on
fn not_compiled(backend: &str, feature: &str) -> anyhow::Error {
    anyhow!(
        "generator backend `{backend}` is not compiled in — \
         rebuild with `--features {feature}`"
    )
}

// ─── mock backend ──────────────────────────────────────────────────────────

/// The `mock` backend: round-trips a [`GenRequest`] into a deterministic
/// [`GenAnswer`] with no LLM. It echoes the prompt (so a test can assert the
/// request reached the backend) and reports token counts derived from the
/// request, exercising the full request→answer path the real backends share.
///
/// `available()` is always `true` — the mock needs nothing external.
#[derive(Debug, Clone)]
pub struct MockGenerator {
    /// The id handed to [`MockGenerator::new`]; returned by [`Backend::id`].
    id: String,
}

impl MockGenerator {
    pub fn new(id: impl Into<String>) -> Self {
        Self { id: id.into() }
    }

    /// The canned completion for `req`: a fixed echo so tests assert real I/O.
    fn answer_text(req: &GenRequest) -> String {
        let sys = req.system.as_deref().unwrap_or("");
        format!("mock[{}]: {}{}", req.max_tokens, sys, req.prompt)
    }
}

impl Backend for MockGenerator {
    /// The id this mock was constructed with.
    fn id(&self) -> &str {
        self.id.as_str()
    }

    /// Always `true`: the mock depends on nothing external.
    fn available(&self) -> bool {
        true
    }
}

impl Generator for MockGenerator {
    /// Produce the canned answer for `req`; never fails.
    ///
    /// Economics are synthetic but request-derived: one "token" per
    /// whitespace-separated word (`tokens_in` counts the user prompt only,
    /// `tokens_out` counts the generated text), with a fixed 10 ms latency so
    /// `tokens_per_s` is reproducible.
    fn complete(&self, req: &GenRequest) -> Result<GenAnswer> {
        /// The pretend wall-clock duration of every mock call.
        const SYNTHETIC_LATENCY_MS: f64 = 10.0;

        /// Whitespace-word count, standing in for a tokenizer.
        fn word_count(s: &str) -> i64 {
            s.split_whitespace().count() as i64
        }

        let text = Self::answer_text(req);
        let tokens_in = word_count(&req.prompt);
        let tokens_out = word_count(&text);
        let tokens_per_s = tokens_out as f64 / (SYNTHETIC_LATENCY_MS / 1000.0);

        Ok(GenAnswer {
            text,
            tokens_in,
            tokens_out,
            tokens_per_s,
            latency_ms: SYNTHETIC_LATENCY_MS,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The mock must carry the request through to the answer: prompt, system
    /// prompt and token cap all show up in the text, and the economics are
    /// derived from real text rather than zeroed.
    #[test]
    fn mock_round_trips_request_into_answer() {
        let backend = generator("mock").unwrap();
        assert_eq!(backend.id(), "mock");
        assert!(backend.available(), "mock is always available");

        let req = GenRequest::new("capital of France?")
            .with_system("be terse")
            .with_max_tokens(64);
        let ans = backend.complete(&req).unwrap();

        // The answer carries the request back (real round-trip, not a no-op).
        assert!(ans.text.contains("capital of France?"), "echoes prompt: {}", ans.text);
        assert!(ans.text.contains("be terse"), "echoes system: {}", ans.text);
        assert!(ans.text.contains("64"), "echoes max_tokens: {}", ans.text);
        // Token economics are derived from the actual text, not zeroed.
        assert_eq!(ans.tokens_in, 3, "3 prompt words");
        assert!(ans.tokens_out > 0);
        assert!(ans.tokens_per_s > 0.0);
        assert!((ans.latency_ms - 10.0).abs() < 1e-9);
    }

    #[test]
    fn mock_with_explicit_id_keeps_the_id() {
        let backend = generator("mock:tiny").unwrap();
        assert_eq!(backend.id(), "tiny");
    }

    /// Spec parsing: surrounding whitespace is trimmed, only the FIRST colon
    /// splits backend from model, and an empty mock id falls back to "mock".
    #[test]
    fn factory_trims_and_splits_on_first_colon() {
        assert_eq!(generator("  mock : tiny ").unwrap().id(), "tiny");
        assert_eq!(generator("mock:a:b").unwrap().id(), "a:b");
        assert_eq!(generator("mock:").unwrap().id(), "mock");
    }

    #[test]
    fn factory_errors_on_unknown_backend() {
        let err = match generator("wat:model") {
            Ok(_) => panic!("unknown backend must error"),
            Err(e) => e.to_string(),
        };
        assert!(err.contains("unknown generator backend"), "{err}");
        assert!(err.contains("wat"), "{err}");
    }

    /// A backend whose feature is OFF must report "not compiled"; one whose
    /// feature is ON must NOT be the "unknown backend" error (its arm exists) —
    /// it may Ok or Err on the model, but the backend is recognized. Each spec is
    /// gated by ITS OWN feature so the assertion matches the build.
    fn assert_backend_arm(spec: &str, compiled: bool) {
        match generator(spec) {
            Ok(_) => assert!(compiled, "{spec} produced a generator but its feature is off"),
            Err(e) => {
                let s = e.to_string();
                if compiled {
                    assert!(
                        !s.contains("unknown generator backend") && !s.contains("not compiled"),
                        "{spec} is compiled — error must be model-level, got: {s}"
                    );
                } else {
                    assert!(
                        s.contains("not compiled") || s.contains("rebuild with"),
                        "{spec} feature is off — expected 'not compiled', got: {s}"
                    );
                }
            }
        }
    }

    #[test]
    fn factory_reports_uncompiled_backends() {
        assert_backend_arm("candle:m", cfg!(feature = "gen-candle"));
        assert_backend_arm("mistralrs:m", cfg!(feature = "gen-mistralrs"));
        assert_backend_arm("onnx:m", cfg!(feature = "gen-onnx"));
        // The off-by-default HTTP client goes through the same gating.
        assert_backend_arm("ollama:m", cfg!(feature = "gen-ollama"));

        // The unknown-backend arm is unconditional: an unrecognized backend is
        // always the "unknown generator backend" error.
        match generator("nope:m") {
            Ok(_) => panic!("`nope` is not a backend"),
            Err(e) => assert!(
                e.to_string().contains("unknown generator backend"),
                "{}",
                e
            ),
        }
    }

    #[test]
    fn generator_caller_bridges_to_modelcaller() {
        let caller = GeneratorCaller::new(MockGenerator::new("mock"));
        let ans = caller.call("local-llm", "mock", "2+2?").unwrap();
        assert!(ans.output.contains("2+2?"));
        assert_eq!(ans.cost_usd, 0.0, "local generation is free");
        assert_eq!(ans.mcp_tool_calls, 0);
        assert!(ans.tokens_out > 0);
    }

    /// The boxed adapter must behave like the generic one when fed the
    /// factory's output, and must forward `id`/`available`.
    #[test]
    fn box_generator_caller_bridges_factory_output() {
        let caller = BoxGeneratorCaller::new(generator("mock:boxed").unwrap());
        assert_eq!(caller.id(), "boxed");
        assert!(caller.available());

        let ans = caller.call("local-llm", "mock", "2+2?").unwrap();
        assert!(ans.output.contains("2+2?"));
        assert_eq!(ans.cost_usd, 0.0, "local generation is free");
        assert_eq!(ans.mcp_tool_calls, 0);
        assert!(ans.tokens_out > 0);
    }

    #[test]
    fn spec_from_env_reads_the_var() {
        /// Puts `$NORNIR_GEN_BACKEND` back the way it was when dropped. Drop
        /// also runs on unwind, so a failed assertion below cannot leak this
        /// test's value into the rest of the test process.
        struct RestoreEnv(Option<std::ffi::OsString>);

        impl Drop for RestoreEnv {
            fn drop(&mut self) {
                match self.0.take() {
                    Some(previous) => std::env::set_var(GEN_BACKEND_ENV, previous),
                    None => std::env::remove_var(GEN_BACKEND_ENV),
                }
            }
        }

        // `var_os` rather than `var`, so a non-UTF-8 previous value is
        // restored instead of being mistaken for "unset".
        let _restore = RestoreEnv(std::env::var_os(GEN_BACKEND_ENV));

        std::env::set_var(GEN_BACKEND_ENV, "mock:envpick");
        assert_eq!(spec_from_env().as_deref(), Some("mock:envpick"));

        std::env::set_var(GEN_BACKEND_ENV, "   ");
        assert_eq!(spec_from_env(), None, "blank is treated as unset");

        std::env::remove_var(GEN_BACKEND_ENV);
        assert_eq!(spec_from_env(), None, "unset yields None");
    }
}