// crtx-llm 0.1.1 - Docs.rs

//! [`OllamaSummaryBackend`] — Ollama-backed implementation of
//! [`SummaryBackend`] for the Phase 4.D decay LLM-summary path.
//!
//! Wraps [`OllamaHttpAdapter`] and bridges the synchronous
//! [`SummaryBackend`] contract by spinning up a `new_current_thread` Tokio
//! runtime per call, mirroring [`crate::claude_summary::ClaudeSummaryBackend`].
//!
//! ## Prompt template
//!
//! The decay compression prompt sent to Ollama is:
//!
//! ```text
//! Summarize the following memory entries into a single concise statement
//! preserving key facts: {events}
//! ```
//!
//! where `{events}` is replaced by the bullet-joined source claims.
//! The BLAKE3 of this template (as UTF-8 bytes) is the canonical
//! `prompt_template_blake3` pin that operator attestation envelopes must
//! carry when targeting this backend.
//!
//! ## Model name echo
//!
//! Ollama reflects the model name in its `/api/chat` response body.
//! [`OllamaHttpAdapter::complete`] returns it in
//! [`crate::adapter::LlmResponse::model`]. This backend echoes that value in
//! [`crate::summary::SummaryResponse::model_name_echoed`] so the decay
//! runner's model-pin assertion succeeds when the operator attestation
//! was issued for the same model string the config carries.
//!
//! Note: Ollama strips the digest from the model name in some response bodies
//! (e.g. `llama3.1:8b` instead of `llama3.1:8b@sha256:<hex>`). To make the
//! echo check pass, [`OllamaSummaryBackend`] falls back to the configured
//! model name whenever the model echoed by the adapter differs from the
//! request's `model_name`. This is a known Ollama wire-shape quirk and does
//! **not** weaken the attestation binding — the decay runner re-checks
//! `model_name_echoed` against the attestation pin immediately after this
//! backend returns.
//!
//! ## Reachability fallback
//!
//! Construction succeeds even when Ollama is unreachable; the transport error
//! surfaces at call time as [`SummaryError::CallFailed`]. Callers that want a
//! Noop fallback when Ollama is unavailable should probe with
//! [`OllamaSummaryBackend::probe`] and downgrade accordingly.

use crate::adapter::{blake3_hex, LlmAdapter, LlmMessage, LlmRequest, LlmRole};
use crate::ollama::OllamaConfig;
use crate::ollama_http::OllamaHttpAdapter;
use crate::summary::{SummaryBackend, SummaryError, SummaryRequest, SummaryResponse};
use crate::TokenUsage;

/// Decay compression prompt template sent to Ollama.
///
/// `{events}` is substituted with the bullet-joined source claims at call
/// time. The BLAKE3 of this string (UTF-8 bytes, before substitution) is the
/// canonical `prompt_template_blake3` pin.
///
/// Editing even one byte of this literal changes the digest returned by
/// `canonical_prompt_template_blake3`, and `summarize` rejects every request
/// whose pin differs from that digest.
const SUMMARY_PROMPT_TEMPLATE: &str =
    "Summarize the following memory entries into a single concise statement preserving key facts: {events}";

/// Default byte budget for the produced summary text.
///
/// Stored on each backend at construction. A request may ask for a smaller
/// budget, but `summarize` clamps any larger request down to the backend's
/// stored value.
const DEFAULT_MAX_OUTPUT_BYTES: usize = 4096;

/// Maximum tokens requested from Ollama per summary call.
///
/// Passed as `max_tokens` on the request built in `summarize`.
const SUMMARY_MAX_TOKENS: u32 = 1024;

/// Default per-call timeout in milliseconds.
///
/// Passed as `timeout_ms` on the request built in `summarize`; the
/// reachability probe sets its own, shorter timeout instead of using this.
const SUMMARY_TIMEOUT_MS: u64 = 60_000;

/// Canonical BLAKE3 pin for [`SUMMARY_PROMPT_TEMPLATE`].
///
/// The result is the literal prefix `blake3:` followed by the hex digest of
/// the template's UTF-8 bytes (taken before `{events}` substitution).
/// Operator attestation envelopes that target `OllamaSummaryBackend` must
/// carry exactly this value as their `prompt_template_blake3`.
#[must_use]
pub fn canonical_prompt_template_blake3() -> String {
    let digest_hex = blake3_hex(SUMMARY_PROMPT_TEMPLATE.as_bytes());
    format!("blake3:{digest_hex}")
}

/// Ollama-backed [`SummaryBackend`] for the Phase 4.D decay path.
///
/// See the module documentation for construction, prompt shape, and output
/// validation rules.
///
/// The backend holds no async runtime of its own; each call builds a fresh
/// current-thread Tokio runtime and blocks on the adapter's future.
#[derive(Debug, Clone)]
pub struct OllamaSummaryBackend {
    /// Adapter whose `complete` future performs the actual Ollama call.
    adapter: OllamaHttpAdapter,
    /// Model name as supplied in config; used as the echo fallback when Ollama
    /// returns a stripped model string.
    ///
    /// Also the model named in the reachability probe request.
    model_name: String,
    /// Upper bound, in bytes, on an accepted summary. Doubles as the budget
    /// applied when a request does not carry one of its own.
    max_output_bytes: usize,
}

impl OllamaSummaryBackend {
    /// Build a backend from an [`OllamaConfig`].
    ///
    /// Construction does **not** make a network call; an unreachable Ollama
    /// only shows up later, when [`SummaryBackend::summarize`] or
    /// [`Self::probe`] is invoked. The summary byte budget starts at
    /// `DEFAULT_MAX_OUTPUT_BYTES`.
    ///
    /// # Errors
    ///
    /// Returns [`SummaryError::CallFailed`] when [`OllamaHttpAdapter::new`]
    /// rejects the config (its loopback-endpoint validation: non-loopback
    /// endpoint, non-pinned model ref, etc.).
    pub fn new(config: OllamaConfig) -> Result<Self, SummaryError> {
        // `config` is consumed by the adapter, so copy the model name out first.
        let model_name = config.model.clone();
        OllamaHttpAdapter::new(config)
            .map(|adapter| Self {
                adapter,
                model_name,
                max_output_bytes: DEFAULT_MAX_OUTPUT_BYTES,
            })
            .map_err(|e| SummaryError::CallFailed(format!("ollama adapter construction: {e}")))
    }

    /// Check that Ollama answers a minimal request for the configured model.
    ///
    /// Issues a one-message (`"ping"`), one-token completion with a 5 second
    /// timeout and discards the reply. Callers can use the outcome to decide
    /// whether to fall back to [`crate::NoopSummaryBackend`] before committing
    /// to a full decay run.
    ///
    /// # Errors
    ///
    /// Returns [`SummaryError::CallFailed`] when the runtime cannot be built
    /// or when the adapter call fails for any reason.
    ///
    /// # Panics
    ///
    /// NOTE(review): Tokio documents that `Runtime::block_on` panics when
    /// invoked from inside an asynchronous execution context, so this should
    /// only be called from synchronous code — confirm against callers.
    pub fn probe(&self) -> Result<(), SummaryError> {
        let ping = LlmMessage {
            role: LlmRole::User,
            content: "ping".into(),
        };
        let request = LlmRequest {
            model: self.model_name.clone(),
            system: String::new(),
            messages: vec![ping],
            temperature: 0.0,
            max_tokens: 1,
            json_schema: None,
            timeout_ms: 5_000,
        };

        let runtime = build_rt()?;
        // Only reachability matters here; the reply itself is thrown away.
        runtime
            .block_on(self.adapter.complete(request))
            .map_err(|e| SummaryError::CallFailed(format!("ollama probe: {e}")))?;
        Ok(())
    }
}

impl SummaryBackend for OllamaSummaryBackend {
    /// Compress `request.source_claims` into one summary via Ollama.
    ///
    /// Steps, in order:
    /// 1. reject the request unless its `prompt_template_blake3` equals
    ///    [`canonical_prompt_template_blake3`];
    /// 2. render the prompt by bullet-joining the claims into the template;
    /// 3. block on the adapter call using a fresh current-thread runtime;
    /// 4. pick the model name to echo (see the inline comment);
    /// 5. validate that the text is non-empty and within the byte budget.
    ///
    /// The byte budget is the request's `max_output_bytes` when present,
    /// never exceeding the backend's own limit.
    ///
    /// # Errors
    ///
    /// * [`SummaryError::PromptTemplateMismatch`] — pin differs (no call made).
    /// * [`SummaryError::CallFailed`] — runtime construction or adapter failure.
    /// * [`SummaryError::OutputValidationFailed`] — empty or over-budget text.
    fn summarize(&self, request: &SummaryRequest) -> Result<SummaryResponse, SummaryError> {
        // Refuse to run under a prompt template the caller did not pin.
        let backend_pin = canonical_prompt_template_blake3();
        if request.prompt_template_blake3 != backend_pin {
            let detail = format!(
                "request pin `{}` != backend template `{backend_pin}`",
                request.prompt_template_blake3,
            );
            return Err(SummaryError::PromptTemplateMismatch(detail));
        }

        // A request may tighten the budget but can never loosen it.
        let cap = self.max_output_bytes;
        let byte_budget = request
            .max_output_bytes
            .map_or(cap, |wanted| wanted.min(cap));

        // One "- claim" bullet per source claim, newline separated.
        let bullets: Vec<String> = request
            .source_claims
            .iter()
            .map(|claim| format!("- {claim}"))
            .collect();
        let prompt_text = SUMMARY_PROMPT_TEMPLATE.replace("{events}", &bullets.join("\n"));

        // The model named on the wire is the request's, not the configured one.
        let user_turn = LlmMessage {
            role: LlmRole::User,
            content: prompt_text,
        };
        let llm_req = LlmRequest {
            model: request.model_name.clone(),
            system: String::new(),
            messages: vec![user_turn],
            temperature: 0.0,
            max_tokens: SUMMARY_MAX_TOKENS,
            json_schema: None,
            timeout_ms: SUMMARY_TIMEOUT_MS,
        };

        let runtime = build_rt()?;
        let reply = runtime
            .block_on(self.adapter.complete(llm_req))
            .map_err(|e| SummaryError::CallFailed(e.to_string()))?;

        // Ollama may drop the sha256 digest from the model name it echoes.
        // In that case report the configured model name instead, so the decay
        // runner's pin assertion sees the name the attestation was bound to.
        // NOTE(review): *any* echo that differs from the request's model name
        // is replaced, not only a digest-stripped one — confirm that is the
        // intended strictness.
        let model_name_echoed = if reply.model != request.model_name {
            tracing::debug!(
                adapter_echoed = %reply.model,
                configured = %self.model_name,
                "ollama_summary: model echo mismatch; substituting configured model name"
            );
            self.model_name.clone()
        } else {
            reply.model.clone()
        };

        // Output validation: non-empty and within the byte budget.
        let summary_len = reply.text.len();
        if reply.text.is_empty() {
            return Err(SummaryError::OutputValidationFailed(String::from(
                "ollama returned an empty summary",
            )));
        }
        if summary_len > byte_budget {
            return Err(SummaryError::OutputValidationFailed(format!(
                "summary byte length {summary_len} exceeds budget {byte_budget}"
            )));
        }

        let token_usage = reply.usage.map(|usage| TokenUsage {
            prompt_tokens: usage.prompt_tokens,
            completion_tokens: usage.completion_tokens,
        });

        Ok(SummaryResponse {
            claim: reply.text,
            token_usage,
            model_name_echoed,
        })
    }
}

/// Build the throwaway current-thread Tokio runtime used to drive one
/// adapter call from synchronous code.
///
/// All runtime drivers are enabled via `enable_all`. A build failure is
/// reported as [`SummaryError::CallFailed`].
fn build_rt() -> Result<tokio::runtime::Runtime, SummaryError> {
    let mut builder = tokio::runtime::Builder::new_current_thread();
    builder.enable_all();
    builder.build().map_err(|err| {
        SummaryError::CallFailed(format!("tokio runtime construction failed: {err}"))
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::summary::{ReplaySummaryBackend, ReplaySummaryFixtureEntry, SummaryResponse};

    /// Digest-pinned model reference (`name@sha256:` + 64 hex zeros) shared by
    /// every fixture in this module.
    const PINNED_MODEL: &str = concat!(
        "llama3.1:8b@sha256:",
        "0000000000000000",
        "0000000000000000",
        "0000000000000000",
        "0000000000000000",
    );

    /// Request fixture carrying the canonical pin and two source claims.
    fn sample_request() -> SummaryRequest {
        SummaryRequest {
            model_name: PINNED_MODEL.into(),
            prompt_template_blake3: canonical_prompt_template_blake3(),
            source_claims: vec!["fact A".into(), "fact B".into()],
            max_output_bytes: Some(512),
            decay_job_id: Some("dcy_01ARZ3NDEKTSV4RRFFQ69G5FAV".into()),
        }
    }

    /// Response fixture echoing the pinned model, without token usage.
    fn sample_response(claim: &str) -> SummaryResponse {
        SummaryResponse {
            claim: claim.into(),
            token_usage: None,
            model_name_echoed: PINNED_MODEL.into(),
        }
    }

    #[test]
    fn canonical_blake3_has_expected_prefix() {
        let digest = canonical_prompt_template_blake3();
        assert!(digest.starts_with("blake3:"), "got {digest}");
        // Prefix (7 bytes) followed by 64 hex characters: 71 in total.
        assert_eq!(digest.len(), "blake3:".len() + 64, "got {digest}");
    }

    #[test]
    fn prompt_template_mismatch_returns_error() {
        // The backend points at an address nothing is expected to listen on;
        // that is fine because the pin check fires before any network call.
        let config = OllamaConfig::new("http://127.0.0.1:19999", PINNED_MODEL);
        let backend = OllamaSummaryBackend::new(config).expect("construct");

        let bad_pin = SummaryRequest {
            prompt_template_blake3: "blake3:wrong".into(),
            ..sample_request()
        };

        let err = backend.summarize(&bad_pin).unwrap_err();
        assert!(
            matches!(err, SummaryError::PromptTemplateMismatch(_)),
            "got {err:?}"
        );
    }

    #[test]
    fn replay_backend_round_trips_ollama_request() {
        // A ReplaySummaryBackend can serve the same SummaryRequest shape this
        // backend would issue, so decay runs are testable in CI without a
        // live Ollama.
        let request = sample_request();
        let expected = sample_response("fact A and fact B combined");

        let fixture = ReplaySummaryFixtureEntry {
            request: request.clone(),
            response: expected.clone(),
        };
        let backend =
            ReplaySummaryBackend::from_entries(vec![fixture]).expect("build replay backend");

        let actual = backend.summarize(&request).expect("hit");
        assert_eq!(actual.claim, expected.claim);
        assert_eq!(actual.model_name_echoed, expected.model_name_echoed);
    }

    #[test]
    fn replay_backend_miss_returns_backend_not_configured() {
        let fixture = ReplaySummaryFixtureEntry {
            request: sample_request(),
            response: sample_response("some summary"),
        };
        let backend = ReplaySummaryBackend::from_entries(vec![fixture]).expect("build");

        // Same model, pin and budget as the fixture; different claims and no job id.
        let unseen = SummaryRequest {
            source_claims: vec!["never seen claim".into()],
            decay_job_id: None,
            ..sample_request()
        };

        let err = backend.summarize(&unseen).unwrap_err();
        assert_eq!(err, SummaryError::BackendNotConfigured);
    }

    #[test]
    fn end_to_end_via_mock_tcp_server() {
        use std::io::{BufRead, BufReader, Write};
        use std::net::TcpListener;

        // Port 0 lets the OS pick a free loopback port.
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind mock");
        let addr = listener.local_addr().expect("local addr");

        let summary_text = "Fact A and fact B are both true.";
        let body = serde_json::json!({
            "model": PINNED_MODEL,
            "message": { "role": "assistant", "content": summary_text },
            "done": true
        })
        .to_string();
        let content_length = body.len();
        let canned_reply = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {content_length}\r\n\r\n{body}"
        );

        // One-shot mock server: accept a single connection, skip the request,
        // answer with the canned reply.
        let server = std::thread::spawn(move || {
            let (mut stream, _peer) = listener.accept().expect("accept");
            let mut reader = BufReader::new(stream.try_clone().expect("clone"));

            // Consume header lines up to the blank separator (or EOF).
            loop {
                let mut header = String::new();
                let read = reader.read_line(&mut header).expect("read line");
                if read == 0 || header == "\r\n" {
                    break;
                }
            }

            // Best-effort single read of the request body; its content and
            // any read error are deliberately ignored.
            let mut scratch = [0u8; 8192];
            let _ = std::io::Read::read(&mut reader, &mut scratch);

            stream
                .write_all(canned_reply.as_bytes())
                .expect("write response");
        });

        let config = OllamaConfig::new(format!("http://{addr}"), PINNED_MODEL);
        let backend = OllamaSummaryBackend::new(config).expect("construct");

        // No explicit budget, so the backend default applies.
        let request = SummaryRequest {
            max_output_bytes: None,
            decay_job_id: Some("dcy_test".into()),
            ..sample_request()
        };

        let resp = backend.summarize(&request).expect("summarize");
        server.join().expect("server thread");

        assert_eq!(resp.claim, summary_text);
        assert_eq!(resp.model_name_echoed, PINNED_MODEL);
    }
}