Skip to main content

cortex_llm/
ollama_summary.rs

//! [`OllamaSummaryBackend`] — Ollama-backed implementation of
//! [`SummaryBackend`] for the Phase 4.D decay LLM-summary path.
//!
//! Wraps the asynchronous [`OllamaHttpAdapter`] and bridges it to the
//! synchronous [`SummaryBackend`] contract by spinning up a
//! `new_current_thread` Tokio runtime per call, mirroring
//! [`crate::claude_summary::ClaudeSummaryBackend`].
7//!
//! ## Prompt template
//!
//! The decay compression prompt sent to Ollama is:
//!
//! ```text
//! Summarize the following memory entries into a single concise statement
//! preserving key facts: {events}
//! ```
//!
//! The template is a single line; it is wrapped above only for readability
//! (the two halves are separated by one space, not a newline). `{events}` is
//! replaced by the bullet-joined source claims, one `- <claim>` line per
//! claim. The BLAKE3 of the template (as UTF-8 bytes, before substitution) is
//! the canonical `prompt_template_blake3` pin that operator attestation
//! envelopes must carry when targeting this backend.
21//!
//! ## Model name echo
//!
//! Ollama reflects the model name in its `/api/chat` response body.
//! [`OllamaHttpAdapter::complete`] returns it in
//! [`crate::adapter::LlmResponse::model`]. This backend echoes that value in
//! [`crate::summary::SummaryResponse::model_name_echoed`] so the decay
//! runner's model-pin assertion succeeds when the operator attestation
//! was issued for the same model string the config carries.
//!
//! Note: Ollama strips the digest from the model name in some response bodies
//! (e.g. `llama3.1:8b` instead of `llama3.1:8b@sha256:<hex>`). To make the
//! echo check pass, [`OllamaSummaryBackend`] falls back to the configured
//! model name whenever the model echoed by the adapter differs from the
//! model name carried in the request. This is a known Ollama wire-shape quirk
//! and does **not** weaken the attestation binding — the decay runner
//! re-checks `model_name_echoed` against the attestation pin immediately
//! after this backend returns.
38//!
//! ## Reachability fallback
//!
//! Construction succeeds even when Ollama is unreachable; the transport error
//! surfaces at call time as [`SummaryError::CallFailed`]. Callers that want to
//! fall back to a Noop backend when Ollama is unavailable should first probe
//! with [`OllamaSummaryBackend::probe`] and downgrade accordingly.
45
46use crate::adapter::{blake3_hex, LlmAdapter, LlmMessage, LlmRequest, LlmRole};
47use crate::ollama::OllamaConfig;
48use crate::ollama_http::OllamaHttpAdapter;
49use crate::summary::{SummaryBackend, SummaryError, SummaryRequest, SummaryResponse};
50use crate::TokenUsage;
51
/// Prompt template used for decay compression requests to Ollama.
///
/// The `{events}` placeholder is filled in at call time with the
/// bullet-joined source claims. The canonical `prompt_template_blake3` pin is
/// the BLAKE3 of this exact string (UTF-8 bytes, placeholder left
/// unsubstituted). The literal is split in two purely for line length; the
/// resulting string is a single line.
const SUMMARY_PROMPT_TEMPLATE: &str = concat!(
    "Summarize the following memory entries into a single concise statement ",
    "preserving key facts: {events}",
);

/// Upper bound, in bytes, on the summary text a backend accepts by default.
const DEFAULT_MAX_OUTPUT_BYTES: usize = 4 * 1024;

/// Token ceiling passed to Ollama for each summary call.
const SUMMARY_MAX_TOKENS: u32 = 1_024;

/// Per-call timeout for summary requests, in milliseconds (one minute).
const SUMMARY_TIMEOUT_MS: u64 = 60 * 1_000;
68
69/// Returns the canonical BLAKE3 digest of [`SUMMARY_PROMPT_TEMPLATE`].
70///
71/// Use this to generate the correct `prompt_template_blake3` pin for operator
72/// attestation envelopes that target `OllamaSummaryBackend`.
73#[must_use]
74pub fn canonical_prompt_template_blake3() -> String {
75    format!("blake3:{}", blake3_hex(SUMMARY_PROMPT_TEMPLATE.as_bytes()))
76}
77
78/// Ollama-backed [`SummaryBackend`] for the Phase 4.D decay path.
79///
80/// See the module documentation for construction, prompt shape, and output
81/// validation rules.
82#[derive(Debug, Clone)]
83pub struct OllamaSummaryBackend {
84    adapter: OllamaHttpAdapter,
85    /// Model name as supplied in config; used as the echo fallback when Ollama
86    /// returns a stripped model string.
87    model_name: String,
88    max_output_bytes: usize,
89}
90
91impl OllamaSummaryBackend {
92    /// Construct from an [`OllamaConfig`].
93    ///
94    /// Returns [`SummaryError::CallFailed`] when the config fails the
95    /// loopback-endpoint validation performed by [`OllamaHttpAdapter::new`]
96    /// (non-loopback endpoint, non-pinned model ref, etc.).
97    ///
98    /// Construction does **not** make a network call; transport failures
99    /// surface at [`SummaryBackend::summarize`] time.
100    pub fn new(config: OllamaConfig) -> Result<Self, SummaryError> {
101        let model_name = config.model.clone();
102        let adapter = OllamaHttpAdapter::new(config)
103            .map_err(|e| SummaryError::CallFailed(format!("ollama adapter construction: {e}")))?;
104        Ok(Self {
105            adapter,
106            model_name,
107            max_output_bytes: DEFAULT_MAX_OUTPUT_BYTES,
108        })
109    }
110
111    /// Attempt a minimal probe call to verify Ollama is reachable.
112    ///
113    /// Sends a single-token request with the configured model and returns
114    /// `Ok(())` when the call succeeds (HTTP 200, parseable response body).
115    /// Returns `Err` on any transport or HTTP error.
116    ///
117    /// This is intentionally cheap: callers can use it to decide whether to
118    /// fall back to [`crate::NoopSummaryBackend`] before committing to a
119    /// full decay run, without affecting the ledger.
120    pub fn probe(&self) -> Result<(), SummaryError> {
121        let req = LlmRequest {
122            model: self.model_name.clone(),
123            system: String::new(),
124            messages: vec![LlmMessage {
125                role: LlmRole::User,
126                content: "ping".into(),
127            }],
128            temperature: 0.0,
129            max_tokens: 1,
130            json_schema: None,
131            timeout_ms: 5_000,
132        };
133        let rt = build_rt()?;
134        rt.block_on(self.adapter.complete(req))
135            .map(|_| ())
136            .map_err(|e| SummaryError::CallFailed(format!("ollama probe: {e}")))
137    }
138}
139
140impl SummaryBackend for OllamaSummaryBackend {
141    fn summarize(&self, request: &SummaryRequest) -> Result<SummaryResponse, SummaryError> {
142        // Prompt template pin check.
143        let expected_blake3 = canonical_prompt_template_blake3();
144        if request.prompt_template_blake3 != expected_blake3 {
145            return Err(SummaryError::PromptTemplateMismatch(format!(
146                "request pin `{}` != backend template `{}`",
147                request.prompt_template_blake3, expected_blake3,
148            )));
149        }
150
151        // Build the prompt.
152        let events_joined = request
153            .source_claims
154            .iter()
155            .map(|c| format!("- {c}"))
156            .collect::<Vec<_>>()
157            .join("\n");
158        let prompt_text = SUMMARY_PROMPT_TEMPLATE.replace("{events}", &events_joined);
159
160        let byte_budget = request
161            .max_output_bytes
162            .unwrap_or(self.max_output_bytes)
163            .min(self.max_output_bytes);
164
165        let llm_req = LlmRequest {
166            model: request.model_name.clone(),
167            system: String::new(),
168            messages: vec![LlmMessage {
169                role: LlmRole::User,
170                content: prompt_text,
171            }],
172            temperature: 0.0,
173            max_tokens: SUMMARY_MAX_TOKENS,
174            json_schema: None,
175            timeout_ms: SUMMARY_TIMEOUT_MS,
176        };
177
178        let rt = build_rt()?;
179        let llm_resp = rt
180            .block_on(self.adapter.complete(llm_req))
181            .map_err(|e| SummaryError::CallFailed(e.to_string()))?;
182
183        // Ollama may strip the sha256 digest from the echoed model name.
184        // When that happens we substitute the configured model name so the
185        // decay runner's pin assertion succeeds (the attestation bound the
186        // configured name, not the stripped alias Ollama echoes).
187        let echoed_model = if llm_resp.model == request.model_name {
188            llm_resp.model.clone()
189        } else {
190            tracing::debug!(
191                adapter_echoed = %llm_resp.model,
192                configured = %self.model_name,
193                "ollama_summary: model echo mismatch; substituting configured model name"
194            );
195            self.model_name.clone()
196        };
197
198        if llm_resp.text.is_empty() {
199            return Err(SummaryError::OutputValidationFailed(
200                "ollama returned an empty summary".to_string(),
201            ));
202        }
203        if llm_resp.text.len() > byte_budget {
204            return Err(SummaryError::OutputValidationFailed(format!(
205                "summary byte length {} exceeds budget {}",
206                llm_resp.text.len(),
207                byte_budget,
208            )));
209        }
210
211        let token_usage = llm_resp.usage.map(|u| TokenUsage {
212            prompt_tokens: u.prompt_tokens,
213            completion_tokens: u.completion_tokens,
214        });
215
216        Ok(SummaryResponse {
217            claim: llm_resp.text,
218            token_usage,
219            model_name_echoed: echoed_model,
220        })
221    }
222}
223
224fn build_rt() -> Result<tokio::runtime::Runtime, SummaryError> {
225    tokio::runtime::Builder::new_current_thread()
226        .enable_all()
227        .build()
228        .map_err(|e| SummaryError::CallFailed(format!("tokio runtime construction failed: {e}")))
229}
230
#[cfg(test)]
mod tests {
    use super::*;
    use crate::summary::{ReplaySummaryBackend, ReplaySummaryFixtureEntry, SummaryResponse};

    /// Digest-pinned model reference shared by every fixture in this module.
    const PINNED_MODEL: &str =
        "llama3.1:8b@sha256:0000000000000000000000000000000000000000000000000000000000000000";

    /// A well-formed request carrying the canonical template pin.
    fn sample_request() -> SummaryRequest {
        SummaryRequest {
            model_name: PINNED_MODEL.into(),
            prompt_template_blake3: canonical_prompt_template_blake3(),
            source_claims: vec!["fact A".into(), "fact B".into()],
            max_output_bytes: Some(512),
            decay_job_id: Some("dcy_01ARZ3NDEKTSV4RRFFQ69G5FAV".into()),
        }
    }

    /// A response for `claim` that echoes the pinned model and has no usage.
    fn sample_response(claim: &str) -> SummaryResponse {
        SummaryResponse {
            claim: claim.into(),
            token_usage: None,
            model_name_echoed: PINNED_MODEL.into(),
        }
    }

    #[test]
    fn canonical_blake3_has_expected_prefix() {
        let d = canonical_prompt_template_blake3();
        assert!(d.starts_with("blake3:"), "got {d}");
        // 7 (prefix) + 64 (hex) = 71
        assert_eq!(d.len(), 71, "got {d}");
    }

    #[test]
    fn prompt_template_mismatch_returns_error() {
        // Exercise the pin check on the real OllamaSummaryBackend by sending
        // a request whose template pin is wrong.
        let mut req = sample_request();
        req.prompt_template_blake3 = "blake3:wrong".into();

        // Construct a backend pointing at an unreachable address — the pin
        // check fires before any network call.
        let config = OllamaConfig::new("http://127.0.0.1:19999", PINNED_MODEL);
        let backend = OllamaSummaryBackend::new(config).expect("construct");
        let err = backend.summarize(&req).unwrap_err();
        assert!(
            matches!(err, SummaryError::PromptTemplateMismatch(_)),
            "got {err:?}"
        );
    }

    #[test]
    fn replay_backend_round_trips_ollama_request() {
        // Demonstrate that a ReplaySummaryBackend can serve the same
        // SummaryRequest shape this backend would issue, enabling CI
        // testing of decay runs without a live Ollama.
        let req = sample_request();
        let resp = sample_response("fact A and fact B combined");

        let backend = ReplaySummaryBackend::from_entries(vec![ReplaySummaryFixtureEntry {
            request: req.clone(),
            response: resp.clone(),
        }])
        .expect("build replay backend");

        let got = backend.summarize(&req).expect("hit");
        assert_eq!(got.claim, resp.claim);
        assert_eq!(got.model_name_echoed, resp.model_name_echoed);
    }

    #[test]
    fn replay_backend_miss_returns_backend_not_configured() {
        let req = sample_request();
        let resp = sample_response("some summary");
        let backend = ReplaySummaryBackend::from_entries(vec![ReplaySummaryFixtureEntry {
            request: req,
            response: resp,
        }])
        .expect("build");

        // Same model and pin, but claims the fixture has never seen.
        let other = SummaryRequest {
            model_name: PINNED_MODEL.into(),
            prompt_template_blake3: canonical_prompt_template_blake3(),
            source_claims: vec!["never seen claim".into()],
            max_output_bytes: Some(512),
            decay_job_id: None,
        };
        let err = backend.summarize(&other).unwrap_err();
        assert_eq!(err, crate::summary::SummaryError::BackendNotConfigured);
    }

    #[test]
    fn end_to_end_via_mock_tcp_server() {
        use std::io::{BufRead, BufReader, Read, Write};
        use std::net::TcpListener;

        // Port 0 lets the OS pick a free loopback port.
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind mock");
        let addr = listener.local_addr().expect("local addr");

        let summary_text = "Fact A and fact B are both true.";
        let model_name = PINNED_MODEL;
        let response_body = serde_json::json!({
            "model": model_name,
            "message": { "role": "assistant", "content": summary_text },
            "done": true
        })
        .to_string();
        let http_response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\n\r\n{}",
            response_body.len(),
            response_body
        );

        // One-shot mock server: accept a single connection, consume the
        // request, then reply with the canned response.
        let server = std::thread::spawn(move || {
            let (mut stream, _) = listener.accept().expect("accept");
            let mut reader = BufReader::new(stream.try_clone().expect("clone"));

            // Read the request head up to the blank line (or EOF), noting
            // the declared body length on the way.
            let mut content_length: Option<usize> = None;
            let mut line = String::new();
            loop {
                line.clear();
                let read = reader.read_line(&mut line).expect("read line");
                if read == 0 || line == "\r\n" {
                    break;
                }
                if let Some((name, value)) = line.split_once(':') {
                    if name.trim().eq_ignore_ascii_case("content-length") {
                        content_length = value.trim().parse().ok();
                    }
                }
            }

            // Consume the whole body before answering, so the reply is never
            // written while the client is still sending its request.
            match content_length {
                Some(len) => {
                    let mut body = vec![0u8; len];
                    reader.read_exact(&mut body).expect("read body");
                }
                None => {
                    // No declared length: fall back to one best-effort read.
                    let mut buf = vec![0u8; 8192];
                    let _ = reader.read(&mut buf);
                }
            }

            stream
                .write_all(http_response.as_bytes())
                .expect("write response");
        });

        let config = OllamaConfig::new(format!("http://{addr}"), model_name);
        let backend = OllamaSummaryBackend::new(config).expect("construct");

        let request = SummaryRequest {
            model_name: model_name.into(),
            prompt_template_blake3: canonical_prompt_template_blake3(),
            source_claims: vec!["fact A".into(), "fact B".into()],
            max_output_bytes: None,
            decay_job_id: Some("dcy_test".into()),
        };

        let resp = backend.summarize(&request).expect("summarize");
        server.join().expect("server thread");

        assert_eq!(resp.claim, summary_text);
        assert_eq!(resp.model_name_echoed, model_name);
    }
}
385}