ai-memory 0.7.1

AI-agnostic persistent memory system — MCP server, HTTP API, and CLI for any AI platform
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
// Copyright 2026 AlphaOne LLC
// SPDX-License-Identifier: Apache-2.0

//! v0.7.0 WT-1-B — atomisation curator.
//!
//! The curator is the LLM-facing half of the atomisation engine: it
//! consumes one long memory body, asks Gemma 4 (E2B at the `smart`
//! tier, E4B at `autonomous`) to decompose it into atomic propositions,
//! parses the structured JSON response, validates per-atom token
//! budgets via `tiktoken-rs::cl100k_base`, and returns a `Vec<Atom>`
//! ready for the substrate writer in [`super::Atomiser::atomise`].
//!
//! The curator is intentionally factored as a trait
//! ([`Curator`]) so the substrate test suite can inject a deterministic
//! mock (see `tests/atomisation/core`). The production implementation
//! ([`LlmCurator`]) wraps an `OllamaClient` and is hot-path only when
//! the daemon's tier resolves to `smart` or higher.
//!
//! # Retry contract
//!
//! Malformed JSON responses retry up to `curator_max_retries` times
//! (default 3) with exponential backoff (100 ms → 500 ms → 2500 ms).
//! Each retry re-sends the original prompt verbatim — the LLM call is
//! stateless on our side. After the final attempt fails, the curator
//! surfaces [`CuratorError::MalformedResponse`] carrying the last
//! parser diagnostic; [`super::Atomiser::atomise`] maps that to
//! [`super::AtomiseError::CuratorFailed`].
//!
//! # Token-budget contract
//!
//! Atoms slightly over budget (up to 25% over `max_atom_tokens`) are
//! accepted as-is — the curator emits a warn-level log line and
//! proceeds. The rationale is documented in the WT-1-B brief
//! ("fail-soft: accept atoms slightly over budget rather than
//! retry-loop"). Atoms more than 25% over budget are dropped by
//! [`enforce_token_budget`], also with a warn-level log line. The
//! substrate writer is the authoritative gate on memory size
//! (governed by `validate::validate_content`), not the curator.

use std::sync::Mutex;
use std::time::Duration;

use serde::Deserialize;

/// One proposed atom returned by the curator.
///
/// The wire shape mirrors the JSON the LLM emits — `{"text": "..."}` —
/// so each element of [`CuratorResponse::atoms`] deserialises straight
/// into this struct when the response is parsed with
/// `serde_json::from_str::<CuratorResponse>`, with no further fixup.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
pub struct Atom {
    /// Self-contained atomic proposition. Expected to be ≤ `max_atom_tokens`
    /// when measured with `cl100k_base`; [`enforce_token_budget`] keeps
    /// atoms up to 25% over that budget (with a warning) and drops
    /// anything larger rather than retrying.
    pub text: String,
}

/// Top-level wire shape returned by the LLM.
///
/// `atoms` is the list of decomposed propositions. An empty array
/// signals "this input cannot be decomposed" — see the prompt
/// contract; the substrate handler maps that to
/// [`super::AtomiseError::SourceTooSmall`].
#[derive(Debug, Clone, Deserialize)]
pub struct CuratorResponse {
    /// Decomposed propositions, in the order the LLM emitted them.
    /// The field is required: a body without an `atoms` key fails to
    /// deserialise (see `parse_response_rejects_garbage` in the tests).
    pub atoms: Vec<Atom>,
}

/// Errors the curator can report to its caller.
///
/// Every variant wraps a human-readable diagnostic string; the
/// substrate `atomise` flow folds them into the typed
/// [`super::AtomiseError::CuratorFailed`] variant.
#[derive(Debug)]
pub enum CuratorError {
    /// The LLM could not be reached, answered with an HTTP error, or
    /// otherwise produced no body. This layer does NOT retry such
    /// failures (the underlying `OllamaClient` already retries
    /// transient ones); the substrate caller chooses between surfacing
    /// the error and falling back.
    LlmUnavailable(String),
    /// A body came back but could not be parsed as a
    /// [`CuratorResponse`] (missing `atoms`, wrong types, trailing
    /// garbage after the JSON, etc.). Holds the last parse diagnostic,
    /// reported only AFTER every retry has been used up.
    MalformedResponse(String),
}

impl std::fmt::Display for CuratorError {
    /// Renders as `"<category>: <diagnostic>"`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the fixed category prefix first, then format once.
        let (category, diagnostic) = match self {
            Self::LlmUnavailable(diagnostic) => ("curator LLM unavailable", diagnostic),
            Self::MalformedResponse(diagnostic) => ("curator response malformed", diagnostic),
        };
        write!(f, "{category}: {diagnostic}")
    }
}

impl std::error::Error for CuratorError {}

/// Trait surface the [`super::Atomiser`] consumes.
///
/// The trait abstracts over the LLM round-trip so unit tests can
/// inject a deterministic stub (canned JSON, programmable
/// failure-then-success sequences) without standing up an Ollama
/// process. The production implementation [`LlmCurator`] performs
/// the real network call.
///
/// The trait method is sync (matching the rest of the curator surface
/// in this crate). The Ollama `generate` call is itself blocking-on-
/// HTTP-thread; the substrate `atomise` orchestrator runs on a thread
/// the caller manages.
pub trait Curator: Send + Sync {
    /// Decompose `body` into atomic propositions, each ≤ `max_atom_tokens`.
    ///
    /// Implementations MUST:
    /// 1. Send the canonical system prompt (see [`CURATOR_SYSTEM_PROMPT`]) — the
    ///    `{max_atom_tokens}` placeholder is substituted with the
    ///    caller-supplied value.
    /// 2. Parse the response body as a [`CuratorResponse`]. Retry up
    ///    to `max_retries` times on malformed JSON with exponential
    ///    backoff (100 ms / 500 ms / 2500 ms). `max_retries` counts
    ///    retries on top of the initial attempt, so [`LlmCurator`]
    ///    makes at most `max_retries + 1` LLM calls.
    /// 3. Validate per-atom token counts via
    ///    [`crate::storage::count_tokens_cl100k`]. Atoms slightly
    ///    over budget (≤ 25% overshoot) are accepted and
    ///    `tracing::warn!`-logged; gross over-budget atoms (> 25%)
    ///    are dropped with a warning. [`LlmCurator`] does this through
    ///    [`enforce_token_budget`] and does NOT retry on over-budget
    ///    atoms.
    /// 4. Bound the returned vec to `[2..=10]` atoms per the prompt
    ///    contract. An empty vec is a legitimate "cannot decompose"
    ///    signal — the caller maps that to `SourceTooSmall`.
    ///    NOTE(review): [`LlmCurator`] relies on the prompt for this
    ///    bound; nothing in this module truncates or rejects a
    ///    response with fewer than 2 or more than 10 atoms — confirm
    ///    whether the caller enforces it.
    ///
    /// # Errors
    ///
    /// Returns a [`CuratorError`]; in [`LlmCurator`] this is
    /// `LlmUnavailable` as soon as a generate call fails, or
    /// `MalformedResponse` once every attempt has produced an
    /// unparseable body.
    fn decompose(
        &self,
        body: &str,
        max_atom_tokens: u32,
        max_retries: u32,
    ) -> Result<Vec<Atom>, CuratorError>;
}

/// System prompt sent to the LLM, kept verbatim. The
/// `{max_atom_tokens}` placeholder is filled in at call time by
/// [`render_system_prompt`]. The JSON response shape is pinned by this
/// text — the parser relies on exactly this `{ atoms: [...] }`
/// envelope.
///
/// Copied unmodified from the WT-1-B brief so a later audit can grep
/// this constant in source and compare it against the spec doc.
pub const CURATOR_SYSTEM_PROMPT: &str =
    "You are decomposing a long memory into atomic propositions.
Each atom must:
(1) Be self-contained — readable without the original context
(2) Be at most {max_atom_tokens} tokens
(3) Contain exactly one fact, decision, observation, or relation
(4) Preserve original meaning — no editorial additions
Return JSON: { atoms: [{ text: string }] } with 2 to 10 atoms.
If the input cannot be decomposed (already atomic, all-or-nothing),
return { atoms: [] }.";

/// Produce the system prompt with the `{max_atom_tokens}` placeholder
/// replaced by the decimal rendering of `max_atom_tokens`.
#[must_use]
pub fn render_system_prompt(max_atom_tokens: u32) -> String {
    const BUDGET_PLACEHOLDER: &str = "{max_atom_tokens}";
    let budget = max_atom_tokens.to_string();
    CURATOR_SYSTEM_PROMPT.replace(BUDGET_PLACEHOLDER, &budget)
}

/// Try to parse one candidate response body into a [`CuratorResponse`].
///
/// Three strategies are tried in order, stopping at the first success:
///
/// 1. Parse `body` directly.
/// 2. Strip a surrounding markdown code fence (```json … ```) and
///    parse what is left. LLMs frequently emit fences even when the
///    prompt asks for raw JSON.
/// 3. Extract the first balanced `{ ... }` object from the
///    fence-stripped text and parse that. This tolerates
///    "Here are the atoms:\n{ ... }" style preambles and trailers.
///
/// This is the same defensive shape used by
/// `crate::llm::OllamaClient::auto_tag` and the reflection curator's
/// summariser.
///
/// # Errors
///
/// When every strategy fails, returns the `serde_json` error message
/// from the first (direct) attempt verbatim, so the retry loop can
/// surface it in [`CuratorError::MalformedResponse`].
pub fn parse_response(body: &str) -> Result<CuratorResponse, String> {
    // Strategy 1 — direct parse. Hold on to its error: it describes the
    // body exactly as the LLM sent it, which makes it the most useful
    // diagnostic, and keeping it avoids re-parsing `body` later.
    let direct_err = match serde_json::from_str::<CuratorResponse>(body) {
        Ok(resp) => return Ok(resp),
        Err(e) => e,
    };

    // Strategy 2 — strip markdown fences and parse the remainder.
    let stripped = strip_code_fence(body);
    if let Ok(resp) = serde_json::from_str::<CuratorResponse>(&stripped) {
        return Ok(resp);
    }

    // Strategy 3 — pull the first balanced JSON object out of any
    // surrounding prose and parse just that.
    if let Some(resp) = extract_first_json_object(&stripped)
        .and_then(|candidate| serde_json::from_str::<CuratorResponse>(&candidate).ok())
    {
        return Ok(resp);
    }

    Err(direct_err.to_string())
}

/// Remove a surrounding markdown code fence from a candidate response.
///
/// After trimming outer whitespace, one leading "```json", "```JSON"
/// or bare "```" opener is removed (first match wins, in that order),
/// followed by any newlines directly after it; then one trailing
/// "```" is removed if present. The result is trimmed again. Input
/// without fences comes back merely trimmed.
fn strip_code_fence(s: &str) -> String {
    // Longest openers first so the language tag goes with the fence.
    const OPENERS: [&str; 3] = ["```json", "```JSON", "```"];

    let outer = s.trim();
    let after_opener = OPENERS
        .iter()
        .find_map(|opener| outer.strip_prefix(*opener))
        .unwrap_or(outer)
        .trim_start_matches('\n');
    let inner = match after_opener.strip_suffix("```") {
        Some(without_closer) => without_closer,
        None => after_opener,
    };
    inner.trim().to_string()
}

/// Extract the first balanced `{ ... }` substring of `s`.
///
/// The scan is byte-wise and tracks JSON string state (including
/// backslash escapes), so braces inside string literals do not affect
/// the brace counter. A stray `}` that appears before any `{` has been
/// opened is ignored, so prose such as `"oops } {...}"` still yields
/// the object that follows.
///
/// Returns `None` when `s` contains no complete object (no `{`, or the
/// first `{` is never closed).
fn extract_first_json_object(s: &str) -> Option<String> {
    let mut depth = 0usize;
    let mut start: Option<usize> = None;
    let mut in_string = false;
    // True when the previous byte inside a string was an unescaped
    // backslash, i.e. the current byte is escaped.
    let mut escaped = false;

    for (i, b) in s.bytes().enumerate() {
        if in_string {
            if escaped {
                escaped = false;
            } else if b == b'\\' {
                escaped = true;
            } else if b == b'"' {
                in_string = false;
            }
            continue;
        }
        match b {
            b'"' => in_string = true,
            b'{' => {
                if depth == 0 {
                    start = Some(i);
                }
                depth += 1;
            }
            // Only a `}` that closes something counts. An unmatched one
            // at depth 0 must not push the counter negative, or the
            // real object after it would never balance back to zero.
            b'}' if depth > 0 => {
                depth -= 1;
                if depth == 0 {
                    if let Some(s0) = start {
                        // `{` and `}` are ASCII, so both indices sit on
                        // char boundaries and the slice cannot panic.
                        return Some(s[s0..=i].to_string());
                    }
                }
            }
            _ => {}
        }
    }
    None
}

/// Token-budget guardrail applied to the atoms of one parsed response.
///
/// Each atom's token count (via
/// [`crate::storage::count_tokens_cl100k`]) is compared against two
/// thresholds: `max_atom_tokens` and a hard cap of
/// `max_atom_tokens + max_atom_tokens / 4` (saturating). Atoms within
/// budget are kept silently; atoms above budget but within the hard
/// cap are kept with a warn-level log; atoms above the hard cap are
/// dropped with a warn-level log, so a pathological response cannot
/// pollute the memory store.
///
/// Returns `(atoms_kept, atoms_dropped)`: the surviving atoms in
/// their original order and the number removed, so the caller can
/// telemetry-log how often the cap fires.
#[must_use]
pub fn enforce_token_budget(atoms: Vec<Atom>, max_atom_tokens: u32) -> (Vec<Atom>, usize) {
    let hard_cap = max_atom_tokens.saturating_add(max_atom_tokens / 4);
    let offered = atoms.len();

    let mut kept = atoms;
    kept.retain(|atom| {
        // Counts that do not fit in u32 are treated as maximally large.
        let tokens =
            u32::try_from(crate::storage::count_tokens_cl100k(&atom.text)).unwrap_or(u32::MAX);

        if tokens > hard_cap {
            tracing::warn!(
                target: "atomisation::curator",
                atom_tokens = tokens,
                hard_cap,
                "atom grossly over token budget — dropping"
            );
            return false;
        }
        if tokens > max_atom_tokens {
            tracing::warn!(
                target: "atomisation::curator",
                atom_tokens = tokens,
                budget = max_atom_tokens,
                "atom slightly over token budget — accepting (fail-soft)"
            );
        }
        true
    });

    let dropped = offered - kept.len();
    (kept, dropped)
}

/// Backoff delay for the curator retry loop, keyed by the zero-based
/// retry attempt: 100 ms, then 500 ms, then 2500 ms. Any attempt
/// index past the end of the schedule reuses the final 2500 ms delay,
/// so an oversized retry cap never causes a `panic!`.
#[must_use]
pub fn backoff_for_attempt(attempt: u32) -> Duration {
    let millis: u64 = match attempt {
        0 => 100,
        1 => 500,
        _ => 2500,
    };
    Duration::from_millis(millis)
}

// ---------------------------------------------------------------------------
// LlmCurator — production impl backed by `crate::llm::OllamaClient`
// ---------------------------------------------------------------------------

/// Production curator. Wraps an `OllamaClient` (or any
/// `crate::autonomy::AutonomyLlm`-like surface — we re-use the
/// existing `generate` shape via a free function rather than coupling
/// to the autonomy trait, because the autonomy trait does not expose
/// `generate(prompt, system)`).
///
/// The type is generic over [`LlmGenerate`], which is how the unit
/// tests substitute a scripted mock for the real client.
pub struct LlmCurator<L: LlmGenerate + Send + Sync> {
    /// Backend used for every generate call made by `decompose`.
    llm: L,
    /// Sleep function. Production passes `std::thread::sleep`; tests
    /// pass a no-op to keep the suite fast.
    ///
    /// Stored behind a `Mutex` because the closure is `FnMut` while
    /// `decompose` only has `&self`. The lock is held for the whole
    /// backoff call, so concurrent `decompose` calls on one curator
    /// take their backoff sleeps one at a time.
    sleep: Mutex<Box<dyn FnMut(Duration) + Send + Sync>>,
}

/// Minimal generate surface the curator needs. Implemented for
/// `crate::llm::OllamaClient` in the same module; the trait stays
/// here (not in `src/llm.rs`) so external callers don't accidentally
/// pull it into their wire path.
pub trait LlmGenerate {
    /// Run a single generate cycle. Returns the response body verbatim
    /// (no trimming, no fence-stripping — `parse_response` handles
    /// that).
    ///
    /// `prompt` is the user-side text (the curator passes the memory
    /// body); `system` is the optional system prompt.
    ///
    /// # Errors
    ///
    /// Returns a [`CuratorError`]; the implementations in this module
    /// map every backend failure to `CuratorError::LlmUnavailable`.
    fn generate(&self, prompt: &str, system: Option<&str>) -> Result<String, CuratorError>;
}

/// Adapter from the client's own `generate` to the curator's
/// [`LlmGenerate`] surface: the body passes through untouched and any
/// client error is stringified into [`CuratorError::LlmUnavailable`].
impl LlmGenerate for crate::llm::OllamaClient {
    fn generate(&self, prompt: &str, system: Option<&str>) -> Result<String, CuratorError> {
        match Self::generate(self, prompt, system) {
            Ok(response_body) => Ok(response_body),
            Err(client_err) => Err(CuratorError::LlmUnavailable(client_err.to_string())),
        }
    }
}

/// Pass-through impl for a shared `Arc<OllamaClient>`. It lets the MCP
/// wiring at `mcp::run_mcp_server` hand the daemon's existing
/// `Arc<OllamaClient>` — already shared by the auto-tag /
/// expand-query / detect-contradiction surface — to the WT-1-C
/// atomiser without cloning the underlying connection pool.
impl LlmGenerate for std::sync::Arc<crate::llm::OllamaClient> {
    fn generate(&self, prompt: &str, system: Option<&str>) -> Result<String, CuratorError> {
        // Borrow the client out of the Arc and delegate to it.
        let client: &crate::llm::OllamaClient = self.as_ref();
        match crate::llm::OllamaClient::generate(client, prompt, system) {
            Ok(response_body) => Ok(response_body),
            Err(client_err) => Err(CuratorError::LlmUnavailable(client_err.to_string())),
        }
    }
}

impl<L: LlmGenerate + Send + Sync> LlmCurator<L> {
    /// Construct a curator with the supplied LLM and the real
    /// `std::thread::sleep` for retry backoff.
    pub fn new(llm: L) -> Self {
        Self {
            llm,
            sleep: Mutex::new(Box::new(std::thread::sleep)),
        }
    }

    /// Construct a curator with an injected sleep — used by the
    /// unit test below to keep the suite under one second.
    #[cfg(test)]
    pub fn with_sleep<F>(llm: L, sleep: F) -> Self
    where
        F: FnMut(Duration) + Send + Sync + 'static,
    {
        Self {
            llm,
            sleep: Mutex::new(Box::new(sleep)),
        }
    }
}

impl<L: LlmGenerate + Send + Sync> Curator for LlmCurator<L> {
    fn decompose(
        &self,
        body: &str,
        max_atom_tokens: u32,
        max_retries: u32,
    ) -> Result<Vec<Atom>, CuratorError> {
        let system = render_system_prompt(max_atom_tokens);
        let mut last_err = String::from("no attempts made");
        for attempt in 0..=max_retries {
            let resp = self.llm.generate(body, Some(&system))?;
            match parse_response(&resp) {
                Ok(parsed) => {
                    let (kept, _dropped) = enforce_token_budget(parsed.atoms, max_atom_tokens);
                    return Ok(kept);
                }
                Err(e) => {
                    last_err = e;
                    if attempt < max_retries {
                        let backoff = backoff_for_attempt(attempt);
                        if let Ok(mut s) = self.sleep.lock() {
                            (s)(backoff);
                        }
                    }
                }
            }
        }
        Err(CuratorError::MalformedResponse(last_err))
    }
}

// ---------------------------------------------------------------------------
// Unit tests — pure logic. Mocked LLM. No DB, no network.
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;

    /// Mock that returns a programmable sequence of responses, one per
    /// `generate` call, and counts how many calls were made.
    ///
    /// NOTE(review): the original comment said this is also used by the
    /// integration suite; it lives in a `#[cfg(test)]` module, so confirm
    /// whether anything outside this file can actually reach it.
    pub(crate) struct MockLlm {
        responses: Mutex<Vec<Result<String, CuratorError>>>,
        calls: Mutex<usize>,
    }

    impl MockLlm {
        /// Script the mock with `responses`, served front to back.
        pub fn new(responses: Vec<Result<String, CuratorError>>) -> Self {
            Self {
                responses: Mutex::new(responses),
                calls: Mutex::new(0),
            }
        }

        /// Number of `generate` calls observed so far.
        pub fn call_count(&self) -> usize {
            *self.calls.lock().unwrap()
        }
    }

    impl LlmGenerate for Arc<MockLlm> {
        /// Pops the next scripted response; once the script is exhausted
        /// every call fails with `LlmUnavailable`.
        fn generate(&self, _prompt: &str, _system: Option<&str>) -> Result<String, CuratorError> {
            let mut calls = self.calls.lock().unwrap();
            *calls += 1;
            let mut rs = self.responses.lock().unwrap();
            if rs.is_empty() {
                return Err(CuratorError::LlmUnavailable(
                    "mock: no responses left".into(),
                ));
            }
            rs.remove(0)
        }
    }

    /// The placeholder is replaced by the numeric budget and does not
    /// survive in the rendered prompt.
    #[test]
    fn render_prompt_substitutes_max_atom_tokens() {
        let p = render_system_prompt(200);
        assert!(p.contains("at most 200 tokens"));
        assert!(!p.contains("{max_atom_tokens}"));
    }

    /// Strategy 1: a bare JSON body parses directly.
    #[test]
    fn parse_response_accepts_direct_json() {
        let body = r#"{"atoms":[{"text":"alpha"},{"text":"beta"}]}"#;
        let r = parse_response(body).unwrap();
        assert_eq!(r.atoms.len(), 2);
        assert_eq!(r.atoms[0].text, "alpha");
    }

    /// Strategy 2: a ```json fenced body parses after fence stripping.
    #[test]
    fn parse_response_strips_markdown_fence() {
        let body = "```json\n{\"atoms\":[{\"text\":\"alpha\"}]}\n```";
        let r = parse_response(body).unwrap();
        assert_eq!(r.atoms.len(), 1);
    }

    /// Strategy 3: prose before and after the object is tolerated.
    #[test]
    fn parse_response_extracts_object_with_preamble() {
        let body = "Sure, here's the JSON:\n{\"atoms\":[{\"text\":\"alpha\"}]}\nThanks!";
        let r = parse_response(body).unwrap();
        assert_eq!(r.atoms.len(), 1);
    }

    #[test]
    fn parse_response_empty_atoms_is_valid() {
        // "Cannot decompose" signal — substrate maps to SourceTooSmall.
        let body = r#"{"atoms":[]}"#;
        let r = parse_response(body).unwrap();
        assert_eq!(r.atoms.len(), 0);
    }

    /// Non-JSON text, empty input, and JSON without an `atoms` key are
    /// all rejected.
    #[test]
    fn parse_response_rejects_garbage() {
        assert!(parse_response("nope nope nope").is_err());
        assert!(parse_response("").is_err());
        assert!(parse_response(r#"{"wrong":"shape"}"#).is_err());
    }

    /// Atoms comfortably inside the budget are all kept.
    #[test]
    fn enforce_token_budget_keeps_in_budget() {
        let atoms = vec![
            Atom {
                text: "small atom".to_string(),
            },
            Atom {
                text: "another small atom".to_string(),
            },
        ];
        let (kept, dropped) = enforce_token_budget(atoms, 200);
        assert_eq!(kept.len(), 2);
        assert_eq!(dropped, 0);
    }

    /// An atom far beyond the hard cap is dropped while its in-budget
    /// sibling survives.
    #[test]
    fn enforce_token_budget_drops_grossly_over() {
        // Build a string that is firmly over the 25% overshoot window.
        let huge: String = "word ".repeat(500);
        let atoms = vec![
            Atom {
                text: "fine".to_string(),
            },
            Atom { text: huge },
        ];
        let (kept, dropped) = enforce_token_budget(atoms, 10);
        assert_eq!(kept.len(), 1);
        assert_eq!(dropped, 1);
    }

    /// Pins the 100/500/2500 ms schedule and the clamp for
    /// out-of-range attempt indices.
    #[test]
    fn backoff_schedule_is_monotonic_and_bounded() {
        assert_eq!(backoff_for_attempt(0), Duration::from_millis(100));
        assert_eq!(backoff_for_attempt(1), Duration::from_millis(500));
        assert_eq!(backoff_for_attempt(2), Duration::from_millis(2500));
        assert_eq!(backoff_for_attempt(99), Duration::from_millis(2500));
    }

    /// A parseable first response means exactly one LLM call.
    #[test]
    fn curator_succeeds_on_first_attempt() {
        let mock = Arc::new(MockLlm::new(vec![Ok(
            r#"{"atoms":[{"text":"alpha"},{"text":"beta"}]}"#.to_string(),
        )]));
        let curator = LlmCurator::with_sleep(mock.clone(), |_| {});
        let atoms = curator.decompose("input", 200, 3).unwrap();
        assert_eq!(atoms.len(), 2);
        assert_eq!(mock.call_count(), 1);
    }

    /// Two malformed responses are retried; the third, valid one wins.
    #[test]
    fn curator_retries_on_malformed_then_succeeds() {
        let mock = Arc::new(MockLlm::new(vec![
            Ok("garbage".to_string()),
            Ok("still garbage".to_string()),
            Ok(r#"{"atoms":[{"text":"alpha"}]}"#.to_string()),
        ]));
        let curator = LlmCurator::with_sleep(mock.clone(), |_| {});
        let atoms = curator.decompose("input", 200, 3).unwrap();
        assert_eq!(atoms.len(), 1);
        assert_eq!(mock.call_count(), 3);
    }

    /// Exhausting every attempt surfaces `MalformedResponse`.
    #[test]
    fn curator_fails_after_max_retries() {
        let mock = Arc::new(MockLlm::new(vec![
            Ok("garbage 1".to_string()),
            Ok("garbage 2".to_string()),
            Ok("garbage 3".to_string()),
            Ok("garbage 4".to_string()),
        ]));
        let curator = LlmCurator::with_sleep(mock.clone(), |_| {});
        // max_retries=3 means 1 initial + 3 retries = 4 total attempts.
        let err = curator.decompose("input", 200, 3).unwrap_err();
        assert!(matches!(err, CuratorError::MalformedResponse(_)));
        assert_eq!(mock.call_count(), 4);
    }

    /// A backend failure is returned as-is rather than being retried
    /// into a `MalformedResponse`.
    #[test]
    fn curator_propagates_llm_unavailable() {
        let mock = Arc::new(MockLlm::new(vec![Err(CuratorError::LlmUnavailable(
            "connection refused".into(),
        ))]));
        let curator = LlmCurator::with_sleep(mock, |_| {});
        let err = curator.decompose("input", 200, 3).unwrap_err();
        assert!(matches!(err, CuratorError::LlmUnavailable(_)));
    }

    /// Drives the production `LlmGenerate for OllamaClient` impl (the
    /// error-mapping arm) AND the real-sleep `LlmCurator::new`
    /// constructor. Points the client at an unreachable loopback port so
    /// `generate` fails fast; `max_retries=0` means the real
    /// `std::thread::sleep` is never actually called (no backoff on the
    /// final attempt), keeping the test sub-second while still
    /// exercising `LlmCurator::new`'s body.
    #[test]
    fn llm_curator_new_with_real_ollama_client_maps_unavailable() {
        // Loopback port 1: a low port assumed to have no listener, so the
        // connection attempt is expected to be refused immediately.
        let client = crate::llm::OllamaClient::new_with_url_no_health_check(
            "http://127.0.0.1:1",
            "test-model",
        )
        .expect("build no-health-check client");
        // Exercise the production `LlmCurator::new` (real sleep) path.
        let curator = LlmCurator::new(client);
        let err = curator.decompose("body", 200, 0).unwrap_err();
        // The OllamaClient generate failed (connection refused) and the
        // production LlmGenerate impl mapped it to LlmUnavailable.
        assert!(
            matches!(err, CuratorError::LlmUnavailable(_)),
            "expected LlmUnavailable, got {err:?}"
        );
    }

    /// Drives the `LlmGenerate for Arc<OllamaClient>` pass-through impl
    /// used by the MCP daemon wiring.
    #[test]
    fn llm_curator_arc_ollama_passthrough_maps_unavailable() {
        let client = Arc::new(
            crate::llm::OllamaClient::new_with_url_no_health_check(
                "http://127.0.0.1:1",
                "test-model",
            )
            .expect("build no-health-check client"),
        );
        let curator = LlmCurator::with_sleep(client, |_| {});
        let err = curator.decompose("body", 200, 0).unwrap_err();
        assert!(matches!(err, CuratorError::LlmUnavailable(_)));
    }

    /// CuratorError Display arms — both variants render their prefix,
    /// and the type is usable as a `dyn std::error::Error`.
    #[test]
    fn curator_error_display_arms() {
        assert_eq!(
            CuratorError::LlmUnavailable("x".into()).to_string(),
            "curator LLM unavailable: x"
        );
        assert_eq!(
            CuratorError::MalformedResponse("y".into()).to_string(),
            "curator response malformed: y"
        );
        let e = CuratorError::MalformedResponse("z".into());
        let _: &dyn std::error::Error = &e;
    }

    #[test]
    fn extract_first_json_object_handles_braces_in_strings() {
        // Brace-counting must NOT be fooled by braces inside JSON strings.
        let s = r#"prefix {"atoms":[{"text":"contains } brace"}]} suffix"#;
        let extracted = extract_first_json_object(s).unwrap();
        let parsed: CuratorResponse = serde_json::from_str(&extracted).unwrap();
        assert_eq!(parsed.atoms.len(), 1);
        assert_eq!(parsed.atoms[0].text, "contains } brace");
    }
}