ai_memory/atomisation/
mod.rs

1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3
4//! v0.7.0 WT-1-B — substrate-level atomisation engine.
5//!
6//! The atomiser is the second hard prereq for the v0.7.0 atomisation
7//! pipeline (WT-1-A schema v36 is the first; WT-1-C/D/E/F all
8//! consume the writer landed here). It takes one long-form memory,
9//! runs the curator pass to decompose it into atomic propositions,
10//! and writes each atom back as a first-class memory with full
11//! provenance:
12//!
13//! * `memories.atom_of` → parent memory id (structural FK, schema v36)
14//! * `memory_links.relation = 'derives_from'` (atom → parent, the
15//!   typed, signable, federation-safe expression of the FK)
16//! * `signed_events` rows for the per-atom write, the per-link write,
17//!   and the final `atomisation_complete` summary event
18//!
19//! The parent memory is archived (`archived_at` set, `atomised_into`
20//! set to `atom_count`) in a SEPARATE post-atom transaction so the
21//! per-atom hooks fire on live writes; downstream consumers walk the
22//! `atom_of` index to surface atoms in place of an atomised parent.
23//!
24//! # Hook integration
25//!
26//! Atoms are first-class memory_store writes — the existing
27//! `pre_store`/`post_store` substrate hooks fire per atom via
28//! [`crate::storage::insert`]. Governance refusal mid-batch returns
29//! [`AtomiseError::GovernanceRefused`] carrying the failing atom
30//! index; prior atoms in the batch are NOT rolled back (they were
31//! valid writes by themselves).
32//!
33//! # Idempotency
34//!
35//! A second `atomise(source_id, ...)` call after a successful first
36//! returns [`AtomiseError::AlreadyAtomised`] with the existing atom
37//! ids. Passing `force=true` skips the idempotency check and mints a
38//! fresh set of atoms; old atoms are retained (their `atom_of`
39//! pointer remains valid), and `atomised_into` is bumped to the new
40//! `atom_count`.
41
42pub mod curator;
43
44use crate::models::ConfidenceSource;
45use std::sync::Arc;
46
47use chrono::Utc;
48use rusqlite::{Connection, params};
49
50use crate::identity::keypair::AgentKeypair;
51use crate::models::{Memory, MemoryKind, MemoryLinkRelation, SourceSpan};
52use crate::signed_events::{SignedEvent, append_signed_event, payload_hash};
53use crate::storage as db;
54use curator::Curator;
55
/// Knobs that shape a single atomisation run.
///
/// The daemon path plumbs these in from `AppConfig`; tests build the
/// struct by hand. [`Default`] yields the WT-1-B brief values, with the
/// Synchronous-mode retry budget trimmed per Cluster-F PERF-5:
///
/// | field                      | default |
/// |----------------------------|---------|
/// | `default_max_atom_tokens`  | 200     |
/// | `min_atoms_per_source`     | 2       |
/// | `max_atoms_per_source`     | 10      |
/// | `curator_max_retries`      | 3       |
/// | `sync_curator_max_retries` | 1       |
///
/// Why the Synchronous budget is smaller: that mode runs inside the
/// operator's `memory_store` envelope, so every extra retry delays the
/// response. With a single retry the second failure surfaces an error
/// and the operator either reruns explicitly or moves on. A namespace
/// can override the value via
/// `GovernancePolicy::auto_atomise_max_retries`.
#[derive(Clone, Debug)]
pub struct AtomiserConfig {
    /// Per-atom token budget applied when the caller supplies none.
    /// Surfaced by the CLI / MCP atomise tool as `max_atom_tokens`.
    pub default_max_atom_tokens: u32,
    /// Fewest atoms a source must yield for the run to count as
    /// productive. A source yielding fewer is treated as already
    /// atomic enough and reported as [`AtomiseError::SourceTooSmall`].
    pub min_atoms_per_source: usize,
    /// Upper bound on atoms kept per source. Guards against an LLM
    /// response that emits dozens of trivial atoms; matches the
    /// prompt envelope ("2 to 10 atoms").
    pub max_atoms_per_source: usize,
    /// Retry budget for a malformed curator response on the deferred /
    /// CLI / explicit `memory_atomise` path. Total attempts are
    /// `1 + curator_max_retries`. The backoff schedule is documented
    /// on [`curator::backoff_for_attempt`].
    pub curator_max_retries: u32,
    /// Cluster-F PERF-5 — retry budget for a malformed curator
    /// response on the latency-sensitive **Synchronous** `pre_store`
    /// path. The default of `1` means two attempts in total; the full
    /// three-retry budget could add roughly 3.1s of backoff
    /// (100ms + 500ms + 2500ms) to the operator's `memory_store`
    /// call. Per-namespace override:
    /// [`crate::models::GovernancePolicy::auto_atomise_max_retries`].
    pub sync_curator_max_retries: u32,
}
100
101impl Default for AtomiserConfig {
102    fn default() -> Self {
103        Self {
104            default_max_atom_tokens: 200,
105            min_atoms_per_source: 2,
106            max_atoms_per_source: 10,
107            curator_max_retries: 3,
108            sync_curator_max_retries: 1,
109        }
110    }
111}
112
/// What a successful atomisation hands back to the caller.
///
/// `atom_ids` lists the newly minted atoms in curator emission order.
/// That order preserves the narrative flow of the source, and the
/// WT-1-C resolver relies on it for the default surface.
#[derive(Clone, Debug)]
pub struct AtomiseResult {
    /// Id of the memory that was decomposed.
    pub source_id: String,
    /// Ids of the atoms written, in curator order.
    pub atom_ids: Vec<String>,
    /// Number of atoms written for this run.
    pub atom_count: usize,
    /// RFC3339 timestamp at which the parent memory was archived,
    /// i.e. when the `atomised_into` write committed. Returned for
    /// telemetry and the MCP `memory_atomise` response shape; audit
    /// trails can use it as the moment the parent went read-only.
    pub archived_at: String,
}
129
/// Every way [`Atomiser::atomise`] can fail.
///
/// Each variant carries enough structured detail for the MCP / HTTP /
/// CLI wrappers to render an operator-readable message without going
/// back to the database.
#[derive(Debug)]
pub enum AtomiseError {
    /// No memory with the requested source id exists (it may have
    /// been hard-deleted).
    NotFound,
    /// The source was atomised by an earlier call.
    /// `existing_atom_ids` lists the atoms whose `atom_of` currently
    /// points at the source. The caller can surface them, or retry
    /// with `force = true` to mint a fresh set.
    AlreadyAtomised {
        source_id: String,
        existing_atom_ids: Vec<String>,
    },
    /// The resolved feature tier is `Keyword`. Atomisation needs the
    /// curator LLM (`Smart` or `Autonomous`); the MCP surface maps
    /// this to a 503-style refusal.
    TierLocked,
    /// The curator round-trip failed after exhausting its retries.
    /// The payload is the last diagnostic, ready for rendering.
    CuratorFailed(String),
    /// No productive decomposition is possible: the source body is
    /// already at or under `max_atom_tokens`, or the curator produced
    /// too few atoms. The caller may surface the source as-is. This is
    /// the "never worth atomising" verdict, as opposed to
    /// `AlreadyAtomised`, which is the "already done" verdict.
    SourceTooSmall,
    /// A `pre_store` substrate hook refused one of the atoms. The
    /// message has the form `atom[<index>]: <reason>`, where the index
    /// is zero-based into the curator's atom list. Atoms before that
    /// index were already committed and are NOT rolled back — see the
    /// module docs for the rationale.
    GovernanceRefused(String),
    /// Signing failed during a per-atom or per-link write; the payload
    /// is the underlying diagnostic.
    SignerError(String),
    /// A database failure (SQL, transaction commit, ...); the payload
    /// is the underlying diagnostic.
    DbError(String),
    /// ARCH-5 (FX-6) — the recursive atomisation depth went past
    /// [`MAX_ATOMISATION_DEPTH`]. Atomisation can chain-fire (for
    /// example an aggressive `pre_store` auto-atomise hook that
    /// triggers another atomise on each derived atom); the
    /// thread-local depth guard installed by
    /// [`enter_atomisation_pass`] bounds that chain. Beyond the cap
    /// the substrate refuses with this variant, which renders the
    /// stable `ATOMISATION_DEPTH_EXCEEDED` slug for MCP / HTTP / CLI
    /// callers.
    ///
    /// The shape follows
    /// [`crate::errors::MemoryError::ReflectionDepthExceeded`] and
    /// [`crate::errors::MemoryError::SynthesisDepthExceeded`] so all
    /// recursive write paths in the substrate refuse the same way.
    DepthExceeded { attempted: u32, cap: u32 },
}
187
188impl std::fmt::Display for AtomiseError {
189    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190        match self {
191            Self::NotFound => f.write_str("atomise: source memory not found"),
192            Self::AlreadyAtomised {
193                source_id,
194                existing_atom_ids,
195            } => write!(
196                f,
197                "atomise: source '{source_id}' already atomised into {} atoms",
198                existing_atom_ids.len()
199            ),
200            Self::TierLocked => f.write_str(
201                "atomise: feature tier is 'keyword' — atomisation requires curator LLM (smart/autonomous)",
202            ),
203            Self::CuratorFailed(d) => write!(f, "atomise: curator failed: {d}"),
204            Self::SourceTooSmall => f.write_str(
205                "atomise: source body already at or under max_atom_tokens — no decomposition possible",
206            ),
207            Self::GovernanceRefused(d) => write!(f, "atomise: governance refused: {d}"),
208            Self::SignerError(d) => write!(f, "atomise: signer error: {d}"),
209            Self::DbError(d) => write!(f, "atomise: db error: {d}"),
210            Self::DepthExceeded { attempted, cap } => write!(
211                f,
212                "ATOMISATION_DEPTH_EXCEEDED: atomisation depth {attempted} would exceed \
213                 compiled max_atomisation_depth {cap}"
214            ),
215        }
216    }
217}
218
219impl std::error::Error for AtomiseError {}
220
221// ---------------------------------------------------------------------------
222// ARCH-5 (FX-6) — atomisation-pass cycle-depth guard.
223//
224// Atomisation can re-enter itself indirectly via the `pre_store` /
225// `post_store` substrate hook chain (a freshly-minted atom is itself a
226// `memory_store` write, which fires the auto-atomise hook against
227// namespaces that opted in; if that hook fires a fresh `atomise_sync`
228// then we have a recursive primitive). Every other recursive primitive
229// in the substrate (reflect, synthesis, kg-query, find-paths,
230// cycle-check) has an explicit named cap with a typed refusal slug;
231// pre-FX-6 the atomiser was the lone outlier, relying solely on the
232// `AlreadyAtomised` idempotency check to break a chain. A misbehaving
233// curator that returns slightly different atom content on each pass
234// (LLM nondeterminism — different atom titles never trip the
235// idempotency check because that check keys on `atomised_into > 0` on
236// the source, not on atom content) could therefore drive an unbounded
237// recursion + OOM.
238//
239// The guard pattern mirrors
240// [`crate::synthesis::enter_synthesis_pass`] verbatim — a thread-local
241// counter (cheap, no allocation per call), an RAII guard that
242// increments on entry + decrements on drop, and a `pub const` cap that
243// the substrate compares against on every entry. Thread-local because
244// parallel HTTP / MCP requests must not share state; each request
245// walks its own call stack and either stays shallow or hits the cap
246// independently.
247// ---------------------------------------------------------------------------
248
/// Hard ceiling on how deeply atomisation passes may nest on one
/// thread.
///
/// An `atomise_sync_with_retries` call running at depth `N` can
/// indirectly start another atomise — for instance when the
/// `pre_store` auto-atomise hook fires on a freshly minted atom in a
/// namespace that opted in. Each nesting level adds 1 to the
/// per-thread counter. When the counter goes above this value the
/// pass is refused with [`AtomiseError::DepthExceeded`], which
/// renders the stable slug `ATOMISATION_DEPTH_EXCEEDED`.
///
/// The value matches [`crate::synthesis::MAX_SYNTHESIS_DEPTH`] and the
/// `effective_max_reflection_depth` ceiling of 3: enough headroom for
/// legitimate two-step curator hand-offs, while still bounding
/// recursion.
pub const MAX_ATOMISATION_DEPTH: u32 = 3;

/// Lower bound for an accepted `max_atom_tokens`. Anything smaller
/// cannot hold a self-contained proposition.
pub const MIN_ATOM_TOKENS: u32 = 50;

/// Upper bound for an accepted `max_atom_tokens`. Anything larger is
/// no longer atomic.
pub const MAX_ATOM_TOKENS: u32 = 1_000;

/// `max_atom_tokens` used when the caller passes none (or null).
pub const DEFAULT_ATOM_TOKENS: u32 = 200;
273
thread_local! {
    /// How many atomisation passes are currently nested on this
    /// thread. Starts at 0; the RAII guard handed out by
    /// [`enter_atomisation_pass`] puts the previous value back when it
    /// drops, so the counter returns to 0 once the outermost pass
    /// finishes. A plain `Cell`, so reading or bumping it never
    /// allocates.
    static ATOMISATION_DEPTH: std::cell::Cell<u32> = const { std::cell::Cell::new(0_u32) };
}
282
283/// Read the current thread's atomisation-pass recursion depth.
284/// Returns `0` outside any active atomiser call (production callers);
285/// returns `N` while the `N`-th nested atomise is in flight on this
286/// thread.
287#[must_use]
288pub fn current_atomisation_depth() -> u32 {
289    ATOMISATION_DEPTH.with(std::cell::Cell::get)
290}
291
/// Scope token handed out by [`enter_atomisation_pass`].
///
/// For as long as the guard lives, the thread's atomisation depth is
/// one higher than it was before entry; dropping the guard puts the
/// earlier value back. The atomiser keeps the guard alive for the
/// whole `atomise_sync_with_retries` body, so any `pre_store` /
/// `post_store` hook that re-enters atomise sees the raised depth and
/// is refused on entry once the cap is passed.
pub struct AtomisationDepthGuard {
    /// The thread's depth as it was BEFORE this guard's increment —
    /// the value written back on drop. It is stored rather than
    /// recomputed by subtracting, so an unwinding drop restores the
    /// exact earlier value and does not leak the raised depth into
    /// the next request that reuses this thread.
    prior: u32,
}
305
306impl Drop for AtomisationDepthGuard {
307    fn drop(&mut self) {
308        ATOMISATION_DEPTH.with(|cell| cell.set(self.prior));
309    }
310}
311
312/// Enter an atomisation-pass scope. Returns the new depth
313/// (post-increment) plus an RAII guard that restores the depth on
314/// drop. Callers MUST retain the guard across the full atomise body —
315/// dropping it mid-call would underflow the depth and let nested
316/// calls escape the cap.
317///
318/// The caller is expected to compare the returned depth against
319/// [`MAX_ATOMISATION_DEPTH`] and refuse with
320/// [`AtomiseError::DepthExceeded`] when over-cap.
321#[must_use]
322pub fn enter_atomisation_pass() -> (u32, AtomisationDepthGuard) {
323    let (prior, new) = ATOMISATION_DEPTH.with(|cell| {
324        let prior = cell.get();
325        let new = prior.saturating_add(1);
326        cell.set(new);
327        (prior, new)
328    });
329    (new, AtomisationDepthGuard { prior })
330}
331
332/// The atomisation engine.
333///
334/// Holds the curator (trait object so tests inject a mock), an
335/// optional signing keypair (matches the curator pass surface;
336/// `None` means writes land unsigned), the substrate connection
337/// (`Arc<Mutex<...>>`-wrapped at higher levels — the substrate
338/// expects a `&Connection` per call), and the tunables.
339///
340/// Re-uses `crate::storage::insert` / `crate::storage::create_link_signed`
341/// rather than reaching into the DB directly so the substrate-level
342/// hook layer (pre_store / post_store / pre_link / post_link) fires
343/// for every atom and every `derives_from` edge.
344pub struct Atomiser {
345    curator: Box<dyn Curator>,
346    keypair: Option<Arc<AgentKeypair>>,
347    config: AtomiserConfig,
348    /// Tier the substrate is running at. When `Keyword`, every
349    /// `atomise` call is short-circuited with [`AtomiseError::TierLocked`]
350    /// before the source is even loaded. Matches the WT-1-B brief
351    /// ("keyword → TierLocked"); other tiers proceed.
352    tier: crate::config::FeatureTier,
353    /// v0.7.0 (issue #1244) — the resolved curator model name. Stamped
354    /// verbatim into the `atomisation_complete` signed-event payload's
355    /// `curator_model` field so a downstream auditor walking
356    /// `signed_events` can attribute the decomposition to the model
357    /// that ACTUALLY ran on this deployment. Defaults to `"unknown"`
358    /// (the "truly unresolvable" fallback per the issue) when the
359    /// caller doesn't thread the model name through; production sites
360    /// call [`Self::with_curator_model`] with the resolved LLM model
361    /// id (`OllamaClient::model_name()` etc.) at construction.
362    ///
363    /// Pre-#1244 the payload field was hardcoded to `"gemma4"`,
364    /// surfacing the wrong provenance on every non-gemma deployment
365    /// (grok-4.3, claude-opus-4.7, etc. via the post-#1067 path).
366    curator_model: String,
367}
368
369impl Atomiser {
370    /// Construct an atomiser. `curator` is the LLM-facing surface
371    /// (production: [`curator::LlmCurator`]; tests: a mock). `keypair`
372    /// is the daemon's Ed25519 identity — when `None`, links land
373    /// `unsigned` (mirror of `create_link_signed`'s contract).
374    /// `tier` is the resolved feature tier; the keyword tier short-
375    /// circuits atomise calls immediately.
376    pub fn new(
377        curator: Box<dyn Curator>,
378        keypair: Option<Arc<AgentKeypair>>,
379        config: AtomiserConfig,
380        tier: crate::config::FeatureTier,
381    ) -> Self {
382        Self {
383            curator,
384            keypair,
385            config,
386            tier,
387            // v0.7.0 (#1244) — "unknown" is the documented fallback
388            // for callers (and tests with mock curators) that don't
389            // resolve a model. Production sites chain
390            // `.with_curator_model(llm.model_name())` after `::new`.
391            curator_model: "unknown".to_string(),
392        }
393    }
394
395    /// v0.7.0 (issue #1244) — builder method that stamps the resolved
396    /// curator model name into the atomiser. Threaded into the
397    /// `atomisation_complete` signed-event payload as the
398    /// `curator_model` field so downstream auditors see the model that
399    /// actually ran on this deployment (grok-4.3, claude-opus-4.7,
400    /// gemma3:4b, etc.), not the pre-#1244 hardcoded `"gemma4"`.
401    ///
402    /// Production wiring sites pass `llm_client.model_name()` (from
403    /// `crate::llm::OllamaClient`). The model id passes through
404    /// verbatim — no normalisation, no defaulting beyond the
405    /// `"unknown"` fallback applied by [`Self::new`].
406    #[must_use]
407    pub fn with_curator_model(mut self, curator_model: impl Into<String>) -> Self {
408        let resolved = curator_model.into();
409        if !resolved.trim().is_empty() {
410            self.curator_model = resolved;
411        }
412        self
413    }
414
415    /// v0.7.0 (issue #1244) — accessor for the resolved curator model.
416    /// Exposed for the regression test that pins the
417    /// `atomisation_complete` payload's `curator_model` field reflects
418    /// the model threaded in at construction.
419    #[must_use]
420    pub fn curator_model(&self) -> &str {
421        &self.curator_model
422    }
423
424    /// Cluster-F PERF-5 — accessor for the configured Synchronous-mode
425    /// curator retry budget. Used by the `pre_store::auto_atomise.rs`
426    /// hook to honour `AtomiserConfig::sync_curator_max_retries`
427    /// (compiled default `1`) when the namespace policy has no
428    /// explicit `auto_atomise_max_retries` override.
429    #[must_use]
430    pub fn sync_curator_max_retries(&self) -> u32 {
431        self.config.sync_curator_max_retries
432    }
433
434    /// Atomise the memory named by `source_id`.
435    ///
436    /// `max_atom_tokens` overrides the per-call token budget; pass 0
437    /// to defer to `config.default_max_atom_tokens`.
438    ///
439    /// `force` skips the idempotency check (use to re-atomise after
440    /// a curator-prompt change). Old atoms are retained and
441    /// `atomised_into` is updated to the fresh count.
442    ///
443    /// # Errors
444    ///
445    /// See [`AtomiseError`] for the closed enum of failure modes.
446    ///
447    /// # Async note
448    ///
449    /// The function is `async` to match the WT-1-B brief signature
450    /// even though the substrate body is fully synchronous (sqlite
451    /// is blocking; tiktoken is blocking; the curator LLM call is
452    /// blocking-on-HTTP-thread). The async signature exists so
453    /// callers in tokio-runtime contexts (the MCP server, the
454    /// autonomy scheduler) can `await` it without spawning a
455    /// blocking task themselves.
456    pub async fn atomise(
457        &self,
458        conn: &Connection,
459        source_id: &str,
460        max_atom_tokens: u32,
461        force: bool,
462        calling_agent_id: &str,
463    ) -> Result<AtomiseResult, AtomiseError> {
464        self.atomise_sync(conn, source_id, max_atom_tokens, force, calling_agent_id)
465    }
466
467    /// Sync entry-point — body of [`Self::atomise`]. Exposed for tests
468    /// that prefer to call without tokio scaffolding. Uses the
469    /// configured `curator_max_retries` (deferred-path default).
470    pub fn atomise_sync(
471        &self,
472        conn: &Connection,
473        source_id: &str,
474        max_atom_tokens: u32,
475        force: bool,
476        calling_agent_id: &str,
477    ) -> Result<AtomiseResult, AtomiseError> {
478        self.atomise_sync_with_retries(
479            conn,
480            source_id,
481            max_atom_tokens,
482            force,
483            calling_agent_id,
484            self.config.curator_max_retries,
485        )
486    }
487
488    /// Cluster-F PERF-5 — variant of [`Self::atomise_sync`] that takes
489    /// an explicit per-call `max_retries` override. The Synchronous
490    /// `pre_store` path uses this with `sync_curator_max_retries`
491    /// (default 1) so the operator's `memory_store` envelope is not
492    /// inflated by the full deferred-path retry budget. Per-namespace
493    /// override via `GovernancePolicy::auto_atomise_max_retries`
494    /// flows through this entry-point.
495    ///
496    /// # Errors
497    ///
498    /// See [`AtomiseError`] for the closed enum.
499    pub fn atomise_sync_with_retries(
500        &self,
501        conn: &Connection,
502        source_id: &str,
503        max_atom_tokens: u32,
504        force: bool,
505        calling_agent_id: &str,
506        max_retries: u32,
507    ) -> Result<AtomiseResult, AtomiseError> {
508        // ARCH-5 (FX-6) — atomisation-pass cycle-depth guard. Acquire
509        // the per-thread depth guard BEFORE any substrate work so a
510        // nested atomise triggered by a `pre_store` / `post_store` hook
511        // on a freshly-minted atom observes the higher depth and
512        // refuses on entry. The guard is bound to `_depth_guard` (a
513        // named binding) so Rust retains the RAII drop until the full
514        // atomise body completes; a bare `let _ = ...` would drop the
515        // guard at the end of the statement and let nested calls
516        // escape the cap. Mirrors the discipline in
517        // `crate::synthesis::enter_synthesis_pass` (issue #1240).
518        let (_depth, _depth_guard) = enter_atomisation_pass();
519        if _depth > MAX_ATOMISATION_DEPTH {
520            tracing::warn!(
521                target: "atomisation",
522                source_id = %source_id,
523                attempted = _depth,
524                cap = MAX_ATOMISATION_DEPTH,
525                "atomisation.depth_exceeded",
526            );
527            return Err(AtomiseError::DepthExceeded {
528                attempted: _depth,
529                cap: MAX_ATOMISATION_DEPTH,
530            });
531        }
532
533        // Step 3 — tier check (pulled forward of step 1 so we don't burn
534        // a DB read when the daemon is on keyword tier).
535        if self.tier == crate::config::FeatureTier::Keyword {
536            return Err(AtomiseError::TierLocked);
537        }
538
539        let budget = if max_atom_tokens == 0 {
540            self.config.default_max_atom_tokens
541        } else {
542            max_atom_tokens
543        };
544
545        // Step 1 — load source memory.
546        let source = db::get(conn, source_id)
547            .map_err(|e| AtomiseError::DbError(e.to_string()))?
548            .ok_or(AtomiseError::NotFound)?;
549
550        // Step 2 — idempotency check.
551        if !force {
552            if let Some(atomised_into) = read_atomised_into(conn, source_id)
553                .map_err(|e| AtomiseError::DbError(e.to_string()))?
554            {
555                if atomised_into > 0 {
556                    let existing = list_atoms_of(conn, source_id)
557                        .map_err(|e| AtomiseError::DbError(e.to_string()))?;
558                    return Err(AtomiseError::AlreadyAtomised {
559                        source_id: source_id.to_string(),
560                        existing_atom_ids: existing,
561                    });
562                }
563            }
564        }
565
566        // Step 4 — pre-flight token count. Sources at or under the
567        // budget can never produce a useful split.
568        let source_tokens = db::count_tokens_cl100k(&source.content);
569        if source_tokens <= budget as usize {
570            return Err(AtomiseError::SourceTooSmall);
571        }
572
573        // Step 5 + 6 — curator round-trip. `max_retries` is the
574        // per-call override (Cluster-F PERF-5): the deferred path
575        // passes `config.curator_max_retries` (3 by default), the
576        // Synchronous `pre_store` path passes
577        // `config.sync_curator_max_retries` (1 by default).
578        let atoms = self
579            .curator
580            .decompose(&source.content, budget, max_retries)
581            .map_err(|e| match e {
582                curator::CuratorError::LlmUnavailable(d)
583                | curator::CuratorError::MalformedResponse(d) => AtomiseError::CuratorFailed(d),
584            })?;
585
586        // Step 7 — empty atoms = "cannot decompose" → SourceTooSmall.
587        if atoms.is_empty() {
588            return Err(AtomiseError::SourceTooSmall);
589        }
590
591        // Cap the count to the brief's [2..=10] envelope. The prompt
592        // pins this, but a misbehaving LLM could return e.g. 50; clamp
593        // here so the substrate never writes outside the contract.
594        let atom_count = atoms.len().min(self.config.max_atoms_per_source);
595        if atom_count < self.config.min_atoms_per_source {
596            return Err(AtomiseError::SourceTooSmall);
597        }
598        let atoms: Vec<curator::Atom> = atoms.into_iter().take(atom_count).collect();
599
600        // Step 8 — per-atom transactional write. We iterate atom-by-atom
601        // so the hook layer fires per atom (the brief's "atoms are
602        // first-class memory_store ops" contract). A governance refusal
603        // mid-batch surfaces with the atom index; PRIOR atoms remain
604        // committed (they were valid writes by themselves).
605        //
606        // v0.7.0 Form 4 (issue #757) — atom-grain span fact-provenance.
607        // We compute a `SourceSpan` byte-range for each atom into the
608        // parent source body. The substring search advances a running
609        // cursor so duplicate prefixes across atoms (e.g. two atoms
610        // that both quote the same phrase) get assigned non-overlapping
611        // spans in the order the curator emitted them. Atoms whose
612        // text cannot be located fall back to `None` for the span
613        // (curator may have paraphrased) — the substrate still records
614        // `source_uri = doc:<parent>` so the lineage edge is preserved
615        // even when the byte-range is unrecoverable.
616        let mut atom_ids: Vec<String> = Vec::with_capacity(atom_count);
617        let mut search_cursor: usize = 0;
618        for (idx, atom) in atoms.iter().enumerate() {
619            let span = compute_atom_span(&source.content, &atom.text, &mut search_cursor);
620            let atom_id = write_atom(
621                conn,
622                &source,
623                atom,
624                span,
625                calling_agent_id,
626                self.keypair.as_deref(),
627            )
628            .map_err(|e| {
629                if let Some(refusal) = e.downcast_ref::<crate::storage::GovernanceRefusal>() {
630                    AtomiseError::GovernanceRefused(format!("atom[{idx}]: {}", refusal.reason))
631                } else {
632                    AtomiseError::DbError(format!("atom[{idx}]: {e}"))
633                }
634            })?;
635            atom_ids.push(atom_id);
636        }
637
638        // Step 9 — archive the source in a SEPARATE transaction. The
639        // per-atom hooks have already fired by this point, so the
640        // source is still live during those hook callbacks (the WT-1-C
641        // resolver can switch over only after this commit lands).
642        let archived_at = Utc::now().to_rfc3339();
643        let atom_count_i64 = i64::try_from(atom_count).unwrap_or(i64::MAX);
644        archive_source(conn, source_id, atom_count_i64, &archived_at)
645            .map_err(|e| AtomiseError::DbError(e.to_string()))?;
646
647        // Step 10 — emit the atomisation_complete signed_event.
648        emit_atomisation_complete_event(
649            conn,
650            source_id,
651            &atom_ids,
652            atom_count,
653            calling_agent_id,
654            &archived_at,
655            self.keypair.as_deref(),
656            &self.curator_model,
657        )
658        .map_err(|e| AtomiseError::DbError(e.to_string()))?;
659
660        Ok(AtomiseResult {
661            source_id: source_id.to_string(),
662            atom_ids,
663            atom_count,
664            archived_at,
665        })
666    }
667}
668
669// ---------------------------------------------------------------------------
670// Helpers — kept module-private but `pub(crate)` so the test crate's
671// `atomisation_core` module can poke at substrate state directly.
672// ---------------------------------------------------------------------------
673
674/// Read the `atomised_into` column for a memory. Returns `Ok(None)`
675/// when the column is NULL (memory has not been atomised) OR the row
676/// does not exist, `Ok(Some(n))` when set, error on rusqlite failures
677/// other than `QueryReturnedNoRows`.
678///
679/// # Cluster-A COR-2 fix
680///
681/// Pre-fix, the body swallowed every rusqlite error via
682/// `.unwrap_or(None)`. A real failure (lock-timeout, IO error, schema
683/// drift) was indistinguishable from "row not present" — the
684/// idempotency check would fall through and the caller would proceed
685/// to re-atomise an already-atomised source. Now only the
686/// `QueryReturnedNoRows` variant maps to `Ok(None)`; every other
687/// rusqlite error propagates via `?` and surfaces as
688/// `AtomiseError::DbError`.
689fn read_atomised_into(conn: &Connection, id: &str) -> anyhow::Result<Option<i64>> {
690    match conn.query_row(
691        "SELECT atomised_into FROM memories WHERE id = ?1",
692        params![id],
693        |r| r.get::<_, Option<i64>>(0),
694    ) {
695        Ok(v) => Ok(v),
696        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
697        Err(e) => Err(e.into()),
698    }
699}
700
701/// Return the ordered list of atom ids whose `atom_of` column points
702/// at the supplied source id. Ordered by `created_at` then `id` so
703/// the response is deterministic across calls.
704fn list_atoms_of(conn: &Connection, source_id: &str) -> anyhow::Result<Vec<String>> {
705    let mut stmt =
706        conn.prepare("SELECT id FROM memories WHERE atom_of = ?1 ORDER BY created_at ASC, id ASC")?;
707    let rows = stmt.query_map(params![source_id], |r| r.get::<_, String>(0))?;
708    rows.collect::<rusqlite::Result<Vec<_>>>()
709        .map_err(Into::into)
710}
711
712/// Write one atom: build a Memory row from the source's metadata
713/// (namespace/tier/tags), call `db::insert` (which fires the per-write
714/// hook chain), then write the `derives_from` edge via
715/// `db::create_link_signed`. The edge write is also hook-instrumented.
716///
717/// Returns the freshly-minted atom id on success. Errors bubble up
718/// as `anyhow::Error`; the caller downcasts to `GovernanceRefusal` to
719/// distinguish refusal from generic DB failure.
720fn write_atom(
721    conn: &Connection,
722    source: &Memory,
723    atom: &curator::Atom,
724    span: Option<SourceSpan>,
725    calling_agent_id: &str,
726    keypair: Option<&AgentKeypair>,
727) -> anyhow::Result<String> {
728    let now = Utc::now().to_rfc3339();
729    let atom_id = uuid::Uuid::new_v4().to_string();
730    // Synthesise a title from the source title + a short atom prefix so
731    // (title, namespace) does not collide with the parent under the
732    // ON CONFLICT clause in db::insert. The first 50 chars of the atom
733    // text is the deterministic signal; the trailing UUID8 ensures
734    // uniqueness across multiple atoms that share a content prefix.
735    let prefix: String = atom
736        .text
737        .chars()
738        .take(50)
739        .collect::<String>()
740        .trim()
741        .to_string();
742    let title = if prefix.is_empty() {
743        format!("[atom] {} #{}", source.title, &atom_id[..8])
744    } else {
745        format!("[atom] {} ({})", prefix, &atom_id[..8])
746    };
747
748    // metadata.agent_id is the substrate's NHI provenance marker;
749    // metadata.atom_index records the curator's 0-based ordering so
750    // downstream consumers reproduce the parent's narrative flow.
751    let mut metadata = match source.metadata.clone() {
752        serde_json::Value::Object(map) => map,
753        _ => serde_json::Map::new(),
754    };
755    metadata.insert(
756        "agent_id".to_string(),
757        serde_json::Value::String(calling_agent_id.to_string()),
758    );
759    metadata.insert(
760        "atom_source_id".to_string(),
761        serde_json::Value::String(source.id.clone()),
762    );
763
764    let mem = Memory {
765        id: atom_id.clone(),
766        tier: source.tier.clone(),
767        namespace: source.namespace.clone(),
768        title,
769        content: atom.text.clone(),
770        tags: source.tags.clone(),
771        priority: source.priority,
772        confidence: source.confidence,
773        // Source provenance label — "atomiser" so an operator
774        // walking `metadata.source` sees the synthetic origin.
775        source: "atomiser".to_string(),
776        access_count: 0,
777        created_at: now.clone(),
778        updated_at: now,
779        last_accessed_at: None,
780        expires_at: None,
781        metadata: serde_json::Value::Object(metadata),
782        reflection_depth: source.reflection_depth,
783        // Atoms inherit the parent's typed kind: Observation source →
784        // Observation atoms (the WT-1-B brief case). Reflection sources
785        // could theoretically be atomised too, but that path is gated
786        // by WT-1-C/D — for now atoms are typed Observation per the
787        // brief.
788        memory_kind: MemoryKind::Observation,
789        // v0.7.0 QW-2 — atoms are not Persona-kind; entity_id +
790        // persona_version stay NULL on the atom row.
791        entity_id: None,
792        persona_version: None,
793        // v0.7.0 Form 4 — atom-grain fact-provenance. Atoms inherit
794        // the parent's citations array (the same supporting evidence
795        // applies to every decomposed proposition) and stamp the
796        // parent memory id under the `doc:` scheme so the lineage is
797        // discoverable via the `--source-uri-prefix` recall filter.
798        // `source_span` carries the byte-range into the parent body
799        // when the curator's text was located verbatim; otherwise
800        // `None` (curator may have paraphrased).
801        citations: source.citations.clone(),
802        source_uri: Some(format!("doc:{}", source.id)),
803        source_span: span,
804        // v0.7.0 issue #1242 — atom rows are curator-engine output, not
805        // caller-supplied. The curator inherits `confidence` from the
806        // parent memory (line 567 `confidence: source.confidence`) — that
807        // value is engine-propagated, not caller-supplied, so the
808        // discriminator must reflect the provenance. Pre-#1242 these
809        // rows mis-labelled `CallerProvided`, hiding them from the
810        // partial `idx_memories_confidence_source` enumeration the
811        // calibration sweep scans, and violating the audit-honesty
812        // invariant. Distinct from `AutoDerived`, which means the
813        // Form 5 derive engine computed the value from row signals;
814        // here the curator simply propagated the parent's number.
815        confidence_source: ConfidenceSource::CuratorDerived,
816        confidence_signals: None,
817        confidence_decayed_at: None,
818        version: 1,
819    };
820
821    let actual_id = db::insert(conn, &mem)?;
822
823    // Stamp `atom_of` on the freshly inserted row. db::insert does NOT
824    // accept this column on its struct surface (Memory pre-dates the
825    // v36 columns), so we issue a targeted UPDATE here. This is
826    // hot-path so a single-row UPDATE is acceptable; an alternate
827    // approach (extend Memory to carry atom_of) is deferred until a
828    // future Memory refactor.
829    //
830    // v0.7.0 #1036 (Agent-3 #7) — intentionally non-version-bumping.
831    // This is a back-fill of an atomisation-derived metadata column
832    // on a row that was JUST INSERTED (`db::insert(&mem)` returned
833    // `actual_id` two lines up). The row has not yet been observed
834    // by any caller, so there is no version contract to violate.
835    // Pinned by `tests/non_version_bumping_sites_1036.rs`.
836    conn.execute(
837        "UPDATE memories SET atom_of = ?1 WHERE id = ?2",
838        params![source.id, actual_id],
839    )?;
840
841    // derives_from edge: atom → parent. This goes through
842    // create_link_signed which writes the row, fires the pre/post-link
843    // hooks, signs with the supplied keypair, and appends a
844    // `memory_link.created` row to signed_events.
845    db::create_link_signed(
846        conn,
847        &actual_id,
848        &source.id,
849        MemoryLinkRelation::DerivesFrom.as_str(),
850        keypair,
851    )?;
852
853    Ok(actual_id)
854}
855
856/// Archive the source memory.
857///
858/// Sets `atomised_into = N` (the substrate-visible signal that the row
859/// has been atomised) and writes an `atomisation_archived_at` RFC3339
860/// stamp into `metadata` (logical "this row is read-only because its
861/// atoms are now the canonical surface"). We do NOT call
862/// `db::archive_memory` here — that physically moves the row to
863/// `archived_memories`, which would invalidate every atom's `atom_of`
864/// FK pointing at it. The atom-of relationship survives as long as
865/// the parent row remains in `memories`; flipping `atomised_into`
866/// from NULL to N is the downstream signal WT-1-C consumes.
867///
868/// Runs in its own transaction so the per-atom hooks (step 8) have
869/// already fired before the source flips into the "atomised" state.
870fn archive_source(
871    conn: &Connection,
872    source_id: &str,
873    atom_count: i64,
874    archived_at: &str,
875) -> anyhow::Result<()> {
876    conn.execute_batch(crate::storage::connection::SQL_BEGIN_IMMEDIATE)?;
877    let result = (|| -> anyhow::Result<()> {
878        // Merge the existing metadata with the new
879        // `atomisation_archived_at` key — never clobber other keys.
880        let existing_metadata_str: String = conn
881            .query_row(
882                "SELECT metadata FROM memories WHERE id = ?1",
883                params![source_id],
884                |r| {
885                    r.get::<_, Option<String>>(0)
886                        .map(|o| o.unwrap_or_else(|| "{}".to_string()))
887                },
888            )
889            .unwrap_or_else(|_| "{}".to_string());
890        let mut meta: serde_json::Map<String, serde_json::Value> =
891            serde_json::from_str(&existing_metadata_str).unwrap_or_default();
892        meta.insert(
893            crate::models::field_names::ATOMISATION_ARCHIVED_AT.to_string(),
894            serde_json::Value::String(archived_at.to_string()),
895        );
896        let merged = serde_json::Value::Object(meta).to_string();
897        conn.execute(
898            "UPDATE memories SET atomised_into = ?1, metadata = ?2, updated_at = ?3 \
899             WHERE id = ?4",
900            params![atom_count, merged, archived_at, source_id],
901        )?;
902        Ok(())
903    })();
904    match result {
905        Ok(()) => {
906            conn.execute_batch(crate::storage::connection::SQL_COMMIT)?;
907            Ok(())
908        }
909        Err(e) => {
910            let _ = conn.execute_batch(crate::storage::connection::SQL_ROLLBACK);
911            Err(e)
912        }
913    }
914}
915
916/// Append the final `atomisation_complete` event to `signed_events`.
917/// The payload binds the source id, the atom-id list, and the curator
918/// model id so a downstream auditor can reproduce the decomposition.
919///
920/// v0.7.0 (issue #1244) — `curator_model` is threaded in from the
921/// caller's resolved LLM model name (via [`Atomiser::with_curator_model`])
922/// rather than hardcoded to `"gemma4"`. Default fallback is `"unknown"`.
923fn emit_atomisation_complete_event(
924    conn: &Connection,
925    source_id: &str,
926    atom_ids: &[String],
927    atom_count: usize,
928    calling_agent_id: &str,
929    archived_at: &str,
930    keypair: Option<&AgentKeypair>,
931    curator_model: &str,
932) -> anyhow::Result<()> {
933    let payload = serde_json::json!({
934        "event_type": "atomisation_complete",
935        "source_id": source_id,
936        "atom_ids": atom_ids,
937        (crate::models::field_names::ATOM_COUNT): atom_count,
938        "calling_agent_id": calling_agent_id,
939        "atomisation_timestamp": archived_at,
940        "curator_model": curator_model,
941    });
942    let bytes = serde_json::to_vec(&payload)?;
943    let (signature, attest_level) = if let Some(kp) = keypair.filter(|k| k.can_sign()) {
944        let signing = kp.private.as_ref().expect("can_sign() checked");
945        use ed25519_dalek::Signer;
946        let sig = signing.sign(&bytes);
947        (
948            Some(sig.to_bytes().to_vec()),
949            crate::models::AttestLevel::SelfSigned.as_str(),
950        )
951    } else {
952        (None, crate::models::AttestLevel::Unsigned.as_str())
953    };
954    let event = SignedEvent {
955        id: uuid::Uuid::new_v4().to_string(),
956        agent_id: calling_agent_id.to_string(),
957        event_type: crate::signed_events::event_types::ATOMISATION_COMPLETE.to_string(),
958        payload_hash: payload_hash(&bytes),
959        signature,
960        attest_level: attest_level.to_string(),
961        timestamp: Utc::now().to_rfc3339(),
962        ..SignedEvent::default()
963    };
964    append_signed_event(conn, &event)?;
965    Ok(())
966}
967
968/// v0.7.0 Form 4 (issue #757) — locate an atom's text inside its
969/// parent source body and emit the byte-range as a [`SourceSpan`].
970///
971/// Strategy:
972/// 1. Search verbatim for `atom_text` in `source[cursor..]`. When
973///    found, advance the cursor past the hit so a subsequent atom
974///    that quotes the same prefix doesn't latch onto the same offset.
975/// 2. When the verbatim search misses (curator paraphrased, or
976///    whitespace differs), return `None`. The substrate still
977///    stamps `source_uri` so the lineage edge survives without the
978///    span. This is the documented fallback contract for
979///    curator-paraphrase atoms.
980///
981/// # UTF-8 safety
982///
983/// `cursor` is treated as a byte offset into `source_body`. The
984/// cursor MUST point at a char boundary on entry; the function
985/// advances it to the next char boundary AFTER the hit start so
986/// repeated invocations on the same body cannot land mid-codepoint.
987/// The returned span's `start` and `end` are both guaranteed to fall
988/// on char boundaries because `str::find` itself only returns
989/// codepoint-aligned offsets (a property of `str` slicing).
990///
991/// # Cluster-A COR-1 / COR-7 fix
992///
993/// Pre-fix, the cursor advanced via `start.saturating_add(1)` which
994/// could leave the cursor mid-codepoint on multi-byte text — a
995/// subsequent `source_body[*cursor..]` slice would panic at the byte
996/// boundary check. The fix walks to the next `char_indices()` entry
997/// past the hit so every advance lands on a valid boundary, and
998/// clamps `end` to `source_body.len()` defensively (verbatim
999/// `str::find` already guarantees this — the clamp is belt-and-braces
1000/// against a future refactor that might pre-pad `needle`).
1001fn compute_atom_span(source_body: &str, atom_text: &str, cursor: &mut usize) -> Option<SourceSpan> {
1002    let needle = atom_text.trim();
1003    if needle.is_empty() {
1004        return None;
1005    }
1006    // If the cursor has drifted mid-codepoint (shouldn't happen with the
1007    // boundary-aware advance below, but defend against pathological
1008    // callers passing in a stale cursor), realign DOWN to the previous
1009    // boundary before slicing. `floor_char_boundary` is nightly-only;
1010    // hand-roll the equivalent by walking back at most 3 bytes (UTF-8
1011    // codepoints are ≤4 bytes).
1012    let cursor_aligned = floor_char_boundary(source_body, *cursor);
1013    let start = if cursor_aligned < source_body.len() {
1014        source_body[cursor_aligned..]
1015            .find(needle)
1016            .map(|off| cursor_aligned + off)
1017    } else {
1018        None
1019    };
1020    let start = start.or_else(|| source_body.find(needle))?;
1021    let end = (start + needle.len()).min(source_body.len());
1022    // Advance the cursor to the next char boundary AFTER `start` — using
1023    // `char_indices()` to find the first index strictly greater than
1024    // `start`. This ensures the cursor never lands mid-codepoint on
1025    // multi-byte text (Café / 中文 / 🦀 etc.).
1026    *cursor = source_body[start..]
1027        .char_indices()
1028        .nth(1)
1029        .map_or(source_body.len(), |(off, _)| start + off);
1030    Some(SourceSpan { start, end })
1031}
1032
/// Local stand-in for `str::floor_char_boundary` (the std fn is
/// nightly-only as of 1.83).
///
/// Returns the largest byte index `≤ index` that is a UTF-8 char
/// boundary of `s`. An `index` at or beyond `s.len()` yields `s.len()`,
/// which is itself a valid boundary.
fn floor_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        return s.len();
    }
    // Scan downwards from `index`; offset 0 is always a boundary, so
    // the search cannot come back empty.
    (0..=index)
        .rev()
        .find(|&at| s.is_char_boundary(at))
        .unwrap_or(0)
}
1047
1048// ---------------------------------------------------------------------------
1049// Unit tests — exercise the helpers that don't require a live curator.
1050// The full integration suite (mock curator + DB + hooks + signed_events)
1051// lives at `tests/atomisation.rs`.
1052// ---------------------------------------------------------------------------
1053
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn config_defaults_match_brief() {
        let c = AtomiserConfig::default();
        assert_eq!(c.default_max_atom_tokens, 200);
        assert_eq!(c.min_atoms_per_source, 2);
        assert_eq!(c.max_atoms_per_source, 10);
        assert_eq!(c.curator_max_retries, 3);
    }

    #[test]
    fn atomise_error_display_shapes() {
        // Spot-check that each listed variant renders a non-empty
        // message without panicking.
        for e in [
            AtomiseError::NotFound,
            AtomiseError::AlreadyAtomised {
                source_id: "src".into(),
                existing_atom_ids: vec!["a".into(), "b".into()],
            },
            AtomiseError::TierLocked,
            AtomiseError::CuratorFailed("bad json".into()),
            AtomiseError::SourceTooSmall,
            AtomiseError::GovernanceRefused("policy".into()),
            AtomiseError::SignerError("no key".into()),
            AtomiseError::DbError("io".into()),
            AtomiseError::DepthExceeded {
                attempted: 4,
                cap: MAX_ATOMISATION_DEPTH,
            },
        ] {
            let s = format!("{e}");
            assert!(!s.is_empty());
        }
    }

    // ---- ARCH-5 (FX-6) — atomisation depth-cap invariants.

    #[test]
    fn max_atomisation_depth_matches_substrate_recursive_primitive_cap() {
        // The cap must match the rest of the recursive-primitive
        // discipline (reflection / synthesis ship at 3). A drift here
        // would mean the atomiser tolerates deeper recursion than its
        // sibling primitives — surfaced explicitly so a refactor that
        // bumps either cap checked below trips this pin.
        assert_eq!(MAX_ATOMISATION_DEPTH, 3);
        assert_eq!(
            MAX_ATOMISATION_DEPTH,
            crate::synthesis::MAX_SYNTHESIS_DEPTH,
            "atomisation cap must match synthesis cap"
        );
    }

    #[test]
    fn atomisation_depth_guard_increments_and_decrements() {
        // Pre-entry depth is 0 on a fresh thread.
        assert_eq!(current_atomisation_depth(), 0);
        {
            let (d1, _g1) = enter_atomisation_pass();
            assert_eq!(d1, 1);
            assert_eq!(current_atomisation_depth(), 1);
            {
                let (d2, _g2) = enter_atomisation_pass();
                assert_eq!(d2, 2);
                assert_eq!(current_atomisation_depth(), 2);
                {
                    let (d3, _g3) = enter_atomisation_pass();
                    assert_eq!(d3, 3);
                    {
                        // 4-th nesting trips the cap.
                        let (d4, _g4) = enter_atomisation_pass();
                        assert_eq!(d4, 4);
                        assert!(d4 > MAX_ATOMISATION_DEPTH, "depth=4 exceeds cap=3");
                    }
                    // Drop restores depth back to 3.
                    assert_eq!(current_atomisation_depth(), 3);
                }
                assert_eq!(current_atomisation_depth(), 2);
            }
            assert_eq!(current_atomisation_depth(), 1);
        }
        assert_eq!(current_atomisation_depth(), 0);
    }

    // ---- Cluster-A COR-1 / COR-7 / COV-3 — compute_atom_span tests.

    #[test]
    fn compute_atom_span_paraphrase_fallback_returns_none() {
        // Atom text that the curator paraphrased — does not appear
        // verbatim in the parent body. Pre-fix the fallback was a
        // 32-char prefix search; the new contract returns `None`
        // gracefully so the substrate stamps `source_uri` without a
        // span.
        let body = "The deployment manifest pins the image digest explicitly.";
        let paraphrase = "Curator paraphrased this sentence entirely.";

        // Phase 1 — miss with a pristine cursor.
        let mut cursor = 0_usize;
        let got = compute_atom_span(body, paraphrase, &mut cursor);
        assert!(
            got.is_none(),
            "paraphrase miss must return None, got {got:?}"
        );
        // Cursor unchanged on miss.
        assert_eq!(cursor, 0);

        // Phase 2 — miss AFTER the cursor was advanced by a prior
        // successful hit on the same body. Must neither panic nor
        // disturb the advanced cursor.
        let hit = compute_atom_span(body, "deployment manifest", &mut cursor)
            .expect("verbatim needle should be found");
        assert_eq!(&body[hit.start..hit.end], "deployment manifest");
        let advanced = cursor;
        assert!(advanced > hit.start, "cursor must move past the hit start");
        let got = compute_atom_span(body, paraphrase, &mut cursor);
        assert!(
            got.is_none(),
            "paraphrase miss after a hit must return None, got {got:?}"
        );
        assert_eq!(cursor, advanced, "cursor must be unchanged on miss");
    }

    #[test]
    fn compute_atom_span_multibyte_utf8_stays_on_char_boundary() {
        // Multi-byte body covering Latin-with-diacritic (Café), CJK
        // (中文), and a 4-byte emoji (🦀). Pre-fix the cursor advance
        // (`start.saturating_add(1)`) split codepoints and the next
        // `source_body[*cursor..]` slice would panic. Post-fix the
        // cursor always lands on a valid char boundary.
        let body = "Café 中文 🦀 statement that follows the emoji.";
        let mut cursor = 0_usize;

        // First hit — the CJK substring.
        let span = compute_atom_span(body, "中文", &mut cursor)
            .expect("multi-byte needle should be found verbatim");
        // Span lies on char boundaries (Rust's str::find guarantees
        // this for verbatim matches; we re-assert as a regression pin).
        assert!(body.is_char_boundary(span.start));
        assert!(body.is_char_boundary(span.end));
        assert_eq!(&body[span.start..span.end], "中文");

        // Cursor must have advanced PAST `start` to the next char
        // boundary — never mid-codepoint.
        assert!(cursor > span.start);
        assert!(
            body.is_char_boundary(cursor),
            "cursor={cursor} mid-codepoint"
        );

        // Second hit — the emoji. Critical: this slice would have
        // panicked pre-fix because the cursor would have been
        // mid-codepoint at byte-offset start+1 of the CJK char.
        let span2 = compute_atom_span(body, "🦀", &mut cursor)
            .expect("emoji needle should be found after CJK");
        assert!(body.is_char_boundary(span2.start));
        assert!(body.is_char_boundary(span2.end));
        assert_eq!(&body[span2.start..span2.end], "🦀");

        // Third hit — verify a verbatim ASCII sentence still works in
        // the same pass.
        let span3 = compute_atom_span(body, "statement that follows the emoji.", &mut cursor)
            .expect("ascii needle should still be found");
        assert_eq!(
            &body[span3.start..span3.end],
            "statement that follows the emoji."
        );
    }

    #[test]
    fn compute_atom_span_cursor_clamps_to_body_length() {
        // Cursor sitting past EOL should NOT panic. The function falls
        // through to the whole-body search.
        let body = "short body";
        let mut cursor = 1_000_usize;
        let span = compute_atom_span(body, "body", &mut cursor).expect("fallback whole-body");
        assert_eq!(&body[span.start..span.end], "body");
    }

    #[test]
    fn compute_atom_span_stale_cursor_realigns_to_boundary() {
        // A pathological caller passing a cursor mid-codepoint should
        // not panic; the function realigns DOWN to the prior boundary
        // before slicing. Regression pin for `floor_char_boundary`.
        let body = "Café statement";
        // The 'é' is the bytes `0xC3 0xA9` — byte offset 4 is the
        // start of `0xA9`, mid-codepoint.
        let mut cursor = 4_usize;
        assert!(!body.is_char_boundary(cursor), "fixture must start stale");

        // Must not panic, and must still locate the needle that sits
        // after the realigned cursor.
        let span = compute_atom_span(body, "statement", &mut cursor)
            .expect("needle after the stale cursor should be found");
        assert_eq!(&body[span.start..span.end], "statement");
        assert_eq!(span.end, body.len());

        // The cursor written back is boundary-aligned and past the hit
        // start.
        assert!(cursor > span.start);
        assert!(
            body.is_char_boundary(cursor),
            "cursor={cursor} mid-codepoint"
        );
    }

    #[test]
    fn floor_char_boundary_walks_back_to_codepoint_start() {
        let s = "Café 中文";
        // Boundary already valid.
        assert_eq!(floor_char_boundary(s, 0), 0);
        // Mid-codepoint (`é` starts at 3, occupies bytes 3..5).
        assert_eq!(floor_char_boundary(s, 4), 3);
        // Past EOL clamps to len.
        assert_eq!(floor_char_boundary(s, 9999), s.len());
    }
}
ai_memory/atomisation/mod.rs

ai_memory/atomisation/
mod.rs