ai_memory/atomisation/mod.rs
1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3
4//! v0.7.0 WT-1-B — substrate-level atomisation engine.
5//!
6//! The atomiser is the second hard prereq for the v0.7.0 atomisation
7//! pipeline (WT-1-A schema v36 is the first; WT-1-C/D/E/F all
8//! consume the writer landed here). It takes one long-form memory,
9//! runs the curator pass to decompose it into atomic propositions,
10//! and writes each atom back as a first-class memory with full
11//! provenance:
12//!
13//! * `memories.atom_of` → parent memory id (structural FK, schema v36)
14//! * `memory_links.relation = 'derives_from'` (atom → parent, the
15//! typed, signable, federation-safe expression of the FK)
16//! * `signed_events` rows for the per-atom write, the per-link write,
17//! and the final `atomisation_complete` summary event
18//!
19//! The parent memory is archived (`archived_at` set, `atomised_into`
20//! set to `atom_count`) in a SEPARATE post-atom transaction so the
21//! per-atom hooks fire on live writes; downstream consumers walk the
22//! `atom_of` index to surface atoms in place of an atomised parent.
23//!
24//! # Hook integration
25//!
26//! Atoms are first-class memory_store writes — the existing
27//! `pre_store`/`post_store` substrate hooks fire per atom via
28//! [`crate::storage::insert`]. Governance refusal mid-batch returns
29//! [`AtomiseError::GovernanceRefused`] carrying the failing atom
30//! index; prior atoms in the batch are NOT rolled back (they were
31//! valid writes by themselves).
32//!
33//! # Idempotency
34//!
35//! A second `atomise(source_id, ...)` call after a successful first
36//! returns [`AtomiseError::AlreadyAtomised`] with the existing atom
37//! ids. Passing `force=true` skips the idempotency check and mints a
38//! fresh set of atoms; old atoms are retained (their `atom_of`
39//! pointer remains valid), and `atomised_into` is bumped to the new
40//! `atom_count`.
41
42pub mod curator;
43
44use crate::models::ConfidenceSource;
45use std::sync::Arc;
46
47use chrono::Utc;
48use rusqlite::{Connection, params};
49
50use crate::identity::keypair::AgentKeypair;
51use crate::models::{Memory, MemoryKind, MemoryLinkRelation, SourceSpan};
52use crate::signed_events::{SignedEvent, append_signed_event, payload_hash};
53use crate::storage as db;
54use curator::Curator;
55
/// Knobs that bound a single atomisation run.
///
/// The daemon builds one from `AppConfig`; tests typically start from
/// [`AtomiserConfig::default`] and override individual fields.
///
/// Compiled defaults (WT-1-B brief, with the Cluster-F PERF-5 trim for
/// the Synchronous mode):
///
/// | field                      | default |
/// |----------------------------|---------|
/// | `default_max_atom_tokens`  | 200     |
/// | `min_atoms_per_source`     | 2       |
/// | `max_atoms_per_source`     | 10      |
/// | `curator_max_retries`      | 3       |
/// | `sync_curator_max_retries` | 1       |
///
/// The Synchronous retry budget is deliberately smaller than the
/// deferred one: that path runs inside the operator's `memory_store`
/// envelope, so every extra retry adds directly to response latency.
/// A namespace can override it through
/// `GovernancePolicy::auto_atomise_max_retries`.
#[derive(Clone, Debug)]
pub struct AtomiserConfig {
    /// Per-atom token budget applied when the caller supplies none.
    /// Surfaced by the CLI / MCP atomise tool as `max_atom_tokens`.
    pub default_max_atom_tokens: u32,
    /// Fewest atoms a source must yield for the run to count as
    /// productive. A smaller yield means the source is already
    /// "atomic enough" and the run ends with
    /// [`AtomiseError::SourceTooSmall`].
    pub min_atoms_per_source: usize,
    /// Upper bound on atoms written per source. Guards against an LLM
    /// that emits dozens of trivial atoms; matches the prompt envelope
    /// ("2 to 10 atoms").
    pub max_atoms_per_source: usize,
    /// Retry budget for malformed curator responses on the deferred /
    /// CLI / explicit `memory_atomise` path. Total attempts are
    /// `1 + curator_max_retries`. The backoff schedule is described by
    /// [`curator::backoff_for_attempt`].
    pub curator_max_retries: u32,
    /// Cluster-F PERF-5 — retry budget for malformed curator responses
    /// on the latency-sensitive **Synchronous** `pre_store` path.
    /// Defaults to `1` (two attempts in total); the full three-retry
    /// budget could add roughly 100ms + 500ms + 2500ms ≈ 3.1s of
    /// backoff to the operator's `memory_store` call. Per-namespace
    /// override:
    /// [`crate::models::GovernancePolicy::auto_atomise_max_retries`].
    pub sync_curator_max_retries: u32,
}

impl Default for AtomiserConfig {
    /// Build the compiled-in defaults listed on [`AtomiserConfig`].
    fn default() -> Self {
        AtomiserConfig {
            // Atom sizing / count envelope.
            default_max_atom_tokens: 200,
            min_atoms_per_source: 2,
            max_atoms_per_source: 10,
            // Retry budgets: deferred path first, Synchronous path second.
            curator_max_retries: 3,
            sync_curator_max_retries: 1,
        }
    }
}
112
/// What a successful atomisation run hands back to the caller.
///
/// The ids in `atom_ids` appear in the order the curator produced the
/// atoms, which preserves the parent's narrative flow — the WT-1-C
/// resolver relies on that order for its default surface.
#[derive(Clone, Debug)]
pub struct AtomiseResult {
    /// Id of the memory that was decomposed.
    pub source_id: String,
    /// Ids of the freshly written atoms, in curator order.
    pub atom_ids: Vec<String>,
    /// Number of atoms written for this run.
    pub atom_count: usize,
    /// RFC3339 timestamp at which the parent memory was archived (the
    /// moment the `atomised_into` write was issued). Reported for
    /// telemetry and for the MCP `memory_atomise` response; audit
    /// trails can use it as the point the parent went read-only.
    pub archived_at: String,
}
129
/// Every way [`Atomiser::atomise`] can fail.
///
/// Each variant carries enough structured detail for the MCP / HTTP /
/// CLI wrappers to render an operator-readable message without going
/// back to the database.
#[derive(Debug)]
pub enum AtomiseError {
    /// No memory exists with the requested source id (or it has been
    /// hard-deleted).
    NotFound,
    /// The source was atomised before. `existing_atom_ids` lists the
    /// atoms whose `atom_of` currently points at it; the caller can
    /// surface them or retry with `force = true` to mint a fresh set.
    AlreadyAtomised {
        source_id: String,
        existing_atom_ids: Vec<String>,
    },
    /// The resolved feature tier is `Keyword`. Atomisation needs the
    /// curator LLM (`Smart` or `Autonomous`); the MCP surface maps this
    /// to a 503-style refusal.
    TierLocked,
    /// The curator round-trip failed after its retries were spent. The
    /// payload is the last diagnostic so it can be shown to the caller.
    CuratorFailed(String),
    /// The source cannot be split productively — for example its body
    /// is already at or under `max_atom_tokens`. The caller may surface
    /// the source unchanged. This is the "never worth atomising"
    /// verdict, as opposed to [`AtomiseError::AlreadyAtomised`], which
    /// is the "already done" verdict.
    SourceTooSmall,
    /// A `pre_store` substrate hook refused one of the atoms. The
    /// message is prefixed with `atom[<index>]`, the zero-based
    /// position in the curator's atom list. Atoms before that index
    /// were already committed and are NOT rolled back — see the module
    /// docs for the rationale.
    GovernanceRefused(String),
    /// Signing failed during a per-atom or per-link write. Carries the
    /// underlying diagnostic.
    SignerError(String),
    /// A database operation failed (SQL, transaction commit, ...).
    /// Carries the underlying diagnostic.
    DbError(String),
    /// ARCH-5 (FX-6) — the recursive atomisation depth went past
    /// [`MAX_ATOMISATION_DEPTH`]. An atomise that re-triggers itself
    /// (for instance through an aggressive `pre_store` auto-atomise
    /// hook firing on each derived atom) is bounded by the thread-local
    /// depth guard from [`enter_atomisation_pass`]; beyond the cap the
    /// substrate refuses with this variant and exposes the stable
    /// `ATOMISATION_DEPTH_EXCEEDED` slug to MCP / HTTP / CLI callers.
    ///
    /// Follows the same contract as
    /// [`crate::errors::MemoryError::ReflectionDepthExceeded`] and
    /// [`crate::errors::MemoryError::SynthesisDepthExceeded`] so every
    /// recursive write path in the substrate is bounded the same way.
    DepthExceeded { attempted: u32, cap: u32 },
}

impl std::fmt::Display for AtomiseError {
    /// Render the operator-facing message for this error.
    ///
    /// Variants without a payload resolve to a fixed string; the rest
    /// interpolate their payload into a fixed template.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let fixed_message = match self {
            // Payload-carrying variants format and return immediately.
            Self::AlreadyAtomised {
                source_id,
                existing_atom_ids,
            } => {
                let existing = existing_atom_ids.len();
                return write!(
                    f,
                    "atomise: source '{source_id}' already atomised into {} atoms",
                    existing
                );
            }
            Self::CuratorFailed(detail) => {
                return write!(f, "atomise: curator failed: {detail}");
            }
            Self::GovernanceRefused(detail) => {
                return write!(f, "atomise: governance refused: {detail}");
            }
            Self::SignerError(detail) => {
                return write!(f, "atomise: signer error: {detail}");
            }
            Self::DbError(detail) => {
                return write!(f, "atomise: db error: {detail}");
            }
            Self::DepthExceeded { attempted, cap } => {
                return write!(
                    f,
                    "ATOMISATION_DEPTH_EXCEEDED: atomisation depth {attempted} would exceed \
                     compiled max_atomisation_depth {cap}"
                );
            }
            // Payload-free variants fall through to a single write below.
            Self::NotFound => "atomise: source memory not found",
            Self::TierLocked => {
                "atomise: feature tier is 'keyword' — atomisation requires curator LLM (smart/autonomous)"
            }
            Self::SourceTooSmall => {
                "atomise: source body already at or under max_atom_tokens — no decomposition possible"
            }
        };
        f.write_str(fixed_message)
    }
}

impl std::error::Error for AtomiseError {}
220
221// ---------------------------------------------------------------------------
222// ARCH-5 (FX-6) — atomisation-pass cycle-depth guard.
223//
224// Atomisation can re-enter itself indirectly via the `pre_store` /
225// `post_store` substrate hook chain (a freshly-minted atom is itself a
226// `memory_store` write, which fires the auto-atomise hook against
227// namespaces that opted in; if that hook fires a fresh `atomise_sync`
228// then we have a recursive primitive). Every other recursive primitive
229// in the substrate (reflect, synthesis, kg-query, find-paths,
230// cycle-check) has an explicit named cap with a typed refusal slug;
231// pre-FX-6 the atomiser was the lone outlier, relying solely on the
232// `AlreadyAtomised` idempotency check to break a chain. A misbehaving
233// curator that returns slightly different atom content on each pass
234// (LLM nondeterminism — different atom titles never trip the
235// idempotency check because that check keys on `atomised_into > 0` on
236// the source, not on atom content) could therefore drive an unbounded
237// recursion + OOM.
238//
239// The guard pattern mirrors
240// [`crate::synthesis::enter_synthesis_pass`] verbatim — a thread-local
241// counter (cheap, no allocation per call), an RAII guard that
242// increments on entry + decrements on drop, and a `pub const` cap that
243// the substrate compares against on every entry. Thread-local because
244// parallel HTTP / MCP requests must not share state; each request
245// walks its own call stack and either stays shallow or hits the cap
246// independently.
247// ---------------------------------------------------------------------------
248
/// Hard ceiling on how deeply atomisation passes may nest on one
/// thread.
///
/// An `atomise_sync_with_retries` call running at depth `N` can
/// indirectly start another atomise (for example when the `pre_store`
/// auto-atomise hook fires on a freshly minted atom in a namespace
/// that opted in); each nesting level raises the counter by one. When
/// the counter goes above this value the pass is refused with
/// [`AtomiseError::DepthExceeded`] and the stable slug
/// `ATOMISATION_DEPTH_EXCEEDED`.
///
/// The value matches [`crate::synthesis::MAX_SYNTHESIS_DEPTH`] and the
/// `effective_max_reflection_depth` ceiling of 3: enough room for a
/// legitimate two-step curator hand-off, but no unbounded recursion.
pub const MAX_ATOMISATION_DEPTH: u32 = 3;

/// Lower bound for an accepted `max_atom_tokens`; anything smaller
/// cannot hold a self-contained proposition.
pub const MIN_ATOM_TOKENS: u32 = 50;

/// Upper bound for an accepted `max_atom_tokens`; anything larger is
/// no longer atomic.
pub const MAX_ATOM_TOKENS: u32 = 1_000;

/// `max_atom_tokens` used when the caller passes none (or null).
pub const DEFAULT_ATOM_TOKENS: u32 = 200;
273
thread_local! {
    /// Nesting depth of the atomisation pass currently running on this
    /// thread. Starts at 0; [`enter_atomisation_pass`] raises it and
    /// each [`AtomisationDepthGuard`] puts back the value it saw on
    /// creation when it is dropped. A plain `Cell<u32>`, so reading or
    /// updating it does not allocate.
    static ATOMISATION_DEPTH: std::cell::Cell<u32> = const { std::cell::Cell::new(0) };
}

/// Report how many atomisation passes are currently nested on the
/// calling thread.
///
/// `0` means no atomiser call is active on this thread; `N` means the
/// `N`-th nested atomise is in flight.
#[must_use]
pub fn current_atomisation_depth() -> u32 {
    ATOMISATION_DEPTH.with(std::cell::Cell::get)
}

/// Scope guard handed out by [`enter_atomisation_pass`].
///
/// For as long as the guard lives the thread's depth counter sits one
/// above the value it had before the guard was created; dropping the
/// guard writes that earlier value back. The atomiser keeps the guard
/// alive for the whole `atomise_sync_with_retries` body so that any
/// `pre_store` / `post_store` hook re-entering atomise sees the raised
/// depth and is refused once the cap is passed.
pub struct AtomisationDepthGuard {
    /// Counter value observed just before this guard raised it. Drop
    /// writes exactly this value back instead of decrementing, so the
    /// restore does not depend on whatever the counter holds at drop
    /// time.
    prior: u32,
}

impl Drop for AtomisationDepthGuard {
    /// Restore the thread's depth counter to the pre-guard value.
    fn drop(&mut self) {
        let restore_to = self.prior;
        ATOMISATION_DEPTH.with(|depth| depth.set(restore_to));
    }
}

/// Open an atomisation-pass scope on the calling thread.
///
/// Returns the depth after the increment together with the guard that
/// undoes the increment on drop. The guard MUST be kept alive for the
/// whole atomise body: dropping it early restores the previous depth
/// while the pass is still running, which would let nested calls slip
/// past the cap.
///
/// This function only counts. The caller is expected to compare the
/// returned depth with [`MAX_ATOMISATION_DEPTH`] and refuse with
/// [`AtomiseError::DepthExceeded`] when it is over the cap.
#[must_use]
pub fn enter_atomisation_pass() -> (u32, AtomisationDepthGuard) {
    ATOMISATION_DEPTH.with(|depth| {
        let prior = depth.get();
        // Saturating so a pathological nesting cannot wrap back to 0.
        let entered = prior.saturating_add(1);
        depth.set(entered);
        (entered, AtomisationDepthGuard { prior })
    })
}
331
332/// The atomisation engine.
333///
334/// Holds the curator (trait object so tests inject a mock), an
335/// optional signing keypair (matches the curator pass surface;
336/// `None` means writes land unsigned), the substrate connection
337/// (`Arc<Mutex<...>>`-wrapped at higher levels — the substrate
338/// expects a `&Connection` per call), and the tunables.
339///
340/// Re-uses `crate::storage::insert` / `crate::storage::create_link_signed`
341/// rather than reaching into the DB directly so the substrate-level
342/// hook layer (pre_store / post_store / pre_link / post_link) fires
343/// for every atom and every `derives_from` edge.
344pub struct Atomiser {
345 curator: Box<dyn Curator>,
346 keypair: Option<Arc<AgentKeypair>>,
347 config: AtomiserConfig,
348 /// Tier the substrate is running at. When `Keyword`, every
349 /// `atomise` call is short-circuited with [`AtomiseError::TierLocked`]
350 /// before the source is even loaded. Matches the WT-1-B brief
351 /// ("keyword → TierLocked"); other tiers proceed.
352 tier: crate::config::FeatureTier,
353 /// v0.7.0 (issue #1244) — the resolved curator model name. Stamped
354 /// verbatim into the `atomisation_complete` signed-event payload's
355 /// `curator_model` field so a downstream auditor walking
356 /// `signed_events` can attribute the decomposition to the model
357 /// that ACTUALLY ran on this deployment. Defaults to `"unknown"`
358 /// (the "truly unresolvable" fallback per the issue) when the
359 /// caller doesn't thread the model name through; production sites
360 /// call [`Self::with_curator_model`] with the resolved LLM model
361 /// id (`OllamaClient::model_name()` etc.) at construction.
362 ///
363 /// Pre-#1244 the payload field was hardcoded to `"gemma4"`,
364 /// surfacing the wrong provenance on every non-gemma deployment
365 /// (grok-4.3, claude-opus-4.7, etc. via the post-#1067 path).
366 curator_model: String,
367}
368
369impl Atomiser {
370 /// Construct an atomiser. `curator` is the LLM-facing surface
371 /// (production: [`curator::LlmCurator`]; tests: a mock). `keypair`
372 /// is the daemon's Ed25519 identity — when `None`, links land
373 /// `unsigned` (mirror of `create_link_signed`'s contract).
374 /// `tier` is the resolved feature tier; the keyword tier short-
375 /// circuits atomise calls immediately.
376 pub fn new(
377 curator: Box<dyn Curator>,
378 keypair: Option<Arc<AgentKeypair>>,
379 config: AtomiserConfig,
380 tier: crate::config::FeatureTier,
381 ) -> Self {
382 Self {
383 curator,
384 keypair,
385 config,
386 tier,
387 // v0.7.0 (#1244) — "unknown" is the documented fallback
388 // for callers (and tests with mock curators) that don't
389 // resolve a model. Production sites chain
390 // `.with_curator_model(llm.model_name())` after `::new`.
391 curator_model: "unknown".to_string(),
392 }
393 }
394
395 /// v0.7.0 (issue #1244) — builder method that stamps the resolved
396 /// curator model name into the atomiser. Threaded into the
397 /// `atomisation_complete` signed-event payload as the
398 /// `curator_model` field so downstream auditors see the model that
399 /// actually ran on this deployment (grok-4.3, claude-opus-4.7,
400 /// gemma3:4b, etc.), not the pre-#1244 hardcoded `"gemma4"`.
401 ///
402 /// Production wiring sites pass `llm_client.model_name()` (from
403 /// `crate::llm::OllamaClient`). The model id passes through
404 /// verbatim — no normalisation, no defaulting beyond the
405 /// `"unknown"` fallback applied by [`Self::new`].
406 #[must_use]
407 pub fn with_curator_model(mut self, curator_model: impl Into<String>) -> Self {
408 let resolved = curator_model.into();
409 if !resolved.trim().is_empty() {
410 self.curator_model = resolved;
411 }
412 self
413 }
414
415 /// v0.7.0 (issue #1244) — accessor for the resolved curator model.
416 /// Exposed for the regression test that pins the
417 /// `atomisation_complete` payload's `curator_model` field reflects
418 /// the model threaded in at construction.
419 #[must_use]
420 pub fn curator_model(&self) -> &str {
421 &self.curator_model
422 }
423
424 /// Cluster-F PERF-5 — accessor for the configured Synchronous-mode
425 /// curator retry budget. Used by the `pre_store::auto_atomise.rs`
426 /// hook to honour `AtomiserConfig::sync_curator_max_retries`
427 /// (compiled default `1`) when the namespace policy has no
428 /// explicit `auto_atomise_max_retries` override.
429 #[must_use]
430 pub fn sync_curator_max_retries(&self) -> u32 {
431 self.config.sync_curator_max_retries
432 }
433
434 /// Atomise the memory named by `source_id`.
435 ///
436 /// `max_atom_tokens` overrides the per-call token budget; pass 0
437 /// to defer to `config.default_max_atom_tokens`.
438 ///
439 /// `force` skips the idempotency check (use to re-atomise after
440 /// a curator-prompt change). Old atoms are retained and
441 /// `atomised_into` is updated to the fresh count.
442 ///
443 /// # Errors
444 ///
445 /// See [`AtomiseError`] for the closed enum of failure modes.
446 ///
447 /// # Async note
448 ///
449 /// The function is `async` to match the WT-1-B brief signature
450 /// even though the substrate body is fully synchronous (sqlite
451 /// is blocking; tiktoken is blocking; the curator LLM call is
452 /// blocking-on-HTTP-thread). The async signature exists so
453 /// callers in tokio-runtime contexts (the MCP server, the
454 /// autonomy scheduler) can `await` it without spawning a
455 /// blocking task themselves.
456 pub async fn atomise(
457 &self,
458 conn: &Connection,
459 source_id: &str,
460 max_atom_tokens: u32,
461 force: bool,
462 calling_agent_id: &str,
463 ) -> Result<AtomiseResult, AtomiseError> {
464 self.atomise_sync(conn, source_id, max_atom_tokens, force, calling_agent_id)
465 }
466
467 /// Sync entry-point — body of [`Self::atomise`]. Exposed for tests
468 /// that prefer to call without tokio scaffolding. Uses the
469 /// configured `curator_max_retries` (deferred-path default).
470 pub fn atomise_sync(
471 &self,
472 conn: &Connection,
473 source_id: &str,
474 max_atom_tokens: u32,
475 force: bool,
476 calling_agent_id: &str,
477 ) -> Result<AtomiseResult, AtomiseError> {
478 self.atomise_sync_with_retries(
479 conn,
480 source_id,
481 max_atom_tokens,
482 force,
483 calling_agent_id,
484 self.config.curator_max_retries,
485 )
486 }
487
488 /// Cluster-F PERF-5 — variant of [`Self::atomise_sync`] that takes
489 /// an explicit per-call `max_retries` override. The Synchronous
490 /// `pre_store` path uses this with `sync_curator_max_retries`
491 /// (default 1) so the operator's `memory_store` envelope is not
492 /// inflated by the full deferred-path retry budget. Per-namespace
493 /// override via `GovernancePolicy::auto_atomise_max_retries`
494 /// flows through this entry-point.
495 ///
496 /// # Errors
497 ///
498 /// See [`AtomiseError`] for the closed enum.
499 pub fn atomise_sync_with_retries(
500 &self,
501 conn: &Connection,
502 source_id: &str,
503 max_atom_tokens: u32,
504 force: bool,
505 calling_agent_id: &str,
506 max_retries: u32,
507 ) -> Result<AtomiseResult, AtomiseError> {
508 // ARCH-5 (FX-6) — atomisation-pass cycle-depth guard. Acquire
509 // the per-thread depth guard BEFORE any substrate work so a
510 // nested atomise triggered by a `pre_store` / `post_store` hook
511 // on a freshly-minted atom observes the higher depth and
512 // refuses on entry. The guard is bound to `_depth_guard` (a
513 // named binding) so Rust retains the RAII drop until the full
514 // atomise body completes; a bare `let _ = ...` would drop the
515 // guard at the end of the statement and let nested calls
516 // escape the cap. Mirrors the discipline in
517 // `crate::synthesis::enter_synthesis_pass` (issue #1240).
518 let (_depth, _depth_guard) = enter_atomisation_pass();
519 if _depth > MAX_ATOMISATION_DEPTH {
520 tracing::warn!(
521 target: "atomisation",
522 source_id = %source_id,
523 attempted = _depth,
524 cap = MAX_ATOMISATION_DEPTH,
525 "atomisation.depth_exceeded",
526 );
527 return Err(AtomiseError::DepthExceeded {
528 attempted: _depth,
529 cap: MAX_ATOMISATION_DEPTH,
530 });
531 }
532
533 // Step 3 — tier check (pulled forward of step 1 so we don't burn
534 // a DB read when the daemon is on keyword tier).
535 if self.tier == crate::config::FeatureTier::Keyword {
536 return Err(AtomiseError::TierLocked);
537 }
538
539 let budget = if max_atom_tokens == 0 {
540 self.config.default_max_atom_tokens
541 } else {
542 max_atom_tokens
543 };
544
545 // Step 1 — load source memory.
546 let source = db::get(conn, source_id)
547 .map_err(|e| AtomiseError::DbError(e.to_string()))?
548 .ok_or(AtomiseError::NotFound)?;
549
550 // Step 2 — idempotency check.
551 if !force {
552 if let Some(atomised_into) = read_atomised_into(conn, source_id)
553 .map_err(|e| AtomiseError::DbError(e.to_string()))?
554 {
555 if atomised_into > 0 {
556 let existing = list_atoms_of(conn, source_id)
557 .map_err(|e| AtomiseError::DbError(e.to_string()))?;
558 return Err(AtomiseError::AlreadyAtomised {
559 source_id: source_id.to_string(),
560 existing_atom_ids: existing,
561 });
562 }
563 }
564 }
565
566 // Step 4 — pre-flight token count. Sources at or under the
567 // budget can never produce a useful split.
568 let source_tokens = db::count_tokens_cl100k(&source.content);
569 if source_tokens <= budget as usize {
570 return Err(AtomiseError::SourceTooSmall);
571 }
572
573 // Step 5 + 6 — curator round-trip. `max_retries` is the
574 // per-call override (Cluster-F PERF-5): the deferred path
575 // passes `config.curator_max_retries` (3 by default), the
576 // Synchronous `pre_store` path passes
577 // `config.sync_curator_max_retries` (1 by default).
578 let atoms = self
579 .curator
580 .decompose(&source.content, budget, max_retries)
581 .map_err(|e| match e {
582 curator::CuratorError::LlmUnavailable(d)
583 | curator::CuratorError::MalformedResponse(d) => AtomiseError::CuratorFailed(d),
584 })?;
585
586 // Step 7 — empty atoms = "cannot decompose" → SourceTooSmall.
587 if atoms.is_empty() {
588 return Err(AtomiseError::SourceTooSmall);
589 }
590
591 // Cap the count to the brief's [2..=10] envelope. The prompt
592 // pins this, but a misbehaving LLM could return e.g. 50; clamp
593 // here so the substrate never writes outside the contract.
594 let atom_count = atoms.len().min(self.config.max_atoms_per_source);
595 if atom_count < self.config.min_atoms_per_source {
596 return Err(AtomiseError::SourceTooSmall);
597 }
598 let atoms: Vec<curator::Atom> = atoms.into_iter().take(atom_count).collect();
599
600 // Step 8 — per-atom transactional write. We iterate atom-by-atom
601 // so the hook layer fires per atom (the brief's "atoms are
602 // first-class memory_store ops" contract). A governance refusal
603 // mid-batch surfaces with the atom index; PRIOR atoms remain
604 // committed (they were valid writes by themselves).
605 //
606 // v0.7.0 Form 4 (issue #757) — atom-grain span fact-provenance.
607 // We compute a `SourceSpan` byte-range for each atom into the
608 // parent source body. The substring search advances a running
609 // cursor so duplicate prefixes across atoms (e.g. two atoms
610 // that both quote the same phrase) get assigned non-overlapping
611 // spans in the order the curator emitted them. Atoms whose
612 // text cannot be located fall back to `None` for the span
613 // (curator may have paraphrased) — the substrate still records
614 // `source_uri = doc:<parent>` so the lineage edge is preserved
615 // even when the byte-range is unrecoverable.
616 let mut atom_ids: Vec<String> = Vec::with_capacity(atom_count);
617 let mut search_cursor: usize = 0;
618 for (idx, atom) in atoms.iter().enumerate() {
619 let span = compute_atom_span(&source.content, &atom.text, &mut search_cursor);
620 let atom_id = write_atom(
621 conn,
622 &source,
623 atom,
624 span,
625 calling_agent_id,
626 self.keypair.as_deref(),
627 )
628 .map_err(|e| {
629 if let Some(refusal) = e.downcast_ref::<crate::storage::GovernanceRefusal>() {
630 AtomiseError::GovernanceRefused(format!("atom[{idx}]: {}", refusal.reason))
631 } else {
632 AtomiseError::DbError(format!("atom[{idx}]: {e}"))
633 }
634 })?;
635 atom_ids.push(atom_id);
636 }
637
638 // Step 9 — archive the source in a SEPARATE transaction. The
639 // per-atom hooks have already fired by this point, so the
640 // source is still live during those hook callbacks (the WT-1-C
641 // resolver can switch over only after this commit lands).
642 let archived_at = Utc::now().to_rfc3339();
643 let atom_count_i64 = i64::try_from(atom_count).unwrap_or(i64::MAX);
644 archive_source(conn, source_id, atom_count_i64, &archived_at)
645 .map_err(|e| AtomiseError::DbError(e.to_string()))?;
646
647 // Step 10 — emit the atomisation_complete signed_event.
648 emit_atomisation_complete_event(
649 conn,
650 source_id,
651 &atom_ids,
652 atom_count,
653 calling_agent_id,
654 &archived_at,
655 self.keypair.as_deref(),
656 &self.curator_model,
657 )
658 .map_err(|e| AtomiseError::DbError(e.to_string()))?;
659
660 Ok(AtomiseResult {
661 source_id: source_id.to_string(),
662 atom_ids,
663 atom_count,
664 archived_at,
665 })
666 }
667}
668
669// ---------------------------------------------------------------------------
670// Helpers — kept module-private but `pub(crate)` so the test crate's
671// `atomisation_core` module can poke at substrate state directly.
672// ---------------------------------------------------------------------------
673
674/// Read the `atomised_into` column for a memory. Returns `Ok(None)`
675/// when the column is NULL (memory has not been atomised) OR the row
676/// does not exist, `Ok(Some(n))` when set, error on rusqlite failures
677/// other than `QueryReturnedNoRows`.
678///
679/// # Cluster-A COR-2 fix
680///
681/// Pre-fix, the body swallowed every rusqlite error via
682/// `.unwrap_or(None)`. A real failure (lock-timeout, IO error, schema
683/// drift) was indistinguishable from "row not present" — the
684/// idempotency check would fall through and the caller would proceed
685/// to re-atomise an already-atomised source. Now only the
686/// `QueryReturnedNoRows` variant maps to `Ok(None)`; every other
687/// rusqlite error propagates via `?` and surfaces as
688/// `AtomiseError::DbError`.
689fn read_atomised_into(conn: &Connection, id: &str) -> anyhow::Result<Option<i64>> {
690 match conn.query_row(
691 "SELECT atomised_into FROM memories WHERE id = ?1",
692 params![id],
693 |r| r.get::<_, Option<i64>>(0),
694 ) {
695 Ok(v) => Ok(v),
696 Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
697 Err(e) => Err(e.into()),
698 }
699}
700
701/// Return the ordered list of atom ids whose `atom_of` column points
702/// at the supplied source id. Ordered by `created_at` then `id` so
703/// the response is deterministic across calls.
704fn list_atoms_of(conn: &Connection, source_id: &str) -> anyhow::Result<Vec<String>> {
705 let mut stmt =
706 conn.prepare("SELECT id FROM memories WHERE atom_of = ?1 ORDER BY created_at ASC, id ASC")?;
707 let rows = stmt.query_map(params![source_id], |r| r.get::<_, String>(0))?;
708 rows.collect::<rusqlite::Result<Vec<_>>>()
709 .map_err(Into::into)
710}
711
712/// Write one atom: build a Memory row from the source's metadata
713/// (namespace/tier/tags), call `db::insert` (which fires the per-write
714/// hook chain), then write the `derives_from` edge via
715/// `db::create_link_signed`. The edge write is also hook-instrumented.
716///
717/// Returns the freshly-minted atom id on success. Errors bubble up
718/// as `anyhow::Error`; the caller downcasts to `GovernanceRefusal` to
719/// distinguish refusal from generic DB failure.
720fn write_atom(
721 conn: &Connection,
722 source: &Memory,
723 atom: &curator::Atom,
724 span: Option<SourceSpan>,
725 calling_agent_id: &str,
726 keypair: Option<&AgentKeypair>,
727) -> anyhow::Result<String> {
728 let now = Utc::now().to_rfc3339();
729 let atom_id = uuid::Uuid::new_v4().to_string();
730 // Synthesise a title from the source title + a short atom prefix so
731 // (title, namespace) does not collide with the parent under the
732 // ON CONFLICT clause in db::insert. The first 50 chars of the atom
733 // text is the deterministic signal; the trailing UUID8 ensures
734 // uniqueness across multiple atoms that share a content prefix.
735 let prefix: String = atom
736 .text
737 .chars()
738 .take(50)
739 .collect::<String>()
740 .trim()
741 .to_string();
742 let title = if prefix.is_empty() {
743 format!("[atom] {} #{}", source.title, &atom_id[..8])
744 } else {
745 format!("[atom] {} ({})", prefix, &atom_id[..8])
746 };
747
748 // metadata.agent_id is the substrate's NHI provenance marker;
749 // metadata.atom_index records the curator's 0-based ordering so
750 // downstream consumers reproduce the parent's narrative flow.
751 let mut metadata = match source.metadata.clone() {
752 serde_json::Value::Object(map) => map,
753 _ => serde_json::Map::new(),
754 };
755 metadata.insert(
756 "agent_id".to_string(),
757 serde_json::Value::String(calling_agent_id.to_string()),
758 );
759 metadata.insert(
760 "atom_source_id".to_string(),
761 serde_json::Value::String(source.id.clone()),
762 );
763
764 let mem = Memory {
765 id: atom_id.clone(),
766 tier: source.tier.clone(),
767 namespace: source.namespace.clone(),
768 title,
769 content: atom.text.clone(),
770 tags: source.tags.clone(),
771 priority: source.priority,
772 confidence: source.confidence,
773 // Source provenance label — "atomiser" so an operator
774 // walking `metadata.source` sees the synthetic origin.
775 source: "atomiser".to_string(),
776 access_count: 0,
777 created_at: now.clone(),
778 updated_at: now,
779 last_accessed_at: None,
780 expires_at: None,
781 metadata: serde_json::Value::Object(metadata),
782 reflection_depth: source.reflection_depth,
783 // Atoms inherit the parent's typed kind: Observation source →
784 // Observation atoms (the WT-1-B brief case). Reflection sources
785 // could theoretically be atomised too, but that path is gated
786 // by WT-1-C/D — for now atoms are typed Observation per the
787 // brief.
788 memory_kind: MemoryKind::Observation,
789 // v0.7.0 QW-2 — atoms are not Persona-kind; entity_id +
790 // persona_version stay NULL on the atom row.
791 entity_id: None,
792 persona_version: None,
793 // v0.7.0 Form 4 — atom-grain fact-provenance. Atoms inherit
794 // the parent's citations array (the same supporting evidence
795 // applies to every decomposed proposition) and stamp the
796 // parent memory id under the `doc:` scheme so the lineage is
797 // discoverable via the `--source-uri-prefix` recall filter.
798 // `source_span` carries the byte-range into the parent body
799 // when the curator's text was located verbatim; otherwise
800 // `None` (curator may have paraphrased).
801 citations: source.citations.clone(),
802 source_uri: Some(format!("doc:{}", source.id)),
803 source_span: span,
804 // v0.7.0 issue #1242 — atom rows are curator-engine output, not
805 // caller-supplied. The curator inherits `confidence` from the
806 // parent memory (line 567 `confidence: source.confidence`) — that
807 // value is engine-propagated, not caller-supplied, so the
808 // discriminator must reflect the provenance. Pre-#1242 these
809 // rows mis-labelled `CallerProvided`, hiding them from the
810 // partial `idx_memories_confidence_source` enumeration the
811 // calibration sweep scans, and violating the audit-honesty
812 // invariant. Distinct from `AutoDerived`, which means the
813 // Form 5 derive engine computed the value from row signals;
814 // here the curator simply propagated the parent's number.
815 confidence_source: ConfidenceSource::CuratorDerived,
816 confidence_signals: None,
817 confidence_decayed_at: None,
818 version: 1,
819 };
820
821 let actual_id = db::insert(conn, &mem)?;
822
823 // Stamp `atom_of` on the freshly inserted row. db::insert does NOT
824 // accept this column on its struct surface (Memory pre-dates the
825 // v36 columns), so we issue a targeted UPDATE here. This is
826 // hot-path so a single-row UPDATE is acceptable; an alternate
827 // approach (extend Memory to carry atom_of) is deferred until a
828 // future Memory refactor.
829 //
830 // v0.7.0 #1036 (Agent-3 #7) — intentionally non-version-bumping.
831 // This is a back-fill of an atomisation-derived metadata column
832 // on a row that was JUST INSERTED (`db::insert(&mem)` returned
833 // `actual_id` two lines up). The row has not yet been observed
834 // by any caller, so there is no version contract to violate.
835 // Pinned by `tests/non_version_bumping_sites_1036.rs`.
836 conn.execute(
837 "UPDATE memories SET atom_of = ?1 WHERE id = ?2",
838 params![source.id, actual_id],
839 )?;
840
841 // derives_from edge: atom → parent. This goes through
842 // create_link_signed which writes the row, fires the pre/post-link
843 // hooks, signs with the supplied keypair, and appends a
844 // `memory_link.created` row to signed_events.
845 db::create_link_signed(
846 conn,
847 &actual_id,
848 &source.id,
849 MemoryLinkRelation::DerivesFrom.as_str(),
850 keypair,
851 )?;
852
853 Ok(actual_id)
854}
855
856/// Archive the source memory.
857///
858/// Sets `atomised_into = N` (the substrate-visible signal that the row
859/// has been atomised) and writes an `atomisation_archived_at` RFC3339
860/// stamp into `metadata` (logical "this row is read-only because its
861/// atoms are now the canonical surface"). We do NOT call
862/// `db::archive_memory` here — that physically moves the row to
863/// `archived_memories`, which would invalidate every atom's `atom_of`
864/// FK pointing at it. The atom-of relationship survives as long as
865/// the parent row remains in `memories`; flipping `atomised_into`
866/// from NULL to N is the downstream signal WT-1-C consumes.
867///
868/// Runs in its own transaction so the per-atom hooks (step 8) have
869/// already fired before the source flips into the "atomised" state.
870fn archive_source(
871 conn: &Connection,
872 source_id: &str,
873 atom_count: i64,
874 archived_at: &str,
875) -> anyhow::Result<()> {
876 conn.execute_batch(crate::storage::connection::SQL_BEGIN_IMMEDIATE)?;
877 let result = (|| -> anyhow::Result<()> {
878 // Merge the existing metadata with the new
879 // `atomisation_archived_at` key — never clobber other keys.
880 let existing_metadata_str: String = conn
881 .query_row(
882 "SELECT metadata FROM memories WHERE id = ?1",
883 params![source_id],
884 |r| {
885 r.get::<_, Option<String>>(0)
886 .map(|o| o.unwrap_or_else(|| "{}".to_string()))
887 },
888 )
889 .unwrap_or_else(|_| "{}".to_string());
890 let mut meta: serde_json::Map<String, serde_json::Value> =
891 serde_json::from_str(&existing_metadata_str).unwrap_or_default();
892 meta.insert(
893 crate::models::field_names::ATOMISATION_ARCHIVED_AT.to_string(),
894 serde_json::Value::String(archived_at.to_string()),
895 );
896 let merged = serde_json::Value::Object(meta).to_string();
897 conn.execute(
898 "UPDATE memories SET atomised_into = ?1, metadata = ?2, updated_at = ?3 \
899 WHERE id = ?4",
900 params![atom_count, merged, archived_at, source_id],
901 )?;
902 Ok(())
903 })();
904 match result {
905 Ok(()) => {
906 conn.execute_batch(crate::storage::connection::SQL_COMMIT)?;
907 Ok(())
908 }
909 Err(e) => {
910 let _ = conn.execute_batch(crate::storage::connection::SQL_ROLLBACK);
911 Err(e)
912 }
913 }
914}
915
916/// Append the final `atomisation_complete` event to `signed_events`.
917/// The payload binds the source id, the atom-id list, and the curator
918/// model id so a downstream auditor can reproduce the decomposition.
919///
920/// v0.7.0 (issue #1244) — `curator_model` is threaded in from the
921/// caller's resolved LLM model name (via [`Atomiser::with_curator_model`])
922/// rather than hardcoded to `"gemma4"`. Default fallback is `"unknown"`.
923fn emit_atomisation_complete_event(
924 conn: &Connection,
925 source_id: &str,
926 atom_ids: &[String],
927 atom_count: usize,
928 calling_agent_id: &str,
929 archived_at: &str,
930 keypair: Option<&AgentKeypair>,
931 curator_model: &str,
932) -> anyhow::Result<()> {
933 let payload = serde_json::json!({
934 "event_type": "atomisation_complete",
935 "source_id": source_id,
936 "atom_ids": atom_ids,
937 (crate::models::field_names::ATOM_COUNT): atom_count,
938 "calling_agent_id": calling_agent_id,
939 "atomisation_timestamp": archived_at,
940 "curator_model": curator_model,
941 });
942 let bytes = serde_json::to_vec(&payload)?;
943 let (signature, attest_level) = if let Some(kp) = keypair.filter(|k| k.can_sign()) {
944 let signing = kp.private.as_ref().expect("can_sign() checked");
945 use ed25519_dalek::Signer;
946 let sig = signing.sign(&bytes);
947 (
948 Some(sig.to_bytes().to_vec()),
949 crate::models::AttestLevel::SelfSigned.as_str(),
950 )
951 } else {
952 (None, crate::models::AttestLevel::Unsigned.as_str())
953 };
954 let event = SignedEvent {
955 id: uuid::Uuid::new_v4().to_string(),
956 agent_id: calling_agent_id.to_string(),
957 event_type: crate::signed_events::event_types::ATOMISATION_COMPLETE.to_string(),
958 payload_hash: payload_hash(&bytes),
959 signature,
960 attest_level: attest_level.to_string(),
961 timestamp: Utc::now().to_rfc3339(),
962 ..SignedEvent::default()
963 };
964 append_signed_event(conn, &event)?;
965 Ok(())
966}
967
968/// v0.7.0 Form 4 (issue #757) — locate an atom's text inside its
969/// parent source body and emit the byte-range as a [`SourceSpan`].
970///
971/// Strategy:
972/// 1. Search verbatim for `atom_text` in `source[cursor..]`. When
973/// found, advance the cursor past the hit so a subsequent atom
974/// that quotes the same prefix doesn't latch onto the same offset.
975/// 2. When the verbatim search misses (curator paraphrased, or
976/// whitespace differs), return `None`. The substrate still
977/// stamps `source_uri` so the lineage edge survives without the
978/// span. This is the documented fallback contract for
979/// curator-paraphrase atoms.
980///
981/// # UTF-8 safety
982///
983/// `cursor` is treated as a byte offset into `source_body`. The
984/// cursor MUST point at a char boundary on entry; the function
985/// advances it to the next char boundary AFTER the hit start so
986/// repeated invocations on the same body cannot land mid-codepoint.
987/// The returned span's `start` and `end` are both guaranteed to fall
988/// on char boundaries because `str::find` itself only returns
989/// codepoint-aligned offsets (a property of `str` slicing).
990///
991/// # Cluster-A COR-1 / COR-7 fix
992///
993/// Pre-fix, the cursor advanced via `start.saturating_add(1)` which
994/// could leave the cursor mid-codepoint on multi-byte text — a
995/// subsequent `source_body[*cursor..]` slice would panic at the byte
996/// boundary check. The fix walks to the next `char_indices()` entry
997/// past the hit so every advance lands on a valid boundary, and
998/// clamps `end` to `source_body.len()` defensively (verbatim
999/// `str::find` already guarantees this — the clamp is belt-and-braces
1000/// against a future refactor that might pre-pad `needle`).
1001fn compute_atom_span(source_body: &str, atom_text: &str, cursor: &mut usize) -> Option<SourceSpan> {
1002 let needle = atom_text.trim();
1003 if needle.is_empty() {
1004 return None;
1005 }
1006 // If the cursor has drifted mid-codepoint (shouldn't happen with the
1007 // boundary-aware advance below, but defend against pathological
1008 // callers passing in a stale cursor), realign DOWN to the previous
1009 // boundary before slicing. `floor_char_boundary` is nightly-only;
1010 // hand-roll the equivalent by walking back at most 3 bytes (UTF-8
1011 // codepoints are ≤4 bytes).
1012 let cursor_aligned = floor_char_boundary(source_body, *cursor);
1013 let start = if cursor_aligned < source_body.len() {
1014 source_body[cursor_aligned..]
1015 .find(needle)
1016 .map(|off| cursor_aligned + off)
1017 } else {
1018 None
1019 };
1020 let start = start.or_else(|| source_body.find(needle))?;
1021 let end = (start + needle.len()).min(source_body.len());
1022 // Advance the cursor to the next char boundary AFTER `start` — using
1023 // `char_indices()` to find the first index strictly greater than
1024 // `start`. This ensures the cursor never lands mid-codepoint on
1025 // multi-byte text (Café / 中文 / 🦀 etc.).
1026 *cursor = source_body[start..]
1027 .char_indices()
1028 .nth(1)
1029 .map_or(source_body.len(), |(off, _)| start + off);
1030 Some(SourceSpan { start, end })
1031}
1032
/// Local stand-in for `str::floor_char_boundary` (noted as nightly-only as
/// of Rust 1.83 when this was written).
///
/// Returns the largest byte index `<= index` that is a UTF-8 char boundary
/// of `s`. Any `index` at or beyond `s.len()` yields `s.len()`, which is
/// itself a valid boundary.
fn floor_char_boundary(s: &str, index: usize) -> usize {
    let capped = index.min(s.len());
    // Offset 0 is always a boundary, so the search cannot come up empty;
    // `unwrap_or(0)` only exists to avoid a panic path.
    (0..=capped)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0)
}
1047
1048// ---------------------------------------------------------------------------
1049// Unit tests — exercise the helpers that don't require a live curator.
1050// The full integration suite (mock curator + DB + hooks + signed_events)
1051// lives at `tests/atomisation.rs`.
1052// ---------------------------------------------------------------------------
1053
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the default `AtomiserConfig` values.
    #[test]
    fn config_defaults_match_brief() {
        let c = AtomiserConfig::default();
        assert_eq!(c.default_max_atom_tokens, 200);
        assert_eq!(c.min_atoms_per_source, 2);
        assert_eq!(c.max_atoms_per_source, 10);
        assert_eq!(c.curator_max_retries, 3);
    }

    /// Every listed `AtomiseError` variant renders a non-empty message.
    #[test]
    fn atomise_error_display_shapes() {
        // Spot-check that each variant renders without panicking.
        for e in [
            AtomiseError::NotFound,
            AtomiseError::AlreadyAtomised {
                source_id: "src".into(),
                existing_atom_ids: vec!["a".into(), "b".into()],
            },
            AtomiseError::TierLocked,
            AtomiseError::CuratorFailed("bad json".into()),
            AtomiseError::SourceTooSmall,
            AtomiseError::GovernanceRefused("policy".into()),
            AtomiseError::SignerError("no key".into()),
            AtomiseError::DbError("io".into()),
            AtomiseError::DepthExceeded {
                attempted: 4,
                cap: MAX_ATOMISATION_DEPTH,
            },
        ] {
            let s = format!("{e}");
            assert!(!s.is_empty());
        }
    }

    // ---- ARCH-5 (FX-6) — atomisation depth-cap invariants.

    #[test]
    fn max_atomisation_depth_matches_substrate_recursive_primitive_cap() {
        // The cap must match the rest of the recursive-primitive
        // discipline (reflection / synthesis ship at 3). A drift here
        // would mean the atomiser tolerates deeper recursion than its
        // sibling primitives — this pin compares against the synthesis
        // cap so a refactor that bumps either one trips the test.
        assert_eq!(MAX_ATOMISATION_DEPTH, 3);
        assert_eq!(
            MAX_ATOMISATION_DEPTH,
            crate::synthesis::MAX_SYNTHESIS_DEPTH,
            "atomisation cap must match synthesis cap"
        );
    }

    #[test]
    fn atomisation_depth_guard_increments_and_decrements() {
        // Pre-entry depth is 0 on a fresh thread.
        assert_eq!(current_atomisation_depth(), 0);
        {
            let (d1, _g1) = enter_atomisation_pass();
            assert_eq!(d1, 1);
            assert_eq!(current_atomisation_depth(), 1);
            {
                let (d2, _g2) = enter_atomisation_pass();
                assert_eq!(d2, 2);
                assert_eq!(current_atomisation_depth(), 2);
                {
                    let (d3, _g3) = enter_atomisation_pass();
                    assert_eq!(d3, 3);
                    {
                        // 4-th nesting trips the cap.
                        let (d4, _g4) = enter_atomisation_pass();
                        assert_eq!(d4, 4);
                        assert!(d4 > MAX_ATOMISATION_DEPTH, "depth=4 exceeds cap=3");
                    }
                    // Dropping the innermost guard restores depth to 3.
                    assert_eq!(current_atomisation_depth(), 3);
                }
                assert_eq!(current_atomisation_depth(), 2);
            }
            assert_eq!(current_atomisation_depth(), 1);
        }
        assert_eq!(current_atomisation_depth(), 0);
    }

    // ---- Cluster-A COR-1 / COR-7 / COV-3 — compute_atom_span tests.

    #[test]
    fn compute_atom_span_paraphrase_fallback_returns_none() {
        // Atom text that the curator paraphrased — it does not appear
        // verbatim in the parent body. Pre-fix the fallback was a
        // 32-char prefix search; the current contract returns `None`
        // so the substrate stamps `source_uri` without a span.
        let body = "The deployment manifest pins the image digest explicitly.";
        let paraphrase = "Curator paraphrased this sentence entirely.";
        let mut cursor = 0_usize;

        let got = compute_atom_span(body, paraphrase, &mut cursor);
        assert!(
            got.is_none(),
            "paraphrase miss must return None, got {got:?}"
        );
        // Cursor unchanged on miss.
        assert_eq!(cursor, 0);

        // Critical: the miss must also be graceful once a prior
        // successful hit on the same body has advanced the cursor.
        let hit = compute_atom_span(body, "deployment manifest", &mut cursor)
            .expect("verbatim needle should be found");
        assert_eq!(&body[hit.start..hit.end], "deployment manifest");
        let cursor_after_hit = cursor;
        assert!(cursor_after_hit > hit.start);

        let got = compute_atom_span(body, paraphrase, &mut cursor);
        assert!(
            got.is_none(),
            "paraphrase miss after a hit must return None, got {got:?}"
        );
        assert_eq!(cursor, cursor_after_hit, "cursor must not move on a miss");
    }

    #[test]
    fn compute_atom_span_multibyte_utf8_stays_on_char_boundary() {
        // Multi-byte body covering Latin-with-diacritic (Café), CJK
        // (中文), and a 4-byte emoji (🦀). Pre-fix the cursor advance
        // (`start.saturating_add(1)`) split codepoints and the next
        // `source_body[*cursor..]` slice would panic. Post-fix the
        // cursor always lands on a valid char boundary.
        let body = "Café 中文 🦀 statement that follows the emoji.";
        let mut cursor = 0_usize;

        // First hit — the CJK substring.
        let span = compute_atom_span(body, "中文", &mut cursor)
            .expect("multi-byte needle should be found verbatim");
        // Span lies on char boundaries (Rust's str::find guarantees
        // this for verbatim matches; we re-assert as a regression pin).
        assert!(body.is_char_boundary(span.start));
        assert!(body.is_char_boundary(span.end));
        assert_eq!(&body[span.start..span.end], "中文");

        // Cursor must have advanced PAST `start` to the next char
        // boundary — never mid-codepoint.
        assert!(cursor > span.start);
        assert!(
            body.is_char_boundary(cursor),
            "cursor={cursor} mid-codepoint"
        );

        // Second hit — the emoji. Critical: this slice would have
        // panicked pre-fix because the cursor would have been
        // mid-codepoint at byte-offset start+1 of the CJK char.
        let span2 = compute_atom_span(body, "🦀", &mut cursor)
            .expect("emoji needle should be found after CJK");
        assert!(body.is_char_boundary(span2.start));
        assert!(body.is_char_boundary(span2.end));
        assert_eq!(&body[span2.start..span2.end], "🦀");

        // Third hit — verify a verbatim ASCII sentence still works in
        // the same pass.
        let span3 = compute_atom_span(body, "statement that follows the emoji.", &mut cursor)
            .expect("ascii needle should still be found");
        assert_eq!(
            &body[span3.start..span3.end],
            "statement that follows the emoji."
        );
    }

    #[test]
    fn compute_atom_span_cursor_clamps_to_body_length() {
        // A cursor sitting past the end of the body must NOT panic. The
        // function falls through to the whole-body search.
        let body = "short body";
        let mut cursor = 1_000_usize;
        let span = compute_atom_span(body, "body", &mut cursor).expect("fallback whole-body");
        assert_eq!(&body[span.start..span.end], "body");
        // The hit pulls the stale cursor back inside the body.
        assert!(cursor <= body.len(), "cursor={cursor} still past the end");
        assert!(body.is_char_boundary(cursor));
    }

    #[test]
    fn compute_atom_span_stale_cursor_realigns_to_boundary() {
        // A pathological caller passing a cursor mid-codepoint should
        // not panic; the function realigns DOWN to the prior boundary
        // before slicing. Regression pin for `floor_char_boundary`.
        let body = "Café statement";
        // The 'é' is the bytes `0xC3 0xA9` — byte offset 4 is the
        // start of `0xA9`, mid-codepoint.
        let mut cursor = 4_usize;
        // Must not panic, and must still locate the needle that lies
        // after the realigned cursor.
        let span = compute_atom_span(body, "statement", &mut cursor)
            .expect("needle after the realigned cursor should be found");
        assert_eq!(&body[span.start..span.end], "statement");
        assert!(
            body.is_char_boundary(cursor),
            "cursor={cursor} mid-codepoint"
        );
    }

    #[test]
    fn floor_char_boundary_walks_back_to_codepoint_start() {
        let s = "Café 中文";
        // Boundary already valid.
        assert_eq!(floor_char_boundary(s, 0), 0);
        // Mid-codepoint (`é` starts at 3, occupies bytes 3..5).
        assert_eq!(floor_char_boundary(s, 4), 3);
        // Past the end clamps to len.
        assert_eq!(floor_char_boundary(s, 9999), s.len());
    }
}