Skip to main content

ai_memory/forensic/
bundle.rs

1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3
4//! v0.7.0 L2-5 (issue #670) — forensic evidence bundle.
5//!
6//! This module assembles and verifies the procurement-grade evidence
7//! tarball produced by `ai-memory export-forensic-bundle`. The bundle
8//! is the OSS surface for the `AgenticMem Attest` tier — a single
9//! tar file an external auditor can re-verify with no network and no
10//! daemon state, just the public keys of the signing agents.
11//!
12//! ## Bundle layout
13//!
14//! ```text
15//!     <bundle>.tar
16//!       manifest.json                 — bundle metadata + SHA-256s + sig
17//!       verification.json             — L1-3 `verify-reflection-chain` JSON
18//!       memories/<id>.json            — target memory + sources
19//!       edges/<src>__<rel>__<dst>.json — reflects_on / supersedes /
20//!                                       derived_from edges + signatures
21//!       signed_events/<event_id>.json  — append-only audit rows for
22//!                                       the chain
23//!       transcripts/<id>.json          — transcript metadata
24//!       transcripts/<id>.content       — raw decompressed UTF-8 body
25//! ```
26//!
27//! ## Determinism + reproducibility
28//!
29//! Acceptance criterion from #670 is "byte-identical mod timestamp".
30//! We enforce that by:
31//!
32//! - Writing a minimal POSIX ustar archive in-process (no `tar` crate
33//!   dep — keeps the dep surface flat per repo convention).
34//! - Sorting every file name lexicographically before emission so two
35//!   builds over the same DB produce identical bytes regardless of
36//!   SQLite row order.
37//! - Pinning every per-file ustar header field (uid, gid, mtime, mode,
38//!   uname, gname) to a constant — there is no caller-supplied
39//!   filesystem metadata in the archive.
//! - Pinning the manifest field order via a struct definition rather
//!   than a `serde_json::Map`: a `Map` is `BTreeMap`-backed and orders
//!   keys alphabetically, whereas a `#[derive(Serialize)]` struct is
//!   written in field-declaration order, so `serde_json::to_vec_pretty`
//!   emits the same bytes for the same values on every build.
45//!
46//! The only legitimate non-determinism is `manifest.generated_at` —
47//! the RFC3339 instant the bundle was assembled. That field is
48//! explicitly documented as "expected to vary across rebuilds" and
49//! lives in a stable position so a downstream diff tool can ignore it
50//! exactly.
51//!
52//! ## Signature
53//!
54//! The bundle's `manifest.json` includes a SHA-256 over every file in
55//! the archive AND, when an AlphaOne operator keypair is on disk, an
56//! Ed25519 signature over a canonical concatenation of those hashes.
57//! An auditor verifies the bundle by:
58//!
59//! 1. Re-hashing every file in the tar.
60//! 2. Comparing each hash to `manifest.files[path].sha256`.
61//! 3. (If `manifest.signature` is present) re-deriving the same
62//!    canonical concat and verifying the Ed25519 signature against the
63//!    operator's public key.
64
65use std::collections::BTreeMap;
66use std::fs;
67use std::path::{Path, PathBuf};
68
69use anyhow::{Context, Result, anyhow, bail};
70use base64::Engine;
71use base64::engine::general_purpose::STANDARD_NO_PAD;
72use ed25519_dalek::Signer;
73use rusqlite::{Connection, params};
74use serde::{Deserialize, Serialize};
75use sha2::{Digest, Sha256};
76
77use crate::cli::CliOutput;
78use crate::identity::keypair as kp_mod;
79use crate::identity::sign::SignableLink;
80
/// Path of the manifest entry inside the bundle tarball (#1558 batch 6).
///
/// Used as the key under which the serialised [`Manifest`] is stored
/// once every other file has been hashed.
const MANIFEST_FILE_NAME: &str = "manifest.json";
83
84// ─────────────────────────────────────────────────────────────────────
85// Public arguments (consumed by daemon_runtime dispatch)
86// ─────────────────────────────────────────────────────────────────────
87
88/// Arguments for `ai-memory export-forensic-bundle`.
89#[derive(clap::Args, Debug)]
90pub struct ExportForensicBundleArgs {
91    /// Memory id whose reflection chain to bundle.
92    #[arg(long, value_name = "ID")]
93    pub memory_id: String,
94
95    /// Include the target memory + every reachable source memory.
96    #[arg(long, default_value_t = false)]
97    pub include_reflections: bool,
98
99    /// Include the transcript union (per L2-4 `replay_transcript_union`).
100    #[arg(long, default_value_t = false)]
101    pub include_transcripts: bool,
102
103    /// Output path for the tarball. Defaults to
104    /// `forensic-bundle-<short-id>-<rfc3339>.tar` in the working
105    /// directory.
106    #[arg(long, value_name = "PATH")]
107    pub output: Option<PathBuf>,
108
109    /// v0.7.0 WT-1-E — when true (default), include the full
110    /// atomisation chain whenever the target memory is an archived
111    /// source or an atom: the source row, every atom (atom_of =
112    /// source_id), every `derives_from` edge, and the
113    /// `atomisation_complete` signed event. When false the bundle
114    /// emits only the atoms (the source chain is skipped), useful
115    /// when an auditor only needs the canonical post-atomisation
116    /// surface and not the historical record.
117    #[arg(long, default_value_t = true)]
118    pub include_atomisation_chain: bool,
119}
120
121/// Arguments for `ai-memory verify-forensic-bundle <path>`.
122#[derive(clap::Args, Debug)]
123pub struct VerifyForensicBundleArgs {
124    /// Path to the `.tar` bundle to verify.
125    pub bundle_path: PathBuf,
126}
127
128// ─────────────────────────────────────────────────────────────────────
129// Manifest types
130// ─────────────────────────────────────────────────────────────────────
131
132/// One entry in the manifest's per-file index.
133#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
134pub struct ManifestFile {
135    /// Path inside the tarball (e.g. `memories/abc.json`).
136    pub path: String,
137    /// File size in bytes.
138    pub size: u64,
139    /// SHA-256 hex digest over the file contents.
140    pub sha256: String,
141}
142
143/// Manifest metadata + integrity index for the bundle.
144///
145/// Serialised to `manifest.json` inside the tar. The `signature` and
146/// `signer_agent_id` fields are filled when an AlphaOne operator
147/// keypair is available; auditors verify the signature with the
148/// operator's public key (out-of-band distribution — same model the
149/// rest of the H-track uses).
150#[derive(Debug, Clone, Serialize, Deserialize)]
151pub struct Manifest {
152    /// Bundle schema version. Bumped on a wire-incompatible change.
153    pub schema_version: u32,
154    /// Target memory id passed on the command line.
155    pub memory_id: String,
156    /// RFC3339 instant the bundle was assembled.
157    ///
158    /// Only field that legitimately varies across rebuilds (every
159    /// other byte in the bundle is deterministic — the reproducibility
160    /// acceptance criterion in #670 is "byte-identical mod timestamp"
161    /// and this is the timestamp).
162    pub generated_at: String,
163    /// `true` when `--include-reflections` was passed.
164    pub include_reflections: bool,
165    /// `true` when `--include-transcripts` was passed.
166    pub include_transcripts: bool,
167    /// Sorted-by-path SHA-256 manifest over every other file in the
168    /// archive (excludes `manifest.json` itself).
169    pub files: Vec<ManifestFile>,
170    /// Operator agent_id whose key signed the manifest, or `None` when
171    /// the bundle is unsigned (no operator key on disk).
172    #[serde(skip_serializing_if = "Option::is_none")]
173    pub signer_agent_id: Option<String>,
174    /// Ed25519 signature (base64) over `canonical_signed_bytes` of the
175    /// rest of the manifest, or `None` when unsigned.
176    #[serde(skip_serializing_if = "Option::is_none")]
177    pub signature: Option<String>,
178}
179
/// Current value written to [`Manifest::schema_version`].
///
/// Increment whenever a change would break an auditor's ability to
/// deserialise the bundle: a new mandatory field, a removed field, a
/// reshuffled enum, and so on.
pub const BUNDLE_SCHEMA_VERSION: u32 = 1;
184
185// ─────────────────────────────────────────────────────────────────────
186// Per-entity envelope types
187// ─────────────────────────────────────────────────────────────────────
188
189/// One stored memory inside the bundle. We re-emit a stable subset of
190/// the [`crate::models::Memory`] shape so a future struct refactor
191/// doesn't silently break the on-disk format.
192#[derive(Debug, Clone, Serialize, Deserialize)]
193pub struct MemoryEnvelope {
194    pub id: String,
195    pub namespace: String,
196    pub title: String,
197    pub content: String,
198    pub tier: String,
199    pub memory_kind: String,
200    pub reflection_depth: i32,
201    pub created_at: String,
202    pub updated_at: String,
203    pub metadata: serde_json::Value,
204    /// v0.7.0 WT-1-E — atomisation-chain enrichment. Present only
205    /// when this memory is involved in atomisation (either an
206    /// archived source with `atomised_into > 0` or an atom with
207    /// `atom_of` set). Provides the full chain locally to the
208    /// auditor without forcing them to cross-reference between
209    /// envelopes. Skipped (None) on rows untouched by atomisation
210    /// and on every bundle built with
211    /// `--include-atomisation-chain=false`.
212    #[serde(skip_serializing_if = "Option::is_none")]
213    pub atomisation: Option<AtomisationEnvelope>,
214    /// v0.7.0 Form 4 (issue #757) — fact-provenance citations.
215    /// Always emitted (defaults to an empty array) so auditors can
216    /// rely on the field's presence regardless of row vintage.
217    /// Mirrors [`crate::models::Memory::citations`].
218    #[serde(default)]
219    pub citations: Vec<crate::models::Citation>,
220    /// v0.7.0 Form 4 — first-class URI-form pointer to the cited
221    /// source body. Omitted when NULL on the underlying row.
222    #[serde(default, skip_serializing_if = "Option::is_none")]
223    pub source_uri: Option<String>,
224    /// v0.7.0 Form 4 — byte-range into the parent source body.
225    /// Omitted when NULL on the underlying row.
226    #[serde(default, skip_serializing_if = "Option::is_none")]
227    pub source_span: Option<crate::models::SourceSpan>,
228    /// v0.7.0 Form 5 (issue #758) — typed discriminator for the
229    /// provenance of the `confidence` value on the underlying row.
230    /// Always emitted so auditors can rely on the field's presence
231    /// regardless of row vintage. Legacy rows resolve to
232    /// `caller_provided` (the SQL default on schema v39).
233    #[serde(default)]
234    pub confidence_source: crate::models::ConfidenceSource,
235    /// v0.7.0 Form 5 — JSON snapshot of the signals that produced an
236    /// auto-derived or calibrated confidence value. Omitted when NULL
237    /// on the underlying row (i.e., the row's
238    /// `confidence_source == CallerProvided`).
239    #[serde(default, skip_serializing_if = "Option::is_none")]
240    pub confidence_signals: Option<crate::models::ConfidenceSignals>,
241    /// v0.7.0 Form 5 — RFC3339 stamp of the last decay computation.
242    /// Omitted when NULL on the underlying row (i.e., the row has
243    /// never been touched by the decay updater).
244    #[serde(default, skip_serializing_if = "Option::is_none")]
245    pub confidence_decayed_at: Option<String>,
246}
247
248/// v0.7.0 WT-1-E — per-memory atomisation enrichment block. Carries
249/// the substrate-visible signals (`atomised_into`, `archived_at`,
250/// `atom_ids`, `atom_of`) directly so an auditor can reconstruct the
251/// chain from a single envelope.
252#[derive(Debug, Clone, Serialize, Deserialize, Default)]
253pub struct AtomisationEnvelope {
254    /// Count of atoms emitted from this source (mirror of
255    /// `memories.atomised_into`). `None` on atom rows and on rows
256    /// untouched by atomisation.
257    #[serde(skip_serializing_if = "Option::is_none")]
258    pub atomised_into: Option<i64>,
259    /// RFC3339 stamp from `metadata.atomisation_archived_at`,
260    /// populated by the WT-1-B `archive_source` step. `None` on
261    /// rows untouched by atomisation.
262    #[serde(skip_serializing_if = "Option::is_none")]
263    pub archived_at: Option<String>,
264    /// Ordered list of atom ids whose `atom_of` points back at this
265    /// source. Empty on atom rows and on rows untouched by
266    /// atomisation.
267    #[serde(skip_serializing_if = "Vec::is_empty", default)]
268    pub atom_ids: Vec<String>,
269    /// Parent source id when this memory is an atom. `None` on
270    /// archived-source rows and on rows untouched by atomisation.
271    #[serde(skip_serializing_if = "Option::is_none")]
272    pub atom_of: Option<String>,
273}
274
275/// One signed link inside the bundle. Carries the canonical
276/// [`SignableLink`] field set plus the raw signature so an auditor can
277/// re-derive the canonical-CBOR bytes and re-verify the Ed25519
278/// signature without joining back to a substrate row.
279#[derive(Debug, Clone, Serialize, Deserialize)]
280pub struct EdgeEnvelope {
281    pub source_id: String,
282    pub target_id: String,
283    pub relation: String,
284    pub created_at: String,
285    pub observed_by: Option<String>,
286    pub valid_from: Option<String>,
287    pub valid_until: Option<String>,
288    pub attest_level: String,
289    /// Hex-encoded Ed25519 signature, or `None` for unsigned edges.
290    pub signature_hex: Option<String>,
291}
292
293/// One `signed_events` audit row inside the bundle. Mirrors the column
294/// shape of [`crate::signed_events::SignedEvent`] but emits
295/// `payload_hash` and `signature` as hex strings so the on-wire format
296/// is JSON-safe.
297#[derive(Debug, Clone, Serialize, Deserialize)]
298pub struct SignedEventEnvelope {
299    pub id: String,
300    pub agent_id: String,
301    pub event_type: String,
302    pub payload_hash_hex: String,
303    pub signature_hex: Option<String>,
304    pub attest_level: String,
305    pub timestamp: String,
306}
307
308/// One transcript inside the bundle. We split metadata from content so
309/// callers can deserialise the metadata without holding the body in
310/// memory.
311#[derive(Debug, Clone, Serialize, Deserialize)]
312pub struct TranscriptEnvelope {
313    pub id: String,
314    pub namespace: String,
315    pub created_at: String,
316    pub expires_at: Option<String>,
317    pub compressed_size: i64,
318    pub original_size: i64,
319    /// Memory ids that linked to this transcript inside the chain.
320    pub linked_memory_ids: Vec<String>,
321}
322
323// ─────────────────────────────────────────────────────────────────────
324// Bundle builder
325// ─────────────────────────────────────────────────────────────────────
326
/// The bundle as it exists before tar emission: archive path mapped to
/// that member's bytes. A `BTreeMap` is used because it iterates in
/// key order, which is what makes the emitted archive deterministic.
type BundleFiles = BTreeMap<String, Vec<u8>>;
331
332/// Build the bundle for the given memory id, writing the tarball to
333/// `output_path`.
334///
335/// `generated_at` overrides the RFC3339 timestamp written into the
336/// manifest. The CLI always passes `None` (which fills in
337/// `chrono::Utc::now()`); the test suite passes a fixed string to make
338/// the byte-identical reproducibility assertion provable.
339///
340/// # Errors
341///
342/// Propagates I/O errors writing the tarball, signing errors when an
343/// operator key is on disk but corrupted, or substrate read errors.
344pub fn build(
345    conn: &Connection,
346    args: &ExportForensicBundleArgs,
347    output_path: &Path,
348    generated_at: Option<&str>,
349) -> Result<()> {
350    let files = build_files(conn, args, generated_at)?;
351    write_ustar(output_path, &files).context("write forensic bundle tar")
352}
353
354/// In-memory variant of [`build`]. Returns the
355/// path-keyed `BundleFiles` map ready to be serialised by either
356/// [`write_ustar`] (production) or `pack_to_vec` (tests). Public so
357/// the integration test suite can rebuild the same bundle twice and
358/// diff the bytes without going through the filesystem.
359pub fn build_files(
360    conn: &Connection,
361    args: &ExportForensicBundleArgs,
362    generated_at: Option<&str>,
363) -> Result<BundleFiles> {
364    let generated_at: String = generated_at
365        .map(ToString::to_string)
366        .unwrap_or_else(|| chrono::Utc::now().to_rfc3339());
367
368    // 1) Walk the reflects_on graph backward from memory_id to assemble
369    //    the set of in-scope memory ids.
370    let mut chain_ids = walk_reflection_chain(conn, &args.memory_id)?;
371
372    // v0.7.0 WT-1-E — atomisation-chain expansion. When the target
373    // memory (or any ancestor) is an archived source, fold its atoms
374    // in. When the target is itself an atom, fold its parent source
375    // in. Both directions are needed so an auditor sees the full
376    // chain regardless of which "end" of it they queried by id.
377    // The expansion is purely additive — reflections + atomisation
378    // can coexist on the same memory.
379    if args.include_atomisation_chain {
380        let mut expanded = chain_ids.clone();
381        for mid in &chain_ids {
382            // Source → atoms (when this id is an archived source)
383            for atom_id in atom_ids_of_source(conn, mid)? {
384                if !expanded.contains(&atom_id) {
385                    expanded.push(atom_id);
386                }
387            }
388            // Atom → source (when this id is an atom)
389            if let Some(parent_id) = atom_of_for(conn, mid)? {
390                if !expanded.contains(&parent_id) {
391                    expanded.push(parent_id.clone());
392                    // Recursively pick up sibling atoms of that
393                    // parent so the auditor sees the whole sibling
394                    // cohort, not just the one atom that was the
395                    // entry point.
396                    for atom_id in atom_ids_of_source(conn, &parent_id)? {
397                        if !expanded.contains(&atom_id) {
398                            expanded.push(atom_id);
399                        }
400                    }
401                }
402            }
403        }
404        expanded.sort();
405        chain_ids = expanded;
406    }
407
408    let mut files: BundleFiles = BTreeMap::new();
409
410    // 2) Memory envelopes (target + ancestors when --include-reflections).
411    //    The atomisation expansion above is preserved verbatim when
412    //    --include-reflections=true; when --include-reflections=false
413    //    the original reflects_on logic emits only the target row,
414    //    but the atomisation enrichment is still attached to it.
415    let memory_ids_to_emit: Vec<String> = if args.include_reflections {
416        chain_ids.clone()
417    } else if args.include_atomisation_chain {
418        // Even without --include-reflections, emit the target's
419        // atomisation cohort (source + sibling atoms) so the bundle
420        // is self-contained for the substrate-visible chain.
421        let mut ids = vec![args.memory_id.clone()];
422        for atom_id in atom_ids_of_source(conn, &args.memory_id)? {
423            if !ids.contains(&atom_id) {
424                ids.push(atom_id);
425            }
426        }
427        if let Some(parent) = atom_of_for(conn, &args.memory_id)? {
428            if !ids.contains(&parent) {
429                ids.push(parent.clone());
430            }
431            for atom_id in atom_ids_of_source(conn, &parent)? {
432                if !ids.contains(&atom_id) {
433                    ids.push(atom_id);
434                }
435            }
436        }
437        ids.sort();
438        ids
439    } else {
440        vec![args.memory_id.clone()]
441    };
442    for mid in &memory_ids_to_emit {
443        if let Some(mem) = crate::db::get(conn, mid).context("db::get for bundle")? {
444            let atomisation = if args.include_atomisation_chain {
445                build_atomisation_envelope(conn, &mem)?
446            } else {
447                None
448            };
449            let env = MemoryEnvelope {
450                id: mem.id.clone(),
451                namespace: mem.namespace.clone(),
452                title: mem.title.clone(),
453                content: mem.content.clone(),
454                tier: mem.tier.as_str().to_string(),
455                memory_kind: format!("{:?}", mem.memory_kind).to_ascii_lowercase(),
456                reflection_depth: mem.reflection_depth,
457                created_at: mem.created_at.clone(),
458                updated_at: mem.updated_at.clone(),
459                metadata: mem.metadata.clone(),
460                atomisation,
461                // v0.7.0 Form 4 (issue #757) — fact-provenance fields
462                // ride alongside the existing envelope shape. Citations
463                // always lands (defaults to empty); source_uri /
464                // source_span emit only when populated.
465                citations: mem.citations.clone(),
466                source_uri: mem.source_uri.clone(),
467                source_span: mem.source_span,
468                // v0.7.0 Form 5 (issue #758) — confidence-provenance
469                // fields round-trip into the bundle so an auditor can
470                // verify whether the `confidence` value was caller-
471                // provided, auto-derived, calibrated, or decayed.
472                confidence_source: mem.confidence_source,
473                confidence_signals: mem.confidence_signals.clone(),
474                confidence_decayed_at: mem.confidence_decayed_at.clone(),
475            };
476            let bytes = serde_json::to_vec_pretty(&env).context("serialise MemoryEnvelope")?;
477            files.insert(format!("memories/{}.json", mem.id), bytes);
478        }
479    }
480
481    // 3) Edge envelopes — every reflects_on / supersedes / derived_from
482    //    edge whose source is in `chain_ids`. WT-1-E folds in
483    //    `derives_from` (atom → parent) alongside the existing
484    //    relations — see [`fetch_edges_for`]. When the
485    //    `include_atomisation_chain` flag is unset, drop
486    //    `derives_from` edges from the output so the auditor
487    //    sees only the atom rows (the historical record stays
488    //    in the substrate; the bundle just doesn't carry it).
489    let edges_raw = fetch_edges_for(conn, &chain_ids)?;
490    let edges: Vec<_> = if args.include_atomisation_chain {
491        edges_raw
492    } else {
493        edges_raw
494            .into_iter()
495            .filter(|e| e.relation != crate::models::MemoryLinkRelation::DerivesFrom.as_str())
496            .collect()
497    };
498    for edge in &edges {
499        let bytes = serde_json::to_vec_pretty(edge).context("serialise EdgeEnvelope")?;
500        // Lexicographic path so determinism survives row-order shuffling.
501        // Path components are safe ASCII (uuid + relation name); no
502        // sanitisation needed.
503        let path = format!(
504            "edges/{}__{}__{}.json",
505            edge.source_id, edge.relation, edge.target_id
506        );
507        files.insert(path, bytes);
508    }
509
510    // 4) signed_events slice — every audit row whose agent_id matches
511    //    a memory in the chain (the H5 convention is to use agent_id =
512    //    actor's id; the memory_id is embedded in the payload).
513    let mut event_ids_emitted: std::collections::HashSet<String> = std::collections::HashSet::new();
514    let events = fetch_signed_events_for(conn, &chain_ids)?;
515    for ev in &events {
516        let bytes = serde_json::to_vec_pretty(ev).context("serialise SignedEventEnvelope")?;
517        files.insert(format!("signed_events/{}.json", ev.id), bytes);
518        event_ids_emitted.insert(ev.id.clone());
519    }
520
521    // v0.7.0 WT-1-E — atomisation-chain signed_events. Two event
522    // shapes need to land in the bundle even when their agent_id is
523    // not itself a memory id in the chain:
524    //
525    //   * `atomisation_complete` — the summary event the WT-1-B
526    //     atomiser emits per source. Its `agent_id` is the calling
527    //     agent's id (e.g. `ai:claude@host:pid-…`), not a memory
528    //     id, so the H5-agent-id-match query above misses it.
529    //   * `memory_link.created` for each `derives_from` atom→parent
530    //     edge. Again the agent_id is the writer, not the memory.
531    //
532    // We fetch these explicitly by joining the memory_links table
533    // (for the per-atom edge events) and by event_type +
534    // payload_hash cross-reference (for the summary event). Both
535    // sets are unioned with the existing agent-id-matched events,
536    // de-duped, then emitted under the same path scheme.
537    if args.include_atomisation_chain {
538        let extra = fetch_atomisation_signed_events_for(conn, &chain_ids)?;
539        for ev in &extra {
540            if event_ids_emitted.contains(&ev.id) {
541                continue;
542            }
543            let bytes = serde_json::to_vec_pretty(ev).context("serialise SignedEventEnvelope")?;
544            files.insert(format!("signed_events/{}.json", ev.id), bytes);
545            event_ids_emitted.insert(ev.id.clone());
546        }
547    }
548
549    // 5) Transcript union (per L2-4) when --include-transcripts.
550    if args.include_transcripts {
551        let entries =
552            crate::transcripts::replay::replay_transcript_union(conn, &args.memory_id, None)
553                .context("replay_transcript_union for bundle")?;
554
555        // Dedup by transcript id (replay_transcript_union already
556        // dedups, but defensive coding here keeps the manifest stable
557        // if the upstream contract loosens).
558        let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
559        for entry in &entries {
560            if !seen.insert(entry.meta.id.clone()) {
561                continue;
562            }
563            // Gather every memory_id that linked to this transcript
564            // (deterministic order via sort).
565            let mut linked: Vec<String> = entries
566                .iter()
567                .filter(|e| e.meta.id == entry.meta.id)
568                .map(|e| e.memory_id.clone())
569                .collect();
570            linked.sort();
571            linked.dedup();
572
573            let env = TranscriptEnvelope {
574                id: entry.meta.id.clone(),
575                namespace: entry.meta.namespace.clone(),
576                created_at: entry.meta.created_at.clone(),
577                expires_at: entry.meta.expires_at.clone(),
578                compressed_size: entry.meta.compressed_size,
579                original_size: entry.meta.original_size,
580                linked_memory_ids: linked,
581            };
582            let meta_bytes =
583                serde_json::to_vec_pretty(&env).context("serialise TranscriptEnvelope")?;
584            files.insert(format!("transcripts/{}.json", entry.meta.id), meta_bytes);
585
586            if let Some(content) = crate::transcripts::storage::fetch(conn, &entry.meta.id)
587                .context("fetch transcript content for bundle")?
588            {
589                files.insert(
590                    format!("transcripts/{}.content", entry.meta.id),
591                    content.into_bytes(),
592                );
593            }
594        }
595    }
596
597    // 6) Embed the L1-3 verify-reflection-chain JSON as verification.json.
598    //    Pass `generated_at` through so the embedded report's
599    //    timestamp matches the manifest's — keeps the bundle
600    //    reproducible per #670's "byte-identical mod timestamp"
601    //    acceptance criterion (the manifest's `generated_at` is the
602    //    one legitimate non-deterministic field).
603    let report =
604        crate::cli::verify::build_chain_report_at(conn, &args.memory_id, true, Some(&generated_at))
605            .context("build_chain_report for bundle")?;
606    let verification_bytes =
607        serde_json::to_vec_pretty(&report).context("serialise chain report")?;
608    files.insert("verification.json".to_string(), verification_bytes);
609
610    // 7) Build the manifest (every file EXCEPT manifest.json itself
611    //    contributes to the SHA-256 index) and sign it.
612    let mut manifest = Manifest {
613        schema_version: BUNDLE_SCHEMA_VERSION,
614        memory_id: args.memory_id.clone(),
615        generated_at,
616        include_reflections: args.include_reflections,
617        include_transcripts: args.include_transcripts,
618        files: files
619            .iter()
620            .map(|(p, body)| ManifestFile {
621                path: p.clone(),
622                size: body.len() as u64,
623                sha256: hex_sha256(body),
624            })
625            .collect(),
626        signer_agent_id: None,
627        signature: None,
628    };
629
630    // 8) Sign the manifest with the operator's keypair, if one is on
631    //    disk. The signature commits to a canonical concatenation of
632    //    every file's path + size + sha256 (the rest of the manifest
633    //    fields are reconstructible from the tarball at verify time).
634    if let Some((agent_id, sig_b64)) = sign_manifest_if_keyed(&manifest)? {
635        manifest.signer_agent_id = Some(agent_id);
636        manifest.signature = Some(sig_b64);
637    }
638
639    let manifest_bytes = serde_json::to_vec_pretty(&manifest).context("serialise Manifest")?;
640    files.insert(MANIFEST_FILE_NAME.to_string(), manifest_bytes);
641
642    Ok(files)
643}
644
645/// Canonical signing input: `path:size:sha256` per file, joined by
646/// `\n`, then the bundle's schema version + memory id appended. The
647/// ordering of the `manifest.files` vec is already deterministic (it
648/// reflects `BundleFiles`'s BTreeMap iteration order), so the same
649/// bundle always produces the same signing input.
650pub fn canonical_signed_bytes(m: &Manifest) -> Vec<u8> {
651    let mut out = String::new();
652    for f in &m.files {
653        out.push_str(&f.path);
654        out.push(':');
655        out.push_str(&f.size.to_string());
656        out.push(':');
657        out.push_str(&f.sha256);
658        out.push('\n');
659    }
660    out.push_str("schema_version:");
661    out.push_str(&m.schema_version.to_string());
662    out.push('\n');
663    out.push_str("memory_id:");
664    out.push_str(&m.memory_id);
665    out.push('\n');
666    out.into_bytes()
667}
668
669/// Look for an operator keypair on disk and, if found, sign the
670/// manifest's canonical bytes. Returns `(agent_id, base64_signature)`.
671/// Returns `Ok(None)` when no key is available — that path is the
672/// "unsigned bundle" mode, which still verifies for integrity (every
673/// file's SHA-256 is recomputed and compared) but lacks operator
674/// attestation.
675fn sign_manifest_if_keyed(manifest: &Manifest) -> Result<Option<(String, String)>> {
676    let key_dir = match kp_mod::default_key_dir() {
677        Ok(p) => p,
678        Err(_) => return Ok(None),
679    };
680    if !key_dir.exists() {
681        return Ok(None);
682    }
683    let entries = match kp_mod::list(&key_dir) {
684        Ok(v) => v,
685        Err(_) => return Ok(None),
686    };
687    // Find the first keypair with a private signing key on disk
688    // (operator-managed; deterministic by `agent_id` sort order).
689    let mut candidates: Vec<String> = entries.into_iter().map(|kp| kp.agent_id).collect();
690    candidates.sort();
691    for agent_id in candidates {
692        if let Ok(kp) = kp_mod::load(&agent_id, &key_dir) {
693            if let Some(signing) = kp.private.as_ref() {
694                let bytes = canonical_signed_bytes(manifest);
695                let sig = signing.sign(&bytes);
696                let sig_b64 = STANDARD_NO_PAD.encode(sig.to_bytes());
697                return Ok(Some((agent_id, sig_b64)));
698            }
699        }
700    }
701    Ok(None)
702}
703
704// ─────────────────────────────────────────────────────────────────────
705// Substrate readers
706// ─────────────────────────────────────────────────────────────────────
707
708/// Walk `reflects_on` edges backward from `root` and return the
709/// visited memory ids in BFS order. Mirrors the walk in
710/// [`crate::cli::verify::build_chain_report`] but returns only the id
711/// set (no per-edge verification).
712fn walk_reflection_chain(conn: &Connection, root: &str) -> Result<Vec<String>> {
713    use std::collections::{HashSet, VecDeque};
714    let mut visited: HashSet<String> = HashSet::new();
715    let mut order: Vec<String> = Vec::new();
716    let mut queue: VecDeque<String> = VecDeque::new();
717    queue.push_back(root.to_string());
718    while let Some(cur) = queue.pop_front() {
719        if !visited.insert(cur.clone()) {
720            continue;
721        }
722        order.push(cur.clone());
723        let mut stmt = conn.prepare(
724            "SELECT target_id FROM memory_links \
725             WHERE source_id = ?1 AND relation = 'reflects_on' \
726             ORDER BY target_id",
727        )?;
728        let rows = stmt.query_map(params![cur], |r| r.get::<_, String>(0))?;
729        for r in rows {
730            let tgt = r?;
731            if !visited.contains(&tgt) {
732                queue.push_back(tgt);
733            }
734        }
735    }
736    // Stable sort so the on-wire ordering is independent of BFS
737    // expansion order (BFS depends on row insertion order which can
738    // differ across DBs even when the set is identical).
739    order.sort();
740    Ok(order)
741}
742
743/// Fetch every `reflects_on` / `supersedes` / `derived_from` edge
744/// whose `source_id` is in `chain_ids`. Returns rows sorted by
745/// (source_id, relation, target_id) so the on-wire ordering is
746/// deterministic.
747fn fetch_edges_for(conn: &Connection, chain_ids: &[String]) -> Result<Vec<EdgeEnvelope>> {
748    let mut out = Vec::new();
749    if chain_ids.is_empty() {
750        return Ok(out);
751    }
752    let placeholders: String = chain_ids
753        .iter()
754        .enumerate()
755        .map(|(i, _)| format!("?{}", i + 1))
756        .collect::<Vec<_>>()
757        .join(", ");
758    let sql = format!(
759        "SELECT source_id, target_id, relation, created_at, observed_by, \
760                valid_from, valid_until, signature, attest_level \
761         FROM memory_links \
762         WHERE source_id IN ({placeholders}) \
763           AND relation IN ('reflects_on', 'supersedes', 'derived_from', 'derives_from') \
764         ORDER BY source_id, relation, target_id"
765    );
766    let mut stmt = conn.prepare(&sql)?;
767    let param_refs: Vec<&dyn rusqlite::ToSql> = chain_ids
768        .iter()
769        .map(|s| s as &dyn rusqlite::ToSql)
770        .collect();
771    let rows = stmt.query_map(param_refs.as_slice(), |r| {
772        Ok(EdgeEnvelope {
773            source_id: r.get::<_, String>(0)?,
774            target_id: r.get::<_, String>(1)?,
775            relation: r.get::<_, String>(2)?,
776            created_at: r.get::<_, String>(3)?,
777            observed_by: r.get::<_, Option<String>>(4)?,
778            valid_from: r.get::<_, Option<String>>(5)?,
779            valid_until: r.get::<_, Option<String>>(6)?,
780            signature_hex: r.get::<_, Option<Vec<u8>>>(7)?.map(|b| bytes_to_hex(&b)),
781            attest_level: r
782                .get::<_, Option<String>>(8)?
783                .unwrap_or_else(|| crate::models::AttestLevel::Unsigned.as_str().to_string()),
784        })
785    })?;
786    for r in rows {
787        out.push(r?);
788    }
789    Ok(out)
790}
791
792/// v0.7.0 WT-1-E — return the atom ids whose `atom_of` column FK
793/// points back to `source_id`. Empty when `source_id` is not an
794/// archived source. Ordering matches the WT-1-B emission order
795/// (created_at ASC, id ASC).
796fn atom_ids_of_source(conn: &Connection, source_id: &str) -> Result<Vec<String>> {
797    let mut stmt = conn.prepare(
798        "SELECT id FROM memories \
799         WHERE atom_of = ?1 \
800         ORDER BY created_at ASC, id ASC",
801    )?;
802    let rows = stmt.query_map(params![source_id], |r| r.get::<_, String>(0))?;
803    let mut out = Vec::new();
804    for r in rows {
805        out.push(r?);
806    }
807    Ok(out)
808}
809
810/// v0.7.0 WT-1-E — return the parent source id when `id` is an atom
811/// (i.e. `memories.atom_of` is set). `None` for non-atom rows or
812/// when the id is unknown.
813fn atom_of_for(conn: &Connection, id: &str) -> Result<Option<String>> {
814    let res: rusqlite::Result<Option<String>> = conn.query_row(
815        "SELECT atom_of FROM memories WHERE id = ?1",
816        params![id],
817        |r| r.get::<_, Option<String>>(0),
818    );
819    match res {
820        Ok(v) => Ok(v),
821        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
822        Err(e) => Err(e.into()),
823    }
824}
825
826/// v0.7.0 WT-1-E — build the `AtomisationEnvelope` enrichment block
827/// for `mem`. Returns `None` when the memory is untouched by
828/// atomisation (neither an archived source nor an atom), so the
829/// outer envelope's `Option<AtomisationEnvelope>` field round-trips
830/// `serde(skip_serializing_if = "Option::is_none")` cleanly.
831fn build_atomisation_envelope(
832    conn: &Connection,
833    mem: &crate::models::Memory,
834) -> Result<Option<AtomisationEnvelope>> {
835    // Read the two source-side columns. These are not on the Memory
836    // struct (yet), so query directly.
837    let (atomised_into, atom_of_col): (Option<i64>, Option<String>) = conn
838        .query_row(
839            "SELECT atomised_into, atom_of FROM memories WHERE id = ?1",
840            params![mem.id],
841            |r| Ok((r.get::<_, Option<i64>>(0)?, r.get::<_, Option<String>>(1)?)),
842        )
843        .unwrap_or((None, None));
844
845    let archived_at = mem
846        .metadata
847        .get(crate::models::field_names::ATOMISATION_ARCHIVED_AT)
848        .and_then(|v| v.as_str())
849        .map(ToString::to_string);
850
851    let is_archived_source = atomised_into.unwrap_or(0) > 0 || archived_at.is_some();
852    let is_atom = atom_of_col.is_some();
853    if !is_archived_source && !is_atom {
854        return Ok(None);
855    }
856    let atom_ids = if is_archived_source {
857        atom_ids_of_source(conn, &mem.id)?
858    } else {
859        Vec::new()
860    };
861    Ok(Some(AtomisationEnvelope {
862        atomised_into: atomised_into.filter(|n| *n > 0),
863        archived_at,
864        atom_ids,
865        atom_of: atom_of_col,
866    }))
867}
868
869/// v0.7.0 WT-1-E — fetch every atomisation-related signed event for
870/// the chain. Two queries:
871///
872///   1. Every `memory_link.created` row whose payload describes a
873///      `derives_from` edge from one of the chain's memory ids. We
874///      approximate this by joining on `memory_links` (the row that
875///      generated the audit event) — the WT-1-B atomiser writes the
876///      link via `create_link_signed` which appends a matching
877///      audit row at the same instant. Match heuristic: same
878///      agent_id and timestamp >= the link's created_at on the
879///      same source/target row.
880///   2. Every `atomisation_complete` event whose timestamp lies at
881///      or after the earliest `derives_from` edge's `created_at`
882///      for the chain (i.e. the summary event for any atomisation
883///      that involves these memories). Because the payload itself
884///      is only stored as a hash we can't filter on `source_id`
885///      directly; we instead fetch all events of that type that
886///      could plausibly have been emitted by the same calling
887///      agent and let the auditor cross-reference at verify time.
888///      The over-fetch is bounded by the agent_id set of the
889///      chain's `derives_from` writers, so unrelated atomisations
890///      from other agents are excluded.
891fn fetch_atomisation_signed_events_for(
892    conn: &Connection,
893    chain_ids: &[String],
894) -> Result<Vec<SignedEventEnvelope>> {
895    if chain_ids.is_empty() {
896        return Ok(Vec::new());
897    }
898    // Collect the set of `observed_by` agent ids on the chain's
899    // derives_from edges. The atomisation_complete event's
900    // `agent_id` matches the same `calling_agent_id` used by the
901    // per-atom create_link_signed call.
902    //
903    // Two disjoint placeholder ranges so the source_id and
904    // target_id INs each get their own bound slot — using the same
905    // placeholder name twice in rusqlite collapses the second bind,
906    // which leaves the OR branch unbound and rusqlite errors with
907    // "Wrong number of parameters."
908    let src_placeholders: String = (1..=chain_ids.len())
909        .map(|i| format!("?{i}"))
910        .collect::<Vec<_>>()
911        .join(", ");
912    let tgt_placeholders: String = (chain_ids.len() + 1..=chain_ids.len() * 2)
913        .map(|i| format!("?{i}"))
914        .collect::<Vec<_>>()
915        .join(", ");
916    let agent_sql = format!(
917        "SELECT DISTINCT observed_by FROM memory_links \
918         WHERE relation = 'derives_from' \
919           AND (source_id IN ({src_placeholders}) OR target_id IN ({tgt_placeholders})) \
920           AND observed_by IS NOT NULL"
921    );
922    let mut agent_stmt = conn.prepare(&agent_sql)?;
923    let bind_pairs: Vec<&dyn rusqlite::ToSql> = chain_ids
924        .iter()
925        .chain(chain_ids.iter())
926        .map(|s| s as &dyn rusqlite::ToSql)
927        .collect();
928    let agent_rows = agent_stmt.query_map(bind_pairs.as_slice(), |r| r.get::<_, String>(0))?;
929    let mut writer_agents: Vec<String> = Vec::new();
930    for r in agent_rows {
931        let id = r?;
932        if !writer_agents.contains(&id) {
933            writer_agents.push(id);
934        }
935    }
936
937    // Without a writer agent (i.e. unsigned `derives_from` edge) we
938    // still fall back to fetching atomisation_complete events of
939    // any agent so the bundle preserves the audit row. Auditors can
940    // distinguish via the `attest_level` column.
941    let mut out: Vec<SignedEventEnvelope> = Vec::new();
942    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
943
944    if writer_agents.is_empty() {
945        // Fallback: fetch ALL atomisation_complete + memory_link.created
946        // events. The chain has unsigned derives_from edges, so we
947        // cannot scope by agent — better to over-include in the
948        // bundle (auditor sees the relevant subset) than to silently
949        // drop the chain's audit trail.
950        let sql = "SELECT id, agent_id, event_type, payload_hash, signature, \
951                          attest_level, timestamp \
952                   FROM signed_events \
953                   WHERE event_type IN ('atomisation_complete', 'memory_link.created') \
954                   ORDER BY timestamp ASC, id ASC";
955        let mut stmt = conn.prepare(sql)?;
956        let rows = stmt.query_map([], row_to_signed_event_envelope)?;
957        for r in rows {
958            let ev = r?;
959            if seen.insert(ev.id.clone()) {
960                out.push(ev);
961            }
962        }
963        return Ok(out);
964    }
965
966    let agent_placeholders: String = writer_agents
967        .iter()
968        .enumerate()
969        .map(|(i, _)| format!("?{}", i + 1))
970        .collect::<Vec<_>>()
971        .join(", ");
972    let sql = format!(
973        "SELECT id, agent_id, event_type, payload_hash, signature, \
974                attest_level, timestamp \
975         FROM signed_events \
976         WHERE event_type IN ('atomisation_complete', 'memory_link.created') \
977           AND agent_id IN ({agent_placeholders}) \
978         ORDER BY timestamp ASC, id ASC"
979    );
980    let mut stmt = conn.prepare(&sql)?;
981    let param_refs: Vec<&dyn rusqlite::ToSql> = writer_agents
982        .iter()
983        .map(|s| s as &dyn rusqlite::ToSql)
984        .collect();
985    let rows = stmt.query_map(param_refs.as_slice(), row_to_signed_event_envelope)?;
986    for r in rows {
987        let ev = r?;
988        if seen.insert(ev.id.clone()) {
989            out.push(ev);
990        }
991    }
992    Ok(out)
993}
994
995/// v0.7.0 WT-1-E — shared row→envelope decoder. Replicates the
996/// inline closure in [`fetch_signed_events_for`] so the WT-1-E
997/// fetcher does not duplicate the column-index pattern (and so a
998/// future column-set extension only needs to be applied in one
999/// place).
1000fn row_to_signed_event_envelope(r: &rusqlite::Row<'_>) -> rusqlite::Result<SignedEventEnvelope> {
1001    Ok(SignedEventEnvelope {
1002        id: r.get::<_, String>(0)?,
1003        agent_id: r.get::<_, String>(1)?,
1004        event_type: r.get::<_, String>(2)?,
1005        payload_hash_hex: bytes_to_hex(&r.get::<_, Vec<u8>>(3)?),
1006        signature_hex: r.get::<_, Option<Vec<u8>>>(4)?.map(|b| bytes_to_hex(&b)),
1007        attest_level: r.get::<_, String>(5)?,
1008        timestamp: r.get::<_, String>(6)?,
1009    })
1010}
1011
1012/// Fetch every `signed_events` row whose `agent_id` matches a memory
1013/// id in `chain_ids` (the H5 convention puts the actor's agent_id in
1014/// the `agent_id` column; the memory_id is embedded in the payload —
1015/// the LIKE is intentional best-effort: signed_events join to the
1016/// chain via the agent identity of whichever caller minted the link).
1017fn fetch_signed_events_for(
1018    conn: &Connection,
1019    chain_ids: &[String],
1020) -> Result<Vec<SignedEventEnvelope>> {
1021    if chain_ids.is_empty() {
1022        return Ok(Vec::new());
1023    }
1024    let placeholders: String = chain_ids
1025        .iter()
1026        .enumerate()
1027        .map(|(i, _)| format!("?{}", i + 1))
1028        .collect::<Vec<_>>()
1029        .join(", ");
1030    let sql = format!(
1031        "SELECT id, agent_id, event_type, payload_hash, signature, \
1032                attest_level, timestamp \
1033         FROM signed_events \
1034         WHERE agent_id IN ({placeholders}) \
1035         ORDER BY timestamp ASC, id ASC"
1036    );
1037    let mut stmt = conn.prepare(&sql)?;
1038    let param_refs: Vec<&dyn rusqlite::ToSql> = chain_ids
1039        .iter()
1040        .map(|s| s as &dyn rusqlite::ToSql)
1041        .collect();
1042    let rows = stmt.query_map(param_refs.as_slice(), |r| {
1043        Ok(SignedEventEnvelope {
1044            id: r.get::<_, String>(0)?,
1045            agent_id: r.get::<_, String>(1)?,
1046            event_type: r.get::<_, String>(2)?,
1047            payload_hash_hex: bytes_to_hex(&r.get::<_, Vec<u8>>(3)?),
1048            signature_hex: r.get::<_, Option<Vec<u8>>>(4)?.map(|b| bytes_to_hex(&b)),
1049            attest_level: r.get::<_, String>(5)?,
1050            timestamp: r.get::<_, String>(6)?,
1051        })
1052    })?;
1053    let mut out = Vec::new();
1054    for r in rows {
1055        out.push(r?);
1056    }
1057    Ok(out)
1058}
1059
1060// ─────────────────────────────────────────────────────────────────────
1061// Verification
1062// ─────────────────────────────────────────────────────────────────────
1063
/// Result of [`verify`]: one list per class of discrepancy plus a
/// roll-up `ok` flag. Serialised as JSON by the CLI, so the field
/// order below is the order of the keys in that output.
#[derive(Debug, Clone, Serialize)]
pub struct VerificationReport {
    /// `true` when `tampered_files`, `missing_files` and
    /// `chain_edges_failed` are all empty and `signature_status` is
    /// not `Failed`. `extra_files` and the `Absent` / `UnknownSigner`
    /// signature states do not affect this flag.
    pub ok: bool,
    /// Display form of the path that was passed to [`verify`].
    pub bundle_path: String,
    /// Always `true` in a returned report: a bundle without a
    /// manifest makes [`verify`] return an error instead.
    pub manifest_present: bool,
    /// Copied from the bundle's manifest.
    pub schema_version: u32,
    /// Copied from the bundle's manifest.
    pub memory_id: String,
    /// Copied from the bundle's manifest; `None` for unsigned bundles.
    pub signer_agent_id: Option<String>,
    /// Outcome of the manifest-signature check.
    pub signature_status: SignatureStatus,
    /// Files whose recomputed SHA-256 (or byte length) disagreed
    /// with the manifest.
    pub tampered_files: Vec<String>,
    /// Files present in the manifest but missing from the tarball.
    pub missing_files: Vec<String>,
    /// Files present in the tarball but absent from the manifest.
    pub extra_files: Vec<String>,
    /// `edges/*.json` entries that could not be parsed as an edge
    /// envelope, or whose embedded signature did not re-verify
    /// against the `observed_by` signer's public key (including the
    /// case where that key cannot be looked up).
    /// Auditors typically expect this to be empty.
    pub chain_edges_failed: Vec<String>,
}
1085
/// Manifest-signature outcome. Serialised in `snake_case`
/// (`verified`, `failed`, `absent`, `unknown_signer`).
///
/// Only `Failed` forces the report's `ok` flag to `false`; the other
/// three states leave it to the file and edge checks.
#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum SignatureStatus {
    /// Manifest carried a signature and it verified against the
    /// signer's enrolled public key.
    Verified,
    /// Manifest carried a signature but verification failed.
    Failed,
    /// Manifest carried no signature (unsigned bundle). This is also
    /// the state reported when only one of the signer id / signature
    /// pair is present.
    Absent,
    /// Manifest carried a signature but the signer's public key is
    /// not enrolled locally — we can't decide either way.
    UnknownSigner,
}
1101
1102/// Verify a forensic bundle on disk.
1103///
1104/// Re-reads the tarball, recomputes every file's SHA-256, cross-
1105/// checks the manifest's signature (when present), and re-verifies
1106/// every edge envelope's Ed25519 signature.
1107///
1108/// # Errors
1109///
1110/// Propagates I/O errors reading the tarball or parse errors for the
1111/// embedded manifest. A successful return with `ok = false` means the
1112/// bundle was structurally valid but failed integrity checks; an
1113/// `Err` means we couldn't even unpack the archive.
1114pub fn verify(bundle_path: &Path) -> Result<VerificationReport> {
1115    let bytes = fs::read(bundle_path)
1116        .with_context(|| format!("read bundle from {}", bundle_path.display()))?;
1117    let files = read_ustar(&bytes).context("parse forensic bundle tar")?;
1118
1119    let manifest_bytes = files
1120        .get(MANIFEST_FILE_NAME)
1121        .ok_or_else(|| anyhow!("bundle is missing manifest.json"))?
1122        .clone();
1123    let manifest: Manifest =
1124        serde_json::from_slice(&manifest_bytes).context("parse manifest.json")?;
1125
1126    let mut report = VerificationReport {
1127        ok: true,
1128        bundle_path: bundle_path.display().to_string(),
1129        manifest_present: true,
1130        schema_version: manifest.schema_version,
1131        memory_id: manifest.memory_id.clone(),
1132        signer_agent_id: manifest.signer_agent_id.clone(),
1133        signature_status: SignatureStatus::Absent,
1134        tampered_files: Vec::new(),
1135        missing_files: Vec::new(),
1136        extra_files: Vec::new(),
1137        chain_edges_failed: Vec::new(),
1138    };
1139
1140    // 1) Compare per-file SHA-256s + presence.
1141    let manifest_index: BTreeMap<&str, &ManifestFile> = manifest
1142        .files
1143        .iter()
1144        .map(|m| (m.path.as_str(), m))
1145        .collect();
1146    for (path, body) in &files {
1147        if path == MANIFEST_FILE_NAME {
1148            continue;
1149        }
1150        match manifest_index.get(path.as_str()) {
1151            Some(mf) => {
1152                let actual = hex_sha256(body);
1153                if actual != mf.sha256 || u64::try_from(body.len()).unwrap_or(0) != mf.size {
1154                    report.tampered_files.push(path.clone());
1155                }
1156            }
1157            None => report.extra_files.push(path.clone()),
1158        }
1159    }
1160    for (path, _) in manifest_index.iter() {
1161        if !files.contains_key(*path) {
1162            report.missing_files.push((*path).to_string());
1163        }
1164    }
1165
1166    // 2) Manifest signature.
1167    if let (Some(signer), Some(sig_b64)) = (
1168        manifest.signer_agent_id.as_ref(),
1169        manifest.signature.as_ref(),
1170    ) {
1171        let pubkey_opt = crate::identity::verify::lookup_peer_public_key(signer);
1172        match pubkey_opt {
1173            Some(pubkey) => {
1174                let signed_bytes = canonical_signed_bytes(&Manifest {
1175                    signer_agent_id: None,
1176                    signature: None,
1177                    ..manifest.clone()
1178                });
1179                let sig_bytes = STANDARD_NO_PAD
1180                    .decode(sig_b64)
1181                    .context("decode manifest signature")?;
1182                let sig_arr: [u8; ed25519_dalek::SIGNATURE_LENGTH] = sig_bytes
1183                    .as_slice()
1184                    .try_into()
1185                    .map_err(|_| anyhow!("manifest signature has wrong length"))?;
1186                let sig = ed25519_dalek::Signature::from_bytes(&sig_arr);
1187                report.signature_status = match pubkey.verify_strict(&signed_bytes, &sig) {
1188                    Ok(()) => SignatureStatus::Verified,
1189                    Err(_) => SignatureStatus::Failed,
1190                };
1191            }
1192            None => {
1193                report.signature_status = SignatureStatus::UnknownSigner;
1194            }
1195        }
1196    }
1197
1198    // 3) Re-verify every edge envelope's signature.
1199    for (path, body) in &files {
1200        if !path.starts_with("edges/") || !path.ends_with(".json") {
1201            continue;
1202        }
1203        let edge: EdgeEnvelope = match serde_json::from_slice(body) {
1204            Ok(e) => e,
1205            Err(_) => {
1206                report.chain_edges_failed.push(path.clone());
1207                continue;
1208            }
1209        };
1210        if !verify_edge_envelope(&edge) {
1211            report.chain_edges_failed.push(path.clone());
1212        }
1213    }
1214
1215    // 4) Roll the per-section failures into the top-level ok flag.
1216    report.ok = report.tampered_files.is_empty()
1217        && report.missing_files.is_empty()
1218        && report.chain_edges_failed.is_empty()
1219        && !matches!(report.signature_status, SignatureStatus::Failed);
1220
1221    Ok(report)
1222}
1223
1224/// Re-derive the canonical CBOR bytes from an [`EdgeEnvelope`] and
1225/// verify the embedded Ed25519 signature. Returns `true` for unsigned
1226/// edges (no signature to falsify) and signed edges that verify
1227/// cleanly. Returns `false` only when a present signature fails to
1228/// verify against the bundled `observed_by` public key.
1229fn verify_edge_envelope(edge: &EdgeEnvelope) -> bool {
1230    let Some(sig_hex) = edge.signature_hex.as_ref() else {
1231        return true; // unsigned — nothing to verify
1232    };
1233    let Some(observed_by) = edge.observed_by.as_ref() else {
1234        return false; // signed but no agent_id — broken envelope
1235    };
1236    let Some(pubkey) = crate::identity::verify::lookup_peer_public_key(observed_by) else {
1237        // Signer key not enrolled locally — auditor can't decide, but
1238        // we treat this as a verification failure on the conservative
1239        // side (the auditor running `verify-forensic-bundle` is
1240        // expected to have the chain's signers in their key dir).
1241        return false;
1242    };
1243    let Ok(sig_bytes) = hex_to_bytes(sig_hex) else {
1244        return false;
1245    };
1246    let link = SignableLink {
1247        src_id: &edge.source_id,
1248        dst_id: &edge.target_id,
1249        relation: &edge.relation,
1250        observed_by: Some(observed_by),
1251        valid_from: edge.valid_from.as_deref(),
1252        valid_until: edge.valid_until.as_deref(),
1253    };
1254    crate::identity::verify::verify(&pubkey, &link, &sig_bytes).is_ok()
1255}
1256
1257// ─────────────────────────────────────────────────────────────────────
1258// CLI entry points (called by daemon_runtime dispatch)
1259// ─────────────────────────────────────────────────────────────────────
1260
1261/// Run `ai-memory export-forensic-bundle`.
1262///
1263/// # Errors
1264///
1265/// Propagates DB / I/O / signing errors.
1266pub fn run_export(
1267    db_path: &Path,
1268    args: &ExportForensicBundleArgs,
1269    out: &mut CliOutput<'_>,
1270) -> Result<i32> {
1271    let conn = crate::db::open(db_path).context("open db")?;
1272    let output = match args.output.as_ref() {
1273        Some(p) => p.clone(),
1274        None => {
1275            let short = args.memory_id.chars().take(8).collect::<String>();
1276            let ts = chrono::Utc::now().format("%Y%m%dT%H%M%SZ");
1277            PathBuf::from(format!("forensic-bundle-{short}-{ts}.tar"))
1278        }
1279    };
1280    build(&conn, args, &output, None)?;
1281    writeln!(out.stdout, "forensic bundle written: {}", output.display())?;
1282    Ok(0)
1283}
1284
1285/// Run `ai-memory verify-forensic-bundle`.
1286///
1287/// # Errors
1288///
1289/// Propagates I/O / parse errors. Verification *failure* (the bundle
1290/// was parseable but didn't pass integrity checks) returns
1291/// `Ok(non-zero exit code)` rather than an error.
1292///
1293/// v0.7.0 G-PHASE-E-4 (#709) — raised the failure exit code from `1`
1294/// to `2`. `1` was indistinguishable from CLI argument errors / unwrap
1295/// panics under shell error trapping; `2` is the conventional
1296/// "verification failed" code (matches the new convention on
1297/// `verify-reflection-chain`).
1298pub fn run_verify(args: &VerifyForensicBundleArgs, out: &mut CliOutput<'_>) -> Result<i32> {
1299    let report = verify(&args.bundle_path)?;
1300    let payload = serde_json::to_string_pretty(&report).context("serialise VerificationReport")?;
1301    writeln!(out.stdout, "{payload}")?;
1302    if report.ok {
1303        writeln!(out.stdout, "verification OK")?;
1304        Ok(0)
1305    } else {
1306        writeln!(out.stdout, "verification FAILED")?;
1307        Ok(2)
1308    }
1309}
1310
1311// ─────────────────────────────────────────────────────────────────────
1312// Hex helpers
1313// ─────────────────────────────────────────────────────────────────────
1314
/// Render `b` as lowercase hexadecimal, two digits per byte, with no
/// prefix or separators. An empty slice yields an empty string.
fn bytes_to_hex(b: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut hex = String::with_capacity(b.len() * 2);
    for &byte in b {
        // High nibble first, then low nibble.
        hex.push(char::from(DIGITS[usize::from(byte >> 4)]));
        hex.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    hex
}
1318
1319fn hex_to_bytes(s: &str) -> Result<Vec<u8>> {
1320    if s.len() % 2 != 0 {
1321        bail!("hex string has odd length");
1322    }
1323    let mut out = Vec::with_capacity(s.len() / 2);
1324    for i in (0..s.len()).step_by(2) {
1325        let pair = &s[i..i + 2];
1326        let byte =
1327            u8::from_str_radix(pair, 16).with_context(|| format!("invalid hex pair '{pair}'"))?;
1328        out.push(byte);
1329    }
1330    Ok(out)
1331}
1332
1333fn hex_sha256(bytes: &[u8]) -> String {
1334    let mut hasher = Sha256::new();
1335    hasher.update(bytes);
1336    bytes_to_hex(&hasher.finalize())
1337}
1338
1339// ─────────────────────────────────────────────────────────────────────
1340// Minimal deterministic POSIX ustar writer + reader
1341// ─────────────────────────────────────────────────────────────────────
1342//
1343// We could pull in the `tar` crate, but that adds a transitive dep (it
1344// is not currently in the lockfile). The bundle format we need is a
1345// tiny subset of ustar — every file is a regular file, every name
// fits in 100 bytes, no symlinks, no hardlinks, no PAX extensions. An
// 80-line writer + reader keeps the dep surface flat per repo
// convention and makes the format trivially auditable.
1349//
1350// All header fields are pinned to constants so two builds over the
1351// same `BundleFiles` produce byte-identical archives:
1352//
1353//   - uid / gid: 0
1354//   - mode: 0o644
1355//   - mtime: 0 (Unix epoch)
1356//   - uname / gname: empty
1357//
1358// The only operator-visible field is the file name (BTreeMap key);
1359// the rest of the header derives from the file body.
1360
/// Size in bytes of one ustar block. Every entry header is exactly
/// one block, every body is zero-padded up to a multiple of it, and
/// the archive ends with two all-zero blocks.
const USTAR_BLOCK_SIZE: usize = 512;
1362
1363/// Serialise `files` as a deterministic POSIX ustar archive, writing
1364/// to `path`. The on-wire bytes are identical for identical inputs
1365/// regardless of the host's filesystem, locale, or clock.
1366fn write_ustar(path: &Path, files: &BundleFiles) -> Result<()> {
1367    let mut out: Vec<u8> = Vec::new();
1368    for (name, body) in files {
1369        write_ustar_entry(&mut out, name, body)?;
1370    }
1371    // Two zero blocks = end-of-archive marker (POSIX requirement).
1372    out.extend(std::iter::repeat(0u8).take(USTAR_BLOCK_SIZE * 2));
1373    fs::write(path, &out).with_context(|| format!("write tarball to {}", path.display()))?;
1374    Ok(())
1375}
1376
1377/// Serialise `files` to an in-memory `Vec<u8>` — used by the
1378/// reproducibility tests so two builds can be byte-diffed without
1379/// hitting the disk.
1380pub fn pack_to_vec(files: &BundleFiles) -> Result<Vec<u8>> {
1381    let mut out: Vec<u8> = Vec::new();
1382    for (name, body) in files {
1383        write_ustar_entry(&mut out, name, body)?;
1384    }
1385    out.extend(std::iter::repeat(0u8).take(USTAR_BLOCK_SIZE * 2));
1386    Ok(out)
1387}
1388
1389fn write_ustar_entry(out: &mut Vec<u8>, name: &str, body: &[u8]) -> Result<()> {
1390    if name.len() > 100 {
1391        bail!(
1392            "bundle path '{name}' exceeds 100-byte ustar name limit; the bundle layout is \
1393             documented to keep every path under 100 bytes"
1394        );
1395    }
1396    let mut header = [0u8; USTAR_BLOCK_SIZE];
1397
1398    // name: bytes 0..100
1399    header[..name.len()].copy_from_slice(name.as_bytes());
1400    // mode: bytes 100..108 — 7-byte octal + NUL. "0000644"
1401    write_octal(&mut header[100..108], 0o644, 7);
1402    // uid: bytes 108..116 — "0000000"
1403    write_octal(&mut header[108..116], 0, 7);
1404    // gid: bytes 116..124 — "0000000"
1405    write_octal(&mut header[116..124], 0, 7);
1406    // size: bytes 124..136 — 11-byte octal + NUL
1407    write_octal(&mut header[124..136], body.len() as u64, 11);
1408    // mtime: bytes 136..148 — pinned to 0 for determinism
1409    write_octal(&mut header[136..148], 0, 11);
1410    // checksum: bytes 148..156 — filled with spaces first, then recomputed
1411    for b in &mut header[148..156] {
1412        *b = b' ';
1413    }
1414    // typeflag: bytes 156..157 — '0' = regular file
1415    header[156] = b'0';
1416    // linkname: bytes 157..257 — empty
1417    // magic: bytes 257..263 — "ustar\0"
1418    header[257..263].copy_from_slice(b"ustar\0");
1419    // version: bytes 263..265 — "00"
1420    header[263..265].copy_from_slice(b"00");
1421    // uname / gname: 265..297 + 297..329 — empty
1422    // devmajor / devminor: 329..337 + 337..345 — "0000000\0" each
1423    write_octal(&mut header[329..337], 0, 7);
1424    write_octal(&mut header[337..345], 0, 7);
1425    // prefix: 345..500 — empty (we require name <= 100)
1426
1427    // Compute the unsigned checksum over the entire header with the
1428    // checksum field treated as 8 spaces (already done above), then
1429    // write it back as 6-octal-digit + NUL + space (POSIX-mandated
1430    // termination).
1431    let chksum: u32 = header.iter().map(|b| u32::from(*b)).sum();
1432    let s = format!("{chksum:06o}\0 ");
1433    header[148..156].copy_from_slice(s.as_bytes());
1434
1435    out.extend_from_slice(&header);
1436    out.extend_from_slice(body);
1437    let pad = (USTAR_BLOCK_SIZE - (body.len() % USTAR_BLOCK_SIZE)) % USTAR_BLOCK_SIZE;
1438    out.extend(std::iter::repeat(0u8).take(pad));
1439    Ok(())
1440}
1441
/// Write `value` into `field` as `width` zero-padded octal digits
/// followed by a NUL byte at index `width`.
///
/// `field` must be at least `width + 1` bytes long and `value` must
/// fit in `width` octal digits; callers in this file only pass
/// header fields sized for their values.
fn write_octal(field: &mut [u8], value: u64, width: usize) {
    let digits = format!("{:0w$o}", value, w = width);
    field[..digits.len()].copy_from_slice(digits.as_bytes());
    field[width] = 0;
}
1450
1451/// Parse a POSIX ustar archive emitted by [`write_ustar`] back into a
1452/// path-keyed `BundleFiles` map. We deliberately keep the parser
1453/// strict — only the field set we ourselves emit is accepted, so a
1454/// downstream auditor running this code path is auditing the same
1455/// minimal grammar the build path emits.
1456pub fn read_ustar(bytes: &[u8]) -> Result<BundleFiles> {
1457    let mut files: BundleFiles = BTreeMap::new();
1458    let mut pos = 0;
1459    while pos + USTAR_BLOCK_SIZE <= bytes.len() {
1460        let header = &bytes[pos..pos + USTAR_BLOCK_SIZE];
1461        // End-of-archive: first byte zero (per POSIX, two zero blocks
1462        // terminate; we accept one and bail).
1463        if header[0] == 0 {
1464            break;
1465        }
1466        let name = read_cstr(&header[..100]);
1467        let size = read_octal_size(&header[124..136])?;
1468        // #1250 — refuse implausibly large entry sizes BEFORE the
1469        // arithmetic that could otherwise wrap `usize`. The pre-#1250
1470        // code did `pos + size > bytes.len()`; with a crafted 12-digit
1471        // octal size near `usize::MAX` the addition wrapped to a small
1472        // value, the check passed, and the slice `bytes[pos..pos+size]`
1473        // panicked out of bounds (or read past the buffer on 32-bit
1474        // targets before bounds detection). We cap at
1475        // [`MAX_TAR_ENTRY_BYTES`] (1 GiB) which is two orders of
1476        // magnitude above any realistic forensic-bundle file and below
1477        // any value that could overflow `pos.checked_add(size)`.
1478        if size > MAX_TAR_ENTRY_BYTES {
1479            bail!(
1480                "tar entry '{name}' size {size} exceeds the {MAX_TAR_ENTRY_BYTES}-byte \
1481                 hard cap (likely a malformed or crafted bundle)"
1482            );
1483        }
1484        pos = pos
1485            .checked_add(USTAR_BLOCK_SIZE)
1486            .ok_or_else(|| anyhow!("tar parser: pos overflow advancing past header"))?;
1487        let body_end = pos
1488            .checked_add(size)
1489            .ok_or_else(|| anyhow!("tar entry '{name}' size {size} overflows usize"))?;
1490        if body_end > bytes.len() {
1491            bail!("tar entry '{name}' size {size} extends beyond archive bytes");
1492        }
1493        let body = bytes[pos..body_end].to_vec();
1494        files.insert(name, body);
1495        let pad = (USTAR_BLOCK_SIZE - (size % USTAR_BLOCK_SIZE)) % USTAR_BLOCK_SIZE;
1496        pos = body_end
1497            .checked_add(pad)
1498            .ok_or_else(|| anyhow!("tar parser: pos overflow advancing past padding"))?;
1499    }
1500    Ok(files)
1501}
1502
/// #1250 — upper bound on a single entry body that [`read_ustar`] will
/// accept: 1 GiB (2^30 bytes). That is roughly a hundred times the
/// ~10 MB of a fully-attested signed chain for a 7-day mid-tier
/// namespace, so legitimate bundles never approach it, while a crafted
/// header declaring more is refused at parse time with
/// `tar entry … exceeds the … hard cap`.
pub const MAX_TAR_ENTRY_BYTES: usize = 1 << 30;
1511
/// Decode the bytes that precede the first NUL in `bytes` (or all of
/// `bytes` when there is no NUL) as UTF-8, substituting U+FFFD for any
/// invalid sequences.
fn read_cstr(bytes: &[u8]) -> String {
    // `split` always yields at least one (possibly empty) piece, so the
    // fallback is never taken; it only spares us an `unwrap`.
    let text = bytes.split(|&b| b == 0).next().unwrap_or(bytes);
    String::from_utf8_lossy(text).into_owned()
}
1516
1517fn read_octal_size(bytes: &[u8]) -> Result<usize> {
1518    let s = read_cstr(bytes);
1519    let trimmed = s.trim().trim_matches(|c: char| !c.is_ascii_digit());
1520    if trimmed.is_empty() {
1521        return Ok(0);
1522    }
1523    usize::from_str_radix(trimmed, 8).with_context(|| format!("invalid octal size field '{s}'"))
1524}
1525
1526// ─────────────────────────────────────────────────────────────────────
1527// Unit tests
1528// ─────────────────────────────────────────────────────────────────────
1529
1530#[cfg(test)]
1531mod tests {
1532    use super::*;
1533    use crate::db;
1534    use crate::models::{Memory, MemoryKind, Tier};
1535    use chrono::Utc;
1536    use rusqlite::params;
1537    use tempfile::TempDir;
1538
1539    fn open_tmp_db(tmp: &TempDir) -> (rusqlite::Connection, PathBuf) {
1540        let p = tmp.path().join("ai-memory.db");
1541        let conn = db::open(&p).expect("db::open");
1542        (conn, p)
1543    }
1544
1545    fn insert_mem(conn: &rusqlite::Connection, ns: &str, depth: i32, kind: MemoryKind) -> String {
1546        let id = uuid::Uuid::new_v4().to_string();
1547        let now = Utc::now().to_rfc3339();
1548        let mem = Memory {
1549            id: id.clone(),
1550            tier: Tier::Mid,
1551            namespace: ns.to_string(),
1552            title: format!("t-{depth}"),
1553            content: format!("c-{depth}"),
1554            reflection_depth: depth,
1555            created_at: now.clone(),
1556            updated_at: now,
1557            memory_kind: kind,
1558            entity_id: None,
1559            persona_version: None,
1560            citations: Vec::new(),
1561            source_uri: None,
1562            source_span: None,
1563            ..Default::default()
1564        };
1565        db::insert(conn, &mem).expect("insert");
1566        id
1567    }
1568
1569    fn link_unsigned(conn: &rusqlite::Connection, src: &str, tgt: &str) {
1570        conn.execute(
1571            "INSERT OR IGNORE INTO memory_links \
1572             (source_id, target_id, relation, created_at, attest_level) \
1573             VALUES (?1, ?2, 'reflects_on', ?3, 'unsigned')",
1574            params![src, tgt, Utc::now().to_rfc3339()],
1575        )
1576        .expect("link_unsigned");
1577    }
1578
1579    #[test]
1580    fn write_and_read_ustar_round_trips() {
1581        let mut files = BTreeMap::new();
1582        files.insert("a.json".to_string(), b"{\"a\":1}".to_vec());
1583        files.insert("nested/b.txt".to_string(), b"hello world".to_vec());
1584        let bytes = pack_to_vec(&files).expect("pack");
1585        let parsed = read_ustar(&bytes).expect("parse");
1586        assert_eq!(parsed, files);
1587    }
1588
1589    #[test]
1590    fn ustar_is_byte_deterministic() {
1591        let mut files = BTreeMap::new();
1592        files.insert("z.txt".to_string(), b"last".to_vec());
1593        files.insert("a.txt".to_string(), b"first".to_vec());
1594        let a = pack_to_vec(&files).expect("pack a");
1595        let b = pack_to_vec(&files).expect("pack b");
1596        assert_eq!(a, b, "same input must produce byte-identical output");
1597    }
1598
1599    #[test]
1600    fn build_files_emits_manifest_with_pinned_schema_version() {
1601        let tmp = TempDir::new().unwrap();
1602        let (conn, _) = open_tmp_db(&tmp);
1603        let id = insert_mem(&conn, "fb-ns", 0, MemoryKind::Observation);
1604        let args = ExportForensicBundleArgs {
1605            memory_id: id.clone(),
1606            include_reflections: true,
1607            include_transcripts: false,
1608            include_atomisation_chain: true,
1609            output: None,
1610        };
1611        let files = build_files(&conn, &args, Some("2026-01-01T00:00:00Z")).expect("build");
1612        let manifest_bytes = files.get("manifest.json").expect("manifest present");
1613        let manifest: Manifest = serde_json::from_slice(manifest_bytes).expect("parse manifest");
1614        assert_eq!(manifest.schema_version, BUNDLE_SCHEMA_VERSION);
1615        assert_eq!(manifest.memory_id, id);
1616        assert_eq!(manifest.generated_at, "2026-01-01T00:00:00Z");
1617    }
1618
1619    #[test]
1620    fn build_files_reproducible_modulo_timestamp() {
1621        let tmp = TempDir::new().unwrap();
1622        let (conn, _) = open_tmp_db(&tmp);
1623        let d0 = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
1624        let d1 = insert_mem(&conn, "ns", 1, MemoryKind::Reflection);
1625        link_unsigned(&conn, &d1, &d0);
1626        let args = ExportForensicBundleArgs {
1627            memory_id: d1.clone(),
1628            include_reflections: true,
1629            include_transcripts: false,
1630            include_atomisation_chain: true,
1631            output: None,
1632        };
1633        let files_a = build_files(&conn, &args, Some("2026-01-01T00:00:00Z")).expect("build a");
1634        let files_b = build_files(&conn, &args, Some("2026-01-01T00:00:00Z")).expect("build b");
1635        let bytes_a = pack_to_vec(&files_a).expect("pack a");
1636        let bytes_b = pack_to_vec(&files_b).expect("pack b");
1637        assert_eq!(
1638            bytes_a, bytes_b,
1639            "byte-identical mod timestamp is the L2-5 acceptance criterion"
1640        );
1641    }
1642
1643    #[test]
1644    fn verify_clean_bundle_reports_ok() {
1645        let tmp = TempDir::new().unwrap();
1646        let (conn, _) = open_tmp_db(&tmp);
1647        let d0 = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
1648        let d1 = insert_mem(&conn, "ns", 1, MemoryKind::Reflection);
1649        link_unsigned(&conn, &d1, &d0);
1650        let args = ExportForensicBundleArgs {
1651            memory_id: d1.clone(),
1652            include_reflections: true,
1653            include_transcripts: false,
1654            include_atomisation_chain: true,
1655            output: None,
1656        };
1657        let bundle_path = tmp.path().join("bundle.tar");
1658        build(&conn, &args, &bundle_path, Some("2026-01-01T00:00:00Z")).expect("build");
1659        let report = verify(&bundle_path).expect("verify");
1660        assert!(report.ok, "clean bundle must verify: {report:#?}");
1661        assert!(report.tampered_files.is_empty());
1662        assert!(report.missing_files.is_empty());
1663    }
1664
1665    #[test]
1666    fn verify_detects_tampered_file_in_bundle() {
1667        let tmp = TempDir::new().unwrap();
1668        let (conn, _) = open_tmp_db(&tmp);
1669        let d0 = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
1670        let d1 = insert_mem(&conn, "ns", 1, MemoryKind::Reflection);
1671        link_unsigned(&conn, &d1, &d0);
1672        let args = ExportForensicBundleArgs {
1673            memory_id: d1.clone(),
1674            include_reflections: true,
1675            include_transcripts: false,
1676            include_atomisation_chain: true,
1677            output: None,
1678        };
1679        let bundle_path = tmp.path().join("bundle.tar");
1680        build(&conn, &args, &bundle_path, Some("2026-01-01T00:00:00Z")).expect("build");
1681
1682        // Tamper: rewrite the file body without updating the manifest.
1683        // The verifier should flag the affected entry.
1684        let bytes = fs::read(&bundle_path).expect("read");
1685        let mut files = read_ustar(&bytes).expect("parse");
1686        let target_key = files
1687            .keys()
1688            .find(|k| k.starts_with("memories/"))
1689            .expect("at least one memory entry")
1690            .clone();
1691        files.insert(target_key.clone(), b"tampered".to_vec());
1692        let new_bytes = pack_to_vec(&files).expect("repack");
1693        fs::write(&bundle_path, &new_bytes).expect("write");
1694
1695        let report = verify(&bundle_path).expect("verify");
1696        assert!(!report.ok, "tampered bundle must fail verification");
1697        assert!(
1698            report.tampered_files.contains(&target_key),
1699            "verifier must name the tampered file; got {:?}",
1700            report.tampered_files
1701        );
1702    }
1703
1704    #[test]
1705    fn canonical_signed_bytes_is_stable() {
1706        let m = Manifest {
1707            schema_version: 1,
1708            memory_id: "abc".into(),
1709            generated_at: "2026-01-01T00:00:00Z".into(),
1710            include_reflections: true,
1711            include_transcripts: false,
1712            files: vec![
1713                ManifestFile {
1714                    path: "a.json".into(),
1715                    size: 5,
1716                    sha256: "ff".into(),
1717                },
1718                ManifestFile {
1719                    path: "b.json".into(),
1720                    size: 10,
1721                    sha256: "ee".into(),
1722                },
1723            ],
1724            signer_agent_id: None,
1725            signature: None,
1726        };
1727        let a = canonical_signed_bytes(&m);
1728        let b = canonical_signed_bytes(&m);
1729        assert_eq!(a, b);
1730        let s = String::from_utf8(a).unwrap();
1731        assert!(s.contains("a.json:5:ff"));
1732        assert!(s.contains("b.json:10:ee"));
1733        assert!(s.contains("memory_id:abc"));
1734    }
1735
1736    #[test]
1737    fn build_chain_includes_ancestors_when_reflections_requested() {
1738        let tmp = TempDir::new().unwrap();
1739        let (conn, _) = open_tmp_db(&tmp);
1740        let d0 = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
1741        let d1 = insert_mem(&conn, "ns", 1, MemoryKind::Reflection);
1742        let d2 = insert_mem(&conn, "ns", 2, MemoryKind::Reflection);
1743        link_unsigned(&conn, &d2, &d1);
1744        link_unsigned(&conn, &d1, &d0);
1745        let args = ExportForensicBundleArgs {
1746            memory_id: d2.clone(),
1747            include_reflections: true,
1748            include_transcripts: false,
1749            include_atomisation_chain: true,
1750            output: None,
1751        };
1752        let files = build_files(&conn, &args, Some("2026-01-01T00:00:00Z")).expect("build");
1753        for id in [&d0, &d1, &d2] {
1754            let key = format!("memories/{id}.json");
1755            assert!(
1756                files.contains_key(&key),
1757                "depth-2 chain must include all ancestors; missing {key}"
1758            );
1759        }
1760        // Two reflects_on edges in the chain → two edge files.
1761        let edge_count = files.keys().filter(|k| k.starts_with("edges/")).count();
1762        assert_eq!(edge_count, 2, "expected 2 reflects_on edges");
1763    }
1764
1765    #[test]
1766    fn build_chain_excludes_ancestors_without_reflections_flag() {
1767        let tmp = TempDir::new().unwrap();
1768        let (conn, _) = open_tmp_db(&tmp);
1769        let d0 = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
1770        let d1 = insert_mem(&conn, "ns", 1, MemoryKind::Reflection);
1771        link_unsigned(&conn, &d1, &d0);
1772        let args = ExportForensicBundleArgs {
1773            memory_id: d1.clone(),
1774            include_reflections: false,
1775            include_transcripts: false,
1776            include_atomisation_chain: true,
1777            output: None,
1778        };
1779        let files = build_files(&conn, &args, Some("2026-01-01T00:00:00Z")).expect("build");
1780        assert!(files.contains_key(&format!("memories/{d1}.json")));
1781        assert!(
1782            !files.contains_key(&format!("memories/{d0}.json")),
1783            "ancestor must be excluded when --include-reflections is unset"
1784        );
1785    }
1786
1787    #[test]
1788    fn verify_detects_missing_file_from_bundle() {
1789        let tmp = TempDir::new().unwrap();
1790        let (conn, _) = open_tmp_db(&tmp);
1791        let d0 = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
1792        let d1 = insert_mem(&conn, "ns", 1, MemoryKind::Reflection);
1793        link_unsigned(&conn, &d1, &d0);
1794        let args = ExportForensicBundleArgs {
1795            memory_id: d1.clone(),
1796            include_reflections: true,
1797            include_transcripts: false,
1798            include_atomisation_chain: true,
1799            output: None,
1800        };
1801        let bundle_path = tmp.path().join("bundle.tar");
1802        build(&conn, &args, &bundle_path, Some("2026-01-01T00:00:00Z")).expect("build");
1803
1804        let bytes = fs::read(&bundle_path).expect("read");
1805        let mut files = read_ustar(&bytes).expect("parse");
1806        let memory_key = files
1807            .keys()
1808            .find(|k| k.starts_with("memories/") && k.contains(&d0))
1809            .expect("ancestor entry present")
1810            .clone();
1811        files.remove(&memory_key);
1812        let new_bytes = pack_to_vec(&files).expect("repack");
1813        fs::write(&bundle_path, &new_bytes).expect("write");
1814
1815        let report = verify(&bundle_path).expect("verify");
1816        assert!(!report.ok, "missing file must fail verification");
1817        assert!(report.missing_files.contains(&memory_key));
1818    }
1819
1820    #[test]
1821    fn hex_round_trip() {
1822        let bytes = vec![0u8, 0x0f, 0xa1, 0xff];
1823        let hex = bytes_to_hex(&bytes);
1824        assert_eq!(hex, "000fa1ff");
1825        assert_eq!(hex_to_bytes(&hex).unwrap(), bytes);
1826    }
1827
1828    #[test]
1829    fn hex_to_bytes_rejects_odd_length() {
1830        assert!(hex_to_bytes("abc").is_err());
1831    }
1832
1833    #[test]
1834    fn ustar_rejects_long_paths() {
1835        let mut files = BTreeMap::new();
1836        // 101-char name — must error.
1837        files.insert("a".repeat(101), b"x".to_vec());
1838        assert!(pack_to_vec(&files).is_err());
1839    }
1840
1841    #[test]
1842    fn hex_to_bytes_rejects_invalid_pair() {
1843        let err = hex_to_bytes("zz").unwrap_err();
1844        assert!(format!("{err:#}").contains("invalid hex pair"));
1845    }
1846
1847    #[test]
1848    fn hex_sha256_stable_for_same_input() {
1849        let a = hex_sha256(b"hello world");
1850        let b = hex_sha256(b"hello world");
1851        assert_eq!(a, b);
1852        // Known fixed property: 64 hex chars
1853        assert_eq!(a.len(), 64);
1854        assert!(a.chars().all(|c| c.is_ascii_hexdigit()));
1855    }
1856
1857    #[test]
1858    fn read_octal_size_parses_padded_field() {
1859        let mut field = [0u8; 12];
1860        write_octal(&mut field, 256, 11);
1861        let parsed = read_octal_size(&field).unwrap();
1862        assert_eq!(parsed, 256);
1863    }
1864
1865    #[test]
1866    fn read_octal_size_empty_returns_zero() {
1867        let field = [0u8; 12];
1868        // All zeros after octal write of 0 produces "00000000000\0".
1869        let parsed = read_octal_size(&field).unwrap();
1870        assert_eq!(parsed, 0);
1871    }
1872
1873    #[test]
1874    fn read_octal_size_garbage_returns_error_or_zero() {
1875        // Field starts with non-digit garbage — trim removes it, empty -> 0.
1876        let field = b"  \0\0\0\0\0\0\0\0\0\0";
1877        let parsed = read_octal_size(field).unwrap();
1878        assert_eq!(parsed, 0);
1879    }
1880
1881    #[test]
1882    fn ustar_pack_unpack_empty_files_map() {
1883        let files: BundleFiles = BTreeMap::new();
1884        let bytes = pack_to_vec(&files).unwrap();
1885        let parsed = read_ustar(&bytes).unwrap();
1886        assert!(parsed.is_empty());
1887    }
1888
1889    #[test]
1890    fn ustar_pack_unpack_handles_block_aligned_body() {
1891        let mut files = BundleFiles::new();
1892        // Exactly 512 bytes — no padding inside record.
1893        files.insert("aligned.bin".to_string(), vec![b'A'; 512]);
1894        let bytes = pack_to_vec(&files).unwrap();
1895        let parsed = read_ustar(&bytes).unwrap();
1896        assert_eq!(parsed.get("aligned.bin").unwrap().len(), 512);
1897    }
1898
1899    #[test]
1900    fn read_ustar_stops_on_zero_block() {
1901        // Empty zero block at the start -> empty map.
1902        let bytes = vec![0u8; 1024];
1903        let parsed = read_ustar(&bytes).unwrap();
1904        assert!(parsed.is_empty());
1905    }
1906
1907    #[test]
1908    fn canonical_signed_bytes_excludes_signature_fields() {
1909        // canonical_signed_bytes must not include `signer_agent_id` or
1910        // `signature` so re-signing produces the same canonical input.
1911        let mut m1 = Manifest {
1912            schema_version: 1,
1913            memory_id: "abc".into(),
1914            generated_at: "2026-01-01T00:00:00Z".into(),
1915            include_reflections: true,
1916            include_transcripts: false,
1917            files: vec![ManifestFile {
1918                path: "a.json".into(),
1919                size: 5,
1920                sha256: "ff".into(),
1921            }],
1922            signer_agent_id: None,
1923            signature: None,
1924        };
1925        let bytes_unsigned = canonical_signed_bytes(&m1);
1926        m1.signer_agent_id = Some("alice".into());
1927        m1.signature = Some("0xdead".into());
1928        let bytes_signed = canonical_signed_bytes(&m1);
1929        assert_eq!(
1930            bytes_unsigned, bytes_signed,
1931            "signer fields must not affect canonical signed bytes"
1932        );
1933    }
1934
1935    #[test]
1936    fn bytes_to_hex_empty_returns_empty_string() {
1937        assert_eq!(bytes_to_hex(&[]), "");
1938    }
1939
1940    #[test]
1941    fn hex_to_bytes_empty_returns_empty_vec() {
1942        let v = hex_to_bytes("").unwrap();
1943        assert!(v.is_empty());
1944    }
1945
1946    #[test]
1947    fn write_octal_zero_value_is_padded() {
1948        let mut field = [0u8; 8];
1949        write_octal(&mut field, 0, 7);
1950        assert_eq!(&field[..7], b"0000000");
1951        assert_eq!(field[7], 0);
1952    }
1953
1954    #[test]
1955    fn read_ustar_truncated_body_rejected() {
1956        // Build a single-file archive and truncate it mid-body.
1957        let mut files = BundleFiles::new();
1958        files.insert("x.txt".to_string(), b"hello".to_vec());
1959        let bytes = pack_to_vec(&files).unwrap();
1960        // Truncate at 520 bytes (just past header, body is incomplete).
1961        let truncated = &bytes[..516];
1962        let err = read_ustar(truncated).unwrap_err();
1963        let s = format!("{err}");
1964        assert!(s.contains("extends beyond"));
1965    }
1966
1967    #[test]
1968    fn verify_returns_error_for_missing_bundle_path() {
1969        let p = std::path::Path::new("/this/does/not/exist/bundle.tar");
1970        assert!(verify(p).is_err());
1971    }
1972
1973    /// #1250 — regression: a tar header that declares a body size
1974    /// above the hard cap MUST be refused with a typed error rather
1975    /// than wrapping `pos + size` and panicking on the subsequent
1976    /// slice. The pre-#1250 implementation panicked with `index out
1977    /// of bounds` on a crafted oversize entry; the new path rejects
1978    /// via the `MAX_TAR_ENTRY_BYTES` ceiling.
1979    #[test]
1980    fn read_ustar_rejects_oversize_entry_1250() {
1981        // Build a header by hand. The body size lives at offset
1982        // 124..136. Fill it with the largest legal 12-byte octal value
1983        // — 11 octal digits + a terminating space (ustar convention).
1984        // That decodes to (8^11 - 1) ≈ 8.6 GB, well over
1985        // `MAX_TAR_ENTRY_BYTES` and, critically, large enough that
1986        // pre-fix `pos + size` arithmetic would have overflowed on
1987        // 32-bit and saturated near `usize::MAX` on 64-bit, defeating
1988        // the `> bytes.len()` guard.
1989        let mut header = [0u8; USTAR_BLOCK_SIZE];
1990        // Name "x" + NUL at offset 0..100.
1991        header[0] = b'x';
1992        // Size field at 124..136 — 11 '7's + space terminator.
1993        for b in &mut header[124..135] {
1994            *b = b'7';
1995        }
1996        header[135] = b' ';
1997        // Mode bits + uid/gid zeros are fine as-is (the parser doesn't
1998        // read them); the rest of the header stays NUL.
1999        let err = read_ustar(&header).expect_err("oversize entry must be refused");
2000        let s = format!("{err}");
2001        assert!(
2002            s.contains("exceeds the") || s.contains("hard cap"),
2003            "expected MAX_TAR_ENTRY_BYTES rejection message, got: {s}"
2004        );
2005    }
2006
2007    /// #1250 — invariants on the hard cap: must be far enough below
2008    /// `usize::MAX` that `pos.checked_add(size)` cannot overflow even
2009    /// after several iterations, AND must accommodate the largest
2010    /// realistic bundle.
2011    #[test]
2012    fn read_ustar_oversize_cap_invariants_1250() {
2013        assert!(
2014            MAX_TAR_ENTRY_BYTES < usize::MAX / 4,
2015            "MAX_TAR_ENTRY_BYTES must be << usize::MAX so checked_add can never panic"
2016        );
2017        assert!(
2018            MAX_TAR_ENTRY_BYTES >= 100 * 1024 * 1024,
2019            "MAX_TAR_ENTRY_BYTES must accommodate the largest realistic forensic bundle"
2020        );
2021    }
2022
2023    // --- FUPC coverage additions (cov-fupc) ---
2024
2025    /// `run_export` with an explicit `--output` writes the bundle there
2026    /// and prints the path, returning exit 0.
2027    #[test]
2028    fn run_export_explicit_output_writes_bundle() {
2029        let tmp = TempDir::new().unwrap();
2030        let (conn, db_path) = open_tmp_db(&tmp);
2031        let id = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
2032        drop(conn); // run_export reopens the db itself
2033        let output = tmp.path().join("explicit.tar");
2034        let args = ExportForensicBundleArgs {
2035            memory_id: id,
2036            include_reflections: true,
2037            include_transcripts: false,
2038            include_atomisation_chain: true,
2039            output: Some(output.clone()),
2040        };
2041        let mut stdout = Vec::<u8>::new();
2042        let mut stderr = Vec::<u8>::new();
2043        let code = {
2044            let mut out = CliOutput::from_std(&mut stdout, &mut stderr);
2045            run_export(&db_path, &args, &mut out).expect("run_export")
2046        };
2047        assert_eq!(code, 0);
2048        assert!(output.exists(), "bundle file must be written");
2049        let printed = String::from_utf8(stdout).unwrap();
2050        assert!(printed.contains("forensic bundle written"));
2051        assert!(printed.contains("explicit.tar"));
2052    }
2053
2054    /// `run_export` with no `--output` derives a default
2055    /// `forensic-bundle-<short>-<ts>.tar` name in the cwd. We run it in
2056    /// a scoped cwd so the derived file lands under the tempdir.
2057    #[test]
2058    fn run_export_default_output_name_derived() {
2059        let tmp = TempDir::new().unwrap();
2060        let (conn, db_path) = open_tmp_db(&tmp);
2061        let id = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
2062        drop(conn);
2063        let args = ExportForensicBundleArgs {
2064            memory_id: id.clone(),
2065            include_reflections: false,
2066            include_transcripts: false,
2067            include_atomisation_chain: false,
2068            output: None,
2069        };
2070        let mut stdout = Vec::<u8>::new();
2071        let mut stderr = Vec::<u8>::new();
2072        let code = {
2073            let mut out = CliOutput::from_std(&mut stdout, &mut stderr);
2074            run_export(&db_path, &args, &mut out).expect("run_export default name")
2075        };
2076        assert_eq!(code, 0);
2077        let printed = String::from_utf8(stdout).unwrap();
2078        // The default name carries the 8-char short id prefix.
2079        let short: String = id.chars().take(8).collect();
2080        let prefix = format!("forensic-bundle-{short}-");
2081        assert!(
2082            printed.contains(&prefix),
2083            "default name must embed short id: {printed}"
2084        );
2085        // The default-name arm writes the bundle into the CWD (relative
2086        // path). Clean up the artifact so the test does not litter the
2087        // repo root. Recover the exact name from the stdout line.
2088        if let Some(name) = printed
2089            .lines()
2090            .find_map(|l| l.trim().strip_prefix("forensic bundle written: "))
2091        {
2092            let _ = fs::remove_file(name);
2093        }
2094    }
2095
2096    /// `run_verify` returns exit 0 on a clean bundle and prints
2097    /// "verification OK".
2098    #[test]
2099    fn run_verify_clean_bundle_exit_zero() {
2100        let tmp = TempDir::new().unwrap();
2101        let (conn, _) = open_tmp_db(&tmp);
2102        let id = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
2103        let args = ExportForensicBundleArgs {
2104            memory_id: id,
2105            include_reflections: true,
2106            include_transcripts: false,
2107            include_atomisation_chain: true,
2108            output: None,
2109        };
2110        let bundle_path = tmp.path().join("ok.tar");
2111        build(&conn, &args, &bundle_path, Some("2026-01-01T00:00:00Z")).expect("build");
2112        let vargs = VerifyForensicBundleArgs {
2113            bundle_path: bundle_path.clone(),
2114        };
2115        let mut stdout = Vec::<u8>::new();
2116        let mut stderr = Vec::<u8>::new();
2117        let code = {
2118            let mut out = CliOutput::from_std(&mut stdout, &mut stderr);
2119            run_verify(&vargs, &mut out).expect("run_verify")
2120        };
2121        assert_eq!(code, 0);
2122        let printed = String::from_utf8(stdout).unwrap();
2123        assert!(printed.contains("verification OK"));
2124    }
2125
2126    /// `run_verify` returns exit 2 (not 1) on a tampered bundle and
2127    /// prints "verification FAILED".
2128    #[test]
2129    fn run_verify_tampered_bundle_exit_two() {
2130        let tmp = TempDir::new().unwrap();
2131        let (conn, _) = open_tmp_db(&tmp);
2132        let id = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
2133        let args = ExportForensicBundleArgs {
2134            memory_id: id,
2135            include_reflections: true,
2136            include_transcripts: false,
2137            include_atomisation_chain: true,
2138            output: None,
2139        };
2140        let bundle_path = tmp.path().join("bad.tar");
2141        build(&conn, &args, &bundle_path, Some("2026-01-01T00:00:00Z")).expect("build");
2142        // Tamper a memory entry.
2143        let bytes = fs::read(&bundle_path).unwrap();
2144        let mut files = read_ustar(&bytes).unwrap();
2145        let key = files
2146            .keys()
2147            .find(|k| k.starts_with("memories/"))
2148            .unwrap()
2149            .clone();
2150        files.insert(key, b"tampered".to_vec());
2151        fs::write(&bundle_path, pack_to_vec(&files).unwrap()).unwrap();
2152
2153        let vargs = VerifyForensicBundleArgs { bundle_path };
2154        let mut stdout = Vec::<u8>::new();
2155        let mut stderr = Vec::<u8>::new();
2156        let code = {
2157            let mut out = CliOutput::from_std(&mut stdout, &mut stderr);
2158            run_verify(&vargs, &mut out).expect("run_verify")
2159        };
2160        assert_eq!(code, 2, "verification failure must exit 2 (#709)");
2161        let printed = String::from_utf8(stdout).unwrap();
2162        assert!(printed.contains("verification FAILED"));
2163    }
2164
2165    /// `verify` flags an entry present in the tar but absent from the
2166    /// manifest as an extra file (not part of the signed inventory).
2167    #[test]
2168    fn verify_detects_extra_file_in_bundle() {
2169        let tmp = TempDir::new().unwrap();
2170        let (conn, _) = open_tmp_db(&tmp);
2171        let id = insert_mem(&conn, "ns", 0, MemoryKind::Observation);
2172        let args = ExportForensicBundleArgs {
2173            memory_id: id,
2174            include_reflections: true,
2175            include_transcripts: false,
2176            include_atomisation_chain: true,
2177            output: None,
2178        };
2179        let bundle_path = tmp.path().join("extra.tar");
2180        build(&conn, &args, &bundle_path, Some("2026-01-01T00:00:00Z")).expect("build");
2181        let bytes = fs::read(&bundle_path).unwrap();
2182        let mut files = read_ustar(&bytes).unwrap();
2183        files.insert("memories/intruder.json".to_string(), b"{}".to_vec());
2184        fs::write(&bundle_path, pack_to_vec(&files).unwrap()).unwrap();
2185
2186        let report = verify(&bundle_path).expect("verify");
2187        assert!(
2188            report
2189                .extra_files
2190                .contains(&"memories/intruder.json".to_string()),
2191            "extra file must be reported: {:?}",
2192            report.extra_files
2193        );
2194    }
2195
2196    /// `verify` reports a missing manifest as a hard error.
2197    #[test]
2198    fn verify_missing_manifest_errors() {
2199        let tmp = TempDir::new().unwrap();
2200        let mut files = BundleFiles::new();
2201        files.insert("memories/x.json".to_string(), b"{}".to_vec());
2202        let bundle_path = tmp.path().join("no-manifest.tar");
2203        fs::write(&bundle_path, pack_to_vec(&files).unwrap()).unwrap();
2204        let err = verify(&bundle_path).unwrap_err();
2205        assert!(format!("{err:#}").contains("missing manifest"));
2206    }
2207
2208    /// `verify_edge_envelope` returns true for an unsigned edge (no
2209    /// signature to falsify).
2210    #[test]
2211    fn verify_edge_envelope_unsigned_is_ok() {
2212        let edge = EdgeEnvelope {
2213            source_id: "a".into(),
2214            target_id: "b".into(),
2215            relation: "reflects_on".into(),
2216            created_at: "2026-01-01T00:00:00Z".into(),
2217            observed_by: None,
2218            valid_from: None,
2219            valid_until: None,
2220            attest_level: "unsigned".into(),
2221            signature_hex: None,
2222        };
2223        assert!(verify_edge_envelope(&edge));
2224    }
2225
2226    /// A signed edge with no `observed_by` is a broken envelope → false.
2227    #[test]
2228    fn verify_edge_envelope_signed_without_agent_is_false() {
2229        let edge = EdgeEnvelope {
2230            source_id: "a".into(),
2231            target_id: "b".into(),
2232            relation: "reflects_on".into(),
2233            created_at: "2026-01-01T00:00:00Z".into(),
2234            observed_by: None,
2235            valid_from: None,
2236            valid_until: None,
2237            attest_level: "signed".into(),
2238            signature_hex: Some("deadbeef".into()),
2239        };
2240        assert!(!verify_edge_envelope(&edge));
2241    }
2242
2243    /// A signed edge naming an agent whose key is not enrolled locally
2244    /// is conservatively treated as a verification failure → false.
2245    #[test]
2246    fn verify_edge_envelope_unknown_signer_is_false() {
2247        let edge = EdgeEnvelope {
2248            source_id: "a".into(),
2249            target_id: "b".into(),
2250            relation: "reflects_on".into(),
2251            created_at: "2026-01-01T00:00:00Z".into(),
2252            observed_by: Some("nobody:unenrolled".into()),
2253            valid_from: None,
2254            valid_until: None,
2255            attest_level: "signed".into(),
2256            signature_hex: Some("deadbeef".into()),
2257        };
2258        assert!(!verify_edge_envelope(&edge));
2259    }
2260
2261    /// `SignatureStatus` + `VerificationReport` serialize to JSON (the
2262    /// shape `run_verify` prints).
2263    #[test]
2264    fn verification_report_serializes() {
2265        let report = VerificationReport {
2266            ok: true,
2267            bundle_path: "/x.tar".into(),
2268            manifest_present: true,
2269            schema_version: BUNDLE_SCHEMA_VERSION,
2270            memory_id: "abc".into(),
2271            signer_agent_id: None,
2272            signature_status: SignatureStatus::Absent,
2273            tampered_files: Vec::new(),
2274            missing_files: Vec::new(),
2275            extra_files: Vec::new(),
2276            chain_edges_failed: Vec::new(),
2277        };
2278        let json = serde_json::to_string(&report).expect("serialize");
2279        assert!(json.contains("\"ok\":true"));
2280        assert!(json.contains("abc"));
2281    }
2282}