// nornir 0.4.31 - Docs.rs

//! Iceberg-backed warehouse implementation.
//!
//! Files live under `<root>/warehouse/<namespace>/<table>/data/...`,
//! catalog metadata in `<root>/catalog.redb` (single ACID file via
//! [`skade_katalog`]). The sync [`Warehouse`] trait is bridged
//! to the async iceberg API by a private tokio runtime owned by this
//! warehouse instance.
//!
//! No services, no SQL server, no JVM. Layout is straight Iceberg —
//! a pyiceberg / Spark / DuckDB reader pointed at the same warehouse
//! root + catalog can read every row.

use std::collections::HashMap;
use std::path::Path;
use std::sync::Arc;

use anyhow::{anyhow, Context, Result};
use arrow::array::{
    Array, BooleanArray, Float64Array, Int32Array, Int64Array, RecordBatch, StringArray,
    TimestampMicrosecondArray,
};
use chrono::{DateTime, NaiveDate, NaiveDateTime, TimeZone, Utc};
use futures::TryStreamExt;
use iceberg::arrow::schema_to_arrow_schema;
use iceberg::expr::Reference;
use iceberg::io::LocalFsStorageFactory;
use iceberg::spec::{Datum, PartitionSpec, Transform, UnboundPartitionSpec};
use iceberg::table::Table;
use iceberg::{Catalog, CatalogBuilder, NamespaceIdent, TableCreation, TableIdent};
use skade_katalog::{RedbCatalog, RedbCatalogBuilder};
use tokio::runtime::Runtime;
use uuid::Uuid;

use super::iceberg_schema;
use super::{BenchFilter, Warehouse};
use crate::bench::{BenchResult, BenchRun, TestOutcome};

// Iceberg namespace under which every table named below is created.
const NAMESPACE: &str = "nornir";
// Table names. Each constant below names one Iceberg table in `NAMESPACE`;
// `ensure_layout` creates every one of them if it is missing.
//
// Bench run headers; partitioned by `repo` (see `ensure_layout`).
const TABLE_BENCH_RUNS: &str = "bench_runs";
const TABLE_BENCH_RESULTS: &str = "bench_results";
// Per-bench CPU/mem telemetry (one row per (run_id, bench)). Partition by `repo`
// like bench_runs so per-repo reads prune.
const TABLE_BENCH_TELEMETRY: &str = "bench_telemetry";
const TABLE_TEST_OUTCOMES: &str = "test_outcomes";
// First-class test-matrix results (one row per (run_id, suite, test_name)).
// Written by `nornir test`; read by the viz Test pane + `nornir test history`.
// Partitioned by `repo` like bench_runs so per-repo reads prune.
pub(crate) const TABLE_TEST_RESULTS: &str = "test_results";
pub(crate) const TABLE_DEP_GRAPH_EDGES: &str = "dep_graph_edges";
pub(crate) const TABLE_RELEASE_LINEAGE: &str = "release_lineage";
// Row-level DAG of a release run (one row per component×op×phase boundary).
// Read by `nornir release events` and the future "Release Ops" viz tab.
pub(crate) const TABLE_RELEASE_EVENTS: &str = "release_events";
pub(crate) const TABLE_FUNNEL_EVENTS: &str = "funnel_events";
// Tantivy index artifact, stored as a snapshot/blob table pair: snapshots are
// partitioned by `repo`, blobs by `snapshot_id` (see `ensure_layout`).
pub(crate) const TABLE_TANTIVY_INDEX_SNAPSHOTS: &str = "tantivy_index_snapshots";
pub(crate) const TABLE_TANTIVY_INDEX_BLOBS: &str = "tantivy_index_blobs";
// Urðr time-machine: future-artifact stubs. Tables are registered now
// so the warehouse schema is forward-compatible; no capture logic ships
// yet. Same shape as tantivy_index_* — populated by the upcoming
// `nornir debuginfo snapshot` workflow.
pub(crate) const TABLE_DWARF_SNAPSHOTS: &str = "dwarf_snapshots";
pub(crate) const TABLE_DWARF_BLOBS: &str = "dwarf_blobs";
pub(crate) const TABLE_GIMLI_SNAPSHOTS: &str = "gimli_snapshots";
pub(crate) const TABLE_GIMLI_BLOBS: &str = "gimli_blobs";
// Rustdoc-JSON snapshots: same artifact-snapshot/blob shape used by
// tantivy/dwarf/gimli; populated by feature 7 (rustdoc-json semver diff).
pub(crate) const TABLE_RUSTDOC_JSON_SNAPSHOTS: &str = "rustdoc_json_snapshots";
pub(crate) const TABLE_RUSTDOC_JSON_BLOBS: &str = "rustdoc_json_blobs";
// Vector / semantic-search: materialized embeddings + index identity +
// per-occurrence manifest. Lets semantic search at any historical git_sha be
// a warehouse read (no re-embed, no git walk).
pub(crate) const TABLE_EMBEDDINGS: &str = "embeddings";
pub(crate) const TABLE_EMBEDDING_SNAPSHOTS: &str = "embedding_snapshots";
pub(crate) const TABLE_EMBEDDING_MANIFEST: &str = "embedding_manifest";
// Cargo-pipeline fact tables (release-scoped, dep-graph-scoped, or events).
pub(crate) const TABLE_PATH_DEP_AUDITS: &str = "path_dep_audits";
pub(crate) const TABLE_PATCH_STRIP_EVENTS: &str = "patch_strip_events";
pub(crate) const TABLE_PUBLISH_ATTEMPTS: &str = "publish_attempts";
pub(crate) const TABLE_CRATE_METADATA_CHECKS: &str = "crate_metadata_checks";
pub(crate) const TABLE_CRATE_TARBALL_STATS: &str = "crate_tarball_stats";
pub(crate) const TABLE_YANK_EVENTS: &str = "yank_events";
pub(crate) const TABLE_SEMVER_DIFFS: &str = "semver_diffs";
pub(crate) const TABLE_LINKS_DECLARATIONS: &str = "links_declarations";
pub(crate) const TABLE_RESOLVED_FEATURES: &str = "resolved_features";
pub(crate) const TABLE_MSRV_PROBES: &str = "msrv_probes";
pub(crate) const TABLE_TEST_IMPACTED_SELECTIONS: &str = "test_impacted_selections";
pub(crate) const TABLE_TEST_QUARANTINES: &str = "test_quarantines";
pub(crate) const TABLE_RELEASE_COMMITS: &str = "release_commits";
pub(crate) const TABLE_REGISTRY_MIRRORS: &str = "registry_mirrors";
pub(crate) const TABLE_VERSION_BUMP_PLANS: &str = "version_bump_plans";
pub(crate) const TABLE_VERSION_BUMP_TARGETS: &str = "version_bump_targets";
// Knowledge map (syn + gix producers).
pub(crate) const TABLE_SYMBOL_FACTS: &str = "symbol_facts";
pub(crate) const TABLE_CALL_EDGES: &str = "call_edges";
pub(crate) const TABLE_FEATURE_GATE_FACTS: &str = "feature_gate_facts";
pub(crate) const TABLE_GIT_HEAT_FACTS: &str = "git_heat_facts";
// fn → warehouse-table access facts; registered unpartitioned alongside the
// cargo-pipeline tables in `ensure_layout`.
pub(crate) const TABLE_WAREHOUSE_ACCESS_EDGES: &str = "warehouse_access_edges";

// Rendered documentation: export history (inline bytes) + a dedicated
// Tantivy index (snapshot/blob pattern), kept separate from the code index.
pub(crate) const TABLE_DOC_EXPORTS: &str = "doc_exports";
// Architecture-wiring history: one row per generated coarsened wiring graph,
// keyed by git sha (see `ensure_layout`).
pub(crate) const TABLE_ARCHITECTURE_WIRING: &str = "architecture_wiring";
// MCP server call telemetry (metrics only — tool, status, latency, ts).
pub(crate) const TABLE_MCP_REQUESTS: &str = "mcp_requests";
pub(crate) const TABLE_VULN_FINDINGS: &str = "vuln_findings";
// The dedicated docs Tantivy index (snapshot/blob pair, same shape as
// tantivy_index_*).
pub(crate) const TABLE_DOCS_INDEX_SNAPSHOTS: &str = "docs_index_snapshots";
pub(crate) const TABLE_DOCS_INDEX_BLOBS: &str = "docs_index_blobs";
// SBOM components captured from a deep-scan / cargo-metadata sweep (one row per
// resolved dep), so the CycloneDX SBOM is a warehouse read.
pub(crate) const TABLE_SBOM_COMPONENTS: &str = "sbom_components";
// Knowledge-scan ledger keyed by (repo, git_sha) — drives re-scan idempotency.
pub(crate) const TABLE_KNOWLEDGE_SCANS: &str = "knowledge_scans";
// H5 local-LLM bake-off: one row per (run_id, model, prompt_id) answer.
pub(crate) const TABLE_AGENT_MODEL_RUNS: &str = "agent_model_runs";
// N5 viz action trail — durable sink for the viz user-action log (the twin of
// the `$NORNIR_VIZ_ACTIONLOG` /tmp file). One row per pushed action.
pub(crate) const TABLE_VIZ_ACTIONS: &str = "viz_actions";
// autonom completeness gate (AUT2 / n-005): one row per discovered surface node
// × mode × workspace × verdict — the persisted gate verdict, mirroring
// `tests/mcp_tool_coverage.json` generalized to the whole surface.
pub(crate) const TABLE_SURFACE_COVERAGE: &str = "surface_coverage";

/// Iceberg-backed warehouse handle: a redb catalog plus the private tokio
/// runtime that bridges sync callers to the async iceberg API.
pub struct IcebergWarehouse {
    /// Shared handle to the redb-backed Iceberg catalog.
    catalog: Arc<RedbCatalog>,
    /// Private runtime used by `block_on`. Held in an `Option` so `Drop` can
    /// `take()` it and shut it down in the background.
    rt: Option<Runtime>,
    /// Namespace under which every table identifier is built.
    namespace: NamespaceIdent,
    /// Warehouse root this instance was opened from.
    root: std::path::PathBuf,
    /// When this warehouse was opened from a copied-aside snapshot of
    /// `catalog.redb` (because a live `nornir-server` holds the exclusive
    /// redb lock on the real file), the temp dir that backs the copy is
    /// owned here so it lives exactly as long as the catalog and is cleaned
    /// up on drop. `None` for a normal exclusive open of the live catalog.
    _snapshot: Option<tempfile::TempDir>,
}

/// Reports whether `err` is redb's exclusive-lock rejection, i.e. some other
/// process (typically the live `nornir-server`) already holds the
/// single-writer lock on `catalog.redb`.
///
/// skade-katalog surfaces the condition as an `anyhow`/`iceberg` error whose
/// `Display` text carries redb's `DatabaseAlreadyOpen` wording. The concrete
/// error type is not re-exported here, so every link of the source chain is
/// rendered and searched for the known phrases instead of being downcast.
pub fn is_catalog_lock_error(err: &anyhow::Error) -> bool {
    // Message fragments that identify the lock rejection.
    const LOCK_MARKERS: [&str; 2] = ["Database already open", "Cannot acquire lock"];

    for cause in err.chain() {
        let rendered = cause.to_string();
        if LOCK_MARKERS.iter().any(|marker| rendered.contains(*marker)) {
            return true;
        }
    }
    false
}

/// Copy the locked `live` `catalog.redb` → `dst` so the copy is a **complete,
/// openable** image reflecting the latest committed catalog state, even while
/// the `nornir-server` is committing to `live`.
///
/// Why a plain `fs::copy` is not safe here: `live` is held open and mutated in
/// place by the server. redb grows the file in **regions** — to add a region it
/// updates the on-disk header to advertise the larger layout *and* extends the
/// file. Those two steps are not atomic w.r.t. an outside `fs::copy`: if the
/// copy snapshots the file after the header was written but before (or partway
/// through) the file was extended, the copy's byte length is **shorter than the
/// layout its own header claims**. Opening that copy trips redb's
/// `raw_file_len() >= header.layout().len()` invariant — in redb 2.6 an
/// **assertion that panics** (`page_manager.rs`), crashing the reader, or, in
/// milder mismatches, forces a recovery to an older/short state. The reader the
/// caller believes is "latest" then crashes or reads empty (the fat-viz
/// empty-Timeline bug).
///
/// The fix is to take the copy only when the source is **size-stable** — copy,
/// then re-stat the source and re-copy until its length does not change across
/// the copy window. A window with no region growth cannot leave the header
/// advertising bytes the copy is missing, so the resulting copy always satisfies
/// redb's length invariant and opens to the latest committed transaction. (redb
/// is robust to a *longer* file via recovery, and to a torn-but-complete copy
/// via its two-phase god-slot — both safe; only the short-file race is the
/// hazard, and size-stability closes exactly that.) Growth is rare and the
/// server fsyncs per commit, so this converges in one pass in the steady state
/// and a few retries while a region is being added; bounded by `MAX_ATTEMPTS`.
fn copy_catalog_consistent(live: &Path, dst: &Path) -> Result<()> {
    copy_catalog_consistent_with(live, dst, |s, d| {
        std::fs::copy(s, d)
            .map(|_| ())
            .with_context(|| format!("copy {} -> {}", s.display(), d.display()))
    })
}

/// Core of [`copy_catalog_consistent`] with the copy step injected, so tests can
/// deterministically drive the size-stability retry (force a short/growing copy
/// on early attempts, a stable one later) instead of racing a real writer.
fn copy_catalog_consistent_with(
    live: &Path,
    dst: &Path,
    mut copy: impl FnMut(&Path, &Path) -> Result<()>,
) -> Result<()> {
    const MAX_ATTEMPTS: usize = 64;

    let len_of = |p: &Path| -> Result<u64> {
        Ok(std::fs::metadata(p)
            .with_context(|| format!("stat {}", p.display()))?
            .len())
    };

    for attempt in 0..MAX_ATTEMPTS {
        let before = len_of(live)?;
        copy(live, dst)?;
        let after = len_of(live)?;
        // Accept only a copy taken across a window with no region growth AND
        // whose own length is not short relative to the source: then it cannot
        // be shorter than the header's advertised layout, so redb opens it to
        // the latest committed transaction. (`len_of(dst) >= after` rejects a
        // torn/short copy even if the source size happened to look unchanged.)
        if before == after && len_of(dst)? >= after {
            return Ok(());
        }
        // A region grew (or the copy came up short) across the window; let the
        // extension land, then retry.
        std::thread::sleep(std::time::Duration::from_millis(2 + attempt as u64 / 4));
    }
    anyhow::bail!(
        "snapshot of locked catalog {} never reached a size-stable window after \
         {MAX_ATTEMPTS} attempts (catalog growing under a very hot writer)",
        live.display()
    )
}

impl Drop for IcebergWarehouse {
    /// Hands the private runtime off for background shutdown.
    ///
    /// A tokio `Runtime` dropped synchronously from inside another tokio
    /// runtime (e.g. when an async MCP server owns this warehouse) panics.
    /// Taking the runtime out of the field and calling `shutdown_background`
    /// keeps the outer runtime's drop safe.
    fn drop(&mut self) {
        let Some(runtime) = self.rt.take() else {
            // Runtime already taken; nothing left to shut down.
            return;
        };
        runtime.shutdown_background();
    }
}

impl IcebergWarehouse {
    pub fn open(root: &Path) -> Result<Self> {
        std::fs::create_dir_all(root)
            .with_context(|| format!("create warehouse root {}", root.display()))?;
        let warehouse_dir = root.join("warehouse");
        std::fs::create_dir_all(&warehouse_dir)
            .with_context(|| format!("create warehouse data dir {}", warehouse_dir.display()))?;
        let db_path = root.join("catalog.redb");
        let warehouse_uri = format!("file://{}", warehouse_dir.canonicalize()?.display());
        Self::open_with_catalog(root, &db_path, &warehouse_uri, None)
    }

    /// Read-oriented open that tolerates a locked catalog.
    ///
    /// First attempts a normal [`open`](Self::open). If that fails because the
    /// real `catalog.redb` is held exclusively by another process (the live
    /// `nornir-server` — skade-katalog is single-writer), this does NOT
    /// hard-fail: it copies `catalog.redb` aside into a temp dir and opens
    /// *that* copy, giving a point-in-time read-only snapshot of the catalog.
    /// Table data files are referenced by absolute `file://` paths in the
    /// catalog metadata, so they still resolve to the live warehouse data —
    /// only the catalog index is copied (typically a few MB).
    ///
    /// This is what lets dev-side `nornir` CLI reads (the `docs_fresh` gate,
    /// docs render, `release trace`, etc.) coexist with a running server
    /// instead of deadlocking against its lock. The exclusive open is always
    /// preferred; falling back to the snapshot is logged to stderr so the
    /// degraded mode is never silent.
    ///
    /// Any other open error is returned unchanged.
    ///
    /// Callers that mutate the warehouse must use [`open`](Self::open) — a
    /// snapshot copy is read-only by construction (writes would land in the
    /// throwaway temp dir, not the live catalog).
    pub fn open_read_only(root: &Path) -> Result<Self> {
        let outcome = Self::open(root);
        // Success and non-lock failures pass straight through.
        if !matches!(&outcome, Err(e) if is_catalog_lock_error(e)) {
            return outcome;
        }

        eprintln!(
            "WARNING: nornir: catalog.redb at {} is locked by another process \
             (a live nornir-server?); opening a read-only copied snapshot instead. \
             Reads reflect the catalog as of now; this CLI cannot mutate the live \
             warehouse while the server holds it.",
            root.display()
        );
        let snapshot = Self::open_snapshot(root);
        snapshot.with_context(|| {
            format!(
                "open read-only snapshot of locked catalog at {}",
                root.display()
            )
        })
    }

    /// Copy `catalog.redb` aside and open the copy. Used by
    /// [`open_read_only`](Self::open_read_only) when the live catalog is locked.
    fn open_snapshot(root: &Path) -> Result<Self> {
        let live_db = root.join("catalog.redb");
        if !live_db.exists() {
            // Nothing to snapshot — the lock error wasn't about our catalog.
            anyhow::bail!(
                "no catalog.redb at {} to snapshot",
                live_db.display()
            );
        }
        // Data files are referenced by absolute paths in the catalog, so the
        // data dir must stay pointed at the *real* warehouse. Only the catalog
        // index file is copied into the temp dir.
        let tmp = tempfile::Builder::new()
            .prefix("nornir-catalog-snapshot-")
            .tempdir()
            .context("create temp dir for catalog snapshot")?;
        let snap_db = tmp.path().join("catalog.redb");
        // The live `catalog.redb` is being mutated *under us* by the server that
        // holds the lock. A single blind `fs::copy` can race redb's region growth
        // and capture a file shorter than its own header's advertised layout —
        // which redb then either panics on (its length assertion) or recovers to
        // a stale/short state, so the reader crashes or sees an empty Timeline.
        // `copy_catalog_consistent` instead snapshots only across a size-stable
        // window, yielding a complete image that opens to the latest committed
        // state (see its doc comment for the full rationale).
        copy_catalog_consistent(&live_db, &snap_db).with_context(|| {
            format!(
                "copy locked catalog {} -> snapshot {}",
                live_db.display(),
                snap_db.display()
            )
        })?;
        let warehouse_dir = root.join("warehouse");
        let warehouse_uri = format!("file://{}", warehouse_dir.canonicalize()?.display());
        Self::open_with_catalog(root, &snap_db, &warehouse_uri, Some(tmp))
    }

    /// Common constructor behind every open path.
    ///
    /// Builds the private multi-thread tokio runtime, loads the redb catalog
    /// at `db_path` with table data rooted at `warehouse_uri`, and runs
    /// `ensure_layout` so the namespace and all tables exist. `snapshot`, when
    /// `Some`, is the temp dir owning a copied-aside `db_path`; storing it in
    /// the warehouse ties its lifetime to the catalog.
    ///
    /// NOTE(review): `ensure_layout` also runs for snapshot opens. If the
    /// copied catalog were missing a table, that table would be created
    /// through the throwaway catalog copy — confirm this is intended for the
    /// read-only path.
    fn open_with_catalog(
        root: &Path,
        db_path: &Path,
        warehouse_uri: &str,
        snapshot: Option<tempfile::TempDir>,
    ) -> Result<Self> {
        let runtime = tokio::runtime::Builder::new_multi_thread()
            .enable_all()
            .build()
            .context("build tokio runtime")?;

        // The builder is configured and loaded inside the runtime.
        let catalog = runtime
            .block_on(async {
                let builder = RedbCatalogBuilder::default()
                    .db_path(db_path)
                    .warehouse_location(warehouse_uri)
                    .with_storage_factory(Arc::new(LocalFsStorageFactory));
                builder.load("nornir", HashMap::new()).await
            })
            .map(Arc::new)?;
        let namespace = NamespaceIdent::new(NAMESPACE.to_owned());

        // Create the namespace and any missing tables before handing out the
        // warehouse.
        let layout = ensure_layout(&catalog, &namespace);
        runtime.block_on(layout)?;

        let warehouse = Self {
            catalog,
            rt: Some(runtime),
            namespace,
            root: root.to_path_buf(),
            _snapshot: snapshot,
        };
        Ok(warehouse)
    }

    /// Returns the root directory this warehouse was opened from — the parent
    /// of `warehouse/` and `catalog.redb`. The timeline uses it to locate the
    /// sibling `../git/` clones for the git-history fallback when a workspace
    /// has no release backlog.
    pub fn root(&self) -> &Path {
        self.root.as_path()
    }

    /// True when this warehouse was opened from a copied-aside read-only
    /// snapshot of a locked `catalog.redb` (via [`open_read_only`](Self::open_read_only)).
    /// Such a warehouse is read-only: writes land in a throwaway temp dir, not
    /// the live catalog, so callers must not mutate it.
    ///
    /// Implemented as "the snapshot temp dir is present": a normal exclusive
    /// open stores `None` there.
    pub fn is_snapshot(&self) -> bool {
        self._snapshot.is_some()
    }

    /// Builds the identifier of table `name` inside this warehouse's namespace.
    pub(crate) fn table_ident(&self, name: &str) -> TableIdent {
        let namespace = self.namespace.clone();
        TableIdent::new(namespace, name.to_owned())
    }
}

impl IcebergWarehouse {
    /// Access the underlying catalog. Used by sibling modules
    /// (e.g. `warehouse::dep_graph`) that read/write additional tables
    /// without going through the `Warehouse` trait.
    ///
    /// Returns a borrow of the shared handle; clone the `Arc` to keep the
    /// catalog beyond the borrow.
    pub fn catalog(&self) -> &Arc<RedbCatalog> {
        &self.catalog
    }

    /// Drives `fut` to completion on this warehouse's private runtime and
    /// returns its output. Sibling modules share the runtime through this
    /// method so the sync `Warehouse` trait can stay sync.
    ///
    /// # Panics
    ///
    /// Panics with "rt present" if the runtime has already been taken out of
    /// the warehouse (which `Drop` does).
    pub fn block_on<F>(&self, fut: F) -> F::Output
    where
        F: std::future::Future,
    {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(fut)
    }
}

/// Ensure the namespace `ns` and every warehouse table exist in `catalog`.
///
/// Creates `ns` when it is absent, then registers each table through
/// `create_table_if_missing` / `create_partitioned_table_if_missing`, both of
/// which return early for a table that already exists — so running this
/// against an already-initialized catalog creates nothing. The first error
/// (schema construction or catalog call) aborts the remaining registrations;
/// tables created before it stay in place.
async fn ensure_layout(catalog: &RedbCatalog, ns: &NamespaceIdent) -> Result<()> {
    if !catalog.namespace_exists(ns).await? {
        catalog.create_namespace(ns, HashMap::new()).await?;
    }
    // bench_runs is the highest-traffic scoped-read table — partition by `repo`
    // so `query_bench_runs(for_repo(..))` prunes to one repo's files.
    create_partitioned_table_if_missing(
        catalog,
        ns,
        TABLE_BENCH_RUNS,
        iceberg_schema::bench_runs()?,
        &["repo"],
    )
    .await?;
    create_table_if_missing(
        catalog,
        ns,
        TABLE_BENCH_RESULTS,
        iceberg_schema::bench_results()?,
    )
    .await?;
    // Per-bench telemetry: partitioned by `repo` like bench_runs so per-repo
    // reads prune to one repo's files.
    create_partitioned_table_if_missing(
        catalog,
        ns,
        TABLE_BENCH_TELEMETRY,
        iceberg_schema::bench_telemetry()?,
        &["repo"],
    )
    .await?;
    create_table_if_missing(
        catalog,
        ns,
        TABLE_TEST_OUTCOMES,
        iceberg_schema::test_outcomes()?,
    )
    .await?;
    // C6 test-matrix: standalone green/red board (not bench-coupled). Partition
    // by `repo` so `nornir test history <repo>` + the viz Test pane prune to one
    // repo's files, exactly like bench_runs.
    create_partitioned_table_if_missing(
        catalog,
        ns,
        TABLE_TEST_RESULTS,
        iceberg_schema::test_results()?,
        &["repo"],
    )
    .await?;
    create_table_if_missing(
        catalog,
        ns,
        TABLE_DEP_GRAPH_EDGES,
        iceberg_schema::dep_graph_edges()?,
    )
    .await?;
    create_table_if_missing(
        catalog,
        ns,
        TABLE_RELEASE_LINEAGE,
        iceberg_schema::release_lineage()?,
    )
    .await?;
    // Release-op DAG. Reads scope by `run_id` (one release run) and roll the
    // long rows into a topo view; unpartitioned (a run is a handful of rows, the
    // residual `run_id` filter in the reader is enough).
    create_table_if_missing(
        catalog,
        ns,
        TABLE_RELEASE_EVENTS,
        iceberg_schema::release_events()?,
    )
    .await?;
    create_table_if_missing(
        catalog,
        ns,
        TABLE_FUNNEL_EVENTS,
        iceberg_schema::funnel_events()?,
    )
    .await?;
    // Artifact snapshot/blob pairs. Snapshots are looked up by `repo` (find the
    // latest for a repo); blobs are fetched by `snapshot_id` (one snapshot's
    // bytes) — partition each by its read key so restore prunes to one snapshot
    // instead of O(all bytes ever). (`*_blobs` carries no `repo` column.)
    // Every pair shares the same two schemas (`artifact_snapshots` /
    // `artifact_blobs`); only the table names differ.
    for (snap_name, blob_name) in [
        (TABLE_TANTIVY_INDEX_SNAPSHOTS, TABLE_TANTIVY_INDEX_BLOBS),
        (TABLE_DWARF_SNAPSHOTS, TABLE_DWARF_BLOBS),
        (TABLE_GIMLI_SNAPSHOTS, TABLE_GIMLI_BLOBS),
        (TABLE_RUSTDOC_JSON_SNAPSHOTS, TABLE_RUSTDOC_JSON_BLOBS),
        (TABLE_DOCS_INDEX_SNAPSHOTS, TABLE_DOCS_INDEX_BLOBS),
    ] {
        create_partitioned_table_if_missing(
            catalog, ns, snap_name, iceberg_schema::artifact_snapshots()?, &["repo"],
        )
        .await?;
        create_partitioned_table_if_missing(
            catalog, ns, blob_name, iceberg_schema::artifact_blobs()?, &["snapshot_id"],
        )
        .await?;
    }
    // Rendered-document export history (inline bytes).
    create_table_if_missing(catalog, ns, TABLE_DOC_EXPORTS, iceberg_schema::doc_exports()?).await?;
    // Architecture-wiring history (EPIC ARCH n-006): one row per generated
    // coarsened wiring graph, keyed by git sha; carries the graph JSON + SVG.
    create_table_if_missing(
        catalog,
        ns,
        TABLE_ARCHITECTURE_WIRING,
        iceberg_schema::architecture_wiring()?,
    )
    .await?;
    // Vector / semantic-search tables.
    create_table_if_missing(catalog, ns, TABLE_EMBEDDINGS, iceberg_schema::embeddings()?).await?;
    create_table_if_missing(
        catalog,
        ns,
        TABLE_EMBEDDING_SNAPSHOTS,
        iceberg_schema::embedding_snapshots()?,
    )
    .await?;
    create_table_if_missing(
        catalog,
        ns,
        TABLE_EMBEDDING_MANIFEST,
        iceberg_schema::embedding_manifest()?,
    )
    .await?;
    // Flat, unpartitioned fact/event tables: the cargo-pipeline tables
    // (feature 1-15) plus MCP request telemetry, vulnerability findings,
    // warehouse-access edges and surface coverage. All schemas in the array are
    // built up front, before the first table of this group is created.
    for (name, schema) in [
        (TABLE_PATH_DEP_AUDITS, iceberg_schema::path_dep_audits()?),
        (TABLE_PATCH_STRIP_EVENTS, iceberg_schema::patch_strip_events()?),
        (TABLE_PUBLISH_ATTEMPTS, iceberg_schema::publish_attempts()?),
        (TABLE_CRATE_METADATA_CHECKS, iceberg_schema::crate_metadata_checks()?),
        (TABLE_CRATE_TARBALL_STATS, iceberg_schema::crate_tarball_stats()?),
        (TABLE_YANK_EVENTS, iceberg_schema::yank_events()?),
        (TABLE_SEMVER_DIFFS, iceberg_schema::semver_diffs()?),
        (TABLE_LINKS_DECLARATIONS, iceberg_schema::links_declarations()?),
        (TABLE_RESOLVED_FEATURES, iceberg_schema::resolved_features()?),
        (TABLE_MSRV_PROBES, iceberg_schema::msrv_probes()?),
        (TABLE_TEST_IMPACTED_SELECTIONS, iceberg_schema::test_impacted_selections()?),
        (TABLE_TEST_QUARANTINES, iceberg_schema::test_quarantines()?),
        (TABLE_RELEASE_COMMITS, iceberg_schema::release_commits()?),
        (TABLE_REGISTRY_MIRRORS, iceberg_schema::registry_mirrors()?),
        (TABLE_VERSION_BUMP_PLANS, iceberg_schema::version_bump_plans()?),
        (TABLE_VERSION_BUMP_TARGETS, iceberg_schema::version_bump_targets()?),
        (TABLE_MCP_REQUESTS, iceberg_schema::mcp_requests()?),
        (TABLE_VULN_FINDINGS, iceberg_schema::vuln_findings()?),
        // fn → warehouse-table access fact (AUT7 / EPIC ARCH n-002). A small,
        // flat, code-derived fact table (mirrors links_declarations); browsable
        // generically and read scoped via `query_warehouse_access_edges`.
        (TABLE_WAREHOUSE_ACCESS_EDGES, iceberg_schema::warehouse_access_edges()?),
        // autonom completeness gate (n-005): per surface-node verdict rows.
        (TABLE_SURFACE_COVERAGE, iceberg_schema::surface_coverage()?),
    ] {
        create_table_if_missing(catalog, ns, name, schema).await?;
    }
    // Knowledge-map tables are written by single-repo scans and read scoped to a
    // repo (`knowledge::query::load_latest`), so partition them by `repo` to prune.
    for (name, schema) in [
        (TABLE_SYMBOL_FACTS, iceberg_schema::symbol_facts()?),
        (TABLE_CALL_EDGES, iceberg_schema::call_edges()?),
        (TABLE_FEATURE_GATE_FACTS, iceberg_schema::feature_gate_facts()?),
        (TABLE_GIT_HEAT_FACTS, iceberg_schema::git_heat_facts()?),
        // SBOM components + the knowledge-scan ledger are both read scoped to a
        // single `repo`, so partition them by `repo` to prune scans.
        (TABLE_SBOM_COMPONENTS, iceberg_schema::sbom_components()?),
        (TABLE_KNOWLEDGE_SCANS, iceberg_schema::knowledge_scans()?),
    ] {
        create_partitioned_table_if_missing(catalog, ns, name, schema, &["repo"]).await?;
    }
    // H5 bake-off: one row per (run_id, model, prompt) answer. A bake-off writes
    // every model's row in one batch, and the leaderboard reads a whole run back,
    // so partition by `run_id` — one append = one partition, and a run read prunes
    // to that run's files (a per-model read does a residual `model` filter).
    create_partitioned_table_if_missing(
        catalog,
        ns,
        TABLE_AGENT_MODEL_RUNS,
        iceberg_schema::agent_model_runs()?,
        &["run_id"],
    )
    .await?;
    // N5 viz action trail. A viz session appends its actions as they happen and
    // reads its own recent history back scoped to one `session_id`, so partition
    // by `session_id` — a per-session read prunes to that launch's files (a
    // cross-session read does a residual filter).
    create_partitioned_table_if_missing(
        catalog,
        ns,
        TABLE_VIZ_ACTIONS,
        iceberg_schema::viz_actions()?,
        &["session_id"],
    )
    .await?;
    Ok(())
}

/// Creates the unpartitioned table `name` in `ns` with `schema` unless it
/// already exists. Delegates to the partitioned variant with an empty column
/// list.
async fn create_table_if_missing(
    catalog: &RedbCatalog,
    ns: &NamespaceIdent,
    name: &str,
    schema: iceberg::spec::Schema,
) -> Result<()> {
    // No partition columns: the table is laid out unpartitioned.
    let unpartitioned: &[&str] = &[];
    create_partitioned_table_if_missing(catalog, ns, name, schema, unpartitioned).await
}

/// Creates table `name` in `ns` with `schema`, partitioned by the identity of
/// each column in `partition_cols` (every entry must be a top-level column of
/// `schema`). An empty `partition_cols` yields an unpartitioned table, and an
/// existing table is left untouched.
///
/// Partitioning lets scoped reads prune whole data files at plan time instead
/// of scanning all history (see plan.md §"Iceberg warehouse efficiency").
async fn create_partitioned_table_if_missing(
    catalog: &RedbCatalog,
    ns: &NamespaceIdent,
    name: &str,
    schema: iceberg::spec::Schema,
    partition_cols: &[&str],
) -> Result<()> {
    let ident = TableIdent::new(ns.clone(), name.to_string());
    let already_registered = catalog.table_exists(&ident).await?;
    if already_registered {
        return Ok(());
    }

    // `TableCreation::builder()` is a typed-builder: each setter yields a new
    // type, so the two shapes are built in separate arms instead of mutating
    // one binding.
    let creation = match partition_cols {
        [] => TableCreation::builder()
            .name(name.to_string())
            .schema(schema)
            .build(),
        cols => {
            // The spec only needs to borrow `schema`, so derive it before the
            // schema is moved into the builder.
            let spec = identity_partition_spec(&schema, cols)?;
            TableCreation::builder()
                .name(name.to_string())
                .schema(schema)
                .partition_spec(spec)
                .build()
        }
    };

    catalog.create_table(ns, creation).await?;
    Ok(())
}

/// Builds an identity-transform [`UnboundPartitionSpec`] over `columns`.
///
/// The spec is first bound against `schema` (which resolves each column name
/// to a field id) and then handed back unbound, the form `TableCreation`
/// takes. Each partition field keeps its source column's name.
fn identity_partition_spec(
    schema: &iceberg::spec::Schema,
    columns: &[&str],
) -> Result<UnboundPartitionSpec> {
    let seed = PartitionSpec::builder(Arc::new(schema.clone()));
    // Thread the builder through every column, stopping at the first column
    // the builder rejects.
    let builder = columns.iter().try_fold(seed, |acc, col| {
        acc.add_partition_field(col, (*col).to_string(), Transform::Identity)
    })?;
    let bound = builder.build()?;
    Ok(bound.into_unbound())
}

impl IcebergWarehouse {
    /// Async variant of [`Warehouse::append_bench_run`] — call this
    /// directly from async contexts (e.g. `release::pipeline`) to
    /// avoid the nested `block_on` that would otherwise panic when
    /// the sync trait method is invoked from inside a running tokio
    /// runtime.
    ///
    /// Writes one `bench_runs` row, then the run's scalar metrics
    /// (`bench_results`), per-bench telemetry (`bench_telemetry`) and test
    /// outcomes (`test_outcomes`), all keyed by a freshly generated run id.
    /// Child tables whose batch would be empty are not appended to.
    ///
    /// All four tables are loaded and all four batches are built **before**
    /// the first append. A load or batch-construction failure on a later
    /// table (e.g. a column-count mismatch against a stale schema) therefore
    /// aborts with nothing written, instead of leaving an orphaned
    /// `bench_runs` row that a retry would duplicate under a new id. The
    /// appends themselves are still one commit per table, so a failure during
    /// the append phase can leave earlier tables written.
    ///
    /// # Errors
    /// Fails if `run.machine` is blank, if the run's timestamp/date cannot be
    /// parsed, or if any table load, batch build or append fails.
    pub async fn append_bench_run_async(&self, repo: &str, run: &BenchRun) -> Result<Uuid> {
        if run.machine.trim().is_empty() {
            anyhow::bail!("BenchRun.machine is required");
        }
        let run_id = Uuid::new_v4();
        let ts = resolve_timestamp(run)?;

        // Phase 1 — read-only: load table metadata and build every batch.
        let runs_table = self.catalog.load_table(&self.table_ident(TABLE_BENCH_RUNS)).await?;
        let runs_batch = build_bench_runs_batch(&runs_table, run_id, repo, ts, run)?;

        let results_table =
            self.catalog.load_table(&self.table_ident(TABLE_BENCH_RESULTS)).await?;
        let results_batch = build_bench_results_batch(&results_table, run_id, run)?;

        // Per-bench telemetry (CPU/mem footprint) lifted out of each result's
        // `telem_*` metrics into its own partitioned table, keyed by run_id+bench.
        let telem_table =
            self.catalog.load_table(&self.table_ident(TABLE_BENCH_TELEMETRY)).await?;
        let telem_batch = build_bench_telemetry_batch(&telem_table, run_id, repo, run)?;

        let tests_table =
            self.catalog.load_table(&self.table_ident(TABLE_TEST_OUTCOMES)).await?;
        let tests_batch = build_test_outcomes_batch(&tests_table, run_id, run)?;

        // Phase 2 — writes, in the original order: the run row first, then
        // the child tables (results, telemetry, tests).
        append_batch(&self.catalog, runs_table, runs_batch).await?;
        let children = [
            (results_table, results_batch),
            (telem_table, telem_batch),
            (tests_table, tests_batch),
        ];
        for (table, batch) in children {
            // An empty batch would only create an empty snapshot; skip it.
            if batch.num_rows() > 0 {
                append_batch(&self.catalog, table, batch).await?;
            }
        }
        Ok(run_id)
    }

    /// Idempotency lookup: returns the `snapshot_id` already recorded for
    /// `(repo, git_sha)` in the artifact `*_snapshots` table `table_name`, or
    /// `None` when there is no such row.
    ///
    /// A capture phase uses this to skip re-writing a projection that is
    /// already in the warehouse at this SHA. The warehouse can't compact
    /// (iceberg-rust 0.9 has no `expire_snapshots`/rewrite), so not writing
    /// redundant blobs is the lever that bounds growth. The scan is
    /// partition-pruned by `repo`, so it reads one repo's small snapshot
    /// registry, never the blob bytes.
    ///
    /// A table that cannot be loaded (e.g. first capture ever, table not yet
    /// created) reads as "not present". When several rows match, the last one
    /// encountered in scan order wins.
    async fn artifact_snapshot_id_for(
        &self,
        table_name: &str,
        repo: &str,
        git_sha: &str,
    ) -> Result<Option<String>> {
        // NOTE: every load error — not only "table missing" — is mapped to
        // `None` here.
        let Ok(table) = self.catalog.load_table(&self.table_ident(table_name)).await else {
            return Ok(None);
        };
        let batches = scan_repo_filtered(&table, Some(repo)).await?;
        // artifact_snapshots layout: col 0 = snapshot_id, col 3 = git_sha.
        let mut latest_match: Option<String> = None;
        for batch in &batches {
            let snapshot_ids = downcast::<StringArray>(batch, 0)?;
            let shas = downcast::<StringArray>(batch, 3)?;
            // Searching from the back yields the last matching row of this
            // batch; later batches overwrite earlier ones.
            let hit = (0..batch.num_rows()).rev().find(|&row| shas.value(row) == git_sha);
            if let Some(row) = hit {
                latest_match = Some(snapshot_ids.value(row).to_string());
            }
        }
        Ok(latest_match)
    }

    /// Look up the `snapshot_id` of a Tantivy index capture already stored
    /// for `(repo, git_sha)`; `None` when no such capture is recorded.
    pub async fn index_snapshot_id_for(&self, repo: &str, git_sha: &str) -> Result<Option<String>> {
        let registry = TABLE_TANTIVY_INDEX_SNAPSHOTS;
        self.artifact_snapshot_id_for(registry, repo, git_sha).await
    }

    /// Look up the `snapshot_id` of a DWARF capture already stored for
    /// `(repo, git_sha)`; `None` when no such capture is recorded.
    pub async fn dwarf_snapshot_id_for(&self, repo: &str, git_sha: &str) -> Result<Option<String>> {
        let registry = TABLE_DWARF_SNAPSHOTS;
        self.artifact_snapshot_id_for(registry, repo, git_sha).await
    }

    /// List the name of every table in the warehouse namespace, sorted
    /// ascending — the inventory shown by the generic viz browser. Only the
    /// catalog listing is consulted; no table data is read.
    ///
    /// # Panics
    /// Panics if the warehouse's runtime is absent (`rt` is `None`).
    pub fn table_names(&self) -> Result<Vec<String>> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(async {
            let idents = self.catalog.list_tables(&self.namespace).await?;
            let mut names: Vec<String> =
                idents.iter().map(|ident| ident.name().to_string()).collect();
            names.sort();
            Ok(names)
        })
    }

    /// Scan `table` and return up to `limit` rows as display strings (column
    /// names + stringified cells) — the generic "show any warehouse table in
    /// egui" path. Blob/timestamp columns render as a `<type>` placeholder
    /// rather than dumping bytes.
    ///
    /// Column names come from the first scanned batch. When the scan yields
    /// no batches (an empty table) they fall back to the table's current
    /// schema, so the preview still carries headers.
    ///
    /// `limit == 0` returns the column names and no rows **without scanning**:
    /// `scan_limited` treats a zero limit as "read everything", which would
    /// otherwise read the whole table only for every row to be discarded.
    ///
    /// # Errors
    /// Fails if the table cannot be loaded or the scan fails.
    ///
    /// # Panics
    /// Panics if the warehouse's runtime is absent (`rt` is `None`).
    pub fn scan_preview(&self, table: &str, limit: usize) -> Result<TablePreview> {
        self.rt.as_ref().expect("rt present").block_on(async {
            let t = self.catalog.load_table(&self.table_ident(table)).await?;
            // Top-level column names straight from table metadata (no data read).
            let schema_columns = || -> Vec<String> {
                t.metadata()
                    .current_schema()
                    .as_struct()
                    .fields()
                    .iter()
                    .map(|f| f.name.clone())
                    .collect()
            };
            if limit == 0 {
                // Zero rows requested: answer from metadata alone.
                return Ok(TablePreview { columns: schema_columns(), rows: Vec::new() });
            }
            let batches = scan_limited(&t, limit).await?;
            let mut columns: Vec<String> = Vec::new();
            let mut rows: Vec<Vec<String>> = Vec::new();
            'outer: for b in &batches {
                if columns.is_empty() {
                    columns = b.schema().fields().iter().map(|f| f.name().to_string()).collect();
                }
                for i in 0..b.num_rows() {
                    // `scan_limited` may overshoot by part of a batch; cap here.
                    if rows.len() >= limit {
                        break 'outer;
                    }
                    let row = (0..b.num_columns())
                        .map(|c| cell_to_string(b.column(c).as_ref(), i))
                        .collect();
                    rows.push(row);
                }
            }
            if columns.is_empty() {
                // No batch supplied a schema (empty table): use the metadata.
                columns = schema_columns();
            }
            Ok(TablePreview { columns, rows })
        })
    }
}

/// A stringified preview of one warehouse table for the egui browser.
///
/// Produced by `IcebergWarehouse::scan_preview`; `Default` yields an empty
/// preview (no columns, no rows).
#[derive(Debug, Clone, Default)]
pub struct TablePreview {
    /// Column names, in the order of the scanned batch's schema.
    pub columns: Vec<String>,
    /// One entry per previewed row; each inner vector holds that row's cells
    /// as display strings, in column order.
    pub rows: Vec<Vec<String>>,
}

/// Render the cell at `row` of `arr` as a display string.
///
/// Null cells become the empty string. String, 64-bit and 32-bit integer and
/// boolean cells use their plain text form; 64-bit floats are printed with
/// four decimal places. Any other array type (timestamps, blobs, lists, …)
/// is shown as a `<DataType>` placeholder so the browser never dumps raw bytes.
fn cell_to_string(arr: &dyn Array, row: usize) -> String {
    if arr.is_null(row) {
        return String::new();
    }
    let any = arr.as_any();
    if let Some(strings) = any.downcast_ref::<StringArray>() {
        strings.value(row).to_string()
    } else if let Some(longs) = any.downcast_ref::<Int64Array>() {
        longs.value(row).to_string()
    } else if let Some(ints) = any.downcast_ref::<Int32Array>() {
        ints.value(row).to_string()
    } else if let Some(floats) = any.downcast_ref::<Float64Array>() {
        format!("{:.4}", floats.value(row))
    } else if let Some(flags) = any.downcast_ref::<BooleanArray>() {
        flags.value(row).to_string()
    } else {
        // Unsupported type: show the type name instead of the value.
        format!("<{}>", arr.data_type())
    }
}

impl Warehouse for IcebergWarehouse {
    /// Blocking bridge to [`IcebergWarehouse::append_bench_run_async`] on the
    /// warehouse's own runtime. Panics if that runtime is absent.
    fn append_bench_run(&self, repo: &str, run: &BenchRun) -> Result<Uuid> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.append_bench_run_async(repo, run))
    }

    /// Blocking bridge to [`IcebergWarehouse::query_bench_runs_async`] on the
    /// warehouse's own runtime. Panics if that runtime is absent.
    fn query_bench_runs(&self, filter: &BenchFilter) -> Result<Vec<BenchRun>> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.query_bench_runs_async(filter))
    }
}

impl IcebergWarehouse {
    /// Async form of [`Warehouse::query_bench_runs`]. Call this from code
    /// already on the warehouse runtime (e.g. inside `run_pipeline`, itself
    /// driven by `wh.block_on(...)`): the sync trait method does its own
    /// `block_on` and would panic if nested.
    pub async fn query_bench_runs_async(&self, filter: &BenchFilter) -> Result<Vec<BenchRun>> {
        async {
            let mut by_id: std::collections::HashMap<Uuid, BenchRun> =
                std::collections::HashMap::new();

            // bench_runs
            let table = self.catalog.load_table(&self.table_ident(TABLE_BENCH_RUNS)).await?;
            for batch in scan_repo_filtered(&table, filter.repo.as_deref()).await? {
                let ids = downcast::<StringArray>(&batch, 0)?;
                let repos = downcast::<StringArray>(&batch, 1)?;
                let ts = downcast::<TimestampMicrosecondArray>(&batch, 2)?;
                let dates = downcast::<StringArray>(&batch, 3)?;
                let versions = downcast::<StringArray>(&batch, 4)?;
                let machines = downcast::<StringArray>(&batch, 5)?;
                let cores = downcast::<Int32Array>(&batch, 6)?;
                for i in 0..batch.num_rows() {
                    if let Some(want) = &filter.repo {
                        if repos.value(i) != want { continue; }
                    }
                    if let Some(want) = &filter.machine {
                        if machines.value(i) != want { continue; }
                    }
                    let uid = Uuid::parse_str(ids.value(i))?;
                    let ts_dt = Utc.timestamp_micros(ts.value(i)).single()
                        .context("invalid micro timestamp")?;
                    by_id.insert(uid, BenchRun {
                        date: dates.value(i).to_string(),
                        timestamp: Some(ts_dt.to_rfc3339()),
                        version: versions.value(i).to_string(),
                        machine: machines.value(i).to_string(),
                        cores: cores.value(i).max(0) as u32,
                        results: Vec::new(),
                        tests: Vec::new(),
                    });
                }
            }

            // bench_results
            let table = self.catalog.load_table(&self.table_ident(TABLE_BENCH_RESULTS)).await?;
            for batch in scan_all(&table).await? {
                let ids = downcast::<StringArray>(&batch, 0)?;
                let names = downcast::<StringArray>(&batch, 1)?;
                let metrics = downcast::<StringArray>(&batch, 2)?;
                let values = downcast::<Float64Array>(&batch, 3)?;
                for i in 0..batch.num_rows() {
                    let uid = Uuid::parse_str(ids.value(i))?;
                    if let Some(run) = by_id.get_mut(&uid) {
                        let name = names.value(i).to_string();
                        let entry = run.results.iter_mut().find(|r| r.name == name);
                        let target = if let Some(e) = entry {
                            e
                        } else {
                            run.results.push(BenchResult { name, metrics: Default::default() });
                            run.results.last_mut().unwrap()
                        };
                        target.metrics.insert(
                            metrics.value(i).to_string(),
                            serde_json::Value::from(values.value(i)),
                        );
                    }
                }
            }

            // test_outcomes
            let table = self.catalog.load_table(&self.table_ident(TABLE_TEST_OUTCOMES)).await?;
            for batch in scan_all(&table).await? {
                let ids = downcast::<StringArray>(&batch, 0)?;
                let names = downcast::<StringArray>(&batch, 1)?;
                let passed = downcast::<BooleanArray>(&batch, 2)?;
                let durations = downcast::<Float64Array>(&batch, 3)?;
                let messages = downcast::<StringArray>(&batch, 4)?;
                for i in 0..batch.num_rows() {
                    let uid = Uuid::parse_str(ids.value(i))?;
                    if let Some(run) = by_id.get_mut(&uid) {
                        run.tests.push(TestOutcome {
                            name: names.value(i).to_string(),
                            passed: passed.value(i),
                            duration_ms: if durations.is_null(i) { None } else { Some(durations.value(i)) },
                            message: if messages.is_null(i) { None } else { Some(messages.value(i).to_string()) },
                        });
                    }
                }
            }

            let mut out: Vec<BenchRun> = by_id.into_values().collect();
            out.sort_by(|a, b| a.timestamp.cmp(&b.timestamp));
            if let Some(n) = filter.limit {
                let drop_n = out.len().saturating_sub(n);
                out.drain(..drop_n);
            }
            anyhow::Ok(out)
        }.await
    }
}

impl IcebergWarehouse {
    /// Blocking read of `bench_telemetry`, optionally narrowed to one `repo`.
    /// Each returned row is one `(run_id, bench)` pair — the CPU/mem footprint
    /// the in-bench sampler captured. Runs
    /// [`IcebergWarehouse::query_bench_telemetry_async`] on the warehouse's own
    /// runtime.
    ///
    /// # Panics
    /// Panics if the warehouse's runtime is absent (`rt` is `None`).
    pub fn query_bench_telemetry(&self, repo: Option<&str>) -> Result<Vec<BenchTelemetryRow>> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.query_bench_telemetry_async(repo))
    }

    /// Async form of [`IcebergWarehouse::query_bench_telemetry`].
    pub async fn query_bench_telemetry_async(
        &self,
        repo: Option<&str>,
    ) -> Result<Vec<BenchTelemetryRow>> {
        let table = self.catalog.load_table(&self.table_ident(TABLE_BENCH_TELEMETRY)).await?;
        let mut out = Vec::new();
        for batch in scan_repo_filtered(&table, repo).await? {
            let run_ids = downcast::<StringArray>(&batch, 0)?;
            let repos = downcast::<StringArray>(&batch, 1)?;
            let benches = downcast::<StringArray>(&batch, 2)?;
            let n_cores = downcast::<Int32Array>(&batch, 3)?;
            let cpu_avg = downcast::<Float64Array>(&batch, 4)?;
            let cpu_max = downcast::<Float64Array>(&batch, 5)?;
            let busy_avg = downcast::<Float64Array>(&batch, 6)?;
            let busy_max = downcast::<Int32Array>(&batch, 7)?;
            let mem_peak = downcast::<Float64Array>(&batch, 8)?;
            let mem_pct = downcast::<Float64Array>(&batch, 9)?;
            let elapsed = downcast::<Float64Array>(&batch, 10)?;
            for i in 0..batch.num_rows() {
                if let Some(want) = repo {
                    if repos.value(i) != want {
                        continue;
                    }
                }
                out.push(BenchTelemetryRow {
                    run_id: run_ids.value(i).to_string(),
                    repo: repos.value(i).to_string(),
                    bench: benches.value(i).to_string(),
                    n_cores: n_cores.value(i).max(0) as u32,
                    cpu_pct_avg: cpu_avg.value(i),
                    cpu_pct_max: cpu_max.value(i),
                    cores_busy_avg: busy_avg.value(i),
                    cores_busy_max: busy_max.value(i).max(0) as u32,
                    mem_peak_mb: mem_peak.value(i),
                    mem_pct_max: mem_pct.value(i),
                    elapsed_ms: elapsed.value(i),
                });
            }
        }
        Ok(out)
    }
}

/// One persisted `bench_telemetry` row — the CPU/mem footprint of a single
/// bench unit. Mirrors [`crate::bench::telemetry::Telemetry`] plus the keys
/// (`run_id`, `repo`, `bench`).
///
/// Field order matches the table's column order, which the reader decodes
/// positionally (columns 0 through 10).
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct BenchTelemetryRow {
    /// Run identifier, stored as the string form of the run's UUID.
    pub run_id: String,
    /// Repository the run belongs to; the column scans filter on.
    pub repo: String,
    /// Name of the bench result this footprint was captured for.
    pub bench: String,
    /// Core count reported by the sampler; negative stored values read as 0.
    pub n_cores: u32,
    /// Average CPU usage — presumably a percentage; confirm units against
    /// `Telemetry`.
    pub cpu_pct_avg: f64,
    /// Peak CPU usage — presumably a percentage; confirm units against
    /// `Telemetry`.
    pub cpu_pct_max: f64,
    /// Average number of busy cores (as reported by the sampler).
    pub cores_busy_avg: f64,
    /// Maximum number of busy cores; negative stored values read as 0.
    pub cores_busy_max: u32,
    /// Peak memory — presumably in megabytes, per the field name; confirm.
    pub mem_peak_mb: f64,
    /// Peak memory usage — presumably a percentage; confirm against `Telemetry`.
    pub mem_pct_max: f64,
    /// Elapsed time — presumably in milliseconds, per the field name; confirm.
    pub elapsed_ms: f64,
}

pub(crate) async fn append_batch(catalog: &RedbCatalog, table: Table, batch: RecordBatch) -> Result<()> {
    // `skade::append` owns the whole write path — ParquetWriter →
    // RollingFileWriter → DataFileWriter → fast_append, including
    // identity-partition-key tagging — and recasts the batch to the table's
    // field-id schema. Here we only wrap the single batch as a slice.
    let single = [batch];
    skade::append(catalog, &table, &single[..]).await?;
    Ok(())
}

/// Bring `table`'s stored schema up to `canonical` when it lacks any of
/// `canonical`'s **top-level columns** — an Iceberg **add-column** migration
/// (per the schema-evolution LAW in `.nornir/warehouse-schema.md`).
///
/// Motivation: writers always build the full canonical Arrow batch, which
/// fails with `number of columns(N) must match number of fields(M)` against a
/// table created by an *older* binary with fewer columns (e.g.
/// `agent_model_runs` shipped 13 cols, then grew `agent` / `cost_usd` /
/// `mcp_tool_calls` → 16). Down-projecting the write would silently drop the
/// new data, so the stale table is evolved forward instead: the missing
/// fields are added as OPTIONAL columns. Old data files are untouched (Iceberg
/// add-column is metadata-only; missing values read back as null), and the
/// read path already defaults those nulls.
///
/// Returns the table to append into — the input table unchanged when every
/// canonical column is already present (steady state: one name
/// set-difference, no catalog write), otherwise a freshly re-loaded table.
///
/// iceberg-rust 0.9.1 exposes no public schema-mutation `Transaction` action,
/// so the commit goes through skade's `RedbCatalog::commit_table` raw
/// primitive (`AddSchema` + `SetCurrentSchema`) — the same wire pieces an
/// Iceberg-REST `commit_table` would carry.
///
/// # Errors
/// Fails if the evolved schema cannot be built, or if the catalog commit or
/// the subsequent reload fails.
pub(crate) async fn ensure_table_schema(
    catalog: &RedbCatalog,
    ident: &TableIdent,
    table: Table,
    canonical: &iceberg::spec::Schema,
) -> Result<Table> {
    use std::collections::HashSet;

    use iceberg::spec::Schema;
    use iceberg::TableUpdate;

    let current = table.metadata().current_schema();
    let present: HashSet<&str> =
        current.as_struct().fields().iter().map(|f| f.name.as_str()).collect();

    // Canonical top-level columns the stored schema does not have yet, in
    // canonical order.
    let mut missing: Vec<&str> = Vec::new();
    for field in canonical.as_struct().fields() {
        let column = field.name.as_str();
        if !present.contains(column) {
            missing.push(column);
        }
    }
    if missing.is_empty() {
        // Steady state: nothing to add, hand the table straight back.
        return Ok(table);
    }

    // Evolve directly to the **canonical** schema. The `iceberg_schema::*`
    // fns are strictly append-only with stable field ids (the file-header
    // rule), so columns the stale table already has keep their ids and the
    // only difference is the added (trailing) columns — a pure Iceberg
    // add-column. The canonical schema is internally consistent (nested
    // list/struct element ids are unique by construction), so passing it to
    // `AddSchema` whole needs no field-id arithmetic here. Only the schema_id
    // is re-stamped (current + 1) so `AddSchema` registers a new schema
    // instead of colliding with the existing one.
    let next_schema_id = current.schema_id() + 1;
    let canonical_fields = canonical.as_struct().fields().to_vec();
    let evolved = Schema::builder()
        .with_schema_id(next_schema_id)
        .with_fields(canonical_fields)
        .build()?;

    eprintln!(
        "nornir: evolving warehouse table `{}` schema — adding column(s) [{}] \
         (Iceberg add-column migration)",
        ident.name(),
        missing.join(", "),
    );

    let add_schema = TableUpdate::AddSchema { schema: evolved };
    // `-1` = "the schema just added" (iceberg metadata-builder convention).
    let make_current = TableUpdate::SetCurrentSchema { schema_id: -1 };
    catalog
        .commit_table(ident.clone(), Vec::new(), vec![add_schema, make_current])
        .await?;

    // Re-load so the caller appends against the new current schema.
    let reloaded = catalog.load_table(ident).await?;
    Ok(reloaded)
}

async fn scan_all(table: &Table) -> Result<Vec<RecordBatch>> {
    // Read the table's full current snapshot into Arrow batches; the actual
    // scan is delegated to skade's reader.
    let batches = skade::read_all(table).await?;
    Ok(batches)
}

/// Scan `table`, but **stop pulling from the stream as soon as `limit` rows
/// have been collected** rather than materializing the whole table (which is
/// what `skade::read_all` does — it `try_collect`s every data file before the
/// caller gets a chance to truncate).
///
/// This targets the bottleneck the viz callgraph/dep-graph hits on large
/// workspaces: `njord`'s `symbol_facts`/`call_edges` are ~15 GB across ~6 k
/// Iceberg data files, so `read_all` reads all of it (~5 s/table) before the
/// preview keeps the first 8 000 rows. Driving the scan's Arrow **stream**
/// and stopping at `limit` rows means only the first few data files are read;
/// the remaining batches (and their parquet reads) are never pulled. The
/// per-batch row count is capped so the last batch does not overshoot the
/// limit by much — the result may still hold somewhat more than `limit` rows.
/// `limit == 0` falls back to a full scan.
///
/// Caveat: the rows returned are *some* `limit` rows, not a deterministic
/// top-N — Iceberg gives no ordering guarantee. The preview never promised
/// one (it already truncated arbitrarily), and the callgraph
/// re-filters/-caps the rows itself, so an arbitrary prefix is fine.
async fn scan_limited(table: &Table, limit: usize) -> Result<Vec<RecordBatch>> {
    use futures::StreamExt;
    if limit == 0 {
        return scan_all(table).await;
    }
    // Batch size follows `limit`, bounded to [256, 8192], so the stream can
    // stop near the limit without degenerating into tiny batches.
    let rows_per_batch = limit.clamp(256, 8192);
    let scan = table
        .scan()
        .select_all()
        .with_batch_size(Some(rows_per_batch))
        .build()?;
    let mut stream = scan.to_arrow().await?;

    let mut collected = Vec::new();
    let mut rows_so_far = 0usize;
    // The loop ends early once enough rows are in hand, or naturally when the
    // table holds fewer rows than the limit.
    while let Some(next) = stream.next().await {
        let batch = next?;
        rows_so_far += batch.num_rows();
        collected.push(batch);
        if rows_so_far >= limit {
            break;
        }
    }
    // `stream` is dropped on return, which cancels the rest of the scan: no
    // further parquet files are opened or decoded.
    Ok(collected)
}

/// Variant of [`scan_all`] that, when `repo` is `Some`, pushes a
/// `repo == <repo>` predicate into the scan planner so the engine can prune
/// other repos' data files / row-groups instead of reading them. With `None`
/// the scan is unfiltered.
///
/// No projection is applied, so column order is preserved and positional
/// [`downcast`] stays valid. Callers must keep their own per-row repo check:
/// pushdown prunes at file granularity, not per row.
async fn scan_repo_filtered(table: &Table, repo: Option<&str>) -> Result<Vec<RecordBatch>> {
    let unfiltered = table.scan();
    let builder = match repo {
        Some(wanted) => {
            unfiltered.with_filter(Reference::new("repo").equal_to(Datum::string(wanted)))
        }
        None => unfiltered,
    };
    let stream = builder.build()?.to_arrow().await?;
    Ok(stream.try_collect::<Vec<RecordBatch>>().await?)
}

fn downcast<T: 'static>(batch: &RecordBatch, idx: usize) -> Result<&T> {
    // Borrow column `idx` as the concrete array type `T`; a mismatch is
    // reported as an error naming the column's actual Arrow data type.
    let column = batch.column(idx);
    match column.as_any().downcast_ref::<T>() {
        Some(typed) => Ok(typed),
        None => Err(anyhow!("column {idx} has unexpected type {:?}", column.data_type())),
    }
}

fn resolve_timestamp(run: &BenchRun) -> Result<DateTime<Utc>> {
    // An explicit RFC 3339 timestamp takes precedence; otherwise the run's
    // `%Y-%m-%d` date is taken as midnight UTC of that day.
    match &run.timestamp {
        Some(s) => {
            let parsed = DateTime::parse_from_rfc3339(s)
                .with_context(|| format!("parse timestamp {s}"))?;
            Ok(parsed.with_timezone(&Utc))
        }
        None => {
            let day = NaiveDate::parse_from_str(&run.date, "%Y-%m-%d")
                .with_context(|| format!("parse date {}", run.date))?;
            // 00:00:00 is a valid time of day, so this is always `Some`.
            let midnight: NaiveDateTime = day.and_hms_opt(0, 0, 0).unwrap();
            Ok(Utc.from_utc_datetime(&midnight))
        }
    }
}

/// Assemble the single-row `bench_runs` batch for one run.
///
/// The Arrow schema is derived from the table's current iceberg schema (and
/// so carries `PARQUET:field_id` metadata). Columns, in order: run id, repo,
/// timestamp (microseconds, `+00:00`), date, version, machine, cores.
fn build_bench_runs_batch(
    table: &Table,
    run_id: Uuid,
    repo: &str,
    ts: DateTime<Utc>,
    run: &BenchRun,
) -> Result<RecordBatch> {
    let arrow_schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);
    // One-element string column.
    let text = |value: String| -> Arc<dyn Array> { Arc::new(StringArray::from(vec![value])) };
    let recorded_at =
        TimestampMicrosecondArray::from(vec![ts.timestamp_micros()]).with_timezone("+00:00");
    let columns: Vec<Arc<dyn Array>> = vec![
        text(run_id.to_string()),
        text(repo.to_string()),
        Arc::new(recorded_at),
        text(run.date.clone()),
        text(run.version.clone()),
        text(run.machine.clone()),
        Arc::new(Int32Array::from(vec![run.cores as i32])),
    ];
    let batch = RecordBatch::try_new(arrow_schema, columns)?;
    Ok(batch)
}

fn build_bench_results_batch(table: &Table, run_id: Uuid, run: &BenchRun) -> Result<RecordBatch> {
    // One output row per (bench, numeric metric). Non-numeric metric values,
    // and numbers with no `f64` form, are left out.
    let arrow_schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);
    let mut bench_names: Vec<String> = Vec::new();
    let mut metric_names: Vec<String> = Vec::new();
    let mut metric_values: Vec<f64> = Vec::new();
    for result in &run.results {
        for (metric, raw) in &result.metrics {
            // `telem_*` keys are the in-bench sampler's CPU/mem footprint; they
            // ride the metrics map only to survive the stdout-JSON hop and are
            // persisted into the dedicated `bench_telemetry` table instead, so
            // they never pollute the scalar metric set / docs comparison.
            if metric.starts_with(crate::bench::telemetry::TELEM_PREFIX) {
                continue;
            }
            let serde_json::Value::Number(number) = raw else { continue };
            let Some(value) = number.as_f64() else { continue };
            bench_names.push(result.name.clone());
            metric_names.push(metric.clone());
            metric_values.push(value);
        }
    }
    // Every row carries the same run id.
    let run_ids = vec![run_id.to_string(); metric_values.len()];
    let columns: Vec<Arc<dyn Array>> = vec![
        Arc::new(StringArray::from(run_ids)),
        Arc::new(StringArray::from(bench_names)),
        Arc::new(StringArray::from(metric_names)),
        Arc::new(Float64Array::from(metric_values)),
    ];
    let batch = RecordBatch::try_new(arrow_schema, columns)?;
    Ok(batch)
}

/// Assemble the `bench_telemetry` batch: one row for every result whose
/// metrics carry `telem_*` telemetry (the in-bench sampler's footprint).
/// Results without telemetry (legacy runs, or benches that produced none)
/// add no row, so a missing sampler degrades to an empty batch rather than
/// zero-filled noise.
fn build_bench_telemetry_batch(
    table: &Table,
    run_id: Uuid,
    repo: &str,
    run: &BenchRun,
) -> Result<RecordBatch> {
    let arrow_schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);
    // Gather (bench name, telemetry) pairs once, then project each column.
    let sampled: Vec<_> = run
        .results
        .iter()
        .filter_map(|r| {
            crate::bench::telemetry::from_metrics(&r.metrics).map(|t| (r.name.clone(), t))
        })
        .collect();
    let run_key = run_id.to_string();
    let row_count = sampled.len();

    let columns: Vec<Arc<dyn Array>> = vec![
        Arc::new(StringArray::from(vec![run_key; row_count])),
        Arc::new(StringArray::from(vec![repo.to_string(); row_count])),
        Arc::new(StringArray::from(
            sampled.iter().map(|(name, _)| name.clone()).collect::<Vec<String>>(),
        )),
        Arc::new(Int32Array::from(
            sampled.iter().map(|(_, t)| t.n_cores as i32).collect::<Vec<_>>(),
        )),
        Arc::new(Float64Array::from(
            sampled.iter().map(|(_, t)| t.cpu_pct_avg).collect::<Vec<_>>(),
        )),
        Arc::new(Float64Array::from(
            sampled.iter().map(|(_, t)| t.cpu_pct_max).collect::<Vec<_>>(),
        )),
        Arc::new(Float64Array::from(
            sampled.iter().map(|(_, t)| t.cores_busy_avg).collect::<Vec<_>>(),
        )),
        Arc::new(Int32Array::from(
            sampled.iter().map(|(_, t)| t.cores_busy_max as i32).collect::<Vec<_>>(),
        )),
        Arc::new(Float64Array::from(
            sampled.iter().map(|(_, t)| t.mem_peak_mb).collect::<Vec<_>>(),
        )),
        Arc::new(Float64Array::from(
            sampled.iter().map(|(_, t)| t.mem_pct_max).collect::<Vec<_>>(),
        )),
        Arc::new(Float64Array::from(
            sampled.iter().map(|(_, t)| t.elapsed_ms).collect::<Vec<_>>(),
        )),
    ];
    let batch = RecordBatch::try_new(arrow_schema, columns)?;
    Ok(batch)
}

fn build_test_outcomes_batch(table: &Table, run_id: Uuid, run: &BenchRun) -> Result<RecordBatch> {
    // One row per entry of `run.tests`, all tagged with the same run id.
    let arrow_schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);
    let tests = &run.tests;
    let row_count = tests.len();

    // The last five columns are optional metadata for impact selection +
    // flake quarantine. The bench harness doesn't populate them yet, but the
    // schema requires the columns to be present, so they are all-null.
    let null_text = || -> Arc<dyn Array> {
        Arc::new(StringArray::from(vec![None::<String>; row_count]))
    };

    let columns: Vec<Arc<dyn Array>> = vec![
        Arc::new(StringArray::from(vec![run_id.to_string(); row_count])),
        Arc::new(StringArray::from(
            tests.iter().map(|t| t.name.clone()).collect::<Vec<String>>(),
        )),
        Arc::new(BooleanArray::from(tests.iter().map(|t| t.passed).collect::<Vec<bool>>())),
        Arc::new(Float64Array::from(
            tests.iter().map(|t| t.duration_ms).collect::<Vec<Option<f64>>>(),
        )),
        Arc::new(StringArray::from(
            tests.iter().map(|t| t.message.clone()).collect::<Vec<Option<String>>>(),
        )),
        null_text(),                                                // source_repo
        null_text(),                                                // source_crate
        Arc::new(Int32Array::from(vec![None::<i32>; row_count])),   // attempt_idx
        null_text(),                                                // retry_of_run_id
        Arc::new(BooleanArray::from(vec![None::<bool>; row_count])), // quarantined
    ];
    let batch = RecordBatch::try_new(arrow_schema, columns)?;
    Ok(batch)
}

// Silence dead-code warnings for items used only via the public re-export.
// NOTE(review): the body is empty, so this function itself references no
// import or item; the `allow` only suppresses the dead-code warning for the
// function. Confirm whether the anchor is still needed.
#[allow(dead_code)]
fn _unused_imports_anchor() {}

// ─── knowledge-map writers ──────────────────────────────────────────────

impl IcebergWarehouse {
    /// Blocking entry point for
    /// [`append_symbol_scan_async`](Self::append_symbol_scan_async): writes a
    /// [`crate::knowledge::symbols::SymbolScan`] into `symbol_facts`,
    /// `call_edges` and `feature_gate_facts`, one table after the other. If a
    /// later append fails, tables that were already written stay written
    /// (iceberg snapshots are atomic per table).
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn append_symbol_scan(
        &self,
        scan: &crate::knowledge::symbols::SymbolScan,
    ) -> Result<()> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.append_symbol_scan_async(scan))
    }

    /// Blocking wrapper that stores a cross-repo dependency-graph snapshot by
    /// driving the async [`crate::warehouse::dep_graph::record_dep_graph`] to
    /// completion on this warehouse's private runtime. Returns the snapshot id.
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn record_dep_graph(
        &self,
        workspace_name: &str,
        graph: &crate::warehouse::dep_graph::WorkspaceGraph,
    ) -> Result<uuid::Uuid> {
        let runtime = self.rt.as_ref().expect("rt present");
        let recording = crate::warehouse::dep_graph::record_dep_graph(self, workspace_name, graph);
        runtime.block_on(recording)
    }

    /// Async writer behind [`append_symbol_scan`](Self::append_symbol_scan).
    ///
    /// Appends the scan's symbols, call edges and feature gates to the tables
    /// named by `TABLE_SYMBOL_FACTS`, `TABLE_CALL_EDGES` and
    /// `TABLE_FEATURE_GATE_FACTS`, in that order. An empty collection skips its
    /// table entirely. Every row leads with the same three columns: snapshot
    /// id, scan timestamp (microseconds, tagged `+00:00`) and repo.
    ///
    /// # Errors
    /// Returns the first failure from loading a table, converting its schema,
    /// assembling the batch or appending it. Tables appended before the
    /// failure are left as written.
    pub async fn append_symbol_scan_async(
        &self,
        scan: &crate::knowledge::symbols::SymbolScan,
    ) -> Result<()> {
        let scanned_at = scan.ts.timestamp_micros();
        let snapshot = scan.snapshot_id.to_string();

        // Builders for the three leading columns shared by every fact table.
        let snapshot_col = |len: usize| -> Arc<dyn Array> {
            Arc::new(StringArray::from(vec![snapshot.clone(); len]))
        };
        let ts_col = |len: usize| -> Arc<dyn Array> {
            Arc::new(TimestampMicrosecondArray::from(vec![scanned_at; len]).with_timezone("+00:00"))
        };
        let repo_col = |len: usize| -> Arc<dyn Array> {
            Arc::new(StringArray::from(vec![scan.repo.clone(); len]))
        };

        // ── symbol_facts ──
        let symbols = &scan.symbols;
        if !symbols.is_empty() {
            let table = self.catalog.load_table(&self.table_ident(TABLE_SYMBOL_FACTS)).await?;
            let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);
            let len = symbols.len();
            let columns: Vec<Arc<dyn Array>> = vec![
                snapshot_col(len),
                ts_col(len),
                repo_col(len),
                Arc::new(StringArray::from(symbols.iter().map(|sym| sym.crate_name.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(symbols.iter().map(|sym| sym.module_path.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(symbols.iter().map(|sym| sym.item_kind.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(symbols.iter().map(|sym| sym.item_name.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(symbols.iter().map(|sym| sym.visibility.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(symbols.iter().map(|sym| sym.file.clone()).collect::<Vec<_>>())),
                Arc::new(Int32Array::from(symbols.iter().map(|sym| sym.line as i32).collect::<Vec<_>>())),
                Arc::new(Int32Array::from(symbols.iter().map(|sym| sym.doc_lines as i32).collect::<Vec<_>>())),
                Arc::new(StringArray::from(symbols.iter().map(|sym| sym.signature.clone()).collect::<Vec<_>>())),
            ];
            append_batch(&self.catalog, table, RecordBatch::try_new(schema, columns)?).await?;
        }

        // ── call_edges ──
        let calls = &scan.calls;
        if !calls.is_empty() {
            let table = self.catalog.load_table(&self.table_ident(TABLE_CALL_EDGES)).await?;
            let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);
            let len = calls.len();
            let columns: Vec<Arc<dyn Array>> = vec![
                snapshot_col(len),
                ts_col(len),
                repo_col(len),
                Arc::new(StringArray::from(calls.iter().map(|edge| edge.crate_name.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(calls.iter().map(|edge| edge.caller_path.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(calls.iter().map(|edge| edge.callee_ident.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(calls.iter().map(|edge| edge.call_kind.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(calls.iter().map(|edge| edge.file.clone()).collect::<Vec<_>>())),
                Arc::new(Int32Array::from(calls.iter().map(|edge| edge.line as i32).collect::<Vec<_>>())),
            ];
            append_batch(&self.catalog, table, RecordBatch::try_new(schema, columns)?).await?;
        }

        // ── feature_gate_facts ──
        let gates = &scan.features;
        if !gates.is_empty() {
            let table = self.catalog.load_table(&self.table_ident(TABLE_FEATURE_GATE_FACTS)).await?;
            let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);
            let len = gates.len();
            let columns: Vec<Arc<dyn Array>> = vec![
                snapshot_col(len),
                ts_col(len),
                repo_col(len),
                Arc::new(StringArray::from(gates.iter().map(|gate| gate.crate_name.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(gates.iter().map(|gate| gate.module_path.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(gates.iter().map(|gate| gate.item_name.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(gates.iter().map(|gate| gate.cfg_expr.clone()).collect::<Vec<_>>())),
                Arc::new(StringArray::from(gates.iter().map(|gate| gate.file.clone()).collect::<Vec<_>>())),
                Arc::new(Int32Array::from(gates.iter().map(|gate| gate.line as i32).collect::<Vec<_>>())),
            ];
            append_batch(&self.catalog, table, RecordBatch::try_new(schema, columns)?).await?;
        }

        Ok(())
    }

    /// Blocking wrapper over
    /// [`append_git_heat_scan_async`](Self::append_git_heat_scan_async): stores a
    /// [`crate::knowledge::git_heat::GitHeatScan`] in `git_heat_facts`.
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn append_git_heat_scan(
        &self,
        scan: &crate::knowledge::git_heat::GitHeatScan,
    ) -> Result<()> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.append_git_heat_scan_async(scan))
    }

    /// Async writer behind [`append_git_heat_scan`](Self::append_git_heat_scan).
    ///
    /// Appends one row per entry of `scan.files` to the table named by
    /// `TABLE_GIT_HEAT_FACTS`; each row carries the scan's snapshot id,
    /// timestamp and repo followed by the per-file counters and last-commit
    /// timestamp. A scan with no files returns `Ok(())` without touching the
    /// catalog.
    pub async fn append_git_heat_scan_async(
        &self,
        scan: &crate::knowledge::git_heat::GitHeatScan,
    ) -> Result<()> {
        let files = &scan.files;
        if files.is_empty() {
            return Ok(());
        }
        let len = files.len();
        let scanned_at = scan.ts.timestamp_micros();
        let snapshot = scan.snapshot_id.to_string();

        let table = self.catalog.load_table(&self.table_ident(TABLE_GIT_HEAT_FACTS)).await?;
        let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);

        // Both timestamp columns are microseconds tagged with the `+00:00` zone.
        let scan_ts = TimestampMicrosecondArray::from(vec![scanned_at; len]).with_timezone("+00:00");
        let last_commit_ts = TimestampMicrosecondArray::from(
            files.iter().map(|f| f.last_commit_ts.timestamp_micros()).collect::<Vec<_>>(),
        )
        .with_timezone("+00:00");

        let columns: Vec<Arc<dyn Array>> = vec![
            Arc::new(StringArray::from(vec![snapshot; len])),
            Arc::new(scan_ts),
            Arc::new(StringArray::from(vec![scan.repo.clone(); len])),
            Arc::new(StringArray::from(files.iter().map(|f| f.file.clone()).collect::<Vec<_>>())),
            Arc::new(Int64Array::from(files.iter().map(|f| f.commits_total).collect::<Vec<_>>())),
            Arc::new(Int64Array::from(files.iter().map(|f| f.commits_30d).collect::<Vec<_>>())),
            Arc::new(Int64Array::from(files.iter().map(|f| f.commits_90d).collect::<Vec<_>>())),
            Arc::new(Int64Array::from(files.iter().map(|f| f.authors_total).collect::<Vec<_>>())),
            Arc::new(last_commit_ts),
        ];
        append_batch(&self.catalog, table, RecordBatch::try_new(schema, columns)?).await?;
        Ok(())
    }

    /// Append MCP tool-call telemetry rows to the table named by
    /// `TABLE_MCP_REQUESTS` as a single batch (one snapshot). Only metrics are
    /// stored — timestamp, tool, status, latency — never payloads. Called by the
    /// nornir-mcp server's logging thread. An empty slice is a no-op.
    pub async fn append_mcp_calls_async(&self, calls: &[McpCall]) -> Result<()> {
        if calls.is_empty() {
            return Ok(());
        }
        let table = self.catalog.load_table(&self.table_ident(TABLE_MCP_REQUESTS)).await?;
        let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);

        // Split the rows into per-column vectors in one pass.
        let mut stamps = Vec::with_capacity(calls.len());
        let mut tools = Vec::with_capacity(calls.len());
        let mut statuses = Vec::with_capacity(calls.len());
        let mut latencies = Vec::with_capacity(calls.len());
        for call in calls {
            stamps.push(call.ts_micros);
            tools.push(call.tool.clone());
            statuses.push(call.status.clone());
            latencies.push(call.latency_ms);
        }

        let columns: Vec<Arc<dyn Array>> = vec![
            Arc::new(TimestampMicrosecondArray::from(stamps).with_timezone("+00:00")),
            Arc::new(StringArray::from(tools)),
            Arc::new(StringArray::from(statuses)),
            Arc::new(Int64Array::from(latencies)),
        ];
        append_batch(&self.catalog, table, RecordBatch::try_new(schema, columns)?).await?;
        Ok(())
    }

    /// Blocking wrapper over [`append_mcp_calls_async`](Self::append_mcp_calls_async).
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn append_mcp_calls(&self, calls: &[McpCall]) -> Result<()> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.append_mcp_calls_async(calls))
    }

    /// Aggregate the `mcp_requests` table into per-tool usage stats (call count,
    /// error count, mean latency, last-seen), sorted by call count descending.
    pub async fn query_mcp_stats_async(&self) -> Result<Vec<McpToolStat>> {
        let table = self.catalog.load_table(&self.table_ident(TABLE_MCP_REQUESTS)).await?;
        let batches = scan_all(&table).await?;
        struct Acc {
            calls: u64,
            errors: u64,
            lat_sum: i128,
            last_ts: i64,
        }
        let mut map: std::collections::HashMap<String, Acc> = std::collections::HashMap::new();
        for b in &batches {
            let ts = downcast::<TimestampMicrosecondArray>(b, 0)?;
            let tool = downcast::<StringArray>(b, 1)?;
            let status = downcast::<StringArray>(b, 2)?;
            let lat = downcast::<Int64Array>(b, 3)?;
            for i in 0..b.num_rows() {
                let e = map.entry(tool.value(i).to_string()).or_insert(Acc {
                    calls: 0,
                    errors: 0,
                    lat_sum: 0,
                    last_ts: i64::MIN,
                });
                e.calls += 1;
                if status.value(i) != "ok" {
                    e.errors += 1;
                }
                e.lat_sum += lat.value(i) as i128;
                e.last_ts = e.last_ts.max(ts.value(i));
            }
        }
        let mut out: Vec<McpToolStat> = map
            .into_iter()
            .map(|(tool, a)| McpToolStat {
                tool,
                calls: a.calls,
                errors: a.errors,
                avg_latency_ms: if a.calls > 0 { a.lat_sum as f64 / a.calls as f64 } else { 0.0 },
                last_ts_micros: a.last_ts,
            })
            .collect();
        out.sort_by(|a, b| b.calls.cmp(&a.calls).then_with(|| a.tool.cmp(&b.tool)));
        Ok(out)
    }

    /// Blocking wrapper over [`query_mcp_stats_async`](Self::query_mcp_stats_async).
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn query_mcp_stats(&self) -> Result<Vec<McpToolStat>> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.query_mcp_stats_async())
    }

    /// The `vuln_findings` cache: `crate@version → (advisory ids, summary)`. The
    /// security scan reads this warehouse-first; only un-cached crate@versions
    /// hit OSV. Empty `ids` = checked and clean (still a cache hit).
    pub async fn query_vuln_findings_async(
        &self,
    ) -> Result<std::collections::HashMap<(String, String), (Vec<String>, String, i64)>> {
        let table = self.catalog.load_table(&self.table_ident(TABLE_VULN_FINDINGS)).await?;
        let batches = scan_all(&table).await?;
        // Value = (advisory ids, summary, checked_at_micros) — keep the LATEST
        // check per crate@version (the table is append-only, so re-checks add rows).
        let mut map: std::collections::HashMap<(String, String), (Vec<String>, String, i64)> =
            std::collections::HashMap::new();
        for b in &batches {
            let kr = downcast::<StringArray>(b, 0)?;
            let ver = downcast::<StringArray>(b, 1)?;
            let ids = downcast::<StringArray>(b, 2)?;
            let sum = downcast::<StringArray>(b, 3)?;
            let ts = downcast::<TimestampMicrosecondArray>(b, 4)?;
            for i in 0..b.num_rows() {
                let key = (kr.value(i).to_string(), ver.value(i).to_string());
                let checked = ts.value(i);
                let entry = map.entry(key).or_insert((Vec::new(), String::new(), i64::MIN));
                if checked >= entry.2 {
                    let id_vec: Vec<String> =
                        ids.value(i).split(',').filter(|s| !s.is_empty()).map(String::from).collect();
                    *entry = (id_vec, sum.value(i).to_string(), checked);
                }
            }
        }
        Ok(map)
    }

    /// Blocking wrapper over [`query_vuln_findings_async`](Self::query_vuln_findings_async).
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn query_vuln_findings(
        &self,
    ) -> Result<std::collections::HashMap<(String, String), (Vec<String>, String, i64)>> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.query_vuln_findings_async())
    }

    /// Store freshly-checked `crate@version` vulnerability rows, filling the
    /// cache read by [`query_vuln_findings_async`](Self::query_vuln_findings_async).
    /// Advisory ids are written as one comma-joined string per row. An empty
    /// slice is a no-op.
    pub async fn append_vuln_findings_async(&self, rows: &[VulnFinding]) -> Result<()> {
        if rows.is_empty() {
            return Ok(());
        }
        let table = self.catalog.load_table(&self.table_ident(TABLE_VULN_FINDINGS)).await?;
        let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);

        let crates: Vec<String> = rows.iter().map(|finding| finding.crate_name.clone()).collect();
        let versions: Vec<String> = rows.iter().map(|finding| finding.version.clone()).collect();
        let joined_ids: Vec<String> = rows.iter().map(|finding| finding.ids.join(",")).collect();
        let summaries: Vec<String> = rows.iter().map(|finding| finding.summary.clone()).collect();
        let checked_at: Vec<i64> = rows.iter().map(|finding| finding.checked_at_micros).collect();

        let columns: Vec<Arc<dyn Array>> = vec![
            Arc::new(StringArray::from(crates)),
            Arc::new(StringArray::from(versions)),
            Arc::new(StringArray::from(joined_ids)),
            Arc::new(StringArray::from(summaries)),
            Arc::new(TimestampMicrosecondArray::from(checked_at).with_timezone("+00:00")),
        ];
        append_batch(&self.catalog, table, RecordBatch::try_new(schema, columns)?).await?;
        Ok(())
    }

    /// Blocking wrapper over [`append_vuln_findings_async`](Self::append_vuln_findings_async).
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn append_vuln_findings(&self, rows: &[VulnFinding]) -> Result<()> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.append_vuln_findings_async(rows))
    }

    /// Store a freshly-resolved SBOM component set for `repo`, one row per
    /// component, all sharing the capture `snapshot_id` and a single capture
    /// timestamp taken from `Utc::now()`. The table is append-only: the latest
    /// snapshot per repo is the authoritative one (see
    /// [`query_sbom_components`](Self::query_sbom_components)). An empty
    /// component slice is a no-op.
    pub async fn append_sbom_components_async(
        &self,
        repo: &str,
        snapshot_id: Uuid,
        components: &[SbomComponentRow],
    ) -> Result<()> {
        if components.is_empty() {
            return Ok(());
        }
        let table = self.catalog.load_table(&self.table_ident(TABLE_SBOM_COMPONENTS)).await?;
        let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);

        let rows = components.len();
        let snapshot = snapshot_id.to_string();
        let captured_at = Utc::now().timestamp_micros();

        let names: Vec<String> = components.iter().map(|comp| comp.name.clone()).collect();
        let versions: Vec<String> = components.iter().map(|comp| comp.version.clone()).collect();
        let licenses: Vec<String> = components.iter().map(|comp| comp.license.clone()).collect();

        let columns: Vec<Arc<dyn Array>> = vec![
            Arc::new(StringArray::from(vec![snapshot; rows])),
            Arc::new(TimestampMicrosecondArray::from(vec![captured_at; rows]).with_timezone("+00:00")),
            Arc::new(StringArray::from(vec![repo.to_string(); rows])),
            Arc::new(StringArray::from(names)),
            Arc::new(StringArray::from(versions)),
            Arc::new(StringArray::from(licenses)),
        ];
        append_batch(&self.catalog, table, RecordBatch::try_new(schema, columns)?).await?;
        Ok(())
    }

    /// Blocking wrapper over
    /// [`append_sbom_components_async`](Self::append_sbom_components_async).
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn append_sbom_components(
        &self,
        repo: &str,
        snapshot_id: Uuid,
        components: &[SbomComponentRow],
    ) -> Result<()> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.append_sbom_components_async(repo, snapshot_id, components))
    }

    /// The latest captured SBOM component set for `repo` (the rows of the
    /// most-recent `snapshot_id`, by capture ts). `None` when the repo has never
    /// been captured — the caller should then fall back to live `cargo metadata`.
    ///
    /// The scan is filtered on `repo`, and both passes below additionally
    /// re-check the `repo` column row by row, so the result does not depend on
    /// the scan filter being row-exact. When two snapshots share the maximum
    /// timestamp, the one encountered first is kept.
    ///
    /// # Errors
    /// Fails if the table cannot be loaded or scanned, or if a column does not
    /// downcast to the expected Arrow array type.
    pub async fn query_sbom_components_async(
        &self,
        repo: &str,
    ) -> Result<Option<Vec<SbomComponentRow>>> {
        let table = self.catalog.load_table(&self.table_ident(TABLE_SBOM_COMPONENTS)).await?;
        let scan = table
            .scan()
            .with_filter(iceberg::expr::Reference::new("repo").equal_to(Datum::string(repo)))
            .build()?;
        let batches: Vec<RecordBatch> = scan.to_arrow().await?.try_collect().await?;

        // Pass 1: find the latest snapshot_id for this repo (max ts_micros).
        let mut latest: Option<(String, i64)> = None;
        for b in &batches {
            let snaps = downcast::<StringArray>(b, 0)?;
            let ts = downcast::<TimestampMicrosecondArray>(b, 1)?;
            let repos = downcast::<StringArray>(b, 2)?;
            for i in 0..b.num_rows() {
                if repos.value(i) != repo {
                    continue;
                }
                let t = ts.value(i);
                if latest.as_ref().map_or(true, |(_, lt)| t > *lt) {
                    latest = Some((snaps.value(i).to_string(), t));
                }
            }
        }
        let Some((snap, _)) = latest else { return Ok(None) };

        // Pass 2: collect that snapshot's rows. The repo is re-checked here as
        // well: a snapshot id shared by several repos must not pull another
        // repo's components into this one's SBOM.
        let mut out = Vec::new();
        for b in &batches {
            let snaps = downcast::<StringArray>(b, 0)?;
            let repos = downcast::<StringArray>(b, 2)?;
            let name = downcast::<StringArray>(b, 3)?;
            let version = downcast::<StringArray>(b, 4)?;
            let license = downcast::<StringArray>(b, 5)?;
            for i in 0..b.num_rows() {
                if repos.value(i) != repo || snaps.value(i) != snap {
                    continue;
                }
                out.push(SbomComponentRow {
                    name: name.value(i).to_string(),
                    version: version.value(i).to_string(),
                    license: license.value(i).to_string(),
                });
            }
        }
        Ok(Some(out))
    }

    /// Blocking wrapper over
    /// [`query_sbom_components_async`](Self::query_sbom_components_async).
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn query_sbom_components(&self, repo: &str) -> Result<Option<Vec<SbomComponentRow>>> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.query_sbom_components_async(repo))
    }

    /// Reports whether `(repo, git_sha)` is already present in the
    /// knowledge-scan ledger. `true` lets the republish path skip the
    /// (expensive) re-scan — the SHA is immutable, so its facts can't have
    /// changed.
    ///
    /// The scan is filtered on `repo`; each returned row is still compared
    /// against both `repo` and `git_sha` before it counts as a match.
    pub async fn knowledge_scan_exists_async(&self, repo: &str, git_sha: &str) -> Result<bool> {
        let table = self.catalog.load_table(&self.table_ident(TABLE_KNOWLEDGE_SCANS)).await?;
        let repo_filter = iceberg::expr::Reference::new("repo").equal_to(Datum::string(repo));
        let scan = table.scan().with_filter(repo_filter).build()?;
        let batches: Vec<RecordBatch> = scan.to_arrow().await?.try_collect().await?;

        for batch in &batches {
            let repo_col = downcast::<StringArray>(batch, 0)?;
            let sha_col = downcast::<StringArray>(batch, 1)?;
            let found = (0..batch.num_rows())
                .any(|row| repo_col.value(row) == repo && sha_col.value(row) == git_sha);
            if found {
                return Ok(true);
            }
        }
        Ok(false)
    }

    /// Blocking wrapper over
    /// [`knowledge_scan_exists_async`](Self::knowledge_scan_exists_async).
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn knowledge_scan_exists(&self, repo: &str, git_sha: &str) -> Result<bool> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.knowledge_scan_exists_async(repo, git_sha))
    }

    /// Note in the ledger that `(repo, git_sha)` was knowledge-scanned into
    /// `snapshot_id`, stamped with the current UTC time.
    ///
    /// Idempotent at the ledger level: if
    /// [`knowledge_scan_exists`](Self::knowledge_scan_exists) already holds for
    /// the pair, nothing is written. The existence check and the append are
    /// two separate steps.
    pub async fn record_knowledge_scan_async(
        &self,
        repo: &str,
        git_sha: &str,
        snapshot_id: Uuid,
    ) -> Result<()> {
        let already_recorded = self.knowledge_scan_exists_async(repo, git_sha).await?;
        if already_recorded {
            return Ok(());
        }

        let table = self.catalog.load_table(&self.table_ident(TABLE_KNOWLEDGE_SCANS)).await?;
        let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);

        // A single-row batch: repo, sha, snapshot id, recorded-at.
        let recorded_at =
            TimestampMicrosecondArray::from(vec![Utc::now().timestamp_micros()]).with_timezone("+00:00");
        let columns: Vec<Arc<dyn Array>> = vec![
            Arc::new(StringArray::from(vec![repo.to_string()])),
            Arc::new(StringArray::from(vec![git_sha.to_string()])),
            Arc::new(StringArray::from(vec![snapshot_id.to_string()])),
            Arc::new(recorded_at),
        ];
        append_batch(&self.catalog, table, RecordBatch::try_new(schema, columns)?).await?;
        Ok(())
    }

    /// Blocking wrapper over
    /// [`record_knowledge_scan_async`](Self::record_knowledge_scan_async).
    ///
    /// # Panics
    /// Panics with `"rt present"` if the warehouse's runtime (`self.rt`) is absent.
    pub fn record_knowledge_scan(&self, repo: &str, git_sha: &str, snapshot_id: Uuid) -> Result<()> {
        let runtime = self.rt.as_ref().expect("rt present");
        runtime.block_on(self.record_knowledge_scan_async(repo, git_sha, snapshot_id))
    }
}

/// One SBOM component row (a resolved dependency: name + version + SPDX license
/// expression, or `NOASSERTION`). Persisted by a deep-scan / metadata sweep and
/// read back to assemble the CycloneDX SBOM from the warehouse.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SbomComponentRow {
    /// Name of the resolved dependency.
    pub name: String,
    /// Resolved version of the dependency.
    pub version: String,
    /// SPDX license expression, or `NOASSERTION`.
    pub license: String,
}

/// One MCP tool-call telemetry row (metrics only — no request/response payload).
///
/// Rows are written to the `mcp_requests` table by
/// [`IcebergWarehouse::append_mcp_calls`] in field order: timestamp, tool,
/// status, latency.
#[derive(Debug, Clone)]
pub struct McpCall {
    /// Call completion time, microseconds since the Unix epoch (UTC).
    pub ts_micros: i64,
    /// Tool / method name that was invoked.
    pub tool: String,
    /// `"ok"` or `"err"`. The stats aggregation counts every value other than
    /// `"ok"` as an error.
    pub status: String,
    /// Wall-clock duration of the call, milliseconds.
    pub latency_ms: i64,
}

/// One cached vulnerability-scan row, keyed by the immutable `crate@version`.
/// `ids` empty = checked and clean.
#[derive(Debug, Clone)]
pub struct VulnFinding {
    /// Crate name — first half of the cache key.
    pub crate_name: String,
    /// Crate version — second half of the cache key.
    pub version: String,
    /// Advisory ids found for this crate@version. Stored as one comma-joined
    /// string and split on `,` when read back, so an id must not contain a comma.
    pub ids: Vec<String>,
    /// Summary text stored alongside the ids.
    pub summary: String,
    /// When the check was made, in microseconds; written to a microsecond
    /// timestamp column tagged `+00:00`.
    pub checked_at_micros: i64,
}

/// Per-tool aggregate of [`McpCall`] rows (see
/// [`IcebergWarehouse::query_mcp_stats`]).
#[derive(Debug, Clone)]
pub struct McpToolStat {
    /// Tool / method name the rows were grouped by.
    pub tool: String,
    /// Number of recorded calls for this tool.
    pub calls: u64,
    /// Number of those calls whose status was anything other than `"ok"`.
    pub errors: u64,
    /// Mean latency in milliseconds (sum of latencies divided by `calls`).
    pub avg_latency_ms: f64,
    /// Largest call timestamp seen for this tool, in microseconds.
    pub last_ts_micros: i64,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::bench::{BenchResult, BenchRun, TestOutcome};

    /// Fixture: a fixed-date `BenchRun` for `machine` carrying one bench result
    /// `"x"` whose only metric is `ops_sec = ops`, plus one passing test
    /// outcome named `"smoke"`.
    fn sample_run(machine: &str, ops: f64) -> BenchRun {
        let mut metrics = serde_json::Map::new();
        metrics.insert("ops_sec".into(), serde_json::json!(ops));
        let bench = BenchResult { name: "x".into(), metrics };

        let smoke = TestOutcome {
            name: "smoke".into(),
            passed: true,
            duration_ms: Some(1.5),
            message: None,
        };

        BenchRun {
            date: "2026-05-30".into(),
            timestamp: Some("2026-05-30T21:00:00Z".into()),
            version: "0.1.0".into(),
            machine: machine.into(),
            cores: 32,
            results: vec![bench],
            tests: vec![smoke],
        }
    }

    #[test]
    fn roundtrip_bench_run() {
        // Write one run into a fresh warehouse, read it back by repo, and
        // check that machine, cores, metrics and test outcomes all survive.
        let tmp = tempfile::tempdir().unwrap();
        let warehouse = IcebergWarehouse::open(tmp.path()).unwrap();
        let _run_id = warehouse.append_bench_run("holger", &sample_run("ryzen", 123.0)).unwrap();

        let stored = warehouse.query_bench_runs(&BenchFilter::for_repo("holger")).unwrap();
        assert_eq!(stored.len(), 1, "expected exactly one bench run");
        let run = &stored[0];
        assert_eq!(run.machine, "ryzen");
        assert_eq!(run.cores, 32);

        // Bench results: one entry, metric value intact.
        assert_eq!(run.results.len(), 1);
        let bench = &run.results[0];
        assert_eq!(bench.name, "x");
        let ops_sec = bench.metrics.get("ops_sec").unwrap().as_f64().unwrap();
        assert!((ops_sec - 123.0).abs() < 1e-9);

        // Test outcomes: one passing entry with its duration.
        assert_eq!(run.tests.len(), 1);
        let outcome = &run.tests[0];
        assert!(outcome.passed);
        assert_eq!(outcome.duration_ms, Some(1.5));
    }

    // End-to-end check that sampler telemetry attached to a bench result is
    // persisted as a `bench_telemetry` row and kept out of `bench_results`.
    // This test deliberately burns CPU on every available core for ~2.2 s.
    #[test]
    fn bench_telemetry_lands_as_warehouse_rows() {
        // INJECT-AND-ASSERT (LAW 1): run the real sampler over a short synthetic
        // CPU workload, fold its result into a BenchResult exactly as the harness
        // does, persist via append_bench_run, then read `bench_telemetry` back and
        // assert the row carries the expected fields with plausible non-zero load
        // — NOT merely "didn't panic".
        let dir = tempfile::tempdir().unwrap();
        let wh = IcebergWarehouse::open(dir.path()).unwrap();

        // Short synthetic workload while the sampler runs.
        let sampler = crate::bench::telemetry::Sampler::start();
        let stop = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
        let mut handles = Vec::new();
        // One spinner per available core, but never fewer than two.
        let n = std::thread::available_parallelism().map(|x| x.get()).unwrap_or(2).max(2);
        for _ in 0..n {
            let s = std::sync::Arc::clone(&stop);
            handles.push(std::thread::spawn(move || {
                let mut x = 0u64;
                while !s.load(std::sync::atomic::Ordering::Relaxed) {
                    x = x.wrapping_mul(2654435761).wrapping_add(1);
                    // Keep the optimizer from deleting the busy loop.
                    std::hint::black_box(x);
                }
            }));
        }
        // Must stay above the 2000 ms lower bound asserted on `elapsed_ms` below.
        std::thread::sleep(std::time::Duration::from_millis(2200));
        stop.store(true, std::sync::atomic::Ordering::Relaxed);
        for h in handles {
            let _ = h.join();
        }
        let telem = sampler.stop();

        // Build a run carrying a normal scalar metric + the injected telemetry.
        let mut metrics = serde_json::Map::new();
        metrics.insert("ops_sec".into(), serde_json::json!(999.0));
        crate::bench::telemetry::inject_into_metrics(&mut metrics, &telem);
        let run = BenchRun {
            date: "2026-06-12".into(),
            timestamp: Some("2026-06-12T10:00:00Z".into()),
            version: "0.1.0".into(),
            machine: "test-host".into(),
            cores: telem.n_cores.max(1),
            results: vec![BenchResult { name: "demo.cpu_burn".into(), metrics }],
            tests: vec![],
        };
        let run_id = wh.append_bench_run("demo", &run).unwrap().to_string();

        // Assert the telemetry row landed with the expected columns.
        let rows = wh.query_bench_telemetry(Some("demo")).unwrap();
        assert_eq!(rows.len(), 1, "expected exactly one telemetry row");
        let row = &rows[0];
        assert_eq!(row.run_id, run_id);
        assert_eq!(row.repo, "demo");
        assert_eq!(row.bench, "demo.cpu_burn");
        assert!(row.elapsed_ms >= 2000.0, "elapsed_ms={}", row.elapsed_ms);
        // On Linux /proc is present → assert real, non-zero load was captured.
        // NOTE(review): this condition starts a second `Sampler` only to read
        // `n_cores()`; that sampler is dropped without an explicit `stop()` —
        // confirm dropping it cleans up, or probe `telem` instead.
        if crate::bench::telemetry::Sampler::start().n_cores() > 0
            || std::fs::read_to_string("/proc/stat").is_ok()
        {
            assert!(row.n_cores >= 1, "n_cores={}", row.n_cores);
            assert!(
                row.cores_busy_max >= 1,
                "expected ≥1 busy core under an N-thread spin, got {}",
                row.cores_busy_max
            );
            assert!(row.cpu_pct_max > 0.0, "cpu_pct_max={}", row.cpu_pct_max);
            assert!(row.mem_peak_mb > 0.0, "mem_peak_mb={}", row.mem_peak_mb);
        }

        // The `telem_*` keys must NOT have leaked into bench_results: the scalar
        // metric set there should contain ops_sec but no telemetry keys.
        let runs = wh.query_bench_runs(&BenchFilter::for_repo("demo")).unwrap();
        assert_eq!(runs.len(), 1);
        let res = &runs[0].results[0];
        assert!(res.metrics.contains_key("ops_sec"));
        assert!(
            !res.metrics.keys().any(|k| k.starts_with("telem_")),
            "telem_* keys must be routed to bench_telemetry, not bench_results: {:?}",
            res.metrics.keys().collect::<Vec<_>>()
        );
    }

    // Checks the lock-contention contract of the two open paths: with the
    // catalog already held, `open` fails with a catalog-lock error while
    // `open_read_only` falls back to a snapshot and still serves earlier rows.
    #[test]
    fn open_read_only_degrades_when_catalog_locked() {
        // Simulate the live nornir-server: open the warehouse exclusively and
        // keep it open (holds the redb flock on catalog.redb). A second plain
        // `open` against the same root must hit the lock; `open_read_only` must
        // degrade to a copied-aside snapshot and still read prior rows.
        let dir = tempfile::tempdir().unwrap();

        // Seed a row, then drop so the file is flushed but reopen below holds
        // the lock for the duration of the assertions.
        {
            let wh = IcebergWarehouse::open(dir.path()).unwrap();
            wh.append_bench_run("holger", &sample_run("ryzen", 42.0)).unwrap();
        }

        // The "server": exclusive holder of the lock for the rest of the test.
        // Bound to `_server` (not `_`) so it lives until the end of the function.
        let _server = IcebergWarehouse::open(dir.path()).unwrap();

        // Plain open must fail with the recognizable redb lock error.
        match IcebergWarehouse::open(dir.path()) {
            Ok(_) => panic!("plain open should not acquire an already-held lock"),
            Err(err) => assert!(
                is_catalog_lock_error(&err),
                "expected a catalog lock error, got: {err:#}"
            ),
        }

        // Lock-tolerant open must succeed via the snapshot fallback and still
        // see the row written before the lock was taken.
        let ro = IcebergWarehouse::open_read_only(dir.path())
            .expect("open_read_only should degrade to a snapshot, not error");
        assert!(ro._snapshot.is_some(), "expected a copied-aside snapshot");
        let runs = ro.query_bench_runs(&BenchFilter::for_repo("holger")).unwrap();
        assert_eq!(runs.len(), 1, "snapshot must reflect the pre-lock row");
        let ops = runs[0].results[0].metrics.get("ops_sec").unwrap().as_f64().unwrap();
        assert!((ops - 42.0).abs() < 1e-9);
    }

    // Checks that a lock-tolerant reader opened while a writer still holds the
    // catalog sees all three rows the writer committed before the reader opened.
    #[test]
    fn open_read_only_reads_latest_committed_under_live_lock() {
        // REGRESSION (the fat-viz "stale/empty snapshot" bug): the live
        // nornir-server holds the exclusive redb lock AND keeps committing
        // *while* a reader opens a lock-tolerant snapshot. The copied-aside
        // snapshot must reflect the LATEST committed state — the same rows the
        // server would serve — not a stale/empty point-in-time.
        //
        // Before the fix `open_snapshot` `fs::copy`'d the live `catalog.redb`
        // out from under an in-flight writer, so the copy could capture a torn /
        // short image — the reader then saw fewer rows than were committed
        // (Timeline empty/short) or crashed. After the fix the read reflects
        // every committed row.
        let dir = tempfile::tempdir().unwrap();

        // The "server": exclusive lock holder for the whole test. It writes the
        // FIRST row before any reader exists, then keeps writing below.
        let server = IcebergWarehouse::open(dir.path()).unwrap();
        server.append_bench_run("holger", &sample_run("ryzen", 1.0)).unwrap();

        // The server commits MORE rows (this is the state a fat viz must see).
        server.append_bench_run("holger", &sample_run("ryzen", 2.0)).unwrap();
        server.append_bench_run("holger", &sample_run("ryzen", 3.0)).unwrap();

        // A lock-tolerant reader opens while the server still holds the lock.
        let ro = IcebergWarehouse::open_read_only(dir.path())
            .expect("open_read_only should degrade to a snapshot, not error");
        assert!(
            ro._snapshot.is_some(),
            "server holds the lock → reader must use a copied-aside snapshot"
        );

        let runs = ro.query_bench_runs(&BenchFilter::for_repo("holger")).unwrap();
        let mut ops: Vec<f64> = runs
            .iter()
            .map(|r| r.results[0].metrics.get("ops_sec").unwrap().as_f64().unwrap())
            .collect();
        // Sort so the comparison does not depend on the order rows come back in.
        ops.sort_by(|a, b| a.total_cmp(b));
        assert_eq!(
            ops,
            vec![1.0, 2.0, 3.0],
            "lock-contended read must see EVERY committed row (latest state), not a \
             stale/empty subset"
        );

        // Release the reader before the server.
        drop(ro);
        drop(server);
    }

    #[test]
    fn open_read_only_snapshot_is_consistent_with_concurrent_writer() {
        // REGRESSION (the harder face of the same bug): a *concurrent* server
        // commit racing the snapshot copy. A naive `fs::copy` of the live
        // `catalog.redb` is not atomic — it can capture the file mid-commit
        // (torn pages / partial grow), so the opened copy either fails redb
        // recovery or rolls back to a STALE transaction missing rows the server
        // already committed. Hammer a writer while repeatedly snapshotting and
        // assert every snapshot is (a) openable and (b) sees at least every row
        // that was committed before the snapshot began — never an empty/torn
        // read.
        use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};

        /// Raises the writer's stop flag when dropped, so the writer loop ends
        /// even when an assertion in the reader loop panics and unwinds past
        /// the normal shutdown path.
        struct StopOnDrop<'a>(&'a AtomicBool);

        impl Drop for StopOnDrop<'_> {
            fn drop(&mut self) {
                self.0.store(true, Ordering::Relaxed);
            }
        }

        let dir = tempfile::tempdir().unwrap();
        let server = IcebergWarehouse::open(dir.path()).unwrap();
        // Seed one row so the catalog is non-empty before readers start.
        server.append_bench_run("holger", &sample_run("ryzen", 0.0)).unwrap();

        // Number of appends that have returned successfully (seed included).
        let committed = AtomicUsize::new(1);
        let stop = AtomicBool::new(false);

        // Scoped threads let the writer borrow `server`, `committed` and `stop`
        // directly, and guarantee the writer is joined before `server` and the
        // temp dir are dropped — on success AND on panic. Previously a failing
        // assertion left a detached writer appending into a directory that was
        // being deleted underneath it.
        let max_seen = std::thread::scope(|scope| {
            let writer = scope.spawn(|| {
                let mut n = 1.0_f64;
                while !stop.load(Ordering::Relaxed) {
                    server.append_bench_run("holger", &sample_run("ryzen", n)).unwrap();
                    committed.fetch_add(1, Ordering::SeqCst);
                    n += 1.0;
                }
            });
            // Declared after the spawn so that, on unwind, the flag is raised
            // before the scope blocks waiting for the writer to finish.
            let stop_writer = StopOnDrop(&stop);

            // Repeatedly snapshot while the writer races us.
            let mut max_seen = 0usize;
            for _ in 0..40 {
                // Lower bound on what's durably committed at the instant we snapshot.
                let floor = committed.load(Ordering::SeqCst);
                let ro = IcebergWarehouse::open_read_only(dir.path())
                    .expect("concurrent lock-tolerant open must not fail on a torn copy");
                let n = ro.query_bench_runs(&BenchFilter::for_repo("holger")).unwrap().len();
                // The snapshot must reflect at least everything committed before the
                // copy began — never a stale/empty subset of already-committed rows.
                assert!(
                    n >= floor,
                    "snapshot saw {n} rows but {floor} were already committed before the copy — stale read"
                );
                max_seen = max_seen.max(n);
                drop(ro);
            }

            // Normal shutdown: signal the writer, then surface any panic it hit.
            drop(stop_writer);
            writer.join().unwrap();
            max_seen
        });

        assert!(max_seen > 1, "writer should have advanced past the seed row");
    }

    #[test]
    fn copy_catalog_consistent_rejects_short_growing_copies_until_stable() {
        // Unit-level proof of the snapshot fix (the fat-viz crash / empty-Timeline
        // bug).
        //
        // Hazard: redb grows its file in regions — the header advertises the
        // larger layout first, and the file is extended afterwards. A copy that
        // straddles that window can end up *shorter than its own header claims*;
        // redb's open asserts `raw_file_len() >= header.layout().len()`, so such
        // an image panics or yields a stale read. The fix must therefore accept
        // only a copy taken across a window with NO growth (size-stable).
        //
        // The scenario is made deterministic by injecting the copy step: the
        // early copies behave like a copy that lost the race against a growing
        // writer (the destination lags the now-larger source), and later copies
        // are faithful. "Take the first copy" — the pre-fix behaviour — would
        // hand back the short image; the fix has to skip those and settle on the
        // final, size-stable one.
        use std::cell::Cell;

        /// Bytes appended to the source during each racing copy.
        const GROWTH_STEP: usize = 256 * 1024;
        /// Number of leading copy attempts that race a growth.
        const RACING_COPIES: usize = 3;

        let dir = tempfile::tempdir().unwrap();
        let live = dir.path().join("catalog.redb");
        let snap = dir.path().join("snap.redb");

        // The source begins life as 1 MiB of 0x01 bytes.
        std::fs::write(&live, vec![1u8; 1024 * 1024]).unwrap();

        let attempt = Cell::new(0usize);
        let first_short_copy_len = Cell::new(0u64);
        copy_catalog_consistent_with(&live, &snap, |s, d| {
            // Post-increment the attempt counter; `n` is this call's index.
            let n = attempt.replace(attempt.get() + 1);
            let src_len = std::fs::metadata(s).unwrap().len();
            if n >= RACING_COPIES {
                // Growth has quiesced: produce a full, faithful copy.
                std::fs::copy(s, d).unwrap();
            } else {
                // The length was sampled above; now grow the source and leave a
                // destination that only holds the pre-growth length — precisely
                // a copy that raced a region extension.
                let mut body = std::fs::read(s).unwrap();
                body.resize(body.len() + GROWTH_STEP, 7u8);
                std::fs::write(s, &body).unwrap(); // source is now larger
                std::fs::write(d, vec![1u8; src_len as usize]).unwrap(); // short copy
                if n == 0 {
                    first_short_copy_len.set(src_len);
                }
            }
            Ok(())
        })
        .expect("must converge on a size-stable copy");

        // The first copy is what a single blind copy would have returned; it must
        // be short compared with the source it was supposed to mirror.
        let stable_src_len = std::fs::metadata(&live).unwrap().len();
        let got = std::fs::metadata(&snap).unwrap().len();
        assert!(
            first_short_copy_len.get() < stable_src_len,
            "test precondition: the first (blind) copy must be short ({} < {})",
            first_short_copy_len.get(),
            stable_src_len
        );
        assert_eq!(
            got, stable_src_len,
            "fix must return the size-stable, full-length copy ({stable_src_len}), not a \
             short growth-torn image like the pre-fix blind copy ({})",
            first_short_copy_len.get()
        );
        // Reaching the stable copy must have required retries, which shows the
        // retry loop did the work rather than a lucky first attempt.
        assert!(attempt.get() >= 4, "expected retries past the short copies, got {}", attempt.get());
    }

    #[test]
    fn open_read_only_is_exclusive_when_unlocked() {
        // When nobody else holds the catalog, open_read_only acts like a plain
        // open: it takes no snapshot copy and can read everything.
        let dir = tempfile::tempdir().unwrap();

        // Write one run, then let go of the writer before the reader opens.
        let writer = IcebergWarehouse::open(dir.path()).unwrap();
        writer.append_bench_run("holger", &sample_run("ryzen", 7.0)).unwrap();
        drop(writer);

        let ro = IcebergWarehouse::open_read_only(dir.path()).unwrap();
        assert!(ro._snapshot.is_none(), "unlocked open must not snapshot");

        let holger = BenchFilter::for_repo("holger");
        let runs = ro.query_bench_runs(&holger).unwrap();
        assert_eq!(runs.len(), 1);
    }

    #[test]
    fn partitioned_bench_runs_scope_to_repo() {
        // bench_runs is partitioned by `repo`, so a repo-scoped query has to
        // return that repo's runs and nothing else (the write path tags each
        // file with its repo PartitionKey and the read filters on it).

        /// `ops_sec` metric of a run's first result.
        fn ops_sec(run: &BenchRun) -> f64 {
            run.results[0].metrics.get("ops_sec").unwrap().as_f64().unwrap()
        }

        let dir = tempfile::tempdir().unwrap();
        let wh = IcebergWarehouse::open(dir.path()).unwrap();
        // Interleave two repos so the scoped reads have something to exclude.
        for (repo, ops) in [("holger", 1.0), ("znippy", 2.0), ("holger", 3.0)] {
            wh.append_bench_run(repo, &sample_run("ryzen", ops)).unwrap();
        }

        let holger_runs = wh.query_bench_runs(&BenchFilter::for_repo("holger")).unwrap();
        let mut h: Vec<f64> = holger_runs.iter().map(ops_sec).collect();
        h.sort_by(f64::total_cmp);
        assert_eq!(h, vec![1.0, 3.0], "only holger's two runs");

        let z = wh.query_bench_runs(&BenchFilter::for_repo("znippy")).unwrap();
        assert_eq!(z.len(), 1, "only znippy's one run");
        let znippy_delta = ops_sec(&z[0]) - 2.0;
        assert!(znippy_delta.abs() < 1e-9);
    }

    #[test]
    fn filter_by_machine() {
        // Two runs for the same repo on different machines; filtering on
        // `machine` must return only the matching one.
        let dir = tempfile::tempdir().unwrap();
        let wh = IcebergWarehouse::open(dir.path()).unwrap();
        for (machine, ops_sec) in [("ryzen", 1.0), ("threadripper", 2.0)] {
            wh.append_bench_run("holger", &sample_run(machine, ops_sec)).unwrap();
        }

        let runs = wh
            .query_bench_runs(&BenchFilter {
                repo: Some("holger".into()),
                machine: Some("threadripper".into()),
                limit: None,
            })
            .unwrap();

        assert_eq!(runs.len(), 1);
        assert_eq!(runs[0].machine, "threadripper");
    }

    #[test]
    fn reopen_sees_previous_data() {
        // Data appended through one warehouse handle must still be there after
        // that handle is dropped and the same directory is opened again.
        let dir = tempfile::tempdir().unwrap();

        let first = IcebergWarehouse::open(dir.path()).unwrap();
        first.append_bench_run("holger", &sample_run("ryzen", 7.0)).unwrap();
        drop(first);

        let reopened = IcebergWarehouse::open(dir.path()).unwrap();
        let holger = BenchFilter::for_repo("holger");
        let runs = reopened.query_bench_runs(&holger).unwrap();
        assert_eq!(runs.len(), 1);
    }

    #[test]
    fn mcp_calls_round_trip_and_aggregate() {
        // Appends MCP call rows in two separate commits and checks the per-tool
        // aggregates (call count, error count, mean latency, last timestamp).
        let dir = tempfile::tempdir().unwrap();
        let wh = IcebergWarehouse::open(dir.path()).unwrap();
        let t0 = 1_900_000_000_000_000i64;

        // Row builder: timestamp is `t0` plus a microsecond offset.
        let call = |offset: i64, tool: &'static str, status: &'static str, latency_ms| McpCall {
            ts_micros: t0 + offset,
            tool: tool.into(),
            status: status.into(),
            latency_ms,
        };

        let first_batch = [
            call(0, "search", "ok", 12),
            call(1, "search", "err", 8),
            call(2, "search", "ok", 10),
            call(3, "deps_of", "ok", 20),
        ];
        wh.append_mcp_calls(&first_batch).unwrap();
        // A second append == a second snapshot; stats aggregate across both.
        wh.append_mcp_calls(&[call(4, "deps_of", "ok", 30)]).unwrap();

        let stats = wh.query_mcp_stats().unwrap();
        assert_eq!(stats.len(), 2);

        // Ordered by call count, highest first: search (3) precedes deps_of (2).
        let (search, deps_of) = (&stats[0], &stats[1]);

        assert_eq!(search.tool, "search");
        assert_eq!(search.calls, 3);
        assert_eq!(search.errors, 1);
        // (12 + 8 + 10) / 3
        assert!((search.avg_latency_ms - 10.0).abs() < 1e-9);

        assert_eq!(deps_of.tool, "deps_of");
        assert_eq!(deps_of.calls, 2);
        assert_eq!(deps_of.errors, 0);
        // (20 + 30) / 2
        assert!((deps_of.avg_latency_ms - 25.0).abs() < 1e-9);
        assert_eq!(deps_of.last_ts_micros, t0 + 4);
    }

    #[test]
    fn sbom_components_latest_snapshot_wins() {
        // A repo's SBOM query returns the components of its most recent capture
        // only, and `None` for a repo that has never been captured.
        let dir = tempfile::tempdir().unwrap();
        let wh = IcebergWarehouse::open(dir.path()).unwrap();

        // Component-row builder.
        let component = |name: &'static str, version: &'static str, license: &'static str| {
            SbomComponentRow { name: name.into(), version: version.into(), license: license.into() }
        };

        // Nothing captured yet → None (the caller falls back to live metadata).
        assert!(wh.query_sbom_components("znippy").unwrap().is_none());

        // First capture: two components.
        let snap1 = Uuid::new_v4();
        let first_capture = [
            component("serde", "1.0.0", "MIT"),
            component("anyhow", "1.0.80", "MIT OR Apache-2.0"),
        ];
        wh.append_sbom_components("znippy", snap1, &first_capture).unwrap();

        let got = wh.query_sbom_components("znippy").unwrap().unwrap();
        assert_eq!(got.len(), 2);
        let has_serde_1_0_0 = got
            .iter()
            .any(|c| c.name == "serde" && c.version == "1.0.0" && c.license == "MIT");
        assert!(has_serde_1_0_0);

        // Second capture under a newer snapshot id replaces the first.
        let snap2 = Uuid::new_v4();
        wh.append_sbom_components("znippy", snap2, &[component("serde", "1.0.1", "MIT")])
            .unwrap();
        let got = wh.query_sbom_components("znippy").unwrap().unwrap();
        assert_eq!(got.len(), 1, "only the latest snapshot's components");
        assert_eq!(got[0].version, "1.0.1");

        // Scoping: another repo without any capture still yields None.
        assert!(wh.query_sbom_components("other").unwrap().is_none());
    }

    #[test]
    fn knowledge_scan_ledger_is_idempotent_per_sha() {
        // The knowledge-scan ledger records each (repo, SHA) at most once and
        // tracks different SHAs / repos independently.
        let dir = tempfile::tempdir().unwrap();
        let wh = IcebergWarehouse::open(dir.path()).unwrap();

        // Unknown until recorded.
        let seen_before = wh.knowledge_scan_exists("znippy", "deadbeef").unwrap();
        assert!(!seen_before);

        wh.record_knowledge_scan("znippy", "deadbeef", Uuid::new_v4()).unwrap();
        let seen_after = wh.knowledge_scan_exists("znippy", "deadbeef").unwrap();
        assert!(seen_after);

        // Recording the same SHA again must be a no-op: the ledger still holds
        // exactly one row, so republishing an unchanged SHA does no work.
        wh.record_knowledge_scan("znippy", "deadbeef", Uuid::new_v4()).unwrap();
        let n = wh.block_on(async {
            let ident = wh.table_ident(TABLE_KNOWLEDGE_SCANS);
            let t = wh.catalog().load_table(&ident).await.unwrap();
            let batches = scan_all(&t).await.unwrap();
            let total: usize = batches.iter().map(|b| b.num_rows()).sum();
            total
        });
        assert_eq!(n, 1, "duplicate record_knowledge_scan must not add a row");

        // Another SHA, or the same SHA under another repo, is a separate entry.
        assert!(!wh.knowledge_scan_exists("znippy", "cafef00d").unwrap());
        assert!(!wh.knowledge_scan_exists("other", "deadbeef").unwrap());
        wh.record_knowledge_scan("znippy", "cafef00d", Uuid::new_v4()).unwrap();
        assert!(wh.knowledge_scan_exists("znippy", "cafef00d").unwrap());
    }

    /// A/B benchmark — nornir's `append_batch` write path vs `skade`'s
    /// `Table::append`, over an identical batch + schema, run for BOTH an
    /// unpartitioned table AND one partitioned by `repo` (head-to-head). Since
    /// the migration `append_batch` *is* skade, Path A measures it through
    /// nornir's real `load_table`-per-commit wrapper and Path B through skade's
    /// reused handle; the partitioned rows are single-valued on `repo`, so each
    /// commit is one partition (the shape skade writes). The extra column the
    /// partitioned run exercises is skade's per-commit `partition_key_for`
    /// (identity-key derivation) — the bench shows its overhead is negligible.
    /// Each iteration is one append == one Iceberg snapshot; read-back is a full
    /// scan of all snapshots.
    ///
    /// Besides printing throughput numbers, the test asserts that both paths
    /// read back the same row count and that it equals `commits * rows`.
    ///
    /// Run: `cargo test -p nornir --release skade_ab_write_read -- --ignored --nocapture`
    /// Tunables: `AB_COMMITS` (default 200), `AB_ROWS` per batch (default 1000).
    #[test]
    #[ignore = "A/B benchmark; run with --release --ignored --nocapture"]
    fn skade_ab_write_read() {
        use std::sync::Arc;
        use std::time::{Duration, Instant};
        use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};

        // Reads tunable `k` from the environment as a `usize`; falls back to the
        // default `d` when the variable is unset or does not parse.
        let env = |k: &str, d: usize| {
            std::env::var(k).ok().and_then(|v| v.parse().ok()).unwrap_or(d)
        };
        let commits = env("AB_COMMITS", 200);
        let rows = env("AB_ROWS", 1000);

        // bench_runs-shaped synthetic batch; all rows share `repo = "nornir"` so a
        // partitioned-by-`repo` table sees one partition per commit.
        let arrow_schema = Arc::new(ArrowSchema::new(vec![
            Field::new("id", DataType::Utf8, false),
            Field::new("repo", DataType::Utf8, false),
            Field::new("machine", DataType::Utf8, false),
            Field::new("cores", DataType::Int32, false),
            Field::new("seq", DataType::Int64, false),
            Field::new("score", DataType::Float64, false),
        ]));
        // Column arrays built once; each path wraps them in its OWN table's
        // field-id-annotated arrow schema (append_batch maps Parquet columns by
        // Iceberg field id; skade remaps internally) so the comparison is the
        // write path, not schema plumbing.
        let columns: Vec<arrow::array::ArrayRef> = vec![
            Arc::new(StringArray::from((0..rows).map(|i| format!("id-{i:08}")).collect::<Vec<_>>())),
            Arc::new(StringArray::from(vec!["nornir"; rows])),
            Arc::new(StringArray::from(vec!["oden"; rows])),
            Arc::new(Int32Array::from(vec![32i32; rows])),
            Arc::new(Int64Array::from((0..rows as i64).collect::<Vec<_>>())),
            Arc::new(Float64Array::from((0..rows).map(|i| i as f64).collect::<Vec<_>>())),
        ];
        // Totals used as numerators for the throughput figures. The MB figure is
        // based on the in-memory Arrow array size, not on bytes written to disk.
        let bytes_per_batch: usize = columns.iter().map(|c| c.get_array_memory_size()).sum();
        let total_rows = (commits * rows) as f64;
        let total_mb = (commits * bytes_per_batch) as f64 / 1e6;
        let ice_schema = skade::arrow_to_iceberg(&arrow_schema).unwrap();

        // One runtime shared by both paths; each path runs inside `block_on`.
        let rt = tokio::runtime::Builder::new_multi_thread().enable_all().build().unwrap();

        // Throughput helpers over a whole-run duration: MB/s, rows/s, commits/s.
        let mbs = |dt: Duration| total_mb / dt.as_secs_f64();
        let rps = |dt: Duration| total_rows / dt.as_secs_f64();
        let cps = |dt: Duration| commits as f64 / dt.as_secs_f64();
        println!("\n══ skade A/B  (commits={commits}, rows/batch={rows}, {total_mb:.1} MB, {total_rows:.0} rows) ══");

        // Per-scenario (label, total write duration) pairs for each path, kept
        // for the partition-overhead summary printed after the loop.
        let mut a_writes: Vec<(&str, Duration)> = Vec::new();
        let mut b_writes: Vec<(&str, Duration)> = Vec::new();

        // Two scenarios, in this order: unpartitioned first, partitioned second
        // (the overhead summary below relies on that ordering).
        for (label, part_cols) in [
            ("unpartitioned", &[] as &[&str]),
            ("partitioned by `repo`", &["repo"] as &[&str]),
        ] {
            // ── Path A: nornir append_batch (load_table per commit) ──
            let dir_a = tempfile::tempdir().unwrap();
            let (a_write, a_read, a_rows) = rt.block_on(async {
                // Setup (catalog, namespace, table, batch) happens before the
                // timer starts and is not part of the measured write time.
                let wh_dir = dir_a.path().join("warehouse");
                std::fs::create_dir_all(&wh_dir).unwrap();
                let catalog = RedbCatalogBuilder::default()
                    .db_path(dir_a.path().join("catalog.redb"))
                    .warehouse_location(format!("file://{}", wh_dir.canonicalize().unwrap().display()))
                    .with_storage_factory(Arc::new(LocalFsStorageFactory))
                    .load("nornir", HashMap::new())
                    .await
                    .unwrap();
                let ns = NamespaceIdent::new("nornir".to_string());
                catalog.create_namespace(&ns, HashMap::new()).await.unwrap();
                create_partitioned_table_if_missing(&catalog, &ns, "ab", ice_schema.clone(), part_cols)
                    .await
                    .unwrap();
                let ident = TableIdent::new(ns, "ab".into());
                let table0 = catalog.load_table(&ident).await.unwrap();
                // Wrap the shared columns in the arrow schema derived from this
                // table's current Iceberg schema.
                let a_schema = Arc::new(schema_to_arrow_schema(table0.metadata().current_schema()).unwrap());
                let batch_a = RecordBatch::try_new(a_schema, columns.clone()).unwrap();

                // Timed write: the per-commit `load_table` is inside the timed
                // loop on purpose — it is part of Path A's cost.
                let t0 = Instant::now();
                for _ in 0..commits {
                    let table = catalog.load_table(&ident).await.unwrap();
                    append_batch(&catalog, table, batch_a.clone()).await.unwrap();
                }
                let write = t0.elapsed();

                // Timed read: the table load is excluded, only the scan is timed.
                let table = catalog.load_table(&ident).await.unwrap();
                let t1 = Instant::now();
                let batches = scan_all(&table).await.unwrap();
                let read = t1.elapsed();
                (write, read, batches.iter().map(|b| b.num_rows()).sum::<usize>())
            });

            // ── Path B: skade Table::append (handle reuse) ──
            let dir_b = tempfile::tempdir().unwrap();
            let (b_write, b_read, b_rows) = rt.block_on(async {
                let wh = skade::open(dir_b.path()).await.unwrap();
                let mut table = wh.create_partitioned_table("ab", &arrow_schema, part_cols).await.unwrap();
                let batch_b = RecordBatch::try_new(table.arrow_schema().unwrap(), columns.clone()).unwrap();
                // Timed write: the same `table` handle is reused for every commit.
                let t0 = Instant::now();
                for _ in 0..commits {
                    table.append(std::slice::from_ref(&batch_b)).await.unwrap();
                }
                let write = t0.elapsed();
                // Timed read through the same handle.
                let t1 = Instant::now();
                let batches = table.read().await.unwrap();
                let read = t1.elapsed();
                (write, read, batches.iter().map(|b| b.num_rows()).sum::<usize>())
            });

            // Sanity checks: the benchmark is only meaningful if both paths
            // stored and returned every appended row.
            assert_eq!(a_rows, b_rows, "[{label}] both paths must read back the same row count");
            assert_eq!(a_rows, commits * rows, "[{label}] all appended rows must be readable");

            // The printed ratios are nornir time ÷ skade time, so a value above
            // 1.0 means the skade path was faster.
            println!("\n── {label} ──");
            println!("WRITE  nornir(append_batch): {:>8.0} rows/s  {:>6.1} commits/s  {:>6.1} MB/s  ({:?})",
                rps(a_write), cps(a_write), mbs(a_write), a_write);
            println!("WRITE  skade(append):        {:>8.0} rows/s  {:>6.1} commits/s  {:>6.1} MB/s  ({:?})",
                rps(b_write), cps(b_write), mbs(b_write), b_write);
            println!("       → skade/nornir write ratio: {:.2}x  ({})",
                a_write.as_secs_f64() / b_write.as_secs_f64(),
                if b_write <= a_write { "skade ≥ nornir ✓" } else { "skade slower" });
            println!("READ   nornir(scan_all):     {:>8.0} rows/s  ({:?})", rps(a_read), a_read);
            println!("READ   skade(read):          {:>8.0} rows/s  ({:?})", rps(b_read), b_read);
            println!("       → skade/nornir read ratio:  {:.2}x",
                a_read.as_secs_f64() / b_read.as_secs_f64());

            a_writes.push((label, a_write));
            b_writes.push((label, b_write));
        }

        // Partition overhead = partitioned write time / unpartitioned write time
        // (the cost of skade's per-commit identity partition_key_for).
        // Index 0 is the unpartitioned scenario and index 1 the partitioned one,
        // matching the order of the scenario array above.
        if a_writes.len() == 2 {
            let ov = |v: &[(&str, Duration)]| v[1].1.as_secs_f64() / v[0].1.as_secs_f64();
            println!("\n── partition overhead (partitioned ÷ unpartitioned write) ──");
            println!("       nornir(append_batch): {:.2}x   skade(append): {:.2}x", ov(&a_writes), ov(&b_writes));
        }
    }
}