nornir 0.4.32 - Docs.rs

//! Tantivy index ⇄ iceberg blob persistence (time machine).
//!
//! Snapshot: walks `<repo>/.nornir/cache/index/` (or any Tantivy
//! `MmapDirectory` root) and records every regular file in the
//! `tantivy_index_blobs` iceberg table — one payload-free manifest row per
//! file plus one content row per distinct sha256 — together with one
//! metadata row in `tantivy_index_snapshots`. Keyed by `(repo, git_sha,
//! schema_hash)` — re-snapshotting an unchanged tree is a no-op.
//!
//! Restore: SELECTs the manifest rows for a given snapshot (or the
//! latest snapshot for `(repo, git_sha)`), resolves each file's sha256 to
//! its stored bytes, and writes the files back into a target directory.
//! The caller then opens that directory with
//! `tantivy::directory::MmapDirectory::open` + `tantivy::Index::open_in_dir`.
//! No re-tokenize, no rebuild — Tantivy mmaps the bytes back as if
//! they had been written by the original `IndexWriter`.
//!
//! Same shape (snapshots + blobs) will be reused for DWARF / gimli
//! captures later; this module is the reference impl.

use std::path::{Path, PathBuf};
use std::sync::Arc;

use anyhow::{anyhow, Context, Result};
use arrow::array::{Array, Int32Array, Int64Array, LargeBinaryArray, RecordBatch, StringArray, TimestampMicrosecondArray};
use chrono::Utc;
use futures::TryStreamExt;
use iceberg::arrow::schema_to_arrow_schema;
use iceberg::expr::Reference;
use iceberg::spec::Datum;
use iceberg::Catalog;
use sha2::{Digest, Sha256};
use uuid::Uuid;

use crate::warehouse::blob_store::{put_blobs_dedup_async, BlobRow};
use crate::warehouse::iceberg::{
    append_batch, IcebergWarehouse, TABLE_TANTIVY_INDEX_BLOBS, TABLE_TANTIVY_INDEX_SNAPSHOTS,
};

/// Fetch the column called `name` from `batch` and downcast it to the
/// concrete arrow array type `T`.
///
/// Lookup is by **name**, never by position: a scan built with
/// `.select([...])` can reorder or drop columns, so a positional
/// `batch.column(0)` could hand back the wrong array.
///
/// # Errors
/// Fails when `batch` has no column named `name`, or when that column is
/// not a `T`.
fn col<'a, T: 'static>(batch: &'a RecordBatch, name: &str) -> Result<&'a T> {
    let column = match batch.column_by_name(name) {
        Some(column) => column,
        None => return Err(anyhow!("projected batch missing column `{name}`")),
    };
    match column.as_any().downcast_ref::<T>() {
        Some(typed) => Ok(typed),
        None => Err(anyhow!("column `{name}` has unexpected arrow type")),
    }
}

/// Render `bytes` as a lowercase hexadecimal string, two digits per byte
/// (high nibble first). An empty slice yields an empty string.
fn hex_encode(bytes: &[u8]) -> String {
    const DIGITS: [char; 16] = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
    ];
    bytes
        .iter()
        .flat_map(|&byte| {
            [
                DIGITS[usize::from(byte >> 4)],
                DIGITS[usize::from(byte & 0x0f)],
            ]
        })
        .collect()
}

/// Identity of a captured Tantivy index in iceberg.
///
/// Returned by both the snapshot and the restore entry points. The field
/// names match the snapshot-table columns that restore reads back.
#[derive(Debug, Clone)]
pub struct SnapshotRef {
    /// Unique id of the snapshot row (a fresh v4 UUID when first written).
    pub snapshot_id: Uuid,
    /// Workspace label supplied by the caller at snapshot time.
    pub workspace: String,
    /// Repository label; part of the idempotency key.
    pub repo: String,
    /// Git commit identifier supplied by the caller; part of the idempotency key.
    pub git_sha: String,
    /// Branch label supplied by the caller at snapshot time.
    pub branch: String,
    /// Hex SHA-256 over the snapshot's sorted `(filename, sha256)` pairs;
    /// part of the idempotency key.
    pub schema_hash: String,
    /// Number of files captured in the snapshot.
    pub blob_count: i32,
    /// Sum of the captured files' sizes, in bytes.
    pub total_bytes: i64,
}

/// In-memory representation of a single blob row: one file read from the
/// index directory, held fully in memory together with its content hash.
struct Blob {
    /// Path relative to the walked root, converted to a `String` lossily.
    filename: String,
    /// Complete file contents.
    bytes: Vec<u8>,
    /// Lowercase hex SHA-256 of `bytes`.
    sha256: String,
}

/// Walk `dir` recursively, collecting every regular file as a blob.
/// Filenames are stored relative to `dir` so restore lands them in
/// the same shape.
///
/// Each file is read fully into memory and hashed with SHA-256. Entries
/// that are neither directories nor regular files are skipped; that
/// includes symlinks, because `DirEntry::metadata` does not follow them.
/// The result is sorted by filename so downstream digests do not depend on
/// directory iteration order.
///
/// Returns an empty list when `dir` does not exist.
///
/// # Errors
/// Any failure to list a directory, stat an entry, or read a file aborts
/// the walk; the error names the path involved.
fn collect_blobs(dir: &Path) -> Result<Vec<Blob>> {
    let mut out = Vec::new();
    if !dir.exists() {
        return Ok(out);
    }
    // Explicit stack instead of recursion: depth is bounded by the heap,
    // not the call stack.
    let mut stack: Vec<PathBuf> = vec![dir.to_path_buf()];
    while let Some(cur) = stack.pop() {
        let entries = std::fs::read_dir(&cur)
            .with_context(|| format!("read_dir {}", cur.display()))?;
        for entry in entries {
            let entry =
                entry.with_context(|| format!("read directory entry in {}", cur.display()))?;
            let path = entry.path();
            let meta = entry
                .metadata()
                .with_context(|| format!("stat {}", path.display()))?;
            if meta.is_dir() {
                stack.push(path);
            } else if meta.is_file() {
                let bytes = std::fs::read(&path)
                    .with_context(|| format!("read {}", path.display()))?;
                let mut h = Sha256::new();
                h.update(&bytes);
                let sha256 = hex_encode(&h.finalize());
                // `path` always descends from `dir`, so the prefix strips;
                // the fallback only exists to avoid a panic path.
                let filename = path
                    .strip_prefix(dir)
                    .unwrap_or(&path)
                    .to_string_lossy()
                    .into_owned();
                out.push(Blob { filename, bytes, sha256 });
            }
        }
    }
    out.sort_by(|a, b| a.filename.cmp(&b.filename));
    Ok(out)
}

/// Digest of a blob list's `(filename, sha256)` pairs, as lowercase hex.
///
/// Every blob contributes one `<filename>:<sha256>\n` record to a single
/// SHA-256. The records are fed in slice order, so the digest is only
/// independent of capture order because `collect_blobs` hands over a
/// filename-sorted list.
fn schema_hash_of(blobs: &[Blob]) -> String {
    let mut digest = Sha256::new();
    for blob in blobs {
        let record: [&[u8]; 4] = [
            blob.filename.as_bytes(),
            b":",
            blob.sha256.as_bytes(),
            b"\n",
        ];
        for part in record {
            digest.update(part);
        }
    }
    hex_encode(&digest.finalize())
}

/// Idempotency probe: return the id of a snapshot already recorded in
/// `snap_table` for exactly `(repo, git_sha, schema_hash)`, or `None` when
/// there is no such row.
///
/// # Errors
/// Fails if the table cannot be loaded or scanned, if an expected column is
/// missing or mistyped, or if the stored `snapshot_id` is not a valid UUID.
async fn find_existing_snapshot(
    wh: &IcebergWarehouse,
    snap_table: &str,
    repo: &str,
    git_sha: &str,
    schema_hash: &str,
) -> Result<Option<Uuid>> {
    let ident = wh.table_ident(snap_table);
    let table = wh.catalog().load_table(&ident).await?;

    // This probe runs on every snapshot write, so it must not degrade into
    // a full-table scan as history grows. Push the exact key down as a
    // filter and project just the id plus the three key columns (the key
    // columns are re-checked per row below).
    let repo_matches = Reference::new("repo").equal_to(Datum::string(repo));
    let sha_matches = Reference::new("git_sha").equal_to(Datum::string(git_sha));
    let hash_matches = Reference::new("schema_hash").equal_to(Datum::string(schema_hash));
    let predicate = repo_matches.and(sha_matches).and(hash_matches);

    let scan = table
        .scan()
        .with_filter(predicate)
        .select(["snapshot_id", "repo", "git_sha", "schema_hash"])
        .build()?;
    let stream = scan.to_arrow().await?;
    let batches: Vec<RecordBatch> = stream.try_collect().await?;

    for batch in &batches {
        let ids = col::<StringArray>(batch, "snapshot_id")?;
        let repos = col::<StringArray>(batch, "repo")?;
        let shas = col::<StringArray>(batch, "git_sha")?;
        let hashes = col::<StringArray>(batch, "schema_hash")?;

        // Residual row-level check: the pushed-down filter prunes at file
        // granularity, so a surviving file may still hold other rows.
        let hit = (0..batch.num_rows()).find(|&row| {
            repos.value(row) == repo
                && shas.value(row) == git_sha
                && hashes.value(row) == schema_hash
        });
        if let Some(row) = hit {
            return Ok(Some(Uuid::parse_str(ids.value(row))?));
        }
    }
    Ok(None)
}

/// Persist every file under `index_dir` as iceberg blob rows for
/// `(workspace, repo, git_sha, branch)`. Returns the snapshot row's
/// identity. Idempotent: if a snapshot for the same
/// `(repo, git_sha, schema_hash)` already exists, returns it without
/// writing.
///
/// Thin wrapper over [`snapshot_dir_to_iceberg`] bound to the default
/// `TABLE_TANTIVY_INDEX_SNAPSHOTS` / `TABLE_TANTIVY_INDEX_BLOBS` tables.
///
/// In the idempotent case the returned `workspace` and `branch` echo the
/// arguments passed here, not the values stored with the existing row.
///
/// # Errors
/// Fails when `index_dir` contains no regular files, when the directory
/// cannot be walked, or when any warehouse read/write fails.
pub fn snapshot_to_iceberg(
    wh: &IcebergWarehouse,
    workspace: &str,
    repo: &str,
    git_sha: &str,
    branch: &str,
    index_dir: &Path,
) -> Result<SnapshotRef> {
    snapshot_dir_to_iceberg(
        wh,
        TABLE_TANTIVY_INDEX_SNAPSHOTS,
        TABLE_TANTIVY_INDEX_BLOBS,
        workspace,
        repo,
        git_sha,
        branch,
        index_dir,
    )
}

/// Like [`snapshot_to_iceberg`] but targets explicit snapshot/blob tables, so
/// a *separate* index (e.g. the docs index) can be persisted into its own pair
/// of tables while reusing the identical blob capture + dedup machinery.
#[allow(clippy::too_many_arguments)]
pub fn snapshot_dir_to_iceberg(
    wh: &IcebergWarehouse,
    snap_table: &str,
    blob_table: &str,
    workspace: &str,
    repo: &str,
    git_sha: &str,
    branch: &str,
    index_dir: &Path,
) -> Result<SnapshotRef> {
    let blobs = collect_blobs(index_dir)
        .with_context(|| format!("walk index dir {}", index_dir.display()))?;
    if blobs.is_empty() {
        return Err(anyhow!(
            "no files under {} — nothing to snapshot",
            index_dir.display()
        ));
    }
    let schema_hash = schema_hash_of(&blobs);
    let blob_count: i32 = blobs.len() as i32;
    let total_bytes: i64 = blobs.iter().map(|b| b.bytes.len() as i64).sum();

    wh.block_on(async {
        if let Some(existing) =
            find_existing_snapshot(wh, snap_table, repo, git_sha, &schema_hash).await?
        {
            return Ok(SnapshotRef {
                snapshot_id: existing,
                workspace: workspace.to_string(),
                repo: repo.to_string(),
                git_sha: git_sha.to_string(),
                branch: branch.to_string(),
                schema_hash,
                blob_count,
                total_bytes,
            });
        }

        let snapshot_id = Uuid::new_v4();
        let id_str = snapshot_id.to_string();
        let ts = Utc::now();

        // snapshots row
        let s_table = wh
            .catalog()
            .load_table(&wh.table_ident(snap_table))
            .await?;
        let s_schema = Arc::new(schema_to_arrow_schema(s_table.metadata().current_schema())?);
        let s_cols: Vec<Arc<dyn Array>> = vec![
            Arc::new(StringArray::from(vec![id_str.clone()])),
            Arc::new(StringArray::from(vec![workspace.to_string()])),
            Arc::new(StringArray::from(vec![repo.to_string()])),
            Arc::new(StringArray::from(vec![git_sha.to_string()])),
            Arc::new(StringArray::from(vec![branch.to_string()])),
            Arc::new(
                TimestampMicrosecondArray::from(vec![ts.timestamp_micros()])
                    .with_timezone("+00:00"),
            ),
            Arc::new(StringArray::from(vec![schema_hash.clone()])),
            Arc::new(Int32Array::from(vec![blob_count])),
            Arc::new(Int64Array::from(vec![total_bytes])),
        ];
        let s_batch = RecordBatch::try_new(s_schema, s_cols)?;
        append_batch(wh.catalog(), s_table, s_batch).await?;

        // blob rows — split into a *manifest* and a *content store*:
        //
        //   • manifest rows  — one per logical file of THIS snapshot, tagged
        //     with this `snapshot_id`, carrying `(filename, byte_len, sha256)`
        //     but an EMPTY `bytes` payload. They record the snapshot's
        //     filename→sha256 mapping cheaply (no payload), so two files with
        //     identical content, or a file whose bytes were already stored by
        //     an earlier snapshot, still leave a per-snapshot trail restore
        //     can follow.
        //
        //   • content rows   — the heavy `bytes`, written via `put_blobs_dedup`
        //     (content-addressed by sha256). Bytes already in the table — from
        //     an earlier snapshot OR an earlier file in this same batch — are
        //     skipped, so each distinct payload is stored exactly once.
        //
        // Restore (see `restore_dir_from_iceberg`) reads this snapshot's
        // manifest to learn which (filename, sha256) it contains, then resolves
        // each sha256 to its bytes from whichever content row stores them.
        // Content store FIRST: dedup the heavy payloads by sha256. Writing the
        // content before the manifest matters — `put_blobs_dedup` decides what
        // to skip from the sha256s *already in the table*, and the manifest rows
        // (below) carry those same sha256s with empty payloads. If the manifest
        // were written first, dedup would see every hash as "present" and store
        // zero actual bytes.
        let content: Vec<BlobRow> = blobs
            .iter()
            .map(|b| BlobRow {
                filename: b.filename.clone(),
                bytes: b.bytes.clone(),
                sha256: b.sha256.clone(),
            })
            .collect();
        put_blobs_dedup_async(wh, blob_table, &id_str, content).await?;

        // Manifest rows: one per logical file of THIS snapshot, with an EMPTY
        // `bytes` payload (the actual bytes live in the deduped content rows
        // above, addressed by sha256). They record the snapshot's
        // filename→sha256 mapping cheaply so restore can reconstruct even when
        // the payload was first stored by an earlier snapshot or shared by two
        // files in this one.
        let b_table = wh
            .catalog()
            .load_table(&wh.table_ident(blob_table))
            .await?;
        let b_schema = Arc::new(schema_to_arrow_schema(b_table.metadata().current_schema())?);

        let empty: &[u8] = &[];
        let mut ids = Vec::with_capacity(blobs.len());
        let mut names = Vec::with_capacity(blobs.len());
        let mut payloads: Vec<&[u8]> = Vec::with_capacity(blobs.len());
        let mut lens = Vec::with_capacity(blobs.len());
        let mut hashes = Vec::with_capacity(blobs.len());
        for b in &blobs {
            ids.push(id_str.clone());
            names.push(b.filename.clone());
            payloads.push(empty);
            lens.push(b.bytes.len() as i32);
            hashes.push(b.sha256.clone());
        }
        let m_cols: Vec<Arc<dyn Array>> = vec![
            Arc::new(StringArray::from(ids)),
            Arc::new(StringArray::from(names)),
            Arc::new(LargeBinaryArray::from(payloads)),
            Arc::new(Int32Array::from(lens)),
            Arc::new(StringArray::from(hashes)),
        ];
        let m_batch = RecordBatch::try_new(b_schema, m_cols)?;
        append_batch(wh.catalog(), b_table, m_batch).await?;

        Ok(SnapshotRef {
            snapshot_id,
            workspace: workspace.to_string(),
            repo: repo.to_string(),
            git_sha: git_sha.to_string(),
            branch: branch.to_string(),
            schema_hash,
            blob_count,
            total_bytes,
        })
    })
}

/// Pick the snapshot to restore for `repo`: the most recent one (largest
/// `ts_micros`), restricted to `git_sha` when that is given.
///
/// Returns `(snapshot_id, git_sha, "ts_micros=<value>")` for the chosen row.
/// When several rows share the largest timestamp, the one encountered last
/// in the scan wins.
///
/// # Errors
/// Fails if the table cannot be loaded or scanned, if an expected column is
/// missing or mistyped, if a matching row's `snapshot_id` is not a valid
/// UUID, or if no row matches.
async fn resolve_snapshot_id(
    wh: &IcebergWarehouse,
    snap_table: &str,
    repo: &str,
    git_sha: Option<&str>,
) -> Result<(Uuid, String, String)> {
    let table = wh
        .catalog()
        .load_table(&wh.table_ident(snap_table))
        .await?;

    // Push the repo (and the sha, when pinned) down as a filter and project
    // only the columns used for matching and ranking, so data files that
    // belong to other repos can be skipped.
    let repo_matches = Reference::new("repo").equal_to(Datum::string(repo));
    let predicate = match git_sha {
        Some(pinned) => {
            repo_matches.and(Reference::new("git_sha").equal_to(Datum::string(pinned)))
        }
        None => repo_matches,
    };
    let scan = table
        .scan()
        .with_filter(predicate)
        .select(["snapshot_id", "repo", "git_sha", "ts_micros"])
        .build()?;
    let stream = scan.to_arrow().await?;
    let batches: Vec<RecordBatch> = stream.try_collect().await?;

    // (id, sha, ts_micros) of every row that really matches.
    let mut matching: Vec<(Uuid, String, i64)> = Vec::new();
    for batch in &batches {
        let ids = col::<StringArray>(batch, "snapshot_id")?;
        let repos = col::<StringArray>(batch, "repo")?;
        let shas = col::<StringArray>(batch, "git_sha")?;
        let stamps = col::<TimestampMicrosecondArray>(batch, "ts_micros")?;
        for row in 0..batch.num_rows() {
            // Residual row-level check: pushdown prunes at file granularity.
            let same_repo = repos.value(row) == repo;
            let same_sha = git_sha.map_or(true, |pinned| shas.value(row) == pinned);
            if same_repo && same_sha {
                matching.push((
                    Uuid::parse_str(ids.value(row))?,
                    shas.value(row).to_string(),
                    stamps.value(row),
                ));
            }
        }
    }

    // `max_by_key` keeps the last of equally-large keys.
    match matching.into_iter().max_by_key(|(_, _, micros)| *micros) {
        Some((id, sha, micros)) => Ok((id, sha, format!("ts_micros={}", micros))),
        None => Err(anyhow!(
            "no tantivy_index_snapshot for repo `{repo}`{}",
            git_sha
                .map(|s| format!(" at sha {s}"))
                .unwrap_or_default()
        )),
    }
}

/// Materialise the snapshot into `into` so a Tantivy `MmapDirectory`
/// can open it. `into` is created if it doesn't exist; existing
/// files with the same name are overwritten (Tantivy segment files
/// are immutable by content hash, so this is safe).
///
/// With `git_sha = Some(..)` the snapshot recorded for that sha is restored;
/// with `None` the most recent snapshot for `repo` is used. Files already
/// present in `into` that are not part of the snapshot are left in place.
///
/// Thin wrapper over [`restore_dir_from_iceberg`] bound to the default
/// `TABLE_TANTIVY_INDEX_SNAPSHOTS` / `TABLE_TANTIVY_INDEX_BLOBS` tables.
///
/// # Errors
/// Fails when no snapshot matches, when the snapshot's blobs cannot be
/// resolved, or on any warehouse or filesystem error.
pub fn restore_from_iceberg(
    wh: &IcebergWarehouse,
    repo: &str,
    git_sha: Option<&str>,
    into: &Path,
) -> Result<SnapshotRef> {
    restore_dir_from_iceberg(
        wh,
        TABLE_TANTIVY_INDEX_SNAPSHOTS,
        TABLE_TANTIVY_INDEX_BLOBS,
        repo,
        git_sha,
        into,
    )
}

/// Like [`restore_from_iceberg`] but reads from explicit snapshot/blob tables
/// (mirror of [`snapshot_dir_to_iceberg`]).
pub fn restore_dir_from_iceberg(
    wh: &IcebergWarehouse,
    snap_table: &str,
    blob_table: &str,
    repo: &str,
    git_sha: Option<&str>,
    into: &Path,
) -> Result<SnapshotRef> {
    std::fs::create_dir_all(into)
        .with_context(|| format!("create restore dir {}", into.display()))?;

    wh.block_on(async {
        let (snapshot_id, sha, _) = resolve_snapshot_id(wh, snap_table, repo, git_sha).await?;

        // pull metadata for the SnapshotRef return value
        let s_table = wh
            .catalog()
            .load_table(&wh.table_ident(snap_table))
            .await?;
        let id_str = snapshot_id.to_string();
        let scan = s_table
            .scan()
            .with_filter(Reference::new("snapshot_id").equal_to(Datum::string(id_str.clone())))
            .build()?;
        let stream = scan.to_arrow().await?;
        let s_batches: Vec<RecordBatch> = stream.try_collect().await?;
        let mut meta: Option<SnapshotRef> = None;
        'outer: for batch in &s_batches {
            let ids = col::<StringArray>(batch, "snapshot_id")?;
            let wss = col::<StringArray>(batch, "workspace")?;
            let reps = col::<StringArray>(batch, "repo")?;
            let shas = col::<StringArray>(batch, "git_sha")?;
            let brs = col::<StringArray>(batch, "branch")?;
            let hashes = col::<StringArray>(batch, "schema_hash")?;
            let counts = col::<Int32Array>(batch, "blob_count")?;
            let totals = col::<Int64Array>(batch, "total_bytes")?;
            for i in 0..batch.num_rows() {
                if ids.value(i) == id_str {
                    meta = Some(SnapshotRef {
                        snapshot_id,
                        workspace: wss.value(i).to_string(),
                        repo: reps.value(i).to_string(),
                        git_sha: shas.value(i).to_string(),
                        branch: brs.value(i).to_string(),
                        schema_hash: hashes.value(i).to_string(),
                        blob_count: counts.value(i),
                        total_bytes: totals.value(i),
                    });
                    break 'outer;
                }
            }
        }
        let meta = meta.ok_or_else(|| anyhow!("snapshot {snapshot_id} metadata vanished"))?;

        let b_table = wh
            .catalog()
            .load_table(&wh.table_ident(blob_table))
            .await?;

        // Phase 1 — MANIFEST: which (filename, sha256) does this snapshot hold?
        // Pushdown: filter to this snapshot's rows and project only the manifest
        // columns (drop `bytes`). Each snapshot's manifest is its own data file
        // (one `append_batch` = one Parquet file), so the file-level
        // metrics/manifest filter skips every *other* snapshot's file. Content
        // rows for this snapshot also match `snapshot_id == id` and carry the
        // same (filename, sha256); collecting both is harmless — a `(filename →
        // sha256)` map collapses the duplicates.
        let scan = b_table
            .scan()
            .with_filter(Reference::new("snapshot_id").equal_to(Datum::string(id_str.clone())))
            .select(["snapshot_id", "filename", "sha256"])
            .build()?;
        let stream = scan.to_arrow().await?;
        let m_batches: Vec<RecordBatch> = stream.try_collect().await?;
        let mut want: Vec<(String, String)> = Vec::new(); // (filename, sha256)
        for batch in &m_batches {
            let ids = col::<StringArray>(batch, "snapshot_id")?;
            let names = col::<StringArray>(batch, "filename")?;
            let shas = col::<StringArray>(batch, "sha256")?;
            for i in 0..batch.num_rows() {
                // Residual row-level check: pushdown prunes at file/row-group
                // granularity, so a matched file could still carry other rows.
                if ids.value(i) != id_str {
                    continue;
                }
                want.push((names.value(i).to_string(), shas.value(i).to_string()));
            }
        }
        // De-dupe identical (filename, sha256) pairs contributed by both the
        // manifest row and the content row of a first-written file.
        want.sort();
        want.dedup();
        if want.is_empty() {
            return Err(anyhow!(
                "snapshot {snapshot_id} for sha {sha} has 0 blob rows — table corrupt?"
            ));
        }

        // Phase 2 — CONTENT: resolve each needed sha256 to its bytes. Because of
        // content-addressed dedup, the bytes for a file may have been stored by
        // an EARLIER snapshot (and reused here), so we cannot scope this scan to
        // `snapshot_id == id`. Scan content rows (non-empty `bytes`) and pick up
        // every sha256 this snapshot needs; stop once all are resolved.
        let needed: std::collections::HashSet<&str> =
            want.iter().map(|(_, s)| s.as_str()).collect();
        let scan = b_table
            .scan()
            .select(["bytes", "sha256"])
            .build()?;
        let stream = scan.to_arrow().await?;
        let c_batches: Vec<RecordBatch> = stream.try_collect().await?;
        let mut by_sha: std::collections::HashMap<String, Vec<u8>> =
            std::collections::HashMap::with_capacity(needed.len());
        'scan: for batch in &c_batches {
            let bytes = col::<LargeBinaryArray>(batch, "bytes")?;
            let shas = col::<StringArray>(batch, "sha256")?;
            for i in 0..batch.num_rows() {
                let sha = shas.value(i);
                // Skip manifest rows (empty payload); only content rows carry bytes.
                if bytes.value(i).is_empty() {
                    continue;
                }
                if needed.contains(sha) && !by_sha.contains_key(sha) {
                    by_sha.insert(sha.to_string(), bytes.value(i).to_vec());
                    if by_sha.len() == needed.len() {
                        break 'scan;
                    }
                }
            }
        }

        // A zero-length file is legitimate (its sha256 is the hash of empty
        // input); its bytes never appear in a content row, so synthesize them.
        let empty_sha = crate::warehouse::blob_store::sha256_hex(&[]);

        let mut written = 0usize;
        for (filename, sha) in &want {
            let payload: &[u8] = match by_sha.get(sha) {
                Some(b) => b.as_slice(),
                None if *sha == empty_sha => &[],
                None => {
                    return Err(anyhow!(
                        "snapshot {snapshot_id}: no content row for sha {sha} (file `{filename}`) — dedup store corrupt?"
                    ));
                }
            };
            let path = into.join(filename);
            if let Some(parent) = path.parent() {
                std::fs::create_dir_all(parent).ok();
            }
            std::fs::write(&path, payload)
                .with_context(|| format!("write {}", path.display()))?;
            written += 1;
        }
        if written == 0 {
            return Err(anyhow!(
                "snapshot {snapshot_id} for sha {sha} has 0 blob rows — table corrupt?"
            ));
        }
        Ok(meta)
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::warehouse::iceberg::IcebergWarehouse;

    /// End-to-end Urðr time-machine round-trip:
    /// 1. Plant a fake "index" tree (a handful of binary blobs).
    /// 2. Snapshot it into a fresh iceberg warehouse.
    /// 3. Re-snapshot at the same SHA → must return the same UUID (dedup).
    /// 4. Restore into a different directory.
    /// 5. Byte-compare every restored file against the original.
    ///
    /// Also checks that an un-pinned ("latest") restore resolves to the same
    /// snapshot and that `index_snapshot_id_for` answers for known and
    /// unknown `(repo, sha)` pairs.
    #[test]
    fn round_trip_snapshot_then_restore() {
        let root = tempfile::tempdir().expect("tempdir");
        let warehouse_dir = root.path().join("warehouse");
        let src_dir = root.path().join("src_index");
        let dst_dir = root.path().join("restored_index");
        std::fs::create_dir_all(&src_dir).unwrap();

        // Plant fake tantivy-shaped blobs (mixed binary content).
        let payloads: Vec<(&str, Vec<u8>)> = vec![
            ("meta.json", br#"{"segments":[]}"#.to_vec()),
            ("00000000000000000000000000000001.term", (0u8..=255).collect()),
            ("00000000000000000000000000000001.idx", vec![0xDEu8; 4096]),
            ("00000000000000000000000000000001.pos", vec![0xABu8; 1024]),
        ];
        for (name, bytes) in &payloads {
            std::fs::write(src_dir.join(name), bytes).unwrap();
        }

        let wh = IcebergWarehouse::open(&warehouse_dir).expect("open warehouse");
        let sha = "0000000000000000000000000000000000000000";
        let snap1 = snapshot_to_iceberg(&wh, "ws_test", "repo_test", sha, "main", &src_dir)
            .expect("snapshot1");
        // The counters describe the logical file set: 4 files, summed sizes.
        assert_eq!(snap1.blob_count, 4);
        assert_eq!(snap1.total_bytes, payloads.iter().map(|(_, b)| b.len() as i64).sum::<i64>());

        // Idempotency.
        let snap2 = snapshot_to_iceberg(&wh, "ws_test", "repo_test", sha, "main", &src_dir)
            .expect("snapshot2");
        assert_eq!(snap1.snapshot_id, snap2.snapshot_id, "dedup must return same UUID");

        // Restore by SHA.
        let restored = restore_from_iceberg(&wh, "repo_test", Some(sha), &dst_dir)
            .expect("restore");
        assert_eq!(restored.snapshot_id, snap1.snapshot_id);

        // Byte-for-byte equality.
        for (name, bytes) in &payloads {
            let got = std::fs::read(dst_dir.join(name)).expect("read restored");
            assert_eq!(&got, bytes, "blob `{name}` mismatch after restore");
        }

        // Restore "latest" (no sha pin) returns same snapshot.
        // Only one snapshot exists for this repo, so "latest" must be it.
        let dst2 = root.path().join("restored_index2");
        let restored2 = restore_from_iceberg(&wh, "repo_test", None, &dst2).expect("restore latest");
        assert_eq!(restored2.snapshot_id, snap1.snapshot_id);

        // Idempotency lookup (what the release capture phase uses to skip work):
        // the recorded snapshot id for (repo, sha) comes back; an unknown sha
        // is None; an unknown repo is None.
        let found = wh.block_on(wh.index_snapshot_id_for("repo_test", sha)).expect("lookup");
        assert_eq!(found, Some(snap1.snapshot_id.to_string()));
        let other_sha = wh
            .block_on(wh.index_snapshot_id_for("repo_test", "ffffffffffffffffffffffffffffffffffffffff"))
            .expect("lookup other sha");
        assert_eq!(other_sha, None);
        let other_repo = wh.block_on(wh.index_snapshot_id_for("nope", sha)).expect("lookup other repo");
        assert_eq!(other_repo, None);
    }

    /// Collect the distinct `sha256` values of every row in
    /// `tantivy_index_blobs` whose `bytes` payload is non-empty (i.e. real
    /// content rows, not the empty-payload manifest rows).
    ///
    /// The size of the returned set is the metric the dedup tests assert on:
    /// it must equal the number of *distinct content hashes ever
    /// snapshotted*, regardless of how many snapshots or filenames reference
    /// them. Panics (via `unwrap`) on any warehouse error — test helper only.
    fn stored_content_payloads(wh: &IcebergWarehouse) -> std::collections::HashSet<String> {
        wh.block_on(async {
            let blob_table = wh
                .catalog()
                .load_table(&wh.table_ident(TABLE_TANTIVY_INDEX_BLOBS))
                .await
                .unwrap();
            // Unfiltered, unprojected scan: every row of the blob table.
            let scan = blob_table.scan().build().unwrap();
            let stream = scan.to_arrow().await.unwrap();
            let batches: Vec<RecordBatch> = stream.try_collect().await.unwrap();

            let mut distinct = std::collections::HashSet::new();
            for batch in &batches {
                let payloads = col::<LargeBinaryArray>(batch, "bytes").unwrap();
                let digests = col::<StringArray>(batch, "sha256").unwrap();
                distinct.extend(
                    (0..batch.num_rows())
                        .filter(|&row| !payloads.value(row).is_empty())
                        .map(|row| digests.value(row).to_string()),
                );
            }
            distinct
        })
    }

    /// Content-addressed blob dedup across the time-machine snapshot/restore
    /// path. Exercises the two cases that snapshot-level dedup alone could not
    /// handle:
    ///
    ///   1. **Within a snapshot**: two files with byte-identical content store
    ///      their payload exactly once, yet BOTH filenames restore.
    ///   2. **Across snapshots**: a later snapshot that reuses an earlier
    ///      snapshot's bytes adds NO new stored payload for the shared content,
    ///      yet still restores fully from the dedup'd store.
    ///
    /// Stored payloads are counted with `stored_content_payloads`, i.e. as
    /// distinct sha256 values among rows with a non-empty `bytes` column.
    #[test]
    fn dedup_stores_bytes_once_but_restore_reconstructs() {
        let root = tempfile::tempdir().expect("tempdir");
        let wh = IcebergWarehouse::open(&root.path().join("warehouse")).expect("open");

        // ── Snapshot 1: two files share the SAME content (`SHARED`), one is
        // unique (`UNIQUE_A`). Within-snapshot dedup must store 2 distinct
        // payloads, not 3.
        let shared = vec![0x5Au8; 2048];
        let unique_a = vec![0xA1u8; 777];
        let s1_dir = root.path().join("s1");
        std::fs::create_dir_all(&s1_dir).unwrap();
        std::fs::write(s1_dir.join("copy1.idx"), &shared).unwrap();
        std::fs::write(s1_dir.join("copy2.idx"), &shared).unwrap(); // identical bytes, different name
        std::fs::write(s1_dir.join("unique_a.term"), &unique_a).unwrap();

        let sha1 = "1111111111111111111111111111111111111111";
        let snap1 = snapshot_to_iceberg(&wh, "ws", "repo", sha1, "main", &s1_dir).expect("snap1");
        // `blob_count` is the logical file count, not the stored-payload count.
        assert_eq!(snap1.blob_count, 3, "snapshot logically contains 3 files");

        let after1 = stored_content_payloads(&wh);
        assert_eq!(
            after1.len(),
            2,
            "within-snapshot dedup: SHARED + UNIQUE_A = 2 stored payloads (not 3)"
        );

        // Restore snapshot 1 → all THREE files come back, with correct bytes.
        let out1 = root.path().join("out1");
        let r1 = restore_from_iceberg(&wh, "repo", Some(sha1), &out1).expect("restore1");
        assert_eq!(r1.snapshot_id, snap1.snapshot_id);
        assert_eq!(std::fs::read(out1.join("copy1.idx")).unwrap(), shared);
        assert_eq!(std::fs::read(out1.join("copy2.idx")).unwrap(), shared, "duplicate-content file restores");
        assert_eq!(std::fs::read(out1.join("unique_a.term")).unwrap(), unique_a);

        // ── Snapshot 2 (later git_sha): REUSES `shared` under a new filename,
        // plus one brand-new payload. Cross-snapshot dedup must add only the
        // new payload — the reused `shared` bytes are NOT stored again.
        let unique_b = vec![0xB2u8; 333];
        let s2_dir = root.path().join("s2");
        std::fs::create_dir_all(&s2_dir).unwrap();
        std::fs::write(s2_dir.join("reused.idx"), &shared).unwrap(); // same bytes as snapshot 1
        std::fs::write(s2_dir.join("unique_b.term"), &unique_b).unwrap();

        let sha2 = "2222222222222222222222222222222222222222";
        let snap2 = snapshot_to_iceberg(&wh, "ws", "repo", sha2, "main", &s2_dir).expect("snap2");
        // Different git_sha and different file set → a new snapshot row.
        assert_ne!(snap2.snapshot_id, snap1.snapshot_id, "distinct snapshot");

        let after2 = stored_content_payloads(&wh);
        assert_eq!(
            after2.len(),
            3,
            "cross-snapshot dedup: only UNIQUE_B is new; SHARED bytes reused, not re-stored"
        );

        // ── The critical restore: snapshot 2 references snapshot 1's `shared`
        // bytes (which live in snapshot 1's content row). Restore must
        // reconstruct it from the dedup'd store anyway.
        let out2 = root.path().join("out2");
        let r2 = restore_from_iceberg(&wh, "repo", Some(sha2), &out2).expect("restore2");
        assert_eq!(r2.snapshot_id, snap2.snapshot_id);
        assert_eq!(
            std::fs::read(out2.join("reused.idx")).unwrap(),
            shared,
            "snapshot 2 restores bytes first stored by snapshot 1 (cross-snapshot dedup)"
        );
        assert_eq!(std::fs::read(out2.join("unique_b.term")).unwrap(), unique_b);
        // Snapshot 2 must NOT leak snapshot 1's exclusive files.
        assert!(!out2.join("unique_a.term").exists(), "snapshot 2 has no unique_a");
        assert!(!out2.join("copy1.idx").exists(), "snapshot 2 has no copy1");

        // Re-restoring snapshot 1 after snapshot 2 was written still works
        // (no state was disturbed by the second snapshot).
        let out1b = root.path().join("out1b");
        restore_from_iceberg(&wh, "repo", Some(sha1), &out1b).expect("restore1 again");
        assert_eq!(std::fs::read(out1b.join("copy1.idx")).unwrap(), shared);
        assert_eq!(std::fs::read(out1b.join("unique_a.term")).unwrap(), unique_a);
        assert!(!out1b.join("unique_b.term").exists(), "snapshot 1 has no unique_b");
    }

    /// Filter-pushdown check with **several** snapshots in the warehouse.
    /// Each snapshot is written as its own data file, so the blob scan's
    /// `with_filter(snapshot_id == …)` has to hand back exactly one
    /// snapshot's blobs and nothing from its neighbours. Every snapshot
    /// carries a uniquely named `.seg` file, so any cross-contamination
    /// shows up as a stray file in the restore target.
    #[test]
    fn restore_selects_correct_snapshot_among_many() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let base = tmp.path();
        let wh = IcebergWarehouse::open(&base.join("warehouse")).expect("open");

        // Populate one fake index directory: `<marker>.seg` holds 512 copies
        // of the marker's first byte, `meta.json` embeds the marker itself.
        let seed_index_dir = |dir: &Path, marker: &str| {
            std::fs::create_dir_all(dir).unwrap();
            let seg_path = dir.join(format!("{marker}.seg"));
            let seg_bytes = vec![marker.as_bytes()[0]; 512];
            std::fs::write(seg_path, seg_bytes).unwrap();
            let meta_body = format!(r#"{{"m":"{marker}"}}"#);
            std::fs::write(dir.join("meta.json"), meta_body).unwrap();
        };

        // Three snapshots in total: repo_a at two SHAs, repo_b at one.
        let sha_a1 = "1111111111111111111111111111111111111111";
        let sha_a2 = "2222222222222222222222222222222222222222";
        let sha_b1 = "3333333333333333333333333333333333333333";
        let dir_a1 = base.join("a1");
        let dir_a2 = base.join("a2");
        let dir_b1 = base.join("b1");
        for (dir, marker) in [(&dir_a1, "a"), (&dir_a2, "x"), (&dir_b1, "b")] {
            seed_index_dir(dir.as_path(), marker);
        }

        // sha_a2 is snapshotted after sha_a1; the unpinned repo_a restore at
        // the bottom relies on it being the most recent one.
        let snap_a1 = snapshot_to_iceberg(&wh, "ws", "repo_a", sha_a1, "main", &dir_a1).unwrap();
        let _snap_a2 = snapshot_to_iceberg(&wh, "ws", "repo_a", sha_a2, "main", &dir_a2).unwrap();
        let snap_b1 = snapshot_to_iceberg(&wh, "ws", "repo_b", sha_b1, "main", &dir_b1).unwrap();

        // Pinned restore, repo_a @ sha_a1: a.seg only — no x.seg, no b.seg.
        let out = base.join("out_a1");
        let pinned_a1 = restore_from_iceberg(&wh, "repo_a", Some(sha_a1), &out).unwrap();
        assert_eq!(pinned_a1.snapshot_id, snap_a1.snapshot_id);
        assert!(out.join("a.seg").exists(), "expected a1's blob");
        assert!(!out.join("x.seg").exists(), "must not bleed repo_a@sha_a2 blobs");
        assert!(!out.join("b.seg").exists(), "must not bleed repo_b blobs");

        // Unpinned restore of repo_b (latest): b.seg only.
        let out_b = base.join("out_b");
        let latest_b = restore_from_iceberg(&wh, "repo_b", None, &out_b).unwrap();
        assert_eq!(latest_b.snapshot_id, snap_b1.snapshot_id);
        assert!(out_b.join("b.seg").exists());
        assert!(!out_b.join("a.seg").exists());
        assert!(!out_b.join("x.seg").exists());

        // Unpinned restore of repo_a has to resolve to sha_a2 (the most
        // recent snapshot), not sha_a1.
        let out_a_latest = base.join("out_a_latest");
        let latest_a = restore_from_iceberg(&wh, "repo_a", None, &out_a_latest).unwrap();
        assert_eq!(latest_a.git_sha, sha_a2, "latest repo_a snapshot is sha_a2");
        assert!(out_a_latest.join("x.seg").exists());
        assert!(!out_a_latest.join("a.seg").exists());
    }

    /// Urðr cache-miss path: when the local index cache is gone,
    /// `Index::open_or_restore` pulls it back from iceberg without the
    /// caller asking; once the cache is populated, a further call skips
    /// the restore.
    #[test]
    fn open_or_restore_rehydrates_empty_cache() {
        use crate::index::Index;

        let tmp = tempfile::tempdir().expect("tempdir");
        let workspace = tmp.path();
        let cache_dir = workspace.join(".nornir/cache/index");
        let warehouse_dir = workspace.join(".nornir/warehouse");
        std::fs::create_dir_all(&cache_dir).unwrap();

        let wh = IcebergWarehouse::open(&warehouse_dir).expect("open warehouse");

        // Build a tiny real Tantivy index in the workspace. Walking the
        // empty tempdir yields no docs but still creates meta.json + the
        // writer lock, which is all the round-trip needs. The index handle
        // is a temporary, so it is dropped at the end of this statement —
        // before the snapshot below is taken.
        let _ = Index::open(workspace)
            .expect("open empty")
            .build()
            .expect("build empty");
        assert!(workspace.join(".nornir/cache/index/meta.json").exists());

        // Persist the freshly built cache under our chosen repo label.
        let sha = "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef";
        let snap = snapshot_to_iceberg(&wh, "ws_t", "_workspace", sha, "main", &cache_dir)
            .expect("snapshot");
        assert!(snap.blob_count > 0);

        // Wipe the local cache so the next open is a genuine miss.
        std::fs::remove_dir_all(&cache_dir).unwrap();
        assert!(!workspace.join(".nornir/cache/index/meta.json").exists());

        // Cold call: the cache has to be rehydrated from iceberg.
        let (_idx_cold, was_restored) =
            Index::open_or_restore(workspace, &wh, "_workspace", None).expect("open_or_restore");
        assert!(was_restored, "expected restore from iceberg");
        let meta_json = workspace.join(".nornir/cache/index/meta.json");
        assert!(meta_json.exists(), "meta.json must be back after restore");

        // Warm call: the cache is populated, so no restore may happen.
        // The first handle is still alive at this point.
        let (_idx_warm, restored_again) =
            Index::open_or_restore(workspace, &wh, "_workspace", None)
                .expect("open_or_restore second");
        assert!(!restored_again, "second call must skip restore (cache hot)");
    }
}