nornir 0.4.19

Companion to cargo: dependency tracking, release gating, deploy, benchmarks, and documentation assembly. Project-agnostic.
Documentation
//! Dedicated full-text index over a repo's **generated** documentation, kept
//! physically separate from the workspace code index.
//!
//! Important: this indexes the *rendered* output — the generated, marker-
//! expanded `README.md`/`CHANGELOG.md` at the repo root plus the rendered
//! markdown under `<repo>/docs/` (e.g. `book.md`) — **not** the `.nornir/*.md`
//! templates (whose `<!-- nornir:gen:* -->` markers are still unfilled). The
//! index lives at `<repo>/.nornir/cache/docs-index/` and reuses the identical
//! Tantivy schema and the snapshot/blob historization, parameterized onto the
//! `docs_index_*` Iceberg tables, so the docs index time-travels exactly like
//! the code index.

use std::path::{Path, PathBuf};

use anyhow::Result;

use crate::index::snapshot::{restore_dir_from_iceberg, snapshot_dir_to_iceberg, SnapshotRef};
use crate::index::{BuildStats, Corpus, Hit, Index};
use crate::warehouse::iceberg::{
    IcebergWarehouse, TABLE_DOCS_INDEX_BLOBS, TABLE_DOCS_INDEX_SNAPSHOTS,
};

use super::layout::MANAGED_DOCS;

/// Path of the on-disk docs index for the repository rooted at `repo_root`.
///
/// This is the conventional, gitignored cache location
/// `<repo_root>/.nornir/cache/docs-index`; the function only computes the path
/// and never touches the filesystem.
pub fn docs_index_dir(repo_root: &Path) -> PathBuf {
    let mut index_dir = repo_root.to_path_buf();
    index_dir.push(".nornir/cache/docs-index");
    index_dir
}

/// Append every regular file with the exact extension `md` found directly in
/// `dir` to `out`.
///
/// Only the immediate entries of `dir` are examined (no recursion). An
/// unreadable or missing `dir`, and any entry that fails to be read, is
/// skipped silently, so `out` is simply left unchanged in those cases.
fn collect_md(dir: &Path, out: &mut Vec<PathBuf>) {
    let entries = match std::fs::read_dir(dir) {
        Ok(entries) => entries,
        // Nothing to collect when the directory cannot be listed.
        Err(_) => return,
    };
    out.extend(
        entries
            .flatten()
            .map(|entry| entry.path())
            .filter(|path| {
                path.is_file() && matches!(path.extension().and_then(|ext| ext.to_str()), Some("md"))
            }),
    );
}

/// Enumerate the **generated** documentation files of `repo_root`.
///
/// The result is made of every `MANAGED_DOCS` entry that exists as a file at
/// the repo root, followed by the `.md` files sitting directly in
/// `<repo>/docs/`; it is returned sorted with duplicates removed. Nothing under
/// `.nornir/` is visited, so the templates (which still carry unexpanded
/// markers rather than published content) are left out on purpose.
pub fn discover_doc_files(repo_root: &Path) -> Vec<PathBuf> {
    // Managed docs at the repo root, kept only when actually present on disk.
    let mut found: Vec<PathBuf> = MANAGED_DOCS
        .iter()
        .map(|name| repo_root.join(name))
        .filter(|candidate| candidate.is_file())
        .collect();

    // Rendered markdown in `docs/`.
    let docs_dir = repo_root.join("docs");
    collect_md(&docs_dir, &mut found);

    found.sort();
    found.dedup();
    found
}

/// Build (fresh) the docs index for `repo`. The directory is wiped first so a
/// removed chapter never lingers; the index is cheap to rebuild and is
/// historized via [`snapshot_docs_index`].
pub fn build_docs_index(repo_root: &Path, repo: &str) -> Result<(BuildStats, PathBuf)> {
    let dir = docs_index_dir(repo_root);
    if dir.exists() {
        let _ = std::fs::remove_dir_all(&dir);
    }
    let idx = Index::open_at(repo_root, &dir)?;
    let files = discover_doc_files(repo_root);
    let stats = idx.index_files(repo, Corpus::Docs, &files)?;
    Ok((stats, dir))
}

/// Persist the current on-disk docs index of `repo_root` as a snapshot.
///
/// Thin wrapper over [`snapshot_dir_to_iceberg`] that targets the
/// `docs_index_*` tables ([`TABLE_DOCS_INDEX_SNAPSHOTS`] /
/// [`TABLE_DOCS_INDEX_BLOBS`]) and the directory returned by
/// [`docs_index_dir`]; `workspace`, `repo`, `git_sha` and `branch` are
/// forwarded unchanged, and the helper's result is returned as-is.
pub fn snapshot_docs_index(
    wh: &IcebergWarehouse,
    workspace: &str,
    repo: &str,
    git_sha: &str,
    branch: &str,
    repo_root: &Path,
) -> Result<SnapshotRef> {
    let index_dir = docs_index_dir(repo_root);
    snapshot_dir_to_iceberg(
        wh,
        TABLE_DOCS_INDEX_SNAPSHOTS,
        TABLE_DOCS_INDEX_BLOBS,
        workspace,
        repo,
        git_sha,
        branch,
        &index_dir,
    )
}

/// Rehydrate the docs index of `repo` from iceberg into [`docs_index_dir`].
///
/// Thin wrapper over [`restore_dir_from_iceberg`] using the `docs_index_*`
/// tables. `sha` selects a pinned snapshot; `None` asks for the latest one.
/// The helper's result (including any error) is returned unchanged.
pub fn restore_docs_index(
    wh: &IcebergWarehouse,
    repo: &str,
    sha: Option<&str>,
    repo_root: &Path,
) -> Result<SnapshotRef> {
    let index_dir = docs_index_dir(repo_root);
    restore_dir_from_iceberg(
        wh,
        TABLE_DOCS_INDEX_SNAPSHOTS,
        TABLE_DOCS_INDEX_BLOBS,
        repo,
        sha,
        &index_dir,
    )
}

/// Run a BM25 query against the docs index of `repo`, returning at most
/// `limit` hits restricted to [`Corpus::Docs`].
///
/// The cache is treated as cold when `meta.json` is missing from
/// [`docs_index_dir`]; in that case the index is first rehydrated from iceberg
/// (pinned to `sha` when given). A failed rehydration is only reported on
/// stderr and the search proceeds on whatever is on disk. When the cache is
/// already present it is searched as-is — `sha` is consulted only while
/// rehydrating.
///
/// # Errors
///
/// Fails if the index cannot be opened or the search itself fails.
pub fn search_docs(
    repo_root: &Path,
    wh: &IcebergWarehouse,
    repo: &str,
    sha: Option<&str>,
    query: &str,
    limit: usize,
) -> Result<Vec<Hit>> {
    let dir = docs_index_dir(repo_root);
    let cache_is_warm = dir.join("meta.json").exists();
    if !cache_is_warm {
        // Best-effort: iceberg may legitimately have no snapshot yet, so a
        // restore failure is logged rather than propagated.
        let restore_failure = restore_docs_index(wh, repo, sha, repo_root).err();
        if let Some(e) = restore_failure {
            eprintln!("⏃ urðr: no docs-index snapshot for `{repo}` ({e})");
        }
    }
    let index = Index::open_at(repo_root, &dir)?;
    index.search(query, Some(Corpus::Docs), None, limit)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Discovery must return the generated root docs and the rendered
    /// `docs/*.md`, and nothing from the `.nornir/` template directory.
    #[test]
    fn discovers_generated_docs_and_skips_templates() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();
        std::fs::create_dir_all(root.join(".nornir")).unwrap();
        std::fs::create_dir_all(root.join("docs")).unwrap();
        // Templates → must be skipped.
        std::fs::write(root.join(".nornir/design.md"), "# Design template").unwrap();
        std::fs::write(root.join(".nornir/README.md"), "# src readme").unwrap();
        // Generated / rendered output → must be kept.
        std::fs::write(root.join("README.md"), "# generated").unwrap();
        std::fs::write(root.join("CHANGELOG.md"), "# changes").unwrap();
        std::fs::write(root.join("docs/book.md"), "# Book").unwrap();

        let files = discover_doc_files(root);
        // Compare as paths (component-wise) rather than as strings, so the
        // assertions do not depend on the platform's path separator.
        let rel: Vec<&Path> = files
            .iter()
            .map(|p| p.strip_prefix(root).unwrap())
            .collect();
        assert!(rel.contains(&Path::new("README.md")));
        assert!(rel.contains(&Path::new("CHANGELOG.md")));
        assert!(rel.iter().any(|p| p.ends_with("docs/book.md")));
        assert!(!rel.iter().any(|p| p.starts_with(".nornir")));
    }

    /// A freshly built index must be searchable and rank the matching chapter
    /// first.
    #[test]
    fn build_then_search_round_trip() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();
        std::fs::create_dir_all(root.join("docs")).unwrap();
        std::fs::write(
            root.join("docs/book.md"),
            "# Vector\nSemantic search uses jina embeddings materialized in the warehouse.",
        )
        .unwrap();
        std::fs::write(
            root.join("README.md"),
            "# Warehouse\nApache Iceberg rows keyed by git sha.",
        )
        .unwrap();

        let (stats, idir) = build_docs_index(root, "demo").unwrap();
        assert_eq!(stats.added, 2);
        let idx = Index::open_at(root, &idir).unwrap();
        let hits = idx.search("jina embeddings", Some(Corpus::Docs), None, 5).unwrap();
        assert!(!hits.is_empty());
        assert!(hits[0].path.ends_with("book.md"));
    }
}