use std::path::{Path, PathBuf};
use anyhow::Result;
use crate::index::snapshot::{restore_dir_from_iceberg, snapshot_dir_to_iceberg, SnapshotRef};
use crate::index::{BuildStats, Corpus, Hit, Index};
use crate::warehouse::iceberg::{
IcebergWarehouse, TABLE_DOCS_INDEX_BLOBS, TABLE_DOCS_INDEX_SNAPSHOTS,
};
use super::layout::MANAGED_DOCS;
/// Returns the on-disk location of the docs index for a repository.
///
/// The result is `repo_root` with the relative path
/// `.nornir/cache/docs-index` appended; nothing is created on disk.
pub fn docs_index_dir(repo_root: &Path) -> PathBuf {
    let mut index_dir = repo_root.to_path_buf();
    index_dir.push(".nornir/cache/docs-index");
    index_dir
}
/// Appends the markdown files found directly inside `dir` to `out`.
///
/// Only immediate children are considered (no recursion). An entry is kept
/// when `is_file()` is true and its extension is exactly `md`. If `dir`
/// cannot be read, or an individual entry cannot be read, it is skipped
/// without reporting an error. Existing contents of `out` are preserved.
fn collect_md(dir: &Path, out: &mut Vec<PathBuf>) {
    let entries = match std::fs::read_dir(dir) {
        Ok(entries) => entries,
        // Best-effort: a missing or unreadable directory contributes nothing.
        Err(_) => return,
    };
    let markdown_files = entries
        .flatten()
        .map(|entry| entry.path())
        .filter(|path| {
            path.is_file() && matches!(path.extension().and_then(|ext| ext.to_str()), Some("md"))
        });
    out.extend(markdown_files);
}
/// Lists the documentation files to index for the repository at `repo_root`.
///
/// The result combines every `MANAGED_DOCS` entry that exists as a file under
/// `repo_root` with the markdown files found directly inside
/// `repo_root/docs`. The returned paths are sorted and contain no duplicates.
pub fn discover_doc_files(repo_root: &Path) -> Vec<PathBuf> {
    let mut found: Vec<PathBuf> = MANAGED_DOCS
        .iter()
        .map(|name| repo_root.join(name))
        .filter(|candidate| candidate.is_file())
        .collect();
    collect_md(&repo_root.join("docs"), &mut found);
    // Sorting first makes duplicates adjacent so `dedup` removes all of them.
    found.sort();
    found.dedup();
    found
}
/// Rebuilds the docs index for `repo` from scratch.
///
/// Any existing index directory (see [`docs_index_dir`]) is removed, a new
/// index is opened at that location, and every file returned by
/// [`discover_doc_files`] is indexed under [`Corpus::Docs`].
///
/// Returns the indexing statistics together with the index directory.
///
/// # Errors
///
/// Fails if a previous index directory exists but cannot be removed, if the
/// index cannot be opened, or if indexing the files fails.
pub fn build_docs_index(repo_root: &Path, repo: &str) -> Result<(BuildStats, PathBuf)> {
    let dir = docs_index_dir(repo_root);
    // A failed wipe must not be ignored: otherwise the "fresh" build would
    // open whatever was left behind and mix stale entries into the new
    // index. A directory that is already absent is the desired state.
    match std::fs::remove_dir_all(&dir) {
        Ok(()) => {}
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
        Err(e) => {
            let msg = format!(
                "failed to clear stale docs index at {}: {e}",
                dir.display()
            );
            return Err(std::io::Error::new(e.kind(), msg).into());
        }
    }
    let idx = Index::open_at(repo_root, &dir)?;
    let files = discover_doc_files(repo_root);
    let stats = idx.index_files(repo, Corpus::Docs, &files)?;
    Ok((stats, dir))
}
/// Snapshots the local docs index directory into the warehouse.
///
/// Forwards to [`snapshot_dir_to_iceberg`] using the docs-index snapshot and
/// blob tables, passing `workspace`, `repo`, `git_sha` and `branch` through
/// unchanged and the directory from [`docs_index_dir`] as the source.
pub fn snapshot_docs_index(
    wh: &IcebergWarehouse,
    workspace: &str,
    repo: &str,
    git_sha: &str,
    branch: &str,
    repo_root: &Path,
) -> Result<SnapshotRef> {
    let index_dir = docs_index_dir(repo_root);
    snapshot_dir_to_iceberg(
        wh,
        TABLE_DOCS_INDEX_SNAPSHOTS,
        TABLE_DOCS_INDEX_BLOBS,
        workspace,
        repo,
        git_sha,
        branch,
        &index_dir,
    )
}
/// Restores the docs index directory from a warehouse snapshot.
///
/// Forwards to [`restore_dir_from_iceberg`] using the docs-index snapshot and
/// blob tables, passing `repo` and the optional `sha` through unchanged and
/// the directory from [`docs_index_dir`] as the destination.
pub fn restore_docs_index(
    wh: &IcebergWarehouse,
    repo: &str,
    sha: Option<&str>,
    repo_root: &Path,
) -> Result<SnapshotRef> {
    let index_dir = docs_index_dir(repo_root);
    restore_dir_from_iceberg(
        wh,
        TABLE_DOCS_INDEX_SNAPSHOTS,
        TABLE_DOCS_INDEX_BLOBS,
        repo,
        sha,
        &index_dir,
    )
}
/// Searches the docs index of the repository at `repo_root`.
///
/// When the local index directory has no `meta.json`, a restore from the
/// warehouse is attempted first (for `repo` at the optional `sha`). A failed
/// restore is only reported on stderr; the search then proceeds against
/// whatever `Index::open_at` yields for the directory.
///
/// Returns up to `limit` hits for `query`, restricted to [`Corpus::Docs`].
///
/// # Errors
///
/// Fails if the index cannot be opened or the search itself fails.
pub fn search_docs(
    repo_root: &Path,
    wh: &IcebergWarehouse,
    repo: &str,
    sha: Option<&str>,
    query: &str,
    limit: usize,
) -> Result<Vec<Hit>> {
    let index_dir = docs_index_dir(repo_root);
    let has_local_index = index_dir.join("meta.json").exists();
    if !has_local_index {
        // Restoring is best-effort: a failure is logged, never propagated.
        match restore_docs_index(wh, repo, sha, repo_root) {
            Ok(_) => {}
            Err(e) => eprintln!("⏃ urðr: no docs-index snapshot for `{repo}` ({e})"),
        }
    }
    let index = Index::open_at(repo_root, &index_dir)?;
    index.search(query, Some(Corpus::Docs), None, limit)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Files at the repo root and in `docs/` are discovered, while markdown
    /// living under `.nornir/` is not.
    #[test]
    fn discovers_generated_docs_and_skips_templates() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();
        std::fs::create_dir_all(root.join(".nornir")).unwrap();
        std::fs::create_dir_all(root.join("docs")).unwrap();
        for (rel, body) in [
            (".nornir/design.md", "# Design template"),
            (".nornir/README.md", "# src readme"),
            ("README.md", "# generated"),
            ("CHANGELOG.md", "# changes"),
            ("docs/book.md", "# Book"),
        ] {
            std::fs::write(root.join(rel), body).unwrap();
        }

        let files = discover_doc_files(root);
        // Compare as paths, not strings: path comparison is component-wise,
        // so the assertions hold regardless of the platform's separator.
        let rel_paths: Vec<&Path> = files
            .iter()
            .map(|p| p.strip_prefix(root).unwrap())
            .collect();
        assert!(rel_paths.iter().any(|p| *p == Path::new("README.md")));
        assert!(rel_paths.iter().any(|p| *p == Path::new("CHANGELOG.md")));
        assert!(rel_paths.iter().any(|p| p.ends_with("docs/book.md")));
        assert!(!rel_paths
            .iter()
            .any(|p| p.components().any(|c| c.as_os_str() == ".nornir")));
    }

    /// Building the index over two documents and searching it returns the
    /// document that contains the query terms first.
    #[test]
    fn build_then_search_round_trip() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();
        std::fs::create_dir_all(root.join("docs")).unwrap();
        std::fs::write(
            root.join("docs/book.md"),
            "# Vector\nSemantic search uses jina embeddings materialized in the warehouse.",
        )
        .unwrap();
        std::fs::write(
            root.join("README.md"),
            "# Warehouse\nApache Iceberg rows keyed by git sha.",
        )
        .unwrap();

        let (stats, idir) = build_docs_index(root, "demo").unwrap();
        assert_eq!(stats.added, 2);

        let idx = Index::open_at(root, &idir).unwrap();
        let hits = idx
            .search("jina embeddings", Some(Corpus::Docs), None, 5)
            .unwrap();
        assert!(!hits.is_empty());
        assert!(hits[0].path.ends_with("book.md"));
    }
}