exochain-dag-db-lab 0.2.0-beta

EXOCHAIN DAG DB diagnostics, graph explorer, benchmarks, and lab tools
Documentation
use std::{
    collections::{BTreeMap, BTreeSet},
    env, fs,
    io::{self, Write},
    path::{Path, PathBuf},
};

use exo_dag_db_lab::kg_markdown_manifest::{
    MANIFEST_SCHEMA_VERSION, Manifest, ManifestFile, build_manifest,
};
use serde::Serialize;
use sha2::{Digest, Sha256};

/// Schema identifier written into the `schema_version` field of the emitted candidate report.
const SCHEMA_VERSION: &str = "dagdb_markdown_kg_import_candidates_v1";

/// Process entry point.
///
/// Delegates to `run`; when it fails, the error is printed to stderr with the
/// `dagdb_kg_import_candidates_error:` prefix and the process exits with status 1.
fn main() {
    match run() {
        Ok(()) => {}
        Err(error) => {
            eprintln!("dagdb_kg_import_candidates_error: {error}");
            std::process::exit(1);
        }
    }
}

/// Runs the tool end to end: parse arguments, obtain the manifest, derive the
/// candidate report, and emit it as pretty-printed JSON followed by a newline.
///
/// # Errors
///
/// Returns a human-readable message when argument parsing, manifest loading,
/// candidate building, serialization, or writing the result fails.
fn run() -> Result<(), String> {
    let args = parse_args(env::args().skip(1).collect())?;
    let manifest = load_manifest(&args)?;
    let report = build_candidates(manifest)?;

    let mut encoded = serde_json::to_string_pretty(&report)
        .map_err(|error| format!("serialize candidates: {error}"))?;
    encoded.push('\n');

    match args.output {
        // No explicit destination: stream the report to stdout.
        None => io::stdout()
            .write_all(encoded.as_bytes())
            .map_err(|error| format!("write stdout: {error}")),
        Some(destination) => {
            // Make sure the containing directory exists before writing the file.
            if let Some(directory) = destination.parent() {
                fs::create_dir_all(directory)
                    .map_err(|error| format!("create output directory: {error}"))?;
            }
            fs::write(&destination, encoded).map_err(|error| format!("write output: {error}"))
        }
    }
}

/// Command-line options accepted by this tool, as produced by `parse_args`.
struct Args {
    /// Value of `--root` (defaults to `KnowledgeGraphs/dag-db`); passed to
    /// `build_manifest` when no `--manifest` path is supplied.
    root: PathBuf,
    /// Value of `--manifest`: a JSON manifest file to read instead of building one.
    manifest: Option<PathBuf>,
    /// Value of `--output`: destination file; when absent the report goes to stdout.
    output: Option<PathBuf>,
}

/// Parses the command-line arguments (program name already removed).
///
/// Recognised flags are `--root <path>`, `--manifest <path>` and `--output <path>`.
/// `-h` / `--help` prints the usage line and exits the process with status 0.
///
/// # Errors
///
/// Returns a message when a flag is missing its value or an argument is not recognised.
fn parse_args(raw: Vec<String>) -> Result<Args, String> {
    let mut root = PathBuf::from("KnowledgeGraphs/dag-db");
    let mut manifest = None;
    let mut output = None;

    // `while let` rather than `for`: each flag consumes its value from the same iterator.
    let mut remaining = raw.into_iter();
    while let Some(flag) = remaining.next() {
        match flag.as_str() {
            "--root" => {
                let value = remaining
                    .next()
                    .ok_or_else(|| "--root requires a value".to_owned())?;
                root = PathBuf::from(value);
            }
            "--manifest" => {
                let value = remaining
                    .next()
                    .ok_or_else(|| "--manifest requires a value".to_owned())?;
                manifest = Some(PathBuf::from(value));
            }
            "--output" => {
                let value = remaining
                    .next()
                    .ok_or_else(|| "--output requires a value".to_owned())?;
                output = Some(PathBuf::from(value));
            }
            "-h" | "--help" => {
                println!(
                    "usage: dagdb_kg_import_candidates [--root <path>] [--manifest <path>] [--output <path>]"
                );
                std::process::exit(0);
            }
            unknown => return Err(format!("unknown argument: {unknown}")),
        }
    }

    Ok(Args {
        root,
        manifest,
        output,
    })
}

/// Obtains the manifest to work from.
///
/// When `args.manifest` is set, the file is read and parsed as JSON; otherwise a
/// manifest is built from `args.root` via `build_manifest`. Either way the
/// manifest's `schema_version` must equal `MANIFEST_SCHEMA_VERSION`.
///
/// # Errors
///
/// Returns a message when the file cannot be read or parsed, when building the
/// manifest fails, or when the schema version is not the supported one.
fn load_manifest(args: &Args) -> Result<Manifest, String> {
    let manifest: Manifest = match args.manifest.as_ref() {
        Some(path) => {
            let text =
                fs::read_to_string(path).map_err(|error| format!("read manifest: {error}"))?;
            serde_json::from_str(&text).map_err(|error| format!("parse manifest: {error}"))?
        }
        None => build_manifest(&args.root)?,
    };

    if manifest.schema_version == MANIFEST_SCHEMA_VERSION {
        Ok(manifest)
    } else {
        Err(format!(
            "unsupported manifest schema: {:?}",
            manifest.schema_version
        ))
    }
}

/// Top-level document serialized as the tool's JSON output.
#[derive(Debug, Serialize)]
struct CandidateReport {
    /// Schema identifier of this report (`SCHEMA_VERSION`).
    schema_version: &'static str,
    /// `schema_version` of the manifest the report was derived from.
    source_manifest_schema_version: String,
    /// `graph_root` copied from the source manifest.
    graph_root: String,
    /// Number of entries in `nodes`.
    node_count: usize,
    /// Number of entries in `edges`.
    edge_count: usize,
    /// Number of entries in `unresolved_wikilinks`.
    unresolved_wikilink_count: usize,
    /// One node candidate per manifest file.
    nodes: Vec<NodeCandidate>,
    /// One edge candidate per wikilink, whether or not it resolved.
    edges: Vec<EdgeCandidate>,
    /// Wikilinks whose resolution status is anything other than `resolved`.
    unresolved_wikilinks: Vec<UnresolvedWikilink>,
}

/// A proposed graph node derived from a single manifest file.
#[derive(Debug, Serialize)]
struct NodeCandidate {
    /// Stable identifier derived from the file path (`kg_node_` prefix).
    candidate_id: String,
    /// File path as recorded in the manifest.
    path: String,
    /// Title as recorded in the manifest.
    title: String,
    /// Document classification produced by `document_type_for`.
    document_type: String,
    /// Frontmatter `status`, or `unknown` when the key is absent.
    status: String,
    /// Frontmatter `project_id`, or an empty string when the key is absent.
    project_id: String,
    /// The manifest entry's `sha256` value.
    content_sha256: String,
    /// The manifest entry's `byte_length` value.
    byte_length: usize,
    /// Non-empty `/`-separated segments of `path` with a trailing `.md` removed.
    catalog_path: Vec<String>,
    /// Copy of the file's frontmatter key/value pairs.
    frontmatter: BTreeMap<String, String>,
}

/// A proposed graph edge derived from one wikilink found in a source file.
#[derive(Debug, Serialize)]
struct EdgeCandidate {
    /// Stable identifier derived from the source path and the wikilink text (`kg_edge_` prefix).
    candidate_id: String,
    /// Kind of edge; `build_candidates` always sets this to `wikilink`.
    edge_kind: &'static str,
    /// `candidate_id` of the node for the file containing the link.
    source_candidate_id: String,
    /// Path of the file containing the link.
    source_path: String,
    /// The wikilink target text as recorded in the manifest.
    target_wikilink: String,
    /// `candidate_id` of the target node; empty unless the link resolved.
    target_candidate_id: String,
    /// Path of the target file; empty unless the link resolved.
    target_path: String,
    /// `resolved` (exactly one match), `unresolved` (no match) or `ambiguous` (several matches).
    resolution_status: String,
}

/// A wikilink that did not resolve to exactly one file.
#[derive(Debug, Serialize)]
struct UnresolvedWikilink {
    /// Path of the file containing the link.
    source_path: String,
    /// The wikilink target text as recorded in the manifest.
    target_wikilink: String,
    /// `unresolved` when nothing matched, `ambiguous` when several files matched.
    resolution_status: String,
}

/// Turns a manifest into a report of node and edge import candidates.
///
/// Files are processed in path order. Every file becomes one node candidate and
/// every wikilink becomes one edge candidate. A wikilink is `resolved` when the
/// link index maps it to exactly one file, `unresolved` when it maps to none and
/// `ambiguous` when it maps to several; every non-resolved link is additionally
/// listed in `unresolved_wikilinks`. Edges and unresolved links are ordered by
/// source path and then by wikilink text.
///
/// # Errors
///
/// Returns a message if a path has no node id registered for it.
fn build_candidates(mut manifest: Manifest) -> Result<CandidateReport, String> {
    manifest.files.sort_by(|a, b| a.path.cmp(&b.path));
    let link_index = build_link_index(&manifest.files);

    // Pass 1: one node per file, remembering path -> node id for edge resolution.
    let mut node_ids: BTreeMap<String, String> = BTreeMap::new();
    let mut nodes = Vec::new();
    for file in &manifest.files {
        let node_id = stable_id("kg_node", &[&file.path]);
        node_ids.insert(file.path.clone(), node_id.clone());

        let status = match file.frontmatter.get("status") {
            Some(value) => value.clone(),
            None => "unknown".to_owned(),
        };
        let project_id = match file.frontmatter.get("project_id") {
            Some(value) => value.clone(),
            None => String::new(),
        };

        nodes.push(NodeCandidate {
            candidate_id: node_id,
            path: file.path.clone(),
            title: file.title.clone(),
            document_type: document_type_for(&file.path, &file.frontmatter),
            status,
            project_id,
            content_sha256: file.sha256.clone(),
            byte_length: file.byte_length,
            catalog_path: catalog_path(&file.path),
            frontmatter: file.frontmatter.clone(),
        });
    }

    // Pass 2: one edge per wikilink, resolved against the link index.
    let mut edges = Vec::new();
    let mut unresolved = Vec::new();
    for file in &manifest.files {
        let source_id = node_ids
            .get(&file.path)
            .ok_or_else(|| format!("missing node id for {}", file.path))?;

        for target in &file.wikilinks {
            // A missing index entry is treated the same as an empty match list.
            let matches = link_index
                .get(target)
                .map(Vec::as_slice)
                .unwrap_or_default();

            let (status, target_path, target_id) = match matches {
                [] => ("unresolved", String::new(), String::new()),
                [only] => {
                    let id = node_ids
                        .get(only)
                        .ok_or_else(|| format!("missing node id for {only}"))?;
                    ("resolved", only.clone(), id.clone())
                }
                _ => ("ambiguous", String::new(), String::new()),
            };

            edges.push(EdgeCandidate {
                candidate_id: stable_id("kg_edge", &[&file.path, target]),
                edge_kind: "wikilink",
                source_candidate_id: source_id.clone(),
                source_path: file.path.clone(),
                target_wikilink: target.clone(),
                target_candidate_id: target_id,
                target_path,
                resolution_status: status.to_owned(),
            });

            if status != "resolved" {
                unresolved.push(UnresolvedWikilink {
                    source_path: file.path.clone(),
                    target_wikilink: target.clone(),
                    resolution_status: status.to_owned(),
                });
            }
        }
    }

    edges.sort_by(|a, b| {
        a.source_path
            .cmp(&b.source_path)
            .then_with(|| a.target_wikilink.cmp(&b.target_wikilink))
    });
    unresolved.sort_by(|a, b| {
        a.source_path
            .cmp(&b.source_path)
            .then_with(|| a.target_wikilink.cmp(&b.target_wikilink))
    });

    Ok(CandidateReport {
        schema_version: SCHEMA_VERSION,
        source_manifest_schema_version: manifest.schema_version,
        graph_root: manifest.graph_root,
        node_count: nodes.len(),
        edge_count: edges.len(),
        unresolved_wikilink_count: unresolved.len(),
        nodes,
        edges,
        unresolved_wikilinks: unresolved,
    })
}

/// Builds the lookup used to resolve wikilinks: link key -> sorted list of file paths.
///
/// Each file contributes its path once for every key returned by `link_keys_for_file`.
fn build_link_index(files: &[ManifestFile]) -> BTreeMap<String, Vec<String>> {
    let keyed_paths = files.iter().flat_map(|file| {
        link_keys_for_file(file)
            .into_iter()
            .map(move |key| (key, file.path.clone()))
    });

    let mut paths_by_key = BTreeMap::<String, Vec<String>>::new();
    for (key, path) in keyed_paths {
        paths_by_key.entry(key).or_default().push(path);
    }
    // Keep each path list in a deterministic order regardless of input order.
    paths_by_key.values_mut().for_each(|paths| paths.sort());
    paths_by_key
}

/// Returns the distinct, non-empty keys under which a file can be the target of a wikilink.
///
/// The candidate keys are the full path, the path without a trailing `.md`, the
/// file stem, and the title. Each candidate is trimmed; empty results are dropped.
///
/// De-duplication happens *after* trimming. Otherwise two candidates that differ
/// only in surrounding whitespace (for example a title `" Foo "` and a stem `Foo`)
/// would both survive, the file would be indexed twice under the same key, and a
/// link to it would be reported as ambiguous even though only one file matches.
///
/// The keys are returned in sorted order.
fn link_keys_for_file(file_entry: &ManifestFile) -> Vec<String> {
    let path = file_entry.path.as_str();
    let path_without_ext = path.strip_suffix(".md").unwrap_or(path);
    let basename_without_ext = Path::new(path)
        .file_stem()
        .and_then(|stem| stem.to_str())
        .unwrap_or_default();

    let candidates = [
        path,
        path_without_ext,
        basename_without_ext,
        file_entry.title.as_str(),
    ];
    let unique: BTreeSet<&str> = candidates
        .iter()
        .map(|candidate| candidate.trim())
        .filter(|key| !key.is_empty())
        .collect();
    unique.into_iter().map(str::to_owned).collect()
}

/// Splits a document path into its catalog segments.
///
/// A single trailing `.md` is removed, the remainder is split on `/`, and empty
/// segments (from leading, trailing or doubled slashes) are discarded.
fn catalog_path(path: &str) -> Vec<String> {
    let trimmed = match path.strip_suffix(".md") {
        Some(rest) => rest,
        None => path,
    };

    let mut segments = Vec::new();
    for segment in trimmed.split('/') {
        if segment.is_empty() {
            continue;
        }
        segments.push(segment.to_owned());
    }
    segments
}

/// Classifies a document from its frontmatter or, failing that, from its path.
///
/// An explicit frontmatter `type` wins when, after trimming, it is non-empty and
/// is not the literal `unknown`. Otherwise the first matching path rule applies,
/// in this order: `index`, `project_brief`, `pinned_mission`, `plan`,
/// `export_contract`, `batch_report`, `requirement`, `decision`, `open_question`,
/// `milestone_ladder`, `export`, and finally `technical_note` as the fallback.
///
/// Directory rules (`03_decisions`, `08_open_questions`, `09_exports`) compare
/// ASCII-lowercased path components, so they match the directory both at the
/// start of the path and nested deeper. All three rules share this check so that
/// a root-level `03_Decisions/...` is classified the same way as
/// `kg/03_Decisions/...`.
fn document_type_for(path: &str, frontmatter: &BTreeMap<String, String>) -> String {
    if let Some(explicit) = frontmatter.get("type").map(|value| value.trim()) {
        if !explicit.is_empty() && explicit != "unknown" {
            return explicit.to_owned();
        }
    }

    let stem = Path::new(path)
        .file_stem()
        .and_then(|stem| stem.to_str())
        .unwrap_or_default();
    let lower_path = path.to_ascii_lowercase();
    let basename = Path::new(path)
        .file_name()
        .and_then(|name| name.to_str())
        .unwrap_or_default()
        .to_ascii_lowercase();

    // True when `directory` is one of the path's directory components. The last
    // `/`-separated component is skipped: it is the file name, not a directory.
    let in_directory = |directory: &str| {
        lower_path
            .rsplit('/')
            .skip(1)
            .any(|component| component == directory)
    };

    if stem == "00_Index" {
        "index"
    } else if stem == "01_Project_Brief" {
        "project_brief"
    } else if stem == "00_Pinned_Mission" || lower_path.contains("pinned_mission") {
        "pinned_mission"
    } else if path.ends_with(".plan.md") {
        "plan"
    } else if path.ends_with(".schema.md") {
        "export_contract"
    } else if basename.ends_with("-status.md") {
        "batch_report"
    } else if basename.ends_with("-contract.md") {
        "requirement"
    } else if in_directory("03_decisions") || stem.eq_ignore_ascii_case("decision log") {
        "decision"
    } else if in_directory("08_open_questions") || lower_path.contains("open-question") {
        "open_question"
    } else if lower_path.contains("milestone") && lower_path.contains("ladder") {
        "milestone_ladder"
    } else if in_directory("09_exports") {
        "export"
    } else {
        "technical_note"
    }
    .to_owned()
}

/// Derives a deterministic identifier of the form `<prefix>_<24 hex chars>`.
///
/// The hex portion is the first 12 bytes of the SHA-256 digest of `parts` joined
/// with a NUL byte between consecutive parts. `prefix` is not part of the hashed input.
fn stable_id(prefix: &str, parts: &[&str]) -> String {
    let mut hasher = Sha256::new();
    // Feeding the pieces one at a time hashes the same byte stream as the joined string.
    for (position, part) in parts.iter().enumerate() {
        if position > 0 {
            hasher.update(b"\0");
        }
        hasher.update(part.as_bytes());
    }
    let digest = hasher.finalize();

    // 12 bytes -> 24 lowercase hex characters.
    let short_hex: String = digest
        .iter()
        .take(12)
        .map(|byte| format!("{byte:02x}"))
        .collect();
    format!("{prefix}_{short_hex}")
}