// weave-content 0.2.9
//
// Content DSL parser, validator, and builder for OSINT case files
// Documentation
#![deny(unsafe_code)]
#![deny(clippy::unwrap_used)]
#![deny(clippy::expect_used)]
#![allow(clippy::missing_errors_doc)]

pub mod cache;
pub mod entity;
pub mod nulid_gen;
pub mod output;
pub mod parser;
pub mod registry;
pub mod relationship;
pub mod timeline;
pub mod verifier;
pub mod writeback;

use crate::entity::Entity;
use crate::parser::{ParseError, ParsedCase, SectionKind};
use crate::relationship::Rel;

/// Parse a case file fully: front matter, entities, relationships, timeline.
///
/// Returns the parsed case, the inline entities found in `Events` sections, and
/// every relationship produced by the `Relationships` and `Timeline` sections
/// (including NEXT from timeline).
///
/// When a registry is provided, relationship names are resolved against both
/// the inline entities AND the names in the global entity registry. Timeline
/// entries are resolved only against inline entities labelled
/// `entity::Label::PublicRecord`.
///
/// # Errors
///
/// Errors from `parser::parse` are propagated immediately. Errors reported by
/// the entity, relationship, and timeline parsers are accumulated across all
/// sections and returned together; the result is `Ok` only if none occurred.
pub fn parse_full(
    content: &str,
    reg: Option<&registry::EntityRegistry>,
) -> Result<(ParsedCase, Vec<Entity>, Vec<Rel>), Vec<ParseError>> {
    let case = parser::parse(content)?;
    let mut errors = Vec::new();

    // Inline entities are only read from `Events` sections.
    let mut all_entities = Vec::new();
    for section in &case.sections {
        if section.kind == SectionKind::Events {
            all_entities.extend(entity::parse_entities(
                &section.body,
                section.kind,
                section.line,
                &mut errors,
            ));
        }
    }

    // Build combined name list: inline entities first, then registry names
    // that have not been seen yet. A set tracks membership so that merging a
    // large registry stays linear instead of rescanning the list per name.
    let mut entity_names: Vec<&str> = all_entities.iter().map(|e| e.name.as_str()).collect();
    if let Some(registry) = reg {
        let mut seen: std::collections::HashSet<&str> = entity_names.iter().copied().collect();
        for name in registry.names() {
            if seen.insert(name) {
                entity_names.push(name);
            }
        }
    }

    // Names the timeline parser may reference: inline `PublicRecord` entities only.
    let event_names: Vec<&str> = all_entities
        .iter()
        .filter(|e| e.label == entity::Label::PublicRecord)
        .map(|e| e.name.as_str())
        .collect();

    // Relationship sections are processed before timeline sections so that
    // both the returned relationships and the reported errors keep that order.
    let mut all_rels = Vec::new();
    for section in &case.sections {
        if section.kind == SectionKind::Relationships {
            all_rels.extend(relationship::parse_relationships(
                &section.body,
                section.line,
                &entity_names,
                &case.sources,
                &mut errors,
            ));
        }
    }

    for section in &case.sections {
        if section.kind == SectionKind::Timeline {
            all_rels.extend(timeline::parse_timeline(
                &section.body,
                section.line,
                &event_names,
                &mut errors,
            ));
        }
    }

    if errors.is_empty() {
        Ok((case, all_entities, all_rels))
    } else {
        Err(errors)
    }
}

/// Collect registry entities referenced by relationships in this case.
///
/// Walks every relationship's source and target name (source first) and, for
/// each name that does not belong to an inline entity, looks it up in `reg`.
/// Each matching registry entity is cloned into the result exactly once, in
/// order of first reference. Names that are neither inline nor present in the
/// registry are ignored.
pub fn collect_referenced_registry_entities(
    rels: &[Rel],
    inline_entities: &[Entity],
    reg: &registry::EntityRegistry,
) -> Vec<Entity> {
    // Borrowed sets give constant-time membership checks without cloning names.
    let inline_names: std::collections::HashSet<&str> =
        inline_entities.iter().map(|e| e.name.as_str()).collect();
    let mut seen_names: std::collections::HashSet<&str> = std::collections::HashSet::new();
    let mut referenced = Vec::new();

    for rel in rels {
        for name in [&rel.source_name, &rel.target_name] {
            let key = name.as_str();
            if inline_names.contains(key) || seen_names.contains(key) {
                continue;
            }
            // Only names that actually resolve are recorded as seen.
            if let Some(entry) = reg.get_by_name(name) {
                referenced.push(entry.entity.clone());
                seen_names.insert(key);
            }
        }
    }

    referenced
}

/// Build a `CaseOutput` from a case file path.
///
/// Reads the file, parses it against `reg`, builds the output, and writes any
/// newly generated IDs back to the case file and to the registry entity files.
/// Progress and diagnostics are printed to stderr.
///
/// On failure the error value is a process exit code: `2` when the case file
/// cannot be read or written, `1` when parsing or output building reports
/// errors, or whatever code the registry writeback step returns.
pub fn build_case_output(
    path: &str,
    reg: &registry::EntityRegistry,
) -> Result<output::CaseOutput, i32> {
    const EXIT_INVALID: i32 = 1;
    const EXIT_IO: i32 = 2;

    let content = std::fs::read_to_string(path).map_err(|e| {
        eprintln!("{path}: error reading file: {e}");
        EXIT_IO
    })?;

    let (case, entities, rels) = parse_full(&content, Some(reg)).map_err(|errors| {
        for err in &errors {
            eprintln!("{path}:{err}");
        }
        EXIT_INVALID
    })?;

    let referenced_entities = collect_referenced_registry_entities(&rels, &entities, reg);

    let build_result = output::build_output(
        &case.id,
        &case.title,
        &case.summary,
        &case.sources,
        &entities,
        &rels,
        &referenced_entities,
    )
    .map_err(|errors| {
        for err in &errors {
            eprintln!("{path}:{err}");
        }
        EXIT_INVALID
    })?;

    let case_output = build_result.output;

    // Write back generated IDs to source case file; nothing is attempted when
    // no IDs are pending.
    let mut case_ids = build_result.case_pending;
    let rewritten = if case_ids.is_empty() {
        None
    } else {
        writeback::apply_writebacks(&content, &mut case_ids)
    };
    if let Some(modified) = rewritten {
        writeback::write_file(std::path::Path::new(path), &modified).map_err(|e| {
            eprintln!("{e}");
            EXIT_IO
        })?;
        let count = case_ids.len();
        eprintln!("{path}: wrote {count} generated ID(s) back to file");
    }

    // Write back generated IDs to entity files
    if let Some(code) = writeback_registry_entities(&build_result.registry_pending, reg) {
        return Err(code);
    }

    eprintln!(
        "{path}: built ({} nodes, {} relationships)",
        case_output.nodes.len(),
        case_output.relationships.len()
    );
    Ok(case_output)
}

/// Write back generated IDs to registry entity files.
///
/// Pending entries whose name is not in the registry are skipped. Processing
/// stops at the first file that cannot be read or written.
///
/// Returns `Some(exit_code)` on error, `None` on success.
fn writeback_registry_entities(
    pending: &[(String, writeback::PendingId)],
    reg: &registry::EntityRegistry,
) -> Option<i32> {
    const EXIT_IO: i32 = 2;

    pending
        .iter()
        .try_for_each(|(entity_name, pending_id)| -> Result<(), i32> {
            let Some(entry) = reg.get_by_name(entity_name) else {
                return Ok(());
            };
            let target = &entry.path;
            let text = std::fs::read_to_string(target).map_err(|e| {
                eprintln!("{}: error reading file: {e}", target.display());
                EXIT_IO
            })?;

            // The file on disk is the source of truth. When several cases share
            // an entity, the first case writes the ID while the in-memory
            // registry still holds `id: None`, so later cases arrive here with
            // a freshly generated ID. Re-reading the file and leaving it alone
            // when an `id:` is already present prevents duplicate IDs.
            if front_matter_has_id(&text) {
                return Ok(());
            }

            let insert_at = writeback::find_front_matter_end(&text).unwrap_or(2);
            let mut ids = vec![writeback::PendingId {
                line: insert_at,
                id: pending_id.id.clone(),
                kind: writeback::WriteBackKind::EntityFrontMatter,
            }];
            if let Some(modified) = writeback::apply_writebacks(&text, &mut ids) {
                writeback::write_file(target, &modified).map_err(|e| {
                    eprintln!("{e}");
                    EXIT_IO
                })?;
                eprintln!("{}: wrote generated ID back to file", target.display());
            }
            Ok(())
        })
        .err()
}

/// Check whether a file's YAML front matter already contains an `id:` field.
///
/// Front matter is only recognised when a `---` line is the first non-blank
/// line of `content`; it ends at the next `---` line (or at end of input when
/// unterminated). A `---` that appears later in the document, such as a
/// Markdown horizontal rule, does not open front matter, so an `id:` line
/// following it is not mistaken for an existing ID.
///
/// Lines are compared after trimming surrounding whitespace.
fn front_matter_has_id(content: &str) -> bool {
    let mut lines = content
        .lines()
        .map(str::trim)
        .skip_while(|line| line.is_empty());

    // No opening delimiter at the top means there is no front matter at all.
    if lines.next() != Some("---") {
        return false;
    }

    lines
        .take_while(|line| *line != "---")
        .any(|line| line.starts_with("id:"))
}

/// Resolve the content root directory.
///
/// Priority:
/// 1. An explicit `root` (the `--root` flag) is returned as given, without
///    consulting the filesystem.
/// 2. If `path` is an existing file: the nearest ancestor of the file's parent
///    directory that contains a `cases`, `actors`, or `institutions`
///    subdirectory, or the parent directory itself when no ancestor does.
/// 3. If `path` is an existing directory: that directory.
/// 4. Otherwise the current directory, `"."`.
///
/// The result is never an empty path: a bare relative file name such as
/// `case.md` has an empty parent, which is reported as `"."`.
pub fn resolve_content_root(path: Option<&str>, root: Option<&str>) -> std::path::PathBuf {
    // Subdirectories whose presence marks a directory as a content root.
    const MARKER_DIRS: [&str; 3] = ["cases", "actors", "institutions"];

    if let Some(r) = root {
        return std::path::PathBuf::from(r);
    }

    let current_dir = || std::path::PathBuf::from(".");

    let Some(p) = path.map(std::path::Path::new) else {
        return current_dir();
    };
    if !p.is_file() {
        return if p.is_dir() {
            p.to_path_buf()
        } else {
            current_dir()
        };
    }
    let Some(parent) = p.parent() else {
        return current_dir();
    };

    // `ancestors()` starts at `parent` itself and walks towards the root, so
    // the closest marked directory wins.
    let resolved = parent
        .ancestors()
        .find(|dir| MARKER_DIRS.iter().any(|marker| dir.join(marker).is_dir()))
        .unwrap_or(parent);

    // `Path::parent` of a bare file name is the empty path; normalise it so
    // callers always receive a usable directory.
    if resolved.as_os_str().is_empty() {
        current_dir()
    } else {
        resolved.to_path_buf()
    }
}

/// Load entity registry from content root. Returns empty registry if no entity dirs exist.
///
/// When loading fails, every reported error is printed to stderr with a
/// `registry: ` prefix and exit code `1` is returned as the error value.
pub fn load_registry(content_root: &std::path::Path) -> Result<registry::EntityRegistry, i32> {
    registry::EntityRegistry::load(content_root).map_err(|errors| {
        for err in &errors {
            eprintln!("registry: {err}");
        }
        1
    })
}

/// Resolve case file paths from path argument.
///
/// * `path` names an existing file: the result is just that path.
/// * `path` names neither a file nor a directory: a message is printed to
///   stderr and exit code `2` is returned as the error.
/// * `path` is a directory or `None`: Markdown files are auto-discovered under
///   `content_root/cases` and returned sorted. The directory given in `path`
///   is not itself scanned. A missing `cases` directory yields an empty list.
pub fn resolve_case_files(
    path: Option<&str>,
    content_root: &std::path::Path,
) -> Result<Vec<String>, i32> {
    match path.map(|p| (p, std::path::Path::new(p))) {
        Some((p, target)) if target.is_file() => return Ok(vec![p.to_string()]),
        Some((p, target)) if !target.is_dir() => {
            eprintln!("{p}: not a file or directory");
            return Err(2);
        }
        _ => {}
    }

    let mut files = Vec::new();
    let cases_dir = content_root.join("cases");
    if cases_dir.is_dir() {
        discover_md_files(&cases_dir, &mut files, 0);
        files.sort();
    }
    Ok(files)
}

/// Recursively discover .md files in a directory (max 3 levels deep for cases/year/topic/).
///
/// `depth` is the nesting level of `dir`; callers start at `0`. Directories at
/// a depth greater than 3 are not read. Entries are visited in file-name
/// order, and each matching path is appended to `files` as a string.
/// Unreadable directories and entries are silently skipped, as are paths that
/// are not valid UTF-8.
fn discover_md_files(dir: &std::path::Path, files: &mut Vec<String>, depth: usize) {
    const MAX_DEPTH: usize = 3;
    if depth > MAX_DEPTH {
        return;
    }

    let mut children: Vec<std::fs::DirEntry> = match std::fs::read_dir(dir) {
        Ok(listing) => listing.filter_map(Result::ok).collect(),
        Err(_) => return,
    };
    children.sort_by_key(std::fs::DirEntry::file_name);

    for child in &children {
        let child_path = child.path();
        if child_path.is_dir() {
            discover_md_files(&child_path, files, depth + 1);
        } else if child_path.extension().and_then(|ext| ext.to_str()) == Some("md") {
            // `to_str` is `None` for non-UTF-8 paths; extending with `None` adds nothing.
            files.extend(child_path.to_str().map(String::from));
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// An `id:` line between the delimiters is detected.
    #[test]
    fn front_matter_has_id_present() {
        assert!(front_matter_has_id("---\nid: 01JABC000000000000000000AA\n---\n\n# Test\n"));
    }

    /// Empty front matter contains no `id:`.
    #[test]
    fn front_matter_has_id_absent() {
        assert!(!front_matter_has_id("---\n---\n\n# Test\n"));
    }

    /// The `id:` line does not need to be the first field.
    #[test]
    fn front_matter_has_id_with_other_fields() {
        assert!(front_matter_has_id(
            "---\nother: value\nid: 01JABC000000000000000000AA\n---\n\n# Test\n"
        ));
    }

    /// A document without any delimiter has no front matter.
    #[test]
    fn front_matter_has_id_no_front_matter() {
        assert!(!front_matter_has_id("# Test\n\nNo front matter here.\n"));
    }

    /// `id:` appearing in the body should not count.
    #[test]
    fn front_matter_has_id_outside_front_matter() {
        assert!(!front_matter_has_id("---\n---\n\n# Test\n\n- id: some-value\n"));
    }
}