weave-content 0.2.8

use clap::{Parser, Subcommand};

use weave_content::cache;
use weave_content::output;
use weave_content::registry;
use weave_content::verifier;
use weave_content::{
    build_case_output, load_registry, parse_full, resolve_case_files, resolve_content_root,
};

#[cfg(test)]
use weave_content::entity;

// Top-level command-line definition; `main` calls `Cli::parse()` and then
// dispatches on `command`.
// NOTE(review): per clap's derive reference, bare `version` / `about` are
// filled from Cargo package metadata -- confirm against the clap version in use.
/// Content DSL parser, validator, and builder for OSINT case files.
#[derive(Parser)]
#[command(name = "weave-content", version, about)]
struct Cli {
    // The selected subcommand (validate / verify / build).
    #[command(subcommand)]
    command: Command,
}

// Subcommands of the CLI. `main` destructures each variant and forwards its
// fields to the matching `cmd_validate` / `cmd_verify` / `cmd_build` function.
// The `///` comments below double as clap help text, so they are user-facing.
#[derive(Subcommand)]
enum Command {
    /// Validate case files (parse and check schema).
    Validate {
        // No `#[arg]` attribute here; presumably clap treats this as an
        // optional positional argument -- confirm against clap's derive rules.
        /// Path to case file, or content root directory.
        /// When a directory is given, auto-discovers cases/**/*.md.
        path: Option<String>,

        /// Content root directory (for loading entity registry).
        /// Defaults to the parent of the given path, or current directory.
        #[arg(long)]
        root: Option<String>,
    },
    /// Verify URLs in case files (HEAD/GET checks).
    Verify {
        /// Path to case file, or content root directory.
        path: Option<String>,

        /// Content root directory.
        #[arg(long)]
        root: Option<String>,

        // Forwarded unchanged to `verifier::verify_urls` by `verify_single_case`.
        /// Maximum concurrent requests.
        #[arg(long, default_value_t = 16)]
        concurrency: usize,

        // Forwarded unchanged to `verifier::verify_urls` by `verify_single_case`.
        /// Per-URL timeout in seconds.
        #[arg(long, default_value_t = 15)]
        timeout: u64,

        // When absent, no cache is loaded or saved and every URL is checked.
        /// Path to URL verification cache file.
        #[arg(long)]
        cache: Option<String>,

        // With this flag set, `verify_single_case` returns 0 even when URL
        // checks report errors; read/parse failures still return non-zero.
        /// Report all as warnings, never fail.
        #[arg(long)]
        warn_only: bool,
    },
    /// Build JSON output from case files.
    Build {
        /// Path to case file, or content root directory.
        path: Option<String>,

        /// Content root directory.
        #[arg(long)]
        root: Option<String>,

        // `write_case_output` writes `<output>/<case_id>.json` when set and
        // prints the JSON to stdout otherwise.
        /// Output directory (one JSON per case). Stdout if omitted.
        #[arg(short, long)]
        output: Option<String>,
    },
}

fn main() {
    let cli = Cli::parse();

    let exit_code = match cli.command {
        Command::Validate { ref path, ref root } => cmd_validate(path.as_deref(), root.as_deref()),
        Command::Verify {
            ref path,
            ref root,
            concurrency,
            timeout,
            ref cache,
            warn_only,
        } => cmd_verify(
            path.as_deref(),
            root.as_deref(),
            concurrency,
            timeout,
            cache.as_deref(),
            warn_only,
        ),
        Command::Build {
            ref path,
            ref root,
            ref output,
        } => cmd_build(path.as_deref(), root.as_deref(), output.as_deref()),
    };

    std::process::exit(exit_code);
}

/// Runs the `validate` subcommand over every resolved case file.
///
/// Returns the error code from registry loading or case-file resolution if
/// either fails, 1 when no case files are found, and otherwise the status of
/// the last case that did not validate cleanly (0 if all succeeded).
fn cmd_validate(path: Option<&str>, root: Option<&str>) -> i32 {
    let content_root = resolve_content_root(path, root);

    let reg = match load_registry(&content_root) {
        Err(code) => return code,
        Ok(loaded) => loaded,
    };
    let case_files = match resolve_case_files(path, &content_root) {
        Err(code) => return code,
        Ok(files) => files,
    };

    if case_files.is_empty() {
        eprintln!("no case files found");
        return 1;
    }

    if !reg.is_empty() {
        eprintln!("registry: {} entities loaded", reg.len());
    }

    // Every file is processed; the most recent non-zero status wins.
    case_files.iter().fold(0, |worst, case_path| {
        let code = validate_single_case(case_path, &reg);
        if code == 0 { worst } else { code }
    })
}

/// Returns the longest prefix of `text` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
///
/// Slicing a `str` at an arbitrary byte offset panics when the offset falls
/// inside a multi-byte character; this backs up to the previous boundary.
fn truncate_at_char_boundary(text: &str, max_bytes: usize) -> &str {
    if text.len() <= max_bytes {
        return text;
    }
    // Index 0 is always a character boundary, so the walk back terminates.
    let mut end = max_bytes;
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    &text[..end]
}

/// Parses a single case file and prints a human-readable report to stderr.
///
/// On success the report lists the case id and title, entity, relationship
/// and source counts, a summary preview of at most 80 bytes, and one line per
/// entity and relationship.
///
/// Returns 0 when the file parses, 1 when `parse_full` reports errors (each is
/// printed prefixed with `path`), and 2 when the file cannot be read.
fn validate_single_case(path: &str, reg: &registry::EntityRegistry) -> i32 {
    let content = match std::fs::read_to_string(path) {
        Ok(c) => c,
        Err(e) => {
            eprintln!("{path}: error reading file: {e}");
            return 2;
        }
    };

    match parse_full(&content, Some(reg)) {
        Ok((case, entities, rels)) => {
            eprintln!(
                "{path}: ok -- {id}: {title} ({ent} entities, {rel} relationships, {src} sources)",
                id = case.id,
                title = case.title,
                ent = entities.len(),
                rel = rels.len(),
                src = case.sources.len(),
            );
            if !case.summary.is_empty() {
                // Truncate on a character boundary: a fixed byte offset would
                // panic on summaries containing multi-byte characters.
                eprintln!(
                    "  summary: {}...",
                    truncate_at_char_boundary(&case.summary, 80)
                );
            }
            for e in &entities {
                let id_display = e.id.as_deref().unwrap_or("(no id)");
                eprintln!(
                    "  line {}: {id_display} {} ({}, {} fields)",
                    e.line,
                    e.name,
                    e.label,
                    e.fields.len()
                );
            }
            for r in &rels {
                let id_display = r.id.as_deref().unwrap_or("(no id)");
                eprintln!(
                    "  line {}: {id_display} {} -> {}: {}",
                    r.line, r.source_name, r.target_name, r.rel_type,
                );
            }
            0
        }
        Err(errors) => {
            for err in &errors {
                eprintln!("{path}:{err}");
            }
            1
        }
    }
}

/// Runs the `verify` subcommand over every resolved case file.
///
/// Returns the error code from registry loading or case-file resolution if
/// either fails, 1 when no case files are found, and otherwise the status of
/// the last case whose verification returned non-zero (0 if none did).
#[allow(clippy::too_many_lines)]
fn cmd_verify(
    path: Option<&str>,
    root: Option<&str>,
    concurrency: usize,
    timeout: u64,
    cache_path: Option<&str>,
    warn_only: bool,
) -> i32 {
    let content_root = resolve_content_root(path, root);

    let reg = match load_registry(&content_root) {
        Err(code) => return code,
        Ok(loaded) => loaded,
    };
    let case_files = match resolve_case_files(path, &content_root) {
        Err(code) => return code,
        Ok(files) => files,
    };

    if case_files.is_empty() {
        eprintln!("no case files found");
        return 1;
    }

    // Every file is processed; the most recent non-zero status wins.
    case_files.iter().fold(0, |worst, case_path| {
        let code =
            verify_single_case(case_path, &reg, concurrency, timeout, cache_path, warn_only);
        if code == 0 { worst } else { code }
    })
}

#[allow(clippy::too_many_lines)]
fn verify_single_case(
    path: &str,
    reg: &registry::EntityRegistry,
    concurrency: usize,
    timeout: u64,
    cache_path: Option<&str>,
    warn_only: bool,
) -> i32 {
    let content = match std::fs::read_to_string(path) {
        Ok(c) => c,
        Err(e) => {
            eprintln!("{path}: error reading file: {e}");
            return 2;
        }
    };

    let (case, entities, rels) = match parse_full(&content, Some(reg)) {
        Ok(result) => result,
        Err(errors) => {
            for err in &errors {
                eprintln!("{path}:{err}");
            }
            return 1;
        }
    };

    let mut collect_errors = Vec::new();
    let urls = verifier::collect_urls(&case.sources, &entities, &rels, &mut collect_errors);

    if !collect_errors.is_empty() {
        for err in &collect_errors {
            eprintln!("{path}:{err}");
        }
        return 1;
    }

    if urls.is_empty() {
        eprintln!("{path}: no URLs to verify");
        return 0;
    }

    // Load cache if path provided
    let mut verify_cache = cache_path.map(|p| match cache::VerifyCache::load(p) {
        Ok(c) => {
            eprintln!("{path}: using cache {p}");
            c
        }
        Err(e) => {
            eprintln!("{path}: cache load warning: {e}");
            cache::VerifyCache::load("/dev/null").unwrap_or_else(|_| {
                // Fallback: in-memory only, won't save
                cache::VerifyCache::empty()
            })
        }
    });

    // Partition URLs into cached and uncached
    let (cached_results, urls_to_check) = partition_cached(&urls, verify_cache.as_ref());

    let check_count = urls_to_check.len();
    let cached_count = cached_results.len();

    if cached_count > 0 {
        eprintln!(
            "{path}: {cached_count} cached, {check_count} to check (concurrency={concurrency}, timeout={timeout}s)"
        );
    } else {
        eprintln!(
            "{path}: verifying {check_count} URLs (concurrency={concurrency}, timeout={timeout}s)"
        );
    }

    let fresh_results = if urls_to_check.is_empty() {
        Vec::new()
    } else {
        let rt = match tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
        {
            Ok(rt) => rt,
            Err(e) => {
                eprintln!("{path}: failed to create async runtime: {e}");
                return 2;
            }
        };
        rt.block_on(verifier::verify_urls(urls_to_check, concurrency, timeout))
    };

    // Update cache with fresh results
    if let Some(ref mut vc) = verify_cache {
        for check in &fresh_results {
            vc.put(&check.url, check.status, check.detail.as_deref());
        }
    }

    // Combine cached + fresh results
    let mut all_results = cached_results;
    all_results.extend(fresh_results);

    let mut has_error = false;

    for check in &all_results {
        let detail = check.detail.as_deref().unwrap_or("");
        match check.status {
            verifier::CheckStatus::Ok => {
                eprintln!(
                    "  ok  {}{}",
                    check.url,
                    if check.is_thumbnail {
                        " [thumbnail]"
                    } else {
                        ""
                    }
                );
            }
            verifier::CheckStatus::Warn => {
                eprintln!("  warn  {} -- {detail}", check.url);
            }
            verifier::CheckStatus::Error => {
                has_error = true;
                eprintln!("  ERROR {} -- {detail}", check.url);
            }
        }
    }

    let ok_count = all_results
        .iter()
        .filter(|c| c.status == verifier::CheckStatus::Ok)
        .count();
    let warn_count = all_results
        .iter()
        .filter(|c| c.status == verifier::CheckStatus::Warn)
        .count();
    let err_count = all_results
        .iter()
        .filter(|c| c.status == verifier::CheckStatus::Error)
        .count();

    eprintln!("{path}: {ok_count} ok, {warn_count} warn, {err_count} error");

    // Save cache
    if let Some(ref vc) = verify_cache
        && let Err(e) = vc.save()
    {
        eprintln!("{path}: cache save warning: {e}");
    }

    i32::from(has_error && !warn_only)
}

/// Splits `urls` into results answered by the cache and entries that still
/// need a live check.
///
/// Without a cache, every entry is returned as "needs checking". A cached
/// status string of `"ok"` or `"warn"` maps to the matching `CheckStatus`;
/// any other string is treated as an error.
fn partition_cached(
    urls: &[verifier::UrlEntry],
    verify_cache: Option<&cache::VerifyCache>,
) -> (Vec<verifier::UrlCheck>, Vec<verifier::UrlEntry>) {
    let mut hits = Vec::new();
    let mut misses = Vec::new();

    match verify_cache {
        // No cache -- nothing can be answered without a check.
        None => misses.extend_from_slice(urls),
        Some(vc) => {
            for entry in urls {
                let Some(cache_entry) = vc.get(entry.url()) else {
                    misses.push(entry.clone());
                    continue;
                };

                let status = match cache_entry.status.as_str() {
                    "warn" => verifier::CheckStatus::Warn,
                    "ok" => verifier::CheckStatus::Ok,
                    _ => verifier::CheckStatus::Error,
                };

                hits.push(verifier::UrlCheck {
                    url: entry.url().to_string(),
                    status,
                    detail: cache_entry.detail.clone(),
                    is_thumbnail: entry.is_thumbnail(),
                });
            }
        }
    }

    (hits, misses)
}

/// Runs the `build` subcommand over every resolved case file.
///
/// Returns the error code from registry loading or case-file resolution if
/// either fails, 1 when no case files are found, and otherwise the status of
/// the last case whose build returned non-zero (0 if none did).
fn cmd_build(path: Option<&str>, root: Option<&str>, output_dir: Option<&str>) -> i32 {
    let content_root = resolve_content_root(path, root);

    let reg = match load_registry(&content_root) {
        Err(code) => return code,
        Ok(loaded) => loaded,
    };
    let case_files = match resolve_case_files(path, &content_root) {
        Err(code) => return code,
        Ok(files) => files,
    };

    if case_files.is_empty() {
        eprintln!("no case files found");
        return 1;
    }

    // Every file is processed; the most recent non-zero status wins.
    case_files.iter().fold(0, |worst, case_path| {
        let code = build_single_case(case_path, &reg, output_dir);
        if code == 0 { worst } else { code }
    })
}

/// Builds the output for one case file and writes it out.
///
/// Returns the error code from `build_case_output` if building fails,
/// otherwise whatever `write_case_output` returns.
fn build_single_case(path: &str, reg: &registry::EntityRegistry, output_dir: Option<&str>) -> i32 {
    match build_case_output(path, reg) {
        Err(code) => code,
        Ok(built) => write_case_output(path, &built.case_id, &built, output_dir),
    }
}

/// Serialises `case_output` as pretty-printed JSON and emits it.
///
/// With `output_dir` set, the JSON is written to `<output_dir>/<case_id>.json`
/// and the source-to-destination mapping is logged on stderr; without it, the
/// JSON is printed to stdout.
///
/// Returns 0 on success and 2 when serialisation or the file write fails.
fn write_case_output(
    path: &str,
    case_id: &str,
    case_output: &output::CaseOutput,
    output_dir: Option<&str>,
) -> i32 {
    // Serialisation is the same for both destinations, so do it once.
    let json = match serde_json::to_string_pretty(case_output) {
        Ok(text) => text,
        Err(e) => {
            eprintln!("{path}: JSON serialization error: {e}");
            return 2;
        }
    };

    let Some(dir) = output_dir else {
        println!("{json}");
        return 0;
    };

    let out_path = format!("{dir}/{case_id}.json");
    if let Err(e) = std::fs::write(&out_path, json) {
        eprintln!("{out_path}: error writing file: {e}");
        return 2;
    }
    eprintln!("{path} -> {out_path}");

    0
}

// Tests that drive `parse_full` and `output::build_output` with inline case
// fixtures; none of them touch the filesystem or the network.
#[cfg(test)]
mod tests {
    use super::*;

    // Shared fixture: front matter with an id and two sources, a title and
    // summary, three events, three explicit relationships (one carrying its
    // own source) and a three-step timeline.
    const FULL_CASE: &str = r"---
id: bonnick-v-arsenal
sources:
  - https://www.theguardian.com/football/2025/feb/03/bonnick
  - https://novaramedia.com/2025/02/04/bonnick
---

# Bonnick v Arsenal FC

Kit manager dismissed over social media posts about Israel-Gaza.

## Events

### Bonnick dismissal
- occurred_at: 2024-12-24
- document_type: termination
- description: Arsenal dismisses Bonnick over social media posts
  regarding Israel-Gaza conflict.

### FA investigation finding
- occurred_at: 2024
- document_type: investigation
- description: FA investigates and finds the posts did not breach
  FA rules. Matter closed by FA.

### Employment tribunal filing
- occurred_at: 2025-02-03
- document_type: filing
- description: Bonnick files employment tribunal claim against Arsenal.

## Relationships

- Bonnick dismissal -> FA investigation finding: related_to
- FA investigation finding -> Employment tribunal filing: related_to
- Bonnick dismissal -> Employment tribunal filing: related_to
  - source: https://novaramedia.com/2025/02/04/bonnick

## Timeline

Bonnick dismissal -> FA investigation finding -> Employment tribunal filing
";

    // Parses FULL_CASE without a registry and checks the case metadata, the
    // entity labels, and the relationships derived from the timeline.
    #[test]
    fn parse_full_case_file() {
        let (case, entities, rels) = parse_full(FULL_CASE, None).unwrap();

        assert_eq!(case.id, "bonnick-v-arsenal");
        assert_eq!(case.title, "Bonnick v Arsenal FC");
        assert!(case.summary.contains("Kit manager dismissed"));
        assert_eq!(case.sources.len(), 2);

        // 3 events (PublicRecord entities)
        assert_eq!(entities.len(), 3);
        assert!(
            entities
                .iter()
                .all(|e| e.label == entity::Label::PublicRecord)
        );

        let dismissal = entities
            .iter()
            .find(|e| e.name == "Bonnick dismissal")
            .unwrap();
        assert_eq!(dismissal.label, entity::Label::PublicRecord);

        // 3 explicit rels + 2 NEXT from timeline = 5
        assert_eq!(rels.len(), 5);

        // Check NEXT relationships from timeline
        let next_rels: Vec<_> = rels.iter().filter(|r| r.rel_type == "next").collect();
        assert_eq!(next_rels.len(), 2);
        assert_eq!(next_rels[0].source_name, "Bonnick dismissal");
        assert_eq!(next_rels[0].target_name, "FA investigation finding");
        assert_eq!(next_rels[1].source_name, "FA investigation finding");
        assert_eq!(next_rels[1].target_name, "Employment tribunal filing");
    }

    // A case with a single event and no Relationships or Timeline section
    // parses to one entity and no relationships.
    #[test]
    fn parse_full_minimal_case() {
        let input = r"---
id: minimal-test
sources:
  - https://example.com/source
---

# Minimal Test Case

A simple test.

## Events

### Something happened
- occurred_at: 2025-01-01
- document_type: court_ruling
";
        let (case, entities, rels) = parse_full(input, None).unwrap();
        assert_eq!(case.id, "minimal-test");
        assert_eq!(case.title, "Minimal Test Case");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].name, "Something happened");
        assert!(rels.is_empty());
    }

    // Builds output for FULL_CASE and checks the serialized JSON: expected
    // key/value pairs are present, ids are populated, and every relationship
    // endpoint refers to an emitted node.
    #[test]
    fn json_snapshot_full_case() {
        let (case, entities, rels) = parse_full(FULL_CASE, None).unwrap();
        let build_result = output::build_output(
            &case.id,
            &case.title,
            &case.summary,
            &case.sources,
            &entities,
            &rels,
            &[],
        )
        .unwrap();

        let json = serde_json::to_string_pretty(&build_result.output).unwrap();

        // Verify structure
        assert!(json.contains("\"case_id\": \"bonnick-v-arsenal\""));
        assert!(json.contains("\"title\": \"Bonnick v Arsenal FC\""));
        assert!(json.contains("\"label\": \"public_record\""));
        assert!(json.contains("\"name\": \"Bonnick dismissal\""));
        assert!(json.contains("\"name\": \"FA investigation finding\""));
        assert!(json.contains("\"document_type\": \"termination\""));
        assert!(json.contains("\"document_type\": \"investigation\""));
        assert!(json.contains("\"type\": \"related_to\""));
        assert!(json.contains("\"type\": \"next\""));

        // Verify NULIDs
        let output: serde_json::Value = serde_json::from_str(&json).unwrap();
        let nodes = output["nodes"].as_array().unwrap();
        let rels_arr = output["relationships"].as_array().unwrap();

        // Node ids must be non-empty and at least 20 characters long.
        for node in nodes {
            let id = node["id"].as_str().unwrap();
            assert!(!id.is_empty());
            assert!(id.len() >= 20);
        }
        // Relationship ids only need to be non-empty.
        for rel in rels_arr {
            let id = rel["id"].as_str().unwrap();
            assert!(!id.is_empty());
        }

        // source_id/target_id should reference existing node IDs
        let node_ids: Vec<&str> = nodes.iter().map(|n| n["id"].as_str().unwrap()).collect();
        for rel in rels_arr {
            let source_id = rel["source_id"].as_str().unwrap();
            let target_id = rel["target_id"].as_str().unwrap();
            assert!(
                node_ids.contains(&source_id),
                "source_id {source_id} not found in nodes"
            );
            assert!(
                node_ids.contains(&target_id),
                "target_id {target_id} not found in nodes"
            );
        }
    }

    // An event with only `occurred_at` set must not produce keys for the
    // fields it does not have.
    #[test]
    fn json_snapshot_omits_empty_fields() {
        let input = r"---
id: sparse
sources:
  - https://example.com/src
---

# Sparse Case

Summary.

## Events

### Something
- occurred_at: 2025-01-01
";
        let (case, entities, rels) = parse_full(input, None).unwrap();
        let build_result = output::build_output(
            &case.id,
            &case.title,
            &case.summary,
            &case.sources,
            &entities,
            &rels,
            &[],
        )
        .unwrap();

        let json = serde_json::to_string_pretty(&build_result.output).unwrap();

        // These should be omitted (not present at all)
        assert!(!json.contains("\"qualifier\""));
        assert!(!json.contains("\"description\""));
        assert!(!json.contains("\"thumbnail\""));
        assert!(!json.contains("\"aliases\""));
        assert!(!json.contains("\"urls\""));

        // These should be present
        assert!(json.contains("\"occurred_at\": \"2025-01-01\""));
    }

    // A relationship naming an entity that is not defined inline fails to
    // parse without a registry and succeeds once the registry provides it.
    #[test]
    fn cross_file_resolution_with_registry() {
        use std::path::PathBuf;
        use weave_content::entity::Entity;

        // Create a registry with an actor
        let entries = vec![registry::RegistryEntry {
            entity: Entity {
                name: "Mark Bonnick".to_string(),
                label: entity::Label::Actor,
                fields: vec![(
                    "nationality".to_string(),
                    entity::FieldValue::Single("British".to_string()),
                )],
                id: Some("01JXYZ123456789ABCDEFGHIJK".to_string()),
                line: 1,
            },
            path: PathBuf::from("actors/mark-bonnick.md"),
        }];
        let reg = registry::EntityRegistry::from_entries(entries).unwrap();

        // Case file references "Mark Bonnick" in relationships
        let input = r"---
id: test-cross-ref
sources:
  - https://example.com/src
---

# Cross Reference Test

Summary.

## Events

### Dismissal
- occurred_at: 2024-12-24
- document_type: termination

## Relationships

- Mark Bonnick -> Dismissal: related_to
";
        // Without registry: should fail (Mark Bonnick not found)
        let err = parse_full(input, None).unwrap_err();
        assert!(err.iter().any(|e| e.message.contains("Mark Bonnick")));

        // With registry: should succeed
        let (case, entities, rels) = parse_full(input, Some(&reg)).unwrap();
        assert_eq!(case.id, "test-cross-ref");
        assert_eq!(entities.len(), 1); // only inline event
        assert_eq!(rels.len(), 1);
        assert_eq!(rels[0].source_name, "Mark Bonnick");
        assert_eq!(rels[0].target_name, "Dismissal");
    }
}