// weave-content 0.2.25
//
// Content DSL parser, validator, and builder for OSINT case files.
// Documentation
use std::collections::HashSet;

use crate::parser::ParseError;
use crate::relationship::Rel;

/// Maximum timeline edges per file.
///
/// `parse_timeline` records a `ParseError` at the section's start line when the
/// number of parsed edges exceeds this limit; the edges themselves are still
/// returned to the caller (nothing is truncated).
const MAX_EDGES: usize = 200;

/// Turn the body of a `## Timeline` section into `preceded_by` relationships.
///
/// Expected layout, one edge per bullet:
/// ```text
/// - Event A -> Event B
///   id: 01ABC...
/// - Event B -> Event C
///   id: 01DEF...
/// ```
///
/// The `id:` line is optional and is only recognised on the line directly
/// below its bullet; without it the edge is returned with `id: None`. (The
/// original notes state that a missing id is generated and written back on
/// first build; that step happens outside this function.)
///
/// # Arguments
///
/// * `body` - text of the section, scanned line by line.
/// * `section_start_line` - base for reported line numbers: the line at
///   zero-based index `i` of `body` is reported as `section_start_line + 1 + i`.
/// * `event_names` - names accepted as edge endpoints; every endpoint missing
///   from this set produces an error.
/// * `errors` - sink that receives every problem found; parsing never aborts.
///
/// # Returns
///
/// Every edge whose bullet was syntactically valid, in input order. Edges with
/// unknown endpoints are still returned (an error is recorded for each unknown
/// name). If more than `MAX_EDGES` edges are found, one extra error is recorded
/// against `section_start_line`.
pub fn parse_timeline(
    body: &str,
    section_start_line: usize,
    event_names: &HashSet<&str>,
    errors: &mut Vec<ParseError>,
) -> Vec<Rel> {
    let mut rels = Vec::new();
    // Peekable so that an optional `id:` line can be inspected before it is consumed.
    let mut rows = body.lines().enumerate().peekable();

    while let Some((offset, raw)) = rows.next() {
        let file_line = section_start_line + 1 + offset;
        let trimmed = raw.trim();

        // Blank lines carry no information.
        if trimmed.is_empty() {
            continue;
        }

        let Some(bullet_body) = trimmed.strip_prefix("- ") else {
            errors.push(ParseError {
                line: file_line,
                message: format!("expected timeline bullet `- A -> B`, got {trimmed:?}"),
            });
            continue;
        };

        // A valid bullet holds exactly one ` -> ` separator.
        let (source, target) = match bullet_body.split_once(" -> ") {
            Some((lhs, rhs)) if !rhs.contains(" -> ") => (lhs.trim(), rhs.trim()),
            _ => {
                errors.push(ParseError {
                    line: file_line,
                    message: format!(
                        "timeline bullet must have exactly 2 events: `- A -> B` (got {bullet_body:?})"
                    ),
                });
                continue;
            }
        };

        // Unknown endpoints are reported, but the edge is kept regardless.
        for event in [source, target] {
            if event_names.contains(event) {
                continue;
            }
            errors.push(ParseError {
                line: file_line,
                message: format!("timeline entity {event:?} not found in Events section"),
            });
        }

        // Consume the following line only when it is this bullet's `id:` line.
        let id = rows
            .peek()
            .and_then(|(_, following)| following.trim().strip_prefix("id: "))
            .map(|value| value.trim().to_string());
        if id.is_some() {
            rows.next();
        }

        rels.push(Rel {
            source_name: source.to_string(),
            target_name: target.to_string(),
            rel_type: "preceded_by".to_string(),
            source_urls: Vec::new(),
            fields: vec![],
            id,
            line: file_line,
        });
    }

    // The limit is checked once at the end so that a single error is emitted.
    if rels.len() > MAX_EDGES {
        errors.push(ParseError {
            line: section_start_line,
            message: format!(
                "too many timeline edges (max {MAX_EDGES}, got {})",
                rels.len()
            ),
        });
    }

    rels
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `parse_timeline` over `body` with `known` as the set of event
    /// names and returns the parsed edges together with the collected errors.
    fn run(body: &str, start: usize, known: &[&str]) -> (Vec<Rel>, Vec<ParseError>) {
        let names: HashSet<&str> = known.iter().copied().collect();
        let mut errors = Vec::new();
        let rels = parse_timeline(body, start, &names, &mut errors);
        (rels, errors)
    }

    /// Two bullets, each followed by its own `id:` line.
    #[test]
    fn parse_basic() {
        let (rels, errors) = run(
            "\n- Event A -> Event B\n  id: 01JABC000000000000000000AA\n- Event B -> Event C\n  id: 01JDEF000000000000000000BB\n",
            80,
            &["Event A", "Event B", "Event C"],
        );
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels.len(), 2);

        let first = &rels[0];
        assert_eq!(first.source_name, "Event A");
        assert_eq!(first.target_name, "Event B");
        assert_eq!(first.rel_type, "preceded_by");
        assert!(first.source_urls.is_empty());
        assert_eq!(first.id.as_deref(), Some("01JABC000000000000000000AA"));

        let second = &rels[1];
        assert_eq!(second.source_name, "Event B");
        assert_eq!(second.target_name, "Event C");
        assert_eq!(second.id.as_deref(), Some("01JDEF000000000000000000BB"));
    }

    /// Bullets without `id:` lines parse to edges whose id is `None`.
    #[test]
    fn parse_without_ids() {
        let (rels, errors) = run(
            "\n- Event A -> Event B\n- Event B -> Event C\n",
            80,
            &["Event A", "Event B", "Event C"],
        );
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels.len(), 2);
        assert!(rels[0].id.is_none());
        assert!(rels[1].id.is_none());
    }

    /// An `id:` line belongs only to the bullet directly above it.
    #[test]
    fn parse_mixed_ids() {
        let (rels, errors) = run(
            "\n- A -> B\n  id: 01JABC000000000000000000AA\n- B -> C\n",
            1,
            &["A", "B", "C"],
        );
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels.len(), 2);
        assert_eq!(rels[0].id.as_deref(), Some("01JABC000000000000000000AA"));
        assert!(rels[1].id.is_none());
    }

    /// An endpoint missing from the known event names is reported.
    #[test]
    fn reject_unknown_event() {
        let (_rels, errors) = run("\n- Known -> Unknown\n", 1, &["Known"]);
        let reported = errors
            .iter()
            .any(|e| e.message.contains("not found in Events"));
        assert!(reported);
    }

    /// A bullet without the ` -> ` separator is a syntax error.
    #[test]
    fn reject_invalid_syntax() {
        let (_rels, errors) = run("\n- Just One Event\n", 1, &["Just One Event"]);
        let reported = errors
            .iter()
            .any(|e| e.message.contains("exactly 2 events"));
        assert!(reported);
    }

    /// A body made only of blank lines yields no edges and no errors.
    #[test]
    fn empty_timeline() {
        let (rels, errors) = run("\n\n\n", 1, &[]);
        assert!(errors.is_empty());
        assert!(rels.is_empty());
    }
}