// weave-content 0.2.17
//
// Content DSL parser, validator, and builder for OSINT case files
// Documentation
use std::collections::{HashMap, HashSet};
use std::path::Path;

use serde::Deserialize;

use crate::parser::ParseError;

/// Maximum number of categories in the tag registry.
/// `TagRegistry::parse` reports an error when the file lists more than this.
const MAX_CATEGORIES: usize = 10;

/// Maximum tags per category.
/// Checked per category by `TagRegistry::parse`, independently of the total.
const MAX_TAGS_PER_CATEGORY: usize = 30;

/// Maximum total tags in registry.
/// Checked by `TagRegistry::parse` against the sum over all categories.
const MAX_TOTAL_TAGS: usize = 200;

/// Maximum tag slug length.
/// Applied by `validate_slug` to both category slugs and tag slugs.
const MAX_SLUG_LEN: usize = 30;

/// Maximum tag label length.
/// Applied by `validate_label` to both category labels and tag labels.
const MAX_LABEL_LEN: usize = 60;

/// YAML schema for the tag registry file.
///
/// This is the top-level document that `TagRegistry::parse` deserializes;
/// it is only an intermediate form and is validated before a `TagRegistry`
/// is built from it.
#[derive(Deserialize)]
struct TagRegistryFile {
    /// Categories in the order they appear in the file.
    categories: Vec<CategoryEntry>,
}

/// A tag category in the YAML file.
///
/// Raw, unvalidated input: slug and label are checked by
/// `TagRegistry::parse` after deserialization.
#[derive(Deserialize)]
struct CategoryEntry {
    /// Category identifier; must be kebab-case and unique among categories.
    slug: String,
    /// Category label; must be non-empty.
    label: String,
    /// Tags that belong to this category.
    tags: Vec<TagEntry>,
}

/// A single tag in the YAML file.
///
/// Raw, unvalidated input: slug and label are checked by
/// `TagRegistry::parse` after deserialization.
#[derive(Deserialize)]
struct TagEntry {
    /// Tag identifier; must be kebab-case and unique across all categories.
    slug: String,
    /// Tag label; must be non-empty.
    label: String,
}

/// A loaded tag with its category.
///
/// Each tag carries a copy of its category's slug and label, so a tag can
/// be used without a separate category lookup.
#[derive(Debug, Clone)]
pub struct Tag {
    /// Tag slug; the key used by `TagRegistry::get` and `TagRegistry::contains`.
    pub slug: String,
    /// Tag label as written in the registry file.
    pub label: String,
    /// Slug of the category this tag was declared under.
    pub category_slug: String,
    /// Label of the category this tag was declared under.
    pub category_label: String,
}

/// Tag registry: holds all valid tags loaded from `tags.yaml`.
/// Provides slug-based lookup for validation.
///
/// Built by `TagRegistry::load` / `TagRegistry::parse`, or empty via
/// `TagRegistry::empty`.
#[derive(Debug)]
pub struct TagRegistry {
    /// All tags, in file order (category by category).
    tags: Vec<Tag>,
    /// Slug → index into `tags`.
    slug_index: HashMap<String, usize>,
    /// Category slugs for iteration, in file order.
    category_slugs: Vec<String>,
}

impl TagRegistry {
    /// Load a tag registry from `{root}/tags.yaml`.
    ///
    /// A missing file is not an error: it yields an empty registry, which
    /// makes [`TagRegistry::validate_tags`] accept every tag.
    ///
    /// The file is read in one step instead of being probed with
    /// `Path::exists` first. `exists` reports `false` for *any* metadata
    /// failure (for example a permission error), which would silently turn
    /// an unreadable registry into "no registry" and disable validation.
    /// Only a genuine "not found" result is treated as absence here.
    ///
    /// # Errors
    ///
    /// Returns a single `ParseError` (with `line: 0`) if the file exists but
    /// cannot be read, or whatever [`TagRegistry::parse`] reports for its
    /// content.
    pub fn load(root: &Path) -> Result<Self, Vec<ParseError>> {
        let path = root.join("tags.yaml");

        match std::fs::read_to_string(&path) {
            Ok(content) => Self::parse(&content),
            // No registry file: validation is intentionally disabled.
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Self::empty()),
            Err(e) => Err(vec![ParseError {
                line: 0,
                message: format!("{}: error reading file: {e}", path.display()),
            }]),
        }
    }

    /// Parse a tag registry from YAML content.
    ///
    /// Validation problems are collected rather than stopping at the first
    /// one, so the caller sees every issue in a single pass. Positions
    /// inside the YAML are not tracked; every error uses `line: 0`.
    ///
    /// # Errors
    ///
    /// Returns the collected errors if the YAML cannot be deserialized or if
    /// any of these checks fail:
    /// - more than `MAX_CATEGORIES` categories,
    /// - more than `MAX_TAGS_PER_CATEGORY` tags in one category,
    /// - more than `MAX_TOTAL_TAGS` tags overall,
    /// - an invalid slug or label (see `validate_slug` / `validate_label`),
    /// - a duplicate category slug, or a tag slug duplicated anywhere in the
    ///   file (tag slugs are unique across categories).
    pub fn parse(content: &str) -> Result<Self, Vec<ParseError>> {
        let file: TagRegistryFile = serde_yaml::from_str(content).map_err(|e| {
            vec![ParseError {
                line: 0,
                message: format!("tags.yaml: invalid YAML: {e}"),
            }]
        })?;

        let mut errors = Vec::new();

        if file.categories.len() > MAX_CATEGORIES {
            errors.push(ParseError {
                line: 0,
                message: format!(
                    "tags.yaml: too many categories ({}, max {MAX_CATEGORIES})",
                    file.categories.len()
                ),
            });
        }

        let mut tags = Vec::new();
        let mut slug_index = HashMap::new();
        let mut category_slugs = Vec::new();
        let mut seen_category_slugs: HashSet<String> = HashSet::new();
        // Shared across all categories: a tag slug may appear only once in
        // the whole file because lookups are keyed by tag slug alone.
        let mut seen_tag_slugs: HashSet<String> = HashSet::new();

        for cat in &file.categories {
            validate_slug(&cat.slug, "category", &mut errors);
            validate_label(&cat.label, "category", &mut errors);

            if !seen_category_slugs.insert(cat.slug.clone()) {
                errors.push(ParseError {
                    line: 0,
                    message: format!("tags.yaml: duplicate category slug {:?}", cat.slug),
                });
            }

            if cat.tags.len() > MAX_TAGS_PER_CATEGORY {
                errors.push(ParseError {
                    line: 0,
                    message: format!(
                        "tags.yaml: category {:?} has too many tags ({}, max {MAX_TAGS_PER_CATEGORY})",
                        cat.slug,
                        cat.tags.len()
                    ),
                });
            }

            category_slugs.push(cat.slug.clone());

            for tag in &cat.tags {
                validate_slug(&tag.slug, "tag", &mut errors);
                validate_label(&tag.label, "tag", &mut errors);

                if !seen_tag_slugs.insert(tag.slug.clone()) {
                    errors.push(ParseError {
                        line: 0,
                        message: format!("tags.yaml: duplicate tag slug {:?}", tag.slug),
                    });
                }

                // Invalid entries are still recorded so the total-count check
                // below sees every tag; the registry is discarded if any
                // error was collected.
                let idx = tags.len();
                slug_index.insert(tag.slug.clone(), idx);
                tags.push(Tag {
                    slug: tag.slug.clone(),
                    label: tag.label.clone(),
                    category_slug: cat.slug.clone(),
                    category_label: cat.label.clone(),
                });
            }
        }

        if tags.len() > MAX_TOTAL_TAGS {
            errors.push(ParseError {
                line: 0,
                message: format!(
                    "tags.yaml: too many total tags ({}, max {MAX_TOTAL_TAGS})",
                    tags.len()
                ),
            });
        }

        if !errors.is_empty() {
            return Err(errors);
        }

        Ok(Self {
            tags,
            slug_index,
            category_slugs,
        })
    }

    /// Create an empty tag registry (no tags.yaml).
    ///
    /// An empty registry disables validation in
    /// [`TagRegistry::validate_tags`].
    pub fn empty() -> Self {
        Self {
            tags: Vec::new(),
            slug_index: HashMap::new(),
            category_slugs: Vec::new(),
        }
    }

    /// Look up a tag by slug. Returns `None` if not found.
    pub fn get(&self, slug: &str) -> Option<&Tag> {
        // Every index stored in `slug_index` was taken from `tags.len()`
        // right before the matching push, so it is always in bounds.
        self.slug_index.get(slug).map(|&idx| &self.tags[idx])
    }

    /// Check if a tag slug is valid (exists in the registry).
    pub fn contains(&self, slug: &str) -> bool {
        self.slug_index.contains_key(slug)
    }

    /// Number of tags in the registry.
    pub fn len(&self) -> usize {
        self.tags.len()
    }

    /// Whether the registry is empty (no tags.yaml or no tags).
    pub fn is_empty(&self) -> bool {
        self.tags.is_empty()
    }

    /// All tags, in the order they were loaded.
    pub fn tags(&self) -> &[Tag] {
        &self.tags
    }

    /// All category slugs, in the order they were loaded.
    pub fn category_slugs(&self) -> &[String] {
        &self.category_slugs
    }

    /// Validate a list of tags against the registry.
    ///
    /// Returns one error per unknown tag, each attributed to `line`. If the
    /// registry is empty, validation is skipped and the result is always
    /// empty.
    pub fn validate_tags(&self, tags: &[String], line: usize) -> Vec<ParseError> {
        let mut errors = Vec::new();

        if self.is_empty() {
            // No registry loaded -- skip validation
            return errors;
        }

        for tag in tags {
            if !self.contains(tag) {
                errors.push(ParseError {
                    line,
                    message: format!("unknown tag {tag:?} (not in tags.yaml registry)"),
                });
            }
        }

        errors
    }
}

/// Checks a category or tag slug and appends one `ParseError` per violated
/// rule to `errors`.
///
/// The rules are evaluated independently and in a fixed order: maximum
/// length (`MAX_SLUG_LEN`), non-empty, kebab-case. Because an empty string
/// is not kebab-case, an empty slug produces two errors. `kind` only labels
/// the messages (callers pass `"category"` or `"tag"`).
fn validate_slug(slug: &str, kind: &str, errors: &mut Vec<ParseError>) {
    // Registry-level errors carry no source position, hence line 0.
    let mut report = |message: String| errors.push(ParseError { line: 0, message });

    let too_long = slug.len() > MAX_SLUG_LEN;
    if too_long {
        report(format!(
            "tags.yaml: {kind} slug {slug:?} exceeds {MAX_SLUG_LEN} chars"
        ));
    }

    if slug.is_empty() {
        report(format!("tags.yaml: {kind} slug must not be empty"));
    }

    let well_formed = is_kebab_case(slug);
    if !well_formed {
        report(format!(
            "tags.yaml: {kind} slug {slug:?} must be kebab-case"
        ));
    }
}

/// Checks a category or tag label and appends one `ParseError` per violated
/// rule to `errors`.
///
/// A label must be non-empty and at most `MAX_LABEL_LEN` characters long.
/// `kind` only labels the messages (callers pass `"category"` or `"tag"`).
fn validate_label(label: &str, kind: &str, errors: &mut Vec<ParseError>) {
    // Count characters (Unicode scalar values), not UTF-8 bytes: the limit
    // is reported to the user in "chars", and `str::len` would reject
    // non-ASCII labels well before they reach that many characters.
    if label.chars().count() > MAX_LABEL_LEN {
        errors.push(ParseError {
            line: 0,
            message: format!("tags.yaml: {kind} label {label:?} exceeds {MAX_LABEL_LEN} chars"),
        });
    }
    if label.is_empty() {
        errors.push(ParseError {
            line: 0,
            message: format!("tags.yaml: {kind} label must not be empty"),
        });
    }
}

/// Returns `true` when `s` is kebab-case: `[a-z0-9]+(-[a-z0-9]+)*`.
///
/// In other words, one or more non-empty runs of ASCII lowercase letters
/// and digits, joined by single hyphens. The empty string, leading or
/// trailing hyphens, doubled hyphens, and any other character are rejected.
fn is_kebab_case(s: &str) -> bool {
    // Splitting on '-' turns every misplaced hyphen (leading, trailing,
    // doubled) and the empty input into an empty segment.
    s.split('-').all(|segment| {
        !segment.is_empty()
            && segment
                .bytes()
                .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit())
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Well-formed registry: two categories holding three tags in total.
    const VALID_YAML: &str = r#"
categories:
  - slug: crime-type
    label: Crime Type
    tags:
      - slug: bribery
        label: Bribery
      - slug: fraud
        label: Fraud
  - slug: sector
    label: Sector
    tags:
      - slug: government
        label: Government
"#;

    /// Parses `VALID_YAML`, panicking if it is rejected.
    fn valid_registry() -> TagRegistry {
        TagRegistry::parse(VALID_YAML).expect("should parse")
    }

    /// Builds an owned tag list from string literals.
    fn tag_list(slugs: &[&str]) -> Vec<String> {
        slugs.iter().map(|slug| (*slug).to_string()).collect()
    }

    /// True when any collected error message contains `needle`.
    fn mentions(errors: &[ParseError], needle: &str) -> bool {
        errors.iter().any(|err| err.message.contains(needle))
    }

    #[test]
    fn parse_valid_registry() {
        let registry = valid_registry();

        assert_eq!(registry.len(), 3);
        assert_eq!(registry.category_slugs().len(), 2);
        for known in ["bribery", "fraud", "government"] {
            assert!(registry.contains(known));
        }
        assert!(!registry.contains("unknown"));

        let bribery = registry.get("bribery").expect("should exist");
        assert_eq!(bribery.label, "Bribery");
        assert_eq!(bribery.category_slug, "crime-type");
    }

    #[test]
    fn validate_tags_known() {
        let problems = valid_registry().validate_tags(&tag_list(&["bribery", "government"]), 5);
        assert!(problems.is_empty());
    }

    #[test]
    fn validate_tags_unknown() {
        let problems = valid_registry().validate_tags(&tag_list(&["bribery", "unknown"]), 5);
        assert_eq!(problems.len(), 1);
        assert!(problems[0].message.contains("unknown"));
    }

    #[test]
    fn validate_tags_empty_registry() {
        let problems = TagRegistry::empty().validate_tags(&tag_list(&["anything"]), 5);
        assert!(problems.is_empty(), "empty registry should skip validation");
    }

    #[test]
    fn duplicate_tag_slug_error() {
        // The same tag slug twice inside a single category.
        let yaml = r#"
categories:
  - slug: a
    label: A
    tags:
      - slug: dupe
        label: First
      - slug: dupe
        label: Second
"#;
        let problems = TagRegistry::parse(yaml).expect_err("should fail");
        assert!(mentions(&problems, "duplicate tag"));
    }

    #[test]
    fn duplicate_category_slug_error() {
        // Two categories sharing one slug; their tags are distinct.
        let yaml = r#"
categories:
  - slug: same
    label: First
    tags:
      - slug: a
        label: A
  - slug: same
    label: Second
    tags:
      - slug: b
        label: B
"#;
        let problems = TagRegistry::parse(yaml).expect_err("should fail");
        assert!(mentions(&problems, "duplicate category"));
    }

    #[test]
    fn empty_registry() {
        let registry = TagRegistry::empty();
        assert!(registry.is_empty());
        assert_eq!(registry.len(), 0);
        assert!(!registry.contains("anything"));
    }
}