// openlatch-provider 0.1.0

//! Manifest (`<slug>.yaml`) parsing, schema validation, and YAML IO.
//!
//! Pipeline (per `phase-1-editor-cli.md` task P1.T3):
//!
//! ```text
//!   bytes (≤ 256 KB) ──► serde_yaml ──► Value ──► jsonschema validate
//!                                                       │
//!                                                       ▼
//!                                           OL-4210/4211/4212/...
//!                                                       │
//!                                          (rich error path with
//!                                           field-path + suggestion)
//!                                                       │
//!                                                       ▼
//!                            typify-generated `Manifest` (deserialize)
//!                                                       │
//!                                                       ▼
//!                                  semantic_check(&Manifest)
//!                                  - slug uniqueness
//!                                  - threat_category enum
//!                                  - bindings reference existing tools/providers
//!                                  - https endpoint_url
//!                                                       │
//!                                                       ▼
//!                                                  Manifest
//! ```

pub mod diff;
pub mod editor;
pub mod schema;
pub mod version;

pub use crate::generated::Manifest;

use std::path::Path;

use crate::error::{
    OlError, OL_4210_SCHEMA_MISMATCH, OL_4211_UNKNOWN_THREAT_CATEGORY,
    OL_4212_INVALID_ENDPOINT_URL, OL_4213_INVALID_AGENTS_SUPPORTED,
    OL_4214_INVALID_HOOKS_SUPPORTED, OL_4216_CAPABILITY_MISSING_FIELD, OL_4273_MANIFEST_UNREADABLE,
};

/// Maximum manifest size in bytes (256 KiB). [`load`] rejects anything
/// larger with `OL_4273_MANIFEST_UNREADABLE` before parsing begins (security
/// rule per `.claude/rules/security-constraints.md`).
const MAX_MANIFEST_BYTES: usize = 256 * 1024;

/// The 12 canonical threat categories, kept in sync with the
/// `threat_category` enum in `schemas/manifest-capability.schema.json`.
///
/// [`semantic_check`] rejects a capability whose serialized category is not
/// listed here (`OL_4211_UNKNOWN_THREAT_CATEGORY`) and draws its
/// "Did you mean …?" suggestion from this list. The
/// `known_threat_categories_match_schema` test guards alignment with the
/// schema file.
pub const KNOWN_THREAT_CATEGORIES: &[&str] = &[
    "pii_outbound",
    "credential_detection",
    "tool_poison_detection",
    "shell_dangerous",
    "injection_tool_response",
    "injection_user_input",
    "attack_path_analysis",
    "behavioral_anomaly",
    "data_exfiltration",
    "privilege_escalation",
    "unauthorized_resource_access",
    "policy_violation",
];

/// Hook events providers can subscribe to.
///
/// Treated as an open enum: [`semantic_check`] only logs a warning for a
/// `hooks_supported` value that is not listed here; it does not reject it.
pub const KNOWN_HOOK_EVENTS: &[&str] = &[
    "pre_tool_use",
    "post_tool_use",
    "user_prompt_submit",
    "before_shell_execution",
    "before_mcp_execution",
    "before_tool",
    "after_tool",
    "command_new",
    "agent_bootstrap",
];

/// Agent platforms the marketplace currently recognizes.
///
/// Treated as an open enum: [`semantic_check`] only logs a warning for an
/// `agents_supported` value that is not listed here; it does not reject it.
pub const KNOWN_AGENT_PLATFORMS: &[&str] = &[
    "claude-code",
    "cursor",
    "windsurf",
    "github-copilot",
    "codex-cli",
    "gemini-cli",
    "cline",
    "openclaw",
];

/// Read + validate a manifest YAML file (`<slug>.yaml`).
///
/// The size cap is enforced *before* the file contents are buffered: the
/// file's metadata is consulted first, and the subsequent read is bounded to
/// `MAX_MANIFEST_BYTES + 1` bytes so that a file whose metadata under-reports
/// its size (or that grows while being read) still cannot pull more than the
/// cap into memory.
///
/// # Errors
///
/// - `OL_4273_MANIFEST_UNREADABLE` if the file cannot be opened, inspected
///   or read, or if it is larger than [`MAX_MANIFEST_BYTES`].
/// - Whatever [`parse`] returns for the file's contents.
pub fn load(path: &Path) -> Result<Manifest, OlError> {
    use std::io::Read;

    let unreadable = |e: std::io::Error| {
        OlError::new(
            OL_4273_MANIFEST_UNREADABLE,
            format!("cannot read '{}': {e}", path.display()),
        )
    };
    // `size` is the human-readable byte count shown in the message.
    let oversized = |size: String| {
        OlError::new(
            OL_4273_MANIFEST_UNREADABLE,
            format!(
                "manifest at '{}' is {} bytes (cap {} KB)",
                path.display(),
                size,
                MAX_MANIFEST_BYTES / 1024
            ),
        )
    };

    let cap = u64::try_from(MAX_MANIFEST_BYTES).unwrap_or(u64::MAX);

    let file = std::fs::File::open(path).map_err(&unreadable)?;

    // Cheap early rejection: no bytes are read for a file that already
    // reports a size over the cap.
    let declared = file.metadata().map_err(&unreadable)?.len();
    if declared > cap {
        return Err(oversized(declared.to_string()));
    }

    // Bounded read: one byte past the cap is enough to detect an oversized
    // file without buffering the rest of it.
    let mut bytes = Vec::new();
    file.take(cap.saturating_add(1))
        .read_to_end(&mut bytes)
        .map_err(&unreadable)?;
    if bytes.len() > MAX_MANIFEST_BYTES {
        return Err(oversized(format!("more than {}", MAX_MANIFEST_BYTES)));
    }

    parse(&bytes)
}

/// Turn raw manifest bytes into a fully validated [`Manifest`].
///
/// Stages, in order: YAML parse, conversion of the YAML value to a JSON
/// value, [`schema::validate`], deserialization into the typed [`Manifest`],
/// and finally [`semantic_check`]. Failures of the parse, conversion and
/// deserialization stages are reported as `OL_4210_SCHEMA_MISMATCH`; errors
/// from schema validation and the semantic check are propagated as returned.
///
/// Used by both [`load`] and the smoke tests.
pub fn parse(bytes: &[u8]) -> Result<Manifest, OlError> {
    // Every locally produced failure shares the same error code.
    let mismatch = |detail: String| OlError::new(OL_4210_SCHEMA_MISMATCH, detail);

    let document = serde_yaml::from_slice::<serde_yaml::Value>(bytes)
        .map_err(|e| mismatch(format!("YAML parse: {e}")))?;
    let as_json = serde_json::to_value(&document)
        .map_err(|e| mismatch(format!("YAML→JSON conversion: {e}")))?;

    // Schema validation runs on the untyped value, before deserialization.
    schema::validate(&as_json)?;

    let manifest = serde_json::from_value::<Manifest>(as_json)
        .map_err(|e| mismatch(format!("typify deserialize: {e}")))?;

    semantic_check(&manifest).map(|()| manifest)
}

/// Cross-field rules that the JSON Schema can't express.
///
/// Checks run in the order below and stop at the first failure:
///
/// 1. Per tool, in declaration order: the slug is unique; `agents_supported`,
///    `hooks_supported` and `capabilities` are non-empty; every capability's
///    `threat_category` is listed in [`KNOWN_THREAT_CATEGORIES`].
/// 2. Provider slugs are unique.
/// 3. Every binding references a declared tool and a declared provider, and
///    its `endpoint_url` starts with `https://`.
///
/// Agent platforms and hook events missing from [`KNOWN_AGENT_PLATFORMS`] /
/// [`KNOWN_HOOK_EVENTS`] are only logged with `tracing::warn!`; they never
/// fail the check.
///
/// # Errors
///
/// - `OL_4210_SCHEMA_MISMATCH` — duplicate tool/provider slug, or a binding
///   that references an undeclared tool or provider.
/// - `OL_4213_INVALID_AGENTS_SUPPORTED` — a tool with empty `agents_supported`.
/// - `OL_4214_INVALID_HOOKS_SUPPORTED` — a tool with empty `hooks_supported`.
/// - `OL_4216_CAPABILITY_MISSING_FIELD` — a tool with no capabilities.
/// - `OL_4211_UNKNOWN_THREAT_CATEGORY` — an unlisted threat category (with a
///   "Did you mean …?" suggestion when a close match exists).
/// - `OL_4212_INVALID_ENDPOINT_URL` — a binding whose URL is not `https://`.
pub fn semantic_check(m: &Manifest) -> Result<(), OlError> {
    use std::collections::BTreeSet;

    // Tool slug uniqueness. The set borrows the slugs from `m`, so no
    // `String` is allocated per tool.
    let mut tool_slugs: BTreeSet<&str> = BTreeSet::new();
    for t in &m.tools {
        let slug: &str = &t.slug;
        if !tool_slugs.insert(slug) {
            return Err(OlError::new(
                OL_4210_SCHEMA_MISMATCH,
                format!("duplicate tool slug `{slug}`"),
            ));
        }
        // agents_supported / hooks_supported are open enums: unknown values
        // are logged, not rejected.
        for a in &t.agents_supported {
            if !KNOWN_AGENT_PLATFORMS.contains(&a.as_str()) {
                tracing::warn!(agent = %a, "unknown agent.platform value (allowed but tagged)");
            }
        }
        for h in &t.hooks_supported {
            if !KNOWN_HOOK_EVENTS.contains(&h.as_str()) {
                tracing::warn!(hook = %h, "unknown hook event value (allowed but tagged)");
            }
        }
        if t.agents_supported.is_empty() {
            return Err(OlError::new(
                OL_4213_INVALID_AGENTS_SUPPORTED,
                format!("tool `{slug}` has empty agents_supported"),
            ));
        }
        if t.hooks_supported.is_empty() {
            return Err(OlError::new(
                OL_4214_INVALID_HOOKS_SUPPORTED,
                format!("tool `{slug}` has empty hooks_supported"),
            ));
        }
        if t.capabilities.is_empty() {
            return Err(OlError::new(
                OL_4216_CAPABILITY_MISSING_FIELD,
                format!("tool `{slug}` has no capabilities"),
            ));
        }
        for cap in &t.capabilities {
            // `threat_category` is a typed value from typify; its wire name
            // is recovered by serializing it with serde and reading the
            // resulting JSON string. If serialization fails or does not yield
            // a non-empty string, the capability is skipped rather than
            // rejected.
            let serialized = serde_json::to_value(cap.threat_category)
                .ok()
                .and_then(|v| v.as_str().map(|s| s.to_string()))
                .unwrap_or_default();
            if !serialized.is_empty() && !KNOWN_THREAT_CATEGORIES.contains(&serialized.as_str()) {
                let mut err = OlError::new(
                    OL_4211_UNKNOWN_THREAT_CATEGORY,
                    format!("tool `{slug}`: unknown threat_category `{serialized}`"),
                );
                if let Some(suggestion) = closest_match(&serialized, KNOWN_THREAT_CATEGORIES) {
                    err = err.with_suggestion(format!("Did you mean `{suggestion}`?"));
                }
                return Err(err);
            }
        }
    }

    // Provider slug uniqueness (borrowed, same as the tool slugs).
    let mut provider_slugs: BTreeSet<&str> = BTreeSet::new();
    for p in &m.providers {
        let slug: &str = &p.slug;
        if !provider_slugs.insert(slug) {
            return Err(OlError::new(
                OL_4210_SCHEMA_MISMATCH,
                format!("duplicate provider slug `{slug}`"),
            ));
        }
    }

    // Bindings cross-reference + endpoint sanity.
    for b in &m.bindings {
        if !tool_slugs.contains(b.tool.as_str()) {
            return Err(OlError::new(
                OL_4210_SCHEMA_MISMATCH,
                format!(
                    "binding references tool `{}` which is not declared in tools[]",
                    b.tool
                ),
            ));
        }
        if !provider_slugs.contains(b.provider.as_str()) {
            return Err(OlError::new(
                OL_4210_SCHEMA_MISMATCH,
                format!(
                    "binding references provider `{}` which is not declared in providers[]",
                    b.provider
                ),
            ));
        }
        // Plain prefix test: the scheme must be spelled exactly `https://`.
        if !b.endpoint_url.starts_with("https://") {
            return Err(OlError::new(
                OL_4212_INVALID_ENDPOINT_URL,
                format!(
                    "binding {}/{} endpoint_url must be HTTPS, got `{}`",
                    b.tool, b.provider, b.endpoint_url
                ),
            )
            .with_suggestion(
                "Use https:// scheme; the platform never connects to plaintext endpoints.",
            ));
        }
    }

    Ok(())
}

/// Pick the entry of `haystack` nearest to `needle` by Levenshtein distance.
///
/// Candidates more than 4 edits away are ignored. When several candidates
/// share the smallest distance, the one appearing first in `haystack` wins.
/// Returns `None` when no candidate is within range (including an empty
/// `haystack`).
fn closest_match(needle: &str, haystack: &[&str]) -> Option<String> {
    haystack
        .iter()
        .map(|&candidate| (strsim::levenshtein(needle, candidate), candidate))
        .filter(|&(distance, _)| distance <= 4)
        // `min_by_key` keeps the first of several equal minima.
        .min_by_key(|&(distance, _)| distance)
        .map(|(_, winner)| winner.to_string())
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A one-character typo of a known category resolves to that category.
    #[test]
    fn closest_match_finds_typo() {
        let found = closest_match("pii_outbond", KNOWN_THREAT_CATEGORIES);
        assert_eq!(found, Some("pii_outbound".to_string()));
    }

    /// Guards alignment between the local `KNOWN_THREAT_CATEGORIES` const and
    /// the `threat_category` enum in `schemas/manifest-capability.schema.json`.
    /// Drift here silently breaks `OL-4211` suggestions and the wizard's
    /// MultiSelect picker.
    #[test]
    fn known_threat_categories_match_schema() {
        use std::collections::BTreeSet;

        // The schema file is embedded at compile time.
        let schema: serde_json::Value = serde_json::from_str(include_str!(
            "../../schemas/manifest-capability.schema.json"
        ))
        .expect("capability schema must parse");

        let declared: BTreeSet<&str> = schema
            .pointer("/properties/threat_category/enum")
            .and_then(|node| node.as_array())
            .expect("schema.properties.threat_category.enum must exist")
            .iter()
            .map(|entry| entry.as_str().expect("threat_category enum must be strings"))
            .collect();
        let local: BTreeSet<&str> = KNOWN_THREAT_CATEGORIES.iter().copied().collect();

        // Compared as sets, so ordering differences are not reported as drift.
        assert_eq!(
            local, declared,
            "KNOWN_THREAT_CATEGORIES drifted from manifest-capability.schema.json"
        );
    }
}