openlatch-provider 0.2.1

Self-service onboarding CLI + runtime daemon for OpenLatch Editors and Providers
//! Manifest (`<slug>.yaml`) parsing, schema validation, and YAML IO.
//!
//! Pipeline (per `phase-1-editor-cli.md` task P1.T3):
//!
//! ```text
//!   bytes (≤ 256 KB) ──► serde_yaml ──► Value ──► jsonschema validate
//!                                                        │
//!                                                        ├──► OL-4210/4211/4212/...
//!                                                        │    (rich error path with
//!                                                        │     field-path + suggestion)
//!                                                        ▼
//!                            typify-generated `Manifest` (deserialize)
//!                                                        │
//!                                                        ▼
//!                                  semantic_check(&Manifest)
//!                                  - slug uniqueness
//!                                  - threat_category enum
//!                                  - bindings reference existing tools/providers
//!                                  - https endpoint_url (per provider)
//!                                                        │
//!                                                        ▼
//!                                                    Manifest
//! ```

pub mod diff;
pub mod editor;
pub mod schema;
pub mod v2;
pub mod version;

pub use crate::generated::Manifest;
pub use v2::{load_provider_tree, ResolvedBinding, ResolvedManifest, ResolvedTool};

use std::path::Path;

use crate::error::{
    OlError, OL_4210_SCHEMA_MISMATCH, OL_4211_UNKNOWN_THREAT_CATEGORY,
    OL_4212_INVALID_ENDPOINT_URL, OL_4213_INVALID_AGENTS_SUPPORTED,
    OL_4214_INVALID_HOOKS_SUPPORTED, OL_4216_CAPABILITY_MISSING_FIELD, OL_4273_MANIFEST_UNREADABLE,
    OL_4300_PROCESS_SPEC_INVALID,
};

/// Maximum manifest size in bytes (256 KiB) — [`load`] rejects anything
/// larger before any YAML parsing happens (security rule per
/// `.claude/rules/security-constraints.md`).
///
/// [`parse`] does not apply this cap to the bytes it is handed.
const MAX_MANIFEST_BYTES: usize = 256 * 1024;

/// 12 canonical threat categories — kept in sync with the
/// `threat_category` enum in `schemas/manifest-capability.schema.json`.
/// The `known_threat_categories_match_schema` test guards alignment.
///
/// [`semantic_check`] rejects a capability whose serialized
/// `threat_category` is not in this list (`OL_4211_UNKNOWN_THREAT_CATEGORY`)
/// and uses the list as the candidate pool for its "Did you mean …?"
/// suggestion.
pub const KNOWN_THREAT_CATEGORIES: &[&str] = &[
    "pii_outbound",
    "pii_inbound",
    "credential_detection",
    "injection_tool_response",
    "injection_user_input",
    "shell_dangerous",
    "shell_exfiltration",
    "tool_hash_verification",
    "tool_poison_detection",
    "tool_typosquatting",
    "attack_path_analysis",
    "configuration_threat",
];

/// Hook events providers can subscribe to.
///
/// This is an open list: [`semantic_check`] only emits a `tracing::warn!`
/// for a `hooks_supported` entry that is not listed here; it does not reject
/// the manifest for it.
pub const KNOWN_HOOK_EVENTS: &[&str] = &[
    "pre_tool_use",
    "post_tool_use",
    "user_prompt_submit",
    "before_shell_execution",
    "before_mcp_execution",
    "before_tool",
    "after_tool",
    "command_new",
    "agent_bootstrap",
];

/// Agent platforms the marketplace currently recognizes.
///
/// This is an open list: [`semantic_check`] only emits a `tracing::warn!`
/// for an `agents_supported` entry that is not listed here; it does not
/// reject the manifest for it.
pub const KNOWN_AGENT_PLATFORMS: &[&str] = &[
    "claude-code",
    "cursor",
    "windsurf",
    "github-copilot",
    "codex-cli",
    "gemini-cli",
    "cline",
    "openclaw",
];

/// Read + validate a manifest YAML file (`<slug>.yaml`).
///
/// If the file is a v2 manifest (`kind: Provider` or `kind: Tool`), this
/// returns a synthesized v1 [`Manifest`] (via the v2 → v1 shim in
/// [`v2::load_provider_tree`]) so downstream code paths stay v1-shaped.
/// Callers that need the resolved v2 tree (e.g. `listen` for per-binding cwd
/// anchors) should call [`v2::load_provider_tree`] directly.
pub fn load(path: &Path) -> Result<Manifest, OlError> {
    let bytes = std::fs::read(path).map_err(|e| {
        OlError::new(
            OL_4273_MANIFEST_UNREADABLE,
            format!("cannot read '{}': {e}", path.display()),
        )
    })?;
    if bytes.len() > MAX_MANIFEST_BYTES {
        return Err(OlError::new(
            OL_4273_MANIFEST_UNREADABLE,
            format!(
                "manifest at '{}' is {} bytes (cap {} KB)",
                path.display(),
                bytes.len(),
                MAX_MANIFEST_BYTES / 1024
            ),
        ));
    }

    // Peek at `kind:` / `schema_version:` to pick the loader. v1 manifests
    // have neither `kind` nor a v2 schema_version; v2 manifests always carry
    // both. v1 stays the default path so the existing test corpus is
    // unaffected.
    if let Ok((schema_version, kind)) = v2::peek_kind(&bytes) {
        match (schema_version, kind.as_deref()) {
            (Some(2), Some("Provider")) => {
                let resolved = v2::load_provider_tree(path)?;
                return Ok(resolved.synth);
            }
            (Some(2), Some("Tool")) => {
                // A `kind: Tool` manifest by itself doesn't carry providers /
                // bindings — `publish --tool <path>` is the correct entrypoint.
                // For convenience, synthesize a v1 manifest that contains just
                // the editor + tools (no providers, no bindings). `register`
                // and `listen` will fail-fast with a more specific error
                // later, but callers that *only* need editor + tools (publish)
                // get a working result.
                let parsed = v2::parse_tool_v2(&bytes, path)?;
                let tools_json: Vec<serde_json::Value> = parsed
                    .tools
                    .iter()
                    .map(|t| {
                        let mut v = serde_json::to_value(t).unwrap_or(serde_json::Value::Null);
                        if let Some(m) = v.as_object_mut() {
                            m.remove("process");
                        }
                        v
                    })
                    .collect();
                let synth_json = serde_json::json!({
                    "schema_version": 1,
                    "editor": serde_json::to_value(&parsed.editor).unwrap_or(serde_json::Value::Null),
                    "tools": tools_json,
                    "providers": [],
                    "bindings": [],
                });
                schema::validate(&synth_json)?;
                let m: Manifest = serde_json::from_value(synth_json).map_err(|e| {
                    OlError::new(
                        OL_4210_SCHEMA_MISMATCH,
                        format!("internal: synth tool-only manifest: {e}"),
                    )
                })?;
                return Ok(m);
            }
            _ => {}
        }
    }
    parse(&bytes)
}

/// Turn raw manifest bytes into a validated [`Manifest`]. Shared by [`load`]
/// and the smoke tests.
///
/// Stages, in order: YAML decode → conversion to a JSON value →
/// `schema::validate` → deserialisation into [`Manifest`] →
/// [`semantic_check`].
///
/// # Errors
///
/// YAML decoding, YAML→JSON conversion, and deserialisation failures are
/// reported as `OL_4210_SCHEMA_MISMATCH`; errors from `schema::validate` and
/// [`semantic_check`] are returned as-is.
pub fn parse(bytes: &[u8]) -> Result<Manifest, OlError> {
    // All three serde stages share one error code; only the message differs.
    let schema_mismatch = |message: String| OlError::new(OL_4210_SCHEMA_MISMATCH, message);

    let document = serde_yaml::from_slice::<serde_yaml::Value>(bytes)
        .map_err(|e| schema_mismatch(format!("YAML parse: {e}")))?;
    let as_json = serde_json::to_value(&document)
        .map_err(|e| schema_mismatch(format!("YAML→JSON conversion: {e}")))?;

    // Structural validation runs on the untyped value so the typed
    // deserialisation below only sees schema-conformant input.
    schema::validate(&as_json)?;

    let manifest = serde_json::from_value::<Manifest>(as_json)
        .map_err(|e| schema_mismatch(format!("typify deserialize: {e}")))?;

    semantic_check(&manifest).map(|()| manifest)
}

/// Cross-field rules that the JSON Schema can't express.
pub fn semantic_check(m: &Manifest) -> Result<(), OlError> {
    use std::collections::BTreeSet;

    // Tool slug uniqueness.
    let mut tool_slugs: BTreeSet<String> = BTreeSet::new();
    for t in &m.tools {
        let slug: &str = &t.slug;
        if !tool_slugs.insert(slug.to_string()) {
            return Err(OlError::new(
                OL_4210_SCHEMA_MISMATCH,
                format!("duplicate tool slug `{slug}`"),
            ));
        }
        // agents_supported / hooks_supported open enum check.
        for a in &t.agents_supported {
            if !KNOWN_AGENT_PLATFORMS.contains(&a.as_str()) {
                tracing::warn!(agent = %a, "unknown agent.platform value (allowed but tagged)");
            }
        }
        for h in &t.hooks_supported {
            if !KNOWN_HOOK_EVENTS.contains(&h.as_str()) {
                tracing::warn!(hook = %h, "unknown hook event value (allowed but tagged)");
            }
        }
        if t.agents_supported.is_empty() {
            return Err(OlError::new(
                OL_4213_INVALID_AGENTS_SUPPORTED,
                format!("tool `{slug}` has empty agents_supported"),
            ));
        }
        if t.hooks_supported.is_empty() {
            return Err(OlError::new(
                OL_4214_INVALID_HOOKS_SUPPORTED,
                format!("tool `{slug}` has empty hooks_supported"),
            ));
        }
        if t.capabilities.is_empty() {
            return Err(OlError::new(
                OL_4216_CAPABILITY_MISSING_FIELD,
                format!("tool `{slug}` has no capabilities"),
            ));
        }
        for cap in &t.capabilities {
            // threat_category is a typed wrapper from typify; extract its
            // string form via Debug — checking against the known list with
            // strsim suggestion if not found.
            let serialized = serde_json::to_value(cap.threat_category)
                .ok()
                .and_then(|v| v.as_str().map(|s| s.to_string()))
                .unwrap_or_default();
            if !serialized.is_empty() && !KNOWN_THREAT_CATEGORIES.contains(&serialized.as_str()) {
                let mut err = OlError::new(
                    OL_4211_UNKNOWN_THREAT_CATEGORY,
                    format!("tool `{slug}`: unknown threat_category `{serialized}`"),
                );
                if let Some(suggestion) = closest_match(&serialized, KNOWN_THREAT_CATEGORIES) {
                    err = err.with_suggestion(format!("Did you mean `{suggestion}`?"));
                }
                return Err(err);
            }
        }
    }

    // Provider slug uniqueness + endpoint sanity (HTTPS).
    let mut provider_slugs: BTreeSet<String> = BTreeSet::new();
    for p in &m.providers {
        let slug: &str = &p.slug;
        if !provider_slugs.insert(slug.to_string()) {
            return Err(OlError::new(
                OL_4210_SCHEMA_MISMATCH,
                format!("duplicate provider slug `{slug}`"),
            ));
        }
        if !p.endpoint_url.starts_with("https://") {
            return Err(OlError::new(
                OL_4212_INVALID_ENDPOINT_URL,
                format!(
                    "provider `{}` endpoint_url must be HTTPS, got `{}`",
                    slug, p.endpoint_url
                ),
            )
            .with_suggestion(
                "Use https:// scheme; the platform never connects to plaintext endpoints.",
            ));
        }
    }

    // Bindings cross-reference + health-check port uniqueness.
    //
    // Port uniqueness is checked dynamically (via `serde_json::to_value` +
    // JSON-Pointer) rather than typed field access so this rule stays robust
    // across regenerations of `src/generated/types.rs` from typify.
    let mut health_ports: BTreeSet<u64> = BTreeSet::new();
    for b in &m.bindings {
        if !tool_slugs.contains(&b.tool) {
            return Err(OlError::new(
                OL_4210_SCHEMA_MISMATCH,
                format!(
                    "binding references tool `{}` which is not declared in tools[]",
                    b.tool
                ),
            ));
        }
        if !provider_slugs.contains(&b.provider) {
            return Err(OlError::new(
                OL_4210_SCHEMA_MISMATCH,
                format!(
                    "binding references provider `{}` which is not declared in providers[]",
                    b.provider
                ),
            ));
        }
        let as_json = serde_json::to_value(b).map_err(|e| {
            OlError::new(
                OL_4210_SCHEMA_MISMATCH,
                format!("serialise binding {}/{}: {e}", b.tool, b.provider),
            )
        })?;
        if let Some(port) = as_json
            .pointer("/process/health_check/http/port")
            .and_then(serde_json::Value::as_u64)
        {
            if !health_ports.insert(port) {
                return Err(OlError::new(
                    OL_4300_PROCESS_SPEC_INVALID,
                    format!(
                        "two or more bindings declare process.health_check.http.port = {port}; \
                         each binding spawns its own process and must own its port"
                    ),
                ));
            }
        }
    }

    Ok(())
}

/// Remove fields from a serialized binding that are local-only and must never
/// reach the platform's `/api/v1/editor/bindings` upsert. Currently strips
/// `process` (managed-process spec is host-side concern).
pub fn strip_local_only_binding_fields(value: &mut serde_json::Value) {
    if let serde_json::Value::Object(map) = value {
        map.remove("process");
    }
}

/// Pick the entry of `haystack` with the smallest Levenshtein distance to
/// `needle`, considering only entries at distance 4 or less.
///
/// When several entries tie for the smallest distance, the one that appears
/// first in `haystack` wins. Returns `None` when nothing is close enough.
fn closest_match(needle: &str, haystack: &[&str]) -> Option<String> {
    // Largest edit distance still considered a plausible typo.
    const MAX_EDITS: usize = 4;

    haystack
        .iter()
        .map(|candidate| (strsim::levenshtein(needle, candidate), *candidate))
        .filter(|(distance, _)| *distance <= MAX_EDITS)
        // `min_by_key` keeps the first of several equal minima.
        .min_by_key(|(distance, _)| *distance)
        .map(|(_, winner)| winner.to_string())
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A category name with one dropped letter still resolves to the
    /// intended entry.
    #[test]
    fn closest_match_finds_typo() {
        let actual = closest_match("pii_outbond", KNOWN_THREAT_CATEGORIES);
        let expected = Some("pii_outbound");
        assert_eq!(actual.as_deref(), expected);
    }

    /// Keeps the local `KNOWN_THREAT_CATEGORIES` const aligned with the
    /// `threat_category` enum in `schemas/manifest-capability.schema.json`.
    /// Drift here silently breaks `OL-4211` suggestions and the wizard's
    /// MultiSelect picker.
    #[test]
    fn known_threat_categories_match_schema() {
        use std::collections::BTreeSet;

        // The schema is embedded at compile time, so the test needs no
        // filesystem access when it runs.
        let schema_text = include_str!("../../schemas/manifest-capability.schema.json");
        let schema: serde_json::Value =
            serde_json::from_str(schema_text).expect("capability schema must parse");

        let declared = schema
            .pointer("/properties/threat_category/enum")
            .and_then(serde_json::Value::as_array)
            .expect("schema.properties.threat_category.enum must exist");

        let mut from_schema: BTreeSet<&str> = BTreeSet::new();
        for entry in declared {
            from_schema.insert(
                entry
                    .as_str()
                    .expect("threat_category enum must be strings"),
            );
        }
        let from_const: BTreeSet<&str> = KNOWN_THREAT_CATEGORIES.iter().copied().collect();

        // Compared as sets: ordering differences are not drift.
        assert_eq!(
            from_const, from_schema,
            "KNOWN_THREAT_CATEGORIES drifted from manifest-capability.schema.json"
        );
    }
}