batless 0.5.0 - Docs.rs

//! JSON schema validation for batless output
//!
//! This module provides JSON schema validation to ensure AI compatibility
//! and consistent output format across different modes.

use crate::error::{BatlessError, BatlessResult};
use serde_json::{json, Value};
use std::collections::HashMap;

/// JSON schema validator for batless output
pub struct JsonSchemaValidator {
    schemas: HashMap<String, Value>,
}

impl JsonSchemaValidator {
    /// Create a new JSON schema validator with built-in schemas
    pub fn new() -> Self {
        let mut validator = Self {
            schemas: HashMap::new(),
        };
        validator.load_builtin_schemas();
        validator
    }

    /// Load all built-in schemas for different output modes
    fn load_builtin_schemas(&mut self) {
        self.schemas
            .insert("file_info".to_string(), self.file_info_schema());
        self.schemas
            .insert("json_output".to_string(), self.json_output_schema());
        self.schemas
            .insert("token_count".to_string(), self.token_count_schema());
        self.schemas.insert(
            "processing_stats".to_string(),
            self.processing_stats_schema(),
        );
        self.schemas.insert(
            "streaming_chunk".to_string(),
            crate::streaming::StreamingProcessor::get_streaming_schema(),
        );
    }

    /// Validate JSON against a specific schema
    pub fn validate(&self, schema_name: &str, json_value: &Value) -> BatlessResult<()> {
        let schema = self.schemas.get(schema_name).ok_or_else(|| {
            BatlessError::config_error_with_help(
                format!("Unknown schema: {schema_name}"),
                Some(
                    "Available schemas: file_info, json_output, token_count, processing_stats, streaming_chunk"
                        .to_string(),
                ),
            )
        })?;

        self.validate_against_schema(json_value, schema)
            .map_err(|e| {
                BatlessError::config_error_with_help(
                    format!("JSON validation failed for schema '{schema_name}': {e}"),
                    Some("Check the JSON output format matches the expected schema".to_string()),
                )
            })
    }

    /// Validate JSON string against a schema
    pub fn validate_json_string(&self, schema_name: &str, json_str: &str) -> BatlessResult<()> {
        let json_value: Value = serde_json::from_str(json_str).map_err(|e| {
            BatlessError::config_error_with_help(
                format!("Invalid JSON: {e}"),
                Some("Ensure the JSON is properly formatted".to_string()),
            )
        })?;

        self.validate(schema_name, &json_value)
    }

    /// Get available schema names
    pub fn schema_names(&self) -> Vec<String> {
        self.schemas.keys().cloned().collect()
    }

    /// Get a specific schema
    pub fn get_schema(&self, name: &str) -> Option<&Value> {
        self.schemas.get(name)
    }

    /// Basic JSON schema validation with enhanced error messages
    fn validate_against_schema(&self, value: &Value, schema: &Value) -> Result<(), String> {
        self.validate_against_schema_with_path(value, schema, "")
    }

    /// Validate with path tracking for better error messages
    fn validate_against_schema_with_path(
        &self,
        value: &Value,
        schema: &Value,
        path: &str,
    ) -> Result<(), String> {
        match (value, schema) {
            (_, Value::Object(schema_obj)) => {
                if let Some(schema_type) = schema_obj.get("type") {
                    self.validate_type_with_path(value, schema_type, path)?;
                }

                if let Some(properties) = schema_obj.get("properties") {
                    if let (Value::Object(value_obj), Value::Object(props)) = (value, properties) {
                        for (key, prop_schema) in props {
                            let prop_path = if path.is_empty() {
                                key.clone()
                            } else {
                                format!("{path}.{key}")
                            };

                            if let Some(prop_value) = value_obj.get(key) {
                                self.validate_against_schema_with_path(
                                    prop_value,
                                    prop_schema,
                                    &prop_path,
                                )?;
                            }
                        }
                    }
                }

                if let Some(required) = schema_obj.get("required") {
                    if let (Value::Object(value_obj), Value::Array(req_fields)) = (value, required)
                    {
                        for field in req_fields {
                            if let Value::String(field_name) = field {
                                if !value_obj.contains_key(field_name) {
                                    let field_path = if path.is_empty() {
                                        field_name.clone()
                                    } else {
                                        format!("{path}.{field_name}")
                                    };

                                    return Err(format!(
                                        "Missing required field: '{field_path}'\n  Expected: This field is required for AI compatibility\n  Suggestion: Add the missing field to your JSON output"
                                    ));
                                }
                            }
                        }
                    }
                }
                Ok(())
            }
            _ => Ok(()),
        }
    }

    /// Validate value type against schema type with enhanced error messages
    fn validate_type_with_path(
        &self,
        value: &Value,
        schema_type: &Value,
        path: &str,
    ) -> Result<(), String> {
        // Handle array of types (e.g., ["string", "null"])
        if let Value::Array(types) = schema_type {
            for type_option in types {
                if self
                    .validate_type_with_path(value, type_option, path)
                    .is_ok()
                {
                    return Ok(());
                }
            }
            let type_names: Vec<String> = types
                .iter()
                .filter_map(|v| v.as_str())
                .map(ToString::to_string)
                .collect();

            let field_info = if path.is_empty() {
                "root".to_string()
            } else {
                format!("'{path}'")
            };

            return Err(format!(
                "Type mismatch at {field_info}: expected one of {type_names:?}, got {}\n  Expected: One of the allowed types for AI compatibility\n  Suggestion: Convert the value to match the expected type",
                self.get_value_type(value)
            ));
        }

        let expected_type = schema_type.as_str().unwrap_or("unknown");

        let matches = match (value, expected_type) {
            (Value::String(_), "string") => true,
            (Value::Number(_), "number") => true,
            (Value::Number(_), "integer") => value.as_i64().is_some(),
            (Value::Bool(_), "boolean") => true,
            (Value::Array(_), "array") => true,
            (Value::Object(_), "object") => true,
            (Value::Null, "null") => true,
            _ => false,
        };

        if !matches {
            let field_info = if path.is_empty() {
                "root".to_string()
            } else {
                format!("'{path}'")
            };
            return Err(format!(
                "Type mismatch at {field_info}: expected {expected_type}, got {}\n  Expected: Correct data type for AI compatibility\n  Suggestion: Convert the value to the expected type",
                self.get_value_type(value)
            ));
        }

        Ok(())
    }

    /// Get the type name of a JSON value
    const fn get_value_type(&self, value: &Value) -> &'static str {
        match value {
            Value::String(_) => "string",
            Value::Number(_) => "number",
            Value::Bool(_) => "boolean",
            Value::Array(_) => "array",
            Value::Object(_) => "object",
            Value::Null => "null",
        }
    }

    /// FileInfo JSON schema
    fn file_info_schema(&self) -> Value {
        json!({
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "lines": {
                    "type": "array",
                    "items": { "type": "string" }
                },
                "total_lines": { "type": "integer" },
                "total_bytes": { "type": "integer" },
                "total_lines_exact": { "type": "boolean" },
                "truncated": { "type": "boolean" },
                "truncated_by_lines": { "type": "boolean" },
                "truncated_by_bytes": { "type": "boolean" },
                "language": {
                    "type": ["string", "null"]
                },
                "encoding": { "type": "string" },
                "syntax_errors": {
                    "type": "array",
                    "items": { "type": "string" }
                },
                "identifiers": {
                    "type": ["array", "null"],
                    "items": { "type": "string" }
                },
                "summary_lines": {
                    "type": ["array", "null"],
                    "items": { "type": "string" }
                }
            },
            "required": [
                "lines", "total_lines", "total_bytes", "truncated",
                "truncated_by_lines", "truncated_by_bytes", "encoding", "syntax_errors"
            ]
        })
    }

    /// JSON output schema for batless (matches --mode=json output)
    fn json_output_schema(&self) -> Value {
        json!({
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "file": { "type": "string" },
                "lines": {
                    "type": "array",
                    "items": { "type": "string" }
                },
                "processed_lines": { "type": "integer" },
                "total_lines": { "type": "integer" },
                "total_bytes": { "type": "integer" },
                "total_lines_exact": { "type": "boolean" },
                "truncated": { "type": "boolean" },
                "truncated_by_lines": { "type": "boolean" },
                "truncated_by_bytes": { "type": "boolean" },
                "language": {
                    "type": ["string", "null"]
                },
                "encoding": { "type": "string" },
                "syntax_errors": {
                    "type": "array",
                    "items": { "type": "string" }
                },
                "identifiers": {
                    "type": ["array", "null"],
                    "items": { "type": "string" }
                },
                "identifier_count": { "type": "integer" },
                "identifiers_truncated": { "type": "boolean" },
                "summary_lines": {
                    "type": ["array", "null"],
                    "items": { "type": "string" }
                },
                "mode": { "type": "string" }
            },
            "required": [
                "file",
                "lines",
                "processed_lines",
                "total_lines",
                "total_bytes",
                "total_lines_exact",
                "truncated",
                "total_lines_exact",
                "truncated_by_lines",
                "truncated_by_bytes",
                "encoding",
                "identifier_count",
                "identifiers_truncated",
                "syntax_errors",
                "mode"
            ]
        })
    }

    /// Token count schema
    fn token_count_schema(&self) -> Value {
        json!({
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "tokens": { "type": "integer" },
                "words": { "type": "integer" },
                "characters": { "type": "integer" },
                "model": { "type": "string" },
                "fits_in_context": { "type": "boolean" },
                "context_usage_percent": { "type": "number" }
            },
            "required": [
                "tokens", "words", "characters", "model",
                "fits_in_context", "context_usage_percent"
            ]
        })
    }

    /// Processing stats schema
    fn processing_stats_schema(&self) -> Value {
        json!({
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "total_lines": { "type": "integer" },
                "processed_lines": { "type": "integer" },
                "total_lines_exact": { "type": "boolean" },
                "total_bytes": { "type": "integer" },
                "truncated": { "type": "boolean" },
                "truncation_reason": {
                    "type": ["string", "null"]
                },
                "has_syntax_errors": { "type": "boolean" },
                "error_count": { "type": "integer" },
                "language": {
                    "type": ["string", "null"]
                },
                "encoding": { "type": "string" },
                "token_count": { "type": "integer" },
                "tokens_truncated": { "type": "boolean" },
                "summary_line_count": { "type": "integer" }
            },
            "required": [
                "total_lines", "processed_lines", "total_lines_exact", "total_bytes", "truncated",
                "has_syntax_errors", "error_count", "encoding", "token_count", "tokens_truncated", "summary_line_count"
            ]
        })
    }
}

impl Default for JsonSchemaValidator {
    fn default() -> Self {
        Self::new()
    }
}

/// Validate batless JSON output
pub fn validate_batless_output(json_str: &str) -> BatlessResult<()> {
    let validator = JsonSchemaValidator::new();
    validator.validate_json_string("json_output", json_str)
}

/// Get the JSON schema for a specific output format
pub fn get_json_schema(schema_name: &str) -> BatlessResult<Value> {
    let validator = JsonSchemaValidator::new();
    validator.get_schema(schema_name).cloned().ok_or_else(|| {
        BatlessError::config_error_with_help(
            format!("Schema '{schema_name}' not found"),
            Some(format!(
                "Available schemas: {}",
                validator.schema_names().join(", ")
            )),
        )
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    #[test]
    fn test_schema_validator_creation() {
        let validator = JsonSchemaValidator::new();
        assert!(!validator.schema_names().is_empty());
        assert!(validator.schema_names().contains(&"file_info".to_string()));
    }

    #[test]
    fn test_valid_file_info() {
        let validator = JsonSchemaValidator::new();
        let valid_json = json!({
            "lines": ["line1", "line2"],
            "total_lines": 2,
            "total_bytes": 100,
            "truncated": false,
            "truncated_by_lines": false,
            "truncated_by_bytes": false,
            "language": "rust",
            "encoding": "UTF-8",
            "syntax_errors": [],
            "identifiers": null,
            "summary_lines": null
        });

        let result = validator.validate("file_info", &valid_json);
        if let Err(e) = &result {
            println!("Validation error: {e}");
        }
        assert!(result.is_ok());
    }

    #[test]
    fn test_invalid_file_info_missing_field() {
        let validator = JsonSchemaValidator::new();
        let invalid_json = json!({
            "lines": ["line1", "line2"],
            "total_lines": 2
            // Missing required fields
        });

        assert!(validator.validate("file_info", &invalid_json).is_err());
    }

    #[test]
    fn test_invalid_file_info_wrong_type() {
        let validator = JsonSchemaValidator::new();
        let invalid_json = json!({
            "lines": ["line1", "line2"],
            "total_lines": "not_a_number", // Wrong type
            "total_bytes": 100,
            "truncated": false,
            "truncated_by_lines": false,
            "truncated_by_bytes": false,
            "encoding": "UTF-8",
            "syntax_errors": []
        });

        assert!(validator.validate("file_info", &invalid_json).is_err());
    }

    #[test]
    fn test_token_count_schema() {
        let validator = JsonSchemaValidator::new();
        let valid_token_count = json!({
            "tokens": 150,
            "words": 100,
            "characters": 500,
            "model": "gpt-4",
            "fits_in_context": true,
            "context_usage_percent": 12.5
        });

        assert!(validator
            .validate("token_count", &valid_token_count)
            .is_ok());
    }

    #[test]
    fn test_get_schema() {
        let validator = JsonSchemaValidator::new();
        let schema = validator.get_schema("file_info");
        assert!(schema.is_some());

        let schema_value = schema.unwrap();
        assert_eq!(schema_value["type"], "object");
    }

    #[test]
    fn test_validate_json_string() {
        let validator = JsonSchemaValidator::new();
        let json_str = r#"{
            "tokens": 150,
            "words": 100,
            "characters": 500,
            "model": "gpt-4",
            "fits_in_context": true,
            "context_usage_percent": 12.5
        }"#;

        assert!(validator
            .validate_json_string("token_count", json_str)
            .is_ok());
    }

    #[test]
    fn test_validate_batless_output() {
        let json_str = r#"{
            "file": "test.rs",
            "lines": ["line1"],
            "processed_lines": 1,
            "total_lines": 1,
            "total_lines_exact": true,
            "total_bytes": 10,
            "truncated": false,
            "truncated_by_lines": false,
            "truncated_by_bytes": false,
            "language": "rust",
            "encoding": "UTF-8",
            "syntax_errors": [],
            "identifier_count": 0,
            "identifiers_truncated": false,
            "identifiers": null,
            "summary_lines": null,
            "mode": "json"
        }"#;

        let result = validate_batless_output(json_str);
        assert!(
            result.is_ok(),
            "json_output validation should pass for sample payload: {result:?}"
        );
    }

    #[test]
    fn test_enhanced_error_messages() {
        let validator = JsonSchemaValidator::new();

        // Test missing required field with enhanced error
        let incomplete_token_count = json!({
            "words": 100,
            "characters": 500,
            "model": "gpt-4",
            "fits_in_context": true
            // Missing required "tokens" field
        });

        let result = validator.validate("token_count", &incomplete_token_count);
        assert!(result.is_err());

        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Missing required field"));
        assert!(error_msg.contains("Expected: This field is required for AI compatibility"));
        assert!(error_msg.contains("Suggestion: Add the missing field"));

        // Test type mismatch with enhanced error
        let wrong_type_token_count = json!({
            "tokens": "not_a_number", // Should be number
            "words": 100,
            "characters": 500,
            "model": "gpt-4",
            "fits_in_context": true,
            "context_usage_percent": 12.5
        });

        let result = validator.validate("token_count", &wrong_type_token_count);
        assert!(result.is_err());

        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Type mismatch"));
        assert!(error_msg.contains("Expected: Correct data type for AI compatibility"));
        assert!(error_msg.contains("Suggestion: Convert the value to the expected type"));
    }
}