use crate::error::{BatlessError, BatlessResult};
use serde_json::{json, Value};
use std::collections::HashMap;
/// Registry of named JSON schemas plus a lightweight validator for them.
///
/// Each entry maps a schema name (for example `"file_info"`) to its schema
/// document, stored as a `serde_json::Value`.
#[derive(Debug, Clone)]
pub struct JsonSchemaValidator {
    /// Schema documents keyed by schema name.
    schemas: HashMap<String, Value>,
}
impl JsonSchemaValidator {
/// Builds a validator and registers every built-in schema before returning it.
pub fn new() -> Self {
    let mut instance = Self {
        schemas: HashMap::default(),
    };
    instance.load_builtin_schemas();
    instance
}
/// Registers the built-in schemas under their public names.
///
/// Four schemas are produced by this type's own builder methods; the
/// `streaming_chunk` schema is obtained from
/// `StreamingProcessor::get_streaming_schema`.
fn load_builtin_schemas(&mut self) {
    // Build the whole table first, then move each entry into the registry.
    let builtin = [
        ("file_info", self.file_info_schema()),
        ("json_output", self.json_output_schema()),
        ("token_count", self.token_count_schema()),
        ("processing_stats", self.processing_stats_schema()),
        (
            "streaming_chunk",
            crate::streaming::StreamingProcessor::get_streaming_schema(),
        ),
    ];

    for (name, schema) in builtin {
        self.schemas.insert(name.to_string(), schema);
    }
}
/// Validates `json_value` against the schema registered as `schema_name`.
///
/// # Errors
///
/// Returns an error built with `BatlessError::config_error_with_help` when
/// `schema_name` is not registered, or when the value fails validation (the
/// underlying reason is embedded in the message).
pub fn validate(&self, schema_name: &str, json_value: &Value) -> BatlessResult<()> {
    let schema = match self.schemas.get(schema_name) {
        Some(found) => found,
        None => {
            return Err(BatlessError::config_error_with_help(
                format!("Unknown schema: {schema_name}"),
                Some(
                    "Available schemas: file_info, json_output, token_count, processing_stats, streaming_chunk"
                        .to_string(),
                ),
            ));
        }
    };

    match self.validate_against_schema(json_value, schema) {
        Ok(()) => Ok(()),
        Err(e) => Err(BatlessError::config_error_with_help(
            format!("JSON validation failed for schema '{schema_name}': {e}"),
            Some("Check the JSON output format matches the expected schema".to_string()),
        )),
    }
}
/// Parses `json_str` and validates the result against `schema_name`.
///
/// # Errors
///
/// Returns an "Invalid JSON" error when the text cannot be parsed; otherwise
/// forwards whatever `validate` returns.
pub fn validate_json_string(&self, schema_name: &str, json_str: &str) -> BatlessResult<()> {
    match serde_json::from_str::<Value>(json_str) {
        Ok(parsed) => self.validate(schema_name, &parsed),
        Err(e) => Err(BatlessError::config_error_with_help(
            format!("Invalid JSON: {e}"),
            Some("Ensure the JSON is properly formatted".to_string()),
        )),
    }
}
/// Returns the names of all registered schemas in ascending order.
///
/// The names are sorted because `HashMap` iteration order is unspecified;
/// sorting keeps listings and help messages built from this list stable
/// between runs.
pub fn schema_names(&self) -> Vec<String> {
    let mut names: Vec<String> = self.schemas.keys().cloned().collect();
    names.sort_unstable();
    names
}
/// Looks up the schema document registered under `name`.
///
/// Returns `None` when no schema with that name is registered.
pub fn get_schema(&self, name: &str) -> Option<&Value> {
    let registry = &self.schemas;
    registry.get(name)
}
/// Validates `value` against `schema`, starting from the document root.
///
/// The root is represented by an empty path, which error messages render
/// as `root`.
fn validate_against_schema(&self, value: &Value, schema: &Value) -> Result<(), String> {
    let root_path = "";
    self.validate_against_schema_with_path(value, schema, root_path)
}
/// Recursively validates `value` against `schema`.
///
/// `path` is the dotted location of `value` inside the document (empty for
/// the root) and is only used to build error messages.
///
/// Supported keywords, checked in this order:
/// * `type` — delegated to `validate_type_with_path`;
/// * `properties` — every property present in an object value is validated
///   against its sub-schema (absent properties are skipped);
/// * `items` — when the value is an array and `items` is a single schema
///   object, every element is validated against it;
/// * `required` — every listed field must be present in an object value.
///
/// A schema that is not a JSON object imposes no constraints.
///
/// # Errors
///
/// Returns a human-readable description of the first violation found.
fn validate_against_schema_with_path(
    &self,
    value: &Value,
    schema: &Value,
    path: &str,
) -> Result<(), String> {
    let schema_obj = match schema {
        Value::Object(obj) => obj,
        _ => return Ok(()),
    };

    if let Some(schema_type) = schema_obj.get("type") {
        self.validate_type_with_path(value, schema_type, path)?;
    }

    if let (Value::Object(value_obj), Some(Value::Object(props))) =
        (value, schema_obj.get("properties"))
    {
        for (key, prop_schema) in props {
            if let Some(prop_value) = value_obj.get(key) {
                let prop_path = if path.is_empty() {
                    key.clone()
                } else {
                    format!("{path}.{key}")
                };
                self.validate_against_schema_with_path(prop_value, prop_schema, &prop_path)?;
            }
        }
    }

    // Enforce the element schema for arrays. Only the single-schema form of
    // `items` is handled; any other form is ignored.
    let item_schema = schema_obj.get("items").filter(|items| items.is_object());
    if let (Value::Array(elements), Some(item_schema)) = (value, item_schema) {
        for (index, element) in elements.iter().enumerate() {
            let item_path = format!("{path}[{index}]");
            self.validate_against_schema_with_path(element, item_schema, &item_path)?;
        }
    }

    if let (Value::Object(value_obj), Some(Value::Array(req_fields))) =
        (value, schema_obj.get("required"))
    {
        // Non-string entries in `required` are ignored.
        for field_name in req_fields.iter().filter_map(Value::as_str) {
            if !value_obj.contains_key(field_name) {
                let field_path = if path.is_empty() {
                    field_name.to_string()
                } else {
                    format!("{path}.{field_name}")
                };
                return Err(format!(
                    "Missing required field: '{field_path}'\n Expected: This field is required for AI compatibility\n Suggestion: Add the missing field to your JSON output"
                ));
            }
        }
    }

    Ok(())
}
/// Checks that `value` has the JSON type demanded by `schema_type`.
///
/// `schema_type` is either a single type name (`"string"`, `"number"`,
/// `"integer"`, `"boolean"`, `"array"`, `"object"`, `"null"`) or an array
/// of alternatives, in which case matching any one alternative is enough.
/// A `schema_type` that is neither a string nor an array is treated as the
/// type name `"unknown"`, which no value matches.
///
/// `"integer"` accepts every number representable as `i64` or `u64`, so
/// large unsigned values are accepted as well.
///
/// `path` is the dotted location of `value` (empty for the root) and is
/// only used in the error message.
///
/// # Errors
///
/// Returns a description of the mismatch, naming the location, the expected
/// type(s) and the actual type.
fn validate_type_with_path(
    &self,
    value: &Value,
    schema_type: &Value,
    path: &str,
) -> Result<(), String> {
    // Pure predicate, so that probing the alternatives of a union type does
    // not build an error string for every alternative that fails.
    fn type_matches(value: &Value, schema_type: &Value) -> bool {
        match schema_type {
            Value::Array(alternatives) => alternatives
                .iter()
                .any(|alternative| type_matches(value, alternative)),
            single => match (value, single.as_str().unwrap_or("unknown")) {
                (Value::String(_), "string")
                | (Value::Number(_), "number")
                | (Value::Bool(_), "boolean")
                | (Value::Array(_), "array")
                | (Value::Object(_), "object")
                | (Value::Null, "null") => true,
                // Checking only `as_i64` would reject values above i64::MAX.
                (Value::Number(number), "integer") => number.is_i64() || number.is_u64(),
                _ => false,
            },
        }
    }

    if type_matches(value, schema_type) {
        return Ok(());
    }

    let field_info = if path.is_empty() {
        "root".to_string()
    } else {
        format!("'{path}'")
    };

    let message = match schema_type {
        Value::Array(types) => {
            let type_names: Vec<&str> = types.iter().filter_map(Value::as_str).collect();
            format!(
                "Type mismatch at {field_info}: expected one of {type_names:?}, got {}\n Expected: One of the allowed types for AI compatibility\n Suggestion: Convert the value to match the expected type",
                self.get_value_type(value)
            )
        }
        single => {
            let expected_type = single.as_str().unwrap_or("unknown");
            format!(
                "Type mismatch at {field_info}: expected {expected_type}, got {}\n Expected: Correct data type for AI compatibility\n Suggestion: Convert the value to the expected type",
                self.get_value_type(value)
            )
        }
    };
    Err(message)
}
/// Maps a JSON value to the type name used in error messages.
///
/// Every number is reported as `"number"`; integers are not distinguished.
const fn get_value_type(&self, value: &Value) -> &'static str {
    let type_name = match value {
        Value::Null => "null",
        Value::Bool(_) => "boolean",
        Value::Number(_) => "number",
        Value::String(_) => "string",
        Value::Array(_) => "array",
        Value::Object(_) => "object",
    };
    type_name
}
/// Builds the `file_info` schema document (declared as JSON Schema draft-07).
///
/// `total_lines_exact`, `language`, `identifiers` and `summary_lines` are
/// described but not listed as required.
fn file_info_schema(&self) -> Value {
    // Sub-schemas that occur several times are written once and interpolated.
    let integer = json!({ "type": "integer" });
    let boolean = json!({ "type": "boolean" });
    let string_list = json!({
        "type": "array",
        "items": { "type": "string" }
    });
    let optional_string_list = json!({
        "type": ["array", "null"],
        "items": { "type": "string" }
    });
    let required_fields = [
        "lines",
        "total_lines",
        "total_bytes",
        "truncated",
        "truncated_by_lines",
        "truncated_by_bytes",
        "encoding",
        "syntax_errors",
    ];

    json!({
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "lines": string_list,
            "total_lines": integer,
            "total_bytes": integer,
            "total_lines_exact": boolean,
            "truncated": boolean,
            "truncated_by_lines": boolean,
            "truncated_by_bytes": boolean,
            "language": { "type": ["string", "null"] },
            "encoding": { "type": "string" },
            "syntax_errors": string_list,
            "identifiers": optional_string_list,
            "summary_lines": optional_string_list
        },
        "required": required_fields
    })
}
/// Builds the `json_output` schema document (declared as JSON Schema draft-07).
///
/// `language`, `identifiers` and `summary_lines` are described but not
/// required; every other property is required.
fn json_output_schema(&self) -> Value {
    json!({
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "file": { "type": "string" },
            "lines": {
                "type": "array",
                "items": { "type": "string" }
            },
            "processed_lines": { "type": "integer" },
            "total_lines": { "type": "integer" },
            "total_bytes": { "type": "integer" },
            "total_lines_exact": { "type": "boolean" },
            "truncated": { "type": "boolean" },
            "truncated_by_lines": { "type": "boolean" },
            "truncated_by_bytes": { "type": "boolean" },
            "language": {
                "type": ["string", "null"]
            },
            "encoding": { "type": "string" },
            "syntax_errors": {
                "type": "array",
                "items": { "type": "string" }
            },
            "identifiers": {
                "type": ["array", "null"],
                "items": { "type": "string" }
            },
            "identifier_count": { "type": "integer" },
            "identifiers_truncated": { "type": "boolean" },
            "summary_lines": {
                "type": ["array", "null"],
                "items": { "type": "string" }
            },
            "mode": { "type": "string" }
        },
        // Each name is listed exactly once: draft-07 requires the entries
        // of `required` to be unique.
        "required": [
            "file",
            "lines",
            "processed_lines",
            "total_lines",
            "total_bytes",
            "total_lines_exact",
            "truncated",
            "truncated_by_lines",
            "truncated_by_bytes",
            "encoding",
            "identifier_count",
            "identifiers_truncated",
            "syntax_errors",
            "mode"
        ]
    })
}
/// Builds the `token_count` schema document (declared as JSON Schema draft-07).
///
/// All six properties are required.
fn token_count_schema(&self) -> Value {
    let integer = json!({ "type": "integer" });
    let required_fields = [
        "tokens",
        "words",
        "characters",
        "model",
        "fits_in_context",
        "context_usage_percent",
    ];

    json!({
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "tokens": integer,
            "words": integer,
            "characters": integer,
            "model": { "type": "string" },
            "fits_in_context": { "type": "boolean" },
            "context_usage_percent": { "type": "number" }
        },
        "required": required_fields
    })
}
/// Builds the `processing_stats` schema document (declared as JSON Schema
/// draft-07).
///
/// `truncation_reason` and `language` may be `null` and are not required;
/// every other property is required.
fn processing_stats_schema(&self) -> Value {
    // Sub-schemas that occur several times are written once and interpolated.
    let integer = json!({ "type": "integer" });
    let boolean = json!({ "type": "boolean" });
    let nullable_string = json!({ "type": ["string", "null"] });
    let required_fields = [
        "total_lines",
        "processed_lines",
        "total_lines_exact",
        "total_bytes",
        "truncated",
        "has_syntax_errors",
        "error_count",
        "encoding",
        "token_count",
        "tokens_truncated",
        "summary_line_count",
    ];

    json!({
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "total_lines": integer,
            "processed_lines": integer,
            "total_lines_exact": boolean,
            "total_bytes": integer,
            "truncated": boolean,
            "truncation_reason": nullable_string,
            "has_syntax_errors": boolean,
            "error_count": integer,
            "language": nullable_string,
            "encoding": { "type": "string" },
            "token_count": integer,
            "tokens_truncated": boolean,
            "summary_line_count": integer
        },
        "required": required_fields
    })
}
}
impl Default for JsonSchemaValidator {
fn default() -> Self {
Self::new()
}
}
/// Validates a JSON document, given as text, against the `json_output` schema.
///
/// A fresh validator is constructed on every call.
///
/// # Errors
///
/// Forwards the error from `JsonSchemaValidator::validate_json_string`.
pub fn validate_batless_output(json_str: &str) -> BatlessResult<()> {
    JsonSchemaValidator::new().validate_json_string("json_output", json_str)
}
/// Returns an owned copy of the schema registered as `schema_name`.
///
/// A fresh validator is constructed on every call.
///
/// # Errors
///
/// Returns a "not found" error whose help text lists the registered schema
/// names when `schema_name` is unknown.
pub fn get_json_schema(schema_name: &str) -> BatlessResult<Value> {
    let validator = JsonSchemaValidator::new();
    match validator.get_schema(schema_name) {
        Some(schema) => Ok(schema.clone()),
        None => {
            let available = validator.schema_names().join(", ");
            Err(BatlessError::config_error_with_help(
                format!("Schema '{schema_name}' not found"),
                Some(format!("Available schemas: {}", available)),
            ))
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// A new validator lists at least one schema, including `file_info`.
    #[test]
    fn test_schema_validator_creation() {
        let names = JsonSchemaValidator::new().schema_names();
        assert!(!names.is_empty());
        assert!(names.contains(&"file_info".to_string()));
    }

    /// A complete, well-typed `file_info` document is accepted.
    #[test]
    fn test_valid_file_info() {
        let payload = json!({
            "lines": ["line1", "line2"],
            "total_lines": 2,
            "total_bytes": 100,
            "truncated": false,
            "truncated_by_lines": false,
            "truncated_by_bytes": false,
            "language": "rust",
            "encoding": "UTF-8",
            "syntax_errors": [],
            "identifiers": null,
            "summary_lines": null
        });
        let outcome = JsonSchemaValidator::new().validate("file_info", &payload);
        if let Err(e) = outcome.as_ref() {
            println!("Validation error: {e}");
        }
        assert!(outcome.is_ok());
    }

    /// Omitting required `file_info` fields is rejected.
    #[test]
    fn test_invalid_file_info_missing_field() {
        let payload = json!({
            "lines": ["line1", "line2"],
            "total_lines": 2
        });
        let outcome = JsonSchemaValidator::new().validate("file_info", &payload);
        assert!(outcome.is_err());
    }

    /// A string where an integer is expected is rejected.
    #[test]
    fn test_invalid_file_info_wrong_type() {
        let payload = json!({
            "lines": ["line1", "line2"],
            "total_lines": "not_a_number",
            "total_bytes": 100,
            "truncated": false,
            "truncated_by_lines": false,
            "truncated_by_bytes": false,
            "encoding": "UTF-8",
            "syntax_errors": []
        });
        let outcome = JsonSchemaValidator::new().validate("file_info", &payload);
        assert!(outcome.is_err());
    }

    /// A complete `token_count` document is accepted.
    #[test]
    fn test_token_count_schema() {
        let payload = json!({
            "tokens": 150,
            "words": 100,
            "characters": 500,
            "model": "gpt-4",
            "fits_in_context": true,
            "context_usage_percent": 12.5
        });
        let outcome = JsonSchemaValidator::new().validate("token_count", &payload);
        assert!(outcome.is_ok());
    }

    /// `get_schema` returns the registered document for a known name.
    #[test]
    fn test_get_schema() {
        let validator = JsonSchemaValidator::new();
        let lookup = validator.get_schema("file_info");
        assert!(lookup.is_some());
        let document = lookup.unwrap();
        assert_eq!(document["type"], "object");
    }

    /// Validation also works when the document is supplied as text.
    #[test]
    fn test_validate_json_string() {
        let text = r#"{
"tokens": 150,
"words": 100,
"characters": 500,
"model": "gpt-4",
"fits_in_context": true,
"context_usage_percent": 12.5
}"#;
        let outcome = JsonSchemaValidator::new().validate_json_string("token_count", text);
        assert!(outcome.is_ok());
    }

    /// The convenience wrapper accepts a representative `json_output` payload.
    #[test]
    fn test_validate_batless_output() {
        let text = r#"{
"file": "test.rs",
"lines": ["line1"],
"processed_lines": 1,
"total_lines": 1,
"total_lines_exact": true,
"total_bytes": 10,
"truncated": false,
"truncated_by_lines": false,
"truncated_by_bytes": false,
"language": "rust",
"encoding": "UTF-8",
"syntax_errors": [],
"identifier_count": 0,
"identifiers_truncated": false,
"identifiers": null,
"summary_lines": null,
"mode": "json"
}"#;
        let result = validate_batless_output(text);
        assert!(
            result.is_ok(),
            "json_output validation should pass for sample payload: {result:?}"
        );
    }

    /// Error messages carry the "Expected"/"Suggestion" guidance lines.
    #[test]
    fn test_enhanced_error_messages() {
        let validator = JsonSchemaValidator::new();

        // Missing required field ("tokens" and "context_usage_percent" absent).
        let incomplete = json!({
            "words": 100,
            "characters": 500,
            "model": "gpt-4",
            "fits_in_context": true
        });
        let outcome = validator.validate("token_count", &incomplete);
        assert!(outcome.is_err());
        let missing_msg = outcome.unwrap_err().to_string();
        for fragment in [
            "Missing required field",
            "Expected: This field is required for AI compatibility",
            "Suggestion: Add the missing field",
        ] {
            assert!(missing_msg.contains(fragment));
        }

        // Wrong type for "tokens".
        let mistyped = json!({
            "tokens": "not_a_number",
            "words": 100,
            "characters": 500,
            "model": "gpt-4",
            "fits_in_context": true,
            "context_usage_percent": 12.5
        });
        let outcome = validator.validate("token_count", &mistyped);
        assert!(outcome.is_err());
        let mismatch_msg = outcome.unwrap_err().to_string();
        for fragment in [
            "Type mismatch",
            "Expected: Correct data type for AI compatibility",
            "Suggestion: Convert the value to the expected type",
        ] {
            assert!(mismatch_msg.contains(fragment));
        }
    }
}