Skip to main content

devops_validate/
repair.rs

1//! 6-stage YAML auto-repair pipeline.
2//!
3//! Given a YAML string and a JSON Schema, applies deterministic fixes and
4//! identifies fields that need human or LLM assistance.
5//!
6//! ## Stages
7//!
8//! 1. **Normalize** — trim whitespace, normalize line endings
9//! 2. **Parse** — `serde_yaml` → `serde_json::Value`
10//! 3. **Fill defaults** — inject `default` values from schema
11//! 4. **Collect errors** — walk schema, find violations
12//! 5. **Categorize** — split into deterministic vs ambiguous errors
13//! 6. **Fix** — type coercion, extra-key removal; ambiguous → `llm_fields`
14//!
15//! ## Example
16//!
17//! ```rust
18//! use devops_validate::repair::repair_yaml;
19//! use serde_json::json;
20//!
21//! let schema = json!({
22//!     "properties": { "replicas": { "type": "integer", "default": 1 } }
23//! });
24//! let result = repair_yaml("replicas: \"3\"", &schema);
25//! assert!(result.valid);
26//! ```
27
28use std::borrow::Cow;
29
30use devops_models::models::validation::RepairResult;
31use serde_json::Value;
32
33/// Apply the 6-stage YAML repair pipeline to `yaml_content` guided by `schema`.
34///
35/// **Stages**:
36/// 1. *Normalize* — strip surrounding whitespace, normalise CRLF → LF.
37/// 2. *Parse* — `serde_yaml` → [`serde_json::Value`]; returns early on parse failure.
38/// 3. *Fill defaults* — inject `"default"` values from `schema` for absent fields.
39/// 4. *Collect errors* — walk `schema` recursively and gather violations.
40/// 5. *Categorize* — split into *deterministic* (auto-fixable) vs *ambiguous* errors.
41/// 6. *Fix* — apply type coercion and extra-key removal; ambiguous errors become
42///    [`RepairResult::llm_fields`] for human or LLM review.
43///
44/// The function **never panics** — even an entirely unparseable input returns a
45/// [`RepairResult`] with `valid = false` and the original YAML unchanged.
46///
47/// # Arguments
48///
49/// - `yaml_content` — raw YAML string (may be invalid).
50/// - `schema` — JSON Schema object used to guide repair (e.g. from [`crate::schema::SchemaRegistry`]).
51///
52/// # Example
53///
54/// ```rust
55/// use devops_validate::repair::repair_yaml;
56/// use serde_json::json;
57///
58/// let schema = json!({
59///     "type": "object",
60///     "properties": {
61///         "replicas": { "type": "integer", "default": 1 },
62///         "name":     { "type": "string" }
63///     }
64/// });
65///
66/// // "replicas" is a string "3" — should be coerced to integer 3
67/// let result = repair_yaml("replicas: \"3\"\nname: my-app", &schema);
68///
69/// assert!(result.valid);
70/// assert!(result.repaired_yaml.contains("replicas: 3"));
71/// assert!(result.warnings.iter().any(|w| w.contains("Coerced")));
72/// ```
73pub fn repair_yaml(yaml_content: &str, schema: &Value) -> RepairResult {
74    // Stage 1: Syntax normalization
75    let normalized = normalize_yaml(yaml_content);
76
77    // Stage 2: Parse
78    let mut data: Value = match serde_yaml::from_str(&normalized) {
79        Ok(v) => v,
80        Err(e) => {
81            return RepairResult {
82                valid: false,
83                repaired_yaml: yaml_content.to_string(),
84                errors: vec![format!("YAML parse error: {e}")],
85                warnings: vec![],
86                llm_fields: vec![],
87                summary: format!("Cannot parse YAML: {e}"),
88            }
89        }
90    };
91
92    // Stage 3: Fill defaults from schema
93    if let Some(obj) = data.as_object_mut() {
94        fill_defaults(obj, schema);
95    }
96
97    // Stage 4 & 5: Collect and categorize errors
98    let (deterministic, ambiguous) = categorize_schema_errors(&data, schema);
99
100    // Stage 6: Apply deterministic fixes
101    let mut fix_log = Vec::new();
102    let mut failed_fixes = Vec::new();
103
104    for error in &deterministic {
105        match error.kind {
106            SchemaErrorKind::Type => {
107                if let Some(target_type) = &error.expected_type {
108                    if apply_type_coercion(&mut data, &error.path, target_type) {
109                        fix_log.push(format!(
110                            "Coerced {} to type '{}'",
111                            path_str(&error.path),
112                            target_type
113                        ));
114                    } else {
115                        failed_fixes.push(path_str(&error.path));
116                    }
117                }
118            }
119            SchemaErrorKind::AdditionalProperties => {
120                if let Some(allowed) = &error.allowed_keys {
121                    strip_extra_keys(&mut data, &error.path, allowed);
122                    fix_log.push(format!(
123                        "Removed extra keys at {}",
124                        path_str(&error.path)
125                    ));
126                }
127            }
128            SchemaErrorKind::Enum => {
129                failed_fixes.push(path_str(&error.path));
130            }
131            SchemaErrorKind::Required => {}
132        }
133    }
134
135    // Re-serialize
136    let repaired_yaml = match serde_yaml::to_string(&data) {
137        Ok(s) => s,
138        Err(e) => {
139            return RepairResult {
140                valid: false,
141                repaired_yaml: yaml_content.to_string(),
142                errors: vec![format!("Failed to serialize repaired YAML: {e}")],
143                warnings: fix_log,
144                llm_fields: vec![],
145                summary: "Repair failed during serialization".to_string(),
146            }
147        }
148    };
149
150    // Collect remaining issues
151    let mut remaining_errors: Vec<String> = ambiguous.iter().map(|e| e.message.clone()).collect();
152    remaining_errors.extend(failed_fixes.iter().map(|p| format!("Could not auto-fix: {p}")));
153
154    let llm_fields: Vec<String> = ambiguous
155        .iter()
156        .map(|e| path_str(&e.path))
157        .chain(failed_fixes)
158        .collect();
159
160    let valid = remaining_errors.is_empty();
161    let summary = if valid {
162        format!(
163            "YAML repaired successfully ({} fix{})",
164            fix_log.len(),
165            if fix_log.len() == 1 { "" } else { "es" }
166        )
167    } else {
168        format!(
169            "{} fix(es) applied, {} issue(s) remaining (need LLM assistance)",
170            fix_log.len(),
171            remaining_errors.len()
172        )
173    };
174
175    RepairResult {
176        valid,
177        repaired_yaml,
178        errors: remaining_errors,
179        warnings: fix_log,
180        llm_fields,
181        summary,
182    }
183}
184
185// --- Stage 1: Normalization ---
186
/// Normalize raw YAML text before parsing.
///
/// Steps, in order:
///
/// 1. Convert `\r\n` and lone `\r` line endings to `\n`.
/// 2. If every non-blank line starts with at least `n > 0` spaces, remove up to
///    `n` leading spaces from every line (a uniform dedent).
/// 3. Trim leading and trailing whitespace from the whole text.
///
/// Step 2 exists because trimming alone removes the indentation of the *first*
/// line only. For a uniformly indented document such as `"  - x\n  - y"` that
/// would leave `"- x\n  - y"`, whose second line is no longer aligned with the
/// first. Dedenting all lines by the same amount keeps their relative
/// indentation intact.
///
/// When the shared indent is zero the result is exactly the trimmed,
/// line-ending-normalized input.
fn normalize_yaml(content: &str) -> String {
    let unified = content.replace("\r\n", "\n").replace('\r', "\n");

    // Smallest number of leading spaces among lines that carry content.
    // Whitespace-only lines are ignored so they cannot force the indent to zero.
    let shared_indent = unified
        .lines()
        .filter(|line| !line.trim().is_empty())
        .map(|line| line.len() - line.trim_start_matches(' ').len())
        .min()
        .unwrap_or(0);

    if shared_indent == 0 {
        return unified.trim().to_string();
    }

    let dedented: Vec<&str> = unified
        .lines()
        .map(|line| {
            // Blank lines may be shorter than the shared indent, so strip only
            // the spaces that are actually there. Spaces are ASCII, which keeps
            // the slice on a char boundary.
            let removable = line
                .bytes()
                .take(shared_indent)
                .take_while(|&b| b == b' ')
                .count();
            &line[removable..]
        })
        .collect();

    dedented.join("\n").trim().to_string()
}
194
195// --- Stage 3: Fill defaults ---
196
197fn fill_defaults(obj: &mut serde_json::Map<String, Value>, schema: &Value) {
198    let properties = match schema.get("properties").and_then(|p| p.as_object()) {
199        Some(p) => p,
200        None => return,
201    };
202
203    for (prop, subschema) in properties {
204        if !obj.contains_key(prop) {
205            if let Some(default_val) = subschema.get("default") {
206                obj.insert(prop.clone(), default_val.clone());
207            }
208        } else if let Some(nested_obj) = obj.get_mut(prop).and_then(|v| v.as_object_mut())
209            && subschema.get("properties").is_some()
210        {
211            fill_defaults(nested_obj, subschema);
212        }
213    }
214}
215
216// --- Stage 4 & 5: Error categorization ---
217
/// The schema keyword that a validation error violated.
#[derive(Debug, Clone, PartialEq)]
enum SchemaErrorKind {
    /// The value does not have the JSON type named by the schema's `type`.
    Type,
    /// The value is not one of the schema's `enum` members.
    Enum,
    /// A key listed in the schema's `required` array is missing.
    Required,
    /// The object has keys outside `properties` although
    /// `additionalProperties` is `false`.
    AdditionalProperties,
}

impl std::fmt::Display for SchemaErrorKind {
    /// Writes the JSON Schema keyword this kind corresponds to.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let keyword = match self {
            Self::Type => "type",
            Self::Enum => "enum",
            Self::Required => "required",
            Self::AdditionalProperties => "additionalProperties",
        };
        f.write_str(keyword)
    }
}
237
238/// Internal representation of a schema validation error.
239#[derive(Debug, Clone)]
240struct SchemaError {
241    path: Vec<String>,
242    kind: SchemaErrorKind,
243    message: String,
244    expected_type: Option<String>,
245    allowed_keys: Option<Vec<String>>,
246}
247
248impl std::fmt::Display for SchemaError {
249    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
250        write!(f, "{}", self.message)
251    }
252}
253
254impl std::error::Error for SchemaError {}
255
256fn categorize_schema_errors(data: &Value, schema: &Value) -> (Vec<SchemaError>, Vec<SchemaError>) {
257    let mut deterministic = Vec::new();
258    let mut ambiguous = Vec::new();
259
260    validate_against_schema(data, schema, &mut vec![], &mut deterministic, &mut ambiguous);
261
262    (deterministic, ambiguous)
263}
264
265fn validate_against_schema(
266    data: &Value,
267    schema: &Value,
268    path: &mut Vec<String>,
269    deterministic: &mut Vec<SchemaError>,
270    ambiguous: &mut Vec<SchemaError>,
271) {
272    // Check type constraint
273    if let Some(expected_type) = schema.get("type").and_then(|t| t.as_str())
274        && !value_matches_type(data, expected_type)
275    {
276        deterministic.push(SchemaError {
277            path: path.clone(),
278            kind: SchemaErrorKind::Type,
279            message: format!(
280                "{}: expected type '{}', got '{}'",
281                path_str(path),
282                expected_type,
283                json_type_name(data)
284            ),
285            expected_type: Some(expected_type.to_string()),
286            allowed_keys: None,
287        });
288        return;
289    }
290
291    // Check enum constraint
292    if let Some(enum_values) = schema.get("enum").and_then(|e| e.as_array())
293        && !enum_values.contains(data)
294    {
295        deterministic.push(SchemaError {
296            path: path.clone(),
297            kind: SchemaErrorKind::Enum,
298            message: format!("{}: value not in enum {:?}", path_str(path), enum_values),
299            expected_type: None,
300            allowed_keys: None,
301        });
302    }
303
304    // Check required properties
305    if let Some(required) = schema.get("required").and_then(|r| r.as_array())
306        && let Some(obj) = data.as_object()
307    {
308        for req in required {
309            if let Some(key) = req.as_str()
310                && !obj.contains_key(key)
311            {
312                ambiguous.push(SchemaError {
313                    path: path.clone(),
314                    kind: SchemaErrorKind::Required,
315                    message: format!("{}: missing required field '{}'", path_str(path), key),
316                    expected_type: None,
317                    allowed_keys: None,
318                });
319            }
320        }
321    }
322
323    // Check additionalProperties
324    if let Some(additional) = schema.get("additionalProperties")
325        && additional == &Value::Bool(false)
326        && let (Some(obj), Some(props)) = (
327            data.as_object(),
328            schema.get("properties").and_then(|p| p.as_object()),
329        )
330    {
331        let allowed: Vec<String> = props.keys().cloned().collect();
332        let extra: Vec<&String> = obj.keys().filter(|k| !allowed.contains(k)).collect();
333        if !extra.is_empty() {
334            deterministic.push(SchemaError {
335                path: path.clone(),
336                kind: SchemaErrorKind::AdditionalProperties,
337                message: format!(
338                    "{}: unknown fields: {}",
339                    path_str(path),
340                    extra.iter().map(|s| s.as_str()).collect::<Vec<_>>().join(", ")
341                ),
342                expected_type: None,
343                allowed_keys: Some(allowed),
344            });
345        }
346    }
347
348    // Recurse into properties
349    if let Some(properties) = schema.get("properties").and_then(|p| p.as_object())
350        && let Some(obj) = data.as_object()
351    {
352        for (key, subschema) in properties {
353            if let Some(value) = obj.get(key) {
354                path.push(key.clone());
355                validate_against_schema(value, subschema, path, deterministic, ambiguous);
356                path.pop();
357            }
358        }
359    }
360
361    // Recurse into array items
362    if let Some(items_schema) = schema.get("items")
363        && let Some(arr) = data.as_array()
364    {
365        for (i, item) in arr.iter().enumerate() {
366            path.push(i.to_string());
367            validate_against_schema(item, items_schema, path, deterministic, ambiguous);
368            path.pop();
369        }
370    }
371}
372
373// --- Stage 6: Fix application ---
374
375fn apply_type_coercion(data: &mut Value, path: &[String], target_type: &str) -> bool {
376    let value = navigate_to_mut(data, path);
377    let value = match value {
378        Some(v) => v,
379        None => return false,
380    };
381
382    let coerced = coerce_type(value, target_type);
383    match coerced {
384        Cow::Owned(new_value) => {
385            *value = new_value;
386            true
387        }
388        Cow::Borrowed(_) => false,
389    }
390}
391
392fn coerce_type<'a>(value: &'a Value, target_type: &str) -> Cow<'a, Value> {
393    match target_type {
394        "integer" => {
395            if let Some(s) = value.as_str()
396                && let Ok(n) = s.parse::<i64>()
397            {
398                return Cow::Owned(Value::Number(n.into()));
399            }
400            if let Some(f) = value.as_f64() {
401                return Cow::Owned(Value::Number((f as i64).into()));
402            }
403            Cow::Borrowed(value)
404        }
405        "number" => {
406            if let Some(s) = value.as_str()
407                && let Ok(f) = s.parse::<f64>()
408            {
409                return match serde_json::Number::from_f64(f) {
410                    Some(n) => Cow::Owned(Value::Number(n)),
411                    None => Cow::Borrowed(value),
412                };
413            }
414            Cow::Borrowed(value)
415        }
416        "string" => match value {
417            Value::Number(n) => Cow::Owned(Value::String(n.to_string())),
418            Value::Bool(b) => Cow::Owned(Value::String(b.to_string())),
419            _ => Cow::Borrowed(value),
420        },
421        "boolean" => {
422            if let Some(s) = value.as_str() {
423                let lower = s.to_lowercase();
424                return Cow::Owned(Value::Bool(matches!(
425                    lower.as_str(),
426                    "true" | "yes" | "1" | "on"
427                )));
428            }
429            if let Some(n) = value.as_i64() {
430                return Cow::Owned(Value::Bool(n != 0));
431            }
432            Cow::Borrowed(value)
433        }
434        _ => Cow::Borrowed(value),
435    }
436}
437
438fn strip_extra_keys(data: &mut Value, path: &[String], allowed: &[String]) {
439    let node = navigate_to_mut(data, path);
440    if let Some(obj) = node.and_then(|v| v.as_object_mut()) {
441        let keys_to_remove: Vec<String> = obj
442            .keys()
443            .filter(|k| !allowed.contains(k))
444            .cloned()
445            .collect();
446        for key in keys_to_remove {
447            obj.remove(&key);
448        }
449    }
450}
451
452// --- Utilities ---
453
454fn navigate_to_mut<'a>(data: &'a mut Value, path: &[String]) -> Option<&'a mut Value> {
455    let mut current = data;
456    for key in path {
457        current = if let Ok(idx) = key.parse::<usize>() {
458            current.get_mut(idx)?
459        } else {
460            current.get_mut(key.as_str())?
461        };
462    }
463    Some(current)
464}
465
466fn value_matches_type(value: &Value, type_name: &str) -> bool {
467    match type_name {
468        "object" => value.is_object(),
469        "array" => value.is_array(),
470        "string" => value.is_string(),
471        "number" => value.is_number(),
472        "integer" => value.is_i64() || value.is_u64(),
473        "boolean" => value.is_boolean(),
474        "null" => value.is_null(),
475        _ => true,
476    }
477}
478
479fn json_type_name(value: &Value) -> &'static str {
480    match value {
481        Value::Null => "null",
482        Value::Bool(_) => "boolean",
483        Value::Number(_) => "number",
484        Value::String(_) => "string",
485        Value::Array(_) => "array",
486        Value::Object(_) => "object",
487    }
488}
489
/// Render a node path for messages: segments joined by `" > "`, or `"root"`
/// when the path is empty.
fn path_str(path: &[String]) -> String {
    match path {
        [] => String::from("root"),
        segments => segments.join(" > "),
    }
}
497
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// A string made only of digits becomes a JSON integer.
    #[test]
    fn test_coerce_string_to_integer() {
        let input = json!("42");
        let expected: Cow<'_, Value> = Cow::Owned(json!(42));
        assert_eq!(coerce_type(&input, "integer"), expected);
    }

    /// Common truthy and falsy words map to the matching boolean.
    #[test]
    fn test_coerce_string_to_boolean() {
        for (word, expected) in [("true", true), ("yes", true), ("no", false)] {
            let input = Value::String(word.to_string());
            assert_eq!(
                coerce_type(&input, "boolean"),
                Cow::Owned::<Value>(Value::Bool(expected)),
                "coercing {word:?}"
            );
        }
    }

    /// CRLF endings are converted and surrounding whitespace is removed.
    #[test]
    fn test_normalize_yaml() {
        let normalized = normalize_yaml("  foo: bar\r\n  ");
        assert_eq!(normalized, "foo: bar");
    }

    /// Missing properties receive their default; present ones are kept.
    #[test]
    fn test_fill_defaults() {
        let schema = json!({
            "properties": {
                "replicas": { "type": "integer", "default": 1 },
                "name": { "type": "string" }
            }
        });
        let mut obj = serde_json::Map::new();
        obj.insert("name".to_string(), json!("test"));

        fill_defaults(&mut obj, &schema);

        assert_eq!(obj.get("replicas"), Some(&json!(1)));
        assert_eq!(obj.get("name"), Some(&json!("test")));
    }
}