Skip to main content

sanitize_engine/processor/
yaml_proc.rs

1//! YAML structured processor.
2//!
//! Parses YAML input, walks the value tree, replaces matched field
//! values, and serializes back. Structure is preserved, but minor
//! formatting differences are possible (`serde_yaml_ng` normalizes
//! some whitespace).
7//!
8//! Key paths use the same dot-separated convention as the JSON processor.
9
10use crate::error::{Result, SanitizeError};
11use crate::processor::{build_path, find_matching_rule, replace_value, FileTypeProfile, Processor};
12use crate::store::MappingStore;
13use serde_yaml_ng::Value;
14
/// Maximum recursion depth for walking YAML value trees.
///
/// `walk_yaml` returns an error once this depth is exceeded, and
/// `count_yaml_nodes_inner` stops descending beyond it.
const MAX_YAML_DEPTH: usize = 128;

/// Maximum allowed size (in bytes) for raw YAML input.
/// Guards against alias/anchor bombs that expand exponentially (R-4 / F-04 / F-06 fix).
///
/// Checked at the start of `YamlProcessor::process`, before the input is
/// decoded or parsed.
const MAX_YAML_INPUT_SIZE: usize = 64 * 1024 * 1024; // 64 MiB

/// Maximum number of distinct YAML nodes after alias expansion.
/// serde_yaml expands aliases into values during deserialization;
/// this limit caps the total node count to prevent exponential
/// growth from alias bombs (F-06 fix).
///
/// Compared against the result of `count_yaml_nodes` in
/// `YamlProcessor::process`, after parsing and before the tree is walked.
const MAX_YAML_NODE_COUNT: usize = 10_000_000;
27
/// Structured processor for YAML files.
///
/// A field-less unit type: it carries no state of its own, and its behavior
/// is provided by the `Processor` implementation in this module.
pub struct YamlProcessor;
30
31impl Processor for YamlProcessor {
32    fn name(&self) -> &'static str {
33        "yaml"
34    }
35
36    fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool {
37        if profile.processor == "yaml" {
38            return true;
39        }
40        // Heuristic: starts with `---` or a YAML-ish key: value.
41        let text = String::from_utf8_lossy(content);
42        let trimmed = text.trim_start();
43        trimmed.starts_with("---") || trimmed.starts_with("- ") || trimmed.contains(": ")
44    }
45
46    fn process(
47        &self,
48        content: &[u8],
49        profile: &FileTypeProfile,
50        store: &MappingStore,
51    ) -> Result<Vec<u8>> {
52        // Guard against alias bombs: reject inputs above MAX_YAML_INPUT_SIZE.
53        if content.len() > MAX_YAML_INPUT_SIZE {
54            return Err(SanitizeError::InputTooLarge {
55                size: content.len(),
56                limit: MAX_YAML_INPUT_SIZE,
57            });
58        }
59
60        let text = std::str::from_utf8(content).map_err(|e| SanitizeError::ParseError {
61            format: "YAML".into(),
62            message: format!("invalid UTF-8: {}", e),
63        })?;
64
65        let mut value: Value =
66            serde_yaml_ng::from_str(text).map_err(|e| SanitizeError::ParseError {
67                format: "YAML".into(),
68                message: format!("YAML parse error: {}", e),
69            })?;
70
71        // F-06 fix: count total nodes in the deserialized tree to detect
72        // alias bombs. After expansion, aliased subtrees become
73        // independent copies in memory, so the node count reflects the
74        // true memory footprint.
75        let node_count = count_yaml_nodes(&value);
76        if node_count > MAX_YAML_NODE_COUNT {
77            return Err(SanitizeError::InputTooLarge {
78                size: node_count,
79                limit: MAX_YAML_NODE_COUNT,
80            });
81        }
82
83        walk_yaml(&mut value, "", profile, store, 0)?;
84
85        let output = serde_yaml_ng::to_string(&value)
86            .map_err(|e| SanitizeError::IoError(format!("YAML serialize error: {}", e)))?;
87
88        Ok(output.into_bytes())
89    }
90}
91
92/// Count the total number of nodes in a YAML value tree (F-06 fix).
93/// Used to detect alias bombs that produce a small source document
94/// but expand to millions of nodes after alias resolution.
95fn count_yaml_nodes(value: &Value) -> usize {
96    count_yaml_nodes_inner(value, 0)
97}
98
99/// Inner recursive counter with depth guard to prevent stack overflow
100/// on deeply nested YAML before `walk_yaml`'s depth check is reached.
101fn count_yaml_nodes_inner(value: &Value, depth: usize) -> usize {
102    if depth > MAX_YAML_DEPTH {
103        return 1; // Stop counting deeper; walk_yaml will catch depth violations
104    }
105    match value {
106        Value::Mapping(map) => {
107            1 + map
108                .iter()
109                .map(|(k, v)| {
110                    count_yaml_nodes_inner(k, depth + 1) + count_yaml_nodes_inner(v, depth + 1)
111                })
112                .sum::<usize>()
113        }
114        Value::Sequence(seq) => {
115            1 + seq
116                .iter()
117                .map(|v| count_yaml_nodes_inner(v, depth + 1))
118                .sum::<usize>()
119        }
120        Value::Tagged(tagged) => 1 + count_yaml_nodes_inner(&tagged.value, depth + 1),
121        _ => 1, // Null, Bool, Number, String
122    }
123}
124
125/// Recursively walk a YAML value, replacing matched fields.
126fn walk_yaml(
127    value: &mut Value,
128    prefix: &str,
129    profile: &FileTypeProfile,
130    store: &MappingStore,
131    depth: usize,
132) -> Result<()> {
133    if depth > MAX_YAML_DEPTH {
134        return Err(SanitizeError::RecursionDepthExceeded(format!(
135            "YAML recursion depth exceeds limit of {MAX_YAML_DEPTH}"
136        )));
137    }
138    match value {
139        Value::Mapping(map) => {
140            let keys: Vec<Value> = map.keys().cloned().collect();
141            for key in keys {
142                let key_str = yaml_key_to_string(&key);
143                let path = build_path(prefix, &key_str);
144
145                if let Some(v) = map.get_mut(&key) {
146                    match v {
147                        Value::String(s) => {
148                            if let Some(rule) = find_matching_rule(&path, profile) {
149                                *s = replace_value(s, rule, store)?;
150                            }
151                        }
152                        Value::Number(_) | Value::Bool(_) => {
153                            if let Some(rule) = find_matching_rule(&path, profile) {
154                                let repr = yaml_scalar_to_string(v);
155                                let replaced = replace_value(&repr, rule, store)?;
156                                *v = Value::String(replaced);
157                            }
158                        }
159                        Value::Mapping(_) | Value::Sequence(_) => {
160                            walk_yaml(v, &path, profile, store, depth + 1)?;
161                        }
162                        Value::Null | Value::Tagged(_) => {}
163                    }
164                }
165            }
166        }
167        Value::Sequence(seq) => {
168            for item in seq.iter_mut() {
169                walk_yaml(item, prefix, profile, store, depth + 1)?;
170            }
171        }
172        _ => {}
173    }
174    Ok(())
175}
176
177fn yaml_key_to_string(key: &Value) -> String {
178    match key {
179        Value::String(s) => s.clone(),
180        Value::Number(n) => n.to_string(),
181        Value::Bool(b) => b.to_string(),
182        _ => format!("{:?}", key),
183    }
184}
185
186fn yaml_scalar_to_string(v: &Value) -> String {
187    match v {
188        Value::String(s) => s.clone(),
189        Value::Number(n) => n.to_string(),
190        Value::Bool(b) => b.to_string(),
191        _ => String::new(),
192    }
193}
194
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Builds a mapping store backed by an HMAC generator with a fixed key.
    fn make_store() -> MappingStore {
        let generator = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(generator, None)
    }

    /// Runs `content` through a `YamlProcessor` with a fresh store and
    /// returns the sanitized output as text.
    fn run_processor(content: &[u8], profile: &FileTypeProfile) -> String {
        let store = make_store();
        let sanitized = YamlProcessor.process(content, profile, &store).unwrap();
        String::from_utf8(sanitized).unwrap()
    }

    /// Nested string fields named by a rule are replaced; unrelated fields
    /// are kept.
    #[test]
    fn basic_yaml_replacement() {
        let profile = FileTypeProfile::new(
            "yaml",
            vec![
                FieldRule::new("database.password").with_category(Category::Custom("pw".into())),
                FieldRule::new("database.host").with_category(Category::Hostname),
            ],
        );

        let out = run_processor(
            b"database:\n  host: db.corp.com\n  password: s3cret\nport: 5432\n",
            &profile,
        );

        assert!(!out.contains("s3cret"));
        assert!(!out.contains("db.corp.com"));
        // No rule names `port`, so its value must survive.
        assert!(out.contains("5432"));
    }

    /// Fields of mappings inside a sequence are matched under the
    /// sequence's own path.
    #[test]
    fn yaml_sequence_traversal() {
        let profile = FileTypeProfile::new(
            "yaml",
            vec![FieldRule::new("users.email").with_category(Category::Email)],
        );

        let out = run_processor(
            b"users:\n  - email: a@b.com\n  - email: c@d.com\n",
            &profile,
        );

        assert!(!out.contains("a@b.com"));
        assert!(!out.contains("c@d.com"));
    }
}