Skip to main content

sanitize_engine/processor/
yaml_proc.rs

//! YAML structured processor.
//!
//! Parses YAML input, walks the value tree, replaces matched field
//! values, and serializes back. Structure is preserved, but minor
//! formatting differences are possible (`serde_yaml_ng` normalizes some
//! whitespace).
//!
//! Key paths use the same dot-separated convention as the JSON processor.
9
10use crate::error::{Result, SanitizeError};
11use crate::processor::limits::{DEFAULT_DEPTH, YAML_INPUT_SIZE, YAML_NODE_COUNT};
12use crate::processor::{walk_tree, FileTypeProfile, Processor, TreeNode};
13use crate::store::MappingStore;
14use serde_yaml_ng::Value;
15
/// Structured processor for YAML files.
///
/// This is a field-less unit type, so it carries no state of its own. The
/// common traits are derived so the type can be debug-printed, copied, and
/// built through `Default` like any other value.
#[derive(Debug, Clone, Copy, Default)]
pub struct YamlProcessor;
18
19impl Processor for YamlProcessor {
20    fn name(&self) -> &'static str {
21        "yaml"
22    }
23
24    fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool {
25        if profile.processor == "yaml" {
26            return true;
27        }
28        // Heuristic: starts with `---` or a YAML-ish key: value.
29        let text = String::from_utf8_lossy(content);
30        let trimmed = text.trim_start();
31        trimmed.starts_with("---")
32            || trimmed.starts_with("- ")
33            || trimmed.starts_with('{')
34            || trimmed.contains(": ")
35    }
36
37    fn process(
38        &self,
39        content: &[u8],
40        profile: &FileTypeProfile,
41        store: &MappingStore,
42    ) -> Result<Vec<u8>> {
43        // Guard against alias bombs: reject inputs above YAML_INPUT_SIZE.
44        if content.len() > YAML_INPUT_SIZE {
45            return Err(SanitizeError::InputTooLarge {
46                size: content.len(),
47                limit: YAML_INPUT_SIZE,
48            });
49        }
50
51        let text = std::str::from_utf8(content).map_err(|e| SanitizeError::ParseError {
52            format: "YAML".into(),
53            message: format!("invalid UTF-8: {}", e),
54        })?;
55
56        let mut value: Value =
57            serde_yaml_ng::from_str(text).map_err(|e| SanitizeError::ParseError {
58                format: "YAML".into(),
59                message: format!("YAML parse error: {}", e),
60            })?;
61
62        // F-06 fix: count total nodes in the deserialized tree to detect
63        // alias bombs. After expansion, aliased subtrees become
64        // independent copies in memory, so the node count reflects the
65        // true memory footprint.
66        let node_count = count_yaml_nodes(&value);
67        if node_count > YAML_NODE_COUNT {
68            return Err(SanitizeError::InputTooLarge {
69                size: node_count,
70                limit: YAML_NODE_COUNT,
71            });
72        }
73
74        walk_yaml(&mut value, "", profile, store, 0)?;
75
76        let output = serde_yaml_ng::to_string(&value).map_err(|e| {
77            SanitizeError::IoError(std::io::Error::other(format!("YAML serialize error: {e}")))
78        })?;
79
80        Ok(output.into_bytes())
81    }
82}
83
84/// Count the total number of nodes in a YAML value tree (F-06 fix).
85/// Used to detect alias bombs that produce a small source document
86/// but expand to millions of nodes after alias resolution.
87fn count_yaml_nodes(value: &Value) -> usize {
88    count_yaml_nodes_inner(value, 0)
89}
90
91/// Inner recursive counter with depth guard to prevent stack overflow
92/// on deeply nested YAML before `walk_yaml`'s depth check is reached.
93fn count_yaml_nodes_inner(value: &Value, depth: usize) -> usize {
94    if depth > DEFAULT_DEPTH {
95        return 1; // Stop counting deeper; walk_yaml will catch depth violations
96    }
97    match value {
98        Value::Mapping(map) => {
99            1 + map
100                .iter()
101                .map(|(k, v)| {
102                    count_yaml_nodes_inner(k, depth + 1) + count_yaml_nodes_inner(v, depth + 1)
103                })
104                .sum::<usize>()
105        }
106        Value::Sequence(seq) => {
107            1 + seq
108                .iter()
109                .map(|v| count_yaml_nodes_inner(v, depth + 1))
110                .sum::<usize>()
111        }
112        Value::Tagged(tagged) => 1 + count_yaml_nodes_inner(&tagged.value, depth + 1),
113        _ => 1, // Null, Bool, Number, String
114    }
115}
116
117impl TreeNode for Value {
118    fn for_each_map_entry<F>(&mut self, mut f: F) -> Result<()>
119    where
120        F: FnMut(&str, &mut Self) -> Result<()>,
121    {
122        if let Self::Mapping(map) = self {
123            let keys: Vec<Self> = map.keys().cloned().collect();
124            for key in keys {
125                let key_str = yaml_key_to_string(&key);
126                if let Some(v) = map.get_mut(&key) {
127                    f(&key_str, v)?;
128                }
129            }
130        }
131        Ok(())
132    }
133
134    fn for_each_seq_item<F>(&mut self, mut f: F) -> Result<()>
135    where
136        F: FnMut(&mut Self) -> Result<()>,
137    {
138        if let Self::Sequence(seq) = self {
139            for item in seq.iter_mut() {
140                f(item)?;
141            }
142        }
143        Ok(())
144    }
145
146    fn as_str_mut(&mut self) -> Option<&mut String> {
147        if let Self::String(s) = self {
148            Some(s)
149        } else {
150            None
151        }
152    }
153
154    fn is_scalar(&self) -> bool {
155        matches!(self, Self::Number(_) | Self::Bool(_))
156    }
157
158    fn scalar_to_string(&self) -> String {
159        yaml_scalar_to_string(self)
160    }
161
162    fn set_string(&mut self, s: String) {
163        *self = Self::String(s);
164    }
165}
166
167/// Recursively walk a YAML value tree, replacing matched field values.
168fn walk_yaml(
169    value: &mut Value,
170    prefix: &str,
171    profile: &FileTypeProfile,
172    store: &MappingStore,
173    depth: usize,
174) -> Result<()> {
175    walk_tree(value, prefix, profile, store, depth, "YAML")
176}
177
178fn yaml_key_to_string(key: &Value) -> String {
179    match key {
180        Value::String(s) => s.clone(),
181        Value::Number(n) => n.to_string(),
182        Value::Bool(b) => b.to_string(),
183        _ => format!("{:?}", key),
184    }
185}
186
187fn yaml_scalar_to_string(v: &Value) -> String {
188    match v {
189        Value::String(s) => s.clone(),
190        Value::Number(n) => n.to_string(),
191        Value::Bool(b) => b.to_string(),
192        _ => String::new(),
193    }
194}
195
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Mapping store backed by an HMAC generator with a fixed key, so every
    /// test run produces the same replacements.
    fn make_store() -> MappingStore {
        let gen = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(gen, None)
    }

    /// Run the YAML processor over `content` and return the output as text.
    /// Panics if processing fails or the output is not valid UTF-8.
    fn sanitize(content: &[u8], profile: &FileTypeProfile) -> String {
        let store = make_store();
        let bytes = YamlProcessor.process(content, profile, &store).unwrap();
        String::from_utf8(bytes).unwrap()
    }

    /// Run the YAML processor with an empty "yaml" profile over input that
    /// is expected to be rejected, and return the resulting error.
    fn process_err(content: &[u8]) -> crate::error::SanitizeError {
        let store = make_store();
        let profile = FileTypeProfile::new("yaml", vec![]);
        YamlProcessor
            .process(content, &profile, &store)
            .unwrap_err()
    }

    #[test]
    fn basic_yaml_replacement() {
        let content = b"database:\n  host: db.corp.com\n  password: s3cret\nport: 5432\n";
        let profile = FileTypeProfile::new(
            "yaml",
            vec![
                FieldRule::new("database.password").with_category(Category::Custom("pw".into())),
                FieldRule::new("database.host").with_category(Category::Hostname),
            ],
        );

        let out = sanitize(content, &profile);

        assert!(!out.contains("s3cret"));
        assert!(!out.contains("db.corp.com"));
        // port should be preserved
        assert!(out.contains("5432"));
    }

    #[test]
    fn can_handle_by_profile_name() {
        let proc = YamlProcessor;
        let profile = FileTypeProfile::new("yaml", vec![]).with_extension(".yaml");
        assert!(proc.can_handle(b"anything", &profile));
    }

    #[test]
    fn can_handle_detects_document_marker() {
        let proc = YamlProcessor;
        let profile = FileTypeProfile::new("json", vec![]).with_extension(".json");
        assert!(proc.can_handle(b"---\nkey: value\n", &profile));
    }

    #[test]
    fn can_handle_detects_key_value_heuristic() {
        let proc = YamlProcessor;
        let profile = FileTypeProfile::new("other", vec![]).with_extension(".conf");
        assert!(proc.can_handle(b"host: localhost\nport: 5432\n", &profile));
    }

    #[test]
    fn can_handle_detects_sequence_heuristic() {
        let proc = YamlProcessor;
        let profile = FileTypeProfile::new("other", vec![]).with_extension(".txt");
        assert!(proc.can_handle(b"- item1\n- item2\n", &profile));
    }

    #[test]
    fn can_handle_rejects_plaintext() {
        let proc = YamlProcessor;
        let profile = FileTypeProfile::new("json", vec![]).with_extension(".json");
        assert!(!proc.can_handle(b"just plain text with no yaml markers", &profile));
    }

    #[test]
    fn non_string_scalars_not_targeted_pass_through() {
        // Only target the 'secret' field; booleans and numbers are untouched.
        let content = b"enabled: true\ncount: 42\nsecret: hunter2\n";
        let profile = FileTypeProfile::new(
            "yaml",
            vec![FieldRule::new("secret").with_category(Category::Custom("pw".into()))],
        );

        let out = sanitize(content, &profile);

        assert!(!out.contains("hunter2"), "secret must be replaced");
        assert!(out.contains("42"), "integer must be preserved");
        // The boolean is checked together with its key so that a stray
        // "true" elsewhere in the output cannot satisfy the assertion.
        assert!(out.contains("enabled: true"), "boolean must be preserved");
    }

    #[test]
    fn deeply_nested_yaml_replaced() {
        let content = b"a:\n  b:\n    c:\n      secret: hunter2\n";
        let profile = FileTypeProfile::new(
            "yaml",
            vec![FieldRule::new("a.b.c.secret").with_category(Category::Custom("pw".into()))],
        );

        let out = sanitize(content, &profile);

        assert!(!out.contains("hunter2"));
    }

    #[test]
    fn invalid_utf8_returns_parse_error() {
        let err = process_err(b"\xff\xfe invalid");
        assert!(matches!(
            err,
            crate::error::SanitizeError::ParseError { .. }
        ));
    }

    #[test]
    fn invalid_yaml_returns_parse_error() {
        let err = process_err(b"key: [unclosed");
        assert!(matches!(
            err,
            crate::error::SanitizeError::ParseError { .. }
        ));
    }

    #[test]
    fn yaml_sequence_traversal() {
        let content = b"users:\n  - email: a@b.com\n  - email: c@d.com\n";
        let profile = FileTypeProfile::new(
            "yaml",
            vec![FieldRule::new("users.email").with_category(Category::Email)],
        );

        let out = sanitize(content, &profile);

        assert!(!out.contains("a@b.com"));
        assert!(!out.contains("c@d.com"));
    }
}
340}