Skip to main content

sanitize_engine/processor/
xml_proc.rs

1//! XML structured processor.
2//!
3//! Uses `quick-xml` to parse and rewrite XML, preserving the document
4//! structure, attributes, and non-matched content.
5//!
6//! # Key Paths
7//!
8//! Element paths are slash-separated: `database/password`. Attributes
9//! are expressed as `element/@attr` (e.g. `connection/@host`).
10//!
11//! For simplicity this processor tracks the element stack and matches
12//! text content of elements and attribute values against field rules.
13
14use crate::error::{Result, SanitizeError};
15use crate::processor::limits::{DEFAULT_INPUT_SIZE, XML_DEPTH};
16use crate::processor::{find_matching_rule, replace_value, FileTypeProfile, Processor};
17use crate::store::MappingStore;
18use quick_xml::events::{BytesStart, BytesText, Event};
19use quick_xml::{Reader, Writer};
20use std::io::Cursor;
21
/// Structured processor for XML files.
///
/// This is a field-less unit type: it carries no state of its own, and all of
/// its behaviour is provided by the [`Processor`] implementation in this module.
pub struct XmlProcessor;
24
25impl Processor for XmlProcessor {
26    fn name(&self) -> &'static str {
27        "xml"
28    }
29
30    fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool {
31        if profile.processor == "xml" {
32            return true;
33        }
34        let trimmed = content
35            .iter()
36            .copied()
37            .skip_while(|b| b.is_ascii_whitespace())
38            .take(5)
39            .collect::<Vec<u8>>();
40        trimmed.starts_with(b"<?xml") || trimmed.starts_with(b"<")
41    }
42
43    fn process(
44        &self,
45        content: &[u8],
46        profile: &FileTypeProfile,
47        store: &MappingStore,
48    ) -> Result<Vec<u8>> {
49        // F-04 fix: enforce input size limit.
50        if content.len() > DEFAULT_INPUT_SIZE {
51            return Err(SanitizeError::InputTooLarge {
52                size: content.len(),
53                limit: DEFAULT_INPUT_SIZE,
54            });
55        }
56
57        // Security: quick-xml disables external entity expansion by default,
58        // so XXE attacks are not possible with this configuration.
59        let mut reader = Reader::from_reader(content);
60        reader.trim_text(false);
61
62        let mut writer = Writer::new(Cursor::new(Vec::new()));
63        let mut element_stack: Vec<String> = Vec::new();
64        let mut buf = Vec::new();
65
66        loop {
67            match reader.read_event_into(&mut buf) {
68                Ok(Event::Start(ref e)) => {
69                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
70                    element_stack.push(name.clone());
71
72                    if element_stack.len() > XML_DEPTH {
73                        return Err(SanitizeError::RecursionDepthExceeded(format!(
74                            "XML element depth exceeds limit of {XML_DEPTH}"
75                        )));
76                    }
77
78                    // Process attributes.
79                    let current_path = element_stack.join("/");
80                    let new_elem = process_attributes(e, &current_path, profile, store)?;
81                    writer.write_event(Event::Start(new_elem)).map_err(|e| {
82                        SanitizeError::IoError(std::io::Error::other(format!(
83                            "XML write error: {e}"
84                        )))
85                    })?;
86                }
87                Ok(Event::End(ref e)) => {
88                    writer.write_event(Event::End(e.clone())).map_err(|e| {
89                        SanitizeError::IoError(std::io::Error::other(format!(
90                            "XML write error: {e}"
91                        )))
92                    })?;
93                    element_stack.pop();
94                }
95                Ok(Event::Empty(ref e)) => {
96                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
97                    let path = if element_stack.is_empty() {
98                        name.clone()
99                    } else {
100                        format!("{}/{}", element_stack.join("/"), name)
101                    };
102                    let new_elem = process_attributes(e, &path, profile, store)?;
103                    writer.write_event(Event::Empty(new_elem)).map_err(|e| {
104                        SanitizeError::IoError(std::io::Error::other(format!(
105                            "XML write error: {e}"
106                        )))
107                    })?;
108                }
109                Ok(Event::Text(ref e)) => {
110                    let current_path = element_stack.join("/");
111                    if let Some(rule) = find_matching_rule(&current_path, profile) {
112                        let text = e.unescape().map_err(|e| SanitizeError::ParseError {
113                            format: "XML".into(),
114                            message: format!("XML decode error: {}", e),
115                        })?;
116                        let replaced = replace_value(&text, rule, store)?;
117                        writer
118                            .write_event(Event::Text(BytesText::new(&replaced)))
119                            .map_err(|e| {
120                                SanitizeError::IoError(std::io::Error::other(format!(
121                                    "XML write error: {e}"
122                                )))
123                            })?;
124                    } else {
125                        writer.write_event(Event::Text(e.clone())).map_err(|e| {
126                            SanitizeError::IoError(std::io::Error::other(format!(
127                                "XML write error: {e}"
128                            )))
129                        })?;
130                    }
131                }
132                Ok(Event::Eof) => break,
133                Ok(e) => {
134                    writer.write_event(e).map_err(|er| {
135                        SanitizeError::IoError(std::io::Error::other(format!(
136                            "XML write error: {er}"
137                        )))
138                    })?;
139                }
140                Err(e) => {
141                    return Err(SanitizeError::ParseError {
142                        format: "XML".into(),
143                        message: format!("XML parse error: {}", e),
144                    });
145                }
146            }
147            buf.clear();
148        }
149
150        let result = writer.into_inner().into_inner();
151        Ok(result)
152    }
153}
154
155/// Process attributes of an element, replacing matched ones.
156fn process_attributes(
157    elem: &BytesStart<'_>,
158    element_path: &str,
159    profile: &FileTypeProfile,
160    store: &MappingStore,
161) -> Result<BytesStart<'static>> {
162    let name = elem.name();
163    let mut new_elem = BytesStart::new(String::from_utf8_lossy(name.as_ref()).to_string());
164
165    for attr_result in elem.attributes() {
166        let attr = attr_result.map_err(|e| SanitizeError::ParseError {
167            format: "XML".into(),
168            message: format!("XML attribute error: {}", e),
169        })?;
170        let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
171        let attr_path = format!("{}/@{}", element_path, attr_key);
172
173        if let Some(rule) = find_matching_rule(&attr_path, profile) {
174            let attr_value = attr
175                .unescape_value()
176                .map_err(|e| SanitizeError::ParseError {
177                    format: "XML".into(),
178                    message: format!("XML attr decode error: {}", e),
179                })?;
180            let replaced = replace_value(&attr_value, rule, store)?;
181            new_elem.push_attribute((attr_key.as_str(), replaced.as_str()));
182        } else {
183            let attr_value = attr
184                .unescape_value()
185                .map_err(|e| SanitizeError::ParseError {
186                    format: "XML".into(),
187                    message: format!("XML attr decode error: {}", e),
188                })?;
189            new_elem.push_attribute((attr_key.as_str(), attr_value.as_ref()));
190        }
191    }
192
193    Ok(new_elem)
194}
195
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Builds a mapping store backed by an HMAC generator with a fixed key.
    fn make_store() -> MappingStore {
        let generator = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(generator, None)
    }

    /// Runs the XML processor over `content` and returns the output as a string.
    ///
    /// Panics if processing fails or the output is not valid UTF-8.
    fn sanitize(content: &[u8], profile: &FileTypeProfile) -> String {
        let store = make_store();
        let bytes = XmlProcessor.process(content, profile, &store).unwrap();
        String::from_utf8(bytes).unwrap()
    }

    #[test]
    fn basic_xml_text_replacement() {
        let profile = FileTypeProfile::new(
            "xml",
            vec![FieldRule::new("config/database/password")
                .with_category(Category::Custom("pw".into()))],
        );

        let out = sanitize(
            b"<config><database><password>s3cret</password><port>5432</port></database></config>",
            &profile,
        );

        // The matched element loses its secret; the sibling is left alone.
        assert!(!out.contains("s3cret"));
        assert!(out.contains("<port>5432</port>"));
    }

    #[test]
    fn xml_attribute_replacement() {
        let profile = FileTypeProfile::new(
            "xml",
            vec![FieldRule::new("config/connection/@host").with_category(Category::Hostname)],
        );

        let out = sanitize(
            b"<config><connection host=\"db.corp.com\" port=\"5432\"/></config>",
            &profile,
        );

        assert!(!out.contains("db.corp.com"));
        assert!(out.contains("5432"));
    }

    #[test]
    fn can_handle_xml_declaration() {
        let profile = FileTypeProfile::new("other", vec![]).with_extension(".txt");
        let detected = XmlProcessor.can_handle(b"<?xml version=\"1.0\"?><root/>", &profile);
        assert!(detected);
    }

    #[test]
    fn can_handle_bare_tag() {
        let profile = FileTypeProfile::new("other", vec![]).with_extension(".txt");
        let detected = XmlProcessor.can_handle(b"<root><child/></root>", &profile);
        assert!(detected);
    }

    #[test]
    fn can_handle_by_profile_name() {
        // The profile names the processor, so the content is not inspected.
        let profile = FileTypeProfile::new("xml", vec![]).with_extension(".xml");
        let detected = XmlProcessor.can_handle(b"not xml at all", &profile);
        assert!(detected);
    }

    #[test]
    fn can_handle_rejects_plaintext() {
        let profile = FileTypeProfile::new("json", vec![]).with_extension(".json");
        let detected = XmlProcessor.can_handle(b"just some plain text", &profile);
        assert!(!detected);
    }

    #[test]
    fn empty_element_attributes_replaced() {
        let profile = FileTypeProfile::new(
            "xml",
            vec![FieldRule::new("config/server/@host").with_category(Category::Hostname)],
        );

        let out = sanitize(
            b"<config><server host=\"prod.corp.com\" port=\"443\"/></config>",
            &profile,
        );

        assert!(!out.contains("prod.corp.com"));
        assert!(out.contains("443"));
    }

    #[test]
    fn empty_element_at_root_level() {
        let profile = FileTypeProfile::new(
            "xml",
            vec![FieldRule::new("server/@host").with_category(Category::Hostname)],
        );

        let out = sanitize(b"<server host=\"root.corp.com\"/>", &profile);

        assert!(!out.contains("root.corp.com"));
    }

    #[test]
    fn unmatched_attributes_pass_through() {
        // No field rules at all: every attribute must survive untouched.
        let profile = FileTypeProfile::new("xml", vec![]);

        let out = sanitize(
            b"<config><db host=\"db.corp.com\" port=\"5432\"/></config>",
            &profile,
        );

        assert!(out.contains("db.corp.com"));
        assert!(out.contains("5432"));
    }

    #[test]
    fn other_xml_events_pass_through() {
        let profile = FileTypeProfile::new("xml", vec![]);

        let out = sanitize(
            b"<?xml version=\"1.0\"?><!-- comment --><root><child>value</child></root>",
            &profile,
        );

        assert!(out.contains("value"));
    }

    #[test]
    fn depth_limit_exceeded_returns_error() {
        // Build XML that exceeds XML_DEPTH (256) levels of nesting.
        let opening: String = (0..260).map(|i| format!("<l{i}>")).collect();
        let closing: String = (0..260).rev().map(|i| format!("</l{i}>")).collect();
        let content = format!("{opening}secret{closing}");

        let profile = FileTypeProfile::new("xml", vec![]);
        let store = make_store();
        let err = XmlProcessor
            .process(content.as_bytes(), &profile, &store)
            .unwrap_err();

        assert!(matches!(
            err,
            crate::error::SanitizeError::RecursionDepthExceeded(_)
        ));
    }
}