Skip to main content

sanitize_engine/processor/
xml_proc.rs

1//! XML structured processor.
2//!
3//! Uses `quick-xml` to parse and rewrite XML, preserving the document
4//! structure, attributes, and non-matched content.
5//!
6//! # Key Paths
7//!
8//! Element paths are slash-separated: `database/password`. Attributes
9//! are expressed as `element/@attr` (e.g. `connection/@host`).
10//!
11//! For simplicity this processor tracks the element stack and matches
12//! text content of elements and attribute values against field rules.
13
14use crate::error::{Result, SanitizeError};
15use crate::processor::{find_matching_rule, replace_value, FileTypeProfile, Processor};
16use crate::store::MappingStore;
17use quick_xml::events::{BytesStart, BytesText, Event};
18use quick_xml::{Reader, Writer};
19use std::io::Cursor;
20
/// Maximum element nesting depth for XML processing.
///
/// `XmlProcessor::process` returns `SanitizeError::RecursionDepthExceeded`
/// as soon as the number of currently open elements exceeds this value.
/// Prevents stack/memory exhaustion from deeply nested documents (R-5 fix).
const MAX_XML_DEPTH: usize = 256;
24
/// Largest XML input, in bytes, that `XmlProcessor::process` will accept
/// (F-04 fix). Anything longer is rejected with
/// `SanitizeError::InputTooLarge` before parsing starts.
const MAX_XML_INPUT_SIZE: usize = 256 << 20; // 256 MiB
27
/// Structured processor for XML files.
///
/// A stateless unit type: all behaviour lives in its `Processor`
/// implementation, which rewrites element text and attribute values that
/// match a field rule of the supplied profile.
pub struct XmlProcessor;
30
31impl Processor for XmlProcessor {
32    fn name(&self) -> &'static str {
33        "xml"
34    }
35
36    fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool {
37        if profile.processor == "xml" {
38            return true;
39        }
40        let trimmed = content
41            .iter()
42            .copied()
43            .skip_while(|b| b.is_ascii_whitespace())
44            .take(5)
45            .collect::<Vec<u8>>();
46        trimmed.starts_with(b"<?xml") || trimmed.starts_with(b"<")
47    }
48
49    fn process(
50        &self,
51        content: &[u8],
52        profile: &FileTypeProfile,
53        store: &MappingStore,
54    ) -> Result<Vec<u8>> {
55        // F-04 fix: enforce input size limit.
56        if content.len() > MAX_XML_INPUT_SIZE {
57            return Err(SanitizeError::InputTooLarge {
58                size: content.len(),
59                limit: MAX_XML_INPUT_SIZE,
60            });
61        }
62
63        // Security: quick-xml disables external entity expansion by default,
64        // so XXE attacks are not possible with this configuration.
65        let mut reader = Reader::from_reader(content);
66        reader.trim_text(false);
67
68        let mut writer = Writer::new(Cursor::new(Vec::new()));
69        let mut element_stack: Vec<String> = Vec::new();
70        let mut buf = Vec::new();
71
72        loop {
73            match reader.read_event_into(&mut buf) {
74                Ok(Event::Start(ref e)) => {
75                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
76                    element_stack.push(name.clone());
77
78                    if element_stack.len() > MAX_XML_DEPTH {
79                        return Err(SanitizeError::RecursionDepthExceeded(format!(
80                            "XML element depth exceeds limit of {MAX_XML_DEPTH}"
81                        )));
82                    }
83
84                    // Process attributes.
85                    let current_path = element_stack.join("/");
86                    let new_elem = process_attributes(e, &current_path, profile, store)?;
87                    writer
88                        .write_event(Event::Start(new_elem))
89                        .map_err(|e| SanitizeError::IoError(format!("XML write error: {}", e)))?;
90                }
91                Ok(Event::End(ref e)) => {
92                    writer
93                        .write_event(Event::End(e.clone()))
94                        .map_err(|e| SanitizeError::IoError(format!("XML write error: {}", e)))?;
95                    element_stack.pop();
96                }
97                Ok(Event::Empty(ref e)) => {
98                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
99                    let path = if element_stack.is_empty() {
100                        name.clone()
101                    } else {
102                        format!("{}/{}", element_stack.join("/"), name)
103                    };
104                    let new_elem = process_attributes(e, &path, profile, store)?;
105                    writer
106                        .write_event(Event::Empty(new_elem))
107                        .map_err(|e| SanitizeError::IoError(format!("XML write error: {}", e)))?;
108                }
109                Ok(Event::Text(ref e)) => {
110                    let current_path = element_stack.join("/");
111                    if let Some(rule) = find_matching_rule(&current_path, profile) {
112                        let text = e.unescape().map_err(|e| SanitizeError::ParseError {
113                            format: "XML".into(),
114                            message: format!("XML decode error: {}", e),
115                        })?;
116                        let replaced = replace_value(&text, rule, store)?;
117                        writer
118                            .write_event(Event::Text(BytesText::new(&replaced)))
119                            .map_err(|e| {
120                                SanitizeError::IoError(format!("XML write error: {}", e))
121                            })?;
122                    } else {
123                        writer.write_event(Event::Text(e.clone())).map_err(|e| {
124                            SanitizeError::IoError(format!("XML write error: {}", e))
125                        })?;
126                    }
127                }
128                Ok(Event::Eof) => break,
129                Ok(e) => {
130                    writer
131                        .write_event(e)
132                        .map_err(|er| SanitizeError::IoError(format!("XML write error: {}", er)))?;
133                }
134                Err(e) => {
135                    return Err(SanitizeError::ParseError {
136                        format: "XML".into(),
137                        message: format!("XML parse error: {}", e),
138                    });
139                }
140            }
141            buf.clear();
142        }
143
144        let result = writer.into_inner().into_inner();
145        Ok(result)
146    }
147}
148
149/// Process attributes of an element, replacing matched ones.
150fn process_attributes(
151    elem: &BytesStart<'_>,
152    element_path: &str,
153    profile: &FileTypeProfile,
154    store: &MappingStore,
155) -> Result<BytesStart<'static>> {
156    let name = elem.name();
157    let mut new_elem = BytesStart::new(String::from_utf8_lossy(name.as_ref()).to_string());
158
159    for attr_result in elem.attributes() {
160        let attr = attr_result.map_err(|e| SanitizeError::ParseError {
161            format: "XML".into(),
162            message: format!("XML attribute error: {}", e),
163        })?;
164        let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
165        let attr_path = format!("{}/@{}", element_path, attr_key);
166
167        if let Some(rule) = find_matching_rule(&attr_path, profile) {
168            let attr_value = attr
169                .unescape_value()
170                .map_err(|e| SanitizeError::ParseError {
171                    format: "XML".into(),
172                    message: format!("XML attr decode error: {}", e),
173                })?;
174            let replaced = replace_value(&attr_value, rule, store)?;
175            new_elem.push_attribute((attr_key.as_str(), replaced.as_str()));
176        } else {
177            let attr_value = attr
178                .unescape_value()
179                .map_err(|e| SanitizeError::ParseError {
180                    format: "XML".into(),
181                    message: format!("XML attr decode error: {}", e),
182                })?;
183            new_elem.push_attribute((attr_key.as_str(), attr_value.as_ref()));
184        }
185    }
186
187    Ok(new_elem)
188}
189
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Builds a mapping store around an HMAC generator with a fixed key.
    fn make_store() -> MappingStore {
        let generator = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(generator, None)
    }

    /// Runs `XmlProcessor` over `input` with a fresh store and returns the
    /// output decoded as UTF-8.
    fn run(input: &[u8], profile: &FileTypeProfile) -> String {
        let store = make_store();
        let bytes = XmlProcessor.process(input, profile, &store).unwrap();
        String::from_utf8(bytes).unwrap()
    }

    /// Text of a matched element is replaced; sibling elements are untouched.
    #[test]
    fn basic_xml_text_replacement() {
        let input =
            b"<config><database><password>s3cret</password><port>5432</port></database></config>";
        let rules = vec![
            FieldRule::new("config/database/password").with_category(Category::Custom("pw".into())),
        ];
        let profile = FileTypeProfile::new("xml", rules);

        let sanitized = run(input, &profile);

        assert!(!sanitized.contains("s3cret"));
        assert!(sanitized.contains("<port>5432</port>"));
    }

    /// A matched attribute value is replaced; other attributes keep theirs.
    #[test]
    fn xml_attribute_replacement() {
        let input = b"<config><connection host=\"db.corp.com\" port=\"5432\"/></config>";
        let rules =
            vec![FieldRule::new("config/connection/@host").with_category(Category::Hostname)];
        let profile = FileTypeProfile::new("xml", rules);

        let sanitized = run(input, &profile);

        assert!(!sanitized.contains("db.corp.com"));
        assert!(sanitized.contains("5432"));
    }
}