ddex_parser/parser/
xml_validator.rs

1//! XML structure validation for detecting malformed XML
2
3use crate::error::ParseError;
4use crate::utf8_utils;
5use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
6use quick_xml::Reader;
7use std::io::BufRead;
8
/// Structural validator for a stream of XML events.
///
/// Keeps a stack of the elements that are currently open so closing tags can
/// be matched against their opening tags, and remembers the reader position
/// that is attached to reported errors.
#[derive(Debug, Clone)]
pub struct XmlValidator {
    /// Currently open elements, innermost last; each entry pairs the element
    /// name with the depth at which it was opened.
    element_stack: Vec<(String, usize)>,
    /// Nesting depth recorded by the most recent start/end element handling.
    current_depth: usize,
    /// Reader byte position captured for the latest event (used in errors).
    current_position: usize,
    /// Strict mode: check element names and report mismatched/unclosed tags.
    strict_validation: bool,
    /// Extended mode: additionally check attributes, text and CDATA content.
    extended_validation: bool,
}
23
24impl Default for XmlValidator {
25    fn default() -> Self {
26        Self::new(true, false)
27    }
28}
29
30impl XmlValidator {
31    /// Create new validator with specified validation levels
32    pub fn new(strict: bool, extended: bool) -> Self {
33        Self {
34            element_stack: Vec::new(),
35            current_depth: 0,
36            current_position: 0,
37            strict_validation: strict,
38            extended_validation: extended,
39        }
40    }
41
42    /// Create a strict validator for production use
43    pub fn strict() -> Self {
44        Self::new(true, true)
45    }
46
47    /// Create a lenient validator for development/testing
48    pub fn lenient() -> Self {
49        Self::new(false, false)
50    }
51
52    /// Validate XML structure during parsing
53    pub fn validate_event<R: BufRead>(
54        &mut self,
55        event: &Event,
56        reader: &Reader<R>,
57    ) -> Result<(), ParseError> {
58        // Update current position for error reporting
59        self.current_position = reader.buffer_position() as usize;
60
61        match event {
62            Event::Start(ref element) => {
63                self.handle_start_element(element)?;
64            }
65            Event::End(ref element) => {
66                self.handle_end_element(element)?;
67            }
68            Event::Empty(ref element) => {
69                self.handle_empty_element(element)?;
70            }
71            Event::Text(ref text) => {
72                if self.extended_validation {
73                    self.validate_text_content(text)?;
74                }
75            }
76            Event::CData(ref cdata) => {
77                if self.extended_validation {
78                    self.validate_cdata_content(cdata)?;
79                }
80            }
81            Event::Comment(_) => {
82                // Comments are always valid
83            }
84            Event::Decl(_) => {
85                // XML declarations are handled elsewhere
86            }
87            Event::PI(_) => {
88                // Processing instructions are generally allowed
89            }
90            Event::DocType(_) => {
91                // DocType validation is handled by security module
92            }
93            Event::Eof => {
94                self.validate_document_end()?;
95            }
96        }
97
98        Ok(())
99    }
100
101    /// Handle XML start element
102    fn handle_start_element(&mut self, element: &BytesStart) -> Result<(), ParseError> {
103        // Use local_name() to get just the element name without namespace prefix
104        let element_name = utf8_utils::decode_utf8_at_position(
105            element.local_name().as_ref(),
106            self.current_position,
107        )?;
108
109        if self.strict_validation {
110            // Validate element name
111            if element_name.is_empty() {
112                return Err(ParseError::MalformedXml {
113                    message: "Empty element name".to_string(),
114                    position: self.current_position,
115                });
116            }
117
118            // Validate element name contains only valid XML name characters
119            if !is_valid_xml_name(&element_name) {
120                return Err(ParseError::MalformedXml {
121                    message: format!("Invalid element name: '{}'", element_name),
122                    position: self.current_position,
123                });
124            }
125        }
126
127        // Validate attributes if extended validation is enabled
128        if self.extended_validation {
129            self.validate_attributes(element)?;
130        }
131
132        // Calculate depth: depth = number of open ancestors + 1 (for this element)
133        // Siblings have the same depth as each other
134        let element_depth = self.element_stack.len() + 1;
135
136        // Push element onto stack for tag matching with its depth
137        self.element_stack
138            .push((element_name.clone(), element_depth));
139
140        // Update current depth to this element's depth
141        self.current_depth = element_depth;
142
143        // Debug logging disabled for production use
144
145        Ok(())
146    }
147
148    /// Handle XML end element
149    fn handle_end_element(&mut self, element: &BytesEnd) -> Result<(), ParseError> {
150        // Use local_name() to get just the element name without namespace prefix
151        let element_name = utf8_utils::decode_utf8_at_position(
152            element.local_name().as_ref(),
153            self.current_position,
154        )?;
155
156        if self.strict_validation {
157            // Check if there's a matching start tag
158            if let Some((expected, depth)) = self.element_stack.pop() {
159                if expected != element_name {
160                    // Debug: print stack state when mismatch occurs
161                    // Debug logging disabled for production use
162
163                    return Err(ParseError::MismatchedTags {
164                        expected,
165                        found: element_name,
166                        position: self.current_position,
167                    });
168                }
169                // Update depth to parent's depth when exiting an element
170                // After popping, stack size = parent depth
171                self.current_depth = self.element_stack.len();
172            } else {
173                return Err(ParseError::UnexpectedClosingTag {
174                    tag: element_name,
175                    position: self.current_position,
176                });
177            }
178        } else {
179            // Even in lenient mode, we should pop from stack and update depth
180            if let Some((_, _depth)) = self.element_stack.pop() {
181                // After popping, current depth = remaining stack size
182                self.current_depth = self.element_stack.len();
183            }
184        }
185
186        Ok(())
187    }
188
189    /// Handle empty XML element (self-closing)
190    fn handle_empty_element(&mut self, element: &BytesStart) -> Result<(), ParseError> {
191        // Use local_name() to get just the element name without namespace prefix
192        let element_name = utf8_utils::decode_utf8_at_position(
193            element.local_name().as_ref(),
194            self.current_position,
195        )?;
196
197        if self.strict_validation {
198            // Validate element name
199            if element_name.is_empty() {
200                return Err(ParseError::MalformedXml {
201                    message: "Empty element name".to_string(),
202                    position: self.current_position,
203                });
204            }
205
206            if !is_valid_xml_name(&element_name) {
207                return Err(ParseError::MalformedXml {
208                    message: format!("Invalid element name: '{}'", element_name),
209                    position: self.current_position,
210                });
211            }
212        }
213
214        // Validate attributes if extended validation is enabled
215        if self.extended_validation {
216            self.validate_attributes(element)?;
217        }
218
219        // Empty elements don't need to be added to the stack since they're self-closing
220
221        Ok(())
222    }
223
224    /// Validate text content
225    fn validate_text_content(&self, text: &BytesText) -> Result<(), ParseError> {
226        // Use UTF-8 utilities to safely decode text
227        let _decoded = utf8_utils::handle_text_node(text, self.current_position)?;
228
229        // Additional text validation could be added here
230        // For example, checking for invalid control characters
231
232        Ok(())
233    }
234
235    /// Validate CDATA content
236    fn validate_cdata_content(&self, cdata: &[u8]) -> Result<(), ParseError> {
237        // Validate CDATA is properly UTF-8 encoded
238        let _decoded = utf8_utils::decode_utf8_at_position(cdata, self.current_position)?;
239
240        // CDATA sections cannot contain "]]>" sequence except at the end
241        let cdata_str = std::str::from_utf8(cdata).map_err(|e| ParseError::InvalidUtf8 {
242            message: format!("UTF-8 decoding error at position {}: {}", self.current_position + e.valid_up_to(), e),
243        })?;
244
245        if cdata_str.contains("]]>") && !cdata_str.ends_with("]]>") {
246            return Err(ParseError::MalformedXml {
247                message: "CDATA section contains ']]>' in the middle".to_string(),
248                position: self.current_position,
249            });
250        }
251
252        Ok(())
253    }
254
255    /// Validate XML attributes
256    fn validate_attributes(&self, element: &BytesStart) -> Result<(), ParseError> {
257        let mut seen_attributes = std::collections::HashSet::new();
258
259        for attr_result in element.attributes() {
260            let attr = attr_result.map_err(|e| ParseError::MalformedXml {
261                message: format!("Malformed attribute: {}", e),
262                position: self.current_position,
263            })?;
264
265            // Decode attribute name and value
266            let attr_name =
267                utf8_utils::decode_attribute_name(attr.key.as_ref(), self.current_position)?;
268            let attr_value =
269                utf8_utils::decode_attribute_value(&attr.value, self.current_position)?;
270
271            // Validate attribute name
272            if attr_name.is_empty() {
273                return Err(ParseError::InvalidAttribute {
274                    message: "Empty attribute name".to_string(),
275                    position: self.current_position,
276                });
277            }
278
279            if !is_valid_xml_name(&attr_name) {
280                return Err(ParseError::InvalidAttribute {
281                    message: format!("Invalid attribute name: '{}'", attr_name),
282                    position: self.current_position,
283                });
284            }
285
286            // Check for duplicate attributes
287            if !seen_attributes.insert(attr_name.clone()) {
288                return Err(ParseError::InvalidAttribute {
289                    message: format!("Duplicate attribute: '{}'", attr_name),
290                    position: self.current_position,
291                });
292            }
293
294            // Validate attribute value doesn't contain invalid characters
295            if attr_value.contains('<') || attr_value.contains('&') && !attr_value.contains(';') {
296                return Err(ParseError::InvalidAttribute {
297                    message: format!("Invalid character in attribute value: '{}'", attr_value),
298                    position: self.current_position,
299                });
300            }
301        }
302
303        Ok(())
304    }
305
306    /// Validate at document end that all elements are properly closed
307    fn validate_document_end(&mut self) -> Result<(), ParseError> {
308        if self.strict_validation && !self.element_stack.is_empty() {
309            let unclosed_tags = self
310                .element_stack
311                .iter()
312                .map(|(name, _)| name.clone())
313                .collect();
314            return Err(ParseError::UnclosedTags {
315                tags: unclosed_tags,
316                position: self.current_position,
317            });
318        }
319
320        // Clear stack and reset depth for next document
321        self.element_stack.clear();
322        self.current_depth = 0;
323        Ok(())
324    }
325
326    /// Get current element stack (for debugging)
327    pub fn get_element_stack(&self) -> Vec<String> {
328        self.element_stack
329            .iter()
330            .map(|(name, _)| name.clone())
331            .collect()
332    }
333
334    /// Check if validator is currently inside any elements
335    pub fn is_in_element(&self) -> bool {
336        !self.element_stack.is_empty()
337    }
338
339    /// Get current nesting depth (actual depth, not stack size)
340    pub fn get_depth(&self) -> usize {
341        // Return actual stack depth, which represents nesting level
342        // This fixes the sibling depth bug - siblings have the same depth as their parent + 1
343        self.element_stack.len()
344    }
345}
346
347/// Validate XML name according to XML 1.0 specification
348/// https://www.w3.org/TR/xml/#NT-Name
349fn is_valid_xml_name(name: &str) -> bool {
350    if name.is_empty() {
351        return false;
352    }
353
354    let chars: Vec<char> = name.chars().collect();
355
356    // First character must be a letter, underscore, or colon
357    if !is_name_start_char(chars[0]) {
358        return false;
359    }
360
361    // Remaining characters must be name characters
362    for &ch in chars.iter().skip(1) {
363        if !is_name_char(ch) {
364            return false;
365        }
366    }
367
368    true
369}
370
/// Reports whether `ch` may begin an XML name, following the `NameStartChar`
/// production of XML 1.0 (https://www.w3.org/TR/xml/#NT-NameStartChar).
///
/// Covers `:`, `_`, ASCII letters and every Unicode range listed by the
/// production, including the supplementary planes `#x10000-#xEFFFF`.
fn is_name_start_char(ch: char) -> bool {
    matches!(
        ch,
        ':' | '_'
            | 'A'..='Z'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            // Supplementary planes: part of NameStartChar in the specification.
            | '\u{10000}'..='\u{EFFFF}'
    )
}
388
389/// Check if character can be in an XML name
390fn is_name_char(ch: char) -> bool {
391    is_name_start_char(ch)
392        || ch.is_ascii_digit()
393        || ch == '-'
394        || ch == '.'
395        || ch == '\u{B7}'
396        || ('\u{0300}'..='\u{036F}').contains(&ch)
397        || ('\u{203F}'..='\u{2040}').contains(&ch)
398}
399
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Well-formed names, including a prefixed one and ones with `-` / `.`.
    #[test]
    fn test_valid_xml_names() {
        let accepted = ["element", "_private", "ns:element", "element-1", "element.1"];
        for name in accepted.iter() {
            assert!(is_valid_xml_name(name));
        }
    }

    /// Empty names, names with a bad first character, and names with spaces.
    #[test]
    fn test_invalid_xml_names() {
        let rejected = ["", "1element", "-element", ".element", "element with spaces"];
        for name in rejected.iter() {
            assert!(!is_valid_xml_name(name));
        }
    }

    /// A freshly built default validator has no open elements.
    #[test]
    fn test_validator_creation() {
        let validator = XmlValidator::default();
        assert!(!validator.is_in_element());
        assert_eq!(validator.get_depth(), 0);
    }

    /// Opening an element raises the depth to 1; closing it returns to 0.
    #[test]
    fn test_element_stack_tracking() {
        // The reader is only consulted for its buffer position.
        let reader = Reader::from_reader(Cursor::new(b"test"));
        let mut validator = XmlValidator::strict();

        let open = Event::Start(BytesStart::new("test"));
        validator.validate_event(&open, &reader).unwrap();
        assert!(validator.is_in_element());
        assert_eq!(validator.get_depth(), 1);

        let close = Event::End(BytesEnd::new("test"));
        validator.validate_event(&close, &reader).unwrap();
        assert!(!validator.is_in_element());
        assert_eq!(validator.get_depth(), 0);
    }
}