ddex_parser/parser/
xml_validator.rs

1//! XML structure validation for detecting malformed XML
2
3use crate::error::ParseError;
4use crate::utf8_utils;
5use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
6use quick_xml::Reader;
7use std::io::BufRead;
8
/// Streaming XML structure validator.
///
/// Events are fed in one at a time through `validate_event`. The validator keeps a
/// stack of the elements that are currently open so that closing tags can be matched
/// against their opening tags and unclosed elements can be reported at end of document.
#[derive(Debug, Clone)]
pub struct XmlValidator {
    /// Currently open elements, innermost last. Each entry is
    /// `(local element name, depth)`, where depth is the 1-based nesting level
    /// recorded when the element was opened.
    element_stack: Vec<(String, usize)>,
    /// Nesting depth recorded by the most recent start/end element; reset to 0 at
    /// end of document. Siblings share the same depth.
    current_depth: usize,
    /// Reader byte position captured when the latest event was validated; attached
    /// to every error for reporting.
    current_position: usize,
    /// When set, element names are checked and mismatched, unexpected, or unclosed
    /// tags are reported as errors.
    strict_validation: bool,
    /// When set, attributes, text, and CDATA content are validated as well.
    extended_validation: bool,
}
23
24impl Default for XmlValidator {
25    fn default() -> Self {
26        Self::new(true, false)
27    }
28}
29
30impl XmlValidator {
31    /// Create new validator with specified validation levels
32    pub fn new(strict: bool, extended: bool) -> Self {
33        Self {
34            element_stack: Vec::new(),
35            current_depth: 0,
36            current_position: 0,
37            strict_validation: strict,
38            extended_validation: extended,
39        }
40    }
41
42    /// Create a strict validator for production use
43    pub fn strict() -> Self {
44        Self::new(true, true)
45    }
46
47    /// Create a lenient validator for development/testing
48    pub fn lenient() -> Self {
49        Self::new(false, false)
50    }
51
52    /// Validate XML structure during parsing
53    pub fn validate_event<R: BufRead>(
54        &mut self,
55        event: &Event,
56        reader: &Reader<R>,
57    ) -> Result<(), ParseError> {
58        // Update current position for error reporting
59        self.current_position = reader.buffer_position() as usize;
60
61        match event {
62            Event::Start(ref element) => {
63                self.handle_start_element(element)?;
64            }
65            Event::End(ref element) => {
66                self.handle_end_element(element)?;
67            }
68            Event::Empty(ref element) => {
69                self.handle_empty_element(element)?;
70            }
71            Event::Text(ref text) => {
72                if self.extended_validation {
73                    self.validate_text_content(text)?;
74                }
75            }
76            Event::CData(ref cdata) => {
77                if self.extended_validation {
78                    self.validate_cdata_content(cdata)?;
79                }
80            }
81            Event::Comment(_) => {
82                // Comments are always valid
83            }
84            Event::Decl(_) => {
85                // XML declarations are handled elsewhere
86            }
87            Event::PI(_) => {
88                // Processing instructions are generally allowed
89            }
90            Event::DocType(_) => {
91                // DocType validation is handled by security module
92            }
93            Event::Eof => {
94                self.validate_document_end()?;
95            }
96        }
97
98        Ok(())
99    }
100
101    /// Handle XML start element
102    fn handle_start_element(&mut self, element: &BytesStart) -> Result<(), ParseError> {
103        // Use local_name() to get just the element name without namespace prefix
104        let element_name = utf8_utils::decode_utf8_at_position(
105            element.local_name().as_ref(),
106            self.current_position,
107        )?;
108
109        if self.strict_validation {
110            // Validate element name
111            if element_name.is_empty() {
112                return Err(ParseError::MalformedXml {
113                    message: "Empty element name".to_string(),
114                    position: self.current_position,
115                });
116            }
117
118            // Validate element name contains only valid XML name characters
119            if !is_valid_xml_name(&element_name) {
120                return Err(ParseError::MalformedXml {
121                    message: format!("Invalid element name: '{}'", element_name),
122                    position: self.current_position,
123                });
124            }
125        }
126
127        // Validate attributes if extended validation is enabled
128        if self.extended_validation {
129            self.validate_attributes(element)?;
130        }
131
132        // Calculate depth: depth = number of open ancestors + 1 (for this element)
133        // Siblings have the same depth as each other
134        let element_depth = self.element_stack.len() + 1;
135
136        // Push element onto stack for tag matching with its depth
137        self.element_stack
138            .push((element_name.clone(), element_depth));
139
140        // Update current depth to this element's depth
141        self.current_depth = element_depth;
142
143        // Debug: print what we're pushing (only for first few elements)
144        if self.element_stack.len() <= 5 {
145            eprintln!(
146                "PUSH DEBUG: '{}' depth {} (stack size now: {})",
147                element_name,
148                self.current_depth,
149                self.element_stack.len()
150            );
151        }
152
153        Ok(())
154    }
155
156    /// Handle XML end element
157    fn handle_end_element(&mut self, element: &BytesEnd) -> Result<(), ParseError> {
158        // Use local_name() to get just the element name without namespace prefix
159        let element_name = utf8_utils::decode_utf8_at_position(
160            element.local_name().as_ref(),
161            self.current_position,
162        )?;
163
164        if self.strict_validation {
165            // Check if there's a matching start tag
166            if let Some((expected, depth)) = self.element_stack.pop() {
167                if expected != element_name {
168                    // Debug: print stack state when mismatch occurs
169                    eprintln!("TAG MISMATCH DEBUG:");
170                    eprintln!("  Expected: '{}' at depth {}", expected, depth);
171                    eprintln!("  Found: '{}'", element_name);
172                    eprintln!("  Stack size: {}", self.element_stack.len() + 1); // +1 because we just popped
173                    eprintln!("  Stack contents: {:?}", self.element_stack);
174                    eprintln!("  Position: {}", self.current_position);
175
176                    return Err(ParseError::MismatchedTags {
177                        expected,
178                        found: element_name,
179                        position: self.current_position,
180                    });
181                }
182                // Update depth to parent's depth when exiting an element
183                // After popping, stack size = parent depth
184                self.current_depth = self.element_stack.len();
185            } else {
186                return Err(ParseError::UnexpectedClosingTag {
187                    tag: element_name,
188                    position: self.current_position,
189                });
190            }
191        } else {
192            // Even in lenient mode, we should pop from stack and update depth
193            if let Some((_, _depth)) = self.element_stack.pop() {
194                // After popping, current depth = remaining stack size
195                self.current_depth = self.element_stack.len();
196            }
197        }
198
199        Ok(())
200    }
201
202    /// Handle empty XML element (self-closing)
203    fn handle_empty_element(&mut self, element: &BytesStart) -> Result<(), ParseError> {
204        // Use local_name() to get just the element name without namespace prefix
205        let element_name = utf8_utils::decode_utf8_at_position(
206            element.local_name().as_ref(),
207            self.current_position,
208        )?;
209
210        if self.strict_validation {
211            // Validate element name
212            if element_name.is_empty() {
213                return Err(ParseError::MalformedXml {
214                    message: "Empty element name".to_string(),
215                    position: self.current_position,
216                });
217            }
218
219            if !is_valid_xml_name(&element_name) {
220                return Err(ParseError::MalformedXml {
221                    message: format!("Invalid element name: '{}'", element_name),
222                    position: self.current_position,
223                });
224            }
225        }
226
227        // Validate attributes if extended validation is enabled
228        if self.extended_validation {
229            self.validate_attributes(element)?;
230        }
231
232        // Empty elements don't need to be added to the stack since they're self-closing
233
234        Ok(())
235    }
236
237    /// Validate text content
238    fn validate_text_content(&self, text: &BytesText) -> Result<(), ParseError> {
239        // Use UTF-8 utilities to safely decode text
240        let _decoded = utf8_utils::handle_text_node(text, self.current_position)?;
241
242        // Additional text validation could be added here
243        // For example, checking for invalid control characters
244
245        Ok(())
246    }
247
248    /// Validate CDATA content
249    fn validate_cdata_content(&self, cdata: &[u8]) -> Result<(), ParseError> {
250        // Validate CDATA is properly UTF-8 encoded
251        let _decoded = utf8_utils::decode_utf8_at_position(cdata, self.current_position)?;
252
253        // CDATA sections cannot contain "]]>" sequence except at the end
254        let cdata_str = std::str::from_utf8(cdata).map_err(|e| ParseError::InvalidUtf8 {
255            message: format!("UTF-8 decoding error at position {}: {}", self.current_position + e.valid_up_to(), e),
256        })?;
257
258        if cdata_str.contains("]]>") && !cdata_str.ends_with("]]>") {
259            return Err(ParseError::MalformedXml {
260                message: "CDATA section contains ']]>' in the middle".to_string(),
261                position: self.current_position,
262            });
263        }
264
265        Ok(())
266    }
267
268    /// Validate XML attributes
269    fn validate_attributes(&self, element: &BytesStart) -> Result<(), ParseError> {
270        let mut seen_attributes = std::collections::HashSet::new();
271
272        for attr_result in element.attributes() {
273            let attr = attr_result.map_err(|e| ParseError::MalformedXml {
274                message: format!("Malformed attribute: {}", e),
275                position: self.current_position,
276            })?;
277
278            // Decode attribute name and value
279            let attr_name =
280                utf8_utils::decode_attribute_name(attr.key.as_ref(), self.current_position)?;
281            let attr_value =
282                utf8_utils::decode_attribute_value(&attr.value, self.current_position)?;
283
284            // Validate attribute name
285            if attr_name.is_empty() {
286                return Err(ParseError::InvalidAttribute {
287                    message: "Empty attribute name".to_string(),
288                    position: self.current_position,
289                });
290            }
291
292            if !is_valid_xml_name(&attr_name) {
293                return Err(ParseError::InvalidAttribute {
294                    message: format!("Invalid attribute name: '{}'", attr_name),
295                    position: self.current_position,
296                });
297            }
298
299            // Check for duplicate attributes
300            if !seen_attributes.insert(attr_name.clone()) {
301                return Err(ParseError::InvalidAttribute {
302                    message: format!("Duplicate attribute: '{}'", attr_name),
303                    position: self.current_position,
304                });
305            }
306
307            // Validate attribute value doesn't contain invalid characters
308            if attr_value.contains('<') || attr_value.contains('&') && !attr_value.contains(';') {
309                return Err(ParseError::InvalidAttribute {
310                    message: format!("Invalid character in attribute value: '{}'", attr_value),
311                    position: self.current_position,
312                });
313            }
314        }
315
316        Ok(())
317    }
318
319    /// Validate at document end that all elements are properly closed
320    fn validate_document_end(&mut self) -> Result<(), ParseError> {
321        if self.strict_validation && !self.element_stack.is_empty() {
322            let unclosed_tags = self
323                .element_stack
324                .iter()
325                .map(|(name, _)| name.clone())
326                .collect();
327            return Err(ParseError::UnclosedTags {
328                tags: unclosed_tags,
329                position: self.current_position,
330            });
331        }
332
333        // Clear stack and reset depth for next document
334        self.element_stack.clear();
335        self.current_depth = 0;
336        Ok(())
337    }
338
339    /// Get current element stack (for debugging)
340    pub fn get_element_stack(&self) -> Vec<String> {
341        self.element_stack
342            .iter()
343            .map(|(name, _)| name.clone())
344            .collect()
345    }
346
347    /// Check if validator is currently inside any elements
348    pub fn is_in_element(&self) -> bool {
349        !self.element_stack.is_empty()
350    }
351
352    /// Get current nesting depth (actual depth, not stack size)
353    pub fn get_depth(&self) -> usize {
354        // Return actual stack depth, which represents nesting level
355        // This fixes the sibling depth bug - siblings have the same depth as their parent + 1
356        self.element_stack.len()
357    }
358}
359
360/// Validate XML name according to XML 1.0 specification
361/// https://www.w3.org/TR/xml/#NT-Name
362fn is_valid_xml_name(name: &str) -> bool {
363    if name.is_empty() {
364        return false;
365    }
366
367    let chars: Vec<char> = name.chars().collect();
368
369    // First character must be a letter, underscore, or colon
370    if !is_name_start_char(chars[0]) {
371        return false;
372    }
373
374    // Remaining characters must be name characters
375    for &ch in chars.iter().skip(1) {
376        if !is_name_char(ch) {
377            return false;
378        }
379    }
380
381    true
382}
383
/// Check whether `ch` may start an XML name.
///
/// Implements the XML 1.0 `NameStartChar` production, including the
/// supplementary-plane range `#x10000-#xEFFFF`.
fn is_name_start_char(ch: char) -> bool {
    matches!(
        ch,
        ':' | '_'
            | 'A'..='Z'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
401
402/// Check if character can be in an XML name
403fn is_name_char(ch: char) -> bool {
404    is_name_start_char(ch)
405        || ch.is_ascii_digit()
406        || ch == '-'
407        || ch == '.'
408        || ch == '\u{B7}'
409        || ('\u{0300}'..='\u{036F}').contains(&ch)
410        || ('\u{203F}'..='\u{2040}').contains(&ch)
411}
412
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Names matching the XML `Name` production are accepted.
    #[test]
    fn test_valid_xml_names() {
        let accepted = ["element", "_private", "ns:element", "element-1", "element.1"];
        for name in accepted.iter() {
            assert!(is_valid_xml_name(name), "expected '{}' to be valid", name);
        }
    }

    /// Empty names, bad first characters, and embedded spaces are rejected.
    #[test]
    fn test_invalid_xml_names() {
        let rejected = ["", "1element", "-element", ".element", "element with spaces"];
        for name in rejected.iter() {
            assert!(!is_valid_xml_name(name), "expected '{}' to be invalid", name);
        }
    }

    /// A fresh validator has no open elements.
    #[test]
    fn test_validator_creation() {
        let validator = XmlValidator::default();
        assert!(!validator.is_in_element());
        assert_eq!(validator.get_depth(), 0);
    }

    /// Opening and then closing one element moves the depth 0 -> 1 -> 0.
    #[test]
    fn test_element_stack_tracking() {
        // The reader is only used for its buffer position; its content is irrelevant.
        let reader = Reader::from_reader(Cursor::new(b"test"));
        let mut validator = XmlValidator::strict();

        let open = Event::Start(BytesStart::new("test"));
        validator.validate_event(&open, &reader).unwrap();
        assert!(validator.is_in_element());
        assert_eq!(validator.get_depth(), 1);

        let close = Event::End(BytesEnd::new("test"));
        validator.validate_event(&close, &reader).unwrap();
        assert!(!validator.is_in_element());
        assert_eq!(validator.get_depth(), 0);
    }
}
465}