ddex_parser/parser/
xml_validator.rs

1//! XML structure validation for detecting malformed XML
2
3use crate::error::ParseError;
4use crate::utf8_utils;
5use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
6use quick_xml::Reader;
7use std::io::BufRead;
8
/// Structural XML validator driven by `quick_xml` events.
///
/// Keeps a stack of the currently open elements so that closing tags can be
/// matched against their opening tags, plus two switches selecting how much
/// checking is performed.
#[derive(Clone, Debug)]
pub struct XmlValidator {
    /// Open elements, innermost last; each entry is `(name, depth at which it was opened)`.
    element_stack: Vec<(String, usize)>,
    /// Nesting depth recorded by the most recently handled start/end element.
    current_depth: usize,
    /// Byte offset reported by the reader for the event being validated (used in errors).
    current_position: usize,
    /// When set, element names are checked and mismatched/unclosed tags are errors.
    strict_validation: bool,
    /// When set, attributes, text and CDATA content are checked as well.
    extended_validation: bool,
}
23
24impl Default for XmlValidator {
25    fn default() -> Self {
26        Self::new(true, false)
27    }
28}
29
30impl XmlValidator {
31    /// Create new validator with specified validation levels
32    pub fn new(strict: bool, extended: bool) -> Self {
33        Self {
34            element_stack: Vec::new(),
35            current_depth: 0,
36            current_position: 0,
37            strict_validation: strict,
38            extended_validation: extended,
39        }
40    }
41
42    /// Create a strict validator for production use
43    pub fn strict() -> Self {
44        Self::new(true, true)
45    }
46
47    /// Create a lenient validator for development/testing
48    pub fn lenient() -> Self {
49        Self::new(false, false)
50    }
51
52    /// Validate XML structure during parsing
53    pub fn validate_event<R: BufRead>(
54        &mut self,
55        event: &Event,
56        reader: &Reader<R>,
57    ) -> Result<(), ParseError> {
58        // Update current position for error reporting
59        self.current_position = reader.buffer_position() as usize;
60
61        match event {
62            Event::Start(ref element) => {
63                self.handle_start_element(element)?;
64            }
65            Event::End(ref element) => {
66                self.handle_end_element(element)?;
67            }
68            Event::Empty(ref element) => {
69                self.handle_empty_element(element)?;
70            }
71            Event::Text(ref text) => {
72                if self.extended_validation {
73                    self.validate_text_content(text)?;
74                }
75            }
76            Event::CData(ref cdata) => {
77                if self.extended_validation {
78                    self.validate_cdata_content(cdata)?;
79                }
80            }
81            Event::Comment(_) => {
82                // Comments are always valid
83            }
84            Event::Decl(_) => {
85                // XML declarations are handled elsewhere
86            }
87            Event::PI(_) => {
88                // Processing instructions are generally allowed
89            }
90            Event::DocType(_) => {
91                // DocType validation is handled by security module
92            }
93            Event::Eof => {
94                self.validate_document_end()?;
95            }
96        }
97
98        Ok(())
99    }
100
101    /// Handle XML start element
102    fn handle_start_element(&mut self, element: &BytesStart) -> Result<(), ParseError> {
103        // Use local_name() to get just the element name without namespace prefix
104        let element_name = utf8_utils::decode_utf8_at_position(
105            element.local_name().as_ref(),
106            self.current_position,
107        )?;
108
109        if self.strict_validation {
110            // Validate element name
111            if element_name.is_empty() {
112                return Err(ParseError::MalformedXml {
113                    message: "Empty element name".to_string(),
114                    position: self.current_position,
115                });
116            }
117
118            // Validate element name contains only valid XML name characters
119            if !is_valid_xml_name(&element_name) {
120                return Err(ParseError::MalformedXml {
121                    message: format!("Invalid element name: '{}'", element_name),
122                    position: self.current_position,
123                });
124            }
125        }
126
127        // Validate attributes if extended validation is enabled
128        if self.extended_validation {
129            self.validate_attributes(element)?;
130        }
131
132        // Calculate depth: depth = number of open ancestors + 1 (for this element)
133        // Siblings have the same depth as each other
134        let element_depth = self.element_stack.len() + 1;
135
136        // Push element onto stack for tag matching with its depth
137        self.element_stack
138            .push((element_name.clone(), element_depth));
139
140        // Update current depth to this element's depth
141        self.current_depth = element_depth;
142
143        // Debug: print what we're pushing (only for first few elements)
144        if self.element_stack.len() <= 5 {
145            eprintln!(
146                "PUSH DEBUG: '{}' depth {} (stack size now: {})",
147                element_name,
148                self.current_depth,
149                self.element_stack.len()
150            );
151        }
152
153        Ok(())
154    }
155
156    /// Handle XML end element
157    fn handle_end_element(&mut self, element: &BytesEnd) -> Result<(), ParseError> {
158        // Use local_name() to get just the element name without namespace prefix
159        let element_name = utf8_utils::decode_utf8_at_position(
160            element.local_name().as_ref(),
161            self.current_position,
162        )?;
163
164        if self.strict_validation {
165            // Check if there's a matching start tag
166            if let Some((expected, depth)) = self.element_stack.pop() {
167                if expected != element_name {
168                    // Debug: print stack state when mismatch occurs
169                    eprintln!("TAG MISMATCH DEBUG:");
170                    eprintln!("  Expected: '{}' at depth {}", expected, depth);
171                    eprintln!("  Found: '{}'", element_name);
172                    eprintln!("  Stack size: {}", self.element_stack.len() + 1); // +1 because we just popped
173                    eprintln!("  Stack contents: {:?}", self.element_stack);
174                    eprintln!("  Position: {}", self.current_position);
175
176                    return Err(ParseError::MismatchedTags {
177                        expected,
178                        found: element_name,
179                        position: self.current_position,
180                    });
181                }
182                // Update depth to parent's depth when exiting an element
183                // After popping, stack size = parent depth
184                self.current_depth = self.element_stack.len();
185            } else {
186                return Err(ParseError::UnexpectedClosingTag {
187                    tag: element_name,
188                    position: self.current_position,
189                });
190            }
191        } else {
192            // Even in lenient mode, we should pop from stack and update depth
193            if let Some((_, _depth)) = self.element_stack.pop() {
194                // After popping, current depth = remaining stack size
195                self.current_depth = self.element_stack.len();
196            }
197        }
198
199        Ok(())
200    }
201
202    /// Handle empty XML element (self-closing)
203    fn handle_empty_element(&mut self, element: &BytesStart) -> Result<(), ParseError> {
204        // Use local_name() to get just the element name without namespace prefix
205        let element_name = utf8_utils::decode_utf8_at_position(
206            element.local_name().as_ref(),
207            self.current_position,
208        )?;
209
210        if self.strict_validation {
211            // Validate element name
212            if element_name.is_empty() {
213                return Err(ParseError::MalformedXml {
214                    message: "Empty element name".to_string(),
215                    position: self.current_position,
216                });
217            }
218
219            if !is_valid_xml_name(&element_name) {
220                return Err(ParseError::MalformedXml {
221                    message: format!("Invalid element name: '{}'", element_name),
222                    position: self.current_position,
223                });
224            }
225        }
226
227        // Validate attributes if extended validation is enabled
228        if self.extended_validation {
229            self.validate_attributes(element)?;
230        }
231
232        // Empty elements don't need to be added to the stack since they're self-closing
233
234        Ok(())
235    }
236
237    /// Validate text content
238    fn validate_text_content(&self, text: &BytesText) -> Result<(), ParseError> {
239        // Use UTF-8 utilities to safely decode text
240        let _decoded = utf8_utils::handle_text_node(text, self.current_position)?;
241
242        // Additional text validation could be added here
243        // For example, checking for invalid control characters
244
245        Ok(())
246    }
247
248    /// Validate CDATA content
249    fn validate_cdata_content(&self, cdata: &[u8]) -> Result<(), ParseError> {
250        // Validate CDATA is properly UTF-8 encoded
251        let _decoded = utf8_utils::decode_utf8_at_position(cdata, self.current_position)?;
252
253        // CDATA sections cannot contain "]]>" sequence except at the end
254        let cdata_str = std::str::from_utf8(cdata).map_err(|e| ParseError::InvalidUtf8 {
255            position: self.current_position + e.valid_up_to(),
256            error: e.to_string(),
257        })?;
258
259        if cdata_str.contains("]]>") && !cdata_str.ends_with("]]>") {
260            return Err(ParseError::MalformedXml {
261                message: "CDATA section contains ']]>' in the middle".to_string(),
262                position: self.current_position,
263            });
264        }
265
266        Ok(())
267    }
268
269    /// Validate XML attributes
270    fn validate_attributes(&self, element: &BytesStart) -> Result<(), ParseError> {
271        let mut seen_attributes = std::collections::HashSet::new();
272
273        for attr_result in element.attributes() {
274            let attr = attr_result.map_err(|e| ParseError::MalformedXml {
275                message: format!("Malformed attribute: {}", e),
276                position: self.current_position,
277            })?;
278
279            // Decode attribute name and value
280            let attr_name =
281                utf8_utils::decode_attribute_name(attr.key.as_ref(), self.current_position)?;
282            let attr_value =
283                utf8_utils::decode_attribute_value(&attr.value, self.current_position)?;
284
285            // Validate attribute name
286            if attr_name.is_empty() {
287                return Err(ParseError::InvalidAttribute {
288                    message: "Empty attribute name".to_string(),
289                    position: self.current_position,
290                });
291            }
292
293            if !is_valid_xml_name(&attr_name) {
294                return Err(ParseError::InvalidAttribute {
295                    message: format!("Invalid attribute name: '{}'", attr_name),
296                    position: self.current_position,
297                });
298            }
299
300            // Check for duplicate attributes
301            if !seen_attributes.insert(attr_name.clone()) {
302                return Err(ParseError::InvalidAttribute {
303                    message: format!("Duplicate attribute: '{}'", attr_name),
304                    position: self.current_position,
305                });
306            }
307
308            // Validate attribute value doesn't contain invalid characters
309            if attr_value.contains('<') || attr_value.contains('&') && !attr_value.contains(';') {
310                return Err(ParseError::InvalidAttribute {
311                    message: format!("Invalid character in attribute value: '{}'", attr_value),
312                    position: self.current_position,
313                });
314            }
315        }
316
317        Ok(())
318    }
319
320    /// Validate at document end that all elements are properly closed
321    fn validate_document_end(&mut self) -> Result<(), ParseError> {
322        if self.strict_validation && !self.element_stack.is_empty() {
323            let unclosed_tags = self
324                .element_stack
325                .iter()
326                .map(|(name, _)| name.clone())
327                .collect();
328            return Err(ParseError::UnclosedTags {
329                tags: unclosed_tags,
330                position: self.current_position,
331            });
332        }
333
334        // Clear stack and reset depth for next document
335        self.element_stack.clear();
336        self.current_depth = 0;
337        Ok(())
338    }
339
340    /// Get current element stack (for debugging)
341    pub fn get_element_stack(&self) -> Vec<String> {
342        self.element_stack
343            .iter()
344            .map(|(name, _)| name.clone())
345            .collect()
346    }
347
348    /// Check if validator is currently inside any elements
349    pub fn is_in_element(&self) -> bool {
350        !self.element_stack.is_empty()
351    }
352
353    /// Get current nesting depth (actual depth, not stack size)
354    pub fn get_depth(&self) -> usize {
355        // Return actual stack depth, which represents nesting level
356        // This fixes the sibling depth bug - siblings have the same depth as their parent + 1
357        self.element_stack.len()
358    }
359}
360
361/// Validate XML name according to XML 1.0 specification
362/// https://www.w3.org/TR/xml/#NT-Name
363fn is_valid_xml_name(name: &str) -> bool {
364    if name.is_empty() {
365        return false;
366    }
367
368    let chars: Vec<char> = name.chars().collect();
369
370    // First character must be a letter, underscore, or colon
371    if !is_name_start_char(chars[0]) {
372        return false;
373    }
374
375    // Remaining characters must be name characters
376    for &ch in chars.iter().skip(1) {
377        if !is_name_char(ch) {
378            return false;
379        }
380    }
381
382    true
383}
384
/// Check whether `ch` may begin an XML name.
///
/// Implements the `NameStartChar` production of XML 1.0
/// (<https://www.w3.org/TR/xml/#NT-NameStartChar>): `:`, `_`, ASCII letters and
/// the listed Unicode ranges, including the supplementary planes
/// `U+10000..=U+EFFFF`.
fn is_name_start_char(ch: char) -> bool {
    matches!(
        ch,
        ':' | '_'
            | 'A'..='Z'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
402
403/// Check if character can be in an XML name
404fn is_name_char(ch: char) -> bool {
405    is_name_start_char(ch)
406        || ch.is_ascii_digit()
407        || ch == '-'
408        || ch == '.'
409        || ch == '\u{B7}'
410        || ('\u{0300}'..='\u{036F}').contains(&ch)
411        || ('\u{203F}'..='\u{2040}').contains(&ch)
412}
413
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Names that the XML name check must accept.
    #[test]
    fn test_valid_xml_names() {
        let accepted = ["element", "_private", "ns:element", "element-1", "element.1"];
        for &name in &accepted {
            assert!(is_valid_xml_name(name));
        }
    }

    /// Names that the XML name check must reject.
    #[test]
    fn test_invalid_xml_names() {
        let rejected = ["", "1element", "-element", ".element", "element with spaces"];
        for &name in &rejected {
            assert!(!is_valid_xml_name(name));
        }
    }

    /// A freshly created validator is outside any element.
    #[test]
    fn test_validator_creation() {
        let validator = XmlValidator::default();
        assert!(!validator.is_in_element());
        assert_eq!(validator.get_depth(), 0);
    }

    /// Opening and then closing one element moves the depth 0 -> 1 -> 0.
    #[test]
    fn test_element_stack_tracking() {
        // The reader is only consulted for its buffer position.
        let reader = Reader::from_reader(Cursor::new(b"test"));
        let mut validator = XmlValidator::strict();

        let open = Event::Start(BytesStart::new("test"));
        validator.validate_event(&open, &reader).unwrap();
        assert!(validator.is_in_element());
        assert_eq!(validator.get_depth(), 1);

        let close = Event::End(BytesEnd::new("test"));
        validator.validate_event(&close, &reader).unwrap();
        assert!(!validator.is_in_element());
        assert_eq!(validator.get_depth(), 0);
    }
}