1use crate::error::ParseError;
4use crate::utf8_utils;
5use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
6use quick_xml::Reader;
7use std::io::BufRead;
8
/// Stateful checker for a stream of `quick_xml` events.
///
/// The validator keeps a stack of the elements that are currently open and
/// two switches that decide how much checking is applied to each event.
#[derive(Debug, Clone)]
pub struct XmlValidator {
    /// Currently open elements, innermost last, stored as
    /// `(decoded local name, depth at which the element was opened)`.
    /// The depth is 1 for the outermost element.
    element_stack: Vec<(String, usize)>,
    /// Depth recorded by the most recent start/end tag handling.
    /// Public depth queries use the stack length rather than this field.
    current_depth: usize,
    /// Reader buffer position captured when the latest event was validated;
    /// attached to every error raised for that event.
    current_position: usize,
    /// Enables element-name checks and start/end tag balance checks.
    strict_validation: bool,
    /// Enables attribute, text and CDATA content checks.
    extended_validation: bool,
}
23
24impl Default for XmlValidator {
25 fn default() -> Self {
26 Self::new(true, false)
27 }
28}
29
30impl XmlValidator {
31 pub fn new(strict: bool, extended: bool) -> Self {
33 Self {
34 element_stack: Vec::new(),
35 current_depth: 0,
36 current_position: 0,
37 strict_validation: strict,
38 extended_validation: extended,
39 }
40 }
41
42 pub fn strict() -> Self {
44 Self::new(true, true)
45 }
46
47 pub fn lenient() -> Self {
49 Self::new(false, false)
50 }
51
52 pub fn validate_event<R: BufRead>(
54 &mut self,
55 event: &Event,
56 reader: &Reader<R>,
57 ) -> Result<(), ParseError> {
58 self.current_position = reader.buffer_position() as usize;
60
61 match event {
62 Event::Start(ref element) => {
63 self.handle_start_element(element)?;
64 }
65 Event::End(ref element) => {
66 self.handle_end_element(element)?;
67 }
68 Event::Empty(ref element) => {
69 self.handle_empty_element(element)?;
70 }
71 Event::Text(ref text) => {
72 if self.extended_validation {
73 self.validate_text_content(text)?;
74 }
75 }
76 Event::CData(ref cdata) => {
77 if self.extended_validation {
78 self.validate_cdata_content(cdata)?;
79 }
80 }
81 Event::Comment(_) => {
82 }
84 Event::Decl(_) => {
85 }
87 Event::PI(_) => {
88 }
90 Event::DocType(_) => {
91 }
93 Event::Eof => {
94 self.validate_document_end()?;
95 }
96 }
97
98 Ok(())
99 }
100
101 fn handle_start_element(&mut self, element: &BytesStart) -> Result<(), ParseError> {
103 let element_name = utf8_utils::decode_utf8_at_position(
105 element.local_name().as_ref(),
106 self.current_position,
107 )?;
108
109 if self.strict_validation {
110 if element_name.is_empty() {
112 return Err(ParseError::MalformedXml {
113 message: "Empty element name".to_string(),
114 position: self.current_position,
115 });
116 }
117
118 if !is_valid_xml_name(&element_name) {
120 return Err(ParseError::MalformedXml {
121 message: format!("Invalid element name: '{}'", element_name),
122 position: self.current_position,
123 });
124 }
125 }
126
127 if self.extended_validation {
129 self.validate_attributes(element)?;
130 }
131
132 let element_depth = self.element_stack.len() + 1;
135
136 self.element_stack
138 .push((element_name.clone(), element_depth));
139
140 self.current_depth = element_depth;
142
143 Ok(())
146 }
147
148 fn handle_end_element(&mut self, element: &BytesEnd) -> Result<(), ParseError> {
150 let element_name = utf8_utils::decode_utf8_at_position(
152 element.local_name().as_ref(),
153 self.current_position,
154 )?;
155
156 if self.strict_validation {
157 if let Some((expected, depth)) = self.element_stack.pop() {
159 if expected != element_name {
160 return Err(ParseError::MismatchedTags {
164 expected,
165 found: element_name,
166 position: self.current_position,
167 });
168 }
169 self.current_depth = self.element_stack.len();
172 } else {
173 return Err(ParseError::UnexpectedClosingTag {
174 tag: element_name,
175 position: self.current_position,
176 });
177 }
178 } else {
179 if let Some((_, _depth)) = self.element_stack.pop() {
181 self.current_depth = self.element_stack.len();
183 }
184 }
185
186 Ok(())
187 }
188
189 fn handle_empty_element(&mut self, element: &BytesStart) -> Result<(), ParseError> {
191 let element_name = utf8_utils::decode_utf8_at_position(
193 element.local_name().as_ref(),
194 self.current_position,
195 )?;
196
197 if self.strict_validation {
198 if element_name.is_empty() {
200 return Err(ParseError::MalformedXml {
201 message: "Empty element name".to_string(),
202 position: self.current_position,
203 });
204 }
205
206 if !is_valid_xml_name(&element_name) {
207 return Err(ParseError::MalformedXml {
208 message: format!("Invalid element name: '{}'", element_name),
209 position: self.current_position,
210 });
211 }
212 }
213
214 if self.extended_validation {
216 self.validate_attributes(element)?;
217 }
218
219 Ok(())
222 }
223
224 fn validate_text_content(&self, text: &BytesText) -> Result<(), ParseError> {
226 let _decoded = utf8_utils::handle_text_node(text, self.current_position)?;
228
229 Ok(())
233 }
234
235 fn validate_cdata_content(&self, cdata: &[u8]) -> Result<(), ParseError> {
237 let _decoded = utf8_utils::decode_utf8_at_position(cdata, self.current_position)?;
239
240 let cdata_str = std::str::from_utf8(cdata).map_err(|e| ParseError::InvalidUtf8 {
242 message: format!("UTF-8 decoding error at position {}: {}", self.current_position + e.valid_up_to(), e),
243 })?;
244
245 if cdata_str.contains("]]>") && !cdata_str.ends_with("]]>") {
246 return Err(ParseError::MalformedXml {
247 message: "CDATA section contains ']]>' in the middle".to_string(),
248 position: self.current_position,
249 });
250 }
251
252 Ok(())
253 }
254
255 fn validate_attributes(&self, element: &BytesStart) -> Result<(), ParseError> {
257 let mut seen_attributes = std::collections::HashSet::new();
258
259 for attr_result in element.attributes() {
260 let attr = attr_result.map_err(|e| ParseError::MalformedXml {
261 message: format!("Malformed attribute: {}", e),
262 position: self.current_position,
263 })?;
264
265 let attr_name =
267 utf8_utils::decode_attribute_name(attr.key.as_ref(), self.current_position)?;
268 let attr_value =
269 utf8_utils::decode_attribute_value(&attr.value, self.current_position)?;
270
271 if attr_name.is_empty() {
273 return Err(ParseError::InvalidAttribute {
274 message: "Empty attribute name".to_string(),
275 position: self.current_position,
276 });
277 }
278
279 if !is_valid_xml_name(&attr_name) {
280 return Err(ParseError::InvalidAttribute {
281 message: format!("Invalid attribute name: '{}'", attr_name),
282 position: self.current_position,
283 });
284 }
285
286 if !seen_attributes.insert(attr_name.clone()) {
288 return Err(ParseError::InvalidAttribute {
289 message: format!("Duplicate attribute: '{}'", attr_name),
290 position: self.current_position,
291 });
292 }
293
294 if attr_value.contains('<') || attr_value.contains('&') && !attr_value.contains(';') {
296 return Err(ParseError::InvalidAttribute {
297 message: format!("Invalid character in attribute value: '{}'", attr_value),
298 position: self.current_position,
299 });
300 }
301 }
302
303 Ok(())
304 }
305
306 fn validate_document_end(&mut self) -> Result<(), ParseError> {
308 if self.strict_validation && !self.element_stack.is_empty() {
309 let unclosed_tags = self
310 .element_stack
311 .iter()
312 .map(|(name, _)| name.clone())
313 .collect();
314 return Err(ParseError::UnclosedTags {
315 tags: unclosed_tags,
316 position: self.current_position,
317 });
318 }
319
320 self.element_stack.clear();
322 self.current_depth = 0;
323 Ok(())
324 }
325
326 pub fn get_element_stack(&self) -> Vec<String> {
328 self.element_stack
329 .iter()
330 .map(|(name, _)| name.clone())
331 .collect()
332 }
333
334 pub fn is_in_element(&self) -> bool {
336 !self.element_stack.is_empty()
337 }
338
339 pub fn get_depth(&self) -> usize {
341 self.element_stack.len()
344 }
345}
346
347fn is_valid_xml_name(name: &str) -> bool {
350 if name.is_empty() {
351 return false;
352 }
353
354 let chars: Vec<char> = name.chars().collect();
355
356 if !is_name_start_char(chars[0]) {
358 return false;
359 }
360
361 for &ch in chars.iter().skip(1) {
363 if !is_name_char(ch) {
364 return false;
365 }
366 }
367
368 true
369}
370
/// Returns `true` when `ch` may begin an XML name.
///
/// The accepted set follows the `NameStartChar` production of XML 1.0:
/// `:`, `_`, ASCII letters, and the listed Unicode ranges, including the
/// supplementary range U+10000..=U+EFFFF.
fn is_name_start_char(ch: char) -> bool {
    matches!(
        ch,
        ':' | '_'
            | 'A'..='Z'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
388
389fn is_name_char(ch: char) -> bool {
391 is_name_start_char(ch)
392 || ch.is_ascii_digit()
393 || ch == '-'
394 || ch == '.'
395 || ch == '\u{B7}'
396 || ('\u{0300}'..='\u{036F}').contains(&ch)
397 || ('\u{203F}'..='\u{2040}').contains(&ch)
398}
399
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Plain, underscore-led, prefixed, and `-`/`.`-containing names pass.
    #[test]
    fn test_valid_xml_names() {
        let accepted = ["element", "_private", "ns:element", "element-1", "element.1"];
        for name in accepted.iter() {
            assert!(is_valid_xml_name(name), "{:?} should be accepted", name);
        }
    }

    /// Empty names, names starting with a digit, `-` or `.`, and names
    /// containing spaces are rejected.
    #[test]
    fn test_invalid_xml_names() {
        let rejected = ["", "1element", "-element", ".element", "element with spaces"];
        for name in rejected.iter() {
            assert!(!is_valid_xml_name(name), "{:?} should be rejected", name);
        }
    }

    /// A default validator starts with no open elements.
    #[test]
    fn test_validator_creation() {
        let fresh = XmlValidator::default();
        assert!(!fresh.is_in_element());
        assert_eq!(fresh.get_depth(), 0);
    }

    /// A start event opens one level and the matching end event closes it.
    #[test]
    fn test_element_stack_tracking() {
        // The reader is never advanced here; the validator only queries its
        // buffer position.
        let reader = Reader::from_reader(Cursor::new(b"test"));
        let mut validator = XmlValidator::strict();

        let open = Event::Start(BytesStart::new("test"));
        validator.validate_event(&open, &reader).unwrap();
        assert_eq!(validator.get_depth(), 1);
        assert!(validator.is_in_element());

        let close = Event::End(BytesEnd::new("test"));
        validator.validate_event(&close, &reader).unwrap();
        assert_eq!(validator.get_depth(), 0);
        assert!(!validator.is_in_element());
    }
}