ddex_parser/parser/
extension_capture.rs

1//! Extension capture system for preserving unknown XML elements and namespaces
2//!
3//! This module provides functionality to capture and preserve XML fragments that are
4//! not part of the standard DDEX schema, enabling perfect round-trip fidelity for
5//! documents containing proprietary extensions.
6
7use crate::utf8_utils;
8use ddex_core::models::{
9    extensions::utils, Comment, CommentPosition, Extensions, ProcessingInstruction, XmlFragment,
10};
11use indexmap::IndexMap;
12use quick_xml::{
13    events::{BytesEnd, BytesStart, BytesText, Event},
14    Reader,
15};
16
17/// Extension capture context during parsing
18#[derive(Debug, Clone)]
19pub struct ExtensionCaptureContext {
20    /// Current element path (for location tracking)
21    pub element_path: Vec<String>,
22
23    /// Namespace context (prefix -> URI mappings)
24    pub namespace_context: IndexMap<String, String>,
25
26    /// Whether we're currently inside an unknown element
27    pub in_extension: bool,
28
29    /// Depth of unknown element nesting
30    pub extension_depth: usize,
31
32    /// Buffer for accumulating unknown XML content
33    pub extension_buffer: String,
34
35    /// Current extension being built
36    pub current_extension: Option<XmlFragment>,
37
38    /// Extensions collected during parsing
39    pub extensions: Extensions,
40
41    /// Current line number for position tracking
42    pub current_line: usize,
43
44    /// Current column number for position tracking
45    pub current_column: usize,
46}
47
48impl Default for ExtensionCaptureContext {
49    fn default() -> Self {
50        Self::new()
51    }
52}
53
54impl ExtensionCaptureContext {
55    /// Create a new extension capture context
56    pub fn new() -> Self {
57        Self {
58            element_path: Vec::new(),
59            namespace_context: IndexMap::new(),
60            in_extension: false,
61            extension_depth: 0,
62            extension_buffer: String::new(),
63            current_extension: None,
64            extensions: Extensions::new(),
65            current_line: 1,
66            current_column: 1,
67        }
68    }
69
70    /// Enter an element during parsing
71    pub fn enter_element(&mut self, element_name: &str) {
72        self.element_path.push(element_name.to_string());
73    }
74
75    /// Exit an element during parsing
76    pub fn exit_element(&mut self) -> Option<String> {
77        self.element_path.pop()
78    }
79
80    /// Get the current element path as a string
81    pub fn current_path(&self) -> String {
82        self.element_path.join("/")
83    }
84
85    /// Update namespace context with new declarations
86    pub fn add_namespace_declaration(&mut self, prefix: String, uri: String) {
87        self.namespace_context.insert(prefix.clone(), uri.clone());
88
89        // Also add to global namespaces if it's not a DDEX namespace
90        if !utils::is_ddex_namespace(&uri) {
91            self.extensions.add_global_namespace(prefix, uri);
92        }
93    }
94
95    /// Check if an element should be captured as an extension
96    pub fn should_capture_element(&self, _element_name: &str, namespace_uri: Option<&str>) -> bool {
97        // If we're already in an extension, capture everything
98        if self.in_extension {
99            return true;
100        }
101
102        // Check if this element is from a non-DDEX namespace
103        if let Some(ns_uri) = namespace_uri {
104            return !utils::is_ddex_namespace(ns_uri);
105        }
106
107        // Check if it's an unknown element in the DDEX namespace
108        // This would require schema validation, for now we'll be conservative
109        false
110    }
111
112    /// Start capturing an extension element
113    pub fn start_extension_capture(
114        &mut self,
115        element_name: &str,
116        namespace_uri: Option<&str>,
117        namespace_prefix: Option<&str>,
118    ) {
119        self.in_extension = true;
120        self.extension_depth = 1;
121        self.extension_buffer.clear();
122
123        self.current_extension = Some(XmlFragment::with_namespace(
124            element_name.to_string(),
125            namespace_uri.map(String::from),
126            namespace_prefix.map(String::from),
127            String::new(), // Will be filled as we parse
128        ));
129    }
130
131    /// Add content to the current extension
132    pub fn add_extension_content(&mut self, content: &str) {
133        if self.in_extension {
134            self.extension_buffer.push_str(content);
135        }
136    }
137
138    /// Process an opening tag during extension capture
139    pub fn process_extension_start_tag(&mut self, event: &BytesStart) {
140        if !self.in_extension {
141            return;
142        }
143
144        self.extension_depth += 1;
145        self.extension_buffer.push('<');
146        let element_name = utf8_utils::process_text_content_lossy(event.name().as_ref());
147        self.extension_buffer.push_str(&element_name);
148
149        // Add attributes
150        for attr in event.attributes().flatten() {
151            self.extension_buffer.push(' ');
152            let key = utf8_utils::process_text_content_lossy(attr.key.as_ref());
153            let value = utf8_utils::process_text_content_lossy(&attr.value);
154
155            self.extension_buffer.push_str(&key);
156            self.extension_buffer.push_str("=\"");
157            self.extension_buffer.push_str(&value);
158            self.extension_buffer.push('"');
159
160            // Store attribute in current extension
161            if let Some(ref mut ext) = self.current_extension {
162                ext.add_attribute(key, value);
163            }
164        }
165
166        self.extension_buffer.push('>');
167    }
168
169    /// Process a closing tag during extension capture
170    pub fn process_extension_end_tag(&mut self, event: &BytesEnd) {
171        if !self.in_extension {
172            return;
173        }
174
175        self.extension_buffer.push_str("</");
176        self.extension_buffer
177            .push_str(std::str::from_utf8(event.name().as_ref()).unwrap_or("unknown"));
178        self.extension_buffer.push('>');
179
180        self.extension_depth -= 1;
181
182        // If we're back to depth 0, finish capturing this extension
183        if self.extension_depth == 0 {
184            self.finish_extension_capture();
185        }
186    }
187
188    /// Process text content during extension capture
189    pub fn process_extension_text(&mut self, event: &BytesText) {
190        if !self.in_extension {
191            return;
192        }
193
194        let text = event.unescape().unwrap_or_default();
195        self.extension_buffer.push_str(&text);
196
197        // If this is simple text content, store it in the fragment
198        if let Some(ref mut ext) = self.current_extension {
199            if ext.children.is_empty() {
200                ext.text_content = Some(text.to_string());
201            }
202        }
203    }
204
205    /// Finish capturing the current extension
206    pub fn finish_extension_capture(&mut self) {
207        if let Some(mut extension) = self.current_extension.take() {
208            extension.raw_content = self.extension_buffer.clone();
209
210            // Generate location key
211            let namespace_uri = extension.namespace_uri.as_deref();
212            let location_key = utils::generate_location_key(
213                &self
214                    .element_path
215                    .iter()
216                    .map(|s| s.as_str())
217                    .collect::<Vec<_>>(),
218                namespace_uri,
219                &extension.element_name,
220            );
221
222            self.extensions.add_fragment(location_key, extension);
223        }
224
225        self.in_extension = false;
226        self.extension_depth = 0;
227        self.extension_buffer.clear();
228    }
229
230    /// Add a document-level processing instruction
231    pub fn add_processing_instruction(&mut self, target: String, data: Option<String>) {
232        let pi = ProcessingInstruction::new(target, data);
233        self.extensions.add_document_processing_instruction(pi);
234    }
235
236    /// Add a document-level comment
237    pub fn add_comment(&mut self, comment: String) {
238        self.extensions.add_document_comment(comment);
239    }
240
241    /// Add a position-aware comment
242    pub fn add_comment_with_position(
243        &mut self,
244        comment: String,
245        position: CommentPosition,
246        line_number: Option<usize>,
247        column_number: Option<usize>,
248    ) {
249        let xpath = if !self.element_path.is_empty() {
250            Some(format!("/{}", self.element_path.join("/")))
251        } else {
252            None
253        };
254
255        let comment_struct =
256            Comment::with_location(comment, position, xpath, line_number, column_number);
257
258        if self.element_path.is_empty()
259            || matches!(position, CommentPosition::Before | CommentPosition::After)
260        {
261            // Document-level or standalone comment
262            self.extensions
263                .add_document_comment_structured(comment_struct);
264        } else {
265            // Element-level comment - add to current extension or buffer for later association
266            if let Some(ref mut ext) = self.current_extension {
267                ext.comments.push(comment_struct);
268            } else {
269                // Store for later association with the next element
270                self.extensions
271                    .add_document_comment_structured(comment_struct);
272            }
273        }
274    }
275
276    /// Get the accumulated extensions
277    pub fn into_extensions(self) -> Extensions {
278        self.extensions
279    }
280}
281
282/// Extension-aware XML parser
283pub struct ExtensionAwareParser {
284    /// Extension capture context
285    pub context: ExtensionCaptureContext,
286
287    /// Whether to capture extensions
288    pub capture_extensions: bool,
289}
290
291impl ExtensionAwareParser {
292    /// Create a new extension-aware parser
293    pub fn new(capture_extensions: bool) -> Self {
294        Self {
295            context: ExtensionCaptureContext::new(),
296            capture_extensions,
297        }
298    }
299
300    /// Parse XML with extension capture
301    pub fn parse_with_extensions(
302        &mut self,
303        xml_content: &str,
304    ) -> Result<Extensions, Box<dyn std::error::Error>> {
305        if !self.capture_extensions {
306            return Ok(Extensions::new());
307        }
308
309        let mut reader = Reader::from_str(xml_content);
310        reader.config_mut().trim_text(true);
311
312        let mut buf = Vec::new();
313
314        loop {
315            match reader.read_event_into(&mut buf) {
316                Ok(Event::Start(ref e)) => {
317                    let element_name_bytes = e.name();
318                    let element_name =
319                        std::str::from_utf8(element_name_bytes.as_ref()).unwrap_or("unknown");
320
321                    // Extract namespace information
322                    let (namespace_uri, namespace_prefix) = self.extract_namespace_info(e);
323
324                    // Update namespace context with any new declarations
325                    for attr in e.attributes().flatten() {
326                        let key = std::str::from_utf8(attr.key.as_ref()).unwrap_or("");
327                        if key.starts_with("xmlns") {
328                            let prefix = if key == "xmlns" {
329                                "".to_string()
330                            } else {
331                                key.strip_prefix("xmlns:").unwrap_or("").to_string()
332                            };
333                            let uri = String::from_utf8_lossy(&attr.value).to_string();
334                            self.context.add_namespace_declaration(prefix, uri);
335                        }
336                    }
337
338                    // Check if we should capture this element
339                    if self
340                        .context
341                        .should_capture_element(element_name, namespace_uri.as_deref())
342                    {
343                        if !self.context.in_extension {
344                            self.context.start_extension_capture(
345                                element_name,
346                                namespace_uri.as_deref(),
347                                namespace_prefix.as_deref(),
348                            );
349                        }
350                        self.context.process_extension_start_tag(e);
351                    } else {
352                        self.context.enter_element(element_name);
353                    }
354                }
355                Ok(Event::End(ref e)) => {
356                    if self.context.in_extension {
357                        self.context.process_extension_end_tag(e);
358                    } else {
359                        self.context.exit_element();
360                    }
361                }
362                Ok(Event::Text(ref e)) => {
363                    if self.context.in_extension {
364                        self.context.process_extension_text(e);
365                    }
366                }
367                Ok(Event::Comment(ref e)) => {
368                    let comment = String::from_utf8_lossy(e);
369                    if self.context.in_extension {
370                        self.context
371                            .add_extension_content(&format!("<!--{}-->", comment));
372                    } else {
373                        // Determine comment position based on context
374                        let position = if self.context.element_path.is_empty() {
375                            CommentPosition::Before
376                        } else {
377                            CommentPosition::FirstChild
378                        };
379
380                        self.context.add_comment_with_position(
381                            comment.trim().to_string(),
382                            position,
383                            Some(self.context.current_line),
384                            Some(self.context.current_column),
385                        );
386                    }
387                }
388                Ok(Event::PI(ref e)) => {
389                    let content = String::from_utf8_lossy(e);
390                    // Split processing instruction content into target and data
391                    if let Some(space_pos) = content.find(char::is_whitespace) {
392                        let target = content[..space_pos].to_string();
393                        let data = content[space_pos..].trim().to_string();
394                        let data = if data.is_empty() { None } else { Some(data) };
395                        self.context.add_processing_instruction(target, data);
396                    } else {
397                        self.context
398                            .add_processing_instruction(content.to_string(), None);
399                    }
400                }
401                Ok(Event::Eof) => break,
402                Err(e) => {
403                    // Log the error but continue parsing to capture as much as possible
404                    eprintln!("Warning: XML parsing error during extension capture: {}", e);
405                }
406                _ => {}
407            }
408            buf.clear();
409        }
410
411        Ok(self.context.extensions.clone())
412    }
413
414    /// Extract namespace information from a start tag
415    fn extract_namespace_info(&self, event: &BytesStart) -> (Option<String>, Option<String>) {
416        let name_bytes = event.name();
417        let name = std::str::from_utf8(name_bytes.as_ref()).unwrap_or("unknown");
418
419        if let Some(colon_pos) = name.find(':') {
420            let prefix = &name[..colon_pos];
421            let namespace_uri = self.context.namespace_context.get(prefix).cloned();
422            (namespace_uri, Some(prefix.to_string()))
423        } else {
424            // Check for default namespace
425            let default_ns = self.context.namespace_context.get("").cloned();
426            (default_ns, None)
427        }
428    }
429}
430
431/// Utility functions for extension capture
432pub mod capture_utils {
433    use super::*;
434
435    /// Extract all extensions from XML content
436    pub fn extract_extensions(xml_content: &str) -> Result<Extensions, Box<dyn std::error::Error>> {
437        let mut parser = ExtensionAwareParser::new(true);
438        parser.parse_with_extensions(xml_content)
439    }
440
441    /// Check if XML content contains extensions
442    pub fn has_extensions(xml_content: &str) -> bool {
443        match extract_extensions(xml_content) {
444            Ok(extensions) => !extensions.is_empty(),
445            Err(_) => false,
446        }
447    }
448
449    /// Get extension statistics from XML content
450    pub fn get_extension_stats(xml_content: &str) -> ExtensionStats {
451        match extract_extensions(xml_content) {
452            Ok(extensions) => ExtensionStats::from_extensions(&extensions),
453            Err(_) => ExtensionStats::default(),
454        }
455    }
456
457    /// Extension statistics
458    #[derive(Debug, Clone, Default)]
459    pub struct ExtensionStats {
460        pub fragment_count: usize,
461        pub namespace_count: usize,
462        pub comment_count: usize,
463        pub processing_instruction_count: usize,
464        pub unique_namespaces: Vec<String>,
465    }
466
467    impl ExtensionStats {
468        fn from_extensions(extensions: &Extensions) -> Self {
469            let unique_namespaces = extensions.global_namespaces.values().cloned().collect();
470
471            Self {
472                fragment_count: extensions.fragments.len(),
473                namespace_count: extensions.global_namespaces.len(),
474                comment_count: extensions.document_comments.len(),
475                processing_instruction_count: extensions.document_processing_instructions.len(),
476                unique_namespaces,
477            }
478        }
479    }
480}
481
#[cfg(test)]
mod tests {
    use super::*;

    /// Entering and leaving elements is reflected in the joined path.
    #[test]
    fn test_extension_capture_context() {
        let mut ctx = ExtensionCaptureContext::new();

        for name in ["message", "header"] {
            ctx.enter_element(name);
        }
        assert_eq!(ctx.current_path(), "message/header");

        ctx.exit_element();
        assert_eq!(ctx.current_path(), "message");
    }

    /// Only elements from non-DDEX namespaces are candidates for capture.
    #[test]
    fn test_namespace_detection() {
        let ctx = ExtensionCaptureContext::new();

        // A DDEX namespace must be left alone.
        let ddex = ctx.should_capture_element("Release", Some("http://ddex.net/xml/ern/43"));
        assert!(!ddex);

        // Anything else is treated as an extension.
        let custom = ctx.should_capture_element("customElement", Some("http://example.com/custom"));
        assert!(custom);
    }

    /// A document with custom-namespace elements yields extensions and
    /// registers the custom namespace globally.
    #[test]
    fn test_extension_parsing() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43" xmlns:custom="http://example.com/custom">
  <MessageHeader>
    <MessageId>MSG123</MessageId>
    <custom:CustomField>Custom Value</custom:CustomField>
  </MessageHeader>
  <custom:CustomSection attr="value">
    <custom:NestedElement>Nested Content</custom:NestedElement>
  </custom:CustomSection>
</ern:NewReleaseMessage>"#;

        let collected = capture_utils::extract_extensions(xml).unwrap();
        assert!(!collected.is_empty());
        assert!(collected.global_namespaces.contains_key("custom"));
        assert_eq!(
            collected.global_namespaces["custom"],
            "http://example.com/custom"
        );
    }

    /// Processing instructions ahead of the root element are recorded.
    #[test]
    fn test_processing_instruction_capture() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<?custom-instruction data="value"?>
<root>content</root>"#;

        let collected = capture_utils::extract_extensions(xml).unwrap();
        let instructions = &collected.document_processing_instructions;
        assert!(!instructions.is_empty());
        assert_eq!(instructions[0].target, "custom-instruction");
    }

    /// Comments outside of extensions end up in the document comments.
    #[test]
    fn test_comment_capture() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<!-- This is a document comment -->
<root>
  <!-- This is an element comment -->
  content
</root>"#;

        let collected = capture_utils::extract_extensions(xml).unwrap();
        assert!(!collected.document_comments.is_empty());
    }
}