ddex_parser/parser/
extension_capture.rs

1//! Extension capture system for preserving unknown XML elements and namespaces
2//!
3//! This module provides functionality to capture and preserve XML fragments that are
4//! not part of the standard DDEX schema, enabling perfect round-trip fidelity for
5//! documents containing proprietary extensions.
6
7use crate::utf8_utils;
8use ddex_core::models::{
9    extensions::utils, Comment, CommentPosition, Extensions, ProcessingInstruction, XmlFragment,
10};
11use indexmap::IndexMap;
12use log::warn;
13use quick_xml::{
14    events::{BytesEnd, BytesStart, BytesText, Event},
15    Reader,
16};
17
18/// Extension capture context during parsing
19#[derive(Debug, Clone)]
20pub struct ExtensionCaptureContext {
21    /// Current element path (for location tracking)
22    pub element_path: Vec<String>,
23
24    /// Namespace context (prefix -> URI mappings)
25    pub namespace_context: IndexMap<String, String>,
26
27    /// Whether we're currently inside an unknown element
28    pub in_extension: bool,
29
30    /// Depth of unknown element nesting
31    pub extension_depth: usize,
32
33    /// Buffer for accumulating unknown XML content
34    pub extension_buffer: String,
35
36    /// Current extension being built
37    pub current_extension: Option<XmlFragment>,
38
39    /// Extensions collected during parsing
40    pub extensions: Extensions,
41
42    /// Current line number for position tracking
43    pub current_line: usize,
44
45    /// Current column number for position tracking
46    pub current_column: usize,
47}
48
49impl Default for ExtensionCaptureContext {
50    fn default() -> Self {
51        Self::new()
52    }
53}
54
55impl ExtensionCaptureContext {
56    /// Create a new extension capture context
57    pub fn new() -> Self {
58        Self {
59            element_path: Vec::new(),
60            namespace_context: IndexMap::new(),
61            in_extension: false,
62            extension_depth: 0,
63            extension_buffer: String::new(),
64            current_extension: None,
65            extensions: Extensions::new(),
66            current_line: 1,
67            current_column: 1,
68        }
69    }
70
71    /// Enter an element during parsing
72    pub fn enter_element(&mut self, element_name: &str) {
73        self.element_path.push(element_name.to_string());
74    }
75
76    /// Exit an element during parsing
77    pub fn exit_element(&mut self) -> Option<String> {
78        self.element_path.pop()
79    }
80
81    /// Get the current element path as a string
82    pub fn current_path(&self) -> String {
83        self.element_path.join("/")
84    }
85
86    /// Update namespace context with new declarations
87    pub fn add_namespace_declaration(&mut self, prefix: String, uri: String) {
88        self.namespace_context.insert(prefix.clone(), uri.clone());
89
90        // Also add to global namespaces if it's not a DDEX namespace
91        if !utils::is_ddex_namespace(&uri) {
92            self.extensions.add_global_namespace(prefix, uri);
93        }
94    }
95
96    /// Check if an element should be captured as an extension
97    pub fn should_capture_element(&self, _element_name: &str, namespace_uri: Option<&str>) -> bool {
98        // If we're already in an extension, capture everything
99        if self.in_extension {
100            return true;
101        }
102
103        // Check if this element is from a non-DDEX namespace
104        if let Some(ns_uri) = namespace_uri {
105            return !utils::is_ddex_namespace(ns_uri);
106        }
107
108        // Check if it's an unknown element in the DDEX namespace
109        // This would require schema validation, for now we'll be conservative
110        false
111    }
112
113    /// Start capturing an extension element
114    pub fn start_extension_capture(
115        &mut self,
116        element_name: &str,
117        namespace_uri: Option<&str>,
118        namespace_prefix: Option<&str>,
119    ) {
120        self.in_extension = true;
121        self.extension_depth = 1;
122        self.extension_buffer.clear();
123
124        self.current_extension = Some(XmlFragment::with_namespace(
125            element_name.to_string(),
126            namespace_uri.map(String::from),
127            namespace_prefix.map(String::from),
128            String::new(), // Will be filled as we parse
129        ));
130    }
131
132    /// Add content to the current extension
133    pub fn add_extension_content(&mut self, content: &str) {
134        if self.in_extension {
135            self.extension_buffer.push_str(content);
136        }
137    }
138
139    /// Process an opening tag during extension capture
140    pub fn process_extension_start_tag(&mut self, event: &BytesStart) {
141        if !self.in_extension {
142            return;
143        }
144
145        self.extension_depth += 1;
146        self.extension_buffer.push('<');
147        let element_name = utf8_utils::process_text_content_lossy(event.name().as_ref());
148        self.extension_buffer.push_str(&element_name);
149
150        // Add attributes
151        for attr in event.attributes().flatten() {
152            self.extension_buffer.push(' ');
153            let key = utf8_utils::process_text_content_lossy(attr.key.as_ref());
154            let value = utf8_utils::process_text_content_lossy(&attr.value);
155
156            self.extension_buffer.push_str(&key);
157            self.extension_buffer.push_str("=\"");
158            self.extension_buffer.push_str(&value);
159            self.extension_buffer.push('"');
160
161            // Store attribute in current extension
162            if let Some(ref mut ext) = self.current_extension {
163                ext.add_attribute(key, value);
164            }
165        }
166
167        self.extension_buffer.push('>');
168    }
169
170    /// Process a closing tag during extension capture
171    pub fn process_extension_end_tag(&mut self, event: &BytesEnd) {
172        if !self.in_extension {
173            return;
174        }
175
176        self.extension_buffer.push_str("</");
177        self.extension_buffer
178            .push_str(std::str::from_utf8(event.name().as_ref()).unwrap_or("unknown"));
179        self.extension_buffer.push('>');
180
181        self.extension_depth -= 1;
182
183        // If we're back to depth 0, finish capturing this extension
184        if self.extension_depth == 0 {
185            self.finish_extension_capture();
186        }
187    }
188
189    /// Process text content during extension capture
190    pub fn process_extension_text(&mut self, event: &BytesText) {
191        if !self.in_extension {
192            return;
193        }
194
195        let text = event.unescape().unwrap_or_default();
196        self.extension_buffer.push_str(&text);
197
198        // If this is simple text content, store it in the fragment
199        if let Some(ref mut ext) = self.current_extension {
200            if ext.children.is_empty() {
201                ext.text_content = Some(text.to_string());
202            }
203        }
204    }
205
206    /// Finish capturing the current extension
207    pub fn finish_extension_capture(&mut self) {
208        if let Some(mut extension) = self.current_extension.take() {
209            extension.raw_content = self.extension_buffer.clone();
210
211            // Generate location key
212            let namespace_uri = extension.namespace_uri.as_deref();
213            let location_key = utils::generate_location_key(
214                &self
215                    .element_path
216                    .iter()
217                    .map(|s| s.as_str())
218                    .collect::<Vec<_>>(),
219                namespace_uri,
220                &extension.element_name,
221            );
222
223            self.extensions.add_fragment(location_key, extension);
224        }
225
226        self.in_extension = false;
227        self.extension_depth = 0;
228        self.extension_buffer.clear();
229    }
230
231    /// Add a document-level processing instruction
232    pub fn add_processing_instruction(&mut self, target: String, data: Option<String>) {
233        let pi = ProcessingInstruction::new(target, data);
234        self.extensions.add_document_processing_instruction(pi);
235    }
236
237    /// Add a document-level comment
238    pub fn add_comment(&mut self, comment: String) {
239        self.extensions.add_document_comment(comment);
240    }
241
242    /// Add a position-aware comment
243    pub fn add_comment_with_position(
244        &mut self,
245        comment: String,
246        position: CommentPosition,
247        line_number: Option<usize>,
248        column_number: Option<usize>,
249    ) {
250        let xpath = if !self.element_path.is_empty() {
251            Some(format!("/{}", self.element_path.join("/")))
252        } else {
253            None
254        };
255
256        let comment_struct =
257            Comment::with_location(comment, position, xpath, line_number, column_number);
258
259        if self.element_path.is_empty()
260            || matches!(position, CommentPosition::Before | CommentPosition::After)
261        {
262            // Document-level or standalone comment
263            self.extensions
264                .add_document_comment_structured(comment_struct);
265        } else {
266            // Element-level comment - add to current extension or buffer for later association
267            if let Some(ref mut ext) = self.current_extension {
268                ext.comments.push(comment_struct);
269            } else {
270                // Store for later association with the next element
271                self.extensions
272                    .add_document_comment_structured(comment_struct);
273            }
274        }
275    }
276
277    /// Get the accumulated extensions
278    pub fn into_extensions(self) -> Extensions {
279        self.extensions
280    }
281}
282
283/// Extension-aware XML parser
284pub struct ExtensionAwareParser {
285    /// Extension capture context
286    pub context: ExtensionCaptureContext,
287
288    /// Whether to capture extensions
289    pub capture_extensions: bool,
290}
291
292impl ExtensionAwareParser {
293    /// Create a new extension-aware parser
294    pub fn new(capture_extensions: bool) -> Self {
295        Self {
296            context: ExtensionCaptureContext::new(),
297            capture_extensions,
298        }
299    }
300
301    /// Parse XML with extension capture
302    pub fn parse_with_extensions(
303        &mut self,
304        xml_content: &str,
305    ) -> Result<Extensions, Box<dyn std::error::Error>> {
306        if !self.capture_extensions {
307            return Ok(Extensions::new());
308        }
309
310        let mut reader = Reader::from_str(xml_content);
311        reader.config_mut().trim_text(true);
312
313        let mut buf = Vec::new();
314
315        loop {
316            match reader.read_event_into(&mut buf) {
317                Ok(Event::Start(ref e)) => {
318                    let element_name_bytes = e.name();
319                    let element_name =
320                        std::str::from_utf8(element_name_bytes.as_ref()).unwrap_or("unknown");
321
322                    // Extract namespace information
323                    let (namespace_uri, namespace_prefix) = self.extract_namespace_info(e);
324
325                    // Update namespace context with any new declarations
326                    for attr in e.attributes().flatten() {
327                        let key = std::str::from_utf8(attr.key.as_ref()).unwrap_or("");
328                        if key.starts_with("xmlns") {
329                            let prefix = if key == "xmlns" {
330                                "".to_string()
331                            } else {
332                                key.strip_prefix("xmlns:").unwrap_or("").to_string()
333                            };
334                            let uri = String::from_utf8_lossy(&attr.value).to_string();
335                            self.context.add_namespace_declaration(prefix, uri);
336                        }
337                    }
338
339                    // Check if we should capture this element
340                    if self
341                        .context
342                        .should_capture_element(element_name, namespace_uri.as_deref())
343                    {
344                        if !self.context.in_extension {
345                            self.context.start_extension_capture(
346                                element_name,
347                                namespace_uri.as_deref(),
348                                namespace_prefix.as_deref(),
349                            );
350                        }
351                        self.context.process_extension_start_tag(e);
352                    } else {
353                        self.context.enter_element(element_name);
354                    }
355                }
356                Ok(Event::End(ref e)) => {
357                    if self.context.in_extension {
358                        self.context.process_extension_end_tag(e);
359                    } else {
360                        self.context.exit_element();
361                    }
362                }
363                Ok(Event::Text(ref e)) => {
364                    if self.context.in_extension {
365                        self.context.process_extension_text(e);
366                    }
367                }
368                Ok(Event::Comment(ref e)) => {
369                    let comment = String::from_utf8_lossy(e);
370                    if self.context.in_extension {
371                        self.context
372                            .add_extension_content(&format!("<!--{}-->", comment));
373                    } else {
374                        // Determine comment position based on context
375                        let position = if self.context.element_path.is_empty() {
376                            CommentPosition::Before
377                        } else {
378                            CommentPosition::FirstChild
379                        };
380
381                        self.context.add_comment_with_position(
382                            comment.trim().to_string(),
383                            position,
384                            Some(self.context.current_line),
385                            Some(self.context.current_column),
386                        );
387                    }
388                }
389                Ok(Event::PI(ref e)) => {
390                    let content = String::from_utf8_lossy(e);
391                    // Split processing instruction content into target and data
392                    if let Some(space_pos) = content.find(char::is_whitespace) {
393                        let target = content[..space_pos].to_string();
394                        let data = content[space_pos..].trim().to_string();
395                        let data = if data.is_empty() { None } else { Some(data) };
396                        self.context.add_processing_instruction(target, data);
397                    } else {
398                        self.context
399                            .add_processing_instruction(content.to_string(), None);
400                    }
401                }
402                Ok(Event::Eof) => break,
403                Err(e) => {
404                    // Log the error but continue parsing to capture as much as possible
405                    warn!("XML parsing error during extension capture: {}", e);
406                }
407                _ => {}
408            }
409            buf.clear();
410        }
411
412        Ok(self.context.extensions.clone())
413    }
414
415    /// Extract namespace information from a start tag
416    fn extract_namespace_info(&self, event: &BytesStart) -> (Option<String>, Option<String>) {
417        let name_bytes = event.name();
418        let name = std::str::from_utf8(name_bytes.as_ref()).unwrap_or("unknown");
419
420        if let Some(colon_pos) = name.find(':') {
421            let prefix = &name[..colon_pos];
422            let namespace_uri = self.context.namespace_context.get(prefix).cloned();
423            (namespace_uri, Some(prefix.to_string()))
424        } else {
425            // Check for default namespace
426            let default_ns = self.context.namespace_context.get("").cloned();
427            (default_ns, None)
428        }
429    }
430}
431
432/// Utility functions for extension capture
433pub mod capture_utils {
434    use super::*;
435
436    /// Extract all extensions from XML content
437    pub fn extract_extensions(xml_content: &str) -> Result<Extensions, Box<dyn std::error::Error>> {
438        let mut parser = ExtensionAwareParser::new(true);
439        parser.parse_with_extensions(xml_content)
440    }
441
442    /// Check if XML content contains extensions
443    pub fn has_extensions(xml_content: &str) -> bool {
444        match extract_extensions(xml_content) {
445            Ok(extensions) => !extensions.is_empty(),
446            Err(_) => false,
447        }
448    }
449
450    /// Get extension statistics from XML content
451    pub fn get_extension_stats(xml_content: &str) -> ExtensionStats {
452        match extract_extensions(xml_content) {
453            Ok(extensions) => ExtensionStats::from_extensions(&extensions),
454            Err(_) => ExtensionStats::default(),
455        }
456    }
457
458    /// Extension statistics
459    #[derive(Debug, Clone, Default)]
460    pub struct ExtensionStats {
461        pub fragment_count: usize,
462        pub namespace_count: usize,
463        pub comment_count: usize,
464        pub processing_instruction_count: usize,
465        pub unique_namespaces: Vec<String>,
466    }
467
468    impl ExtensionStats {
469        fn from_extensions(extensions: &Extensions) -> Self {
470            let unique_namespaces = extensions.global_namespaces.values().cloned().collect();
471
472            Self {
473                fragment_count: extensions.fragments.len(),
474                namespace_count: extensions.global_namespaces.len(),
475                comment_count: extensions.document_comments.len(),
476                processing_instruction_count: extensions.document_processing_instructions.len(),
477                unique_namespaces,
478            }
479        }
480    }
481}
482
#[cfg(test)]
mod tests {
    use super::*;

    /// The element path grows and shrinks as elements are entered and exited.
    #[test]
    fn test_extension_capture_context() {
        let mut ctx = ExtensionCaptureContext::new();

        for name in ["message", "header"] {
            ctx.enter_element(name);
        }
        assert_eq!(ctx.current_path(), "message/header");

        ctx.exit_element();
        assert_eq!(ctx.current_path(), "message");
    }

    /// Capture decisions depend on whether the namespace is a DDEX one.
    #[test]
    fn test_namespace_detection() {
        let ctx = ExtensionCaptureContext::new();

        // DDEX namespace: leave the element alone.
        let ddex = ctx.should_capture_element("Release", Some("http://ddex.net/xml/ern/43"));
        assert!(!ddex);

        // Foreign namespace: capture it.
        let custom = ctx.should_capture_element("customElement", Some("http://example.com/custom"));
        assert!(custom);
    }

    /// A document with a custom namespace yields extensions and registers
    /// that namespace globally.
    #[test]
    fn test_extension_parsing() {
        let document = r#"<?xml version="1.0" encoding="UTF-8"?>
<ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43" xmlns:custom="http://example.com/custom">
  <MessageHeader>
    <MessageId>MSG123</MessageId>
    <custom:CustomField>Custom Value</custom:CustomField>
  </MessageHeader>
  <custom:CustomSection attr="value">
    <custom:NestedElement>Nested Content</custom:NestedElement>
  </custom:CustomSection>
</ern:NewReleaseMessage>"#;

        let captured = capture_utils::extract_extensions(document).unwrap();

        assert!(!captured.is_empty());
        assert!(captured.global_namespaces.contains_key("custom"));
        assert_eq!(
            captured.global_namespaces["custom"],
            "http://example.com/custom"
        );
    }

    /// A processing instruction before the root element is recorded with its
    /// target.
    #[test]
    fn test_processing_instruction_capture() {
        let document = r#"<?xml version="1.0" encoding="UTF-8"?>
<?custom-instruction data="value"?>
<root>content</root>"#;

        let captured = capture_utils::extract_extensions(document).unwrap();
        let instructions = &captured.document_processing_instructions;

        assert!(!instructions.is_empty());
        assert_eq!(instructions[0].target, "custom-instruction");
    }

    /// Comments in the document end up in the document comment list.
    #[test]
    fn test_comment_capture() {
        let document = r#"<?xml version="1.0" encoding="UTF-8"?>
<!-- This is a document comment -->
<root>
  <!-- This is an element comment -->
  content
</root>"#;

        let captured = capture_utils::extract_extensions(document).unwrap();

        assert!(!captured.document_comments.is_empty());
    }
}