1use crate::utf8_utils;
8use ddex_core::models::{
9 extensions::utils, Comment, CommentPosition, Extensions, ProcessingInstruction, XmlFragment,
10};
11use indexmap::IndexMap;
12use log::warn;
13use quick_xml::{
14 events::{BytesEnd, BytesStart, BytesText, Event},
15 Reader,
16};
17
18#[derive(Debug, Clone)]
20pub struct ExtensionCaptureContext {
21 pub element_path: Vec<String>,
23
24 pub namespace_context: IndexMap<String, String>,
26
27 pub in_extension: bool,
29
30 pub extension_depth: usize,
32
33 pub extension_buffer: String,
35
36 pub current_extension: Option<XmlFragment>,
38
39 pub extensions: Extensions,
41
42 pub current_line: usize,
44
45 pub current_column: usize,
47}
48
49impl Default for ExtensionCaptureContext {
50 fn default() -> Self {
51 Self::new()
52 }
53}
54
55impl ExtensionCaptureContext {
56 pub fn new() -> Self {
58 Self {
59 element_path: Vec::new(),
60 namespace_context: IndexMap::new(),
61 in_extension: false,
62 extension_depth: 0,
63 extension_buffer: String::new(),
64 current_extension: None,
65 extensions: Extensions::new(),
66 current_line: 1,
67 current_column: 1,
68 }
69 }
70
71 pub fn enter_element(&mut self, element_name: &str) {
73 self.element_path.push(element_name.to_string());
74 }
75
76 pub fn exit_element(&mut self) -> Option<String> {
78 self.element_path.pop()
79 }
80
81 pub fn current_path(&self) -> String {
83 self.element_path.join("/")
84 }
85
86 pub fn add_namespace_declaration(&mut self, prefix: String, uri: String) {
88 self.namespace_context.insert(prefix.clone(), uri.clone());
89
90 if !utils::is_ddex_namespace(&uri) {
92 self.extensions.add_global_namespace(prefix, uri);
93 }
94 }
95
96 pub fn should_capture_element(&self, _element_name: &str, namespace_uri: Option<&str>) -> bool {
98 if self.in_extension {
100 return true;
101 }
102
103 if let Some(ns_uri) = namespace_uri {
105 return !utils::is_ddex_namespace(ns_uri);
106 }
107
108 false
111 }
112
113 pub fn start_extension_capture(
115 &mut self,
116 element_name: &str,
117 namespace_uri: Option<&str>,
118 namespace_prefix: Option<&str>,
119 ) {
120 self.in_extension = true;
121 self.extension_depth = 1;
122 self.extension_buffer.clear();
123
124 self.current_extension = Some(XmlFragment::with_namespace(
125 element_name.to_string(),
126 namespace_uri.map(String::from),
127 namespace_prefix.map(String::from),
128 String::new(), ));
130 }
131
132 pub fn add_extension_content(&mut self, content: &str) {
134 if self.in_extension {
135 self.extension_buffer.push_str(content);
136 }
137 }
138
139 pub fn process_extension_start_tag(&mut self, event: &BytesStart) {
141 if !self.in_extension {
142 return;
143 }
144
145 self.extension_depth += 1;
146 self.extension_buffer.push('<');
147 let element_name = utf8_utils::process_text_content_lossy(event.name().as_ref());
148 self.extension_buffer.push_str(&element_name);
149
150 for attr in event.attributes().flatten() {
152 self.extension_buffer.push(' ');
153 let key = utf8_utils::process_text_content_lossy(attr.key.as_ref());
154 let value = utf8_utils::process_text_content_lossy(&attr.value);
155
156 self.extension_buffer.push_str(&key);
157 self.extension_buffer.push_str("=\"");
158 self.extension_buffer.push_str(&value);
159 self.extension_buffer.push('"');
160
161 if let Some(ref mut ext) = self.current_extension {
163 ext.add_attribute(key, value);
164 }
165 }
166
167 self.extension_buffer.push('>');
168 }
169
170 pub fn process_extension_end_tag(&mut self, event: &BytesEnd) {
172 if !self.in_extension {
173 return;
174 }
175
176 self.extension_buffer.push_str("</");
177 self.extension_buffer
178 .push_str(std::str::from_utf8(event.name().as_ref()).unwrap_or("unknown"));
179 self.extension_buffer.push('>');
180
181 self.extension_depth -= 1;
182
183 if self.extension_depth == 0 {
185 self.finish_extension_capture();
186 }
187 }
188
189 pub fn process_extension_text(&mut self, event: &BytesText) {
191 if !self.in_extension {
192 return;
193 }
194
195 let text = event.unescape().unwrap_or_default();
196 self.extension_buffer.push_str(&text);
197
198 if let Some(ref mut ext) = self.current_extension {
200 if ext.children.is_empty() {
201 ext.text_content = Some(text.to_string());
202 }
203 }
204 }
205
206 pub fn finish_extension_capture(&mut self) {
208 if let Some(mut extension) = self.current_extension.take() {
209 extension.raw_content = self.extension_buffer.clone();
210
211 let namespace_uri = extension.namespace_uri.as_deref();
213 let location_key = utils::generate_location_key(
214 &self
215 .element_path
216 .iter()
217 .map(|s| s.as_str())
218 .collect::<Vec<_>>(),
219 namespace_uri,
220 &extension.element_name,
221 );
222
223 self.extensions.add_fragment(location_key, extension);
224 }
225
226 self.in_extension = false;
227 self.extension_depth = 0;
228 self.extension_buffer.clear();
229 }
230
231 pub fn add_processing_instruction(&mut self, target: String, data: Option<String>) {
233 let pi = ProcessingInstruction::new(target, data);
234 self.extensions.add_document_processing_instruction(pi);
235 }
236
237 pub fn add_comment(&mut self, comment: String) {
239 self.extensions.add_document_comment(comment);
240 }
241
242 pub fn add_comment_with_position(
244 &mut self,
245 comment: String,
246 position: CommentPosition,
247 line_number: Option<usize>,
248 column_number: Option<usize>,
249 ) {
250 let xpath = if !self.element_path.is_empty() {
251 Some(format!("/{}", self.element_path.join("/")))
252 } else {
253 None
254 };
255
256 let comment_struct =
257 Comment::with_location(comment, position, xpath, line_number, column_number);
258
259 if self.element_path.is_empty()
260 || matches!(position, CommentPosition::Before | CommentPosition::After)
261 {
262 self.extensions
264 .add_document_comment_structured(comment_struct);
265 } else {
266 if let Some(ref mut ext) = self.current_extension {
268 ext.comments.push(comment_struct);
269 } else {
270 self.extensions
272 .add_document_comment_structured(comment_struct);
273 }
274 }
275 }
276
277 pub fn into_extensions(self) -> Extensions {
279 self.extensions
280 }
281}
282
283pub struct ExtensionAwareParser {
285 pub context: ExtensionCaptureContext,
287
288 pub capture_extensions: bool,
290}
291
292impl ExtensionAwareParser {
293 pub fn new(capture_extensions: bool) -> Self {
295 Self {
296 context: ExtensionCaptureContext::new(),
297 capture_extensions,
298 }
299 }
300
301 pub fn parse_with_extensions(
303 &mut self,
304 xml_content: &str,
305 ) -> Result<Extensions, Box<dyn std::error::Error>> {
306 if !self.capture_extensions {
307 return Ok(Extensions::new());
308 }
309
310 let mut reader = Reader::from_str(xml_content);
311 reader.config_mut().trim_text(true);
312
313 let mut buf = Vec::new();
314
315 loop {
316 match reader.read_event_into(&mut buf) {
317 Ok(Event::Start(ref e)) => {
318 let element_name_bytes = e.name();
319 let element_name =
320 std::str::from_utf8(element_name_bytes.as_ref()).unwrap_or("unknown");
321
322 let (namespace_uri, namespace_prefix) = self.extract_namespace_info(e);
324
325 for attr in e.attributes().flatten() {
327 let key = std::str::from_utf8(attr.key.as_ref()).unwrap_or("");
328 if key.starts_with("xmlns") {
329 let prefix = if key == "xmlns" {
330 "".to_string()
331 } else {
332 key.strip_prefix("xmlns:").unwrap_or("").to_string()
333 };
334 let uri = String::from_utf8_lossy(&attr.value).to_string();
335 self.context.add_namespace_declaration(prefix, uri);
336 }
337 }
338
339 if self
341 .context
342 .should_capture_element(element_name, namespace_uri.as_deref())
343 {
344 if !self.context.in_extension {
345 self.context.start_extension_capture(
346 element_name,
347 namespace_uri.as_deref(),
348 namespace_prefix.as_deref(),
349 );
350 }
351 self.context.process_extension_start_tag(e);
352 } else {
353 self.context.enter_element(element_name);
354 }
355 }
356 Ok(Event::End(ref e)) => {
357 if self.context.in_extension {
358 self.context.process_extension_end_tag(e);
359 } else {
360 self.context.exit_element();
361 }
362 }
363 Ok(Event::Text(ref e)) => {
364 if self.context.in_extension {
365 self.context.process_extension_text(e);
366 }
367 }
368 Ok(Event::Comment(ref e)) => {
369 let comment = String::from_utf8_lossy(e);
370 if self.context.in_extension {
371 self.context
372 .add_extension_content(&format!("<!--{}-->", comment));
373 } else {
374 let position = if self.context.element_path.is_empty() {
376 CommentPosition::Before
377 } else {
378 CommentPosition::FirstChild
379 };
380
381 self.context.add_comment_with_position(
382 comment.trim().to_string(),
383 position,
384 Some(self.context.current_line),
385 Some(self.context.current_column),
386 );
387 }
388 }
389 Ok(Event::PI(ref e)) => {
390 let content = String::from_utf8_lossy(e);
391 if let Some(space_pos) = content.find(char::is_whitespace) {
393 let target = content[..space_pos].to_string();
394 let data = content[space_pos..].trim().to_string();
395 let data = if data.is_empty() { None } else { Some(data) };
396 self.context.add_processing_instruction(target, data);
397 } else {
398 self.context
399 .add_processing_instruction(content.to_string(), None);
400 }
401 }
402 Ok(Event::Eof) => break,
403 Err(e) => {
404 warn!("XML parsing error during extension capture: {}", e);
406 }
407 _ => {}
408 }
409 buf.clear();
410 }
411
412 Ok(self.context.extensions.clone())
413 }
414
415 fn extract_namespace_info(&self, event: &BytesStart) -> (Option<String>, Option<String>) {
417 let name_bytes = event.name();
418 let name = std::str::from_utf8(name_bytes.as_ref()).unwrap_or("unknown");
419
420 if let Some(colon_pos) = name.find(':') {
421 let prefix = &name[..colon_pos];
422 let namespace_uri = self.context.namespace_context.get(prefix).cloned();
423 (namespace_uri, Some(prefix.to_string()))
424 } else {
425 let default_ns = self.context.namespace_context.get("").cloned();
427 (default_ns, None)
428 }
429 }
430}
431
432pub mod capture_utils {
434 use super::*;
435
436 pub fn extract_extensions(xml_content: &str) -> Result<Extensions, Box<dyn std::error::Error>> {
438 let mut parser = ExtensionAwareParser::new(true);
439 parser.parse_with_extensions(xml_content)
440 }
441
442 pub fn has_extensions(xml_content: &str) -> bool {
444 match extract_extensions(xml_content) {
445 Ok(extensions) => !extensions.is_empty(),
446 Err(_) => false,
447 }
448 }
449
450 pub fn get_extension_stats(xml_content: &str) -> ExtensionStats {
452 match extract_extensions(xml_content) {
453 Ok(extensions) => ExtensionStats::from_extensions(&extensions),
454 Err(_) => ExtensionStats::default(),
455 }
456 }
457
458 #[derive(Debug, Clone, Default)]
460 pub struct ExtensionStats {
461 pub fragment_count: usize,
462 pub namespace_count: usize,
463 pub comment_count: usize,
464 pub processing_instruction_count: usize,
465 pub unique_namespaces: Vec<String>,
466 }
467
468 impl ExtensionStats {
469 fn from_extensions(extensions: &Extensions) -> Self {
470 let unique_namespaces = extensions.global_namespaces.values().cloned().collect();
471
472 Self {
473 fragment_count: extensions.fragments.len(),
474 namespace_count: extensions.global_namespaces.len(),
475 comment_count: extensions.document_comments.len(),
476 processing_instruction_count: extensions.document_processing_instructions.len(),
477 unique_namespaces,
478 }
479 }
480 }
481}
482
#[cfg(test)]
mod tests {
    use super::*;

    /// Entering and leaving elements is reflected in the joined path.
    #[test]
    fn test_extension_capture_context() {
        let mut ctx = ExtensionCaptureContext::new();

        for name in ["message", "header"] {
            ctx.enter_element(name);
        }
        assert_eq!(ctx.current_path(), "message/header");

        ctx.exit_element();
        assert_eq!(ctx.current_path(), "message");
    }

    /// DDEX namespaces are not captured; foreign namespaces are.
    #[test]
    fn test_namespace_detection() {
        let ctx = ExtensionCaptureContext::new();

        let ddex = ctx.should_capture_element("Release", Some("http://ddex.net/xml/ern/43"));
        assert!(!ddex);

        let custom = ctx.should_capture_element("customElement", Some("http://example.com/custom"));
        assert!(custom);
    }

    /// A document with a custom namespace yields non-empty extensions and
    /// registers that namespace globally.
    #[test]
    fn test_extension_parsing() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43" xmlns:custom="http://example.com/custom">
    <MessageHeader>
        <MessageId>MSG123</MessageId>
        <custom:CustomField>Custom Value</custom:CustomField>
    </MessageHeader>
    <custom:CustomSection attr="value">
        <custom:NestedElement>Nested Content</custom:NestedElement>
    </custom:CustomSection>
</ern:NewReleaseMessage>"#;

        let captured = capture_utils::extract_extensions(xml).unwrap();
        assert!(!captured.is_empty());
        assert!(captured.global_namespaces.contains_key("custom"));
        assert_eq!(
            captured.global_namespaces["custom"],
            "http://example.com/custom"
        );
    }

    /// A processing instruction before the root element is recorded with its target.
    #[test]
    fn test_processing_instruction_capture() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<?custom-instruction data="value"?>
<root>content</root>"#;

        let captured = capture_utils::extract_extensions(xml).unwrap();
        let instructions = &captured.document_processing_instructions;
        assert!(!instructions.is_empty());
        assert_eq!(instructions[0].target, "custom-instruction");
    }

    /// Comments outside extension content end up in the document comments.
    #[test]
    fn test_comment_capture() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<!-- This is a document comment -->
<root>
    <!-- This is an element comment -->
    content
</root>"#;

        let captured = capture_utils::extract_extensions(xml).unwrap();
        assert!(!captured.document_comments.is_empty());
    }
}
558}