ddex_parser/parser/
attribute_extractor.rs

1//! # XML Attribute Extraction and Processing
2//!
3//! This module provides comprehensive attribute extraction from XML elements,
4//! handling namespace resolution, special attributes, and proper type conversion
5//! for both standard DDEX and custom/proprietary attributes.
6
7use crate::error::ParseError;
8use crate::parser::namespace_detector::NamespaceContext;
9use ddex_core::models::{AttributeMap, AttributeType, AttributeValue, QName};
10use indexmap::IndexMap;
11use quick_xml::events::{attributes::Attribute, BytesStart};
12use std::collections::HashMap;
13use tracing::{debug, warn};
14
/// Comprehensive attribute extractor with namespace awareness.
///
/// Holds two lookup tables that are filled once by [`AttributeExtractor::new`]:
/// a name -> type table used to turn raw attribute text into typed values, and
/// a name -> handler table for attributes that need custom post-processing.
#[derive(Debug, Clone)]
pub struct AttributeExtractor {
    /// Known DDEX attribute types for proper parsing, keyed by attribute name.
    /// Lookups try the full XML name first and then the local name.
    ddex_attribute_types: HashMap<String, AttributeType>,
    /// Special attribute handlers, keyed by the attribute's XML name
    /// (for example `xsi:type` or `SequenceNumber`).
    special_attributes: IndexMap<String, SpecialAttributeHandler>,
}
23
/// Special attribute handler for attributes requiring custom processing.
///
/// Each variant selects one of the `process_*` routines of
/// [`AttributeExtractor`]. The enum carries no data, so it is `Copy` and can be
/// compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SpecialAttributeHandler {
    /// xsi:type attribute (XML Schema instance type)
    XsiType,
    /// xsi:schemaLocation attribute
    XsiSchemaLocation,
    /// xsi:noNamespaceSchemaLocation attribute
    XsiNoNamespaceSchemaLocation,
    /// xsi:nil attribute (indicates null value)
    XsiNil,
    /// Namespace declaration attributes (xmlns, xmlns:*)
    NamespaceDeclaration,
    /// Language and territory codes
    LanguageAndTerritory,
    /// Sequence numbers and ordering
    SequenceNumber,
    /// Boolean flags
    BooleanFlag,
}
44
/// Attribute extraction result.
///
/// Produced by `AttributeExtractor::extract_attributes` for a single XML
/// element. The same attribute can appear in several of the views below
/// (for example in `attributes` and in `special_attributes`).
#[derive(Debug, Clone)]
pub struct AttributeExtractionResult {
    /// All extracted attributes with proper typing
    pub attributes: AttributeMap,
    /// Standard DDEX attributes (subset of all attributes)
    pub standard_attributes: IndexMap<QName, AttributeValue>,
    /// Extension/custom attributes
    pub extension_attributes: IndexMap<QName, AttributeValue>,
    /// Namespace declarations found in this element, keyed by prefix.
    /// The default namespace (`xmlns="..."`) is stored under the empty string.
    pub namespace_declarations: IndexMap<String, String>,
    /// Special attributes requiring additional processing
    pub special_attributes: IndexMap<QName, SpecialAttributeValue>,
    /// Warnings about attribute processing
    pub warnings: Vec<String>,
}
61
/// Special attribute values requiring custom handling.
///
/// These are the structured results of running a [`SpecialAttributeHandler`]
/// over an attribute's value.
#[derive(Debug, Clone, PartialEq)]
pub enum SpecialAttributeValue {
    /// xsi:type with resolved type information
    XsiType {
        /// Local part of the type name (text after the prefix, if any).
        type_name: String,
        /// Namespace URI the type prefix resolved to, when it could be resolved.
        namespace_uri: Option<String>,
        /// Reserved for a schema-resolved type name.
        resolved_type: Option<String>,
    },
    /// Schema location with URI pairs
    SchemaLocation {
        locations: IndexMap<String, String>, // namespace_uri -> schema_location
    },
    /// No namespace schema location
    NoNamespaceSchemaLocation(String),
    /// Nil indicator
    Nil(bool),
    /// Language with territory code
    Language {
        /// Primary language subtag (text before the first `-`).
        language: String,
        /// Script subtag, when known.
        script: Option<String>,
        /// Territory/region subtag, when present.
        territory: Option<String>,
    },
    /// Territory code list
    Territory(Vec<String>),
    /// Sequence number for ordering
    Sequence(u32),
    /// Boolean flag
    Flag(bool),
}
92
93impl AttributeExtractor {
94    /// Create a new attribute extractor with DDEX knowledge
95    pub fn new() -> Self {
96        let mut extractor = Self {
97            ddex_attribute_types: HashMap::new(),
98            special_attributes: IndexMap::new(),
99        };
100
101        extractor.initialize_ddex_attributes();
102        extractor.initialize_special_handlers();
103        extractor
104    }
105
106    /// Initialize known DDEX attribute types
107    fn initialize_ddex_attributes(&mut self) {
108        // Language and territory attributes
109        self.ddex_attribute_types
110            .insert("LanguageAndScriptCode".to_string(), AttributeType::Language);
111        self.ddex_attribute_types
112            .insert("ApplicableTerritoryCode".to_string(), AttributeType::String);
113
114        // Boolean attributes
115        self.ddex_attribute_types
116            .insert("IsDefault".to_string(), AttributeType::Boolean);
117        self.ddex_attribute_types
118            .insert("IsMainArtist".to_string(), AttributeType::Boolean);
119        self.ddex_attribute_types
120            .insert("HasChanged".to_string(), AttributeType::Boolean);
121
122        // Numeric attributes
123        self.ddex_attribute_types
124            .insert("SequenceNumber".to_string(), AttributeType::Integer);
125        self.ddex_attribute_types
126            .insert("Duration".to_string(), AttributeType::String); // ISO 8601 duration
127
128        // URI attributes
129        self.ddex_attribute_types
130            .insert("Namespace".to_string(), AttributeType::Uri);
131
132        // Date/time attributes
133        self.ddex_attribute_types
134            .insert("CreatedDateTime".to_string(), AttributeType::DateTime);
135        self.ddex_attribute_types
136            .insert("UpdatedDateTime".to_string(), AttributeType::DateTime);
137    }
138
139    /// Initialize special attribute handlers
140    fn initialize_special_handlers(&mut self) {
141        // XML Schema Instance attributes
142        self.special_attributes
143            .insert("xsi:type".to_string(), SpecialAttributeHandler::XsiType);
144        self.special_attributes.insert(
145            "xsi:schemaLocation".to_string(),
146            SpecialAttributeHandler::XsiSchemaLocation,
147        );
148        self.special_attributes.insert(
149            "xsi:noNamespaceSchemaLocation".to_string(),
150            SpecialAttributeHandler::XsiNoNamespaceSchemaLocation,
151        );
152        self.special_attributes
153            .insert("xsi:nil".to_string(), SpecialAttributeHandler::XsiNil);
154
155        // Namespace declarations
156        self.special_attributes.insert(
157            "xmlns".to_string(),
158            SpecialAttributeHandler::NamespaceDeclaration,
159        );
160        // Note: xmlns:* are handled dynamically
161
162        // DDEX specific
163        self.special_attributes.insert(
164            "LanguageAndScriptCode".to_string(),
165            SpecialAttributeHandler::LanguageAndTerritory,
166        );
167        self.special_attributes.insert(
168            "ApplicableTerritoryCode".to_string(),
169            SpecialAttributeHandler::LanguageAndTerritory,
170        );
171        self.special_attributes.insert(
172            "SequenceNumber".to_string(),
173            SpecialAttributeHandler::SequenceNumber,
174        );
175
176        // Boolean flags
177        self.special_attributes.insert(
178            "IsDefault".to_string(),
179            SpecialAttributeHandler::BooleanFlag,
180        );
181        self.special_attributes.insert(
182            "IsMainArtist".to_string(),
183            SpecialAttributeHandler::BooleanFlag,
184        );
185    }
186
187    /// Extract all attributes from an XML element
188    pub fn extract_attributes(
189        &self,
190        element: &BytesStart,
191        namespace_context: &NamespaceContext,
192    ) -> Result<AttributeExtractionResult, ParseError> {
193        let mut attributes = AttributeMap::new();
194        let mut namespace_declarations = IndexMap::new();
195        let mut special_attributes = IndexMap::new();
196        let warnings = Vec::new();
197
198        debug!(
199            "Extracting attributes from element: {}",
200            String::from_utf8_lossy(element.name().as_ref())
201        );
202
203        // Process all attributes
204        for attr_result in element.attributes() {
205            let attr = attr_result.map_err(|e| ParseError::XmlError(format!("Failed to read attribute: {}", e)))?;
206
207            let (qname, attr_value) = self.process_attribute(&attr, namespace_context)?;
208
209            // Handle namespace declarations separately
210            if qname.is_namespace_declaration() {
211                let prefix = if qname.local_name == "xmlns" {
212                    "".to_string() // Default namespace
213                } else {
214                    qname.local_name.clone() // Prefixed namespace
215                };
216                namespace_declarations.insert(prefix, attr_value.to_xml_value());
217                debug!(
218                    "Found namespace declaration: {}={}",
219                    qname.to_xml_name(),
220                    attr_value.to_xml_value()
221                );
222            }
223
224            // Check for special attributes
225            if let Some(special_value) =
226                self.process_special_attribute(&qname, &attr_value, namespace_context)?
227            {
228                special_attributes.insert(qname.clone(), special_value);
229            }
230
231            // Add to main attribute map
232            attributes.insert(qname, attr_value);
233        }
234
235        // Separate standard and extension attributes
236        let standard_attributes = attributes.standard_attributes();
237        let extension_attributes = attributes.extension_attributes();
238
239        debug!(
240            "Extracted {} total attributes ({} standard, {} extensions)",
241            attributes.len(),
242            standard_attributes.len(),
243            extension_attributes.len()
244        );
245
246        Ok(AttributeExtractionResult {
247            attributes,
248            standard_attributes,
249            extension_attributes,
250            namespace_declarations,
251            special_attributes,
252            warnings,
253        })
254    }
255
256    /// Process a single attribute
257    fn process_attribute(
258        &self,
259        attr: &Attribute,
260        namespace_context: &NamespaceContext,
261    ) -> Result<(QName, AttributeValue), ParseError> {
262        let attr_name = String::from_utf8_lossy(attr.key.as_ref());
263        let attr_value = String::from_utf8_lossy(&attr.value);
264
265        debug!("Processing attribute: {}={}", attr_name, attr_value);
266
267        // Create QName with namespace resolution
268        let qname = self.resolve_attribute_qname(&attr_name, namespace_context);
269
270        // Determine attribute type and parse value
271        let parsed_value = if let Some(attr_type) = self.get_attribute_type(&qname) {
272            AttributeValue::parse_with_type(&attr_value, attr_type).unwrap_or_else(|e| {
273                warn!(
274                    "Failed to parse attribute {} as {:?}: {}",
275                    qname, attr_type, e
276                );
277                AttributeValue::Raw(attr_value.to_string())
278            })
279        } else {
280            // Default to string for unknown attributes
281            AttributeValue::String(attr_value.to_string())
282        };
283
284        Ok((qname, parsed_value))
285    }
286
287    /// Resolve attribute name to QName with namespace context
288    fn resolve_attribute_qname(
289        &self,
290        attr_name: &str,
291        namespace_context: &NamespaceContext,
292    ) -> QName {
293        if let Some((prefix, local_name)) = attr_name.split_once(':') {
294            // Prefixed attribute
295            if let Some(namespace_uri) = namespace_context.current_scope.resolve_prefix(prefix) {
296                QName::with_prefix_and_namespace(local_name, prefix, namespace_uri)
297            } else {
298                // Unresolved prefix - keep as is with warning
299                warn!("Unresolved namespace prefix in attribute: {}", attr_name);
300                QName {
301                    local_name: local_name.to_string(),
302                    namespace_uri: None,
303                    prefix: Some(prefix.to_string()),
304                }
305            }
306        } else {
307            // Non-prefixed attribute - check if it's a namespace declaration
308            if attr_name == "xmlns" || attr_name.starts_with("xmlns:") {
309                QName::new(attr_name)
310            } else {
311                // Regular attribute without namespace
312                QName::new(attr_name)
313            }
314        }
315    }
316
317    /// Get the expected type for an attribute
318    fn get_attribute_type(&self, qname: &QName) -> Option<AttributeType> {
319        // Check by full qualified name first
320        if let Some(attr_type) = self.ddex_attribute_types.get(&qname.to_xml_name()) {
321            return Some(*attr_type);
322        }
323
324        // Check by local name
325        self.ddex_attribute_types.get(&qname.local_name).copied()
326    }
327
328    /// Process special attributes that require custom handling
329    fn process_special_attribute(
330        &self,
331        qname: &QName,
332        value: &AttributeValue,
333        namespace_context: &NamespaceContext,
334    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
335        let attr_name = qname.to_xml_name();
336
337        if let Some(handler) = self.special_attributes.get(&attr_name) {
338            match handler {
339                SpecialAttributeHandler::XsiType => self.process_xsi_type(value, namespace_context),
340                SpecialAttributeHandler::XsiSchemaLocation => self.process_schema_location(value),
341                SpecialAttributeHandler::XsiNoNamespaceSchemaLocation => Ok(Some(
342                    SpecialAttributeValue::NoNamespaceSchemaLocation(value.to_xml_value()),
343                )),
344                SpecialAttributeHandler::XsiNil => self.process_xsi_nil(value),
345                SpecialAttributeHandler::NamespaceDeclaration => {
346                    // Already handled in main extraction
347                    Ok(None)
348                }
349                SpecialAttributeHandler::LanguageAndTerritory => {
350                    self.process_language_territory(value)
351                }
352                SpecialAttributeHandler::SequenceNumber => self.process_sequence_number(value),
353                SpecialAttributeHandler::BooleanFlag => self.process_boolean_flag(value),
354            }
355        } else {
356            Ok(None)
357        }
358    }
359
360    /// Process xsi:type attribute
361    fn process_xsi_type(
362        &self,
363        value: &AttributeValue,
364        namespace_context: &NamespaceContext,
365    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
366        let type_value = value.to_xml_value();
367
368        if let Some((prefix, local_name)) = type_value.split_once(':') {
369            // Prefixed type
370            let namespace_uri = namespace_context.current_scope.resolve_prefix(prefix);
371            Ok(Some(SpecialAttributeValue::XsiType {
372                type_name: local_name.to_string(),
373                namespace_uri,
374                resolved_type: None, // Could be resolved later with schema information
375            }))
376        } else {
377            // Non-prefixed type
378            Ok(Some(SpecialAttributeValue::XsiType {
379                type_name: type_value,
380                namespace_uri: None,
381                resolved_type: None,
382            }))
383        }
384    }
385
386    /// Process xsi:schemaLocation attribute
387    fn process_schema_location(
388        &self,
389        value: &AttributeValue,
390    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
391        let location_value = value.to_xml_value();
392        let mut locations = IndexMap::new();
393
394        // Schema locations are space-separated pairs: namespace_uri schema_url
395        let tokens: Vec<&str> = location_value.split_whitespace().collect();
396        for chunk in tokens.chunks(2) {
397            if chunk.len() == 2 {
398                locations.insert(chunk[0].to_string(), chunk[1].to_string());
399            }
400        }
401
402        Ok(Some(SpecialAttributeValue::SchemaLocation { locations }))
403    }
404
405    /// Process xsi:nil attribute
406    fn process_xsi_nil(
407        &self,
408        value: &AttributeValue,
409    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
410        match value {
411            AttributeValue::Boolean(b) => Ok(Some(SpecialAttributeValue::Nil(*b))),
412            _ => {
413                let str_val = value.to_xml_value();
414                let nil_val = matches!(str_val.to_lowercase().as_str(), "true" | "1");
415                Ok(Some(SpecialAttributeValue::Nil(nil_val)))
416            }
417        }
418    }
419
420    /// Process language and territory codes
421    fn process_language_territory(
422        &self,
423        value: &AttributeValue,
424    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
425        let lang_value = value.to_xml_value();
426
427        // Parse RFC 5646 language tags (simplified)
428        if lang_value.contains('-') {
429            let parts: Vec<&str> = lang_value.split('-').collect();
430            let language = parts[0].to_string();
431            let territory = if parts.len() > 1 {
432                Some(parts[1].to_string())
433            } else {
434                None
435            };
436
437            Ok(Some(SpecialAttributeValue::Language {
438                language,
439                script: None, // Could be enhanced to parse script codes
440                territory,
441            }))
442        } else if lang_value.contains(' ') {
443            // Space-separated territory codes
444            let territories: Vec<String> = lang_value
445                .split_whitespace()
446                .map(|s| s.to_string())
447                .collect();
448            Ok(Some(SpecialAttributeValue::Territory(territories)))
449        } else {
450            Ok(Some(SpecialAttributeValue::Language {
451                language: lang_value,
452                script: None,
453                territory: None,
454            }))
455        }
456    }
457
458    /// Process sequence number
459    fn process_sequence_number(
460        &self,
461        value: &AttributeValue,
462    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
463        match value {
464            AttributeValue::Integer(i) => Ok(Some(SpecialAttributeValue::Sequence(*i as u32))),
465            _ => {
466                if let Ok(seq) = value.to_xml_value().parse::<u32>() {
467                    Ok(Some(SpecialAttributeValue::Sequence(seq)))
468                } else {
469                    Ok(None)
470                }
471            }
472        }
473    }
474
475    /// Process boolean flag
476    fn process_boolean_flag(
477        &self,
478        value: &AttributeValue,
479    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
480        match value {
481            AttributeValue::Boolean(b) => Ok(Some(SpecialAttributeValue::Flag(*b))),
482            _ => {
483                let str_val = value.to_xml_value();
484                let bool_val = matches!(str_val.to_lowercase().as_str(), "true" | "1");
485                Ok(Some(SpecialAttributeValue::Flag(bool_val)))
486            }
487        }
488    }
489
490    /// Apply attribute inheritance from parent to child
491    pub fn apply_inheritance(
492        &self,
493        parent_attributes: &AttributeMap,
494        child_attributes: &mut AttributeMap,
495    ) {
496        let inheritance = ddex_core::models::AttributeInheritance::new();
497        inheritance.apply_inheritance(parent_attributes, child_attributes);
498    }
499
500    /// Validate extracted attributes
501    pub fn validate_attributes(&self, attributes: &AttributeMap) -> Vec<String> {
502        let mut errors = Vec::new();
503
504        for (qname, value) in attributes.iter() {
505            if let Err(e) = value.validate() {
506                errors.push(format!("Invalid attribute {}: {}", qname, e));
507            }
508        }
509
510        errors
511    }
512}
513
514impl Default for AttributeExtractor {
515    fn default() -> Self {
516        Self::new()
517    }
518}
519
520#[cfg(test)]
521mod tests {
522    use super::*;
523    use quick_xml::Reader;
524    use std::io::Cursor;
525
526    #[test]
527    fn test_attribute_extraction_basic() {
528        let xml = r#"<Release title="Test Album" SequenceNumber="1" IsDefault="true" />"#;
529        let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
530        let mut buf = Vec::new();
531
532        if let Ok(quick_xml::events::Event::Empty(start)) = reader.read_event_into(&mut buf) {
533            let extractor = AttributeExtractor::new();
534            let namespace_context = NamespaceContext {
535                current_scope: ddex_core::namespace::NamespaceScope::new(),
536                document_namespaces: indexmap::IndexMap::new(),
537                default_namespace: None,
538                ern_version: None,
539            };
540
541            let result = extractor
542                .extract_attributes(&start, &namespace_context)
543                .unwrap();
544
545            assert_eq!(result.attributes.len(), 3);
546            assert_eq!(
547                result.attributes.get_str("title").unwrap().to_xml_value(),
548                "Test Album"
549            );
550            assert_eq!(
551                result
552                    .attributes
553                    .get_str("SequenceNumber")
554                    .unwrap()
555                    .to_xml_value(),
556                "1"
557            );
558            assert_eq!(
559                result
560                    .attributes
561                    .get_str("IsDefault")
562                    .unwrap()
563                    .to_xml_value(),
564                "true"
565            );
566
567            // Check type parsing
568            if let Some(AttributeValue::Integer(seq)) = result.attributes.get_str("SequenceNumber")
569            {
570                assert_eq!(*seq, 1);
571            } else {
572                panic!("SequenceNumber should be parsed as integer");
573            }
574
575            if let Some(AttributeValue::Boolean(is_default)) =
576                result.attributes.get_str("IsDefault")
577            {
578                assert_eq!(*is_default, true);
579            } else {
580                panic!("IsDefault should be parsed as boolean");
581            }
582        }
583    }
584
585    #[test]
586    fn test_namespace_attribute_extraction() {
587        let xml = r#"<ern:Release xmlns:ern="http://ddex.net/xml/ern/43" 
588                                  xmlns:avs="http://ddex.net/xml/avs" 
589                                  ern:title="Test" />"#;
590        let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
591        let mut buf = Vec::new();
592
593        if let Ok(quick_xml::events::Event::Empty(start)) = reader.read_event_into(&mut buf) {
594            let extractor = AttributeExtractor::new();
595            let namespace_context = NamespaceContext {
596                current_scope: ddex_core::namespace::NamespaceScope::new(),
597                document_namespaces: indexmap::IndexMap::new(),
598                default_namespace: None,
599                ern_version: None,
600            };
601
602            let result = extractor
603                .extract_attributes(&start, &namespace_context)
604                .unwrap();
605
606            assert_eq!(result.namespace_declarations.len(), 2);
607            assert!(result.namespace_declarations.contains_key("ern"));
608            assert!(result.namespace_declarations.contains_key("avs"));
609        }
610    }
611
612    #[test]
613    fn test_special_attribute_processing() {
614        let xml = r#"<element xsi:type="xs:string" 
615                              xsi:nil="true"
616                              xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
617                              xmlns:xs="http://www.w3.org/2001/XMLSchema" />"#;
618        let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
619        let mut buf = Vec::new();
620
621        if let Ok(quick_xml::events::Event::Empty(start)) = reader.read_event_into(&mut buf) {
622            let extractor = AttributeExtractor::new();
623            let namespace_context = NamespaceContext {
624                current_scope: ddex_core::namespace::NamespaceScope::new(),
625                document_namespaces: indexmap::IndexMap::new(),
626                default_namespace: None,
627                ern_version: None,
628            };
629
630            let result = extractor
631                .extract_attributes(&start, &namespace_context)
632                .unwrap();
633
634            assert!(!result.special_attributes.is_empty());
635
636            // Check for xsi:nil
637            let xsi_nil_qname = QName::with_prefix_and_namespace(
638                "nil".to_string(),
639                "xsi".to_string(),
640                "http://www.w3.org/2001/XMLSchema-instance".to_string(),
641            );
642            if let Some(SpecialAttributeValue::Nil(nil_value)) =
643                result.special_attributes.get(&xsi_nil_qname)
644            {
645                assert_eq!(*nil_value, true);
646            }
647        }
648    }
649
650    #[test]
651    fn test_attribute_inheritance() {
652        let mut parent_attrs = AttributeMap::new();
653        parent_attrs.insert_str("LanguageAndScriptCode", "en-US");
654        parent_attrs.insert_str("ApplicableTerritoryCode", "Worldwide");
655
656        let mut child_attrs = AttributeMap::new();
657        child_attrs.insert_str("title", "Child Title");
658
659        let extractor = AttributeExtractor::new();
660        extractor.apply_inheritance(&parent_attrs, &mut child_attrs);
661
662        // Child should inherit language and territory
663        assert!(child_attrs.get_str("LanguageAndScriptCode").is_some());
664        assert!(child_attrs.get_str("ApplicableTerritoryCode").is_some());
665        assert!(child_attrs.get_str("title").is_some());
666    }
667
668    #[test]
669    fn test_ddex_standard_vs_extension_attributes() {
670        let mut attributes = AttributeMap::new();
671        attributes.insert_str("LanguageAndScriptCode", "en-US"); // Standard
672        attributes.insert_str("custom:proprietary", "custom value"); // Extension
673        attributes.insert_str("xmlns:custom", "http://example.com/custom"); // Namespace
674
675        let standard = attributes.standard_attributes();
676        let extensions = attributes.extension_attributes();
677
678        assert!(standard.len() >= 1); // Should contain LanguageAndScriptCode
679        assert!(extensions.len() >= 1); // Should contain custom:proprietary
680    }
681}