ddex_parser/parser/
attribute_extractor.rs

1//! # XML Attribute Extraction and Processing
2//!
3//! This module provides comprehensive attribute extraction from XML elements,
4//! handling namespace resolution, special attributes, and proper type conversion
5//! for both standard DDEX and custom/proprietary attributes.
6
7use crate::error::ParseError;
8use crate::parser::namespace_detector::NamespaceContext;
9use ddex_core::models::{AttributeMap, AttributeType, AttributeValue, QName};
10use indexmap::IndexMap;
11use quick_xml::events::{attributes::Attribute, BytesStart};
12use std::collections::HashMap;
13use tracing::{debug, warn};
14
/// Comprehensive attribute extractor with namespace awareness.
///
/// Holds two lookup tables: one mapping attribute names to the value type
/// used when parsing their text, and one mapping attribute names to the
/// handler that produces an additional "special" interpretation.
#[derive(Debug, Clone)]
pub struct AttributeExtractor {
    /// Known DDEX attribute types for proper parsing, keyed by attribute name.
    /// Attributes missing from this table are kept as plain strings.
    ddex_attribute_types: HashMap<String, AttributeType>,
    /// Special attribute handlers, keyed by the attribute's XML name
    /// (for example `xsi:type` or `SequenceNumber`).
    special_attributes: IndexMap<String, SpecialAttributeHandler>,
}
23
/// Special attribute handler for attributes requiring custom processing.
///
/// Each variant names the routine used to derive a `SpecialAttributeValue`
/// from an attribute whose XML name is registered in the extractor's
/// handler table.
#[derive(Debug, Clone)]
pub enum SpecialAttributeHandler {
    /// xsi:type attribute (XML Schema instance type)
    XsiType,
    /// xsi:schemaLocation attribute (whitespace-separated namespace/location pairs)
    XsiSchemaLocation,
    /// xsi:noNamespaceSchemaLocation attribute
    XsiNoNamespaceSchemaLocation,
    /// xsi:nil attribute (indicates null value)
    XsiNil,
    /// Namespace declaration attributes (xmlns, xmlns:*).
    /// This handler yields no special value; declarations are collected
    /// separately during extraction.
    NamespaceDeclaration,
    /// Language and territory codes
    /// (registered for `LanguageAndScriptCode` and `ApplicableTerritoryCode`)
    LanguageAndTerritory,
    /// Sequence numbers and ordering (registered for `SequenceNumber`)
    SequenceNumber,
    /// Boolean flags (registered for `IsDefault` and `IsMainArtist`)
    BooleanFlag,
}
44
/// Attribute extraction result.
///
/// Produced once per XML element. The same attribute can appear in several
/// of these collections: every attribute is stored in `attributes`, and may
/// additionally be listed as a namespace declaration and/or special attribute.
#[derive(Debug, Clone)]
pub struct AttributeExtractionResult {
    /// All extracted attributes with proper typing
    pub attributes: AttributeMap,
    /// Standard DDEX attributes (subset of all attributes)
    pub standard_attributes: IndexMap<QName, AttributeValue>,
    /// Extension/custom attributes
    pub extension_attributes: IndexMap<QName, AttributeValue>,
    /// Namespace declarations found in this element, keyed by prefix.
    /// The default namespace (`xmlns="..."`) is stored under the empty string.
    pub namespace_declarations: IndexMap<String, String>,
    /// Special attributes requiring additional processing
    pub special_attributes: IndexMap<QName, SpecialAttributeValue>,
    /// Warnings about attribute processing.
    /// NOTE(review): the extraction code visible in this file never pushes to
    /// this list, so it appears to always be empty — confirm whether problems
    /// that are currently only logged should also be recorded here.
    pub warnings: Vec<String>,
}
61
/// Special attribute values requiring custom handling.
///
/// These are the interpreted forms produced by the special-attribute
/// handlers, stored alongside (not instead of) the regular attribute value.
#[derive(Debug, Clone, PartialEq)]
pub enum SpecialAttributeValue {
    /// xsi:type with resolved type information
    XsiType {
        /// Type name with any `prefix:` removed
        type_name: String,
        /// Namespace bound to the type's prefix, if the value had a prefix
        /// and that prefix could be resolved
        namespace_uri: Option<String>,
        /// Reserved for a schema-resolved type; the code in this file
        /// always sets it to `None`
        resolved_type: Option<String>,
    },
    /// Schema location with URI pairs
    SchemaLocation {
        locations: IndexMap<String, String>, // namespace_uri -> schema_location
    },
    /// No namespace schema location (the attribute text, unmodified)
    NoNamespaceSchemaLocation(String),
    /// Nil indicator
    Nil(bool),
    /// Language with territory code
    Language {
        /// Primary language subtag (the text before the first `-`)
        language: String,
        /// Script subtag, when one was identified
        script: Option<String>,
        /// Territory/region subtag, when one was identified
        territory: Option<String>,
    },
    /// Territory code list (built from a whitespace-separated attribute value)
    Territory(Vec<String>),
    /// Sequence number for ordering
    Sequence(u32),
    /// Boolean flag
    Flag(bool),
}
92
93impl AttributeExtractor {
94    /// Create a new attribute extractor with DDEX knowledge
95    pub fn new() -> Self {
96        let mut extractor = Self {
97            ddex_attribute_types: HashMap::new(),
98            special_attributes: IndexMap::new(),
99        };
100
101        extractor.initialize_ddex_attributes();
102        extractor.initialize_special_handlers();
103        extractor
104    }
105
106    /// Initialize known DDEX attribute types
107    fn initialize_ddex_attributes(&mut self) {
108        // Language and territory attributes
109        self.ddex_attribute_types
110            .insert("LanguageAndScriptCode".to_string(), AttributeType::Language);
111        self.ddex_attribute_types
112            .insert("ApplicableTerritoryCode".to_string(), AttributeType::String);
113
114        // Boolean attributes
115        self.ddex_attribute_types
116            .insert("IsDefault".to_string(), AttributeType::Boolean);
117        self.ddex_attribute_types
118            .insert("IsMainArtist".to_string(), AttributeType::Boolean);
119        self.ddex_attribute_types
120            .insert("HasChanged".to_string(), AttributeType::Boolean);
121
122        // Numeric attributes
123        self.ddex_attribute_types
124            .insert("SequenceNumber".to_string(), AttributeType::Integer);
125        self.ddex_attribute_types
126            .insert("Duration".to_string(), AttributeType::String); // ISO 8601 duration
127
128        // URI attributes
129        self.ddex_attribute_types
130            .insert("Namespace".to_string(), AttributeType::Uri);
131
132        // Date/time attributes
133        self.ddex_attribute_types
134            .insert("CreatedDateTime".to_string(), AttributeType::DateTime);
135        self.ddex_attribute_types
136            .insert("UpdatedDateTime".to_string(), AttributeType::DateTime);
137    }
138
139    /// Initialize special attribute handlers
140    fn initialize_special_handlers(&mut self) {
141        // XML Schema Instance attributes
142        self.special_attributes
143            .insert("xsi:type".to_string(), SpecialAttributeHandler::XsiType);
144        self.special_attributes.insert(
145            "xsi:schemaLocation".to_string(),
146            SpecialAttributeHandler::XsiSchemaLocation,
147        );
148        self.special_attributes.insert(
149            "xsi:noNamespaceSchemaLocation".to_string(),
150            SpecialAttributeHandler::XsiNoNamespaceSchemaLocation,
151        );
152        self.special_attributes
153            .insert("xsi:nil".to_string(), SpecialAttributeHandler::XsiNil);
154
155        // Namespace declarations
156        self.special_attributes.insert(
157            "xmlns".to_string(),
158            SpecialAttributeHandler::NamespaceDeclaration,
159        );
160        // Note: xmlns:* are handled dynamically
161
162        // DDEX specific
163        self.special_attributes.insert(
164            "LanguageAndScriptCode".to_string(),
165            SpecialAttributeHandler::LanguageAndTerritory,
166        );
167        self.special_attributes.insert(
168            "ApplicableTerritoryCode".to_string(),
169            SpecialAttributeHandler::LanguageAndTerritory,
170        );
171        self.special_attributes.insert(
172            "SequenceNumber".to_string(),
173            SpecialAttributeHandler::SequenceNumber,
174        );
175
176        // Boolean flags
177        self.special_attributes.insert(
178            "IsDefault".to_string(),
179            SpecialAttributeHandler::BooleanFlag,
180        );
181        self.special_attributes.insert(
182            "IsMainArtist".to_string(),
183            SpecialAttributeHandler::BooleanFlag,
184        );
185    }
186
187    /// Extract all attributes from an XML element
188    pub fn extract_attributes(
189        &self,
190        element: &BytesStart,
191        namespace_context: &NamespaceContext,
192    ) -> Result<AttributeExtractionResult, ParseError> {
193        let mut attributes = AttributeMap::new();
194        let mut namespace_declarations = IndexMap::new();
195        let mut special_attributes = IndexMap::new();
196        let warnings = Vec::new();
197
198        debug!(
199            "Extracting attributes from element: {}",
200            String::from_utf8_lossy(element.name().as_ref())
201        );
202
203        // Process all attributes
204        for attr_result in element.attributes() {
205            let attr = attr_result.map_err(|e| ParseError::XmlError {
206                message: format!("Failed to read attribute: {}", e),
207                location: crate::error::ErrorLocation::default(),
208            })?;
209
210            let (qname, attr_value) = self.process_attribute(&attr, namespace_context)?;
211
212            // Handle namespace declarations separately
213            if qname.is_namespace_declaration() {
214                let prefix = if qname.local_name == "xmlns" {
215                    "".to_string() // Default namespace
216                } else {
217                    qname.local_name.clone() // Prefixed namespace
218                };
219                namespace_declarations.insert(prefix, attr_value.to_xml_value());
220                debug!(
221                    "Found namespace declaration: {}={}",
222                    qname.to_xml_name(),
223                    attr_value.to_xml_value()
224                );
225            }
226
227            // Check for special attributes
228            if let Some(special_value) =
229                self.process_special_attribute(&qname, &attr_value, namespace_context)?
230            {
231                special_attributes.insert(qname.clone(), special_value);
232            }
233
234            // Add to main attribute map
235            attributes.insert(qname, attr_value);
236        }
237
238        // Separate standard and extension attributes
239        let standard_attributes = attributes.standard_attributes();
240        let extension_attributes = attributes.extension_attributes();
241
242        debug!(
243            "Extracted {} total attributes ({} standard, {} extensions)",
244            attributes.len(),
245            standard_attributes.len(),
246            extension_attributes.len()
247        );
248
249        Ok(AttributeExtractionResult {
250            attributes,
251            standard_attributes,
252            extension_attributes,
253            namespace_declarations,
254            special_attributes,
255            warnings,
256        })
257    }
258
259    /// Process a single attribute
260    fn process_attribute(
261        &self,
262        attr: &Attribute,
263        namespace_context: &NamespaceContext,
264    ) -> Result<(QName, AttributeValue), ParseError> {
265        let attr_name = String::from_utf8_lossy(attr.key.as_ref());
266        let attr_value = String::from_utf8_lossy(&attr.value);
267
268        debug!("Processing attribute: {}={}", attr_name, attr_value);
269
270        // Create QName with namespace resolution
271        let qname = self.resolve_attribute_qname(&attr_name, namespace_context);
272
273        // Determine attribute type and parse value
274        let parsed_value = if let Some(attr_type) = self.get_attribute_type(&qname) {
275            AttributeValue::parse_with_type(&attr_value, attr_type).unwrap_or_else(|e| {
276                warn!(
277                    "Failed to parse attribute {} as {:?}: {}",
278                    qname, attr_type, e
279                );
280                AttributeValue::Raw(attr_value.to_string())
281            })
282        } else {
283            // Default to string for unknown attributes
284            AttributeValue::String(attr_value.to_string())
285        };
286
287        Ok((qname, parsed_value))
288    }
289
290    /// Resolve attribute name to QName with namespace context
291    fn resolve_attribute_qname(
292        &self,
293        attr_name: &str,
294        namespace_context: &NamespaceContext,
295    ) -> QName {
296        if let Some((prefix, local_name)) = attr_name.split_once(':') {
297            // Prefixed attribute
298            if let Some(namespace_uri) = namespace_context.current_scope.resolve_prefix(prefix) {
299                QName::with_prefix_and_namespace(local_name, prefix, namespace_uri)
300            } else {
301                // Unresolved prefix - keep as is with warning
302                warn!("Unresolved namespace prefix in attribute: {}", attr_name);
303                QName {
304                    local_name: local_name.to_string(),
305                    namespace_uri: None,
306                    prefix: Some(prefix.to_string()),
307                }
308            }
309        } else {
310            // Non-prefixed attribute - check if it's a namespace declaration
311            if attr_name == "xmlns" || attr_name.starts_with("xmlns:") {
312                QName::new(attr_name)
313            } else {
314                // Regular attribute without namespace
315                QName::new(attr_name)
316            }
317        }
318    }
319
320    /// Get the expected type for an attribute
321    fn get_attribute_type(&self, qname: &QName) -> Option<AttributeType> {
322        // Check by full qualified name first
323        if let Some(attr_type) = self.ddex_attribute_types.get(&qname.to_xml_name()) {
324            return Some(*attr_type);
325        }
326
327        // Check by local name
328        self.ddex_attribute_types.get(&qname.local_name).copied()
329    }
330
331    /// Process special attributes that require custom handling
332    fn process_special_attribute(
333        &self,
334        qname: &QName,
335        value: &AttributeValue,
336        namespace_context: &NamespaceContext,
337    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
338        let attr_name = qname.to_xml_name();
339
340        if let Some(handler) = self.special_attributes.get(&attr_name) {
341            match handler {
342                SpecialAttributeHandler::XsiType => self.process_xsi_type(value, namespace_context),
343                SpecialAttributeHandler::XsiSchemaLocation => self.process_schema_location(value),
344                SpecialAttributeHandler::XsiNoNamespaceSchemaLocation => Ok(Some(
345                    SpecialAttributeValue::NoNamespaceSchemaLocation(value.to_xml_value()),
346                )),
347                SpecialAttributeHandler::XsiNil => self.process_xsi_nil(value),
348                SpecialAttributeHandler::NamespaceDeclaration => {
349                    // Already handled in main extraction
350                    Ok(None)
351                }
352                SpecialAttributeHandler::LanguageAndTerritory => {
353                    self.process_language_territory(value)
354                }
355                SpecialAttributeHandler::SequenceNumber => self.process_sequence_number(value),
356                SpecialAttributeHandler::BooleanFlag => self.process_boolean_flag(value),
357            }
358        } else {
359            Ok(None)
360        }
361    }
362
363    /// Process xsi:type attribute
364    fn process_xsi_type(
365        &self,
366        value: &AttributeValue,
367        namespace_context: &NamespaceContext,
368    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
369        let type_value = value.to_xml_value();
370
371        if let Some((prefix, local_name)) = type_value.split_once(':') {
372            // Prefixed type
373            let namespace_uri = namespace_context.current_scope.resolve_prefix(prefix);
374            Ok(Some(SpecialAttributeValue::XsiType {
375                type_name: local_name.to_string(),
376                namespace_uri,
377                resolved_type: None, // Could be resolved later with schema information
378            }))
379        } else {
380            // Non-prefixed type
381            Ok(Some(SpecialAttributeValue::XsiType {
382                type_name: type_value,
383                namespace_uri: None,
384                resolved_type: None,
385            }))
386        }
387    }
388
389    /// Process xsi:schemaLocation attribute
390    fn process_schema_location(
391        &self,
392        value: &AttributeValue,
393    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
394        let location_value = value.to_xml_value();
395        let mut locations = IndexMap::new();
396
397        // Schema locations are space-separated pairs: namespace_uri schema_url
398        let tokens: Vec<&str> = location_value.split_whitespace().collect();
399        for chunk in tokens.chunks(2) {
400            if chunk.len() == 2 {
401                locations.insert(chunk[0].to_string(), chunk[1].to_string());
402            }
403        }
404
405        Ok(Some(SpecialAttributeValue::SchemaLocation { locations }))
406    }
407
408    /// Process xsi:nil attribute
409    fn process_xsi_nil(
410        &self,
411        value: &AttributeValue,
412    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
413        match value {
414            AttributeValue::Boolean(b) => Ok(Some(SpecialAttributeValue::Nil(*b))),
415            _ => {
416                let str_val = value.to_xml_value();
417                let nil_val = matches!(str_val.to_lowercase().as_str(), "true" | "1");
418                Ok(Some(SpecialAttributeValue::Nil(nil_val)))
419            }
420        }
421    }
422
423    /// Process language and territory codes
424    fn process_language_territory(
425        &self,
426        value: &AttributeValue,
427    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
428        let lang_value = value.to_xml_value();
429
430        // Parse RFC 5646 language tags (simplified)
431        if lang_value.contains('-') {
432            let parts: Vec<&str> = lang_value.split('-').collect();
433            let language = parts[0].to_string();
434            let territory = if parts.len() > 1 {
435                Some(parts[1].to_string())
436            } else {
437                None
438            };
439
440            Ok(Some(SpecialAttributeValue::Language {
441                language,
442                script: None, // Could be enhanced to parse script codes
443                territory,
444            }))
445        } else if lang_value.contains(' ') {
446            // Space-separated territory codes
447            let territories: Vec<String> = lang_value
448                .split_whitespace()
449                .map(|s| s.to_string())
450                .collect();
451            Ok(Some(SpecialAttributeValue::Territory(territories)))
452        } else {
453            Ok(Some(SpecialAttributeValue::Language {
454                language: lang_value,
455                script: None,
456                territory: None,
457            }))
458        }
459    }
460
461    /// Process sequence number
462    fn process_sequence_number(
463        &self,
464        value: &AttributeValue,
465    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
466        match value {
467            AttributeValue::Integer(i) => Ok(Some(SpecialAttributeValue::Sequence(*i as u32))),
468            _ => {
469                if let Ok(seq) = value.to_xml_value().parse::<u32>() {
470                    Ok(Some(SpecialAttributeValue::Sequence(seq)))
471                } else {
472                    Ok(None)
473                }
474            }
475        }
476    }
477
478    /// Process boolean flag
479    fn process_boolean_flag(
480        &self,
481        value: &AttributeValue,
482    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
483        match value {
484            AttributeValue::Boolean(b) => Ok(Some(SpecialAttributeValue::Flag(*b))),
485            _ => {
486                let str_val = value.to_xml_value();
487                let bool_val = matches!(str_val.to_lowercase().as_str(), "true" | "1");
488                Ok(Some(SpecialAttributeValue::Flag(bool_val)))
489            }
490        }
491    }
492
493    /// Apply attribute inheritance from parent to child
494    pub fn apply_inheritance(
495        &self,
496        parent_attributes: &AttributeMap,
497        child_attributes: &mut AttributeMap,
498    ) {
499        let inheritance = ddex_core::models::AttributeInheritance::new();
500        inheritance.apply_inheritance(parent_attributes, child_attributes);
501    }
502
503    /// Validate extracted attributes
504    pub fn validate_attributes(&self, attributes: &AttributeMap) -> Vec<String> {
505        let mut errors = Vec::new();
506
507        for (qname, value) in attributes.iter() {
508            if let Err(e) = value.validate() {
509                errors.push(format!("Invalid attribute {}: {}", qname, e));
510            }
511        }
512
513        errors
514    }
515}
516
517impl Default for AttributeExtractor {
518    fn default() -> Self {
519        Self::new()
520    }
521}
522
523#[cfg(test)]
524mod tests {
525    use super::*;
526    use quick_xml::Reader;
527    use std::io::Cursor;
528
529    #[test]
530    fn test_attribute_extraction_basic() {
531        let xml = r#"<Release title="Test Album" SequenceNumber="1" IsDefault="true" />"#;
532        let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
533        let mut buf = Vec::new();
534
535        if let Ok(quick_xml::events::Event::Empty(start)) = reader.read_event_into(&mut buf) {
536            let extractor = AttributeExtractor::new();
537            let namespace_context = NamespaceContext {
538                current_scope: ddex_core::namespace::NamespaceScope::new(),
539                document_namespaces: indexmap::IndexMap::new(),
540                default_namespace: None,
541                ern_version: None,
542            };
543
544            let result = extractor
545                .extract_attributes(&start, &namespace_context)
546                .unwrap();
547
548            assert_eq!(result.attributes.len(), 3);
549            assert_eq!(
550                result.attributes.get_str("title").unwrap().to_xml_value(),
551                "Test Album"
552            );
553            assert_eq!(
554                result
555                    .attributes
556                    .get_str("SequenceNumber")
557                    .unwrap()
558                    .to_xml_value(),
559                "1"
560            );
561            assert_eq!(
562                result
563                    .attributes
564                    .get_str("IsDefault")
565                    .unwrap()
566                    .to_xml_value(),
567                "true"
568            );
569
570            // Check type parsing
571            if let Some(AttributeValue::Integer(seq)) = result.attributes.get_str("SequenceNumber")
572            {
573                assert_eq!(*seq, 1);
574            } else {
575                panic!("SequenceNumber should be parsed as integer");
576            }
577
578            if let Some(AttributeValue::Boolean(is_default)) =
579                result.attributes.get_str("IsDefault")
580            {
581                assert_eq!(*is_default, true);
582            } else {
583                panic!("IsDefault should be parsed as boolean");
584            }
585        }
586    }
587
588    #[test]
589    fn test_namespace_attribute_extraction() {
590        let xml = r#"<ern:Release xmlns:ern="http://ddex.net/xml/ern/43" 
591                                  xmlns:avs="http://ddex.net/xml/avs" 
592                                  ern:title="Test" />"#;
593        let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
594        let mut buf = Vec::new();
595
596        if let Ok(quick_xml::events::Event::Empty(start)) = reader.read_event_into(&mut buf) {
597            let extractor = AttributeExtractor::new();
598            let namespace_context = NamespaceContext {
599                current_scope: ddex_core::namespace::NamespaceScope::new(),
600                document_namespaces: indexmap::IndexMap::new(),
601                default_namespace: None,
602                ern_version: None,
603            };
604
605            let result = extractor
606                .extract_attributes(&start, &namespace_context)
607                .unwrap();
608
609            assert_eq!(result.namespace_declarations.len(), 2);
610            assert!(result.namespace_declarations.contains_key("ern"));
611            assert!(result.namespace_declarations.contains_key("avs"));
612        }
613    }
614
615    #[test]
616    fn test_special_attribute_processing() {
617        let xml = r#"<element xsi:type="xs:string" 
618                              xsi:nil="true"
619                              xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
620                              xmlns:xs="http://www.w3.org/2001/XMLSchema" />"#;
621        let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
622        let mut buf = Vec::new();
623
624        if let Ok(quick_xml::events::Event::Empty(start)) = reader.read_event_into(&mut buf) {
625            let extractor = AttributeExtractor::new();
626            let namespace_context = NamespaceContext {
627                current_scope: ddex_core::namespace::NamespaceScope::new(),
628                document_namespaces: indexmap::IndexMap::new(),
629                default_namespace: None,
630                ern_version: None,
631            };
632
633            let result = extractor
634                .extract_attributes(&start, &namespace_context)
635                .unwrap();
636
637            assert!(!result.special_attributes.is_empty());
638
639            // Check for xsi:nil
640            let xsi_nil_qname = QName::with_prefix_and_namespace(
641                "nil".to_string(),
642                "xsi".to_string(),
643                "http://www.w3.org/2001/XMLSchema-instance".to_string(),
644            );
645            if let Some(SpecialAttributeValue::Nil(nil_value)) =
646                result.special_attributes.get(&xsi_nil_qname)
647            {
648                assert_eq!(*nil_value, true);
649            }
650        }
651    }
652
653    #[test]
654    fn test_attribute_inheritance() {
655        let mut parent_attrs = AttributeMap::new();
656        parent_attrs.insert_str("LanguageAndScriptCode", "en-US");
657        parent_attrs.insert_str("ApplicableTerritoryCode", "Worldwide");
658
659        let mut child_attrs = AttributeMap::new();
660        child_attrs.insert_str("title", "Child Title");
661
662        let extractor = AttributeExtractor::new();
663        extractor.apply_inheritance(&parent_attrs, &mut child_attrs);
664
665        // Child should inherit language and territory
666        assert!(child_attrs.get_str("LanguageAndScriptCode").is_some());
667        assert!(child_attrs.get_str("ApplicableTerritoryCode").is_some());
668        assert!(child_attrs.get_str("title").is_some());
669    }
670
671    #[test]
672    fn test_ddex_standard_vs_extension_attributes() {
673        let mut attributes = AttributeMap::new();
674        attributes.insert_str("LanguageAndScriptCode", "en-US"); // Standard
675        attributes.insert_str("custom:proprietary", "custom value"); // Extension
676        attributes.insert_str("xmlns:custom", "http://example.com/custom"); // Namespace
677
678        let standard = attributes.standard_attributes();
679        let extensions = attributes.extension_attributes();
680
681        assert!(standard.len() >= 1); // Should contain LanguageAndScriptCode
682        assert!(extensions.len() >= 1); // Should contain custom:proprietary
683    }
684}