ddex_parser/parser/
attribute_extractor.rs

1//! # XML Attribute Extraction and Processing
2//! 
3//! This module provides comprehensive attribute extraction from XML elements,
4//! handling namespace resolution, special attributes, and proper type conversion
5//! for both standard DDEX and custom/proprietary attributes.
6
7use ddex_core::models::{AttributeMap, AttributeValue, QName, AttributeType};
8use crate::parser::namespace_detector::NamespaceContext;
9use crate::error::ParseError;
10use indexmap::IndexMap;
11use quick_xml::events::{BytesStart, attributes::Attribute};
12use std::collections::HashMap;
13use tracing::{debug, warn};
14
/// Comprehensive attribute extractor with namespace awareness.
///
/// Holds two lookup tables that are filled once by [`AttributeExtractor::new`]:
/// one mapping attribute names to the type their value is parsed as, and one
/// mapping attribute names to a handler that derives an additional structured
/// value from the attribute.
#[derive(Debug, Clone)]
pub struct AttributeExtractor {
    /// Known DDEX attribute types for proper parsing, keyed by attribute name.
    ddex_attribute_types: HashMap<String, AttributeType>,
    /// Special attribute handlers, keyed by the attribute's XML name
    /// (for example `xsi:type` or `SequenceNumber`).
    special_attributes: IndexMap<String, SpecialAttributeHandler>,
}
23
/// Special attribute handler for attributes requiring custom processing.
///
/// Each variant selects which routine turns an attribute's value into a
/// `SpecialAttributeValue`. The enum carries no data, so it is cheap to copy
/// and can be compared and hashed directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SpecialAttributeHandler {
    /// xsi:type attribute (XML Schema instance type)
    XsiType,
    /// xsi:schemaLocation attribute
    XsiSchemaLocation,
    /// xsi:noNamespaceSchemaLocation attribute
    XsiNoNamespaceSchemaLocation,
    /// xsi:nil attribute (indicates null value)
    XsiNil,
    /// Namespace declaration attributes (xmlns, xmlns:*)
    NamespaceDeclaration,
    /// Language and territory codes
    LanguageAndTerritory,
    /// Sequence numbers and ordering
    SequenceNumber,
    /// Boolean flags
    BooleanFlag,
}
44
/// Attribute extraction result.
///
/// Produced by `AttributeExtractor::extract_attributes` for a single element.
#[derive(Debug, Clone)]
pub struct AttributeExtractionResult {
    /// All extracted attributes with proper typing
    /// (namespace declarations are included here as well).
    pub attributes: AttributeMap,
    /// Standard DDEX attributes (subset of all attributes),
    /// as reported by `AttributeMap::standard_attributes`.
    pub standard_attributes: IndexMap<QName, AttributeValue>,
    /// Extension/custom attributes,
    /// as reported by `AttributeMap::extension_attributes`.
    pub extension_attributes: IndexMap<QName, AttributeValue>,
    /// Namespace declarations found in this element, keyed by prefix;
    /// the default namespace (`xmlns="..."`) is stored under the empty string.
    pub namespace_declarations: IndexMap<String, String>,
    /// Special attributes requiring additional processing
    pub special_attributes: IndexMap<QName, SpecialAttributeValue>,
    /// Warnings about attribute processing
    pub warnings: Vec<String>,
}
61
/// Special attribute values requiring custom handling.
///
/// One of these is stored alongside the plain attribute value whenever the
/// attribute name has a registered `SpecialAttributeHandler`.
#[derive(Debug, Clone, PartialEq)]
pub enum SpecialAttributeValue {
    /// xsi:type with resolved type information
    XsiType {
        /// Type name with any `prefix:` removed.
        type_name: String,
        /// Namespace the type's prefix resolved to, if it had a prefix and it resolved.
        namespace_uri: Option<String>,
        /// Reserved for schema-based resolution of the type.
        resolved_type: Option<String>,
    },
    /// Schema location with URI pairs
    SchemaLocation {
        locations: IndexMap<String, String>, // namespace_uri -> schema_location
    },
    /// No namespace schema location
    NoNamespaceSchemaLocation(String),
    /// Nil indicator
    Nil(bool),
    /// Language with territory code
    Language {
        /// Primary language part of the code.
        language: String,
        /// Script part of the code, when available.
        script: Option<String>,
        /// Territory part of the code, when available.
        territory: Option<String>,
    },
    /// Territory code list
    Territory(Vec<String>),
    /// Sequence number for ordering
    Sequence(u32),
    /// Boolean flag
    Flag(bool),
}
92
93impl AttributeExtractor {
94    /// Create a new attribute extractor with DDEX knowledge
95    pub fn new() -> Self {
96        let mut extractor = Self {
97            ddex_attribute_types: HashMap::new(),
98            special_attributes: IndexMap::new(),
99        };
100        
101        extractor.initialize_ddex_attributes();
102        extractor.initialize_special_handlers();
103        extractor
104    }
105
106    /// Initialize known DDEX attribute types
107    fn initialize_ddex_attributes(&mut self) {
108        // Language and territory attributes
109        self.ddex_attribute_types.insert("LanguageAndScriptCode".to_string(), AttributeType::Language);
110        self.ddex_attribute_types.insert("ApplicableTerritoryCode".to_string(), AttributeType::String);
111        
112        // Boolean attributes
113        self.ddex_attribute_types.insert("IsDefault".to_string(), AttributeType::Boolean);
114        self.ddex_attribute_types.insert("IsMainArtist".to_string(), AttributeType::Boolean);
115        self.ddex_attribute_types.insert("HasChanged".to_string(), AttributeType::Boolean);
116        
117        // Numeric attributes
118        self.ddex_attribute_types.insert("SequenceNumber".to_string(), AttributeType::Integer);
119        self.ddex_attribute_types.insert("Duration".to_string(), AttributeType::String); // ISO 8601 duration
120        
121        // URI attributes
122        self.ddex_attribute_types.insert("Namespace".to_string(), AttributeType::Uri);
123        
124        // Date/time attributes
125        self.ddex_attribute_types.insert("CreatedDateTime".to_string(), AttributeType::DateTime);
126        self.ddex_attribute_types.insert("UpdatedDateTime".to_string(), AttributeType::DateTime);
127    }
128
129    /// Initialize special attribute handlers
130    fn initialize_special_handlers(&mut self) {
131        // XML Schema Instance attributes
132        self.special_attributes.insert("xsi:type".to_string(), SpecialAttributeHandler::XsiType);
133        self.special_attributes.insert("xsi:schemaLocation".to_string(), SpecialAttributeHandler::XsiSchemaLocation);
134        self.special_attributes.insert("xsi:noNamespaceSchemaLocation".to_string(), SpecialAttributeHandler::XsiNoNamespaceSchemaLocation);
135        self.special_attributes.insert("xsi:nil".to_string(), SpecialAttributeHandler::XsiNil);
136        
137        // Namespace declarations
138        self.special_attributes.insert("xmlns".to_string(), SpecialAttributeHandler::NamespaceDeclaration);
139        // Note: xmlns:* are handled dynamically
140        
141        // DDEX specific
142        self.special_attributes.insert("LanguageAndScriptCode".to_string(), SpecialAttributeHandler::LanguageAndTerritory);
143        self.special_attributes.insert("ApplicableTerritoryCode".to_string(), SpecialAttributeHandler::LanguageAndTerritory);
144        self.special_attributes.insert("SequenceNumber".to_string(), SpecialAttributeHandler::SequenceNumber);
145        
146        // Boolean flags
147        self.special_attributes.insert("IsDefault".to_string(), SpecialAttributeHandler::BooleanFlag);
148        self.special_attributes.insert("IsMainArtist".to_string(), SpecialAttributeHandler::BooleanFlag);
149    }
150
151    /// Extract all attributes from an XML element
152    pub fn extract_attributes(
153        &self,
154        element: &BytesStart,
155        namespace_context: &NamespaceContext,
156    ) -> Result<AttributeExtractionResult, ParseError> {
157        let mut attributes = AttributeMap::new();
158        let mut namespace_declarations = IndexMap::new();
159        let mut special_attributes = IndexMap::new();
160        let warnings = Vec::new();
161
162        debug!("Extracting attributes from element: {}", String::from_utf8_lossy(element.name().as_ref()));
163
164        // Process all attributes
165        for attr_result in element.attributes() {
166            let attr = attr_result.map_err(|e| ParseError::XmlError {
167                message: format!("Failed to read attribute: {}", e),
168                location: crate::error::ErrorLocation::default(),
169            })?;
170
171            let (qname, attr_value) = self.process_attribute(&attr, namespace_context)?;
172            
173            // Handle namespace declarations separately
174            if qname.is_namespace_declaration() {
175                let prefix = if qname.local_name == "xmlns" {
176                    "".to_string() // Default namespace
177                } else {
178                    qname.local_name.clone() // Prefixed namespace
179                };
180                namespace_declarations.insert(prefix, attr_value.to_xml_value());
181                debug!("Found namespace declaration: {}={}", qname.to_xml_name(), attr_value.to_xml_value());
182            }
183
184            // Check for special attributes
185            if let Some(special_value) = self.process_special_attribute(&qname, &attr_value, namespace_context)? {
186                special_attributes.insert(qname.clone(), special_value);
187            }
188
189            // Add to main attribute map
190            attributes.insert(qname, attr_value);
191        }
192
193        // Separate standard and extension attributes
194        let standard_attributes = attributes.standard_attributes();
195        let extension_attributes = attributes.extension_attributes();
196
197        debug!("Extracted {} total attributes ({} standard, {} extensions)", 
198               attributes.len(), standard_attributes.len(), extension_attributes.len());
199
200        Ok(AttributeExtractionResult {
201            attributes,
202            standard_attributes,
203            extension_attributes,
204            namespace_declarations,
205            special_attributes,
206            warnings,
207        })
208    }
209
210    /// Process a single attribute
211    fn process_attribute(
212        &self,
213        attr: &Attribute,
214        namespace_context: &NamespaceContext,
215    ) -> Result<(QName, AttributeValue), ParseError> {
216        let attr_name = String::from_utf8_lossy(attr.key.as_ref());
217        let attr_value = String::from_utf8_lossy(&attr.value);
218
219        debug!("Processing attribute: {}={}", attr_name, attr_value);
220
221        // Create QName with namespace resolution
222        let qname = self.resolve_attribute_qname(&attr_name, namespace_context);
223        
224        // Determine attribute type and parse value
225        let parsed_value = if let Some(attr_type) = self.get_attribute_type(&qname) {
226            AttributeValue::parse_with_type(&attr_value, attr_type)
227                .unwrap_or_else(|e| {
228                    warn!("Failed to parse attribute {} as {:?}: {}", qname, attr_type, e);
229                    AttributeValue::Raw(attr_value.to_string())
230                })
231        } else {
232            // Default to string for unknown attributes
233            AttributeValue::String(attr_value.to_string())
234        };
235
236        Ok((qname, parsed_value))
237    }
238
239    /// Resolve attribute name to QName with namespace context
240    fn resolve_attribute_qname(&self, attr_name: &str, namespace_context: &NamespaceContext) -> QName {
241        if let Some((prefix, local_name)) = attr_name.split_once(':') {
242            // Prefixed attribute
243            if let Some(namespace_uri) = namespace_context.current_scope.resolve_prefix(prefix) {
244                QName::with_prefix_and_namespace(local_name, prefix, namespace_uri)
245            } else {
246                // Unresolved prefix - keep as is with warning
247                warn!("Unresolved namespace prefix in attribute: {}", attr_name);
248                QName {
249                    local_name: local_name.to_string(),
250                    namespace_uri: None,
251                    prefix: Some(prefix.to_string()),
252                }
253            }
254        } else {
255            // Non-prefixed attribute - check if it's a namespace declaration
256            if attr_name == "xmlns" || attr_name.starts_with("xmlns:") {
257                QName::new(attr_name)
258            } else {
259                // Regular attribute without namespace
260                QName::new(attr_name)
261            }
262        }
263    }
264
265    /// Get the expected type for an attribute
266    fn get_attribute_type(&self, qname: &QName) -> Option<AttributeType> {
267        // Check by full qualified name first
268        if let Some(attr_type) = self.ddex_attribute_types.get(&qname.to_xml_name()) {
269            return Some(*attr_type);
270        }
271        
272        // Check by local name
273        self.ddex_attribute_types.get(&qname.local_name).copied()
274    }
275
276    /// Process special attributes that require custom handling
277    fn process_special_attribute(
278        &self,
279        qname: &QName,
280        value: &AttributeValue,
281        namespace_context: &NamespaceContext,
282    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
283        let attr_name = qname.to_xml_name();
284        
285        if let Some(handler) = self.special_attributes.get(&attr_name) {
286            match handler {
287                SpecialAttributeHandler::XsiType => {
288                    self.process_xsi_type(value, namespace_context)
289                },
290                SpecialAttributeHandler::XsiSchemaLocation => {
291                    self.process_schema_location(value)
292                },
293                SpecialAttributeHandler::XsiNoNamespaceSchemaLocation => {
294                    Ok(Some(SpecialAttributeValue::NoNamespaceSchemaLocation(value.to_xml_value())))
295                },
296                SpecialAttributeHandler::XsiNil => {
297                    self.process_xsi_nil(value)
298                },
299                SpecialAttributeHandler::NamespaceDeclaration => {
300                    // Already handled in main extraction
301                    Ok(None)
302                },
303                SpecialAttributeHandler::LanguageAndTerritory => {
304                    self.process_language_territory(value)
305                },
306                SpecialAttributeHandler::SequenceNumber => {
307                    self.process_sequence_number(value)
308                },
309                SpecialAttributeHandler::BooleanFlag => {
310                    self.process_boolean_flag(value)
311                },
312            }
313        } else {
314            Ok(None)
315        }
316    }
317
318    /// Process xsi:type attribute
319    fn process_xsi_type(
320        &self,
321        value: &AttributeValue,
322        namespace_context: &NamespaceContext,
323    ) -> Result<Option<SpecialAttributeValue>, ParseError> {
324        let type_value = value.to_xml_value();
325        
326        if let Some((prefix, local_name)) = type_value.split_once(':') {
327            // Prefixed type
328            let namespace_uri = namespace_context.current_scope.resolve_prefix(prefix);
329            Ok(Some(SpecialAttributeValue::XsiType {
330                type_name: local_name.to_string(),
331                namespace_uri,
332                resolved_type: None, // Could be resolved later with schema information
333            }))
334        } else {
335            // Non-prefixed type
336            Ok(Some(SpecialAttributeValue::XsiType {
337                type_name: type_value,
338                namespace_uri: None,
339                resolved_type: None,
340            }))
341        }
342    }
343
344    /// Process xsi:schemaLocation attribute
345    fn process_schema_location(&self, value: &AttributeValue) -> Result<Option<SpecialAttributeValue>, ParseError> {
346        let location_value = value.to_xml_value();
347        let mut locations = IndexMap::new();
348        
349        // Schema locations are space-separated pairs: namespace_uri schema_url
350        let tokens: Vec<&str> = location_value.split_whitespace().collect();
351        for chunk in tokens.chunks(2) {
352            if chunk.len() == 2 {
353                locations.insert(chunk[0].to_string(), chunk[1].to_string());
354            }
355        }
356        
357        Ok(Some(SpecialAttributeValue::SchemaLocation { locations }))
358    }
359
360    /// Process xsi:nil attribute
361    fn process_xsi_nil(&self, value: &AttributeValue) -> Result<Option<SpecialAttributeValue>, ParseError> {
362        match value {
363            AttributeValue::Boolean(b) => Ok(Some(SpecialAttributeValue::Nil(*b))),
364            _ => {
365                let str_val = value.to_xml_value();
366                let nil_val = matches!(str_val.to_lowercase().as_str(), "true" | "1");
367                Ok(Some(SpecialAttributeValue::Nil(nil_val)))
368            }
369        }
370    }
371
372    /// Process language and territory codes
373    fn process_language_territory(&self, value: &AttributeValue) -> Result<Option<SpecialAttributeValue>, ParseError> {
374        let lang_value = value.to_xml_value();
375        
376        // Parse RFC 5646 language tags (simplified)
377        if lang_value.contains('-') {
378            let parts: Vec<&str> = lang_value.split('-').collect();
379            let language = parts[0].to_string();
380            let territory = if parts.len() > 1 {
381                Some(parts[1].to_string())
382            } else {
383                None
384            };
385            
386            Ok(Some(SpecialAttributeValue::Language {
387                language,
388                script: None, // Could be enhanced to parse script codes
389                territory,
390            }))
391        } else if lang_value.contains(' ') {
392            // Space-separated territory codes
393            let territories: Vec<String> = lang_value.split_whitespace()
394                .map(|s| s.to_string())
395                .collect();
396            Ok(Some(SpecialAttributeValue::Territory(territories)))
397        } else {
398            Ok(Some(SpecialAttributeValue::Language {
399                language: lang_value,
400                script: None,
401                territory: None,
402            }))
403        }
404    }
405
406    /// Process sequence number
407    fn process_sequence_number(&self, value: &AttributeValue) -> Result<Option<SpecialAttributeValue>, ParseError> {
408        match value {
409            AttributeValue::Integer(i) => Ok(Some(SpecialAttributeValue::Sequence(*i as u32))),
410            _ => {
411                if let Ok(seq) = value.to_xml_value().parse::<u32>() {
412                    Ok(Some(SpecialAttributeValue::Sequence(seq)))
413                } else {
414                    Ok(None)
415                }
416            }
417        }
418    }
419
420    /// Process boolean flag
421    fn process_boolean_flag(&self, value: &AttributeValue) -> Result<Option<SpecialAttributeValue>, ParseError> {
422        match value {
423            AttributeValue::Boolean(b) => Ok(Some(SpecialAttributeValue::Flag(*b))),
424            _ => {
425                let str_val = value.to_xml_value();
426                let bool_val = matches!(str_val.to_lowercase().as_str(), "true" | "1");
427                Ok(Some(SpecialAttributeValue::Flag(bool_val)))
428            }
429        }
430    }
431
432    /// Apply attribute inheritance from parent to child
433    pub fn apply_inheritance(
434        &self,
435        parent_attributes: &AttributeMap,
436        child_attributes: &mut AttributeMap,
437    ) {
438        let inheritance = ddex_core::models::AttributeInheritance::new();
439        inheritance.apply_inheritance(parent_attributes, child_attributes);
440    }
441
442    /// Validate extracted attributes
443    pub fn validate_attributes(&self, attributes: &AttributeMap) -> Vec<String> {
444        let mut errors = Vec::new();
445        
446        for (qname, value) in attributes.iter() {
447            if let Err(e) = value.validate() {
448                errors.push(format!("Invalid attribute {}: {}", qname, e));
449            }
450        }
451        
452        errors
453    }
454}
455
456impl Default for AttributeExtractor {
457    fn default() -> Self {
458        Self::new()
459    }
460}
461
#[cfg(test)]
mod tests {
    use super::*;
    use quick_xml::events::Event;
    use quick_xml::Reader;
    use std::io::Cursor;

    /// Namespace context with a fresh scope and no document-level namespaces.
    fn empty_namespace_context() -> NamespaceContext {
        NamespaceContext {
            current_scope: ddex_core::namespace::NamespaceScope::new(),
            document_namespaces: indexmap::IndexMap::new(),
            default_namespace: None,
            ern_version: None,
        }
    }

    /// Parse `xml` (a single self-closing element) and extract its attributes.
    ///
    /// Panics if the first event is not a self-closing element, so a test can
    /// never pass without having exercised the extractor.
    fn extract_from(xml: &str) -> AttributeExtractionResult {
        let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
        let mut buf = Vec::new();
        let start = match reader.read_event_into(&mut buf) {
            Ok(Event::Empty(start)) => start,
            other => panic!("expected a self-closing element, got {:?}", other),
        };

        let extractor = AttributeExtractor::new();
        let namespace_context = empty_namespace_context();
        let result = extractor
            .extract_attributes(&start, &namespace_context)
            .unwrap();
        result
    }

    #[test]
    fn test_attribute_extraction_basic() {
        let result = extract_from(
            r#"<Release title="Test Album" SequenceNumber="1" IsDefault="true" />"#,
        );

        assert_eq!(result.attributes.len(), 3);
        assert_eq!(result.attributes.get_str("title").unwrap().to_xml_value(), "Test Album");
        assert_eq!(result.attributes.get_str("SequenceNumber").unwrap().to_xml_value(), "1");
        assert_eq!(result.attributes.get_str("IsDefault").unwrap().to_xml_value(), "true");

        // Check type parsing
        match result.attributes.get_str("SequenceNumber") {
            Some(AttributeValue::Integer(seq)) => assert_eq!(*seq, 1),
            other => panic!("SequenceNumber should be parsed as integer, got {:?}", other),
        }

        match result.attributes.get_str("IsDefault") {
            Some(AttributeValue::Boolean(is_default)) => assert!(*is_default),
            other => panic!("IsDefault should be parsed as boolean, got {:?}", other),
        }
    }

    #[test]
    fn test_namespace_attribute_extraction() {
        let result = extract_from(
            r#"<ern:Release xmlns:ern="http://ddex.net/xml/ern/43" 
                                  xmlns:avs="http://ddex.net/xml/avs" 
                                  ern:title="Test" />"#,
        );

        assert_eq!(result.namespace_declarations.len(), 2);
        assert!(result.namespace_declarations.contains_key("ern"));
        assert!(result.namespace_declarations.contains_key("avs"));
    }

    #[test]
    fn test_special_attribute_processing() {
        let result = extract_from(
            r#"<element xsi:type="xs:string" 
                              xsi:nil="true"
                              xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                              xmlns:xs="http://www.w3.org/2001/XMLSchema" />"#,
        );

        assert!(!result.special_attributes.is_empty());

        // Check for xsi:nil by value, so the assertion does not depend on how
        // the (possibly unresolved) `xsi` prefix ends up in the key.
        assert!(
            result
                .special_attributes
                .values()
                .any(|value| *value == SpecialAttributeValue::Nil(true)),
            "xsi:nil=\"true\" should produce SpecialAttributeValue::Nil(true), got {:?}",
            result.special_attributes
        );
    }

    #[test]
    fn test_attribute_inheritance() {
        let mut parent_attrs = AttributeMap::new();
        parent_attrs.insert_str("LanguageAndScriptCode", "en-US");
        parent_attrs.insert_str("ApplicableTerritoryCode", "Worldwide");

        let mut child_attrs = AttributeMap::new();
        child_attrs.insert_str("title", "Child Title");

        let extractor = AttributeExtractor::new();
        extractor.apply_inheritance(&parent_attrs, &mut child_attrs);

        // Child should inherit language and territory
        assert!(child_attrs.get_str("LanguageAndScriptCode").is_some());
        assert!(child_attrs.get_str("ApplicableTerritoryCode").is_some());
        assert!(child_attrs.get_str("title").is_some());
    }

    #[test]
    fn test_ddex_standard_vs_extension_attributes() {
        let mut attributes = AttributeMap::new();
        attributes.insert_str("LanguageAndScriptCode", "en-US"); // Standard
        attributes.insert_str("custom:proprietary", "custom value"); // Extension
        attributes.insert_str("xmlns:custom", "http://example.com/custom"); // Namespace

        let standard = attributes.standard_attributes();
        let extensions = attributes.extension_attributes();

        assert!(!standard.is_empty()); // Should contain LanguageAndScriptCode
        assert!(!extensions.is_empty()); // Should contain custom:proprietary
    }
}
595}