exif_oxide/xmp/
processor.rs

1//! XMP processor implementation
2//!
3//! Processes XMP packets from various sources (standalone .xmp files, JPEG APP1,
4//! TIFF IFD0) and produces structured TagValue output.
5
6use crate::types::{TagEntry, TagValue};
7use anyhow::{Context, Result};
8use quick_xml::events::{BytesStart, Event};
9use quick_xml::name::{Namespace, ResolveResult};
10use quick_xml::reader::NsReader;
11use std::collections::HashMap;
12
13// Import generated namespace tables
14use crate::generated::XMP_pm::NAMESPACE_URIS;
15
/// XMP processor for structured metadata extraction.
///
/// Holds the lookup tables used while turning an XMP packet into a
/// namespace-grouped `TagValue` structure. A single instance can be reused
/// for several documents: the per-document map is cleared at the start of
/// every parse.
#[derive(Debug, Clone)]
pub struct XmpProcessor {
    /// URI to namespace prefix reverse lookup (following ExifTool's %uri2ns)
    uri_to_prefix: HashMap<String, String>,
    /// Namespace mappings (`prefix -> URI`) declared via `xmlns:prefix="uri"`
    /// in the XMP document currently being parsed
    current_ns_map: HashMap<String, String>,
}
23
24impl Default for XmpProcessor {
25    fn default() -> Self {
26        Self::new()
27    }
28}
29
30impl XmpProcessor {
31    /// Create a new XMP processor
32    pub fn new() -> Self {
33        // Build reverse namespace lookup (URI -> prefix) from generated tables
34        // This follows ExifTool's %uri2ns pattern (XMP.pm:215-221)
35        let mut uri_to_prefix = HashMap::new();
36
37        // Add special case for ExifTool namespace (same as ExifTool)
38        uri_to_prefix.insert("http://ns.exiftool.ca/1.0/".to_string(), "et".to_string());
39        uri_to_prefix.insert("http://ns.exiftool.org/1.0/".to_string(), "et".to_string());
40
41        // Build reverse mapping from generated NAMESPACE_URIS
42        for (prefix, uri) in NAMESPACE_URIS.iter() {
43            uri_to_prefix.insert(uri.to_string(), prefix.to_string());
44        }
45
46        Self {
47            uri_to_prefix,
48            current_ns_map: HashMap::new(),
49        }
50    }
51
52    /// Process XMP data and return structured TagEntry
53    ///
54    /// Returns a single TagEntry with tag_id "XMP" containing the entire
55    /// XMP structure as a TagValue::Object with namespace grouping.
56    pub fn process_xmp_data(&mut self, data: &[u8]) -> Result<TagEntry> {
57        // Detect and handle BOM if present, and convert UTF-16 if needed
58        let processed_data = self.strip_bom(data);
59
60        // Convert to string for XML parsing
61        let xmp_str =
62            std::str::from_utf8(&processed_data).context("XMP data is not valid UTF-8")?;
63
64        // Parse XML and build structure
65        let xmp_structure = self.parse_xmp_xml(xmp_str)?;
66
67        // Create TagEntry with structured data
68        Ok(TagEntry {
69            group: "XMP".to_string(),
70            group1: "XMP".to_string(),
71            name: "XMP".to_string(),
72            value: TagValue::Object(xmp_structure.clone()),
73            print: TagValue::Object(xmp_structure),
74        })
75    }
76
77    /// Strip UTF BOM if present and handle UTF-16 conversion
78    fn strip_bom<'a>(&self, data: &'a [u8]) -> std::borrow::Cow<'a, [u8]> {
79        use std::borrow::Cow;
80
81        // UTF-8 BOM
82        if data.starts_with(b"\xEF\xBB\xBF") {
83            return Cow::Borrowed(&data[3..]);
84        }
85
86        // UTF-16 BE BOM
87        if data.starts_with(b"\xFE\xFF") {
88            return Cow::Owned(self.convert_utf16_be_to_utf8(&data[2..]));
89        }
90
91        // UTF-16 LE BOM
92        if data.starts_with(b"\xFF\xFE") {
93            return Cow::Owned(self.convert_utf16_le_to_utf8(&data[2..]));
94        }
95
96        // Check if data looks like UTF-16 LE without BOM (starts with '<' followed by null)
97        if data.len() >= 4 && data[0] == b'<' && data[1] == 0 {
98            return Cow::Owned(self.convert_utf16_le_to_utf8(data));
99        }
100
101        // Check if data looks like UTF-16 BE without BOM (starts with null followed by '<')
102        if data.len() >= 4 && data[0] == 0 && data[1] == b'<' {
103            return Cow::Owned(self.convert_utf16_be_to_utf8(data));
104        }
105
106        Cow::Borrowed(data)
107    }
108
109    /// Convert UTF-16 LE bytes to UTF-8
110    fn convert_utf16_le_to_utf8(&self, data: &[u8]) -> Vec<u8> {
111        // Convert bytes to u16 pairs (little-endian)
112        let utf16_chars: Vec<u16> = data
113            .chunks_exact(2)
114            .map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
115            .collect();
116
117        // Convert UTF-16 to string, then to UTF-8 bytes
118        String::from_utf16_lossy(&utf16_chars).into_bytes()
119    }
120
121    /// Convert UTF-16 BE bytes to UTF-8
122    fn convert_utf16_be_to_utf8(&self, data: &[u8]) -> Vec<u8> {
123        // Convert bytes to u16 pairs (big-endian)
124        let utf16_chars: Vec<u16> = data
125            .chunks_exact(2)
126            .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
127            .collect();
128
129        // Convert UTF-16 to string, then to UTF-8 bytes
130        String::from_utf16_lossy(&utf16_chars).into_bytes()
131    }
132
133    /// Parse XMP XML and build structured representation
134    fn parse_xmp_xml(&mut self, xml: &str) -> Result<HashMap<String, TagValue>> {
135        let mut reader = NsReader::from_str(xml);
136        reader.config_mut().trim_text(true);
137
138        let mut buf = Vec::new();
139        let mut root_object = HashMap::new();
140        let mut namespace_objects: HashMap<String, HashMap<String, TagValue>> = HashMap::new();
141
142        // Clear current namespace mappings for this document
143        self.current_ns_map.clear();
144
145        // Stack to track our position in the XML tree
146        let mut element_stack: Vec<ElementContext> = Vec::new();
147
148        loop {
149            match reader.read_resolved_event_into(&mut buf) {
150                Ok((ns_result, Event::Start(e))) => {
151                    let element_local_name = e.local_name();
152                    let local_name = std::str::from_utf8(element_local_name.as_ref())
153                        .context("Invalid UTF-8 in element name")?;
154
155                    // Extract namespace URI from resolved result
156                    let namespace_uri = match ns_result {
157                        ResolveResult::Bound(Namespace(ns_bytes)) => Some(
158                            std::str::from_utf8(ns_bytes)
159                                .context("Invalid UTF-8 in namespace URI")?
160                                .to_string(),
161                        ),
162                        _ => None,
163                    };
164
165                    // Process element based on context
166                    self.process_start_element(
167                        &e,
168                        local_name,
169                        namespace_uri.as_deref(),
170                        &reader,
171                        &mut element_stack,
172                        &mut namespace_objects,
173                    )?;
174                }
175                Ok((_, Event::Text(e))) => {
176                    let text = e.decode()?.into_owned();
177                    if !text.trim().is_empty() {
178                        self.process_text_content(
179                            text,
180                            &mut element_stack,
181                            &mut namespace_objects,
182                        )?;
183                    }
184                }
185                Ok((_, Event::End(_))) => {
186                    if !element_stack.is_empty() {
187                        element_stack.pop();
188                    }
189                }
190                Ok((_, Event::Eof)) => break,
191                Err(e) => return Err(anyhow::anyhow!("XML parsing error: {}", e)),
192                _ => {} // Ignore other events
193            }
194            buf.clear();
195        }
196
197        // Convert namespace objects to final structure
198        for (ns_prefix, properties) in namespace_objects {
199            if !properties.is_empty() {
200                root_object.insert(ns_prefix, TagValue::Object(properties));
201            }
202        }
203
204        Ok(root_object)
205    }
206
207    /// Process start element
208    fn process_start_element(
209        &mut self,
210        element: &BytesStart,
211        local_name: &str,
212        namespace_uri: Option<&str>,
213        reader: &NsReader<&[u8]>,
214        element_stack: &mut Vec<ElementContext>,
215        _namespace_objects: &mut HashMap<String, HashMap<String, TagValue>>,
216    ) -> Result<()> {
217        // Process namespace declarations from attributes
218        for attr in element.attributes() {
219            let attr = attr?;
220            let key = std::str::from_utf8(attr.key.as_ref())?;
221
222            // Check for namespace declarations (xmlns:prefix="uri")
223            if let Some(prefix) = key.strip_prefix("xmlns:") {
224                let uri = std::str::from_utf8(&attr.value)?;
225                // Store the mapping discovered in this document
226                self.current_ns_map
227                    .insert(prefix.to_string(), uri.to_string());
228            }
229        }
230
231        // Detect RDF containers
232        let container_type = match local_name {
233            "Bag" => Some(RdfContainerType::Bag),
234            "Seq" => Some(RdfContainerType::Seq),
235            "Alt" => Some(RdfContainerType::Alt),
236            _ => None,
237        };
238
239        // Extract attributes, especially xml:lang for Alt containers
240        let mut lang_attr = None;
241        for attr in element.attributes() {
242            let attr = attr?;
243            let (_, attr_local) = reader.resolve_attribute(attr.key);
244            let attr_name = std::str::from_utf8(attr_local.as_ref())?;
245
246            if attr_name == "lang" {
247                lang_attr = Some(std::str::from_utf8(&attr.value)?.to_string());
248            }
249        }
250
251        // Determine namespace prefix from resolved URI
252        let property_ns = if let Some(uri) = namespace_uri {
253            self.get_namespace_prefix(uri)
254        } else {
255            None
256        };
257
258        // Create element context
259        let context = ElementContext {
260            local_name: local_name.to_string(),
261            namespace_prefix: property_ns,
262            container_type,
263            language: lang_attr,
264            is_rdf_li: local_name == "li",
265        };
266
267        element_stack.push(context);
268
269        Ok(())
270    }
271
272    /// Process text content
273    fn process_text_content(
274        &self,
275        text: String,
276        element_stack: &mut [ElementContext],
277        namespace_objects: &mut HashMap<String, HashMap<String, TagValue>>,
278    ) -> Result<()> {
279        if element_stack.len() < 2 {
280            return Ok(()); // Not enough context
281        }
282
283        // For RDF list items, we need to find the property element that contains the container
284        // Example stack: [rdf:Description, dc:creator, rdf:Seq, rdf:li]
285        let mut property_element = None;
286        let mut container_element = None;
287
288        // Walk up the stack to find the property and container
289        for i in (0..element_stack.len()).rev() {
290            let elem = &element_stack[i];
291
292            // Skip RDF structural elements
293            if elem.local_name == "li"
294                || elem.local_name == "Description"
295                || elem.local_name == "RDF"
296            {
297                continue;
298            }
299
300            // Found a container
301            if elem.container_type.is_some() {
302                container_element = Some(elem);
303                // The property should be the element before the container
304                if i > 0 {
305                    let prev = &element_stack[i - 1];
306                    if prev.namespace_prefix.is_some() && prev.container_type.is_none() {
307                        property_element = Some(prev);
308                        break;
309                    }
310                }
311            } else if elem.namespace_prefix.is_some() && property_element.is_none() {
312                // This might be a simple property without a container
313                property_element = Some(elem);
314                if container_element.is_some() {
315                    break;
316                }
317            }
318        }
319
320        // Get the property namespace and name
321        if let Some(prop) = property_element {
322            if let Some(ns) = &prop.namespace_prefix {
323                let ns_object = namespace_objects.entry(ns.clone()).or_default();
324
325                let property_name = prop.local_name.clone();
326
327                // Handle based on container type
328                if let Some(container) = container_element {
329                    match container.container_type {
330                        Some(RdfContainerType::Bag) | Some(RdfContainerType::Seq) => {
331                            // Add to array
332                            let array = ns_object
333                                .entry(property_name)
334                                .or_insert_with(|| TagValue::Array(Vec::new()));
335
336                            if let Some(arr) = array.as_array_mut() {
337                                arr.push(TagValue::string(text));
338                            }
339                        }
340                        Some(RdfContainerType::Alt) => {
341                            // Add to language alternatives object
342                            let alt_object = ns_object
343                                .entry(property_name)
344                                .or_insert_with(|| TagValue::Object(HashMap::new()));
345
346                            if let Some(obj) = alt_object.as_object_mut() {
347                                let current = &element_stack[element_stack.len() - 1];
348                                let lang_key = current.language.as_deref().unwrap_or("x-default");
349                                obj.insert(lang_key.to_string(), TagValue::string(text));
350                            }
351                        }
352                        None => {
353                            // Should not happen if we have a container_element
354                        }
355                    }
356                } else {
357                    // Simple property without container
358                    ns_object.insert(property_name, TagValue::string(text));
359                }
360            }
361        }
362
363        Ok(())
364    }
365
366    /// Get namespace prefix from URI
367    /// Following ExifTool's approach, uses the generated reverse lookup table
368    fn get_namespace_prefix(&self, uri: &str) -> Option<String> {
369        // Check our reverse URI to prefix mapping
370        // This includes all standard namespaces from generated tables
371        self.uri_to_prefix.get(uri).cloned()
372    }
373
374    /// Extract a reasonable prefix from namespace URI
375    /// TODO: Used for unknown namespace handling in future implementation
376    #[allow(dead_code)]
377    fn extract_prefix_from_uri(&self, uri: &str) -> String {
378        // Common namespace patterns
379        if uri.contains("/dc/") {
380            return "dc".to_string();
381        }
382        if uri.contains("/xmp/") || uri.contains("/xap/") {
383            return "xmp".to_string();
384        }
385        if uri.contains("/exif/") {
386            return "exif".to_string();
387        }
388        if uri.contains("/tiff/") {
389            return "tiff".to_string();
390        }
391        if uri.contains("/photoshop/") {
392            return "photoshop".to_string();
393        }
394        if uri.contains("/crs/") {
395            return "crs".to_string();
396        }
397
398        // Extract last path component
399        uri.trim_end_matches('/')
400            .split('/')
401            .next_back()
402            .unwrap_or("unknown")
403            .split('#')
404            .next_back()
405            .unwrap_or("unknown")
406            .to_string()
407    }
408}
409
/// Context for tracking XML element state.
///
/// One instance is pushed onto the parser's element stack for every start tag
/// and popped again on the matching end tag.
#[derive(Debug)]
struct ElementContext {
    /// Element name without its namespace prefix
    local_name: String,
    /// Known prefix for the element's namespace URI; `None` when the
    /// namespace is unresolved or not in the lookup table
    namespace_prefix: Option<String>,
    /// Set when the element was recognised as a `Bag` / `Seq` / `Alt` container
    container_type: Option<RdfContainerType>,
    /// Value of a `lang` attribute found on the element, if any
    language: Option<String>,
    /// Whether the element's local name is `li`
    ///
    /// TODO: Used for RDF list item context tracking in future implementation
    #[allow(dead_code)]
    is_rdf_li: bool,
}
421
/// RDF container types
#[derive(Debug, Clone, Copy)]
enum RdfContainerType {
    /// Unordered list; items are collected into an array
    Bag,
    /// Ordered sequence; items are collected into an array
    Seq,
    /// Language alternatives; items are keyed by their language
    Alt,
}
429
#[cfg(test)]
mod tests {
    use super::*;

    /// Run a fresh processor over `xml`, panicking if processing fails.
    fn process(xml: &str) -> TagEntry {
        XmpProcessor::new()
            .process_xmp_data(xml.as_bytes())
            .unwrap()
    }

    /// Smoke test: a single simple property parses without error.
    /// The resulting structure is only printed, not asserted on.
    #[test]
    fn test_minimal_xmp() {
        let xmp_data = r#"<?xml version="1.0"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/">
      <dc:title>Test Title</dc:title>
    </rdf:Description>
  </rdf:RDF>
</x:xmpmeta>"#;

        let entry = process(xmp_data);

        assert_eq!(entry.name, "XMP");
        if let TagValue::Object(xmp) = &entry.value {
            eprintln!("Minimal XMP keys: {:?}", xmp.keys().collect::<Vec<_>>());
            for (key, value) in xmp {
                eprintln!("  {key}: {value:?}");
            }
        }
    }

    /// A `Seq` becomes an array and an `Alt` becomes a language-keyed object.
    /// The per-property checks only run when the `dc` namespace is present.
    #[test]
    fn test_simple_xmp_parsing() {
        let xmp_data = r#"<?xml version="1.0"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/">
      <dc:creator>
        <rdf:Seq>
          <rdf:li>John Doe</rdf:li>
          <rdf:li>Jane Smith</rdf:li>
        </rdf:Seq>
      </dc:creator>
      <dc:title>
        <rdf:Alt>
          <rdf:li xml:lang="x-default">Test Photo</rdf:li>
          <rdf:li xml:lang="en-US">Test Photo</rdf:li>
        </rdf:Alt>
      </dc:title>
    </rdf:Description>
  </rdf:RDF>
</x:xmpmeta>"#;

        let entry = process(xmp_data);

        assert_eq!(entry.name, "XMP");
        assert!(matches!(entry.value, TagValue::Object(_)));

        // Check structure
        if let TagValue::Object(xmp) = &entry.value {
            // Debug: print what we actually got
            eprintln!("XMP structure keys: {:?}", xmp.keys().collect::<Vec<_>>());
            for (key, value) in xmp {
                eprintln!("  {key}: {value:?}");
            }

            // For now, just check that we have some content
            assert!(!xmp.is_empty(), "XMP structure should not be empty");

            // TODO: Fix namespace extraction to properly identify 'dc' namespace
            // assert!(xmp.contains_key("dc"), "Expected 'dc' namespace in XMP structure");

            if let Some(TagValue::Object(dc)) = xmp.get("dc") {
                // Creator list keeps document order
                if let Some(TagValue::Array(creators)) = dc.get("creator") {
                    assert_eq!(creators.len(), 2);
                    assert_eq!(creators[0].as_string(), Some("John Doe"));
                    assert_eq!(creators[1].as_string(), Some("Jane Smith"));
                }

                // Title alternatives are keyed by language
                if let Some(TagValue::Object(titles)) = dc.get("title") {
                    assert_eq!(
                        titles.get("x-default").unwrap().as_string(),
                        Some("Test Photo")
                    );
                    assert_eq!(titles.get("en-US").unwrap().as_string(), Some("Test Photo"));
                }
            }
        }
    }
}