// oxidize_pdf/pdfa/xmp.rs

1//! XMP Metadata for PDF/A compliance
2
3use super::error::{PdfAError, PdfAResult};
4use super::types::PdfAConformance;
5use regex::Regex;
6use std::str::FromStr;
7
8/// PDF/A identification in XMP metadata
9#[derive(Debug, Clone, PartialEq, Eq)]
10pub struct XmpPdfAIdentifier {
11    /// PDF/A part (1, 2, or 3)
12    pub part: u8,
13    /// PDF/A conformance level (A, B, or U)
14    pub conformance: PdfAConformance,
15    /// Amendment (optional, e.g., "amd1")
16    pub amd: Option<String>,
17    /// Corrigenda (optional)
18    pub corr: Option<String>,
19}
20
21impl XmpPdfAIdentifier {
22    /// Create a new PDF/A identifier
23    pub fn new(part: u8, conformance: PdfAConformance) -> Self {
24        Self {
25            part,
26            conformance,
27            amd: None,
28            corr: None,
29        }
30    }
31
32    /// Generate XMP RDF for this identifier
33    pub fn to_rdf(&self) -> String {
34        let mut rdf = format!(
35            r#"    <rdf:Description rdf:about=""
36        xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/">
37      <pdfaid:part>{}</pdfaid:part>
38      <pdfaid:conformance>{}</pdfaid:conformance>"#,
39            self.part, self.conformance
40        );
41
42        if let Some(ref amd) = self.amd {
43            rdf.push_str(&format!("\n      <pdfaid:amd>{}</pdfaid:amd>", amd));
44        }
45
46        if let Some(ref corr) = self.corr {
47            rdf.push_str(&format!("\n      <pdfaid:corr>{}</pdfaid:corr>", corr));
48        }
49
50        rdf.push_str("\n    </rdf:Description>");
51        rdf
52    }
53}
54
55/// XMP Metadata for PDF documents
56#[derive(Debug, Clone, Default)]
57pub struct XmpMetadata {
58    /// Document title
59    pub title: Option<String>,
60    /// Document creator/author(s)
61    pub creator: Option<Vec<String>>,
62    /// Document description/subject
63    pub description: Option<String>,
64    /// Keywords
65    pub keywords: Option<Vec<String>>,
66    /// Creation date (ISO 8601)
67    pub create_date: Option<String>,
68    /// Modification date (ISO 8601)
69    pub modify_date: Option<String>,
70    /// Creator tool/application
71    pub creator_tool: Option<String>,
72    /// PDF/A identification (required for PDF/A)
73    pub pdfa_id: Option<XmpPdfAIdentifier>,
74    /// Document ID
75    pub document_id: Option<String>,
76    /// Instance ID
77    pub instance_id: Option<String>,
78}
79
80impl XmpMetadata {
81    /// Create a new empty XMP metadata
82    pub fn new() -> Self {
83        Self::default()
84    }
85
86    /// Parse XMP metadata from XML string
87    pub fn parse(xml: &str) -> PdfAResult<Self> {
88        let mut metadata = Self::new();
89
90        // Parse title
91        if let Some(title) = Self::extract_simple_value(xml, "dc:title") {
92            metadata.title = Some(title);
93        }
94
95        // Parse creator (can be a list)
96        if let Some(creator) = Self::extract_list_value(xml, "dc:creator") {
97            metadata.creator = Some(creator);
98        }
99
100        // Parse description
101        if let Some(desc) = Self::extract_simple_value(xml, "dc:description") {
102            metadata.description = Some(desc);
103        }
104
105        // Parse keywords
106        if let Some(keywords) = Self::extract_list_value(xml, "pdf:Keywords")
107            .or_else(|| Self::extract_list_value(xml, "dc:subject"))
108        {
109            metadata.keywords = Some(keywords);
110        }
111
112        // Parse dates
113        if let Some(date) = Self::extract_simple_value(xml, "xmp:CreateDate") {
114            metadata.create_date = Some(date);
115        }
116        if let Some(date) = Self::extract_simple_value(xml, "xmp:ModifyDate") {
117            metadata.modify_date = Some(date);
118        }
119
120        // Parse creator tool
121        if let Some(tool) = Self::extract_simple_value(xml, "xmp:CreatorTool") {
122            metadata.creator_tool = Some(tool);
123        }
124
125        // Parse PDF/A identification
126        if let (Some(part_str), Some(conf_str)) = (
127            Self::extract_simple_value(xml, "pdfaid:part"),
128            Self::extract_simple_value(xml, "pdfaid:conformance"),
129        ) {
130            if let (Ok(part), Ok(conformance)) =
131                (part_str.parse::<u8>(), PdfAConformance::from_str(&conf_str))
132            {
133                let mut pdfa_id = XmpPdfAIdentifier::new(part, conformance);
134                pdfa_id.amd = Self::extract_simple_value(xml, "pdfaid:amd");
135                pdfa_id.corr = Self::extract_simple_value(xml, "pdfaid:corr");
136                metadata.pdfa_id = Some(pdfa_id);
137            }
138        }
139
140        // Parse document/instance IDs
141        metadata.document_id = Self::extract_simple_value(xml, "xmpMM:DocumentID");
142        metadata.instance_id = Self::extract_simple_value(xml, "xmpMM:InstanceID");
143
144        Ok(metadata)
145    }
146
147    /// Extract a simple value from XMP
148    fn extract_simple_value(xml: &str, tag: &str) -> Option<String> {
149        // Try element form: <tag>value</tag>
150        let pattern = format!(r"<{tag}[^>]*>([^<]*)</{tag}>", tag = regex::escape(tag));
151        if let Ok(re) = Regex::new(&pattern) {
152            if let Some(caps) = re.captures(xml) {
153                return Some(caps[1].trim().to_string());
154            }
155        }
156
157        // Try Alt form: <tag><rdf:Alt><rdf:li...>value</rdf:li></rdf:Alt></tag>
158        let alt_pattern = format!(
159            r"<{tag}[^>]*>\s*<rdf:Alt[^>]*>\s*<rdf:li[^>]*>([^<]*)</rdf:li>",
160            tag = regex::escape(tag)
161        );
162        if let Ok(re) = Regex::new(&alt_pattern) {
163            if let Some(caps) = re.captures(xml) {
164                return Some(caps[1].trim().to_string());
165            }
166        }
167
168        None
169    }
170
171    /// Extract a list value from XMP (Seq or Bag)
172    fn extract_list_value(xml: &str, tag: &str) -> Option<Vec<String>> {
173        // Match the entire tag content (use [\s\S] to match newlines)
174        let pattern = format!(r"(?s)<{tag}[^>]*>(.*?)</{tag}>", tag = regex::escape(tag));
175
176        if let Ok(re) = Regex::new(&pattern) {
177            if let Some(caps) = re.captures(xml) {
178                let content = &caps[1];
179                // Extract all li elements
180                if let Ok(li_re) = Regex::new(r"<rdf:li[^>]*>([^<]*)</rdf:li>") {
181                    let values: Vec<String> = li_re
182                        .captures_iter(content)
183                        .map(|c| c[1].trim().to_string())
184                        .filter(|s| !s.is_empty())
185                        .collect();
186                    if !values.is_empty() {
187                        return Some(values);
188                    }
189                }
190            }
191        }
192
193        None
194    }
195
196    /// Generate XMP XML for this metadata
197    pub fn to_xml(&self) -> String {
198        let mut xml = String::from(
199            r#"<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
200<x:xmpmeta xmlns:x="adobe:ns:meta/">
201  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">"#,
202        );
203
204        // DC (Dublin Core) namespace
205        xml.push_str("\n    <rdf:Description rdf:about=\"\"\n        xmlns:dc=\"http://purl.org/dc/elements/1.1/\">");
206
207        if let Some(ref title) = self.title {
208            xml.push_str(&format!(
209                "\n      <dc:title>\n        <rdf:Alt>\n          <rdf:li xml:lang=\"x-default\">{}</rdf:li>\n        </rdf:Alt>\n      </dc:title>",
210                Self::xml_escape(title)
211            ));
212        }
213
214        if let Some(ref creators) = self.creator {
215            xml.push_str("\n      <dc:creator>\n        <rdf:Seq>");
216            for creator in creators {
217                xml.push_str(&format!(
218                    "\n          <rdf:li>{}</rdf:li>",
219                    Self::xml_escape(creator)
220                ));
221            }
222            xml.push_str("\n        </rdf:Seq>\n      </dc:creator>");
223        }
224
225        if let Some(ref desc) = self.description {
226            xml.push_str(&format!(
227                "\n      <dc:description>\n        <rdf:Alt>\n          <rdf:li xml:lang=\"x-default\">{}</rdf:li>\n        </rdf:Alt>\n      </dc:description>",
228                Self::xml_escape(desc)
229            ));
230        }
231
232        xml.push_str("\n    </rdf:Description>");
233
234        // XMP namespace
235        xml.push_str("\n    <rdf:Description rdf:about=\"\"\n        xmlns:xmp=\"http://ns.adobe.com/xap/1.0/\">");
236
237        if let Some(ref tool) = self.creator_tool {
238            xml.push_str(&format!(
239                "\n      <xmp:CreatorTool>{}</xmp:CreatorTool>",
240                Self::xml_escape(tool)
241            ));
242        }
243
244        if let Some(ref date) = self.create_date {
245            xml.push_str(&format!(
246                "\n      <xmp:CreateDate>{}</xmp:CreateDate>",
247                date
248            ));
249        }
250
251        if let Some(ref date) = self.modify_date {
252            xml.push_str(&format!(
253                "\n      <xmp:ModifyDate>{}</xmp:ModifyDate>",
254                date
255            ));
256        }
257
258        xml.push_str("\n    </rdf:Description>");
259
260        // PDF/A identification
261        if let Some(ref pdfa_id) = self.pdfa_id {
262            xml.push_str(&format!("\n{}", pdfa_id.to_rdf()));
263        }
264
265        // XMP Media Management
266        if self.document_id.is_some() || self.instance_id.is_some() {
267            xml.push_str("\n    <rdf:Description rdf:about=\"\"\n        xmlns:xmpMM=\"http://ns.adobe.com/xap/1.0/mm/\">");
268            if let Some(ref doc_id) = self.document_id {
269                xml.push_str(&format!(
270                    "\n      <xmpMM:DocumentID>{}</xmpMM:DocumentID>",
271                    doc_id
272                ));
273            }
274            if let Some(ref inst_id) = self.instance_id {
275                xml.push_str(&format!(
276                    "\n      <xmpMM:InstanceID>{}</xmpMM:InstanceID>",
277                    inst_id
278                ));
279            }
280            xml.push_str("\n    </rdf:Description>");
281        }
282
283        xml.push_str("\n  </rdf:RDF>\n</x:xmpmeta>\n<?xpacket end=\"w\"?>");
284        xml
285    }
286
287    /// Escape special XML characters
288    fn xml_escape(s: &str) -> String {
289        s.replace('&', "&amp;")
290            .replace('<', "&lt;")
291            .replace('>', "&gt;")
292            .replace('"', "&quot;")
293            .replace('\'', "&apos;")
294    }
295
296    /// Validate this metadata for PDF/A compliance
297    pub fn validate_for_pdfa(&self) -> PdfAResult<()> {
298        // PDF/A requires PDF/A identification
299        if self.pdfa_id.is_none() {
300            return Err(PdfAError::XmpParseError(
301                "PDF/A identification is required".to_string(),
302            ));
303        }
304
305        Ok(())
306    }
307}
308
309#[cfg(test)]
310mod tests {
311    use super::*;
312
313    #[test]
314    fn test_xmp_pdfa_identifier_new() {
315        let id = XmpPdfAIdentifier::new(1, PdfAConformance::B);
316        assert_eq!(id.part, 1);
317        assert_eq!(id.conformance, PdfAConformance::B);
318        assert!(id.amd.is_none());
319        assert!(id.corr.is_none());
320    }
321
322    #[test]
323    fn test_xmp_pdfa_identifier_to_rdf() {
324        let id = XmpPdfAIdentifier::new(2, PdfAConformance::U);
325        let rdf = id.to_rdf();
326        assert!(rdf.contains("<pdfaid:part>2</pdfaid:part>"));
327        assert!(rdf.contains("<pdfaid:conformance>U</pdfaid:conformance>"));
328    }
329
330    #[test]
331    fn test_xmp_metadata_new() {
332        let metadata = XmpMetadata::new();
333        assert!(metadata.title.is_none());
334        assert!(metadata.creator.is_none());
335        assert!(metadata.pdfa_id.is_none());
336    }
337
338    #[test]
339    fn test_xmp_metadata_parse_title() {
340        let xml = r#"<dc:title><rdf:Alt><rdf:li xml:lang="x-default">Test Title</rdf:li></rdf:Alt></dc:title>"#;
341        let metadata = XmpMetadata::parse(xml).unwrap();
342        assert_eq!(metadata.title.as_deref(), Some("Test Title"));
343    }
344
345    #[test]
346    fn test_xmp_metadata_parse_pdfa_id() {
347        let xml = r#"
348            <pdfaid:part>1</pdfaid:part>
349            <pdfaid:conformance>B</pdfaid:conformance>
350        "#;
351        let metadata = XmpMetadata::parse(xml).unwrap();
352        assert!(metadata.pdfa_id.is_some());
353        let pdfa_id = metadata.pdfa_id.unwrap();
354        assert_eq!(pdfa_id.part, 1);
355        assert_eq!(pdfa_id.conformance, PdfAConformance::B);
356    }
357
358    #[test]
359    fn test_xmp_metadata_parse_creator_list() {
360        let xml = r#"
361            <dc:creator>
362                <rdf:Seq>
363                    <rdf:li>Author One</rdf:li>
364                    <rdf:li>Author Two</rdf:li>
365                </rdf:Seq>
366            </dc:creator>
367        "#;
368        let metadata = XmpMetadata::parse(xml).unwrap();
369        assert!(metadata.creator.is_some());
370        let creators = metadata.creator.unwrap();
371        assert_eq!(creators.len(), 2);
372        assert_eq!(creators[0], "Author One");
373        assert_eq!(creators[1], "Author Two");
374    }
375
376    #[test]
377    fn test_xmp_metadata_to_xml() {
378        let mut metadata = XmpMetadata::new();
379        metadata.title = Some("Test Document".to_string());
380        metadata.creator = Some(vec!["Test Author".to_string()]);
381        metadata.pdfa_id = Some(XmpPdfAIdentifier::new(1, PdfAConformance::B));
382
383        let xml = metadata.to_xml();
384        assert!(xml.contains("Test Document"));
385        assert!(xml.contains("Test Author"));
386        assert!(xml.contains("pdfaid:part"));
387    }
388
389    #[test]
390    fn test_xmp_metadata_validate_for_pdfa_missing_id() {
391        let metadata = XmpMetadata::new();
392        assert!(metadata.validate_for_pdfa().is_err());
393    }
394
395    #[test]
396    fn test_xmp_metadata_validate_for_pdfa_with_id() {
397        let mut metadata = XmpMetadata::new();
398        metadata.pdfa_id = Some(XmpPdfAIdentifier::new(1, PdfAConformance::B));
399        assert!(metadata.validate_for_pdfa().is_ok());
400    }
401
402    #[test]
403    fn test_xml_escape() {
404        assert_eq!(XmpMetadata::xml_escape("<test>"), "&lt;test&gt;");
405        assert_eq!(XmpMetadata::xml_escape("a & b"), "a &amp; b");
406        assert_eq!(XmpMetadata::xml_escape("\"quoted\""), "&quot;quoted&quot;");
407    }
408
409    #[test]
410    fn test_xmp_pdfa_identifier_with_amd() {
411        let mut id = XmpPdfAIdentifier::new(1, PdfAConformance::B);
412        id.amd = Some("amd1".to_string());
413        let rdf = id.to_rdf();
414        assert!(rdf.contains("<pdfaid:amd>amd1</pdfaid:amd>"));
415    }
416
417    #[test]
418    fn test_xmp_metadata_parse_dates() {
419        let xml = r#"
420            <xmp:CreateDate>2024-01-15T10:30:00Z</xmp:CreateDate>
421            <xmp:ModifyDate>2024-01-16T14:00:00Z</xmp:ModifyDate>
422        "#;
423        let metadata = XmpMetadata::parse(xml).unwrap();
424        assert_eq!(
425            metadata.create_date.as_deref(),
426            Some("2024-01-15T10:30:00Z")
427        );
428        assert_eq!(
429            metadata.modify_date.as_deref(),
430            Some("2024-01-16T14:00:00Z")
431        );
432    }
433
434    #[test]
435    fn test_xmp_metadata_roundtrip() {
436        let mut original = XmpMetadata::new();
437        original.title = Some("Roundtrip Test".to_string());
438        original.creator = Some(vec!["Author".to_string()]);
439        original.pdfa_id = Some(XmpPdfAIdentifier::new(2, PdfAConformance::U));
440
441        let xml = original.to_xml();
442        let parsed = XmpMetadata::parse(&xml).unwrap();
443
444        assert_eq!(parsed.title, original.title);
445        assert_eq!(parsed.pdfa_id.as_ref().unwrap().part, 2);
446        assert_eq!(
447            parsed.pdfa_id.as_ref().unwrap().conformance,
448            PdfAConformance::U
449        );
450    }
451
452    #[test]
453    fn test_xmp_metadata_parse_simple_tag() {
454        let xml = r#"<xmp:CreatorTool>oxidize-pdf 1.6.0</xmp:CreatorTool>"#;
455        let metadata = XmpMetadata::parse(xml).unwrap();
456        assert_eq!(metadata.creator_tool.as_deref(), Some("oxidize-pdf 1.6.0"));
457    }
458
459    #[test]
460    fn test_xmp_pdfa_identifier_clone() {
461        let id1 = XmpPdfAIdentifier::new(3, PdfAConformance::A);
462        let id2 = id1.clone();
463        assert_eq!(id1, id2);
464    }
465
466    #[test]
467    fn test_xmp_metadata_clone() {
468        let mut metadata = XmpMetadata::new();
469        metadata.title = Some("Clone Test".to_string());
470        let cloned = metadata.clone();
471        assert_eq!(cloned.title, metadata.title);
472    }
473}