Skip to main content

rpdfium_doc/
xmp.rs

//! XMP metadata extraction from PDF streams.
//!
//! Extracts Dublin Core and XMP metadata properties from the raw XML bytes
//! of a PDF's `/Metadata` stream. Uses simple string scanning to avoid
//! requiring an XML parser dependency.
6
/// Type of shared form workflow detected in XMP metadata.
///
/// Corresponds to PDFium's `CPDF_Metadata::CheckForSharedForm()` which
/// scans for the Acrobat ad-hoc workflow namespace and extracts the
/// workflow type.
///
/// The default value is [`SharedFormType::None`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum SharedFormType {
    /// No shared form workflow detected.
    #[default]
    None,
    /// Email-based workflow (`workflowType` = 0).
    EmailBased,
    /// Acrobat.com-based workflow (`workflowType` = 1).
    AcrobatCom,
    /// Filesystem-based workflow (`workflowType` = 2).
    FilesystemBased,
}
24
25/// Check if the XMP metadata indicates a shared form workflow.
26///
27/// Scans for the `adhocwf` or `AcrobatAdhocWorkflow` namespace and
28/// extracts the `workflowType` value.
29pub fn check_for_shared_form(xml: &[u8]) -> SharedFormType {
30    let text = String::from_utf8_lossy(xml);
31
32    // Look for the adhoc workflow namespace
33    if !text.contains("adhocwf") && !text.contains("AcrobatAdhocWorkflow") {
34        return SharedFormType::None;
35    }
36
37    // Extract workflowType value — scan for the tag
38    let workflow_value = extract_simple_tag(&text, "adhocwf:workflowType")
39        .or_else(|| extract_simple_tag(&text, "AcrobatAdhocWorkflow:workflowType"));
40
41    match workflow_value.as_deref() {
42        Some("0") => SharedFormType::EmailBased,
43        Some("1") => SharedFormType::AcrobatCom,
44        Some("2") => SharedFormType::FilesystemBased,
45        // Namespace present but no recognizable type — treat as email (upstream default)
46        Some(_) => SharedFormType::EmailBased,
47        None => SharedFormType::EmailBased,
48    }
49}
50
/// Parsed XMP metadata from a PDF metadata stream.
///
/// Every field is optional; a field is `None` when the corresponding
/// property was not found or had no text. The default value has all
/// fields set to `None`.
#[derive(Debug, Clone, Default)]
pub struct XmpMetadata {
    /// Document title (`dc:title`).
    pub title: Option<String>,
    /// Document creator/author (`dc:creator`).
    pub creator: Option<String>,
    /// Document description (`dc:description`).
    pub description: Option<String>,
    /// Creation date (`xmp:CreateDate`), kept as the raw string from the XML.
    pub create_date: Option<String>,
    /// Modification date (`xmp:ModifyDate`), kept as the raw string from the XML.
    pub modify_date: Option<String>,
    /// PDF producer (`pdf:Producer`).
    pub producer: Option<String>,
    /// Keywords (`pdf:Keywords`).
    pub keywords: Option<String>,
}
69
70impl XmpMetadata {
71    /// Parse XMP metadata from raw XML bytes.
72    ///
73    /// Uses simple string scanning -- no XML parser dependency.
74    /// Extracts values from known XMP/Dublin Core tags.
75    pub fn from_xml(xml: &[u8]) -> Self {
76        let text = String::from_utf8_lossy(xml);
77
78        Self {
79            title: extract_rdf_li_value(&text, "dc:title"),
80            creator: extract_rdf_li_value(&text, "dc:creator"),
81            description: extract_rdf_li_value(&text, "dc:description"),
82            create_date: extract_simple_tag(&text, "xmp:CreateDate"),
83            modify_date: extract_simple_tag(&text, "xmp:ModifyDate"),
84            producer: extract_simple_tag(&text, "pdf:Producer"),
85            keywords: extract_simple_tag(&text, "pdf:Keywords"),
86        }
87    }
88}
89
90/// Extract a value from an `<rdf:li>` element nested inside a parent tag.
91///
92/// Handles patterns like:
93/// ```xml
94/// <dc:title>
95///   <rdf:Alt>
96///     <rdf:li xml:lang="x-default">The Title</rdf:li>
97///   </rdf:Alt>
98/// </dc:title>
99/// ```
100fn extract_rdf_li_value(text: &str, tag: &str) -> Option<String> {
101    let open = format!("<{tag}");
102    let close = format!("</{tag}>");
103
104    let start = text.find(&open)?;
105    let end = text[start..].find(&close)?;
106    let section = &text[start..start + end];
107
108    // Look for <rdf:li ...>VALUE</rdf:li> within the section
109    extract_inner_rdf_li(section)
110}
111
/// Extract the text content of the first `<rdf:li>` element in the given
/// text that actually carries text.
///
/// Self-closing items (`<rdf:li .../>`) and items whose content is empty or
/// whitespace-only are skipped, so a leading empty item does not hide a
/// later populated one and never swallows the markup of its siblings.
/// The returned value is trimmed of surrounding whitespace.
///
/// Returns `None` when there is no `<rdf:li>` item with text, or when an
/// opening `<rdf:li` is never terminated by `>` / `</rdf:li>`.
fn extract_inner_rdf_li(text: &str) -> Option<String> {
    const OPEN: &str = "<rdf:li";
    const CLOSE: &str = "</rdf:li>";

    for (pos, _) in text.match_indices(OPEN) {
        let after_name = &text[pos + OPEN.len()..];

        // Require the element name to end here so that a longer name sharing
        // the `rdf:li` prefix is not mistaken for a list item.
        if !after_name.starts_with(|c: char| c == '>' || c == '/' || c.is_whitespace()) {
            continue;
        }

        // Find the end of the opening tag.
        let tag_end = after_name.find('>')?;
        // `<rdf:li .../>` has no content; move on to the next item.
        if after_name[..tag_end].ends_with('/') {
            continue;
        }

        let body = &after_name[tag_end + 1..];
        let body_end = body.find(CLOSE)?;
        let value = body[..body_end].trim();
        if !value.is_empty() {
            return Some(value.to_string());
        }
    }

    None
}
127
/// Extract a value from a simple tag like `<xmp:CreateDate>VALUE</xmp:CreateDate>`.
///
/// Two serializations of a simple property are recognized, in this order:
///
/// 1. Element form, where the opening tag may carry attributes:
///    `<xmp:CreateDate rdf:datatype="...">VALUE</xmp:CreateDate>`.
/// 2. Attribute shorthand on an enclosing element:
///    `<rdf:Description xmp:CreateDate="VALUE" ...>` (single or double
///    quotes).
///
/// Element names that merely start with `tag`, self-closing elements, and
/// empty or whitespace-only values are skipped. The returned value is
/// trimmed of surrounding whitespace; XML entities are not decoded.
///
/// Returns `None` when `tag` is empty or no non-empty value is found.
fn extract_simple_tag(text: &str, tag: &str) -> Option<String> {
    if tag.is_empty() {
        return None;
    }

    let open = format!("<{tag}");
    let close = format!("</{tag}>");

    // Pass 1: element form.
    for (pos, _) in text.match_indices(open.as_str()) {
        let after_name = &text[pos + open.len()..];

        // The element name must end here, otherwise `<pdf:ProducerX>` would
        // be taken for `<pdf:Producer>`.
        if !after_name.starts_with(|c: char| c == '>' || c == '/' || c.is_whitespace()) {
            continue;
        }

        let tag_end = match after_name.find('>') {
            Some(idx) => idx,
            None => break,
        };
        // `<tag/>` carries no text content.
        if after_name[..tag_end].ends_with('/') {
            continue;
        }

        let body = &after_name[tag_end + 1..];
        let body_end = match body.find(close.as_str()) {
            Some(idx) => idx,
            None => break,
        };
        let value = body[..body_end].trim();
        if !value.is_empty() {
            return Some(value.to_string());
        }
    }

    // Pass 2: attribute shorthand, e.g. ` xmp:CreateDate="..."`.
    let attr = format!("{tag}=");
    for (pos, _) in text.match_indices(attr.as_str()) {
        // An attribute name is always preceded by whitespace inside a tag;
        // this also rejects longer names that merely end with `tag`.
        let preceded_by_space = text[..pos]
            .chars()
            .next_back()
            .map_or(false, char::is_whitespace);
        if !preceded_by_space {
            continue;
        }

        let rest = &text[pos + attr.len()..];
        let quote = match rest.chars().next() {
            Some(q @ ('"' | '\'')) => q,
            _ => continue,
        };
        // Both quote characters are one byte long in UTF-8.
        let inner = &rest[1..];
        if let Some(end) = inner.find(quote) {
            let value = inner[..end].trim();
            if !value.is_empty() {
                return Some(value.to_string());
            }
        }
    }

    None
}
144
#[cfg(test)]
mod tests {
    //! Unit tests for XMP property extraction and shared-form detection.

    use super::*;

    // ---- XmpMetadata tests ----

    /// A complete packet with every supported property populated.
    #[test]
    fn test_parse_basic_xmp() {
        let xml = br#"<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
           xmlns:dc="http://purl.org/dc/elements/1.1/"
           xmlns:xmp="http://ns.adobe.com/xap/1.0/"
           xmlns:pdf="http://ns.adobe.com/pdf/1.3/">
    <rdf:Description rdf:about="">
      <dc:title>
        <rdf:Alt>
          <rdf:li xml:lang="x-default">Test Document</rdf:li>
        </rdf:Alt>
      </dc:title>
      <dc:creator>
        <rdf:Seq>
          <rdf:li>John Doe</rdf:li>
        </rdf:Seq>
      </dc:creator>
      <dc:description>
        <rdf:Alt>
          <rdf:li xml:lang="x-default">A test PDF document</rdf:li>
        </rdf:Alt>
      </dc:description>
      <xmp:CreateDate>2024-01-15T10:30:00Z</xmp:CreateDate>
      <xmp:ModifyDate>2024-06-01T14:00:00Z</xmp:ModifyDate>
      <pdf:Producer>rpdfium 0.1</pdf:Producer>
      <pdf:Keywords>test, pdf, document</pdf:Keywords>
    </rdf:Description>
  </rdf:RDF>
</x:xmpmeta>
<?xpacket end="w"?>"#;

        let meta = XmpMetadata::from_xml(xml);
        assert_eq!(meta.title.as_deref(), Some("Test Document"));
        assert_eq!(meta.creator.as_deref(), Some("John Doe"));
        assert_eq!(meta.description.as_deref(), Some("A test PDF document"));
        assert_eq!(meta.create_date.as_deref(), Some("2024-01-15T10:30:00Z"));
        assert_eq!(meta.modify_date.as_deref(), Some("2024-06-01T14:00:00Z"));
        assert_eq!(meta.producer.as_deref(), Some("rpdfium 0.1"));
        assert_eq!(meta.keywords.as_deref(), Some("test, pdf, document"));
    }

    /// A packet wrapper with no properties yields no values.
    #[test]
    fn test_parse_empty_xmp() {
        let xml = b"<?xpacket begin=\"\" id=\"W5M0MpCehiHzreSzNTczkc9d\"?><?xpacket end=\"w\"?>";
        let meta = XmpMetadata::from_xml(xml);
        assert!(meta.title.is_none());
        assert!(meta.creator.is_none());
        assert!(meta.create_date.is_none());
    }

    /// Only the properties that are present are populated.
    #[test]
    fn test_parse_partial_xmp() {
        let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description>
  <pdf:Producer>Test Producer</pdf:Producer>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;

        let meta = XmpMetadata::from_xml(xml);
        assert!(meta.title.is_none());
        assert_eq!(meta.producer.as_deref(), Some("Test Producer"));
    }

    /// Whitespace surrounding a value is trimmed.
    #[test]
    fn test_parse_xmp_with_whitespace() {
        let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description>
  <xmp:CreateDate>
    2024-03-15
  </xmp:CreateDate>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;

        let meta = XmpMetadata::from_xml(xml);
        assert_eq!(meta.create_date.as_deref(), Some("2024-03-15"));
    }

    // ---- SharedFormType tests ----

    #[test]
    fn test_shared_form_none() {
        let xml = b"<x:xmpmeta><rdf:RDF><rdf:Description></rdf:Description></rdf:RDF></x:xmpmeta>";
        assert_eq!(check_for_shared_form(xml), SharedFormType::None);
    }

    #[test]
    fn test_shared_form_email() {
        let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
  <adhocwf:workflowType>0</adhocwf:workflowType>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;
        assert_eq!(check_for_shared_form(xml), SharedFormType::EmailBased);
    }

    #[test]
    fn test_shared_form_acrobat_com() {
        let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
  <adhocwf:workflowType>1</adhocwf:workflowType>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;
        assert_eq!(check_for_shared_form(xml), SharedFormType::AcrobatCom);
    }

    #[test]
    fn test_shared_form_filesystem() {
        let xml =
            br#"<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
  <adhocwf:workflowType>2</adhocwf:workflowType>
</rdf:Description>"#;
        assert_eq!(check_for_shared_form(xml), SharedFormType::FilesystemBased);
    }

    #[test]
    fn test_shared_form_namespace_no_type() {
        // Namespace present but no workflowType tag → defaults to email
        let xml =
            br#"<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
</rdf:Description>"#;
        assert_eq!(check_for_shared_form(xml), SharedFormType::EmailBased);
    }

    #[test]
    fn test_shared_form_alternate_namespace() {
        let xml = br#"<rdf:Description xmlns:AcrobatAdhocWorkflow="http://example.com/">
  <AcrobatAdhocWorkflow:workflowType>1</AcrobatAdhocWorkflow:workflowType>
</rdf:Description>"#;
        assert_eq!(check_for_shared_form(xml), SharedFormType::AcrobatCom);
    }

    // ---- Encoding robustness ----

    #[test]
    fn test_parse_invalid_utf8() {
        let xml = b"\xff\xfe<xmp:CreateDate>2024-01-01</xmp:CreateDate>";
        let meta = XmpMetadata::from_xml(xml);
        // Should handle gracefully (lossy conversion)
        assert_eq!(meta.create_date.as_deref(), Some("2024-01-01"));
    }
}