Skip to main content

rpdfium_doc/
metadata.rs

1//! Document metadata from the PDF `/Info` dictionary (ISO 32000-2 section 14.3.3).
2
3use std::collections::HashMap;
4
5use rpdfium_core::{Name, PdfSource};
6use rpdfium_parser::{Object, ObjectStore};
7
8use crate::error::{DocError, DocResult};
9
/// Document metadata extracted from the `/Info` dictionary.
///
/// Every entry is optional. A field is `None` when no string value was
/// obtained for the corresponding `/Info` key. Dates are kept as the raw PDF
/// date strings (for example `D:20240101120000`); they are not parsed here.
///
/// The type derives `PartialEq`/`Eq` so two metadata values can be compared
/// directly, e.g. in tests or change detection.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DocumentMetadata {
    /// The document's title.
    pub title: Option<String>,
    /// The name of the person who created the document.
    pub author: Option<String>,
    /// The subject of the document.
    pub subject: Option<String>,
    /// Keywords associated with the document.
    pub keywords: Option<String>,
    /// The name of the application that created the original document.
    pub creator: Option<String>,
    /// The name of the application that produced the PDF.
    pub producer: Option<String>,
    /// The date the document was created (as a PDF date string).
    pub creation_date: Option<String>,
    /// The date the document was last modified (as a PDF date string).
    pub mod_date: Option<String>,
}
30
31/// Parse document metadata from an `/Info` dictionary object.
32pub fn parse_metadata<S: PdfSource>(
33    info_obj: &Object,
34    store: &ObjectStore<S>,
35) -> DocResult<DocumentMetadata> {
36    let resolved = store
37        .deep_resolve(info_obj)
38        .map_err(|e| DocError::Parser(e.to_string()))?;
39    let dict = resolved.as_dict().ok_or(DocError::UnexpectedType)?;
40
41    Ok(DocumentMetadata {
42        title: extract_string_field(dict, &Name::title(), store),
43        author: extract_string_field(dict, &Name::author(), store),
44        subject: extract_string_field(dict, &Name::subject(), store),
45        keywords: extract_string_field(dict, &Name::keywords(), store),
46        creator: extract_string_field(dict, &Name::creator(), store),
47        producer: extract_string_field(dict, &Name::producer(), store),
48        creation_date: extract_string_field(dict, &Name::creation_date(), store),
49        mod_date: extract_string_field(dict, &Name::mod_date(), store),
50    })
51}
52
53/// Extract a string value from a dictionary, resolving references.
54/// Returns None if the key is missing or the value is not a string.
55fn extract_string_field<S: PdfSource>(
56    dict: &HashMap<Name, Object>,
57    key: &Name,
58    store: &ObjectStore<S>,
59) -> Option<String> {
60    let obj = dict.get(key)?;
61    let resolved = store.deep_resolve(obj).ok()?;
62    resolved.as_string().map(|s| s.to_string_lossy())
63}
64
65#[cfg(test)]
66mod tests {
67    use super::*;
68    use rpdfium_core::PdfString;
69
70    fn build_store() -> ObjectStore<Vec<u8>> {
71        let pdf = build_minimal_pdf();
72        ObjectStore::open(pdf, rpdfium_core::ParsingMode::Lenient).unwrap()
73    }
74
75    fn build_minimal_pdf() -> Vec<u8> {
76        let mut pdf = Vec::new();
77        pdf.extend_from_slice(b"%PDF-1.4\n");
78        let obj1_offset = pdf.len();
79        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
80        let obj2_offset = pdf.len();
81        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");
82        let xref_offset = pdf.len();
83        pdf.extend_from_slice(b"xref\n0 3\n");
84        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
85        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj1_offset).as_bytes());
86        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj2_offset).as_bytes());
87        pdf.extend_from_slice(b"trailer\n<< /Size 3 /Root 1 0 R >>\n");
88        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_offset).as_bytes());
89        pdf
90    }
91
92    fn str_obj(s: &str) -> Object {
93        Object::String(PdfString::from_bytes(s.as_bytes().to_vec()))
94    }
95
96    #[test]
97    fn test_full_metadata() {
98        let store = build_store();
99        let mut dict = HashMap::new();
100        dict.insert(Name::title(), str_obj("My Document"));
101        dict.insert(Name::author(), str_obj("John Doe"));
102        dict.insert(Name::subject(), str_obj("Testing"));
103        dict.insert(Name::keywords(), str_obj("pdf rust"));
104        dict.insert(Name::creator(), str_obj("TestApp"));
105        dict.insert(Name::producer(), str_obj("rpdfium"));
106        dict.insert(Name::creation_date(), str_obj("D:20240101120000"));
107        dict.insert(Name::mod_date(), str_obj("D:20240615090000"));
108        let obj = Object::Dictionary(dict);
109        let meta = parse_metadata(&obj, &store).unwrap();
110        assert_eq!(meta.title.as_deref(), Some("My Document"));
111        assert_eq!(meta.author.as_deref(), Some("John Doe"));
112        assert_eq!(meta.subject.as_deref(), Some("Testing"));
113        assert_eq!(meta.keywords.as_deref(), Some("pdf rust"));
114        assert_eq!(meta.creator.as_deref(), Some("TestApp"));
115        assert_eq!(meta.producer.as_deref(), Some("rpdfium"));
116        assert_eq!(meta.creation_date.as_deref(), Some("D:20240101120000"));
117        assert_eq!(meta.mod_date.as_deref(), Some("D:20240615090000"));
118    }
119
120    #[test]
121    fn test_partial_metadata() {
122        let store = build_store();
123        let mut dict = HashMap::new();
124        dict.insert(Name::title(), str_obj("Partial"));
125        dict.insert(Name::producer(), str_obj("rpdfium"));
126        let obj = Object::Dictionary(dict);
127        let meta = parse_metadata(&obj, &store).unwrap();
128        assert_eq!(meta.title.as_deref(), Some("Partial"));
129        assert!(meta.author.is_none());
130        assert!(meta.subject.is_none());
131        assert!(meta.keywords.is_none());
132        assert!(meta.creator.is_none());
133        assert_eq!(meta.producer.as_deref(), Some("rpdfium"));
134        assert!(meta.creation_date.is_none());
135        assert!(meta.mod_date.is_none());
136    }
137
138    #[test]
139    fn test_empty_info_dict() {
140        let store = build_store();
141        let obj = Object::Dictionary(HashMap::new());
142        let meta = parse_metadata(&obj, &store).unwrap();
143        assert!(meta.title.is_none());
144        assert!(meta.author.is_none());
145        assert!(meta.subject.is_none());
146        assert!(meta.keywords.is_none());
147        assert!(meta.creator.is_none());
148        assert!(meta.producer.is_none());
149        assert!(meta.creation_date.is_none());
150        assert!(meta.mod_date.is_none());
151    }
152
153    #[test]
154    fn test_non_string_values_ignored() {
155        let store = build_store();
156        let mut dict = HashMap::new();
157        // These non-string values should be silently ignored
158        dict.insert(Name::title(), Object::Integer(42));
159        dict.insert(Name::author(), Object::Boolean(true));
160        dict.insert(Name::subject(), str_obj("Valid Subject"));
161        let obj = Object::Dictionary(dict);
162        let meta = parse_metadata(&obj, &store).unwrap();
163        assert!(meta.title.is_none());
164        assert!(meta.author.is_none());
165        assert_eq!(meta.subject.as_deref(), Some("Valid Subject"));
166    }
167
168    #[test]
169    fn test_metadata_default() {
170        let meta = DocumentMetadata::default();
171        assert!(meta.title.is_none());
172        assert!(meta.author.is_none());
173    }
174
175    // -----------------------------------------------------------------------
176    // Upstream: cpdf_metadata_unittest.cpp — CheckSharedForm tests
177    //
178    // CheckForSharedForm detects AcrobatAdhocWorkflow in XMP metadata.
179    // This feature is not yet implemented in rpdfium.
180    // -----------------------------------------------------------------------
181
182    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormEmailAtTopLevel)
183    #[test]
184    #[ignore = "CheckForSharedForm not yet implemented"]
185    fn test_cpdf_metadata_check_shared_form_email_at_top_level() {
186        // Should detect workflowType=0 as SharedFormEmail
187        // XMP: <adhocwf:workflowType>0</adhocwf:workflowType>
188        todo!()
189    }
190
191    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormAcrobatAtTopLevel)
192    #[test]
193    #[ignore = "CheckForSharedForm not yet implemented"]
194    fn test_cpdf_metadata_check_shared_form_acrobat_at_top_level() {
195        // Should detect workflowType=1 as SharedFormAcrobat
196        // XMP: <adhocwf:workflowType>1</adhocwf:workflowType>
197        todo!()
198    }
199
200    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormFilesystemAtTopLevel)
201    #[test]
202    #[ignore = "CheckForSharedForm not yet implemented"]
203    fn test_cpdf_metadata_check_shared_form_filesystem_at_top_level() {
204        // Should detect workflowType=2 as SharedFormFilesystem
205        // XMP: <adhocwf:workflowType>2</adhocwf:workflowType>
206        todo!()
207    }
208
209    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormWithoutWorkflow)
210    #[test]
211    #[ignore = "CheckForSharedForm not yet implemented"]
212    fn test_cpdf_metadata_check_shared_form_without_workflow() {
213        // Should return empty when workflowType is absent
214        // XMP has adhocwf namespace but only <adhocwf:state> and <adhocwf:version>
215        todo!()
216    }
217
218    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormAsChild)
219    #[test]
220    #[ignore = "CheckForSharedForm not yet implemented"]
221    fn test_cpdf_metadata_check_shared_form_as_child() {
222        // Should detect shared form even when the adhocwf element is nested
223        // inside <grandparent><parent>...<node xmlns:adhocwf=...>
224        todo!()
225    }
226
227    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormAsNoAdhoc)
228    #[test]
229    #[ignore = "CheckForSharedForm not yet implemented"]
230    fn test_cpdf_metadata_check_shared_form_as_no_adhoc() {
231        // Should return empty when adhocwf namespace is absent entirely
232        // XMP: <node></node>
233        todo!()
234    }
235
236    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormExceedMaxDepth)
237    #[test]
238    #[ignore = "CheckForSharedForm not yet implemented"]
239    fn test_cpdf_metadata_check_shared_form_exceed_max_depth() {
240        // Should return empty when XML exceeds max nesting depth (130 levels)
241        // even though adhocwf data exists at that depth
242        todo!()
243    }
244
245    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormWrongNamespace)
246    #[test]
247    #[ignore = "CheckForSharedForm not yet implemented"]
248    fn test_cpdf_metadata_check_shared_form_wrong_namespace() {
249        // Should return empty when namespace version is wrong (2.0 instead of 1.0)
250        // XMP: xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/2.0/"
251        todo!()
252    }
253
254    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormMultipleErrors)
255    #[test]
256    #[ignore = "CheckForSharedForm not yet implemented"]
257    fn test_cpdf_metadata_check_shared_form_multiple_errors() {
258        // Should detect multiple shared form types in one metadata stream:
259        // workflowType=0 (Email), workflowType=2 (Filesystem), workflowType=1 (Acrobat)
260        todo!()
261    }
262}