rpdfium-doc 7676.6.4

Document-level features for rpdfium
Documentation
//! Document metadata from the PDF `/Info` dictionary (ISO 32000-2 section 14.3.3).

use std::collections::HashMap;

use rpdfium_core::{Name, PdfSource};
use rpdfium_parser::{Object, ObjectStore};

use crate::error::{DocError, DocResult};

/// Document metadata extracted from the `/Info` dictionary.
///
/// Every entry is optional: a field is `None` when the corresponding key was
/// not read as a string. Values compare field by field, so two metadata
/// records can be checked for equality directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DocumentMetadata {
    /// The document's title.
    pub title: Option<String>,
    /// The name of the person who created the document.
    pub author: Option<String>,
    /// The subject of the document.
    pub subject: Option<String>,
    /// Keywords associated with the document.
    pub keywords: Option<String>,
    /// The name of the application that created the original document.
    pub creator: Option<String>,
    /// The name of the application that produced the PDF.
    pub producer: Option<String>,
    /// The date the document was created (as a PDF date string, not parsed).
    pub creation_date: Option<String>,
    /// The date the document was last modified (as a PDF date string, not parsed).
    pub mod_date: Option<String>,
}

/// Parse document metadata from an `/Info` dictionary object.
pub fn parse_metadata<S: PdfSource>(
    info_obj: &Object,
    store: &ObjectStore<S>,
) -> DocResult<DocumentMetadata> {
    let resolved = store
        .deep_resolve(info_obj)
        .map_err(|e| DocError::Parser(e.to_string()))?;
    let dict = resolved.as_dict().ok_or(DocError::UnexpectedType)?;

    Ok(DocumentMetadata {
        title: extract_string_field(dict, &Name::title(), store),
        author: extract_string_field(dict, &Name::author(), store),
        subject: extract_string_field(dict, &Name::subject(), store),
        keywords: extract_string_field(dict, &Name::keywords(), store),
        creator: extract_string_field(dict, &Name::creator(), store),
        producer: extract_string_field(dict, &Name::producer(), store),
        creation_date: extract_string_field(dict, &Name::creation_date(), store),
        mod_date: extract_string_field(dict, &Name::mod_date(), store),
    })
}

/// Look up `key` in `dict` and return its value as an owned string.
///
/// The value is resolved through `store` before being inspected. The result
/// is `None` when the key is absent, when resolution fails, or when the
/// resolved value is not a string; none of these cases is reported as an
/// error.
fn extract_string_field<S: PdfSource>(
    dict: &HashMap<Name, Object>,
    key: &Name,
    store: &ObjectStore<S>,
) -> Option<String> {
    dict.get(key)
        .and_then(|raw| store.deep_resolve(raw).ok())
        .and_then(|value| value.as_string().map(|text| text.to_string_lossy()))
}

#[cfg(test)]
mod tests {
    use super::*;
    use rpdfium_core::PdfString;

    /// Open an object store over the minimal in-memory PDF, in lenient mode.
    fn build_store() -> ObjectStore<Vec<u8>> {
        ObjectStore::open(build_minimal_pdf(), rpdfium_core::ParsingMode::Lenient).unwrap()
    }

    /// Assemble a tiny PDF: a catalog, an empty page tree, a classic xref
    /// table and a trailer. Object offsets are recorded while writing so the
    /// xref table points at the right bytes.
    fn build_minimal_pdf() -> Vec<u8> {
        let mut bytes = b"%PDF-1.4\n".to_vec();

        let catalog_at = bytes.len();
        bytes.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        let pages_at = bytes.len();
        bytes.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");

        let xref_at = bytes.len();
        bytes.extend_from_slice(b"xref\n0 3\n");
        bytes.extend_from_slice(b"0000000000 65535 f \r\n");
        for offset in [catalog_at, pages_at] {
            bytes.extend_from_slice(format!("{:010} 00000 n \r\n", offset).as_bytes());
        }
        bytes.extend_from_slice(b"trailer\n<< /Size 3 /Root 1 0 R >>\n");
        bytes.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_at).as_bytes());
        bytes
    }

    /// Wrap the UTF-8 bytes of `s` in a PDF string object.
    fn str_obj(s: &str) -> Object {
        let raw = s.as_bytes().to_vec();
        Object::String(PdfString::from_bytes(raw))
    }

    /// All eight entries present: each one must come back verbatim.
    #[test]
    fn test_full_metadata() {
        let store = build_store();
        let entries = vec![
            (Name::title(), str_obj("My Document")),
            (Name::author(), str_obj("John Doe")),
            (Name::subject(), str_obj("Testing")),
            (Name::keywords(), str_obj("pdf rust")),
            (Name::creator(), str_obj("TestApp")),
            (Name::producer(), str_obj("rpdfium")),
            (Name::creation_date(), str_obj("D:20240101120000")),
            (Name::mod_date(), str_obj("D:20240615090000")),
        ];
        let info = Object::Dictionary(entries.into_iter().collect());

        let meta = parse_metadata(&info, &store).unwrap();

        assert_eq!(meta.title.as_deref(), Some("My Document"));
        assert_eq!(meta.author.as_deref(), Some("John Doe"));
        assert_eq!(meta.subject.as_deref(), Some("Testing"));
        assert_eq!(meta.keywords.as_deref(), Some("pdf rust"));
        assert_eq!(meta.creator.as_deref(), Some("TestApp"));
        assert_eq!(meta.producer.as_deref(), Some("rpdfium"));
        assert_eq!(meta.creation_date.as_deref(), Some("D:20240101120000"));
        assert_eq!(meta.mod_date.as_deref(), Some("D:20240615090000"));
    }

    /// Only title and producer present: the other six fields stay `None`.
    #[test]
    fn test_partial_metadata() {
        let store = build_store();
        let entries = vec![
            (Name::title(), str_obj("Partial")),
            (Name::producer(), str_obj("rpdfium")),
        ];
        let info = Object::Dictionary(entries.into_iter().collect());

        let meta = parse_metadata(&info, &store).unwrap();

        assert_eq!(meta.title.as_deref(), Some("Partial"));
        assert_eq!(meta.producer.as_deref(), Some("rpdfium"));
        for absent in [
            &meta.author,
            &meta.subject,
            &meta.keywords,
            &meta.creator,
            &meta.creation_date,
            &meta.mod_date,
        ] {
            assert!(absent.is_none());
        }
    }

    /// An empty dictionary is valid input and produces no fields at all.
    #[test]
    fn test_empty_info_dict() {
        let store = build_store();
        let info = Object::Dictionary(HashMap::new());

        let meta = parse_metadata(&info, &store).unwrap();

        for field in [
            &meta.title,
            &meta.author,
            &meta.subject,
            &meta.keywords,
            &meta.creator,
            &meta.producer,
            &meta.creation_date,
            &meta.mod_date,
        ] {
            assert!(field.is_none());
        }
    }

    /// Values of the wrong type are dropped without failing the whole parse.
    #[test]
    fn test_non_string_values_ignored() {
        let store = build_store();
        // An integer and a boolean sit where strings are expected; only the
        // subject holds a real string.
        let entries = vec![
            (Name::title(), Object::Integer(42)),
            (Name::author(), Object::Boolean(true)),
            (Name::subject(), str_obj("Valid Subject")),
        ];
        let info = Object::Dictionary(entries.into_iter().collect());

        let meta = parse_metadata(&info, &store).unwrap();

        assert_eq!(meta.title, None);
        assert_eq!(meta.author, None);
        assert_eq!(meta.subject.as_deref(), Some("Valid Subject"));
    }

    /// The `Default` value starts with unset fields.
    #[test]
    fn test_metadata_default() {
        let DocumentMetadata { title, author, .. } = DocumentMetadata::default();
        assert!(title.is_none());
        assert!(author.is_none());
    }

    // -----------------------------------------------------------------------
    // Placeholders mirroring upstream cpdf_metadata_unittest.cpp
    // (the CheckSharedForm tests).
    //
    // Upstream's CheckForSharedForm looks for AcrobatAdhocWorkflow data in
    // XMP metadata. rpdfium has no equivalent yet, so each test below is
    // ignored and its body is a `todo!()` stub describing the expectation.
    // -----------------------------------------------------------------------

    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormEmailAtTopLevel)
    #[test]
    #[ignore = "CheckForSharedForm not yet implemented"]
    fn test_cpdf_metadata_check_shared_form_email_at_top_level() {
        // Expectation: workflowType=0 is reported as SharedFormEmail.
        // XMP: <adhocwf:workflowType>0</adhocwf:workflowType>
        todo!()
    }

    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormAcrobatAtTopLevel)
    #[test]
    #[ignore = "CheckForSharedForm not yet implemented"]
    fn test_cpdf_metadata_check_shared_form_acrobat_at_top_level() {
        // Expectation: workflowType=1 is reported as SharedFormAcrobat.
        // XMP: <adhocwf:workflowType>1</adhocwf:workflowType>
        todo!()
    }

    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormFilesystemAtTopLevel)
    #[test]
    #[ignore = "CheckForSharedForm not yet implemented"]
    fn test_cpdf_metadata_check_shared_form_filesystem_at_top_level() {
        // Expectation: workflowType=2 is reported as SharedFormFilesystem.
        // XMP: <adhocwf:workflowType>2</adhocwf:workflowType>
        todo!()
    }

    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormWithoutWorkflow)
    #[test]
    #[ignore = "CheckForSharedForm not yet implemented"]
    fn test_cpdf_metadata_check_shared_form_without_workflow() {
        // Expectation: nothing is reported when workflowType is absent.
        // The XMP declares the adhocwf namespace but carries only
        // <adhocwf:state> and <adhocwf:version>.
        todo!()
    }

    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormAsChild)
    #[test]
    #[ignore = "CheckForSharedForm not yet implemented"]
    fn test_cpdf_metadata_check_shared_form_as_child() {
        // Expectation: the shared form is still found when the adhocwf
        // element is nested inside <grandparent><parent>...<node xmlns:adhocwf=...>
        todo!()
    }

    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormAsNoAdhoc)
    #[test]
    #[ignore = "CheckForSharedForm not yet implemented"]
    fn test_cpdf_metadata_check_shared_form_as_no_adhoc() {
        // Expectation: nothing is reported when the adhocwf namespace is
        // missing entirely.
        // XMP: <node></node>
        todo!()
    }

    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormExceedMaxDepth)
    #[test]
    #[ignore = "CheckForSharedForm not yet implemented"]
    fn test_cpdf_metadata_check_shared_form_exceed_max_depth() {
        // Expectation: nothing is reported when the XML nests deeper than the
        // maximum depth (130 levels), even though adhocwf data exists there.
        todo!()
    }

    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormWrongNamespace)
    #[test]
    #[ignore = "CheckForSharedForm not yet implemented"]
    fn test_cpdf_metadata_check_shared_form_wrong_namespace() {
        // Expectation: nothing is reported when the namespace version is
        // wrong (2.0 instead of 1.0).
        // XMP: xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/2.0/"
        todo!()
    }

    /// Upstream: TEST(CPDFMetadataTest, CheckSharedFormMultipleErrors)
    #[test]
    #[ignore = "CheckForSharedForm not yet implemented"]
    fn test_cpdf_metadata_check_shared_form_multiple_errors() {
        // Expectation: several shared form types in one metadata stream are
        // all reported: workflowType=0 (Email), workflowType=2 (Filesystem),
        // workflowType=1 (Acrobat).
        todo!()
    }
}