rpdfium-doc 7676.6.2

Document-level features for rpdfium
Documentation
//! XMP metadata extraction from PDF streams.
//!
//! Extracts Dublin Core and XMP metadata properties from the raw XML bytes
//! of a PDF's `/Metadata` stream. Uses simple string scanning to avoid
//! requiring an XML parser dependency.

/// Kind of shared-form workflow that a document's XMP metadata can advertise.
///
/// This is the Rust counterpart of the result produced by PDFium's
/// `CPDF_Metadata::CheckForSharedForm()`, which looks for the Acrobat ad-hoc
/// workflow namespace and reads the workflow type stored under it.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum SharedFormType {
    /// The metadata does not describe any shared form workflow.
    #[default]
    None,
    /// The form is shared by email (`workflowType` is `0`).
    EmailBased,
    /// The form is shared through Acrobat.com (`workflowType` is `1`).
    AcrobatCom,
    /// The form is shared through a filesystem location (`workflowType` is `2`).
    FilesystemBased,
}

/// Classify the shared form workflow, if any, advertised by XMP metadata.
///
/// The bytes are decoded as UTF-8 with lossy replacement of invalid
/// sequences. If neither `adhocwf` nor `AcrobatAdhocWorkflow` occurs anywhere
/// in the decoded text, the result is [`SharedFormType::None`].
///
/// Otherwise the `workflowType` element is looked up, trying the `adhocwf:`
/// prefix first and the `AcrobatAdhocWorkflow:` prefix second, and its value
/// is mapped as `0` → email, `1` → Acrobat.com, `2` → filesystem. A missing
/// element or any other value falls back to [`SharedFormType::EmailBased`].
pub fn check_for_shared_form(xml: &[u8]) -> SharedFormType {
    // Namespace markers, listed in the order their `workflowType` element is tried.
    const PREFIXES: [&str; 2] = ["adhocwf", "AcrobatAdhocWorkflow"];

    let decoded = String::from_utf8_lossy(xml);

    // No trace of the ad-hoc workflow namespace: nothing to classify.
    if !PREFIXES.iter().any(|&prefix| decoded.contains(prefix)) {
        return SharedFormType::None;
    }

    // The first prefix that yields a non-empty `workflowType` element wins.
    let kind = PREFIXES
        .iter()
        .find_map(|&prefix| extract_simple_tag(&decoded, &format!("{prefix}:workflowType")));

    match kind.as_deref() {
        Some("1") => SharedFormType::AcrobatCom,
        Some("2") => SharedFormType::FilesystemBased,
        // "0", an unrecognized value, or no element at all: email-based.
        _ => SharedFormType::EmailBased,
    }
}

/// Properties read from the XMP packet stored in a PDF metadata stream.
///
/// Every field is `None` when the corresponding property was not found.
#[derive(Clone, Debug, Default)]
pub struct XmpMetadata {
    /// Title of the document, from `dc:title`.
    pub title: Option<String>,
    /// Creator (author) of the document, from `dc:creator`.
    pub creator: Option<String>,
    /// Description of the document, from `dc:description`.
    pub description: Option<String>,
    /// Creation date as written in `xmp:CreateDate`.
    pub create_date: Option<String>,
    /// Modification date as written in `xmp:ModifyDate`.
    pub modify_date: Option<String>,
    /// Producing application, from `pdf:Producer`.
    pub producer: Option<String>,
    /// Keyword string, from `pdf:Keywords`.
    pub keywords: Option<String>,
}

impl XmpMetadata {
    /// Build an [`XmpMetadata`] from the raw bytes of an XMP packet.
    ///
    /// The bytes are decoded as UTF-8 with lossy replacement and then scanned
    /// as plain text for a fixed set of XMP / Dublin Core tags; no XML parser
    /// is involved. Properties that are not found are left as `None`.
    pub fn from_xml(xml: &[u8]) -> Self {
        let decoded = String::from_utf8_lossy(xml);
        let doc: &str = &decoded;

        // Dublin Core values are read from an `<rdf:li>` item inside the tag.
        let nested = |tag: &str| extract_rdf_li_value(doc, tag);
        // The remaining properties are read as plain `<tag>value</tag>` elements.
        let flat = |tag: &str| extract_simple_tag(doc, tag);

        let title = nested("dc:title");
        let creator = nested("dc:creator");
        let description = nested("dc:description");
        let create_date = flat("xmp:CreateDate");
        let modify_date = flat("xmp:ModifyDate");
        let producer = flat("pdf:Producer");
        let keywords = flat("pdf:Keywords");

        Self {
            title,
            creator,
            description,
            create_date,
            modify_date,
            producer,
            keywords,
        }
    }
}

/// Read the first `<rdf:li>` value found inside the element named `tag`.
///
/// The section searched runs from the first occurrence of `<tag` up to the
/// next `</tag>`. Returns `None` if either marker is missing or the section
/// yields no `<rdf:li>` value.
///
/// Typical input:
/// ```xml
/// <dc:title>
///   <rdf:Alt>
///     <rdf:li xml:lang="x-default">The Title</rdf:li>
///   </rdf:Alt>
/// </dc:title>
/// ```
fn extract_rdf_li_value(text: &str, tag: &str) -> Option<String> {
    // The opening marker has no trailing `>`, so an opening tag that carries
    // attributes is matched as well.
    let opening = format!("<{tag}");
    let closing = format!("</{tag}>");

    // Tail of the document beginning at the opening tag.
    let from_open = text.find(&opening).map(|at| &text[at..])?;
    // Cut that tail off just before the closing tag.
    let section = from_open.find(&closing).map(|len| &from_open[..len])?;

    extract_inner_rdf_li(section)
}

/// Extract the text content of the first `<rdf:li>` element in the given text.
///
/// Returns `None` when `text` contains no `<rdf:li` opening tag, when that
/// opening tag or its `</rdf:li>` closing tag is incomplete, when the first
/// item is self-closing (`<rdf:li/>`), or when its content is empty after
/// trimming surrounding whitespace. Otherwise returns the trimmed content.
///
/// Only the first item is considered; later items are never consulted, even
/// when the first one is empty.
fn extract_inner_rdf_li(text: &str) -> Option<String> {
    // Everything following the element name of the first item.
    let (_, after_name) = text.split_once("<rdf:li")?;
    // `attrs` is whatever sits between the element name and the end of the opening tag.
    let (attrs, body) = after_name.split_once('>')?;

    // A self-closing item has no content. Without this guard the scan below
    // would run on to the closing tag of a *later* item and return markup
    // such as `<rdf:li>Next` as if it were the value.
    if attrs.ends_with('/') {
        return None;
    }

    let (content, _) = body.split_once("</rdf:li>")?;
    let value = content.trim();
    if value.is_empty() {
        None
    } else {
        Some(value.to_string())
    }
}

/// Read the text of a plain element such as `<xmp:CreateDate>VALUE</xmp:CreateDate>`.
///
/// The opening tag must appear exactly as `<tag>` (no attributes). The value
/// is the text between the first `<tag>` and the next `</tag>`, with
/// surrounding whitespace trimmed. Returns `None` if either tag is missing or
/// the trimmed value is empty.
fn extract_simple_tag(text: &str, tag: &str) -> Option<String> {
    let opening = format!("<{tag}>");
    let closing = format!("</{tag}>");

    // Drop everything up to and including the first opening tag...
    let (_, after_open) = text.split_once(opening.as_str())?;
    // ...then keep only what precedes the next closing tag.
    let (inner, _) = after_open.split_once(closing.as_str())?;

    match inner.trim() {
        "" => None,
        value => Some(value.to_string()),
    }
}

// Unit tests for XMP property extraction and shared form workflow detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// A complete packet carrying all seven supported properties is parsed,
    /// including Dublin Core values nested in `rdf:Alt` / `rdf:Seq` containers.
    #[test]
    fn test_parse_basic_xmp() {
        let xml = br#"<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
           xmlns:dc="http://purl.org/dc/elements/1.1/"
           xmlns:xmp="http://ns.adobe.com/xap/1.0/"
           xmlns:pdf="http://ns.adobe.com/pdf/1.3/">
    <rdf:Description rdf:about="">
      <dc:title>
        <rdf:Alt>
          <rdf:li xml:lang="x-default">Test Document</rdf:li>
        </rdf:Alt>
      </dc:title>
      <dc:creator>
        <rdf:Seq>
          <rdf:li>John Doe</rdf:li>
        </rdf:Seq>
      </dc:creator>
      <dc:description>
        <rdf:Alt>
          <rdf:li xml:lang="x-default">A test PDF document</rdf:li>
        </rdf:Alt>
      </dc:description>
      <xmp:CreateDate>2024-01-15T10:30:00Z</xmp:CreateDate>
      <xmp:ModifyDate>2024-06-01T14:00:00Z</xmp:ModifyDate>
      <pdf:Producer>rpdfium 0.1</pdf:Producer>
      <pdf:Keywords>test, pdf, document</pdf:Keywords>
    </rdf:Description>
  </rdf:RDF>
</x:xmpmeta>
<?xpacket end="w"?>"#;

        let meta = XmpMetadata::from_xml(xml);
        assert_eq!(meta.title.as_deref(), Some("Test Document"));
        assert_eq!(meta.creator.as_deref(), Some("John Doe"));
        assert_eq!(meta.description.as_deref(), Some("A test PDF document"));
        assert_eq!(meta.create_date.as_deref(), Some("2024-01-15T10:30:00Z"));
        assert_eq!(meta.modify_date.as_deref(), Some("2024-06-01T14:00:00Z"));
        assert_eq!(meta.producer.as_deref(), Some("rpdfium 0.1"));
        assert_eq!(meta.keywords.as_deref(), Some("test, pdf, document"));
    }

    /// A packet that contains only the xpacket wrapper yields no properties.
    #[test]
    fn test_parse_empty_xmp() {
        let xml = b"<?xpacket begin=\"\" id=\"W5M0MpCehiHzreSzNTczkc9d\"?><?xpacket end=\"w\"?>";
        let meta = XmpMetadata::from_xml(xml);
        assert!(meta.title.is_none());
        assert!(meta.creator.is_none());
        assert!(meta.create_date.is_none());
    }

    /// Properties that are present are extracted even when others are absent.
    #[test]
    fn test_parse_partial_xmp() {
        let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description>
  <pdf:Producer>Test Producer</pdf:Producer>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;

        let meta = XmpMetadata::from_xml(xml);
        assert!(meta.title.is_none());
        assert_eq!(meta.producer.as_deref(), Some("Test Producer"));
    }

    /// Whitespace and newlines surrounding a simple tag's value are trimmed.
    #[test]
    fn test_parse_xmp_with_whitespace() {
        let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description>
  <xmp:CreateDate>
    2024-03-15
  </xmp:CreateDate>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;

        let meta = XmpMetadata::from_xml(xml);
        assert_eq!(meta.create_date.as_deref(), Some("2024-03-15"));
    }

    // ---- SharedFormType tests ----

    /// Metadata with no ad-hoc workflow marker reports no shared form.
    #[test]
    fn test_shared_form_none() {
        let xml = b"<x:xmpmeta><rdf:RDF><rdf:Description></rdf:Description></rdf:RDF></x:xmpmeta>";
        assert_eq!(check_for_shared_form(xml), SharedFormType::None);
    }

    /// `workflowType` of `0` maps to the email-based workflow.
    #[test]
    fn test_shared_form_email() {
        let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
  <adhocwf:workflowType>0</adhocwf:workflowType>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;
        assert_eq!(check_for_shared_form(xml), SharedFormType::EmailBased);
    }

    /// `workflowType` of `1` maps to the Acrobat.com workflow.
    #[test]
    fn test_shared_form_acrobat_com() {
        let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
  <adhocwf:workflowType>1</adhocwf:workflowType>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;
        assert_eq!(check_for_shared_form(xml), SharedFormType::AcrobatCom);
    }

    /// `workflowType` of `2` maps to the filesystem-based workflow, also when
    /// the description element is not wrapped in `x:xmpmeta` / `rdf:RDF`.
    #[test]
    fn test_shared_form_filesystem() {
        let xml =
            br#"<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
  <adhocwf:workflowType>2</adhocwf:workflowType>
</rdf:Description>"#;
        assert_eq!(check_for_shared_form(xml), SharedFormType::FilesystemBased);
    }

    /// The namespace declaration alone is enough to report a shared form.
    #[test]
    fn test_shared_form_namespace_no_type() {
        // Namespace present but no workflowType tag → defaults to email
        let xml =
            br#"<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
</rdf:Description>"#;
        assert_eq!(check_for_shared_form(xml), SharedFormType::EmailBased);
    }

    /// The `AcrobatAdhocWorkflow:` element prefix is accepted in place of `adhocwf:`.
    #[test]
    fn test_shared_form_alternate_namespace() {
        let xml = br#"<rdf:Description xmlns:AcrobatAdhocWorkflow="http://example.com/">
  <AcrobatAdhocWorkflow:workflowType>1</AcrobatAdhocWorkflow:workflowType>
</rdf:Description>"#;
        assert_eq!(check_for_shared_form(xml), SharedFormType::AcrobatCom);
    }

    /// Leading bytes that are not valid UTF-8 do not prevent extraction of
    /// the well-formed tag that follows them.
    #[test]
    fn test_parse_invalid_utf8() {
        let xml = b"\xff\xfe<xmp:CreateDate>2024-01-01</xmp:CreateDate>";
        let meta = XmpMetadata::from_xml(xml);
        // Should handle gracefully (lossy conversion)
        assert_eq!(meta.create_date.as_deref(), Some("2024-01-01"));
    }
}