/// Classification of the ad-hoc workflow ("shared form") marker found in a
/// document's XMP metadata, as reported by [`check_for_shared_form`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum SharedFormType {
    /// No ad-hoc workflow marker is present. This is the `Default` value.
    #[default]
    None,
    /// `workflowType` is `0`; also reported when a workflow marker is present
    /// but the type is missing or has an unrecognised value.
    EmailBased,
    /// `workflowType` is `1`.
    AcrobatCom,
    /// `workflowType` is `2`.
    FilesystemBased,
}
/// Inspects raw XMP bytes for an Acrobat ad-hoc workflow marker.
///
/// The bytes are decoded lossily as UTF-8. If neither `adhocwf` nor
/// `AcrobatAdhocWorkflow` occurs anywhere in the text the result is
/// [`SharedFormType::None`]. Otherwise the `workflowType` element is looked up
/// under the `adhocwf:` prefix first and the `AcrobatAdhocWorkflow:` prefix
/// second, and its trimmed text selects the variant: `"1"` is
/// [`SharedFormType::AcrobatCom`], `"2"` is [`SharedFormType::FilesystemBased`],
/// and everything else (including `"0"`, an unrecognised value, or no element
/// at all) is [`SharedFormType::EmailBased`].
pub fn check_for_shared_form(xml: &[u8]) -> SharedFormType {
    let text = String::from_utf8_lossy(xml);

    let mentions_workflow = ["adhocwf", "AcrobatAdhocWorkflow"]
        .iter()
        .any(|&marker| text.contains(marker));
    if !mentions_workflow {
        return SharedFormType::None;
    }

    // Prefixes are tried in order; the first one that yields a value wins.
    let declared = ["adhocwf:workflowType", "AcrobatAdhocWorkflow:workflowType"]
        .iter()
        .find_map(|&tag| extract_simple_tag(&text, tag));

    match declared.as_deref() {
        Some("1") => SharedFormType::AcrobatCom,
        Some("2") => SharedFormType::FilesystemBased,
        // "0", any other value, and a missing element all mean e-mail based.
        _ => SharedFormType::EmailBased,
    }
}
/// Selected XMP properties pulled out of a metadata packet.
///
/// Every field is `None` when the corresponding element is absent or its text
/// is empty after trimming. Values are populated by [`XmpMetadata::from_xml`].
#[derive(Debug, Clone, Default)]
pub struct XmpMetadata {
    /// First `rdf:li` item found inside `dc:title`.
    pub title: Option<String>,
    /// First `rdf:li` item found inside `dc:creator`.
    pub creator: Option<String>,
    /// First `rdf:li` item found inside `dc:description`.
    pub description: Option<String>,
    /// Text of `xmp:CreateDate`, kept as the string found in the packet.
    pub create_date: Option<String>,
    /// Text of `xmp:ModifyDate`, kept as the string found in the packet.
    pub modify_date: Option<String>,
    /// Text of `pdf:Producer`.
    pub producer: Option<String>,
    /// Text of `pdf:Keywords`.
    pub keywords: Option<String>,
}
impl XmpMetadata {
    /// Builds an [`XmpMetadata`] from the raw bytes of an XMP packet.
    ///
    /// The bytes are decoded lossily as UTF-8, so invalid sequences never
    /// cause a failure. `dc:title`, `dc:creator` and `dc:description` are read
    /// from the first `rdf:li` item inside the element; the `xmp:` and `pdf:`
    /// properties are read as plain element text. Anything that cannot be
    /// found is left as `None`.
    pub fn from_xml(xml: &[u8]) -> Self {
        let text = String::from_utf8_lossy(xml);

        // Two lookup modes: array-valued properties and plain text properties.
        let first_item = |tag: &str| extract_rdf_li_value(&text, tag);
        let plain = |tag: &str| extract_simple_tag(&text, tag);

        Self {
            title: first_item("dc:title"),
            creator: first_item("dc:creator"),
            description: first_item("dc:description"),
            create_date: plain("xmp:CreateDate"),
            modify_date: plain("xmp:ModifyDate"),
            producer: plain("pdf:Producer"),
            keywords: plain("pdf:Keywords"),
        }
    }
}
/// Returns the text of the first `<rdf:li>` item inside the first element
/// named exactly `tag`.
///
/// The element is located by plain text scanning (no namespace resolution):
/// the first opening tag named `tag` starts the section, and the next
/// `</tag>` ends it. Returns `None` when the element, its closing tag, or a
/// non-empty `<rdf:li>` item cannot be found.
fn extract_rdf_li_value(text: &str, tag: &str) -> Option<String> {
    let close = format!("</{tag}>");
    let start = find_tag_start(text, tag)?;
    let end = text[start..].find(&close)?;
    extract_inner_rdf_li(&text[start..start + end])
}

/// Returns the trimmed, entity-decoded text of the first `<rdf:li>` in `text`.
///
/// Returns `None` when there is no `<rdf:li>`, when the first one is
/// self-closing or unterminated, or when its text is empty after trimming.
fn extract_inner_rdf_li(text: &str) -> Option<String> {
    let li_start = find_tag_start(text, "rdf:li")?;
    let rest = &text[li_start..];
    let tag_end = rest.find('>')?;
    // `<rdf:li .../>` carries no text. Without this check the scan would run
    // on to a later item's `</rdf:li>` and return markup as the value.
    if rest[..tag_end].ends_with('/') {
        return None;
    }
    let content_rest = &rest[tag_end + 1..];
    let content_end = content_rest.find("</rdf:li>")?;
    let value = content_rest[..content_end].trim();
    if value.is_empty() {
        None
    } else {
        Some(decode_xml_entities(value))
    }
}

/// Returns the trimmed, entity-decoded text between the first `<tag>` and the
/// following `</tag>`.
///
/// Only an attribute-less opening tag (`<tag>`) is recognised. Returns `None`
/// when the element is missing, unterminated, or empty after trimming.
fn extract_simple_tag(text: &str, tag: &str) -> Option<String> {
    let open = format!("<{tag}>");
    let close = format!("</{tag}>");
    let content_start = text.find(&open)? + open.len();
    let content_rest = &text[content_start..];
    let content_end = content_rest.find(&close)?;
    let value = content_rest[..content_end].trim();
    if value.is_empty() {
        None
    } else {
        Some(decode_xml_entities(value))
    }
}

/// Finds the byte offset of the first opening tag whose name is exactly `tag`.
///
/// A bare prefix match is not enough: `<dc:title` must be followed by `>`,
/// `/` or whitespace, otherwise an element such as `<dc:titles>` would be
/// mistaken for `<dc:title>`.
fn find_tag_start(text: &str, tag: &str) -> Option<usize> {
    let open = format!("<{tag}");
    let mut from = 0;
    while let Some(found) = text[from..].find(&open) {
        let start = from + found;
        let name_end = start + open.len();
        let name_ends_here = matches!(
            text[name_end..].chars().next(),
            Some(c) if c == '>' || c == '/' || c.is_whitespace()
        );
        if name_ends_here {
            return Some(start);
        }
        // Longer name sharing our prefix; keep scanning after it.
        from = name_end;
    }
    None
}

/// Replaces XML character and entity references in `raw` with the characters
/// they stand for.
///
/// Handles the five predefined entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
/// `&apos;`) and numeric references (`&#65;`, `&#x41;`). Decoding is a single
/// pass, so `&amp;lt;` becomes `&lt;`, not `<`. Anything that is not a
/// recognised reference is copied through unchanged.
fn decode_xml_entities(raw: &str) -> String {
    let mut out = String::with_capacity(raw.len());
    let mut rest = raw;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        let resolved = after
            .find(';')
            .and_then(|semi| resolve_xml_entity(&after[..semi]).map(|ch| (ch, semi)));
        match resolved {
            Some((ch, semi)) => {
                out.push(ch);
                rest = &after[semi + 1..];
            }
            None => {
                // Not a reference we understand: keep the ampersand literally.
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}

/// Maps the text between `&` and `;` to the character it denotes, if any.
fn resolve_xml_entity(name: &str) -> Option<char> {
    match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        _ => {
            let reference = name.strip_prefix('#')?;
            let (digits, radix) = match reference.strip_prefix('x') {
                Some(hex) => (hex, 16),
                None => (reference, 10),
            };
            // `from_str_radix` would accept a leading `+`; XML does not.
            if !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
                return None;
            }
            let code = u32::from_str_radix(digits, radix).ok()?;
            char::from_u32(code).filter(|c| *c != '\0')
        }
    }
}
// Unit tests for XMP field extraction and shared-form detection.
#[cfg(test)]
mod tests {
use super::*;
// A complete packet: every field of `XmpMetadata` is present and extracted.
#[test]
fn test_parse_basic_xmp() {
let xml = br#"<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xmp="http://ns.adobe.com/xap/1.0/"
xmlns:pdf="http://ns.adobe.com/pdf/1.3/">
<rdf:Description rdf:about="">
<dc:title>
<rdf:Alt>
<rdf:li xml:lang="x-default">Test Document</rdf:li>
</rdf:Alt>
</dc:title>
<dc:creator>
<rdf:Seq>
<rdf:li>John Doe</rdf:li>
</rdf:Seq>
</dc:creator>
<dc:description>
<rdf:Alt>
<rdf:li xml:lang="x-default">A test PDF document</rdf:li>
</rdf:Alt>
</dc:description>
<xmp:CreateDate>2024-01-15T10:30:00Z</xmp:CreateDate>
<xmp:ModifyDate>2024-06-01T14:00:00Z</xmp:ModifyDate>
<pdf:Producer>rpdfium 0.1</pdf:Producer>
<pdf:Keywords>test, pdf, document</pdf:Keywords>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>
<?xpacket end="w"?>"#;
let meta = XmpMetadata::from_xml(xml);
assert_eq!(meta.title.as_deref(), Some("Test Document"));
assert_eq!(meta.creator.as_deref(), Some("John Doe"));
assert_eq!(meta.description.as_deref(), Some("A test PDF document"));
assert_eq!(meta.create_date.as_deref(), Some("2024-01-15T10:30:00Z"));
assert_eq!(meta.modify_date.as_deref(), Some("2024-06-01T14:00:00Z"));
assert_eq!(meta.producer.as_deref(), Some("rpdfium 0.1"));
assert_eq!(meta.keywords.as_deref(), Some("test, pdf, document"));
}
// A packet wrapper with no properties at all yields `None` fields.
#[test]
fn test_parse_empty_xmp() {
let xml = b"<?xpacket begin=\"\" id=\"W5M0MpCehiHzreSzNTczkc9d\"?><?xpacket end=\"w\"?>";
let meta = XmpMetadata::from_xml(xml);
assert!(meta.title.is_none());
assert!(meta.creator.is_none());
assert!(meta.create_date.is_none());
}
// Only one property is present; the others stay `None`.
#[test]
fn test_parse_partial_xmp() {
let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description>
<pdf:Producer>Test Producer</pdf:Producer>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;
let meta = XmpMetadata::from_xml(xml);
assert!(meta.title.is_none());
assert_eq!(meta.producer.as_deref(), Some("Test Producer"));
}
// Whitespace (including newlines) around element text is trimmed away.
#[test]
fn test_parse_xmp_with_whitespace() {
let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description>
<xmp:CreateDate>
2024-03-15
</xmp:CreateDate>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;
let meta = XmpMetadata::from_xml(xml);
assert_eq!(meta.create_date.as_deref(), Some("2024-03-15"));
}
// No workflow marker anywhere in the text: not a shared form.
#[test]
fn test_shared_form_none() {
let xml = b"<x:xmpmeta><rdf:RDF><rdf:Description></rdf:Description></rdf:RDF></x:xmpmeta>";
assert_eq!(check_for_shared_form(xml), SharedFormType::None);
}
// workflowType 0 maps to the e-mail based variant.
#[test]
fn test_shared_form_email() {
let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
<adhocwf:workflowType>0</adhocwf:workflowType>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;
assert_eq!(check_for_shared_form(xml), SharedFormType::EmailBased);
}
// workflowType 1 maps to the `AcrobatCom` variant.
#[test]
fn test_shared_form_acrobat_com() {
let xml = br#"<x:xmpmeta>
<rdf:RDF>
<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
<adhocwf:workflowType>1</adhocwf:workflowType>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>"#;
assert_eq!(check_for_shared_form(xml), SharedFormType::AcrobatCom);
}
// workflowType 2 maps to the filesystem based variant; the input here is a
// bare fragment without the surrounding xmpmeta/RDF wrappers.
#[test]
fn test_shared_form_filesystem() {
let xml =
br#"<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
<adhocwf:workflowType>2</adhocwf:workflowType>
</rdf:Description>"#;
assert_eq!(check_for_shared_form(xml), SharedFormType::FilesystemBased);
}
// The workflow namespace is declared but no workflowType element exists:
// the result falls back to the e-mail based variant.
#[test]
fn test_shared_form_namespace_no_type() {
let xml =
br#"<rdf:Description xmlns:adhocwf="http://ns.adobe.com/AcrobatAdhocWorkflow/1.0/">
</rdf:Description>"#;
assert_eq!(check_for_shared_form(xml), SharedFormType::EmailBased);
}
// The element may also use the `AcrobatAdhocWorkflow:` prefix.
#[test]
fn test_shared_form_alternate_namespace() {
let xml = br#"<rdf:Description xmlns:AcrobatAdhocWorkflow="http://example.com/">
<AcrobatAdhocWorkflow:workflowType>1</AcrobatAdhocWorkflow:workflowType>
</rdf:Description>"#;
assert_eq!(check_for_shared_form(xml), SharedFormType::AcrobatCom);
}
// Leading bytes that are not valid UTF-8 must not prevent extraction of the
// well-formed element that follows them.
#[test]
fn test_parse_invalid_utf8() {
let xml = b"\xff\xfe<xmp:CreateDate>2024-01-01</xmp:CreateDate>";
let meta = XmpMetadata::from_xml(xml);
assert_eq!(meta.create_date.as_deref(), Some("2024-01-01"));
}
}