use super::bindings::bind_pdfium;
use super::error::{PdfError, Result};
use crate::types::{PageBoundary, PageInfo, PageStructure, PageUnitType};
use pdfium_render::prelude::*;
use serde::{Deserialize, Serialize};
/// PDF-specific document properties.
///
/// Every field is optional and is left out of the serialized output when it is `None`.
/// In this file the struct is populated by `extract_pdf_specific_metadata`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct PdfMetadata {
/// PDF version rendered as text (for example "1.7"); `None` when the document reports no version.
#[serde(skip_serializing_if = "Option::is_none")]
pub pdf_version: Option<String>,
/// Value of the document's `Producer` metadata tag, if present.
#[serde(skip_serializing_if = "Option::is_none")]
pub producer: Option<String>,
/// `Some(true)` when the security handler revision is anything other than unprotected;
/// `None` when the revision could not be read.
#[serde(skip_serializing_if = "Option::is_none")]
pub is_encrypted: Option<bool>,
/// Width of the first page, rounded to the nearest integer.
/// NOTE(review): presumably PDF points, as reported by pdfium — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub width: Option<i64>,
/// Height of the first page, rounded to the nearest integer (same unit as `width`).
#[serde(skip_serializing_if = "Option::is_none")]
pub height: Option<i64>,
/// Number of pages in the document.
#[serde(skip_serializing_if = "Option::is_none")]
pub page_count: Option<usize>,
}
/// Complete metadata gathered for a PDF during extraction: the common document
/// information fields, the PDF-specific properties, and an optional per-page structure.
///
/// Optional fields are left out of the serialized output when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfExtractionMetadata {
/// Value of the `Title` metadata tag.
#[serde(skip_serializing_if = "Option::is_none")]
pub title: Option<String>,
/// Value of the `Subject` metadata tag.
#[serde(skip_serializing_if = "Option::is_none")]
pub subject: Option<String>,
/// Individual author names parsed from the `Author` tag; `None` when no name was found.
#[serde(skip_serializing_if = "Option::is_none")]
pub authors: Option<Vec<String>>,
/// Individual keywords parsed from the `Keywords` tag; `None` when no keyword was found.
#[serde(skip_serializing_if = "Option::is_none")]
pub keywords: Option<Vec<String>>,
/// Creation date derived from the `CreationDate` tag via `parse_pdf_date`.
#[serde(skip_serializing_if = "Option::is_none")]
pub created_at: Option<String>,
/// Modification date derived from the `ModificationDate` tag via `parse_pdf_date`.
#[serde(skip_serializing_if = "Option::is_none")]
pub modified_at: Option<String>,
/// Value of the `Creator` metadata tag.
#[serde(skip_serializing_if = "Option::is_none")]
pub created_by: Option<String>,
/// PDF-specific properties (version, producer, encryption, first-page size, page count).
pub pdf_specific: PdfMetadata,
/// Per-page structure; only built when page boundaries were supplied to the extractor.
#[serde(skip_serializing_if = "Option::is_none")]
pub page_structure: Option<PageStructure>,
}
/// Extracts PDF-specific metadata from raw PDF bytes without supplying a password.
///
/// Delegates to [`extract_metadata_with_password`] with `None` as the password, so the
/// same errors apply.
pub fn extract_metadata(pdf_bytes: &[u8]) -> Result<PdfMetadata> {
extract_metadata_with_password(pdf_bytes, None)
}
pub fn extract_metadata_with_password(pdf_bytes: &[u8], password: Option<&str>) -> Result<PdfMetadata> {
let pdfium = bind_pdfium(PdfError::MetadataExtractionFailed, "metadata extraction", None)?;
let document = pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
let err_msg = super::error::format_pdfium_error(e);
if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
PdfError::InvalidPassword
} else if err_msg.contains("password") || err_msg.contains("Password") {
PdfError::PasswordRequired
} else {
PdfError::MetadataExtractionFailed(err_msg)
}
})?;
extract_pdf_specific_metadata(&document)
}
/// Extracts PDF-specific metadata, trying each candidate in `passwords` in order.
///
/// The first candidate that succeeds wins. If every candidate fails, the error from
/// the last attempt is returned. When `passwords` is empty, extraction is attempted
/// once without a password.
pub fn extract_metadata_with_passwords(pdf_bytes: &[u8], passwords: &[&str]) -> Result<PdfMetadata> {
    let mut most_recent_failure = None;

    for candidate in passwords.iter().copied() {
        match extract_metadata_with_password(pdf_bytes, Some(candidate)) {
            Ok(metadata) => return Ok(metadata),
            Err(failure) => most_recent_failure = Some(failure),
        }
    }

    // No recorded failure means no candidates were supplied: fall back to a
    // password-less attempt.
    most_recent_failure.map_or_else(|| extract_metadata(pdf_bytes), Err)
}
/// Builds the complete [`PdfExtractionMetadata`] for an already-loaded document.
///
/// * `page_boundaries` - when `Some`, a page structure is built from these boundaries.
/// * `content` - extracted text that the boundaries' byte offsets index into; a missing
///   value is treated as the empty string.
///
/// Delegates to `extract_metadata_from_document_impl`.
pub fn extract_metadata_from_document(
document: &PdfDocument<'_>,
page_boundaries: Option<&[PageBoundary]>,
content: Option<&str>,
) -> Result<PdfExtractionMetadata> {
extract_metadata_from_document_impl(document, page_boundaries, content.unwrap_or(""))
}
pub(crate) fn extract_metadata_from_document_impl(
document: &PdfDocument<'_>,
page_boundaries: Option<&[PageBoundary]>,
content: &str,
) -> Result<PdfExtractionMetadata> {
let pdf_specific = extract_pdf_specific_metadata(document)?;
let common = extract_common_metadata_from_document(document)?;
let page_structure = if let Some(boundaries) = page_boundaries {
Some(build_page_structure(document, boundaries, content)?)
} else {
None
};
Ok(PdfExtractionMetadata {
title: common.title,
subject: common.subject,
authors: common.authors,
keywords: common.keywords,
created_at: common.created_at,
modified_at: common.modified_at,
created_by: common.created_by,
pdf_specific,
page_structure,
})
}
/// Reads the PDF-specific properties of `document`: version, producer, encryption
/// flag, first-page size and page count.
///
/// `is_encrypted` is `None` when the security handler revision cannot be read.
/// `width` and `height` are `None` when the document has no pages or the first
/// page's size cannot be read. This function currently never returns `Err`.
fn extract_pdf_specific_metadata(document: &PdfDocument<'_>) -> Result<PdfMetadata> {
    let pages = document.pages();

    let pdf_version = format_pdf_version(document.version());

    let is_encrypted = document
        .permissions()
        .security_handler_revision()
        .ok()
        .map(|revision| revision != PdfSecurityHandlerRevision::Unprotected);

    let producer = document
        .metadata()
        .get(PdfDocumentMetadataTagType::Producer)
        .map(|tag| tag.value().to_string());

    // The first page's size stands in for the document's dimensions.
    let first_page_size = if pages.is_empty() {
        None
    } else {
        pages.page_size(0).ok()
    };
    let (width, height) = match first_page_size {
        Some(rect) => (
            Some(rect.width().value.round() as i64),
            Some(rect.height().value.round() as i64),
        ),
        None => (None, None),
    };

    Ok(PdfMetadata {
        pdf_version,
        producer,
        is_encrypted,
        width,
        height,
        page_count: Some(pages.len() as usize),
    })
}
fn build_page_structure(
document: &PdfDocument<'_>,
boundaries: &[PageBoundary],
content: &str,
) -> Result<PageStructure> {
let total_count = document.pages().len() as usize;
if boundaries.is_empty() {
return Err(PdfError::MetadataExtractionFailed(
"No page boundaries provided for PageStructure".to_string(),
));
}
if boundaries.len() != total_count {
return Err(PdfError::MetadataExtractionFailed(format!(
"Boundary count {} doesn't match page count {}",
boundaries.len(),
total_count
)));
}
let mut pages = Vec::new();
for (index, boundary) in boundaries.iter().enumerate() {
let page_number = boundary.page_number;
let dimensions = if let Ok(page_rect) = document.pages().page_size(index as i32) {
Some((page_rect.width().value as f64, page_rect.height().value as f64))
} else {
None
};
let is_blank = if boundary.byte_start <= boundary.byte_end && boundary.byte_end <= content.len() {
let page_text = &content[boundary.byte_start..boundary.byte_end];
Some(crate::extraction::blank_detection::is_page_text_blank(page_text))
} else {
None
};
pages.push(PageInfo {
number: page_number,
title: None,
dimensions,
image_count: None,
table_count: None,
hidden: None,
is_blank,
});
}
Ok(PageStructure {
total_count,
unit_type: PageUnitType::Page,
boundaries: Some(boundaries.to_vec()),
pages: if pages.is_empty() { None } else { Some(pages) },
})
}
pub fn extract_common_metadata_from_document(document: &PdfDocument<'_>) -> Result<CommonPdfMetadata> {
let pdf_metadata = document.metadata();
let tag_types = [
PdfDocumentMetadataTagType::Title,
PdfDocumentMetadataTagType::Subject,
PdfDocumentMetadataTagType::Author,
PdfDocumentMetadataTagType::Keywords,
PdfDocumentMetadataTagType::CreationDate,
PdfDocumentMetadataTagType::ModificationDate,
PdfDocumentMetadataTagType::Creator,
];
let mut metadata_cache: [Option<String>; 7] = Default::default();
for (index, tag_type) in tag_types.iter().enumerate() {
if let Some(tag) = pdf_metadata.get(*tag_type) {
metadata_cache[index] = Some(tag.value().to_string());
}
}
let title = metadata_cache[0].clone();
let subject = metadata_cache[1].clone();
let authors = metadata_cache[2]
.as_ref()
.map(|author_str| parse_authors(author_str))
.filter(|parsed| !parsed.is_empty());
let keywords = metadata_cache[3]
.as_ref()
.map(|keywords_str| parse_keywords(keywords_str))
.filter(|parsed| !parsed.is_empty());
let created_at = metadata_cache[4].as_ref().map(|date_str| parse_pdf_date(date_str));
let modified_at = metadata_cache[5].as_ref().map(|date_str| parse_pdf_date(date_str));
let created_by = metadata_cache[6].clone();
Ok(CommonPdfMetadata {
title,
subject,
authors,
keywords,
created_at,
modified_at,
created_by,
})
}
/// Document information fields shared across formats, as read from a PDF's metadata
/// tags by `extract_common_metadata_from_document`.
pub struct CommonPdfMetadata {
/// Value of the `Title` tag.
pub title: Option<String>,
/// Value of the `Subject` tag.
pub subject: Option<String>,
/// Author names parsed from the `Author` tag; `None` when no name was found.
pub authors: Option<Vec<String>>,
/// Keywords parsed from the `Keywords` tag; `None` when no keyword was found.
pub keywords: Option<Vec<String>>,
/// Creation date, converted from the `CreationDate` tag by `parse_pdf_date`.
pub created_at: Option<String>,
/// Modification date, converted from the `ModificationDate` tag by `parse_pdf_date`.
pub modified_at: Option<String>,
/// Value of the `Creator` tag.
pub created_by: Option<String>,
}
/// Splits an author string into individual names.
///
/// The separators are `;`, `,` and the word " and " (with surrounding spaces). Each
/// name is trimmed and empty entries are dropped; order is preserved.
fn parse_authors(author_str: &str) -> Vec<String> {
    // Normalise " and " to a comma so a single split covers every separator.
    let normalised = author_str.replace(" and ", ", ");

    normalised
        .split(|c: char| c == ';' || c == ',')
        .map(str::trim)
        .filter(|name| !name.is_empty())
        .map(str::to_string)
        .collect()
}
/// Splits a keyword string on `,` and `;` into individual keywords.
///
/// Each keyword is trimmed and empty entries are dropped; order is preserved.
fn parse_keywords(keywords_str: &str) -> Vec<String> {
    let mut keywords = Vec::new();
    for piece in keywords_str.split(|c: char| c == ',' || c == ';') {
        let keyword = piece.trim();
        if !keyword.is_empty() {
            keywords.push(keyword.to_string());
        }
    }
    keywords
}
/// Converts a PDF date string into an ISO-8601-style timestamp.
///
/// Accepted shapes (after trimming surrounding whitespace):
///
/// * `D:YYYYMMDD`, optionally followed by `HHmm` or `HHmmSS`
/// * `YYYYMMDD` without the `D:` prefix (any trailing characters are ignored and
///   the time is reported as midnight)
///
/// The result has the form `YYYY-MM-DDTHH:MM:SSZ`. Time components that are missing
/// or not numeric default to `00`. If the year, month and day are not all ASCII
/// digits, the input is returned unchanged (untrimmed), so free-form or malformed
/// values are passed through instead of being sliced into garbage or causing a
/// panic on non-ASCII text.
///
/// NOTE(review): any timezone suffix (e.g. `+05'30'`) is ignored and the output always
/// ends in `Z`, exactly as before this validation was added — confirm whether callers
/// need the offset preserved.
fn parse_pdf_date(date_str: &str) -> String {
    /// Returns the `len`-byte field starting at `start` when it exists and consists
    /// solely of ASCII digits. Checked slicing keeps this panic-free on any input.
    fn digits_at(text: &str, start: usize, len: usize) -> Option<&str> {
        text.get(start..start + len)
            .filter(|field| field.bytes().all(|b| b.is_ascii_digit()))
    }

    let cleaned = date_str.trim();
    let (body, has_prefix) = match cleaned.strip_prefix("D:") {
        Some(rest) => (rest, true),
        None => (cleaned, false),
    };

    let (year, month, day) = match (digits_at(body, 0, 4), digits_at(body, 4, 2), digits_at(body, 6, 2)) {
        (Some(year), Some(month), Some(day)) => (year, month, day),
        // Not a recognisable date: hand the caller back exactly what they gave us.
        _ => return date_str.to_string(),
    };

    // A time of day is only read from prefixed values, and only when both the hour
    // and the minute are present; seconds are optional.
    let (hour, minute, second) = if has_prefix {
        match (digits_at(body, 8, 2), digits_at(body, 10, 2)) {
            (Some(hour), Some(minute)) => (hour, minute, digits_at(body, 12, 2).unwrap_or("00")),
            _ => ("00", "00", "00"),
        }
    } else {
        ("00", "00", "00")
    };

    format!("{}-{}-{}T{}:{}:{}Z", year, month, day, hour, minute, second)
}
/// Renders a pdfium document version as a dotted string such as "1.7".
///
/// Returns `None` for an unset version. For `Other(value)`, values of 10 or more are
/// rendered as `<value / 10>.<value % 10>` and smaller values are rendered as-is.
fn format_pdf_version(version: PdfDocumentVersion) -> Option<String> {
    let label: &'static str = match version {
        PdfDocumentVersion::Unset => return None,
        // Non-standard versions are formatted on the fly and returned directly.
        PdfDocumentVersion::Other(raw) if raw >= 10 => {
            return Some(format!("{}.{}", raw / 10, raw % 10));
        }
        PdfDocumentVersion::Other(raw) => return Some(raw.to_string()),
        PdfDocumentVersion::Pdf1_0 => "1.0",
        PdfDocumentVersion::Pdf1_1 => "1.1",
        PdfDocumentVersion::Pdf1_2 => "1.2",
        PdfDocumentVersion::Pdf1_3 => "1.3",
        PdfDocumentVersion::Pdf1_4 => "1.4",
        PdfDocumentVersion::Pdf1_5 => "1.5",
        PdfDocumentVersion::Pdf1_6 => "1.6",
        PdfDocumentVersion::Pdf1_7 => "1.7",
        PdfDocumentVersion::Pdf2_0 => "2.0",
    };
    Some(label.to_string())
}
// Unit tests for the string-parsing helpers, plus one smoke test that runs the
// pdfium-backed extraction on invalid input.
#[cfg(test)]
mod tests {
use super::*;
// --- parse_authors: each supported separator ---
#[test]
fn test_parse_authors_single() {
let authors = parse_authors("John Doe");
assert_eq!(authors, vec!["John Doe"]);
}
#[test]
fn test_parse_authors_multiple_comma() {
let authors = parse_authors("John Doe, Jane Smith");
assert_eq!(authors, vec!["John Doe", "Jane Smith"]);
}
#[test]
fn test_parse_authors_multiple_and() {
let authors = parse_authors("John Doe and Jane Smith");
assert_eq!(authors, vec!["John Doe", "Jane Smith"]);
}
#[test]
fn test_parse_authors_semicolon() {
let authors = parse_authors("John Doe;Jane Smith");
assert_eq!(authors, vec!["John Doe", "Jane Smith"]);
}
// --- parse_keywords: comma, semicolon and empty input ---
#[test]
fn test_parse_keywords() {
let keywords = parse_keywords("pdf, document, test");
assert_eq!(keywords, vec!["pdf", "document", "test"]);
}
#[test]
fn test_parse_keywords_semicolon() {
let keywords = parse_keywords("pdf;document;test");
assert_eq!(keywords, vec!["pdf", "document", "test"]);
}
#[test]
fn test_parse_keywords_empty() {
let keywords = parse_keywords("");
assert!(keywords.is_empty());
}
// --- parse_pdf_date: full timestamp, date only, and missing "D:" prefix ---
#[test]
fn test_parse_pdf_date_full() {
let date = parse_pdf_date("D:20230115123045");
assert_eq!(date, "2023-01-15T12:30:45Z");
}
#[test]
fn test_parse_pdf_date_no_time() {
let date = parse_pdf_date("D:20230115");
assert_eq!(date, "2023-01-15T00:00:00Z");
}
#[test]
fn test_parse_pdf_date_no_prefix() {
let date = parse_pdf_date("20230115");
assert_eq!(date, "2023-01-15T00:00:00Z");
}
// Runs the pdfium-backed extraction on bytes that are not a PDF; marked serial so it
// does not run concurrently with other serial tests.
#[test]
#[serial_test::serial]
fn test_extract_metadata_invalid_pdf() {
let result = extract_metadata(b"not a pdf");
assert!(result.is_err());
}
// NOTE(review): the two tests below never call `build_page_structure`; they only
// check string literals/formatting that mirror its error messages.
#[test]
fn test_build_page_structure_empty_boundaries() {
let result_msg = "No page boundaries provided for PageStructure".to_string();
assert!(!result_msg.is_empty());
}
#[test]
fn test_build_page_structure_boundary_mismatch_message() {
let boundaries_count = 3;
let page_count = 5;
let error_msg = format!(
"Boundary count {} doesn't match page count {}",
boundaries_count, page_count
);
assert_eq!(error_msg, "Boundary count 3 doesn't match page count 5");
}
}