use crate::types::{ArchiveEntry, ProcessingWarning};
use lopdf::{Document, Object};
use std::borrow::Cow;
/// A single file attachment pulled out of a PDF document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmbeddedFile {
    /// Name of the attachment: the file specification's `UF`/`F` entry when
    /// present, otherwise the key it was stored under in the name tree.
    pub name: String,
    /// Contents of the embedded file stream (decompressed when possible).
    pub data: Vec<u8>,
    /// MIME type taken from the stream's `Subtype` entry or guessed from the
    /// file extension; `None` when neither source yields one.
    pub mime_type: Option<String>,
}
/// Collects every file attached to `document`.
///
/// Follows the catalog's `Names` dictionary to its `EmbeddedFiles` name tree
/// and walks that tree. Any missing or wrongly typed link along the way is
/// treated as "no attachments" and yields an empty vector.
pub fn extract_embedded_files(document: &Document) -> Vec<EmbeddedFile> {
    let mut found = Vec::new();

    let Ok(root) = document.catalog() else {
        return found;
    };

    // Both lookups go through `resolve_object`, so each entry may be stored
    // inline or behind an indirect reference.
    let Some(Object::Dictionary(names)) = root
        .get(b"Names")
        .ok()
        .and_then(|entry| resolve_object(document, entry))
    else {
        return found;
    };
    let Some(Object::Dictionary(tree_root)) = names
        .get(b"EmbeddedFiles")
        .ok()
        .and_then(|entry| resolve_object(document, entry))
    else {
        return found;
    };

    collect_from_name_tree(document, &tree_root, &mut found);
    found
}
fn collect_from_name_tree(document: &Document, dict: &lopdf::Dictionary, files: &mut Vec<EmbeddedFile>) {
if let Ok(Object::Array(names_arr)) = dict.get(b"Names") {
let mut i = 0;
while i + 1 < names_arr.len() {
let name = match &names_arr[i] {
Object::String(bytes, _) => String::from_utf8_lossy(bytes).into_owned(),
_ => {
i += 2;
continue;
}
};
let filespec = resolve_object(document, &names_arr[i + 1]);
if let Some(Object::Dictionary(fs_dict)) = filespec
&& let Some(ef) = extract_file_from_filespec(document, &name, &fs_dict)
{
files.push(ef);
}
i += 2;
}
}
if let Ok(Object::Array(kids)) = dict.get(b"Kids") {
for kid in kids {
let kid_obj = resolve_object(document, kid);
if let Some(Object::Dictionary(kid_dict)) = kid_obj {
collect_from_name_tree(document, &kid_dict, files);
}
}
}
}
/// Builds an [`EmbeddedFile`] from a file specification dictionary.
///
/// * `tree_name` – key the specification was stored under in the name tree;
///   used as the file name when the dictionary has no usable `UF`/`F` string.
/// * `fs_dict` – the file specification dictionary itself.
///
/// Returns `None` when the dictionary has no `EF` entry, when `EF` is not a
/// dictionary, or when its `F`/`UF` entry is not a reference to a stream
/// object in `document`.
fn extract_file_from_filespec(
    document: &Document,
    tree_name: &str,
    fs_dict: &lopdf::Dictionary,
) -> Option<EmbeddedFile> {
    // Prefer `UF`, then `F`; use whichever is the first to hold a string.
    let display_name = [&b"UF"[..], &b"F"[..]]
        .into_iter()
        .find_map(|key| match fs_dict.get(key) {
            Ok(Object::String(bytes, _)) => Some(decode_pdf_text_string(bytes)),
            _ => None,
        })
        .unwrap_or_else(|| tree_name.to_string());

    let Object::Dictionary(ef_dict) = resolve_object(document, fs_dict.get(b"EF").ok()?)? else {
        return None;
    };
    let stream_id = ef_dict
        .get(b"F")
        .or_else(|_| ef_dict.get(b"UF"))
        .ok()?
        .as_reference()
        .ok()?;
    let Ok(Object::Stream(stream)) = document.get_object(stream_id) else {
        return None;
    };

    // Best effort: when the content cannot be decompressed, hand back the
    // stream bytes exactly as stored.
    let data = stream
        .decompressed_content()
        .unwrap_or_else(|_| stream.content.clone());

    // Take the MIME type from the stream's `Subtype` name; otherwise guess it
    // from the file extension.
    let mime_type = stream
        .dict
        .get(b"Subtype")
        .ok()
        .and_then(|obj| obj.as_name().ok())
        .map(|name| String::from_utf8_lossy(name).into_owned())
        .or_else(|| {
            std::path::Path::new(&display_name)
                .extension()
                .and_then(|ext| ext.to_str())
                .and_then(|ext| mime_guess::from_ext(ext).first())
                .map(|m| m.to_string())
        });

    Some(EmbeddedFile {
        name: display_name,
        data,
        mime_type,
    })
}

/// Decodes the raw bytes of a PDF text string.
///
/// Bytes starting with the UTF-16BE byte-order mark (`FE FF`) are decoded as
/// UTF-16BE, and a UTF-8 byte-order mark (`EF BB BF`) is stripped. Everything
/// else is read as lossy UTF-8, which is exact for plain ASCII names.
fn decode_pdf_text_string(bytes: &[u8]) -> String {
    if let Some(utf16) = bytes.strip_prefix(&[0xFE_u8, 0xFF]) {
        let units: Vec<u16> = utf16
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }
    let utf8 = bytes.strip_prefix(&[0xEF_u8, 0xBB, 0xBF]).unwrap_or(bytes);
    String::from_utf8_lossy(utf8).into_owned()
}
/// Returns an owned copy of `obj`, first following it into `document` when it
/// is an indirect reference.
///
/// Yields `None` only when `obj` is a reference that `document` cannot look up.
fn resolve_object<'a>(document: &'a Document, obj: &'a Object) -> Option<Object> {
    let target = if let Object::Reference(id) = obj {
        document.get_object(*id).ok()?
    } else {
        obj
    };
    Some(target.clone())
}
pub async fn extract_and_process_embedded_files(
pdf_bytes: &[u8],
config: &crate::core::config::ExtractionConfig,
) -> (Vec<ArchiveEntry>, Vec<ProcessingWarning>) {
let mut children = Vec::new();
let mut warnings = Vec::new();
let document = match Document::load_mem(pdf_bytes) {
Ok(doc) => doc,
Err(_) => return (children, warnings),
};
let embedded = extract_embedded_files(&document);
if embedded.is_empty() {
return (children, warnings);
}
if config.max_archive_depth == 0 {
return (children, warnings);
}
let mut child_config = config.clone();
child_config.max_archive_depth = config.max_archive_depth.saturating_sub(1);
for file in embedded {
let mime = file.mime_type.unwrap_or_else(|| {
std::path::Path::new(&file.name)
.extension()
.and_then(|ext| ext.to_str())
.and_then(|ext| mime_guess::from_ext(ext).first())
.map(|m| m.to_string())
.unwrap_or_else(|| "application/octet-stream".to_string())
});
if mime == "application/octet-stream" {
continue;
}
match crate::core::extractor::extract_bytes(&file.data, &mime, &child_config).await {
Ok(result) => {
children.push(ArchiveEntry {
path: file.name,
mime_type: mime,
result: Box::new(result),
});
}
Err(e) => {
warnings.push(ProcessingWarning {
source: Cow::Borrowed("pdf_embedded_files"),
message: Cow::Owned(format!("Failed to extract embedded '{}': {}", file.name, e)),
});
}
}
}
(children, warnings)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed document with nothing added to it must report
    /// no embedded files.
    #[test]
    fn test_extract_embedded_files_no_names() {
        let blank_document = Document::with_version("1.5");
        assert!(extract_embedded_files(&blank_document).is_empty());
    }
}