use crate::core::config::ExtractionConfig;
use crate::types::{ArchiveEntry, ProcessingWarning};
use std::borrow::Cow;
use std::io::{Cursor, Read};
/// Extracts the files stored under `embeddings_prefix` inside an OOXML (ZIP) container
/// and runs each one through the regular extraction pipeline.
///
/// # Arguments
///
/// * `zip_bytes` - Raw bytes of the ZIP container.
/// * `embeddings_prefix` - Entry-name prefix selecting the embedded objects. Entries whose
///   name is exactly the prefix are ignored.
/// * `source_label` - Used to build the warning source, `"{source_label}_embedded_objects"`.
/// * `config` - Extraction configuration. Children are extracted with a clone whose
///   `max_archive_depth` is decremented by one (saturating).
///
/// # Returns
///
/// A pair `(children, warnings)`:
/// * `children` holds one [`ArchiveEntry`] per embedded file that was extracted successfully;
///   its `path` is the entry name with `embeddings_prefix` stripped.
/// * `warnings` holds one [`ProcessingWarning`] per embedded file that could not be opened,
///   read, or extracted, and per OLE compound file that was skipped.
///
/// Both vectors are empty when `config.max_archive_depth` is `0`, when `zip_bytes` cannot be
/// opened as a ZIP archive, or when no entry matches the prefix. Empty entries and entries
/// whose MIME type cannot be determined (or resolves to `application/octet-stream`) are
/// skipped without a warning.
pub async fn extract_ooxml_embedded_objects(
    zip_bytes: &[u8],
    embeddings_prefix: &str,
    source_label: &str,
    config: &ExtractionConfig,
) -> (Vec<ArchiveEntry>, Vec<ProcessingWarning>) {
    // Upper bound for the up-front buffer allocation. The size recorded in a ZIP header is
    // attacker-controlled, so it is only used as a hint; `read_to_end` grows the buffer
    // as needed when an entry is genuinely larger than this.
    const MAX_PREALLOC_BYTES: usize = 8 * 1024 * 1024;
    // First four bytes of the OLE compound file signature.
    const OLE_MAGIC: [u8; 4] = [0xD0, 0xCF, 0x11, 0xE0];

    let mut children = Vec::new();
    let mut warnings = Vec::new();

    // Depth budget exhausted: do not descend into embedded content at all.
    if config.max_archive_depth == 0 {
        return (children, warnings);
    }

    // A container that is not a readable ZIP simply yields no embedded objects.
    let mut archive = match zip::ZipArchive::new(Cursor::new(zip_bytes)) {
        Ok(a) => a,
        Err(_) => return (children, warnings),
    };

    // Collect names first so the archive is not borrowed while entries are read below.
    let embedding_names: Vec<String> = (0..archive.len())
        .filter_map(|i| {
            let file = archive.by_index(i).ok()?;
            let name = file.name();
            (name.starts_with(embeddings_prefix) && name.len() > embeddings_prefix.len())
                .then(|| name.to_string())
        })
        .collect();

    if embedding_names.is_empty() {
        return (children, warnings);
    }

    let mut child_config = config.clone();
    child_config.max_archive_depth = config.max_archive_depth.saturating_sub(1);

    // Single place that builds warnings so every one carries the same source label.
    let warn = |message: String| ProcessingWarning {
        source: Cow::Owned(format!("{}_embedded_objects", source_label)),
        message: Cow::Owned(message),
    };

    for entry_name in &embedding_names {
        let filename = entry_name
            .strip_prefix(embeddings_prefix)
            .unwrap_or(entry_name)
            .to_string();

        let data = match archive.by_name(entry_name) {
            Ok(mut file) => {
                // Clamp the header-declared size instead of trusting it (and instead of a
                // truncating `as usize` cast on 32-bit targets).
                let capacity_hint =
                    usize::try_from(file.size()).map_or(MAX_PREALLOC_BYTES, |n| n.min(MAX_PREALLOC_BYTES));
                let mut buf = Vec::with_capacity(capacity_hint);
                if file.read_to_end(&mut buf).is_err() {
                    warnings.push(warn(format!("Failed to read embedded file '{}'", filename)));
                    continue;
                }
                buf
            }
            Err(e) => {
                // Report the failure instead of dropping the entry silently.
                warnings.push(warn(format!("Failed to open embedded file '{}': {}", filename, e)));
                continue;
            }
        };

        if data.is_empty() {
            continue;
        }

        if data.starts_with(&OLE_MAGIC) {
            warnings.push(warn(format!(
                "Skipped OLE compound file '{}': format identification not supported",
                filename
            )));
            continue;
        }

        // Prefer content sniffing; fall back to guessing from the file extension.
        let detected_mime = crate::core::mime::detect_mime_type_from_bytes(&data).ok().or_else(|| {
            std::path::Path::new(&filename)
                .extension()
                .and_then(|ext| ext.to_str())
                .and_then(|ext| mime_guess::from_ext(ext).first())
                .map(|m| m.to_string())
        });

        // Unknown or generic binary content cannot be routed to an extractor.
        let file_mime = match detected_mime {
            Some(m) if m != "application/octet-stream" => m,
            _ => continue,
        };

        match crate::core::extractor::extract_bytes(&data, &file_mime, &child_config).await {
            Ok(result) => {
                children.push(ArchiveEntry {
                    path: filename,
                    mime_type: file_mime,
                    result: Box::new(result),
                });
            }
            Err(e) => {
                warnings.push(warn(format!("Failed to extract embedded '{}': {}", filename, e)));
            }
        }
    }

    (children, warnings)
}