use super::{ArchiveEntry, ArchiveMetadata, TEXT_EXTENSIONS};
use crate::error::{KreuzbergError, Result};
use crate::extractors::security::SecurityLimits;
use ahash::AHashMap;
use std::io::{Cursor, Read};
use zip::ZipArchive;
pub fn extract_zip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
let cursor = Cursor::new(bytes);
let mut archive =
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
if archive.len() > limits.max_files_in_archive {
return Err(KreuzbergError::validation(format!(
"ZIP archive has too many files: {} (max: {})",
archive.len(),
limits.max_files_in_archive
)));
}
let mut file_list = Vec::with_capacity(archive.len());
let mut total_size = 0u64;
for i in 0..archive.len() {
let file = archive
.by_index(i)
.map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP entry: {}", e)))?;
let path = file.name().to_string();
let size = file.size();
let is_dir = file.is_dir();
if !is_dir {
total_size += size;
}
if total_size > limits.max_archive_size as u64 {
return Err(KreuzbergError::validation(format!(
"ZIP archive total uncompressed size exceeds limit: {} bytes (max: {} bytes)",
total_size, limits.max_archive_size
)));
}
file_list.push(ArchiveEntry { path, size, is_dir });
}
Ok(ArchiveMetadata {
format: "ZIP".to_string(),
file_list,
file_count: archive.len(),
total_size,
})
}
pub fn extract_zip_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<AHashMap<String, String>> {
let cursor = Cursor::new(bytes);
let mut archive =
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
if archive.len() > limits.max_files_in_archive {
return Err(KreuzbergError::validation(format!(
"ZIP archive has too many files: {} (max: {})",
archive.len(),
limits.max_files_in_archive
)));
}
let estimated_text_files = archive.len().saturating_mul(3).saturating_div(10).max(2);
let mut contents = AHashMap::with_capacity(estimated_text_files);
let mut total_content_size = 0usize;
for i in 0..archive.len() {
let mut file = archive
.by_index(i)
.map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP entry: {}", e)))?;
let path = file.name().to_string();
if !file.is_dir() && TEXT_EXTENSIONS.iter().any(|ext| path.to_lowercase().ends_with(ext)) {
let estimated_size = (file.size() as usize).min(10 * 1024 * 1024);
let mut content = String::with_capacity(estimated_size);
if file.read_to_string(&mut content).is_ok() {
total_content_size = total_content_size.saturating_add(content.len());
if total_content_size > limits.max_content_size {
return Err(KreuzbergError::validation(format!(
"ZIP archive text content exceeds limit: {} bytes (max: {} bytes)",
total_content_size, limits.max_content_size
)));
}
contents.insert(path, content);
}
}
}
Ok(contents)
}
pub fn extract_zip_file_bytes(bytes: &[u8], limits: &SecurityLimits) -> Result<AHashMap<String, Vec<u8>>> {
let cursor = Cursor::new(bytes);
let mut archive =
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
if archive.len() > limits.max_files_in_archive {
return Err(KreuzbergError::validation(format!(
"ZIP archive has too many files: {} (max: {})",
archive.len(),
limits.max_files_in_archive
)));
}
let mut file_bytes = AHashMap::with_capacity(archive.len());
let mut total_size = 0usize;
for i in 0..archive.len() {
let mut file = archive
.by_index(i)
.map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP entry: {}", e)))?;
if file.is_dir() {
continue;
}
let path = file.name().to_string();
let estimated_size = (file.size() as usize).min(limits.max_archive_size);
let mut content = Vec::with_capacity(estimated_size);
if file.read_to_end(&mut content).is_ok() {
total_size = total_size.saturating_add(content.len());
if total_size > limits.max_archive_size {
return Err(KreuzbergError::validation(format!(
"ZIP archive total extracted size exceeds limit: {} bytes (max: {} bytes)",
total_size, limits.max_archive_size
)));
}
file_bytes.insert(path, content);
}
}
Ok(file_bytes)
}