pub mod keynote;
pub mod numbers;
pub mod pages;
use crate::Result;
use crate::error::KreuzbergError;
use crate::text::utf8_validation;
use std::io::Cursor;
use std::io::Read;
/// Size ceiling, in bytes (64 MiB), applied while handling a single IWA file.
///
/// It caps the total output accumulated by `decode_iwa_stream` and bounds the capacity
/// pre-allocated for buffers that hold IWA entries read from the archive.
const MAX_IWA_DECOMPRESSED_SIZE: usize = 64 * 1024 * 1024;
/// Lists the names of all entries in an iWork ZIP container whose name ends with `.iwa`.
///
/// Entries are visited in archive index order. An entry that cannot be opened is skipped
/// rather than reported.
///
/// # Errors
///
/// Returns a parsing error when `content` cannot be opened as a ZIP archive.
pub fn collect_iwa_paths(content: &[u8]) -> Result<Vec<String>> {
    let mut archive = zip::ZipArchive::new(Cursor::new(content))
        .map_err(|e| KreuzbergError::parsing(format!("Failed to open iWork ZIP: {e}")))?;

    let mut iwa_paths = Vec::new();
    for index in 0..archive.len() {
        // Unreadable entries are ignored on purpose: this is a best-effort listing.
        let Ok(entry) = archive.by_index(index) else {
            continue;
        };
        let entry_name = entry.name();
        if entry_name.ends_with(".iwa") {
            iwa_paths.push(entry_name.to_string());
        }
    }

    Ok(iwa_paths)
}
/// Reads one IWA entry from an iWork ZIP container and returns its decoded bytes.
///
/// The entry named `path` is read from the archive in `content` and then passed through
/// [`decode_iwa_stream`].
///
/// At most `MAX_IWA_DECOMPRESSED_SIZE` bytes are accepted from the archive entry. The read is
/// capped so that a small, highly compressed ZIP entry cannot expand into an arbitrarily large
/// in-memory buffer.
///
/// # Errors
///
/// Returns a parsing error when:
/// - `content` cannot be opened as a ZIP archive,
/// - no entry named `path` exists,
/// - reading the entry fails,
/// - the entry yields more than `MAX_IWA_DECOMPRESSED_SIZE` bytes, or
/// - the IWA stream cannot be decoded.
pub fn read_iwa_file(content: &[u8], path: &str) -> Result<Vec<u8>> {
    let cursor = Cursor::new(content);
    let mut archive =
        zip::ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to open iWork ZIP: {e}")))?;
    let mut file = archive
        .by_name(path)
        .map_err(|_| KreuzbergError::parsing(format!("IWA file not found in archive: {path}")))?;

    // The size reported by the entry is only a capacity hint; it is never trusted as a bound.
    let size_hint = usize::try_from(file.size()).unwrap_or(usize::MAX);
    let mut raw = Vec::with_capacity(size_hint.min(MAX_IWA_DECOMPRESSED_SIZE));

    // Read one byte past the limit so an oversized entry can be told apart from one that is
    // exactly at the limit.
    let read_limit = MAX_IWA_DECOMPRESSED_SIZE as u64 + 1;
    file.by_ref()
        .take(read_limit)
        .read_to_end(&mut raw)
        .map_err(|e| KreuzbergError::parsing(format!("Failed to read IWA file {path}: {e}")))?;
    if raw.len() > MAX_IWA_DECOMPRESSED_SIZE {
        return Err(KreuzbergError::parsing(format!(
            "IWA file {path} exceeds size limit ({MAX_IWA_DECOMPRESSED_SIZE} bytes)"
        )));
    }

    decode_iwa_stream(&raw).map_err(|e| KreuzbergError::parsing(format!("Failed to decode IWA {path}: {e}")))
}
/// Decodes a chunked IWA stream into the concatenation of its chunk contents.
///
/// The input is parsed as a sequence of chunks. Each chunk starts with a 4-byte header: one
/// chunk-type byte followed by a 24-bit little-endian payload length. Chunks are handled by type:
///
/// - `0x00`: the payload is Snappy-decompressed (raw format) and appended to the output.
/// - `0x01`: the payload is appended to the output unchanged.
/// - anything else: the chunk is logged at debug level and skipped.
///
/// Trailing bytes too short to form a complete 4-byte header are ignored.
///
/// # Errors
///
/// Returns a descriptive message when a chunk's declared length runs past the end of `data`,
/// when Snappy decompression fails, or when the accumulated output would exceed
/// `MAX_IWA_DECOMPRESSED_SIZE`.
pub fn decode_iwa_stream(data: &[u8]) -> std::result::Result<Vec<u8>, String> {
    let mut decoder = snap::raw::Decoder::new();
    let mut output = Vec::new();
    let mut i = 0usize;
    while i + 4 <= data.len() {
        let chunk_type = data[i];
        let chunk_len = u32::from_le_bytes([data[i + 1], data[i + 2], data[i + 3], 0]) as usize;
        i += 4;
        let end = i + chunk_len;
        if end > data.len() {
            return Err(format!(
                "IWA chunk out of bounds: offset={i}, chunk_len={chunk_len}, data_len={}",
                data.len()
            ));
        }
        let payload = &data[i..end];
        i = end;
        match chunk_type {
            0x00 => {
                // Validate the length declared in the Snappy header BEFORE decompressing:
                // `decompress_vec` allocates that many bytes up front, so a tiny malicious
                // chunk could otherwise force a very large allocation.
                let declared_len = snap::raw::decompress_len(payload)
                    .map_err(|e| format!("Snappy decompression failed: {e}"))?;
                if output.len().saturating_add(declared_len) > MAX_IWA_DECOMPRESSED_SIZE {
                    return Err(format!(
                        "Decompressed IWA exceeds size limit ({MAX_IWA_DECOMPRESSED_SIZE} bytes)"
                    ));
                }
                let decompressed = decoder
                    .decompress_vec(payload)
                    .map_err(|e| format!("Snappy decompression failed: {e}"))?;
                output.extend_from_slice(&decompressed);
            }
            0x01 => {
                if output.len() + payload.len() > MAX_IWA_DECOMPRESSED_SIZE {
                    return Err(format!(
                        "Uncompressed IWA exceeds size limit ({MAX_IWA_DECOMPRESSED_SIZE} bytes)"
                    ));
                }
                output.extend_from_slice(payload);
            }
            _ => {
                tracing::debug!("Unknown IWA chunk type: 0x{:02x}, len={chunk_len}", chunk_type);
            }
        }
    }
    Ok(output)
}
/// Maximum nesting depth followed when probing length-delimited fields for embedded messages.
///
/// Every length-delimited payload is speculatively re-parsed as a nested message, so without a
/// cap a crafted input could drive the recursion deep enough to overflow the stack.
const MAX_PROTO_NESTING_DEPTH: usize = 100;

/// Heuristically extracts text strings from protobuf-encoded bytes without a schema.
///
/// The input is scanned field by field using the wire type stored in the low three bits of each
/// tag. Varint (0), 64-bit (1) and 32-bit (5) fields are skipped. Each length-delimited (2)
/// payload is:
///
/// 1. kept as text when it is valid UTF-8 and, after trimming, is at least 3 bytes long and
///    contains at least one alphabetic or numeric character;
/// 2. additionally scanned as a nested message, up to `MAX_PROTO_NESTING_DEPTH` levels deep.
///
/// Scanning of a message stops silently at the first malformed varint, unsupported wire type, or
/// length that runs past the end of the buffer; everything collected up to that point is kept.
///
/// Returns the collected strings in encounter order: a payload's own text precedes any text
/// found inside it.
pub fn extract_text_from_proto(data: &[u8]) -> Vec<String> {
    let mut texts = Vec::new();
    collect_proto_text(data, 0, &mut texts);
    texts
}

/// Scans one message level of `data`, appending any text found to `texts`.
fn collect_proto_text(data: &[u8], depth: usize, texts: &mut Vec<String>) {
    let mut i = 0usize;
    while i < data.len() {
        let Some((tag_varint, tag_len)) = read_varint(data, i) else {
            break;
        };
        i += tag_len;
        match tag_varint & 0x7 {
            0 => {
                let Some((_, len)) = read_varint(data, i) else {
                    break;
                };
                i += len;
            }
            1 => i += 8,
            2 => {
                let Some((length, len_bytes)) = read_varint(data, i) else {
                    break;
                };
                i += len_bytes;
                // `length` comes straight from the input and may be as large as `u64::MAX`;
                // compute the end offset with checked arithmetic so a hostile value ends the
                // scan instead of overflowing or slicing out of range.
                let Some(end) = usize::try_from(length).ok().and_then(|len| i.checked_add(len)) else {
                    break;
                };
                let Some(payload) = data.get(i..end) else {
                    break;
                };
                i = end;
                if let Ok(s) = utf8_validation::from_utf8(payload) {
                    let trimmed = s.trim();
                    if trimmed.len() >= 3 && trimmed.chars().any(|c| c.is_alphabetic() || c.is_numeric()) {
                        texts.push(trimmed.to_string());
                    }
                }
                if depth < MAX_PROTO_NESTING_DEPTH {
                    collect_proto_text(payload, depth + 1, texts);
                }
            }
            5 => i += 4,
            _ => break,
        }
    }
}
/// Decodes a base-128 varint from `data`, starting at byte offset `pos`.
///
/// Each byte contributes its low seven bits, least-significant group first; a clear high bit
/// marks the final byte. Returns the decoded value together with the number of bytes consumed.
///
/// Returns `None` when `pos` is at or beyond the end of `data`, when the input ends before a
/// terminating byte is found, or when the varint is still unterminated after 10 bytes.
fn read_varint(data: &[u8], pos: usize) -> Option<(u64, usize)> {
    let tail = data.get(pos..)?;
    let mut value: u64 = 0;
    // Ten groups of seven bits are the most a 64-bit value can span (shifts 0 through 63).
    for (index, &raw) in tail.iter().take(10).enumerate() {
        let shift = 7 * index as u32;
        value |= u64::from(raw & 0x7F) << shift;
        if raw & 0x80 == 0 {
            return Some((value, index + 1));
        }
    }
    None
}
pub fn extract_text_from_iwa_files(content: &[u8], iwa_paths: &[&str]) -> Result<String> {
let cursor = Cursor::new(content);
let mut archive =
zip::ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to open iWork ZIP: {e}")))?;
let mut all_text: Vec<String> = Vec::new();
for path in iwa_paths {
match archive.by_name(path) {
Ok(mut file) => {
let compressed_size = file.size() as usize;
let mut compressed = Vec::with_capacity(compressed_size.min(MAX_IWA_DECOMPRESSED_SIZE));
if file.read_to_end(&mut compressed).is_err() {
continue;
}
let mut decoder = snap::raw::Decoder::new();
let Ok(decompressed) = decoder.decompress_vec(&compressed) else {
continue;
};
if decompressed.len() > MAX_IWA_DECOMPRESSED_SIZE {
continue;
}
let texts = extract_text_from_proto(&decompressed);
all_text.extend(texts);
}
Err(_) => {
continue;
}
}
}
Ok(all_text.join("\n"))
}
/// Removes duplicate strings while keeping the first occurrence of each in its original order.
///
/// Comparison is exact string equality; no trimming or case folding is applied.
pub fn dedup_text(texts: Vec<String>) -> Vec<String> {
    use std::collections::HashSet;

    let mut already_seen: HashSet<String> = HashSet::new();
    texts
        .into_iter()
        // `insert` returns `true` only the first time a value is added.
        .filter(|candidate| already_seen.insert(candidate.clone()))
        .collect()
}