Skip to main content

tandem_document/
extractor.rs

1use calamine::{open_workbook_auto, Data, Reader};
2use quick_xml::events::Event;
3use quick_xml::Reader as XmlReader;
4use std::collections::BTreeMap;
5use std::fs;
6use std::io::{Cursor, Read};
7use std::path::{Path, PathBuf};
8use thiserror::Error;
9use zip::ZipArchive;
10
/// Errors returned by the document text-extraction routines in this file.
#[derive(Error, Debug)]
pub enum DocumentError {
    /// An underlying I/O operation failed; created automatically from
    /// `std::io::Error` via `?`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// The requested path does not exist.
    #[error("File not found: {0}")]
    NotFound(String),

    /// The input is not something extraction can be attempted on: not a
    /// regular file, over the size limit, an unreadable container, a missing
    /// archive entry, or an entry that trips the zip-bomb ratio check.
    #[error("Invalid document: {0}")]
    InvalidDocument(String),

    /// The document was opened but reading or parsing its content failed.
    #[error("Extraction failed: {0}")]
    ExtractionFailed(String),
}
25
/// Convenience alias for results whose error type is [`DocumentError`].
pub type Result<T> = std::result::Result<T, DocumentError>;
27
/// Returns the extension of `path` converted to ASCII lowercase.
///
/// Yields `None` when the path has no extension or the extension is not
/// valid UTF-8.
fn lower_ext(path: &Path) -> Option<String> {
    let raw = path.extension()?.to_str()?;
    Some(raw.to_ascii_lowercase())
}
33
/// Limits `text` to at most `max_chars` characters (Unicode scalar values).
///
/// Text that already fits is returned unchanged. Longer text is cut after
/// `max_chars` characters and a `...[truncated]...` marker is appended.
/// A limit of zero always yields an empty string, without a marker.
fn truncate_output(text: String, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }

    // The character at position `max_chars` exists only when the text is too
    // long; its byte offset is exactly where the kept prefix ends.
    let Some((cut, _)) = text.char_indices().nth(max_chars) else {
        return text;
    };

    let mut shortened = text;
    shortened.truncate(cut);
    shortened.push_str("\n\n...[truncated]...\n");
    shortened
}
49
50fn read_zip_entry(path: &Path, inner_path: &str, max_bytes: usize) -> Result<Vec<u8>> {
51    const MAX_ZIP_BOMB_RATIO: u64 = 100;
52
53    let bytes = fs::read(path)?;
54    let compressed_size = bytes.len() as u64;
55    let cursor = Cursor::new(bytes);
56    let mut archive = ZipArchive::new(cursor).map_err(|err| {
57        DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
58    })?;
59
60    let mut entry = archive.by_name(inner_path).map_err(|err| {
61        DocumentError::InvalidDocument(format!(
62            "Zip entry '{}' not found in {:?}: {}",
63            inner_path, path, err
64        ))
65    })?;
66
67    let uncompressed_size = entry.size();
68    if uncompressed_size > (compressed_size.saturating_mul(MAX_ZIP_BOMB_RATIO)) {
69        return Err(DocumentError::InvalidDocument(format!(
70            "Zip bomb detected: uncompressed size {} exceeds ratio limit ({}x compressed size {})",
71            uncompressed_size, MAX_ZIP_BOMB_RATIO, compressed_size
72        )));
73    }
74
75    let mut out = Vec::new();
76    let mut buffer = [0u8; 16 * 1024];
77    while out.len() < max_bytes {
78        let remaining = max_bytes - out.len();
79        let read_len = remaining.min(buffer.len());
80        let read = entry.read(&mut buffer[..read_len]).map_err(|err| {
81            DocumentError::ExtractionFailed(format!("Failed reading zip entry: {}", err))
82        })?;
83        if read == 0 {
84            break;
85        }
86        out.extend_from_slice(&buffer[..read]);
87    }
88
89    Ok(out)
90}
91
/// Ends the current line of `out` with a newline unless `out` is empty or
/// already ends with one.
fn append_paragraph_break(out: &mut String) {
    match out.chars().next_back() {
        None | Some('\n') => {}
        Some(_) => out.push('\n'),
    }
}
97
/// Which flavour of OOXML part is being converted to plain text.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum OoxmlKind {
    /// A Word document part (used for `word/document.xml`); tab and line-break
    /// elements are translated to whitespace for this kind only.
    Word,
    /// A presentation slide part (used for `ppt/slides/slide*.xml`).
    Presentation,
}
103
104fn extract_ooxml_text(xml: &[u8], kind: OoxmlKind) -> Result<String> {
105    let mut reader = XmlReader::from_reader(xml);
106    let config = reader.config_mut();
107    config.trim_text(false);
108    config.expand_empty_elements = false;
109
110    let mut out = String::new();
111    let mut in_text = false;
112    let mut buf = Vec::new();
113
114    loop {
115        match reader.read_event_into(&mut buf) {
116            Ok(Event::Start(event)) => {
117                let name = event.name();
118                let name = name.as_ref();
119                if name.ends_with(b"t") {
120                    in_text = true;
121                } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"tab") {
122                    out.push('\t');
123                } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"br") {
124                    out.push('\n');
125                } else if name.ends_with(b"p") {
126                    append_paragraph_break(&mut out);
127                }
128            }
129            Ok(Event::End(_)) => {
130                in_text = false;
131            }
132            Ok(Event::Text(text)) if in_text => {
133                let decoded = text.decode().map_err(|err| {
134                    DocumentError::ExtractionFailed(format!("XML decode/unescape error: {}", err))
135                })?;
136                out.push_str(&decoded);
137            }
138            Ok(Event::Eof) => break,
139            Err(err) => {
140                let label = match kind {
141                    OoxmlKind::Word => "OOXML XML",
142                    OoxmlKind::Presentation => "PPTX XML",
143                };
144                return Err(DocumentError::ExtractionFailed(format!(
145                    "Failed parsing {}: {}",
146                    label, err
147                )));
148            }
149            _ => {}
150        }
151
152        buf.clear();
153    }
154
155    Ok(out)
156}
157
158fn extract_text_docx(path: &Path, max_xml_bytes: usize) -> Result<String> {
159    let xml = read_zip_entry(path, "word/document.xml", max_xml_bytes)?;
160    extract_ooxml_text(&xml, OoxmlKind::Word)
161}
162
163fn extract_text_pptx(path: &Path, max_xml_bytes: usize) -> Result<String> {
164    const MAX_ZIP_BOMB_RATIO: u64 = 100;
165
166    let bytes = fs::read(path)?;
167    let compressed_size = bytes.len() as u64;
168    let cursor = Cursor::new(bytes);
169    let mut archive = ZipArchive::new(cursor).map_err(|err| {
170        DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
171    })?;
172
173    let mut slides = BTreeMap::new();
174    for idx in 0..archive.len() {
175        let Ok(file) = archive.by_index(idx) else {
176            continue;
177        };
178        let name = file.name().to_string();
179        if !name.starts_with("ppt/slides/slide") || !name.ends_with(".xml") {
180            continue;
181        }
182
183        let uncompressed_size = file.size();
184        if uncompressed_size > (compressed_size.saturating_mul(MAX_ZIP_BOMB_RATIO)) {
185            return Err(DocumentError::InvalidDocument(format!(
186                "Zip bomb detected in slide {}: uncompressed size {} exceeds ratio limit",
187                name, uncompressed_size
188            )));
189        }
190
191        let mut buf = Vec::new();
192        file.take(max_xml_bytes as u64)
193            .read_to_end(&mut buf)
194            .map_err(|err| {
195                DocumentError::ExtractionFailed(format!("Failed reading slide XML: {}", err))
196            })?;
197        let text = extract_ooxml_text(&buf, OoxmlKind::Presentation)?;
198        slides.insert(name, text);
199    }
200
201    if slides.is_empty() {
202        return Err(DocumentError::InvalidDocument(format!(
203            "No slide XML found in {:?}",
204            path
205        )));
206    }
207
208    let mut out = String::new();
209    for (name, text) in slides {
210        out.push_str("# ");
211        out.push_str(&name);
212        out.push('\n');
213        out.push_str(text.trim());
214        out.push_str("\n\n");
215    }
216    Ok(out)
217}
218
219fn extract_text_spreadsheet(
220    path: &Path,
221    max_sheets: usize,
222    max_rows: usize,
223    max_cols: usize,
224) -> Result<String> {
225    let mut workbook = open_workbook_auto(path).map_err(|err| {
226        DocumentError::InvalidDocument(format!("Failed to open spreadsheet {:?}: {}", path, err))
227    })?;
228
229    let mut out = String::new();
230    for (sheet_index, sheet_name) in workbook.sheet_names().iter().cloned().enumerate() {
231        if sheet_index >= max_sheets {
232            out.push_str("\n...[more sheets truncated]...\n");
233            break;
234        }
235
236        let range = match workbook.worksheet_range(&sheet_name) {
237            Ok(range) => range,
238            Err(_) => continue,
239        };
240
241        out.push_str("# Sheet: ");
242        out.push_str(&sheet_name);
243        out.push('\n');
244
245        for (row_index, row) in range.rows().take(max_rows).enumerate() {
246            if row_index > 0 {
247                out.push('\n');
248            }
249
250            for (col_index, cell) in row.iter().take(max_cols).enumerate() {
251                if col_index > 0 {
252                    out.push('\t');
253                }
254                if !matches!(cell, Data::Empty) {
255                    out.push_str(&cell.to_string());
256                }
257            }
258        }
259        out.push_str("\n\n");
260    }
261
262    Ok(out)
263}
264
265fn extract_text_pdf(path: &Path) -> Result<String> {
266    pdf_extract::extract_text(path).map_err(|err| {
267        DocumentError::ExtractionFailed(format!("Failed to extract PDF text {:?}: {}", path, err))
268    })
269}
270
/// Maps an RTF control word to the character it represents, if it stands for
/// text rather than formatting.
fn rtf_control_word_char(word: &[u8]) -> Option<char> {
    let ch = match word {
        b"par" | b"line" | b"sect" | b"page" | b"row" => '\n',
        b"tab" | b"cell" => '\t',
        b"lquote" => '\u{2018}',
        b"rquote" => '\u{2019}',
        b"ldblquote" => '\u{201C}',
        b"rdblquote" => '\u{201D}',
        b"emdash" => '\u{2014}',
        b"endash" => '\u{2013}',
        b"bullet" => '\u{2022}',
        _ => return None,
    };
    Some(ch)
}

/// Returns the index just past the fallback character that follows a `\uN`
/// escape, assuming the default of one fallback character.
fn skip_rtf_unicode_fallback(bytes: &[u8], index: usize) -> usize {
    match bytes.get(index) {
        // Fallback written as a hex escape: backslash, quote, two hex digits.
        Some(b'\\') if bytes.get(index + 1) == Some(&b'\'') => (index + 4).min(bytes.len()),
        // A control word or group boundary is not a fallback character.
        Some(b'\\') | Some(b'{') | Some(b'}') | None => index,
        Some(_) => index + 1,
    }
}

/// Best-effort conversion of RTF bytes to plain text.
///
/// Group braces and formatting control words are discarded; literal text is
/// kept. The following constructs are translated instead of dropped:
///
/// * `\par`, `\line`, `\sect`, `\page`, `\row`, `\tab`, `\cell` and a
///   backslash followed by a line ending become whitespace, so words on
///   either side of a paragraph or tab are not glued together.
/// * `\lquote`, `\rquote`, `\ldblquote`, `\rdblquote`, `\emdash`, `\endash`
///   and `\bullet` become the corresponding punctuation.
/// * `\uN` becomes the Unicode character `N` (negative values and UTF-16
///   surrogate pairs are supported) and the fallback character after it is
///   skipped.
/// * `\'hh` becomes the character with code point `0xhh`.
/// * `\\`, `\{`, `\}` become the literal character; `\~` becomes a space and
///   `\_` a hyphen; other control symbols (such as `\*`) produce nothing.
///
/// All runs of whitespace in the result are collapsed to single spaces.
///
/// Groups are not tracked, so text stored in non-body destinations (for
/// example font table entries) still appears in the output.
fn extract_text_rtf(bytes: &[u8]) -> String {
    let mut out = String::new();
    let mut index = 0usize;
    // High half of a UTF-16 surrogate pair waiting for its low half.
    let mut pending_high: Option<u32> = None;

    while index < bytes.len() {
        match bytes[index] {
            b'{' | b'}' | b'\n' | b'\r' => {
                index += 1;
            }
            b'\\' => {
                index += 1;
                let Some(&next) = bytes.get(index) else {
                    break;
                };

                match next {
                    b'\\' | b'{' | b'}' => {
                        out.push(next as char);
                        index += 1;
                    }
                    b'\'' => {
                        let decoded = bytes
                            .get(index + 1..index + 3)
                            .and_then(|hex| std::str::from_utf8(hex).ok())
                            .and_then(|hex| u8::from_str_radix(hex, 16).ok());
                        match decoded {
                            Some(value) => {
                                out.push(value as char);
                                index += 3;
                            }
                            None => index += 1,
                        }
                    }
                    b'\n' | b'\r' => {
                        // A backslash before a line ending is a paragraph break.
                        out.push('\n');
                        index += 1;
                    }
                    letter if letter.is_ascii_alphabetic() => {
                        let word_start = index;
                        while index < bytes.len() && bytes[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let word = &bytes[word_start..index];

                        let param_start = index;
                        while index < bytes.len()
                            && (bytes[index].is_ascii_digit() || bytes[index] == b'-')
                        {
                            index += 1;
                        }
                        let param = &bytes[param_start..index];

                        // A single space after a control word is its delimiter,
                        // not part of the text.
                        if bytes.get(index) == Some(&b' ') {
                            index += 1;
                        }

                        if word == b"u" {
                            let unit = std::str::from_utf8(param)
                                .ok()
                                .and_then(|digits| digits.parse::<i32>().ok())
                                .filter(|value| (-32_768..=65_535).contains(value))
                                // Values above 32767 are written as negative numbers.
                                .map(|value| value.rem_euclid(0x1_0000) as u32);
                            if let Some(unit) = unit {
                                index = skip_rtf_unicode_fallback(bytes, index);
                                match (pending_high.take(), unit) {
                                    (Some(high), 0xDC00..=0xDFFF) => {
                                        let scalar =
                                            0x1_0000 + ((high - 0xD800) << 10) + (unit - 0xDC00);
                                        out.extend(char::from_u32(scalar));
                                    }
                                    (_, 0xD800..=0xDBFF) => pending_high = Some(unit),
                                    // Unpaired low surrogates are not valid
                                    // characters and are dropped.
                                    _ => out.extend(char::from_u32(unit)),
                                }
                            }
                        } else if let Some(ch) = rtf_control_word_char(word) {
                            out.push(ch);
                        }
                    }
                    symbol => {
                        // Control symbol: exactly one non-letter character.
                        index += 1;
                        match symbol {
                            b'~' => out.push(' '),
                            b'_' => out.push('-'),
                            _ => {}
                        }
                    }
                }
            }
            byte => {
                out.push(byte as char);
                index += 1;
            }
        }
    }

    out.split_whitespace().collect::<Vec<_>>().join(" ")
}
334
/// Size and volume limits applied while extracting text from a file.
#[derive(Debug, Clone)]
pub struct ExtractLimits {
    /// Largest input file, in bytes, that extraction will accept.
    pub max_file_bytes: u64,
    /// Maximum number of characters kept in the returned text before it is truncated.
    pub max_output_chars: usize,
    /// Maximum number of bytes read from a single XML part of a `.docx` / `.pptx` archive.
    pub max_xml_bytes: usize,
    /// Maximum number of spreadsheet sheets rendered.
    pub max_sheets: usize,
    /// Maximum number of rows rendered per sheet.
    pub max_rows: usize,
    /// Maximum number of columns rendered per row.
    pub max_cols: usize,
}

impl Default for ExtractLimits {
    /// Defaults: 25 MiB input files, 200,000 output characters, 5 MiB per
    /// XML part, and 6 sheets of 200 rows by 30 columns.
    fn default() -> Self {
        const FILE_LIMIT_MIB: u64 = 25;
        const XML_LIMIT_MIB: usize = 5;

        let max_file_bytes = FILE_LIMIT_MIB << 20;
        let max_xml_bytes = XML_LIMIT_MIB << 20;

        Self {
            max_file_bytes,
            max_output_chars: 200_000,
            max_xml_bytes,
            max_sheets: 6,
            max_rows: 200,
            max_cols: 30,
        }
    }
}
357
358pub fn extract_file_text(path: &PathBuf, limits: ExtractLimits) -> Result<String> {
359    if !path.exists() {
360        return Err(DocumentError::NotFound(format!(
361            "File does not exist: {}",
362            path.display()
363        )));
364    }
365    if !path.is_file() {
366        return Err(DocumentError::InvalidDocument(format!(
367            "Path is not a file: {}",
368            path.display()
369        )));
370    }
371
372    let metadata = fs::metadata(path)?;
373    if metadata.len() > limits.max_file_bytes {
374        return Err(DocumentError::InvalidDocument(format!(
375            "File too large for text extraction: {} bytes (limit: {} bytes)",
376            metadata.len(),
377            limits.max_file_bytes
378        )));
379    }
380
381    let ext = lower_ext(path.as_path()).unwrap_or_default();
382    let text = match ext.as_str() {
383        "pdf" => extract_text_pdf(path.as_path())?,
384        "docx" => extract_text_docx(path.as_path(), limits.max_xml_bytes)?,
385        "pptx" => extract_text_pptx(path.as_path(), limits.max_xml_bytes)?,
386        "xlsx" | "xls" | "ods" | "xlsb" => extract_text_spreadsheet(
387            path.as_path(),
388            limits.max_sheets,
389            limits.max_rows,
390            limits.max_cols,
391        )?,
392        "rtf" => {
393            let bytes = fs::read(path)?;
394            extract_text_rtf(&bytes)
395        }
396        _ => fs::read_to_string(path)?,
397    };
398
399    Ok(truncate_output(text, limits.max_output_chars))
400}