Skip to main content

tandem_document/
extractor.rs

1use calamine::{open_workbook_auto, Data, Reader};
2use quick_xml::events::Event;
3use quick_xml::Reader as XmlReader;
4use std::collections::BTreeMap;
5use std::fs;
6use std::io::{Cursor, Read};
7use std::path::{Path, PathBuf};
8use thiserror::Error;
9use zip::ZipArchive;
10
11#[derive(Error, Debug)]
12pub enum DocumentError {
13    #[error("IO error: {0}")]
14    Io(#[from] std::io::Error),
15
16    #[error("File not found: {0}")]
17    NotFound(String),
18
19    #[error("Invalid document: {0}")]
20    InvalidDocument(String),
21
22    #[error("Extraction failed: {0}")]
23    ExtractionFailed(String),
24}
25
26pub type Result<T> = std::result::Result<T, DocumentError>;
27
/// Returns the extension of `path` in ASCII lowercase.
///
/// Yields `None` when the path has no extension or when the extension is not
/// valid UTF-8.
fn lower_ext(path: &Path) -> Option<String> {
    let raw = path.extension()?;
    let as_text = raw.to_str()?;
    Some(as_text.to_ascii_lowercase())
}
33
/// Caps `text` at `max_chars` Unicode scalar values.
///
/// A limit of zero yields an empty string. Text that fits is returned
/// unchanged; longer text is cut at the limit and followed by a
/// `...[truncated]...` marker.
fn truncate_output(text: String, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }

    // The char at position `max_chars` (0-based) exists only when the text is
    // longer than the limit; its byte offset is exactly where to cut.
    match text.char_indices().nth(max_chars) {
        None => text,
        Some((cut_at, _)) => {
            let mut clipped = text;
            clipped.truncate(cut_at);
            clipped.push_str("\n\n...[truncated]...\n");
            clipped
        }
    }
}
49
50fn read_zip_entry(path: &Path, inner_path: &str, max_bytes: usize) -> Result<Vec<u8>> {
51    let bytes = fs::read(path)?;
52    let cursor = Cursor::new(bytes);
53    let mut archive = ZipArchive::new(cursor).map_err(|err| {
54        DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
55    })?;
56
57    let mut entry = archive.by_name(inner_path).map_err(|err| {
58        DocumentError::InvalidDocument(format!(
59            "Zip entry '{}' not found in {:?}: {}",
60            inner_path, path, err
61        ))
62    })?;
63
64    let mut out = Vec::new();
65    let mut buffer = [0u8; 16 * 1024];
66    while out.len() < max_bytes {
67        let remaining = max_bytes - out.len();
68        let read_len = remaining.min(buffer.len());
69        let read = entry.read(&mut buffer[..read_len]).map_err(|err| {
70            DocumentError::ExtractionFailed(format!("Failed reading zip entry: {}", err))
71        })?;
72        if read == 0 {
73            break;
74        }
75        out.extend_from_slice(&buffer[..read]);
76    }
77
78    Ok(out)
79}
80
/// Ends the current line of `out` with a newline, unless `out` is empty or
/// already ends with one.
fn append_paragraph_break(out: &mut String) {
    match out.chars().next_back() {
        None | Some('\n') => {}
        Some(_) => out.push('\n'),
    }
}
86
/// Which OOXML flavour is being parsed; selects markup handling and error labels.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum OoxmlKind {
    /// WordprocessingML, i.e. the main document part of a `.docx`.
    Word,
    /// PresentationML, i.e. a slide part of a `.pptx`.
    Presentation,
}
92
93fn extract_ooxml_text(xml: &[u8], kind: OoxmlKind) -> Result<String> {
94    let mut reader = XmlReader::from_reader(xml);
95    reader.config_mut().trim_text(false);
96
97    let mut out = String::new();
98    let mut in_text = false;
99    let mut buf = Vec::new();
100
101    loop {
102        match reader.read_event_into(&mut buf) {
103            Ok(Event::Start(event)) => {
104                let name = event.name();
105                let name = name.as_ref();
106                if name.ends_with(b"t") {
107                    in_text = true;
108                } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"tab") {
109                    out.push('\t');
110                } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"br") {
111                    out.push('\n');
112                } else if name.ends_with(b"p") {
113                    append_paragraph_break(&mut out);
114                }
115            }
116            Ok(Event::End(_)) => {
117                in_text = false;
118            }
119            Ok(Event::Text(text)) if in_text => {
120                let decoded = text.decode().map_err(|err| {
121                    DocumentError::ExtractionFailed(format!("XML decode/unescape error: {}", err))
122                })?;
123                out.push_str(&decoded);
124            }
125            Ok(Event::Eof) => break,
126            Err(err) => {
127                let label = match kind {
128                    OoxmlKind::Word => "OOXML XML",
129                    OoxmlKind::Presentation => "PPTX XML",
130                };
131                return Err(DocumentError::ExtractionFailed(format!(
132                    "Failed parsing {}: {}",
133                    label, err
134                )));
135            }
136            _ => {}
137        }
138
139        buf.clear();
140    }
141
142    Ok(out)
143}
144
145fn extract_text_docx(path: &Path, max_xml_bytes: usize) -> Result<String> {
146    let xml = read_zip_entry(path, "word/document.xml", max_xml_bytes)?;
147    extract_ooxml_text(&xml, OoxmlKind::Word)
148}
149
150fn extract_text_pptx(path: &Path, max_xml_bytes: usize) -> Result<String> {
151    let bytes = fs::read(path)?;
152    let cursor = Cursor::new(bytes);
153    let mut archive = ZipArchive::new(cursor).map_err(|err| {
154        DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
155    })?;
156
157    let mut slides = BTreeMap::new();
158    for idx in 0..archive.len() {
159        let Ok(file) = archive.by_index(idx) else {
160            continue;
161        };
162        let name = file.name().to_string();
163        if !name.starts_with("ppt/slides/slide") || !name.ends_with(".xml") {
164            continue;
165        }
166
167        let mut buf = Vec::new();
168        file.take(max_xml_bytes as u64)
169            .read_to_end(&mut buf)
170            .map_err(|err| {
171                DocumentError::ExtractionFailed(format!("Failed reading slide XML: {}", err))
172            })?;
173        let text = extract_ooxml_text(&buf, OoxmlKind::Presentation)?;
174        slides.insert(name, text);
175    }
176
177    if slides.is_empty() {
178        return Err(DocumentError::InvalidDocument(format!(
179            "No slide XML found in {:?}",
180            path
181        )));
182    }
183
184    let mut out = String::new();
185    for (name, text) in slides {
186        out.push_str("# ");
187        out.push_str(&name);
188        out.push('\n');
189        out.push_str(text.trim());
190        out.push_str("\n\n");
191    }
192    Ok(out)
193}
194
195fn extract_text_spreadsheet(
196    path: &Path,
197    max_sheets: usize,
198    max_rows: usize,
199    max_cols: usize,
200) -> Result<String> {
201    let mut workbook = open_workbook_auto(path).map_err(|err| {
202        DocumentError::InvalidDocument(format!("Failed to open spreadsheet {:?}: {}", path, err))
203    })?;
204
205    let mut out = String::new();
206    for (sheet_index, sheet_name) in workbook.sheet_names().iter().cloned().enumerate() {
207        if sheet_index >= max_sheets {
208            out.push_str("\n...[more sheets truncated]...\n");
209            break;
210        }
211
212        let range = match workbook.worksheet_range(&sheet_name) {
213            Ok(range) => range,
214            Err(_) => continue,
215        };
216
217        out.push_str("# Sheet: ");
218        out.push_str(&sheet_name);
219        out.push('\n');
220
221        for (row_index, row) in range.rows().take(max_rows).enumerate() {
222            if row_index > 0 {
223                out.push('\n');
224            }
225
226            for (col_index, cell) in row.iter().take(max_cols).enumerate() {
227                if col_index > 0 {
228                    out.push('\t');
229                }
230                if !matches!(cell, Data::Empty) {
231                    out.push_str(&cell.to_string());
232                }
233            }
234        }
235        out.push_str("\n\n");
236    }
237
238    Ok(out)
239}
240
241fn extract_text_pdf(path: &Path) -> Result<String> {
242    pdf_extract::extract_text(path).map_err(|err| {
243        DocumentError::ExtractionFailed(format!("Failed to extract PDF text {:?}: {}", path, err))
244    })
245}
246
/// Extracts a whitespace-normalized plain-text approximation from raw RTF bytes.
///
/// This is a lightweight scanner, not a full RTF parser:
///
/// * Group braces are dropped; `\\`, `\{` and `\}` yield the literal character.
/// * `\'hh` yields the byte `hh` interpreted as a Latin-1 code point.
/// * `\uN` yields the Unicode character `N` (signed 16-bit values and
///   surrogate pairs are handled) and skips the fallback characters that
///   follow it; `\ucN` sets how many fallback characters to skip (default 1).
///   Group scoping of `\uc` is not tracked.
/// * `\par`, `\line`, `\tab`, `\cell`, `\row`, `\sect`, `\page`, `\column`
///   and a backslash followed by a line break yield a space, so words on
///   either side do not run together.
/// * All other control words are discarded along with their numeric
///   parameter and one delimiting space. Text inside destination groups
///   (font tables and the like) is not filtered out.
///
/// Raw CR/LF bytes are ignored, and every run of whitespace in the result is
/// collapsed to a single space.
fn extract_text_rtf(bytes: &[u8]) -> String {
    let mut out = String::new();
    let mut index = 0usize;
    // Number of fallback characters that follow each `\uN` escape.
    let mut unicode_fallback_len = 1usize;
    // High surrogate from a preceding `\uN`, waiting for its low half.
    let mut pending_high_surrogate: Option<u32> = None;

    while index < bytes.len() {
        match bytes[index] {
            b'{' | b'}' => index += 1,
            b'\\' => {
                index += 1;
                if index >= bytes.len() {
                    break;
                }

                match bytes[index] {
                    b'\\' | b'{' | b'}' => {
                        out.push(bytes[index] as char);
                        index += 1;
                    }
                    b'\'' => {
                        let decoded = bytes
                            .get(index + 1..index + 3)
                            .and_then(|hex| std::str::from_utf8(hex).ok())
                            .and_then(|hex| u8::from_str_radix(hex, 16).ok());
                        match decoded {
                            Some(value) => {
                                out.push(value as char);
                                index += 3;
                            }
                            // Malformed escape: drop the quote and carry on.
                            None => index += 1,
                        }
                    }
                    b'\n' | b'\r' => {
                        // A backslash followed by a line break is equivalent to `\par`.
                        out.push(' ');
                        index += 1;
                    }
                    _ => {
                        let word_start = index;
                        while index < bytes.len() && bytes[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let word = &bytes[word_start..index];

                        let param_start = index;
                        while index < bytes.len()
                            && (bytes[index].is_ascii_digit() || bytes[index] == b'-')
                        {
                            index += 1;
                        }
                        let param = std::str::from_utf8(&bytes[param_start..index])
                            .ok()
                            .and_then(|digits| digits.parse::<i32>().ok());

                        // A single space after a control word is a delimiter, not text.
                        if index < bytes.len() && bytes[index] == b' ' {
                            index += 1;
                        }

                        match word {
                            b"par" | b"line" | b"tab" | b"cell" | b"row" | b"sect" | b"page"
                            | b"column" => out.push(' '),
                            b"uc" => {
                                if let Some(count) = param {
                                    if count >= 0 {
                                        unicode_fallback_len = count as usize;
                                    }
                                }
                            }
                            b"u" => {
                                if let Some(value) = param {
                                    push_rtf_unicode(&mut out, value, &mut pending_high_surrogate);
                                    index = skip_rtf_unicode_fallback(
                                        bytes,
                                        index,
                                        unicode_fallback_len,
                                    );
                                }
                            }
                            _ => {}
                        }
                    }
                }
            }
            b'\n' | b'\r' => index += 1,
            byte => {
                out.push(byte as char);
                index += 1;
            }
        }
    }

    out.split_whitespace().collect::<Vec<_>>().join(" ")
}

/// Appends the character encoded by the `\uN` parameter `value`, pairing surrogates.
fn push_rtf_unicode(out: &mut String, value: i32, pending_high_surrogate: &mut Option<u32>) {
    // RTF writes code units above 32767 as negative signed 16-bit numbers.
    let unit = if value < 0 {
        i64::from(value) + 65_536
    } else {
        i64::from(value)
    };
    if unit < 0 || unit > 0x10_FFFF {
        *pending_high_surrogate = None;
        return;
    }

    let unit = unit as u32;
    match unit {
        0xD800..=0xDBFF => *pending_high_surrogate = Some(unit),
        0xDC00..=0xDFFF => {
            if let Some(high) = pending_high_surrogate.take() {
                let scalar = 0x1_0000 + ((high - 0xD800) << 10) + (unit - 0xDC00);
                if let Some(ch) = std::char::from_u32(scalar) {
                    out.push(ch);
                }
            }
        }
        _ => {
            *pending_high_surrogate = None;
            if let Some(ch) = std::char::from_u32(unit) {
                out.push(ch);
            }
        }
    }
}

/// Returns the index just past up to `count` fallback characters starting at `index`.
///
/// A fallback character is either one plain byte or one `\'hh` escape.
/// Skipping stops early at any other control sequence or group delimiter.
fn skip_rtf_unicode_fallback(bytes: &[u8], mut index: usize, count: usize) -> usize {
    for _ in 0..count {
        match bytes.get(index).copied() {
            Some(b'\\') if bytes.get(index + 1).copied() == Some(b'\'') => {
                index = (index + 4).min(bytes.len());
            }
            Some(b'\\') | Some(b'{') | Some(b'}') | None => break,
            Some(_) => index += 1,
        }
    }
    index
}
310
/// Size and volume caps applied during text extraction.
#[derive(Clone, Debug)]
pub struct ExtractLimits {
    /// Largest input file, in bytes, that extraction will accept.
    pub max_file_bytes: u64,
    /// Maximum number of characters kept in the final output before truncation.
    pub max_output_chars: usize,
    /// Maximum number of bytes read from a single XML part of a DOCX/PPTX container.
    pub max_xml_bytes: usize,
    /// Maximum number of spreadsheet sheets rendered.
    pub max_sheets: usize,
    /// Maximum number of rows rendered per sheet.
    pub max_rows: usize,
    /// Maximum number of cells rendered per row.
    pub max_cols: usize,
}

impl Default for ExtractLimits {
    /// Defaults: 25 MiB input files, 200 000 output characters, 5 MiB per XML
    /// part, and 6 sheets of 200 rows by 30 columns.
    fn default() -> Self {
        const MIB: usize = 1024 * 1024;

        ExtractLimits {
            max_file_bytes: 25 * (MIB as u64),
            max_output_chars: 200_000,
            max_xml_bytes: 5 * MIB,
            max_sheets: 6,
            max_rows: 200,
            max_cols: 30,
        }
    }
}
333
334pub fn extract_file_text(path: &PathBuf, limits: ExtractLimits) -> Result<String> {
335    if !path.exists() {
336        return Err(DocumentError::NotFound(format!(
337            "File does not exist: {}",
338            path.display()
339        )));
340    }
341    if !path.is_file() {
342        return Err(DocumentError::InvalidDocument(format!(
343            "Path is not a file: {}",
344            path.display()
345        )));
346    }
347
348    let metadata = fs::metadata(path)?;
349    if metadata.len() > limits.max_file_bytes {
350        return Err(DocumentError::InvalidDocument(format!(
351            "File too large for text extraction: {} bytes (limit: {} bytes)",
352            metadata.len(),
353            limits.max_file_bytes
354        )));
355    }
356
357    let ext = lower_ext(path.as_path()).unwrap_or_default();
358    let text = match ext.as_str() {
359        "pdf" => extract_text_pdf(path.as_path())?,
360        "docx" => extract_text_docx(path.as_path(), limits.max_xml_bytes)?,
361        "pptx" => extract_text_pptx(path.as_path(), limits.max_xml_bytes)?,
362        "xlsx" | "xls" | "ods" | "xlsb" => extract_text_spreadsheet(
363            path.as_path(),
364            limits.max_sheets,
365            limits.max_rows,
366            limits.max_cols,
367        )?,
368        "rtf" => {
369            let bytes = fs::read(path)?;
370            extract_text_rtf(&bytes)
371        }
372        _ => fs::read_to_string(path)?,
373    };
374
375    Ok(truncate_output(text, limits.max_output_chars))
376}