Skip to main content

tandem_document/
extractor.rs

1use calamine::{open_workbook_auto, Data, Reader};
2use quick_xml::events::Event;
3use quick_xml::Reader as XmlReader;
4use std::collections::BTreeMap;
5use std::fs;
6use std::io::{Cursor, Read};
7use std::path::{Path, PathBuf};
8use thiserror::Error;
9use zip::ZipArchive;
10
11#[derive(Error, Debug)]
12pub enum DocumentError {
13    #[error("IO error: {0}")]
14    Io(#[from] std::io::Error),
15
16    #[error("File not found: {0}")]
17    NotFound(String),
18
19    #[error("Invalid document: {0}")]
20    InvalidDocument(String),
21
22    #[error("Extraction failed: {0}")]
23    ExtractionFailed(String),
24}
25
26pub type Result<T> = std::result::Result<T, DocumentError>;
27
/// Returns the file extension of `path` in ASCII lowercase.
///
/// Yields `None` when the path has no extension or the extension is not
/// valid UTF-8.
fn lower_ext(path: &Path) -> Option<String> {
    let ext = path.extension()?.to_str()?;
    Some(ext.to_ascii_lowercase())
}
33
/// Limits `text` to at most `max_chars` characters (Unicode scalar values).
///
/// Text that already fits is returned unchanged. Longer text is cut after
/// `max_chars` characters and a truncation marker is appended. A limit of
/// zero always yields an empty string.
fn truncate_output(mut text: String, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }

    // Byte offset of the first character past the limit, if there is one.
    let cut = text
        .char_indices()
        .nth(max_chars)
        .map(|(offset, _)| offset);

    match cut {
        None => text,
        Some(offset) => {
            text.truncate(offset);
            text.push_str("\n\n...[truncated]...\n");
            text
        }
    }
}
49
50fn read_zip_entry(path: &Path, inner_path: &str, max_bytes: usize) -> Result<Vec<u8>> {
51    let bytes = fs::read(path)?;
52    let cursor = Cursor::new(bytes);
53    let mut archive = ZipArchive::new(cursor).map_err(|err| {
54        DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
55    })?;
56
57    let mut entry = archive.by_name(inner_path).map_err(|err| {
58        DocumentError::InvalidDocument(format!(
59            "Zip entry '{}' not found in {:?}: {}",
60            inner_path, path, err
61        ))
62    })?;
63
64    let mut out = Vec::new();
65    let mut buffer = [0u8; 16 * 1024];
66    while out.len() < max_bytes {
67        let remaining = max_bytes - out.len();
68        let read_len = remaining.min(buffer.len());
69        let read = entry.read(&mut buffer[..read_len]).map_err(|err| {
70            DocumentError::ExtractionFailed(format!("Failed reading zip entry: {}", err))
71        })?;
72        if read == 0 {
73            break;
74        }
75        out.extend_from_slice(&buffer[..read]);
76    }
77
78    Ok(out)
79}
80
/// Ends the current line of `out` with a newline, unless `out` is empty or
/// already ends with one.
fn append_paragraph_break(out: &mut String) {
    // A trailing b'\n' byte is always a real newline in UTF-8.
    let needs_break = matches!(out.as_bytes().last(), Some(&last) if last != b'\n');
    if needs_break {
        out.push('\n');
    }
}
86
/// Which flavour of Office Open XML part is being parsed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum OoxmlKind {
    /// Word-processing markup (used for `word/document.xml`).
    Word,
    /// Presentation markup (used for slide XML parts).
    Presentation,
}
92
93fn extract_ooxml_text(xml: &[u8], kind: OoxmlKind) -> Result<String> {
94    let mut reader = XmlReader::from_reader(xml);
95    reader.config_mut().trim_text(false);
96
97    let mut out = String::new();
98    let mut in_text = false;
99    let mut buf = Vec::new();
100
101    loop {
102        match reader.read_event_into(&mut buf) {
103            Ok(Event::Start(event)) => {
104                let name = event.name();
105                let name = name.as_ref();
106                if name.ends_with(b"t") {
107                    in_text = true;
108                } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"tab") {
109                    out.push('\t');
110                } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"br") {
111                    out.push('\n');
112                } else if name.ends_with(b"p") {
113                    append_paragraph_break(&mut out);
114                }
115            }
116            Ok(Event::End(_)) => {
117                in_text = false;
118            }
119            Ok(Event::Text(text)) => {
120                if in_text {
121                    let decoded = text.decode().map_err(|err| {
122                        DocumentError::ExtractionFailed(format!(
123                            "XML decode/unescape error: {}",
124                            err
125                        ))
126                    })?;
127                    out.push_str(&decoded);
128                }
129            }
130            Ok(Event::Eof) => break,
131            Err(err) => {
132                let label = match kind {
133                    OoxmlKind::Word => "OOXML XML",
134                    OoxmlKind::Presentation => "PPTX XML",
135                };
136                return Err(DocumentError::ExtractionFailed(format!(
137                    "Failed parsing {}: {}",
138                    label, err
139                )));
140            }
141            _ => {}
142        }
143
144        buf.clear();
145    }
146
147    Ok(out)
148}
149
150fn extract_text_docx(path: &Path, max_xml_bytes: usize) -> Result<String> {
151    let xml = read_zip_entry(path, "word/document.xml", max_xml_bytes)?;
152    extract_ooxml_text(&xml, OoxmlKind::Word)
153}
154
155fn extract_text_pptx(path: &Path, max_xml_bytes: usize) -> Result<String> {
156    let bytes = fs::read(path)?;
157    let cursor = Cursor::new(bytes);
158    let mut archive = ZipArchive::new(cursor).map_err(|err| {
159        DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
160    })?;
161
162    let mut slides = BTreeMap::new();
163    for idx in 0..archive.len() {
164        let Ok(file) = archive.by_index(idx) else {
165            continue;
166        };
167        let name = file.name().to_string();
168        if !name.starts_with("ppt/slides/slide") || !name.ends_with(".xml") {
169            continue;
170        }
171
172        let mut buf = Vec::new();
173        file.take(max_xml_bytes as u64)
174            .read_to_end(&mut buf)
175            .map_err(|err| {
176                DocumentError::ExtractionFailed(format!("Failed reading slide XML: {}", err))
177            })?;
178        let text = extract_ooxml_text(&buf, OoxmlKind::Presentation)?;
179        slides.insert(name, text);
180    }
181
182    if slides.is_empty() {
183        return Err(DocumentError::InvalidDocument(format!(
184            "No slide XML found in {:?}",
185            path
186        )));
187    }
188
189    let mut out = String::new();
190    for (name, text) in slides {
191        out.push_str("# ");
192        out.push_str(&name);
193        out.push('\n');
194        out.push_str(text.trim());
195        out.push_str("\n\n");
196    }
197    Ok(out)
198}
199
200fn extract_text_spreadsheet(
201    path: &Path,
202    max_sheets: usize,
203    max_rows: usize,
204    max_cols: usize,
205) -> Result<String> {
206    let mut workbook = open_workbook_auto(path).map_err(|err| {
207        DocumentError::InvalidDocument(format!("Failed to open spreadsheet {:?}: {}", path, err))
208    })?;
209
210    let mut out = String::new();
211    for (sheet_index, sheet_name) in workbook.sheet_names().iter().cloned().enumerate() {
212        if sheet_index >= max_sheets {
213            out.push_str("\n...[more sheets truncated]...\n");
214            break;
215        }
216
217        let range = match workbook.worksheet_range(&sheet_name) {
218            Ok(range) => range,
219            Err(_) => continue,
220        };
221
222        out.push_str("# Sheet: ");
223        out.push_str(&sheet_name);
224        out.push('\n');
225
226        for (row_index, row) in range.rows().take(max_rows).enumerate() {
227            if row_index > 0 {
228                out.push('\n');
229            }
230
231            for (col_index, cell) in row.iter().take(max_cols).enumerate() {
232                if col_index > 0 {
233                    out.push('\t');
234                }
235                if !matches!(cell, Data::Empty) {
236                    out.push_str(&cell.to_string());
237                }
238            }
239        }
240        out.push_str("\n\n");
241    }
242
243    Ok(out)
244}
245
246fn extract_text_pdf(path: &Path) -> Result<String> {
247    pdf_extract::extract_text(path).map_err(|err| {
248        DocumentError::ExtractionFailed(format!("Failed to extract PDF text {:?}: {}", path, err))
249    })
250}
251
/// Converts the numeric argument of an RTF `\uN` escape into a character.
///
/// `\uN` carries one UTF-16 code unit, written as a signed 16-bit number, so
/// values above 0x7FFF appear negative. A high surrogate is remembered in
/// `pending_high` and combined with the low surrogate that follows. Returns
/// `None` when no complete character is available yet or the value is invalid.
fn decode_rtf_unicode(param: i32, pending_high: &mut Option<u32>) -> Option<char> {
    let code = if param < 0 { param + 0x1_0000 } else { param };
    if code < 0 {
        *pending_high = None;
        return None;
    }
    let code = code as u32;

    match code {
        0xD800..=0xDBFF => {
            *pending_high = Some(code);
            None
        }
        0xDC00..=0xDFFF => {
            let high = pending_high.take()?;
            char::from_u32(0x1_0000 + ((high - 0xD800) << 10) + (code - 0xDC00))
        }
        _ => {
            *pending_high = None;
            char::from_u32(code)
        }
    }
}

/// Strips RTF markup from `bytes` and returns the remaining text with all
/// whitespace runs collapsed to single spaces.
///
/// Handling of RTF constructs:
/// * Group braces are dropped; escaped `\\`, `\{`, `\}` yield the literal character.
/// * `\'hh` yields the byte `hh`, mapped one-to-one to a Unicode code point.
/// * `\uN` yields the Unicode character `N`; the fallback characters that
///   follow it (one by default, or the count set by `\ucN`) are skipped.
/// * `\par`, `\line`, `\sect`, `\page`, `\row`, `\tab`, `\cell` and a backslash
///   followed by a line ending act as whitespace, so neighbouring words stay apart.
/// * `\~` yields a space, `\_` a hyphen; `\-` and `\*` yield nothing.
/// * Every other control word, with its numeric argument and one trailing
///   space, is discarded.
///
/// Groups are not interpreted: text inside destinations such as font or
/// colour tables is emitted like ordinary text.
fn extract_text_rtf(bytes: &[u8]) -> String {
    let mut out = String::new();
    let mut index = 0usize;
    // Number of fallback characters following each `\uN` (changed by `\ucN`).
    let mut fallback_len = 1usize;
    // Fallback characters still to be skipped after the latest `\uN`.
    let mut fallback_to_skip = 0usize;
    // High surrogate from a previous `\uN`, waiting for its low half.
    let mut pending_high: Option<u32> = None;

    while index < bytes.len() {
        match bytes[index] {
            b'{' | b'}' => {
                // Fallback text never extends across a group boundary.
                fallback_to_skip = 0;
                index += 1;
            }
            b'\\' => {
                index += 1;
                if index >= bytes.len() {
                    break;
                }

                match bytes[index] {
                    b'\\' | b'{' | b'}' => {
                        if fallback_to_skip > 0 {
                            fallback_to_skip -= 1;
                        } else {
                            out.push(bytes[index] as char);
                        }
                        index += 1;
                    }
                    b'\'' => {
                        let value = bytes
                            .get(index + 1..index + 3)
                            .and_then(|hex| std::str::from_utf8(hex).ok())
                            .and_then(|hex| u8::from_str_radix(hex, 16).ok());
                        match value {
                            Some(value) => {
                                if fallback_to_skip > 0 {
                                    fallback_to_skip -= 1;
                                } else {
                                    out.push(value as char);
                                }
                                index += 3;
                            }
                            // Malformed escape: drop the apostrophe only.
                            None => index += 1,
                        }
                    }
                    b'\n' | b'\r' => {
                        // A backslash before a line ending is a paragraph break.
                        out.push('\n');
                        index += 1;
                    }
                    b'~' => {
                        out.push(' ');
                        index += 1;
                    }
                    b'_' => {
                        out.push('-');
                        index += 1;
                    }
                    b'-' | b'*' => {
                        index += 1;
                    }
                    _ => {
                        let name_start = index;
                        while index < bytes.len() && bytes[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let name = &bytes[name_start..index];

                        let param_start = index;
                        while index < bytes.len()
                            && (bytes[index].is_ascii_digit() || bytes[index] == b'-')
                        {
                            index += 1;
                        }
                        let param = std::str::from_utf8(&bytes[param_start..index])
                            .ok()
                            .and_then(|digits| digits.parse::<i32>().ok());

                        // A single space after a control word is its delimiter.
                        if index < bytes.len() && bytes[index] == b' ' {
                            index += 1;
                        }

                        match name {
                            b"par" | b"line" | b"sect" | b"page" | b"row" => out.push('\n'),
                            b"tab" | b"cell" => out.push('\t'),
                            b"uc" => {
                                fallback_len = param
                                    .filter(|count| *count >= 0)
                                    .map(|count| count as usize)
                                    .unwrap_or(1);
                            }
                            b"u" => {
                                if let Some(code) = param {
                                    if let Some(ch) = decode_rtf_unicode(code, &mut pending_high) {
                                        out.push(ch);
                                    }
                                    fallback_to_skip = fallback_len;
                                }
                            }
                            _ => {}
                        }
                    }
                }
            }
            b'\n' | b'\r' => {
                index += 1;
            }
            byte => {
                if fallback_to_skip > 0 {
                    fallback_to_skip -= 1;
                } else {
                    out.push(byte as char);
                }
                index += 1;
            }
        }
    }

    out.split_whitespace().collect::<Vec<_>>().join(" ")
}
315
/// Caps applied while extracting text from a document.
#[derive(Clone, Debug)]
pub struct ExtractLimits {
    /// Largest input file, in bytes, that extraction will accept.
    pub max_file_bytes: u64,
    /// Maximum number of characters kept in the returned text.
    pub max_output_chars: usize,
    /// Maximum number of bytes read from a single XML part of a zip-based document.
    pub max_xml_bytes: usize,
    /// Maximum number of spreadsheet sheets rendered.
    pub max_sheets: usize,
    /// Maximum number of rows rendered per sheet.
    pub max_rows: usize,
    /// Maximum number of columns rendered per row.
    pub max_cols: usize,
}

impl Default for ExtractLimits {
    /// Defaults: 25 MiB input, 200 000 output characters, 5 MiB per XML part,
    /// 6 sheets of 200 rows by 30 columns.
    fn default() -> Self {
        // One mebibyte is 1 << 20 bytes.
        let max_file_bytes: u64 = 25 << 20;
        let max_xml_bytes: usize = 5 << 20;

        ExtractLimits {
            max_file_bytes,
            max_output_chars: 200_000,
            max_xml_bytes,
            max_sheets: 6,
            max_rows: 200,
            max_cols: 30,
        }
    }
}
338
339pub fn extract_file_text(path: &PathBuf, limits: ExtractLimits) -> Result<String> {
340    if !path.exists() {
341        return Err(DocumentError::NotFound(format!(
342            "File does not exist: {}",
343            path.display()
344        )));
345    }
346    if !path.is_file() {
347        return Err(DocumentError::InvalidDocument(format!(
348            "Path is not a file: {}",
349            path.display()
350        )));
351    }
352
353    let metadata = fs::metadata(path)?;
354    if metadata.len() > limits.max_file_bytes {
355        return Err(DocumentError::InvalidDocument(format!(
356            "File too large for text extraction: {} bytes (limit: {} bytes)",
357            metadata.len(),
358            limits.max_file_bytes
359        )));
360    }
361
362    let ext = lower_ext(path.as_path()).unwrap_or_default();
363    let text = match ext.as_str() {
364        "pdf" => extract_text_pdf(path.as_path())?,
365        "docx" => extract_text_docx(path.as_path(), limits.max_xml_bytes)?,
366        "pptx" => extract_text_pptx(path.as_path(), limits.max_xml_bytes)?,
367        "xlsx" | "xls" | "ods" | "xlsb" => extract_text_spreadsheet(
368            path.as_path(),
369            limits.max_sheets,
370            limits.max_rows,
371            limits.max_cols,
372        )?,
373        "rtf" => {
374            let bytes = fs::read(path)?;
375            extract_text_rtf(&bytes)
376        }
377        _ => fs::read_to_string(path)?,
378    };
379
380    Ok(truncate_output(text, limits.max_output_chars))
381}