tandem_document/
lib.rs

1use calamine::{open_workbook_auto, Data, Reader};
2use quick_xml::events::Event;
3use quick_xml::Reader as XmlReader;
4use std::collections::BTreeMap;
5use std::fs;
6use std::io::{Cursor, Read};
7use std::path::{Path, PathBuf};
8use thiserror::Error;
9use zip::ZipArchive;
10
11#[derive(Error, Debug)]
12pub enum DocumentError {
13    #[error("IO error: {0}")]
14    Io(#[from] std::io::Error),
15
16    #[error("File not found: {0}")]
17    NotFound(String),
18
19    #[error("Invalid document: {0}")]
20    InvalidDocument(String),
21
22    #[error("Extraction failed: {0}")]
23    ExtractionFailed(String),
24}
25
26pub type Result<T> = std::result::Result<T, DocumentError>;
27
/// Returns the extension of `path` in lowercase.
///
/// Yields `None` when the path has no extension or the extension is not valid UTF-8.
fn lower_ext(path: &Path) -> Option<String> {
    let ext = path.extension()?.to_str()?;
    Some(ext.to_lowercase())
}
33
/// Limits `s` to at most `max_chars` characters (Unicode scalar values, not bytes).
///
/// * `max_chars == 0` yields an empty string with no marker.
/// * Input that already fits is returned unchanged.
/// * Longer input is cut after `max_chars` characters and a
///   `"\n\n...[truncated]...\n"` marker is appended.
fn truncate_output(s: String, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }

    // Byte offset of the first character that does not fit, if there is one.
    let cut_at = s.char_indices().nth(max_chars).map(|(offset, _)| offset);

    match cut_at {
        None => s,
        Some(offset) => {
            let mut clipped = s;
            clipped.truncate(offset);
            clipped.push_str("\n\n...[truncated]...\n");
            clipped
        }
    }
}
51
52fn read_zip_file(path: &Path, inner_path: &str, max_bytes: usize) -> Result<Vec<u8>> {
53    let bytes = fs::read(path)?;
54    let cursor = Cursor::new(bytes);
55    let mut zip = ZipArchive::new(cursor).map_err(|e| {
56        DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, e))
57    })?;
58
59    let mut file = zip.by_name(inner_path).map_err(|e| {
60        DocumentError::InvalidDocument(format!(
61            "Zip entry '{}' not found in {:?}: {}",
62            inner_path, path, e
63        ))
64    })?;
65
66    let mut out = Vec::new();
67    let mut buf = vec![0u8; 16 * 1024];
68    while out.len() < max_bytes {
69        let to_read = std::cmp::min(buf.len(), max_bytes - out.len());
70        let n = file.read(&mut buf[..to_read]).map_err(|e| {
71            DocumentError::ExtractionFailed(format!("Failed reading zip entry: {}", e))
72        })?;
73        if n == 0 {
74            break;
75        }
76        out.extend_from_slice(&buf[..n]);
77    }
78    Ok(out)
79}
80
81fn extract_text_from_wordprocessingml(xml: &[u8]) -> Result<String> {
82    let mut reader = XmlReader::from_reader(xml);
83    reader.config_mut().trim_text(false);
84
85    let mut out = String::new();
86    let mut in_text = false;
87
88    let mut buf = Vec::new();
89    loop {
90        match reader.read_event_into(&mut buf) {
91            Ok(Event::Start(e)) => {
92                let name = e.name();
93                let name = name.as_ref();
94                if name.ends_with(b"t") {
95                    in_text = true;
96                } else if name.ends_with(b"tab") {
97                    out.push('\t');
98                } else if name.ends_with(b"br") {
99                    out.push('\n');
100                } else if name.ends_with(b"p") {
101                    // Paragraph: ensure separation (intentionally nested to differ from br)
102                    #[allow(clippy::collapsible_if)]
103                    if !out.ends_with('\n') && !out.is_empty() {
104                        out.push('\n');
105                    }
106                }
107            }
108            Ok(Event::End(_e)) => {
109                in_text = false;
110            }
111            Ok(Event::Text(t)) => {
112                if in_text {
113                    let text = t.decode().map_err(|e| {
114                        DocumentError::ExtractionFailed(format!("XML decode/unescape error: {}", e))
115                    })?;
116                    out.push_str(&text);
117                }
118            }
119            Ok(Event::Eof) => break,
120            Err(e) => {
121                return Err(DocumentError::ExtractionFailed(format!(
122                    "Failed parsing OOXML XML: {}",
123                    e
124                )))
125            }
126            _ => {}
127        }
128        buf.clear();
129    }
130
131    Ok(out)
132}
133
134fn extract_text_from_presentationml(xml: &[u8]) -> Result<String> {
135    let mut reader = XmlReader::from_reader(xml);
136    reader.config_mut().trim_text(false);
137
138    let mut out = String::new();
139    let mut in_text = false;
140
141    let mut buf = Vec::new();
142    loop {
143        match reader.read_event_into(&mut buf) {
144            Ok(Event::Start(e)) => {
145                let name = e.name();
146                let name = name.as_ref();
147                if name.ends_with(b"t") {
148                    in_text = true;
149                } else if name.ends_with(b"p") {
150                    // Paragraph: ensure separation
151                    #[allow(clippy::collapsible_if)]
152                    if !out.ends_with('\n') && !out.is_empty() {
153                        out.push('\n');
154                    }
155                }
156            }
157            Ok(Event::End(_)) => {
158                in_text = false;
159            }
160            Ok(Event::Text(t)) => {
161                if in_text {
162                    let text = t.decode().map_err(|e| {
163                        DocumentError::ExtractionFailed(format!("XML decode/unescape error: {}", e))
164                    })?;
165                    out.push_str(&text);
166                }
167            }
168            Ok(Event::Eof) => break,
169            Err(e) => {
170                return Err(DocumentError::ExtractionFailed(format!(
171                    "Failed parsing PPTX XML: {}",
172                    e
173                )))
174            }
175            _ => {}
176        }
177        buf.clear();
178    }
179
180    Ok(out)
181}
182
183fn extract_text_docx(path: &Path, max_xml_bytes: usize) -> Result<String> {
184    let xml = read_zip_file(path, "word/document.xml", max_xml_bytes)?;
185    extract_text_from_wordprocessingml(&xml)
186}
187
188fn extract_text_pptx(path: &Path, max_xml_bytes: usize) -> Result<String> {
189    let bytes = fs::read(path)?;
190    let cursor = Cursor::new(bytes);
191    let mut zip = ZipArchive::new(cursor).map_err(|e| {
192        DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, e))
193    })?;
194
195    let mut slides: BTreeMap<String, String> = BTreeMap::new();
196    for i in 0..zip.len() {
197        let Ok(f) = zip.by_index(i) else {
198            continue;
199        };
200        let name = f.name().to_string();
201        if !name.starts_with("ppt/slides/slide") || !name.ends_with(".xml") {
202            continue;
203        }
204        let mut buf = Vec::new();
205        f.take(max_xml_bytes as u64)
206            .read_to_end(&mut buf)
207            .map_err(|e| {
208                DocumentError::ExtractionFailed(format!("Failed reading slide XML: {}", e))
209            })?;
210        let text = extract_text_from_presentationml(&buf)?;
211        slides.insert(name, text);
212    }
213
214    if slides.is_empty() {
215        return Err(DocumentError::InvalidDocument(format!(
216            "No slide XML found in {:?}",
217            path
218        )));
219    }
220
221    let mut out = String::new();
222    for (name, text) in slides {
223        out.push_str(&format!("# {}\n", name));
224        out.push_str(text.trim());
225        out.push_str("\n\n");
226    }
227    Ok(out)
228}
229
230fn extract_text_spreadsheet(
231    path: &Path,
232    max_sheets: usize,
233    max_rows: usize,
234    max_cols: usize,
235) -> Result<String> {
236    let mut workbook = open_workbook_auto(path).map_err(|e| {
237        DocumentError::InvalidDocument(format!("Failed to open spreadsheet {:?}: {}", path, e))
238    })?;
239
240    let sheet_names = workbook.sheet_names().to_vec();
241    let mut out = String::new();
242
243    for (idx, sheet) in sheet_names.into_iter().enumerate() {
244        if idx >= max_sheets {
245            out.push_str("\n...[more sheets truncated]...\n");
246            break;
247        }
248        let range = match workbook.worksheet_range(&sheet) {
249            Ok(r) => r,
250            Err(_) => continue,
251        };
252
253        out.push_str(&format!("# Sheet: {}\n", sheet));
254
255        for (r_i, row) in range.rows().take(max_rows).enumerate() {
256            if r_i > 0 {
257                out.push('\n');
258            }
259            for (c_i, cell) in row.iter().take(max_cols).enumerate() {
260                if c_i > 0 {
261                    out.push('\t');
262                }
263                match cell {
264                    Data::Empty => {}
265                    _ => out.push_str(&cell.to_string()),
266                }
267            }
268        }
269        out.push_str("\n\n");
270    }
271
272    Ok(out)
273}
274
275fn extract_text_pdf(path: &Path) -> Result<String> {
276    pdf_extract::extract_text(path).map_err(|e| {
277        DocumentError::ExtractionFailed(format!("Failed to extract PDF text {:?}: {}", path, e))
278    })
279}
280
/// Maps a single byte to the character it denotes in Windows-1252, the code page RTF
/// writers most commonly use for `\'hh` escapes. Outside `0x80..=0x9F` this is identical to
/// Latin-1; the five unassigned slots in that range map to the same-numbered code point.
fn rtf_decode_cp1252(byte: u8) -> char {
    const HIGH: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}',
        '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}',
        '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}',
        '\u{0152}', '\u{008D}', '\u{017D}', '\u{008F}',
        '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}',
        '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}',
        '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}',
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}',
    ];
    match byte {
        0x80..=0x9F => HIGH[usize::from(byte - 0x80)],
        _ => char::from(byte),
    }
}

/// Parses the two hex digits of a `\'hh` escape starting at `pos`, if both are present.
fn rtf_hex_byte(bytes: &[u8], pos: usize) -> Option<u8> {
    let hi = char::from(*bytes.get(pos)?).to_digit(16)?;
    let lo = char::from(*bytes.get(pos + 1)?).to_digit(16)?;
    Some((hi * 16 + lo) as u8)
}

/// Consumes the optional numeric parameter of a control word (`-`? digits) at `*pos`.
/// A `-` that is not followed by a digit is left alone because it is document text.
fn rtf_take_param(bytes: &[u8], pos: &mut usize) -> Option<i32> {
    let start = *pos;
    let mut end = start;
    if bytes.get(end) == Some(&b'-') && matches!(bytes.get(end + 1), Some(b) if b.is_ascii_digit())
    {
        end += 1;
    }
    while matches!(bytes.get(end), Some(b) if b.is_ascii_digit()) {
        end += 1;
    }
    if end == start {
        return None;
    }
    *pos = end;
    std::str::from_utf8(&bytes[start..end])
        .ok()?
        .parse::<i32>()
        .ok()
}

/// Skips the ANSI fallback that follows a `\uN` escape: up to `count` items, each either a
/// `\'hh` escape or one plain byte. Stops early at a group boundary or another control word.
fn rtf_skip_unicode_fallback(bytes: &[u8], mut pos: usize, count: usize) -> usize {
    for _ in 0..count {
        match bytes.get(pos).copied() {
            Some(b'\\') if bytes.get(pos + 1) == Some(&b'\'') => {
                pos = (pos + 4).min(bytes.len());
            }
            Some(b'\\') | Some(b'{') | Some(b'}') | None => break,
            Some(_) => pos += 1,
        }
    }
    pos
}

/// Appends one UTF-16 code unit from a `\uN` escape, pairing surrogates across two escapes.
fn rtf_push_utf16_unit(out: &mut String, pending_high: &mut Option<u16>, unit: u16) {
    match unit {
        0xD800..=0xDBFF => *pending_high = Some(unit),
        0xDC00..=0xDFFF => {
            if let Some(high) = pending_high.take() {
                if let Some(Ok(ch)) = char::decode_utf16([high, unit]).next() {
                    out.push(ch);
                }
            }
        }
        _ => {
            *pending_high = None;
            if let Some(ch) = char::from_u32(u32::from(unit)) {
                out.push(ch);
            }
        }
    }
}

/// Returns the character a control word stands for, if it represents text or whitespace.
fn rtf_special_char(word: &[u8]) -> Option<char> {
    Some(match word {
        b"par" | b"line" | b"sect" | b"page" | b"column" | b"row" => '\n',
        b"tab" | b"cell" => '\t',
        b"emspace" | b"enspace" | b"qmspace" => ' ',
        b"emdash" => '\u{2014}',
        b"endash" => '\u{2013}',
        b"lquote" => '\u{2018}',
        b"rquote" => '\u{2019}',
        b"ldblquote" => '\u{201C}',
        b"rdblquote" => '\u{201D}',
        b"bullet" => '\u{2022}',
        _ => return None,
    })
}

/// Reports whether a control word opens a destination whose content is metadata or binary
/// data rather than document text.
fn rtf_is_metadata_destination(word: &[u8]) -> bool {
    matches!(
        word,
        b"fonttbl"
            | b"colortbl"
            | b"stylesheet"
            | b"info"
            | b"pict"
            | b"listtable"
            | b"listoverridetable"
            | b"rsidtbl"
            | b"generator"
            | b"filetbl"
            | b"revtbl"
            | b"themedata"
            | b"colorschememapping"
            | b"latentstyles"
            | b"datastore"
            | b"xmlnstbl"
    )
}

/// Extracts the readable text of an RTF document as a single whitespace-normalised line.
///
/// This is a lightweight scanner, not a full RTF interpreter:
///
/// * Formatting control words are dropped. Words that stand for text or whitespace
///   (`\par`, `\line`, `\tab`, `\rquote`, `\emdash`, ...) are translated, so adjacent
///   paragraphs do not run together.
/// * Groups introduced by `\*` or by a metadata destination (font/colour/style tables,
///   `\info`, `\pict`, ...) are skipped entirely.
/// * `\'hh` escapes and raw bytes are decoded as Windows-1252; `\uN` escapes are decoded as
///   UTF-16 code units and their fallback characters (count set by `\ucN`, default 1) are
///   skipped.
/// * `\\`, `\{` and `\}` yield the literal character; `\~` a space; `\_` a hyphen.
///
/// All runs of whitespace in the result are collapsed to single spaces and the ends are
/// trimmed. Malformed input never panics; unparseable pieces are skipped.
fn extract_text_rtf(bytes: &[u8]) -> String {
    let len = bytes.len();
    let mut out = String::new();
    let mut i = 0usize;
    let mut depth = 0usize;
    // Depth of the group currently being skipped; output is suppressed until it closes.
    let mut skip_depth: Option<usize> = None;
    // Number of fallback items following each `\uN`. Tracked document-wide for simplicity.
    let mut uc_fallback = 1usize;
    let mut high_surrogate: Option<u16> = None;

    while i < len {
        let byte = bytes[i];
        i += 1;
        match byte {
            b'{' => depth += 1,
            b'}' => {
                if matches!(skip_depth, Some(d) if depth <= d) {
                    skip_depth = None;
                }
                depth = depth.saturating_sub(1);
            }
            b'\\' => {
                let next = match bytes.get(i) {
                    Some(&b) => b,
                    None => break,
                };

                if next.is_ascii_alphabetic() {
                    // Control word: letters, optional numeric parameter, optional one-space
                    // delimiter that belongs to the control word rather than to the text.
                    let word_start = i;
                    while i < len && bytes[i].is_ascii_alphabetic() {
                        i += 1;
                    }
                    let word = &bytes[word_start..i];
                    let param = rtf_take_param(bytes, &mut i);
                    if bytes.get(i) == Some(&b' ') {
                        i += 1;
                    }

                    match (word, param) {
                        (b"u", Some(code)) => {
                            if skip_depth.is_none() {
                                // Negative parameters encode code units above 0x7FFF.
                                let unit = (code & 0xFFFF) as u16;
                                rtf_push_utf16_unit(&mut out, &mut high_surrogate, unit);
                            }
                            i = rtf_skip_unicode_fallback(bytes, i, uc_fallback);
                        }
                        (b"uc", Some(count)) => uc_fallback = count.max(0) as usize,
                        _ if skip_depth.is_some() => {}
                        _ if depth > 0 && rtf_is_metadata_destination(word) => {
                            skip_depth = Some(depth);
                        }
                        _ => {
                            if let Some(ch) = rtf_special_char(word) {
                                out.push(ch);
                            }
                        }
                    }
                } else {
                    // Control symbol: exactly one non-letter byte after the backslash.
                    i += 1;
                    let visible = skip_depth.is_none();
                    match next {
                        b'\\' | b'{' | b'}' => {
                            if visible {
                                out.push(char::from(next));
                            }
                        }
                        b'\'' => {
                            // On malformed hex only the quote is consumed.
                            if let Some(value) = rtf_hex_byte(bytes, i) {
                                i += 2;
                                if visible {
                                    out.push(rtf_decode_cp1252(value));
                                }
                            }
                        }
                        // A backslash directly before a line ending is a paragraph break.
                        b'\n' | b'\r' => {
                            if visible {
                                out.push('\n');
                            }
                        }
                        b'~' => {
                            if visible {
                                out.push(' ');
                            }
                        }
                        b'_' => {
                            if visible {
                                out.push('-');
                            }
                        }
                        // `\*` marks a destination that readers may ignore; do so.
                        b'*' => {
                            if visible && depth > 0 {
                                skip_depth = Some(depth);
                            }
                        }
                        // Optional hyphen and other symbols contribute no text.
                        _ => {}
                    }
                }
            }
            // Bare line endings are formatting of the RTF source, not document content.
            b'\n' | b'\r' => {}
            _ if skip_depth.is_some() => {}
            other => out.push(rtf_decode_cp1252(other)),
        }
    }

    out.split_whitespace().collect::<Vec<_>>().join(" ")
}
349
/// Size limits applied while extracting text from a document.
#[derive(Debug, Clone)]
pub struct ExtractLimits {
    /// Largest input file, in bytes, that extraction will accept.
    pub max_file_bytes: u64,
    /// Maximum number of characters kept in the returned text.
    pub max_output_chars: usize,
    /// Maximum number of bytes read from a single XML part of a zip-based document.
    pub max_xml_bytes: usize,
    /// Maximum number of spreadsheet sheets rendered.
    pub max_sheets: usize,
    /// Maximum number of rows rendered per sheet.
    pub max_rows: usize,
    /// Maximum number of columns rendered per row.
    pub max_cols: usize,
}

impl Default for ExtractLimits {
    /// Defaults: 25 MiB input, 200 000 output characters, 5 MiB per XML part,
    /// 6 sheets of 200 rows by 30 columns.
    fn default() -> Self {
        const MIB: usize = 1 << 20;

        ExtractLimits {
            max_file_bytes: 25 * (MIB as u64),
            max_output_chars: 200_000,
            max_xml_bytes: 5 * MIB,
            max_sheets: 6,
            max_rows: 200,
            max_cols: 30,
        }
    }
}
372
373pub fn extract_file_text(path: &PathBuf, limits: ExtractLimits) -> Result<String> {
374    if !path.exists() {
375        return Err(DocumentError::NotFound(format!(
376            "File does not exist: {}",
377            path.display()
378        )));
379    }
380    if !path.is_file() {
381        return Err(DocumentError::InvalidDocument(format!(
382            "Path is not a file: {}",
383            path.display()
384        )));
385    }
386
387    let meta = fs::metadata(path)?;
388    if meta.len() > limits.max_file_bytes {
389        return Err(DocumentError::InvalidDocument(format!(
390            "File too large for text extraction: {} bytes (limit: {} bytes)",
391            meta.len(),
392            limits.max_file_bytes
393        )));
394    }
395
396    let ext = lower_ext(path.as_path()).unwrap_or_default();
397    let text = match ext.as_str() {
398        "pdf" => extract_text_pdf(path.as_path())?,
399        "docx" => extract_text_docx(path.as_path(), limits.max_xml_bytes)?,
400        "pptx" => extract_text_pptx(path.as_path(), limits.max_xml_bytes)?,
401        "xlsx" | "xls" | "ods" | "xlsb" => extract_text_spreadsheet(
402            path.as_path(),
403            limits.max_sheets,
404            limits.max_rows,
405            limits.max_cols,
406        )?,
407        "rtf" => {
408            let bytes = fs::read(path)?;
409            extract_text_rtf(&bytes)
410        }
411        _ => fs::read_to_string(path)?,
412    };
413
414    Ok(truncate_output(text, limits.max_output_chars))
415}
tandem_document/lib.rs

tandem_document/
lib.rs