1use calamine::{open_workbook_auto, Data, Reader};
2use quick_xml::events::Event;
3use quick_xml::Reader as XmlReader;
4use std::collections::BTreeMap;
5use std::fs;
6use std::io::{Cursor, Read};
7use std::path::{Path, PathBuf};
8use thiserror::Error;
9use zip::ZipArchive;
10
11#[derive(Error, Debug)]
12pub enum DocumentError {
13 #[error("IO error: {0}")]
14 Io(#[from] std::io::Error),
15
16 #[error("File not found: {0}")]
17 NotFound(String),
18
19 #[error("Invalid document: {0}")]
20 InvalidDocument(String),
21
22 #[error("Extraction failed: {0}")]
23 ExtractionFailed(String),
24}
25
26pub type Result<T> = std::result::Result<T, DocumentError>;
27
/// Returns the extension of `path` converted to lowercase.
///
/// Yields `None` when the path has no extension or the extension is not
/// valid UTF-8.
fn lower_ext(path: &Path) -> Option<String> {
    let raw = path.extension()?;
    let text = raw.to_str()?;
    Some(text.to_lowercase())
}
33
/// Limits `s` to at most `max_chars` characters (Unicode scalar values).
///
/// * `max_chars == 0` yields an empty string, with no marker.
/// * A string that already fits is returned unchanged.
/// * Otherwise the first `max_chars` characters are kept and a
///   `...[truncated]...` marker is appended.
fn truncate_output(s: String, max_chars: usize) -> String {
    const MARKER: &str = "\n\n...[truncated]...\n";

    if max_chars == 0 {
        return String::new();
    }

    // The byte offset of the character at index `max_chars` is where the cut
    // goes; if there is no such character the string already fits.
    match s.char_indices().nth(max_chars) {
        None => s,
        Some((cut_at, _)) => {
            let mut shortened = s;
            shortened.truncate(cut_at);
            shortened.push_str(MARKER);
            shortened
        }
    }
}
51
52fn read_zip_file(path: &Path, inner_path: &str, max_bytes: usize) -> Result<Vec<u8>> {
53 let bytes = fs::read(path)?;
54 let cursor = Cursor::new(bytes);
55 let mut zip = ZipArchive::new(cursor).map_err(|e| {
56 DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, e))
57 })?;
58
59 let mut file = zip.by_name(inner_path).map_err(|e| {
60 DocumentError::InvalidDocument(format!(
61 "Zip entry '{}' not found in {:?}: {}",
62 inner_path, path, e
63 ))
64 })?;
65
66 let mut out = Vec::new();
67 let mut buf = vec![0u8; 16 * 1024];
68 while out.len() < max_bytes {
69 let to_read = std::cmp::min(buf.len(), max_bytes - out.len());
70 let n = file.read(&mut buf[..to_read]).map_err(|e| {
71 DocumentError::ExtractionFailed(format!("Failed reading zip entry: {}", e))
72 })?;
73 if n == 0 {
74 break;
75 }
76 out.extend_from_slice(&buf[..n]);
77 }
78 Ok(out)
79}
80
81fn extract_text_from_wordprocessingml(xml: &[u8]) -> Result<String> {
82 let mut reader = XmlReader::from_reader(xml);
83 reader.config_mut().trim_text(false);
84
85 let mut out = String::new();
86 let mut in_text = false;
87
88 let mut buf = Vec::new();
89 loop {
90 match reader.read_event_into(&mut buf) {
91 Ok(Event::Start(e)) => {
92 let name = e.name();
93 let name = name.as_ref();
94 if name.ends_with(b"t") {
95 in_text = true;
96 } else if name.ends_with(b"tab") {
97 out.push('\t');
98 } else if name.ends_with(b"br") {
99 out.push('\n');
100 } else if name.ends_with(b"p") {
101 #[allow(clippy::collapsible_if)]
103 if !out.ends_with('\n') && !out.is_empty() {
104 out.push('\n');
105 }
106 }
107 }
108 Ok(Event::End(_e)) => {
109 in_text = false;
110 }
111 Ok(Event::Text(t)) => {
112 if in_text {
113 let text = t.decode().map_err(|e| {
114 DocumentError::ExtractionFailed(format!("XML decode/unescape error: {}", e))
115 })?;
116 out.push_str(&text);
117 }
118 }
119 Ok(Event::Eof) => break,
120 Err(e) => {
121 return Err(DocumentError::ExtractionFailed(format!(
122 "Failed parsing OOXML XML: {}",
123 e
124 )))
125 }
126 _ => {}
127 }
128 buf.clear();
129 }
130
131 Ok(out)
132}
133
134fn extract_text_from_presentationml(xml: &[u8]) -> Result<String> {
135 let mut reader = XmlReader::from_reader(xml);
136 reader.config_mut().trim_text(false);
137
138 let mut out = String::new();
139 let mut in_text = false;
140
141 let mut buf = Vec::new();
142 loop {
143 match reader.read_event_into(&mut buf) {
144 Ok(Event::Start(e)) => {
145 let name = e.name();
146 let name = name.as_ref();
147 if name.ends_with(b"t") {
148 in_text = true;
149 } else if name.ends_with(b"p") {
150 #[allow(clippy::collapsible_if)]
152 if !out.ends_with('\n') && !out.is_empty() {
153 out.push('\n');
154 }
155 }
156 }
157 Ok(Event::End(_)) => {
158 in_text = false;
159 }
160 Ok(Event::Text(t)) => {
161 if in_text {
162 let text = t.decode().map_err(|e| {
163 DocumentError::ExtractionFailed(format!("XML decode/unescape error: {}", e))
164 })?;
165 out.push_str(&text);
166 }
167 }
168 Ok(Event::Eof) => break,
169 Err(e) => {
170 return Err(DocumentError::ExtractionFailed(format!(
171 "Failed parsing PPTX XML: {}",
172 e
173 )))
174 }
175 _ => {}
176 }
177 buf.clear();
178 }
179
180 Ok(out)
181}
182
183fn extract_text_docx(path: &Path, max_xml_bytes: usize) -> Result<String> {
184 let xml = read_zip_file(path, "word/document.xml", max_xml_bytes)?;
185 extract_text_from_wordprocessingml(&xml)
186}
187
188fn extract_text_pptx(path: &Path, max_xml_bytes: usize) -> Result<String> {
189 let bytes = fs::read(path)?;
190 let cursor = Cursor::new(bytes);
191 let mut zip = ZipArchive::new(cursor).map_err(|e| {
192 DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, e))
193 })?;
194
195 let mut slides: BTreeMap<String, String> = BTreeMap::new();
196 for i in 0..zip.len() {
197 let Ok(f) = zip.by_index(i) else {
198 continue;
199 };
200 let name = f.name().to_string();
201 if !name.starts_with("ppt/slides/slide") || !name.ends_with(".xml") {
202 continue;
203 }
204 let mut buf = Vec::new();
205 f.take(max_xml_bytes as u64)
206 .read_to_end(&mut buf)
207 .map_err(|e| {
208 DocumentError::ExtractionFailed(format!("Failed reading slide XML: {}", e))
209 })?;
210 let text = extract_text_from_presentationml(&buf)?;
211 slides.insert(name, text);
212 }
213
214 if slides.is_empty() {
215 return Err(DocumentError::InvalidDocument(format!(
216 "No slide XML found in {:?}",
217 path
218 )));
219 }
220
221 let mut out = String::new();
222 for (name, text) in slides {
223 out.push_str(&format!("# {}\n", name));
224 out.push_str(text.trim());
225 out.push_str("\n\n");
226 }
227 Ok(out)
228}
229
230fn extract_text_spreadsheet(
231 path: &Path,
232 max_sheets: usize,
233 max_rows: usize,
234 max_cols: usize,
235) -> Result<String> {
236 let mut workbook = open_workbook_auto(path).map_err(|e| {
237 DocumentError::InvalidDocument(format!("Failed to open spreadsheet {:?}: {}", path, e))
238 })?;
239
240 let sheet_names = workbook.sheet_names().to_vec();
241 let mut out = String::new();
242
243 for (idx, sheet) in sheet_names.into_iter().enumerate() {
244 if idx >= max_sheets {
245 out.push_str("\n...[more sheets truncated]...\n");
246 break;
247 }
248 let range = match workbook.worksheet_range(&sheet) {
249 Ok(r) => r,
250 Err(_) => continue,
251 };
252
253 out.push_str(&format!("# Sheet: {}\n", sheet));
254
255 for (r_i, row) in range.rows().take(max_rows).enumerate() {
256 if r_i > 0 {
257 out.push('\n');
258 }
259 for (c_i, cell) in row.iter().take(max_cols).enumerate() {
260 if c_i > 0 {
261 out.push('\t');
262 }
263 match cell {
264 Data::Empty => {}
265 _ => out.push_str(&cell.to_string()),
266 }
267 }
268 }
269 out.push_str("\n\n");
270 }
271
272 Ok(out)
273}
274
275fn extract_text_pdf(path: &Path) -> Result<String> {
276 pdf_extract::extract_text(path).map_err(|e| {
277 DocumentError::ExtractionFailed(format!("Failed to extract PDF text {:?}: {}", path, e))
278 })
279}
280
/// Converts one byte of 8-bit RTF text to a `char`.
///
/// Bytes `0x80..=0x9F` are mapped through the Windows-1252 table; every other
/// byte maps to the code point with the same value.
///
/// NOTE(review): this assumes the document's ANSI code page is Windows-1252;
/// an `\ansicpg` declaration naming another code page is not honoured.
fn rtf_cp1252_char(byte: u8) -> char {
    const HIGH: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', '\u{201E}', '\u{2026}', '\u{2020}',
        '\u{2021}', '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', '\u{0152}', '\u{008D}',
        '\u{017D}', '\u{008F}', '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', '\u{201D}',
        '\u{2022}', '\u{2013}', '\u{2014}', '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}',
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}',
    ];
    match byte {
        0x80..=0x9F => HIGH[usize::from(byte - 0x80)],
        _ => char::from(byte),
    }
}

/// Extracts plain text from RTF markup on a best-effort basis.
///
/// The result is a single line: all runs of whitespace, including paragraph
/// and line breaks, are collapsed to one space.
///
/// Handling of the markup:
///
/// * `\\`, `\{` and `\}` produce the literal character.
/// * `\'hh` produces the character for byte `hh` (see [`rtf_cp1252_char`]).
/// * `\uN` produces the Unicode character `N` (negative values and surrogate
///   pairs are resolved) and skips the fallback characters that follow it.
/// * `\par`, `\line`, `\tab`, `\sect`, `\page`, `\column`, `\cell`, `\row`,
///   `\~` and a backslash followed by a line break produce a space, so
///   neighbouring words stay separated.
/// * `\emdash`, `\endash`, `\lquote`, `\rquote`, `\ldblquote`, `\rdblquote`
///   and `\bullet` produce the corresponding punctuation.
/// * Groups introduced by `\*`, `\fonttbl`, `\colortbl`, `\stylesheet`,
///   `\info` or `\pict` are discarded entirely.
/// * Every other control word or symbol is dropped.
///
/// Malformed or truncated input never panics; unparseable pieces are skipped.
fn extract_text_rtf(bytes: &[u8]) -> String {
    /// Destinations that hold document metadata rather than body text.
    const METADATA_DESTINATIONS: [&[u8]; 5] =
        [b"fonttbl", b"colortbl", b"stylesheet", b"info", b"pict"];

    /// Emits a character that came from the document text itself. Such a
    /// character may instead be the fallback representation of a preceding
    /// `\uN`, in which case it is consumed without being emitted.
    fn push_literal(out: &mut String, hidden: bool, fallback_left: &mut usize, ch: char) {
        if *fallback_left > 0 {
            *fallback_left -= 1;
        } else if !hidden {
            out.push(ch);
        }
    }

    let len = bytes.len();
    let mut out = String::new();
    let mut i = 0usize;
    let mut depth = 0usize;
    // Nesting depth of the group being discarded, if any.
    let mut skip_depth: Option<usize> = None;
    // Number of fallback characters written after each `\uN` (set by `\ucN`).
    // Group scoping of `\uc` is not modelled; the latest value applies.
    let mut uc_fallback = 1usize;
    // Fallback characters still to be skipped after the latest `\uN`.
    let mut fallback_left = 0usize;
    // First half of a UTF-16 surrogate pair awaiting its second half.
    let mut high_surrogate: Option<u32> = None;

    while i < len {
        match bytes[i] {
            b'{' => {
                depth += 1;
                fallback_left = 0;
                i += 1;
            }
            b'}' => {
                if skip_depth == Some(depth) {
                    skip_depth = None;
                }
                depth = depth.saturating_sub(1);
                fallback_left = 0;
                i += 1;
            }
            b'\\' => {
                i += 1;
                let Some(&symbol) = bytes.get(i) else {
                    break;
                };
                match symbol {
                    b'\\' | b'{' | b'}' => {
                        push_literal(
                            &mut out,
                            skip_depth.is_some(),
                            &mut fallback_left,
                            char::from(symbol),
                        );
                        i += 1;
                    }
                    b'\'' => {
                        let value = bytes
                            .get(i + 1..i + 3)
                            .filter(|pair| pair.iter().all(u8::is_ascii_hexdigit))
                            .and_then(|pair| std::str::from_utf8(pair).ok())
                            .and_then(|hex| u8::from_str_radix(hex, 16).ok());
                        match value {
                            Some(byte) => {
                                push_literal(
                                    &mut out,
                                    skip_depth.is_some(),
                                    &mut fallback_left,
                                    rtf_cp1252_char(byte),
                                );
                                i += 3;
                            }
                            // Not a valid escape: drop the quote and carry on.
                            None => i += 1,
                        }
                    }
                    b'*' => {
                        // `\*` marks a destination that readers may ignore.
                        if depth > 0 && skip_depth.is_none() {
                            skip_depth = Some(depth);
                        }
                        i += 1;
                    }
                    b'~' | b'\n' | b'\r' => {
                        // Non-breaking space, or an escaped line break
                        // (which stands for a paragraph break).
                        if skip_depth.is_none() {
                            out.push(' ');
                        }
                        i += 1;
                    }
                    b'_' => {
                        // Non-breaking hyphen.
                        if skip_depth.is_none() {
                            out.push('-');
                        }
                        i += 1;
                    }
                    first if first.is_ascii_alphabetic() => {
                        fallback_left = 0;

                        let word_start = i;
                        while i < len && bytes[i].is_ascii_alphabetic() {
                            i += 1;
                        }
                        let word = &bytes[word_start..i];

                        // Optional numeric parameter, possibly negative.
                        let param_start = i;
                        if i + 1 < len && bytes[i] == b'-' && bytes[i + 1].is_ascii_digit() {
                            i += 1;
                        }
                        while i < len && bytes[i].is_ascii_digit() {
                            i += 1;
                        }
                        let param = std::str::from_utf8(&bytes[param_start..i])
                            .ok()
                            .and_then(|digits| digits.parse::<i32>().ok());

                        // A single space after a control word is its
                        // delimiter, not text.
                        if i < len && bytes[i] == b' ' {
                            i += 1;
                        }

                        let hidden = skip_depth.is_some();
                        let inserted = match word {
                            b"par" | b"line" | b"tab" | b"sect" | b"page" | b"column"
                            | b"cell" | b"row" => Some(' '),
                            b"emdash" => Some('\u{2014}'),
                            b"endash" => Some('\u{2013}'),
                            b"lquote" => Some('\u{2018}'),
                            b"rquote" => Some('\u{2019}'),
                            b"ldblquote" => Some('\u{201C}'),
                            b"rdblquote" => Some('\u{201D}'),
                            b"bullet" => Some('\u{2022}'),
                            _ => None,
                        };

                        if let Some(ch) = inserted {
                            if !hidden {
                                out.push(ch);
                            }
                        } else {
                            match (word, param) {
                                (b"u", Some(n)) => {
                                    // The parameter is a signed 16-bit value,
                                    // so code units above 32767 are negative.
                                    let code = if n < 0 { n + 0x1_0000 } else { n };
                                    if code >= 0 {
                                        let code = code as u32;
                                        let decoded = match (high_surrogate.take(), code) {
                                            (Some(high), 0xDC00..=0xDFFF) => char::from_u32(
                                                0x1_0000
                                                    + ((high - 0xD800) << 10)
                                                    + (code - 0xDC00),
                                            ),
                                            (_, 0xD800..=0xDBFF) => {
                                                high_surrogate = Some(code);
                                                None
                                            }
                                            _ => char::from_u32(code),
                                        };
                                        if let Some(ch) = decoded {
                                            if !hidden {
                                                out.push(ch);
                                            }
                                        }
                                    }
                                    fallback_left = uc_fallback;
                                }
                                (b"uc", Some(n)) => {
                                    uc_fallback = n.max(0) as usize;
                                }
                                (b"bin", Some(n)) if n > 0 => {
                                    // `\binN` is followed by N raw bytes.
                                    i = i.saturating_add(n as usize).min(len);
                                }
                                (name, _)
                                    if depth > 0
                                        && !hidden
                                        && METADATA_DESTINATIONS.contains(&name) =>
                                {
                                    skip_depth = Some(depth);
                                }
                                _ => {}
                            }
                        }
                    }
                    _ => {
                        // Any other control symbol carries no text.
                        i += 1;
                    }
                }
            }
            // Raw line breaks in the file are not part of the text.
            b'\n' | b'\r' => {
                i += 1;
            }
            b => {
                push_literal(
                    &mut out,
                    skip_depth.is_some(),
                    &mut fallback_left,
                    rtf_cp1252_char(b),
                );
                i += 1;
            }
        }
    }

    out.split_whitespace().collect::<Vec<_>>().join(" ")
}
349
/// Upper bounds applied while extracting text from a file.
#[derive(Debug, Clone)]
pub struct ExtractLimits {
    /// Largest file size, in bytes, that will be processed at all.
    pub max_file_bytes: u64,
    /// Maximum number of characters kept in the returned text.
    pub max_output_chars: usize,
    /// Maximum number of bytes read from one XML part of a zipped document.
    pub max_xml_bytes: usize,
    /// Maximum number of spreadsheet sheets rendered.
    pub max_sheets: usize,
    /// Maximum number of rows rendered per sheet.
    pub max_rows: usize,
    /// Maximum number of columns rendered per row.
    pub max_cols: usize,
}

impl Default for ExtractLimits {
    /// Defaults: 25 MiB files, 200 000 output characters, 5 MiB per XML
    /// part, and 6 sheets of 200 rows by 30 columns.
    fn default() -> Self {
        const FILE_BYTES: u64 = 25 * 1024 * 1024;
        const XML_BYTES: usize = 5 * 1024 * 1024;

        ExtractLimits {
            max_file_bytes: FILE_BYTES,
            max_output_chars: 200_000,
            max_xml_bytes: XML_BYTES,
            max_sheets: 6,
            max_rows: 200,
            max_cols: 30,
        }
    }
}
372
373pub fn extract_file_text(path: &PathBuf, limits: ExtractLimits) -> Result<String> {
374 if !path.exists() {
375 return Err(DocumentError::NotFound(format!(
376 "File does not exist: {}",
377 path.display()
378 )));
379 }
380 if !path.is_file() {
381 return Err(DocumentError::InvalidDocument(format!(
382 "Path is not a file: {}",
383 path.display()
384 )));
385 }
386
387 let meta = fs::metadata(path)?;
388 if meta.len() > limits.max_file_bytes {
389 return Err(DocumentError::InvalidDocument(format!(
390 "File too large for text extraction: {} bytes (limit: {} bytes)",
391 meta.len(),
392 limits.max_file_bytes
393 )));
394 }
395
396 let ext = lower_ext(path.as_path()).unwrap_or_default();
397 let text = match ext.as_str() {
398 "pdf" => extract_text_pdf(path.as_path())?,
399 "docx" => extract_text_docx(path.as_path(), limits.max_xml_bytes)?,
400 "pptx" => extract_text_pptx(path.as_path(), limits.max_xml_bytes)?,
401 "xlsx" | "xls" | "ods" | "xlsb" => extract_text_spreadsheet(
402 path.as_path(),
403 limits.max_sheets,
404 limits.max_rows,
405 limits.max_cols,
406 )?,
407 "rtf" => {
408 let bytes = fs::read(path)?;
409 extract_text_rtf(&bytes)
410 }
411 _ => fs::read_to_string(path)?,
412 };
413
414 Ok(truncate_output(text, limits.max_output_chars))
415}