1use calamine::{open_workbook_auto, Data, Reader};
2use quick_xml::events::Event;
3use quick_xml::Reader as XmlReader;
4use std::collections::BTreeMap;
5use std::fs;
6use std::io::{Cursor, Read};
7use std::path::{Path, PathBuf};
8use thiserror::Error;
9use zip::ZipArchive;
10
11#[derive(Error, Debug)]
12pub enum DocumentError {
13 #[error("IO error: {0}")]
14 Io(#[from] std::io::Error),
15
16 #[error("File not found: {0}")]
17 NotFound(String),
18
19 #[error("Invalid document: {0}")]
20 InvalidDocument(String),
21
22 #[error("Extraction failed: {0}")]
23 ExtractionFailed(String),
24}
25
26pub type Result<T> = std::result::Result<T, DocumentError>;
27
/// Returns the file extension of `path` in ASCII lowercase.
///
/// Yields `None` when the path has no extension or the extension is not valid UTF-8.
fn lower_ext(path: &Path) -> Option<String> {
    let raw = path.extension()?;
    let ext = raw.to_str()?;
    Some(ext.to_ascii_lowercase())
}
33
/// Limits `text` to at most `max_chars` Unicode scalar values.
///
/// * `max_chars == 0` yields an empty string (no marker is appended).
/// * Text that already fits is returned unchanged.
/// * Longer text is cut after `max_chars` characters and a
///   `...[truncated]...` marker is appended.
fn truncate_output(text: String, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }

    // Byte offset of the first character that does not fit, if there is one.
    match text.char_indices().nth(max_chars) {
        None => text,
        Some((cut_at, _)) => {
            let mut clipped = text;
            clipped.truncate(cut_at);
            clipped.push_str("\n\n...[truncated]...\n");
            clipped
        }
    }
}
49
50fn read_zip_entry(path: &Path, inner_path: &str, max_bytes: usize) -> Result<Vec<u8>> {
51 const MAX_ZIP_BOMB_RATIO: u64 = 100;
52
53 let bytes = fs::read(path)?;
54 let compressed_size = bytes.len() as u64;
55 let cursor = Cursor::new(bytes);
56 let mut archive = ZipArchive::new(cursor).map_err(|err| {
57 DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
58 })?;
59
60 let mut entry = archive.by_name(inner_path).map_err(|err| {
61 DocumentError::InvalidDocument(format!(
62 "Zip entry '{}' not found in {:?}: {}",
63 inner_path, path, err
64 ))
65 })?;
66
67 let uncompressed_size = entry.size();
68 if uncompressed_size > (compressed_size.saturating_mul(MAX_ZIP_BOMB_RATIO)) {
69 return Err(DocumentError::InvalidDocument(format!(
70 "Zip bomb detected: uncompressed size {} exceeds ratio limit ({}x compressed size {})",
71 uncompressed_size, MAX_ZIP_BOMB_RATIO, compressed_size
72 )));
73 }
74
75 let mut out = Vec::new();
76 let mut buffer = [0u8; 16 * 1024];
77 while out.len() < max_bytes {
78 let remaining = max_bytes - out.len();
79 let read_len = remaining.min(buffer.len());
80 let read = entry.read(&mut buffer[..read_len]).map_err(|err| {
81 DocumentError::ExtractionFailed(format!("Failed reading zip entry: {}", err))
82 })?;
83 if read == 0 {
84 break;
85 }
86 out.extend_from_slice(&buffer[..read]);
87 }
88
89 Ok(out)
90}
91
/// Ensures non-empty output ends with a newline so the next paragraph starts on
/// its own line. Empty output and output already ending in `'\n'` are left as is.
fn append_paragraph_break(out: &mut String) {
    match out.chars().next_back() {
        None | Some('\n') => {}
        Some(_) => out.push('\n'),
    }
}
97
/// Which flavour of OOXML part is being parsed; selects element handling and
/// the label used in parse-error messages.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum OoxmlKind {
    /// A word-processing part (`word/document.xml`).
    Word,
    /// A presentation slide part (`ppt/slides/slide*.xml`).
    Presentation,
}
103
104fn extract_ooxml_text(xml: &[u8], kind: OoxmlKind) -> Result<String> {
105 let mut reader = XmlReader::from_reader(xml);
106 let config = reader.config_mut();
107 config.trim_text(false);
108 config.expand_empty_elements = false;
109
110 let mut out = String::new();
111 let mut in_text = false;
112 let mut buf = Vec::new();
113
114 loop {
115 match reader.read_event_into(&mut buf) {
116 Ok(Event::Start(event)) => {
117 let name = event.name();
118 let name = name.as_ref();
119 if name.ends_with(b"t") {
120 in_text = true;
121 } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"tab") {
122 out.push('\t');
123 } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"br") {
124 out.push('\n');
125 } else if name.ends_with(b"p") {
126 append_paragraph_break(&mut out);
127 }
128 }
129 Ok(Event::End(_)) => {
130 in_text = false;
131 }
132 Ok(Event::Text(text)) if in_text => {
133 let decoded = text.decode().map_err(|err| {
134 DocumentError::ExtractionFailed(format!("XML decode/unescape error: {}", err))
135 })?;
136 out.push_str(&decoded);
137 }
138 Ok(Event::Eof) => break,
139 Err(err) => {
140 let label = match kind {
141 OoxmlKind::Word => "OOXML XML",
142 OoxmlKind::Presentation => "PPTX XML",
143 };
144 return Err(DocumentError::ExtractionFailed(format!(
145 "Failed parsing {}: {}",
146 label, err
147 )));
148 }
149 _ => {}
150 }
151
152 buf.clear();
153 }
154
155 Ok(out)
156}
157
158fn extract_text_docx(path: &Path, max_xml_bytes: usize) -> Result<String> {
159 let xml = read_zip_entry(path, "word/document.xml", max_xml_bytes)?;
160 extract_ooxml_text(&xml, OoxmlKind::Word)
161}
162
163fn extract_text_pptx(path: &Path, max_xml_bytes: usize) -> Result<String> {
164 const MAX_ZIP_BOMB_RATIO: u64 = 100;
165
166 let bytes = fs::read(path)?;
167 let compressed_size = bytes.len() as u64;
168 let cursor = Cursor::new(bytes);
169 let mut archive = ZipArchive::new(cursor).map_err(|err| {
170 DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
171 })?;
172
173 let mut slides = BTreeMap::new();
174 for idx in 0..archive.len() {
175 let Ok(file) = archive.by_index(idx) else {
176 continue;
177 };
178 let name = file.name().to_string();
179 if !name.starts_with("ppt/slides/slide") || !name.ends_with(".xml") {
180 continue;
181 }
182
183 let uncompressed_size = file.size();
184 if uncompressed_size > (compressed_size.saturating_mul(MAX_ZIP_BOMB_RATIO)) {
185 return Err(DocumentError::InvalidDocument(format!(
186 "Zip bomb detected in slide {}: uncompressed size {} exceeds ratio limit",
187 name, uncompressed_size
188 )));
189 }
190
191 let mut buf = Vec::new();
192 file.take(max_xml_bytes as u64)
193 .read_to_end(&mut buf)
194 .map_err(|err| {
195 DocumentError::ExtractionFailed(format!("Failed reading slide XML: {}", err))
196 })?;
197 let text = extract_ooxml_text(&buf, OoxmlKind::Presentation)?;
198 slides.insert(name, text);
199 }
200
201 if slides.is_empty() {
202 return Err(DocumentError::InvalidDocument(format!(
203 "No slide XML found in {:?}",
204 path
205 )));
206 }
207
208 let mut out = String::new();
209 for (name, text) in slides {
210 out.push_str("# ");
211 out.push_str(&name);
212 out.push('\n');
213 out.push_str(text.trim());
214 out.push_str("\n\n");
215 }
216 Ok(out)
217}
218
219fn extract_text_spreadsheet(
220 path: &Path,
221 max_sheets: usize,
222 max_rows: usize,
223 max_cols: usize,
224) -> Result<String> {
225 let mut workbook = open_workbook_auto(path).map_err(|err| {
226 DocumentError::InvalidDocument(format!("Failed to open spreadsheet {:?}: {}", path, err))
227 })?;
228
229 let mut out = String::new();
230 for (sheet_index, sheet_name) in workbook.sheet_names().iter().cloned().enumerate() {
231 if sheet_index >= max_sheets {
232 out.push_str("\n...[more sheets truncated]...\n");
233 break;
234 }
235
236 let range = match workbook.worksheet_range(&sheet_name) {
237 Ok(range) => range,
238 Err(_) => continue,
239 };
240
241 out.push_str("# Sheet: ");
242 out.push_str(&sheet_name);
243 out.push('\n');
244
245 for (row_index, row) in range.rows().take(max_rows).enumerate() {
246 if row_index > 0 {
247 out.push('\n');
248 }
249
250 for (col_index, cell) in row.iter().take(max_cols).enumerate() {
251 if col_index > 0 {
252 out.push('\t');
253 }
254 if !matches!(cell, Data::Empty) {
255 out.push_str(&cell.to_string());
256 }
257 }
258 }
259 out.push_str("\n\n");
260 }
261
262 Ok(out)
263}
264
265fn extract_text_pdf(path: &Path) -> Result<String> {
266 pdf_extract::extract_text(path).map_err(|err| {
267 DocumentError::ExtractionFailed(format!("Failed to extract PDF text {:?}: {}", path, err))
268 })
269}
270
/// Appends `ch` to `out` unless it is consumed as a `\u` fallback character or
/// lies inside a skipped destination group.
fn push_rtf_char(out: &mut String, ch: char, skipping: bool, fallback_pending: &mut usize) {
    if *fallback_pending > 0 {
        *fallback_pending -= 1;
    } else if !skipping {
        out.push(ch);
    }
}

/// Strips RTF markup from `bytes` and returns the plain text with all runs of
/// whitespace collapsed to single spaces.
///
/// Handling, in brief:
///
/// * `{` / `}` delimit groups. Groups introduced by `\*`, `\fonttbl`,
///   `\colortbl`, `\stylesheet`, `\info` or `\pict` hold metadata and are
///   skipped entirely.
/// * `\par`, `\line`, `\tab`, `\sect`, `\page`, `\cell`, `\row` and a backslash
///   followed by a raw newline become a word separator.
/// * `\uN` is decoded to the Unicode character `N` (negative values are 16-bit
///   two's complement); the fallback characters that follow it (count set by
///   `\ucN`, default 1) are dropped. Surrogate code units are dropped.
/// * `\'hh` is emitted as the character with code point `0xhh` (no code-page
///   translation is applied).
/// * `\\`, `\{`, `\}` are literal characters; `\~` is a space, `\_` a hyphen;
///   other control symbols and control words are discarded.
/// * `\binN` skips the `N` raw bytes that follow.
/// * Raw CR / LF bytes are ignored.
///
/// The function never fails; malformed input yields best-effort output.
fn extract_text_rtf(bytes: &[u8]) -> String {
    let mut out = String::new();
    let mut index = 0usize;

    // One entry per open group, holding the `\uc` value to restore on `}`.
    let mut group_fallbacks: Vec<usize> = Vec::new();
    // Number of fallback characters that follow each `\uN` (set by `\ucN`).
    let mut fallback_len = 1usize;
    // Fallback characters still to be discarded after the latest `\uN`.
    let mut fallback_pending = 0usize;
    // Group depth at which an ignored destination started, while skipping it.
    let mut skip_depth: Option<usize> = None;

    while index < bytes.len() {
        let skipping = skip_depth.is_some();
        match bytes[index] {
            b'{' => {
                group_fallbacks.push(fallback_len);
                fallback_pending = 0;
                index += 1;
            }
            b'}' => {
                // Leaving the group that started the skip ends it.
                if skip_depth == Some(group_fallbacks.len()) {
                    skip_depth = None;
                }
                fallback_len = group_fallbacks.pop().unwrap_or(1);
                fallback_pending = 0;
                index += 1;
            }
            b'\\' => {
                index += 1;
                let Some(&symbol) = bytes.get(index) else {
                    break;
                };

                match symbol {
                    b'\\' | b'{' | b'}' => {
                        push_rtf_char(&mut out, symbol as char, skipping, &mut fallback_pending);
                        index += 1;
                    }
                    b'\'' => {
                        let decoded = bytes
                            .get(index + 1..index + 3)
                            .and_then(|hex| std::str::from_utf8(hex).ok())
                            .and_then(|hex| u8::from_str_radix(hex, 16).ok());
                        match decoded {
                            Some(value) => {
                                push_rtf_char(
                                    &mut out,
                                    value as char,
                                    skipping,
                                    &mut fallback_pending,
                                );
                                index += 3;
                            }
                            // Not a valid escape: drop the quote, keep what follows.
                            None => index += 1,
                        }
                    }
                    // A backslash before a raw newline is a paragraph break.
                    b'\n' | b'\r' => {
                        fallback_pending = 0;
                        if !skipping {
                            out.push(' ');
                        }
                        index += 1;
                    }
                    // `\*` marks the enclosing group as an ignorable destination.
                    b'*' => {
                        if !skipping && !group_fallbacks.is_empty() {
                            skip_depth = Some(group_fallbacks.len());
                        }
                        index += 1;
                    }
                    b'~' => {
                        push_rtf_char(&mut out, ' ', skipping, &mut fallback_pending);
                        index += 1;
                    }
                    b'_' => {
                        push_rtf_char(&mut out, '-', skipping, &mut fallback_pending);
                        index += 1;
                    }
                    letter if letter.is_ascii_alphabetic() => {
                        let word_start = index;
                        while index < bytes.len() && bytes[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let word = &bytes[word_start..index];

                        // Optional signed numeric parameter.
                        let param_start = index;
                        if bytes.get(index) == Some(&b'-')
                            && matches!(bytes.get(index + 1), Some(digit) if digit.is_ascii_digit())
                        {
                            index += 1;
                        }
                        while index < bytes.len() && bytes[index].is_ascii_digit() {
                            index += 1;
                        }
                        let param = std::str::from_utf8(&bytes[param_start..index])
                            .ok()
                            .and_then(|digits| digits.parse::<i64>().ok());

                        // A single space after a control word is its delimiter.
                        if bytes.get(index) == Some(&b' ') {
                            index += 1;
                        }

                        fallback_pending = 0;
                        match word {
                            b"par" | b"line" | b"tab" | b"sect" | b"page" | b"cell" | b"row" => {
                                if !skipping {
                                    out.push(' ');
                                }
                            }
                            b"u" => {
                                if let Some(code) = param {
                                    let unit = code.rem_euclid(0x1_0000) as u32;
                                    if !skipping {
                                        if let Some(ch) = char::from_u32(unit) {
                                            out.push(ch);
                                        }
                                    }
                                    fallback_pending = fallback_len;
                                }
                            }
                            b"uc" => {
                                fallback_len = param
                                    .and_then(|count| usize::try_from(count).ok())
                                    .unwrap_or(1);
                            }
                            b"bin" => {
                                let count = param
                                    .and_then(|count| usize::try_from(count).ok())
                                    .unwrap_or(0);
                                index = index.saturating_add(count).min(bytes.len());
                            }
                            b"fonttbl" | b"colortbl" | b"stylesheet" | b"info" | b"pict" => {
                                if !skipping && !group_fallbacks.is_empty() {
                                    skip_depth = Some(group_fallbacks.len());
                                }
                            }
                            _ => {}
                        }
                    }
                    // Any other control symbol carries no text.
                    _ => index += 1,
                }
            }
            b'\n' | b'\r' => {
                index += 1;
            }
            byte => {
                push_rtf_char(&mut out, byte as char, skipping, &mut fallback_pending);
                index += 1;
            }
        }
    }

    out.split_whitespace().collect::<Vec<_>>().join(" ")
}
334
/// Upper bounds applied while extracting text from a document.
#[derive(Clone, Debug)]
pub struct ExtractLimits {
    /// Largest input file, in bytes, that will be processed.
    pub max_file_bytes: u64,
    /// Maximum number of characters kept in the returned text.
    pub max_output_chars: usize,
    /// Maximum number of decompressed bytes read from a single XML part.
    pub max_xml_bytes: usize,
    /// Maximum number of spreadsheet sheets rendered.
    pub max_sheets: usize,
    /// Maximum number of rows rendered per sheet.
    pub max_rows: usize,
    /// Maximum number of cells rendered per row.
    pub max_cols: usize,
}

impl Default for ExtractLimits {
    /// Defaults: 25 MiB input, 200 000 output characters, 5 MiB per XML part,
    /// 6 sheets of 200 rows by 30 columns.
    fn default() -> Self {
        const MIB: usize = 1024 * 1024;

        ExtractLimits {
            max_file_bytes: 25 * (MIB as u64),
            max_output_chars: 200_000,
            max_xml_bytes: 5 * MIB,
            max_sheets: 6,
            max_rows: 200,
            max_cols: 30,
        }
    }
}
357
358pub fn extract_file_text(path: &PathBuf, limits: ExtractLimits) -> Result<String> {
359 if !path.exists() {
360 return Err(DocumentError::NotFound(format!(
361 "File does not exist: {}",
362 path.display()
363 )));
364 }
365 if !path.is_file() {
366 return Err(DocumentError::InvalidDocument(format!(
367 "Path is not a file: {}",
368 path.display()
369 )));
370 }
371
372 let metadata = fs::metadata(path)?;
373 if metadata.len() > limits.max_file_bytes {
374 return Err(DocumentError::InvalidDocument(format!(
375 "File too large for text extraction: {} bytes (limit: {} bytes)",
376 metadata.len(),
377 limits.max_file_bytes
378 )));
379 }
380
381 let ext = lower_ext(path.as_path()).unwrap_or_default();
382 let text = match ext.as_str() {
383 "pdf" => extract_text_pdf(path.as_path())?,
384 "docx" => extract_text_docx(path.as_path(), limits.max_xml_bytes)?,
385 "pptx" => extract_text_pptx(path.as_path(), limits.max_xml_bytes)?,
386 "xlsx" | "xls" | "ods" | "xlsb" => extract_text_spreadsheet(
387 path.as_path(),
388 limits.max_sheets,
389 limits.max_rows,
390 limits.max_cols,
391 )?,
392 "rtf" => {
393 let bytes = fs::read(path)?;
394 extract_text_rtf(&bytes)
395 }
396 _ => fs::read_to_string(path)?,
397 };
398
399 Ok(truncate_output(text, limits.max_output_chars))
400}