1use calamine::{open_workbook_auto, Data, Reader};
2use quick_xml::events::Event;
3use quick_xml::Reader as XmlReader;
4use std::collections::BTreeMap;
5use std::fs;
6use std::io::{Cursor, Read};
7use std::path::{Path, PathBuf};
8use thiserror::Error;
9use zip::ZipArchive;
10
/// Errors produced while locating a document and extracting its text.
#[derive(Error, Debug)]
pub enum DocumentError {
    /// An underlying I/O operation failed; converted automatically from
    /// [`std::io::Error`] via `?`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// The requested path does not exist. The payload is a human-readable message.
    #[error("File not found: {0}")]
    NotFound(String),

    /// The input is not something this module can treat as a document
    /// (for example: not a regular file, too large, or an unreadable container).
    #[error("Invalid document: {0}")]
    InvalidDocument(String),

    /// The document was opened but reading or parsing its content failed.
    #[error("Extraction failed: {0}")]
    ExtractionFailed(String),
}
25
/// Convenience alias for results whose error type is [`DocumentError`].
pub type Result<T> = std::result::Result<T, DocumentError>;
27
/// Returns the ASCII-lowercased extension of `path`.
///
/// Yields `None` when the path has no extension or the extension is not
/// valid UTF-8.
fn lower_ext(path: &Path) -> Option<String> {
    let raw = path.extension()?;
    let utf8 = raw.to_str()?;
    Some(utf8.to_ascii_lowercase())
}
33
/// Limits `text` to at most `max_chars` characters (Unicode scalar values).
///
/// A limit of zero yields an empty string. When the text is longer than the
/// limit, the first `max_chars` characters are kept and a truncation marker
/// is appended; otherwise the text is returned unchanged.
fn truncate_output(text: String, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }

    // Byte offset of the first character past the limit, if there is one.
    let cut = text
        .char_indices()
        .nth(max_chars)
        .map(|(offset, _)| offset);

    match cut {
        None => text,
        Some(offset) => {
            let mut clipped = text;
            clipped.truncate(offset);
            clipped.push_str("\n\n...[truncated]...\n");
            clipped
        }
    }
}
49
50fn read_zip_entry(path: &Path, inner_path: &str, max_bytes: usize) -> Result<Vec<u8>> {
51 let bytes = fs::read(path)?;
52 let cursor = Cursor::new(bytes);
53 let mut archive = ZipArchive::new(cursor).map_err(|err| {
54 DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
55 })?;
56
57 let mut entry = archive.by_name(inner_path).map_err(|err| {
58 DocumentError::InvalidDocument(format!(
59 "Zip entry '{}' not found in {:?}: {}",
60 inner_path, path, err
61 ))
62 })?;
63
64 let mut out = Vec::new();
65 let mut buffer = [0u8; 16 * 1024];
66 while out.len() < max_bytes {
67 let remaining = max_bytes - out.len();
68 let read_len = remaining.min(buffer.len());
69 let read = entry.read(&mut buffer[..read_len]).map_err(|err| {
70 DocumentError::ExtractionFailed(format!("Failed reading zip entry: {}", err))
71 })?;
72 if read == 0 {
73 break;
74 }
75 out.extend_from_slice(&buffer[..read]);
76 }
77
78 Ok(out)
79}
80
/// Ends the current line of `out` with a newline, unless `out` is empty or
/// already ends with one.
fn append_paragraph_break(out: &mut String) {
    let last = out.chars().next_back();
    if matches!(last, Some(ch) if ch != '\n') {
        out.push('\n');
    }
}
86
/// Selects which OOXML flavour is being parsed by the XML text extractor.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum OoxmlKind {
    /// Word-processing XML; tab and break elements are translated to whitespace.
    Word,
    /// Presentation (slide) XML.
    Presentation,
}
92
93fn extract_ooxml_text(xml: &[u8], kind: OoxmlKind) -> Result<String> {
94 let mut reader = XmlReader::from_reader(xml);
95 reader.config_mut().trim_text(false);
96
97 let mut out = String::new();
98 let mut in_text = false;
99 let mut buf = Vec::new();
100
101 loop {
102 match reader.read_event_into(&mut buf) {
103 Ok(Event::Start(event)) => {
104 let name = event.name();
105 let name = name.as_ref();
106 if name.ends_with(b"t") {
107 in_text = true;
108 } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"tab") {
109 out.push('\t');
110 } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"br") {
111 out.push('\n');
112 } else if name.ends_with(b"p") {
113 append_paragraph_break(&mut out);
114 }
115 }
116 Ok(Event::End(_)) => {
117 in_text = false;
118 }
119 Ok(Event::Text(text)) => {
120 if in_text {
121 let decoded = text.decode().map_err(|err| {
122 DocumentError::ExtractionFailed(format!(
123 "XML decode/unescape error: {}",
124 err
125 ))
126 })?;
127 out.push_str(&decoded);
128 }
129 }
130 Ok(Event::Eof) => break,
131 Err(err) => {
132 let label = match kind {
133 OoxmlKind::Word => "OOXML XML",
134 OoxmlKind::Presentation => "PPTX XML",
135 };
136 return Err(DocumentError::ExtractionFailed(format!(
137 "Failed parsing {}: {}",
138 label, err
139 )));
140 }
141 _ => {}
142 }
143
144 buf.clear();
145 }
146
147 Ok(out)
148}
149
150fn extract_text_docx(path: &Path, max_xml_bytes: usize) -> Result<String> {
151 let xml = read_zip_entry(path, "word/document.xml", max_xml_bytes)?;
152 extract_ooxml_text(&xml, OoxmlKind::Word)
153}
154
155fn extract_text_pptx(path: &Path, max_xml_bytes: usize) -> Result<String> {
156 let bytes = fs::read(path)?;
157 let cursor = Cursor::new(bytes);
158 let mut archive = ZipArchive::new(cursor).map_err(|err| {
159 DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
160 })?;
161
162 let mut slides = BTreeMap::new();
163 for idx in 0..archive.len() {
164 let Ok(file) = archive.by_index(idx) else {
165 continue;
166 };
167 let name = file.name().to_string();
168 if !name.starts_with("ppt/slides/slide") || !name.ends_with(".xml") {
169 continue;
170 }
171
172 let mut buf = Vec::new();
173 file.take(max_xml_bytes as u64)
174 .read_to_end(&mut buf)
175 .map_err(|err| {
176 DocumentError::ExtractionFailed(format!("Failed reading slide XML: {}", err))
177 })?;
178 let text = extract_ooxml_text(&buf, OoxmlKind::Presentation)?;
179 slides.insert(name, text);
180 }
181
182 if slides.is_empty() {
183 return Err(DocumentError::InvalidDocument(format!(
184 "No slide XML found in {:?}",
185 path
186 )));
187 }
188
189 let mut out = String::new();
190 for (name, text) in slides {
191 out.push_str("# ");
192 out.push_str(&name);
193 out.push('\n');
194 out.push_str(text.trim());
195 out.push_str("\n\n");
196 }
197 Ok(out)
198}
199
200fn extract_text_spreadsheet(
201 path: &Path,
202 max_sheets: usize,
203 max_rows: usize,
204 max_cols: usize,
205) -> Result<String> {
206 let mut workbook = open_workbook_auto(path).map_err(|err| {
207 DocumentError::InvalidDocument(format!("Failed to open spreadsheet {:?}: {}", path, err))
208 })?;
209
210 let mut out = String::new();
211 for (sheet_index, sheet_name) in workbook.sheet_names().iter().cloned().enumerate() {
212 if sheet_index >= max_sheets {
213 out.push_str("\n...[more sheets truncated]...\n");
214 break;
215 }
216
217 let range = match workbook.worksheet_range(&sheet_name) {
218 Ok(range) => range,
219 Err(_) => continue,
220 };
221
222 out.push_str("# Sheet: ");
223 out.push_str(&sheet_name);
224 out.push('\n');
225
226 for (row_index, row) in range.rows().take(max_rows).enumerate() {
227 if row_index > 0 {
228 out.push('\n');
229 }
230
231 for (col_index, cell) in row.iter().take(max_cols).enumerate() {
232 if col_index > 0 {
233 out.push('\t');
234 }
235 if !matches!(cell, Data::Empty) {
236 out.push_str(&cell.to_string());
237 }
238 }
239 }
240 out.push_str("\n\n");
241 }
242
243 Ok(out)
244}
245
246fn extract_text_pdf(path: &Path) -> Result<String> {
247 pdf_extract::extract_text(path).map_err(|err| {
248 DocumentError::ExtractionFailed(format!("Failed to extract PDF text {:?}: {}", path, err))
249 })
250}
251
/// Maps one 8-bit RTF text byte to a character, assuming Windows-1252.
///
/// Bytes 0x80..=0x9F are where Windows-1252 places typographic punctuation
/// (curly quotes, dashes, the euro sign); everything else matches Latin-1.
/// NOTE(review): the document's `\ansicpg` code page is not consulted, so
/// other code pages are still decoded as Windows-1252.
fn decode_cp1252_byte(byte: u8) -> char {
    const C1_RANGE: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', '\u{201E}', '\u{2026}', '\u{2020}',
        '\u{2021}', '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', '\u{0152}', '\u{008D}',
        '\u{017D}', '\u{008F}', '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', '\u{201D}',
        '\u{2022}', '\u{2013}', '\u{2014}', '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}',
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}',
    ];
    match byte {
        0x80..=0x9F => C1_RANGE[usize::from(byte - 0x80)],
        _ => char::from(byte),
    }
}

/// Appends a text character unless it is hidden (inside a skipped
/// destination) or is consumed as the fallback of a preceding `\uN` escape.
fn push_rtf_char(out: &mut String, hidden: bool, fallback_pending: &mut usize, ch: char) {
    if *fallback_pending > 0 {
        *fallback_pending -= 1;
    } else if !hidden {
        out.push(ch);
    }
}

/// Decodes the parameter of a `\uN` control word.
///
/// RTF stores UTF-16 code units as signed 16-bit numbers, so negative values
/// are shifted up by 65536. A high surrogate is remembered in
/// `high_surrogate` and combined with the following low surrogate.
fn decode_rtf_unicode(value: i32, high_surrogate: &mut Option<u32>) -> Option<char> {
    let adjusted = if value < 0 { value + 0x1_0000 } else { value };
    if adjusted < 0 {
        return None;
    }
    let unit = adjusted as u32;
    match unit {
        0xD800..=0xDBFF => {
            *high_surrogate = Some(unit);
            None
        }
        0xDC00..=0xDFFF => {
            let high = high_surrogate.take()?;
            char::from_u32(0x1_0000 + ((high - 0xD800) << 10) + (unit - 0xDC00))
        }
        _ => {
            *high_surrogate = None;
            char::from_u32(unit)
        }
    }
}

/// Best-effort conversion of RTF bytes to plain text.
///
/// The result is a single line: all runs of whitespace, including paragraph
/// and line breaks, are collapsed to one space.
///
/// Handled constructs:
/// * groups (`{`/`}`), with metadata destinations (`\fonttbl`, `\colortbl`,
///   `\stylesheet`, `\info`, `\pict`, list/file/revision tables) and any
///   ignorable `\*` destination skipped entirely;
/// * escaped literals `\\`, `\{`, `\}`;
/// * `\'hh` hex escapes, decoded as Windows-1252;
/// * `\uN` Unicode escapes, including surrogate pairs, with the following
///   `\ucN` fallback characters dropped;
/// * break-like control words (`\par`, `\line`, `\tab`, ...), which become
///   whitespace so adjacent words are not fused;
/// * named punctuation (`\emdash`, `\rquote`, ...).
///
/// All other control words are discarded together with their numeric
/// parameter and single delimiting space.
fn extract_text_rtf(bytes: &[u8]) -> String {
    let mut out = String::new();
    let mut index = 0usize;
    // Current group nesting level.
    let mut depth = 0usize;
    // Nesting level of the destination group currently being skipped, if any.
    let mut skip_depth: Option<usize> = None;
    // Set by `\*`: the next control word names an ignorable destination.
    let mut ignorable_pending = false;
    // Number of fallback characters that follow each `\uN` (set by `\ucN`).
    let mut fallback_len = 1usize;
    // Fallback characters still to be dropped after the latest `\uN`.
    let mut fallback_pending = 0usize;
    let mut high_surrogate: Option<u32> = None;

    while index < bytes.len() {
        let byte = bytes[index];
        index += 1;
        let hidden = skip_depth.is_some();

        match byte {
            b'{' => {
                depth += 1;
                ignorable_pending = false;
                fallback_pending = 0;
            }
            b'}' => {
                if skip_depth == Some(depth) {
                    skip_depth = None;
                }
                depth = depth.saturating_sub(1);
                ignorable_pending = false;
                fallback_pending = 0;
            }
            // Raw line breaks in the file are formatting only, not content.
            b'\n' | b'\r' => {}
            b'\\' => {
                let Some(&symbol) = bytes.get(index) else {
                    break;
                };
                match symbol {
                    b'\\' | b'{' | b'}' => {
                        index += 1;
                        push_rtf_char(&mut out, hidden, &mut fallback_pending, char::from(symbol));
                    }
                    b'\'' => {
                        index += 1;
                        let value = bytes.get(index..index + 2).and_then(|pair| {
                            let high = char::from(pair[0]).to_digit(16)?;
                            let low = char::from(pair[1]).to_digit(16)?;
                            // Two hex digits always fit in a byte.
                            Some((high * 16 + low) as u8)
                        });
                        // A malformed escape only consumes the quote; what
                        // follows is then treated as ordinary text.
                        if let Some(value) = value {
                            index += 2;
                            push_rtf_char(
                                &mut out,
                                hidden,
                                &mut fallback_pending,
                                decode_cp1252_byte(value),
                            );
                        }
                    }
                    b'*' => {
                        index += 1;
                        ignorable_pending = true;
                    }
                    // Non-breaking space.
                    b'~' => {
                        index += 1;
                        if !hidden {
                            out.push(' ');
                        }
                    }
                    // Non-breaking hyphen.
                    b'_' => {
                        index += 1;
                        if !hidden {
                            out.push('-');
                        }
                    }
                    // A backslash before a line break is equivalent to `\par`.
                    b'\n' | b'\r' => {
                        index += 1;
                        if !hidden {
                            out.push(' ');
                        }
                    }
                    _ if symbol.is_ascii_alphabetic() => {
                        let word_start = index;
                        while index < bytes.len() && bytes[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let word = &bytes[word_start..index];

                        // Optional numeric parameter; a '-' only belongs to it
                        // when a digit follows.
                        let param_start = index;
                        if bytes.get(index) == Some(&b'-')
                            && matches!(bytes.get(index + 1), Some(digit) if digit.is_ascii_digit())
                        {
                            index += 1;
                        }
                        while index < bytes.len() && bytes[index].is_ascii_digit() {
                            index += 1;
                        }
                        let param = std::str::from_utf8(&bytes[param_start..index])
                            .ok()
                            .and_then(|digits| digits.parse::<i32>().ok());
                        // A single space after a control word is its delimiter.
                        if bytes.get(index) == Some(&b' ') {
                            index += 1;
                        }

                        let ignorable = std::mem::take(&mut ignorable_pending);
                        // A control word ends any pending `\u` fallback run.
                        fallback_pending = 0;
                        let is_metadata = matches!(
                            word,
                            b"fonttbl"
                                | b"colortbl"
                                | b"stylesheet"
                                | b"info"
                                | b"pict"
                                | b"listtable"
                                | b"listoverridetable"
                                | b"filetbl"
                                | b"revtbl"
                        );

                        if ignorable || is_metadata {
                            if skip_depth.is_none() {
                                skip_depth = Some(depth);
                            }
                        } else if !hidden {
                            match word {
                                b"par" | b"line" | b"sect" | b"page" | b"column" | b"tab"
                                | b"cell" | b"row" | b"emspace" | b"enspace" => out.push(' '),
                                b"emdash" => out.push('\u{2014}'),
                                b"endash" => out.push('\u{2013}'),
                                b"lquote" => out.push('\u{2018}'),
                                b"rquote" => out.push('\u{2019}'),
                                b"ldblquote" => out.push('\u{201C}'),
                                b"rdblquote" => out.push('\u{201D}'),
                                b"bullet" => out.push('\u{2022}'),
                                b"uc" => {
                                    if let Some(count) = param {
                                        if count >= 0 {
                                            fallback_len = count as usize;
                                        }
                                    }
                                }
                                b"u" => {
                                    if let Some(value) = param {
                                        if let Some(ch) =
                                            decode_rtf_unicode(value, &mut high_surrogate)
                                        {
                                            out.push(ch);
                                        }
                                        fallback_pending = fallback_len;
                                    }
                                }
                                _ => {}
                            }
                        }
                    }
                    // Any other control symbol carries no text.
                    _ => {
                        index += 1;
                    }
                }
            }
            _ => push_rtf_char(
                &mut out,
                hidden,
                &mut fallback_pending,
                decode_cp1252_byte(byte),
            ),
        }
    }

    out.split_whitespace().collect::<Vec<_>>().join(" ")
}
315
/// Size limits applied by `extract_file_text`.
#[derive(Debug, Clone)]
pub struct ExtractLimits {
    /// Largest file size, in bytes, that will be processed at all.
    pub max_file_bytes: u64,
    /// Maximum number of characters kept in the returned text.
    pub max_output_chars: usize,
    /// Maximum number of bytes read from each XML part of a DOCX/PPTX file.
    pub max_xml_bytes: usize,
    /// Maximum number of spreadsheet sheets rendered.
    pub max_sheets: usize,
    /// Maximum number of rows rendered per sheet.
    pub max_rows: usize,
    /// Maximum number of columns rendered per row.
    pub max_cols: usize,
}

impl Default for ExtractLimits {
    /// 25 MiB files, 200 000 output characters, 5 MiB per XML part,
    /// 6 sheets of at most 200 rows by 30 columns.
    fn default() -> Self {
        const MEBIBYTE: u64 = 1 << 20;

        ExtractLimits {
            max_file_bytes: 25 * MEBIBYTE,
            max_output_chars: 200_000,
            max_xml_bytes: 5 << 20,
            max_sheets: 6,
            max_rows: 200,
            max_cols: 30,
        }
    }
}
338
339pub fn extract_file_text(path: &PathBuf, limits: ExtractLimits) -> Result<String> {
340 if !path.exists() {
341 return Err(DocumentError::NotFound(format!(
342 "File does not exist: {}",
343 path.display()
344 )));
345 }
346 if !path.is_file() {
347 return Err(DocumentError::InvalidDocument(format!(
348 "Path is not a file: {}",
349 path.display()
350 )));
351 }
352
353 let metadata = fs::metadata(path)?;
354 if metadata.len() > limits.max_file_bytes {
355 return Err(DocumentError::InvalidDocument(format!(
356 "File too large for text extraction: {} bytes (limit: {} bytes)",
357 metadata.len(),
358 limits.max_file_bytes
359 )));
360 }
361
362 let ext = lower_ext(path.as_path()).unwrap_or_default();
363 let text = match ext.as_str() {
364 "pdf" => extract_text_pdf(path.as_path())?,
365 "docx" => extract_text_docx(path.as_path(), limits.max_xml_bytes)?,
366 "pptx" => extract_text_pptx(path.as_path(), limits.max_xml_bytes)?,
367 "xlsx" | "xls" | "ods" | "xlsb" => extract_text_spreadsheet(
368 path.as_path(),
369 limits.max_sheets,
370 limits.max_rows,
371 limits.max_cols,
372 )?,
373 "rtf" => {
374 let bytes = fs::read(path)?;
375 extract_text_rtf(&bytes)
376 }
377 _ => fs::read_to_string(path)?,
378 };
379
380 Ok(truncate_output(text, limits.max_output_chars))
381}