1use calamine::{open_workbook_auto, Data, Reader};
2use quick_xml::events::Event;
3use quick_xml::Reader as XmlReader;
4use std::collections::BTreeMap;
5use std::fs;
6use std::io::{Cursor, Read};
7use std::path::{Path, PathBuf};
8use thiserror::Error;
9use zip::ZipArchive;
10
/// Errors reported by the text-extraction functions in this module.
#[derive(Error, Debug)]
pub enum DocumentError {
    /// An underlying I/O operation failed; `std::io::Error` converts into this
    /// variant automatically (via `#[from]`), so `?` works on I/O results.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// The requested path does not exist. The payload is a human-readable message.
    #[error("File not found: {0}")]
    NotFound(String),

    /// The input is not something that can be processed: not a regular file,
    /// over the size limit, or a container that cannot be opened.
    #[error("Invalid document: {0}")]
    InvalidDocument(String),

    /// The document was opened but reading or parsing its content failed.
    #[error("Extraction failed: {0}")]
    ExtractionFailed(String),
}
25
/// Result alias used throughout this module; the error type is [`DocumentError`].
pub type Result<T> = std::result::Result<T, DocumentError>;
27
/// Returns the extension of `path` in ASCII lowercase.
///
/// Yields `None` when the path has no extension or the extension is not
/// valid UTF-8.
fn lower_ext(path: &Path) -> Option<String> {
    let raw = path.extension()?.to_str()?;
    Some(raw.to_ascii_lowercase())
}
33
/// Limits `text` to at most `max_chars` characters (Unicode scalar values).
///
/// * `max_chars == 0` always yields an empty string, with no marker.
/// * Text that already fits is returned unchanged.
/// * Longer text is cut after `max_chars` characters and a
///   `...[truncated]...` marker is appended.
fn truncate_output(text: String, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }

    // Byte offset of the first character that does not fit, if there is one.
    let cut = text
        .char_indices()
        .nth(max_chars)
        .map(|(offset, _)| offset);

    match cut {
        None => text,
        Some(offset) => {
            let mut clipped = text;
            clipped.truncate(offset);
            clipped.push_str("\n\n...[truncated]...\n");
            clipped
        }
    }
}
49
50fn read_zip_entry(path: &Path, inner_path: &str, max_bytes: usize) -> Result<Vec<u8>> {
51 let bytes = fs::read(path)?;
52 let cursor = Cursor::new(bytes);
53 let mut archive = ZipArchive::new(cursor).map_err(|err| {
54 DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
55 })?;
56
57 let mut entry = archive.by_name(inner_path).map_err(|err| {
58 DocumentError::InvalidDocument(format!(
59 "Zip entry '{}' not found in {:?}: {}",
60 inner_path, path, err
61 ))
62 })?;
63
64 let mut out = Vec::new();
65 let mut buffer = [0u8; 16 * 1024];
66 while out.len() < max_bytes {
67 let remaining = max_bytes - out.len();
68 let read_len = remaining.min(buffer.len());
69 let read = entry.read(&mut buffer[..read_len]).map_err(|err| {
70 DocumentError::ExtractionFailed(format!("Failed reading zip entry: {}", err))
71 })?;
72 if read == 0 {
73 break;
74 }
75 out.extend_from_slice(&buffer[..read]);
76 }
77
78 Ok(out)
79}
80
/// Ends the current line of `out` with a newline.
///
/// Nothing is appended when `out` is empty or already ends with `'\n'`, so
/// consecutive calls never produce blank lines.
fn append_paragraph_break(out: &mut String) {
    let last = out.chars().next_back();
    if matches!(last, Some(ch) if ch != '\n') {
        out.push('\n');
    }
}
86
/// Which flavour of OOXML part is being parsed by `extract_ooxml_text`.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum OoxmlKind {
    /// A word-processing part (used for `word/document.xml`); tab and break
    /// elements are translated to whitespace for this kind only.
    Word,
    /// A presentation slide part (used for `ppt/slides/slide*.xml`).
    Presentation,
}
92
93fn extract_ooxml_text(xml: &[u8], kind: OoxmlKind) -> Result<String> {
94 let mut reader = XmlReader::from_reader(xml);
95 reader.config_mut().trim_text(false);
96
97 let mut out = String::new();
98 let mut in_text = false;
99 let mut buf = Vec::new();
100
101 loop {
102 match reader.read_event_into(&mut buf) {
103 Ok(Event::Start(event)) => {
104 let name = event.name();
105 let name = name.as_ref();
106 if name.ends_with(b"t") {
107 in_text = true;
108 } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"tab") {
109 out.push('\t');
110 } else if matches!(kind, OoxmlKind::Word) && name.ends_with(b"br") {
111 out.push('\n');
112 } else if name.ends_with(b"p") {
113 append_paragraph_break(&mut out);
114 }
115 }
116 Ok(Event::End(_)) => {
117 in_text = false;
118 }
119 Ok(Event::Text(text)) if in_text => {
120 let decoded = text.decode().map_err(|err| {
121 DocumentError::ExtractionFailed(format!("XML decode/unescape error: {}", err))
122 })?;
123 out.push_str(&decoded);
124 }
125 Ok(Event::Eof) => break,
126 Err(err) => {
127 let label = match kind {
128 OoxmlKind::Word => "OOXML XML",
129 OoxmlKind::Presentation => "PPTX XML",
130 };
131 return Err(DocumentError::ExtractionFailed(format!(
132 "Failed parsing {}: {}",
133 label, err
134 )));
135 }
136 _ => {}
137 }
138
139 buf.clear();
140 }
141
142 Ok(out)
143}
144
145fn extract_text_docx(path: &Path, max_xml_bytes: usize) -> Result<String> {
146 let xml = read_zip_entry(path, "word/document.xml", max_xml_bytes)?;
147 extract_ooxml_text(&xml, OoxmlKind::Word)
148}
149
150fn extract_text_pptx(path: &Path, max_xml_bytes: usize) -> Result<String> {
151 let bytes = fs::read(path)?;
152 let cursor = Cursor::new(bytes);
153 let mut archive = ZipArchive::new(cursor).map_err(|err| {
154 DocumentError::InvalidDocument(format!("Failed to open zip container {:?}: {}", path, err))
155 })?;
156
157 let mut slides = BTreeMap::new();
158 for idx in 0..archive.len() {
159 let Ok(file) = archive.by_index(idx) else {
160 continue;
161 };
162 let name = file.name().to_string();
163 if !name.starts_with("ppt/slides/slide") || !name.ends_with(".xml") {
164 continue;
165 }
166
167 let mut buf = Vec::new();
168 file.take(max_xml_bytes as u64)
169 .read_to_end(&mut buf)
170 .map_err(|err| {
171 DocumentError::ExtractionFailed(format!("Failed reading slide XML: {}", err))
172 })?;
173 let text = extract_ooxml_text(&buf, OoxmlKind::Presentation)?;
174 slides.insert(name, text);
175 }
176
177 if slides.is_empty() {
178 return Err(DocumentError::InvalidDocument(format!(
179 "No slide XML found in {:?}",
180 path
181 )));
182 }
183
184 let mut out = String::new();
185 for (name, text) in slides {
186 out.push_str("# ");
187 out.push_str(&name);
188 out.push('\n');
189 out.push_str(text.trim());
190 out.push_str("\n\n");
191 }
192 Ok(out)
193}
194
195fn extract_text_spreadsheet(
196 path: &Path,
197 max_sheets: usize,
198 max_rows: usize,
199 max_cols: usize,
200) -> Result<String> {
201 let mut workbook = open_workbook_auto(path).map_err(|err| {
202 DocumentError::InvalidDocument(format!("Failed to open spreadsheet {:?}: {}", path, err))
203 })?;
204
205 let mut out = String::new();
206 for (sheet_index, sheet_name) in workbook.sheet_names().iter().cloned().enumerate() {
207 if sheet_index >= max_sheets {
208 out.push_str("\n...[more sheets truncated]...\n");
209 break;
210 }
211
212 let range = match workbook.worksheet_range(&sheet_name) {
213 Ok(range) => range,
214 Err(_) => continue,
215 };
216
217 out.push_str("# Sheet: ");
218 out.push_str(&sheet_name);
219 out.push('\n');
220
221 for (row_index, row) in range.rows().take(max_rows).enumerate() {
222 if row_index > 0 {
223 out.push('\n');
224 }
225
226 for (col_index, cell) in row.iter().take(max_cols).enumerate() {
227 if col_index > 0 {
228 out.push('\t');
229 }
230 if !matches!(cell, Data::Empty) {
231 out.push_str(&cell.to_string());
232 }
233 }
234 }
235 out.push_str("\n\n");
236 }
237
238 Ok(out)
239}
240
241fn extract_text_pdf(path: &Path) -> Result<String> {
242 pdf_extract::extract_text(path).map_err(|err| {
243 DocumentError::ExtractionFailed(format!("Failed to extract PDF text {:?}: {}", path, err))
244 })
245}
246
/// Parses the two hex digits of a `\'hh` escape starting at `at`.
fn rtf_hex_byte(bytes: &[u8], at: usize) -> Option<u8> {
    let high = char::from(*bytes.get(at)?).to_digit(16)?;
    let low = char::from(*bytes.get(at + 1)?).to_digit(16)?;
    // Two hex digits never exceed 0xFF, so the cast cannot truncate.
    Some((high * 16 + low) as u8)
}

/// Maps a byte to a character using Windows-1252, the usual RTF ANSI code page.
///
/// Only 0x80..=0x9F differs from Latin-1; the five positions Windows-1252
/// leaves undefined keep their Latin-1 (C1 control) value.
fn decode_windows_1252(byte: u8) -> char {
    const HIGH: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', '\u{201E}', '\u{2026}', '\u{2020}',
        '\u{2021}', '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', '\u{0152}', '\u{008D}',
        '\u{017D}', '\u{008F}', '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', '\u{201D}',
        '\u{2022}', '\u{2013}', '\u{2014}', '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}',
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}',
    ];
    match byte {
        0x80..=0x9F => HIGH[usize::from(byte - 0x80)],
        _ => char::from(byte),
    }
}

/// Appends one UTF-16 code unit from a `\uN` escape, joining surrogate pairs.
///
/// A high surrogate is parked in `pending_high` until its low half arrives;
/// unpaired surrogates are dropped.
fn push_utf16_unit(out: &mut String, pending_high: &mut Option<u16>, unit: u16) {
    let high = pending_high.take();
    match unit {
        0xD800..=0xDBFF => *pending_high = Some(unit),
        0xDC00..=0xDFFF => {
            if let Some(high) = high {
                let pair = [high, unit];
                out.extend(std::char::decode_utf16(pair.iter().copied()).filter_map(|r| r.ok()));
            }
        }
        _ => out.extend(char::from_u32(u32::from(unit))),
    }
}

/// Skips up to `count` fallback characters that follow a `\uN` escape and
/// returns the new index. A fallback is either a plain byte or a `\'hh`
/// escape; a brace or any other control sequence ends the run early.
fn skip_unicode_fallback(bytes: &[u8], mut index: usize, count: usize) -> usize {
    for _ in 0..count {
        match bytes.get(index).copied() {
            Some(b'\\') => {
                if bytes.get(index + 1).copied() == Some(b'\'') {
                    index = (index + 4).min(bytes.len());
                } else {
                    break;
                }
            }
            Some(b'{') | Some(b'}') | None => break,
            Some(_) => index += 1,
        }
    }
    index
}

/// Extracts the plain text of an RTF document.
///
/// This is a best-effort reader, not a full RTF implementation:
///
/// * `\par`, `\line`, `\tab`, `\cell`, `\row`, `\sect`, `\page`, `\~` and a
///   backslash-newline become a space, so adjacent words never fuse;
/// * `\'hh` is decoded as Windows-1252 and `\uN` as a UTF-16 code unit, with
///   the following `\ucN` fallback characters skipped;
/// * the font, colour and style tables, `\info`, `\pict` data and every
///   ignorable `{\*...}` destination are skipped;
/// * all other control words are dropped, and raw bytes are taken as Latin-1.
///
/// The result has all whitespace runs collapsed to single spaces.
fn extract_text_rtf(bytes: &[u8]) -> String {
    let mut out = String::new();
    let mut index = 0usize;
    // Current `{` nesting level, and the level of the group being skipped.
    let mut depth = 0usize;
    let mut skip_depth: Option<usize> = None;
    // Number of fallback characters after each `\uN` (RTF default is 1).
    let mut fallback_len = 1usize;
    let mut pending_high: Option<u16> = None;

    while index < bytes.len() {
        let byte = bytes[index];
        index += 1;

        match byte {
            b'{' => depth += 1,
            b'}' => {
                if skip_depth == Some(depth) {
                    skip_depth = None;
                }
                depth = depth.saturating_sub(1);
            }
            b'\\' => {
                let Some(&next) = bytes.get(index) else {
                    break;
                };
                let emit = skip_depth.is_none();

                match next {
                    b'\\' | b'{' | b'}' => {
                        index += 1;
                        if emit {
                            out.push(char::from(next));
                        }
                    }
                    b'\'' => {
                        index += 1;
                        if let Some(value) = rtf_hex_byte(bytes, index) {
                            index += 2;
                            if emit {
                                out.push(decode_windows_1252(value));
                            }
                        }
                    }
                    // Backslash-newline is a paragraph break; `\~` is a
                    // non-breaking space.
                    b'\n' | b'\r' | b'~' => {
                        index += 1;
                        if emit {
                            out.push(' ');
                        }
                    }
                    b'_' => {
                        index += 1;
                        if emit {
                            out.push('-');
                        }
                    }
                    // `\*` marks a destination that unaware readers must skip.
                    b'*' => {
                        index += 1;
                        if emit && depth > 0 {
                            skip_depth = Some(depth);
                        }
                    }
                    _ if next.is_ascii_alphabetic() => {
                        let word_start = index;
                        while index < bytes.len() && bytes[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let word = &bytes[word_start..index];

                        // Optional numeric parameter; `-` belongs to it only
                        // when a digit follows.
                        let param_start = index;
                        if bytes.get(index) == Some(&b'-')
                            && matches!(bytes.get(index + 1), Some(digit) if digit.is_ascii_digit())
                        {
                            index += 1;
                        }
                        while index < bytes.len() && bytes[index].is_ascii_digit() {
                            index += 1;
                        }
                        let param = std::str::from_utf8(&bytes[param_start..index])
                            .ok()
                            .and_then(|digits| digits.parse::<i32>().ok());

                        // A single space terminates the control word and is
                        // not part of the text.
                        if bytes.get(index) == Some(&b' ') {
                            index += 1;
                        }

                        match word {
                            b"par" | b"line" | b"tab" | b"cell" | b"row" | b"sect" | b"page" => {
                                if emit {
                                    out.push(' ');
                                }
                            }
                            b"uc" => {
                                if let Some(len) = param.filter(|v| *v >= 0) {
                                    fallback_len = len as usize;
                                }
                            }
                            b"u" => {
                                if let Some(value) = param {
                                    if emit {
                                        // Negative values are 16-bit code units
                                        // written as signed; truncation to u16
                                        // restores them.
                                        push_utf16_unit(&mut out, &mut pending_high, value as u16);
                                    }
                                    index = skip_unicode_fallback(bytes, index, fallback_len);
                                }
                            }
                            b"bin" => {
                                // `\binN` is followed by N raw bytes.
                                let len = param.filter(|v| *v >= 0).map_or(0, |v| v as usize);
                                index = index.saturating_add(len).min(bytes.len());
                            }
                            b"fonttbl" | b"colortbl" | b"stylesheet" | b"info" | b"pict" => {
                                if emit && depth > 0 {
                                    skip_depth = Some(depth);
                                }
                            }
                            _ => {}
                        }
                    }
                    // Any other control symbol carries no text.
                    _ => index += 1,
                }
            }
            b'\n' | b'\r' => {}
            other => {
                if skip_depth.is_none() {
                    out.push(char::from(other));
                }
            }
        }
    }

    out.split_whitespace().collect::<Vec<_>>().join(" ")
}
310
/// Size limits applied by [`extract_file_text`].
#[derive(Debug, Clone)]
pub struct ExtractLimits {
    /// Largest input file, in bytes, that will be processed at all.
    pub max_file_bytes: u64,
    /// Maximum number of characters kept in the returned text.
    pub max_output_chars: usize,
    /// Maximum number of bytes read from a single XML part of a
    /// `.docx` / `.pptx` container.
    pub max_xml_bytes: usize,
    /// Maximum number of spreadsheet sheets rendered.
    pub max_sheets: usize,
    /// Maximum number of rows rendered per sheet.
    pub max_rows: usize,
    /// Maximum number of columns rendered per row.
    pub max_cols: usize,
}

impl Default for ExtractLimits {
    /// 25 MiB input, 200 000 output characters, 5 MiB per XML part,
    /// 6 sheets of 200 rows by 30 columns.
    fn default() -> Self {
        const MIB: u64 = 1 << 20;

        ExtractLimits {
            max_file_bytes: 25 * MIB,
            max_output_chars: 200_000,
            max_xml_bytes: 5 << 20,
            max_sheets: 6,
            max_rows: 200,
            max_cols: 30,
        }
    }
}
333
334pub fn extract_file_text(path: &PathBuf, limits: ExtractLimits) -> Result<String> {
335 if !path.exists() {
336 return Err(DocumentError::NotFound(format!(
337 "File does not exist: {}",
338 path.display()
339 )));
340 }
341 if !path.is_file() {
342 return Err(DocumentError::InvalidDocument(format!(
343 "Path is not a file: {}",
344 path.display()
345 )));
346 }
347
348 let metadata = fs::metadata(path)?;
349 if metadata.len() > limits.max_file_bytes {
350 return Err(DocumentError::InvalidDocument(format!(
351 "File too large for text extraction: {} bytes (limit: {} bytes)",
352 metadata.len(),
353 limits.max_file_bytes
354 )));
355 }
356
357 let ext = lower_ext(path.as_path()).unwrap_or_default();
358 let text = match ext.as_str() {
359 "pdf" => extract_text_pdf(path.as_path())?,
360 "docx" => extract_text_docx(path.as_path(), limits.max_xml_bytes)?,
361 "pptx" => extract_text_pptx(path.as_path(), limits.max_xml_bytes)?,
362 "xlsx" | "xls" | "ods" | "xlsb" => extract_text_spreadsheet(
363 path.as_path(),
364 limits.max_sheets,
365 limits.max_rows,
366 limits.max_cols,
367 )?,
368 "rtf" => {
369 let bytes = fs::read(path)?;
370 extract_text_rtf(&bytes)
371 }
372 _ => fs::read_to_string(path)?,
373 };
374
375 Ok(truncate_output(text, limits.max_output_chars))
376}