#[cfg(feature = "content-processing")]
use crate::content_processing::{
    ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
    ExtractedTable, FormatHandler, Heading, ProcessingStats,
};
#[cfg(feature = "content-processing")]
use anyhow::{anyhow, Result};
#[cfg(feature = "content-processing")]
use std::collections::HashMap;

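/// Shared helpers for Office Open XML documents (DOCX, PPTX, XLSX), which are
/// ZIP archives containing XML parts. The default methods pull a named XML
/// part out of the archive, reduce it to its text nodes, and read common
/// metadata from `docProps/core.xml`.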
#[cfg(feature = "content-processing")]
pub trait OfficeDocumentHandler {
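    /// Open the document as a ZIP archive, locate the main XML part at
    /// `main_xml_path`, and return its plain-text content.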
    fn extract_from_zip(&self, data: &[u8], main_xml_path: &str) -> Result<String> {
        let cursor = std::io::Cursor::new(data);
        let mut archive = zip::ZipArchive::new(cursor)
            .map_err(|e| anyhow!("Failed to open ZIP archive: {}", e))?;

        let file = archive
            .by_name(main_xml_path)
            .map_err(|e| anyhow!("Main content file not found: {}", e))?;

        let content =
            std::io::read_to_string(file).map_err(|e| anyhow!("Failed to read content: {}", e))?;

        self.extract_text_from_xml(&content)
    }

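    /// Collect the character data of text elements (`w:t`, `a:t`, `c`) from an
    /// XML part and join the pieces with spaces.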
    fn extract_text_from_xml(&self, xml: &str) -> Result<String> {
        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut text_content = Vec::new();
        let mut in_text = false;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(quick_xml::events::Event::Start(ref e)) => {
                    match e.name().as_ref() {
                        b"w:t" | b"a:t" | b"c" => in_text = true,
                        _ => {}
                    }
                }
                Ok(quick_xml::events::Event::End(ref e)) => match e.name().as_ref() {
                    b"w:t" | b"a:t" | b"c" => in_text = false,
                    _ => {}
                },
                Ok(quick_xml::events::Event::Text(e)) if in_text => {
                    let inner = e.into_inner();
                    let text = String::from_utf8_lossy(inner.as_ref());
                    text_content.push(text.to_string());
                }
                Ok(quick_xml::events::Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(text_content.join(" "))
    }

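    /// Read Dublin Core metadata (title, author, subject, description) from
    /// `docProps/core.xml`, plus the raw document size in bytes.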
    fn extract_metadata_from_zip(&self, data: &[u8]) -> HashMap<String, String> {
        let mut metadata = HashMap::new();

        let cursor = std::io::Cursor::new(data);
        if let Ok(mut archive) = zip::ZipArchive::new(cursor) {
            if let Ok(file) = archive.by_name("docProps/core.xml") {
                if let Ok(content) = std::io::read_to_string(file) {
                    let mut reader = quick_xml::Reader::from_str(&content);
                    let mut buf = Vec::new();
                    let mut current_element = String::new();

                    loop {
                        match reader.read_event_into(&mut buf) {
                            Ok(quick_xml::events::Event::Start(ref e)) => {
                                current_element =
                                    String::from_utf8_lossy(e.name().as_ref()).to_string();
                            }
                            Ok(quick_xml::events::Event::End(_)) => {
                                // Clear the element name so whitespace between elements
                                // cannot overwrite a value that was already captured.
                                current_element.clear();
                            }
                            Ok(quick_xml::events::Event::Text(e)) => {
                                let inner = e.into_inner();
                                let text = String::from_utf8_lossy(inner.as_ref());
                                match current_element.as_str() {
                                    "dc:title" => {
                                        metadata.insert("title".to_string(), text.to_string());
                                    }
                                    "dc:creator" => {
                                        metadata.insert("author".to_string(), text.to_string());
                                    }
                                    "dc:subject" => {
                                        metadata.insert("subject".to_string(), text.to_string());
                                    }
                                    "dc:description" => {
                                        metadata
                                            .insert("description".to_string(), text.to_string());
                                    }
                                    _ => {}
                                }
                            }
                            Ok(quick_xml::events::Event::Eof) => break,
                            _ => {}
                        }
                        buf.clear();
                    }
                }
            }
        }

        metadata.insert("size".to_string(), data.len().to_string());
        metadata
    }
}

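/// Extracts text, metadata, and a heading outline from Microsoft Word
/// `.docx` documents.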
#[cfg(feature = "content-processing")]
pub struct DocxHandler;

#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for DocxHandler {}

#[cfg(feature = "content-processing")]
impl FormatHandler for DocxHandler {
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let text = self.extract_from_zip(data, "word/document.xml")?;
        let metadata = self.extract_metadata_from_zip(data);
        let title = metadata.get("title").cloned();

        let headings = self.extract_docx_headings(&text);

        Ok(ExtractedContent {
            format: DocumentFormat::Docx,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure: DocumentStructure {
                title,
                headings,
                page_count: 1,
                section_count: 1,
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    fn can_handle(&self, data: &[u8]) -> bool {
        if data.len() < 4 {
            return false;
        }

        if data[0..4] != [0x50, 0x4B, 0x03, 0x04] && data[0..4] != [0x50, 0x4B, 0x05, 0x06] {
            return false;
        }

        let cursor = std::io::Cursor::new(data);
        if let Ok(mut archive) = zip::ZipArchive::new(cursor) {
            archive.by_name("word/document.xml").is_ok()
                && archive.by_name("[Content_Types].xml").is_ok()
        } else {
            false
        }
    }

    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["docx"]
    }
}

#[cfg(feature = "content-processing")]
impl DocxHandler {
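    /// Heuristically pick out heading-like lines: short lines (up to eight
    /// words) that start with an uppercase character. DOCX style information
    /// is not consulted, so every match is reported as a level-1 heading.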
    fn extract_docx_headings(&self, text: &str) -> Vec<Heading> {
        let mut headings = Vec::new();

        for (i, line) in text.lines().enumerate() {
            let trimmed = line.trim();
            if trimmed.len() > 3 && trimmed.len() < 100 {
                let words: Vec<&str> = trimmed.split_whitespace().collect();
                if words.len() <= 8 && !words.is_empty() {
                    let first_char = trimmed.chars().next().unwrap_or(' ');
                    if first_char.is_uppercase() {
                        headings.push(Heading {
                            level: 1,
                            text: trimmed.to_string(),
                            location: ContentLocation {
                                page: None,
                                section: None,
                                char_offset: None,
                                line: Some(i + 1),
                                column: None,
                            },
                        });
                    }
                }
            }
        }

        headings
    }
}

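/// Extracts slide text and metadata from Microsoft PowerPoint `.pptx`
/// presentations; each slide's text becomes one paragraph of output.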
#[cfg(feature = "content-processing")]
pub struct PptxHandler;

#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for PptxHandler {}

#[cfg(feature = "content-processing")]
impl FormatHandler for PptxHandler {
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let mut all_text = Vec::new();

        let cursor = std::io::Cursor::new(data);
        let mut archive = zip::ZipArchive::new(cursor)
            .map_err(|e| anyhow!("Failed to open PPTX archive: {}", e))?;

        let file_names: Vec<String> = (0..archive.len())
            .filter_map(|i| {
                archive
                    .by_index(i)
                    .ok()
                    .map(|file| file.name().to_string())
                    .filter(|name| name.starts_with("ppt/slides/slide") && name.ends_with(".xml"))
            })
            .collect();

        for slide_name in file_names {
            if let Ok(file) = archive.by_name(&slide_name) {
                if let Ok(content) = std::io::read_to_string(file) {
                    if let Ok(slide_text) = self.extract_text_from_xml(&content) {
                        all_text.push(slide_text);
                    }
                }
            }
        }

        let text = all_text.join("\n\n");
        let metadata = self.extract_metadata_from_zip(data);
        let title = metadata.get("title").cloned();

        Ok(ExtractedContent {
            format: DocumentFormat::Pptx,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure: DocumentStructure {
                title,
                headings: Vec::new(),
                page_count: all_text.len(),
                section_count: all_text.len(),
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    fn can_handle(&self, data: &[u8]) -> bool {
        if data.len() < 4 {
            return false;
        }

        if data[0..4] != [0x50, 0x4B, 0x03, 0x04] && data[0..4] != [0x50, 0x4B, 0x05, 0x06] {
            return false;
        }

        let cursor = std::io::Cursor::new(data);
        if let Ok(mut archive) = zip::ZipArchive::new(cursor) {
            archive.by_name("ppt/presentation.xml").is_ok()
                && archive.by_name("[Content_Types].xml").is_ok()
        } else {
            false
        }
    }

    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["pptx"]
    }
}

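/// Extracts cell text and per-sheet tables from Microsoft Excel `.xlsx`
/// workbooks, resolving shared strings and emitting one table per worksheet.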
#[cfg(feature = "content-processing")]
pub struct XlsxHandler;

#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for XlsxHandler {}

#[cfg(feature = "content-processing")]
impl FormatHandler for XlsxHandler {
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let cursor = std::io::Cursor::new(data);
        let mut archive = zip::ZipArchive::new(cursor)
            .map_err(|e| anyhow!("Failed to open XLSX archive: {}", e))?;

        let shared_strings = self.extract_shared_strings(&mut archive)?;

        let (text, tables) = self.extract_worksheets(&mut archive, &shared_strings, config)?;
        let metadata = self.extract_metadata_from_zip(data);
        let title = metadata.get("title").cloned();

        Ok(ExtractedContent {
            format: DocumentFormat::Xlsx,
            text,
            metadata,
            images: Vec::new(),
            tables,
            links: Vec::new(),
            structure: DocumentStructure {
                title,
                headings: Vec::new(),
                page_count: 1,
                section_count: 1,
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    fn can_handle(&self, data: &[u8]) -> bool {
        if data.len() < 4 {
            return false;
        }

        if data[0..4] != [0x50, 0x4B, 0x03, 0x04] && data[0..4] != [0x50, 0x4B, 0x05, 0x06] {
            return false;
        }

        let cursor = std::io::Cursor::new(data);
        if let Ok(mut archive) = zip::ZipArchive::new(cursor) {
            archive.by_name("xl/workbook.xml").is_ok()
                && archive.by_name("[Content_Types].xml").is_ok()
        } else {
            false
        }
    }

    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["xlsx"]
    }
}

#[cfg(feature = "content-processing")]
impl XlsxHandler {
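    /// Read the shared-string table (`xl/sharedStrings.xml`), which stores the
    /// text values that worksheet cells of type `s` reference by index.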
    fn extract_shared_strings(
        &self,
        archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
    ) -> Result<Vec<String>> {
        let mut shared_strings = Vec::new();

        if let Ok(file) = archive.by_name("xl/sharedStrings.xml") {
            let content = std::io::read_to_string(file)
                .map_err(|e| anyhow!("Failed to read shared strings: {}", e))?;

            let mut reader = quick_xml::Reader::from_str(&content);
            let mut buf = Vec::new();
            let mut in_text = false;
            let mut current_string = String::new();

            loop {
                match reader.read_event_into(&mut buf) {
                    Ok(quick_xml::events::Event::Start(ref e)) => {
                        if e.name().as_ref() == b"t" {
                            in_text = true;
                            current_string.clear();
                        }
                    }
                    Ok(quick_xml::events::Event::End(ref e)) => {
                        if e.name().as_ref() == b"t" {
                            in_text = false;
                        } else if e.name().as_ref() == b"si" {
                            shared_strings.push(current_string.clone());
                            current_string.clear();
                        }
                    }
                    Ok(quick_xml::events::Event::Text(e)) if in_text => {
                        let inner = e.into_inner();
                        let text = String::from_utf8_lossy(inner.as_ref());
                        current_string.push_str(&text);
                    }
                    Ok(quick_xml::events::Event::Eof) => break,
                    Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                    _ => {}
                }
                buf.clear();
            }
        }

        Ok(shared_strings)
    }

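    /// Walk every `xl/worksheets/sheet*.xml` part, collect its text, and (when
    /// `config.extract_tables` is set) turn each non-empty sheet into an
    /// `ExtractedTable` captioned with its sheet number.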
    fn extract_worksheets(
        &self,
        archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
        shared_strings: &[String],
        config: &ContentExtractionConfig,
    ) -> Result<(String, Vec<ExtractedTable>)> {
        let mut all_text = Vec::new();
        let mut tables = Vec::new();

        let file_names: Vec<String> = (0..archive.len())
            .filter_map(|i| {
                archive
                    .by_index(i)
                    .ok()
                    .map(|file| file.name().to_string())
                    .filter(|name| {
                        name.starts_with("xl/worksheets/sheet") && name.ends_with(".xml")
                    })
            })
            .collect();

        for (sheet_index, sheet_name) in file_names.iter().enumerate() {
            if let Ok(file) = archive.by_name(sheet_name) {
                if let Ok(content) = std::io::read_to_string(file) {
                    let (sheet_text, sheet_table) =
                        self.extract_sheet_content(&content, shared_strings)?;
                    all_text.push(sheet_text);

                    if config.extract_tables && !sheet_table.rows.is_empty() {
                        let mut table = sheet_table;
                        table.caption = Some(format!("Sheet {}", sheet_index + 1));
                        tables.push(table);
                    }
                }
            }
        }

        Ok((all_text.join("\n\n"), tables))
    }

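    /// Parse a single worksheet's XML into a flat list of `(row, col, value)`
    /// cells, resolving shared-string references, then fold them into text
    /// plus an `ExtractedTable` via `cells_to_table`.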
    fn extract_sheet_content(
        &self,
        xml: &str,
        shared_strings: &[String],
    ) -> Result<(String, ExtractedTable)> {
        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut cells = Vec::new();
        let mut current_cell = (0, 0, String::new());
        let mut in_value = false;
        let mut cell_type_owned = String::from("str");
        let mut row_index = 0;
        let mut col_index = 0;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(quick_xml::events::Event::Start(ref e)) => {
                    match e.name().as_ref() {
                        b"c" => {
                            for attr in e.attributes().flatten() {
                                match attr.key.as_ref() {
                                    b"r" => {
                                        let cell_ref = String::from_utf8_lossy(&attr.value);
                                        (col_index, row_index) =
                                            self.parse_cell_reference(&cell_ref);
                                    }
                                    b"t" => {
                                        cell_type_owned =
                                            String::from_utf8_lossy(&attr.value).to_string();
                                    }
                                    _ => {}
                                }
                            }
                        }
                        b"v" => {
                            in_value = true;
                            current_cell = (row_index, col_index, String::new());
                        }
                        _ => {}
                    }
                }
                Ok(quick_xml::events::Event::End(ref e)) => {
                    match e.name().as_ref() {
                        b"c" => {
                            if !current_cell.2.is_empty() {
                                cells.push(current_cell.clone());
                                // Reset the buffered value so a later cell without a
                                // <v> element cannot push this value a second time.
                                current_cell.2.clear();
                            }
                            cell_type_owned = String::from("str");
                        }
                        b"v" => {
                            in_value = false;
                        }
                        _ => {}
                    }
                }
                Ok(quick_xml::events::Event::Text(e)) if in_value => {
                    let inner = e.into_inner();
                    let text = String::from_utf8_lossy(inner.as_ref());
                    if cell_type_owned == "s" {
                        if let Ok(index) = text.parse::<usize>() {
                            if index < shared_strings.len() {
                                current_cell.2 = shared_strings[index].clone();
                            }
                        }
                    } else {
                        current_cell.2 = text.to_string();
                    }
                }
                Ok(quick_xml::events::Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        let (text, table) = self.cells_to_table(cells);

        Ok((text, table))
    }

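    /// Convert an A1-style cell reference (e.g. "B3") into zero-based
    /// `(column, row)` indices.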
    fn parse_cell_reference(&self, cell_ref: &str) -> (usize, usize) {
        let mut col = 0;
        let mut row = 0;
        let mut i = 0;

        for ch in cell_ref.chars() {
            if ch.is_alphabetic() {
                col = col * 26 + (ch.to_ascii_uppercase() as u8 - b'A') as usize + 1;
                i += 1;
            } else {
                break;
            }
        }

        if let Ok(row_num) = cell_ref[i..].parse::<usize>() {
            row = row_num;
        }

        (col.saturating_sub(1), row.saturating_sub(1))
    }

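    /// Arrange sparse `(row, col, value)` cells into a dense grid, treat the
    /// first row as headers, and render the grid as pipe-separated text.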
    fn cells_to_table(&self, cells: Vec<(usize, usize, String)>) -> (String, ExtractedTable) {
        if cells.is_empty() {
            return (
                String::new(),
                ExtractedTable {
                    headers: Vec::new(),
                    rows: Vec::new(),
                    caption: None,
                    location: ContentLocation {
                        page: Some(1),
                        section: None,
                        char_offset: None,
                        line: None,
                        column: None,
                    },
                },
            );
        }

        let max_row = cells.iter().map(|(r, _, _)| *r).max().unwrap_or(0);
        let max_col = cells.iter().map(|(_, c, _)| *c).max().unwrap_or(0);

        let mut grid = vec![vec![String::new(); max_col + 1]; max_row + 1];
        for (row, col, value) in cells {
            if row <= max_row && col <= max_col {
                grid[row][col] = value;
            }
        }

        let headers = if !grid.is_empty() {
            grid[0].clone()
        } else {
            Vec::new()
        };

        let rows = if grid.len() > 1 {
            grid[1..].to_vec()
        } else {
            Vec::new()
        };

        let mut text_parts = Vec::new();
        for row in &grid {
            let row_text = row
                .iter()
                .filter(|cell| !cell.is_empty())
                .cloned()
                .collect::<Vec<_>>()
                .join(" | ");
            if !row_text.is_empty() {
                text_parts.push(row_text);
            }
        }
        let text = text_parts.join("\n");

        let table = ExtractedTable {
            headers,
            rows,
            caption: None,
            location: ContentLocation {
                page: Some(1),
                section: None,
                char_offset: None,
                line: None,
                column: None,
            },
        };

        (text, table)
    }
}