1#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7 ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8 ExtractedTable, FormatHandler, Heading, ProcessingStats,
9};
10#[cfg(feature = "content-processing")]
11use anyhow::{anyhow, Result};
12#[cfg(feature = "content-processing")]
13use std::collections::HashMap;
14
#[cfg(feature = "content-processing")]
/// Shared helpers for OOXML (ZIP-packaged) Office formats: DOCX, PPTX, XLSX.
///
/// All methods have default implementations, so concrete format handlers only
/// need an empty `impl` block to pick them up.
pub trait OfficeDocumentHandler {
    /// Opens `data` as a ZIP archive, extracts the part at `main_xml_path`,
    /// and returns the plain text found in its recognized text elements.
    ///
    /// # Errors
    /// Fails when the archive cannot be opened, the part is missing, the part
    /// is not valid UTF-8, or the XML is malformed.
    fn extract_from_zip(&self, data: &[u8], main_xml_path: &str) -> Result<String> {
        let cursor = std::io::Cursor::new(data);
        let mut archive = oxiarc_archive::ZipReader::new(cursor)
            .map_err(|e| anyhow!("Failed to open ZIP archive: {}", e))?;

        let entry = archive
            .entry_by_name(main_xml_path)
            .cloned()
            .ok_or_else(|| anyhow!("Main content file not found: {}", main_xml_path))?;

        let bytes = archive.extract(&entry)?;
        let content = String::from_utf8(bytes)?;

        self.extract_text_from_xml(&content)
    }

    /// Collects character data of the known text elements (`w:t` for
    /// WordprocessingML, `a:t` for DrawingML, `c` for SpreadsheetML cells)
    /// and joins the runs with single spaces.
    fn extract_text_from_xml(&self, xml: &str) -> Result<String> {
        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut text_content = Vec::new();
        // True while the parser is inside one of the recognized text elements.
        let mut in_text = false;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(quick_xml::events::Event::Start(ref e)) => match e.name().as_ref() {
                    b"w:t" | b"a:t" | b"c" => in_text = true,
                    _ => {}
                },
                Ok(quick_xml::events::Event::End(ref e)) => match e.name().as_ref() {
                    b"w:t" | b"a:t" | b"c" => in_text = false,
                    _ => {}
                },
                Ok(quick_xml::events::Event::Text(e)) if in_text => {
                    let inner = e.into_inner();
                    let text = String::from_utf8_lossy(inner.as_ref());
                    text_content.push(text.to_string());
                }
                Ok(quick_xml::events::Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(text_content.join(" "))
    }

    /// Reads Dublin Core metadata from `docProps/core.xml` (best effort — all
    /// failures are silently ignored) and always records the payload size in
    /// bytes under the `"size"` key.
    fn extract_metadata_from_zip(&self, data: &[u8]) -> HashMap<String, String> {
        let mut metadata = HashMap::new();

        let cursor = std::io::Cursor::new(data);
        if let Ok(mut archive) = oxiarc_archive::ZipReader::new(cursor) {
            if let Some(entry) = archive.entry_by_name("docProps/core.xml").cloned() {
                if let Ok(data) = archive.extract(&entry) {
                    if let Ok(content) = String::from_utf8(data) {
                        let mut reader = quick_xml::Reader::from_str(&content);
                        let mut buf = Vec::new();
                        // Name of the most recently opened element; decides
                        // which metadata key a text node belongs to.
                        let mut current_element = String::new();

                        loop {
                            match reader.read_event_into(&mut buf) {
                                Ok(quick_xml::events::Event::Start(ref e)) => {
                                    current_element =
                                        String::from_utf8_lossy(e.name().as_ref()).to_string();
                                }
                                // BUGFIX: reset on element close so whitespace
                                // text between elements is not attributed to
                                // the previously opened element (which used to
                                // overwrite an already-captured value such as
                                // the title with the whitespace).
                                Ok(quick_xml::events::Event::End(_)) => {
                                    current_element.clear();
                                }
                                Ok(quick_xml::events::Event::Text(e)) => {
                                    let inner = e.into_inner();
                                    let text = String::from_utf8_lossy(inner.as_ref());
                                    match current_element.as_str() {
                                        "dc:title" => {
                                            metadata.insert("title".to_string(), text.to_string());
                                        }
                                        "dc:creator" => {
                                            metadata.insert("author".to_string(), text.to_string());
                                        }
                                        "dc:subject" => {
                                            metadata
                                                .insert("subject".to_string(), text.to_string());
                                        }
                                        "dc:description" => {
                                            metadata.insert(
                                                "description".to_string(),
                                                text.to_string(),
                                            );
                                        }
                                        _ => {}
                                    }
                                }
                                Ok(quick_xml::events::Event::Eof) => break,
                                _ => {}
                            }
                            buf.clear();
                        }
                    }
                }
            }
        }

        metadata.insert("size".to_string(), data.len().to_string());
        metadata
    }
}
125
#[cfg(feature = "content-processing")]
/// Content handler for Microsoft Word OOXML documents (`.docx`).
pub struct DocxHandler;
129
#[cfg(feature = "content-processing")]
// Relies entirely on the trait's default ZIP/XML extraction helpers.
impl OfficeDocumentHandler for DocxHandler {}
132
#[cfg(feature = "content-processing")]
impl FormatHandler for DocxHandler {
    /// Pulls the body text, core metadata, and heading candidates out of a
    /// DOCX container and packages them as an [`ExtractedContent`].
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let text = self.extract_from_zip(data, "word/document.xml")?;
        let metadata = self.extract_metadata_from_zip(data);

        // Structure is assembled first so `text`/`metadata` can then be moved
        // into the result without clones.
        let structure = DocumentStructure {
            title: metadata.get("title").cloned(),
            headings: self.extract_docx_headings(&text),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Docx,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Cheap sniff: ZIP magic bytes plus the two entries every Word package
    /// carries (`word/document.xml` and `[Content_Types].xml`).
    fn can_handle(&self, data: &[u8]) -> bool {
        const ZIP_LOCAL_HEADER: [u8; 4] = [0x50, 0x4B, 0x03, 0x04];
        const ZIP_EMPTY_ARCHIVE: [u8; 4] = [0x50, 0x4B, 0x05, 0x06];

        let magic = data.get(0..4);
        if magic != Some(&ZIP_LOCAL_HEADER[..]) && magic != Some(&ZIP_EMPTY_ARCHIVE[..]) {
            return false;
        }

        oxiarc_archive::ZipReader::new(std::io::Cursor::new(data))
            .map(|archive| {
                archive.entry_by_name("word/document.xml").is_some()
                    && archive.entry_by_name("[Content_Types].xml").is_some()
            })
            .unwrap_or(false)
    }

    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["docx"]
    }
}
194
#[cfg(feature = "content-processing")]
impl DocxHandler {
    /// Heuristically picks heading candidates out of the extracted body text.
    ///
    /// A trimmed line counts as a heading when it is longer than 3 and shorter
    /// than 100 bytes, holds between one and eight words, and starts with an
    /// uppercase character. Every candidate is reported at level 1 together
    /// with its 1-based line number.
    fn extract_docx_headings(&self, text: &str) -> Vec<Heading> {
        let looks_like_heading = |line: &str| {
            let word_count = line.split_whitespace().count();
            line.len() > 3
                && line.len() < 100
                && (1..=8).contains(&word_count)
                && line.chars().next().map_or(false, |c| c.is_uppercase())
        };

        text.lines()
            .enumerate()
            .map(|(idx, raw)| (idx, raw.trim()))
            .filter(|&(_, line)| looks_like_heading(line))
            .map(|(idx, line)| Heading {
                level: 1,
                text: line.to_string(),
                location: ContentLocation {
                    page: None,
                    section: None,
                    char_offset: None,
                    line: Some(idx + 1),
                    column: None,
                },
            })
            .collect()
    }
}
228
#[cfg(feature = "content-processing")]
/// Content handler for Microsoft PowerPoint OOXML presentations (`.pptx`).
pub struct PptxHandler;
232
#[cfg(feature = "content-processing")]
// Relies entirely on the trait's default ZIP/XML extraction helpers.
impl OfficeDocumentHandler for PptxHandler {}
235
#[cfg(feature = "content-processing")]
impl FormatHandler for PptxHandler {
    /// Extracts the text of every slide (`ppt/slides/slideN.xml`) plus the
    /// document metadata from a PPTX container. Slides are processed in
    /// numeric slide order, and `page_count`/`section_count` report the
    /// number of slides whose text was extracted.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let mut all_text = Vec::new();
        let cursor = std::io::Cursor::new(data);
        let mut archive = oxiarc_archive::ZipReader::new(cursor)
            .map_err(|e| anyhow!("Failed to open PPTX archive: {}", e))?;

        let mut file_names: Vec<String> = archive
            .entries()
            .iter()
            .map(|entry| entry.name.to_string())
            .filter(|name: &String| name.starts_with("ppt/slides/slide") && name.ends_with(".xml"))
            .collect();

        // BUGFIX: raw ZIP entry order is not guaranteed to match slide order,
        // and a lexicographic sort would place "slide10" before "slide2", so
        // sort by the numeric slide index embedded in the file name. Names
        // whose index does not parse sort last.
        file_names.sort_by_key(|name| {
            name.trim_start_matches("ppt/slides/slide")
                .trim_end_matches(".xml")
                .parse::<u32>()
                .unwrap_or(u32::MAX)
        });

        for slide_name in file_names {
            if let Some(entry) = archive.entry_by_name(&slide_name).cloned() {
                if let Ok(data) = archive.extract(&entry) {
                    if let Ok(content) = String::from_utf8(data) {
                        // Slides that fail to decode or parse are skipped
                        // (best effort) rather than aborting the extraction.
                        if let Ok(slide_text) = self.extract_text_from_xml(&content) {
                            all_text.push(slide_text);
                        }
                    }
                }
            }
        }

        let text = all_text.join("\n\n");
        let metadata = self.extract_metadata_from_zip(data);
        let title = metadata.get("title").cloned();

        Ok(ExtractedContent {
            format: DocumentFormat::Pptx,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure: DocumentStructure {
                title,
                headings: Vec::new(),
                page_count: all_text.len(),
                section_count: all_text.len(),
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Cheap sniff: ZIP magic bytes plus the two entries every PowerPoint
    /// package carries (`ppt/presentation.xml` and `[Content_Types].xml`).
    fn can_handle(&self, data: &[u8]) -> bool {
        if data.len() < 4 {
            return false;
        }

        if data[0..4] != [0x50, 0x4B, 0x03, 0x04] && data[0..4] != [0x50, 0x4B, 0x05, 0x06] {
            return false;
        }

        let cursor = std::io::Cursor::new(data);
        if let Ok(archive) = oxiarc_archive::ZipReader::new(cursor) {
            archive.entry_by_name("ppt/presentation.xml").is_some()
                && archive.entry_by_name("[Content_Types].xml").is_some()
        } else {
            false
        }
    }

    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["pptx"]
    }
}
320
#[cfg(feature = "content-processing")]
/// Content handler for Microsoft Excel OOXML workbooks (`.xlsx`).
pub struct XlsxHandler;
324
#[cfg(feature = "content-processing")]
// Relies entirely on the trait's default ZIP/XML extraction helpers.
impl OfficeDocumentHandler for XlsxHandler {}
327
#[cfg(feature = "content-processing")]
impl FormatHandler for XlsxHandler {
    /// Extracts worksheet text/tables plus core metadata from an XLSX
    /// container and packages them as an [`ExtractedContent`].
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let mut archive = oxiarc_archive::ZipReader::new(std::io::Cursor::new(data))
            .map_err(|e| anyhow!("Failed to open XLSX archive: {}", e))?;

        // Shared strings must be loaded first: cells of type "s" store only
        // an index into this table.
        let shared_strings = self.extract_shared_strings(&mut archive)?;
        let (text, tables) = self.extract_worksheets(&mut archive, &shared_strings, config)?;

        let metadata = self.extract_metadata_from_zip(data);
        let structure = DocumentStructure {
            title: metadata.get("title").cloned(),
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Xlsx,
            text,
            metadata,
            images: Vec::new(),
            tables,
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Cheap sniff: ZIP magic bytes plus the two entries every workbook
    /// carries (`xl/workbook.xml` and `[Content_Types].xml`).
    fn can_handle(&self, data: &[u8]) -> bool {
        if data.len() < 4 {
            return false;
        }

        let is_zip =
            data[0..4] == [0x50, 0x4B, 0x03, 0x04] || data[0..4] == [0x50, 0x4B, 0x05, 0x06];
        if !is_zip {
            return false;
        }

        match oxiarc_archive::ZipReader::new(std::io::Cursor::new(data)) {
            Ok(archive) => {
                archive.entry_by_name("xl/workbook.xml").is_some()
                    && archive.entry_by_name("[Content_Types].xml").is_some()
            }
            Err(_) => false,
        }
    }

    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["xlsx"]
    }
}
394
#[cfg(feature = "content-processing")]
impl XlsxHandler {
    /// Loads the shared-string table (`xl/sharedStrings.xml`) if present.
    ///
    /// Cells with type `t="s"` store an index into this table instead of an
    /// inline value. Returns an empty vector when the part does not exist.
    fn extract_shared_strings(
        &self,
        archive: &mut oxiarc_archive::ZipReader<std::io::Cursor<&[u8]>>,
    ) -> Result<Vec<String>> {
        let mut shared_strings = Vec::new();

        if let Some(entry) = archive.entry_by_name("xl/sharedStrings.xml").cloned() {
            let data = archive
                .extract(&entry)
                .map_err(|e| anyhow!("Failed to extract shared strings: {}", e))?;
            let content = String::from_utf8(data)
                .map_err(|e| anyhow!("Failed to read shared strings: {}", e))?;

            let mut reader = quick_xml::Reader::from_str(&content);
            let mut buf = Vec::new();
            let mut in_text = false;
            // Accumulates the text runs of the current <si> entry.
            let mut current_string = String::new();

            loop {
                match reader.read_event_into(&mut buf) {
                    Ok(quick_xml::events::Event::Start(ref e)) if e.name().as_ref() == b"t" => {
                        // BUGFIX: do NOT clear the accumulator here — a
                        // rich-text <si> holds several <t> runs, and clearing
                        // per run kept only the last one. The accumulator is
                        // emptied when the entry is pushed on </si>.
                        in_text = true;
                    }
                    Ok(quick_xml::events::Event::End(ref e)) => {
                        if e.name().as_ref() == b"t" {
                            in_text = false;
                        } else if e.name().as_ref() == b"si" {
                            // One table entry per <si>: the concatenation of
                            // all of its <t> runs.
                            shared_strings.push(std::mem::take(&mut current_string));
                        }
                    }
                    Ok(quick_xml::events::Event::Text(e)) if in_text => {
                        let inner = e.into_inner();
                        current_string.push_str(&String::from_utf8_lossy(inner.as_ref()));
                    }
                    Ok(quick_xml::events::Event::Eof) => break,
                    Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                    _ => {}
                }
                buf.clear();
            }
        }

        Ok(shared_strings)
    }

    /// Extracts text and (when `config.extract_tables` is set) tables from
    /// every worksheet part (`xl/worksheets/sheetN.xml`).
    ///
    /// Sheets are processed in numeric order of their file name so that the
    /// joined text and the "Sheet N" captions are deterministic; raw ZIP
    /// entry order is not guaranteed, and a lexicographic sort would place
    /// "sheet10" before "sheet2".
    fn extract_worksheets(
        &self,
        archive: &mut oxiarc_archive::ZipReader<std::io::Cursor<&[u8]>>,
        shared_strings: &[String],
        config: &ContentExtractionConfig,
    ) -> Result<(String, Vec<ExtractedTable>)> {
        let mut all_text = Vec::new();
        let mut tables = Vec::new();

        let mut file_names: Vec<String> = archive
            .entries()
            .iter()
            .map(|entry| entry.name.to_string())
            .filter(|name: &String| {
                name.starts_with("xl/worksheets/sheet") && name.ends_with(".xml")
            })
            .collect();
        // Numeric sort by sheet index; unparsable names sort last.
        file_names.sort_by_key(|name| {
            name.trim_start_matches("xl/worksheets/sheet")
                .trim_end_matches(".xml")
                .parse::<u32>()
                .unwrap_or(u32::MAX)
        });

        for (sheet_index, sheet_name) in file_names.iter().enumerate() {
            if let Some(entry) = archive.entry_by_name(sheet_name).cloned() {
                if let Ok(data) = archive.extract(&entry) {
                    if let Ok(content) = String::from_utf8(data) {
                        let (sheet_text, sheet_table) =
                            self.extract_sheet_content(&content, shared_strings)?;
                        all_text.push(sheet_text);

                        if config.extract_tables && !sheet_table.rows.is_empty() {
                            let mut table = sheet_table;
                            table.caption = Some(format!("Sheet {}", sheet_index + 1));
                            tables.push(table);
                        }
                    }
                }
            }
        }

        Ok((all_text.join("\n\n"), tables))
    }

    /// Parses one worksheet XML part into plain text plus a table.
    ///
    /// Tracks the current cell position from the `r` attribute of each `<c>`
    /// element and resolves shared-string cells (`t="s"`) through
    /// `shared_strings`. Cells with empty values are dropped.
    fn extract_sheet_content(
        &self,
        xml: &str,
        shared_strings: &[String],
    ) -> Result<(String, ExtractedTable)> {
        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        // Collected as (row, col, value), all zero-based.
        let mut cells = Vec::new();
        let mut current_cell = (0, 0, String::new());
        let mut in_value = false;
        // Cell type from the `t` attribute; "s" means shared-string index.
        let mut cell_type = String::from("str");
        let mut row_index = 0;
        let mut col_index = 0;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(quick_xml::events::Event::Start(ref e)) => match e.name().as_ref() {
                    b"c" => {
                        for attr in e.attributes().flatten() {
                            match attr.key.as_ref() {
                                b"r" => {
                                    let cell_ref = String::from_utf8_lossy(&attr.value);
                                    (col_index, row_index) =
                                        self.parse_cell_reference(&cell_ref);
                                }
                                b"t" => {
                                    cell_type =
                                        String::from_utf8_lossy(&attr.value).to_string();
                                }
                                _ => {}
                            }
                        }
                    }
                    b"v" => {
                        in_value = true;
                        current_cell = (row_index, col_index, String::new());
                    }
                    _ => {}
                },
                Ok(quick_xml::events::Event::End(ref e)) => match e.name().as_ref() {
                    b"c" => {
                        if !current_cell.2.is_empty() {
                            cells.push(current_cell.clone());
                        }
                        // The `t` attribute applies to a single cell only.
                        cell_type = String::from("str");
                    }
                    b"v" => {
                        in_value = false;
                    }
                    _ => {}
                },
                Ok(quick_xml::events::Event::Text(e)) if in_value => {
                    let inner = e.into_inner();
                    let text = String::from_utf8_lossy(inner.as_ref());
                    if cell_type == "s" {
                        // Shared-string cell: the value is an index into the
                        // table; out-of-range indices leave the cell empty.
                        if let Ok(index) = text.parse::<usize>() {
                            if index < shared_strings.len() {
                                current_cell.2 = shared_strings[index].clone();
                            }
                        }
                    } else {
                        current_cell.2 = text.to_string();
                    }
                }
                Ok(quick_xml::events::Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(self.cells_to_table(cells))
    }

    /// Converts an A1-style cell reference (e.g. "B3") into zero-based
    /// `(column, row)` indices; "B3" becomes `(1, 2)`.
    ///
    /// Only ASCII letters are treated as the column part, which keeps the
    /// byte-based split valid (the previous version mixed a character count
    /// with byte slicing and truncated non-ASCII letters via `as u8`).
    /// Missing or unparsable parts saturate to 0.
    fn parse_cell_reference(&self, cell_ref: &str) -> (usize, usize) {
        let digits_at = cell_ref
            .find(|ch: char| !ch.is_ascii_alphabetic())
            .unwrap_or(cell_ref.len());
        let (letters, digits) = cell_ref.split_at(digits_at);

        // Column letters form a bijective base-26 number: A=1 .. Z=26, AA=27.
        let col = letters.chars().fold(0usize, |acc, ch| {
            acc * 26 + (ch.to_ascii_uppercase() as u8 - b'A') as usize + 1
        });
        let row = digits.parse::<usize>().unwrap_or(0);

        (col.saturating_sub(1), row.saturating_sub(1))
    }

    /// Arranges sparse `(row, col, value)` cells into a dense grid, then
    /// renders it both as pipe-separated text and as an [`ExtractedTable`]
    /// whose first grid row becomes the header row.
    fn cells_to_table(&self, cells: Vec<(usize, usize, String)>) -> (String, ExtractedTable) {
        let sheet_location = || ContentLocation {
            page: Some(1),
            section: None,
            char_offset: None,
            line: None,
            column: None,
        };

        if cells.is_empty() {
            return (
                String::new(),
                ExtractedTable {
                    headers: Vec::new(),
                    rows: Vec::new(),
                    caption: None,
                    location: sheet_location(),
                },
            );
        }

        let max_row = cells.iter().map(|(r, _, _)| *r).max().unwrap_or(0);
        let max_col = cells.iter().map(|(_, c, _)| *c).max().unwrap_or(0);

        // Dense grid sized from the extremes above; absent cells stay empty.
        let mut grid = vec![vec![String::new(); max_col + 1]; max_row + 1];
        for (row, col, value) in cells {
            grid[row][col] = value;
        }

        // `cells` was non-empty, so the grid has at least one row.
        let headers = grid[0].clone();
        let rows = if grid.len() > 1 {
            grid[1..].to_vec()
        } else {
            Vec::new()
        };

        // Text rendering: the non-empty cells of each row joined with " | ";
        // fully empty rows are skipped.
        let mut text_parts = Vec::new();
        for row in &grid {
            let row_text = row
                .iter()
                .filter(|cell| !cell.is_empty())
                .cloned()
                .collect::<Vec<_>>()
                .join(" | ");
            if !row_text.is_empty() {
                text_parts.push(row_text);
            }
        }
        let text = text_parts.join("\n");

        let table = ExtractedTable {
            headers,
            rows,
            caption: None,
            location: sheet_location(),
        };

        (text, table)
    }
}