1#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7 ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8 ExtractedTable, FormatHandler, Heading, ProcessingStats,
9};
10#[cfg(feature = "content-processing")]
11use anyhow::{anyhow, Result};
12#[cfg(feature = "content-processing")]
13use std::collections::HashMap;
14
/// Shared helpers for Office Open XML handlers (DOCX/PPTX/XLSX).
///
/// Every OOXML document is a ZIP package of XML parts; these default methods
/// open the package, pull out a named part, and strip plain text or core
/// metadata from it.
#[cfg(feature = "content-processing")]
pub trait OfficeDocumentHandler {
    /// Opens `data` as a ZIP archive, reads the part at `main_xml_path`, and
    /// returns the plain text extracted from its XML.
    ///
    /// # Errors
    /// Fails when the archive cannot be opened, the part is missing, the
    /// part is not UTF-8, or the XML is malformed.
    fn extract_from_zip(&self, data: &[u8], main_xml_path: &str) -> Result<String> {
        let cursor = std::io::Cursor::new(data);
        let mut archive = oxiarc_archive::ZipReader::new(cursor)
            .map_err(|e| anyhow!("Failed to open ZIP archive: {}", e))?;

        let entry = archive
            .entry_by_name(main_xml_path)
            .cloned()
            .ok_or_else(|| anyhow!("Main content file not found: {}", main_xml_path))?;

        let data = archive.extract(&entry)?;
        let content = String::from_utf8(data)?;

        self.extract_text_from_xml(&content)
    }

    /// Collects the character data of text-bearing elements — `<w:t>`
    /// (WordprocessingML), `<a:t>` (DrawingML), and `<c>` (SpreadsheetML) —
    /// and joins the pieces with single spaces.
    ///
    /// # Errors
    /// Returns an error when the XML is malformed.
    fn extract_text_from_xml(&self, xml: &str) -> Result<String> {
        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut text_content = Vec::new();
        // Only text inside a recognized text element is kept; markup outside
        // those elements (formatting runs etc.) is ignored.
        let mut in_text = false;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(quick_xml::events::Event::Start(ref e)) => match e.name().as_ref() {
                    b"w:t" | b"a:t" | b"c" => in_text = true,
                    _ => {}
                },
                Ok(quick_xml::events::Event::End(ref e)) => match e.name().as_ref() {
                    b"w:t" | b"a:t" | b"c" => in_text = false,
                    _ => {}
                },
                Ok(quick_xml::events::Event::Text(e)) if in_text => {
                    let inner = e.into_inner();
                    let text = String::from_utf8_lossy(inner.as_ref());
                    text_content.push(text.to_string());
                }
                Ok(quick_xml::events::Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(text_content.join(" "))
    }

    /// Best-effort read of `docProps/core.xml` (Dublin Core metadata).
    ///
    /// Returns whichever of title/author/subject/description could be read,
    /// plus the package byte size under the `"size"` key. Never fails: any
    /// extraction or parse problem simply yields fewer entries.
    fn extract_metadata_from_zip(&self, data: &[u8]) -> HashMap<String, String> {
        let mut metadata = HashMap::new();

        let cursor = std::io::Cursor::new(data);
        if let Ok(mut archive) = oxiarc_archive::ZipReader::new(cursor) {
            if let Some(entry) = archive.entry_by_name("docProps/core.xml").cloned() {
                if let Ok(data) = archive.extract(&entry) {
                    if let Ok(content) = String::from_utf8(data) {
                        let mut reader = quick_xml::Reader::from_str(&content);
                        let mut buf = Vec::new();
                        // Name of the most recently opened element; text
                        // events are attributed to it.
                        let mut current_element = String::new();

                        loop {
                            match reader.read_event_into(&mut buf) {
                                Ok(quick_xml::events::Event::Start(ref e)) => {
                                    current_element =
                                        String::from_utf8_lossy(e.name().as_ref()).to_string();
                                }
                                Ok(quick_xml::events::Event::End(_)) => {
                                    // Reset so whitespace between elements is
                                    // not attributed to the element that just
                                    // closed (which would overwrite its value
                                    // with the whitespace).
                                    current_element.clear();
                                }
                                Ok(quick_xml::events::Event::Text(e)) => {
                                    let inner = e.into_inner();
                                    let text = String::from_utf8_lossy(inner.as_ref());
                                    let key = match current_element.as_str() {
                                        "dc:title" => Some("title"),
                                        "dc:creator" => Some("author"),
                                        "dc:subject" => Some("subject"),
                                        "dc:description" => Some("description"),
                                        _ => None,
                                    };
                                    if let Some(key) = key {
                                        metadata.insert(key.to_string(), text.to_string());
                                    }
                                }
                                Ok(quick_xml::events::Event::Eof) => break,
                                // Malformed XML: stop scanning instead of
                                // spinning forever on a persistent error;
                                // keep whatever was collected so far.
                                Err(_) => break,
                                _ => {}
                            }
                            buf.clear();
                        }
                    }
                }
            }
        }

        metadata.insert("size".to_string(), data.len().to_string());
        metadata
    }
}
125
/// Handler for Microsoft Word `.docx` packages (WordprocessingML).
#[cfg(feature = "content-processing")]
pub struct DocxHandler;
129
// DOCX uses the trait's default ZIP/XML extraction helpers unchanged.
#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for DocxHandler {}
132
#[cfg(feature = "content-processing")]
impl FormatHandler for DocxHandler {
    /// Pulls the main document text, core metadata, and heuristic headings
    /// out of a `.docx` package.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let body_text = self.extract_from_zip(data, "word/document.xml")?;
        let doc_metadata = self.extract_metadata_from_zip(data);
        let doc_title = doc_metadata.get("title").cloned();
        let doc_headings = self.extract_docx_headings(&body_text);

        Ok(ExtractedContent {
            format: DocumentFormat::Docx,
            text: body_text,
            metadata: doc_metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure: DocumentStructure {
                title: doc_title,
                headings: doc_headings,
                page_count: 1,
                section_count: 1,
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// A buffer is treated as DOCX when it starts with a ZIP signature
    /// (local-file header or end-of-central-directory record) and contains
    /// both the Word main part and the content-types manifest.
    fn can_handle(&self, data: &[u8]) -> bool {
        if data.len() < 4 {
            return false;
        }

        let magic = &data[0..4];
        let is_zip = magic == [0x50, 0x4B, 0x03, 0x04] || magic == [0x50, 0x4B, 0x05, 0x06];
        if !is_zip {
            return false;
        }

        match oxiarc_archive::ZipReader::new(std::io::Cursor::new(data)) {
            Ok(archive) => {
                archive.entry_by_name("word/document.xml").is_some()
                    && archive.entry_by_name("[Content_Types].xml").is_some()
            }
            Err(_) => false,
        }
    }

    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["docx"]
    }
}
194
#[cfg(feature = "content-processing")]
impl DocxHandler {
    /// Heuristically marks short, capitalized lines as level-1 headings.
    ///
    /// A line qualifies when its trimmed byte length is in (3, 100), it has
    /// between 1 and 8 whitespace-separated words, and its first character
    /// is uppercase. Line numbers in the result are 1-based.
    fn extract_docx_headings(&self, text: &str) -> Vec<Heading> {
        text.lines()
            .enumerate()
            .filter_map(|(line_no, raw)| {
                let candidate = raw.trim();
                // Byte-length bounds: too short or too long to be a heading.
                if candidate.len() <= 3 || candidate.len() >= 100 {
                    return None;
                }
                let word_count = candidate.split_whitespace().count();
                if word_count == 0 || word_count > 8 {
                    return None;
                }
                let leading = candidate.chars().next().unwrap_or(' ');
                if !leading.is_uppercase() {
                    return None;
                }
                Some(Heading {
                    level: 1,
                    text: candidate.to_string(),
                    location: ContentLocation {
                        page: None,
                        section: None,
                        char_offset: None,
                        line: Some(line_no + 1),
                        column: None,
                    },
                })
            })
            .collect()
    }
}
228
/// Handler for Microsoft PowerPoint `.pptx` packages (PresentationML).
#[cfg(feature = "content-processing")]
pub struct PptxHandler;
232
// PPTX uses the trait's default ZIP/XML extraction helpers unchanged.
#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for PptxHandler {}
235
#[cfg(feature = "content-processing")]
impl FormatHandler for PptxHandler {
    /// Extracts slide text and core metadata from a `.pptx` package.
    ///
    /// Slides are processed in numeric order (`slide1.xml`, `slide2.xml`,
    /// ..., `slide10.xml`) rather than raw archive order, so the extracted
    /// text follows the presentation order even when the ZIP stores entries
    /// differently.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let cursor = std::io::Cursor::new(data);
        let mut archive = oxiarc_archive::ZipReader::new(cursor)
            .map_err(|e| anyhow!("Failed to open PPTX archive: {}", e))?;

        // Collect slide part names first so the entries() borrow is released
        // before we start extracting individual entries.
        let mut slide_names: Vec<String> = archive
            .entries()
            .iter()
            .map(|entry| entry.name.to_string())
            .filter(|name: &String| name.starts_with("ppt/slides/slide") && name.ends_with(".xml"))
            .collect();

        // ZIP entry order is arbitrary; sort by the numeric slide index so
        // "slide10.xml" comes after "slide2.xml". Unparsable names sort last.
        slide_names.sort_by_key(|name| {
            name.trim_start_matches("ppt/slides/slide")
                .trim_end_matches(".xml")
                .parse::<usize>()
                .unwrap_or(usize::MAX)
        });

        // Best-effort per slide: a slide that fails to extract or parse is
        // skipped rather than failing the whole document.
        let mut all_text = Vec::new();
        for slide_name in &slide_names {
            if let Some(entry) = archive.entry_by_name(slide_name).cloned() {
                if let Ok(bytes) = archive.extract(&entry) {
                    if let Ok(content) = String::from_utf8(bytes) {
                        if let Ok(slide_text) = self.extract_text_from_xml(&content) {
                            all_text.push(slide_text);
                        }
                    }
                }
            }
        }

        let text = all_text.join("\n\n");
        let metadata = self.extract_metadata_from_zip(data);
        let title = metadata.get("title").cloned();

        Ok(ExtractedContent {
            format: DocumentFormat::Pptx,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure: DocumentStructure {
                title,
                // One "page"/"section" per successfully extracted slide.
                headings: Vec::new(),
                page_count: all_text.len(),
                section_count: all_text.len(),
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// A buffer is treated as PPTX when it starts with a ZIP signature and
    /// contains the presentation main part plus the content-types manifest.
    fn can_handle(&self, data: &[u8]) -> bool {
        if data.len() < 4 {
            return false;
        }

        if data[0..4] != [0x50, 0x4B, 0x03, 0x04] && data[0..4] != [0x50, 0x4B, 0x05, 0x06] {
            return false;
        }

        let cursor = std::io::Cursor::new(data);
        if let Ok(archive) = oxiarc_archive::ZipReader::new(cursor) {
            archive.entry_by_name("ppt/presentation.xml").is_some()
                && archive.entry_by_name("[Content_Types].xml").is_some()
        } else {
            false
        }
    }

    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["pptx"]
    }
}
320
/// Handler for Microsoft Excel `.xlsx` packages (SpreadsheetML).
#[cfg(feature = "content-processing")]
pub struct XlsxHandler;
324
// XLSX uses the trait's default ZIP/XML extraction helpers unchanged.
#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for XlsxHandler {}
327
#[cfg(feature = "content-processing")]
impl FormatHandler for XlsxHandler {
    /// Builds text and table output from an `.xlsx` workbook: the shared
    /// string table is resolved first, then every worksheet is flattened.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let mut archive = oxiarc_archive::ZipReader::new(std::io::Cursor::new(data))
            .map_err(|e| anyhow!("Failed to open XLSX archive: {}", e))?;

        let strings = self.extract_shared_strings(&mut archive)?;
        let (sheet_text, sheet_tables) = self.extract_worksheets(&mut archive, &strings, config)?;
        let meta = self.extract_metadata_from_zip(data);
        let workbook_title = meta.get("title").cloned();

        Ok(ExtractedContent {
            format: DocumentFormat::Xlsx,
            text: sheet_text,
            metadata: meta,
            images: Vec::new(),
            tables: sheet_tables,
            links: Vec::new(),
            structure: DocumentStructure {
                title: workbook_title,
                headings: Vec::new(),
                page_count: 1,
                section_count: 1,
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// A buffer is treated as XLSX when it starts with a ZIP signature and
    /// contains the workbook main part plus the content-types manifest.
    fn can_handle(&self, data: &[u8]) -> bool {
        if data.len() < 4 {
            return false;
        }

        let magic = &data[0..4];
        let is_zip = magic == [0x50, 0x4B, 0x03, 0x04] || magic == [0x50, 0x4B, 0x05, 0x06];
        if !is_zip {
            return false;
        }

        match oxiarc_archive::ZipReader::new(std::io::Cursor::new(data)) {
            Ok(archive) => {
                archive.entry_by_name("xl/workbook.xml").is_some()
                    && archive.entry_by_name("[Content_Types].xml").is_some()
            }
            Err(_) => false,
        }
    }

    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["xlsx"]
    }
}
394
#[cfg(feature = "content-processing")]
impl XlsxHandler {
    /// Reads `xl/sharedStrings.xml` (if present) and returns the string
    /// table in index order. Cells typed `s` store an index into this table.
    ///
    /// All `<t>` runs inside one `<si>` item are concatenated, so rich-text
    /// entries with multiple runs resolve to the full string rather than
    /// only the last run.
    ///
    /// # Errors
    /// Fails when the part exists but cannot be extracted, is not UTF-8, or
    /// is not well-formed XML. A missing part yields an empty table.
    fn extract_shared_strings(
        &self,
        archive: &mut oxiarc_archive::ZipReader<std::io::Cursor<&[u8]>>,
    ) -> Result<Vec<String>> {
        let mut shared_strings = Vec::new();

        if let Some(entry) = archive.entry_by_name("xl/sharedStrings.xml").cloned() {
            let data = archive
                .extract(&entry)
                .map_err(|e| anyhow!("Failed to extract shared strings: {}", e))?;
            let content = String::from_utf8(data)
                .map_err(|e| anyhow!("Failed to read shared strings: {}", e))?;

            let mut reader = quick_xml::Reader::from_str(&content);
            let mut buf = Vec::new();
            let mut in_text = false;
            // Accumulates all <t> runs of the current <si> item.
            let mut current_string = String::new();

            loop {
                match reader.read_event_into(&mut buf) {
                    Ok(quick_xml::events::Event::Start(ref e)) => {
                        // Deliberately no clear() here: an <si> may contain
                        // several <t> runs that together form one entry.
                        if e.name().as_ref() == b"t" {
                            in_text = true;
                        }
                    }
                    Ok(quick_xml::events::Event::End(ref e)) => match e.name().as_ref() {
                        b"t" => in_text = false,
                        // End of a string item: commit the accumulated text
                        // and leave current_string empty for the next item.
                        b"si" => shared_strings.push(std::mem::take(&mut current_string)),
                        _ => {}
                    },
                    Ok(quick_xml::events::Event::Text(e)) if in_text => {
                        let inner = e.into_inner();
                        current_string.push_str(&String::from_utf8_lossy(inner.as_ref()));
                    }
                    Ok(quick_xml::events::Event::Eof) => break,
                    Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                    _ => {}
                }
                buf.clear();
            }
        }

        Ok(shared_strings)
    }

    /// Flattens every `xl/worksheets/sheetN.xml` part into text, and — when
    /// `config.extract_tables` is set — into an [`ExtractedTable`] captioned
    /// "Sheet N".
    ///
    /// Sheets are processed in numeric order (sheet1, sheet2, ..., sheet10)
    /// rather than raw ZIP entry order.
    fn extract_worksheets(
        &self,
        archive: &mut oxiarc_archive::ZipReader<std::io::Cursor<&[u8]>>,
        shared_strings: &[String],
        config: &ContentExtractionConfig,
    ) -> Result<(String, Vec<ExtractedTable>)> {
        let mut all_text = Vec::new();
        let mut tables = Vec::new();

        // Collect names first so the entries() borrow is released before
        // extracting; then sort numerically (unparsable names sort last).
        let mut sheet_names: Vec<String> = archive
            .entries()
            .iter()
            .map(|entry| entry.name.to_string())
            .filter(|name: &String| {
                name.starts_with("xl/worksheets/sheet") && name.ends_with(".xml")
            })
            .collect();
        sheet_names.sort_by_key(|name| {
            name.trim_start_matches("xl/worksheets/sheet")
                .trim_end_matches(".xml")
                .parse::<usize>()
                .unwrap_or(usize::MAX)
        });

        for (sheet_index, sheet_name) in sheet_names.iter().enumerate() {
            if let Some(entry) = archive.entry_by_name(sheet_name).cloned() {
                if let Ok(data) = archive.extract(&entry) {
                    if let Ok(content) = String::from_utf8(data) {
                        let (sheet_text, sheet_table) =
                            self.extract_sheet_content(&content, shared_strings)?;
                        all_text.push(sheet_text);

                        if config.extract_tables && !sheet_table.rows.is_empty() {
                            let mut table = sheet_table;
                            table.caption = Some(format!("Sheet {}", sheet_index + 1));
                            tables.push(table);
                        }
                    }
                }
            }
        }

        Ok((all_text.join("\n\n"), tables))
    }

    /// Parses one worksheet XML part into flattened text plus an
    /// [`ExtractedTable`]. Cells typed `s` are resolved through
    /// `shared_strings`; other cell values are taken verbatim.
    ///
    /// # Errors
    /// Returns an error when the worksheet XML is malformed.
    fn extract_sheet_content(
        &self,
        xml: &str,
        shared_strings: &[String],
    ) -> Result<(String, ExtractedTable)> {
        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        // Populated cells as (row, col, value), zero-based coordinates.
        let mut cells: Vec<(usize, usize, String)> = Vec::new();
        let mut current_cell = (0, 0, String::new());
        let mut in_value = false;
        // "s" marks a shared-string cell; any other type is kept verbatim.
        let mut cell_type_owned = String::from("str");
        let mut row_index = 0;
        let mut col_index = 0;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(quick_xml::events::Event::Start(ref e)) => match e.name().as_ref() {
                    b"c" => {
                        // Read the cell reference ("A1") and type from the
                        // <c> attributes.
                        for attr in e.attributes().flatten() {
                            match attr.key.as_ref() {
                                b"r" => {
                                    let cell_ref = String::from_utf8_lossy(&attr.value);
                                    (col_index, row_index) =
                                        self.parse_cell_reference(&cell_ref);
                                }
                                b"t" => {
                                    cell_type_owned =
                                        String::from_utf8_lossy(&attr.value).to_string();
                                }
                                _ => {}
                            }
                        }
                    }
                    b"v" => {
                        in_value = true;
                        current_cell = (row_index, col_index, String::new());
                    }
                    _ => {}
                },
                Ok(quick_xml::events::Event::End(ref e)) => match e.name().as_ref() {
                    b"c" => {
                        // Taking (rather than cloning) resets the buffer, so
                        // a later value-less <c> cannot accidentally re-push
                        // this cell's value at new coordinates.
                        if !current_cell.2.is_empty() {
                            cells.push(std::mem::take(&mut current_cell));
                        }
                        // Cell type does not carry over to the next cell.
                        cell_type_owned = String::from("str");
                    }
                    b"v" => in_value = false,
                    _ => {}
                },
                Ok(quick_xml::events::Event::Text(e)) if in_value => {
                    let inner = e.into_inner();
                    let text = String::from_utf8_lossy(inner.as_ref());
                    if cell_type_owned == "s" {
                        // Shared-string cell: <v> holds a table index.
                        // Out-of-range or unparsable indices leave the cell
                        // empty, matching the original best-effort behavior.
                        if let Ok(index) = text.parse::<usize>() {
                            if let Some(resolved) = shared_strings.get(index) {
                                current_cell.2 = resolved.clone();
                            }
                        }
                    } else {
                        current_cell.2 = text.to_string();
                    }
                }
                Ok(quick_xml::events::Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(self.cells_to_table(cells))
    }

    /// Converts an A1-style cell reference into zero-based
    /// `(column, row)` indices, e.g. `"A1"` → `(0, 0)`, `"AA10"` → `(26, 9)`.
    ///
    /// Splits on the first non-ASCII-letter byte, so a malformed reference
    /// containing multi-byte characters degrades to `(0, 0)`-ish results
    /// instead of panicking on a mid-character slice.
    fn parse_cell_reference(&self, cell_ref: &str) -> (usize, usize) {
        let split = cell_ref
            .find(|c: char| !c.is_ascii_alphabetic())
            .unwrap_or(cell_ref.len());
        let (letters, digits) = cell_ref.split_at(split);

        // Column letters are base-26 digits: A=1 ... Z=26, AA=27, ...
        let col = letters.bytes().fold(0usize, |acc, b| {
            acc * 26 + (b.to_ascii_uppercase() - b'A') as usize + 1
        });
        let row = digits.parse::<usize>().unwrap_or(0);

        // Convert 1-based spreadsheet coordinates to 0-based indices.
        (col.saturating_sub(1), row.saturating_sub(1))
    }

    /// Converts sparse cells into a dense grid, then into pipe-delimited
    /// text and an [`ExtractedTable`] whose first grid row becomes the
    /// header row.
    fn cells_to_table(&self, cells: Vec<(usize, usize, String)>) -> (String, ExtractedTable) {
        let default_location = || ContentLocation {
            page: Some(1),
            section: None,
            char_offset: None,
            line: None,
            column: None,
        };

        if cells.is_empty() {
            return (
                String::new(),
                ExtractedTable {
                    headers: Vec::new(),
                    rows: Vec::new(),
                    caption: None,
                    location: default_location(),
                },
            );
        }

        // Dense grid sized to the furthest populated row/column; indices
        // are in-bounds by construction (maxes come from the same list).
        let max_row = cells.iter().map(|&(r, _, _)| r).max().unwrap_or(0);
        let max_col = cells.iter().map(|&(_, c, _)| c).max().unwrap_or(0);
        let mut grid = vec![vec![String::new(); max_col + 1]; max_row + 1];
        for (row, col, value) in cells {
            grid[row][col] = value;
        }

        // First row is treated as the header row; the rest are data rows.
        let headers = grid.first().cloned().unwrap_or_default();
        let rows = grid.get(1..).map(|r| r.to_vec()).unwrap_or_default();

        // Text form: non-empty cells joined with " | ", one line per
        // non-empty row.
        let text = grid
            .iter()
            .filter_map(|row| {
                let line = row
                    .iter()
                    .filter(|cell| !cell.is_empty())
                    .cloned()
                    .collect::<Vec<_>>()
                    .join(" | ");
                (!line.is_empty()).then_some(line)
            })
            .collect::<Vec<_>>()
            .join("\n");

        let table = ExtractedTable {
            headers,
            rows,
            caption: None,
            location: default_location(),
        };

        (text, table)
    }
}