Skip to main content

oxirs_vec/content_processing/
office_handlers.rs

1//! Office document handlers for content processing
2//!
3//! This module provides handlers for Microsoft Office documents (DOCX, PPTX, XLSX).
4
5#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7    ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8    ExtractedTable, FormatHandler, Heading, ProcessingStats,
9};
10#[cfg(feature = "content-processing")]
11use anyhow::{anyhow, Result};
12#[cfg(feature = "content-processing")]
13use std::collections::HashMap;
14
/// Shared extraction helpers for ZIP-packaged Office documents (DOCX, PPTX, XLSX).
///
/// Every method has a default implementation, so a handler opts in with an
/// empty `impl OfficeDocumentHandler for MyHandler {}`.
#[cfg(feature = "content-processing")]
pub trait OfficeDocumentHandler {
    /// Opens `data` as a ZIP archive, reads the entry at `main_xml_path`, and
    /// returns the text extracted from it by [`Self::extract_text_from_xml`].
    ///
    /// # Errors
    ///
    /// Fails when the archive cannot be opened, the entry is missing or cannot
    /// be extracted, the entry is not valid UTF-8, or the XML fails to parse.
    fn extract_from_zip(&self, data: &[u8], main_xml_path: &str) -> Result<String> {
        let cursor = std::io::Cursor::new(data);
        let mut archive = oxiarc_archive::ZipReader::new(cursor)
            .map_err(|e| anyhow!("Failed to open ZIP archive: {}", e))?;

        // The entry is cloned so the lookup borrow ends before extraction.
        let entry = archive
            .entry_by_name(main_xml_path)
            .cloned()
            .ok_or_else(|| anyhow!("Main content file not found: {}", main_xml_path))?;

        let bytes = archive.extract(&entry)?;
        let content = String::from_utf8(bytes)?;

        self.extract_text_from_xml(&content)
    }

    /// Collects the character data found inside `w:t`, `a:t`, and `c`
    /// elements and joins the pieces with single spaces.
    ///
    /// # Errors
    ///
    /// Returns an error as soon as the XML reader reports a parse failure.
    fn extract_text_from_xml(&self, xml: &str) -> Result<String> {
        use quick_xml::events::Event;

        // Word text run, PowerPoint text run, Excel cell.
        let is_text_tag = |name: &[u8]| matches!(name, b"w:t" | b"a:t" | b"c");

        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut pieces: Vec<String> = Vec::new();
        let mut in_text = false;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => {
                    if is_text_tag(e.name().as_ref()) {
                        in_text = true;
                    }
                }
                Ok(Event::End(ref e)) => {
                    if is_text_tag(e.name().as_ref()) {
                        in_text = false;
                    }
                }
                Ok(Event::Text(e)) if in_text => {
                    let inner = e.into_inner();
                    pieces.push(String::from_utf8_lossy(inner.as_ref()).into_owned());
                }
                Ok(Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(pieces.join(" "))
    }

    /// Best-effort read of `docProps/core.xml`.
    ///
    /// Maps `dc:title`, `dc:creator`, `dc:subject`, and `dc:description` to the
    /// keys `title`, `author`, `subject`, and `description`. The key `size`
    /// (byte length of `data`) is always present. Any failure to open the
    /// archive, locate or decode the entry, or parse the XML is swallowed:
    /// whatever was collected up to that point is returned.
    fn extract_metadata_from_zip(&self, data: &[u8]) -> HashMap<String, String> {
        use quick_xml::events::Event;

        let mut metadata = HashMap::new();

        let core_xml = oxiarc_archive::ZipReader::new(std::io::Cursor::new(data))
            .ok()
            .and_then(|mut archive| {
                let entry = archive.entry_by_name("docProps/core.xml").cloned()?;
                archive.extract(&entry).ok()
            })
            .and_then(|bytes| String::from_utf8(bytes).ok());

        if let Some(content) = core_xml {
            let mut reader = quick_xml::Reader::from_str(&content);
            let mut buf = Vec::new();
            // Metadata key for the element we are currently inside, if it is
            // one of the properties we care about.
            let mut current_key: Option<&'static str> = None;

            loop {
                match reader.read_event_into(&mut buf) {
                    Ok(Event::Start(ref e)) => {
                        current_key = match e.name().as_ref() {
                            b"dc:title" => Some("title"),
                            b"dc:creator" => Some("author"),
                            b"dc:subject" => Some("subject"),
                            b"dc:description" => Some("description"),
                            _ => None,
                        };
                    }
                    // Forget the element once it closes; otherwise indentation
                    // or newlines between elements would be recorded as the
                    // value of the element that was just closed.
                    Ok(Event::End(_)) => current_key = None,
                    Ok(Event::Text(e)) => {
                        if let Some(key) = current_key {
                            let inner = e.into_inner();
                            let text = String::from_utf8_lossy(inner.as_ref());
                            metadata.insert(key.to_string(), text.into_owned());
                        }
                    }
                    // Metadata is best-effort: stop at the first parse error
                    // instead of continuing to poll a failed reader.
                    Ok(Event::Eof) | Err(_) => break,
                    _ => {}
                }
                buf.clear();
            }
        }

        metadata.insert("size".to_string(), data.len().to_string());
        metadata
    }
}
125
/// Format handler for Word (`.docx`) documents.
///
/// Stateless unit type; all behaviour lives in its trait implementations.
#[cfg(feature = "content-processing")]
pub struct DocxHandler;

// Uses the default ZIP/XML helpers from `OfficeDocumentHandler` unchanged.
#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for DocxHandler {}
132
#[cfg(feature = "content-processing")]
impl FormatHandler for DocxHandler {
    /// Extracts text, core metadata, and heuristic headings from a DOCX file.
    ///
    /// Text is read from `word/document.xml`. Images, tables, links, chunks,
    /// and media collections are returned empty, and page/section counts are
    /// fixed at 1. The extraction config is not consulted.
    ///
    /// # Errors
    ///
    /// Propagates any failure from reading `word/document.xml`.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let text = self.extract_from_zip(data, "word/document.xml")?;
        let metadata = self.extract_metadata_from_zip(data);

        let structure = DocumentStructure {
            title: metadata.get("title").cloned(),
            // Heading detection is a text heuristic; real detection would
            // need paragraph style information.
            headings: self.extract_docx_headings(&text),
            page_count: 1, // Page breaks are not analysed.
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Docx,
            text,
            metadata,
            images: Vec::new(), // word/media is not parsed.
            tables: Vec::new(), // Table XML is not parsed.
            links: Vec::new(),  // Hyperlink relationships are not parsed.
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` starts with a ZIP signature and the archive
    /// contains both `word/document.xml` and `[Content_Types].xml`.
    fn can_handle(&self, data: &[u8]) -> bool {
        // ZIP local-file-header and end-of-central-directory signatures.
        const LOCAL_FILE_HEADER: &[u8] = &[0x50, 0x4B, 0x03, 0x04];
        const END_OF_CENTRAL_DIR: &[u8] = &[0x50, 0x4B, 0x05, 0x06];

        // `starts_with` is false for inputs shorter than the signature.
        if !data.starts_with(LOCAL_FILE_HEADER) && !data.starts_with(END_OF_CENTRAL_DIR) {
            return false;
        }

        match oxiarc_archive::ZipReader::new(std::io::Cursor::new(data)) {
            Ok(archive) => ["word/document.xml", "[Content_Types].xml"]
                .iter()
                .all(|&path| archive.entry_by_name(path).is_some()),
            Err(_) => false,
        }
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["docx"]
    }
}
194
#[cfg(feature = "content-processing")]
impl DocxHandler {
    /// Guesses headings from plain text.
    ///
    /// A line qualifies when, after trimming, its byte length is between 4
    /// and 99 inclusive, it has at most eight whitespace-separated words, and
    /// its first character is uppercase. Every match is reported at level 1
    /// with a 1-based line number; no other location fields are filled in.
    fn extract_docx_headings(&self, text: &str) -> Vec<Heading> {
        text.lines()
            .enumerate()
            .filter_map(|(idx, raw_line)| {
                let candidate = raw_line.trim();

                let byte_len = candidate.len();
                if byte_len <= 3 || byte_len >= 100 {
                    return None;
                }

                let word_count = candidate.split_whitespace().count();
                if word_count == 0 || word_count > 8 {
                    return None;
                }

                let starts_upper = candidate.chars().next().map_or(false, char::is_uppercase);
                if !starts_upper {
                    return None;
                }

                Some(Heading {
                    level: 1, // Proper levels would need style information.
                    text: candidate.to_string(),
                    location: ContentLocation {
                        page: None,
                        section: None,
                        char_offset: None,
                        line: Some(idx + 1),
                        column: None,
                    },
                })
            })
            .collect()
    }
}
228
/// Format handler for PowerPoint (`.pptx`) presentations.
///
/// Stateless unit type; all behaviour lives in its trait implementations.
#[cfg(feature = "content-processing")]
pub struct PptxHandler;

// Uses the default ZIP/XML helpers from `OfficeDocumentHandler` unchanged.
#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for PptxHandler {}
235
#[cfg(feature = "content-processing")]
impl FormatHandler for PptxHandler {
    /// Extracts the text of every slide plus core metadata from a PPTX file.
    ///
    /// Slides are the archive entries named `ppt/slides/slide*.xml`. They are
    /// processed in ascending order of the number in the file name, so that
    /// `slide2.xml` precedes `slide10.xml` regardless of how the archive
    /// happens to order its entries. Slides that cannot be extracted, decoded,
    /// or parsed are skipped. The text of the remaining slides is joined with
    /// blank lines, and the number of such slides is reported as both the page
    /// and section count. The extraction config is not consulted.
    ///
    /// # Errors
    ///
    /// Fails only when `data` cannot be opened as a ZIP archive.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let cursor = std::io::Cursor::new(data);
        let mut archive = oxiarc_archive::ZipReader::new(cursor)
            .map_err(|e| anyhow!("Failed to open PPTX archive: {}", e))?;

        // Collect the slide entry names up front so the archive is free to be
        // borrowed mutably during extraction.
        let mut slide_names: Vec<String> = archive
            .entries()
            .iter()
            .map(|entry| entry.name.to_string())
            .filter(|name: &String| name.starts_with("ppt/slides/slide") && name.ends_with(".xml"))
            .collect();

        // Order by the numeric suffix; names without one sort last, and the
        // full name breaks ties so the order is deterministic.
        slide_names.sort_by_cached_key(|name| {
            let number = name
                .strip_prefix("ppt/slides/slide")
                .and_then(|rest| rest.strip_suffix(".xml"))
                .and_then(|digits| digits.parse::<u64>().ok())
                .unwrap_or(u64::MAX);
            (number, name.clone())
        });

        let mut all_text = Vec::new();
        for slide_name in &slide_names {
            let entry = match archive.entry_by_name(slide_name).cloned() {
                Some(entry) => entry,
                None => continue,
            };
            let bytes = match archive.extract(&entry) {
                Ok(bytes) => bytes,
                Err(_) => continue,
            };
            let content = match String::from_utf8(bytes) {
                Ok(content) => content,
                Err(_) => continue,
            };
            if let Ok(slide_text) = self.extract_text_from_xml(&content) {
                all_text.push(slide_text);
            }
        }

        let text = all_text.join("\n\n");
        let metadata = self.extract_metadata_from_zip(data);
        let title = metadata.get("title").cloned();

        Ok(ExtractedContent {
            format: DocumentFormat::Pptx,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure: DocumentStructure {
                title,
                headings: Vec::new(), // Slide titles are not extracted as headings.
                page_count: all_text.len(), // Each successfully read slide is a "page".
                section_count: all_text.len(),
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` starts with a ZIP signature and the archive
    /// contains both `ppt/presentation.xml` and `[Content_Types].xml`.
    fn can_handle(&self, data: &[u8]) -> bool {
        if data.len() < 4 {
            return false;
        }

        // Check for ZIP signature
        if data[0..4] != [0x50, 0x4B, 0x03, 0x04] && data[0..4] != [0x50, 0x4B, 0x05, 0x06] {
            return false;
        }

        // Check if it contains PPTX-specific files
        let cursor = std::io::Cursor::new(data);
        if let Ok(archive) = oxiarc_archive::ZipReader::new(cursor) {
            archive.entry_by_name("ppt/presentation.xml").is_some()
                && archive.entry_by_name("[Content_Types].xml").is_some()
        } else {
            false
        }
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["pptx"]
    }
}
320
/// Format handler for Excel (`.xlsx`) workbooks.
///
/// Stateless unit type; all behaviour lives in its trait implementations.
#[cfg(feature = "content-processing")]
pub struct XlsxHandler;

// Uses the default ZIP/XML helpers from `OfficeDocumentHandler` unchanged.
#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for XlsxHandler {}
327
#[cfg(feature = "content-processing")]
impl FormatHandler for XlsxHandler {
    /// Extracts worksheet text, tables, and core metadata from an XLSX file.
    ///
    /// The shared-string table is loaded first, then handed to the worksheet
    /// pass together with `config`. Images, links, headings, chunks, and media
    /// collections are returned empty, and page/section counts are fixed at 1.
    ///
    /// # Errors
    ///
    /// Fails when the archive cannot be opened or when the shared-string or
    /// worksheet extraction reports an error.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let mut archive = oxiarc_archive::ZipReader::new(std::io::Cursor::new(data))
            .map_err(|e| anyhow!("Failed to open XLSX archive: {}", e))?;

        // Shared strings must be available before cells can be resolved.
        let shared_strings = self.extract_shared_strings(&mut archive)?;
        let (text, tables) = self.extract_worksheets(&mut archive, &shared_strings, config)?;

        let metadata = self.extract_metadata_from_zip(data);
        let structure = DocumentStructure {
            title: metadata.get("title").cloned(),
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Xlsx,
            text,
            metadata,
            images: Vec::new(),
            tables,
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` starts with a ZIP signature and the archive
    /// contains both `xl/workbook.xml` and `[Content_Types].xml`.
    fn can_handle(&self, data: &[u8]) -> bool {
        // ZIP local-file-header and end-of-central-directory signatures.
        const LOCAL_FILE_HEADER: &[u8] = &[0x50, 0x4B, 0x03, 0x04];
        const END_OF_CENTRAL_DIR: &[u8] = &[0x50, 0x4B, 0x05, 0x06];

        // `starts_with` is false for inputs shorter than the signature.
        if !data.starts_with(LOCAL_FILE_HEADER) && !data.starts_with(END_OF_CENTRAL_DIR) {
            return false;
        }

        match oxiarc_archive::ZipReader::new(std::io::Cursor::new(data)) {
            Ok(archive) => ["xl/workbook.xml", "[Content_Types].xml"]
                .iter()
                .all(|&path| archive.entry_by_name(path).is_some()),
            Err(_) => false,
        }
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["xlsx"]
    }
}
394
#[cfg(feature = "content-processing")]
impl XlsxHandler {
    /// Loads the workbook's shared-string table from `xl/sharedStrings.xml`.
    ///
    /// Returns one string per `<si>` item, in document order, so that a cell's
    /// shared-string index can be used directly as a position in the result.
    /// The text of all `<t>` children of an item is concatenated, which keeps
    /// rich-text items (several `<r>` runs) intact. Text inside phonetic runs
    /// (`<rPh>`) is skipped so reading hints are not appended to the value.
    /// A workbook without the entry yields an empty table.
    ///
    /// # Errors
    ///
    /// Fails when the entry exists but cannot be extracted, is not valid
    /// UTF-8, or contains malformed XML.
    fn extract_shared_strings(
        &self,
        archive: &mut oxiarc_archive::ZipReader<std::io::Cursor<&[u8]>>,
    ) -> Result<Vec<String>> {
        use quick_xml::events::Event;

        let entry = match archive.entry_by_name("xl/sharedStrings.xml").cloned() {
            Some(entry) => entry,
            None => return Ok(Vec::new()),
        };

        let data = archive
            .extract(&entry)
            .map_err(|e| anyhow!("Failed to extract shared strings: {}", e))?;
        let content = String::from_utf8(data)
            .map_err(|e| anyhow!("Failed to read shared strings: {}", e))?;

        let mut reader = quick_xml::Reader::from_str(&content);
        let mut buf = Vec::new();
        let mut shared_strings = Vec::new();
        let mut current_string = String::new();
        let mut in_text = false;
        let mut in_phonetic = false;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => match e.name().as_ref() {
                    // A new item starts with an empty accumulator; individual
                    // `<t>` elements must NOT reset it, or only the last run
                    // of a rich-text item would survive.
                    b"si" => current_string.clear(),
                    b"rPh" => in_phonetic = true,
                    b"t" => in_text = !in_phonetic,
                    _ => {}
                },
                Ok(Event::End(ref e)) => match e.name().as_ref() {
                    b"t" => in_text = false,
                    b"rPh" => in_phonetic = false,
                    b"si" => shared_strings.push(std::mem::take(&mut current_string)),
                    _ => {}
                },
                Ok(Event::Text(e)) if in_text => {
                    let inner = e.into_inner();
                    let text = String::from_utf8_lossy(inner.as_ref());
                    current_string.push_str(&text);
                }
                Ok(Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(shared_strings)
    }

    /// Extracts text and tables from every `xl/worksheets/sheet*.xml` entry.
    ///
    /// Sheets are visited in ascending order of the number in the file name
    /// (so `sheet2.xml` precedes `sheet10.xml`). Sheets that cannot be
    /// extracted or decoded are skipped. Per-sheet text is joined with blank
    /// lines. A table is emitted for a sheet only when `config.extract_tables`
    /// is set and the sheet has at least one data row; its caption is
    /// `"Sheet N"`, where `N` is the sheet's 1-based position in visiting order.
    ///
    /// # Errors
    ///
    /// Propagates XML parse errors from [`Self::extract_sheet_content`].
    fn extract_worksheets(
        &self,
        archive: &mut oxiarc_archive::ZipReader<std::io::Cursor<&[u8]>>,
        shared_strings: &[String],
        config: &ContentExtractionConfig,
    ) -> Result<(String, Vec<ExtractedTable>)> {
        let mut all_text = Vec::new();
        let mut tables = Vec::new();

        let mut sheet_names: Vec<String> = archive
            .entries()
            .iter()
            .map(|entry| entry.name.to_string())
            .filter(|name: &String| {
                name.starts_with("xl/worksheets/sheet") && name.ends_with(".xml")
            })
            .collect();

        // Order by the numeric suffix; names without one sort last, and the
        // full name breaks ties so the order is deterministic.
        sheet_names.sort_by_cached_key(|name| {
            let number = name
                .strip_prefix("xl/worksheets/sheet")
                .and_then(|rest| rest.strip_suffix(".xml"))
                .and_then(|digits| digits.parse::<u64>().ok())
                .unwrap_or(u64::MAX);
            (number, name.clone())
        });

        for (sheet_index, sheet_name) in sheet_names.iter().enumerate() {
            let entry = match archive.entry_by_name(sheet_name).cloned() {
                Some(entry) => entry,
                None => continue,
            };
            let data = match archive.extract(&entry) {
                Ok(data) => data,
                Err(_) => continue,
            };
            let content = match String::from_utf8(data) {
                Ok(content) => content,
                Err(_) => continue,
            };

            let (sheet_text, mut sheet_table) =
                self.extract_sheet_content(&content, shared_strings)?;
            all_text.push(sheet_text);

            if config.extract_tables && !sheet_table.rows.is_empty() {
                sheet_table.caption = Some(format!("Sheet {}", sheet_index + 1));
                tables.push(sheet_table);
            }
        }

        Ok((all_text.join("\n\n"), tables))
    }

    /// Parses one worksheet's XML into its text form and a table.
    ///
    /// For each `<c>` element the `r` attribute gives the position and the
    /// `t` attribute the type; the content of its `<v>` child is the value.
    /// Values of type `s` are treated as indices into `shared_strings`; an
    /// unparsable or out-of-range index leaves the cell empty. Cells with an
    /// empty value are not recorded. Cells positioned outside Excel's grid
    /// (1,048,576 rows by 16,384 columns) are ignored so that a malformed
    /// reference cannot overflow the grid arithmetic.
    ///
    /// # Errors
    ///
    /// Returns an error when the XML reader reports a parse failure.
    fn extract_sheet_content(
        &self,
        xml: &str,
        shared_strings: &[String],
    ) -> Result<(String, ExtractedTable)> {
        use quick_xml::events::Event;

        // Excel's documented worksheet size.
        const MAX_ROWS: usize = 1_048_576;
        const MAX_COLS: usize = 16_384;

        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut cells: Vec<(usize, usize, String)> = Vec::new(); // (row, col, value)
        let mut row_index = 0usize;
        let mut col_index = 0usize;
        let mut cell_type = String::from("str"); // Default to string
        let mut cell_value = String::new();
        let mut in_value = false;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => match e.name().as_ref() {
                    b"c" => {
                        // Start every cell from a clean slate so nothing from
                        // the previous cell can leak into this one.
                        cell_type.clear();
                        cell_type.push_str("str");
                        cell_value.clear();

                        for attr in e.attributes().flatten() {
                            match attr.key.as_ref() {
                                b"r" => {
                                    // Cell reference like "A1", "B2", etc.
                                    let cell_ref = String::from_utf8_lossy(&attr.value);
                                    (col_index, row_index) = self.parse_cell_reference(&cell_ref);
                                }
                                b"t" => {
                                    cell_type = String::from_utf8_lossy(&attr.value).into_owned();
                                }
                                _ => {}
                            }
                        }
                    }
                    b"v" => {
                        in_value = true;
                        cell_value.clear();
                    }
                    _ => {}
                },
                Ok(Event::End(ref e)) => match e.name().as_ref() {
                    b"c" => {
                        let in_grid = row_index < MAX_ROWS && col_index < MAX_COLS;
                        if in_grid && !cell_value.is_empty() {
                            cells.push((row_index, col_index, std::mem::take(&mut cell_value)));
                        }
                    }
                    b"v" => in_value = false,
                    _ => {}
                },
                Ok(Event::Text(e)) if in_value => {
                    let inner = e.into_inner();
                    let text = String::from_utf8_lossy(inner.as_ref());
                    if cell_type == "s" {
                        // Shared string reference
                        let shared = text
                            .parse::<usize>()
                            .ok()
                            .and_then(|index| shared_strings.get(index));
                        if let Some(value) = shared {
                            cell_value = value.clone();
                        }
                    } else {
                        cell_value = text.into_owned();
                    }
                }
                Ok(Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(self.cells_to_table(cells))
    }

    /// Converts an A1-style reference into zero-based `(column, row)` indices.
    ///
    /// Leading ASCII letters form the column in bijective base 26 (`A` = 0,
    /// `Z` = 25, `AA` = 26), case-insensitively; the remainder is parsed as
    /// the 1-based row number. A missing column or an unparsable row maps to
    /// index 0. Only ASCII letters are accepted and the arithmetic saturates,
    /// so arbitrary input never panics.
    fn parse_cell_reference(&self, cell_ref: &str) -> (usize, usize) {
        // Counting ASCII bytes keeps the split on a valid char boundary.
        let letters_len = cell_ref
            .bytes()
            .take_while(u8::is_ascii_alphabetic)
            .count();
        let (letters, digits) = cell_ref.split_at(letters_len);

        let col = letters.bytes().fold(0usize, |acc, byte| {
            let digit = usize::from(byte.to_ascii_uppercase() - b'A') + 1;
            acc.saturating_mul(26).saturating_add(digit)
        });
        let row = digits.parse::<usize>().unwrap_or(0);

        (col.saturating_sub(1), row.saturating_sub(1))
    }

    /// Lays `(row, col, value)` cells out on a dense grid and derives both a
    /// text rendering and an [`ExtractedTable`] from it.
    ///
    /// The grid spans row 0 and column 0 through the largest row and column
    /// present; unfilled positions are empty strings. The first grid row
    /// becomes the table headers and the remaining rows the data rows. The
    /// text rendering has one line per grid row, with that row's non-empty
    /// cells joined by `" | "`; rows with no non-empty cell are omitted.
    /// An empty input yields empty text and an empty table.
    ///
    /// NOTE(review): the grid is dense, so a sparse sheet with one far-away
    /// cell still allocates the full rectangle. Callers handling untrusted
    /// workbooks may want a cell budget here.
    fn cells_to_table(&self, cells: Vec<(usize, usize, String)>) -> (String, ExtractedTable) {
        let location = ContentLocation {
            page: Some(1),
            section: None,
            char_offset: None,
            line: None,
            column: None,
        };

        if cells.is_empty() {
            let empty = ExtractedTable {
                headers: Vec::new(),
                rows: Vec::new(),
                caption: None,
                location,
            };
            return (String::new(), empty);
        }

        // Find dimensions
        let max_row = cells.iter().map(|(r, _, _)| *r).max().unwrap_or(0);
        let max_col = cells.iter().map(|(_, c, _)| *c).max().unwrap_or(0);

        // Create grid; every cell fits by construction of the maxima above.
        let mut grid = vec![vec![String::new(); max_col + 1]; max_row + 1];
        for (row, col, value) in cells {
            grid[row][col] = value;
        }

        // Create the text representation before the grid is taken apart.
        let text = grid
            .iter()
            .map(|row| {
                row.iter()
                    .filter(|cell| !cell.is_empty())
                    .map(String::as_str)
                    .collect::<Vec<_>>()
                    .join(" | ")
            })
            .filter(|row_text| !row_text.is_empty())
            .collect::<Vec<_>>()
            .join("\n");

        // First row is the header, the rest are data rows; moving the rows
        // out of the grid avoids cloning every cell.
        let mut grid_rows = grid.into_iter();
        let headers = grid_rows.next().unwrap_or_default();
        let rows: Vec<Vec<String>> = grid_rows.collect();

        let table = ExtractedTable {
            headers,
            rows,
            caption: None,
            location,
        };

        (text, table)
    }
}