Skip to main content

oxirs_vec/content_processing/
office_handlers.rs

1//! Office document handlers for content processing
2//!
3//! This module provides handlers for Microsoft Office documents (DOCX, PPTX, XLSX).
4
5#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7    ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8    ExtractedTable, FormatHandler, Heading, ProcessingStats,
9};
10#[cfg(feature = "content-processing")]
11use anyhow::{anyhow, Result};
12#[cfg(feature = "content-processing")]
13use std::collections::HashMap;
14
/// Shared extraction helpers for ZIP-based Office documents (DOCX, PPTX, XLSX).
///
/// Every method has a default implementation, so a concrete handler can opt in
/// with an empty `impl OfficeDocumentHandler for MyHandler {}`.
#[cfg(feature = "content-processing")]
pub trait OfficeDocumentHandler {
    /// Opens `data` as a ZIP archive, reads the entry named `main_xml_path`
    /// and returns the text extracted from it by
    /// [`extract_text_from_xml`](Self::extract_text_from_xml).
    ///
    /// # Errors
    ///
    /// Fails when the archive cannot be opened, the entry does not exist,
    /// the entry cannot be extracted, its bytes are not valid UTF-8, or the
    /// XML cannot be parsed.
    fn extract_from_zip(&self, data: &[u8], main_xml_path: &str) -> Result<String> {
        let mut archive = oxiarc_archive::ZipReader::new(std::io::Cursor::new(data))
            .map_err(|e| anyhow!("Failed to open ZIP archive: {}", e))?;

        // Locate the main content part inside the package.
        let entry = archive
            .entry_by_name(main_xml_path)
            .cloned()
            .ok_or_else(|| anyhow!("Main content file not found: {}", main_xml_path))?;

        let bytes = archive.extract(&entry)?;
        let xml = String::from_utf8(bytes)?;

        self.extract_text_from_xml(&xml)
    }

    /// Collects the character data found inside `w:t` (Word text), `a:t`
    /// (PowerPoint text) and `c` (Excel cell) elements of `xml`.
    ///
    /// Each text event is converted with lossy UTF-8 decoding and the
    /// resulting fragments are joined with a single space.
    ///
    /// # Errors
    ///
    /// Returns an error as soon as the XML reader reports a parsing failure.
    fn extract_text_from_xml(&self, xml: &str) -> Result<String> {
        use quick_xml::events::Event;

        /// Element names whose character data counts as document text.
        fn is_text_container(name: &[u8]) -> bool {
            matches!(name, b"w:t" | b"a:t" | b"c")
        }

        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut fragments: Vec<String> = Vec::new();
        let mut in_text = false;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => {
                    if is_text_container(e.name().as_ref()) {
                        in_text = true;
                    }
                }
                Ok(Event::End(ref e)) => {
                    if is_text_container(e.name().as_ref()) {
                        in_text = false;
                    }
                }
                Ok(Event::Text(e)) if in_text => {
                    let raw = e.into_inner();
                    fragments.push(String::from_utf8_lossy(raw.as_ref()).into_owned());
                }
                Ok(Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(fragments.join(" "))
    }

    /// Reads the core properties (`docProps/core.xml`) from the package and
    /// returns them as a string map.
    ///
    /// Recognised elements and the keys they are stored under:
    /// `dc:title` -> `title`, `dc:creator` -> `author`,
    /// `dc:subject` -> `subject`, `dc:description` -> `description`.
    /// A `size` entry holding `data.len()` is always added.
    ///
    /// This is best-effort: an unreadable archive, a missing or non-UTF-8
    /// properties part, or malformed XML never produce an error; whatever was
    /// collected up to that point is returned.
    fn extract_metadata_from_zip(&self, data: &[u8]) -> HashMap<String, String> {
        use quick_xml::events::Event;

        let mut metadata = HashMap::new();

        // Any failure on the way to the properties XML simply yields `None`.
        let core_xml = oxiarc_archive::ZipReader::new(std::io::Cursor::new(data))
            .ok()
            .and_then(|mut archive| {
                let entry = archive.entry_by_name("docProps/core.xml").cloned()?;
                archive.extract(&entry).ok()
            })
            .and_then(|bytes| String::from_utf8(bytes).ok());

        if let Some(content) = core_xml {
            let mut reader = quick_xml::Reader::from_str(&content);
            let mut buf = Vec::new();
            let mut current_element = String::new();

            loop {
                match reader.read_event_into(&mut buf) {
                    Ok(Event::Start(ref e)) => {
                        current_element = String::from_utf8_lossy(e.name().as_ref()).to_string();
                    }
                    // Forget the element once it closes. Without this, the
                    // whitespace that separates elements in a pretty-printed
                    // file is attributed to the element that just ended and
                    // overwrites the value captured for it.
                    Ok(Event::End(_)) => current_element.clear(),
                    Ok(Event::Text(e)) => {
                        let key = match current_element.as_str() {
                            "dc:title" => Some("title"),
                            "dc:creator" => Some("author"),
                            "dc:subject" => Some("subject"),
                            "dc:description" => Some("description"),
                            _ => None,
                        };
                        if let Some(key) = key {
                            let inner = e.into_inner();
                            let text = String::from_utf8_lossy(inner.as_ref());
                            metadata.insert(key.to_string(), text.to_string());
                        }
                    }
                    // Stop on the first parse error instead of polling a
                    // reader that has already failed; keep what was gathered.
                    Ok(Event::Eof) | Err(_) => break,
                    _ => {}
                }
                buf.clear();
            }
        }

        metadata.insert("size".to_string(), data.len().to_string());
        metadata
    }
}
125
/// Handler for Word (`.docx`) documents.
#[cfg(feature = "content-processing")]
pub struct DocxHandler;

#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for DocxHandler {}

#[cfg(feature = "content-processing")]
impl FormatHandler for DocxHandler {
    /// Extracts the body text and core metadata of a DOCX package.
    ///
    /// Text comes from `word/document.xml`; headings are guessed from the
    /// extracted text. Images, tables, links, chunks and language are left
    /// empty, and page/section counts are fixed at 1.
    ///
    /// # Errors
    ///
    /// Propagates any failure from reading `word/document.xml`.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let text = self.extract_from_zip(data, "word/document.xml")?;
        let metadata = self.extract_metadata_from_zip(data);

        let structure = DocumentStructure {
            title: metadata.get("title").cloned(),
            // Proper heading detection would need style analysis.
            headings: self.extract_docx_headings(&text),
            // Page breaks are not analysed.
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Docx,
            text,
            metadata,
            // The word/media folder is not parsed.
            images: Vec::new(),
            // Table XML structures are not parsed.
            tables: Vec::new(),
            // Hyperlink relationships are not parsed.
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` starts with a ZIP signature and the archive
    /// contains both `word/document.xml` and `[Content_Types].xml`.
    fn can_handle(&self, data: &[u8]) -> bool {
        const ZIP_LOCAL_HEADER: [u8; 4] = [0x50, 0x4B, 0x03, 0x04];
        const ZIP_EMPTY_ARCHIVE: [u8; 4] = [0x50, 0x4B, 0x05, 0x06];

        // `starts_with` is false for inputs shorter than the signature.
        let has_zip_signature =
            data.starts_with(&ZIP_LOCAL_HEADER) || data.starts_with(&ZIP_EMPTY_ARCHIVE);
        if !has_zip_signature {
            return false;
        }

        match oxiarc_archive::ZipReader::new(std::io::Cursor::new(data)) {
            Ok(archive) => ["word/document.xml", "[Content_Types].xml"]
                .iter()
                .all(|&name| archive.entry_by_name(name).is_some()),
            Err(_) => false,
        }
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["docx"]
    }
}
194
#[cfg(feature = "content-processing")]
impl DocxHandler {
    /// Guesses headings from already-extracted plain text.
    ///
    /// A line is reported as a level-1 heading when, after trimming, it is
    /// between 4 and 99 bytes long, has at most eight whitespace-separated
    /// words, and starts with an uppercase character. The 1-based line number
    /// is recorded in the heading's location; every other location field is
    /// `None`.
    fn extract_docx_headings(&self, text: &str) -> Vec<Heading> {
        text.lines()
            .enumerate()
            .filter_map(|(index, raw_line)| {
                let candidate = raw_line.trim();

                let length_ok = (4..100).contains(&candidate.len());
                let word_count_ok = (1..=8).contains(&candidate.split_whitespace().count());
                let starts_uppercase = candidate
                    .chars()
                    .next()
                    .map_or(false, |first| first.is_uppercase());

                if !(length_ok && word_count_ok && starts_uppercase) {
                    return None;
                }

                Some(Heading {
                    // Real levels would require style information.
                    level: 1,
                    text: candidate.to_string(),
                    location: ContentLocation {
                        page: None,
                        section: None,
                        char_offset: None,
                        line: Some(index + 1),
                        column: None,
                    },
                })
            })
            .collect()
    }
}
228
/// Handler for PowerPoint (`.pptx`) presentations.
#[cfg(feature = "content-processing")]
pub struct PptxHandler;

#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for PptxHandler {}

#[cfg(feature = "content-processing")]
impl FormatHandler for PptxHandler {
    /// Extracts the text of every slide plus the package's core metadata.
    ///
    /// Slides are the entries whose name starts with `ppt/slides/slide` and
    /// ends with `.xml`, visited in the order the archive lists them. A slide
    /// that cannot be extracted, decoded or parsed is skipped silently. The
    /// slide texts are joined with a blank line, and the number of slides
    /// that produced text is reported as both page and section count.
    ///
    /// # Errors
    ///
    /// Fails only when `data` cannot be opened as a ZIP archive.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let mut archive = oxiarc_archive::ZipReader::new(std::io::Cursor::new(data))
            .map_err(|e| anyhow!("Failed to open PPTX archive: {}", e))?;

        // Collect the slide part names up front so the archive can be
        // borrowed again while extracting.
        let slide_names: Vec<String> = archive
            .entries()
            .iter()
            .map(|entry| entry.name.to_string())
            .filter(|name: &String| name.starts_with("ppt/slides/slide") && name.ends_with(".xml"))
            .collect();

        let mut slide_texts = Vec::new();
        for slide_name in &slide_names {
            let entry = archive.entry_by_name(slide_name).cloned();
            let slide_text = entry
                .and_then(|entry| archive.extract(&entry).ok())
                .and_then(|bytes| String::from_utf8(bytes).ok())
                .and_then(|xml| self.extract_text_from_xml(&xml).ok());

            if let Some(slide_text) = slide_text {
                slide_texts.push(slide_text);
            }
        }

        let slide_count = slide_texts.len();
        let text = slide_texts.join("\n\n");
        let metadata = self.extract_metadata_from_zip(data);

        let structure = DocumentStructure {
            title: metadata.get("title").cloned(),
            // Slide titles are not extracted as headings.
            headings: Vec::new(),
            // Each slide counts as one "page" and one section.
            page_count: slide_count,
            section_count: slide_count,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Pptx,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` starts with a ZIP signature and the archive
    /// contains both `ppt/presentation.xml` and `[Content_Types].xml`.
    fn can_handle(&self, data: &[u8]) -> bool {
        const ZIP_LOCAL_HEADER: [u8; 4] = [0x50, 0x4B, 0x03, 0x04];
        const ZIP_EMPTY_ARCHIVE: [u8; 4] = [0x50, 0x4B, 0x05, 0x06];

        // `starts_with` is false for inputs shorter than the signature.
        let has_zip_signature =
            data.starts_with(&ZIP_LOCAL_HEADER) || data.starts_with(&ZIP_EMPTY_ARCHIVE);
        if !has_zip_signature {
            return false;
        }

        match oxiarc_archive::ZipReader::new(std::io::Cursor::new(data)) {
            Ok(archive) => ["ppt/presentation.xml", "[Content_Types].xml"]
                .iter()
                .all(|&name| archive.entry_by_name(name).is_some()),
            Err(_) => false,
        }
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["pptx"]
    }
}
320
/// Handler for Excel (`.xlsx`) workbooks.
#[cfg(feature = "content-processing")]
pub struct XlsxHandler;

#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for XlsxHandler {}

#[cfg(feature = "content-processing")]
impl FormatHandler for XlsxHandler {
    /// Extracts worksheet text, tables and core metadata from an XLSX package.
    ///
    /// The shared-string table is loaded first so that cells referring to it
    /// can be resolved while the worksheets are read. Page and section counts
    /// are fixed at 1; images, links, headings, chunks and language are left
    /// empty.
    ///
    /// # Errors
    ///
    /// Fails when the archive cannot be opened or when reading the shared
    /// strings or the worksheets fails.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let mut archive = oxiarc_archive::ZipReader::new(std::io::Cursor::new(data))
            .map_err(|e| anyhow!("Failed to open XLSX archive: {}", e))?;

        let shared_strings = self.extract_shared_strings(&mut archive)?;
        let (text, tables) = self.extract_worksheets(&mut archive, &shared_strings, config)?;

        let metadata = self.extract_metadata_from_zip(data);
        let structure = DocumentStructure {
            title: metadata.get("title").cloned(),
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Xlsx,
            text,
            metadata,
            images: Vec::new(),
            tables,
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` starts with a ZIP signature and the archive
    /// contains both `xl/workbook.xml` and `[Content_Types].xml`.
    fn can_handle(&self, data: &[u8]) -> bool {
        const ZIP_LOCAL_HEADER: [u8; 4] = [0x50, 0x4B, 0x03, 0x04];
        const ZIP_EMPTY_ARCHIVE: [u8; 4] = [0x50, 0x4B, 0x05, 0x06];

        // `starts_with` is false for inputs shorter than the signature.
        let has_zip_signature =
            data.starts_with(&ZIP_LOCAL_HEADER) || data.starts_with(&ZIP_EMPTY_ARCHIVE);
        if !has_zip_signature {
            return false;
        }

        match oxiarc_archive::ZipReader::new(std::io::Cursor::new(data)) {
            Ok(archive) => ["xl/workbook.xml", "[Content_Types].xml"]
                .iter()
                .all(|&name| archive.entry_by_name(name).is_some()),
            Err(_) => false,
        }
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["xlsx"]
    }
}
394
#[cfg(feature = "content-processing")]
impl XlsxHandler {
    /// Loads the workbook's shared-string table (`xl/sharedStrings.xml`).
    ///
    /// One string is produced per `<si>` item, in document order, so that the
    /// numeric indices stored in worksheet cells line up with the returned
    /// vector. All `<t>` runs inside one item are concatenated; text that
    /// belongs to phonetic runs (`<rPh>`) is ignored. A workbook without a
    /// shared-string part yields an empty vector.
    ///
    /// # Errors
    ///
    /// Fails when the part exists but cannot be extracted, is not valid
    /// UTF-8, or contains malformed XML.
    fn extract_shared_strings(
        &self,
        archive: &mut oxiarc_archive::ZipReader<std::io::Cursor<&[u8]>>,
    ) -> Result<Vec<String>> {
        use quick_xml::events::Event;

        let mut shared_strings = Vec::new();

        if let Some(entry) = archive.entry_by_name("xl/sharedStrings.xml").cloned() {
            let data = archive
                .extract(&entry)
                .map_err(|e| anyhow!("Failed to extract shared strings: {}", e))?;
            let content = String::from_utf8(data)
                .map_err(|e| anyhow!("Failed to read shared strings: {}", e))?;

            let mut reader = quick_xml::Reader::from_str(&content);
            let mut buf = Vec::new();
            let mut in_text = false;
            let mut in_phonetic = false;
            let mut current_string = String::new();

            loop {
                match reader.read_event_into(&mut buf) {
                    Ok(Event::Start(ref e)) => match e.name().as_ref() {
                        // A new item starts from a clean slate. The buffer is
                        // deliberately NOT cleared on `<t>`: a rich-text item
                        // holds several runs whose text must be concatenated.
                        b"si" => current_string.clear(),
                        b"rPh" => in_phonetic = true,
                        b"t" => in_text = true,
                        _ => {}
                    },
                    Ok(Event::Empty(ref e)) => {
                        // A self-closing `<si/>` still occupies an index.
                        if e.name().as_ref() == b"si" {
                            shared_strings.push(String::new());
                        }
                    }
                    Ok(Event::End(ref e)) => match e.name().as_ref() {
                        b"t" => in_text = false,
                        b"rPh" => in_phonetic = false,
                        b"si" => shared_strings.push(std::mem::take(&mut current_string)),
                        _ => {}
                    },
                    Ok(Event::Text(e)) if in_text && !in_phonetic => {
                        let inner = e.into_inner();
                        let text = String::from_utf8_lossy(inner.as_ref());
                        current_string.push_str(&text);
                    }
                    Ok(Event::Eof) => break,
                    Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                    _ => {}
                }
                buf.clear();
            }
        }

        Ok(shared_strings)
    }

    /// Reads every worksheet part and returns the combined text together
    /// with one table per non-empty sheet.
    ///
    /// Worksheets are the entries whose name starts with
    /// `xl/worksheets/sheet` and ends with `.xml`, visited in the order the
    /// archive lists them. A sheet that cannot be extracted or decoded is
    /// skipped. Tables are only collected when `config.extract_tables` is
    /// set; each is captioned `Sheet N`, where `N` is the 1-based position of
    /// the sheet in that listing.
    ///
    /// # Errors
    ///
    /// Fails when a worksheet's XML cannot be parsed.
    fn extract_worksheets(
        &self,
        archive: &mut oxiarc_archive::ZipReader<std::io::Cursor<&[u8]>>,
        shared_strings: &[String],
        config: &ContentExtractionConfig,
    ) -> Result<(String, Vec<ExtractedTable>)> {
        let sheet_names: Vec<String> = archive
            .entries()
            .iter()
            .map(|entry| entry.name.to_string())
            .filter(|name: &String| {
                name.starts_with("xl/worksheets/sheet") && name.ends_with(".xml")
            })
            .collect();

        let mut sheet_texts = Vec::with_capacity(sheet_names.len());
        let mut tables = Vec::new();

        for (sheet_index, sheet_name) in sheet_names.iter().enumerate() {
            let entry = archive.entry_by_name(sheet_name).cloned();
            let content = entry
                .and_then(|entry| archive.extract(&entry).ok())
                .and_then(|bytes| String::from_utf8(bytes).ok());
            let content = match content {
                Some(content) => content,
                None => continue,
            };

            let (sheet_text, mut table) = self.extract_sheet_content(&content, shared_strings)?;
            sheet_texts.push(sheet_text);

            if config.extract_tables && !table.rows.is_empty() {
                table.caption = Some(format!("Sheet {}", sheet_index + 1));
                tables.push(table);
            }
        }

        Ok((sheet_texts.join("\n\n"), tables))
    }

    /// Parses one worksheet's XML into its text representation and a table.
    ///
    /// For every `<c>` element the `r` attribute gives the cell position and
    /// the `t` attribute its type (treated as `"str"` when absent). The text
    /// of the nested `<v>` element is the value; for type `"s"` it is an
    /// index into `shared_strings`, and an unparsable or out-of-range index
    /// leaves the cell empty. Cells that end up empty are dropped.
    ///
    /// # Errors
    ///
    /// Fails when the XML reader reports a parsing error.
    fn extract_sheet_content(
        &self,
        xml: &str,
        shared_strings: &[String],
    ) -> Result<(String, ExtractedTable)> {
        use quick_xml::events::Event;

        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut cells: Vec<(usize, usize, String)> = Vec::new(); // (row, col, value)
        let mut cell_value = String::new();
        let mut in_value = false;
        let mut cell_type = String::from("str"); // Default to string
        let mut row_index = 0usize;
        let mut col_index = 0usize;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => match e.name().as_ref() {
                    b"c" => {
                        for attr in e.attributes().flatten() {
                            match attr.key.as_ref() {
                                b"r" => {
                                    // Cell reference such as "A1" or "B2".
                                    let cell_ref = String::from_utf8_lossy(&attr.value);
                                    (col_index, row_index) = self.parse_cell_reference(&cell_ref);
                                }
                                b"t" => {
                                    cell_type = String::from_utf8_lossy(&attr.value).to_string();
                                }
                                _ => {}
                            }
                        }
                    }
                    b"v" => {
                        in_value = true;
                        cell_value.clear();
                    }
                    _ => {}
                },
                Ok(Event::End(ref e)) => match e.name().as_ref() {
                    b"c" => {
                        if !cell_value.is_empty() {
                            // Taking the value leaves the buffer empty, so a
                            // following cell without `<v>` cannot re-emit it.
                            cells.push((row_index, col_index, std::mem::take(&mut cell_value)));
                        }
                        // Reset for the next cell.
                        cell_type = String::from("str");
                    }
                    b"v" => in_value = false,
                    _ => {}
                },
                Ok(Event::Text(e)) if in_value => {
                    let inner = e.into_inner();
                    let text = String::from_utf8_lossy(inner.as_ref());
                    if cell_type == "s" {
                        // Shared string reference; `get` covers the bounds check.
                        let resolved = text
                            .trim()
                            .parse::<usize>()
                            .ok()
                            .and_then(|index| shared_strings.get(index));
                        if let Some(shared) = resolved {
                            cell_value.clone_from(shared);
                        }
                    } else {
                        cell_value = text.into_owned();
                    }
                }
                Ok(Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(self.cells_to_table(cells))
    }

    /// Converts an A1-style reference into zero-based `(column, row)` indices.
    ///
    /// Leading ASCII letters form the column in bijective base 26
    /// (`A` -> 0, `Z` -> 25, `AA` -> 26); the remainder is parsed as the
    /// 1-based row number. A missing, unparsable or overflowing component
    /// maps to index 0. Only ASCII letters are accepted as column letters, so
    /// non-ASCII input can neither underflow the letter arithmetic nor split
    /// the string inside a multi-byte character.
    fn parse_cell_reference(&self, cell_ref: &str) -> (usize, usize) {
        let letters_len = cell_ref
            .bytes()
            .take_while(u8::is_ascii_alphabetic)
            .count();
        // Every byte before `letters_len` is ASCII, so this is a char boundary.
        let (letters, digits) = cell_ref.split_at(letters_len);

        let col = letters
            .bytes()
            .try_fold(0usize, |acc, letter| {
                let digit = usize::from(letter.to_ascii_uppercase() - b'A') + 1;
                acc.checked_mul(26)?.checked_add(digit)
            })
            .unwrap_or(0);
        let row = digits.parse::<usize>().unwrap_or(0);

        (col.saturating_sub(1), row.saturating_sub(1))
    }

    /// Arranges `(row, col, value)` cells into a dense grid and derives both a
    /// plain-text rendering and an [`ExtractedTable`] from it.
    ///
    /// The first grid row becomes the table headers and the remaining rows
    /// the data rows. In the text rendering each row's non-empty cells are
    /// joined with `" | "`, rows are separated by newlines, and rows without
    /// any value are omitted. An empty input yields an empty string and an
    /// empty table. The table location always reports page 1.
    fn cells_to_table(&self, cells: Vec<(usize, usize, String)>) -> (String, ExtractedTable) {
        let location = ContentLocation {
            page: Some(1),
            section: None,
            char_offset: None,
            line: None,
            column: None,
        };

        if cells.is_empty() {
            let empty_table = ExtractedTable {
                headers: Vec::new(),
                rows: Vec::new(),
                caption: None,
                location,
            };
            return (String::new(), empty_table);
        }

        // Find dimensions.
        let max_row = cells.iter().map(|(r, _, _)| *r).max().unwrap_or(0);
        let max_col = cells.iter().map(|(_, c, _)| *c).max().unwrap_or(0);

        // NOTE(review): the grid is dense, so a single cell placed far from
        // the origin makes this allocation very large — consider bounding the
        // dimensions for untrusted workbooks.
        let mut grid = vec![vec![String::new(); max_col + 1]; max_row + 1];
        for (row, col, value) in cells {
            // In range by construction: the grid was sized from the maxima.
            grid[row][col] = value;
        }

        // Text representation: one line per row that has at least one value.
        let text = grid
            .iter()
            .map(|row| {
                row.iter()
                    .filter(|cell| !cell.is_empty())
                    .map(String::as_str)
                    .collect::<Vec<_>>()
                    .join(" | ")
            })
            .filter(|line| !line.is_empty())
            .collect::<Vec<_>>()
            .join("\n");

        // First row is the header row, everything after it is data.
        let mut grid_rows = grid.into_iter();
        let headers = grid_rows.next().unwrap_or_default();
        let rows: Vec<Vec<String>> = grid_rows.collect();

        let table = ExtractedTable {
            headers,
            rows,
            caption: None,
            location,
        };

        (text, table)
    }
}