oxirs_vec/content_processing/
office_handlers.rs

//! Office document handlers for content processing
//!
//! This module provides handlers for Microsoft Office documents (DOCX, PPTX, XLSX).
4
5#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7    ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8    ExtractedTable, FormatHandler, Heading, ProcessingStats,
9};
10#[cfg(feature = "content-processing")]
11use anyhow::{anyhow, Result};
12#[cfg(feature = "content-processing")]
13use std::collections::HashMap;
14
/// Base handler for Office documents (DOCX, PPTX, XLSX)
///
/// All three formats are handled here as ZIP archives whose parts are XML
/// files. The default methods below implement the shared steps: opening the
/// archive, pulling text out of one XML part, and reading the core document
/// properties.
#[cfg(feature = "content-processing")]
pub trait OfficeDocumentHandler {
    /// Reads the archive entry `main_xml_path` from the ZIP container in `data`
    /// and returns the text extracted from it by
    /// [`extract_text_from_xml`](Self::extract_text_from_xml).
    ///
    /// # Errors
    ///
    /// Fails when `data` is not a readable ZIP archive, when the entry does not
    /// exist, when the entry cannot be read as UTF-8 text, or when its XML
    /// cannot be parsed.
    fn extract_from_zip(&self, data: &[u8], main_xml_path: &str) -> Result<String> {
        let cursor = std::io::Cursor::new(data);
        let mut archive = zip::ZipArchive::new(cursor)
            .map_err(|e| anyhow!("Failed to open ZIP archive: {}", e))?;

        // Locate the part that carries the main document content.
        let file = archive
            .by_name(main_xml_path)
            .map_err(|e| anyhow!("Main content file not found: {}", e))?;

        let content =
            std::io::read_to_string(file).map_err(|e| anyhow!("Failed to read content: {}", e))?;

        self.extract_text_from_xml(&content)
    }

    /// Collects the character data found between the start and end tags of
    /// `w:t` (Word text), `a:t` (PowerPoint text) and `c` (Excel cell) elements
    /// and joins the fragments with a single space.
    ///
    /// The raw bytes of each text event are converted with lossy UTF-8 decoding.
    ///
    /// # Errors
    ///
    /// Returns an error as soon as the XML reader reports a parsing failure.
    fn extract_text_from_xml(&self, xml: &str) -> Result<String> {
        use quick_xml::events::Event;

        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut text_content = Vec::new();
        // True while the reader is between the start and end tag of a text-bearing element.
        let mut in_text = false;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => {
                    if matches!(e.name().as_ref(), b"w:t" | b"a:t" | b"c") {
                        in_text = true;
                    }
                }
                Ok(Event::End(ref e)) => {
                    if matches!(e.name().as_ref(), b"w:t" | b"a:t" | b"c") {
                        in_text = false;
                    }
                }
                Ok(Event::Text(e)) if in_text => {
                    let inner = e.into_inner();
                    text_content.push(String::from_utf8_lossy(inner.as_ref()).into_owned());
                }
                Ok(Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(text_content.join(" "))
    }

    /// Builds a metadata map for the document in `data`.
    ///
    /// Extraction is best effort: when the archive or `docProps/core.xml`
    /// cannot be read, or the XML is malformed, whatever was gathered so far is
    /// kept and no error is reported. The returned map always contains `size`
    /// (byte length of `data`) and may contain `title`, `author`, `subject` and
    /// `description`, taken from the `dc:title`, `dc:creator`, `dc:subject` and
    /// `dc:description` elements.
    fn extract_metadata_from_zip(&self, data: &[u8]) -> HashMap<String, String> {
        use quick_xml::events::Event;

        let mut metadata = HashMap::new();

        let cursor = std::io::Cursor::new(data);
        if let Ok(mut archive) = zip::ZipArchive::new(cursor) {
            // Try to read core properties
            if let Ok(file) = archive.by_name("docProps/core.xml") {
                if let Ok(content) = std::io::read_to_string(file) {
                    let mut reader = quick_xml::Reader::from_str(&content);
                    let mut buf = Vec::new();
                    // Name of the element whose start tag was seen most recently
                    // and has not been closed yet; empty between elements.
                    let mut current_element = String::new();

                    loop {
                        match reader.read_event_into(&mut buf) {
                            Ok(Event::Start(ref e)) => {
                                current_element =
                                    String::from_utf8_lossy(e.name().as_ref()).to_string();
                            }
                            // Forget the element once it closes. Otherwise the
                            // indentation whitespace that follows `</dc:title>`
                            // in a pretty-printed file would be stored as the
                            // title, replacing the real value.
                            Ok(Event::End(_)) => current_element.clear(),
                            Ok(Event::Text(e)) => {
                                let key = match current_element.as_str() {
                                    "dc:title" => Some("title"),
                                    "dc:creator" => Some("author"),
                                    "dc:subject" => Some("subject"),
                                    "dc:description" => Some("description"),
                                    _ => None,
                                };
                                if let Some(key) = key {
                                    let inner = e.into_inner();
                                    metadata.insert(
                                        key.to_string(),
                                        String::from_utf8_lossy(inner.as_ref()).into_owned(),
                                    );
                                }
                            }
                            // Metadata is optional: on malformed XML keep what was
                            // collected instead of polling a failed reader again.
                            Ok(Event::Eof) | Err(_) => break,
                            _ => {}
                        }
                        buf.clear();
                    }
                }
            }
        }

        metadata.insert("size".to_string(), data.len().to_string());
        metadata
    }
}
119
/// DOCX document handler
///
/// Stateless unit type; the standard derives make it printable in diagnostics
/// and trivially constructible and copyable.
#[cfg(feature = "content-processing")]
#[derive(Debug, Clone, Copy, Default)]
pub struct DocxHandler;

// DOCX relies on the default ZIP/XML helper methods of `OfficeDocumentHandler` unchanged.
#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for DocxHandler {}
126
#[cfg(feature = "content-processing")]
impl FormatHandler for DocxHandler {
    /// Extracts text, core metadata and heuristic headings from a DOCX file.
    ///
    /// Images, tables and links are not extracted; page and section counts are
    /// fixed at 1. Fails when `word/document.xml` cannot be read or parsed.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let text = self.extract_from_zip(data, "word/document.xml")?;
        let metadata = self.extract_metadata_from_zip(data);

        let structure = DocumentStructure {
            title: metadata.get("title").cloned(),
            // Heuristic only: proper detection would need paragraph style analysis.
            headings: self.extract_docx_headings(&text),
            page_count: 1, // Would need to analyze page breaks
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Docx,
            text,
            metadata,
            images: Vec::new(), // Would require parsing word/media folder
            tables: Vec::new(), // Would require parsing table XML structures
            links: Vec::new(),  // Would require parsing hyperlink relationships
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` starts with a ZIP signature and the archive
    /// contains both `word/document.xml` and `[Content_Types].xml`.
    fn can_handle(&self, data: &[u8]) -> bool {
        const LOCAL_FILE_HEADER: [u8; 4] = [0x50, 0x4B, 0x03, 0x04];
        const END_OF_CENTRAL_DIRECTORY: [u8; 4] = [0x50, 0x4B, 0x05, 0x06];

        // `starts_with` is false for inputs shorter than the signature.
        let looks_like_zip =
            data.starts_with(&LOCAL_FILE_HEADER) || data.starts_with(&END_OF_CENTRAL_DIRECTORY);
        if !looks_like_zip {
            return false;
        }

        match zip::ZipArchive::new(std::io::Cursor::new(data)) {
            Ok(mut archive) => {
                let mut has_entry = |name: &str| archive.by_name(name).is_ok();
                has_entry("word/document.xml") && has_entry("[Content_Types].xml")
            }
            Err(_) => false,
        }
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["docx"]
    }
}
188
#[cfg(feature = "content-processing")]
impl DocxHandler {
    /// Guesses headings from already-extracted plain text.
    ///
    /// A line is reported as a level-1 heading when, after trimming, it is
    /// between 4 and 99 bytes long, has at most eight whitespace-separated
    /// words and starts with an uppercase character. The heading location
    /// carries only the 1-based line number.
    fn extract_docx_headings(&self, text: &str) -> Vec<Heading> {
        text.lines()
            .enumerate()
            .filter_map(|(index, raw_line)| {
                let candidate = raw_line.trim();

                let plausible_length = candidate.len() > 3 && candidate.len() < 100;
                if !plausible_length {
                    return None;
                }

                let word_count = candidate.split_whitespace().count();
                if word_count == 0 || word_count > 8 {
                    return None;
                }

                let starts_uppercase = candidate.chars().next().unwrap_or(' ').is_uppercase();
                if !starts_uppercase {
                    return None;
                }

                Some(Heading {
                    level: 1, // Would need style information for proper level detection
                    text: candidate.to_string(),
                    location: ContentLocation {
                        page: None,
                        section: None,
                        char_offset: None,
                        line: Some(index + 1),
                        column: None,
                    },
                })
            })
            .collect()
    }
}
222
/// PPTX document handler
///
/// Stateless unit type; the standard derives make it printable in diagnostics
/// and trivially constructible and copyable.
#[cfg(feature = "content-processing")]
#[derive(Debug, Clone, Copy, Default)]
pub struct PptxHandler;

// PPTX relies on the default ZIP/XML helper methods of `OfficeDocumentHandler` unchanged.
#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for PptxHandler {}
229
#[cfg(feature = "content-processing")]
impl FormatHandler for PptxHandler {
    /// Extracts the text of every slide plus the core metadata of a PPTX file.
    ///
    /// Slides are the archive entries named `ppt/slides/slide<N>.xml`. They are
    /// processed in ascending `<N>` order rather than in ZIP directory order, so
    /// the resulting text is deterministic and `slide10` does not precede
    /// `slide2`. NOTE(review): the authoritative slide order lives in
    /// `ppt/presentation.xml`; file numbering is used here as an approximation.
    ///
    /// Slides that cannot be read or parsed are skipped. Slide texts are
    /// separated by a blank line, and the page and section counts equal the
    /// number of slides that were extracted successfully.
    ///
    /// # Errors
    ///
    /// Fails only when `data` cannot be opened as a ZIP archive.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        const SLIDE_PREFIX: &str = "ppt/slides/slide";
        const SLIDE_SUFFIX: &str = ".xml";

        let cursor = std::io::Cursor::new(data);
        let mut archive = zip::ZipArchive::new(cursor)
            .map_err(|e| anyhow!("Failed to open PPTX archive: {}", e))?;

        // Find all slide files
        let mut slide_names: Vec<String> = (0..archive.len())
            .filter_map(|i| {
                archive
                    .by_index(i)
                    .ok()
                    .map(|file| file.name().to_string())
                    .filter(|name| name.starts_with(SLIDE_PREFIX) && name.ends_with(SLIDE_SUFFIX))
            })
            .collect();

        // Order by slide number; entries without a numeric suffix go last,
        // ordered by name so the result stays deterministic.
        slide_names.sort_by_cached_key(|name| {
            let number = name
                .strip_prefix(SLIDE_PREFIX)
                .and_then(|rest| rest.strip_suffix(SLIDE_SUFFIX))
                .and_then(|digits| digits.parse::<usize>().ok());
            (number.unwrap_or(usize::MAX), name.clone())
        });

        // Extract text from all slides (best effort per slide)
        let mut all_text = Vec::new();
        for slide_name in &slide_names {
            if let Ok(file) = archive.by_name(slide_name) {
                if let Ok(content) = std::io::read_to_string(file) {
                    if let Ok(slide_text) = self.extract_text_from_xml(&content) {
                        all_text.push(slide_text);
                    }
                }
            }
        }

        let text = all_text.join("\n\n");
        let metadata = self.extract_metadata_from_zip(data);
        let title = metadata.get("title").cloned();

        Ok(ExtractedContent {
            format: DocumentFormat::Pptx,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure: DocumentStructure {
                title,
                headings: Vec::new(), // Would extract slide titles as headings
                page_count: all_text.len(), // Each slide is a "page"
                section_count: all_text.len(),
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` starts with a ZIP signature and the archive
    /// contains both `ppt/presentation.xml` and `[Content_Types].xml`.
    fn can_handle(&self, data: &[u8]) -> bool {
        if data.len() < 4 {
            return false;
        }

        // Check for ZIP signature
        if data[0..4] != [0x50, 0x4B, 0x03, 0x04] && data[0..4] != [0x50, 0x4B, 0x05, 0x06] {
            return false;
        }

        // Check if it contains PPTX-specific files
        let cursor = std::io::Cursor::new(data);
        if let Ok(mut archive) = zip::ZipArchive::new(cursor) {
            archive.by_name("ppt/presentation.xml").is_ok()
                && archive.by_name("[Content_Types].xml").is_ok()
        } else {
            false
        }
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["pptx"]
    }
}
315
/// XLSX document handler
///
/// Stateless unit type; the standard derives make it printable in diagnostics
/// and trivially constructible and copyable.
#[cfg(feature = "content-processing")]
#[derive(Debug, Clone, Copy, Default)]
pub struct XlsxHandler;

// XLSX relies on the default ZIP/XML helper methods of `OfficeDocumentHandler` unchanged.
#[cfg(feature = "content-processing")]
impl OfficeDocumentHandler for XlsxHandler {}
322
#[cfg(feature = "content-processing")]
impl FormatHandler for XlsxHandler {
    /// Extracts worksheet text, tables and core metadata from an XLSX file.
    ///
    /// The shared string table is read first because worksheet cells may refer
    /// to it by index. Fails when the archive cannot be opened or when the
    /// shared strings or a worksheet cannot be parsed.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let mut archive = zip::ZipArchive::new(std::io::Cursor::new(data))
            .map_err(|e| anyhow!("Failed to open XLSX archive: {}", e))?;

        let shared_strings = self.extract_shared_strings(&mut archive)?;
        let (text, tables) = self.extract_worksheets(&mut archive, &shared_strings, config)?;

        let metadata = self.extract_metadata_from_zip(data);
        let structure = DocumentStructure {
            title: metadata.get("title").cloned(),
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Xlsx,
            text,
            metadata,
            images: Vec::new(),
            tables,
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` starts with a ZIP signature and the archive
    /// contains both `xl/workbook.xml` and `[Content_Types].xml`.
    fn can_handle(&self, data: &[u8]) -> bool {
        const LOCAL_FILE_HEADER: [u8; 4] = [0x50, 0x4B, 0x03, 0x04];
        const END_OF_CENTRAL_DIRECTORY: [u8; 4] = [0x50, 0x4B, 0x05, 0x06];

        // `starts_with` is false for inputs shorter than the signature.
        let looks_like_zip =
            data.starts_with(&LOCAL_FILE_HEADER) || data.starts_with(&END_OF_CENTRAL_DIRECTORY);
        if !looks_like_zip {
            return false;
        }

        match zip::ZipArchive::new(std::io::Cursor::new(data)) {
            Ok(mut archive) => {
                let mut has_entry = |name: &str| archive.by_name(name).is_ok();
                has_entry("xl/workbook.xml") && has_entry("[Content_Types].xml")
            }
            Err(_) => false,
        }
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["xlsx"]
    }
}
389
#[cfg(feature = "content-processing")]
impl XlsxHandler {
    /// Number of rows a worksheet can address (last row is 1,048,576).
    const MAX_ROWS: usize = 1_048_576;
    /// Number of columns a worksheet can address (last column is `XFD`).
    const MAX_COLS: usize = 16_384;

    /// Reads the shared string table (`xl/sharedStrings.xml`).
    ///
    /// Returns one entry per `<si>` item, in document order, so that a cell's
    /// shared-string index can be used directly as a position in the result.
    /// The text of all `<t>` runs of an item is concatenated (rich text is
    /// split across several runs); phonetic runs (`<rPh>`) are skipped because
    /// they hold a reading aid, not cell content. An empty `<si/>` yields an
    /// empty string so later indices stay aligned. A missing part yields an
    /// empty table.
    ///
    /// # Errors
    ///
    /// Fails when the part exists but cannot be read or parsed.
    fn extract_shared_strings(
        &self,
        archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
    ) -> Result<Vec<String>> {
        use quick_xml::events::Event;

        let mut shared_strings = Vec::new();

        let content = match archive.by_name("xl/sharedStrings.xml") {
            Ok(file) => std::io::read_to_string(file)
                .map_err(|e| anyhow!("Failed to read shared strings: {}", e))?,
            // Workbooks without any string cell have no shared string part.
            Err(_) => return Ok(shared_strings),
        };

        let mut reader = quick_xml::Reader::from_str(&content);
        let mut buf = Vec::new();
        let mut current_string = String::new();
        let mut in_text = false;
        let mut in_phonetic = false;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => match e.name().as_ref() {
                    b"si" => current_string.clear(),
                    b"t" => in_text = true,
                    b"rPh" => in_phonetic = true,
                    _ => {}
                },
                Ok(Event::End(ref e)) => match e.name().as_ref() {
                    b"t" => in_text = false,
                    b"rPh" => in_phonetic = false,
                    b"si" => shared_strings.push(std::mem::take(&mut current_string)),
                    _ => {}
                },
                // A self-closing item still occupies an index.
                Ok(Event::Empty(ref e)) if e.name().as_ref() == b"si" => {
                    shared_strings.push(String::new());
                }
                Ok(Event::Text(e)) if in_text && !in_phonetic => {
                    let inner = e.into_inner();
                    current_string.push_str(&String::from_utf8_lossy(inner.as_ref()));
                }
                Ok(Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        Ok(shared_strings)
    }

    /// Extracts every worksheet (`xl/worksheets/sheet<N>.xml`) of the workbook.
    ///
    /// Worksheets are processed in ascending `<N>` order rather than in ZIP
    /// directory order, so the "Sheet N" caption is deterministic and follows
    /// the file numbering. Returns the sheet texts joined by a blank line and,
    /// when `config.extract_tables` is set, one table per sheet that has data
    /// rows. Sheets that cannot be read from the archive are skipped.
    ///
    /// # Errors
    ///
    /// Fails when a worksheet's XML cannot be parsed.
    fn extract_worksheets(
        &self,
        archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
        shared_strings: &[String],
        config: &ContentExtractionConfig,
    ) -> Result<(String, Vec<ExtractedTable>)> {
        let mut all_text = Vec::new();
        let mut tables = Vec::new();

        // Find all worksheet files
        let mut sheet_names: Vec<String> = (0..archive.len())
            .filter_map(|i| {
                archive
                    .by_index(i)
                    .ok()
                    .map(|file| file.name().to_string())
                    .filter(|name| {
                        name.starts_with("xl/worksheets/sheet") && name.ends_with(".xml")
                    })
            })
            .collect();

        // Numbered sheets first in numeric order, anything else afterwards by name.
        sheet_names.sort_by_cached_key(|name| {
            (
                Self::sheet_number(name).unwrap_or(usize::MAX),
                name.clone(),
            )
        });

        for (sheet_index, sheet_name) in sheet_names.iter().enumerate() {
            if let Ok(file) = archive.by_name(sheet_name) {
                if let Ok(content) = std::io::read_to_string(file) {
                    let (sheet_text, sheet_table) =
                        self.extract_sheet_content(&content, shared_strings)?;
                    all_text.push(sheet_text);

                    if config.extract_tables && !sheet_table.rows.is_empty() {
                        let mut table = sheet_table;
                        table.caption = Some(format!("Sheet {}", sheet_index + 1));
                        tables.push(table);
                    }
                }
            }
        }

        Ok((all_text.join("\n\n"), tables))
    }

    /// Returns `N` for an archive entry named `xl/worksheets/sheet<N>.xml`.
    fn sheet_number(name: &str) -> Option<usize> {
        name.strip_prefix("xl/worksheets/sheet")?
            .strip_suffix(".xml")?
            .parse()
            .ok()
    }

    /// Parses one worksheet and returns its text rendering and table.
    ///
    /// For every `<c>` cell the value is taken from `<v>` (resolved through
    /// `shared_strings` when the cell type is `s`) or from an inline string
    /// (`<is>` with `<t>` runs). Cells without a value, with an unresolvable
    /// shared-string index, or with a reference outside the worksheet limits
    /// are dropped.
    ///
    /// # Errors
    ///
    /// Fails when the XML reader reports a parsing error.
    fn extract_sheet_content(
        &self,
        xml: &str,
        shared_strings: &[String],
    ) -> Result<(String, ExtractedTable)> {
        use quick_xml::events::Event;

        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut cells: Vec<(usize, usize, String)> = Vec::new(); // (row, col, value)

        // Position of the cell being read; kept from the previous cell when a
        // `<c>` carries no `r` attribute.
        let mut row_index = 0usize;
        let mut col_index = 0usize;
        // Per-cell state, reset at every `<c>` start tag so that a cell without
        // a value can never re-emit the value of the cell before it.
        let mut cell_value = String::new();
        let mut is_shared_ref = false;
        let mut in_value = false;
        let mut in_inline_string = false;
        let mut in_inline_text = false;
        let mut in_phonetic = false;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => match e.name().as_ref() {
                    b"c" => {
                        cell_value.clear();
                        is_shared_ref = false;
                        for attr in e.attributes().flatten() {
                            match attr.key.as_ref() {
                                b"r" => {
                                    // Cell reference like "A1", "B2", etc.
                                    let cell_ref = String::from_utf8_lossy(&attr.value);
                                    (col_index, row_index) = self.parse_cell_reference(&cell_ref);
                                }
                                // Type "s" marks the value as a shared-string index.
                                b"t" => is_shared_ref = &*attr.value == b"s",
                                _ => {}
                            }
                        }
                    }
                    b"v" => {
                        in_value = true;
                        cell_value.clear();
                    }
                    b"is" => in_inline_string = true,
                    b"t" if in_inline_string => in_inline_text = true,
                    b"rPh" if in_inline_string => in_phonetic = true,
                    _ => {}
                },
                Ok(Event::End(ref e)) => match e.name().as_ref() {
                    b"c" => {
                        let in_bounds = row_index < Self::MAX_ROWS && col_index < Self::MAX_COLS;
                        if in_bounds && !cell_value.is_empty() {
                            cells.push((row_index, col_index, std::mem::take(&mut cell_value)));
                        }
                        in_value = false;
                        in_inline_string = false;
                        in_inline_text = false;
                        in_phonetic = false;
                    }
                    b"v" => in_value = false,
                    b"is" => in_inline_string = false,
                    b"t" => in_inline_text = false,
                    b"rPh" => in_phonetic = false,
                    _ => {}
                },
                Ok(Event::Text(e)) => {
                    let inner = e.into_inner();
                    let text = String::from_utf8_lossy(inner.as_ref());
                    if in_value {
                        if is_shared_ref {
                            // Shared string reference
                            let resolved = text
                                .parse::<usize>()
                                .ok()
                                .and_then(|index| shared_strings.get(index));
                            if let Some(shared) = resolved {
                                cell_value.clone_from(shared);
                            }
                        } else {
                            cell_value.push_str(&text);
                        }
                    } else if in_inline_text && !in_phonetic {
                        cell_value.push_str(&text);
                    }
                }
                Ok(Event::Eof) => break,
                Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        // Convert cells to table format
        Ok(self.cells_to_table(cells))
    }

    /// Converts an A1-style reference into zero-based `(column, row)` indices.
    ///
    /// The leading ASCII letters form the column in bijective base 26
    /// (`A` = 0, `Z` = 25, `AA` = 26) and the rest is the 1-based row number.
    /// A missing or unparsable part yields index 0. Only ASCII letters count as
    /// column letters and the arithmetic saturates, so malformed references
    /// cannot panic.
    fn parse_cell_reference(&self, cell_ref: &str) -> (usize, usize) {
        // ASCII letters are one byte each, so this count is a valid split point.
        let letters_len = cell_ref
            .bytes()
            .take_while(u8::is_ascii_alphabetic)
            .count();
        let (letters, digits) = cell_ref.split_at(letters_len);

        // Parse column letters
        let col = letters.bytes().fold(0usize, |acc, letter| {
            let digit = usize::from(letter.to_ascii_uppercase() - b'A') + 1;
            acc.saturating_mul(26).saturating_add(digit)
        });

        // Parse row number
        let row = digits.parse::<usize>().unwrap_or(0);

        (col.saturating_sub(1), row.saturating_sub(1))
    }

    /// Arranges `(row, col, value)` cells into a dense grid.
    ///
    /// Returns the text rendering (non-empty cells of each row joined by
    /// `" | "`, non-empty rows joined by newlines) and an [`ExtractedTable`]
    /// whose headers are the first grid row and whose rows are the remaining
    /// grid rows. The caption is left unset.
    ///
    /// NOTE(review): the grid is `(max_row + 1) x (max_col + 1)`, so one cell
    /// far from the origin makes the allocation large even for a nearly empty
    /// sheet — consider a size cap if untrusted workbooks are processed.
    fn cells_to_table(&self, cells: Vec<(usize, usize, String)>) -> (String, ExtractedTable) {
        let location = ContentLocation {
            page: Some(1),
            section: None,
            char_offset: None,
            line: None,
            column: None,
        };

        if cells.is_empty() {
            return (
                String::new(),
                ExtractedTable {
                    headers: Vec::new(),
                    rows: Vec::new(),
                    caption: None,
                    location,
                },
            );
        }

        // Find dimensions
        let max_row = cells.iter().map(|(r, _, _)| *r).max().unwrap_or(0);
        let max_col = cells.iter().map(|(_, c, _)| *c).max().unwrap_or(0);

        // Create grid; every coordinate is within the maxima computed above.
        let mut grid = vec![vec![String::new(); max_col + 1]; max_row + 1];
        for (row, col, value) in cells {
            grid[row][col] = value;
        }

        // Create text representation
        let text = grid
            .iter()
            .map(|row| {
                row.iter()
                    .filter(|cell| !cell.is_empty())
                    .map(String::as_str)
                    .collect::<Vec<_>>()
                    .join(" | ")
            })
            .filter(|row_text| !row_text.is_empty())
            .collect::<Vec<_>>()
            .join("\n");

        // Extract headers (first row) and data rows
        let mut grid_rows = grid.into_iter();
        let headers = grid_rows.next().unwrap_or_default();
        let rows: Vec<Vec<String>> = grid_rows.collect();

        let table = ExtractedTable {
            headers,
            rows,
            caption: None,
            location,
        };

        (text, table)
    }
}
663}