memvid_core/reader/
pptx.rs

1use std::io::{Cursor, Read};
2
3use quick_xml::Reader as XmlReader;
4use quick_xml::events::Event;
5use zip::ZipArchive;
6
7use crate::{
8    DocumentFormat, DocumentReader, PassthroughReader, ReaderDiagnostics, ReaderHint, ReaderOutput,
9    Result,
10};
11
/// Leading portion of a slide part's path inside the archive; the slide
/// number is appended directly after it.
const SLIDE_PREFIX: &str = "ppt/slides/slide";
/// Trailing portion of a slide part's path, following the slide number.
const SLIDE_SUFFIX: &str = ".xml";
14
/// Stateless [`DocumentReader`] that pulls slide text out of PPTX archives.
///
/// The type carries no data, so it is cheap to copy and can be created with
/// either the unit literal `PptxReader` or `PptxReader::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct PptxReader;
16
17impl PptxReader {
18    fn extract_text(bytes: &[u8]) -> Result<String> {
19        let cursor = Cursor::new(bytes);
20        let mut archive =
21            ZipArchive::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
22                reason: format!("failed to open pptx archive: {err}").into(),
23            })?;
24
25        let mut slides: Vec<String> = Vec::new();
26        for i in 1..=archive.len() {
27            let name = format!("{}{}{}", SLIDE_PREFIX, i, SLIDE_SUFFIX);
28            if let Ok(mut file) = archive.by_name(&name) {
29                let mut xml = String::new();
30                file.read_to_string(&mut xml).map_err(|err| {
31                    crate::MemvidError::ExtractionFailed {
32                        reason: format!("failed to read {name}: {err}").into(),
33                    }
34                })?;
35                slides.push(xml);
36            }
37        }
38
39        if slides.is_empty() {
40            return Ok(String::new());
41        }
42
43        let mut out = String::new();
44        for (idx, xml) in slides.iter().enumerate() {
45            if idx > 0 {
46                out.push_str("\n\n");
47            }
48            out.push_str(&format!("Slide {}:\n", idx + 1));
49            out.push_str(&extract_plain_text(xml, b"p"));
50        }
51
52        Ok(out.trim().to_string())
53    }
54}
55
56impl DocumentReader for PptxReader {
57    fn name(&self) -> &'static str {
58        "pptx"
59    }
60
61    fn supports(&self, hint: &ReaderHint<'_>) -> bool {
62        matches!(hint.format, Some(DocumentFormat::Pptx))
63            || hint
64                .mime
65                .map(|mime| {
66                    mime.eq_ignore_ascii_case(
67                        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
68                    )
69                })
70                .unwrap_or(false)
71    }
72
73    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
74        match Self::extract_text(bytes) {
75            Ok(text) => {
76                if text.trim().is_empty() {
77                    // quick-xml returned empty - try extractous as fallback
78                    let mut fallback = PassthroughReader.extract(bytes, hint)?;
79                    fallback.reader_name = self.name().to_string();
80                    fallback.diagnostics.mark_fallback();
81                    fallback.diagnostics.record_warning(
82                        "pptx reader produced empty text; falling back to default extractor",
83                    );
84                    Ok(fallback)
85                } else {
86                    // quick-xml succeeded - build output directly WITHOUT calling extractous
87                    let mut document = crate::ExtractedDocument::empty();
88                    document.text = Some(text);
89                    document.mime_type = Some(
90                        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
91                            .to_string(),
92                    );
93                    Ok(ReaderOutput::new(document, self.name())
94                        .with_diagnostics(ReaderDiagnostics::default()))
95                }
96            }
97            Err(err) => {
98                // quick-xml failed - try extractous as fallback
99                let mut fallback = PassthroughReader.extract(bytes, hint)?;
100                fallback.reader_name = self.name().to_string();
101                fallback.diagnostics.mark_fallback();
102                fallback
103                    .diagnostics
104                    .record_warning(format!("pptx reader error: {err}"));
105                Ok(fallback)
106            }
107        }
108    }
109}
110
111fn extract_plain_text(xml: &str, block_suffix: &[u8]) -> String {
112    let mut reader = XmlReader::from_str(xml);
113    reader.trim_text(true);
114    let mut buf = Vec::new();
115    let mut text = String::new();
116    let mut first_block = true;
117
118    loop {
119        match reader.read_event_into(&mut buf) {
120            Ok(Event::Start(e)) => {
121                if e.name().as_ref().ends_with(block_suffix) {
122                    if !first_block {
123                        text.push('\n');
124                    }
125                    first_block = false;
126                }
127            }
128            Ok(Event::Text(t)) => {
129                if let Ok(content) = t.unescape() {
130                    if !content.trim().is_empty() {
131                        text.push_str(content.trim());
132                        text.push(' ');
133                    }
134                }
135            }
136            Ok(Event::Eof) => break,
137            Err(_) => break,
138            _ => (),
139        }
140        buf.clear();
141    }
142
143    text.trim().to_string()
144}