memvid_core/reader/
pptx.rs1use std::io::{Cursor, Read};
2
3use quick_xml::Reader as XmlReader;
4use quick_xml::events::Event;
5use zip::ZipArchive;
6
7use crate::{
8 DocumentFormat, DocumentReader, PassthroughReader, ReaderDiagnostics, ReaderHint, ReaderOutput,
9 Result,
10};
11
/// Archive path prefix of a slide part; a full part name looks like
/// `ppt/slides/slide<N>.xml`.
const SLIDE_PREFIX: &str = "ppt/slides/slide";
/// Extension that terminates a slide part name.
const SLIDE_SUFFIX: &str = ".xml";

/// Reader that pulls the visible text out of PowerPoint (`.pptx`) archives.
///
/// The type carries no state; it only serves as the implementor of the
/// reader trait and as a namespace for the extraction helper.
#[derive(Debug, Clone, Copy, Default)]
pub struct PptxReader;
16
17impl PptxReader {
18 fn extract_text(bytes: &[u8]) -> Result<String> {
19 let cursor = Cursor::new(bytes);
20 let mut archive =
21 ZipArchive::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
22 reason: format!("failed to open pptx archive: {err}").into(),
23 })?;
24
25 let mut slides: Vec<String> = Vec::new();
26 for i in 1..=archive.len() {
27 let name = format!("{}{}{}", SLIDE_PREFIX, i, SLIDE_SUFFIX);
28 if let Ok(mut file) = archive.by_name(&name) {
29 let mut xml = String::new();
30 file.read_to_string(&mut xml).map_err(|err| {
31 crate::MemvidError::ExtractionFailed {
32 reason: format!("failed to read {name}: {err}").into(),
33 }
34 })?;
35 slides.push(xml);
36 }
37 }
38
39 if slides.is_empty() {
40 return Ok(String::new());
41 }
42
43 let mut out = String::new();
44 for (idx, xml) in slides.iter().enumerate() {
45 if idx > 0 {
46 out.push_str("\n\n");
47 }
48 out.push_str(&format!("Slide {}:\n", idx + 1));
49 out.push_str(&extract_plain_text(xml, b"p"));
50 }
51
52 Ok(out.trim().to_string())
53 }
54}
55
56impl DocumentReader for PptxReader {
57 fn name(&self) -> &'static str {
58 "pptx"
59 }
60
61 fn supports(&self, hint: &ReaderHint<'_>) -> bool {
62 matches!(hint.format, Some(DocumentFormat::Pptx))
63 || hint
64 .mime
65 .map(|mime| {
66 mime.eq_ignore_ascii_case(
67 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
68 )
69 })
70 .unwrap_or(false)
71 }
72
73 fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
74 match Self::extract_text(bytes) {
75 Ok(text) => {
76 if text.trim().is_empty() {
77 let mut fallback = PassthroughReader.extract(bytes, hint)?;
78 fallback.reader_name = self.name().to_string();
79 fallback.diagnostics.mark_fallback();
80 fallback.diagnostics.record_warning(
81 "pptx reader produced empty text; falling back to default extractor",
82 );
83 Ok(fallback)
84 } else {
85 let mut base = PassthroughReader.extract(bytes, hint)?;
86 base.reader_name = self.name().to_string();
87 base.document.text = Some(text);
88 base.document.mime_type = Some(
89 "application/vnd.openxmlformats-officedocument.presentationml.presentation"
90 .to_string(),
91 );
92 base.diagnostics = ReaderDiagnostics::default();
93 Ok(base)
94 }
95 }
96 Err(err) => {
97 let mut fallback = PassthroughReader.extract(bytes, hint)?;
98 fallback.reader_name = self.name().to_string();
99 fallback.diagnostics.mark_fallback();
100 fallback
101 .diagnostics
102 .record_warning(format!("pptx reader error: {err}"));
103 Ok(fallback)
104 }
105 }
106 }
107}
108
109fn extract_plain_text(xml: &str, block_suffix: &[u8]) -> String {
110 let mut reader = XmlReader::from_str(xml);
111 reader.trim_text(true);
112 let mut buf = Vec::new();
113 let mut text = String::new();
114 let mut first_block = true;
115
116 loop {
117 match reader.read_event_into(&mut buf) {
118 Ok(Event::Start(e)) => {
119 if e.name().as_ref().ends_with(block_suffix) {
120 if !first_block {
121 text.push('\n');
122 }
123 first_block = false;
124 }
125 }
126 Ok(Event::Text(t)) => {
127 if let Ok(content) = t.unescape() {
128 if !content.trim().is_empty() {
129 text.push_str(content.trim());
130 text.push(' ');
131 }
132 }
133 }
134 Ok(Event::Eof) => break,
135 Err(_) => break,
136 _ => (),
137 }
138 buf.clear();
139 }
140
141 text.trim().to_string()
142}