1use std::collections::HashMap;
2use std::io::Read;
3
4use flate2::read::DeflateDecoder;
5
6use crate::engine::{text_document_from_paragraphs, ExtractionEngine};
7use crate::error::{DonglerError, Result};
8use crate::ir::Document;
9use crate::source::Source;
10use crate::textual::html_to_text;
11
/// Extraction engine for ZIP-packaged XML office documents.
///
/// The type carries no state; all behaviour lives in its `ExtractionEngine`
/// implementation. It derives the common comparison and hashing traits so it
/// can be compared, deduplicated, or used as a map key by callers.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub struct OpenXmlEngine;
14
/// Central-directory metadata needed to locate and decode one archive member.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ZipEntry {
    /// Member path as stored in the archive, decoded lossily as UTF-8.
    name: String,
    /// Raw ZIP compression method code (the reader handles 0 = stored and 8 = deflate).
    compression_method: u16,
    /// Number of bytes the member's data occupies inside the archive.
    compressed_size: usize,
    /// Byte offset of the member's local file header from the start of the archive.
    local_header_offset: usize,
}
22
23impl ExtractionEngine for OpenXmlEngine {
24 fn name(&self) -> &'static str {
25 "openxml-native"
26 }
27
28 fn extract(&self, source: &Source) -> Result<Document> {
29 let bytes = source.bytes.as_deref().unwrap_or(source.content.as_bytes());
30 let files = read_zip_files(bytes)?;
31 let paragraphs = match source.format.as_str() {
32 "word" => extract_docx_paragraphs(&files)?,
33 "excel" => extract_xlsx_rows(&files)?,
34 "presentation" => extract_pptx_slide_text(&files)?,
35 "opendocument" => extract_opendocument_text(&files)?,
36 _ => Vec::new(),
37 };
38
39 text_document_from_paragraphs(source, self.name(), paragraphs, None)
40 }
41}
42
43fn extract_pptx_slide_text(files: &HashMap<String, String>) -> Result<Vec<String>> {
44 let mut slide_names = files
45 .keys()
46 .filter(|name| name.starts_with("ppt/slides/") && name.ends_with(".xml"))
47 .cloned()
48 .collect::<Vec<_>>();
49 slide_names.sort_by_key(|name| slide_sort_key(name));
50 if slide_names.is_empty() {
51 return Err(DonglerError::archive("PPTX missing ppt/slides/*.xml"));
52 }
53
54 let mut paragraphs = Vec::new();
55 for slide_name in slide_names {
56 let Some(slide) = files.get(&slide_name) else {
57 continue;
58 };
59 for paragraph_xml in tagged_ranges(slide, "a:p") {
60 let text = xml_text_contents(paragraph_xml, "a:t")
61 .into_iter()
62 .collect::<Vec<_>>()
63 .join("");
64 let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
65 if !text.is_empty() {
66 paragraphs.push(text);
67 }
68 }
69 }
70
71 Ok(paragraphs)
72}
73
/// Derives a numeric ordering key from the digits in a member's file name.
///
/// Only the component after the last `/` is inspected. All ASCII digits in it
/// are concatenated and parsed as one number; when there are no digits or the
/// number does not fit in `usize`, the key is `usize::MAX` so the member sorts
/// last.
fn slide_sort_key(name: &str) -> usize {
    let base = match name.rfind('/') {
        Some(separator) => &name[separator + 1..],
        None => name,
    };

    let mut digits = String::new();
    for ch in base.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
        }
    }

    match digits.parse::<usize>() {
        Ok(number) => number,
        Err(_) => usize::MAX,
    }
}
82
83pub(crate) fn read_zip_files(bytes: &[u8]) -> Result<HashMap<String, String>> {
84 let entries = read_zip_entries(bytes)?;
85 let mut files = HashMap::new();
86
87 for entry in entries {
88 let data = read_zip_entry(bytes, &entry)?;
89 let text = String::from_utf8_lossy(&data).into_owned();
90 files.insert(entry.name, text);
91 }
92
93 Ok(files)
94}
95
96fn read_zip_entries(bytes: &[u8]) -> Result<Vec<ZipEntry>> {
97 let eocd = find_eocd(bytes).ok_or_else(|| DonglerError::archive("missing ZIP directory"))?;
98 if eocd + 22 > bytes.len() {
99 return Err(DonglerError::archive("truncated ZIP directory"));
100 }
101
102 let entry_count = read_u16_le(bytes, eocd + 10)? as usize;
103 let central_size = read_u32_le(bytes, eocd + 12)? as usize;
104 let central_offset = read_u32_le(bytes, eocd + 16)? as usize;
105 if central_offset + central_size > bytes.len() {
106 return Err(DonglerError::archive("ZIP directory exceeds file size"));
107 }
108
109 let mut entries = Vec::with_capacity(entry_count);
110 let mut pos = central_offset;
111 for _ in 0..entry_count {
112 if pos + 46 > bytes.len() || read_u32_le(bytes, pos)? != 0x0201_4b50 {
113 return Err(DonglerError::archive("malformed ZIP central header"));
114 }
115
116 let compression_method = read_u16_le(bytes, pos + 10)?;
117 let compressed_size = read_u32_le(bytes, pos + 20)? as usize;
118 let name_len = read_u16_le(bytes, pos + 28)? as usize;
119 let extra_len = read_u16_le(bytes, pos + 30)? as usize;
120 let comment_len = read_u16_le(bytes, pos + 32)? as usize;
121 let local_header_offset = read_u32_le(bytes, pos + 42)? as usize;
122 let name_start = pos + 46;
123 let name_end = name_start + name_len;
124 if name_end > bytes.len() {
125 return Err(DonglerError::archive("truncated ZIP entry name"));
126 }
127
128 entries.push(ZipEntry {
129 name: String::from_utf8_lossy(&bytes[name_start..name_end]).into_owned(),
130 compression_method,
131 compressed_size,
132 local_header_offset,
133 });
134 pos = name_end + extra_len + comment_len;
135 }
136
137 Ok(entries)
138}
139
140fn read_zip_entry(bytes: &[u8], entry: &ZipEntry) -> Result<Vec<u8>> {
141 let pos = entry.local_header_offset;
142 if pos + 30 > bytes.len() || read_u32_le(bytes, pos)? != 0x0403_4b50 {
143 return Err(DonglerError::archive("malformed ZIP local header"));
144 }
145
146 let name_len = read_u16_le(bytes, pos + 26)? as usize;
147 let extra_len = read_u16_le(bytes, pos + 28)? as usize;
148 let data_start = pos + 30 + name_len + extra_len;
149 let data_end = data_start + entry.compressed_size;
150 if data_end > bytes.len() {
151 return Err(DonglerError::archive("truncated ZIP entry data"));
152 }
153 let data = &bytes[data_start..data_end];
154
155 match entry.compression_method {
156 0 => Ok(data.to_vec()),
157 8 => {
158 let mut decoder = DeflateDecoder::new(data);
159 let mut decoded = Vec::new();
160 decoder
161 .read_to_end(&mut decoded)
162 .map_err(|error| DonglerError::archive(format!("Deflate failed: {error}")))?;
163 Ok(decoded)
164 }
165 method => Err(DonglerError::archive(format!(
166 "unsupported ZIP compression method {method}"
167 ))),
168 }
169}
170
171fn extract_docx_paragraphs(files: &HashMap<String, String>) -> Result<Vec<String>> {
172 let document = files
173 .get("word/document.xml")
174 .ok_or_else(|| DonglerError::archive("DOCX missing word/document.xml"))?;
175 let mut paragraphs = Vec::new();
176
177 for paragraph_xml in tagged_ranges(document, "w:p") {
178 let mut text = xml_text_contents(paragraph_xml, "w:t").join("");
179 if text.is_empty() {
180 text = xml_text_contents(paragraph_xml, "t").join("");
181 }
182 let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
183 if !text.is_empty() {
184 paragraphs.push(text);
185 }
186 }
187
188 Ok(paragraphs)
189}
190
191fn extract_xlsx_rows(files: &HashMap<String, String>) -> Result<Vec<String>> {
192 let shared_strings = files
193 .get("xl/sharedStrings.xml")
194 .map(|xml| {
195 tagged_ranges(xml, "si")
196 .into_iter()
197 .map(|item| {
198 let text = xml_text_contents(item, "t").join("");
199 text.split_whitespace().collect::<Vec<_>>().join(" ")
200 })
201 .collect::<Vec<_>>()
202 })
203 .unwrap_or_default();
204 let mut rows = Vec::new();
205
206 let mut sheet_names = files
207 .keys()
208 .filter(|name| name.starts_with("xl/worksheets/") && name.ends_with(".xml"))
209 .cloned()
210 .collect::<Vec<_>>();
211 sheet_names.sort();
212
213 for sheet_name in sheet_names {
214 let Some(sheet) = files.get(&sheet_name) else {
215 continue;
216 };
217 for row_xml in tagged_ranges(sheet, "row") {
218 let cells = tagged_elements(row_xml, "c")
219 .into_iter()
220 .filter_map(|(tag, cell)| xlsx_cell_text(tag, cell, &shared_strings))
221 .collect::<Vec<_>>();
222 if !cells.is_empty() {
223 rows.push(cells.join(" "));
224 }
225 }
226 }
227
228 Ok(rows)
229}
230
231fn xlsx_cell_text(cell_tag: &str, cell_xml: &str, shared_strings: &[String]) -> Option<String> {
232 let value = xml_text_contents(cell_xml, "v").into_iter().next()?;
233 if cell_tag.contains("t=\"s\"") || cell_tag.contains("t='s'") {
234 let index = value.trim().parse::<usize>().ok()?;
235 shared_strings.get(index).cloned()
236 } else {
237 Some(value.trim().to_owned())
238 }
239 .filter(|text| !text.is_empty())
240}
241
242fn extract_opendocument_text(files: &HashMap<String, String>) -> Result<Vec<String>> {
243 let content = files
244 .get("content.xml")
245 .ok_or_else(|| DonglerError::archive("OpenDocument missing content.xml"))?;
246
247 let rows = extract_opendocument_rows(content);
248 if !rows.is_empty() {
249 return Ok(rows);
250 }
251
252 Ok(extract_opendocument_paragraphs(content))
253}
254
255fn extract_opendocument_rows(content: &str) -> Vec<String> {
256 tagged_ranges(content, "table:table-row")
257 .into_iter()
258 .filter_map(|row_xml| {
259 let cells = tagged_ranges(row_xml, "table:table-cell")
260 .into_iter()
261 .filter_map(|cell_xml| {
262 let paragraphs = tagged_ranges(cell_xml, "text:p")
263 .into_iter()
264 .filter_map(clean_xml_text)
265 .collect::<Vec<_>>();
266 (!paragraphs.is_empty()).then(|| paragraphs.join(" "))
267 })
268 .collect::<Vec<_>>();
269 (!cells.is_empty()).then(|| cells.join(" "))
270 })
271 .collect()
272}
273
274fn extract_opendocument_paragraphs(content: &str) -> Vec<String> {
275 tagged_ranges(content, "text:p")
276 .into_iter()
277 .filter_map(clean_xml_text)
278 .collect()
279}
280
281fn clean_xml_text(xml: &str) -> Option<String> {
282 let text = html_to_text(&xml_unescape(xml))
283 .split_whitespace()
284 .collect::<Vec<_>>()
285 .join(" ");
286 (!text.is_empty()).then_some(text)
287}
288
/// Collects the inner content of every `tag` element in `xml`, in document order.
///
/// This is a plain string scan, not an XML parser:
///
/// * An opening tag only counts when the name is followed by `>`, `/` or XML
///   whitespace, so searching for `w:t` does not match `<w:tab/>` or `<w:tbl>`.
/// * A self-closing element (`<tag …/>`) contributes an empty slice rather
///   than swallowing the content of the next sibling.
/// * Elements are not tracked for nesting; content ends at the first closing
///   tag that follows.
///
/// Scanning stops at the first opening tag that is never terminated or never
/// closed; everything found before it is returned.
fn tagged_ranges<'a>(xml: &'a str, tag: &str) -> Vec<&'a str> {
    let open_prefix = format!("<{tag}");
    let close = format!("</{tag}>");
    let mut ranges = Vec::new();
    let mut pos = 0;

    while let Some(relative_start) = xml[pos..].find(&open_prefix) {
        let start = pos + relative_start;
        let name_end = start + open_prefix.len();
        // A longer element name that merely starts with `tag` is not a match.
        let at_name_boundary = matches!(
            xml.as_bytes().get(name_end),
            Some(b'>' | b'/' | b' ' | b'\t' | b'\r' | b'\n')
        );
        if !at_name_boundary {
            pos = name_end;
            continue;
        }

        let Some(open_end) = xml[start..].find('>') else {
            break;
        };
        let content_start = start + open_end + 1;
        if xml[start..content_start].ends_with("/>") {
            // Self-closing: there is no content and no closing tag to look for.
            ranges.push(&xml[content_start..content_start]);
            pos = content_start;
            continue;
        }

        let Some(relative_end) = xml[content_start..].find(&close) else {
            break;
        };
        let content_end = content_start + relative_end;
        ranges.push(&xml[content_start..content_end]);
        pos = content_end + close.len();
    }

    ranges
}
311
/// Collects `(opening tag, inner content)` pairs for every `tag` element in `xml`.
///
/// The opening tag slice runs from `<` through the terminating `>`, so callers
/// can inspect attributes. The scan follows these rules:
///
/// * An opening tag only counts when the name is followed by `>`, `/` or XML
///   whitespace, so searching for `c` does not match `<cols>`.
/// * A self-closing element (`<tag …/>`) is reported with empty content
///   instead of absorbing the next sibling's content.
/// * Elements are not tracked for nesting; content ends at the first closing
///   tag that follows.
///
/// Scanning stops at the first opening tag that is never terminated or never
/// closed; everything found before it is returned.
fn tagged_elements<'a>(xml: &'a str, tag: &str) -> Vec<(&'a str, &'a str)> {
    let open_prefix = format!("<{tag}");
    let close = format!("</{tag}>");
    let mut elements = Vec::new();
    let mut pos = 0;

    while let Some(relative_start) = xml[pos..].find(&open_prefix) {
        let start = pos + relative_start;
        let name_end = start + open_prefix.len();
        // A longer element name that merely starts with `tag` is not a match.
        let at_name_boundary = matches!(
            xml.as_bytes().get(name_end),
            Some(b'>' | b'/' | b' ' | b'\t' | b'\r' | b'\n')
        );
        if !at_name_boundary {
            pos = name_end;
            continue;
        }

        let Some(open_end) = xml[start..].find('>') else {
            break;
        };
        let content_start = start + open_end + 1;
        let open_tag = &xml[start..content_start];
        if open_tag.ends_with("/>") {
            // Self-closing: keep the tag for its attributes, with no content.
            elements.push((open_tag, &xml[content_start..content_start]));
            pos = content_start;
            continue;
        }

        let Some(relative_end) = xml[content_start..].find(&close) else {
            break;
        };
        let content_end = content_start + relative_end;
        elements.push((open_tag, &xml[content_start..content_end]));
        pos = content_end + close.len();
    }

    elements
}
334
335fn xml_text_contents(xml: &str, tag: &str) -> Vec<String> {
336 tagged_ranges(xml, tag)
337 .into_iter()
338 .map(xml_unescape)
339 .collect()
340}
341
/// Replaces XML entity and character references in `text` with their characters.
///
/// Handles the five predefined entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
/// `&apos;`) as well as decimal (`&#65;`) and hexadecimal (`&#x41;`) character
/// references. The input is decoded in a single left-to-right pass, so text
/// produced by one replacement is never decoded again: `&amp;lt;` becomes the
/// literal `&lt;`, not `<`. Anything that is not a recognised reference,
/// including a bare `&`, is copied through unchanged.
fn xml_unescape(text: &str) -> String {
    /// Maps the text between `&` and `;` to the character it denotes, if any.
    fn decode_entity(name: &str) -> Option<char> {
        match name {
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            _ => {
                let reference = name.strip_prefix('#')?;
                let (digits, radix) = match reference.strip_prefix('x') {
                    Some(hex) => (hex, 16u32),
                    None => (reference, 10u32),
                };
                // `from_str_radix` tolerates a leading sign; a reference must not.
                let well_formed =
                    !digits.is_empty() && digits.bytes().all(|b| (b as char).is_digit(radix));
                if !well_formed {
                    return None;
                }
                u32::from_str_radix(digits, radix)
                    .ok()
                    .and_then(char::from_u32)
            }
        }
    }

    // Bound the search for the closing `;` so a stray `&` in long text does
    // not trigger a scan of everything that follows it.
    const MAX_REFERENCE_LEN: usize = 12;

    let mut out = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after_amp = &rest[amp + 1..];
        let decoded = after_amp
            .bytes()
            .take(MAX_REFERENCE_LEN + 1)
            .position(|b| b == b';')
            .and_then(|semi| decode_entity(&after_amp[..semi]).map(|ch| (ch, semi + 1)));
        match decoded {
            Some((ch, consumed)) => {
                out.push(ch);
                rest = &after_amp[consumed..];
            }
            None => {
                out.push('&');
                rest = after_amp;
            }
        }
    }
    out.push_str(rest);
    out
}
349
/// Locates the end-of-central-directory signature, searching from the end.
///
/// Candidate positions range from `len - 22` (the record's minimum size) back
/// to the earliest position a maximum-length 65,535-byte trailing comment
/// would allow. The last matching position in that window is returned. For
/// inputs shorter than 22 bytes only position 0 is examined.
fn find_eocd(bytes: &[u8]) -> Option<usize> {
    const SIGNATURE: [u8; 4] = [0x50, 0x4b, 0x05, 0x06];

    let len = bytes.len();
    let last = len.saturating_sub(22);
    let first = len.saturating_sub(22 + len.min(65_535));

    // Restrict the slice so the final 4-byte window starts exactly at `last`.
    let window_end = (last + 4).min(len);
    bytes
        .get(first..window_end)?
        .windows(4)
        .rposition(|window| window == SIGNATURE.as_slice())
        .map(|offset| first + offset)
}
357
358fn read_u16_le(bytes: &[u8], pos: usize) -> Result<u16> {
359 let end = pos + 2;
360 let slice = bytes
361 .get(pos..end)
362 .ok_or_else(|| DonglerError::archive("unexpected end of ZIP data"))?;
363 Ok(u16::from_le_bytes([slice[0], slice[1]]))
364}
365
366fn read_u32_le(bytes: &[u8], pos: usize) -> Result<u32> {
367 let end = pos + 4;
368 let slice = bytes
369 .get(pos..end)
370 .ok_or_else(|| DonglerError::archive("unexpected end of ZIP data"))?;
371 Ok(u32::from_le_bytes([slice[0], slice[1], slice[2], slice[3]]))
372}