Skip to main content

dongler_core/
openxml.rs

1use std::collections::HashMap;
2use std::io::Read;
3
4use flate2::read::DeflateDecoder;
5
6use crate::engine::{text_document_from_paragraphs, ExtractionEngine};
7use crate::error::{DonglerError, Result};
8use crate::ir::Document;
9use crate::source::Source;
10use crate::textual::html_to_text;
11
/// Stateless extraction engine for ZIP-packaged office documents.
///
/// Its [`ExtractionEngine`] implementation dispatches on the source format
/// (`"word"`, `"excel"`, `"presentation"`, `"opendocument"`).
#[derive(Clone, Copy, Debug, Default)]
pub struct OpenXmlEngine;
14
/// One record of the ZIP central directory, reduced to the fields needed to
/// locate and decompress the entry's data.
#[derive(Debug)]
struct ZipEntry {
    /// Entry path inside the archive (decoded lossily as UTF-8).
    name: String,
    /// Raw compression method id; `0` (stored) and `8` (deflate) are the
    /// values the reader knows how to handle.
    compression_method: u16,
    /// Number of bytes the entry occupies in the archive.
    compressed_size: usize,
    /// Byte offset of the entry's local file header from the start of the archive.
    local_header_offset: usize,
}
22
23impl ExtractionEngine for OpenXmlEngine {
24    fn name(&self) -> &'static str {
25        "openxml-native"
26    }
27
28    fn extract(&self, source: &Source) -> Result<Document> {
29        let bytes = source.bytes.as_deref().unwrap_or(source.content.as_bytes());
30        let files = read_zip_files(bytes)?;
31        let paragraphs = match source.format.as_str() {
32            "word" => extract_docx_paragraphs(&files)?,
33            "excel" => extract_xlsx_rows(&files)?,
34            "presentation" => extract_pptx_slide_text(&files)?,
35            "opendocument" => extract_opendocument_text(&files)?,
36            _ => Vec::new(),
37        };
38
39        text_document_from_paragraphs(source, self.name(), paragraphs, None)
40    }
41}
42
43fn extract_pptx_slide_text(files: &HashMap<String, String>) -> Result<Vec<String>> {
44    let mut slide_names = files
45        .keys()
46        .filter(|name| name.starts_with("ppt/slides/") && name.ends_with(".xml"))
47        .cloned()
48        .collect::<Vec<_>>();
49    slide_names.sort_by_key(|name| slide_sort_key(name));
50    if slide_names.is_empty() {
51        return Err(DonglerError::archive("PPTX missing ppt/slides/*.xml"));
52    }
53
54    let mut paragraphs = Vec::new();
55    for slide_name in slide_names {
56        let Some(slide) = files.get(&slide_name) else {
57            continue;
58        };
59        for paragraph_xml in tagged_ranges(slide, "a:p") {
60            let text = xml_text_contents(paragraph_xml, "a:t")
61                .into_iter()
62                .collect::<Vec<_>>()
63                .join("");
64            let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
65            if !text.is_empty() {
66                paragraphs.push(text);
67            }
68        }
69    }
70
71    Ok(paragraphs)
72}
73
/// Numeric ordering key for an archive entry such as `ppt/slides/slide12.xml`.
///
/// All ASCII digits of the final path segment are concatenated and parsed as
/// one number. Names without digits (or whose digits do not fit in `usize`)
/// map to `usize::MAX`, so they sort last.
fn slide_sort_key(name: &str) -> usize {
    let file_name = match name.rfind('/') {
        Some(slash) => &name[slash + 1..],
        None => name,
    };

    let mut digits = String::new();
    for ch in file_name.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
        }
    }

    match digits.parse::<usize>() {
        Ok(number) => number,
        Err(_) => usize::MAX,
    }
}
82
83pub(crate) fn read_zip_files(bytes: &[u8]) -> Result<HashMap<String, String>> {
84    let entries = read_zip_entries(bytes)?;
85    let mut files = HashMap::new();
86
87    for entry in entries {
88        let data = read_zip_entry(bytes, &entry)?;
89        let text = String::from_utf8_lossy(&data).into_owned();
90        files.insert(entry.name, text);
91    }
92
93    Ok(files)
94}
95
96fn read_zip_entries(bytes: &[u8]) -> Result<Vec<ZipEntry>> {
97    let eocd = find_eocd(bytes).ok_or_else(|| DonglerError::archive("missing ZIP directory"))?;
98    if eocd + 22 > bytes.len() {
99        return Err(DonglerError::archive("truncated ZIP directory"));
100    }
101
102    let entry_count = read_u16_le(bytes, eocd + 10)? as usize;
103    let central_size = read_u32_le(bytes, eocd + 12)? as usize;
104    let central_offset = read_u32_le(bytes, eocd + 16)? as usize;
105    if central_offset + central_size > bytes.len() {
106        return Err(DonglerError::archive("ZIP directory exceeds file size"));
107    }
108
109    let mut entries = Vec::with_capacity(entry_count);
110    let mut pos = central_offset;
111    for _ in 0..entry_count {
112        if pos + 46 > bytes.len() || read_u32_le(bytes, pos)? != 0x0201_4b50 {
113            return Err(DonglerError::archive("malformed ZIP central header"));
114        }
115
116        let compression_method = read_u16_le(bytes, pos + 10)?;
117        let compressed_size = read_u32_le(bytes, pos + 20)? as usize;
118        let name_len = read_u16_le(bytes, pos + 28)? as usize;
119        let extra_len = read_u16_le(bytes, pos + 30)? as usize;
120        let comment_len = read_u16_le(bytes, pos + 32)? as usize;
121        let local_header_offset = read_u32_le(bytes, pos + 42)? as usize;
122        let name_start = pos + 46;
123        let name_end = name_start + name_len;
124        if name_end > bytes.len() {
125            return Err(DonglerError::archive("truncated ZIP entry name"));
126        }
127
128        entries.push(ZipEntry {
129            name: String::from_utf8_lossy(&bytes[name_start..name_end]).into_owned(),
130            compression_method,
131            compressed_size,
132            local_header_offset,
133        });
134        pos = name_end + extra_len + comment_len;
135    }
136
137    Ok(entries)
138}
139
140fn read_zip_entry(bytes: &[u8], entry: &ZipEntry) -> Result<Vec<u8>> {
141    let pos = entry.local_header_offset;
142    if pos + 30 > bytes.len() || read_u32_le(bytes, pos)? != 0x0403_4b50 {
143        return Err(DonglerError::archive("malformed ZIP local header"));
144    }
145
146    let name_len = read_u16_le(bytes, pos + 26)? as usize;
147    let extra_len = read_u16_le(bytes, pos + 28)? as usize;
148    let data_start = pos + 30 + name_len + extra_len;
149    let data_end = data_start + entry.compressed_size;
150    if data_end > bytes.len() {
151        return Err(DonglerError::archive("truncated ZIP entry data"));
152    }
153    let data = &bytes[data_start..data_end];
154
155    match entry.compression_method {
156        0 => Ok(data.to_vec()),
157        8 => {
158            let mut decoder = DeflateDecoder::new(data);
159            let mut decoded = Vec::new();
160            decoder
161                .read_to_end(&mut decoded)
162                .map_err(|error| DonglerError::archive(format!("Deflate failed: {error}")))?;
163            Ok(decoded)
164        }
165        method => Err(DonglerError::archive(format!(
166            "unsupported ZIP compression method {method}"
167        ))),
168    }
169}
170
171fn extract_docx_paragraphs(files: &HashMap<String, String>) -> Result<Vec<String>> {
172    let document = files
173        .get("word/document.xml")
174        .ok_or_else(|| DonglerError::archive("DOCX missing word/document.xml"))?;
175    let mut paragraphs = Vec::new();
176
177    for paragraph_xml in tagged_ranges(document, "w:p") {
178        let mut text = xml_text_contents(paragraph_xml, "w:t").join("");
179        if text.is_empty() {
180            text = xml_text_contents(paragraph_xml, "t").join("");
181        }
182        let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
183        if !text.is_empty() {
184            paragraphs.push(text);
185        }
186    }
187
188    Ok(paragraphs)
189}
190
191fn extract_xlsx_rows(files: &HashMap<String, String>) -> Result<Vec<String>> {
192    let shared_strings = files
193        .get("xl/sharedStrings.xml")
194        .map(|xml| {
195            tagged_ranges(xml, "si")
196                .into_iter()
197                .map(|item| {
198                    let text = xml_text_contents(item, "t").join("");
199                    text.split_whitespace().collect::<Vec<_>>().join(" ")
200                })
201                .collect::<Vec<_>>()
202        })
203        .unwrap_or_default();
204    let mut rows = Vec::new();
205
206    let mut sheet_names = files
207        .keys()
208        .filter(|name| name.starts_with("xl/worksheets/") && name.ends_with(".xml"))
209        .cloned()
210        .collect::<Vec<_>>();
211    sheet_names.sort();
212
213    for sheet_name in sheet_names {
214        let Some(sheet) = files.get(&sheet_name) else {
215            continue;
216        };
217        for row_xml in tagged_ranges(sheet, "row") {
218            let cells = tagged_elements(row_xml, "c")
219                .into_iter()
220                .filter_map(|(tag, cell)| xlsx_cell_text(tag, cell, &shared_strings))
221                .collect::<Vec<_>>();
222            if !cells.is_empty() {
223                rows.push(cells.join(" "));
224            }
225        }
226    }
227
228    Ok(rows)
229}
230
231fn xlsx_cell_text(cell_tag: &str, cell_xml: &str, shared_strings: &[String]) -> Option<String> {
232    let value = xml_text_contents(cell_xml, "v").into_iter().next()?;
233    if cell_tag.contains("t=\"s\"") || cell_tag.contains("t='s'") {
234        let index = value.trim().parse::<usize>().ok()?;
235        shared_strings.get(index).cloned()
236    } else {
237        Some(value.trim().to_owned())
238    }
239    .filter(|text| !text.is_empty())
240}
241
242fn extract_opendocument_text(files: &HashMap<String, String>) -> Result<Vec<String>> {
243    let content = files
244        .get("content.xml")
245        .ok_or_else(|| DonglerError::archive("OpenDocument missing content.xml"))?;
246
247    let rows = extract_opendocument_rows(content);
248    if !rows.is_empty() {
249        return Ok(rows);
250    }
251
252    Ok(extract_opendocument_paragraphs(content))
253}
254
255fn extract_opendocument_rows(content: &str) -> Vec<String> {
256    tagged_ranges(content, "table:table-row")
257        .into_iter()
258        .filter_map(|row_xml| {
259            let cells = tagged_ranges(row_xml, "table:table-cell")
260                .into_iter()
261                .filter_map(|cell_xml| {
262                    let paragraphs = tagged_ranges(cell_xml, "text:p")
263                        .into_iter()
264                        .filter_map(clean_xml_text)
265                        .collect::<Vec<_>>();
266                    (!paragraphs.is_empty()).then(|| paragraphs.join(" "))
267                })
268                .collect::<Vec<_>>();
269            (!cells.is_empty()).then(|| cells.join(" "))
270        })
271        .collect()
272}
273
274fn extract_opendocument_paragraphs(content: &str) -> Vec<String> {
275    tagged_ranges(content, "text:p")
276        .into_iter()
277        .filter_map(clean_xml_text)
278        .collect()
279}
280
281fn clean_xml_text(xml: &str) -> Option<String> {
282    let text = html_to_text(&xml_unescape(xml))
283        .split_whitespace()
284        .collect::<Vec<_>>()
285        .join(" ");
286    (!text.is_empty()).then_some(text)
287}
288
/// Returns the inner XML of every `tag` element found by a forward scan of `xml`.
///
/// Only elements whose name is exactly `tag` are matched: the name must be
/// followed by whitespace, `/` or `>`, so asking for `w:t` does not match
/// `<w:tab/>` or `<w:tbl>`. A self-closing element (`<tag/>`) yields an empty
/// slice rather than swallowing the markup that follows it.
///
/// This is a lightweight scanner, not an XML parser: an element ends at the
/// first matching close tag, so same-named nested elements are not balanced.
/// Scanning stops at the first element whose opening tag or close tag is
/// missing.
fn tagged_ranges<'a>(xml: &'a str, tag: &str) -> Vec<&'a str> {
    let open_prefix = format!("<{tag}");
    let close = format!("</{tag}>");
    let mut ranges = Vec::new();
    let mut pos = 0;

    while let Some(relative_start) = xml[pos..].find(&open_prefix) {
        let name_end = pos + relative_start + open_prefix.len();
        // Reject longer names that merely start with `tag`.
        let is_exact_name = matches!(
            xml[name_end..].chars().next(),
            Some(next) if next == '>' || next == '/' || next.is_whitespace()
        );
        if !is_exact_name {
            pos = name_end;
            continue;
        }

        let Some(open_end) = xml[name_end..].find('>') else {
            break;
        };
        let content_start = name_end + open_end + 1;
        if xml[name_end..content_start].ends_with("/>") {
            // Empty element: it has no content and no separate close tag.
            ranges.push(&xml[content_start..content_start]);
            pos = content_start;
            continue;
        }

        let Some(relative_end) = xml[content_start..].find(&close) else {
            break;
        };
        let content_end = content_start + relative_end;
        ranges.push(&xml[content_start..content_end]);
        pos = content_end + close.len();
    }

    ranges
}
311
/// Returns `(opening tag, inner XML)` for every `tag` element found by a
/// forward scan of `xml`.
///
/// The opening-tag slice includes the angle brackets and attributes, so
/// callers can inspect attributes. Only elements whose name is exactly `tag`
/// are matched (the name must be followed by whitespace, `/` or `>`), and a
/// self-closing element (`<tag .../>`) is reported with empty inner XML
/// instead of being paired with the content of the next element.
///
/// This is a lightweight scanner, not an XML parser: an element ends at the
/// first matching close tag, so same-named nested elements are not balanced.
/// Scanning stops at the first element whose opening tag or close tag is
/// missing.
fn tagged_elements<'a>(xml: &'a str, tag: &str) -> Vec<(&'a str, &'a str)> {
    let open_prefix = format!("<{tag}");
    let close = format!("</{tag}>");
    let mut elements = Vec::new();
    let mut pos = 0;

    while let Some(relative_start) = xml[pos..].find(&open_prefix) {
        let start = pos + relative_start;
        let name_end = start + open_prefix.len();
        // Reject longer names that merely start with `tag`.
        let is_exact_name = matches!(
            xml[name_end..].chars().next(),
            Some(next) if next == '>' || next == '/' || next.is_whitespace()
        );
        if !is_exact_name {
            pos = name_end;
            continue;
        }

        let Some(open_end) = xml[name_end..].find('>') else {
            break;
        };
        let content_start = name_end + open_end + 1;
        let open_tag = &xml[start..content_start];
        if open_tag.ends_with("/>") {
            // Empty element: it has no content and no separate close tag.
            elements.push((open_tag, &xml[content_start..content_start]));
            pos = content_start;
            continue;
        }

        let Some(relative_end) = xml[content_start..].find(&close) else {
            break;
        };
        let content_end = content_start + relative_end;
        elements.push((open_tag, &xml[content_start..content_end]));
        pos = content_end + close.len();
    }

    elements
}
334
335fn xml_text_contents(xml: &str, tag: &str) -> Vec<String> {
336    tagged_ranges(xml, tag)
337        .into_iter()
338        .map(xml_unescape)
339        .collect()
340}
341
/// Decodes XML character and entity references in `text`.
///
/// Handles the five predefined entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
/// `&apos;`) plus decimal (`&#65;`) and hexadecimal (`&#x41;`) character
/// references. The input is decoded in a single left-to-right pass, so
/// already-escaped text such as `&amp;lt;` becomes `&lt;` and is not decoded
/// a second time. Anything that is not a recognised reference is copied
/// through unchanged.
fn xml_unescape(text: &str) -> String {
    // Longest reference body we accept is `#x10FFFF`; bounding the search for
    // `;` keeps the scan linear on text with many stray ampersands.
    const MAX_REFERENCE_LEN: usize = 12;

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let tail = &rest[amp..];

        let reference = tail
            .bytes()
            .take(MAX_REFERENCE_LEN)
            .position(|byte| byte == b';')
            .and_then(|semi| {
                let name = &tail[1..semi];
                let ch = match name {
                    "amp" => Some('&'),
                    "lt" => Some('<'),
                    "gt" => Some('>'),
                    "quot" => Some('"'),
                    "apos" => Some('\''),
                    _ => {
                        let code = if let Some(hex) = name.strip_prefix("#x") {
                            u32::from_str_radix(hex, 16).ok()
                        } else if let Some(dec) = name.strip_prefix('#') {
                            dec.parse::<u32>().ok()
                        } else {
                            None
                        };
                        code.and_then(char::from_u32)
                    }
                };
                ch.map(|ch| (ch, semi + 1))
            });

        match reference {
            Some((ch, consumed)) => {
                decoded.push(ch);
                rest = &tail[consumed..];
            }
            None => {
                // Not a reference: keep the ampersand literally and move on.
                decoded.push('&');
                rest = &tail[1..];
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
349
/// Locates the ZIP end-of-central-directory signature (`PK\x05\x06`).
///
/// The search runs backwards from the last position that leaves room for a
/// minimal 22-byte record and looks back at most a further 65 535 bytes (the
/// largest possible trailing comment). Returns the offset of the signature
/// closest to the end of `bytes`, or `None` if there is none in that window.
fn find_eocd(bytes: &[u8]) -> Option<usize> {
    const SIGNATURE: [u8; 4] = [0x50, 0x4b, 0x05, 0x06];
    const MIN_RECORD_LEN: usize = 22;
    const MAX_COMMENT_LEN: usize = 65_535;

    let newest = bytes.len().saturating_sub(MIN_RECORD_LEN);
    let oldest = bytes.len().saturating_sub(MIN_RECORD_LEN + MAX_COMMENT_LEN);

    let mut candidate = newest;
    loop {
        if bytes.get(candidate..candidate + 4) == Some(&SIGNATURE[..]) {
            return Some(candidate);
        }
        if candidate == oldest {
            return None;
        }
        candidate -= 1;
    }
}
357
358fn read_u16_le(bytes: &[u8], pos: usize) -> Result<u16> {
359    let end = pos + 2;
360    let slice = bytes
361        .get(pos..end)
362        .ok_or_else(|| DonglerError::archive("unexpected end of ZIP data"))?;
363    Ok(u16::from_le_bytes([slice[0], slice[1]]))
364}
365
366fn read_u32_le(bytes: &[u8], pos: usize) -> Result<u32> {
367    let end = pos + 4;
368    let slice = bytes
369        .get(pos..end)
370        .ok_or_else(|| DonglerError::archive("unexpected end of ZIP data"))?;
371    Ok(u32::from_le_bytes([slice[0], slice[1], slice[2], slice[3]]))
372}