memvid_core/reader/
docx.rs

1use std::io::{Cursor, Read};
2
3use quick_xml::Reader as XmlReader;
4use quick_xml::events::Event;
5use zip::ZipArchive;
6
7use crate::{
8    DocumentFormat, DocumentReader, PassthroughReader, ReaderDiagnostics, ReaderHint, ReaderOutput,
9    Result,
10};
11
/// Name of the archive entry that `DocxReader::extract_text` looks up and
/// parses for text.
const DOC_XML_PATH: &str = "word/document.xml";
13
/// Reader for DOCX (`wordprocessingml.document`) files.
///
/// Text is pulled straight out of `word/document.xml`; when that fails or
/// yields nothing, extraction is delegated to `PassthroughReader`.
pub struct DocxReader;
15
16impl DocxReader {
17    fn extract_text(bytes: &[u8]) -> Result<String> {
18        let cursor = Cursor::new(bytes);
19        let mut archive =
20            ZipArchive::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
21                reason: format!("failed to open docx archive: {err}").into(),
22            })?;
23
24        let mut file =
25            archive
26                .by_name(DOC_XML_PATH)
27                .map_err(|err| crate::MemvidError::ExtractionFailed {
28                    reason: format!("docx missing document.xml: {err}").into(),
29                })?;
30        let mut xml = String::new();
31        file.read_to_string(&mut xml)
32            .map_err(|err| crate::MemvidError::ExtractionFailed {
33                reason: format!("failed to read document.xml: {err}").into(),
34            })?;
35
36        Ok(extract_plain_text(&xml, b"w:p"))
37    }
38}
39
40impl DocumentReader for DocxReader {
41    fn name(&self) -> &'static str {
42        "docx"
43    }
44
45    fn supports(&self, hint: &ReaderHint<'_>) -> bool {
46        matches!(hint.format, Some(DocumentFormat::Docx))
47            || hint
48                .mime
49                .map(|mime| {
50                    mime.eq_ignore_ascii_case(
51                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
52                    )
53                })
54                .unwrap_or(false)
55    }
56
57    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
58        match Self::extract_text(bytes) {
59            Ok(text) => {
60                if text.trim().is_empty() {
61                    // quick-xml returned empty - try extractous as fallback
62                    let mut output = PassthroughReader.extract(bytes, hint)?;
63                    output.reader_name = self.name().to_string();
64                    output.diagnostics.mark_fallback();
65                    output.diagnostics.record_warning(
66                        "docx reader produced empty text; falling back to default extractor",
67                    );
68                    Ok(output)
69                } else {
70                    // quick-xml succeeded - build output directly WITHOUT calling extractous
71                    let mut document = crate::ExtractedDocument::empty();
72                    document.text = Some(text);
73                    document.mime_type = Some(
74                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
75                            .to_string(),
76                    );
77                    Ok(ReaderOutput::new(document, self.name())
78                        .with_diagnostics(ReaderDiagnostics::default()))
79                }
80            }
81            Err(err) => {
82                // quick-xml failed - try extractous as fallback
83                let mut fallback = PassthroughReader.extract(bytes, hint)?;
84                fallback.reader_name = self.name().to_string();
85                fallback.diagnostics.mark_fallback();
86                fallback
87                    .diagnostics
88                    .record_warning(format!("docx reader error: {err}"));
89                Ok(fallback)
90            }
91        }
92    }
93}
94
95fn extract_plain_text(xml: &str, block_tag: &[u8]) -> String {
96    let mut reader = XmlReader::from_str(xml);
97    reader.trim_text(true);
98    let mut buf = Vec::new();
99    let mut text = String::new();
100    let mut first_block = true;
101
102    loop {
103        match reader.read_event_into(&mut buf) {
104            Ok(Event::Start(e)) => {
105                if e.name().as_ref().ends_with(block_tag) {
106                    if !first_block {
107                        text.push('\n');
108                    }
109                    first_block = false;
110                }
111            }
112            Ok(Event::Text(t)) => {
113                if let Ok(content) = t.unescape() {
114                    if !content.trim().is_empty() {
115                        text.push_str(content.trim());
116                        text.push(' ');
117                    }
118                }
119            }
120            Ok(Event::Eof) => break,
121            Err(_) => break,
122            _ => (),
123        }
124        buf.clear();
125    }
126
127    text.trim().to_string()
128}