memvid_core/reader/
docx.rs1use std::io::{Cursor, Read};
2
3use quick_xml::Reader as XmlReader;
4use quick_xml::events::Event;
5use zip::ZipArchive;
6
7use crate::{
8 DocumentFormat, DocumentReader, PassthroughReader, ReaderDiagnostics, ReaderHint, ReaderOutput,
9 Result,
10};
11
/// Name of the archive entry that `DocxReader::extract_text` reads the document XML from.
const DOC_XML_PATH: &str = "word/document.xml";
13
/// Stateless [`DocumentReader`] implementation that reports itself as `"docx"`.
///
/// It accepts inputs hinted as `DocumentFormat::Docx` or carrying the WordprocessingML
/// MIME type, and extracts text from the `word/document.xml` entry of the archive.
pub struct DocxReader;
15
16impl DocxReader {
17 fn extract_text(bytes: &[u8]) -> Result<String> {
18 let cursor = Cursor::new(bytes);
19 let mut archive =
20 ZipArchive::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
21 reason: format!("failed to open docx archive: {err}").into(),
22 })?;
23
24 let mut file =
25 archive
26 .by_name(DOC_XML_PATH)
27 .map_err(|err| crate::MemvidError::ExtractionFailed {
28 reason: format!("docx missing document.xml: {err}").into(),
29 })?;
30 let mut xml = String::new();
31 file.read_to_string(&mut xml)
32 .map_err(|err| crate::MemvidError::ExtractionFailed {
33 reason: format!("failed to read document.xml: {err}").into(),
34 })?;
35
36 Ok(extract_plain_text(&xml, b"w:p"))
37 }
38}
39
40impl DocumentReader for DocxReader {
41 fn name(&self) -> &'static str {
42 "docx"
43 }
44
45 fn supports(&self, hint: &ReaderHint<'_>) -> bool {
46 matches!(hint.format, Some(DocumentFormat::Docx))
47 || hint
48 .mime
49 .map(|mime| {
50 mime.eq_ignore_ascii_case(
51 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
52 )
53 })
54 .unwrap_or(false)
55 }
56
57 fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
58 match Self::extract_text(bytes) {
59 Ok(text) => {
60 if text.trim().is_empty() {
61 let mut output = PassthroughReader.extract(bytes, hint)?;
62 output.reader_name = self.name().to_string();
63 output.diagnostics.mark_fallback();
64 output.diagnostics.record_warning(
65 "docx reader produced empty text; falling back to default extractor",
66 );
67 Ok(output)
68 } else {
69 let mut base = PassthroughReader.extract(bytes, hint)?;
70 base.reader_name = self.name().to_string();
71 base.document.text = Some(text);
72 base.document.mime_type = Some(
73 "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
74 .to_string(),
75 );
76 base.diagnostics = ReaderDiagnostics::default();
77 Ok(base)
78 }
79 }
80 Err(err) => {
81 let mut fallback = PassthroughReader.extract(bytes, hint)?;
82 fallback.reader_name = self.name().to_string();
83 fallback.diagnostics.mark_fallback();
84 fallback
85 .diagnostics
86 .record_warning(format!("docx reader error: {err}"));
87 Ok(fallback)
88 }
89 }
90 }
91}
92
93fn extract_plain_text(xml: &str, block_tag: &[u8]) -> String {
94 let mut reader = XmlReader::from_str(xml);
95 reader.trim_text(true);
96 let mut buf = Vec::new();
97 let mut text = String::new();
98 let mut first_block = true;
99
100 loop {
101 match reader.read_event_into(&mut buf) {
102 Ok(Event::Start(e)) => {
103 if e.name().as_ref().ends_with(block_tag) {
104 if !first_block {
105 text.push('\n');
106 }
107 first_block = false;
108 }
109 }
110 Ok(Event::Text(t)) => {
111 if let Ok(content) = t.unescape() {
112 if !content.trim().is_empty() {
113 text.push_str(content.trim());
114 text.push(' ');
115 }
116 }
117 }
118 Ok(Event::Eof) => break,
119 Err(_) => break,
120 _ => (),
121 }
122 buf.clear();
123 }
124
125 text.trim().to_string()
126}