Skip to main content

docspec_docx_reader/
lib.rs

1#![forbid(unsafe_code)]
2#![cfg_attr(docsrs, feature(doc_cfg))]
3//! DOCX to `DocSpec` event stream reader.
4//!
5//! This crate provides a [`DocxReader`] that implements [`EventSource`] to convert
6//! DOCX documents into the `DocSpec` event stream format. It uses `quick-xml` for
7//! streaming XML parsing and `zip` for archive extraction.
8//!
9//! # Scope
10//!
11//! **In scope**: Paragraphs (`<w:p>`) and direct text (`<w:t>` inside `<w:r>`).
12//! Emits exactly: `StartDocument`, `StartParagraph`, `Text`, `EndParagraph`, `EndDocument`.
13//!
14//! **Out of scope (silently dropped)**:
15//! - Run styling (`<w:rPr>`, bold, italic, etc.)
16//! - Line and page breaks (`<w:br>`)
17//! - Tabs (`<w:tab>`)
18//! - Headings (any `<w:pStyle>` value — every paragraph is `StartParagraph`)
19//! - Tables (`<w:tbl>`, `<w:tr>`, `<w:tc>`)
20//! - Lists (numbering and bullets only; each list item is still emitted as a plain paragraph)
21//! - Hyperlinks (`<w:hyperlink>`)
22//! - Drawings and images (`<w:drawing>`, `<w:pict>`)
23//! - Structured document tags (`<w:sdt>`)
24//! - Comments, footnotes, headers, footers
25//! - Document metadata
26//! - Tracked changes (`<w:ins>`, `<w:del>`, `<w:moveFrom>`, `<w:moveTo>`)
27//!
28//! # Streaming Guarantee
29//!
30//! `DocxReader` streams `document.xml` event by event using constant memory
31//! regardless of document size. Only `_rels/.rels` (a few hundred bytes) is
32//! fully read into memory to discover the document target path.
33//!
34//! # Quick Start
35//!
36//! ```no_run
37//! use docspec_docx_reader::{DocxReader, EventSource};
38//!
39//! let mut reader = DocxReader::from_path("document.docx")?;
40//! while let Some(event) = reader.next_event()? {
41//!     println!("{event:?}");
42//! }
43//! # Ok::<(), docspec_core::Error>(())
44//! ```
45
46extern crate alloc;
47
48mod rels;
49
50use alloc::collections::VecDeque;
51use core::fmt;
52use std::io::{BufReader, Read, Seek};
53use std::path::Path;
54
55pub use docspec_core::EventSource;
56use docspec_core::{Error, Event, Result, TextStyle};
57use quick_xml::events::{BytesCData, BytesRef, BytesText};
58
/// Where the reader is in the document-level event sequence.
///
/// The reader starts in [`Phase::NotStarted`], moves to [`Phase::Running`]
/// once `StartDocument` has been queued, and ends in [`Phase::Finished`]
/// when the XML stream reaches end of file.
///
/// `Debug` is derived so the phase can be shown by name in diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Phase {
    /// `EndDocument` has been queued; once the queue drains there is nothing
    /// further to read.
    Finished,
    /// `StartDocument` has not been queued yet.
    NotStarted,
    /// XML events are being read and translated, between `StartDocument`
    /// and `EndDocument`.
    Running,
}
69
70/// A streaming DOCX reader that implements [`EventSource`].
71///
72/// `DocxReader` parses a DOCX archive and emits `DocSpec` events one at a time.
73/// Only `<w:p>` paragraph elements and `<w:t>` text elements are recognized;
74/// all other elements are silently ignored.
75///
76/// # Streaming
77///
78/// The reader streams `document.xml` event by event using constant memory.
79/// Only `_rels/.rels` (a few hundred bytes) is buffered to discover the
80/// document target path.
81///
82/// # Errors
83///
84/// Returns [`Error::Io`] for I/O failures and [`Error::Parse`] for malformed
85/// archives or XML.
86pub struct DocxReader {
87    /// Reusable buffer for quick-xml event reading.
88    buf: Vec<u8>,
89    /// Depth counter for ignored subtrees (tables, tracked changes, etc.).
90    /// Incremented on Start of an ignored container, decremented on End.
91    in_ignored_subtree: u32,
92    /// Whether the reader is currently inside a `<w:p>` element.
93    in_paragraph: bool,
94    /// Whether the reader is currently inside a `<w:t>` element.
95    in_text: bool,
96    /// Text collected for the current `<w:t>` element.
97    pending_text: String,
98    /// Document processing phase.
99    phase: Phase,
100    /// Queue of `DocSpec` events to emit.
101    queue: VecDeque<Event>,
102    /// The quick-xml reader streaming from the document entry.
103    xml: quick_xml::Reader<BufReader<Box<dyn Read>>>,
104}
105
106impl fmt::Debug for DocxReader {
107    #[inline]
108    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
109        f.debug_struct("DocxReader")
110            .field("buf", &self.buf)
111            .field("in_ignored_subtree", &self.in_ignored_subtree)
112            .field("in_paragraph", &self.in_paragraph)
113            .field("in_text", &self.in_text)
114            .field("pending_text", &self.pending_text)
115            .field("phase", &"<phase>")
116            .field("queue", &self.queue)
117            .field("xml", &"<quick_xml::Reader>")
118            .finish()
119    }
120}
121
122impl DocxReader {
123    /// Creates a `DocxReader` from a file path.
124    ///
125    /// # Errors
126    ///
127    /// Returns [`Error::Io`] if the file cannot be opened. See [`from_reader`](Self::from_reader)
128    /// for additional error conditions.
129    #[inline]
130    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
131        let file = std::fs::File::open(path.as_ref()).map_err(Error::from)?;
132        Self::from_reader(file)
133    }
134
135    /// Creates a `DocxReader` from any `Read + Seek` source.
136    ///
137    /// The reader must be positioned at the start of a valid DOCX (ZIP) archive.
138    ///
139    /// # Errors
140    ///
141    /// Returns [`Error::Parse`] if the input is not a valid ZIP archive, if
142    /// `_rels/.rels` is missing or malformed, or if the document target entry
143    /// cannot be opened. Returns [`Error::Io`] for I/O failures.
144    #[inline]
145    pub fn from_reader<R: Read + Seek + 'static>(mut reader: R) -> Result<Self> {
146        let mut archive = zip::ZipArchive::new(&mut reader).map_err(|err| match err {
147            zip::result::ZipError::InvalidArchive(_)
148            | zip::result::ZipError::UnsupportedArchive(_) => Error::Parse {
149                message: "not a valid ZIP archive".to_string(),
150                position: None,
151            },
152            zip::result::ZipError::Io(source) => Error::Io { source },
153            zip::result::ZipError::FileNotFound
154            | zip::result::ZipError::InvalidPassword
155            | zip::result::ZipError::CompressionMethodNotSupported(_)
156            | _ => parse_error(format!("not a valid ZIP archive: {err}")),
157        })?;
158
159        let document_path = rels::find_document_path(&mut archive)?;
160
161        let (data_start, compressed_size, method) = {
162            let entry = archive
163                .by_name(&document_path)
164                .map_err(|_err| Error::Parse {
165                    message: format!("document target not found: {document_path}"),
166                    position: None,
167                })?;
168            let data_start = entry
169                .data_start()
170                .ok_or_else(|| parse_error("document.xml has no data offset".to_string()))?;
171            (data_start, entry.compressed_size(), entry.compression())
172        };
173        drop(archive);
174
175        reader
176            .seek(std::io::SeekFrom::Start(data_start))
177            .map_err(Error::from)?;
178
179        let limited = reader.take(compressed_size);
180
181        let stream: Box<dyn Read> = if method == zip::CompressionMethod::Stored {
182            Box::new(limited)
183        } else if method == zip::CompressionMethod::Deflated {
184            Box::new(flate2::read::DeflateDecoder::new(limited))
185        } else {
186            return Err(Error::Parse {
187                message: format!("unsupported compression: {method:?}"),
188                position: None,
189            });
190        };
191
192        let xml = quick_xml::Reader::from_reader(BufReader::new(stream));
193
194        Ok(Self {
195            buf: Vec::with_capacity(4096),
196            in_ignored_subtree: 0,
197            in_paragraph: false,
198            in_text: false,
199            pending_text: String::new(),
200            phase: Phase::NotStarted,
201            queue: VecDeque::new(),
202            xml,
203        })
204    }
205}
206
207impl DocxReader {
208    fn can_collect_text(&self) -> bool {
209        self.in_ignored_subtree == 0 && self.in_paragraph && self.in_text
210    }
211
212    fn end_paragraph(&mut self) {
213        self.queue.push_back(Event::EndParagraph);
214        self.in_paragraph = false;
215        self.in_text = false;
216        self.pending_text.clear();
217    }
218
219    fn flush_pending_text(&mut self) {
220        if !self.pending_text.is_empty() {
221            self.queue.push_back(Event::Text {
222                content: core::mem::take(&mut self.pending_text),
223                style: TextStyle::default(),
224            });
225        }
226    }
227
228    fn handle_cdata(&mut self, cdata: BytesCData<'_>) -> Result<()> {
229        if self.can_collect_text() {
230            let bytes = cdata.into_inner();
231            let content = core::str::from_utf8(&bytes)
232                .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
233            self.pending_text.push_str(content);
234        }
235        Ok(())
236    }
237
238    fn handle_empty(&mut self, local: &[u8]) {
239        match local {
240            value if self.in_ignored_subtree > 0 || is_ignored_container(value) => {}
241            b"p" if !self.in_paragraph => {
242                self.queue.push_back(Event::StartParagraph {
243                    alignment: None,
244                    id: None,
245                });
246                self.queue.push_back(Event::EndParagraph);
247            }
248            _ => {}
249        }
250    }
251
252    fn handle_end(&mut self, local: &[u8]) {
253        if self.in_ignored_subtree > 0 {
254            self.in_ignored_subtree = self.in_ignored_subtree.saturating_sub(1);
255            return;
256        }
257
258        match local {
259            b"p" if self.in_paragraph => self.end_paragraph(),
260            b"t" if self.in_text => {
261                self.flush_pending_text();
262                self.in_text = false;
263            }
264            _ => {}
265        }
266    }
267
268    fn handle_eof(&mut self) {
269        if self.in_text {
270            self.flush_pending_text();
271        }
272        if self.in_paragraph {
273            self.end_paragraph();
274        }
275        self.queue.push_back(Event::EndDocument);
276        self.phase = Phase::Finished;
277    }
278
279    fn handle_general_ref(&mut self, reference: &BytesRef<'_>) -> Result<()> {
280        if self.can_collect_text() {
281            let decoded = reference
282                .decode()
283                .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
284            let escaped = format!("&{decoded};");
285            let unescaped = quick_xml::escape::unescape(&escaped)
286                .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
287            self.pending_text.push_str(&unescaped);
288        }
289        Ok(())
290    }
291
292    fn handle_start(&mut self, local: &[u8]) {
293        if self.in_ignored_subtree > 0 {
294            self.in_ignored_subtree = self.in_ignored_subtree.saturating_add(1);
295            return;
296        }
297
298        match local {
299            value if is_ignored_container(value) => self.in_ignored_subtree = 1,
300            b"p" if !self.in_paragraph => self.start_paragraph(),
301            b"t" if self.in_paragraph => {
302                self.in_text = true;
303                self.pending_text.clear();
304            }
305            _ => {}
306        }
307    }
308
309    fn handle_text(&mut self, text: &BytesText<'_>) -> Result<()> {
310        if self.can_collect_text() {
311            let decoded = text
312                .decode()
313                .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
314            let unescaped = quick_xml::escape::unescape(&decoded)
315                .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
316            self.pending_text.push_str(&unescaped);
317        }
318        Ok(())
319    }
320
321    fn read_until_event(&mut self) -> Result<()> {
322        let event = self
323            .xml
324            .read_event_into(&mut self.buf)
325            .map_err(|err| match err {
326                quick_xml::Error::Io(source) => Error::Io {
327                    source: std::io::Error::new(source.kind(), source.to_string()),
328                },
329                other => Error::Parse {
330                    message: format!("malformed document.xml: {other}"),
331                    position: None,
332                },
333            })?
334            .into_owned();
335
336        match event {
337            quick_xml::events::Event::Start(tag) => self.handle_start(tag.local_name().as_ref()),
338            quick_xml::events::Event::End(tag) => self.handle_end(tag.local_name().as_ref()),
339            quick_xml::events::Event::Empty(tag) => self.handle_empty(tag.local_name().as_ref()),
340            quick_xml::events::Event::Text(text) => {
341                self.handle_text(&text)?;
342            }
343            quick_xml::events::Event::GeneralRef(reference) => {
344                self.handle_general_ref(&reference)?;
345            }
346            quick_xml::events::Event::CData(cdata) => self.handle_cdata(cdata)?,
347            quick_xml::events::Event::Eof => self.handle_eof(),
348            quick_xml::events::Event::Comment(_)
349            | quick_xml::events::Event::Decl(_)
350            | quick_xml::events::Event::PI(_)
351            | quick_xml::events::Event::DocType(_) => {}
352        }
353
354        self.buf.clear();
355        Ok(())
356    }
357
358    fn start_paragraph(&mut self) {
359        self.queue.push_back(Event::StartParagraph {
360            alignment: None,
361            id: None,
362        });
363        self.in_paragraph = true;
364        self.in_text = false;
365        self.pending_text.clear();
366    }
367}
368
369impl EventSource for DocxReader {
370    #[inline]
371    fn next_event(&mut self) -> Result<Option<Event>> {
372        loop {
373            if let Some(event) = self.queue.pop_front() {
374                return Ok(Some(event));
375            }
376
377            match self.phase {
378                Phase::NotStarted => {
379                    self.phase = Phase::Running;
380                    self.queue.push_back(Event::StartDocument {
381                        id: None,
382                        language: None,
383                        metadata: None,
384                    });
385                }
386                Phase::Finished => return Ok(None),
387                Phase::Running => self.read_until_event()?,
388            }
389        }
390    }
391}
392
/// Reports whether `local` (an element's local name, without namespace
/// prefix) names a container whose whole subtree the reader skips.
///
/// The comparison is exact and case-sensitive.
fn is_ignored_container(local: &[u8]) -> bool {
    const IGNORED: [&[u8]; 12] = [
        b"tbl",
        b"tr",
        b"tc",
        b"sdt",
        b"hyperlink",
        b"drawing",
        b"pict",
        b"object",
        b"ins",
        b"del",
        b"moveFrom",
        b"moveTo",
    ];
    IGNORED.iter().any(|name| *name == local)
}
410
411fn parse_error(message: String) -> Error {
412    Error::Parse {
413        message,
414        position: None,
415    }
416}
417
#[cfg(test)]
#[cfg(not(coverage))]
mod tests {
    use std::io::{Cursor, Write as _};

    use zip::{write::SimpleFileOptions, CompressionMethod, ZipWriter};

    use super::*;

    /// Minimal package relationships part pointing at `word/document.xml`.
    const SIMPLE_RELS: &str = r#"<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>"#;

    /// Builds an in-memory DOCX archive holding just `_rels/.rels` and
    /// `word/document.xml`, both deflate-compressed.
    fn synth_docx_for_unit_test(
        rels_xml: &str,
        document_xml: &str,
    ) -> core::result::Result<Vec<u8>, Box<dyn core::error::Error>> {
        let mut writer = ZipWriter::new(Cursor::new(Vec::new()));
        let entries = [
            ("_rels/.rels", rels_xml),
            ("word/document.xml", document_xml),
        ];
        for (name, contents) in entries {
            let options =
                SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
            writer.start_file(name, options)?;
            writer.write_all(contents.as_bytes())?;
        }
        let cursor = writer.finish()?;
        Ok(cursor.into_inner())
    }

    /// Wraps `document_xml` in a synthetic archive and opens a reader on it.
    fn make_reader(
        document_xml: &str,
    ) -> core::result::Result<DocxReader, Box<dyn core::error::Error>> {
        let archive = synth_docx_for_unit_test(SIMPLE_RELS, document_xml)?;
        let reader = DocxReader::from_reader(Cursor::new(archive))?;
        Ok(reader)
    }

    /// The internal queue must stay small however many paragraphs the
    /// document holds.
    #[test]
    fn queue_length_never_exceeds_three() -> core::result::Result<(), Box<dyn core::error::Error>> {
        let mut doc = String::from(
            r#"<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body>"#,
        );
        doc.push_str(&"<w:p><w:r><w:t>hello</w:t></w:r></w:p>".repeat(1000));
        doc.push_str("</w:body></w:document>");

        let mut reader = make_reader(&doc)?;
        let mut more = true;
        while more {
            let queued = reader.queue.len();
            if queued > 3 {
                return Err(Box::new(Error::Other {
                    message: format!("queue grew to {queued}"),
                }));
            }
            more = reader.next_event()?.is_some();
        }
        Ok(())
    }

    /// The quick-xml scratch buffer must be empty whenever control returns
    /// to the caller with an event.
    #[test]
    fn buf_is_cleared_per_iteration() -> core::result::Result<(), Box<dyn core::error::Error>> {
        let doc = r#"<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>hello</w:t></w:r></w:p></w:body></w:document>"#;
        let mut reader = make_reader(doc)?;
        while let Some(_event) = reader.next_event()? {
            if reader.buf.is_empty() {
                continue;
            }
            return Err(Box::new(Error::Other {
                message: "buf not cleared after event".to_string(),
            }));
        }
        Ok(())
    }
}