Skip to main content

docspec_docx_reader/
lib.rs

1#![forbid(unsafe_code)]
2#![cfg_attr(docsrs, feature(doc_cfg))]
3//! DOCX to `DocSpec` event stream reader.
4//!
5//! This crate provides a [`DocxReader`] that implements [`EventSource`] to convert
6//! DOCX documents into the `DocSpec` event stream format. It uses `quick-xml` for
7//! streaming XML parsing and `zip` for archive extraction.
8//!
9//! # Scope
10//!
11//! **In scope**: Paragraphs (`<w:p>`), direct text (`<w:t>` inside `<w:r>`),
12//! line breaks (`<w:br>` — including `w:type="page"` and `w:type="column"`, all
13//! emitted as `LineBreak`), tabs (`<w:tab>`, emitted as a `Text` event whose
14//! content is the single character `"\t"`), and tables (`<w:tbl>`, `<w:tr>`,
15//! `<w:tc>` — emitted as structural events only; cell merging, header rows, and
16//! table styles are not represented).
17//! Emits exactly: `StartDocument`, `StartParagraph`, `Text`, `LineBreak`,
18//! `EndParagraph`, `StartTable`, `StartTableRow`, `StartTableCell`,
19//! `EndTableCell`, `EndTableRow`, `EndTable`, `EndDocument`.
20//!
21//! **Out of scope (silently dropped)**:
22//! - Run styling (`<w:rPr>`, bold, italic, etc.)
23//! - Headings (any `<w:pStyle>` value — every paragraph is `StartParagraph`)
24//! - Cell merging (`<w:gridSpan>`, `<w:vMerge>`) — every cell emits with
25//!   `colspan: None` and `rowspan: None`
26//! - Header rows (`<w:tblHeader>`) — every cell emits as `StartTableCell`,
27//!   never `StartTableHeader`
28//! - Table, row, and cell properties (`<w:tblPr>`, `<w:trPr>`, `<w:tcPr>`,
29//!   `<w:tblGrid>`)
30//! - Lists
31//! - Hyperlinks (`<w:hyperlink>`)
32//! - Drawings and images (`<w:drawing>`, `<w:pict>`)
33//! - Structured document tags (`<w:sdt>`)
34//! - Comments, footnotes, headers, footers
35//! - Document metadata
36//! - Tracked changes (`<w:ins>`, `<w:del>`, `<w:moveFrom>`, `<w:moveTo>`)
37//!
38//! # Streaming Guarantee
39//!
40//! `DocxReader` streams `document.xml` event by event using constant memory
41//! regardless of document size. Only `_rels/.rels` (a few hundred bytes) is
42//! fully read into memory to discover the document target path.
43//!
44//! # Quick Start
45//!
46//! ```no_run
47//! use docspec_docx_reader::{DocxReader, EventSource};
48//!
49//! let mut reader = DocxReader::from_path("document.docx")?;
50//! while let Some(event) = reader.next_event()? {
51//!     println!("{event:?}");
52//! }
53//! # Ok::<(), docspec_core::Error>(())
54//! ```
55
56extern crate alloc;
57
58mod rels;
59
60use alloc::collections::VecDeque;
61use core::fmt;
62use std::io::{BufReader, Read, Seek};
63use std::path::Path;
64
65pub use docspec_core::EventSource;
66use docspec_core::{Error, Event, Result, TextStyle};
67use quick_xml::events::{BytesCData, BytesRef, BytesText};
68
/// Lifecycle state of a [`DocxReader`].
///
/// `next_event` consults this whenever the event queue is empty to decide
/// whether to queue `StartDocument`, read more XML, or report the end.
#[derive(Clone, Copy, Eq, PartialEq)]
enum Phase {
    /// Terminal state: `EndDocument` has already been queued.
    Finished,
    /// Initial state: `StartDocument` has not been queued yet.
    NotStarted,
    /// Between `StartDocument` and `EndDocument`; XML events are being read.
    Running,
}
79
80/// A streaming DOCX reader that implements [`EventSource`].
81///
82/// `DocxReader` parses a DOCX archive and emits `DocSpec` events one at a time.
83/// `<w:p>` paragraphs, `<w:t>` text, `<w:br>` line breaks, `<w:tab>` tabs, and
84/// table elements (`<w:tbl>`, `<w:tr>`, `<w:tc>`) are recognized; all other
85/// elements are silently ignored.
86///
87/// # Streaming
88///
89/// The reader streams `document.xml` event by event using constant memory.
90/// Only `_rels/.rels` (a few hundred bytes) is buffered to discover the
91/// document target path.
92///
93/// # Errors
94///
95/// Returns [`Error::Io`] for I/O failures and [`Error::Parse`] for malformed
96/// archives or XML.
97pub struct DocxReader {
98    /// Reusable buffer for quick-xml event reading.
99    buf: Vec<u8>,
100    /// Depth counter for ignored subtrees (tracked changes, hyperlinks,
101    /// drawings, table/row/cell property containers, etc.).
102    /// Incremented on Start of an ignored container, decremented on End.
103    in_ignored_subtree: u32,
104    /// Whether the reader is currently inside a `<w:p>` element.
105    in_paragraph: bool,
106    /// Whether the reader is currently inside a `<w:t>` element.
107    in_text: bool,
108    /// Text collected for the current `<w:t>` element.
109    pending_text: String,
110    /// Document processing phase.
111    phase: Phase,
112    /// Queue of `DocSpec` events to emit.
113    queue: VecDeque<Event>,
114    /// The quick-xml reader streaming from the document entry.
115    xml: quick_xml::Reader<BufReader<Box<dyn Read + Send>>>,
116}
117
118impl fmt::Debug for DocxReader {
119    #[inline]
120    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
121        f.debug_struct("DocxReader")
122            .field("buf", &self.buf)
123            .field("in_ignored_subtree", &self.in_ignored_subtree)
124            .field("in_paragraph", &self.in_paragraph)
125            .field("in_text", &self.in_text)
126            .field("pending_text", &self.pending_text)
127            .field("phase", &"<phase>")
128            .field("queue", &self.queue)
129            .field("xml", &"<quick_xml::Reader>")
130            .finish()
131    }
132}
133
134impl DocxReader {
135    /// Creates a `DocxReader` from a file path.
136    ///
137    /// # Errors
138    ///
139    /// Returns [`Error::Io`] if the file cannot be opened. See [`from_reader`](Self::from_reader)
140    /// for additional error conditions.
141    #[inline]
142    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
143        let file = std::fs::File::open(path.as_ref()).map_err(Error::from)?;
144        Self::from_reader(file)
145    }
146
147    /// Creates a `DocxReader` from any `Read + Seek` source.
148    ///
149    /// The reader must be positioned at the start of a valid DOCX (ZIP) archive.
150    ///
151    /// # Errors
152    ///
153    /// Returns [`Error::Parse`] if the input is not a valid ZIP archive, if
154    /// `_rels/.rels` is missing or malformed, or if the document target entry
155    /// cannot be opened. Returns [`Error::Io`] for I/O failures.
156    #[inline]
157    pub fn from_reader<R: Read + Seek + Send + 'static>(mut reader: R) -> Result<Self> {
158        let mut archive = zip::ZipArchive::new(&mut reader).map_err(|err| match err {
159            zip::result::ZipError::InvalidArchive(_)
160            | zip::result::ZipError::UnsupportedArchive(_) => Error::Parse {
161                message: "not a valid ZIP archive".to_string(),
162                position: None,
163            },
164            zip::result::ZipError::Io(source) => Error::Io { source },
165            zip::result::ZipError::FileNotFound
166            | zip::result::ZipError::InvalidPassword
167            | zip::result::ZipError::CompressionMethodNotSupported(_)
168            | _ => parse_error(format!("not a valid ZIP archive: {err}")),
169        })?;
170
171        let document_path = rels::find_document_path(&mut archive)?;
172
173        let (data_start, compressed_size, method) = {
174            let entry = archive
175                .by_name(&document_path)
176                .map_err(|_err| Error::Parse {
177                    message: format!("document target not found: {document_path}"),
178                    position: None,
179                })?;
180            let data_start = entry
181                .data_start()
182                .ok_or_else(|| parse_error("document.xml has no data offset".to_string()))?;
183            (data_start, entry.compressed_size(), entry.compression())
184        };
185        drop(archive);
186
187        reader
188            .seek(std::io::SeekFrom::Start(data_start))
189            .map_err(Error::from)?;
190
191        let limited = reader.take(compressed_size);
192
193        let stream: Box<dyn Read + Send> = if method == zip::CompressionMethod::Stored {
194            Box::new(limited)
195        } else if method == zip::CompressionMethod::Deflated {
196            Box::new(flate2::read::DeflateDecoder::new(limited))
197        } else {
198            return Err(Error::Parse {
199                message: format!("unsupported compression: {method:?}"),
200                position: None,
201            });
202        };
203
204        let xml = quick_xml::Reader::from_reader(BufReader::new(stream));
205
206        Ok(Self {
207            buf: Vec::with_capacity(4096),
208            in_ignored_subtree: 0,
209            in_paragraph: false,
210            in_text: false,
211            pending_text: String::new(),
212            phase: Phase::NotStarted,
213            queue: VecDeque::new(),
214            xml,
215        })
216    }
217}
218
219impl DocxReader {
220    fn can_collect_text(&self) -> bool {
221        self.in_ignored_subtree == 0 && self.in_paragraph && self.in_text
222    }
223
224    fn emit_line_break(&mut self) {
225        self.flush_pending_text();
226        self.queue.push_back(Event::LineBreak);
227    }
228
229    fn emit_tab(&mut self) {
230        self.flush_pending_text();
231        self.queue.push_back(Event::Text {
232            content: "\t".to_string(),
233            style: TextStyle::default(),
234        });
235    }
236
237    fn end_paragraph(&mut self) {
238        self.queue.push_back(Event::EndParagraph);
239        self.in_paragraph = false;
240        self.in_text = false;
241        self.pending_text.clear();
242    }
243
244    fn flush_pending_text(&mut self) {
245        if !self.pending_text.is_empty() {
246            self.queue.push_back(Event::Text {
247                content: core::mem::take(&mut self.pending_text),
248                style: TextStyle::default(),
249            });
250        }
251    }
252
253    fn handle_cdata(&mut self, cdata: BytesCData<'_>) -> Result<()> {
254        if self.can_collect_text() {
255            let bytes = cdata.into_inner();
256            let content = core::str::from_utf8(&bytes)
257                .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
258            self.pending_text.push_str(content);
259        }
260        Ok(())
261    }
262
263    fn handle_empty(&mut self, local: &[u8]) {
264        match local {
265            value if self.in_ignored_subtree > 0 || is_ignored_container(value) => {}
266            b"p" if !self.in_paragraph => {
267                self.queue.push_back(Event::StartParagraph {
268                    alignment: None,
269                    id: None,
270                });
271                self.queue.push_back(Event::EndParagraph);
272            }
273            b"br" if self.in_paragraph => self.emit_line_break(),
274            b"tab" if self.in_paragraph => self.emit_tab(),
275            _ => {}
276        }
277    }
278
279    fn handle_end(&mut self, local: &[u8]) {
280        if self.in_ignored_subtree > 0 {
281            self.in_ignored_subtree = self.in_ignored_subtree.saturating_sub(1);
282            return;
283        }
284
285        match local {
286            b"p" if self.in_paragraph => self.end_paragraph(),
287            b"t" if self.in_text => {
288                self.flush_pending_text();
289                self.in_text = false;
290            }
291            b"tbl" => self.queue.push_back(Event::EndTable),
292            b"tr" => self.queue.push_back(Event::EndTableRow),
293            b"tc" => self.queue.push_back(Event::EndTableCell),
294            _ => {}
295        }
296    }
297
298    fn handle_eof(&mut self) {
299        if self.in_text {
300            self.flush_pending_text();
301        }
302        if self.in_paragraph {
303            self.end_paragraph();
304        }
305        self.queue.push_back(Event::EndDocument);
306        self.phase = Phase::Finished;
307    }
308
309    fn handle_general_ref(&mut self, reference: &BytesRef<'_>) -> Result<()> {
310        if self.can_collect_text() {
311            let decoded = reference
312                .decode()
313                .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
314            let escaped = format!("&{decoded};");
315            let unescaped = quick_xml::escape::unescape(&escaped)
316                .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
317            self.pending_text.push_str(&unescaped);
318        }
319        Ok(())
320    }
321
322    fn handle_start(&mut self, local: &[u8]) {
323        if self.in_ignored_subtree > 0 {
324            self.in_ignored_subtree = self.in_ignored_subtree.saturating_add(1);
325            return;
326        }
327
328        match local {
329            value if is_ignored_container(value) => self.in_ignored_subtree = 1,
330            b"p" if !self.in_paragraph => self.start_paragraph(),
331            b"t" if self.in_paragraph => {
332                self.in_text = true;
333                self.pending_text.clear();
334            }
335            b"br" if self.in_paragraph => self.emit_line_break(),
336            b"tab" if self.in_paragraph => self.emit_tab(),
337            b"tbl" => self.queue.push_back(Event::StartTable { id: None }),
338            b"tr" => self.queue.push_back(Event::StartTableRow { id: None }),
339            b"tc" => self.queue.push_back(Event::StartTableCell {
340                colspan: None,
341                id: None,
342                rowspan: None,
343            }),
344            _ => {}
345        }
346    }
347
348    fn handle_text(&mut self, text: &BytesText<'_>) -> Result<()> {
349        if self.can_collect_text() {
350            let decoded = text
351                .decode()
352                .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
353            let unescaped = quick_xml::escape::unescape(&decoded)
354                .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
355            self.pending_text.push_str(&unescaped);
356        }
357        Ok(())
358    }
359
360    fn read_until_event(&mut self) -> Result<()> {
361        let event = self
362            .xml
363            .read_event_into(&mut self.buf)
364            .map_err(|err| match err {
365                quick_xml::Error::Io(source) => Error::Io {
366                    source: std::io::Error::new(source.kind(), source.to_string()),
367                },
368                other => Error::Parse {
369                    message: format!("malformed document.xml: {other}"),
370                    position: None,
371                },
372            })?
373            .into_owned();
374
375        match event {
376            quick_xml::events::Event::Start(tag) => self.handle_start(tag.local_name().as_ref()),
377            quick_xml::events::Event::End(tag) => self.handle_end(tag.local_name().as_ref()),
378            quick_xml::events::Event::Empty(tag) => self.handle_empty(tag.local_name().as_ref()),
379            quick_xml::events::Event::Text(text) => {
380                self.handle_text(&text)?;
381            }
382            quick_xml::events::Event::GeneralRef(reference) => {
383                self.handle_general_ref(&reference)?;
384            }
385            quick_xml::events::Event::CData(cdata) => self.handle_cdata(cdata)?,
386            quick_xml::events::Event::Eof => self.handle_eof(),
387            quick_xml::events::Event::Comment(_)
388            | quick_xml::events::Event::Decl(_)
389            | quick_xml::events::Event::PI(_)
390            | quick_xml::events::Event::DocType(_) => {}
391        }
392
393        self.buf.clear();
394        Ok(())
395    }
396
397    fn start_paragraph(&mut self) {
398        self.queue.push_back(Event::StartParagraph {
399            alignment: None,
400            id: None,
401        });
402        self.in_paragraph = true;
403        self.in_text = false;
404        self.pending_text.clear();
405    }
406}
407
408impl EventSource for DocxReader {
409    #[inline]
410    fn next_event(&mut self) -> Result<Option<Event>> {
411        loop {
412            if let Some(event) = self.queue.pop_front() {
413                return Ok(Some(event));
414            }
415
416            match self.phase {
417                Phase::NotStarted => {
418                    self.phase = Phase::Running;
419                    self.queue.push_back(Event::StartDocument {
420                        id: None,
421                        language: None,
422                        metadata: None,
423                    });
424                }
425                Phase::Finished => return Ok(None),
426                Phase::Running => self.read_until_event()?,
427            }
428        }
429    }
430}
431
/// Reports whether `local` names an element whose entire subtree is skipped.
///
/// `local` is the element's local name (namespace prefix already removed).
/// Once such an element is opened, nothing inside it produces events until
/// its matching end tag.
///
/// Paragraph properties (`pPr`) must be skipped: they may contain
/// `<w:tabs>`, whose `<w:tab>` children define tab stops. Those children
/// share the local name `tab` with the tab character found in runs, so
/// without skipping `pPr` each tab stop would be emitted as a spurious
/// `"\t"` text event at the start of the paragraph. Run properties (`rPr`)
/// are skipped as well because run styling is out of scope for this reader.
fn is_ignored_container(local: &[u8]) -> bool {
    matches!(
        local,
        b"sdt"
            | b"hyperlink"
            | b"drawing"
            | b"pict"
            | b"object"
            | b"ins"
            | b"del"
            | b"moveFrom"
            | b"moveTo"
            | b"tblPr"
            | b"trPr"
            | b"tcPr"
            | b"tblGrid"
            | b"pPr"
            | b"rPr"
    )
}
450
451fn parse_error(message: String) -> Error {
452    Error::Parse {
453        message,
454        position: None,
455    }
456}
457
#[cfg(test)]
#[cfg(not(coverage))]
mod tests {
    use std::io::{Cursor, Write as _};

    use zip::{write::SimpleFileOptions, CompressionMethod, ZipWriter};

    use super::*;

    /// Result type shared by the fallible helpers and tests in this module.
    type TestResult<T> = core::result::Result<T, Box<dyn core::error::Error>>;

    /// Minimal `_rels/.rels` pointing at `word/document.xml`.
    const SIMPLE_RELS: &str = r#"<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>"#;

    /// Everything up to and including the opening `<w:body>` tag.
    const DOC_PROLOG: &str = r#"<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body>"#;

    /// Closing tags that match [`DOC_PROLOG`].
    const DOC_EPILOG: &str = "</w:body></w:document>";

    /// One paragraph containing the single run `hello`.
    const HELLO_PARAGRAPH: &str = "<w:p><w:r><w:t>hello</w:t></w:r></w:p>";

    /// Compile-time check that the reader can be moved to another thread.
    #[test]
    fn docx_reader_is_send_static() {
        fn assert_send_static<T: Send + 'static>() {}
        assert_send_static::<DocxReader>();
    }

    /// Builds an in-memory DOCX archive with the given relationships part and
    /// main document part, both deflated.
    fn synth_docx_for_unit_test(rels_xml: &str, document_xml: &str) -> TestResult<Vec<u8>> {
        let mut writer = ZipWriter::new(Cursor::new(Vec::new()));
        let entries = [
            ("_rels/.rels", rels_xml),
            ("word/document.xml", document_xml),
        ];
        for (name, body) in entries {
            let options =
                SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
            writer.start_file(name, options)?;
            writer.write_all(body.as_bytes())?;
        }
        let cursor = writer.finish()?;
        Ok(cursor.into_inner())
    }

    /// Wraps `document_xml` in an archive with [`SIMPLE_RELS`] and opens a
    /// reader over it.
    fn make_reader(document_xml: &str) -> TestResult<DocxReader> {
        let archive = synth_docx_for_unit_test(SIMPLE_RELS, document_xml)?;
        let reader = DocxReader::from_reader(Cursor::new(archive))?;
        Ok(reader)
    }

    /// Across a 1000-paragraph document, the event queue never holds more
    /// than three events between calls to `next_event`.
    #[test]
    fn queue_length_never_exceeds_three() -> TestResult<()> {
        let doc = format!(
            "{DOC_PROLOG}{}{DOC_EPILOG}",
            HELLO_PARAGRAPH.repeat(1000)
        );
        let mut reader = make_reader(&doc)?;
        loop {
            let queued = reader.queue.len();
            if queued > 3 {
                return Err(Box::new(Error::Other {
                    message: format!("queue grew to {queued}"),
                }));
            }
            if reader.next_event()?.is_none() {
                return Ok(());
            }
        }
    }

    /// After every event handed out, the scratch buffer is empty again.
    #[test]
    fn buf_is_cleared_per_iteration() -> TestResult<()> {
        let doc = format!("{DOC_PROLOG}{HELLO_PARAGRAPH}{DOC_EPILOG}");
        let mut reader = make_reader(&doc)?;
        loop {
            if reader.next_event()?.is_none() {
                return Ok(());
            }
            if !reader.buf.is_empty() {
                return Err(Box::new(Error::Other {
                    message: "buf not cleared after event".to_string(),
                }));
            }
        }
    }
}