Skip to main content

docspec_docx_reader/
lib.rs

1#![forbid(unsafe_code)]
2#![cfg_attr(docsrs, feature(doc_cfg))]
3//! DOCX to `DocSpec` event stream reader.
4//!
5//! This crate provides a [`DocxReader`] that implements [`EventSource`] to convert
6//! DOCX documents into the `DocSpec` event stream format. It uses `quick-xml` for
7//! streaming XML parsing and `zip` for archive extraction.
8//!
9//! # Scope
10//!
11//! **In scope**: Paragraphs (`<w:p>`), direct text (`<w:t>` inside `<w:r>`),
12//! line breaks (`<w:br>` — including `w:type="page"` and `w:type="column"`, all
13//! emitted as `LineBreak`), tabs (`<w:tab>`, emitted as a `Text` event whose
14//! content is the single character `"\t"`), tables (`<w:tbl>`, `<w:tr>`,
15//! `<w:tc>`), lists (`<w:p>` with `<w:numPr>` — ordered and unordered),
16//! hyperlinks (`<w:hyperlink>` — resolved via `word/_rels/document.xml.rels`
17//! and emitted as `StartLink`/`EndLink` events around inline content),
18//! structured document tags (`<w:sdt>` — content emitted normally;
19//! `<w:sdtPr>`/`<w:sdtEndPr>` dropped), and tracked insertions and moves
20//! (`<w:ins>`, `<w:moveTo>` — accept-changes semantics).
21//! Emits: `StartDocument`, `StartParagraph`, `StartTextStyle`, `Text`,
22//! `EndTextStyle`, `LineBreak`, `EndParagraph`, `StartTable`, `StartTableRow`,
23//! `StartTableCell`, `StartTableHeader`, `EndTableHeader`, `EndTableCell`,
24//! `EndTableRow`, `EndTable`, `StartLink`, `EndLink`, `StartOrderedListItem`,
25//! `EndOrderedListItem`, `StartUnorderedListItem`, `EndUnorderedListItem`,
26//! `EndDocument`.
27//!
28//! The elements listed under "Out of scope" are the reader's denylist — their
29//! entire subtree is silently dropped. Every other element (known or unknown)
30//! is parsed normally; the reader continues into its children.
31//!
32//! **Out of scope (subtree silently dropped)**:
33//! - Run styling not listed in the crate README
34//! - Headings (any `<w:pStyle>` value — every paragraph is `StartParagraph`)
35//! - Vertical cell merging (`<w:vMerge>`) — every cell emits with
36//!   `rowspan: None`
37//! - Header rows in nested tables — only the outermost table honors
38//!   `<w:tblHeader>`
39//! - Table-level property exceptions (`<w:tblPrEx>`) — silently ignored
40//! - Table, row, and cell visual properties (`<w:tblPr>`, `<w:trPr>` visual
41//!   fields, `<w:tcPr>` visual fields, `<w:tblGrid>`)
42//! - Drawings and images (`<w:drawing>`, `<w:pict>`)
43//! - Comments, footnotes, headers, footers
44//! - Document metadata
45//! - Tracked deletions (`<w:del>`, `<w:moveFrom>`) — accept-changes semantics
46//! - Structured document tag properties (`<w:sdtPr>`, `<w:sdtEndPr>`)
47//! - Field-code hyperlinks (`<w:fldChar>` + `<w:instrText>HYPERLINK ...`):
48//!   legacy form not currently supported; only the modern `<w:hyperlink>`
49//!   element is recognized.
50//!
51//! # Lists
52//!
53//! See the crate README for V1 list semantics and limitations.
54//!
55//! # Streaming Guarantee
56//!
57//! `DocxReader` streams `document.xml` event by event using constant memory
58//! regardless of document size. `_rels/.rels` and
59//! `word/_rels/document.xml.rels` are both fully read into memory at
60//! package-open time (typical combined size < 10 KB even for large documents).
61//! `word/document.xml` is consumed in streaming fashion via `quick-xml`. The
62//! internal event queue remains bounded regardless of document size or
63//! hyperlink count.
64//!
65//! # Quick Start
66//!
67//! ```no_run
68//! use docspec_docx_reader::{DocxReader, EventSource};
69//!
70//! let mut reader = DocxReader::from_path("document.docx")?;
71//! while let Some(event) = reader.next_event()? {
72//!     println!("{event:?}");
73//! }
74//! # Ok::<(), docspec_core::Error>(())
75//! ```
76
77extern crate alloc;
78
79mod document;
80mod numbering;
81mod package;
82mod properties;
83mod rels;
84mod styles;
85mod symbol_fonts;
86
87use std::io::{BufReader, Read, Seek};
88use std::path::Path;
89
90pub use docspec_core::EventSource;
91use docspec_core::{Error, Result};
92
93/// A streaming DOCX reader that implements [`EventSource`].
94///
95/// `DocxReader` parses a DOCX archive and emits `DocSpec` events one at a time.
96/// `<w:p>` paragraphs, `<w:t>` text, `<w:br>` line breaks, `<w:tab>` tabs, and
97/// table elements (`<w:tbl>`, `<w:tr>`, `<w:tc>`) are recognized; all other
98/// elements are silently ignored.
99///
100/// # Streaming
101///
102/// The reader streams `document.xml` event by event using constant memory.
103/// `_rels/.rels` and `word/_rels/document.xml.rels` are both fully read into
104/// memory at package-open time (typical combined size < 10 KB). The internal
105/// event queue remains bounded regardless of document size or hyperlink count.
106///
107/// # Errors
108///
109/// Returns [`Error::Io`] for I/O failures and [`Error::Parse`] for malformed
110/// archives or XML.
111#[derive(Debug)]
112pub struct DocxReader {
113    inner: document::DocumentReader,
114}
115
116impl DocxReader {
117    /// Creates a `DocxReader` from a file path.
118    ///
119    /// # Errors
120    ///
121    /// Returns [`Error::Io`] if the file cannot be opened. See [`from_reader`](Self::from_reader)
122    /// for additional error conditions.
123    #[inline]
124    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
125        let file = std::fs::File::open(path.as_ref()).map_err(Error::from)?;
126        Self::from_reader(file)
127    }
128
129    /// Creates a `DocxReader` from any `Read + Seek` source.
130    ///
131    /// The reader must be positioned at the start of a valid DOCX (ZIP) archive.
132    ///
133    /// # Errors
134    ///
135    /// Returns [`Error::Parse`] if the input is not a valid ZIP archive, if
136    /// `_rels/.rels` is missing or malformed, or if the document target entry
137    /// cannot be opened. Returns [`Error::Io`] for I/O failures.
138    #[inline]
139    pub fn from_reader<R: Read + Seek + Send + 'static>(reader: R) -> Result<Self> {
140        let (style_list, numbering, hyperlink_map, stream) = package::open_package(reader)?;
141        let xml = quick_xml::Reader::from_reader(BufReader::new(stream));
142        let data = document::DocxData {
143            style_list,
144            hyperlink_map,
145            numbering,
146        };
147        Ok(Self {
148            inner: document::DocumentReader::from_xml_reader(xml, data),
149        })
150    }
151}
152
153impl EventSource for DocxReader {
154    #[inline]
155    fn next_event(&mut self) -> Result<Option<docspec_core::Event>> {
156        self.inner.next_event()
157    }
158}
159
#[cfg(test)]
#[cfg(not(coverage))]
mod tests {
    #![allow(clippy::unwrap_used, clippy::panic)]
    use std::io::{Cursor, Write as _};

    use zip::ZipWriter;

    use super::*;

    /// Package-level relationships pointing at `word/document.xml`.
    const ROOT_RELS: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#;

    /// A document body holding a single paragraph with the text "hi".
    const DOCUMENT_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p><w:r><w:t>hi</w:t></w:r></w:p>
  </w:body>
</w:document>"#;

    /// Writes the given `(entry name, contents)` pairs, in order, into an
    /// uncompressed in-memory ZIP archive and returns its bytes.
    fn build_archive(entries: &[(&str, &str)]) -> Vec<u8> {
        let options = zip::write::SimpleFileOptions::default()
            .compression_method(zip::CompressionMethod::Stored);
        let mut writer = ZipWriter::new(Cursor::new(Vec::new()));
        for (name, contents) in entries {
            writer.start_file(*name, options).unwrap();
            writer.write_all(contents.as_bytes()).unwrap();
        }
        writer.finish().unwrap().into_inner()
    }

    /// Pulls events until the source reports the end, panicking on any error.
    fn drain_events(reader: &mut DocxReader) -> Vec<docspec_core::Event> {
        std::iter::from_fn(|| match reader.next_event() {
            Ok(next) => next,
            Err(err) => panic!("unexpected error: {err:?}"),
        })
        .collect()
    }

    /// The reader must be movable across threads and own all of its data.
    #[test]
    fn docx_reader_is_send_static() {
        fn assert_send_static<T: Send + 'static>() {}
        assert_send_static::<DocxReader>();
    }

    /// A package without style, numbering, or document relationship parts
    /// still yields the plain paragraph/text event sequence.
    #[test]
    fn docx_without_styles_emits_only_paragraphs() {
        let zip_bytes = build_archive(&[
            ("_rels/.rels", ROOT_RELS),
            ("word/document.xml", DOCUMENT_XML),
        ]);

        let mut reader = DocxReader::from_reader(Cursor::new(zip_bytes)).unwrap();
        let events = drain_events(&mut reader);

        let expected = vec![
            docspec_core::Event::StartDocument {
                id: None,
                language: None,
                metadata: None,
            },
            docspec_core::Event::StartParagraph {
                alignment: None,
                id: None,
            },
            docspec_core::Event::Text {
                content: "hi".to_string(),
            },
            docspec_core::Event::EndParagraph,
            docspec_core::Event::EndDocument,
        ];
        assert_eq!(events, expected);
    }
}
228}