Skip to main content

docspec_docx_reader/
lib.rs

1#![forbid(unsafe_code)]
2#![cfg_attr(docsrs, feature(doc_cfg))]
3//! DOCX to `DocSpec` event stream reader.
4//!
5//! This crate provides a [`DocxReader`] that implements [`EventSource`] to convert
6//! DOCX documents into the `DocSpec` event stream format. It uses `quick-xml` for
7//! streaming XML parsing and `zip` for archive extraction.
8//!
9//! # Scope
10//!
11//! **In scope**: Paragraphs (`<w:p>`), direct text (`<w:t>` inside `<w:r>`),
12//! line breaks (`<w:br>` — including `w:type="page"` and `w:type="column"`, all
13//! emitted as `LineBreak`), tabs (`<w:tab>`, emitted as a `Text` event whose
14//! content is the single character `"\t"`), tables (`<w:tbl>`, `<w:tr>`,
15//! `<w:tc>`), lists (`<w:p>` with `<w:numPr>` — ordered and unordered),
16//! hyperlinks (`<w:hyperlink>` — resolved via `word/_rels/document.xml.rels`
17//! and emitted as `StartLink`/`EndLink` events around inline content),
18//! structured document tags (`<w:sdt>` — content emitted normally;
19//! `<w:sdtPr>`/`<w:sdtEndPr>` dropped), tracked insertions and moves
20//! (`<w:ins>`, `<w:moveTo>` — accept-changes semantics), and `DrawingML` images
21//! (`<w:drawing>` — emitted as `Image` events; see the crate README for
22//! `ImageSource` variants and `DocxAssetProvider` usage).
23//! Emits: `StartDocument`, `StartParagraph`, `StartTextStyle`, `Text`,
24//! `EndTextStyle`, `LineBreak`, `EndParagraph`, `StartTable`, `StartTableRow`,
25//! `StartTableCell`, `StartTableHeader`, `EndTableHeader`, `EndTableCell`,
26//! `EndTableRow`, `EndTable`, `StartLink`, `EndLink`, `StartOrderedListItem`,
27//! `EndOrderedListItem`, `StartUnorderedListItem`, `EndUnorderedListItem`,
28//! `Image`, `EndDocument`.
29//!
30//! The elements listed under "Out of scope" are the reader's denylist — their
31//! entire subtree is silently dropped. Every other element (known or unknown)
32//! is parsed normally; the reader continues into its children.
33//!
34//! **Out of scope (subtree silently dropped)**:
35//! - Run styling not listed in the crate README
36//! - Headings (any `<w:pStyle>` value — every paragraph is `StartParagraph`)
37//! - Vertical cell merging (`<w:vMerge>`) — every cell emits with
38//!   `rowspan: None`
39//! - Header rows in nested tables — only the outermost table honors
40//!   `<w:tblHeader>`
41//! - Table-level property exceptions (`<w:tblPrEx>`) — silently ignored
42//! - Table, row, and cell visual properties (`<w:tblPr>`, `<w:trPr>` visual
43//!   fields, `<w:tcPr>` visual fields, `<w:tblGrid>`)
44//! - VML images (`<w:pict>`) — deferred to follow-up; subtree silently dropped
45//! - Comments, footnotes, headers, footers
46//! - Document metadata
47//! - Tracked deletions (`<w:del>`, `<w:moveFrom>`) — accept-changes semantics
48//! - Structured document tag properties (`<w:sdtPr>`, `<w:sdtEndPr>`)
49//! - Field-code hyperlinks (`<w:fldChar>` + `<w:instrText>HYPERLINK ...`):
50//!   legacy form not currently supported; only the modern `<w:hyperlink>`
51//!   element is recognized.
52//!
53//! # Lists
54//!
55//! See the crate README for V1 list semantics and limitations.
56//!
57//! # Streaming Guarantee
58//!
59//! `DocxReader` streams `document.xml` event by event using constant memory
60//! regardless of document size. `_rels/.rels` and
61//! `word/_rels/document.xml.rels` are both fully read into memory at
62//! package-open time (typical combined size < 10 KB even for large documents).
63//! `word/document.xml` is consumed in streaming fashion via `quick-xml`. The
64//! internal event queue remains bounded regardless of document size or
65//! hyperlink count.
66//!
67//! # Quick Start
68//!
69//! ```no_run
70//! use docspec_docx_reader::{DocxReader, EventSource};
71//!
72//! let mut reader = DocxReader::from_path("document.docx")?;
73//! while let Some(event) = reader.next_event()? {
74//!     println!("{event:?}");
75//! }
76//! # Ok::<(), docspec_core::Error>(())
77//! ```
78
79extern crate alloc;
80
81mod asset_provider;
82mod content_types;
83mod document;
84mod numbering;
85mod package;
86mod properties;
87mod rels;
88mod styles;
89mod symbol_fonts;
90
91use std::io::{BufReader, Read, Seek};
92use std::path::Path;
93
94pub use docspec_core::EventSource;
95
96/// Provides streaming access to binary assets stored inside a DOCX ZIP archive.
97///
98/// Use this alongside [`DocxReader`] when you need to resolve embedded images.
99/// [`DocxReader`] emits `Event::Image` with an `asset_id` of the form
100/// `zip://word/media/image1.png`; pass that `asset_id` to
101/// [`DocxAssetProvider`] to stream the raw bytes.
102///
103/// # Example
104///
105/// ```no_run
106/// use docspec_docx_reader::DocxAssetProvider;
107/// use docspec_core::AssetProvider;
108///
109/// let provider = DocxAssetProvider::from_path("document.docx")?;
110/// let mut buf = Vec::new();
111/// if let Some(result) = provider.stream_to("zip://word/media/image1.png", &mut buf) {
112///     result?;
113/// }
114/// # Ok::<(), docspec_core::Error>(())
115/// ```
116pub use asset_provider::DocxAssetProvider;
117use docspec_core::{Error, Result};
118
119const _: for<'a> fn(&'a content_types::ContentTypes, &str) -> Option<&'a str> =
120    content_types::ContentTypes::lookup;
121fn _image_rel_fields(r: &rels::ImageRel) -> (&str, bool) {
122    (&r.target, r.is_external)
123}
124const _: for<'a> fn(&'a rels::ImageRel) -> (&'a str, bool) = _image_rel_fields;
125fn _use_docx_asset_provider_from_path(p: &Path) -> Result<asset_provider::DocxAssetProvider> {
126    asset_provider::DocxAssetProvider::from_path(p)
127}
128const _: fn(&Path) -> Result<asset_provider::DocxAssetProvider> =
129    _use_docx_asset_provider_from_path;
130fn _use_docx_asset_provider_from_reader(
131    r: std::io::Cursor<Vec<u8>>,
132) -> Result<asset_provider::DocxAssetProvider> {
133    asset_provider::DocxAssetProvider::from_reader(r)
134}
135const _: fn(std::io::Cursor<Vec<u8>>) -> Result<asset_provider::DocxAssetProvider> =
136    _use_docx_asset_provider_from_reader;
137
138/// A streaming DOCX reader that implements [`EventSource`].
139///
140/// `DocxReader` parses a DOCX archive and emits `DocSpec` events one at a time.
141/// `<w:p>` paragraphs, `<w:t>` text, `<w:br>` line breaks, `<w:tab>` tabs,
142/// table elements (`<w:tbl>`, `<w:tr>`, `<w:tc>`), and `DrawingML` images
143/// (`<w:drawing>`) are recognized; all other elements are silently ignored.
144///
145/// # Streaming
146///
147/// The reader streams `document.xml` event by event using constant memory.
148/// `_rels/.rels` and `word/_rels/document.xml.rels` are both fully read into
149/// memory at package-open time (typical combined size < 10 KB). The internal
150/// event queue remains bounded regardless of document size or hyperlink count.
151///
152/// # Errors
153///
154/// Returns [`Error::Io`] for I/O failures and [`Error::Parse`] for malformed
155/// archives or XML.
156#[derive(Debug)]
157pub struct DocxReader {
158    inner: document::DocumentReader,
159}
160
161impl DocxReader {
162    /// Creates a `DocxReader` from a file path.
163    ///
164    /// # Errors
165    ///
166    /// Returns [`Error::Io`] if the file cannot be opened. See [`from_reader`](Self::from_reader)
167    /// for additional error conditions.
168    #[inline]
169    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
170        let file = std::fs::File::open(path.as_ref()).map_err(Error::from)?;
171        Self::from_reader(file)
172    }
173
174    /// Creates a `DocxReader` from any `Read + Seek` source.
175    ///
176    /// The reader must be positioned at the start of a valid DOCX (ZIP) archive.
177    ///
178    /// # Errors
179    ///
180    /// Returns [`Error::Parse`] if the input is not a valid ZIP archive, if
181    /// `_rels/.rels` is missing or malformed, or if the document target entry
182    /// cannot be opened. Returns [`Error::Io`] for I/O failures.
183    #[inline]
184    pub fn from_reader<R: Read + Seek + Send + 'static>(reader: R) -> Result<Self> {
185        let (style_list, numbering, hyperlink_map, image_map, _content_types, stream) =
186            package::open_package(reader)?;
187        let xml = quick_xml::Reader::from_reader(BufReader::new(stream));
188        let data = document::DocxData {
189            style_list,
190            hyperlink_map,
191            numbering,
192            image_map,
193        };
194        Ok(Self {
195            inner: document::DocumentReader::from_xml_reader(xml, data),
196        })
197    }
198}
199
200impl EventSource for DocxReader {
201    #[inline]
202    fn next_event(&mut self) -> Result<Option<docspec_core::Event>> {
203        self.inner.next_event()
204    }
205}
206
#[cfg(test)]
#[cfg(not(coverage))]
mod tests {
    #![allow(clippy::unwrap_used, clippy::panic)]
    use super::*;

    /// Builds an in-memory ZIP archive containing `entries` (name, contents),
    /// written in order and stored without compression.
    fn stored_zip(entries: &[(&str, &str)]) -> Vec<u8> {
        use std::io::{Cursor, Write as _};

        let mut archive = zip::ZipWriter::new(Cursor::new(Vec::new()));
        let stored = zip::write::SimpleFileOptions::default()
            .compression_method(zip::CompressionMethod::Stored);
        for &(name, contents) in entries {
            archive.start_file(name, stored).unwrap();
            archive.write_all(contents.as_bytes()).unwrap();
        }
        archive.finish().unwrap().into_inner()
    }

    /// `DocxReader` must be movable across threads and own all of its data.
    #[test]
    fn docx_reader_is_send_static() {
        fn require_send_static<T: Send + 'static>() {}
        require_send_static::<DocxReader>();
    }

    /// A package with only `_rels/.rels` and `word/document.xml` (no styles part)
    /// yields a bare document / paragraph / text event sequence.
    #[test]
    fn docx_without_styles_emits_only_paragraphs() {
        use docspec_core::Event;

        const ROOT_RELS: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#;
        const DOCUMENT_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p><w:r><w:t>hi</w:t></w:r></w:p>
  </w:body>
</w:document>"#;

        let zip_bytes = stored_zip(&[
            ("_rels/.rels", ROOT_RELS),
            ("word/document.xml", DOCUMENT_XML),
        ]);
        let mut reader = DocxReader::from_reader(std::io::Cursor::new(zip_bytes)).unwrap();

        // Pull events until the reader reports the end of the stream; any error
        // aborts the test.
        let events: Vec<Event> = std::iter::from_fn(|| {
            reader
                .next_event()
                .unwrap_or_else(|err| panic!("unexpected error: {err:?}"))
        })
        .collect();

        let expected = vec![
            Event::StartDocument {
                id: None,
                language: None,
                metadata: None,
            },
            Event::StartParagraph {
                alignment: None,
                id: None,
            },
            Event::Text {
                content: String::from("hi"),
            },
            Event::EndParagraph,
            Event::EndDocument,
        ];
        assert_eq!(events, expected);
    }
}