docspec_docx_reader/lib.rs
1#![forbid(unsafe_code)]
2#![cfg_attr(docsrs, feature(doc_cfg))]
3//! DOCX to `DocSpec` event stream reader.
4//!
5//! This crate provides a [`DocxReader`] that implements [`EventSource`] to convert
6//! DOCX documents into the `DocSpec` event stream format. It uses `quick-xml` for
7//! streaming XML parsing and `zip` for archive extraction.
8//!
9//! # Scope
10//!
11//! **In scope**: Paragraphs (`<w:p>`), direct text (`<w:t>` inside `<w:r>`),
12//! line breaks (`<w:br>` — including `w:type="page"` and `w:type="column"`, all
13//! emitted as `LineBreak`), tabs (`<w:tab>`, emitted as a `Text` event whose
14//! content is the single character `"\t"`), tables (`<w:tbl>`, `<w:tr>`,
15//! `<w:tc>`), hyperlinks (`<w:hyperlink>` — link text content is emitted as
16//! plain runs), structured document tags (`<w:sdt>` — content emitted normally;
17//! `<w:sdtPr>`/`<w:sdtEndPr>` dropped), and tracked insertions and moves
18//! (`<w:ins>`, `<w:moveTo>` — accept-changes semantics).
19//! Emits: `StartDocument`, `StartParagraph`, `StartTextStyle`, `Text`,
20//! `EndTextStyle`, `LineBreak`, `EndParagraph`, `StartTable`, `StartTableRow`,
21//! `StartTableCell`, `StartTableHeader`, `EndTableHeader`, `EndTableCell`,
22//! `EndTableRow`, `EndTable`, `EndDocument`.
23//!
24//! The elements listed under "Out of scope" are the reader's denylist — their
25//! entire subtree is silently dropped. Every other element (known or unknown)
26//! is parsed normally; the reader continues into its children.
27//!
28//! **Out of scope (subtree silently dropped)**:
29//! - Run styling not listed in the crate README
30//! - Headings (any `<w:pStyle>` value — every paragraph is `StartParagraph`)
31//! - Vertical cell merging (`<w:vMerge>`) — every cell emits with
32//! `rowspan: None`
33//! - Header rows in nested tables — only the outermost table honors
34//! `<w:tblHeader>`
35//! - Table-level property exceptions (`<w:tblPrEx>`) — silently ignored
36//! - Table, row, and cell visual properties (`<w:tblPr>`, `<w:trPr>` visual
37//! fields, `<w:tcPr>` visual fields, `<w:tblGrid>`)
38//! - Lists
39//! - Drawings and images (`<w:drawing>`, `<w:pict>`)
40//! - Comments, footnotes, headers, footers
41//! - Document metadata
42//! - Tracked deletions (`<w:del>`, `<w:moveFrom>`) — accept-changes semantics
43//! - Structured document tag properties (`<w:sdtPr>`, `<w:sdtEndPr>`)
44//!
45//! # Streaming Guarantee
46//!
//! `DocxReader` streams `document.xml` event by event using constant memory
//! regardless of document size. Before streaming starts, `_rels/.rels` (a few
//! hundred bytes) is fully read into memory to discover the document target
//! path, and a style list is loaded while the package is opened.
50//!
51//! # Quick Start
52//!
53//! ```no_run
54//! use docspec_docx_reader::{DocxReader, EventSource};
55//!
56//! let mut reader = DocxReader::from_path("document.docx")?;
57//! while let Some(event) = reader.next_event()? {
58//! println!("{event:?}");
59//! }
60//! # Ok::<(), docspec_core::Error>(())
61//! ```
62
63extern crate alloc;
64
65mod document;
66mod package;
67mod properties;
68mod rels;
69mod styles;
70mod symbol_fonts;
71
72use std::io::{BufReader, Read, Seek};
73use std::path::Path;
74
75pub use docspec_core::EventSource;
76use docspec_core::{Error, Result};
77
78/// A streaming DOCX reader that implements [`EventSource`].
79///
80/// `DocxReader` parses a DOCX archive and emits `DocSpec` events one at a time.
81/// `<w:p>` paragraphs, `<w:t>` text, `<w:br>` line breaks, `<w:tab>` tabs, and
82/// table elements (`<w:tbl>`, `<w:tr>`, `<w:tc>`) are recognized; all other
83/// elements are silently ignored.
84///
85/// # Streaming
86///
87/// The reader streams `document.xml` event by event using constant memory.
88/// Only `_rels/.rels` (a few hundred bytes) is buffered to discover the
89/// document target path.
90///
91/// # Errors
92///
93/// Returns [`Error::Io`] for I/O failures and [`Error::Parse`] for malformed
94/// archives or XML.
95#[derive(Debug)]
96pub struct DocxReader {
97 inner: document::DocumentReader,
98}
99
100impl DocxReader {
101 /// Creates a `DocxReader` from a file path.
102 ///
103 /// # Errors
104 ///
105 /// Returns [`Error::Io`] if the file cannot be opened. See [`from_reader`](Self::from_reader)
106 /// for additional error conditions.
107 #[inline]
108 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
109 let file = std::fs::File::open(path.as_ref()).map_err(Error::from)?;
110 Self::from_reader(file)
111 }
112
113 /// Creates a `DocxReader` from any `Read + Seek` source.
114 ///
115 /// The reader must be positioned at the start of a valid DOCX (ZIP) archive.
116 ///
117 /// # Errors
118 ///
119 /// Returns [`Error::Parse`] if the input is not a valid ZIP archive, if
120 /// `_rels/.rels` is missing or malformed, or if the document target entry
121 /// cannot be opened. Returns [`Error::Io`] for I/O failures.
122 #[inline]
123 pub fn from_reader<R: Read + Seek + Send + 'static>(reader: R) -> Result<Self> {
124 let (style_list, stream) = package::open_package(reader)?;
125 let xml = quick_xml::Reader::from_reader(BufReader::new(stream));
126 let data = document::DocxData { style_list };
127 Ok(Self {
128 inner: document::DocumentReader::from_xml_reader(xml, data),
129 })
130 }
131}
132
133impl EventSource for DocxReader {
134 #[inline]
135 fn next_event(&mut self) -> Result<Option<docspec_core::Event>> {
136 self.inner.next_event()
137 }
138}
139
#[cfg(test)]
#[cfg(not(coverage))]
mod tests {
    #![allow(clippy::unwrap_used, clippy::panic)]
    use super::*;

    /// Compile-time check: `DocxReader` must stay `Send + 'static`.
    #[test]
    fn docx_reader_is_send_static() {
        fn assert_send_static<T: Send + 'static>() {}
        assert_send_static::<DocxReader>();
    }

    /// A package containing only `_rels/.rels` and the document part (no
    /// styles part) yields a bare paragraph with its text.
    #[test]
    fn docx_without_styles_emits_only_paragraphs() {
        use docspec_core::Event;
        use std::io::{Cursor, Write as _};
        use zip::ZipWriter;

        let root_rels = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
 <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#;
        let document_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
 <w:body>
 <w:p><w:r><w:t>hi</w:t></w:r></w:p>
 </w:body>
</w:document>"#;

        // Assemble the minimal archive in memory, entries stored uncompressed.
        let options = zip::write::SimpleFileOptions::default()
            .compression_method(zip::CompressionMethod::Stored);
        let mut writer = ZipWriter::new(Cursor::new(Vec::new()));
        for (entry, body) in [
            ("_rels/.rels", root_rels),
            ("word/document.xml", document_xml),
        ] {
            writer.start_file(entry, options).unwrap();
            writer.write_all(body.as_bytes()).unwrap();
        }
        let zip_bytes = writer.finish().unwrap().into_inner();

        // Drain the reader; any error fails the test immediately.
        let mut reader = DocxReader::from_reader(Cursor::new(zip_bytes)).unwrap();
        let events: Vec<Event> = std::iter::from_fn(|| match reader.next_event() {
            Ok(next) => next,
            Err(err) => panic!("unexpected error: {err:?}"),
        })
        .collect();

        let expected = vec![
            Event::StartDocument {
                id: None,
                language: None,
                metadata: None,
            },
            Event::StartParagraph {
                alignment: None,
                id: None,
            },
            Event::Text {
                content: String::from("hi"),
            },
            Event::EndParagraph,
            Event::EndDocument,
        ];
        assert_eq!(events, expected);
    }
}
208}