1#![forbid(unsafe_code)]
2#![cfg_attr(docsrs, feature(doc_cfg))]
3extern crate alloc;
47
48mod rels;
49
50use alloc::collections::VecDeque;
51use core::fmt;
52use std::io::{BufReader, Read, Seek};
53use std::path::Path;
54
55pub use docspec_core::EventSource;
56use docspec_core::{Error, Event, Result, TextStyle};
57use quick_xml::events::{BytesCData, BytesRef, BytesText};
58
59#[derive(Clone, Copy, PartialEq, Eq)]
61enum Phase {
62 Finished,
64 NotStarted,
66 Running,
68}
69
70pub struct DocxReader {
87 buf: Vec<u8>,
89 in_ignored_subtree: u32,
92 in_paragraph: bool,
94 in_text: bool,
96 pending_text: String,
98 phase: Phase,
100 queue: VecDeque<Event>,
102 xml: quick_xml::Reader<BufReader<Box<dyn Read>>>,
104}
105
106impl fmt::Debug for DocxReader {
107 #[inline]
108 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
109 f.debug_struct("DocxReader")
110 .field("buf", &self.buf)
111 .field("in_ignored_subtree", &self.in_ignored_subtree)
112 .field("in_paragraph", &self.in_paragraph)
113 .field("in_text", &self.in_text)
114 .field("pending_text", &self.pending_text)
115 .field("phase", &"<phase>")
116 .field("queue", &self.queue)
117 .field("xml", &"<quick_xml::Reader>")
118 .finish()
119 }
120}
121
122impl DocxReader {
123 #[inline]
130 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
131 let file = std::fs::File::open(path.as_ref()).map_err(Error::from)?;
132 Self::from_reader(file)
133 }
134
135 #[inline]
145 pub fn from_reader<R: Read + Seek + 'static>(mut reader: R) -> Result<Self> {
146 let mut archive = zip::ZipArchive::new(&mut reader).map_err(|err| match err {
147 zip::result::ZipError::InvalidArchive(_)
148 | zip::result::ZipError::UnsupportedArchive(_) => Error::Parse {
149 message: "not a valid ZIP archive".to_string(),
150 position: None,
151 },
152 zip::result::ZipError::Io(source) => Error::Io { source },
153 zip::result::ZipError::FileNotFound
154 | zip::result::ZipError::InvalidPassword
155 | zip::result::ZipError::CompressionMethodNotSupported(_)
156 | _ => parse_error(format!("not a valid ZIP archive: {err}")),
157 })?;
158
159 let document_path = rels::find_document_path(&mut archive)?;
160
161 let (data_start, compressed_size, method) = {
162 let entry = archive
163 .by_name(&document_path)
164 .map_err(|_err| Error::Parse {
165 message: format!("document target not found: {document_path}"),
166 position: None,
167 })?;
168 let data_start = entry
169 .data_start()
170 .ok_or_else(|| parse_error("document.xml has no data offset".to_string()))?;
171 (data_start, entry.compressed_size(), entry.compression())
172 };
173 drop(archive);
174
175 reader
176 .seek(std::io::SeekFrom::Start(data_start))
177 .map_err(Error::from)?;
178
179 let limited = reader.take(compressed_size);
180
181 let stream: Box<dyn Read> = if method == zip::CompressionMethod::Stored {
182 Box::new(limited)
183 } else if method == zip::CompressionMethod::Deflated {
184 Box::new(flate2::read::DeflateDecoder::new(limited))
185 } else {
186 return Err(Error::Parse {
187 message: format!("unsupported compression: {method:?}"),
188 position: None,
189 });
190 };
191
192 let xml = quick_xml::Reader::from_reader(BufReader::new(stream));
193
194 Ok(Self {
195 buf: Vec::with_capacity(4096),
196 in_ignored_subtree: 0,
197 in_paragraph: false,
198 in_text: false,
199 pending_text: String::new(),
200 phase: Phase::NotStarted,
201 queue: VecDeque::new(),
202 xml,
203 })
204 }
205}
206
207impl DocxReader {
208 fn can_collect_text(&self) -> bool {
209 self.in_ignored_subtree == 0 && self.in_paragraph && self.in_text
210 }
211
212 fn end_paragraph(&mut self) {
213 self.queue.push_back(Event::EndParagraph);
214 self.in_paragraph = false;
215 self.in_text = false;
216 self.pending_text.clear();
217 }
218
219 fn flush_pending_text(&mut self) {
220 if !self.pending_text.is_empty() {
221 self.queue.push_back(Event::Text {
222 content: core::mem::take(&mut self.pending_text),
223 style: TextStyle::default(),
224 });
225 }
226 }
227
228 fn handle_cdata(&mut self, cdata: BytesCData<'_>) -> Result<()> {
229 if self.can_collect_text() {
230 let bytes = cdata.into_inner();
231 let content = core::str::from_utf8(&bytes)
232 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
233 self.pending_text.push_str(content);
234 }
235 Ok(())
236 }
237
238 fn handle_empty(&mut self, local: &[u8]) {
239 match local {
240 value if self.in_ignored_subtree > 0 || is_ignored_container(value) => {}
241 b"p" if !self.in_paragraph => {
242 self.queue.push_back(Event::StartParagraph {
243 alignment: None,
244 id: None,
245 });
246 self.queue.push_back(Event::EndParagraph);
247 }
248 _ => {}
249 }
250 }
251
252 fn handle_end(&mut self, local: &[u8]) {
253 if self.in_ignored_subtree > 0 {
254 self.in_ignored_subtree = self.in_ignored_subtree.saturating_sub(1);
255 return;
256 }
257
258 match local {
259 b"p" if self.in_paragraph => self.end_paragraph(),
260 b"t" if self.in_text => {
261 self.flush_pending_text();
262 self.in_text = false;
263 }
264 _ => {}
265 }
266 }
267
268 fn handle_eof(&mut self) {
269 if self.in_text {
270 self.flush_pending_text();
271 }
272 if self.in_paragraph {
273 self.end_paragraph();
274 }
275 self.queue.push_back(Event::EndDocument);
276 self.phase = Phase::Finished;
277 }
278
279 fn handle_general_ref(&mut self, reference: &BytesRef<'_>) -> Result<()> {
280 if self.can_collect_text() {
281 let decoded = reference
282 .decode()
283 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
284 let escaped = format!("&{decoded};");
285 let unescaped = quick_xml::escape::unescape(&escaped)
286 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
287 self.pending_text.push_str(&unescaped);
288 }
289 Ok(())
290 }
291
292 fn handle_start(&mut self, local: &[u8]) {
293 if self.in_ignored_subtree > 0 {
294 self.in_ignored_subtree = self.in_ignored_subtree.saturating_add(1);
295 return;
296 }
297
298 match local {
299 value if is_ignored_container(value) => self.in_ignored_subtree = 1,
300 b"p" if !self.in_paragraph => self.start_paragraph(),
301 b"t" if self.in_paragraph => {
302 self.in_text = true;
303 self.pending_text.clear();
304 }
305 _ => {}
306 }
307 }
308
309 fn handle_text(&mut self, text: &BytesText<'_>) -> Result<()> {
310 if self.can_collect_text() {
311 let decoded = text
312 .decode()
313 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
314 let unescaped = quick_xml::escape::unescape(&decoded)
315 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
316 self.pending_text.push_str(&unescaped);
317 }
318 Ok(())
319 }
320
321 fn read_until_event(&mut self) -> Result<()> {
322 let event = self
323 .xml
324 .read_event_into(&mut self.buf)
325 .map_err(|err| match err {
326 quick_xml::Error::Io(source) => Error::Io {
327 source: std::io::Error::new(source.kind(), source.to_string()),
328 },
329 other => Error::Parse {
330 message: format!("malformed document.xml: {other}"),
331 position: None,
332 },
333 })?
334 .into_owned();
335
336 match event {
337 quick_xml::events::Event::Start(tag) => self.handle_start(tag.local_name().as_ref()),
338 quick_xml::events::Event::End(tag) => self.handle_end(tag.local_name().as_ref()),
339 quick_xml::events::Event::Empty(tag) => self.handle_empty(tag.local_name().as_ref()),
340 quick_xml::events::Event::Text(text) => {
341 self.handle_text(&text)?;
342 }
343 quick_xml::events::Event::GeneralRef(reference) => {
344 self.handle_general_ref(&reference)?;
345 }
346 quick_xml::events::Event::CData(cdata) => self.handle_cdata(cdata)?,
347 quick_xml::events::Event::Eof => self.handle_eof(),
348 quick_xml::events::Event::Comment(_)
349 | quick_xml::events::Event::Decl(_)
350 | quick_xml::events::Event::PI(_)
351 | quick_xml::events::Event::DocType(_) => {}
352 }
353
354 self.buf.clear();
355 Ok(())
356 }
357
358 fn start_paragraph(&mut self) {
359 self.queue.push_back(Event::StartParagraph {
360 alignment: None,
361 id: None,
362 });
363 self.in_paragraph = true;
364 self.in_text = false;
365 self.pending_text.clear();
366 }
367}
368
369impl EventSource for DocxReader {
370 #[inline]
371 fn next_event(&mut self) -> Result<Option<Event>> {
372 loop {
373 if let Some(event) = self.queue.pop_front() {
374 return Ok(Some(event));
375 }
376
377 match self.phase {
378 Phase::NotStarted => {
379 self.phase = Phase::Running;
380 self.queue.push_back(Event::StartDocument {
381 id: None,
382 language: None,
383 metadata: None,
384 });
385 }
386 Phase::Finished => return Ok(None),
387 Phase::Running => self.read_until_event()?,
388 }
389 }
390 }
391}
392
/// Reports whether `local` is the local name of an element whose entire
/// subtree the reader skips.
///
/// The comparison is exact and case-sensitive.
fn is_ignored_container(local: &[u8]) -> bool {
    const IGNORED_CONTAINERS: [&[u8]; 12] = [
        b"tbl",
        b"tr",
        b"tc",
        b"sdt",
        b"hyperlink",
        b"drawing",
        b"pict",
        b"object",
        b"ins",
        b"del",
        b"moveFrom",
        b"moveTo",
    ];

    IGNORED_CONTAINERS.iter().any(|name| *name == local)
}
410
411fn parse_error(message: String) -> Error {
412 Error::Parse {
413 message,
414 position: None,
415 }
416}
417
418#[cfg(test)]
419#[cfg(not(coverage))]
420mod tests {
421 use std::io::{Cursor, Write as _};
422
423 use zip::{write::SimpleFileOptions, CompressionMethod, ZipWriter};
424
425 use super::*;
426
427 const SIMPLE_RELS: &str = r#"<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>"#;
428
429 fn synth_docx_for_unit_test(
430 rels_xml: &str,
431 document_xml: &str,
432 ) -> core::result::Result<Vec<u8>, Box<dyn core::error::Error>> {
433 let buf = Cursor::new(Vec::new());
434 let mut writer = ZipWriter::new(buf);
435 let rels_options =
436 SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
437 writer.start_file("_rels/.rels", rels_options)?;
438 writer.write_all(rels_xml.as_bytes())?;
439 let document_options =
440 SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
441 writer.start_file("word/document.xml", document_options)?;
442 writer.write_all(document_xml.as_bytes())?;
443 Ok(writer.finish()?.into_inner())
444 }
445
446 fn make_reader(
447 document_xml: &str,
448 ) -> core::result::Result<DocxReader, Box<dyn core::error::Error>> {
449 let bytes = synth_docx_for_unit_test(SIMPLE_RELS, document_xml)?;
450 Ok(DocxReader::from_reader(Cursor::new(bytes))?)
451 }
452
453 #[test]
454 fn queue_length_never_exceeds_three() -> core::result::Result<(), Box<dyn core::error::Error>> {
455 let doc = {
456 let mut content = String::from(
457 r#"<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body>"#,
458 );
459 for _ in 0..1000 {
460 content.push_str("<w:p><w:r><w:t>hello</w:t></w:r></w:p>");
461 }
462 content.push_str("</w:body></w:document>");
463 content
464 };
465 let mut reader = make_reader(&doc)?;
466 loop {
467 if reader.queue.len() > 3 {
468 return Err(Box::new(Error::Other {
469 message: format!("queue grew to {}", reader.queue.len()),
470 }));
471 }
472 if reader.next_event()?.is_none() {
473 break;
474 }
475 }
476 Ok(())
477 }
478
479 #[test]
480 fn buf_is_cleared_per_iteration() -> core::result::Result<(), Box<dyn core::error::Error>> {
481 let doc = r#"<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>hello</w:t></w:r></w:p></w:body></w:document>"#;
482 let mut reader = make_reader(doc)?;
483 while reader.next_event()?.is_some() {
484 if !reader.buf.is_empty() {
485 return Err(Box::new(Error::Other {
486 message: "buf not cleared after event".to_string(),
487 }));
488 }
489 }
490 Ok(())
491 }
492}