1#![forbid(unsafe_code)]
2#![cfg_attr(docsrs, feature(doc_cfg))]
3extern crate alloc;
57
58mod properties;
59mod rels;
60
61use alloc::collections::VecDeque;
62use core::fmt;
63use std::io::{BufReader, Read, Seek};
64use std::path::Path;
65
66pub use docspec_core::EventSource;
67use docspec_core::{Error, Event, Result, TextAlignment, TextStyle};
68use quick_xml::events::{BytesCData, BytesRef, BytesStart, BytesText};
69
/// Lifecycle of the event stream produced by a `DocxReader`.
#[derive(Clone, Copy, PartialEq, Eq)]
enum Phase {
    /// Nothing has been produced yet; the next pull queues `StartDocument`.
    NotStarted,
    /// XML events are being pulled from the document part.
    Running,
    /// End of input was reached and `EndDocument` was queued; once the queue
    /// drains the source reports no further events.
    Finished,
}
80
/// Streaming reader that turns the main document part of a DOCX package into
/// a sequence of `Event`s.
///
/// The reader pulls one XML event at a time, updates the parser state below
/// and pushes the resulting document events onto `queue`.
#[expect(
    clippy::struct_excessive_bools,
    reason = "DocxReader tracks six independent boolean parser states; grouping them would obscure the streaming state machine"
)]
pub struct DocxReader {
    /// Scratch buffer handed to the XML reader; cleared after every event.
    buf: Vec<u8>,
    /// Nesting depth inside an ignored subtree; `0` means nothing is being
    /// skipped.
    in_ignored_subtree: u32,
    /// `true` between the start and end of a paragraph element.
    in_paragraph: bool,
    /// `true` while inside a text (`t`) element of the current paragraph.
    in_text: bool,
    /// `true` while inside the paragraph-properties (`pPr`) element.
    in_ppr: bool,
    /// Alignment read from `jc`, used when `StartParagraph` is emitted.
    pending_paragraph_alignment: Option<TextAlignment>,
    /// Whether `StartParagraph` was already queued for the current paragraph.
    paragraph_started_emitted: bool,
    /// `true` while inside a run-properties (`rPr`) element.
    in_rpr: bool,
    /// Run style being accumulated while inside `rPr`.
    pending_run_style: TextStyle,
    /// Text collected for the current text element, not yet emitted.
    pending_text: String,
    /// Style attached to `Text` events; committed when `rPr` closes and reset
    /// when the run closes.
    current_run_style: TextStyle,
    /// Position in the document lifecycle.
    phase: Phase,
    /// Events that are ready to be handed out by `next_event`.
    queue: VecDeque<Event>,
    /// Whether the current run already produced content; a later `rPr` in the
    /// same run is then skipped.
    run_content_emitted: bool,
    /// XML pull parser over the (possibly decompressed) document part.
    xml: quick_xml::Reader<BufReader<Box<dyn Read + Send>>>,
}
136
137impl fmt::Debug for DocxReader {
138 #[inline]
139 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
140 f.debug_struct("DocxReader")
141 .field("buf", &self.buf)
142 .field("in_ignored_subtree", &self.in_ignored_subtree)
143 .field("in_paragraph", &self.in_paragraph)
144 .field("in_text", &self.in_text)
145 .field("in_ppr", &self.in_ppr)
146 .field(
147 "pending_paragraph_alignment",
148 &self.pending_paragraph_alignment,
149 )
150 .field("paragraph_started_emitted", &self.paragraph_started_emitted)
151 .field("in_rpr", &self.in_rpr)
152 .field("pending_run_style", &self.pending_run_style)
153 .field("pending_text", &self.pending_text)
154 .field("current_run_style", &self.current_run_style)
155 .field("phase", &"<phase>")
156 .field("queue", &self.queue)
157 .field("run_content_emitted", &self.run_content_emitted)
158 .field("xml", &"<quick_xml::Reader>")
159 .finish()
160 }
161}
162
163impl DocxReader {
164 #[inline]
171 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
172 let file = std::fs::File::open(path.as_ref()).map_err(Error::from)?;
173 Self::from_reader(file)
174 }
175
176 #[inline]
186 pub fn from_reader<R: Read + Seek + Send + 'static>(mut reader: R) -> Result<Self> {
187 let mut archive = zip::ZipArchive::new(&mut reader).map_err(|err| match err {
188 zip::result::ZipError::InvalidArchive(_)
189 | zip::result::ZipError::UnsupportedArchive(_) => Error::Parse {
190 message: "not a valid ZIP archive".to_string(),
191 position: None,
192 },
193 zip::result::ZipError::Io(source) => Error::Io { source },
194 zip::result::ZipError::FileNotFound
195 | zip::result::ZipError::InvalidPassword
196 | zip::result::ZipError::CompressionMethodNotSupported(_)
197 | _ => parse_error(format!("not a valid ZIP archive: {err}")),
198 })?;
199
200 let document_path = rels::find_document_path(&mut archive)?;
201
202 let (data_start, compressed_size, method) = {
203 let entry = archive
204 .by_name(&document_path)
205 .map_err(|_err| Error::Parse {
206 message: format!("document target not found: {document_path}"),
207 position: None,
208 })?;
209 let data_start = entry
210 .data_start()
211 .ok_or_else(|| parse_error("document.xml has no data offset".to_string()))?;
212 (data_start, entry.compressed_size(), entry.compression())
213 };
214 drop(archive);
215
216 reader
217 .seek(std::io::SeekFrom::Start(data_start))
218 .map_err(Error::from)?;
219
220 let limited = reader.take(compressed_size);
221
222 let stream: Box<dyn Read + Send> = if method == zip::CompressionMethod::Stored {
223 Box::new(limited)
224 } else if method == zip::CompressionMethod::Deflated {
225 Box::new(flate2::read::DeflateDecoder::new(limited))
226 } else {
227 return Err(Error::Parse {
228 message: format!("unsupported compression: {method:?}"),
229 position: None,
230 });
231 };
232
233 let xml = quick_xml::Reader::from_reader(BufReader::new(stream));
234
235 Ok(Self {
236 buf: Vec::with_capacity(4096),
237 in_ignored_subtree: 0,
238 in_paragraph: false,
239 in_text: false,
240 in_ppr: false,
241 pending_paragraph_alignment: None,
242 paragraph_started_emitted: false,
243 in_rpr: false,
244 pending_run_style: TextStyle::default(),
245 pending_text: String::new(),
246 current_run_style: TextStyle::default(),
247 phase: Phase::NotStarted,
248 queue: VecDeque::new(),
249 run_content_emitted: false,
250 xml,
251 })
252 }
253}
254
255impl DocxReader {
256 fn can_collect_text(&self) -> bool {
257 self.in_ignored_subtree == 0 && self.in_paragraph && self.in_text
258 }
259
260 fn emit_line_break(&mut self) {
261 self.ensure_paragraph_started();
262 self.flush_pending_text();
263 self.run_content_emitted = true;
264 self.queue.push_back(Event::LineBreak);
265 }
266
267 fn emit_tab(&mut self) {
268 self.ensure_paragraph_started();
269 self.flush_pending_text();
270 self.run_content_emitted = true;
271 self.queue.push_back(Event::Text {
272 content: "\t".to_string(),
273 style: TextStyle::default(),
274 });
275 }
276
277 fn end_paragraph(&mut self) {
278 self.ensure_paragraph_started();
279 self.queue.push_back(Event::EndParagraph);
280 self.in_paragraph = false;
281 self.in_text = false;
282 self.pending_text.clear();
283 self.in_ppr = false;
284 self.pending_paragraph_alignment = None;
285 self.paragraph_started_emitted = false;
286 }
287
288 fn flush_pending_text(&mut self) {
289 if !self.pending_text.is_empty() {
290 self.queue.push_back(Event::Text {
291 content: core::mem::take(&mut self.pending_text),
292 style: self.current_run_style.clone(),
293 });
294 }
295 }
296
297 fn handle_cdata(&mut self, cdata: BytesCData<'_>) -> Result<()> {
298 if self.can_collect_text() {
299 let bytes = cdata.into_inner();
300 let content = core::str::from_utf8(&bytes)
301 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
302 self.pending_text.push_str(content);
303 }
304 Ok(())
305 }
306
307 fn handle_empty(&mut self, tag: &BytesStart<'_>) {
308 let local_name = tag.local_name();
309 let local = local_name.as_ref();
310 match local {
311 value if self.in_ignored_subtree > 0 || is_ignored_container(value) => {}
312 b"pPr" if self.in_paragraph && !self.paragraph_started_emitted => {
313 self.ensure_paragraph_started();
314 }
315 b"jc" if self.in_ppr => {
316 let val = read_val_attribute(tag);
317 self.pending_paragraph_alignment =
318 val.as_deref().and_then(properties::parse_alignment);
319 }
320 b"rPr" if self.in_ppr => {}
321 b"rPr" if self.in_paragraph && !self.in_ppr && !self.in_rpr => {}
322 b"b" if self.in_rpr => {
323 self.pending_run_style.bold = parse_on_off_attribute(tag);
324 }
325 b"i" if self.in_rpr => {
326 self.pending_run_style.italic = parse_on_off_attribute(tag);
327 }
328 b"strike" | b"dstrike" if self.in_rpr => {
329 self.pending_run_style.strikethrough = parse_on_off_attribute(tag);
330 }
331 b"u" if self.in_rpr => {
332 let val = read_val_attribute(tag);
333 self.pending_run_style.underline = properties::parse_underline_on(val.as_deref());
334 }
335 b"vertAlign" if self.in_rpr => {
336 let val = read_val_attribute(tag);
337 match properties::parse_vert_align(val.as_deref()) {
338 properties::VertAlign::Subscript => {
339 self.pending_run_style.subscript = true;
340 self.pending_run_style.superscript = false;
341 }
342 properties::VertAlign::Superscript => {
343 self.pending_run_style.superscript = true;
344 self.pending_run_style.subscript = false;
345 }
346 properties::VertAlign::None => {
347 self.pending_run_style.subscript = false;
348 self.pending_run_style.superscript = false;
349 }
350 }
351 }
352 b"p" if !self.in_paragraph => {
353 self.queue.push_back(Event::StartParagraph {
354 alignment: None,
355 id: None,
356 });
357 self.queue.push_back(Event::EndParagraph);
358 }
359 b"br" if self.in_paragraph => self.emit_line_break(),
360 b"tab" if self.in_paragraph => self.emit_tab(),
361 _ => {}
362 }
363 }
364
365 fn handle_end(&mut self, local: &[u8]) {
366 if self.in_ignored_subtree > 0 {
367 self.in_ignored_subtree = self.in_ignored_subtree.saturating_sub(1);
368 return;
369 }
370
371 match local {
372 b"p" if self.in_paragraph => self.end_paragraph(),
373 b"pPr" if self.in_ppr => {
374 self.ensure_paragraph_started();
375 self.in_ppr = false;
376 }
377 b"rPr" if self.in_rpr => {
378 self.current_run_style = self.pending_run_style.clone();
379 self.in_rpr = false;
380 }
381 b"r" => {
382 self.current_run_style = TextStyle::default();
383 self.pending_run_style = TextStyle::default();
384 self.run_content_emitted = false;
385 self.in_rpr = false;
386 }
387 b"t" if self.in_text => {
388 self.flush_pending_text();
389 self.in_text = false;
390 }
391 b"tbl" => self.queue.push_back(Event::EndTable),
392 b"tr" => self.queue.push_back(Event::EndTableRow),
393 b"tc" => self.queue.push_back(Event::EndTableCell),
394 _ => {}
395 }
396 }
397
398 fn handle_eof(&mut self) {
399 if self.in_text {
400 self.flush_pending_text();
401 }
402 if self.in_paragraph {
403 self.end_paragraph();
404 }
405 self.queue.push_back(Event::EndDocument);
406 self.phase = Phase::Finished;
407 }
408
409 fn handle_general_ref(&mut self, reference: &BytesRef<'_>) -> Result<()> {
410 if self.can_collect_text() {
411 let decoded = reference
412 .decode()
413 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
414 let escaped = format!("&{decoded};");
415 let unescaped = quick_xml::escape::unescape(&escaped)
416 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
417 self.pending_text.push_str(&unescaped);
418 }
419 Ok(())
420 }
421
422 fn handle_start(&mut self, tag: &BytesStart<'_>) {
423 let local_name = tag.local_name();
424 let local = local_name.as_ref();
425 if self.in_ignored_subtree > 0 {
426 self.in_ignored_subtree = self.in_ignored_subtree.saturating_add(1);
427 return;
428 }
429
430 match local {
431 value if is_ignored_container(value) => self.in_ignored_subtree = 1,
432 b"pPr" if self.in_paragraph => {
433 if self.paragraph_started_emitted {
434 self.in_ignored_subtree = 1;
436 } else {
437 self.in_ppr = true;
438 self.pending_paragraph_alignment = None;
439 }
440 }
441 b"jc" if self.in_ppr => {
442 let val = read_val_attribute(tag);
443 self.pending_paragraph_alignment =
444 val.as_deref().and_then(properties::parse_alignment);
445 }
446 b"rPr" if self.in_ppr => {
447 self.in_ignored_subtree = 1;
448 }
449 b"rPr" if self.in_paragraph && !self.in_ppr && !self.in_rpr => {
450 if self.run_content_emitted {
451 self.in_ignored_subtree = 1;
453 } else {
454 self.in_rpr = true;
455 self.pending_run_style = TextStyle::default();
456 }
457 }
458 b"b" if self.in_rpr => {
459 self.pending_run_style.bold = parse_on_off_attribute(tag);
460 }
461 b"i" if self.in_rpr => {
462 self.pending_run_style.italic = parse_on_off_attribute(tag);
463 }
464 b"strike" | b"dstrike" if self.in_rpr => {
465 self.pending_run_style.strikethrough = parse_on_off_attribute(tag);
466 }
467 b"u" if self.in_rpr => {
468 let val = read_val_attribute(tag);
469 self.pending_run_style.underline = properties::parse_underline_on(val.as_deref());
470 }
471 b"vertAlign" if self.in_rpr => {
472 let val = read_val_attribute(tag);
473 match properties::parse_vert_align(val.as_deref()) {
474 properties::VertAlign::Subscript => {
475 self.pending_run_style.subscript = true;
476 self.pending_run_style.superscript = false;
477 }
478 properties::VertAlign::Superscript => {
479 self.pending_run_style.superscript = true;
480 self.pending_run_style.subscript = false;
481 }
482 properties::VertAlign::None => {
483 self.pending_run_style.subscript = false;
484 self.pending_run_style.superscript = false;
485 }
486 }
487 }
488 b"p" if !self.in_paragraph => self.start_paragraph(),
489 b"r" if self.in_paragraph => {
490 self.ensure_paragraph_started();
491 }
492 b"t" if self.in_paragraph => {
493 self.ensure_paragraph_started();
494 self.in_text = true;
495 self.pending_text.clear();
496 self.run_content_emitted = true;
497 }
498 b"br" if self.in_paragraph => self.emit_line_break(),
499 b"tab" if self.in_paragraph => self.emit_tab(),
500 b"tbl" => self.queue.push_back(Event::StartTable { id: None }),
501 b"tr" => self.queue.push_back(Event::StartTableRow { id: None }),
502 b"tc" => self.queue.push_back(Event::StartTableCell {
503 colspan: None,
504 id: None,
505 rowspan: None,
506 }),
507 _ => {}
508 }
509 }
510
511 fn handle_text(&mut self, text: &BytesText<'_>) -> Result<()> {
512 if self.can_collect_text() {
513 let decoded = text
514 .decode()
515 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
516 let unescaped = quick_xml::escape::unescape(&decoded)
517 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
518 self.pending_text.push_str(&unescaped);
519 }
520 Ok(())
521 }
522
523 fn read_until_event(&mut self) -> Result<()> {
524 let event = self
525 .xml
526 .read_event_into(&mut self.buf)
527 .map_err(|err| match err {
528 quick_xml::Error::Io(source) => Error::Io {
529 source: std::io::Error::new(source.kind(), source.to_string()),
530 },
531 other => Error::Parse {
532 message: format!("malformed document.xml: {other}"),
533 position: None,
534 },
535 })?
536 .into_owned();
537
538 match event {
539 quick_xml::events::Event::Start(tag) => self.handle_start(&tag),
540 quick_xml::events::Event::End(tag) => self.handle_end(tag.local_name().as_ref()),
541 quick_xml::events::Event::Empty(tag) => self.handle_empty(&tag),
542 quick_xml::events::Event::Text(text) => {
543 self.handle_text(&text)?;
544 }
545 quick_xml::events::Event::GeneralRef(reference) => {
546 self.handle_general_ref(&reference)?;
547 }
548 quick_xml::events::Event::CData(cdata) => self.handle_cdata(cdata)?,
549 quick_xml::events::Event::Eof => self.handle_eof(),
550 quick_xml::events::Event::Comment(_)
551 | quick_xml::events::Event::Decl(_)
552 | quick_xml::events::Event::PI(_)
553 | quick_xml::events::Event::DocType(_) => {}
554 }
555
556 self.buf.clear();
557 Ok(())
558 }
559
560 fn start_paragraph(&mut self) {
561 self.in_paragraph = true;
562 self.in_text = false;
563 self.pending_text.clear();
564 self.paragraph_started_emitted = false;
565 self.pending_paragraph_alignment = None;
566 }
567
568 fn ensure_paragraph_started(&mut self) {
570 if self.in_paragraph && !self.paragraph_started_emitted {
571 self.queue.push_back(Event::StartParagraph {
572 alignment: self.pending_paragraph_alignment.clone(),
573 id: None,
574 });
575 self.paragraph_started_emitted = true;
576 }
577 }
578}
579
580impl EventSource for DocxReader {
581 #[inline]
582 fn next_event(&mut self) -> Result<Option<Event>> {
583 loop {
584 if let Some(event) = self.queue.pop_front() {
585 return Ok(Some(event));
586 }
587
588 match self.phase {
589 Phase::NotStarted => {
590 self.phase = Phase::Running;
591 self.queue.push_back(Event::StartDocument {
592 id: None,
593 language: None,
594 metadata: None,
595 });
596 }
597 Phase::Finished => return Ok(None),
598 Phase::Running => self.read_until_event()?,
599 }
600 }
601 }
602}
603
/// Reports whether `local` is the local name of an element whose whole
/// subtree the reader skips.
fn is_ignored_container(local: &[u8]) -> bool {
    const IGNORED: [&[u8]; 13] = [
        b"sdt",
        b"hyperlink",
        b"drawing",
        b"pict",
        b"object",
        b"ins",
        b"del",
        b"moveFrom",
        b"moveTo",
        b"tblPr",
        b"trPr",
        b"tcPr",
        b"tblGrid",
    ];
    IGNORED.iter().any(|name| *name == local)
}
622
623fn read_val_attribute(tag: &BytesStart<'_>) -> Option<String> {
624 let a = tag.try_get_attribute(b"w:val").ok().flatten()?;
625 core::str::from_utf8(a.value.as_ref())
626 .ok()
627 .map(str::to_owned)
628}
629
630fn parse_on_off_attribute(tag: &BytesStart<'_>) -> bool {
631 let val = read_val_attribute(tag);
632 properties::parse_on_off(val.as_deref())
633}
634
635fn parse_error(message: String) -> Error {
636 Error::Parse {
637 message,
638 position: None,
639 }
640}
641
#[cfg(test)]
#[cfg(not(coverage))]
mod tests {
    use std::io::{Cursor, Write as _};

    use zip::{write::SimpleFileOptions, CompressionMethod, ZipWriter};

    use super::*;

    /// Result type shared by the fallible tests and helpers below.
    type TestResult<T> = core::result::Result<T, Box<dyn core::error::Error>>;

    /// Minimal package relationships part pointing at `word/document.xml`.
    const SIMPLE_RELS: &str = r#"<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>"#;

    /// The reader must be movable to another thread and own all of its data.
    #[test]
    fn docx_reader_is_send_static() {
        fn assert_send_static<T: Send + 'static>() {}
        assert_send_static::<DocxReader>();
    }

    /// Builds an in-memory DOCX holding the two given parts, both deflated.
    fn synth_docx_for_unit_test(rels_xml: &str, document_xml: &str) -> TestResult<Vec<u8>> {
        let mut writer = ZipWriter::new(Cursor::new(Vec::new()));
        let parts = [("_rels/.rels", rels_xml), ("word/document.xml", document_xml)];
        for (name, body) in parts {
            let options =
                SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
            writer.start_file(name, options)?;
            writer.write_all(body.as_bytes())?;
        }
        let cursor = writer.finish()?;
        Ok(cursor.into_inner())
    }

    /// Creates a reader over a synthetic package with the given document part.
    fn make_reader(document_xml: &str) -> TestResult<DocxReader> {
        let package = synth_docx_for_unit_test(SIMPLE_RELS, document_xml)?;
        let reader = DocxReader::from_reader(Cursor::new(package))?;
        Ok(reader)
    }

    /// Streaming must keep the queue small no matter how long the document is.
    #[test]
    fn queue_length_never_exceeds_three() -> TestResult<()> {
        let mut doc = String::from(
            r#"<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body>"#,
        );
        doc.push_str(&"<w:p><w:r><w:t>hello</w:t></w:r></w:p>".repeat(1000));
        doc.push_str("</w:body></w:document>");

        let mut reader = make_reader(&doc)?;
        loop {
            let queued = reader.queue.len();
            if queued > 3 {
                return Err(Box::new(Error::Other {
                    message: format!("queue grew to {queued}"),
                }));
            }
            if reader.next_event()?.is_none() {
                return Ok(());
            }
        }
    }

    /// The scratch buffer must be empty after every event that is handed out.
    #[test]
    fn buf_is_cleared_per_iteration() -> TestResult<()> {
        let doc = r#"<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>hello</w:t></w:r></w:p></w:body></w:document>"#;
        let mut reader = make_reader(doc)?;
        loop {
            let Some(_event) = reader.next_event()? else {
                return Ok(());
            };
            if !reader.buf.is_empty() {
                return Err(Box::new(Error::Other {
                    message: "buf not cleared after event".to_string(),
                }));
            }
        }
    }
}