1#![forbid(unsafe_code)]
2#![cfg_attr(docsrs, feature(doc_cfg))]
3extern crate alloc;
57
58mod rels;
59
60use alloc::collections::VecDeque;
61use core::fmt;
62use std::io::{BufReader, Read, Seek};
63use std::path::Path;
64
65pub use docspec_core::EventSource;
66use docspec_core::{Error, Event, Result, TextStyle};
67use quick_xml::events::{BytesCData, BytesRef, BytesText};
68
/// Lifecycle of a [`DocxReader`] as seen by `next_event`.
#[derive(Clone, Copy, PartialEq, Eq)]
enum Phase {
    /// End of input was reached; once the queue drains, `next_event` returns `None`.
    Finished,
    /// Nothing has been emitted yet; the next poll queues `StartDocument`.
    NotStarted,
    /// XML events are being read and translated into document events.
    Running,
}
79
/// Streaming reader that turns the main document part of a DOCX package into
/// a sequence of [`Event`]s.
pub struct DocxReader {
    /// Scratch buffer handed to `quick_xml` for each read and cleared afterwards.
    buf: Vec<u8>,
    /// Nesting depth inside an ignored container; zero when not inside one.
    in_ignored_subtree: u32,
    /// True between the start and end of a paragraph element.
    in_paragraph: bool,
    /// True between the start and end of a text (`t`) element.
    in_text: bool,
    /// Text collected for the current `t` element, not yet emitted.
    pending_text: String,
    /// Where the reader is in its lifecycle.
    phase: Phase,
    /// Events produced but not yet returned to the caller.
    queue: VecDeque<Event>,
    /// XML parser over the (decompressed) document stream.
    xml: quick_xml::Reader<BufReader<Box<dyn Read + Send>>>,
}
117
118impl fmt::Debug for DocxReader {
119 #[inline]
120 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
121 f.debug_struct("DocxReader")
122 .field("buf", &self.buf)
123 .field("in_ignored_subtree", &self.in_ignored_subtree)
124 .field("in_paragraph", &self.in_paragraph)
125 .field("in_text", &self.in_text)
126 .field("pending_text", &self.pending_text)
127 .field("phase", &"<phase>")
128 .field("queue", &self.queue)
129 .field("xml", &"<quick_xml::Reader>")
130 .finish()
131 }
132}
133
134impl DocxReader {
135 #[inline]
142 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
143 let file = std::fs::File::open(path.as_ref()).map_err(Error::from)?;
144 Self::from_reader(file)
145 }
146
147 #[inline]
157 pub fn from_reader<R: Read + Seek + Send + 'static>(mut reader: R) -> Result<Self> {
158 let mut archive = zip::ZipArchive::new(&mut reader).map_err(|err| match err {
159 zip::result::ZipError::InvalidArchive(_)
160 | zip::result::ZipError::UnsupportedArchive(_) => Error::Parse {
161 message: "not a valid ZIP archive".to_string(),
162 position: None,
163 },
164 zip::result::ZipError::Io(source) => Error::Io { source },
165 zip::result::ZipError::FileNotFound
166 | zip::result::ZipError::InvalidPassword
167 | zip::result::ZipError::CompressionMethodNotSupported(_)
168 | _ => parse_error(format!("not a valid ZIP archive: {err}")),
169 })?;
170
171 let document_path = rels::find_document_path(&mut archive)?;
172
173 let (data_start, compressed_size, method) = {
174 let entry = archive
175 .by_name(&document_path)
176 .map_err(|_err| Error::Parse {
177 message: format!("document target not found: {document_path}"),
178 position: None,
179 })?;
180 let data_start = entry
181 .data_start()
182 .ok_or_else(|| parse_error("document.xml has no data offset".to_string()))?;
183 (data_start, entry.compressed_size(), entry.compression())
184 };
185 drop(archive);
186
187 reader
188 .seek(std::io::SeekFrom::Start(data_start))
189 .map_err(Error::from)?;
190
191 let limited = reader.take(compressed_size);
192
193 let stream: Box<dyn Read + Send> = if method == zip::CompressionMethod::Stored {
194 Box::new(limited)
195 } else if method == zip::CompressionMethod::Deflated {
196 Box::new(flate2::read::DeflateDecoder::new(limited))
197 } else {
198 return Err(Error::Parse {
199 message: format!("unsupported compression: {method:?}"),
200 position: None,
201 });
202 };
203
204 let xml = quick_xml::Reader::from_reader(BufReader::new(stream));
205
206 Ok(Self {
207 buf: Vec::with_capacity(4096),
208 in_ignored_subtree: 0,
209 in_paragraph: false,
210 in_text: false,
211 pending_text: String::new(),
212 phase: Phase::NotStarted,
213 queue: VecDeque::new(),
214 xml,
215 })
216 }
217}
218
219impl DocxReader {
220 fn can_collect_text(&self) -> bool {
221 self.in_ignored_subtree == 0 && self.in_paragraph && self.in_text
222 }
223
224 fn emit_line_break(&mut self) {
225 self.flush_pending_text();
226 self.queue.push_back(Event::LineBreak);
227 }
228
229 fn emit_tab(&mut self) {
230 self.flush_pending_text();
231 self.queue.push_back(Event::Text {
232 content: "\t".to_string(),
233 style: TextStyle::default(),
234 });
235 }
236
237 fn end_paragraph(&mut self) {
238 self.queue.push_back(Event::EndParagraph);
239 self.in_paragraph = false;
240 self.in_text = false;
241 self.pending_text.clear();
242 }
243
244 fn flush_pending_text(&mut self) {
245 if !self.pending_text.is_empty() {
246 self.queue.push_back(Event::Text {
247 content: core::mem::take(&mut self.pending_text),
248 style: TextStyle::default(),
249 });
250 }
251 }
252
253 fn handle_cdata(&mut self, cdata: BytesCData<'_>) -> Result<()> {
254 if self.can_collect_text() {
255 let bytes = cdata.into_inner();
256 let content = core::str::from_utf8(&bytes)
257 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
258 self.pending_text.push_str(content);
259 }
260 Ok(())
261 }
262
263 fn handle_empty(&mut self, local: &[u8]) {
264 match local {
265 value if self.in_ignored_subtree > 0 || is_ignored_container(value) => {}
266 b"p" if !self.in_paragraph => {
267 self.queue.push_back(Event::StartParagraph {
268 alignment: None,
269 id: None,
270 });
271 self.queue.push_back(Event::EndParagraph);
272 }
273 b"br" if self.in_paragraph => self.emit_line_break(),
274 b"tab" if self.in_paragraph => self.emit_tab(),
275 _ => {}
276 }
277 }
278
279 fn handle_end(&mut self, local: &[u8]) {
280 if self.in_ignored_subtree > 0 {
281 self.in_ignored_subtree = self.in_ignored_subtree.saturating_sub(1);
282 return;
283 }
284
285 match local {
286 b"p" if self.in_paragraph => self.end_paragraph(),
287 b"t" if self.in_text => {
288 self.flush_pending_text();
289 self.in_text = false;
290 }
291 b"tbl" => self.queue.push_back(Event::EndTable),
292 b"tr" => self.queue.push_back(Event::EndTableRow),
293 b"tc" => self.queue.push_back(Event::EndTableCell),
294 _ => {}
295 }
296 }
297
298 fn handle_eof(&mut self) {
299 if self.in_text {
300 self.flush_pending_text();
301 }
302 if self.in_paragraph {
303 self.end_paragraph();
304 }
305 self.queue.push_back(Event::EndDocument);
306 self.phase = Phase::Finished;
307 }
308
309 fn handle_general_ref(&mut self, reference: &BytesRef<'_>) -> Result<()> {
310 if self.can_collect_text() {
311 let decoded = reference
312 .decode()
313 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
314 let escaped = format!("&{decoded};");
315 let unescaped = quick_xml::escape::unescape(&escaped)
316 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
317 self.pending_text.push_str(&unescaped);
318 }
319 Ok(())
320 }
321
322 fn handle_start(&mut self, local: &[u8]) {
323 if self.in_ignored_subtree > 0 {
324 self.in_ignored_subtree = self.in_ignored_subtree.saturating_add(1);
325 return;
326 }
327
328 match local {
329 value if is_ignored_container(value) => self.in_ignored_subtree = 1,
330 b"p" if !self.in_paragraph => self.start_paragraph(),
331 b"t" if self.in_paragraph => {
332 self.in_text = true;
333 self.pending_text.clear();
334 }
335 b"br" if self.in_paragraph => self.emit_line_break(),
336 b"tab" if self.in_paragraph => self.emit_tab(),
337 b"tbl" => self.queue.push_back(Event::StartTable { id: None }),
338 b"tr" => self.queue.push_back(Event::StartTableRow { id: None }),
339 b"tc" => self.queue.push_back(Event::StartTableCell {
340 colspan: None,
341 id: None,
342 rowspan: None,
343 }),
344 _ => {}
345 }
346 }
347
348 fn handle_text(&mut self, text: &BytesText<'_>) -> Result<()> {
349 if self.can_collect_text() {
350 let decoded = text
351 .decode()
352 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
353 let unescaped = quick_xml::escape::unescape(&decoded)
354 .map_err(|err| parse_error(format!("malformed document.xml: {err}")))?;
355 self.pending_text.push_str(&unescaped);
356 }
357 Ok(())
358 }
359
360 fn read_until_event(&mut self) -> Result<()> {
361 let event = self
362 .xml
363 .read_event_into(&mut self.buf)
364 .map_err(|err| match err {
365 quick_xml::Error::Io(source) => Error::Io {
366 source: std::io::Error::new(source.kind(), source.to_string()),
367 },
368 other => Error::Parse {
369 message: format!("malformed document.xml: {other}"),
370 position: None,
371 },
372 })?
373 .into_owned();
374
375 match event {
376 quick_xml::events::Event::Start(tag) => self.handle_start(tag.local_name().as_ref()),
377 quick_xml::events::Event::End(tag) => self.handle_end(tag.local_name().as_ref()),
378 quick_xml::events::Event::Empty(tag) => self.handle_empty(tag.local_name().as_ref()),
379 quick_xml::events::Event::Text(text) => {
380 self.handle_text(&text)?;
381 }
382 quick_xml::events::Event::GeneralRef(reference) => {
383 self.handle_general_ref(&reference)?;
384 }
385 quick_xml::events::Event::CData(cdata) => self.handle_cdata(cdata)?,
386 quick_xml::events::Event::Eof => self.handle_eof(),
387 quick_xml::events::Event::Comment(_)
388 | quick_xml::events::Event::Decl(_)
389 | quick_xml::events::Event::PI(_)
390 | quick_xml::events::Event::DocType(_) => {}
391 }
392
393 self.buf.clear();
394 Ok(())
395 }
396
397 fn start_paragraph(&mut self) {
398 self.queue.push_back(Event::StartParagraph {
399 alignment: None,
400 id: None,
401 });
402 self.in_paragraph = true;
403 self.in_text = false;
404 self.pending_text.clear();
405 }
406}
407
408impl EventSource for DocxReader {
409 #[inline]
410 fn next_event(&mut self) -> Result<Option<Event>> {
411 loop {
412 if let Some(event) = self.queue.pop_front() {
413 return Ok(Some(event));
414 }
415
416 match self.phase {
417 Phase::NotStarted => {
418 self.phase = Phase::Running;
419 self.queue.push_back(Event::StartDocument {
420 id: None,
421 language: None,
422 metadata: None,
423 });
424 }
425 Phase::Finished => return Ok(None),
426 Phase::Running => self.read_until_event()?,
427 }
428 }
429 }
430}
431
/// Reports whether `local` is the local name of an element whose entire
/// subtree is skipped by the reader.
fn is_ignored_container(local: &[u8]) -> bool {
    /// Local names of the containers that are skipped wholesale.
    const IGNORED_CONTAINERS: [&[u8]; 13] = [
        b"sdt",
        b"hyperlink",
        b"drawing",
        b"pict",
        b"object",
        b"ins",
        b"del",
        b"moveFrom",
        b"moveTo",
        b"tblPr",
        b"trPr",
        b"tcPr",
        b"tblGrid",
    ];

    IGNORED_CONTAINERS.iter().any(|name| *name == local)
}
450
451fn parse_error(message: String) -> Error {
452 Error::Parse {
453 message,
454 position: None,
455 }
456}
457
#[cfg(test)]
#[cfg(not(coverage))]
mod tests {
    use std::io::{Cursor, Write as _};

    use zip::{write::SimpleFileOptions, CompressionMethod, ZipWriter};

    use super::*;

    /// Package relationships part that points at `word/document.xml`.
    const SIMPLE_RELS: &str = r#"<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>"#;

    /// Fallible result shared by the helpers and tests in this module.
    type TestResult<T> = core::result::Result<T, Box<dyn core::error::Error>>;

    /// The reader must be movable across threads and own all of its data.
    #[test]
    fn docx_reader_is_send_static() {
        fn assert_send_static<T: Send + 'static>() {}
        assert_send_static::<DocxReader>();
    }

    /// Builds an in-memory DOCX with a relationships part and a document part,
    /// both deflate-compressed.
    fn synth_docx_for_unit_test(rels_xml: &str, document_xml: &str) -> TestResult<Vec<u8>> {
        let deflated =
            || SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);

        let mut writer = ZipWriter::new(Cursor::new(Vec::new()));
        let parts = [
            ("_rels/.rels", rels_xml),
            ("word/document.xml", document_xml),
        ];
        for (part_name, part_body) in parts {
            writer.start_file(part_name, deflated())?;
            writer.write_all(part_body.as_bytes())?;
        }

        Ok(writer.finish()?.into_inner())
    }

    /// Wraps `document_xml` in a synthetic package and opens a reader over it.
    fn make_reader(document_xml: &str) -> TestResult<DocxReader> {
        let package = synth_docx_for_unit_test(SIMPLE_RELS, document_xml)?;
        Ok(DocxReader::from_reader(Cursor::new(package))?)
    }

    /// Streaming a large document must never buffer more than three events.
    #[test]
    fn queue_length_never_exceeds_three() -> TestResult<()> {
        let mut doc = String::from(
            r#"<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body>"#,
        );
        doc.push_str(&"<w:p><w:r><w:t>hello</w:t></w:r></w:p>".repeat(1000));
        doc.push_str("</w:body></w:document>");

        let mut reader = make_reader(&doc)?;
        loop {
            let queued = reader.queue.len();
            if queued > 3 {
                return Err(Box::new(Error::Other {
                    message: format!("queue grew to {queued}"),
                }));
            }
            if reader.next_event()?.is_none() {
                return Ok(());
            }
        }
    }

    /// The scratch buffer must be empty after every event handed to the caller.
    #[test]
    fn buf_is_cleared_per_iteration() -> TestResult<()> {
        let doc = r#"<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>hello</w:t></w:r></w:p></w:body></w:document>"#;
        let mut reader = make_reader(doc)?;
        loop {
            if reader.next_event()?.is_none() {
                return Ok(());
            }
            if !reader.buf.is_empty() {
                return Err(Box::new(Error::Other {
                    message: "buf not cleared after event".to_string(),
                }));
            }
        }
    }
}
538}