Skip to main content

docspec_docx_reader/
asset_provider.rs

1//! DOCX ZIP archive asset provider.
2
3use std::borrow::Cow;
4use std::io::{self, Read, Seek, Write};
5use std::path::Path;
6use std::sync::Mutex;
7
8use docspec_core::{AssetProvider, Error, Result};
9use zip::result::ZipError;
10use zip::ZipArchive;
11
12use crate::content_types::{self, ContentTypes};
13
/// Object-safe bundle of [`Read`], [`Seek`], and [`Send`].
///
/// Exists so that a seekable, sendable byte source can be stored behind a
/// single trait object (`Box<dyn ReadSeek>`).
trait ReadSeek: Read + Seek + Send {}

// Blanket implementation: anything that is `Read + Seek + Send` is a `ReadSeek`.
impl<T> ReadSeek for T where T: Read + Seek + Send {}
17
18/// Provides streaming access to binary assets stored inside a DOCX ZIP archive.
19///
20/// Holds the docx ZIP file open until dropped. Uses internal Mutex to serialize
21/// concurrent ZIP reads. Not Clone — use `Arc<DocxAssetProvider>` to share.
22pub struct DocxAssetProvider {
23    archive: Mutex<ZipArchive<Box<dyn ReadSeek + 'static>>>,
24    content_types: ContentTypes,
25}
26
27impl DocxAssetProvider {
28    /// Creates a `DocxAssetProvider` from a file path.
29    ///
30    /// Opens the DOCX ZIP file and reads `[Content_Types].xml` to build the
31    /// content type lookup table.
32    ///
33    /// # Errors
34    ///
35    /// Returns [`Error::Io`] if the file cannot be opened, or [`Error::Parse`]
36    /// if the file is not a valid ZIP archive.
37    #[inline]
38    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
39        let file = std::fs::File::open(path.as_ref()).map_err(Error::from)?;
40        Self::from_reader(file)
41    }
42
43    /// Creates a `DocxAssetProvider` from any [`Read`] + [`Seek`] + [`Send`] source.
44    ///
45    /// The source must be positioned at the start of a valid DOCX (ZIP) archive.
46    /// Reads `[Content_Types].xml` to build the content type lookup table.
47    ///
48    /// # Errors
49    ///
50    /// Returns [`Error::Parse`] if the input is not a valid ZIP archive, or
51    /// [`Error::Io`] for I/O failures when reading `[Content_Types].xml`.
52    #[inline]
53    pub fn from_reader<R: Read + Seek + Send + 'static>(reader: R) -> Result<Self> {
54        let boxed: Box<dyn ReadSeek + 'static> = Box::new(reader);
55        let mut archive = ZipArchive::new(boxed).map_err(|err| match err {
56            ZipError::InvalidArchive(_) | ZipError::UnsupportedArchive(_) => Error::Parse {
57                message: "not a valid ZIP archive".to_string(),
58                position: None,
59            },
60            ZipError::Io(source) => Error::Io { source },
61            ZipError::FileNotFound
62            | ZipError::InvalidPassword
63            | ZipError::CompressionMethodNotSupported(_)
64            | _ => Error::Parse {
65                message: format!("not a valid ZIP archive: {err}"),
66                position: None,
67            },
68        })?;
69
70        let ct_bytes = match archive.by_name("[Content_Types].xml") {
71            Ok(mut entry) => {
72                let mut bytes: Vec<u8> = Vec::new();
73                io::copy(&mut entry, &mut bytes).map_err(Error::from)?;
74                bytes
75            }
76            Err(_) => Vec::new(),
77        };
78
79        let content_types = content_types::parse(&ct_bytes)?;
80
81        Ok(Self {
82            archive: Mutex::new(archive),
83            content_types,
84        })
85    }
86}
87
88impl AssetProvider for DocxAssetProvider {
89    /// Returns the MIME content type for an asset ID with a `zip://` scheme prefix.
90    ///
91    /// Strips the `zip://` prefix and looks up the path in `[Content_Types].xml`.
92    /// Returns `None` if the scheme is not `zip://` or if no content type is registered
93    /// for the path.
94    #[inline]
95    fn content_type(&self, asset_id: &str) -> Option<Cow<'_, str>> {
96        asset_id
97            .strip_prefix("zip://")
98            .and_then(|p| self.content_types.lookup(p))
99            .map(Cow::Borrowed)
100    }
101
102    /// Streams the asset bytes at `asset_id` to `writer`.
103    ///
104    /// Strips the `zip://` prefix, acquires the archive mutex, locates the ZIP entry,
105    /// and copies bytes via [`io::copy`] — never buffers the full asset. Returns:
106    ///
107    /// - `None` if `asset_id` does not start with `zip://`
108    /// - `None` if the mutex is poisoned
109    /// - `None` if the entry is not found in the archive
110    /// - `Some(Ok(n))` on success with `n` bytes written
111    /// - `Some(Err(_))` on I/O error during copy
112    #[inline]
113    fn stream_to(&self, asset_id: &str, writer: &mut dyn Write) -> Option<io::Result<u64>> {
114        let path = asset_id.strip_prefix("zip://")?;
115        let mut archive = self.archive.lock().ok()?;
116        let mut entry = archive.by_name(path).ok()?;
117        Some(io::copy(&mut entry, writer))
118    }
119}
120
#[cfg(test)]
#[cfg(not(coverage))]
mod tests {
    // Test code may unwrap/expect freely and use either literal-suffix style.
    #![allow(
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::separated_literal_suffix,
        clippy::unseparated_literal_suffix
    )]
    use std::borrow::Cow;
    use std::io::{Cursor, Write as _};
    use zip::write::SimpleFileOptions;
    use zip::CompressionMethod;

    use super::DocxAssetProvider;
    use docspec_core::AssetProvider as _;

    /// Payload stored as the fake image: the first four bytes of a PNG signature.
    const PNG_MAGIC: &[u8] = &[0x89, 0x50, 0x4E, 0x47];

    /// Asset ID of the fake image inside the synthetic archives.
    const IMAGE_ID: &str = "zip://word/media/image1.png";

    /// Builds an in-memory ZIP archive holding `entries` as uncompressed files.
    fn synth_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
        let stored = SimpleFileOptions::default().compression_method(CompressionMethod::Stored);
        let mut zip_writer = zip::ZipWriter::new(Cursor::new(Vec::new()));
        for &(name, data) in entries {
            zip_writer.start_file(name, stored).unwrap();
            zip_writer.write_all(data).unwrap();
        }
        let cursor = zip_writer.finish().unwrap();
        cursor.into_inner()
    }

    /// A `[Content_Types].xml` body that registers `png` as `image/png`.
    fn content_types_png_xml() -> &'static [u8] {
        br#"<?xml version="1.0"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
  <Default Extension="png" ContentType="image/png"/>
</Types>"#
    }

    /// A minimal DOCX-like archive: content types plus one PNG asset.
    fn synth_png_docx() -> Vec<u8> {
        let entries: [(&str, &[u8]); 2] = [
            ("[Content_Types].xml", content_types_png_xml()),
            ("word/media/image1.png", PNG_MAGIC),
        ];
        synth_zip(&entries)
    }

    /// Opens a provider over in-memory archive bytes.
    fn open(zip_bytes: Vec<u8>) -> DocxAssetProvider {
        DocxAssetProvider::from_reader(Cursor::new(zip_bytes)).expect("should open")
    }

    /// Streams the fake image and checks both the byte count and the bytes.
    fn assert_streams_png(provider: &DocxAssetProvider) {
        let mut sink = Vec::new();
        let written = provider
            .stream_to(IMAGE_ID, &mut sink)
            .expect("should return Some")
            .expect("should be Ok");
        assert_eq!(written, 4u64);
        assert_eq!(sink, PNG_MAGIC);
    }

    #[test]
    fn is_send_sync() {
        fn assert_send_sync<T>()
        where
            T: Send + Sync,
        {
        }
        assert_send_sync::<DocxAssetProvider>();
    }

    #[test]
    fn stream_to_exact_bytes() {
        let provider = open(synth_png_docx());
        assert_streams_png(&provider);
    }

    #[test]
    fn content_type_from_default() {
        let provider = open(synth_png_docx());
        assert_eq!(
            provider.content_type(IMAGE_ID),
            Some(Cow::Borrowed("image/png"))
        );
    }

    #[test]
    fn non_zip_scheme_returns_none() {
        let provider = open(synth_png_docx());
        assert_eq!(provider.content_type("rId99"), None);
        let mut sink = Vec::new();
        let streamed = provider.stream_to("rId99", &mut sink);
        assert!(streamed.is_none());
    }

    #[test]
    fn missing_asset_stream_returns_none() {
        let provider = open(synth_png_docx());
        let mut sink = Vec::new();
        let streamed = provider.stream_to("zip://word/media/noexist.png", &mut sink);
        assert!(streamed.is_none());
    }

    #[test]
    fn content_type_returns_none_for_unregistered_extension() {
        let provider = open(synth_png_docx());
        assert_eq!(provider.content_type("zip://word/document.xml"), None);
    }

    #[test]
    fn from_path_opens_file() {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join("test.docx");
        std::fs::write(&path, synth_png_docx()).expect("write file");
        let provider = DocxAssetProvider::from_path(&path).expect("should open");
        assert_eq!(
            provider.content_type(IMAGE_ID),
            Some(Cow::Borrowed("image/png"))
        );
    }

    #[test]
    fn missing_content_types_yields_empty_lookup() {
        let provider = open(synth_zip(&[("word/media/image1.png", PNG_MAGIC)]));
        assert_eq!(provider.content_type(IMAGE_ID), None);
        assert_streams_png(&provider);
    }
}