Skip to main content

epub_parser/utils/
zip.rs

//! ZIP archive handling for EPUB files.
//!
//! This module provides utilities for reading EPUB files, which are
//! ZIP archives containing XML, HTML, and media files. It handles
//! locating the OPF file via `META-INF/container.xml` and reading
//! files from the archive.
7
8use crate::epub::Error;
9use std::fs::File;
10use std::io::{Read, Seek};
11use std::path::Path;
12use zip::ZipArchive;
13
14/// A handler for reading EPUB files as ZIP archives.
15///
16/// This struct wraps a `ZipArchive` and provides convenience methods
17/// for reading specific files needed for EPUB parsing:
18/// - Locating the OPF file via META-INF/container.xml
19/// - Reading text files (XML, HTML, CSS, etc.)
20/// - Reading binary files (images, fonts, etc.)
21///
22/// # Example
23///
24/// ```
25/// use std::path::Path;
26/// use epub_parser::utils::ZipHandler;
27///
28/// let mut handler = ZipHandler::new(Path::new("book.epub"))?;
29/// let opf_path = handler.get_opf_path()?;
30/// println!("OPF location: {}", opf_path);
31/// # Ok::<(), Box<dyn std::error::Error>>(())
32/// ```
33pub struct ZipHandler<R: Read + Seek> {
34    archive: ZipArchive<R>,
35}
36
37impl ZipHandler<File> {
38    /// Creates a new ZipHandler from a file path.
39    ///
40    /// # Arguments
41    ///
42    /// * `path` - The path to the EPUB file.
43    ///
44    /// # Returns
45    ///
46    /// Returns `Ok(ZipHandler)` on success, or an error if the file
47    /// cannot be opened or is not a valid ZIP archive.
48    ///
49    /// # Errors
50    ///
51    /// Returns an error if:
52    /// - The file does not exist
53    /// - The file cannot be opened
54    /// - The file is not a valid ZIP archive
55    pub fn new(path: &Path) -> Result<Self, Error> {
56        let file = File::open(path)?;
57        let archive = ZipArchive::new(file)?;
58        Ok(ZipHandler { archive })
59    }
60}
61
62impl<R: Read + Seek> ZipHandler<R> {
63    /// Creates a new ZipHandler from any reader that implements
64    /// `Read + Seek`.
65    ///
66    /// This is useful for parsing EPUBs from memory (e.g., byte buffers)
67    /// or network streams.
68    ///
69    /// # Arguments
70    ///
71    /// * `reader` - Any type implementing `Read + Seek` (e.g., `Cursor<Vec<u8>>`).
72    ///
73    /// # Returns
74    ///
75    /// Returns `Ok(ZipHandler)` on success, or an error if the reader
76    /// does not contain a valid ZIP archive.
77    ///
78    /// # Example
79    ///
80    /// ```
81    /// use std::io::Cursor;
82    /// use epub_parser::utils::ZipHandler;
83    ///
84    /// let data = vec![0u8; 100]; // In practice, this would be EPUB data
85    /// // handler = ZipHandler::new_from_reader(Cursor::new(data))?;
86    /// ```
87    pub fn new_from_reader(reader: R) -> Result<Self, Error> {
88        let archive = ZipArchive::new(reader)?;
89        Ok(ZipHandler { archive })
90    }
91
92    /// Locates the OPF (Open Package Format) file path.
93    ///
94    /// EPUB files contain a `META-INF/container.xml` file that specifies
95    /// the location of the OPF file. This method parses that XML and
96    /// returns the path to the OPF file.
97    ///
98    /// # Returns
99    ///
100    /// Returns the path to the OPF file as a string (e.g., "OEBPS/content.opf").
101    ///
102    /// # Errors
103    ///
104    /// Returns `Error::MissingContainer` if META-INF/container.xml is missing.
105    /// Returns `Error::MissingOpf` if the OPF path cannot be found.
106    pub fn get_opf_path(&mut self) -> Result<String, Error> {
107        let container_content = self.read_file("META-INF/container.xml")?;
108
109        let mut reader = quick_xml::Reader::from_str(&container_content);
110        let mut opf_path = String::new();
111
112        let mut buf = Vec::new();
113
114        loop {
115            match reader.read_event_into(&mut buf) {
116                Ok(quick_xml::events::Event::Start(ref e)) => {
117                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
118
119                    if name == "rootfile" || name.ends_with(":rootfile") {
120                        for attr_result in e.attributes() {
121                            if let Ok(attr) = attr_result {
122                                let attr_name =
123                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
124
125                                if attr_name == "full-path" || attr_name.ends_with(":full-path") {
126                                    opf_path = attr
127                                        .decode_and_unescape_value(reader.decoder())?
128                                        .to_string();
129                                    break;
130                                }
131                            }
132                        }
133                        if !opf_path.is_empty() {
134                            break;
135                        }
136                    }
137                }
138                Ok(quick_xml::events::Event::Empty(ref e)) => {
139                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
140
141                    if name == "rootfile" || name.ends_with(":rootfile") {
142                        for attr_result in e.attributes() {
143                            if let Ok(attr) = attr_result {
144                                let attr_name =
145                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
146
147                                if attr_name == "full-path" || attr_name.ends_with(":full-path") {
148                                    opf_path = attr
149                                        .decode_and_unescape_value(reader.decoder())?
150                                        .to_string();
151                                    break;
152                                }
153                            }
154                        }
155                        if !opf_path.is_empty() {
156                            break;
157                        }
158                    }
159                }
160                Ok(quick_xml::events::Event::Eof) => break,
161                Err(e) => return Err(Error::XmlError(e.to_string())),
162                _ => {}
163            }
164            buf.clear();
165        }
166
167        if opf_path.is_empty() {
168            return Err(Error::MissingOpf);
169        }
170
171        Ok(opf_path)
172    }
173
174    /// Reads a file from the ZIP archive as a UTF-8 string.
175    ///
176    /// # Arguments
177    ///
178    /// * `path` - The path to the file within the ZIP archive.
179    ///
180    /// # Returns
181    ///
182    /// Returns the file contents as a `String`.
183    ///
184    /// # Errors
185    ///
186    /// Returns an error if:
187    /// - The file does not exist in the archive
188    /// - The file cannot be read
189    /// - The file contains invalid UTF-8
190    pub fn read_file(&mut self, path: &str) -> Result<String, Error> {
191        let mut file = self.archive.by_name(path)?;
192        let mut content = String::new();
193        file.read_to_string(&mut content)?;
194        Ok(content)
195    }
196
197    /// Reads a file from the ZIP archive as raw bytes.
198    ///
199    /// This is useful for binary files like images and fonts.
200    ///
201    /// # Arguments
202    ///
203    /// * `path` - The path to the file within the ZIP archive.
204    ///
205    /// # Returns
206    ///
207    /// Returns the file contents as a `Vec<u8>`.
208    ///
209    /// # Errors
210    ///
211    /// Returns an error if the file does not exist or cannot be read.
212    pub fn read_file_as_bytes(&mut self, path: &str) -> Result<Vec<u8>, Error> {
213        let mut file = self.archive.by_name(path)?;
214        let mut content = Vec::new();
215        file.read_to_end(&mut content)?;
216        Ok(content)
217    }
218}