epub_parser/utils/zip.rs
1//! ZIP archive handling for EPUB files.
2//!
3//! This module provides utilities for reading EPUB files, which are
4//! ZIP archives containing XML, HTML, and media files. It handles
5//! locating the OPF file via META-INF/container.xml and reading
6//! files from the archive.
7
8use crate::epub::Error;
9use std::fs::File;
10use std::io::{Read, Seek};
11use std::path::Path;
12use zip::ZipArchive;
13
14/// A handler for reading EPUB files as ZIP archives.
15///
16/// This struct wraps a `ZipArchive` and provides convenience methods
17/// for reading specific files needed for EPUB parsing:
18/// - Locating the OPF file via META-INF/container.xml
19/// - Reading text files (XML, HTML, CSS, etc.)
20/// - Reading binary files (images, fonts, etc.)
21///
22/// # Example
23///
24/// ```
25/// use std::path::Path;
26/// use epub_parser::utils::ZipHandler;
27///
28/// let mut handler = ZipHandler::new(Path::new("book.epub"))?;
29/// let opf_path = handler.get_opf_path()?;
30/// println!("OPF location: {}", opf_path);
31/// # Ok::<(), Box<dyn std::error::Error>>(())
32/// ```
33pub struct ZipHandler<R: Read + Seek> {
34 archive: ZipArchive<R>,
35}
36
37impl ZipHandler<File> {
38 /// Creates a new ZipHandler from a file path.
39 ///
40 /// # Arguments
41 ///
42 /// * `path` - The path to the EPUB file.
43 ///
44 /// # Returns
45 ///
46 /// Returns `Ok(ZipHandler)` on success, or an error if the file
47 /// cannot be opened or is not a valid ZIP archive.
48 ///
49 /// # Errors
50 ///
51 /// Returns an error if:
52 /// - The file does not exist
53 /// - The file cannot be opened
54 /// - The file is not a valid ZIP archive
55 pub fn new(path: &Path) -> Result<Self, Error> {
56 let file = File::open(path)?;
57 let archive = ZipArchive::new(file)?;
58 Ok(ZipHandler { archive })
59 }
60}
61
62impl<R: Read + Seek> ZipHandler<R> {
63 /// Creates a new ZipHandler from any reader that implements
64 /// `Read + Seek`.
65 ///
66 /// This is useful for parsing EPUBs from memory (e.g., byte buffers)
67 /// or network streams.
68 ///
69 /// # Arguments
70 ///
71 /// * `reader` - Any type implementing `Read + Seek` (e.g., `Cursor<Vec<u8>>`).
72 ///
73 /// # Returns
74 ///
75 /// Returns `Ok(ZipHandler)` on success, or an error if the reader
76 /// does not contain a valid ZIP archive.
77 ///
78 /// # Example
79 ///
80 /// ```
81 /// use std::io::Cursor;
82 /// use epub_parser::utils::ZipHandler;
83 ///
84 /// let data = vec![0u8; 100]; // In practice, this would be EPUB data
85 /// // handler = ZipHandler::new_from_reader(Cursor::new(data))?;
86 /// ```
87 pub fn new_from_reader(reader: R) -> Result<Self, Error> {
88 let archive = ZipArchive::new(reader)?;
89 Ok(ZipHandler { archive })
90 }
91
92 /// Locates the OPF (Open Package Format) file path.
93 ///
94 /// EPUB files contain a `META-INF/container.xml` file that specifies
95 /// the location of the OPF file. This method parses that XML and
96 /// returns the path to the OPF file.
97 ///
98 /// # Returns
99 ///
100 /// Returns the path to the OPF file as a string (e.g., "OEBPS/content.opf").
101 ///
102 /// # Errors
103 ///
104 /// Returns `Error::MissingContainer` if META-INF/container.xml is missing.
105 /// Returns `Error::MissingOpf` if the OPF path cannot be found.
106 pub fn get_opf_path(&mut self) -> Result<String, Error> {
107 let container_content = self.read_file("META-INF/container.xml")?;
108
109 let mut reader = quick_xml::Reader::from_str(&container_content);
110 let mut opf_path = String::new();
111
112 let mut buf = Vec::new();
113
114 loop {
115 match reader.read_event_into(&mut buf) {
116 Ok(quick_xml::events::Event::Start(ref e)) => {
117 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
118
119 if name == "rootfile" || name.ends_with(":rootfile") {
120 for attr_result in e.attributes() {
121 if let Ok(attr) = attr_result {
122 let attr_name =
123 String::from_utf8_lossy(attr.key.as_ref()).to_string();
124
125 if attr_name == "full-path" || attr_name.ends_with(":full-path") {
126 opf_path = attr
127 .decode_and_unescape_value(reader.decoder())?
128 .to_string();
129 break;
130 }
131 }
132 }
133 if !opf_path.is_empty() {
134 break;
135 }
136 }
137 }
138 Ok(quick_xml::events::Event::Empty(ref e)) => {
139 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
140
141 if name == "rootfile" || name.ends_with(":rootfile") {
142 for attr_result in e.attributes() {
143 if let Ok(attr) = attr_result {
144 let attr_name =
145 String::from_utf8_lossy(attr.key.as_ref()).to_string();
146
147 if attr_name == "full-path" || attr_name.ends_with(":full-path") {
148 opf_path = attr
149 .decode_and_unescape_value(reader.decoder())?
150 .to_string();
151 break;
152 }
153 }
154 }
155 if !opf_path.is_empty() {
156 break;
157 }
158 }
159 }
160 Ok(quick_xml::events::Event::Eof) => break,
161 Err(e) => return Err(Error::XmlError(e.to_string())),
162 _ => {}
163 }
164 buf.clear();
165 }
166
167 if opf_path.is_empty() {
168 return Err(Error::MissingOpf);
169 }
170
171 Ok(opf_path)
172 }
173
174 /// Reads a file from the ZIP archive as a UTF-8 string.
175 ///
176 /// # Arguments
177 ///
178 /// * `path` - The path to the file within the ZIP archive.
179 ///
180 /// # Returns
181 ///
182 /// Returns the file contents as a `String`.
183 ///
184 /// # Errors
185 ///
186 /// Returns an error if:
187 /// - The file does not exist in the archive
188 /// - The file cannot be read
189 /// - The file contains invalid UTF-8
190 pub fn read_file(&mut self, path: &str) -> Result<String, Error> {
191 let mut file = self.archive.by_name(path)?;
192 let mut content = String::new();
193 file.read_to_string(&mut content)?;
194 Ok(content)
195 }
196
197 /// Reads a file from the ZIP archive as raw bytes.
198 ///
199 /// This is useful for binary files like images and fonts.
200 ///
201 /// # Arguments
202 ///
203 /// * `path` - The path to the file within the ZIP archive.
204 ///
205 /// # Returns
206 ///
207 /// Returns the file contents as a `Vec<u8>`.
208 ///
209 /// # Errors
210 ///
211 /// Returns an error if the file does not exist or cannot be read.
212 pub fn read_file_as_bytes(&mut self, path: &str) -> Result<Vec<u8>, Error> {
213 let mut file = self.archive.by_name(path)?;
214 let mut content = Vec::new();
215 file.read_to_end(&mut content)?;
216 Ok(content)
217 }
218}