1use crate::content::Page;
2use crate::cover::Cover;
3use crate::image::Image;
4use crate::metadata::Metadata;
5use crate::toc::TocEntry;
6use crate::zip_handler::ZipHandler;
7use quick_xml::events::Event;
8use std::collections::HashMap;
9use std::io::Cursor;
10use std::path::{Path, PathBuf};
11
12#[derive(Debug)]
13pub struct Epub {
14 pub metadata: Metadata,
15 pub toc: Vec<TocEntry>,
16 pub pages: Vec<Page>,
17 pub cover: Cover,
18 pub images: Vec<Image>,
19}
20
21#[derive(Debug)]
22pub enum Error {
23 InvalidEpub(String),
24 IoError(std::io::Error),
25 ZipError(zip::result::ZipError),
26 XmlError(String),
27 MissingContainer,
28 MissingOpf,
29 MissingNcx,
30}
31
32impl From<std::io::Error> for Error {
33 fn from(err: std::io::Error) -> Self {
34 Error::IoError(err)
35 }
36}
37
38impl From<zip::result::ZipError> for Error {
39 fn from(err: zip::result::ZipError) -> Self {
40 Error::ZipError(err)
41 }
42}
43
44impl From<quick_xml::Error> for Error {
45 fn from(err: quick_xml::Error) -> Self {
46 Error::XmlError(err.to_string())
47 }
48}
49
50impl Epub {
51 pub fn parse(path: &Path) -> Result<Self, Error> {
52 let mut zip_handler = ZipHandler::new(path)?;
53 Self::parse_from_handler(&mut zip_handler)
54 }
55
56 pub fn parse_from_buffer(buffer: &[u8]) -> Result<Self, Error> {
57 let cursor = Cursor::new(buffer.to_vec());
58 let mut zip_handler = ZipHandler::new_from_reader(cursor)?;
59 Self::parse_from_handler(&mut zip_handler)
60 }
61
62 fn parse_from_handler<R: std::io::Read + std::io::Seek>(
63 zip_handler: &mut ZipHandler<R>,
64 ) -> Result<Self, Error> {
65 let opf_path = zip_handler.get_opf_path()?;
66 let opf_content = zip_handler.read_file(&opf_path)?;
67
68 let (metadata, manifest, spine, ncx_path, cover_id) = Self::parse_opf(&opf_content)?;
69
70 let toc = if let Some(ncx_ref) = ncx_path {
71 let ncx_path_full = Self::resolve_path(&opf_path, &ncx_ref);
72 let ncx_content = zip_handler.read_file(&ncx_path_full)?;
73 Self::parse_ncx(&ncx_content)?
74 } else {
75 Vec::new()
76 };
77
78 let mut pages = Vec::new();
79 for itemref in spine {
80 if let Some(manifest_item) = manifest.get(&itemref) {
81 let content_path = Self::resolve_path(&opf_path, &manifest_item.href);
82 let content = zip_handler.read_file(&content_path)?;
83 let text = Self::extract_text_from_html(&content)?;
84 pages.push(Page {
85 index: pages.len(),
86 content: text,
87 });
88 }
89 }
90
91 let mut cover = Cover::default();
92 if let Some(cover_id) = cover_id {
93 if let Some(cover_item) = manifest.get(&cover_id) {
94 let cover_path = Self::resolve_path(&opf_path, &cover_item.href);
95 match zip_handler.read_file_as_bytes(&cover_path) {
96 Ok(bytes) => {
97 cover.href = Some(cover_item.href.clone());
98 cover.content = Some(bytes);
99 }
100 Err(_) => {}
101 }
102 }
103 }
104
105 let mut images = Vec::new();
106 for (id, item) in &manifest {
107 if item._media_type.starts_with("image/") {
108 let image_path = Self::resolve_path(&opf_path, &item.href);
109 match zip_handler.read_file_as_bytes(&image_path) {
110 Ok(bytes) => {
111 images.push(Image {
112 id: id.clone(),
113 href: item.href.clone(),
114 media_type: item._media_type.clone(),
115 content: Some(bytes),
116 });
117 }
118 Err(_) => {
119 images.push(Image {
120 id: id.clone(),
121 href: item.href.clone(),
122 media_type: item._media_type.clone(),
123 content: None,
124 });
125 }
126 }
127 }
128 }
129
130 Ok(Epub {
131 metadata,
132 toc,
133 pages,
134 cover,
135 images,
136 })
137 }
138
139 fn parse_opf(
140 content: &str,
141 ) -> Result<
142 (
143 Metadata,
144 HashMap<String, ManifestItem>,
145 Vec<String>,
146 Option<String>,
147 Option<String>,
148 ),
149 Error,
150 > {
151 let mut reader = quick_xml::Reader::from_str(content);
152 let mut metadata = Metadata::new();
153 let mut manifest: HashMap<String, ManifestItem> = HashMap::new();
154 let mut spine: Vec<String> = Vec::new();
155 let mut ncx_path: Option<String> = None;
156 let mut cover_id: Option<String> = None;
157
158 let mut current_text_tag: Option<String> = None;
159
160 let mut buf = Vec::new();
161
162 loop {
163 match reader.read_event_into(&mut buf) {
164 Ok(Event::Start(ref e)) => {
165 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
166 if name.contains("title") {
167 current_text_tag = Some("title".to_string());
168 } else if name.contains("creator") {
169 current_text_tag = Some("author".to_string());
170 } else if name.contains("publisher") {
171 current_text_tag = Some("publisher".to_string());
172 } else if name.contains("language") {
173 current_text_tag = Some("language".to_string());
174 } else if name.contains("identifier") {
175 current_text_tag = Some("identifier".to_string());
176 } else if name.contains("date") {
177 current_text_tag = Some("date".to_string());
178 } else if name.contains("rights") {
179 current_text_tag = Some("rights".to_string());
180 }
181 }
182 Ok(Event::Empty(ref e)) => {
183 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
184 if name.contains("meta") {
185 for attr_result in e.attributes() {
186 if let Ok(attr) = attr_result {
187 let attr_name =
188 String::from_utf8_lossy(attr.key.as_ref()).to_string();
189 if attr_name.contains("content") {
190 if let Some(val) =
191 attr.decode_and_unescape_value(reader.decoder()).ok()
192 {
193 cover_id = Some(val.to_string());
194 }
195 }
196 }
197 }
198 } else if name.contains("item") && !name.contains("itemref") {
199 let mut id = String::new();
200 let mut href = String::new();
201 let mut media_type = String::new();
202
203 for attr_result in e.attributes() {
204 if let Ok(attr) = attr_result {
205 let attr_name =
206 String::from_utf8_lossy(attr.key.as_ref()).to_string();
207 if attr_name == "id" || attr_name.ends_with(":id") {
208 if let Some(val) =
209 attr.decode_and_unescape_value(reader.decoder()).ok()
210 {
211 id = val.to_string();
212 }
213 } else if attr_name == "href" || attr_name.ends_with(":href") {
214 href = attr
215 .decode_and_unescape_value(reader.decoder())?
216 .to_string();
217 } else if attr_name == "media-type"
218 || attr_name.ends_with(":media-type")
219 {
220 media_type = attr
221 .decode_and_unescape_value(reader.decoder())?
222 .to_string();
223 }
224 }
225 }
226
227 if !id.is_empty() && !href.is_empty() {
228 if media_type == "application/x-dtbncx+xml" {
229 ncx_path = Some(href.clone());
230 }
231 manifest.insert(
232 id.clone(),
233 ManifestItem {
234 _id: id.clone(),
235 href,
236 _media_type: media_type,
237 },
238 );
239 }
240 } else if name.contains("itemref") {
241 let mut idref = String::new();
242
243 for attr_result in e.attributes() {
244 if let Ok(attr) = attr_result {
245 let attr_name =
246 String::from_utf8_lossy(attr.key.as_ref()).to_string();
247 if attr_name == "idref" || attr_name.ends_with(":idref") {
248 if let Some(val) =
249 attr.decode_and_unescape_value(reader.decoder()).ok()
250 {
251 idref = val.to_string();
252 }
253 break;
254 }
255 }
256 }
257
258 if !idref.is_empty() {
259 spine.push(idref);
260 }
261 }
262 }
263 Ok(Event::Text(e)) => {
264 if let Some(tag) = ¤t_text_tag {
265 let text = e.unescape()?.into_owned().trim().to_string();
266 if !text.is_empty() {
267 match tag.as_str() {
268 "title" => metadata.title = Some(text),
269 "author" => metadata.author = Some(text),
270 "publisher" => metadata.publisher = Some(text),
271 "language" => metadata.language = Some(text),
272 "identifier" => metadata.identifier = Some(text),
273 "date" => metadata.date = Some(text),
274 "rights" => metadata.rights = Some(text),
275 _ => {}
276 }
277 }
278 current_text_tag = None;
279 }
280 }
281 Ok(Event::End(_)) => {
282 current_text_tag = None;
283 }
284 Ok(Event::Eof) => break,
285 Err(e) => return Err(Error::XmlError(e.to_string())),
286 _ => {}
287 }
288 buf.clear();
289 }
290
291 Ok((metadata, manifest, spine, ncx_path, cover_id))
292 }
293
294 fn parse_ncx(content: &str) -> Result<Vec<TocEntry>, Error> {
295 let mut reader = quick_xml::Reader::from_str(content);
296 let mut toc = Vec::new();
297 let mut stack: Vec<TocEntry> = Vec::new();
298
299 let mut buf = Vec::new();
300 let mut in_nav_label = false;
301 let mut in_text = false;
302
303 loop {
304 match reader.read_event_into(&mut buf) {
305 Ok(Event::Start(ref e)) => {
306 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
307 if name == "navPoint" {
308 let entry = TocEntry {
309 label: String::new(),
310 href: String::new(),
311 children: Vec::new(),
312 };
313 stack.push(entry);
314 } else if name == "navLabel" {
315 in_nav_label = true;
316 } else if name == "text" && in_nav_label {
317 in_text = true;
318 }
319 }
320 Ok(Event::End(ref e)) => {
321 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
322 if name == "navPoint" {
323 if let Some(entry) = stack.pop() {
324 if let Some(parent) = stack.last_mut() {
325 parent.children.push(entry);
326 } else {
327 toc.push(entry);
328 }
329 }
330 } else if name == "navLabel" {
331 in_nav_label = false;
332 } else if name == "text" && in_nav_label {
333 in_text = false;
334 }
335 }
336 Ok(Event::Text(e)) => {
337 if in_text {
338 if let Some(entry) = stack.last_mut() {
339 entry.label = e.unescape()?.into_owned();
340 }
341 }
342 }
343 Ok(Event::Empty(ref e)) => {
344 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
345 if name == "content" {
346 if let Some(src) = e.try_get_attribute("src")? {
347 if let Some(entry) = stack.last_mut() {
348 entry.href =
349 src.decode_and_unescape_value(reader.decoder())?.to_string();
350 }
351 }
352 }
353 }
354 Ok(Event::Eof) => break,
355 Err(e) => return Err(Error::XmlError(e.to_string())),
356 _ => {}
357 }
358 buf.clear();
359 }
360
361 Ok(toc)
362 }
363
364 fn extract_text_from_html(content: &str) -> Result<String, Error> {
365 let mut reader = quick_xml::Reader::from_str(content);
366 let mut text = String::new();
367 let skip_tags: Vec<Vec<u8>> = vec![b"script".to_vec(), b"style".to_vec(), b"head".to_vec()];
368 let mut in_skip_tag = false;
369
370 let mut buf = Vec::new();
371
372 loop {
373 match reader.read_event_into(&mut buf) {
374 Ok(Event::Start(ref e)) => {
375 let tag = e.name().as_ref().to_vec();
376 if skip_tags.contains(&tag) {
377 in_skip_tag = true;
378 } else if tag.as_slice() == b"p"
379 || tag.as_slice() == b"div"
380 || tag.as_slice() == b"br"
381 || tag.as_slice() == b"li"
382 {
383 text.push('\n');
384 }
385 }
386 Ok(Event::End(ref e)) => {
387 let tag = e.name().as_ref().to_vec();
388 if skip_tags.contains(&tag) {
389 in_skip_tag = false;
390 }
391 }
392 Ok(Event::Text(e)) => {
393 if !in_skip_tag {
394 let t = e.unescape()?.into_owned();
395 let trimmed: String = t.chars().filter(|c| !c.is_control()).collect();
396 text.push_str(&trimmed);
397 text.push(' ');
398 }
399 }
400 Ok(Event::Eof) => break,
401 Err(e) => return Err(Error::XmlError(e.to_string())),
402 _ => {}
403 }
404 buf.clear();
405 }
406
407 Ok(text
408 .lines()
409 .map(|l| l.trim())
410 .filter(|l| !l.is_empty())
411 .collect::<Vec<_>>()
412 .join("\n"))
413 }
414
415 fn resolve_path(base_path: &str, href: &str) -> String {
416 let base = PathBuf::from(base_path);
417 let parent = base.parent().unwrap_or(base.as_path());
418 let resolved = parent.join(href);
419 resolved.to_string_lossy().to_string().replace('\\', "/")
420 }
421}
422
/// One `<item>` entry of the OPF manifest.
#[derive(Clone, Debug)]
struct ManifestItem {
    /// Manifest id; the same value is used as the key of the manifest map.
    _id: String,
    /// Location of the resource as written in the OPF; it is resolved against
    /// the OPF's own path before the archive is read.
    href: String,
    /// Declared media type, e.g. `application/x-dtbncx+xml` or `image/...`.
    _media_type: String,
}