1use crate::content::Page;
2use crate::cover::Cover;
3use crate::image::Image;
4use crate::metadata::Metadata;
5use crate::toc::TocEntry;
6use crate::zip_handler::ZipHandler;
7use quick_xml::events::Event;
8use std::collections::HashMap;
9use std::io::Cursor;
10use std::path::{Path, PathBuf};
11
12#[derive(Debug)]
13pub struct Epub {
14 pub metadata: Metadata,
15 pub toc: Vec<TocEntry>,
16 pub pages: Vec<Page>,
17 pub cover: Cover,
18 pub images: Vec<Image>,
19}
20
21#[derive(Debug)]
22pub enum Error {
23 InvalidEpub(String),
24 IoError(std::io::Error),
25 ZipError(zip::result::ZipError),
26 XmlError(String),
27 MissingContainer,
28 MissingOpf,
29 MissingNcx,
30}
31
32impl From<std::io::Error> for Error {
33 fn from(err: std::io::Error) -> Self {
34 Error::IoError(err)
35 }
36}
37
38impl From<zip::result::ZipError> for Error {
39 fn from(err: zip::result::ZipError) -> Self {
40 Error::ZipError(err)
41 }
42}
43
44impl From<quick_xml::Error> for Error {
45 fn from(err: quick_xml::Error) -> Self {
46 Error::XmlError(err.to_string())
47 }
48}
49
50impl Epub {
51 pub fn parse(path: &Path) -> Result<Self, Error> {
52 let mut zip_handler = ZipHandler::new(path)?;
53 Self::parse_from_handler(&mut zip_handler)
54 }
55
56 pub fn parse_from_buffer(buffer: &[u8]) -> Result<Self, Error> {
57 let cursor = Cursor::new(buffer.to_vec());
58 let mut zip_handler = ZipHandler::new_from_reader(cursor)?;
59 Self::parse_from_handler(&mut zip_handler)
60 }
61
62 fn parse_from_handler<R: std::io::Read + std::io::Seek>(
63 zip_handler: &mut ZipHandler<R>,
64 ) -> Result<Self, Error> {
65 let opf_path = zip_handler.get_opf_path()?;
66 let opf_content = zip_handler.read_file(&opf_path)?;
67
68 let (metadata, manifest, spine, ncx_path, cover_id) = Self::parse_opf(&opf_content)?;
69
70 let toc = if let Some(ncx_ref) = ncx_path {
71 let ncx_path_full = Self::resolve_path(&opf_path, &ncx_ref);
72 let ncx_content = zip_handler.read_file(&ncx_path_full)?;
73 Self::parse_ncx(&ncx_content)?
74 } else {
75 Vec::new()
76 };
77
78 let mut pages = Vec::new();
79 for itemref in spine {
80 if let Some(manifest_item) = manifest.get(&itemref) {
81 let content_path = Self::resolve_path(&opf_path, &manifest_item.href);
82 let content = zip_handler.read_file(&content_path)?;
83 let text = Self::extract_text_from_html(&content)?;
84 pages.push(Page {
85 index: pages.len(),
86 content: text,
87 });
88 }
89 }
90
91 let mut cover = Cover::default();
92 if let Some(cover_id) = cover_id {
93 if let Some(cover_item) = manifest.get(&cover_id) {
94 let cover_path = Self::resolve_path(&opf_path, &cover_item.href);
95 match zip_handler.read_file_as_bytes(&cover_path) {
96 Ok(bytes) => {
97 cover.href = Some(cover_item.href.clone());
98 cover.content = Some(bytes);
99 }
100 Err(_) => {}
101 }
102 }
103 }
104
105 let mut images = Vec::new();
106 for (id, item) in &manifest {
107 if item._media_type.starts_with("image/") {
108 let image_path = Self::resolve_path(&opf_path, &item.href);
109 match zip_handler.read_file_as_bytes(&image_path) {
110 Ok(bytes) => {
111 images.push(Image {
112 id: id.clone(),
113 href: item.href.clone(),
114 media_type: item._media_type.clone(),
115 content: Some(bytes),
116 });
117 }
118 Err(_) => {
119 images.push(Image {
120 id: id.clone(),
121 href: item.href.clone(),
122 media_type: item._media_type.clone(),
123 content: None,
124 });
125 }
126 }
127 }
128 }
129
130 Ok(Epub {
131 metadata,
132 toc,
133 pages,
134 cover,
135 images,
136 })
137 }
138
139 fn parse_opf(
140 content: &str,
141 ) -> Result<
142 (
143 Metadata,
144 HashMap<String, ManifestItem>,
145 Vec<String>,
146 Option<String>,
147 Option<String>,
148 ),
149 Error,
150 > {
151 let mut reader = quick_xml::Reader::from_str(content);
152 let mut metadata = Metadata::new();
153 let mut manifest: HashMap<String, ManifestItem> = HashMap::new();
154 let mut spine: Vec<String> = Vec::new();
155 let mut ncx_path: Option<String> = None;
156 let mut cover_id: Option<String> = None;
157
158 let mut current_text_tag: Option<String> = None;
159
160 let mut buf = Vec::new();
161
162 loop {
163 match reader.read_event_into(&mut buf) {
164 Ok(Event::Start(ref e)) => {
165 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
166 if name.contains("title") {
167 current_text_tag = Some("title".to_string());
168 } else if name.contains("creator") {
169 current_text_tag = Some("author".to_string());
170 } else if name.contains("publisher") {
171 current_text_tag = Some("publisher".to_string());
172 } else if name.contains("language") {
173 current_text_tag = Some("language".to_string());
174 } else if name.contains("identifier") {
175 current_text_tag = Some("identifier".to_string());
176 } else if name.contains("date") {
177 current_text_tag = Some("date".to_string());
178 } else if name.contains("rights") {
179 current_text_tag = Some("rights".to_string());
180 }
181 }
182 Ok(Event::Empty(ref e)) => {
183 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
184 if name.contains("meta") {
185 let mut is_cover = false;
186 for attr_result in e.attributes() {
187 if let Ok(attr) = attr_result {
188 let attr_name =
189 String::from_utf8_lossy(attr.key.as_ref()).to_string();
190 if attr_name.contains("name") {
191 let value = attr
192 .decode_and_unescape_value(reader.decoder())?
193 .to_string();
194 if value == "cover" {
195 is_cover = true;
196 }
197 } else if attr_name.contains("content") {
198 if is_cover {
199 if let Some(val) =
200 attr.decode_and_unescape_value(reader.decoder()).ok()
201 {
202 cover_id = Some(val.to_string());
203 }
204 }
205 }
206 }
207 }
208 } else if name.contains("item") && !name.contains("itemref") {
209 let mut id = String::new();
210 let mut href = String::new();
211 let mut media_type = String::new();
212
213 for attr_result in e.attributes() {
214 if let Ok(attr) = attr_result {
215 let attr_name =
216 String::from_utf8_lossy(attr.key.as_ref()).to_string();
217 if attr_name == "id" || attr_name.ends_with(":id") {
218 if let Some(val) =
219 attr.decode_and_unescape_value(reader.decoder()).ok()
220 {
221 id = val.to_string();
222 }
223 } else if attr_name == "href" || attr_name.ends_with(":href") {
224 href = attr
225 .decode_and_unescape_value(reader.decoder())?
226 .to_string();
227 } else if attr_name == "media-type"
228 || attr_name.ends_with(":media-type")
229 {
230 media_type = attr
231 .decode_and_unescape_value(reader.decoder())?
232 .to_string();
233 }
234 }
235 }
236
237 if !id.is_empty() && !href.is_empty() {
238 if media_type == "application/x-dtbncx+xml" {
239 ncx_path = Some(href.clone());
240 }
241 manifest.insert(
242 id.clone(),
243 ManifestItem {
244 _id: id.clone(),
245 href,
246 _media_type: media_type,
247 },
248 );
249 }
250 } else if name.contains("itemref") {
251 let mut idref = String::new();
252
253 for attr_result in e.attributes() {
254 if let Ok(attr) = attr_result {
255 let attr_name =
256 String::from_utf8_lossy(attr.key.as_ref()).to_string();
257 if attr_name == "idref" || attr_name.ends_with(":idref") {
258 if let Some(val) =
259 attr.decode_and_unescape_value(reader.decoder()).ok()
260 {
261 idref = val.to_string();
262 }
263 break;
264 }
265 }
266 }
267
268 if !idref.is_empty() {
269 spine.push(idref);
270 }
271 }
272 }
273 Ok(Event::Text(e)) => {
274 if let Some(tag) = ¤t_text_tag {
275 let text = e.unescape()?.into_owned().trim().to_string();
276 if !text.is_empty() {
277 match tag.as_str() {
278 "title" => metadata.title = Some(text),
279 "author" => metadata.author = Some(text),
280 "publisher" => metadata.publisher = Some(text),
281 "language" => metadata.language = Some(text),
282 "identifier" => metadata.identifier = Some(text),
283 "date" => metadata.date = Some(text),
284 "rights" => metadata.rights = Some(text),
285 _ => {}
286 }
287 }
288 current_text_tag = None;
289 }
290 }
291 Ok(Event::End(_)) => {
292 current_text_tag = None;
293 }
294 Ok(Event::Eof) => break,
295 Err(e) => return Err(Error::XmlError(e.to_string())),
296 _ => {}
297 }
298 buf.clear();
299 }
300
301 Ok((metadata, manifest, spine, ncx_path, cover_id))
302 }
303
304 fn parse_ncx(content: &str) -> Result<Vec<TocEntry>, Error> {
305 let mut reader = quick_xml::Reader::from_str(content);
306 let mut toc = Vec::new();
307 let mut stack: Vec<TocEntry> = Vec::new();
308
309 let mut buf = Vec::new();
310 let mut in_nav_label = false;
311 let mut in_text = false;
312
313 loop {
314 match reader.read_event_into(&mut buf) {
315 Ok(Event::Start(ref e)) => {
316 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
317 if name == "navPoint" {
318 let entry = TocEntry {
319 label: String::new(),
320 href: String::new(),
321 children: Vec::new(),
322 };
323 stack.push(entry);
324 } else if name == "navLabel" {
325 in_nav_label = true;
326 } else if name == "text" && in_nav_label {
327 in_text = true;
328 }
329 }
330 Ok(Event::End(ref e)) => {
331 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
332 if name == "navPoint" {
333 if let Some(entry) = stack.pop() {
334 if let Some(parent) = stack.last_mut() {
335 parent.children.push(entry);
336 } else {
337 toc.push(entry);
338 }
339 }
340 } else if name == "navLabel" {
341 in_nav_label = false;
342 } else if name == "text" && in_nav_label {
343 in_text = false;
344 }
345 }
346 Ok(Event::Text(e)) => {
347 if in_text {
348 if let Some(entry) = stack.last_mut() {
349 entry.label = e.unescape()?.into_owned();
350 }
351 }
352 }
353 Ok(Event::Empty(ref e)) => {
354 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
355 if name == "content" {
356 if let Some(src) = e.try_get_attribute("src")? {
357 if let Some(entry) = stack.last_mut() {
358 entry.href =
359 src.decode_and_unescape_value(reader.decoder())?.to_string();
360 }
361 }
362 }
363 }
364 Ok(Event::Eof) => break,
365 Err(e) => return Err(Error::XmlError(e.to_string())),
366 _ => {}
367 }
368 buf.clear();
369 }
370
371 Ok(toc)
372 }
373
374 fn extract_text_from_html(content: &str) -> Result<String, Error> {
375 let mut reader = quick_xml::Reader::from_str(content);
376 let mut text = String::new();
377 let skip_tags: Vec<Vec<u8>> = vec![b"script".to_vec(), b"style".to_vec(), b"head".to_vec()];
378 let mut in_skip_tag = false;
379
380 let mut buf = Vec::new();
381
382 loop {
383 match reader.read_event_into(&mut buf) {
384 Ok(Event::Start(ref e)) => {
385 let tag = e.name().as_ref().to_vec();
386 if skip_tags.contains(&tag) {
387 in_skip_tag = true;
388 } else if tag.as_slice() == b"p"
389 || tag.as_slice() == b"div"
390 || tag.as_slice() == b"br"
391 || tag.as_slice() == b"li"
392 {
393 text.push('\n');
394 }
395 }
396 Ok(Event::End(ref e)) => {
397 let tag = e.name().as_ref().to_vec();
398 if skip_tags.contains(&tag) {
399 in_skip_tag = false;
400 }
401 }
402 Ok(Event::Text(e)) => {
403 if !in_skip_tag {
404 let t = e.unescape()?.into_owned();
405 let trimmed: String = t.chars().filter(|c| !c.is_control()).collect();
406 text.push_str(&trimmed);
407 text.push(' ');
408 }
409 }
410 Ok(Event::Eof) => break,
411 Err(e) => return Err(Error::XmlError(e.to_string())),
412 _ => {}
413 }
414 buf.clear();
415 }
416
417 Ok(text
418 .lines()
419 .map(|l| l.trim())
420 .filter(|l| !l.is_empty())
421 .collect::<Vec<_>>()
422 .join("\n"))
423 }
424
/// Resolves `href` against the directory of `base_path` and returns a
/// normalised, `/`-separated archive path.
///
/// * The last segment of `base_path` (the file name) is dropped.
/// * `.` and empty segments are removed; `..` removes the preceding segment
///   and is ignored at the top, so the result never climbs above the root.
/// * Both `/` and `\` are accepted as separators in either argument, and the
///   result is the same on every platform.
/// * When `href` starts with a separator it replaces the base directory.
///   A leading `/` is kept in the result when `href`, or otherwise
///   `base_path`, starts with a separator.
fn resolve_path(base_path: &str, href: &str) -> String {
    const SEPARATORS: &[char] = &['/', '\\'];

    let href_is_rooted = href.starts_with(SEPARATORS);
    let base_dir = if href_is_rooted {
        ""
    } else {
        base_path
            .rfind(SEPARATORS)
            .map_or("", |idx| &base_path[..idx])
    };
    let rooted = href_is_rooted || base_path.starts_with(SEPARATORS);

    let mut segments: Vec<&str> = Vec::new();
    for segment in base_dir.split(SEPARATORS).chain(href.split(SEPARATORS)) {
        match segment {
            "" | "." => {}
            ".." => {
                segments.pop();
            }
            other => segments.push(other),
        }
    }

    let joined = segments.join("/");
    if rooted {
        format!("/{}", joined)
    } else {
        joined
    }
}
431}
432
/// One `item` entry of the OPF manifest.
#[derive(Debug, Clone)]
struct ManifestItem {
    /// Value of the item's `id` attribute; the manifest map uses the same
    /// value as its key.
    _id: String,
    /// Value of the item's `href` attribute, as written in the OPF.
    href: String,
    /// Value of the item's `media-type` attribute; empty when absent.
    _media_type: String,
}