Skip to main content

epubie_lib/
epub.rs

1//! EPUB Parser Library
2//!
3//! This library provides functionality to parse EPUB files and extract:
4//! - Metadata (title, creator, language, identifier, date, description, cover)
5//! - Chapter information with titles from navigation file
6//! - Complete HTML content from all XHTML files for external parsing
7//! - Proper handling of Dublin Core metadata elements
8//! - Navigation file parsing to extract actual chapter titles
9//!
10//! The parser handles the standard EPUB 3.0 format including:
11//! - META-INF/container.xml parsing to locate the OPF file
12//! - OPF (Open Packaging Format) file parsing for metadata and manifest
13//! - Navigation file (nav.xhtml) parsing for chapter titles
14//! - Full XHTML content extraction for use with external HTML parsers
15//! - Fallback to manifest IDs when navigation titles are not available
16//!
17//! ## HTML Content Access
18//!
19//! Each `EpubFile` contains the complete XHTML content which can be parsed using
20//! external HTML parsing libraries like `scraper`, `select`, or `html5ever`:
21//!
22//! ```rust,ignore
23//! // Example using the scraper crate
24//! use scraper::{Html, Selector};
25//!
26//! for chapter in epub.get_chapters() {
27//!     for file in chapter.get_files() {
28//!         let document = Html::parse_document(file.get_parsable_html());
29//!         let selector = Selector::parse("p").unwrap();
30//!
31//!         for element in document.select(&selector) {
32//!             println!("Paragraph: {}", element.text().collect::<String>());
33//!         }
34//!     }
35//! }
36//! ```
37
38use regex;
39use serde::Deserialize;
40use std::collections::HashMap;
41use std::error;
42use std::fs::File;
43use std::io::Read;
44use zip::read::ZipArchive;
45
/// One content document taken from an EPUB archive.
///
/// Carries the manifest identity of the document (`id`, `href`,
/// `media_type`), an optional display title, and the document text.
#[derive(Debug, Clone)]
pub struct EpubFile {
    /// Manifest item ID of this document.
    pub id: String,
    /// Href of this document as written in the manifest.
    pub href: String,
    /// Title associated with this document, when one is known.
    pub title: Option<String>,
    /// Full text of the document.
    pub content: String,
    /// Media type declared for this document.
    pub media_type: String,
}

impl EpubFile {
    /// Returns the manifest ID.
    pub fn get_id(&self) -> &str {
        self.id.as_str()
    }

    /// Returns the href.
    pub fn get_href(&self) -> &str {
        self.href.as_str()
    }

    /// Returns the title, or `None` when the file has no title.
    pub fn get_title(&self) -> Option<&str> {
        self.title.as_deref()
    }

    /// Returns the full document text.
    pub fn get_content(&self) -> &str {
        self.content.as_str()
    }

    /// Returns the declared media type.
    pub fn get_media_type(&self) -> &str {
        self.media_type.as_str()
    }

    /// Returns the document text as raw bytes, for parsers that take `&[u8]`.
    pub fn get_html_bytes(&self) -> &[u8] {
        self.get_content().as_bytes()
    }

    /// Returns `true` when the media type is exactly `application/xhtml+xml`.
    pub fn is_html(&self) -> bool {
        matches!(self.media_type.as_str(), "application/xhtml+xml")
    }

    /// Returns the document text for use with an external HTML parser.
    ///
    /// This is the same string that `get_content` returns.
    pub fn get_parsable_html(&self) -> &str {
        self.get_content()
    }
}
92
93/// Represents a chapter that can contain multiple files
94pub struct Chapter {
95    title: String,
96    files: Vec<EpubFile>,
97}
98
99impl Chapter {
100    pub fn get_title(&self) -> &str {
101        &self.title
102    }
103
104    pub fn get_files(&self) -> &[EpubFile] {
105        &self.files
106    }
107
108    pub fn get_file_count(&self) -> usize {
109        self.files.len()
110    }
111}
112
/// A single table-of-contents row: a display title, the href it refers to,
/// and a nesting level.
#[derive(Debug, Clone)]
pub struct TocEntry {
    /// Text shown for this entry.
    pub title: String,
    /// Href this entry refers to.
    pub href: String,
    /// Nesting level of this entry.
    pub level: usize,
}

impl TocEntry {
    /// Returns the entry title.
    pub fn get_title(&self) -> &str {
        self.title.as_str()
    }

    /// Returns the entry href.
    pub fn get_href(&self) -> &str {
        self.href.as_str()
    }

    /// Returns the entry nesting level.
    pub fn get_level(&self) -> usize {
        let TocEntry { level, .. } = self;
        *level
    }
}
134
135/// Complete Table of Contents
136pub struct TableOfContents {
137    entries: Vec<TocEntry>,
138}
139
140impl TableOfContents {
141    pub fn new() -> Self {
142        TableOfContents {
143            entries: Vec::new(),
144        }
145    }
146
147    pub fn add_entry(&mut self, title: String, href: String, level: usize) {
148        self.entries.push(TocEntry { title, href, level });
149    }
150
151    pub fn get_entries(&self) -> &[TocEntry] {
152        &self.entries
153    }
154
155    pub fn get_entry_count(&self) -> usize {
156        self.entries.len()
157    }
158}
159
160// Structs for parsing container.xml
/// Deserialization target for `META-INF/container.xml`.
///
/// Within this file only the first rootfile's `full_path` is read from it,
/// to locate the OPF file inside the archive.
#[derive(Debug, Deserialize)]
struct Container {
    /// The list wrapper holding the declared rootfiles.
    #[serde(rename = "rootfiles")]
    rootfiles: RootFiles,
}
166
/// Wrapper around the `rootfile` entries of `container.xml`.
#[derive(Debug, Deserialize)]
struct RootFiles {
    /// Every `rootfile` entry, in document order.
    #[serde(rename = "rootfile")]
    rootfile: Vec<RootFile>,
}
172
/// A single `rootfile` entry of `container.xml`.
///
/// Both fields fall back to an empty string when absent (`default`).
#[derive(Debug, Deserialize)]
struct RootFile {
    /// Archive path of the OPF file; `Epub::new` opens this path.
    /// Presumably the `full-path` XML attribute (`@` prefix) - confirm
    /// against the serde-xml-rs version in use.
    #[serde(rename = "@full-path", default)]
    full_path: String,
    /// Declared media type of the rootfile. Not read by any code visible in
    /// this file.
    #[serde(rename = "@media-type", default)]
    media_type: String,
}
180
181// Structs for parsing OPF file
/// Deserialization target for the OPF package file.
///
/// All three sections are required; deserialization fails if one is missing.
#[derive(Debug, Deserialize)]
struct Package {
    /// Book metadata (title, creator, language, ...).
    metadata: OpfMetadata,
    /// Every resource declared by the package.
    manifest: Manifest,
    /// Ordered references into the manifest; drives chapter grouping.
    spine: Spine,
}
188
189#[derive(Debug, Deserialize)]
190struct OpfMetadata {
191    #[serde(rename = "dc:identifier", default)]
192    identifier: Vec<String>,
193    #[serde(rename = "dc:title")]
194    title: String,
195    #[serde(rename = "dc:creator")]
196    creator: String,
197    #[serde(rename = "dc:language")]
198    language: String,
199    #[serde(rename = "dc:date")]
200    date: String,
201    #[serde(rename = "dc:description")]
202    description: Option<String>,
203    #[serde(rename = "meta", default)]
204    meta: Vec<Meta>,
205}
206
207#[derive(Debug, Deserialize)]
208struct Meta {
209    #[serde(rename = "name")]
210    name: Option<String>,
211    #[serde(rename = "content")]
212    content: Option<String>,
213    #[serde(rename = "property")]
214    property: Option<String>,
215    #[serde(rename = "$text")]
216    value: Option<String>,
217}
218
/// The OPF manifest: the list of resources declared by the package.
#[derive(Debug, Deserialize)]
struct Manifest {
    /// Every `item` entry, in document order.
    #[serde(rename = "item")]
    item: Vec<ManifestItem>,
}
224
/// One resource declared in the OPF manifest.
#[derive(Debug, Deserialize)]
struct ManifestItem {
    /// Item ID; copied into `EpubFile::id` and matched against spine `idref`s.
    #[serde(rename = "@id")]
    id: String,
    /// Resource href; joined onto the OPF file's directory to form the
    /// archive path.
    #[serde(rename = "@href")]
    href: String,
    /// Media type; only `application/xhtml+xml` items are loaded as content.
    #[serde(rename = "@media-type")]
    media_type: String,
    /// Optional properties string; an item whose properties contain `nav`
    /// is treated as the navigation file.
    #[serde(rename = "@properties")]
    properties: Option<String>,
}
236
/// The OPF spine: an ordered list of references to manifest items.
#[derive(Debug, Deserialize)]
struct Spine {
    /// Every `itemref` entry, in document order; chapters are built by
    /// walking this list.
    #[serde(rename = "itemref")]
    itemref: Vec<ItemRef>,
}
242
/// One spine entry pointing at a manifest item.
#[derive(Debug, Deserialize)]
struct ItemRef {
    /// ID of the referenced manifest item; looked up against `EpubFile::id`.
    #[serde(rename = "@idref")]
    idref: String,
}
248
/// Book-level metadata held by an `Epub`.
///
/// The first five fields are always supplied through `Metadata::new`; the
/// remaining ones start out empty and may be assigned afterwards.
struct Metadata {
    // Always present.
    title: String,
    creator: String,
    language: String,
    identifier: String,
    date: String,
    // Optional; `None` until assigned.
    publisher: Option<String>,
    description: Option<String>,
    rights: Option<String>,
    cover: Option<String>,
    // Starts as an empty list.
    tags: Vec<String>,
}

impl Metadata {
    /// Builds a `Metadata` from the five mandatory values.
    ///
    /// `publisher`, `description`, `rights` and `cover` are set to `None`
    /// and `tags` to an empty vector.
    pub fn new(
        title: String,
        creator: String,
        language: String,
        identifier: String,
        date: String,
    ) -> Self {
        let tags = Vec::new();
        Self {
            publisher: None,
            description: None,
            rights: None,
            cover: None,
            tags,
            title,
            creator,
            language,
            identifier,
            date,
        }
    }
}
285
/// Main EPUB container that holds all parsed data.
///
/// Built by `Epub::new`; all fields are private and exposed through the
/// `get_*` accessor methods.
pub struct Epub {
    /// Book-level metadata taken from the OPF file.
    metadata: Metadata,
    /// Content files grouped into chapters by walking the spine.
    chapters: Vec<Chapter>,
    /// One entry per loaded content file.
    table_of_contents: TableOfContents,
    /// Every loaded XHTML content file (the navigation file is excluded).
    all_files: Vec<EpubFile>,
}
293
294impl Epub {
295    /// Creates a new Epub instance by parsing the EPUB file at the given path
296    ///
297    /// # Arguments
298    /// * `file_path` - Path to the EPUB file
299    ///
300    /// # Returns
301    /// * `Result<Epub, Box<dyn error::Error>>` - Parsed EPUB or error
302    pub fn new(file_path: String) -> Result<Epub, Box<dyn error::Error>> {
303        let file = File::open(file_path)?;
304        let mut archive = ZipArchive::new(file)?;
305
306        // Read and parse META-INF/container.xml
307        let container = {
308            let mut container_file = archive.by_name("META-INF/container.xml")?;
309            let mut xml = String::new();
310            container_file.read_to_string(&mut xml)?;
311            parse_container_xml(&xml)?
312        };
313
314        // Get the OPF path and parse OPF file
315        let opf_path = &container.rootfiles.rootfile[0].full_path;
316        let package = {
317            let mut opf_file = archive.by_name(&opf_path)?;
318            let mut xml = String::new();
319            opf_file.read_to_string(&mut xml)?;
320            parse_opf_xml(&xml)?
321        };
322
323        // Parse navigation file to get chapter titles first
324        let nav_titles = Self::parse_navigation(&mut archive, &package, &opf_path)?;
325
326        // Extract metadata from OPF
327        let mut metadata = Metadata::new(
328            package.metadata.title.clone(),
329            package.metadata.creator.clone(),
330            package.metadata.language.clone(),
331            package
332                .metadata
333                .identifier
334                .first()
335                .unwrap_or(&String::new())
336                .clone(),
337            package.metadata.date.clone(),
338        );
339
340        // Set optional metadata fields
341        metadata.description = package.metadata.description.clone();
342
343        // Find cover from meta tags
344        for meta in &package.metadata.meta {
345            if let (Some(name), Some(content)) = (&meta.name, &meta.content) {
346                if name == "cover" {
347                    metadata.cover = Some(content.clone());
348                }
349            }
350        }
351
352        // Parse all XHTML files and create EpubFile objects
353        let all_files = Self::parse_all_files(&mut archive, &package, &nav_titles, &opf_path)?;
354
355        // Create table of contents from navigation
356        let table_of_contents = Self::create_table_of_contents(&nav_titles, &all_files);
357
358        // Group files into chapters
359        let chapters = Self::group_files_into_chapters(&all_files, &package.spine);
360
361        Ok(Epub {
362            metadata,
363            chapters,
364            table_of_contents,
365            all_files,
366        })
367    }
368
369    // Getter methods for accessing parsed data
370    pub fn get_title(&self) -> &str {
371        &self.metadata.title
372    }
373
374    pub fn get_creator(&self) -> &str {
375        &self.metadata.creator
376    }
377
378    pub fn get_language(&self) -> &str {
379        &self.metadata.language
380    }
381
382    pub fn get_identifier(&self) -> &str {
383        &self.metadata.identifier
384    }
385
386    pub fn get_date(&self) -> &str {
387        &self.metadata.date
388    }
389
390    pub fn get_publisher(&self) -> Option<&str> {
391        self.metadata.publisher.as_deref()
392    }
393
394    pub fn get_description(&self) -> Option<&str> {
395        self.metadata.description.as_deref()
396    }
397
398    pub fn get_rights(&self) -> Option<&str> {
399        self.metadata.rights.as_deref()
400    }
401
402    pub fn get_cover(&self) -> Option<&str> {
403        self.metadata.cover.as_deref()
404    }
405
406    pub fn get_tags(&self) -> &[String] {
407        &self.metadata.tags
408    }
409
410    pub fn get_chapters(&self) -> &[Chapter] {
411        &self.chapters
412    }
413
414    pub fn get_chapter_count(&self) -> usize {
415        self.chapters.len()
416    }
417
418    pub fn get_table_of_contents(&self) -> &TableOfContents {
419        &self.table_of_contents
420    }
421
422    pub fn get_all_files(&self) -> &[EpubFile] {
423        &self.all_files
424    }
425
426    pub fn get_file_count(&self) -> usize {
427        self.all_files.len()
428    }
429
430    fn parse_navigation(
431        archive: &mut ZipArchive<File>,
432        package: &Package,
433        opf_path: &str,
434    ) -> Result<HashMap<String, String>, Box<dyn error::Error>> {
435        let mut nav_titles = HashMap::new();
436
437        // Find the navigation file in the manifest
438        if let Some(nav_item) = package.manifest.item.iter().find(|item| {
439            item.properties
440                .as_ref()
441                .map_or(false, |props| props.contains("nav"))
442        }) {
443            // Resolve the navigation file path relative to the OPF directory
444            let opf_dir = if let Some(slash_pos) = opf_path.rfind('/') {
445                &opf_path[..slash_pos + 1] // Include the trailing slash
446            } else {
447                "" // OPF is at root level
448            };
449            let nav_path = format!("{}{}", opf_dir, nav_item.href);
450
451            // Try to parse the navigation file
452            match archive.by_name(&nav_path) {
453                Ok(mut nav_file) => {
454                    let mut html = String::new();
455                    nav_file.read_to_string(&mut html)?;
456
457                    // Use simple regex pattern to extract href and text from <a> tags
458                    // Pattern: <a href="..." ...>TEXT</a>
459                    let pattern = r#"<a\s+href="([^"]+)"[^>]*>([^<]+)</a>"#;
460                    if let Ok(re) = regex::Regex::new(pattern) {
461                        for cap in re.captures_iter(&html) {
462                            if let (Some(href), Some(text)) = (cap.get(1), cap.get(2)) {
463                                let href_str = href.as_str().to_string();
464                                let text_str = text.as_str().trim().to_string();
465                                nav_titles.insert(href_str, text_str);
466                            }
467                        }
468                    }
469                }
470                Err(_) => {
471                    // Navigation file not found or couldn't be read, continue without titles
472                }
473            }
474        }
475
476        Ok(nav_titles)
477    }
478
479    fn parse_all_files(
480        archive: &mut ZipArchive<File>,
481        package: &Package,
482        nav_titles: &HashMap<String, String>,
483        opf_path: &str,
484    ) -> Result<Vec<EpubFile>, Box<dyn error::Error>> {
485        let mut files = Vec::new();
486
487        // Determine the OPF directory for resolving relative paths
488        let opf_dir = if let Some(slash_pos) = opf_path.rfind('/') {
489            &opf_path[..slash_pos + 1] // Include the trailing slash
490        } else {
491            "" // OPF is at root level
492        };
493
494        for manifest_item in &package.manifest.item {
495            if manifest_item.media_type == "application/xhtml+xml" {
496                // Skip navigation files
497                let is_nav = manifest_item
498                    .properties
499                    .as_ref()
500                    .map_or(false, |props| props.contains("nav"));
501
502                if is_nav {
503                    continue;
504                }
505
506                // Resolve the file path relative to the OPF directory
507                let file_path = format!("{}{}", opf_dir, manifest_item.href);
508
509                match archive.by_name(&file_path) {
510                    Ok(mut file) => {
511                        let mut content = String::new();
512                        file.read_to_string(&mut content)?;
513
514                        let epub_file = EpubFile {
515                            id: manifest_item.id.clone(),
516                            href: manifest_item.href.clone(),
517                            title: nav_titles.get(&manifest_item.href).cloned(),
518                            content,
519                            media_type: manifest_item.media_type.clone(),
520                        };
521
522                        files.push(epub_file);
523                    }
524                    Err(_) => {
525                        // File not found or couldn't be read, skip it
526                        continue;
527                    }
528                }
529            }
530        }
531
532        Ok(files)
533    }
534
535    fn create_table_of_contents(
536        _nav_titles: &HashMap<String, String>,
537        all_files: &[EpubFile],
538    ) -> TableOfContents {
539        let mut toc = TableOfContents::new();
540
541        // Add entries for all content files in spine order
542        for file in all_files {
543            let title = file.title.clone().unwrap_or_else(|| file.id.clone());
544            toc.add_entry(title, file.href.clone(), 0);
545        }
546
547        toc
548    }
549
550    fn group_files_into_chapters(all_files: &[EpubFile], spine: &Spine) -> Vec<Chapter> {
551        let mut chapters = Vec::new();
552        let mut current_chapter_files = Vec::new();
553        let mut current_chapter_title = String::new();
554
555        // Create a map from ID to file for easy lookup
556        let file_map: HashMap<String, &EpubFile> = all_files
557            .iter()
558            .map(|file| (file.id.clone(), file))
559            .collect();
560
561        for (_index, itemref) in spine.itemref.iter().enumerate() {
562            if let Some(file) = file_map.get(&itemref.idref) {
563                // Determine if this should start a new chapter
564                let should_start_new_chapter = if current_chapter_files.is_empty() {
565                    true
566                } else {
567                    // Start new chapter if:
568                    // 1. The file has a title from navigation
569                    // 2. The base name changes (e.g., chapter_1 vs chapter_2)
570                    file.title.is_some()
571                        && !Self::files_belong_to_same_chapter(&current_chapter_files[0], file)
572                };
573
574                if should_start_new_chapter && !current_chapter_files.is_empty() {
575                    // Finish current chapter
576                    let chapter = Chapter {
577                        title: current_chapter_title.clone(),
578                        files: current_chapter_files.clone(),
579                    };
580                    chapters.push(chapter);
581                    current_chapter_files.clear();
582                }
583
584                if current_chapter_files.is_empty() {
585                    // Starting a new chapter
586                    current_chapter_title = file.title.clone().unwrap_or_else(|| file.id.clone());
587                }
588
589                current_chapter_files.push((*file).clone());
590            }
591        }
592
593        // Add the last chapter if there are remaining files
594        if !current_chapter_files.is_empty() {
595            let chapter = Chapter {
596                title: current_chapter_title,
597                files: current_chapter_files,
598            };
599            chapters.push(chapter);
600        }
601
602        chapters
603    }
604
605    fn files_belong_to_same_chapter(file1: &EpubFile, file2: &EpubFile) -> bool {
606        // Extract base chapter name (everything before the last underscore and number)
607        let base1 = Self::extract_chapter_base(&file1.id);
608        let base2 = Self::extract_chapter_base(&file2.id);
609        base1 == base2
610    }
611
612    fn extract_chapter_base(id: &str) -> String {
613        // For IDs like "chapter_4_part1", extract "chapter_4"
614        // For IDs like "chapter_1", extract "chapter_1"
615        if let Some(last_underscore) = id.rfind('_') {
616            let after_underscore = &id[last_underscore + 1..];
617            // If what comes after the underscore starts with "part",
618            // return everything before the "_part" part
619            if after_underscore.starts_with("part") {
620                return id[..last_underscore].to_string();
621            }
622        }
623        id.to_string()
624    }
625
626    fn get_zip_archive(file_path: &str) -> Result<ZipArchive<File>, Box<dyn error::Error>> {
627        let file = File::open(file_path)?;
628        let archive = ZipArchive::new(file)?;
629        Ok(archive)
630    }
631}
632
633// Function to parse container.xml using serde-xml-rs
634fn parse_container_xml(xml: &str) -> Result<Container, Box<dyn std::error::Error>> {
635    let container: Container = serde_xml_rs::from_str(xml)?;
636    Ok(container)
637}
638
639// Function to parse OPF file using serde-xml-rs
640fn parse_opf_xml(xml: &str) -> Result<Package, Box<dyn std::error::Error>> {
641    let package: Package = serde_xml_rs::from_str(xml)?;
642    Ok(package)
643}