use regex;
use serde::Deserialize;
use std::collections::HashMap;
use std::error;
use std::fs::File;
use std::io::Read;
use zip::read::ZipArchive;
/// One XHTML content document read out of the EPUB archive.
#[derive(Debug, Clone)]
pub struct EpubFile {
    /// Manifest item id of this document.
    pub id: String,
    /// Manifest href of this document (as written in the OPF manifest).
    pub href: String,
    /// Title found for this href in the navigation document, if any.
    pub title: Option<String>,
    /// Full text of the document as read from the archive.
    pub content: String,
    /// Media type declared for the manifest item.
    pub media_type: String,
}

impl EpubFile {
    /// Returns the manifest id.
    pub fn get_id(&self) -> &str {
        self.id.as_str()
    }

    /// Returns the manifest href.
    pub fn get_href(&self) -> &str {
        self.href.as_str()
    }

    /// Returns the navigation title, or `None` when no title is stored.
    pub fn get_title(&self) -> Option<&str> {
        self.title.as_deref()
    }

    /// Returns the document text.
    pub fn get_content(&self) -> &str {
        self.content.as_str()
    }

    /// Returns the declared media type.
    pub fn get_media_type(&self) -> &str {
        self.media_type.as_str()
    }

    /// Returns the document text as raw UTF-8 bytes.
    pub fn get_html_bytes(&self) -> &[u8] {
        self.get_content().as_bytes()
    }

    /// Reports whether the media type is exactly `application/xhtml+xml`.
    pub fn is_html(&self) -> bool {
        matches!(self.media_type.as_str(), "application/xhtml+xml")
    }

    /// Returns the document text unchanged; identical to [`EpubFile::get_content`].
    pub fn get_parsable_html(&self) -> &str {
        self.get_content()
    }
}
/// A titled group of consecutive content files.
///
/// Derives `Debug` and `Clone` so it can be printed and copied like the
/// other public data types in this file (`EpubFile`, `TocEntry`).
#[derive(Debug, Clone)]
pub struct Chapter {
    /// Title shown for the chapter.
    title: String,
    /// Content files belonging to the chapter, in insertion order.
    files: Vec<EpubFile>,
}

impl Chapter {
    /// Returns the chapter title.
    pub fn get_title(&self) -> &str {
        &self.title
    }

    /// Returns the files that make up the chapter.
    pub fn get_files(&self) -> &[EpubFile] {
        &self.files
    }

    /// Returns how many files the chapter contains.
    pub fn get_file_count(&self) -> usize {
        self.files.len()
    }
}
/// A single row of the table of contents.
#[derive(Debug, Clone)]
pub struct TocEntry {
    /// Text displayed for the entry.
    pub title: String,
    /// Href of the content file the entry points at.
    pub href: String,
    /// Nesting depth stored for the entry.
    pub level: usize,
}

impl TocEntry {
    /// Returns the entry's display text.
    pub fn get_title(&self) -> &str {
        self.title.as_str()
    }

    /// Returns the entry's target href.
    pub fn get_href(&self) -> &str {
        self.href.as_str()
    }

    /// Returns the entry's nesting depth.
    pub fn get_level(&self) -> usize {
        let TocEntry { level, .. } = self;
        *level
    }
}
/// An ordered list of [`TocEntry`] rows.
///
/// Derives `Default` (an empty table) to accompany the argument-less
/// [`TableOfContents::new`], plus `Debug` and `Clone` like the entry type.
#[derive(Debug, Clone, Default)]
pub struct TableOfContents {
    /// Entries in the order they were added.
    entries: Vec<TocEntry>,
}

impl TableOfContents {
    /// Creates an empty table of contents.
    pub fn new() -> Self {
        Self::default()
    }

    /// Appends an entry built from `title`, `href` and `level`.
    pub fn add_entry(&mut self, title: String, href: String, level: usize) {
        self.entries.push(TocEntry { title, href, level });
    }

    /// Returns all entries in insertion order.
    pub fn get_entries(&self) -> &[TocEntry] {
        &self.entries
    }

    /// Returns the number of entries.
    pub fn get_entry_count(&self) -> usize {
        self.entries.len()
    }
}
/// Deserialized form of `META-INF/container.xml`.
#[derive(Debug, Deserialize)]
struct Container {
    /// The `rootfiles` child; the field name already matches the XML name,
    /// so no serde rename is needed.
    rootfiles: RootFiles,
}
/// The `rootfiles` wrapper inside the container document.
#[derive(Debug, Deserialize)]
struct RootFiles {
    /// Every `rootfile` child, in document order; the field name already
    /// matches the XML name, so no serde rename is needed.
    rootfile: Vec<RootFile>,
}
/// One `rootfile` entry of the container document.
///
/// NOTE(review): the `@` prefix presumably marks an XML attribute in the
/// `serde_xml_rs` version in use — confirm against the crate version.
#[derive(Debug, Deserialize)]
struct RootFile {
// Deserialized from `@full-path`; falls back to an empty string when absent
// (serde `default`). `Epub::new` uses the first entry's value as the archive
// path of the OPF package document.
#[serde(rename = "@full-path", default)]
full_path: String,
// Deserialized from `@media-type`; falls back to an empty string when absent.
// Not read anywhere in the visible code.
#[serde(rename = "@media-type", default)]
media_type: String,
}
/// Deserialized OPF package document: metadata, manifest and spine.
/// All three children are required; deserialization fails if one is missing.
#[derive(Debug, Deserialize)]
struct Package {
// Bibliographic metadata (title, creator, identifiers, ...).
metadata: OpfMetadata,
// List of every resource declared by the package.
manifest: Manifest,
// Ordered list of manifest ids; used to group files into chapters.
spine: Spine,
}
/// Deserialized `metadata` section of the OPF package document.
///
/// `dc:creator` and `dc:date` are optional in EPUB package metadata, so they
/// fall back to an empty string when absent instead of failing the whole
/// parse. `dc:title` and `dc:language` stay mandatory.
#[derive(Debug, Deserialize)]
struct OpfMetadata {
    /// Every `dc:identifier` element; empty when none is present.
    #[serde(rename = "dc:identifier", default)]
    identifier: Vec<String>,
    /// The `dc:title` element (required).
    #[serde(rename = "dc:title")]
    title: String,
    /// The `dc:creator` element; empty string when the book declares none.
    #[serde(rename = "dc:creator", default)]
    creator: String,
    /// The `dc:language` element (required).
    #[serde(rename = "dc:language")]
    language: String,
    /// The `dc:date` element; empty string when the book declares none.
    #[serde(rename = "dc:date", default)]
    date: String,
    /// The `dc:description` element, if present.
    #[serde(rename = "dc:description")]
    description: Option<String>,
    /// Every `meta` element; empty when none is present.
    #[serde(default)]
    meta: Vec<Meta>,
}
/// One `meta` element of the OPF metadata section.
///
/// `name`, `content` and `property` are XML attributes, so they carry the
/// same `@` prefix as every other attribute field in this file
/// (`@full-path`, `@id`, `@href`, ...). Without the prefix they would
/// presumably be looked up as child elements and never match, which would
/// leave the `name="cover"` lookup in `Epub::new` without a result.
#[derive(Debug, Deserialize)]
struct Meta {
    /// The `name` attribute, if present.
    #[serde(rename = "@name")]
    name: Option<String>,
    /// The `content` attribute, if present.
    #[serde(rename = "@content")]
    content: Option<String>,
    /// The `property` attribute, if present.
    #[serde(rename = "@property")]
    property: Option<String>,
    /// Text content of the element, if any.
    // NOTE(review): `$text` is the text-node marker used by some serde XML
    // backends; confirm it is the one understood by the `serde_xml_rs`
    // version in use.
    #[serde(rename = "$text")]
    value: Option<String>,
}
/// Deserialized `manifest` section of the OPF package document.
#[derive(Debug, Deserialize)]
struct Manifest {
    /// Every `item` child, in document order; the field name already matches
    /// the XML name, so no serde rename is needed. At least the field itself
    /// must be present for deserialization to succeed.
    item: Vec<ManifestItem>,
}
/// One `item` entry of the OPF manifest.
///
/// NOTE(review): the `@` prefix presumably marks an XML attribute in the
/// `serde_xml_rs` version in use — confirm against the crate version.
#[derive(Debug, Deserialize)]
struct ManifestItem {
// Identifier that spine `itemref`s refer to (required).
#[serde(rename = "@id")]
id: String,
// Location of the resource; the visible code joins it onto the OPF's
// directory to build the archive path (required).
#[serde(rename = "@href")]
href: String,
// Declared media type; only `application/xhtml+xml` items are loaded
// by `parse_all_files` (required).
#[serde(rename = "@media-type")]
media_type: String,
// Optional `properties` attribute; the visible code checks it for `nav`
// to locate the navigation document.
#[serde(rename = "@properties")]
properties: Option<String>,
}
/// Deserialized `spine` section of the OPF package document.
#[derive(Debug, Deserialize)]
struct Spine {
    /// Every `itemref` child, in document order; the field name already
    /// matches the XML name, so no serde rename is needed.
    itemref: Vec<ItemRef>,
}
/// One `itemref` entry of the spine.
#[derive(Debug, Deserialize)]
struct ItemRef {
// Deserialized from `@idref` (required); matched against `EpubFile::id`
// when files are grouped into chapters.
#[serde(rename = "@idref")]
idref: String,
}
/// Book-level metadata held by an `Epub`.
struct Metadata {
    /// Book title.
    title: String,
    /// Book creator.
    creator: String,
    /// Book language.
    language: String,
    /// Book identifier.
    identifier: String,
    /// Book date.
    date: String,
    /// Publisher; `None` until set by the caller.
    publisher: Option<String>,
    /// Description; `None` until set by the caller.
    description: Option<String>,
    /// Rights statement; `None` until set by the caller.
    rights: Option<String>,
    /// Cover reference; `None` until set by the caller.
    cover: Option<String>,
    /// Tags; empty until filled by the caller.
    tags: Vec<String>,
}

impl Metadata {
    /// Builds metadata from the five mandatory values.
    ///
    /// Every optional field starts as `None` and `tags` starts empty.
    pub fn new(
        title: String,
        creator: String,
        language: String,
        identifier: String,
        date: String,
    ) -> Self {
        let (publisher, description, rights, cover) = (None, None, None, None);
        Self {
            title,
            creator,
            language,
            identifier,
            date,
            publisher,
            description,
            rights,
            cover,
            tags: Vec::new(),
        }
    }
}
/// A parsed EPUB book: its metadata, chapters, table of contents and the
/// content files they were built from. Constructed by `Epub::new`.
pub struct Epub {
// Book-level metadata exposed through the `get_*` accessors.
metadata: Metadata,
// Content files grouped into chapters following the spine order.
chapters: Vec<Chapter>,
// One entry per loaded content file.
table_of_contents: TableOfContents,
// Every loaded XHTML content file (the navigation document is excluded).
all_files: Vec<EpubFile>,
}
impl Epub {
/// Opens the EPUB archive at `file_path` and parses it into an [`Epub`].
///
/// Reads `META-INF/container.xml` to locate the OPF package document, parses
/// the package, then loads navigation titles, content files, the table of
/// contents and the chapter grouping.
///
/// # Errors
///
/// Returns an error when the file cannot be opened or is not a valid zip
/// archive, when `META-INF/container.xml` or the OPF document is missing or
/// cannot be read or parsed, or when the container declares no rootfile
/// (previously this case panicked on an out-of-bounds index).
pub fn new(file_path: String) -> Result<Epub, Box<dyn error::Error>> {
    let file = File::open(file_path)?;
    let mut archive = ZipArchive::new(file)?;

    let container = {
        let mut container_file = archive.by_name("META-INF/container.xml")?;
        let mut xml = String::new();
        container_file.read_to_string(&mut xml)?;
        parse_container_xml(&xml)?
    };

    // Only the first rootfile is used; an empty list is reported as an error
    // rather than indexing `[0]` and panicking.
    let opf_path = container
        .rootfiles
        .rootfile
        .first()
        .map(|rootfile| rootfile.full_path.as_str())
        .ok_or("META-INF/container.xml does not declare a rootfile")?;

    let package = {
        let mut opf_file = archive.by_name(opf_path)?;
        let mut xml = String::new();
        opf_file.read_to_string(&mut xml)?;
        parse_opf_xml(&xml)?
    };

    let nav_titles = Self::parse_navigation(&mut archive, &package, opf_path)?;

    // `package` is still borrowed by the calls below, so the metadata
    // strings are cloned rather than moved out.
    let mut metadata = Metadata::new(
        package.metadata.title.clone(),
        package.metadata.creator.clone(),
        package.metadata.language.clone(),
        // First identifier wins; empty string when the package has none.
        package
            .metadata
            .identifier
            .first()
            .cloned()
            .unwrap_or_default(),
        package.metadata.date.clone(),
    );
    metadata.description = package.metadata.description.clone();

    // A `<meta name="cover" content="...">` entry supplies the cover
    // reference; if several exist the last one wins.
    for meta in &package.metadata.meta {
        if let (Some("cover"), Some(content)) = (meta.name.as_deref(), &meta.content) {
            metadata.cover = Some(content.clone());
        }
    }

    let all_files = Self::parse_all_files(&mut archive, &package, &nav_titles, opf_path)?;
    let table_of_contents = Self::create_table_of_contents(&nav_titles, &all_files);
    let chapters = Self::group_files_into_chapters(&all_files, &package.spine);

    Ok(Epub {
        metadata,
        chapters,
        table_of_contents,
        all_files,
    })
}
pub fn get_title(&self) -> &str {
&self.metadata.title
}
pub fn get_creator(&self) -> &str {
&self.metadata.creator
}
pub fn get_language(&self) -> &str {
&self.metadata.language
}
pub fn get_identifier(&self) -> &str {
&self.metadata.identifier
}
pub fn get_date(&self) -> &str {
&self.metadata.date
}
pub fn get_publisher(&self) -> Option<&str> {
self.metadata.publisher.as_deref()
}
pub fn get_description(&self) -> Option<&str> {
self.metadata.description.as_deref()
}
pub fn get_rights(&self) -> Option<&str> {
self.metadata.rights.as_deref()
}
pub fn get_cover(&self) -> Option<&str> {
self.metadata.cover.as_deref()
}
pub fn get_tags(&self) -> &[String] {
&self.metadata.tags
}
pub fn get_chapters(&self) -> &[Chapter] {
&self.chapters
}
pub fn get_chapter_count(&self) -> usize {
self.chapters.len()
}
pub fn get_table_of_contents(&self) -> &TableOfContents {
&self.table_of_contents
}
pub fn get_all_files(&self) -> &[EpubFile] {
&self.all_files
}
pub fn get_file_count(&self) -> usize {
self.all_files.len()
}
/// Collects link titles from the package's navigation document.
///
/// Finds the manifest item whose `properties` list contains the `nav` token,
/// reads it from `archive` (its href is resolved against the directory of
/// `opf_path`), and maps each `<a href="...">text</a>` target to its trimmed
/// link text.
///
/// The `#fragment` part of a link is dropped so keys can match manifest
/// hrefs; when several links point into the same file, the first one is
/// kept. Links that consist of a fragment only are skipped.
///
/// Returns an empty map when the package has no navigation item or the
/// navigation file is not in the archive (best effort).
///
/// # Errors
///
/// Returns an error when the navigation file exists but cannot be read as
/// UTF-8 text, or when the link pattern fails to compile.
// NOTE(review): link targets are kept as written in the navigation document;
// they only match manifest hrefs when both are relative to the same directory.
fn parse_navigation(
    archive: &mut ZipArchive<File>,
    package: &Package,
    opf_path: &str,
) -> Result<HashMap<String, String>, Box<dyn error::Error>> {
    let mut nav_titles = HashMap::new();

    // `properties` is a whitespace-separated token list, so compare whole
    // tokens instead of searching for the substring "nav".
    let nav_item = package.manifest.item.iter().find(|item| {
        item.properties.as_deref().map_or(false, |props| {
            props.split_whitespace().any(|prop| prop == "nav")
        })
    });
    let nav_item = match nav_item {
        Some(item) => item,
        None => return Ok(nav_titles),
    };

    // Directory part of the OPF path, including the trailing slash.
    let opf_dir = opf_path
        .rfind('/')
        .map_or("", |slash_pos| &opf_path[..=slash_pos]);
    let nav_path = format!("{}{}", opf_dir, nav_item.href);

    let mut html = String::new();
    match archive.by_name(&nav_path) {
        Ok(mut nav_file) => {
            nav_file.read_to_string(&mut html)?;
        }
        // A missing navigation file is tolerated: the book simply has no titles.
        Err(_) => return Ok(nav_titles),
    }

    let link_re = regex::Regex::new(r#"<a\s+href="([^"]+)"[^>]*>([^<]+)</a>"#)?;
    for cap in link_re.captures_iter(&html) {
        // `split` always yields at least one piece, so the fallback is unused.
        let target = cap[1].split('#').next().unwrap_or("");
        if target.is_empty() {
            continue;
        }
        nav_titles
            .entry(target.to_string())
            .or_insert_with(|| cap[2].trim().to_string());
    }

    Ok(nav_titles)
}
/// Loads every XHTML content document declared in the manifest.
///
/// Items whose media type is not `application/xhtml+xml` are ignored, as is
/// the navigation document (the item whose `properties` list contains the
/// `nav` token). Each remaining item is read from `archive` at the path
/// formed by joining the directory of `opf_path` with the item's href, and
/// is given the title stored in `nav_titles` under that href, if any.
///
/// Items that are declared but missing from the archive are skipped (best
/// effort). Files are returned in manifest order.
///
/// # Errors
///
/// Returns an error when a content file exists but cannot be read as UTF-8
/// text.
fn parse_all_files(
    archive: &mut ZipArchive<File>,
    package: &Package,
    nav_titles: &HashMap<String, String>,
    opf_path: &str,
) -> Result<Vec<EpubFile>, Box<dyn error::Error>> {
    // Directory part of the OPF path, including the trailing slash.
    let opf_dir = opf_path
        .rfind('/')
        .map_or("", |slash_pos| &opf_path[..=slash_pos]);

    let mut files = Vec::new();
    for manifest_item in &package.manifest.item {
        if manifest_item.media_type != "application/xhtml+xml" {
            continue;
        }

        // `properties` is a whitespace-separated token list, so compare whole
        // tokens instead of searching for the substring "nav".
        let is_nav = manifest_item.properties.as_deref().map_or(false, |props| {
            props.split_whitespace().any(|prop| prop == "nav")
        });
        if is_nav {
            continue;
        }

        let file_path = format!("{}{}", opf_dir, manifest_item.href);
        let mut content = String::new();
        match archive.by_name(&file_path) {
            Ok(mut file) => {
                file.read_to_string(&mut content)?;
            }
            // Declared in the manifest but absent from the archive: skip it.
            Err(_) => continue,
        }

        files.push(EpubFile {
            id: manifest_item.id.clone(),
            href: manifest_item.href.clone(),
            title: nav_titles.get(&manifest_item.href).cloned(),
            content,
            media_type: manifest_item.media_type.clone(),
        });
    }

    Ok(files)
}
/// Builds a flat table of contents with one level-0 entry per content file.
///
/// Each entry uses the file's title, or its id when it has no title, and the
/// file's href. `_nav_titles` is accepted but not consulted.
fn create_table_of_contents(
    _nav_titles: &HashMap<String, String>,
    all_files: &[EpubFile],
) -> TableOfContents {
    all_files
        .iter()
        .fold(TableOfContents::new(), |mut toc, file| {
            let label = match &file.title {
                Some(title) => title.clone(),
                None => file.id.clone(),
            };
            toc.add_entry(label, file.href.clone(), 0);
            toc
        })
}
/// Groups content files into chapters following the spine order.
///
/// Spine entries whose id has no matching file are skipped. A new chapter
/// starts when a file has a title and does not share a chapter base id with
/// the first file of the chapter being built; files without a title are
/// always appended to the current chapter. A chapter's title is the title of
/// its first file, or that file's id when it has no title.
fn group_files_into_chapters(all_files: &[EpubFile], spine: &Spine) -> Vec<Chapter> {
    // Borrowed lookup table from manifest id to file; on duplicate ids the
    // last file wins.
    let files_by_id: HashMap<&str, &EpubFile> = all_files
        .iter()
        .map(|file| (file.id.as_str(), file))
        .collect();

    let mut chapters = Vec::new();
    let mut current_files: Vec<EpubFile> = Vec::new();
    let mut current_title = String::new();

    for itemref in &spine.itemref {
        let file = match files_by_id.get(itemref.idref.as_str()) {
            Some(&file) => file,
            None => continue,
        };

        // Only a non-empty chapter can be closed, so the comparison is made
        // against its first file when there is one.
        let starts_new_chapter = current_files.first().map_or(false, |first| {
            file.title.is_some() && !Self::files_belong_to_same_chapter(first, file)
        });
        if starts_new_chapter {
            // Move the finished chapter out, leaving empty buffers behind.
            chapters.push(Chapter {
                title: std::mem::take(&mut current_title),
                files: std::mem::take(&mut current_files),
            });
        }

        if current_files.is_empty() {
            current_title = file.title.clone().unwrap_or_else(|| file.id.clone());
        }
        current_files.push(file.clone());
    }

    if !current_files.is_empty() {
        chapters.push(Chapter {
            title: current_title,
            files: current_files,
        });
    }

    chapters
}
/// Reports whether two files share the same chapter base id, as computed by
/// `extract_chapter_base` on each file's id.
fn files_belong_to_same_chapter(file1: &EpubFile, file2: &EpubFile) -> bool {
    Self::extract_chapter_base(&file1.id) == Self::extract_chapter_base(&file2.id)
}
/// Strips a trailing `_part...` segment from a manifest id.
///
/// When the text after the last underscore starts with `part`, everything
/// before that underscore is returned; otherwise the id is returned whole.
fn extract_chapter_base(id: &str) -> String {
    let base = match id.rsplit_once('_') {
        Some((prefix, suffix)) if suffix.starts_with("part") => prefix,
        _ => id,
    };
    base.to_string()
}
/// Opens the file at `file_path` and wraps it in a zip archive reader.
///
/// # Errors
///
/// Returns an error when the file cannot be opened or the archive reader
/// cannot be created from it.
fn get_zip_archive(file_path: &str) -> Result<ZipArchive<File>, Box<dyn error::Error>> {
    let handle = File::open(file_path)?;
    Ok(ZipArchive::new(handle)?)
}
}
/// Deserializes the text of `META-INF/container.xml` into a `Container`.
///
/// # Errors
///
/// Returns the deserializer's error, boxed, when `xml` does not match the
/// expected structure.
fn parse_container_xml(xml: &str) -> Result<Container, Box<dyn std::error::Error>> {
    Ok(serde_xml_rs::from_str::<Container>(xml)?)
}
/// Deserializes the text of an OPF package document into a `Package`.
///
/// # Errors
///
/// Returns the deserializer's error, boxed, when `xml` does not match the
/// expected structure.
fn parse_opf_xml(xml: &str) -> Result<Package, Box<dyn std::error::Error>> {
    Ok(serde_xml_rs::from_str::<Package>(xml)?)
}