use std::io::Read;
use std::{collections::HashMap, io, path::PathBuf};
use io::Cursor;
use regex::Regex;
use xmltree::Element;
use zip::{result::ZipError, ZipArchive};
use crate::util::get_parser_config;
use crate::{
errors::{MalformattedEpubError, ParseError},
types::{Book, Chapter},
util,
};
/// Private wrapper around a [`ZipArchive`] that reads from a borrowed, in-memory byte slice.
struct ZipArchiveWrapper<'a> {
/// The underlying archive; its cursor borrows the caller's bytes for `'a`.
zip_archive: ZipArchive<Cursor<&'a [u8]>>,
}
/// One `<item>` entry of the package document's `<manifest>` element.
///
/// Derives the common traits so manifests can be logged, copied and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ManifestItem {
    /// Value of the `id` attribute.
    id: String,
    /// Value of the `href` attribute, exactly as written in the package document.
    href: String,
    /// Value of the `media-type` attribute.
    media_type: String,
    /// Value of the optional `properties` attribute, if present.
    properties: Option<String>,
}
/// Identifier of a manifest item (the `id` attribute of an `<item>` element).
pub type ItemId = String;
/// Manifest items keyed by their [`ItemId`].
pub type Manifest = HashMap<ItemId, ManifestItem>;
/// `idref` values of the spine's `<itemref>` elements, in document order.
pub type Spine = Vec<ItemId>;
/// Data extracted from the EPUB package document by `parse_content_opf`.
pub struct ContentOPF {
/// Text of the `title` child of `<metadata>`.
pub title: String,
/// Text of the `creator` child of `<metadata>`, if there is one.
pub author: Option<String>,
/// Text of the `language` child of `<metadata>`.
pub language: String,
/// All `<item>` entries of the `<manifest>`, keyed by id.
pub manifest: Manifest,
/// Item ids referenced by the `<spine>`, in document order.
pub spine: Spine,
}
/// One `<navPoint>` entry of the NCX table of contents, including its nested entries.
///
/// Equality and hashing are derived over every field, `children` included, so
/// two nav points compare equal only if their whole subtrees are equal.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct NavPoint {
    /// Value of the `id` attribute.
    pub id: String,
    /// Text of `navLabel/text`, if present.
    pub label: Option<String>,
    /// Parsed `playOrder` attribute; `None` when missing or not a number.
    pub play_order: Option<usize>,
    /// Nesting depth; top-level entries of the navMap are parsed with level 1.
    pub level: usize,
    /// Value of the `src` attribute of the `content` child (may carry a `#fragment`).
    pub src: String,
    /// Directly nested nav points, in document order.
    pub children: Vec<NavPoint>,
}
/// Parsed NCX table of contents.
pub struct TocNcx {
/// Value of the `dtb:depth` meta entry found in the NCX `<head>`.
pub depth: usize,
/// Top-level nav points of the `<navMap>`; nested entries live in `NavPoint::children`.
pub nav_points: Vec<NavPoint>,
}
impl TocNcx {
    /// Returns every nav point of the table of contents in pre-order
    /// (depth-first): each entry is immediately followed by its descendants.
    pub fn get_flattened_nav_points(&self) -> Vec<&NavPoint> {
        let mut flattened = Vec::new();
        self.nav_points
            .iter()
            .for_each(|root| self.add_dfs(root, &mut flattened));
        flattened
    }

    /// Appends `nav_point` followed by all of its descendants to `result`.
    fn add_dfs<'a>(&self, nav_point: &'a NavPoint, result: &mut Vec<&'a NavPoint>) {
        // Explicit stack instead of recursion. Children are pushed in reverse so
        // that they are popped, and therefore emitted, in their original order.
        let mut pending = vec![nav_point];
        while let Some(current) = pending.pop() {
            result.push(current);
            pending.extend(current.children.iter().rev());
        }
    }
}
/// An opened EPUB: the ZIP container plus the parsed package document and NCX navigation.
pub struct EpubArchive<'a> {
/// Handle to the underlying ZIP container.
zip: ZipArchiveWrapper<'a>,
/// Contents of every `application/xhtml+xml` manifest item, keyed by the item's `href`.
manifest_html_files: HashMap<String, String>,
/// Directory of the package document inside the archive; manifest hrefs are resolved against it.
pub content_opf_dir: PathBuf,
/// Parsed package document.
pub content_opf: ContentOPF,
/// Parsed NCX table of contents.
pub navigation: TocNcx,
}
impl<'a> ZipArchiveWrapper<'a> {
    /// Opens a ZIP archive over `reader`.
    ///
    /// Returns the `ZipError` reported by `ZipArchive::new` on failure.
    fn new(reader: Cursor<&'a [u8]>) -> Result<Self, ZipError> {
        ZipArchive::new(reader).map(|zip_archive| ZipArchiveWrapper { zip_archive })
    }

    /// Reads the archive entry named `filepath` into a `String`.
    ///
    /// Fails if the entry cannot be looked up by name or cannot be read as a string.
    fn get_file_content(&mut self, filepath: &str) -> Result<String, ZipError> {
        let mut contents = String::new();
        self.zip_archive
            .by_name(filepath)?
            .read_to_string(&mut contents)?;
        Ok(contents)
    }

    /// Returns the names of all entries in the archive as owned strings.
    fn get_filenames(&self) -> Vec<String> {
        let mut names = Vec::new();
        for name in self.zip_archive.file_names() {
            names.push(String::from(name));
        }
        names
    }
}
impl<'a> EpubArchive<'a> {
/// Opens an EPUB held in memory and eagerly loads everything needed to build a book.
///
/// Steps:
/// 1. read `META-INF/container.xml` and extract the package document path,
/// 2. parse the package document (metadata, manifest, spine),
/// 3. locate and parse the NCX table of contents,
/// 4. read every `application/xhtml+xml` manifest item into memory.
///
/// The NCX is looked up under the manifest id `"ncx"` first. If no item has
/// that id, the first manifest item with media type
/// `application/x-dtbncx+xml` is used, so EPUBs that give their NCX another
/// id are accepted as well.
///
/// # Errors
/// * a ZIP error if the archive or one of the referenced entries cannot be read,
/// * `MalformattedContainer` if no `rootfile full-path="..."` is found,
/// * `MalformattedContentOpf` if the package document cannot be parsed or
///   does not list an NCX item,
/// * the error of `parse_ncx` if the NCX is malformed.
pub fn new(bytes: &'a [u8]) -> Result<Self, ParseError> {
    const NCX_MEDIA_TYPE: &str = "application/x-dtbncx+xml";
    const XHTML_MEDIA_TYPE: &str = "application/xhtml+xml";

    /// Resolves a manifest `href` against the package document directory.
    // NOTE(review): `Path::join` uses the platform separator, while ZIP entry
    // names use `/`; this looks like it cannot work on Windows — confirm.
    fn archive_path(dir: &std::path::Path, href: &str) -> String {
        // Both parts originate from `String`s, so the conversion is lossless.
        dir.join(href).to_string_lossy().into_owned()
    }

    let mut zip = ZipArchiveWrapper::new(Cursor::new(bytes))?;

    let container_text = zip.get_file_content("META-INF/container.xml")?;
    let content_opf_re = Regex::new(r#"rootfile full-path="(\S*)""#)
        .expect("hard-coded rootfile pattern is a valid regex");
    let content_opf_path = content_opf_re
        .captures(&container_text)
        .and_then(|captures| captures.get(1))
        .map(|m| m.as_str().to_string())
        .ok_or(ParseError::EpubError(
            MalformattedEpubError::MalformattedContainer,
        ))?;
    let content_opf_dir = PathBuf::from(&content_opf_path)
        .parent()
        .map(|parent| parent.to_path_buf())
        .unwrap_or_default();

    let content_opf_text = zip.get_file_content(&content_opf_path)?;
    let content_opf = parse_content_opf(&content_opf_text)
        .ok_or(MalformattedEpubError::MalformattedContentOpf)?;

    // Prefer the conventional id, then fall back to the NCX media type.
    let ncx_item = content_opf
        .manifest
        .get("ncx")
        .or_else(|| {
            content_opf
                .manifest
                .values()
                .find(|item| item.media_type == NCX_MEDIA_TYPE)
        })
        .ok_or(MalformattedEpubError::MalformattedContentOpf)?;
    let ncx_path = archive_path(&content_opf_dir, &ncx_item.href);
    let ncx_text = zip.get_file_content(&ncx_path)?;
    let navigation = parse_ncx(&ncx_text)?;

    // Load all XHTML documents up front, keyed by their manifest href.
    let manifest_html_files = content_opf
        .manifest
        .values()
        .filter(|item| item.media_type == XHTML_MEDIA_TYPE)
        .map(|item| {
            let full_path = archive_path(&content_opf_dir, &item.href);
            zip.get_file_content(&full_path)
                .map(|content| (item.href.clone(), content))
        })
        .collect::<Result<HashMap<String, String>, ZipError>>()?;

    Ok(EpubArchive {
        zip,
        manifest_html_files,
        content_opf_dir,
        content_opf,
        navigation,
    })
}
/// Converts the archive into a [`Book`]: title and author come from the
/// package document, preface text and chapters from `item_refs_to_chapters`.
///
/// Any error of `item_refs_to_chapters` is returned unchanged.
pub fn to_book(&self) -> Result<Book, ParseError> {
    self.item_refs_to_chapters()
        .map(|(preface_content, chapters)| Book {
            title: self.content_opf.title.clone(),
            author: self.content_opf.author.clone(),
            preface_content,
            chapters,
        })
}
/// Returns the book title as stored in the package document.
pub fn get_title(&self) -> &str {
    self.content_opf.title.as_str()
}
/// Collects, in pre-order, every nav point in `nav_points` (and their
/// descendants) whose `src` refers to the file `item_href`.
///
/// The file part of `src` (everything before the first `#`) must equal
/// `item_href` exactly. A substring test would also match unrelated files
/// such as `ch11.html#x` for the href `1.html`.
fn get_matching_navpoints(
    &self,
    item_href: &str,
    nav_points: &'a [NavPoint],
) -> Vec<&'a NavPoint> {
    let mut result = Vec::new();
    for nav_point in nav_points {
        // `split` always yields at least one piece, so the fallback is never used.
        let src_file = nav_point.src.split('#').next().unwrap_or("");
        if src_file == item_href {
            result.push(nav_point);
        }
        result.extend(self.get_matching_navpoints(item_href, &nav_point.children));
    }
    result
}
/// Walks the spine and distributes the text of every spine item to the nav
/// point (chapter) it belongs to.
///
/// * Spine items seen before the first item that matches a nav point form
///   the preface.
/// * A spine item matching one or more nav points contributes one text
///   chunk per matching nav point, starting at that nav point's `src`.
/// * A later spine item matching no nav point is appended to the most
///   recently matched nav point.
///
/// Returns the preface text and the chapter tree mirroring the navigation.
/// A table of contents without nav points yields no chapters, and all spine
/// content ends up in the preface.
///
/// # Errors
/// `MalformattedContentOpf` if the spine references an unknown manifest id;
/// otherwise any error of `src_to_text`.
fn item_refs_to_chapters(&self) -> Result<(String, Vec<Chapter>), ParseError> {
    let flattened_navpoints = self.navigation.get_flattened_nav_points();
    let mut nav_point_content_map: HashMap<&NavPoint, Vec<String>> = flattened_navpoints
        .iter()
        .map(|np| (*np, Vec::new()))
        .collect();

    let mut preface_sources: Vec<&str> = Vec::new();
    let mut ordered_sources_navpoints: Vec<(&str, &NavPoint)> =
        Vec::with_capacity(self.content_opf.spine.len());
    // `None` while we are still in the preface, i.e. until the first spine
    // item that matches a nav point has been seen.
    let mut last_matched_nav_point: Option<&NavPoint> = None;

    for item_id in &self.content_opf.spine {
        let item_href = self
            .content_opf
            .manifest
            .get(item_id)
            .map(|manifest_item| manifest_item.href.as_str())
            .ok_or(ParseError::EpubError(
                MalformattedEpubError::MalformattedContentOpf,
            ))?;
        let matching_nav_points =
            self.get_matching_navpoints(item_href, &self.navigation.nav_points);
        if matching_nav_points.is_empty() {
            match last_matched_nav_point {
                Some(nav_point) => ordered_sources_navpoints.push((item_href, nav_point)),
                None => preface_sources.push(item_href),
            }
        } else {
            for matching_nav_point in matching_nav_points {
                ordered_sources_navpoints
                    .push((matching_nav_point.src.as_str(), matching_nav_point));
                last_matched_nav_point = Some(matching_nav_point);
            }
        }
    }

    // Each chunk runs from its own source up to the start of the next one.
    for (i, (src_path, nav_point)) in ordered_sources_navpoints.iter().enumerate() {
        let next_src_path = ordered_sources_navpoints.get(i + 1).map(|(s, _np)| *s);
        let content_chunk = self.src_to_text(src_path, next_src_path)?;
        nav_point_content_map
            .get_mut(nav_point)
            .expect("every matched nav point is a key of the content map")
            .push(content_chunk);
    }

    // A preface item is followed by the next preface item, or by the first
    // chapter source once the preface is exhausted.
    let mut preface_content: Vec<String> = Vec::with_capacity(preface_sources.len());
    for (i, src_path) in preface_sources.iter().enumerate() {
        let next_src_path = preface_sources
            .get(i + 1)
            .copied()
            .or_else(|| ordered_sources_navpoints.first().map(|(src, _np)| *src));
        preface_content.push(self.src_to_text(src_path, next_src_path)?);
    }

    let chapters: Vec<Chapter> = self
        .navigation
        .nav_points
        .iter()
        .map(|np| convert_np_to_chapter(np, &nav_point_content_map))
        .collect();
    Ok((preface_content.join("\n"), chapters))
}
/// Extracts the plain text that belongs to `src`.
///
/// `src` has the form `file` or `file#anchor`. The text starts at `anchor`
/// (if given) and stops at the anchor of `next_src`, but only when
/// `next_src` points into the very same file. File names are compared for
/// equality, so a different file that merely shares a prefix with
/// `src`'s file never supplies a stop anchor.
///
/// # Errors
/// * `MalformattedTocNcx` if the file is not one of the loaded XHTML manifest items,
/// * `MalformattedHTML` if `util::html_to_text` fails.
fn src_to_text(&self, src: &str, next_src: Option<&str>) -> Result<String, ParseError> {
    let mut src_split = src.split('#');
    // `split` always yields at least one piece, so the fallback is never used.
    let src_file = src_split.next().unwrap_or(src);
    let src_anchor = src_split.next();
    let full_text = self.manifest_html_files.get(src_file).ok_or_else(|| {
        MalformattedEpubError::MalformattedTocNcx(format!(
            "File {} in TOC, but not in Manifest",
            src_file
        ))
    })?;
    let stop_anchor = next_src.and_then(|next| {
        let mut next_split = next.split('#');
        if next_split.next() == Some(src_file) {
            next_split.next()
        } else {
            None
        }
    });
    util::html_to_text(full_text.as_str(), src_anchor, stop_anchor).map_err(|err| {
        ParseError::EpubError(MalformattedEpubError::MalformattedHTML(
            src_file.to_string(),
            err,
        ))
    })
}
}
/// Builds the [`Chapter`] for `nav_point`, recursing into its children.
///
/// The chapter title is the nav point's label (empty when there is none);
/// the text is the nav point's content chunks joined with newlines.
///
/// # Panics
/// Panics if `nav_point` or one of its descendants is not a key of `contents`.
fn convert_np_to_chapter(
    nav_point: &NavPoint,
    contents: &HashMap<&NavPoint, Vec<String>>,
) -> Chapter {
    // Children first, so sub-chapters are complete before the parent is assembled.
    let mut subchapters = Vec::with_capacity(nav_point.children.len());
    for child in &nav_point.children {
        subchapters.push(convert_np_to_chapter(child, contents));
    }
    let chunks = contents
        .get(nav_point)
        .expect("NavPoint should have been present in content map");
    Chapter {
        title: nav_point.label.clone().unwrap_or_default(),
        text: chunks.join("\n"),
        subchapters,
    }
}
/// Parses the direct `navPoint` children of `nav_points` (recursively
/// including their own `navPoint` children) into [`NavPoint`]s tagged with
/// `level`, `level + 1`, and so on.
///
/// Returns `None` as soon as any nav point lacks an `id` attribute, a
/// `content` child, or that child's `src` attribute. A missing or
/// non-numeric `playOrder` and a missing label are tolerated.
fn parse_nav_points(nav_points: &Element, level: usize) -> Option<Vec<NavPoint>> {
    let mut parsed = Vec::new();
    for node in &nav_points.children {
        // Skip text nodes, comments and elements other than <navPoint>.
        let el = match node.as_element() {
            Some(el) if el.name == "navPoint" => el,
            _ => continue,
        };
        let id = el.attributes.get("id")?.to_string();
        let play_order = el
            .attributes
            .get("playOrder")
            .and_then(|po| po.parse::<usize>().ok());
        let src = el.get_child("content")?.attributes.get("src")?.to_string();
        let label = el
            .get_child("navLabel")
            .and_then(|nav_label| nav_label.get_child("text"))
            .and_then(|text_el| text_el.get_text())
            .map(|text| text.to_string());
        let children = parse_nav_points(el, level + 1)?;
        parsed.push(NavPoint {
            id,
            label,
            play_order,
            level,
            src,
            children,
        });
    }
    Some(parsed)
}
/// Parses the text of an NCX document into a [`TocNcx`].
///
/// # Errors
/// Returns `MalformattedTocNcx` with one of these messages:
/// * `"Invalid XML"` if the text is not well-formed XML,
/// * `"Missing head"` if there is no `head` element,
/// * `"Depth info missing or duplicated"` unless exactly one `dtb:depth`
///   meta entry with a numeric `content` exists,
/// * `"Missing navMap"` if there is no `navMap` element,
/// * `"Could not parse NavPoints"` if a nav point is incomplete.
fn parse_ncx(text: &str) -> Result<TocNcx, MalformattedEpubError> {
    let malformed =
        |reason: &str| MalformattedEpubError::MalformattedTocNcx(reason.to_string());

    let ncx = match xmltree::Element::parse_with_config(text.as_bytes(), get_parser_config()) {
        Ok(root) => root,
        Err(_) => return Err(malformed("Invalid XML")),
    };
    let head = match ncx.get_child("head") {
        Some(head) => head,
        None => return Err(malformed("Missing head")),
    };

    // Collect every parsable <meta name="dtb:depth" content="..."> value.
    let mut depths: Vec<usize> = Vec::new();
    for node in &head.children {
        let meta = match node.as_element() {
            Some(el) if el.name == "meta" => el,
            _ => continue,
        };
        if meta.attributes.get("name").map(|s| s.as_str()) != Some("dtb:depth") {
            continue;
        }
        let parsed_depth = meta
            .attributes
            .get("content")
            .and_then(|content| content.parse::<usize>().ok());
        if let Some(depth) = parsed_depth {
            depths.push(depth);
        }
    }
    let depth = match depths.as_slice() {
        [only] => *only,
        _ => return Err(malformed("Depth info missing or duplicated")),
    };

    let nav_map = match ncx.get_child("navMap") {
        Some(nav_map) => nav_map,
        None => return Err(malformed("Missing navMap")),
    };
    match parse_nav_points(nav_map, 1) {
        Some(nav_points) => Ok(TocNcx { depth, nav_points }),
        None => Err(malformed("Could not parse NavPoints")),
    }
}
fn parse_manifest(manifest: &Element) -> Manifest {
manifest
.children
.iter()
.filter_map(|node| {
if let Some(el) = node.as_element() {
if el.name == "item" {
let id = el.attributes.get("id")?.to_string();
return Some((
id.clone(),
ManifestItem {
id,
href: el.attributes.get("href")?.to_string(),
media_type: el.attributes.get("media-type")?.to_string(),
properties: el.attributes.get("properties").map(|s| s.to_string()),
},
));
}
}
None
})
.collect::<HashMap<ItemId, ManifestItem>>()
}
pub fn parse_spine(spine: &Element) -> Option<Spine> {
Some(
spine
.children
.iter()
.filter_map(|node| {
if let Some(el) = node.as_element() {
if el.name == "itemref" {
let id = el.attributes.get("idref")?.to_string();
return Some(id);
}
}
None
})
.collect(),
)
}
/// Parses the text of the package document into a [`ContentOPF`].
///
/// Returns `None` if the XML cannot be parsed, if the `metadata`,
/// `manifest` or `spine` element is missing, or if `metadata` has no
/// `title` or `language` text. A missing `creator` only leaves `author` empty.
fn parse_content_opf(text: &str) -> Option<ContentOPF> {
    let package =
        xmltree::Element::parse_with_config(text.as_bytes(), get_parser_config()).ok()?;
    let metadata_el = package.get_child("metadata")?;
    let manifest_el = package.get_child("manifest")?;
    let spine_el = package.get_child("spine")?;

    // Text content of a direct child of <metadata>, if that child exists and has text.
    let metadata_text = |name: &str| -> Option<String> {
        metadata_el
            .get_child(name)
            .and_then(|el| el.get_text())
            .map(|text| text.to_string())
    };

    Some(ContentOPF {
        title: metadata_text("title")?,
        author: metadata_text("creator"),
        language: metadata_text("language")?,
        manifest: parse_manifest(manifest_el),
        spine: parse_spine(spine_el)?,
    })
}
#[cfg(test)]
mod tests {
use super::*;
// EPUB fixtures embedded into the test binary at compile time; the paths are
// resolved relative to this source file.
static EPUB_PAID_OFF: &[u8] = include_bytes!("../../test_resources/paid_off.epub");
static EPUB_SHAKESPEARES: &[u8] = include_bytes!("../../test_resources/shakespeares.epub");
static EPUB_SIMPLE: &[u8] = include_bytes!("../../test_resources/simple.epub");
static EPUB_NESTED: &[u8] = include_bytes!("../../test_resources/nested.epub");
static EPUB_KANJIAN: &[u8] = include_bytes!("../../test_resources/kanjian.epub");
/// The package document of `paid_off.epub` yields the expected metadata and
/// a non-empty manifest and spine.
#[test]
fn epub_to_contentopf() {
    let ContentOPF {
        title,
        author,
        language,
        manifest,
        spine,
    } = EpubArchive::new(EPUB_PAID_OFF).unwrap().content_opf;

    assert_eq!("Paid Off", title);
    assert_eq!(Some("Walter J. Coburn"), author.as_deref());
    assert_eq!("en", language);
    assert!(!manifest.is_empty());
    assert!(!spine.is_empty());
}
/// `paid_off.epub` has a flat table of contents: depth 1 and 14 nav points
/// with play orders 1 through 14.
#[test]
fn epub_to_flat_ncx() {
    let TocNcx { depth, nav_points } = EpubArchive::new(EPUB_PAID_OFF).unwrap().navigation;

    assert_eq!(1, depth);
    assert_eq!(14, nav_points.len());

    let expected_orders: Vec<usize> = (1..=14).collect();
    let mut actual_orders: Vec<usize> = Vec::with_capacity(nav_points.len());
    for nav_point in &nav_points {
        actual_orders.push(nav_point.play_order.unwrap());
    }
    assert_eq!(expected_orders, actual_orders);
}
/// `shakespeares.epub` has a nested table of contents of depth 3 whose
/// fourth child of the fourth top-level entry is labelled "ACT I".
#[test]
fn epub_to_nested_ncx() {
    let navigation = EpubArchive::new(EPUB_SHAKESPEARES).unwrap().navigation;
    assert_eq!(3, navigation.depth);

    let nested_entry = &navigation.nav_points[3].children[3];
    assert_eq!(Some("ACT I"), nested_entry.label.as_deref());
}
/// `simple.epub` converts to a book with the expected author, title,
/// chapter titles and chapter openings.
#[test]
fn simple_epub_to_book() {
    let wanted_titles = vec![
        "聊斋志异白话文",
        "卷一 考城隍",
        "卷一 耳中人",
        "卷一 尸变",
        "卷一 喷水",
        "卷一 瞳人语",
        "卷一 画壁",
        "卷一 山魈",
        "卷一 咬鬼",
        "卷一 捉狐",
        "卷一 荞中怪",
        "卷一 宅妖",
        "卷一 王六郎",
        "卷一 偷桃",
        "卷一 种梨",
        "卷一 劳山道士",
        "卷一 长清僧",
        "卷一 蛇人",
        "卷一 斫蟒",
        "卷一 犬奸",
    ];
    // (chapter index, text the chapter has to start with)
    let wanted_openings = [
        (1, "卷一 考城隍 我姐夫的祖父,名叫宋焘,是本县的廪生"),
        (2, "卷一 耳中人 谭晋玄,是本县的一名秀才。"),
        (3, "卷一 尸变 阳信县某老翁"),
    ];

    let book = EpubArchive::new(EPUB_SIMPLE)
        .unwrap()
        .to_book()
        .expect("simple.epub should be parsed to book without error");

    assert_eq!(Some("蒲松龄"), book.author.as_deref());
    assert_eq!("聊斋志异白话文", book.title);

    let actual_titles: Vec<&str> = book.chapters.iter().map(|ch| ch.title.as_str()).collect();
    assert_eq!(wanted_titles, actual_titles);

    let second_chapter = book.chapters.get(1).expect("Book should contain chapters");
    assert_eq!("卷一 考城隍", second_chapter.title);

    for &(index, opening) in wanted_openings.iter() {
        let chapter = book
            .chapters
            .get(index)
            .expect("Book should contain chapters");
        assert!(chapter.text.starts_with(opening));
    }
}
/// `nested.epub` converts to a book whose chapter tree mirrors the nested
/// table of contents exactly, with an empty preface.
#[test]
fn nested_epub_to_book() {
    // Shorthand for building the expected chapter tree.
    fn chapter(title: &str, text: &str, subchapters: Vec<Chapter>) -> Chapter {
        Chapter {
            title: title.to_string(),
            text: text.to_string(),
            subchapters,
        }
    }

    let expected_book = Book {
        title: "Nested example".to_string(),
        author: Some("Jannes".to_string()),
        preface_content: String::new(),
        chapters: vec![
            chapter(
                "Nested example",
                "Nested example Nested example Jannes",
                vec![],
            ),
            chapter(
                "Chapter 1",
                "Chapter 1 This is Chapter 1",
                vec![
                    chapter(
                        "Chapter 1.1",
                        "Chapter 1.1 This is Chapter 1.1",
                        vec![
                            chapter(
                                "Chapter 1.1.1",
                                "Chapter 1.1.1 This is Chapter 1.1.1",
                                vec![],
                            ),
                            chapter(
                                "Chapter 1.1.2",
                                "Chapter 1.1.2 This is Chapter 1.1.2",
                                vec![],
                            ),
                        ],
                    ),
                    chapter("Chapter 1.2", "Chapter 1.2 This is Chapter 1.2", vec![]),
                ],
            ),
            chapter(
                "Chapter 2",
                "Chapter 2 This is Chapter 2",
                vec![chapter(
                    "Chapter 2.1",
                    "Chapter 2.1",
                    vec![chapter(
                        "Chapter 2.1.1",
                        "Chapter 2.1.1 This is Chapter 2.1.1",
                        vec![],
                    )],
                )],
            ),
            chapter(
                "Chapter 3",
                "Chapter 3",
                vec![chapter(
                    "Chapter 3.1",
                    "Chapter 3.1",
                    vec![chapter(
                        "Chapter 3.1.1",
                        "Chapter 3.1.1 This is Chapter 3.1.1",
                        vec![],
                    )],
                )],
            ),
        ],
    };

    let actual_book = EpubArchive::new(EPUB_NESTED)
        .unwrap()
        .to_book()
        .expect("nested.epub should be parsed to book without error");
    assert_eq!(expected_book, actual_book);
}
/// `kanjian.epub` converts to a book with the expected author and title.
#[test]
fn epub_to_book_1() {
    let book = EpubArchive::new(EPUB_KANJIAN)
        .unwrap()
        .to_book()
        .expect("kanjian.epub should be parsed to book without error");

    assert_eq!(Some("柴静"), book.author.as_deref());
    assert_eq!("看见", book.title);
}
}