mod azw3;
mod epub;
mod kfx;
mod mobi;
pub use azw3::Azw3Importer;
pub use epub::EpubImporter;
pub use kfx::KfxImporter;
pub use mobi::MobiImporter;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use crate::dom::{Origin, Stylesheet, compile_html_bytes, extract_stylesheets};
use crate::model::{AnchorTarget, Chapter, FontFace, GlobalNodeId, Landmark, Metadata, TocEntry};
/// Newtype identifier for a chapter.
///
/// Carried by [`SpineEntry`] and accepted by the chapter-oriented methods of
/// [`Importer`] (`load_chapter`, `load_raw`, `source_id`, `resolve_href`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ChapterId(pub u32);
/// One element of the slice returned by [`Importer::spine`].
#[derive(Debug, Clone)]
pub struct SpineEntry {
/// Identifier of the chapter this entry refers to.
pub id: ChapterId,
/// Estimated size of the chapter.
// NOTE(review): the unit (bytes? characters?) is not established in this
// file; confirm against the concrete importers.
pub size_estimate: usize,
}
/// Interface for a book importer.
///
/// Implementors expose metadata, navigation data, the spine and raw access to
/// chapters and assets. The trait supplies default implementations for
/// compiling a chapter, loading a stylesheet, collecting `@font-face` rules
/// and classifying external links.
pub trait Importer: Send + Sync {
    /// Opens the source located at `path`.
    fn open(path: &Path) -> std::io::Result<Self>
    where
        Self: Sized;

    /// Returns the book's metadata.
    fn metadata(&self) -> &Metadata;

    /// Returns the table-of-contents entries.
    fn toc(&self) -> &[TocEntry];

    /// Returns the landmark entries.
    fn landmarks(&self) -> &[Landmark];

    /// Returns the spine entries.
    fn spine(&self) -> &[SpineEntry];

    /// Loads chapter `id` and compiles it together with its stylesheets.
    ///
    /// The raw bytes are decoded (using the encoding hint extracted from the
    /// bytes) solely to discover linked and inline stylesheets. Linked sheets
    /// are resolved relative to [`Importer::source_id`] when it is available
    /// and fetched through [`Importer::load_stylesheet`]; sheets that fail to
    /// load are skipped. Linked sheets are added before inline ones, all with
    /// [`Origin::Author`]. Finally, paths recorded in the chapter's semantics
    /// are resolved against the chapter's source id, if there is one.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Importer::load_raw`].
    fn load_chapter(&mut self, id: ChapterId) -> std::io::Result<Chapter> {
        let raw = self.load_raw(id)?;
        let declared_encoding = crate::util::extract_xml_encoding(&raw);
        let markup = crate::util::decode_text(&raw, declared_encoding);
        let (linked, inline) = extract_stylesheets(&markup);

        let mut sheets = Vec::new();
        for href in linked {
            let css_path = match self.source_id(id) {
                Some(chapter_path) => resolve_relative_path(chapter_path, &href),
                None => PathBuf::from(&href),
            };
            if let Some(sheet) = self.load_stylesheet(&css_path) {
                sheets.push((sheet, Origin::Author));
            }
        }
        sheets.extend(
            inline
                .into_iter()
                .map(|css| (Stylesheet::parse(&css), Origin::Author)),
        );

        // Compilation works on the original bytes, not on the decoded text.
        let mut chapter = compile_html_bytes(&raw, &sheets);
        if let Some(base_path) = self.source_id(id) {
            resolve_semantic_paths(&mut chapter, base_path);
        }
        Ok(chapter)
    }

    /// Returns the source identifier of chapter `id`, if it has one.
    ///
    /// The default [`Importer::load_chapter`] uses it as the base path for
    /// resolving relative references found in the chapter.
    fn source_id(&self, id: ChapterId) -> Option<&str>;

    /// Returns the raw bytes of chapter `id`.
    fn load_raw(&mut self, id: ChapterId) -> std::io::Result<Vec<u8>>;

    /// Returns the paths of the available assets.
    fn list_assets(&self) -> &[PathBuf];

    /// Returns the bytes of the asset stored at `path`.
    fn load_asset(&mut self, path: &Path) -> std::io::Result<Vec<u8>>;

    /// Loads the asset at `path` and parses it as a stylesheet.
    ///
    /// The bytes are decoded as UTF-8 with lossy replacement. Returns `None`
    /// when [`Importer::load_asset`] fails; the error itself is discarded.
    fn load_stylesheet(&mut self, path: &Path) -> Option<Stylesheet> {
        let css_bytes = self.load_asset(path).ok()?;
        Some(Stylesheet::parse(&String::from_utf8_lossy(&css_bytes)))
    }

    /// Collects the font faces declared by every `.css` asset.
    ///
    /// Assets are selected by a case-insensitive `css` extension and loaded
    /// through [`Importer::load_stylesheet`]. Each face's `src` is resolved
    /// relative to the stylesheet that declared it and rewritten to use
    /// forward slashes.
    fn font_faces(&mut self) -> Vec<FontFace> {
        // Copy the paths out first: loading a stylesheet needs `&mut self`,
        // which cannot coexist with the borrow held by `list_assets`.
        let stylesheet_paths: Vec<PathBuf> = self
            .list_assets()
            .iter()
            .filter(|asset| {
                asset
                    .extension()
                    .map_or(false, |ext| ext.eq_ignore_ascii_case("css"))
            })
            .cloned()
            .collect();

        let mut collected = Vec::new();
        for css_path in stylesheet_paths {
            let stylesheet = match self.load_stylesheet(&css_path) {
                Some(stylesheet) => stylesheet,
                None => continue,
            };
            let css_base = css_path.to_string_lossy();
            collected.extend(stylesheet.font_faces.into_iter().map(|mut face| {
                let target = resolve_relative_path(css_base.as_ref(), &face.src);
                face.src = target.to_string_lossy().replace('\\', "/");
                face
            }));
        }
        collected
    }

    /// Reports whether a normalized export is required; `false` by default.
    fn requires_normalized_export(&self) -> bool {
        false
    }

    /// Hook that receives the loaded chapters; the default does nothing.
    // NOTE(review): presumably implementors build an anchor lookup here;
    // confirm against the concrete importers.
    fn index_anchors(&mut self, _chapters: &[(ChapterId, Arc<Chapter>)]) {}

    /// Hook for table-of-contents resolution; the default does nothing.
    fn resolve_toc(&mut self) {}

    /// Returns the table-of-contents entries for in-place modification.
    fn toc_mut(&mut self) -> &mut [TocEntry];

    /// Classifies `href` as seen from `_from_chapter`.
    ///
    /// The default only recognises external links: after trimming, an `href`
    /// starting with `http://`, `https://`, `mailto:` or `tel:` yields
    /// [`AnchorTarget::External`]; anything else yields `None`.
    fn resolve_href(&self, _from_chapter: ChapterId, href: &str) -> Option<AnchorTarget> {
        const EXTERNAL_PREFIXES: [&str; 4] = ["http://", "https://", "mailto:", "tel:"];

        let href = href.trim();
        EXTERNAL_PREFIXES
            .iter()
            .any(|prefix| href.starts_with(*prefix))
            .then(|| AnchorTarget::External(href.to_string()))
    }
}
/// Resolves `href`, found in the document at `from_path`, to an [`AnchorTarget`].
///
/// * Links starting with `http://`, `https://`, `mailto:` or `tel:` are
///   returned as [`AnchorTarget::External`].
/// * `#fragment` is looked up through `anchor` under the key
///   `"{from_path}#{fragment}"`.
/// * `path#fragment` requires `chapter_for_path(path)` to succeed and is then
///   looked up through `anchor` under the key `"{path}#{fragment}"`.
/// * A bare `path` maps to [`AnchorTarget::Chapter`] via `chapter_for_path`.
///
/// The `href` is trimmed first and split at its first `#`. The path part is
/// handed to the callbacks exactly as written; it is not resolved against
/// `from_path` here. Returns `None` whenever a lookup fails.
pub fn resolve_path_based_href(
    from_path: &str,
    href: &str,
    chapter_for_path: impl Fn(&str) -> Option<ChapterId>,
    anchor: impl Fn(&str) -> Option<GlobalNodeId>,
) -> Option<AnchorTarget> {
    const EXTERNAL_PREFIXES: [&str; 4] = ["http://", "https://", "mailto:", "tel:"];

    let href = href.trim();
    if EXTERNAL_PREFIXES
        .iter()
        .any(|prefix| href.starts_with(*prefix))
    {
        return Some(AnchorTarget::External(href.to_string()));
    }

    match href.split_once('#') {
        // Fragment-only link: the anchor lives in the current document.
        Some(("", fragment)) => {
            anchor(&format!("{}#{}", from_path, fragment)).map(AnchorTarget::Internal)
        }
        // Path plus fragment: the path must name a known chapter before the
        // anchor is looked up.
        Some((path, fragment)) => {
            chapter_for_path(path)?;
            anchor(&format!("{}#{}", path, fragment)).map(AnchorTarget::Internal)
        }
        // No fragment: link to the chapter as a whole.
        None => chapter_for_path(href).map(AnchorTarget::Chapter),
    }
}
/// Resolves `relative` against the directory that contains `base`.
///
/// * References that are not relative paths are returned unchanged: anything
///   starting with `/`, anything containing `://`, and the scheme-only forms
///   `data:`, `mailto:` and `tel:`. The latter have no `://`, so without this
///   check they would be joined onto the base directory and split on the
///   slashes inside their payload.
/// * A fragment-only reference (`#id`) is appended verbatim to `base`.
/// * Otherwise the reference is joined onto `base`'s parent directory and
///   normalised lexically: `.` components are dropped and each `..` removes
///   the preceding component. A `..` with nothing left to remove is ignored,
///   so the result never climbs above the top of `base`.
///
/// A fragment after a relative path (`ch2.xhtml#sec`) stays attached to the
/// final component. No filesystem access takes place.
fn resolve_relative_path(base: &str, relative: &str) -> PathBuf {
    // Schemes without an authority part; matched case-sensitively, like the
    // other scheme checks in this module.
    const OPAQUE_SCHEMES: [&str; 3] = ["data:", "mailto:", "tel:"];

    if relative.starts_with('/')
        || relative.contains("://")
        || OPAQUE_SCHEMES
            .iter()
            .any(|scheme| relative.starts_with(*scheme))
    {
        return PathBuf::from(relative);
    }
    if relative.starts_with('#') {
        return PathBuf::from(format!("{}{}", base, relative));
    }

    let base_dir = Path::new(base).parent().unwrap_or(Path::new(""));
    let joined = base_dir.join(relative);

    let mut result = PathBuf::new();
    for component in joined.components() {
        match component {
            std::path::Component::ParentDir => {
                // `pop` on an empty path is a no-op, which clamps excess `..`.
                result.pop();
            }
            std::path::Component::Normal(name) => {
                result.push(name);
            }
            std::path::Component::CurDir => {}
            std::path::Component::RootDir => {
                result.push("/");
            }
            std::path::Component::Prefix(prefix) => {
                result.push(prefix.as_os_str());
            }
        }
    }
    result
}
/// Rewrites the paths stored in `chapter.semantics` relative to `base_path`.
///
/// Paths that start with `data:` or contain `://` are kept verbatim. Every
/// other path is resolved with `resolve_relative_path` and rewritten to use
/// forward slashes.
fn resolve_semantic_paths(chapter: &mut Chapter, base_path: &str) {
    chapter.semantics.resolve_paths(|path| {
        let is_self_contained = path.starts_with("data:") || path.contains("://");
        if is_self_contained {
            path.to_string()
        } else {
            resolve_relative_path(base_path, path)
                .to_string_lossy()
                .replace('\\', "/")
        }
    });
}
// Unit and property tests for the path-resolution helper and for the default
// `Importer` methods (`load_chapter`, `font_faces`), exercised through small
// in-memory mock importers.
#[cfg(test)]
mod tests {
use super::*;
use crate::model::{Landmark, Metadata, TocEntry};
use proptest::prelude::*;
use std::collections::HashMap;
use std::io;
// A fragment-only reference is appended verbatim to the base path.
#[test]
fn test_resolve_fragment_only_path() {
let result = resolve_relative_path("f_0004.xhtml", "#FOOTNOTE-1");
assert_eq!(result.to_string_lossy(), "f_0004.xhtml#FOOTNOTE-1");
let result = resolve_relative_path("OEBPS/text/chapter.xhtml", "#anchor");
assert_eq!(result.to_string_lossy(), "OEBPS/text/chapter.xhtml#anchor");
}
// A sibling file with a fragment resolves into the base's directory and keeps
// its fragment.
#[test]
fn test_resolve_relative_path_with_fragment() {
let result = resolve_relative_path("text/ch1.xhtml", "ch2.xhtml#section");
// Normalise separators so the expectation holds regardless of platform.
let normalized: String = result.to_string_lossy().replace('\\', "/");
assert_eq!(normalized, "text/ch2.xhtml#section");
}
// `..` removes the last directory of the base before descending again.
#[test]
fn test_resolve_parent_directory() {
let result = resolve_relative_path("OEBPS/text/ch01.xhtml", "../styles/main.css");
let normalized: String = result.to_string_lossy().replace('\\', "/");
assert_eq!(normalized, "OEBPS/styles/main.css");
}
// A reference starting with `/` is returned unchanged.
#[test]
fn test_resolve_absolute_path_unchanged() {
let result = resolve_relative_path("text/chapter.xhtml", "/absolute/path.css");
assert_eq!(result.to_string_lossy(), "/absolute/path.css");
}
// A reference containing `://` is returned unchanged.
#[test]
fn test_resolve_url_unchanged() {
let result = resolve_relative_path("text/chapter.xhtml", "https://example.com/");
assert_eq!(result.to_string_lossy(), "https://example.com/");
}
// Two chapters link the same `style.css`. The mock overrides
// `load_stylesheet` with a cache keyed by the slash-normalised path and counts
// cache misses; after loading both chapters exactly one miss is expected.
// This also shows that the default `load_chapter` routes linked sheets through
// `load_stylesheet` and resolves `style.css` against the chapter's source id
// (the only stored asset is `text/style.css`).
#[test]
fn test_load_chapter_stylesheet_cache() {
struct TestImporter {
chapters: HashMap<u32, String>,
assets: HashMap<String, Vec<u8>>,
asset_list: Vec<PathBuf>,
css_cache: HashMap<String, Stylesheet>,
// Number of times a stylesheet was actually loaded and parsed (cache misses).
css_loads: usize,
metadata: Metadata,
toc: Vec<TocEntry>,
landmarks: Vec<Landmark>,
spine: Vec<SpineEntry>,
// Indexed by `ChapterId.0`.
source_ids: Vec<String>,
}
impl Importer for TestImporter {
// The mock is built with a struct literal; `open` is never called.
fn open(_path: &Path) -> io::Result<Self> {
unreachable!()
}
fn metadata(&self) -> &Metadata {
&self.metadata
}
fn toc(&self) -> &[TocEntry] {
&self.toc
}
fn toc_mut(&mut self) -> &mut [TocEntry] {
&mut self.toc
}
fn landmarks(&self) -> &[Landmark] {
&self.landmarks
}
fn spine(&self) -> &[SpineEntry] {
&self.spine
}
fn source_id(&self, id: ChapterId) -> Option<&str> {
self.source_ids.get(id.0 as usize).map(|s| s.as_str())
}
fn load_raw(&mut self, id: ChapterId) -> io::Result<Vec<u8>> {
self.chapters
.get(&id.0)
.map(|s| s.as_bytes().to_vec())
.ok_or_else(|| io::Error::new(io::ErrorKind::NotFound, "chapter not found"))
}
fn list_assets(&self) -> &[PathBuf] {
&self.asset_list
}
fn load_asset(&mut self, path: &Path) -> io::Result<Vec<u8>> {
// Asset keys always use forward slashes.
let key = path.to_string_lossy().replace('\\', "/");
self.assets
.get(&key)
.cloned()
.ok_or_else(|| io::Error::new(io::ErrorKind::NotFound, "asset not found"))
}
// Caching override: serve a clone from `css_cache` when present, otherwise
// load, parse, store and count the load.
fn load_stylesheet(&mut self, path: &Path) -> Option<Stylesheet> {
let key = path.to_string_lossy().replace('\\', "/");
if let Some(sheet) = self.css_cache.get(&key) {
return Some(sheet.clone());
}
let css_bytes = self.load_asset(path).ok()?;
let css_str = String::from_utf8_lossy(&css_bytes);
let sheet = Stylesheet::parse(&css_str);
self.css_cache.insert(key, sheet.clone());
self.css_loads += 1;
Some(sheet)
}
}
let mut importer = TestImporter {
chapters: HashMap::from([
(
0,
r#"<html><head><link rel="stylesheet" href="style.css"></head><body>One</body></html>"#
.to_string(),
),
(
1,
r#"<html><head><link rel="stylesheet" href="style.css"></head><body>Two</body></html>"#
.to_string(),
),
]),
assets: HashMap::from([(
"text/style.css".to_string(),
b"p { color: red; }".to_vec(),
)]),
asset_list: vec![PathBuf::from("text/style.css")],
css_cache: HashMap::new(),
css_loads: 0,
metadata: Metadata::default(),
toc: Vec::new(),
landmarks: Vec::new(),
spine: vec![
SpineEntry {
id: ChapterId(0),
size_estimate: 0,
},
SpineEntry {
id: ChapterId(1),
size_estimate: 0,
},
],
source_ids: vec!["text/ch1.xhtml".to_string(), "text/ch2.xhtml".to_string()],
};
let _ = importer.load_chapter(ChapterId(0)).unwrap();
let _ = importer.load_chapter(ChapterId(1)).unwrap();
// The second chapter must have been served from the cache.
assert_eq!(importer.css_loads, 1);
}
// The default `font_faces` must obtain stylesheets through `load_stylesheet`:
// the mock's `load_asset` always fails, while its `load_stylesheet` override
// returns a fixed `@font-face` rule. The test also checks that the face's
// `src` is resolved relative to the declaring stylesheet's path.
#[test]
fn test_font_faces_uses_load_stylesheet() {
struct TestImporter {
asset_list: Vec<PathBuf>,
metadata: Metadata,
toc: Vec<TocEntry>,
landmarks: Vec<Landmark>,
spine: Vec<SpineEntry>,
}
impl Importer for TestImporter {
// The mock is built with a struct literal; `open` is never called.
fn open(_path: &Path) -> io::Result<Self> {
unreachable!()
}
fn metadata(&self) -> &Metadata {
&self.metadata
}
fn toc(&self) -> &[TocEntry] {
&self.toc
}
fn toc_mut(&mut self) -> &mut [TocEntry] {
&mut self.toc
}
fn landmarks(&self) -> &[Landmark] {
&self.landmarks
}
fn spine(&self) -> &[SpineEntry] {
&self.spine
}
fn source_id(&self, _id: ChapterId) -> Option<&str> {
None
}
fn load_raw(&mut self, _id: ChapterId) -> io::Result<Vec<u8>> {
Err(io::Error::new(io::ErrorKind::Other, "unused"))
}
fn list_assets(&self) -> &[PathBuf] {
&self.asset_list
}
// Always fails, so a font face can only come from the `load_stylesheet`
// override below.
fn load_asset(&mut self, _path: &Path) -> io::Result<Vec<u8>> {
Err(io::Error::new(
io::ErrorKind::Other,
"load_asset should not be called",
))
}
fn load_stylesheet(&mut self, _path: &Path) -> Option<Stylesheet> {
let css = "@font-face { font-family: Test; src: url(../fonts/test.woff); }";
Some(Stylesheet::parse(css))
}
}
let mut importer = TestImporter {
asset_list: vec![PathBuf::from("styles/main.css")],
metadata: Metadata::default(),
toc: Vec::new(),
landmarks: Vec::new(),
spine: Vec::new(),
};
let fonts = importer.font_faces();
assert_eq!(fonts.len(), 1);
assert_eq!(fonts[0].font_family, "Test");
// `../fonts/test.woff` relative to `styles/main.css`.
assert_eq!(fonts[0].src, "fonts/test.woff");
}
proptest! {
// For any base directory, target path and number of leading `../` segments,
// the fragment stays at the end of the resolved path and the normalised form
// contains no backslashes.
#[test]
fn prop_resolve_relative_path_preserves_fragment_and_no_backslashes(
base_parts in prop::collection::vec("[a-z]{1,8}", 1..5),
target_parts in prop::collection::vec("[a-z]{1,8}", 1..5),
fragment in "[A-Za-z0-9_-]{1,12}",
up_levels in 0usize..3
) {
let mut base = base_parts.join("/");
base.push_str("/chapter.xhtml");
let mut target = String::new();
for _ in 0..up_levels {
target.push_str("../");
}
target.push_str(&target_parts.join("/"));
target.push_str(".xhtml#");
target.push_str(&fragment);
let resolved = resolve_relative_path(&base, &target);
let normalized = resolved.to_string_lossy().replace('\\', "/");
let expected_fragment = format!("#{}", fragment);
prop_assert!(normalized.ends_with(&expected_fragment));
prop_assert!(!normalized.contains('\\'));
}
// References starting with `/` and `https://` URLs come back unchanged,
// whatever the base is.
#[test]
fn prop_resolve_relative_path_preserves_absolute_and_urls(
base_parts in prop::collection::vec("[a-z]{1,8}", 1..5),
absolute in "[A-Za-z0-9/_\\-]{1,24}",
path in "[A-Za-z0-9/_\\-]{1,24}",
) {
let mut base = base_parts.join("/");
base.push_str("/chapter.xhtml");
let absolute_path = format!("/{}", absolute);
let url = format!("https://example.com/{}", path);
let resolved_abs = resolve_relative_path(&base, &absolute_path);
prop_assert_eq!(resolved_abs.to_string_lossy(), absolute_path);
let resolved_url = resolve_relative_path(&base, &url);
prop_assert_eq!(resolved_url.to_string_lossy(), url);
}
// With at least two base directories and at most one leading `../`, the
// resolved path contains no `/../` segment.
#[test]
fn prop_resolve_relative_path_eliminates_dotdot(
base_parts in prop::collection::vec("[a-z]{1,8}", 2..5),
target_parts in prop::collection::vec("[a-z]{1,8}", 1..4),
up_levels in 0usize..2
) {
let mut base = base_parts.join("/");
base.push_str("/chapter.xhtml");
let mut target = String::new();
for _ in 0..up_levels {
target.push_str("../");
}
target.push_str(&target_parts.join("/"));
target.push_str(".xhtml");
let resolved = resolve_relative_path(&base, &target);
let normalized = resolved.to_string_lossy().replace('\\', "/");
prop_assert!(!normalized.contains("/../"));
}
// A fragment-only reference always resolves to `base` followed by the
// fragment.
#[test]
fn prop_resolve_fragment_only_appends_to_base(
base_parts in prop::collection::vec("[a-z]{1,8}", 1..5),
fragment in "[A-Za-z0-9_-]{1,12}"
) {
let mut base = base_parts.join("/");
base.push_str("/chapter.xhtml");
let target = format!("#{}", fragment);
let resolved = resolve_relative_path(&base, &target);
let normalized = resolved.to_string_lossy().replace('\\', "/");
let expected = format!("{}#{}", base, fragment);
prop_assert_eq!(normalized, expected);
}
}
}