use std::io::{Cursor, Read};
use scraper::{Html, Selector};
use super::super::LoaderError;
/// Extracts the readable text of an EPUB supplied as raw bytes.
///
/// The bytes are opened as a ZIP archive, the OPF package document is located
/// through `META-INF/container.xml`, and every spine item is read in order and
/// reduced to plain text. Non-empty chapter texts are joined with a blank line.
///
/// Chapters that cannot be read from the archive are skipped rather than
/// aborting the whole extraction.
///
/// # Errors
///
/// Returns [`LoaderError::ExtractionFailed`] when the bytes are not a readable
/// ZIP archive, and propagates any error from locating, reading, or parsing
/// the OPF package document.
pub(crate) fn extract(bytes: &[u8]) -> Result<String, LoaderError> {
    let mut archive = zip::ZipArchive::new(Cursor::new(bytes))
        .map_err(|e| LoaderError::ExtractionFailed(format!("Failed to open EPUB as ZIP: {e}")))?;

    let opf_path = find_opf_path(&mut archive)?;

    // Directory prefix of the OPF file, trailing '/' included; empty when the
    // OPF sits at the archive root.
    let opf_dir = match opf_path.rfind('/') {
        Some(slash) => opf_path[..=slash].to_string(),
        None => String::new(),
    };

    let opf_content = read_zip_entry(&mut archive, &opf_path)?;
    let chapter_paths = parse_opf_spine(&opf_content, &opf_dir)?;

    let chapters: Vec<String> = chapter_paths
        .iter()
        // Best effort: an unreadable chapter is dropped, not fatal.
        .filter_map(|chapter_path| read_zip_entry(&mut archive, chapter_path).ok())
        .map(|markup| strip_html(&markup).trim().to_string())
        .filter(|chapter_text| !chapter_text.is_empty())
        .collect();

    Ok(chapters.join("\n\n"))
}
/// Reads the archive entry called `name` and returns its contents as a string.
///
/// # Errors
///
/// Returns [`LoaderError::ExtractionFailed`] when the entry cannot be looked up
/// by name, or when its contents cannot be read into a `String`.
fn read_zip_entry(
    archive: &mut zip::ZipArchive<Cursor<&[u8]>>,
    name: &str,
) -> Result<String, LoaderError> {
    let mut entry = match archive.by_name(name) {
        Ok(entry) => entry,
        Err(e) => {
            return Err(LoaderError::ExtractionFailed(format!(
                "Failed to find '{name}' in EPUB: {e}"
            )));
        }
    };

    let mut text = String::new();
    match entry.read_to_string(&mut text) {
        Ok(_) => Ok(text),
        Err(e) => Err(LoaderError::ExtractionFailed(format!(
            "Failed to read '{name}' in EPUB: {e}"
        ))),
    }
}
fn find_opf_path(archive: &mut zip::ZipArchive<Cursor<&[u8]>>) -> Result<String, LoaderError> {
let container_xml = read_zip_entry(archive, "META-INF/container.xml")?;
use quick_xml::events::Event;
use quick_xml::reader::Reader;
let mut reader = Reader::from_str(&container_xml);
loop {
match reader.read_event() {
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e))
if e.local_name().as_ref() == b"rootfile" =>
{
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"full-path" {
let path = String::from_utf8(attr.value.to_vec()).map_err(|e| {
LoaderError::ExtractionFailed(format!("Invalid UTF-8 in OPF path: {e}"))
})?;
return Ok(path);
}
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(LoaderError::ExtractionFailed(format!(
"XML parse error in container.xml: {e}"
)));
}
_ => {}
}
}
Err(LoaderError::ExtractionFailed(
"No rootfile found in META-INF/container.xml".to_string(),
))
}
/// Parses an OPF package document and returns the archive paths of its spine
/// items in reading order.
///
/// `<item>` elements inside `<manifest>` provide an `id` → `href` mapping and
/// `<itemref>` elements inside `<spine>` provide the ordered list of ids. Each
/// spine id found in the manifest is resolved against `opf_dir` (the OPF's
/// directory prefix, either empty or ending in `/`) into a ZIP entry name.
/// Spine ids with no manifest entry are skipped.
///
/// # Errors
///
/// Returns [`LoaderError::ExtractionFailed`] when the OPF is not well-formed
/// XML.
fn parse_opf_spine(opf_xml: &str, opf_dir: &str) -> Result<Vec<String>, LoaderError> {
    use quick_xml::events::Event;
    use quick_xml::reader::Reader;
    use std::collections::HashMap;

    let mut reader = Reader::from_str(opf_xml);
    let mut manifest: HashMap<String, String> = HashMap::new();
    let mut spine_idrefs: Vec<String> = Vec::new();
    let mut in_manifest = false;
    let mut in_spine = false;

    loop {
        match reader.read_event() {
            Ok(Event::Start(ref e)) => {
                let local = e.local_name();
                match local.as_ref() {
                    b"manifest" => in_manifest = true,
                    b"spine" => in_spine = true,
                    b"item" if in_manifest => {
                        parse_manifest_item(e, &mut manifest);
                    }
                    b"itemref" if in_spine => {
                        parse_spine_itemref(e, &mut spine_idrefs);
                    }
                    _ => {}
                }
            }
            // Self-closing elements: only the leaf entries matter here. An
            // empty `<manifest/>` or `<spine/>` has no End event, so it must
            // not flip the section flags.
            Ok(Event::Empty(ref e)) => {
                let local = e.local_name();
                if local.as_ref() == b"item" && in_manifest {
                    parse_manifest_item(e, &mut manifest);
                } else if local.as_ref() == b"itemref" && in_spine {
                    parse_spine_itemref(e, &mut spine_idrefs);
                }
            }
            Ok(Event::End(ref e)) => {
                let local = e.local_name();
                if local.as_ref() == b"manifest" {
                    in_manifest = false;
                } else if local.as_ref() == b"spine" {
                    in_spine = false;
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(LoaderError::ExtractionFailed(format!(
                    "XML parse error in OPF: {e}"
                )));
            }
            _ => {}
        }
    }

    let paths: Vec<String> = spine_idrefs
        .iter()
        .filter_map(|idref| manifest.get(idref))
        .map(|href| resolve_manifest_href(opf_dir, href))
        .collect();
    Ok(paths)
}

/// Resolves a manifest `href` against the OPF directory into a ZIP entry name.
///
/// An href beginning with `/` is taken relative to the archive root and loses
/// its leading slash, since ZIP entry names do not start with one. Any other
/// href is appended to `opf_dir`. `.` segments are dropped and `..` segments
/// remove the preceding segment (never climbing above the archive root), so
/// `OEBPS/` + `../Text/ch1.xhtml` yields `Text/ch1.xhtml`. Paths without such
/// segments come back unchanged.
fn resolve_manifest_href(opf_dir: &str, href: &str) -> String {
    let joined = match href.strip_prefix('/') {
        Some(root_relative) => root_relative.to_string(),
        None => format!("{opf_dir}{href}"),
    };

    let mut segments: Vec<&str> = Vec::new();
    for segment in joined.split('/') {
        match segment {
            "." => {}
            ".." => {
                // Popping an empty stack is a no-op, which clamps at the root.
                segments.pop();
            }
            other => segments.push(other),
        }
    }
    segments.join("/")
}
/// Records one manifest `<item>` as an `id` → `href` entry in `manifest`.
///
/// Nothing is inserted unless both attributes are present and valid UTF-8.
/// Attributes that fail to parse are ignored.
fn parse_manifest_item(
    e: &quick_xml::events::BytesStart<'_>,
    manifest: &mut std::collections::HashMap<String, String>,
) {
    let mut item_id: Option<String> = None;
    let mut item_href: Option<String> = None;

    for attr in e.attributes().flatten() {
        let key = attr.key.local_name();
        // Pick the slot this attribute fills; anything else is irrelevant.
        let slot = match key.as_ref() {
            b"id" => &mut item_id,
            b"href" => &mut item_href,
            _ => continue,
        };
        *slot = String::from_utf8(attr.value.to_vec()).ok();
    }

    if let Some((id, href)) = item_id.zip(item_href) {
        manifest.insert(id, href);
    }
}
fn parse_spine_itemref(e: &quick_xml::events::BytesStart<'_>, spine_idrefs: &mut Vec<String>) {
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"idref"
&& let Ok(idref) = String::from_utf8(attr.value.to_vec())
{
spine_idrefs.push(idref);
}
}
}
/// Reduces an HTML document to its visible text.
///
/// The text nodes under each `body` element are trimmed, emptied ones are
/// dropped, and the rest are joined with single spaces; multiple non-empty
/// bodies are separated by newlines. When no body yields any text, the text
/// nodes of the whole document are flattened the same way instead.
fn strip_html(html: &str) -> String {
    /// Trims each text node, discards blank ones, and joins the rest with spaces.
    fn flatten_text<'a>(nodes: impl Iterator<Item = &'a str>) -> String {
        nodes
            .map(str::trim)
            .filter(|piece| !piece.is_empty())
            .collect::<Vec<&str>>()
            .join(" ")
    }

    let document = Html::parse_document(html);

    if let Ok(body) = Selector::parse("body") {
        let sections: Vec<String> = document
            .select(&body)
            .map(|element| flatten_text(element.text()))
            .filter(|section| !section.is_empty())
            .collect();
        if !sections.is_empty() {
            return sections.join("\n");
        }
    }

    // Fallback: no usable body text, so flatten the whole document.
    flatten_text(document.root_element().text())
}
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    reason = "test code — panics are acceptable failures"
)]
mod tests {
    use super::*;

    /// Text from sibling paragraphs survives stripping.
    #[test]
    fn strip_html_basic() {
        let stripped = strip_html("<html><body><p>Hello</p><p>World</p></body></html>");
        for word in ["Hello", "World"] {
            assert!(stripped.contains(word));
        }
    }

    /// Inline markup is removed while the text it wraps is kept.
    #[test]
    fn strip_html_with_tags() {
        let stripped =
            strip_html("<html><body><p><b>Bold</b> and <i>italic</i></p></body></html>");
        for kept in ["Bold", "italic"] {
            assert!(stripped.contains(kept));
        }
        for tag in ["<b>", "<i>"] {
            assert!(!stripped.contains(tag));
        }
    }

    /// Bytes that are not a ZIP archive are reported as an extraction failure.
    #[test]
    fn invalid_zip_returns_error() {
        let outcome = extract(b"not a zip file");
        assert!(outcome.is_err());
        let err = outcome.unwrap_err();
        assert!(
            matches!(err, LoaderError::ExtractionFailed(_)),
            "expected ExtractionFailed, got {err:?}"
        );
    }

    /// Empty input is rejected rather than producing empty text.
    #[test]
    fn empty_bytes_returns_error() {
        assert!(extract(b"").is_err());
    }
}