mod content;
mod metadata;
mod parsing;
use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::plugins::{DocumentExtractor, Plugin};
use crate::types::{ExtractionResult, Metadata};
use ahash::AHashMap;
use async_trait::async_trait;
use std::borrow::Cow;
use std::io::Cursor;
use zip::ZipArchive;
use content::extract_content;
use metadata::extract_metadata;
use parsing::{parse_container_xml, read_file_from_zip};
/// Extractor plugin for EPUB documents.
///
/// This is a field-less unit struct: it carries no state of its own, so every
/// instance is interchangeable with any other.
pub struct EpubExtractor;

impl EpubExtractor {
    /// Creates a new extractor instance.
    pub fn new() -> Self {
        EpubExtractor
    }
}
impl Default for EpubExtractor {
fn default() -> Self {
Self::new()
}
}
impl Plugin for EpubExtractor {
    /// Identifier reported for this plugin.
    fn name(&self) -> &str {
        "epub-extractor"
    }

    /// Human-readable summary of what the plugin does.
    fn description(&self) -> &str {
        "Extracts content and metadata from EPUB documents (native Rust implementation with permissive licenses)"
    }

    /// Author attribution for the plugin.
    fn author(&self) -> &str {
        "Kreuzberg Team"
    }

    /// Plugin version, taken from the crate version baked in at compile time.
    fn version(&self) -> String {
        String::from(env!("CARGO_PKG_VERSION"))
    }

    /// No setup work is performed; always returns `Ok(())`.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// No teardown work is performed; always returns `Ok(())`.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
// EPUB implementation of `DocumentExtractor`. Only compiled when the `office`
// feature is enabled; on wasm32 the async trait is generated without the
// `Send` bound.
#[cfg(feature = "office")]
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl DocumentExtractor for EpubExtractor {
    /// Extracts text content and metadata from an in-memory EPUB file.
    ///
    /// The bytes are opened as a ZIP archive, `META-INF/container.xml` is read
    /// to locate the OPF package document, and the content and metadata are
    /// then extracted with the helpers from the `content` and `metadata`
    /// submodules. `mime_type` is copied into the result unchanged and
    /// `_config` is not consulted.
    ///
    /// # Errors
    ///
    /// Returns `KreuzbergError::Parsing` when the bytes cannot be opened as a
    /// ZIP archive, and propagates any error produced while reading archive
    /// entries, parsing the container file, or extracting content/metadata.
    #[cfg_attr(
        feature = "otel",
        tracing::instrument(
            skip(self, content, _config),
            fields(
                extractor.name = self.name(),
                content.size_bytes = content.len(),
            )
        )
    )]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        // The input is copied into an owned buffer and wrapped in a cursor so
        // the archive reader has a seekable source.
        let reader = Cursor::new(content.to_vec());
        let mut zip = ZipArchive::new(reader).map_err(|err| crate::KreuzbergError::Parsing {
            message: format!("Failed to open EPUB as ZIP: {}", err),
            source: None,
        })?;

        // The container file names the path of the OPF package document.
        let container = read_file_from_zip(&mut zip, "META-INF/container.xml")?;
        let package_path = parse_container_xml(&container)?;

        // Directory portion of the OPF path (everything before the last '/');
        // empty when the OPF sits at the archive root.
        let manifest_dir = package_path
            .rsplit_once('/')
            .map(|(dir, _file)| dir.to_string())
            .unwrap_or_default();

        let package_xml = read_file_from_zip(&mut zip, &package_path)?;
        let text = extract_content(&mut zip, &package_path, &manifest_dir)?;
        let (core, extra) = extract_metadata(&package_xml)?;

        // Re-key the extra metadata with owned `Cow` keys as required by
        // `Metadata::additional`.
        let additional: AHashMap<Cow<'static, str>, serde_json::Value> = extra
            .into_iter()
            .map(|(key, value)| (Cow::Owned(key), value))
            .collect();

        let metadata = Metadata {
            title: core.title,
            // A single creator entry becomes a one-element author list.
            authors: core.creator.map(|creator| vec![creator]),
            language: core.language,
            created_at: core.date,
            additional,
            ..Default::default()
        };

        Ok(ExtractionResult {
            content: text,
            mime_type: mime_type.to_owned().into(),
            metadata,
            pages: None,
            tables: Vec::new(),
            detected_languages: None,
            chunks: None,
            images: None,
            djot_content: None,
            elements: None,
            ocr_elements: None,
            document: None,
            #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
            extracted_keywords: None,
            quality_score: None,
            processing_warnings: Vec::new(),
            annotations: None,
        })
    }

    /// MIME types this extractor reports as supported.
    fn supported_mime_types(&self) -> &[&str] {
        &[
            "application/epub+zip",
            "application/x-epub+zip",
            "application/vnd.epub+zip",
        ]
    }

    /// Priority value reported for this extractor.
    fn priority(&self) -> i32 {
        60
    }
}
// Unit tests for the plugin-facing surface of `EpubExtractor`; they need the
// `office` feature because they call `DocumentExtractor` methods.
#[cfg(all(test, feature = "office"))]
mod tests {
    use super::*;

    /// Name, version, priority and MIME list are all reported as expected.
    #[test]
    fn test_epub_extractor_plugin_interface() {
        let epub = EpubExtractor::new();
        assert_eq!(epub.name(), "epub-extractor");
        assert_eq!(epub.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(epub.priority(), 60);
        let mime_types = epub.supported_mime_types();
        assert!(!mime_types.is_empty());
    }

    /// A directly constructed unit value reports the expected plugin name.
    #[test]
    fn test_epub_extractor_default() {
        let epub = EpubExtractor;
        assert_eq!(epub.name(), "epub-extractor");
    }

    /// Both lifecycle hooks succeed.
    #[tokio::test]
    async fn test_epub_extractor_initialize_shutdown() {
        let epub = EpubExtractor::new();
        let started = epub.initialize();
        assert!(started.is_ok());
        let stopped = epub.shutdown();
        assert!(stopped.is_ok());
    }

    /// Every known EPUB MIME type variant is listed as supported.
    #[test]
    fn test_epub_extractor_supported_mime_types() {
        let epub = EpubExtractor::new();
        let supported = epub.supported_mime_types();
        for expected in [
            "application/epub+zip",
            "application/x-epub+zip",
            "application/vnd.epub+zip",
        ] {
            assert!(supported.contains(&expected), "missing MIME type: {expected}");
        }
    }
}