kreuzberg 4.8.4

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 91+ formats and 248 programming languages via tree-sitter code intelligence with async/sync APIs.
Documentation
//! Metadata extraction from EPUB OPF files.
//!
//! Handles parsing of OPF (Open Packaging Format) files and extraction of
//! Dublin Core metadata following EPUB2 and EPUB3 standards.

use crate::Result;
use crate::types::ProcessingWarning;
use roxmltree;
use std::collections::{BTreeMap, BTreeSet};

use super::parsing::resolve_path;

/// Metadata extracted from OPF (Open Packaging Format) file
#[derive(Debug, Default, Clone)]
pub(super) struct OepbMetadata {
    pub(super) title: Option<String>,
    pub(super) creator: Option<String>,
    pub(super) date: Option<String>,
    pub(super) language: Option<String>,
    pub(super) identifier: Option<String>,
    pub(super) publisher: Option<String>,
    pub(super) subject: Option<String>,
    pub(super) description: Option<String>,
    pub(super) rights: Option<String>,
    pub(super) coverage: Option<String>,
    pub(super) format: Option<String>,
    pub(super) relation: Option<String>,
    pub(super) source: Option<String>,
    pub(super) dc_type: Option<String>,
    pub(super) cover_image_href: Option<String>,
}

#[derive(Debug, Clone)]
pub(super) struct EpubPackageDocument {
    pub(super) metadata: OepbMetadata,
    pub(super) manifest: BTreeMap<String, ManifestItem>,
    pub(super) spine_items: Vec<EpubSpineItem>,
    guide_toc_paths: BTreeSet<String>,
}

#[allow(dead_code)]
impl EpubPackageDocument {
    pub(super) fn is_guide_toc_candidate_path(&self, path: &str) -> bool {
        self.guide_toc_paths.contains(path)
    }
}

#[derive(Debug, Clone, PartialEq, Eq)]
/// A spine entry extracted from the OPF package document.
pub(super) struct EpubSpineItem {
    pub(super) idref: String,
}

#[derive(Debug, Clone)]
/// Manifest metadata used to enrich spine entries after OPF parsing.
pub(super) struct ManifestItem {
    pub(super) raw_href: String,
    pub(super) path: Option<String>,
    path_resolution_error: Option<String>,
    #[allow(dead_code)]
    pub(super) media_type: Option<String>,
    #[allow(dead_code)]
    pub(super) fallback: Option<String>,
    pub(super) properties: Option<String>,
}

#[allow(dead_code)]
impl ManifestItem {
    pub(super) fn is_renderable_body_document(&self) -> bool {
        matches!(
            self.media_type.as_deref(),
            Some("application/xhtml+xml") | Some("application/x-dtbook+xml")
        ) || self.media_type.is_none() && has_renderable_extension(&self.raw_href)
    }

    /// Returns true if this manifest item has the EPUB3 `nav` property.
    pub(super) fn is_nav(&self) -> bool {
        self.properties
            .as_deref()
            .is_some_and(|p| p.split_ascii_whitespace().any(|v| v.eq_ignore_ascii_case("nav")))
    }

    pub(super) fn resolved_path(&self) -> std::result::Result<&str, String> {
        self.path.as_deref().ok_or_else(|| {
            self.path_resolution_error
                .clone()
                .unwrap_or_else(|| format!("unable to resolve manifest href '{}'", self.raw_href))
        })
    }
}

#[allow(dead_code)]
fn has_renderable_extension(href: &str) -> bool {
    let href = href
        .split_once('#')
        .map(|(path, _)| path)
        .unwrap_or(href)
        .rsplit('/')
        .next()
        .unwrap_or(href);

    href.rsplit_once('.')
        .map(|(_, ext)| {
            matches!(
                ext.to_ascii_lowercase().as_str(),
                "xhtml" | "html" | "htm" | "xml" | "dtbook"
            )
        })
        .unwrap_or(false)
}

/// Parse OPF file and extract metadata and spine order
pub(super) fn parse_opf(xml: &str, opf_dir: &str) -> Result<(EpubPackageDocument, Vec<ProcessingWarning>)> {
    match roxmltree::Document::parse(xml) {
        Ok(doc) => {
            let root = doc.root();

            let mut warnings = Vec::new();
            let mut package = EpubPackageDocument {
                metadata: OepbMetadata::default(),
                manifest: BTreeMap::new(),
                spine_items: Vec::new(),
                guide_toc_paths: BTreeSet::new(),
            };
            let mut manifest: BTreeMap<String, ManifestItem> = BTreeMap::new();

            for node in root.descendants() {
                match node.tag_name().name() {
                    "title" => {
                        if let Some(text) = node.text() {
                            package.metadata.title = Some(text.trim().to_string());
                        }
                    }
                    "creator" => {
                        if let Some(text) = node.text() {
                            package.metadata.creator = Some(text.trim().to_string());
                        }
                    }
                    "date" => {
                        if let Some(text) = node.text() {
                            package.metadata.date = Some(text.trim().to_string());
                        }
                    }
                    "language" => {
                        if let Some(text) = node.text() {
                            package.metadata.language = Some(text.trim().to_string());
                        }
                    }
                    "identifier" => {
                        if let Some(text) = node.text() {
                            package.metadata.identifier = Some(text.trim().to_string());
                        }
                    }
                    "publisher" => {
                        if let Some(text) = node.text() {
                            package.metadata.publisher = Some(text.trim().to_string());
                        }
                    }
                    "subject" => {
                        if let Some(text) = node.text() {
                            package.metadata.subject = Some(text.trim().to_string());
                        }
                    }
                    "description" => {
                        if let Some(text) = node.text() {
                            package.metadata.description = Some(text.trim().to_string());
                        }
                    }
                    "rights" => {
                        if let Some(text) = node.text() {
                            package.metadata.rights = Some(text.trim().to_string());
                        }
                    }
                    "coverage" => {
                        if let Some(text) = node.text() {
                            package.metadata.coverage = Some(text.trim().to_string());
                        }
                    }
                    "format" => {
                        if let Some(text) = node.text() {
                            package.metadata.format = Some(text.trim().to_string());
                        }
                    }
                    "relation" => {
                        if let Some(text) = node.text() {
                            package.metadata.relation = Some(text.trim().to_string());
                        }
                    }
                    "source" => {
                        if let Some(text) = node.text() {
                            package.metadata.source = Some(text.trim().to_string());
                        }
                    }
                    "type" => {
                        if let Some(text) = node.text() {
                            package.metadata.dc_type = Some(text.trim().to_string());
                        }
                    }
                    "item" => {
                        if let Some(id) = node.attribute("id")
                            && let Some(href) = node.attribute("href")
                        {
                            let (path, path_resolution_error) = match resolve_path(opf_dir, href) {
                                Ok(resolved_href) => (Some(resolved_href.path), None),
                                Err(err) => (None, Some(err.to_string())),
                            };
                            manifest.insert(
                                id.to_string(),
                                ManifestItem {
                                    raw_href: href.to_string(),
                                    path,
                                    path_resolution_error,
                                    media_type: node.attribute("media-type").map(ToString::to_string),
                                    fallback: node.attribute("fallback").map(ToString::to_string),
                                    properties: node.attribute("properties").map(ToString::to_string),
                                },
                            );
                        }
                    }
                    "reference" => {
                        if node
                            .attribute("type")
                            .is_some_and(|kind| kind.eq_ignore_ascii_case("toc"))
                            && let Some(href) = node.attribute("href")
                        {
                            match resolve_path(opf_dir, href) {
                                Ok(resolved_href) => {
                                    package.guide_toc_paths.insert(resolved_href.path);
                                }
                                Err(e) => {
                                    warnings.push(ProcessingWarning {
                                        source: std::borrow::Cow::Borrowed("epub"),
                                        message: std::borrow::Cow::Owned(format!(
                                            "Skipping malformed guide reference '{}': {}",
                                            href, e
                                        )),
                                    });
                                }
                            }
                        }
                    }
                    _ => {}
                }
            }

            // Find cover image via <meta name="cover" content="item-id"/>
            let mut cover_item_id = None;
            for node in root.descendants() {
                if node.tag_name().name() == "meta"
                    && node.attribute("name") == Some("cover")
                    && let Some(content) = node.attribute("content")
                {
                    cover_item_id = Some(content.to_string());
                    break;
                }
            }

            if let Some(cover_id) = cover_item_id
                && let Some(href) = manifest.get(&cover_id)
                && let Ok(path) = href.resolved_path()
            {
                package.metadata.cover_image_href = Some(path.to_string());
            }

            for node in root.descendants() {
                if node.tag_name().name() == "itemref"
                    && let Some(idref) = node.attribute("idref")
                {
                    package.spine_items.push(EpubSpineItem {
                        idref: idref.to_string(),
                    });
                }
            }

            package.manifest = manifest;
            Ok((package, warnings))
        }
        Err(e) => Err(crate::KreuzbergError::Parsing {
            message: format!("Failed to parse OPF file: {}", e),
            source: None,
        }),
    }
}

/// Convert parsed EPUB metadata into the extractor's generic metadata map.
pub(super) fn build_additional_metadata(epub_metadata: &OepbMetadata) -> BTreeMap<String, serde_json::Value> {
    let mut additional_metadata = BTreeMap::new();

    if let Some(ref identifier) = epub_metadata.identifier {
        additional_metadata.insert("identifier".to_string(), serde_json::json!(identifier.clone()));
    }

    if let Some(ref publisher) = epub_metadata.publisher {
        additional_metadata.insert("publisher".to_string(), serde_json::json!(publisher.clone()));
    }

    if let Some(ref subject) = epub_metadata.subject {
        additional_metadata.insert("subject".to_string(), serde_json::json!(subject.clone()));
    }

    if let Some(ref description) = epub_metadata.description {
        additional_metadata.insert("description".to_string(), serde_json::json!(description.clone()));
    }

    if let Some(ref rights) = epub_metadata.rights {
        additional_metadata.insert("rights".to_string(), serde_json::json!(rights.clone()));
    }

    additional_metadata
}