kawat-core 0.1.5

Core extraction cascade orchestrator for kawat
Documentation
//! Extracted document model.
//! Mirrors the `Document` class defined in trafilatura's `settings.py`.

use kawat_metadata::DocumentMetadata;
use serde::{Deserialize, Serialize};

use crate::config::ExtractorOptions;

/// A fully extracted document with text, metadata, and comments.
///
/// The struct derives `Default`, so an empty document can be built with
/// `Document::default()`, and `Serialize`/`Deserialize` for serde round-trips.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Document {
    /// Extracted metadata.
    ///
    /// Flattened by serde: the metadata fields are (de)serialized at the same
    /// level as the other fields of this struct rather than under a nested
    /// `metadata` key.
    #[serde(flatten)]
    pub metadata: DocumentMetadata,
    /// Main body text (internal representation).
    pub body: String,
    /// Extracted comments (if enabled); `None` when no comments are stored.
    pub comments: Option<String>,
    /// Raw extracted text (before formatting); `None` when not stored.
    pub raw_text: Option<String>,
    /// Formatted output string.
    ///
    /// Skipped by serde: never written on serialization, and reset to its
    /// default (`None`) on deserialization.
    #[serde(skip)]
    pub text: Option<String>,
}

impl Document {
    /// Render this document as a string in the output format chosen by `options`.
    ///
    /// For plain-text output the work is delegated to `kawat_output`: when
    /// `options.with_metadata` is set, the title, author, date and URL are
    /// passed to `to_txt` together with the body; otherwise only the body is
    /// passed to `to_txt_body_only`. Every other format currently returns an
    /// unmodified copy of `body`.
    pub fn to_formatted_string(&self, options: &ExtractorOptions) -> String {
        use kawat_output::OutputFormat;

        // TODO: implement other output formats
        if !matches!(options.format, OutputFormat::Txt) {
            return self.body.clone();
        }

        if !options.with_metadata {
            return kawat_output::to_txt_body_only(&self.body);
        }

        let meta = &self.metadata;
        kawat_output::to_txt(
            meta.title.as_deref(),
            meta.author.as_deref(),
            meta.date.as_deref(),
            meta.url.as_deref(),
            &self.body,
        )
    }

    /// Build a string-keyed map of the document for Python-dict-like access.
    ///
    /// The map holds exactly six entries: `title`, `author`, `url`, `date`,
    /// `body` and `comments`. `body` is always `Some`; the other values are
    /// cloned as-is and may be `None`.
    pub fn as_map(&self) -> std::collections::HashMap<String, Option<String>> {
        let meta = &self.metadata;
        std::collections::HashMap::from([
            (String::from("title"), meta.title.clone()),
            (String::from("author"), meta.author.clone()),
            (String::from("url"), meta.url.clone()),
            (String::from("date"), meta.date.clone()),
            (String::from("body"), Some(self.body.clone())),
            (String::from("comments"), self.comments.clone()),
        ])
    }
}