use kawat_metadata::DocumentMetadata;
use serde::{Deserialize, Serialize};
use crate::config::ExtractorOptions;
/// A document made up of metadata, a body, and optional auxiliary text fields.
///
/// When (de)serialized with serde, the fields of [`DocumentMetadata`] are
/// flattened into the same level as `body`, `comments` and `raw_text`, and
/// the `text` field is skipped entirely.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Document {
    /// Document metadata; its fields are inlined into the serialized form
    /// rather than nested under a `metadata` key.
    #[serde(flatten)]
    pub metadata: DocumentMetadata,

    /// Body text of the document. This is the field rendered by
    /// `to_formatted_string` and exposed under the `"body"` key by `as_map`.
    pub body: String,

    /// Optional comments text, exposed under the `"comments"` key by `as_map`.
    pub comments: Option<String>,

    /// Optional raw text.
    // NOTE(review): presumably the unprocessed extraction output; confirm
    // against the code that populates it.
    pub raw_text: Option<String>,

    /// Optional text that is never serialized; when deserializing, it is
    /// filled with its `Default` value (`None`).
    // NOTE(review): the relationship between `text`, `raw_text` and `body`
    // is not visible from this file; confirm with the producer.
    #[serde(skip)]
    pub text: Option<String>,
}
impl Document {
    /// Renders this document as a string in the format selected by `options`.
    ///
    /// * `OutputFormat::Txt` with `options.with_metadata` set: delegates to
    ///   `kawat_output::to_txt`, passing the title, author, date, URL and body.
    /// * `OutputFormat::Txt` without metadata: delegates to
    ///   `kawat_output::to_txt_body_only` with the body.
    /// * Any other format: returns a copy of the body unchanged.
    pub fn to_formatted_string(&self, options: &ExtractorOptions) -> String {
        use kawat_output::OutputFormat;

        // Every non-text format falls back to the plain body.
        if !matches!(options.format, OutputFormat::Txt) {
            return self.body.clone();
        }

        if !options.with_metadata {
            return kawat_output::to_txt_body_only(&self.body);
        }

        let meta = &self.metadata;
        kawat_output::to_txt(
            meta.title.as_deref(),
            meta.author.as_deref(),
            meta.date.as_deref(),
            meta.url.as_deref(),
            &self.body,
        )
    }

    /// Returns selected fields of the document as a string-keyed map.
    ///
    /// The map always contains exactly the keys `"title"`, `"author"`,
    /// `"url"`, `"date"`, `"body"` and `"comments"`. The `"body"` entry is
    /// always `Some`; the others mirror the corresponding optional fields.
    /// `raw_text` and `text` are not included. All values are cloned, so the
    /// map is independent of `self`.
    pub fn as_map(&self) -> std::collections::HashMap<String, Option<String>> {
        let meta = &self.metadata;
        std::collections::HashMap::from([
            (String::from("title"), meta.title.clone()),
            (String::from("author"), meta.author.clone()),
            (String::from("url"), meta.url.clone()),
            (String::from("date"), meta.date.clone()),
            (String::from("body"), Some(self.body.clone())),
            (String::from("comments"), self.comments.clone()),
        ])
    }
}