use std::borrow::Cow;
use ahash::AHashMap;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::collections::{BTreeMap, HashMap};
#[cfg(feature = "pdf")]
use crate::pdf::metadata::PdfMetadata;
use super::formats::ImagePreprocessingMetadata;
use super::page::PageStructure;
/// Serde adapter for the flattened `additional` map on `Metadata`.
///
/// The in-memory map is keyed by `Cow<'static, str>`; on the wire the keys are
/// plain strings, and they are read back as owned strings.
mod additional_serde {
    use super::*;

    /// Serializes `map` as a string-keyed map.
    ///
    /// Entries are streamed to the serializer by reference through
    /// `Serializer::collect_map`, so no intermediate `HashMap` is allocated
    /// and no key or JSON value is cloned. The emitted data is the same as
    /// serializing a `HashMap<String, serde_json::Value>` with equal entries.
    ///
    /// # Errors
    ///
    /// Returns whatever error the underlying serializer reports.
    pub fn serialize<S>(map: &AHashMap<Cow<'static, str>, serde_json::Value>, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // `Cow<str>` serializes exactly like `str`, so the borrowed entries can
        // be handed to the serializer as-is.
        serializer.collect_map(map.iter())
    }

    /// Deserializes a string-keyed map and wraps every key in `Cow::Owned`.
    ///
    /// # Errors
    ///
    /// Returns the deserializer's error if the input is not a map of strings
    /// to JSON values.
    pub fn deserialize<'de, D>(deserializer: D) -> Result<AHashMap<Cow<'static, str>, serde_json::Value>, D::Error>
    where
        D: Deserializer<'de>,
    {
        let owned = HashMap::<String, serde_json::Value>::deserialize(deserializer)?;
        Ok(owned
            .into_iter()
            .map(|(key, value)| (Cow::Owned(key), value))
            .collect())
    }
}
/// Format-specific metadata, one variant per document format.
///
/// Serialized as an internally tagged enum: the variant is identified by a
/// `format_type` field whose value is the snake_case variant name.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[serde(tag = "format_type", rename_all = "snake_case")]
pub enum FormatMetadata {
/// PDF metadata; only compiled when the `pdf` feature is enabled.
#[cfg(feature = "pdf")]
Pdf(PdfMetadata),
// NOTE(review): presumably boxed to keep the enum small — confirm.
/// DOCX metadata (boxed); only compiled when the `office` feature is enabled.
#[cfg(feature = "office")]
Docx(Box<DocxMetadata>),
/// Excel workbook metadata.
Excel(ExcelMetadata),
/// Email message metadata.
Email(EmailMetadata),
/// PowerPoint presentation metadata.
Pptx(PptxMetadata),
/// Archive metadata.
Archive(ArchiveMetadata),
/// Image metadata.
Image(ImageMetadata),
/// XML metadata.
Xml(XmlMetadata),
/// Plain-text metadata.
Text(TextMetadata),
/// HTML metadata (boxed); the API schema is generated from the unboxed `HtmlMetadata`.
#[cfg_attr(feature = "api", schema(value_type = HtmlMetadata))]
Html(Box<HtmlMetadata>),
/// OCR metadata.
Ocr(OcrMetadata),
}
/// Document metadata record.
///
/// Every `Option` field is left out of the serialized output when it is
/// `None`. The `format` and `additional` fields are flattened, so their
/// contents appear at the top level of the serialized object rather than
/// under their own keys.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct Metadata {
/// Document title.
#[serde(skip_serializing_if = "Option::is_none")]
pub title: Option<String>,
/// Document subject.
#[serde(skip_serializing_if = "Option::is_none")]
pub subject: Option<String>,
/// Document authors.
#[serde(skip_serializing_if = "Option::is_none")]
pub authors: Option<Vec<String>>,
/// Document keywords.
#[serde(skip_serializing_if = "Option::is_none")]
pub keywords: Option<Vec<String>>,
// NOTE(review): no language-code format is enforced by this type — confirm what producers emit.
/// Document language.
#[serde(skip_serializing_if = "Option::is_none")]
pub language: Option<String>,
// NOTE(review): stored as a free-form string; timestamp format is not enforced here — confirm.
/// Creation timestamp, as a string.
#[serde(skip_serializing_if = "Option::is_none")]
pub created_at: Option<String>,
/// Last-modification timestamp, as a string.
#[serde(skip_serializing_if = "Option::is_none")]
pub modified_at: Option<String>,
/// Creator of the document.
#[serde(skip_serializing_if = "Option::is_none")]
pub created_by: Option<String>,
/// Last modifier of the document.
#[serde(skip_serializing_if = "Option::is_none")]
pub modified_by: Option<String>,
/// Page structure information.
#[serde(skip_serializing_if = "Option::is_none")]
pub pages: Option<PageStructure>,
/// Format-specific metadata. Flattened: the `format_type` tag and the
/// variant's own fields are written at the top level of this object.
#[serde(flatten, skip_serializing_if = "Option::is_none")]
#[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
pub format: Option<FormatMetadata>,
/// Image preprocessing metadata.
#[serde(skip_serializing_if = "Option::is_none")]
pub image_preprocessing: Option<ImagePreprocessingMetadata>,
// NOTE(review): what this schema describes is not visible in this file — confirm.
/// A JSON schema, stored as an arbitrary JSON value.
#[serde(skip_serializing_if = "Option::is_none")]
pub json_schema: Option<serde_json::Value>,
/// Error information.
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<ErrorMetadata>,
/// Extraction duration; the field name indicates milliseconds.
#[serde(skip_serializing_if = "Option::is_none")]
pub extraction_duration_ms: Option<u64>,
/// Document category.
#[serde(skip_serializing_if = "Option::is_none")]
pub category: Option<String>,
/// Document tags.
#[serde(skip_serializing_if = "Option::is_none")]
pub tags: Option<Vec<String>>,
/// Document version.
#[serde(skip_serializing_if = "Option::is_none")]
pub document_version: Option<String>,
/// Abstract text.
#[serde(skip_serializing_if = "Option::is_none")]
pub abstract_text: Option<String>,
/// Output format name.
#[serde(skip_serializing_if = "Option::is_none")]
pub output_format: Option<String>,
/// Additional key/value entries. Flattened into the top level of the
/// serialized object and (de)serialized through `additional_serde`, which
/// converts between `Cow<'static, str>` keys and plain string keys.
#[serde(
flatten,
serialize_with = "additional_serde::serialize",
deserialize_with = "additional_serde::deserialize"
)]
#[cfg_attr(feature = "api", schema(value_type = HashMap<String, serde_json::Value>))]
pub additional: AHashMap<Cow<'static, str>, serde_json::Value>,
}
/// Payload of the `Excel` variant of `FormatMetadata`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ExcelMetadata {
/// Number of sheets.
pub sheet_count: usize,
// NOTE(review): nothing in this type ties `sheet_names.len()` to `sheet_count` — confirm producers keep them in sync.
/// Sheet names.
pub sheet_names: Vec<String>,
}
/// Payload of the `Email` variant of `FormatMetadata`.
///
/// Optional fields are omitted from serialized output when `None`; the list
/// fields are always written and are required on deserialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct EmailMetadata {
/// Sender email address.
#[serde(skip_serializing_if = "Option::is_none")]
pub from_email: Option<String>,
/// Sender display name.
#[serde(skip_serializing_if = "Option::is_none")]
pub from_name: Option<String>,
/// "To" recipient addresses.
pub to_emails: Vec<String>,
/// "Cc" recipient addresses.
pub cc_emails: Vec<String>,
/// "Bcc" recipient addresses.
pub bcc_emails: Vec<String>,
/// Message identifier.
#[serde(skip_serializing_if = "Option::is_none")]
pub message_id: Option<String>,
// NOTE(review): presumably attachment file names — confirm against the producer.
/// Attachments, one string per attachment.
pub attachments: Vec<String>,
}
/// Payload of the `Archive` variant of `FormatMetadata`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ArchiveMetadata {
/// Archive format name; exposed as a plain string in the API schema.
#[cfg_attr(feature = "api", schema(value_type = String))]
pub format: Cow<'static, str>,
/// Number of files.
pub file_count: usize,
// NOTE(review): presumably entry paths inside the archive — confirm.
/// List of files.
pub file_list: Vec<String>,
// NOTE(review): unit is not stated in this file (presumably bytes) — confirm.
/// Total size.
pub total_size: usize,
/// Compressed size; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub compressed_size: Option<usize>,
}
/// Payload of the `Image` variant of `FormatMetadata`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ImageMetadata {
// NOTE(review): width/height are presumably in pixels — confirm.
/// Image width.
pub width: u32,
/// Image height.
pub height: u32,
/// Image format name.
pub format: String,
/// EXIF entries as string key/value pairs.
pub exif: HashMap<String, String>,
}
/// Payload of the `Xml` variant of `FormatMetadata`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct XmlMetadata {
/// Number of elements.
pub element_count: usize,
// NOTE(review): presumably distinct element (tag) names — confirm.
/// Unique elements.
pub unique_elements: Vec<String>,
}
/// Payload of the `Text` variant of `FormatMetadata`.
///
/// Optional fields are omitted from serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct TextMetadata {
/// Number of lines.
pub line_count: usize,
/// Number of words.
pub word_count: usize,
/// Number of characters.
pub character_count: usize,
/// Headers found in the text.
#[serde(skip_serializing_if = "Option::is_none")]
pub headers: Option<Vec<String>>,
// NOTE(review): the meaning and order of the two tuple elements is not visible here — confirm.
/// Links, each as a pair of strings.
#[serde(skip_serializing_if = "Option::is_none")]
pub links: Option<Vec<(String, String)>>,
// NOTE(review): the meaning and order of the two tuple elements is not visible here — confirm.
/// Code blocks, each as a pair of strings.
#[serde(skip_serializing_if = "Option::is_none")]
pub code_blocks: Option<Vec<(String, String)>>,
}
/// Text direction.
///
/// Every variant carries an explicit `rename`, which takes precedence over the
/// container-level `rename_all`; the serialized values are `"ltr"`, `"rtl"`
/// and `"auto"`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[serde(rename_all = "lowercase")]
pub enum TextDirection {
/// Serialized as `"ltr"`.
#[serde(rename = "ltr")]
LeftToRight,
/// Serialized as `"rtl"`.
#[serde(rename = "rtl")]
RightToLeft,
/// Serialized as `"auto"`.
#[serde(rename = "auto")]
Auto,
}
/// One header entry in `HtmlMetadata::headers`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct HeaderMetadata {
// NOTE(review): presumably the HTML heading level (1-6) — confirm; the range is not enforced here.
/// Header level.
pub level: u8,
/// Header text.
pub text: String,
/// Header id; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub id: Option<String>,
// NOTE(review): how `depth` differs from `level` is not visible in this file — confirm.
/// Header depth.
pub depth: usize,
// NOTE(review): presumably an offset into the source HTML; unit (bytes vs. chars) unknown — confirm.
/// Offset of the header in the HTML.
pub html_offset: usize,
}
/// One link entry in `HtmlMetadata::links`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct LinkMetadata {
/// Link target (`href`).
pub href: String,
/// Link text.
pub text: String,
/// Link title; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub title: Option<String>,
/// Classification of the link.
pub link_type: LinkType,
/// `rel` values.
pub rel: Vec<String>,
// NOTE(review): presumably (name, value) pairs — confirm.
/// Attributes as string pairs.
pub attributes: Vec<(String, String)>,
}
/// Classification of a link.
///
/// Serialized as the lowercase variant name: `"anchor"`, `"internal"`,
/// `"external"`, `"email"`, `"phone"` or `"other"`.
// NOTE(review): the rules that assign a link to each variant live outside this file.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[serde(rename_all = "lowercase")]
pub enum LinkType {
/// Serialized as `"anchor"`.
Anchor,
/// Serialized as `"internal"`.
Internal,
/// Serialized as `"external"`.
External,
/// Serialized as `"email"`.
Email,
/// Serialized as `"phone"`.
Phone,
/// Serialized as `"other"`.
Other,
}
/// One image entry in `HtmlMetadata::images`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ImageMetadataType {
/// Image source (`src`).
pub src: String,
/// Alternative text; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub alt: Option<String>,
/// Image title; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub title: Option<String>,
// NOTE(review): unlike `alt` and `title` this field has no `skip_serializing_if`,
// so `None` is written out explicitly (e.g. `null` in JSON). Tuple is presumably
// (width, height) — confirm both.
/// Image dimensions, when known.
pub dimensions: Option<(u32, u32)>,
/// Classification of the image source.
pub image_type: ImageType,
// NOTE(review): presumably (name, value) pairs — confirm.
/// Attributes as string pairs.
pub attributes: Vec<(String, String)>,
}
/// Classification of an image source.
///
/// Serialized values are `"data-uri"`, `"inline-svg"`, `"external"` and
/// `"relative"`; the first two come from explicit renames, the others from
/// the container-level lowercase rule.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[serde(rename_all = "lowercase")]
pub enum ImageType {
/// Serialized as `"data-uri"`.
#[serde(rename = "data-uri")]
DataUri,
/// Serialized as `"inline-svg"`.
#[serde(rename = "inline-svg")]
InlineSvg,
/// Serialized as `"external"`.
External,
/// Serialized as `"relative"`.
Relative,
}
/// One structured-data entry in `HtmlMetadata::structured_data`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct StructuredData {
/// Kind of structured data.
pub data_type: StructuredDataType,
// NOTE(review): kept as a string; nothing in this file parses or validates it as JSON.
/// Raw JSON text.
pub raw_json: String,
/// Schema type; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub schema_type: Option<String>,
}
/// Kind of structured data.
///
/// Serialized values are `"json-ld"`, `"microdata"` and `"rdfa"`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[serde(rename_all = "lowercase")]
pub enum StructuredDataType {
/// Serialized as `"json-ld"`.
#[serde(rename = "json-ld")]
JsonLd,
/// Serialized as `"microdata"`.
Microdata,
/// Serialized as `"rdfa"`.
#[serde(rename = "rdfa")]
RDFa,
}
/// Payload of the `Html` variant of `FormatMetadata`.
///
/// Optional fields are omitted from serialized output when `None`. List and
/// map fields are marked `#[serde(default)]`, so they deserialize to empty
/// collections when absent from the input.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct HtmlMetadata {
/// Document title.
#[serde(skip_serializing_if = "Option::is_none")]
pub title: Option<String>,
/// Document description.
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// Document keywords.
#[serde(default)]
pub keywords: Vec<String>,
/// Document author.
#[serde(skip_serializing_if = "Option::is_none")]
pub author: Option<String>,
/// Canonical URL.
#[serde(skip_serializing_if = "Option::is_none")]
pub canonical_url: Option<String>,
/// Base `href`.
#[serde(skip_serializing_if = "Option::is_none")]
pub base_href: Option<String>,
/// Document language.
#[serde(skip_serializing_if = "Option::is_none")]
pub language: Option<String>,
/// Text direction.
#[serde(skip_serializing_if = "Option::is_none")]
pub text_direction: Option<TextDirection>,
/// Open Graph entries as string key/value pairs.
#[serde(default)]
pub open_graph: BTreeMap<String, String>,
/// Twitter Card entries as string key/value pairs.
#[serde(default)]
pub twitter_card: BTreeMap<String, String>,
/// Meta tag entries as string key/value pairs.
#[serde(default)]
pub meta_tags: BTreeMap<String, String>,
/// Header entries.
#[serde(default)]
pub headers: Vec<HeaderMetadata>,
/// Link entries.
#[serde(default)]
pub links: Vec<LinkMetadata>,
/// Image entries.
#[serde(default)]
pub images: Vec<ImageMetadataType>,
/// Structured-data entries.
#[serde(default)]
pub structured_data: Vec<StructuredData>,
}
impl HtmlMetadata {
    /// Returns `true` when the value carries no data at all: every optional
    /// field is `None` and every list and map is empty.
    pub fn is_empty(&self) -> bool {
        let optionals_unset = self.title.is_none()
            && self.description.is_none()
            && self.author.is_none()
            && self.canonical_url.is_none()
            && self.base_href.is_none()
            && self.language.is_none()
            && self.text_direction.is_none();

        let maps_empty =
            self.open_graph.is_empty() && self.twitter_card.is_empty() && self.meta_tags.is_empty();

        let lists_empty = self.keywords.is_empty()
            && self.headers.is_empty()
            && self.links.is_empty()
            && self.images.is_empty()
            && self.structured_data.is_empty();

        optionals_unset && maps_empty && lists_empty
    }
}
#[cfg(feature = "html")]
impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
    /// Converts the upstream `html_to_markdown_rs::ExtendedMetadata` into this
    /// crate's `HtmlMetadata`.
    ///
    /// Document-level fields are moved across unchanged; header, link, image
    /// and structured-data entries are rebuilt field by field, with each
    /// upstream enum mapped onto the local enum of the same name.
    fn from(metadata: html_to_markdown_rs::ExtendedMetadata) -> Self {
        /// Maps the upstream text direction onto the local enum.
        fn convert_text_direction(direction: html_to_markdown_rs::TextDirection) -> TextDirection {
            match direction {
                html_to_markdown_rs::TextDirection::LeftToRight => TextDirection::LeftToRight,
                html_to_markdown_rs::TextDirection::RightToLeft => TextDirection::RightToLeft,
                html_to_markdown_rs::TextDirection::Auto => TextDirection::Auto,
            }
        }

        /// Maps the upstream link classification onto the local enum.
        fn convert_link_type(kind: html_to_markdown_rs::LinkType) -> LinkType {
            match kind {
                html_to_markdown_rs::LinkType::Anchor => LinkType::Anchor,
                html_to_markdown_rs::LinkType::Internal => LinkType::Internal,
                html_to_markdown_rs::LinkType::External => LinkType::External,
                html_to_markdown_rs::LinkType::Email => LinkType::Email,
                html_to_markdown_rs::LinkType::Phone => LinkType::Phone,
                html_to_markdown_rs::LinkType::Other => LinkType::Other,
            }
        }

        /// Maps the upstream image classification onto the local enum.
        fn convert_image_type(kind: html_to_markdown_rs::ImageType) -> ImageType {
            match kind {
                html_to_markdown_rs::ImageType::DataUri => ImageType::DataUri,
                html_to_markdown_rs::ImageType::InlineSvg => ImageType::InlineSvg,
                html_to_markdown_rs::ImageType::External => ImageType::External,
                html_to_markdown_rs::ImageType::Relative => ImageType::Relative,
            }
        }

        /// Maps the upstream structured-data kind onto the local enum.
        fn convert_structured_data_type(kind: html_to_markdown_rs::StructuredDataType) -> StructuredDataType {
            match kind {
                html_to_markdown_rs::StructuredDataType::JsonLd => StructuredDataType::JsonLd,
                html_to_markdown_rs::StructuredDataType::Microdata => StructuredDataType::Microdata,
                html_to_markdown_rs::StructuredDataType::RDFa => StructuredDataType::RDFa,
            }
        }

        let document = metadata.document;

        let headers: Vec<HeaderMetadata> = metadata
            .headers
            .into_iter()
            .map(|header| HeaderMetadata {
                level: header.level,
                text: header.text,
                id: header.id,
                depth: header.depth,
                html_offset: header.html_offset,
            })
            .collect();

        let links: Vec<LinkMetadata> = metadata
            .links
            .into_iter()
            .map(|link| LinkMetadata {
                href: link.href,
                text: link.text,
                title: link.title,
                link_type: convert_link_type(link.link_type),
                rel: link.rel,
                attributes: link.attributes.into_iter().collect(),
            })
            .collect();

        let images: Vec<ImageMetadataType> = metadata
            .images
            .into_iter()
            .map(|image| ImageMetadataType {
                src: image.src,
                alt: image.alt,
                title: image.title,
                dimensions: image.dimensions,
                image_type: convert_image_type(image.image_type),
                attributes: image.attributes.into_iter().collect(),
            })
            .collect();

        let structured_data: Vec<StructuredData> = metadata
            .structured_data
            .into_iter()
            .map(|entry| StructuredData {
                data_type: convert_structured_data_type(entry.data_type),
                raw_json: entry.raw_json,
                schema_type: entry.schema_type,
            })
            .collect();

        HtmlMetadata {
            title: document.title,
            description: document.description,
            keywords: document.keywords,
            author: document.author,
            canonical_url: document.canonical_url,
            base_href: document.base_href,
            language: document.language,
            text_direction: document.text_direction.map(convert_text_direction),
            open_graph: document.open_graph,
            twitter_card: document.twitter_card,
            meta_tags: document.meta_tags,
            headers,
            links,
            images,
            structured_data,
        }
    }
}
/// Payload of the `Ocr` variant of `FormatMetadata`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct OcrMetadata {
/// OCR language.
pub language: String,
// NOTE(review): presumably the OCR engine's page segmentation mode — confirm; valid range is not enforced here.
/// `psm` setting.
pub psm: i32,
/// Output format name.
pub output_format: String,
/// Number of tables.
pub table_count: usize,
/// Table row count; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub table_rows: Option<usize>,
/// Table column count; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub table_cols: Option<usize>,
}
/// Error information stored in `Metadata::error`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ErrorMetadata {
// NOTE(review): free-form string; the set of values producers use is not visible in this file.
/// Error type name.
pub error_type: String,
/// Error message.
pub message: String,
}
/// Payload of the `Pptx` variant of `FormatMetadata`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct PptxMetadata {
/// Number of slides.
pub slide_count: usize,
// NOTE(review): nothing in this type ties `slide_names.len()` to `slide_count` — confirm producers keep them in sync.
/// Slide names.
pub slide_names: Vec<String>,
}
/// Payload of the `Docx` variant of `FormatMetadata`; only compiled when the
/// `office` feature is enabled.
///
/// All fields are optional and omitted from serialized output when `None`.
#[cfg(feature = "office")]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct DocxMetadata {
/// Core properties; exposed as a generic object in the API schema.
#[serde(skip_serializing_if = "Option::is_none")]
#[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
pub core_properties: Option<crate::extraction::office_metadata::CoreProperties>,
/// Application properties; exposed as a generic object in the API schema.
#[serde(skip_serializing_if = "Option::is_none")]
#[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
pub app_properties: Option<crate::extraction::office_metadata::DocxAppProperties>,
/// Custom properties as a map from name to arbitrary JSON value.
#[serde(skip_serializing_if = "Option::is_none")]
pub custom_properties: Option<HashMap<String, serde_json::Value>>,
}