use serde::{Deserialize, Serialize};
/// Metadata fields shared by every format handled in this file.
///
/// Each format-specific struct embeds this type through `#[serde(flatten)]`,
/// so these keys sit at the same level as the format-specific keys in the
/// serialized form. Every field is optional.
///
/// NOTE(review): the field types (for example the string-typed `coordinates`
/// and `emphasized_text_*`) are assumed to match what the producer emits —
/// confirm against the upstream schema.
#[derive(Debug, Default, Deserialize, Serialize, PartialEq)]
pub struct CommonMetadata {
/// File name, e.g. `example.pdf` in this file's tests.
pub filename: Option<String>,
/// Directory of the file, e.g. `/documents` in this file's tests.
pub file_directory: Option<String>,
/// Last-modified stamp kept as an unparsed string (e.g. `2023-10-01`);
/// no date format is enforced by this type.
pub last_modified: Option<String>,
/// File type string (the known values in this file are MIME types).
///
/// When a payload is parsed as [`ExtendedMetadata`], the `filetype` key is
/// used as the enum tag and this field is left `None`; it is only filled
/// from the payload when the value is parsed directly as `CommonMetadata`
/// (the `UnknownFormat` case of [`Metadata`]).
pub filetype: Option<String>,
/// Coordinates kept as an opaque string (e.g. `100,100,200,200`); the value
/// is not interpreted anywhere in this file.
pub coordinates: Option<String>,
/// Identifier of a parent, kept as a string — presumably the parent
/// element's id; TODO confirm.
pub parent_id: Option<String>,
/// Depth value — presumably the nesting depth within a category hierarchy;
/// TODO confirm.
pub category_depth: Option<u32>,
/// HTML form of the text, e.g. `<p>Example</p>` in this file's tests.
pub text_as_html: Option<String>,
/// List of language codes, e.g. `["en", "fr"]` in this file's tests.
pub languages: Option<Vec<String>>,
/// Emphasized text, kept as a single string.
pub emphasized_text_contents: Option<String>,
/// Tags associated with the emphasized text, kept as a single string
/// (e.g. `<b>` in this file's tests).
pub emphasized_text_tags: Option<String>,
/// Continuation flag — presumably marks content that continues a previous
/// element; TODO confirm.
pub is_continuation: Option<bool>,
/// Detection class probabilities as a list of floats, e.g. `[0.1, 0.9]` in
/// this file's tests.
pub detection_class_prob: Option<Vec<f64>>,
}
/// Metadata for formats that only add a page number to the common fields.
///
/// Used as the payload of the `PdfPage`, `DocxPage` and `PptPage` variants of
/// [`ExtendedMetadata`].
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct PagedDocument {
/// Shared fields, flattened into the same map as `page_number`.
#[serde(flatten)]
pub common: CommonMetadata,
/// Page number, when present (whether numbering starts at 0 or 1 is not
/// established in this file).
pub page_number: Option<u32>,
}
/// Metadata for spreadsheet content; payload of the `XlsxPage` variant of
/// [`ExtendedMetadata`].
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct ExcelMetadata {
/// Shared fields, flattened into the same map as the fields below.
#[serde(flatten)]
pub common: CommonMetadata,
/// Page number, when present — presumably the sheet index; TODO confirm.
pub page_number: Option<u32>,
/// Page name, when present — presumably the worksheet name; TODO confirm.
pub page_name: Option<String>,
}
/// Metadata for e-mail messages; payload of the `Eml` variant
/// (`message/rfc822`) of [`ExtendedMetadata`].
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct EmailMetadata {
/// Shared fields, flattened into the same map as the fields below.
#[serde(flatten)]
pub common: CommonMetadata,
/// Sender, kept as a single string.
pub sent_from: Option<String>,
/// Recipient(s), kept as a single string.
pub sent_to: Option<String>,
/// Subject line.
pub subject: Option<String>,
}
/// Metadata for Outlook messages; payload of the `Msg` variant
/// (`application/vnd.ms-outlook`) of [`ExtendedMetadata`].
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct MsgMetadata {
/// Shared fields, flattened into the same map as the field below.
#[serde(flatten)]
pub common: CommonMetadata,
/// File name this item is attached to — presumably set for attachments
/// only; TODO confirm.
pub attached_to_filename: Option<String>,
}
/// Metadata for legacy Word documents; payload of the `WordDoc` variant
/// (`application/msword`) of [`ExtendedMetadata`].
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct WordDocMetadata {
/// Shared fields, flattened into the same map as the fields below.
#[serde(flatten)]
pub common: CommonMetadata,
/// Page number, when present.
pub page_number: Option<u32>,
/// Header/footer type, kept as a free-form string; the set of possible
/// values is not defined in this file.
pub header_footer_type: Option<String>,
}
/// Metadata for HTML content; payload of the `Html` variant (`text/html`) of
/// [`ExtendedMetadata`].
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct HtmlMetadata {
/// Shared fields, flattened into the same map as the fields below.
#[serde(flatten)]
pub common: CommonMetadata,
/// Link URLs — presumably index-aligned with `link_texts`; TODO confirm.
pub link_urls: Option<Vec<String>>,
/// Link texts — presumably index-aligned with `link_urls`; TODO confirm.
pub link_texts: Option<Vec<String>>,
}
/// Metadata for EPUB content; payload of the `Epub` variant
/// (`application/epub+zip`) of [`ExtendedMetadata`].
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct EpubMetadata {
/// Shared fields, flattened into the same map as the field below.
#[serde(flatten)]
pub common: CommonMetadata,
/// Section, kept as a free-form string.
pub section: Option<String>,
}
/// Format-specific metadata, selected by the value of the `filetype` key.
///
/// The enum is internally tagged: the `filetype` entry of the incoming map
/// picks the variant and the remaining entries are handed to the variant's
/// payload struct. As a consequence the flattened `CommonMetadata::filetype`
/// of the payload stays `None` after parsing (the tests in this file expect
/// `CommonMetadata::default()` for a payload that only carries `filetype`).
///
/// NOTE(review): on serialization the tag is written as `filetype` and the
/// flattened `CommonMetadata` also writes its own `filetype` field, so the
/// output appears to contain that key twice — confirm whether serialized
/// values are expected to round-trip.
#[derive(Debug, Serialize, Deserialize, PartialEq)]
#[serde(tag = "filetype")]
pub enum ExtendedMetadata {
/// PDF content (`application/pdf`).
#[serde(rename = "application/pdf")]
PdfPage(PagedDocument),
/// Word (OOXML) content.
#[serde(rename = "application/vnd.openxmlformats-officedocument.wordprocessingml.document")]
DocxPage(PagedDocument),
/// PowerPoint (OOXML) content.
#[serde(rename = "application/vnd.openxmlformats-officedocument.presentationml.presentation")]
PptPage(PagedDocument),
/// Spreadsheet (OOXML) content; the short tags `sheet` and `excel` are also
/// accepted when deserializing, while serialization uses the full name.
#[serde(
rename = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
alias = "sheet",
alias = "excel"
)]
XlsxPage(ExcelMetadata),
/// E-mail message (`message/rfc822`).
#[serde(rename = "message/rfc822")]
Eml(EmailMetadata),
/// Outlook message (`application/vnd.ms-outlook`).
#[serde(rename = "application/vnd.ms-outlook")]
Msg(MsgMetadata),
/// Legacy Word document (`application/msword`).
#[serde(rename = "application/msword")]
WordDoc(WordDocMetadata),
/// HTML content (`text/html`).
#[serde(rename = "text/html")]
Html(HtmlMetadata),
/// EPUB content (`application/epub+zip`).
#[serde(rename = "application/epub+zip")]
Epub(EpubMetadata),
}
/// Top-level metadata value: either a recognized format or the common fields
/// only.
///
/// The enum is untagged, so deserialization tries the variants in declaration
/// order: first `KnownFormat`, and only if that fails `UnknownFormat`. A
/// payload whose `filetype` matches none of the [`ExtendedMetadata`] tags
/// therefore ends up as `UnknownFormat` (see `test_unknown_element`).
#[derive(Serialize, Deserialize, Debug, PartialEq)]
#[serde(untagged)]
pub enum Metadata {
/// The `filetype` matched one of the tags of [`ExtendedMetadata`].
KnownFormat(ExtendedMetadata),
/// Fallback carrying only the shared fields; here `filetype` is kept as
/// an ordinary field of [`CommonMetadata`].
UnknownFormat(CommonMetadata),
}
impl Metadata {
    /// Consumes the metadata and returns only the fields shared by all formats.
    ///
    /// Format-specific fields (page numbers, e-mail headers, link lists, ...)
    /// are dropped.
    ///
    /// For a `KnownFormat` value the `filetype` key of the parsed payload is
    /// consumed as the enum tag, so the embedded `CommonMetadata::filetype` is
    /// normally `None`. To avoid losing the file type in the conversion, an
    /// empty `filetype` is filled with the canonical name of the variant (the
    /// same string the variant is serialized under). A `filetype` that is
    /// already set is left untouched. Because the canonical name is used, a
    /// spreadsheet parsed from one of the short aliases (`sheet`, `excel`)
    /// reports the full spreadsheet MIME type.
    ///
    /// `UnknownFormat` values are returned unchanged.
    pub fn into_common_metadata(self) -> CommonMetadata {
        // NOTE: these strings must stay in sync with the `#[serde(rename)]`
        // attributes on `ExtendedMetadata`; serde attributes only accept
        // literals, so the names cannot be shared through constants.
        let (mut common, canonical_filetype) = match self {
            Metadata::UnknownFormat(common) => return common,
            Metadata::KnownFormat(known) => match known {
                ExtendedMetadata::PdfPage(m) => (m.common, "application/pdf"),
                ExtendedMetadata::DocxPage(m) => (
                    m.common,
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                ),
                ExtendedMetadata::PptPage(m) => (
                    m.common,
                    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                ),
                ExtendedMetadata::XlsxPage(m) => (
                    m.common,
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                ),
                ExtendedMetadata::Eml(m) => (m.common, "message/rfc822"),
                ExtendedMetadata::Msg(m) => (m.common, "application/vnd.ms-outlook"),
                ExtendedMetadata::WordDoc(m) => (m.common, "application/msword"),
                ExtendedMetadata::Html(m) => (m.common, "text/html"),
                ExtendedMetadata::Epub(m) => (m.common, "application/epub+zip"),
            },
        };

        // Only fill the gap left by the tag; never overwrite an explicit value.
        if common.filetype.is_none() {
            common.filetype = Some(canonical_filetype.to_owned());
        }
        common
    }
}
impl From<Metadata> for CommonMetadata {
fn from(value: Metadata) -> Self {
value.into_common_metadata()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::Result;

    /// Parses `json` as [`Metadata`] and returns the known-format payload.
    ///
    /// Panics with the offending input when parsing fails or when the value
    /// falls back to `UnknownFormat`, so a failing test shows what was parsed.
    fn parse_known(json: &str) -> ExtendedMetadata {
        let metadata: Metadata = serde_json::from_str(json)
            .unwrap_or_else(|err| panic!("failed to parse {}: {}", json, err));
        match metadata {
            Metadata::KnownFormat(known) => known,
            Metadata::UnknownFormat(common) => {
                panic!("Other Metadata for {}: {:?}", json, common)
            }
        }
    }

    /// Checks that a payload carrying only `filetype = mime_type` parses to
    /// exactly `expected_format`.
    fn test_metadata_for_mime_type(
        mime_type: &str,
        expected_format: ExtendedMetadata,
    ) -> Result<()> {
        let json_str = format!(r#"{{ "filetype": "{}" }}"#, mime_type);
        assert_eq!(
            parse_known(&json_str),
            expected_format,
            "wrong payload for filetype {}",
            mime_type
        );
        Ok(())
    }

    /// Every canonical tag selects its variant and leaves all fields empty.
    #[test]
    fn test_all_known_formats() -> Result<()> {
        let known_formats = [
            (
                "application/pdf",
                ExtendedMetadata::PdfPage(PagedDocument {
                    common: CommonMetadata::default(),
                    page_number: None,
                }),
            ),
            (
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                ExtendedMetadata::DocxPage(PagedDocument {
                    common: CommonMetadata::default(),
                    page_number: None,
                }),
            ),
            (
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                ExtendedMetadata::PptPage(PagedDocument {
                    common: CommonMetadata::default(),
                    page_number: None,
                }),
            ),
            (
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                ExtendedMetadata::XlsxPage(ExcelMetadata {
                    common: CommonMetadata::default(),
                    page_number: None,
                    page_name: None,
                }),
            ),
            (
                "message/rfc822",
                ExtendedMetadata::Eml(EmailMetadata {
                    common: CommonMetadata::default(),
                    sent_from: None,
                    sent_to: None,
                    subject: None,
                }),
            ),
            (
                "application/vnd.ms-outlook",
                ExtendedMetadata::Msg(MsgMetadata {
                    common: CommonMetadata::default(),
                    attached_to_filename: None,
                }),
            ),
            (
                "application/msword",
                ExtendedMetadata::WordDoc(WordDocMetadata {
                    common: CommonMetadata::default(),
                    page_number: None,
                    header_footer_type: None,
                }),
            ),
            (
                "text/html",
                ExtendedMetadata::Html(HtmlMetadata {
                    common: CommonMetadata::default(),
                    link_urls: None,
                    link_texts: None,
                }),
            ),
            (
                "application/epub+zip",
                ExtendedMetadata::Epub(EpubMetadata {
                    common: CommonMetadata::default(),
                    section: None,
                }),
            ),
        ];
        for (mime_type, expected_format) in known_formats {
            test_metadata_for_mime_type(mime_type, expected_format)?;
        }
        Ok(())
    }

    /// The short spreadsheet tags declared via `alias` select `XlsxPage` too.
    #[test]
    fn test_xlsx_aliases() -> Result<()> {
        for alias in ["sheet", "excel"] {
            test_metadata_for_mime_type(
                alias,
                ExtendedMetadata::XlsxPage(ExcelMetadata {
                    common: CommonMetadata::default(),
                    page_number: None,
                    page_name: None,
                }),
            )?;
        }
        Ok(())
    }

    /// A fully populated PDF payload parses as `PdfPage` and keeps its values.
    #[test]
    fn test_pdf_element() -> Result<()> {
        let json_str = r#"
        {
            "filetype": "application/pdf",
            "filename": "example.pdf",
            "file_directory": "/documents",
            "last_modified": "2023-10-01",
            "coordinates": "100,100,200,200",
            "parent_id": "1",
            "category_depth": 2,
            "text_as_html": "<p>Example</p>",
            "languages": ["en", "fr"],
            "emphasized_text_contents": "important",
            "emphasized_text_tags": "<b>",
            "is_continuation": false,
            "detection_class_prob": [0.1, 0.9],
            "page_number": 1
        }
        "#;
        match parse_known(json_str) {
            ExtendedMetadata::PdfPage(page) => {
                // Format-specific field.
                assert_eq!(page.page_number, Some(1));
                // A sample of the flattened common fields, one per value type.
                assert_eq!(page.common.filename.as_deref(), Some("example.pdf"));
                assert_eq!(page.common.file_directory.as_deref(), Some("/documents"));
                assert_eq!(page.common.category_depth, Some(2));
                assert_eq!(
                    page.common.languages,
                    Some(vec!["en".to_owned(), "fr".to_owned()])
                );
                assert_eq!(page.common.is_continuation, Some(false));
            }
            other => panic!("Format is not PDF: {:?}", other),
        }
        Ok(())
    }

    /// An unrecognized `filetype` falls back to `UnknownFormat` and keeps the
    /// common fields, including the raw `filetype` string.
    #[test]
    fn test_unknown_element() -> Result<()> {
        let json_str = r#"
        {
            "filetype": "asdfasdfasdf",
            "filename": "example.pdf",
            "file_directory": "/documents",
            "last_modified": "2023-10-01"
        }
        "#;
        let metadata: Metadata = serde_json::from_str(json_str)
            .expect("a payload with an unknown filetype must still parse");
        match metadata {
            Metadata::UnknownFormat(common) => {
                assert_eq!(common.filetype.as_deref(), Some("asdfasdfasdf"));
                assert_eq!(common.filename.as_deref(), Some("example.pdf"));
                assert_eq!(common.file_directory.as_deref(), Some("/documents"));
                assert_eq!(common.last_modified.as_deref(), Some("2023-10-01"));
            }
            Metadata::KnownFormat(known) => panic!("Wrong format: {:?}", known),
        }
        Ok(())
    }

    /// Without a `filetype` key there is no tag to match, so the payload is
    /// treated as `UnknownFormat`.
    #[test]
    fn test_missing_filetype_is_unknown() -> Result<()> {
        let metadata: Metadata = serde_json::from_str(r#"{ "filename": "notes.txt" }"#)
            .expect("a payload without filetype must still parse");
        match metadata {
            Metadata::UnknownFormat(common) => {
                assert_eq!(common.filetype, None);
                assert_eq!(common.filename.as_deref(), Some("notes.txt"));
            }
            Metadata::KnownFormat(known) => panic!("Wrong format: {:?}", known),
        }
        Ok(())
    }
}