use crate::error::{KreuzbergError, Result};
use roxmltree::Node;
use std::io::Read;
use zip::ZipArchive;
/// Application-level properties read from `docProps/app.xml` of a DOCX archive.
///
/// Produced by `extract_docx_app_properties`. Each field holds the value of the
/// XML element named in its comment, as returned by the `super::parse_xml_*`
/// helpers (presumably `None` when the element is absent or cannot be parsed;
/// confirm against those helpers). When the archive has no `docProps/app.xml`
/// entry, the derived `Default` value (every field `None`) is returned instead.
#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct DocxAppProperties {
/// Text of the `Application` element.
pub application: Option<String>,
/// Text of the `AppVersion` element, kept as a string (e.g. `"16.0000"`).
pub app_version: Option<String>,
/// Text of the `Template` element.
pub template: Option<String>,
/// Integer value of the `TotalTime` element.
pub total_time: Option<i32>,
/// Integer value of the `Pages` element.
pub pages: Option<i32>,
/// Integer value of the `Words` element.
pub words: Option<i32>,
/// Integer value of the `Characters` element.
pub characters: Option<i32>,
/// Integer value of the `CharactersWithSpaces` element.
pub characters_with_spaces: Option<i32>,
/// Integer value of the `Lines` element.
pub lines: Option<i32>,
/// Integer value of the `Paragraphs` element.
pub paragraphs: Option<i32>,
/// Text of the `Company` element.
pub company: Option<String>,
/// Integer value of the `DocSecurity` element.
pub doc_security: Option<i32>,
/// Boolean value of the `ScaleCrop` element.
pub scale_crop: Option<bool>,
/// Boolean value of the `LinksUpToDate` element.
pub links_up_to_date: Option<bool>,
/// Boolean value of the `SharedDoc` element.
pub shared_doc: Option<bool>,
/// Boolean value of the `HyperlinksChanged` element.
pub hyperlinks_changed: Option<bool>,
}
/// Application-level properties read from `docProps/app.xml` of an XLSX archive.
///
/// Produced by `extract_xlsx_app_properties`. Each optional field holds the value
/// of the XML element named in its comment, as returned by the
/// `super::parse_xml_*` helpers (presumably `None` when the element is absent or
/// cannot be parsed; confirm against those helpers). When the archive has no
/// `docProps/app.xml` entry, the derived `Default` value is returned instead.
#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct XlsxAppProperties {
/// Text of the `Application` element.
pub application: Option<String>,
/// Text of the `AppVersion` element, kept as a string (e.g. `"16.0300"`).
pub app_version: Option<String>,
/// Integer value of the `DocSecurity` element.
pub doc_security: Option<i32>,
/// Boolean value of the `ScaleCrop` element.
pub scale_crop: Option<bool>,
/// Boolean value of the `LinksUpToDate` element.
pub links_up_to_date: Option<bool>,
/// Boolean value of the `SharedDoc` element.
pub shared_doc: Option<bool>,
/// Boolean value of the `HyperlinksChanged` element.
pub hyperlinks_changed: Option<bool>,
/// Text of the `Company` element.
pub company: Option<String>,
/// Trimmed, non-empty `lpstr` entries found under `TitlesOfParts`, in document
/// order; empty when that element is missing.
// NOTE(review): every `lpstr` entry is kept, so this may hold part titles that
// are not worksheets if the producing application lists any - confirm.
pub worksheet_names: Vec<String>,
}
/// Application-level properties read from `docProps/app.xml` of a PPTX archive.
///
/// Produced by `extract_pptx_app_properties`. Each optional field holds the value
/// of the XML element named in its comment, as returned by the
/// `super::parse_xml_*` helpers (presumably `None` when the element is absent or
/// cannot be parsed; confirm against those helpers). When the archive has no
/// `docProps/app.xml` entry, the derived `Default` value is returned instead.
#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct PptxAppProperties {
/// Text of the `Application` element.
pub application: Option<String>,
/// Text of the `AppVersion` element, kept as a string (e.g. `"16.0000"`).
pub app_version: Option<String>,
/// Integer value of the `TotalTime` element.
pub total_time: Option<i32>,
/// Text of the `Company` element.
pub company: Option<String>,
/// Integer value of the `DocSecurity` element.
pub doc_security: Option<i32>,
/// Boolean value of the `ScaleCrop` element.
pub scale_crop: Option<bool>,
/// Boolean value of the `LinksUpToDate` element.
pub links_up_to_date: Option<bool>,
/// Boolean value of the `SharedDoc` element.
pub shared_doc: Option<bool>,
/// Boolean value of the `HyperlinksChanged` element.
pub hyperlinks_changed: Option<bool>,
/// Integer value of the `Slides` element.
pub slides: Option<i32>,
/// Integer value of the `Notes` element.
pub notes: Option<i32>,
/// Integer value of the `HiddenSlides` element.
pub hidden_slides: Option<i32>,
/// Integer value of the `MMClips` element.
pub multimedia_clips: Option<i32>,
/// Text of the `PresentationFormat` element (e.g. `"Widescreen"`).
pub presentation_format: Option<String>,
/// Trimmed, non-empty `lpstr` entries found under `TitlesOfParts`, in document
/// order; empty when that element is missing.
// NOTE(review): every `lpstr` entry is kept, so this may hold part titles that
// are not slide titles if the producing application lists any - confirm.
pub slide_titles: Vec<String>,
}
/// Reads `docProps/app.xml` from a DOCX archive and maps it onto
/// [`DocxAppProperties`].
///
/// If the archive has no such entry (the zip helper yields `None`), the
/// default, all-`None` value is returned rather than an error.
///
/// # Errors
///
/// Propagates any error from `super::read_zip_entry_to_string`, and returns a
/// parsing error when the entry is not well-formed XML.
pub fn extract_docx_app_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<R>) -> Result<DocxAppProperties> {
    // A missing entry is not a failure: fall back to empty metadata.
    let Some(app_xml) = super::read_zip_entry_to_string(archive, "docProps/app.xml", "app.xml")? else {
        return Ok(DocxAppProperties::default());
    };

    let parsed = roxmltree::Document::parse(&app_xml)
        .map_err(|err| KreuzbergError::parsing(format!("Failed to parse app.xml: {}", err)))?;
    let props = parsed.root_element();

    let extracted = DocxAppProperties {
        // Producing application and template.
        application: super::parse_xml_text(props, "Application"),
        app_version: super::parse_xml_text(props, "AppVersion"),
        template: super::parse_xml_text(props, "Template"),
        // Document statistics.
        total_time: super::parse_xml_int(props, "TotalTime"),
        pages: super::parse_xml_int(props, "Pages"),
        words: super::parse_xml_int(props, "Words"),
        characters: super::parse_xml_int(props, "Characters"),
        characters_with_spaces: super::parse_xml_int(props, "CharactersWithSpaces"),
        lines: super::parse_xml_int(props, "Lines"),
        paragraphs: super::parse_xml_int(props, "Paragraphs"),
        // Organisation and document flags.
        company: super::parse_xml_text(props, "Company"),
        doc_security: super::parse_xml_int(props, "DocSecurity"),
        scale_crop: super::parse_xml_bool(props, "ScaleCrop"),
        links_up_to_date: super::parse_xml_bool(props, "LinksUpToDate"),
        shared_doc: super::parse_xml_bool(props, "SharedDoc"),
        hyperlinks_changed: super::parse_xml_bool(props, "HyperlinksChanged"),
    };

    Ok(extracted)
}
/// Reads `docProps/app.xml` from an XLSX archive and maps it onto
/// [`XlsxAppProperties`].
///
/// If the archive has no such entry (the zip helper yields `None`), the
/// default value is returned rather than an error. `worksheet_names` is filled
/// from the `TitlesOfParts` element via `extract_titles_of_parts`.
///
/// # Errors
///
/// Propagates any error from `super::read_zip_entry_to_string`, and returns a
/// parsing error when the entry is not well-formed XML.
pub fn extract_xlsx_app_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<R>) -> Result<XlsxAppProperties> {
    // A missing entry is not a failure: fall back to empty metadata.
    let Some(app_xml) = super::read_zip_entry_to_string(archive, "docProps/app.xml", "app.xml")? else {
        return Ok(XlsxAppProperties::default());
    };

    let parsed = roxmltree::Document::parse(&app_xml)
        .map_err(|err| KreuzbergError::parsing(format!("Failed to parse app.xml: {}", err)))?;
    let props = parsed.root_element();

    // Gathered first, as the list needs its own traversal of the tree.
    let sheet_titles = extract_titles_of_parts(props);

    let extracted = XlsxAppProperties {
        application: super::parse_xml_text(props, "Application"),
        app_version: super::parse_xml_text(props, "AppVersion"),
        doc_security: super::parse_xml_int(props, "DocSecurity"),
        scale_crop: super::parse_xml_bool(props, "ScaleCrop"),
        links_up_to_date: super::parse_xml_bool(props, "LinksUpToDate"),
        shared_doc: super::parse_xml_bool(props, "SharedDoc"),
        hyperlinks_changed: super::parse_xml_bool(props, "HyperlinksChanged"),
        company: super::parse_xml_text(props, "Company"),
        worksheet_names: sheet_titles,
    };

    Ok(extracted)
}
/// Reads `docProps/app.xml` from a PPTX archive and maps it onto
/// [`PptxAppProperties`].
///
/// If the archive has no such entry (the zip helper yields `None`), the
/// default value is returned rather than an error. `slide_titles` is filled
/// from the `TitlesOfParts` element via `extract_titles_of_parts`.
///
/// # Errors
///
/// Propagates any error from `super::read_zip_entry_to_string`, and returns a
/// parsing error when the entry is not well-formed XML.
pub fn extract_pptx_app_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<R>) -> Result<PptxAppProperties> {
    // A missing entry is not a failure: fall back to empty metadata.
    let Some(app_xml) = super::read_zip_entry_to_string(archive, "docProps/app.xml", "app.xml")? else {
        return Ok(PptxAppProperties::default());
    };

    let parsed = roxmltree::Document::parse(&app_xml)
        .map_err(|err| KreuzbergError::parsing(format!("Failed to parse app.xml: {}", err)))?;
    let props = parsed.root_element();

    // Looked up ahead of the struct literal, titles first and format second.
    let titles = extract_titles_of_parts(props);
    let format = super::parse_xml_text(props, "PresentationFormat");

    let extracted = PptxAppProperties {
        application: super::parse_xml_text(props, "Application"),
        app_version: super::parse_xml_text(props, "AppVersion"),
        total_time: super::parse_xml_int(props, "TotalTime"),
        company: super::parse_xml_text(props, "Company"),
        doc_security: super::parse_xml_int(props, "DocSecurity"),
        scale_crop: super::parse_xml_bool(props, "ScaleCrop"),
        links_up_to_date: super::parse_xml_bool(props, "LinksUpToDate"),
        shared_doc: super::parse_xml_bool(props, "SharedDoc"),
        hyperlinks_changed: super::parse_xml_bool(props, "HyperlinksChanged"),
        slides: super::parse_xml_int(props, "Slides"),
        notes: super::parse_xml_int(props, "Notes"),
        hidden_slides: super::parse_xml_int(props, "HiddenSlides"),
        multimedia_clips: super::parse_xml_int(props, "MMClips"),
        presentation_format: format,
        slide_titles: titles,
    };

    Ok(extracted)
}
fn extract_titles_of_parts(root: Node) -> Vec<String> {
let mut titles = Vec::new();
if let Some(titles_node) = root.descendants().find(|n| n.has_tag_name("TitlesOfParts"))
&& let Some(vector_node) = titles_node.descendants().find(|n| n.has_tag_name("vector"))
{
for lpstr_node in vector_node.descendants().filter(|n| n.has_tag_name("lpstr")) {
if let Some(text) = lpstr_node.text() {
let text = text.trim();
if !text.is_empty() {
titles.push(text.to_string());
}
}
}
}
titles
}
/// Unit tests for the `docProps/app.xml` extractors.
///
/// All fixtures are built in memory; nothing touches the filesystem.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Cursor, Write};

    /// Builds an in-memory ZIP archive whose only entry is `docProps/app.xml`
    /// with the given contents, stored uncompressed.
    fn create_test_zip_with_app_xml(app_xml: &str) -> ZipArchive<Cursor<Vec<u8>>> {
        let buffer = Vec::new();
        let cursor = Cursor::new(buffer);
        let mut zip = zip::ZipWriter::new(cursor);
        let options = zip::write::FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);
        zip.start_file("docProps/app.xml", options).unwrap();
        zip.write_all(app_xml.as_bytes()).unwrap();
        let cursor = zip.finish().unwrap();
        ZipArchive::new(cursor).unwrap()
    }

    /// Builds an in-memory ZIP archive that contains no entries at all.
    fn create_empty_zip() -> ZipArchive<Cursor<Vec<u8>>> {
        let zip = zip::ZipWriter::new(Cursor::new(Vec::new()));
        let cursor = zip.finish().unwrap();
        ZipArchive::new(cursor).unwrap()
    }

    #[test]
    fn test_extract_docx_app_properties() {
        let app_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">
<Application>Microsoft Office Word</Application>
<AppVersion>16.0000</AppVersion>
<TotalTime>120</TotalTime>
<Pages>5</Pages>
<Words>1000</Words>
<Characters>5500</Characters>
<CharactersWithSpaces>6500</CharactersWithSpaces>
<Lines>100</Lines>
<Paragraphs>50</Paragraphs>
<Company>Acme Corp</Company>
<DocSecurity>0</DocSecurity>
<ScaleCrop>false</ScaleCrop>
</Properties>"#;
        let mut archive = create_test_zip_with_app_xml(app_xml);
        let props = extract_docx_app_properties(&mut archive).unwrap();
        assert_eq!(props.application, Some("Microsoft Office Word".to_string()));
        assert_eq!(props.app_version, Some("16.0000".to_string()));
        assert_eq!(props.total_time, Some(120));
        assert_eq!(props.pages, Some(5));
        assert_eq!(props.words, Some(1000));
        assert_eq!(props.characters, Some(5500));
        assert_eq!(props.characters_with_spaces, Some(6500));
        assert_eq!(props.lines, Some(100));
        assert_eq!(props.paragraphs, Some(50));
        assert_eq!(props.company, Some("Acme Corp".to_string()));
        assert_eq!(props.doc_security, Some(0));
        assert_eq!(props.scale_crop, Some(false));
    }

    #[test]
    fn test_extract_xlsx_app_properties_with_worksheets() {
        let app_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"
xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes">
<Application>Microsoft Excel</Application>
<AppVersion>16.0300</AppVersion>
<Company>Test Company</Company>
<TitlesOfParts>
<vt:vector size="3" baseType="lpstr">
<vt:lpstr>Sheet1</vt:lpstr>
<vt:lpstr>Sheet2</vt:lpstr>
<vt:lpstr>Sheet3</vt:lpstr>
</vt:vector>
</TitlesOfParts>
</Properties>"#;
        let mut archive = create_test_zip_with_app_xml(app_xml);
        let props = extract_xlsx_app_properties(&mut archive).unwrap();
        assert_eq!(props.application, Some("Microsoft Excel".to_string()));
        assert_eq!(props.app_version, Some("16.0300".to_string()));
        assert_eq!(props.company, Some("Test Company".to_string()));
        assert_eq!(props.worksheet_names, vec!["Sheet1", "Sheet2", "Sheet3"]);
    }

    #[test]
    fn test_extract_pptx_app_properties() {
        let app_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"
xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes">
<Application>Microsoft Office PowerPoint</Application>
<AppVersion>16.0000</AppVersion>
<TotalTime>45</TotalTime>
<Slides>10</Slides>
<Notes>5</Notes>
<HiddenSlides>2</HiddenSlides>
<MMClips>3</MMClips>
<PresentationFormat>Widescreen</PresentationFormat>
<TitlesOfParts>
<vt:vector size="2" baseType="lpstr">
<vt:lpstr>Title Slide</vt:lpstr>
<vt:lpstr>Agenda</vt:lpstr>
</vt:vector>
</TitlesOfParts>
</Properties>"#;
        let mut archive = create_test_zip_with_app_xml(app_xml);
        let props = extract_pptx_app_properties(&mut archive).unwrap();
        assert_eq!(props.application, Some("Microsoft Office PowerPoint".to_string()));
        assert_eq!(props.slides, Some(10));
        assert_eq!(props.notes, Some(5));
        assert_eq!(props.hidden_slides, Some(2));
        assert_eq!(props.multimedia_clips, Some(3));
        assert_eq!(props.presentation_format, Some("Widescreen".to_string()));
        assert_eq!(props.slide_titles, vec!["Title Slide", "Agenda"]);
    }

    /// An archive without `docProps/app.xml` yields default properties for
    /// every format instead of an error.
    #[test]
    fn test_extract_app_properties_missing_file() {
        let mut archive = create_empty_zip();
        let docx = extract_docx_app_properties(&mut archive).unwrap();
        assert_eq!(docx, DocxAppProperties::default());

        let mut archive = create_empty_zip();
        let xlsx = extract_xlsx_app_properties(&mut archive).unwrap();
        assert_eq!(xlsx, XlsxAppProperties::default());

        // The PPTX extractor shares the same fallback and was previously unchecked.
        let mut archive = create_empty_zip();
        let pptx = extract_pptx_app_properties(&mut archive).unwrap();
        assert_eq!(pptx, PptxAppProperties::default());
    }

    /// An `app.xml` that is not well-formed XML is reported as an error by
    /// every extractor rather than silently producing defaults.
    #[test]
    fn test_extract_app_properties_malformed_xml() {
        // The closing tag does not match the open `Application` element.
        let broken = "<Properties><Application>Word</Properties>";

        let mut archive = create_test_zip_with_app_xml(broken);
        assert!(extract_docx_app_properties(&mut archive).is_err());

        let mut archive = create_test_zip_with_app_xml(broken);
        assert!(extract_xlsx_app_properties(&mut archive).is_err());

        let mut archive = create_test_zip_with_app_xml(broken);
        assert!(extract_pptx_app_properties(&mut archive).is_err());
    }

    #[test]
    fn test_extract_titles_of_parts_empty() {
        let xml = r#"<Properties xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes">
<TitlesOfParts><vt:vector size="0" baseType="lpstr"></vt:vector></TitlesOfParts>
</Properties>"#;
        let doc = roxmltree::Document::parse(xml).unwrap();
        let titles = extract_titles_of_parts(doc.root_element());
        assert_eq!(titles, Vec::<String>::new());
    }
}