use crate::common::Metadata;
use crate::ooxml::error::{OoxmlError, Result};
use crate::ooxml::opc::constants::content_type as ct;
use crate::ooxml::opc::{OpcPackage, PackURI};
use chrono::{DateTime, Utc};
use quick_xml::events::Event;
use quick_xml::Reader;
use std::io::BufRead;
/// Extracts document metadata from the core-properties part of an OPC package.
///
/// The part is located with `find_core_properties_part`, its blob is decoded
/// as UTF-8, and the resulting XML is handed to `parse_core_properties_xml`.
///
/// # Errors
///
/// * Propagates the error from `find_core_properties_part` when no
///   core-properties part can be located.
/// * Returns `OoxmlError::Xml` when the part's blob is not valid UTF-8.
/// * Propagates any error raised while parsing the XML.
pub fn extract_metadata(package: &OpcPackage) -> Result<Metadata> {
    let part = find_core_properties_part(package)?;

    match std::str::from_utf8(part.blob()) {
        Ok(xml) => parse_core_properties_xml(xml),
        Err(e) => Err(OoxmlError::Xml(format!(
            "Invalid UTF-8 in core properties: {}",
            e
        ))),
    }
}
/// Locates the core-properties part inside `package`.
///
/// Lookup happens in two stages:
/// 1. the conventional location `/docProps/core.xml`, accepted only when the
///    part found there carries the core-properties content type;
/// 2. otherwise, the first part (in `iter_parts` order) whose content type is
///    the core-properties content type.
///
/// # Errors
///
/// * `OoxmlError::Other` when the conventional URI cannot be constructed.
/// * `OoxmlError::PartNotFound` when neither stage yields a part.
fn find_core_properties_part(package: &OpcPackage) -> Result<&dyn crate::ooxml::opc::part::Part> {
    let conventional_uri = PackURI::new("/docProps/core.xml")
        .map_err(|e| OoxmlError::Other(format!("Invalid core properties URI: {}", e)))?;

    // Stage 1: the conventional location. A lookup failure or a part with a
    // different content type simply falls through to the scan below.
    match package.get_part(&conventional_uri) {
        Ok(candidate) if candidate.content_type() == ct::OPC_CORE_PROPERTIES => {
            return Ok(candidate);
        }
        _ => {}
    }

    // Stage 2: scan every part for the core-properties content type.
    for candidate in package.iter_parts() {
        if candidate.content_type() != ct::OPC_CORE_PROPERTIES {
            continue;
        }
        return Ok(candidate);
    }

    Err(OoxmlError::PartNotFound(
        "Core properties part not found".to_string(),
    ))
}
/// Parses a core-properties XML document into a `Metadata` value.
///
/// Elements are recognised by their literal prefixed tag name (for example
/// `dc:title`, `cp:keywords`, `dcterms:created`). For every recognised
/// element the text content is read with `read_text_element` and stored in
/// the matching `Metadata` field:
///
/// * elements without text leave the field untouched;
/// * `cp:revision` is stored only when it parses as a `u32`, and is stored in
///   its re-formatted decimal form;
/// * timestamp elements are stored only when `parse_datetime` accepts them.
///
/// Unrecognised elements are skipped.
///
/// # Errors
///
/// Returns `OoxmlError::Xml` when the XML reader reports an error, and
/// propagates errors from `read_text_element`.
fn parse_core_properties_xml(xml: &str) -> Result<Metadata> {
    /// The `Metadata` field a recognised element populates.
    enum Field {
        Title,
        Subject,
        Author,
        Keywords,
        Description,
        LastModifiedBy,
        Revision,
        Category,
        ContentStatus,
        Created,
        Modified,
        LastPrinted,
    }

    let mut reader = Reader::from_str(xml);
    reader.config_mut().trim_text(true);

    let mut metadata = Metadata::default();
    let mut buf = Vec::new();

    loop {
        // Step 1: classify the next event. Only start tags of known elements
        // map to a field; everything else is skipped.
        let target = match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => match e.name().as_ref() {
                b"dc:title" | b"cp:title" => Some(Field::Title),
                b"dc:subject" | b"cp:subject" => Some(Field::Subject),
                b"dc:creator" | b"cp:creator" | b"dc:author" | b"cp:author" => {
                    Some(Field::Author)
                }
                b"cp:keywords" => Some(Field::Keywords),
                b"dc:description" | b"cp:description" | b"cp:comment" => {
                    Some(Field::Description)
                }
                b"cp:lastModifiedBy" => Some(Field::LastModifiedBy),
                b"cp:revision" => Some(Field::Revision),
                b"cp:category" => Some(Field::Category),
                b"cp:contentStatus" => Some(Field::ContentStatus),
                b"dcterms:created" | b"cp:created" => Some(Field::Created),
                b"dcterms:modified" | b"cp:modified" => Some(Field::Modified),
                b"cp:lastPrinted" => Some(Field::LastPrinted),
                _ => None,
            },
            Ok(Event::Eof) => break,
            Err(e) => return Err(OoxmlError::Xml(format!("XML parsing error: {}", e))),
            _ => None,
        };

        // Step 2: for a known element, read its text and store it.
        if let Some(field) = target {
            if let Some(value) = read_text_element(&mut reader, &mut buf)? {
                match field {
                    Field::Title => metadata.title = Some(value),
                    Field::Subject => metadata.subject = Some(value),
                    Field::Author => metadata.author = Some(value),
                    Field::Keywords => metadata.keywords = Some(value),
                    Field::Description => metadata.description = Some(value),
                    Field::LastModifiedBy => metadata.last_modified_by = Some(value),
                    Field::Revision => {
                        // Non-numeric revisions are dropped.
                        if let Ok(number) = value.parse::<u32>() {
                            metadata.revision = Some(number.to_string());
                        }
                    }
                    Field::Category => metadata.category = Some(value),
                    Field::ContentStatus => metadata.content_status = Some(value),
                    Field::Created => {
                        if let Ok(stamp) = parse_datetime(&value) {
                            metadata.created = Some(stamp);
                        }
                    }
                    Field::Modified => {
                        if let Ok(stamp) = parse_datetime(&value) {
                            metadata.modified = Some(stamp);
                        }
                    }
                    Field::LastPrinted => {
                        if let Ok(stamp) = parse_datetime(&value) {
                            metadata.last_printed_time = Some(stamp);
                        }
                    }
                }
            }
        }

        buf.clear();
    }

    Ok(metadata)
}
/// Reads the text content of the element whose start tag was just consumed.
///
/// Events are read until the end tag matching that start tag (or end of
/// input) is reached. Text found inside nested child elements is appended as
/// well, so a child's end tag no longer terminates the read prematurely.
///
/// `buf` is scratch space for the reader; it is cleared before every event so
/// it does not grow with the number of events processed.
///
/// Returns `Ok(None)` when the collected text is empty or whitespace-only,
/// otherwise `Ok(Some(text))` with the text exactly as collected.
///
/// # Errors
///
/// Returns `OoxmlError::Xml` when the reader reports an error or when a text
/// event is not valid UTF-8.
fn read_text_element<B: BufRead>(reader: &mut Reader<B>, buf: &mut Vec<u8>) -> Result<Option<String>> {
    let mut text = String::new();
    // Number of child elements currently open below the element being read.
    let mut depth: usize = 0;

    loop {
        // Earlier events are no longer referenced; drop their bytes so the
        // buffer stays small.
        buf.clear();

        match reader.read_event_into(buf) {
            Ok(Event::Text(e)) => {
                // NOTE(review): the raw event bytes are used as-is, so entity
                // references (e.g. `&amp;`) are not decoded here. Confirm the
                // right decoding call for the quick-xml version in use.
                let text_content = std::str::from_utf8(e.as_ref())
                    .map_err(|e| OoxmlError::Xml(format!("Invalid UTF-8 in text content: {}", e)))?;
                text.push_str(text_content);
            }
            Ok(Event::Start(_)) => depth += 1,
            Ok(Event::End(_)) => {
                // Only the end tag that balances the caller's start tag ends
                // the read; inner end tags just close a child.
                if depth == 0 {
                    break;
                }
                depth -= 1;
            }
            Ok(Event::Eof) => break,
            Err(e) => return Err(OoxmlError::Xml(format!("XML parsing error: {}", e))),
            _ => {}
        }
    }

    if text.trim().is_empty() {
        Ok(None)
    } else {
        Ok(Some(text))
    }
}
/// Parses a core-properties timestamp into a UTC `DateTime`.
///
/// Surrounding whitespace is ignored. The following shapes are tried in
/// order, and the first that matches wins:
///
/// 1. RFC 3339 with any offset (converted to UTC);
/// 2. `YYYY-MM-DDTHH:MM:SS[.fraction]Z`;
/// 3. `YYYY-MM-DDTHH:MM:SS[.fraction]` with no zone designator, interpreted
///    as UTC;
/// 4. `YYYY-MM-DD` (W3CDTF date-only form), interpreted as midnight UTC.
///
/// # Errors
///
/// Returns `OoxmlError::InvalidFormat` (quoting the original input) when no
/// shape matches.
fn parse_datetime(s: &str) -> Result<DateTime<Utc>> {
    // Zone-less `strftime` patterns; every match is interpreted as UTC.
    const NAIVE_FORMATS: [&str; 4] = [
        "%Y-%m-%dT%H:%M:%S%.fZ",
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S%.f",
    ];

    let candidate = s.trim();

    if let Ok(dt) = DateTime::parse_from_rfc3339(candidate) {
        return Ok(dt.with_timezone(&Utc));
    }

    for format in NAIVE_FORMATS {
        if let Ok(naive) = chrono::NaiveDateTime::parse_from_str(candidate, format) {
            return Ok(DateTime::from_naive_utc_and_offset(naive, Utc));
        }
    }

    // W3CDTF also permits a bare calendar date; anchor it at 00:00:00 UTC.
    let date_only = chrono::NaiveDate::parse_from_str(candidate, "%Y-%m-%d")
        .ok()
        .and_then(|date| date.and_hms_opt(0, 0, 0));
    if let Some(midnight) = date_only {
        return Ok(DateTime::from_naive_utc_and_offset(midnight, Utc));
    }

    Err(OoxmlError::InvalidFormat(format!("Invalid datetime format: {}", s)))
}
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Datelike;

    /// Placeholder for a package-level test; the body is empty and the test
    /// is ignored.
    #[test]
    #[ignore]
    fn test_extract_metadata() {}

    /// `parse_datetime` accepts the `Z`-suffixed, fractional-second and
    /// zone-less timestamp forms.
    #[test]
    fn test_parse_datetime() {
        let with_zone = parse_datetime("2023-10-10T14:30:00Z").unwrap();
        assert_eq!(
            (with_zone.year(), with_zone.month(), with_zone.day()),
            (2023, 10, 10)
        );

        let fractional = parse_datetime("2023-10-10T14:30:00.123456Z").unwrap();
        assert_eq!(fractional.year(), 2023);

        let zoneless = parse_datetime("2023-10-10T14:30:00").unwrap();
        assert_eq!(zoneless.year(), 2023);
    }

    /// A complete core-properties document populates every expected field.
    #[test]
    fn test_parse_core_properties_xml() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:dcmitype="http://purl.org/dc/dcmitype/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<dc:title>Test Document</dc:title>
<dc:subject>Test Subject</dc:subject>
<dc:creator>Test Author</dc:creator>
<cp:keywords>test, document</cp:keywords>
<dc:description>Test Description</dc:description>
<cp:lastModifiedBy>Test Modifier</cp:lastModifiedBy>
<cp:revision>5</cp:revision>
<cp:category>Test Category</cp:category>
<dcterms:created>2023-10-10T14:30:00Z</dcterms:created>
<dcterms:modified>2023-10-10T15:30:00Z</dcterms:modified>
</cp:coreProperties>"#;

        let parsed = parse_core_properties_xml(xml).unwrap();

        assert_eq!(parsed.title.as_deref(), Some("Test Document"));
        assert_eq!(parsed.subject.as_deref(), Some("Test Subject"));
        assert_eq!(parsed.author.as_deref(), Some("Test Author"));
        assert_eq!(parsed.keywords.as_deref(), Some("test, document"));
        assert_eq!(parsed.description.as_deref(), Some("Test Description"));
        assert_eq!(parsed.last_modified_by.as_deref(), Some("Test Modifier"));
        assert_eq!(parsed.revision.as_deref(), Some("5"));
        assert_eq!(parsed.category.as_deref(), Some("Test Category"));
        assert!(parsed.created.is_some());
        assert!(parsed.modified.is_some());
    }
}