use std::io::{Cursor, Read};
use quick_xml::events::Event;
use quick_xml::Reader;
use crate::error::{Error, Result};
use crate::msodde::field_parser::{self, DdeField};
/// Extracts DDE fields from a DOCX (OOXML ZIP container) held in memory.
///
/// Every archive entry whose lower-cased name ends with `.xml` and contains
/// `word/document`, `word/header` or `word/footer` is decompressed and scanned
/// with `extract_fields_from_xml`. Entries that cannot be opened are skipped,
/// as are entries that decompress to zero bytes.
///
/// Entries are visited once, by index, so each stored entry is read exactly
/// once and no second by-name lookup is needed.
///
/// # Errors
///
/// * `Error::InvalidOoxml` if `data` cannot be opened as a ZIP archive.
/// * The crate error converted from `std::io::Error` if reading an entry fails.
/// * Any error returned by `extract_fields_from_xml` for a scanned part.
pub fn process_docx(data: &[u8]) -> Result<Vec<DdeField>> {
    let mut archive = zip::ZipArchive::new(Cursor::new(data))
        .map_err(|e| Error::InvalidOoxml(format!("Invalid ZIP: {e}")))?;

    let mut fields = Vec::new();
    for index in 0..archive.len() {
        // Best-effort: an entry that cannot be opened is ignored rather than
        // aborting the whole scan.
        let Ok(mut entry) = archive.by_index(index) else {
            continue;
        };
        if !is_field_bearing_part(entry.name()) {
            continue;
        }

        let mut xml_data = Vec::new();
        entry.read_to_end(&mut xml_data)?;
        if xml_data.is_empty() {
            continue;
        }

        fields.extend(extract_fields_from_xml(&xml_data)?);
    }

    Ok(fields)
}

/// Returns `true` when `name` denotes an archive entry that should be scanned
/// for fields: a `.xml` entry under a `word/document*`, `word/header*` or
/// `word/footer*` name (compared case-insensitively).
fn is_field_bearing_part(name: &str) -> bool {
    let lower = name.to_lowercase();
    lower.ends_with(".xml")
        && ["word/document", "word/header", "word/footer"]
            .iter()
            .any(|part| lower.contains(*part))
}
fn extract_fields_from_xml(xml_data: &[u8]) -> Result<Vec<DdeField>> {
let mut reader = Reader::from_reader(Cursor::new(xml_data));
reader.config_mut().trim_text(true);
let mut fields = Vec::new();
let mut buf = Vec::new();
let mut in_field = false;
let mut current_instruction = String::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
let local_name =
String::from_utf8_lossy(e.local_name().as_ref()).to_string();
match local_name.as_str() {
"fldChar" => {
for attr in e.attributes().flatten() {
let key = String::from_utf8_lossy(attr.key.local_name().as_ref())
.to_string();
if key == "fldCharType" {
let value =
String::from_utf8_lossy(&attr.value).to_string();
match value.as_str() {
"begin" => {
in_field = true;
current_instruction.clear();
}
"end" => {
if in_field {
process_instruction(
¤t_instruction,
&mut fields,
);
in_field = false;
current_instruction.clear();
}
}
_ => {}
}
}
}
}
"fldSimple" => {
for attr in e.attributes().flatten() {
let key = String::from_utf8_lossy(attr.key.local_name().as_ref())
.to_string();
if key == "instr" {
let value =
String::from_utf8_lossy(&attr.value).to_string();
process_instruction(&value, &mut fields);
}
}
}
_ => {}
}
}
Ok(Event::Text(ref e)) => {
if in_field {
current_instruction
.push_str(&e.unescape().unwrap_or_default());
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(Error::XmlParsing(format!("Error parsing Word XML: {e}")));
}
_ => {}
}
buf.clear();
}
Ok(fields)
}
/// Appends to `fields` every DDE field that `instruction` describes.
///
/// The instruction is trimmed of surrounding whitespace and then examined in
/// two independent ways, each of which may contribute one entry:
///
/// 1. Directly: if `field_parser::is_dde_field` accepts the text and
///    `field_parser::parse_dde_field` can parse it, the result is pushed.
/// 2. Via `field_parser::decode_quote_field`: if the decoded text is a DDE
///    field and parses, the result is pushed with `quote_decoded` set to the
///    decoded text.
fn process_instruction(instruction: &str, fields: &mut Vec<DdeField>) {
    let text = instruction.trim();

    // Direct form.
    let direct = if field_parser::is_dde_field(text) {
        field_parser::parse_dde_field(text)
    } else {
        None
    };
    fields.extend(direct);

    // Quote-decoded form.
    let via_quote = field_parser::decode_quote_field(text)
        .filter(|decoded| field_parser::is_dde_field(decoded))
        .and_then(|decoded| {
            let mut dde = field_parser::parse_dde_field(&decoded)?;
            dde.quote_decoded = Some(decoded);
            Some(dde)
        });
    fields.extend(via_quote);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A DDEAUTO instruction carried in a `fldSimple` `instr` attribute is found.
    #[test]
    fn test_extract_fldsimple_dde() {
        let xml = br#"<?xml version="1.0"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:fldSimple w:instr=" DDEAUTO cmd.exe /c calc ">
<w:r><w:t>result</w:t></w:r>
</w:fldSimple>
</w:p>
</w:body>
</w:document>"#;
        let fields = extract_fields_from_xml(xml).unwrap();
        assert_eq!(fields.len(), 1);
        assert_eq!(fields[0].source, "cmd.exe");
    }

    /// A DDEAUTO instruction carried in `instrText` between `fldChar`
    /// begin/end markers is found.
    #[test]
    fn test_extract_fldchar_dde() {
        let xml = br#"<?xml version="1.0"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:fldChar w:fldCharType="begin"/></w:r>
<w:r><w:instrText> DDEAUTO Excel Sheet1!R1C1 </w:instrText></w:r>
<w:r><w:fldChar w:fldCharType="end"/></w:r>
</w:p>
</w:body>
</w:document>"#;
        let fields = extract_fields_from_xml(xml).unwrap();
        assert_eq!(fields.len(), 1);
        assert_eq!(fields[0].source, "Excel");
    }

    /// A non-DDE field (DATE) yields no results.
    #[test]
    fn test_extract_no_dde() {
        // Quotes inside the attribute value must be written as `&quot;` for
        // the fixture to be well-formed XML.
        let xml = br#"<?xml version="1.0"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:fldSimple w:instr=" DATE \@ &quot;yyyy-MM-dd&quot; ">
<w:r><w:t>2024-01-01</w:t></w:r>
</w:fldSimple>
</w:p>
</w:body>
</w:document>"#;
        let fields = extract_fields_from_xml(xml).unwrap();
        assert!(fields.is_empty());
    }

    /// A document with no body content yields no results.
    #[test]
    fn test_extract_empty_xml() {
        let xml = br#"<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>"#;
        let fields = extract_fields_from_xml(xml).unwrap();
        assert!(fields.is_empty());
    }
}