// oletools_rs 0.1.0
//
// Rust port of oletools — analysis tools for Microsoft Office files (VBA macros, DDE, OLE objects, RTF exploits).
// Documentation
//! OOXML parser — reads ZIP-based Office Open XML files.
//!
//! Provides access to XML parts, relationships, and embedded content
//! within .docx, .xlsx, .pptx, .docm, .xlsm files.

use std::collections::HashMap;
use std::io::{Cursor, Read};
use std::path::Path;

use quick_xml::events::Event;
use quick_xml::Reader;

use crate::error::{Error, Result};

/// Represents an open OOXML file (ZIP archive).
///
/// The complete container is held in memory. The archive-reading methods of
/// this type build a new `zip::ZipArchive` over these bytes each time they are
/// called, so no archive state is kept between calls.
pub struct OoxmlParser {
    /// Raw bytes of the container file, copied from the input at construction.
    data: Vec<u8>,
}

/// An XML element extracted by event-based parsing.
///
/// Values of this type are produced by `OoxmlParser::iter_xml_elements`.
#[derive(Debug, Clone)]
pub struct XmlElement {
    /// Local name of the element (the part after any namespace prefix).
    pub name: String,
    /// Attributes as key-value pairs.
    ///
    /// Keys are attribute *local* names, so two attributes that differ only
    /// in their namespace prefix share one key and the one inserted last wins.
    pub attributes: HashMap<String, String>,
    /// Text content (if any); empty when no text was captured.
    pub text: String,
}

impl OoxmlParser {
    /// Open an OOXML file from a filesystem path.
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
        let data = std::fs::read(path)?;
        Self::from_bytes(&data)
    }

    /// Open an OOXML file from a byte slice.
    pub fn from_bytes(data: &[u8]) -> Result<Self> {
        // Verify ZIP signature
        if data.len() < 4 || data[0..4] != [0x50, 0x4B, 0x03, 0x04] {
            return Err(Error::InvalidOoxml("Not a valid ZIP/OOXML file".into()));
        }
        Ok(Self {
            data: data.to_vec(),
        })
    }

    /// Open an OOXML file from a reader.
    pub fn from_reader<R: Read>(mut reader: R) -> Result<Self> {
        let mut data = Vec::new();
        reader.read_to_end(&mut data)?;
        Self::from_bytes(&data)
    }

    /// Check if data starts with ZIP magic bytes.
    pub fn is_ooxml(data: &[u8]) -> bool {
        data.len() >= 4 && data[0..4] == [0x50, 0x4B, 0x03, 0x04]
    }

    /// List all files in the ZIP archive.
    pub fn iter_files(&self) -> Result<Vec<String>> {
        let cursor = Cursor::new(&self.data);
        let archive = zip::ZipArchive::new(cursor)
            .map_err(|e| Error::InvalidOoxml(format!("Invalid ZIP: {e}")))?;

        let names: Vec<String> = (0..archive.len())
            .filter_map(|i| {
                archive
                    .clone()
                    .by_index(i)
                    .ok()
                    .map(|e| e.name().to_string())
            })
            .collect();

        Ok(names)
    }

    /// Read the raw bytes of a file within the ZIP archive.
    pub fn read_file(&self, name: &str) -> Result<Vec<u8>> {
        let cursor = Cursor::new(&self.data);
        let mut archive = zip::ZipArchive::new(cursor)
            .map_err(|e| Error::InvalidOoxml(format!("Invalid ZIP: {e}")))?;

        let mut entry = archive
            .by_name(name)
            .map_err(|e| Error::InvalidOoxml(format!("File not found '{name}': {e}")))?;

        let mut buf = Vec::new();
        entry.read_to_end(&mut buf)?;
        Ok(buf)
    }

    /// Read a file within the ZIP as a UTF-8 string.
    pub fn read_file_as_string(&self, name: &str) -> Result<String> {
        let data = self.read_file(name)?;
        String::from_utf8(data).map_err(|e| Error::InvalidOoxml(format!("Invalid UTF-8: {e}")))
    }

    /// Parse XML elements matching specific tag names from a file in the archive.
    ///
    /// Uses event-based parsing (quick-xml) for memory efficiency.
    /// Returns elements whose local name matches one of the specified tags.
    pub fn iter_xml_elements(&self, subfile: &str, tags: &[&str]) -> Result<Vec<XmlElement>> {
        let xml_data = self.read_file(subfile)?;
        let mut reader = Reader::from_reader(Cursor::new(xml_data));
        reader.config_mut().trim_text(true);

        let mut elements = Vec::new();
        let mut buf = Vec::new();
        let mut current_element: Option<XmlElement> = None;

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                    let local_name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();

                    if tags.iter().any(|&t| t == local_name) {
                        let mut attrs = HashMap::new();
                        for attr in e.attributes().flatten() {
                            let key =
                                String::from_utf8_lossy(attr.key.local_name().as_ref()).to_string();
                            let value = String::from_utf8_lossy(&attr.value).to_string();
                            attrs.insert(key, value);
                        }

                        let elem = XmlElement {
                            name: local_name,
                            attributes: attrs,
                            text: String::new(),
                        };

                        if matches!(reader.read_event_into(&mut Vec::new()), Ok(Event::End(_))) {
                            elements.push(elem);
                        } else {
                            current_element = Some(elem);
                        }
                    }
                }
                Ok(Event::Text(ref e)) => {
                    if let Some(ref mut elem) = current_element {
                        elem.text = e.unescape().unwrap_or_default().to_string();
                    }
                }
                Ok(Event::End(_)) => {
                    if let Some(elem) = current_element.take() {
                        elements.push(elem);
                    }
                }
                Ok(Event::Eof) => break,
                Err(e) => {
                    return Err(Error::XmlParsing(format!(
                        "Error parsing {subfile}: {e}"
                    )));
                }
                _ => {}
            }
            buf.clear();
        }

        Ok(elements)
    }

    /// Check if this is a single XML file (Word 2003 XML, not OOXML ZIP).
    pub fn is_single_xml(data: &[u8]) -> bool {
        if let Ok(text) = std::str::from_utf8(&data[..std::cmp::min(data.len(), 500)]) {
            text.contains("<?xml") && !Self::is_ooxml(data)
        } else {
            false
        }
    }

    /// Find all vbaProject.bin entries in the archive.
    pub fn find_vba_projects(&self) -> Result<Vec<String>> {
        let files = self.iter_files()?;
        Ok(files
            .into_iter()
            .filter(|f| f.to_lowercase().ends_with("vbaproject.bin"))
            .collect())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// ZIP local-file-header signature (`PK\x03\x04`).
    const ZIP_SIGNATURE: [u8; 4] = [0x50, 0x4B, 0x03, 0x04];

    /// The ZIP signature is accepted with trailing data; other or too-short
    /// headers are rejected.
    #[test]
    fn test_is_ooxml() {
        let zip_with_trailing_byte = [0x50, 0x4B, 0x03, 0x04, 0x00];
        let other_four_byte_header = [0xD0, 0xCF, 0x11, 0xE0];
        let too_short = [0x00, 0x01];

        assert!(OoxmlParser::is_ooxml(&zip_with_trailing_byte));
        assert!(!OoxmlParser::is_ooxml(&other_four_byte_header));
        assert!(!OoxmlParser::is_ooxml(&too_short));
    }

    /// An XML declaration is detected; data starting with the ZIP signature is not.
    #[test]
    fn test_is_single_xml() {
        let xml_document: &[u8] = b"<?xml version=\"1.0\"?><doc/>";

        assert!(OoxmlParser::is_single_xml(xml_document));
        assert!(!OoxmlParser::is_single_xml(&ZIP_SIGNATURE));
    }

    /// Input without the ZIP signature cannot be opened.
    #[test]
    fn test_invalid_ooxml() {
        let not_a_zip = [0x00, 0x01, 0x02];

        assert!(OoxmlParser::from_bytes(&not_a_zip).is_err());
    }
}