oletools_rs 0.1.0

Rust port of oletools — analysis tools for Microsoft Office files (VBA macros, DDE, OLE objects, RTF exploits)
Documentation
//! VBA Parser — main API for extracting and analyzing VBA macros.
//!
//! Supports OLE2 (.doc, .xls), OOXML (.docm, .xlsm), FlatOPC, and
//! Word 2003 XML formats.

use std::io::{Cursor, Read};
use std::path::Path;

use base64::Engine as _;

use crate::error::{Error, Result};
use crate::ole::container::OleFile;
use crate::vba::module::VbaModule;
use crate::vba::project::{ModuleType, VbaProject};
use crate::vba::scanner::{Finding, VbaScanner};

/// Information about an extracted macro.
///
/// One value is produced for each VBA module whose decompressed source is
/// not blank.
#[derive(Debug, Clone)]
pub struct MacroInfo {
    /// Original filename or container path.
    ///
    /// For macros found inside a nested container, the parser's filename is
    /// preceded by a prefix such as `"<zip entry name>//"` or `"FlatOPC//"`.
    pub filename: String,
    /// OLE stream path (e.g., "VBA/ThisDocument").
    ///
    /// Built from the VBA storage path and the module's stream name.
    pub stream_path: String,
    /// VBA module name.
    pub name: String,
    /// Decompressed VBA source code.
    pub code: String,
    /// Module type.
    pub module_type: ModuleType,
}

/// Results from VBA analysis.
///
/// Aggregates the scanner findings of every extracted macro together with
/// summary flags derived from those findings.
#[derive(Debug, Clone)]
pub struct AnalysisResults {
    /// All findings from scanning, pooled across every macro.
    pub findings: Vec<Finding>,
    /// Number of macros found (macros extracted, not findings).
    pub macro_count: usize,
    /// Whether any AutoExec triggers were found.
    pub has_autoexec: bool,
    /// Whether any suspicious keywords were found.
    pub has_suspicious: bool,
    /// Whether any IOCs were found.
    pub has_ioc: bool,
}

/// Detected source format of the VBA container.
///
/// Selects which detection/extraction routine `VbaParser` dispatches to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SourceFormat {
    /// OLE2 compound file (e.g. `.doc`, `.xls`).
    Ole,
    /// ZIP-based package (e.g. `.docm`, `.xlsm`).
    Ooxml,
    /// Flat OPC: a package serialized as a single XML document.
    FlatOpc,
    /// Word 2003 XML document.
    Word2003Xml,
}

/// Main VBA parser. Extracts and analyzes VBA macros from Office documents.
///
/// The whole document is held in memory; the container format is detected
/// once at construction time.
pub struct VbaParser {
    /// Raw bytes of the document.
    data: Vec<u8>,
    /// Display name: the file name component of the source path, or
    /// `"<bytes>"` when constructed from an in-memory buffer.
    filename: String,
    /// Detected container format; `None` when the data was not recognized.
    format: Option<SourceFormat>,
}

impl VbaParser {
    /// Build a parser by reading the whole file at `path` into memory.
    ///
    /// The stored filename is the final path component (lossily converted to
    /// UTF-8), or an empty string when the path has no final component. The
    /// container format is detected immediately from the file contents.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read.
    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
        let source = path.as_ref();
        let bytes = std::fs::read(source)?;

        let filename = match source.file_name() {
            Some(os_name) => os_name.to_string_lossy().into_owned(),
            None => String::new(),
        };
        let format = Self::detect_format(&bytes);

        Ok(Self {
            data: bytes,
            filename,
            format,
        })
    }

    /// Build a parser from an in-memory document.
    ///
    /// The bytes are copied into the parser, the filename is set to the
    /// placeholder `"<bytes>"`, and the container format is detected right
    /// away. Unrecognized data still yields a parser; the failure surfaces
    /// later when detection or extraction is requested.
    pub fn from_bytes(data: &[u8]) -> Result<Self> {
        let parser = Self {
            format: Self::detect_format(data),
            filename: String::from("<bytes>"),
            data: data.to_vec(),
        };
        Ok(parser)
    }

    /// Check if the document contains VBA macros.
    pub fn detect_vba_macros(&self) -> Result<bool> {
        match self.format {
            Some(SourceFormat::Ole) => self.detect_vba_ole(),
            Some(SourceFormat::Ooxml) => self.detect_vba_ooxml(),
            Some(SourceFormat::FlatOpc) => self.detect_vba_flatopc(),
            Some(SourceFormat::Word2003Xml) => self.detect_vba_word2003xml(),
            None => Err(Error::UnsupportedFormat(
                "Cannot determine file format".into(),
            )),
        }
    }

    /// Extract all VBA macros from the document.
    pub fn extract_macros(&self) -> Result<Vec<MacroInfo>> {
        match self.format {
            Some(SourceFormat::Ole) => self.extract_macros_ole(),
            Some(SourceFormat::Ooxml) => self.extract_macros_ooxml(),
            Some(SourceFormat::FlatOpc) => self.extract_macros_flatopc(),
            Some(SourceFormat::Word2003Xml) => self.extract_macros_word2003xml(),
            None => Err(Error::UnsupportedFormat(
                "Cannot determine file format".into(),
            )),
        }
    }

    /// Extract all macros and scan their source for suspicious patterns.
    ///
    /// Every macro's code is passed to `VbaScanner::scan`; the findings are
    /// pooled and summarized into the three `has_*` flags.
    ///
    /// # Errors
    ///
    /// Propagates any error from `extract_macros`.
    pub fn analyze(&self) -> Result<AnalysisResults> {
        use crate::vba::keywords::FindingType;

        let macros = self.extract_macros()?;

        let mut findings = Vec::new();
        for extracted in &macros {
            findings.extend(VbaScanner::scan(&extracted.code));
        }

        // Derive all three summary flags in a single walk over the findings.
        let (mut has_autoexec, mut has_suspicious, mut has_ioc) = (false, false, false);
        for finding in &findings {
            has_autoexec |= finding.finding_type == FindingType::AutoExec;
            has_suspicious |= finding.finding_type == FindingType::Suspicious;
            has_ioc |= finding.finding_type == FindingType::Ioc;
        }

        Ok(AnalysisResults {
            findings,
            macro_count: macros.len(),
            has_autoexec,
            has_suspicious,
            has_ioc,
        })
    }

    // --- Format detection ---

    /// Sniff the container format from the leading bytes of `data`.
    ///
    /// Checks, in order:
    /// 1. OLE2 compound file, as decided by `OleFile::is_ole`.
    /// 2. ZIP local-file-header signature `PK\x03\x04`, treated as OOXML.
    /// 3. XML markers within the first 1000 bytes:
    ///    - `pkg:package` means Flat OPC;
    ///    - `<?xml` together with a WordprocessingML marker means Word 2003 XML;
    ///    - a bare `<?mso-application` falls back to Flat OPC.
    ///
    /// The `<?mso-application` processing instruction is checked last because
    /// both Flat OPC and Word 2003 XML files carry it; testing it first would
    /// make the Word 2003 XML classification unreachable for such files.
    ///
    /// Returns `None` when nothing matches or the inspected prefix is not
    /// valid UTF-8.
    fn detect_format(data: &[u8]) -> Option<SourceFormat> {
        // Number of leading bytes inspected for XML markers.
        const SNIFF_LEN: usize = 1000;

        if OleFile::is_ole(data) {
            return Some(SourceFormat::Ole);
        }
        if data.starts_with(&[0x50, 0x4B, 0x03, 0x04]) {
            return Some(SourceFormat::Ooxml);
        }

        // Check for XML-based formats
        let head = &data[..data.len().min(SNIFF_LEN)];
        let text = match std::str::from_utf8(head) {
            Ok(text) => text,
            // `error_len() == None` means the input merely ended in the middle
            // of a multi-byte character, which is what happens when the fixed
            // window cuts one in half. The bytes before it are valid UTF-8.
            Err(err) if err.error_len().is_none() => {
                std::str::from_utf8(&head[..err.valid_up_to()]).ok()?
            }
            // Genuinely invalid bytes: not a text format we recognize.
            Err(_) => return None,
        };

        if text.contains("pkg:package") {
            return Some(SourceFormat::FlatOpc);
        }

        let has_wordml_marker = text.contains("urn:schemas-microsoft-com:office:word")
            || text.contains("w:wordDocument")
            || text.contains("w:document");
        if text.contains("<?xml") && has_wordml_marker {
            return Some(SourceFormat::Word2003Xml);
        }

        if text.contains("<?mso-application") {
            return Some(SourceFormat::FlatOpc);
        }

        None
    }

    // --- OLE extraction ---

    /// Report whether the OLE container holds a VBA project.
    ///
    /// A project is assumed present when some stream path, compared
    /// case-insensitively, mentions `vba` and ends in `/dir`.
    fn detect_vba_ole(&self) -> Result<bool> {
        let ole = OleFile::from_bytes(&self.data)?;
        let streams = ole.list_streams();

        for stream in streams.iter() {
            let folded = stream.to_lowercase();
            if folded.ends_with("/dir") && folded.contains("vba") {
                return Ok(true);
            }
        }
        Ok(false)
    }

    /// Extract macros from a document that is itself an OLE container.
    ///
    /// The document is the outermost container, so no filename prefix is used.
    fn extract_macros_ole(&self) -> Result<Vec<MacroInfo>> {
        let no_prefix = "";
        let mut container = OleFile::from_bytes(&self.data)?;
        self.extract_macros_from_ole(&mut container, no_prefix)
    }

    /// Extract the macros of the first VBA project found in `ole`.
    ///
    /// The project is located through its `dir` stream (a stream path that,
    /// case-insensitively, mentions `vba` and ends in `/dir`). Each module
    /// listed in the parsed `dir` stream is read from the same storage and
    /// decompressed. Modules whose stream cannot be opened, whose source
    /// cannot be extracted, or whose source is blank are skipped.
    ///
    /// `prefix` is prepended to the parser's filename in every returned
    /// `MacroInfo`.
    ///
    /// Returns an empty vector when no `dir` stream exists.
    ///
    /// # Errors
    ///
    /// Fails if the `dir` stream cannot be read or parsed.
    fn extract_macros_from_ole(
        &self,
        ole: &mut OleFile,
        prefix: &str,
    ) -> Result<Vec<MacroInfo>> {
        let streams = ole.list_streams();

        // Locate the project's "dir" stream; without it there is nothing to extract.
        let located = streams
            .iter()
            .find(|candidate| {
                let folded = candidate.to_lowercase();
                folded.contains("vba") && folded.ends_with("/dir")
            })
            .cloned();
        let Some(dir_path) = located else {
            return Ok(Vec::new());
        };

        // The VBA storage is the parent of the "dir" stream.
        let vba_storage = match dir_path.rsplit_once('/') {
            Some((parent, _)) => parent.to_string(),
            None => String::new(),
        };

        let dir_data = ole.open_stream(&dir_path)?;
        let project = VbaProject::from_dir_stream(&dir_data)?;

        let mut macros = Vec::new();
        for module_desc in &project.modules {
            let stream_path = match vba_storage.as_str() {
                "" => module_desc.stream_name.clone(),
                storage => format!("{}/{}", storage, module_desc.stream_name),
            };

            // Best effort: a module stream that cannot be opened is skipped.
            let Ok(stream_data) = ole.open_stream(&stream_path) else {
                continue;
            };

            // A failed decompression yields the default (empty) source, which
            // is then dropped by the blank check below.
            let source = VbaModule::extract_source(
                &stream_data,
                module_desc.text_offset,
                project.codepage,
            )
            .unwrap_or_default();

            if source.trim().is_empty() {
                continue;
            }

            macros.push(MacroInfo {
                filename: format!("{}{}", prefix, self.filename),
                stream_path,
                name: module_desc.name.clone(),
                code: source,
                module_type: module_desc.module_type,
            });
        }

        Ok(macros)
    }

    // --- OOXML extraction ---

    /// Report whether the ZIP package contains a `vbaProject.bin` part.
    ///
    /// Entry names are compared case-insensitively by suffix; entries that
    /// cannot be opened are ignored.
    ///
    /// # Errors
    ///
    /// Returns `Error::InvalidOoxml` if the data is not a readable ZIP archive.
    fn detect_vba_ooxml(&self) -> Result<bool> {
        let mut archive = zip::ZipArchive::new(Cursor::new(&self.data))
            .map_err(|e| Error::InvalidOoxml(format!("Invalid ZIP: {e}")))?;

        let entry_count = archive.len();
        let found = (0..entry_count).any(|idx| {
            archive
                .by_index(idx)
                .map(|entry| entry.name().to_lowercase().ends_with("vbaproject.bin"))
                .unwrap_or(false)
        });
        Ok(found)
    }

    fn extract_macros_ooxml(&self) -> Result<Vec<MacroInfo>> {
        let cursor = Cursor::new(&self.data);
        let mut archive = zip::ZipArchive::new(cursor)
            .map_err(|e| Error::InvalidOoxml(format!("Invalid ZIP: {e}")))?;

        let mut macros = Vec::new();

        // Find vbaProject.bin entries
        let vba_entries: Vec<String> = (0..archive.len())
            .filter_map(|i| {
                archive
                    .by_index(i)
                    .ok()
                    .filter(|e| e.name().to_lowercase().ends_with("vbaproject.bin"))
                    .map(|e| e.name().to_string())
            })
            .collect();

        for vba_path in vba_entries {
            let mut vba_data = Vec::new();
            {
                let mut entry = archive
                    .by_name(&vba_path)
                    .map_err(|e| Error::InvalidOoxml(format!("Cannot read {vba_path}: {e}")))?;
                entry.read_to_end(&mut vba_data)?;
            }

            // vbaProject.bin is itself an OLE container
            let mut vba_ole = OleFile::from_bytes(&vba_data)?;
            let prefix = format!("{}//", vba_path);
            let extracted = self.extract_macros_from_ole(&mut vba_ole, &prefix)?;
            macros.extend(extracted);
        }

        Ok(macros)
    }

    // --- FlatOPC extraction ---

    /// Report whether the Flat OPC text mentions a VBA part.
    ///
    /// This is a plain substring test for `vbaProject` or `vbaData` over the
    /// lossily decoded document; it never fails.
    fn detect_vba_flatopc(&self) -> Result<bool> {
        let text = String::from_utf8_lossy(&self.data);
        let mentions_vba = ["vbaProject", "vbaData"]
            .iter()
            .any(|marker| text.contains(*marker));
        Ok(mentions_vba)
    }

    /// Extract macros from base64-encoded OLE payloads embedded in XML.
    ///
    /// Scans the lossily decoded document for every `<pkg:binaryData>` and
    /// `<w:binData` element, base64-decodes the element's text content (with
    /// whitespace removed), and runs `extract_macros_from_ole` on each payload
    /// that is an OLE container. Payloads that are not valid base64 or are not
    /// OLE are skipped.
    ///
    /// All occurrences are examined, not just the first: a document may hold
    /// several binary parts, and the VBA project need not be the first one.
    ///
    /// # Errors
    ///
    /// Propagates errors from parsing a payload that passed the OLE signature
    /// check, and from extracting its macros.
    fn extract_macros_flatopc(&self) -> Result<Vec<MacroInfo>> {
        let text = String::from_utf8_lossy(&self.data);
        let mut macros = Vec::new();

        for pattern in ["<pkg:binaryData>", "<w:binData"] {
            for (tag_start, _) in text.match_indices(pattern) {
                // Content begins right after the '>' that closes the opening
                // tag (`<w:binData` may carry attributes before it).
                let Some(content_start) =
                    text[tag_start..].find('>').map(|i| tag_start + i + 1)
                else {
                    continue;
                };
                // Content runs up to the next '<', i.e. the closing tag.
                let Some(content_end) =
                    text[content_start..].find('<').map(|i| content_start + i)
                else {
                    continue;
                };

                // Base64 in XML is typically line-wrapped; strip whitespace
                // before decoding.
                let b64_content: String = text[content_start..content_end]
                    .chars()
                    .filter(|c| !c.is_whitespace())
                    .collect();

                let Ok(vba_data) =
                    base64::engine::general_purpose::STANDARD.decode(&b64_content)
                else {
                    continue;
                };
                if !OleFile::is_ole(&vba_data) {
                    continue;
                }

                let mut vba_ole = OleFile::from_bytes(&vba_data)?;
                macros.extend(self.extract_macros_from_ole(&mut vba_ole, "FlatOPC//")?);
            }
        }

        Ok(macros)
    }

    // --- Word 2003 XML extraction ---

    /// Report whether the Word 2003 XML text hints at embedded VBA.
    ///
    /// This is a plain substring test for `Microsoft.VBA` or `w:binData` over
    /// the lossily decoded document; it never fails.
    fn detect_vba_word2003xml(&self) -> Result<bool> {
        let text = String::from_utf8_lossy(&self.data);
        let hints_at_vba = ["Microsoft.VBA", "w:binData"]
            .iter()
            .any(|marker| text.contains(*marker));
        Ok(hints_at_vba)
    }

    /// Extract macros from a Word 2003 XML document.
    ///
    /// Delegates to `extract_macros_flatopc`, which already looks inside
    /// `<w:binData>` elements for base64-encoded OLE payloads.
    fn extract_macros_word2003xml(&self) -> Result<Vec<MacroInfo>> {
        let macros = self.extract_macros_flatopc()?;
        Ok(macros)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Data starting with the OLE2 signature is classified as OLE.
    #[test]
    fn test_detect_format_ole() {
        let header: [u8; 10] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1, 0x00, 0x00];
        let detected = VbaParser::detect_format(&header);
        assert_eq!(detected, Some(SourceFormat::Ole));
    }

    /// Data starting with the ZIP local-file-header signature is classified as OOXML.
    #[test]
    fn test_detect_format_zip() {
        let header: &[u8] = b"PK\x03\x04\x00\x00";
        let detected = VbaParser::detect_format(header);
        assert_eq!(detected, Some(SourceFormat::Ooxml));
    }

    /// Bytes matching no known signature yield no format.
    #[test]
    fn test_detect_format_unknown() {
        let junk: [u8; 4] = [0x00, 0x01, 0x02, 0x03];
        assert!(VbaParser::detect_format(&junk).is_none());
    }

    /// Construction accepts unrecognized data; the error only surfaces on use.
    #[test]
    fn test_parser_invalid_data() {
        let parser = VbaParser::from_bytes(&[0x00, 0x01, 0x02, 0x03])
            .expect("parser creation succeeds even for unrecognized data");
        let detection = parser.detect_vba_macros();
        assert!(detection.is_err());
    }
}