// oletools_rs 0.1.0

//! RTF parser — state machine for parsing Rich Text Format documents.
//!
//! Tracks group nesting, control words, destinations, and binary data.
//! Used by `RtfObjParser` to locate embedded object data.

use crate::error::{Error, Result};

/// A destination within the RTF document.
///
/// The parser records one of these each time it meets a control word that it
/// recognises as a destination keyword. Text that follows, up to the end of the
/// enclosing group, is attributed to the innermost destination that is still open.
#[derive(Debug, Clone)]
pub struct RtfDestination {
    /// Name of the destination (control word that opened it), without the
    /// leading backslash.
    pub name: String,
    /// Group level at which this destination was opened, i.e. the number of
    /// `{` groups that were open when the control word was read.
    pub group_level: usize,
    /// Accumulated hex data (for data destinations like objdata).
    ///
    /// Built from the ASCII hex digits found in the destination's text,
    /// including digits supplied through `\'XX` escapes. The digits are stored
    /// as text and are not decoded into bytes here.
    pub hex_data: String,
    /// Accumulated binary data from \bin commands, in the order the payloads
    /// appear in the document.
    pub bin_data: Vec<u8>,
}

/// Result of parsing an RTF document.
///
/// Returned by `RtfParser::parse` when the input passes the RTF signature check.
#[derive(Debug)]
pub struct RtfParseResult {
    /// All destinations found during parsing, in the order their control words
    /// were encountered. Closed destinations stay in this list.
    pub destinations: Vec<RtfDestination>,
    /// Maximum group nesting depth encountered (the outermost `{` counts as 1).
    pub max_depth: usize,
}

/// RTF state-machine parser.
///
/// This is a unit struct: it holds no state between calls, and all parsing
/// state lives in locals of its associated functions.
pub struct RtfParser;

impl RtfParser {
    /// Parse an RTF document and extract all destinations.
    pub fn parse(data: &[u8]) -> Result<RtfParseResult> {
        // Verify RTF signature
        if data.len() < 5 || &data[0..4] != b"{\\rt" {
            return Err(Error::RtfParsing("Not a valid RTF document".into()));
        }

        let mut group_level: usize = 0;
        let mut max_depth: usize = 0;
        let mut destinations: Vec<RtfDestination> = Vec::new();
        let mut dest_stack: Vec<usize> = Vec::new(); // indices into destinations
        let mut pos = 0;
        let len = data.len();

        while pos < len {
            match data[pos] {
                b'{' => {
                    group_level += 1;
                    if group_level > max_depth {
                        max_depth = group_level;
                    }
                    pos += 1;
                }
                b'}' => {
                    // Close destination if it matches current group level
                    if let Some(&dest_idx) = dest_stack.last()
                        && destinations[dest_idx].group_level == group_level {
                            dest_stack.pop();
                        }
                    group_level = group_level.saturating_sub(1);
                    pos += 1;
                }
                b'\\' => {
                    pos += 1;
                    if pos >= len {
                        break;
                    }

                    // Check for \'XX hex escape
                    if data[pos] == b'\'' {
                        pos += 1;
                        if pos + 2 <= len {
                            let hex_str =
                                String::from_utf8_lossy(&data[pos..pos + 2]).to_string();
                            // Add to current destination if any
                            if let Some(&dest_idx) = dest_stack.last() {
                                destinations[dest_idx].hex_data.push_str(&hex_str);
                            }
                            pos += 2;
                        }
                        continue;
                    }

                    // Parse control word
                    let (word, param, new_pos) = Self::parse_control_word(data, pos);
                    pos = new_pos;

                    // Handle \bin<N> — read N bytes of binary data
                    if word == "bin" {
                        if let Some(n) = param {
                            let n = n as usize;
                            let end = std::cmp::min(pos + n, len);
                            let bin_bytes = data[pos..end].to_vec();
                            if let Some(&dest_idx) = dest_stack.last() {
                                destinations[dest_idx].bin_data.extend_from_slice(&bin_bytes);
                            }
                            pos = end;
                        }
                        continue;
                    }

                    // Check if this is a destination keyword
                    if Self::is_destination_keyword(&word) {
                        let dest = RtfDestination {
                            name: word,
                            group_level,
                            hex_data: String::new(),
                            bin_data: Vec::new(),
                        };
                        let idx = destinations.len();
                        destinations.push(dest);
                        dest_stack.push(idx);
                    }
                }
                // Whitespace and regular text
                ch => {
                    // If we are in a destination, accumulate hex chars
                    if let Some(&dest_idx) = dest_stack.last()
                        && ch.is_ascii_hexdigit() {
                            destinations[dest_idx].hex_data.push(ch as char);
                        }
                        // Skip whitespace and other non-hex chars in data destinations
                    pos += 1;
                }
            }
        }

        Ok(RtfParseResult {
            destinations,
            max_depth,
        })
    }

    /// Parse a control word starting at `start` (the byte after the backslash).
    ///
    /// Returns `(word, optional_numeric_param, new_pos)`:
    ///
    /// * If the byte at `start` is not an ASCII letter, it is a control symbol
    ///   (e.g. `\*`, `\\`, `\{`, `\}`): the word is that single byte, there is no
    ///   parameter, and `new_pos` is `start + 1`.
    /// * Otherwise the word is the run of ASCII letters, optionally followed by a
    ///   decimal parameter. A leading `-` is part of the parameter only when a
    ///   digit follows it; a bare `-` is left in the stream as ordinary text.
    ///   Digits that do not fit in an `i64` are consumed but yield `None`.
    /// * A single space directly after the word or parameter is a delimiter and
    ///   is consumed.
    /// * If `start` is at or past the end of `data`, the result is an empty word,
    ///   no parameter, and `new_pos == start`.
    fn parse_control_word(data: &[u8], start: usize) -> (String, Option<i64>, usize) {
        let first = match data.get(start) {
            Some(&byte) => byte,
            // Nothing to read; also keeps an out-of-range `start` from panicking.
            None => return (String::new(), None, start),
        };

        // Non-alphabetic control symbol (e.g., \*, \\, \{, \})
        if !first.is_ascii_alphabetic() {
            return (char::from(first).to_string(), None, start + 1);
        }

        // Alphabetic control word
        let word_len = data[start..]
            .iter()
            .take_while(|b| b.is_ascii_alphabetic())
            .count();
        let mut pos = start + word_len;
        // Only ASCII letters were accepted, so each byte maps to exactly one char.
        let word: String = data[start..pos].iter().map(|&b| char::from(b)).collect();

        // Optional numeric parameter (possibly negative). The sign is consumed
        // only together with at least one digit.
        let digits_from = if data.get(pos) == Some(&b'-') {
            pos + 1
        } else {
            pos
        };
        let digit_count = data[digits_from..]
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count();
        let mut param: Option<i64> = None;
        if digit_count > 0 {
            let param_end = digits_from + digit_count;
            // The slice holds only `-` and ASCII digits, so it is valid UTF-8;
            // the parse can fail only when the value overflows `i64`.
            param = std::str::from_utf8(&data[pos..param_end])
                .ok()
                .and_then(|text| text.parse::<i64>().ok());
            pos = param_end;
        }

        // Skip optional space delimiter
        if data.get(pos) == Some(&b' ') {
            pos += 1;
        }

        (word, param, pos)
    }

    /// Report whether `word` is one of the destination control words this parser
    /// tracks.
    ///
    /// The comparison is exact and case-sensitive, and `word` is expected without
    /// its leading backslash.
    fn is_destination_keyword(word: &str) -> bool {
        // Control words that open a destination, grouped by purpose.
        const DESTINATION_KEYWORDS: &[&str] = &[
            // Embedded objects and pictures
            "objdata",
            "objclass",
            "objname",
            "rsltpict",
            "pict",
            // Fields
            "fldinst",
            "fldrslt",
            "datafield",
            "blipuid",
            // Document tables
            "fonttbl",
            "colortbl",
            "stylesheet",
            // Document information
            "info",
            "title",
            "author",
            "operator",
            "category",
            "comment",
            "doccomm",
            "subject",
            "company",
            "hlinkbase",
        ];

        DESTINATION_KEYWORDS.iter().any(|&keyword| keyword == word)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `rtf`, panicking if the parser rejects it.
    fn parse_ok(rtf: &[u8]) -> RtfParseResult {
        RtfParser::parse(rtf).unwrap()
    }

    /// Collect the destinations in `parsed` whose name equals `name`.
    fn named<'a>(parsed: &'a RtfParseResult, name: &str) -> Vec<&'a RtfDestination> {
        parsed
            .destinations
            .iter()
            .filter(|dest| dest.name == name)
            .collect()
    }

    #[test]
    fn test_parse_minimal_rtf() {
        // A single group gives a nesting depth of exactly one.
        let parsed = parse_ok(br"{\rtf1 Hello World}");
        assert_eq!(parsed.max_depth, 1);
    }

    #[test]
    fn test_parse_nested_groups() {
        let parsed = parse_ok(br"{\rtf1 {{\b bold}}}");
        assert!(parsed.max_depth >= 3);
    }

    #[test]
    fn test_parse_objdata_destination() {
        let parsed = parse_ok(br"{\rtf1 {\object {\objdata 0105000002000000}}}");
        let found = named(&parsed, "objdata");
        assert_eq!(found.len(), 1);
        assert!(found[0].hex_data.contains("0105000002000000"));
    }

    #[test]
    fn test_parse_not_rtf() {
        // Input without the RTF signature must be rejected.
        assert!(RtfParser::parse(b"Not an RTF file").is_err());
    }

    #[test]
    fn test_parse_empty() {
        assert!(RtfParser::parse(b"").is_err());
    }

    #[test]
    fn test_parse_hex_escape() {
        let parsed = parse_ok(br"{\rtf1 {\objdata \'41\'42}}");
        let found = named(&parsed, "objdata");
        assert_eq!(found.len(), 1);
        // Each \'XX escape contributes its two digits to hex_data.
        let hex = &found[0].hex_data;
        assert!(hex.contains("41"));
        assert!(hex.contains("42"));
    }

    #[test]
    fn test_parse_control_word() {
        // Word, numeric parameter, then the delimiter space is consumed.
        let (word, param, pos) = RtfParser::parse_control_word(b"bold123 text", 0);
        assert_eq!(word, "bold");
        assert_eq!(param, Some(123));
        assert_eq!(pos, 8);
    }

    #[test]
    fn test_parse_control_word_no_param() {
        // No digits follow the word, so only the delimiter space is consumed.
        let (word, param, pos) = RtfParser::parse_control_word(b"par some text", 0);
        assert_eq!(word, "par");
        assert_eq!(param, None);
        assert_eq!(pos, 4);
    }

    #[test]
    fn test_fldinst_destination() {
        let parsed = parse_ok(br"{\rtf1 {\fldinst HYPERLINK}}");
        assert_eq!(named(&parsed, "fldinst").len(), 1);
    }
}