use crate::error::{Error, Result};
/// One RTF destination recorded by the parser, together with the payload
/// that was attributed to it.
#[derive(Debug, Clone)]
pub struct RtfDestination {
/// Control word that introduced the destination (for example `objdata`),
/// stored without the leading backslash.
pub name: String,
/// Group nesting depth that was current when the control word was read.
pub group_level: usize,
/// Characters collected as hex payload: ASCII hex digits found in the
/// destination text plus the two characters following each `\'` escape.
pub hex_data: String,
/// Raw bytes copied from `\binN` runs seen while this destination was the
/// innermost one being tracked.
pub bin_data: Vec<u8>,
}
/// Outcome of scanning an RTF document.
#[derive(Debug)]
pub struct RtfParseResult {
/// Every destination that was recognised, in the order its control word
/// appeared in the input.
pub destinations: Vec<RtfDestination>,
/// Deepest `{` nesting level reached anywhere in the input.
pub max_depth: usize,
}
/// Namespace type for the RTF scanner; it carries no state and only exposes
/// associated functions.
pub struct RtfParser;
impl RtfParser {
    /// Scans an RTF byte stream and records every recognised destination
    /// together with the hex and binary payload found inside it.
    ///
    /// The scanner is deliberately tolerant: unbalanced braces, truncated
    /// escapes and oversized `\bin` counts never cause a failure, they are
    /// clamped to what the input actually contains.
    ///
    /// Payload is always attributed to the innermost destination that is
    /// still open. A destination stays open until the group in which its
    /// control word appeared is closed.
    ///
    /// # Errors
    ///
    /// Returns [`Error::RtfParsing`] when `data` is shorter than five bytes
    /// or does not start with `{\rt`. Only that prefix is compared; the
    /// remainder of the header keyword is not inspected.
    pub fn parse(data: &[u8]) -> Result<RtfParseResult> {
        if data.len() < 5 || !data.starts_with(b"{\\rt") {
            return Err(Error::RtfParsing("Not a valid RTF document".into()));
        }

        let len = data.len();
        let mut group_level: usize = 0;
        let mut max_depth: usize = 0;
        let mut destinations: Vec<RtfDestination> = Vec::new();
        // Indices into `destinations` for the destinations that are still
        // open; the last entry is the one that receives payload.
        let mut dest_stack: Vec<usize> = Vec::new();
        let mut pos = 0;

        while pos < len {
            match data[pos] {
                b'{' => {
                    group_level += 1;
                    max_depth = max_depth.max(group_level);
                    pos += 1;
                }
                b'}' => {
                    // Close every destination opened at this level or deeper.
                    // Several destination keywords may share one group, and
                    // popping only one of them would leave the others on the
                    // stack to soak up payload that belongs to outer groups.
                    while let Some(&top) = dest_stack.last() {
                        if destinations[top].group_level < group_level {
                            break;
                        }
                        dest_stack.pop();
                    }
                    // Stray closing braces must not underflow the depth.
                    group_level = group_level.saturating_sub(1);
                    pos += 1;
                }
                b'\\' => {
                    pos += 1;
                    if pos >= len {
                        // A lone backslash at the very end of the input.
                        break;
                    }

                    if data[pos] == b'\'' {
                        // `\'hh` escape: the two following bytes are recorded
                        // as-is (lossy UTF-8), without validating that they
                        // are hex digits. A truncated escape is skipped.
                        pos += 1;
                        if let Some(pair) = data.get(pos..pos + 2) {
                            if let Some(&top) = dest_stack.last() {
                                destinations[top]
                                    .hex_data
                                    .push_str(&String::from_utf8_lossy(pair));
                            }
                            pos += 2;
                        }
                        continue;
                    }

                    let (word, param, next) = Self::parse_control_word(data, pos);
                    pos = next;

                    if word == "bin" {
                        if let Some(count) = param {
                            // The count comes from untrusted input: treat a
                            // negative value as zero and never read past the
                            // end of the buffer. `count` is positive when it
                            // is cast and the minimum is bounded by
                            // `remaining`, so neither cast can lose data.
                            let remaining = len - pos;
                            let take = if count <= 0 {
                                0
                            } else {
                                (count as u64).min(remaining as u64) as usize
                            };
                            let end = pos + take;
                            if let Some(&top) = dest_stack.last() {
                                destinations[top]
                                    .bin_data
                                    .extend_from_slice(&data[pos..end]);
                            }
                            // The raw bytes are skipped even when no
                            // destination is open so that they are not
                            // misread as RTF syntax.
                            pos = end;
                        }
                        continue;
                    }

                    if Self::is_destination_keyword(&word) {
                        dest_stack.push(destinations.len());
                        destinations.push(RtfDestination {
                            name: word,
                            group_level,
                            hex_data: String::new(),
                            bin_data: Vec::new(),
                        });
                    }
                }
                byte => {
                    // Plain text: only hex digits inside an open destination
                    // are kept, everything else is ignored.
                    match dest_stack.last() {
                        Some(&top) if byte.is_ascii_hexdigit() => {
                            destinations[top].hex_data.push(char::from(byte));
                        }
                        _ => {}
                    }
                    pos += 1;
                }
            }
        }

        Ok(RtfParseResult {
            destinations,
            max_depth,
        })
    }

    /// Reads one control word or control symbol beginning at `start`, which
    /// is the index of the byte right after the backslash.
    ///
    /// Returns `(word, parameter, next)` where `next` is the index of the
    /// first byte that was not consumed.
    ///
    /// * A non-alphabetic byte at `start` is a control symbol: it is returned
    ///   as a one-character word with no parameter.
    /// * Otherwise the word is the run of ASCII letters, optionally followed
    ///   by a decimal parameter with an optional leading `-`. A parameter
    ///   that does not fit in an `i64` (or a `-` without digits) is consumed
    ///   but reported as `None`.
    /// * A single space directly after the word/parameter is the delimiter
    ///   and is consumed as well.
    ///
    /// # Panics
    ///
    /// Panics if `start` is greater than `data.len()`.
    fn parse_control_word(data: &[u8], start: usize) -> (String, Option<i64>, usize) {
        let len = data.len();

        match data.get(start) {
            Some(&symbol) if !symbol.is_ascii_alphabetic() => {
                return (char::from(symbol).to_string(), None, start + 1);
            }
            _ => {}
        }

        let word_end = start
            + data[start..]
                .iter()
                .take_while(|b| b.is_ascii_alphabetic())
                .count();
        let word = String::from_utf8_lossy(&data[start..word_end]).into_owned();

        let mut pos = word_end;
        let mut param: Option<i64> = None;
        if pos < len && (data[pos] == b'-' || data[pos].is_ascii_digit()) {
            let param_start = pos;
            if data[pos] == b'-' {
                pos += 1;
            }
            pos += data[pos..]
                .iter()
                .take_while(|b| b.is_ascii_digit())
                .count();
            // The span holds only `-` and ASCII digits, so it is valid UTF-8;
            // the parse itself can still fail on overflow or a bare `-`.
            param = std::str::from_utf8(&data[param_start..pos])
                .ok()
                .and_then(|digits| digits.parse::<i64>().ok());
        }

        if data.get(pos) == Some(&b' ') {
            pos += 1;
        }

        (word, param, pos)
    }

    /// Tells whether `word` is one of the control words whose group content
    /// is tracked as a destination by [`RtfParser::parse`].
    fn is_destination_keyword(word: &str) -> bool {
        matches!(
            word,
            "objdata"
                | "objclass"
                | "objname"
                | "rsltpict"
                | "pict"
                | "fldinst"
                | "fldrslt"
                | "datafield"
                | "blipuid"
                | "fonttbl"
                | "colortbl"
                | "stylesheet"
                | "info"
                | "title"
                | "author"
                | "operator"
                | "category"
                | "comment"
                | "doccomm"
                | "subject"
                | "company"
                | "hlinkbase"
        )
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A single top-level group yields a nesting depth of one.
    #[test]
    fn test_parse_minimal_rtf() {
        let parsed = RtfParser::parse(br"{\rtf1 Hello World}").unwrap();
        assert_eq!(parsed.max_depth, 1);
    }

    /// Groups nested inside the root group raise the reported depth.
    #[test]
    fn test_parse_nested_groups() {
        let parsed = RtfParser::parse(br"{\rtf1 {{\b bold}}}").unwrap();
        assert!(parsed.max_depth >= 3);
    }

    /// Hex digits written inside `\objdata` end up in its `hex_data`.
    #[test]
    fn test_parse_objdata_destination() {
        let parsed = RtfParser::parse(br"{\rtf1 {\object {\objdata 0105000002000000}}}").unwrap();
        let payloads: Vec<&str> = parsed
            .destinations
            .iter()
            .filter(|dest| dest.name == "objdata")
            .map(|dest| dest.hex_data.as_str())
            .collect();
        assert_eq!(payloads.len(), 1);
        assert!(payloads[0].contains("0105000002000000"));
    }

    /// Input without the RTF header is rejected.
    #[test]
    fn test_parse_not_rtf() {
        assert!(RtfParser::parse(b"Not an RTF file").is_err());
    }

    /// Empty input is rejected as well.
    #[test]
    fn test_parse_empty() {
        assert!(RtfParser::parse(b"").is_err());
    }

    /// `\'hh` escapes contribute their two characters to the hex payload.
    #[test]
    fn test_parse_hex_escape() {
        let parsed = RtfParser::parse(br"{\rtf1 {\objdata \'41\'42}}").unwrap();
        let payloads: Vec<&str> = parsed
            .destinations
            .iter()
            .filter(|dest| dest.name == "objdata")
            .map(|dest| dest.hex_data.as_str())
            .collect();
        assert_eq!(payloads.len(), 1);
        assert!(payloads[0].contains("41"));
        assert!(payloads[0].contains("42"));
    }

    /// Word, numeric parameter and the delimiting space are all consumed:
    /// four letters + three digits + one space = index 8.
    #[test]
    fn test_parse_control_word() {
        let parsed = RtfParser::parse_control_word(b"bold123 text", 0);
        assert_eq!(parsed, ("bold".to_string(), Some(123), 8));
    }

    /// Without a parameter only the word and the delimiting space are
    /// consumed: three letters + one space = index 4.
    #[test]
    fn test_parse_control_word_no_param() {
        let parsed = RtfParser::parse_control_word(b"par some text", 0);
        assert_eq!(parsed, ("par".to_string(), None, 4));
    }

    /// `\fldinst` is recognised as a destination keyword.
    #[test]
    fn test_fldinst_destination() {
        let parsed = RtfParser::parse(br"{\rtf1 {\fldinst HYPERLINK}}").unwrap();
        let fldinst_count = parsed
            .destinations
            .iter()
            .filter(|dest| dest.name == "fldinst")
            .count();
        assert_eq!(fldinst_count, 1);
    }
}