/// Dictionary pre-load for compressed RTF (MS-OXRTFCP): the decoder's 4 KiB
/// ring buffer starts out holding exactly these bytes, so streams may
/// back-reference them before any output has been produced.
const INIT_DICT: &[u8] = b"{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}\
{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript \\fdecor MS Sans SerifSymbolArial\
Times New RomanCourier{\\colortbl\\red0\\green0\\blue0\r\n\\par \
\\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";
/// Length of [`INIT_DICT`]; also the initial write position in the ring buffer.
const INIT_DICT_LEN: usize = 207;
// Compile-time guard: a typo in the literal above would silently corrupt output.
const _: () = assert!(INIT_DICT.len() == INIT_DICT_LEN);
/// Header magic `LZFu` (little-endian): payload is LZ-compressed.
const COMP_MAGIC: u32 = 0x75465A4C;
/// Header magic `MELA` (little-endian): payload is stored uncompressed.
const UNCOMP_MAGIC: u32 = 0x414C454D;

/// Decodes a compressed-RTF container (`PidTagRtfCompressed` layout).
///
/// The input starts with a 16-byte little-endian header:
/// `comp_size`, `raw_size`, `magic`, `crc`. `comp_size` counts every byte
/// after its own field, so the stream ends at offset `comp_size + 4`.
///
/// Returns `None` when the input is shorter than the header, when the declared
/// compressed size exceeds the input, or when the magic is unknown. The CRC
/// field is not verified. Output is never longer than `raw_size`; a truncated
/// stream yields whatever could be decoded up to that point.
pub(crate) fn decompress_rtf(data: &[u8]) -> Option<Vec<u8>> {
    const HEADER_LEN: usize = 16;
    const DICT_SIZE: usize = 4096;

    if data.len() < HEADER_LEN {
        return None;
    }
    let field = |at: usize| {
        u32::from_le_bytes([data[at], data[at + 1], data[at + 2], data[at + 3]])
    };
    let comp_size = field(0) as usize;
    let raw_size = field(4) as usize;
    let magic = field(8);

    // checked_add: `comp_size` is untrusted and `usize` may be 32 bits wide.
    let end = comp_size.checked_add(4)?;
    if end > data.len() {
        return None;
    }

    match magic {
        UNCOMP_MAGIC => {
            let body = &data[HEADER_LEN..];
            Some(body[..raw_size.min(body.len())].to_vec())
        }
        COMP_MAGIC => {
            // `end` can be smaller than the header for a bogus `comp_size`;
            // treat that as an empty stream.
            let stream = data.get(HEADER_LEN..end).unwrap_or(&[]);

            let mut dict = [0u8; DICT_SIZE];
            dict[..INIT_DICT_LEN].copy_from_slice(&INIT_DICT[..INIT_DICT_LEN]);
            let mut write_pos = INIT_DICT_LEN;

            // `raw_size` comes from the (untrusted) header. A two-byte
            // reference expands to at most 17 bytes, so the real output can
            // never exceed 9x the stream length; cap the allocation there.
            let capacity = raw_size.min(stream.len().saturating_mul(9));
            let mut output = Vec::with_capacity(capacity);
            let mut input = stream.iter().copied();

            'runs: while let Some(control) = input.next() {
                // Each control byte describes the next eight tokens, LSB first:
                // 0 = literal byte, 1 = two-byte dictionary reference.
                for bit in 0..8 {
                    if output.len() >= raw_size {
                        break 'runs;
                    }
                    if control & (1 << bit) == 0 {
                        let byte = match input.next() {
                            Some(byte) => byte,
                            None => break 'runs,
                        };
                        output.push(byte);
                        dict[write_pos] = byte;
                        write_pos = (write_pos + 1) % DICT_SIZE;
                        continue;
                    }

                    let (hi, lo) = match (input.next(), input.next()) {
                        (Some(hi), Some(lo)) => (usize::from(hi), usize::from(lo)),
                        _ => break 'runs,
                    };
                    // 12-bit dictionary offset followed by a 4-bit length (biased by 2).
                    let offset = (hi << 4) | (lo >> 4);
                    let length = (lo & 0x0F) + 2;
                    // A reference that points at the write position is the
                    // end-of-stream marker, not data.
                    if offset == write_pos {
                        break 'runs;
                    }
                    // Byte-at-a-time so that overlapping source/destination
                    // ranges replicate freshly written bytes.
                    for step in 0..length {
                        if output.len() >= raw_size {
                            break 'runs;
                        }
                        let byte = dict[(offset + step) % DICT_SIZE];
                        output.push(byte);
                        dict[write_pos] = byte;
                        write_pos = (write_pos + 1) % DICT_SIZE;
                    }
                }
            }
            Some(output)
        }
        _ => None,
    }
}
/// Recovers the HTML document encapsulated in an RTF body (`\fromhtml1`).
///
/// Returns `None` when the input is not valid UTF-8, does not contain the
/// `\fromhtml1` marker, or yields no HTML at all.
///
/// Handling rules:
/// * `{\*\htmltagN ...}` groups carry HTML markup; their text is copied.
/// * Text between `\htmlrtf` and `\htmlrtf0` is RTF-only rendering and dropped.
/// * `{\*...}` ignorable destinations other than `\htmltag`, and the header
///   tables (`\fonttbl`, `\colortbl`, `\stylesheet`, `\info`), are skipped whole
///   so that font names and the like do not end up in the HTML.
/// * `\\`, `\{`, `\}` produce the escaped character; `\'hh` produces the
///   character with code point `0xhh`; every other control word is dropped.
/// * Raw CR/LF outside `\htmltag` groups is RTF formatting and dropped.
pub(crate) fn extract_html_from_rtf(rtf: &[u8]) -> Option<String> {
    let rtf_str = std::str::from_utf8(rtf).ok()?;
    if !rtf_str.contains("\\fromhtml1") {
        return None;
    }

    let bytes = rtf_str.as_bytes();
    let len = bytes.len();
    let mut html = String::with_capacity(len);
    let mut in_htmlrtf = false;
    let mut i = 0;

    while i < len {
        let rest = &bytes[i..];

        // `\htmlrtf` opens a suppressed region, `\htmlrtf0` closes it.
        if rest.starts_with(b"\\htmlrtf") {
            i += 8;
            if bytes.get(i) == Some(&b'0') {
                in_htmlrtf = false;
                i += 1;
            } else {
                in_htmlrtf = true;
            }
            if bytes.get(i) == Some(&b' ') {
                i += 1;
            }
            continue;
        }

        // HTML markup groups are honoured even inside a suppressed region.
        if let Some(body_start) = rtf_htmltag_body_start(bytes, i) {
            i = rtf_copy_htmltag_body(rtf_str, body_start, &mut html);
            continue;
        }

        if in_htmlrtf {
            i += 1;
            continue;
        }

        match bytes[i] {
            b'{' if rtf_is_skipped_destination(rest) => i = rtf_skip_group(bytes, i),
            b'{' | b'}' | b'\r' | b'\n' => i += 1,
            b'\\' => {
                i = match bytes.get(i + 1) {
                    // Lone trailing backslash: nothing to emit.
                    None => i + 1,
                    // Backslash-newline carries no HTML text.
                    Some(b'\r') | Some(b'\n') => i + 2,
                    Some(_) => rtf_consume_escape(bytes, i, &mut html),
                };
            }
            _ => i = rtf_push_literal(&mut html, rtf_str, i),
        }
    }

    if html.is_empty() {
        None
    } else {
        Some(html)
    }
}

/// If an `{\*\htmltag` group opens at `at`, returns the index of its first
/// content byte (after the optional numeric parameter and one delimiter space).
fn rtf_htmltag_body_start(bytes: &[u8], at: usize) -> Option<usize> {
    const PREFIX: &[u8] = b"{\\*\\htmltag";
    if !bytes[at..].starts_with(PREFIX) {
        return None;
    }
    let mut i = at + PREFIX.len();
    // A following letter means this is a different, longer control word.
    if bytes.get(i).map_or(false, u8::is_ascii_alphabetic) {
        return None;
    }
    while i < bytes.len() && bytes[i].is_ascii_digit() {
        i += 1;
    }
    if bytes.get(i) == Some(&b' ') {
        i += 1;
    }
    Some(i)
}

/// Copies the content of an `\htmltag` group starting at `start` into `html`
/// and returns the index just past the group's closing brace (or the end of
/// input when the group is unterminated). Nested braces are tracked but not
/// emitted.
fn rtf_copy_htmltag_body(text: &str, start: usize, html: &mut String) -> usize {
    let bytes = text.as_bytes();
    let len = bytes.len();
    let mut depth = 1usize;
    let mut i = start;
    while i < len {
        match bytes[i] {
            b'{' => {
                depth += 1;
                i += 1;
            }
            b'}' => {
                depth -= 1;
                i += 1;
                if depth == 0 {
                    break;
                }
            }
            b'\\' if i + 1 < len => i = rtf_consume_escape(bytes, i, html),
            _ => i = rtf_push_literal(html, text, i),
        }
    }
    i
}

/// Handles the escape whose backslash sits at `at` (caller guarantees that
/// `at + 1` is in bounds) and returns the index of the next unread byte.
fn rtf_consume_escape(bytes: &[u8], at: usize, html: &mut String) -> usize {
    let next = at + 1;
    match bytes[next] {
        b'\\' | b'{' | b'}' => {
            html.push(bytes[next] as char);
            next + 1
        }
        b'\'' if next + 2 < bytes.len() => {
            // `\'hh`: two hex digits; a slice that is not valid text becomes '?'.
            let hex = std::str::from_utf8(&bytes[next + 1..next + 3]).unwrap_or("3f");
            if let Ok(value) = u8::from_str_radix(hex, 16) {
                html.push(value as char);
            }
            next + 3
        }
        _ => rtf_skip_control_word(bytes, next),
    }
}

/// Skips a control word body starting at `i` (the byte after the backslash):
/// letters, an optional numeric parameter, and one optional delimiter space.
fn rtf_skip_control_word(bytes: &[u8], mut i: usize) -> usize {
    let len = bytes.len();
    while i < len && bytes[i].is_ascii_alphabetic() {
        i += 1;
    }
    if i < len && (bytes[i] == b'-' || bytes[i].is_ascii_digit()) {
        i += 1;
        while i < len && bytes[i].is_ascii_digit() {
            i += 1;
        }
    }
    if i < len && bytes[i] == b' ' {
        i += 1;
    }
    i
}

/// Reports whether the group opening at the start of `group` (which begins
/// with `{`) is a destination whose content must not reach the HTML output.
fn rtf_is_skipped_destination(group: &[u8]) -> bool {
    const HEADER_TABLES: [&[u8]; 4] = [
        b"{\\fonttbl",
        b"{\\colortbl",
        b"{\\stylesheet",
        b"{\\info",
    ];
    // `{\*` marks a destination readers may ignore; `\htmltag` groups are
    // recognised before this check is reached.
    if group.starts_with(b"{\\*") {
        return true;
    }
    HEADER_TABLES.iter().any(|prefix| {
        group.starts_with(prefix)
            && !group
                .get(prefix.len())
                .map_or(false, u8::is_ascii_alphabetic)
    })
}

/// Skips the balanced group opening at `at` and returns the index just past
/// its closing brace (or the end of input when the group is unterminated).
fn rtf_skip_group(bytes: &[u8], at: usize) -> usize {
    let mut depth = 0usize;
    let mut i = at;
    while i < bytes.len() {
        match bytes[i] {
            // The byte after a backslash can be an escaped brace; never count it.
            b'\\' => {
                i += 2;
                continue;
            }
            b'{' => depth += 1,
            b'}' => {
                depth -= 1;
                if depth == 0 {
                    return i + 1;
                }
            }
            _ => {}
        }
        i += 1;
    }
    bytes.len()
}

/// Appends the character that starts at byte `i` and returns the index after
/// it. Multi-byte UTF-8 characters are kept intact; if `i` is not on a
/// character boundary (possible right after a `\'hh` escape that swallowed
/// part of a character) the single byte is emitted as its code point.
fn rtf_push_literal(html: &mut String, text: &str, i: usize) -> usize {
    match text.get(i..).and_then(|tail| tail.chars().next()) {
        Some(ch) => {
            html.push(ch);
            i + ch.len_utf8()
        }
        None => {
            html.push(text.as_bytes()[i] as char);
            i + 1
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Anything shorter than the 16-byte header must be rejected.
    #[test]
    fn test_decompress_too_short() {
        let truncated = [0u8; 10];
        assert!(decompress_rtf(&truncated).is_none());
    }

    /// A header whose magic is neither known value must be rejected.
    #[test]
    fn test_decompress_bad_magic() {
        let mut header = [0u8; 20];
        header[0] = 4; // comp_size
        header[8] = 0xFF; // first magic byte
        assert!(decompress_rtf(&header).is_none());
    }

    /// A stream flagged as uncompressed returns its payload verbatim.
    #[test]
    fn test_decompress_uncompressed() {
        let payload = b"hello world";
        let declared_comp_size = (12 + payload.len()) as u32;
        let stream = [
            &declared_comp_size.to_le_bytes()[..],
            &(payload.len() as u32).to_le_bytes()[..],
            &UNCOMP_MAGIC.to_le_bytes()[..],
            &0u32.to_le_bytes()[..],
            &payload[..],
        ]
        .concat();

        let decoded = decompress_rtf(&stream).unwrap();
        assert_eq!(decoded.as_slice(), &payload[..]);
    }

    /// Round-trips the compressed RTF body of the sample message, if it has one.
    #[test]
    fn test_decompress_real_rtf() {
        let outlook = crate::Outlook::from_path("data/test_email.msg").unwrap();
        if outlook.rtf_compressed.is_empty() {
            return;
        }

        let raw = hex::decode(&outlook.rtf_compressed).unwrap();
        let decompressed = decompress_rtf(&raw);
        assert!(decompressed.is_some());

        let rtf = decompressed.unwrap();
        let preview_len = 20.min(rtf.len());
        assert!(
            rtf.starts_with(b"{\\rtf"),
            "RTF should start with {{\\rtf, got: {:?}",
            &rtf[..preview_len]
        );
    }

    /// RTF without the `\fromhtml1` marker carries no encapsulated HTML.
    #[test]
    fn test_extract_html_no_fromhtml() {
        let plain_rtf = b"{\\rtf1 hello world}";
        assert!(extract_html_from_rtf(plain_rtf).is_none());
    }
}