malwaredb_types/doc/
rtf.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use crate::utils::find_subsequence;
4use crate::SpecimenFile;
5
6use std::fmt::{Display, Formatter};
7
8use crate::doc::DocumentFile;
9use anyhow::{bail, Result};
10use chrono::{DateTime, Utc};
11use tracing::instrument;
12
/// Bytes every RTF document starts with: `{\rt`
const RTF_MAGIC: [u8; 4] = *b"{\\rt";

// This is added by LibreOffice. If that's the only program which uses it, maybe we won't bother with it.
/// Byte sequence which introduces the document title: `{\info{\title`
const TITLE_HEADER: [u8; 13] = *b"{\\info{\\title";

/// Control word `\ansi`
const ANSI_CHARSET: [u8; 5] = *b"\\ansi";
/// Control word `\mac`
const MAC_CHARSET: [u8; 4] = *b"\\mac";
/// Control word `\pc`
const PC_CHARSET: [u8; 3] = *b"\\pc";
/// Control word `\pca`
const PCA_CHARSET: [u8; 4] = *b"\\pca";
24
/// Character set declared by an RTF document
///
/// <https://docs.fileformat.com/word-processing/rtf/>
/// <https://www.biblioscape.com/rtf15_spec.htm>
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum CharacterSet {
    /// The normal character set (`\ansi`)
    Ansi,

    /// Classic (pre-Mac OS X) Macintosh character set (`\mac`)
    MacAnsi,

    /// The normal character set for MS-DOS (`\pc`)
    Pc,

    /// MS-DOS Multilingual (`\pca`)
    Pca,
}
42
43/// A struct representing Rich Text Format (RTF) files
44#[derive(Clone, Debug)]
45pub struct Rtf<'a> {
46    /// The character set (encoding) used by the document
47    pub character_set: Option<CharacterSet>,
48
49    /// Document title
50    pub title: Option<String>,
51
52    /// The array containing the raw bytes used to parse this document
53    pub contents: &'a [u8],
54}
55
56impl<'a> Rtf<'a> {
57    /// RTF parsed from a byte sequence
58    #[instrument(name = "RTF parser", skip(contents))]
59    pub fn from(contents: &'a [u8]) -> Result<Self> {
60        let doc_front = &contents[..40.min(contents.len())];
61
62        if !doc_front.starts_with(&RTF_MAGIC) {
63            bail!("not an RTF");
64        }
65
66        let mut character_set = None;
67        if find_subsequence(doc_front, &ANSI_CHARSET).is_some() {
68            character_set = Some(CharacterSet::Ansi);
69        } else if find_subsequence(doc_front, &MAC_CHARSET).is_some() {
70            character_set = Some(CharacterSet::MacAnsi);
71        } else if find_subsequence(doc_front, &PCA_CHARSET).is_some() {
72            character_set = Some(CharacterSet::Pca);
73        } else if find_subsequence(doc_front, &PC_CHARSET).is_some() {
74            character_set = Some(CharacterSet::Pc);
75        }
76
77        let mut title = None;
78        if contents.len() < 100_000 {
79            title = if let Some(start_index) = find_subsequence(contents, &TITLE_HEADER) {
80                let start_index = start_index + TITLE_HEADER.len() + 1; // The index has the start of the title header, so we need to pass it.
81                let mut end_index = start_index;
82                while contents[end_index] != 0x7D
83                    && end_index < start_index + 200
84                    && end_index < contents.len()
85                {
86                    end_index += 1;
87                }
88                if end_index < contents.len() && end_index > start_index + 1 {
89                    String::from_utf8(contents[start_index..end_index].to_vec()).ok()
90                } else {
91                    None // Failed to find the title
92                }
93            } else {
94                None // No title found
95            };
96        }
97
98        Ok(Self {
99            character_set,
100            title,
101            contents,
102        })
103    }
104}
105
106// TODO: Better RTF parsing
107impl DocumentFile for Rtf<'_> {
108    fn pages(&self) -> u32 {
109        0
110    }
111
112    fn author(&self) -> Option<String> {
113        None
114    }
115
116    fn title(&self) -> Option<String> {
117        None
118    }
119
120    fn has_javascript(&self) -> bool {
121        false // RTFs don't contained embedded javascript
122    }
123
124    fn has_form(&self) -> bool {
125        false
126    }
127
128    fn creation_time(&self) -> Option<DateTime<Utc>> {
129        None
130    }
131
132    fn modification_time(&self) -> Option<DateTime<Utc>> {
133        None
134    }
135}
136
137impl SpecimenFile for Rtf<'_> {
138    const MAGIC: &'static [&'static [u8]] = &[&RTF_MAGIC];
139
140    fn type_name(&self) -> &'static str {
141        "RTF"
142    }
143}
144
145impl Display for Rtf<'_> {
146    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
147        write!(f, "RTF")?;
148        if let Some(title) = &self.title {
149            write!(f, ", Title: \"{title}\"")?;
150        }
151        if let Some(charset) = &self.character_set {
152            write!(f, ", Character Set: {charset:?}")?;
153        }
154        write!(f, ", Bytes: {}", self.contents.len())
155    }
156}
157
#[cfg(test)]
mod tests {
    use super::*;

    /// The sample document parses and yields its title.
    #[test]
    fn rtf() {
        const BYTES: &[u8] = include_bytes!("../../testdata/rtf/hello.rtf");

        let rtf = Rtf::from(BYTES).expect("hello.rtf should parse as RTF");
        println!("RTF: {rtf}");
        assert_eq!(rtf.title.as_deref(), Some("RTF Title"));
    }

    /// The empty sample document parses, but no title is reported for it.
    #[test]
    fn empty() {
        const BYTES: &[u8] = include_bytes!("../../testdata/rtf/empty.rtf");

        let rtf = Rtf::from(BYTES).expect("empty.rtf should parse as RTF");
        assert!(rtf.title.is_none());
    }

    /// Input which does not start with the RTF magic bytes is rejected.
    #[test]
    fn not_rtf() {
        assert!(Rtf::from(b"plain text").is_err());
        assert!(Rtf::from(&[]).is_err());
    }
}