malwaredb_types/doc/
rtf.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use crate::utils::find_subsequence;
4use crate::SpecimenFile;
5
6use std::fmt::{Display, Formatter};
7
8use crate::doc::DocumentFile;
9use anyhow::{bail, Result};
10use chrono::{DateTime, Utc};
11use tracing::instrument;
12
/// Leading bytes of an RTF document: `{\rt`
const RTF_MAGIC: [u8; 4] = *b"{\\rt";

// This is added by LibreOffice. If that's the only program which uses it, maybe we won't bother with it.
/// Start of the title entry inside the info group: `{\info{\title`
const TITLE_HEADER: [u8; 13] = *b"{\\info{\\title";

/// Control word `\ansi`
const ANSI_CHARSET: [u8; 5] = *b"\\ansi";
/// Control word `\mac`
const MAC_CHARSET: [u8; 4] = *b"\\mac";
/// Control word `\pc` (note: this is a prefix of `\pca`)
const PC_CHARSET: [u8; 3] = *b"\\pc";
/// Control word `\pca`
const PCA_CHARSET: [u8; 4] = *b"\\pca";
24
/// Character set declared in the header of an RTF document
///
/// See <https://docs.fileformat.com/word-processing/rtf/> and
/// <https://www.biblioscape.com/rtf15_spec.htm>
#[derive(Debug, Clone, Copy)]
pub enum CharacterSet {
    /// The normal (default) character set, declared with `\ansi`
    Ansi,

    /// Apple Macintosh character set from before Mac OS X (macOS 10.x), declared with `\mac`
    MacAnsi,

    /// The normal character set for MS-DOS (IBM PC code page 437), declared with `\pc`
    Pc,

    /// MS-DOS Multilingual (IBM PC code page 850), declared with `\pca`
    Pca,
}
42
43/// A struct representing Rich Text Format (RTF) files
44#[derive(Clone, Debug)]
45pub struct Rtf<'a> {
46    /// The character set (encoding) used by the document
47    pub character_set: Option<CharacterSet>,
48
49    /// Document title
50    pub title: Option<String>,
51
52    /// The array containing the raw bytes used to parse this document
53    pub contents: &'a [u8],
54}
55
56impl<'a> Rtf<'a> {
57    /// RTF parsed from a byte sequence
58    ///
59    /// # Errors
60    ///
61    /// Returns an error if the parser fails.
62    #[instrument(name = "RTF parser", skip(contents))]
63    pub fn from(contents: &'a [u8]) -> Result<Self> {
64        let doc_front = &contents[..40.min(contents.len())];
65
66        if !doc_front.starts_with(&RTF_MAGIC) {
67            bail!("not an RTF");
68        }
69
70        let mut character_set = None;
71        if find_subsequence(doc_front, &ANSI_CHARSET).is_some() {
72            character_set = Some(CharacterSet::Ansi);
73        } else if find_subsequence(doc_front, &MAC_CHARSET).is_some() {
74            character_set = Some(CharacterSet::MacAnsi);
75        } else if find_subsequence(doc_front, &PCA_CHARSET).is_some() {
76            character_set = Some(CharacterSet::Pca);
77        } else if find_subsequence(doc_front, &PC_CHARSET).is_some() {
78            character_set = Some(CharacterSet::Pc);
79        }
80
81        let mut title = None;
82        if contents.len() < 100_000 {
83            title = if let Some(start_index) = find_subsequence(contents, &TITLE_HEADER) {
84                let start_index = start_index + TITLE_HEADER.len() + 1; // The index has the start of the title header, so we need to pass it.
85                let mut end_index = start_index;
86                while contents[end_index] != 0x7D
87                    && end_index < start_index + 200
88                    && end_index < contents.len()
89                {
90                    end_index += 1;
91                }
92                if end_index < contents.len() && end_index > start_index + 1 {
93                    String::from_utf8(contents[start_index..end_index].to_vec()).ok()
94                } else {
95                    None // Failed to find the title
96                }
97            } else {
98                None // No title found
99            };
100        }
101
102        Ok(Self {
103            character_set,
104            title,
105            contents,
106        })
107    }
108}
109
110// TODO: Better RTF parsing
111impl DocumentFile for Rtf<'_> {
112    fn pages(&self) -> u32 {
113        0
114    }
115
116    fn author(&self) -> Option<String> {
117        None
118    }
119
120    fn title(&self) -> Option<String> {
121        None
122    }
123
124    fn has_javascript(&self) -> bool {
125        false // RTFs don't contained embedded javascript
126    }
127
128    fn has_form(&self) -> bool {
129        false
130    }
131
132    fn creation_time(&self) -> Option<DateTime<Utc>> {
133        None
134    }
135
136    fn modification_time(&self) -> Option<DateTime<Utc>> {
137        None
138    }
139}
140
141impl SpecimenFile for Rtf<'_> {
142    const MAGIC: &'static [&'static [u8]] = &[&RTF_MAGIC];
143
144    fn type_name(&self) -> &'static str {
145        "RTF"
146    }
147}
148
149impl Display for Rtf<'_> {
150    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
151        write!(f, "RTF")?;
152        if let Some(title) = &self.title {
153            write!(f, ", Title: \"{title}\"")?;
154        }
155        if let Some(charset) = &self.character_set {
156            write!(f, ", Character Set: {charset:?}")?;
157        }
158        write!(f, ", Bytes: {}", self.contents.len())
159    }
160}
161
#[cfg(test)]
mod tests {
    use super::*;

    /// A document with a title in its info group parses and yields that title.
    #[test]
    fn rtf() {
        const BYTES: &[u8] = include_bytes!("../../testdata/rtf/hello.rtf");

        let rtf = Rtf::from(BYTES).expect("hello.rtf should parse as RTF");
        println!("RTF: {rtf}");
        assert_eq!(rtf.title.as_deref(), Some("RTF Title"));
    }

    /// A document without a usable title parses and reports no title.
    #[test]
    fn empty() {
        const BYTES: &[u8] = include_bytes!("../../testdata/rtf/empty.rtf");

        let rtf = Rtf::from(BYTES).expect("empty.rtf should parse as RTF");
        assert!(rtf.title.is_none());
    }
}