1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
use crate::utils::find_subsequence;
use crate::SpecimenFile;

use std::fmt::{Display, Formatter};

use crate::doc::DocumentFile;
use anyhow::{bail, Result};
use chrono::{DateTime, Utc};

/// Leading bytes every RTF document must start with: `{\rt`.
const RTF_MAGIC: [u8; 4] = *b"{\\rt";

// Marker that introduces the document title: `{\info{\title`.
// LibreOffice writes this. If it turns out to be the only program that does,
// title extraction may not be worth keeping.
const TITLE_HEADER: [u8; 13] = *b"{\\info{\\title";

// Character-set control words looked for near the start of the document.
// Note that `\pc` is a prefix of `\pca`, so `\pca` has to be tested first.
const ANSI_CHARSET: [u8; 5] = *b"\\ansi";
const MAC_CHARSET: [u8; 4] = *b"\\mac";
const PC_CHARSET: [u8; 3] = *b"\\pc";
const PCA_CHARSET: [u8; 4] = *b"\\pca";

/// Character set declared in the header of an RTF document.
///
/// Each variant corresponds to one of the character-set control words of the
/// RTF specification:
///
/// - <https://docs.fileformat.com/word-processing/rtf/>
/// - <https://www.biblioscape.com/rtf15_spec.htm>
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum CharacterSet {
    /// The normal character set (`\ansi`)
    Ansi,

    /// The Apple Macintosh character set (`\mac`), i.e. pre-Mac OS X ANSI
    MacAnsi,

    /// The normal character set for MS-DOS (`\pc`, IBM PC code page 437)
    Pc,

    /// MS-DOS Multilingual (`\pca`, IBM PC code page 850)
    Pca,
}

/// A struct representing Rich Text Format (RTF) files
///
/// The struct borrows the document bytes for the lifetime `'a` rather than
/// copying them, so it cannot outlive the buffer it was parsed from.
#[derive(Clone, Debug)]
pub struct Rtf<'a> {
    /// The character set (encoding) used by the document, or `None` when no
    /// known character-set control word was found
    pub character_set: Option<CharacterSet>,

    /// Document title, or `None` when no title could be extracted
    pub title: Option<String>,

    /// The array containing the raw bytes used to parse this document
    pub contents: &'a [u8],
}

impl<'a> Rtf<'a> {
    /// RTF parsed from a byte sequence
    ///
    /// Only a light-weight scan is performed:
    ///
    /// * the first 40 bytes must start with the RTF magic and are searched for
    ///   a character-set control word (`\ansi`, `\mac`, `\pca`, `\pc`);
    /// * documents smaller than 100000 bytes are additionally searched for a
    ///   `{\info{\title` group from which the title is extracted.
    ///
    /// # Errors
    ///
    /// Returns an error when `contents` does not start with the RTF magic
    /// bytes. A missing, truncated or non-UTF-8 title is not an error; the
    /// `title` field is simply `None`.
    pub fn from(contents: &'a [u8]) -> Result<Self> {
        // Number of leading bytes inspected for the magic and the character set.
        const HEADER_WINDOW: usize = 40;
        // Documents of this size or larger are not scanned for a title.
        const TITLE_SCAN_SIZE_LIMIT: usize = 100000;

        let doc_front = &contents[..HEADER_WINDOW.min(contents.len())];

        if !doc_front.starts_with(&RTF_MAGIC) {
            bail!("not an RTF");
        }

        // `\pca` must be tested before `\pc`, because the latter is a prefix
        // of the former and would otherwise always win.
        let character_set = if find_subsequence(doc_front, &ANSI_CHARSET).is_some() {
            Some(CharacterSet::Ansi)
        } else if find_subsequence(doc_front, &MAC_CHARSET).is_some() {
            Some(CharacterSet::MacAnsi)
        } else if find_subsequence(doc_front, &PCA_CHARSET).is_some() {
            Some(CharacterSet::Pca)
        } else if find_subsequence(doc_front, &PC_CHARSET).is_some() {
            Some(CharacterSet::Pc)
        } else {
            None
        };

        let title = if contents.len() < TITLE_SCAN_SIZE_LIMIT {
            Self::extract_title(contents)
        } else {
            None
        };

        Ok(Self {
            character_set,
            title,
            contents,
        })
    }

    /// Extracts the text following the first `{\info{\title` marker.
    ///
    /// Returns `None` when the marker is absent, when the document ends before
    /// the title is terminated, when the title is shorter than two bytes, or
    /// when the title bytes are not valid UTF-8. Titles longer than 200 bytes
    /// are cut off at 200 bytes.
    ///
    /// All slicing is bounds-checked, so a document that is truncated right
    /// after the marker (or inside the title) yields `None` instead of
    /// panicking.
    fn extract_title(contents: &[u8]) -> Option<String> {
        // Upper bound on the number of title bytes that are examined.
        const MAX_TITLE_LEN: usize = 200;
        // `}` closes the title group.
        const GROUP_END: u8 = 0x7D;

        let header_at = find_subsequence(contents, &TITLE_HEADER)?;
        // Skip the marker itself plus the single delimiter byte after it.
        let title_start = header_at + TITLE_HEADER.len() + 1;
        // `get` returns `None` if the document ends before the title begins.
        let tail = contents.get(title_start..)?;

        // Offset of the closing brace within the scan window, or the end of the
        // window (or of the data, whichever comes first) when there is none.
        let title_len = tail
            .iter()
            .take(MAX_TITLE_LEN)
            .position(|&byte| byte == GROUP_END)
            .unwrap_or_else(|| MAX_TITLE_LEN.min(tail.len()));

        // Running into the end of the data means the title was never terminated.
        // NOTE(review): the `<= 1` test also rejects one-byte titles, matching
        // the previous behaviour — confirm whether that is intended.
        if title_len >= tail.len() || title_len <= 1 {
            return None;
        }

        // A title that is not valid UTF-8 is dropped rather than reported.
        String::from_utf8(tail[..title_len].to_vec()).ok()
    }
}

// TODO: Better RTF parsing
/// Document metadata for RTF files.
///
/// Only the title is currently extracted while parsing; every other property
/// reports a fixed "unknown" value.
impl<'a> DocumentFile for Rtf<'a> {
    /// Page count is not parsed; always `0`.
    fn pages(&self) -> u32 {
        0
    }

    /// Author is not parsed; always `None`.
    fn author(&self) -> Option<String> {
        None
    }

    /// Returns the title found while parsing, if any.
    fn title(&self) -> Option<String> {
        // The trait hands out an owned `String`, so the stored title is cloned.
        self.title.clone()
    }

    fn has_javascript(&self) -> bool {
        false // RTFs don't contain embedded javascript
    }

    /// Forms are not detected; always `false`.
    fn has_form(&self) -> bool {
        false
    }

    /// Creation time is not parsed; always `None`.
    fn creation_time(&self) -> Option<DateTime<Utc>> {
        None
    }

    /// Modification time is not parsed; always `None`.
    fn modification_time(&self) -> Option<DateTime<Utc>> {
        None
    }
}

impl SpecimenFile for Rtf<'_> {
    /// The single byte signature identifying an RTF file.
    const MAGIC: &'static [&'static [u8]] = &[&RTF_MAGIC];

    /// Short name of this file type.
    fn type_name(&self) -> &'static str {
        "RTF"
    }
}

/// One-line summary: the literal `RTF`, then the title and character set when
/// they are known, then the size of the document in bytes.
impl Display for Rtf<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str("RTF")?;

        // Optional parts are omitted entirely when unknown.
        if let Some(title) = self.title.as_deref() {
            write!(f, ", Title: \"{title}\"")?;
        }
        if let Some(charset) = self.character_set {
            write!(f, ", Character Set: {charset:?}")?;
        }

        let byte_count = self.contents.len();
        write!(f, ", Bytes: {}", byte_count)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A document with a title group parses and exposes that title.
    #[test]
    fn rtf() {
        const BYTES: &[u8] = include_bytes!("../../testdata/rtf/hello.rtf");

        // `expect` reports the parse error itself if the fixture is rejected.
        let rtf = Rtf::from(BYTES).expect("hello.rtf should parse as RTF");
        println!("RTF: {rtf}");
        assert_eq!(rtf.title.as_deref(), Some("RTF Title"));
    }

    /// A document without a title group parses and reports no title.
    #[test]
    fn empty() {
        const BYTES: &[u8] = include_bytes!("../../testdata/rtf/empty.rtf");

        let rtf = Rtf::from(BYTES).expect("empty.rtf should parse as RTF");
        assert!(rtf.title.is_none());
    }
}