//! Rich Text Format (RTF) detection: magic-byte check plus best-effort character set and title extraction.
use crate::utils::find_subsequence;
use crate::TypeMagic;

use std::fmt::{Display, Formatter};

use anyhow::{bail, Result};

/// Leading bytes of every RTF document: `{\rt`.
const RTF_MAGIC: [u8; 4] = *br"{\rt";

// This is added by LibreOffice. If that's the only program which uses it, maybe we won't bother with it.
/// Opening of the info group that carries the document title: `{\info{\title`.
const TITLE_HEADER: [u8; 13] = *br"{\info{\title";

/// Control word `\ansi`.
const ANSI_CHARSET: [u8; 5] = *br"\ansi";
/// Control word `\mac`.
const MAC_CHARSET: [u8; 4] = *br"\mac";
/// Control word `\pc` (note: a prefix of `\pca`).
const PC_CHARSET: [u8; 3] = *br"\pc";
/// Control word `\pca`.
const PCA_CHARSET: [u8; 4] = *br"\pca";

// https://docs.fileformat.com/word-processing/rtf/
// https://www.biblioscape.com/rtf15_spec.htm
/// Character set declared by an RTF document header.
///
/// Each variant corresponds to one of the character-set control words that the
/// RTF specification allows right after the `\rtf` version keyword.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum CharacterSet {
    /// `\ansi`: the ANSI character set, the default for RTF
    Ansi,

    /// `\mac`: the Apple Macintosh character set
    MacAnsi,

    /// `\pc`: IBM PC code page 437, the usual MS-DOS character set
    Pc,

    /// `\pca`: IBM PC code page 850 (MS-DOS Multilingual)
    Pca,
}

/// A struct representing Rich Text Format (RTF) files
///
/// The document bytes are borrowed for the lifetime `'a`; only the optional
/// metadata fields are owned by this struct.
#[derive(Clone, Debug)]
pub struct Rtf<'a> {
    /// The character set (encoding) used by the document, or `None` when no
    /// character-set control word was recognised
    pub character_set: Option<CharacterSet>,

    /// Document title, or `None` when no title could be extracted
    pub title: Option<String>,

    /// The array containing the raw bytes used to parse this document
    pub contents: &'a [u8],
}

impl<'a> Rtf<'a> {
    /// Parses the leading metadata of the RTF document held in `contents`.
    ///
    /// The first 40 bytes (fewer for short inputs) must begin with the RTF
    /// magic bytes and are also searched for a character-set control word.
    /// For inputs shorter than 100000 bytes the whole buffer is additionally
    /// searched for a `{\info{\title` group to recover the document title.
    ///
    /// Malformed or truncated title groups never cause a panic; they simply
    /// yield `title: None`.
    ///
    /// # Errors
    ///
    /// Returns an error if `contents` does not start with the RTF magic bytes.
    pub fn from(contents: &'a [u8]) -> Result<Self> {
        let doc_front = &contents[..40.min(contents.len())];

        if !doc_front.starts_with(&RTF_MAGIC) {
            bail!("not an RTF");
        }

        // `\pca` has to be tested before `\pc`, because `\pc` is a prefix of it.
        let character_set = if find_subsequence(doc_front, &ANSI_CHARSET).is_some() {
            Some(CharacterSet::Ansi)
        } else if find_subsequence(doc_front, &MAC_CHARSET).is_some() {
            Some(CharacterSet::MacAnsi)
        } else if find_subsequence(doc_front, &PCA_CHARSET).is_some() {
            Some(CharacterSet::Pca)
        } else if find_subsequence(doc_front, &PC_CHARSET).is_some() {
            Some(CharacterSet::Pc)
        } else {
            None
        };

        // The title search scans the whole buffer, so it is skipped for large inputs.
        let title = if contents.len() < 100000 {
            Self::find_title(contents)
        } else {
            None
        };

        Ok(Self {
            character_set,
            title,
            contents,
        })
    }

    /// Extracts the text of the `{\info{\title ...}` group, if present.
    ///
    /// Returns `None` when the header is missing, when the title text would
    /// start at or past the end of the buffer, when the input ends before the
    /// closing brace, when the title is shorter than two bytes, or when it is
    /// not valid UTF-8. A title with no closing brace within 200 bytes is cut
    /// off at 200 bytes.
    fn find_title(contents: &[u8]) -> Option<String> {
        /// Upper bound on the number of title bytes that are examined.
        const MAX_TITLE_LEN: usize = 200;
        /// `}` terminates the title group.
        const GROUP_END: u8 = 0x7D;

        let header = find_subsequence(contents, &TITLE_HEADER)?;
        // Step over the header and the single delimiter byte that follows it.
        let start = header + TITLE_HEADER.len() + 1;

        // `get` yields `None` instead of panicking when `start` lies beyond the buffer.
        let window_end = contents.len().min(start.saturating_add(MAX_TITLE_LEN));
        let window = contents.get(start..window_end)?;

        let title_len = window
            .iter()
            .position(|&byte| byte == GROUP_END)
            .unwrap_or(window.len());
        let end = start + title_len;

        // `end == contents.len()` means the input ran out before the closing brace.
        if end < contents.len() && end > start + 1 {
            String::from_utf8(contents[start..end].to_vec()).ok()
        } else {
            None
        }
    }
}

impl TypeMagic for Rtf<'_> {
    /// The single byte signature recognised for this type: the RTF magic bytes.
    const MAGIC: &'static [&'static [u8]] = &[&RTF_MAGIC];
}

impl Display for Rtf<'_> {
    /// Writes a one-line summary: the literal `RTF`, then the title and
    /// character set when known, then the size of the document in bytes.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str("RTF")?;

        if let Some(doc_title) = self.title.as_deref() {
            write!(f, ", Title: \"{doc_title}\"")?;
        }

        if let Some(encoding) = self.character_set {
            write!(f, ", Character Set: {encoding:?}")?;
        }

        let byte_count = self.contents.len();
        write!(f, ", Bytes: {byte_count}")
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A fixture carrying a title group parses and exposes that title.
    #[test]
    fn rtf() {
        const BYTES: &[u8] = include_bytes!("../../testdata/rtf/hello.rtf");

        let rtf = Rtf::from(BYTES).expect("hello.rtf should be recognised as RTF");
        println!("RTF: {rtf}");
        assert_eq!(rtf.title.as_deref(), Some("RTF Title"));
    }

    /// A fixture without a title group still parses, with no title reported.
    #[test]
    fn empty() {
        const BYTES: &[u8] = include_bytes!("../../testdata/rtf/empty.rtf");

        let rtf = Rtf::from(BYTES).expect("empty.rtf should be recognised as RTF");
        assert!(rtf.title.is_none());
    }
}