rtf_grimoire/
tokenizer.rs

1// RTF document format tokenizer
2//
3// Written according to the RTF Format Specification 1.9.1, which carries
4// the following copyright notice:
5//
6//     Copyright (c) 2008 Microsoft Corporation.  All Rights reserved.
7//
8
9use crate::raw::{
10    control_bin_raw, control_symbol_raw, control_word_hexbyte_raw, control_word_raw, end_group_raw,
11    newline_raw, rtf_text_raw, start_group_raw,
12};
13use std;
14
15use nom::branch::alt;
16use nom::combinator::map;
17use nom::multi::many0;
18use nom::Finish;
19use nom::IResult;
20
/// Error type returned by [`parse_finished`]: a thin wrapper around the
/// underlying `nom` parse error that implements `std::error::Error`.
#[derive(Debug)]
pub struct ParseError<I> {
    // The raw nom error; exposed to callers through `Error::source()`.
    inner: nom::error::Error<I>,
}
25
26impl<I> std::convert::From<nom::error::Error<I>> for ParseError<I> {
27    fn from(error: nom::error::Error<I>) -> Self {
28        Self { inner: error }
29    }
30}
31
32impl<I> std::fmt::Display for ParseError<I>
33where
34    I: std::fmt::Debug,
35{
36    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
37        write!(f, "Parser Error: {:?}", self.inner)
38    }
39}
40
41impl<I> std::error::Error for ParseError<I>
42where
43    I: std::fmt::Debug + std::fmt::Display + 'static,
44{
45    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
46        Some(&self.inner)
47    }
48}
49
/// A single lexical token from an RTF byte stream.
#[derive(PartialEq, Eq)]
pub enum Token {
    /// A backslash followed by a single symbol character, e.g. `\*` or `\~`.
    ControlSymbol(char),
    /// A backslash followed by an alphabetic keyword and an optional signed
    /// integer argument, e.g. `\par` or `\b0`.
    ControlWord {
        name: String,
        arg: Option<i32>,
    },
    /// The raw payload of a `\binN ` control, exactly N bytes long.
    ControlBin(Vec<u8>),
    /// Text is not str because it can be in any of various encodings -
    /// it's up to the processor to identify any encoding information in
    /// the stream, and do any encoding conversion desired
    Text(Vec<u8>),
    /// A literal `{`, opening a group.
    StartGroup,
    /// A literal `}`, closing a group.
    EndGroup,
    /// stores the actual bytes of newline found
    Newline(Vec<u8>),
}
67
68impl std::fmt::Debug for Token {
69    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
70        match self {
71            Token::ControlSymbol(c) => write!(f, "Token::ControlSymbol({c})"),
72            Token::ControlWord { name, arg } => write!(
73                f,
74                "Token::ControlWord({}{})",
75                name,
76                arg.map(|i| format!(":{i}")).unwrap_or_default()
77            ),
78            Token::ControlBin(data) => {
79                write!(f, "Token::ControlBin(")?;
80                for byte in data {
81                    write!(f, " {byte:02x?}")?;
82                }
83                write!(f, ")")
84            }
85            Token::Text(data) => {
86                write!(f, "Token::Text(")?;
87                for byte in data {
88                    write!(f, " {byte:02x?}")?;
89                }
90                write!(f, ")")
91            }
92            Token::StartGroup => write!(f, "Token::StartGroup"),
93            Token::EndGroup => write!(f, "Token::EndGroup"),
94            Token::Newline(data) => {
95                write!(f, "Token::Newline(")?;
96                for byte in data {
97                    write!(f, " {byte:02x?}")?;
98                }
99                write!(f, ")")
100            }
101        }
102    }
103}
104
105impl Token {
106    pub fn to_rtf(&self) -> Vec<u8> {
107        match self {
108            Token::ControlSymbol(c) => format!("\\{c}").as_bytes().to_vec(),
109            Token::ControlWord { name, arg } => match arg {
110                Some(num) => format!("\\{name}{num}").as_bytes().to_vec(),
111                None => format!("\\{name}").as_bytes().to_vec(),
112            },
113            Token::ControlBin(data) => {
114                let mut rtf: Vec<u8> = Vec::with_capacity(12 + data.len());
115                rtf.extend_from_slice(format!("\\bin{} ", data.len()).as_bytes());
116                rtf.extend_from_slice(data);
117                rtf
118            }
119            Token::Text(data) => data.to_vec(),
120            Token::StartGroup => b"{".to_vec(),
121            Token::EndGroup => b"}".to_vec(),
122            Token::Newline(data) => data.to_vec(),
123        }
124    }
125
126    /// This function returns a control word delimiter if one is required, or an
127    /// empty string if none is required
128    ///
129    /// Control Word tokens must be delimited by a non-alphanumeric value, so
130    /// if the subsequent content could be alphanumeric, a space (' ') delimiter
131    /// must be inserted
132    pub fn token_delimiter_after(&self, next_token: &Token) -> &'static str {
133        if let Token::ControlWord { .. } = self {
134            // TODO: actually check the content of Text to see if a space is needed
135            // it's safe to be lazy here, but less efficient
136            if let Token::Text(_) = next_token {
137                return " ";
138            }
139        }
140        ""
141    }
142
143    /// This function returns a control word delimiter if one is required, or an
144    /// empty string if none is required
145    ///
146    /// Control Word tokens must be delimited by a non-alphanumeric value, so
147    /// if the subsequent content could be alphanumeric, a space (' ') delimiter
148    /// must be inserted
149    pub fn token_delimiter_before(&self, prev_token: &Token) -> &'static str {
150        prev_token.token_delimiter_after(self)
151    }
152
153    pub fn get_name(&self) -> Option<String> {
154        if let Token::ControlWord { ref name, .. } = self {
155            Some(name.clone())
156        } else {
157            None
158        }
159    }
160
161    pub fn get_arg(&self) -> Option<i32> {
162        if let Token::ControlWord { ref arg, .. } = self {
163            *arg
164        } else {
165            None
166        }
167    }
168
169    pub fn get_symbol(&self) -> Option<char> {
170        if let Token::ControlSymbol(c) = self {
171            Some(*c)
172        } else {
173            None
174        }
175    }
176
177    pub fn get_bin(&self) -> Option<&[u8]> {
178        if let Token::ControlBin(data) = self {
179            Some(data.as_slice())
180        } else {
181            None
182        }
183    }
184
185    pub fn get_text(&self) -> Option<&[u8]> {
186        if let Token::Text(data) = self {
187            Some(data.as_slice())
188        } else {
189            None
190        }
191    }
192}
193
// Ordering here is important. Plain text is all content that isn't something else:
// If the next unparsed character is anything other than an opening brace ({), closing brace (}),
// backslash (\), or a CRLF (carriage return/line feed), the reader assumes that the character is
// plain text and writes the character to the current destination using the current formatting
// properties.  Finally, a control hexbyte is a special case of a control symbol, but needs to be
// handled specially, so hexbyte should be tested for before control symbols.
//
// See section "Conventions of an RTF Reader" in the RTF specification.
/// Read a single [`Token`] from the front of `input`.
pub fn read_token(input: &[u8]) -> IResult<&[u8], Token> {
    // NOTE: the order of these alternatives is load-bearing; see the
    // comment above before rearranging.
    alt((
        read_control_hexbyte,
        read_control_symbol,
        read_control_bin,
        read_control_word,
        read_start_group,
        read_end_group,
        read_newline,
        read_rtf_text,
    ))(input)
}
214
215pub fn read_control_hexbyte(input: &[u8]) -> IResult<&[u8], Token> {
216    map(control_word_hexbyte_raw, |(name, arg)| Token::ControlWord {
217        name: String::from(name),
218        arg,
219    })(input)
220}
221
222pub fn read_control_symbol(input: &[u8]) -> IResult<&[u8], Token> {
223    map(control_symbol_raw, Token::ControlSymbol)(input)
224}
225
226pub fn read_control_word(input: &[u8]) -> IResult<&[u8], Token> {
227    map(control_word_raw, |(name, arg)| Token::ControlWord {
228        name: String::from(name),
229        arg,
230    })(input)
231}
232
233pub fn read_control_bin(input: &[u8]) -> IResult<&[u8], Token> {
234    map(control_bin_raw, |bytes| Token::ControlBin(bytes.to_vec()))(input)
235}
236
237pub fn read_newline(input: &[u8]) -> IResult<&[u8], Token> {
238    map(newline_raw, |bytes| Token::Newline(bytes.to_vec()))(input)
239}
240
241pub fn read_start_group(input: &[u8]) -> IResult<&[u8], Token> {
242    map(start_group_raw, |_| Token::StartGroup)(input)
243}
244
245pub fn read_end_group(input: &[u8]) -> IResult<&[u8], Token> {
246    map(end_group_raw, |_| Token::EndGroup)(input)
247}
248
249pub fn read_rtf_text(input: &[u8]) -> IResult<&[u8], Token> {
250    map(rtf_text_raw, |text_bytes| Token::Text(text_bytes.to_vec()))(input)
251}
252
253pub fn read_token_stream(input: &[u8]) -> IResult<&[u8], Vec<Token>> {
254    many0(read_token)(input)
255}
256
/// Tokenize an RTF document. Returns any unparsed trailing input (empty on
/// a fully consumed document) alongside the token stream.
pub fn parse(bytes: &[u8]) -> IResult<&[u8], Vec<Token>> {
    read_token_stream(bytes)
}
260
261pub fn parse_finished(bytes: &[u8]) -> Result<Vec<Token>, ParseError<&[u8]>> {
262    parse(bytes)
263        .finish()
264        .map(|(_, o)| o)
265        .map_err(ParseError::from)
266}
267
#[cfg(test)]
mod tests {
    use super::*;

    // Control symbols: each backslash + symbol pair becomes one token.
    #[test]
    fn test_control_symbol_tokens() {
        let syms_str = br#"\*\.\+\~"#;
        let valid_syms = vec![
            Token::ControlSymbol('*'),
            Token::ControlSymbol('.'),
            Token::ControlSymbol('+'),
            Token::ControlSymbol('~'),
        ];
        let syms_after_parse: &[u8] = b"";
        let syms = read_token_stream(syms_str);
        assert_eq!(syms, Ok((syms_after_parse, valid_syms)));
    }

    // Control words with no argument, zero, negative, and large arguments.
    #[test]
    fn test_control_word_tokens() {
        let words_str = br#"\par\b0\b\uncle\foo-5\applepi314159"#;
        let valid_words = vec![
            Token::ControlWord {
                name: "par".to_string(),
                arg: None,
            },
            Token::ControlWord {
                name: "b".to_string(),
                arg: Some(0),
            },
            Token::ControlWord {
                name: "b".to_string(),
                arg: None,
            },
            Token::ControlWord {
                name: "uncle".to_string(),
                arg: None,
            },
            Token::ControlWord {
                name: "foo".to_string(),
                arg: Some(-5),
            },
            Token::ControlWord {
                name: "applepi".to_string(),
                arg: Some(314159),
            },
        ];
        let words_after_parse: &[u8] = b"";
        let words = read_token_stream(words_str);
        assert_eq!(words, Ok((words_after_parse, valid_words)));
    }

    // \bin payloads: the byte count is taken literally, so braces,
    // backslashes, spaces, and control bytes inside the payload are data.
    #[test]
    fn test_control_bin_tokens() {
        let bins_str = b"\\bin5 ABC{}\\bin1 {\\bin0 \\bin0\\bin1  \\bin1\x01\\bin1 \x02";
        let valid_bins = vec![
            Token::ControlBin(b"ABC{}".to_vec()),
            Token::ControlBin(b"{".to_vec()),
            Token::ControlBin(b"".to_vec()),
            Token::ControlBin(b"".to_vec()),
            Token::ControlBin(b" ".to_vec()),
            Token::ControlBin(b"\x01".to_vec()),
            Token::ControlBin(b"\x02".to_vec()),
        ];
        let bins_after_parse: &[u8] = b"";
        let bins = read_token_stream(bins_str);
        assert_eq!(bins, Ok((bins_after_parse, valid_bins)));
    }

    // Mixed stream of symbols, words, and \bin payloads interleaved.
    #[test]
    fn test_control() {
        let controls_str = b"\\*\\bin5 ABC{}\\b\\bin1 {\\bin0 \\b0\\bin0\\bin1  \\supercalifragilistic31415\\bin1\x01\\bin1 \x02";
        let valid_controls = vec![
            Token::ControlSymbol('*'),
            Token::ControlBin(b"ABC{}".to_vec()),
            Token::ControlWord {
                name: "b".to_string(),
                arg: None,
            },
            Token::ControlBin(b"{".to_vec()),
            Token::ControlBin(b"".to_vec()),
            Token::ControlWord {
                name: "b".to_string(),
                arg: Some(0),
            },
            Token::ControlBin(b"".to_vec()),
            Token::ControlBin(b" ".to_vec()),
            Token::ControlWord {
                name: "supercalifragilistic".to_string(),
                arg: Some(31415),
            },
            Token::ControlBin(b"\x01".to_vec()),
            Token::ControlBin(b"\x02".to_vec()),
        ];
        let controls_after_parse: &[u8] = b"";
        let controls = read_token_stream(controls_str);
        assert_eq!(controls, Ok((controls_after_parse, valid_controls)));
    }

    // Groups, text runs, newline preservation, and a trailing `\<CR>`
    // control symbol. Note the delimiter space after a control word is
    // consumed by the parser and not part of the Text token.
    #[test]
    fn test_group_tokens() {
        // Have to be very careful here to insert crlf, regardless of host platform
        let group_content_str = b"\\b Hello World \\b0 \\par\r\nThis is a test {\\*\\nothing}\\\r";
        let valid_group_content = vec![
            Token::ControlWord {
                name: "b".to_string(),
                arg: None,
            },
            Token::Text(b"Hello World ".to_vec()),
            Token::ControlWord {
                name: "b".to_string(),
                arg: Some(0),
            },
            Token::ControlWord {
                name: "par".to_string(),
                arg: None,
            },
            Token::Newline(vec![0x0d, 0x0a]),
            Token::Text(b"This is a test ".to_vec()),
            Token::StartGroup,
            Token::ControlSymbol('*'),
            Token::ControlWord {
                name: "nothing".to_string(),
                arg: None,
            },
            Token::EndGroup,
            Token::ControlSymbol(0x0d.into()),
        ];
        let group_content_after_parse: &[u8] = b"";
        let group_content = read_token_stream(group_content_str);
        assert_eq!(
            group_content,
            Ok((group_content_after_parse, valid_group_content))
        );
    }

    // End-to-end check against a sample document: every byte must tokenize.
    #[test]
    fn test_sample_doc() {
        let test_bytes = include_bytes!("../tests/sample.rtf");
        parse(test_bytes).unwrap();
        let (unparsed, _) = read_token_stream(test_bytes).unwrap();
        assert_eq!(
            unparsed.len(),
            0,
            "Unparsed data: {} bytes (first <=5 bytes: {:02X?})",
            unparsed.len(),
            &unparsed[0..std::cmp::min(5, unparsed.len())]
        );
    }

    // The spec doc is interesting because it has unmatched "{}" groups
    #[test]
    fn test_spec_doc() {
        let test_bytes = include_bytes!("../tests/RTF-Spec-1.7.rtf");
        parse(test_bytes).unwrap();
        let (unparsed, _) = read_token_stream(test_bytes).unwrap();
        assert_eq!(
            unparsed.len(),
            0,
            "Unparsed data: {} bytes (first <=5 bytes: {:02X?})",
            unparsed.len(),
            &unparsed[0..std::cmp::min(5, unparsed.len())]
        );
    }
}