rtf_grimoire/
raw.rs

1// RTF document format tokenizer
2//
3// Written according to the RTF Format Specification 1.9.1, which carries
4// the following copyright notice:
5//
6//     Copyright (c) 2008 Microsoft Corporation.  All Rights reserved.
7//
8
9use nom;
10use std;
11
12use nom::branch::alt;
13use nom::bytes::complete::{tag, take, take_while_m_n};
14use nom::character::complete::{alpha1, char, crlf, digit1, none_of};
15use nom::character::is_hex_digit;
16use nom::combinator::{map, map_res, opt, recognize};
17use nom::multi::many1;
18use nom::sequence::{pair, preceded, tuple};
19use nom::IResult;
20
// Helper function for converting a digit string plus an optional sign into a signed int.
//
// `s` is the magnitude and `sign` is `None`, `Some("+")` or `Some("-")`.
//
// Returns the standard library's `ParseIntError` when the text is not a valid `i32`.
//
// Panics if `sign` is any other string and `s` itself parses successfully.
#[allow(dead_code)]
fn str_to_int(s: &str, sign: Option<&str>) -> Result<i32, std::num::ParseIntError> {
    // A negative number must be parsed together with its sign: the magnitude of i32::MIN
    // (2147483648) does not fit in an i32, so parsing it first and negating afterwards
    // would wrongly reject the smallest representable value.
    let only_digits = !s.is_empty() && s.bytes().all(|b| b.is_ascii_digit());
    if sign == Some("-") && only_digits {
        return format!("-{s}").parse::<i32>();
    }

    // Every other combination keeps the parse-then-apply-sign order, so an unparsable `s`
    // is reported as an error before the sign is even looked at.
    s.parse::<i32>().map(|magnitude| match sign {
        None | Some("+") => magnitude,
        Some("-") => -magnitude,
        Some(other) => panic!("Unsupported integer sign char: {other}"),
    })
}
32
// Helper function for converting a string of hexadecimal digits into the u8 it denotes.
//
// Returns the standard library's `ParseIntError` when `s` is empty, contains a character
// that is not a hex digit, or denotes a value that does not fit in a u8.
#[allow(dead_code)]
fn hex_str_to_int(s: &str) -> Result<u8, std::num::ParseIntError> {
    const HEX_RADIX: u32 = 16;
    let byte = u8::from_str_radix(s, HEX_RADIX)?;
    Ok(byte)
}
38
39// Helper function for parsing signed integers
40pub fn signed_int_raw(input: &[u8]) -> IResult<&[u8], (Option<&str>, &str)> {
41    pair(
42        opt(map_res(tag("-"), std::str::from_utf8)),
43        map_res(digit1, std::str::from_utf8),
44    )(input)
45}
46
47// Helper function for parsing hexadecimal bytes
48pub fn hexbyte_raw(input: &[u8]) -> IResult<&[u8], &str> {
49    map_res(take_while_m_n(2, 2, is_hex_digit), std::str::from_utf8)(input)
50}
51
52pub fn hexbyte(input: &[u8]) -> IResult<&[u8], u8> {
53    map_res(hexbyte_raw, hex_str_to_int)(input)
54}
55
56pub fn signed_int(input: &[u8]) -> IResult<&[u8], i32> {
57    map_res(signed_int_raw, |(sign, value)| str_to_int(value, sign))(input)
58}
59
60pub fn control_symbol_raw(input: &[u8]) -> IResult<&[u8], std::primitive::char> {
61    preceded(
62        tag("\\"),
63        none_of("'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"),
64    )(input)
65}
66
67pub fn control_word_raw(input: &[u8]) -> IResult<&[u8], (&str, Option<i32>)> {
68    let (input, (_, name, arg, _)) = tuple((
69        tag("\\"),
70        map_res(alpha1, std::str::from_utf8), // name
71        opt(signed_int),                      // arg
72        opt(tag(" ")),
73    ))(input)?;
74    Ok((input, (name, arg)))
75}
76
77// Sample.rtf's contents and rendering suggest that \'XX *doesn't* absorb a trailing space
78// like other control words do
79
80pub fn control_word_hexbyte_raw(input: &[u8]) -> IResult<&[u8], (&str, Option<i32>)> {
81    let (input, (_, name, arg)) = tuple((
82        tag("\\"),
83        map_res(tag("'"), std::str::from_utf8), // name
84        map(hexbyte, |x| Some(i32::from(x))),   // arg
85    ))(input)?;
86    Ok((input, (name, arg)))
87}
88
89pub fn control_bin_raw(input: &[u8]) -> IResult<&[u8], &[u8]> {
90    let (input, (_, len)) = tuple((
91        tag("\\bin"),
92        opt(map(pair(signed_int, opt(tag(" "))), |(s, _)| s)),
93    ))(input)?;
94    take(len.unwrap_or(0) as usize)(input)
95}
96
97// If the character is anything other than an opening brace ({), closing brace (}), backslash (\),
98// or a CRLF (carriage return/line feed), the reader assumes that the character is plain text and
99// writes the character to the current destination using the current formatting properties.
100// See section "Conventions of an RTF Reader"
101
102pub fn rtf_text_raw(input: &[u8]) -> IResult<&[u8], &[u8]> {
103    recognize(many1(none_of("\\}{\r\n")))(input)
104}
105
106pub fn start_group_raw(input: &[u8]) -> IResult<&[u8], std::primitive::char> {
107    char('{')(input)
108}
109
110pub fn end_group_raw(input: &[u8]) -> IResult<&[u8], std::primitive::char> {
111    char('}')(input)
112}
113
114// Oddly enough, the copy of the RTF spec we have has at least one carriage return without its
115// matching line feed, so it looks like we need to be more permissive about newlines than the spec
116// says.
117pub fn newline_raw(input: &[u8]) -> IResult<&[u8], &[u8]> {
118    alt((crlf, tag("\n"), tag("\r")))(input)
119}
120
// Unit tests for the raw tokenizer helpers.  Most tests are table driven: each row holds
// the input, the input expected to remain afterwards and the expected output or error kind.
#[cfg(test)]
mod tests {
    use super::*;
    use nom::error::ErrorKind;

    // The sign argument is applied to the parsed magnitude.
    #[test]
    fn test_str_to_int() {
        let test_data: [(&str, Option<&str>, i32); 3] = [
            ("1234", Some("+"), 1234),  // Explicit plus sign
            ("1234", Some("-"), -1234), // Minus sign
            ("1234", None, 1234),       // No sign
        ];
        for (input_str, input_sign, parsed_output) in test_data {
            assert_eq!(Ok(parsed_output), str_to_int(input_str, input_sign));
        }
    }

    // A sign other than "+" or "-" panics, so the assertion below is never expected to
    // complete; it only serves to make the call.
    #[test]
    #[should_panic(expected = "Unsupported integer sign char: p")]
    fn test_str_to_int_invalid_sign() {
        let input_str = "1234";
        let input_sign = Some("p");
        let parsed_output: i32 = 1234;
        assert_eq!(Ok(parsed_output), str_to_int(input_str, input_sign));
    }

    // Text that is not a decimal number is reported as an error, not a panic.
    #[test]
    fn test_str_to_int_invalid_str() {
        let input_str = "BF";
        let input_sign = Some("+");
        let err_debug_str = "Err(ParseIntError { kind: InvalidDigit })";
        assert_eq!(
            format!("{:?}", str_to_int(input_str, input_sign)),
            err_debug_str
        );
    }

    #[test]
    // We don't really need to test the conversion from hex to int, but let's check the
    // results / errors returned.
    fn test_hex_str_to_int() {
        let input = "0F";
        let parsed_output: u8 = 15;
        assert_eq!(Ok(parsed_output), hex_str_to_int(input));
    }

    // Each kind of invalid hex text maps to the matching standard library error.
    #[test]
    fn test_hex_str_to_int_invalid() {
        let test_data = [
            ("uj", "Err(ParseIntError { kind: InvalidDigit })"), // Not hex digits
            ("9D4B", "Err(ParseIntError { kind: PosOverflow })"), // Too large for a u8
            ("", "Err(ParseIntError { kind: Empty })"),          // Empty input
        ];
        for (input, err_debug_str) in test_data {
            assert_eq!(format!("{:?}", hex_str_to_int(input)), err_debug_str);
        }
    }

    // The sign and the digits are returned as text and parsing stops at the first
    // non-digit.
    #[test]
    fn test_signed_int_raw() {
        type TestData<'a> = (&'a [u8], &'a [u8], (Option<&'a str>, &'a str));

        let test_data: [TestData; 2] = [
            (b"123ab", b"ab", (None, "123")),       // Positive
            (b"-123ab", b"ab", (Some("-"), "123")), // Negative
        ];
        for (input, remaining_input, parsed_output) in test_data {
            assert_eq!(Ok((remaining_input, parsed_output)), signed_int_raw(input));
        }
    }

    // Input that does not start with a number fails on the digits and consumes nothing.
    #[test]
    fn test_signed_int_raw_invalid() {
        let test_data: [(&[u8], &[u8], ErrorKind); 2] = [
            (b"ab123", b"ab123", ErrorKind::Digit),   // Letters before the digits
            (b"ab-123", b"ab-123", ErrorKind::Digit), // Letters before the sign
        ];
        for (input, remaining_input, error_kind) in test_data {
            assert_eq!(
                Err(nom::Err::Error(nom::error::Error {
                    input: remaining_input,
                    code: error_kind
                })),
                signed_int_raw(input)
            );
        }
    }

    // Exactly two hex digits are taken, in either case, and returned as text.
    #[test]
    fn test_hexbyte_raw() {
        let test_data: [(&[u8], &[u8], &str); 2] = [
            (b"0F4E", b"4E", "0F"), // Uppercase
            (b"4e0f", b"0f", "4e"), // Lowercase
        ];
        for (input, remaining_input, parsed_output) in test_data {
            assert_eq!(Ok((remaining_input, parsed_output)), hexbyte_raw(input));
        }
    }

    // Both of the first two characters have to be hex digits.
    #[test]
    fn test_hexbyte_raw_invalid() {
        let test_data: [(&[u8], &[u8], ErrorKind); 2] = [
            (b"ge0f", b"ge0f", ErrorKind::TakeWhileMN), // First character invalid
            (b"eg0f", b"eg0f", ErrorKind::TakeWhileMN), // Second character invalid
        ];
        for (input, remaining_input, error_kind) in test_data {
            assert_eq!(
                Err(nom::Err::Error(nom::error::Error {
                    input: remaining_input,
                    code: error_kind
                })),
                hexbyte_raw(input)
            );
        }
    }

    // The two hex digits are converted into their numeric value (0x4E == 78).
    #[test]
    fn test_hexbyte() {
        let input: &[u8] = b"4E2B";
        let remaining_input: &[u8] = b"2B";
        let parsed_output: u8 = 78;
        assert_eq!(Ok((remaining_input, parsed_output)), hexbyte(input));
    }

    // A non-hex character within the first two characters is rejected.
    #[test]
    fn test_hexbyte_invalid() {
        let input: &[u8] = b"4G2B";
        let remaining_input: &[u8] = b"4G2B";
        let error_kind = ErrorKind::TakeWhileMN;
        assert_eq!(
            Err(nom::Err::Error(nom::error::Error {
                input: remaining_input,
                code: error_kind
            })),
            hexbyte(input)
        );
    }

    // Despite its name this covers both a positive and a negative number.
    #[test]
    fn test_signed_int_positive() {
        let test_data: [(&[u8], &[u8], i32); 2] = [
            (b"456a", b"a", 456),   // Positive
            (b"-920b", b"b", -920), // Negative
        ];
        for (input, remaining_input, parsed_output) in test_data {
            assert_eq!(Ok((remaining_input, parsed_output)), signed_int(input));
        }
    }

    // Conversion failures and missing digits are both reported without consuming input.
    #[test]
    fn test_signed_int_invalid() {
        let test_data: [(&[u8], &[u8], ErrorKind); 2] = [
            (b"2147483648b", b"2147483648b", ErrorKind::MapRes), // Overflow (i32::MAX + 1)
            (b"a456", b"a456", ErrorKind::Digit),                // First char invalid
        ];
        for (input, remaining_input, error_kind) in test_data {
            assert_eq!(
                Err(nom::Err::Error(nom::error::Error {
                    input: remaining_input,
                    code: error_kind
                })),
                signed_int(input)
            );
        }
    }

    // The backslash is dropped and only the symbol character is returned.
    #[test]
    fn test_control_symbol_raw_valid() {
        let input: &[u8] = br#"\^t"#;
        let remaining_input: &[u8] = b"t";
        let parsed_output = '^';
        assert_eq!(
            Ok((remaining_input, parsed_output)),
            control_symbol_raw(input)
        );
    }

    // A control symbol needs a leading backslash and must not be a letter.
    #[test]
    fn test_control_symbol_raw_invalid() {
        let test_data: [(&[u8], &[u8], ErrorKind); 2] = [
            (b"hx", b"hx", ErrorKind::Tag),        // No starting backslash
            (br#"\hx"#, b"hx", ErrorKind::NoneOf), // Excluded char (a letter)
        ];
        for (input, remaining_input, error_kind) in test_data {
            assert_eq!(
                Err(nom::Err::Error(nom::error::Error {
                    input: remaining_input,
                    code: error_kind
                })),
                control_symbol_raw(input)
            );
        }
    }

    // Every combination of argument (none / positive / negative) and trailing space;
    // the space is consumed whenever it is present.
    #[test]
    fn test_control_word_raw_valid() {
        type TestData<'a> = (&'a [u8], &'a [u8], (&'a str, Option<i32>));

        let test_data: [TestData; 6] = [
            (br#"\tag\tag67"#, br#"\tag67"#, ("tag", None)), // No int, no space
            (br#"\tag \tag67"#, br#"\tag67"#, ("tag", None)), // No int, optional space
            (br#"\tag45\tag67"#, br#"\tag67"#, ("tag", Some(45))), // Positive int, no space
            (br#"\tag45 \tag67"#, br#"\tag67"#, ("tag", Some(45))), // Positive int, optional space
            (br#"\tag-45\tag67"#, br#"\tag67"#, ("tag", Some(-45))), // Negative int, no space
            (br#"\tag-45 \tag67"#, br#"\tag67"#, ("tag", Some(-45))), // Negative int, optional space
        ];
        for (input, remaining_input, parsed_output) in test_data {
            assert_eq!(
                Ok((remaining_input, parsed_output)),
                control_word_raw(input)
            );
        }
    }

    // A control word needs a leading backslash followed by at least one letter.
    #[test]
    fn test_control_word_raw_invalid() {
        let test_data: [(&[u8], &[u8], ErrorKind); 2] = [
            (br#"dfg-45 \tag67"#, br#"dfg-45 \tag67"#, ErrorKind::Tag), // No backslash
            (br#"\*#~-45 \tag67"#, br#"*#~-45 \tag67"#, ErrorKind::Alpha), // Invalid chars in control word
        ];
        for (input, remaining_input, error_kind) in test_data {
            assert_eq!(
                Err(nom::Err::Error(nom::error::Error {
                    input: remaining_input,
                    code: error_kind
                })),
                control_word_raw(input)
            );
        }
    }

    // The escape is reported with "'" as its name and the decoded byte (0x9F == 159) as
    // its argument.
    #[test]
    fn test_control_word_hexbyte_raw() {
        let input: &[u8] = br#"\'9F4E"#;
        let remaining_input: &[u8] = b"4E";
        let parsed_output = ("'", Some(159i32));
        assert_eq!(
            Ok((remaining_input, parsed_output)),
            control_word_hexbyte_raw(input)
        );
    }

    // Each of the three parts (backslash, apostrophe, hex digits) is mandatory; the error
    // points at the first part that is missing.
    #[test]
    fn test_control_word_hexbyte_raw_invalid() {
        let test_data: [(&[u8], &[u8], ErrorKind); 3] = [
            (b"'9F4E", b"'9F4E", ErrorKind::Tag),              // No backslash
            (br#"\9F4E"#, b"9F4E", ErrorKind::Tag),            // No apostrophe
            (br#"\'R9F4E"#, b"R9F4E", ErrorKind::TakeWhileMN), // Invalid hex
        ];
        for (input, remaining_input, error_kind) in test_data {
            assert_eq!(
                Err(nom::Err::Error(nom::error::Error {
                    input: remaining_input,
                    code: error_kind
                })),
                control_word_hexbyte_raw(input)
            );
        }
    }

    // The length decides how many bytes are taken; without a length nothing is taken.
    #[test]
    fn test_control_bin_raw() {
        let test_data: [(&[u8], &[u8], &[u8]); 3] = [
            (br#"\bin2 ABCD"#, b"CD", b"AB"), // Length followed by a space
            (br#"\bin2ABCD"#, b"CD", b"AB"),  // Length, no space
            (br#"\binABCD"#, b"ABCD", b""),   // No length, no space
        ];
        for (input, remaining_input, parsed_output) in test_data {
            assert_eq!(Ok((remaining_input, parsed_output)), control_bin_raw(input));
        }
    }

    // Any control word other than \bin is rejected without consuming input.
    #[test]
    fn test_control_bin_raw_invalid_tag() {
        let input: &[u8] = br#"\abcABCD"#;
        let remaining_input: &[u8] = br#"\abcABCD"#;
        let error_kind = ErrorKind::Tag;
        assert_eq!(
            Err(nom::Err::Error(nom::error::Error {
                input: remaining_input,
                code: error_kind
            })),
            control_bin_raw(input)
        );
    }

    // Plain text ends at the first character that has a special meaning.
    #[test]
    fn test_rtf_text_raw() {
        let test_data: [(&[u8], &[u8], &[u8]); 5] = [
            (br#"123\abc"#, br#"\abc"#, b"123"), // Parse up to backslash
            (b"123}abc", b"}abc", b"123"),       // Parse up to closing curly brace
            (b"123{abc", b"{abc", b"123"),       // Parse up to opening curly brace
            (b"123\rabc", b"\rabc", b"123"),     // Parse up to CR
            (b"123\nabc", b"\nabc", b"123"),     // Parse up to LF
        ];
        for (input, remaining_input, parsed_output) in test_data {
            assert_eq!(Ok((remaining_input, parsed_output)), rtf_text_raw(input));
        }
    }

    // An opening brace at the start of the input is consumed and returned.
    #[test]
    fn test_start_group_raw() {
        let input: &[u8] = b"{abc";
        let remaining_input: &[u8] = b"abc";
        let parsed_output = '{';
        assert_eq!(Ok((remaining_input, parsed_output)), start_group_raw(input));
    }

    // The brace has to be the very first character.
    #[test]
    fn test_start_group_raw_invalid() {
        let input: &[u8] = b"a{bc";
        let remaining_input: &[u8] = b"a{bc";
        let error_kind = ErrorKind::Char;
        assert_eq!(
            Err(nom::Err::Error(nom::error::Error {
                input: remaining_input,
                code: error_kind
            })),
            start_group_raw(input)
        );
    }

    // A closing brace at the start of the input is consumed and returned.
    #[test]
    fn test_end_group_raw() {
        let input: &[u8] = b"}abc";
        let remaining_input: &[u8] = b"abc";
        let parsed_output = '}';
        assert_eq!(Ok((remaining_input, parsed_output)), end_group_raw(input));
    }

    // The brace has to be the very first character.
    #[test]
    fn test_end_group_raw_invalid() {
        let input: &[u8] = b"a}bc";
        let remaining_input: &[u8] = b"a}bc";
        let error_kind = ErrorKind::Char;
        assert_eq!(
            Err(nom::Err::Error(nom::error::Error {
                input: remaining_input,
                code: error_kind
            })),
            end_group_raw(input)
        );
    }

    // All three line ending styles are accepted and returned as they appeared.
    #[test]
    fn test_newline_raw() {
        let test_data: [(&[u8], &[u8], &[u8]); 3] = [
            (b"\r\nabc", br#"abc"#, b"\r\n"), // CRLF
            (b"\nabc", b"abc", b"\n"),        // LF
            (b"\rabc", b"abc", b"\r"),        // CR
        ];
        for (input, remaining_input, parsed_output) in test_data {
            assert_eq!(Ok((remaining_input, parsed_output)), newline_raw(input));
        }
    }

    // A line ending that is not at the very start of the input is not matched.
    #[test]
    fn test_newline_raw_invalid() {
        let test_data: [(&[u8], &[u8], ErrorKind); 3] = [
            (b"a\r\nbc", b"a\r\nbc", ErrorKind::Tag), // CRLF after a letter
            (b"a\nbc", b"a\nbc", ErrorKind::Tag),     // LF after a letter
            (b"a\rbc", b"a\rbc", ErrorKind::Tag),     // CR after a letter
        ];
        for (input, remaining_input, error_kind) in test_data {
            assert_eq!(
                Err(nom::Err::Error(nom::error::Error {
                    input: remaining_input,
                    code: error_kind
                })),
                newline_raw(input)
            );
        }
    }
}