Skip to main content

rtf_parser/
lexer.rs

1use std::fmt;
2
3use crate::tokens::{ControlWord, Property, Token};
4use crate::utils::StrUtils;
5use crate::{recursive_tokenize, recursive_tokenize_with_init};
6
/// Errors reported by the RTF lexer.
#[derive(Debug, Clone)]
pub enum LexerError {
    /// Generic failure carrying a human-readable message.
    /// UTF-8 and integer parsing errors are converted into this variant.
    Error(String),
    /// An invalid unicode / escaped character sequence; holds the offending text.
    InvalidUnicode(String),
    /// The last character of the source is not the expected closing `}`.
    InvalidLastChar,
}

impl std::error::Error for LexerError {}

impl fmt::Display for LexerError {
    /// Writes the error message prefixed with `[RTF Lexer] : `.
    ///
    /// Any error returned by the underlying formatter is propagated to the caller
    /// instead of being discarded.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "[RTF Lexer] : ")?;
        match self {
            LexerError::InvalidLastChar => write!(f, "Invalid last char, should be '}}'"),
            LexerError::InvalidUnicode(uc) => write!(f, "Invalid unicode : {uc}"),
            LexerError::Error(msg) => write!(f, "{msg}"),
        }
    }
}

impl From<std::str::Utf8Error> for LexerError {
    /// Wraps the UTF-8 error message into [`LexerError::Error`].
    fn from(value: std::str::Utf8Error) -> Self {
        LexerError::Error(value.to_string())
    }
}

impl From<std::num::ParseIntError> for LexerError {
    /// Wraps the integer parsing error message into [`LexerError::Error`].
    fn from(value: std::num::ParseIntError) -> Self {
        LexerError::Error(value.to_string())
    }
}
39
40pub struct Lexer;
41
42impl Lexer {
43    pub fn scan(src: &str) -> Result<Vec<Token>, LexerError> {
44        let src = src.trim(); // Sanitize src : Trim the leading whitespaces
45
46        let mut tokens: Vec<Token> = vec![];
47        let mut slice_start_index = 0;
48        let mut previous_char = ' ';
49
50        for (current_index, c) in src.char_indices() {
51            match c {
52                // TODO: Handle char over code 127 for escaped chars
53                // Handle Escaped chars : "\" + any charcode below 127
54                '{' | '}' | '\\' | '\n' if previous_char == '\\' => {}
55                '{' | '}' | '\\' | '\n' => {
56                    // End of slice chars
57                    if slice_start_index < current_index {
58                        // Close slice
59                        let slice = &src[slice_start_index..current_index];
60                        // Get the corresponding token(s)
61                        let slice_tokens = Self::tokenize(slice)?;
62                        tokens.extend_from_slice(&slice_tokens.as_slice());
63                        slice_start_index = current_index;
64                    }
65                }
66                // Others chars
67                _ => {}
68            }
69            previous_char = c;
70        }
71        // Manage last token (should always be "}")
72        if slice_start_index < src.len() {
73            let slice = &src[slice_start_index..];
74            if slice != "}" {
75                return Err(LexerError::InvalidLastChar);
76            }
77            tokens.push(Token::ClosingBracket);
78        }
79        return Ok(tokens);
80    }
81
82    /// Get a string slice cut but the scanner and return the coreesponding token(s)
83    fn tokenize(slice: &str) -> Result<Vec<Token>, LexerError> {
84        let mut starting_chars = slice.trim_matches(' ').chars().take(2);
85        return match (starting_chars.next(), starting_chars.next()) {
86            // If it starts with \ : escaped text or control word
87            (Some('\\'), Some(c)) => match c {
88                '{' | '}' | '\\' => {
89                    // Handle escaped chars
90                    let tail = slice.get(1..).unwrap_or("");
91                    return Ok(vec![Token::PlainText(tail)]); // No recursive tokenize here, juste some plain text because the char is escaped
92                }
93                '\'' => {
94                    // Escaped unicode in hex value : \'f0
95                    let tail = slice.get(1..).unwrap_or("");
96                    let Some(hex) = tail.get(1..3) else {
97                        return Err(LexerError::InvalidUnicode(tail.into()));
98                    };
99                    let byte = u8::from_str_radix(hex, 16)?; // f0
100                    let mut ret = vec![Token::ControlSymbol((ControlWord::Unicode, Property::Value(byte as i32)))];
101                    recursive_tokenize!(&tail[3..], ret);
102                    return Ok(ret);
103                }
104                '\n' => {
105                    // CRLF
106                    let mut ret = vec![Token::CRLF];
107                    if let Some(tail) = slice.get(2..) {
108                        recursive_tokenize!(tail, ret);
109                    }
110                    return Ok(ret);
111                }
112                'a'..='z' => {
113                    // Identify control word
114                    // ex: parse "\b Words in bold" -> (Token::ControlWord(ControlWord::Bold), Token::ControlWordArgument("Words in bold")
115                    let (mut ident, tail) = slice.split_first_whitespace();
116                    // if ident end with semicolon, strip it for correct value parsing
117                    ident = if ident.chars().last().unwrap_or(' ') == ';' { &ident[0..ident.len() - 1] } else { ident };
118                    let control_word = ControlWord::from(ident)?;
119                    let mut ret = vec![Token::ControlSymbol(control_word)];
120                    recursive_tokenize!(tail, ret);
121
122                    // The first whitespace delimits the control word, the remaining ones are plain text
123                    if tail.len() > 0 && tail.is_only_whitespace() {
124                        ret.push(Token::PlainText(tail));
125                    }
126                    return Ok(ret);
127                }
128                '*' => Ok(vec![Token::IgnorableDestination]),
129                _ => Ok(vec![]),
130            },
131            (Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore the CRLF if it's not escaped
132            // Handle brackets
133            (Some('{'), None) => Ok(vec![Token::OpeningBracket]),
134            (Some('}'), None) => Ok(vec![Token::ClosingBracket]),
135            (Some('{'), Some(_)) => recursive_tokenize_with_init!(Token::OpeningBracket, &slice[1..]),
136            (Some('}'), Some(_)) => recursive_tokenize_with_init!(Token::ClosingBracket, &slice[1..]),
137            (None, None) => Err(LexerError::Error(format!("Empty token {}", &slice))),
138            // Else, it's plain text
139            _ => {
140                let text = slice.trim();
141                if text == "" {
142                    return Ok(vec![]);
143                }
144                return Ok(vec![Token::PlainText(slice)]);
145            }
146        };
147    }
148}
149
#[cfg(test)]
pub(crate) mod tests {
    use crate::lexer::Lexer;
    use crate::tokens::ControlWord::{Ansi, Bold, ColorBlue, ColorNumber, ColorRed, FontNumber, FontSize, FontTable, Italic, Par, Pard, Rtf, Underline, Unicode, Unknown};
    use crate::tokens::Property::*;
    use crate::tokens::Token::*;
    use crate::tokens::{ControlWord, Property};

    /// A single control word followed by its text argument.
    #[test]
    fn simple_tokenize_test() {
        let expected = vec![ControlSymbol((Bold, None)), PlainText("Words in bold")];
        let actual = Lexer::tokenize(r"\b Words in bold").unwrap();
        assert_eq!(actual, expected);
    }

    /// A small but complete document : header, font table and a paragraph with a bold group.
    #[test]
    fn scan_entire_file_test() {
        let source = r#"{ \rtf1\ansi{\fonttbl\f0\fswiss Helvetica;}\f0\pard Voici du texte en {\b gras}.\par }"#;
        let expected = vec![
            OpeningBracket,
            ControlSymbol((Rtf, Value(1))),
            ControlSymbol((Ansi, None)),
            OpeningBracket,
            ControlSymbol((FontTable, None)),
            ControlSymbol((FontNumber, Value(0))),
            ControlSymbol((Unknown("\\fswiss"), None)),
            PlainText("Helvetica;"),
            ClosingBracket,
            ControlSymbol((FontNumber, Value(0))),
            ControlSymbol((Pard, None)),
            PlainText("Voici du texte en "),
            OpeningBracket,
            ControlSymbol((Bold, None)),
            PlainText("gras"),
            ClosingBracket,
            PlainText("."),
            ControlSymbol((Par, None)),
            ClosingBracket,
        ];
        assert_eq!(Lexer::scan(source).unwrap(), expected);
    }

    /// Escaped braces and escaped line feeds inside plain text.
    #[test]
    fn scan_escaped_text() {
        let source = r#"\f0\fs24 \cf0 test de code \
if (a == b) \{\
    test();\
\} else \{\
    return;\
\}}"#;
        let expected = vec![
            ControlSymbol((FontNumber, Value(0))),
            ControlSymbol((FontSize, Value(24))),
            ControlSymbol((ColorNumber, Value(0))),
            PlainText("test de code "),
            CRLF,
            PlainText("if (a == b) "),
            PlainText("{"),
            CRLF,
            PlainText("    test();"),
            CRLF,
            PlainText("} else "),
            PlainText("{"),
            CRLF,
            PlainText("    return;"),
            CRLF,
            PlainText("}"),
            ClosingBracket,
        ];
        assert_eq!(Lexer::scan(source).unwrap(), expected);
    }

    /// `\*` is reported as an ignorable destination marker.
    #[test]
    fn scan_ignorable_destination() {
        let source = r"{\*\expandedcolortbl;;}";
        let expected = vec![OpeningBracket, IgnorableDestination, ControlSymbol((Unknown(r"\expandedcolortbl;"), None)), ClosingBracket];
        assert_eq!(Lexer::scan(source).unwrap(), expected);
    }

    /// A trailing semicolon must not prevent the control word value from being parsed.
    #[test]
    fn should_parse_control_symbol_ending_semicolon() {
        let source = r"{\red255\blue255;}";
        let expected = vec![OpeningBracket, ControlSymbol((ColorRed, Value(255))), ControlSymbol((ColorBlue, Value(255))), ClosingBracket];
        assert_eq!(Lexer::scan(source).unwrap(), expected);
    }

    /// Whitespace surrounding the document is trimmed before lexing.
    #[test]
    fn lex_with_leading_whitespaces() {
        // Not a raw str, so that the escapes are real whitespace to be trimmed
        let source = "\t {\\rtf1 }\n ";
        let expected = vec![OpeningBracket, ControlSymbol((Rtf, Value(1))), ClosingBracket];
        assert_eq!(Lexer::scan(source).unwrap(), expected);
    }

    /// From Microsoft's reference: "A carriage return (character value 13) or linefeed (character value 10)
    /// will be treated as a \par control if the character is preceded by a backslash.
    /// You must include the backslash; otherwise, RTF ignores the control word."
    #[test]
    fn should_parse_line_return() {
        let source = r#"{\partightenfactor0

\fs24 \cf0 Font size 12,
\f0\b bold text. \ul Underline,bold text.\
 }"#;
        let expected = vec![
            OpeningBracket,
            ControlSymbol((Unknown("\\partightenfactor"), Value(0))),
            ControlSymbol((FontSize, Value(24))),
            ControlSymbol((ColorNumber, Value(0))),
            PlainText("Font size 12,"),
            ControlSymbol((FontNumber, Value(0))),
            ControlSymbol((Bold, None)),
            PlainText("bold text. "),
            ControlSymbol((Underline, None)),
            PlainText("Underline,bold text."),
            CRLF,
            ClosingBracket,
        ];
        assert_eq!(Lexer::scan(source).unwrap(), expected);
    }

    /// The space after a control word is a delimiter, not part of the text.
    #[test]
    fn space_after_control_word() {
        let source = r"{in{\i cred}ible}";
        let expected = vec![
            OpeningBracket,
            PlainText("in"),
            OpeningBracket,
            ControlSymbol((Italic, None)),
            PlainText("cred"),
            ClosingBracket,
            PlainText("ible"),
            ClosingBracket,
        ];
        assert_eq!(Lexer::scan(source).unwrap(), expected);
    }

    /// A `\'hh` escape becomes a unicode control symbol holding the byte value.
    #[test]
    fn should_handle_escaped_char() {
        let source = r"{je suis une b\'eate}"; // ê = 0xea = 234
        let expected = vec![OpeningBracket, PlainText("je suis une b"), ControlSymbol((Unicode, Value(234))), PlainText("te"), ClosingBracket];
        assert_eq!(Lexer::scan(source).unwrap(), expected);
    }

    /// Multi-byte UTF-8 plain text goes through untouched.
    #[test]
    fn should_handle_utf8_plain_text() {
        let expected = vec![OpeningBracket, PlainText("Привет"), ClosingBracket];
        assert_eq!(Lexer::scan(r"{Привет}").unwrap(), expected);
    }

    /// A truncated `\'` escape must be reported as an error, not panic.
    #[test]
    fn should_not_panic_on_invalid_unicode() {
        let raw: [u8; 5] = [92, 39, 0, 10, 0];
        let source = String::from_utf8_lossy(&raw);
        let outcome = Lexer::scan(&source);
        assert!(outcome.is_err());
    }

    /// A control word followed by a multi-byte char must lex without panicking.
    #[test]
    fn should_not_panic_on_utf8_control_word() {
        let raw: [u8; 5] = [92, 97, 194, 160, 125];
        let source = String::from_utf8_lossy(&raw);
        let outcome = Lexer::scan(&source);
        assert!(outcome.is_ok());
    }

    /// `\uN` control words carry the code point as their value.
    #[test]
    fn should_lex_unicode() {
        let source = r#"{\u21834  \u21834 }"#;
        let expected = vec![OpeningBracket, ControlSymbol((Unicode, Value(21834))), PlainText(" "), ControlSymbol((Unicode, Value(21834))), ClosingBracket];
        assert_eq!(Lexer::scan(source).unwrap(), expected);
    }

    /// Two whitespaces : one is the control word delimiter, the other is plain text.
    #[test]
    fn should_handle_whitespace_group() {
        let source = r"{\cf1  }";
        let expected = vec![OpeningBracket, ControlSymbol((ColorNumber, Value(1))), PlainText(" "), ClosingBracket];
        assert_eq!(Lexer::scan(source).unwrap(), expected);
    }
}