rtf_parser/
lexer.rs

1use std::fmt;
2
3use crate::tokens::{ControlWord, Property, Token};
4use crate::utils::StrUtils;
5use crate::{recursive_tokenize, recursive_tokenize_with_init};
6
/// Errors that can occur while lexing an RTF document.
#[derive(Debug, Clone)]
pub enum LexerError {
    /// Generic failure carrying a free-form message (also wraps
    /// `Utf8Error` / `ParseIntError` via the `From` impls below).
    Error(String),
    /// A `\'xx` hex escape was malformed or too short to parse.
    InvalidUnicode(String),
    /// The document's final token was not the closing `}`.
    InvalidLastChar,
}
13
// Marker impl: the `Debug` derive and `Display` impl satisfy `Error`'s requirements.
impl std::error::Error for LexerError {}
15
16impl fmt::Display for LexerError {
17    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
18        let _ = write!(f, "[RTF Lexer] : ");
19        let _ = match self {
20            LexerError::InvalidLastChar => write!(f, "Invalid last char, should be '}}'"),
21            LexerError::InvalidUnicode(uc) => write!(f, "Invalid unicode : {uc}"),
22            LexerError::Error(msg) => write!(f, "{}", msg),
23        };
24        return Ok(());
25    }
26}
27
28impl From<std::str::Utf8Error> for LexerError {
29    fn from(value: std::str::Utf8Error) -> Self {
30        return LexerError::Error(value.to_string());
31    }
32}
33
34impl From<std::num::ParseIntError> for LexerError {
35    fn from(value: std::num::ParseIntError) -> Self {
36        return LexerError::Error(value.to_string());
37    }
38}
39
/// Stateless tokenizer; entry point is the associated function `Lexer::scan`.
pub struct Lexer;
41
42impl Lexer {
43    pub fn scan(src: &str) -> Result<Vec<Token>, LexerError> {
44        let src = src.trim(); // Sanitize src : Trim the leading whitespaces
45
46        let mut tokens: Vec<Token> = vec![];
47        let mut slice_start_index = 0;
48        let mut current_index = 0;
49        let mut previous_char = ' ';
50
51        // This is faster than using an iterator
52        let len = src.len();
53        let bytes = src.as_bytes();
54        let mut i = 0;
55        while i < len {
56            let c = bytes[i] as char;
57            i += 1;
58
59            match c {
60                // TODO: Handle char over code 127 for escaped chars
61                // Handle Escaped chars : "\" + any charcode below 127
62                '{' | '}' | '\\' | '\n' if previous_char == '\\' => {}
63                '{' | '}' | '\\' | '\n' => {
64                    // End of slice chars
65                    if slice_start_index < current_index {
66                        // Close slice
67                        let slice = &src[slice_start_index..current_index];
68                        // Get the corresponding token(s)
69                        let slice_tokens = Self::tokenize(slice)?;
70                        tokens.extend_from_slice(&slice_tokens.as_slice());
71                        slice_start_index = current_index;
72                    }
73                }
74                // Others chars
75                _ => {}
76            }
77            current_index += 1;
78            previous_char = c;
79        }
80        // Manage last token (should always be "}")
81        if slice_start_index < current_index {
82            let slice = &src[slice_start_index..current_index];
83            if slice != "}" {
84                return Err(LexerError::InvalidLastChar);
85            }
86            tokens.push(Token::ClosingBracket);
87        }
88        return Ok(tokens);
89    }
90
91    /// Get a string slice cut but the scanner and return the coreesponding token(s)
92    fn tokenize(slice: &str) -> Result<Vec<Token>, LexerError> {
93        let mut starting_chars = slice.trim_matches(' ').chars().take(2);
94        return match (starting_chars.next(), starting_chars.next()) {
95            // If it starts with \ : escaped text or control word
96            (Some('\\'), Some(c)) => match c {
97                '{' | '}' | '\\' => {
98                    // Handle escaped chars
99                    let tail = slice.get(1..).unwrap_or("");
100                    return Ok(vec![Token::PlainText(tail)]); // No recursive tokenize here, juste some plain text because the char is escaped
101                }
102                '\'' => {
103                    // Escaped unicode in hex value : \'f0
104                    let tail = slice.get(1..).unwrap_or("");
105                    if tail.len() < 2 {
106                        return Err(LexerError::InvalidUnicode(tail.into()));
107                    }
108                    let byte = u8::from_str_radix(&tail[1..3], 16)?; // f0
109                    let mut ret = vec![Token::ControlSymbol((ControlWord::Unicode, Property::Value(byte as i32)))];
110                    recursive_tokenize!(&tail[3..], ret);
111                    return Ok(ret);
112                }
113                '\n' => {
114                    // CRLF
115                    let mut ret = vec![Token::CRLF];
116                    if let Some(tail) = slice.get(2..) {
117                        recursive_tokenize!(tail, ret);
118                    }
119                    return Ok(ret);
120                }
121                'a'..='z' => {
122                    // Identify control word
123                    // ex: parse "\b Words in bold" -> (Token::ControlWord(ControlWord::Bold), Token::ControlWordArgument("Words in bold")
124                    let (mut ident, tail) = slice.split_first_whitespace();
125                    // if ident end with semicolon, strip it for correct value parsing
126                    ident = if ident.chars().last().unwrap_or(' ') == ';' { &ident[0..ident.len() - 1] } else { ident };
127                    let control_word = ControlWord::from(ident)?;
128                    let mut ret = vec![Token::ControlSymbol(control_word)];
129                    recursive_tokenize!(tail, ret);
130
131                    // \u1234 \u1234 is ok, but \u1234  \u1234 is lost a space, \u1234   \u1234 lost two spaces, and so on
132                    // \u1234  1 -> No need to walk in here, it will enter plain text
133                    if control_word.0 == ControlWord::Unicode && tail.len() > 0 && tail.trim() == "" {
134                        ret.push(Token::PlainText(tail));
135                    }
136                    return Ok(ret);
137                }
138                '*' => Ok(vec![Token::IgnorableDestination]),
139                _ => Ok(vec![]),
140            },
141            (Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore the CRLF if it's not escaped
142            // Handle brackets
143            (Some('{'), None) => Ok(vec![Token::OpeningBracket]),
144            (Some('}'), None) => Ok(vec![Token::ClosingBracket]),
145            (Some('{'), Some(_)) => recursive_tokenize_with_init!(Token::OpeningBracket, &slice[1..]),
146            (Some('}'), Some(_)) => recursive_tokenize_with_init!(Token::ClosingBracket, &slice[1..]),
147            (None, None) => Err(LexerError::Error(format!("Empty token {}", &slice))),
148            // Else, it's plain text
149            _ => {
150                let text = slice.trim();
151                if text == "" {
152                    return Ok(vec![]);
153                }
154                return Ok(vec![Token::PlainText(slice)]);
155            }
156        };
157    }
158}
159
#[cfg(test)]
pub(crate) mod tests {
    //! Unit tests covering `Lexer::tokenize` (single slice) and
    //! `Lexer::scan` (whole documents): brackets, control words with and
    //! without values, escaped delimiters, hex escapes and line returns.
    use crate::lexer::Lexer;
    use crate::tokens::ControlWord::{Ansi, Bold, ColorBlue, ColorNumber, ColorRed, FontNumber, FontSize, FontTable, Italic, Par, Pard, Rtf, Underline, Unicode, Unknown};
    use crate::tokens::Property::*;
    use crate::tokens::Token::*;
    use crate::tokens::{ControlWord, Property};

    // A control word followed by text yields a symbol plus plain text.
    #[test]
    fn simple_tokenize_test() {
        let tokens = Lexer::tokenize(r"\b Words in bold").unwrap();
        assert_eq!(tokens, vec![ControlSymbol((Bold, None)), PlainText("Words in bold"),]);
    }

    // Full document: nested groups, font table, and an unknown control word
    // (\fswiss) surfaced as Unknown rather than an error.
    #[test]
    fn scan_entire_file_test() {
        let tokens = Lexer::scan(r#"{ \rtf1\ansi{\fonttbl\f0\fswiss Helvetica;}\f0\pard Voici du texte en {\b gras}.\par }"#);
        assert_eq!(
            tokens.unwrap(),
            vec![
                OpeningBracket,
                ControlSymbol((Rtf, Value(1))),
                ControlSymbol((Ansi, None)),
                OpeningBracket,
                ControlSymbol((FontTable, None)),
                ControlSymbol((FontNumber, Value(0))),
                ControlSymbol((Unknown("\\fswiss"), None)),
                PlainText("Helvetica;"),
                ClosingBracket,
                ControlSymbol((FontNumber, Value(0))),
                ControlSymbol((Pard, None)),
                PlainText("Voici du texte en "),
                OpeningBracket,
                ControlSymbol((Bold, None)),
                PlainText("gras"),
                ClosingBracket,
                PlainText("."),
                ControlSymbol((Par, None)),
                ClosingBracket,
            ]
        );
    }

    // Escaped braces (\{ \}) become plain text and escaped newlines become CRLF.
    #[test]
    fn scan_escaped_text() {
        let tokens = Lexer::scan(
            r#"\f0\fs24 \cf0 test de code \
if (a == b) \{\
    test();\
\} else \{\
    return;\
\}}"#,
        );
        assert_eq!(
            tokens.unwrap(),
            vec![
                ControlSymbol((FontNumber, Value(0))),
                ControlSymbol((FontSize, Value(24))),
                ControlSymbol((ColorNumber, Value(0))),
                PlainText("test de code "),
                CRLF,
                PlainText("if (a == b) "),
                PlainText("{"),
                CRLF,
                PlainText("    test();"),
                CRLF,
                PlainText("} else "),
                PlainText("{"),
                CRLF,
                PlainText("    return;"),
                CRLF,
                PlainText("}"),
                ClosingBracket
            ],
        );
    }

    // \* marks an ignorable destination; the following word stays Unknown.
    #[test]
    fn scan_ignorable_destination() {
        let text = r"{\*\expandedcolortbl;;}";
        let tokens = Lexer::scan(text);
        assert_eq!(
            tokens.unwrap(),
            vec![OpeningBracket, IgnorableDestination, ControlSymbol((Unknown(r"\expandedcolortbl;"), None)), ClosingBracket,]
        )
    }

    // A trailing semicolon on a control word is stripped before value parsing.
    #[test]
    fn should_parse_control_symbol_ending_semicolon() {
        let text = r"{\red255\blue255;}";
        let tokens = Lexer::scan(text);
        assert_eq!(
            tokens.unwrap(),
            vec![OpeningBracket, ControlSymbol((ColorRed, Value(255))), ControlSymbol((ColorBlue, Value(255))), ClosingBracket]
        );
    }

    #[test]
    fn lex_with_leading_whitespaces() {
        // Try to parse without error
        let rtf_content = "\t {\\rtf1 }\n "; // Not raw str for the whitespace to be trimed
        let tokens = Lexer::scan(rtf_content).unwrap();
        assert_eq!(tokens, vec![OpeningBracket, ControlSymbol((Rtf, Value(1))), ClosingBracket]);
    }

    #[test]
    fn should_parse_line_return() {
        // From Microsoft's reference: "A carriage return (character value 13) or linefeed (character value 10)
        // will be treated as a \par control if the character is preceded by a backslash.
        // You must include the backslash; otherwise, RTF ignores the control word."
        let text = r#"{\partightenfactor0

\fs24 \cf0 Font size 12,
\f0\b bold text. \ul Underline,bold text.\
 }"#;
        let tokens = Lexer::scan(text).unwrap();
        assert_eq!(
            tokens,
            [
                OpeningBracket,
                ControlSymbol((Unknown("\\partightenfactor"), Value(0))),
                ControlSymbol((FontSize, Value(24))),
                ControlSymbol((ColorNumber, Value(0))),
                PlainText("Font size 12,"),
                ControlSymbol((FontNumber, Value(0))),
                ControlSymbol((Bold, None)),
                PlainText("bold text. "),
                ControlSymbol((Underline, None)),
                PlainText("Underline,bold text."),
                CRLF,
                ClosingBracket
            ]
        );
    }

    // Text directly adjacent to a group boundary keeps its exact spacing.
    #[test]
    fn space_after_control_word() {
        let text = r"{in{\i cred}ible}";
        let tokens = Lexer::scan(text).unwrap();
        assert_eq!(
            tokens,
            [OpeningBracket, PlainText("in"), OpeningBracket, ControlSymbol((Italic, None)), PlainText("cred"), ClosingBracket, PlainText("ible"), ClosingBracket,]
        )
    }

    // \'ea parses as a Unicode control symbol with the hex value (234).
    #[test]
    fn should_handle_escaped_char() {
        let rtf = r"{je suis une b\'eate}"; // ê = 0xea = 234
        let tokens = Lexer::scan(rtf).unwrap();
        assert_eq!(
            tokens,
            [OpeningBracket, PlainText("je suis une b"), ControlSymbol((Unicode, Value(234))), PlainText("te"), ClosingBracket,]
        );
    }
}