1use std::fmt;
2
3use crate::tokens::{ControlWord, Property, Token};
4use crate::utils::StrUtils;
5use crate::{recursive_tokenize, recursive_tokenize_with_init};
6
/// Errors that can be reported while lexing an RTF source string.
#[derive(Debug, Clone)]
pub enum LexerError {
    /// Generic failure carrying a human-readable message.
    ///
    /// Also used to wrap [`std::str::Utf8Error`] and [`std::num::ParseIntError`]
    /// through the `From` implementations below.
    Error(String),
    /// An escape sequence whose payload could not be read; carries the offending text.
    InvalidUnicode(String),
    /// The source does not end with a closing `}`.
    InvalidLastChar,
}

impl std::error::Error for LexerError {}

impl fmt::Display for LexerError {
    /// Writes the error as `[RTF Lexer] : <details>`.
    ///
    /// Errors reported by the underlying writer are propagated to the caller
    /// instead of being silently discarded.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("[RTF Lexer] : ")?;
        match self {
            LexerError::InvalidLastChar => f.write_str("Invalid last char, should be '}'"),
            LexerError::InvalidUnicode(uc) => write!(f, "Invalid unicode : {uc}"),
            LexerError::Error(msg) => f.write_str(msg),
        }
    }
}

impl From<std::str::Utf8Error> for LexerError {
    /// Wraps the UTF-8 decoding error message into [`LexerError::Error`].
    fn from(value: std::str::Utf8Error) -> Self {
        LexerError::Error(value.to_string())
    }
}

impl From<std::num::ParseIntError> for LexerError {
    /// Wraps the integer parsing error message into [`LexerError::Error`].
    fn from(value: std::num::ParseIntError) -> Self {
        LexerError::Error(value.to_string())
    }
}
39
40pub struct Lexer;
41
42impl Lexer {
43 pub fn scan(src: &str) -> Result<Vec<Token>, LexerError> {
44 let src = src.trim(); let mut tokens: Vec<Token> = vec![];
47 let mut slice_start_index = 0;
48 let mut previous_char = ' ';
49
50 for (current_index, c) in src.char_indices() {
51 match c {
52 '{' | '}' | '\\' | '\n' if previous_char == '\\' => {}
55 '{' | '}' | '\\' | '\n' => {
56 if slice_start_index < current_index {
58 let slice = &src[slice_start_index..current_index];
60 let slice_tokens = Self::tokenize(slice)?;
62 tokens.extend_from_slice(&slice_tokens.as_slice());
63 slice_start_index = current_index;
64 }
65 }
66 _ => {}
68 }
69 previous_char = c;
70 }
71 if slice_start_index < src.len() {
73 let slice = &src[slice_start_index..];
74 if slice != "}" {
75 return Err(LexerError::InvalidLastChar);
76 }
77 tokens.push(Token::ClosingBracket);
78 }
79 return Ok(tokens);
80 }
81
82 fn tokenize(slice: &str) -> Result<Vec<Token>, LexerError> {
84 let mut starting_chars = slice.trim_matches(' ').chars().take(2);
85 return match (starting_chars.next(), starting_chars.next()) {
86 (Some('\\'), Some(c)) => match c {
88 '{' | '}' | '\\' => {
89 let tail = slice.get(1..).unwrap_or("");
91 return Ok(vec![Token::PlainText(tail)]); }
93 '\'' => {
94 let tail = slice.get(1..).unwrap_or("");
96 let Some(hex) = tail.get(1..3) else {
97 return Err(LexerError::InvalidUnicode(tail.into()));
98 };
99 let byte = u8::from_str_radix(hex, 16)?; let mut ret = vec![Token::ControlSymbol((ControlWord::Unicode, Property::Value(byte as i32)))];
101 recursive_tokenize!(&tail[3..], ret);
102 return Ok(ret);
103 }
104 '\n' => {
105 let mut ret = vec![Token::CRLF];
107 if let Some(tail) = slice.get(2..) {
108 recursive_tokenize!(tail, ret);
109 }
110 return Ok(ret);
111 }
112 'a'..='z' => {
113 let (mut ident, tail) = slice.split_first_whitespace();
116 ident = if ident.chars().last().unwrap_or(' ') == ';' { &ident[0..ident.len() - 1] } else { ident };
118 let control_word = ControlWord::from(ident)?;
119 let mut ret = vec![Token::ControlSymbol(control_word)];
120 recursive_tokenize!(tail, ret);
121
122 if tail.len() > 0 && tail.is_only_whitespace() {
124 ret.push(Token::PlainText(tail));
125 }
126 return Ok(ret);
127 }
128 '*' => Ok(vec![Token::IgnorableDestination]),
129 _ => Ok(vec![]),
130 },
131 (Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), (Some('{'), None) => Ok(vec![Token::OpeningBracket]),
134 (Some('}'), None) => Ok(vec![Token::ClosingBracket]),
135 (Some('{'), Some(_)) => recursive_tokenize_with_init!(Token::OpeningBracket, &slice[1..]),
136 (Some('}'), Some(_)) => recursive_tokenize_with_init!(Token::ClosingBracket, &slice[1..]),
137 (None, None) => Err(LexerError::Error(format!("Empty token {}", &slice))),
138 _ => {
140 let text = slice.trim();
141 if text == "" {
142 return Ok(vec![]);
143 }
144 return Ok(vec![Token::PlainText(slice)]);
145 }
146 };
147 }
148}
149
#[cfg(test)]
pub(crate) mod tests {
    use crate::lexer::Lexer;
    use crate::tokens::ControlWord::{
        Ansi, Bold, ColorBlue, ColorNumber, ColorRed, FontNumber, FontSize, FontTable, Italic, Par, Pard, Rtf,
        Underline, Unicode, Unknown,
    };
    use crate::tokens::Property::*;
    use crate::tokens::Token::*;
    use crate::tokens::{ControlWord, Property};

    /// A control word followed by text yields the control symbol then the text.
    #[test]
    fn simple_tokenize_test() {
        let lexed = Lexer::tokenize(r"\b Words in bold").unwrap();
        let expected = vec![ControlSymbol((Bold, None)), PlainText("Words in bold")];
        assert_eq!(lexed, expected);
    }

    /// A small but complete document is lexed from the first to the last bracket.
    #[test]
    fn scan_entire_file_test() {
        let source = r#"{ \rtf1\ansi{\fonttbl\f0\fswiss Helvetica;}\f0\pard Voici du texte en {\b gras}.\par }"#;
        let lexed = Lexer::scan(source).unwrap();
        let expected = vec![
            OpeningBracket,
            ControlSymbol((Rtf, Value(1))),
            ControlSymbol((Ansi, None)),
            OpeningBracket,
            ControlSymbol((FontTable, None)),
            ControlSymbol((FontNumber, Value(0))),
            ControlSymbol((Unknown("\\fswiss"), None)),
            PlainText("Helvetica;"),
            ClosingBracket,
            ControlSymbol((FontNumber, Value(0))),
            ControlSymbol((Pard, None)),
            PlainText("Voici du texte en "),
            OpeningBracket,
            ControlSymbol((Bold, None)),
            PlainText("gras"),
            ClosingBracket,
            PlainText("."),
            ControlSymbol((Par, None)),
            ClosingBracket,
        ];
        assert_eq!(lexed, expected);
    }

    /// Escaped braces become plain text and escaped line feeds become `CRLF`.
    #[test]
    fn scan_escaped_text() {
        let source = r#"\f0\fs24 \cf0 test de code \
if (a == b) \{\
 test();\
\} else \{\
 return;\
\}}"#;
        let lexed = Lexer::scan(source).unwrap();
        let expected = vec![
            ControlSymbol((FontNumber, Value(0))),
            ControlSymbol((FontSize, Value(24))),
            ControlSymbol((ColorNumber, Value(0))),
            PlainText("test de code "),
            CRLF,
            PlainText("if (a == b) "),
            PlainText("{"),
            CRLF,
            PlainText(" test();"),
            CRLF,
            PlainText("} else "),
            PlainText("{"),
            CRLF,
            PlainText(" return;"),
            CRLF,
            PlainText("}"),
            ClosingBracket,
        ];
        assert_eq!(lexed, expected);
    }

    /// `\*` is reported as an ignorable destination marker.
    #[test]
    fn scan_ignorable_destination() {
        let source = r"{\*\expandedcolortbl;;}";
        let lexed = Lexer::scan(source).unwrap();
        let expected = vec![
            OpeningBracket,
            IgnorableDestination,
            ControlSymbol((Unknown(r"\expandedcolortbl;"), None)),
            ClosingBracket,
        ];
        assert_eq!(lexed, expected);
    }

    /// A trailing `;` after a control word is not part of the control word.
    #[test]
    fn should_parse_control_symbol_ending_semicolon() {
        let source = r"{\red255\blue255;}";
        let lexed = Lexer::scan(source).unwrap();
        let expected = vec![
            OpeningBracket,
            ControlSymbol((ColorRed, Value(255))),
            ControlSymbol((ColorBlue, Value(255))),
            ClosingBracket,
        ];
        assert_eq!(lexed, expected);
    }

    /// Whitespace surrounding the document is ignored.
    #[test]
    fn lex_with_leading_whitespaces() {
        let source = "\t {\\rtf1 }\n ";
        let lexed = Lexer::scan(source).unwrap();
        let expected = vec![OpeningBracket, ControlSymbol((Rtf, Value(1))), ClosingBracket];
        assert_eq!(lexed, expected);
    }

    /// Bare line feeds are skipped while a backslash-escaped one yields `CRLF`.
    #[test]
    fn should_parse_line_return() {
        let source = r#"{\partightenfactor0

\fs24 \cf0 Font size 12,
\f0\b bold text. \ul Underline,bold text.\
 }"#;
        let lexed = Lexer::scan(source).unwrap();
        let expected = [
            OpeningBracket,
            ControlSymbol((Unknown("\\partightenfactor"), Value(0))),
            ControlSymbol((FontSize, Value(24))),
            ControlSymbol((ColorNumber, Value(0))),
            PlainText("Font size 12,"),
            ControlSymbol((FontNumber, Value(0))),
            ControlSymbol((Bold, None)),
            PlainText("bold text. "),
            ControlSymbol((Underline, None)),
            PlainText("Underline,bold text."),
            CRLF,
            ClosingBracket,
        ];
        assert_eq!(lexed, expected);
    }

    /// The space separating a control word from its text is not kept in the text.
    #[test]
    fn space_after_control_word() {
        let source = r"{in{\i cred}ible}";
        let lexed = Lexer::scan(source).unwrap();
        let expected = [
            OpeningBracket,
            PlainText("in"),
            OpeningBracket,
            ControlSymbol((Italic, None)),
            PlainText("cred"),
            ClosingBracket,
            PlainText("ible"),
            ClosingBracket,
        ];
        assert_eq!(lexed, expected);
    }

    /// A `\'hh` escape is lexed as a unicode control symbol holding the byte value.
    #[test]
    fn should_handle_escaped_char() {
        let source = r"{je suis une b\'eate}";
        let lexed = Lexer::scan(source).unwrap();
        let expected = [
            OpeningBracket,
            PlainText("je suis une b"),
            ControlSymbol((Unicode, Value(234))),
            PlainText("te"),
            ClosingBracket,
        ];
        assert_eq!(lexed, expected);
    }

    /// Multi-byte UTF-8 text goes through the lexer untouched.
    #[test]
    fn should_handle_utf8_plain_text() {
        let lexed = Lexer::scan(r"{Привет}").unwrap();
        let expected = [OpeningBracket, PlainText("Привет"), ClosingBracket];
        assert_eq!(lexed, expected);
    }

    /// A truncated `\'` escape is reported as an error rather than panicking.
    #[test]
    fn should_not_panic_on_invalid_unicode() {
        let source = String::from_utf8_lossy(&[92u8, 39, 0, 10, 0]);
        let outcome = Lexer::scan(&source);
        assert!(outcome.is_err());
    }

    /// A control word followed by a multi-byte character must not panic.
    #[test]
    fn should_not_panic_on_utf8_control_word() {
        let source = String::from_utf8_lossy(&[92u8, 97, 194, 160, 125]);
        let outcome = Lexer::scan(&source);
        assert!(outcome.is_ok());
    }

    /// `\uN` control words carry their numeric value.
    #[test]
    fn should_lex_unicode() {
        let source = r#"{\u21834 \u21834 }"#;
        let lexed = Lexer::scan(source).unwrap();
        let expected = vec![
            OpeningBracket,
            ControlSymbol((Unicode, Value(21834))),
            PlainText(" "),
            ControlSymbol((Unicode, Value(21834))),
            ClosingBracket,
        ];
        assert_eq!(lexed, expected);
    }

    /// Whitespace left after a control word is kept as plain text.
    #[test]
    fn should_handle_whitespace_group() {
        let source = r"{\cf1 }";
        let lexed = Lexer::scan(source).unwrap();
        let expected = [OpeningBracket, ControlSymbol((ColorNumber, Value(1))), PlainText(" "), ClosingBracket];
        assert_eq!(lexed, expected);
    }
}