use std::fmt;

use crate::tokens::{ControlWord, Property, Token};
use crate::utils::StrUtils;
use crate::{recursive_tokenize, recursive_tokenize_with_init};

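/// Errors that can be raised while lexing an RTF document.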
#[derive(Debug, Clone)]
pub enum LexerError {
    /// Catch-all error with a human-readable message.
    Error(String),
    /// Malformed `\'xx` hexadecimal escape sequence.
    InvalidUnicode(String),
    /// The document does not end with a closing `}`.
    InvalidLastChar,
}

impl std::error::Error for LexerError {}

impl fmt::Display for LexerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "[RTF Lexer]: ")?;
        match self {
            LexerError::InvalidLastChar => write!(f, "Invalid last char, should be '}}'"),
            LexerError::InvalidUnicode(uc) => write!(f, "Invalid unicode: {uc}"),
            LexerError::Error(msg) => write!(f, "{msg}"),
        }
    }
}

impl From<std::str::Utf8Error> for LexerError {
    fn from(value: std::str::Utf8Error) -> Self {
        return LexerError::Error(value.to_string());
    }
}

impl From<std::num::ParseIntError> for LexerError {
    fn from(value: std::num::ParseIntError) -> Self {
        return LexerError::Error(value.to_string());
    }
}
pub struct Lexer;

impl Lexer {
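    /// Scan an entire RTF document and return its token stream.
    ///
    /// The input is cut into slices at the unescaped delimiters `{`, `}`,
    /// `\` and `\n`; each slice is then handed to `tokenize`.
    ///
    /// A minimal usage sketch (the `rtf_parser` crate path is an assumption
    /// about where this module is mounted, so the doctest is not compiled):
    ///
    /// ```ignore
    /// use rtf_parser::lexer::Lexer;
    /// use rtf_parser::tokens::Token;
    ///
    /// let tokens = Lexer::scan(r"{\rtf1 Hello}").unwrap();
    /// assert_eq!(tokens.first(), Some(&Token::OpeningBracket));
    /// ```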
    pub fn scan(src: &str) -> Result<Vec<Token>, LexerError> {
        let src = src.trim();
        let mut tokens: Vec<Token> = vec![];
        let mut slice_start_index = 0;
        let mut current_index = 0;
        let mut previous_char = ' ';

        // The delimiters below are all ASCII, so iterating over bytes is safe:
        // multi-byte UTF-8 sequences never match a delimiter, and slice
        // boundaries therefore always fall on char boundaries.
        let len = src.len();
        let bytes = src.as_bytes();
        let mut i = 0;
        while i < len {
            let c = bytes[i] as char;
            i += 1;

            match c {
                // Escaped delimiter (e.g. `\{` or `\\`): keep it in the current slice.
                '{' | '}' | '\\' | '\n' if previous_char == '\\' => {}
                // Unescaped delimiter: tokenize the slice accumulated so far.
                '{' | '}' | '\\' | '\n' => {
                    if slice_start_index < current_index {
                        let slice = &src[slice_start_index..current_index];
                        let slice_tokens = Self::tokenize(slice)?;
                        tokens.extend(slice_tokens);
                        slice_start_index = current_index;
                    }
                }
                _ => {}
            }
            current_index += 1;
            previous_char = c;
        }
        // Whatever remains after the last delimiter must be the final `}`.
        if slice_start_index < current_index {
            let slice = &src[slice_start_index..current_index];
            if slice != "}" {
                return Err(LexerError::InvalidLastChar);
            }
            tokens.push(Token::ClosingBracket);
        }
        return Ok(tokens);
    }

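    /// Tokenize a single slice produced by `scan`. For example, the slice
    /// `\b Words in bold` yields `ControlSymbol((Bold, None))` followed by
    /// `PlainText("Words in bold")`, as asserted in `simple_tokenize_test` below.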
    fn tokenize(slice: &str) -> Result<Vec<Token>, LexerError> {
        let mut starting_chars = slice.trim_matches(' ').chars().take(2);
        return match (starting_chars.next(), starting_chars.next()) {
            (Some('\\'), Some(c)) => match c {
                // Escaped delimiter: emit everything after the backslash as plain text.
                '{' | '}' | '\\' => {
                    let tail = slice.get(1..).unwrap_or("");
                    return Ok(vec![Token::PlainText(tail)]);
                }
                // Hexadecimal escape such as `\'ea`: exactly one byte, two hex digits.
                '\'' => {
                    let tail = slice.get(1..).unwrap_or("");
                    // `tail` still starts with the apostrophe, so a valid escape
                    // needs at least 3 bytes: `'` plus two hex digits.
                    if tail.len() < 3 {
                        return Err(LexerError::InvalidUnicode(tail.into()));
                    }
                    let byte = u8::from_str_radix(&tail[1..3], 16)?;
                    let mut ret = vec![Token::ControlSymbol((ControlWord::Unicode, Property::Value(byte as i32)))];
                    recursive_tokenize!(&tail[3..], ret);
                    return Ok(ret);
                }
                // Escaped newline: a hard line break.
                '\n' => {
                    let mut ret = vec![Token::CRLF];
                    if let Some(tail) = slice.get(2..) {
                        recursive_tokenize!(tail, ret);
                    }
                    return Ok(ret);
                }
                'a'..='z' => {
                    let (ident, tail) = slice.split_first_whitespace();
                    // Drop a trailing semicolon terminator, as in `\expandedcolortbl;`.
                    let ident = ident.strip_suffix(';').unwrap_or(ident);
                    let control_word = ControlWord::from(ident)?;
                    let mut ret = vec![Token::ControlSymbol(control_word)];
                    recursive_tokenize!(tail, ret);

                    // After a `\u` control word, a whitespace-only tail is real text,
                    // not just the control-word terminator: keep it.
                    if control_word.0 == ControlWord::Unicode && !tail.is_empty() && tail.trim().is_empty() {
                        ret.push(Token::PlainText(tail));
                    }
                    return Ok(ret);
                }
                // `\*` introduces an ignorable destination group.
                '*' => Ok(vec![Token::IgnorableDestination]),
                _ => Ok(vec![]),
            },
            (Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]),
            (Some('{'), None) => Ok(vec![Token::OpeningBracket]),
            (Some('}'), None) => Ok(vec![Token::ClosingBracket]),
            (Some('{'), Some(_)) => recursive_tokenize_with_init!(Token::OpeningBracket, &slice[1..]),
            (Some('}'), Some(_)) => recursive_tokenize_with_init!(Token::ClosingBracket, &slice[1..]),
            (None, None) => Err(LexerError::Error(format!("Empty token: {slice}"))),
            // Anything else is plain text; whitespace-only slices are dropped.
            _ => {
                let text = slice.trim();
                if text.is_empty() {
                    return Ok(vec![]);
                }
                return Ok(vec![Token::PlainText(slice)]);
            }
        };
    }
}

#[cfg(test)]
pub(crate) mod tests {
    use crate::lexer::Lexer;
    use crate::tokens::ControlWord::{Ansi, Bold, ColorBlue, ColorNumber, ColorRed, FontNumber, FontSize, FontTable, Italic, Par, Pard, Rtf, Underline, Unicode, Unknown};
    use crate::tokens::Property::*;
    use crate::tokens::Token::*;
    use crate::tokens::{ControlWord, Property};

    #[test]
    fn simple_tokenize_test() {
        let tokens = Lexer::tokenize(r"\b Words in bold").unwrap();
        assert_eq!(tokens, vec![ControlSymbol((Bold, None)), PlainText("Words in bold")]);
    }

    #[test]
    fn scan_entire_file_test() {
        let tokens = Lexer::scan(r#"{ \rtf1\ansi{\fonttbl\f0\fswiss Helvetica;}\f0\pard Voici du texte en {\b gras}.\par }"#);
        assert_eq!(
            tokens.unwrap(),
            vec![
                OpeningBracket,
                ControlSymbol((Rtf, Value(1))),
                ControlSymbol((Ansi, None)),
                OpeningBracket,
                ControlSymbol((FontTable, None)),
                ControlSymbol((FontNumber, Value(0))),
                ControlSymbol((Unknown("\\fswiss"), None)),
                PlainText("Helvetica;"),
                ClosingBracket,
                ControlSymbol((FontNumber, Value(0))),
                ControlSymbol((Pard, None)),
                PlainText("Voici du texte en "),
                OpeningBracket,
                ControlSymbol((Bold, None)),
                PlainText("gras"),
                ClosingBracket,
                PlainText("."),
                ControlSymbol((Par, None)),
                ClosingBracket,
            ]
        );
    }

    #[test]
    fn scan_escaped_text() {
        let tokens = Lexer::scan(
            r#"\f0\fs24 \cf0 test de code \
if (a == b) \{\
 test();\
\} else \{\
 return;\
\}}"#,
        );
        assert_eq!(
            tokens.unwrap(),
            vec![
                ControlSymbol((FontNumber, Value(0))),
                ControlSymbol((FontSize, Value(24))),
                ControlSymbol((ColorNumber, Value(0))),
                PlainText("test de code "),
                CRLF,
                PlainText("if (a == b) "),
                PlainText("{"),
                CRLF,
                PlainText(" test();"),
                CRLF,
                PlainText("} else "),
                PlainText("{"),
                CRLF,
                PlainText(" return;"),
                CRLF,
                PlainText("}"),
                ClosingBracket
            ],
        );
    }

    #[test]
    fn scan_ignorable_destination() {
        let text = r"{\*\expandedcolortbl;;}";
        let tokens = Lexer::scan(text);
        assert_eq!(
            tokens.unwrap(),
            vec![OpeningBracket, IgnorableDestination, ControlSymbol((Unknown(r"\expandedcolortbl;"), None)), ClosingBracket]
        )
    }

    #[test]
    fn should_parse_control_symbol_ending_semicolon() {
        let text = r"{\red255\blue255;}";
        let tokens = Lexer::scan(text);
        assert_eq!(
            tokens.unwrap(),
            vec![OpeningBracket, ControlSymbol((ColorRed, Value(255))), ControlSymbol((ColorBlue, Value(255))), ClosingBracket]
        );
    }

    #[test]
    fn lex_with_leading_whitespaces() {
        let rtf_content = "\t {\\rtf1 }\n ";
        let tokens = Lexer::scan(rtf_content).unwrap();
        assert_eq!(tokens, vec![OpeningBracket, ControlSymbol((Rtf, Value(1))), ClosingBracket]);
    }

    #[test]
    fn should_parse_line_return() {
        let text = r#"{\partightenfactor0

\fs24 \cf0 Font size 12,
\f0\b bold text. \ul Underline,bold text.\
 }"#;
        let tokens = Lexer::scan(text).unwrap();
        assert_eq!(
            tokens,
            [
                OpeningBracket,
                ControlSymbol((Unknown("\\partightenfactor"), Value(0))),
                ControlSymbol((FontSize, Value(24))),
                ControlSymbol((ColorNumber, Value(0))),
                PlainText("Font size 12,"),
                ControlSymbol((FontNumber, Value(0))),
                ControlSymbol((Bold, None)),
                PlainText("bold text. "),
                ControlSymbol((Underline, None)),
                PlainText("Underline,bold text."),
                CRLF,
                ClosingBracket
            ]
        );
    }

    #[test]
    fn space_after_control_word() {
        let text = r"{in{\i cred}ible}";
        let tokens = Lexer::scan(text).unwrap();
        assert_eq!(
            tokens,
            [OpeningBracket, PlainText("in"), OpeningBracket, ControlSymbol((Italic, None)), PlainText("cred"), ClosingBracket, PlainText("ible"), ClosingBracket]
        )
    }

    #[test]
    fn should_handle_escaped_char() {
        let rtf = r"{je suis une b\'eate}";
        let tokens = Lexer::scan(rtf).unwrap();
        assert_eq!(
            tokens,
            [OpeningBracket, PlainText("je suis une b"), ControlSymbol((Unicode, Value(234))), PlainText("te"), ClosingBracket]
        );
    }
}