1use std::collections::HashMap;
2use std::{fmt, mem};
3
4use serde::{Deserialize, Serialize};
5#[cfg(feature = "jsbindings")]
6use wasm_bindgen::prelude::wasm_bindgen;
7
8use crate::document::RtfDocument;
9use crate::header::{CharacterSet, Color, ColorRef, ColorTable, Font, FontFamily, FontRef, FontTable, RtfHeader, StyleSheet};
10use crate::paragraph::{Alignment, Paragraph, SpaceBetweenLine};
11use crate::tokens::{ControlWord, Property, Token};
12
/// Expands to a `&Token::ControlSymbol((ControlWord::<cw>, <property>))` pattern/expression.
///
/// * `header_control_word!(Word)` accepts any property (`_`), so it is only usable as a pattern.
/// * `header_control_word!(Word, Prop)` pins the property to `Property::Prop`; this form is used
///   both as a `match` pattern and as the right-hand side of `!=` comparisons in this file.
macro_rules! header_control_word {
    ($cw:ident) => {
        &Token::ControlSymbol((ControlWord::$cw, _))
    };
    ($cw:ident, $prop:ident) => {
        &Token::ControlSymbol((ControlWord::$cw, Property::$prop))
    };
}
22
/// A run of text together with the character and paragraph formatting it was emitted with.
///
/// The parser appends to the last block of the document body while the formatting stays the
/// same, and starts a new block as soon as the painter or the paragraph differs.
#[derive(Debug, Default, PartialEq, Clone, Deserialize, Serialize)]
#[cfg_attr(feature = "jsbindings", wasm_bindgen(getter_with_clone))]
pub struct StyleBlock {
    /// Character-level formatting of the run.
    pub painter: Painter,
    /// Paragraph-level formatting of the run.
    pub paragraph: Paragraph,
    /// Text content of the run.
    pub text: String,
}
30
/// Character-level formatting state (color, font and text decorations).
#[derive(Debug, Clone, PartialEq, Hash, Deserialize, Serialize)]
#[cfg_attr(feature = "jsbindings", wasm_bindgen)]
pub struct Painter {
    /// Key of the active color; set by the `ColorNumber` control word.
    pub color_ref: ColorRef,
    /// Key of the active font; set by the `FontNumber` control word.
    pub font_ref: FontRef,
    /// Value carried by the `FontSize` control word (defaults to 12).
    // NOTE(review): the unit is not visible in this file (RTF usually uses half-points) — confirm.
    pub font_size: u16,
    /// Set by the `Bold` control word.
    pub bold: bool,
    /// Set by the `Italic` control word.
    pub italic: bool,
    /// Set by the `Underline` control word and cleared by `UnderlineNone`.
    pub underline: bool,
    /// Set by the `Superscript` control word.
    pub superscript: bool,
    /// Set by the `Subscript` control word.
    pub subscript: bool,
    /// Set by the `Smallcaps` control word.
    pub smallcaps: bool,
    /// Set by the `Strikethrough` control word.
    pub strike: bool,
}
45
46impl Default for Painter {
47 fn default() -> Self {
48 Self {
49 color_ref: Default::default(),
50 font_ref: Default::default(),
51 font_size: 12,
52 bold: Default::default(),
53 italic: Default::default(),
54 underline: Default::default(),
55 superscript: Default::default(),
56 subscript: Default::default(),
57 smallcaps: Default::default(),
58 strike: Default::default(),
59 }
60 }
61}
62
/// Errors reported while turning a token stream into a document.
#[derive(Debug, Clone)]
pub enum ParserError {
    /// A token was found where another one was required; the payload is the full message.
    InvalidToken(String),
    /// An ignorable destination token was still present while parsing the body.
    IgnorableDestinationParsingError,
    /// The formatting state stack was empty when a state was needed (unbalanced brackets).
    MalformedPainterStack,
    /// A font number control word did not carry a usable value.
    InvalidFontIdentifier(Property),
    /// A color identifier was invalid (not produced by the code in this file).
    InvalidColorIdentifier(Property),
    /// The token list was exhausted where a token was required.
    NoMoreToken,
    /// An `i32` value could not be cast to the type named by the payload.
    ValueCastError(String),
    /// The given value could not be interpreted as unicode.
    UnicodeParsingError(i32),
    /// The body parser met a `Token::Empty` placeholder, which should already have been skipped.
    ParseEmptyToken,
}
75
// Marker impl: `Debug` and `Display` provide everything `std::error::Error` needs here.
impl std::error::Error for ParserError {}
77
78impl fmt::Display for ParserError {
79 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80 let _ = write!(f, "[RTF Parser] : ");
81 return match self {
82 ParserError::InvalidToken(msg) => write!(f, "{}", msg),
83 ParserError::IgnorableDestinationParsingError => write!(f, "No ignorable destination should be left"),
84 ParserError::MalformedPainterStack => write!(f, "Malformed painter stack : Unbalanced number of brackets"),
85 ParserError::InvalidFontIdentifier(property) => write!(f, "Invalid font identifier : {:?}", property),
86 ParserError::InvalidColorIdentifier(property) => write!(f, "Invalid color identifier : {:?}", property),
87 ParserError::NoMoreToken => write!(f, "No more token to parse"),
88 ParserError::ValueCastError(_type) => write!(f, "Unable to cast i32 to {_type}"),
89 ParserError::UnicodeParsingError(value) => write!(f, "Unable to parse {value} value to unicode"),
90 ParserError::ParseEmptyToken => write!(f, "Try to parse an empty token, this should never happen. If so, please open an issue in the github repository"),
91 };
92 }
93}
94
/// Formatting state attached to one nesting level of the document.
///
/// The body parser keeps a stack of these: an opening bracket pushes a copy of the top state and
/// a closing bracket pops it.
#[derive(Debug, Clone, PartialEq, Hash)]
struct ParserState {
    /// Character formatting currently in effect.
    pub painter: Painter,
    /// Paragraph formatting currently in effect.
    pub paragraph: Paragraph,
    /// Value of the last `UnicodeIgnoreCount` control word (1 by default); bounds how many
    /// consecutive code units not greater than 255 are dropped after a unicode value.
    pub unicode_ignore_count: i32,
}
102
103impl Default for ParserState {
104 fn default() -> Self {
105 Self {
106 painter: Default::default(),
107 paragraph: Default::default(),
108 unicode_ignore_count: 1,
109 }
110 }
111}
112
/// Turns a list of lexer tokens into an `RtfDocument`.
pub struct Parser<'a> {
    /// Tokens to parse; a consumed token is replaced in place by `Token::Empty`.
    tokens: Vec<Token<'a>>,
    /// One flag per token, set to `true` once the token at that index has been consumed.
    parsed_item: Vec<bool>,
    /// Index of the next token looked at or consumed by the cursor-based helpers.
    cursor: usize,
}
118
119impl<'a> Parser<'a> {
120 pub fn new(tokens: Vec<Token<'a>>) -> Self {
121 return Self {
122 parsed_item: vec![false; tokens.len()],
123 tokens,
124 cursor: 0,
125 };
126 }
127
128 pub fn get_tokens(&self) -> Vec<&Token> {
129 return self.tokens.iter().filter(|t| *t != &Token::Empty).collect();
131 }
132
133 fn check_document_validity(&self) -> Result<(), ParserError> {
134 if let Some(token) = self.tokens.first() {
136 if token != &Token::OpeningBracket {
137 return Err(ParserError::InvalidToken(format!("Invalid first token : {:?} not a '{{'", token)));
138 }
139 } else {
140 return Err(ParserError::NoMoreToken);
141 }
142 if let Some(token) = self.tokens.last() {
143 if token != &Token::ClosingBracket {
144 return Err(ParserError::InvalidToken(format!("Invalid last token : {:?} not a '}}'", token)));
145 }
146 } else {
147 return Err(ParserError::NoMoreToken);
148 }
149 return Ok(());
150 }
151
152 pub fn parse(&mut self) -> Result<RtfDocument, ParserError> {
153 self.check_document_validity()?;
154 let mut document = RtfDocument::default(); document.header = self.parse_header()?;
157 let mut state_stack: Vec<ParserState> = vec![ParserState::default()];
159 let len = self.tokens.len();
161 let mut i = 0;
162
163 while i < len {
164 if self.parsed_item[i] {
165 i += 1;
167 continue;
168 }
169 let token = &self.tokens[i];
170
171 match token {
172 Token::OpeningBracket => {
173 if let Some(last_state) = state_stack.last() {
174 state_stack.push(last_state.clone()); } else {
176 state_stack.push(ParserState::default());
177 }
178 }
179 Token::ClosingBracket => {
180 let state = state_stack.pop();
181 if state.is_none() {
182 return Err(ParserError::MalformedPainterStack);
183 }
184 }
185 Token::ControlSymbol((control_word, property)) => {
186 let Some(current_state) = state_stack.last_mut() else {
187 return Err(ParserError::MalformedPainterStack);
188 };
189 let current_painter = &mut current_state.painter;
190 let paragraph = &mut current_state.paragraph;
191 #[rustfmt::skip] match control_word {
193 ControlWord::ColorNumber => current_painter.color_ref = property.get_value_as::<ColorRef>()?,
194 ControlWord::FontNumber => current_painter.font_ref = property.get_value_as::<FontRef>()?,
195 ControlWord::FontSize => current_painter.font_size = property.get_value_as::<u16>()?,
196 ControlWord::Bold => current_painter.bold = property.as_bool(),
197 ControlWord::Italic => current_painter.italic = property.as_bool(),
198 ControlWord::Underline => current_painter.underline = property.as_bool(),
199 ControlWord::UnderlineNone => current_painter.underline = false,
200 ControlWord::Superscript => current_painter.superscript = property.as_bool(),
201 ControlWord::Subscript => current_painter.subscript = property.as_bool(),
202 ControlWord::Smallcaps => current_painter.smallcaps = property.as_bool(),
203 ControlWord::Strikethrough => current_painter.strike = property.as_bool(),
204 ControlWord::Pard => *paragraph = Paragraph::default(), ControlWord::Plain => *current_painter = Painter::default(), ControlWord::ParDefTab => paragraph.tab_width = property.get_value(),
208 ControlWord::LeftAligned
209 | ControlWord::RightAligned
210 | ControlWord::Center
211 | ControlWord::Justify => paragraph.alignment = Alignment::from(control_word),
212 ControlWord::SpaceBefore => paragraph.spacing.before = property.get_value(),
213 ControlWord::SpaceAfter => paragraph.spacing.after = property.get_value(),
214 ControlWord::SpaceBetweenLine => paragraph.spacing.between_line = SpaceBetweenLine::from(property.get_value()),
215 ControlWord::SpaceLineMul => paragraph.spacing.line_multiplier = property.get_value(),
216 ControlWord::UnicodeIgnoreCount => current_state.unicode_ignore_count = property.get_value(),
217 ControlWord::Unicode => {
218 let mut unicodes = Vec::with_capacity(current_state.unicode_ignore_count as usize + 1); if let Ok(unicode) = property.get_unicode_value() {
220 unicodes.push(unicode);
221 }
222 while i + 1 < len {
224 if let Token::ControlSymbol((ControlWord::Unicode, property)) = &self.tokens[i + 1] {
226 if let Ok(unicode) = property.get_unicode_value() {
227 unicodes.push(unicode);
228 }
229 i += 1;
230 } else {
231 break;
232 }
233 }
234 if unicodes.len() > 0 {
235 let mut ignore_mask = vec![true; unicodes.len()];
237 let mut ignore_counter = 0;
238 for i in 1..unicodes.len() {
239 if unicodes[i] <= 255 && ignore_counter < current_state.unicode_ignore_count {
240 ignore_counter += 1;
241 ignore_mask[i] = false;
242 } else {
243 ignore_counter = 0;
244 }
245 }
246 let mut ignore_mask_iter = ignore_mask.iter();
247 unicodes.retain(|_| *ignore_mask_iter.next().unwrap());
248 let str = String::from_utf16(unicodes.as_slice()).unwrap();
250 Self::add_text_to_document(&str, &state_stack, &mut document)?;
251 }
252 }
253 ControlWord::Emdash => Self::add_text_to_document("\u{2014}", &state_stack, &mut document)?,
255 ControlWord::Endash => Self::add_text_to_document("\u{2013}", &state_stack, &mut document)?,
256 ControlWord::Bullet => Self::add_text_to_document("\u{2022}", &state_stack, &mut document)?,
257 ControlWord::LeftSingleQuote => Self::add_text_to_document("\u{2018}", &state_stack, &mut document)?,
258 ControlWord::RightSingleQuote => Self::add_text_to_document("\u{2019}", &state_stack, &mut document)?,
259 ControlWord::LeftDoubleQuote => Self::add_text_to_document("\u{201C}", &state_stack, &mut document)?,
260 ControlWord::RightDoubleQuote => Self::add_text_to_document("\u{201D}", &state_stack, &mut document)?,
261 ControlWord::Tab => Self::add_text_to_document("\t", &state_stack, &mut document)?,
262 ControlWord::Line => Self::add_text_to_document("\n", &state_stack, &mut document)?,
263 _ => {}
265 };
266 }
267 Token::PlainText(text) => Self::add_text_to_document(*text, &state_stack, &mut document)?,
268 Token::CRLF => Self::add_text_to_document("\n", &state_stack, &mut document)?,
269 Token::IgnorableDestination => {
270 return Err(ParserError::IgnorableDestinationParsingError);
271 }
272 Token::Empty => return Err(ParserError::ParseEmptyToken),
273 };
274 i += 1;
275 }
276 return Ok(document);
277 }
278
279 fn add_text_to_document(text: &str, state_stack: &Vec<ParserState>, document: &mut RtfDocument) -> Result<(), ParserError> {
280 let Some(current_state) = state_stack.last() else {
281 return Err(ParserError::MalformedPainterStack);
282 };
283 let current_painter = ¤t_state.painter;
284 let paragraph = ¤t_state.paragraph;
285 let last_style_group = document.body.last_mut();
286 if let Some(group) = last_style_group {
288 if group.painter.eq(current_painter) && group.paragraph.eq(¶graph) {
289 group.text.push_str(text);
290 return Ok(());
291 }
292 }
293 document.body.push(StyleBlock {
295 painter: current_painter.clone(),
296 paragraph: paragraph.clone(),
297 text: String::from(text),
298 });
299 return Ok(());
300 }
301
302 fn get_token_at(&'a self, index: usize) -> Option<&'a Token<'a>> {
303 return self.tokens.get(index);
304 }
305
306 fn get_next_token(&'a self) -> Option<&'a Token<'a>> {
308 return self.get_token_at(self.cursor);
309 }
310
311 #[inline]
312 fn consume_token_at(&mut self, index: usize) -> Option<Token<'a>> {
313 if self.tokens.is_empty() || index >= self.tokens.len() {
314 return None;
315 }
316 self.cursor += 1;
318 self.parsed_item[index] = true;
319 return Some(mem::replace(&mut self.tokens[index], Token::Empty));
320 }
321
322 fn consume_next_token(&mut self) -> Option<Token<'a>> {
323 return self.consume_token_at(self.cursor);
324 }
325
326 fn _consume_tokens_until(&mut self, reference_token: &Token<'a>) -> Vec<Token<'a>> {
328 let mut ret = vec![];
329 let token_type_id = mem::discriminant(reference_token);
330 while let Some(token) = self.consume_next_token() {
331 let type_id = mem::discriminant(&token);
332 ret.push(token);
333 if type_id == token_type_id {
334 break;
335 }
336 }
337 return ret;
338 }
339
340 fn consume_tokens_until_matching_bracket(&mut self) -> Vec<Token<'a>> {
342 let mut ret = vec![];
343 let mut count = 0;
344 while let Some(token) = self.consume_next_token() {
345 match token {
346 Token::OpeningBracket => count += 1,
347 Token::ClosingBracket => count -= 1,
348 _ => {}
349 }
350 ret.push(token);
351 if count < 0 {
352 break;
353 }
354 }
355 return ret;
356 }
357
358 fn consume_group(&mut self) -> Vec<Token<'a>> {
360 self.consume_token_at(self.cursor); return self.consume_tokens_until_matching_bracket();
363 }
364
    /// Scans the tokens from the start and extracts the header tables.
    ///
    /// The cursor is reset to 0, then the loop looks at pairs made of the token under the cursor
    /// and the next token that is not a CRLF. A group opened on an ignorable destination, a font
    /// table, a color table or a stylesheet is consumed as a whole and handed to the matching
    /// sub-parser; any other token is tested as a character set and the cursor moves on by one.
    /// The loop stops when fewer than two tokens remain from the cursor.
    ///
    /// # Errors
    /// Propagates the errors of the font table, color table and stylesheet parsers.
    fn parse_header(&mut self) -> Result<RtfHeader, ParserError> {
        self.cursor = 0;
        let mut header = RtfHeader::default();
        while let (Some(token), Some(mut next_token)) = (self.get_token_at(self.cursor), self.get_token_at(self.cursor + 1)) {
            // Skip the CRLF tokens that may sit between the bracket and the destination keyword.
            let mut i = 0;
            while *next_token == Token::CRLF {
                if let Some(next_token_not_crlf) = self.get_token_at(self.cursor + 1 + i) {
                    next_token = next_token_not_crlf;
                    i += 1;
                } else {
                    break;
                }
            }
            match (token, next_token) {
                (Token::OpeningBracket, Token::IgnorableDestination) => {
                    let ignore_group_tokens = self.consume_group();
                    Self::parse_ignore_groups(&ignore_group_tokens);
                }
                (Token::OpeningBracket, header_control_word!(FontTable, None)) => {
                    let font_table_tokens = self.consume_group();
                    header.font_table = Self::parse_font_table(&font_table_tokens)?;
                }
                (Token::OpeningBracket, header_control_word!(ColorTable, None)) => {
                    let color_table_tokens = self.consume_group();
                    header.color_table = Self::parse_color_table(&color_table_tokens)?;
                }
                (Token::OpeningBracket, header_control_word!(StyleSheet, None)) => {
                    let stylesheet_tokens = self.consume_group();
                    header.stylesheet = Self::parse_stylesheet(&stylesheet_tokens)?;
                }
                (token, _) => {
                    // Not a header group: record the character set if the token names one.
                    if let Some(charset) = CharacterSet::from(token) {
                        header.character_set = charset;
                    }
                    self.cursor += 1;
                }
            }
        }
        return Ok(header);
    }
409
410 fn parse_font_table(font_tables_tokens: &Vec<Token<'a>>) -> Result<FontTable, ParserError> {
411 let Some(font_table_first_token) = font_tables_tokens.get(0) else {
412 return Err(ParserError::NoMoreToken);
413 };
414 if font_table_first_token != header_control_word!(FontTable, None) {
415 return Err(ParserError::InvalidToken(format!("{:?} is not a FontTable token", font_table_first_token)));
416 }
417 let mut table = HashMap::new();
418 let mut current_key = 0;
419 let mut current_font = Font::default();
420 for token in font_tables_tokens.iter() {
421 match token {
422 Token::ControlSymbol((control_word, property)) => match control_word {
423 ControlWord::FontNumber => {
424 table.insert(current_key, current_font.clone());
426 if let Property::Value(key) = property {
427 current_key = *key as FontRef;
428 } else {
429 return Err(ParserError::InvalidFontIdentifier(*property));
430 }
431 }
432 ControlWord::Unknown(name) => {
433 if let Some(font_family) = FontFamily::from(name) {
434 current_font.font_family = font_family;
435 }
436 }
437 _ => {}
438 },
439 Token::PlainText(name) => {
440 current_font.name = name.trim_end_matches(';').to_string();
441 }
442 Token::ClosingBracket => {
443 table.insert(current_key, current_font.clone());
444 } _ => {}
446 }
447 }
448 return Ok(table);
449 }
450
451 fn parse_color_table(color_table_tokens: &Vec<Token<'a>>) -> Result<ColorTable, ParserError> {
452 let Some(color_table_first_token) = color_table_tokens.get(0) else {
453 return Err(ParserError::NoMoreToken);
454 };
455 if color_table_first_token != header_control_word!(ColorTable, None) {
456 return Err(ParserError::InvalidToken(format!("ParserError: {:?} is not a ColorTable token", color_table_first_token)));
457 }
458 let mut table = HashMap::new();
459 let mut current_key = 1;
460 let mut current_color = Color::default();
461 for token in color_table_tokens.iter() {
462 match token {
463 Token::ControlSymbol((control_word, property)) => match control_word {
464 ControlWord::ColorRed => current_color.red = property.get_value_as::<u8>()?,
465 ControlWord::ColorGreen => current_color.green = property.get_value_as::<u8>()?,
466 ControlWord::ColorBlue => {
467 current_color.blue = property.get_value_as::<u8>()?;
468 table.insert(current_key, current_color.clone());
469 current_key += 1;
470 }
471 _ => {}
472 },
473 _ => {}
474 }
475 }
476 return Ok(table);
477 }
478
479 fn parse_stylesheet(_stylesheet_tokens: &Vec<Token<'a>>) -> Result<StyleSheet, ParserError> {
480 return Ok(StyleSheet::from([]));
482 }
483
    /// Placeholder for ignorable destination groups: the tokens are accepted and discarded.
    fn parse_ignore_groups(_tokens: &Vec<Token<'a>>) {
    }
487}
488
489#[cfg(test)]
490pub mod tests {
491 use super::*;
492 use crate::header::CharacterSet::*;
493 use crate::header::FontFamily::*;
494 use crate::header::RtfHeader;
495 use crate::include_test_file;
496 use crate::lexer::Lexer;
497
498 #[test]
499 fn parser_header() {
500 let tokens = Lexer::scan(r#"{ \rtf1\ansi{\fonttbl\f0\fswiss Helvetica;}\f0\pard Voici du texte en {\b gras}.\par }"#).unwrap();
501 let doc = Parser::new(tokens).parse().unwrap();
502 assert_eq!(
503 doc.header,
504 RtfHeader {
505 character_set: Ansi,
506 font_table: FontTable::from([(
507 0,
508 Font {
509 name: "Helvetica".into(),
510 character_set: 0,
511 font_family: Swiss
512 }
513 )]),
514 ..RtfHeader::default()
515 }
516 );
517 assert_eq!(
518 doc.body,
519 [
520 StyleBlock {
521 painter: Painter::default(),
522 paragraph: Default::default(),
523 text: "Voici du texte en ".into(),
524 },
525 StyleBlock {
526 painter: Painter { bold: true, ..Painter::default() },
527 paragraph: Default::default(),
528 text: "gras".into(),
529 },
530 StyleBlock {
531 painter: Painter::default(),
532 paragraph: Default::default(),
533 text: ".".into(),
534 },
535 ]
536 );
537 }
538
    /// Smoke test: a multi-line document with font and color tables must parse without error.
    #[test]
    fn parse_multiline_document() {
        let document = r"{\rtf1\ansi\deff0 {\fonttbl {\f0 Courier;}{\f1 ProFontWindows;}}
 {\colortbl;\red0\green0\blue0;\red255\green0\blue0;\red255\green255\blue0;}
 This line is font 0 which is courier\line
 \f1
 This line is font 1\line
 \f0
 This line is font 0 again\line
 This line has a \cf2 red \cf1 word\line
 \highlight3 while this line has a \cf2 red \cf1 word and is highlighted in yellow\highlight0\line
 Finally, back to the default color.\line
 }";
        let tokens = Lexer::scan(document).unwrap();
        // Only the absence of a parsing error is checked here.
        let _doc = Parser::new(tokens).parse().unwrap();
    }
555
556 #[test]
557 fn parse_entire_file_header() {
558 let file_content = include_test_file!("test-file.rtf");
559 let tokens = Lexer::scan(file_content).unwrap();
560 let doc = Parser::new(tokens).parse().unwrap();
561 assert_eq!(
562 doc.header,
563 RtfHeader {
564 character_set: Ansi,
565 font_table: FontTable::from([
566 (
567 0,
568 Font {
569 name: "Helvetica".into(),
570 character_set: 0,
571 font_family: Swiss,
572 }
573 ),
574 (
575 1,
576 Font {
577 name: "Helvetica-Bold".into(),
578 character_set: 0,
579 font_family: Swiss,
580 }
581 )
582 ]),
583 color_table: ColorTable::from([(1, Color { red: 255, green: 255, blue: 255 }),]),
584 ..RtfHeader::default()
585 }
586 );
587 }
588
589 #[test]
590 fn parse_ignore_group() {
591 let rtf = r"{\*\expandedcolortbl;;}";
592 let tokens = Lexer::scan(rtf).unwrap();
593 let mut parser = Parser::new(tokens);
594 let document = parser.parse().unwrap();
595 assert_eq!(parser.get_tokens(), Vec::<&Token>::new()); assert_eq!(document.header, RtfHeader::default());
597 }
598
    /// Same as `parse_ignore_group`, with escaped line breaks between the bracket and the
    /// ignorable destination marker.
    #[test]
    fn parse_ignore_group_with_crlf() {
        let rtf = r"{\
 \
 \*\expandedcolortbl;;}";
        let tokens = Lexer::scan(rtf).unwrap();
        let mut parser = Parser::new(tokens);
        let document = parser.parse().unwrap();
        // Every token must have been consumed by the header parser.
        assert_eq!(parser.get_tokens(), Vec::<&Token>::new());
        assert_eq!(document.header, RtfHeader::default());
    }
610
611 #[test]
612 #[ignore] fn parse_whitespaces() {
614 let file_content = include_test_file!("list-item.rtf");
615 let tokens = Lexer::scan(file_content).unwrap();
616 let mut parser = Parser::new(tokens);
617 let document = parser.parse().unwrap();
618 assert_eq!(
619 document.body,
620 vec![StyleBlock {
621 painter: Painter { font_size: 24, ..Painter::default() },
622 paragraph: Default::default(),
623 text: "\nEmpty start\n\nList test : \n - item 1\n - item 2\n - item 3\n - item 4".into(),
624 },]
625 );
626 }
627
628 #[test]
629 fn parse_image_data() {
630 let rtf_content = include_test_file!("file-with-image.rtf");
632 let tokens = Lexer::scan(rtf_content).unwrap();
633 let _document = Parser::new(tokens).parse();
634 }
635
    /// A document with header tables followed by formatted text yields the expected text for
    /// its first four body blocks.
    #[test]
    fn parse_header_and_body() {
        let rtf = r#"{\rtf1\ansi\ansicpg1252\cocoartf2639
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\froman\fcharset0 Times-Bold;\f1\froman\fcharset0 Times-Roman;\f2\froman\fcharset0 Times-Italic;
\f3\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;\red0\green0\blue10;\red0\green0\blue1;\red191\green191\blue191;
}
\f0\b\fs21 \cf2 Lorem ipsum
\fs56 \
\pard\pardeftab709\sl288\slmult1\sa225\qj\partightenfactor0

\f1\b0\fs21 \cf0 \
\pard\pardeftab709\fi-432\ri-1\sb240\sa120\partightenfactor0
\ls1\ilvl0
\f0\b\fs36\cf2\plain Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \
\pard\pardeftab709\sl288\slmult1\sa225\qj\partightenfactor0
}"#;
        let tokens = Lexer::scan(rtf).unwrap();
        let document = Parser::new(tokens).parse().unwrap();
        // Each escaped line break ends up in a block of its own because the formatting changes.
        assert_eq!(document.body[0].text, "Lorem ipsum");
        assert_eq!(document.body[1].text, "\n");
        assert_eq!(document.body[2].text, "\n");
        assert_eq!(document.body[3].text, "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \n");
    }
660
    /// The alignment control words `\qc`, `\qj` and `\ql` set the paragraph alignment of the
    /// block they apply to.
    #[test]
    fn parse_paragraph_aligment() {
        let rtf = r#"{\rtf1\ansi\deff0 {\fonttbl {\f0 Times;}}
 \fs34
 {\pard \qc \fs60 Annalium Romae\par}
 {\pard \qj
 Urbem Romam a principio reges habuere; libertatem et
 \par}
 {\pard \ql
 Non Cinnae, non Sullae longa dominatio; et Pompei Crassique potentia
 \par}"#;
        let tokens = Lexer::scan(rtf).unwrap();
        let document = Parser::new(tokens).parse().unwrap();
        assert_eq!(document.body[0].paragraph.alignment, Alignment::Center);
        assert_eq!(document.body[1].paragraph.alignment, Alignment::Justify);
        assert_eq!(document.body[2].paragraph.alignment, Alignment::LeftAligned);
    }
678
679 #[test]
680 fn should_parse_escaped_char() {
681 let rtf = r"{\rtf1\ansi\deff0 {\fonttbl {\f0 Times;}}je suis une b\'eate}";
682 let tokens = Lexer::scan(rtf).unwrap();
683 let document = Parser::new(tokens).parse().unwrap();
684 assert_eq!(document.body[0].text, "je suis une bête");
685 }
686
687 #[test]
688 fn parse_plain_directive() {
689 let rtf = r"{\rtf1{\fonttbl {\f0 Times;}}\f0\b\fs36\u\cf2\plain Plain text}";
690 let tokens = Lexer::scan(rtf).unwrap();
691 let document = Parser::new(tokens).parse().unwrap();
692 assert_eq!(document.body[0].painter, Painter::default());
693 }
694
    /// The color reference of the first block resolves, through the header color table, to the
    /// second color declared in the document.
    #[test]
    fn parse_color_table() {
        let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
 \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset134 PingFangSC-Regular;}
 {\colortbl;\red255\green255\blue255;\red251\green2\blue7;\red114\green44\blue253;}
 {\*\expandedcolortbl;;\cssrgb\c100000\c14913\c0;\cssrgb\c52799\c30710\c99498;}
 \f0\fs24 \cf2 A
 \f1 \cf3 B}"#;
        let tokens = Lexer::scan(rtf).unwrap();
        let document = Parser::new(tokens).parse().unwrap();
        assert_eq!(document.header.color_table.get(&document.body[0].painter.color_ref).unwrap(), &Color { red: 251, green: 2, blue: 7 });
    }
708
    /// `\ul` switches underlining on and `\ulnone` switches it off again.
    #[test]
    fn parse_underline() {
        let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
 \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
 {\colortbl;\red255\green255\blue255;}
 {\*\expandedcolortbl;;}
 \paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0
 \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0

 \f0\fs24 \cf0 \ul \ulc0 a\ulnone A}"#;
        let tokens = Lexer::scan(rtf).unwrap();
        let document = Parser::new(tokens).parse().unwrap();
        assert_eq!(&document.body[0].painter.underline, &true);
        assert_eq!(&document.body[1].painter.underline, &false);
    }
726
    /// With `\uc0`, `\u` values are decoded to characters and the text between them is kept.
    #[test]
    fn parse_unicode() {
        let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
 \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
 \f0\fs24 \cf0 \uc0\u21834 \u21834 }"#;
        let tokens = Lexer::scan(rtf).unwrap();
        let document = Parser::new(tokens).parse().unwrap();
        assert_eq!(&document.body[0].text, "啊 啊");
    }
738
    /// Two consecutive `\u` values forming a surrogate pair are decoded as a single character.
    #[test]
    fn parse_two_characters_compound_unicode() {
        let rtf = r#"{\rtf1\ansi
 \f0 a\u55357 \u56447 1 \u21834}"#;
        let tokens = Lexer::scan(rtf).unwrap();
        let document = Parser::new(tokens).parse().unwrap();
        assert_eq!(&document.body[0].text, "a👿1 啊");
    }
747
    /// Fallback bytes following a `\u` value are dropped according to the current `\uc` count,
    /// while hexadecimal escapes that are not fallbacks are kept.
    #[test]
    fn parse_unicode_with_fallback() {
        let rtf = r#"{\rtf1\ansi
 {\f0 \u-10179\'5f\u-9089\'5f}
 {\f1 \uc2\u32767\'c2\'52}
 {\f2 \uc2\u26789\'97\'73}
 {\f3 b\'eate}
 {\f4 \uc0 b\'ea\'eate}
 }"#;
        let tokens = Lexer::scan(rtf).unwrap();
        let document = Parser::new(tokens).parse().unwrap();
        assert_eq!(&document.body[0].text, "👿");
        assert_eq!(&document.body[1].text, "翿");
        assert_eq!(&document.body[2].text, "梥");
        assert_eq!(&document.body[3].text, "bête");
        assert_eq!(&document.body[4].text, "bêête");
    }
766
767 #[test]
768 fn body_starts_with_a_group() {
769 let rtf = r"{\rtf1\ansi\deff0{\fonttbl {\f0\fnil\fcharset0 Calibri;}{\f1\fnil\fcharset2 Symbol;}}{\colortbl ;}{\pard \u21435 \sb70\par}}";
770 let tokens = Lexer::scan(rtf).unwrap();
771 let _document = Parser::new(tokens).parse().unwrap();
772 }
773
774 #[test]
775 fn rtf_different_semantic() {
776 let rtf1 = r"{\rtf1 \b bold \i Bold Italic \i0 Bold again}";
777 let rtf2 = r"{\rtf1 \b bold {\i Bold Italic }Bold again}";
778 let rtf3 = r"{\rtf1 \b bold \i Bold Italic \plain\b Bold again}";
779 let doc1 = RtfDocument::try_from(rtf1).unwrap();
780 let doc2 = RtfDocument::try_from(rtf2).unwrap();
781 let doc3 = RtfDocument::try_from(rtf3).unwrap();
782 assert_eq!(doc1.body, doc2.body);
783 assert_eq!(doc3.body, doc2.body);
784 }
785
786 #[test]
787 fn parse_emdash() {
788 let rtf = r"{\rtf1\ansi hello\emdash world}";
789 let doc = RtfDocument::try_from(rtf).unwrap();
790 let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
791 assert!(text.contains("\u{2014}"), "Em-dash not found in: {}", text);
792 assert!(text.contains("hello\u{2014}world"), "Expected 'hello—world', got: {}", text);
793 }
794
795 #[test]
796 fn parse_endash() {
797 let rtf = r"{\rtf1\ansi 2020\endash 2025}";
798 let doc = RtfDocument::try_from(rtf).unwrap();
799 let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
800 assert!(text.contains("\u{2013}"), "En-dash not found in: {}", text);
801 }
802
803 #[test]
804 fn parse_smart_quotes() {
805 let rtf = r"{\rtf1\ansi \ldblquote Hello\rdblquote and \lquote hi\rquote}";
806 let doc = RtfDocument::try_from(rtf).unwrap();
807 let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
808 assert!(text.contains("\u{201C}"), "Left double quote not found");
809 assert!(text.contains("\u{201D}"), "Right double quote not found");
810 assert!(text.contains("\u{2018}"), "Left single quote not found");
811 assert!(text.contains("\u{2019}"), "Right single quote not found");
812 }
813
814 #[test]
815 fn parse_bullet() {
816 let rtf = r"{\rtf1\ansi \bullet Item one}";
817 let doc = RtfDocument::try_from(rtf).unwrap();
818 let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
819 assert!(text.contains("\u{2022}"), "Bullet not found in: {}", text);
820 }
821
822 #[test]
823 fn parse_tab_and_line() {
824 let rtf = r"{\rtf1\ansi col1\tab col2\line next}";
825 let doc = RtfDocument::try_from(rtf).unwrap();
826 let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
827 assert!(text.contains("\t"), "Tab not found in: {}", text);
828 assert!(text.contains("\n"), "Line break not found in: {}", text);
829 }
830
    /// Special characters written as control words inside running text end up at the right
    /// place in the concatenated body text.
    #[test]
    fn parse_special_chars_in_scrivener_style() {
        let rtf = r"{\rtf1\ansi\ansicpg1252\deff0
{\fonttbl{\f0\fnil\fcharset0 TimesNewRomanPSMT;}}
\f0\fs24 The transformation in reverse\emdash confident expert to tired father.\par
He said, \ldblquote Hello.\rdblquote\par}";
        let doc = RtfDocument::try_from(rtf).unwrap();
        // Concatenate every block so the checks do not depend on how the text was split.
        let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
        assert!(text.contains("reverse\u{2014}confident"),
            "Em-dash not properly placed in: {}", text);
        assert!(text.contains("\u{201C}Hello.\u{201D}"),
            "Smart quotes not properly placed in: {}", text);
    }
845}