rtf_parser/
parser.rs

1use std::collections::HashMap;
2use std::{fmt, mem};
3
4use serde::{Deserialize, Serialize};
5use wasm_bindgen::prelude::wasm_bindgen;
6
7use crate::document::RtfDocument;
8use crate::header::{CharacterSet, Color, ColorRef, ColorTable, Font, FontFamily, FontRef, FontTable, RtfHeader, StyleSheet};
9use crate::paragraph::{Alignment, Paragraph, SpaceBetweenLine};
10use crate::tokens::{ControlWord, Property, Token};
11
// Builds a reference to a `Token::ControlSymbol` for the given control word.
// It is used in `parse_header` to recognise header groups and in the table parsers
// to validate the first token of a group.
macro_rules! header_control_word {
    // Control word only: the property is a wildcard, so this form is only valid as a pattern.
    ($word:ident) => {
        &Token::ControlSymbol((ControlWord::$word, _))
    };
    // Control word with an explicit `Property` variant: valid as a pattern or as an expression.
    ($word:ident, $property:ident) => {
        &Token::ControlSymbol((ControlWord::$word, Property::$property))
    };
}
21
22#[derive(Debug, Default, PartialEq, Clone, Deserialize, Serialize)]
23#[wasm_bindgen(getter_with_clone)]
24pub struct StyleBlock {
25    pub painter: Painter,
26    pub paragraph: Paragraph,
27    pub text: String,
28}
29
30#[derive(Debug, Clone, PartialEq, Hash, Deserialize, Serialize)]
31#[wasm_bindgen]
32pub struct Painter {
33    pub color_ref: ColorRef,
34    pub font_ref: FontRef,
35    pub font_size: u16,
36    pub bold: bool,
37    pub italic: bool,
38    pub underline: bool,
39    pub superscript: bool,
40    pub subscript: bool,
41    pub smallcaps: bool,
42    pub strike: bool,
43}
44
45impl Default for Painter {
46    fn default() -> Self {
47        Self {
48            color_ref: Default::default(),
49            font_ref: Default::default(),
50            font_size: 12,
51            bold: Default::default(),
52            italic: Default::default(),
53            underline: Default::default(),
54            superscript: Default::default(),
55            subscript: Default::default(),
56            smallcaps: Default::default(),
57            strike: Default::default(),
58        }
59    }
60}
61
62#[derive(Debug, Clone)]
63pub enum ParserError {
64    InvalidToken(String),
65    IgnorableDestinationParsingError,
66    MalformedPainterStack,
67    InvalidFontIdentifier(Property),
68    InvalidColorIdentifier(Property),
69    NoMoreToken,
70    ValueCastError(String),
71    UnicodeParsingError(i32),
72    ParseEmptyToken,
73}
74
75impl std::error::Error for ParserError {}
76
77impl fmt::Display for ParserError {
78    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
79        let _ = write!(f, "[RTF Parser] : ");
80        return match self {
81            ParserError::InvalidToken(msg) => write!(f, "{}", msg),
82            ParserError::IgnorableDestinationParsingError => write!(f, "No ignorable destination should be left"),
83            ParserError::MalformedPainterStack => write!(f, "Malformed painter stack : Unbalanced number of brackets"),
84            ParserError::InvalidFontIdentifier(property) => write!(f, "Invalid font identifier : {:?}", property),
85            ParserError::InvalidColorIdentifier(property) => write!(f, "Invalid color identifier : {:?}", property),
86            ParserError::NoMoreToken => write!(f, "No more token to parse"),
87            ParserError::ValueCastError(_type) => write!(f, "Unable to cast i32 to {_type}"),
88            ParserError::UnicodeParsingError(value) => write!(f, "Unable to parse {value} value to unicode"),
89            ParserError::ParseEmptyToken => write!(f, "Try to parse an empty token, this should never happen. If so, please open an issue in the github repository"),
90        };
91    }
92}
93
94// This state keeps track of each value that depends on the scope nesting
95#[derive(Debug, Clone, PartialEq, Hash)]
96struct ParserState {
97    pub painter: Painter,
98    pub paragraph: Paragraph,
99    pub unicode_ignore_count: i32,
100}
101
102impl Default for ParserState {
103    fn default() -> Self {
104        Self {
105            painter: Default::default(),
106            paragraph: Default::default(),
107            unicode_ignore_count: 1,
108        }
109    }
110}
111
112pub struct Parser<'a> {
113    tokens: Vec<Token<'a>>,
114    parsed_item: Vec<bool>,
115    cursor: usize,
116}
117
118impl<'a> Parser<'a> {
119    pub fn new(tokens: Vec<Token<'a>>) -> Self {
120        return Self {
121            parsed_item: vec![false; tokens.len()],
122            tokens,
123            cursor: 0,
124        };
125    }
126
127    pub fn get_tokens(&self) -> Vec<&Token> {
128        // It ignores the empty tokens, that replaced already parsed tokens istead of deleting them for performance reasons
129        return self.tokens.iter().filter(|t| *t != &Token::Empty).collect();
130    }
131
132    fn check_document_validity(&self) -> Result<(), ParserError> {
133        // Check the document boundaries
134        if let Some(token) = self.tokens.first() {
135            if token != &Token::OpeningBracket {
136                return Err(ParserError::InvalidToken(format!("Invalid first token : {:?} not a '{{'", token)));
137            }
138        } else {
139            return Err(ParserError::NoMoreToken);
140        }
141        if let Some(token) = self.tokens.last() {
142            if token != &Token::ClosingBracket {
143                return Err(ParserError::InvalidToken(format!("Invalid last token : {:?} not a '}}'", token)));
144            }
145        } else {
146            return Err(ParserError::NoMoreToken);
147        }
148        return Ok(());
149    }
150
151    pub fn parse(&mut self) -> Result<RtfDocument, ParserError> {
152        self.check_document_validity()?;
153        let mut document = RtfDocument::default(); // Init empty document
154                                                   // Traverse the document and consume the header groups (FontTable, StyleSheet, etc ...)
155        document.header = self.parse_header()?;
156        // Init the state of the docuement. the stack is used to keep track of the different scope changes.
157        let mut state_stack: Vec<ParserState> = vec![ParserState::default()];
158        // Parse the body
159        let len = self.tokens.len();
160        let mut i = 0;
161
162        while i < len {
163            if self.parsed_item[i] {
164                // The item already has been parsed
165                i += 1;
166                continue;
167            }
168            let token = &self.tokens[i];
169
170            match token {
171                Token::OpeningBracket => {
172                    if let Some(last_state) = state_stack.last() {
173                        state_stack.push(last_state.clone()); // Inherit from the last state properties
174                    } else {
175                        state_stack.push(ParserState::default());
176                    }
177                }
178                Token::ClosingBracket => {
179                    let state = state_stack.pop();
180                    if state.is_none() {
181                        return Err(ParserError::MalformedPainterStack);
182                    }
183                }
184                Token::ControlSymbol((control_word, property)) => {
185                    let Some(current_state) = state_stack.last_mut() else {
186                        return Err(ParserError::MalformedPainterStack);
187                    };
188                    let current_painter = &mut current_state.painter;
189                    let paragraph = &mut current_state.paragraph;
190                    #[rustfmt::skip]  // For now, rustfmt does not support this kind of alignement
191                    match control_word {
192                        ControlWord::ColorNumber        => current_painter.color_ref = property.get_value_as::<ColorRef>()?,
193                        ControlWord::FontNumber         => current_painter.font_ref = property.get_value_as::<FontRef>()?,
194                        ControlWord::FontSize           => current_painter.font_size = property.get_value_as::<u16>()?,
195                        ControlWord::Bold               => current_painter.bold = property.as_bool(),
196                        ControlWord::Italic             => current_painter.italic = property.as_bool(),
197                        ControlWord::Underline          => current_painter.underline = property.as_bool(),
198                        ControlWord::UnderlineNone      => current_painter.underline = false,
199                        ControlWord::Superscript        => current_painter.superscript = property.as_bool(),
200                        ControlWord::Subscript          => current_painter.subscript = property.as_bool(),
201                        ControlWord::Smallcaps          => current_painter.smallcaps = property.as_bool(),
202                        ControlWord::Strikethrough      => current_painter.strike = property.as_bool(),
203                        // Paragraph
204                        ControlWord::Pard               => *paragraph = Paragraph::default(), // Reset the par
205                        ControlWord::Plain              => *current_painter = Painter::default(), // Reset the painter
206                        ControlWord::ParDefTab          => paragraph.tab_width = property.get_value(),
207                        ControlWord::LeftAligned
208                            | ControlWord::RightAligned
209                            | ControlWord::Center
210                            | ControlWord::Justify      => paragraph.alignment = Alignment::from(control_word),
211                        ControlWord::SpaceBefore        => paragraph.spacing.before = property.get_value(),
212                        ControlWord::SpaceAfter         => paragraph.spacing.after = property.get_value(),
213                        ControlWord::SpaceBetweenLine   => paragraph.spacing.between_line = SpaceBetweenLine::from(property.get_value()),
214                        ControlWord::SpaceLineMul       => paragraph.spacing.line_multiplier = property.get_value(),
215                        ControlWord::UnicodeIgnoreCount => current_state.unicode_ignore_count = property.get_value(),
216                        ControlWord::Unicode            => {
217                            let mut unicodes = Vec::with_capacity(current_state.unicode_ignore_count as usize + 1); // try to avoid realocation due to fallback unicodes
218                            if let Ok(unicode) = property.get_unicode_value() {
219                                unicodes.push(unicode);
220                            }
221                            // Get the following unicode in case of compounds characters
222                            while i + 1 < len {
223                                // We should not check if the tokens has already been parsed, because we are looking for the following token in the document
224                                if let Token::ControlSymbol((ControlWord::Unicode, property)) = &self.tokens[i + 1] {
225                                    if let Ok(unicode) = property.get_unicode_value() {
226                                        unicodes.push(unicode);
227                                    }
228                                    i += 1;
229                                } else {
230                                    break;
231                                }
232                            }
233                            if unicodes.len() > 0 {
234                                // Handle the fallback unicode (\uc2 \u0000 'FA 'FB)
235                                let mut ignore_mask = vec![true; unicodes.len()];
236                                let mut ignore_counter = 0;
237                                for i in 1..unicodes.len() {
238                                    if unicodes[i] <= 255 && ignore_counter < current_state.unicode_ignore_count {
239                                        ignore_counter += 1;
240                                        ignore_mask[i] = false;
241                                    } else {
242                                        ignore_counter = 0;
243                                    }
244                                }
245                                let mut ignore_mask_iter = ignore_mask.iter();
246                                unicodes.retain(|_| *ignore_mask_iter.next().unwrap());
247                                // Convert the unicode to string
248                                let str = String::from_utf16(unicodes.as_slice()).unwrap();
249                                Self::add_text_to_document(&str, &state_stack, &mut document)?;
250                            }
251                        }
252                        // Others tokens
253                        _ => {}
254                    };
255                }
256                Token::PlainText(text) => Self::add_text_to_document(*text, &state_stack, &mut document)?,
257                Token::CRLF => Self::add_text_to_document("\n", &state_stack, &mut document)?,
258                Token::IgnorableDestination => {
259                    return Err(ParserError::IgnorableDestinationParsingError);
260                }
261                Token::Empty => return Err(ParserError::ParseEmptyToken),
262            };
263            i += 1;
264        }
265        return Ok(document);
266    }
267
268    fn add_text_to_document(text: &str, state_stack: &Vec<ParserState>, document: &mut RtfDocument) -> Result<(), ParserError> {
269        let Some(current_state) = state_stack.last() else {
270            return Err(ParserError::MalformedPainterStack);
271        };
272        let current_painter = &current_state.painter;
273        let paragraph = &current_state.paragraph;
274        let last_style_group = document.body.last_mut();
275        // If the painter is the same as the previous one, merge the two block.
276        if let Some(group) = last_style_group {
277            if group.painter.eq(current_painter) && group.paragraph.eq(&paragraph) {
278                group.text.push_str(text);
279                return Ok(());
280            }
281        }
282        // Else, push another StyleBlock on the stack with its own painter
283        document.body.push(StyleBlock {
284            painter: current_painter.clone(),
285            paragraph: paragraph.clone(),
286            text: String::from(text),
287        });
288        return Ok(());
289    }
290
291    fn get_token_at(&'a self, index: usize) -> Option<&'a Token<'a>> {
292        return self.tokens.get(index);
293    }
294
295    // Get a view of the next token after cursor
296    fn get_next_token(&'a self) -> Option<&'a Token<'a>> {
297        return self.get_token_at(self.cursor);
298    }
299
300    #[inline]
301    fn consume_token_at(&mut self, index: usize) -> Option<Token<'a>> {
302        if self.tokens.is_empty() || index >= self.tokens.len() {
303            return None;
304        }
305        // PERF : vec.remove can require reallocation unlike this method
306        self.cursor += 1;
307        self.parsed_item[index] = true;
308        return Some(mem::replace(&mut self.tokens[index], Token::Empty));
309    }
310
311    fn consume_next_token(&mut self) -> Option<Token<'a>> {
312        return self.consume_token_at(self.cursor);
313    }
314
315    // Consume token from cursor to <reference-token>
316    fn _consume_tokens_until(&mut self, reference_token: &Token<'a>) -> Vec<Token<'a>> {
317        let mut ret = vec![];
318        let token_type_id = mem::discriminant(reference_token);
319        while let Some(token) = self.consume_next_token() {
320            let type_id = mem::discriminant(&token);
321            ret.push(token);
322            if type_id == token_type_id {
323                break;
324            }
325        }
326        return ret;
327    }
328
329    // The opening bracket should already be consumed
330    fn consume_tokens_until_matching_bracket(&mut self) -> Vec<Token<'a>> {
331        let mut ret = vec![];
332        let mut count = 0;
333        while let Some(token) = self.consume_next_token() {
334            match token {
335                Token::OpeningBracket => count += 1,
336                Token::ClosingBracket => count -= 1,
337                _ => {}
338            }
339            ret.push(token);
340            if count < 0 {
341                break;
342            }
343        }
344        return ret;
345    }
346
347    // Consume all the tokens inside a group ({ ... }) and returns the includes ones
348    fn consume_group(&mut self) -> Vec<Token<'a>> {
349        // TODO: check the the token at cursor is indeed an OpeningBracket
350        self.consume_token_at(self.cursor); // Consume the opening bracket
351        return self.consume_tokens_until_matching_bracket();
352    }
353
354    // Consume all tokens until the header is read
355    fn parse_header(&mut self) -> Result<RtfHeader, ParserError> {
356        self.cursor = 0; // Reset the cursor
357        let mut header = RtfHeader::default();
358        while let (Some(token), Some(mut next_token)) = (self.get_token_at(self.cursor), self.get_token_at(self.cursor + 1)) {
359            // Manage the case where there is CRLF between { and control_word
360            // {\n /*/ignoregroup }
361            let mut i = 0;
362            while *next_token == Token::CRLF {
363                if let Some(next_token_not_crlf) = self.get_token_at(self.cursor + 1 + i) {
364                    next_token = next_token_not_crlf;
365                    i += 1;
366                } else {
367                    break;
368                }
369            }
370            match (token, next_token) {
371                (Token::OpeningBracket, Token::IgnorableDestination) => {
372                    let ignore_group_tokens = self.consume_group();
373                    Self::parse_ignore_groups(&ignore_group_tokens);
374                }
375                (Token::OpeningBracket, header_control_word!(FontTable, None)) => {
376                    let font_table_tokens = self.consume_group();
377                    header.font_table = Self::parse_font_table(&font_table_tokens)?;
378                }
379                (Token::OpeningBracket, header_control_word!(ColorTable, None)) => {
380                    let color_table_tokens = self.consume_group();
381                    header.color_table = Self::parse_color_table(&color_table_tokens)?;
382                }
383                (Token::OpeningBracket, header_control_word!(StyleSheet, None)) => {
384                    let stylesheet_tokens = self.consume_group();
385                    header.stylesheet = Self::parse_stylesheet(&stylesheet_tokens)?;
386                }
387                // Check and consume token
388                (token, _) => {
389                    if let Some(charset) = CharacterSet::from(token) {
390                        header.character_set = charset;
391                    }
392                    self.cursor += 1;
393                }
394            }
395        }
396        return Ok(header);
397    }
398
399    fn parse_font_table(font_tables_tokens: &Vec<Token<'a>>) -> Result<FontTable, ParserError> {
400        let Some(font_table_first_token) = font_tables_tokens.get(0) else {
401            return Err(ParserError::NoMoreToken);
402        };
403        if font_table_first_token != header_control_word!(FontTable, None) {
404            return Err(ParserError::InvalidToken(format!("{:?} is not a FontTable token", font_table_first_token)));
405        }
406        let mut table = HashMap::new();
407        let mut current_key = 0;
408        let mut current_font = Font::default();
409        for token in font_tables_tokens.iter() {
410            match token {
411                Token::ControlSymbol((control_word, property)) => match control_word {
412                    ControlWord::FontNumber => {
413                        // Insert previous font
414                        table.insert(current_key, current_font.clone());
415                        if let Property::Value(key) = property {
416                            current_key = *key as FontRef;
417                        } else {
418                            return Err(ParserError::InvalidFontIdentifier(*property));
419                        }
420                    }
421                    ControlWord::Unknown(name) => {
422                        if let Some(font_family) = FontFamily::from(name) {
423                            current_font.font_family = font_family;
424                        }
425                    }
426                    _ => {}
427                },
428                Token::PlainText(name) => {
429                    current_font.name = name.trim_end_matches(';').to_string();
430                }
431                Token::ClosingBracket => {
432                    table.insert(current_key, current_font.clone());
433                } // Insert previous font
434                _ => {}
435            }
436        }
437        return Ok(table);
438    }
439
440    fn parse_color_table(color_table_tokens: &Vec<Token<'a>>) -> Result<ColorTable, ParserError> {
441        let Some(color_table_first_token) = color_table_tokens.get(0) else {
442            return Err(ParserError::NoMoreToken);
443        };
444        if color_table_first_token != header_control_word!(ColorTable, None) {
445            return Err(ParserError::InvalidToken(format!("ParserError: {:?} is not a ColorTable token", color_table_first_token)));
446        }
447        let mut table = HashMap::new();
448        let mut current_key = 1;
449        let mut current_color = Color::default();
450        for token in color_table_tokens.iter() {
451            match token {
452                Token::ControlSymbol((control_word, property)) => match control_word {
453                    ControlWord::ColorRed => current_color.red = property.get_value_as::<u8>()?,
454                    ControlWord::ColorGreen => current_color.green = property.get_value_as::<u8>()?,
455                    ControlWord::ColorBlue => {
456                        current_color.blue = property.get_value_as::<u8>()?;
457                        table.insert(current_key, current_color.clone());
458                        current_key += 1;
459                    }
460                    _ => {}
461                },
462                _ => {}
463            }
464        }
465        return Ok(table);
466    }
467
468    fn parse_stylesheet(_stylesheet_tokens: &Vec<Token<'a>>) -> Result<StyleSheet, ParserError> {
469        // TODO : parse the stylesheet
470        return Ok(StyleSheet::from([]));
471    }
472
473    fn parse_ignore_groups(_tokens: &Vec<Token<'a>>) {
474        // Do nothing for now
475    }
476}
477
478#[cfg(test)]
479pub mod tests {
480    use super::*;
481    use crate::header::CharacterSet::*;
482    use crate::header::FontFamily::*;
483    use crate::header::RtfHeader;
484    use crate::include_test_file;
485    use crate::lexer::Lexer;
486
487    #[test]
488    fn parser_header() {
489        let tokens = Lexer::scan(r#"{ \rtf1\ansi{\fonttbl\f0\fswiss Helvetica;}\f0\pard Voici du texte en {\b gras}.\par }"#).unwrap();
490        let doc = Parser::new(tokens).parse().unwrap();
491        assert_eq!(
492            doc.header,
493            RtfHeader {
494                character_set: Ansi,
495                font_table: FontTable::from([(
496                    0,
497                    Font {
498                        name: "Helvetica".into(),
499                        character_set: 0,
500                        font_family: Swiss
501                    }
502                )]),
503                ..RtfHeader::default()
504            }
505        );
506        assert_eq!(
507            doc.body,
508            [
509                StyleBlock {
510                    painter: Painter::default(),
511                    paragraph: Default::default(),
512                    text: "Voici du texte en ".into(),
513                },
514                StyleBlock {
515                    painter: Painter { bold: true, ..Painter::default() },
516                    paragraph: Default::default(),
517                    text: "gras".into(),
518                },
519                StyleBlock {
520                    painter: Painter::default(),
521                    paragraph: Default::default(),
522                    text: ".".into(),
523                },
524            ]
525        );
526    }
527
528    #[test]
529    fn parse_multiline_document() {
530        let document = r"{\rtf1\ansi\deff0 {\fonttbl {\f0 Courier;}{\f1 ProFontWindows;}}
531            {\colortbl;\red0\green0\blue0;\red255\green0\blue0;\red255\green255\blue0;}
532            This line is font 0 which is courier\line
533            \f1
534            This line is font 1\line
535            \f0
536            This line is font 0 again\line
537            This line has a \cf2 red \cf1 word\line
538            \highlight3 while this line has a \cf2 red \cf1 word and is highlighted in yellow\highlight0\line
539            Finally, back to the default color.\line
540            }";
541        let tokens = Lexer::scan(document).unwrap();
542        let _doc = Parser::new(tokens).parse().unwrap();
543    }
544
545    #[test]
546    fn parse_entire_file_header() {
547        let file_content = include_test_file!("test-file.rtf");
548        let tokens = Lexer::scan(file_content).unwrap();
549        let doc = Parser::new(tokens).parse().unwrap();
550        assert_eq!(
551            doc.header,
552            RtfHeader {
553                character_set: Ansi,
554                font_table: FontTable::from([
555                    (
556                        0,
557                        Font {
558                            name: "Helvetica".into(),
559                            character_set: 0,
560                            font_family: Swiss,
561                        }
562                    ),
563                    (
564                        1,
565                        Font {
566                            name: "Helvetica-Bold".into(),
567                            character_set: 0,
568                            font_family: Swiss,
569                        }
570                    )
571                ]),
572                color_table: ColorTable::from([(1, Color { red: 255, green: 255, blue: 255 }),]),
573                ..RtfHeader::default()
574            }
575        );
576    }
577
578    #[test]
579    fn parse_ignore_group() {
580        let rtf = r"{\*\expandedcolortbl;;}";
581        let tokens = Lexer::scan(rtf).unwrap();
582        let mut parser = Parser::new(tokens);
583        let document = parser.parse().unwrap();
584        assert_eq!(parser.get_tokens(), Vec::<&Token>::new()); // Should have consumed all the tokens
585        assert_eq!(document.header, RtfHeader::default());
586    }
587
588    #[test]
589    fn parse_ignore_group_with_crlf() {
590        let rtf = r"{\
591        \
592        \*\expandedcolortbl;;}";
593        let tokens = Lexer::scan(rtf).unwrap();
594        let mut parser = Parser::new(tokens);
595        let document = parser.parse().unwrap();
596        assert_eq!(parser.get_tokens(), Vec::<&Token>::new()); // Should have consumed all the tokens
597        assert_eq!(document.header, RtfHeader::default());
598    }
599
600    #[test]
601    fn parse_whitespaces() {
602        let file_content = include_test_file!("list-item.rtf");
603        let tokens = Lexer::scan(file_content).unwrap();
604        let mut parser = Parser::new(tokens);
605        let document = parser.parse().unwrap();
606        assert_eq!(
607            document.body,
608            vec![StyleBlock {
609                painter: Painter { font_size: 24, ..Painter::default() },
610                paragraph: Default::default(),
611                text: "\nEmpty start\n\nList test : \n - item 1\n - item 2\n - item 3\n - item 4".into(),
612            },]
613        );
614    }
615
616    #[test]
617    fn parse_image_data() {
618        // Try to parse without error
619        let rtf_content = include_test_file!("file-with-image.rtf");
620        let tokens = Lexer::scan(rtf_content).unwrap();
621        let _document = Parser::new(tokens).parse();
622    }
623
624    #[test]
625    fn parse_header_and_body() {
626        let rtf = r#"{\rtf1\ansi\ansicpg1252\cocoartf2639
627\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\froman\fcharset0 Times-Bold;\f1\froman\fcharset0 Times-Roman;\f2\froman\fcharset0 Times-Italic;
628\f3\fswiss\fcharset0 Helvetica;}
629{\colortbl;\red255\green255\blue255;\red0\green0\blue10;\red0\green0\blue1;\red191\green191\blue191;
630}
631\f0\b\fs21 \cf2 Lorem ipsum
632\fs56 \
633\pard\pardeftab709\sl288\slmult1\sa225\qj\partightenfactor0
634
635\f1\b0\fs21 \cf0 \
636\pard\pardeftab709\fi-432\ri-1\sb240\sa120\partightenfactor0
637\ls1\ilvl0
638\f0\b\fs36\cf2\plain Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \
639\pard\pardeftab709\sl288\slmult1\sa225\qj\partightenfactor0
640}"#;
641        let tokens = Lexer::scan(rtf).unwrap();
642        let document = Parser::new(tokens).parse().unwrap();
643        assert_eq!(document.body[0].text, "Lorem ipsum");
644        assert_eq!(document.body[1].text, "\n");
645        assert_eq!(document.body[2].text, "\n");
646        assert_eq!(document.body[3].text, "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \n");
647    }
648
649    #[test]
650    fn parse_paragraph_aligment() {
651        let rtf = r#"{\rtf1\ansi\deff0 {\fonttbl {\f0 Times;}}
652        \fs34
653        {\pard \qc \fs60 Annalium Romae\par}
654        {\pard \qj
655            Urbem Romam a principio reges habuere; libertatem et
656            \par}
657        {\pard \ql
658            Non Cinnae, non Sullae longa dominatio; et Pompei Crassique potentia
659            \par}"#;
660        let tokens = Lexer::scan(rtf).unwrap();
661        let document = Parser::new(tokens).parse().unwrap();
662        assert_eq!(document.body[0].paragraph.alignment, Alignment::Center);
663        assert_eq!(document.body[1].paragraph.alignment, Alignment::Justify);
664        assert_eq!(document.body[2].paragraph.alignment, Alignment::LeftAligned);
665    }
666
667    #[test]
668    fn should_parse_escaped_char() {
669        let rtf = r"{\rtf1\ansi\deff0 {\fonttbl {\f0 Times;}}je suis une b\'eate}";
670        let tokens = Lexer::scan(rtf).unwrap();
671        let document = Parser::new(tokens).parse().unwrap();
672        assert_eq!(document.body[0].text, "je suis une bête");
673    }
674
675    #[test]
676    fn parse_plain_directive() {
677        let rtf = r"{\rtf1{\fonttbl {\f0 Times;}}\f0\b\fs36\u\cf2\plain Plain text}";
678        let tokens = Lexer::scan(rtf).unwrap();
679        let document = Parser::new(tokens).parse().unwrap();
680        assert_eq!(document.body[0].painter, Painter::default());
681    }
682
683    #[test]
684    fn parse_color_table() {
685        // cf0 is unset color, cf1 is first color, cf2 is second color, etc ...
686        let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
687            \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset134 PingFangSC-Regular;}
688            {\colortbl;\red255\green255\blue255;\red251\green2\blue7;\red114\green44\blue253;}
689            {\*\expandedcolortbl;;\cssrgb\c100000\c14913\c0;\cssrgb\c52799\c30710\c99498;}
690            \f0\fs24 \cf2 A
691            \f1 \cf3 B}"#;
692        let tokens = Lexer::scan(rtf).unwrap();
693        let document = Parser::new(tokens).parse().unwrap();
694        assert_eq!(document.header.color_table.get(&document.body[0].painter.color_ref).unwrap(), &Color { red: 251, green: 2, blue: 7 });
695    }
696
697    #[test]
698    fn parse_underline() {
699        // \\ul underline true
700        // \\ulnone underline false
701        let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
702            \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
703            {\colortbl;\red255\green255\blue255;}
704            {\*\expandedcolortbl;;}
705            \paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0
706            \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
707
708            \f0\fs24 \cf0 \ul \ulc0 a\ulnone A}"#;
709        let tokens = Lexer::scan(rtf).unwrap();
710        let document = Parser::new(tokens).parse().unwrap();
711        assert_eq!(&document.body[0].painter.underline, &true);
712        assert_eq!(&document.body[1].painter.underline, &false);
713    }
714
715    #[test]
716    fn parse_unicode() {
717        // start with \\uc0
718        // \u21834 => 啊
719        let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
720            \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
721            \f0\fs24 \cf0 \uc0\u21834  \u21834 }"#;
722        let tokens = Lexer::scan(rtf).unwrap();
723        let document = Parser::new(tokens).parse().unwrap();
724        assert_eq!(&document.body[0].text, "啊 啊");
725    }
726
727    #[test]
728    fn parse_two_characters_compound_unicode() {
729        let rtf = r#"{\rtf1\ansi
730            \f0 a\u55357 \u56447 1 \u21834}"#;
731        let tokens = Lexer::scan(rtf).unwrap();
732        let document = Parser::new(tokens).parse().unwrap();
733        assert_eq!(&document.body[0].text, "a👿1 啊");
734    }
735
736    #[test]
737    fn parse_unicode_with_fallback() {
738        // Should only consider the first unicode, not the two fallback chars
739        let rtf = r#"{\rtf1\ansi
740            {\f0 \u-10179\'5f\u-9089\'5f}
741            {\f1 \uc2\u32767\'c2\'52}
742            {\f2 \uc2\u26789\'97\'73}
743            {\f3 b\'eate}
744            {\f4 \uc0 b\'ea\'eate}
745           }"#;
746        let tokens = Lexer::scan(rtf).unwrap();
747        let document = Parser::new(tokens).parse().unwrap();
748        assert_eq!(&document.body[0].text, "👿");
749        assert_eq!(&document.body[1].text, "翿");
750        assert_eq!(&document.body[2].text, "梥");
751        assert_eq!(&document.body[3].text, "bête");
752        assert_eq!(&document.body[4].text, "bêête");
753    }
754
755    #[test]
756    fn body_starts_with_a_group() {
757        let rtf = r"{\rtf1\ansi\deff0{\fonttbl {\f0\fnil\fcharset0 Calibri;}{\f1\fnil\fcharset2 Symbol;}}{\colortbl ;}{\pard \u21435  \sb70\par}}";
758        let tokens = Lexer::scan(rtf).unwrap();
759        let _document = Parser::new(tokens).parse().unwrap();
760    }
761
762    #[test]
763    fn rtf_different_semantic() {
764        let rtf1 = r"{\rtf1 \b bold \i Bold Italic \i0 Bold again}";
765        let rtf2 = r"{\rtf1 \b bold {\i Bold Italic }Bold again}";
766        let rtf3 = r"{\rtf1 \b bold \i Bold Italic \plain\b Bold again}";
767        let doc1 = RtfDocument::try_from(rtf1).unwrap();
768        let doc2 = RtfDocument::try_from(rtf2).unwrap();
769        let doc3 = RtfDocument::try_from(rtf3).unwrap();
770        assert_eq!(doc1.body, doc2.body);
771        assert_eq!(doc3.body, doc2.body);
772    }
773}
rtf_parser/parser.rs

rtf_parser/
parser.rs