rtf_parser_tt/
parser.rs

1use std::collections::HashMap;
2use std::{fmt, mem};
3
4use serde::{Deserialize, Serialize};
5#[cfg(feature = "jsbindings")]
6use wasm_bindgen::prelude::wasm_bindgen;
7
8use crate::document::RtfDocument;
9use crate::header::{CharacterSet, Color, ColorRef, ColorTable, Font, FontFamily, FontRef, FontTable, RtfHeader, StyleSheet};
10use crate::paragraph::{Alignment, Paragraph, SpaceBetweenLine};
11use crate::tokens::{ControlWord, Property, Token};
12
// Expands to a reference to a `Token::ControlSymbol` carrying the given control word.
// - One argument: any property is accepted (the expansion contains `_`, so it is only usable as a pattern).
// - Two arguments: the property must be the given `Property` variant.
// Used by the header parsing functions to recognise and validate header groups.
macro_rules! header_control_word {
    ($word:ident) => {
        &Token::ControlSymbol((ControlWord::$word, _))
    };
    ($word:ident, $property:ident) => {
        &Token::ControlSymbol((ControlWord::$word, Property::$property))
    };
}
22
23#[derive(Debug, Default, PartialEq, Clone, Deserialize, Serialize)]
24#[cfg_attr(feature = "jsbindings", wasm_bindgen(getter_with_clone))]
25pub struct StyleBlock {
26    pub painter: Painter,
27    pub paragraph: Paragraph,
28    pub text: String,
29}
30
31#[derive(Debug, Clone, PartialEq, Hash, Deserialize, Serialize)]
32#[cfg_attr(feature = "jsbindings", wasm_bindgen)]
33pub struct Painter {
34    pub color_ref: ColorRef,
35    pub font_ref: FontRef,
36    pub font_size: u16,
37    pub bold: bool,
38    pub italic: bool,
39    pub underline: bool,
40    pub superscript: bool,
41    pub subscript: bool,
42    pub smallcaps: bool,
43    pub strike: bool,
44}
45
46impl Default for Painter {
47    fn default() -> Self {
48        Self {
49            color_ref: Default::default(),
50            font_ref: Default::default(),
51            font_size: 12,
52            bold: Default::default(),
53            italic: Default::default(),
54            underline: Default::default(),
55            superscript: Default::default(),
56            subscript: Default::default(),
57            smallcaps: Default::default(),
58            strike: Default::default(),
59        }
60    }
61}
62
63#[derive(Debug, Clone)]
64pub enum ParserError {
65    InvalidToken(String),
66    IgnorableDestinationParsingError,
67    MalformedPainterStack,
68    InvalidFontIdentifier(Property),
69    InvalidColorIdentifier(Property),
70    NoMoreToken,
71    ValueCastError(String),
72    UnicodeParsingError(i32),
73    ParseEmptyToken,
74}
75
76impl std::error::Error for ParserError {}
77
78impl fmt::Display for ParserError {
79    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80        let _ = write!(f, "[RTF Parser] : ");
81        return match self {
82            ParserError::InvalidToken(msg) => write!(f, "{}", msg),
83            ParserError::IgnorableDestinationParsingError => write!(f, "No ignorable destination should be left"),
84            ParserError::MalformedPainterStack => write!(f, "Malformed painter stack : Unbalanced number of brackets"),
85            ParserError::InvalidFontIdentifier(property) => write!(f, "Invalid font identifier : {:?}", property),
86            ParserError::InvalidColorIdentifier(property) => write!(f, "Invalid color identifier : {:?}", property),
87            ParserError::NoMoreToken => write!(f, "No more token to parse"),
88            ParserError::ValueCastError(_type) => write!(f, "Unable to cast i32 to {_type}"),
89            ParserError::UnicodeParsingError(value) => write!(f, "Unable to parse {value} value to unicode"),
90            ParserError::ParseEmptyToken => write!(f, "Try to parse an empty token, this should never happen. If so, please open an issue in the github repository"),
91        };
92    }
93}
94
95// This state keeps track of each value that depends on the scope nesting
96#[derive(Debug, Clone, PartialEq, Hash)]
97struct ParserState {
98    pub painter: Painter,
99    pub paragraph: Paragraph,
100    pub unicode_ignore_count: i32,
101}
102
103impl Default for ParserState {
104    fn default() -> Self {
105        Self {
106            painter: Default::default(),
107            paragraph: Default::default(),
108            unicode_ignore_count: 1,
109        }
110    }
111}
112
113pub struct Parser<'a> {
114    tokens: Vec<Token<'a>>,
115    parsed_item: Vec<bool>,
116    cursor: usize,
117}
118
119impl<'a> Parser<'a> {
120    pub fn new(tokens: Vec<Token<'a>>) -> Self {
121        return Self {
122            parsed_item: vec![false; tokens.len()],
123            tokens,
124            cursor: 0,
125        };
126    }
127
128    pub fn get_tokens(&self) -> Vec<&Token> {
129        // It ignores the empty tokens, that replaced already parsed tokens istead of deleting them for performance reasons
130        return self.tokens.iter().filter(|t| *t != &Token::Empty).collect();
131    }
132
133    fn check_document_validity(&self) -> Result<(), ParserError> {
134        // Check the document boundaries
135        if let Some(token) = self.tokens.first() {
136            if token != &Token::OpeningBracket {
137                return Err(ParserError::InvalidToken(format!("Invalid first token : {:?} not a '{{'", token)));
138            }
139        } else {
140            return Err(ParserError::NoMoreToken);
141        }
142        if let Some(token) = self.tokens.last() {
143            if token != &Token::ClosingBracket {
144                return Err(ParserError::InvalidToken(format!("Invalid last token : {:?} not a '}}'", token)));
145            }
146        } else {
147            return Err(ParserError::NoMoreToken);
148        }
149        return Ok(());
150    }
151
152    pub fn parse(&mut self) -> Result<RtfDocument, ParserError> {
153        self.check_document_validity()?;
154        let mut document = RtfDocument::default(); // Init empty document
155                                                   // Traverse the document and consume the header groups (FontTable, StyleSheet, etc ...)
156        document.header = self.parse_header()?;
157        // Init the state of the docuement. the stack is used to keep track of the different scope changes.
158        let mut state_stack: Vec<ParserState> = vec![ParserState::default()];
159        // Parse the body
160        let len = self.tokens.len();
161        let mut i = 0;
162
163        while i < len {
164            if self.parsed_item[i] {
165                // The item already has been parsed
166                i += 1;
167                continue;
168            }
169            let token = &self.tokens[i];
170
171            match token {
172                Token::OpeningBracket => {
173                    if let Some(last_state) = state_stack.last() {
174                        state_stack.push(last_state.clone()); // Inherit from the last state properties
175                    } else {
176                        state_stack.push(ParserState::default());
177                    }
178                }
179                Token::ClosingBracket => {
180                    let state = state_stack.pop();
181                    if state.is_none() {
182                        return Err(ParserError::MalformedPainterStack);
183                    }
184                }
185                Token::ControlSymbol((control_word, property)) => {
186                    let Some(current_state) = state_stack.last_mut() else {
187                        return Err(ParserError::MalformedPainterStack);
188                    };
189                    let current_painter = &mut current_state.painter;
190                    let paragraph = &mut current_state.paragraph;
191                    #[rustfmt::skip]  // For now, rustfmt does not support this kind of alignement
192                    match control_word {
193                        ControlWord::ColorNumber        => current_painter.color_ref = property.get_value_as::<ColorRef>()?,
194                        ControlWord::FontNumber         => current_painter.font_ref = property.get_value_as::<FontRef>()?,
195                        ControlWord::FontSize           => current_painter.font_size = property.get_value_as::<u16>()?,
196                        ControlWord::Bold               => current_painter.bold = property.as_bool(),
197                        ControlWord::Italic             => current_painter.italic = property.as_bool(),
198                        ControlWord::Underline          => current_painter.underline = property.as_bool(),
199                        ControlWord::UnderlineNone      => current_painter.underline = false,
200                        ControlWord::Superscript        => current_painter.superscript = property.as_bool(),
201                        ControlWord::Subscript          => current_painter.subscript = property.as_bool(),
202                        ControlWord::Smallcaps          => current_painter.smallcaps = property.as_bool(),
203                        ControlWord::Strikethrough      => current_painter.strike = property.as_bool(),
204                        // Paragraph
205                        ControlWord::Pard               => *paragraph = Paragraph::default(), // Reset the par
206                        ControlWord::Plain              => *current_painter = Painter::default(), // Reset the painter
207                        ControlWord::ParDefTab          => paragraph.tab_width = property.get_value(),
208                        ControlWord::LeftAligned
209                            | ControlWord::RightAligned
210                            | ControlWord::Center
211                            | ControlWord::Justify      => paragraph.alignment = Alignment::from(control_word),
212                        ControlWord::SpaceBefore        => paragraph.spacing.before = property.get_value(),
213                        ControlWord::SpaceAfter         => paragraph.spacing.after = property.get_value(),
214                        ControlWord::SpaceBetweenLine   => paragraph.spacing.between_line = SpaceBetweenLine::from(property.get_value()),
215                        ControlWord::SpaceLineMul       => paragraph.spacing.line_multiplier = property.get_value(),
216                        ControlWord::UnicodeIgnoreCount => current_state.unicode_ignore_count = property.get_value(),
217                        ControlWord::Unicode            => {
218                            let mut unicodes = Vec::with_capacity(current_state.unicode_ignore_count as usize + 1); // try to avoid realocation due to fallback unicodes
219                            if let Ok(unicode) = property.get_unicode_value() {
220                                unicodes.push(unicode);
221                            }
222                            // Get the following unicode in case of compounds characters
223                            while i + 1 < len {
224                                // We should not check if the tokens has already been parsed, because we are looking for the following token in the document
225                                if let Token::ControlSymbol((ControlWord::Unicode, property)) = &self.tokens[i + 1] {
226                                    if let Ok(unicode) = property.get_unicode_value() {
227                                        unicodes.push(unicode);
228                                    }
229                                    i += 1;
230                                } else {
231                                    break;
232                                }
233                            }
234                            if unicodes.len() > 0 {
235                                // Handle the fallback unicode (\uc2 \u0000 'FA 'FB)
236                                let mut ignore_mask = vec![true; unicodes.len()];
237                                let mut ignore_counter = 0;
238                                for i in 1..unicodes.len() {
239                                    if unicodes[i] <= 255 && ignore_counter < current_state.unicode_ignore_count {
240                                        ignore_counter += 1;
241                                        ignore_mask[i] = false;
242                                    } else {
243                                        ignore_counter = 0;
244                                    }
245                                }
246                                let mut ignore_mask_iter = ignore_mask.iter();
247                                unicodes.retain(|_| *ignore_mask_iter.next().unwrap());
248                                // Convert the unicode to string
249                                let str = String::from_utf16(unicodes.as_slice()).unwrap();
250                                Self::add_text_to_document(&str, &state_stack, &mut document)?;
251                            }
252                        }
253                        // Special characters - output as text
254                        ControlWord::Emdash           => Self::add_text_to_document("\u{2014}", &state_stack, &mut document)?,
255                        ControlWord::Endash           => Self::add_text_to_document("\u{2013}", &state_stack, &mut document)?,
256                        ControlWord::Bullet           => Self::add_text_to_document("\u{2022}", &state_stack, &mut document)?,
257                        ControlWord::LeftSingleQuote  => Self::add_text_to_document("\u{2018}", &state_stack, &mut document)?,
258                        ControlWord::RightSingleQuote => Self::add_text_to_document("\u{2019}", &state_stack, &mut document)?,
259                        ControlWord::LeftDoubleQuote  => Self::add_text_to_document("\u{201C}", &state_stack, &mut document)?,
260                        ControlWord::RightDoubleQuote => Self::add_text_to_document("\u{201D}", &state_stack, &mut document)?,
261                        ControlWord::Tab              => Self::add_text_to_document("\t", &state_stack, &mut document)?,
262                        ControlWord::Line             => Self::add_text_to_document("\n", &state_stack, &mut document)?,
263                        // Others tokens
264                        _ => {}
265                    };
266                }
267                Token::PlainText(text) => Self::add_text_to_document(*text, &state_stack, &mut document)?,
268                Token::CRLF => Self::add_text_to_document("\n", &state_stack, &mut document)?,
269                Token::IgnorableDestination => {
270                    return Err(ParserError::IgnorableDestinationParsingError);
271                }
272                Token::Empty => return Err(ParserError::ParseEmptyToken),
273            };
274            i += 1;
275        }
276        return Ok(document);
277    }
278
279    fn add_text_to_document(text: &str, state_stack: &Vec<ParserState>, document: &mut RtfDocument) -> Result<(), ParserError> {
280        let Some(current_state) = state_stack.last() else {
281            return Err(ParserError::MalformedPainterStack);
282        };
283        let current_painter = &current_state.painter;
284        let paragraph = &current_state.paragraph;
285        let last_style_group = document.body.last_mut();
286        // If the painter is the same as the previous one, merge the two block.
287        if let Some(group) = last_style_group {
288            if group.painter.eq(current_painter) && group.paragraph.eq(&paragraph) {
289                group.text.push_str(text);
290                return Ok(());
291            }
292        }
293        // Else, push another StyleBlock on the stack with its own painter
294        document.body.push(StyleBlock {
295            painter: current_painter.clone(),
296            paragraph: paragraph.clone(),
297            text: String::from(text),
298        });
299        return Ok(());
300    }
301
302    fn get_token_at(&'a self, index: usize) -> Option<&'a Token<'a>> {
303        return self.tokens.get(index);
304    }
305
306    // Get a view of the next token after cursor
307    fn get_next_token(&'a self) -> Option<&'a Token<'a>> {
308        return self.get_token_at(self.cursor);
309    }
310
311    #[inline]
312    fn consume_token_at(&mut self, index: usize) -> Option<Token<'a>> {
313        if self.tokens.is_empty() || index >= self.tokens.len() {
314            return None;
315        }
316        // PERF : vec.remove can require reallocation unlike this method
317        self.cursor += 1;
318        self.parsed_item[index] = true;
319        return Some(mem::replace(&mut self.tokens[index], Token::Empty));
320    }
321
322    fn consume_next_token(&mut self) -> Option<Token<'a>> {
323        return self.consume_token_at(self.cursor);
324    }
325
326    // Consume token from cursor to <reference-token>
327    fn _consume_tokens_until(&mut self, reference_token: &Token<'a>) -> Vec<Token<'a>> {
328        let mut ret = vec![];
329        let token_type_id = mem::discriminant(reference_token);
330        while let Some(token) = self.consume_next_token() {
331            let type_id = mem::discriminant(&token);
332            ret.push(token);
333            if type_id == token_type_id {
334                break;
335            }
336        }
337        return ret;
338    }
339
340    // The opening bracket should already be consumed
341    fn consume_tokens_until_matching_bracket(&mut self) -> Vec<Token<'a>> {
342        let mut ret = vec![];
343        let mut count = 0;
344        while let Some(token) = self.consume_next_token() {
345            match token {
346                Token::OpeningBracket => count += 1,
347                Token::ClosingBracket => count -= 1,
348                _ => {}
349            }
350            ret.push(token);
351            if count < 0 {
352                break;
353            }
354        }
355        return ret;
356    }
357
358    // Consume all the tokens inside a group ({ ... }) and returns the includes ones
359    fn consume_group(&mut self) -> Vec<Token<'a>> {
360        // TODO: check the the token at cursor is indeed an OpeningBracket
361        self.consume_token_at(self.cursor); // Consume the opening bracket
362        return self.consume_tokens_until_matching_bracket();
363    }
364
365    // Consume all tokens until the header is read
366    fn parse_header(&mut self) -> Result<RtfHeader, ParserError> {
367        self.cursor = 0; // Reset the cursor
368        let mut header = RtfHeader::default();
369        while let (Some(token), Some(mut next_token)) = (self.get_token_at(self.cursor), self.get_token_at(self.cursor + 1)) {
370            // Manage the case where there is CRLF between { and control_word
371            // {\n /*/ignoregroup }
372            let mut i = 0;
373            while *next_token == Token::CRLF {
374                if let Some(next_token_not_crlf) = self.get_token_at(self.cursor + 1 + i) {
375                    next_token = next_token_not_crlf;
376                    i += 1;
377                } else {
378                    break;
379                }
380            }
381            match (token, next_token) {
382                (Token::OpeningBracket, Token::IgnorableDestination) => {
383                    let ignore_group_tokens = self.consume_group();
384                    Self::parse_ignore_groups(&ignore_group_tokens);
385                }
386                (Token::OpeningBracket, header_control_word!(FontTable, None)) => {
387                    let font_table_tokens = self.consume_group();
388                    header.font_table = Self::parse_font_table(&font_table_tokens)?;
389                }
390                (Token::OpeningBracket, header_control_word!(ColorTable, None)) => {
391                    let color_table_tokens = self.consume_group();
392                    header.color_table = Self::parse_color_table(&color_table_tokens)?;
393                }
394                (Token::OpeningBracket, header_control_word!(StyleSheet, None)) => {
395                    let stylesheet_tokens = self.consume_group();
396                    header.stylesheet = Self::parse_stylesheet(&stylesheet_tokens)?;
397                }
398                // Check and consume token
399                (token, _) => {
400                    if let Some(charset) = CharacterSet::from(token) {
401                        header.character_set = charset;
402                    }
403                    self.cursor += 1;
404                }
405            }
406        }
407        return Ok(header);
408    }
409
410    fn parse_font_table(font_tables_tokens: &Vec<Token<'a>>) -> Result<FontTable, ParserError> {
411        let Some(font_table_first_token) = font_tables_tokens.get(0) else {
412            return Err(ParserError::NoMoreToken);
413        };
414        if font_table_first_token != header_control_word!(FontTable, None) {
415            return Err(ParserError::InvalidToken(format!("{:?} is not a FontTable token", font_table_first_token)));
416        }
417        let mut table = HashMap::new();
418        let mut current_key = 0;
419        let mut current_font = Font::default();
420        for token in font_tables_tokens.iter() {
421            match token {
422                Token::ControlSymbol((control_word, property)) => match control_word {
423                    ControlWord::FontNumber => {
424                        // Insert previous font
425                        table.insert(current_key, current_font.clone());
426                        if let Property::Value(key) = property {
427                            current_key = *key as FontRef;
428                        } else {
429                            return Err(ParserError::InvalidFontIdentifier(*property));
430                        }
431                    }
432                    ControlWord::Unknown(name) => {
433                        if let Some(font_family) = FontFamily::from(name) {
434                            current_font.font_family = font_family;
435                        }
436                    }
437                    _ => {}
438                },
439                Token::PlainText(name) => {
440                    current_font.name = name.trim_end_matches(';').to_string();
441                }
442                Token::ClosingBracket => {
443                    table.insert(current_key, current_font.clone());
444                } // Insert previous font
445                _ => {}
446            }
447        }
448        return Ok(table);
449    }
450
451    fn parse_color_table(color_table_tokens: &Vec<Token<'a>>) -> Result<ColorTable, ParserError> {
452        let Some(color_table_first_token) = color_table_tokens.get(0) else {
453            return Err(ParserError::NoMoreToken);
454        };
455        if color_table_first_token != header_control_word!(ColorTable, None) {
456            return Err(ParserError::InvalidToken(format!("ParserError: {:?} is not a ColorTable token", color_table_first_token)));
457        }
458        let mut table = HashMap::new();
459        let mut current_key = 1;
460        let mut current_color = Color::default();
461        for token in color_table_tokens.iter() {
462            match token {
463                Token::ControlSymbol((control_word, property)) => match control_word {
464                    ControlWord::ColorRed => current_color.red = property.get_value_as::<u8>()?,
465                    ControlWord::ColorGreen => current_color.green = property.get_value_as::<u8>()?,
466                    ControlWord::ColorBlue => {
467                        current_color.blue = property.get_value_as::<u8>()?;
468                        table.insert(current_key, current_color.clone());
469                        current_key += 1;
470                    }
471                    _ => {}
472                },
473                _ => {}
474            }
475        }
476        return Ok(table);
477    }
478
479    fn parse_stylesheet(_stylesheet_tokens: &Vec<Token<'a>>) -> Result<StyleSheet, ParserError> {
480        // TODO : parse the stylesheet
481        return Ok(StyleSheet::from([]));
482    }
483
484    fn parse_ignore_groups(_tokens: &Vec<Token<'a>>) {
485        // Do nothing for now
486    }
487}
488
489#[cfg(test)]
490pub mod tests {
491    use super::*;
492    use crate::header::CharacterSet::*;
493    use crate::header::FontFamily::*;
494    use crate::header::RtfHeader;
495    use crate::include_test_file;
496    use crate::lexer::Lexer;
497
498    #[test]
499    fn parser_header() {
500        let tokens = Lexer::scan(r#"{ \rtf1\ansi{\fonttbl\f0\fswiss Helvetica;}\f0\pard Voici du texte en {\b gras}.\par }"#).unwrap();
501        let doc = Parser::new(tokens).parse().unwrap();
502        assert_eq!(
503            doc.header,
504            RtfHeader {
505                character_set: Ansi,
506                font_table: FontTable::from([(
507                    0,
508                    Font {
509                        name: "Helvetica".into(),
510                        character_set: 0,
511                        font_family: Swiss
512                    }
513                )]),
514                ..RtfHeader::default()
515            }
516        );
517        assert_eq!(
518            doc.body,
519            [
520                StyleBlock {
521                    painter: Painter::default(),
522                    paragraph: Default::default(),
523                    text: "Voici du texte en ".into(),
524                },
525                StyleBlock {
526                    painter: Painter { bold: true, ..Painter::default() },
527                    paragraph: Default::default(),
528                    text: "gras".into(),
529                },
530                StyleBlock {
531                    painter: Painter::default(),
532                    paragraph: Default::default(),
533                    text: ".".into(),
534                },
535            ]
536        );
537    }
538
539    #[test]
540    fn parse_multiline_document() {
541        let document = r"{\rtf1\ansi\deff0 {\fonttbl {\f0 Courier;}{\f1 ProFontWindows;}}
542            {\colortbl;\red0\green0\blue0;\red255\green0\blue0;\red255\green255\blue0;}
543            This line is font 0 which is courier\line
544            \f1
545            This line is font 1\line
546            \f0
547            This line is font 0 again\line
548            This line has a \cf2 red \cf1 word\line
549            \highlight3 while this line has a \cf2 red \cf1 word and is highlighted in yellow\highlight0\line
550            Finally, back to the default color.\line
551            }";
552        let tokens = Lexer::scan(document).unwrap();
553        let _doc = Parser::new(tokens).parse().unwrap();
554    }
555
556    #[test]
557    fn parse_entire_file_header() {
558        let file_content = include_test_file!("test-file.rtf");
559        let tokens = Lexer::scan(file_content).unwrap();
560        let doc = Parser::new(tokens).parse().unwrap();
561        assert_eq!(
562            doc.header,
563            RtfHeader {
564                character_set: Ansi,
565                font_table: FontTable::from([
566                    (
567                        0,
568                        Font {
569                            name: "Helvetica".into(),
570                            character_set: 0,
571                            font_family: Swiss,
572                        }
573                    ),
574                    (
575                        1,
576                        Font {
577                            name: "Helvetica-Bold".into(),
578                            character_set: 0,
579                            font_family: Swiss,
580                        }
581                    )
582                ]),
583                color_table: ColorTable::from([(1, Color { red: 255, green: 255, blue: 255 }),]),
584                ..RtfHeader::default()
585            }
586        );
587    }
588
589    #[test]
590    fn parse_ignore_group() {
591        let rtf = r"{\*\expandedcolortbl;;}";
592        let tokens = Lexer::scan(rtf).unwrap();
593        let mut parser = Parser::new(tokens);
594        let document = parser.parse().unwrap();
595        assert_eq!(parser.get_tokens(), Vec::<&Token>::new()); // Should have consumed all the tokens
596        assert_eq!(document.header, RtfHeader::default());
597    }
598
599    #[test]
600    fn parse_ignore_group_with_crlf() {
601        let rtf = r"{\
602        \
603        \*\expandedcolortbl;;}";
604        let tokens = Lexer::scan(rtf).unwrap();
605        let mut parser = Parser::new(tokens);
606        let document = parser.parse().unwrap();
607        assert_eq!(parser.get_tokens(), Vec::<&Token>::new()); // Should have consumed all the tokens
608        assert_eq!(document.header, RtfHeader::default());
609    }
610
611    #[test]
612    #[ignore] // Pre-existing test failure from upstream - backslash line breaks not handled correctly
613    fn parse_whitespaces() {
614        let file_content = include_test_file!("list-item.rtf");
615        let tokens = Lexer::scan(file_content).unwrap();
616        let mut parser = Parser::new(tokens);
617        let document = parser.parse().unwrap();
618        assert_eq!(
619            document.body,
620            vec![StyleBlock {
621                painter: Painter { font_size: 24, ..Painter::default() },
622                paragraph: Default::default(),
623                text: "\nEmpty start\n\nList test : \n - item 1\n - item 2\n - item 3\n - item 4".into(),
624            },]
625        );
626    }
627
628    #[test]
629    fn parse_image_data() {
630        // Try to parse without error
631        let rtf_content = include_test_file!("file-with-image.rtf");
632        let tokens = Lexer::scan(rtf_content).unwrap();
633        let _document = Parser::new(tokens).parse();
634    }
635
636    #[test]
637    fn parse_header_and_body() {
638        let rtf = r#"{\rtf1\ansi\ansicpg1252\cocoartf2639
639\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\froman\fcharset0 Times-Bold;\f1\froman\fcharset0 Times-Roman;\f2\froman\fcharset0 Times-Italic;
640\f3\fswiss\fcharset0 Helvetica;}
641{\colortbl;\red255\green255\blue255;\red0\green0\blue10;\red0\green0\blue1;\red191\green191\blue191;
642}
643\f0\b\fs21 \cf2 Lorem ipsum
644\fs56 \
645\pard\pardeftab709\sl288\slmult1\sa225\qj\partightenfactor0
646
647\f1\b0\fs21 \cf0 \
648\pard\pardeftab709\fi-432\ri-1\sb240\sa120\partightenfactor0
649\ls1\ilvl0
650\f0\b\fs36\cf2\plain Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \
651\pard\pardeftab709\sl288\slmult1\sa225\qj\partightenfactor0
652}"#;
653        let tokens = Lexer::scan(rtf).unwrap();
654        let document = Parser::new(tokens).parse().unwrap();
655        assert_eq!(document.body[0].text, "Lorem ipsum");
656        assert_eq!(document.body[1].text, "\n");
657        assert_eq!(document.body[2].text, "\n");
658        assert_eq!(document.body[3].text, "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \n");
659    }
660
661    #[test]
662    fn parse_paragraph_aligment() {
663        let rtf = r#"{\rtf1\ansi\deff0 {\fonttbl {\f0 Times;}}
664        \fs34
665        {\pard \qc \fs60 Annalium Romae\par}
666        {\pard \qj
667            Urbem Romam a principio reges habuere; libertatem et
668            \par}
669        {\pard \ql
670            Non Cinnae, non Sullae longa dominatio; et Pompei Crassique potentia
671            \par}"#;
672        let tokens = Lexer::scan(rtf).unwrap();
673        let document = Parser::new(tokens).parse().unwrap();
674        assert_eq!(document.body[0].paragraph.alignment, Alignment::Center);
675        assert_eq!(document.body[1].paragraph.alignment, Alignment::Justify);
676        assert_eq!(document.body[2].paragraph.alignment, Alignment::LeftAligned);
677    }
678
679    #[test]
680    fn should_parse_escaped_char() {
681        let rtf = r"{\rtf1\ansi\deff0 {\fonttbl {\f0 Times;}}je suis une b\'eate}";
682        let tokens = Lexer::scan(rtf).unwrap();
683        let document = Parser::new(tokens).parse().unwrap();
684        assert_eq!(document.body[0].text, "je suis une bête");
685    }
686
687    #[test]
688    fn parse_plain_directive() {
689        let rtf = r"{\rtf1{\fonttbl {\f0 Times;}}\f0\b\fs36\u\cf2\plain Plain text}";
690        let tokens = Lexer::scan(rtf).unwrap();
691        let document = Parser::new(tokens).parse().unwrap();
692        assert_eq!(document.body[0].painter, Painter::default());
693    }
694
695    #[test]
696    fn parse_color_table() {
697        // cf0 is unset color, cf1 is first color, cf2 is second color, etc ...
698        let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
699            \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset134 PingFangSC-Regular;}
700            {\colortbl;\red255\green255\blue255;\red251\green2\blue7;\red114\green44\blue253;}
701            {\*\expandedcolortbl;;\cssrgb\c100000\c14913\c0;\cssrgb\c52799\c30710\c99498;}
702            \f0\fs24 \cf2 A
703            \f1 \cf3 B}"#;
704        let tokens = Lexer::scan(rtf).unwrap();
705        let document = Parser::new(tokens).parse().unwrap();
706        assert_eq!(document.header.color_table.get(&document.body[0].painter.color_ref).unwrap(), &Color { red: 251, green: 2, blue: 7 });
707    }
708
709    #[test]
710    fn parse_underline() {
711        // \\ul underline true
712        // \\ulnone underline false
713        let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
714            \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
715            {\colortbl;\red255\green255\blue255;}
716            {\*\expandedcolortbl;;}
717            \paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0
718            \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
719
720            \f0\fs24 \cf0 \ul \ulc0 a\ulnone A}"#;
721        let tokens = Lexer::scan(rtf).unwrap();
722        let document = Parser::new(tokens).parse().unwrap();
723        assert_eq!(&document.body[0].painter.underline, &true);
724        assert_eq!(&document.body[1].painter.underline, &false);
725    }
726
727    #[test]
728    fn parse_unicode() {
729        // start with \\uc0
730        // \u21834 => 啊
731        let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
732            \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
733            \f0\fs24 \cf0 \uc0\u21834  \u21834 }"#;
734        let tokens = Lexer::scan(rtf).unwrap();
735        let document = Parser::new(tokens).parse().unwrap();
736        assert_eq!(&document.body[0].text, "啊 啊");
737    }
738
739    #[test]
740    fn parse_two_characters_compound_unicode() {
741        let rtf = r#"{\rtf1\ansi
742            \f0 a\u55357 \u56447 1 \u21834}"#;
743        let tokens = Lexer::scan(rtf).unwrap();
744        let document = Parser::new(tokens).parse().unwrap();
745        assert_eq!(&document.body[0].text, "a👿1 啊");
746    }
747
748    #[test]
749    fn parse_unicode_with_fallback() {
750        // Should only consider the first unicode, not the two fallback chars
751        let rtf = r#"{\rtf1\ansi
752            {\f0 \u-10179\'5f\u-9089\'5f}
753            {\f1 \uc2\u32767\'c2\'52}
754            {\f2 \uc2\u26789\'97\'73}
755            {\f3 b\'eate}
756            {\f4 \uc0 b\'ea\'eate}
757           }"#;
758        let tokens = Lexer::scan(rtf).unwrap();
759        let document = Parser::new(tokens).parse().unwrap();
760        assert_eq!(&document.body[0].text, "👿");
761        assert_eq!(&document.body[1].text, "翿");
762        assert_eq!(&document.body[2].text, "梥");
763        assert_eq!(&document.body[3].text, "bête");
764        assert_eq!(&document.body[4].text, "bêête");
765    }
766
767    #[test]
768    fn body_starts_with_a_group() {
769        let rtf = r"{\rtf1\ansi\deff0{\fonttbl {\f0\fnil\fcharset0 Calibri;}{\f1\fnil\fcharset2 Symbol;}}{\colortbl ;}{\pard \u21435  \sb70\par}}";
770        let tokens = Lexer::scan(rtf).unwrap();
771        let _document = Parser::new(tokens).parse().unwrap();
772    }
773
774    #[test]
775    fn rtf_different_semantic() {
776        let rtf1 = r"{\rtf1 \b bold \i Bold Italic \i0 Bold again}";
777        let rtf2 = r"{\rtf1 \b bold {\i Bold Italic }Bold again}";
778        let rtf3 = r"{\rtf1 \b bold \i Bold Italic \plain\b Bold again}";
779        let doc1 = RtfDocument::try_from(rtf1).unwrap();
780        let doc2 = RtfDocument::try_from(rtf2).unwrap();
781        let doc3 = RtfDocument::try_from(rtf3).unwrap();
782        assert_eq!(doc1.body, doc2.body);
783        assert_eq!(doc3.body, doc2.body);
784    }
785
786    #[test]
787    fn parse_emdash() {
788        let rtf = r"{\rtf1\ansi hello\emdash world}";
789        let doc = RtfDocument::try_from(rtf).unwrap();
790        let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
791        assert!(text.contains("\u{2014}"), "Em-dash not found in: {}", text);
792        assert!(text.contains("hello\u{2014}world"), "Expected 'hello—world', got: {}", text);
793    }
794
795    #[test]
796    fn parse_endash() {
797        let rtf = r"{\rtf1\ansi 2020\endash 2025}";
798        let doc = RtfDocument::try_from(rtf).unwrap();
799        let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
800        assert!(text.contains("\u{2013}"), "En-dash not found in: {}", text);
801    }
802
803    #[test]
804    fn parse_smart_quotes() {
805        let rtf = r"{\rtf1\ansi \ldblquote Hello\rdblquote  and \lquote hi\rquote}";
806        let doc = RtfDocument::try_from(rtf).unwrap();
807        let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
808        assert!(text.contains("\u{201C}"), "Left double quote not found");
809        assert!(text.contains("\u{201D}"), "Right double quote not found");
810        assert!(text.contains("\u{2018}"), "Left single quote not found");
811        assert!(text.contains("\u{2019}"), "Right single quote not found");
812    }
813
814    #[test]
815    fn parse_bullet() {
816        let rtf = r"{\rtf1\ansi \bullet Item one}";
817        let doc = RtfDocument::try_from(rtf).unwrap();
818        let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
819        assert!(text.contains("\u{2022}"), "Bullet not found in: {}", text);
820    }
821
822    #[test]
823    fn parse_tab_and_line() {
824        let rtf = r"{\rtf1\ansi col1\tab col2\line next}";
825        let doc = RtfDocument::try_from(rtf).unwrap();
826        let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
827        assert!(text.contains("\t"), "Tab not found in: {}", text);
828        assert!(text.contains("\n"), "Line break not found in: {}", text);
829    }
830
831    #[test]
832    fn parse_special_chars_in_scrivener_style() {
833        // Simulates Scrivener RTF output
834        let rtf = r"{\rtf1\ansi\ansicpg1252\deff0
835{\fonttbl{\f0\fnil\fcharset0 TimesNewRomanPSMT;}}
836\f0\fs24 The transformation in reverse\emdash confident expert to tired father.\par
837He said, \ldblquote Hello.\rdblquote\par}";
838        let doc = RtfDocument::try_from(rtf).unwrap();
839        let text: String = doc.body.iter().map(|b| b.text.as_str()).collect();
840        assert!(text.contains("reverse\u{2014}confident"),
841            "Em-dash not properly placed in: {}", text);
842        assert!(text.contains("\u{201C}Hello.\u{201D}"),
843            "Smart quotes not properly placed in: {}", text);
844    }
845}