// rtf_parser/tokens.rs

use std::any::type_name;
use std::convert::TryFrom;
use std::fmt;

use crate::lexer::LexerError;
use crate::parser::ParserError;

8/// Parser representation of an RTF token
9#[allow(dead_code)]
10#[derive(PartialEq, Eq, Clone)]
11pub enum Token<'a> {
12    PlainText(&'a str),
13    OpeningBracket,
14    ClosingBracket,
15    CRLF,                 // Line-return \n
16    IgnorableDestination, // \*\ <destination-name>
17    ControlSymbol(ControlSymbol<'a>),
18    Empty, // Used by the parser for optimization
19}
20
21#[allow(dead_code)]
22impl<'a> fmt::Debug for Token<'a> {
23    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
24        #[rustfmt::skip]
25        return match self {
26            Token::PlainText(text)        => write!(f, r"PlainText : {:?}", *text),
27            Token::OpeningBracket         => write!(f, "OpeningBracket"),
28            Token::ClosingBracket         => write!(f, "ClosingBracket"),
29            Token::CRLF                   => write!(f, "CRLF"),
30            Token::IgnorableDestination   => write!(f, "IgnorableDestination"),
31            Token::ControlSymbol(symbol)  => write!(f, "ControlSymbol : {:?}", symbol),
32            Token::Empty                  => write!(f, "Empty"),
33        };
34    }
35}
36
37/// A control symbol is a pair (control_word, property)
38/// In the RTF specification, it refers to 'control word entity'
39pub type ControlSymbol<'a> = (ControlWord<'a>, Property);
40
/// Parameter attached to a control word.
#[allow(dead_code)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Property {
    /// Switched on (`1`).
    On,
    /// Switched off (`0`).
    Off,
    /// Numeric parameter. Specified as `i16` in the 1.5 spec, but some software
    /// uses `i32` (e.g. TextEdit for unicode), so the wider type is stored.
    Value(i32),
    /// No parameter.
    None,
}

51impl Property {
52    pub fn as_bool(&self) -> bool {
53        match self {
54            Property::On => true,
55            Property::Off => false,
56            Property::None => true,
57            Property::Value(val) => *val == 1,
58        }
59    }
60
61    // Retrieve and cast the i32 value to a specific numeric type
62    pub fn get_value_as<T: TryFrom<i32>>(&self) -> Result<T, ParserError> {
63        let error: Result<T, ParserError> = Err(ParserError::ValueCastError(type_name::<T>().to_string()));
64        if let Property::Value(value) = &self {
65            return T::try_from(*value).or(error);
66        }
67        // If no value, returns 0
68        return T::try_from(0).or(error);
69    }
70
71    // Default variant
72    pub fn get_value(&self) -> i32 {
73        return self.get_value_as::<i32>().expect("i32 to i32 conversion should never fail");
74    }
75
76    /// Return the u16 corresponding value of the unicode
77    pub fn get_unicode_value(&self) -> Result<u16, ParserError> {
78        // RTF control words generally accept signed 16-bit numbers as arguments.
79        // For this reason, Unicode values greater than 32767 must be expressed as negative numbers.
80        let mut offset = 0;
81        if let Property::Value(value) = &self {
82            if *value < 0 {
83                offset = 65_536;
84            }
85            return u16::try_from(value + offset).or(Err(ParserError::UnicodeParsingError(*value)));
86        }
87        return Err(ParserError::UnicodeParsingError(0));
88    }
89}
90
/// Control words recognised by the lexer.
///
/// Each variant is listed with the RTF control word it is produced from;
/// anything else is kept verbatim in [`ControlWord::Unknown`].
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ControlWord<'a> {
    /// `\rtf`
    Rtf,
    /// `\ansi`
    Ansi,

    /// `\u`
    Unicode,
    /// `\uc`
    UnicodeIgnoreCount,

    /// `\fonttbl`
    FontTable,
    /// `\fcharset`
    FontCharset,
    /// `\f`
    FontNumber,
    /// `\fs` - expressed in half points.
    FontSize,
    /// `\cf`
    ColorNumber,

    /// `\colortbl`
    ColorTable,
    /// `\filetbl`
    FileTable,
    /// `\stylesheet`
    StyleSheet,

    /// `\i`
    Italic,
    /// `\b`
    Bold,
    /// `\ul`
    Underline,
    /// `\ulnone`
    UnderlineNone,
    /// `\super` - e.g. the "th" in 5th.
    Superscript,
    /// `\sub` - e.g. the "2" in H2O.
    Subscript,
    /// `\scaps`
    Smallcaps,
    /// `\strike`
    Strikethrough,

    /// `\par` - new paragraph.
    Par,
    /// `\pard` - resets to default paragraph properties.
    Pard,
    /// `\sectd`
    Sectd,
    /// `\plain`
    Plain,
    /// `\s` - designates paragraph style. If a paragraph style is specified,
    /// style properties must be specified with the paragraph. N references an
    /// entry in the stylesheet.
    ParStyle,
    /// `\pardeftab` - tab width.
    ParDefTab,

    // Paragraph indent
    /// `\fi`
    FirstLineIdent,
    /// `\li`
    LeftIndent,
    /// `\ri`
    RightIndent,

    // Paragraph alignment
    /// `\ql`
    LeftAligned,
    /// `\qr`
    RightAligned,
    /// `\qc`
    Center,
    /// `\qj`
    Justify,

    // Paragraph spacing
    /// `\sb`
    SpaceBefore,
    /// `\sa`
    SpaceAfter,
    /// `\sl`
    SpaceBetweenLine,
    /// `\slmul` - line spacing multiple. Indicates that the current line
    /// spacing is a multiple of "Single" line spacing. This control word can
    /// follow only the `\sl` control word and works in conjunction with it.
    SpaceLineMul,

    /// `\red`
    ColorRed,
    /// `\green`
    ColorGreen,
    /// `\blue`
    ColorBlue,

    /// Any other control word; holds its text with the trailing numeric
    /// parameter stripped.
    Unknown(&'a str),
}

146impl<'a> ControlWord<'a> {
147    // https://www.biblioscape.com/rtf15_spec.htm
148    // version 1.5 should be compatible with 1.9
149    pub fn from(input: &str) -> Result<ControlSymbol, LexerError> {
150        // Loop backward the string to get the number
151        let mut it = input.chars().rev();
152        let mut suffix_index = 0;
153        while let Some(c) = it.next() {
154            match c {
155                '0'..='9' | '-' => {
156                    suffix_index += 1;
157                }
158                _ => break,
159            }
160        }
161
162        // f0 -> prefix: f, suffix: 0
163        let index = input.len() - suffix_index;
164        let prefix = &input[..index];
165        let suffix = &input[index..];
166
167        let property = if suffix == "" {
168            Property::None
169        } else {
170            let Ok(value) = suffix.parse::<i32>() else {
171                return Err(LexerError::Error(format!("[Lexer] Unable to parse {} as integer", &suffix)));
172            };
173            Property::Value(value)
174        };
175
176        #[rustfmt::skip]
177        let control_word = match prefix {
178            r"\rtf"           => ControlWord::Rtf,
179            r"\ansi"          => ControlWord::Ansi,
180            // Unicode
181            r"\u"             => ControlWord::Unicode,
182            r"\uc"            => ControlWord::UnicodeIgnoreCount,
183            // Header
184            r"\fonttbl"       => ControlWord::FontTable,
185            r"\colortbl"      => ControlWord::ColorTable,
186            r"\filetbl"       => ControlWord::FileTable,
187            r"\stylesheet"    => ControlWord::StyleSheet,
188            // Font
189            r"\fcharset"      => ControlWord::FontCharset,
190            r"\f"             => ControlWord::FontNumber,
191            r"\fs"            => ControlWord::FontSize,
192            r"\cf"            => ControlWord::ColorNumber,
193            // Format
194            r"\i"             => ControlWord::Italic,
195            r"\b"             => ControlWord::Bold,
196            r"\ul"            => ControlWord::Underline,
197            r"\ulnone"        => ControlWord::UnderlineNone,
198            r"\super"         => ControlWord::Superscript,
199            r"\sub"           => ControlWord::Subscript,
200            r"\scaps"         => ControlWord::Smallcaps,
201            r"\strike"        => ControlWord::Strikethrough,
202            // Paragraph
203            r"\par"           => ControlWord::Par,
204            r"\pard"          => ControlWord::Pard,
205            r"\sectd"         => ControlWord::Sectd,
206            r"\plain"         => ControlWord::Plain,
207            r"\s"             => ControlWord::ParStyle,
208            r"\pardeftab"     => ControlWord::ParDefTab,
209            // Paragraph alignment
210            r"\ql"            => ControlWord::LeftAligned,
211            r"\qr"            => ControlWord::RightAligned,
212            r"\qj"            => ControlWord::Justify,
213            r"\qc"            => ControlWord::Center,
214            // Paragraph indent
215            r"\fi"             => ControlWord::FirstLineIdent,
216            r"\ri"            => ControlWord::RightIndent,
217            r"\li"            => ControlWord::LeftIndent,
218            // Paragraph Spacing
219            r"\sb"            => ControlWord::SpaceBefore,
220            r"\sa"            => ControlWord::SpaceAfter,
221            r"\sl"            => ControlWord::SpaceBetweenLine,
222            r"\slmul"         => ControlWord::SpaceLineMul,
223            r"\red"           => ControlWord::ColorRed,
224            r"\green"         => ControlWord::ColorGreen,
225            r"\blue"          => ControlWord::ColorBlue,
226            // Unknown
227            _                 => ControlWord::Unknown(prefix),
228        };
229        return Ok((control_word, property));
230    }
231}
232
/// Unit tests for [`ControlWord::from`].
#[cfg(test)]
mod tests {
    use crate::tokens::{ControlWord, Property};

    /// A positive numeric suffix becomes `Property::Value`.
    #[test]
    fn control_word_from_input_test() {
        let input = r"\rtf1";
        assert_eq!(ControlWord::from(input).unwrap(), (ControlWord::Rtf, Property::Value(1)));
    }

    /// A leading `-` in the suffix yields a negative value.
    #[test]
    fn control_word_with_negative_parameter() {
        let input = r"\rtf-1";
        assert_eq!(ControlWord::from(input).unwrap(), (ControlWord::Rtf, Property::Value(-1)));
    }

    /// Without a numeric suffix the property is `Property::None`.
    #[test]
    fn control_word_without_parameter() {
        let input = r"\par";
        assert_eq!(ControlWord::from(input).unwrap(), (ControlWord::Par, Property::None));
    }

    /// Unrecognised words keep their text, minus the numeric suffix.
    #[test]
    fn unknown_control_word() {
        let input = r"\foo12";
        assert_eq!(
            ControlWord::from(input).unwrap(),
            (ControlWord::Unknown(r"\foo"), Property::Value(12))
        );
    }

    /// A suffix that is not a valid integer is reported as an error.
    #[test]
    fn control_word_with_malformed_parameter() {
        let input = r"\rtf1-2";
        assert!(ControlWord::from(input).is_err());
    }
}