rtf_parser_tt/
tokens.rs

1use std::any::type_name;
2use std::convert::TryFrom;
3use std::fmt;
4
5use crate::lexer::LexerError;
6use crate::parser::ParserError;
7
8/// Parser representation of an RTF token
9#[allow(dead_code)]
10#[derive(PartialEq, Eq, Clone)]
11pub enum Token<'a> {
12    PlainText(&'a str),
13    OpeningBracket,
14    ClosingBracket,
15    CRLF,                 // Line-return \n
16    IgnorableDestination, // \*\ <destination-name>
17    ControlSymbol(ControlSymbol<'a>),
18    Empty, // Used by the parser for optimization
19}
20
21#[allow(dead_code)]
22impl<'a> fmt::Debug for Token<'a> {
23    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
24        #[rustfmt::skip]
25        return match self {
26            Token::PlainText(text)        => write!(f, r"PlainText : {:?}", *text),
27            Token::OpeningBracket         => write!(f, "OpeningBracket"),
28            Token::ClosingBracket         => write!(f, "ClosingBracket"),
29            Token::CRLF                   => write!(f, "CRLF"),
30            Token::IgnorableDestination   => write!(f, "IgnorableDestination"),
31            Token::ControlSymbol(symbol)  => write!(f, "ControlSymbol : {:?}", symbol),
32            Token::Empty                  => write!(f, "Empty"),
33        };
34    }
35}
36
37/// A control symbol is a pair (control_word, property)
38/// In the RTF specification, it refers to 'control word entity'
39pub type ControlSymbol<'a> = (ControlWord<'a>, Property);
40
/// The parameter attached to a control word.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Property {
    /// Toggle enabled (parameter `1`).
    On,
    /// Toggle disabled (parameter `0`).
    Off,
    /// A numeric parameter.
    ///
    /// Specified as `i16` in the 1.5 spec, but some software uses `i32`
    /// (e.g. TextEdit for unicode), hence the wider type.
    Value(i32),
    /// The control word has no parameter.
    None,
}
50
51impl Property {
52    pub fn as_bool(&self) -> bool {
53        match self {
54            Property::On => true,
55            Property::Off => false,
56            Property::None => true,
57            Property::Value(val) => *val == 1,
58        }
59    }
60
61    // Retrieve and cast the i32 value to a specific numeric type
62    pub fn get_value_as<T: TryFrom<i32>>(&self) -> Result<T, ParserError> {
63        let error: Result<T, ParserError> = Err(ParserError::ValueCastError(type_name::<T>().to_string()));
64        if let Property::Value(value) = &self {
65            return T::try_from(*value).or(error);
66        }
67        // If no value, returns 0
68        return T::try_from(0).or(error);
69    }
70
71    // Default variant
72    pub fn get_value(&self) -> i32 {
73        return self.get_value_as::<i32>().expect("i32 to i32 conversion should never fail");
74    }
75
76    /// Return the u16 corresponding value of the unicode
77    pub fn get_unicode_value(&self) -> Result<u16, ParserError> {
78        // RTF control words generally accept signed 16-bit numbers as arguments.
79        // For this reason, Unicode values greater than 32767 must be expressed as negative numbers.
80        let mut offset = 0;
81        if let Property::Value(value) = &self {
82            if *value < 0 {
83                offset = 65_536;
84            }
85            return u16::try_from(value + offset).or(Err(ParserError::UnicodeParsingError(*value)));
86        }
87        return Err(ParserError::UnicodeParsingError(0));
88    }
89}
90
/// The control words recognised by the lexer.
///
/// Anything that is not recognised is kept verbatim in [`ControlWord::Unknown`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ControlWord<'a> {
    Rtf,
    Ansi,

    Unicode,
    UnicodeIgnoreCount,

    FontTable,
    FontCharset,
    FontNumber,
    /// Expressed in half points.
    FontSize,
    ColorNumber,

    ColorTable,
    FileTable,
    StyleSheet,

    Italic,
    Bold,
    Underline,
    UnderlineNone,
    /// Raised text, like the "th" in 5th.
    Superscript,
    /// Lowered text, like the "2" in H2O.
    Subscript,
    Smallcaps,
    Strikethrough,

    /// New paragraph.
    Par,
    /// Resets to default paragraph properties.
    Pard,
    Sectd,
    Plain,
    /// Designates the paragraph style. If a paragraph style is specified, style
    /// properties must be specified with the paragraph. The parameter references
    /// an entry in the stylesheet.
    ParStyle,
    /// Tab width.
    ParDefTab,

    // --- Paragraph indent ---
    FirstLineIdent,
    LeftIndent,
    RightIndent,

    // --- Paragraph alignment ---
    LeftAligned,
    RightAligned,
    Center,
    Justify,

    // --- Paragraph spacing ---
    SpaceBefore,
    SpaceAfter,
    SpaceBetweenLine,
    /// Line spacing multiple. Indicates that the current line spacing is a
    /// multiple of "Single" line spacing. This control word can follow only the
    /// `\sl` control word and works in conjunction with it.
    SpaceLineMul,

    ColorRed,
    ColorGreen,
    ColorBlue,

    // --- Special characters ---
    Emdash,
    Endash,
    Bullet,
    LeftSingleQuote,
    RightSingleQuote,
    LeftDoubleQuote,
    RightDoubleQuote,
    Tab,
    Line,

    /// A control word with no dedicated variant; holds its text as given.
    Unknown(&'a str),
}
156
157impl<'a> ControlWord<'a> {
158    // https://www.biblioscape.com/rtf15_spec.htm
159    // version 1.5 should be compatible with 1.9
160    pub fn from(input: &str) -> Result<ControlSymbol, LexerError> {
161        // Loop backward the string to get the number
162        let mut it = input.chars().rev();
163        let mut suffix_index = 0;
164        while let Some(c) = it.next() {
165            match c {
166                '0'..='9' | '-' => {
167                    suffix_index += 1;
168                }
169                _ => break,
170            }
171        }
172
173        // f0 -> prefix: f, suffix: 0
174        let index = input.len() - suffix_index;
175        let prefix = &input[..index];
176        let suffix = &input[index..];
177
178        let property = if suffix == "" {
179            Property::None
180        } else {
181            let Ok(value) = suffix.parse::<i32>() else {
182                return Err(LexerError::Error(format!("[Lexer] Unable to parse {} as integer", &suffix)));
183            };
184            Property::Value(value)
185        };
186
187        #[rustfmt::skip]
188        let control_word = match prefix {
189            r"\rtf"           => ControlWord::Rtf,
190            r"\ansi"          => ControlWord::Ansi,
191            // Unicode
192            r"\u"             => ControlWord::Unicode,
193            r"\uc"            => ControlWord::UnicodeIgnoreCount,
194            // Header
195            r"\fonttbl"       => ControlWord::FontTable,
196            r"\colortbl"      => ControlWord::ColorTable,
197            r"\filetbl"       => ControlWord::FileTable,
198            r"\stylesheet"    => ControlWord::StyleSheet,
199            // Font
200            r"\fcharset"      => ControlWord::FontCharset,
201            r"\f"             => ControlWord::FontNumber,
202            r"\fs"            => ControlWord::FontSize,
203            r"\cf"            => ControlWord::ColorNumber,
204            // Format
205            r"\i"             => ControlWord::Italic,
206            r"\b"             => ControlWord::Bold,
207            r"\ul"            => ControlWord::Underline,
208            r"\ulnone"        => ControlWord::UnderlineNone,
209            r"\super"         => ControlWord::Superscript,
210            r"\sub"           => ControlWord::Subscript,
211            r"\scaps"         => ControlWord::Smallcaps,
212            r"\strike"        => ControlWord::Strikethrough,
213            // Paragraph
214            r"\par"           => ControlWord::Par,
215            r"\pard"          => ControlWord::Pard,
216            r"\sectd"         => ControlWord::Sectd,
217            r"\plain"         => ControlWord::Plain,
218            r"\s"             => ControlWord::ParStyle,
219            r"\pardeftab"     => ControlWord::ParDefTab,
220            // Paragraph alignment
221            r"\ql"            => ControlWord::LeftAligned,
222            r"\qr"            => ControlWord::RightAligned,
223            r"\qj"            => ControlWord::Justify,
224            r"\qc"            => ControlWord::Center,
225            // Paragraph indent
226            r"\fi"             => ControlWord::FirstLineIdent,
227            r"\ri"            => ControlWord::RightIndent,
228            r"\li"            => ControlWord::LeftIndent,
229            // Paragraph Spacing
230            r"\sb"            => ControlWord::SpaceBefore,
231            r"\sa"            => ControlWord::SpaceAfter,
232            r"\sl"            => ControlWord::SpaceBetweenLine,
233            r"\slmul"         => ControlWord::SpaceLineMul,
234            r"\red"           => ControlWord::ColorRed,
235            r"\green"         => ControlWord::ColorGreen,
236            r"\blue"          => ControlWord::ColorBlue,
237            // Special characters
238            r"\emdash"        => ControlWord::Emdash,
239            r"\endash"        => ControlWord::Endash,
240            r"\bullet"        => ControlWord::Bullet,
241            r"\lquote"        => ControlWord::LeftSingleQuote,
242            r"\rquote"        => ControlWord::RightSingleQuote,
243            r"\ldblquote"     => ControlWord::LeftDoubleQuote,
244            r"\rdblquote"     => ControlWord::RightDoubleQuote,
245            r"\tab"           => ControlWord::Tab,
246            r"\line"          => ControlWord::Line,
247            // Unknown
248            _                 => ControlWord::Unknown(prefix),
249        };
250        return Ok((control_word, property));
251    }
252}
253
#[cfg(test)]
mod tests {
    use crate::tokens::{ControlWord, Property};

    /// A trailing positive number is split off as the parameter value.
    #[test]
    fn control_word_from_input_test() {
        let expected = (ControlWord::Rtf, Property::Value(1));
        let parsed = ControlWord::from(r"\rtf1").unwrap();
        assert_eq!(parsed, expected);
    }

    /// A leading `-` belongs to the parameter and makes it negative.
    #[test]
    fn control_word_with_negative_parameter() {
        let expected = (ControlWord::Rtf, Property::Value(-1));
        let parsed = ControlWord::from(r"\rtf-1").unwrap();
        assert_eq!(parsed, expected);
    }
}