corosync_config_parser/
lexer.rs

1use super::error::{CodePosition, Error, ErrorType, Result};
2
/// What kind of literal the lexer is currently accumulating in its scratch buffer.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum LexerMode {
    /// Between tokens; nothing is being accumulated.
    None,
    /// Inside a double-quoted string literal.
    String,
    /// Inside an unquoted (raw) word.
    Raw,
}
9
/// The kinds of token the lexer can emit.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum TokenType {
    /// Contents of a double-quoted string, with escape sequences already resolved.
    StringLiteral(String),
    /// An unquoted word, taken verbatim from the input.
    RawLiteral(String),
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `;`
    Semicolon,
    /// `:`
    Colon,
    /// A newline that directly terminated a raw literal.
    LineEnd,
}
22
23#[derive(Debug, PartialEq, Eq, Clone)]
24pub struct Token {
25    pub token_type: TokenType,
26    pub line: u32,
27    pub col: u16,
28}
29
30impl Token {
31    pub fn new(line: u32, col: u16, ty: TokenType) -> Token {
32        Token {
33            line: line,
34            col: col,
35            token_type: ty,
36        }
37    }
38}
39
40struct LexerState {
41    line: u32,
42    col: u16,
43    input: Box<dyn Iterator<Item = char>>,
44    mode: LexerMode,
45    escaped: bool,
46    tmp: String,
47    tokens: Vec<Token>,
48    force_next: Option<char>,
49}
50
51impl CodePosition for LexerState {
52    fn location(&self) -> (u32, u16) {
53        (self.line, self.col)
54    }
55}
56
57fn end_token(state: &mut LexerState) {
58    if state.mode != LexerMode::None {
59        let t = match state.mode {
60            LexerMode::None => unreachable!("Invalid mode when generating token"),
61            LexerMode::String => Token::new(
62                state.line,
63                state.col,
64                TokenType::StringLiteral(state.tmp.clone()),
65            ),
66            LexerMode::Raw => Token::new(
67                state.line,
68                state.col,
69                TokenType::RawLiteral(state.tmp.clone()),
70            ),
71        };
72        state.mode = LexerMode::None;
73        state.tokens.push(t);
74    }
75}
76
77fn start_token(state: &mut LexerState, mode: LexerMode) {
78    if state.mode != LexerMode::None {
79        end_token(state);
80    }
81    state.tmp = String::new();
82    state.mode = mode;
83}
84
85fn append_token(state: &mut LexerState, t: TokenType) {
86    end_token(state);
87    state.tokens.push(Token::new(state.line, state.col, t));
88}
89
90pub fn run(input: Box<dyn Iterator<Item = char>>) -> Result<Vec<Token>> {
91    let mut state = LexerState {
92        line: 1,
93        col: 0,
94        input: input,
95        mode: LexerMode::None,
96        escaped: false,
97        tmp: String::new(),
98        tokens: vec![],
99        force_next: None,
100    };
101    loop {
102        let c = { next(&mut state) };
103        let mode = state.mode.clone();
104        let esc = state.escaped;
105        match (c, mode, esc) {
106            (Some('"'), LexerMode::String, false) => {
107                end_token(&mut state);
108            }
109            (Some('"'), LexerMode::None, false) => {
110                start_token(&mut state, LexerMode::String);
111            }
112            (Some('\\'), LexerMode::String, false) => {
113                state.escaped = true;
114            }
115            (Some('\\'), LexerMode::String, true) => {
116                state.tmp.push('\\');
117                state.escaped = false;
118            }
119            (Some('n'), LexerMode::String, true) => {
120                state.tmp.push('\n');
121                state.escaped = false;
122            }
123            (Some(x), LexerMode::String, false) => {
124                state.tmp.push(x);
125            }
126            (None, LexerMode::String, _) => return fail(&state, ErrorType::UnexpectedEOF),
127            (Some(' '), LexerMode::None, false) => {}
128            (Some(' '), LexerMode::Raw, false) => {
129                end_token(&mut state);
130            }
131            (Some('('), LexerMode::Raw, false) => {
132                append_token(&mut state, TokenType::OpenParen);
133            }
134            (Some(')'), LexerMode::Raw, false) => {
135                append_token(&mut state, TokenType::CloseParen);
136            }
137            (Some('{'), LexerMode::Raw, false) => {
138                append_token(&mut state, TokenType::OpenBrace);
139            }
140            (Some('}'), LexerMode::Raw, false) => {
141                append_token(&mut state, TokenType::CloseBrace);
142            }
143            (Some(':'), LexerMode::Raw, false) => {
144                append_token(&mut state, TokenType::Colon);
145            }
146            (Some('('), LexerMode::None, false) => {
147                append_token(&mut state, TokenType::OpenParen);
148            }
149            (Some(')'), LexerMode::None, false) => {
150                append_token(&mut state, TokenType::CloseParen);
151            }
152            (Some('{'), LexerMode::None, false) => {
153                append_token(&mut state, TokenType::OpenBrace);
154            }
155            (Some('}'), LexerMode::None, false) => {
156                append_token(&mut state, TokenType::CloseBrace);
157            }
158            (Some(';'), LexerMode::None, false) => {
159                append_token(&mut state, TokenType::Semicolon);
160            }
161            (Some(':'), LexerMode::None, false) => {
162                append_token(&mut state, TokenType::Colon);
163            }
164            (Some('\n'), LexerMode::None, false) => {}
165            (Some('\n'), LexerMode::Raw, false) => append_token(&mut state, TokenType::LineEnd),
166            (Some(x), LexerMode::None, false) => {
167                start_token(&mut state, LexerMode::Raw);
168                state.tmp.push(x);
169            }
170            (Some(x), LexerMode::Raw, false) => {
171                state.tmp.push(x);
172            }
173            (None, LexerMode::Raw, false) => {
174                end_token(&mut state);
175                break;
176            }
177            (None, LexerMode::None, false) => {
178                break;
179            }
180            (c, mode, esc) => {
181                unreachable!(
182                    "Invalid Parser State Reached: {:?}, {:?}, {:?}",
183                    c, mode, esc
184                );
185            }
186        }
187    }
188    Ok(state.tokens)
189}
190
191fn fail<T>(state: &LexerState, error_type: ErrorType) -> Result<T> {
192    Err(Error::from_state(state, error_type, None))
193}
194
/// Where the comment-stripping pre-processor currently is.
#[derive(Debug, Clone, Copy)]
enum PreProcessorState {
    /// Ordinary input; characters are handed on to the lexer.
    Default,
    /// Inside a `//` or `#` comment that runs to the end of the line.
    LineComment,
    /// Inside a `/* ... */` comment; the payload is the current nesting depth.
    MultiComment(u8),
}
201
202fn next_char(state: &mut LexerState) -> Option<char> {
203    match state.force_next {
204        Some(c) => {
205            state.force_next = None;
206            Some(c)
207        }
208        None => state.input.next(),
209    }
210}
211
212fn lookahead(state: &mut LexerState) -> Option<char> {
213    match state.force_next {
214        Some(c) => Some(c),
215        None => {
216            let c = state.input.next();
217            state.force_next = c;
218            c
219        }
220    }
221}
222
223fn next(state: &mut LexerState) -> Option<char> {
224    let mut line = state.line;
225    let mut column = state.col;
226    let mut result: Option<char> = None;
227    let mut pre_processor_state = PreProcessorState::Default;
228    loop {
229        let character = match next_char(state) {
230            Some(c) => c,
231            None => break,
232        };
233        match (character, pre_processor_state) {
234            ('\n', PreProcessorState::Default) => {
235                line += 1;
236                column = 0;
237                result = Some('\n');
238                break;
239            }
240            ('\r', PreProcessorState::Default) => {}
241            ('/', PreProcessorState::Default) => {
242                column += 1;
243                let n = lookahead(state);
244                match n {
245                    Some('/') => pre_processor_state = PreProcessorState::LineComment,
246                    Some('*') => pre_processor_state = PreProcessorState::MultiComment(1),
247                    _ => {
248                        result = Some(character);
249                        break;
250                    }
251                }
252            }
253            ('#', PreProcessorState::Default) => {
254                pre_processor_state = PreProcessorState::LineComment;
255                column += 1;
256            }
257            (c, PreProcessorState::Default) if c.is_whitespace() => {
258                column += 1;
259                result = Some(' ');
260                break;
261            }
262            (_, PreProcessorState::Default) => {
263                result = Some(character);
264                column += 1;
265                break;
266            }
267
268            ('\n', PreProcessorState::LineComment) => {
269                line += 1;
270                column = 0;
271                result = Some(' ');
272                break;
273            }
274            (_, PreProcessorState::LineComment) => {
275                column += 1;
276            }
277
278            ('\n', PreProcessorState::MultiComment(_)) => {
279                line += 1;
280                column = 0;
281            }
282            ('*', PreProcessorState::MultiComment(level)) => {
283                match lookahead(state) {
284                    Some('/') => {
285                        if level <= 1 {
286                            next(state).unwrap(); // pop the next char
287                            pre_processor_state = PreProcessorState::Default
288                        } else {
289                            pre_processor_state = PreProcessorState::MultiComment(level - 1)
290                        }
291                    }
292                    _ => {}
293                }
294            }
295            ('/', PreProcessorState::MultiComment(level)) => match lookahead(state) {
296                Some('*') => {
297                    pre_processor_state = PreProcessorState::MultiComment(level + 1);
298                }
299                _ => {}
300            },
301            (_, PreProcessorState::MultiComment(_)) => {
302                column += 1;
303            }
304        }
305    }
306    state.line = line;
307    state.col = column;
308    result
309}
310
#[cfg(test)]
mod test {
    use super::super::error::{Error, ErrorType, Result};
    use super::*;

    /// Drops position information so tests can compare token kinds only.
    fn unwrap_tokens(tokens: Result<Vec<Token>>) -> Result<Vec<TokenType>> {
        tokens.map(|toks| toks.into_iter().map(|t| t.token_type).collect())
    }

    #[test]
    fn successfully_parses_empty_string() {
        assert_eq!(run(Box::new("".chars())), Ok(vec![]));
    }

    #[test]
    fn successfully_parses_raw_token() {
        assert_eq!(
            unwrap_tokens(run(Box::new("test".chars()))),
            Ok(vec![TokenType::RawLiteral(String::from("test"))])
        );
    }

    #[test]
    fn successfully_parses_string_token() {
        assert_eq!(
            unwrap_tokens(run(Box::new("\"test\"".chars()))),
            Ok(vec![TokenType::StringLiteral(String::from("test"))])
        );
    }

    #[test]
    fn successfully_parses_string_escapes() {
        // Input text: "a\nb\\c" (with the backslashes spelled out).
        assert_eq!(
            unwrap_tokens(run(Box::new("\"a\\nb\\\\c\"".chars()))),
            Ok(vec![TokenType::StringLiteral(String::from("a\nb\\c"))])
        );
    }

    #[test]
    fn successfully_parse_basic_tokens() {
        assert_eq!(
            unwrap_tokens(run(Box::new("(){};:".chars()))),
            Ok(vec![
                TokenType::OpenParen,
                TokenType::CloseParen,
                TokenType::OpenBrace,
                TokenType::CloseBrace,
                TokenType::Semicolon,
                TokenType::Colon,
            ])
        );
    }

    #[test]
    fn successfully_parse_a_typical_example() {
        assert_eq!(
            unwrap_tokens(run(Box::new(
                "option param { inner_option \"value\"; };".chars()
            ))),
            Ok(vec![
                TokenType::RawLiteral(String::from("option")),
                TokenType::RawLiteral(String::from("param")),
                TokenType::OpenBrace,
                TokenType::RawLiteral(String::from("inner_option")),
                TokenType::StringLiteral(String::from("value")),
                TokenType::Semicolon,
                TokenType::CloseBrace,
                TokenType::Semicolon
            ])
        );
    }

    #[test]
    fn emits_colon_and_line_end_around_raw_tokens() {
        assert_eq!(
            unwrap_tokens(run(Box::new("key: value\nnext".chars()))),
            Ok(vec![
                TokenType::RawLiteral(String::from("key")),
                TokenType::Colon,
                TokenType::RawLiteral(String::from("value")),
                TokenType::LineEnd,
                TokenType::RawLiteral(String::from("next")),
            ])
        );
    }

    #[test]
    fn ignores_comments() {
        assert_eq!(
            unwrap_tokens(run(Box::new(
                "/* shit */
                                       // crap
                                       # shit"
                    .chars()
            ))),
            Ok(vec![])
        );
    }

    #[test]
    fn ignores_nested_block_comments() {
        assert_eq!(
            unwrap_tokens(run(Box::new("/* a /* b */ c */ x".chars()))),
            Ok(vec![TokenType::RawLiteral(String::from("x"))])
        );
    }

    #[test]
    fn fails_on_unterminated_string() {
        assert_eq!(
            run(Box::new("\"yo dawg".chars())),
            Err(Error::new(1, 8, ErrorType::UnexpectedEOF, None))
        );
    }
}