config_parser/
lexer.rs

1use super::error::{Error, Result, ErrorType, CodePos};
2
/// Kind of literal the lexer is currently accumulating, if any.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum LexerMode {
    /// Between tokens: no literal is being collected.
    None,
    /// Inside a double-quoted string literal.
    String,
    /// Inside an unquoted (raw) literal.
    Raw,
}
9
/// The kinds of token the lexer can produce.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum TokenType {
    /// Text that was written between double quotes, with escapes resolved.
    StringLiteral(String),
    /// Unquoted text.
    RawLiteral(String),
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `;`
    Semicolon,
}
20
21#[derive(Debug, PartialEq, Eq, Clone)]
22pub struct Token {
23    pub token_type: TokenType,
24    pub line: u32,
25    pub col: u16
26}
27
28impl Token {
29    pub fn new(line: u32, col: u16, ty: TokenType) -> Token {
30        Token {
31            line: line,
32            col: col,
33            token_type: ty
34        }
35    }
36}
37
38struct LexerState {
39    line: u32,
40    col: u16,
41    input: Box<Iterator<Item=char>>,
42    mode: LexerMode,
43    escaped: bool,
44    tmp: String,
45    tokens: Vec<Token>,
46    force_next: Option<char>
47}
48
49impl CodePos for LexerState {
50    fn location(&self) -> (u32, u16) {
51        (self.line, self.col)
52    }
53}
54
55fn end_token(state: &mut LexerState) {
56    if state.mode != LexerMode::None {
57        let t = match state.mode {
58            LexerMode::None => unreachable!("Invalid mode when generating token"),
59            LexerMode::String => Token::new(state.line, state.col, TokenType::StringLiteral(state.tmp.clone())),
60            LexerMode::Raw => Token::new(state.line, state.col, TokenType::RawLiteral(state.tmp.clone()))
61        };
62        state.mode = LexerMode::None;
63        state.tokens.push(t);
64    }
65}
66
67fn start_token(state: &mut LexerState, mode: LexerMode) {
68    if state.mode != LexerMode::None {
69        end_token(state);
70    }
71    state.tmp = String::new();
72    state.mode = mode;
73}
74
75fn token(state: &mut LexerState, t: TokenType) {
76    end_token(state);
77    state.tokens.push(Token::new(state.line, state.col, t));
78}
79
80pub fn run(input: Box<Iterator<Item=char>>) -> Result<Vec<Token>> {
81    let mut state = LexerState { line: 1, col: 0, input: input, mode: LexerMode::None, escaped: false, tmp: String::new(), tokens: vec![], force_next: None};
82    loop {
83        let c = { next(&mut state) };
84        let mode = state.mode.clone();
85        let esc = state.escaped;
86        match (c, mode, esc) {
87            (Some('"'),  LexerMode::String, false) => {
88                end_token(&mut state);
89            },
90            (Some('"'),  LexerMode::None,   false) => {
91                start_token(&mut state, LexerMode::String);
92            },
93            (Some('\\'), LexerMode::String, false) => {
94                state.escaped = true;
95            },
96            (Some('\\'), LexerMode::String, true ) => {
97                state.tmp.push('\\');
98                state.escaped = false;
99            }
100            (Some('n'),  LexerMode::String, true ) => {
101                state.tmp.push('\n');
102                state.escaped = false;
103            },
104            (Some(x),    LexerMode::String, false) => {
105                state.tmp.push(x);
106            },
107            (None,       LexerMode::String, _    ) => {
108                return fail(&state, ErrorType::UnexpectedEOF)
109            }
110            (Some(' '),  LexerMode::None,   false) => {},
111            (Some(' '),  LexerMode::Raw,    false) => {
112                end_token(&mut state);
113            }
114            (Some('('),  LexerMode::Raw,    false) => {
115                token(&mut state, TokenType::OpenParen);
116            },
117            (Some(')'),  LexerMode::Raw,    false) => {
118                token(&mut state, TokenType::CloseParen);
119            },
120            (Some('{'),  LexerMode::Raw,    false) => {
121                token(&mut state, TokenType::OpenBrace);
122            },
123            (Some('}'),  LexerMode::Raw,    false) => {
124                token(&mut state, TokenType::CloseBrace);
125            },
126            (Some(';'),  LexerMode::Raw, false) => {
127                token(&mut state, TokenType::Semicolon);
128            }
129            (Some('('),  LexerMode::None,   false) => {
130                token(&mut state, TokenType::OpenParen);
131            },
132            (Some(')'),  LexerMode::None,   false) => {
133                token(&mut state, TokenType::CloseParen);
134            },
135            (Some('{'),  LexerMode::None,    false) => {
136                token(&mut state, TokenType::OpenBrace);
137            },
138            (Some('}'),  LexerMode::None,    false) => {
139                token(&mut state, TokenType::CloseBrace);
140            },
141            (Some(';'),  LexerMode::None, false) => {
142                token(&mut state, TokenType::Semicolon);
143            }
144            (Some(x),    LexerMode::None,   false) => {
145                start_token(&mut state, LexerMode::Raw);
146                state.tmp.push(x);
147            },
148            (Some(x),    LexerMode::Raw,    false) => {
149                state.tmp.push(x);
150            },
151            (None,       LexerMode::Raw,    false) => {
152                end_token(&mut state);
153                break;
154            }
155            (None,       LexerMode::None,   false) => {
156                break;
157            }
158            (c,          mode,               esc  ) => {
159                unreachable!("Invalid Parser State Reached: {:?}, {:?}, {:?}", c, mode, esc);
160            }
161        }
162    }
163    Ok(state.tokens)
164
165}
166
167fn fail<T>(state: &LexerState, error_type: ErrorType) -> Result<T> {
168    Err(Error::from_state(state, error_type, None))
169}
170
/// State of the comment-stripping pass performed while reading characters.
#[derive(Clone, Copy, Debug)]
enum PreProcState {
    /// Outside of any comment.
    Default,
    /// Inside a comment that runs to the end of the line.
    LineComment,
    /// Inside a block comment; the payload is the current nesting depth.
    MultiComment(u8),
}
177
178fn next_char(state: &mut LexerState) -> Option<char> {
179    match state.force_next {
180        Some(c) => {
181            state.force_next = None;
182            Some(c)
183        },
184        None => state.input.next()
185    }
186}
187
188fn lookahead(state: &mut LexerState) -> Option<char> {
189    match state.force_next {
190        Some(c) => Some(c),
191        None => {
192            let c = state.input.next();
193            state.force_next = c;
194            c
195        }
196    }
197}
198
199fn next(state: &mut LexerState) -> Option<char> {
200    let mut line = state.line;
201    let mut col = state.col;
202    let mut result: Option<char> = None;
203    let mut ps = PreProcState::Default;
204    loop {    
205        let c = match next_char(state) {
206            Some(c) => c,
207            None => break
208        };
209        match (c, ps) {
210            ('\n', PreProcState::Default) => {
211                line += 1;
212                col = 0;
213                result = Some(' ');
214                break;
215            },
216            ('\r', PreProcState::Default) => {},
217            ('/', PreProcState::Default) => {
218                col += 1;
219                let n = lookahead(state);
220                match n {
221                    Some('/') => ps = PreProcState::LineComment,
222                    Some('*') => ps = PreProcState::MultiComment(1),
223                    _ => {
224                        result = Some(c);
225                        break;
226                    }
227                }
228            },
229            ('#', PreProcState::Default) => {
230                ps = PreProcState::LineComment;
231                col += 1;
232            },
233            (c, PreProcState::Default) if c.is_whitespace() => {
234                col += 1;
235                result = Some(' ');
236                break;
237            },
238            (_, PreProcState::Default) => {
239                result = Some(c);
240                col += 1;
241                break;
242            },
243
244            ('\n', PreProcState::LineComment) => {
245                line += 1;
246                col = 0;
247                result = Some(' ');
248                break;
249            },
250            (_, PreProcState::LineComment) => {
251                col += 1;
252            },
253
254            ('\n', PreProcState::MultiComment(_)) => {
255                line += 1;
256                col = 0;
257            },
258            ('*', PreProcState::MultiComment(level)) => {
259                match lookahead(state) {
260                    Some('/') => {
261                        if level <= 1 {
262                            next(state).unwrap(); // pop the next char
263                            ps = PreProcState::Default
264                        } else {
265                            ps = PreProcState::MultiComment(level - 1)
266                        }
267                    },
268                    _ => {}
269                }
270            },
271            ('/', PreProcState::MultiComment(level)) => {
272                match lookahead(state) {
273                    Some('*') => {
274                        ps = PreProcState::MultiComment(level + 1);
275                    },
276                    _ => {}
277                }
278            },
279            (_, PreProcState::MultiComment(_)) => {
280                col += 1;
281            }
282        }
283    }
284    state.line = line;
285    state.col = col;
286    result
287}
288
#[cfg(test)]
mod test {
    use super::*;
    use super::super::error::{ErrorType, Error, Result};

    /// Runs the lexer over a string literal.
    fn lex(src: &'static str) -> Result<Vec<Token>> {
        run(Box::new(src.chars()))
    }

    /// Drops the position information so tests can compare token kinds only.
    fn unwrap_tokens(tokens: Result<Vec<Token>>) -> Result<Vec<TokenType>> {
        tokens.map(|toks| toks.into_iter().map(|t| t.token_type).collect())
    }

    #[test]
    fn successfully_parses_empty_string() {
        assert_eq!(lex(""), Ok(vec![]));
    }

    #[test]
    fn successfully_parses_raw_token() {
        let expected = vec![TokenType::RawLiteral(String::from("test"))];
        assert_eq!(unwrap_tokens(lex("test")), Ok(expected));
    }

    #[test]
    fn successfully_parses_string_token() {
        let expected = vec![TokenType::StringLiteral(String::from("test"))];
        assert_eq!(unwrap_tokens(lex("\"test\"")), Ok(expected));
    }

    #[test]
    fn successfully_parse_basic_tokens() {
        let expected = vec![
            TokenType::OpenParen,
            TokenType::CloseParen,
            TokenType::OpenBrace,
            TokenType::CloseBrace,
            TokenType::Semicolon,
        ];
        assert_eq!(unwrap_tokens(lex("(){};")), Ok(expected));
    }

    #[test]
    fn successfully_parse_a_typical_example() {
        let expected = vec![
            TokenType::RawLiteral(String::from("option")),
            TokenType::RawLiteral(String::from("param")),
            TokenType::OpenBrace,
            TokenType::RawLiteral(String::from("inner_option")),
            TokenType::StringLiteral(String::from("value")),
            TokenType::Semicolon,
            TokenType::CloseBrace,
            TokenType::Semicolon,
        ];
        assert_eq!(
            unwrap_tokens(lex("option param { inner_option \"value\"; };")),
            Ok(expected));
    }

    #[test]
    fn ignores_comments() {
        // One comment of each supported flavour; none may yield a token.
        let src = "/* shit */
                                       // crap
                                       # shit";
        assert_eq!(unwrap_tokens(lex(src)), Ok(vec![]));
    }

    #[test]
    fn fails_on_unterminated_string() {
        let expected = Err(Error::new(1, 8, ErrorType::UnexpectedEOF, None));
        assert_eq!(lex("\"yo dawg"), expected);
    }
}
356            );
357    }
358}