use super::error::{CodePosition, Error, ErrorType, Result};

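// The lexer works in two passes: `next()` is a small preprocessor that strips
// `//`, `#`, and nested `/* ... */` comments, normalizes whitespace, and
// tracks line/column positions, while `run()` groups the characters it yields
// into `Token`s.
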
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum LexerMode {
    None,
    String,
    Raw,
}

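/// The kinds of tokens the lexer produces: quoted string literals, bare (raw)
/// words, punctuation, and explicit line ends.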
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum TokenType {
    StringLiteral(String),
    RawLiteral(String),
    OpenBrace,
    CloseBrace,
    OpenParen,
    CloseParen,
    Semicolon,
    Colon,
    LineEnd,
}

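/// A single token, tagged with the source line and column recorded when it
/// was emitted.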
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Token {
    pub token_type: TokenType,
    pub line: u32,
    pub col: u16,
}

impl Token {
    pub fn new(line: u32, col: u16, ty: TokenType) -> Token {
        Token {
            line,
            col,
            token_type: ty,
        }
    }
}

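// Mutable state threaded through the lexer: the current position, the active
// mode, the literal being accumulated in `tmp`, the tokens emitted so far, and
// a one-character lookahead buffer in `force_next`.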
struct LexerState {
    line: u32,
    col: u16,
    input: Box<dyn Iterator<Item = char>>,
    mode: LexerMode,
    escaped: bool,
    tmp: String,
    tokens: Vec<Token>,
    force_next: Option<char>,
}

impl CodePosition for LexerState {
    fn location(&self) -> (u32, u16) {
        (self.line, self.col)
    }
}

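// Finishes the literal currently accumulated in `tmp` (if any), pushes it as a
// token, and returns the mode to `LexerMode::None`.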
fn end_token(state: &mut LexerState) {
    if state.mode != LexerMode::None {
        let t = match state.mode {
            LexerMode::None => unreachable!("Invalid mode when generating token"),
            LexerMode::String => Token::new(
                state.line,
                state.col,
                TokenType::StringLiteral(state.tmp.clone()),
            ),
            LexerMode::Raw => Token::new(
                state.line,
                state.col,
                TokenType::RawLiteral(state.tmp.clone()),
            ),
        };
        state.mode = LexerMode::None;
        state.tokens.push(t);
    }
}

// Starts accumulating a new literal in the given mode, closing any literal
// that is still open.
fn start_token(state: &mut LexerState, mode: LexerMode) {
    if state.mode != LexerMode::None {
        end_token(state);
    }
    state.tmp = String::new();
    state.mode = mode;
}

// Emits a single structural token (brace, paren, etc.), closing any open
// literal first.
fn append_token(state: &mut LexerState, t: TokenType) {
    end_token(state);
    state.tokens.push(Token::new(state.line, state.col, t));
}

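/// Runs the lexer over the character stream, returning the token list or an
/// error (for example, `ErrorType::UnexpectedEOF` for an unterminated string
/// literal).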
pub fn run(input: Box<dyn Iterator<Item = char>>) -> Result<Vec<Token>> {
    let mut state = LexerState {
        line: 1,
        col: 0,
        input,
        mode: LexerMode::None,
        escaped: false,
        tmp: String::new(),
        tokens: vec![],
        force_next: None,
    };
    loop {
        let c = next(&mut state);
        let mode = state.mode;
        let esc = state.escaped;
        match (c, mode, esc) {
            (Some('"'), LexerMode::String, false) => {
                end_token(&mut state);
            }
            (Some('"'), LexerMode::None, false) => {
                start_token(&mut state, LexerMode::String);
            }
            (Some('\\'), LexerMode::String, false) => {
                state.escaped = true;
            }
            (Some('\\'), LexerMode::String, true) => {
                state.tmp.push('\\');
                state.escaped = false;
            }
            (Some('n'), LexerMode::String, true) => {
                state.tmp.push('\n');
                state.escaped = false;
            }
            // An escaped quote becomes part of the literal instead of falling
            // through to the unreachable!() catch-all below.
            (Some('"'), LexerMode::String, true) => {
                state.tmp.push('"');
                state.escaped = false;
            }
            (Some(x), LexerMode::String, false) => {
                state.tmp.push(x);
            }
            (None, LexerMode::String, _) => return fail(&state, ErrorType::UnexpectedEOF),
            (Some(' '), LexerMode::None, false) => {}
            (Some(' '), LexerMode::Raw, false) => {
                end_token(&mut state);
            }
            (Some('('), LexerMode::Raw, false) => {
                append_token(&mut state, TokenType::OpenParen);
            }
            (Some(')'), LexerMode::Raw, false) => {
                append_token(&mut state, TokenType::CloseParen);
            }
            (Some('{'), LexerMode::Raw, false) => {
                append_token(&mut state, TokenType::OpenBrace);
            }
            (Some('}'), LexerMode::Raw, false) => {
                append_token(&mut state, TokenType::CloseBrace);
            }
            // Mirror the other punctuation arms so a trailing `;` terminates a
            // raw word instead of being absorbed into it.
            (Some(';'), LexerMode::Raw, false) => {
                append_token(&mut state, TokenType::Semicolon);
            }
            (Some(':'), LexerMode::Raw, false) => {
                append_token(&mut state, TokenType::Colon);
            }
            (Some('('), LexerMode::None, false) => {
                append_token(&mut state, TokenType::OpenParen);
            }
            (Some(')'), LexerMode::None, false) => {
                append_token(&mut state, TokenType::CloseParen);
            }
            (Some('{'), LexerMode::None, false) => {
                append_token(&mut state, TokenType::OpenBrace);
            }
            (Some('}'), LexerMode::None, false) => {
                append_token(&mut state, TokenType::CloseBrace);
            }
            (Some(';'), LexerMode::None, false) => {
                append_token(&mut state, TokenType::Semicolon);
            }
            (Some(':'), LexerMode::None, false) => {
                append_token(&mut state, TokenType::Colon);
            }
            (Some('\n'), LexerMode::None, false) => {}
            (Some('\n'), LexerMode::Raw, false) => append_token(&mut state, TokenType::LineEnd),
            (Some(x), LexerMode::None, false) => {
                start_token(&mut state, LexerMode::Raw);
                state.tmp.push(x);
            }
            (Some(x), LexerMode::Raw, false) => {
                state.tmp.push(x);
            }
            (None, LexerMode::Raw, false) => {
                end_token(&mut state);
                break;
            }
            (None, LexerMode::None, false) => {
                break;
            }
            (c, mode, esc) => {
                unreachable!(
                    "Invalid Parser State Reached: {:?}, {:?}, {:?}",
                    c, mode, esc
                );
            }
        }
    }
    Ok(state.tokens)
}

fn fail<T>(state: &LexerState, error_type: ErrorType) -> Result<T> {
    Err(Error::from_state(state, error_type, None))
}

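// State for the comment-stripping preprocessor in `next()`. Block comments may
// nest, so `MultiComment` tracks the current nesting depth.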
#[derive(Debug, Clone, Copy)]
enum PreProcessorState {
    Default,
    LineComment,
    MultiComment(u8),
}

// Pulls the next raw character, preferring one previously stashed by
// `lookahead`.
fn next_char(state: &mut LexerState) -> Option<char> {
    match state.force_next {
        Some(c) => {
            state.force_next = None;
            Some(c)
        }
        None => state.input.next(),
    }
}

// Peeks at the next character without consuming it, stashing it in
// `force_next` for the following `next_char` call.
fn lookahead(state: &mut LexerState) -> Option<char> {
    match state.force_next {
        Some(c) => Some(c),
        None => {
            let c = state.input.next();
            state.force_next = c;
            c
        }
    }
}

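// Returns the next significant character for the lexer: strips `//`, `#`, and
// nested `/* ... */` comments, drops `\r`, maps other whitespace to a plain
// space, and keeps `state.line` / `state.col` up to date.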
fn next(state: &mut LexerState) -> Option<char> {
    let mut line = state.line;
    let mut column = state.col;
    let mut result: Option<char> = None;
    let mut pre_processor_state = PreProcessorState::Default;
    loop {
        let character = match next_char(state) {
            Some(c) => c,
            None => break,
        };
        match (character, pre_processor_state) {
            ('\n', PreProcessorState::Default) => {
                line += 1;
                column = 0;
                result = Some('\n');
                break;
            }
            ('\r', PreProcessorState::Default) => {}
            ('/', PreProcessorState::Default) => {
                column += 1;
                let n = lookahead(state);
                match n {
                    Some('/') => pre_processor_state = PreProcessorState::LineComment,
                    Some('*') => pre_processor_state = PreProcessorState::MultiComment(1),
                    _ => {
                        result = Some(character);
                        break;
                    }
                }
            }
            ('#', PreProcessorState::Default) => {
                pre_processor_state = PreProcessorState::LineComment;
                column += 1;
            }
            (c, PreProcessorState::Default) if c.is_whitespace() => {
                column += 1;
                result = Some(' ');
                break;
            }
            (_, PreProcessorState::Default) => {
                result = Some(character);
                column += 1;
                break;
            }

            ('\n', PreProcessorState::LineComment) => {
                line += 1;
                column = 0;
                result = Some(' ');
                break;
            }
            (_, PreProcessorState::LineComment) => {
                column += 1;
            }

            ('\n', PreProcessorState::MultiComment(_)) => {
                line += 1;
                column = 0;
            }
            ('*', PreProcessorState::MultiComment(level)) => {
                if let Some('/') = lookahead(state) {
                    // Consume the '/' of the closing `*/` so it cannot be
                    // re-read as the start of a new comment.
                    next_char(state);
                    pre_processor_state = if level <= 1 {
                        PreProcessorState::Default
                    } else {
                        PreProcessorState::MultiComment(level - 1)
                    };
                }
            }
            ('/', PreProcessorState::MultiComment(level)) => match lookahead(state) {
                Some('*') => {
                    pre_processor_state = PreProcessorState::MultiComment(level + 1);
                }
                _ => {}
            },
            (_, PreProcessorState::MultiComment(_)) => {
                column += 1;
            }
        }
    }
    state.line = line;
    state.col = column;
    result
}

#[cfg(test)]
mod test {
    use super::super::error::{Error, ErrorType, Result};
    use super::*;

    #[test]
    fn successfully_parses_empty_string() {
        assert_eq!(run(Box::new("".chars())), Ok(vec![]));
    }

    #[test]
    fn successfully_parses_raw_token() {
        assert_eq!(
            unwrap_tokens(run(Box::new("test".chars()))),
            Ok(vec![TokenType::RawLiteral(String::from("test"))])
        );
    }

    #[test]
    fn successfully_parses_string_token() {
        assert_eq!(
            unwrap_tokens(run(Box::new("\"test\"".chars()))),
            Ok(vec![TokenType::StringLiteral(String::from("test"))])
        );
    }
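
    // Sketch of an additional check: escaped backslashes and `\n` inside a
    // string literal are decoded rather than copied verbatim.
    #[test]
    fn decodes_escape_sequences_in_strings() {
        assert_eq!(
            unwrap_tokens(run(Box::new("\"a\\\\b\\nc\"".chars()))),
            Ok(vec![TokenType::StringLiteral(String::from("a\\b\nc"))])
        );
    }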

    #[test]
    fn successfully_parse_basic_tokens() {
        assert_eq!(
            unwrap_tokens(run(Box::new("(){};:".chars()))),
            Ok(vec![
                TokenType::OpenParen,
                TokenType::CloseParen,
                TokenType::OpenBrace,
                TokenType::CloseBrace,
                TokenType::Semicolon,
                TokenType::Colon,
            ])
        );
    }
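
    // Sketch of an additional check: a newline ends a raw token and is itself
    // emitted as a LineEnd token.
    #[test]
    fn emits_line_end_after_raw_token() {
        assert_eq!(
            unwrap_tokens(run(Box::new("foo\nbar".chars()))),
            Ok(vec![
                TokenType::RawLiteral(String::from("foo")),
                TokenType::LineEnd,
                TokenType::RawLiteral(String::from("bar")),
            ])
        );
    }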

    fn unwrap_tokens(tokens: Result<Vec<Token>>) -> Result<Vec<TokenType>> {
        tokens.map(|toks| toks.iter().map(|t| t.token_type.clone()).collect())
    }

    #[test]
    fn successfully_parse_a_typical_example() {
        assert_eq!(
            unwrap_tokens(run(Box::new(
                "option param { inner_option \"value\"; };".chars()
            ))),
            Ok(vec![
                TokenType::RawLiteral(String::from("option")),
                TokenType::RawLiteral(String::from("param")),
                TokenType::OpenBrace,
                TokenType::RawLiteral(String::from("inner_option")),
                TokenType::StringLiteral(String::from("value")),
                TokenType::Semicolon,
                TokenType::CloseBrace,
                TokenType::Semicolon
            ])
        );
    }

    #[test]
    fn ignores_comments() {
        assert_eq!(
            unwrap_tokens(run(Box::new(
                "/* shit */
                 // crap
                 # shit"
                    .chars()
            ))),
            Ok(vec![])
        );
    }
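
    // Sketch of an additional check: block comments nest, so an inner
    // `/* ... */` does not terminate the outer comment early.
    #[test]
    fn ignores_nested_block_comments() {
        assert_eq!(
            unwrap_tokens(run(Box::new("a /* x /* y */ z */ b".chars()))),
            Ok(vec![
                TokenType::RawLiteral(String::from("a")),
                TokenType::RawLiteral(String::from("b")),
            ])
        );
    }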

    #[test]
    fn fails_on_unterminated_string() {
        assert_eq!(
            run(Box::new("\"yo dawg".chars())),
            Err(Error::new(1, 8, ErrorType::UnexpectedEOF, None))
        );
    }
}