1use std::char;
2use std::convert::TryFrom;
3use std::num::ParseFloatError;
4use std::num::ParseIntError;
5
6use crate::lexer::float;
7use crate::lexer::float::ProtobufFloatParseError;
8use crate::lexer::json_number_lit::JsonNumberLit;
9use crate::lexer::loc::Loc;
10use crate::lexer::loc::FIRST_COL;
11use crate::lexer::parser_language::ParserLanguage;
12use crate::lexer::str_lit::StrLit;
13use crate::lexer::str_lit::StrLitDecodeError;
14use crate::lexer::token::Token;
15use crate::lexer::token::TokenWithLocation;
16
17#[derive(Debug, thiserror::Error)]
18pub enum LexerError {
19 #[error("Incorrect input")]
21 IncorrectInput,
22 #[error("Unexpected EOF")]
23 UnexpectedEof,
24 #[error("Expecting char: {:?}", .0)]
25 ExpectChar(char),
26 #[error("Parse int error")]
27 ParseIntError,
28 #[error("Parse float error")]
29 ParseFloatError,
30 #[error("Incorrect float literal")]
32 IncorrectFloatLit,
33 #[error("Incorrect JSON escape")]
34 IncorrectJsonEscape,
35 #[error("Incorrect JSON number")]
36 IncorrectJsonNumber,
37 #[error("Incorrect Unicode character")]
38 IncorrectUnicodeChar,
39 #[error("Expecting hex digit")]
40 ExpectHexDigit,
41 #[error("Expecting oct digit")]
42 ExpectOctDigit,
43 #[error("Expecting dec digit")]
44 ExpectDecDigit,
45 #[error(transparent)]
46 StrLitDecodeError(#[from] StrLitDecodeError),
47 #[error("Expecting identifier")]
48 ExpectedIdent,
49}
50
51pub type LexerResult<T> = Result<T, LexerError>;
52
53impl From<ParseIntError> for LexerError {
54 fn from(_: ParseIntError) -> Self {
55 LexerError::ParseIntError
56 }
57}
58
59impl From<ParseFloatError> for LexerError {
60 fn from(_: ParseFloatError) -> Self {
61 LexerError::ParseFloatError
62 }
63}
64
65impl From<ProtobufFloatParseError> for LexerError {
66 fn from(_: ProtobufFloatParseError) -> Self {
67 LexerError::IncorrectFloatLit
68 }
69}
70
71#[derive(Copy, Clone)]
72pub struct Lexer<'a> {
73 language: ParserLanguage,
74 input: &'a str,
75 pos: usize,
76 pub loc: Loc,
77}
78
/// Returns `true` if `c` may start an identifier: an ASCII letter or `_`.
///
/// Only ASCII letters are accepted. The protobuf grammar defines a letter as
/// `A`–`Z` or `a`–`z`, and the rest of an identifier is matched as ASCII
/// alphanumerics, so accepting arbitrary Unicode letters here would produce
/// identifiers that the remaining rules cannot continue.
fn is_letter(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
82
83impl<'a> Lexer<'a> {
84 pub fn new(input: &'a str, language: ParserLanguage) -> Lexer<'a> {
85 Lexer {
86 language,
87 input,
88 pos: 0,
89 loc: Loc::start(),
90 }
91 }
92
93 pub fn eof(&self) -> bool {
95 self.pos == self.input.len()
96 }
97
98 fn rem_chars(&self) -> &'a str {
100 &self.input[self.pos..]
101 }
102
103 pub fn lookahead_char_is<P: FnOnce(char) -> bool>(&self, p: P) -> bool {
104 self.lookahead_char().map_or(false, p)
105 }
106
107 fn lookahead_char_is_in(&self, alphabet: &str) -> bool {
108 self.lookahead_char_is(|c| alphabet.contains(c))
109 }
110
111 fn next_char_opt(&mut self) -> Option<char> {
112 let rem = self.rem_chars();
113 if rem.is_empty() {
114 None
115 } else {
116 let mut char_indices = rem.char_indices();
117 let (_, c) = char_indices.next().unwrap();
118 let c_len = char_indices.next().map(|(len, _)| len).unwrap_or(rem.len());
119 self.pos += c_len;
120 if c == '\n' {
121 self.loc.line += 1;
122 self.loc.col = FIRST_COL;
123 } else {
124 self.loc.col += 1;
125 }
126 Some(c)
127 }
128 }
129
130 fn next_char(&mut self) -> LexerResult<char> {
131 self.next_char_opt().ok_or(LexerError::UnexpectedEof)
132 }
133
134 fn skip_whitespaces(&mut self) {
136 self.take_while(|c| c.is_whitespace());
137 }
138
139 fn skip_c_comment(&mut self) -> LexerResult<()> {
140 if self.skip_if_lookahead_is_str("/*") {
141 let end = "*/";
142 match self.rem_chars().find(end) {
143 None => Err(LexerError::UnexpectedEof),
144 Some(len) => {
145 let new_pos = self.pos + len + end.len();
146 self.skip_to_pos(new_pos);
147 Ok(())
148 }
149 }
150 } else {
151 Ok(())
152 }
153 }
154
155 fn skip_cpp_comment(&mut self) {
156 if self.skip_if_lookahead_is_str("//") {
157 loop {
158 match self.next_char_opt() {
159 Some('\n') | None => break,
160 _ => {}
161 }
162 }
163 }
164 }
165
166 fn skip_sh_comment(&mut self) {
167 if self.skip_if_lookahead_is_str("#") {
168 loop {
169 match self.next_char_opt() {
170 Some('\n') | None => break,
171 _ => {}
172 }
173 }
174 }
175 }
176
177 fn skip_comment(&mut self) -> LexerResult<()> {
178 match self.language {
179 ParserLanguage::Proto => {
180 self.skip_c_comment()?;
181 self.skip_cpp_comment();
182 }
183 ParserLanguage::TextFormat => {
184 self.skip_sh_comment();
185 }
186 ParserLanguage::Json => {}
187 }
188 Ok(())
189 }
190
191 pub fn skip_ws(&mut self) -> LexerResult<()> {
192 loop {
193 let pos = self.pos;
194 self.skip_whitespaces();
195 self.skip_comment()?;
196 if pos == self.pos {
197 return Ok(());
199 }
200 }
201 }
202
203 pub fn take_while<F>(&mut self, f: F) -> &'a str
204 where
205 F: Fn(char) -> bool,
206 {
207 let start = self.pos;
208 while self.lookahead_char().map(&f) == Some(true) {
209 self.next_char_opt().unwrap();
210 }
211 let end = self.pos;
212 &self.input[start..end]
213 }
214
215 fn lookahead_char(&self) -> Option<char> {
216 self.clone().next_char_opt()
217 }
218
219 fn lookahead_is_str(&self, s: &str) -> bool {
220 self.rem_chars().starts_with(s)
221 }
222
223 fn skip_if_lookahead_is_str(&mut self, s: &str) -> bool {
224 if self.lookahead_is_str(s) {
225 let new_pos = self.pos + s.len();
226 self.skip_to_pos(new_pos);
227 true
228 } else {
229 false
230 }
231 }
232
233 fn next_char_if<P>(&mut self, p: P) -> Option<char>
234 where
235 P: FnOnce(char) -> bool,
236 {
237 let mut clone = self.clone();
238 match clone.next_char_opt() {
239 Some(c) if p(c) => {
240 *self = clone;
241 Some(c)
242 }
243 _ => None,
244 }
245 }
246
247 pub fn next_char_if_eq(&mut self, expect: char) -> bool {
248 self.next_char_if(|c| c == expect) != None
249 }
250
251 fn next_char_if_in(&mut self, alphabet: &str) -> Option<char> {
252 for c in alphabet.chars() {
253 if self.next_char_if_eq(c) {
254 return Some(c);
255 }
256 }
257 None
258 }
259
260 fn next_char_expect_eq(&mut self, expect: char) -> LexerResult<()> {
261 if self.next_char_if_eq(expect) {
262 Ok(())
263 } else {
264 Err(LexerError::ExpectChar(expect))
265 }
266 }
267
268 fn next_char_expect<P>(&mut self, expect: P, err: LexerError) -> LexerResult<char>
269 where
270 P: FnOnce(char) -> bool,
271 {
272 self.next_char_if(expect).ok_or(err)
273 }
274
275 fn skip_to_pos(&mut self, new_pos: usize) -> &'a str {
279 assert!(new_pos >= self.pos);
280 assert!(new_pos <= self.input.len());
281 let pos = self.pos;
282 while self.pos != new_pos {
283 self.next_char_opt().unwrap();
284 }
285 &self.input[pos..new_pos]
286 }
287
288 fn next_letter_opt(&mut self) -> Option<char> {
295 self.next_char_if(is_letter)
296 }
297
298 fn _next_capital_letter_opt(&mut self) -> Option<char> {
300 self.next_char_if(|c| c >= 'A' && c <= 'Z')
301 }
302
303 fn next_ident_part(&mut self) -> Option<char> {
304 self.next_char_if(|c| c.is_ascii_alphanumeric() || c == '_')
305 }
306
307 fn next_ident_opt(&mut self) -> LexerResult<Option<String>> {
311 if let Some(c) = self.next_letter_opt() {
312 let mut ident = String::new();
313 ident.push(c);
314 while let Some(c) = self.next_ident_part() {
315 ident.push(c);
316 }
317 Ok(Some(ident))
318 } else {
319 Ok(None)
320 }
321 }
322
323 fn next_hex_lit_opt(&mut self) -> LexerResult<Option<u64>> {
327 Ok(
328 if self.skip_if_lookahead_is_str("0x") || self.skip_if_lookahead_is_str("0X") {
329 let s = self.take_while(|c| c.is_ascii_hexdigit());
330 Some(u64::from_str_radix(s, 16)? as u64)
331 } else {
332 None
333 },
334 )
335 }
336
337 fn next_decimal_octal_lit_opt(&mut self) -> LexerResult<Option<u64>> {
340 let mut clone = self.clone();
342
343 let pos = clone.pos;
344
345 Ok(if clone.next_char_if(|c| c.is_ascii_digit()) != None {
346 clone.take_while(|c| c.is_ascii_digit());
347 let value = clone.input[pos..clone.pos].parse()?;
348 *self = clone;
349 Some(value)
350 } else {
351 None
352 })
353 }
354
355 fn next_hex_digit(&mut self) -> LexerResult<u32> {
357 let mut clone = self.clone();
358 let r = match clone.next_char()? {
359 c if c >= '0' && c <= '9' => c as u32 - b'0' as u32,
360 c if c >= 'A' && c <= 'F' => c as u32 - b'A' as u32 + 10,
361 c if c >= 'a' && c <= 'f' => c as u32 - b'a' as u32 + 10,
362 _ => return Err(LexerError::ExpectHexDigit),
363 };
364 *self = clone;
365 Ok(r)
366 }
367
368 fn next_octal_digit(&mut self) -> LexerResult<u32> {
370 self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectOctDigit)
371 .map(|c| c as u32 - '0' as u32)
372 }
373
374 fn next_decimal_digit(&mut self) -> LexerResult<u32> {
376 self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectDecDigit)
377 .map(|c| c as u32 - '0' as u32)
378 }
379
380 fn next_decimal_digits(&mut self) -> LexerResult<()> {
382 self.next_decimal_digit()?;
383 self.take_while(|c| c >= '0' && c <= '9');
384 Ok(())
385 }
386
387 pub fn next_int_lit_opt(&mut self) -> LexerResult<Option<u64>> {
389 assert_ne!(ParserLanguage::Json, self.language);
390
391 self.skip_ws()?;
392 if let Some(i) = self.next_hex_lit_opt()? {
393 return Ok(Some(i));
394 }
395 if let Some(i) = self.next_decimal_octal_lit_opt()? {
396 return Ok(Some(i));
397 }
398 Ok(None)
399 }
400
401 fn next_exponent_opt(&mut self) -> LexerResult<Option<()>> {
405 if self.next_char_if_in("eE") != None {
406 self.next_char_if_in("+-");
407 self.next_decimal_digits()?;
408 Ok(Some(()))
409 } else {
410 Ok(None)
411 }
412 }
413
414 fn next_float_lit(&mut self) -> LexerResult<()> {
416 assert_ne!(ParserLanguage::Json, self.language);
417
418 if self.next_char_if_eq('.') {
420 self.next_decimal_digits()?;
421 self.next_exponent_opt()?;
422 } else {
423 self.next_decimal_digits()?;
424 if self.next_char_if_eq('.') {
425 self.next_decimal_digits()?;
426 self.next_exponent_opt()?;
427 } else {
428 if self.next_exponent_opt()? == None {
429 return Err(LexerError::IncorrectFloatLit);
430 }
431 }
432 }
433 Ok(())
434 }
435
436 pub fn next_byte_value(&mut self) -> LexerResult<u8> {
445 match self.next_char()? {
446 '\\' => {
447 match self.next_char()? {
448 '\'' => Ok(b'\''),
449 '"' => Ok(b'"'),
450 '\\' => Ok(b'\\'),
451 'a' => Ok(b'\x07'),
452 'b' => Ok(b'\x08'),
453 'f' => Ok(b'\x0c'),
454 'n' => Ok(b'\n'),
455 'r' => Ok(b'\r'),
456 't' => Ok(b'\t'),
457 'v' => Ok(b'\x0b'),
458 'x' => {
459 let d1 = self.next_hex_digit()? as u8;
460 let d2 = self.next_hex_digit()? as u8;
461 Ok(((d1 << 4) | d2) as u8)
462 }
463 d if d >= '0' && d <= '7' => {
464 let mut r = d as u8 - b'0';
465 for _ in 0..2 {
466 match self.next_octal_digit() {
467 Err(_) => break,
468 Ok(d) => r = (r << 3) + d as u8,
469 }
470 }
471 Ok(r)
472 }
473 c => Ok(c as u8),
476 }
477 }
478 '\n' | '\0' => Err(LexerError::IncorrectInput),
479 c => Ok(c as u8),
481 }
482 }
483
484 fn char_try_from(i: u32) -> LexerResult<char> {
485 char::try_from(i).map_err(|_| LexerError::IncorrectUnicodeChar)
486 }
487
488 pub fn next_json_char_value(&mut self) -> LexerResult<char> {
489 match self.next_char()? {
490 '\\' => match self.next_char()? {
491 '"' => Ok('"'),
492 '\'' => Ok('\''),
493 '\\' => Ok('\\'),
494 '/' => Ok('/'),
495 'b' => Ok('\x08'),
496 'f' => Ok('\x0c'),
497 'n' => Ok('\n'),
498 'r' => Ok('\r'),
499 't' => Ok('\t'),
500 'u' => {
501 let mut v = 0;
502 for _ in 0..4 {
503 let digit = self.next_hex_digit()?;
504 v = v * 16 + digit;
505 }
506 Self::char_try_from(v)
507 }
508 _ => Err(LexerError::IncorrectJsonEscape),
509 },
510 c => Ok(c),
511 }
512 }
513
514 fn next_str_lit_raw(&mut self) -> LexerResult<String> {
517 let mut raw = String::new();
518
519 let mut first = true;
520 loop {
521 if !first {
522 self.skip_ws()?;
523 }
524
525 let start = self.pos;
526
527 let q = match self.next_char_if_in("'\"") {
528 Some(q) => q,
529 None if !first => break,
530 None => return Err(LexerError::IncorrectInput),
531 };
532 first = false;
533 while self.lookahead_char() != Some(q) {
534 self.next_byte_value()?;
535 }
536 self.next_char_expect_eq(q)?;
537
538 raw.push_str(&self.input[start + 1..self.pos - 1]);
539 }
540 Ok(raw)
541 }
542
543 fn next_str_lit_raw_opt(&mut self) -> LexerResult<Option<String>> {
544 if self.lookahead_char_is_in("'\"") {
545 Ok(Some(self.next_str_lit_raw()?))
546 } else {
547 Ok(None)
548 }
549 }
550
551 fn next_json_number_opt(&mut self) -> LexerResult<Option<JsonNumberLit>> {
553 assert_eq!(ParserLanguage::Json, self.language);
554
555 fn is_digit(c: char) -> bool {
556 c >= '0' && c <= '9'
557 }
558
559 fn is_digit_1_9(c: char) -> bool {
560 c >= '1' && c <= '9'
561 }
562
563 if !self.lookahead_char_is_in("-0123456789") {
564 return Ok(None);
565 }
566
567 let mut s = String::new();
568 if self.next_char_if_eq('-') {
569 s.push('-');
570 }
571
572 if self.next_char_if_eq('0') {
573 s.push('0');
574 } else {
575 s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?);
576 while let Some(c) = self.next_char_if(is_digit) {
577 s.push(c);
578 }
579 }
580
581 if self.next_char_if_eq('.') {
582 s.push('.');
583 s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
584 while let Some(c) = self.next_char_if(is_digit) {
585 s.push(c);
586 }
587 }
588
589 if let Some(c) = self.next_char_if_in("eE") {
590 s.push(c);
591 if let Some(c) = self.next_char_if_in("+-") {
592 s.push(c);
593 }
594 s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
595 while let Some(c) = self.next_char_if(is_digit) {
596 s.push(c);
597 }
598 }
599
600 Ok(Some(JsonNumberLit(s)))
601 }
602
603 fn next_token_inner(&mut self) -> LexerResult<Token> {
604 if self.language == ParserLanguage::Json {
605 if let Some(v) = self.next_json_number_opt()? {
606 return Ok(Token::JsonNumber(v));
607 }
608 }
609
610 if let Some(ident) = self.next_ident_opt()? {
611 let token = if self.language != ParserLanguage::Json && ident == float::PROTOBUF_NAN {
612 Token::FloatLit(f64::NAN)
613 } else if self.language != ParserLanguage::Json && ident == float::PROTOBUF_INF {
614 Token::FloatLit(f64::INFINITY)
615 } else {
616 Token::Ident(ident.to_owned())
617 };
618 return Ok(token);
619 }
620
621 if self.language != ParserLanguage::Json {
622 let mut clone = self.clone();
623 let pos = clone.pos;
624 if let Ok(_) = clone.next_float_lit() {
625 let f = float::parse_protobuf_float(&self.input[pos..clone.pos])?;
626 *self = clone;
627 return Ok(Token::FloatLit(f));
628 }
629
630 if let Some(lit) = self.next_int_lit_opt()? {
631 return Ok(Token::IntLit(lit));
632 }
633 }
634
635 if let Some(escaped) = self.next_str_lit_raw_opt()? {
636 return Ok(Token::StrLit(StrLit { escaped }));
637 }
638
639 if let Some(c) = self.next_char_if(|c| c.is_ascii_punctuation()) {
641 return Ok(Token::Symbol(c));
642 }
643
644 if let Some(ident) = self.next_ident_opt()? {
645 return Ok(Token::Ident(ident));
646 }
647
648 Err(LexerError::IncorrectInput)
649 }
650
651 pub fn next_token(&mut self) -> LexerResult<Option<TokenWithLocation>> {
652 self.skip_ws()?;
653 let loc = self.loc;
654
655 Ok(if self.eof() {
656 None
657 } else {
658 let token = self.next_token_inner()?;
659 self.skip_ws()?;
662 Some(TokenWithLocation { token, loc })
663 })
664 }
665}
666
#[cfg(test)]
mod test {
    use super::*;

    /// Runs `parse_what` on a Proto-language lexer over `input` and returns
    /// its result. Panics if parsing fails or input is left unconsumed.
    fn lex<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<R>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let parsed = match parse_what(&mut lexer) {
            Ok(value) => value,
            Err(e) => panic!("lexer failed at {}: {:?}", lexer.loc, e),
        };
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        parsed
    }

    /// Like `lex`, but for parsers that return an `Option`; additionally
    /// panics if the parser yields `None`.
    fn lex_opt<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<Option<R>>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let maybe = match parse_what(&mut lexer) {
            Ok(value) => value,
            Err(e) => panic!("lexer failed at {}: {:?}", lexer.loc, e),
        };
        let parsed = match maybe {
            Some(value) => value,
            None => panic!("lexer returned none at {}", lexer.loc),
        };
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        parsed
    }

    #[test]
    fn test_lexer_int_lit() {
        let parsed = lex_opt("10", |lexer| lexer.next_int_lit_opt());
        assert_eq!(10, parsed);
    }

    #[test]
    fn test_lexer_float_lit() {
        let parsed = lex("12.3", |lexer| lexer.next_token_inner());
        assert_eq!(Token::FloatLit(12.3), parsed);
    }

    #[test]
    fn test_lexer_float_lit_leading_zeros_in_exp() {
        let parsed = lex("1e00009", |lexer| lexer.next_token_inner());
        assert_eq!(Token::FloatLit(1_000_000_000.0), parsed);
    }
}