protobuf_support2/lexer/
lexer_impl.rs

1use std::char;
2use std::convert::TryFrom;
3use std::num::ParseFloatError;
4use std::num::ParseIntError;
5
6use crate::lexer::float;
7use crate::lexer::float::ProtobufFloatParseError;
8use crate::lexer::json_number_lit::JsonNumberLit;
9use crate::lexer::loc::Loc;
10use crate::lexer::loc::FIRST_COL;
11use crate::lexer::parser_language::ParserLanguage;
12use crate::lexer::str_lit::StrLit;
13use crate::lexer::str_lit::StrLitDecodeError;
14use crate::lexer::token::Token;
15use crate::lexer::token::TokenWithLocation;
16
17#[derive(Debug, thiserror::Error)]
18pub enum LexerError {
19    // TODO: something better than this
20    #[error("Incorrect input")]
21    IncorrectInput,
22    #[error("Unexpected EOF")]
23    UnexpectedEof,
24    #[error("Expecting char: {:?}", .0)]
25    ExpectChar(char),
26    #[error("Parse int error")]
27    ParseIntError,
28    #[error("Parse float error")]
29    ParseFloatError,
30    // TODO: how it is different from ParseFloatError?
31    #[error("Incorrect float literal")]
32    IncorrectFloatLit,
33    #[error("Incorrect JSON escape")]
34    IncorrectJsonEscape,
35    #[error("Incorrect JSON number")]
36    IncorrectJsonNumber,
37    #[error("Incorrect Unicode character")]
38    IncorrectUnicodeChar,
39    #[error("Expecting hex digit")]
40    ExpectHexDigit,
41    #[error("Expecting oct digit")]
42    ExpectOctDigit,
43    #[error("Expecting dec digit")]
44    ExpectDecDigit,
45    #[error(transparent)]
46    StrLitDecodeError(#[from] StrLitDecodeError),
47    #[error("Expecting identifier")]
48    ExpectedIdent,
49}
50
/// Shorthand for a `Result` whose error type is [`LexerError`].
pub type LexerResult<T> = Result<T, LexerError>;
52
53impl From<ParseIntError> for LexerError {
54    fn from(_: ParseIntError) -> Self {
55        LexerError::ParseIntError
56    }
57}
58
59impl From<ParseFloatError> for LexerError {
60    fn from(_: ParseFloatError) -> Self {
61        LexerError::ParseFloatError
62    }
63}
64
65impl From<ProtobufFloatParseError> for LexerError {
66    fn from(_: ProtobufFloatParseError) -> Self {
67        LexerError::IncorrectFloatLit
68    }
69}
70
/// Cursor over an input string that splits it into tokens.
///
/// The struct is `Copy`; the implementation relies on this for lookahead:
/// it advances a copy and writes it back only when the attempt succeeds.
#[derive(Copy, Clone)]
pub struct Lexer<'a> {
    /// Which syntax is being lexed; selects comment and number handling.
    language: ParserLanguage,
    /// The complete text being tokenized.
    input: &'a str,
    /// Byte offset into `input` of the next unread character.
    pos: usize,
    /// Line and column of the next unread character.
    pub loc: Loc,
}
78
/// Returns `true` if `c` is an underscore or an alphabetic character
/// (as defined by `char::is_alphabetic`, so not limited to ASCII).
fn is_letter(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphabetic(),
    }
}
82
83impl<'a> Lexer<'a> {
84    pub fn new(input: &'a str, language: ParserLanguage) -> Lexer<'a> {
85        Lexer {
86            language,
87            input,
88            pos: 0,
89            loc: Loc::start(),
90        }
91    }
92
93    /// No more chars
94    pub fn eof(&self) -> bool {
95        self.pos == self.input.len()
96    }
97
98    /// Remaining chars
99    fn rem_chars(&self) -> &'a str {
100        &self.input[self.pos..]
101    }
102
103    pub fn lookahead_char_is<P: FnOnce(char) -> bool>(&self, p: P) -> bool {
104        self.lookahead_char().map_or(false, p)
105    }
106
107    fn lookahead_char_is_in(&self, alphabet: &str) -> bool {
108        self.lookahead_char_is(|c| alphabet.contains(c))
109    }
110
111    fn next_char_opt(&mut self) -> Option<char> {
112        let rem = self.rem_chars();
113        if rem.is_empty() {
114            None
115        } else {
116            let mut char_indices = rem.char_indices();
117            let (_, c) = char_indices.next().unwrap();
118            let c_len = char_indices.next().map(|(len, _)| len).unwrap_or(rem.len());
119            self.pos += c_len;
120            if c == '\n' {
121                self.loc.line += 1;
122                self.loc.col = FIRST_COL;
123            } else {
124                self.loc.col += 1;
125            }
126            Some(c)
127        }
128    }
129
130    fn next_char(&mut self) -> LexerResult<char> {
131        self.next_char_opt().ok_or(LexerError::UnexpectedEof)
132    }
133
134    /// Skip whitespaces
135    fn skip_whitespaces(&mut self) {
136        self.take_while(|c| c.is_whitespace());
137    }
138
139    fn skip_c_comment(&mut self) -> LexerResult<()> {
140        if self.skip_if_lookahead_is_str("/*") {
141            let end = "*/";
142            match self.rem_chars().find(end) {
143                None => Err(LexerError::UnexpectedEof),
144                Some(len) => {
145                    let new_pos = self.pos + len + end.len();
146                    self.skip_to_pos(new_pos);
147                    Ok(())
148                }
149            }
150        } else {
151            Ok(())
152        }
153    }
154
155    fn skip_cpp_comment(&mut self) {
156        if self.skip_if_lookahead_is_str("//") {
157            loop {
158                match self.next_char_opt() {
159                    Some('\n') | None => break,
160                    _ => {}
161                }
162            }
163        }
164    }
165
166    fn skip_sh_comment(&mut self) {
167        if self.skip_if_lookahead_is_str("#") {
168            loop {
169                match self.next_char_opt() {
170                    Some('\n') | None => break,
171                    _ => {}
172                }
173            }
174        }
175    }
176
177    fn skip_comment(&mut self) -> LexerResult<()> {
178        match self.language {
179            ParserLanguage::Proto => {
180                self.skip_c_comment()?;
181                self.skip_cpp_comment();
182            }
183            ParserLanguage::TextFormat => {
184                self.skip_sh_comment();
185            }
186            ParserLanguage::Json => {}
187        }
188        Ok(())
189    }
190
191    pub fn skip_ws(&mut self) -> LexerResult<()> {
192        loop {
193            let pos = self.pos;
194            self.skip_whitespaces();
195            self.skip_comment()?;
196            if pos == self.pos {
197                // Did not advance
198                return Ok(());
199            }
200        }
201    }
202
203    pub fn take_while<F>(&mut self, f: F) -> &'a str
204    where
205        F: Fn(char) -> bool,
206    {
207        let start = self.pos;
208        while self.lookahead_char().map(&f) == Some(true) {
209            self.next_char_opt().unwrap();
210        }
211        let end = self.pos;
212        &self.input[start..end]
213    }
214
215    fn lookahead_char(&self) -> Option<char> {
216        self.clone().next_char_opt()
217    }
218
219    fn lookahead_is_str(&self, s: &str) -> bool {
220        self.rem_chars().starts_with(s)
221    }
222
223    fn skip_if_lookahead_is_str(&mut self, s: &str) -> bool {
224        if self.lookahead_is_str(s) {
225            let new_pos = self.pos + s.len();
226            self.skip_to_pos(new_pos);
227            true
228        } else {
229            false
230        }
231    }
232
233    fn next_char_if<P>(&mut self, p: P) -> Option<char>
234    where
235        P: FnOnce(char) -> bool,
236    {
237        let mut clone = self.clone();
238        match clone.next_char_opt() {
239            Some(c) if p(c) => {
240                *self = clone;
241                Some(c)
242            }
243            _ => None,
244        }
245    }
246
247    pub fn next_char_if_eq(&mut self, expect: char) -> bool {
248        self.next_char_if(|c| c == expect) != None
249    }
250
251    fn next_char_if_in(&mut self, alphabet: &str) -> Option<char> {
252        for c in alphabet.chars() {
253            if self.next_char_if_eq(c) {
254                return Some(c);
255            }
256        }
257        None
258    }
259
260    fn next_char_expect_eq(&mut self, expect: char) -> LexerResult<()> {
261        if self.next_char_if_eq(expect) {
262            Ok(())
263        } else {
264            Err(LexerError::ExpectChar(expect))
265        }
266    }
267
268    fn next_char_expect<P>(&mut self, expect: P, err: LexerError) -> LexerResult<char>
269    where
270        P: FnOnce(char) -> bool,
271    {
272        self.next_char_if(expect).ok_or(err)
273    }
274
275    // str functions
276
277    /// properly update line and column
278    fn skip_to_pos(&mut self, new_pos: usize) -> &'a str {
279        assert!(new_pos >= self.pos);
280        assert!(new_pos <= self.input.len());
281        let pos = self.pos;
282        while self.pos != new_pos {
283            self.next_char_opt().unwrap();
284        }
285        &self.input[pos..new_pos]
286    }
287
288    // Protobuf grammar
289
290    // char functions
291
292    // letter = "A" … "Z" | "a" … "z"
293    // https://github.com/google/protobuf/issues/4565
294    fn next_letter_opt(&mut self) -> Option<char> {
295        self.next_char_if(is_letter)
296    }
297
298    // capitalLetter =  "A" … "Z"
299    fn _next_capital_letter_opt(&mut self) -> Option<char> {
300        self.next_char_if(|c| c >= 'A' && c <= 'Z')
301    }
302
303    fn next_ident_part(&mut self) -> Option<char> {
304        self.next_char_if(|c| c.is_ascii_alphanumeric() || c == '_')
305    }
306
307    // Identifiers
308
309    // ident = letter { letter | decimalDigit | "_" }
310    fn next_ident_opt(&mut self) -> LexerResult<Option<String>> {
311        if let Some(c) = self.next_letter_opt() {
312            let mut ident = String::new();
313            ident.push(c);
314            while let Some(c) = self.next_ident_part() {
315                ident.push(c);
316            }
317            Ok(Some(ident))
318        } else {
319            Ok(None)
320        }
321    }
322
323    // Integer literals
324
325    // hexLit     = "0" ( "x" | "X" ) hexDigit { hexDigit }
326    fn next_hex_lit_opt(&mut self) -> LexerResult<Option<u64>> {
327        Ok(
328            if self.skip_if_lookahead_is_str("0x") || self.skip_if_lookahead_is_str("0X") {
329                let s = self.take_while(|c| c.is_ascii_hexdigit());
330                Some(u64::from_str_radix(s, 16)? as u64)
331            } else {
332                None
333            },
334        )
335    }
336
337    // decimalLit = ( "1" … "9" ) { decimalDigit }
338    // octalLit   = "0" { octalDigit }
339    fn next_decimal_octal_lit_opt(&mut self) -> LexerResult<Option<u64>> {
340        // do not advance on number parse error
341        let mut clone = self.clone();
342
343        let pos = clone.pos;
344
345        Ok(if clone.next_char_if(|c| c.is_ascii_digit()) != None {
346            clone.take_while(|c| c.is_ascii_digit());
347            let value = clone.input[pos..clone.pos].parse()?;
348            *self = clone;
349            Some(value)
350        } else {
351            None
352        })
353    }
354
355    // hexDigit     = "0" … "9" | "A" … "F" | "a" … "f"
356    fn next_hex_digit(&mut self) -> LexerResult<u32> {
357        let mut clone = self.clone();
358        let r = match clone.next_char()? {
359            c if c >= '0' && c <= '9' => c as u32 - b'0' as u32,
360            c if c >= 'A' && c <= 'F' => c as u32 - b'A' as u32 + 10,
361            c if c >= 'a' && c <= 'f' => c as u32 - b'a' as u32 + 10,
362            _ => return Err(LexerError::ExpectHexDigit),
363        };
364        *self = clone;
365        Ok(r)
366    }
367
368    // octalDigit   = "0" … "7"
369    fn next_octal_digit(&mut self) -> LexerResult<u32> {
370        self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectOctDigit)
371            .map(|c| c as u32 - '0' as u32)
372    }
373
374    // decimalDigit = "0" … "9"
375    fn next_decimal_digit(&mut self) -> LexerResult<u32> {
376        self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectDecDigit)
377            .map(|c| c as u32 - '0' as u32)
378    }
379
380    // decimals  = decimalDigit { decimalDigit }
381    fn next_decimal_digits(&mut self) -> LexerResult<()> {
382        self.next_decimal_digit()?;
383        self.take_while(|c| c >= '0' && c <= '9');
384        Ok(())
385    }
386
387    // intLit     = decimalLit | octalLit | hexLit
388    pub fn next_int_lit_opt(&mut self) -> LexerResult<Option<u64>> {
389        assert_ne!(ParserLanguage::Json, self.language);
390
391        self.skip_ws()?;
392        if let Some(i) = self.next_hex_lit_opt()? {
393            return Ok(Some(i));
394        }
395        if let Some(i) = self.next_decimal_octal_lit_opt()? {
396            return Ok(Some(i));
397        }
398        Ok(None)
399    }
400
401    // Floating-point literals
402
403    // exponent  = ( "e" | "E" ) [ "+" | "-" ] decimals
404    fn next_exponent_opt(&mut self) -> LexerResult<Option<()>> {
405        if self.next_char_if_in("eE") != None {
406            self.next_char_if_in("+-");
407            self.next_decimal_digits()?;
408            Ok(Some(()))
409        } else {
410            Ok(None)
411        }
412    }
413
414    // floatLit = ( decimals "." [ decimals ] [ exponent ] | decimals exponent | "."decimals [ exponent ] ) | "inf" | "nan"
415    fn next_float_lit(&mut self) -> LexerResult<()> {
416        assert_ne!(ParserLanguage::Json, self.language);
417
418        // "inf" and "nan" are handled as part of ident
419        if self.next_char_if_eq('.') {
420            self.next_decimal_digits()?;
421            self.next_exponent_opt()?;
422        } else {
423            self.next_decimal_digits()?;
424            if self.next_char_if_eq('.') {
425                self.next_decimal_digits()?;
426                self.next_exponent_opt()?;
427            } else {
428                if self.next_exponent_opt()? == None {
429                    return Err(LexerError::IncorrectFloatLit);
430                }
431            }
432        }
433        Ok(())
434    }
435
436    // String literals
437
438    // charValue = hexEscape | octEscape | charEscape | /[^\0\n\\]/
439    // hexEscape = '\' ( "x" | "X" ) hexDigit hexDigit
440    // https://github.com/google/protobuf/issues/4560
441    // octEscape = '\' octalDigit octalDigit octalDigit
442    // charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' )
443    // quote = "'" | '"'
444    pub fn next_byte_value(&mut self) -> LexerResult<u8> {
445        match self.next_char()? {
446            '\\' => {
447                match self.next_char()? {
448                    '\'' => Ok(b'\''),
449                    '"' => Ok(b'"'),
450                    '\\' => Ok(b'\\'),
451                    'a' => Ok(b'\x07'),
452                    'b' => Ok(b'\x08'),
453                    'f' => Ok(b'\x0c'),
454                    'n' => Ok(b'\n'),
455                    'r' => Ok(b'\r'),
456                    't' => Ok(b'\t'),
457                    'v' => Ok(b'\x0b'),
458                    'x' => {
459                        let d1 = self.next_hex_digit()? as u8;
460                        let d2 = self.next_hex_digit()? as u8;
461                        Ok(((d1 << 4) | d2) as u8)
462                    }
463                    d if d >= '0' && d <= '7' => {
464                        let mut r = d as u8 - b'0';
465                        for _ in 0..2 {
466                            match self.next_octal_digit() {
467                                Err(_) => break,
468                                Ok(d) => r = (r << 3) + d as u8,
469                            }
470                        }
471                        Ok(r)
472                    }
473                    // https://github.com/google/protobuf/issues/4562
474                    // TODO: overflow
475                    c => Ok(c as u8),
476                }
477            }
478            '\n' | '\0' => Err(LexerError::IncorrectInput),
479            // TODO: check overflow
480            c => Ok(c as u8),
481        }
482    }
483
484    fn char_try_from(i: u32) -> LexerResult<char> {
485        char::try_from(i).map_err(|_| LexerError::IncorrectUnicodeChar)
486    }
487
488    pub fn next_json_char_value(&mut self) -> LexerResult<char> {
489        match self.next_char()? {
490            '\\' => match self.next_char()? {
491                '"' => Ok('"'),
492                '\'' => Ok('\''),
493                '\\' => Ok('\\'),
494                '/' => Ok('/'),
495                'b' => Ok('\x08'),
496                'f' => Ok('\x0c'),
497                'n' => Ok('\n'),
498                'r' => Ok('\r'),
499                't' => Ok('\t'),
500                'u' => {
501                    let mut v = 0;
502                    for _ in 0..4 {
503                        let digit = self.next_hex_digit()?;
504                        v = v * 16 + digit;
505                    }
506                    Self::char_try_from(v)
507                }
508                _ => Err(LexerError::IncorrectJsonEscape),
509            },
510            c => Ok(c),
511        }
512    }
513
514    // https://github.com/google/protobuf/issues/4564
515    // strLit = ( "'" { charValue } "'" ) | ( '"' { charValue } '"' )
516    fn next_str_lit_raw(&mut self) -> LexerResult<String> {
517        let mut raw = String::new();
518
519        let mut first = true;
520        loop {
521            if !first {
522                self.skip_ws()?;
523            }
524
525            let start = self.pos;
526
527            let q = match self.next_char_if_in("'\"") {
528                Some(q) => q,
529                None if !first => break,
530                None => return Err(LexerError::IncorrectInput),
531            };
532            first = false;
533            while self.lookahead_char() != Some(q) {
534                self.next_byte_value()?;
535            }
536            self.next_char_expect_eq(q)?;
537
538            raw.push_str(&self.input[start + 1..self.pos - 1]);
539        }
540        Ok(raw)
541    }
542
543    fn next_str_lit_raw_opt(&mut self) -> LexerResult<Option<String>> {
544        if self.lookahead_char_is_in("'\"") {
545            Ok(Some(self.next_str_lit_raw()?))
546        } else {
547            Ok(None)
548        }
549    }
550
551    /// Parse next token as JSON number
552    fn next_json_number_opt(&mut self) -> LexerResult<Option<JsonNumberLit>> {
553        assert_eq!(ParserLanguage::Json, self.language);
554
555        fn is_digit(c: char) -> bool {
556            c >= '0' && c <= '9'
557        }
558
559        fn is_digit_1_9(c: char) -> bool {
560            c >= '1' && c <= '9'
561        }
562
563        if !self.lookahead_char_is_in("-0123456789") {
564            return Ok(None);
565        }
566
567        let mut s = String::new();
568        if self.next_char_if_eq('-') {
569            s.push('-');
570        }
571
572        if self.next_char_if_eq('0') {
573            s.push('0');
574        } else {
575            s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?);
576            while let Some(c) = self.next_char_if(is_digit) {
577                s.push(c);
578            }
579        }
580
581        if self.next_char_if_eq('.') {
582            s.push('.');
583            s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
584            while let Some(c) = self.next_char_if(is_digit) {
585                s.push(c);
586            }
587        }
588
589        if let Some(c) = self.next_char_if_in("eE") {
590            s.push(c);
591            if let Some(c) = self.next_char_if_in("+-") {
592                s.push(c);
593            }
594            s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
595            while let Some(c) = self.next_char_if(is_digit) {
596                s.push(c);
597            }
598        }
599
600        Ok(Some(JsonNumberLit(s)))
601    }
602
603    fn next_token_inner(&mut self) -> LexerResult<Token> {
604        if self.language == ParserLanguage::Json {
605            if let Some(v) = self.next_json_number_opt()? {
606                return Ok(Token::JsonNumber(v));
607            }
608        }
609
610        if let Some(ident) = self.next_ident_opt()? {
611            let token = if self.language != ParserLanguage::Json && ident == float::PROTOBUF_NAN {
612                Token::FloatLit(f64::NAN)
613            } else if self.language != ParserLanguage::Json && ident == float::PROTOBUF_INF {
614                Token::FloatLit(f64::INFINITY)
615            } else {
616                Token::Ident(ident.to_owned())
617            };
618            return Ok(token);
619        }
620
621        if self.language != ParserLanguage::Json {
622            let mut clone = self.clone();
623            let pos = clone.pos;
624            if let Ok(_) = clone.next_float_lit() {
625                let f = float::parse_protobuf_float(&self.input[pos..clone.pos])?;
626                *self = clone;
627                return Ok(Token::FloatLit(f));
628            }
629
630            if let Some(lit) = self.next_int_lit_opt()? {
631                return Ok(Token::IntLit(lit));
632            }
633        }
634
635        if let Some(escaped) = self.next_str_lit_raw_opt()? {
636            return Ok(Token::StrLit(StrLit { escaped }));
637        }
638
639        // This branch must be after str lit
640        if let Some(c) = self.next_char_if(|c| c.is_ascii_punctuation()) {
641            return Ok(Token::Symbol(c));
642        }
643
644        if let Some(ident) = self.next_ident_opt()? {
645            return Ok(Token::Ident(ident));
646        }
647
648        Err(LexerError::IncorrectInput)
649    }
650
651    pub fn next_token(&mut self) -> LexerResult<Option<TokenWithLocation>> {
652        self.skip_ws()?;
653        let loc = self.loc;
654
655        Ok(if self.eof() {
656            None
657        } else {
658            let token = self.next_token_inner()?;
659            // Skip whitespace here to update location
660            // to the beginning of the next token
661            self.skip_ws()?;
662            Some(TokenWithLocation { token, loc })
663        })
664    }
665}
666
#[cfg(test)]
mod test {
    use super::*;

    /// Runs `parse_what` on a `Proto` lexer over `input`, panicking if it
    /// fails or leaves input unconsumed.
    fn lex<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<R>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let parsed = match parse_what(&mut lexer) {
            Ok(value) => value,
            Err(e) => panic!("lexer failed at {}: {:?}", lexer.loc, e),
        };
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        parsed
    }

    /// Like `lex`, but for parsers that return an `Option`; additionally
    /// panics when the parser yields `None`.
    fn lex_opt<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<Option<R>>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let maybe = match parse_what(&mut lexer) {
            Ok(value) => value,
            Err(e) => panic!("lexer failed at {}: {:?}", lexer.loc, e),
        };
        let parsed = match maybe {
            Some(value) => value,
            None => panic!("lexer returned none at {}", lexer.loc),
        };
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        parsed
    }

    #[test]
    fn test_lexer_int_lit() {
        let value = lex_opt("10", |lexer| lexer.next_int_lit_opt());
        assert_eq!(10, value);
    }

    #[test]
    fn test_lexer_float_lit() {
        let token = lex("12.3", |lexer| lexer.next_token_inner());
        assert_eq!(Token::FloatLit(12.3), token);
    }

    #[test]
    fn test_lexer_float_lit_leading_zeros_in_exp() {
        let token = lex("1e00009", |lexer| lexer.next_token_inner());
        assert_eq!(Token::FloatLit(1_000_000_000.0), token);
    }
}
710        assert_eq!(Token::FloatLit(1_000_000_000.0), mess);
711    }
712}