// protobuf_ast_parser/lexer.rs

//! Tokenization for Protocol Buffers sources.
//!
//! # Examples
//! ```rust
//! use protobuf_ast_parser::lexer::{Lexer, Token};
//!
//! let mut lexer = Lexer::new("syntax = \"proto3\";");
//! let first = lexer.next().unwrap().unwrap();
//! assert_eq!(first.1, Token::Syntax);
//! ```

12use logos::{Logos, Span};
13use std::num::{IntErrorKind, ParseIntError};
14
/// Categories of lexical errors produced by [`Lexer`].
#[derive(Default, Debug, Clone, PartialEq)]
pub enum LexicalErrorKind {
    /// Input that matched no token rule (logos' default error variant).
    #[default]
    InvalidToken,
    /// An integer literal that failed to parse as `i64` (e.g. overflow).
    InvalidInteger(ParseIntError),
}
22
23impl From<ParseIntError> for LexicalErrorKind {
24    fn from(value: ParseIntError) -> Self {
25        Self::InvalidInteger(value)
26    }
27}
28
/// Error emitted when the lexer cannot produce a valid token.
#[derive(Debug, Clone, PartialEq)]
pub struct LexicalError<'a> {
    // What went wrong (unknown token or unparseable integer).
    kind: LexicalErrorKind,
    // The full source text, retained so `Display` can compute line/column.
    input: &'a str,
    // Byte range of the offending lexeme within `input`.
    span: Span,
}
36
37impl<'a> std::fmt::Display for LexicalError<'a> {
38    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
39        let line = self.input[..self.span.start]
40            .chars()
41            .filter(|&ch| ch == '\n')
42            .count()
43            + 1;
44
45        let column = self.span.start - self.input[..self.span.start].rfind("\n").unwrap_or(0);
46
47        let position = format!("line {}, column {}", line, column);
48
49        match &self.kind {
50            LexicalErrorKind::InvalidToken => write!(
51                f,
52                "Invalid token \"{}\" at {}",
53                &self.input[self.span.start..self.span.end],
54                position
55            )?,
56            LexicalErrorKind::InvalidInteger(inner) => write!(
57                f,
58                "Invalid number {} at {}: {}",
59                &self.input[self.span.start..self.span.end],
60                position,
61                match inner.kind() {
62                    IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => "overflow",
63                    _ => "unknown",
64                }
65            )?,
66        };
67
68        Ok(())
69    }
70}
71
72fn string_from_lexer<'a>(lex: &mut logos::Lexer<'a, Token<'a>>) -> &'a str {
73    let slice = lex.slice();
74    &slice[1..slice.len() - 1]
75}
76
/// Token kinds produced by the lexer.
#[derive(Clone, Debug, PartialEq, Logos)]
#[logos(error = LexicalErrorKind)]
// Whitespace is skipped and never reaches the token stream.
#[logos(skip r"[\s\t\n\f]+")]
pub enum Token<'a> {
    // A `//` comment running to end of line; the slice includes the `//`.
    // NOTE(review): `allow_greedy` is not a documented logos regex option —
    // confirm the logos version in Cargo.toml accepts it.
    #[regex(r"//.*", allow_greedy = true)]
    SingleLineComment(&'a str),

    // A `/* ... */` comment; the classic non-nesting C-style comment
    // matcher. The slice includes the delimiters.
    #[regex(r"\/\*[^*]*\*+(?:[^\/*][^*]*\*+)*\/")]
    MultiLineComment(&'a str),

    // --- Punctuation ---

    #[token("=")]
    Eq,

    #[token(":")]
    Colon,

    #[token(";")]
    Semicolon,

    #[token(",")]
    Comma,

    #[token(".")]
    Period,

    #[token("(")]
    OpenPth,

    #[token(")")]
    ClosePth,

    #[token("[")]
    OpenBracket,

    #[token("]")]
    CloseBracket,

    #[token("{")]
    OpenBrace,

    #[token("}")]
    CloseBrace,

    #[token("<")]
    OpenAngle,

    #[token(">")]
    CloseAngle,

    // --- Literals ---

    // Boolean keywords mapped directly to their value.
    #[token("true", |_| true)]
    #[token("false", |_| false)]
    Boolean(bool),

    // Decimal (optionally negative) or `0x`-prefixed hexadecimal integers.
    // A value that does not fit in `i64` (e.g. `0xFFFFFFFFFFFFFFFF`) makes
    // the callback return Err, surfaced as `LexicalErrorKind::InvalidInteger`.
    #[regex(r"-?[0-9]+", |lex| lex.slice().parse())]
    #[regex(r"0x[0-9a-fA-F]{1,16}", |lex| i64::from_str_radix(&lex.slice()[2..], 16))]
    Integer(i64),

    // --- Keywords (declared before Ident so they win by priority) ---

    #[token("to")]
    To,

    #[token("max")]
    Max,

    #[token("syntax")]
    Syntax,

    #[token("option")]
    Option,

    #[token("package")]
    Package,

    #[token("import")]
    Import,

    #[token("service")]
    Service,

    #[token("rpc")]
    Rpc,

    #[token("stream")]
    Stream,

    #[token("returns")]
    Returns,

    #[token("message")]
    Message,

    #[token("oneof")]
    OneOf,

    #[token("extend")]
    Extend,

    #[token("enum")]
    Enum,

    #[token("reserved")]
    Reserved,

    #[token("extensions")]
    Extensions,

    #[token("optional")]
    Optional,

    #[token("required")]
    Required,

    #[token("repeated")]
    Repeated,

    #[token("map")]
    Map,

    // Single- or double-quoted string literal; no embedded newlines.
    // `string_from_lexer` strips only the quotes — escape sequences are
    // NOT decoded here.
    #[regex(r#"'((?:[^'\n]|(?:\\\'))*)'"#, string_from_lexer)]
    #[regex(r#""((?:[^"\n]|(?:\\\"))*)""#, string_from_lexer)]
    String(&'a str),

    // Identifier; `priority = 0` so every keyword token above beats it.
    #[regex(r"[a-zA-Z_][a-zA-Z_0-9]*", priority = 0)]
    Ident(&'a str),
}
202
203impl<'a> std::fmt::Display for Token<'a> {
204    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
205        write!(f, "{:?}", self)
206    }
207}
208
/// Streaming lexer that yields spanned tokens.
pub struct Lexer<'input> {
    // Underlying logos iterator yielding `(Result<Token, _>, Span)` pairs.
    inner: logos::SpannedIter<'input, Token<'input>>,
}
213
214impl<'input> Lexer<'input> {
215    pub fn new(src: &'input str) -> Self {
216        Self {
217            inner: Token::lexer(src).spanned(),
218        }
219    }
220}
221
/// LALRPOP-compatible spanned token wrapper.
///
/// `Ok((start, token, end))` on success (byte offsets into the source),
/// `Err(error)` when the lexer hit invalid input.
pub type Spanned<Tok, Loc, Error> = Result<(Loc, Tok, Loc), Error>;
224
225impl<'input> Iterator for Lexer<'input> {
226    type Item = Spanned<Token<'input>, usize, LexicalError<'input>>;
227
228    fn next(&mut self) -> Option<Self::Item> {
229        let (tok, span) = self.inner.next()?;
230
231        Some(
232            tok.map(|tok| (span.start, tok, span.end))
233                .map_err(|kind| LexicalError {
234                    kind,
235                    input: self.inner.source(),
236                    span,
237                }),
238        )
239    }
240}