1use crate::lexer::lexer_impl::Lexer;
2use crate::lexer::lexer_impl::LexerError;
3use crate::lexer::loc::Loc;
4use crate::lexer::parser_language::ParserLanguage;
5use crate::lexer::str_lit::StrLit;
6use crate::lexer::str_lit::StrLitDecodeError;
7use crate::lexer::token::Token;
8use crate::lexer::token::TokenWithLocation;
9
/// Error produced while turning lexer output into a token stream.
#[derive(Debug, thiserror::Error)]
pub enum TokenizerError {
    /// Error propagated from the underlying [`Lexer`].
    #[error(transparent)]
    LexerError(#[from] LexerError),
    /// A string literal could not be decoded.
    #[error(transparent)]
    StrLitDecodeError(#[from] StrLitDecodeError),
    /// Invariant violation inside the tokenizer itself (e.g. `advance`
    /// called without a buffered lookahead token) — not a user input error.
    #[error("Internal tokenizer error")]
    InternalError,
    /// Catch-all for malformed input.
    #[error("Incorrect input")]
    IncorrectInput,
    /// A construct (named by the payload) appeared where the grammar
    /// forbids it.
    #[error("Not allowed in this context: {0}")]
    NotAllowedInThisContext(&'static str),
    /// Input ended while more tokens were expected.
    #[error("Unexpected end of input")]
    UnexpectedEof,
    /// Expected a string literal, found something else.
    #[error("Expecting string literal")]
    ExpectStrLit,
    /// Expected an integer literal, found something else.
    #[error("Expecting int literal")]
    ExpectIntLit,
    /// Expected a float literal, found something else.
    #[error("Expecting float literal")]
    ExpectFloatLit,
    /// Expected any identifier, found something else.
    #[error("Expecting identifier")]
    ExpectIdent,
    /// Expected one specific identifier (the payload), found something else.
    #[error("Expecting identifier `{}`", .0)]
    ExpectNamedIdent(String),
    /// Expected a specific character (`.0`) while parsing the construct
    /// described by `.1`.
    #[error("While parsing {}, expecting char `{}`", .1, .0)]
    ExpectChar(char, &'static str),
    /// Expected one of several characters.
    #[error("Expecting any char of: {}", .0.iter().map(|c| format!("`{}`", c)).collect::<Vec<_>>().join(", "))]
    ExpectAnyChar(Vec<char>),
}
40
/// Result type used by all tokenizer operations.
pub type TokenizerResult<R> = Result<R, TokenizerError>;

/// A token stream with single-token lookahead, layered over [`Lexer`].
///
/// `Clone` is cheap enough that callers use a clone to peek without
/// disturbing this tokenizer's state.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    // Underlying lexer producing tokens from the input text.
    lexer: Lexer<'a>,
    // One-token lookahead buffer: `Some` when a token has been peeked
    // but not yet consumed.
    next_token: Option<TokenWithLocation>,
    // Location of the most recently lexed token; `loc()` falls back to
    // this when the lookahead buffer is empty.
    last_token_loc: Option<Loc>,
}
49
50impl<'a> Tokenizer<'a> {
51 pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> {
52 Tokenizer {
53 lexer: Lexer::new(input, comment_style),
54 next_token: None,
55 last_token_loc: None,
56 }
57 }
58
59 pub fn loc(&self) -> Loc {
60 self.next_token
62 .as_ref()
63 .map(|t| t.loc.clone())
64 .or(self.last_token_loc.clone())
66 .unwrap_or(self.lexer.loc)
68 }
69
70 pub fn lookahead_loc(&mut self) -> Loc {
71 drop(self.lookahead());
72 self.loc()
74 }
75
76 fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> {
77 Ok(match self.next_token {
78 Some(ref token) => Some(&token.token),
79 None => {
80 self.next_token = self.lexer.next_token()?;
81 self.last_token_loc = self.next_token.as_ref().map(|t| t.loc.clone());
82 match self.next_token {
83 Some(ref token) => Some(&token.token),
84 None => None,
85 }
86 }
87 })
88 }
89
90 pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> {
91 match self.lookahead()? {
92 Some(token) => Ok(token),
93 None => Err(TokenizerError::UnexpectedEof),
94 }
95 }
96
97 fn next(&mut self) -> TokenizerResult<Option<Token>> {
98 self.lookahead()?;
99 Ok(self
100 .next_token
101 .take()
102 .map(|TokenWithLocation { token, .. }| token))
103 }
104
105 pub fn next_some(&mut self) -> TokenizerResult<Token> {
106 match self.next()? {
107 Some(token) => Ok(token),
108 None => Err(TokenizerError::UnexpectedEof),
109 }
110 }
111
112 pub fn advance(&mut self) -> TokenizerResult<Token> {
114 self.next_token
115 .take()
116 .map(|TokenWithLocation { token, .. }| token)
117 .ok_or(TokenizerError::InternalError)
118 }
119
120 pub fn syntax_eof(&mut self) -> TokenizerResult<bool> {
122 Ok(self.lookahead()?.is_none())
123 }
124
125 pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>>
126 where
127 P: FnOnce(&Token) -> Option<R>,
128 {
129 self.lookahead()?;
130 let v = match self.next_token {
131 Some(ref token) => match p(&token.token) {
132 Some(v) => v,
133 None => return Ok(None),
134 },
135 _ => return Ok(None),
136 };
137 self.next_token = None;
138 Ok(Some(v))
139 }
140
141 pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E>
142 where
143 P: FnOnce(&Token) -> Result<R, E>,
144 E: From<TokenizerError>,
145 {
146 self.lookahead()?;
147 let r = match self.next_token {
148 Some(ref token) => p(&token.token)?,
149 None => return Err(TokenizerError::UnexpectedEof.into()),
150 };
151 self.next_token = None;
152 Ok(r)
153 }
154
155 fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>>
156 where
157 P: FnOnce(&Token) -> bool,
158 {
159 self.next_token_if_map(|token| if p(token) { Some(token.clone()) } else { None })
160 }
161
162 pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> {
163 let v = match self.lookahead()? {
164 Some(&Token::Ident(ref next)) => {
165 if idents.into_iter().find(|&i| i == next).is_some() {
166 next.clone()
167 } else {
168 return Ok(None);
169 }
170 }
171 _ => return Ok(None),
172 };
173 self.advance()?;
174 Ok(Some(v))
175 }
176
177 pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> {
178 Ok(self.next_ident_if_in(&[word])? != None)
179 }
180
181 pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> {
182 if self.next_ident_if_eq(word)? {
183 Ok(())
184 } else {
185 Err(TokenizerError::ExpectNamedIdent(word.to_owned()))
186 }
187 }
188
189 pub fn next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()> {
190 if self.clone().next_ident_if_eq(word)? {
191 return Err(TokenizerError::NotAllowedInThisContext(word));
193 }
194 Ok(())
195 }
196
197 pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> {
198 Ok(self.next_token_if(|token| match token {
199 &Token::Symbol(c) if c == symbol => true,
200 _ => false,
201 })? != None)
202 }
203
204 pub fn next_symbol_expect_eq(
205 &mut self,
206 symbol: char,
207 desc: &'static str,
208 ) -> TokenizerResult<()> {
209 if self.lookahead_is_symbol(symbol)? {
210 self.advance()?;
211 Ok(())
212 } else {
213 Err(TokenizerError::ExpectChar(symbol, desc))
214 }
215 }
216
217 pub fn next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char> {
218 for symbol in symbols {
219 if let Ok(()) = self.next_symbol_expect_eq(*symbol, "ignored") {
220 return Ok(*symbol);
221 }
222 }
223 Err(TokenizerError::ExpectAnyChar(symbols.to_owned()))
224 }
225
226 pub fn lookahead_is_str_lit(&mut self) -> TokenizerResult<bool> {
227 Ok(match self.lookahead()? {
228 Some(&Token::StrLit(..)) => true,
229 _ => false,
230 })
231 }
232
233 pub fn lookahead_is_int_lit(&mut self) -> TokenizerResult<bool> {
234 Ok(match self.lookahead()? {
235 Some(&Token::IntLit(..)) => true,
236 _ => false,
237 })
238 }
239
240 pub fn lookahead_is_json_number(&mut self) -> TokenizerResult<bool> {
241 Ok(match self.lookahead()? {
242 Some(&Token::JsonNumber(..)) => true,
243 _ => false,
244 })
245 }
246
247 pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> {
248 Ok(match self.lookahead()? {
249 Some(&Token::Symbol(c)) => Some(c),
250 _ => None,
251 })
252 }
253
254 pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> {
255 Ok(self.lookahead_if_symbol()? == Some(symbol))
256 }
257
258 pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> {
259 Ok(match self.lookahead()? {
260 Some(Token::Ident(i)) => i == ident,
261 _ => false,
262 })
263 }
264
265 pub fn next_ident(&mut self) -> TokenizerResult<String> {
266 self.next_token_check_map(|token| match token {
267 &Token::Ident(ref ident) => Ok(ident.clone()),
268 _ => Err(TokenizerError::ExpectIdent),
269 })
270 }
271
272 pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> {
273 self.next_token_check_map(|token| match token {
274 &Token::StrLit(ref str_lit) => Ok(str_lit.clone()),
275 _ => Err(TokenizerError::ExpectStrLit),
276 })
277 }
278
279 pub fn next_int_lit(&mut self) -> TokenizerResult<u64> {
280 self.next_token_check_map(|token| match token {
281 &Token::IntLit(v) => Ok(v),
282 _ => Err(TokenizerError::ExpectIntLit),
283 })
284 }
285
286 pub fn next_float_lit(&mut self) -> TokenizerResult<f64> {
287 self.next_token_check_map(|token| match token {
288 &Token::FloatLit(v) => Ok(v),
289 _ => Err(TokenizerError::ExpectFloatLit),
290 })
291 }
292}
293
#[cfg(test)]
mod test {

    use super::*;

    /// Run `what` against a tokenizer over `input`, panicking (with the
    /// failure location) on error, and assert that the whole input was
    /// consumed.
    fn tokenize<P, R>(input: &str, what: P) -> R
    where
        P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>,
    {
        let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto);
        // `unwrap_or_else` so the panic message is only built on failure
        // (clippy::expect_fun_call), and the underlying error is shown too.
        let r = what(&mut tokenizer)
            .unwrap_or_else(|e| panic!("parse failed at {}: {:?}", tokenizer.loc(), e));
        let eof = tokenizer
            .syntax_eof()
            .unwrap_or_else(|e| panic!("check eof failed at {}: {:?}", tokenizer.loc(), e));
        assert!(eof, "{}", tokenizer.loc());
        r
    }

    #[test]
    fn test_ident() {
        let msg = r#" aabb_c "#;
        // `next_ident` already returns an owned `String`; no extra clone needed.
        let mess = tokenize(msg, |p| p.next_ident());
        assert_eq!("aabb_c", mess);
    }

    #[test]
    fn test_str_lit() {
        let msg = r#" "a\nb" "#;
        let mess = tokenize(msg, |p| p.next_str_lit());
        assert_eq!(
            StrLit {
                escaped: r#"a\nb"#.to_owned()
            },
            mess
        );
    }
}