smt2parser/
lexer.rs

1// Copyright (c) Facebook, Inc. and its affiliates
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Lexing primitives.
5
6use crate::{parser::Token, Decimal, Numeral};
7use num::Num;
8
9/// Record a position in the input stream.
10#[derive(Debug, Clone, Default, Eq, PartialEq)]
11pub struct Position {
12    pub path: Option<String>,
13    pub line: usize,
14    pub column: usize,
15}
16
17impl Position {
18    pub fn new(path: Option<String>, line: usize, column: usize) -> Self {
19        Self { path, line, column }
20    }
21}
22
23impl std::fmt::Display for Position {
24    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
25        match &self.path {
26            Some(path) => write!(f, "{}:{}:{}", path, self.line, self.column),
27            None => write!(f, "{}:{}", self.line, self.column),
28        }
29    }
30}
31
32pub(crate) struct Lexer<R> {
33    reader: R,
34    reserved_words: Vec<Token>,
35    reserved_words_map: fst::Map<Vec<u8>>,
36    current_offset: usize,
37    current_line: usize,
38    current_column: usize,
39}
40
41const KEYWORDS: &[(&str, Token)] = {
42    use Token::*;
43    &[
44        ("_", Underscore),
45        ("!", Exclam),
46        ("as", As),
47        ("let", Let),
48        ("exists", Exists),
49        ("forall", Forall),
50        ("match", Match),
51        ("par", Par),
52        ("assert", Assert),
53        ("check-sat", CheckSat),
54        ("check-sat-assuming", CheckSatAssuming),
55        ("declare-const", DeclareConst),
56        ("declare-datatype", DeclareDatatype),
57        ("declare-datatypes", DeclareDatatypes),
58        ("declare-fun", DeclareFun),
59        ("declare-sort", DeclareSort),
60        ("define-fun", DefineFun),
61        ("define-fun-rec", DefineFunRec),
62        ("define-funs-rec", DefineFunsRec),
63        ("define-sort", DefineSort),
64        ("echo", Echo),
65        ("exit", Exit),
66        ("get-assertions", GetAssertions),
67        ("get-assignment", GetAssignment),
68        ("get-info", GetInfo),
69        ("get-model", GetModel),
70        ("get-option", GetOption),
71        ("get-proof", GetProof),
72        ("get-unsat-assumptions", GetUnsatAssumptions),
73        ("get-unsat-core", GetUnsatCore),
74        ("get-value", GetValue),
75        ("pop", Pop),
76        ("push", Push),
77        ("reset", Reset),
78        ("reset-assertions", ResetAssertions),
79        ("set-info", SetInfo),
80        ("set-logic", SetLogic),
81        ("set-option", SetOption),
82    ]
83};
84
85impl<R> Lexer<R>
86where
87    R: std::io::BufRead,
88{
89    pub(crate) fn new(reader: R) -> Self {
90        let (reserved_words, reserved_words_map) = Self::make_reserved_words();
91        Self {
92            reader,
93            reserved_words,
94            reserved_words_map,
95            current_offset: 0,
96            current_line: 0,
97            current_column: 0,
98        }
99    }
100
101    #[inline]
102    fn lookup_symbol(&self, s: &[u8]) -> Option<&Token> {
103        self.reserved_words_map
104            .get(s)
105            .map(|index| &self.reserved_words[index as usize])
106    }
107
108    fn make_reserved_words() -> (Vec<Token>, fst::Map<Vec<u8>>) {
109        let mut keywords = KEYWORDS.to_vec();
110        keywords.sort_by_key(|(key, _)| key.to_string());
111        let mut words = Vec::new();
112        for (_, token) in &keywords {
113            words.push(token.clone());
114        }
115        let map = fst::Map::from_iter(
116            keywords
117                .iter()
118                .enumerate()
119                .map(|(index, (key, _))| (key, index as u64)),
120        )
121        .expect("Failed to create reserved token map");
122        (words, map)
123    }
124
125    #[cfg(test)]
126    pub(crate) fn current_offset(&self) -> usize {
127        self.current_offset
128    }
129
130    #[cfg(test)]
131    pub(crate) fn current_column(&self) -> usize {
132        self.current_column
133    }
134
135    #[cfg(test)]
136    pub(crate) fn current_line(&self) -> usize {
137        self.current_line
138    }
139
140    #[inline]
141    pub(crate) fn update_position(&self, pos: &mut Position) {
142        pos.line = self.current_line + 1;
143        pos.column = self.current_column + 1;
144    }
145
146    fn consume_byte(&mut self) {
147        if let Some(c) = self.peek_byte() {
148            if *c == b'\n' {
149                self.current_line += 1;
150                self.current_column = 0;
151            } else {
152                self.current_column += 1;
153            }
154            self.current_offset += 1;
155            self.reader.consume(1)
156        }
157    }
158
159    fn read_byte(&mut self) -> Option<u8> {
160        let c = self.peek_byte().cloned();
161        self.consume_byte();
162        c
163    }
164
165    #[inline]
166    fn peek_bytes(&mut self) -> &[u8] {
167        self.reader
168            .fill_buf()
169            .expect("Error while reading input stream")
170    }
171
172    fn peek_byte(&mut self) -> Option<&u8> {
173        self.peek_bytes().get(0)
174    }
175
176    fn skip_whitespace(&mut self) -> bool {
177        match self.peek_byte() {
178            Some(b) if matches!(b, b' ' | b'\n' | b'\t' | b'\r') => {
179                self.consume_byte();
180                true
181            }
182            _ => false,
183        }
184    }
185
186    fn skip_comment(&mut self) -> bool {
187        match self.peek_byte() {
188            Some(c) if *c == b';' => {
189                self.consume_byte();
190                while let Some(c) = self.read_byte() {
191                    if c == b'\n' {
192                        break;
193                    }
194                }
195                true
196            }
197            _ => false,
198        }
199    }
200}
201
202impl<R> Iterator for Lexer<R>
203where
204    R: std::io::BufRead,
205{
206    type Item = Token;
207
208    fn next(&mut self) -> Option<Token> {
209        while self.skip_whitespace() || self.skip_comment() {}
210        match self.peek_byte() {
211            // Parentheses
212            Some(b'(') => {
213                self.consume_byte();
214                Some(Token::LeftParen)
215            }
216            Some(b')') => {
217                self.consume_byte();
218                Some(Token::RightParen)
219            }
220            // Quoted symbols
221            Some(b'|') => {
222                self.consume_byte();
223                let mut content = Vec::new();
224                while let Some(c) = self.read_byte() {
225                    if c == b'|' {
226                        return match String::from_utf8(content) {
227                            Ok(s) => Some(Token::Symbol(s)),
228                            Err(_) => None,
229                        };
230                    }
231                    content.push(c);
232                }
233                // Do not accept EOI as a terminator.
234                None
235            }
236            // String literals
237            Some(b'"') => {
238                self.consume_byte();
239                let mut content = Vec::new();
240                while let Some(c) = self.read_byte() {
241                    if c == b'"' {
242                        if let Some(d) = self.peek_byte() {
243                            if *d == b'"' {
244                                self.consume_byte();
245                                content.push(b'"');
246                                continue;
247                            }
248                        }
249                        return match String::from_utf8(content) {
250                            Ok(s) => Some(Token::String(s)),
251                            Err(_) => None,
252                        };
253                    }
254                    content.push(c);
255                }
256                // Do not accept EOI as a terminator.
257                None
258            }
259            // Binary and Hexadecimal literals
260            Some(b'#') => {
261                self.consume_byte();
262                match self.peek_byte() {
263                    Some(b'b') => {
264                        self.consume_byte();
265                        let mut content = Vec::new();
266                        while let Some(c) = self.peek_byte() {
267                            match c {
268                                b'0' => content.push(false),
269                                b'1' => content.push(true),
270                                _ => break,
271                            }
272                            self.consume_byte();
273                        }
274                        if content.is_empty() {
275                            None
276                        } else {
277                            Some(Token::Binary(content))
278                        }
279                    }
280                    Some(b'x') => {
281                        self.consume_byte();
282                        let mut content = Vec::new();
283                        while let Some(c) = self.peek_byte() {
284                            match c {
285                                b'0'..=b'9' => content.push(c - b'0'),
286                                b'a'..=b'f' => content.push(c - b'a' + 10),
287                                b'A'..=b'F' => content.push(c - b'A' + 10),
288                                _ => break,
289                            }
290                            self.consume_byte();
291                        }
292                        if content.is_empty() {
293                            None
294                        } else {
295                            Some(Token::Hexadecimal(content))
296                        }
297                    }
298                    _ => None,
299                }
300            }
301            // Number literals
302            Some(digit @ b'0'..=b'9') => {
303                let mut numerator = vec![*digit];
304                self.consume_byte();
305                while let Some(c) = self.peek_byte() {
306                    if !is_digit_byte(*c) {
307                        break;
308                    }
309                    numerator.push(*c);
310                    self.consume_byte();
311                }
312                if numerator.len() > 1 && numerator.starts_with(b"0") {
313                    return None;
314                }
315                let numerator = String::from_utf8(numerator).unwrap();
316                match self.peek_byte() {
317                    Some(b'.') => {
318                        self.consume_byte();
319                        let mut denumerator = Vec::new();
320                        while let Some(c) = self.peek_byte() {
321                            if !is_digit_byte(*c) {
322                                break;
323                            }
324                            denumerator.push(*c);
325                            self.consume_byte();
326                        }
327                        if denumerator.is_empty() {
328                            return None;
329                        }
330                        let denumerator = String::from_utf8(denumerator).unwrap();
331                        let num =
332                            num::BigInt::from_str_radix(&(numerator + &denumerator), 10).ok()?;
333                        let denom = num::BigInt::from(10u32).pow(denumerator.len() as u32);
334                        Some(Token::Decimal(Decimal::new(num, denom)))
335                    }
336                    _ => Some(Token::Numeral(
337                        Numeral::from_str_radix(&numerator, 10).ok()?,
338                    )),
339                }
340            }
341            // Keywords
342            Some(b':') => {
343                self.consume_byte();
344                let mut content = Vec::new();
345                while let Some(c) = self.peek_byte() {
346                    if !is_symbol_byte(*c) {
347                        break;
348                    }
349                    content.push(*c);
350                    self.consume_byte();
351                }
352                match String::from_utf8(content) {
353                    Ok(s) => Some(Token::Keyword(s)),
354                    Err(_) => None,
355                }
356            }
357            // Symbols (including `_` and `!`)
358            Some(c) if is_non_digit_symbol_byte(*c) => {
359                let mut content = vec![*c];
360                self.consume_byte();
361                while let Some(c) = self.peek_byte() {
362                    if !is_symbol_byte(*c) {
363                        break;
364                    }
365                    content.push(*c);
366                    self.consume_byte();
367                }
368                match self.lookup_symbol(&content) {
369                    Some(token) => Some(token.clone()),
370                    None => match String::from_utf8(content) {
371                        Ok(s) => Some(Token::Symbol(s)),
372                        Err(_) => None,
373                    },
374                }
375            }
376            // EOI or Error
377            _ => None,
378        }
379    }
380}
381
382fn is_digit_byte(c: u8) -> bool {
383    matches!(c, b'0'..=b'9')
384}
385
386pub(crate) fn is_symbol_byte(c: u8) -> bool {
387    is_digit_byte(c) || is_non_digit_symbol_byte(c)
388}
389
390pub(crate) fn is_non_digit_symbol_byte(c: u8) -> bool {
391    matches!(c,
392        b'a'..=b'z'
393        | b'A'..=b'Z'
394        | b'~'
395        | b'!'
396        | b'@'
397        | b'$'
398        | b'%'
399        | b'^'
400        | b'&'
401        | b'*'
402        | b'_'
403        | b'-'
404        | b'+'
405        | b'='
406        | b'<'
407        | b'>'
408        | b'.'
409        | b'?'
410        | b'/')
411}
412
413#[test]
414fn test_spaces() {
415    let input = b"xx  \n asd ";
416    let mut lexer = Lexer::new(&input[..]);
417    let tokens: Vec<_> = (&mut lexer).collect();
418    assert_eq!(
419        tokens,
420        vec![Token::Symbol("xx".into()), Token::Symbol("asd".into())]
421    );
422    assert_eq!(lexer.current_line(), 1);
423    assert_eq!(lexer.current_column(), 5);
424    assert_eq!(lexer.current_offset(), input.len());
425}
426
427#[test]
428fn test_error() {
429    let input = b"xx  \\";
430    let mut lexer = Lexer::new(&input[..]);
431    assert_eq!(
432        (&mut lexer).collect::<Vec<_>>(),
433        vec![Token::Symbol("xx".into())]
434    );
435    assert_eq!(lexer.current_line(), 0);
436    assert_eq!(lexer.current_column(), input.len() - 1);
437    assert_eq!(lexer.current_offset(), input.len() - 1);
438}
439
440#[test]
441fn test_literals() {
442    let lexer = Lexer::new(&b"135"[..]);
443    assert_eq!(
444        lexer.collect::<Vec<_>>(),
445        vec![Token::Numeral(Numeral::from(135u32))]
446    );
447
448    let lexer = Lexer::new(&b"0"[..]);
449    assert_eq!(
450        lexer.collect::<Vec<_>>(),
451        vec![Token::Numeral(Numeral::from(0u32))]
452    );
453
454    let lexer = Lexer::new(&b"(0 59)"[..]);
455    assert_eq!(
456        lexer.collect::<Vec<_>>(),
457        vec![
458            Token::LeftParen,
459            Token::Numeral(Numeral::from(0u32)),
460            Token::Numeral(Numeral::from(59u32)),
461            Token::RightParen
462        ]
463    );
464
465    let lexer = Lexer::new(&b"135"[..]);
466    assert_eq!(
467        lexer.collect::<Vec<_>>(),
468        vec![Token::Numeral(Numeral::from(135u32))]
469    );
470
471    let lexer = Lexer::new(&b"135.2"[..]);
472    assert_eq!(
473        lexer.collect::<Vec<_>>(),
474        vec![Token::Decimal(Decimal::from((
475            1352u32.into(),
476            10u32.into()
477        )))]
478    );
479
480    let mut lexer = Lexer::new(&b"0135"[..]);
481    assert!(lexer.next().is_none());
482
483    let mut lexer = Lexer::new(&b"135."[..]);
484    assert!(lexer.next().is_none());
485
486    let lexer = Lexer::new(&b"#b101"[..]);
487    assert_eq!(
488        lexer.collect::<Vec<_>>(),
489        vec![Token::Binary(vec![true, false, true])]
490    );
491
492    let lexer = Lexer::new(&b"#x1Ae"[..]);
493    assert_eq!(
494        lexer.collect::<Vec<_>>(),
495        vec![Token::Hexadecimal(vec![1, 10, 14])]
496    );
497
498    let lexer = Lexer::new(&br#""acv""12""#[..]);
499    assert_eq!(
500        lexer.collect::<Vec<_>>(),
501        vec![Token::String("acv\"12".into())]
502    );
503
504    let lexer = Lexer::new(&br#""acv12""#[..]);
505    assert_eq!(
506        lexer.collect::<Vec<_>>(),
507        vec![Token::String("acv12".into())]
508    );
509}
510
511#[test]
512fn test_symbol() {
513    let lexer = Lexer::new(&b"A$@135"[..]);
514    assert_eq!(
515        lexer.collect::<Vec<_>>(),
516        vec![Token::Symbol("A$@135".into())]
517    );
518
519    let lexer = Lexer::new(&b"|135|"[..]);
520    assert_eq!(lexer.collect::<Vec<_>>(), vec![Token::Symbol("135".into())]);
521}
522
523#[test]
524fn test_keyword() {
525    let lexer = Lexer::new(&b":A$@135"[..]);
526    assert_eq!(
527        lexer.collect::<Vec<_>>(),
528        vec![Token::Keyword("A$@135".into())]
529    );
530}
531
532#[test]
533fn test_token_size() {
534    assert_eq!(std::mem::size_of::<Token>(), 72);
535    assert_eq!(std::mem::size_of::<Box<Token>>(), 8);
536}