Skip to main content

oak_vampire/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3pub use token_type::VampireTokenType;
4
5use crate::language::VampireLanguage;
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError, TextEdit,
8    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
9    source::Source,
10};
11use std::sync::LazyLock;
12
13type State<'s, S> = LexerState<'s, S, VampireLanguage>;
14
15static VAMPIRE_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
16static VAMPIRE_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "%", block_start: "/*", block_end: "*/", nested_blocks: false });
17static VAMPIRE_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
18
19#[derive(Clone)]
20pub struct VampireLexer<'config> {
21    config: &'config VampireLanguage,
22}
23
24impl<'config> Lexer<VampireLanguage> for VampireLexer<'config> {
25    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<VampireLanguage>) -> LexOutput<VampireLanguage> {
26        let mut state: State<'_, S> = LexerState::new(text);
27        let result = self.run(&mut state);
28        if result.is_ok() {
29            state.add_eof();
30        }
31        state.finish(result)
32    }
33}
34impl<'config> VampireLexer<'config> {
35    pub fn new(config: &'config VampireLanguage) -> Self {
36        Self { config }
37    }
38
39    /// Main lexing loop
40    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
41        while state.not_at_end() {
42            let safe_point = state.get_position();
43
44            if self.skip_whitespace(state) {
45                continue;
46            }
47
48            if self.skip_comment(state) {
49                continue;
50            }
51
52            if self.lex_string_literal(state) {
53                continue;
54            }
55
56            if self.lex_number_literal(state) {
57                continue;
58            }
59
60            if self.lex_identifier_or_keyword(state) {
61                continue;
62            }
63
64            if self.lex_operators(state) {
65                continue;
66            }
67
68            if self.lex_single_char_tokens(state) {
69                continue;
70            }
71
72            state.advance_if_dead_lock(safe_point);
73        }
74
75        Ok(())
76    }
77
78    /// Skip whitespace characters
79    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
80        VAMPIRE_WHITESPACE.scan(state, VampireTokenType::Whitespace)
81    }
82
83    /// Skip comment lines
84    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
85        VAMPIRE_COMMENT.scan(state, VampireTokenType::LineComment, VampireTokenType::BlockComment)
86    }
87
88    /// Lex string literals
89    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
90        VAMPIRE_STRING.scan(state, VampireTokenType::StringLiteral)
91    }
92
93    /// Lex number literals
94    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
95        let start = state.get_position();
96        let mut has_digit = false;
97
98        // Optional minus
99        if state.peek() == Some('-') {
100            state.advance(1);
101        }
102
103        while let Some(ch) = state.peek() {
104            if ch.is_ascii_digit() {
105                state.advance(ch.len_utf8());
106                has_digit = true;
107            }
108            else {
109                break;
110            }
111        }
112
113        if !has_digit {
114            return false;
115        }
116
117        // Check for float (dot or exponent)
118        let mut is_float = false;
119        if state.peek() == Some('.') {
120            if let Some(next) = state.peek_next_n(1) {
121                if next.is_ascii_digit() {
122                    state.advance(1); // skip dot
123                    is_float = true;
124                    while let Some(ch) = state.peek() {
125                        if ch.is_ascii_digit() {
126                            state.advance(ch.len_utf8());
127                        }
128                        else {
129                            break;
130                        }
131                    }
132                }
133            }
134        }
135
136        if let Some(ch) = state.peek() {
137            if ch == 'e' || ch == 'E' {
138                state.advance(1);
139                is_float = true;
140                if let Some(ch) = state.peek() {
141                    if ch == '+' || ch == '-' {
142                        state.advance(1);
143                    }
144                }
145                while let Some(ch) = state.peek() {
146                    if ch.is_ascii_digit() {
147                        state.advance(ch.len_utf8());
148                    }
149                    else {
150                        break;
151                    }
152                }
153            }
154        }
155
156        let kind = if is_float { VampireTokenType::RealLiteral } else { VampireTokenType::IntegerLiteral };
157        state.add_token(kind, start, state.get_position());
158        true
159    }
160
161    /// Lex identifier or keyword
162    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
163        let start = state.get_position();
164        if let Some(ch) = state.peek() {
165            if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
166                state.advance(ch.len_utf8());
167                while let Some(ch) = state.peek() {
168                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' { state.advance(ch.len_utf8()) } else { break }
169                }
170
171                let end = state.get_position();
172                let text = state.get_text_in((start..end).into());
173                let kind = self.keyword_or_identifier(text.as_ref());
174                state.add_token(kind, start, end);
175                true
176            }
177            else {
178                false
179            }
180        }
181        else {
182            false
183        }
184    }
185
186    /// Determine if text is a keyword or identifier
187    fn keyword_or_identifier(&self, text: &str) -> VampireTokenType {
188        match text {
189            // TPTP formula types
190            "fof" => VampireTokenType::FofKw,
191            "cnf" => VampireTokenType::CnfKw,
192            "tff" => VampireTokenType::TffKw,
193            "thf" => VampireTokenType::ThfKw,
194            "tpi" => VampireTokenType::TpiKw,
195            "include" => VampireTokenType::IncludeKw,
196
197            // Formula roles
198            "axiom" => VampireTokenType::AxiomKw,
199            "hypothesis" => VampireTokenType::HypothesisKw,
200            "definition" => VampireTokenType::DefinitionKw,
201            "assumption" => VampireTokenType::AssumptionKw,
202            "lemma" => VampireTokenType::LemmaKw,
203            "theorem" => VampireTokenType::TheoremKw,
204            "conjecture" => VampireTokenType::ConjectureKw,
205            "negated_conjecture" => VampireTokenType::NegatedConjectureKw,
206            "plain" => VampireTokenType::PlainKw,
207            "type" => VampireTokenType::TypeKw,
208            "fi_domain" => VampireTokenType::FiDomainKw,
209            "fi_functors" => VampireTokenType::FiFunctorsKw,
210            "fi_predicates" => VampireTokenType::FiPredicatesKw,
211            "unknown" => VampireTokenType::UnknownKw,
212
213            // Logical operators
214            "!" => VampireTokenType::ForallKw,
215            "?" => VampireTokenType::ExistsKw,
216            "&" => VampireTokenType::AndKw,
217            "|" => VampireTokenType::OrKw,
218            "~" => VampireTokenType::NotKw,
219            "=>" => VampireTokenType::ImpliesKw,
220            "<=>" => VampireTokenType::IffKw,
221            "<~>" => VampireTokenType::XorKw,
222            "~|" => VampireTokenType::NorKw,
223            "~&" => VampireTokenType::NandKw,
224
225            // Types
226            "$o" => VampireTokenType::BoolKw,
227            "$i" => VampireTokenType::IndividualKw,
228            "$int" => VampireTokenType::IntKw,
229            "$real" => VampireTokenType::RealKw,
230            "$rat" => VampireTokenType::RatKw,
231            "$tType" => VampireTokenType::TTypeKw,
232            "$oType" => VampireTokenType::OTypeKw,
233            "$iType" => VampireTokenType::ITypeKw,
234
235            // Boolean literals
236            "$true" => VampireTokenType::BoolLiteral,
237            "$false" => VampireTokenType::BoolLiteral,
238
239            _ => VampireTokenType::Identifier,
240        }
241    }
242
243    /// Lex operators
244    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
245        let start = state.get_position();
246        let rest = state.rest();
247
248        let (kind, len) = if rest.starts_with("==>") {
249            (VampireTokenType::ImpliesKw, 3)
250        }
251        else if rest.starts_with("<=>") {
252            (VampireTokenType::IffKw, 3)
253        }
254        else if rest.starts_with("<~>") {
255            (VampireTokenType::XorKw, 3)
256        }
257        else if rest.starts_with("~|") {
258            (VampireTokenType::NorKw, 2)
259        }
260        else if rest.starts_with("~&") {
261            (VampireTokenType::NandKw, 2)
262        }
263        else if rest.starts_with("==") {
264            (VampireTokenType::DoubleEq, 2)
265        }
266        else if rest.starts_with("!=") {
267            (VampireTokenType::NotEq, 2)
268        }
269        else if rest.starts_with("<=") {
270            (VampireTokenType::LessEq, 2)
271        }
272        else if rest.starts_with(">=") {
273            (VampireTokenType::GreaterEq, 2)
274        }
275        else if rest.starts_with("&&") {
276            (VampireTokenType::AndAnd, 2)
277        }
278        else if rest.starts_with("||") {
279            (VampireTokenType::OrOr, 2)
280        }
281        else if rest.starts_with("++") {
282            (VampireTokenType::PlusPlus, 2)
283        }
284        else if rest.starts_with("--") {
285            (VampireTokenType::MinusMinus, 2)
286        }
287        else if rest.starts_with("+=") {
288            (VampireTokenType::PlusEq, 2)
289        }
290        else if rest.starts_with("-=") {
291            (VampireTokenType::MinusEq, 2)
292        }
293        else if rest.starts_with("*=") {
294            (VampireTokenType::StarEq, 2)
295        }
296        else if rest.starts_with("/=") {
297            (VampireTokenType::SlashEq, 2)
298        }
299        else if rest.starts_with("%=") {
300            (VampireTokenType::PercentEq, 2)
301        }
302        else if rest.starts_with("<<") {
303            (VampireTokenType::LeftShift, 2)
304        }
305        else if rest.starts_with(">>") {
306            (VampireTokenType::RightShift, 2)
307        }
308        else if rest.starts_with("->") {
309            (VampireTokenType::Arrow, 2)
310        }
311        else {
312            return false;
313        };
314
315        state.advance(len);
316        state.add_token(kind, start, state.get_position());
317        true
318    }
319
320    /// Lex single character tokens
321    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
322        if let Some(ch) = state.peek() {
323            let start = state.get_position();
324            let kind = match ch {
325                '(' => Some(VampireTokenType::LeftParen),
326                ')' => Some(VampireTokenType::RightParen),
327                '[' => Some(VampireTokenType::LeftBracket),
328                ']' => Some(VampireTokenType::RightBracket),
329                '{' => Some(VampireTokenType::LeftBrace),
330                '}' => Some(VampireTokenType::RightBrace),
331                ':' => Some(VampireTokenType::Colon),
332                ';' => Some(VampireTokenType::Semicolon),
333                '.' => Some(VampireTokenType::Dot),
334                ',' => Some(VampireTokenType::Comma),
335                '?' => Some(VampireTokenType::Question),
336                '!' => Some(VampireTokenType::Bang),
337                '@' => Some(VampireTokenType::At),
338                '#' => Some(VampireTokenType::Hash),
339                '$' => Some(VampireTokenType::Dollar),
340                '%' => Some(VampireTokenType::Percent),
341                '^' => Some(VampireTokenType::Caret),
342                '&' => Some(VampireTokenType::Ampersand),
343                '*' => Some(VampireTokenType::Star),
344                '+' => Some(VampireTokenType::Plus),
345                '-' => Some(VampireTokenType::Minus),
346                '=' => Some(VampireTokenType::Eq),
347                '<' => Some(VampireTokenType::LessThan),
348                '>' => Some(VampireTokenType::GreaterThan),
349                '/' => Some(VampireTokenType::Slash),
350                '\\' => Some(VampireTokenType::Backslash),
351                '|' => Some(VampireTokenType::Pipe),
352                '~' => Some(VampireTokenType::Tilde),
353                _ => None,
354            };
355
356            if let Some(token_kind) = kind {
357                state.advance(ch.len_utf8());
358                state.add_token(token_kind, start, state.get_position());
359                true
360            }
361            else {
362                false
363            }
364        }
365        else {
366            false
367        }
368    }
369}