oak_vampire/lexer/
mod.rs

1use crate::{kind::VampireSyntaxKind, language::VampireLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError, TextEdit,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'s, S> = LexerState<'s, S, VampireLanguage>;
10
11static VAMPIRE_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static VAMPIRE_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "%", block_start: "/*", block_end: "*/", nested_blocks: false });
13static VAMPIRE_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone)]
16pub struct VampireLexer<'config> {
17    _config: &'config VampireLanguage,
18}
19
20impl<'config> Lexer<VampireLanguage> for VampireLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<VampireLanguage>) -> LexOutput<VampireLanguage> {
22        let mut state: State<'_, S> = LexerState::new(text);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof();
26        }
27        state.finish(result)
28    }
29}
30
31impl<'config> VampireLexer<'config> {
32    pub fn new(config: &'config VampireLanguage) -> Self {
33        Self { _config: config }
34    }
35
36    /// Main lexing loop
37    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
38        while state.not_at_end() {
39            let safe_point = state.get_position();
40
41            if self.skip_whitespace(state) {
42                continue;
43            }
44
45            if self.skip_comment(state) {
46                continue;
47            }
48
49            if self.lex_string_literal(state) {
50                continue;
51            }
52
53            if self.lex_number_literal(state) {
54                continue;
55            }
56
57            if self.lex_identifier_or_keyword(state) {
58                continue;
59            }
60
61            if self.lex_operators(state) {
62                continue;
63            }
64
65            if self.lex_single_char_tokens(state) {
66                continue;
67            }
68
69            state.advance_if_dead_lock(safe_point);
70        }
71
72        Ok(())
73    }
74
75    /// Skip whitespace characters
76    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
77        VAMPIRE_WHITESPACE.scan(state, VampireSyntaxKind::Whitespace)
78    }
79
80    /// Skip comment lines
81    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
82        VAMPIRE_COMMENT.scan(state, VampireSyntaxKind::LineComment, VampireSyntaxKind::BlockComment)
83    }
84
85    /// Lex string literals
86    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
87        VAMPIRE_STRING.scan(state, VampireSyntaxKind::StringLiteral)
88    }
89
90    /// Lex number literals
91    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
92        let start = state.get_position();
93        let mut has_digit = false;
94
95        // Optional minus
96        if state.peek() == Some('-') {
97            state.advance(1);
98        }
99
100        while let Some(ch) = state.peek() {
101            if ch.is_ascii_digit() {
102                state.advance(ch.len_utf8());
103                has_digit = true;
104            }
105            else {
106                break;
107            }
108        }
109
110        if !has_digit {
111            return false;
112        }
113
114        // Check for float (dot or exponent)
115        let mut is_float = false;
116        if state.peek() == Some('.') {
117            if let Some(next) = state.peek_next_n(1) {
118                if next.is_ascii_digit() {
119                    state.advance(1); // skip dot
120                    is_float = true;
121                    while let Some(ch) = state.peek() {
122                        if ch.is_ascii_digit() {
123                            state.advance(ch.len_utf8());
124                        }
125                        else {
126                            break;
127                        }
128                    }
129                }
130            }
131        }
132
133        if let Some(ch) = state.peek() {
134            if ch == 'e' || ch == 'E' {
135                state.advance(1);
136                is_float = true;
137                if let Some(ch) = state.peek() {
138                    if ch == '+' || ch == '-' {
139                        state.advance(1);
140                    }
141                }
142                while let Some(ch) = state.peek() {
143                    if ch.is_ascii_digit() {
144                        state.advance(ch.len_utf8());
145                    }
146                    else {
147                        break;
148                    }
149                }
150            }
151        }
152
153        let kind = if is_float { VampireSyntaxKind::RealLiteral } else { VampireSyntaxKind::IntegerLiteral };
154        state.add_token(kind, start, state.get_position());
155        true
156    }
157
158    /// Lex identifier or keyword
159    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
160        let start = state.get_position();
161        if let Some(ch) = state.peek() {
162            if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
163                state.advance(ch.len_utf8());
164                while let Some(ch) = state.peek() {
165                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
166                        state.advance(ch.len_utf8());
167                    }
168                    else {
169                        break;
170                    }
171                }
172
173                let end = state.get_position();
174                let text = state.get_text_in((start..end).into());
175                let kind = self.keyword_or_identifier(text.as_ref());
176                state.add_token(kind, start, end);
177                true
178            }
179            else {
180                false
181            }
182        }
183        else {
184            false
185        }
186    }
187
188    /// Determine if text is a keyword or identifier
189    fn keyword_or_identifier(&self, text: &str) -> VampireSyntaxKind {
190        match text {
191            // TPTP formula types
192            "fof" => VampireSyntaxKind::FofKw,
193            "cnf" => VampireSyntaxKind::CnfKw,
194            "tff" => VampireSyntaxKind::TffKw,
195            "thf" => VampireSyntaxKind::ThfKw,
196            "tpi" => VampireSyntaxKind::TpiKw,
197            "include" => VampireSyntaxKind::IncludeKw,
198
199            // Formula roles
200            "axiom" => VampireSyntaxKind::AxiomKw,
201            "hypothesis" => VampireSyntaxKind::HypothesisKw,
202            "definition" => VampireSyntaxKind::DefinitionKw,
203            "assumption" => VampireSyntaxKind::AssumptionKw,
204            "lemma" => VampireSyntaxKind::LemmaKw,
205            "theorem" => VampireSyntaxKind::TheoremKw,
206            "conjecture" => VampireSyntaxKind::ConjectureKw,
207            "negated_conjecture" => VampireSyntaxKind::NegatedConjectureKw,
208            "plain" => VampireSyntaxKind::PlainKw,
209            "type" => VampireSyntaxKind::TypeKw,
210            "fi_domain" => VampireSyntaxKind::FiDomainKw,
211            "fi_functors" => VampireSyntaxKind::FiFunctorsKw,
212            "fi_predicates" => VampireSyntaxKind::FiPredicatesKw,
213            "unknown" => VampireSyntaxKind::UnknownKw,
214
215            // Logical operators
216            "!" => VampireSyntaxKind::ForallKw,
217            "?" => VampireSyntaxKind::ExistsKw,
218            "&" => VampireSyntaxKind::AndKw,
219            "|" => VampireSyntaxKind::OrKw,
220            "~" => VampireSyntaxKind::NotKw,
221            "=>" => VampireSyntaxKind::ImpliesKw,
222            "<=>" => VampireSyntaxKind::IffKw,
223            "<~>" => VampireSyntaxKind::XorKw,
224            "~|" => VampireSyntaxKind::NorKw,
225            "~&" => VampireSyntaxKind::NandKw,
226
227            // Types
228            "$o" => VampireSyntaxKind::BoolKw,
229            "$i" => VampireSyntaxKind::IndividualKw,
230            "$int" => VampireSyntaxKind::IntKw,
231            "$real" => VampireSyntaxKind::RealKw,
232            "$rat" => VampireSyntaxKind::RatKw,
233            "$tType" => VampireSyntaxKind::TTypeKw,
234            "$oType" => VampireSyntaxKind::OTypeKw,
235            "$iType" => VampireSyntaxKind::ITypeKw,
236
237            // Boolean literals
238            "$true" => VampireSyntaxKind::BoolLiteral,
239            "$false" => VampireSyntaxKind::BoolLiteral,
240
241            _ => VampireSyntaxKind::Identifier,
242        }
243    }
244
245    /// Lex operators
246    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
247        let start = state.get_position();
248        let rest = state.rest();
249
250        let (kind, len) = if rest.starts_with("==>") {
251            (VampireSyntaxKind::ImpliesKw, 3)
252        }
253        else if rest.starts_with("<=>") {
254            (VampireSyntaxKind::IffKw, 3)
255        }
256        else if rest.starts_with("<~>") {
257            (VampireSyntaxKind::XorKw, 3)
258        }
259        else if rest.starts_with("~|") {
260            (VampireSyntaxKind::NorKw, 2)
261        }
262        else if rest.starts_with("~&") {
263            (VampireSyntaxKind::NandKw, 2)
264        }
265        else if rest.starts_with("==") {
266            (VampireSyntaxKind::EqEq, 2)
267        }
268        else if rest.starts_with("!=") {
269            (VampireSyntaxKind::NotEq, 2)
270        }
271        else if rest.starts_with("<=") {
272            (VampireSyntaxKind::LessEq, 2)
273        }
274        else if rest.starts_with(">=") {
275            (VampireSyntaxKind::GreaterEq, 2)
276        }
277        else if rest.starts_with("&&") {
278            (VampireSyntaxKind::AndAnd, 2)
279        }
280        else if rest.starts_with("||") {
281            (VampireSyntaxKind::OrOr, 2)
282        }
283        else if rest.starts_with("++") {
284            (VampireSyntaxKind::PlusPlus, 2)
285        }
286        else if rest.starts_with("--") {
287            (VampireSyntaxKind::MinusMinus, 2)
288        }
289        else if rest.starts_with("+=") {
290            (VampireSyntaxKind::PlusEq, 2)
291        }
292        else if rest.starts_with("-=") {
293            (VampireSyntaxKind::MinusEq, 2)
294        }
295        else if rest.starts_with("*=") {
296            (VampireSyntaxKind::StarEq, 2)
297        }
298        else if rest.starts_with("/=") {
299            (VampireSyntaxKind::SlashEq, 2)
300        }
301        else if rest.starts_with("%=") {
302            (VampireSyntaxKind::PercentEq, 2)
303        }
304        else if rest.starts_with("<<") {
305            (VampireSyntaxKind::LeftShift, 2)
306        }
307        else if rest.starts_with(">>") {
308            (VampireSyntaxKind::RightShift, 2)
309        }
310        else if rest.starts_with("->") {
311            (VampireSyntaxKind::Arrow, 2)
312        }
313        else {
314            return false;
315        };
316
317        state.advance(len);
318        state.add_token(kind, start, state.get_position());
319        true
320    }
321
322    /// Lex single character tokens
323    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
324        if let Some(ch) = state.peek() {
325            let start = state.get_position();
326            let kind = match ch {
327                '(' => Some(VampireSyntaxKind::LeftParen),
328                ')' => Some(VampireSyntaxKind::RightParen),
329                '[' => Some(VampireSyntaxKind::LeftBracket),
330                ']' => Some(VampireSyntaxKind::RightBracket),
331                '{' => Some(VampireSyntaxKind::LeftBrace),
332                '}' => Some(VampireSyntaxKind::RightBrace),
333                ':' => Some(VampireSyntaxKind::Colon),
334                ';' => Some(VampireSyntaxKind::Semicolon),
335                '.' => Some(VampireSyntaxKind::Dot),
336                ',' => Some(VampireSyntaxKind::Comma),
337                '?' => Some(VampireSyntaxKind::Question),
338                '!' => Some(VampireSyntaxKind::Bang),
339                '@' => Some(VampireSyntaxKind::At),
340                '#' => Some(VampireSyntaxKind::Hash),
341                '$' => Some(VampireSyntaxKind::Dollar),
342                '%' => Some(VampireSyntaxKind::Percent),
343                '^' => Some(VampireSyntaxKind::Caret),
344                '&' => Some(VampireSyntaxKind::Ampersand),
345                '*' => Some(VampireSyntaxKind::Star),
346                '+' => Some(VampireSyntaxKind::Plus),
347                '-' => Some(VampireSyntaxKind::Minus),
348                '=' => Some(VampireSyntaxKind::Eq),
349                '<' => Some(VampireSyntaxKind::LessThan),
350                '>' => Some(VampireSyntaxKind::GreaterThan),
351                '/' => Some(VampireSyntaxKind::Slash),
352                '\\' => Some(VampireSyntaxKind::Backslash),
353                '|' => Some(VampireSyntaxKind::Pipe),
354                '~' => Some(VampireSyntaxKind::Tilde),
355                _ => None,
356            };
357
358            if let Some(token_kind) = kind {
359                state.advance(ch.len_utf8());
360                state.add_token(token_kind, start, state.get_position());
361                true
362            }
363            else {
364                false
365            }
366        }
367        else {
368            false
369        }
370    }
371}