oak_vampire/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token type definitions for the Vampire lexer.
3pub mod token_type;
4pub use token_type::VampireTokenType;
5
6use crate::language::VampireLanguage;
7use oak_core::{
8    Lexer, LexerCache, LexerState, OakError, TextEdit,
9    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
10    source::Source,
11};
12use std::sync::LazyLock;
13
14type State<'s, S> = LexerState<'s, S, VampireLanguage>;
15
16static VAMPIRE_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
17static VAMPIRE_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "%", block_start: "/*", block_end: "*/", nested_blocks: false });
18static VAMPIRE_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
19
20/// Lexer for the Vampire language.
21#[derive(Clone)]
22pub struct VampireLexer<'config> {
23    config: &'config VampireLanguage,
24}
25
26impl<'config> Lexer<VampireLanguage> for VampireLexer<'config> {
27    /// Lexes the input text and produces a token stream for the Vampire language.
28    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<VampireLanguage>) -> LexOutput<VampireLanguage> {
29        let mut state: State<'_, S> = LexerState::new(text);
30        let result = self.run(&mut state);
31        if result.is_ok() {
32            state.add_eof();
33        }
34        state.finish(result)
35    }
36}
37impl<'config> VampireLexer<'config> {
38    /// Creates a new VampireLexer with the given language configuration.
39    pub fn new(config: &'config VampireLanguage) -> Self {
40        Self { config }
41    }
42
43    /// Main lexing loop
44    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
45        while state.not_at_end() {
46            let safe_point = state.get_position();
47
48            if self.skip_whitespace(state) {
49                continue;
50            }
51
52            if self.skip_comment(state) {
53                continue;
54            }
55
56            if self.lex_string_literal(state) {
57                continue;
58            }
59
60            if self.lex_number_literal(state) {
61                continue;
62            }
63
64            if self.lex_identifier_or_keyword(state) {
65                continue;
66            }
67
68            if self.lex_operators(state) {
69                continue;
70            }
71
72            if self.lex_single_char_tokens(state) {
73                continue;
74            }
75
76            state.advance_if_dead_lock(safe_point);
77        }
78
79        Ok(())
80    }
81
82    /// Skip whitespace characters
83    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
84        VAMPIRE_WHITESPACE.scan(state, VampireTokenType::Whitespace)
85    }
86
87    /// Skip comment lines
88    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89        VAMPIRE_COMMENT.scan(state, VampireTokenType::LineComment, VampireTokenType::BlockComment)
90    }
91
92    /// Lex string literals
93    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
94        VAMPIRE_STRING.scan(state, VampireTokenType::StringLiteral)
95    }
96
97    /// Lex number literals
98    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
99        let start = state.get_position();
100        let mut has_digit = false;
101
102        // Optional minus
103        if state.peek() == Some('-') {
104            state.advance(1);
105        }
106
107        while let Some(ch) = state.peek() {
108            if ch.is_ascii_digit() {
109                state.advance(ch.len_utf8());
110                has_digit = true;
111            }
112            else {
113                break;
114            }
115        }
116
117        if !has_digit {
118            return false;
119        }
120
121        // Check for float (dot or exponent)
122        let mut is_float = false;
123        if state.peek() == Some('.') {
124            if let Some(next) = state.peek_next_n(1) {
125                if next.is_ascii_digit() {
126                    state.advance(1); // skip dot
127                    is_float = true;
128                    while let Some(ch) = state.peek() {
129                        if ch.is_ascii_digit() {
130                            state.advance(ch.len_utf8());
131                        }
132                        else {
133                            break;
134                        }
135                    }
136                }
137            }
138        }
139
140        if let Some(ch) = state.peek() {
141            if ch == 'e' || ch == 'E' {
142                state.advance(1);
143                is_float = true;
144                if let Some(ch) = state.peek() {
145                    if ch == '+' || ch == '-' {
146                        state.advance(1);
147                    }
148                }
149                while let Some(ch) = state.peek() {
150                    if ch.is_ascii_digit() {
151                        state.advance(ch.len_utf8());
152                    }
153                    else {
154                        break;
155                    }
156                }
157            }
158        }
159
160        let kind = if is_float { VampireTokenType::RealLiteral } else { VampireTokenType::IntegerLiteral };
161        state.add_token(kind, start, state.get_position());
162        true
163    }
164
165    /// Lex identifier or keyword
166    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
167        let start = state.get_position();
168        if let Some(ch) = state.peek() {
169            if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
170                state.advance(ch.len_utf8());
171                while let Some(ch) = state.peek() {
172                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' { state.advance(ch.len_utf8()) } else { break }
173                }
174
175                let end = state.get_position();
176                let text = state.get_text_in((start..end).into());
177                let kind = self.keyword_or_identifier(text.as_ref());
178                state.add_token(kind, start, end);
179                true
180            }
181            else {
182                false
183            }
184        }
185        else {
186            false
187        }
188    }
189
190    /// Determine if text is a keyword or identifier
191    fn keyword_or_identifier(&self, text: &str) -> VampireTokenType {
192        match text {
193            // TPTP formula types
194            "fof" => VampireTokenType::FofKw,
195            "cnf" => VampireTokenType::CnfKw,
196            "tff" => VampireTokenType::TffKw,
197            "thf" => VampireTokenType::ThfKw,
198            "tpi" => VampireTokenType::TpiKw,
199            "include" => VampireTokenType::IncludeKw,
200
201            // Formula roles
202            "axiom" => VampireTokenType::AxiomKw,
203            "hypothesis" => VampireTokenType::HypothesisKw,
204            "definition" => VampireTokenType::DefinitionKw,
205            "assumption" => VampireTokenType::AssumptionKw,
206            "lemma" => VampireTokenType::LemmaKw,
207            "theorem" => VampireTokenType::TheoremKw,
208            "conjecture" => VampireTokenType::ConjectureKw,
209            "negated_conjecture" => VampireTokenType::NegatedConjectureKw,
210            "plain" => VampireTokenType::PlainKw,
211            "type" => VampireTokenType::TypeKw,
212            "fi_domain" => VampireTokenType::FiDomainKw,
213            "fi_functors" => VampireTokenType::FiFunctorsKw,
214            "fi_predicates" => VampireTokenType::FiPredicatesKw,
215            "unknown" => VampireTokenType::UnknownKw,
216
217            // Logical operators
218            "!" => VampireTokenType::ForallKw,
219            "?" => VampireTokenType::ExistsKw,
220            "&" => VampireTokenType::AndKw,
221            "|" => VampireTokenType::OrKw,
222            "~" => VampireTokenType::NotKw,
223            "=>" => VampireTokenType::ImpliesKw,
224            "<=>" => VampireTokenType::IffKw,
225            "<~>" => VampireTokenType::XorKw,
226            "~|" => VampireTokenType::NorKw,
227            "~&" => VampireTokenType::NandKw,
228
229            // Types
230            "$o" => VampireTokenType::BoolKw,
231            "$i" => VampireTokenType::IndividualKw,
232            "$int" => VampireTokenType::IntKw,
233            "$real" => VampireTokenType::RealKw,
234            "$rat" => VampireTokenType::RatKw,
235            "$tType" => VampireTokenType::TTypeKw,
236            "$oType" => VampireTokenType::OTypeKw,
237            "$iType" => VampireTokenType::ITypeKw,
238
239            // Boolean literals
240            "$true" => VampireTokenType::BoolLiteral,
241            "$false" => VampireTokenType::BoolLiteral,
242
243            _ => VampireTokenType::Identifier,
244        }
245    }
246
247    /// Lex operators
248    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
249        let start = state.get_position();
250        let rest = state.rest();
251
252        let (kind, len) = if rest.starts_with("==>") {
253            (VampireTokenType::ImpliesKw, 3)
254        }
255        else if rest.starts_with("<=>") {
256            (VampireTokenType::IffKw, 3)
257        }
258        else if rest.starts_with("<~>") {
259            (VampireTokenType::XorKw, 3)
260        }
261        else if rest.starts_with("~|") {
262            (VampireTokenType::NorKw, 2)
263        }
264        else if rest.starts_with("~&") {
265            (VampireTokenType::NandKw, 2)
266        }
267        else if rest.starts_with("==") {
268            (VampireTokenType::DoubleEq, 2)
269        }
270        else if rest.starts_with("!=") {
271            (VampireTokenType::NotEq, 2)
272        }
273        else if rest.starts_with("<=") {
274            (VampireTokenType::LessEq, 2)
275        }
276        else if rest.starts_with(">=") {
277            (VampireTokenType::GreaterEq, 2)
278        }
279        else if rest.starts_with("&&") {
280            (VampireTokenType::AndAnd, 2)
281        }
282        else if rest.starts_with("||") {
283            (VampireTokenType::OrOr, 2)
284        }
285        else if rest.starts_with("++") {
286            (VampireTokenType::PlusPlus, 2)
287        }
288        else if rest.starts_with("--") {
289            (VampireTokenType::MinusMinus, 2)
290        }
291        else if rest.starts_with("+=") {
292            (VampireTokenType::PlusEq, 2)
293        }
294        else if rest.starts_with("-=") {
295            (VampireTokenType::MinusEq, 2)
296        }
297        else if rest.starts_with("*=") {
298            (VampireTokenType::StarEq, 2)
299        }
300        else if rest.starts_with("/=") {
301            (VampireTokenType::SlashEq, 2)
302        }
303        else if rest.starts_with("%=") {
304            (VampireTokenType::PercentEq, 2)
305        }
306        else if rest.starts_with("<<") {
307            (VampireTokenType::LeftShift, 2)
308        }
309        else if rest.starts_with(">>") {
310            (VampireTokenType::RightShift, 2)
311        }
312        else if rest.starts_with("->") {
313            (VampireTokenType::Arrow, 2)
314        }
315        else {
316            return false;
317        };
318
319        state.advance(len);
320        state.add_token(kind, start, state.get_position());
321        true
322    }
323
324    /// Lex single character tokens
325    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
326        if let Some(ch) = state.peek() {
327            let start = state.get_position();
328            let kind = match ch {
329                '(' => Some(VampireTokenType::LeftParen),
330                ')' => Some(VampireTokenType::RightParen),
331                '[' => Some(VampireTokenType::LeftBracket),
332                ']' => Some(VampireTokenType::RightBracket),
333                '{' => Some(VampireTokenType::LeftBrace),
334                '}' => Some(VampireTokenType::RightBrace),
335                ':' => Some(VampireTokenType::Colon),
336                ';' => Some(VampireTokenType::Semicolon),
337                '.' => Some(VampireTokenType::Dot),
338                ',' => Some(VampireTokenType::Comma),
339                '?' => Some(VampireTokenType::Question),
340                '!' => Some(VampireTokenType::Bang),
341                '@' => Some(VampireTokenType::At),
342                '#' => Some(VampireTokenType::Hash),
343                '$' => Some(VampireTokenType::Dollar),
344                '%' => Some(VampireTokenType::Percent),
345                '^' => Some(VampireTokenType::Caret),
346                '&' => Some(VampireTokenType::Ampersand),
347                '*' => Some(VampireTokenType::Star),
348                '+' => Some(VampireTokenType::Plus),
349                '-' => Some(VampireTokenType::Minus),
350                '=' => Some(VampireTokenType::Eq),
351                '<' => Some(VampireTokenType::LessThan),
352                '>' => Some(VampireTokenType::GreaterThan),
353                '/' => Some(VampireTokenType::Slash),
354                '\\' => Some(VampireTokenType::Backslash),
355                '|' => Some(VampireTokenType::Pipe),
356                '~' => Some(VampireTokenType::Tilde),
357                _ => None,
358            };
359
360            if let Some(token_kind) = kind {
361                state.advance(ch.len_utf8());
362                state.add_token(token_kind, start, state.get_position());
363                true
364            }
365            else {
366                false
367            }
368        }
369        else {
370            false
371        }
372    }
373}
oak_vampire/lexer/mod.rs

oak_vampire/lexer/
mod.rs