oak_vampire/lexer/
mod.rs

1use crate::{kind::VampireSyntaxKind, language::VampireLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, VampireLanguage>;
10
11static VAMPIRE_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static VAMPIRE_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["%"] });
13static VAMPIRE_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone)]
16pub struct VampireLexer<'config> {
17    config: &'config VampireLanguage,
18}
19
20impl<'config> Lexer<VampireLanguage> for VampireLexer<'config> {
21    fn lex_incremental(
22        &self,
23        source: impl Source,
24        changed: usize,
25        cache: IncrementalCache<VampireLanguage>,
26    ) -> LexOutput<VampireLanguage> {
27        let mut state = LexerState::new_with_cache(source, changed, cache);
28        let result = self.run(&mut state);
29        state.finish(result)
30    }
31}
32
33impl<'config> VampireLexer<'config> {
34    pub fn new(config: &'config VampireLanguage) -> Self {
35        Self { config }
36    }
37
38    /// Main lexing loop
39    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42
43            if self.skip_whitespace(state) {
44                continue;
45            }
46
47            if self.skip_comment(state) {
48                continue;
49            }
50
51            if self.lex_string_literal(state) {
52                continue;
53            }
54
55            if self.lex_number_literal(state) {
56                continue;
57            }
58
59            if self.lex_identifier_or_keyword(state) {
60                continue;
61            }
62
63            if self.lex_operators(state) {
64                continue;
65            }
66
67            if self.lex_single_char_tokens(state) {
68                continue;
69            }
70
71            state.safe_check(safe_point);
72        }
73
74        // Add EOF token
75        let eof_pos = state.get_position();
76        state.add_token(VampireSyntaxKind::Eof, eof_pos, eof_pos);
77        Ok(())
78    }
79
80    /// Skip whitespace characters
81    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
82        match VAMPIRE_WHITESPACE.scan(state.rest(), state.get_position(), VampireSyntaxKind::Whitespace) {
83            Some(token) => {
84                state.advance_with(token);
85                true
86            }
87            None => false,
88        }
89    }
90
91    /// Skip comment lines
92    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
93        let start = state.get_position();
94        let rest = state.rest();
95
96        // Line comment: % ... until newline
97        if rest.starts_with("%") {
98            state.advance(1);
99            while let Some(ch) = state.peek() {
100                if ch == '\n' || ch == '\r' {
101                    break;
102                }
103                state.advance(ch.len_utf8());
104            }
105            state.add_token(VampireSyntaxKind::LineComment, start, state.get_position());
106            return true;
107        }
108
109        // Block comment: /* ... */
110        if rest.starts_with("/*") {
111            state.advance(2);
112            while let Some(ch) = state.peek() {
113                if ch == '*' && state.peek_next_n(1) == Some('/') {
114                    state.advance(2);
115                    break;
116                }
117                state.advance(ch.len_utf8());
118            }
119            state.add_token(VampireSyntaxKind::BlockComment, start, state.get_position());
120            return true;
121        }
122
123        false
124    }
125
126    /// Lex string literals
127    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
128        let start = state.get_position();
129
130        if state.current() == Some('"') {
131            state.advance(1);
132            let mut escaped = false;
133            while let Some(ch) = state.peek() {
134                if ch == '"' && !escaped {
135                    state.advance(1); // consume closing quote
136                    break;
137                }
138                state.advance(ch.len_utf8());
139                if escaped {
140                    escaped = false;
141                    continue;
142                }
143                if ch == '\\' {
144                    escaped = true;
145                    continue;
146                }
147                if ch == '\n' || ch == '\r' {
148                    break;
149                }
150            }
151            state.add_token(VampireSyntaxKind::StringLiteral, start, state.get_position());
152            return true;
153        }
154        false
155    }
156
157    /// Lex number literals
158    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
159        let start = state.get_position();
160        let first = match state.current() {
161            Some(c) => c,
162            None => return false,
163        };
164
165        if !first.is_ascii_digit() && first != '-' && first != '+' {
166            return false;
167        }
168
169        let mut is_real = false;
170
171        // Handle sign
172        if first == '-' || first == '+' {
173            state.advance(1);
174            if !state.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
175                state.set_position(start);
176                return false;
177            }
178        }
179
180        // Integer part
181        while let Some(c) = state.peek() {
182            if c.is_ascii_digit() {
183                state.advance(1);
184            }
185            else {
186                break;
187            }
188        }
189
190        // Fractional part
191        if state.peek() == Some('.') {
192            let n1 = state.peek_next_n(1);
193            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
194                is_real = true;
195                state.advance(1); // consume '.'
196                while let Some(c) = state.peek() {
197                    if c.is_ascii_digit() {
198                        state.advance(1);
199                    }
200                    else {
201                        break;
202                    }
203                }
204            }
205        }
206
207        // Exponent
208        if let Some(c) = state.peek() {
209            if c == 'e' || c == 'E' {
210                let n1 = state.peek_next_n(1);
211                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
212                    is_real = true;
213                    state.advance(1);
214                    if let Some(sign) = state.peek() {
215                        if sign == '+' || sign == '-' {
216                            state.advance(1);
217                        }
218                    }
219                    while let Some(d) = state.peek() {
220                        if d.is_ascii_digit() {
221                            state.advance(1);
222                        }
223                        else {
224                            break;
225                        }
226                    }
227                }
228            }
229        }
230
231        let end = state.get_position();
232        state.add_token(if is_real { VampireSyntaxKind::RealLiteral } else { VampireSyntaxKind::IntegerLiteral }, start, end);
233        true
234    }
235
236    /// Lex identifiers and keywords
237    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
238        let start = state.get_position();
239        let ch = match state.current() {
240            Some(c) => c,
241            None => return false,
242        };
243
244        if !(ch.is_ascii_alphabetic() || ch == '_' || ch == '$') {
245            return false;
246        }
247
248        state.advance(1);
249        while let Some(c) = state.current() {
250            if c.is_ascii_alphanumeric() || c == '_' || c == '$' {
251                state.advance(1);
252            }
253            else {
254                break;
255            }
256        }
257
258        let end = state.get_position();
259        let text = state.get_text_in((start..end).into());
260        let kind = match text {
261            // TPTP formula types
262            "fof" => VampireSyntaxKind::FofKw,
263            "cnf" => VampireSyntaxKind::CnfKw,
264            "tff" => VampireSyntaxKind::TffKw,
265            "thf" => VampireSyntaxKind::ThfKw,
266            "tpi" => VampireSyntaxKind::TpiKw,
267            "include" => VampireSyntaxKind::IncludeKw,
268
269            // Formula roles
270            "axiom" => VampireSyntaxKind::AxiomKw,
271            "hypothesis" => VampireSyntaxKind::HypothesisKw,
272            "definition" => VampireSyntaxKind::DefinitionKw,
273            "assumption" => VampireSyntaxKind::AssumptionKw,
274            "lemma" => VampireSyntaxKind::LemmaKw,
275            "theorem" => VampireSyntaxKind::TheoremKw,
276            "conjecture" => VampireSyntaxKind::ConjectureKw,
277            "negated_conjecture" => VampireSyntaxKind::NegatedConjectureKw,
278            "plain" => VampireSyntaxKind::PlainKw,
279            "type" => VampireSyntaxKind::TypeKw,
280            "fi_domain" => VampireSyntaxKind::FiDomainKw,
281            "fi_functors" => VampireSyntaxKind::FiFunctorsKw,
282            "fi_predicates" => VampireSyntaxKind::FiPredicatesKw,
283            "unknown" => VampireSyntaxKind::UnknownKw,
284
285            // Logical operators
286            "!" => VampireSyntaxKind::ForallKw,
287            "?" => VampireSyntaxKind::ExistsKw,
288            "&" => VampireSyntaxKind::AndKw,
289            "|" => VampireSyntaxKind::OrKw,
290            "~" => VampireSyntaxKind::NotKw,
291            "=>" => VampireSyntaxKind::ImpliesKw,
292            "<=>" => VampireSyntaxKind::IffKw,
293            "<~>" => VampireSyntaxKind::XorKw,
294            "~|" => VampireSyntaxKind::NorKw,
295            "~&" => VampireSyntaxKind::NandKw,
296
297            // Types
298            "$o" => VampireSyntaxKind::BoolKw,
299            "$i" => VampireSyntaxKind::IndividualKw,
300            "$int" => VampireSyntaxKind::IntKw,
301            "$real" => VampireSyntaxKind::RealKw,
302            "$rat" => VampireSyntaxKind::RatKw,
303            "$tType" => VampireSyntaxKind::TTypeKw,
304            "$oType" => VampireSyntaxKind::OTypeKw,
305            "$iType" => VampireSyntaxKind::ITypeKw,
306
307            // Boolean literals
308            "$true" => VampireSyntaxKind::BoolLiteral,
309            "$false" => VampireSyntaxKind::BoolLiteral,
310
311            _ => VampireSyntaxKind::Identifier,
312        };
313
314        state.add_token(kind, start, state.get_position());
315        true
316    }
317
318    /// Lex operators
319    fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
320        let start = state.get_position();
321        let rest = state.rest();
322
323        // Multi-character operators (longest first)
324        let patterns: &[(&str, VampireSyntaxKind)] = &[
325            ("<==>", VampireSyntaxKind::IffKw),
326            ("<~>", VampireSyntaxKind::XorKw),
327            ("==>", VampireSyntaxKind::ImpliesKw),
328            ("~|", VampireSyntaxKind::NorKw),
329            ("~&", VampireSyntaxKind::NandKw),
330            ("==", VampireSyntaxKind::EqEq),
331            ("!=", VampireSyntaxKind::NotEq),
332            ("<=", VampireSyntaxKind::LessEq),
333            (">=", VampireSyntaxKind::GreaterEq),
334            ("&&", VampireSyntaxKind::AndAnd),
335            ("||", VampireSyntaxKind::OrOr),
336            ("++", VampireSyntaxKind::PlusPlus),
337            ("--", VampireSyntaxKind::MinusMinus),
338            ("+=", VampireSyntaxKind::PlusEq),
339            ("-=", VampireSyntaxKind::MinusEq),
340            ("*=", VampireSyntaxKind::StarEq),
341            ("/=", VampireSyntaxKind::SlashEq),
342            ("%=", VampireSyntaxKind::PercentEq),
343            ("<<", VampireSyntaxKind::LeftShift),
344            (">>", VampireSyntaxKind::RightShift),
345            ("->", VampireSyntaxKind::Arrow),
346        ];
347
348        for (pat, kind) in patterns {
349            if rest.starts_with(pat) {
350                state.advance(pat.len());
351                state.add_token(*kind, start, state.get_position());
352                return true;
353            }
354        }
355
356        // Single character operators
357        if let Some(ch) = state.current() {
358            let kind = match ch {
359                '=' => Some(VampireSyntaxKind::Eq),
360                '<' => Some(VampireSyntaxKind::LessThan),
361                '>' => Some(VampireSyntaxKind::GreaterThan),
362                '+' => Some(VampireSyntaxKind::Plus),
363                '-' => Some(VampireSyntaxKind::Minus),
364                '*' => Some(VampireSyntaxKind::Star),
365                '/' => Some(VampireSyntaxKind::Slash),
366                '\\' => Some(VampireSyntaxKind::Backslash),
367                '%' => Some(VampireSyntaxKind::Percent),
368                '^' => Some(VampireSyntaxKind::Caret),
369                '&' => Some(VampireSyntaxKind::Ampersand),
370                '|' => Some(VampireSyntaxKind::Pipe),
371                '~' => Some(VampireSyntaxKind::Tilde),
372                '!' => Some(VampireSyntaxKind::Bang),
373                '?' => Some(VampireSyntaxKind::Question),
374                '.' => Some(VampireSyntaxKind::Dot),
375                ':' => Some(VampireSyntaxKind::Colon),
376                _ => None,
377            };
378
379            if let Some(k) = kind {
380                state.advance(ch.len_utf8());
381                state.add_token(k, start, state.get_position());
382                return true;
383            }
384        }
385
386        false
387    }
388
389    /// Lex single character tokens
390    fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
391        let start = state.get_position();
392        if let Some(ch) = state.current() {
393            let kind = match ch {
394                '(' => VampireSyntaxKind::LeftParen,
395                ')' => VampireSyntaxKind::RightParen,
396                '[' => VampireSyntaxKind::LeftBracket,
397                ']' => VampireSyntaxKind::RightBracket,
398                '{' => VampireSyntaxKind::LeftBrace,
399                '}' => VampireSyntaxKind::RightBrace,
400                ',' => VampireSyntaxKind::Comma,
401                ';' => VampireSyntaxKind::Semicolon,
402                '@' => VampireSyntaxKind::At,
403                '#' => VampireSyntaxKind::Hash,
404                '$' => VampireSyntaxKind::Dollar,
405                _ => {
406                    // Unknown character, create error token
407                    state.advance(ch.len_utf8());
408                    state.add_token(VampireSyntaxKind::Error, start, state.get_position());
409                    return true;
410                }
411            };
412
413            state.advance(ch.len_utf8());
414            state.add_token(kind, start, state.get_position());
415            true
416        }
417        else {
418            false
419        }
420    }
421}