Skip to main content

oak_apl/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use token_type::AplTokenType;
5
6use crate::language::AplLanguage;
7use oak_core::{
8    Lexer, LexerCache, LexerState, OakError,
9    lexer::{LexOutput, WhitespaceConfig},
10    source::Source,
11};
12use std::sync::LazyLock;
13
14type State<'a, S> = LexerState<'a, S, AplLanguage>;
15
16static APL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
17
18#[derive(Clone, Debug)]
19pub struct AplLexer<'config> {
20    config: &'config AplLanguage,
21}
22
23impl<'config> Lexer<AplLanguage> for AplLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<AplLanguage>) -> LexOutput<AplLanguage> {
25        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
26        let result = self.run(&mut state);
27        if result.is_ok() {
28            state.add_eof();
29        }
30        state.finish_with_cache(result, cache)
31    }
32}
33
34impl<'config> AplLexer<'config> {
35    pub fn new(config: &'config AplLanguage) -> Self {
36        Self { config }
37    }
38
39    /// 主要词法分析逻辑
40    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
41        while state.not_at_end() {
42            let safe_point = state.get_position();
43
44            if self.skip_whitespace(state) {
45                continue;
46            }
47
48            if self.skip_comment(state) {
49                continue;
50            }
51
52            if self.lex_string_literal(state) {
53                continue;
54            }
55
56            if self.lex_number_literal(state) {
57                continue;
58            }
59
60            if self.lex_identifier(state) {
61                continue;
62            }
63
64            if self.lex_symbols(state) {
65                continue;
66            }
67
68            // 如果没有匹配任何模式,跳过当前字符并生成 Error token
69            if let Some(ch) = state.peek() {
70                state.advance(ch.len_utf8());
71                state.add_token(AplTokenType::Error, safe_point, state.get_position());
72            }
73        }
74
75        Ok(())
76    }
77
78    /// 跳过空白字符
79    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
80        APL_WHITESPACE.scan(state, AplTokenType::Whitespace)
81    }
82
83    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
84        let start = state.get_position();
85        if state.peek() == Some('⍝') {
86            state.advance('⍝'.len_utf8());
87            while let Some(ch) = state.peek() {
88                if ch == '\n' || ch == '\r' {
89                    break;
90                }
91                state.advance(ch.len_utf8());
92            }
93            state.add_token(AplTokenType::Comment, start, state.get_position());
94            return true;
95        }
96        false
97    }
98
99    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
100        let start = state.get_position();
101        if let Some(quote) = state.peek() {
102            if quote == '\'' || quote == '"' {
103                state.advance(1);
104                while let Some(ch) = state.peek() {
105                    if ch == quote {
106                        state.advance(1);
107                        if state.peek() == Some(quote) {
108                            state.advance(1);
109                            continue;
110                        }
111                        break;
112                    }
113                    state.advance(ch.len_utf8());
114                    if ch == '\n' || ch == '\r' {
115                        break;
116                    }
117                }
118                state.add_token(AplTokenType::StringLiteral, start, state.get_position());
119                return true;
120            }
121        }
122        false
123    }
124
125    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
126        let start = state.get_position();
127        if let Some(ch) = state.peek() {
128            if ch.is_ascii_digit() || ch == '¯' || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
129                if ch == '¯' {
130                    state.advance('¯'.len_utf8());
131                }
132
133                let mut has_digits = false;
134                while let Some(c) = state.peek() {
135                    if c.is_ascii_digit() {
136                        state.advance(1);
137                        has_digits = true;
138                    }
139                    else {
140                        break;
141                    }
142                }
143
144                if state.peek() == Some('.') {
145                    state.advance(1);
146                    while let Some(c) = state.peek() {
147                        if c.is_ascii_digit() {
148                            state.advance(1);
149                            has_digits = true;
150                        }
151                        else {
152                            break;
153                        }
154                    }
155                }
156
157                if !has_digits && state.get_position() == start {
158                    return false;
159                }
160
161                if let Some(e) = state.peek() {
162                    if e == 'e' || e == 'E' {
163                        state.advance(1);
164                        if let Some(sign) = state.peek() {
165                            if sign == '+' || sign == '-' || sign == '¯' {
166                                state.advance(sign.len_utf8());
167                            }
168                        }
169                        while let Some(c) = state.peek() {
170                            if c.is_ascii_digit() {
171                                state.advance(1);
172                            }
173                            else {
174                                break;
175                            }
176                        }
177                    }
178                }
179
180                state.add_token(AplTokenType::NumberLiteral, start, state.get_position());
181                return true;
182            }
183        }
184        false
185    }
186
187    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
188        let start = state.get_position();
189        if let Some(ch) = state.peek() {
190            if ch.is_alphabetic() || ch == '∆' || ch == '⍙' {
191                state.advance(ch.len_utf8());
192                while let Some(c) = state.peek() {
193                    if c.is_alphanumeric() || c == '∆' || c == '⍙' || c == '_' {
194                        state.advance(c.len_utf8());
195                    }
196                    else {
197                        break;
198                    }
199                }
200                state.add_token(AplTokenType::Identifier, start, state.get_position());
201                return true;
202            }
203        }
204        false
205    }
206
207    fn lex_symbols<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
208        let start = state.get_position();
209        if let Some(ch) = state.peek() {
210            let token = match ch {
211                '←' => AplTokenType::LeftArrow,
212                '→' => AplTokenType::RightArrow,
213                '⋄' => AplTokenType::Diamond,
214                '⎕' => AplTokenType::Quad,
215                '⍞' => AplTokenType::QuoteQuad,
216                '⍴' => AplTokenType::Rho,
217                '⍳' => AplTokenType::Iota,
218                '∊' => AplTokenType::Epsilon,
219                '↑' => AplTokenType::UpArrow,
220                '↓' => AplTokenType::DownArrow,
221                '∇' => AplTokenType::Del,
222                '∆' => AplTokenType::Delta,
223                '⍺' => AplTokenType::Alpha,
224                '⍵' => AplTokenType::Omega,
225                '⍬' => AplTokenType::Zilde,
226                '+' => AplTokenType::Plus,
227                '-' => AplTokenType::Minus,
228                '×' => AplTokenType::Times,
229                '÷' => AplTokenType::Divide,
230                '*' => AplTokenType::Star,
231                '⍟' => AplTokenType::Log,
232                '○' => AplTokenType::Circle,
233                '∨' => AplTokenType::Or,
234                '∧' => AplTokenType::And,
235                '∼' => AplTokenType::Not,
236                '⍱' => AplTokenType::Nor,
237                '⍲' => AplTokenType::Nand,
238                '=' => AplTokenType::Equal,
239                '≠' => AplTokenType::NotEqual,
240                '<' => AplTokenType::LessThan,
241                '≤' => AplTokenType::LessEqual,
242                '≥' => AplTokenType::GreaterEqual,
243                '>' => AplTokenType::GreaterThan,
244                '⌈' => AplTokenType::UpStile,
245                '⌊' => AplTokenType::DownStile,
246                '|' => AplTokenType::Bar,
247                '~' => AplTokenType::Tilde,
248                '?' => AplTokenType::Question,
249                '!' => AplTokenType::Factorial,
250                '/' => AplTokenType::Slash,
251                '\\' => AplTokenType::Backslash,
252                '⌿' => AplTokenType::SlashBar,
253                '⍀' => AplTokenType::BackslashBar,
254                '.' => AplTokenType::Dot,
255                '∘' => AplTokenType::Jot,
256                '¨' => AplTokenType::Diaeresis,
257                '⍣' => AplTokenType::Power,
258                '⍤' => AplTokenType::Rank,
259                '≢' => AplTokenType::Tally,
260                '(' => AplTokenType::LeftParen,
261                ')' => AplTokenType::RightParen,
262                '[' => AplTokenType::LeftBracket,
263                ']' => AplTokenType::RightBracket,
264                '{' => AplTokenType::LeftBrace,
265                '}' => AplTokenType::RightBrace,
266                ';' => AplTokenType::Semicolon,
267                _ => return false,
268            };
269            state.advance(ch.len_utf8());
270            state.add_token(token, start, state.get_position());
271            return true;
272        }
273        false
274    }
275}