// oak_j/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use token_type::JTokenType;
5
6use crate::language::JLanguage;
7use oak_core::{
8    Lexer, LexerCache, LexerState, OakError,
9    lexer::{LexOutput, WhitespaceConfig},
10    source::Source,
11};
12use std::sync::LazyLock;
13
/// Shorthand for the lexer state specialized to the J language.
type State<'a, S> = LexerState<'a, S, JLanguage>;

/// Shared whitespace scanner configuration, built once on first use.
/// `unicode_whitespace: true` accepts all Unicode whitespace, not just ASCII.
static J_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
17
/// Lexer for the J language.
///
/// Borrows a [`JLanguage`] configuration; the lexer itself carries no
/// mutable state between calls to `lex`.
#[derive(Clone, Debug)]
pub struct JLexer<'config> {
    // Language configuration this lexer was created with.
    // NOTE(review): `config` is stored but never read by any method visible
    // in this file — presumably reserved for dialect options; confirm.
    config: &'config JLanguage,
}
22
23impl<'config> Lexer<JLanguage> for JLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<JLanguage>) -> LexOutput<JLanguage> {
25        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
26        let result = self.run(&mut state);
27        if result.is_ok() {
28            state.add_eof();
29        }
30        state.finish_with_cache(result, cache)
31    }
32}
33
34impl<'config> JLexer<'config> {
    /// Create a new `JLexer` borrowing the given language configuration.
    pub fn new(config: &'config JLanguage) -> Self {
        Self { config }
    }
38
39    /// 主要词法分析逻辑
40    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
41        while state.not_at_end() {
42            let safe_point = state.get_position();
43
44            if self.skip_whitespace(state) {
45                continue;
46            }
47
48            if self.skip_comment(state) {
49                continue;
50            }
51
52            if self.lex_string_literal(state) {
53                continue;
54            }
55
56            if self.lex_number_literal(state) {
57                continue;
58            }
59
60            if self.lex_identifier(state) {
61                continue;
62            }
63
64            if self.lex_operators(state) {
65                continue;
66            }
67
68            // 如果没有匹配任何模式,跳过当前字符并生成 Error token
69            if let Some(ch) = state.peek() {
70                state.advance(ch.len_utf8());
71                state.add_token(JTokenType::Error, safe_point, state.get_position());
72            }
73        }
74
75        Ok(())
76    }
77
    /// Skip a run of whitespace and emit a `Whitespace` token.
    ///
    /// Delegates to the shared `J_WHITESPACE` config (Unicode-aware).
    /// Returns whatever `WhitespaceConfig::scan` reports — presumably
    /// whether any whitespace was consumed (TODO confirm in oak_core).
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        J_WHITESPACE.scan(state, JTokenType::Whitespace)
    }
82
83    /// J 语言的注释以 NB. 开头
84    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
85        let start = state.get_position();
86        if state.consume_if_starts_with("NB.") {
87            while let Some(ch) = state.peek() {
88                if ch == '\n' || ch == '\r' {
89                    break;
90                }
91                state.advance(ch.len_utf8());
92            }
93            state.add_token(JTokenType::Comment, start, state.get_position());
94            return true;
95        }
96        false
97    }
98
99    /// 字符串字面量
100    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
101        let start = state.get_position();
102        if state.consume_if_starts_with("'") {
103            while let Some(ch) = state.peek() {
104                if ch == '\'' {
105                    state.advance(ch.len_utf8());
106                    // 处理转义的单引号 ''
107                    if state.consume_if_starts_with("'") {
108                        continue;
109                    }
110                    state.add_token(JTokenType::StringLiteral, start, state.get_position());
111                    return true;
112                }
113                state.advance(ch.len_utf8());
114            }
115            // 未闭合的字符串
116            state.add_token(JTokenType::Error, start, state.get_position());
117            return true;
118        }
119        false
120    }
121
122    /// 数字字面量
123    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
124        let start = state.get_position();
125        if let Some(ch) = state.peek() {
126            if ch.is_ascii_digit() || ch == '_' {
127                // J 使用 _ 表示负号
128                state.advance(ch.len_utf8());
129                while let Some(ch) = state.peek() {
130                    if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' || ch == 'j' || ch == 'r' {
131                        state.advance(ch.len_utf8());
132                    }
133                    else {
134                        break;
135                    }
136                }
137                state.add_token(JTokenType::NumberLiteral, start, state.get_position());
138                return true;
139            }
140        }
141        false
142    }
143
144    /// 标识符
145    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
146        let start = state.get_position();
147        if let Some(ch) = state.peek() {
148            if ch.is_ascii_alphabetic() {
149                state.advance(ch.len_utf8());
150                while let Some(ch) = state.peek() {
151                    if ch.is_ascii_alphanumeric() || ch == '_' {
152                        state.advance(ch.len_utf8());
153                    }
154                    else {
155                        break;
156                    }
157                }
158                state.add_token(JTokenType::Identifier, start, state.get_position());
159                return true;
160            }
161        }
162        false
163    }
164
165    /// 操作符和特殊符号
166    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
167        let start = state.get_position();
168
169        // 尝试匹配长的操作符
170        for (op, token) in [("=:", JTokenType::IsGlobal), ("=.", JTokenType::IsLocal)] {
171            if state.consume_if_starts_with(op) {
172                state.add_token(token, start, state.get_position());
173                return true;
174            }
175        }
176
177        // 匹配单个字符操作符
178        if let Some(ch) = state.peek() {
179            let token = match ch {
180                '=' => Some(JTokenType::Equal),
181                '.' => Some(JTokenType::Dot),
182                ':' => Some(JTokenType::Colon),
183                '+' => Some(JTokenType::Plus),
184                '-' => Some(JTokenType::Minus),
185                '*' => Some(JTokenType::Star),
186                '%' => Some(JTokenType::Percent),
187                '$' => Some(JTokenType::Dollar),
188                ',' => Some(JTokenType::Comma),
189                '#' => Some(JTokenType::Hash),
190                '/' => Some(JTokenType::Slash),
191                '\\' => Some(JTokenType::Backslash),
192                '|' => Some(JTokenType::Pipe),
193                '&' => Some(JTokenType::Ampersand),
194                '^' => Some(JTokenType::Caret),
195                '~' => Some(JTokenType::Tilde),
196                '<' => Some(JTokenType::Less),
197                '>' => Some(JTokenType::Greater),
198                '(' => Some(JTokenType::LeftParen),
199                ')' => Some(JTokenType::RightParen),
200                '[' => Some(JTokenType::LeftBracket),
201                ']' => Some(JTokenType::RightBracket),
202                '{' => Some(JTokenType::LeftBrace),
203                '}' => Some(JTokenType::RightBrace),
204                _ => None,
205            };
206
207            if let Some(token) = token {
208                state.advance(ch.len_utf8());
209                state.add_token(token, start, state.get_position());
210                return true;
211            }
212        }
213
214        false
215    }
216}