oak_j/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2/// J token type definitions
3pub mod token_type;
4
5pub use token_type::JTokenType;
6
7use crate::language::JLanguage;
8use oak_core::{
9    Lexer, LexerCache, LexerState, OakError,
10    lexer::{LexOutput, WhitespaceConfig},
11    source::Source,
12};
13use std::sync::LazyLock;
14
/// Shorthand for the core lexer state specialised to the J language.
pub(crate) type State<'a, S> = LexerState<'a, S, JLanguage>;
16
/// Whitespace scanner shared by all lexer instances; lazily built once.
/// `unicode_whitespace: true` presumably makes the scanner accept
/// non-ASCII whitespace as well — exact semantics defined in `oak_core`.
static J_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
18
/// J language lexer.
///
/// Borrows its [`JLanguage`] configuration for the `'config` lifetime;
/// construct one with [`JLexer::new`].
#[derive(Clone, Debug)]
pub struct JLexer<'config> {
    // Language configuration this lexer was created from (currently not
    // read by any lexing rule in this file — kept for future use).
    config: &'config JLanguage,
}
24
25impl<'config> Lexer<JLanguage> for JLexer<'config> {
26    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<JLanguage>) -> LexOutput<JLanguage> {
27        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
28        let result = self.run(&mut state);
29        if result.is_ok() {
30            state.add_eof();
31        }
32        state.finish_with_cache(result, cache)
33    }
34}
35
36impl<'config> JLexer<'config> {
37    /// Creates a new J lexer.
38    pub fn new(config: &'config JLanguage) -> Self {
39        Self { config }
40    }
41
42    /// Main lexing logic.
43    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
44        while state.not_at_end() {
45            let safe_point = state.get_position();
46
47            if self.skip_whitespace(state) {
48                continue;
49            }
50
51            if self.skip_comment(state) {
52                continue;
53            }
54
55            if self.lex_string_literal(state) {
56                continue;
57            }
58
59            if self.lex_number_literal(state) {
60                continue;
61            }
62
63            if self.lex_identifier(state) {
64                continue;
65            }
66
67            if self.lex_operators(state) {
68                continue;
69            }
70
71            // If no rules matched, skip current character and add error token
72            if let Some(ch) = state.peek() {
73                state.advance(ch.len_utf8());
74                state.add_token(JTokenType::Error, safe_point, state.get_position());
75            }
76        }
77
78        Ok(())
79    }
80
81    /// Skips whitespace characters.
82    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
83        J_WHITESPACE.scan(state, JTokenType::Whitespace)
84    }
85
86    /// J language comments start with `NB.`
87    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        let start = state.get_position();
89        if state.consume_if_starts_with("NB.") {
90            while let Some(ch) = state.peek() {
91                if ch == '\n' || ch == '\r' {
92                    break;
93                }
94                state.advance(ch.len_utf8());
95            }
96            state.add_token(JTokenType::Comment, start, state.get_position());
97            return true;
98        }
99        false
100    }
101
102    /// String literal.
103    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
104        let start = state.get_position();
105        if state.consume_if_starts_with("'") {
106            while let Some(ch) = state.peek() {
107                if ch == '\'' {
108                    state.advance(ch.len_utf8());
109                    // Handle escaped single quote ''
110                    if state.consume_if_starts_with("'") {
111                        continue;
112                    }
113                    state.add_token(JTokenType::StringLiteral, start, state.get_position());
114                    return true;
115                }
116                state.advance(ch.len_utf8());
117            }
118            // Unclosed string.
119            state.add_token(JTokenType::Error, start, state.get_position());
120            return true;
121        }
122        false
123    }
124
125    /// Number literal.
126    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
127        let start = state.get_position();
128        if let Some(ch) = state.peek() {
129            if ch.is_ascii_digit() || ch == '_' {
130                // J uses _ for negative sign.
131                state.advance(ch.len_utf8());
132                while let Some(ch) = state.peek() {
133                    if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' || ch == 'j' || ch == 'r' {
134                        state.advance(ch.len_utf8());
135                    }
136                    else {
137                        break;
138                    }
139                }
140                state.add_token(JTokenType::NumberLiteral, start, state.get_position());
141                return true;
142            }
143        }
144        false
145    }
146
147    /// Identifier.
148    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
149        let start = state.get_position();
150        if let Some(ch) = state.peek() {
151            if ch.is_ascii_alphabetic() {
152                state.advance(ch.len_utf8());
153                while let Some(ch) = state.peek() {
154                    if ch.is_ascii_alphanumeric() || ch == '_' {
155                        state.advance(ch.len_utf8());
156                    }
157                    else {
158                        break;
159                    }
160                }
161                state.add_token(JTokenType::Identifier, start, state.get_position());
162                return true;
163            }
164        }
165        false
166    }
167
168    /// Operators and special symbols.
169    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
170        let start = state.get_position();
171
172        // Try to match long operators.
173        for (op, token) in [("=:", JTokenType::IsGlobal), ("=.", JTokenType::IsLocal)] {
174            if state.consume_if_starts_with(op) {
175                state.add_token(token, start, state.get_position());
176                return true;
177            }
178        }
179
180        // Match single character operators.
181        if let Some(ch) = state.peek() {
182            let token = match ch {
183                '=' => Some(JTokenType::Equal),
184                '.' => Some(JTokenType::Dot),
185                ':' => Some(JTokenType::Colon),
186                '+' => Some(JTokenType::Plus),
187                '-' => Some(JTokenType::Minus),
188                '*' => Some(JTokenType::Star),
189                '%' => Some(JTokenType::Percent),
190                '$' => Some(JTokenType::Dollar),
191                ',' => Some(JTokenType::Comma),
192                '#' => Some(JTokenType::Hash),
193                '/' => Some(JTokenType::Slash),
194                '\\' => Some(JTokenType::Backslash),
195                '|' => Some(JTokenType::Pipe),
196                '&' => Some(JTokenType::Ampersand),
197                '^' => Some(JTokenType::Caret),
198                '~' => Some(JTokenType::Tilde),
199                '<' => Some(JTokenType::Less),
200                '>' => Some(JTokenType::Greater),
201                '(' => Some(JTokenType::LeftParen),
202                ')' => Some(JTokenType::RightParen),
203                '[' => Some(JTokenType::LeftBracket),
204                ']' => Some(JTokenType::RightBracket),
205                '{' => Some(JTokenType::LeftBrace),
206                '}' => Some(JTokenType::RightBrace),
207                _ => None,
208            };
209
210            if let Some(token) = token {
211                state.advance(ch.len_utf8());
212                state.add_token(token, start, state.get_position());
213                return true;
214            }
215        }
216
217        false
218    }
219}