// oak_clojure/lexer/mod.rs

1use crate::{ClojureLanguage, ClojureSyntaxKind};
2use oak_core::{
3    lexer::{LexOutput, Lexer, LexerState},
4    source::Source,
5    tree::IncrementalCache,
6};
7
/// Lexer for Clojure source text.
///
/// This is a stateless unit type: tokenization is driven through its
/// `Lexer<ClojureLanguage>` implementation, which delegates to the private
/// `lex_*` helper methods defined on the type.
#[derive(Debug, Clone, Copy, Default)]
pub struct ClojureLexer;
9
10impl Lexer<ClojureLanguage> for ClojureLexer {
11    fn lex_incremental(
12        &self,
13        source: impl Source,
14        _changed: usize,
15        _cache: IncrementalCache<ClojureLanguage>,
16    ) -> LexOutput<ClojureLanguage> {
17        let mut state = LexerState::new_with_cache(source, _changed, _cache);
18        while state.not_at_end() {
19            let start = state.get_position();
20
21            match state.current() {
22                Some(c) if c.is_whitespace() => {
23                    self.lex_whitespace(&mut state);
24                }
25                Some(';') => {
26                    self.lex_comment(&mut state);
27                }
28                Some('"') => {
29                    self.lex_string(&mut state);
30                }
31                Some('\\') => {
32                    self.lex_character(&mut state);
33                }
34                Some(c) if c.is_ascii_digit() => {
35                    self.lex_number(&mut state);
36                }
37                Some(':') => {
38                    self.lex_keyword(&mut state);
39                }
40                Some('#') => {
41                    self.lex_dispatch(&mut state);
42                }
43                Some('(') => {
44                    state.advance(1);
45                    state.add_token(ClojureSyntaxKind::ListStart, start, state.get_position());
46                }
47                Some(')') => {
48                    state.advance(1);
49                    state.add_token(ClojureSyntaxKind::ListEnd, start, state.get_position());
50                }
51                Some('[') => {
52                    state.advance(1);
53                    state.add_token(ClojureSyntaxKind::VectorStart, start, state.get_position());
54                }
55                Some(']') => {
56                    state.advance(1);
57                    state.add_token(ClojureSyntaxKind::VectorEnd, start, state.get_position());
58                }
59                Some('{') => {
60                    state.advance(1);
61                    state.add_token(ClojureSyntaxKind::MapStart, start, state.get_position());
62                }
63                Some('}') => {
64                    state.advance(1);
65                    state.add_token(ClojureSyntaxKind::MapEnd, start, state.get_position());
66                }
67                Some('\'') => {
68                    state.advance(1);
69                    state.add_token(ClojureSyntaxKind::Quote, start, state.get_position());
70                }
71                Some('`') => {
72                    state.advance(1);
73                    state.add_token(ClojureSyntaxKind::Quote, start, state.get_position());
74                }
75                Some('~') => {
76                    if state.peek() == Some('@') {
77                        state.advance(2);
78                        state.add_token(ClojureSyntaxKind::UnquoteSplice, start, state.get_position());
79                    }
80                    else {
81                        state.advance(1);
82                        state.add_token(ClojureSyntaxKind::Unquote, start, state.get_position());
83                    }
84                }
85                Some('^') => {
86                    state.advance(1);
87                    state.add_token(ClojureSyntaxKind::Meta, start, state.get_position());
88                }
89                Some(_) => {
90                    self.lex_symbol(&mut state);
91                }
92                None => break,
93            }
94        }
95
96        state.finish(Ok(()))
97    }
98}
99
100impl ClojureLexer {
101    fn lex_whitespace<S: Source>(&self, state: &mut LexerState<S, ClojureLanguage>) {
102        let start = state.get_position();
103        while let Some(c) = state.current() {
104            if c.is_whitespace() {
105                state.advance(1);
106            }
107            else {
108                break;
109            }
110        }
111        state.add_token(ClojureSyntaxKind::Whitespace, start, state.get_position());
112    }
113
114    fn lex_comment<S: Source>(&self, state: &mut LexerState<S, ClojureLanguage>) {
115        let start = state.get_position();
116        state.advance(1); // Skip ';'
117
118        while let Some(c) = state.current() {
119            if c == '\n' {
120                break;
121            }
122            state.advance(1);
123        }
124
125        state.add_token(ClojureSyntaxKind::Comment, start, state.get_position());
126    }
127
128    fn lex_string<S: Source>(&self, state: &mut LexerState<S, ClojureLanguage>) {
129        let start = state.get_position();
130        state.advance(1); // Skip opening quote
131
132        while let Some(c) = state.current() {
133            if c == '"' {
134                state.advance(1);
135                break;
136            }
137            else if c == '\\' {
138                state.advance(1); // Skip escape character
139                if state.current().is_some() {
140                    state.advance(1); // Skip escaped character
141                }
142            }
143            else {
144                state.advance(1);
145            }
146        }
147
148        state.add_token(ClojureSyntaxKind::StringLiteral, start, state.get_position());
149    }
150
151    fn lex_character<S: Source>(&self, state: &mut LexerState<S, ClojureLanguage>) {
152        let start = state.get_position();
153        state.advance(1); // Skip '\'
154
155        if let Some(_) = state.current() {
156            state.advance(1);
157        }
158
159        state.add_token(ClojureSyntaxKind::CharacterLiteral, start, state.get_position());
160    }
161
162    fn lex_number<S: Source>(&self, state: &mut LexerState<S, ClojureLanguage>) {
163        let start = state.get_position();
164
165        while let Some(c) = state.current() {
166            if c.is_ascii_digit() || c == '.' {
167                state.advance(1);
168            }
169            else {
170                break;
171            }
172        }
173
174        state.add_token(ClojureSyntaxKind::NumberLiteral, start, state.get_position());
175    }
176
177    fn lex_keyword<S: Source>(&self, state: &mut LexerState<S, ClojureLanguage>) {
178        let start = state.get_position();
179        state.advance(1); // Skip ':'
180
181        while let Some(c) = state.current() {
182            if c.is_alphanumeric() || c == '-' || c == '_' || c == '?' || c == '!' {
183                state.advance(1);
184            }
185            else {
186                break;
187            }
188        }
189
190        state.add_token(ClojureSyntaxKind::KeywordLiteral, start, state.get_position());
191    }
192
193    fn lex_dispatch<S: Source>(&self, state: &mut LexerState<S, ClojureLanguage>) {
194        let start = state.get_position();
195        state.advance(1); // Skip '#'
196
197        match state.current() {
198            Some('{') => {
199                state.advance(1);
200                state.add_token(ClojureSyntaxKind::SetStart, start, state.get_position());
201            }
202            Some('(') => {
203                state.advance(1);
204                state.add_token(ClojureSyntaxKind::AnonFnStart, start, state.get_position());
205            }
206            Some('"') => {
207                self.lex_regex(state, start);
208            }
209            _ => {
210                state.add_token(ClojureSyntaxKind::Dispatch, start, state.get_position());
211            }
212        }
213    }
214
215    fn lex_regex<S: Source>(&self, state: &mut LexerState<S, ClojureLanguage>, start: usize) {
216        state.advance(1); // Skip '"'
217
218        while let Some(c) = state.current() {
219            if c == '"' {
220                state.advance(1);
221                break;
222            }
223            else if c == '\\' {
224                state.advance(1); // Skip escape character
225                if state.current().is_some() {
226                    state.advance(1); // Skip escaped character
227                }
228            }
229            else {
230                state.advance(1);
231            }
232        }
233
234        state.add_token(ClojureSyntaxKind::RegexLiteral, start, state.get_position());
235    }
236
237    fn lex_symbol<S: Source>(&self, state: &mut LexerState<S, ClojureLanguage>) {
238        let start = state.get_position();
239
240        while let Some(c) = state.current() {
241            if c.is_alphanumeric() || c == '-' || c == '_' || c == '?' || c == '!' || c == '*' || c == '+' || c == '/' {
242                state.advance(1);
243            }
244            else {
245                break;
246            }
247        }
248
249        state.add_token(ClojureSyntaxKind::Symbol, start, state.get_position());
250    }
251}