//! `oak_clojure/lexer/mod.rs` — lexer module for the Clojure language.

1pub mod token_type;
2pub use token_type::ClojureTokenType;
3
4use crate::ClojureLanguage;
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError,
7    lexer::LexOutput,
8    source::{Source, TextEdit},
9};
10
/// Hand-written lexer for Clojure source text.
///
/// The lexer itself is stateless; all per-run bookkeeping lives in the
/// [`LexerState`] threaded through the helper methods below.
pub struct ClojureLexer;

/// Shorthand for the lexer state specialised to [`ClojureLanguage`].
type State<'a, S> = LexerState<'a, S, ClojureLanguage>;
14
15impl Lexer<ClojureLanguage> for ClojureLexer {
16    fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<ClojureLanguage>) -> LexOutput<ClojureLanguage> {
17        let mut state = State::new(text);
18        let result = self.run(&mut state);
19        if result.is_ok() {
20            state.add_eof();
21        }
22        state.finish_with_cache(result, cache)
23    }
24}
25
26impl ClojureLexer {
27    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
28        while state.not_at_end() {
29            let start = state.get_position();
30            let safe_point = start;
31
32            match state.peek() {
33                Some(c) if c.is_whitespace() => {
34                    self.lex_whitespace(state);
35                }
36                Some(';') => {
37                    self.lex_comment(state);
38                }
39                Some('"') => {
40                    self.lex_string(state);
41                }
42                Some('\\') => {
43                    self.lex_character(state);
44                }
45                Some(c) if c.is_ascii_digit() => {
46                    self.lex_number(state);
47                }
48                Some(':') => {
49                    self.lex_keyword(state);
50                }
51                Some('#') => {
52                    self.lex_dispatch(state);
53                }
54                Some('(') => {
55                    state.advance(1);
56                    state.add_token(ClojureTokenType::ListStart, start, state.get_position());
57                }
58                Some(')') => {
59                    state.advance(1);
60                    state.add_token(ClojureTokenType::ListEnd, start, state.get_position());
61                }
62                Some('[') => {
63                    state.advance(1);
64                    state.add_token(ClojureTokenType::VectorStart, start, state.get_position());
65                }
66                Some(']') => {
67                    state.advance(1);
68                    state.add_token(ClojureTokenType::VectorEnd, start, state.get_position());
69                }
70                Some('{') => {
71                    state.advance(1);
72                    state.add_token(ClojureTokenType::MapStart, start, state.get_position());
73                }
74                Some('}') => {
75                    state.advance(1);
76                    state.add_token(ClojureTokenType::MapEnd, start, state.get_position());
77                }
78                Some('\'') | Some('`') => {
79                    state.advance(1);
80                    state.add_token(ClojureTokenType::Quote, start, state.get_position());
81                }
82                Some('~') => {
83                    state.advance(1);
84                    if state.peek() == Some('@') {
85                        state.advance(1);
86                        state.add_token(ClojureTokenType::UnquoteSplice, start, state.get_position());
87                    }
88                    else {
89                        state.add_token(ClojureTokenType::Unquote, start, state.get_position());
90                    }
91                }
92                Some('^') => {
93                    state.advance(1);
94                    state.add_token(ClojureTokenType::Meta, start, state.get_position());
95                }
96                Some(_) => {
97                    self.lex_symbol(state);
98                }
99                None => break,
100            }
101
102            state.advance_if_dead_lock(safe_point);
103        }
104        Ok(())
105    }
106}
107
108impl ClojureLexer {
109    fn lex_whitespace<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
110        let start = state.get_position();
111        while let Some(c) = state.peek() {
112            if c.is_whitespace() {
113                state.advance(c.len_utf8());
114            }
115            else {
116                break;
117            }
118        }
119        state.add_token(ClojureTokenType::Whitespace, start, state.get_position());
120    }
121
122    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
123        let start = state.get_position();
124        state.advance(1); // Skip ';'
125
126        while let Some(c) = state.peek() {
127            if c == '\n' {
128                break;
129            }
130            state.advance(c.len_utf8());
131        }
132
133        state.add_token(ClojureTokenType::Comment, start, state.get_position());
134    }
135
136    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
137        let start = state.get_position();
138        state.advance(1); // Skip opening quote
139
140        while let Some(c) = state.peek() {
141            if c == '"' {
142                state.advance(1);
143                break;
144            }
145            else if c == '\\' {
146                state.advance(1); // Skip escape character
147                if let Some(escaped) = state.peek() {
148                    state.advance(escaped.len_utf8()); // Skip escaped character
149                }
150            }
151            else {
152                state.advance(c.len_utf8());
153            }
154        }
155
156        state.add_token(ClojureTokenType::StringLiteral, start, state.get_position());
157    }
158
159    fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
160        let start = state.get_position();
161        state.advance(1); // Skip '\'
162
163        if let Some(c) = state.peek() {
164            state.advance(c.len_utf8());
165        }
166
167        state.add_token(ClojureTokenType::CharacterLiteral, start, state.get_position());
168    }
169
170    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
171        let start = state.get_position();
172
173        while let Some(c) = state.peek() {
174            if c.is_ascii_digit() || c == '.' {
175                state.advance(1);
176            }
177            else {
178                break;
179            }
180        }
181
182        state.add_token(ClojureTokenType::NumberLiteral, start, state.get_position());
183    }
184
185    fn lex_keyword<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
186        let start = state.get_position();
187        state.advance(1); // Skip ':'
188
189        while let Some(c) = state.peek() {
190            if c.is_alphanumeric() || c == '-' || c == '_' || c == '?' || c == '!' {
191                state.advance(c.len_utf8());
192            }
193            else {
194                break;
195            }
196        }
197
198        state.add_token(ClojureTokenType::KeywordLiteral, start, state.get_position());
199    }
200
201    fn lex_dispatch<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
202        let start = state.get_position();
203        state.advance(1); // Skip '#'
204
205        match state.peek() {
206            Some('{') => {
207                state.advance(1);
208                state.add_token(ClojureTokenType::SetStart, start, state.get_position());
209            }
210            Some('(') => {
211                state.advance(1);
212                state.add_token(ClojureTokenType::AnonFnStart, start, state.get_position());
213            }
214            Some('"') => {
215                self.lex_regex(state, start);
216            }
217            _ => {
218                state.add_token(ClojureTokenType::Dispatch, start, state.get_position());
219            }
220        }
221    }
222
223    fn lex_regex<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>, start: usize) {
224        state.advance(1); // Skip '"'
225
226        while let Some(c) = state.peek() {
227            if c == '"' {
228                state.advance(1);
229                break;
230            }
231            else if c == '\\' {
232                state.advance(1); // Skip escape character
233                if let Some(escaped) = state.peek() {
234                    state.advance(escaped.len_utf8()); // Skip escaped character
235                }
236            }
237            else {
238                state.advance(c.len_utf8());
239            }
240        }
241
242        state.add_token(ClojureTokenType::RegexLiteral, start, state.get_position());
243    }
244
245    fn lex_symbol<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
246        let start = state.get_position();
247
248        while let Some(c) = state.peek() {
249            if c.is_alphanumeric() || c == '-' || c == '_' || c == '?' || c == '!' || c == '*' || c == '+' || c == '/' {
250                state.advance(c.len_utf8());
251            }
252            else {
253                break;
254            }
255        }
256
257        state.add_token(ClojureTokenType::Symbol, start, state.get_position());
258    }
259}