oak_clojure/lexer/mod.rs

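//! Hand-written lexer for Clojure source built on the `oak_core` lexer
//! infrastructure. It produces a flat stream of `ClojureTokenType` tokens,
//! keeping whitespace and comments as explicit tokens.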
pub mod token_type;
pub use token_type::ClojureTokenType;

use crate::ClojureLanguage;
use oak_core::{
    Lexer, LexerCache, LexerState, OakError,
    lexer::LexOutput,
    source::{Source, TextEdit},
};

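/// Lexer for Clojure source text, parameterised over a borrowed
/// [`ClojureLanguage`] configuration.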
#[derive(Clone, Debug)]
pub struct ClojureLexer<'config> {
    _config: &'config ClojureLanguage,
}

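/// Shorthand for the lexer state specialised to [`ClojureLanguage`].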
type State<'a, S> = LexerState<'a, S, ClojureLanguage>;

impl<'config> Lexer<ClojureLanguage> for ClojureLexer<'config> {
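    /// Runs a full lex over `text` and returns the token stream.
    ///
    /// The `_edits` parameter is currently ignored, so every call performs a
    /// complete relex rather than an incremental update.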
    fn lex<'a, S: Source + ?Sized>(
        &self,
        text: &S,
        _edits: &[TextEdit],
        cache: &'a mut impl LexerCache<ClojureLanguage>,
    ) -> LexOutput<ClojureLanguage> {
        let mut state = State::new(text);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}

impl<'config> ClojureLexer<'config> {
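    /// Creates a lexer for the given language configuration.
    ///
    /// The configuration is stored but not consulted yet.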
    pub fn new(config: &'config ClojureLanguage) -> Self {
        Self { _config: config }
    }
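
    /// Main dispatch loop: inspects the next character and hands off to the
    /// matching `lex_*` helper, emitting single-character delimiter, quote,
    /// unquote, and meta tokens inline. `advance_if_dead_lock` guards against
    /// an iteration that consumes no input.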
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let start = state.get_position();
            let safe_point = start;

            match state.peek() {
                Some(c) if c.is_whitespace() => {
                    self.lex_whitespace(state);
                }
                Some(';') => {
                    self.lex_comment(state);
                }
                Some('"') => {
                    self.lex_string(state);
                }
                Some('\\') => {
                    self.lex_character(state);
                }
                Some(c) if c.is_ascii_digit() => {
                    self.lex_number(state);
                }
                Some(':') => {
                    self.lex_keyword(state);
                }
                Some('#') => {
                    self.lex_dispatch(state);
                }
                Some('(') => {
                    state.advance(1);
                    state.add_token(ClojureTokenType::ListStart, start, state.get_position());
                }
                Some(')') => {
                    state.advance(1);
                    state.add_token(ClojureTokenType::ListEnd, start, state.get_position());
                }
                Some('[') => {
                    state.advance(1);
                    state.add_token(ClojureTokenType::VectorStart, start, state.get_position());
                }
                Some(']') => {
                    state.advance(1);
                    state.add_token(ClojureTokenType::VectorEnd, start, state.get_position());
                }
                Some('{') => {
                    state.advance(1);
                    state.add_token(ClojureTokenType::MapStart, start, state.get_position());
                }
                Some('}') => {
                    state.advance(1);
                    state.add_token(ClojureTokenType::MapEnd, start, state.get_position());
                }
                Some('\'') | Some('`') => {
                    state.advance(1);
                    state.add_token(ClojureTokenType::Quote, start, state.get_position());
                }
                Some('~') => {
                    state.advance(1);
                    if state.peek() == Some('@') {
                        state.advance(1);
                        state.add_token(ClojureTokenType::UnquoteSplice, start, state.get_position());
                    }
                    else {
                        state.add_token(ClojureTokenType::Unquote, start, state.get_position());
                    }
                }
                Some('^') => {
                    state.advance(1);
                    state.add_token(ClojureTokenType::Meta, start, state.get_position());
                }
                Some(_) => {
                    self.lex_symbol(state);
                }
                None => break,
            }

            state.advance_if_dead_lock(safe_point);
        }
        Ok(())
    }
}

impl<'config> ClojureLexer<'config> {
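    /// Consumes a run of whitespace characters into a single `Whitespace` token.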
    fn lex_whitespace<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
        let start = state.get_position();
        while let Some(c) = state.peek() {
            if c.is_whitespace() {
                state.advance(c.len_utf8());
            }
            else {
                break;
            }
        }
        state.add_token(ClojureTokenType::Whitespace, start, state.get_position());
    }

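    /// Consumes a line comment: everything from `;` up to (but not including)
    /// the next newline.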
    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
        let start = state.get_position();
        state.advance(1); // Skip ';'

        while let Some(c) = state.peek() {
            if c == '\n' {
                break;
            }
            state.advance(c.len_utf8());
        }

        state.add_token(ClojureTokenType::Comment, start, state.get_position());
    }

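    /// Consumes a double-quoted string literal, honouring backslash escapes.
    /// An unterminated string simply runs to the end of the input.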
    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
        let start = state.get_position();
        state.advance(1); // Skip opening quote

        while let Some(c) = state.peek() {
            if c == '"' {
                state.advance(1);
                break;
            }
            else if c == '\\' {
                state.advance(1); // Skip escape character
                if let Some(escaped) = state.peek() {
                    state.advance(escaped.len_utf8()); // Skip escaped character
                }
            }
            else {
                state.advance(c.len_utf8());
            }
        }

        state.add_token(ClojureTokenType::StringLiteral, start, state.get_position());
    }

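    /// Consumes a character literal. Only the single character after `\` is
    /// taken, so named literals such as `\newline` are not recognised in full.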
    fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
        let start = state.get_position();
        state.advance(1); // Skip '\'

        if let Some(c) = state.peek() {
            state.advance(c.len_utf8());
        }

        state.add_token(ClojureTokenType::CharacterLiteral, start, state.get_position());
    }

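    /// Consumes a run of ASCII digits and `.` characters as a number literal.
    /// Signs, exponents, radix prefixes, and ratios are not handled here.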
    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
        let start = state.get_position();

        while let Some(c) = state.peek() {
            if c.is_ascii_digit() || c == '.' {
                state.advance(1);
            }
            else {
                break;
            }
        }

        state.add_token(ClojureTokenType::NumberLiteral, start, state.get_position());
    }

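    /// Consumes a `:keyword`, accepting alphanumerics plus `-`, `_`, `?`, and `!`
    /// after the leading colon.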
    fn lex_keyword<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
        let start = state.get_position();
        state.advance(1); // Skip ':'

        while let Some(c) = state.peek() {
            if c.is_alphanumeric() || c == '-' || c == '_' || c == '?' || c == '!' {
                state.advance(c.len_utf8());
            }
            else {
                break;
            }
        }

        state.add_token(ClojureTokenType::KeywordLiteral, start, state.get_position());
    }

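    /// Handles the `#` dispatch character: `#{` starts a set, `#(` starts an
    /// anonymous function, `#"` starts a regex, and anything else is emitted as
    /// a bare `Dispatch` token.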
    fn lex_dispatch<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
        let start = state.get_position();
        state.advance(1); // Skip '#'

        match state.peek() {
            Some('{') => {
                state.advance(1);
                state.add_token(ClojureTokenType::SetStart, start, state.get_position());
            }
            Some('(') => {
                state.advance(1);
                state.add_token(ClojureTokenType::AnonFnStart, start, state.get_position());
            }
            Some('"') => {
                self.lex_regex(state, start);
            }
            _ => {
                state.add_token(ClojureTokenType::Dispatch, start, state.get_position());
            }
        }
    }

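    /// Consumes the body of a `#"..."` regex literal; `start` points at the `#`,
    /// so the token covers the whole dispatch form. Escapes are handled as in
    /// `lex_string`.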
    fn lex_regex<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>, start: usize) {
        state.advance(1); // Skip '"'

        while let Some(c) = state.peek() {
            if c == '"' {
                state.advance(1);
                break;
            }
            else if c == '\\' {
                state.advance(1); // Skip escape character
                if let Some(escaped) = state.peek() {
                    state.advance(escaped.len_utf8()); // Skip escaped character
                }
            }
            else {
                state.advance(c.len_utf8());
            }
        }

        state.add_token(ClojureTokenType::RegexLiteral, start, state.get_position());
    }

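    /// Fallback for anything else: consumes alphanumerics and a small set of
    /// symbol punctuation (`-`, `_`, `?`, `!`, `*`, `+`, `/`) as a `Symbol` token.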
    fn lex_symbol<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
        let start = state.get_position();

        while let Some(c) = state.peek() {
            if c.is_alphanumeric() || c == '-' || c == '_' || c == '?' || c == '!' || c == '*' || c == '+' || c == '/' {
                state.advance(c.len_utf8());
            }
            else {
                break;
            }
        }

        state.add_token(ClojureTokenType::Symbol, start, state.get_position());
    }
}