// oak_clojure/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3pub use token_type::ClojureTokenType;
4
5use crate::ClojureLanguage;
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::LexOutput,
9    source::{Source, TextEdit},
10};
11
/// Hand-written lexer for Clojure source text.
///
/// Holds a reference to the [`ClojureLanguage`] configuration. The
/// configuration is not read yet (hence the `_` prefix) but is kept so
/// lexing can become configurable later without an API break.
#[derive(Clone, Debug)]
pub struct ClojureLexer<'config> {
    _config: &'config ClojureLanguage,
}
16
17type State<'a, S> = LexerState<'a, S, ClojureLanguage>;
18
19impl<'config> Lexer<ClojureLanguage> for ClojureLexer<'config> {
20    fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<ClojureLanguage>) -> LexOutput<ClojureLanguage> {
21        let mut state = State::new(text);
22        let result = self.run(&mut state);
23        if result.is_ok() {
24            state.add_eof()
25        }
26        state.finish_with_cache(result, cache)
27    }
28}
29
30impl<'config> ClojureLexer<'config> {
31    pub fn new(config: &'config ClojureLanguage) -> Self {
32        Self { _config: config }
33    }
34    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
35        while state.not_at_end() {
36            let start = state.get_position();
37            let safe_point = start;
38
39            match state.peek() {
40                Some(c) if c.is_whitespace() => self.lex_whitespace(state),
41                Some(';') => self.lex_comment(state),
42                Some('"') => self.lex_string(state),
43                Some('\\') => self.lex_character(state),
44                Some(c) if c.is_ascii_digit() => self.lex_number(state),
45                Some(':') => self.lex_keyword(state),
46                Some('#') => self.lex_dispatch(state),
47                Some('(') => {
48                    state.advance(1);
49                    state.add_token(ClojureTokenType::ListStart, start, state.get_position())
50                }
51                Some(')') => {
52                    state.advance(1);
53                    state.add_token(ClojureTokenType::ListEnd, start, state.get_position())
54                }
55                Some('[') => {
56                    state.advance(1);
57                    state.add_token(ClojureTokenType::VectorStart, start, state.get_position())
58                }
59                Some(']') => {
60                    state.advance(1);
61                    state.add_token(ClojureTokenType::VectorEnd, start, state.get_position())
62                }
63                Some('{') => {
64                    state.advance(1);
65                    state.add_token(ClojureTokenType::MapStart, start, state.get_position())
66                }
67                Some('}') => {
68                    state.advance(1);
69                    state.add_token(ClojureTokenType::MapEnd, start, state.get_position())
70                }
71                Some('\'') | Some('`') => {
72                    state.advance(1);
73                    state.add_token(ClojureTokenType::Quote, start, state.get_position())
74                }
75                Some('~') => {
76                    state.advance(1);
77                    if state.peek() == Some('↯') {
78                        state.advance(1);
79                        state.add_token(ClojureTokenType::UnquoteSplice, start, state.get_position())
80                    }
81                    else {
82                        state.add_token(ClojureTokenType::Unquote, start, state.get_position())
83                    }
84                }
85                Some('^') => {
86                    state.advance(1);
87                    state.add_token(ClojureTokenType::Meta, start, state.get_position())
88                }
89                Some(_) => self.lex_symbol(state),
90                None => break,
91            }
92
93            state.advance_if_dead_lock(safe_point)
94        }
95        Ok(())
96    }
97}
98
99impl<'config> ClojureLexer<'config> {
100    fn lex_whitespace<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
101        let start = state.get_position();
102        while let Some(c) = state.peek() {
103            if c.is_whitespace() { state.advance(c.len_utf8()) } else { break }
104        }
105        state.add_token(ClojureTokenType::Whitespace, start, state.get_position())
106    }
107
108    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
109        let start = state.get_position();
110        state.advance(1); // Skip ';'
111
112        while let Some(c) = state.peek() {
113            if c == '\n' {
114                break;
115            }
116            state.advance(c.len_utf8())
117        }
118
119        state.add_token(ClojureTokenType::Comment, start, state.get_position())
120    }
121
122    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
123        let start = state.get_position();
124        state.advance(1); // Skip opening quote
125
126        while let Some(c) = state.peek() {
127            if c == '"' {
128                state.advance(1);
129                break;
130            }
131            else if c == '\\' {
132                state.advance(1); // Skip escape character
133                if let Some(escaped) = state.peek() {
134                    state.advance(escaped.len_utf8()); // Skip escaped character
135                }
136            }
137            else {
138                state.advance(c.len_utf8())
139            }
140        }
141
142        state.add_token(ClojureTokenType::StringLiteral, start, state.get_position())
143    }
144
145    fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
146        let start = state.get_position();
147        state.advance(1); // Skip '\'
148
149        if let Some(c) = state.peek() {
150            state.advance(c.len_utf8())
151        }
152
153        state.add_token(ClojureTokenType::CharacterLiteral, start, state.get_position())
154    }
155
156    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
157        let start = state.get_position();
158
159        while let Some(c) = state.peek() {
160            if c.is_ascii_digit() || c == '.' { state.advance(1) } else { break }
161        }
162
163        state.add_token(ClojureTokenType::NumberLiteral, start, state.get_position())
164    }
165
166    fn lex_keyword<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
167        let start = state.get_position();
168        state.advance(1); // Skip ':'
169
170        while let Some(c) = state.peek() {
171            if c.is_alphanumeric() || c == '-' || c == '_' || c == '?' || c == '!' { state.advance(c.len_utf8()) } else { break }
172        }
173
174        state.add_token(ClojureTokenType::KeywordLiteral, start, state.get_position())
175    }
176
177    fn lex_dispatch<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
178        let start = state.get_position();
179        state.advance(1); // Skip '#'
180
181        match state.peek() {
182            Some('{') => {
183                state.advance(1);
184                state.add_token(ClojureTokenType::SetStart, start, state.get_position())
185            }
186            Some('(') => {
187                state.advance(1);
188                state.add_token(ClojureTokenType::AnonFnStart, start, state.get_position())
189            }
190            Some('"') => self.lex_regex(state, start),
191            _ => state.add_token(ClojureTokenType::Dispatch, start, state.get_position()),
192        }
193    }
194
195    fn lex_regex<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>, start: usize) {
196        state.advance(1); // Skip '"'
197
198        while let Some(c) = state.peek() {
199            if c == '"' {
200                state.advance(1);
201                break;
202            }
203            else if c == '\\' {
204                state.advance(1); // Skip escape character
205                if let Some(escaped) = state.peek() {
206                    state.advance(escaped.len_utf8()); // Skip escaped character
207                }
208            }
209            else {
210                state.advance(c.len_utf8())
211            }
212        }
213
214        state.add_token(ClojureTokenType::RegexLiteral, start, state.get_position())
215    }
216
217    fn lex_symbol<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ClojureLanguage>) {
218        let start = state.get_position();
219
220        while let Some(c) = state.peek() {
221            if c.is_alphanumeric() || c == '-' || c == '_' || c == '?' || c == '!' || c == '*' || c == '+' || c == '/' { state.advance(c.len_utf8()) } else { break }
222        }
223
224        state.add_token(ClojureTokenType::Symbol, start, state.get_position())
225    }
226}