Skip to main content

oak_elm/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2
3use oak_core::Source;
4/// Token types for Elm.
5pub mod token_type;
6
7use crate::{language::ElmLanguage, lexer::token_type::ElmTokenType};
8use oak_core::{
9    Lexer, LexerCache, LexerState,
10    errors::OakError,
11    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
12};
13use std::sync::LazyLock;
14
15type State<'s, S> = LexerState<'s, S, ElmLanguage>;
16
17static ELM_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
18static ELM_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "--", block_start: "{-", block_end: "-}", nested_blocks: true });
19static ELM_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
20static ELM_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
21
22/// A lexer for Elm source files.
23#[derive(Clone, Debug)]
24pub struct ElmLexer<'config> {
25    config: &'config ElmLanguage,
26}
27
28impl<'config> Lexer<ElmLanguage> for ElmLexer<'config> {
29    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<ElmLanguage>) -> LexOutput<ElmLanguage> {
30        let mut state = State::new_with_cache(source, 0, cache);
31        let result = self.run(&mut state);
32        if result.is_ok() {
33            state.add_eof();
34        }
35        state.finish_with_cache(result, cache)
36    }
37}
38
39impl<'config> ElmLexer<'config> {
40    /// Creates a new instance of the Elm lexer.
41    pub fn new(config: &'config ElmLanguage) -> Self {
42        Self { config }
43    }
44
45    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
46        while state.not_at_end() {
47            let safe_point = state.get_position();
48
49            if self.skip_whitespace(state) {
50                continue;
51            }
52
53            if self.skip_comment(state) {
54                continue;
55            }
56
57            if self.lex_string_literal(state) {
58                continue;
59            }
60
61            if self.lex_char_literal(state) {
62                continue;
63            }
64
65            if self.lex_number_literal(state) {
66                continue;
67            }
68
69            if self.lex_identifier_or_keyword(state) {
70                continue;
71            }
72
73            if self.lex_operators(state) {
74                continue;
75            }
76
77            state.advance_if_dead_lock(safe_point);
78        }
79
80        Ok(())
81    }
82
83    /// Skips whitespace characters.
84    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
85        ELM_WHITESPACE.scan(state, ElmTokenType::Whitespace)
86    }
87
88    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89        ELM_COMMENT.scan(state, ElmTokenType::Comment, ElmTokenType::Comment)
90    }
91
92    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
93        ELM_STRING.scan(state, ElmTokenType::String)
94    }
95
96    fn lex_char_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
97        ELM_CHAR.scan(state, ElmTokenType::Char)
98    }
99
100    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
101        let start = state.get_position();
102        let first = match state.peek() {
103            Some(c) => c,
104            None => return false,
105        };
106        if !first.is_ascii_digit() {
107            return false;
108        }
109        let mut is_float = false;
110
111        state.advance(1);
112        state.take_while(|c| c.is_ascii_digit() || c == '_');
113
114        // fractional part
115        if state.peek() == Some('.') {
116            let n1 = state.peek_next_n(1);
117            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
118                is_float = true;
119                state.advance(1); // consume '.'
120                state.take_while(|c| c.is_ascii_digit() || c == '_');
121            }
122        }
123        // exponent
124        if let Some(c) = state.peek() {
125            if c == 'e' || c == 'E' {
126                let n1 = state.peek_next_n(1);
127                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
128                    is_float = true;
129                    state.advance(1);
130                    if let Some(sign) = state.peek() {
131                        if sign == '+' || sign == '-' {
132                            state.advance(1);
133                        }
134                    }
135                    state.take_while(|d| d.is_ascii_digit() || d == '_');
136                }
137            }
138        }
139
140        let end = state.get_position();
141        state.add_token(if is_float { ElmTokenType::Float } else { ElmTokenType::Number }, start, end);
142        true
143    }
144
145    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
146        let start = state.get_position();
147
148        if let Some(ch) = state.peek() {
149            if ch.is_alphabetic() || ch == '_' {
150                state.advance(ch.len_utf8());
151                state.take_while(|next_ch| next_ch.is_alphanumeric() || next_ch == '_');
152
153                let text = state.get_text_in((start..state.get_position()).into());
154                let kind = match text.as_ref() {
155                    "if" => ElmTokenType::If,
156                    "then" => ElmTokenType::Then,
157                    "else" => ElmTokenType::Else,
158                    "case" => ElmTokenType::Case,
159                    "of" => ElmTokenType::Of,
160                    "let" => ElmTokenType::Let,
161                    "in" => ElmTokenType::In,
162                    "type" => ElmTokenType::Type,
163                    "alias" => ElmTokenType::Alias,
164                    "module" => ElmTokenType::Module,
165                    "where" => ElmTokenType::Where,
166                    "import" => ElmTokenType::Import,
167                    "exposing" => ElmTokenType::Exposing,
168                    "as" => ElmTokenType::As,
169                    "port" => ElmTokenType::Port,
170                    _ => ElmTokenType::Identifier,
171                };
172
173                state.add_token(kind, start, state.get_position());
174                return true;
175            }
176        }
177        false
178    }
179
180    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
181        let start = state.get_position();
182
183        // Multi-character operators
184        let ops = [
185            ("==", ElmTokenType::EqualEqual),
186            ("/=", ElmTokenType::NotEqual),
187            ("<=", ElmTokenType::LessEqual),
188            (">=", ElmTokenType::GreaterEqual),
189            ("&&", ElmTokenType::DoubleAmpersand),
190            ("||", ElmTokenType::DoublePipe),
191            ("++", ElmTokenType::DoublePlus),
192            ("<<", ElmTokenType::DoubleLess),
193            (">>", ElmTokenType::DoubleGreater),
194            ("|>", ElmTokenType::PipeGreater),
195            ("->", ElmTokenType::Arrow),
196            ("...", ElmTokenType::TripleDot),
197            ("..", ElmTokenType::DoubleDot),
198            ("//", ElmTokenType::DoubleSlash),
199        ];
200
201        for (pattern, kind) in ops {
202            if state.consume_if_starts_with(pattern) {
203                state.add_token(kind, start, state.get_position());
204                return true;
205            }
206        }
207
208        // Single-character operators
209        if let Some(ch) = state.peek() {
210            let kind = match ch {
211                '+' => ElmTokenType::Plus,
212                '-' => ElmTokenType::Minus,
213                '*' => ElmTokenType::Star,
214                '/' => ElmTokenType::Slash,
215                '=' => ElmTokenType::Equal,
216                '<' => ElmTokenType::Less,
217                '>' => ElmTokenType::Greater,
218                '^' => ElmTokenType::Caret,
219                '|' => ElmTokenType::Pipe,
220                '(' => ElmTokenType::LeftParen,
221                ')' => ElmTokenType::RightParen,
222                '{' => ElmTokenType::LeftBrace,
223                '}' => ElmTokenType::RightBrace,
224                '[' => ElmTokenType::LeftBracket,
225                ']' => ElmTokenType::RightBracket,
226                ',' => ElmTokenType::Comma,
227                ';' => ElmTokenType::Semicolon,
228                '.' => ElmTokenType::Dot,
229                ':' => ElmTokenType::Colon,
230                '\\' => ElmTokenType::Backslash,
231                '%' => ElmTokenType::Percent,
232                '\n' => ElmTokenType::Newline,
233                _ => return false,
234            };
235
236            state.advance(ch.len_utf8());
237            state.add_token(kind, start, state.get_position());
238            return true;
239        }
240
241        false
242    }
243}