1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5use crate::{language::DHallLanguage, lexer::token_type::DHallTokenType};
6use oak_core::{
7 LexOutput, Lexer, LexerCache, LexerState, OakError,
8 lexer::{CommentConfig, StringConfig, WhitespaceConfig},
9 source::{Source, TextEdit},
10};
11use std::sync::LazyLock;
12
13static DHALL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
14static DHALL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "--", block_start: "{-", block_end: "-}", nested_blocks: true });
15static DHALL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
16
17#[derive(Clone)]
19pub struct DHallLexer<'config> {
20 config: &'config DHallLanguage,
21}
22
23impl<'config> Lexer<DHallLanguage> for DHallLexer<'config> {
24 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<DHallLanguage>) -> LexOutput<DHallLanguage> {
25 let mut state = LexerState::new(source);
26 let result = self.run(&mut state);
27 state.finish_with_cache(result, cache)
28 }
29}
30
31impl<'config> DHallLexer<'config> {
32 pub fn new(config: &'config DHallLanguage) -> Self {
34 Self { config }
35 }
36
37 fn run<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, DHallLanguage>) -> Result<(), OakError> {
38 while state.not_at_end() {
39 let safe_point = state.get_position();
40 if self.skip_whitespace(state) {
41 continue;
42 };
43
44 if self.skip_comment(state) {
45 continue;
46 }
47
48 if self.lex_string_literal(state) {
49 continue;
50 }
51
52 if self.lex_number_literal(state) {
53 continue;
54 }
55
56 if self.lex_identifier_or_keyword(state) {
57 continue;
58 }
59
60 if self.lex_operators(state) {
61 continue;
62 }
63
64 if self.lex_single_char_tokens(state) {
65 continue;
66 }
67
68 state.advance_if_dead_lock(safe_point)
69 }
70
71 Ok(())
72 }
73
74 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, DHallLanguage>) -> bool {
75 DHALL_WHITESPACE.scan(state, DHallTokenType::Whitespace)
76 }
77
78 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, DHallLanguage>) -> bool {
79 DHALL_COMMENT.scan(state, DHallTokenType::Comment, DHallTokenType::Comment)
80 }
81
82 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, DHallLanguage>) -> bool {
83 DHALL_STRING.scan(state, DHallTokenType::String)
84 }
85
86 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, DHallLanguage>) -> bool {
87 let start = state.get_position();
88 let first = match state.peek() {
89 Some(c) => c,
90 None => return false,
91 };
92
93 if !first.is_ascii_digit() {
94 return false;
95 }
96
97 state.advance(1);
98 while let Some(c) = state.peek() {
99 if c.is_ascii_digit() { state.advance(1) } else { break }
100 }
101
102 state.add_token(DHallTokenType::Number, start, state.get_position());
103 true
104 }
105
106 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, DHallLanguage>) -> bool {
107 let start = state.get_position();
108 let first = match state.peek() {
109 Some(c) => c,
110 None => return false,
111 };
112
113 if !first.is_alphabetic() && first != '_' && first != 'λ' {
114 return false;
115 }
116
117 state.advance(1);
118 while let Some(c) = state.peek() {
119 if c.is_alphanumeric() || c == '_' || c == '-' || c == '/' { state.advance(1) } else { break }
120 }
121
122 let end = state.get_position();
123 let text = state.get_text_in((start..end).into());
124
125 let kind = match text.as_ref() {
126 "if" => DHallTokenType::If,
127 "then" => DHallTokenType::Then,
128 "else" => DHallTokenType::Else,
129 "let" => DHallTokenType::Let,
130 "in" => DHallTokenType::In,
131 "using" => DHallTokenType::Using,
132 "as" => DHallTokenType::As,
133 "merge" => DHallTokenType::Merge,
134 "Some" => DHallTokenType::Some,
135 "None" => DHallTokenType::None,
136 "with" => DHallTokenType::With,
137 "forall" => DHallTokenType::Forall,
138 "assert" => DHallTokenType::Assert,
139 "Bool" => DHallTokenType::Bool,
140 "Natural" => DHallTokenType::Natural,
141 "Integer" => DHallTokenType::Integer,
142 "Double" => DHallTokenType::Double,
143 "Text" => DHallTokenType::Text,
144 "List" => DHallTokenType::List,
145 "Optional" => DHallTokenType::Optional,
146 "True" => DHallTokenType::True,
147 "False" => DHallTokenType::False,
148 "λ" => DHallTokenType::Lambda,
149 _ => DHallTokenType::Identifier,
150 };
151
152 state.add_token(kind, start, end);
153 true
154 }
155
156 fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, DHallLanguage>) -> bool {
157 let start = state.get_position();
158 let text = state.rest();
159
160 let ops = [
161 ("->", DHallTokenType::Arrow),
162 ("→", DHallTokenType::Arrow),
163 ("=>", DHallTokenType::FatArrow),
164 ("==", DHallTokenType::EqualEqual),
165 ("≡", DHallTokenType::EqualEqual),
166 ("!=", DHallTokenType::NotEqual),
167 ("&&", DHallTokenType::And),
168 ("∧", DHallTokenType::And),
169 ("||", DHallTokenType::Or),
170 ("∨", DHallTokenType::Or),
171 ("++", DHallTokenType::Append),
172 ("//", DHallTokenType::Combine),
173 ("⫽", DHallTokenType::Combine),
174 ("/\\", DHallTokenType::CombineTypes),
175 ("⩓", DHallTokenType::CombineTypes),
176 ("//\\", DHallTokenType::Prefer),
177 ("∀", DHallTokenType::Forall),
178 ("λ", DHallTokenType::Lambda),
179 ];
180
181 for (op, kind) in ops {
182 if text.starts_with(op) {
183 state.advance(op.len());
184 state.add_token(kind, start, state.get_position());
185 return true;
186 }
187 }
188
189 false
190 }
191
192 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, DHallLanguage>) -> bool {
193 let start = state.get_position();
194 let c = match state.peek() {
195 Some(c) => c,
196 None => return false,
197 };
198
199 let kind = match c {
200 '(' => DHallTokenType::LeftParen,
201 ')' => DHallTokenType::RightParen,
202 '[' => DHallTokenType::LeftBracket,
203 ']' => DHallTokenType::RightBracket,
204 '{' => DHallTokenType::LeftBrace,
205 '}' => DHallTokenType::RightBrace,
206 '<' => DHallTokenType::Less,
207 '>' => DHallTokenType::Greater,
208 ',' => DHallTokenType::Comma,
209 '.' => DHallTokenType::Dot,
210 ':' => DHallTokenType::Colon,
211 ';' => DHallTokenType::Semicolon,
212 '=' => DHallTokenType::Equal,
213 '@' => DHallTokenType::At,
214 '#' => DHallTokenType::Hash,
215 '?' => DHallTokenType::Question,
216 '+' => DHallTokenType::Plus,
217 '*' => DHallTokenType::Star,
218 '/' => DHallTokenType::Slash,
219 '|' => DHallTokenType::Pipe,
220 '\\' => DHallTokenType::Lambda,
221 _ => return false,
222 };
223
224 state.advance(1);
225 state.add_token(kind, start, state.get_position());
226 true
227 }
228}