panproto_expr_parser/lexer.rs

use logos::Logos;

use crate::token::{Span, Spanned, Token};

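/// Tokenize `input` in two passes: a raw `logos` pass over the source, then a
/// layout pass (`insert_layout`) that turns indentation changes into explicit
/// `Indent`/`Dedent`/`Newline` tokens.
///
/// A minimal usage sketch (not compiled as a doc-test: the module paths
/// `panproto_expr_parser::lexer` and `panproto_expr_parser::token` are
/// assumed from the directory name above):
///
/// ```ignore
/// use panproto_expr_parser::lexer::tokenize;
/// use panproto_expr_parser::token::Token;
///
/// let tokens = tokenize("1 + 2").unwrap();
/// assert_eq!(tokens.first().map(|s| &s.token), Some(&Token::Int(1)));
/// ```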
pub fn tokenize(input: &str) -> Result<Vec<Spanned>, LexError> {
    let raw = raw_tokenize(input)?;
    Ok(insert_layout(input, &raw))
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Byte offset at which the unrecognized text starts.
    pub offset: usize,
    /// The unrecognized slice of input.
    pub text: String,
}

impl std::fmt::Display for LexError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "unrecognized token at byte {}: {:?}",
            self.offset, self.text
        )
    }
}

impl std::error::Error for LexError {}

/// First pass: run the `logos`-derived `Token` lexer and collect spanned
/// tokens, appending a zero-width `Eof` sentinel at end of input.
fn raw_tokenize(input: &str) -> Result<Vec<Spanned>, LexError> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(input);

    while let Some(result) = lexer.next() {
        let span = lexer.span();
        if let Ok(token) = result {
            tokens.push(Spanned {
                token,
                span: Span {
                    start: span.start,
                    end: span.end,
                },
            });
        } else {
            let slice = &input[span.clone()];
            // Line breaks are not lexed as tokens; the layout pass below
            // recomputes them from byte offsets, so skip them here rather
            // than reporting an error.
            if slice.contains('\n') || slice.contains('\r') {
                continue;
            }
            return Err(LexError {
                offset: span.start,
                text: slice.to_string(),
            });
        }
    }

    tokens.push(Spanned {
        token: Token::Eof,
        span: Span {
            start: input.len(),
            end: input.len(),
        },
    });

    Ok(tokens)
}

/// Second pass: convert indentation into explicit layout tokens.
///
/// A stack of indentation columns tracks the open blocks (the bottom entry
/// is column 0). At the first token of each new line:
///
/// * a deeper column immediately after a layout keyword (`let`, `where`,
///   `do`, `of`) opens a block: its column is pushed and `Indent` emitted;
/// * a shallower column closes blocks: columns are popped and `Dedent`
///   emitted until the top of the stack no longer exceeds the new column;
/// * the same column inside an open block separates items: `Newline` is
///   emitted.
///
/// Layout tokens carry zero-width spans at the token that triggered them.
fn insert_layout(input: &str, raw: &[Spanned]) -> Vec<Spanned> {
    if raw.is_empty() {
        return vec![];
    }

    let mut result = Vec::with_capacity(raw.len());
    let mut indent_stack: Vec<usize> = vec![0];
    let mut prev_line = line_of(input, 0);
    let mut prev_end = 0;

    for spanned in raw {
        let cur_line = line_of(input, spanned.span.start);
        let cur_col = col_of(input, spanned.span.start);

        // Layout only applies at the first token of a new line.
        if cur_line > prev_line {
            let current_indent = *indent_stack.last().unwrap_or(&0);

            match cur_col.cmp(&current_indent) {
                std::cmp::Ordering::Greater => {
                    // Deeper indentation opens a block, but only directly
                    // after a layout keyword; otherwise this is a
                    // continuation line and produces no layout token.
                    let prev_is_layout = result.last().is_some_and(|s: &Spanned| {
                        matches!(s.token, Token::Let | Token::Where | Token::Do | Token::Of)
                    });
                    if prev_is_layout {
                        indent_stack.push(cur_col);
                        result.push(Spanned {
                            token: Token::Indent,
                            span: Span {
                                start: spanned.span.start,
                                end: spanned.span.start,
                            },
                        });
                    }
                }
                std::cmp::Ordering::Less => {
                    // Shallower indentation closes every block deeper than
                    // the new column.
                    while indent_stack.len() > 1 && *indent_stack.last().unwrap_or(&0) > cur_col {
                        indent_stack.pop();
                        result.push(Spanned {
                            token: Token::Dedent,
                            span: Span {
                                start: spanned.span.start,
                                end: spanned.span.start,
                            },
                        });
                    }
                }
                std::cmp::Ordering::Equal => {
                    if indent_stack.len() > 1 {
                        // Same column inside an open block: a new item.
                        result.push(Spanned {
                            token: Token::Newline,
                            span: Span {
                                start: spanned.span.start,
                                end: spanned.span.start,
                            },
                        });
                    }
                }
            }
        }

        result.push(spanned.clone());
        prev_line = cur_line;
        prev_end = spanned.span.end;
    }

    // Close any blocks still open at end of input.
    while indent_stack.len() > 1 {
        indent_stack.pop();
        result.push(Spanned {
            token: Token::Dedent,
            span: Span {
                start: prev_end,
                end: prev_end,
            },
        });
    }

    result
}
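
// Worked example: for "let\n x = 1\nin x" the layout pass sees `x` start a
// deeper line right after `let`, so it pushes column 1 and emits Indent;
// `in` then returns to column 0, closing the block with a Dedent. The
// resulting stream is:
//
//   Let Indent Ident("x") Eq Int(1) Dedent In Ident("x") Eof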

/// Zero-based line number of the byte at `offset`, i.e. the number of `'\n'`
/// bytes before it; e.g. `line_of("a\nb", 2) == 1`.
fn line_of(input: &str, offset: usize) -> usize {
    input[..offset].bytes().filter(|&b| b == b'\n').count()
}

/// Zero-based byte column of `offset` within its line (tabs and multi-byte
/// characters count by byte width); e.g. `col_of("a\nb", 2) == 0`.
fn col_of(input: &str, offset: usize) -> usize {
    let line_start = input[..offset].rfind('\n').map_or(0, |pos| pos + 1);
    offset - line_start
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn simple_expression() {
        let tokens = tokenize("1 + 2").unwrap();
        assert_eq!(tokens[0].token, Token::Int(1));
        assert_eq!(tokens[1].token, Token::Plus);
        assert_eq!(tokens[2].token, Token::Int(2));
        assert_eq!(tokens[3].token, Token::Eof);
    }

    #[test]
    fn keywords_recognized() {
        let tokens = tokenize("let x = 1 in x").unwrap();
        assert_eq!(tokens[0].token, Token::Let);
        assert_eq!(tokens[1].token, Token::Ident("x".into()));
        assert_eq!(tokens[2].token, Token::Eq);
        assert_eq!(tokens[3].token, Token::Int(1));
        assert_eq!(tokens[4].token, Token::In);
    }

    #[test]
    fn string_literal() {
        let tokens = tokenize(r#""hello world""#).unwrap();
        assert_eq!(tokens[0].token, Token::Str("hello world".into()));
    }

    #[test]
    fn operators() {
        let tokens = tokenize("a -> b && c || d").unwrap();
        assert_eq!(tokens[0].token, Token::Ident("a".into()));
        assert_eq!(tokens[1].token, Token::Arrow);
        assert_eq!(tokens[2].token, Token::Ident("b".into()));
        assert_eq!(tokens[3].token, Token::AndAnd);
        assert_eq!(tokens[5].token, Token::OrOr);
    }

    #[test]
    fn layout_let_block() {
        let input = "let\n x = 1\n y = 2\nin x";
        let tokens = tokenize(input).unwrap();
        let kinds: Vec<&Token> = tokens.iter().map(|s| &s.token).collect();
        assert!(kinds.contains(&&Token::Indent));
        assert!(kinds.contains(&&Token::Newline));
        assert!(kinds.contains(&&Token::Dedent));
    }
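
    // A fuller check of the layout pass: the exact token stream for the same
    // block, written out as derived from the `insert_layout` rules above.
    #[test]
    fn layout_let_block_exact_sequence() {
        let input = "let\n x = 1\n y = 2\nin x";
        let tokens = tokenize(input).unwrap();
        let kinds: Vec<Token> = tokens.into_iter().map(|s| s.token).collect();
        assert_eq!(
            kinds,
            vec![
                Token::Let,
                Token::Indent,
                Token::Ident("x".into()),
                Token::Eq,
                Token::Int(1),
                Token::Newline,
                Token::Ident("y".into()),
                Token::Eq,
                Token::Int(2),
                Token::Dedent,
                Token::In,
                Token::Ident("x".into()),
                Token::Eof,
            ]
        );
    }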

    #[test]
    fn comprehension_tokens() {
        let tokens = tokenize("[ a | a <- xs, a > 0 ]").unwrap();
        assert_eq!(tokens[0].token, Token::LBracket);
        assert_eq!(tokens[1].token, Token::Ident("a".into()));
        assert_eq!(tokens[2].token, Token::Pipe);
        assert_eq!(tokens[3].token, Token::Ident("a".into()));
        assert_eq!(tokens[4].token, Token::LeftArrow);
    }

    #[test]
    fn comment_skipped() {
        let tokens = tokenize("x -- this is a comment\ny").unwrap();
        let idents: Vec<&str> = tokens
            .iter()
            .filter_map(|s| {
                if let Token::Ident(ref name) = s.token {
                    Some(name.as_str())
                } else {
                    None
                }
            })
            .collect();
        assert_eq!(idents, vec!["x", "y"]);
    }

    #[test]
    fn float_literal() {
        let tokens = tokenize("3.125").unwrap();
        assert!(matches!(tokens[0].token, Token::Float(f) if (f - 3.125).abs() < f64::EPSILON));
    }

    #[test]
    fn hex_literal() {
        let tokens = tokenize("0xFF").unwrap();
        assert_eq!(tokens[0].token, Token::Int(255));
    }

    #[test]
    fn upper_ident() {
        let tokens = tokenize("True Nothing MyType").unwrap();
        assert_eq!(tokens[0].token, Token::True);
        assert_eq!(tokens[1].token, Token::Nothing);
        assert_eq!(tokens[2].token, Token::UpperIdent("MyType".into()));
    }

    #[test]
    fn lambda_tokens() {
        let tokens = tokenize("\\x -> x + 1").unwrap();
        assert_eq!(tokens[0].token, Token::Backslash);
        assert_eq!(tokens[1].token, Token::Ident("x".into()));
        assert_eq!(tokens[2].token, Token::Arrow);
    }

    #[test]
    fn edge_traversal() {
        let tokens = tokenize("doc -> layers -> annotations").unwrap();
        assert_eq!(tokens[0].token, Token::Ident("doc".into()));
        assert_eq!(tokens[1].token, Token::Arrow);
        assert_eq!(tokens[2].token, Token::Ident("layers".into()));
        assert_eq!(tokens[3].token, Token::Arrow);
        assert_eq!(tokens[4].token, Token::Ident("annotations".into()));
    }
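
    // Display output for LexError is fixed by the impl above; check it
    // directly on a hand-built error (the "@" text is just an illustrative
    // unrecognized slice, not taken from the lexer).
    #[test]
    fn lex_error_display() {
        let err = LexError {
            offset: 4,
            text: "@".to_string(),
        };
        assert_eq!(err.to_string(), "unrecognized token at byte 4: \"@\"");
    }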
}