// panproto_expr_parser/lexer.rs

//! Lexer producing a sequence of spanned tokens from source text.
//!
//! Uses logos for fast regex-based tokenization, then applies a layout
//! insertion pass to convert indentation into explicit `Indent`/`Dedent`/
//! `Newline` tokens (the GHC approach).

use logos::Logos;

use crate::token::{Span, Spanned, Token};

/// Tokenize source text into a sequence of spanned tokens.
///
/// This performs two passes:
/// 1. Raw tokenization via logos (skips whitespace within lines).
/// 2. Layout insertion (converts indentation to virtual tokens).
///
/// # Errors
///
/// Returns an error if the input contains an unrecognized token.
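///
/// # Examples
///
/// A minimal sketch; the import paths assume this crate is named
/// `panproto_expr_parser` and exposes the `lexer` and `token` modules
/// publicly.
///
/// ```
/// use panproto_expr_parser::lexer::tokenize;
/// use panproto_expr_parser::token::Token;
///
/// let tokens = tokenize("1 + 2").expect("only recognized tokens");
/// // The stream always ends with a zero-width `Eof` token.
/// assert_eq!(tokens.last().map(|s| &s.token), Some(&Token::Eof));
/// ```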
pub fn tokenize(input: &str) -> Result<Vec<Spanned>, LexError> {
    let raw = raw_tokenize(input)?;
    Ok(insert_layout(input, &raw))
}

/// A lexer error with source location.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Byte offset of the unrecognized token.
    pub offset: usize,
    /// The problematic character(s).
    pub text: String,
}

impl std::fmt::Display for LexError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "unrecognized token at byte {}: {:?}",
            self.offset, self.text
        )
    }
}

impl std::error::Error for LexError {}

/// Raw tokenization via logos (no layout insertion).
fn raw_tokenize(input: &str) -> Result<Vec<Spanned>, LexError> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(input);

    while let Some(result) = lexer.next() {
        let span = lexer.span();
        if let Ok(token) = result {
            tokens.push(Spanned {
                token,
                span: Span {
                    start: span.start,
                    end: span.end,
                },
            });
        } else {
            // The token rules skip whitespace within a line but not line
            // breaks, so newlines surface here as lexer errors. Skip them:
            // the layout pass recovers line structure from byte offsets.
            let slice = &input[span.clone()];
            if slice.contains('\n') || slice.contains('\r') {
                continue;
            }
            return Err(LexError {
                offset: span.start,
                text: slice.to_string(),
            });
        }
    }

    tokens.push(Spanned {
        token: Token::Eof,
        span: Span {
            start: input.len(),
            end: input.len(),
        },
    });

    Ok(tokens)
}

/// Layout insertion pass (GHC-style).
///
/// Scans the raw token stream and the original source text. When a layout
/// keyword (`let`, `where`, `do`, `of`) is followed by a newline and
/// increased indentation, inserts `Indent`. When indentation decreases,
/// inserts `Dedent`. At the same indentation, inserts `Newline` to
/// separate declarations.
///
/// If the layout keyword is followed by `{`, layout is suppressed
/// (explicit delimiters).
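///
/// For example (mirroring the `layout_let_block` test below), the input
///
/// ```text
/// let
///   x = 1
///   y = 2
/// in x
/// ```
///
/// lexes to: `Let`, `Indent`, `Ident(x)`, `Eq`, `Int(1)`, `Newline`,
/// `Ident(y)`, `Eq`, `Int(2)`, `Dedent`, `In`, `Ident(x)`, `Eof`.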
fn insert_layout(input: &str, raw: &[Spanned]) -> Vec<Spanned> {
    if raw.is_empty() {
        return vec![];
    }

    let mut result = Vec::with_capacity(raw.len());
    let mut indent_stack: Vec<usize> = vec![0]; // column stack
    let mut prev_line = 0; // offset 0 is always on line 0
    let mut prev_end = 0;

    for spanned in raw {
        let cur_line = line_of(input, spanned.span.start);
        let cur_col = col_of(input, spanned.span.start);

        // If we moved to a new line, check indentation.
        if cur_line > prev_line {
            let current_indent = *indent_stack.last().unwrap_or(&0);

            match cur_col.cmp(&current_indent) {
                std::cmp::Ordering::Greater => {
                    // Open a block only if the previous token was a layout keyword.
                    let prev_is_layout = result.last().is_some_and(|s: &Spanned| {
                        matches!(s.token, Token::Let | Token::Where | Token::Do | Token::Of)
                    });
                    if prev_is_layout {
                        indent_stack.push(cur_col);
                        result.push(Spanned {
                            token: Token::Indent,
                            span: Span {
                                start: spanned.span.start,
                                end: spanned.span.start,
                            },
                        });
                    }
                }
                std::cmp::Ordering::Less => {
                    // Dedent: pop the indent stack until we match or go below.
                    while indent_stack.len() > 1 && *indent_stack.last().unwrap_or(&0) > cur_col {
                        indent_stack.pop();
                        result.push(Spanned {
                            token: Token::Dedent,
                            span: Span {
                                start: spanned.span.start,
                                end: spanned.span.start,
                            },
                        });
                    }
                }
                std::cmp::Ordering::Equal => {
                    // Same indentation: insert a Newline separator, but only
                    // inside a layout block (indent_stack.len() > 1).
                    if indent_stack.len() > 1 {
                        result.push(Spanned {
                            token: Token::Newline,
                            span: Span {
                                start: spanned.span.start,
                                end: spanned.span.start,
                            },
                        });
                    }
                }
            }
        }

        result.push(spanned.clone());
        prev_line = cur_line;
        prev_end = spanned.span.end;
    }

    // Close any layout blocks still open at end of input.
    while indent_stack.len() > 1 {
        indent_stack.pop();
        result.push(Spanned {
            token: Token::Dedent,
            span: Span {
                start: prev_end,
                end: prev_end,
            },
        });
    }

    result
}

/// Get the 0-indexed line number for a byte offset.
fn line_of(input: &str, offset: usize) -> usize {
    input[..offset].bytes().filter(|&b| b == b'\n').count()
}

/// Get the 0-indexed column (byte offset from start of line) for a byte offset.
fn col_of(input: &str, offset: usize) -> usize {
    let line_start = input[..offset].rfind('\n').map_or(0, |pos| pos + 1);
    offset - line_start
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn simple_expression() {
        let tokens = tokenize("1 + 2").unwrap();
        assert_eq!(tokens[0].token, Token::Int(1));
        assert_eq!(tokens[1].token, Token::Plus);
        assert_eq!(tokens[2].token, Token::Int(2));
        assert_eq!(tokens[3].token, Token::Eof);
    }

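    // Added check (not in the original suite): lexing should report an
    // error for input no rule matches. Assumes `@` is outside the token set.
    #[test]
    fn unrecognized_token_is_error() {
        assert!(tokenize("@").is_err());
    }
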
    #[test]
    fn keywords_recognized() {
        let tokens = tokenize("let x = 1 in x").unwrap();
        assert_eq!(tokens[0].token, Token::Let);
        assert_eq!(tokens[1].token, Token::Ident("x".into()));
        assert_eq!(tokens[2].token, Token::Eq);
        assert_eq!(tokens[3].token, Token::Int(1));
        assert_eq!(tokens[4].token, Token::In);
    }

    #[test]
    fn string_literal() {
        let tokens = tokenize(r#""hello world""#).unwrap();
        assert_eq!(tokens[0].token, Token::Str("hello world".into()));
    }

    #[test]
    fn operators() {
        let tokens = tokenize("a -> b && c || d").unwrap();
        assert_eq!(tokens[0].token, Token::Ident("a".into()));
        assert_eq!(tokens[1].token, Token::Arrow);
        assert_eq!(tokens[2].token, Token::Ident("b".into()));
        assert_eq!(tokens[3].token, Token::AndAnd);
        assert_eq!(tokens[5].token, Token::OrOr);
    }

    #[test]
    fn layout_let_block() {
        let input = "let\n  x = 1\n  y = 2\nin x";
        let tokens = tokenize(input).unwrap();
        let kinds: Vec<&Token> = tokens.iter().map(|s| &s.token).collect();
        // Should have: Let, Indent, Ident(x), Eq, Int(1), Newline,
        //              Ident(y), Eq, Int(2), Dedent, In, Ident(x), Eof
        assert!(kinds.contains(&&Token::Indent));
        assert!(kinds.contains(&&Token::Newline));
        assert!(kinds.contains(&&Token::Dedent));
    }

    #[test]
    fn comprehension_tokens() {
        let tokens = tokenize("[ a | a <- xs, a > 0 ]").unwrap();
        assert_eq!(tokens[0].token, Token::LBracket);
        assert_eq!(tokens[1].token, Token::Ident("a".into()));
        assert_eq!(tokens[2].token, Token::Pipe);
        assert_eq!(tokens[3].token, Token::Ident("a".into()));
        assert_eq!(tokens[4].token, Token::LeftArrow);
    }

    #[test]
    fn comment_skipped() {
        let tokens = tokenize("x -- this is a comment\ny").unwrap();
        let idents: Vec<&str> = tokens
            .iter()
            .filter_map(|s| {
                if let Token::Ident(ref name) = s.token {
                    Some(name.as_str())
                } else {
                    None
                }
            })
            .collect();
        assert_eq!(idents, vec!["x", "y"]);
    }

    #[test]
    fn float_literal() {
        let tokens = tokenize("3.125").unwrap();
        assert!(matches!(tokens[0].token, Token::Float(f) if (f - 3.125).abs() < f64::EPSILON));
    }

    #[test]
    fn hex_literal() {
        let tokens = tokenize("0xFF").unwrap();
        assert_eq!(tokens[0].token, Token::Int(255));
    }

    #[test]
    fn upper_ident() {
        let tokens = tokenize("True Nothing MyType").unwrap();
        assert_eq!(tokens[0].token, Token::True);
        assert_eq!(tokens[1].token, Token::Nothing);
        assert_eq!(tokens[2].token, Token::UpperIdent("MyType".into()));
    }

    #[test]
    fn lambda_tokens() {
        let tokens = tokenize("\\x -> x + 1").unwrap();
        assert_eq!(tokens[0].token, Token::Backslash);
        assert_eq!(tokens[1].token, Token::Ident("x".into()));
        assert_eq!(tokens[2].token, Token::Arrow);
    }

    #[test]
    fn edge_traversal() {
        let tokens = tokenize("doc -> layers -> annotations").unwrap();
        assert_eq!(tokens[0].token, Token::Ident("doc".into()));
        assert_eq!(tokens[1].token, Token::Arrow);
        assert_eq!(tokens[2].token, Token::Ident("layers".into()));
        assert_eq!(tokens[3].token, Token::Arrow);
        assert_eq!(tokens[4].token, Token::Ident("annotations".into()));
    }
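
    // Added sketch: direct checks of the private position helpers, derived
    // from their definitions above (0-indexed lines; columns are byte
    // offsets from the start of the line).
    #[test]
    fn position_helpers() {
        let input = "ab\ncd"; // 'c' at byte 3, 'd' at byte 4
        assert_eq!(line_of(input, 0), 0);
        assert_eq!(col_of(input, 1), 1);
        assert_eq!(line_of(input, 4), 1);
        assert_eq!(col_of(input, 4), 1);
    }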
}