// koi_parser/lexer.rs

1use std::{
2    num::{ParseFloatError, ParseIntError},
3    ops::Range,
4};
5
6use logos::Logos;
7
8pub fn lexer(source: &str) -> impl Iterator<Item = (Token, Range<usize>)> {
9    Token::lexer(source)
10        .spanned()
11        .map(|(result, span)| match result {
12            Ok(v) => (v, span),
13            Err(e) => (Token::Error(e), span),
14        })
15        .chain(std::iter::once((Token::EOF, 0..0)))
16}
17
/// Tokens for a simplified Rust-like language.
///
/// Whitespace (including newlines) is skipped; comments are NOT skipped —
/// they are produced as `DocComment`/`LineComment` tokens so downstream
/// passes can attach documentation to items.
#[derive(Logos, Debug, Clone, PartialEq)]
// #[logos(error(LexingError, LexingError::from_lexer))]
#[logos(error(LexingError))]
#[logos(skip r"\s+")]
pub enum Token {
    // --- Keywords ---
    #[token("fn")]
    Fn,
    #[token("let")]
    Let,
    #[token("return")]
    Return,
    // #[token("if")]
    // If,
    // #[token("else")]
    // Else,
    // #[token("match")]
    // Match,
    // #[token("in")]
    // In,
    // #[token("loop")]
    // Loop,
    // #[token("break")]
    // Break,
    // #[token("continue")]
    // Continue,
    // #[token("use")]
    // Use,
    #[token("struct")]
    Struct,
    #[token("enum")]
    Enum,
    // #[token("impl")]
    // Impl,
    // #[token("trait")]
    // Trait,
    // #[token("pub")]
    // Pub,
    // #[token("alias")]
    // Alias,
    // #[token("error")]
    // Error,

    // --- Literals ---
    // Float requires a decimal point; a parse failure becomes
    // LexingError::Float via the From<ParseFloatError> impl below.
    #[regex(r"[0-9]+\.[0-9]+", |lex| lex.slice().parse::<f64>())]
    Float(f64),
    // Bare digit runs are ints; overflow surfaces as LexingError::Int.
    #[regex("[0-9]+", |lex| lex.slice().parse::<i64>())]
    Int(i64),
    // unwrap is safe: the matched slice is exactly "true" or "false".
    #[token("true", |lex| lex.slice().parse::<bool>().unwrap())]
    #[token("false", |lex| lex.slice().parse::<bool>().unwrap())]
    Bool(bool),

    // #[regex(r#"f"([^"\\]|\\.)*""#)]
    // #[token(r#"f""""#, parse_fblock_string)]
    // FString(String),
    // NOTE(review): the inline-string callback stores the slice WITH its
    // surrounding quotes, while parse_block_string strips the `"""`
    // delimiters — confirm consumers expect this asymmetry.
    #[regex(r#""([^"\\]|\\.)*""#, |lex| lex.slice().to_string())]
    #[token(r#"""""#, parse_block_string)]
    String(String),

    // --- Comments ---
    // `///` must be its own rule so doc comments don't lex as LineComment.
    #[regex(r"///[^\n]*", |lex| lex.slice().to_string())]
    DocComment(String), // captures `/// ...`
    #[regex(r"//[^\n]*", |lex| lex.slice().to_string())]
    LineComment(String),

    // --- Punctuation ---
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token(",")]
    Comma,
    #[token(";")]
    Semi,
    // #[token("::")]
    // Turbo,
    #[token(":")]
    Colon,
    #[token("->")]
    Arrow,
    // #[token("=>")]
    // FatArrow,

    // --- Operators ---
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,

    #[token("==")]
    Eq,
    #[token("!=")]
    Neq,
    #[token("&&")]
    And,
    #[token("||")]
    Or,
    #[token("<")]
    Lt, // Note: This is not also LAngle, since we use `[` instead for generics
    #[token("<=")]
    Lte,
    #[token(">")]
    Gt, // Note: This is not also RAngle, since we use `]` instead for generics
    #[token(">=")]
    Gte,

    // --- Special ---
    // #[token("!")]
    // Exclamation,
    // #[token("?")]
    // Question,
    #[token("=")]
    Assignment,
    // #[token("_")]
    // Underscore,
    // #[token("..=")]
    // RangeInclusive,
    // #[token("..")]
    // RangeExclusive,
    // #[token(".")]
    // Dot,

    // --- Identifiers ---
    // Match identifiers that either:
    // - Start with a letter, or
    // - Start with _ followed by at least one more character
    // (a lone `_` is intentionally NOT an identifier)
    #[regex(r"[a-zA-Z][a-zA-Z0-9_]*|_[a-zA-Z0-9_]+", |lex| lex.slice().to_string())]
    Ident(String),

    // Error encountered during lexing; never produced by logos directly —
    // the `lexer()` wrapper converts Err results into this variant.
    Error(LexingError),
    // End of file; appended by the `lexer()` wrapper, not by logos.
    EOF,
}
165
166fn parse_block_string<'src>(lex: &mut logos::Lexer<'src, Token>) -> Result<String, LexingError> {
167    let start = lex.span().end; // Position after the opening """
168
169    let remainder = lex.remainder();
170
171    if let Some(end_pos) = remainder.find("\"\"\"") {
172        lex.bump(end_pos + 3);
173
174        let full_source = lex.source();
175        return Ok(full_source[start..start + end_pos].to_string());
176    }
177
178    Err(LexingError::BlockString(
179        "Block string missing terminating `\"\"\"`".to_string(),
180    ))
181}
182
183fn parse_fblock_string<'src>(lex: &mut logos::Lexer<'src, Token>) -> Result<String, LexingError> {
184    debug_assert_eq!(lex.slice(), "f\"\"\"", "Expected opening triple quotes");
185
186    parse_block_string(lex)
187}
188
/// Errors that can occur while lexing.
///
/// `Other` is the `#[default]` variant, which logos produces for any input
/// that matches no token rule.
#[derive(Default, Debug, Clone, PartialEq)]
pub enum LexingError {
    /// An integer literal failed to parse (carries the `ParseIntError` message).
    Int(String),
    /// A float literal failed to parse (carries the `ParseFloatError` message).
    Float(String),
    /// A block string was malformed, e.g. missing its closing `"""`.
    BlockString(String),
    /// Catch-all for input that matched no token rule.
    #[default]
    Other,
}
197
198impl From<ParseIntError> for LexingError {
199    fn from(err: ParseIntError) -> Self {
200        LexingError::Int(err.to_string())
201    }
202}
203
204impl From<ParseFloatError> for LexingError {
205    fn from(err: ParseFloatError) -> Self {
206        LexingError::Float(err.to_string())
207    }
208}
209
210// impl LexingError {
211//     fn from_lexer(lex: &mut logos::Lexer<'_, Token>) -> Self {
212//         todo!()
213//     }
214// }
215
/// Smoke test: lexes a small program and dumps each token with its span
/// and the matched source text.
#[test]
fn test() {
    let source = r#"
        /// This function adds
        /// more comment
        fn add(x: i32, y: i32) -> i32 {
            let _ = """ 
This is a block string
""";
            let sum = x + y;
            return sum;
        }
"#;

    // Iterate with `for` instead of `while let Some(..) = iter.next()`
    // (clippy::while_let_on_iterator), and print the matched text as a
    // borrowed &str — `{:?}` renders it identically to the previous
    // per-token String allocation.
    for (token, span) in lexer(source) {
        println!("{:?} => {:?}{:?}", token, &span, &source[span.clone()]);
    }
}