1use crate::error::Error;
13use crate::syntax::{Position, VarName};
14
15#[derive(Debug, Clone, PartialEq, Eq)]
17pub struct Token {
18 kind: TokenKind,
19 at: Position,
20}
21
22impl Token {
23 #[must_use]
25 pub fn kind(&self) -> &TokenKind {
26 &self.kind
27 }
28
29 #[must_use]
31 pub fn at(&self) -> Position {
32 self.at
33 }
34
35 fn new(kind: TokenKind, at: Position) -> Self {
36 Self { kind, at }
37 }
38}
39
40#[derive(Debug, Clone, PartialEq, Eq)]
42pub enum TokenKind {
43 Ident(VarName),
45 KwLet,
47 KwIn,
49 KwFix,
51 KwRef,
53 Lambda,
55 Dot,
57 Equals,
59 LParen,
61 RParen,
63 Bang,
65 Assign,
67 Semicolon,
69}
70
71impl std::fmt::Display for TokenKind {
72 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
73 match self {
74 Self::Ident(name) => write!(f, "identifier {:?}", name.as_str()),
75 Self::KwLet => f.write_str("keyword `let`"),
76 Self::KwIn => f.write_str("keyword `in`"),
77 Self::KwFix => f.write_str("keyword `fix`"),
78 Self::KwRef => f.write_str("keyword `ref`"),
79 Self::Lambda => f.write_str("`\\`"),
80 Self::Dot => f.write_str("`.`"),
81 Self::Equals => f.write_str("`=`"),
82 Self::LParen => f.write_str("`(`"),
83 Self::RParen => f.write_str("`)`"),
84 Self::Bang => f.write_str("`!`"),
85 Self::Assign => f.write_str("`:=`"),
86 Self::Semicolon => f.write_str("`;`"),
87 }
88 }
89}
90
/// Result of looking at one position of the input.
enum Step {
    /// The position is at or beyond the end of the input.
    End,
    /// The byte found at the position.
    Byte(u8),
}
96
97fn peek(src: &[u8], pos: usize) -> Step {
98 src.get(pos).copied().map_or(Step::End, Step::Byte)
99}
100
101pub fn lex(src: &str) -> Result<Vec<Token>, Error> {
124 step(src.as_bytes(), 0, Vec::new())
125}
126
127fn step(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
128 match peek(src, pos) {
129 Step::End => Ok(acc),
130 Step::Byte(b) => take_token(src, pos, acc, b),
131 }
132}
133
134fn take_token(src: &[u8], pos: usize, acc: Vec<Token>, b: u8) -> Result<Vec<Token>, Error> {
135 match b {
136 b' ' | b'\t' | b'\n' | b'\r' => step(src, pos + 1, acc),
137 b'\\' => emit_single(src, pos, acc, TokenKind::Lambda),
138 b'.' => emit_single(src, pos, acc, TokenKind::Dot),
139 b'=' => emit_single(src, pos, acc, TokenKind::Equals),
140 b'(' => emit_single(src, pos, acc, TokenKind::LParen),
141 b')' => emit_single(src, pos, acc, TokenKind::RParen),
142 b'!' => emit_single(src, pos, acc, TokenKind::Bang),
143 b';' => emit_single(src, pos, acc, TokenKind::Semicolon),
144 b':' => take_colon(src, pos, acc),
145 other if is_ident_start(other) => read_ident(src, pos, acc),
146 other => Err(Error::UnexpectedChar {
147 at: pos.into(),
148 ch: char::from(other),
149 }),
150 }
151}
152
153fn emit_single(
154 src: &[u8],
155 pos: usize,
156 acc: Vec<Token>,
157 kind: TokenKind,
158) -> Result<Vec<Token>, Error> {
159 step(src, pos + 1, push(acc, Token::new(kind, pos.into())))
160}
161
162fn take_colon(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
163 match peek(src, pos + 1) {
164 Step::End => Err(Error::UnexpectedEnd {
165 expected: "`=` after `:`",
166 }),
167 Step::Byte(b'=') => step(
168 src,
169 pos + 2,
170 push(acc, Token::new(TokenKind::Assign, pos.into())),
171 ),
172 Step::Byte(other) => Err(Error::UnexpectedChar {
173 at: (pos + 1).into(),
174 ch: char::from(other),
175 }),
176 }
177}
178
179fn push(acc: Vec<Token>, token: Token) -> Vec<Token> {
180 acc.into_iter().chain(std::iter::once(token)).collect()
181}
182
183fn read_ident(src: &[u8], start: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
184 let end = scan_ident(src, start);
185 let slice = src.get(start..end).unwrap_or(&[]);
186 let token = classify_ident(slice, start);
187 step(src, end, push(acc, token))
188}
189
190fn scan_ident(src: &[u8], pos: usize) -> usize {
191 src.get(pos)
192 .copied()
193 .filter(|b| is_ident_continue(*b))
194 .map_or(pos, |_| scan_ident(src, pos + 1))
195}
196
197fn classify_ident(slice: &[u8], start: usize) -> Token {
198 let at = Position::from(start);
199 match slice {
200 b"let" => Token::new(TokenKind::KwLet, at),
201 b"in" => Token::new(TokenKind::KwIn, at),
202 b"fix" => Token::new(TokenKind::KwFix, at),
203 b"ref" => Token::new(TokenKind::KwRef, at),
204 bytes => Token::new(
205 TokenKind::Ident(VarName::from(
206 std::str::from_utf8(bytes).unwrap_or_default(),
207 )),
208 at,
209 ),
210 }
211}
212
/// Whether `b` may begin an identifier: an ASCII letter or an underscore.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
216
/// Whether `b` may appear inside an identifier: an ASCII letter, an ASCII
/// digit, or an underscore.
fn is_ident_continue(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
220
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and keeps only the token kinds, discarding positions.
    fn kinds_of(src: &str) -> Result<Vec<TokenKind>, Error> {
        let tokens = lex(src)?;
        Ok(tokens.iter().map(|token| token.kind().clone()).collect())
    }

    /// Converts a comparison outcome into the tests' `Result` convention: a
    /// failed check is reported as `Error::UnexpectedEnd` carrying `label`.
    fn check(passed: bool, label: &'static str) -> Result<(), Error> {
        if passed {
            Ok(())
        } else {
            Err(Error::UnexpectedEnd { expected: label })
        }
    }

    /// `ref` is a keyword and `:=` is a single token.
    #[test]
    fn lex_ref_and_assign() -> Result<(), Error> {
        let expected = [
            TokenKind::KwRef,
            TokenKind::Ident(VarName::from("x")),
            TokenKind::Assign,
            TokenKind::Ident(VarName::from("y")),
        ];
        check(kinds_of("ref x := y")? == expected, "ref/assign tokenization")
    }

    /// `!` and `;` are tokens of their own, with or without surrounding spaces.
    #[test]
    fn lex_sequence_and_bang() -> Result<(), Error> {
        let expected = [
            TokenKind::Bang,
            TokenKind::Ident(VarName::from("x")),
            TokenKind::Semicolon,
            TokenKind::Ident(VarName::from("y")),
        ];
        check(kinds_of("!x ; y")? == expected, "bang/semi tokenization")
    }

    /// A `:` that is not followed by `=` is rejected with `UnexpectedChar`.
    #[test]
    fn bare_colon_is_error() -> Result<(), Error> {
        match lex(":x") {
            Ok(_) => Err(Error::UnexpectedEnd {
                expected: "bare colon rejection",
            }),
            Err(Error::UnexpectedChar { .. }) => Ok(()),
            Err(other) => Err(other),
        }
    }
}