1use crate::error::Error;
7use crate::syntax::{Position, VarName};
8
/// A single lexical token: a [`TokenKind`] paired with the [`Position`] it was found at.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// What sort of token this is.
    kind: TokenKind,
    /// Where the token starts. The lexer in this file stores the byte offset of the
    /// token's first byte, converted into a [`Position`].
    at: Position,
}
15
16impl Token {
17 #[must_use]
19 pub fn kind(&self) -> &TokenKind {
20 &self.kind
21 }
22
23 #[must_use]
25 pub fn at(&self) -> Position {
26 self.at
27 }
28
29 fn new(kind: TokenKind, at: Position) -> Self {
30 Self { kind, at }
31 }
32}
33
/// Every kind of token the lexer in this file can produce.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// An identifier that is not one of the reserved keywords below.
    ///
    /// The lexer accepts an ASCII letter or `_` followed by any number of ASCII
    /// letters, ASCII digits or `_`.
    Ident(VarName),
    /// The keyword `let`.
    KwLet,
    /// The keyword `in`.
    KwIn,
    /// The keyword `fix`.
    KwFix,
    /// The keyword `ref`.
    KwRef,
    /// The keyword `extend`.
    KwExtend,
    /// The keyword `throw`.
    KwThrow,
    /// The keyword `try`.
    KwTry,
    /// The keyword `catch`.
    KwCatch,
    /// The backslash character `\`.
    Lambda,
    /// The character `.`.
    Dot,
    /// The character `=`.
    Equals,
    /// The character `(`.
    LParen,
    /// The character `)`.
    RParen,
    /// The character `{`.
    LBrace,
    /// The character `}`.
    RBrace,
    /// The character `,`.
    Comma,
    /// The character `!`.
    Bang,
    /// The two-character sequence `:=`.
    Assign,
    /// The character `;`.
    Semicolon,
}
79
80impl std::fmt::Display for TokenKind {
81 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
82 match self {
83 Self::Ident(name) => write!(f, "identifier {:?}", name.as_str()),
84 Self::KwLet => f.write_str("keyword `let`"),
85 Self::KwIn => f.write_str("keyword `in`"),
86 Self::KwFix => f.write_str("keyword `fix`"),
87 Self::KwRef => f.write_str("keyword `ref`"),
88 Self::KwExtend => f.write_str("keyword `extend`"),
89 Self::KwThrow => f.write_str("keyword `throw`"),
90 Self::KwTry => f.write_str("keyword `try`"),
91 Self::KwCatch => f.write_str("keyword `catch`"),
92 Self::Lambda => f.write_str("`\\`"),
93 Self::Dot => f.write_str("`.`"),
94 Self::Equals => f.write_str("`=`"),
95 Self::LParen => f.write_str("`(`"),
96 Self::RParen => f.write_str("`)`"),
97 Self::LBrace => f.write_str("`{`"),
98 Self::RBrace => f.write_str("`}`"),
99 Self::Comma => f.write_str("`,`"),
100 Self::Bang => f.write_str("`!`"),
101 Self::Assign => f.write_str("`:=`"),
102 Self::Semicolon => f.write_str("`;`"),
103 }
104 }
105}
106
/// Outcome of looking at one position of the input byte slice.
enum Step {
    /// The position is past the last byte of the input.
    End,
    /// The byte found at the position.
    Byte(u8),
}
111
112fn peek(src: &[u8], pos: usize) -> Step {
113 src.get(pos).copied().map_or(Step::End, Step::Byte)
114}
115
116pub fn lex(src: &str) -> Result<Vec<Token>, Error> {
139 step(src.as_bytes(), 0, Vec::new())
140}
141
142fn step(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
143 match peek(src, pos) {
144 Step::End => Ok(acc),
145 Step::Byte(b) => take_token(src, pos, acc, b),
146 }
147}
148
149fn take_token(src: &[u8], pos: usize, acc: Vec<Token>, b: u8) -> Result<Vec<Token>, Error> {
150 match b {
151 b' ' | b'\t' | b'\n' | b'\r' => step(src, pos + 1, acc),
152 b'\\' => emit_single(src, pos, acc, TokenKind::Lambda),
153 b'.' => emit_single(src, pos, acc, TokenKind::Dot),
154 b'=' => emit_single(src, pos, acc, TokenKind::Equals),
155 b'(' => emit_single(src, pos, acc, TokenKind::LParen),
156 b')' => emit_single(src, pos, acc, TokenKind::RParen),
157 b'{' => emit_single(src, pos, acc, TokenKind::LBrace),
158 b'}' => emit_single(src, pos, acc, TokenKind::RBrace),
159 b',' => emit_single(src, pos, acc, TokenKind::Comma),
160 b'!' => emit_single(src, pos, acc, TokenKind::Bang),
161 b';' => emit_single(src, pos, acc, TokenKind::Semicolon),
162 b':' => take_colon(src, pos, acc),
163 other if is_ident_start(other) => read_ident(src, pos, acc),
164 other => Err(Error::UnexpectedChar {
165 at: pos.into(),
166 ch: char::from(other),
167 }),
168 }
169}
170
171fn emit_single(
172 src: &[u8],
173 pos: usize,
174 acc: Vec<Token>,
175 kind: TokenKind,
176) -> Result<Vec<Token>, Error> {
177 step(src, pos + 1, push(acc, Token::new(kind, pos.into())))
178}
179
180fn take_colon(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
181 match peek(src, pos + 1) {
182 Step::End => Err(Error::UnexpectedEnd {
183 expected: "`=` after `:`",
184 }),
185 Step::Byte(b'=') => step(
186 src,
187 pos + 2,
188 push(acc, Token::new(TokenKind::Assign, pos.into())),
189 ),
190 Step::Byte(other) => Err(Error::UnexpectedChar {
191 at: (pos + 1).into(),
192 ch: char::from(other),
193 }),
194 }
195}
196
197fn push(acc: Vec<Token>, token: Token) -> Vec<Token> {
198 acc.into_iter().chain(std::iter::once(token)).collect()
199}
200
201fn read_ident(src: &[u8], start: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
202 let end = scan_ident(src, start);
203 let slice = src.get(start..end).unwrap_or(&[]);
204 let token = classify_ident(slice, start);
205 step(src, end, push(acc, token))
206}
207
208fn scan_ident(src: &[u8], pos: usize) -> usize {
209 src.get(pos)
210 .copied()
211 .filter(|b| is_ident_continue(*b))
212 .map_or(pos, |_| scan_ident(src, pos + 1))
213}
214
215fn classify_ident(slice: &[u8], start: usize) -> Token {
216 let at = Position::from(start);
217 match slice {
218 b"let" => Token::new(TokenKind::KwLet, at),
219 b"in" => Token::new(TokenKind::KwIn, at),
220 b"fix" => Token::new(TokenKind::KwFix, at),
221 b"ref" => Token::new(TokenKind::KwRef, at),
222 b"extend" => Token::new(TokenKind::KwExtend, at),
223 b"throw" => Token::new(TokenKind::KwThrow, at),
224 b"try" => Token::new(TokenKind::KwTry, at),
225 b"catch" => Token::new(TokenKind::KwCatch, at),
226 bytes => Token::new(
227 TokenKind::Ident(VarName::from(
228 std::str::from_utf8(bytes).unwrap_or_default(),
229 )),
230 at,
231 ),
232 }
233}
234
/// Reports whether `b` may begin an identifier: an ASCII letter or `_`.
fn is_ident_start(b: u8) -> bool {
    match b {
        b'_' => true,
        other => other.is_ascii_alphabetic(),
    }
}
238
/// Reports whether `b` may appear after the first byte of an identifier:
/// an ASCII letter, an ASCII digit or `_`.
fn is_ident_continue(b: u8) -> bool {
    match b {
        b'_' => true,
        other => other.is_ascii_alphanumeric(),
    }
}
242
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and keeps only the kind of each token.
    fn kinds_of(src: &str) -> Result<Vec<TokenKind>, Error> {
        let tokens = lex(src)?;
        Ok(tokens.iter().map(|token| token.kind().clone()).collect())
    }

    /// Shorthand for an identifier token kind named `name`.
    fn ident(name: &str) -> TokenKind {
        TokenKind::Ident(VarName::from(name))
    }

    /// Succeeds when `actual` equals `expected`; otherwise fails the test with
    /// an `Error::UnexpectedEnd` carrying `label`, without panicking.
    fn expect_kinds(
        actual: &[TokenKind],
        expected: &[TokenKind],
        label: &'static str,
    ) -> Result<(), Error> {
        if actual == expected {
            Ok(())
        } else {
            Err(Error::UnexpectedEnd { expected: label })
        }
    }

    #[test]
    fn lex_object_literal() -> Result<(), Error> {
        let kinds = kinds_of("{ foo = bar, baz = qux }")?;
        let expected = [
            TokenKind::LBrace,
            ident("foo"),
            TokenKind::Equals,
            ident("bar"),
            TokenKind::Comma,
            ident("baz"),
            TokenKind::Equals,
            ident("qux"),
            TokenKind::RBrace,
        ];
        expect_kinds(&kinds, &expected, "object literal tokenization")
    }

    #[test]
    fn lex_extend_keyword() -> Result<(), Error> {
        let kinds = kinds_of("extend p {}")?;
        let expected = [
            TokenKind::KwExtend,
            ident("p"),
            TokenKind::LBrace,
            TokenKind::RBrace,
        ];
        expect_kinds(&kinds, &expected, "extend tokenization")
    }

    #[test]
    fn lex_field_access_dot() -> Result<(), Error> {
        let kinds = kinds_of("obj.field")?;
        let expected = [ident("obj"), TokenKind::Dot, ident("field")];
        expect_kinds(&kinds, &expected, "field access tokenization")
    }
}