1use std::str::Chars;
8
9use thiserror::Error;
10
/// A cursor location within the source text.
///
/// `offset` is a byte index into the input; `line` and `column` are
/// 1-based and counted in characters (a newline resets `column` to 1).
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct Position {
    /// Byte offset from the beginning of the input.
    pub offset: usize,
    /// 1-based line number.
    pub line: usize,
    /// 1-based column number, counted in characters.
    pub column: usize,
}

impl Position {
    /// The position of the very first character: offset 0, line 1, column 1.
    #[must_use]
    pub const fn start() -> Self {
        Self {
            offset: 0,
            line: 1,
            column: 1,
        }
    }
}

impl Default for Position {
    /// Defaults to [`Position::start`], so a default-constructed
    /// `Position` points at the beginning of the input.
    fn default() -> Self {
        Self::start()
    }
}
33
/// A single lexical token produced by the lexer.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// `@name` — a symbol without a kind annotation.
    Symbol(String),
    /// `@name:Kind` — a symbol carrying a kind annotation.
    TypedSymbol {
        /// The identifier part (lowercase letters, digits, underscores).
        name: String,
        /// The kind annotation (starts with an ASCII uppercase letter).
        kind: String,
    },
    /// A plain lowercase identifier that is not a reserved word.
    Bareword(String),
    /// Date or date-time text such as `2024-01-15` or
    /// `2026-04-17T10:00:00Z`, kept as raw text (shape-checked only,
    /// not validated as a real calendar date).
    Timestamp(String),
    /// An integer literal.
    Integer(i64),
    /// A floating-point literal (numeric text containing a `.`).
    Float(f64),
    /// A double-quoted string with escape sequences already resolved.
    String(String),
    /// The reserved words `true` / `false`.
    Boolean(bool),
    /// The reserved word `nil`.
    Nil,
    /// `:name` — a keyword, stored without the leading colon.
    Keyword(String),
    /// `(`
    LParen,
    /// `)`
    RParen,
}
76
/// A [`Token`] together with the [`Position`] where it starts.
#[derive(Clone, Debug, PartialEq)]
pub struct Spanned {
    /// The lexed token.
    pub token: Token,
    /// Position of the token's first character in the input.
    pub position: Position,
}
85
/// Errors produced while tokenizing.
#[derive(Debug, Error, PartialEq)]
pub enum LexError {
    /// A `"` was opened but the input ended before a closing `"`.
    #[error("unterminated string starting at {start:?}")]
    UnterminatedString {
        /// Position where the string token began.
        start: Position,
    },

    /// A backslash was followed by a character that is not a
    /// recognized escape (`n`, `r`, `t`, `\`, `"`).
    #[error("invalid escape '\\{escape}' at {pos:?}")]
    InvalidEscape {
        /// The unrecognized character after the backslash.
        escape: char,
        /// Position of the backslash introducing the escape.
        pos: Position,
    },

    /// Numeric-looking text failed to parse as `i64` or `f64`.
    #[error("invalid number {text:?} at {pos:?}")]
    InvalidNumber {
        /// The offending text as consumed.
        text: String,
        /// Position where the number token began.
        pos: Position,
    },

    /// An identifier (symbol, keyword, kind, or bareword) did not
    /// match the required shape.
    #[error("invalid identifier {text:?} at {pos:?}")]
    InvalidIdentifier {
        /// The offending text, including any `@`/`:` sigils.
        text: String,
        /// Position where the identifier token began.
        pos: Position,
    },

    /// A character that cannot start any token.
    #[error("unexpected byte {byte:#04x} at {pos:?}")]
    UnexpectedByte {
        /// First byte of the offending character.
        byte: u8,
        /// Position of the offending character.
        pos: Position,
    },

    /// Invalid UTF-8 in the input.
    // NOTE(review): never constructed in this file (`tokenize` takes
    // `&str`, which is always valid UTF-8) — presumably reserved for a
    // byte-oriented entry point; confirm before removing.
    #[error("invalid UTF-8 at {pos:?}")]
    InvalidUtf8 {
        /// Position where the invalid sequence was found.
        pos: Position,
    },
}
139
140pub fn tokenize(input: &str) -> Result<Vec<Spanned>, LexError> {
160 let mut lexer = Lexer::new(input);
161 let mut out = Vec::new();
162 while let Some(spanned) = lexer.next_token()? {
163 out.push(spanned);
164 }
165 Ok(out)
166}
167
/// Internal lexer state.
struct Lexer<'a> {
    /// The full input; token text is sliced from it by byte offset.
    input: &'a str,
    /// Cursor over the characters not yet consumed.
    chars: Chars<'a>,
    /// Position of the next unconsumed character.
    pos: Position,
}
173
174impl<'a> Lexer<'a> {
175 fn new(input: &'a str) -> Self {
176 Self {
177 input,
178 chars: input.chars(),
179 pos: Position::start(),
180 }
181 }
182
183 fn peek(&self) -> Option<char> {
184 self.chars.clone().next()
185 }
186
187 fn bump(&mut self) -> Option<char> {
188 let c = self.chars.next()?;
189 let len = c.len_utf8();
190 self.pos.offset += len;
191 if c == '\n' {
192 self.pos.line += 1;
193 self.pos.column = 1;
194 } else {
195 self.pos.column += 1;
196 }
197 Some(c)
198 }
199
200 fn skip_whitespace_and_comments(&mut self) {
201 while let Some(c) = self.peek() {
202 if c.is_whitespace() {
203 self.bump();
204 } else if c == ';' {
205 while let Some(cc) = self.peek() {
207 if cc == '\n' {
208 break;
209 }
210 self.bump();
211 }
212 } else {
213 break;
214 }
215 }
216 }
217
218 fn next_token(&mut self) -> Result<Option<Spanned>, LexError> {
219 self.skip_whitespace_and_comments();
220 let start = self.pos;
221 let Some(c) = self.peek() else {
222 return Ok(None);
223 };
224 let token = match c {
225 '(' => {
226 self.bump();
227 Token::LParen
228 }
229 ')' => {
230 self.bump();
231 Token::RParen
232 }
233 '"' => self.lex_string(start)?,
234 '@' => self.lex_symbol_or_typed(start)?,
235 ':' => self.lex_keyword(start)?,
236 '-' | '0'..='9' => self.lex_number_or_timestamp(start)?,
237 'a'..='z' | '_' => self.lex_bareword_or_reserved(start)?,
238 _ => {
239 let byte = c as u32;
240 #[allow(clippy::cast_possible_truncation)]
241 return Err(LexError::UnexpectedByte {
242 byte: byte as u8,
243 pos: start,
244 });
245 }
246 };
247 Ok(Some(Spanned {
248 token,
249 position: start,
250 }))
251 }
252
253 fn lex_string(&mut self, start: Position) -> Result<Token, LexError> {
254 self.bump(); let mut buf = String::new();
256 loop {
257 let pos = self.pos;
258 let Some(c) = self.bump() else {
259 return Err(LexError::UnterminatedString { start });
260 };
261 match c {
262 '"' => return Ok(Token::String(buf)),
263 '\\' => {
264 let Some(esc) = self.bump() else {
265 return Err(LexError::UnterminatedString { start });
266 };
267 let resolved = match esc {
268 'n' => '\n',
269 'r' => '\r',
270 't' => '\t',
271 '\\' => '\\',
272 '"' => '"',
273 other => return Err(LexError::InvalidEscape { escape: other, pos }),
274 };
275 buf.push(resolved);
276 }
277 other => buf.push(other),
278 }
279 }
280 }
281
282 fn lex_symbol_or_typed(&mut self, start: Position) -> Result<Token, LexError> {
283 self.bump(); let name_start = self.pos.offset;
285 self.consume_identifier();
286 let name_end = self.pos.offset;
287 let name = self.input[name_start..name_end].to_string();
288 if name.is_empty() || !is_valid_identifier_start(&name) {
289 return Err(LexError::InvalidIdentifier {
290 text: format!("@{name}"),
291 pos: start,
292 });
293 }
294 if self.peek() == Some(':') {
295 self.bump();
296 let kind_start = self.pos.offset;
297 self.consume_kind_annotation();
298 let kind_end = self.pos.offset;
299 let kind = self.input[kind_start..kind_end].to_string();
300 if kind.is_empty() || !is_valid_kind_annotation(&kind) {
301 return Err(LexError::InvalidIdentifier {
302 text: format!("@{name}:{kind}"),
303 pos: start,
304 });
305 }
306 Ok(Token::TypedSymbol { name, kind })
307 } else {
308 Ok(Token::Symbol(name))
309 }
310 }
311
312 fn lex_keyword(&mut self, start: Position) -> Result<Token, LexError> {
313 self.bump(); let name_start = self.pos.offset;
315 self.consume_identifier();
316 let name_end = self.pos.offset;
317 let name = self.input[name_start..name_end].to_string();
318 if name.is_empty() || !is_valid_identifier_start(&name) {
319 return Err(LexError::InvalidIdentifier {
320 text: format!(":{name}"),
321 pos: start,
322 });
323 }
324 Ok(Token::Keyword(name))
325 }
326
327 fn lex_number_or_timestamp(&mut self, start: Position) -> Result<Token, LexError> {
328 let begin = self.pos.offset;
329 while let Some(c) = self.peek() {
333 if c.is_ascii_digit() || matches!(c, '-' | '.' | ':' | 'T' | 'Z') {
334 self.bump();
335 } else {
336 break;
337 }
338 }
339 let end = self.pos.offset;
340 let text = &self.input[begin..end];
341 if looks_like_timestamp(text) {
342 return Ok(Token::Timestamp(text.to_string()));
343 }
344 if text.contains('.') {
345 text.parse::<f64>()
346 .map(Token::Float)
347 .map_err(|_| LexError::InvalidNumber {
348 text: text.to_string(),
349 pos: start,
350 })
351 } else {
352 text.parse::<i64>()
353 .map(Token::Integer)
354 .map_err(|_| LexError::InvalidNumber {
355 text: text.to_string(),
356 pos: start,
357 })
358 }
359 }
360
361 fn lex_bareword_or_reserved(&mut self, start: Position) -> Result<Token, LexError> {
362 let begin = self.pos.offset;
363 self.consume_identifier();
364 let end = self.pos.offset;
365 let text = &self.input[begin..end];
366 let token = match text {
367 "true" => Token::Boolean(true),
368 "false" => Token::Boolean(false),
369 "nil" => Token::Nil,
370 _ => {
371 if is_valid_identifier_start(text) {
372 Token::Bareword(text.to_string())
373 } else {
374 return Err(LexError::InvalidIdentifier {
375 text: text.to_string(),
376 pos: start,
377 });
378 }
379 }
380 };
381 Ok(token)
382 }
383
384 fn consume_identifier(&mut self) {
385 while let Some(c) = self.peek() {
386 if c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_' {
387 self.bump();
388 } else {
389 break;
390 }
391 }
392 }
393
394 fn consume_kind_annotation(&mut self) {
395 while let Some(c) = self.peek() {
398 if c.is_ascii_alphabetic() || c.is_ascii_digit() {
399 self.bump();
400 } else {
401 break;
402 }
403 }
404 }
405}
406
/// Returns `true` when `s` is a valid identifier: a lowercase ASCII
/// letter or underscore, followed by lowercase letters, digits, or
/// underscores.
fn is_valid_identifier_start(s: &str) -> bool {
    let bytes = s.as_bytes();
    let Some(&first) = bytes.first() else {
        return false;
    };
    (first.is_ascii_lowercase() || first == b'_')
        && bytes[1..]
            .iter()
            .all(|&b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'_')
}
416
/// Returns `true` when `s` is a valid kind annotation: an ASCII
/// uppercase letter followed by ASCII alphanumerics.
///
/// ASCII-only checks are used so validation matches exactly what
/// `consume_kind_annotation` can consume; the previous
/// `char::is_alphanumeric` accepted Unicode alphanumerics that the
/// lexer can never produce here.
fn is_valid_kind_annotation(s: &str) -> bool {
    let mut chars = s.chars();
    match chars.next() {
        Some(c) if c.is_ascii_uppercase() => chars.all(|c| c.is_ascii_alphanumeric()),
        _ => false,
    }
}
424
/// Shape check for timestamp text: `DDDD-DD-DD`, optionally followed
/// by `T` and a tail containing at least one `:`. The digits are not
/// validated as a real calendar date.
fn looks_like_timestamp(text: &str) -> bool {
    let b = text.as_bytes();
    let date_ok = b.len() >= 10
        && b[..4].iter().all(u8::is_ascii_digit)
        && b[4] == b'-'
        && b[5..7].iter().all(u8::is_ascii_digit)
        && b[7] == b'-'
        && b[8..10].iter().all(u8::is_ascii_digit);
    if !date_ok {
        return false;
    }
    match b.get(10) {
        // Exactly a date: `2024-01-15`.
        None => true,
        // Date-time: require a `:` somewhere in the time part.
        Some(&b'T') => b[11..].contains(&b':'),
        Some(_) => false,
    }
}
454
#[cfg(test)]
mod tests {
    //! Unit tests for the lexer: token shapes, error cases, and
    //! position tracking.

    use super::*;

    /// Convenience accessor for the first token of a lexed stream.
    fn first(tokens: &[Spanned]) -> &Token {
        &tokens[0].token
    }

    #[test]
    fn empty_input_produces_no_tokens() {
        assert!(tokenize("").unwrap().is_empty());
        // Whitespace-only input also yields nothing.
        assert!(tokenize(" \t\n ").unwrap().is_empty());
    }

    #[test]
    fn parens_are_tokens() {
        let t = tokenize("( )").unwrap();
        assert_eq!(t.len(), 2);
        assert_eq!(first(&t), &Token::LParen);
        assert_eq!(&t[1].token, &Token::RParen);
    }

    #[test]
    fn symbol_with_and_without_kind() {
        let t = tokenize("@alice @alice:Agent").unwrap();
        assert_eq!(first(&t), &Token::Symbol("alice".into()));
        assert_eq!(
            &t[1].token,
            &Token::TypedSymbol {
                name: "alice".into(),
                kind: "Agent".into(),
            }
        );
    }

    #[test]
    fn bareword_and_reserved_words() {
        let t = tokenize("email true false nil sem").unwrap();
        assert_eq!(first(&t), &Token::Bareword("email".into()));
        assert_eq!(&t[1].token, &Token::Boolean(true));
        assert_eq!(&t[2].token, &Token::Boolean(false));
        assert_eq!(&t[3].token, &Token::Nil);
        // "sem" is a prefix of no reserved word — stays a bareword.
        assert_eq!(&t[4].token, &Token::Bareword("sem".into()));
    }

    #[test]
    fn numbers_distinguish_int_and_float() {
        let t = tokenize("42 -17 3.14 -0.5").unwrap();
        assert_eq!(first(&t), &Token::Integer(42));
        assert_eq!(&t[1].token, &Token::Integer(-17));
        // Floats are compared with a tolerance rather than exact equality.
        match &t[2].token {
            Token::Float(f) => assert!((f - 3.14).abs() < 1e-9),
            other => panic!("expected Float, got {other:?}"),
        }
        match &t[3].token {
            Token::Float(f) => assert!((f + 0.5).abs() < 1e-9),
            other => panic!("expected Float, got {other:?}"),
        }
    }

    #[test]
    fn timestamps_are_distinct_from_numbers() {
        let t = tokenize("2024-01-15 2026-04-17T10:00:00Z").unwrap();
        match first(&t) {
            Token::Timestamp(s) => assert_eq!(s, "2024-01-15"),
            other => panic!("expected Timestamp, got {other:?}"),
        }
        match &t[1].token {
            Token::Timestamp(s) => assert_eq!(s, "2026-04-17T10:00:00Z"),
            other => panic!("expected Timestamp, got {other:?}"),
        }
    }

    #[test]
    fn strings_resolve_escapes() {
        let t = tokenize(r#" "hello\nworld" "a\"b" "#).unwrap();
        assert_eq!(first(&t), &Token::String("hello\nworld".into()));
        assert_eq!(&t[1].token, &Token::String("a\"b".into()));
    }

    #[test]
    fn keyword_stripped_of_colon() {
        let t = tokenize(":src :confidence_threshold").unwrap();
        assert_eq!(first(&t), &Token::Keyword("src".into()));
        assert_eq!(&t[1].token, &Token::Keyword("confidence_threshold".into()));
    }

    #[test]
    fn line_comments_skipped() {
        let t = tokenize("; a comment\n@alice").unwrap();
        assert_eq!(t.len(), 1);
        assert_eq!(first(&t), &Token::Symbol("alice".into()));
    }

    #[test]
    fn unterminated_string_errors() {
        let result = tokenize(r#" "no close "#);
        assert!(matches!(result, Err(LexError::UnterminatedString { .. })));
    }

    #[test]
    fn invalid_escape_errors() {
        let result = tokenize(r#" "\q" "#);
        assert!(matches!(
            result,
            Err(LexError::InvalidEscape { escape: 'q', .. })
        ));
    }

    #[test]
    fn unexpected_byte_errors() {
        let result = tokenize("$");
        assert!(matches!(result, Err(LexError::UnexpectedByte { .. })));
    }

    #[test]
    fn positions_track_line_and_column() {
        let t = tokenize("(\n@alice").unwrap();
        assert_eq!(t[0].position.line, 1);
        assert_eq!(t[0].position.column, 1);
        // The newline resets the column and advances the line.
        assert_eq!(t[1].position.line, 2);
        assert_eq!(t[1].position.column, 1);
    }
}