1use std::fmt::{Debug, Display};
2
3use anyhow::bail;
4
5pub fn lex(text: &str) -> anyhow::Result<Vec<(Token, TextPos)>> {
10 let mut tokens: Vec<(Token, TextPos)> = Vec::new();
11
12 let mut line_n: usize = 1;
14 let mut last_line_i: usize = 0;
15 let mut tok_start_pos = TextPos(line_n, 0, 0);
16
17 let mut tok: Token = Token::None;
19 let mut tok_finished = false;
20
21 let mut escape = false;
23 let mut num_str = String::new();
24
25 for (i, c) in text.chars().enumerate() {
26 let pos = TextPos(line_n, i - last_line_i, i);
27 if c == '\n' {
28 line_n += 1;
29 last_line_i = i + 1;
31 }
32
33 loop {
35 let mut repeat = false;
36 match &mut tok {
37 Token::None => match c {
38 ';' => {
39 tok = Token::Semicolon;
40 tok_finished = true;
41 }
42 ':' => {
43 tok = Token::Colon;
44 tok_finished = true;
45 }
46 ',' => {
47 tok = Token::Comma;
48 tok_finished = true;
49 }
50 '|' => {
51 tok = Token::Pipe;
52 tok_finished = true;
53 }
54 '{' => {
55 tok = Token::Curly(Side::Left);
56 tok_finished = true;
57 }
58 '}' => {
59 tok = Token::Curly(Side::Right);
60 tok_finished = true;
61 }
62 '[' => {
63 tok = Token::Square(Side::Left);
64 tok_finished = true;
65 }
66 ']' => {
67 tok = Token::Square(Side::Right);
68 tok_finished = true;
69 }
70 '(' => {
71 tok = Token::Paren(Side::Left);
72 tok_finished = true;
73 }
74 ')' => {
75 tok = Token::Paren(Side::Right);
76 tok_finished = true;
77 }
78 '<' => {
79 tok = Token::Angle(Side::Left);
80 tok_finished = true;
81 }
82 '>' => {
83 tok = Token::Angle(Side::Right);
84 tok_finished = true;
85 }
86 '@' => {
87 tok = Token::At;
88 tok_finished = true;
89 }
90 '!' => {
91 tok = Token::Bang;
92 tok_finished = true;
93 }
94 '=' => {
95 tok = Token::Equal;
96 tok_finished = true;
97 }
98 '%' => {
99 tok = Token::Percent;
100 tok_finished = true;
101 }
102 '&' => {
103 tok = Token::Ampersand;
104 tok_finished = true;
105 }
106 '~' => {
107 tok = Token::Tilde;
108 tok_finished = true;
109 }
110 '"' => tok = Token::Str(String::new()),
111 '#' => tok = Token::Comment(String::new()),
112 '$' => tok = Token::Variable(String::new()),
113 c if is_whitespace(c) => tok = Token::Whitespace,
114 c if is_num(c, true) => {
115 tok = Token::Num(0);
116 num_str = c.to_string();
117 }
118 c if is_ident(c, true) => tok = Token::Ident(c.into()),
119 _ => bail!("Unexpected token {tok:?} {pos}"),
120 },
121 Token::Str(string) => match lex_string_char(c, escape) {
122 StrLexResult::Append => {
123 string.push(c);
124 escape = false;
125 }
126 StrLexResult::Escape => escape = true,
127 StrLexResult::End => {
128 escape = false;
129 tok_finished = true;
130 }
131 },
132 Token::Comment(string) => {
133 if c == '\n' {
134 tok_finished = true;
135 } else {
136 string.push(c);
137 }
138 }
139 Token::Variable(name) => {
140 let allowed = is_ident(c, name.is_empty());
141
142 if allowed {
143 name.push(c);
144 } else {
145 repeat = true;
146 tokens.push((tok, tok_start_pos.clone()));
147 tok = Token::None;
148 }
149 }
150 Token::Whitespace => {
151 if !is_whitespace(c) {
152 repeat = true;
153 tokens.push((tok, tok_start_pos.clone()));
154 tok_start_pos = pos.clone();
155 tok = Token::None;
156 }
157 }
158 Token::Ident(name) => {
159 if is_ident(c, false) {
160 name.push(c);
161 } else {
162 repeat = true;
163 tokens.push((tok, tok_start_pos.clone()));
164 tok_start_pos = pos.clone();
165 tok = Token::None;
166 }
167 }
168 Token::Num(num) => {
169 if is_num(c, false) {
170 num_str.push(c);
171 } else if c == '.' {
172 tok = Token::Decimal(TryInto::<i32>::try_into(*num)?.try_into()?);
173 num_str.push('.');
174 } else {
175 repeat = true;
176 if num_str == "-" {
177 bail!("Invalid number '{num_str}', {pos}");
178 }
179 *num = num_str.parse().expect("Number contains invalid characters");
180 tokens.push((tok, tok_start_pos.clone()));
181 tok_start_pos = pos.clone();
182 tok = Token::None;
183 }
184 }
185 Token::Decimal(num) => {
186 if is_decimal(c, false, true) {
187 num_str.push(c);
188 } else {
189 repeat = true;
190 if num_str == "-" {
191 bail!("Invalid number '{num_str}', {pos}");
192 }
193 *num = num_str
194 .parse()
195 .expect("Decimal number contains invalid characters");
196 tokens.push((tok, tok_start_pos.clone()));
197 tok_start_pos = pos.clone();
198 tok = Token::None;
199 }
200 }
201 _ => {}
202 }
203 if !repeat {
204 break;
205 }
206 }
207 if tok_finished {
208 tok_finished = false;
209 tokens.push((tok, tok_start_pos));
210 tok_start_pos = pos.clone();
211 tok_start_pos.increase_col(1);
213 tok = Token::None;
214 }
215 }
216
217 match &mut tok {
218 Token::Num(num) => {
219 *num = num_str.parse().expect("Number contains invalid characters");
220 tokens.push((tok, tok_start_pos.clone()));
221 }
222 Token::Decimal(num) => {
223 *num = num_str.parse().expect("Number contains invalid characters");
224 tokens.push((tok, tok_start_pos.clone()));
225 }
226 Token::None => {}
227 _ => tokens.push((tok, tok_start_pos.clone())),
228 }
229 Ok(tokens)
230}
231
/// A single lexical element.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// Placeholder meaning "no token"; rendered as `none` by [`Token::as_string`].
    None,
    /// Whitespace; rendered as a single space.
    Whitespace,
    /// `;`
    Semicolon,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `|`
    Pipe,
    /// `@`
    At,
    /// `!`
    Bang,
    /// `=`
    Equal,
    /// `%`
    Percent,
    /// `&`
    Ampersand,
    /// `~`
    Tilde,
    /// A variable; the payload is the name without the leading `$`.
    Variable(String),
    /// `{` or `}`
    Curly(Side),
    /// `[` or `]`
    Square(Side),
    /// `(` or `)`
    Paren(Side),
    /// `<` or `>`
    Angle(Side),
    /// A comment; the payload is the comment text.
    Comment(String),
    /// An identifier.
    Ident(String),
    /// An integer.
    Num(i128),
    /// A floating-point number.
    Decimal(f64),
    /// A string; the payload is the content without the surrounding quotes.
    Str(String),
}

impl Token {
    /// Renders the token as text.
    ///
    /// Fixed tokens yield their symbol. Variables are prefixed with `$`,
    /// comments with `# `, and strings are wrapped in double quotes without
    /// re-escaping their content.
    pub fn as_string(&self) -> String {
        let symbol: &str = match self {
            // Tokens carrying data build their text and return immediately.
            Token::Variable(name) => return format!("${name}"),
            Token::Comment(text) => return format!("# {text}"),
            Token::Ident(name) => return name.clone(),
            Token::Num(num) => return num.to_string(),
            Token::Decimal(num) => return num.to_string(),
            Token::Str(string) => return format!("\"{string}\""),

            Token::None => "none",
            Token::Whitespace => " ",
            Token::Semicolon => ";",
            Token::Colon => ":",
            Token::Comma => ",",
            Token::Pipe => "|",
            Token::At => "@",
            Token::Bang => "!",
            Token::Equal => "=",
            Token::Percent => "%",
            Token::Ampersand => "&",
            Token::Tilde => "~",
            Token::Curly(side) => match side {
                Side::Left => "{",
                Side::Right => "}",
            },
            Token::Square(side) => match side {
                Side::Left => "[",
                Side::Right => "]",
            },
            Token::Paren(side) => match side {
                Side::Left => "(",
                Side::Right => ")",
            },
            Token::Angle(side) => match side {
                Side::Left => "<",
                Side::Right => ">",
            },
        };
        symbol.to_owned()
    }

    /// Returns `true` for [`Token::None`], comments and whitespace.
    pub fn is_ignored(&self) -> bool {
        match self {
            Token::None | Token::Comment(..) | Token::Whitespace => true,
            _ => false,
        }
    }
}

/// Which half of a bracket pair a token is: opening (`Left`) or closing (`Right`).
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Side {
    /// The opening bracket.
    Left,
    /// The closing bracket.
    Right,
}
329
/// A location in source text, stored as `(row, column, absolute offset)`.
#[derive(PartialEq, Eq, Clone)]
pub struct TextPos(usize, usize, usize);

impl Debug for TextPos {
    /// Writes all three components as `(row:col:absolute)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(row, col, abs) = self;
        write!(f, "({row}:{col}:{abs})")
    }
}

impl Display for TextPos {
    /// Writes only the row and column as `(row:col)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(row, col, _) = self;
        write!(f, "({row}:{col})")
    }
}

impl TextPos {
    /// Builds a position from its row, column and absolute offset.
    pub fn new(row: usize, col: usize, abs: usize) -> Self {
        TextPos(row, col, abs)
    }

    /// The row component.
    pub fn row(&self) -> &usize {
        let Self(row, _, _) = self;
        row
    }

    /// The column component.
    pub fn col(&self) -> &usize {
        let Self(_, col, _) = self;
        col
    }

    /// The absolute-offset component.
    pub fn absolute(&self) -> &usize {
        let Self(_, _, abs) = self;
        abs
    }

    /// Moves the position forward by `amt` on the same row, advancing the
    /// column and the absolute offset together.
    pub fn increase_col(&mut self, amt: usize) {
        let Self(_, col, abs) = self;
        *col += amt;
        *abs += amt;
    }
}
373
374pub type TokenAndPos = (Token, TextPos);
376
/// What a character read inside a string literal means for the literal.
#[derive(PartialEq, Debug)]
enum StrLexResult {
    /// The character is part of the string content.
    Append,
    /// The character is a backslash that escapes the next character.
    Escape,
    /// The character is the closing quote.
    End,
}

/// Classifies `c`, a character read inside a string literal.
///
/// `escape` tells whether the previous character was an escaping backslash;
/// if so, `c` is always content. Otherwise `"` ends the string, `\` starts an
/// escape, and anything else is content.
fn lex_string_char(c: char, escape: bool) -> StrLexResult {
    match (escape, c) {
        (false, '"') => StrLexResult::End,
        (false, '\\') => StrLexResult::Escape,
        _ => StrLexResult::Append,
    }
}
397
/// Returns `true` when `c` is whitespace according to [`char::is_whitespace`].
fn is_whitespace(c: char) -> bool {
    char::is_whitespace(c)
}
401
/// Returns `true` when `c` may appear in an identifier.
///
/// Alphanumeric characters and `_` are accepted, except that a numeric
/// character is rejected when `first` is set (the identifier's first slot).
fn is_ident(c: char, first: bool) -> bool {
    let numeric_in_first_slot = first && c.is_numeric();
    !numeric_in_first_slot && (c == '_' || c.is_alphanumeric())
}
409
/// Returns `true` when `c` may appear in an integer literal.
///
/// ASCII digits are always accepted; a minus sign is accepted only when
/// `first` is set, i.e. as the literal's first character.
///
/// Only ASCII digits qualify because the collected text is later parsed with
/// `str::parse`, which rejects other Unicode numeric characters.
fn is_num(c: char, first: bool) -> bool {
    c.is_ascii_digit() || (first && c == '-')
}
418
/// Returns `true` when `c` may appear in a decimal literal.
///
/// * ASCII digits are always accepted.
/// * `-` is accepted only when `first` is set.
/// * `.` is accepted when `first` is set, or when `after_decimal` is not set
///   (no decimal point has been seen yet).
///
/// Only ASCII digits qualify because the collected text is later parsed with
/// `str::parse`, which rejects other Unicode numeric characters.
fn is_decimal(c: char, first: bool, after_decimal: bool) -> bool {
    match c {
        '0'..='9' => true,
        '-' => first,
        '.' => first || !after_decimal,
        _ => false,
    }
}
429
430pub fn reduce_tokens<'a, T: Iterator<Item = &'a TokenAndPos>>(
432 tokens: T,
433) -> impl Iterator<Item = &'a TokenAndPos> {
434 tokens.filter(|(tok, ..)| !tok.is_ignored())
435}
436
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `text` and asserts that the produced tokens, ignoring their
    /// positions, are exactly `expected`.
    #[track_caller]
    fn assert_tokens(text: &str, expected: Vec<Token>) {
        let lexed = match lex(text) {
            Ok(lexed) => lexed,
            Err(e) => panic!("lexing {text:?} failed: {e}"),
        };
        let actual: Vec<Token> = lexed.into_iter().map(|(tok, _)| tok).collect();
        assert_eq!(actual, expected);
    }

    /// Lexes `text` and asserts that the `(row, column)` of every produced
    /// token is exactly `expected`.
    #[track_caller]
    fn assert_token_positions(text: &str, expected: &[(usize, usize)]) {
        let lexed = match lex(text) {
            Ok(lexed) => lexed,
            Err(e) => panic!("lexing {text:?} failed: {e}"),
        };
        let actual: Vec<(usize, usize)> = lexed
            .iter()
            .map(|(_, pos)| (*pos.row(), *pos.col()))
            .collect();
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_chars() {
        assert!(is_ident('a', false));
        assert!(is_ident('a', true));
        assert!(is_ident('B', false));
        assert!(is_ident('B', true));
        assert!(is_ident('_', false));
        assert!(is_ident('_', true));

        assert!(is_ident('5', false));
        assert!(!is_ident('2', true));

        assert!(is_num('8', false));
        assert!(is_num('8', true));
        assert!(!is_num('t', false));
        assert!(!is_num('t', true));
        assert!(!is_num('.', false));
        assert!(!is_num('.', true));
        assert!(is_num('-', true));
        assert!(!is_num('-', false));

        assert!(is_whitespace(' '));
        assert!(is_whitespace('\n'));
        assert!(!is_whitespace('a'));
        assert!(!is_whitespace('%'));
    }

    #[test]
    fn test_semicolon() {
        assert_tokens(";;", vec![Token::Semicolon, Token::Semicolon]);
    }

    #[test]
    fn test_string_chars() {
        assert_eq!(lex_string_char('d', false), StrLexResult::Append);
        assert_eq!(lex_string_char('\'', false), StrLexResult::Append);
        assert_eq!(lex_string_char('"', false), StrLexResult::End);
        assert_eq!(lex_string_char('"', true), StrLexResult::Append);
        assert_eq!(lex_string_char('\\', false), StrLexResult::Escape);
        assert_eq!(lex_string_char('\\', true), StrLexResult::Append);
    }

    #[test]
    fn test_string() {
        assert_tokens("\"Hello\"", vec![Token::Str("Hello".into())]);
    }

    #[test]
    fn test_combo() {
        assert_tokens(
            "\"Uno\"; \"Dos\"; \"Tres\"; Identifier",
            vec![
                Token::Str("Uno".into()),
                Token::Semicolon,
                Token::Whitespace,
                Token::Str("Dos".into()),
                Token::Semicolon,
                Token::Whitespace,
                Token::Str("Tres".into()),
                Token::Semicolon,
                Token::Whitespace,
                Token::Ident("Identifier".into()),
            ],
        );
    }

    #[test]
    fn test_all() {
        assert_tokens(
            "\"Hello\"; ident{}@routine[]$var():-1000,|# comment",
            vec![
                Token::Str("Hello".into()),
                Token::Semicolon,
                Token::Whitespace,
                Token::Ident("ident".into()),
                Token::Curly(Side::Left),
                Token::Curly(Side::Right),
                Token::At,
                Token::Ident("routine".into()),
                Token::Square(Side::Left),
                Token::Square(Side::Right),
                Token::Variable("var".into()),
                Token::Paren(Side::Left),
                Token::Paren(Side::Right),
                Token::Colon,
                Token::Num(-1000),
                Token::Comma,
                Token::Pipe,
                Token::Comment(" comment".into()),
            ],
        );
    }

    #[test]
    fn test_comment() {
        assert_tokens(
            "\"Foo\" # Comment\n \"Bar\"",
            vec![
                Token::Str("Foo".into()),
                Token::Whitespace,
                Token::Comment(" Comment".into()),
                Token::Whitespace,
                Token::Str("Bar".into()),
            ],
        );
    }

    #[test]
    fn test_num() {
        assert_tokens(
            "12345;888;0;-10",
            vec![
                Token::Num(12345),
                Token::Semicolon,
                Token::Num(888),
                Token::Semicolon,
                Token::Num(0),
                Token::Semicolon,
                Token::Num(-10),
            ],
        );
    }

    #[test]
    fn test_decimal() {
        assert_tokens(
            "12345;88.0,-73.5,-0.03",
            vec![
                Token::Num(12345),
                Token::Semicolon,
                Token::Decimal(88.0),
                Token::Comma,
                Token::Decimal(-73.5),
                Token::Comma,
                Token::Decimal(-0.03),
            ],
        );
    }

    #[test]
    fn test_token_pos_simple() {
        assert_token_positions(
            "hello;world!!\nwhy\n\"where\";",
            &[
                (1, 0),
                (1, 5),
                (1, 6),
                (1, 11),
                (1, 12),
                (1, 13),
                (2, 0),
                (2, 3),
                (3, 0),
                (3, 7),
            ],
        );
    }
}