Skip to main content

mical_cli_lexer/
lib.rs

1use mical_cli_syntax::token::{TokenKind::*, *};
2use std::iter;
3
4mod cursor;
5use cursor::Cursor;
6
7struct TokenStreamImpl<'src, I: Iterator<Item = Token>> {
8    source: &'src str,
9    iter: I,
10}
11
12impl<I: Iterator<Item = Token>> Iterator for TokenStreamImpl<'_, I> {
13    type Item = Token;
14    fn next(&mut self) -> Option<Self::Item> {
15        self.iter.next()
16    }
17}
18impl<'s, I: Iterator<Item = Token>> TokenStream<'s> for TokenStreamImpl<'s, I> {
19    fn source(&self) -> &'s str {
20        self.source
21    }
22}
23
24pub fn tokenize(source: &str) -> impl TokenStream<'_> {
25    let mut cursor = Cursor::new(source);
26    TokenStreamImpl { source, iter: iter::from_fn(move || advance_token(&mut cursor)) }
27}
28
29fn advance_token(cursor: &mut Cursor) -> Option<Token> {
30    let kind = match cursor.next()? {
31        't' => true_or_word(cursor),
32        'f' => false_or_word(cursor),
33        '\t' => {
34            cursor.eat_while(|c| c == '\t');
35            Tab
36        }
37        '\n' => Newline,
38        '\r' => {
39            if let Some('\n') = cursor.peek() {
40                cursor.next();
41            }
42            Newline
43        }
44        ' ' => {
45            cursor.eat_while(|c| c == ' ');
46            Space
47        }
48        '}' => CloseBrace,
49        '>' => Greater,
50        '-' => Minus,
51        '{' => OpenBrace,
52        '|' => Pipe,
53        '+' => Plus,
54        '#' => Sharp,
55        '"' => string::<'"'>(cursor),
56        '\'' => string::<'\''>(cursor),
57        c @ '0'..='9' => integer_or_word(cursor, c),
58        _ => word(cursor),
59    };
60    let token = cursor.bump(kind);
61    Some(token)
62}
63
64fn true_or_word(cursor: &mut Cursor) -> TokenKind {
65    debug_assert!(cursor.prev() == 't');
66    if let Some('r') = cursor.peek() {
67        cursor.next();
68        if let Some('u') = cursor.peek() {
69            cursor.next();
70            if let Some('e') = cursor.peek() {
71                cursor.next();
72                return True;
73            }
74        }
75    }
76    word(cursor)
77}
78
79fn false_or_word(cursor: &mut Cursor) -> TokenKind {
80    debug_assert!(cursor.prev() == 'f');
81    if let Some('a') = cursor.peek() {
82        cursor.next();
83        if let Some('l') = cursor.peek() {
84            cursor.next();
85            if let Some('s') = cursor.peek() {
86                cursor.next();
87                if let Some('e') = cursor.peek() {
88                    cursor.next();
89                    return False;
90                }
91            }
92        }
93    }
94    word(cursor)
95}
96
97fn string<const Q: char>(cursor: &mut Cursor) -> TokenKind {
98    const { assert!(Q == '"' || Q == '\'') };
99    debug_assert!(cursor.prev() == Q);
100
101    let mut terminated = false;
102    while let Some(c) = cursor.peek() {
103        match c {
104            '\\' => {
105                cursor.next();
106                let peek = cursor.peek();
107                if peek == Some(Q) || peek == Some('\\') {
108                    cursor.next();
109                }
110            }
111            '\n' | '\r' => {
112                break;
113            }
114            q if q == Q => {
115                terminated = true;
116                cursor.next();
117                break;
118            }
119            _ => {
120                cursor.next();
121            }
122        }
123    }
124    String {
125        is_terminated: terminated,
126        quote: const {
127            match Q {
128                '"' => Quote::Double,
129                '\'' => Quote::Single,
130                _ => unreachable!(),
131            }
132        },
133    }
134}
135
136fn integer_or_word(cursor: &mut Cursor, first_digit: char) -> TokenKind {
137    debug_assert!(first_digit.is_ascii_digit()); // 0..=9
138    fn eat_decimal_digits(cursor: &mut Cursor) -> bool {
139        let mut has_digits = false;
140        while let Some(c) = cursor.peek() {
141            match c {
142                '_' => (),
143                '0'..='9' => has_digits = true,
144                _ => break,
145            };
146            cursor.next();
147        }
148        has_digits
149    }
150    fn eat_hexadecimal_digits(cursor: &mut Cursor) -> bool {
151        let mut has_digits = false;
152        while let Some(c) = cursor.peek() {
153            match c {
154                '_' => (),
155                '0'..='9' | 'a'..='f' | 'A'..='F' => has_digits = true,
156                _ => break,
157            };
158            cursor.next();
159        }
160        has_digits
161    }
162    let mut radix = Radix::Decimal;
163    let has_digits = if first_digit == '0' {
164        match cursor.peek() {
165            Some('b') => {
166                radix = Radix::Binary;
167                cursor.next();
168                eat_decimal_digits(cursor)
169            }
170            Some('o') => {
171                radix = Radix::Octal;
172                cursor.next();
173                eat_decimal_digits(cursor)
174            }
175            Some('x') => {
176                radix = Radix::Hexadecimal;
177                cursor.next();
178                eat_hexadecimal_digits(cursor)
179            }
180            Some('0'..='9' | '_') => eat_decimal_digits(cursor),
181            _ => true, // single '0'
182        }
183    } else {
184        eat_decimal_digits(cursor);
185        true // first_digit itself is always a valid digit
186    };
187    match cursor.peek() {
188        Some('\t' | '\n' | ' ') | None => Numeral { radix, is_empty: !has_digits },
189        _ => word(cursor),
190    }
191}
192
193fn word(cursor: &mut Cursor) -> TokenKind {
194    cursor.eat_while(|c| !matches!(c, '\t' | '\n' | ' '));
195    Word
196}