sylt_tokenizer/
tokenizer.rs

1use logos::Logos;
2use std::{fs, path::Path};
3pub use token::Token;
4
5mod token;
6
/// A location in a file containing source code.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, Default)]
pub struct Span {
    // TODO(ed): Do this more intelligent, so
    // we can show ranges. Maybe even go back
    // to offsets from start of the file.
    /// The line this span is on; real tokens start at line 1,
    /// 0 is reserved for the zero span.
    pub line: usize,
    /// The first column that this Span contains.
    pub col_start: usize,
    /// The first column that this Span doesn't contain.
    pub col_end: usize,
}
19
/// A `Span` pointing at nothing. Line and columns are all 0, whereas
/// spans produced by the tokenizer start at line 1, column 1.
pub static ZERO_SPAN: Span = Span {
    line: 0,
    col_start: 0,
    col_end: 0,
};
25
26impl Span {
27    pub fn zero() -> Self {
28        Self {
29            line: 0,
30            col_start: 0,
31            col_end: 0,
32        }
33    }
34}
35
#[derive(Debug, PartialEq)]
/// A [`Token`] paired with the [`Span`] it was lexed from.
pub struct PlacedToken {
    // The lexed token itself.
    pub token: Token,
    // Where in the source the token appeared.
    pub span: Span,
}
41
42pub fn string_to_tokens(content: &str) -> Vec<PlacedToken> {
43    // A list containing which char index a specific byte index is at.
44    //
45    // Since &str contains UTF-8, a byte offset (which is what the lexer gives
46    // us) won't necessarily match up with the char index. For example, given
47    // the string "123", '3' has both byte index 3 and char index 3. However, in
48    // the string "ä23", '3' has char index 3 as before, but byte index 4 since
49    // 'ä' contains two bytes.
50    //
51    // This list ensures that the byte offset the lexer gives us can be matched
52    // with a char index. None means that the byte offset points inside a char
53    // which should not be possible.
54    let mut char_at_byte = vec![None; content.len()];
55    for (i, (pos, _)) in content.char_indices().enumerate() {
56        char_at_byte[pos] = Some(i + 1);
57    }
58    // Push a last value since the byte offset end is exclusive.
59    char_at_byte.push(Some(content.chars().count() + 1));
60
61    // We also need to keep track of the current line and which char index the
62    // previous newline was at. Given a char index we can then subtract the last
63    // newline char index and get the column in the current line of the char.
64    let mut line = 1;
65    let mut last_newline = 0;
66
67    Token::lexer(&content)
68        .spanned()
69        // Contains side-effects.
70        .map(|(token, byte_range)| {
71            let is_newline = token == Token::Newline;
72            let col_start = char_at_byte[byte_range.start].unwrap() - last_newline;
73            let col_end = char_at_byte[byte_range.end].unwrap() - last_newline;
74            let placed_token = PlacedToken {
75                token,
76                span: Span {
77                    line,
78                    col_start,
79                    col_end,
80                },
81            };
82            if is_newline {
83                last_newline = char_at_byte[byte_range.start].unwrap();
84                line += 1;
85            }
86            placed_token
87        })
88        .collect()
89}
90
91pub fn file_to_tokens(file: &Path) -> Result<Vec<PlacedToken>, std::io::Error> {
92    Ok(string_to_tokens(&fs::read_to_string(file)?))
93}
94
#[cfg(test)]
mod tests {
    use crate::{Token, string_to_tokens};
    use logos::Logos;

    /// Lexes a string into a plain list of tokens (no spans).
    fn lex(s: &str) -> Vec<Token> {
        Token::lexer(s).collect()
    }

    /// Lexes a string that must contain exactly one token and returns it.
    /// Panics if the input lexes to zero or to more than one token.
    fn lex_once(s: &str) -> Token {
        let mut lexer = Token::lexer(s);
        let res = lexer.next().unwrap();
        assert_eq!(lexer.next(), None);
        res
    }

    /// Element-wise equality. Takes `&[T]` rather than `&Vec<T>` (clippy
    /// `ptr_arg`); `&Vec` arguments coerce to slices at the call sites.
    fn vecs_match<T: PartialEq>(a: &[T], b: &[T]) -> bool {
        a == b
    }

    /// Asserts that a `Vec<PlacedToken>` matches a list of
    /// `(token, line, col_range)` triples, with a readable panic message.
    macro_rules! assert_placed_eq {
        ($a:expr, $( ($token:expr, $line:expr, $range:expr) ),+ $(,)? ) => {
            let a = $a;
            let b = vec![ $(
                $crate::PlacedToken {
                    token: $token,
                    span: $crate::Span {
                        line: $line,
                        col_start: $range.start,
                        col_end: $range.end,
                    }
                }
            ),*];
            if !vecs_match(&a, &b) {
                panic!("\n{:?}\ndoes not match\n{:?}", a, b);
            }
        };
    }

    #[test]
    fn simple_span() {
        assert_placed_eq!(
            string_to_tokens("1"),
            (Token::Int(1), 1, 1..2),
        );
        assert_placed_eq!(
            string_to_tokens("1\n"),
            (Token::Int(1),  1, 1..2),
            (Token::Newline, 1, 2..3),
        );
        assert_placed_eq!(
            string_to_tokens("1\n23\n456"),
            (Token::Int(1),   1, 1..2),
            (Token::Newline,  1, 2..3),
            (Token::Int(23),  2, 1..3),
            (Token::Newline,  2, 3..4),
            (Token::Int(456), 3, 1..4),
        );
    }

    #[test]
    fn span_with_non_ascii() {
        // The 'ö' is an error but we want to check that its span is a single char.
        assert_placed_eq!(
            string_to_tokens("wow\nwöw\n"),
            (Token::Identifier(String::from("wow")), 1, 1..4),
            (Token::Newline,                         1, 4..5),

            (Token::Identifier(String::from("w")),   2, 1..2),
            (Token::Error,                           2, 2..3),
            (Token::Identifier(String::from("w")),   2, 3..4),
            (Token::Newline,                         2, 4..5),
        );
    }

    #[test]
    fn test_lex_once() {
        lex_once("1");
    }

    #[test]
    #[should_panic]
    fn test_lex_once_panic() {
        lex_once("1 2");
    }

    #[test]
    fn number() {
        assert_eq!(lex_once("1"), Token::Int(1));
        assert_eq!(lex_once("1.1"), Token::Float(1.1));
        assert_eq!(lex_once("123"), Token::Int(123));
        assert_eq!(lex_once(".1"), Token::Float(0.1));
        assert_eq!(lex_once("1."), Token::Float(1.0));
    }

    #[test]
    fn identifiers() {
        let ident_cmp = |s| assert_eq!(lex_once(s), Token::Identifier(String::from(s)));
        ident_cmp("a");
        ident_cmp("aaaaaaaa");
        ident_cmp("a1");
        ident_cmp("a_");
        ident_cmp("_a");
        ident_cmp("__");
    }

    #[test]
    fn whitespace() {
        lex_once("1 ");
        lex_once(" 1");
        lex_once(" 1 ");

        assert_eq!(lex("1 2").len(), 2);
        assert_eq!(lex("1\t2").len(), 2);
        assert_eq!(lex("1             2").len(), 2);
        assert_eq!(lex("\t1   \t  \t\t     2\t").len(), 2);
    }

    #[test]
    fn comment() {
        assert_eq!(lex("// a\n1").len(), 2);
        assert_eq!(lex("1// a\n2").len(), 3);
        assert_eq!(lex("1\n// a\n2").len(), 4); // newline is also a token
    }
}
223}