token_iter/lib.rs
#![no_std]

//! This crate makes it easier to write tokenizers for textual languages.
//! A tokenizer takes a sequence of characters as input and produces a sequence
//! of "tokens" -- instances of some type that categorizes groups of characters.
//! For example, the text "foo < 10" might be tokenized into three tokens representing
//! the identifier "foo", the less-than symbol, and the integer 10.
//!
//! Note that a sequence of characters that is considered a unit is called
//! a "lexeme," and the lexeme gets converted to a token (typically an enum value).
//! For example, the lexeme "while" might be mapped to a Token::While enum value,
//! and "731" to Token::Int(731).
//!
//! This library was designed with the following principles in mind:
//! 1. The library should NOT define tokens, or place constraints on how they are represented.
//!    The library's token type should be an opaque generic parameter, with no type constraints.
//! 2. The library should NOT have any ideas about how the text is interpreted,
//!    beyond that a lexeme may not span lines. For example, the library should not
//!    assume that whitespace is insignificant, have any notion of identifiers
//!    or numbers, or limit line length.
//! 3. The logic deciding what constitutes a lexeme should be expressed in Rust code
//!    (rather than, for example, regular expressions).
//! 4. The API should give you an Iterator over the tokens in an &str,
//!    or in an Iterator of &str's.
//! 5. The library should automatically add the line number and column range for a token.
//! 6. The library should be very time-efficient.
//! 7. The library should be no_std and have no dependencies.
//!    In particular, it should not allocate heap memory for lexemes,
//!    instead yielding &str's of substrings of the input.
//! 8. Invalid tokens are tokens, not errors. Tokenization shouldn't stop just because
//!    it doesn't recognize something. And you want the same line/column info for bad tokens
//!    as you do for good ones. So the tokenizer just produces tokens, not Results that
//!    contain either a token or an error.
//! 9. The library should work with multibyte characters.
//! 10. The API should support a functional programming style.
//!
//! To use the library, you must
//! 1. define a type for your tokens (typically, but not necessarily, an enum)
//! 2. write a tokenizer function that uses a Lexer (a type provided by this library)
//!    to recognize lexemes and convert them to tokens
//! 3. call either\
//!    `tokens_in_line(line, &tokenizer)`, if you know your code is a single line, or\
//!    `tokens_in(line_iter, &tokenizer)`, to get line numbers with the tokens
//!
//! Here is a [sequence diagram](https://www.websequencediagrams.com/cgi-bin/cdraw?lz=dGl0bGUgVG9rZW5pemluZyBhIGxpbmUgd2l0aCB0aGUgdG9rZW4taXRlciBjcmF0ZSdzAA0Gc19pbl8AJgVmdW5jdGlvbgpwYXJ0aWNpcGFudCBZT1VSIFBST0dSQU0gYXMgUAARDQAzD2FzIFQAMQ1hbiBJdGVyYXRvciBvdmVyXG4oY29sdW1uIHJhbmdlLACBEAYpIGFzIEkAZRJUT0tFTklaRVIgRm5cbnRoYXQgcmVjb2duaXplcyBsZXhlbWVzXG4gYW5kIGNyZWF0ZQCBTAggYXMgRgB8DlxuTGV4ZXIgYXMgTQpQIC0-KyBUOgCBdA8obGluZSwgJgCCJwVpemVyKQpUIC0-LSBQOgCBIyhcbmZvciAAgScFAIJ7BW9mIGNvZGUAawdJOiBuZXh0KCkKSQB9BUY6IGNhbGxzIHlvdXIAgWEFAIMrBWEgAIEnBQpGAIEiBU06IHZhcmlvdXMAJwZcbnRvIGZpbmQgYQCBfgcKTQCBIQVGOiAAKQphbmQgcGVyaGFwc1xuZ2V0IGl0cyB0ZXh0ACMKADYHRgCBXgVJOiBTb21lKACDDAYAgRkHTTogd2hlcmUgd2FzAIFQBT8AZAdJOiAAgz0MIG9mAIEDCEkAgikIAEkFAINaFSkAggQKa2VlcACBUQZpbmcAghUHIHVudGlsIC4uLgCBdy9uAIFRDU5vbgCBUQsACQUAgRMJABcFCg&s=earth)
//! showing how `tokens_in_line` works with your tokenizer function to generate a token.
//! `tokens_in` runs `tokens_in_line` on each line, flattening the results and adding line numbers.
//!
//! And here is a fairly beefy example:
//! ```rust
//! use token_iter::*;
//!
//! // A type for the tokens.
//! #[derive(Clone, Debug, PartialEq)]
//! enum Token {
//!     LT, LE,              // < <=
//!     GT, GE,              // > >=
//!     EqEq, EQ,            // == =
//!     LCurly, RCurly,      // { }
//!     While, If,
//!     Identifier(String),  // e.g. foo12
//!     Int(u64),            // e.g. 273
//!     BadInt(String),      // e.g. 9873487239482398477132498723423987234
//!     Unrecognized(char)
//! }
//!
//! // Produces a token, using a Lexer to examine the characters.
//! fn tokenizer(lx: &mut Lexer) -> Option<Token> {
//!     use Token::*;
//!     let is_digit = |c| char::is_ascii_digit(&c);
//!     Some(
//!         match lx.skip_while(char::is_whitespace).next()? {
//!             '<' => if lx.at('=') {LE} else {LT},
//!             '>' => if lx.at('=') {GE} else {GT},
//!             '=' => if lx.at('=') {EqEq} else {EQ},
//!             '{' => LCurly,
//!             '}' => RCurly,
//!             c if c.is_alphabetic() =>
//!                 match lx.take_while(char::is_alphanumeric).get() {
//!                     "while" => While,
//!                     "if" => If,
//!                     s => Identifier(s.into())
//!                 },
//!             c if is_digit(c) =>
//!                 lx.take_while(is_digit).map( |s|
//!                     if let Ok(n) = s.parse::<u64>() {
//!                         Int(n)
//!                     } else {
//!                         BadInt(s.into())
//!                     }
//!                 ),
//!             c => Unrecognized(c)
//!         }
//!     )
//! }
//!
//! fn main() {
//!     let code = r#"
//!         if foo > bar {
//!             foo = 1
//!         }
//!     "#;
//!     for (line_num, col_range, token) in tokens_in(code.lines(), &tokenizer) {
//!         println!("On line {line_num} at columns {col_range:?}: {token:?}");
//!     }
//! }
//!
//! // On line 1 at columns 8..10: If
//! // On line 1 at columns 11..14: Identifier("foo")
//! // On line 1 at columns 15..16: GT
//! // On line 1 at columns 17..20: Identifier("bar")
//! // On line 1 at columns 21..22: LCurly
//! // On line 2 at columns 12..15: Identifier("foo")
//! // On line 2 at columns 16..17: EQ
//! // On line 2 at columns 18..19: Int(1)
//! // On line 3 at columns 8..9: RCurly
//! ```

use core::ops::Range;
use core::str::CharIndices;

/// Returns an Iterator over the tokens found in the specified line,
/// along with their column ranges. Column numbers start at 0.
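///
/// For example, here is a minimal sketch; the `word` tokenizer and its use of
/// String as the token type are just for illustration, not part of this crate:
/// ```rust
/// use token_iter::*;
///
/// // Yields each whitespace-separated word as a String token.
/// fn word(lx: &mut Lexer) -> Option<String> {
///     lx.skip_while(char::is_whitespace).next()?;
///     Some(lx.take_while(|c| !c.is_whitespace()).into())
/// }
///
/// let tokens: Vec<_> = tokens_in_line("foo bar", &word).collect();
/// assert_eq!(tokens, vec![(0..3, "foo".to_string()), (4..7, "bar".to_string())]);
/// ```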
pub fn tokens_in_line<'a, Str, Token: 'a, Tokenizer>(
    line: Str,
    tokenizer: &'a Tokenizer,
) -> impl Iterator<Item = (Range<usize>, Token)> + 'a
where
    Str: Into<&'a str>,
    Tokenizer: Fn(&mut Lexer<'a>) -> Option<Token>,
{
    let line = line.into();
    let mut chars = line.char_indices();
    let next = chars.next();
    StrTokenIterator::<'a, Token, Tokenizer> {
        lexer: Lexer::<'a> {
            line,
            chars,
            current: next,
            column: 0,
            start_ix: 0,
            start_column: 0,
        },
        tokenizer,
    }
}

/// Returns an Iterator over the tokens found in the specified lines,
/// along with their line numbers and column ranges (both of which start at 0).
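///
/// For example, a minimal sketch that reuses the illustrative `word` tokenizer
/// shown for `tokens_in_line` (it is not part of this crate):
/// ```rust
/// use token_iter::*;
///
/// fn word(lx: &mut Lexer) -> Option<String> {
///     lx.skip_while(char::is_whitespace).next()?;
///     Some(lx.take_while(|c| !c.is_whitespace()).into())
/// }
///
/// let text = "if x\n  y";
/// let tokens: Vec<_> = tokens_in(text.lines(), &word).collect();
/// assert_eq!(tokens, vec![
///     (0, 0..2, "if".to_string()),
///     (0, 3..4, "x".to_string()),
///     (1, 2..3, "y".to_string()),
/// ]);
/// ```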
pub fn tokens_in<'a, Token: 'a, Tokenizer, StrIter, Str>(
    iter: StrIter,
    tokenizer: &'a Tokenizer,
) -> impl Iterator<Item = (usize, Range<usize>, Token)> + 'a
where
    StrIter: Iterator<Item = Str> + 'a,
    Str: Into<&'a str> + 'a,
    Tokenizer: Fn(&mut Lexer<'a>) -> Option<Token>,
{
    iter.enumerate().flat_map(|(line_num, line)| {
        tokens_in_line(line, tokenizer)
            .map(move |(column_range, token)| (line_num, column_range, token))
    })
}

///// StrTokenIterator /////////////////////////

struct StrTokenIterator<'a, Token, Tokenizer>
where
    Tokenizer: Fn(&mut Lexer<'a>) -> Option<Token>,
{
    lexer: Lexer<'a>,
    tokenizer: &'a Tokenizer,
}

impl<'a, Token, Tokenizer> Iterator for StrTokenIterator<'a, Token, Tokenizer>
where
    Tokenizer: Fn(&mut Lexer<'a>) -> Option<Token>,
{
    type Item = (Range<usize>, Token);

    fn next(&mut self) -> Option<Self::Item> {
        self.lexer.current.and_then(|_| {
            (self.tokenizer)(self.lexer.mark_start())
                .map(|token| (self.lexer.column_range(), token))
        })
    }
}

///// Lexer /////////////////////////

/// The tokenizer you write will be passed a Lexer as an argument,
/// and will call methods on it to figure out what the next lexeme is
/// and, if needed, get its text. The Lexer keeps track of the
/// column range of the lexeme.
pub struct Lexer<'a> {
    line: &'a str,
    chars: CharIndices<'a>,
    current: Option<(usize, char)>,
    column: usize,
    start_ix: usize,
    start_column: usize,
}

impl<'a> Lexer<'a> {
    /// Adds the current character to the lexeme and advances to the next.
    #[inline]
    fn advance(&mut self) {
        if self.current.is_some() {
            self.current = self.chars.next();
            self.column += 1;
        }
    }

    /// Makes the next char the start of the lexeme.
    fn mark_start(&mut self) -> &mut Self {
        self.start_column = self.column;
        self.start_ix = if let Some((start_ix, _)) = self.current {
            start_ix
        } else {
            self.line.len()
        };
        self
    }

    /// Returns the next character and adds it to the lexeme.
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> Option<char> {
        self.current.map(|(_, c)| {
            self.advance();
            c
        })
    }

    /// Returns the next character without advancing.
    pub fn peek(&self) -> Option<char> {
        self.current.map(|(_, c)| c)
    }

    /// If the next character is c (the argument, not the letter),
    /// adds c to the lexeme and returns true. Otherwise just returns false.
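    ///
    /// For example, a minimal sketch; the `tok` function and its &str tokens are
    /// just for illustration:
    /// ```rust
    /// use token_iter::*;
    ///
    /// fn tok(lx: &mut Lexer) -> Option<&'static str> {
    ///     Some(match lx.skip_while(char::is_whitespace).next()? {
    ///         '<' => if lx.at('=') { "LE" } else { "LT" },
    ///         _ => "other",
    ///     })
    /// }
    ///
    /// let tokens: Vec<_> = tokens_in_line("<= <", &tok).map(|(_, t)| t).collect();
    /// assert_eq!(tokens, vec!["LE", "LT"]);
    /// ```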
    pub fn at(&mut self, c: char) -> bool {
        if let Some((_, next_c)) = self.current {
            if next_c == c {
                self.advance();
                return true;
            }
        }
        false
    }

    /// Keeps adding characters to the lexeme while f(char) is true.
    /// Returns self to allow chaining.
    pub fn take_while(&mut self, f: impl Fn(char) -> bool) -> &mut Self {
        while let Some((_, c)) = self.current {
            if !f(c) {
                break;
            }
            self.advance();
        }
        self
    }

    /// Skips over characters while f(char) is true;
    /// the lexeme will start at the char where f(char) is false.
    /// Returns self to allow chaining.
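    ///
    /// For example, a minimal sketch showing that skipped characters are excluded
    /// from the lexeme and its column range (the `word` tokenizer is just for illustration):
    /// ```rust
    /// use token_iter::*;
    ///
    /// fn word(lx: &mut Lexer) -> Option<String> {
    ///     lx.skip_while(char::is_whitespace).next()?;
    ///     Some(lx.take_while(|c| !c.is_whitespace()).into())
    /// }
    ///
    /// let (cols, tok) = tokens_in_line("   abc", &word).next().unwrap();
    /// assert_eq!((cols, tok.as_str()), (3..6, "abc"));
    /// ```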
    pub fn skip_while(&mut self, f: impl Fn(char) -> bool) -> &mut Self {
        self.take_while(f);
        self.mark_start();
        self
    }

    /// Returns the lexeme as an &str.
    pub fn get(&self) -> &'a str {
        &self.line[self.str_range()]
    }

    /// Returns the lexeme converted to the needed type (commonly a String).
    /// This is just a shortcut for .get().into().
    pub fn into<T: From<&'a str>>(&mut self) -> T {
        T::from(self.get())
    }

    /// Returns the result of applying f() to the lexeme.
    pub fn map<T>(&mut self, f: impl Fn(&'a str) -> T) -> T {
        f(self.get())
    }

    /// Gets the range of columns associated with the lexeme.
    /// You probably won't need to use this, since the column range is included
    /// in the Iterator's output, but it could be useful if the interpretation
    /// of lexemes depends upon the columns in which you find them.
    pub fn column_range(&self) -> Range<usize> {
        self.start_column..self.column
    }

    /// Gets the byte range of the line associated with the lexeme.
    /// You probably won't need to use this, since .get() gives you the &str
    /// for the lexeme, but it could be useful if the interpretation of lexemes
    /// depends on the surrounding text.
    pub fn str_range(&self) -> Range<usize> {
        self.start_ix..self.current.map_or_else(|| self.line.len(), |(ix, _)| ix)
    }

    /// Returns the line we are currently tokenizing.
    /// You probably won't need to use this, but it could be useful if the
    /// interpretation of lexemes depends on the surrounding text.
    pub fn line(&self) -> &str {
        self.line
    }
} // impl Lexer