token_iter/
lib.rs

#![no_std]

//! This crate makes it easier to write tokenizers for textual languages.
//! A tokenizer takes a sequence of characters as input and produces a sequence
//! of "tokens" -- instances of some type that categorizes groups of characters.
//! For example, the text "foo < 10" might be tokenized into three tokens representing
//! the identifier "foo", the less-than symbol, and the integer 10.
//!
//! Note that a sequence of characters that is considered a unit is called
//! a "lexeme," and the lexeme gets converted to a token (typically an enum value).
//! For example, the lexeme "while" might be mapped to a Token::While enum value,
//! and "731" to Token::Int(731).
//!
//! This library was designed with the following principles in mind:
//!  1. The library should NOT define tokens, or place constraints on how they are represented.
//!     The library's token type should be an opaque generic parameter, with no type constraints.
//!  2. The library should NOT have any ideas about how the text is interpreted,
//!     beyond the assumption that a lexeme cannot span lines.  For example, the library
//!     should not assume that whitespace is insignificant, have any notion of identifiers
//!     or numbers, or limit line length.
//!  3. The logic deciding what constitutes a lexeme should be expressed in Rust code
//!     (rather than, for example, regular expressions).
//!  4. The API should give you an Iterator over the tokens in an &str,
//!     or in an Iterator of &str's.
//!  5. The library should automatically add the line number and column range for a token.
//!  6. The library should be very time-efficient.
//!  7. The library should be no_std and have no dependencies.
//!     In particular, it should not allocate heap memory for lexemes,
//!     instead yielding &str's of substrings of the input.
//!  8. Invalid tokens are tokens, not errors.  Tokenization shouldn't stop just because
//!     it doesn't recognize something.  And you want the same line/column info for bad tokens
//!     as you do for good ones.  So the tokenizer just produces tokens, not Results that
//!     contain either a token or an error.
//!  9. The library should work with multibyte characters.
//! 10. The API should support a functional programming style.
//!
//! To use the library, you must
//!  1. define a type for your tokens (typically, but not necessarily, an enum)
//!  2. write a tokenizer function that uses a Lexer (a type provided by this library)
//!     to recognize lexemes and convert them to tokens
//!  3. call either\
//!     `tokens_in_line(line, &tokenizer)`, if you know your code is a single line, or\
//!     `tokens_in(line_iter, &tokenizer)`, to get line numbers with the tokens
//!
//! Here is a [sequence diagram](https://www.websequencediagrams.com/cgi-bin/cdraw?lz=dGl0bGUgVG9rZW5pemluZyBhIGxpbmUgd2l0aCB0aGUgdG9rZW4taXRlciBjcmF0ZSdzAA0Gc19pbl8AJgVmdW5jdGlvbgpwYXJ0aWNpcGFudCBZT1VSIFBST0dSQU0gYXMgUAARDQAzD2FzIFQAMQ1hbiBJdGVyYXRvciBvdmVyXG4oY29sdW1uIHJhbmdlLACBEAYpIGFzIEkAZRJUT0tFTklaRVIgRm5cbnRoYXQgcmVjb2duaXplcyBsZXhlbWVzXG4gYW5kIGNyZWF0ZQCBTAggYXMgRgB8DlxuTGV4ZXIgYXMgTQpQIC0-KyBUOgCBdA8obGluZSwgJgCCJwVpemVyKQpUIC0-LSBQOgCBIyhcbmZvciAAgScFAIJ7BW9mIGNvZGUAawdJOiBuZXh0KCkKSQB9BUY6IGNhbGxzIHlvdXIAgWEFAIMrBWEgAIEnBQpGAIEiBU06IHZhcmlvdXMAJwZcbnRvIGZpbmQgYQCBfgcKTQCBIQVGOiAAKQphbmQgcGVyaGFwc1xuZ2V0IGl0cyB0ZXh0ACMKADYHRgCBXgVJOiBTb21lKACDDAYAgRkHTTogd2hlcmUgd2FzAIFQBT8AZAdJOiAAgz0MIG9mAIEDCEkAgikIAEkFAINaFSkAggQKa2VlcACBUQZpbmcAghUHIHVudGlsIC4uLgCBdy9uAIFRDU5vbgCBUQsACQUAgRMJABcFCg&s=earth)
//! showing how `tokens_in_line` works with your tokenizer function to generate a token.
//! `tokens_in` runs `tokens_in_line` on each line, flattening the results and adding line numbers.
//!
//! And here is a fairly beefy example:
//! ```rust
//! use token_iter::*;
//!
//! // A type for the tokens.
//! #[derive(Clone, Debug, PartialEq)]
//! enum Token {
//!     LT, LE,                 // < <=
//!     GT, GE,                 // > >=
//!     EqEq, EQ,               // == =
//!     LCurly, RCurly,         // { }
//!     While, If,
//!     Identifier(String),     // e.g. foo12
//!     Int(u64),               // e.g. 273
//!     BadInt(String),         // e.g. 9873487239482398477132498723423987234
//!     Unrecognized(char)
//! }
//!
//! // Produces a token, using a Lexer to examine the characters.
//! fn tokenizer(lx: &mut Lexer) -> Option<Token> {
//!     use Token::*;
//!     let is_digit = |c| char::is_ascii_digit(&c);
//!     Some(
//!         match lx.skip_while(char::is_whitespace).next()? {
//!             '<' => if lx.at('=') {LE} else {LT},
//!             '>' => if lx.at('=') {GE} else {GT},
//!             '=' => if lx.at('=') {EqEq} else {EQ},
//!             '{' => LCurly,
//!             '}' => RCurly,
//!             c if c.is_alphabetic() =>
//!                    match lx.take_while(char::is_alphanumeric).get() {
//!                        "while" => While,
//!                        "if" => If,
//!                        s => Identifier(s.into())
//!                    },
//!             c if is_digit(c) =>
//!                    lx.take_while(is_digit).map( |s|
//!                        if let Ok(n) = s.parse::<u64>() {
//!                            Int(n)
//!                        } else {
//!                            BadInt(s.into())
//!                        }
//!                    ),
//!             c => Unrecognized(c)
//!         }
//!     )
//! }
//!
//! fn main() {
//!     let code = r#"
//!         if foo > bar {
//!             foo = 1
//!         }
//!     "#;
//!     for (line_num, col_range, token) in tokens_in(code.lines(), &tokenizer) {
//!         println!("On line {line_num} at columns {col_range:?}: {token:?}");
//!     }
//! }
//!
//! // On line 1 at columns 8..10: If
//! // On line 1 at columns 11..14: Identifier("foo")
//! // On line 1 at columns 15..16: GT
//! // On line 1 at columns 17..20: Identifier("bar")
//! // On line 1 at columns 21..22: LCurly
//! // On line 2 at columns 12..15: Identifier("foo")
//! // On line 2 at columns 16..17: EQ
//! // On line 2 at columns 18..19: Int(1)
//! // On line 3 at columns 8..9: RCurly
//! ```

use core::ops::Range;
use core::str::CharIndices;

/// Returns an Iterator over the tokens found in the specified line,
/// along with their column ranges.  Column numbers start at 0.
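///
/// For example (a minimal sketch; the `word` tokenizer below is purely illustrative
/// and not part of this crate), splitting a line into whitespace-separated words:
/// ```rust
/// use token_iter::*;
///
/// // Tokens here are just the words themselves, as Strings.
/// fn word(lx: &mut Lexer) -> Option<String> {
///     lx.skip_while(char::is_whitespace).next()?;         // find and consume the first char
///     Some(lx.take_while(|c| !c.is_whitespace()).into())  // extend the lexeme, then read it
/// }
///
/// let tokens: Vec<_> = tokens_in_line("foo < 10", &word).collect();
/// assert_eq!(tokens, vec![
///     (0..3, "foo".to_string()),
///     (4..5, "<".to_string()),
///     (6..8, "10".to_string()),
/// ]);
/// ```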
pub fn tokens_in_line<'a, Str, Token: 'a, Tokenizer>(
    line: Str,
    tokenizer: &'a Tokenizer,
) -> impl Iterator<Item = (Range<usize>, Token)> + 'a
where
    Str: Into<&'a str>,
    Tokenizer: Fn(&mut Lexer<'a>) -> Option<Token>,
{
    let line = line.into();
    let mut chars = line.char_indices();
    let next = chars.next();
    StrTokenIterator::<'a, Token, Tokenizer> {
        lexer: Lexer::<'a> {
            line,
            chars,
            current: next,
            column: 0,
            start_ix: 0,
            start_column: 0,
        },
        tokenizer,
    }
}

/// Returns an Iterator over the tokens found in the specified lines,
/// along with their line numbers and column ranges (both of which start at 0).
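///
/// A minimal sketch (the `word` tokenizer is purely illustrative and not part of
/// this crate), showing the (line number, column range, token) triples produced:
/// ```rust
/// use token_iter::*;
///
/// fn word(lx: &mut Lexer) -> Option<String> {
///     lx.skip_while(char::is_whitespace).next()?;
///     Some(lx.take_while(|c| !c.is_whitespace()).into())
/// }
///
/// let code = "foo\nbar baz";
/// let tokens: Vec<_> = tokens_in(code.lines(), &word).collect();
/// assert_eq!(tokens, vec![
///     (0, 0..3, "foo".to_string()),
///     (1, 0..3, "bar".to_string()),
///     (1, 4..7, "baz".to_string()),
/// ]);
/// ```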
pub fn tokens_in<'a, Token: 'a, Tokenizer, StrIter, Str>(
    iter: StrIter,
    tokenizer: &'a Tokenizer,
) -> impl Iterator<Item = (usize, Range<usize>, Token)> + 'a
where
    StrIter: Iterator<Item = Str> + 'a,
    Str: Into<&'a str> + 'a,
    Tokenizer: Fn(&mut Lexer<'a>) -> Option<Token>,
{
    iter.enumerate().flat_map(|(line_num, line)| {
        tokens_in_line(line, tokenizer)
            .map(move |(column_range, token)| (line_num, column_range, token))
    })
}

///// StrTokenIterator /////////////////////////

struct StrTokenIterator<'a, Token, Tokenizer>
where
    Tokenizer: Fn(&mut Lexer<'a>) -> Option<Token>,
{
    lexer: Lexer<'a>,
    tokenizer: &'a Tokenizer,
}

impl<'a, Token, Tokenizer> Iterator for StrTokenIterator<'a, Token, Tokenizer>
where
    Tokenizer: Fn(&mut Lexer<'a>) -> Option<Token>,
{
    type Item = (Range<usize>, Token);

    fn next(&mut self) -> Option<Self::Item> {
        self.lexer.current.and_then(|_| {
            (self.tokenizer)(self.lexer.mark_start())
                .map(|token| (self.lexer.column_range(), token))
        })
    }
}

///// Lexer /////////////////////////

/// The tokenizer you write will be passed a Lexer as an argument,
/// and will call methods on it to figure out what the next lexeme is
/// and, if needed, get its text.  The Lexer keeps track of the
/// column range of the lexeme.
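///
/// A typical tokenizer positions the start of the lexeme with `skip_while` and `next`,
/// widens it with `at` or `take_while`, and then reads it with `get`, `into`, or `map`.
/// A minimal sketch (the token type and tokenizer below are illustrative only,
/// not part of this crate):
/// ```rust
/// use token_iter::*;
///
/// #[derive(Debug, PartialEq)]
/// enum Tok { Arrow, Minus, Word(String) }
///
/// fn tok(lx: &mut Lexer) -> Option<Tok> {
///     Some(match lx.skip_while(char::is_whitespace).next()? {
///         '-' => if lx.at('>') { Tok::Arrow } else { Tok::Minus },
///         _ => Tok::Word(lx.take_while(char::is_alphanumeric).into()),
///     })
/// }
///
/// let tokens: Vec<_> = tokens_in_line("ab -> c", &tok).collect();
/// assert_eq!(tokens, vec![
///     (0..2, Tok::Word("ab".into())),
///     (3..5, Tok::Arrow),
///     (6..7, Tok::Word("c".into())),
/// ]);
/// ```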
pub struct Lexer<'a> {
    line: &'a str,
    chars: CharIndices<'a>,
    current: Option<(usize, char)>,
    column: usize,
    start_ix: usize,
    start_column: usize,
}

impl<'a> Lexer<'a> {
    /// Adds the current character to the lexeme and advances to the next.
    #[inline]
    fn advance(&mut self) {
        if self.current.is_some() {
            self.current = self.chars.next();
            self.column += 1;
        }
    }

    /// Makes the next char the start of the lexeme.
    fn mark_start(&mut self) -> &mut Self {
        self.start_column = self.column;
        self.start_ix = if let Some((start_ix, _)) = self.current {
            start_ix
        } else {
            self.line.len()
        };
        self
    }

    /// Returns the next character and adds it to the lexeme.
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> Option<char> {
        self.current.map(|(_, c)| {
            self.advance();
            c
        })
    }

    /// Returns the next character without advancing.
    pub fn peek(&self) -> Option<char> {
        self.current.map(|(_, c)| c)
    }

    /// If the next character is c (the argument, not the letter),
    /// adds c to the lexeme and returns true.  Otherwise just returns false.
    pub fn at(&mut self, c: char) -> bool {
        if let Some((_, next_c)) = self.current {
            if next_c == c {
                self.advance();
                return true;
            }
        }
        false
    }

    /// Keeps adding characters to the lexeme while f(char) is true.
    /// Returns self to allow chaining.
    pub fn take_while(&mut self, f: impl Fn(char) -> bool) -> &mut Self {
        while let Some((_, c)) = self.current {
            if !f(c) {
                break;
            }
            self.advance();
        }
        self
    }

    /// Skips over characters while f(char) is true;
    /// the lexeme will start at the char where f(char) is false.
    /// Returns self to allow chaining.
    pub fn skip_while(&mut self, f: impl Fn(char) -> bool) -> &mut Self {
        self.take_while(f);
        self.mark_start();
        self
    }

    /// Returns the lexeme as an &str.
    pub fn get(&self) -> &'a str {
        &self.line[self.str_range()]
    }

    /// Returns the lexeme converted to the needed type (commonly a String).
    /// This is just a shortcut for .get().into().
    pub fn into<T: From<&'a str>>(&mut self) -> T {
        T::from(self.get())
    }

    /// Returns the result of applying f() to the lexeme.
    pub fn map<T>(&mut self, f: impl Fn(&'a str) -> T) -> T {
        f(self.get())
    }

    /// Gets the range of columns associated with the lexeme.
    /// You probably won't need to use this, since the column range is included
    /// in the Iterator's output, but it could be useful if the interpretation
    /// of lexemes depends upon the columns in which you find them.
    pub fn column_range(&self) -> Range<usize> {
        self.start_column..self.column
    }

    /// Gets the byte range of the lexeme within the line.
    /// You probably won't need to use this, since .get() gives you the &str
    /// for the lexeme, but it could be useful if the interpretation of lexemes
    /// depends on the surrounding text.
    pub fn str_range(&self) -> Range<usize> {
        self.start_ix..self.current.map_or_else(|| self.line.len(), |(ix, _)| ix)
    }

    /// Returns the line we are currently tokenizing.
    /// You probably won't need to use this, but it could be useful if the
    /// interpretation of lexemes depends on the surrounding text.
    pub fn line(&self) -> &str {
        self.line
    }
} // impl Lexer