gramatika/lexer.rs
1//! This module defines the [`Lexer`], [`Token`], and [`TokenStream`] types that
2//! lay the groundwork for parsing with Gramatika.
3//!
4//! In this documentation, we'll look at some less trivial `Token` examples and
5//! explore how [`TokenStream`] --- the built-in [`Lexer`] implementation ---
6//! tokenizes input.
7//!
8//! ## Defining a token
9//!
10//! Each variant of your `Token` enum should be a tuple variant wrapping a
11//! [`Substr`] and [`Span`]:
12//! ```
13//! #[macro_use]
14//! extern crate gramatika;
15//!
16//! # fn main() {
17//! use gramatika::{Span, Substr};
18//!
19//! #[derive(Token, PartialEq)]
20//! enum Token {
21//! # #[discard]
22//! # #[pattern = "//.*"]
23//! # LineComment(Substr, Span),
24//! #
25//! # #[discard]
26//! # #[multiline]
27//! # #[pattern = r"/\*.*?\*/"]
28//! # BlockComment(Substr, Span),
29//! #
30//! # #[subset_of(Ident)]
31//! # #[pattern = "if|else|switch|case|break|for|while|var|print"]
32//! # Keyword(Substr, Span),
33//! #
34//! # #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
35//! Ident(Substr, Span),
36//! #
37//! # #[pattern = r"[(){}\[\]]"]
38//! # Brace(Substr, Span),
39//! #
40//! # #[pattern = "[,.;]"]
41//! # Punct(Substr, Span),
42//! #
43//! # #[pattern = "[=!<>]=?"]
44//! # #[pattern = "[-+*/]"]
45//! # Operator(Substr, Span),
46//! #
47//! # #[pattern = "(0[xb])?[0-9A-Fa-f][0-9A-Fa-f.]*"]
48//! # NumLiteral(Substr, Span),
49//! #
50//! # #[pattern = r#""[^"]+""#]
51//! # StrLiteral(Substr, Span),
52//! }
53//! # }
54//! ```
55//! * The [`Substr`] portion (which we'll refer to as the _lexeme_) is an atomic
56//! reference-counted view into the original source string, provided by the
57//! [`arcstr`] crate. These can be `clone`d for very little cost, because only
58//! the pointer to the original string is copied, not the underlying string
59//! itself.
60//!
61//! ```
62//! # use gramatika::{ArcStr, Substr};
63//! let source = ArcStr::from("foo bar baz");
64//! {
65//! let foo = source.substr(..3);
66//! let baz = source.substr(8..);
67//!
68//! assert_eq!(foo, "foo");
69//! assert_eq!(baz, "baz");
70//! assert!(ArcStr::ptr_eq(foo.parent(), baz.parent()));
71//! assert_eq!(ArcStr::strong_count(&source), Some(3));
72//! }
73//! assert_eq!(ArcStr::strong_count(&source), Some(1));
74//! ```
75//!
76//! * The [`Span`] indicates the token's location in the original source
77//! document by line and character number.
78//!
79//! It's important to note that while the actual values stored in the `Span`
80//! are zero-indexed, printing the `Span` with the `Debug` trait will display
81//! _one-indexed_ values to match the conventions of most code and text
82//! editors.
83//!
84//! ```
85//! # use gramatika::Span;
86//! let span = Span::new((0, 0), (0, 4));
87//! let printed = format!("{span:?}");
88//! assert_eq!(printed, "1:1..1:5");
89//! ```
90//!
91//! When the [`Token`] and [`Spanned`] traits are in scope, the lexeme and span
92//! can be extracted from a [`Token`] without needing to pattern-match. You can
93//! also grab them both in one go with the generated `as_inner` method.
94//!
95//! ```
96//! #[macro_use]
97//! extern crate gramatika;
98//!
99//! # fn main() {
100//! use gramatika::{Span, Spanned, Substr, Token as _, span};
101//!
102//! #[derive(Token, PartialEq)]
103//! enum Token {
104//! Ident(Substr, Span),
105//! }
106//!
107//! let my_ident = Token::Ident("foo".into(), span![1:1..1:4]);
108//!
109//! assert_eq!(my_ident.lexeme(), "foo");
110//! assert_eq!(my_ident.span(), span![1:1..1:4]);
111//!
112//! let (lexeme, span) = my_ident.as_inner();
113//! assert_eq!(lexeme, my_ident.lexeme());
114//! assert_eq!(span, my_ident.span());
115//! # }
116//! ```
117//!
118//! ### Debug and Display
119//!
120//! It's a good idea to derive [`DebugLispToken`] for the enum, and implement
121//! both [`Debug`](core::fmt::Debug) and [`Display`](core::fmt::Display):
122//!
123//! ```
124//! #[macro_use]
125//! extern crate gramatika;
126//!
127//! # fn main() {
128//! use core::fmt;
129//! use gramatika::{Span, Spanned, Substr, Token as _, span};
130//!
131//! #[derive(Token, DebugLispToken, PartialEq)]
132//! enum Token {
133//! Ident(Substr, Span),
134//! }
135//!
136//! impl fmt::Display for Token {
137//! fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
138//! write!(f, "{}", self.lexeme())
139//! }
140//! }
141//!
142//! impl fmt::Debug for Token {
143//! fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
144//! <Self as gramatika::DebugLisp>::fmt(self, f, 0)
145//! }
146//! }
147//!
148//! let my_ident = Token::Ident("foo".into(), span![1:1..1:4]);
149//!
150//! let display = format!("{my_ident}");
151//! assert_eq!(display, "foo");
152//!
153//! let debug = format!("{my_ident:?}");
154//! assert_eq!(debug, "`foo` (Ident (1:1..1:4))");
155//! # }
156//! ```
157//! [`DebugLispToken`]: gramatika_macro::DebugLispToken
158//!
159//! ## Configuring Tokenization
160//!
161//! The real power of Gramatika comes from its compile-time code generation.
162//! Let's define a pattern for our identifier token, and import the [`Lexer`]
163//! and [`TokenStream`] types:
164//!
165//! ```
166//! # #[macro_use]
167//! # extern crate gramatika;
168//! #
169//! # fn main() {
170//! # use core::fmt;
171//! use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
172//! #
173//! // ...
174//! #[derive(Token, DebugLispToken, PartialEq)]
175//! enum Token {
176//! #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
177//! Ident(Substr, Span),
178//! }
179//! #
180//! # impl fmt::Display for Token {
181//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
182//! # write!(f, "{}", self.lexeme())
183//! # }
184//! # }
185//! #
186//! # impl fmt::Debug for Token {
187//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
188//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
189//! # }
190//! # }
191//!
192//! let input = "
193//! foo bar baz
194//! foobar
195//! loremIpsum
196//! dolor_sit_amet
197//! ";
198//! let mut lexer = TokenStream::<Token>::new(input.into());
199//! let tokens = lexer.scan();
200//!
201//! assert_eq!(tokens.len(), 6);
202//! assert_eq!(&tokens[0], &Token::Ident("foo".into(), span![2:5..2:8]));
203//! assert_eq!(&tokens[5], &Token::Ident("dolor_sit_amet".into(), span![5:5..5:19]));
204//! # }
205//! ```
206//!
207//! The `#[pattern]` attribute accepts the same syntax and features as the Rust
208//! [`regex` crate]. Those patterns are compiled to static [deterministic finite
209//! automata] and used by the [`TokenStream`] to find token matches in an input
210//! string.
211//!
212//! [`regex` crate]: https://docs.rs/regex/latest/regex/
213//! [deterministic finite automata]: https://swtch.com/~rsc/regexp/regexp1.html
214//!
215//! ### Unrecognized input
216//!
217//! If the lexer receives any input that it doesn't know how to handle, it
218//! panics. That's not ideal, so to avoid that we'll define a catch-all
219//! `Unrecognized` token that eats up all non-whitespace characters. Our
220//! [`Parse`] implementations can then handle those gracefully by emitting a
221//! useful error message for the user.
222//!
223//! [`Parse`]: crate::Parse
224//!
225//! ```
226//! # #[macro_use]
227//! # extern crate gramatika;
228//! #
229//! # fn main() {
230//! # use core::fmt;
231//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, span, TokenStream};
232//! #
233//! // ...
234//! #[derive(Token, DebugLispToken, PartialEq)]
235//! enum Token {
236//! #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
237//! Ident(Substr, Span),
238//!
239//! #[pattern = r"\S+"]
240//! Unrecognized(Substr, Span),
241//! }
242//! #
243//! # impl fmt::Display for Token {
244//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
245//! # write!(f, "{}", self.lexeme())
246//! # }
247//! # }
248//! #
249//! # impl fmt::Debug for Token {
250//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
251//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
252//! # }
253//! # }
254//!
255//! let input = "foo 42";
256//! let mut lexer = TokenStream::<Token>::new(input.into());
257//! let tokens = lexer.scan();
258//!
259//! assert_eq!(tokens.len(), 2);
260//! assert_eq!(&tokens[0], &Token::Ident("foo".into(), span![1:1..1:4]));
261//! assert_eq!(&tokens[1], &Token::Unrecognized("42".into(), span![1:5..1:7]));
262//! # }
263//! ```
264//!
265//! ### Discarding input
266//!
267//! Often we want to _recognize_ some input, especially input that might be
268//! valid at any location in a source file, without needing to manually deal
269//! with those tokens in our [`Parse`] implementations. Essentially, we want to
270//! _discard_ that input. The lexer automatically does this by default for
271//! whitespace characters, but we can expand that functionality to any other
272//! syntax that we want to ignore completely:
273//!
274//! ```
275//! # #[macro_use]
276//! # extern crate gramatika;
277//! #
278//! # fn main() {
279//! # use core::fmt;
280//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
281//! #
282//! // ...
283//! #[derive(Token, DebugLispToken, PartialEq)]
284//! enum Token {
285//! #[discard]
286//! #[pattern = "//.*"]
287//! Comment(Substr, Span),
288//! // ...
289//! # #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
290//! # Ident(Substr, Span),
291//! # #[pattern = r"\S+"]
292//! # Unrecognized(Substr, Span),
293//! }
294//! #
295//! # impl fmt::Display for Token {
296//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
297//! # write!(f, "{}", self.lexeme())
298//! # }
299//! # }
300//! #
301//! # impl fmt::Debug for Token {
302//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
303//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
304//! # }
305//! # }
306//!
307//! let input = "
308//! // Here's a comment
309//! foo // Here's another one
310//! ";
311//!
312//! let mut lexer = TokenStream::<Token>::new(input.into());
313//! let tokens = lexer.scan();
314//!
315//! assert_eq!(tokens.len(), 1);
316//! assert_eq!(&tokens[0], &Token::Ident("foo".into(), span![3:5..3:8]));
317//! # }
318//! ```
319//!
320//! ### Matching tokens across multiple lines
321//!
322//! We may also want to define some tokens that can span multiple lines, but by
323//! default a regular expression's `.` doesn't match newline characters. We can
324//! change that by adding the `#[multiline]` attribute:
325//!
326//! ```
327//! # #[macro_use]
328//! # extern crate gramatika;
329//! #
330//! # fn main() {
331//! # use core::fmt;
332//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
333//! #
334//! // ...
335//! #[derive(Token, DebugLispToken, PartialEq)]
336//! enum Token {
337//! #[discard]
338//! #[multiline]
339//! #[pattern = r"/\*.*?\*/"]
340//! BlockComment(Substr, Span),
341//!
342//! #[discard]
343//! #[pattern = "//.*"]
344//! LineComment(Substr, Span),
345//! // ...
346//! # #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
347//! # Ident(Substr, Span),
348//! # #[pattern = r"\S+"]
349//! # Unrecognized(Substr, Span),
350//! }
351//! #
352//! # impl fmt::Display for Token {
353//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
354//! # write!(f, "{}", self.lexeme())
355//! # }
356//! # }
357//! #
358//! # impl fmt::Debug for Token {
359//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
360//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
361//! # }
362//! # }
363//!
364//! let input = "
365//! /*
366//! Here's a block comment.
367//! It can span as many lines as we please!
368//! */
369//! foo // Here's a line comment
370//! ";
371//!
372//! let mut lexer = TokenStream::<Token>::new(input.into());
373//! let tokens = lexer.scan();
374//!
375//! assert_eq!(tokens.len(), 1);
376//! assert_eq!(&tokens[0], &Token::Ident("foo".into(), span![6:5..6:8]));
377//! # }
378//! ```
379//! ### Matching keywords
380//!
381//! Keywords are a tricky thing, because they will almost certainly overlap with
382//! your language's "identifier" token. The matching of tokens is prioritized by
383//! declaration order from top to bottom, so you might think we could just
384//! declare our `Keyword` token first, but that has an unfortunate consequence:
385//!
386//! ```
387//! # #[macro_use]
388//! # extern crate gramatika;
389//! #
390//! # fn main() {
391//! # use core::fmt;
392//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
393//! #
394//! // ...
395//! #[derive(Token, DebugLispToken, PartialEq)]
396//! enum Token {
397//! #[pattern = "if|else|for|in|switch|case|break"]
398//! Keyword(Substr, Span),
399//!
400//! #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
401//! Ident(Substr, Span),
402//! #
403//! # #[pattern = r"\S+"]
404//! # Unrecognized(Substr, Span),
405//! }
406//! #
407//! # impl fmt::Display for Token {
408//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
409//! # write!(f, "{}", self.lexeme())
410//! # }
411//! # }
412//! #
413//! # impl fmt::Debug for Token {
414//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
415//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
416//! # }
417//! # }
418//!
419//! let input = "
420//! for foo in bar
421//! intent
422//! ";
423//!
424//! let mut lexer = TokenStream::<Token>::new(input.into());
425//! let tokens = lexer.scan();
426//!
427//! assert_eq!(tokens, vec![
428//! Token::Keyword("for".into(), span![2:5..2:8]),
429//! Token::Ident("foo".into(), span![2:9..2:12]),
430//! Token::Keyword("in".into(), span![2:13..2:15]),
431//! Token::Ident("bar".into(), span![2:16..2:19]),
432//! // Wait a minute, this is not what we wanted!
433//! Token::Keyword("in".into(), span![3:9..3:11]),
434//! Token::Ident("tent".into(), span![3:11..3:15]),
435//! ]);
436//! # }
437//! ```
438//!
439//! To fix that, we can use the `#[subset_of(Other)]` attribute to specify that
440//! a token's pattern overlaps with `Other`'s pattern, and should only match if
441//! `Other`'s _entire lexeme_ is _also_ a match for the subset.
442//!
443//! ```
444//! # #[macro_use]
445//! # extern crate gramatika;
446//! #
447//! # fn main() {
448//! # use core::fmt;
449//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
450//! #
451//! // ...
452//! #[derive(Token, DebugLispToken, PartialEq)]
453//! enum Token {
454//! #[subset_of(Ident)]
455//! #[pattern = "if|else|for|in|switch|case|break"]
456//! Keyword(Substr, Span),
457//!
458//! #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
459//! Ident(Substr, Span),
460//! #
461//! # #[pattern = r"\S+"]
462//! # Unrecognized(Substr, Span),
463//! }
464//! #
465//! # impl fmt::Display for Token {
466//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
467//! # write!(f, "{}", self.lexeme())
468//! # }
469//! # }
470//! #
471//! # impl fmt::Debug for Token {
472//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
473//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
474//! # }
475//! # }
476//!
477//! let input = "
478//! for foo in bar
479//! intent
480//! ";
481//!
482//! let mut lexer = TokenStream::<Token>::new(input.into());
483//! let tokens = lexer.scan();
484//!
485//! assert_eq!(tokens, vec![
486//! Token::Keyword("for".into(), span![2:5..2:8]),
487//! Token::Ident("foo".into(), span![2:9..2:12]),
488//! Token::Keyword("in".into(), span![2:13..2:15]),
489//! Token::Ident("bar".into(), span![2:16..2:19]),
490//! // That's better!
491//! Token::Ident("intent".into(), span![3:9..3:15]),
492//! ]);
493//! # }
494//! ```
495//!
496//! ### Composing complex patterns
497//!
498//! Regular expressions are not known for their readability in the best of
499//! circumstances, but that's especially true for long patterns with lots of
500//! "or" branches. We can define multiple `#[pattern]` attributes for a single
501//! token to _compose_ those patterns into a single regular expression:
502//!
503//! ```
504//! # #[macro_use]
505//! # extern crate gramatika;
506//! #
507//! # fn main() {
508//! # use core::fmt;
509//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
510//! #
511//! // ...
512//! #[derive(Token, DebugLispToken, PartialEq)]
513//! enum Token {
514//! #[pattern = r"[0-9]*\.[0-9]+"]
515//! #[pattern = r"[0-9]+\.[0-9]*"]
516//! FloatLiteral(Substr, Span),
517//! #
518//! # #[pattern = r"\S+"]
519//! # Unrecognized(Substr, Span),
520//! }
521//! #
522//! # impl fmt::Display for Token {
523//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
524//! # write!(f, "{}", self.lexeme())
525//! # }
526//! # }
527//! #
528//! # impl fmt::Debug for Token {
529//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
530//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
531//! # }
532//! # }
533//!
534//! let input = "
535//! 3.141
536//! .25
537//! 50.
538//! ";
539//!
540//! let mut lexer = TokenStream::<Token>::new(input.into());
541//! let tokens = lexer.scan();
542//!
543//! assert_eq!(tokens, vec![
544//! Token::FloatLiteral("3.141".into(), span![2:5..2:10]),
545//! Token::FloatLiteral(".25".into(), span![3:5..3:8]),
546//! Token::FloatLiteral("50.".into(), span![4:5..4:8]),
547//! ]);
548//! # }
549//! ```
550//! The pattern above is exactly equivalent to `[0-9]*\.[0-9]+|[0-9]+\.[0-9]*`,
551//! but by splitting the pattern into separate lines we can more easily tell the
552//! difference between the two alternations (the first makes the digits _before_
553//! the `.` optional, while the second does the same for digits _after_ the `.`).
554//!
555
556use std::{collections::HashSet, fmt, hash::Hash, marker::PhantomData};
557
558use arcstr::Substr;
559
560use crate::{Position, SourceStr, Span, Spanned};
561
562/// A lexer (AKA scanner, AKA tokenizer) is the piece of the parsing toolchain
563/// that takes raw input (e.g., the text of a source file) and "scans" it into
564/// discrete chunks of meaningful information (i.e., [tokens]).
565///
566/// In Gramatika, [parsing] is usually performed on a stream of tokens that are
567/// scanned on-demand by a compile-time-generated type implementing this trait.
568/// Except perhaps for unit-testing, you should rarely need to interact with the
569/// lexer directly.
570///
571/// [tokens]: crate::Token
572/// [parsing]: crate::parse
573pub trait Lexer {
574 /// The concrete type of [`Token`] this lexer should scan.
575 type Output: Token;
576
577 /// Create a new lexer that can scan the provided `input` to a stream of
578 /// [`Output`] tokens.
579 ///
580 /// [`Output`]: Lexer::Output
581 fn new(input: SourceStr) -> Self;
582
583 /// Experimental
584 #[doc(hidden)]
585 #[allow(unused_variables)]
586 fn with_runtime_matcher<F>(self, matcher: F) -> Self
587 where
588 Self: Sized,
589 F: Fn(&str) -> Option<(usize, <Self::Output as Token>::Kind)> + 'static,
590 {
591 self
592 }
593
594 /// Returns an owned copy of the input [`SourceStr`] this lexer is scanning.
595 fn source(&self) -> SourceStr;
596
597 /// Scans a single token from the input.
598 ///
599 /// The implementation generated by the [`derive macro`] does this lazily,
600 /// only checking the input for a match when this method is called, and
601 /// stopping as soon as the first match is (or isn't) found.
602 ///
603 /// [`derive macro`]: gramatika_macro::Lexer
604 fn scan_token(&mut self) -> Option<Self::Output>;
605
606 /// Eagerly scans the entire input to an array of tokens.
607 ///
608 /// Most of the time you'll want to use [`scan_token`] instead to _stream_
609 /// tokens from the input on an as-needed basis.
610 ///
611 /// [`scan_token`]: Lexer::scan_token
612 fn scan(&mut self) -> Vec<Self::Output> {
613 let mut result = vec![];
614 while let Some(token) = self.scan_token() {
615 result.push(token);
616 }
617 result
618 }
619}
620
/// Experimental
///
/// Allows a lexer to be paused mid-scan and later resumed from the point where
/// it left off, e.g. when the rest of the input has not yet been received.
#[doc(hidden)]
pub trait PartialLexer {
	/// Initialize a new lexer from the state of some previous one that was
	/// interrupted.
	///
	/// `input` is the unscanned remainder returned by [`stop`], and `position`
	/// is the cursor position where that earlier lexer halted.
	///
	/// [`stop`]: PartialLexer::stop
	fn from_remaining(input: SourceStr, position: Position) -> Self;

	/// Consumes the lexer, returning the remaining (unscanned) portion of the
	/// input and the current cursor position.
	fn stop(self) -> (Substr, Position);
}
632
/// In the parlance of language parsing, "tokens" are the smallest discrete
/// chunks of meaningful information that can be extracted from some raw input,
/// like the text of a source file.
///
/// If the individual characters of that text are thought of as _atoms_, then
/// its "tokens" could be considered the _molecules_: words, punctuation marks,
/// mathematical operators, etc.
///
/// In Gramatika, this trait is usually [derived] (along with its [`Lexer`]) for
/// some user-defined enum type, with regular-expression patterns specifying
/// the forms that can be taken by each of its variants. See the
/// [module-level documentation] for (much) more detail.
///
/// [derived]: gramatika_macro::Token
/// [module-level documentation]: crate::lexer
pub trait Token
where Self: Clone + Spanned
{
	/// A lightweight discriminant identifying each variant of the implementing
	/// token type (e.g., the derive macro generates a `TokenKind` enum with one
	/// unit variant per `Token` variant).
	type Kind: Copy + fmt::Debug + PartialEq + Eq + Hash + 'static;

	/// Find the first match for this token in a [`str`] slice, returning the
	/// start and end byte offsets and the matching [`Kind`](Token::Kind).
	///
	/// Returns `None` if no variant's pattern matches the input.
	fn find<S>(input: S) -> Option<(usize, usize, Self::Kind)>
	where S: AsRef<str>;

	/// Construct a token from its constituent parts.
	fn from_parts(kind: Self::Kind, substr: Substr, span: Span) -> Self;

	/// Returns the set of this token's variants which should be treated as
	/// multi-line patterns (i.e., the `.` character in regex patterns should
	/// match `\n` characters).
	///
	/// When using the [derive macro], returns the variants annotated with the
	/// `#[multiline]` attribute.
	///
	/// [derive macro]: macro@crate::Token
	fn multilines() -> &'static HashSet<Self::Kind>;

	/// Returns the set of this token's variants which should be discarded by the
	/// lexer when scanning.
	///
	/// When using the [derive macro], returns the variants annotated with the
	/// `#[discard]` attribute.
	///
	/// [derive macro]: macro@crate::Token
	fn discards() -> &'static HashSet<Self::Kind>;

	/// Returns the actual text content of a token.
	///
	/// ```
	/// #[macro_use]
	/// extern crate gramatika;
	///
	/// use gramatika::{
	/// 	arcstr::literal_substr,
	/// 	Lexer, Substr, Span, Token as _, TokenStream,
	/// };
	///
	/// # fn main() {
	/// #[derive(Token)]
	/// enum Token {
	/// 	#[subset_of(Ident)]
	/// 	#[pattern = "var"]
	/// 	Keyword(Substr, Span),
	///
	/// 	#[pattern = "[a-zA-Z_][a-zA-Z0-9_]*"]
	/// 	Ident(Substr, Span),
	///
	/// 	#[pattern = "[0-9]+"]
	/// 	IntLiteral(Substr, Span),
	///
	/// 	#[pattern = "="]
	/// 	Operator(Substr, Span),
	///
	/// 	#[pattern = ";"]
	/// 	Punct(Substr, Span),
	/// }
	///
	/// let src = "var the_answer = 42;";
	/// let tokens = TokenStream::<Token>::new(src.into()).scan();
	///
	/// assert_eq!(tokens[1].lexeme(), literal_substr!("the_answer"));
	/// # }
	/// ```
	fn lexeme(&self) -> Substr;

	/// Returns the [`Kind`] of this token. Used in [`ParseStreamer`] methods like
	/// [`check_kind`] and [`consume_kind`]. Effectively a more user-friendly version of
	/// [`std::mem::discriminant`].
	///
	/// [`Kind`]: Token::Kind
	/// [`ParseStreamer`]: crate::parse::ParseStreamer
	/// [`check_kind`]: crate::parse::ParseStreamer::check_kind
	/// [`consume_kind`]: crate::parse::ParseStreamer::consume_kind
	///
	/// ```
	/// #[macro_use]
	/// extern crate gramatika;
	///
	/// use gramatika::{Lexer, Substr, Span, Token as _, TokenStream};
	///
	/// # fn main() {
	/// #[derive(Token)]
	/// enum Token {
	/// 	#[pattern = "[a-zA-Z_][a-zA-Z0-9_]*"]
	/// 	Ident(Substr, Span),
	/// }
	///
	/// let input = "foo";
	/// let token = TokenStream::<Token>::new(input.into())
	/// 	.scan_token()
	/// 	.expect("Expected to match Ident `foo`");
	///
	/// assert_eq!(token.kind(), TokenKind::Ident);
	/// # }
	/// ```
	fn kind(&self) -> Self::Kind;

	/// Decomposes the token into its constituent ([`Kind`], [`lexeme`] and
	/// [`Span`]) parts, with the `lexeme` as a [`&str`] slice for compatibility
	/// with pattern matching.
	///
	/// [`Kind`]: Token::Kind
	/// [`lexeme`]: Token::lexeme
	/// [`span`]: crate::Span
	/// [`&str`]: prim@str
	///
	/// ```
	/// #[macro_use]
	/// extern crate gramatika;
	///
	/// use gramatika::{Lexer, Substr, Span, Token as _, TokenStream};
	///
	/// # fn main() {
	/// #[derive(Token)]
	/// enum Token {
	/// 	#[pattern = "[a-zA-Z_][a-zA-Z0-9_]*"]
	/// 	Ident(Substr, Span),
	/// }
	///
	/// let input = "foo";
	/// let token = TokenStream::<Token>::new(input.into())
	/// 	.scan_token()
	/// 	.expect("Expected to match Ident `foo`");
	///
	/// assert!(matches!(token.as_matchable(), (TokenKind::Ident, "foo", _)));
	/// # }
	/// ```
	fn as_matchable(&self) -> (Self::Kind, &str, Span);
}
783
/// A concrete implementation of the [`Lexer`] interface.
pub struct TokenStream<T> {
	/// The complete source text being scanned.
	input: SourceStr,
	/// The not-yet-scanned tail of `input`; advanced as tokens are matched.
	remaining: Substr,
	/// Position (zero-indexed line/character) where the next token starts --
	/// used as the `start` of each emitted [`Span`].
	current: Position,
	/// Position just past the most recently matched text -- used as the `end`
	/// of each emitted [`Span`].
	lookahead: Position,
	/// Binds the stream to its token type `T` without storing a value of it.
	_marker: PhantomData<T>,
}
792
793impl<T> Lexer for TokenStream<T>
794where T: Token
795{
796 type Output = T;
797
798 fn new(input: SourceStr) -> Self {
799 Self {
800 remaining: input.substr(..),
801 input,
802 current: Position::default(),
803 lookahead: Position::default(),
804 _marker: Default::default(),
805 }
806 }
807
808 fn source(&self) -> SourceStr {
809 self.input.clone()
810 }
811
812 fn scan(&mut self) -> Vec<T> {
813 let mut output = vec![];
814 while let Some(token) = self.scan_token() {
815 output.push(token);
816 }
817
818 output
819 }
820
821 fn scan_token(&mut self) -> Option<T> {
822 match <T as Token>::find(&self.remaining) {
823 Some((start, end, kind)) => {
824 let lexeme = self.remaining.substr(start..end);
825
826 if <T as Token>::multilines().contains(&kind) {
827 let mut line_inc = 0_usize;
828 let mut remaining = lexeme.as_str();
829
830 while let Some(idx) = remaining.find('\n') {
831 line_inc += 1;
832 remaining = &remaining[idx + 1..];
833 }
834 let char_inc = remaining.len();
835
836 self.lookahead.line += line_inc;
837
838 if line_inc > 0 {
839 self.lookahead.character = char_inc;
840 } else {
841 self.lookahead.character += char_inc;
842 }
843 } else {
844 self.lookahead.character += end;
845 }
846
847 let span = Span {
848 start: self.current,
849 end: self.lookahead,
850 };
851
852 let token = <T as Token>::from_parts(kind, lexeme, span);
853
854 self.remaining = self.remaining.substr(end..);
855 self.current = self.lookahead;
856
857 if <T as Token>::discards().contains(&kind) {
858 self.scan_token()
859 } else {
860 Some(token)
861 }
862 }
863 None => self.remaining.clone().chars().next().and_then(|c| match c {
864 '\n' => {
865 self.lookahead.line += 1;
866 self.lookahead.character = 0;
867 self.current = self.lookahead;
868 self.remaining = self.remaining.substr(1..);
869
870 self.scan_token()
871 }
872 other if other.is_whitespace() => {
873 let len = other.len_utf8();
874
875 self.lookahead.character += len;
876 self.current.character += len;
877 self.remaining = self.remaining.substr(len..);
878
879 self.scan_token()
880 }
881 other => panic!("Unsupported input: `{other}` at {:?}", self.current),
882 }),
883 }
884 }
885}
886
887impl<T> PartialLexer for TokenStream<T> {
888 fn from_remaining(input: SourceStr, position: Position) -> Self {
889 Self {
890 remaining: input.substr(..),
891 input,
892 current: position,
893 lookahead: position,
894 _marker: Default::default(),
895 }
896 }
897
898 fn stop(self) -> (Substr, Position) {
899 (self.remaining, self.current)
900 }
901}