gramatika/lexer.rs
1//! This module defines the [`Lexer`], [`Token`], and [`TokenStream`] types that
2//! lay the groundwork for parsing with Gramatika.
3//!
4//! In this documentation, we'll look at some less trivial `Token` examples and
5//! explore how [`TokenStream`] --- the built-in [`Lexer`] implementation ---
6//! tokenizes input.
7//!
8//! ## Defining a token
9//!
10//! Each variant of your `Token` enum should be a tuple variant wrapping a
11//! [`Substr`] and [`Span`]:
12//! ```
13//! #[macro_use]
14//! extern crate gramatika;
15//!
16//! # fn main() {
17//! use gramatika::{Span, Substr};
18//!
19//! #[derive(Token, PartialEq)]
20//! enum Token {
21//! # #[discard]
22//! # #[pattern = "//.*"]
23//! # LineComment(Substr, Span),
24//! #
25//! # #[discard]
26//! # #[multiline]
27//! # #[pattern = r"/\*.*?\*/"]
28//! # BlockComment(Substr, Span),
29//! #
30//! # #[subset_of(Ident)]
31//! # #[pattern = "if|else|switch|case|break|for|while|var|print"]
32//! # Keyword(Substr, Span),
33//! #
34//! # #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
35//! Ident(Substr, Span),
36//! #
37//! # #[pattern = r"[(){}\[\]]"]
38//! # Brace(Substr, Span),
39//! #
40//! # #[pattern = "[,.;]"]
41//! # Punct(Substr, Span),
42//! #
43//! # #[pattern = "[=!<>]=?"]
44//! # #[pattern = "[-+*/]"]
45//! # Operator(Substr, Span),
46//! #
47//! # #[pattern = "(0[xb])?[0-9A-Fa-f][0-9A-Fa-f.]*"]
48//! # NumLiteral(Substr, Span),
49//! #
50//! # #[pattern = r#""[^"]+""#]
51//! # StrLiteral(Substr, Span),
52//! }
53//! # }
54//! ```
55//! * The [`Substr`] portion (which we'll refer to as the _lexeme_) is an atomic
56//! reference-counted view into the original source string, provided by the
57//! [`arcstr`] crate. These can be `clone`d for very little cost, because only
58//! the pointer to the original string is copied, not the underlying string
59//! itself.
60//!
61//! ```
62//! # use gramatika::{ArcStr, Substr};
63//! let source = ArcStr::from("foo bar baz");
64//! {
65//! let foo = source.substr(..3);
66//! let baz = source.substr(8..);
67//!
68//! assert_eq!(foo, "foo");
69//! assert_eq!(baz, "baz");
70//! assert!(ArcStr::ptr_eq(foo.parent(), baz.parent()));
71//! assert_eq!(ArcStr::strong_count(&source), Some(3));
72//! }
73//! assert_eq!(ArcStr::strong_count(&source), Some(1));
74//! ```
75//!
76//! * The [`Span`] indicates the token's location in the original source
77//! document by line and character number.
78//!
79//! It's important to note that while the actual values stored in the `Span`
80//! are zero-indexed, printing the `Span` with the `Debug` trait will display
81//! _one-indexed_ values to match the conventions of most code and text
82//! editors.
83//!
84//! ```
85//! # use gramatika::Span;
86//! let span = Span::new((0, 0), (0, 4));
87//! let printed = format!("{span:?}");
88//! assert_eq!(printed, "1:1..1:5");
89//! ```
90//!
91//! When the [`Token`] and [`Spanned`] traits are in scope, the lexeme and span
92//! can be extracted from a [`Token`] without needing to pattern-match. You can
93//! also grab them both in one go with the generated `as_inner` method.
94//!
95//! ```
96//! #[macro_use]
97//! extern crate gramatika;
98//!
99//! # fn main() {
100//! use gramatika::{Span, Spanned, Substr, Token as _, span};
101//!
102//! #[derive(Token, PartialEq)]
103//! enum Token {
104//! Ident(Substr, Span),
105//! }
106//!
107//! let my_ident = Token::Ident("foo".into(), span![1:1..1:4]);
108//!
109//! assert_eq!(my_ident.lexeme(), "foo");
110//! assert_eq!(my_ident.span(), span![1:1..1:4]);
111//!
112//! let (lexeme, span) = my_ident.as_inner();
113//! assert_eq!(lexeme, my_ident.lexeme());
114//! assert_eq!(span, my_ident.span());
115//! # }
116//! ```
117//!
118//! ### Debug and Display
119//!
120//! It's a good idea to derive [`DebugLispToken`] for the enum, and implement
121//! both [`Debug`](core::fmt::Debug) and [`Display`](core::fmt::Display):
122//!
123//! ```
124//! #[macro_use]
125//! extern crate gramatika;
126//!
127//! # fn main() {
128//! use core::fmt;
129//! use gramatika::{Span, Spanned, Substr, Token as _, span};
130//!
131//! #[derive(Token, DebugLispToken, PartialEq)]
132//! enum Token {
133//! Ident(Substr, Span),
134//! }
135//!
136//! impl fmt::Display for Token {
137//! fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
138//! write!(f, "{}", self.lexeme())
139//! }
140//! }
141//!
142//! impl fmt::Debug for Token {
143//! fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
144//! <Self as gramatika::DebugLisp>::fmt(self, f, 0)
145//! }
146//! }
147//!
148//! let my_ident = Token::Ident("foo".into(), span![1:1..1:4]);
149//!
150//! let display = format!("{my_ident}");
151//! assert_eq!(display, "foo");
152//!
153//! let debug = format!("{my_ident:?}");
154//! assert_eq!(debug, "`foo` (Ident (1:1..1:4))");
155//! # }
156//! ```
157//! [`DebugLispToken`]: gramatika_macro::DebugLispToken
158//!
159//! ## Configuring Tokenization
160//!
161//! The real power of Gramatika comes from its compile-time code generation.
162//! Let's define a pattern for our identifier token, and import the [`Lexer`]
163//! and [`TokenStream`] types:
164//!
165//! ```
166//! # #[macro_use]
167//! # extern crate gramatika;
168//! #
169//! # fn main() {
170//! # use core::fmt;
171//! use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
172//! #
173//! // ...
174//! #[derive(Token, DebugLispToken, PartialEq)]
175//! enum Token {
176//! #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
177//! Ident(Substr, Span),
178//! }
179//! #
180//! # impl fmt::Display for Token {
181//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
182//! # write!(f, "{}", self.lexeme())
183//! # }
184//! # }
185//! #
186//! # impl fmt::Debug for Token {
187//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
188//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
189//! # }
190//! # }
191//!
192//! let input = "
193//! foo bar baz
194//! foobar
195//! loremIpsum
196//! dolor_sit_amet
197//! ";
198//! let mut lexer = TokenStream::<Token>::new(input.into());
199//! let tokens = lexer.scan();
200//!
201//! assert_eq!(tokens.len(), 6);
202//! assert_eq!(&tokens[0], &Token::Ident("foo".into(), span![2:5..2:8]));
203//! assert_eq!(&tokens[5], &Token::Ident("dolor_sit_amet".into(), span![5:5..5:19]));
204//! # }
205//! ```
206//!
207//! The `#[pattern]` attribute accepts the same syntax and features as the Rust
208//! [`regex` crate]. Those patterns are compiled to static [deterministic finite
209//! automata] and used by the [`TokenStream`] to find token matches in an input
210//! string.
211//!
212//! [`regex` crate]: https://docs.rs/regex/latest/regex/
213//! [deterministic finite automata]: https://swtch.com/~rsc/regexp/regexp1.html
214//!
215//! ### Unrecognized input
216//!
217//! If the lexer receives any input that it doesn't know how to handle, it
218//! panics. That's not ideal, so to avoid that we'll define a catch-all
219//! `Unrecognized` token that eats up all non-whitespace characters. Our
220//! [`Parse`] implementations can then handle those gracefully by emitting a
221//! useful error message for the user.
222//!
223//! [`Parse`]: crate::Parse
224//!
225//! ```
226//! # #[macro_use]
227//! # extern crate gramatika;
228//! #
229//! # fn main() {
230//! # use core::fmt;
231//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, span, TokenStream};
232//! #
233//! // ...
234//! #[derive(Token, DebugLispToken, PartialEq)]
235//! enum Token {
236//! #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
237//! Ident(Substr, Span),
238//!
239//! #[pattern = r"\S+"]
240//! Unrecognized(Substr, Span),
241//! }
242//! #
243//! # impl fmt::Display for Token {
244//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
245//! # write!(f, "{}", self.lexeme())
246//! # }
247//! # }
248//! #
249//! # impl fmt::Debug for Token {
250//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
251//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
252//! # }
253//! # }
254//!
255//! let input = "foo 42";
256//! let mut lexer = TokenStream::<Token>::new(input.into());
257//! let tokens = lexer.scan();
258//!
259//! assert_eq!(tokens.len(), 2);
260//! assert_eq!(&tokens[0], &Token::Ident("foo".into(), span![1:1..1:4]));
261//! assert_eq!(&tokens[1], &Token::Unrecognized("42".into(), span![1:5..1:7]));
262//! # }
263//! ```
264//!
265//! ### Discarding input
266//!
267//! Often we want to _recognize_ some input, especially input that might be
268//! valid at any location in a source file, without needing to manually deal
269//! with those tokens in our [`Parse`] implementations. Essentially, we want to
270//! _discard_ that input. The lexer automatically does this by default for
271//! whitespace characters, but we can expand that functionality to any other
272//! syntax that we want to ignore completely:
273//!
274//! ```
275//! # #[macro_use]
276//! # extern crate gramatika;
277//! #
278//! # fn main() {
279//! # use core::fmt;
280//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
281//! #
282//! // ...
283//! #[derive(Token, DebugLispToken, PartialEq)]
284//! enum Token {
285//! #[discard]
286//! #[pattern = "//.*"]
287//! Comment(Substr, Span),
288//! // ...
289//! # #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
290//! # Ident(Substr, Span),
291//! # #[pattern = r"\S+"]
292//! # Unrecognized(Substr, Span),
293//! }
294//! #
295//! # impl fmt::Display for Token {
296//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
297//! # write!(f, "{}", self.lexeme())
298//! # }
299//! # }
300//! #
301//! # impl fmt::Debug for Token {
302//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
303//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
304//! # }
305//! # }
306//!
307//! let input = "
308//! // Here's a comment
309//! foo // Here's another one
310//! ";
311//!
312//! let mut lexer = TokenStream::<Token>::new(input.into());
313//! let tokens = lexer.scan();
314//!
315//! assert_eq!(tokens.len(), 1);
316//! assert_eq!(&tokens[0], &Token::Ident("foo".into(), span![3:5..3:8]));
317//! # }
318//! ```
319//!
320//! ### Matching tokens across multiple lines
321//!
322//! We may also want to define some tokens that can span multiple lines, but by
323//! default a regular expression's `.` doesn't match newline characters. We can
324//! change that by adding the `#[multiline]` attribute:
325//!
326//! ```
327//! # #[macro_use]
328//! # extern crate gramatika;
329//! #
330//! # fn main() {
331//! # use core::fmt;
332//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
333//! #
334//! // ...
335//! #[derive(Token, DebugLispToken, PartialEq)]
336//! enum Token {
337//! #[discard]
338//! #[multiline]
339//! #[pattern = r"/\*.*?\*/"]
340//! BlockComment(Substr, Span),
341//!
342//! #[discard]
343//! #[pattern = "//.*"]
344//! LineComment(Substr, Span),
345//! // ...
346//! # #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
347//! # Ident(Substr, Span),
348//! # #[pattern = r"\S+"]
349//! # Unrecognized(Substr, Span),
350//! }
351//! #
352//! # impl fmt::Display for Token {
353//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
354//! # write!(f, "{}", self.lexeme())
355//! # }
356//! # }
357//! #
358//! # impl fmt::Debug for Token {
359//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
360//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
361//! # }
362//! # }
363//!
364//! let input = "
365//! /*
366//! Here's a block comment.
367//! It can span as many lines as we please!
368//! */
369//! foo // Here's a line comment
370//! ";
371//!
372//! let mut lexer = TokenStream::<Token>::new(input.into());
373//! let tokens = lexer.scan();
374//!
375//! assert_eq!(tokens.len(), 1);
376//! assert_eq!(&tokens[0], &Token::Ident("foo".into(), span![6:5..6:8]));
377//! # }
378//! ```
379//! ### Matching keywords
380//!
381//! Keywords are a tricky thing, because they will almost certainly overlap with
382//! your language's "identifier" token. The matching of tokens is prioritized by
383//! declaration order from top to bottom, so you might think we could just
384//! declare our `Keyword` token first, but that has an unfortunate consequence:
385//!
386//! ```
387//! # #[macro_use]
388//! # extern crate gramatika;
389//! #
390//! # fn main() {
391//! # use core::fmt;
392//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
393//! #
394//! // ...
395//! #[derive(Token, DebugLispToken, PartialEq)]
396//! enum Token {
397//! #[pattern = "if|else|for|in|switch|case|break"]
398//! Keyword(Substr, Span),
399//!
400//! #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
401//! Ident(Substr, Span),
402//! #
403//! # #[pattern = r"\S+"]
404//! # Unrecognized(Substr, Span),
405//! }
406//! #
407//! # impl fmt::Display for Token {
408//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
409//! # write!(f, "{}", self.lexeme())
410//! # }
411//! # }
412//! #
413//! # impl fmt::Debug for Token {
414//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
415//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
416//! # }
417//! # }
418//!
419//! let input = "
420//! for foo in bar
421//! intent
422//! ";
423//!
424//! let mut lexer = TokenStream::<Token>::new(input.into());
425//! let tokens = lexer.scan();
426//!
427//! assert_eq!(tokens, vec![
428//! Token::Keyword("for".into(), span![2:5..2:8]),
429//! Token::Ident("foo".into(), span![2:9..2:12]),
430//! Token::Keyword("in".into(), span![2:13..2:15]),
431//! Token::Ident("bar".into(), span![2:16..2:19]),
432//! // Wait a minute, this is not what we wanted!
433//! Token::Keyword("in".into(), span![3:9..3:11]),
434//! Token::Ident("tent".into(), span![3:11..3:15]),
435//! ]);
436//! # }
437//! ```
438//!
439//! To fix that, we can use the `#[subset_of(Other)]` attribute to specify that
440//! a token's pattern overlaps with `Other`'s pattern, and should only match if
441//! `Other`'s _entire lexeme_ is _also_ a match for the subset.
442//!
443//! ```
444//! # #[macro_use]
445//! # extern crate gramatika;
446//! #
447//! # fn main() {
448//! # use core::fmt;
449//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
450//! #
451//! // ...
452//! #[derive(Token, DebugLispToken, PartialEq)]
453//! enum Token {
454//! #[subset_of(Ident)]
455//! #[pattern = "if|else|for|in|switch|case|break"]
456//! Keyword(Substr, Span),
457//!
458//! #[pattern = "[a-zA-Z_][a-zA-Z_0-9]*"]
459//! Ident(Substr, Span),
460//! #
461//! # #[pattern = r"\S+"]
462//! # Unrecognized(Substr, Span),
463//! }
464//! #
465//! # impl fmt::Display for Token {
466//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
467//! # write!(f, "{}", self.lexeme())
468//! # }
469//! # }
470//! #
471//! # impl fmt::Debug for Token {
472//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
473//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
474//! # }
475//! # }
476//!
477//! let input = "
478//! for foo in bar
479//! intent
480//! ";
481//!
482//! let mut lexer = TokenStream::<Token>::new(input.into());
483//! let tokens = lexer.scan();
484//!
485//! assert_eq!(tokens, vec![
486//! Token::Keyword("for".into(), span![2:5..2:8]),
487//! Token::Ident("foo".into(), span![2:9..2:12]),
488//! Token::Keyword("in".into(), span![2:13..2:15]),
489//! Token::Ident("bar".into(), span![2:16..2:19]),
490//! // That's better!
491//! Token::Ident("intent".into(), span![3:9..3:15]),
492//! ]);
493//! # }
494//! ```
495//!
496//! ### Composing complex patterns
497//!
498//! Regular expressions are not known for their readability in the best of
499//! circumstances, but that's especially true for long patterns with lots of
500//! "or" branches. We can define multiple `#[pattern]` attributes for a single
501//! token to _compose_ those patterns into a single regular expression:
502//!
503//! ```
504//! # #[macro_use]
505//! # extern crate gramatika;
506//! #
507//! # fn main() {
508//! # use core::fmt;
509//! # use gramatika::{Lexer, Span, Spanned, Substr, Token as _, TokenStream, span};
510//! #
511//! // ...
512//! #[derive(Token, DebugLispToken, PartialEq)]
513//! enum Token {
514//! #[pattern = r"[0-9]*\.[0-9]+"]
515//! #[pattern = r"[0-9]+\.[0-9]*"]
516//! FloatLiteral(Substr, Span),
517//! #
518//! # #[pattern = r"\S+"]
519//! # Unrecognized(Substr, Span),
520//! }
521//! #
522//! # impl fmt::Display for Token {
523//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
524//! # write!(f, "{}", self.lexeme())
525//! # }
526//! # }
527//! #
528//! # impl fmt::Debug for Token {
529//! # fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
530//! # <Self as gramatika::DebugLisp>::fmt(self, f, 0)
531//! # }
532//! # }
533//!
534//! let input = "
535//! 3.141
536//! .25
537//! 50.
538//! ";
539//!
540//! let mut lexer = TokenStream::<Token>::new(input.into());
541//! let tokens = lexer.scan();
542//!
543//! assert_eq!(tokens, vec![
544//! Token::FloatLiteral("3.141".into(), span![2:5..2:10]),
545//! Token::FloatLiteral(".25".into(), span![3:5..3:8]),
546//! Token::FloatLiteral("50.".into(), span![4:5..4:8]),
547//! ]);
548//! # }
549//! ```
550//! The pattern above is exactly equivalent to `[0-9]*\.[0-9]+|[0-9]+\.[0-9]*`,
551//! but by splitting the pattern into separate lines we can more easily tell the
552//! difference between the two alternations (the first makes the digits _before_
553//! the `.` optional, while the second does the same for digits _after_ the `.`).
554//!
555
556use std::{collections::HashSet, fmt, hash::Hash, marker::PhantomData};
557
558use arcstr::Substr;
559
560use crate::{Position, SourceStr, Span, Spanned};
561
562/// A lexer (AKA scanner, AKA tokenizer) is the piece of the parsing toolchain
563/// that takes raw input (e.g., the text of a source file) and "scans" it into
564/// discrete chunks of meaningful information (i.e., [tokens]).
565///
566/// In Gramatika, [parsing] is usually performed on a stream of tokens that are
567/// scanned on-demand by a compile-time-generated type implementing this trait.
568/// Except perhaps for unit-testing, you should rarely need to interact with the
569/// lexer directly.
570///
571/// [tokens]: crate::Token
572/// [parsing]: crate::parse
573pub trait Lexer {
574 /// The concrete type of [`Token`] this lexer should scan.
575 type Output: Token;
576
577 /// Create a new lexer that can scan the provided `input` to a stream of
578 /// [`Output`] tokens.
579 ///
580 /// [`Output`]: Lexer::Output
581 fn new(input: SourceStr) -> Self;
582
583 /// Experimental
584 #[doc(hidden)]
585 #[allow(unused_variables)]
586 fn with_runtime_matcher<F>(self, matcher: F) -> Self
587 where
588 Self: Sized,
589 F: Fn(&str) -> Option<(usize, <Self::Output as Token>::Kind)> + 'static,
590 {
591 self
592 }
593
594 /// Returns an owned copy of the input [`SourceStr`] this lexer is scanning.
595 fn source(&self) -> SourceStr;
596
597 /// Scans a single token from the input.
598 ///
599 /// The implementation generated by the [`derive macro`] does this lazily,
600 /// only checking the input for a match when this method is called, and
601 /// stopping as soon as the first match is (or isn't) found.
602 ///
603 /// [`derive macro`]: gramatika_macro::Lexer
604 fn scan_token(&mut self) -> Option<Self::Output>;
605
606 /// Eagerly scans the entire input to an array of tokens.
607 ///
608 /// Most of the time you'll want to use [`scan_token`] instead to _stream_
609 /// tokens from the input on an as-needed basis.
610 ///
611 /// [`scan_token`]: Lexer::scan_token
612 fn scan(&mut self) -> Vec<Self::Output> {
613 let mut result = vec![];
614 while let Some(token) = self.scan_token() {
615 result.push(token);
616 }
617 result
618 }
619}
620
/// Experimental
///
/// Allows a lexer to be paused mid-scan and later resumed from the point where
/// it left off, e.g. when the rest of the input has not yet been received.
#[doc(hidden)]
pub trait PartialLexer {
	/// Initialize a new lexer from the state of some previous one that was
	/// interrupted.
	///
	/// `input` is the unscanned remainder returned by [`stop`], and `position`
	/// is the cursor position where that earlier lexer halted.
	///
	/// [`stop`]: PartialLexer::stop
	fn from_remaining(input: SourceStr, position: Position) -> Self;

	/// Consumes the lexer, returning the remaining (unscanned) portion of the
	/// input and the current cursor position.
	fn stop(self) -> (Substr, Position);
}
632
/// In the parlance of language parsing, "tokens" are the smallest discrete
/// chunks of meaningful information that can be extracted from some raw input,
/// like the text of a source file.
///
/// If the individual characters of that text are thought of as _atoms_, then
/// its "tokens" could be considered the _molecules_: words, punctuation marks,
/// mathematical operators, etc.
///
/// In Gramatika, this trait is usually [derived] (along with its [`Lexer`]) for
/// some user-defined enum type, with regular-expression patterns specifying
/// the forms that can be taken by each of its variants. See the
/// [module-level documentation] for (much) more detail.
///
/// [derived]: gramatika_macro::Token
/// [module-level documentation]: crate::lexer
pub trait Token
where Self: Clone + Spanned
{
	/// A lightweight discriminant identifying each variant of the implementing
	/// token type (e.g., the derive macro generates a `TokenKind` enum with one
	/// unit variant per `Token` variant).
	type Kind: Copy + fmt::Debug + PartialEq + Eq + Hash + 'static;

	/// Find the first match for this token in a [`str`] slice, returning the
	/// start and end byte offsets and the matching [`Kind`](Token::Kind).
	///
	/// Returns `None` if no variant's pattern matches the input.
	fn find<S>(input: S) -> Option<(usize, usize, Self::Kind)>
	where S: AsRef<str>;

	/// Construct a token from its constituent parts.
	fn from_parts(kind: Self::Kind, substr: Substr, span: Span) -> Self;

	/// Returns the set of this token's variants which should be treated as
	/// multi-line patterns (i.e., the `.` character in regex patterns should
	/// match `\n` characters).
	///
	/// When using the [derive macro], returns the variants annotated with the
	/// `#[multiline]` attribute.
	///
	/// [derive macro]: macro@crate::Token
	fn multilines() -> &'static HashSet<Self::Kind>;

	/// Returns the set of this token's variants which should be discarded by the
	/// lexer when scanning.
	///
	/// When using the [derive macro], returns the variants annotated with the
	/// `#[discard]` attribute.
	///
	/// [derive macro]: macro@crate::Token
	fn discards() -> &'static HashSet<Self::Kind>;

	/// Returns the actual text content of a token.
	///
	/// ```
	/// #[macro_use]
	/// extern crate gramatika;
	///
	/// use gramatika::{
	/// 	arcstr::literal_substr,
	/// 	Lexer, Substr, Span, Token as _, TokenStream,
	/// };
	///
	/// # fn main() {
	/// #[derive(Token)]
	/// enum Token {
	/// 	#[subset_of(Ident)]
	/// 	#[pattern = "var"]
	/// 	Keyword(Substr, Span),
	///
	/// 	#[pattern = "[a-zA-Z_][a-zA-Z0-9_]*"]
	/// 	Ident(Substr, Span),
	///
	/// 	#[pattern = "[0-9]+"]
	/// 	IntLiteral(Substr, Span),
	///
	/// 	#[pattern = "="]
	/// 	Operator(Substr, Span),
	///
	/// 	#[pattern = ";"]
	/// 	Punct(Substr, Span),
	/// }
	///
	/// let src = "var the_answer = 42;";
	/// let tokens = TokenStream::<Token>::new(src.into()).scan();
	///
	/// assert_eq!(tokens[1].lexeme(), literal_substr!("the_answer"));
	/// # }
	/// ```
	fn lexeme(&self) -> Substr;

	/// Returns the [`Kind`] of this token. Used in [`ParseStreamer`] methods like
	/// [`check_kind`] and [`consume_kind`]. Effectively a more user-friendly version of
	/// [`std::mem::discriminant`].
	///
	/// [`Kind`]: Token::Kind
	/// [`ParseStreamer`]: crate::parse::ParseStreamer
	/// [`check_kind`]: crate::parse::ParseStreamer::check_kind
	/// [`consume_kind`]: crate::parse::ParseStreamer::consume_kind
	///
	/// ```
	/// #[macro_use]
	/// extern crate gramatika;
	///
	/// use gramatika::{Lexer, Substr, Span, Token as _, TokenStream};
	///
	/// # fn main() {
	/// #[derive(Token)]
	/// enum Token {
	/// 	#[pattern = "[a-zA-Z_][a-zA-Z0-9_]*"]
	/// 	Ident(Substr, Span),
	/// }
	///
	/// let input = "foo";
	/// let token = TokenStream::<Token>::new(input.into())
	/// 	.scan_token()
	/// 	.expect("Expected to match Ident `foo`");
	///
	/// assert_eq!(token.kind(), TokenKind::Ident);
	/// # }
	/// ```
	fn kind(&self) -> Self::Kind;

	/// Decomposes the token into its constituent ([`Kind`], [`lexeme`] and
	/// [`Span`]) parts, with the `lexeme` as a [`&str`] slice for compatibility
	/// with pattern matching.
	///
	/// [`Kind`]: Token::Kind
	/// [`lexeme`]: Token::lexeme
	/// [`span`]: crate::Span
	/// [`&str`]: prim@str
	///
	/// ```
	/// #[macro_use]
	/// extern crate gramatika;
	///
	/// use gramatika::{Lexer, Substr, Span, Token as _, TokenStream};
	///
	/// # fn main() {
	/// #[derive(Token)]
	/// enum Token {
	/// 	#[pattern = "[a-zA-Z_][a-zA-Z0-9_]*"]
	/// 	Ident(Substr, Span),
	/// }
	///
	/// let input = "foo";
	/// let token = TokenStream::<Token>::new(input.into())
	/// 	.scan_token()
	/// 	.expect("Expected to match Ident `foo`");
	///
	/// assert!(matches!(token.as_matchable(), (TokenKind::Ident, "foo", _)));
	/// # }
	/// ```
	fn as_matchable(&self) -> (Self::Kind, &str, Span);
}
783
/// A concrete implementation of the [`Lexer`] interface.
pub struct TokenStream<T> {
	/// The complete source text being scanned.
	input: SourceStr,
	/// The not-yet-scanned tail of `input`; advanced as tokens are matched.
	remaining: Substr,
	/// Position (zero-indexed line/character) where the next token starts --
	/// used as the `start` of each emitted [`Span`].
	current: Position,
	/// Position just past the most recently matched text -- used as the `end`
	/// of each emitted [`Span`].
	lookahead: Position,
	/// Binds the stream to its token type `T` without storing a value of it.
	_marker: PhantomData<T>,
}
792
793impl<T> Lexer for TokenStream<T>
794where T: Token
795{
796 type Output = T;
797
798 fn new(input: SourceStr) -> Self {
799 Self {
800 remaining: input.substr(..),
801 input,
802 current: Position::default(),
803 lookahead: Position::default(),
804 _marker: Default::default(),
805 }
806 }
807
808 fn source(&self) -> SourceStr {
809 self.input.clone()
810 }
811
812 fn scan(&mut self) -> Vec<T> {
813 let mut output = vec![];
814 while let Some(token) = self.scan_token() {
815 output.push(token);
816 }
817
818 output
819 }
820
821 fn scan_token(&mut self) -> Option<T> {
822 match <T as Token>::find(&self.remaining) {
823 Some((start, end, kind)) => {
824 let lexeme = self.remaining.substr(start..end);
825
826 if <T as Token>::multilines().contains(&kind) {
827 let mut line_inc = 0_usize;
828 let mut remaining = lexeme.as_str();
829
830 while let Some(idx) = remaining.find('\n') {
831 line_inc += 1;
832 remaining = &remaining[idx + 1..];
833 }
834 let char_inc = remaining.len();
835
836 self.lookahead.line += line_inc;
837
838 if line_inc > 0 {
839 self.lookahead.character = char_inc;
840 } else {
841 self.lookahead.character += char_inc;
842 }
843 } else {
844 self.lookahead.character += end;
845 }
846
847 let span = Span {
848 start: self.current,
849 end: self.lookahead,
850 };
851
852 let token = <T as Token>::from_parts(kind, lexeme, span);
853
854 self.remaining = self.remaining.substr(end..);
855 self.current = self.lookahead;
856
857 if <T as Token>::discards().contains(&kind) {
858 self.scan_token()
859 } else {
860 Some(token)
861 }
862 }
863 None => self.remaining.clone().chars().next().and_then(|c| match c {
864 '\n' => {
865 self.lookahead.line += 1;
866 self.lookahead.character = 0;
867 self.current = self.lookahead;
868 self.remaining = self.remaining.substr(1..);
869
870 self.scan_token()
871 }
872 other if other.is_whitespace() => {
873 let len = other.len_utf8();
874
875 self.lookahead.character += len;
876 self.current.character += len;
877 self.remaining = self.remaining.substr(len..);
878
879 self.scan_token()
880 }
881 other => panic!("Unsupported input: `{other}` at {:?}", self.current),
882 }),
883 }
884 }
885}
886
887impl<T> PartialLexer for TokenStream<T> {
888 fn from_remaining(input: SourceStr, position: Position) -> Self {
889 Self {
890 remaining: input.substr(..),
891 input,
892 current: position,
893 lookahead: position,
894 _marker: Default::default(),
895 }
896 }
897
898 fn stop(self) -> (Substr, Position) {
899 (self.remaining, self.current)
900 }
901}