// logos/lexer.rs
use core::fmt::{self, Debug};
use core::ops::{Deref, DerefMut};

use super::internal::LexerInternal;
use super::Logos;
use crate::source::{self, Source};
7
/// Byte range in the source (`start..end` offsets of a token).
pub type Span = core::ops::Range<usize>;
10
11/// `Lexer` is the main struct of the crate that allows you to read through a
12/// `Source` and produce tokens for enums implementing the `Logos` trait.
13pub struct Lexer<'source, Token: Logos<'source>> {
14    source: &'source Token::Source,
15
16    token_start: usize,
17    token_end: usize,
18
19    /// Extras associated with the `Token`.
20    pub extras: Token::Extras,
21}
22
23impl<'source, Token> Debug for Lexer<'source, Token>
24where
25    Token: Logos<'source>,
26    Token::Source: Debug,
27    Token::Extras: Debug,
28{
29    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
30        fmt.debug_map()
31            .entry(&"source", &self.source)
32            .entry(&"extras", &self.extras)
33            .finish()
34    }
35}
36
37impl<'source, Token: Logos<'source>> Lexer<'source, Token> {
38    /// Create a new `Lexer`.
39    ///
40    /// Due to type inference, it might be more ergonomic to construct
41    /// it by calling [`Token::lexer`](./trait.Logos.html#method.lexer) on any `Token` with derived `Logos`.
42    pub fn new(source: &'source Token::Source) -> Self
43    where
44        Token::Extras: Default,
45    {
46        Self::with_extras(source, Default::default())
47    }
48
49    /// Create a new `Lexer` with the provided `Extras`.
50    ///
51    /// Due to type inference, it might be more ergonomic to construct
52    /// it by calling [`Token::lexer_with_extras`](./trait.Logos.html#method.lexer_with_extras) on any `Token` with derived `Logos`.
53    pub fn with_extras(source: &'source Token::Source, extras: Token::Extras) -> Self {
54        Lexer {
55            source,
56            extras,
57            token_start: 0,
58            token_end: 0,
59        }
60    }
61
62    /// Source from which this Lexer is reading tokens.
63    #[inline]
64    pub fn source(&self) -> &'source Token::Source {
65        self.source
66    }
67
68    /// Wrap the `Lexer` in an [`Iterator`](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
69    /// that produces tuples of `(Token, `[`Span`](./type.Span.html)`)`.
70    ///
71    /// # Example
72    ///
73    /// ```
74    /// use logos::Logos;
75    ///
76    /// #[derive(Debug, PartialEq, Clone, Default)]
77    /// enum LexingError {
78    ///     NumberParseError,
79    ///     #[default]
80    ///     Other
81    /// }
82    ///
83    /// impl From<std::num::ParseIntError> for LexingError {
84    ///    fn from(_: std::num::ParseIntError) -> Self {
85    ///       LexingError::NumberParseError
86    ///   }
87    /// }
88    ///
89    /// impl From<std::num::ParseFloatError> for LexingError {
90    ///   fn from(_: std::num::ParseFloatError) -> Self {
91    ///      LexingError::NumberParseError
92    ///   }
93    /// }
94    ///
95    /// #[derive(Logos, Debug, PartialEq)]
96    /// #[logos(error = LexingError)]
97    /// enum Example {
98    ///     #[regex(r"[ \n\t\f]+", logos::skip)]
99    ///     Ignored,
100    ///
101    ///     #[regex("-?[0-9]+", |lex| lex.slice().parse())]
102    ///     Integer(i64),
103    ///
104    ///     #[regex("-?[0-9]+\\.[0-9]+", |lex| lex.slice().parse())]
105    ///     Float(f64),
106    /// }
107    ///
108    /// let tokens: Vec<_> = Example::lexer("42 3.14 -5 f").spanned().collect();
109    ///
110    /// assert_eq!(
111    ///     tokens,
112    ///     &[
113    ///         (Ok(Example::Integer(42)), 0..2),
114    ///         (Ok(Example::Float(3.14)), 3..7),
115    ///         (Ok(Example::Integer(-5)), 8..10),
116    ///         (Err(LexingError::Other), 11..12), // 'f' is not a recognized token
117    ///     ],
118    /// );
119    /// ```
120    #[inline]
121    pub fn spanned(self) -> SpannedIter<'source, Token> {
122        SpannedIter { lexer: self }
123    }
124
125    #[inline]
126    #[doc(hidden)]
127    #[deprecated(since = "0.11.0", note = "please use `span` instead")]
128    pub fn range(&self) -> Span {
129        self.span()
130    }
131
132    /// Get the range for the current token in `Source`.
133    #[inline]
134    pub fn span(&self) -> Span {
135        self.token_start..self.token_end
136    }
137
138    /// Get a string slice of the current token.
139    #[inline]
140    pub fn slice(&self) -> <Token::Source as Source>::Slice<'source> {
141        // SAFETY: in bounds if `token_start` and `token_end` are in bounds.
142        // * `token_start` is initially zero and is set to `token_end` in `next`, so
143        //   it remains in bounds as long as `token_end` remains in bounds.
144        // * `token_end` is initially zero and is only incremented in `bump`. `bump`
145        //   will panic if `Source::is_boundary` is false.
146        // * Thus safety is contingent on the correct implementation of the `is_boundary`
147        //   method.
148        #[cfg(not(feature = "forbid_unsafe"))]
149        unsafe {
150            self.source.slice_unchecked(self.span())
151        }
152        #[cfg(feature = "forbid_unsafe")]
153        self.source.slice(self.span()).unwrap()
154    }
155
156    /// Get a slice of remaining source, starting at the end of current token.
157    #[inline]
158    pub fn remainder(&self) -> <Token::Source as Source>::Slice<'source> {
159        #[cfg(not(feature = "forbid_unsafe"))]
160        unsafe {
161            self.source
162                .slice_unchecked(self.token_end..self.source.len())
163        }
164        #[cfg(feature = "forbid_unsafe")]
165        self.source
166            .slice(self.token_end..self.source.len())
167            .unwrap()
168    }
169
170    /// Turn this lexer into a lexer for a new token type.
171    ///
172    /// The new lexer continues to point at the same span as the current lexer,
173    /// and the current token becomes the error token of the new token type.
174    pub fn morph<Token2>(self) -> Lexer<'source, Token2>
175    where
176        Token2: Logos<'source, Source = Token::Source>,
177        Token::Extras: Into<Token2::Extras>,
178    {
179        Lexer {
180            source: self.source,
181            extras: self.extras.into(),
182            token_start: self.token_start,
183            token_end: self.token_end,
184        }
185    }
186
187    /// Bumps the end of currently lexed token by `n` bytes.
188    ///
189    /// # Panics
190    ///
191    /// Panics if adding `n` to current offset would place the `Lexer` beyond the last byte,
192    /// or in the middle of an UTF-8 code point (does not apply when lexing raw `&[u8]`).
193    pub fn bump(&mut self, n: usize) {
194        self.token_end += n;
195
196        assert!(
197            self.source.is_boundary(self.token_end),
198            "Invalid Lexer bump",
199        )
200    }
201}
202
203impl<'source, Token> Clone for Lexer<'source, Token>
204where
205    Token: Logos<'source> + Clone,
206    Token::Extras: Clone,
207{
208    fn clone(&self) -> Self {
209        Lexer {
210            extras: self.extras.clone(),
211            ..*self
212        }
213    }
214}
215
216impl<'source, Token> Iterator for Lexer<'source, Token>
217where
218    Token: Logos<'source>,
219{
220    type Item = Result<Token, Token::Error>;
221
222    #[inline]
223    fn next(&mut self) -> Option<Result<Token, Token::Error>> {
224        self.token_start = self.token_end;
225
226        Token::lex(self)
227    }
228}
229
230/// Iterator that pairs tokens with their position in the source.
231///
232/// Look at [`Lexer::spanned`](./struct.Lexer.html#method.spanned) for documentation.
233pub struct SpannedIter<'source, Token: Logos<'source>> {
234    lexer: Lexer<'source, Token>,
235}
236
237// deriving Clone doesn't infer the necessary `Token::Extras: Clone` bound
238impl<'source, Token> Clone for SpannedIter<'source, Token>
239where
240    Token: Logos<'source> + Clone,
241    Token::Extras: Clone,
242{
243    fn clone(&self) -> Self {
244        SpannedIter {
245            lexer: self.lexer.clone(),
246        }
247    }
248}
249
250impl<'source, Token> Iterator for SpannedIter<'source, Token>
251where
252    Token: Logos<'source>,
253{
254    type Item = (Result<Token, Token::Error>, Span);
255
256    fn next(&mut self) -> Option<Self::Item> {
257        self.lexer.next().map(|token| (token, self.lexer.span()))
258    }
259}
260
261impl<'source, Token> Deref for SpannedIter<'source, Token>
262where
263    Token: Logos<'source>,
264{
265    type Target = Lexer<'source, Token>;
266
267    fn deref(&self) -> &Lexer<'source, Token> {
268        &self.lexer
269    }
270}
271
272impl<'source, Token> DerefMut for SpannedIter<'source, Token>
273where
274    Token: Logos<'source>,
275{
276    fn deref_mut(&mut self) -> &mut Lexer<'source, Token> {
277        &mut self.lexer
278    }
279}
280
281#[doc(hidden)]
282/// # WARNING!
283///
284/// **This trait, and its methods, are not meant to be used outside of the
285/// code produced by `#[derive(Logos)]` macro.**
286impl<'source, Token> LexerInternal<'source> for Lexer<'source, Token>
287where
288    Token: Logos<'source>,
289{
290    type Token = Token;
291
292    /// Read a `Chunk` at current position of the `Lexer`. If end
293    /// of the `Source` has been reached, this will return `0`.
294    #[inline]
295    fn read<Chunk>(&self, offset: usize) -> Option<Chunk>
296    where
297        Chunk: source::Chunk<'source>,
298    {
299        self.source.read(offset)
300    }
301
302    /// Reset `token_start` to `token_end`.
303    #[inline]
304    fn trivia(&mut self) {
305        self.token_start = self.token_end;
306    }
307
308    /// Set the current token to appropriate `#[error]` variant.
309    /// Guarantee that `token_end` is at char boundary for `&str`.
310    #[inline]
311    fn end_to_boundary(&mut self, offset: usize) {
312        self.token_end = self.source.find_boundary(offset);
313    }
314
315    #[inline]
316    fn end(&mut self, offset: usize) {
317        self.token_end = offset;
318    }
319
320    #[inline]
321    fn offset(&self) -> usize {
322        self.token_start
323    }
324}