// logos/lexer.rs

1use super::internal::LexerInternal;
2use super::Logos;
3use crate::source::{self, Source};
4
5use core::fmt::{self, Debug};
6use core::ops::{Deref, DerefMut};
7
/// Byte range in the source, as returned by [`Lexer::span`].
///
/// Both endpoints are byte offsets into the `Source`, not char indices.
pub type Span = core::ops::Range<usize>;
10
11/// `Lexer` is the main struct of the crate that allows you to read through a
12/// `Source` and produce tokens for enums implementing the `Logos` trait.
/// `Lexer` is the main struct of the crate that allows you to read through a
/// `Source` and produce tokens for enums implementing the `Logos` trait.
pub struct Lexer<'source, Token: Logos<'source>> {
    // Borrowed source the lexer reads from; exposed via `source()`.
    source: &'source Token::Source,

    // Temporary slot for the token produced by the most recent `Token::lex`
    // call; it is taken back out in `Iterator::next`. When unsafe is allowed,
    // it is wrapped in `ManuallyDrop` so `next` can move the value out with
    // `ManuallyDrop::take` instead of writing `None` back each time.
    #[cfg(not(feature = "forbid_unsafe"))]
    token: core::mem::ManuallyDrop<Option<Result<Token, Token::Error>>>,
    #[cfg(feature = "forbid_unsafe")]
    token: Option<Result<Token, Token::Error>>,

    // Byte offsets delimiting the current token: `token_start..token_end`.
    token_start: usize,
    token_end: usize,

    /// Extras associated with the `Token`.
    pub extras: Token::Extras,
}
27
28impl<'source, Token> Debug for Lexer<'source, Token>
29where
30    Token: Logos<'source>,
31    Token::Source: Debug,
32    Token::Extras: Debug,
33{
34    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
35        fmt.debug_map()
36            .entry(&"source", &self.source)
37            .entry(&"extras", &self.extras)
38            .finish()
39    }
40}
41
impl<'source, Token: Logos<'source>> Lexer<'source, Token> {
    /// Create a new `Lexer`.
    ///
    /// Due to type inference, it might be more ergonomic to construct
    /// it by calling [`Token::lexer`](./trait.Logos.html#method.lexer) on any `Token` with derived `Logos`.
    pub fn new(source: &'source Token::Source) -> Self
    where
        Token::Extras: Default,
    {
        Self::with_extras(source, Default::default())
    }

    /// Create a new `Lexer` with the provided `Extras`.
    ///
    /// Due to type inference, it might be more ergonomic to construct
    /// it by calling [`Token::lexer_with_extras`](./trait.Logos.html#method.lexer_with_extras) on any `Token` with derived `Logos`.
    pub fn with_extras(source: &'source Token::Source, extras: Token::Extras) -> Self {
        Lexer {
            source,
            // Empty slot; filled by `Token::lex` on every `next` call.
            token: Default::default(),
            extras,
            token_start: 0,
            token_end: 0,
        }
    }

    /// Source from which this Lexer is reading tokens.
    #[inline]
    pub fn source(&self) -> &'source Token::Source {
        self.source
    }

    /// Wrap the `Lexer` in an [`Iterator`](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
    /// that produces tuples of `(Token, `[`Span`](./type.Span.html)`)`.
    ///
    /// # Example
    ///
    /// ```
    /// use logos::Logos;
    ///
    /// #[derive(Debug, PartialEq, Clone, Default)]
    /// enum LexingError {
    ///     NumberParseError,
    ///     #[default]
    ///     Other
    /// }
    ///
    /// impl From<std::num::ParseIntError> for LexingError {
    ///    fn from(_: std::num::ParseIntError) -> Self {
    ///       LexingError::NumberParseError
    ///   }
    /// }
    ///
    /// impl From<std::num::ParseFloatError> for LexingError {
    ///   fn from(_: std::num::ParseFloatError) -> Self {
    ///      LexingError::NumberParseError
    ///   }
    /// }
    ///
    /// #[derive(Logos, Debug, PartialEq)]
    /// #[logos(error = LexingError)]
    /// enum Example {
    ///     #[regex(r"[ \n\t\f]+", logos::skip)]
    ///     Ignored,
    ///
    ///     #[regex("-?[0-9]+", |lex| lex.slice().parse())]
    ///     Integer(i64),
    ///
    ///     #[regex("-?[0-9]+\\.[0-9]+", |lex| lex.slice().parse())]
    ///     Float(f64),
    /// }
    ///
    /// let tokens: Vec<_> = Example::lexer("42 3.14 -5 f").spanned().collect();
    ///
    /// assert_eq!(
    ///     tokens,
    ///     &[
    ///         (Ok(Example::Integer(42)), 0..2),
    ///         (Ok(Example::Float(3.14)), 3..7),
    ///         (Ok(Example::Integer(-5)), 8..10),
    ///         (Err(LexingError::Other), 11..12), // 'f' is not a recognized token
    ///     ],
    /// );
    /// ```
    #[inline]
    pub fn spanned(self) -> SpannedIter<'source, Token> {
        SpannedIter { lexer: self }
    }

    // Deprecated alias for `span`, kept for backward compatibility.
    #[inline]
    #[doc(hidden)]
    #[deprecated(since = "0.11.0", note = "please use `span` instead")]
    pub fn range(&self) -> Span {
        self.span()
    }

    /// Get the range for the current token in `Source`.
    #[inline]
    pub fn span(&self) -> Span {
        self.token_start..self.token_end
    }

    /// Get a string slice of the current token.
    #[inline]
    pub fn slice(&self) -> <Token::Source as Source>::Slice<'source> {
        // SAFETY: in bounds if `token_start` and `token_end` are in bounds.
        // * `token_start` is initially zero and is set to `token_end` in `next`, so
        //   it remains in bounds as long as `token_end` remains in bounds.
        // * `token_end` is initially zero and is only incremented in `bump`. `bump`
        //   will panic if `Source::is_boundary` is false.
        // * Thus safety is contingent on the correct implementation of the `is_boundary`
        //   method.
        #[cfg(not(feature = "forbid_unsafe"))]
        unsafe {
            self.source.slice_unchecked(self.span())
        }
        // With `forbid_unsafe`, the checked `slice` is used; the `unwrap` holds
        // under the same boundary invariants described above.
        #[cfg(feature = "forbid_unsafe")]
        self.source.slice(self.span()).unwrap()
    }

    /// Get a slice of remaining source, starting at the end of current token.
    #[inline]
    pub fn remainder(&self) -> <Token::Source as Source>::Slice<'source> {
        // SAFETY: `token_end` is kept on a valid boundary by `bump` (see the
        // invariants documented in `slice`), and `self.source.len()` is the
        // trivially-in-bounds upper end of the source.
        #[cfg(not(feature = "forbid_unsafe"))]
        unsafe {
            self.source
                .slice_unchecked(self.token_end..self.source.len())
        }
        #[cfg(feature = "forbid_unsafe")]
        self.source
            .slice(self.token_end..self.source.len())
            .unwrap()
    }

    /// Turn this lexer into a lexer for a new token type.
    ///
    /// The new lexer continues to point at the same span as the current lexer,
    /// and the current token becomes the error token of the new token type.
    pub fn morph<Token2>(self) -> Lexer<'source, Token2>
    where
        Token2: Logos<'source, Source = Token::Source>,
        Token::Extras: Into<Token2::Extras>,
    {
        Lexer {
            source: self.source,
            // The pending token (if any) is dropped; the new lexer starts
            // with an empty slot but keeps the current span.
            token: Default::default(),
            extras: self.extras.into(),
            token_start: self.token_start,
            token_end: self.token_end,
        }
    }

    /// Bumps the end of currently lexed token by `n` bytes.
    ///
    /// # Panics
    ///
    /// Panics if adding `n` to current offset would place the `Lexer` beyond the last byte,
    /// or in the middle of an UTF-8 code point (does not apply when lexing raw `&[u8]`).
    pub fn bump(&mut self, n: usize) {
        self.token_end += n;

        // Upholds the boundary invariant that `slice`/`remainder` rely on.
        assert!(
            self.source.is_boundary(self.token_end),
            "Invalid Lexer bump",
        )
    }
}
209
210impl<'source, Token> Clone for Lexer<'source, Token>
211where
212    Token: Logos<'source> + Clone,
213    Token::Extras: Clone,
214{
215    fn clone(&self) -> Self {
216        Lexer {
217            extras: self.extras.clone(),
218            token: Default::default(),
219            ..*self
220        }
221    }
222}
223
impl<'source, Token> Iterator for Lexer<'source, Token>
where
    Token: Logos<'source>,
{
    type Item = Result<Token, Token::Error>;

    #[inline]
    fn next(&mut self) -> Option<Result<Token, Token::Error>> {
        // The next token starts where the previous one ended.
        self.token_start = self.token_end;

        // The derived lexer deposits its result into `self.token` through
        // the `LexerInternal` methods (`set`, `error`, `end`).
        Token::lex(self);

        // This basically treats self.token as a temporary field.
        // Since we always immediately return a newly set token here,
        // we don't have to replace it with `None` or manually drop
        // it later.
        // SAFETY: relies on `Token::lex` having just stored a fresh value in
        // `token` (see above), so each `take` moves out a value that was
        // written after the previous `take`.
        #[cfg(not(feature = "forbid_unsafe"))]
        unsafe {
            core::mem::ManuallyDrop::take(&mut self.token)
        }
        #[cfg(feature = "forbid_unsafe")]
        {
            self.token.take()
        }
    }
}
250
/// Iterator that pairs tokens with their position in the source.
///
/// Look at [`Lexer::spanned`](./struct.Lexer.html#method.spanned) for documentation.
pub struct SpannedIter<'source, Token: Logos<'source>> {
    // Wrapped lexer; also reachable from outside through `Deref`/`DerefMut`.
    lexer: Lexer<'source, Token>,
}
257
258// deriving Clone doesn't infer the necessary `Token::Extras: Clone` bound
259impl<'source, Token> Clone for SpannedIter<'source, Token>
260where
261    Token: Logos<'source> + Clone,
262    Token::Extras: Clone,
263{
264    fn clone(&self) -> Self {
265        SpannedIter {
266            lexer: self.lexer.clone(),
267        }
268    }
269}
270
271impl<'source, Token> Iterator for SpannedIter<'source, Token>
272where
273    Token: Logos<'source>,
274{
275    type Item = (Result<Token, Token::Error>, Span);
276
277    fn next(&mut self) -> Option<Self::Item> {
278        self.lexer.next().map(|token| (token, self.lexer.span()))
279    }
280}
281
282impl<'source, Token> Deref for SpannedIter<'source, Token>
283where
284    Token: Logos<'source>,
285{
286    type Target = Lexer<'source, Token>;
287
288    fn deref(&self) -> &Lexer<'source, Token> {
289        &self.lexer
290    }
291}
292
293impl<'source, Token> DerefMut for SpannedIter<'source, Token>
294where
295    Token: Logos<'source>,
296{
297    fn deref_mut(&mut self) -> &mut Lexer<'source, Token> {
298        &mut self.lexer
299    }
300}
301
#[doc(hidden)]
/// # WARNING!
///
/// **This trait, and its methods, are not meant to be used outside of the
/// code produced by `#[derive(Logos)]` macro.**
impl<'source, Token> LexerInternal<'source> for Lexer<'source, Token>
where
    Token: Logos<'source>,
{
    type Token = Token;

    /// Read a `Chunk` at current position of the `Lexer`. If end
    /// of the `Source` has been reached, this will return `None`.
    #[inline]
    fn read<Chunk>(&self) -> Option<Chunk>
    where
        Chunk: source::Chunk<'source>,
    {
        self.source.read(self.token_end)
    }

    /// Read a `Chunk` at a position offset by `n` bytes past the current
    /// end of the token; `None` past the end of the `Source`.
    #[inline]
    fn read_at<Chunk>(&self, n: usize) -> Option<Chunk>
    where
        Chunk: source::Chunk<'source>,
    {
        self.source.read(self.token_end + n)
    }

    /// Read the byte at offset `n` past `token_end` without bounds checks.
    ///
    /// # Safety
    ///
    /// The caller (the derived lexer) must guarantee that
    /// `token_end + n` is within the bounds of the source.
    #[inline]
    #[cfg(not(feature = "forbid_unsafe"))]
    unsafe fn read_byte_unchecked(&self, n: usize) -> u8 {
        self.source.read_byte_unchecked(self.token_end + n)
    }

    // Checked counterpart used when the `forbid_unsafe` feature is enabled.
    #[inline]
    #[cfg(feature = "forbid_unsafe")]
    fn read_byte(&self, n: usize) -> u8 {
        self.source.read_byte(self.token_end + n)
    }

    /// Test a chunk at current position with a closure.
    /// Returns `false` when no full chunk can be read.
    #[inline]
    fn test<T, F>(&self, test: F) -> bool
    where
        T: source::Chunk<'source>,
        F: FnOnce(T) -> bool,
    {
        match self.source.read::<T>(self.token_end) {
            Some(chunk) => test(chunk),
            None => false,
        }
    }

    /// Bump the position `Lexer` is reading from by `size`.
    #[inline]
    fn bump_unchecked(&mut self, size: usize) {
        // Only checked in debug builds; release builds trust the derived
        // lexer to stay within `source.len()`.
        debug_assert!(
            self.token_end + size <= self.source.len(),
            "Bumping out of bounds!"
        );

        self.token_end += size;
    }

    /// Reset `token_start` to `token_end`.
    #[inline]
    fn trivia(&mut self) {
        self.token_start = self.token_end;
    }

    /// Set the current token to appropriate `#[error]` variant.
    /// Guarantee that `token_end` is at char boundary for `&str`.
    #[inline]
    fn error(&mut self) {
        // Re-align `token_end` to a valid boundary so that later calls to
        // `slice`/`span` remain sound even after an error mid-codepoint.
        self.token_end = self.source.find_boundary(self.token_end);
        #[cfg(not(feature = "forbid_unsafe"))]
        {
            self.token = core::mem::ManuallyDrop::new(Some(Err(Token::Error::default())));
        }
        #[cfg(feature = "forbid_unsafe")]
        {
            self.token = Some(Err(Token::Error::default()));
        }
    }

    /// Signal end of input: leave the token slot empty (`None`) so that
    /// `Iterator::next` yields `None`.
    #[inline]
    fn end(&mut self) {
        self.token = Default::default();
    }

    /// Store a freshly lexed token (or error) for `Iterator::next` to take.
    #[inline]
    fn set(
        &mut self,
        token: Result<
            Self::Token,
            <<Self as LexerInternal<'source>>::Token as Logos<'source>>::Error,
        >,
    ) {
        #[cfg(not(feature = "forbid_unsafe"))]
        {
            self.token = core::mem::ManuallyDrop::new(Some(token));
        }
        #[cfg(feature = "forbid_unsafe")]
        {
            self.token = Some(token)
        }
    }
}
411}