// logos/lexer.rs
1use super::internal::LexerInternal;
2use super::Logos;
3use crate::source::{self, Source};
4
5use core::fmt::{self, Debug};
6use core::ops::{Deref, DerefMut};
7
/// Byte range in the source.
///
/// An alias for `core::ops::Range<usize>`; `start..end` are byte offsets
/// into the `Source` being lexed, as produced by [`Lexer::span`].
pub type Span = core::ops::Range<usize>;
10
/// `Lexer` is the main struct of the crate that allows you to read through a
/// `Source` and produce tokens for enums implementing the `Logos` trait.
pub struct Lexer<'source, Token: Logos<'source>> {
    // Borrowed source the lexer reads from; never mutated.
    source: &'source Token::Source,

    // Byte offset where the current token begins (inclusive).
    token_start: usize,
    // Byte offset just past the current token (exclusive).
    token_end: usize,

    /// Extras associated with the `Token`.
    pub extras: Token::Extras,
}
22
23impl<'source, Token> Debug for Lexer<'source, Token>
24where
25 Token: Logos<'source>,
26 Token::Source: Debug,
27 Token::Extras: Debug,
28{
29 fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
30 fmt.debug_map()
31 .entry(&"source", &self.source)
32 .entry(&"extras", &self.extras)
33 .finish()
34 }
35}
36
impl<'source, Token: Logos<'source>> Lexer<'source, Token> {
    /// Create a new `Lexer`.
    ///
    /// Due to type inference, it might be more ergonomic to construct
    /// it by calling [`Token::lexer`](./trait.Logos.html#method.lexer) on any `Token` with derived `Logos`.
    pub fn new(source: &'source Token::Source) -> Self
    where
        Token::Extras: Default,
    {
        Self::with_extras(source, Default::default())
    }

    /// Create a new `Lexer` with the provided `Extras`.
    ///
    /// Due to type inference, it might be more ergonomic to construct
    /// it by calling [`Token::lexer_with_extras`](./trait.Logos.html#method.lexer_with_extras) on any `Token` with derived `Logos`.
    pub fn with_extras(source: &'source Token::Source, extras: Token::Extras) -> Self {
        Lexer {
            source,
            extras,
            // Both offsets start at 0: no token has been lexed yet, and the
            // safety argument in `slice` relies on this initial state.
            token_start: 0,
            token_end: 0,
        }
    }

    /// Source from which this Lexer is reading tokens.
    #[inline]
    pub fn source(&self) -> &'source Token::Source {
        self.source
    }

    /// Wrap the `Lexer` in an [`Iterator`](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
    /// that produces tuples of `(Token, `[`Span`](./type.Span.html)`)`.
    ///
    /// # Example
    ///
    /// ```
    /// use logos::Logos;
    ///
    /// #[derive(Debug, PartialEq, Clone, Default)]
    /// enum LexingError {
    ///     NumberParseError,
    ///     #[default]
    ///     Other
    /// }
    ///
    /// impl From<std::num::ParseIntError> for LexingError {
    ///     fn from(_: std::num::ParseIntError) -> Self {
    ///         LexingError::NumberParseError
    ///     }
    /// }
    ///
    /// impl From<std::num::ParseFloatError> for LexingError {
    ///     fn from(_: std::num::ParseFloatError) -> Self {
    ///         LexingError::NumberParseError
    ///     }
    /// }
    ///
    /// #[derive(Logos, Debug, PartialEq)]
    /// #[logos(error = LexingError)]
    /// enum Example {
    ///     #[regex(r"[ \n\t\f]+", logos::skip)]
    ///     Ignored,
    ///
    ///     #[regex("-?[0-9]+", |lex| lex.slice().parse())]
    ///     Integer(i64),
    ///
    ///     #[regex("-?[0-9]+\\.[0-9]+", |lex| lex.slice().parse())]
    ///     Float(f64),
    /// }
    ///
    /// let tokens: Vec<_> = Example::lexer("42 3.14 -5 f").spanned().collect();
    ///
    /// assert_eq!(
    ///     tokens,
    ///     &[
    ///         (Ok(Example::Integer(42)), 0..2),
    ///         (Ok(Example::Float(3.14)), 3..7),
    ///         (Ok(Example::Integer(-5)), 8..10),
    ///         (Err(LexingError::Other), 11..12), // 'f' is not a recognized token
    ///     ],
    /// );
    /// ```
    #[inline]
    pub fn spanned(self) -> SpannedIter<'source, Token> {
        SpannedIter { lexer: self }
    }

    // Deprecated alias of `span`, kept for backward compatibility.
    #[inline]
    #[doc(hidden)]
    #[deprecated(since = "0.11.0", note = "please use `span` instead")]
    pub fn range(&self) -> Span {
        self.span()
    }

    /// Get the range for the current token in `Source`.
    #[inline]
    pub fn span(&self) -> Span {
        self.token_start..self.token_end
    }

    /// Get a string slice of the current token.
    #[inline]
    pub fn slice(&self) -> <Token::Source as Source>::Slice<'source> {
        // SAFETY: in bounds if `token_start` and `token_end` are in bounds.
        // * `token_start` is initially zero and is set to `token_end` in `next`, so
        //   it remains in bounds as long as `token_end` remains in bounds.
        // * `token_end` is initially zero and is only incremented in `bump`. `bump`
        //   will panic if `Source::is_boundary` is false.
        // * Thus safety is contingent on the correct implementation of the `is_boundary`
        //   method.
        #[cfg(not(feature = "forbid_unsafe"))]
        unsafe {
            self.source.slice_unchecked(self.span())
        }
        // With `forbid_unsafe`, fall back to the checked accessor; `unwrap` is
        // justified by the same in-bounds invariant as the unsafe path.
        #[cfg(feature = "forbid_unsafe")]
        self.source.slice(self.span()).unwrap()
    }

    /// Get a slice of remaining source, starting at the end of current token.
    #[inline]
    pub fn remainder(&self) -> <Token::Source as Source>::Slice<'source> {
        // SAFETY (non-forbid_unsafe path): `token_end` is kept in bounds by the
        // invariant documented in `slice`, and `self.source.len()` is trivially
        // a valid end bound.
        #[cfg(not(feature = "forbid_unsafe"))]
        unsafe {
            self.source
                .slice_unchecked(self.token_end..self.source.len())
        }
        #[cfg(feature = "forbid_unsafe")]
        self.source
            .slice(self.token_end..self.source.len())
            .unwrap()
    }

    /// Turn this lexer into a lexer for a new token type.
    ///
    /// The new lexer continues to point at the same span as the current lexer,
    /// and the current token becomes the error token of the new token type.
    pub fn morph<Token2>(self) -> Lexer<'source, Token2>
    where
        Token2: Logos<'source, Source = Token::Source>,
        Token::Extras: Into<Token2::Extras>,
    {
        Lexer {
            source: self.source,
            extras: self.extras.into(),
            token_start: self.token_start,
            token_end: self.token_end,
        }
    }

    /// Bumps the end of currently lexed token by `n` bytes.
    ///
    /// # Panics
    ///
    /// Panics if adding `n` to current offset would place the `Lexer` beyond the last byte,
    /// or in the middle of an UTF-8 code point (does not apply when lexing raw `&[u8]`).
    pub fn bump(&mut self, n: usize) {
        self.token_end += n;

        // `is_boundary` must reject any out-of-bounds or mid-code-point offset;
        // this assert is what upholds the safety contract used by `slice`.
        assert!(
            self.source.is_boundary(self.token_end),
            "Invalid Lexer bump",
        )
    }
}
202
203impl<'source, Token> Clone for Lexer<'source, Token>
204where
205 Token: Logos<'source> + Clone,
206 Token::Extras: Clone,
207{
208 fn clone(&self) -> Self {
209 Lexer {
210 extras: self.extras.clone(),
211 ..*self
212 }
213 }
214}
215
216impl<'source, Token> Iterator for Lexer<'source, Token>
217where
218 Token: Logos<'source>,
219{
220 type Item = Result<Token, Token::Error>;
221
222 #[inline]
223 fn next(&mut self) -> Option<Result<Token, Token::Error>> {
224 self.token_start = self.token_end;
225
226 Token::lex(self)
227 }
228}
229
/// Iterator that pairs tokens with their position in the source.
///
/// Look at [`Lexer::spanned`](./struct.Lexer.html#method.spanned) for documentation.
pub struct SpannedIter<'source, Token: Logos<'source>> {
    // Wrapped lexer; exposed read-only/mutably via the `Deref`/`DerefMut` impls.
    lexer: Lexer<'source, Token>,
}
236
237// deriving Clone doesn't infer the necessary `Token::Extras: Clone` bound
238impl<'source, Token> Clone for SpannedIter<'source, Token>
239where
240 Token: Logos<'source> + Clone,
241 Token::Extras: Clone,
242{
243 fn clone(&self) -> Self {
244 SpannedIter {
245 lexer: self.lexer.clone(),
246 }
247 }
248}
249
250impl<'source, Token> Iterator for SpannedIter<'source, Token>
251where
252 Token: Logos<'source>,
253{
254 type Item = (Result<Token, Token::Error>, Span);
255
256 fn next(&mut self) -> Option<Self::Item> {
257 self.lexer.next().map(|token| (token, self.lexer.span()))
258 }
259}
260
261impl<'source, Token> Deref for SpannedIter<'source, Token>
262where
263 Token: Logos<'source>,
264{
265 type Target = Lexer<'source, Token>;
266
267 fn deref(&self) -> &Lexer<'source, Token> {
268 &self.lexer
269 }
270}
271
272impl<'source, Token> DerefMut for SpannedIter<'source, Token>
273where
274 Token: Logos<'source>,
275{
276 fn deref_mut(&mut self) -> &mut Lexer<'source, Token> {
277 &mut self.lexer
278 }
279}
280
#[doc(hidden)]
/// # WARNING!
///
/// **This trait, and its methods, are not meant to be used outside of the
/// code produced by `#[derive(Logos)]` macro.**
impl<'source, Token> LexerInternal<'source> for Lexer<'source, Token>
where
    Token: Logos<'source>,
{
    type Token = Token;

    /// Read a `Chunk` at current position of the `Lexer`. If end
    /// of the `Source` has been reached, this will return `None`.
    #[inline]
    fn read<Chunk>(&self, offset: usize) -> Option<Chunk>
    where
        Chunk: source::Chunk<'source>,
    {
        self.source.read(offset)
    }

    /// Reset `token_start` to `token_end`.
    #[inline]
    fn trivia(&mut self) {
        // Discards the current span (e.g. skipped whitespace) so the next
        // token begins where this one ended.
        self.token_start = self.token_end;
    }

    /// Set the current token to appropriate `#[error]` variant.
    /// Guarantee that `token_end` is at char boundary for `&str`.
    #[inline]
    fn end_to_boundary(&mut self, offset: usize) {
        // NOTE(review): the direction in which `find_boundary` snaps a
        // mid-code-point offset is defined by the `Source` impl — confirm
        // in `source.rs` before relying on it.
        self.token_end = self.source.find_boundary(offset);
    }

    /// Move the end of the current token to `offset` (no boundary check;
    /// callers generated by the derive macro guarantee validity).
    #[inline]
    fn end(&mut self, offset: usize) {
        self.token_end = offset;
    }

    /// Byte offset at which the current token starts.
    #[inline]
    fn offset(&self) -> usize {
        self.token_start
    }
}
324}