wdl_grammar/
lexer.rs

1//! Module for the lexer implementation.
2
3use logos::Logos;
4
5use super::Span;
6use super::parser::ParserToken;
7use super::tree::SyntaxKind;
8
9pub mod v1;
10
/// Represents a set of tokens as a bitset.
///
/// As Rust does not currently support const functions in traits,
/// `TokenSet` operates on "raw" forms of tokens (i.e. `u8`).
///
/// This allows `TokenSet` to work with different token types but also
/// allow for the sets to be created in const contexts.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TokenSet(u128);

impl TokenSet {
    /// An empty token set.
    pub const EMPTY: Self = Self(0);

    /// Constructs a token set from a slice of tokens.
    pub const fn new(tokens: &[u8]) -> Self {
        // A `while` loop is used as `for` is not available in const contexts.
        let mut set = 0u128;
        let mut index = 0;
        while index < tokens.len() {
            set |= Self::mask(tokens[index]);
            index += 1;
        }
        Self(set)
    }

    /// Unions two token sets together.
    pub const fn union(self, other: Self) -> Self {
        Self(self.0 | other.0)
    }

    /// Returns a new token set with all tokens from `other` removed from
    /// `self`. If a token from `other` is not present in `self`, it has no
    /// effect.
    pub const fn without(self, other: Self) -> Self {
        Self(self.0 & !other.0)
    }

    /// Checks if the token is contained in the set.
    pub const fn contains(&self, token: u8) -> bool {
        Self::mask(token) & self.0 != 0
    }

    /// Gets the count of tokens in the set.
    pub const fn count(&self) -> usize {
        self.0.count_ones() as usize
    }

    /// Iterates the raw tokens in the set, in ascending order.
    pub fn iter(&self) -> impl Iterator<Item = u8> + use<> {
        let mut remaining = self.0;
        std::iter::from_fn(move || {
            if remaining == 0 {
                None
            } else {
                let token = remaining.trailing_zeros();
                // Clear the lowest set bit via the standard `x & (x - 1)`
                // trick so the next iteration yields the next token.
                remaining &= remaining - 1;
                Some(
                    u8::try_from(token)
                        .expect("the maximum token value should be less than 128"),
                )
            }
        })
    }

    /// Masks the given token to a `u128`.
    const fn mask(token: u8) -> u128 {
        1u128 << (token as usize)
    }
}
79
/// Represents a token for lexing WDL document preambles.
///
/// A WDL parser may initially use this token to lex the version
/// statement at the start of a WDL document.
///
/// Once the version statement has been parsed, the parser will then
/// [morph][Lexer::morph] the lexer to the appropriate token for the
/// document's WDL version and pass the lexer to the matching version
/// of the WDL grammar.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u8)]
pub enum PreambleToken {
    /// Contiguous whitespace.
    #[regex(r"[ \t\r\n]+")]
    Whitespace,

    /// A comment.
    ///
    /// A comment runs from `#` to the end of the line; the terminating
    /// newline is not part of the comment token.
    #[regex(r"#[^\r\n]*")]
    Comment,

    /// The `version` keyword.
    #[token("version")]
    VersionKeyword,

    /// Any other token that isn't whitespace, comment, or the `version`
    /// keyword.
    ///
    /// Note that this matches a *single* character at a time.
    #[regex("[^ \t\r\n#]")]
    Any,

    // WARNING: this must always be the last variant.
    /// The exclusive maximum token value.
    MAX,
}

/// Asserts that PreambleToken can fit in a TokenSet.
///
/// `TokenSet` is backed by a `u128` bitset, so every raw token value must be
/// less than 128; since `MAX` is the *exclusive* maximum, `MAX <= 128`
/// guarantees this.
const _: () = assert!(PreambleToken::MAX as u8 <= 128);
116
impl ParserToken<'_> for PreambleToken {
    fn into_syntax(self) -> SyntaxKind {
        match self {
            Self::Whitespace => SyntaxKind::Whitespace,
            Self::Comment => SyntaxKind::Comment,
            Self::VersionKeyword => SyntaxKind::VersionKeyword,
            // `Any` is never mapped to syntax and `MAX` is never lexed.
            Self::Any | Self::MAX => unreachable!(),
        }
    }

    fn into_raw(self) -> u8 {
        self as u8
    }

    fn from_raw(token: u8) -> Self {
        assert!(token < Self::MAX as u8, "invalid token value");
        // SAFETY: the enum is `repr(u8)` with contiguous discriminants
        // starting at 0, and the assertion above guarantees `token` is a
        // valid discriminant (i.e. strictly less than `MAX`).
        unsafe { std::mem::transmute(token) }
    }

    fn describe(self) -> &'static str {
        match self {
            Self::Whitespace => "whitespace",
            Self::Comment => "comment",
            Self::VersionKeyword => "`version` keyword",
            Self::Any | Self::MAX => unreachable!(),
        }
    }

    fn is_trivia(self) -> bool {
        matches!(self, Self::Whitespace | Self::Comment)
    }
}
149
/// Represents a token for lexing WDL version statements.
///
/// This exists as a separate token type because WDL versions and
/// identifiers overlap on their regex.
///
/// Therefore, version statements are tokenized separately from the rest
/// of the WDL document.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u8)]
pub enum VersionStatementToken {
    /// Contiguous whitespace.
    #[regex(r"[ \t\r\n]+")]
    Whitespace,

    /// A comment.
    ///
    /// A comment runs from `#` to the end of the line; the terminating
    /// newline is not part of the comment token.
    #[regex(r"#[^\r\n]*")]
    Comment,

    /// A WDL version.
    ///
    /// Intentionally permissive (e.g. it also matches `draft-3`) so that the
    /// parser can gracefully reject unsupported versions.
    #[regex(r"[a-zA-Z0-9][a-zA-Z0-9.\-]*")]
    Version,

    // WARNING: this must always be the last variant.
    /// The exclusive maximum token value.
    MAX,
}

/// Asserts that VersionStatementToken can fit in a TokenSet.
///
/// `TokenSet` is backed by a `u128` bitset, so every raw token value must be
/// less than 128; since `MAX` is the *exclusive* maximum, `MAX <= 128`
/// guarantees this.
const _: () = assert!(VersionStatementToken::MAX as u8 <= 128);
179
impl ParserToken<'_> for VersionStatementToken {
    fn into_syntax(self) -> SyntaxKind {
        match self {
            Self::Whitespace => SyntaxKind::Whitespace,
            Self::Comment => SyntaxKind::Comment,
            Self::Version => SyntaxKind::Version,
            // `MAX` is a sentinel and is never lexed.
            Self::MAX => unreachable!(),
        }
    }

    fn into_raw(self) -> u8 {
        self as u8
    }

    fn from_raw(token: u8) -> Self {
        assert!(token < Self::MAX as u8, "invalid token value");
        // SAFETY: the enum is `repr(u8)` with contiguous discriminants
        // starting at 0, and the assertion above guarantees `token` is a
        // valid discriminant (i.e. strictly less than `MAX`).
        unsafe { std::mem::transmute(token) }
    }

    fn describe(self) -> &'static str {
        match self {
            Self::Whitespace => "whitespace",
            Self::Comment => "comment",
            Self::Version => "version",
            Self::MAX => unreachable!(),
        }
    }

    fn is_trivia(self) -> bool {
        matches!(self, Self::Whitespace | Self::Comment)
    }
}
212
/// The result type for the lexer.
///
/// Lexer errors carry no payload (`()`); the offending span accompanies the
/// result in the `(result, span)` pairs the lexer yields.
pub type LexerResult<T> = Result<T, ()>;

/// Records information for a lexer peek operation.
///
/// See the [Lexer::peek] method.
#[derive(Debug, Clone, Copy)]
struct Peeked<T> {
    /// The result of the peek operation.
    result: LexerResult<T>,
    /// The span of the result.
    span: Span,
    /// The offset *before* the peek.
    ///
    /// This is used to discard the peek for morphing lexers.
    ///
    /// Specifically, this is the start of the span of the token lexed
    /// *before* the peek (`lexer.span().start` at the time of the peek); see
    /// [Lexer::morph] and [Lexer::consume_remainder] for how it is used to
    /// reposition a fresh lexer.
    offset: usize,
}
230
/// Implements a WDL lexer.
///
/// A lexer produces a stream of tokens from a WDL source string.
///
/// The token type `T` selects which grammar's tokens are produced (e.g.
/// [`PreambleToken`] or [`VersionStatementToken`]); use [`Lexer::morph`] to
/// switch token types mid-stream.
#[allow(missing_debug_implementations)]
#[derive(Clone)]
pub struct Lexer<'a, T>
where
    T: Logos<'a, Extras = ()>,
{
    /// The underlying logos lexer.
    lexer: logos::Lexer<'a, T>,
    /// The peeked token.
    ///
    /// `Some` only while a peek is buffered; consumed by the next
    /// [`Iterator::next`] call.
    peeked: Option<Peeked<T>>,
}
245
impl<'a, T> Lexer<'a, T>
where
    T: Logos<'a, Source = str, Error = (), Extras = ()> + Copy,
{
    /// Creates a new lexer for the given source string.
    pub fn new(source: &'a str) -> Self
    where
        T::Extras: Default,
    {
        Self {
            lexer: T::lexer(source),
            peeked: None,
        }
    }

    /// Gets the source string of the given span.
    pub fn source(&self, span: Span) -> &'a str {
        &self.lexer.source()[span.start()..span.end()]
    }

    /// Gets the length of the source.
    pub fn source_len(&self) -> usize {
        self.lexer.source().len()
    }

    /// Gets the current span of the lexer.
    pub fn span(&self) -> Span {
        self.lexer.span().into()
    }

    /// Peeks at the next token.
    ///
    /// The peeked token is buffered and returned again by subsequent peeks
    /// or by the next call to [`Iterator::next`], without advancing the
    /// underlying lexer twice.
    pub fn peek(&mut self) -> Option<(LexerResult<T>, Span)> {
        if self.peeked.is_none() {
            // Capture the start of the span of the token lexed *before* the
            // peek; `morph` and `consume_remainder` use it to reposition a
            // fresh lexer just before the peeked token.
            let offset = self.lexer.span().start;
            self.peeked = self.lexer.next().map(|r| Peeked {
                result: r,
                span: self.lexer.span().into(),
                offset,
            });
        }

        self.peeked.map(|p| (p.result, p.span))
    }

    /// Morph this lexer into a lexer for a new token type.
    ///
    /// The returned lexer continues to point at the same span
    /// as the current lexer.
    pub fn morph<T2>(self) -> Lexer<'a, T2>
    where
        T2: Logos<'a, Source = str, Error = (), Extras = ()> + Copy,
    {
        // If the lexer has peeked, we need to "reset" the lexer so that it is no longer
        // peeked; this allows the morphed lexer to lex the previously peeked
        // span
        let lexer = match self.peeked {
            Some(peeked) => {
                let mut lexer = T2::lexer(self.lexer.source());
                if peeked.offset > 0 {
                    // Skip ahead to the start of the token lexed before the
                    // peek and lex that one token under `T2`; this leaves
                    // the new lexer positioned at the start of the peeked
                    // span. NOTE(review): this assumes the preceding token
                    // lexes to the same span under `T2` as under `T` (true
                    // for the `version` keyword vs. the version regex) —
                    // confirm for any new token-type pairs.
                    //
                    // NOTE(review): if a token had been consumed starting at
                    // offset 0 and then peeked, `offset == 0` would skip this
                    // re-lex and the morphed lexer would re-emit that first
                    // token; presumably unreachable since morphing happens
                    // after the `version` keyword — confirm.
                    lexer.bump(peeked.offset);
                    lexer.next();
                }

                lexer
            }
            None => self.lexer.morph(),
        };

        Lexer {
            lexer,
            peeked: None,
        }
    }

    /// Consumes the remainder of the source, returning the span
    /// of the consumed text.
    ///
    /// Returns `None` if there is nothing left to consume.
    pub fn consume_remainder(&mut self) -> Option<Span> {
        // Reset the lexer if we've peeked
        if let Some(peeked) = self.peeked.take() {
            self.lexer = T::lexer(self.lexer.source());
            if peeked.offset > 0 {
                // Re-lex the token preceding the peek to reposition the
                // fresh lexer at the start of the peeked span (same trick as
                // in `morph`, but with an unchanged token type).
                self.lexer.bump(peeked.offset);
                self.lexer.next();
            }
        }

        // Bump the remaining source
        // Lex one more token (the formerly peeked one, if any) and then
        // extend its span to cover everything left in the source; the
        // resulting `span()` is the whole consumed remainder.
        self.lexer.next();
        self.lexer.bump(self.lexer.remainder().len());
        let span = self.lexer.span();
        assert!(self.next().is_none(), "lexer should be completed");
        if span.is_empty() {
            None
        } else {
            Some(span.into())
        }
    }
}
344
345impl<'a, T> Iterator for Lexer<'a, T>
346where
347    T: Logos<'a, Error = (), Extras = ()> + Copy,
348{
349    type Item = (LexerResult<T>, Span);
350
351    fn next(&mut self) -> Option<Self::Item> {
352        if let Some(peeked) = self.peeked.take() {
353            return Some((peeked.result, peeked.span));
354        }
355
356        self.lexer.next().map(|r| (r, self.lexer.span().into()))
357    }
358}
359
#[cfg(test)]
mod test {
    use pretty_assertions::assert_eq;

    use super::*;

    pub(crate) fn map<T>(
        (t, s): (LexerResult<T>, Span),
    ) -> (LexerResult<T>, std::ops::Range<usize>) {
        (t, s.start()..s.end())
    }

    /// Lexes `source` with a preamble lexer, asserting each expected
    /// preamble token in order; then morphs the lexer and asserts each
    /// expected version statement token in order.
    fn assert_tokens(
        source: &str,
        preamble: &[(PreambleToken, std::ops::Range<usize>)],
        version: &[(VersionStatementToken, std::ops::Range<usize>)],
    ) {
        let mut lexer = Lexer::<PreambleToken>::new(source);
        for (token, span) in preamble {
            assert_eq!(lexer.next().map(map).unwrap(), (Ok(*token), span.clone()));
        }

        let mut lexer: Lexer<'_, VersionStatementToken> = lexer.morph();
        for (token, span) in version {
            assert_eq!(lexer.next().map(map).unwrap(), (Ok(*token), span.clone()));
        }
    }

    #[test]
    fn test_version_1_0() {
        assert_tokens(
            "
# Test for 1.0 documents
version 1.0",
            &[
                (PreambleToken::Whitespace, 0..1),
                (PreambleToken::Comment, 1..25),
                (PreambleToken::Whitespace, 25..26),
                (PreambleToken::VersionKeyword, 26..33),
            ],
            &[
                (VersionStatementToken::Whitespace, 33..34),
                (VersionStatementToken::Version, 34..37),
            ],
        );
    }

    #[test]
    fn test_version_1_1() {
        assert_tokens(
            "
# Test for 1.1 documents
version 1.1",
            &[
                (PreambleToken::Whitespace, 0..1),
                (PreambleToken::Comment, 1..25),
                (PreambleToken::Whitespace, 25..26),
                (PreambleToken::VersionKeyword, 26..33),
            ],
            &[
                (VersionStatementToken::Whitespace, 33..34),
                (VersionStatementToken::Version, 34..37),
            ],
        );
    }

    #[test]
    fn test_version_draft3() {
        // Note: draft-3 documents aren't supported by `wdl`, but
        // the lexer needs to ensure it can lex any valid version
        // token so that the parser may gracefully reject parsing
        // the document.
        assert_tokens(
            "
# Test for draft-3 documents
version draft-3",
            &[
                (PreambleToken::Whitespace, 0..1),
                (PreambleToken::Comment, 1..29),
                (PreambleToken::Whitespace, 29..30),
                (PreambleToken::VersionKeyword, 30..37),
            ],
            &[
                (VersionStatementToken::Whitespace, 37..38),
                (VersionStatementToken::Version, 38..45),
            ],
        );
    }
}