solar_parse/lexer/cursor/token.rs
1//! Raw, low-level tokens. Created using [`Cursor`](crate::Cursor).
2
3use solar_ast::{Base, StrKind};
4
5/// A raw token.
6///
7/// It doesn't contain information about data that has been parsed, only the type of the token and
8/// its size.
9#[derive(Clone, Debug, PartialEq, Eq)]
10pub struct RawToken {
11 /// The kind of token.
12 pub kind: RawTokenKind,
13 /// The length of the token in bytes.
14 pub len: u32,
15}
16
17impl RawToken {
18 /// The [`EOF`](RawTokenKind::Eof) token with length 0.
19 pub const EOF: Self = Self::new(RawTokenKind::Eof, 0);
20
21 /// Creates a new token.
22 #[inline]
23 pub const fn new(kind: RawTokenKind, len: u32) -> Self {
24 Self { kind, len }
25 }
26}
27
28/// Common lexeme types.
29#[derive(Clone, Copy, Debug, PartialEq, Eq)]
30pub enum RawTokenKind {
31 // Multi-char tokens:
32 /// `// comment`
33 ///
34 /// `/// doc comment`
35 LineComment { is_doc: bool },
36
37 /// `/* block comment */`
38 ///
39 /// `/** block doc comment */`
40 BlockComment { is_doc: bool, terminated: bool },
41
42 /// Any whitespace character sequence.
43 Whitespace,
44
45 /// `ident` or `continue`
46 ///
47 /// At this step, keywords are also considered identifiers.
48 Ident,
49
50 /// Examples: `123`, `0x123`, `hex"123"`. Note that `_` is an invalid
51 /// suffix, but may be present here on string and float literals. Users of
52 /// this type will need to check for and reject that case.
53 ///
54 /// See [`RawLiteralKind`] for more details.
55 Literal { kind: RawLiteralKind },
56
57 // One-char tokens:
58 /// `;`
59 Semi,
60 /// `,`
61 Comma,
62 /// `.`
63 Dot,
64 /// `(`
65 OpenParen,
66 /// `)`
67 CloseParen,
68 /// `{`
69 OpenBrace,
70 /// `}`
71 CloseBrace,
72 /// `[`
73 OpenBracket,
74 /// `]`
75 CloseBracket,
76 /// `~`
77 Tilde,
78 /// `?`
79 Question,
80 /// `:`
81 Colon,
82 /// `=`
83 Eq,
84 /// `!`
85 Bang,
86 /// `<`
87 Lt,
88 /// `>`
89 Gt,
90 /// `-`
91 Minus,
92 /// `&`
93 And,
94 /// `|`
95 Or,
96 /// `+`
97 Plus,
98 /// `*`
99 Star,
100 /// `/`
101 Slash,
102 /// `^`
103 Caret,
104 /// `%`
105 Percent,
106
107 /// Unknown token, not expected by the lexer, e.g. `№`
108 Unknown,
109
110 /// End of input.
111 Eof,
112}
113
114impl RawTokenKind {
115 /// Returns `true` if this token is EOF.
116 #[inline]
117 pub const fn is_eof(&self) -> bool {
118 matches!(self, Self::Eof)
119 }
120
121 /// Returns `true` if this token is a line comment or a block comment.
122 #[inline]
123 pub const fn is_comment(&self) -> bool {
124 matches!(self, Self::LineComment { .. } | Self::BlockComment { .. })
125 }
126
127 /// Returns `true` if this token is a whitespace, line comment, or block comment.
128 #[inline]
129 pub const fn is_trivial(&self) -> bool {
130 matches!(self, Self::Whitespace | Self::LineComment { .. } | Self::BlockComment { .. })
131 }
132}
133
134/// The literal types supported by the lexer.
135#[derive(Clone, Copy, Debug, PartialEq, Eq)]
136pub enum RawLiteralKind {
137 /// `123`, `0x123`; empty_int: `0x`
138 Int { base: Base, empty_int: bool },
139 /// `123.321`, `1.2e3`, `.2e3`; empty_exponent: `2e`, `2.3e`, `.3e`
140 Rational { base: Base, empty_exponent: bool },
141 /// `"abc"`, `"abc`; `unicode"abc"`, `unicode"abc`; `hex"abc"`, `hex"abc`
142 Str { kind: StrKind, terminated: bool },
143}