plsql_parser/tokens.rs
1//! Token tape types.
2//!
3//! The token tape is the **lossless** representation of the source. Every
4//! token carries a byte-offset span; trivia (whitespace, comments) is
5//! preserved verbatim in a side-table. Round-tripping is:
6//!
7//! ```text
8//! reconstruct(token_tape(input)) == input // byte-for-byte
9//! ```
10//!
11//! This contract is enforced by the proptest in `tests/conformance.rs`.
12
13use plsql_core::Span;
14use serde::{Deserialize, Serialize};
15
16// ---------------------------------------------------------------------------
17// TokenKind
18// ---------------------------------------------------------------------------
19
20/// Discriminator for a syntactic token.
21///
22/// The set is deliberately coarse at this layer — backends map their
23/// internal token vocabulary into these kinds. The mapping is
24/// backend-private (R20).
25#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
26pub enum TokenKind {
27 // Literals
28 /// A string literal (`'hello'`, `q'[...]'`).
29 StringLiteral,
30 /// A numeric literal (`42`, `3.14`, `1e-3`).
31 NumericLiteral,
32 /// A quoted identifier (`"My_Table"`).
33 QuotedIdentifier,
34
35 // Keywords
36 /// A PL/SQL or SQL keyword (`SELECT`, `BEGIN`, `PACKAGE`).
37 Keyword,
38 /// A built-in Oracle function name treated as keyword contextually.
39 BuiltIn,
40
41 // Identifiers
42 /// An unquoted identifier (`EMPLOYEES`, `v_count`).
43 Identifier,
44
45 // Punctuation / delimiters
46 /// A semicolon (`;`).
47 Semicolon,
48 /// A forward slash (`/`) — statement terminator in SQL*Plus.
49 Slash,
50 /// A dot (`.`).
51 Dot,
52 /// A comma (`,`).
53 Comma,
54 /// An opening parenthesis (`(`).
55 LParen,
56 /// A closing parenthesis (`)`).
57 RParen,
58 /// An assignment operator (`:=`).
59 Assign,
60 /// The fat arrow (`=>`).
61 Arrow,
62 /// The pipe-pipe concatenation (`||`).
63 Concat,
64 /// Any other operator (`+`, `-`, `*`, `/`, `=`, `<`, `>`, etc.).
65 Operator,
66 /// An `@` or `@@` include directive.
67 IncludeDirective,
68 /// A `/` on a line by itself (SQL*Plus statement terminator).
69 StatementTerminator,
70
71 // Error
72 /// The backend could not classify this token.
73 Unknown,
74}
75
76// ---------------------------------------------------------------------------
77// Token
78// ---------------------------------------------------------------------------
79
80/// A single syntactic token in the token tape.
81#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
82pub struct Token {
83 /// What kind of token this is.
84 pub kind: TokenKind,
85 /// Byte-offset span in the original source.
86 pub span: Span,
87 /// The raw source text of this token (verbatim).
88 pub text: String,
89}
90
91impl Token {
92 #[must_use]
93 pub fn new(kind: TokenKind, span: Span, text: impl Into<String>) -> Self {
94 Self {
95 kind,
96 span,
97 text: text.into(),
98 }
99 }
100}
101
102// ---------------------------------------------------------------------------
103// Trivia
104// ---------------------------------------------------------------------------
105
106/// A piece of trivia — whitespace, comments, or other non-token source text
107/// that must be preserved for lossless round-tripping.
108#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
109pub enum Trivia {
110 /// Horizontal or vertical whitespace.
111 Whitespace(String),
112 /// A single-line comment (`-- ...`).
113 LineComment(String),
114 /// A block comment (`/* ... */`).
115 BlockComment(String),
116}
117
118// ---------------------------------------------------------------------------
119// TriviaTable
120// ---------------------------------------------------------------------------
121
122/// Maps each token index to the trivia that **precedes** it.
123///
124/// Index `i` in this table holds the trivia between token `i-1` and token `i`.
125/// Index `0` holds leading trivia (before the first token).
126/// Trailing trivia (after the last token) is stored at index `tokens.len()`.
127///
128/// This is a sparse mapping: if a token has no preceding trivia, the entry is
129/// an empty `Vec`.
130#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
131pub struct TriviaTable {
132 /// `leading[i]` = trivia preceding the i-th token. Trailing trivia goes
133 /// at index `tokens.len()`.
134 pub leading: Vec<Vec<Trivia>>,
135}
136
137impl TriviaTable {
138 #[must_use]
139 pub fn new() -> Self {
140 Self {
141 leading: Vec::new(),
142 }
143 }
144
145 /// Push a trivia entry for the given token index.
146 pub fn push(&mut self, token_index: usize, trivia: Trivia) {
147 while self.leading.len() <= token_index {
148 self.leading.push(Vec::new());
149 }
150 self.leading[token_index].push(trivia);
151 }
152
153 /// Get the trivia preceding the given token index.
154 #[must_use]
155 pub fn get(&self, token_index: usize) -> &[Trivia] {
156 self.leading.get(token_index).map_or(&[], |v| v.as_slice())
157 }
158
159 /// Total number of trivia entries across all tokens.
160 #[must_use]
161 pub fn total_count(&self) -> usize {
162 self.leading.iter().map(Vec::len).sum()
163 }
164}
165
166// ---------------------------------------------------------------------------
167// TokenTape
168// ---------------------------------------------------------------------------
169
170/// An ordered sequence of tokens representing the full lexed source.
171///
172/// Combined with the [`TriviaTable`], this allows perfect reconstruction
173/// of the original source text.
174#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
175pub struct TokenTape {
176 pub tokens: Vec<Token>,
177}
178
179impl TokenTape {
180 #[must_use]
181 pub fn new() -> Self {
182 Self { tokens: Vec::new() }
183 }
184
185 #[must_use]
186 pub fn len(&self) -> usize {
187 self.tokens.len()
188 }
189
190 #[must_use]
191 pub fn is_empty(&self) -> bool {
192 self.tokens.is_empty()
193 }
194
195 /// Push a token onto the tape.
196 pub fn push(&mut self, token: Token) {
197 self.tokens.push(token);
198 }
199
200 /// Reconstruct the original source text from the token tape + trivia.
201 ///
202 /// This is the lossless round-trip function. For a valid tape:
203 ///
204 /// ```text
205 /// reconstruct(tape, trivia) == original_source
206 /// ```
207 #[must_use]
208 pub fn reconstruct(&self, trivia: &TriviaTable) -> String {
209 let mut out = String::new();
210 for (i, token) in self.tokens.iter().enumerate() {
211 // Emit preceding trivia
212 for t in trivia.get(i) {
213 match t {
214 Trivia::Whitespace(s) | Trivia::LineComment(s) | Trivia::BlockComment(s) => {
215 out.push_str(s)
216 }
217 }
218 }
219 // Emit the token itself
220 out.push_str(&token.text);
221 }
222 // Trailing trivia (after last token)
223 for t in trivia.get(self.tokens.len()) {
224 match t {
225 Trivia::Whitespace(s) | Trivia::LineComment(s) | Trivia::BlockComment(s) => {
226 out.push_str(s)
227 }
228 }
229 }
230 out
231 }
232}
233
234// ---------------------------------------------------------------------------
235// Tests
236// ---------------------------------------------------------------------------
237
#[cfg(test)]
mod tests {
    use super::*;
    use plsql_core::{FileId, Position};

    /// Builds a span covering `len` bytes starting at `start`.
    ///
    /// NOTE(review): the first two `Position::new` arguments are fixed at
    /// `1, 1` — presumably line and column; confirm against `plsql_core`.
    fn span(start: u32, len: u32) -> Span {
        let file = FileId::new(0);
        let begin = Position::new(1, 1, start);
        let end = Position::new(1, 1, start + len);
        Span::new(file, begin, end)
    }

    /// Builds a tape from `(kind, start, len, text)` rows, in order.
    fn tape_of(rows: &[(TokenKind, u32, u32, &str)]) -> TokenTape {
        let mut tape = TokenTape::new();
        for &(kind, start, len, text) in rows {
            tape.push(Token::new(kind, span(start, len), text));
        }
        tape
    }

    /// Shorthand for a whitespace trivia entry.
    fn ws(text: &str) -> Trivia {
        Trivia::Whitespace(text.to_string())
    }

    #[test]
    fn empty_tape_reconstructs_to_empty_string() {
        let rebuilt = TokenTape::new().reconstruct(&TriviaTable::new());
        assert_eq!(rebuilt, "");
    }

    #[test]
    fn single_token_no_trivia() {
        let tape = tape_of(&[(TokenKind::Keyword, 0, 6, "SELECT")]);
        let rebuilt = tape.reconstruct(&TriviaTable::new());
        assert_eq!(rebuilt, "SELECT");
    }

    #[test]
    fn reconstruct_with_leading_and_inter_token_trivia() {
        let tape = tape_of(&[
            (TokenKind::Keyword, 2, 6, "SELECT"),
            (TokenKind::Identifier, 9, 4, "name"),
            (TokenKind::Keyword, 14, 4, "FROM"),
            (TokenKind::Identifier, 19, 5, "users"),
            (TokenKind::Semicolon, 24, 1, ";"),
        ]);

        // Slots 0..=3 each get one whitespace entry (before SELECT, name,
        // FROM and users); the semicolon at index 4 has no preceding trivia.
        let mut trivia = TriviaTable::new();
        for slot in 0..4 {
            trivia.push(slot, ws(" "));
        }

        assert_eq!(tape.reconstruct(&trivia), " SELECT name FROM users;");
    }

    #[test]
    fn reconstruct_preserves_comments() {
        let tape = tape_of(&[
            (TokenKind::Keyword, 12, 6, "SELECT"),
            (TokenKind::NumericLiteral, 19, 1, "1"),
            (TokenKind::Semicolon, 20, 1, ";"),
        ]);

        let mut trivia = TriviaTable::new();
        trivia.push(0, Trivia::LineComment("-- pick one\n".to_string()));
        trivia.push(1, ws(" "));

        assert_eq!(tape.reconstruct(&trivia), "-- pick one\nSELECT 1;");
    }

    #[test]
    fn trivia_table_total_count() {
        let mut trivia = TriviaTable::new();
        trivia.push(0, ws(" "));
        trivia.push(2, ws(" "));
        trivia.push(2, Trivia::LineComment("-- x".to_string()));
        assert_eq!(trivia.total_count(), 3);
    }

    #[test]
    fn trivia_table_get_out_of_bounds_returns_empty() {
        let trivia = TriviaTable::new();
        let missing: &[Trivia] = &[];
        assert_eq!(trivia.get(999), missing);
    }
}
317}