// plsql_parser/tokens.rs

//! Token tape types.
//!
//! The token tape is the **lossless** representation of the source.  Every
//! token carries a byte-offset span; trivia (whitespace, comments) is
//! preserved verbatim in a side-table.  Round-tripping is:
//!
//! ```text
//! reconstruct(token_tape(input)) == input   // byte-for-byte
//! ```
//!
//! This contract is enforced by the proptest in `tests/conformance.rs`.

use plsql_core::Span;
use serde::{Deserialize, Serialize};

// ---------------------------------------------------------------------------
// TokenKind
// ---------------------------------------------------------------------------

20/// Discriminator for a syntactic token.
21///
22/// The set is deliberately coarse at this layer — backends map their
23/// internal token vocabulary into these kinds.  The mapping is
24/// backend-private (R20).
25#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
26pub enum TokenKind {
27    // Literals
28    /// A string literal (`'hello'`, `q'[...]'`).
29    StringLiteral,
30    /// A numeric literal (`42`, `3.14`, `1e-3`).
31    NumericLiteral,
32    /// A quoted identifier (`"My_Table"`).
33    QuotedIdentifier,
34
35    // Keywords
36    /// A PL/SQL or SQL keyword (`SELECT`, `BEGIN`, `PACKAGE`).
37    Keyword,
38    /// A built-in Oracle function name treated as keyword contextually.
39    BuiltIn,
40
41    // Identifiers
42    /// An unquoted identifier (`EMPLOYEES`, `v_count`).
43    Identifier,
44
45    // Punctuation / delimiters
46    /// A semicolon (`;`).
47    Semicolon,
48    /// A forward slash (`/`) — statement terminator in SQL*Plus.
49    Slash,
50    /// A dot (`.`).
51    Dot,
52    /// A comma (`,`).
53    Comma,
54    /// An opening parenthesis (`(`).
55    LParen,
56    /// A closing parenthesis (`)`).
57    RParen,
58    /// An assignment operator (`:=`).
59    Assign,
60    /// The fat arrow (`=>`).
61    Arrow,
62    /// The pipe-pipe concatenation (`||`).
63    Concat,
64    /// Any other operator (`+`, `-`, `*`, `/`, `=`, `<`, `>`, etc.).
65    Operator,
66    /// An `@` or `@@` include directive.
67    IncludeDirective,
68    /// A `/` on a line by itself (SQL*Plus statement terminator).
69    StatementTerminator,
70
71    // Error
72    /// The backend could not classify this token.
73    Unknown,
74}

// ---------------------------------------------------------------------------
// Token
// ---------------------------------------------------------------------------

80/// A single syntactic token in the token tape.
81#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
82pub struct Token {
83    /// What kind of token this is.
84    pub kind: TokenKind,
85    /// Byte-offset span in the original source.
86    pub span: Span,
87    /// The raw source text of this token (verbatim).
88    pub text: String,
89}
91impl Token {
92    #[must_use]
93    pub fn new(kind: TokenKind, span: Span, text: impl Into<String>) -> Self {
94        Self {
95            kind,
96            span,
97            text: text.into(),
98        }
99    }
100}

// ---------------------------------------------------------------------------
// Trivia
// ---------------------------------------------------------------------------

106/// A piece of trivia — whitespace, comments, or other non-token source text
107/// that must be preserved for lossless round-tripping.
108#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
109pub enum Trivia {
110    /// Horizontal or vertical whitespace.
111    Whitespace(String),
112    /// A single-line comment (`-- ...`).
113    LineComment(String),
114    /// A block comment (`/* ... */`).
115    BlockComment(String),
116}

// ---------------------------------------------------------------------------
// TriviaTable
// ---------------------------------------------------------------------------

122/// Maps each token index to the trivia that **precedes** it.
123///
124/// Index `i` in this table holds the trivia between token `i-1` and token `i`.
125/// Index `0` holds leading trivia (before the first token).
126/// Trailing trivia (after the last token) is stored at index `tokens.len()`.
127///
128/// This is a sparse mapping: if a token has no preceding trivia, the entry is
129/// an empty `Vec`.
130#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
131pub struct TriviaTable {
132    /// `leading[i]` = trivia preceding the i-th token.  Trailing trivia goes
133    /// at index `tokens.len()`.
134    pub leading: Vec<Vec<Trivia>>,
135}
137impl TriviaTable {
138    #[must_use]
139    pub fn new() -> Self {
140        Self {
141            leading: Vec::new(),
142        }
143    }
144
145    /// Push a trivia entry for the given token index.
146    pub fn push(&mut self, token_index: usize, trivia: Trivia) {
147        while self.leading.len() <= token_index {
148            self.leading.push(Vec::new());
149        }
150        self.leading[token_index].push(trivia);
151    }
152
153    /// Get the trivia preceding the given token index.
154    #[must_use]
155    pub fn get(&self, token_index: usize) -> &[Trivia] {
156        self.leading.get(token_index).map_or(&[], |v| v.as_slice())
157    }
158
159    /// Total number of trivia entries across all tokens.
160    #[must_use]
161    pub fn total_count(&self) -> usize {
162        self.leading.iter().map(Vec::len).sum()
163    }
164}

// ---------------------------------------------------------------------------
// TokenTape
// ---------------------------------------------------------------------------

170/// An ordered sequence of tokens representing the full lexed source.
171///
172/// Combined with the [`TriviaTable`], this allows perfect reconstruction
173/// of the original source text.
174#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
175pub struct TokenTape {
176    pub tokens: Vec<Token>,
177}
179impl TokenTape {
180    #[must_use]
181    pub fn new() -> Self {
182        Self { tokens: Vec::new() }
183    }
184
185    #[must_use]
186    pub fn len(&self) -> usize {
187        self.tokens.len()
188    }
189
190    #[must_use]
191    pub fn is_empty(&self) -> bool {
192        self.tokens.is_empty()
193    }
194
195    /// Push a token onto the tape.
196    pub fn push(&mut self, token: Token) {
197        self.tokens.push(token);
198    }
199
200    /// Reconstruct the original source text from the token tape + trivia.
201    ///
202    /// This is the lossless round-trip function.  For a valid tape:
203    ///
204    /// ```text
205    /// reconstruct(tape, trivia) == original_source
206    /// ```
207    #[must_use]
208    pub fn reconstruct(&self, trivia: &TriviaTable) -> String {
209        let mut out = String::new();
210        for (i, token) in self.tokens.iter().enumerate() {
211            // Emit preceding trivia
212            for t in trivia.get(i) {
213                match t {
214                    Trivia::Whitespace(s) | Trivia::LineComment(s) | Trivia::BlockComment(s) => {
215                        out.push_str(s)
216                    }
217                }
218            }
219            // Emit the token itself
220            out.push_str(&token.text);
221        }
222        // Trailing trivia (after last token)
223        for t in trivia.get(self.tokens.len()) {
224            match t {
225                Trivia::Whitespace(s) | Trivia::LineComment(s) | Trivia::BlockComment(s) => {
226                    out.push_str(s)
227                }
228            }
229        }
230        out
231    }
232}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use plsql_core::{FileId, Position};

    /// Build a span in file 0 from `start` to `start + len`.
    ///
    /// The first two `Position::new` arguments are fixed at 1 (presumably
    /// line and column — confirm against `plsql_core`); only the third
    /// argument varies between tests.
    fn span(start: u32, len: u32) -> Span {
        Span::new(
            FileId::new(0),
            Position::new(1, 1, start),
            Position::new(1, 1, start + len),
        )
    }

    #[test]
    fn empty_tape_reconstructs_to_empty_string() {
        let tape = TokenTape::new();
        let trivia = TriviaTable::new();
        assert_eq!(tape.reconstruct(&trivia), "");
    }

    #[test]
    fn single_token_no_trivia() {
        let mut tape = TokenTape::new();
        tape.push(Token::new(TokenKind::Keyword, span(0, 6), "SELECT"));
        let trivia = TriviaTable::new();
        assert_eq!(tape.reconstruct(&trivia), "SELECT");
    }

    #[test]
    fn reconstruct_with_leading_and_inter_token_trivia() {
        let mut tape = TokenTape::new();
        tape.push(Token::new(TokenKind::Keyword, span(2, 6), "SELECT"));
        tape.push(Token::new(TokenKind::Identifier, span(9, 4), "name"));
        tape.push(Token::new(TokenKind::Keyword, span(14, 4), "FROM"));
        tape.push(Token::new(TokenKind::Identifier, span(19, 5), "users"));
        tape.push(Token::new(TokenKind::Semicolon, span(24, 1), ";"));

        let mut trivia = TriviaTable::new();
        // Leading whitespace before SELECT
        trivia.push(0, Trivia::Whitespace("  ".to_string()));
        // Whitespace between SELECT and name
        trivia.push(1, Trivia::Whitespace(" ".to_string()));
        // Whitespace between name and FROM
        trivia.push(2, Trivia::Whitespace(" ".to_string()));
        // Whitespace between FROM and users
        trivia.push(3, Trivia::Whitespace(" ".to_string()));
        // No trivia before semicolon

        assert_eq!(tape.reconstruct(&trivia), "  SELECT name FROM users;");
    }

    #[test]
    fn reconstruct_preserves_comments() {
        let mut tape = TokenTape::new();
        tape.push(Token::new(TokenKind::Keyword, span(12, 6), "SELECT"));
        tape.push(Token::new(TokenKind::NumericLiteral, span(19, 1), "1"));
        tape.push(Token::new(TokenKind::Semicolon, span(20, 1), ";"));

        let mut trivia = TriviaTable::new();
        trivia.push(0, Trivia::LineComment("-- pick one\n".to_string()));
        trivia.push(1, Trivia::Whitespace(" ".to_string()));

        assert_eq!(tape.reconstruct(&trivia), "-- pick one\nSELECT 1;");
    }

    #[test]
    fn reconstruct_emits_trailing_trivia() {
        let mut tape = TokenTape::new();
        tape.push(Token::new(TokenKind::Keyword, span(0, 6), "SELECT"));

        let mut trivia = TriviaTable::new();
        // Index == tokens.len() holds the trivia after the last token.
        trivia.push(1, Trivia::Whitespace("\n".to_string()));
        trivia.push(1, Trivia::LineComment("-- done".to_string()));

        assert_eq!(tape.reconstruct(&trivia), "SELECT\n-- done");
    }

    #[test]
    fn reconstruct_keeps_order_of_multiple_trivia_before_one_token() {
        let mut tape = TokenTape::new();
        tape.push(Token::new(TokenKind::Keyword, span(0, 6), "SELECT"));
        tape.push(Token::new(TokenKind::NumericLiteral, span(17, 1), "1"));

        let mut trivia = TriviaTable::new();
        trivia.push(1, Trivia::Whitespace(" ".to_string()));
        trivia.push(1, Trivia::BlockComment("/* one */".to_string()));
        trivia.push(1, Trivia::Whitespace(" ".to_string()));

        assert_eq!(tape.reconstruct(&trivia), "SELECT /* one */ 1");
    }

    #[test]
    fn trivia_table_total_count() {
        let mut trivia = TriviaTable::new();
        trivia.push(0, Trivia::Whitespace(" ".to_string()));
        trivia.push(2, Trivia::Whitespace(" ".to_string()));
        trivia.push(2, Trivia::LineComment("-- x".to_string()));
        assert_eq!(trivia.total_count(), 3);
    }

    #[test]
    fn trivia_table_push_fills_gaps_with_empty_entries() {
        let mut trivia = TriviaTable::new();
        trivia.push(2, Trivia::Whitespace(" ".to_string()));

        assert_eq!(trivia.leading.len(), 3);
        assert!(trivia.get(0).is_empty());
        assert!(trivia.get(1).is_empty());
        assert_eq!(trivia.get(2), &[Trivia::Whitespace(" ".to_string())][..]);
    }

    #[test]
    fn trivia_table_get_out_of_bounds_returns_empty() {
        let trivia = TriviaTable::new();
        assert_eq!(trivia.get(999), &[] as &[Trivia]);
    }
}