Skip to main content

sochdb_query/sql/
token.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! SQL Token Types
19//!
20//! Comprehensive token set for SQL-92 with SochDB extensions.
21
22use std::borrow::Cow;
23use std::fmt;
24use std::hash::Hash;
25
/// A location in the SQL source, attached to tokens for error reporting.
///
/// All four components are stored exactly as supplied; this type performs no
/// validation of its own.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Offset at which the region begins.
    pub start: usize,
    /// Offset at which the region ends.
    pub end: usize,
    /// Line number of the region (`Span::default()` uses `1`).
    pub line: usize,
    /// Column number of the region (`Span::default()` uses `1`).
    pub column: usize,
}
34
35impl Span {
36    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
37        Self {
38            start,
39            end,
40            line,
41            column,
42        }
43    }
44
45    pub fn merge(self, other: Span) -> Span {
46        Span {
47            start: self.start.min(other.start),
48            end: self.end.max(other.end),
49            line: self.line,
50            column: self.column,
51        }
52    }
53}
54
55impl Default for Span {
56    fn default() -> Self {
57        Self {
58            start: 0,
59            end: 0,
60            line: 1,
61            column: 1,
62        }
63    }
64}
65
66/// SQL Token with location information
67#[derive(Debug, Clone, PartialEq)]
68pub struct Token<'a> {
69    pub kind: TokenKind<'a>,
70    pub span: Span,
71    pub literal: &'a str,
72}
73
74impl<'a> Token<'a> {
75    pub fn new(kind: TokenKind<'a>, span: Span, literal: &'a str) -> Self {
76        Self {
77            kind,
78            span,
79            literal,
80        }
81    }
82}
83
/// Classification of a SQL token, carrying a payload where the token has one.
///
/// Variants holding `&'a str` or `Cow<'a, str>` can borrow for the lifetime `'a`.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind<'a> {
    // ---- Literals ----
    /// Integer literal.
    Integer(i64),
    /// Floating-point literal.
    Float(f64),
    /// String literal.
    String(Cow<'a, str>),
    /// Binary literal.
    Blob(Vec<u8>),
    Null,
    True,
    False,

    // ---- Identifiers ----
    /// Bare identifier.
    Identifier(&'a str),
    /// Quoted identifier: `"column name"` or a backtick-quoted `column name`.
    QuotedIdentifier(Cow<'a, str>),

    // ---- Keywords: DDL ----
    Create,
    Table,
    Index,
    Drop,
    Alter,
    Add,
    Column,
    Rename,
    To,
    Cascade,
    Primary,
    Key,
    Foreign,
    References,
    Unique,
    Default,
    AutoIncrement,
    If,
    Exists,

    // ---- Keywords: conflict / upsert (dialect support) ----
    /// MySQL: `INSERT IGNORE`
    Ignore,
    /// SQLite: `INSERT OR REPLACE`
    Replace,
    /// PostgreSQL: `ON CONFLICT`
    Conflict,
    /// PostgreSQL: `ON CONFLICT DO`
    Do,
    /// PostgreSQL: `DO NOTHING`
    Nothing,
    /// MySQL: `ON DUPLICATE KEY UPDATE`
    Duplicate,
    /// SQLite: `INSERT OR ABORT`
    Abort,
    /// SQLite: `INSERT OR FAIL`
    Fail,
    /// PostgreSQL/SQLite: `RETURNING` clause
    Returning,

    // ---- Keywords: DML ----
    Select,
    Insert,
    Update,
    Delete,
    Into,
    Values,
    Set,
    From,
    Where,
    Join,
    Inner,
    Left,
    Right,
    Outer,
    Cross,
    On,
    Using,

    // ---- Keywords: clauses ----
    As,
    Distinct,
    All,
    Group,
    Having,
    Order,
    By,
    Asc,
    Desc,
    Nulls,
    First,
    Last,
    Limit,
    Offset,
    Union,
    Intersect,
    Except,

    // ---- Keywords: expressions ----
    And,
    Or,
    Not,
    Is,
    In,
    Like,
    Escape,
    Between,
    Case,
    When,
    Then,
    Else,
    End,
    Cast,
    Collate,

    // ---- Keywords: transactions ----
    Begin,
    Commit,
    Rollback,
    Transaction,
    Savepoint,
    Release,

    // ---- Keywords: types ----
    Int,
    IntegerKw,
    Bigint,
    Smallint,
    Tinyint,
    FloatKw,
    Double,
    Real,
    Decimal,
    Numeric,
    Varchar,
    Char,
    Text,
    BlobKw,
    Boolean,
    Bool,
    Date,
    Time,
    Timestamp,
    Datetime,

    // ---- Keywords: aggregates ----
    Count,
    Sum,
    Avg,
    Min,
    Max,

    // ---- Keywords: SochDB extensions ----
    Vector,
    VectorSearch,
    JsonExtract,
    JsonSet,
    ContextWindow,
    Embedding,
    Cosine,
    Euclidean,
    DotProduct,

    // ---- Keywords: graph & real-time (P1) ----
    Relate,
    Live,
    Content,
    Event,
    Diff,

    // ---- Keywords: security DDL (P2, scope-based auth) ----
    Define,
    Scope,
    Remove,
    Session,
    Signin,
    Signup,
    Permissions,
    For,

    // ---- Operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `=`
    Eq,
    /// `!=` or `<>`
    Ne,
    /// `<`
    Lt,
    /// `<=`
    Le,
    /// `>`
    Gt,
    /// `>=`
    Ge,
    /// `||`
    Concat,
    /// `&`
    BitAnd,
    /// `|`
    BitOr,
    /// `~`
    BitNot,
    /// `<<`
    LeftShift,
    /// `>>`
    RightShift,

    // ---- Punctuation ----
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `->`
    Arrow,
    /// `->>`
    DoubleArrow,
    /// `<-`
    LeftArrow,
    /// `<->`
    BiArrow,
    /// `?`
    QuestionMark,
    /// `@`
    At,

    // ---- Special ----
    /// Parameter placeholder: `$1`, `$2`, ... or `?`
    Placeholder(u32),
    /// Comment text.
    Comment(&'a str),
    Whitespace,
    Eof,
    /// Text that could not be classified as any other kind.
    Invalid(&'a str),
}
295
296impl Eq for TokenKind<'_> {}
297
298impl Hash for TokenKind<'_> {
299    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
300        std::mem::discriminant(self).hash(state);
301        match self {
302            TokenKind::Integer(n) => n.hash(state),
303            TokenKind::Float(f) => f.to_bits().hash(state),
304            TokenKind::String(s) => s.hash(state),
305            TokenKind::Blob(b) => b.hash(state),
306            TokenKind::Identifier(s) => s.hash(state),
307            TokenKind::QuotedIdentifier(s) => s.hash(state),
308            TokenKind::Placeholder(n) => n.hash(state),
309            TokenKind::Comment(s) => s.hash(state),
310            TokenKind::Invalid(s) => s.hash(state),
311            _ => {}
312        }
313    }
314}
315
316impl<'a> TokenKind<'a> {
317    /// Check if this token is a keyword
318    pub fn is_keyword(&self) -> bool {
319        matches!(
320            self,
321            TokenKind::Select
322                | TokenKind::Insert
323                | TokenKind::Update
324                | TokenKind::Delete
325                | TokenKind::Create
326                | TokenKind::Drop
327                | TokenKind::From
328                | TokenKind::Where
329                | TokenKind::And
330                | TokenKind::Or
331                | TokenKind::Not
332                | TokenKind::Join
333                | TokenKind::Inner
334                | TokenKind::Left
335                | TokenKind::Right
336                | TokenKind::Outer
337                | TokenKind::Cross
338                | TokenKind::On
339                | TokenKind::As
340                | TokenKind::Distinct
341                | TokenKind::All
342                | TokenKind::Group
343                | TokenKind::Having
344                | TokenKind::Order
345                | TokenKind::By
346                | TokenKind::Asc
347                | TokenKind::Desc
348                | TokenKind::Limit
349                | TokenKind::Offset
350                | TokenKind::Values
351                | TokenKind::Into
352                | TokenKind::Set
353                | TokenKind::Begin
354                | TokenKind::Commit
355                | TokenKind::Rollback
356                | TokenKind::Table
357                | TokenKind::Index
358                | TokenKind::Alter
359                | TokenKind::To
360                | TokenKind::Cascade
361                | TokenKind::Primary
362                | TokenKind::Key
363                | TokenKind::Foreign
364                | TokenKind::References
365                | TokenKind::Unique
366                | TokenKind::Default
367                | TokenKind::If
368                | TokenKind::Exists
369                | TokenKind::Case
370                | TokenKind::When
371                | TokenKind::Then
372                | TokenKind::Else
373                | TokenKind::End
374                | TokenKind::Cast
375                | TokenKind::Union
376                | TokenKind::Intersect
377                | TokenKind::Except
378                | TokenKind::Count
379                | TokenKind::Sum
380                | TokenKind::Avg
381                | TokenKind::Min
382                | TokenKind::Max
383                | TokenKind::Is
384                | TokenKind::In
385                | TokenKind::Like
386                | TokenKind::Between
387                | TokenKind::Null
388                | TokenKind::True
389                | TokenKind::False
390                | TokenKind::Int
391                | TokenKind::IntegerKw
392                | TokenKind::Bigint
393                | TokenKind::Smallint
394                | TokenKind::FloatKw
395                | TokenKind::Double
396                | TokenKind::Real
397                | TokenKind::Varchar
398                | TokenKind::Char
399                | TokenKind::Text
400                | TokenKind::BlobKw
401                | TokenKind::Boolean
402                | TokenKind::Bool
403                | TokenKind::Date
404                | TokenKind::Time
405                | TokenKind::Timestamp
406                | TokenKind::Datetime
407                | TokenKind::Vector
408                | TokenKind::VectorSearch
409                | TokenKind::Embedding
410                | TokenKind::Cosine
411                | TokenKind::Euclidean
412                | TokenKind::DotProduct
413                | TokenKind::ContextWindow
414                | TokenKind::Using
415                | TokenKind::Transaction
416                | TokenKind::Savepoint
417                | TokenKind::Release
418                | TokenKind::Escape
419                | TokenKind::Nulls
420                | TokenKind::First
421                | TokenKind::Last
422                | TokenKind::AutoIncrement
423                | TokenKind::Add
424                | TokenKind::Column
425                | TokenKind::Rename
426                | TokenKind::Collate
427                | TokenKind::Tinyint
428                | TokenKind::Decimal
429                | TokenKind::Numeric
430                | TokenKind::JsonExtract
431                | TokenKind::JsonSet
432                // Conflict/Upsert keywords
433                | TokenKind::Ignore
434                | TokenKind::Replace
435                | TokenKind::Conflict
436                | TokenKind::Do
437                | TokenKind::Nothing
438                | TokenKind::Duplicate
439                | TokenKind::Abort
440                | TokenKind::Fail
441                | TokenKind::Returning
442                // Graph & Real-Time keywords
443                | TokenKind::Relate
444                | TokenKind::Live
445                | TokenKind::Content
446                | TokenKind::Event
447                | TokenKind::Diff
448        )
449    }
450
451    /// Get keyword from string (case-insensitive)
452    /// Uses a stack-allocated buffer to avoid heap allocation.
453    pub fn from_keyword(s: &str) -> Option<TokenKind<'a>> {
454        let len = s.len();
455        if len == 0 || len > 20 {
456            return None;
457        }
458        let mut buf = [0u8; 20];
459        for (i, &b) in s.as_bytes().iter().enumerate() {
460            buf[i] = b.to_ascii_uppercase();
461        }
462        // SAFETY: to_ascii_uppercase on valid ASCII bytes preserves UTF-8 validity.
463        // scan_identifier only accepts ASCII chars, so this is always sound.
464        let upper = unsafe { std::str::from_utf8_unchecked(&buf[..len]) };
465        match upper {
466            "SELECT" => Some(TokenKind::Select),
467            "INSERT" => Some(TokenKind::Insert),
468            "UPDATE" => Some(TokenKind::Update),
469            "DELETE" => Some(TokenKind::Delete),
470            "CREATE" => Some(TokenKind::Create),
471            "TABLE" => Some(TokenKind::Table),
472            "DROP" => Some(TokenKind::Drop),
473            "ALTER" => Some(TokenKind::Alter),
474            "ADD" => Some(TokenKind::Add),
475            "COLUMN" => Some(TokenKind::Column),
476            "RENAME" => Some(TokenKind::Rename),
477            "TO" => Some(TokenKind::To),
478            "CASCADE" => Some(TokenKind::Cascade),
479            "INDEX" => Some(TokenKind::Index),
480            "FROM" => Some(TokenKind::From),
481            "WHERE" => Some(TokenKind::Where),
482            "AND" => Some(TokenKind::And),
483            "OR" => Some(TokenKind::Or),
484            "NOT" => Some(TokenKind::Not),
485            "NULL" => Some(TokenKind::Null),
486            "TRUE" => Some(TokenKind::True),
487            "FALSE" => Some(TokenKind::False),
488            "IS" => Some(TokenKind::Is),
489            "IN" => Some(TokenKind::In),
490            "LIKE" => Some(TokenKind::Like),
491            "ESCAPE" => Some(TokenKind::Escape),
492            "BETWEEN" => Some(TokenKind::Between),
493            "JOIN" => Some(TokenKind::Join),
494            "INNER" => Some(TokenKind::Inner),
495            "LEFT" => Some(TokenKind::Left),
496            "RIGHT" => Some(TokenKind::Right),
497            "OUTER" => Some(TokenKind::Outer),
498            "CROSS" => Some(TokenKind::Cross),
499            "ON" => Some(TokenKind::On),
500            "USING" => Some(TokenKind::Using),
501            "AS" => Some(TokenKind::As),
502            "DISTINCT" => Some(TokenKind::Distinct),
503            "ALL" => Some(TokenKind::All),
504            "GROUP" => Some(TokenKind::Group),
505            "HAVING" => Some(TokenKind::Having),
506            "ORDER" => Some(TokenKind::Order),
507            "BY" => Some(TokenKind::By),
508            "ASC" => Some(TokenKind::Asc),
509            "DESC" => Some(TokenKind::Desc),
510            "NULLS" => Some(TokenKind::Nulls),
511            "FIRST" => Some(TokenKind::First),
512            "LAST" => Some(TokenKind::Last),
513            "LIMIT" => Some(TokenKind::Limit),
514            "OFFSET" => Some(TokenKind::Offset),
515            "VALUES" => Some(TokenKind::Values),
516            "INTO" => Some(TokenKind::Into),
517            "SET" => Some(TokenKind::Set),
518            "BEGIN" => Some(TokenKind::Begin),
519            "COMMIT" => Some(TokenKind::Commit),
520            "ROLLBACK" => Some(TokenKind::Rollback),
521            "TRANSACTION" => Some(TokenKind::Transaction),
522            "SAVEPOINT" => Some(TokenKind::Savepoint),
523            "RELEASE" => Some(TokenKind::Release),
524            "PRIMARY" => Some(TokenKind::Primary),
525            "KEY" => Some(TokenKind::Key),
526            "FOREIGN" => Some(TokenKind::Foreign),
527            "REFERENCES" => Some(TokenKind::References),
528            "UNIQUE" => Some(TokenKind::Unique),
529            "DEFAULT" => Some(TokenKind::Default),
530            "AUTOINCREMENT" | "AUTO_INCREMENT" => Some(TokenKind::AutoIncrement),
531            "IF" => Some(TokenKind::If),
532            "EXISTS" => Some(TokenKind::Exists),
533            "CASE" => Some(TokenKind::Case),
534            "WHEN" => Some(TokenKind::When),
535            "THEN" => Some(TokenKind::Then),
536            "ELSE" => Some(TokenKind::Else),
537            "END" => Some(TokenKind::End),
538            "CAST" => Some(TokenKind::Cast),
539            "COLLATE" => Some(TokenKind::Collate),
540            "UNION" => Some(TokenKind::Union),
541            "INTERSECT" => Some(TokenKind::Intersect),
542            "EXCEPT" => Some(TokenKind::Except),
543            "COUNT" => Some(TokenKind::Count),
544            "SUM" => Some(TokenKind::Sum),
545            "AVG" => Some(TokenKind::Avg),
546            "MIN" => Some(TokenKind::Min),
547            "MAX" => Some(TokenKind::Max),
548            // Conflict/Upsert keywords
549            "IGNORE" => Some(TokenKind::Ignore),
550            "REPLACE" => Some(TokenKind::Replace),
551            "CONFLICT" => Some(TokenKind::Conflict),
552            "DO" => Some(TokenKind::Do),
553            "NOTHING" => Some(TokenKind::Nothing),
554            "DUPLICATE" => Some(TokenKind::Duplicate),
555            "ABORT" => Some(TokenKind::Abort),
556            "FAIL" => Some(TokenKind::Fail),
557            "RETURNING" => Some(TokenKind::Returning),
558            // Types
559            "INT" => Some(TokenKind::Int),
560            "INTEGER" => Some(TokenKind::IntegerKw),
561            "BIGINT" => Some(TokenKind::Bigint),
562            "SMALLINT" => Some(TokenKind::Smallint),
563            "TINYINT" => Some(TokenKind::Tinyint),
564            "FLOAT" => Some(TokenKind::FloatKw),
565            "DOUBLE" => Some(TokenKind::Double),
566            "REAL" => Some(TokenKind::Real),
567            "DECIMAL" => Some(TokenKind::Decimal),
568            "NUMERIC" => Some(TokenKind::Numeric),
569            "VARCHAR" => Some(TokenKind::Varchar),
570            "CHAR" => Some(TokenKind::Char),
571            "TEXT" => Some(TokenKind::Text),
572            "BLOB" => Some(TokenKind::BlobKw),
573            "BOOLEAN" => Some(TokenKind::Boolean),
574            "BOOL" => Some(TokenKind::Bool),
575            "DATE" => Some(TokenKind::Date),
576            "TIME" => Some(TokenKind::Time),
577            "TIMESTAMP" => Some(TokenKind::Timestamp),
578            "DATETIME" => Some(TokenKind::Datetime),
579            // SochDB Extensions
580            "VECTOR" => Some(TokenKind::Vector),
581            "VECTOR_SEARCH" => Some(TokenKind::VectorSearch),
582            "JSON_EXTRACT" => Some(TokenKind::JsonExtract),
583            "JSON_SET" => Some(TokenKind::JsonSet),
584            "CONTEXT_WINDOW" => Some(TokenKind::ContextWindow),
585            "EMBEDDING" => Some(TokenKind::Embedding),
586            "COSINE" => Some(TokenKind::Cosine),
587            "EUCLIDEAN" => Some(TokenKind::Euclidean),
588            "DOT_PRODUCT" => Some(TokenKind::DotProduct),
589            // Graph & Real-Time
590            "RELATE" => Some(TokenKind::Relate),
591            "LIVE" => Some(TokenKind::Live),
592            "CONTENT" => Some(TokenKind::Content),
593            "EVENT" => Some(TokenKind::Event),
594            "DIFF" => Some(TokenKind::Diff),
595            // Security DDL
596            "DEFINE" => Some(TokenKind::Define),
597            "SCOPE" => Some(TokenKind::Scope),
598            "REMOVE" => Some(TokenKind::Remove),
599            "SESSION" => Some(TokenKind::Session),
600            "SIGNIN" => Some(TokenKind::Signin),
601            "SIGNUP" => Some(TokenKind::Signup),
602            "PERMISSIONS" => Some(TokenKind::Permissions),
603            "FOR" => Some(TokenKind::For),
604            _ => None,
605        }
606    }
607}
608
609impl fmt::Display for TokenKind<'_> {
610    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
611        match self {
612            TokenKind::Integer(n) => write!(f, "{}", n),
613            TokenKind::Float(n) => write!(f, "{}", n),
614            TokenKind::String(s) => write!(f, "'{}'", s),
615            TokenKind::Identifier(s) => write!(f, "{}", s),
616            TokenKind::QuotedIdentifier(s) => write!(f, "\"{}\"", s),
617            TokenKind::Select => write!(f, "SELECT"),
618            TokenKind::From => write!(f, "FROM"),
619            TokenKind::Where => write!(f, "WHERE"),
620            TokenKind::Plus => write!(f, "+"),
621            TokenKind::Minus => write!(f, "-"),
622            TokenKind::Star => write!(f, "*"),
623            TokenKind::Slash => write!(f, "/"),
624            TokenKind::Eq => write!(f, "="),
625            TokenKind::Ne => write!(f, "!="),
626            TokenKind::Lt => write!(f, "<"),
627            TokenKind::Le => write!(f, "<="),
628            TokenKind::Gt => write!(f, ">"),
629            TokenKind::Ge => write!(f, ">="),
630            TokenKind::LParen => write!(f, "("),
631            TokenKind::RParen => write!(f, ")"),
632            TokenKind::LBracket => write!(f, "["),
633            TokenKind::RBracket => write!(f, "]"),
634            TokenKind::Comma => write!(f, ","),
635            TokenKind::Semicolon => write!(f, ";"),
636            TokenKind::Dot => write!(f, "."),
637            TokenKind::Eof => write!(f, "EOF"),
638            TokenKind::Null => write!(f, "NULL"),
639            TokenKind::True => write!(f, "TRUE"),
640            TokenKind::False => write!(f, "FALSE"),
641            TokenKind::And => write!(f, "AND"),
642            TokenKind::Or => write!(f, "OR"),
643            TokenKind::Not => write!(f, "NOT"),
644            _ => write!(f, "{:?}", self),
645        }
646    }
647}