Skip to main content

sochdb_query/sql/
token.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! SQL Token Types
19//!
20//! Comprehensive token set for SQL-92 with SochDB extensions.
21
22use std::fmt;
23use std::hash::Hash;
24
/// Source location of a token, used for error reporting.
///
/// `start`/`end` delimit the region in the input and `line`/`column` give the
/// human-readable position that goes with `start`. The offset unit (bytes vs.
/// characters) and whether `end` is inclusive are decided by whoever builds
/// the span; this type does not interpret them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Offset at which the span begins.
    pub start: usize,
    /// Offset at which the span ends.
    pub end: usize,
    /// Line associated with the beginning of the span.
    pub line: usize,
    /// Column associated with the beginning of the span.
    pub column: usize,
}

impl Span {
    /// Builds a span from its four components.
    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
        Self {
            start,
            end,
            line,
            column,
        }
    }

    /// Returns the smallest span that covers both `self` and `other`.
    ///
    /// The resulting `line`/`column` are taken from whichever input starts
    /// first, so they always describe the merged `start` regardless of the
    /// order in which the two spans are passed. When both start at the same
    /// offset, `self` wins.
    pub fn merge(self, other: Span) -> Span {
        // Pick the span that begins earlier; it owns the start position.
        let first = if other.start < self.start { other } else { self };
        Span {
            start: first.start,
            end: self.end.max(other.end),
            line: first.line,
            column: first.column,
        }
    }
}

impl Default for Span {
    /// An empty span at offset 0, positioned at line 1, column 1.
    fn default() -> Self {
        Self::new(0, 0, 1, 1)
    }
}
64
65/// SQL Token with location information
66#[derive(Debug, Clone, PartialEq)]
67pub struct Token {
68    pub kind: TokenKind,
69    pub span: Span,
70    pub literal: String,
71}
72
73impl Token {
74    pub fn new(kind: TokenKind, span: Span, literal: impl Into<String>) -> Self {
75        Self {
76            kind,
77            span,
78            literal: literal.into(),
79        }
80    }
81}
82
/// Token classification.
///
/// Literal, identifier, placeholder, comment and invalid tokens carry a
/// payload; keywords, operators and punctuation are unit variants.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals
    Integer(i64),
    Float(f64),
    String(String),
    Blob(Vec<u8>),
    Null,
    True,
    False,

    // Identifiers
    Identifier(String),
    QuotedIdentifier(String), // "column name" or `column name`

    // Keywords - DDL
    Create,
    Table,
    Index,
    Drop,
    Alter,
    Add,
    Column,
    Rename,
    Primary,
    Key,
    Foreign,
    References,
    Unique,
    Default,
    AutoIncrement,
    If,
    Exists,

    // Keywords - Conflict/Upsert (Dialect Support)
    Ignore,    // MySQL: INSERT IGNORE
    Replace,   // SQLite: INSERT OR REPLACE
    Conflict,  // PostgreSQL: ON CONFLICT
    Do,        // PostgreSQL: ON CONFLICT DO
    Nothing,   // PostgreSQL: DO NOTHING
    Duplicate, // MySQL: ON DUPLICATE KEY UPDATE
    Abort,     // SQLite: INSERT OR ABORT
    Fail,      // SQLite: INSERT OR FAIL
    Returning, // PostgreSQL/SQLite: RETURNING clause

    // Keywords - DML
    Select,
    Insert,
    Update,
    Delete,
    Into,
    Values,
    Set,
    From,
    Where,
    Join,
    Inner,
    Left,
    Right,
    Outer,
    Cross,
    On,
    Using,

    // Keywords - Clauses
    As,
    Distinct,
    All,
    Group,
    Having,
    Order,
    By,
    Asc,
    Desc,
    Nulls,
    First,
    Last,
    Limit,
    Offset,
    Union,
    Intersect,
    Except,

    // Keywords - Expressions
    And,
    Or,
    Not,
    Is,
    In,
    Like,
    Escape,
    Between,
    Case,
    When,
    Then,
    Else,
    End,
    Cast,
    Collate,

    // Keywords - Transactions
    Begin,
    Commit,
    Rollback,
    Transaction,
    Savepoint,
    Release,

    // Keywords - Types
    Int,
    IntegerKw,
    Bigint,
    Smallint,
    Tinyint,
    FloatKw,
    Double,
    Real,
    Decimal,
    Numeric,
    Varchar,
    Char,
    Text,
    BlobKw,
    Boolean,
    Bool,
    Date,
    Time,
    Timestamp,
    Datetime,

    // Keywords - Aggregates
    Count,
    Sum,
    Avg,
    Min,
    Max,

    // Keywords - SochDB Extensions
    Vector,
    VectorSearch,
    JsonExtract,
    JsonSet,
    ContextWindow,
    Embedding,
    Cosine,
    Euclidean,
    DotProduct,

    // Operators
    Plus,       // +
    Minus,      // -
    Star,       // *
    Slash,      // /
    Percent,    // %
    Eq,         // =
    Ne,         // != or <>
    Lt,         // <
    Le,         // <=
    Gt,         // >
    Ge,         // >=
    Concat,     // ||
    BitAnd,     // &
    BitOr,      // |
    BitNot,     // ~
    LeftShift,  // <<
    RightShift, // >>

    // Punctuation
    LParen,       // (
    RParen,       // )
    LBracket,     // [
    RBracket,     // ]
    Comma,        // ,
    Semicolon,    // ;
    Dot,          // .
    Colon,        // :
    DoubleColon,  // ::
    Arrow,        // ->
    DoubleArrow,  // ->>
    QuestionMark, // ?
    At,           // @

    // Special
    Placeholder(u32), // $1, $2, ... or ?
    Comment(String),
    Whitespace,
    Eof,
    Invalid(String),
}

// `Eq` cannot be derived because of the `f64` payload, so it is asserted by
// hand. NOTE(review): the derived `PartialEq` still makes `Float(NaN)` unequal
// to itself; this is only sound as long as NaN float tokens are never
// produced — confirm against the lexer.
impl Eq for TokenKind {}

impl Hash for TokenKind {
    /// Hashes the variant discriminant followed by the payload, if any.
    ///
    /// The hash is kept consistent with the derived `PartialEq`: values that
    /// compare equal always produce the same hash.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        std::mem::discriminant(self).hash(state);
        match self {
            TokenKind::Integer(n) => n.hash(state),
            TokenKind::Float(f) => {
                // `0.0 == -0.0` under `PartialEq`, yet their bit patterns
                // differ. Fold both zeros onto `+0.0` so equal tokens hash
                // equally; every other value hashes by its raw bits.
                let canonical = if *f == 0.0 { 0.0_f64 } else { *f };
                canonical.to_bits().hash(state)
            }
            TokenKind::Blob(b) => b.hash(state),
            TokenKind::String(s)
            | TokenKind::Identifier(s)
            | TokenKind::QuotedIdentifier(s)
            | TokenKind::Comment(s)
            | TokenKind::Invalid(s) => s.hash(state),
            TokenKind::Placeholder(n) => n.hash(state),
            // Unit variants are fully described by their discriminant.
            _ => {}
        }
    }
}
293
294impl TokenKind {
295    /// Check if this token is a keyword
296    pub fn is_keyword(&self) -> bool {
297        matches!(
298            self,
299            TokenKind::Select
300                | TokenKind::Insert
301                | TokenKind::Update
302                | TokenKind::Delete
303                | TokenKind::Create
304                | TokenKind::Drop
305                | TokenKind::From
306                | TokenKind::Where
307                | TokenKind::And
308                | TokenKind::Or
309                | TokenKind::Not
310                | TokenKind::Join
311                | TokenKind::Inner
312                | TokenKind::Left
313                | TokenKind::Right
314                | TokenKind::Outer
315                | TokenKind::Cross
316                | TokenKind::On
317                | TokenKind::As
318                | TokenKind::Distinct
319                | TokenKind::All
320                | TokenKind::Group
321                | TokenKind::Having
322                | TokenKind::Order
323                | TokenKind::By
324                | TokenKind::Asc
325                | TokenKind::Desc
326                | TokenKind::Limit
327                | TokenKind::Offset
328                | TokenKind::Values
329                | TokenKind::Into
330                | TokenKind::Set
331                | TokenKind::Begin
332                | TokenKind::Commit
333                | TokenKind::Rollback
334                | TokenKind::Table
335                | TokenKind::Index
336                | TokenKind::Alter
337                | TokenKind::Primary
338                | TokenKind::Key
339                | TokenKind::Foreign
340                | TokenKind::References
341                | TokenKind::Unique
342                | TokenKind::Default
343                | TokenKind::If
344                | TokenKind::Exists
345                | TokenKind::Case
346                | TokenKind::When
347                | TokenKind::Then
348                | TokenKind::Else
349                | TokenKind::End
350                | TokenKind::Cast
351                | TokenKind::Union
352                | TokenKind::Intersect
353                | TokenKind::Except
354                | TokenKind::Count
355                | TokenKind::Sum
356                | TokenKind::Avg
357                | TokenKind::Min
358                | TokenKind::Max
359                | TokenKind::Is
360                | TokenKind::In
361                | TokenKind::Like
362                | TokenKind::Between
363                | TokenKind::Null
364                | TokenKind::True
365                | TokenKind::False
366                | TokenKind::Int
367                | TokenKind::IntegerKw
368                | TokenKind::Bigint
369                | TokenKind::Smallint
370                | TokenKind::FloatKw
371                | TokenKind::Double
372                | TokenKind::Real
373                | TokenKind::Varchar
374                | TokenKind::Char
375                | TokenKind::Text
376                | TokenKind::BlobKw
377                | TokenKind::Boolean
378                | TokenKind::Bool
379                | TokenKind::Date
380                | TokenKind::Time
381                | TokenKind::Timestamp
382                | TokenKind::Datetime
383                | TokenKind::Vector
384                | TokenKind::VectorSearch
385                | TokenKind::Embedding
386                | TokenKind::Cosine
387                | TokenKind::Euclidean
388                | TokenKind::DotProduct
389                | TokenKind::ContextWindow
390                | TokenKind::Using
391                | TokenKind::Transaction
392                | TokenKind::Savepoint
393                | TokenKind::Release
394                | TokenKind::Escape
395                | TokenKind::Nulls
396                | TokenKind::First
397                | TokenKind::Last
398                | TokenKind::AutoIncrement
399                | TokenKind::Add
400                | TokenKind::Column
401                | TokenKind::Rename
402                | TokenKind::Collate
403                | TokenKind::Tinyint
404                | TokenKind::Decimal
405                | TokenKind::Numeric
406                | TokenKind::JsonExtract
407                | TokenKind::JsonSet
408                // Conflict/Upsert keywords
409                | TokenKind::Ignore
410                | TokenKind::Replace
411                | TokenKind::Conflict
412                | TokenKind::Do
413                | TokenKind::Nothing
414                | TokenKind::Duplicate
415                | TokenKind::Abort
416                | TokenKind::Fail
417                | TokenKind::Returning
418        )
419    }
420
421    /// Get keyword from string (case-insensitive)
422    pub fn from_keyword(s: &str) -> Option<TokenKind> {
423        match s.to_uppercase().as_str() {
424            "SELECT" => Some(TokenKind::Select),
425            "INSERT" => Some(TokenKind::Insert),
426            "UPDATE" => Some(TokenKind::Update),
427            "DELETE" => Some(TokenKind::Delete),
428            "CREATE" => Some(TokenKind::Create),
429            "TABLE" => Some(TokenKind::Table),
430            "DROP" => Some(TokenKind::Drop),
431            "ALTER" => Some(TokenKind::Alter),
432            "ADD" => Some(TokenKind::Add),
433            "COLUMN" => Some(TokenKind::Column),
434            "RENAME" => Some(TokenKind::Rename),
435            "INDEX" => Some(TokenKind::Index),
436            "FROM" => Some(TokenKind::From),
437            "WHERE" => Some(TokenKind::Where),
438            "AND" => Some(TokenKind::And),
439            "OR" => Some(TokenKind::Or),
440            "NOT" => Some(TokenKind::Not),
441            "NULL" => Some(TokenKind::Null),
442            "TRUE" => Some(TokenKind::True),
443            "FALSE" => Some(TokenKind::False),
444            "IS" => Some(TokenKind::Is),
445            "IN" => Some(TokenKind::In),
446            "LIKE" => Some(TokenKind::Like),
447            "ESCAPE" => Some(TokenKind::Escape),
448            "BETWEEN" => Some(TokenKind::Between),
449            "JOIN" => Some(TokenKind::Join),
450            "INNER" => Some(TokenKind::Inner),
451            "LEFT" => Some(TokenKind::Left),
452            "RIGHT" => Some(TokenKind::Right),
453            "OUTER" => Some(TokenKind::Outer),
454            "CROSS" => Some(TokenKind::Cross),
455            "ON" => Some(TokenKind::On),
456            "USING" => Some(TokenKind::Using),
457            "AS" => Some(TokenKind::As),
458            "DISTINCT" => Some(TokenKind::Distinct),
459            "ALL" => Some(TokenKind::All),
460            "GROUP" => Some(TokenKind::Group),
461            "HAVING" => Some(TokenKind::Having),
462            "ORDER" => Some(TokenKind::Order),
463            "BY" => Some(TokenKind::By),
464            "ASC" => Some(TokenKind::Asc),
465            "DESC" => Some(TokenKind::Desc),
466            "NULLS" => Some(TokenKind::Nulls),
467            "FIRST" => Some(TokenKind::First),
468            "LAST" => Some(TokenKind::Last),
469            "LIMIT" => Some(TokenKind::Limit),
470            "OFFSET" => Some(TokenKind::Offset),
471            "VALUES" => Some(TokenKind::Values),
472            "INTO" => Some(TokenKind::Into),
473            "SET" => Some(TokenKind::Set),
474            "BEGIN" => Some(TokenKind::Begin),
475            "COMMIT" => Some(TokenKind::Commit),
476            "ROLLBACK" => Some(TokenKind::Rollback),
477            "TRANSACTION" => Some(TokenKind::Transaction),
478            "SAVEPOINT" => Some(TokenKind::Savepoint),
479            "RELEASE" => Some(TokenKind::Release),
480            "PRIMARY" => Some(TokenKind::Primary),
481            "KEY" => Some(TokenKind::Key),
482            "FOREIGN" => Some(TokenKind::Foreign),
483            "REFERENCES" => Some(TokenKind::References),
484            "UNIQUE" => Some(TokenKind::Unique),
485            "DEFAULT" => Some(TokenKind::Default),
486            "AUTOINCREMENT" | "AUTO_INCREMENT" => Some(TokenKind::AutoIncrement),
487            "IF" => Some(TokenKind::If),
488            "EXISTS" => Some(TokenKind::Exists),
489            "CASE" => Some(TokenKind::Case),
490            "WHEN" => Some(TokenKind::When),
491            "THEN" => Some(TokenKind::Then),
492            "ELSE" => Some(TokenKind::Else),
493            "END" => Some(TokenKind::End),
494            "CAST" => Some(TokenKind::Cast),
495            "COLLATE" => Some(TokenKind::Collate),
496            "UNION" => Some(TokenKind::Union),
497            "INTERSECT" => Some(TokenKind::Intersect),
498            "EXCEPT" => Some(TokenKind::Except),
499            "COUNT" => Some(TokenKind::Count),
500            "SUM" => Some(TokenKind::Sum),
501            "AVG" => Some(TokenKind::Avg),
502            "MIN" => Some(TokenKind::Min),
503            "MAX" => Some(TokenKind::Max),
504            // Conflict/Upsert keywords
505            "IGNORE" => Some(TokenKind::Ignore),
506            "REPLACE" => Some(TokenKind::Replace),
507            "CONFLICT" => Some(TokenKind::Conflict),
508            "DO" => Some(TokenKind::Do),
509            "NOTHING" => Some(TokenKind::Nothing),
510            "DUPLICATE" => Some(TokenKind::Duplicate),
511            "ABORT" => Some(TokenKind::Abort),
512            "FAIL" => Some(TokenKind::Fail),
513            "RETURNING" => Some(TokenKind::Returning),
514            // Types
515            "INT" => Some(TokenKind::Int),
516            "INTEGER" => Some(TokenKind::IntegerKw),
517            "BIGINT" => Some(TokenKind::Bigint),
518            "SMALLINT" => Some(TokenKind::Smallint),
519            "TINYINT" => Some(TokenKind::Tinyint),
520            "FLOAT" => Some(TokenKind::FloatKw),
521            "DOUBLE" => Some(TokenKind::Double),
522            "REAL" => Some(TokenKind::Real),
523            "DECIMAL" => Some(TokenKind::Decimal),
524            "NUMERIC" => Some(TokenKind::Numeric),
525            "VARCHAR" => Some(TokenKind::Varchar),
526            "CHAR" => Some(TokenKind::Char),
527            "TEXT" => Some(TokenKind::Text),
528            "BLOB" => Some(TokenKind::BlobKw),
529            "BOOLEAN" => Some(TokenKind::Boolean),
530            "BOOL" => Some(TokenKind::Bool),
531            "DATE" => Some(TokenKind::Date),
532            "TIME" => Some(TokenKind::Time),
533            "TIMESTAMP" => Some(TokenKind::Timestamp),
534            "DATETIME" => Some(TokenKind::Datetime),
535            // SochDB Extensions
536            "VECTOR" => Some(TokenKind::Vector),
537            "VECTOR_SEARCH" => Some(TokenKind::VectorSearch),
538            "JSON_EXTRACT" => Some(TokenKind::JsonExtract),
539            "JSON_SET" => Some(TokenKind::JsonSet),
540            "CONTEXT_WINDOW" => Some(TokenKind::ContextWindow),
541            "EMBEDDING" => Some(TokenKind::Embedding),
542            "COSINE" => Some(TokenKind::Cosine),
543            "EUCLIDEAN" => Some(TokenKind::Euclidean),
544            "DOT_PRODUCT" => Some(TokenKind::DotProduct),
545            _ => None,
546        }
547    }
548}
549
550impl fmt::Display for TokenKind {
551    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
552        match self {
553            TokenKind::Integer(n) => write!(f, "{}", n),
554            TokenKind::Float(n) => write!(f, "{}", n),
555            TokenKind::String(s) => write!(f, "'{}'", s),
556            TokenKind::Identifier(s) => write!(f, "{}", s),
557            TokenKind::QuotedIdentifier(s) => write!(f, "\"{}\"", s),
558            TokenKind::Select => write!(f, "SELECT"),
559            TokenKind::From => write!(f, "FROM"),
560            TokenKind::Where => write!(f, "WHERE"),
561            TokenKind::Plus => write!(f, "+"),
562            TokenKind::Minus => write!(f, "-"),
563            TokenKind::Star => write!(f, "*"),
564            TokenKind::Slash => write!(f, "/"),
565            TokenKind::Eq => write!(f, "="),
566            TokenKind::Ne => write!(f, "!="),
567            TokenKind::Lt => write!(f, "<"),
568            TokenKind::Le => write!(f, "<="),
569            TokenKind::Gt => write!(f, ">"),
570            TokenKind::Ge => write!(f, ">="),
571            TokenKind::LParen => write!(f, "("),
572            TokenKind::RParen => write!(f, ")"),
573            TokenKind::LBracket => write!(f, "["),
574            TokenKind::RBracket => write!(f, "]"),
575            TokenKind::Comma => write!(f, ","),
576            TokenKind::Semicolon => write!(f, ";"),
577            TokenKind::Dot => write!(f, "."),
578            TokenKind::Eof => write!(f, "EOF"),
579            TokenKind::Null => write!(f, "NULL"),
580            TokenKind::True => write!(f, "TRUE"),
581            TokenKind::False => write!(f, "FALSE"),
582            TokenKind::And => write!(f, "AND"),
583            TokenKind::Or => write!(f, "OR"),
584            TokenKind::Not => write!(f, "NOT"),
585            _ => write!(f, "{:?}", self),
586        }
587    }
588}