sochdb_query/sql/
token.rs

1// Copyright 2025 Sushanth (https://github.com/sushanthpy)
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! SQL Token Types
16//!
17//! Comprehensive token set for SQL-92 with SochDB extensions.
18
19use std::fmt;
20use std::hash::Hash;
21
/// Source location attached to tokens for error reporting.
///
/// NOTE(review): `start`/`end` look like offsets into the source text and
/// `line`/`column` like the 1-based position of `start` (the default span sits
/// at line 1, column 1) — confirm against the lexer that produces these.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
    pub line: usize,
    pub column: usize,
}

impl Span {
    /// Creates a span from its four components, stored exactly as given.
    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
        Self {
            start,
            end,
            line,
            column,
        }
    }

    /// Returns the smallest span covering both `self` and `other`.
    ///
    /// The result starts at the smaller `start` and ends at the larger `end`.
    /// `line` and `column` are taken from whichever input supplies that
    /// `start`, so the reported position always matches the merged offset
    /// regardless of argument order. When both spans start at the same
    /// offset, `self`'s position is kept.
    pub fn merge(self, other: Span) -> Span {
        // The position must travel with `start`; otherwise merging with an
        // earlier span would report the later span's line/column.
        let (line, column) = if other.start < self.start {
            (other.line, other.column)
        } else {
            (self.line, self.column)
        };

        Span {
            start: self.start.min(other.start),
            end: self.end.max(other.end),
            line,
            column,
        }
    }
}

impl Default for Span {
    /// An empty span at offset 0, positioned at line 1, column 1.
    fn default() -> Self {
        Self::new(0, 0, 1, 1)
    }
}
61
62/// SQL Token with location information
63#[derive(Debug, Clone, PartialEq)]
64pub struct Token {
65    pub kind: TokenKind,
66    pub span: Span,
67    pub literal: String,
68}
69
70impl Token {
71    pub fn new(kind: TokenKind, span: Span, literal: impl Into<String>) -> Self {
72        Self {
73            kind,
74            span,
75            literal: literal.into(),
76        }
77    }
78}
79
/// Token classification.
///
/// Literal, identifier, placeholder, comment and invalid tokens carry their
/// value as a payload; every keyword, operator and punctuation mark is a unit
/// variant.
///
/// Equality is the derived `PartialEq`, so floats compare with IEEE-754
/// semantics: `Float(0.0) == Float(-0.0)`, while `Float(NaN)` is not equal to
/// itself. NOTE(review): the latter does not satisfy the reflexivity that the
/// `Eq` impl below promises — presumably the lexer never produces NaN
/// literals; confirm before using NaN-carrying tokens as map/set keys.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals
    Integer(i64),
    Float(f64),
    String(String),
    Blob(Vec<u8>),
    Null,
    True,
    False,

    // Identifiers
    Identifier(String),
    QuotedIdentifier(String), // "column name" or `column name`

    // Keywords - DDL
    Create,
    Table,
    Index,
    Drop,
    Alter,
    Add,
    Column,
    Rename,
    Primary,
    Key,
    Foreign,
    References,
    Unique,
    Default,
    AutoIncrement,
    If,
    Exists,

    // Keywords - Conflict/Upsert (Dialect Support)
    Ignore,    // MySQL: INSERT IGNORE
    Replace,   // SQLite: INSERT OR REPLACE
    Conflict,  // PostgreSQL: ON CONFLICT
    Do,        // PostgreSQL: ON CONFLICT DO
    Nothing,   // PostgreSQL: DO NOTHING
    Duplicate, // MySQL: ON DUPLICATE KEY UPDATE
    Abort,     // SQLite: INSERT OR ABORT
    Fail,      // SQLite: INSERT OR FAIL
    Returning, // PostgreSQL/SQLite: RETURNING clause

    // Keywords - DML
    Select,
    Insert,
    Update,
    Delete,
    Into,
    Values,
    Set,
    From,
    Where,
    Join,
    Inner,
    Left,
    Right,
    Outer,
    Cross,
    On,
    Using,

    // Keywords - Clauses
    As,
    Distinct,
    All,
    Group,
    Having,
    Order,
    By,
    Asc,
    Desc,
    Nulls,
    First,
    Last,
    Limit,
    Offset,
    Union,
    Intersect,
    Except,

    // Keywords - Expressions
    And,
    Or,
    Not,
    Is,
    In,
    Like,
    Escape,
    Between,
    Case,
    When,
    Then,
    Else,
    End,
    Cast,
    Collate,

    // Keywords - Transactions
    Begin,
    Commit,
    Rollback,
    Transaction,
    Savepoint,
    Release,

    // Keywords - Types
    Int,
    IntegerKw,
    Bigint,
    Smallint,
    Tinyint,
    FloatKw,
    Double,
    Real,
    Decimal,
    Numeric,
    Varchar,
    Char,
    Text,
    BlobKw,
    Boolean,
    Bool,
    Date,
    Time,
    Timestamp,
    Datetime,

    // Keywords - Aggregates
    Count,
    Sum,
    Avg,
    Min,
    Max,

    // Keywords - SochDB Extensions
    Vector,
    VectorSearch,
    JsonExtract,
    JsonSet,
    ContextWindow,
    Embedding,
    Cosine,
    Euclidean,
    DotProduct,

    // Operators
    Plus,       // +
    Minus,      // -
    Star,       // *
    Slash,      // /
    Percent,    // %
    Eq,         // =
    Ne,         // != or <>
    Lt,         // <
    Le,         // <=
    Gt,         // >
    Ge,         // >=
    Concat,     // ||
    BitAnd,     // &
    BitOr,      // |
    BitNot,     // ~
    LeftShift,  // <<
    RightShift, // >>

    // Punctuation
    LParen,       // (
    RParen,       // )
    LBracket,     // [
    RBracket,     // ]
    Comma,        // ,
    Semicolon,    // ;
    Dot,          // .
    Colon,        // :
    DoubleColon,  // ::
    Arrow,        // ->
    DoubleArrow,  // ->>
    QuestionMark, // ?
    At,           // @

    // Special
    Placeholder(u32), // $1, $2, ... or ?
    Comment(String),
    Whitespace,
    Eof,
    Invalid(String),
}

// Marker impl so `TokenKind` can be used where `Eq` is required (e.g. as a
// hash-map key). See the NaN caveat in the enum documentation.
impl Eq for TokenKind {}

impl Hash for TokenKind {
    /// Feeds the variant discriminant and then, for payload-carrying variants,
    /// the payload into `state`.
    ///
    /// Floats are hashed through their bit pattern after folding `-0.0` into
    /// `0.0`, so that values which compare equal also hash equally.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        // The discriminant keeps e.g. `Identifier("x")` and `String("x")` apart.
        std::mem::discriminant(self).hash(state);
        match self {
            TokenKind::Integer(n) => n.hash(state),
            TokenKind::Float(f) => {
                // `0.0 == -0.0` under `PartialEq`, but their bit patterns differ;
                // canonicalise zero so equal tokens never hash differently.
                let canonical = if *f == 0.0 { 0.0_f64 } else { *f };
                canonical.to_bits().hash(state);
            }
            TokenKind::String(s) => s.hash(state),
            TokenKind::Blob(b) => b.hash(state),
            TokenKind::Identifier(s) => s.hash(state),
            TokenKind::QuotedIdentifier(s) => s.hash(state),
            TokenKind::Placeholder(n) => n.hash(state),
            TokenKind::Comment(s) => s.hash(state),
            TokenKind::Invalid(s) => s.hash(state),
            // Unit variants are fully described by their discriminant.
            _ => {}
        }
    }
}
290
291impl TokenKind {
292    /// Check if this token is a keyword
293    pub fn is_keyword(&self) -> bool {
294        matches!(
295            self,
296            TokenKind::Select
297                | TokenKind::Insert
298                | TokenKind::Update
299                | TokenKind::Delete
300                | TokenKind::Create
301                | TokenKind::Drop
302                | TokenKind::From
303                | TokenKind::Where
304                | TokenKind::And
305                | TokenKind::Or
306                | TokenKind::Not
307                | TokenKind::Join
308                | TokenKind::Inner
309                | TokenKind::Left
310                | TokenKind::Right
311                | TokenKind::Outer
312                | TokenKind::Cross
313                | TokenKind::On
314                | TokenKind::As
315                | TokenKind::Distinct
316                | TokenKind::All
317                | TokenKind::Group
318                | TokenKind::Having
319                | TokenKind::Order
320                | TokenKind::By
321                | TokenKind::Asc
322                | TokenKind::Desc
323                | TokenKind::Limit
324                | TokenKind::Offset
325                | TokenKind::Values
326                | TokenKind::Into
327                | TokenKind::Set
328                | TokenKind::Begin
329                | TokenKind::Commit
330                | TokenKind::Rollback
331                | TokenKind::Table
332                | TokenKind::Index
333                | TokenKind::Alter
334                | TokenKind::Primary
335                | TokenKind::Key
336                | TokenKind::Foreign
337                | TokenKind::References
338                | TokenKind::Unique
339                | TokenKind::Default
340                | TokenKind::If
341                | TokenKind::Exists
342                | TokenKind::Case
343                | TokenKind::When
344                | TokenKind::Then
345                | TokenKind::Else
346                | TokenKind::End
347                | TokenKind::Cast
348                | TokenKind::Union
349                | TokenKind::Intersect
350                | TokenKind::Except
351                | TokenKind::Count
352                | TokenKind::Sum
353                | TokenKind::Avg
354                | TokenKind::Min
355                | TokenKind::Max
356                | TokenKind::Is
357                | TokenKind::In
358                | TokenKind::Like
359                | TokenKind::Between
360                | TokenKind::Null
361                | TokenKind::True
362                | TokenKind::False
363                | TokenKind::Int
364                | TokenKind::IntegerKw
365                | TokenKind::Bigint
366                | TokenKind::Smallint
367                | TokenKind::FloatKw
368                | TokenKind::Double
369                | TokenKind::Real
370                | TokenKind::Varchar
371                | TokenKind::Char
372                | TokenKind::Text
373                | TokenKind::BlobKw
374                | TokenKind::Boolean
375                | TokenKind::Bool
376                | TokenKind::Date
377                | TokenKind::Time
378                | TokenKind::Timestamp
379                | TokenKind::Datetime
380                | TokenKind::Vector
381                | TokenKind::VectorSearch
382                | TokenKind::Embedding
383                | TokenKind::Cosine
384                | TokenKind::Euclidean
385                | TokenKind::DotProduct
386                | TokenKind::ContextWindow
387                | TokenKind::Using
388                | TokenKind::Transaction
389                | TokenKind::Savepoint
390                | TokenKind::Release
391                | TokenKind::Escape
392                | TokenKind::Nulls
393                | TokenKind::First
394                | TokenKind::Last
395                | TokenKind::AutoIncrement
396                | TokenKind::Add
397                | TokenKind::Column
398                | TokenKind::Rename
399                | TokenKind::Collate
400                | TokenKind::Tinyint
401                | TokenKind::Decimal
402                | TokenKind::Numeric
403                | TokenKind::JsonExtract
404                | TokenKind::JsonSet
405                // Conflict/Upsert keywords
406                | TokenKind::Ignore
407                | TokenKind::Replace
408                | TokenKind::Conflict
409                | TokenKind::Do
410                | TokenKind::Nothing
411                | TokenKind::Duplicate
412                | TokenKind::Abort
413                | TokenKind::Fail
414                | TokenKind::Returning
415        )
416    }
417
418    /// Get keyword from string (case-insensitive)
419    pub fn from_keyword(s: &str) -> Option<TokenKind> {
420        match s.to_uppercase().as_str() {
421            "SELECT" => Some(TokenKind::Select),
422            "INSERT" => Some(TokenKind::Insert),
423            "UPDATE" => Some(TokenKind::Update),
424            "DELETE" => Some(TokenKind::Delete),
425            "CREATE" => Some(TokenKind::Create),
426            "TABLE" => Some(TokenKind::Table),
427            "DROP" => Some(TokenKind::Drop),
428            "ALTER" => Some(TokenKind::Alter),
429            "ADD" => Some(TokenKind::Add),
430            "COLUMN" => Some(TokenKind::Column),
431            "RENAME" => Some(TokenKind::Rename),
432            "INDEX" => Some(TokenKind::Index),
433            "FROM" => Some(TokenKind::From),
434            "WHERE" => Some(TokenKind::Where),
435            "AND" => Some(TokenKind::And),
436            "OR" => Some(TokenKind::Or),
437            "NOT" => Some(TokenKind::Not),
438            "NULL" => Some(TokenKind::Null),
439            "TRUE" => Some(TokenKind::True),
440            "FALSE" => Some(TokenKind::False),
441            "IS" => Some(TokenKind::Is),
442            "IN" => Some(TokenKind::In),
443            "LIKE" => Some(TokenKind::Like),
444            "ESCAPE" => Some(TokenKind::Escape),
445            "BETWEEN" => Some(TokenKind::Between),
446            "JOIN" => Some(TokenKind::Join),
447            "INNER" => Some(TokenKind::Inner),
448            "LEFT" => Some(TokenKind::Left),
449            "RIGHT" => Some(TokenKind::Right),
450            "OUTER" => Some(TokenKind::Outer),
451            "CROSS" => Some(TokenKind::Cross),
452            "ON" => Some(TokenKind::On),
453            "USING" => Some(TokenKind::Using),
454            "AS" => Some(TokenKind::As),
455            "DISTINCT" => Some(TokenKind::Distinct),
456            "ALL" => Some(TokenKind::All),
457            "GROUP" => Some(TokenKind::Group),
458            "HAVING" => Some(TokenKind::Having),
459            "ORDER" => Some(TokenKind::Order),
460            "BY" => Some(TokenKind::By),
461            "ASC" => Some(TokenKind::Asc),
462            "DESC" => Some(TokenKind::Desc),
463            "NULLS" => Some(TokenKind::Nulls),
464            "FIRST" => Some(TokenKind::First),
465            "LAST" => Some(TokenKind::Last),
466            "LIMIT" => Some(TokenKind::Limit),
467            "OFFSET" => Some(TokenKind::Offset),
468            "VALUES" => Some(TokenKind::Values),
469            "INTO" => Some(TokenKind::Into),
470            "SET" => Some(TokenKind::Set),
471            "BEGIN" => Some(TokenKind::Begin),
472            "COMMIT" => Some(TokenKind::Commit),
473            "ROLLBACK" => Some(TokenKind::Rollback),
474            "TRANSACTION" => Some(TokenKind::Transaction),
475            "SAVEPOINT" => Some(TokenKind::Savepoint),
476            "RELEASE" => Some(TokenKind::Release),
477            "PRIMARY" => Some(TokenKind::Primary),
478            "KEY" => Some(TokenKind::Key),
479            "FOREIGN" => Some(TokenKind::Foreign),
480            "REFERENCES" => Some(TokenKind::References),
481            "UNIQUE" => Some(TokenKind::Unique),
482            "DEFAULT" => Some(TokenKind::Default),
483            "AUTOINCREMENT" | "AUTO_INCREMENT" => Some(TokenKind::AutoIncrement),
484            "IF" => Some(TokenKind::If),
485            "EXISTS" => Some(TokenKind::Exists),
486            "CASE" => Some(TokenKind::Case),
487            "WHEN" => Some(TokenKind::When),
488            "THEN" => Some(TokenKind::Then),
489            "ELSE" => Some(TokenKind::Else),
490            "END" => Some(TokenKind::End),
491            "CAST" => Some(TokenKind::Cast),
492            "COLLATE" => Some(TokenKind::Collate),
493            "UNION" => Some(TokenKind::Union),
494            "INTERSECT" => Some(TokenKind::Intersect),
495            "EXCEPT" => Some(TokenKind::Except),
496            "COUNT" => Some(TokenKind::Count),
497            "SUM" => Some(TokenKind::Sum),
498            "AVG" => Some(TokenKind::Avg),
499            "MIN" => Some(TokenKind::Min),
500            "MAX" => Some(TokenKind::Max),
501            // Conflict/Upsert keywords
502            "IGNORE" => Some(TokenKind::Ignore),
503            "REPLACE" => Some(TokenKind::Replace),
504            "CONFLICT" => Some(TokenKind::Conflict),
505            "DO" => Some(TokenKind::Do),
506            "NOTHING" => Some(TokenKind::Nothing),
507            "DUPLICATE" => Some(TokenKind::Duplicate),
508            "ABORT" => Some(TokenKind::Abort),
509            "FAIL" => Some(TokenKind::Fail),
510            "RETURNING" => Some(TokenKind::Returning),
511            // Types
512            "INT" => Some(TokenKind::Int),
513            "INTEGER" => Some(TokenKind::IntegerKw),
514            "BIGINT" => Some(TokenKind::Bigint),
515            "SMALLINT" => Some(TokenKind::Smallint),
516            "TINYINT" => Some(TokenKind::Tinyint),
517            "FLOAT" => Some(TokenKind::FloatKw),
518            "DOUBLE" => Some(TokenKind::Double),
519            "REAL" => Some(TokenKind::Real),
520            "DECIMAL" => Some(TokenKind::Decimal),
521            "NUMERIC" => Some(TokenKind::Numeric),
522            "VARCHAR" => Some(TokenKind::Varchar),
523            "CHAR" => Some(TokenKind::Char),
524            "TEXT" => Some(TokenKind::Text),
525            "BLOB" => Some(TokenKind::BlobKw),
526            "BOOLEAN" => Some(TokenKind::Boolean),
527            "BOOL" => Some(TokenKind::Bool),
528            "DATE" => Some(TokenKind::Date),
529            "TIME" => Some(TokenKind::Time),
530            "TIMESTAMP" => Some(TokenKind::Timestamp),
531            "DATETIME" => Some(TokenKind::Datetime),
532            // SochDB Extensions
533            "VECTOR" => Some(TokenKind::Vector),
534            "VECTOR_SEARCH" => Some(TokenKind::VectorSearch),
535            "JSON_EXTRACT" => Some(TokenKind::JsonExtract),
536            "JSON_SET" => Some(TokenKind::JsonSet),
537            "CONTEXT_WINDOW" => Some(TokenKind::ContextWindow),
538            "EMBEDDING" => Some(TokenKind::Embedding),
539            "COSINE" => Some(TokenKind::Cosine),
540            "EUCLIDEAN" => Some(TokenKind::Euclidean),
541            "DOT_PRODUCT" => Some(TokenKind::DotProduct),
542            _ => None,
543        }
544    }
545}
546
547impl fmt::Display for TokenKind {
548    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
549        match self {
550            TokenKind::Integer(n) => write!(f, "{}", n),
551            TokenKind::Float(n) => write!(f, "{}", n),
552            TokenKind::String(s) => write!(f, "'{}'", s),
553            TokenKind::Identifier(s) => write!(f, "{}", s),
554            TokenKind::QuotedIdentifier(s) => write!(f, "\"{}\"", s),
555            TokenKind::Select => write!(f, "SELECT"),
556            TokenKind::From => write!(f, "FROM"),
557            TokenKind::Where => write!(f, "WHERE"),
558            TokenKind::Plus => write!(f, "+"),
559            TokenKind::Minus => write!(f, "-"),
560            TokenKind::Star => write!(f, "*"),
561            TokenKind::Slash => write!(f, "/"),
562            TokenKind::Eq => write!(f, "="),
563            TokenKind::Ne => write!(f, "!="),
564            TokenKind::Lt => write!(f, "<"),
565            TokenKind::Le => write!(f, "<="),
566            TokenKind::Gt => write!(f, ">"),
567            TokenKind::Ge => write!(f, ">="),
568            TokenKind::LParen => write!(f, "("),
569            TokenKind::RParen => write!(f, ")"),
570            TokenKind::LBracket => write!(f, "["),
571            TokenKind::RBracket => write!(f, "]"),
572            TokenKind::Comma => write!(f, ","),
573            TokenKind::Semicolon => write!(f, ";"),
574            TokenKind::Dot => write!(f, "."),
575            TokenKind::Eof => write!(f, "EOF"),
576            TokenKind::Null => write!(f, "NULL"),
577            TokenKind::True => write!(f, "TRUE"),
578            TokenKind::False => write!(f, "FALSE"),
579            TokenKind::And => write!(f, "AND"),
580            TokenKind::Or => write!(f, "OR"),
581            TokenKind::Not => write!(f, "NOT"),
582            _ => write!(f, "{:?}", self),
583        }
584    }
585}