// polyglot_sql/tokens.rs

//! Token types and tokenization for SQL parsing
//!
//! This module defines all SQL token types and the tokenizer that converts
//! SQL strings into token streams.

use crate::error::{Error, Result};
use serde::{Deserialize, Serialize};
use std::fmt;

/// Parse a DollarString token text into (tag, content).
/// If the text contains '\x00', the part before is the tag and after is content.
/// Otherwise, the whole text is the content with no tag.
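///
/// A minimal usage sketch (assuming this module is reachable as
/// `polyglot_sql::tokens`):
///
/// ```ignore
/// use polyglot_sql::tokens::parse_dollar_string_token;
///
/// assert_eq!(
///     parse_dollar_string_token("tag\x00hello"),
///     (Some("tag".to_string()), "hello".to_string())
/// );
/// assert_eq!(parse_dollar_string_token("hello"), (None, "hello".to_string()));
/// ```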
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    if let Some(pos) = text.find('\x00') {
        let tag = &text[..pos];
        let content = &text[pos + 1..];
        (Some(tag.to_string()), content.to_string())
    } else {
        (None, text.to_string())
    }
}

/// Represents a position in the source SQL
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct Span {
    /// Starting byte offset
    pub start: usize,
    /// Ending byte offset (exclusive)
    pub end: usize,
    /// Line number (1-based)
    pub line: usize,
    /// Column number (1-based)
    pub column: usize,
}

impl Span {
    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
        Self { start, end, line, column }
    }
}

/// A token in the SQL token stream
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The type of token
    pub token_type: TokenType,
    /// The raw text of the token
    pub text: String,
    /// Position information
    pub span: Span,
    /// Leading comments (comments that appeared before this token)
    #[serde(default)]
    pub comments: Vec<String>,
    /// Trailing comments (comments that appeared after this token, before the next one)
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}

impl Token {
    /// Create a new token
    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
        Self {
            token_type,
            text: text.into(),
            span,
            comments: Vec::new(),
            trailing_comments: Vec::new(),
        }
    }

    /// Create a NUMBER token
    pub fn number(n: i64) -> Self {
        Self::new(TokenType::Number, n.to_string(), Span::default())
    }

    /// Create a STRING token
    pub fn string(s: impl Into<String>) -> Self {
        Self::new(TokenType::String, s, Span::default())
    }

    /// Create an IDENTIFIER token
    pub fn identifier(s: impl Into<String>) -> Self {
        Self::new(TokenType::Identifier, s, Span::default())
    }

    /// Create a VAR token
    pub fn var(s: impl Into<String>) -> Self {
        Self::new(TokenType::Var, s, Span::default())
    }

    /// Add a comment to this token
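    ///
    /// A minimal sketch of the builder style used by the constructors above:
    ///
    /// ```ignore
    /// let token = Token::identifier("users").with_comment("source table");
    /// assert_eq!(token.comments, vec!["source table".to_string()]);
    /// ```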
    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
        self.comments.push(comment.into());
        self
    }
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}({})", self.token_type, self.text)
    }
}

/// All possible token types in SQL
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt,  // <<
    GtGt,  // >>
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments (emitted as tokens for round-trip fidelity)
    BlockComment,  // /* ... */
    LineComment,   // -- ...

    // Literals
    String,
    DollarString,  // $$...$$
    TripleDoubleQuotedString,  // """..."""
    TripleSingleQuotedString,  // '''...'''
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    /// Hex number: 0xA, 0xFF (BigQuery, SQLite style) - represents an integer in hex notation
    HexNumber,
    ByteString,
    NationalString,
    EscapeString,  // PostgreSQL E'...' escape string
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data Types
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,     // !~~ operator (PostgreSQL)
    NotILike,    // !~~* operator (PostgreSQL)
    NotRLike,    // !~ operator (PostgreSQL)
    NotIRLike,   // !~* operator (PostgreSQL)
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window function keywords
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // DDL-specific keywords (Phase 4)
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    // MATCH_RECOGNIZE tokens
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // Special
    Eof,
}

impl TokenType {
    /// Check if this token type is a keyword that can be used as an identifier in certain contexts
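    ///
    /// A quick sanity check (variants taken from the match list below):
    ///
    /// ```ignore
    /// assert!(TokenType::Select.is_keyword());
    /// assert!(!TokenType::Comma.is_keyword());
    /// ```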
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            TokenType::Select
                | TokenType::From
                | TokenType::Where
                | TokenType::And
                | TokenType::Or
                | TokenType::Not
                | TokenType::In
                | TokenType::Is
                | TokenType::Null
                | TokenType::True
                | TokenType::False
                | TokenType::As
                | TokenType::On
                | TokenType::Join
                | TokenType::Left
                | TokenType::Right
                | TokenType::Inner
                | TokenType::Outer
                | TokenType::Full
                | TokenType::Cross
                | TokenType::Semi
                | TokenType::Anti
                | TokenType::Union
                | TokenType::Except
                | TokenType::Intersect
                | TokenType::GroupBy
                | TokenType::OrderBy
                | TokenType::Having
                | TokenType::Limit
                | TokenType::Offset
                | TokenType::Case
                | TokenType::When
                | TokenType::Then
                | TokenType::Else
                | TokenType::End
                | TokenType::Create
                | TokenType::Drop
                | TokenType::Alter
                | TokenType::Insert
                | TokenType::Update
                | TokenType::Delete
                | TokenType::Into
                | TokenType::Values
                | TokenType::Set
                | TokenType::With
                | TokenType::Distinct
                | TokenType::All
                | TokenType::Exists
                | TokenType::Between
                | TokenType::Like
                | TokenType::ILike
                // Additional keywords that can be used as identifiers
                | TokenType::Filter
                | TokenType::Date
                | TokenType::Timestamp
                | TokenType::TimestampTz
                | TokenType::Interval
                | TokenType::Time
                | TokenType::Table
                | TokenType::Index
                | TokenType::Column
                | TokenType::Database
                | TokenType::Schema
                | TokenType::View
                | TokenType::Function
                | TokenType::Procedure
                | TokenType::Trigger
                | TokenType::Sequence
                | TokenType::Over
                | TokenType::Partition
                | TokenType::Window
                | TokenType::Rows
                | TokenType::Range
                | TokenType::First
                | TokenType::Last
                | TokenType::Preceding
                | TokenType::Following
                | TokenType::Current
                | TokenType::Row
                | TokenType::Unbounded
                | TokenType::Array
                | TokenType::Struct
                | TokenType::Map
                | TokenType::PrimaryKey
                | TokenType::Key
                | TokenType::ForeignKey
                | TokenType::References
                | TokenType::Unique
                | TokenType::Check
                | TokenType::Default
                | TokenType::Constraint
                | TokenType::Comment
                | TokenType::Rollup
                | TokenType::Cube
                | TokenType::Grant
                | TokenType::Revoke
                | TokenType::Type
                | TokenType::Use
                | TokenType::Cache
                | TokenType::Uncache
                | TokenType::Load
                | TokenType::Any
                | TokenType::Some
                | TokenType::Asc
                | TokenType::Desc
                | TokenType::Nulls
                | TokenType::Lateral
                | TokenType::Natural
                | TokenType::Escape
                | TokenType::Glob
                | TokenType::Match
                | TokenType::Recursive
                | TokenType::Replace
                | TokenType::Returns
                | TokenType::If
                | TokenType::Pivot
                | TokenType::Unpivot
                | TokenType::Json
                | TokenType::Blob
                | TokenType::Text
                | TokenType::Int
                | TokenType::BigInt
                | TokenType::SmallInt
                | TokenType::TinyInt
                | TokenType::Int128
                | TokenType::UInt128
                | TokenType::Int256
                | TokenType::UInt256
                | TokenType::UInt
                | TokenType::UBigInt
                | TokenType::Float
                | TokenType::Double
                | TokenType::Decimal
                | TokenType::Boolean
                | TokenType::VarChar
                | TokenType::Char
                | TokenType::Binary
                | TokenType::VarBinary
                | TokenType::No
                | TokenType::DateTime
                | TokenType::Truncate
                | TokenType::Execute
                | TokenType::Merge
                | TokenType::Top
                | TokenType::Begin
                | TokenType::Generated
                | TokenType::Identity
                | TokenType::Always
                | TokenType::Extract
                // Keywords that can be identifiers in certain contexts
                | TokenType::AsOf
                | TokenType::Prior
                | TokenType::After
                | TokenType::Restrict
                | TokenType::Cascade
                | TokenType::Local
                | TokenType::Rename
                | TokenType::Enum
                | TokenType::Within
                | TokenType::Format
                | TokenType::Final
                | TokenType::FileFormat
                | TokenType::Input
                | TokenType::InputFormat
                | TokenType::Copy
                | TokenType::Put
                | TokenType::Get
                | TokenType::Show
                | TokenType::Serde
                | TokenType::Sample
                | TokenType::Sort
                | TokenType::Collate
                | TokenType::Ties
                | TokenType::IsNull
                | TokenType::NotNull
                | TokenType::Exclude
                | TokenType::Temporary
                | TokenType::Add
                | TokenType::Ordinality
                | TokenType::Overlaps
                | TokenType::Block
                | TokenType::Pattern
                | TokenType::Group
                | TokenType::Cluster
                | TokenType::Repeatable
                | TokenType::Groups
                | TokenType::Commit
                | TokenType::Warehouse
                | TokenType::System
                | TokenType::By
                | TokenType::To
                | TokenType::Fetch
                | TokenType::For
                | TokenType::Only
                | TokenType::Next
                | TokenType::Lock
                | TokenType::Refresh
                | TokenType::Settings
                | TokenType::Operator
                | TokenType::Overwrite
                | TokenType::StraightJoin
                | TokenType::Start
        )
    }

    /// Check if this token type is a comparison operator
    pub fn is_comparison(&self) -> bool {
        matches!(
            self,
            TokenType::Eq
                | TokenType::Neq
                | TokenType::Lt
                | TokenType::Lte
                | TokenType::Gt
                | TokenType::Gte
                | TokenType::NullsafeEq
        )
    }

    /// Check if this token type is an arithmetic operator
    pub fn is_arithmetic(&self) -> bool {
        matches!(
            self,
            TokenType::Plus
                | TokenType::Dash
                | TokenType::Star
                | TokenType::Slash
                | TokenType::Percent
                | TokenType::Mod
                | TokenType::Div
        )
    }
}

impl fmt::Display for TokenType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self)
    }
}

/// Tokenizer configuration for a dialect
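///
/// Dialect-specific tokenizers are built by adjusting these fields. A minimal
/// sketch (the BigQuery-style settings shown are illustrative, taken from the
/// field docs below, not a shipped dialect definition):
///
/// ```ignore
/// let mut config = TokenizerConfig::default();
/// config.b_prefix_is_byte_string = true;    // b'...' is a byte string
/// config.hex_number_strings = true;         // 0xCC is a hex literal
/// config.hex_string_is_integer_type = true; // 0xA is an integer, not a blob
/// let tokenizer = Tokenizer::new(config);
/// ```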
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keywords mapping (uppercase keyword -> token type)
    pub keywords: std::collections::HashMap<String, TokenType>,
    /// Single character tokens
    pub single_tokens: std::collections::HashMap<char, TokenType>,
    /// Quote characters (start -> end)
    pub quotes: std::collections::HashMap<String, String>,
    /// Identifier quote characters (start -> end)
    pub identifiers: std::collections::HashMap<char, char>,
    /// Comment definitions (start -> optional end)
    pub comments: std::collections::HashMap<String, Option<String>>,
    /// String escape characters
    pub string_escapes: Vec<char>,
    /// Whether to support nested comments
    pub nested_comments: bool,
    /// Valid escape follow characters (for MySQL-style escaping).
    /// When a backslash is followed by a character NOT in this list,
    /// the backslash is discarded. When empty, all backslash escapes
    /// preserve the backslash for unrecognized sequences.
    pub escape_follow_chars: Vec<char>,
    /// Whether b'...' is a byte string (true for BigQuery) or bit string (false for standard SQL).
    /// Default is false (bit string).
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}
    /// Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT)
    pub numeric_literals: std::collections::HashMap<String, String>,
    /// Whether unquoted identifiers can start with a digit (e.g., `1a`, `1_a`).
    /// When true, a number followed by letters/underscore is treated as an identifier.
    /// Used by Hive, Spark, MySQL, ClickHouse.
    pub identifiers_can_start_with_digit: bool,
    /// Whether 0x/0X prefix should be treated as hex literals.
    /// When true, `0XCC` is tokenized as a single hex literal instead of Number("0") + Identifier("XCC").
    /// Used by BigQuery, SQLite, Teradata.
    pub hex_number_strings: bool,
    /// Whether hex string literals from 0x prefix represent integer values.
    /// When true (BigQuery), 0xA is tokenized as HexNumber (integer in hex notation).
    /// When false (SQLite, Teradata), 0xCC is tokenized as HexString (binary/blob value).
    pub hex_string_is_integer_type: bool,
    /// Whether string escape sequences (like \') are allowed in raw strings.
    /// When true (BigQuery default), \' inside r'...' escapes the quote.
    /// When false (Spark/Databricks), backslashes in raw strings are always literal.
    /// Python sqlglot: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True)
    pub string_escapes_allowed_in_raw_strings: bool,
}

impl Default for TokenizerConfig {
    fn default() -> Self {
        let mut keywords = std::collections::HashMap::new();
        // Add basic SQL keywords
        keywords.insert("SELECT".to_string(), TokenType::Select);
        keywords.insert("FROM".to_string(), TokenType::From);
        keywords.insert("WHERE".to_string(), TokenType::Where);
        keywords.insert("AND".to_string(), TokenType::And);
        keywords.insert("OR".to_string(), TokenType::Or);
        keywords.insert("NOT".to_string(), TokenType::Not);
        keywords.insert("AS".to_string(), TokenType::As);
        keywords.insert("ON".to_string(), TokenType::On);
        keywords.insert("JOIN".to_string(), TokenType::Join);
        keywords.insert("LEFT".to_string(), TokenType::Left);
        keywords.insert("RIGHT".to_string(), TokenType::Right);
        keywords.insert("INNER".to_string(), TokenType::Inner);
        keywords.insert("OUTER".to_string(), TokenType::Outer);
        keywords.insert("OUTPUT".to_string(), TokenType::Output);
        keywords.insert("FULL".to_string(), TokenType::Full);
        keywords.insert("CROSS".to_string(), TokenType::Cross);
        keywords.insert("SEMI".to_string(), TokenType::Semi);
        keywords.insert("ANTI".to_string(), TokenType::Anti);
        keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
        keywords.insert("UNION".to_string(), TokenType::Union);
        keywords.insert("EXCEPT".to_string(), TokenType::Except);
        keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle/Redshift alias for EXCEPT
        keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
        keywords.insert("GROUP".to_string(), TokenType::Group);
        keywords.insert("CUBE".to_string(), TokenType::Cube);
        keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
        keywords.insert("WITHIN".to_string(), TokenType::Within);
        keywords.insert("ORDER".to_string(), TokenType::Order);
        keywords.insert("BY".to_string(), TokenType::By);
        keywords.insert("HAVING".to_string(), TokenType::Having);
        keywords.insert("LIMIT".to_string(), TokenType::Limit);
        keywords.insert("OFFSET".to_string(), TokenType::Offset);
        keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
        keywords.insert("FETCH".to_string(), TokenType::Fetch);
        keywords.insert("FIRST".to_string(), TokenType::First);
        keywords.insert("NEXT".to_string(), TokenType::Next);
        keywords.insert("ONLY".to_string(), TokenType::Only);
        keywords.insert("KEEP".to_string(), TokenType::Keep);
        keywords.insert("IGNORE".to_string(), TokenType::Ignore);
        keywords.insert("INPUT".to_string(), TokenType::Input);
        keywords.insert("CASE".to_string(), TokenType::Case);
        keywords.insert("WHEN".to_string(), TokenType::When);
        keywords.insert("THEN".to_string(), TokenType::Then);
        keywords.insert("ELSE".to_string(), TokenType::Else);
        keywords.insert("END".to_string(), TokenType::End);
        keywords.insert("ENDIF".to_string(), TokenType::End); // Exasol alias for END
        keywords.insert("NULL".to_string(), TokenType::Null);
        keywords.insert("TRUE".to_string(), TokenType::True);
        keywords.insert("FALSE".to_string(), TokenType::False);
        keywords.insert("IS".to_string(), TokenType::Is);
        keywords.insert("IN".to_string(), TokenType::In);
        keywords.insert("BETWEEN".to_string(), TokenType::Between);
        keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
        keywords.insert("LIKE".to_string(), TokenType::Like);
        keywords.insert("ILIKE".to_string(), TokenType::ILike);
        keywords.insert("RLIKE".to_string(), TokenType::RLike);
        keywords.insert("REGEXP".to_string(), TokenType::RLike);
        keywords.insert("ESCAPE".to_string(), TokenType::Escape);
        keywords.insert("EXISTS".to_string(), TokenType::Exists);
        keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
        keywords.insert("ALL".to_string(), TokenType::All);
        keywords.insert("WITH".to_string(), TokenType::With);
        keywords.insert("CREATE".to_string(), TokenType::Create);
        keywords.insert("DROP".to_string(), TokenType::Drop);
        keywords.insert("ALTER".to_string(), TokenType::Alter);
        keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
        keywords.insert("TABLE".to_string(), TokenType::Table);
        keywords.insert("VIEW".to_string(), TokenType::View);
        keywords.insert("INDEX".to_string(), TokenType::Index);
        keywords.insert("COLUMN".to_string(), TokenType::Column);
        keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
        keywords.insert("ADD".to_string(), TokenType::Add);
        keywords.insert("CASCADE".to_string(), TokenType::Cascade);
        keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
        keywords.insert("RENAME".to_string(), TokenType::Rename);
        keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
        keywords.insert("TEMP".to_string(), TokenType::Temporary);
        keywords.insert("UNIQUE".to_string(), TokenType::Unique);
        keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
        keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
        keywords.insert("KEY".to_string(), TokenType::Key);
        keywords.insert("KILL".to_string(), TokenType::Kill);
        keywords.insert("REFERENCES".to_string(), TokenType::References);
        keywords.insert("DEFAULT".to_string(), TokenType::Default);
        keywords.insert("DECLARE".to_string(), TokenType::Declare);
        keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
        keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement);  // Snowflake style
        keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
        keywords.insert("REPLACE".to_string(), TokenType::Replace);
        keywords.insert("TO".to_string(), TokenType::To);
        keywords.insert("INSERT".to_string(), TokenType::Insert);
        keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
        keywords.insert("UPDATE".to_string(), TokenType::Update);
        keywords.insert("USE".to_string(), TokenType::Use);
        keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
        keywords.insert("GLOB".to_string(), TokenType::Glob);
        keywords.insert("DELETE".to_string(), TokenType::Delete);
        keywords.insert("MERGE".to_string(), TokenType::Merge);
        keywords.insert("CACHE".to_string(), TokenType::Cache);
        keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
        keywords.insert("REFRESH".to_string(), TokenType::Refresh);
        keywords.insert("GRANT".to_string(), TokenType::Grant);
        keywords.insert("REVOKE".to_string(), TokenType::Revoke);
        keywords.insert("COMMENT".to_string(), TokenType::Comment);
        keywords.insert("COLLATE".to_string(), TokenType::Collate);
        keywords.insert("INTO".to_string(), TokenType::Into);
        keywords.insert("VALUES".to_string(), TokenType::Values);
        keywords.insert("SET".to_string(), TokenType::Set);
        keywords.insert("SETTINGS".to_string(), TokenType::Settings);
        keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
        keywords.insert("ASC".to_string(), TokenType::Asc);
        keywords.insert("DESC".to_string(), TokenType::Desc);
        keywords.insert("NULLS".to_string(), TokenType::Nulls);
        keywords.insert("RESPECT".to_string(), TokenType::Respect);
        keywords.insert("FIRST".to_string(), TokenType::First);
        keywords.insert("LAST".to_string(), TokenType::Last);
        keywords.insert("IF".to_string(), TokenType::If);
        keywords.insert("CAST".to_string(), TokenType::Cast);
        keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
        keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
        keywords.insert("OVER".to_string(), TokenType::Over);
        keywords.insert("PARTITION".to_string(), TokenType::Partition);
        keywords.insert("PLACING".to_string(), TokenType::Placing);
        keywords.insert("WINDOW".to_string(), TokenType::Window);
        keywords.insert("ROWS".to_string(), TokenType::Rows);
        keywords.insert("RANGE".to_string(), TokenType::Range);
        keywords.insert("FILTER".to_string(), TokenType::Filter);
        keywords.insert("NATURAL".to_string(), TokenType::Natural);
        keywords.insert("USING".to_string(), TokenType::Using);
        keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
        keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
        keywords.insert("FOLLOWING".to_string(), TokenType::Following);
        keywords.insert("CURRENT".to_string(), TokenType::Current);
        keywords.insert("ROW".to_string(), TokenType::Row);
        keywords.insert("GROUPS".to_string(), TokenType::Groups);
        keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
        // TRIM function position keywords
        keywords.insert("BOTH".to_string(), TokenType::Both);
        keywords.insert("LEADING".to_string(), TokenType::Leading);
        keywords.insert("TRAILING".to_string(), TokenType::Trailing);
        keywords.insert("INTERVAL".to_string(), TokenType::Interval);
        // Phase 3: Additional keywords
        keywords.insert("TOP".to_string(), TokenType::Top);
        keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
        keywords.insert("SAMPLE".to_string(), TokenType::Sample);
        keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
        keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
        keywords.insert("SYSTEM".to_string(), TokenType::System);
        keywords.insert("BLOCK".to_string(), TokenType::Block);
        keywords.insert("SEED".to_string(), TokenType::Seed);
        keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
        keywords.insert("TIES".to_string(), TokenType::Ties);
        keywords.insert("LATERAL".to_string(), TokenType::Lateral);
        keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
        keywords.insert("APPLY".to_string(), TokenType::Apply);
        // Oracle CONNECT BY keywords
        keywords.insert("CONNECT".to_string(), TokenType::Connect);
        // Hive/Spark specific keywords
        keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
        keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
        keywords.insert("SORT".to_string(), TokenType::Sort);
        keywords.insert("PIVOT".to_string(), TokenType::Pivot);
        keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
        keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
        keywords.insert("FOR".to_string(), TokenType::For);
        keywords.insert("ANY".to_string(), TokenType::Any);
        keywords.insert("SOME".to_string(), TokenType::Some);
        keywords.insert("ASOF".to_string(), TokenType::AsOf);
        keywords.insert("PERCENT".to_string(), TokenType::Percent);
        keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
        keywords.insert("NO".to_string(), TokenType::No);
        keywords.insert("OTHERS".to_string(), TokenType::Others);
        // PostgreSQL OPERATOR() syntax for schema-qualified operators
        keywords.insert("OPERATOR".to_string(), TokenType::Operator);
        // Phase 4: DDL keywords
        keywords.insert("SCHEMA".to_string(), TokenType::Schema);
        keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
        keywords.insert("DATABASE".to_string(), TokenType::Database);
        keywords.insert("FUNCTION".to_string(), TokenType::Function);
        keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
        keywords.insert("PROC".to_string(), TokenType::Procedure);
        keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
        keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
        keywords.insert("TYPE".to_string(), TokenType::Type);
        keywords.insert("DOMAIN".to_string(), TokenType::Domain);
        keywords.insert("RETURNS".to_string(), TokenType::Returns);
        keywords.insert("RETURNING".to_string(), TokenType::Returning);
        keywords.insert("LANGUAGE".to_string(), TokenType::Language);
        keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
        keywords.insert("COMMIT".to_string(), TokenType::Commit);
        keywords.insert("BEGIN".to_string(), TokenType::Begin);
        keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
        keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
        keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
        keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
        keywords.insert("BODY".to_string(), TokenType::Body);
        keywords.insert("INCREMENT".to_string(), TokenType::Increment);
        keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
        keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
        keywords.insert("CYCLE".to_string(), TokenType::Cycle);
        keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
        keywords.insert("PRIOR".to_string(), TokenType::Prior);
        // MATCH_RECOGNIZE keywords
        keywords.insert("MATCH".to_string(), TokenType::Match);
        keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
        keywords.insert("MEASURES".to_string(), TokenType::Measures);
        keywords.insert("PATTERN".to_string(), TokenType::Pattern);
        keywords.insert("DEFINE".to_string(), TokenType::Define);
        keywords.insert("RUNNING".to_string(), TokenType::Running);
        keywords.insert("FINAL".to_string(), TokenType::Final);
        keywords.insert("OWNED".to_string(), TokenType::Owned);
        keywords.insert("AFTER".to_string(), TokenType::After);
        keywords.insert("BEFORE".to_string(), TokenType::Before);
        keywords.insert("INSTEAD".to_string(), TokenType::Instead);
        keywords.insert("EACH".to_string(), TokenType::Each);
        keywords.insert("STATEMENT".to_string(), TokenType::Statement);
        keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
        keywords.insert("OLD".to_string(), TokenType::Old);
        keywords.insert("NEW".to_string(), TokenType::New);
        keywords.insert("OF".to_string(), TokenType::Of);
        keywords.insert("CHECK".to_string(), TokenType::Check);
        keywords.insert("START".to_string(), TokenType::Start);
        keywords.insert("ENUM".to_string(), TokenType::Enum);
        keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
        keywords.insert("RESTART".to_string(), TokenType::Restart);
        // Date/time literal keywords
        keywords.insert("DATE".to_string(), TokenType::Date);
        keywords.insert("TIME".to_string(), TokenType::Time);
        keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
        keywords.insert("DATETIME".to_string(), TokenType::DateTime);
        keywords.insert("GENERATED".to_string(), TokenType::Generated);
        keywords.insert("IDENTITY".to_string(), TokenType::Identity);
        keywords.insert("ALWAYS".to_string(), TokenType::Always);
        // LOAD DATA keywords
        keywords.insert("LOAD".to_string(), TokenType::Load);
        keywords.insert("LOCAL".to_string(), TokenType::Local);
        keywords.insert("INPATH".to_string(), TokenType::Inpath);
        keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
        keywords.insert("SERDE".to_string(), TokenType::Serde);
        keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
        keywords.insert("FORMAT".to_string(), TokenType::Format);
        // SQLite
        keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
        // SHOW statement
        keywords.insert("SHOW".to_string(), TokenType::Show);
        // Oracle ORDER SIBLINGS BY (hierarchical queries)
        keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
        // COPY and PUT statements (Snowflake, PostgreSQL)
        keywords.insert("COPY".to_string(), TokenType::Copy);
        keywords.insert("PUT".to_string(), TokenType::Put);
        keywords.insert("GET".to_string(), TokenType::Get);
        // EXEC/EXECUTE statement (TSQL, etc.)
        keywords.insert("EXEC".to_string(), TokenType::Execute);
        keywords.insert("EXECUTE".to_string(), TokenType::Execute);
        // Postfix null check operators (PostgreSQL/SQLite)
        keywords.insert("ISNULL".to_string(), TokenType::IsNull);
        keywords.insert("NOTNULL".to_string(), TokenType::NotNull);

        let mut single_tokens = std::collections::HashMap::new();
        single_tokens.insert('(', TokenType::LParen);
        single_tokens.insert(')', TokenType::RParen);
        single_tokens.insert('[', TokenType::LBracket);
        single_tokens.insert(']', TokenType::RBracket);
        single_tokens.insert('{', TokenType::LBrace);
        single_tokens.insert('}', TokenType::RBrace);
        single_tokens.insert(',', TokenType::Comma);
        single_tokens.insert('.', TokenType::Dot);
        single_tokens.insert(';', TokenType::Semicolon);
        single_tokens.insert('+', TokenType::Plus);
        single_tokens.insert('-', TokenType::Dash);
        single_tokens.insert('*', TokenType::Star);
        single_tokens.insert('/', TokenType::Slash);
        single_tokens.insert('%', TokenType::Percent);
        single_tokens.insert('&', TokenType::Amp);
        single_tokens.insert('|', TokenType::Pipe);
        single_tokens.insert('^', TokenType::Caret);
        single_tokens.insert('~', TokenType::Tilde);
        single_tokens.insert('<', TokenType::Lt);
        single_tokens.insert('>', TokenType::Gt);
        single_tokens.insert('=', TokenType::Eq);
        single_tokens.insert('!', TokenType::Exclamation);
        single_tokens.insert(':', TokenType::Colon);
        single_tokens.insert('@', TokenType::DAt);
        single_tokens.insert('#', TokenType::Hash);
        single_tokens.insert('$', TokenType::Dollar);
        single_tokens.insert('?', TokenType::Parameter);

        let mut quotes = std::collections::HashMap::new();
        quotes.insert("'".to_string(), "'".to_string());
        // Triple-quoted strings (e.g., """x""")
        quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());

        let mut identifiers = std::collections::HashMap::new();
        identifiers.insert('"', '"');
        identifiers.insert('`', '`');
        // Note: TSQL bracket-quoted identifiers [name] are handled in the parser
        // because [ is also used for arrays and subscripts

        let mut comments = std::collections::HashMap::new();
        comments.insert("--".to_string(), None);
        comments.insert("/*".to_string(), Some("*/".to_string()));

        Self {
            keywords,
            single_tokens,
            quotes,
            identifiers,
            comments,
            // Standard SQL: only '' (doubled quote) escapes a quote
            // Backslash escapes are dialect-specific (MySQL, etc.)
            string_escapes: vec!['\''],
            nested_comments: true,
            // By default, no escape_follow_chars means preserve backslash for unrecognized escapes
            escape_follow_chars: vec![],
            // Default: b'...' is bit string (standard SQL), not byte string (BigQuery)
            b_prefix_is_byte_string: false,
            numeric_literals: std::collections::HashMap::new(),
            identifiers_can_start_with_digit: false,
            hex_number_strings: false,
            hex_string_is_integer_type: false,
            // Default: backslash escapes ARE allowed in raw strings (sqlglot default)
            // Spark/Databricks set this to false
            string_escapes_allowed_in_raw_strings: true,
        }
    }
}

/// SQL Tokenizer
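///
/// A minimal usage sketch (relying on the comment-attachment behavior
/// implemented below):
///
/// ```ignore
/// let tokenizer = Tokenizer::default();
/// let tokens = tokenizer.tokenize("SELECT 1 -- trailing note")?;
/// assert_eq!(tokens[0].token_type, TokenType::Select);
/// // Line comments attach to the previous token for round-trip fidelity.
/// assert_eq!(tokens[1].trailing_comments, vec!["trailing note".to_string()]);
/// ```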
pub struct Tokenizer {
    config: TokenizerConfig,
}

impl Tokenizer {
    /// Create a new tokenizer with the given configuration
    pub fn new(config: TokenizerConfig) -> Self {
        Self { config }
    }

    /// Create a tokenizer with default configuration
    pub fn default_config() -> Self {
        Self::new(TokenizerConfig::default())
    }

    /// Tokenize a SQL string
    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
        let mut state = TokenizerState::new(sql, &self.config);
        state.tokenize()
    }
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::default_config()
    }
}

/// Internal state for tokenization
struct TokenizerState<'a> {
    chars: Vec<char>,
    size: usize,
    tokens: Vec<Token>,
    start: usize,
    current: usize,
    line: usize,
    column: usize,
    comments: Vec<String>,
    config: &'a TokenizerConfig,
}

impl<'a> TokenizerState<'a> {
    fn new(sql: &str, config: &'a TokenizerConfig) -> Self {
        let chars: Vec<char> = sql.chars().collect();
        let size = chars.len();
        Self {
            chars,
            size,
            tokens: Vec::new(),
            start: 0,
            current: 0,
            line: 1,
            column: 1,
            comments: Vec::new(),
            config,
        }
    }

    fn tokenize(&mut self) -> Result<Vec<Token>> {
        while !self.is_at_end() {
            self.skip_whitespace()?;
            if self.is_at_end() {
                break;
            }

            self.start = self.current;
            self.scan_token()?;
        }

        // Handle edge case: comments with no tokens (e.g., SQL that's just a comment).
        // In this case, self.comments contains leading comments that couldn't be attached
        // to any token. We can't do much with them here.
        // Note: After the first token is created, comments go directly to trailing_comments
        // via scan_block_comment/scan_line_comment, so self.comments should typically be empty.

        Ok(std::mem::take(&mut self.tokens))
    }
1362
1363    fn is_at_end(&self) -> bool {
1364        self.current >= self.size
1365    }
1366
1367    fn peek(&self) -> char {
1368        if self.is_at_end() {
1369            '\0'
1370        } else {
1371            self.chars[self.current]
1372        }
1373    }
1374
1375    fn peek_next(&self) -> char {
1376        if self.current + 1 >= self.size {
1377            '\0'
1378        } else {
1379            self.chars[self.current + 1]
1380        }
1381    }
1382
1383    fn advance(&mut self) -> char {
1384        let c = self.peek();
1385        self.current += 1;
1386        if c == '\n' {
1387            self.line += 1;
1388            self.column = 1;
1389        } else {
1390            self.column += 1;
1391        }
1392        c
1393    }
1394
    fn skip_whitespace(&mut self) -> Result<()> {
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' | '\n' => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // Check if this is a hint comment /*+ ... */
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        // This is a hint comment, handle it as a token instead of skipping
                        break;
                    }
                    // Propagate errors (e.g. an unterminated block comment) instead
                    // of silently swallowing them and dropping the rest of the input.
                    self.scan_block_comment()?;
                }
                _ => break,
            }
        }
        Ok(())
    }

    fn scan_line_comment(&mut self) {
        self.advance(); // -
        self.advance(); // -
        let start = self.current;
        while !self.is_at_end() && self.peek() != '\n' {
            self.advance();
        }
        let comment: String = self.chars[start..self.current].iter().collect();
        let comment_text = comment.trim().to_string();

        // Attach to previous token as trailing comment, or buffer for next token
        if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        } else {
            self.comments.push(comment_text);
        }
    }

    fn scan_block_comment(&mut self) -> Result<()> {
        self.advance(); // /
        self.advance(); // *
        let content_start = self.current;
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                if depth > 0 {
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
            ));
        }

        // Get the content between /* and */ (preserving internal whitespace for nested comments)
        let content: String = self.chars[content_start..self.current].iter().collect();
        self.advance(); // *
        self.advance(); // /

        // For round-trip fidelity, preserve the exact comment content including nested comments
        let comment_text = format!("/*{}*/", content);

        // Attach to previous token as trailing comment, or buffer for next token
        if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        } else {
            self.comments.push(comment_text);
        }

        Ok(())
    }

    /// Scan a hint comment /*+ ... */ and return it as a Hint token
    fn scan_hint(&mut self) -> Result<()> {
        self.advance(); // /
        self.advance(); // *
        self.advance(); // +
        let hint_start = self.current;

        // Scan until we find */
        while !self.is_at_end() {
            if self.peek() == '*' && self.peek_next() == '/' {
                break;
            }
            self.advance();
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated hint comment",
                self.line,
                self.column,
            ));
        }

        let hint_text: String = self.chars[hint_start..self.current].iter().collect();
        self.advance(); // *
        self.advance(); // /

        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());

        Ok(())
    }

    /// Scan a positional parameter: $1, $2, etc.
    fn scan_positional_parameter(&mut self) -> Result<()> {
        self.advance(); // consume $
        let start = self.current;

        while !self.is_at_end() && self.peek().is_ascii_digit() {
            self.advance();
        }

        let number: String = self.chars[start..self.current].iter().collect();
        self.add_token_with_text(TokenType::Parameter, number);
        Ok(())
    }

    /// Try to scan a tagged dollar-quoted string: $tag$content$tag$
    /// Returns Some(()) if successful, None if this isn't a tagged dollar string.
    ///
    /// The token text is stored as "tag\x00content" to preserve the tag for later use.
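    ///
    /// E.g. `$tag$hello$tag$` is stored as `"tag\x00hello"`, which
    /// `parse_dollar_string_token` at the top of this module splits back
    /// into `(Some("tag"), "hello")`.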
    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
        let saved_pos = self.current;
        let saved_line = self.line;
        let saved_column = self.column;

        // We're at '$', next char is alphabetic
        self.advance(); // consume opening $

        // Scan the tag (identifier: alphanumeric + underscore, including Unicode)
        // Tags can contain Unicode characters like emojis (e.g., $🦆$)
        let tag_start = self.current;
        while !self.is_at_end() && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii()) {
            self.advance();
        }
        let tag: String = self.chars[tag_start..self.current].iter().collect();

        // Must have a closing $ after the tag
        if self.is_at_end() || self.peek() != '$' {
            // Not a tagged dollar string - restore the position and the
            // line/column counters that advance() mutated
            self.current = saved_pos;
            self.line = saved_line;
            self.column = saved_column;
            return Ok(None);
        }
        self.advance(); // consume closing $ of opening tag

        // Now scan content until we find $tag$
        let content_start = self.current;
        let closing_tag = format!("${}$", tag);
        let closing_chars: Vec<char> = closing_tag.chars().collect();

        loop {
            if self.is_at_end() {
                // Unterminated - restore (position and line/column) and fall through
                self.current = saved_pos;
                self.line = saved_line;
                self.column = saved_column;
                return Ok(None);
            }

            // Check if we've reached the closing tag
            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
                    self.current + j < self.size && self.chars[self.current + j] == ch
                });
                if matches {
                    let content: String = self.chars[content_start..self.current].iter().collect();
                    // Consume closing tag
                    for _ in 0..closing_chars.len() {
                        self.advance();
                    }
                    // Store as "tag\x00content" to preserve the tag
                    let token_text = format!("{}\x00{}", tag, content);
                    self.add_token_with_text(TokenType::DollarString, token_text);
                    return Ok(Some(()));
                }
            }
            self.advance();
        }
    }
1590
1591    /// Scan a dollar-quoted string: $$content$$ or $tag$content$tag$
1592    ///
1593    /// For $$...$$ (no tag), the token text is just the content.
1594    /// For $tag$...$tag$, use try_scan_tagged_dollar_string instead.
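    ///
    /// Sketch (illustrative; default config):
    ///
    /// ```ignore
    /// let tokens = Tokenizer::default().tokenize("$$hello$$").unwrap();
    /// assert_eq!(tokens[0].token_type, TokenType::DollarString);
    /// assert_eq!(tokens[0].text, "hello"); // no tag, so no '\x00' separator
    /// ```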
1595    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1596        self.advance(); // consume first $
1597        self.advance(); // consume second $
1598
1599        // For $$...$$ (no tag), just scan until closing $$
1600        let start = self.current;
1601        while !self.is_at_end() {
1602            if self.peek() == '$' && self.current + 1 < self.size && self.chars[self.current + 1] == '$' {
1603                break;
1604            }
1605            self.advance();
1606        }
1607
1608        let content: String = self.chars[start..self.current].iter().collect();
1609
1610        if self.is_at_end() {
1611            return Err(Error::tokenize("Unterminated dollar-quoted string", self.line, self.column));
1612        }
1613        self.advance(); self.advance(); // consume closing $$
1614
1615        self.add_token_with_text(TokenType::DollarString, content);
1616        Ok(())
1617    }
1618
1619    fn scan_token(&mut self) -> Result<()> {
1620        let c = self.peek();
1621
1622        // Check for string literal
1623        if c == '\'' {
1624            // Check for triple-quoted string '''...''' if configured
1625            if self.config.quotes.contains_key("'''")
1626               && self.peek_next() == '\''
1627               && self.current + 2 < self.size && self.chars[self.current + 2] == '\'' {
1628                return self.scan_triple_quoted_string('\'');
1629            }
1630            return self.scan_string();
1631        }
1632
1633        // Check for triple-quoted string """...""" if configured
1634        if c == '"' && self.config.quotes.contains_key("\"\"\"")
1635           && self.peek_next() == '"'
1636           && self.current + 2 < self.size && self.chars[self.current + 2] == '"' {
1637            return self.scan_triple_quoted_string('"');
1638        }
1639
1640        // Check for double-quoted strings when dialect supports them (e.g., BigQuery)
1641        // This must come before identifier quotes check
1642        if c == '"' && self.config.quotes.contains_key("\"") && !self.config.identifiers.contains_key(&'"') {
1643            return self.scan_double_quoted_string();
1644        }
1645
1646        // Check for identifier quotes
1647        if let Some(&end_quote) = self.config.identifiers.get(&c) {
1648            return self.scan_quoted_identifier(end_quote);
1649        }
1650
1651        // Check for numbers (including numbers starting with a dot like .25)
1652        if c.is_ascii_digit() {
1653            return self.scan_number();
1654        }
1655
1656        // Check for numbers starting with a dot (e.g., .25, .5)
1657        // This must come before single character token handling
1658        // Don't treat as a number if:
1659        // - Previous char was also a dot (e.g., 1..2 should be 1, ., ., 2)
1660        // - Previous char is an identifier character (e.g., foo.25 should be foo, ., 25)
1661        //   This handles BigQuery numeric table parts like project.dataset.25
1662        if c == '.' && self.peek_next().is_ascii_digit() {
1663            let prev_char = if self.current > 0 { self.chars[self.current - 1] } else { '\0' };
1664            let is_after_ident = prev_char.is_alphanumeric() || prev_char == '_'
1665                || prev_char == '`' || prev_char == '"' || prev_char == ']'
1666                || prev_char == ')';
1667            if prev_char != '.' && !is_after_ident {
1668                return self.scan_number_starting_with_dot();
1669            }
1670        }
1671
1672        // Check for hint comment /*+ ... */
1673        if c == '/' && self.peek_next() == '*' && self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
1674            return self.scan_hint();
1675        }
1676
1677        // Check for multi-character operators first
1678        if let Some(token_type) = self.try_scan_multi_char_operator() {
1679            self.add_token(token_type);
1680            return Ok(());
1681        }
1682
1683        // Check for tagged dollar-quoted strings: $tag$content$tag$
1684        // Tags can contain Unicode characters (including emojis like 🦆) and digits (e.g., $1$)
1685        if c == '$'
1686            && (self.peek_next().is_alphanumeric() || self.peek_next() == '_' || !self.peek_next().is_ascii())
1687        {
1688            if let Some(()) = self.try_scan_tagged_dollar_string()? {
1689                return Ok(());
1690            }
1691        }
1692
1693        // Check for dollar-quoted strings: $$...$$
1694        if c == '$' && self.peek_next() == '$' {
1695            return self.scan_dollar_quoted_string();
1696        }
1697
1698        // Check for positional parameters: $1, $2, etc.
1699        if c == '$' && self.peek_next().is_ascii_digit() {
1700            return self.scan_positional_parameter();
1701        }
1702
1703        // TSQL: Check for identifiers starting with # (temp tables) or @ (variables)
1704        // e.g., #temp, ##global_temp, @variable
1705        if (c == '#' || c == '@') && (self.peek_next().is_alphanumeric() || self.peek_next() == '_' || self.peek_next() == '#') {
1706            return self.scan_tsql_identifier();
1707        }
1708
1709        // Check for single character tokens
1710        if let Some(&token_type) = self.config.single_tokens.get(&c) {
1711            self.advance();
1712            self.add_token(token_type);
1713            return Ok(());
1714        }
1715
1716        // Must be an identifier or keyword
1717        self.scan_identifier_or_keyword()
1718    }
1719
1720    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
1721        let c = self.peek();
1722        let next = self.peek_next();
1723        let third = if self.current + 2 < self.size { self.chars[self.current + 2] } else { '\0' };
1724
1725        // Check for three-character operators first
1726        // -|- (Adjacent - PostgreSQL range adjacency)
1727        if c == '-' && next == '|' && third == '-' {
1728            self.advance();
1729            self.advance();
1730            self.advance();
1731            return Some(TokenType::Adjacent);
1732        }
1733
1734        // ||/ (Cube root - PostgreSQL)
1735        if c == '|' && next == '|' && third == '/' {
1736            self.advance();
1737            self.advance();
1738            self.advance();
1739            return Some(TokenType::DPipeSlash);
1740        }
1741
1742        // #>> (JSONB path text extraction - PostgreSQL)
1743        if c == '#' && next == '>' && third == '>' {
1744            self.advance();
1745            self.advance();
1746            self.advance();
1747            return Some(TokenType::DHashArrow);
1748        }
1749
1750        // ->> (JSON text extraction - PostgreSQL/MySQL)
1751        if c == '-' && next == '>' && third == '>' {
1752            self.advance();
1753            self.advance();
1754            self.advance();
1755            return Some(TokenType::DArrow);
1756        }
1757
1758        // <=> (NULL-safe equality - MySQL)
1759        if c == '<' && next == '=' && third == '>' {
1760            self.advance();
1761            self.advance();
1762            self.advance();
1763            return Some(TokenType::NullsafeEq);
1764        }
1765
1766        // <-> (Distance operator - PostgreSQL)
1767        if c == '<' && next == '-' && third == '>' {
1768            self.advance();
1769            self.advance();
1770            self.advance();
1771            return Some(TokenType::LrArrow);
1772        }
1773
1774        // <@ (Contained by - PostgreSQL)
1775        if c == '<' && next == '@' {
1776            self.advance();
1777            self.advance();
1778            return Some(TokenType::LtAt);
1779        }
1780
1781        // @> (Contains - PostgreSQL)
1782        if c == '@' && next == '>' {
1783            self.advance();
1784            self.advance();
1785            return Some(TokenType::AtGt);
1786        }
1787
1788        // ~~~ (Glob - PostgreSQL)
1789        if c == '~' && next == '~' && third == '~' {
1790            self.advance();
1791            self.advance();
1792            self.advance();
1793            return Some(TokenType::Glob);
1794        }
1795
1796        // ~~* (ILike - PostgreSQL)
1797        if c == '~' && next == '~' && third == '*' {
1798            self.advance();
1799            self.advance();
1800            self.advance();
1801            return Some(TokenType::ILike);
1802        }
1803
1804        // !~~* (Not ILike - PostgreSQL)
1805        let fourth = if self.current + 3 < self.size { self.chars[self.current + 3] } else { '\0' };
1806        if c == '!' && next == '~' && third == '~' && fourth == '*' {
1807            self.advance();
1808            self.advance();
1809            self.advance();
1810            self.advance();
1811            return Some(TokenType::NotILike);
1812        }
1813
1814        // !~~ (Not Like - PostgreSQL)
1815        if c == '!' && next == '~' && third == '~' {
1816            self.advance();
1817            self.advance();
1818            self.advance();
1819            return Some(TokenType::NotLike);
1820        }
1821
1822        // !~* (Not Regexp ILike - PostgreSQL)
1823        if c == '!' && next == '~' && third == '*' {
1824            self.advance();
1825            self.advance();
1826            self.advance();
1827            return Some(TokenType::NotIRLike);
1828        }
1829
1830        // !:> (Not cast / Try cast - SingleStore)
1831        if c == '!' && next == ':' && third == '>' {
1832            self.advance();
1833            self.advance();
1834            self.advance();
1835            return Some(TokenType::NColonGt);
1836        }
1837
1838        // ?:: (TRY_CAST shorthand - Databricks)
1839        if c == '?' && next == ':' && third == ':' {
1840            self.advance();
1841            self.advance();
1842            self.advance();
1843            return Some(TokenType::QDColon);
1844        }
1845
1846        // !~ (Not Regexp - PostgreSQL)
1847        if c == '!' && next == '~' {
1848            self.advance();
1849            self.advance();
1850            return Some(TokenType::NotRLike);
1851        }
1852
1853        // ~~ (Like - PostgreSQL)
1854        if c == '~' && next == '~' {
1855            self.advance();
1856            self.advance();
1857            return Some(TokenType::Like);
1858        }
1859
1860        // ~* (Regexp ILike - PostgreSQL)
1861        if c == '~' && next == '*' {
1862            self.advance();
1863            self.advance();
1864            return Some(TokenType::IRLike);
1865        }
1866
1867        // SingleStore three-character JSON path operators (must be checked before :: two-char)
1868        // ::$ (JSON extract string), ::% (JSON extract double), ::? (JSON match)
1869        if c == ':' && next == ':' && third == '$' {
1870            self.advance();
1871            self.advance();
1872            self.advance();
1873            return Some(TokenType::DColonDollar);
1874        }
1875        if c == ':' && next == ':' && third == '%' {
1876            self.advance();
1877            self.advance();
1878            self.advance();
1879            return Some(TokenType::DColonPercent);
1880        }
1881        if c == ':' && next == ':' && third == '?' {
1882            self.advance();
1883            self.advance();
1884            self.advance();
1885            return Some(TokenType::DColonQMark);
1886        }
1887
1888        // Two-character operators
1889        let token_type = match (c, next) {
1890            ('.', ':') => Some(TokenType::DotColon),
1891            ('=', '=') => Some(TokenType::Eq),  // Hive/Spark == equality operator
1892            ('<', '=') => Some(TokenType::Lte),
1893            ('>', '=') => Some(TokenType::Gte),
1894            ('!', '=') => Some(TokenType::Neq),
1895            ('<', '>') => Some(TokenType::Neq),
1896            ('^', '=') => Some(TokenType::Neq),
1897            ('<', '<') => Some(TokenType::LtLt),
1898            ('>', '>') => Some(TokenType::GtGt),
1899            ('|', '|') => Some(TokenType::DPipe),
1900            ('|', '/') => Some(TokenType::PipeSlash),  // Square root - PostgreSQL
1901            (':', ':') => Some(TokenType::DColon),
1902            (':', '=') => Some(TokenType::ColonEq),    // := (assignment, named args)
1903            (':', '>') => Some(TokenType::ColonGt),    // :> cast (SingleStore)
1904            ('-', '>') => Some(TokenType::Arrow),      // JSON object access
1905            ('=', '>') => Some(TokenType::FArrow),     // Fat arrow (lambda)
1906            ('&', '&') => Some(TokenType::DAmp),
1907            ('&', '<') => Some(TokenType::AmpLt),      // PostgreSQL range operator
1908            ('&', '>') => Some(TokenType::AmpGt),      // PostgreSQL range operator
1909            ('@', '@') => Some(TokenType::AtAt),       // Text search match
1910            ('?', '|') => Some(TokenType::QMarkPipe),  // JSONB contains any key
1911            ('?', '&') => Some(TokenType::QMarkAmp),   // JSONB contains all keys
1912            ('?', '?') => Some(TokenType::DQMark),     // Double question mark
1913            ('#', '>') => Some(TokenType::HashArrow),  // JSONB path extraction
1914            ('#', '-') => Some(TokenType::HashDash),   // JSONB delete
1915            ('^', '@') => Some(TokenType::CaretAt),    // PostgreSQL starts-with operator
1916            ('*', '*') => Some(TokenType::DStar),      // Power operator
1917            ('|', '>') => Some(TokenType::PipeGt),     // Pipe-greater (some dialects)
1918            _ => None,
1919        };
1920
1921        if token_type.is_some() {
1922            self.advance();
1923            self.advance();
1924        }
1925
1926        token_type
1927    }
1928
1929    fn scan_string(&mut self) -> Result<()> {
1930        self.advance(); // Opening quote
1931        let mut value = String::new();
1932
1933        while !self.is_at_end() {
1934            let c = self.peek();
1935            if c == '\'' {
1936                if self.peek_next() == '\'' {
1937                    // Escaped quote
1938                    value.push('\'');
1939                    self.advance();
1940                    self.advance();
1941                } else {
1942                    break;
1943                }
1944            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
1945                // Handle escape sequences
1946                self.advance(); // Consume the backslash
1947                if !self.is_at_end() {
1948                    let escaped = self.advance();
1949                    match escaped {
1950                        'n' => value.push('\n'),
1951                        'r' => value.push('\r'),
1952                        't' => value.push('\t'),
1953                        '0' => value.push('\0'),
1954                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
1955                        'a' => value.push('\x07'), // Alert/bell
1956                        'b' => value.push('\x08'), // Backspace
1957                        'f' => value.push('\x0C'), // Form feed
1958                        'v' => value.push('\x0B'), // Vertical tab
1959                        '\\' => value.push('\\'),
1960                        '\'' => value.push('\''),
1961                        '"' => value.push('"'),
1962                        '%' => {
1963                            // MySQL: \% in LIKE patterns
1964                            value.push('%');
1965                        }
1966                        '_' => {
1967                            // MySQL: \_ in LIKE patterns
1968                            value.push('_');
1969                        }
1970                        // For unrecognized escape sequences:
1971                        // If escape_follow_chars is non-empty (MySQL-style), drop the backslash and keep the char
1972                        // Otherwise (empty list), preserve backslash + char
1973                        _ => {
1974                            if !self.config.escape_follow_chars.is_empty() {
1975                                // MySQL-style: discard backslash for unrecognized escapes
1976                                value.push(escaped);
1977                            } else {
1978                                // Standard: preserve backslash + char
1979                                value.push('\\');
1980                                value.push(escaped);
1981                            }
1982                        }
1983                    }
1984                }
1985            } else {
1986                value.push(self.advance());
1987            }
1988        }
1989
1990        if self.is_at_end() {
1991            return Err(Error::tokenize(
1992                "Unterminated string",
1993                self.line,
1994                self.column,
1995            ));
1996        }
1997
1998        self.advance(); // Closing quote
1999        self.add_token_with_text(TokenType::String, value);
2000        Ok(())
2001    }
2002
2003    /// Scan a double-quoted string (for dialects like BigQuery where " is a string delimiter)
2004    fn scan_double_quoted_string(&mut self) -> Result<()> {
2005        self.advance(); // Opening quote
2006        let mut value = String::new();
2007
2008        while !self.is_at_end() {
2009            let c = self.peek();
2010            if c == '"' {
2011                if self.peek_next() == '"' {
2012                    // Escaped quote
2013                    value.push('"');
2014                    self.advance();
2015                    self.advance();
2016                } else {
2017                    break;
2018                }
2019            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2020                // Handle escape sequences
2021                self.advance(); // Consume the backslash
2022                if !self.is_at_end() {
2023                    let escaped = self.advance();
2024                    match escaped {
2025                        'n' => value.push('\n'),
2026                        'r' => value.push('\r'),
2027                        't' => value.push('\t'),
2028                        '0' => value.push('\0'),
2029                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2030                        'a' => value.push('\x07'), // Alert/bell
2031                        'b' => value.push('\x08'), // Backspace
2032                        'f' => value.push('\x0C'), // Form feed
2033                        'v' => value.push('\x0B'), // Vertical tab
2034                        '\\' => value.push('\\'),
2035                        '\'' => value.push('\''),
2036                        '"' => value.push('"'),
2037                        '%' => {
2038                            // MySQL: \% in LIKE patterns
2039                            value.push('%');
2040                        }
2041                        '_' => {
2042                            // MySQL: \_ in LIKE patterns
2043                            value.push('_');
2044                        }
2045                        // For unrecognized escape sequences:
2046                        // If escape_follow_chars is non-empty (MySQL-style), drop the backslash and keep the char
2047                        // Otherwise (empty list), preserve backslash + char
2048                        _ => {
2049                            if !self.config.escape_follow_chars.is_empty() {
2050                                // MySQL-style: discard backslash for unrecognized escapes
2051                                value.push(escaped);
2052                            } else {
2053                                // Standard: preserve backslash + char
2054                                value.push('\\');
2055                                value.push(escaped);
2056                            }
2057                        }
2058                    }
2059                }
2060            } else {
2061                value.push(self.advance());
2062            }
2063        }
2064
2065        if self.is_at_end() {
2066            return Err(Error::tokenize(
2067                "Unterminated double-quoted string",
2068                self.line,
2069                self.column,
2070            ));
2071        }
2072
2073        self.advance(); // Closing quote
2074        self.add_token_with_text(TokenType::String, value);
2075        Ok(())
2076    }
2077
2078    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2079        // Advance past the three opening quotes
2080        self.advance();
2081        self.advance();
2082        self.advance();
2083        let mut value = String::new();
2084
2085        while !self.is_at_end() {
2086            // Check for closing triple quote
2087            if self.peek() == quote_char
2088                && self.current + 1 < self.size && self.chars[self.current + 1] == quote_char
2089                && self.current + 2 < self.size && self.chars[self.current + 2] == quote_char
2090            {
2091                // Found closing """
2092                break;
2093            }
2094            value.push(self.advance());
2095        }
2096
2097        if self.is_at_end() {
2098            return Err(Error::tokenize(
2099                "Unterminated triple-quoted string",
2100                self.line,
2101                self.column,
2102            ));
2103        }
2104
2105        // Advance past the three closing quotes
2106        self.advance();
2107        self.advance();
2108        self.advance();
2109        let token_type = if quote_char == '"' {
2110            TokenType::TripleDoubleQuotedString
2111        } else {
2112            TokenType::TripleSingleQuotedString
2113        };
2114        self.add_token_with_text(token_type, value);
2115        Ok(())
2116    }
2117
2118    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2119        self.advance(); // Opening quote
2120        let mut value = String::new();
2121        while !self.is_at_end() {
2122            if self.peek() == end_quote {
2123                if self.peek_next() == end_quote { // escaped (doubled) quote - keep one
2124                    value.push(end_quote);
2125                    self.advance();
2126                    self.advance();
2127                } else {
2128                    break;
2129                }
2130            } else {
2131                value.push(self.advance());
2132            }
2133        }
2134
2135        if self.is_at_end() {
2136            return Err(Error::tokenize("Unterminated identifier", self.line, self.column));
2137        }
2138
2139        self.advance(); // Closing quote
2140        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2141        Ok(())
2142    }
2143
2144    fn scan_number(&mut self) -> Result<()> {
2145        // Check for 0x/0X hex number prefix (SQLite-style)
2146        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
2147            let next = if self.current + 1 < self.size { self.chars[self.current + 1] } else { '\0' };
2148            if next == 'x' || next == 'X' {
2149                // Advance past '0' and 'x'/'X'
2150                self.advance();
2151                self.advance();
2152                // Collect hex digits
2153                let hex_start = self.current;
2154                while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2155                    self.advance();
2156                }
2157                if self.current > hex_start {
2158                    let hex_value: String = self.chars[hex_start..self.current].iter().collect();
2159                    if self.config.hex_string_is_integer_type {
2160                        // BigQuery: 0xA represents an integer in hex notation
2161                        self.add_token_with_text(TokenType::HexNumber, hex_value);
2162                    } else {
2163                        // SQLite/Teradata: 0xCC represents a binary/blob hex string
2164                        self.add_token_with_text(TokenType::HexString, hex_value);
2165                    }
2166                    return Ok(());
2167                }
2168                // No hex digits after 0x - fall through to normal number parsing
2169                // (reset current back to after '0')
2170                self.current = self.start + 1;
2171            }
2172        }
2173
2174        // Allow underscores as digit separators (e.g., 20_000, 1_000_000)
2175        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2176            // Don't allow underscore at the end (must be followed by digit)
2177            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2178                break;
2179            }
2180            self.advance();
2181        }
2182
2183        // Look for decimal part - allow trailing dot (e.g., "1.")
2184        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
2185        // So we always consume the dot as part of the number, even if followed by an identifier
2186        if self.peek() == '.' {
2187            let next = self.peek_next();
2188            // Only consume the dot if:
2189            // 1. Followed by a digit (normal decimal like 1.5)
2190            // 2. Followed by an identifier start (like 1.x -> becomes 1. with alias x)
2191            // 3. End of input or other non-dot character (trailing decimal like "1.")
2192            // Do NOT consume if it's a double dot (..) which is a range operator
2193            if next != '.' {
2194                self.advance(); // consume the .
2195                // Only consume digits after the decimal point (not identifiers)
2196                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2197                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2198                        break;
2199                    }
2200                    self.advance();
2201                }
2202            }
2203        }
2204
2205        // Look for exponent
2206        if self.peek() == 'e' || self.peek() == 'E' {
2207            self.advance();
2208            if self.peek() == '+' || self.peek() == '-' {
2209                self.advance();
2210            }
2211            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2212                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2213                    break;
2214                }
2215                self.advance();
2216            }
2217        }
2218
2219        let text: String = self.chars[self.start..self.current].iter().collect();
2220
2221        // Check for numeric literal suffixes (e.g., 1L -> BIGINT, 1s -> SMALLINT in Hive/Spark)
2222        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2223            let next_char = self.peek().to_uppercase().to_string();
2224            // Try 2-char suffix first (e.g., "BD"), then 1-char
2225            let suffix_match = if self.current + 1 < self.size {
2226                let two_char: String = vec![self.chars[self.current], self.chars[self.current + 1]]
2227                    .iter().collect::<String>().to_uppercase();
2228                if self.config.numeric_literals.contains_key(&two_char) {
2229                    // Make sure the 2-char suffix is not followed by more identifier chars
2230                    let after_suffix = if self.current + 2 < self.size { self.chars[self.current + 2] } else { ' ' };
2231                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2232                        Some((two_char, 2))
2233                    } else {
2234                        None
2235                    }
2236                } else if self.config.numeric_literals.contains_key(&next_char) {
2237                    // 1-char suffix - make sure not followed by more identifier chars
2238                    let after_suffix = if self.current + 1 < self.size { self.chars[self.current + 1] } else { ' ' };
2239                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2240                        Some((next_char, 1))
2241                    } else {
2242                        None
2243                    }
2244                } else {
2245                    None
2246                }
2247            } else if self.config.numeric_literals.contains_key(&next_char) {
2248                // At end of input, 1-char suffix
2249                Some((next_char, 1))
2250            } else {
2251                None
2252            };
2253
2254            if let Some((suffix, len)) = suffix_match {
2255                // Consume the suffix characters
2256                for _ in 0..len {
2257                    self.advance();
2258                }
2259                // Emit as a special number-with-suffix token
2260                // We'll encode as "number::TYPE" so the parser can split it
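                // e.g., with numeric_literals mapping "L" -> "BIGINT" (Hive/Spark style),
                // the input `1L` is emitted as a Number token with text "1::BIGINT".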
2261                let type_name = self.config.numeric_literals.get(&suffix).expect("suffix verified by contains_key above").clone();
2262                let combined = format!("{}::{}", text, type_name);
2263                self.add_token_with_text(TokenType::Number, combined);
2264                return Ok(());
2265            }
2266        }
2267
2268        // Check for identifiers that start with a digit (e.g., 1a, 1_a, 1a_1a)
2269        // In Hive/Spark/MySQL/ClickHouse, these are valid unquoted identifiers
2270        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2271            let next = self.peek();
2272            if next.is_alphabetic() || next == '_' {
2273                // Continue scanning as an identifier
2274                while !self.is_at_end() {
2275                    let ch = self.peek();
2276                    if ch.is_alphanumeric() || ch == '_' {
2277                        self.advance();
2278                    } else {
2279                        break;
2280                    }
2281                }
2282                let ident_text: String = self.chars[self.start..self.current].iter().collect();
2283                self.add_token_with_text(TokenType::Identifier, ident_text);
2284                return Ok(());
2285            }
2286        }
2287
2288        self.add_token_with_text(TokenType::Number, text);
2289        Ok(())
2290    }
2291
2292    /// Scan a number that starts with a dot (e.g., .25, .5, .123e10)
2293    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2294        // Consume the leading dot
2295        self.advance();
2296
2297        // Consume the fractional digits
2298        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2299            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2300                break;
2301            }
2302            self.advance();
2303        }
2304
2305        // Look for exponent
2306        if self.peek() == 'e' || self.peek() == 'E' {
2307            self.advance();
2308            if self.peek() == '+' || self.peek() == '-' {
2309                self.advance();
2310            }
2311            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2312                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2313                    break;
2314                }
2315                self.advance();
2316            }
2317        }
2318
2319        let text: String = self.chars[self.start..self.current].iter().collect();
2320        self.add_token_with_text(TokenType::Number, text);
2321        Ok(())
2322    }
2323
2324    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
2325        // Guard against unrecognized characters that could cause infinite loops
2326        let first_char = self.peek();
2327        if !first_char.is_alphanumeric() && first_char != '_' {
2328            // Unknown character - skip it and return an error
2329            let c = self.advance();
2330            return Err(Error::tokenize(
2331                format!("Unexpected character: '{}'", c),
2332                self.line,
2333                self.column,
2334            ));
2335        }
2336
2337        while !self.is_at_end() {
2338            let c = self.peek();
2339            // Allow alphanumeric, underscore, $, # and @ in identifiers
2340            // PostgreSQL allows $, TSQL allows # and @
2341            // But stop consuming # if followed by > or >> (PostgreSQL #> and #>> operators)
2342            if c == '#' {
2343                let next_c = if self.current + 1 < self.size { self.chars[self.current + 1] } else { '\0' };
2344                if next_c == '>' || next_c == '-' {
2345                    break; // Don't consume # — it's part of #>, #>>, or #- operator
2346                }
2347                self.advance();
2348            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
2349                self.advance();
2350            } else {
2351                break;
2352            }
2353        }
2354
2355        let text: String = self.chars[self.start..self.current].iter().collect();
2356        let upper = text.to_uppercase();
2357
2358        // Special-case NOT= (Teradata and other dialects)
2359        if upper == "NOT" && self.peek() == '=' {
2360            self.advance(); // consume '='
2361            self.add_token(TokenType::Neq);
2362            return Ok(());
2363        }
2364
2365        // Check for special string prefixes like N'...', X'...', B'...', U&'...', r'...', b'...'
2366        // Also handle double-quoted variants for dialects that support them (e.g., BigQuery)
2367        let next_char = self.peek();
2368        let is_single_quote = next_char == '\'';
2369        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
2370        // For raw strings (r"..." or r'...'), we allow double quotes even if " is not in quotes config
2371        // because raw strings are a special case used in Spark/Databricks where " is for identifiers
2372        let is_double_quote_for_raw = next_char == '"';
2373
2374        // Handle raw strings first - they're special because they work with both ' and "
2375        // even in dialects where " is normally an identifier delimiter (like Databricks)
2376        if upper == "R" && (is_single_quote || is_double_quote_for_raw) {
2377            // Raw string r'...' or r"..." or r'''...''' or r"""...""" (BigQuery style)
2378            // In raw strings, backslashes are treated literally (no escape processing)
2379            let quote_char = if is_single_quote { '\'' } else { '"' };
2380            self.advance(); // consume the first opening quote
2381
2382            // Check for triple-quoted raw string (r"""...""" or r'''...''')
2383            if self.peek() == quote_char && self.peek_next() == quote_char {
2384                // Triple-quoted raw string
2385                self.advance(); // consume second quote
2386                self.advance(); // consume third quote
2387                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
2388                self.add_token_with_text(TokenType::RawString, string_value);
2389            } else {
2390                let string_value = self.scan_raw_string_content(quote_char)?;
2391                self.add_token_with_text(TokenType::RawString, string_value);
2392            }
2393            return Ok(());
2394        }
2395
2396        if is_single_quote || is_double_quote {
2397            match upper.as_str() {
2398                "N" => {
2399                    // National string N'...'
2400                    self.advance(); // consume the opening quote
2401                    let string_value = if is_single_quote {
2402                        self.scan_string_content()?
2403                    } else {
2404                        self.scan_double_quoted_string_content()?
2405                    };
2406                    self.add_token_with_text(TokenType::NationalString, string_value);
2407                    return Ok(());
2408                }
2409                "E" => {
2410                    // PostgreSQL escape string E'...' or e'...'
2411                    // Preserve the case by prefixing with "e:" or "E:"
2412                    // Always use backslash escapes for escape strings (e.g., \' is an escaped quote)
2413                    let lowercase = text == "e";
2414                    let prefix = if lowercase { "e:" } else { "E:" };
2415                    self.advance(); // consume the opening quote
2416                    let string_value = self.scan_string_content_with_escapes(true)?;
2417                    self.add_token_with_text(TokenType::EscapeString, format!("{}{}", prefix, string_value));
2418                    return Ok(());
2419                }
2420                "X" => {
2421                    // Hex string X'...'
2422                    self.advance(); // consume the opening quote
2423                    let string_value = if is_single_quote {
2424                        self.scan_string_content()?
2425                    } else {
2426                        self.scan_double_quoted_string_content()?
2427                    };
2428                    self.add_token_with_text(TokenType::HexString, string_value);
2429                    return Ok(());
2430                }
2431                "B" if is_double_quote => {
2432                    // Byte string b"..." (BigQuery style) - MUST check before single quote B'...'
2433                    self.advance(); // consume the opening quote
2434                    let string_value = self.scan_double_quoted_string_content()?;
2435                    self.add_token_with_text(TokenType::ByteString, string_value);
2436                    return Ok(());
2437                }
2438                "B" if is_single_quote => {
2439                    // For BigQuery: b'...' is a byte string (bytes data)
2440                    // For standard SQL: B'...' is a bit string (binary digits)
2441                    self.advance(); // consume the opening quote
2442                    let string_value = self.scan_string_content()?;
2443                    if self.config.b_prefix_is_byte_string {
2444                        self.add_token_with_text(TokenType::ByteString, string_value);
2445                    } else {
2446                        self.add_token_with_text(TokenType::BitString, string_value);
2447                    }
2448                    return Ok(());
2449                }
2450                _ => {}
2451            }
2452        }
2453
2454        // Check for U&'...' Unicode string syntax (SQL standard)
2455        if upper == "U" && self.peek() == '&' && self.current + 1 < self.size && self.chars[self.current + 1] == '\'' {
2456            self.advance(); // consume '&'
2457            self.advance(); // consume opening quote
2458            let string_value = self.scan_string_content()?;
2459            self.add_token_with_text(TokenType::UnicodeString, string_value);
2460            return Ok(());
2461        }
2462
2463        let token_type = self
2464            .config
2465            .keywords
2466            .get(&upper)
2467            .copied()
2468            .unwrap_or(TokenType::Var);
2469
2470        self.add_token_with_text(token_type, text);
2471        Ok(())
2472    }
2473
2474    /// Scan string content (everything between quotes)
2475    /// If `force_backslash_escapes` is true, backslash is always treated as an escape character
2476    /// (used for PostgreSQL E'...' escape strings)
2477    fn scan_string_content_with_escapes(&mut self, force_backslash_escapes: bool) -> Result<String> {
2478        let mut value = String::new();
2479        let use_backslash_escapes = force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2480
2481        while !self.is_at_end() {
2482            let c = self.peek();
2483            if c == '\'' {
2484                if self.peek_next() == '\'' {
2485                    // Escaped quote ''
2486                    value.push('\'');
2487                    self.advance();
2488                    self.advance();
2489                } else {
2490                    break;
2491                }
2492            } else if c == '\\' && use_backslash_escapes {
2493                // Preserve escape sequences literally (including \' for escape strings)
2494                value.push(self.advance());
2495                if !self.is_at_end() {
2496                    value.push(self.advance());
2497                }
2498            } else {
2499                value.push(self.advance());
2500            }
2501        }
2502
2503        if self.is_at_end() {
2504            return Err(Error::tokenize(
2505                "Unterminated string",
2506                self.line,
2507                self.column,
2508            ));
2509        }
2510
2511        self.advance(); // Closing quote
2512        Ok(value)
2513    }
2514
2515    /// Scan string content (everything between quotes)
2516    fn scan_string_content(&mut self) -> Result<String> {
2517        self.scan_string_content_with_escapes(false)
2518    }
2519
2520    /// Scan double-quoted string content (for dialects like BigQuery where " is a string delimiter)
2521    /// This is used for prefixed strings like b"..." or N"..."
2522    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
2523        let mut value = String::new();
2524        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');
2525
2526        while !self.is_at_end() {
2527            let c = self.peek();
2528            if c == '"' {
2529                if self.peek_next() == '"' {
2530                    // Escaped quote ""
2531                    value.push('"');
2532                    self.advance();
2533                    self.advance();
2534                } else {
2535                    break;
2536                }
2537            } else if c == '\\' && use_backslash_escapes {
2538                // Handle escape sequences
2539                self.advance(); // Consume backslash
2540                if !self.is_at_end() {
2541                    let escaped = self.advance();
2542                    match escaped {
2543                        'n' => value.push('\n'),
2544                        'r' => value.push('\r'),
2545                        't' => value.push('\t'),
2546                        '0' => value.push('\0'),
2547                        '\\' => value.push('\\'),
2548                        '"' => value.push('"'),
2549                        '\'' => value.push('\''),
2550                        'x' => {
2551                            // Hex escape \xNN - collect hex digits
2552                            let mut hex = String::new();
2553                            for _ in 0..2 {
2554                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2555                                    hex.push(self.advance());
2556                                }
2557                            }
2558                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2559                                value.push(byte as char);
2560                            } else {
2561                                // Invalid hex escape, keep it literal
2562                                value.push('\\');
2563                                value.push('x');
2564                                value.push_str(&hex);
2565                            }
2566                        }
2567                        _ => {
2568                            // For unrecognized escapes, preserve backslash + char
2569                            value.push('\\');
2570                            value.push(escaped);
2571                        }
2572                    }
2573                }
2574            } else {
2575                value.push(self.advance());
2576            }
2577        }
2578
2579        if self.is_at_end() {
2580            return Err(Error::tokenize(
2581                "Unterminated double-quoted string",
2582                self.line,
2583                self.column,
2584            ));
2585        }
2586
2587        self.advance(); // Closing quote
2588        Ok(value)
2589    }
2590
2591    /// Scan raw string content (limited escape processing for quotes)
2592    /// Used for BigQuery r'...' and r"..." strings
2593    /// In raw strings, backslashes are literal EXCEPT that escape sequences for the
2594    /// quote character still work (e.g., \' in r'...' escapes the quote, '' also works)
2595    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
2596        let mut value = String::new();
2597
2598        while !self.is_at_end() {
2599            let c = self.peek();
2600            if c == quote_char {
2601                if self.peek_next() == quote_char {
2602                    // Escaped quote (doubled) - e.g., '' inside r'...'
2603                    value.push(quote_char);
2604                    self.advance();
2605                    self.advance();
2606                } else {
2607                    break;
2608                }
2609            } else if c == '\\' && self.peek_next() == quote_char && self.config.string_escapes_allowed_in_raw_strings {
2610                // Backslash-escaped quote - works in raw strings when string_escapes_allowed_in_raw_strings is true
2611                // e.g., \' inside r'...' becomes literal ' (BigQuery behavior)
2612                // Spark/Databricks has this set to false, so backslash is always literal there
2613                value.push(quote_char);
2614                self.advance(); // consume backslash
2615                self.advance(); // consume quote
2616            } else {
2617                // In raw strings, everything including backslashes is literal
2618                value.push(self.advance());
2619            }
2620        }
2621
2622        if self.is_at_end() {
2623            return Err(Error::tokenize(
2624                "Unterminated raw string",
2625                self.line,
2626                self.column,
2627            ));
2628        }
2629
2630        self.advance(); // Closing quote
2631        Ok(value)
2632    }
2633
2634    /// Scan raw triple-quoted string content (r"""...""" or r'''...''')
2635    /// Terminates when three consecutive quote_chars are found
2636    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
2637        let mut value = String::new();
2638
2639        while !self.is_at_end() {
2640            let c = self.peek();
2641            if c == quote_char && self.peek_next() == quote_char {
2642                // Check for third quote
2643                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
2644                    // Found three consecutive quotes - end of string
2645                    self.advance(); // first closing quote
2646                    self.advance(); // second closing quote
2647                    self.advance(); // third closing quote
2648                    return Ok(value);
2649                }
2650            }
2651            // In raw strings, everything including backslashes is literal
2652            let ch = self.advance();
2653            value.push(ch);
2654        }
2655
2656        Err(Error::tokenize(
2657            "Unterminated raw triple-quoted string",
2658            self.line,
2659            self.column,
2660        ))
2661    }
2662
2663    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables)
2664    /// Examples: #temp, ##global_temp, @variable
2665    fn scan_tsql_identifier(&mut self) -> Result<()> {
2666        // Consume the leading # or @ (or ##)
2667        let first = self.advance();
2668
2669        // For ##, consume the second #
2670        if first == '#' && self.peek() == '#' {
2671            self.advance();
2672        }
2673
2674        // Now scan the rest of the identifier
2675        while !self.is_at_end() {
2676            let c = self.peek();
2677            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
2678                self.advance();
2679            } else {
2680                break;
2681            }
2682        }
2683
2684        let text: String = self.chars[self.start..self.current].iter().collect();
2685        // These are always identifiers (variables or temp table names), never keywords
2686        self.add_token_with_text(TokenType::Var, text);
2687        Ok(())
2688    }
2689
2690    fn add_token(&mut self, token_type: TokenType) {
2691        let text: String = self.chars[self.start..self.current].iter().collect();
2692        self.add_token_with_text(token_type, text);
2693    }
2694
2695    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
2696        let span = Span::new(self.start, self.current, self.line, self.column);
2697        let mut token = Token::new(token_type, text, span);
2698        token.comments.append(&mut self.comments);
2699        self.tokens.push(token);
2700    }
2701}
2702
2703#[cfg(test)]
2704mod tests {
2705    use super::*;
2706
2707    #[test]
2708    fn test_simple_select() {
2709        let tokenizer = Tokenizer::default();
2710        let tokens = tokenizer.tokenize("SELECT 1").unwrap();
2711
2712        assert_eq!(tokens.len(), 2);
2713        assert_eq!(tokens[0].token_type, TokenType::Select);
2714        assert_eq!(tokens[1].token_type, TokenType::Number);
2715        assert_eq!(tokens[1].text, "1");
2716    }
2717
2718    #[test]
2719    fn test_select_with_identifier() {
2720        let tokenizer = Tokenizer::default();
2721        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();
2722
2723        assert_eq!(tokens.len(), 6);
2724        assert_eq!(tokens[0].token_type, TokenType::Select);
2725        assert_eq!(tokens[1].token_type, TokenType::Var);
2726        assert_eq!(tokens[1].text, "a");
2727        assert_eq!(tokens[2].token_type, TokenType::Comma);
2728        assert_eq!(tokens[3].token_type, TokenType::Var);
2729        assert_eq!(tokens[3].text, "b");
2730        assert_eq!(tokens[4].token_type, TokenType::From);
2731        assert_eq!(tokens[5].token_type, TokenType::Var);
2732        assert_eq!(tokens[5].text, "t");
2733    }
2734
2735    #[test]
2736    fn test_string_literal() {
2737        let tokenizer = Tokenizer::default();
2738        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();
2739
2740        assert_eq!(tokens.len(), 2);
2741        assert_eq!(tokens[1].token_type, TokenType::String);
2742        assert_eq!(tokens[1].text, "hello");
2743    }
2744
2745    #[test]
2746    fn test_escaped_string() {
2747        let tokenizer = Tokenizer::default();
2748        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();
2749
2750        assert_eq!(tokens.len(), 2);
2751        assert_eq!(tokens[1].token_type, TokenType::String);
2752        assert_eq!(tokens[1].text, "it's");
2753    }
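
    // Illustrative sketch: E'...' escape strings keep their backslash escapes
    // verbatim and record the prefix case via an "E:" / "e:" marker in the text.
    #[test]
    fn test_escape_string_prefix() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize(r"E'a\'b'").unwrap();

        assert_eq!(tokens.len(), 1, "Expected 1 token for E'a\\'b', got {:?}", tokens);
        assert_eq!(tokens[0].token_type, TokenType::EscapeString);
        assert_eq!(tokens[0].text, r"E:a\'b");
    }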
2754
2755    #[test]
2756    fn test_comments() {
2757        let tokenizer = Tokenizer::default();
2758        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();
2759
2760        assert_eq!(tokens.len(), 2);
2761        // Comments are attached to the PREVIOUS token as trailing_comments
2762        // This is better for round-trip fidelity (e.g., SELECT c /* comment */ FROM)
2763        assert_eq!(tokens[0].trailing_comments.len(), 1);
2764        assert_eq!(tokens[0].trailing_comments[0], "comment");
2765    }
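
    // Illustrative: a /*+ ... */ block becomes a Hint token rather than being
    // attached as a comment; the hint body is stored trimmed.
    #[test]
    fn test_hint_token() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT /*+ BROADCAST */ 1").unwrap();

        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[1].token_type, TokenType::Hint);
        assert_eq!(tokens[1].text, "BROADCAST");
    }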
2766
2767    #[test]
2768    fn test_operators() {
2769        let tokenizer = Tokenizer::default();
2770        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();
2771
2772        assert_eq!(tokens.len(), 5);
2773        assert_eq!(tokens[0].token_type, TokenType::Number);
2774        assert_eq!(tokens[1].token_type, TokenType::Plus);
2775        assert_eq!(tokens[2].token_type, TokenType::Number);
2776        assert_eq!(tokens[3].token_type, TokenType::Star);
2777        assert_eq!(tokens[4].token_type, TokenType::Number);
2778    }
2779
2780    #[test]
2781    fn test_comparison_operators() {
2782        let tokenizer = Tokenizer::default();
2783        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();
2784
2785        assert_eq!(tokens[1].token_type, TokenType::Lte);
2786        assert_eq!(tokens[3].token_type, TokenType::Gte);
2787        assert_eq!(tokens[5].token_type, TokenType::Neq);
2788    }
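
    // Illustrative: longer operators are matched before their shorter prefixes,
    // so ->> and #>> never tokenize as -> or #> followed by a stray >.
    #[test]
    fn test_json_operator_lengths() {
        let tokenizer = Tokenizer::default();

        assert_eq!(tokenizer.tokenize("a ->> b").unwrap()[1].token_type, TokenType::DArrow);
        assert_eq!(tokenizer.tokenize("a -> b").unwrap()[1].token_type, TokenType::Arrow);
        assert_eq!(tokenizer.tokenize("a #>> b").unwrap()[1].token_type, TokenType::DHashArrow);
        assert_eq!(tokenizer.tokenize("a #> b").unwrap()[1].token_type, TokenType::HashArrow);
    }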
2789
2790    #[test]
2791    fn test_national_string() {
2792        let tokenizer = Tokenizer::default();
2793        let tokens = tokenizer.tokenize("N'abc'").unwrap();
2794
2795        assert_eq!(tokens.len(), 1, "Expected 1 token for N'abc', got {:?}", tokens);
2796        assert_eq!(tokens[0].token_type, TokenType::NationalString);
2797        assert_eq!(tokens[0].text, "abc");
2798    }
2799
2800    #[test]
2801    fn test_hex_string() {
2802        let tokenizer = Tokenizer::default();
2803        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();
2804
2805        assert_eq!(tokens.len(), 1, "Expected 1 token for X'ABCD', got {:?}", tokens);
2806        assert_eq!(tokens[0].token_type, TokenType::HexString);
2807        assert_eq!(tokens[0].text, "ABCD");
2808    }
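
    // Illustrative: raw strings keep backslashes literally; in dialects whose
    // regular strings process escapes, '\n' here would instead become a newline.
    #[test]
    fn test_raw_string_backslash() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize(r"r'a\nb'").unwrap();

        assert_eq!(tokens.len(), 1, "Expected 1 token for r'a\\nb', got {:?}", tokens);
        assert_eq!(tokens[0].token_type, TokenType::RawString);
        assert_eq!(tokens[0].text, r"a\nb");
    }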
2809
2810    #[test]
2811    fn test_bit_string() {
2812        let tokenizer = Tokenizer::default();
2813        let tokens = tokenizer.tokenize("B'01010'").unwrap();
2814
2815        assert_eq!(tokens.len(), 1, "Expected 1 token for B'01010', got {:?}", tokens);
2816        assert_eq!(tokens[0].token_type, TokenType::BitString);
2817        assert_eq!(tokens[0].text, "01010");
2818    }
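
    // Illustrative: `$` followed by digits is a positional parameter, but the same
    // prefix opens a tagged dollar string when a matching `$1$...$1$` pair exists;
    // the tagged form wins because it is tried first in scan_token.
    #[test]
    fn test_dollar_digit_disambiguation() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("$1").unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Parameter);
        assert_eq!(tokens[0].text, "1");

        let tokens = tokenizer.tokenize("$1$x$1$").unwrap();
        assert_eq!(tokens[0].token_type, TokenType::DollarString);
        assert_eq!(tokens[0].text, "1\x00x");
    }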
2819
2820    #[test]
2821    fn test_trailing_dot_number() {
2822        let tokenizer = Tokenizer::default();
2823
2824        // Test trailing dot
2825        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
2826        assert_eq!(tokens.len(), 2, "Expected 2 tokens for 'SELECT 1.', got {:?}", tokens);
2827        assert_eq!(tokens[1].token_type, TokenType::Number);
2828        assert_eq!(tokens[1].text, "1.");
2829
2830        // Test normal decimal
2831        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
2832        assert_eq!(tokens[1].text, "1.5");
2833
2834        // Test number followed by dot and identifier
2835        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
2836        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
2837        assert_eq!(tokens.len(), 3, "Expected 3 tokens for 'SELECT 1.a', got {:?}", tokens);
2838        assert_eq!(tokens[1].token_type, TokenType::Number);
2839        assert_eq!(tokens[1].text, "1.");
2840        assert_eq!(tokens[2].token_type, TokenType::Var);
2841
2842        // Test two dots (range operator) - dot is NOT consumed when followed by another dot
2843        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
2844        assert_eq!(tokens[1].token_type, TokenType::Number);
2845        assert_eq!(tokens[1].text, "1");
2846        assert_eq!(tokens[2].token_type, TokenType::Dot);
2847        assert_eq!(tokens[3].token_type, TokenType::Dot);
2848        assert_eq!(tokens[4].token_type, TokenType::Number);
2849        assert_eq!(tokens[4].text, "2");
2850    }
2851
2852    #[test]
2853    fn test_leading_dot_number() {
2854        let tokenizer = Tokenizer::default();
2855
2856        // Test leading dot number (e.g., .25 for 0.25)
2857        let tokens = tokenizer.tokenize(".25").unwrap();
2858        assert_eq!(tokens.len(), 1, "Expected 1 token for '.25', got {:?}", tokens);
2859        assert_eq!(tokens[0].token_type, TokenType::Number);
2860        assert_eq!(tokens[0].text, ".25");
2861
2862        // Test leading dot in context (Oracle SAMPLE clause)
2863        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
2864        assert_eq!(tokens.len(), 4, "Expected 4 tokens for 'SAMPLE (.25)', got {:?}", tokens);
2865        assert_eq!(tokens[0].token_type, TokenType::Sample);
2866        assert_eq!(tokens[1].token_type, TokenType::LParen);
2867        assert_eq!(tokens[2].token_type, TokenType::Number);
2868        assert_eq!(tokens[2].text, ".25");
2869        assert_eq!(tokens[3].token_type, TokenType::RParen);
2870
2871        // Test leading dot with exponent
2872        let tokens = tokenizer.tokenize(".5e10").unwrap();
2873        assert_eq!(tokens.len(), 1, "Expected 1 token for '.5e10', got {:?}", tokens);
2874        assert_eq!(tokens[0].token_type, TokenType::Number);
2875        assert_eq!(tokens[0].text, ".5e10");
2876
2877        // Test that plain dot is still a Dot token
2878        let tokens = tokenizer.tokenize("a.b").unwrap();
2879        assert_eq!(tokens.len(), 3, "Expected 3 tokens for 'a.b', got {:?}", tokens);
2880        assert_eq!(tokens[1].token_type, TokenType::Dot);
2881    }
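
    // Hedged sketch: exponent forms without a leading dot. Since ".5e10" above
    // lexes as a single Number token, "1e10" presumably does as well; the signed
    // exponent in "1.5e-3" is a further assumption, so this is #[ignore]d rather
    // than asserted as documented behavior.
    #[test]
    #[ignore]
    fn test_exponent_number_sketch() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("1e10").unwrap();
        assert_eq!(tokens.len(), 1, "Expected 1 token for '1e10', got {:?}", tokens);
        assert_eq!(tokens[0].token_type, TokenType::Number);

        // Assumed: the sign binds to the exponent, yielding one Number token
        let tokens = tokenizer.tokenize("1.5e-3").unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Number);
    }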
2882
2883    #[test]
2884    fn test_unrecognized_character() {
2885        let tokenizer = Tokenizer::default();
2886
2887        // Test that unrecognized characters don't cause infinite loops
2888        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
2889        // Should return an error for the smart quote, not hang
2890        assert!(result.is_err(), "Should error on unrecognized character, got: {:?}", result);
2891
2892        // Unicode bullet character
2893        let result = tokenizer.tokenize("SELECT • FROM t");
2894        assert!(result.is_err());
2895    }
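
    // Hedged companion check: the same characters inside a string literal should
    // presumably tokenize fine, since the error path above is about bare
    // unrecognized characters outside any quoted context. That expectation is an
    // assumption, hence #[ignore].
    #[test]
    #[ignore]
    fn test_non_ascii_inside_string_literal() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT '•'").unwrap();
        assert_eq!(tokens.len(), 2, "Expected 2 tokens for SELECT '•', got {:?}", tokens);
        assert_eq!(tokens[1].token_type, TokenType::String);
    }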
2896
2897    #[test]
2898    fn test_colon_eq_tokenization() {
2899        let tokenizer = Tokenizer::default();
2900
2901        // := should be a single ColonEq token
2902        let tokens = tokenizer.tokenize("a := 1").unwrap();
2903        assert_eq!(tokens.len(), 3);
2904        assert_eq!(tokens[0].token_type, TokenType::Var);
2905        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
2906        assert_eq!(tokens[2].token_type, TokenType::Number);
2907
2908        // : followed by non-= should still be Colon
2909        let tokens = tokenizer.tokenize("a:b").unwrap();
2910        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
2911        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
2912
2913        // :: should still be DColon
2914        let tokens = tokenizer.tokenize("a::INT").unwrap();
2915        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
2916    }
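
    // Hedged sketch: whitespace should prevent ':' and '=' from fusing into a
    // single ColonEq token. That matches how multi-character operators are
    // usually lexed, but it is an assumption about this tokenizer, so #[ignore].
    #[test]
    #[ignore]
    fn test_colon_space_eq_not_fused() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("a : = 1").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
    }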
2917
2918    #[test]
2919    fn test_colon_eq_parsing() {
2920        use crate::parser::Parser;
2921        use crate::generator::Generator;
2922
2923        // MySQL @var := value in SELECT
2924        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2").expect("Failed to parse MySQL @var := expr");
2925        let output = Generator::sql(&ast[0]).expect("Failed to generate");
2926        assert_eq!(output, "SELECT @var1 := 1, @var2");
2927
2928        // MySQL @var := @var in SELECT
2929        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1").expect("Failed to parse MySQL @var2 := @var1");
2930        let output = Generator::sql(&ast[0]).expect("Failed to generate");
2931        assert_eq!(output, "SELECT @var1, @var2 := @var1");
2932
2933        // MySQL @var := COUNT(*)
2934        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1").expect("Failed to parse MySQL @var := COUNT(*)");
2935        let output = Generator::sql(&ast[0]).expect("Failed to generate");
2936        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");
2937
2938        // MySQL SET @var := 1 (should normalize to = in output)
2939        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
2940        let output = Generator::sql(&ast[0]).expect("Failed to generate");
2941        assert_eq!(output, "SET @var1 = 1");
2942
2943        // Function named args with :=
2944        let ast = Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
2945        let output = Generator::sql(&ast[0]).expect("Failed to generate");
2946        assert_eq!(output, "UNION_VALUE(k1 := 1)");
2947
2948        // UNNEST with recursive := TRUE
2949        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t").expect("Failed to parse UNNEST with :=");
2950        let output = Generator::sql(&ast[0]).expect("Failed to generate");
2951        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");
2952
2953        // DuckDB prefix alias: foo: 1 means 1 AS foo
2954        let ast = Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
2955        let output = Generator::sql(&ast[0]).expect("Failed to generate");
2956        assert_eq!(output, "SELECT 1 AS foo");
2957
2958        // DuckDB prefix alias with multiple columns
2959        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3").expect("Failed to parse DuckDB multiple prefix aliases");
2960        let output = Generator::sql(&ast[0]).expect("Failed to generate");
2961        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
2962    }
2963
2964    #[test]
2965    fn test_colon_eq_dialect_roundtrip() {
2966        use crate::dialects::{Dialect, DialectType};
2967
2968        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
2969            let d = Dialect::get(dialect);
2970            let ast = d.parse(sql).unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
2971            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
2972            let transformed = d.transform(ast[0].clone()).unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
2973            let output = d.generate(&transformed).unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
2974            let expected = expected.unwrap_or(sql);
2975            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
2976        }
2977
2978        // MySQL := tests
2979        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
2980        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
2981        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
2982        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));
2983
2984        // DuckDB := tests
2985        check(DialectType::DuckDB, "SELECT UNNEST(col, recursive := TRUE) FROM t", None);
2986        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);
2987
2988        // STRUCT_PACK(a := 'b')::json should at least parse without error
2989        // (The STRUCT_PACK -> Struct transformation is a separate feature)
2990        {
2991            let d = Dialect::get(DialectType::DuckDB);
2992            let ast = d.parse("STRUCT_PACK(a := 'b')::json").expect("Failed to parse STRUCT_PACK(a := 'b')::json");
2993            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
2994        }
2995
2996        // DuckDB prefix alias tests
2997        check(DialectType::DuckDB, "SELECT foo: 1", Some("SELECT 1 AS foo"));
2998        check(DialectType::DuckDB, "SELECT foo: 1, bar: 2, baz: 3", Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"));
2999    }
3000
3001    #[test]
3002    fn test_comment_roundtrip() {
3003        use crate::parser::Parser;
3004        use crate::generator::Generator;
3005
3006        fn check_roundtrip(sql: &str) -> Option<String> {
3007            let ast = match Parser::parse_sql(sql) {
3008                Ok(a) => a,
3009                Err(e) => return Some(format!("Parse error: {:?}", e)),
3010            };
3011            if ast.is_empty() {
3012                return Some("Empty AST".to_string());
3013            }
3014            let mut generator = Generator::default();
3015            let output = match generator.generate(&ast[0]) {
3016                Ok(o) => o,
3017                Err(e) => return Some(format!("Gen error: {:?}", e)),
3018            };
3019            if output == sql {
3020                None
3021            } else {
3022                Some(format!("Mismatch:\n  input:  {}\n  output: {}", sql, output))
3023            }
3024        }
3025
3026        let tests = vec![
3027            // Nested comments
3028            "SELECT c /* c1 /* c2 */ c3 */",
3029            "SELECT c /* c1 /* c2 /* c3 */ */ */",
3030            // Simple alias with comments
3031            "SELECT c /* c1 */ AS alias /* c2 */",
3032            // Multiple columns with comments
3033            "SELECT a /* x */, b /* x */",
3034            // Multiple comments after column
3035            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
3036            // FROM tables with comments
3037            "SELECT * FROM foo /* x */, bla /* x */",
3038            // Arithmetic with comments
3039            "SELECT 1 /* comment */ + 1",
3040            "SELECT 1 /* c1 */ + 2 /* c2 */",
3041            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
3042            // CAST with comments
3043            "SELECT CAST(x AS INT) /* comment */ FROM foo",
3044            // Function arguments with comments
3045            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
3046            // Multi-part table names with comments
3047            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
3048            // INSERT with comments
3049            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
3050            // Leading comments on statements
3051            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
3052            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
3053            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
3054            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
3055            "/* comment */ CREATE TABLE foo AS SELECT 1",
3056            // Trailing comments on statements
3057            "INSERT INTO foo SELECT * FROM bar /* comment */",
3058            // Complex nested expressions with comments
3059            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
3060        ];
3061
3062        let mut failures = Vec::new();
3063        for sql in tests {
3064            if let Some(e) = check_roundtrip(sql) {
3065                failures.push(e);
3066            }
3067        }
3068
3069        if !failures.is_empty() {
3070            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
3071        }
3072    }
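
    // Hedged sketch: single-line comments. The roundtrip suite above only
    // exercises /* ... */ blocks; whether `-- ...` comments are preserved, and in
    // what normalized form they are re-emitted, is an assumption here, so the
    // test is #[ignore]d.
    #[test]
    #[ignore]
    fn test_line_comment_roundtrip_sketch() {
        use crate::parser::Parser;
        use crate::generator::Generator;

        let ast = Parser::parse_sql("SELECT a -- trailing note").expect("Failed to parse line comment");
        let mut generator = Generator::default();
        let output = generator.generate(&ast[0]).expect("Failed to generate");
        // Assumed normalization: line comments are re-emitted as block comments
        assert_eq!(output, "SELECT a /* trailing note */");
    }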
3073
3074    #[test]
3075    fn test_dollar_quoted_string_parsing() {
3076        use crate::dialects::{Dialect, DialectType};
3077
3078        // Exercise the parse_dollar_string_token helper on tagged and untagged text
3079        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
3080        assert_eq!(tag, Some("FOO".to_string()));
3081        assert_eq!(content, "content here");
3082
3083        let (tag, content) = super::parse_dollar_string_token("just content");
3084        assert_eq!(tag, None);
3085        assert_eq!(content, "just content");
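
        // Edge cases that follow directly from the '\x00' delimiter convention:
        // a leading separator yields an empty (but present) tag, and empty token
        // text yields no tag and empty content
        let (tag, content) = super::parse_dollar_string_token("\x00abc");
        assert_eq!(tag, Some(String::new()));
        assert_eq!(content, "abc");

        let (tag, content) = super::parse_dollar_string_token("");
        assert_eq!(tag, None);
        assert_eq!(content, "");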
3086
3087        // Test roundtrip for Databricks dialect with dollar-quoted function body
3088        fn check_databricks(sql: &str, expected: Option<&str>) {
3089            let d = Dialect::get(DialectType::Databricks);
3090            let ast = d.parse(sql).unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3091            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3092            let transformed = d.transform(ast[0].clone()).unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3093            let output = d.generate(&transformed).unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3094            let expected = expected.unwrap_or(sql);
3095            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3096        }
3097
3098        // Test [42]: $$...$$ heredoc
3099        check_databricks(
3100            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n  return x+1$$",
3101            None
3102        );
3103
3104        // Test [43]: $FOO$...$FOO$ tagged heredoc
3105        check_databricks(
3106            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n  return x+1$FOO$",
3107            None
3108        );
3109    }
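
    // Small builder-API sketch, assuming the module's usual `use super::*;`
    // brings Token into scope: exercises the convenience constructors,
    // with_comment, and the Display impl, which formats the Debug name of the
    // variant plus the raw text, so Token::number(42) should print as "Number(42)".
    #[test]
    fn test_token_builder_helpers() {
        let tok = Token::number(42).with_comment("answer");
        assert_eq!(tok.token_type, TokenType::Number);
        assert_eq!(tok.text, "42");
        assert_eq!(tok.comments, vec!["answer".to_string()]);
        assert_eq!(format!("{}", tok), "Number(42)");
    }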
3110}