// polyglot_sql/tokens.rs
//! Token types and tokenization for SQL parsing
//!
//! This module defines all SQL token types and the tokenizer that converts
//! SQL strings into token streams.

6use crate::error::{Error, Result};
7use serde::{Deserialize, Serialize};
8use std::fmt;
9#[cfg(feature = "bindings")]
10use ts_rs::TS;
11
/// Parse a DollarString token text into (tag, content).
///
/// The tokenizer stores dollar-quoted strings as `tag\x00content`: if the
/// text contains a `'\x00'` separator, the part before the first separator is
/// the tag and the part after it is the content. Otherwise, the whole text is
/// the content with no tag.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    // `split_once` splits at the first NUL, matching the old `find` + slice
    // logic while avoiding manual byte-offset arithmetic.
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_string()), content.to_string()),
        None => (None, text.to_string()),
    }
}
24
/// Represents a position in the source SQL
///
/// `start`/`end` are byte offsets into the original input (`end` exclusive);
/// `line`/`column` are 1-based human-readable coordinates.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct Span {
    /// Starting byte offset
    pub start: usize,
    /// Ending byte offset (exclusive)
    pub end: usize,
    /// Line number (1-based)
    pub line: usize,
    /// Column number (1-based)
    pub column: usize,
}
38
39impl Span {
40    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
41        Self {
42            start,
43            end,
44            line,
45            column,
46        }
47    }
48}
49
/// A token in the SQL token stream
///
/// Comments are attached to the surrounding tokens (rather than emitted as
/// standalone stream items) so the original SQL can be reproduced.
// NOTE(review): unlike `Span`, this type has no
// `#[cfg_attr(feature = "bindings", derive(TS))]` attribute — confirm whether
// `Token` is intentionally excluded from the TS bindings.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The type of token
    pub token_type: TokenType,
    /// The raw text of the token
    pub text: String,
    /// Position information
    pub span: Span,
    /// Leading comments (comments that appeared before this token)
    #[serde(default)]
    pub comments: Vec<String>,
    /// Trailing comments (comments that appeared after this token, before the next one)
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
66
67impl Token {
68    /// Create a new token
69    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
70        Self {
71            token_type,
72            text: text.into(),
73            span,
74            comments: Vec::new(),
75            trailing_comments: Vec::new(),
76        }
77    }
78
79    /// Create a NUMBER token
80    pub fn number(n: i64) -> Self {
81        Self::new(TokenType::Number, n.to_string(), Span::default())
82    }
83
84    /// Create a STRING token
85    pub fn string(s: impl Into<String>) -> Self {
86        Self::new(TokenType::String, s, Span::default())
87    }
88
89    /// Create an IDENTIFIER token
90    pub fn identifier(s: impl Into<String>) -> Self {
91        Self::new(TokenType::Identifier, s, Span::default())
92    }
93
94    /// Create a VAR token
95    pub fn var(s: impl Into<String>) -> Self {
96        Self::new(TokenType::Var, s, Span::default())
97    }
98
99    /// Add a comment to this token
100    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
101        self.comments.push(comment.into());
102        self
103    }
104}
105
106impl fmt::Display for Token {
107    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
108        write!(f, "{:?}({})", self.token_type, self.text)
109    }
110}
111
/// All possible token types in SQL
///
/// Serialized names use SCREAMING_SNAKE_CASE (e.g. `DColon` -> `"D_COLON"`).
// NOTE(review): `#[repr(u16)]` with implicit discriminants means variant ORDER
// determines the numeric value. Serde serialization is name-based and thus
// order-independent, but anything relying on the `u16` representation would
// break if variants are reordered — append new variants rather than reorder.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt, // <<
    GtGt, // >>
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments (emitted as tokens for round-trip fidelity)
    BlockComment, // /* ... */
    LineComment,  // -- ...

    // Literals
    String,
    DollarString,             // $$...$$
    TripleDoubleQuotedString, // """..."""
    TripleSingleQuotedString, // '''...'''
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    /// Hex number: 0xA, 0xFF (BigQuery, SQLite style) - represents an integer in hex notation
    HexNumber,
    ByteString,
    NationalString,
    EscapeString, // PostgreSQL E'...' escape string
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data Types
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,   // !~~ operator (PostgreSQL)
    NotILike,  // !~~* operator (PostgreSQL)
    NotRLike,  // !~ operator (PostgreSQL)
    NotIRLike, // !~* operator (PostgreSQL)
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window function keywords
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // DDL-specific keywords (Phase 4)
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    // MATCH_RECOGNIZE tokens
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // Special
    Eof,
}
668
impl TokenType {
    /// Check if this token type is a keyword that can be used as an identifier in certain contexts
    // NOTE(review): this list is maintained by hand (see the "previously
    // missing from is_keyword()" comment below) — when the tokenizer gains a
    // new keyword, it likely needs an entry here too.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            TokenType::Select
                | TokenType::From
                | TokenType::Where
                | TokenType::And
                | TokenType::Or
                | TokenType::Not
                | TokenType::In
                | TokenType::Is
                | TokenType::Null
                | TokenType::True
                | TokenType::False
                | TokenType::As
                | TokenType::On
                | TokenType::Join
                | TokenType::Left
                | TokenType::Right
                | TokenType::Inner
                | TokenType::Outer
                | TokenType::Full
                | TokenType::Cross
                | TokenType::Semi
                | TokenType::Anti
                | TokenType::Union
                | TokenType::Except
                | TokenType::Intersect
                | TokenType::GroupBy
                | TokenType::OrderBy
                | TokenType::Having
                | TokenType::Limit
                | TokenType::Offset
                | TokenType::Case
                | TokenType::When
                | TokenType::Then
                | TokenType::Else
                | TokenType::End
                | TokenType::Create
                | TokenType::Drop
                | TokenType::Alter
                | TokenType::Insert
                | TokenType::Update
                | TokenType::Delete
                | TokenType::Into
                | TokenType::Values
                | TokenType::Set
                | TokenType::With
                | TokenType::Distinct
                | TokenType::All
                | TokenType::Exists
                | TokenType::Between
                | TokenType::Like
                | TokenType::ILike
                // Additional keywords that can be used as identifiers
                | TokenType::Filter
                | TokenType::Date
                | TokenType::Timestamp
                | TokenType::TimestampTz
                | TokenType::Interval
                | TokenType::Time
                | TokenType::Table
                | TokenType::Index
                | TokenType::Column
                | TokenType::Database
                | TokenType::Schema
                | TokenType::View
                | TokenType::Function
                | TokenType::Procedure
                | TokenType::Trigger
                | TokenType::Sequence
                | TokenType::Over
                | TokenType::Partition
                | TokenType::Window
                | TokenType::Rows
                | TokenType::Range
                | TokenType::First
                | TokenType::Last
                | TokenType::Preceding
                | TokenType::Following
                | TokenType::Current
                | TokenType::Row
                | TokenType::Unbounded
                | TokenType::Array
                | TokenType::Struct
                | TokenType::Map
                | TokenType::PrimaryKey
                | TokenType::Key
                | TokenType::ForeignKey
                | TokenType::References
                | TokenType::Unique
                | TokenType::Check
                | TokenType::Default
                | TokenType::Constraint
                | TokenType::Comment
                | TokenType::Rollup
                | TokenType::Cube
                | TokenType::Grant
                | TokenType::Revoke
                | TokenType::Type
                | TokenType::Use
                | TokenType::Cache
                | TokenType::Uncache
                | TokenType::Load
                | TokenType::Any
                | TokenType::Some
                | TokenType::Asc
                | TokenType::Desc
                | TokenType::Nulls
                | TokenType::Lateral
                | TokenType::Natural
                | TokenType::Escape
                | TokenType::Glob
                | TokenType::Match
                | TokenType::Recursive
                | TokenType::Replace
                | TokenType::Returns
                | TokenType::If
                | TokenType::Pivot
                | TokenType::Unpivot
                | TokenType::Json
                | TokenType::Blob
                | TokenType::Text
                | TokenType::Int
                | TokenType::BigInt
                | TokenType::SmallInt
                | TokenType::TinyInt
                | TokenType::Int128
                | TokenType::UInt128
                | TokenType::Int256
                | TokenType::UInt256
                | TokenType::UInt
                | TokenType::UBigInt
                | TokenType::Float
                | TokenType::Double
                | TokenType::Decimal
                | TokenType::Boolean
                | TokenType::VarChar
                | TokenType::Char
                | TokenType::Binary
                | TokenType::VarBinary
                | TokenType::No
                | TokenType::DateTime
                | TokenType::Truncate
                | TokenType::Execute
                | TokenType::Merge
                | TokenType::Top
                | TokenType::Begin
                | TokenType::Generated
                | TokenType::Identity
                | TokenType::Always
                | TokenType::Extract
                // Keywords that can be identifiers in certain contexts
                | TokenType::AsOf
                | TokenType::Prior
                | TokenType::After
                | TokenType::Restrict
                | TokenType::Cascade
                | TokenType::Local
                | TokenType::Rename
                | TokenType::Enum
                | TokenType::Within
                | TokenType::Format
                | TokenType::Final
                | TokenType::FileFormat
                | TokenType::Input
                | TokenType::InputFormat
                | TokenType::Copy
                | TokenType::Put
                | TokenType::Get
                | TokenType::Show
                | TokenType::Serde
                | TokenType::Sample
                | TokenType::Sort
                | TokenType::Collate
                | TokenType::Ties
                | TokenType::IsNull
                | TokenType::NotNull
                | TokenType::Exclude
                | TokenType::Temporary
                | TokenType::Add
                | TokenType::Ordinality
                | TokenType::Overlaps
                | TokenType::Block
                | TokenType::Pattern
                | TokenType::Group
                | TokenType::Cluster
                | TokenType::Repeatable
                | TokenType::Groups
                | TokenType::Commit
                | TokenType::Warehouse
                | TokenType::System
                | TokenType::By
                | TokenType::To
                | TokenType::Fetch
                | TokenType::For
                | TokenType::Only
                | TokenType::Next
                | TokenType::Lock
                | TokenType::Refresh
                | TokenType::Settings
                | TokenType::Operator
                | TokenType::Overwrite
                | TokenType::StraightJoin
                | TokenType::Start
                // Additional keywords registered in tokenizer but previously missing from is_keyword()
                | TokenType::Ignore
                | TokenType::Domain
                | TokenType::Apply
                | TokenType::Respect
                | TokenType::Materialized
                | TokenType::Prewhere
                | TokenType::Old
                | TokenType::New
                | TokenType::Cast
                | TokenType::TryCast
                | TokenType::SafeCast
                | TokenType::Transaction
                | TokenType::Describe
                | TokenType::Kill
                | TokenType::Lambda
                | TokenType::Declare
                | TokenType::Keep
                | TokenType::Output
                | TokenType::Percent
                | TokenType::Qualify
                | TokenType::Returning
                | TokenType::Language
                | TokenType::Preserve
                | TokenType::Savepoint
                | TokenType::Rollback
                | TokenType::Body
                | TokenType::Increment
                | TokenType::Minvalue
                | TokenType::Maxvalue
                | TokenType::Cycle
                | TokenType::NoCycle
                | TokenType::Seed
                | TokenType::Namespace
                | TokenType::Authorization
                | TokenType::Order
                | TokenType::Restart
                | TokenType::Before
                | TokenType::Instead
                | TokenType::Each
                | TokenType::Statement
                | TokenType::Referencing
                | TokenType::Of
                | TokenType::Separator
                | TokenType::Others
                | TokenType::Placing
                | TokenType::Owned
                | TokenType::Running
                | TokenType::Define
                | TokenType::Measures
                | TokenType::MatchRecognize
                | TokenType::AutoIncrement
                | TokenType::Connect
                | TokenType::Distribute
                | TokenType::Bernoulli
                | TokenType::TableSample
                | TokenType::Inpath
                | TokenType::Pragma
                | TokenType::Siblings
                | TokenType::SerdeProperties
                | TokenType::RLike
        )
    }

    /// Check if this token type is a comparison operator
    /// (`Eq`, `Neq`, `Lt`, `Lte`, `Gt`, `Gte`, `NullsafeEq`).
    pub fn is_comparison(&self) -> bool {
        matches!(
            self,
            TokenType::Eq
                | TokenType::Neq
                | TokenType::Lt
                | TokenType::Lte
                | TokenType::Gt
                | TokenType::Gte
                | TokenType::NullsafeEq
        )
    }

    /// Check if this token type is an arithmetic operator
    /// (`Plus`, `Dash`, `Star`, `Slash`, `Percent`, `Mod`, `Div`).
    pub fn is_arithmetic(&self) -> bool {
        matches!(
            self,
            TokenType::Plus
                | TokenType::Dash
                | TokenType::Star
                | TokenType::Slash
                | TokenType::Percent
                | TokenType::Mod
                | TokenType::Div
        )
    }
}
968
969impl fmt::Display for TokenType {
970    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
971        write!(f, "{:?}", self)
972    }
973}
974
/// Tokenizer configuration for a dialect
///
/// Each dialect builds one of these to describe its keywords, quoting rules,
/// comment syntax, and literal-parsing quirks.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keywords mapping (uppercase keyword -> token type)
    pub keywords: std::collections::HashMap<String, TokenType>,
    /// Single character tokens
    pub single_tokens: std::collections::HashMap<char, TokenType>,
    /// Quote characters (start -> end); `String` keys allow multi-character
    /// quote markers, unlike `identifiers` below which is single-char only.
    pub quotes: std::collections::HashMap<String, String>,
    /// Identifier quote characters (start -> end)
    pub identifiers: std::collections::HashMap<char, char>,
    /// Comment definitions (start -> optional end); `None` marks a
    /// line comment that runs to end of line.
    pub comments: std::collections::HashMap<String, Option<String>>,
    /// String escape characters
    pub string_escapes: Vec<char>,
    /// Whether to support nested comments
    pub nested_comments: bool,
    /// Valid escape follow characters (for MySQL-style escaping).
    /// When a backslash is followed by a character NOT in this list,
    /// the backslash is discarded. When empty, all backslash escapes
    /// preserve the backslash for unrecognized sequences.
    pub escape_follow_chars: Vec<char>,
    /// Whether b'...' is a byte string (true for BigQuery) or bit string (false for standard SQL).
    /// Default is false (bit string).
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}
    /// Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT)
    pub numeric_literals: std::collections::HashMap<String, String>,
    /// Whether unquoted identifiers can start with a digit (e.g., `1a`, `1_a`).
    /// When true, a number followed by letters/underscore is treated as an identifier.
    /// Used by Hive, Spark, MySQL, ClickHouse.
    pub identifiers_can_start_with_digit: bool,
    /// Whether 0x/0X prefix should be treated as hex literals.
    /// When true, `0XCC` is tokenized instead of Number("0") + Identifier("XCC").
    /// Used by BigQuery, SQLite, Teradata.
    pub hex_number_strings: bool,
    /// Whether hex string literals from 0x prefix represent integer values.
    /// When true (BigQuery), 0xA is tokenized as HexNumber (integer in hex notation).
    /// When false (SQLite, Teradata), 0xCC is tokenized as HexString (binary/blob value).
    pub hex_string_is_integer_type: bool,
    /// Whether string escape sequences (like \') are allowed in raw strings.
    /// When true (BigQuery default), \' inside r'...' escapes the quote.
    /// When false (Spark/Databricks), backslashes in raw strings are always literal.
    /// Python sqlglot: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True)
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether # starts a single-line comment (ClickHouse, MySQL)
    pub hash_comments: bool,
    /// Whether $ can start/continue an identifier (ClickHouse).
    /// When true, a bare `$` that is not part of a dollar-quoted string or positional
    /// parameter is treated as an identifier character.
    pub dollar_sign_is_identifier: bool,
    /// Whether INSERT ... FORMAT <name> should treat subsequent data as raw (ClickHouse).
    /// When true, after tokenizing `INSERT ... FORMAT <non-VALUES-name>`, all text until
    /// the next blank line or end of input is consumed as a raw data token.
    pub insert_format_raw_data: bool,
}
1031
1032impl Default for TokenizerConfig {
1033    fn default() -> Self {
1034        let mut keywords = std::collections::HashMap::new();
1035        // Add basic SQL keywords
1036        keywords.insert("SELECT".to_string(), TokenType::Select);
1037        keywords.insert("FROM".to_string(), TokenType::From);
1038        keywords.insert("WHERE".to_string(), TokenType::Where);
1039        keywords.insert("AND".to_string(), TokenType::And);
1040        keywords.insert("OR".to_string(), TokenType::Or);
1041        keywords.insert("NOT".to_string(), TokenType::Not);
1042        keywords.insert("AS".to_string(), TokenType::As);
1043        keywords.insert("ON".to_string(), TokenType::On);
1044        keywords.insert("JOIN".to_string(), TokenType::Join);
1045        keywords.insert("LEFT".to_string(), TokenType::Left);
1046        keywords.insert("RIGHT".to_string(), TokenType::Right);
1047        keywords.insert("INNER".to_string(), TokenType::Inner);
1048        keywords.insert("OUTER".to_string(), TokenType::Outer);
1049        keywords.insert("OUTPUT".to_string(), TokenType::Output);
1050        keywords.insert("FULL".to_string(), TokenType::Full);
1051        keywords.insert("CROSS".to_string(), TokenType::Cross);
1052        keywords.insert("SEMI".to_string(), TokenType::Semi);
1053        keywords.insert("ANTI".to_string(), TokenType::Anti);
1054        keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1055        keywords.insert("UNION".to_string(), TokenType::Union);
1056        keywords.insert("EXCEPT".to_string(), TokenType::Except);
1057        keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle/Redshift alias for EXCEPT
1058        keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1059        keywords.insert("GROUP".to_string(), TokenType::Group);
1060        keywords.insert("CUBE".to_string(), TokenType::Cube);
1061        keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1062        keywords.insert("WITHIN".to_string(), TokenType::Within);
1063        keywords.insert("ORDER".to_string(), TokenType::Order);
1064        keywords.insert("BY".to_string(), TokenType::By);
1065        keywords.insert("HAVING".to_string(), TokenType::Having);
1066        keywords.insert("LIMIT".to_string(), TokenType::Limit);
1067        keywords.insert("OFFSET".to_string(), TokenType::Offset);
1068        keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1069        keywords.insert("FETCH".to_string(), TokenType::Fetch);
1070        keywords.insert("FIRST".to_string(), TokenType::First);
1071        keywords.insert("NEXT".to_string(), TokenType::Next);
1072        keywords.insert("ONLY".to_string(), TokenType::Only);
1073        keywords.insert("KEEP".to_string(), TokenType::Keep);
1074        keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1075        keywords.insert("INPUT".to_string(), TokenType::Input);
1076        keywords.insert("CASE".to_string(), TokenType::Case);
1077        keywords.insert("WHEN".to_string(), TokenType::When);
1078        keywords.insert("THEN".to_string(), TokenType::Then);
1079        keywords.insert("ELSE".to_string(), TokenType::Else);
1080        keywords.insert("END".to_string(), TokenType::End);
1081        keywords.insert("ENDIF".to_string(), TokenType::End); // Exasol alias for END
1082        keywords.insert("NULL".to_string(), TokenType::Null);
1083        keywords.insert("TRUE".to_string(), TokenType::True);
1084        keywords.insert("FALSE".to_string(), TokenType::False);
1085        keywords.insert("IS".to_string(), TokenType::Is);
1086        keywords.insert("IN".to_string(), TokenType::In);
1087        keywords.insert("BETWEEN".to_string(), TokenType::Between);
1088        keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1089        keywords.insert("LIKE".to_string(), TokenType::Like);
1090        keywords.insert("ILIKE".to_string(), TokenType::ILike);
1091        keywords.insert("RLIKE".to_string(), TokenType::RLike);
1092        keywords.insert("REGEXP".to_string(), TokenType::RLike);
1093        keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1094        keywords.insert("EXISTS".to_string(), TokenType::Exists);
1095        keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1096        keywords.insert("ALL".to_string(), TokenType::All);
1097        keywords.insert("WITH".to_string(), TokenType::With);
1098        keywords.insert("CREATE".to_string(), TokenType::Create);
1099        keywords.insert("DROP".to_string(), TokenType::Drop);
1100        keywords.insert("ALTER".to_string(), TokenType::Alter);
1101        keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1102        keywords.insert("TABLE".to_string(), TokenType::Table);
1103        keywords.insert("VIEW".to_string(), TokenType::View);
1104        keywords.insert("INDEX".to_string(), TokenType::Index);
1105        keywords.insert("COLUMN".to_string(), TokenType::Column);
1106        keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1107        keywords.insert("ADD".to_string(), TokenType::Add);
1108        keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1109        keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1110        keywords.insert("RENAME".to_string(), TokenType::Rename);
1111        keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1112        keywords.insert("TEMP".to_string(), TokenType::Temporary);
1113        keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1114        keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1115        keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1116        keywords.insert("KEY".to_string(), TokenType::Key);
1117        keywords.insert("KILL".to_string(), TokenType::Kill);
1118        keywords.insert("REFERENCES".to_string(), TokenType::References);
1119        keywords.insert("DEFAULT".to_string(), TokenType::Default);
1120        keywords.insert("DECLARE".to_string(), TokenType::Declare);
1121        keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1122        keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); // Snowflake style
1123        keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1124        keywords.insert("REPLACE".to_string(), TokenType::Replace);
1125        keywords.insert("TO".to_string(), TokenType::To);
1126        keywords.insert("INSERT".to_string(), TokenType::Insert);
1127        keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1128        keywords.insert("UPDATE".to_string(), TokenType::Update);
1129        keywords.insert("USE".to_string(), TokenType::Use);
1130        keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1131        keywords.insert("GLOB".to_string(), TokenType::Glob);
1132        keywords.insert("DELETE".to_string(), TokenType::Delete);
1133        keywords.insert("MERGE".to_string(), TokenType::Merge);
1134        keywords.insert("CACHE".to_string(), TokenType::Cache);
1135        keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1136        keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1137        keywords.insert("GRANT".to_string(), TokenType::Grant);
1138        keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1139        keywords.insert("COMMENT".to_string(), TokenType::Comment);
1140        keywords.insert("COLLATE".to_string(), TokenType::Collate);
1141        keywords.insert("INTO".to_string(), TokenType::Into);
1142        keywords.insert("VALUES".to_string(), TokenType::Values);
1143        keywords.insert("SET".to_string(), TokenType::Set);
1144        keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1145        keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1146        keywords.insert("ASC".to_string(), TokenType::Asc);
1147        keywords.insert("DESC".to_string(), TokenType::Desc);
1148        keywords.insert("NULLS".to_string(), TokenType::Nulls);
1149        keywords.insert("RESPECT".to_string(), TokenType::Respect);
1150        keywords.insert("FIRST".to_string(), TokenType::First);
1151        keywords.insert("LAST".to_string(), TokenType::Last);
1152        keywords.insert("IF".to_string(), TokenType::If);
1153        keywords.insert("CAST".to_string(), TokenType::Cast);
1154        keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1155        keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1156        keywords.insert("OVER".to_string(), TokenType::Over);
1157        keywords.insert("PARTITION".to_string(), TokenType::Partition);
1158        keywords.insert("PLACING".to_string(), TokenType::Placing);
1159        keywords.insert("WINDOW".to_string(), TokenType::Window);
1160        keywords.insert("ROWS".to_string(), TokenType::Rows);
1161        keywords.insert("RANGE".to_string(), TokenType::Range);
1162        keywords.insert("FILTER".to_string(), TokenType::Filter);
1163        keywords.insert("NATURAL".to_string(), TokenType::Natural);
1164        keywords.insert("USING".to_string(), TokenType::Using);
1165        keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1166        keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1167        keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1168        keywords.insert("CURRENT".to_string(), TokenType::Current);
1169        keywords.insert("ROW".to_string(), TokenType::Row);
1170        keywords.insert("GROUPS".to_string(), TokenType::Groups);
1171        keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1172        // TRIM function position keywords
1173        keywords.insert("BOTH".to_string(), TokenType::Both);
1174        keywords.insert("LEADING".to_string(), TokenType::Leading);
1175        keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1176        keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1177        // Phase 3: Additional keywords
1178        keywords.insert("TOP".to_string(), TokenType::Top);
1179        keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1180        keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1181        keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1182        keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1183        keywords.insert("SYSTEM".to_string(), TokenType::System);
1184        keywords.insert("BLOCK".to_string(), TokenType::Block);
1185        keywords.insert("SEED".to_string(), TokenType::Seed);
1186        keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1187        keywords.insert("TIES".to_string(), TokenType::Ties);
1188        keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1189        keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1190        keywords.insert("APPLY".to_string(), TokenType::Apply);
1191        // Oracle CONNECT BY keywords
1192        keywords.insert("CONNECT".to_string(), TokenType::Connect);
1193        // Hive/Spark specific keywords
1194        keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1195        keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1196        keywords.insert("SORT".to_string(), TokenType::Sort);
1197        keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1198        keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1199        keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1200        keywords.insert("FOR".to_string(), TokenType::For);
1201        keywords.insert("ANY".to_string(), TokenType::Any);
1202        keywords.insert("SOME".to_string(), TokenType::Some);
1203        keywords.insert("ASOF".to_string(), TokenType::AsOf);
1204        keywords.insert("PERCENT".to_string(), TokenType::Percent);
1205        keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1206        keywords.insert("NO".to_string(), TokenType::No);
1207        keywords.insert("OTHERS".to_string(), TokenType::Others);
1208        // PostgreSQL OPERATOR() syntax for schema-qualified operators
1209        keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1210        // Phase 4: DDL keywords
1211        keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1212        keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1213        keywords.insert("DATABASE".to_string(), TokenType::Database);
1214        keywords.insert("FUNCTION".to_string(), TokenType::Function);
1215        keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1216        keywords.insert("PROC".to_string(), TokenType::Procedure);
1217        keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1218        keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1219        keywords.insert("TYPE".to_string(), TokenType::Type);
1220        keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1221        keywords.insert("RETURNS".to_string(), TokenType::Returns);
1222        keywords.insert("RETURNING".to_string(), TokenType::Returning);
1223        keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1224        keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1225        keywords.insert("COMMIT".to_string(), TokenType::Commit);
1226        keywords.insert("BEGIN".to_string(), TokenType::Begin);
1227        keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1228        keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1229        keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1230        keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1231        keywords.insert("BODY".to_string(), TokenType::Body);
1232        keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1233        keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1234        keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1235        keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1236        keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1237        keywords.insert("PRIOR".to_string(), TokenType::Prior);
1238        // MATCH_RECOGNIZE keywords
1239        keywords.insert("MATCH".to_string(), TokenType::Match);
1240        keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1241        keywords.insert("MEASURES".to_string(), TokenType::Measures);
1242        keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1243        keywords.insert("DEFINE".to_string(), TokenType::Define);
1244        keywords.insert("RUNNING".to_string(), TokenType::Running);
1245        keywords.insert("FINAL".to_string(), TokenType::Final);
1246        keywords.insert("OWNED".to_string(), TokenType::Owned);
1247        keywords.insert("AFTER".to_string(), TokenType::After);
1248        keywords.insert("BEFORE".to_string(), TokenType::Before);
1249        keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1250        keywords.insert("EACH".to_string(), TokenType::Each);
1251        keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1252        keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1253        keywords.insert("OLD".to_string(), TokenType::Old);
1254        keywords.insert("NEW".to_string(), TokenType::New);
1255        keywords.insert("OF".to_string(), TokenType::Of);
1256        keywords.insert("CHECK".to_string(), TokenType::Check);
1257        keywords.insert("START".to_string(), TokenType::Start);
1258        keywords.insert("ENUM".to_string(), TokenType::Enum);
1259        keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1260        keywords.insert("RESTART".to_string(), TokenType::Restart);
1261        // Date/time literal keywords
1262        keywords.insert("DATE".to_string(), TokenType::Date);
1263        keywords.insert("TIME".to_string(), TokenType::Time);
1264        keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1265        keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1266        keywords.insert("GENERATED".to_string(), TokenType::Generated);
1267        keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1268        keywords.insert("ALWAYS".to_string(), TokenType::Always);
1269        // LOAD DATA keywords
1270        keywords.insert("LOAD".to_string(), TokenType::Load);
1271        keywords.insert("LOCAL".to_string(), TokenType::Local);
1272        keywords.insert("INPATH".to_string(), TokenType::Inpath);
1273        keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1274        keywords.insert("SERDE".to_string(), TokenType::Serde);
1275        keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1276        keywords.insert("FORMAT".to_string(), TokenType::Format);
1277        // SQLite
1278        keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1279        // SHOW statement
1280        keywords.insert("SHOW".to_string(), TokenType::Show);
1281        // Oracle ORDER SIBLINGS BY (hierarchical queries)
1282        keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1283        // COPY and PUT statements (Snowflake, PostgreSQL)
1284        keywords.insert("COPY".to_string(), TokenType::Copy);
1285        keywords.insert("PUT".to_string(), TokenType::Put);
1286        keywords.insert("GET".to_string(), TokenType::Get);
1287        // EXEC/EXECUTE statement (TSQL, etc.)
1288        keywords.insert("EXEC".to_string(), TokenType::Execute);
1289        keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1290        // Postfix null check operators (PostgreSQL/SQLite)
1291        keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1292        keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1293
1294        let mut single_tokens = std::collections::HashMap::new();
1295        single_tokens.insert('(', TokenType::LParen);
1296        single_tokens.insert(')', TokenType::RParen);
1297        single_tokens.insert('[', TokenType::LBracket);
1298        single_tokens.insert(']', TokenType::RBracket);
1299        single_tokens.insert('{', TokenType::LBrace);
1300        single_tokens.insert('}', TokenType::RBrace);
1301        single_tokens.insert(',', TokenType::Comma);
1302        single_tokens.insert('.', TokenType::Dot);
1303        single_tokens.insert(';', TokenType::Semicolon);
1304        single_tokens.insert('+', TokenType::Plus);
1305        single_tokens.insert('-', TokenType::Dash);
1306        single_tokens.insert('*', TokenType::Star);
1307        single_tokens.insert('/', TokenType::Slash);
1308        single_tokens.insert('%', TokenType::Percent);
1309        single_tokens.insert('&', TokenType::Amp);
1310        single_tokens.insert('|', TokenType::Pipe);
1311        single_tokens.insert('^', TokenType::Caret);
1312        single_tokens.insert('~', TokenType::Tilde);
1313        single_tokens.insert('<', TokenType::Lt);
1314        single_tokens.insert('>', TokenType::Gt);
1315        single_tokens.insert('=', TokenType::Eq);
1316        single_tokens.insert('!', TokenType::Exclamation);
1317        single_tokens.insert(':', TokenType::Colon);
1318        single_tokens.insert('@', TokenType::DAt);
1319        single_tokens.insert('#', TokenType::Hash);
1320        single_tokens.insert('$', TokenType::Dollar);
1321        single_tokens.insert('?', TokenType::Parameter);
1322
1323        let mut quotes = std::collections::HashMap::new();
1324        quotes.insert("'".to_string(), "'".to_string());
1325        // Triple-quoted strings (e.g., """x""")
1326        quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());
1327
1328        let mut identifiers = std::collections::HashMap::new();
1329        identifiers.insert('"', '"');
1330        identifiers.insert('`', '`');
1331        // Note: TSQL bracket-quoted identifiers [name] are handled in the parser
1332        // because [ is also used for arrays and subscripts
1333
1334        let mut comments = std::collections::HashMap::new();
1335        comments.insert("--".to_string(), None);
1336        comments.insert("/*".to_string(), Some("*/".to_string()));
1337
1338        Self {
1339            keywords,
1340            single_tokens,
1341            quotes,
1342            identifiers,
1343            comments,
1344            // Standard SQL: only '' (doubled quote) escapes a quote
1345            // Backslash escapes are dialect-specific (MySQL, etc.)
1346            string_escapes: vec!['\''],
1347            nested_comments: true,
1348            // By default, no escape_follow_chars means preserve backslash for unrecognized escapes
1349            escape_follow_chars: vec![],
1350            // Default: b'...' is bit string (standard SQL), not byte string (BigQuery)
1351            b_prefix_is_byte_string: false,
1352            numeric_literals: std::collections::HashMap::new(),
1353            identifiers_can_start_with_digit: false,
1354            hex_number_strings: false,
1355            hex_string_is_integer_type: false,
1356            // Default: backslash escapes ARE allowed in raw strings (sqlglot default)
1357            // Spark/Databricks set this to false
1358            string_escapes_allowed_in_raw_strings: true,
1359            hash_comments: false,
1360            dollar_sign_is_identifier: false,
1361            insert_format_raw_data: false,
1362        }
1363    }
1364}
1365
/// SQL Tokenizer
///
/// Thin, reusable wrapper around a [`TokenizerConfig`]; each call to
/// [`Tokenizer::tokenize`] spins up a fresh internal scanner state.
pub struct Tokenizer {
    /// Dialect-specific configuration (keywords, quote pairs, comment markers, flags).
    config: TokenizerConfig,
}
1370
1371impl Tokenizer {
1372    /// Create a new tokenizer with the given configuration
1373    pub fn new(config: TokenizerConfig) -> Self {
1374        Self { config }
1375    }
1376
1377    /// Create a tokenizer with default configuration
1378    pub fn default_config() -> Self {
1379        Self::new(TokenizerConfig::default())
1380    }
1381
1382    /// Tokenize a SQL string
1383    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1384        let mut state = TokenizerState::new(sql, &self.config);
1385        state.tokenize()
1386    }
1387}
1388
1389impl Default for Tokenizer {
1390    fn default() -> Self {
1391        Self::default_config()
1392    }
1393}
1394
/// Internal state for tokenization
///
/// Holds the cursor, position tracking, and pending comments for a single
/// pass over one SQL string.
struct TokenizerState<'a> {
    /// The original SQL text; sliced directly (by byte) when the input is pure ASCII.
    source: &'a str,
    /// True when `source` is all-ASCII, so char indices and byte indices coincide.
    source_is_ascii: bool,
    /// The source decoded into chars, for uniform indexing of non-ASCII input.
    chars: Vec<char>,
    /// Total number of chars in `chars`.
    size: usize,
    /// Tokens emitted so far.
    tokens: Vec<Token>,
    /// Char index where the token currently being scanned started.
    start: usize,
    /// Char index of the next char to consume.
    current: usize,
    /// Current line number (1-based), maintained by `advance`.
    line: usize,
    /// Current column number (1-based), maintained by `advance`.
    column: usize,
    /// Leading comments collected while skipping whitespace, waiting to be
    /// attached to the next token (or, at EOF, to the last token as trailing).
    comments: Vec<String>,
    /// Dialect configuration driving keyword/quote/comment behavior.
    config: &'a TokenizerConfig,
}
1409
1410impl<'a> TokenizerState<'a> {
1411    fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1412        let chars: Vec<char> = sql.chars().collect();
1413        let size = chars.len();
1414        Self {
1415            source: sql,
1416            source_is_ascii: sql.is_ascii(),
1417            chars,
1418            size,
1419            tokens: Vec::new(),
1420            start: 0,
1421            current: 0,
1422            line: 1,
1423            column: 1,
1424            comments: Vec::new(),
1425            config,
1426        }
1427    }
1428
1429    fn tokenize(&mut self) -> Result<Vec<Token>> {
1430        while !self.is_at_end() {
1431            self.skip_whitespace();
1432            if self.is_at_end() {
1433                break;
1434            }
1435
1436            self.start = self.current;
1437            self.scan_token()?;
1438
1439            // ClickHouse: After INSERT ... FORMAT <name> (where name != VALUES),
1440            // the rest until the next blank line or end of input is raw data.
1441            if self.config.insert_format_raw_data {
1442                if let Some(raw) = self.try_scan_insert_format_raw_data() {
1443                    if !raw.is_empty() {
1444                        self.start = self.current;
1445                        self.add_token_with_text(TokenType::Var, raw);
1446                    }
1447                }
1448            }
1449        }
1450
1451        // Handle leftover leading comments at end of input.
1452        // These are comments on a new line after the last token that couldn't be attached
1453        // as leading comments to a subsequent token (because there is none).
1454        // Attach them as trailing comments on the last token so they're preserved.
1455        if !self.comments.is_empty() {
1456            if let Some(last) = self.tokens.last_mut() {
1457                last.trailing_comments.extend(self.comments.drain(..));
1458            }
1459        }
1460
1461        Ok(std::mem::take(&mut self.tokens))
1462    }
1463
1464    fn is_at_end(&self) -> bool {
1465        self.current >= self.size
1466    }
1467
1468    #[inline]
1469    fn text_from_range(&self, start: usize, end: usize) -> String {
1470        if self.source_is_ascii {
1471            self.source[start..end].to_string()
1472        } else {
1473            self.chars[start..end].iter().collect()
1474        }
1475    }
1476
1477    fn peek(&self) -> char {
1478        if self.is_at_end() {
1479            '\0'
1480        } else {
1481            self.chars[self.current]
1482        }
1483    }
1484
1485    fn peek_next(&self) -> char {
1486        if self.current + 1 >= self.size {
1487            '\0'
1488        } else {
1489            self.chars[self.current + 1]
1490        }
1491    }
1492
1493    fn advance(&mut self) -> char {
1494        let c = self.peek();
1495        self.current += 1;
1496        if c == '\n' {
1497            self.line += 1;
1498            self.column = 1;
1499        } else {
1500            self.column += 1;
1501        }
1502        c
1503    }
1504
    /// Skip whitespace and comments before the next token.
    ///
    /// Comments are not discarded: they're attached either to the previous
    /// token (trailing) or buffered in `self.comments` as leading comments for
    /// the next token, depending on whether a newline separated them from the
    /// previous token. Arm order below is significant: the two `//` arms are
    /// gated by different config settings, and hint comments (`/*+`) break out
    /// so the caller can emit them as tokens.
    fn skip_whitespace(&mut self) {
        // Track whether we've seen a newline since the last token.
        // Comments on a new line (after a newline) are leading comments on the next token,
        // while comments on the same line are trailing comments on the previous token.
        // This matches Python sqlglot's behavior.
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                '\u{00A0}' // non-breaking space
                | '\u{2000}'..='\u{200B}' // various Unicode spaces + zero-width space
                | '\u{3000}' // ideographic (full-width) space
                | '\u{FEFF}' // BOM / zero-width no-break space
                => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    // ClickHouse: // single-line comments (same dialects that support # comments)
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // Check if this is a hint comment /*+ ... */
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        // This is a hint comment, handle it as a token instead of skipping
                        break;
                    }
                    // NOTE(review): an unterminated block comment error is dropped
                    // here (we just return); the scan then ends at EOF with no
                    // diagnostic — confirm this silent-skip is intentional.
                    if self.scan_block_comment(saw_newline).is_err() {
                        return;
                    }
                    // Don't reset saw_newline - it carries forward
                }
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Dialect-specific // line comment (e.g., Snowflake)
                    // But NOT inside URIs like file:// or paths with consecutive slashes
                    // Check that previous non-whitespace char is not ':' or '/'
                    let prev_non_ws = if self.current > 0 {
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        // This is likely a URI (file://, http://) or path, not a comment
                        break;
                    }
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                _ => break,
            }
        }
    }
1576
1577    fn scan_hash_line_comment(&mut self) {
1578        self.advance(); // #
1579        let start = self.current;
1580        while !self.is_at_end() && self.peek() != '\n' {
1581            self.advance();
1582        }
1583        let comment = self.text_from_range(start, self.current);
1584        let comment_text = comment.trim().to_string();
1585        if let Some(last) = self.tokens.last_mut() {
1586            last.trailing_comments.push(comment_text);
1587        } else {
1588            self.comments.push(comment_text);
1589        }
1590    }
1591
1592    fn scan_double_slash_comment(&mut self) {
1593        self.advance(); // /
1594        self.advance(); // /
1595        let start = self.current;
1596        while !self.is_at_end() && self.peek() != '\n' {
1597            self.advance();
1598        }
1599        let comment = self.text_from_range(start, self.current);
1600        let comment_text = comment.trim().to_string();
1601        if let Some(last) = self.tokens.last_mut() {
1602            last.trailing_comments.push(comment_text);
1603        } else {
1604            self.comments.push(comment_text);
1605        }
1606    }
1607
1608    fn scan_line_comment(&mut self, after_newline: bool) {
1609        self.advance(); // -
1610        self.advance(); // -
1611        let start = self.current;
1612        while !self.is_at_end() && self.peek() != '\n' {
1613            self.advance();
1614        }
1615        let comment_text = self.text_from_range(start, self.current);
1616
1617        // If the comment starts on a new line (after_newline), it's a leading comment
1618        // on the next token. Otherwise, it's a trailing comment on the previous token.
1619        if after_newline || self.tokens.is_empty() {
1620            self.comments.push(comment_text);
1621        } else if let Some(last) = self.tokens.last_mut() {
1622            last.trailing_comments.push(comment_text);
1623        }
1624    }
1625
    /// Consume a `/* ... */` block comment, honoring nesting when
    /// `config.nested_comments` is set.
    ///
    /// Attaches the full `/*...*/` text (nested comments included) as a
    /// leading comment for the next token when the comment started on a new
    /// line, otherwise as a trailing comment on the previous token.
    /// Returns an error if the comment is unterminated.
    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // /
        self.advance(); // *
        let content_start = self.current;
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                // Inner terminators are consumed; the OUTERMOST `*/` is
                // deliberately NOT consumed here, so the loop exits with
                // `current` still pointing at its `*` and the content slice
                // below excludes the final terminator.
                if depth > 0 {
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        // Get the content between /* and */ (preserving internal whitespace for nested comments)
        let content = self.text_from_range(content_start, self.current);
        self.advance(); // *
        self.advance(); // /

        // For round-trip fidelity, preserve the exact comment content including nested comments
        let comment_text = format!("/*{}*/", content);

        // If the comment starts on a new line (after_newline), it's a leading comment
        // on the next token. Otherwise, it's a trailing comment on the previous token.
        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }

        Ok(())
    }
1676
1677    /// Scan a hint comment /*+ ... */ and return it as a Hint token
1678    fn scan_hint(&mut self) -> Result<()> {
1679        self.advance(); // /
1680        self.advance(); // *
1681        self.advance(); // +
1682        let hint_start = self.current;
1683
1684        // Scan until we find */
1685        while !self.is_at_end() {
1686            if self.peek() == '*' && self.peek_next() == '/' {
1687                break;
1688            }
1689            self.advance();
1690        }
1691
1692        if self.is_at_end() {
1693            return Err(Error::tokenize(
1694                "Unterminated hint comment",
1695                self.line,
1696                self.column,
1697                self.start,
1698                self.current,
1699            ));
1700        }
1701
1702        let hint_text = self.text_from_range(hint_start, self.current);
1703        self.advance(); // *
1704        self.advance(); // /
1705
1706        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1707
1708        Ok(())
1709    }
1710
1711    /// Scan a positional parameter: $1, $2, etc.
1712    fn scan_positional_parameter(&mut self) -> Result<()> {
1713        self.advance(); // consume $
1714        let start = self.current;
1715
1716        while !self.is_at_end() && self.peek().is_ascii_digit() {
1717            self.advance();
1718        }
1719
1720        let number = self.text_from_range(start, self.current);
1721        self.add_token_with_text(TokenType::Parameter, number);
1722        Ok(())
1723    }
1724
    /// Try to scan a tagged dollar-quoted string: $tag$content$tag$
    /// Returns Some(()) if successful, None if this isn't a tagged dollar string.
    ///
    /// The token text is stored as "tag\x00content" to preserve the tag for later use;
    /// `parse_dollar_string_token` splits it back apart at the '\x00'.
    ///
    /// On any failure (no closing `$` after the tag, or the closing `$tag$` is
    /// never found) the scanner backtracks to where it started and returns
    /// `Ok(None)` so the caller can try other interpretations of `$`.
    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
        // Remember the start so we can backtrack if this isn't a tagged string.
        // NOTE(review): only `current` is restored on backtrack; if advance()
        // also tracks self.line/self.column, those are not rolled back —
        // confirm this cannot skew positions (content may contain newlines).
        let saved_pos = self.current;

        // We're at '$', next char is alphabetic
        self.advance(); // consume opening $

        // Scan the tag (identifier: alphanumeric + underscore, including Unicode)
        // Tags can contain Unicode characters like emojis (e.g., $🦆$)
        let tag_start = self.current;
        while !self.is_at_end()
            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
        {
            self.advance();
        }
        let tag = self.text_from_range(tag_start, self.current);

        // Must have a closing $ after the tag
        if self.is_at_end() || self.peek() != '$' {
            // Not a tagged dollar string - restore position
            self.current = saved_pos;
            return Ok(None);
        }
        self.advance(); // consume closing $ of opening tag

        // Now scan content until we find $tag$
        let content_start = self.current;
        let closing_tag = format!("${}$", tag);
        let closing_chars: Vec<char> = closing_tag.chars().collect();

        loop {
            if self.is_at_end() {
                // Unterminated - restore and fall through
                self.current = saved_pos;
                return Ok(None);
            }

            // Check if we've reached the closing tag
            // (cheap '$' check first, then a full char-by-char comparison)
            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
                    self.current + j < self.size && self.chars[self.current + j] == ch
                });
                if matches {
                    let content = self.text_from_range(content_start, self.current);
                    // Consume closing tag
                    for _ in 0..closing_chars.len() {
                        self.advance();
                    }
                    // Store as "tag\x00content" to preserve the tag
                    let token_text = format!("{}\x00{}", tag, content);
                    self.add_token_with_text(TokenType::DollarString, token_text);
                    return Ok(Some(()));
                }
            }
            self.advance();
        }
    }
1785
    /// Scan a dollar-quoted string: $$content$$ or $tag$content$tag$
    ///
    /// For $$...$$ (no tag), the token text is just the content.
    /// For $tag$...$tag$, use try_scan_tagged_dollar_string instead.
    ///
    /// NOTE(review): an unterminated `$$...` does not raise an error here —
    /// everything up to end of input becomes the token content. The other
    /// string scanners (scan_string, scan_triple_quoted_string) return an
    /// "Unterminated ..." error instead; confirm this leniency is intentional.
    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume first $
        self.advance(); // consume second $

        // For $$...$$ (no tag), just scan until closing $$
        // (dollar-quoted content is literal: no escape processing)
        let start = self.current;
        while !self.is_at_end() {
            if self.peek() == '$'
                && self.current + 1 < self.size
                && self.chars[self.current + 1] == '$'
            {
                break;
            }
            self.advance();
        }

        let content = self.text_from_range(start, self.current);

        // Consume the closing delimiter if we found one (see NOTE above).
        if !self.is_at_end() {
            self.advance(); // consume first $
            self.advance(); // consume second $
        }

        self.add_token_with_text(TokenType::DollarString, content);
        Ok(())
    }
1816
    /// Dispatch on the current character and scan exactly one token.
    ///
    /// The ORDER of the checks below is load-bearing: triple quotes must be
    /// tried before single/double quotes, dotted numbers before the `.`
    /// single-token, hints (`/*+`) before operator scanning, tagged dollar
    /// strings before `$$` and `$n`, and multi-char operators before single
    /// tokens. Reorder with care.
    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        // Check for string literal
        if c == '\'' {
            // Check for triple-quoted string '''...''' if configured
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        // Check for triple-quoted string """...""" if configured
        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        // Check for double-quoted strings when dialect supports them (e.g., BigQuery)
        // This must come before identifier quotes check
        if c == '"'
            && self.config.quotes.contains_key("\"")
            && !self.config.identifiers.contains_key(&'"')
        {
            return self.scan_double_quoted_string();
        }

        // Check for identifier quotes
        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        // Check for numbers (including numbers starting with a dot like .25)
        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // Check for numbers starting with a dot (e.g., .25, .5)
        // This must come before single character token handling
        // Don't treat as a number if:
        // - Previous char was also a dot (e.g., 1..2 should be 1, ., ., 2)
        // - Previous char is an identifier character (e.g., foo.25 should be foo, ., 25)
        //   This handles BigQuery numeric table parts like project.dataset.25
        if c == '.' && self.peek_next().is_ascii_digit() {
            let prev_char = if self.current > 0 {
                self.chars[self.current - 1]
            } else {
                '\0'
            };
            // `]`, `)`, `` ` ``, `"` count as identifier tails so that e.g.
            // `[col].25` or `(x).5` splits at the dot rather than parsing .25.
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // Check for hint comment /*+ ... */
        if c == '/'
            && self.peek_next() == '*'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '+'
        {
            return self.scan_hint();
        }

        // Check for multi-character operators first
        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // Check for tagged dollar-quoted strings: $tag$content$tag$
        // Tags can contain Unicode characters (including emojis like 🦆) and digits (e.g., $1$)
        // Note: `$$` does not enter this branch (peek_next '$' is ASCII and
        // not alphanumeric/underscore), so it falls through to the $$ check.
        if c == '$'
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || !self.peek_next().is_ascii())
        {
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
            // If tagged dollar string didn't match and dollar_sign_is_identifier is set,
            // treat the $ and following chars as an identifier (e.g., ClickHouse $alias$name$).
            if self.config.dollar_sign_is_identifier {
                return self.scan_dollar_identifier();
            }
        }

        // Check for dollar-quoted strings: $$...$$
        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // Check for positional parameters: $1, $2, etc.
        // (only reached when try_scan_tagged_dollar_string declined, since a
        // digit after '$' is alphanumeric and enters the tagged branch first)
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        // ClickHouse: bare $ (not followed by alphanumeric/underscore) as identifier
        if c == '$' && self.config.dollar_sign_is_identifier {
            return self.scan_dollar_identifier();
        }

        // TSQL: Check for identifiers starting with # (temp tables) or @ (variables)
        // e.g., #temp, ##global_temp, @variable
        if (c == '#' || c == '@')
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || self.peek_next() == '#')
        {
            return self.scan_tsql_identifier();
        }

        // Check for single character tokens
        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        // Unicode minus (U+2212) → treat as regular minus
        if c == '\u{2212}' {
            self.advance();
            self.add_token(TokenType::Dash);
            return Ok(());
        }

        // Unicode fraction slash (U+2044) → treat as regular slash
        if c == '\u{2044}' {
            self.advance();
            self.add_token(TokenType::Slash);
            return Ok(());
        }

        // Unicode curly/smart quotes → treat as regular string quotes
        if c == '\u{2018}' || c == '\u{2019}' {
            // Left/right single quotation marks → scan as string with matching end
            return self.scan_unicode_quoted_string(c);
        }
        if c == '\u{201C}' || c == '\u{201D}' {
            // Left/right double quotation marks → scan as quoted identifier
            return self.scan_unicode_quoted_identifier(c);
        }

        // Must be an identifier or keyword
        self.scan_identifier_or_keyword()
    }
1976
1977    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
1978        let c = self.peek();
1979        let next = self.peek_next();
1980        let third = if self.current + 2 < self.size {
1981            self.chars[self.current + 2]
1982        } else {
1983            '\0'
1984        };
1985
1986        // Check for three-character operators first
1987        // -|- (Adjacent - PostgreSQL range adjacency)
1988        if c == '-' && next == '|' && third == '-' {
1989            self.advance();
1990            self.advance();
1991            self.advance();
1992            return Some(TokenType::Adjacent);
1993        }
1994
1995        // ||/ (Cube root - PostgreSQL)
1996        if c == '|' && next == '|' && third == '/' {
1997            self.advance();
1998            self.advance();
1999            self.advance();
2000            return Some(TokenType::DPipeSlash);
2001        }
2002
2003        // #>> (JSONB path text extraction - PostgreSQL)
2004        if c == '#' && next == '>' && third == '>' {
2005            self.advance();
2006            self.advance();
2007            self.advance();
2008            return Some(TokenType::DHashArrow);
2009        }
2010
2011        // ->> (JSON text extraction - PostgreSQL/MySQL)
2012        if c == '-' && next == '>' && third == '>' {
2013            self.advance();
2014            self.advance();
2015            self.advance();
2016            return Some(TokenType::DArrow);
2017        }
2018
2019        // <=> (NULL-safe equality - MySQL)
2020        if c == '<' && next == '=' && third == '>' {
2021            self.advance();
2022            self.advance();
2023            self.advance();
2024            return Some(TokenType::NullsafeEq);
2025        }
2026
2027        // <-> (Distance operator - PostgreSQL)
2028        if c == '<' && next == '-' && third == '>' {
2029            self.advance();
2030            self.advance();
2031            self.advance();
2032            return Some(TokenType::LrArrow);
2033        }
2034
2035        // <@ (Contained by - PostgreSQL)
2036        if c == '<' && next == '@' {
2037            self.advance();
2038            self.advance();
2039            return Some(TokenType::LtAt);
2040        }
2041
2042        // @> (Contains - PostgreSQL)
2043        if c == '@' && next == '>' {
2044            self.advance();
2045            self.advance();
2046            return Some(TokenType::AtGt);
2047        }
2048
2049        // ~~~ (Glob - PostgreSQL)
2050        if c == '~' && next == '~' && third == '~' {
2051            self.advance();
2052            self.advance();
2053            self.advance();
2054            return Some(TokenType::Glob);
2055        }
2056
2057        // ~~* (ILike - PostgreSQL)
2058        if c == '~' && next == '~' && third == '*' {
2059            self.advance();
2060            self.advance();
2061            self.advance();
2062            return Some(TokenType::ILike);
2063        }
2064
2065        // !~~* (Not ILike - PostgreSQL)
2066        let fourth = if self.current + 3 < self.size {
2067            self.chars[self.current + 3]
2068        } else {
2069            '\0'
2070        };
2071        if c == '!' && next == '~' && third == '~' && fourth == '*' {
2072            self.advance();
2073            self.advance();
2074            self.advance();
2075            self.advance();
2076            return Some(TokenType::NotILike);
2077        }
2078
2079        // !~~ (Not Like - PostgreSQL)
2080        if c == '!' && next == '~' && third == '~' {
2081            self.advance();
2082            self.advance();
2083            self.advance();
2084            return Some(TokenType::NotLike);
2085        }
2086
2087        // !~* (Not Regexp ILike - PostgreSQL)
2088        if c == '!' && next == '~' && third == '*' {
2089            self.advance();
2090            self.advance();
2091            self.advance();
2092            return Some(TokenType::NotIRLike);
2093        }
2094
2095        // !:> (Not cast / Try cast - SingleStore)
2096        if c == '!' && next == ':' && third == '>' {
2097            self.advance();
2098            self.advance();
2099            self.advance();
2100            return Some(TokenType::NColonGt);
2101        }
2102
2103        // ?:: (TRY_CAST shorthand - Databricks)
2104        if c == '?' && next == ':' && third == ':' {
2105            self.advance();
2106            self.advance();
2107            self.advance();
2108            return Some(TokenType::QDColon);
2109        }
2110
2111        // !~ (Not Regexp - PostgreSQL)
2112        if c == '!' && next == '~' {
2113            self.advance();
2114            self.advance();
2115            return Some(TokenType::NotRLike);
2116        }
2117
2118        // ~~ (Like - PostgreSQL)
2119        if c == '~' && next == '~' {
2120            self.advance();
2121            self.advance();
2122            return Some(TokenType::Like);
2123        }
2124
2125        // ~* (Regexp ILike - PostgreSQL)
2126        if c == '~' && next == '*' {
2127            self.advance();
2128            self.advance();
2129            return Some(TokenType::IRLike);
2130        }
2131
2132        // SingleStore three-character JSON path operators (must be checked before :: two-char)
2133        // ::$ (JSON extract string), ::% (JSON extract double), ::? (JSON match)
2134        if c == ':' && next == ':' && third == '$' {
2135            self.advance();
2136            self.advance();
2137            self.advance();
2138            return Some(TokenType::DColonDollar);
2139        }
2140        if c == ':' && next == ':' && third == '%' {
2141            self.advance();
2142            self.advance();
2143            self.advance();
2144            return Some(TokenType::DColonPercent);
2145        }
2146        if c == ':' && next == ':' && third == '?' {
2147            self.advance();
2148            self.advance();
2149            self.advance();
2150            return Some(TokenType::DColonQMark);
2151        }
2152
2153        // Two-character operators
2154        let token_type = match (c, next) {
2155            ('.', ':') => Some(TokenType::DotColon),
2156            ('=', '=') => Some(TokenType::Eq), // Hive/Spark == equality operator
2157            ('<', '=') => Some(TokenType::Lte),
2158            ('>', '=') => Some(TokenType::Gte),
2159            ('!', '=') => Some(TokenType::Neq),
2160            ('<', '>') => Some(TokenType::Neq),
2161            ('^', '=') => Some(TokenType::Neq),
2162            ('<', '<') => Some(TokenType::LtLt),
2163            ('>', '>') => Some(TokenType::GtGt),
2164            ('|', '|') => Some(TokenType::DPipe),
2165            ('|', '/') => Some(TokenType::PipeSlash), // Square root - PostgreSQL
2166            (':', ':') => Some(TokenType::DColon),
2167            (':', '=') => Some(TokenType::ColonEq), // := (assignment, named args)
2168            (':', '>') => Some(TokenType::ColonGt), // ::> (TSQL)
2169            ('-', '>') => Some(TokenType::Arrow),   // JSON object access
2170            ('=', '>') => Some(TokenType::FArrow),  // Fat arrow (lambda)
2171            ('&', '&') => Some(TokenType::DAmp),
2172            ('&', '<') => Some(TokenType::AmpLt), // PostgreSQL range operator
2173            ('&', '>') => Some(TokenType::AmpGt), // PostgreSQL range operator
2174            ('@', '@') => Some(TokenType::AtAt),  // Text search match
2175            ('?', '|') => Some(TokenType::QMarkPipe), // JSONB contains any key
2176            ('?', '&') => Some(TokenType::QMarkAmp), // JSONB contains all keys
2177            ('?', '?') => Some(TokenType::DQMark), // Double question mark
2178            ('#', '>') => Some(TokenType::HashArrow), // JSONB path extraction
2179            ('#', '-') => Some(TokenType::HashDash), // JSONB delete
2180            ('^', '@') => Some(TokenType::CaretAt), // PostgreSQL starts-with operator
2181            ('*', '*') => Some(TokenType::DStar), // Power operator
2182            ('|', '>') => Some(TokenType::PipeGt), // Pipe-greater (some dialects)
2183            _ => None,
2184        };
2185
2186        if token_type.is_some() {
2187            self.advance();
2188            self.advance();
2189        }
2190
2191        token_type
2192    }
2193
    /// Scan a single-quoted string literal into a `String` token.
    ///
    /// Doubled quotes (`''`) collapse to a single quote. Backslash escape
    /// sequences are processed only when `'\\'` appears in
    /// `config.string_escapes`; otherwise backslashes are kept literally.
    ///
    /// # Errors
    /// Returns a tokenize error if the closing quote is never found.
    fn scan_string(&mut self) -> Result<()> {
        self.advance(); // Opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // Escaped quote
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                // Handle escape sequences
                self.advance(); // Consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
                        'a' => value.push('\x07'), // Alert/bell
                        'b' => value.push('\x08'), // Backspace
                        'f' => value.push('\x0C'), // Form feed
                        'v' => value.push('\x0B'), // Vertical tab
                        'x' => {
                            // Hex escape: \xNN (exactly 2 hex digits)
                            // NOTE(review): the byte is pushed as a Unicode
                            // code point, so \xFF yields U+00FF, not raw byte
                            // 0xFF — confirm this matches the dialect's intent.
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                // Not enough hex digits, preserve literally
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            // MySQL: \% in LIKE patterns
                            value.push('%');
                        }
                        '_' => {
                            // MySQL: \_ in LIKE patterns
                            value.push('_');
                        }
                        // For unrecognized escape sequences:
                        // If escape_follow_chars is non-empty, the backslash is
                        // dropped and the char kept as-is (MySQL-style).
                        // NOTE(review): membership in escape_follow_chars is
                        // never checked here — it acts only as a mode flag, not
                        // a filter; confirm that is the intended semantics.
                        // Otherwise (empty list), backslash + char are preserved.
                        _ => {
                            if !self.config.escape_follow_chars.is_empty() {
                                // MySQL-style: discard backslash for unrecognized escapes
                                value.push(escaped);
                            } else {
                                // Standard: preserve backslash + char
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // Closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2292
    /// Scan a double-quoted string (for dialects like BigQuery where " is a string delimiter)
    ///
    /// Doubled quotes (`""`) collapse to a single quote. Escape handling is a
    /// duplicate of `scan_string` with `"` as the delimiter; keep the two in
    /// sync when editing either.
    ///
    /// # Errors
    /// Returns a tokenize error if the closing quote is never found.
    fn scan_double_quoted_string(&mut self) -> Result<()> {
        self.advance(); // Opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Escaped quote
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                // Handle escape sequences
                self.advance(); // Consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
                        'a' => value.push('\x07'), // Alert/bell
                        'b' => value.push('\x08'), // Backspace
                        'f' => value.push('\x0C'), // Form feed
                        'v' => value.push('\x0B'), // Vertical tab
                        'x' => {
                            // Hex escape: \xNN (exactly 2 hex digits)
                            // NOTE(review): pushes a Unicode code point, so
                            // \xFF yields U+00FF, not raw byte 0xFF — confirm.
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                // Not enough hex digits, preserve literally
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            // MySQL: \% in LIKE patterns
                            value.push('%');
                        }
                        '_' => {
                            // MySQL: \_ in LIKE patterns
                            value.push('_');
                        }
                        // For unrecognized escape sequences:
                        // If escape_follow_chars is non-empty, the backslash is
                        // dropped and the char kept as-is (MySQL-style).
                        // NOTE(review): membership in escape_follow_chars is
                        // never checked — it acts only as a mode flag; confirm.
                        // Otherwise (empty list), backslash + char are preserved.
                        _ => {
                            if !self.config.escape_follow_chars.is_empty() {
                                // MySQL-style: discard backslash for unrecognized escapes
                                value.push(escaped);
                            } else {
                                // Standard: preserve backslash + char
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // Closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2392
2393    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2394        // Advance past the three opening quotes
2395        self.advance();
2396        self.advance();
2397        self.advance();
2398        let mut value = String::new();
2399
2400        while !self.is_at_end() {
2401            // Check for closing triple quote
2402            if self.peek() == quote_char
2403                && self.current + 1 < self.size
2404                && self.chars[self.current + 1] == quote_char
2405                && self.current + 2 < self.size
2406                && self.chars[self.current + 2] == quote_char
2407            {
2408                // Found closing """
2409                break;
2410            }
2411            value.push(self.advance());
2412        }
2413
2414        if self.is_at_end() {
2415            return Err(Error::tokenize(
2416                "Unterminated triple-quoted string",
2417                self.line,
2418                self.column,
2419                self.start,
2420                self.current,
2421            ));
2422        }
2423
2424        // Advance past the three closing quotes
2425        self.advance();
2426        self.advance();
2427        self.advance();
2428        let token_type = if quote_char == '"' {
2429            TokenType::TripleDoubleQuotedString
2430        } else {
2431            TokenType::TripleSingleQuotedString
2432        };
2433        self.add_token_with_text(token_type, value);
2434        Ok(())
2435    }
2436
2437    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2438        self.advance(); // Opening quote
2439        let mut value = String::new();
2440
2441        loop {
2442            if self.is_at_end() {
2443                return Err(Error::tokenize(
2444                    "Unterminated identifier",
2445                    self.line,
2446                    self.column,
2447                    self.start,
2448                    self.current,
2449                ));
2450            }
2451            if self.peek() == end_quote {
2452                if self.peek_next() == end_quote {
2453                    // Escaped quote (e.g., "" inside "x""y") -> store single quote
2454                    value.push(end_quote);
2455                    self.advance(); // skip first quote
2456                    self.advance(); // skip second quote
2457                } else {
2458                    // End of identifier
2459                    break;
2460                }
2461            } else {
2462                value.push(self.peek());
2463                self.advance();
2464            }
2465        }
2466
2467        self.advance(); // Closing quote
2468        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2469        Ok(())
2470    }
2471
2472    /// Scan a string delimited by Unicode curly single quotes (U+2018/U+2019).
2473    /// Content between curly quotes is literal (no escape processing).
2474    /// When opened with \u{2018} (left), close with \u{2019} (right) only.
2475    /// When opened with \u{2019} (right), close with \u{2019} (right) — self-closing.
2476    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2477        self.advance(); // Opening curly quote
2478        let start = self.current;
2479        // Determine closing quote: left opens -> right closes; right opens -> right closes
2480        let close_quote = if open_quote == '\u{2018}' {
2481            '\u{2019}' // left opens, right closes
2482        } else {
2483            '\u{2019}' // right quote also closes with right quote
2484        };
2485        while !self.is_at_end() && self.peek() != close_quote {
2486            self.advance();
2487        }
2488        let value = self.text_from_range(start, self.current);
2489        if !self.is_at_end() {
2490            self.advance(); // Closing quote
2491        }
2492        self.add_token_with_text(TokenType::String, value);
2493        Ok(())
2494    }
2495
2496    /// Scan an identifier delimited by Unicode curly double quotes (U+201C/U+201D).
2497    /// When opened with \u{201C} (left), close with \u{201D} (right) only.
2498    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2499        self.advance(); // Opening curly quote
2500        let start = self.current;
2501        let close_quote = if open_quote == '\u{201C}' {
2502            '\u{201D}' // left opens, right closes
2503        } else {
2504            '\u{201D}' // right also closes with right
2505        };
2506        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2507            self.advance();
2508        }
2509        let value = self.text_from_range(start, self.current);
2510        if !self.is_at_end() {
2511            self.advance(); // Closing quote
2512        }
2513        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2514        Ok(())
2515    }
2516
    /// Scan a numeric literal starting at `self.start`.
    ///
    /// Handles, in order:
    /// - `0x`/`0X` hex literals (when `config.hex_number_strings` is set),
    ///   including hex floats (`0xA.Bp2`) and `_` digit separators; emitted as
    ///   `HexNumber` or `HexString` depending on `config.hex_string_is_integer_type`;
    /// - decimal digits with `_` separators between digits (e.g. `1_000_000`);
    /// - an optional fractional part — a trailing dot like `1.` is kept, but a
    ///   double dot `..` (range operator) is never consumed;
    /// - an optional `e`/`E` exponent with optional sign;
    /// - dialect numeric suffixes from `config.numeric_literals` (e.g. Hive
    ///   `1L`), emitted as a single `Number` token `"<number>::<TYPE>"`;
    /// - digit-leading identifiers (e.g. `1a`) when
    ///   `config.identifiers_can_start_with_digit` is set.
    fn scan_number(&mut self) -> Result<()> {
        // Check for 0x/0X hex number prefix (SQLite-style)
        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
            // One-character lookahead past the '0'; '\0' is the end-of-input sentinel.
            let next = if self.current + 1 < self.size {
                self.chars[self.current + 1]
            } else {
                '\0'
            };
            if next == 'x' || next == 'X' {
                // Advance past '0' and 'x'/'X'
                self.advance();
                self.advance();
                // Collect hex digits (allow underscores as separators, e.g., 0xbad_cafe)
                let hex_start = self.current;
                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
                    // A '_' must sit between hex digits; stop before a trailing one.
                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
                        break;
                    }
                    self.advance();
                }
                if self.current > hex_start {
                    // Check for hex float: 0xABC.DEFpEXP or 0xABCpEXP
                    let mut is_hex_float = false;
                    // Optional fractional part: .hexdigits
                    if !self.is_at_end() && self.peek() == '.' {
                        let after_dot = if self.current + 1 < self.size {
                            self.chars[self.current + 1]
                        } else {
                            '\0'
                        };
                        // Only treat '.' as fractional if a hex digit follows;
                        // otherwise it belongs to the surrounding syntax.
                        if after_dot.is_ascii_hexdigit() {
                            is_hex_float = true;
                            self.advance(); // consume '.'
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                self.advance();
                            }
                        }
                    }
                    // Optional binary exponent: p/P [+/-] digits
                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
                        is_hex_float = true;
                        self.advance(); // consume p/P
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
                            self.advance();
                        }
                        while !self.is_at_end() && self.peek().is_ascii_digit() {
                            self.advance();
                        }
                    }
                    if is_hex_float {
                        // Hex float literal — emit as regular Number token with full text
                        let full_text = self.text_from_range(self.start, self.current);
                        self.add_token_with_text(TokenType::Number, full_text);
                    } else if self.config.hex_string_is_integer_type {
                        // BigQuery/ClickHouse: 0xA represents an integer in hex notation.
                        // Note the token text excludes the "0x" prefix.
                        let hex_value = self.text_from_range(hex_start, self.current);
                        self.add_token_with_text(TokenType::HexNumber, hex_value);
                    } else {
                        // SQLite/Teradata: 0xCC represents a binary/blob hex string
                        let hex_value = self.text_from_range(hex_start, self.current);
                        self.add_token_with_text(TokenType::HexString, hex_value);
                    }
                    return Ok(());
                }
                // No hex digits after 0x - fall through to normal number parsing
                // (reset current back to after '0', so "x" is re-scanned separately)
                self.current = self.start + 1;
            }
        }

        // Allow underscores as digit separators (e.g., 20_000, 1_000_000)
        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            // Don't allow underscore at the end (must be followed by digit)
            // (the is_at_end() re-check is redundant under the loop guard; kept as-is)
            if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
                break;
            }
            self.advance();
        }

        // Look for decimal part - allow trailing dot (e.g., "1.")
        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
        // So we always consume the dot as part of the number, even if followed by an identifier
        if self.peek() == '.' {
            let next = self.peek_next();
            // Only consume the dot if:
            // 1. Followed by a digit (normal decimal like 1.5)
            // 2. Followed by an identifier start (like 1.x -> becomes 1. with alias x)
            // 3. End of input or other non-dot character (trailing decimal like "1.")
            // Do NOT consume if it's a double dot (..) which is a range operator
            if next != '.' {
                self.advance(); // consume the .
                // Only consume digits after the decimal point (not identifiers)
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                        break;
                    }
                    self.advance();
                }
            }
        }

        // Look for exponent: e/E, optional sign, then digits (with '_' separators)
        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let text = self.text_from_range(self.start, self.current);

        // Check for numeric literal suffixes (e.g., 1L -> BIGINT, 1s -> SMALLINT in Hive/Spark)
        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
            // Uppercased single-char lookahead used as a map key below.
            let next_char = self.peek().to_uppercase().to_string();
            // Try 2-char suffix first (e.g., "BD"), then 1-char
            let suffix_match = if self.current + 1 < self.size {
                let two_char: String = vec![self.chars[self.current], self.chars[self.current + 1]]
                    .iter()
                    .collect::<String>()
                    .to_uppercase();
                if self.config.numeric_literals.contains_key(&two_char) {
                    // Make sure the 2-char suffix is not followed by more identifier chars
                    let after_suffix = if self.current + 2 < self.size {
                        self.chars[self.current + 2]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((two_char, 2))
                    } else {
                        None
                    }
                } else if self.config.numeric_literals.contains_key(&next_char) {
                    // 1-char suffix - make sure not followed by more identifier chars
                    let after_suffix = if self.current + 1 < self.size {
                        self.chars[self.current + 1]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((next_char, 1))
                    } else {
                        None
                    }
                } else {
                    None
                }
            } else if self.config.numeric_literals.contains_key(&next_char) {
                // At end of input, 1-char suffix
                Some((next_char, 1))
            } else {
                None
            };

            if let Some((suffix, len)) = suffix_match {
                // Consume the suffix characters
                for _ in 0..len {
                    self.advance();
                }
                // Emit as a special number-with-suffix token
                // We'll encode as "number::TYPE" so the parser can split it
                let type_name = self
                    .config
                    .numeric_literals
                    .get(&suffix)
                    .expect("suffix verified by contains_key above")
                    .clone();
                let combined = format!("{}::{}", text, type_name);
                self.add_token_with_text(TokenType::Number, combined);
                return Ok(());
            }
        }

        // Check for identifiers that start with a digit (e.g., 1a, 1_a, 1a_1a)
        // In Hive/Spark/MySQL/ClickHouse, these are valid unquoted identifiers
        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
            let next = self.peek();
            if next.is_alphabetic() || next == '_' {
                // Continue scanning as an identifier
                while !self.is_at_end() {
                    let ch = self.peek();
                    if ch.is_alphanumeric() || ch == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                // Re-slice from self.start so the digits are part of the identifier.
                let ident_text = self.text_from_range(self.start, self.current);
                self.add_token_with_text(TokenType::Identifier, ident_text);
                return Ok(());
            }
        }

        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }
2719
2720    /// Scan a number that starts with a dot (e.g., .25, .5, .123e10)
2721    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2722        // Consume the leading dot
2723        self.advance();
2724
2725        // Consume the fractional digits
2726        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2727            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2728                break;
2729            }
2730            self.advance();
2731        }
2732
2733        // Look for exponent
2734        if self.peek() == 'e' || self.peek() == 'E' {
2735            self.advance();
2736            if self.peek() == '+' || self.peek() == '-' {
2737                self.advance();
2738            }
2739            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2740                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2741                    break;
2742                }
2743                self.advance();
2744            }
2745        }
2746
2747        let text = self.text_from_range(self.start, self.current);
2748        self.add_token_with_text(TokenType::Number, text);
2749        Ok(())
2750    }
2751
    /// Scan a bare word and classify it.
    ///
    /// The word may turn out to be:
    /// - `NOT=` — collapsed into a single `Neq` token (Teradata-style);
    /// - a prefixed string literal: `N'..'`, `E'..'`, `X'..'`, `B'..'`,
    ///   raw `r'..'`/`r".."` (incl. triple-quoted), or `U&'..'`;
    /// - a keyword from `config.keywords` (matched case-insensitively);
    /// - otherwise a plain `Var` token.
    ///
    /// Returns an error only for a character that cannot start an identifier
    /// (guard against infinite loops) or for unterminated prefixed strings.
    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
        // Guard against unrecognized characters that could cause infinite loops
        let first_char = self.peek();
        if !first_char.is_alphanumeric() && first_char != '_' {
            // Unknown character - skip it and return an error
            let c = self.advance();
            return Err(Error::tokenize(
                format!("Unexpected character: '{}'", c),
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        while !self.is_at_end() {
            let c = self.peek();
            // Allow alphanumeric, underscore, $, # and @ in identifiers
            // PostgreSQL allows $, TSQL allows # and @
            // But stop consuming # if followed by > or >> (PostgreSQL #> and #>> operators)
            if c == '#' {
                let next_c = if self.current + 1 < self.size {
                    self.chars[self.current + 1]
                } else {
                    '\0'
                };
                if next_c == '>' || next_c == '-' {
                    break; // Don't consume # — it's part of #>, #>>, or #- operator
                }
                self.advance();
            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text = self.text_from_range(self.start, self.current);
        // Uppercased once for all case-insensitive comparisons below.
        let upper = text.to_uppercase();

        // Special-case NOT= (Teradata and other dialects): emit a single Neq token.
        if upper == "NOT" && self.peek() == '=' {
            self.advance(); // consume '='
            self.add_token(TokenType::Neq);
            return Ok(());
        }

        // Check for special string prefixes like N'...', X'...', B'...', U&'...', r'...', b'...'
        // Also handle double-quoted variants for dialects that support them (e.g., BigQuery)
        let next_char = self.peek();
        let is_single_quote = next_char == '\'';
        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
        // For raw strings (r"..." or r'...'), we allow double quotes even if " is not in quotes config
        // because raw strings are a special case used in Spark/Databricks where " is for identifiers
        let is_double_quote_for_raw = next_char == '"';

        // Handle raw strings first - they're special because they work with both ' and "
        // even in dialects where " is normally an identifier delimiter (like Databricks)
        if upper == "R" && (is_single_quote || is_double_quote_for_raw) {
            // Raw string r'...' or r"..." or r'''...''' or r"""...""" (BigQuery style)
            // In raw strings, backslashes are treated literally (no escape processing)
            let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the first opening quote

            // Check for triple-quoted raw string (r"""...""" or r'''...''')
            if self.peek() == quote_char && self.peek_next() == quote_char {
                // Triple-quoted raw string
                self.advance(); // consume second quote
                self.advance(); // consume third quote
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            } else {
                let string_value = self.scan_raw_string_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            }
            return Ok(());
        }

        if is_single_quote || is_double_quote {
            match upper.as_str() {
                "N" => {
                    // National string N'...'
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::NationalString, string_value);
                    return Ok(());
                }
                "E" => {
                    // PostgreSQL escape string E'...' or e'...'
                    // Preserve the case by prefixing with "e:" or "E:"
                    // Always use backslash escapes for escape strings (e.g., \' is an escaped quote)
                    let lowercase = text == "e";
                    let prefix = if lowercase { "e:" } else { "E:" };
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content_with_escapes(true)?;
                    self.add_token_with_text(
                        TokenType::EscapeString,
                        format!("{}{}", prefix, string_value),
                    );
                    return Ok(());
                }
                "X" => {
                    // Hex string X'...'
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::HexString, string_value);
                    return Ok(());
                }
                "B" if is_double_quote => {
                    // Byte string b"..." (BigQuery style) - MUST check before single quote B'...'
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_double_quoted_string_content()?;
                    self.add_token_with_text(TokenType::ByteString, string_value);
                    return Ok(());
                }
                "B" if is_single_quote => {
                    // For BigQuery: b'...' is a byte string (bytes data)
                    // For standard SQL: B'...' is a bit string (binary digits)
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content()?;
                    if self.config.b_prefix_is_byte_string {
                        self.add_token_with_text(TokenType::ByteString, string_value);
                    } else {
                        self.add_token_with_text(TokenType::BitString, string_value);
                    }
                    return Ok(());
                }
                _ => {}
            }
        }

        // Check for U&'...' Unicode string syntax (SQL standard)
        if upper == "U"
            && self.peek() == '&'
            && self.current + 1 < self.size
            && self.chars[self.current + 1] == '\''
        {
            self.advance(); // consume '&'
            self.advance(); // consume opening quote
            let string_value = self.scan_string_content()?;
            self.add_token_with_text(TokenType::UnicodeString, string_value);
            return Ok(());
        }

        // Not a prefixed literal: keyword lookup (uppercased), else a Var token.
        // The original-cased text is preserved in the token.
        let token_type = self
            .config
            .keywords
            .get(&upper)
            .copied()
            .unwrap_or(TokenType::Var);

        self.add_token_with_text(token_type, text);
        Ok(())
    }
2914
2915    /// Scan string content (everything between quotes)
2916    /// If `force_backslash_escapes` is true, backslash is always treated as an escape character
2917    /// (used for PostgreSQL E'...' escape strings)
2918    fn scan_string_content_with_escapes(
2919        &mut self,
2920        force_backslash_escapes: bool,
2921    ) -> Result<String> {
2922        let mut value = String::new();
2923        let use_backslash_escapes =
2924            force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2925
2926        while !self.is_at_end() {
2927            let c = self.peek();
2928            if c == '\'' {
2929                if self.peek_next() == '\'' {
2930                    // Escaped quote ''
2931                    value.push('\'');
2932                    self.advance();
2933                    self.advance();
2934                } else {
2935                    break;
2936                }
2937            } else if c == '\\' && use_backslash_escapes {
2938                // Preserve escape sequences literally (including \' for escape strings)
2939                value.push(self.advance());
2940                if !self.is_at_end() {
2941                    value.push(self.advance());
2942                }
2943            } else {
2944                value.push(self.advance());
2945            }
2946        }
2947
2948        if self.is_at_end() {
2949            return Err(Error::tokenize(
2950                "Unterminated string",
2951                self.line,
2952                self.column,
2953                self.start,
2954                self.current,
2955            ));
2956        }
2957
2958        self.advance(); // Closing quote
2959        Ok(value)
2960    }
2961
    /// Scan string content (everything between quotes).
    ///
    /// Convenience wrapper around [`scan_string_content_with_escapes`]:
    /// backslash escapes apply only when the dialect's `string_escapes`
    /// config includes `\`.
    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }
2966
    /// Scan double-quoted string content (for dialects like BigQuery where " is a string delimiter)
    /// This is used for prefixed strings like b"..." or N"...".
    ///
    /// Unlike the single-quote scanner, backslash escapes (when enabled via
    /// `config.string_escapes`) are DECODED here rather than preserved:
    /// `\n`, `\r`, `\t`, `\0`, `\\`, `\"`, `\'` and `\xNN` are translated,
    /// while unrecognized escapes keep the backslash and character literally.
    /// A doubled `""` encodes one literal quote. Errors on unterminated input.
    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Escaped quote "" -> one literal "
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                // Handle escape sequences
                self.advance(); // Consume backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        '\\' => value.push('\\'),
                        '"' => value.push('"'),
                        '\'' => value.push('\''),
                        'x' => {
                            // Hex escape \xNN - collect up to 2 hex digits
                            let mut hex = String::new();
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                // u8 -> char maps the byte to U+00NN (Latin-1 style)
                                value.push(byte as char);
                            } else {
                                // Invalid hex escape (no digits), keep it literal
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        _ => {
                            // For unrecognized escapes, preserve backslash + char
                            value.push('\\');
                            value.push(escaped);
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // Closing quote
        Ok(value)
    }
3039
3040    /// Scan raw string content (limited escape processing for quotes)
3041    /// Used for BigQuery r'...' and r"..." strings
3042    /// In raw strings, backslashes are literal EXCEPT that escape sequences for the
3043    /// quote character still work (e.g., \' in r'...' escapes the quote, '' also works)
3044    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3045        let mut value = String::new();
3046
3047        while !self.is_at_end() {
3048            let c = self.peek();
3049            if c == quote_char {
3050                if self.peek_next() == quote_char {
3051                    // Escaped quote (doubled) - e.g., '' inside r'...'
3052                    value.push(quote_char);
3053                    self.advance();
3054                    self.advance();
3055                } else {
3056                    break;
3057                }
3058            } else if c == '\\'
3059                && self.peek_next() == quote_char
3060                && self.config.string_escapes_allowed_in_raw_strings
3061            {
3062                // Backslash-escaped quote - works in raw strings when string_escapes_allowed_in_raw_strings is true
3063                // e.g., \' inside r'...' becomes literal ' (BigQuery behavior)
3064                // Spark/Databricks has this set to false, so backslash is always literal there
3065                value.push(quote_char);
3066                self.advance(); // consume backslash
3067                self.advance(); // consume quote
3068            } else {
3069                // In raw strings, everything including backslashes is literal
3070                value.push(self.advance());
3071            }
3072        }
3073
3074        if self.is_at_end() {
3075            return Err(Error::tokenize(
3076                "Unterminated raw string",
3077                self.line,
3078                self.column,
3079                self.start,
3080                self.current,
3081            ));
3082        }
3083
3084        self.advance(); // Closing quote
3085        Ok(value)
3086    }
3087
3088    /// Scan raw triple-quoted string content (r"""...""" or r'''...''')
3089    /// Terminates when three consecutive quote_chars are found
3090    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3091        let mut value = String::new();
3092
3093        while !self.is_at_end() {
3094            let c = self.peek();
3095            if c == quote_char && self.peek_next() == quote_char {
3096                // Check for third quote
3097                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3098                    // Found three consecutive quotes - end of string
3099                    self.advance(); // first closing quote
3100                    self.advance(); // second closing quote
3101                    self.advance(); // third closing quote
3102                    return Ok(value);
3103                }
3104            }
3105            // In raw strings, everything including backslashes is literal
3106            let ch = self.advance();
3107            value.push(ch);
3108        }
3109
3110        Err(Error::tokenize(
3111            "Unterminated raw triple-quoted string",
3112            self.line,
3113            self.column,
3114            self.start,
3115            self.current,
3116        ))
3117    }
3118
    /// Scan an identifier that starts with `$` (ClickHouse).
    /// Examples: `$alias$name$`, `$x`
    fn scan_dollar_identifier(&mut self) -> Result<()> {
        // Consume the leading $
        self.advance();

        // Consume alphanumeric, _, and $ continuation chars
        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' {
                self.advance();
            } else {
                break;
            }
        }

        // Token text includes the leading '$'.
        let text = self.text_from_range(self.start, self.current);
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }

    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables)
    /// Examples: #temp, ##global_temp, @variable
    fn scan_tsql_identifier(&mut self) -> Result<()> {
        // Consume the leading # or @ (or ##)
        let first = self.advance();

        // For ##, consume the second #
        if first == '#' && self.peek() == '#' {
            self.advance();
        }

        // Now scan the rest of the identifier
        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text = self.text_from_range(self.start, self.current);
        // These are always identifiers (variables or temp table names), never keywords
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }
3166
    /// Check if the last tokens match INSERT ... FORMAT <name> (not VALUES).
    /// If so, consume everything until the next blank line (two consecutive newlines)
    /// or end of input as raw data.
    ///
    /// Returns `Some(trimmed_raw_data)` when ClickHouse-style inline data was
    /// consumed, or `None` when the token context does not match or the data
    /// at end of input is empty after trimming.
    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
        let len = self.tokens.len();
        // Need at least INSERT ... FORMAT <name> — three tokens.
        if len < 3 {
            return None;
        }

        // Last token should be the format name (Identifier or Var, not VALUES)
        let last = &self.tokens[len - 1];
        if last.text.eq_ignore_ascii_case("VALUES") {
            return None;
        }
        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
            return None;
        }

        // Second-to-last should be FORMAT
        let format_tok = &self.tokens[len - 2];
        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
            return None;
        }

        // Check that there's an INSERT somewhere earlier in the tokens
        // (bounded lookback of 20 tokens keeps this cheap).
        let has_insert = self.tokens[..len - 2]
            .iter()
            .rev()
            .take(20)
            .any(|t| t.token_type == TokenType::Insert);
        if !has_insert {
            return None;
        }

        // We're in INSERT ... FORMAT <name> context. Consume everything until:
        // - A blank line (two consecutive newlines, possibly with \r between)
        // - End of input
        let raw_start = self.current;
        while !self.is_at_end() {
            let c = self.peek();
            if c == '\n' {
                // Check for blank line: \n followed by optional \r and \n.
                // `saved` marks the end of the raw data (before the first \n).
                let saved = self.current;
                self.advance(); // consume first \n
                // Skip \r if present
                while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                if self.is_at_end() || self.peek() == '\n' {
                    // Found blank line or end of input - stop here
                    // Don't consume the second \n so subsequent SQL can be tokenized
                    let raw = self.text_from_range(raw_start, saved);
                    return Some(raw.trim().to_string());
                }
                // Not a blank line, continue scanning (the \n and \r's stay consumed)
            } else {
                self.advance();
            }
        }

        // Reached end of input
        let raw = self.text_from_range(raw_start, self.current);
        let trimmed = raw.trim().to_string();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed)
        }
    }
3236
3237    fn add_token(&mut self, token_type: TokenType) {
3238        let text = self.text_from_range(self.start, self.current);
3239        self.add_token_with_text(token_type, text);
3240    }
3241
3242    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3243        let span = Span::new(self.start, self.current, self.line, self.column);
3244        let mut token = Token::new(token_type, text, span);
3245        token.comments.append(&mut self.comments);
3246        self.tokens.push(token);
3247    }
3248}
3249
#[cfg(test)]
mod tests {
    use super::*;

    // Basic tokenization: keyword + number literal.
    #[test]
    fn test_simple_select() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
    }

    // Plain (unquoted) identifiers tokenize as Var, not Identifier.
    #[test]
    fn test_select_with_identifier() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();

        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Var);
        assert_eq!(tokens[1].text, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Var);
        assert_eq!(tokens[3].text, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Var);
        assert_eq!(tokens[5].text, "t");
    }

    // String token text excludes the surrounding quotes.
    #[test]
    fn test_string_literal() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "hello");
    }

    // SQL doubled-quote escape ('') collapses to a single quote in the text.
    #[test]
    fn test_escaped_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "it's");
    }

    #[test]
    fn test_comments() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();

        assert_eq!(tokens.len(), 2);
        // Comments are attached to the PREVIOUS token as trailing_comments
        // This is better for round-trip fidelity (e.g., SELECT c /* comment */ FROM)
        assert_eq!(tokens[0].trailing_comments.len(), 1);
        assert_eq!(tokens[0].trailing_comments[0], " comment");
    }

    #[test]
    fn test_comment_in_and_chain() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // Line comments between AND clauses should appear after the AND operator
        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
        let ast = Parser::parse_sql(sql).unwrap();
        let mut gen = Generator::default();
        let output = gen.generate(&ast[0]).unwrap();
        assert_eq!(
            output,
            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
        );
    }

    #[test]
    fn test_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();

        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[1].token_type, TokenType::Plus);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[3].token_type, TokenType::Star);
        assert_eq!(tokens[4].token_type, TokenType::Number);
    }

    // Two-character comparison operators must tokenize as single tokens.
    #[test]
    fn test_comparison_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();

        assert_eq!(tokens[1].token_type, TokenType::Lte);
        assert_eq!(tokens[3].token_type, TokenType::Gte);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    // N'...' prefix produces one NationalString token, not Var + String.
    #[test]
    fn test_national_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("N'abc'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for N'abc', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].text, "abc");
    }

    // X'...' prefix produces one HexString token.
    #[test]
    fn test_hex_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for X'ABCD', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::HexString);
        assert_eq!(tokens[0].text, "ABCD");
    }

    // B'...' prefix produces one BitString token.
    #[test]
    fn test_bit_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("B'01010'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for B'01010', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::BitString);
        assert_eq!(tokens[0].text, "01010");
    }

    #[test]
    fn test_trailing_dot_number() {
        let tokenizer = Tokenizer::default();

        // Test trailing dot
        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
        assert_eq!(
            tokens.len(),
            2,
            "Expected 2 tokens for 'SELECT 1.', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");

        // Test normal decimal
        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
        assert_eq!(tokens[1].text, "1.5");

        // Test number followed by dot and identifier
        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");
        assert_eq!(tokens[2].token_type, TokenType::Var);

        // Test two dots (range operator) - dot is NOT consumed when followed by another dot
        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
        assert_eq!(tokens[2].token_type, TokenType::Dot);
        assert_eq!(tokens[3].token_type, TokenType::Dot);
        assert_eq!(tokens[4].token_type, TokenType::Number);
        assert_eq!(tokens[4].text, "2");
    }

    #[test]
    fn test_leading_dot_number() {
        let tokenizer = Tokenizer::default();

        // Test leading dot number (e.g., .25 for 0.25)
        let tokens = tokenizer.tokenize(".25").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.25', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".25");

        // Test leading dot in context (Oracle SAMPLE clause)
        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
        assert_eq!(
            tokens.len(),
            4,
            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Sample);
        assert_eq!(tokens[1].token_type, TokenType::LParen);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[2].text, ".25");
        assert_eq!(tokens[3].token_type, TokenType::RParen);

        // Test leading dot with exponent
        let tokens = tokenizer.tokenize(".5e10").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.5e10', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".5e10");

        // Test that plain dot is still a Dot token
        let tokens = tokenizer.tokenize("a.b").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'a.b', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Dot);
    }

    #[test]
    fn test_unrecognized_character() {
        let tokenizer = Tokenizer::default();

        // Unicode curly quotes are now handled as string delimiters
        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
        assert!(
            result.is_ok(),
            "Curly quotes should be tokenized as strings"
        );

        // Unicode bullet character should still error
        let result = tokenizer.tokenize("SELECT • FROM t");
        assert!(result.is_err());
    }

    // `:=`, `:` and `::` must each resolve to distinct token types.
    #[test]
    fn test_colon_eq_tokenization() {
        let tokenizer = Tokenizer::default();

        // := should be a single ColonEq token
        let tokens = tokenizer.tokenize("a := 1").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].token_type, TokenType::Var);
        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
        assert_eq!(tokens[2].token_type, TokenType::Number);

        // : followed by non-= should still be Colon
        let tokens = tokenizer.tokenize("a:b").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));

        // :: should still be DColon
        let tokens = tokenizer.tokenize("a::INT").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
    }

    // End-to-end parse + generate for the := operator in its various roles
    // (MySQL user-variable assignment, named function args, prefix aliases).
    #[test]
    fn test_colon_eq_parsing() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // MySQL @var := value in SELECT
        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
            .expect("Failed to parse MySQL @var := expr");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := 1, @var2");

        // MySQL @var := @var in SELECT
        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
            .expect("Failed to parse MySQL @var2 := @var1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1, @var2 := @var1");

        // MySQL @var := COUNT(*)
        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
            .expect("Failed to parse MySQL @var := COUNT(*)");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");

        // MySQL SET @var := 1 (should normalize to = in output)
        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SET @var1 = 1");

        // Function named args with :=
        let ast =
            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "UNION_VALUE(k1 := 1)");

        // UNNEST with recursive := TRUE
        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
            .expect("Failed to parse UNNEST with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");

        // DuckDB prefix alias: foo: 1 means 1 AS foo
        let ast =
            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo");

        // DuckDB prefix alias with multiple columns
        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
            .expect("Failed to parse DuckDB multiple prefix aliases");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
    }

    // Same := scenarios, but routed through the dialect parse/transform/generate
    // pipeline to confirm dialect-specific roundtrips.
    #[test]
    fn test_colon_eq_dialect_roundtrip() {
        use crate::dialects::{Dialect, DialectType};

        // `expected == None` means the output must equal the input verbatim.
        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
            let d = Dialect::get(dialect);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // MySQL := tests
        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

        // DuckDB := tests
        check(
            DialectType::DuckDB,
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
            None,
        );
        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

        // STRUCT_PACK(a := 'b')::json should at least parse without error
        // (The STRUCT_PACK -> Struct transformation is a separate feature)
        {
            let d = Dialect::get(DialectType::DuckDB);
            let ast = d
                .parse("STRUCT_PACK(a := 'b')::json")
                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
        }

        // DuckDB prefix alias tests
        check(
            DialectType::DuckDB,
            "SELECT foo: 1",
            Some("SELECT 1 AS foo"),
        );
        check(
            DialectType::DuckDB,
            "SELECT foo: 1, bar: 2, baz: 3",
            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
        );
    }

    // Parse -> generate must reproduce the input byte-for-byte for a battery
    // of comment placements; failures are collected so one run reports all.
    #[test]
    fn test_comment_roundtrip() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // Returns None on success, or a diagnostic string on failure.
        fn check_roundtrip(sql: &str) -> Option<String> {
            let ast = match Parser::parse_sql(sql) {
                Ok(a) => a,
                Err(e) => return Some(format!("Parse error: {:?}", e)),
            };
            if ast.is_empty() {
                return Some("Empty AST".to_string());
            }
            let mut generator = Generator::default();
            let output = match generator.generate(&ast[0]) {
                Ok(o) => o,
                Err(e) => return Some(format!("Gen error: {:?}", e)),
            };
            if output == sql {
                None
            } else {
                Some(format!(
                    "Mismatch:\n  input:  {}\n  output: {}",
                    sql, output
                ))
            }
        }

        let tests = vec![
            // Nested comments
            "SELECT c /* c1 /* c2 */ c3 */",
            "SELECT c /* c1 /* c2 /* c3 */ */ */",
            // Simple alias with comments
            "SELECT c /* c1 */ AS alias /* c2 */",
            // Multiple columns with comments
            "SELECT a /* x */, b /* x */",
            // Multiple comments after column
            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
            // FROM tables with comments
            "SELECT * FROM foo /* x */, bla /* x */",
            // Arithmetic with comments
            "SELECT 1 /* comment */ + 1",
            "SELECT 1 /* c1 */ + 2 /* c2 */",
            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
            // CAST with comments
            "SELECT CAST(x AS INT) /* comment */ FROM foo",
            // Function arguments with comments
            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
            // Multi-part table names with comments
            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
            // INSERT with comments
            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
            // Leading comments on statements
            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
            "/* comment */ CREATE TABLE foo AS SELECT 1",
            // Trailing comments on statements
            "INSERT INTO foo SELECT * FROM bar /* comment */",
            // Complex nested expressions with comments
            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
        ];

        let mut failures = Vec::new();
        for sql in tests {
            if let Some(e) = check_roundtrip(sql) {
                failures.push(e);
            }
        }

        if !failures.is_empty() {
            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
        }
    }

    // Dollar-quoted strings: tag/content split via the \x00 separator, plus
    // Databricks roundtrips for $$...$$ and $TAG$...$TAG$ function bodies.
    #[test]
    fn test_dollar_quoted_string_parsing() {
        use crate::dialects::{Dialect, DialectType};

        // Test dollar string token parsing utility function
        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
        assert_eq!(tag, Some("FOO".to_string()));
        assert_eq!(content, "content here");

        let (tag, content) = super::parse_dollar_string_token("just content");
        assert_eq!(tag, None);
        assert_eq!(content, "just content");

        // Test roundtrip for Databricks dialect with dollar-quoted function body
        fn check_databricks(sql: &str, expected: Option<&str>) {
            let d = Dialect::get(DialectType::Databricks);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // Test [42]: $$...$$ heredoc
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n  return x+1$$",
            None
        );

        // Test [43]: $FOO$...$FOO$ tagged heredoc
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n  return x+1$FOO$",
            None
        );
    }
}