// polyglot_sql/tokens.rs
1//! Token types and tokenization for SQL parsing
2//!
3//! This module defines all SQL token types and the tokenizer that converts
4//! SQL strings into token streams.
5
6use crate::error::{Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::fmt;
10use std::sync::LazyLock;
11#[cfg(feature = "bindings")]
12use ts_rs::TS;
13
/// Parse a DollarString token text into (tag, content).
///
/// If the text contains a `'\x00'` separator, the part before it is the tag
/// and the part after it is the content. Otherwise, the whole text is the
/// content with no tag. Only the first `'\x00'` is treated as the separator;
/// any later NUL bytes remain part of the content.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    // `split_once` performs the find-and-slice in one step and avoids
    // manual byte-offset arithmetic.
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_string()), content.to_string()),
        None => (None, text.to_string()),
    }
}
26
/// Represents a position in the source SQL
///
/// Offsets are byte offsets into the original SQL string; `line` and
/// `column` are 1-based, as is conventional for diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct Span {
    /// Starting byte offset
    pub start: usize,
    /// Ending byte offset (exclusive)
    pub end: usize,
    /// Line number (1-based)
    pub line: usize,
    /// Column number (1-based)
    pub column: usize,
}
40
41impl Span {
42    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
43        Self {
44            start,
45            end,
46            line,
47            column,
48        }
49    }
50}
51
/// A token in the SQL token stream
///
/// Carries the classified [`TokenType`] together with the raw source text,
/// its position, and any comments captured around it so that the original
/// SQL can be round-tripped.
// NOTE(review): unlike `Span`, this struct does not derive `TS` under the
// "bindings" feature — confirm that is intentional.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The type of token
    pub token_type: TokenType,
    /// The raw text of the token
    pub text: String,
    /// Position information
    pub span: Span,
    /// Leading comments (comments that appeared before this token)
    #[serde(default)]
    pub comments: Vec<String>,
    /// Trailing comments (comments that appeared after this token, before the next one)
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
68
69impl Token {
70    /// Create a new token
71    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
72        Self {
73            token_type,
74            text: text.into(),
75            span,
76            comments: Vec::new(),
77            trailing_comments: Vec::new(),
78        }
79    }
80
81    /// Create a NUMBER token
82    pub fn number(n: i64) -> Self {
83        Self::new(TokenType::Number, n.to_string(), Span::default())
84    }
85
86    /// Create a STRING token
87    pub fn string(s: impl Into<String>) -> Self {
88        Self::new(TokenType::String, s, Span::default())
89    }
90
91    /// Create an IDENTIFIER token
92    pub fn identifier(s: impl Into<String>) -> Self {
93        Self::new(TokenType::Identifier, s, Span::default())
94    }
95
96    /// Create a VAR token
97    pub fn var(s: impl Into<String>) -> Self {
98        Self::new(TokenType::Var, s, Span::default())
99    }
100
101    /// Add a comment to this token
102    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
103        self.comments.push(comment.into());
104        self
105    }
106}
107
108impl fmt::Display for Token {
109    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110        write!(f, "{:?}({})", self.token_type, self.text)
111    }
112}
113
/// All possible token types in SQL
///
/// Serialized in SCREAMING_SNAKE_CASE (see the `serde` attribute below) and
/// stored with a `u16` discriminant via `#[repr(u16)]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation
    // Naming: a leading `D` appears to mean "double" (e.g. `DColon` for `::`,
    // `DPipe` for `||`) — inferred from the variant names; confirm against
    // the tokenizer rules.
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt, // <<
    GtGt, // >>
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    // Structural tokens
    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments (emitted as tokens for round-trip fidelity)
    BlockComment, // /* ... */
    LineComment,  // -- ...

    // Literals
    String,
    DollarString,             // $$...$$
    TripleDoubleQuotedString, // """..."""
    TripleSingleQuotedString, // '''...'''
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    /// Hex number: 0xA, 0xFF (BigQuery, SQLite style) - represents an integer in hex notation
    HexNumber,
    ByteString,
    NationalString,
    EscapeString, // PostgreSQL E'...' escape string
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data Types
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,   // !~~ operator (PostgreSQL)
    NotILike,  // !~~* operator (PostgreSQL)
    NotRLike,  // !~ operator (PostgreSQL)
    NotIRLike, // !~* operator (PostgreSQL)
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window function keywords
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // DDL-specific keywords (Phase 4)
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    // MATCH_RECOGNIZE tokens
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // Special
    // End-of-input marker.
    Eof,
}
670
impl TokenType {
    /// Check if this token type is a keyword that can be used as an identifier in certain contexts
    ///
    /// Returns `true` only for the fixed allow-list below. Behavior depends on
    /// exact membership: adding a variant here lets the parser treat that
    /// keyword as a plain identifier where the grammar permits it.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            TokenType::Select
                | TokenType::From
                | TokenType::Where
                | TokenType::And
                | TokenType::Or
                | TokenType::Not
                | TokenType::In
                | TokenType::Is
                | TokenType::Null
                | TokenType::True
                | TokenType::False
                | TokenType::As
                | TokenType::On
                | TokenType::Join
                | TokenType::Left
                | TokenType::Right
                | TokenType::Inner
                | TokenType::Outer
                | TokenType::Full
                | TokenType::Cross
                | TokenType::Semi
                | TokenType::Anti
                | TokenType::Union
                | TokenType::Except
                | TokenType::Intersect
                | TokenType::GroupBy
                | TokenType::OrderBy
                | TokenType::Having
                | TokenType::Limit
                | TokenType::Offset
                | TokenType::Case
                | TokenType::When
                | TokenType::Then
                | TokenType::Else
                | TokenType::End
                | TokenType::Create
                | TokenType::Drop
                | TokenType::Alter
                | TokenType::Insert
                | TokenType::Update
                | TokenType::Delete
                | TokenType::Into
                | TokenType::Values
                | TokenType::Set
                | TokenType::With
                | TokenType::Distinct
                | TokenType::All
                | TokenType::Exists
                | TokenType::Between
                | TokenType::Like
                | TokenType::ILike
                // Additional keywords that can be used as identifiers
                | TokenType::Filter
                | TokenType::Date
                | TokenType::Timestamp
                | TokenType::TimestampTz
                | TokenType::Interval
                | TokenType::Time
                | TokenType::Table
                | TokenType::Index
                | TokenType::Column
                | TokenType::Database
                | TokenType::Schema
                | TokenType::View
                | TokenType::Function
                | TokenType::Procedure
                | TokenType::Trigger
                | TokenType::Sequence
                | TokenType::Over
                | TokenType::Partition
                | TokenType::Window
                | TokenType::Rows
                | TokenType::Range
                | TokenType::First
                | TokenType::Last
                | TokenType::Preceding
                | TokenType::Following
                | TokenType::Current
                | TokenType::Row
                | TokenType::Unbounded
                | TokenType::Array
                | TokenType::Struct
                | TokenType::Map
                | TokenType::PrimaryKey
                | TokenType::Key
                | TokenType::ForeignKey
                | TokenType::References
                | TokenType::Unique
                | TokenType::Check
                | TokenType::Default
                | TokenType::Constraint
                | TokenType::Comment
                | TokenType::Rollup
                | TokenType::Cube
                | TokenType::Grant
                | TokenType::Revoke
                | TokenType::Type
                | TokenType::Use
                | TokenType::Cache
                | TokenType::Uncache
                | TokenType::Load
                | TokenType::Any
                | TokenType::Some
                | TokenType::Asc
                | TokenType::Desc
                | TokenType::Nulls
                | TokenType::Lateral
                | TokenType::Natural
                | TokenType::Escape
                | TokenType::Glob
                | TokenType::Match
                | TokenType::Recursive
                | TokenType::Replace
                | TokenType::Returns
                | TokenType::If
                | TokenType::Pivot
                | TokenType::Unpivot
                | TokenType::Json
                | TokenType::Blob
                | TokenType::Text
                | TokenType::Int
                | TokenType::BigInt
                | TokenType::SmallInt
                | TokenType::TinyInt
                | TokenType::Int128
                | TokenType::UInt128
                | TokenType::Int256
                | TokenType::UInt256
                | TokenType::UInt
                | TokenType::UBigInt
                | TokenType::Float
                | TokenType::Double
                | TokenType::Decimal
                | TokenType::Boolean
                | TokenType::VarChar
                | TokenType::Char
                | TokenType::Binary
                | TokenType::VarBinary
                | TokenType::No
                | TokenType::DateTime
                | TokenType::Truncate
                | TokenType::Execute
                | TokenType::Merge
                | TokenType::Top
                | TokenType::Begin
                | TokenType::Generated
                | TokenType::Identity
                | TokenType::Always
                | TokenType::Extract
                // Keywords that can be identifiers in certain contexts
                | TokenType::AsOf
                | TokenType::Prior
                | TokenType::After
                | TokenType::Restrict
                | TokenType::Cascade
                | TokenType::Local
                | TokenType::Rename
                | TokenType::Enum
                | TokenType::Within
                | TokenType::Format
                | TokenType::Final
                | TokenType::FileFormat
                | TokenType::Input
                | TokenType::InputFormat
                | TokenType::Copy
                | TokenType::Put
                | TokenType::Get
                | TokenType::Show
                | TokenType::Serde
                | TokenType::Sample
                | TokenType::Sort
                | TokenType::Collate
                | TokenType::Ties
                | TokenType::IsNull
                | TokenType::NotNull
                | TokenType::Exclude
                | TokenType::Temporary
                | TokenType::Add
                | TokenType::Ordinality
                | TokenType::Overlaps
                | TokenType::Block
                | TokenType::Pattern
                | TokenType::Group
                | TokenType::Cluster
                | TokenType::Repeatable
                | TokenType::Groups
                | TokenType::Commit
                | TokenType::Warehouse
                | TokenType::System
                | TokenType::By
                | TokenType::To
                | TokenType::Fetch
                | TokenType::For
                | TokenType::Only
                | TokenType::Next
                | TokenType::Lock
                | TokenType::Refresh
                | TokenType::Settings
                | TokenType::Operator
                | TokenType::Overwrite
                | TokenType::StraightJoin
                | TokenType::Start
                // Additional keywords registered in tokenizer but previously missing from is_keyword()
                | TokenType::Ignore
                | TokenType::Domain
                | TokenType::Apply
                | TokenType::Respect
                | TokenType::Materialized
                | TokenType::Prewhere
                | TokenType::Old
                | TokenType::New
                | TokenType::Cast
                | TokenType::TryCast
                | TokenType::SafeCast
                | TokenType::Transaction
                | TokenType::Describe
                | TokenType::Kill
                | TokenType::Lambda
                | TokenType::Declare
                | TokenType::Keep
                | TokenType::Output
                | TokenType::Percent
                | TokenType::Qualify
                | TokenType::Returning
                | TokenType::Language
                | TokenType::Preserve
                | TokenType::Savepoint
                | TokenType::Rollback
                | TokenType::Body
                | TokenType::Increment
                | TokenType::Minvalue
                | TokenType::Maxvalue
                | TokenType::Cycle
                | TokenType::NoCycle
                | TokenType::Seed
                | TokenType::Namespace
                | TokenType::Authorization
                | TokenType::Order
                | TokenType::Restart
                | TokenType::Before
                | TokenType::Instead
                | TokenType::Each
                | TokenType::Statement
                | TokenType::Referencing
                | TokenType::Of
                | TokenType::Separator
                | TokenType::Others
                | TokenType::Placing
                | TokenType::Owned
                | TokenType::Running
                | TokenType::Define
                | TokenType::Measures
                | TokenType::MatchRecognize
                | TokenType::AutoIncrement
                | TokenType::Connect
                | TokenType::Distribute
                | TokenType::Bernoulli
                | TokenType::TableSample
                | TokenType::Inpath
                | TokenType::Pragma
                | TokenType::Siblings
                | TokenType::SerdeProperties
                | TokenType::RLike
        )
    }

    /// Check if this token type is a comparison operator
    /// (`=`, `<>`, `<`, `<=`, `>`, `>=`, and the null-safe equality).
    pub fn is_comparison(&self) -> bool {
        matches!(
            self,
            TokenType::Eq
                | TokenType::Neq
                | TokenType::Lt
                | TokenType::Lte
                | TokenType::Gt
                | TokenType::Gte
                | TokenType::NullsafeEq
        )
    }

    /// Check if this token type is an arithmetic operator
    /// (includes the keyword forms `MOD` and `DIV` as well as `%`).
    pub fn is_arithmetic(&self) -> bool {
        matches!(
            self,
            TokenType::Plus
                | TokenType::Dash
                | TokenType::Star
                | TokenType::Slash
                | TokenType::Percent
                | TokenType::Mod
                | TokenType::Div
        )
    }
}
970
971impl fmt::Display for TokenType {
972    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
973        write!(f, "{:?}", self)
974    }
975}
976
977// ── Cached default maps for TokenizerConfig ─────────────────────────────────
978
979static DEFAULT_KEYWORDS: LazyLock<HashMap<String, TokenType>> = LazyLock::new(|| {
980    let mut keywords = HashMap::with_capacity(300);
981    // Add basic SQL keywords
982    keywords.insert("SELECT".to_string(), TokenType::Select);
983    keywords.insert("FROM".to_string(), TokenType::From);
984    keywords.insert("WHERE".to_string(), TokenType::Where);
985    keywords.insert("AND".to_string(), TokenType::And);
986    keywords.insert("OR".to_string(), TokenType::Or);
987    keywords.insert("NOT".to_string(), TokenType::Not);
988    keywords.insert("AS".to_string(), TokenType::As);
989    keywords.insert("ON".to_string(), TokenType::On);
990    keywords.insert("JOIN".to_string(), TokenType::Join);
991    keywords.insert("LEFT".to_string(), TokenType::Left);
992    keywords.insert("RIGHT".to_string(), TokenType::Right);
993    keywords.insert("INNER".to_string(), TokenType::Inner);
994    keywords.insert("OUTER".to_string(), TokenType::Outer);
995    keywords.insert("OUTPUT".to_string(), TokenType::Output);
996    keywords.insert("FULL".to_string(), TokenType::Full);
997    keywords.insert("CROSS".to_string(), TokenType::Cross);
998    keywords.insert("SEMI".to_string(), TokenType::Semi);
999    keywords.insert("ANTI".to_string(), TokenType::Anti);
1000    keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1001    keywords.insert("UNION".to_string(), TokenType::Union);
1002    keywords.insert("EXCEPT".to_string(), TokenType::Except);
1003    keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle/Redshift alias for EXCEPT
1004    keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1005    keywords.insert("GROUP".to_string(), TokenType::Group);
1006    keywords.insert("CUBE".to_string(), TokenType::Cube);
1007    keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1008    keywords.insert("WITHIN".to_string(), TokenType::Within);
1009    keywords.insert("ORDER".to_string(), TokenType::Order);
1010    keywords.insert("BY".to_string(), TokenType::By);
1011    keywords.insert("HAVING".to_string(), TokenType::Having);
1012    keywords.insert("LIMIT".to_string(), TokenType::Limit);
1013    keywords.insert("OFFSET".to_string(), TokenType::Offset);
1014    keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1015    keywords.insert("FETCH".to_string(), TokenType::Fetch);
1016    keywords.insert("FIRST".to_string(), TokenType::First);
1017    keywords.insert("NEXT".to_string(), TokenType::Next);
1018    keywords.insert("ONLY".to_string(), TokenType::Only);
1019    keywords.insert("KEEP".to_string(), TokenType::Keep);
1020    keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1021    keywords.insert("INPUT".to_string(), TokenType::Input);
1022    keywords.insert("CASE".to_string(), TokenType::Case);
1023    keywords.insert("WHEN".to_string(), TokenType::When);
1024    keywords.insert("THEN".to_string(), TokenType::Then);
1025    keywords.insert("ELSE".to_string(), TokenType::Else);
1026    keywords.insert("END".to_string(), TokenType::End);
1027    keywords.insert("ENDIF".to_string(), TokenType::End); // Exasol alias for END
1028    keywords.insert("NULL".to_string(), TokenType::Null);
1029    keywords.insert("TRUE".to_string(), TokenType::True);
1030    keywords.insert("FALSE".to_string(), TokenType::False);
1031    keywords.insert("IS".to_string(), TokenType::Is);
1032    keywords.insert("IN".to_string(), TokenType::In);
1033    keywords.insert("BETWEEN".to_string(), TokenType::Between);
1034    keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1035    keywords.insert("LIKE".to_string(), TokenType::Like);
1036    keywords.insert("ILIKE".to_string(), TokenType::ILike);
1037    keywords.insert("RLIKE".to_string(), TokenType::RLike);
1038    keywords.insert("REGEXP".to_string(), TokenType::RLike);
1039    keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1040    keywords.insert("EXISTS".to_string(), TokenType::Exists);
1041    keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1042    keywords.insert("ALL".to_string(), TokenType::All);
1043    keywords.insert("WITH".to_string(), TokenType::With);
1044    keywords.insert("CREATE".to_string(), TokenType::Create);
1045    keywords.insert("DROP".to_string(), TokenType::Drop);
1046    keywords.insert("ALTER".to_string(), TokenType::Alter);
1047    keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1048    keywords.insert("TABLE".to_string(), TokenType::Table);
1049    keywords.insert("VIEW".to_string(), TokenType::View);
1050    keywords.insert("INDEX".to_string(), TokenType::Index);
1051    keywords.insert("COLUMN".to_string(), TokenType::Column);
1052    keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1053    keywords.insert("ADD".to_string(), TokenType::Add);
1054    keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1055    keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1056    keywords.insert("RENAME".to_string(), TokenType::Rename);
1057    keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1058    keywords.insert("TEMP".to_string(), TokenType::Temporary);
1059    keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1060    keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1061    keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1062    keywords.insert("KEY".to_string(), TokenType::Key);
1063    keywords.insert("KILL".to_string(), TokenType::Kill);
1064    keywords.insert("REFERENCES".to_string(), TokenType::References);
1065    keywords.insert("DEFAULT".to_string(), TokenType::Default);
1066    keywords.insert("DECLARE".to_string(), TokenType::Declare);
1067    keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1068    keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); // Snowflake style
1069    keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1070    keywords.insert("REPLACE".to_string(), TokenType::Replace);
1071    keywords.insert("TO".to_string(), TokenType::To);
1072    keywords.insert("INSERT".to_string(), TokenType::Insert);
1073    keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1074    keywords.insert("UPDATE".to_string(), TokenType::Update);
1075    keywords.insert("USE".to_string(), TokenType::Use);
1076    keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1077    keywords.insert("GLOB".to_string(), TokenType::Glob);
1078    keywords.insert("DELETE".to_string(), TokenType::Delete);
1079    keywords.insert("MERGE".to_string(), TokenType::Merge);
1080    keywords.insert("CACHE".to_string(), TokenType::Cache);
1081    keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1082    keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1083    keywords.insert("GRANT".to_string(), TokenType::Grant);
1084    keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1085    keywords.insert("COMMENT".to_string(), TokenType::Comment);
1086    keywords.insert("COLLATE".to_string(), TokenType::Collate);
1087    keywords.insert("INTO".to_string(), TokenType::Into);
1088    keywords.insert("VALUES".to_string(), TokenType::Values);
1089    keywords.insert("SET".to_string(), TokenType::Set);
1090    keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1091    keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1092    keywords.insert("ASC".to_string(), TokenType::Asc);
1093    keywords.insert("DESC".to_string(), TokenType::Desc);
1094    keywords.insert("NULLS".to_string(), TokenType::Nulls);
1095    keywords.insert("RESPECT".to_string(), TokenType::Respect);
1096    keywords.insert("FIRST".to_string(), TokenType::First);
1097    keywords.insert("LAST".to_string(), TokenType::Last);
1098    keywords.insert("IF".to_string(), TokenType::If);
1099    keywords.insert("CAST".to_string(), TokenType::Cast);
1100    keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1101    keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1102    keywords.insert("OVER".to_string(), TokenType::Over);
1103    keywords.insert("PARTITION".to_string(), TokenType::Partition);
1104    keywords.insert("PLACING".to_string(), TokenType::Placing);
1105    keywords.insert("WINDOW".to_string(), TokenType::Window);
1106    keywords.insert("ROWS".to_string(), TokenType::Rows);
1107    keywords.insert("RANGE".to_string(), TokenType::Range);
1108    keywords.insert("FILTER".to_string(), TokenType::Filter);
1109    keywords.insert("NATURAL".to_string(), TokenType::Natural);
1110    keywords.insert("USING".to_string(), TokenType::Using);
1111    keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1112    keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1113    keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1114    keywords.insert("CURRENT".to_string(), TokenType::Current);
1115    keywords.insert("ROW".to_string(), TokenType::Row);
1116    keywords.insert("GROUPS".to_string(), TokenType::Groups);
1117    keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1118    // TRIM function position keywords
1119    keywords.insert("BOTH".to_string(), TokenType::Both);
1120    keywords.insert("LEADING".to_string(), TokenType::Leading);
1121    keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1122    keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1123    // Phase 3: Additional keywords
1124    keywords.insert("TOP".to_string(), TokenType::Top);
1125    keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1126    keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1127    keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1128    keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1129    keywords.insert("SYSTEM".to_string(), TokenType::System);
1130    keywords.insert("BLOCK".to_string(), TokenType::Block);
1131    keywords.insert("SEED".to_string(), TokenType::Seed);
1132    keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1133    keywords.insert("TIES".to_string(), TokenType::Ties);
1134    keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1135    keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1136    keywords.insert("APPLY".to_string(), TokenType::Apply);
1137    // Oracle CONNECT BY keywords
1138    keywords.insert("CONNECT".to_string(), TokenType::Connect);
1139    // Hive/Spark specific keywords
1140    keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1141    keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1142    keywords.insert("SORT".to_string(), TokenType::Sort);
1143    keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1144    keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1145    keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1146    keywords.insert("FOR".to_string(), TokenType::For);
1147    keywords.insert("ANY".to_string(), TokenType::Any);
1148    keywords.insert("SOME".to_string(), TokenType::Some);
1149    keywords.insert("ASOF".to_string(), TokenType::AsOf);
1150    keywords.insert("PERCENT".to_string(), TokenType::Percent);
1151    keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1152    keywords.insert("NO".to_string(), TokenType::No);
1153    keywords.insert("OTHERS".to_string(), TokenType::Others);
1154    // PostgreSQL OPERATOR() syntax for schema-qualified operators
1155    keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1156    // Phase 4: DDL keywords
1157    keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1158    keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1159    keywords.insert("DATABASE".to_string(), TokenType::Database);
1160    keywords.insert("FUNCTION".to_string(), TokenType::Function);
1161    keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1162    keywords.insert("PROC".to_string(), TokenType::Procedure);
1163    keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1164    keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1165    keywords.insert("TYPE".to_string(), TokenType::Type);
1166    keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1167    keywords.insert("RETURNS".to_string(), TokenType::Returns);
1168    keywords.insert("RETURNING".to_string(), TokenType::Returning);
1169    keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1170    keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1171    keywords.insert("COMMIT".to_string(), TokenType::Commit);
1172    keywords.insert("BEGIN".to_string(), TokenType::Begin);
1173    keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1174    keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1175    keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1176    keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1177    keywords.insert("BODY".to_string(), TokenType::Body);
1178    keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1179    keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1180    keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1181    keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1182    keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1183    keywords.insert("PRIOR".to_string(), TokenType::Prior);
1184    // MATCH_RECOGNIZE keywords
1185    keywords.insert("MATCH".to_string(), TokenType::Match);
1186    keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1187    keywords.insert("MEASURES".to_string(), TokenType::Measures);
1188    keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1189    keywords.insert("DEFINE".to_string(), TokenType::Define);
1190    keywords.insert("RUNNING".to_string(), TokenType::Running);
1191    keywords.insert("FINAL".to_string(), TokenType::Final);
1192    keywords.insert("OWNED".to_string(), TokenType::Owned);
1193    keywords.insert("AFTER".to_string(), TokenType::After);
1194    keywords.insert("BEFORE".to_string(), TokenType::Before);
1195    keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1196    keywords.insert("EACH".to_string(), TokenType::Each);
1197    keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1198    keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1199    keywords.insert("OLD".to_string(), TokenType::Old);
1200    keywords.insert("NEW".to_string(), TokenType::New);
1201    keywords.insert("OF".to_string(), TokenType::Of);
1202    keywords.insert("CHECK".to_string(), TokenType::Check);
1203    keywords.insert("START".to_string(), TokenType::Start);
1204    keywords.insert("ENUM".to_string(), TokenType::Enum);
1205    keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1206    keywords.insert("RESTART".to_string(), TokenType::Restart);
1207    // Date/time literal keywords
1208    keywords.insert("DATE".to_string(), TokenType::Date);
1209    keywords.insert("TIME".to_string(), TokenType::Time);
1210    keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1211    keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1212    keywords.insert("GENERATED".to_string(), TokenType::Generated);
1213    keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1214    keywords.insert("ALWAYS".to_string(), TokenType::Always);
1215    // LOAD DATA keywords
1216    keywords.insert("LOAD".to_string(), TokenType::Load);
1217    keywords.insert("LOCAL".to_string(), TokenType::Local);
1218    keywords.insert("INPATH".to_string(), TokenType::Inpath);
1219    keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1220    keywords.insert("SERDE".to_string(), TokenType::Serde);
1221    keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1222    keywords.insert("FORMAT".to_string(), TokenType::Format);
1223    // SQLite
1224    keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1225    // SHOW statement
1226    keywords.insert("SHOW".to_string(), TokenType::Show);
1227    // Oracle ORDER SIBLINGS BY (hierarchical queries)
1228    keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1229    // COPY and PUT statements (Snowflake, PostgreSQL)
1230    keywords.insert("COPY".to_string(), TokenType::Copy);
1231    keywords.insert("PUT".to_string(), TokenType::Put);
1232    keywords.insert("GET".to_string(), TokenType::Get);
1233    // EXEC/EXECUTE statement (TSQL, etc.)
1234    keywords.insert("EXEC".to_string(), TokenType::Execute);
1235    keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1236    // Postfix null check operators (PostgreSQL/SQLite)
1237    keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1238    keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1239    keywords
1240});
1241
1242static DEFAULT_SINGLE_TOKENS: LazyLock<HashMap<char, TokenType>> = LazyLock::new(|| {
1243    let mut single_tokens = HashMap::with_capacity(30);
1244    single_tokens.insert('(', TokenType::LParen);
1245    single_tokens.insert(')', TokenType::RParen);
1246    single_tokens.insert('[', TokenType::LBracket);
1247    single_tokens.insert(']', TokenType::RBracket);
1248    single_tokens.insert('{', TokenType::LBrace);
1249    single_tokens.insert('}', TokenType::RBrace);
1250    single_tokens.insert(',', TokenType::Comma);
1251    single_tokens.insert('.', TokenType::Dot);
1252    single_tokens.insert(';', TokenType::Semicolon);
1253    single_tokens.insert('+', TokenType::Plus);
1254    single_tokens.insert('-', TokenType::Dash);
1255    single_tokens.insert('*', TokenType::Star);
1256    single_tokens.insert('/', TokenType::Slash);
1257    single_tokens.insert('%', TokenType::Percent);
1258    single_tokens.insert('&', TokenType::Amp);
1259    single_tokens.insert('|', TokenType::Pipe);
1260    single_tokens.insert('^', TokenType::Caret);
1261    single_tokens.insert('~', TokenType::Tilde);
1262    single_tokens.insert('<', TokenType::Lt);
1263    single_tokens.insert('>', TokenType::Gt);
1264    single_tokens.insert('=', TokenType::Eq);
1265    single_tokens.insert('!', TokenType::Exclamation);
1266    single_tokens.insert(':', TokenType::Colon);
1267    single_tokens.insert('@', TokenType::DAt);
1268    single_tokens.insert('#', TokenType::Hash);
1269    single_tokens.insert('$', TokenType::Dollar);
1270    single_tokens.insert('?', TokenType::Parameter);
1271    single_tokens
1272});
1273
static DEFAULT_QUOTES: LazyLock<HashMap<String, String>> = LazyLock::new(|| {
    // Default string-literal delimiters: single quotes, plus triple-quoted
    // strings (e.g. """x"""). Double quotes are identifier quotes, not here.
    [("'", "'"), ("\"\"\"", "\"\"\"")]
        .into_iter()
        .map(|(open, close)| (open.to_string(), close.to_string()))
        .collect()
});
1281
static DEFAULT_IDENTIFIERS: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
    // Default identifier quoting: "name" and `name`.
    // Note: TSQL bracket-quoted identifiers [name] are handled in the parser
    // because [ is also used for arrays and subscripts.
    HashMap::from([('"', '"'), ('`', '`')])
});
1290
static DEFAULT_COMMENTS: LazyLock<HashMap<String, Option<String>>> = LazyLock::new(|| {
    // "--" runs to end of line (no terminator); "/*" is closed by "*/".
    HashMap::from([
        ("--".to_string(), None),
        ("/*".to_string(), Some("*/".to_string())),
    ])
});
1297
/// Tokenizer configuration for a dialect.
///
/// `TokenizerConfig::default()` yields standard-SQL behavior; dialects
/// override individual fields (see the flag docs below for which dialect
/// uses which setting).
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keywords mapping (uppercase keyword -> token type)
    pub keywords: HashMap<String, TokenType>,
    /// Single character tokens
    pub single_tokens: HashMap<char, TokenType>,
    /// Quote characters (start -> end); keys may be multi-character (e.g. `"""`)
    pub quotes: HashMap<String, String>,
    /// Identifier quote characters (start -> end)
    pub identifiers: HashMap<char, char>,
    /// Comment definitions (start -> optional end; `None` means a line comment)
    pub comments: HashMap<String, Option<String>>,
    /// String escape characters
    pub string_escapes: Vec<char>,
    /// Whether to support nested comments
    pub nested_comments: bool,
    /// Valid escape follow characters (for MySQL-style escaping).
    /// When a backslash is followed by a character NOT in this list,
    /// the backslash is discarded. When empty, all backslash escapes
    /// preserve the backslash for unrecognized sequences.
    pub escape_follow_chars: Vec<char>,
    /// Whether b'...' is a byte string (true for BigQuery) or bit string (false for standard SQL).
    /// Default is false (bit string).
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}
    /// Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT)
    pub numeric_literals: HashMap<String, String>,
    /// Whether unquoted identifiers can start with a digit (e.g., `1a`, `1_a`).
    /// When true, a number followed by letters/underscore is treated as an identifier.
    /// Used by Hive, Spark, MySQL, ClickHouse.
    pub identifiers_can_start_with_digit: bool,
    /// Whether 0x/0X prefix should be treated as hex literals.
    /// When true, `0XCC` is tokenized instead of Number("0") + Identifier("XCC").
    /// Used by BigQuery, SQLite, Teradata.
    pub hex_number_strings: bool,
    /// Whether hex string literals from 0x prefix represent integer values.
    /// When true (BigQuery), 0xA is tokenized as HexNumber (integer in hex notation).
    /// When false (SQLite, Teradata), 0xCC is tokenized as HexString (binary/blob value).
    pub hex_string_is_integer_type: bool,
    /// Whether string escape sequences (like \') are allowed in raw strings.
    /// When true (BigQuery default), \' inside r'...' escapes the quote.
    /// When false (Spark/Databricks), backslashes in raw strings are always literal.
    /// Python sqlglot: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True)
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether # starts a single-line comment (ClickHouse, MySQL).
    /// Also enables `//` single-line comments in `skip_whitespace`.
    pub hash_comments: bool,
    /// Whether $ can start/continue an identifier (ClickHouse).
    /// When true, a bare `$` that is not part of a dollar-quoted string or positional
    /// parameter is treated as an identifier character.
    pub dollar_sign_is_identifier: bool,
    /// Whether INSERT ... FORMAT <name> should treat subsequent data as raw (ClickHouse).
    /// When true, after tokenizing `INSERT ... FORMAT <non-VALUES-name>`, all text until
    /// the next blank line or end of input is consumed as a raw data token.
    pub insert_format_raw_data: bool,
    /// Whether numeric literals can contain underscores as digit separators.
    /// When true, `1_000` is tokenized as `1000`. Used by ClickHouse and DuckDB.
    /// Python sqlglot: NUMBERS_CAN_BE_UNDERSCORE_SEPARATED (default False)
    pub numbers_can_be_underscore_separated: bool,
}
1358
1359impl Default for TokenizerConfig {
1360    fn default() -> Self {
1361        Self {
1362            keywords: DEFAULT_KEYWORDS.clone(),
1363            single_tokens: DEFAULT_SINGLE_TOKENS.clone(),
1364            quotes: DEFAULT_QUOTES.clone(),
1365            identifiers: DEFAULT_IDENTIFIERS.clone(),
1366            comments: DEFAULT_COMMENTS.clone(),
1367            // Standard SQL: only '' (doubled quote) escapes a quote
1368            // Backslash escapes are dialect-specific (MySQL, etc.)
1369            string_escapes: vec!['\''],
1370            nested_comments: true,
1371            // By default, no escape_follow_chars means preserve backslash for unrecognized escapes
1372            escape_follow_chars: vec![],
1373            // Default: b'...' is bit string (standard SQL), not byte string (BigQuery)
1374            b_prefix_is_byte_string: false,
1375            numeric_literals: HashMap::new(),
1376            identifiers_can_start_with_digit: false,
1377            hex_number_strings: false,
1378            hex_string_is_integer_type: false,
1379            // Default: backslash escapes ARE allowed in raw strings (sqlglot default)
1380            // Spark/Databricks set this to false
1381            string_escapes_allowed_in_raw_strings: true,
1382            hash_comments: false,
1383            dollar_sign_is_identifier: false,
1384            insert_format_raw_data: false,
1385            numbers_can_be_underscore_separated: false,
1386        }
1387    }
1388}
1389
/// SQL Tokenizer
pub struct Tokenizer {
    /// Dialect-specific settings applied to every `tokenize` call.
    config: TokenizerConfig,
}
1394
1395impl Tokenizer {
1396    /// Create a new tokenizer with the given configuration
1397    pub fn new(config: TokenizerConfig) -> Self {
1398        Self { config }
1399    }
1400
1401    /// Create a tokenizer with default configuration
1402    pub fn default_config() -> Self {
1403        Self::new(TokenizerConfig::default())
1404    }
1405
1406    /// Tokenize a SQL string
1407    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1408        let mut state = TokenizerState::new(sql, &self.config);
1409        state.tokenize()
1410    }
1411}
1412
1413impl Default for Tokenizer {
1414    fn default() -> Self {
1415        Self::default_config()
1416    }
1417}
1418
/// Internal state for tokenization: a single forward pass over `source`.
struct TokenizerState<'a> {
    /// The raw SQL text being tokenized.
    source: &'a str,
    /// True when `source` is all-ASCII, enabling the byte-slice fast path
    /// in `text_from_range` (char index == byte offset for ASCII).
    source_is_ascii: bool,
    /// The source decoded into chars; `start`/`current` index into this.
    chars: Vec<char>,
    /// Total number of chars in `chars`.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Char index where the token currently being scanned begins.
    start: usize,
    /// Char index of the next char to consume.
    current: usize,
    /// Current line (1-based); advanced by `advance()` on '\n'.
    line: usize,
    /// Current column (1-based); reset by `advance()` on '\n'.
    column: usize,
    /// Pending leading comments to attach to the next token.
    comments: Vec<String>,
    /// Dialect configuration driving the scan.
    config: &'a TokenizerConfig,
}
1433
1434impl<'a> TokenizerState<'a> {
1435    fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1436        let chars: Vec<char> = sql.chars().collect();
1437        let size = chars.len();
1438        Self {
1439            source: sql,
1440            source_is_ascii: sql.is_ascii(),
1441            chars,
1442            size,
1443            tokens: Vec::new(),
1444            start: 0,
1445            current: 0,
1446            line: 1,
1447            column: 1,
1448            comments: Vec::new(),
1449            config,
1450        }
1451    }
1452
1453    fn tokenize(&mut self) -> Result<Vec<Token>> {
1454        while !self.is_at_end() {
1455            self.skip_whitespace();
1456            if self.is_at_end() {
1457                break;
1458            }
1459
1460            self.start = self.current;
1461            self.scan_token()?;
1462
1463            // ClickHouse: After INSERT ... FORMAT <name> (where name != VALUES),
1464            // the rest until the next blank line or end of input is raw data.
1465            if self.config.insert_format_raw_data {
1466                if let Some(raw) = self.try_scan_insert_format_raw_data() {
1467                    if !raw.is_empty() {
1468                        self.start = self.current;
1469                        self.add_token_with_text(TokenType::Var, raw);
1470                    }
1471                }
1472            }
1473        }
1474
1475        // Handle leftover leading comments at end of input.
1476        // These are comments on a new line after the last token that couldn't be attached
1477        // as leading comments to a subsequent token (because there is none).
1478        // Attach them as trailing comments on the last token so they're preserved.
1479        if !self.comments.is_empty() {
1480            if let Some(last) = self.tokens.last_mut() {
1481                last.trailing_comments.extend(self.comments.drain(..));
1482            }
1483        }
1484
1485        Ok(std::mem::take(&mut self.tokens))
1486    }
1487
1488    #[inline]
1489    fn is_at_end(&self) -> bool {
1490        self.current >= self.size
1491    }
1492
1493    #[inline]
1494    fn text_from_range(&self, start: usize, end: usize) -> String {
1495        if self.source_is_ascii {
1496            self.source[start..end].to_string()
1497        } else {
1498            self.chars[start..end].iter().collect()
1499        }
1500    }
1501
1502    #[inline]
1503    fn peek(&self) -> char {
1504        if self.is_at_end() {
1505            '\0'
1506        } else {
1507            self.chars[self.current]
1508        }
1509    }
1510
1511    #[inline]
1512    fn peek_next(&self) -> char {
1513        if self.current + 1 >= self.size {
1514            '\0'
1515        } else {
1516            self.chars[self.current + 1]
1517        }
1518    }
1519
1520    #[inline]
1521    fn advance(&mut self) -> char {
1522        let c = self.peek();
1523        self.current += 1;
1524        if c == '\n' {
1525            self.line += 1;
1526            self.column = 1;
1527        } else {
1528            self.column += 1;
1529        }
1530        c
1531    }
1532
    /// Consume whitespace and comments between tokens, routing each comment to
    /// either the previous token (trailing) or the next token (leading).
    ///
    /// NOTE: match-arm order matters here. The `/*` arm is checked before the
    /// dialect-specific `//` arm, and hint comments (`/*+`) break out so the
    /// caller can emit them as tokens instead of swallowing them as trivia.
    fn skip_whitespace(&mut self) {
        // Track whether we've seen a newline since the last token.
        // Comments on a new line (after a newline) are leading comments on the next token,
        // while comments on the same line are trailing comments on the previous token.
        // This matches Python sqlglot's behavior.
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                '\u{00A0}' // non-breaking space
                | '\u{2000}'..='\u{200B}' // various Unicode spaces + zero-width space
                | '\u{3000}' // ideographic (full-width) space
                | '\u{FEFF}' // BOM / zero-width no-break space
                => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    // ClickHouse: // single-line comments (same dialects that support # comments)
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // Check if this is a hint comment /*+ ... */
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        // This is a hint comment, handle it as a token instead of skipping
                        break;
                    }
                    // NOTE(review): a tokenize error from an unterminated block
                    // comment is discarded here; the scan simply stops at end of
                    // input — confirm this silent truncation is intended.
                    if self.scan_block_comment(saw_newline).is_err() {
                        return;
                    }
                    // Don't reset saw_newline - it carries forward
                }
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Dialect-specific // line comment (e.g., Snowflake)
                    // But NOT inside URIs like file:// or paths with consecutive slashes
                    // Check that previous non-whitespace char is not ':' or '/'
                    let prev_non_ws = if self.current > 0 {
                        // Walk back over spaces/tabs only (not newlines) to find
                        // the character immediately preceding the "//" on this line.
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        // This is likely a URI (file://, http://) or path, not a comment
                        break;
                    }
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                _ => break,
            }
        }
    }
1604
1605    fn scan_hash_line_comment(&mut self) {
1606        self.advance(); // #
1607        let start = self.current;
1608        while !self.is_at_end() && self.peek() != '\n' {
1609            self.advance();
1610        }
1611        let comment = self.text_from_range(start, self.current);
1612        let comment_text = comment.trim().to_string();
1613        if let Some(last) = self.tokens.last_mut() {
1614            last.trailing_comments.push(comment_text);
1615        } else {
1616            self.comments.push(comment_text);
1617        }
1618    }
1619
1620    fn scan_double_slash_comment(&mut self) {
1621        self.advance(); // /
1622        self.advance(); // /
1623        let start = self.current;
1624        while !self.is_at_end() && self.peek() != '\n' {
1625            self.advance();
1626        }
1627        let comment = self.text_from_range(start, self.current);
1628        let comment_text = comment.trim().to_string();
1629        if let Some(last) = self.tokens.last_mut() {
1630            last.trailing_comments.push(comment_text);
1631        } else {
1632            self.comments.push(comment_text);
1633        }
1634    }
1635
1636    fn scan_line_comment(&mut self, after_newline: bool) {
1637        self.advance(); // -
1638        self.advance(); // -
1639        let start = self.current;
1640        while !self.is_at_end() && self.peek() != '\n' {
1641            self.advance();
1642        }
1643        let comment_text = self.text_from_range(start, self.current);
1644
1645        // If the comment starts on a new line (after_newline), it's a leading comment
1646        // on the next token. Otherwise, it's a trailing comment on the previous token.
1647        if after_newline || self.tokens.is_empty() {
1648            self.comments.push(comment_text);
1649        } else if let Some(last) = self.tokens.last_mut() {
1650            last.trailing_comments.push(comment_text);
1651        }
1652    }
1653
    /// Scan a `/* ... */` block comment, honoring nesting when
    /// `config.nested_comments` is set, and attach it as a leading or
    /// trailing comment based on `after_newline`.
    ///
    /// # Errors
    /// Returns a tokenize error if the comment is never terminated.
    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // /
        self.advance(); // *
        let content_start = self.current;
        // Nesting depth; 1 = inside the outermost comment.
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                // Subtle: the outermost `*/` is intentionally NOT consumed
                // here, so the content capture below stops right before it.
                if depth > 0 {
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        // Get the content between /* and */ (preserving internal whitespace for nested comments)
        let content = self.text_from_range(content_start, self.current);
        self.advance(); // *
        self.advance(); // /

        // For round-trip fidelity, preserve the exact comment content including nested comments
        let comment_text = format!("/*{}*/", content);

        // If the comment starts on a new line (after_newline), it's a leading comment
        // on the next token. Otherwise, it's a trailing comment on the previous token.
        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }

        Ok(())
    }
1704
1705    /// Scan a hint comment /*+ ... */ and return it as a Hint token
1706    fn scan_hint(&mut self) -> Result<()> {
1707        self.advance(); // /
1708        self.advance(); // *
1709        self.advance(); // +
1710        let hint_start = self.current;
1711
1712        // Scan until we find */
1713        while !self.is_at_end() {
1714            if self.peek() == '*' && self.peek_next() == '/' {
1715                break;
1716            }
1717            self.advance();
1718        }
1719
1720        if self.is_at_end() {
1721            return Err(Error::tokenize(
1722                "Unterminated hint comment",
1723                self.line,
1724                self.column,
1725                self.start,
1726                self.current,
1727            ));
1728        }
1729
1730        let hint_text = self.text_from_range(hint_start, self.current);
1731        self.advance(); // *
1732        self.advance(); // /
1733
1734        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1735
1736        Ok(())
1737    }
1738
1739    /// Scan a positional parameter: $1, $2, etc.
1740    fn scan_positional_parameter(&mut self) -> Result<()> {
1741        self.advance(); // consume $
1742        let start = self.current;
1743
1744        while !self.is_at_end() && self.peek().is_ascii_digit() {
1745            self.advance();
1746        }
1747
1748        let number = self.text_from_range(start, self.current);
1749        self.add_token_with_text(TokenType::Parameter, number);
1750        Ok(())
1751    }
1752
    /// Try to scan a tagged dollar-quoted string: $tag$content$tag$
    /// Returns Some(()) if successful, None if this isn't a tagged dollar string.
    ///
    /// The token text is stored as "tag\x00content" to preserve the tag for later use
    /// (see `parse_dollar_string_token` for the inverse decoding).
    ///
    /// On failure (no closing `$` after the tag, or no closing `$tag$` before
    /// end of input) the scan position is restored so the caller can retry
    /// the `$` under a different rule (identifier, `$$` string, parameter).
    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
        let saved_pos = self.current;

        // We're at '$'; the caller guarantees the next char is alphanumeric,
        // '_' or non-ASCII (tags may contain Unicode, e.g. $🦆$).
        self.advance(); // consume opening $

        // Scan the tag (identifier: alphanumeric + underscore, including Unicode)
        // Tags can contain Unicode characters like emojis (e.g., $🦆$)
        let tag_start = self.current;
        while !self.is_at_end()
            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
        {
            self.advance();
        }
        let tag = self.text_from_range(tag_start, self.current);

        // Must have a closing $ after the tag
        if self.is_at_end() || self.peek() != '$' {
            // Not a tagged dollar string - restore position
            // NOTE(review): only `current` is rewound here; if advance() also
            // tracks line/column, those counters are not restored and the
            // re-scan may double-count them — confirm against advance().
            self.current = saved_pos;
            return Ok(None);
        }
        self.advance(); // consume closing $ of opening tag

        // Now scan content until we find $tag$
        let content_start = self.current;
        let closing_tag = format!("${}$", tag);
        let closing_chars: Vec<char> = closing_tag.chars().collect();

        loop {
            if self.is_at_end() {
                // Unterminated - restore and fall through
                // NOTE(review): same partial-rewind caveat as above.
                self.current = saved_pos;
                return Ok(None);
            }

            // Check if we've reached the closing tag.
            // The length pre-check avoids comparing past end-of-input; the
            // per-index bound inside the closure is a redundant safety net.
            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
                    self.current + j < self.size && self.chars[self.current + j] == ch
                });
                if matches {
                    let content = self.text_from_range(content_start, self.current);
                    // Consume closing tag
                    for _ in 0..closing_chars.len() {
                        self.advance();
                    }
                    // Store as "tag\x00content" to preserve the tag
                    let token_text = format!("{}\x00{}", tag, content);
                    self.add_token_with_text(TokenType::DollarString, token_text);
                    return Ok(Some(()));
                }
            }
            self.advance();
        }
    }
1813
1814    /// Scan a dollar-quoted string: $$content$$ or $tag$content$tag$
1815    ///
1816    /// For $$...$$ (no tag), the token text is just the content.
1817    /// For $tag$...$tag$, use try_scan_tagged_dollar_string instead.
1818    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1819        self.advance(); // consume first $
1820        self.advance(); // consume second $
1821
1822        // For $$...$$ (no tag), just scan until closing $$
1823        let start = self.current;
1824        while !self.is_at_end() {
1825            if self.peek() == '$'
1826                && self.current + 1 < self.size
1827                && self.chars[self.current + 1] == '$'
1828            {
1829                break;
1830            }
1831            self.advance();
1832        }
1833
1834        let content = self.text_from_range(start, self.current);
1835
1836        if !self.is_at_end() {
1837            self.advance(); // consume first $
1838            self.advance(); // consume second $
1839        }
1840
1841        self.add_token_with_text(TokenType::DollarString, content);
1842        Ok(())
1843    }
1844
    /// Dispatch on the current character and scan exactly one token.
    ///
    /// NOTE: the order of the checks below is load-bearing — triple quotes
    /// are tried before single quotes, double-quoted strings before
    /// identifier quotes, dot-numbers before single-char tokens, tagged
    /// dollar strings before `$$`/`$N`, and multi-char operators before
    /// single-char tokens. Reordering branches changes tokenization.
    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        // Check for string literal
        if c == '\'' {
            // Check for triple-quoted string '''...''' if configured
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        // Check for triple-quoted string """...""" if configured
        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        // Check for double-quoted strings when dialect supports them (e.g., BigQuery)
        // This must come before identifier quotes check
        if c == '"'
            && self.config.quotes.contains_key("\"")
            && !self.config.identifiers.contains_key(&'"')
        {
            return self.scan_double_quoted_string();
        }

        // Check for identifier quotes
        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        // Check for numbers (including numbers starting with a dot like .25)
        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // Check for numbers starting with a dot (e.g., .25, .5)
        // This must come before single character token handling
        // Don't treat as a number if:
        // - Previous char was also a dot (e.g., 1..2 should be 1, ., ., 2)
        // - Previous char is an identifier character (e.g., foo.25 should be foo, ., 25)
        //   This handles BigQuery numeric table parts like project.dataset.25
        if c == '.' && self.peek_next().is_ascii_digit() {
            // '\0' stands in for "no previous char" at start of input.
            let prev_char = if self.current > 0 {
                self.chars[self.current - 1]
            } else {
                '\0'
            };
            // Closing quote chars (`, ", ]) and ')' also count as "end of an
            // identifier-like thing", so `t.25` after them stays a dot access.
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // Check for hint comment /*+ ... */
        if c == '/'
            && self.peek_next() == '*'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '+'
        {
            return self.scan_hint();
        }

        // Check for multi-character operators first
        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // Check for tagged dollar-quoted strings: $tag$content$tag$
        // Tags can contain Unicode characters (including emojis like 🦆) and digits (e.g., $1$)
        if c == '$'
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || !self.peek_next().is_ascii())
        {
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
            // If tagged dollar string didn't match and dollar_sign_is_identifier is set,
            // treat the $ and following chars as an identifier (e.g., ClickHouse $alias$name$).
            if self.config.dollar_sign_is_identifier {
                return self.scan_dollar_identifier();
            }
        }

        // Check for dollar-quoted strings: $$...$$
        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // Check for positional parameters: $1, $2, etc.
        // (Reached only when the tagged-dollar attempt above failed, since
        // a digit after `$` also satisfies is_alphanumeric.)
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        // ClickHouse: bare $ (not followed by alphanumeric/underscore) as identifier
        if c == '$' && self.config.dollar_sign_is_identifier {
            return self.scan_dollar_identifier();
        }

        // TSQL: Check for identifiers starting with # (temp tables) or @ (variables)
        // e.g., #temp, ##global_temp, @variable
        if (c == '#' || c == '@')
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || self.peek_next() == '#')
        {
            return self.scan_tsql_identifier();
        }

        // Check for single character tokens
        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        // Unicode minus (U+2212) → treat as regular minus
        if c == '\u{2212}' {
            self.advance();
            self.add_token(TokenType::Dash);
            return Ok(());
        }

        // Unicode fraction slash (U+2044) → treat as regular slash
        if c == '\u{2044}' {
            self.advance();
            self.add_token(TokenType::Slash);
            return Ok(());
        }

        // Unicode curly/smart quotes → treat as regular string quotes
        if c == '\u{2018}' || c == '\u{2019}' {
            // Left/right single quotation marks → scan as string with matching end
            return self.scan_unicode_quoted_string(c);
        }
        if c == '\u{201C}' || c == '\u{201D}' {
            // Left/right double quotation marks → scan as quoted identifier
            return self.scan_unicode_quoted_identifier(c);
        }

        // Must be an identifier or keyword
        self.scan_identifier_or_keyword()
    }
2004
    /// Try to scan a multi-character operator at the current position.
    ///
    /// Returns `Some(token_type)` with the position advanced past the
    /// operator, or `None` with the position unchanged. Longer operators
    /// must be checked before their prefixes (four- and three-char forms
    /// before two-char forms), so the order of the checks below matters.
    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
        let c = self.peek();
        let next = self.peek_next();
        // '\0' stands in for "past end of input" lookahead.
        let third = if self.current + 2 < self.size {
            self.chars[self.current + 2]
        } else {
            '\0'
        };

        // Check for three-character operators first
        // -|- (Adjacent - PostgreSQL range adjacency)
        if c == '-' && next == '|' && third == '-' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::Adjacent);
        }

        // ||/ (Cube root - PostgreSQL)
        if c == '|' && next == '|' && third == '/' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DPipeSlash);
        }

        // #>> (JSONB path text extraction - PostgreSQL)
        if c == '#' && next == '>' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DHashArrow);
        }

        // ->> (JSON text extraction - PostgreSQL/MySQL)
        if c == '-' && next == '>' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DArrow);
        }

        // <=> (NULL-safe equality - MySQL)
        if c == '<' && next == '=' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NullsafeEq);
        }

        // <-> (Distance operator - PostgreSQL)
        if c == '<' && next == '-' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::LrArrow);
        }

        // <@ (Contained by - PostgreSQL)
        if c == '<' && next == '@' {
            self.advance();
            self.advance();
            return Some(TokenType::LtAt);
        }

        // @> (Contains - PostgreSQL)
        if c == '@' && next == '>' {
            self.advance();
            self.advance();
            return Some(TokenType::AtGt);
        }

        // ~~~ (Glob - PostgreSQL)
        if c == '~' && next == '~' && third == '~' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::Glob);
        }

        // ~~* (ILike - PostgreSQL)
        if c == '~' && next == '~' && third == '*' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::ILike);
        }

        // !~~* (Not ILike - PostgreSQL) — the only four-char operator here.
        let fourth = if self.current + 3 < self.size {
            self.chars[self.current + 3]
        } else {
            '\0'
        };
        if c == '!' && next == '~' && third == '~' && fourth == '*' {
            self.advance();
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotILike);
        }

        // !~~ (Not Like - PostgreSQL)
        if c == '!' && next == '~' && third == '~' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotLike);
        }

        // !~* (Not Regexp ILike - PostgreSQL)
        if c == '!' && next == '~' && third == '*' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotIRLike);
        }

        // !:> (Not cast / Try cast - SingleStore)
        if c == '!' && next == ':' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NColonGt);
        }

        // ?:: (TRY_CAST shorthand - Databricks)
        if c == '?' && next == ':' && third == ':' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::QDColon);
        }

        // !~ (Not Regexp - PostgreSQL)
        if c == '!' && next == '~' {
            self.advance();
            self.advance();
            return Some(TokenType::NotRLike);
        }

        // ~~ (Like - PostgreSQL)
        if c == '~' && next == '~' {
            self.advance();
            self.advance();
            return Some(TokenType::Like);
        }

        // ~* (Regexp ILike - PostgreSQL)
        if c == '~' && next == '*' {
            self.advance();
            self.advance();
            return Some(TokenType::IRLike);
        }

        // SingleStore three-character JSON path operators (must be checked before :: two-char)
        // ::$ (JSON extract string), ::% (JSON extract double), ::? (JSON match)
        if c == ':' && next == ':' && third == '$' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonDollar);
        }
        if c == ':' && next == ':' && third == '%' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonPercent);
        }
        if c == ':' && next == ':' && third == '?' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonQMark);
        }

        // Two-character operators
        let token_type = match (c, next) {
            ('.', ':') => Some(TokenType::DotColon),
            ('=', '=') => Some(TokenType::Eq), // Hive/Spark == equality operator
            ('<', '=') => Some(TokenType::Lte),
            ('>', '=') => Some(TokenType::Gte),
            ('!', '=') => Some(TokenType::Neq),
            ('<', '>') => Some(TokenType::Neq),
            ('^', '=') => Some(TokenType::Neq),
            ('<', '<') => Some(TokenType::LtLt),
            ('>', '>') => Some(TokenType::GtGt),
            ('|', '|') => Some(TokenType::DPipe),
            ('|', '/') => Some(TokenType::PipeSlash), // Square root - PostgreSQL
            (':', ':') => Some(TokenType::DColon),
            (':', '=') => Some(TokenType::ColonEq), // := (assignment, named args)
            (':', '>') => Some(TokenType::ColonGt), // :> (ColonGt cast operator)
            ('-', '>') => Some(TokenType::Arrow),   // JSON object access
            ('=', '>') => Some(TokenType::FArrow),  // Fat arrow (lambda)
            ('&', '&') => Some(TokenType::DAmp),
            ('&', '<') => Some(TokenType::AmpLt), // PostgreSQL range operator
            ('&', '>') => Some(TokenType::AmpGt), // PostgreSQL range operator
            ('@', '@') => Some(TokenType::AtAt),  // Text search match
            ('?', '|') => Some(TokenType::QMarkPipe), // JSONB contains any key
            ('?', '&') => Some(TokenType::QMarkAmp), // JSONB contains all keys
            ('?', '?') => Some(TokenType::DQMark), // Double question mark
            ('#', '>') => Some(TokenType::HashArrow), // JSONB path extraction
            ('#', '-') => Some(TokenType::HashDash), // JSONB delete
            ('^', '@') => Some(TokenType::CaretAt), // PostgreSQL starts-with operator
            ('*', '*') => Some(TokenType::DStar), // Power operator
            ('|', '>') => Some(TokenType::PipeGt), // Pipe-greater (some dialects)
            _ => None,
        };

        // Two-char match succeeded: consume both chars.
        if token_type.is_some() {
            self.advance();
            self.advance();
        }

        token_type
    }
2221
2222    fn scan_string(&mut self) -> Result<()> {
2223        self.advance(); // Opening quote
2224        let mut value = String::new();
2225
2226        while !self.is_at_end() {
2227            let c = self.peek();
2228            if c == '\'' {
2229                if self.peek_next() == '\'' {
2230                    // Escaped quote
2231                    value.push('\'');
2232                    self.advance();
2233                    self.advance();
2234                } else {
2235                    break;
2236                }
2237            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2238                // Handle escape sequences
2239                self.advance(); // Consume the backslash
2240                if !self.is_at_end() {
2241                    let escaped = self.advance();
2242                    match escaped {
2243                        'n' => value.push('\n'),
2244                        'r' => value.push('\r'),
2245                        't' => value.push('\t'),
2246                        '0' => value.push('\0'),
2247                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2248                        'a' => value.push('\x07'), // Alert/bell
2249                        'b' => value.push('\x08'), // Backspace
2250                        'f' => value.push('\x0C'), // Form feed
2251                        'v' => value.push('\x0B'), // Vertical tab
2252                        'x' => {
2253                            // Hex escape: \xNN (exactly 2 hex digits)
2254                            let mut hex = String::with_capacity(2);
2255                            for _ in 0..2 {
2256                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2257                                    hex.push(self.advance());
2258                                }
2259                            }
2260                            if hex.len() == 2 {
2261                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2262                                    value.push(byte as char);
2263                                } else {
2264                                    value.push('\\');
2265                                    value.push('x');
2266                                    value.push_str(&hex);
2267                                }
2268                            } else {
2269                                // Not enough hex digits, preserve literally
2270                                value.push('\\');
2271                                value.push('x');
2272                                value.push_str(&hex);
2273                            }
2274                        }
2275                        '\\' => value.push('\\'),
2276                        '\'' => value.push('\''),
2277                        '"' => value.push('"'),
2278                        '%' => {
2279                            // MySQL: \% in LIKE patterns
2280                            value.push('%');
2281                        }
2282                        '_' => {
2283                            // MySQL: \_ in LIKE patterns
2284                            value.push('_');
2285                        }
2286                        // For unrecognized escape sequences:
2287                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2288                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2289                        _ => {
2290                            if !self.config.escape_follow_chars.is_empty() {
2291                                // MySQL-style: discard backslash for unrecognized escapes
2292                                value.push(escaped);
2293                            } else {
2294                                // Standard: preserve backslash + char
2295                                value.push('\\');
2296                                value.push(escaped);
2297                            }
2298                        }
2299                    }
2300                }
2301            } else {
2302                value.push(self.advance());
2303            }
2304        }
2305
2306        if self.is_at_end() {
2307            return Err(Error::tokenize(
2308                "Unterminated string",
2309                self.line,
2310                self.column,
2311                self.start,
2312                self.current,
2313            ));
2314        }
2315
2316        self.advance(); // Closing quote
2317        self.add_token_with_text(TokenType::String, value);
2318        Ok(())
2319    }
2320
2321    /// Scan a double-quoted string (for dialects like BigQuery where " is a string delimiter)
2322    fn scan_double_quoted_string(&mut self) -> Result<()> {
2323        self.advance(); // Opening quote
2324        let mut value = String::new();
2325
2326        while !self.is_at_end() {
2327            let c = self.peek();
2328            if c == '"' {
2329                if self.peek_next() == '"' {
2330                    // Escaped quote
2331                    value.push('"');
2332                    self.advance();
2333                    self.advance();
2334                } else {
2335                    break;
2336                }
2337            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2338                // Handle escape sequences
2339                self.advance(); // Consume the backslash
2340                if !self.is_at_end() {
2341                    let escaped = self.advance();
2342                    match escaped {
2343                        'n' => value.push('\n'),
2344                        'r' => value.push('\r'),
2345                        't' => value.push('\t'),
2346                        '0' => value.push('\0'),
2347                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2348                        'a' => value.push('\x07'), // Alert/bell
2349                        'b' => value.push('\x08'), // Backspace
2350                        'f' => value.push('\x0C'), // Form feed
2351                        'v' => value.push('\x0B'), // Vertical tab
2352                        'x' => {
2353                            // Hex escape: \xNN (exactly 2 hex digits)
2354                            let mut hex = String::with_capacity(2);
2355                            for _ in 0..2 {
2356                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2357                                    hex.push(self.advance());
2358                                }
2359                            }
2360                            if hex.len() == 2 {
2361                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2362                                    value.push(byte as char);
2363                                } else {
2364                                    value.push('\\');
2365                                    value.push('x');
2366                                    value.push_str(&hex);
2367                                }
2368                            } else {
2369                                // Not enough hex digits, preserve literally
2370                                value.push('\\');
2371                                value.push('x');
2372                                value.push_str(&hex);
2373                            }
2374                        }
2375                        '\\' => value.push('\\'),
2376                        '\'' => value.push('\''),
2377                        '"' => value.push('"'),
2378                        '%' => {
2379                            // MySQL: \% in LIKE patterns
2380                            value.push('%');
2381                        }
2382                        '_' => {
2383                            // MySQL: \_ in LIKE patterns
2384                            value.push('_');
2385                        }
2386                        // For unrecognized escape sequences:
2387                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2388                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2389                        _ => {
2390                            if !self.config.escape_follow_chars.is_empty() {
2391                                // MySQL-style: discard backslash for unrecognized escapes
2392                                value.push(escaped);
2393                            } else {
2394                                // Standard: preserve backslash + char
2395                                value.push('\\');
2396                                value.push(escaped);
2397                            }
2398                        }
2399                    }
2400                }
2401            } else {
2402                value.push(self.advance());
2403            }
2404        }
2405
2406        if self.is_at_end() {
2407            return Err(Error::tokenize(
2408                "Unterminated double-quoted string",
2409                self.line,
2410                self.column,
2411                self.start,
2412                self.current,
2413            ));
2414        }
2415
2416        self.advance(); // Closing quote
2417        self.add_token_with_text(TokenType::String, value);
2418        Ok(())
2419    }
2420
2421    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2422        // Advance past the three opening quotes
2423        self.advance();
2424        self.advance();
2425        self.advance();
2426        let mut value = String::new();
2427
2428        while !self.is_at_end() {
2429            // Check for closing triple quote
2430            if self.peek() == quote_char
2431                && self.current + 1 < self.size
2432                && self.chars[self.current + 1] == quote_char
2433                && self.current + 2 < self.size
2434                && self.chars[self.current + 2] == quote_char
2435            {
2436                // Found closing """
2437                break;
2438            }
2439            value.push(self.advance());
2440        }
2441
2442        if self.is_at_end() {
2443            return Err(Error::tokenize(
2444                "Unterminated triple-quoted string",
2445                self.line,
2446                self.column,
2447                self.start,
2448                self.current,
2449            ));
2450        }
2451
2452        // Advance past the three closing quotes
2453        self.advance();
2454        self.advance();
2455        self.advance();
2456        let token_type = if quote_char == '"' {
2457            TokenType::TripleDoubleQuotedString
2458        } else {
2459            TokenType::TripleSingleQuotedString
2460        };
2461        self.add_token_with_text(token_type, value);
2462        Ok(())
2463    }
2464
2465    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2466        self.advance(); // Opening quote
2467        let mut value = String::new();
2468
2469        loop {
2470            if self.is_at_end() {
2471                return Err(Error::tokenize(
2472                    "Unterminated identifier",
2473                    self.line,
2474                    self.column,
2475                    self.start,
2476                    self.current,
2477                ));
2478            }
2479            if self.peek() == end_quote {
2480                if self.peek_next() == end_quote {
2481                    // Escaped quote (e.g., "" inside "x""y") -> store single quote
2482                    value.push(end_quote);
2483                    self.advance(); // skip first quote
2484                    self.advance(); // skip second quote
2485                } else {
2486                    // End of identifier
2487                    break;
2488                }
2489            } else {
2490                value.push(self.peek());
2491                self.advance();
2492            }
2493        }
2494
2495        self.advance(); // Closing quote
2496        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2497        Ok(())
2498    }
2499
2500    /// Scan a string delimited by Unicode curly single quotes (U+2018/U+2019).
2501    /// Content between curly quotes is literal (no escape processing).
2502    /// When opened with \u{2018} (left), close with \u{2019} (right) only.
2503    /// When opened with \u{2019} (right), close with \u{2019} (right) — self-closing.
2504    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2505        self.advance(); // Opening curly quote
2506        let start = self.current;
2507        // Determine closing quote: left opens -> right closes; right opens -> right closes
2508        let close_quote = if open_quote == '\u{2018}' {
2509            '\u{2019}' // left opens, right closes
2510        } else {
2511            '\u{2019}' // right quote also closes with right quote
2512        };
2513        while !self.is_at_end() && self.peek() != close_quote {
2514            self.advance();
2515        }
2516        let value = self.text_from_range(start, self.current);
2517        if !self.is_at_end() {
2518            self.advance(); // Closing quote
2519        }
2520        self.add_token_with_text(TokenType::String, value);
2521        Ok(())
2522    }
2523
2524    /// Scan an identifier delimited by Unicode curly double quotes (U+201C/U+201D).
2525    /// When opened with \u{201C} (left), close with \u{201D} (right) only.
2526    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2527        self.advance(); // Opening curly quote
2528        let start = self.current;
2529        let close_quote = if open_quote == '\u{201C}' {
2530            '\u{201D}' // left opens, right closes
2531        } else {
2532            '\u{201D}' // right also closes with right
2533        };
2534        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2535            self.advance();
2536        }
2537        let value = self.text_from_range(start, self.current);
2538        if !self.is_at_end() {
2539            self.advance(); // Closing quote
2540        }
2541        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2542        Ok(())
2543    }
2544
    /// Scan a numeric literal beginning at a digit.
    ///
    /// Dialect-dependent forms handled here:
    /// - `0x`/`0X` prefixes (`hex_number_strings`): hex floats (`0x1.8p3`),
    ///   hex integers (`hex_string_is_integer_type`), or hex blob strings
    ///   (SQLite/Teradata)
    /// - underscore digit separators (`20_000`); stripped from the token text
    ///   only when `numbers_can_be_underscore_separated` is set
    /// - decimals with a trailing dot (`1.`) and exponents (`1e10`)
    /// - numeric type suffixes from `config.numeric_literals` (e.g. `1L`,
    ///   `1BD`), emitted as a Number token with text `"<number>::<TYPE>"`
    ///   for the parser to split
    /// - identifiers that start with a digit (`1a`) when
    ///   `identifiers_can_start_with_digit` is set
    fn scan_number(&mut self) -> Result<()> {
        // Check for 0x/0X hex number prefix (SQLite-style)
        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
            // Peek one past '0' without advancing ('\0' sentinel at end of input).
            let next = if self.current + 1 < self.size {
                self.chars[self.current + 1]
            } else {
                '\0'
            };
            if next == 'x' || next == 'X' {
                // Advance past '0' and 'x'/'X'
                self.advance();
                self.advance();
                // Collect hex digits (allow underscores as separators, e.g., 0xbad_cafe)
                let hex_start = self.current;
                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
                    // Underscores must sit between hex digits; a trailing '_' ends the literal.
                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
                        break;
                    }
                    self.advance();
                }
                if self.current > hex_start {
                    // Check for hex float: 0xABC.DEFpEXP or 0xABCpEXP
                    let mut is_hex_float = false;
                    // Optional fractional part: .hexdigits
                    if !self.is_at_end() && self.peek() == '.' {
                        let after_dot = if self.current + 1 < self.size {
                            self.chars[self.current + 1]
                        } else {
                            '\0'
                        };
                        // Only treat '.' as fractional if a hex digit follows;
                        // otherwise leave it for the caller (e.g. member access / operator).
                        if after_dot.is_ascii_hexdigit() {
                            is_hex_float = true;
                            self.advance(); // consume '.'
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                self.advance();
                            }
                        }
                    }
                    // Optional binary exponent: p/P [+/-] digits
                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
                        is_hex_float = true;
                        self.advance(); // consume p/P
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
                            self.advance();
                        }
                        // Exponent digits are decimal, not hex.
                        while !self.is_at_end() && self.peek().is_ascii_digit() {
                            self.advance();
                        }
                    }
                    if is_hex_float {
                        // Hex float literal — emit as regular Number token with full text
                        // (full text includes the "0x" prefix, from self.start).
                        let raw_text = self.text_from_range(self.start, self.current);
                        let full_text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
                            raw_text.replace('_', "")
                        } else {
                            raw_text
                        };
                        self.add_token_with_text(TokenType::Number, full_text);
                    } else if self.config.hex_string_is_integer_type {
                        // BigQuery/ClickHouse: 0xA represents an integer in hex notation
                        // (token text is the digits only, without the "0x" prefix).
                        let raw_value = self.text_from_range(hex_start, self.current);
                        let hex_value = if self.config.numbers_can_be_underscore_separated && raw_value.contains('_') {
                            raw_value.replace('_', "")
                        } else {
                            raw_value
                        };
                        self.add_token_with_text(TokenType::HexNumber, hex_value);
                    } else {
                        // SQLite/Teradata: 0xCC represents a binary/blob hex string
                        let raw_value = self.text_from_range(hex_start, self.current);
                        let hex_value = if self.config.numbers_can_be_underscore_separated && raw_value.contains('_') {
                            raw_value.replace('_', "")
                        } else {
                            raw_value
                        };
                        self.add_token_with_text(TokenType::HexString, hex_value);
                    }
                    return Ok(());
                }
                // No hex digits after 0x - fall through to normal number parsing
                // (reset current back to after '0')
                self.current = self.start + 1;
            }
        }

        // Allow underscores as digit separators (e.g., 20_000, 1_000_000)
        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            // Don't allow underscore at the end (must be followed by digit)
            // (the inner is_at_end() re-check is redundant here but harmless)
            if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
                break;
            }
            self.advance();
        }

        // Look for decimal part - allow trailing dot (e.g., "1.")
        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
        // So we always consume the dot as part of the number, even if followed by an identifier
        if self.peek() == '.' {
            let next = self.peek_next();
            // Only consume the dot if:
            // 1. Followed by a digit (normal decimal like 1.5)
            // 2. Followed by an identifier start (like 1.x -> becomes 1. with alias x)
            // 3. End of input or other non-dot character (trailing decimal like "1.")
            // Do NOT consume if it's a double dot (..) which is a range operator
            if next != '.' {
                self.advance(); // consume the .
                                // Only consume digits after the decimal point (not identifiers)
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                        break;
                    }
                    self.advance();
                }
            }
        }

        // Look for exponent
        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let raw_text = self.text_from_range(self.start, self.current);
        // Strip underscore digit separators (e.g., 20_000 -> 20000, 1_2E+1_0 -> 12E+10)
        // Only for dialects that support this (ClickHouse, DuckDB)
        let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
            raw_text.replace('_', "")
        } else {
            raw_text
        };

        // Check for numeric literal suffixes (e.g., 1L -> BIGINT, 1s -> SMALLINT in Hive/Spark)
        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
            let next_char: String = self.peek().to_ascii_uppercase().to_string();
            // Try 2-char suffix first (e.g., "BD"), then 1-char
            let suffix_match = if self.current + 1 < self.size {
                let two_char: String = [
                    self.chars[self.current].to_ascii_uppercase(),
                    self.chars[self.current + 1].to_ascii_uppercase(),
                ]
                .iter()
                .collect();
                if self.config.numeric_literals.contains_key(&two_char) {
                    // Make sure the 2-char suffix is not followed by more identifier chars
                    let after_suffix = if self.current + 2 < self.size {
                        self.chars[self.current + 2]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((two_char, 2))
                    } else {
                        None
                    }
                } else if self.config.numeric_literals.contains_key(&next_char) {
                    // 1-char suffix - make sure not followed by more identifier chars
                    let after_suffix = if self.current + 1 < self.size {
                        self.chars[self.current + 1]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((next_char, 1))
                    } else {
                        None
                    }
                } else {
                    None
                }
            } else if self.config.numeric_literals.contains_key(&next_char) {
                // At end of input, 1-char suffix
                Some((next_char, 1))
            } else {
                None
            };

            if let Some((suffix, len)) = suffix_match {
                // Consume the suffix characters
                for _ in 0..len {
                    self.advance();
                }
                // Emit as a special number-with-suffix token
                // We'll encode as "number::TYPE" so the parser can split it
                let type_name = self
                    .config
                    .numeric_literals
                    .get(&suffix)
                    .expect("suffix verified by contains_key above")
                    .clone();
                let combined = format!("{}::{}", text, type_name);
                self.add_token_with_text(TokenType::Number, combined);
                return Ok(());
            }
        }

        // Check for identifiers that start with a digit (e.g., 1a, 1_a, 1a_1a)
        // In Hive/Spark/MySQL/ClickHouse, these are valid unquoted identifiers
        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
            let next = self.peek();
            if next.is_alphabetic() || next == '_' {
                // Continue scanning as an identifier
                while !self.is_at_end() {
                    let ch = self.peek();
                    if ch.is_alphanumeric() || ch == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                // Identifier text spans from the first digit (self.start).
                let ident_text = self.text_from_range(self.start, self.current);
                self.add_token_with_text(TokenType::Identifier, ident_text);
                return Ok(());
            }
        }

        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }
2771
2772    /// Scan a number that starts with a dot (e.g., .25, .5, .123e10)
2773    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2774        // Consume the leading dot
2775        self.advance();
2776
2777        // Consume the fractional digits
2778        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2779            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2780                break;
2781            }
2782            self.advance();
2783        }
2784
2785        // Look for exponent
2786        if self.peek() == 'e' || self.peek() == 'E' {
2787            self.advance();
2788            if self.peek() == '+' || self.peek() == '-' {
2789                self.advance();
2790            }
2791            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2792                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2793                    break;
2794                }
2795                self.advance();
2796            }
2797        }
2798
2799        let raw_text = self.text_from_range(self.start, self.current);
2800        // Strip underscore digit separators (e.g., .1_5 -> .15)
2801        // Only for dialects that support this (ClickHouse, DuckDB)
2802        let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2803            raw_text.replace('_', "")
2804        } else {
2805            raw_text
2806        };
2807        self.add_token_with_text(TokenType::Number, text);
2808        Ok(())
2809    }
2810
2811    /// Look up a keyword using a stack buffer for ASCII uppercasing, avoiding heap allocation.
2812    /// Returns `TokenType::Var` for texts longer than 128 bytes or non-UTF-8 results.
2813    #[inline]
2814    fn lookup_keyword_ascii(keywords: &HashMap<String, TokenType>, text: &str) -> TokenType {
2815        if text.len() > 128 {
2816            return TokenType::Var;
2817        }
2818        let mut buf = [0u8; 128];
2819        for (i, b) in text.bytes().enumerate() {
2820            buf[i] = b.to_ascii_uppercase();
2821        }
2822        if let Ok(upper) = std::str::from_utf8(&buf[..text.len()]) {
2823            keywords.get(upper).copied().unwrap_or(TokenType::Var)
2824        } else {
2825            TokenType::Var
2826        }
2827    }
2828
    /// Scan an identifier or keyword, including dialect string prefixes.
    ///
    /// After consuming identifier characters this recognizes, in order:
    /// - `NOT=` (Teradata) as a single `Neq` token
    /// - raw strings `r'...'` / `r"..."`, including triple-quoted forms
    /// - prefixed strings: `N'...'`, `E'...'`, `X'...'`, `B'...'` / `b"..."`,
    ///   and `U&'...'`
    /// - otherwise a keyword lookup, falling back to `TokenType::Var`
    ///
    /// Returns an error for an unrecognized leading character (which is
    /// consumed so tokenization cannot loop forever).
    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
        // Guard against unrecognized characters that could cause infinite loops
        let first_char = self.peek();
        if !first_char.is_alphanumeric() && first_char != '_' {
            // Unknown character - skip it and return an error
            let c = self.advance();
            return Err(Error::tokenize(
                format!("Unexpected character: '{}'", c),
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        while !self.is_at_end() {
            let c = self.peek();
            // Allow alphanumeric, underscore, $, # and @ in identifiers
            // PostgreSQL allows $, TSQL allows # and @
            // But stop consuming # if followed by > or >> (PostgreSQL #> and #>> operators)
            if c == '#' {
                let next_c = if self.current + 1 < self.size {
                    self.chars[self.current + 1]
                } else {
                    '\0'
                };
                if next_c == '>' || next_c == '-' {
                    break; // Don't consume # — it's part of #>, #>>, or #- operator
                }
                self.advance();
            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text = self.text_from_range(self.start, self.current);

        // Special-case NOT= (Teradata and other dialects)
        if text.eq_ignore_ascii_case("NOT") && self.peek() == '=' {
            self.advance(); // consume '='
            self.add_token(TokenType::Neq);
            return Ok(());
        }

        // Check for special string prefixes like N'...', X'...', B'...', U&'...', r'...', b'...'
        // Also handle double-quoted variants for dialects that support them (e.g., BigQuery)
        let next_char = self.peek();
        let is_single_quote = next_char == '\'';
        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
        // For raw strings (r"..." or r'...'), we allow double quotes even if " is not in quotes config
        // because raw strings are a special case used in Spark/Databricks where " is for identifiers
        let is_double_quote_for_raw = next_char == '"';

        // Handle raw strings first - they're special because they work with both ' and "
        // even in dialects where " is normally an identifier delimiter (like Databricks)
        if text.eq_ignore_ascii_case("R") && (is_single_quote || is_double_quote_for_raw) {
            // Raw string r'...' or r"..." or r'''...''' or r"""...""" (BigQuery style)
            // In raw strings, backslashes are treated literally (no escape processing)
            let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the first opening quote

            // Check for triple-quoted raw string (r"""...""" or r'''...''')
            if self.peek() == quote_char && self.peek_next() == quote_char {
                // Triple-quoted raw string
                self.advance(); // consume second quote
                self.advance(); // consume third quote
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            } else {
                let string_value = self.scan_raw_string_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            }
            return Ok(());
        }

        if is_single_quote || is_double_quote {
            if text.eq_ignore_ascii_case("N") {
                // National string N'...'
                self.advance(); // consume the opening quote
                let string_value = if is_single_quote {
                    self.scan_string_content()?
                } else {
                    self.scan_double_quoted_string_content()?
                };
                self.add_token_with_text(TokenType::NationalString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("E") {
                // PostgreSQL escape string E'...' or e'...'
                // Preserve the case by prefixing with "e:" or "E:"
                // Always use backslash escapes for escape strings (e.g., \' is an escaped quote)
                let lowercase = text == "e";
                let prefix = if lowercase { "e:" } else { "E:" };
                self.advance(); // consume the opening quote
                let string_value = self.scan_string_content_with_escapes(true)?;
                self.add_token_with_text(
                    TokenType::EscapeString,
                    format!("{}{}", prefix, string_value),
                );
                return Ok(());
            } else if text.eq_ignore_ascii_case("X") {
                // Hex string X'...'
                self.advance(); // consume the opening quote
                let string_value = if is_single_quote {
                    self.scan_string_content()?
                } else {
                    self.scan_double_quoted_string_content()?
                };
                self.add_token_with_text(TokenType::HexString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("B") && is_double_quote {
                // Byte string b"..." (BigQuery style) - MUST check before single quote B'...'
                self.advance(); // consume the opening quote
                let string_value = self.scan_double_quoted_string_content()?;
                self.add_token_with_text(TokenType::ByteString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("B") && is_single_quote {
                // For BigQuery: b'...' is a byte string (bytes data)
                // For standard SQL: B'...' is a bit string (binary digits)
                self.advance(); // consume the opening quote
                let string_value = self.scan_string_content()?;
                if self.config.b_prefix_is_byte_string {
                    self.add_token_with_text(TokenType::ByteString, string_value);
                } else {
                    self.add_token_with_text(TokenType::BitString, string_value);
                }
                return Ok(());
            }
        }

        // Check for U&'...' Unicode string syntax (SQL standard)
        if text.eq_ignore_ascii_case("U")
            && self.peek() == '&'
            && self.current + 1 < self.size
            && self.chars[self.current + 1] == '\''
        {
            self.advance(); // consume '&'
            self.advance(); // consume opening quote
            let string_value = self.scan_string_content()?;
            self.add_token_with_text(TokenType::UnicodeString, string_value);
            return Ok(());
        }

        let token_type = Self::lookup_keyword_ascii(&self.config.keywords, &text);

        self.add_token_with_text(token_type, text);
        Ok(())
    }
2978
2979    /// Scan string content (everything between quotes)
2980    /// If `force_backslash_escapes` is true, backslash is always treated as an escape character
2981    /// (used for PostgreSQL E'...' escape strings)
2982    fn scan_string_content_with_escapes(
2983        &mut self,
2984        force_backslash_escapes: bool,
2985    ) -> Result<String> {
2986        let mut value = String::new();
2987        let use_backslash_escapes =
2988            force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2989
2990        while !self.is_at_end() {
2991            let c = self.peek();
2992            if c == '\'' {
2993                if self.peek_next() == '\'' {
2994                    // Escaped quote ''
2995                    value.push('\'');
2996                    self.advance();
2997                    self.advance();
2998                } else {
2999                    break;
3000                }
3001            } else if c == '\\' && use_backslash_escapes {
3002                // Preserve escape sequences literally (including \' for escape strings)
3003                value.push(self.advance());
3004                if !self.is_at_end() {
3005                    value.push(self.advance());
3006                }
3007            } else {
3008                value.push(self.advance());
3009            }
3010        }
3011
3012        if self.is_at_end() {
3013            return Err(Error::tokenize(
3014                "Unterminated string",
3015                self.line,
3016                self.column,
3017                self.start,
3018                self.current,
3019            ));
3020        }
3021
3022        self.advance(); // Closing quote
3023        Ok(value)
3024    }
3025
    /// Scan single-quoted string content with the dialect's default escape
    /// behavior (backslash escapes only when `\` is in `config.string_escapes`).
    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }
3030
    /// Scan double-quoted string content (for dialects like BigQuery where " is a string delimiter)
    /// This is used for prefixed strings like b"..." or N"..."
    ///
    /// Unlike `scan_string_content_with_escapes`, recognized backslash escapes
    /// (\n, \r, \t, \0, \\, \", \', \xNN) are *decoded* here rather than kept
    /// verbatim; unrecognized escapes are preserved as backslash + char.
    /// Errors with "Unterminated double-quoted string" at end of input.
    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Escaped quote ""
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                // Handle escape sequences
                self.advance(); // Consume backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        '\\' => value.push('\\'),
                        '"' => value.push('"'),
                        '\'' => value.push('\''),
                        'x' => {
                            // Hex escape \xNN - collect hex digits
                            let mut hex = String::new();
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                // NOTE(review): `byte as char` maps values > 0x7F to the
                                // Unicode code point of that value, which re-encodes as
                                // multi-byte UTF-8 rather than the raw byte — confirm
                                // this is the intended behavior for \xNN escapes.
                                value.push(byte as char);
                            } else {
                                // Invalid hex escape, keep it literal
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        _ => {
                            // For unrecognized escapes, preserve backslash + char
                            value.push('\\');
                            value.push(escaped);
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // Closing quote
        Ok(value)
    }
3103
3104    /// Scan raw string content (limited escape processing for quotes)
3105    /// Used for BigQuery r'...' and r"..." strings
3106    /// In raw strings, backslashes are literal EXCEPT that escape sequences for the
3107    /// quote character still work (e.g., \' in r'...' escapes the quote, '' also works)
3108    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3109        let mut value = String::new();
3110
3111        while !self.is_at_end() {
3112            let c = self.peek();
3113            if c == quote_char {
3114                if self.peek_next() == quote_char {
3115                    // Escaped quote (doubled) - e.g., '' inside r'...'
3116                    value.push(quote_char);
3117                    self.advance();
3118                    self.advance();
3119                } else {
3120                    break;
3121                }
3122            } else if c == '\\'
3123                && self.peek_next() == quote_char
3124                && self.config.string_escapes_allowed_in_raw_strings
3125            {
3126                // Backslash-escaped quote - works in raw strings when string_escapes_allowed_in_raw_strings is true
3127                // e.g., \' inside r'...' becomes literal ' (BigQuery behavior)
3128                // Spark/Databricks has this set to false, so backslash is always literal there
3129                value.push(quote_char);
3130                self.advance(); // consume backslash
3131                self.advance(); // consume quote
3132            } else {
3133                // In raw strings, everything including backslashes is literal
3134                value.push(self.advance());
3135            }
3136        }
3137
3138        if self.is_at_end() {
3139            return Err(Error::tokenize(
3140                "Unterminated raw string",
3141                self.line,
3142                self.column,
3143                self.start,
3144                self.current,
3145            ));
3146        }
3147
3148        self.advance(); // Closing quote
3149        Ok(value)
3150    }
3151
3152    /// Scan raw triple-quoted string content (r"""...""" or r'''...''')
3153    /// Terminates when three consecutive quote_chars are found
3154    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3155        let mut value = String::new();
3156
3157        while !self.is_at_end() {
3158            let c = self.peek();
3159            if c == quote_char && self.peek_next() == quote_char {
3160                // Check for third quote
3161                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3162                    // Found three consecutive quotes - end of string
3163                    self.advance(); // first closing quote
3164                    self.advance(); // second closing quote
3165                    self.advance(); // third closing quote
3166                    return Ok(value);
3167                }
3168            }
3169            // In raw strings, everything including backslashes is literal
3170            let ch = self.advance();
3171            value.push(ch);
3172        }
3173
3174        Err(Error::tokenize(
3175            "Unterminated raw triple-quoted string",
3176            self.line,
3177            self.column,
3178            self.start,
3179            self.current,
3180        ))
3181    }
3182
3183    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables)
3184    /// Examples: #temp, ##global_temp, @variable
3185    /// Scan an identifier that starts with `$` (ClickHouse).
3186    /// Examples: `$alias$name$`, `$x`
3187    fn scan_dollar_identifier(&mut self) -> Result<()> {
3188        // Consume the leading $
3189        self.advance();
3190
3191        // Consume alphanumeric, _, and $ continuation chars
3192        while !self.is_at_end() {
3193            let c = self.peek();
3194            if c.is_alphanumeric() || c == '_' || c == '$' {
3195                self.advance();
3196            } else {
3197                break;
3198            }
3199        }
3200
3201        let text = self.text_from_range(self.start, self.current);
3202        self.add_token_with_text(TokenType::Var, text);
3203        Ok(())
3204    }
3205
3206    fn scan_tsql_identifier(&mut self) -> Result<()> {
3207        // Consume the leading # or @ (or ##)
3208        let first = self.advance();
3209
3210        // For ##, consume the second #
3211        if first == '#' && self.peek() == '#' {
3212            self.advance();
3213        }
3214
3215        // Now scan the rest of the identifier
3216        while !self.is_at_end() {
3217            let c = self.peek();
3218            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3219                self.advance();
3220            } else {
3221                break;
3222            }
3223        }
3224
3225        let text = self.text_from_range(self.start, self.current);
3226        // These are always identifiers (variables or temp table names), never keywords
3227        self.add_token_with_text(TokenType::Var, text);
3228        Ok(())
3229    }
3230
    /// Check if the last tokens match INSERT ... FORMAT <name> (not VALUES).
    /// If so, consume everything until the next blank line (two consecutive newlines)
    /// or end of input as raw data.
    ///
    /// Returns `Some(trimmed_raw_data)` when raw data was consumed, or `None`
    /// when the token context does not match or the trailing data is empty.
    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
        let len = self.tokens.len();
        // Need at least INSERT ... FORMAT <name>, i.e. three tokens
        if len < 3 {
            return None;
        }

        // Last token should be the format name (Identifier or Var, not VALUES)
        let last = &self.tokens[len - 1];
        if last.text.eq_ignore_ascii_case("VALUES") {
            return None;
        }
        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
            return None;
        }

        // Second-to-last should be FORMAT
        let format_tok = &self.tokens[len - 2];
        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
            return None;
        }

        // Check that there's an INSERT somewhere earlier in the tokens
        // (bounded lookbehind of 20 tokens keeps this check cheap)
        let has_insert = self.tokens[..len - 2]
            .iter()
            .rev()
            .take(20)
            .any(|t| t.token_type == TokenType::Insert);
        if !has_insert {
            return None;
        }

        // We're in INSERT ... FORMAT <name> context. Consume everything until:
        // - A blank line (two consecutive newlines, possibly with \r between)
        // - End of input
        let raw_start = self.current;
        while !self.is_at_end() {
            let c = self.peek();
            if c == '\n' {
                // Check for blank line: \n followed by optional \r and \n
                let saved = self.current;
                self.advance(); // consume first \n
                                // Skip \r if present
                while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                if self.is_at_end() || self.peek() == '\n' {
                    // Found blank line or end of input - stop here
                    // Don't consume the second \n so subsequent SQL can be tokenized
                    let raw = self.text_from_range(raw_start, saved);
                    return Some(raw.trim().to_string());
                }
                // Not a blank line, continue scanning
            } else {
                self.advance();
            }
        }

        // Reached end of input without a blank line: everything scanned is data
        let raw = self.text_from_range(raw_start, self.current);
        let trimmed = raw.trim().to_string();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed)
        }
    }
3300
3301    fn add_token(&mut self, token_type: TokenType) {
3302        let text = self.text_from_range(self.start, self.current);
3303        self.add_token_with_text(token_type, text);
3304    }
3305
3306    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3307        let span = Span::new(self.start, self.current, self.line, self.column);
3308        let mut token = Token::new(token_type, text, span);
3309        token.comments.append(&mut self.comments);
3310        self.tokens.push(token);
3311    }
3312}
3313
3314#[cfg(test)]
3315mod tests {
3316    use super::*;
3317
3318    #[test]
3319    fn test_simple_select() {
3320        let tokenizer = Tokenizer::default();
3321        let tokens = tokenizer.tokenize("SELECT 1").unwrap();
3322
3323        assert_eq!(tokens.len(), 2);
3324        assert_eq!(tokens[0].token_type, TokenType::Select);
3325        assert_eq!(tokens[1].token_type, TokenType::Number);
3326        assert_eq!(tokens[1].text, "1");
3327    }
3328
3329    #[test]
3330    fn test_select_with_identifier() {
3331        let tokenizer = Tokenizer::default();
3332        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();
3333
3334        assert_eq!(tokens.len(), 6);
3335        assert_eq!(tokens[0].token_type, TokenType::Select);
3336        assert_eq!(tokens[1].token_type, TokenType::Var);
3337        assert_eq!(tokens[1].text, "a");
3338        assert_eq!(tokens[2].token_type, TokenType::Comma);
3339        assert_eq!(tokens[3].token_type, TokenType::Var);
3340        assert_eq!(tokens[3].text, "b");
3341        assert_eq!(tokens[4].token_type, TokenType::From);
3342        assert_eq!(tokens[5].token_type, TokenType::Var);
3343        assert_eq!(tokens[5].text, "t");
3344    }
3345
3346    #[test]
3347    fn test_string_literal() {
3348        let tokenizer = Tokenizer::default();
3349        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();
3350
3351        assert_eq!(tokens.len(), 2);
3352        assert_eq!(tokens[1].token_type, TokenType::String);
3353        assert_eq!(tokens[1].text, "hello");
3354    }
3355
3356    #[test]
3357    fn test_escaped_string() {
3358        let tokenizer = Tokenizer::default();
3359        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();
3360
3361        assert_eq!(tokens.len(), 2);
3362        assert_eq!(tokens[1].token_type, TokenType::String);
3363        assert_eq!(tokens[1].text, "it's");
3364    }
3365
3366    #[test]
3367    fn test_comments() {
3368        let tokenizer = Tokenizer::default();
3369        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();
3370
3371        assert_eq!(tokens.len(), 2);
3372        // Comments are attached to the PREVIOUS token as trailing_comments
3373        // This is better for round-trip fidelity (e.g., SELECT c /* comment */ FROM)
3374        assert_eq!(tokens[0].trailing_comments.len(), 1);
3375        assert_eq!(tokens[0].trailing_comments[0], " comment");
3376    }
3377
3378    #[test]
3379    fn test_comment_in_and_chain() {
3380        use crate::generator::Generator;
3381        use crate::parser::Parser;
3382
3383        // Line comments between AND clauses should appear after the AND operator
3384        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
3385        let ast = Parser::parse_sql(sql).unwrap();
3386        let mut gen = Generator::default();
3387        let output = gen.generate(&ast[0]).unwrap();
3388        assert_eq!(
3389            output,
3390            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
3391        );
3392    }
3393
3394    #[test]
3395    fn test_operators() {
3396        let tokenizer = Tokenizer::default();
3397        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();
3398
3399        assert_eq!(tokens.len(), 5);
3400        assert_eq!(tokens[0].token_type, TokenType::Number);
3401        assert_eq!(tokens[1].token_type, TokenType::Plus);
3402        assert_eq!(tokens[2].token_type, TokenType::Number);
3403        assert_eq!(tokens[3].token_type, TokenType::Star);
3404        assert_eq!(tokens[4].token_type, TokenType::Number);
3405    }
3406
3407    #[test]
3408    fn test_comparison_operators() {
3409        let tokenizer = Tokenizer::default();
3410        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();
3411
3412        assert_eq!(tokens[1].token_type, TokenType::Lte);
3413        assert_eq!(tokens[3].token_type, TokenType::Gte);
3414        assert_eq!(tokens[5].token_type, TokenType::Neq);
3415    }
3416
3417    #[test]
3418    fn test_national_string() {
3419        let tokenizer = Tokenizer::default();
3420        let tokens = tokenizer.tokenize("N'abc'").unwrap();
3421
3422        assert_eq!(
3423            tokens.len(),
3424            1,
3425            "Expected 1 token for N'abc', got {:?}",
3426            tokens
3427        );
3428        assert_eq!(tokens[0].token_type, TokenType::NationalString);
3429        assert_eq!(tokens[0].text, "abc");
3430    }
3431
3432    #[test]
3433    fn test_hex_string() {
3434        let tokenizer = Tokenizer::default();
3435        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();
3436
3437        assert_eq!(
3438            tokens.len(),
3439            1,
3440            "Expected 1 token for X'ABCD', got {:?}",
3441            tokens
3442        );
3443        assert_eq!(tokens[0].token_type, TokenType::HexString);
3444        assert_eq!(tokens[0].text, "ABCD");
3445    }
3446
3447    #[test]
3448    fn test_bit_string() {
3449        let tokenizer = Tokenizer::default();
3450        let tokens = tokenizer.tokenize("B'01010'").unwrap();
3451
3452        assert_eq!(
3453            tokens.len(),
3454            1,
3455            "Expected 1 token for B'01010', got {:?}",
3456            tokens
3457        );
3458        assert_eq!(tokens[0].token_type, TokenType::BitString);
3459        assert_eq!(tokens[0].text, "01010");
3460    }
3461
3462    #[test]
3463    fn test_trailing_dot_number() {
3464        let tokenizer = Tokenizer::default();
3465
3466        // Test trailing dot
3467        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
3468        assert_eq!(
3469            tokens.len(),
3470            2,
3471            "Expected 2 tokens for 'SELECT 1.', got {:?}",
3472            tokens
3473        );
3474        assert_eq!(tokens[1].token_type, TokenType::Number);
3475        assert_eq!(tokens[1].text, "1.");
3476
3477        // Test normal decimal
3478        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
3479        assert_eq!(tokens[1].text, "1.5");
3480
3481        // Test number followed by dot and identifier
3482        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
3483        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
3484        assert_eq!(
3485            tokens.len(),
3486            3,
3487            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
3488            tokens
3489        );
3490        assert_eq!(tokens[1].token_type, TokenType::Number);
3491        assert_eq!(tokens[1].text, "1.");
3492        assert_eq!(tokens[2].token_type, TokenType::Var);
3493
3494        // Test two dots (range operator) - dot is NOT consumed when followed by another dot
3495        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
3496        assert_eq!(tokens[1].token_type, TokenType::Number);
3497        assert_eq!(tokens[1].text, "1");
3498        assert_eq!(tokens[2].token_type, TokenType::Dot);
3499        assert_eq!(tokens[3].token_type, TokenType::Dot);
3500        assert_eq!(tokens[4].token_type, TokenType::Number);
3501        assert_eq!(tokens[4].text, "2");
3502    }
3503
3504    #[test]
3505    fn test_leading_dot_number() {
3506        let tokenizer = Tokenizer::default();
3507
3508        // Test leading dot number (e.g., .25 for 0.25)
3509        let tokens = tokenizer.tokenize(".25").unwrap();
3510        assert_eq!(
3511            tokens.len(),
3512            1,
3513            "Expected 1 token for '.25', got {:?}",
3514            tokens
3515        );
3516        assert_eq!(tokens[0].token_type, TokenType::Number);
3517        assert_eq!(tokens[0].text, ".25");
3518
3519        // Test leading dot in context (Oracle SAMPLE clause)
3520        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
3521        assert_eq!(
3522            tokens.len(),
3523            4,
3524            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
3525            tokens
3526        );
3527        assert_eq!(tokens[0].token_type, TokenType::Sample);
3528        assert_eq!(tokens[1].token_type, TokenType::LParen);
3529        assert_eq!(tokens[2].token_type, TokenType::Number);
3530        assert_eq!(tokens[2].text, ".25");
3531        assert_eq!(tokens[3].token_type, TokenType::RParen);
3532
3533        // Test leading dot with exponent
3534        let tokens = tokenizer.tokenize(".5e10").unwrap();
3535        assert_eq!(
3536            tokens.len(),
3537            1,
3538            "Expected 1 token for '.5e10', got {:?}",
3539            tokens
3540        );
3541        assert_eq!(tokens[0].token_type, TokenType::Number);
3542        assert_eq!(tokens[0].text, ".5e10");
3543
3544        // Test that plain dot is still a Dot token
3545        let tokens = tokenizer.tokenize("a.b").unwrap();
3546        assert_eq!(
3547            tokens.len(),
3548            3,
3549            "Expected 3 tokens for 'a.b', got {:?}",
3550            tokens
3551        );
3552        assert_eq!(tokens[1].token_type, TokenType::Dot);
3553    }
3554
3555    #[test]
3556    fn test_unrecognized_character() {
3557        let tokenizer = Tokenizer::default();
3558
3559        // Unicode curly quotes are now handled as string delimiters
3560        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
3561        assert!(
3562            result.is_ok(),
3563            "Curly quotes should be tokenized as strings"
3564        );
3565
3566        // Unicode bullet character should still error
3567        let result = tokenizer.tokenize("SELECT • FROM t");
3568        assert!(result.is_err());
3569    }
3570
3571    #[test]
3572    fn test_colon_eq_tokenization() {
3573        let tokenizer = Tokenizer::default();
3574
3575        // := should be a single ColonEq token
3576        let tokens = tokenizer.tokenize("a := 1").unwrap();
3577        assert_eq!(tokens.len(), 3);
3578        assert_eq!(tokens[0].token_type, TokenType::Var);
3579        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
3580        assert_eq!(tokens[2].token_type, TokenType::Number);
3581
3582        // : followed by non-= should still be Colon
3583        let tokens = tokenizer.tokenize("a:b").unwrap();
3584        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
3585        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
3586
3587        // :: should still be DColon
3588        let tokens = tokenizer.tokenize("a::INT").unwrap();
3589        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
3590    }
3591
3592    #[test]
3593    fn test_colon_eq_parsing() {
3594        use crate::generator::Generator;
3595        use crate::parser::Parser;
3596
3597        // MySQL @var := value in SELECT
3598        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
3599            .expect("Failed to parse MySQL @var := expr");
3600        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3601        assert_eq!(output, "SELECT @var1 := 1, @var2");
3602
3603        // MySQL @var := @var in SELECT
3604        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
3605            .expect("Failed to parse MySQL @var2 := @var1");
3606        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3607        assert_eq!(output, "SELECT @var1, @var2 := @var1");
3608
3609        // MySQL @var := COUNT(*)
3610        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
3611            .expect("Failed to parse MySQL @var := COUNT(*)");
3612        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3613        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");
3614
3615        // MySQL SET @var := 1 (should normalize to = in output)
3616        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
3617        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3618        assert_eq!(output, "SET @var1 = 1");
3619
3620        // Function named args with :=
3621        let ast =
3622            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
3623        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3624        assert_eq!(output, "UNION_VALUE(k1 := 1)");
3625
3626        // UNNEST with recursive := TRUE
3627        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
3628            .expect("Failed to parse UNNEST with :=");
3629        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3630        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");
3631
3632        // DuckDB prefix alias: foo: 1 means 1 AS foo
3633        let ast =
3634            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
3635        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3636        assert_eq!(output, "SELECT 1 AS foo");
3637
3638        // DuckDB prefix alias with multiple columns
3639        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
3640            .expect("Failed to parse DuckDB multiple prefix aliases");
3641        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3642        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
3643    }
3644
    #[test]
    fn test_colon_eq_dialect_roundtrip() {
        use crate::dialects::{Dialect, DialectType};

        // Parse -> transform -> generate under `dialect`. An `expected` of
        // None means the output must equal the input (pure roundtrip).
        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
            let d = Dialect::get(dialect);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // MySQL := tests
        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
        // SET rewrites := to plain =
        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

        // DuckDB := tests
        check(
            DialectType::DuckDB,
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
            None,
        );
        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

        // STRUCT_PACK(a := 'b')::json should at least parse without error
        // (The STRUCT_PACK -> Struct transformation is a separate feature)
        {
            let d = Dialect::get(DialectType::DuckDB);
            let ast = d
                .parse("STRUCT_PACK(a := 'b')::json")
                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
        }

        // DuckDB prefix alias tests
        check(
            DialectType::DuckDB,
            "SELECT foo: 1",
            Some("SELECT 1 AS foo"),
        );
        check(
            DialectType::DuckDB,
            "SELECT foo: 1, bar: 2, baz: 3",
            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
        );
    }
3701
    #[test]
    fn test_comment_roundtrip() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // Returns None on success, Some(description) on any parse/generate
        // failure or output mismatch, so every case is collected and all
        // failures are reported together at the end.
        fn check_roundtrip(sql: &str) -> Option<String> {
            let ast = match Parser::parse_sql(sql) {
                Ok(a) => a,
                Err(e) => return Some(format!("Parse error: {:?}", e)),
            };
            if ast.is_empty() {
                return Some("Empty AST".to_string());
            }
            let mut generator = Generator::default();
            let output = match generator.generate(&ast[0]) {
                Ok(o) => o,
                Err(e) => return Some(format!("Gen error: {:?}", e)),
            };
            if output == sql {
                None
            } else {
                Some(format!(
                    "Mismatch:\n  input:  {}\n  output: {}",
                    sql, output
                ))
            }
        }

        let tests = vec![
            // Nested comments
            "SELECT c /* c1 /* c2 */ c3 */",
            "SELECT c /* c1 /* c2 /* c3 */ */ */",
            // Simple alias with comments
            "SELECT c /* c1 */ AS alias /* c2 */",
            // Multiple columns with comments
            "SELECT a /* x */, b /* x */",
            // Multiple comments after column
            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
            // FROM tables with comments
            "SELECT * FROM foo /* x */, bla /* x */",
            // Arithmetic with comments
            "SELECT 1 /* comment */ + 1",
            "SELECT 1 /* c1 */ + 2 /* c2 */",
            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
            // CAST with comments
            "SELECT CAST(x AS INT) /* comment */ FROM foo",
            // Function arguments with comments
            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
            // Multi-part table names with comments
            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
            // INSERT with comments
            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
            // Leading comments on statements
            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
            "/* comment */ CREATE TABLE foo AS SELECT 1",
            // Trailing comments on statements
            "INSERT INTO foo SELECT * FROM bar /* comment */",
            // Complex nested expressions with comments
            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
        ];

        let mut failures = Vec::new();
        for sql in tests {
            if let Some(e) = check_roundtrip(sql) {
                failures.push(e);
            }
        }

        if !failures.is_empty() {
            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
        }
    }
3777
    #[test]
    fn test_dollar_quoted_string_parsing() {
        use crate::dialects::{Dialect, DialectType};

        // Test dollar string token parsing utility function.
        // Tagged form: text before the '\x00' separator is the tag.
        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
        assert_eq!(tag, Some("FOO".to_string()));
        assert_eq!(content, "content here");

        // Untagged form: no separator means no tag, whole text is content.
        let (tag, content) = super::parse_dollar_string_token("just content");
        assert_eq!(tag, None);
        assert_eq!(content, "just content");

        // Test roundtrip for Databricks dialect with dollar-quoted function body.
        // An `expected` of None means the output must equal the input exactly.
        fn check_databricks(sql: &str, expected: Option<&str>) {
            let d = Dialect::get(DialectType::Databricks);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // Test [42]: $$...$$ heredoc
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n  return x+1$$",
            None
        );

        // Test [43]: $FOO$...$FOO$ tagged heredoc
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n  return x+1$FOO$",
            None
        );
    }
3820
3821    #[test]
3822    fn test_numeric_underscore_stripping() {
3823        // Underscore stripping only happens when numbers_can_be_underscore_separated is true
3824        let mut config = TokenizerConfig::default();
3825        config.numbers_can_be_underscore_separated = true;
3826        let tokenizer = Tokenizer::new(config);
3827
3828        // Simple integer with underscores
3829        let tokens = tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
3830        assert_eq!(tokens[1].token_type, TokenType::Number);
3831        assert_eq!(tokens[1].text, "12345");
3832
3833        // Thousands separator
3834        let tokens = tokenizer.tokenize("SELECT 20_000").unwrap();
3835        assert_eq!(tokens[1].token_type, TokenType::Number);
3836        assert_eq!(tokens[1].text, "20000");
3837
3838        // Scientific notation with underscores
3839        let tokens = tokenizer.tokenize("SELECT 1_2E+1_0").unwrap();
3840        assert_eq!(tokens[1].token_type, TokenType::Number);
3841        assert_eq!(tokens[1].text, "12E+10");
3842
3843        // Default tokenizer should NOT strip underscores
3844        let default_tokenizer = Tokenizer::default();
3845        let tokens = default_tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
3846        assert_eq!(tokens[1].token_type, TokenType::Number);
3847        assert_eq!(tokens[1].text, "1_2_3_4_5");
3848    }
3849}