// polyglot_sql/tokens.rs
//! Token types and tokenization for SQL parsing
//!
//! This module defines all SQL token types and the tokenizer that converts
//! SQL strings into token streams.

use crate::error::{Error, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
use std::sync::LazyLock;
#[cfg(feature = "bindings")]
use ts_rs::TS;
13
/// Parse a DollarString token text into (tag, content).
///
/// The tokenizer encodes a tagged dollar-quoted string as `tag '\x00' content`;
/// an untagged string is stored as the bare content. This splits on the first
/// NUL byte: everything before it is the tag, everything after is the content.
/// With no NUL present, the whole text is the content and the tag is `None`.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_string()), content.to_string()),
        None => (None, text.to_string()),
    }
}
26
/// Represents a position in the source SQL
///
/// `start`/`end` are byte offsets (not char indices), while `line` and
/// `column` are 1-based for human-readable diagnostics. The derived
/// `Default` produces an all-zero span, i.e. a deliberately "positionless"
/// value (note line/column 0 falls outside the 1-based convention).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct Span {
    /// Starting byte offset
    pub start: usize,
    /// Ending byte offset (exclusive)
    pub end: usize,
    /// Line number (1-based)
    pub line: usize,
    /// Column number (1-based)
    pub column: usize,
}
40
41impl Span {
42    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
43        Self {
44            start,
45            end,
46            line,
47            column,
48        }
49    }
50}
51
/// A token in the SQL token stream
///
/// Comments are carried on the tokens they surround so the original SQL can
/// be reproduced (round-trip fidelity). Both comment fields use
/// `#[serde(default)]` so older serialized token streams without these
/// fields still deserialize.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The type of token
    pub token_type: TokenType,
    /// The raw text of the token
    pub text: String,
    /// Position information
    pub span: Span,
    /// Leading comments (comments that appeared before this token)
    #[serde(default)]
    pub comments: Vec<String>,
    /// Trailing comments (comments that appeared after this token, before the next one)
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
68
69impl Token {
70    /// Create a new token
71    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
72        Self {
73            token_type,
74            text: text.into(),
75            span,
76            comments: Vec::new(),
77            trailing_comments: Vec::new(),
78        }
79    }
80
81    /// Create a NUMBER token
82    pub fn number(n: i64) -> Self {
83        Self::new(TokenType::Number, n.to_string(), Span::default())
84    }
85
86    /// Create a STRING token
87    pub fn string(s: impl Into<String>) -> Self {
88        Self::new(TokenType::String, s, Span::default())
89    }
90
91    /// Create an IDENTIFIER token
92    pub fn identifier(s: impl Into<String>) -> Self {
93        Self::new(TokenType::Identifier, s, Span::default())
94    }
95
96    /// Create a VAR token
97    pub fn var(s: impl Into<String>) -> Self {
98        Self::new(TokenType::Var, s, Span::default())
99    }
100
101    /// Add a comment to this token
102    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
103        self.comments.push(comment.into());
104        self
105    }
106}
107
impl fmt::Display for Token {
    /// Formats as `VariantName(text)`, e.g. `Number(42)` — the variant's
    /// `Debug` name followed by the raw token text in parentheses.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}({})", self.token_type, self.text)
    }
}
113
/// All possible token types in SQL
///
/// Serde serializes variants per `rename_all = "SCREAMING_SNAKE_CASE"`,
/// so e.g. `Select` round-trips as `"SELECT"` and `GroupBy` as `"GROUP_BY"`.
/// `#[repr(u16)]` keeps the discriminant compact. Variant order matters for
/// the serialized discriminants, so append new variants within their section
/// rather than reordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt, // <<
    GtGt, // >>
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    // Structural markers — NOTE(review): confirm UriStart/BlockStart usage
    // against the tokenizer before documenting their exact semantics.
    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments (emitted as tokens for round-trip fidelity)
    BlockComment, // /* ... */
    LineComment,  // -- ...

    // Literals
    String,
    DollarString,             // $$...$$
    TripleDoubleQuotedString, // """..."""
    TripleSingleQuotedString, // '''...'''
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    /// Hex number: 0xA, 0xFF (BigQuery, SQLite style) - represents an integer in hex notation
    HexNumber,
    ByteString,
    NationalString,
    EscapeString, // PostgreSQL E'...' escape string
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data Types
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,   // !~~ operator (PostgreSQL)
    NotILike,  // !~~* operator (PostgreSQL)
    NotRLike,  // !~ operator (PostgreSQL)
    NotIRLike, // !~* operator (PostgreSQL)
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window function keywords
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // DDL-specific keywords (Phase 4)
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    // MATCH_RECOGNIZE tokens
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // Special
    Eof,
}
670
impl TokenType {
    /// Check if this token type is a keyword that can be used as an identifier in certain contexts
    ///
    /// This list must be kept in sync with the keyword map the tokenizer
    /// registers (see the "previously missing from is_keyword()" section
    /// below); a keyword absent here cannot be re-used as a column/table name.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            TokenType::Select
                | TokenType::From
                | TokenType::Where
                | TokenType::And
                | TokenType::Or
                | TokenType::Not
                | TokenType::In
                | TokenType::Is
                | TokenType::Null
                | TokenType::True
                | TokenType::False
                | TokenType::As
                | TokenType::On
                | TokenType::Join
                | TokenType::Left
                | TokenType::Right
                | TokenType::Inner
                | TokenType::Outer
                | TokenType::Full
                | TokenType::Cross
                | TokenType::Semi
                | TokenType::Anti
                | TokenType::Union
                | TokenType::Except
                | TokenType::Intersect
                | TokenType::GroupBy
                | TokenType::OrderBy
                | TokenType::Having
                | TokenType::Limit
                | TokenType::Offset
                | TokenType::Case
                | TokenType::When
                | TokenType::Then
                | TokenType::Else
                | TokenType::End
                | TokenType::Create
                | TokenType::Drop
                | TokenType::Alter
                | TokenType::Insert
                | TokenType::Update
                | TokenType::Delete
                | TokenType::Into
                | TokenType::Values
                | TokenType::Set
                | TokenType::With
                | TokenType::Distinct
                | TokenType::All
                | TokenType::Exists
                | TokenType::Between
                | TokenType::Like
                | TokenType::ILike
                // Additional keywords that can be used as identifiers
                | TokenType::Filter
                | TokenType::Date
                | TokenType::Timestamp
                | TokenType::TimestampTz
                | TokenType::Interval
                | TokenType::Time
                | TokenType::Table
                | TokenType::Index
                | TokenType::Column
                | TokenType::Database
                | TokenType::Schema
                | TokenType::View
                | TokenType::Function
                | TokenType::Procedure
                | TokenType::Trigger
                | TokenType::Sequence
                | TokenType::Over
                | TokenType::Partition
                | TokenType::Window
                | TokenType::Rows
                | TokenType::Range
                | TokenType::First
                | TokenType::Last
                | TokenType::Preceding
                | TokenType::Following
                | TokenType::Current
                | TokenType::Row
                | TokenType::Unbounded
                | TokenType::Array
                | TokenType::Struct
                | TokenType::Map
                | TokenType::PrimaryKey
                | TokenType::Key
                | TokenType::ForeignKey
                | TokenType::References
                | TokenType::Unique
                | TokenType::Check
                | TokenType::Default
                | TokenType::Constraint
                | TokenType::Comment
                | TokenType::Rollup
                | TokenType::Cube
                | TokenType::Grant
                | TokenType::Revoke
                | TokenType::Type
                | TokenType::Use
                | TokenType::Cache
                | TokenType::Uncache
                | TokenType::Load
                | TokenType::Any
                | TokenType::Some
                | TokenType::Asc
                | TokenType::Desc
                | TokenType::Nulls
                | TokenType::Lateral
                | TokenType::Natural
                | TokenType::Escape
                | TokenType::Glob
                | TokenType::Match
                | TokenType::Recursive
                | TokenType::Replace
                | TokenType::Returns
                | TokenType::If
                | TokenType::Pivot
                | TokenType::Unpivot
                | TokenType::Json
                | TokenType::Blob
                | TokenType::Text
                | TokenType::Int
                | TokenType::BigInt
                | TokenType::SmallInt
                | TokenType::TinyInt
                | TokenType::Int128
                | TokenType::UInt128
                | TokenType::Int256
                | TokenType::UInt256
                | TokenType::UInt
                | TokenType::UBigInt
                | TokenType::Float
                | TokenType::Double
                | TokenType::Decimal
                | TokenType::Boolean
                | TokenType::VarChar
                | TokenType::Char
                | TokenType::Binary
                | TokenType::VarBinary
                | TokenType::No
                | TokenType::DateTime
                | TokenType::Truncate
                | TokenType::Execute
                | TokenType::Merge
                | TokenType::Top
                | TokenType::Begin
                | TokenType::Generated
                | TokenType::Identity
                | TokenType::Always
                | TokenType::Extract
                // Keywords that can be identifiers in certain contexts
                | TokenType::AsOf
                | TokenType::Prior
                | TokenType::After
                | TokenType::Restrict
                | TokenType::Cascade
                | TokenType::Local
                | TokenType::Rename
                | TokenType::Enum
                | TokenType::Within
                | TokenType::Format
                | TokenType::Final
                | TokenType::FileFormat
                | TokenType::Input
                | TokenType::InputFormat
                | TokenType::Copy
                | TokenType::Put
                | TokenType::Get
                | TokenType::Show
                | TokenType::Serde
                | TokenType::Sample
                | TokenType::Sort
                | TokenType::Collate
                | TokenType::Ties
                | TokenType::IsNull
                | TokenType::NotNull
                | TokenType::Exclude
                | TokenType::Temporary
                | TokenType::Add
                | TokenType::Ordinality
                | TokenType::Overlaps
                | TokenType::Block
                | TokenType::Pattern
                | TokenType::Group
                | TokenType::Cluster
                | TokenType::Repeatable
                | TokenType::Groups
                | TokenType::Commit
                | TokenType::Warehouse
                | TokenType::System
                | TokenType::By
                | TokenType::To
                | TokenType::Fetch
                | TokenType::For
                | TokenType::Only
                | TokenType::Next
                | TokenType::Lock
                | TokenType::Refresh
                | TokenType::Settings
                | TokenType::Operator
                | TokenType::Overwrite
                | TokenType::StraightJoin
                | TokenType::Start
                // Additional keywords registered in tokenizer but previously missing from is_keyword()
                | TokenType::Ignore
                | TokenType::Domain
                | TokenType::Apply
                | TokenType::Respect
                | TokenType::Materialized
                | TokenType::Prewhere
                | TokenType::Old
                | TokenType::New
                | TokenType::Cast
                | TokenType::TryCast
                | TokenType::SafeCast
                | TokenType::Transaction
                | TokenType::Describe
                | TokenType::Kill
                | TokenType::Lambda
                | TokenType::Declare
                | TokenType::Keep
                | TokenType::Output
                | TokenType::Percent
                | TokenType::Qualify
                | TokenType::Returning
                | TokenType::Language
                | TokenType::Preserve
                | TokenType::Savepoint
                | TokenType::Rollback
                | TokenType::Body
                | TokenType::Increment
                | TokenType::Minvalue
                | TokenType::Maxvalue
                | TokenType::Cycle
                | TokenType::NoCycle
                | TokenType::Seed
                | TokenType::Namespace
                | TokenType::Authorization
                | TokenType::Order
                | TokenType::Restart
                | TokenType::Before
                | TokenType::Instead
                | TokenType::Each
                | TokenType::Statement
                | TokenType::Referencing
                | TokenType::Of
                | TokenType::Separator
                | TokenType::Others
                | TokenType::Placing
                | TokenType::Owned
                | TokenType::Running
                | TokenType::Define
                | TokenType::Measures
                | TokenType::MatchRecognize
                | TokenType::AutoIncrement
                | TokenType::Connect
                | TokenType::Distribute
                | TokenType::Bernoulli
                | TokenType::TableSample
                | TokenType::Inpath
                | TokenType::Pragma
                | TokenType::Siblings
                | TokenType::SerdeProperties
                | TokenType::RLike
        )
    }

    /// Check if this token type is a comparison operator
    /// (`=`, `<>`, `<`, `<=`, `>`, `>=`, and the null-safe equality).
    pub fn is_comparison(&self) -> bool {
        matches!(
            self,
            TokenType::Eq
                | TokenType::Neq
                | TokenType::Lt
                | TokenType::Lte
                | TokenType::Gt
                | TokenType::Gte
                | TokenType::NullsafeEq
        )
    }

    /// Check if this token type is an arithmetic operator
    /// (includes both symbolic `%` and keyword `MOD`/`DIV` forms).
    pub fn is_arithmetic(&self) -> bool {
        matches!(
            self,
            TokenType::Plus
                | TokenType::Dash
                | TokenType::Star
                | TokenType::Slash
                | TokenType::Percent
                | TokenType::Mod
                | TokenType::Div
        )
    }
}
970
971impl fmt::Display for TokenType {
972    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
973        write!(f, "{:?}", self)
974    }
975}
976
977// ── Cached default maps for TokenizerConfig ─────────────────────────────────
978
979static DEFAULT_KEYWORDS: LazyLock<HashMap<String, TokenType>> = LazyLock::new(|| {
980    let mut keywords = HashMap::with_capacity(300);
981    // Add basic SQL keywords
982    keywords.insert("SELECT".to_string(), TokenType::Select);
983    keywords.insert("FROM".to_string(), TokenType::From);
984    keywords.insert("WHERE".to_string(), TokenType::Where);
985    keywords.insert("AND".to_string(), TokenType::And);
986    keywords.insert("OR".to_string(), TokenType::Or);
987    keywords.insert("NOT".to_string(), TokenType::Not);
988    keywords.insert("AS".to_string(), TokenType::As);
989    keywords.insert("ON".to_string(), TokenType::On);
990    keywords.insert("JOIN".to_string(), TokenType::Join);
991    keywords.insert("LEFT".to_string(), TokenType::Left);
992    keywords.insert("RIGHT".to_string(), TokenType::Right);
993    keywords.insert("INNER".to_string(), TokenType::Inner);
994    keywords.insert("OUTER".to_string(), TokenType::Outer);
995    keywords.insert("OUTPUT".to_string(), TokenType::Output);
996    keywords.insert("FULL".to_string(), TokenType::Full);
997    keywords.insert("CROSS".to_string(), TokenType::Cross);
998    keywords.insert("SEMI".to_string(), TokenType::Semi);
999    keywords.insert("ANTI".to_string(), TokenType::Anti);
1000    keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1001    keywords.insert("UNION".to_string(), TokenType::Union);
1002    keywords.insert("EXCEPT".to_string(), TokenType::Except);
1003    keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle/Redshift alias for EXCEPT
1004    keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1005    keywords.insert("GROUP".to_string(), TokenType::Group);
1006    keywords.insert("CUBE".to_string(), TokenType::Cube);
1007    keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1008    keywords.insert("WITHIN".to_string(), TokenType::Within);
1009    keywords.insert("ORDER".to_string(), TokenType::Order);
1010    keywords.insert("BY".to_string(), TokenType::By);
1011    keywords.insert("HAVING".to_string(), TokenType::Having);
1012    keywords.insert("LIMIT".to_string(), TokenType::Limit);
1013    keywords.insert("OFFSET".to_string(), TokenType::Offset);
1014    keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1015    keywords.insert("FETCH".to_string(), TokenType::Fetch);
1016    keywords.insert("FIRST".to_string(), TokenType::First);
1017    keywords.insert("NEXT".to_string(), TokenType::Next);
1018    keywords.insert("ONLY".to_string(), TokenType::Only);
1019    keywords.insert("KEEP".to_string(), TokenType::Keep);
1020    keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1021    keywords.insert("INPUT".to_string(), TokenType::Input);
1022    keywords.insert("CASE".to_string(), TokenType::Case);
1023    keywords.insert("WHEN".to_string(), TokenType::When);
1024    keywords.insert("THEN".to_string(), TokenType::Then);
1025    keywords.insert("ELSE".to_string(), TokenType::Else);
1026    keywords.insert("END".to_string(), TokenType::End);
1027    keywords.insert("ENDIF".to_string(), TokenType::End); // Exasol alias for END
1028    keywords.insert("NULL".to_string(), TokenType::Null);
1029    keywords.insert("TRUE".to_string(), TokenType::True);
1030    keywords.insert("FALSE".to_string(), TokenType::False);
1031    keywords.insert("IS".to_string(), TokenType::Is);
1032    keywords.insert("IN".to_string(), TokenType::In);
1033    keywords.insert("BETWEEN".to_string(), TokenType::Between);
1034    keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1035    keywords.insert("LIKE".to_string(), TokenType::Like);
1036    keywords.insert("ILIKE".to_string(), TokenType::ILike);
1037    keywords.insert("RLIKE".to_string(), TokenType::RLike);
1038    keywords.insert("REGEXP".to_string(), TokenType::RLike);
1039    keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1040    keywords.insert("EXISTS".to_string(), TokenType::Exists);
1041    keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1042    keywords.insert("ALL".to_string(), TokenType::All);
1043    keywords.insert("WITH".to_string(), TokenType::With);
1044    keywords.insert("CREATE".to_string(), TokenType::Create);
1045    keywords.insert("DROP".to_string(), TokenType::Drop);
1046    keywords.insert("ALTER".to_string(), TokenType::Alter);
1047    keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1048    keywords.insert("TABLE".to_string(), TokenType::Table);
1049    keywords.insert("VIEW".to_string(), TokenType::View);
1050    keywords.insert("INDEX".to_string(), TokenType::Index);
1051    keywords.insert("COLUMN".to_string(), TokenType::Column);
1052    keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1053    keywords.insert("ADD".to_string(), TokenType::Add);
1054    keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1055    keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1056    keywords.insert("RENAME".to_string(), TokenType::Rename);
1057    keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1058    keywords.insert("TEMP".to_string(), TokenType::Temporary);
1059    keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1060    keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1061    keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1062    keywords.insert("KEY".to_string(), TokenType::Key);
1063    keywords.insert("KILL".to_string(), TokenType::Kill);
1064    keywords.insert("REFERENCES".to_string(), TokenType::References);
1065    keywords.insert("DEFAULT".to_string(), TokenType::Default);
1066    keywords.insert("DECLARE".to_string(), TokenType::Declare);
1067    keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1068    keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); // Snowflake style
1069    keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1070    keywords.insert("REPLACE".to_string(), TokenType::Replace);
1071    keywords.insert("TO".to_string(), TokenType::To);
1072    keywords.insert("INSERT".to_string(), TokenType::Insert);
1073    keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1074    keywords.insert("UPDATE".to_string(), TokenType::Update);
1075    keywords.insert("USE".to_string(), TokenType::Use);
1076    keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1077    keywords.insert("GLOB".to_string(), TokenType::Glob);
1078    keywords.insert("DELETE".to_string(), TokenType::Delete);
1079    keywords.insert("MERGE".to_string(), TokenType::Merge);
1080    keywords.insert("CACHE".to_string(), TokenType::Cache);
1081    keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1082    keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1083    keywords.insert("GRANT".to_string(), TokenType::Grant);
1084    keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1085    keywords.insert("COMMENT".to_string(), TokenType::Comment);
1086    keywords.insert("COLLATE".to_string(), TokenType::Collate);
1087    keywords.insert("INTO".to_string(), TokenType::Into);
1088    keywords.insert("VALUES".to_string(), TokenType::Values);
1089    keywords.insert("SET".to_string(), TokenType::Set);
1090    keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1091    keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1092    keywords.insert("ASC".to_string(), TokenType::Asc);
1093    keywords.insert("DESC".to_string(), TokenType::Desc);
1094    keywords.insert("NULLS".to_string(), TokenType::Nulls);
1095    keywords.insert("RESPECT".to_string(), TokenType::Respect);
1096    keywords.insert("FIRST".to_string(), TokenType::First);
1097    keywords.insert("LAST".to_string(), TokenType::Last);
1098    keywords.insert("IF".to_string(), TokenType::If);
1099    keywords.insert("CAST".to_string(), TokenType::Cast);
1100    keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1101    keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1102    keywords.insert("OVER".to_string(), TokenType::Over);
1103    keywords.insert("PARTITION".to_string(), TokenType::Partition);
1104    keywords.insert("PLACING".to_string(), TokenType::Placing);
1105    keywords.insert("WINDOW".to_string(), TokenType::Window);
1106    keywords.insert("ROWS".to_string(), TokenType::Rows);
1107    keywords.insert("RANGE".to_string(), TokenType::Range);
1108    keywords.insert("FILTER".to_string(), TokenType::Filter);
1109    keywords.insert("NATURAL".to_string(), TokenType::Natural);
1110    keywords.insert("USING".to_string(), TokenType::Using);
1111    keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1112    keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1113    keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1114    keywords.insert("CURRENT".to_string(), TokenType::Current);
1115    keywords.insert("ROW".to_string(), TokenType::Row);
1116    keywords.insert("GROUPS".to_string(), TokenType::Groups);
1117    keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1118    // TRIM function position keywords
1119    keywords.insert("BOTH".to_string(), TokenType::Both);
1120    keywords.insert("LEADING".to_string(), TokenType::Leading);
1121    keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1122    keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1123    // Phase 3: Additional keywords
1124    keywords.insert("TOP".to_string(), TokenType::Top);
1125    keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1126    keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1127    keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1128    keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1129    keywords.insert("SYSTEM".to_string(), TokenType::System);
1130    keywords.insert("BLOCK".to_string(), TokenType::Block);
1131    keywords.insert("SEED".to_string(), TokenType::Seed);
1132    keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1133    keywords.insert("TIES".to_string(), TokenType::Ties);
1134    keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1135    keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1136    keywords.insert("APPLY".to_string(), TokenType::Apply);
1137    // Oracle CONNECT BY keywords
1138    keywords.insert("CONNECT".to_string(), TokenType::Connect);
1139    // Hive/Spark specific keywords
1140    keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1141    keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1142    keywords.insert("SORT".to_string(), TokenType::Sort);
1143    keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1144    keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1145    keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1146    keywords.insert("FOR".to_string(), TokenType::For);
1147    keywords.insert("ANY".to_string(), TokenType::Any);
1148    keywords.insert("SOME".to_string(), TokenType::Some);
1149    keywords.insert("ASOF".to_string(), TokenType::AsOf);
1150    keywords.insert("PERCENT".to_string(), TokenType::Percent);
1151    keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1152    keywords.insert("NO".to_string(), TokenType::No);
1153    keywords.insert("OTHERS".to_string(), TokenType::Others);
1154    // PostgreSQL OPERATOR() syntax for schema-qualified operators
1155    keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1156    // Phase 4: DDL keywords
1157    keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1158    keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1159    keywords.insert("DATABASE".to_string(), TokenType::Database);
1160    keywords.insert("FUNCTION".to_string(), TokenType::Function);
1161    keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1162    keywords.insert("PROC".to_string(), TokenType::Procedure);
1163    keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1164    keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1165    keywords.insert("TYPE".to_string(), TokenType::Type);
1166    keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1167    keywords.insert("RETURNS".to_string(), TokenType::Returns);
1168    keywords.insert("RETURNING".to_string(), TokenType::Returning);
1169    keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1170    keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1171    keywords.insert("COMMIT".to_string(), TokenType::Commit);
1172    keywords.insert("BEGIN".to_string(), TokenType::Begin);
1173    keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1174    keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1175    keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1176    keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1177    keywords.insert("BODY".to_string(), TokenType::Body);
1178    keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1179    keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1180    keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1181    keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1182    keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1183    keywords.insert("PRIOR".to_string(), TokenType::Prior);
1184    // MATCH_RECOGNIZE keywords
1185    keywords.insert("MATCH".to_string(), TokenType::Match);
1186    keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1187    keywords.insert("MEASURES".to_string(), TokenType::Measures);
1188    keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1189    keywords.insert("DEFINE".to_string(), TokenType::Define);
1190    keywords.insert("RUNNING".to_string(), TokenType::Running);
1191    keywords.insert("FINAL".to_string(), TokenType::Final);
1192    keywords.insert("OWNED".to_string(), TokenType::Owned);
1193    keywords.insert("AFTER".to_string(), TokenType::After);
1194    keywords.insert("BEFORE".to_string(), TokenType::Before);
1195    keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1196    keywords.insert("EACH".to_string(), TokenType::Each);
1197    keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1198    keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1199    keywords.insert("OLD".to_string(), TokenType::Old);
1200    keywords.insert("NEW".to_string(), TokenType::New);
1201    keywords.insert("OF".to_string(), TokenType::Of);
1202    keywords.insert("CHECK".to_string(), TokenType::Check);
1203    keywords.insert("START".to_string(), TokenType::Start);
1204    keywords.insert("ENUM".to_string(), TokenType::Enum);
1205    keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1206    keywords.insert("RESTART".to_string(), TokenType::Restart);
1207    // Date/time literal keywords
1208    keywords.insert("DATE".to_string(), TokenType::Date);
1209    keywords.insert("TIME".to_string(), TokenType::Time);
1210    keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1211    keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1212    keywords.insert("GENERATED".to_string(), TokenType::Generated);
1213    keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1214    keywords.insert("ALWAYS".to_string(), TokenType::Always);
1215    // LOAD DATA keywords
1216    keywords.insert("LOAD".to_string(), TokenType::Load);
1217    keywords.insert("LOCAL".to_string(), TokenType::Local);
1218    keywords.insert("INPATH".to_string(), TokenType::Inpath);
1219    keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1220    keywords.insert("SERDE".to_string(), TokenType::Serde);
1221    keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1222    keywords.insert("FORMAT".to_string(), TokenType::Format);
1223    // SQLite
1224    keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1225    // SHOW statement
1226    keywords.insert("SHOW".to_string(), TokenType::Show);
1227    // Oracle ORDER SIBLINGS BY (hierarchical queries)
1228    keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1229    // COPY and PUT statements (Snowflake, PostgreSQL)
1230    keywords.insert("COPY".to_string(), TokenType::Copy);
1231    keywords.insert("PUT".to_string(), TokenType::Put);
1232    keywords.insert("GET".to_string(), TokenType::Get);
1233    // EXEC/EXECUTE statement (TSQL, etc.)
1234    keywords.insert("EXEC".to_string(), TokenType::Execute);
1235    keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1236    // Postfix null check operators (PostgreSQL/SQLite)
1237    keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1238    keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1239    keywords
1240});
1241
1242static DEFAULT_SINGLE_TOKENS: LazyLock<HashMap<char, TokenType>> = LazyLock::new(|| {
1243    let mut single_tokens = HashMap::with_capacity(30);
1244    single_tokens.insert('(', TokenType::LParen);
1245    single_tokens.insert(')', TokenType::RParen);
1246    single_tokens.insert('[', TokenType::LBracket);
1247    single_tokens.insert(']', TokenType::RBracket);
1248    single_tokens.insert('{', TokenType::LBrace);
1249    single_tokens.insert('}', TokenType::RBrace);
1250    single_tokens.insert(',', TokenType::Comma);
1251    single_tokens.insert('.', TokenType::Dot);
1252    single_tokens.insert(';', TokenType::Semicolon);
1253    single_tokens.insert('+', TokenType::Plus);
1254    single_tokens.insert('-', TokenType::Dash);
1255    single_tokens.insert('*', TokenType::Star);
1256    single_tokens.insert('/', TokenType::Slash);
1257    single_tokens.insert('%', TokenType::Percent);
1258    single_tokens.insert('&', TokenType::Amp);
1259    single_tokens.insert('|', TokenType::Pipe);
1260    single_tokens.insert('^', TokenType::Caret);
1261    single_tokens.insert('~', TokenType::Tilde);
1262    single_tokens.insert('<', TokenType::Lt);
1263    single_tokens.insert('>', TokenType::Gt);
1264    single_tokens.insert('=', TokenType::Eq);
1265    single_tokens.insert('!', TokenType::Exclamation);
1266    single_tokens.insert(':', TokenType::Colon);
1267    single_tokens.insert('@', TokenType::DAt);
1268    single_tokens.insert('#', TokenType::Hash);
1269    single_tokens.insert('$', TokenType::Dollar);
1270    single_tokens.insert('?', TokenType::Parameter);
1271    single_tokens
1272});
1273
static DEFAULT_QUOTES: LazyLock<HashMap<String, String>> = LazyLock::new(|| {
    // Default string-literal delimiters: single quotes, plus triple-quoted
    // strings (e.g., """x""").
    [("'", "'"), ("\"\"\"", "\"\"\"")]
        .into_iter()
        .map(|(open, close)| (open.to_string(), close.to_string()))
        .collect()
});
1281
static DEFAULT_IDENTIFIERS: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
    // Default quoted-identifier delimiters: double quotes and backticks.
    // Note: TSQL bracket-quoted identifiers [name] are handled in the parser
    // because [ is also used for arrays and subscripts.
    HashMap::from([('"', '"'), ('`', '`')])
});
1290
static DEFAULT_COMMENTS: LazyLock<HashMap<String, Option<String>>> = LazyLock::new(|| {
    // "--" runs to end of line (no terminator); "/*" is closed by "*/".
    HashMap::from([
        ("--".to_string(), None),
        ("/*".to_string(), Some("*/".to_string())),
    ])
});
1297
/// Tokenizer configuration for a dialect
///
/// Each SQL dialect supplies one of these to describe its lexical rules:
/// keyword set, quoting styles, comment markers, and escape behavior.
/// Field order matters for the derived `Debug` output only.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keywords mapping (uppercase keyword -> token type)
    pub keywords: HashMap<String, TokenType>,
    /// Single character tokens
    pub single_tokens: HashMap<char, TokenType>,
    /// Quote characters (start -> end)
    pub quotes: HashMap<String, String>,
    /// Identifier quote characters (start -> end)
    pub identifiers: HashMap<char, char>,
    /// Comment definitions (start -> optional end; `None` means the comment
    /// runs to end of line)
    pub comments: HashMap<String, Option<String>>,
    /// String escape characters
    pub string_escapes: Vec<char>,
    /// Whether to support nested comments
    pub nested_comments: bool,
    /// Valid escape follow characters (for MySQL-style escaping).
    /// When a backslash is followed by a character NOT in this list,
    /// the backslash is discarded. When empty, all backslash escapes
    /// preserve the backslash for unrecognized sequences.
    pub escape_follow_chars: Vec<char>,
    /// Whether b'...' is a byte string (true for BigQuery) or bit string (false for standard SQL).
    /// Default is false (bit string).
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}
    /// Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT)
    pub numeric_literals: HashMap<String, String>,
    /// Whether unquoted identifiers can start with a digit (e.g., `1a`, `1_a`).
    /// When true, a number followed by letters/underscore is treated as an identifier.
    /// Used by Hive, Spark, MySQL, ClickHouse.
    pub identifiers_can_start_with_digit: bool,
    /// Whether 0x/0X prefix should be treated as hex literals.
    /// When true, `0XCC` is tokenized instead of Number("0") + Identifier("XCC").
    /// Used by BigQuery, SQLite, Teradata.
    pub hex_number_strings: bool,
    /// Whether hex string literals from 0x prefix represent integer values.
    /// When true (BigQuery), 0xA is tokenized as HexNumber (integer in hex notation).
    /// When false (SQLite, Teradata), 0xCC is tokenized as HexString (binary/blob value).
    pub hex_string_is_integer_type: bool,
    /// Whether string escape sequences (like \') are allowed in raw strings.
    /// When true (BigQuery default), \' inside r'...' escapes the quote.
    /// When false (Spark/Databricks), backslashes in raw strings are always literal.
    /// Python sqlglot: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True)
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether # starts a single-line comment (ClickHouse, MySQL)
    pub hash_comments: bool,
    /// Whether $ can start/continue an identifier (ClickHouse).
    /// When true, a bare `$` that is not part of a dollar-quoted string or positional
    /// parameter is treated as an identifier character.
    pub dollar_sign_is_identifier: bool,
    /// Whether INSERT ... FORMAT <name> should treat subsequent data as raw (ClickHouse).
    /// When true, after tokenizing `INSERT ... FORMAT <non-VALUES-name>`, all text until
    /// the next blank line or end of input is consumed as a raw data token.
    pub insert_format_raw_data: bool,
}
1354
1355impl Default for TokenizerConfig {
1356    fn default() -> Self {
1357        Self {
1358            keywords: DEFAULT_KEYWORDS.clone(),
1359            single_tokens: DEFAULT_SINGLE_TOKENS.clone(),
1360            quotes: DEFAULT_QUOTES.clone(),
1361            identifiers: DEFAULT_IDENTIFIERS.clone(),
1362            comments: DEFAULT_COMMENTS.clone(),
1363            // Standard SQL: only '' (doubled quote) escapes a quote
1364            // Backslash escapes are dialect-specific (MySQL, etc.)
1365            string_escapes: vec!['\''],
1366            nested_comments: true,
1367            // By default, no escape_follow_chars means preserve backslash for unrecognized escapes
1368            escape_follow_chars: vec![],
1369            // Default: b'...' is bit string (standard SQL), not byte string (BigQuery)
1370            b_prefix_is_byte_string: false,
1371            numeric_literals: HashMap::new(),
1372            identifiers_can_start_with_digit: false,
1373            hex_number_strings: false,
1374            hex_string_is_integer_type: false,
1375            // Default: backslash escapes ARE allowed in raw strings (sqlglot default)
1376            // Spark/Databricks set this to false
1377            string_escapes_allowed_in_raw_strings: true,
1378            hash_comments: false,
1379            dollar_sign_is_identifier: false,
1380            insert_format_raw_data: false,
1381        }
1382    }
1383}
1384
/// SQL Tokenizer
///
/// Wraps a [`TokenizerConfig`] and converts SQL text into a stream of tokens
/// via [`Tokenizer::tokenize`].
pub struct Tokenizer {
    // Dialect-specific lexical rules used for every tokenize() call.
    config: TokenizerConfig,
}
1389
1390impl Tokenizer {
1391    /// Create a new tokenizer with the given configuration
1392    pub fn new(config: TokenizerConfig) -> Self {
1393        Self { config }
1394    }
1395
1396    /// Create a tokenizer with default configuration
1397    pub fn default_config() -> Self {
1398        Self::new(TokenizerConfig::default())
1399    }
1400
1401    /// Tokenize a SQL string
1402    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1403        let mut state = TokenizerState::new(sql, &self.config);
1404        state.tokenize()
1405    }
1406}
1407
1408impl Default for Tokenizer {
1409    fn default() -> Self {
1410        Self::default_config()
1411    }
1412}
1413
/// Internal state for tokenization
///
/// Holds the scan cursor, position bookkeeping, and accumulated output while
/// a single input string is being tokenized.
struct TokenizerState<'a> {
    /// Original SQL text (sliced directly when the input is pure ASCII).
    source: &'a str,
    /// True when `source` is ASCII, so char offsets equal byte offsets.
    source_is_ascii: bool,
    /// The input decoded into chars for uniform indexing.
    chars: Vec<char>,
    /// Total number of chars in the input.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Char offset where the current token started.
    start: usize,
    /// Char offset of the scan cursor.
    current: usize,
    /// Current line number (1-based).
    line: usize,
    /// Current column number (1-based).
    column: usize,
    /// Pending leading comments to attach to the next token.
    comments: Vec<String>,
    /// Dialect configuration in effect for this scan.
    config: &'a TokenizerConfig,
}
1428
1429impl<'a> TokenizerState<'a> {
1430    fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1431        let chars: Vec<char> = sql.chars().collect();
1432        let size = chars.len();
1433        Self {
1434            source: sql,
1435            source_is_ascii: sql.is_ascii(),
1436            chars,
1437            size,
1438            tokens: Vec::new(),
1439            start: 0,
1440            current: 0,
1441            line: 1,
1442            column: 1,
1443            comments: Vec::new(),
1444            config,
1445        }
1446    }
1447
1448    fn tokenize(&mut self) -> Result<Vec<Token>> {
1449        while !self.is_at_end() {
1450            self.skip_whitespace();
1451            if self.is_at_end() {
1452                break;
1453            }
1454
1455            self.start = self.current;
1456            self.scan_token()?;
1457
1458            // ClickHouse: After INSERT ... FORMAT <name> (where name != VALUES),
1459            // the rest until the next blank line or end of input is raw data.
1460            if self.config.insert_format_raw_data {
1461                if let Some(raw) = self.try_scan_insert_format_raw_data() {
1462                    if !raw.is_empty() {
1463                        self.start = self.current;
1464                        self.add_token_with_text(TokenType::Var, raw);
1465                    }
1466                }
1467            }
1468        }
1469
1470        // Handle leftover leading comments at end of input.
1471        // These are comments on a new line after the last token that couldn't be attached
1472        // as leading comments to a subsequent token (because there is none).
1473        // Attach them as trailing comments on the last token so they're preserved.
1474        if !self.comments.is_empty() {
1475            if let Some(last) = self.tokens.last_mut() {
1476                last.trailing_comments.extend(self.comments.drain(..));
1477            }
1478        }
1479
1480        Ok(std::mem::take(&mut self.tokens))
1481    }
1482
1483    #[inline]
1484    fn is_at_end(&self) -> bool {
1485        self.current >= self.size
1486    }
1487
1488    #[inline]
1489    fn text_from_range(&self, start: usize, end: usize) -> String {
1490        if self.source_is_ascii {
1491            self.source[start..end].to_string()
1492        } else {
1493            self.chars[start..end].iter().collect()
1494        }
1495    }
1496
1497    #[inline]
1498    fn peek(&self) -> char {
1499        if self.is_at_end() {
1500            '\0'
1501        } else {
1502            self.chars[self.current]
1503        }
1504    }
1505
1506    #[inline]
1507    fn peek_next(&self) -> char {
1508        if self.current + 1 >= self.size {
1509            '\0'
1510        } else {
1511            self.chars[self.current + 1]
1512        }
1513    }
1514
1515    #[inline]
1516    fn advance(&mut self) -> char {
1517        let c = self.peek();
1518        self.current += 1;
1519        if c == '\n' {
1520            self.line += 1;
1521            self.column = 1;
1522        } else {
1523            self.column += 1;
1524        }
1525        c
1526    }
1527
    /// Skip whitespace and consume comments between tokens.
    ///
    /// Stops (without consuming) at the first non-whitespace, non-comment
    /// char, and also before hint comments `/*+ ... */` and URI-like `//`
    /// sequences, which are left for `scan_token` to handle. Match arms are
    /// order-sensitive: the ClickHouse `//` arm must precede the block-comment
    /// arm, which must precede the dialect-configured `//` arm.
    fn skip_whitespace(&mut self) {
        // Track whether we've seen a newline since the last token.
        // Comments on a new line (after a newline) are leading comments on the next token,
        // while comments on the same line are trailing comments on the previous token.
        // This matches Python sqlglot's behavior.
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                '\u{00A0}' // non-breaking space
                | '\u{2000}'..='\u{200B}' // various Unicode spaces + zero-width space
                | '\u{3000}' // ideographic (full-width) space
                | '\u{FEFF}' // BOM / zero-width no-break space
                => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    // ClickHouse: // single-line comments (same dialects that support # comments)
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // Check if this is a hint comment /*+ ... */
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        // This is a hint comment, handle it as a token instead of skipping
                        break;
                    }
                    // NOTE(review): an unterminated block comment error is
                    // swallowed here — the scanner just stops at EOF and
                    // tokenize() returns Ok. Confirm this leniency is intended.
                    if self.scan_block_comment(saw_newline).is_err() {
                        return;
                    }
                    // Don't reset saw_newline - it carries forward
                }
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Dialect-specific // line comment (e.g., Snowflake)
                    // But NOT inside URIs like file:// or paths with consecutive slashes
                    // Check that previous non-whitespace char is not ':' or '/'
                    let prev_non_ws = if self.current > 0 {
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        // This is likely a URI (file://, http://) or path, not a comment
                        break;
                    }
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                _ => break,
            }
        }
    }
1599
1600    fn scan_hash_line_comment(&mut self) {
1601        self.advance(); // #
1602        let start = self.current;
1603        while !self.is_at_end() && self.peek() != '\n' {
1604            self.advance();
1605        }
1606        let comment = self.text_from_range(start, self.current);
1607        let comment_text = comment.trim().to_string();
1608        if let Some(last) = self.tokens.last_mut() {
1609            last.trailing_comments.push(comment_text);
1610        } else {
1611            self.comments.push(comment_text);
1612        }
1613    }
1614
1615    fn scan_double_slash_comment(&mut self) {
1616        self.advance(); // /
1617        self.advance(); // /
1618        let start = self.current;
1619        while !self.is_at_end() && self.peek() != '\n' {
1620            self.advance();
1621        }
1622        let comment = self.text_from_range(start, self.current);
1623        let comment_text = comment.trim().to_string();
1624        if let Some(last) = self.tokens.last_mut() {
1625            last.trailing_comments.push(comment_text);
1626        } else {
1627            self.comments.push(comment_text);
1628        }
1629    }
1630
1631    fn scan_line_comment(&mut self, after_newline: bool) {
1632        self.advance(); // -
1633        self.advance(); // -
1634        let start = self.current;
1635        while !self.is_at_end() && self.peek() != '\n' {
1636            self.advance();
1637        }
1638        let comment_text = self.text_from_range(start, self.current);
1639
1640        // If the comment starts on a new line (after_newline), it's a leading comment
1641        // on the next token. Otherwise, it's a trailing comment on the previous token.
1642        if after_newline || self.tokens.is_empty() {
1643            self.comments.push(comment_text);
1644        } else if let Some(last) = self.tokens.last_mut() {
1645            last.trailing_comments.push(comment_text);
1646        }
1647    }
1648
    /// Consume a `/* ... */` block comment, honoring `nested_comments`.
    ///
    /// The reconstructed `/*...*/` text is attached as a leading comment for
    /// the next token when `after_newline` (or when no token exists yet),
    /// otherwise as a trailing comment on the previous token.
    ///
    /// Returns an error if EOF is reached before the comment closes.
    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // /
        self.advance(); // *
        let content_start = self.current;
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                // Inner terminators of nested comments are consumed here; the
                // OUTERMOST "*/" is deliberately left unconsumed so `content`
                // below excludes it (it is consumed after the loop).
                if depth > 0 {
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        // Get the content between /* and */ (preserving internal whitespace for nested comments)
        let content = self.text_from_range(content_start, self.current);
        self.advance(); // *
        self.advance(); // /

        // For round-trip fidelity, preserve the exact comment content including nested comments
        let comment_text = format!("/*{}*/", content);

        // If the comment starts on a new line (after_newline), it's a leading comment
        // on the next token. Otherwise, it's a trailing comment on the previous token.
        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }

        Ok(())
    }
1699
1700    /// Scan a hint comment /*+ ... */ and return it as a Hint token
1701    fn scan_hint(&mut self) -> Result<()> {
1702        self.advance(); // /
1703        self.advance(); // *
1704        self.advance(); // +
1705        let hint_start = self.current;
1706
1707        // Scan until we find */
1708        while !self.is_at_end() {
1709            if self.peek() == '*' && self.peek_next() == '/' {
1710                break;
1711            }
1712            self.advance();
1713        }
1714
1715        if self.is_at_end() {
1716            return Err(Error::tokenize(
1717                "Unterminated hint comment",
1718                self.line,
1719                self.column,
1720                self.start,
1721                self.current,
1722            ));
1723        }
1724
1725        let hint_text = self.text_from_range(hint_start, self.current);
1726        self.advance(); // *
1727        self.advance(); // /
1728
1729        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1730
1731        Ok(())
1732    }
1733
1734    /// Scan a positional parameter: $1, $2, etc.
1735    fn scan_positional_parameter(&mut self) -> Result<()> {
1736        self.advance(); // consume $
1737        let start = self.current;
1738
1739        while !self.is_at_end() && self.peek().is_ascii_digit() {
1740            self.advance();
1741        }
1742
1743        let number = self.text_from_range(start, self.current);
1744        self.add_token_with_text(TokenType::Parameter, number);
1745        Ok(())
1746    }
1747
    /// Try to scan a tagged dollar-quoted string: $tag$content$tag$
    /// Returns Some(()) if successful, None if this isn't a tagged dollar string.
    ///
    /// The token text is stored as "tag\x00content" to preserve the tag for later use.
    ///
    /// On failure (no closing `$` right after the tag, or no matching closing
    /// tag before EOF) the scanner is rewound to the opening `$` so the caller
    /// can re-tokenize the `$` another way (`$$` string, parameter, identifier).
    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
        let saved_pos = self.current;

        // We're at '$', next char is alphabetic
        self.advance(); // consume opening $

        // Scan the tag (identifier: alphanumeric + underscore, including Unicode)
        // Tags can contain Unicode characters like emojis (e.g., $🦆$)
        let tag_start = self.current;
        while !self.is_at_end()
            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
        {
            self.advance();
        }
        let tag = self.text_from_range(tag_start, self.current);

        // Must have a closing $ after the tag
        if self.is_at_end() || self.peek() != '$' {
            // Not a tagged dollar string - restore position
            // NOTE(review): only `current` is rewound here; if advance() also
            // tracks line/column, those would drift after a rewind over a
            // newline-containing tag — TODO confirm against advance().
            self.current = saved_pos;
            return Ok(None);
        }
        self.advance(); // consume closing $ of opening tag

        // Now scan content until we find $tag$
        let content_start = self.current;
        let closing_tag = format!("${}$", tag);
        let closing_chars: Vec<char> = closing_tag.chars().collect();

        loop {
            if self.is_at_end() {
                // Unterminated - restore and fall through
                // (same line/column caveat as the rewind above)
                self.current = saved_pos;
                return Ok(None);
            }

            // Check if we've reached the closing tag.
            // Cheap first-char test ('$') before comparing the whole tag.
            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
                    self.current + j < self.size && self.chars[self.current + j] == ch
                });
                if matches {
                    let content = self.text_from_range(content_start, self.current);
                    // Consume closing tag
                    for _ in 0..closing_chars.len() {
                        self.advance();
                    }
                    // Store as "tag\x00content" to preserve the tag
                    let token_text = format!("{}\x00{}", tag, content);
                    self.add_token_with_text(TokenType::DollarString, token_text);
                    return Ok(Some(()));
                }
            }
            self.advance();
        }
    }
1808
1809    /// Scan a dollar-quoted string: $$content$$ or $tag$content$tag$
1810    ///
1811    /// For $$...$$ (no tag), the token text is just the content.
1812    /// For $tag$...$tag$, use try_scan_tagged_dollar_string instead.
1813    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1814        self.advance(); // consume first $
1815        self.advance(); // consume second $
1816
1817        // For $$...$$ (no tag), just scan until closing $$
1818        let start = self.current;
1819        while !self.is_at_end() {
1820            if self.peek() == '$'
1821                && self.current + 1 < self.size
1822                && self.chars[self.current + 1] == '$'
1823            {
1824                break;
1825            }
1826            self.advance();
1827        }
1828
1829        let content = self.text_from_range(start, self.current);
1830
1831        if !self.is_at_end() {
1832            self.advance(); // consume first $
1833            self.advance(); // consume second $
1834        }
1835
1836        self.add_token_with_text(TokenType::DollarString, content);
1837        Ok(())
1838    }
1839
    /// Dispatch on the current character and scan exactly one token.
    ///
    /// The checks below are ORDER-SENSITIVE; several comments in the body
    /// record which check must precede which (e.g. triple quotes before
    /// plain quotes, multi-char operators before single-char tokens,
    /// tagged dollar strings before `$$` strings and `$n` parameters).
    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        // Check for string literal
        if c == '\'' {
            // Check for triple-quoted string '''...''' if configured
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        // Check for triple-quoted string """...""" if configured
        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        // Check for double-quoted strings when dialect supports them (e.g., BigQuery)
        // This must come before identifier quotes check
        if c == '"'
            && self.config.quotes.contains_key("\"")
            && !self.config.identifiers.contains_key(&'"')
        {
            return self.scan_double_quoted_string();
        }

        // Check for identifier quotes
        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        // Check for numbers (including numbers starting with a dot like .25)
        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // Check for numbers starting with a dot (e.g., .25, .5)
        // This must come before single character token handling
        // Don't treat as a number if:
        // - Previous char was also a dot (e.g., 1..2 should be 1, ., ., 2)
        // - Previous char is an identifier character (e.g., foo.25 should be foo, ., 25)
        //   This handles BigQuery numeric table parts like project.dataset.25
        if c == '.' && self.peek_next().is_ascii_digit() {
            // '\0' sentinel means "no previous character" (start of input).
            let prev_char = if self.current > 0 {
                self.chars[self.current - 1]
            } else {
                '\0'
            };
            // Closing identifier/grouping chars (` " ] )) also count as
            // "identifier-like" so `tbl`.25 / f(x).25 stay dot + number.
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // Check for hint comment /*+ ... */
        if c == '/'
            && self.peek_next() == '*'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '+'
        {
            return self.scan_hint();
        }

        // Check for multi-character operators first
        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // Check for tagged dollar-quoted strings: $tag$content$tag$
        // Tags can contain Unicode characters (including emojis like 🦆) and digits (e.g., $1$)
        if c == '$'
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || !self.peek_next().is_ascii())
        {
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
            // If tagged dollar string didn't match and dollar_sign_is_identifier is set,
            // treat the $ and following chars as an identifier (e.g., ClickHouse $alias$name$).
            if self.config.dollar_sign_is_identifier {
                return self.scan_dollar_identifier();
            }
        }

        // Check for dollar-quoted strings: $$...$$
        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // Check for positional parameters: $1, $2, etc.
        // (reached only when the tagged-dollar attempt above failed)
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        // ClickHouse: bare $ (not followed by alphanumeric/underscore) as identifier
        if c == '$' && self.config.dollar_sign_is_identifier {
            return self.scan_dollar_identifier();
        }

        // TSQL: Check for identifiers starting with # (temp tables) or @ (variables)
        // e.g., #temp, ##global_temp, @variable
        if (c == '#' || c == '@')
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || self.peek_next() == '#')
        {
            return self.scan_tsql_identifier();
        }

        // Check for single character tokens
        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        // Unicode minus (U+2212) → treat as regular minus
        if c == '\u{2212}' {
            self.advance();
            self.add_token(TokenType::Dash);
            return Ok(());
        }

        // Unicode fraction slash (U+2044) → treat as regular slash
        if c == '\u{2044}' {
            self.advance();
            self.add_token(TokenType::Slash);
            return Ok(());
        }

        // Unicode curly/smart quotes → treat as regular string quotes
        if c == '\u{2018}' || c == '\u{2019}' {
            // Left/right single quotation marks → scan as string with matching end
            return self.scan_unicode_quoted_string(c);
        }
        if c == '\u{201C}' || c == '\u{201D}' {
            // Left/right double quotation marks → scan as quoted identifier
            return self.scan_unicode_quoted_identifier(c);
        }

        // Must be an identifier or keyword
        self.scan_identifier_or_keyword()
    }
1999
2000    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
2001        let c = self.peek();
2002        let next = self.peek_next();
2003        let third = if self.current + 2 < self.size {
2004            self.chars[self.current + 2]
2005        } else {
2006            '\0'
2007        };
2008
2009        // Check for three-character operators first
2010        // -|- (Adjacent - PostgreSQL range adjacency)
2011        if c == '-' && next == '|' && third == '-' {
2012            self.advance();
2013            self.advance();
2014            self.advance();
2015            return Some(TokenType::Adjacent);
2016        }
2017
2018        // ||/ (Cube root - PostgreSQL)
2019        if c == '|' && next == '|' && third == '/' {
2020            self.advance();
2021            self.advance();
2022            self.advance();
2023            return Some(TokenType::DPipeSlash);
2024        }
2025
2026        // #>> (JSONB path text extraction - PostgreSQL)
2027        if c == '#' && next == '>' && third == '>' {
2028            self.advance();
2029            self.advance();
2030            self.advance();
2031            return Some(TokenType::DHashArrow);
2032        }
2033
2034        // ->> (JSON text extraction - PostgreSQL/MySQL)
2035        if c == '-' && next == '>' && third == '>' {
2036            self.advance();
2037            self.advance();
2038            self.advance();
2039            return Some(TokenType::DArrow);
2040        }
2041
2042        // <=> (NULL-safe equality - MySQL)
2043        if c == '<' && next == '=' && third == '>' {
2044            self.advance();
2045            self.advance();
2046            self.advance();
2047            return Some(TokenType::NullsafeEq);
2048        }
2049
2050        // <-> (Distance operator - PostgreSQL)
2051        if c == '<' && next == '-' && third == '>' {
2052            self.advance();
2053            self.advance();
2054            self.advance();
2055            return Some(TokenType::LrArrow);
2056        }
2057
2058        // <@ (Contained by - PostgreSQL)
2059        if c == '<' && next == '@' {
2060            self.advance();
2061            self.advance();
2062            return Some(TokenType::LtAt);
2063        }
2064
2065        // @> (Contains - PostgreSQL)
2066        if c == '@' && next == '>' {
2067            self.advance();
2068            self.advance();
2069            return Some(TokenType::AtGt);
2070        }
2071
2072        // ~~~ (Glob - PostgreSQL)
2073        if c == '~' && next == '~' && third == '~' {
2074            self.advance();
2075            self.advance();
2076            self.advance();
2077            return Some(TokenType::Glob);
2078        }
2079
2080        // ~~* (ILike - PostgreSQL)
2081        if c == '~' && next == '~' && third == '*' {
2082            self.advance();
2083            self.advance();
2084            self.advance();
2085            return Some(TokenType::ILike);
2086        }
2087
2088        // !~~* (Not ILike - PostgreSQL)
2089        let fourth = if self.current + 3 < self.size {
2090            self.chars[self.current + 3]
2091        } else {
2092            '\0'
2093        };
2094        if c == '!' && next == '~' && third == '~' && fourth == '*' {
2095            self.advance();
2096            self.advance();
2097            self.advance();
2098            self.advance();
2099            return Some(TokenType::NotILike);
2100        }
2101
2102        // !~~ (Not Like - PostgreSQL)
2103        if c == '!' && next == '~' && third == '~' {
2104            self.advance();
2105            self.advance();
2106            self.advance();
2107            return Some(TokenType::NotLike);
2108        }
2109
2110        // !~* (Not Regexp ILike - PostgreSQL)
2111        if c == '!' && next == '~' && third == '*' {
2112            self.advance();
2113            self.advance();
2114            self.advance();
2115            return Some(TokenType::NotIRLike);
2116        }
2117
2118        // !:> (Not cast / Try cast - SingleStore)
2119        if c == '!' && next == ':' && third == '>' {
2120            self.advance();
2121            self.advance();
2122            self.advance();
2123            return Some(TokenType::NColonGt);
2124        }
2125
2126        // ?:: (TRY_CAST shorthand - Databricks)
2127        if c == '?' && next == ':' && third == ':' {
2128            self.advance();
2129            self.advance();
2130            self.advance();
2131            return Some(TokenType::QDColon);
2132        }
2133
2134        // !~ (Not Regexp - PostgreSQL)
2135        if c == '!' && next == '~' {
2136            self.advance();
2137            self.advance();
2138            return Some(TokenType::NotRLike);
2139        }
2140
2141        // ~~ (Like - PostgreSQL)
2142        if c == '~' && next == '~' {
2143            self.advance();
2144            self.advance();
2145            return Some(TokenType::Like);
2146        }
2147
2148        // ~* (Regexp ILike - PostgreSQL)
2149        if c == '~' && next == '*' {
2150            self.advance();
2151            self.advance();
2152            return Some(TokenType::IRLike);
2153        }
2154
2155        // SingleStore three-character JSON path operators (must be checked before :: two-char)
2156        // ::$ (JSON extract string), ::% (JSON extract double), ::? (JSON match)
2157        if c == ':' && next == ':' && third == '$' {
2158            self.advance();
2159            self.advance();
2160            self.advance();
2161            return Some(TokenType::DColonDollar);
2162        }
2163        if c == ':' && next == ':' && third == '%' {
2164            self.advance();
2165            self.advance();
2166            self.advance();
2167            return Some(TokenType::DColonPercent);
2168        }
2169        if c == ':' && next == ':' && third == '?' {
2170            self.advance();
2171            self.advance();
2172            self.advance();
2173            return Some(TokenType::DColonQMark);
2174        }
2175
2176        // Two-character operators
2177        let token_type = match (c, next) {
2178            ('.', ':') => Some(TokenType::DotColon),
2179            ('=', '=') => Some(TokenType::Eq), // Hive/Spark == equality operator
2180            ('<', '=') => Some(TokenType::Lte),
2181            ('>', '=') => Some(TokenType::Gte),
2182            ('!', '=') => Some(TokenType::Neq),
2183            ('<', '>') => Some(TokenType::Neq),
2184            ('^', '=') => Some(TokenType::Neq),
2185            ('<', '<') => Some(TokenType::LtLt),
2186            ('>', '>') => Some(TokenType::GtGt),
2187            ('|', '|') => Some(TokenType::DPipe),
2188            ('|', '/') => Some(TokenType::PipeSlash), // Square root - PostgreSQL
2189            (':', ':') => Some(TokenType::DColon),
2190            (':', '=') => Some(TokenType::ColonEq), // := (assignment, named args)
2191            (':', '>') => Some(TokenType::ColonGt), // ::> (TSQL)
2192            ('-', '>') => Some(TokenType::Arrow),   // JSON object access
2193            ('=', '>') => Some(TokenType::FArrow),  // Fat arrow (lambda)
2194            ('&', '&') => Some(TokenType::DAmp),
2195            ('&', '<') => Some(TokenType::AmpLt), // PostgreSQL range operator
2196            ('&', '>') => Some(TokenType::AmpGt), // PostgreSQL range operator
2197            ('@', '@') => Some(TokenType::AtAt),  // Text search match
2198            ('?', '|') => Some(TokenType::QMarkPipe), // JSONB contains any key
2199            ('?', '&') => Some(TokenType::QMarkAmp), // JSONB contains all keys
2200            ('?', '?') => Some(TokenType::DQMark), // Double question mark
2201            ('#', '>') => Some(TokenType::HashArrow), // JSONB path extraction
2202            ('#', '-') => Some(TokenType::HashDash), // JSONB delete
2203            ('^', '@') => Some(TokenType::CaretAt), // PostgreSQL starts-with operator
2204            ('*', '*') => Some(TokenType::DStar), // Power operator
2205            ('|', '>') => Some(TokenType::PipeGt), // Pipe-greater (some dialects)
2206            _ => None,
2207        };
2208
2209        if token_type.is_some() {
2210            self.advance();
2211            self.advance();
2212        }
2213
2214        token_type
2215    }
2216
2217    fn scan_string(&mut self) -> Result<()> {
2218        self.advance(); // Opening quote
2219        let mut value = String::new();
2220
2221        while !self.is_at_end() {
2222            let c = self.peek();
2223            if c == '\'' {
2224                if self.peek_next() == '\'' {
2225                    // Escaped quote
2226                    value.push('\'');
2227                    self.advance();
2228                    self.advance();
2229                } else {
2230                    break;
2231                }
2232            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2233                // Handle escape sequences
2234                self.advance(); // Consume the backslash
2235                if !self.is_at_end() {
2236                    let escaped = self.advance();
2237                    match escaped {
2238                        'n' => value.push('\n'),
2239                        'r' => value.push('\r'),
2240                        't' => value.push('\t'),
2241                        '0' => value.push('\0'),
2242                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2243                        'a' => value.push('\x07'), // Alert/bell
2244                        'b' => value.push('\x08'), // Backspace
2245                        'f' => value.push('\x0C'), // Form feed
2246                        'v' => value.push('\x0B'), // Vertical tab
2247                        'x' => {
2248                            // Hex escape: \xNN (exactly 2 hex digits)
2249                            let mut hex = String::with_capacity(2);
2250                            for _ in 0..2 {
2251                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2252                                    hex.push(self.advance());
2253                                }
2254                            }
2255                            if hex.len() == 2 {
2256                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2257                                    value.push(byte as char);
2258                                } else {
2259                                    value.push('\\');
2260                                    value.push('x');
2261                                    value.push_str(&hex);
2262                                }
2263                            } else {
2264                                // Not enough hex digits, preserve literally
2265                                value.push('\\');
2266                                value.push('x');
2267                                value.push_str(&hex);
2268                            }
2269                        }
2270                        '\\' => value.push('\\'),
2271                        '\'' => value.push('\''),
2272                        '"' => value.push('"'),
2273                        '%' => {
2274                            // MySQL: \% in LIKE patterns
2275                            value.push('%');
2276                        }
2277                        '_' => {
2278                            // MySQL: \_ in LIKE patterns
2279                            value.push('_');
2280                        }
2281                        // For unrecognized escape sequences:
2282                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2283                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2284                        _ => {
2285                            if !self.config.escape_follow_chars.is_empty() {
2286                                // MySQL-style: discard backslash for unrecognized escapes
2287                                value.push(escaped);
2288                            } else {
2289                                // Standard: preserve backslash + char
2290                                value.push('\\');
2291                                value.push(escaped);
2292                            }
2293                        }
2294                    }
2295                }
2296            } else {
2297                value.push(self.advance());
2298            }
2299        }
2300
2301        if self.is_at_end() {
2302            return Err(Error::tokenize(
2303                "Unterminated string",
2304                self.line,
2305                self.column,
2306                self.start,
2307                self.current,
2308            ));
2309        }
2310
2311        self.advance(); // Closing quote
2312        self.add_token_with_text(TokenType::String, value);
2313        Ok(())
2314    }
2315
2316    /// Scan a double-quoted string (for dialects like BigQuery where " is a string delimiter)
2317    fn scan_double_quoted_string(&mut self) -> Result<()> {
2318        self.advance(); // Opening quote
2319        let mut value = String::new();
2320
2321        while !self.is_at_end() {
2322            let c = self.peek();
2323            if c == '"' {
2324                if self.peek_next() == '"' {
2325                    // Escaped quote
2326                    value.push('"');
2327                    self.advance();
2328                    self.advance();
2329                } else {
2330                    break;
2331                }
2332            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2333                // Handle escape sequences
2334                self.advance(); // Consume the backslash
2335                if !self.is_at_end() {
2336                    let escaped = self.advance();
2337                    match escaped {
2338                        'n' => value.push('\n'),
2339                        'r' => value.push('\r'),
2340                        't' => value.push('\t'),
2341                        '0' => value.push('\0'),
2342                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2343                        'a' => value.push('\x07'), // Alert/bell
2344                        'b' => value.push('\x08'), // Backspace
2345                        'f' => value.push('\x0C'), // Form feed
2346                        'v' => value.push('\x0B'), // Vertical tab
2347                        'x' => {
2348                            // Hex escape: \xNN (exactly 2 hex digits)
2349                            let mut hex = String::with_capacity(2);
2350                            for _ in 0..2 {
2351                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2352                                    hex.push(self.advance());
2353                                }
2354                            }
2355                            if hex.len() == 2 {
2356                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2357                                    value.push(byte as char);
2358                                } else {
2359                                    value.push('\\');
2360                                    value.push('x');
2361                                    value.push_str(&hex);
2362                                }
2363                            } else {
2364                                // Not enough hex digits, preserve literally
2365                                value.push('\\');
2366                                value.push('x');
2367                                value.push_str(&hex);
2368                            }
2369                        }
2370                        '\\' => value.push('\\'),
2371                        '\'' => value.push('\''),
2372                        '"' => value.push('"'),
2373                        '%' => {
2374                            // MySQL: \% in LIKE patterns
2375                            value.push('%');
2376                        }
2377                        '_' => {
2378                            // MySQL: \_ in LIKE patterns
2379                            value.push('_');
2380                        }
2381                        // For unrecognized escape sequences:
2382                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2383                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2384                        _ => {
2385                            if !self.config.escape_follow_chars.is_empty() {
2386                                // MySQL-style: discard backslash for unrecognized escapes
2387                                value.push(escaped);
2388                            } else {
2389                                // Standard: preserve backslash + char
2390                                value.push('\\');
2391                                value.push(escaped);
2392                            }
2393                        }
2394                    }
2395                }
2396            } else {
2397                value.push(self.advance());
2398            }
2399        }
2400
2401        if self.is_at_end() {
2402            return Err(Error::tokenize(
2403                "Unterminated double-quoted string",
2404                self.line,
2405                self.column,
2406                self.start,
2407                self.current,
2408            ));
2409        }
2410
2411        self.advance(); // Closing quote
2412        self.add_token_with_text(TokenType::String, value);
2413        Ok(())
2414    }
2415
2416    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2417        // Advance past the three opening quotes
2418        self.advance();
2419        self.advance();
2420        self.advance();
2421        let mut value = String::new();
2422
2423        while !self.is_at_end() {
2424            // Check for closing triple quote
2425            if self.peek() == quote_char
2426                && self.current + 1 < self.size
2427                && self.chars[self.current + 1] == quote_char
2428                && self.current + 2 < self.size
2429                && self.chars[self.current + 2] == quote_char
2430            {
2431                // Found closing """
2432                break;
2433            }
2434            value.push(self.advance());
2435        }
2436
2437        if self.is_at_end() {
2438            return Err(Error::tokenize(
2439                "Unterminated triple-quoted string",
2440                self.line,
2441                self.column,
2442                self.start,
2443                self.current,
2444            ));
2445        }
2446
2447        // Advance past the three closing quotes
2448        self.advance();
2449        self.advance();
2450        self.advance();
2451        let token_type = if quote_char == '"' {
2452            TokenType::TripleDoubleQuotedString
2453        } else {
2454            TokenType::TripleSingleQuotedString
2455        };
2456        self.add_token_with_text(token_type, value);
2457        Ok(())
2458    }
2459
2460    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2461        self.advance(); // Opening quote
2462        let mut value = String::new();
2463
2464        loop {
2465            if self.is_at_end() {
2466                return Err(Error::tokenize(
2467                    "Unterminated identifier",
2468                    self.line,
2469                    self.column,
2470                    self.start,
2471                    self.current,
2472                ));
2473            }
2474            if self.peek() == end_quote {
2475                if self.peek_next() == end_quote {
2476                    // Escaped quote (e.g., "" inside "x""y") -> store single quote
2477                    value.push(end_quote);
2478                    self.advance(); // skip first quote
2479                    self.advance(); // skip second quote
2480                } else {
2481                    // End of identifier
2482                    break;
2483                }
2484            } else {
2485                value.push(self.peek());
2486                self.advance();
2487            }
2488        }
2489
2490        self.advance(); // Closing quote
2491        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2492        Ok(())
2493    }
2494
2495    /// Scan a string delimited by Unicode curly single quotes (U+2018/U+2019).
2496    /// Content between curly quotes is literal (no escape processing).
2497    /// When opened with \u{2018} (left), close with \u{2019} (right) only.
2498    /// When opened with \u{2019} (right), close with \u{2019} (right) — self-closing.
2499    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2500        self.advance(); // Opening curly quote
2501        let start = self.current;
2502        // Determine closing quote: left opens -> right closes; right opens -> right closes
2503        let close_quote = if open_quote == '\u{2018}' {
2504            '\u{2019}' // left opens, right closes
2505        } else {
2506            '\u{2019}' // right quote also closes with right quote
2507        };
2508        while !self.is_at_end() && self.peek() != close_quote {
2509            self.advance();
2510        }
2511        let value = self.text_from_range(start, self.current);
2512        if !self.is_at_end() {
2513            self.advance(); // Closing quote
2514        }
2515        self.add_token_with_text(TokenType::String, value);
2516        Ok(())
2517    }
2518
2519    /// Scan an identifier delimited by Unicode curly double quotes (U+201C/U+201D).
2520    /// When opened with \u{201C} (left), close with \u{201D} (right) only.
2521    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2522        self.advance(); // Opening curly quote
2523        let start = self.current;
2524        let close_quote = if open_quote == '\u{201C}' {
2525            '\u{201D}' // left opens, right closes
2526        } else {
2527            '\u{201D}' // right also closes with right
2528        };
2529        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2530            self.advance();
2531        }
2532        let value = self.text_from_range(start, self.current);
2533        if !self.is_at_end() {
2534            self.advance(); // Closing quote
2535        }
2536        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2537        Ok(())
2538    }
2539
    /// Scan a numeric literal starting at a digit.
    ///
    /// Depending on dialect configuration this handles:
    /// - `0x`/`0X` hex literals (emitted as `HexNumber`, `HexString`, or a
    ///   `Number` for hex floats like `0xA.Bp2`),
    /// - underscore digit separators (`1_000_000`),
    /// - decimal and exponent parts, including trailing-dot floats (`1.`),
    /// - numeric type suffixes (`1L`, `1.5BD` in Hive/Spark), encoded as
    ///   `"<number>::<TYPE>"` in the token text for the parser to split,
    /// - identifiers that begin with a digit (`1a` in Hive/Spark/MySQL).
    fn scan_number(&mut self) -> Result<()> {
        // Check for 0x/0X hex number prefix (SQLite-style)
        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
            let next = if self.current + 1 < self.size {
                self.chars[self.current + 1]
            } else {
                '\0'
            };
            if next == 'x' || next == 'X' {
                // Advance past '0' and 'x'/'X'
                self.advance();
                self.advance();
                // Collect hex digits (allow underscores as separators, e.g., 0xbad_cafe)
                let hex_start = self.current;
                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
                    // An underscore must be followed by a hex digit to count as a separator
                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
                        break;
                    }
                    self.advance();
                }
                if self.current > hex_start {
                    // Check for hex float: 0xABC.DEFpEXP or 0xABCpEXP
                    let mut is_hex_float = false;
                    // Optional fractional part: .hexdigits
                    if !self.is_at_end() && self.peek() == '.' {
                        let after_dot = if self.current + 1 < self.size {
                            self.chars[self.current + 1]
                        } else {
                            '\0'
                        };
                        if after_dot.is_ascii_hexdigit() {
                            is_hex_float = true;
                            self.advance(); // consume '.'
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                self.advance();
                            }
                        }
                    }
                    // Optional binary exponent: p/P [+/-] digits
                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
                        is_hex_float = true;
                        self.advance(); // consume p/P
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
                            self.advance();
                        }
                        while !self.is_at_end() && self.peek().is_ascii_digit() {
                            self.advance();
                        }
                    }
                    if is_hex_float {
                        // Hex float literal — emit as regular Number token with full text
                        let full_text = self.text_from_range(self.start, self.current);
                        self.add_token_with_text(TokenType::Number, full_text);
                    } else if self.config.hex_string_is_integer_type {
                        // BigQuery/ClickHouse: 0xA represents an integer in hex notation
                        let hex_value = self.text_from_range(hex_start, self.current);
                        self.add_token_with_text(TokenType::HexNumber, hex_value);
                    } else {
                        // SQLite/Teradata: 0xCC represents a binary/blob hex string
                        let hex_value = self.text_from_range(hex_start, self.current);
                        self.add_token_with_text(TokenType::HexString, hex_value);
                    }
                    return Ok(());
                }
                // No hex digits after 0x - fall through to normal number parsing
                // (reset current back to after '0')
                self.current = self.start + 1;
            }
        }

        // Allow underscores as digit separators (e.g., 20_000, 1_000_000)
        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            // Don't allow underscore at the end (must be followed by digit)
            if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
                break;
            }
            self.advance();
        }

        // Look for decimal part - allow trailing dot (e.g., "1.")
        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
        // So we always consume the dot as part of the number, even if followed by an identifier
        if self.peek() == '.' {
            let next = self.peek_next();
            // Only consume the dot if:
            // 1. Followed by a digit (normal decimal like 1.5)
            // 2. Followed by an identifier start (like 1.x -> becomes 1. with alias x)
            // 3. End of input or other non-dot character (trailing decimal like "1.")
            // Do NOT consume if it's a double dot (..) which is a range operator
            if next != '.' {
                self.advance(); // consume the .
                                // Only consume digits after the decimal point (not identifiers)
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                        break;
                    }
                    self.advance();
                }
            }
        }

        // Look for exponent
        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let text = self.text_from_range(self.start, self.current);

        // Check for numeric literal suffixes (e.g., 1L -> BIGINT, 1s -> SMALLINT in Hive/Spark)
        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
            let next_char: String = self.peek().to_ascii_uppercase().to_string();
            // Try 2-char suffix first (e.g., "BD"), then 1-char
            let suffix_match = if self.current + 1 < self.size {
                let two_char: String = [
                    self.chars[self.current].to_ascii_uppercase(),
                    self.chars[self.current + 1].to_ascii_uppercase(),
                ]
                .iter()
                .collect();
                if self.config.numeric_literals.contains_key(&two_char) {
                    // Make sure the 2-char suffix is not followed by more identifier chars
                    let after_suffix = if self.current + 2 < self.size {
                        self.chars[self.current + 2]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((two_char, 2))
                    } else {
                        None
                    }
                } else if self.config.numeric_literals.contains_key(&next_char) {
                    // 1-char suffix - make sure not followed by more identifier chars
                    let after_suffix = if self.current + 1 < self.size {
                        self.chars[self.current + 1]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((next_char, 1))
                    } else {
                        None
                    }
                } else {
                    None
                }
            } else if self.config.numeric_literals.contains_key(&next_char) {
                // At end of input, 1-char suffix
                Some((next_char, 1))
            } else {
                None
            };

            if let Some((suffix, len)) = suffix_match {
                // Consume the suffix characters
                for _ in 0..len {
                    self.advance();
                }
                // Emit as a special number-with-suffix token
                // We'll encode as "number::TYPE" so the parser can split it
                let type_name = self
                    .config
                    .numeric_literals
                    .get(&suffix)
                    .expect("suffix verified by contains_key above")
                    .clone();
                let combined = format!("{}::{}", text, type_name);
                self.add_token_with_text(TokenType::Number, combined);
                return Ok(());
            }
        }

        // Check for identifiers that start with a digit (e.g., 1a, 1_a, 1a_1a)
        // In Hive/Spark/MySQL/ClickHouse, these are valid unquoted identifiers
        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
            let next = self.peek();
            if next.is_alphabetic() || next == '_' {
                // Continue scanning as an identifier
                while !self.is_at_end() {
                    let ch = self.peek();
                    if ch.is_alphanumeric() || ch == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                let ident_text = self.text_from_range(self.start, self.current);
                self.add_token_with_text(TokenType::Identifier, ident_text);
                return Ok(());
            }
        }

        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }
2744
2745    /// Scan a number that starts with a dot (e.g., .25, .5, .123e10)
2746    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2747        // Consume the leading dot
2748        self.advance();
2749
2750        // Consume the fractional digits
2751        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2752            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2753                break;
2754            }
2755            self.advance();
2756        }
2757
2758        // Look for exponent
2759        if self.peek() == 'e' || self.peek() == 'E' {
2760            self.advance();
2761            if self.peek() == '+' || self.peek() == '-' {
2762                self.advance();
2763            }
2764            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2765                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2766                    break;
2767                }
2768                self.advance();
2769            }
2770        }
2771
2772        let text = self.text_from_range(self.start, self.current);
2773        self.add_token_with_text(TokenType::Number, text);
2774        Ok(())
2775    }
2776
2777    /// Look up a keyword using a stack buffer for ASCII uppercasing, avoiding heap allocation.
2778    /// Returns `TokenType::Var` for texts longer than 128 bytes or non-UTF-8 results.
2779    #[inline]
2780    fn lookup_keyword_ascii(keywords: &HashMap<String, TokenType>, text: &str) -> TokenType {
2781        if text.len() > 128 {
2782            return TokenType::Var;
2783        }
2784        let mut buf = [0u8; 128];
2785        for (i, b) in text.bytes().enumerate() {
2786            buf[i] = b.to_ascii_uppercase();
2787        }
2788        if let Ok(upper) = std::str::from_utf8(&buf[..text.len()]) {
2789            keywords.get(upper).copied().unwrap_or(TokenType::Var)
2790        } else {
2791            TokenType::Var
2792        }
2793    }
2794
    /// Scan an unquoted identifier or keyword.
    ///
    /// Also dispatches dialect-specific prefixed strings when the identifier
    /// text is a known prefix immediately followed by a quote: `N'...'`
    /// (national), `E'...'` (PostgreSQL escape), `X'...'` (hex), `B'...'`
    /// (bit/byte), `U&'...'` (Unicode), and `r'...'`/`r"..."` (raw, including
    /// triple-quoted). `NOT=` is special-cased to a `Neq` token (Teradata).
    /// Identifier continuation characters include `$` (PostgreSQL) and
    /// `#`/`@` (TSQL), with care not to swallow the PostgreSQL `#>`/`#>>`/`#-`
    /// operators.
    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
        // Guard against unrecognized characters that could cause infinite loops
        let first_char = self.peek();
        if !first_char.is_alphanumeric() && first_char != '_' {
            // Unknown character - skip it and return an error
            let c = self.advance();
            return Err(Error::tokenize(
                format!("Unexpected character: '{}'", c),
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        while !self.is_at_end() {
            let c = self.peek();
            // Allow alphanumeric, underscore, $, # and @ in identifiers
            // PostgreSQL allows $, TSQL allows # and @
            // But stop consuming # if followed by > or >> (PostgreSQL #> and #>> operators)
            if c == '#' {
                let next_c = if self.current + 1 < self.size {
                    self.chars[self.current + 1]
                } else {
                    '\0'
                };
                if next_c == '>' || next_c == '-' {
                    break; // Don't consume # — it's part of #>, #>>, or #- operator
                }
                self.advance();
            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text = self.text_from_range(self.start, self.current);

        // Special-case NOT= (Teradata and other dialects)
        if text.eq_ignore_ascii_case("NOT") && self.peek() == '=' {
            self.advance(); // consume '='
            self.add_token(TokenType::Neq);
            return Ok(());
        }

        // Check for special string prefixes like N'...', X'...', B'...', U&'...', r'...', b'...'
        // Also handle double-quoted variants for dialects that support them (e.g., BigQuery)
        let next_char = self.peek();
        let is_single_quote = next_char == '\'';
        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
        // For raw strings (r"..." or r'...'), we allow double quotes even if " is not in quotes config
        // because raw strings are a special case used in Spark/Databricks where " is for identifiers
        let is_double_quote_for_raw = next_char == '"';

        // Handle raw strings first - they're special because they work with both ' and "
        // even in dialects where " is normally an identifier delimiter (like Databricks)
        if text.eq_ignore_ascii_case("R") && (is_single_quote || is_double_quote_for_raw) {
            // Raw string r'...' or r"..." or r'''...''' or r"""...""" (BigQuery style)
            // In raw strings, backslashes are treated literally (no escape processing)
            let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the first opening quote

            // Check for triple-quoted raw string (r"""...""" or r'''...''')
            if self.peek() == quote_char && self.peek_next() == quote_char {
                // Triple-quoted raw string
                self.advance(); // consume second quote
                self.advance(); // consume third quote
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            } else {
                let string_value = self.scan_raw_string_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            }
            return Ok(());
        }

        if is_single_quote || is_double_quote {
            if text.eq_ignore_ascii_case("N") {
                // National string N'...'
                self.advance(); // consume the opening quote
                let string_value = if is_single_quote {
                    self.scan_string_content()?
                } else {
                    self.scan_double_quoted_string_content()?
                };
                self.add_token_with_text(TokenType::NationalString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("E") {
                // PostgreSQL escape string E'...' or e'...'
                // Preserve the case by prefixing with "e:" or "E:"
                // Always use backslash escapes for escape strings (e.g., \' is an escaped quote)
                let lowercase = text == "e";
                let prefix = if lowercase { "e:" } else { "E:" };
                self.advance(); // consume the opening quote
                let string_value = self.scan_string_content_with_escapes(true)?;
                self.add_token_with_text(
                    TokenType::EscapeString,
                    format!("{}{}", prefix, string_value),
                );
                return Ok(());
            } else if text.eq_ignore_ascii_case("X") {
                // Hex string X'...'
                self.advance(); // consume the opening quote
                let string_value = if is_single_quote {
                    self.scan_string_content()?
                } else {
                    self.scan_double_quoted_string_content()?
                };
                self.add_token_with_text(TokenType::HexString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("B") && is_double_quote {
                // Byte string b"..." (BigQuery style) - MUST check before single quote B'...'
                self.advance(); // consume the opening quote
                let string_value = self.scan_double_quoted_string_content()?;
                self.add_token_with_text(TokenType::ByteString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("B") && is_single_quote {
                // For BigQuery: b'...' is a byte string (bytes data)
                // For standard SQL: B'...' is a bit string (binary digits)
                self.advance(); // consume the opening quote
                let string_value = self.scan_string_content()?;
                if self.config.b_prefix_is_byte_string {
                    self.add_token_with_text(TokenType::ByteString, string_value);
                } else {
                    self.add_token_with_text(TokenType::BitString, string_value);
                }
                return Ok(());
            }
        }

        // Check for U&'...' Unicode string syntax (SQL standard)
        if text.eq_ignore_ascii_case("U")
            && self.peek() == '&'
            && self.current + 1 < self.size
            && self.chars[self.current + 1] == '\''
        {
            self.advance(); // consume '&'
            self.advance(); // consume opening quote
            let string_value = self.scan_string_content()?;
            self.add_token_with_text(TokenType::UnicodeString, string_value);
            return Ok(());
        }

        // Plain identifier/keyword: resolve against the dialect keyword table.
        let token_type = Self::lookup_keyword_ascii(&self.config.keywords, &text);

        self.add_token_with_text(token_type, text);
        Ok(())
    }
2944
2945    /// Scan string content (everything between quotes)
2946    /// If `force_backslash_escapes` is true, backslash is always treated as an escape character
2947    /// (used for PostgreSQL E'...' escape strings)
2948    fn scan_string_content_with_escapes(
2949        &mut self,
2950        force_backslash_escapes: bool,
2951    ) -> Result<String> {
2952        let mut value = String::new();
2953        let use_backslash_escapes =
2954            force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2955
2956        while !self.is_at_end() {
2957            let c = self.peek();
2958            if c == '\'' {
2959                if self.peek_next() == '\'' {
2960                    // Escaped quote ''
2961                    value.push('\'');
2962                    self.advance();
2963                    self.advance();
2964                } else {
2965                    break;
2966                }
2967            } else if c == '\\' && use_backslash_escapes {
2968                // Preserve escape sequences literally (including \' for escape strings)
2969                value.push(self.advance());
2970                if !self.is_at_end() {
2971                    value.push(self.advance());
2972                }
2973            } else {
2974                value.push(self.advance());
2975            }
2976        }
2977
2978        if self.is_at_end() {
2979            return Err(Error::tokenize(
2980                "Unterminated string",
2981                self.line,
2982                self.column,
2983                self.start,
2984                self.current,
2985            ));
2986        }
2987
2988        self.advance(); // Closing quote
2989        Ok(value)
2990    }
2991
    /// Scan string content (everything between quotes).
    /// Convenience wrapper: backslash escapes are honored only when the
    /// dialect's `string_escapes` config includes `\`.
    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }
2996
    /// Scan double-quoted string content (for dialects like BigQuery where " is a string delimiter)
    /// This is used for prefixed strings like b"..." or N"...".
    ///
    /// Unlike `scan_string_content_with_escapes`, recognized backslash escapes
    /// (`\n`, `\r`, `\t`, `\0`, `\\`, `\"`, `\'`, `\xNN`) are decoded here
    /// rather than preserved verbatim; unrecognized escapes keep the backslash.
    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Escaped quote ""
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                // Handle escape sequences
                self.advance(); // Consume backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        '\\' => value.push('\\'),
                        '"' => value.push('"'),
                        '\'' => value.push('\''),
                        'x' => {
                            // Hex escape \xNN - collect hex digits (up to two; fewer
                            // are tolerated and parsed as-is)
                            let mut hex = String::new();
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                // NOTE(review): for NN >= 0x80 this yields the Unicode
                                // scalar U+00NN (re-encoded as UTF-8), not a raw byte —
                                // confirm this is the intended semantics.
                                value.push(byte as char);
                            } else {
                                // Invalid hex escape, keep it literal
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        _ => {
                            // For unrecognized escapes, preserve backslash + char
                            value.push('\\');
                            value.push(escaped);
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // Closing quote
        Ok(value)
    }
3069
3070    /// Scan raw string content (limited escape processing for quotes)
3071    /// Used for BigQuery r'...' and r"..." strings
3072    /// In raw strings, backslashes are literal EXCEPT that escape sequences for the
3073    /// quote character still work (e.g., \' in r'...' escapes the quote, '' also works)
3074    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3075        let mut value = String::new();
3076
3077        while !self.is_at_end() {
3078            let c = self.peek();
3079            if c == quote_char {
3080                if self.peek_next() == quote_char {
3081                    // Escaped quote (doubled) - e.g., '' inside r'...'
3082                    value.push(quote_char);
3083                    self.advance();
3084                    self.advance();
3085                } else {
3086                    break;
3087                }
3088            } else if c == '\\'
3089                && self.peek_next() == quote_char
3090                && self.config.string_escapes_allowed_in_raw_strings
3091            {
3092                // Backslash-escaped quote - works in raw strings when string_escapes_allowed_in_raw_strings is true
3093                // e.g., \' inside r'...' becomes literal ' (BigQuery behavior)
3094                // Spark/Databricks has this set to false, so backslash is always literal there
3095                value.push(quote_char);
3096                self.advance(); // consume backslash
3097                self.advance(); // consume quote
3098            } else {
3099                // In raw strings, everything including backslashes is literal
3100                value.push(self.advance());
3101            }
3102        }
3103
3104        if self.is_at_end() {
3105            return Err(Error::tokenize(
3106                "Unterminated raw string",
3107                self.line,
3108                self.column,
3109                self.start,
3110                self.current,
3111            ));
3112        }
3113
3114        self.advance(); // Closing quote
3115        Ok(value)
3116    }
3117
3118    /// Scan raw triple-quoted string content (r"""...""" or r'''...''')
3119    /// Terminates when three consecutive quote_chars are found
3120    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3121        let mut value = String::new();
3122
3123        while !self.is_at_end() {
3124            let c = self.peek();
3125            if c == quote_char && self.peek_next() == quote_char {
3126                // Check for third quote
3127                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3128                    // Found three consecutive quotes - end of string
3129                    self.advance(); // first closing quote
3130                    self.advance(); // second closing quote
3131                    self.advance(); // third closing quote
3132                    return Ok(value);
3133                }
3134            }
3135            // In raw strings, everything including backslashes is literal
3136            let ch = self.advance();
3137            value.push(ch);
3138        }
3139
3140        Err(Error::tokenize(
3141            "Unterminated raw triple-quoted string",
3142            self.line,
3143            self.column,
3144            self.start,
3145            self.current,
3146        ))
3147    }
3148
3149    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables)
3150    /// Examples: #temp, ##global_temp, @variable
3151    /// Scan an identifier that starts with `$` (ClickHouse).
3152    /// Examples: `$alias$name$`, `$x`
3153    fn scan_dollar_identifier(&mut self) -> Result<()> {
3154        // Consume the leading $
3155        self.advance();
3156
3157        // Consume alphanumeric, _, and $ continuation chars
3158        while !self.is_at_end() {
3159            let c = self.peek();
3160            if c.is_alphanumeric() || c == '_' || c == '$' {
3161                self.advance();
3162            } else {
3163                break;
3164            }
3165        }
3166
3167        let text = self.text_from_range(self.start, self.current);
3168        self.add_token_with_text(TokenType::Var, text);
3169        Ok(())
3170    }
3171
    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables).
    /// Examples: #temp, ##global_temp, @variable
    fn scan_tsql_identifier(&mut self) -> Result<()> {
        // Consume the leading # or @ (or ##)
        let first = self.advance();

        // For ##, consume the second #
        if first == '#' && self.peek() == '#' {
            self.advance();
        }

        // Now scan the rest of the identifier
        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text = self.text_from_range(self.start, self.current);
        // These are always identifiers (variables or temp table names), never keywords
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }
3196
    /// Check if the last tokens match INSERT ... FORMAT <name> (not VALUES).
    /// If so, consume everything until the next blank line (two consecutive newlines)
    /// or end of input as raw data, and return it trimmed.
    ///
    /// Returns `None` when the token lookback does not match the
    /// INSERT ... FORMAT pattern, or when the trailing data is empty.
    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
        let len = self.tokens.len();
        // Need at least INSERT ... FORMAT <name> — three tokens.
        if len < 3 {
            return None;
        }

        // Last token should be the format name (Identifier or Var, not VALUES)
        let last = &self.tokens[len - 1];
        if last.text.eq_ignore_ascii_case("VALUES") {
            return None;
        }
        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
            return None;
        }

        // Second-to-last should be FORMAT
        let format_tok = &self.tokens[len - 2];
        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
            return None;
        }

        // Check that there's an INSERT somewhere earlier in the tokens
        // (only look back a bounded number of tokens to stay cheap)
        let has_insert = self.tokens[..len - 2]
            .iter()
            .rev()
            .take(20)
            .any(|t| t.token_type == TokenType::Insert);
        if !has_insert {
            return None;
        }

        // We're in INSERT ... FORMAT <name> context. Consume everything until:
        // - A blank line (two consecutive newlines, possibly with whitespace between)
        // - End of input
        let raw_start = self.current;
        while !self.is_at_end() {
            let c = self.peek();
            if c == '\n' {
                // Check for blank line: \n followed by optional \r and \n
                let saved = self.current;
                self.advance(); // consume first \n
                // Skip \r if present
                while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                if self.is_at_end() || self.peek() == '\n' {
                    // Found blank line or end of input - stop here
                    // Don't consume the second \n so subsequent SQL can be tokenized
                    let raw = self.text_from_range(raw_start, saved);
                    return Some(raw.trim().to_string());
                }
                // Not a blank line, continue scanning
            } else {
                self.advance();
            }
        }

        // Reached end of input
        let raw = self.text_from_range(raw_start, self.current);
        let trimmed = raw.trim().to_string();
        if trimmed.is_empty() {
            // NOTE(review): the input up to EOF has already been consumed even
            // though we return None here — confirm callers don't expect to
            // retokenize this region.
            None
        } else {
            Some(trimmed)
        }
    }
3266
3267    fn add_token(&mut self, token_type: TokenType) {
3268        let text = self.text_from_range(self.start, self.current);
3269        self.add_token_with_text(token_type, text);
3270    }
3271
3272    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3273        let span = Span::new(self.start, self.current, self.line, self.column);
3274        let mut token = Token::new(token_type, text, span);
3275        token.comments.append(&mut self.comments);
3276        self.tokens.push(token);
3277    }
3278}
3279
//! Unit tests for the tokenizer: token stream shapes, literal handling,
//! comment attachment/round-tripping, and dialect-specific behavior.
#[cfg(test)]
mod tests {
    use super::*;

    // Basic keyword + number tokenization.
    #[test]
    fn test_simple_select() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
    }

    // Bare identifiers come out as Var tokens; punctuation as Comma.
    #[test]
    fn test_select_with_identifier() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();

        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Var);
        assert_eq!(tokens[1].text, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Var);
        assert_eq!(tokens[3].text, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Var);
        assert_eq!(tokens[5].text, "t");
    }

    // String token text excludes the surrounding quotes.
    #[test]
    fn test_string_literal() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "hello");
    }

    // Doubled single quotes ('') collapse to one quote in the token text.
    #[test]
    fn test_escaped_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "it's");
    }

    #[test]
    fn test_comments() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();

        assert_eq!(tokens.len(), 2);
        // Comments are attached to the PREVIOUS token as trailing_comments
        // This is better for round-trip fidelity (e.g., SELECT c /* comment */ FROM)
        assert_eq!(tokens[0].trailing_comments.len(), 1);
        assert_eq!(tokens[0].trailing_comments[0], " comment");
    }

    #[test]
    fn test_comment_in_and_chain() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // Line comments between AND clauses should appear after the AND operator
        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
        let ast = Parser::parse_sql(sql).unwrap();
        let mut gen = Generator::default();
        let output = gen.generate(&ast[0]).unwrap();
        assert_eq!(
            output,
            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
        );
    }

    // Arithmetic operators tokenize individually; no precedence at this layer.
    #[test]
    fn test_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();

        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[1].token_type, TokenType::Plus);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[3].token_type, TokenType::Star);
        assert_eq!(tokens[4].token_type, TokenType::Number);
    }

    // Two-character comparison operators are single tokens.
    #[test]
    fn test_comparison_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();

        assert_eq!(tokens[1].token_type, TokenType::Lte);
        assert_eq!(tokens[3].token_type, TokenType::Gte);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    // N'...' produces a single NationalString token, not Var + String.
    #[test]
    fn test_national_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("N'abc'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for N'abc', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].text, "abc");
    }

    // X'...' produces a single HexString token.
    #[test]
    fn test_hex_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for X'ABCD', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::HexString);
        assert_eq!(tokens[0].text, "ABCD");
    }

    // B'...' produces a single BitString token.
    #[test]
    fn test_bit_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("B'01010'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for B'01010', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::BitString);
        assert_eq!(tokens[0].text, "01010");
    }

    // Numbers with a trailing dot: the dot belongs to the number unless
    // another dot follows (range operator case).
    #[test]
    fn test_trailing_dot_number() {
        let tokenizer = Tokenizer::default();

        // Test trailing dot
        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
        assert_eq!(
            tokens.len(),
            2,
            "Expected 2 tokens for 'SELECT 1.', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");

        // Test normal decimal
        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
        assert_eq!(tokens[1].text, "1.5");

        // Test number followed by dot and identifier
        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");
        assert_eq!(tokens[2].token_type, TokenType::Var);

        // Test two dots (range operator) - dot is NOT consumed when followed by another dot
        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
        assert_eq!(tokens[2].token_type, TokenType::Dot);
        assert_eq!(tokens[3].token_type, TokenType::Dot);
        assert_eq!(tokens[4].token_type, TokenType::Number);
        assert_eq!(tokens[4].text, "2");
    }

    // Numbers may start with a dot (.25); plain dots between identifiers
    // remain Dot tokens.
    #[test]
    fn test_leading_dot_number() {
        let tokenizer = Tokenizer::default();

        // Test leading dot number (e.g., .25 for 0.25)
        let tokens = tokenizer.tokenize(".25").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.25', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".25");

        // Test leading dot in context (Oracle SAMPLE clause)
        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
        assert_eq!(
            tokens.len(),
            4,
            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Sample);
        assert_eq!(tokens[1].token_type, TokenType::LParen);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[2].text, ".25");
        assert_eq!(tokens[3].token_type, TokenType::RParen);

        // Test leading dot with exponent
        let tokens = tokenizer.tokenize(".5e10").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.5e10', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".5e10");

        // Test that plain dot is still a Dot token
        let tokens = tokenizer.tokenize("a.b").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'a.b', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Dot);
    }

    #[test]
    fn test_unrecognized_character() {
        let tokenizer = Tokenizer::default();

        // Unicode curly quotes are now handled as string delimiters
        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
        assert!(
            result.is_ok(),
            "Curly quotes should be tokenized as strings"
        );

        // Unicode bullet character should still error
        let result = tokenizer.tokenize("SELECT • FROM t");
        assert!(result.is_err());
    }

    // ':=', ':', and '::' must tokenize as three distinct token types.
    #[test]
    fn test_colon_eq_tokenization() {
        let tokenizer = Tokenizer::default();

        // := should be a single ColonEq token
        let tokens = tokenizer.tokenize("a := 1").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].token_type, TokenType::Var);
        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
        assert_eq!(tokens[2].token_type, TokenType::Number);

        // : followed by non-= should still be Colon
        let tokens = tokenizer.tokenize("a:b").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));

        // :: should still be DColon
        let tokens = tokenizer.tokenize("a::INT").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
    }

    // End-to-end parse + generate for the various meanings of ':=' and
    // DuckDB's 'name:' prefix alias.
    #[test]
    fn test_colon_eq_parsing() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // MySQL @var := value in SELECT
        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
            .expect("Failed to parse MySQL @var := expr");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := 1, @var2");

        // MySQL @var := @var in SELECT
        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
            .expect("Failed to parse MySQL @var2 := @var1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1, @var2 := @var1");

        // MySQL @var := COUNT(*)
        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
            .expect("Failed to parse MySQL @var := COUNT(*)");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");

        // MySQL SET @var := 1 (should normalize to = in output)
        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SET @var1 = 1");

        // Function named args with :=
        let ast =
            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "UNION_VALUE(k1 := 1)");

        // UNNEST with recursive := TRUE
        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
            .expect("Failed to parse UNNEST with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");

        // DuckDB prefix alias: foo: 1 means 1 AS foo
        let ast =
            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo");

        // DuckDB prefix alias with multiple columns
        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
            .expect("Failed to parse DuckDB multiple prefix aliases");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
    }

    // Same ':=' cases but through the full dialect pipeline
    // (parse -> transform -> generate) for MySQL and DuckDB.
    #[test]
    fn test_colon_eq_dialect_roundtrip() {
        use crate::dialects::{Dialect, DialectType};

        // Roundtrip helper: `expected = None` means output must equal input.
        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
            let d = Dialect::get(dialect);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // MySQL := tests
        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

        // DuckDB := tests
        check(
            DialectType::DuckDB,
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
            None,
        );
        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

        // STRUCT_PACK(a := 'b')::json should at least parse without error
        // (The STRUCT_PACK -> Struct transformation is a separate feature)
        {
            let d = Dialect::get(DialectType::DuckDB);
            let ast = d
                .parse("STRUCT_PACK(a := 'b')::json")
                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
        }

        // DuckDB prefix alias tests
        check(
            DialectType::DuckDB,
            "SELECT foo: 1",
            Some("SELECT 1 AS foo"),
        );
        check(
            DialectType::DuckDB,
            "SELECT foo: 1, bar: 2, baz: 3",
            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
        );
    }

    // Comments in many positions must survive a parse -> generate roundtrip
    // unchanged; failures are collected so one report shows all mismatches.
    #[test]
    fn test_comment_roundtrip() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // Returns None on success, or Some(description) on failure.
        fn check_roundtrip(sql: &str) -> Option<String> {
            let ast = match Parser::parse_sql(sql) {
                Ok(a) => a,
                Err(e) => return Some(format!("Parse error: {:?}", e)),
            };
            if ast.is_empty() {
                return Some("Empty AST".to_string());
            }
            let mut generator = Generator::default();
            let output = match generator.generate(&ast[0]) {
                Ok(o) => o,
                Err(e) => return Some(format!("Gen error: {:?}", e)),
            };
            if output == sql {
                None
            } else {
                Some(format!(
                    "Mismatch:\n  input:  {}\n  output: {}",
                    sql, output
                ))
            }
        }

        let tests = vec![
            // Nested comments
            "SELECT c /* c1 /* c2 */ c3 */",
            "SELECT c /* c1 /* c2 /* c3 */ */ */",
            // Simple alias with comments
            "SELECT c /* c1 */ AS alias /* c2 */",
            // Multiple columns with comments
            "SELECT a /* x */, b /* x */",
            // Multiple comments after column
            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
            // FROM tables with comments
            "SELECT * FROM foo /* x */, bla /* x */",
            // Arithmetic with comments
            "SELECT 1 /* comment */ + 1",
            "SELECT 1 /* c1 */ + 2 /* c2 */",
            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
            // CAST with comments
            "SELECT CAST(x AS INT) /* comment */ FROM foo",
            // Function arguments with comments
            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
            // Multi-part table names with comments
            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
            // INSERT with comments
            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
            // Leading comments on statements
            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
            "/* comment */ CREATE TABLE foo AS SELECT 1",
            // Trailing comments on statements
            "INSERT INTO foo SELECT * FROM bar /* comment */",
            // Complex nested expressions with comments
            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
        ];

        let mut failures = Vec::new();
        for sql in tests {
            if let Some(e) = check_roundtrip(sql) {
                failures.push(e);
            }
        }

        if !failures.is_empty() {
            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
        }
    }

    // Dollar-quoted strings: the tag/content split helper, plus tagged and
    // untagged $$ heredocs through the Databricks dialect pipeline.
    #[test]
    fn test_dollar_quoted_string_parsing() {
        use crate::dialects::{Dialect, DialectType};

        // Test dollar string token parsing utility function
        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
        assert_eq!(tag, Some("FOO".to_string()));
        assert_eq!(content, "content here");

        let (tag, content) = super::parse_dollar_string_token("just content");
        assert_eq!(tag, None);
        assert_eq!(content, "just content");

        // Test roundtrip for Databricks dialect with dollar-quoted function body
        fn check_databricks(sql: &str, expected: Option<&str>) {
            let d = Dialect::get(DialectType::Databricks);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // Test [42]: $$...$$ heredoc
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n  return x+1$$",
            None
        );

        // Test [43]: $FOO$...$FOO$ tagged heredoc
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n  return x+1$FOO$",
            None
        );
    }
}