// polyglot_sql/tokens.rs

//! Token types and tokenization for SQL parsing
//!
//! This module defines all SQL token types and the tokenizer that converts
//! SQL strings into token streams.
5
6use crate::error::{Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::fmt;
10use std::sync::LazyLock;
11#[cfg(feature = "bindings")]
12use ts_rs::TS;
13
/// Split the text of a `DollarString` token into its optional tag and its content.
///
/// The first `'\x00'` in `text` acts as the separator: everything before it is
/// returned as the tag and everything after it as the content. Any further
/// `'\x00'` characters stay in the content. When `text` contains no `'\x00'`,
/// there is no tag and the whole text is the content.
///
/// Both parts are returned as owned `String`s.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_string()), content.to_string()),
        None => (None, text.to_string()),
    }
}
26
27/// Represents a position in the source SQL
28#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
29#[cfg_attr(feature = "bindings", derive(TS))]
30pub struct Span {
31    /// Starting byte offset
32    pub start: usize,
33    /// Ending byte offset (exclusive)
34    pub end: usize,
35    /// Line number (1-based)
36    pub line: usize,
37    /// Column number (1-based)
38    pub column: usize,
39}
40
41impl Span {
42    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
43        Self {
44            start,
45            end,
46            line,
47            column,
48        }
49    }
50}
51
52/// A token in the SQL token stream
53#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
54pub struct Token {
55    /// The type of token
56    pub token_type: TokenType,
57    /// The raw text of the token
58    pub text: String,
59    /// Position information
60    pub span: Span,
61    /// Leading comments (comments that appeared before this token)
62    #[serde(default)]
63    pub comments: Vec<String>,
64    /// Trailing comments (comments that appeared after this token, before the next one)
65    #[serde(default)]
66    pub trailing_comments: Vec<String>,
67}
68
69impl Token {
70    /// Create a new token
71    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
72        Self {
73            token_type,
74            text: text.into(),
75            span,
76            comments: Vec::new(),
77            trailing_comments: Vec::new(),
78        }
79    }
80
81    /// Create a NUMBER token
82    pub fn number(n: i64) -> Self {
83        Self::new(TokenType::Number, n.to_string(), Span::default())
84    }
85
86    /// Create a STRING token
87    pub fn string(s: impl Into<String>) -> Self {
88        Self::new(TokenType::String, s, Span::default())
89    }
90
91    /// Create an IDENTIFIER token
92    pub fn identifier(s: impl Into<String>) -> Self {
93        Self::new(TokenType::Identifier, s, Span::default())
94    }
95
96    /// Create a VAR token
97    pub fn var(s: impl Into<String>) -> Self {
98        Self::new(TokenType::Var, s, Span::default())
99    }
100
101    /// Add a comment to this token
102    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
103        self.comments.push(comment.into());
104        self
105    }
106}
107
108impl fmt::Display for Token {
109    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110        write!(f, "{:?}({})", self.token_type, self.text)
111    }
112}
113
114/// All possible token types in SQL
115#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
116#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
117#[repr(u16)]
118pub enum TokenType {
119    // Punctuation
120    LParen,
121    RParen,
122    LBracket,
123    RBracket,
124    LBrace,
125    RBrace,
126    Comma,
127    Dot,
128    Dash,
129    Plus,
130    Colon,
131    DotColon,
132    DColon,
133    DColonDollar,
134    DColonPercent,
135    DColonQMark,
136    DQMark,
137    Semicolon,
138    Star,
139    Backslash,
140    Slash,
141    Lt,
142    Lte,
143    Gt,
144    Gte,
145    Not,
146    Eq,
147    Neq,
148    NullsafeEq,
149    ColonEq,
150    ColonGt,
151    NColonGt,
152    And,
153    Or,
154    Amp,
155    DPipe,
156    PipeGt,
157    Pipe,
158    PipeSlash,
159    DPipeSlash,
160    Caret,
161    CaretAt,
162    LtLt, // <<
163    GtGt, // >>
164    Tilde,
165    Arrow,
166    DArrow,
167    FArrow,
168    Hash,
169    HashArrow,
170    DHashArrow,
171    LrArrow,
172    DAt,
173    AtAt,
174    LtAt,
175    AtGt,
176    Dollar,
177    Parameter,
178    Session,
179    SessionParameter,
180    SessionUser,
181    DAmp,
182    AmpLt,
183    AmpGt,
184    Adjacent,
185    Xor,
186    DStar,
187    QMarkAmp,
188    QMarkPipe,
189    HashDash,
190    Exclamation,
191
192    UriStart,
193    BlockStart,
194    BlockEnd,
195    Space,
196    Break,
197
198    // Comments (emitted as tokens for round-trip fidelity)
199    BlockComment, // /* ... */
200    LineComment,  // -- ...
201
202    // Literals
203    String,
204    DollarString,             // $$...$$
205    TripleDoubleQuotedString, // """..."""
206    TripleSingleQuotedString, // '''...'''
207    Number,
208    Identifier,
209    QuotedIdentifier,
210    Database,
211    Column,
212    ColumnDef,
213    Schema,
214    Table,
215    Warehouse,
216    Stage,
217    Streamlit,
218    Var,
219    BitString,
220    HexString,
221    /// Hex number: 0xA, 0xFF (BigQuery, SQLite style) - represents an integer in hex notation
222    HexNumber,
223    ByteString,
224    NationalString,
225    EscapeString, // PostgreSQL E'...' escape string
226    RawString,
227    HeredocString,
228    HeredocStringAlternative,
229    UnicodeString,
230
231    // Data Types
232    Bit,
233    Boolean,
234    TinyInt,
235    UTinyInt,
236    SmallInt,
237    USmallInt,
238    MediumInt,
239    UMediumInt,
240    Int,
241    UInt,
242    BigInt,
243    UBigInt,
244    BigNum,
245    Int128,
246    UInt128,
247    Int256,
248    UInt256,
249    Float,
250    Double,
251    UDouble,
252    Decimal,
253    Decimal32,
254    Decimal64,
255    Decimal128,
256    Decimal256,
257    DecFloat,
258    UDecimal,
259    BigDecimal,
260    Char,
261    NChar,
262    VarChar,
263    NVarChar,
264    BpChar,
265    Text,
266    MediumText,
267    LongText,
268    Blob,
269    MediumBlob,
270    LongBlob,
271    TinyBlob,
272    TinyText,
273    Name,
274    Binary,
275    VarBinary,
276    Json,
277    JsonB,
278    Time,
279    TimeTz,
280    TimeNs,
281    Timestamp,
282    TimestampTz,
283    TimestampLtz,
284    TimestampNtz,
285    TimestampS,
286    TimestampMs,
287    TimestampNs,
288    DateTime,
289    DateTime2,
290    DateTime64,
291    SmallDateTime,
292    Date,
293    Date32,
294    Int4Range,
295    Int4MultiRange,
296    Int8Range,
297    Int8MultiRange,
298    NumRange,
299    NumMultiRange,
300    TsRange,
301    TsMultiRange,
302    TsTzRange,
303    TsTzMultiRange,
304    DateRange,
305    DateMultiRange,
306    Uuid,
307    Geography,
308    GeographyPoint,
309    Nullable,
310    Geometry,
311    Point,
312    Ring,
313    LineString,
314    LocalTime,
315    LocalTimestamp,
316    SysTimestamp,
317    MultiLineString,
318    Polygon,
319    MultiPolygon,
320    HllSketch,
321    HStore,
322    Super,
323    Serial,
324    SmallSerial,
325    BigSerial,
326    Xml,
327    Year,
328    UserDefined,
329    Money,
330    SmallMoney,
331    RowVersion,
332    Image,
333    Variant,
334    Object,
335    Inet,
336    IpAddress,
337    IpPrefix,
338    Ipv4,
339    Ipv6,
340    Enum,
341    Enum8,
342    Enum16,
343    FixedString,
344    LowCardinality,
345    Nested,
346    AggregateFunction,
347    SimpleAggregateFunction,
348    TDigest,
349    Unknown,
350    Vector,
351    Dynamic,
352    Void,
353
354    // Keywords
355    Add,
356    Alias,
357    Alter,
358    All,
359    Anti,
360    Any,
361    Apply,
362    Array,
363    Asc,
364    AsOf,
365    Attach,
366    AutoIncrement,
367    Begin,
368    Between,
369    BulkCollectInto,
370    Cache,
371    Cascade,
372    Case,
373    CharacterSet,
374    Cluster,
375    ClusterBy,
376    Collate,
377    Command,
378    Comment,
379    Commit,
380    Prepare,
381    Preserve,
382    Connect,
383    ConnectBy,
384    Constraint,
385    Copy,
386    Create,
387    Cross,
388    Cube,
389    CurrentDate,
390    CurrentDateTime,
391    CurrentSchema,
392    CurrentTime,
393    CurrentTimestamp,
394    CurrentUser,
395    CurrentRole,
396    CurrentCatalog,
397    Declare,
398    Default,
399    Delete,
400    Desc,
401    Describe,
402    Detach,
403    Dictionary,
404    Distinct,
405    Distribute,
406    DistributeBy,
407    Div,
408    Drop,
409    Else,
410    End,
411    Escape,
412    Except,
413    Execute,
414    Exists,
415    False,
416    Fetch,
417    File,
418    FileFormat,
419    Filter,
420    Final,
421    First,
422    For,
423    Force,
424    ForeignKey,
425    Format,
426    From,
427    Full,
428    Function,
429    Get,
430    Glob,
431    Global,
432    Grant,
433    GroupBy,
434    GroupingSets,
435    Having,
436    Hint,
437    Ignore,
438    ILike,
439    In,
440    Index,
441    IndexedBy,
442    Inner,
443    Input,
444    Insert,
445    Install,
446    Intersect,
447    Interval,
448    Into,
449    Inpath,
450    InputFormat,
451    Introducer,
452    IRLike,
453    Is,
454    IsNull,
455    Join,
456    JoinMarker,
457    Keep,
458    Key,
459    Kill,
460    Lambda,
461    Language,
462    Lateral,
463    Left,
464    Like,
465    NotLike,   // !~~ operator (PostgreSQL)
466    NotILike,  // !~~* operator (PostgreSQL)
467    NotRLike,  // !~ operator (PostgreSQL)
468    NotIRLike, // !~* operator (PostgreSQL)
469    Limit,
470    List,
471    Load,
472    Local,
473    Lock,
474    Map,
475    Match,
476    MatchCondition,
477    MatchRecognize,
478    MemberOf,
479    Materialized,
480    Merge,
481    Mod,
482    Model,
483    Natural,
484    Next,
485    NoAction,
486    Nothing,
487    NotNull,
488    Null,
489    ObjectIdentifier,
490    Offset,
491    On,
492    Only,
493    Operator,
494    OrderBy,
495    OrderSiblingsBy,
496    Ordered,
497    Ordinality,
498    Out,
499    Outer,
500    Output,
501    Over,
502    Overlaps,
503    Overwrite,
504    Partition,
505    PartitionBy,
506    Percent,
507    Pivot,
508    Placeholder,
509    Positional,
510    Pragma,
511    Prewhere,
512    PrimaryKey,
513    Procedure,
514    Properties,
515    PseudoType,
516    Put,
517    Qualify,
518    Quote,
519    QDColon,
520    Range,
521    Recursive,
522    Refresh,
523    Rename,
524    Replace,
525    Returning,
526    Revoke,
527    References,
528    Restrict,
529    Right,
530    RLike,
531    Rollback,
532    Rollup,
533    Row,
534    Rows,
535    Select,
536    Semi,
537    Savepoint,
538    Separator,
539    Sequence,
540    Serde,
541    SerdeProperties,
542    Set,
543    Settings,
544    Show,
545    Siblings,
546    SimilarTo,
547    Some,
548    Sort,
549    SortBy,
550    SoundsLike,
551    StartWith,
552    StorageIntegration,
553    StraightJoin,
554    Struct,
555    Summarize,
556    TableSample,
557    Sample,
558    Bernoulli,
559    System,
560    Block,
561    Seed,
562    Repeatable,
563    Tag,
564    Temporary,
565    Transaction,
566    To,
567    Top,
568    Then,
569    True,
570    Truncate,
571    Uncache,
572    Union,
573    Unnest,
574    Unpivot,
575    Update,
576    Use,
577    Using,
578    Values,
579    View,
580    SemanticView,
581    Volatile,
582    When,
583    Where,
584    Window,
585    With,
586    Ties,
587    Exclude,
588    No,
589    Others,
590    Unique,
591    UtcDate,
592    UtcTime,
593    UtcTimestamp,
594    VersionSnapshot,
595    TimestampSnapshot,
596    Option,
597    Sink,
598    Source,
599    Analyze,
600    Namespace,
601    Export,
602    As,
603    By,
604    Nulls,
605    Respect,
606    Last,
607    If,
608    Cast,
609    TryCast,
610    SafeCast,
611    Count,
612    Extract,
613    Substring,
614    Trim,
615    Leading,
616    Trailing,
617    Both,
618    Position,
619    Overlaying,
620    Placing,
621    Treat,
622    Within,
623    Group,
624    Order,
625
626    // Window function keywords
627    Unbounded,
628    Preceding,
629    Following,
630    Current,
631    Groups,
632
633    // DDL-specific keywords (Phase 4)
634    Trigger,
635    Type,
636    Domain,
637    Returns,
638    Body,
639    Increment,
640    Minvalue,
641    Maxvalue,
642    Start,
643    Cycle,
644    NoCycle,
645    Prior,
646    Generated,
647    Identity,
648    Always,
649    // MATCH_RECOGNIZE tokens
650    Measures,
651    Pattern,
652    Define,
653    Running,
654    Owned,
655    After,
656    Before,
657    Instead,
658    Each,
659    Statement,
660    Referencing,
661    Old,
662    New,
663    Of,
664    Check,
665    Authorization,
666    Restart,
667
668    // Special
669    Eof,
670}
671
672impl TokenType {
673    /// Check if this token type is a keyword that can be used as an identifier in certain contexts
674    pub fn is_keyword(&self) -> bool {
675        matches!(
676            self,
677            TokenType::Select
678                | TokenType::From
679                | TokenType::Where
680                | TokenType::And
681                | TokenType::Or
682                | TokenType::Not
683                | TokenType::In
684                | TokenType::Is
685                | TokenType::Null
686                | TokenType::True
687                | TokenType::False
688                | TokenType::As
689                | TokenType::On
690                | TokenType::Join
691                | TokenType::Left
692                | TokenType::Right
693                | TokenType::Inner
694                | TokenType::Outer
695                | TokenType::Full
696                | TokenType::Cross
697                | TokenType::Semi
698                | TokenType::Anti
699                | TokenType::Union
700                | TokenType::Except
701                | TokenType::Intersect
702                | TokenType::GroupBy
703                | TokenType::OrderBy
704                | TokenType::Having
705                | TokenType::Limit
706                | TokenType::Offset
707                | TokenType::Case
708                | TokenType::When
709                | TokenType::Then
710                | TokenType::Else
711                | TokenType::End
712                | TokenType::Create
713                | TokenType::Drop
714                | TokenType::Alter
715                | TokenType::Insert
716                | TokenType::Update
717                | TokenType::Delete
718                | TokenType::Into
719                | TokenType::Values
720                | TokenType::Set
721                | TokenType::With
722                | TokenType::Distinct
723                | TokenType::All
724                | TokenType::Exists
725                | TokenType::Between
726                | TokenType::Like
727                | TokenType::ILike
728                // Additional keywords that can be used as identifiers
729                | TokenType::Filter
730                | TokenType::Date
731                | TokenType::Timestamp
732                | TokenType::TimestampTz
733                | TokenType::Interval
734                | TokenType::Time
735                | TokenType::Table
736                | TokenType::Index
737                | TokenType::Column
738                | TokenType::Database
739                | TokenType::Schema
740                | TokenType::View
741                | TokenType::Function
742                | TokenType::Procedure
743                | TokenType::Trigger
744                | TokenType::Sequence
745                | TokenType::Over
746                | TokenType::Partition
747                | TokenType::Window
748                | TokenType::Rows
749                | TokenType::Range
750                | TokenType::First
751                | TokenType::Last
752                | TokenType::Preceding
753                | TokenType::Following
754                | TokenType::Current
755                | TokenType::Row
756                | TokenType::Unbounded
757                | TokenType::Array
758                | TokenType::Struct
759                | TokenType::Map
760                | TokenType::PrimaryKey
761                | TokenType::Key
762                | TokenType::ForeignKey
763                | TokenType::References
764                | TokenType::Unique
765                | TokenType::Check
766                | TokenType::Default
767                | TokenType::Constraint
768                | TokenType::Comment
769                | TokenType::Rollup
770                | TokenType::Cube
771                | TokenType::Grant
772                | TokenType::Revoke
773                | TokenType::Type
774                | TokenType::Use
775                | TokenType::Cache
776                | TokenType::Uncache
777                | TokenType::Load
778                | TokenType::Any
779                | TokenType::Some
780                | TokenType::Asc
781                | TokenType::Desc
782                | TokenType::Nulls
783                | TokenType::Lateral
784                | TokenType::Natural
785                | TokenType::Escape
786                | TokenType::Glob
787                | TokenType::Match
788                | TokenType::Recursive
789                | TokenType::Replace
790                | TokenType::Returns
791                | TokenType::If
792                | TokenType::Pivot
793                | TokenType::Unpivot
794                | TokenType::Json
795                | TokenType::Blob
796                | TokenType::Text
797                | TokenType::Int
798                | TokenType::BigInt
799                | TokenType::SmallInt
800                | TokenType::TinyInt
801                | TokenType::Int128
802                | TokenType::UInt128
803                | TokenType::Int256
804                | TokenType::UInt256
805                | TokenType::UInt
806                | TokenType::UBigInt
807                | TokenType::Float
808                | TokenType::Double
809                | TokenType::Decimal
810                | TokenType::Boolean
811                | TokenType::VarChar
812                | TokenType::Char
813                | TokenType::Binary
814                | TokenType::VarBinary
815                | TokenType::No
816                | TokenType::DateTime
817                | TokenType::Truncate
818                | TokenType::Execute
819                | TokenType::Merge
820                | TokenType::Top
821                | TokenType::Begin
822                | TokenType::Generated
823                | TokenType::Identity
824                | TokenType::Always
825                | TokenType::Extract
826                // Keywords that can be identifiers in certain contexts
827                | TokenType::AsOf
828                | TokenType::Prior
829                | TokenType::After
830                | TokenType::Restrict
831                | TokenType::Cascade
832                | TokenType::Local
833                | TokenType::Rename
834                | TokenType::Enum
835                | TokenType::Within
836                | TokenType::Format
837                | TokenType::Final
838                | TokenType::FileFormat
839                | TokenType::Input
840                | TokenType::InputFormat
841                | TokenType::Copy
842                | TokenType::Put
843                | TokenType::Get
844                | TokenType::Show
845                | TokenType::Serde
846                | TokenType::Sample
847                | TokenType::Sort
848                | TokenType::Collate
849                | TokenType::Ties
850                | TokenType::IsNull
851                | TokenType::NotNull
852                | TokenType::Exclude
853                | TokenType::Temporary
854                | TokenType::Add
855                | TokenType::Ordinality
856                | TokenType::Overlaps
857                | TokenType::Block
858                | TokenType::Pattern
859                | TokenType::Group
860                | TokenType::Cluster
861                | TokenType::Repeatable
862                | TokenType::Groups
863                | TokenType::Commit
864                | TokenType::Warehouse
865                | TokenType::System
866                | TokenType::By
867                | TokenType::To
868                | TokenType::Fetch
869                | TokenType::For
870                | TokenType::Only
871                | TokenType::Next
872                | TokenType::Lock
873                | TokenType::Refresh
874                | TokenType::Settings
875                | TokenType::Operator
876                | TokenType::Overwrite
877                | TokenType::StraightJoin
878                | TokenType::Start
879                // Additional keywords registered in tokenizer but previously missing from is_keyword()
880                | TokenType::Ignore
881                | TokenType::Domain
882                | TokenType::Apply
883                | TokenType::Respect
884                | TokenType::Materialized
885                | TokenType::Prewhere
886                | TokenType::Old
887                | TokenType::New
888                | TokenType::Cast
889                | TokenType::TryCast
890                | TokenType::SafeCast
891                | TokenType::Transaction
892                | TokenType::Describe
893                | TokenType::Kill
894                | TokenType::Lambda
895                | TokenType::Declare
896                | TokenType::Keep
897                | TokenType::Output
898                | TokenType::Percent
899                | TokenType::Qualify
900                | TokenType::Returning
901                | TokenType::Language
902                | TokenType::Prepare
903                | TokenType::Preserve
904                | TokenType::Savepoint
905                | TokenType::Rollback
906                | TokenType::Body
907                | TokenType::Increment
908                | TokenType::Minvalue
909                | TokenType::Maxvalue
910                | TokenType::Cycle
911                | TokenType::NoCycle
912                | TokenType::Seed
913                | TokenType::Namespace
914                | TokenType::Authorization
915                | TokenType::Order
916                | TokenType::Restart
917                | TokenType::Before
918                | TokenType::Instead
919                | TokenType::Each
920                | TokenType::Statement
921                | TokenType::Referencing
922                | TokenType::Of
923                | TokenType::Separator
924                | TokenType::Others
925                | TokenType::Placing
926                | TokenType::Owned
927                | TokenType::Running
928                | TokenType::Define
929                | TokenType::Measures
930                | TokenType::MatchRecognize
931                | TokenType::AutoIncrement
932                | TokenType::Connect
933                | TokenType::Distribute
934                | TokenType::Bernoulli
935                | TokenType::TableSample
936                | TokenType::Inpath
937                | TokenType::Pragma
938                | TokenType::Siblings
939                | TokenType::SerdeProperties
940                | TokenType::RLike
941        )
942    }
943
944    /// Check if this token type is a comparison operator
945    pub fn is_comparison(&self) -> bool {
946        matches!(
947            self,
948            TokenType::Eq
949                | TokenType::Neq
950                | TokenType::Lt
951                | TokenType::Lte
952                | TokenType::Gt
953                | TokenType::Gte
954                | TokenType::NullsafeEq
955        )
956    }
957
958    /// Check if this token type is an arithmetic operator
959    pub fn is_arithmetic(&self) -> bool {
960        matches!(
961            self,
962            TokenType::Plus
963                | TokenType::Dash
964                | TokenType::Star
965                | TokenType::Slash
966                | TokenType::Percent
967                | TokenType::Mod
968                | TokenType::Div
969        )
970    }
971}
972
973impl fmt::Display for TokenType {
974    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
975        write!(f, "{:?}", self)
976    }
977}
978
// ── Cached default maps for TokenizerConfig ─────────────────────────────────
980
981static DEFAULT_KEYWORDS: LazyLock<HashMap<String, TokenType>> = LazyLock::new(|| {
982    let mut keywords = HashMap::with_capacity(300);
983    // Add basic SQL keywords
984    keywords.insert("SELECT".to_string(), TokenType::Select);
985    keywords.insert("FROM".to_string(), TokenType::From);
986    keywords.insert("WHERE".to_string(), TokenType::Where);
987    keywords.insert("AND".to_string(), TokenType::And);
988    keywords.insert("OR".to_string(), TokenType::Or);
989    keywords.insert("NOT".to_string(), TokenType::Not);
990    keywords.insert("AS".to_string(), TokenType::As);
991    keywords.insert("ON".to_string(), TokenType::On);
992    keywords.insert("JOIN".to_string(), TokenType::Join);
993    keywords.insert("LEFT".to_string(), TokenType::Left);
994    keywords.insert("RIGHT".to_string(), TokenType::Right);
995    keywords.insert("INNER".to_string(), TokenType::Inner);
996    keywords.insert("OUTER".to_string(), TokenType::Outer);
997    keywords.insert("OUTPUT".to_string(), TokenType::Output);
998    keywords.insert("FULL".to_string(), TokenType::Full);
999    keywords.insert("CROSS".to_string(), TokenType::Cross);
1000    keywords.insert("SEMI".to_string(), TokenType::Semi);
1001    keywords.insert("ANTI".to_string(), TokenType::Anti);
1002    keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1003    keywords.insert("UNION".to_string(), TokenType::Union);
1004    keywords.insert("EXCEPT".to_string(), TokenType::Except);
1005    keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle/Redshift alias for EXCEPT
1006    keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1007    keywords.insert("GROUP".to_string(), TokenType::Group);
1008    keywords.insert("CUBE".to_string(), TokenType::Cube);
1009    keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1010    keywords.insert("WITHIN".to_string(), TokenType::Within);
1011    keywords.insert("ORDER".to_string(), TokenType::Order);
1012    keywords.insert("BY".to_string(), TokenType::By);
1013    keywords.insert("HAVING".to_string(), TokenType::Having);
1014    keywords.insert("LIMIT".to_string(), TokenType::Limit);
1015    keywords.insert("OFFSET".to_string(), TokenType::Offset);
1016    keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1017    keywords.insert("FETCH".to_string(), TokenType::Fetch);
1018    keywords.insert("FIRST".to_string(), TokenType::First);
1019    keywords.insert("NEXT".to_string(), TokenType::Next);
1020    keywords.insert("ONLY".to_string(), TokenType::Only);
1021    keywords.insert("KEEP".to_string(), TokenType::Keep);
1022    keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1023    keywords.insert("INPUT".to_string(), TokenType::Input);
1024    keywords.insert("CASE".to_string(), TokenType::Case);
1025    keywords.insert("WHEN".to_string(), TokenType::When);
1026    keywords.insert("THEN".to_string(), TokenType::Then);
1027    keywords.insert("ELSE".to_string(), TokenType::Else);
1028    keywords.insert("END".to_string(), TokenType::End);
1029    keywords.insert("ENDIF".to_string(), TokenType::End); // Exasol alias for END
1030    keywords.insert("NULL".to_string(), TokenType::Null);
1031    keywords.insert("TRUE".to_string(), TokenType::True);
1032    keywords.insert("FALSE".to_string(), TokenType::False);
1033    keywords.insert("IS".to_string(), TokenType::Is);
1034    keywords.insert("IN".to_string(), TokenType::In);
1035    keywords.insert("BETWEEN".to_string(), TokenType::Between);
1036    keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1037    keywords.insert("LIKE".to_string(), TokenType::Like);
1038    keywords.insert("ILIKE".to_string(), TokenType::ILike);
1039    keywords.insert("RLIKE".to_string(), TokenType::RLike);
1040    keywords.insert("REGEXP".to_string(), TokenType::RLike);
1041    keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1042    keywords.insert("EXISTS".to_string(), TokenType::Exists);
1043    keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1044    keywords.insert("ALL".to_string(), TokenType::All);
1045    keywords.insert("WITH".to_string(), TokenType::With);
1046    keywords.insert("CREATE".to_string(), TokenType::Create);
1047    keywords.insert("DROP".to_string(), TokenType::Drop);
1048    keywords.insert("ALTER".to_string(), TokenType::Alter);
1049    keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1050    keywords.insert("TABLE".to_string(), TokenType::Table);
1051    keywords.insert("VIEW".to_string(), TokenType::View);
1052    keywords.insert("INDEX".to_string(), TokenType::Index);
1053    keywords.insert("COLUMN".to_string(), TokenType::Column);
1054    keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1055    keywords.insert("ADD".to_string(), TokenType::Add);
1056    keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1057    keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1058    keywords.insert("RENAME".to_string(), TokenType::Rename);
1059    keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1060    keywords.insert("TEMP".to_string(), TokenType::Temporary);
1061    keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1062    keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1063    keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1064    keywords.insert("KEY".to_string(), TokenType::Key);
1065    keywords.insert("KILL".to_string(), TokenType::Kill);
1066    keywords.insert("REFERENCES".to_string(), TokenType::References);
1067    keywords.insert("DEFAULT".to_string(), TokenType::Default);
1068    keywords.insert("DECLARE".to_string(), TokenType::Declare);
1069    keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1070    keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); // Snowflake style
1071    keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1072    keywords.insert("REPLACE".to_string(), TokenType::Replace);
1073    keywords.insert("TO".to_string(), TokenType::To);
1074    keywords.insert("INSERT".to_string(), TokenType::Insert);
1075    keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1076    keywords.insert("UPDATE".to_string(), TokenType::Update);
1077    keywords.insert("USE".to_string(), TokenType::Use);
1078    keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1079    keywords.insert("GLOB".to_string(), TokenType::Glob);
1080    keywords.insert("DELETE".to_string(), TokenType::Delete);
1081    keywords.insert("MERGE".to_string(), TokenType::Merge);
1082    keywords.insert("CACHE".to_string(), TokenType::Cache);
1083    keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1084    keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1085    keywords.insert("GRANT".to_string(), TokenType::Grant);
1086    keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1087    keywords.insert("COMMENT".to_string(), TokenType::Comment);
1088    keywords.insert("COLLATE".to_string(), TokenType::Collate);
1089    keywords.insert("INTO".to_string(), TokenType::Into);
1090    keywords.insert("VALUES".to_string(), TokenType::Values);
1091    keywords.insert("SET".to_string(), TokenType::Set);
1092    keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1093    keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1094    keywords.insert("ASC".to_string(), TokenType::Asc);
1095    keywords.insert("DESC".to_string(), TokenType::Desc);
1096    keywords.insert("NULLS".to_string(), TokenType::Nulls);
1097    keywords.insert("RESPECT".to_string(), TokenType::Respect);
1098    keywords.insert("FIRST".to_string(), TokenType::First);
1099    keywords.insert("LAST".to_string(), TokenType::Last);
1100    keywords.insert("IF".to_string(), TokenType::If);
1101    keywords.insert("CAST".to_string(), TokenType::Cast);
1102    keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1103    keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1104    keywords.insert("OVER".to_string(), TokenType::Over);
1105    keywords.insert("PARTITION".to_string(), TokenType::Partition);
1106    keywords.insert("PLACING".to_string(), TokenType::Placing);
1107    keywords.insert("WINDOW".to_string(), TokenType::Window);
1108    keywords.insert("ROWS".to_string(), TokenType::Rows);
1109    keywords.insert("RANGE".to_string(), TokenType::Range);
1110    keywords.insert("FILTER".to_string(), TokenType::Filter);
1111    keywords.insert("NATURAL".to_string(), TokenType::Natural);
1112    keywords.insert("USING".to_string(), TokenType::Using);
1113    keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1114    keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1115    keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1116    keywords.insert("CURRENT".to_string(), TokenType::Current);
1117    keywords.insert("ROW".to_string(), TokenType::Row);
1118    keywords.insert("GROUPS".to_string(), TokenType::Groups);
1119    keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1120    // TRIM function position keywords
1121    keywords.insert("BOTH".to_string(), TokenType::Both);
1122    keywords.insert("LEADING".to_string(), TokenType::Leading);
1123    keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1124    keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1125    // Phase 3: Additional keywords
1126    keywords.insert("TOP".to_string(), TokenType::Top);
1127    keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1128    keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1129    keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1130    keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1131    keywords.insert("SYSTEM".to_string(), TokenType::System);
1132    keywords.insert("BLOCK".to_string(), TokenType::Block);
1133    keywords.insert("SEED".to_string(), TokenType::Seed);
1134    keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1135    keywords.insert("TIES".to_string(), TokenType::Ties);
1136    keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1137    keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1138    keywords.insert("APPLY".to_string(), TokenType::Apply);
1139    // Oracle CONNECT BY keywords
1140    keywords.insert("CONNECT".to_string(), TokenType::Connect);
1141    // Hive/Spark specific keywords
1142    keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1143    keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1144    keywords.insert("SORT".to_string(), TokenType::Sort);
1145    keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1146    keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1147    keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1148    keywords.insert("FOR".to_string(), TokenType::For);
1149    keywords.insert("ANY".to_string(), TokenType::Any);
1150    keywords.insert("SOME".to_string(), TokenType::Some);
1151    keywords.insert("ASOF".to_string(), TokenType::AsOf);
1152    keywords.insert("PERCENT".to_string(), TokenType::Percent);
1153    keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1154    keywords.insert("NO".to_string(), TokenType::No);
1155    keywords.insert("OTHERS".to_string(), TokenType::Others);
1156    // PostgreSQL OPERATOR() syntax for schema-qualified operators
1157    keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1158    // Phase 4: DDL keywords
1159    keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1160    keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1161    keywords.insert("DATABASE".to_string(), TokenType::Database);
1162    keywords.insert("FUNCTION".to_string(), TokenType::Function);
1163    keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1164    keywords.insert("PROC".to_string(), TokenType::Procedure);
1165    keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1166    keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1167    keywords.insert("TYPE".to_string(), TokenType::Type);
1168    keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1169    keywords.insert("RETURNS".to_string(), TokenType::Returns);
1170    keywords.insert("RETURNING".to_string(), TokenType::Returning);
1171    keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1172    keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1173    keywords.insert("COMMIT".to_string(), TokenType::Commit);
1174    keywords.insert("BEGIN".to_string(), TokenType::Begin);
1175    keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1176    keywords.insert("PREPARE".to_string(), TokenType::Prepare);
1177    keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1178    keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1179    keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1180    keywords.insert("BODY".to_string(), TokenType::Body);
1181    keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1182    keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1183    keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1184    keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1185    keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1186    keywords.insert("PRIOR".to_string(), TokenType::Prior);
1187    // MATCH_RECOGNIZE keywords
1188    keywords.insert("MATCH".to_string(), TokenType::Match);
1189    keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1190    keywords.insert("MEASURES".to_string(), TokenType::Measures);
1191    keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1192    keywords.insert("DEFINE".to_string(), TokenType::Define);
1193    keywords.insert("RUNNING".to_string(), TokenType::Running);
1194    keywords.insert("FINAL".to_string(), TokenType::Final);
1195    keywords.insert("OWNED".to_string(), TokenType::Owned);
1196    keywords.insert("AFTER".to_string(), TokenType::After);
1197    keywords.insert("BEFORE".to_string(), TokenType::Before);
1198    keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1199    keywords.insert("EACH".to_string(), TokenType::Each);
1200    keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1201    keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1202    keywords.insert("OLD".to_string(), TokenType::Old);
1203    keywords.insert("NEW".to_string(), TokenType::New);
1204    keywords.insert("OF".to_string(), TokenType::Of);
1205    keywords.insert("CHECK".to_string(), TokenType::Check);
1206    keywords.insert("START".to_string(), TokenType::Start);
1207    keywords.insert("ENUM".to_string(), TokenType::Enum);
1208    keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1209    keywords.insert("RESTART".to_string(), TokenType::Restart);
1210    // Date/time literal keywords
1211    keywords.insert("DATE".to_string(), TokenType::Date);
1212    keywords.insert("TIME".to_string(), TokenType::Time);
1213    keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1214    keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1215    keywords.insert("GENERATED".to_string(), TokenType::Generated);
1216    keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1217    keywords.insert("ALWAYS".to_string(), TokenType::Always);
1218    // LOAD DATA keywords
1219    keywords.insert("LOAD".to_string(), TokenType::Load);
1220    keywords.insert("LOCAL".to_string(), TokenType::Local);
1221    keywords.insert("INPATH".to_string(), TokenType::Inpath);
1222    keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1223    keywords.insert("SERDE".to_string(), TokenType::Serde);
1224    keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1225    keywords.insert("FORMAT".to_string(), TokenType::Format);
1226    // SQLite
1227    keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1228    // SHOW statement
1229    keywords.insert("SHOW".to_string(), TokenType::Show);
1230    // Oracle ORDER SIBLINGS BY (hierarchical queries)
1231    keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1232    // COPY and PUT statements (Snowflake, PostgreSQL)
1233    keywords.insert("COPY".to_string(), TokenType::Copy);
1234    keywords.insert("PUT".to_string(), TokenType::Put);
1235    keywords.insert("GET".to_string(), TokenType::Get);
1236    // EXEC/EXECUTE statement (TSQL, etc.)
1237    keywords.insert("EXEC".to_string(), TokenType::Execute);
1238    keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1239    // Postfix null check operators (PostgreSQL/SQLite)
1240    keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1241    keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1242    keywords
1243});
1244
1245static DEFAULT_SINGLE_TOKENS: LazyLock<HashMap<char, TokenType>> = LazyLock::new(|| {
1246    let mut single_tokens = HashMap::with_capacity(30);
1247    single_tokens.insert('(', TokenType::LParen);
1248    single_tokens.insert(')', TokenType::RParen);
1249    single_tokens.insert('[', TokenType::LBracket);
1250    single_tokens.insert(']', TokenType::RBracket);
1251    single_tokens.insert('{', TokenType::LBrace);
1252    single_tokens.insert('}', TokenType::RBrace);
1253    single_tokens.insert(',', TokenType::Comma);
1254    single_tokens.insert('.', TokenType::Dot);
1255    single_tokens.insert(';', TokenType::Semicolon);
1256    single_tokens.insert('+', TokenType::Plus);
1257    single_tokens.insert('-', TokenType::Dash);
1258    single_tokens.insert('*', TokenType::Star);
1259    single_tokens.insert('/', TokenType::Slash);
1260    single_tokens.insert('%', TokenType::Percent);
1261    single_tokens.insert('&', TokenType::Amp);
1262    single_tokens.insert('|', TokenType::Pipe);
1263    single_tokens.insert('^', TokenType::Caret);
1264    single_tokens.insert('~', TokenType::Tilde);
1265    single_tokens.insert('<', TokenType::Lt);
1266    single_tokens.insert('>', TokenType::Gt);
1267    single_tokens.insert('=', TokenType::Eq);
1268    single_tokens.insert('!', TokenType::Exclamation);
1269    single_tokens.insert(':', TokenType::Colon);
1270    single_tokens.insert('@', TokenType::DAt);
1271    single_tokens.insert('#', TokenType::Hash);
1272    single_tokens.insert('$', TokenType::Dollar);
1273    single_tokens.insert('?', TokenType::Parameter);
1274    single_tokens
1275});
1276
/// Default string-literal delimiters, mapping each opening quote to its closing quote.
static DEFAULT_QUOTES: LazyLock<HashMap<String, String>> = LazyLock::new(|| {
    let mut table: HashMap<String, String> = HashMap::with_capacity(4);
    // Both the plain single quote and the triple double-quote form
    // (e.g., """x""") are closed by the same sequence that opens them.
    for delimiter in ["'", "\"\"\""] {
        table.insert(delimiter.to_owned(), delimiter.to_owned());
    }
    table
});
1284
/// Default identifier-quoting characters, mapping each opening char to its closing char.
static DEFAULT_IDENTIFIERS: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
    let mut pairs: HashMap<char, char> = HashMap::with_capacity(4);
    // Double quotes and backticks close with the same character that opens them.
    // TSQL bracket-quoted identifiers ([name]) are intentionally not listed here:
    // '[' is also used for arrays and subscripts, so the parser handles that form.
    pairs.extend(['"', '`'].map(|quote| (quote, quote)));
    pairs
});
1293
/// Default comment markers: opening sequence -> optional closing sequence.
///
/// A `None` terminator denotes a comment with no closing marker (`--`), while
/// `/*` is paired with `*/`.
static DEFAULT_COMMENTS: LazyLock<HashMap<String, Option<String>>> = LazyLock::new(|| {
    let mut markers: HashMap<String, Option<String>> = HashMap::with_capacity(4);
    markers.extend([
        (String::from("--"), None),
        (String::from("/*"), Some(String::from("*/"))),
    ]);
    markers
});
1300
/// Tokenizer configuration for a dialect.
///
/// Each field parameterizes one aspect of tokenization. The `Default`
/// implementation supplies the baseline values; the per-field notes below name
/// the dialects each option is intended for.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keywords mapping (uppercase keyword -> token type)
    pub keywords: HashMap<String, TokenType>,
    /// Single character tokens (character -> token type)
    pub single_tokens: HashMap<char, TokenType>,
    /// Quote characters (start -> end)
    pub quotes: HashMap<String, String>,
    /// Identifier quote characters (start -> end)
    pub identifiers: HashMap<char, char>,
    /// Comment definitions (start -> optional end).
    /// The tokenizer also consults this map for a `"//"` key to decide whether
    /// `//` starts a line comment (e.g., Snowflake).
    pub comments: HashMap<String, Option<String>>,
    /// String escape characters.
    /// The default configuration contains only `'` (a doubled quote escapes a quote).
    pub string_escapes: Vec<char>,
    /// Whether to support nested comments.
    /// When true, a `/*` inside a block comment opens another nesting level that
    /// needs its own `*/`.
    pub nested_comments: bool,
    /// Valid escape follow characters (for MySQL-style escaping).
    /// When a backslash is followed by a character NOT in this list,
    /// the backslash is discarded. When empty, all backslash escapes
    /// preserve the backslash for unrecognized sequences.
    pub escape_follow_chars: Vec<char>,
    /// Whether b'...' is a byte string (true for BigQuery) or bit string (false for standard SQL).
    /// Default is false (bit string).
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}
    /// Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT)
    pub numeric_literals: HashMap<String, String>,
    /// Whether unquoted identifiers can start with a digit (e.g., `1a`, `1_a`).
    /// When true, a number followed by letters/underscore is treated as an identifier.
    /// Used by Hive, Spark, MySQL, ClickHouse.
    pub identifiers_can_start_with_digit: bool,
    /// Whether 0x/0X prefix should be treated as hex literals.
    /// When true, `0XCC` is tokenized instead of Number("0") + Identifier("XCC").
    /// Used by BigQuery, SQLite, Teradata.
    pub hex_number_strings: bool,
    /// Whether hex string literals from 0x prefix represent integer values.
    /// When true (BigQuery), 0xA is tokenized as HexNumber (integer in hex notation).
    /// When false (SQLite, Teradata), 0xCC is tokenized as HexString (binary/blob value).
    pub hex_string_is_integer_type: bool,
    /// Whether string escape sequences (like \') are allowed in raw strings.
    /// When true (BigQuery default), \' inside r'...' escapes the quote.
    /// When false (Spark/Databricks), backslashes in raw strings are always literal.
    /// Python sqlglot: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True)
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether # starts a single-line comment (ClickHouse, MySQL).
    /// The same flag also enables `//` single-line comments in the whitespace skipper.
    pub hash_comments: bool,
    /// Whether $ can start/continue an identifier (ClickHouse).
    /// When true, a bare `$` that is not part of a dollar-quoted string or positional
    /// parameter is treated as an identifier character.
    pub dollar_sign_is_identifier: bool,
    /// Whether INSERT ... FORMAT <name> should treat subsequent data as raw (ClickHouse).
    /// When true, after tokenizing `INSERT ... FORMAT <non-VALUES-name>`, all text until
    /// the next blank line or end of input is consumed as a raw data token.
    pub insert_format_raw_data: bool,
    /// Whether numeric literals can contain underscores as digit separators.
    /// When true, `1_000` is tokenized as `1000`. Used by ClickHouse and DuckDB.
    /// Python sqlglot: NUMBERS_CAN_BE_UNDERSCORE_SEPARATED (default False)
    pub numbers_can_be_underscore_separated: bool,
    /// Recover strings like `'a\' or 1=1` by treating the escaped quote as the
    /// closing quote when no later quote exists. This matches SQLGlot's permissive
    /// handling for a few malformed ClickHouse SHOW LIKE fixtures.
    pub recover_terminal_backslash_quote: bool,
    /// Recover a terminal single-quoted string without a closing quote by treating
    /// end-of-input as the close. This is only enabled for ClickHouse fixture
    /// coverage, where some extracted corpus rows contain partial string probes.
    pub recover_unterminated_string: bool,
}
1369
1370impl Default for TokenizerConfig {
1371    fn default() -> Self {
1372        Self {
1373            keywords: DEFAULT_KEYWORDS.clone(),
1374            single_tokens: DEFAULT_SINGLE_TOKENS.clone(),
1375            quotes: DEFAULT_QUOTES.clone(),
1376            identifiers: DEFAULT_IDENTIFIERS.clone(),
1377            comments: DEFAULT_COMMENTS.clone(),
1378            // Standard SQL: only '' (doubled quote) escapes a quote
1379            // Backslash escapes are dialect-specific (MySQL, etc.)
1380            string_escapes: vec!['\''],
1381            nested_comments: true,
1382            // By default, no escape_follow_chars means preserve backslash for unrecognized escapes
1383            escape_follow_chars: vec![],
1384            // Default: b'...' is bit string (standard SQL), not byte string (BigQuery)
1385            b_prefix_is_byte_string: false,
1386            numeric_literals: HashMap::new(),
1387            identifiers_can_start_with_digit: false,
1388            hex_number_strings: false,
1389            hex_string_is_integer_type: false,
1390            // Default: backslash escapes ARE allowed in raw strings (sqlglot default)
1391            // Spark/Databricks set this to false
1392            string_escapes_allowed_in_raw_strings: true,
1393            hash_comments: false,
1394            dollar_sign_is_identifier: false,
1395            insert_format_raw_data: false,
1396            numbers_can_be_underscore_separated: false,
1397            recover_terminal_backslash_quote: false,
1398            recover_unterminated_string: false,
1399        }
1400    }
1401}
1402
/// SQL Tokenizer
///
/// Holds a [`TokenizerConfig`] and converts SQL text into a token list via
/// `tokenize`.
pub struct Tokenizer {
    /// Configuration handed to the scanning state on every `tokenize` call.
    config: TokenizerConfig,
}
1407
1408impl Tokenizer {
1409    /// Create a new tokenizer with the given configuration
1410    pub fn new(config: TokenizerConfig) -> Self {
1411        Self { config }
1412    }
1413
1414    /// Create a tokenizer with default configuration
1415    pub fn default_config() -> Self {
1416        Self::new(TokenizerConfig::default())
1417    }
1418
1419    /// Tokenize a SQL string
1420    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1421        let mut state = TokenizerState::new(sql, &self.config);
1422        state.tokenize()
1423    }
1424}
1425
1426impl Default for Tokenizer {
1427    fn default() -> Self {
1428        Self::default_config()
1429    }
1430}
1431
/// Internal state for tokenization
///
/// One instance is created per `Tokenizer::tokenize` call. Positions (`start`,
/// `current`, `size`) are indices into `chars`, i.e. counted in characters.
struct TokenizerState<'a> {
    /// The SQL text being tokenized.
    source: &'a str,
    /// Whether `source` is pure ASCII; when true, text can be sliced directly
    /// from `source` because character indices equal byte offsets.
    source_is_ascii: bool,
    /// `source` decoded into individual characters for indexed lookahead.
    chars: Vec<char>,
    /// Number of characters in `chars`.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Index of the first character of the token currently being scanned.
    start: usize,
    /// Index of the next character to be consumed.
    current: usize,
    /// Current line number (1-based).
    line: usize,
    /// Current column number (1-based), reset to 1 after each newline.
    column: usize,
    /// Comments collected but not yet attached to a token (pending leading comments).
    comments: Vec<String>,
    /// Dialect configuration driving the scan.
    config: &'a TokenizerConfig,
}
1446
1447impl<'a> TokenizerState<'a> {
1448    fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1449        let chars: Vec<char> = sql.chars().collect();
1450        let size = chars.len();
1451        Self {
1452            source: sql,
1453            source_is_ascii: sql.is_ascii(),
1454            chars,
1455            size,
1456            tokens: Vec::new(),
1457            start: 0,
1458            current: 0,
1459            line: 1,
1460            column: 1,
1461            comments: Vec::new(),
1462            config,
1463        }
1464    }
1465
1466    fn tokenize(&mut self) -> Result<Vec<Token>> {
1467        while !self.is_at_end() {
1468            self.skip_whitespace();
1469            if self.is_at_end() {
1470                break;
1471            }
1472
1473            self.start = self.current;
1474            self.scan_token()?;
1475
1476            // ClickHouse: After INSERT ... FORMAT <name> (where name != VALUES),
1477            // the rest until the next blank line or end of input is raw data.
1478            if self.config.insert_format_raw_data {
1479                if let Some(raw) = self.try_scan_insert_format_raw_data() {
1480                    if !raw.is_empty() {
1481                        self.start = self.current;
1482                        self.add_token_with_text(TokenType::Var, raw);
1483                    }
1484                }
1485            }
1486        }
1487
1488        // Handle leftover leading comments at end of input.
1489        // These are comments on a new line after the last token that couldn't be attached
1490        // as leading comments to a subsequent token (because there is none).
1491        // Attach them as trailing comments on the last token so they're preserved.
1492        if !self.comments.is_empty() {
1493            if let Some(last) = self.tokens.last_mut() {
1494                last.trailing_comments.extend(self.comments.drain(..));
1495            }
1496        }
1497
1498        Ok(std::mem::take(&mut self.tokens))
1499    }
1500
1501    #[inline]
1502    fn is_at_end(&self) -> bool {
1503        self.current >= self.size
1504    }
1505
1506    #[inline]
1507    fn text_from_range(&self, start: usize, end: usize) -> String {
1508        if self.source_is_ascii {
1509            self.source[start..end].to_string()
1510        } else {
1511            self.chars[start..end].iter().collect()
1512        }
1513    }
1514
1515    #[inline]
1516    fn peek(&self) -> char {
1517        if self.is_at_end() {
1518            '\0'
1519        } else {
1520            self.chars[self.current]
1521        }
1522    }
1523
1524    #[inline]
1525    fn peek_next(&self) -> char {
1526        if self.current + 1 >= self.size {
1527            '\0'
1528        } else {
1529            self.chars[self.current + 1]
1530        }
1531    }
1532
1533    #[inline]
1534    fn advance(&mut self) -> char {
1535        let c = self.peek();
1536        self.current += 1;
1537        if c == '\n' {
1538            self.line += 1;
1539            self.column = 1;
1540        } else {
1541            self.column += 1;
1542        }
1543        c
1544    }
1545
1546    fn skip_whitespace(&mut self) {
1547        // Track whether we've seen a newline since the last token.
1548        // Comments on a new line (after a newline) are leading comments on the next token,
1549        // while comments on the same line are trailing comments on the previous token.
1550        // This matches Python sqlglot's behavior.
1551        let mut saw_newline = false;
1552        while !self.is_at_end() {
1553            let c = self.peek();
1554            match c {
1555                ' ' | '\t' | '\r' => {
1556                    self.advance();
1557                }
1558                '\n' => {
1559                    saw_newline = true;
1560                    self.advance();
1561                }
1562                '\u{00A0}' // non-breaking space
1563                | '\u{2000}'..='\u{200B}' // various Unicode spaces + zero-width space
1564                | '\u{3000}' // ideographic (full-width) space
1565                | '\u{FEFF}' // BOM / zero-width no-break space
1566                => {
1567                    self.advance();
1568                }
1569                '-' if self.peek_next() == '-' => {
1570                    self.scan_line_comment(saw_newline);
1571                    // After a line comment, we're always on a new line
1572                    saw_newline = true;
1573                }
1574                '/' if self.peek_next() == '/' && self.config.hash_comments => {
1575                    // ClickHouse: // single-line comments (same dialects that support # comments)
1576                    self.scan_double_slash_comment();
1577                }
1578                '/' if self.peek_next() == '*' => {
1579                    // Check if this is a hint comment /*+ ... */
1580                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
1581                        // This is a hint comment, handle it as a token instead of skipping
1582                        break;
1583                    }
1584                    if self.scan_block_comment(saw_newline).is_err() {
1585                        return;
1586                    }
1587                    // Don't reset saw_newline - it carries forward
1588                }
1589                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
1590                    // Dialect-specific // line comment (e.g., Snowflake)
1591                    // But NOT inside URIs like file:// or paths with consecutive slashes
1592                    // Check that previous non-whitespace char is not ':' or '/'
1593                    let prev_non_ws = if self.current > 0 {
1594                        let mut i = self.current - 1;
1595                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
1596                            i -= 1;
1597                        }
1598                        self.chars[i]
1599                    } else {
1600                        '\0'
1601                    };
1602                    if prev_non_ws == ':' || prev_non_ws == '/' {
1603                        // This is likely a URI (file://, http://) or path, not a comment
1604                        break;
1605                    }
1606                    self.scan_line_comment(saw_newline);
1607                    // After a line comment, we're always on a new line
1608                    saw_newline = true;
1609                }
1610                '#' if self.config.hash_comments => {
1611                    self.scan_hash_line_comment();
1612                }
1613                _ => break,
1614            }
1615        }
1616    }
1617
1618    fn scan_hash_line_comment(&mut self) {
1619        self.advance(); // #
1620        let start = self.current;
1621        while !self.is_at_end() && self.peek() != '\n' {
1622            self.advance();
1623        }
1624        let comment = self.text_from_range(start, self.current);
1625        let comment_text = comment.trim().to_string();
1626        if let Some(last) = self.tokens.last_mut() {
1627            last.trailing_comments.push(comment_text);
1628        } else {
1629            self.comments.push(comment_text);
1630        }
1631    }
1632
1633    fn scan_double_slash_comment(&mut self) {
1634        self.advance(); // /
1635        self.advance(); // /
1636        let start = self.current;
1637        while !self.is_at_end() && self.peek() != '\n' {
1638            self.advance();
1639        }
1640        let comment = self.text_from_range(start, self.current);
1641        let comment_text = comment.trim().to_string();
1642        if let Some(last) = self.tokens.last_mut() {
1643            last.trailing_comments.push(comment_text);
1644        } else {
1645            self.comments.push(comment_text);
1646        }
1647    }
1648
1649    fn scan_line_comment(&mut self, after_newline: bool) {
1650        self.advance(); // -
1651        self.advance(); // -
1652        let start = self.current;
1653        while !self.is_at_end() && self.peek() != '\n' {
1654            self.advance();
1655        }
1656        let comment_text = self.text_from_range(start, self.current);
1657
1658        // If the comment starts on a new line (after_newline), it's a leading comment
1659        // on the next token. Otherwise, it's a trailing comment on the previous token.
1660        if after_newline || self.tokens.is_empty() {
1661            self.comments.push(comment_text);
1662        } else if let Some(last) = self.tokens.last_mut() {
1663            last.trailing_comments.push(comment_text);
1664        }
1665    }
1666
1667    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
1668        self.advance(); // /
1669        self.advance(); // *
1670        let content_start = self.current;
1671        let mut depth = 1;
1672
1673        while !self.is_at_end() && depth > 0 {
1674            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
1675                self.advance();
1676                self.advance();
1677                depth += 1;
1678            } else if self.peek() == '*' && self.peek_next() == '/' {
1679                depth -= 1;
1680                if depth > 0 {
1681                    self.advance();
1682                    self.advance();
1683                }
1684            } else {
1685                self.advance();
1686            }
1687        }
1688
1689        if depth > 0 {
1690            return Err(Error::tokenize(
1691                "Unterminated block comment",
1692                self.line,
1693                self.column,
1694                self.start,
1695                self.current,
1696            ));
1697        }
1698
1699        // Get the content between /* and */ (preserving internal whitespace for nested comments)
1700        let content = self.text_from_range(content_start, self.current);
1701        self.advance(); // *
1702        self.advance(); // /
1703
1704        // For round-trip fidelity, preserve the exact comment content including nested comments
1705        let comment_text = format!("/*{}*/", content);
1706
1707        // If the comment starts on a new line (after_newline), it's a leading comment
1708        // on the next token. Otherwise, it's a trailing comment on the previous token.
1709        if after_newline || self.tokens.is_empty() {
1710            self.comments.push(comment_text);
1711        } else if let Some(last) = self.tokens.last_mut() {
1712            last.trailing_comments.push(comment_text);
1713        }
1714
1715        Ok(())
1716    }
1717
1718    /// Scan a hint comment /*+ ... */ and return it as a Hint token
1719    fn scan_hint(&mut self) -> Result<()> {
1720        self.advance(); // /
1721        self.advance(); // *
1722        self.advance(); // +
1723        let hint_start = self.current;
1724
1725        // Scan until we find */
1726        while !self.is_at_end() {
1727            if self.peek() == '*' && self.peek_next() == '/' {
1728                break;
1729            }
1730            self.advance();
1731        }
1732
1733        if self.is_at_end() {
1734            return Err(Error::tokenize(
1735                "Unterminated hint comment",
1736                self.line,
1737                self.column,
1738                self.start,
1739                self.current,
1740            ));
1741        }
1742
1743        let hint_text = self.text_from_range(hint_start, self.current);
1744        self.advance(); // *
1745        self.advance(); // /
1746
1747        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1748
1749        Ok(())
1750    }
1751
1752    /// Scan a positional parameter: $1, $2, etc.
1753    fn scan_positional_parameter(&mut self) -> Result<()> {
1754        self.advance(); // consume $
1755        let start = self.current;
1756
1757        while !self.is_at_end() && self.peek().is_ascii_digit() {
1758            self.advance();
1759        }
1760
1761        let number = self.text_from_range(start, self.current);
1762        self.add_token_with_text(TokenType::Parameter, number);
1763        Ok(())
1764    }
1765
1766    /// Try to scan a tagged dollar-quoted string: $tag$content$tag$
1767    /// Returns Some(()) if successful, None if this isn't a tagged dollar string.
1768    ///
1769    /// The token text is stored as "tag\x00content" to preserve the tag for later use.
1770    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
1771        let saved_pos = self.current;
1772
1773        // We're at '$', next char is alphabetic
1774        self.advance(); // consume opening $
1775
1776        // Scan the tag (identifier: alphanumeric + underscore, including Unicode)
1777        // Tags can contain Unicode characters like emojis (e.g., $🦆$)
1778        let tag_start = self.current;
1779        while !self.is_at_end()
1780            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1781        {
1782            self.advance();
1783        }
1784        let tag = self.text_from_range(tag_start, self.current);
1785
1786        // Must have a closing $ after the tag
1787        if self.is_at_end() || self.peek() != '$' {
1788            // Not a tagged dollar string - restore position
1789            self.current = saved_pos;
1790            return Ok(None);
1791        }
1792        self.advance(); // consume closing $ of opening tag
1793
1794        // Now scan content until we find $tag$
1795        let content_start = self.current;
1796        let closing_tag = format!("${}$", tag);
1797        let closing_chars: Vec<char> = closing_tag.chars().collect();
1798
1799        loop {
1800            if self.is_at_end() {
1801                // Unterminated - restore and fall through
1802                self.current = saved_pos;
1803                return Ok(None);
1804            }
1805
1806            // Check if we've reached the closing tag
1807            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1808                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1809                    self.current + j < self.size && self.chars[self.current + j] == ch
1810                });
1811                if matches {
1812                    let content = self.text_from_range(content_start, self.current);
1813                    // Consume closing tag
1814                    for _ in 0..closing_chars.len() {
1815                        self.advance();
1816                    }
1817                    // Store as "tag\x00content" to preserve the tag
1818                    let token_text = format!("{}\x00{}", tag, content);
1819                    self.add_token_with_text(TokenType::DollarString, token_text);
1820                    return Ok(Some(()));
1821                }
1822            }
1823            self.advance();
1824        }
1825    }
1826
1827    /// Scan a dollar-quoted string: $$content$$ or $tag$content$tag$
1828    ///
1829    /// For $$...$$ (no tag), the token text is just the content.
1830    /// For $tag$...$tag$, use try_scan_tagged_dollar_string instead.
1831    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1832        self.advance(); // consume first $
1833        self.advance(); // consume second $
1834
1835        // For $$...$$ (no tag), just scan until closing $$
1836        let start = self.current;
1837        while !self.is_at_end() {
1838            if self.peek() == '$'
1839                && self.current + 1 < self.size
1840                && self.chars[self.current + 1] == '$'
1841            {
1842                break;
1843            }
1844            self.advance();
1845        }
1846
1847        let content = self.text_from_range(start, self.current);
1848
1849        if !self.is_at_end() {
1850            self.advance(); // consume first $
1851            self.advance(); // consume second $
1852        }
1853
1854        self.add_token_with_text(TokenType::DollarString, content);
1855        Ok(())
1856    }
1857
1858    fn scan_token(&mut self) -> Result<()> {
1859        let c = self.peek();
1860
1861        // Check for string literal
1862        if c == '\'' {
1863            // Check for triple-quoted string '''...''' if configured
1864            if self.config.quotes.contains_key("'''")
1865                && self.peek_next() == '\''
1866                && self.current + 2 < self.size
1867                && self.chars[self.current + 2] == '\''
1868            {
1869                return self.scan_triple_quoted_string('\'');
1870            }
1871            return self.scan_string();
1872        }
1873
1874        // Check for triple-quoted string """...""" if configured
1875        if c == '"'
1876            && self.config.quotes.contains_key("\"\"\"")
1877            && self.peek_next() == '"'
1878            && self.current + 2 < self.size
1879            && self.chars[self.current + 2] == '"'
1880        {
1881            return self.scan_triple_quoted_string('"');
1882        }
1883
1884        // Check for double-quoted strings when dialect supports them (e.g., BigQuery)
1885        // This must come before identifier quotes check
1886        if c == '"'
1887            && self.config.quotes.contains_key("\"")
1888            && !self.config.identifiers.contains_key(&'"')
1889        {
1890            return self.scan_double_quoted_string();
1891        }
1892
1893        // Check for identifier quotes
1894        if let Some(&end_quote) = self.config.identifiers.get(&c) {
1895            return self.scan_quoted_identifier(end_quote);
1896        }
1897
1898        // Check for numbers (including numbers starting with a dot like .25)
1899        if c.is_ascii_digit() {
1900            return self.scan_number();
1901        }
1902
1903        // Check for numbers starting with a dot (e.g., .25, .5)
1904        // This must come before single character token handling
1905        // Don't treat as a number if:
1906        // - Previous char was also a dot (e.g., 1..2 should be 1, ., ., 2)
1907        // - Previous char is an identifier character (e.g., foo.25 should be foo, ., 25)
1908        //   This handles BigQuery numeric table parts like project.dataset.25
1909        if c == '.' && self.peek_next().is_ascii_digit() {
1910            let prev_char = if self.current > 0 {
1911                self.chars[self.current - 1]
1912            } else {
1913                '\0'
1914            };
1915            let is_after_ident = prev_char.is_alphanumeric()
1916                || prev_char == '_'
1917                || prev_char == '`'
1918                || prev_char == '"'
1919                || prev_char == ']'
1920                || prev_char == ')';
1921            if prev_char != '.' && !is_after_ident {
1922                return self.scan_number_starting_with_dot();
1923            }
1924        }
1925
1926        // Check for hint comment /*+ ... */
1927        if c == '/'
1928            && self.peek_next() == '*'
1929            && self.current + 2 < self.size
1930            && self.chars[self.current + 2] == '+'
1931        {
1932            return self.scan_hint();
1933        }
1934
1935        // Check for multi-character operators first
1936        if let Some(token_type) = self.try_scan_multi_char_operator() {
1937            self.add_token(token_type);
1938            return Ok(());
1939        }
1940
1941        // Check for tagged dollar-quoted strings: $tag$content$tag$
1942        // Tags can contain Unicode characters (including emojis like 🦆) and digits (e.g., $1$)
1943        if c == '$'
1944            && (self.peek_next().is_alphanumeric()
1945                || self.peek_next() == '_'
1946                || !self.peek_next().is_ascii())
1947        {
1948            if let Some(()) = self.try_scan_tagged_dollar_string()? {
1949                return Ok(());
1950            }
1951            // If tagged dollar string didn't match and dollar_sign_is_identifier is set,
1952            // treat the $ and following chars as an identifier (e.g., ClickHouse $alias$name$).
1953            if self.config.dollar_sign_is_identifier {
1954                return self.scan_dollar_identifier();
1955            }
1956        }
1957
1958        // Check for dollar-quoted strings: $$...$$
1959        if c == '$' && self.peek_next() == '$' {
1960            return self.scan_dollar_quoted_string();
1961        }
1962
1963        // Check for positional parameters: $1, $2, etc.
1964        if c == '$' && self.peek_next().is_ascii_digit() {
1965            return self.scan_positional_parameter();
1966        }
1967
1968        // ClickHouse: bare $ (not followed by alphanumeric/underscore) as identifier
1969        if c == '$' && self.config.dollar_sign_is_identifier {
1970            return self.scan_dollar_identifier();
1971        }
1972
1973        // TSQL: Check for identifiers starting with # (temp tables) or @ (variables)
1974        // e.g., #temp, ##global_temp, @variable
1975        if (c == '#' || c == '@')
1976            && (self.peek_next().is_alphanumeric()
1977                || self.peek_next() == '_'
1978                || self.peek_next() == '#')
1979        {
1980            return self.scan_tsql_identifier();
1981        }
1982
1983        // Check for single character tokens
1984        if let Some(&token_type) = self.config.single_tokens.get(&c) {
1985            self.advance();
1986            self.add_token(token_type);
1987            return Ok(());
1988        }
1989
1990        // Unicode minus (U+2212) → treat as regular minus
1991        if c == '\u{2212}' {
1992            self.advance();
1993            self.add_token(TokenType::Dash);
1994            return Ok(());
1995        }
1996
1997        // Unicode fraction slash (U+2044) → treat as regular slash
1998        if c == '\u{2044}' {
1999            self.advance();
2000            self.add_token(TokenType::Slash);
2001            return Ok(());
2002        }
2003
2004        // Unicode curly/smart quotes → treat as regular string quotes
2005        if c == '\u{2018}' || c == '\u{2019}' {
2006            // Left/right single quotation marks → scan as string with matching end
2007            return self.scan_unicode_quoted_string(c);
2008        }
2009        if c == '\u{201C}' || c == '\u{201D}' {
2010            // Left/right double quotation marks → scan as quoted identifier
2011            return self.scan_unicode_quoted_identifier(c);
2012        }
2013
2014        // Must be an identifier or keyword
2015        self.scan_identifier_or_keyword()
2016    }
2017
2018    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
2019        let c = self.peek();
2020        let next = self.peek_next();
2021        let third = if self.current + 2 < self.size {
2022            self.chars[self.current + 2]
2023        } else {
2024            '\0'
2025        };
2026
2027        // Check for three-character operators first
2028        // -|- (Adjacent - PostgreSQL range adjacency)
2029        if c == '-' && next == '|' && third == '-' {
2030            self.advance();
2031            self.advance();
2032            self.advance();
2033            return Some(TokenType::Adjacent);
2034        }
2035
2036        // ||/ (Cube root - PostgreSQL)
2037        if c == '|' && next == '|' && third == '/' {
2038            self.advance();
2039            self.advance();
2040            self.advance();
2041            return Some(TokenType::DPipeSlash);
2042        }
2043
2044        // #>> (JSONB path text extraction - PostgreSQL)
2045        if c == '#' && next == '>' && third == '>' {
2046            self.advance();
2047            self.advance();
2048            self.advance();
2049            return Some(TokenType::DHashArrow);
2050        }
2051
2052        // ->> (JSON text extraction - PostgreSQL/MySQL)
2053        if c == '-' && next == '>' && third == '>' {
2054            self.advance();
2055            self.advance();
2056            self.advance();
2057            return Some(TokenType::DArrow);
2058        }
2059
2060        // <=> (NULL-safe equality - MySQL)
2061        if c == '<' && next == '=' && third == '>' {
2062            self.advance();
2063            self.advance();
2064            self.advance();
2065            return Some(TokenType::NullsafeEq);
2066        }
2067
2068        // <-> (Distance operator - PostgreSQL)
2069        if c == '<' && next == '-' && third == '>' {
2070            self.advance();
2071            self.advance();
2072            self.advance();
2073            return Some(TokenType::LrArrow);
2074        }
2075
2076        // <@ (Contained by - PostgreSQL)
2077        if c == '<' && next == '@' {
2078            self.advance();
2079            self.advance();
2080            return Some(TokenType::LtAt);
2081        }
2082
2083        // @> (Contains - PostgreSQL)
2084        if c == '@' && next == '>' {
2085            self.advance();
2086            self.advance();
2087            return Some(TokenType::AtGt);
2088        }
2089
2090        // ~~~ (Glob - PostgreSQL)
2091        if c == '~' && next == '~' && third == '~' {
2092            self.advance();
2093            self.advance();
2094            self.advance();
2095            return Some(TokenType::Glob);
2096        }
2097
2098        // ~~* (ILike - PostgreSQL)
2099        if c == '~' && next == '~' && third == '*' {
2100            self.advance();
2101            self.advance();
2102            self.advance();
2103            return Some(TokenType::ILike);
2104        }
2105
2106        // !~~* (Not ILike - PostgreSQL)
2107        let fourth = if self.current + 3 < self.size {
2108            self.chars[self.current + 3]
2109        } else {
2110            '\0'
2111        };
2112        if c == '!' && next == '~' && third == '~' && fourth == '*' {
2113            self.advance();
2114            self.advance();
2115            self.advance();
2116            self.advance();
2117            return Some(TokenType::NotILike);
2118        }
2119
2120        // !~~ (Not Like - PostgreSQL)
2121        if c == '!' && next == '~' && third == '~' {
2122            self.advance();
2123            self.advance();
2124            self.advance();
2125            return Some(TokenType::NotLike);
2126        }
2127
2128        // !~* (Not Regexp ILike - PostgreSQL)
2129        if c == '!' && next == '~' && third == '*' {
2130            self.advance();
2131            self.advance();
2132            self.advance();
2133            return Some(TokenType::NotIRLike);
2134        }
2135
2136        // !:> (Not cast / Try cast - SingleStore)
2137        if c == '!' && next == ':' && third == '>' {
2138            self.advance();
2139            self.advance();
2140            self.advance();
2141            return Some(TokenType::NColonGt);
2142        }
2143
2144        // ?:: (TRY_CAST shorthand - Databricks)
2145        if c == '?' && next == ':' && third == ':' {
2146            self.advance();
2147            self.advance();
2148            self.advance();
2149            return Some(TokenType::QDColon);
2150        }
2151
2152        // !~ (Not Regexp - PostgreSQL)
2153        if c == '!' && next == '~' {
2154            self.advance();
2155            self.advance();
2156            return Some(TokenType::NotRLike);
2157        }
2158
2159        // ~~ (Like - PostgreSQL)
2160        if c == '~' && next == '~' {
2161            self.advance();
2162            self.advance();
2163            return Some(TokenType::Like);
2164        }
2165
2166        // ~* (Regexp ILike - PostgreSQL)
2167        if c == '~' && next == '*' {
2168            self.advance();
2169            self.advance();
2170            return Some(TokenType::IRLike);
2171        }
2172
2173        // SingleStore three-character JSON path operators (must be checked before :: two-char)
2174        // ::$ (JSON extract string), ::% (JSON extract double), ::? (JSON match)
2175        if c == ':' && next == ':' && third == '$' {
2176            self.advance();
2177            self.advance();
2178            self.advance();
2179            return Some(TokenType::DColonDollar);
2180        }
2181        if c == ':' && next == ':' && third == '%' {
2182            self.advance();
2183            self.advance();
2184            self.advance();
2185            return Some(TokenType::DColonPercent);
2186        }
2187        if c == ':' && next == ':' && third == '?' {
2188            self.advance();
2189            self.advance();
2190            self.advance();
2191            return Some(TokenType::DColonQMark);
2192        }
2193
2194        // Two-character operators
2195        let token_type = match (c, next) {
2196            ('.', ':') => Some(TokenType::DotColon),
2197            ('=', '=') => Some(TokenType::Eq), // Hive/Spark == equality operator
2198            ('<', '=') => Some(TokenType::Lte),
2199            ('>', '=') => Some(TokenType::Gte),
2200            ('!', '=') => Some(TokenType::Neq),
2201            ('<', '>') => Some(TokenType::Neq),
2202            ('^', '=') => Some(TokenType::Neq),
2203            ('<', '<') => Some(TokenType::LtLt),
2204            ('>', '>') => Some(TokenType::GtGt),
2205            ('|', '|') => Some(TokenType::DPipe),
2206            ('|', '/') => Some(TokenType::PipeSlash), // Square root - PostgreSQL
2207            (':', ':') => Some(TokenType::DColon),
2208            (':', '=') => Some(TokenType::ColonEq), // := (assignment, named args)
2209            (':', '>') => Some(TokenType::ColonGt), // ::> (TSQL)
2210            ('-', '>') => Some(TokenType::Arrow),   // JSON object access
2211            ('=', '>') => Some(TokenType::FArrow),  // Fat arrow (lambda)
2212            ('&', '&') => Some(TokenType::DAmp),
2213            ('&', '<') => Some(TokenType::AmpLt), // PostgreSQL range operator
2214            ('&', '>') => Some(TokenType::AmpGt), // PostgreSQL range operator
2215            ('@', '@') => Some(TokenType::AtAt),  // Text search match
2216            ('?', '|') => Some(TokenType::QMarkPipe), // JSONB contains any key
2217            ('?', '&') => Some(TokenType::QMarkAmp), // JSONB contains all keys
2218            ('?', '?') => Some(TokenType::DQMark), // Double question mark
2219            ('#', '>') => Some(TokenType::HashArrow), // JSONB path extraction
2220            ('#', '-') => Some(TokenType::HashDash), // JSONB delete
2221            ('^', '@') => Some(TokenType::CaretAt), // PostgreSQL starts-with operator
2222            ('*', '*') => Some(TokenType::DStar), // Power operator
2223            ('|', '>') => Some(TokenType::PipeGt), // Pipe-greater (some dialects)
2224            _ => None,
2225        };
2226
2227        if token_type.is_some() {
2228            self.advance();
2229            self.advance();
2230        }
2231
2232        token_type
2233    }
2234
2235    fn scan_string(&mut self) -> Result<()> {
2236        self.advance(); // Opening quote
2237        let mut value = String::new();
2238
2239        while !self.is_at_end() {
2240            let c = self.peek();
2241            if c == '\'' {
2242                if self.peek_next() == '\'' {
2243                    // Escaped quote
2244                    value.push('\'');
2245                    self.advance();
2246                    self.advance();
2247                } else {
2248                    break;
2249                }
2250            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2251                if self.config.recover_terminal_backslash_quote
2252                    && self.peek_next() == '\''
2253                    && !self.chars[self.current + 2..].contains(&'\'')
2254                {
2255                    value.push(self.advance());
2256                    break;
2257                }
2258
2259                // Handle escape sequences
2260                self.advance(); // Consume the backslash
2261                if !self.is_at_end() {
2262                    let escaped = self.advance();
2263                    match escaped {
2264                        'n' => value.push('\n'),
2265                        'r' => value.push('\r'),
2266                        't' => value.push('\t'),
2267                        '0' => value.push('\0'),
2268                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2269                        'a' => value.push('\x07'), // Alert/bell
2270                        'b' => value.push('\x08'), // Backspace
2271                        'f' => value.push('\x0C'), // Form feed
2272                        'v' => value.push('\x0B'), // Vertical tab
2273                        'x' => {
2274                            // Hex escape: \xNN (exactly 2 hex digits)
2275                            let mut hex = String::with_capacity(2);
2276                            for _ in 0..2 {
2277                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2278                                    hex.push(self.advance());
2279                                }
2280                            }
2281                            if hex.len() == 2 {
2282                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2283                                    value.push(byte as char);
2284                                } else {
2285                                    value.push('\\');
2286                                    value.push('x');
2287                                    value.push_str(&hex);
2288                                }
2289                            } else {
2290                                // Not enough hex digits, preserve literally
2291                                value.push('\\');
2292                                value.push('x');
2293                                value.push_str(&hex);
2294                            }
2295                        }
2296                        '\\' => value.push('\\'),
2297                        '\'' => value.push('\''),
2298                        '"' => value.push('"'),
2299                        '%' => {
2300                            // MySQL: \% in LIKE patterns
2301                            value.push('%');
2302                        }
2303                        '_' => {
2304                            // MySQL: \_ in LIKE patterns
2305                            value.push('_');
2306                        }
2307                        // For unrecognized escape sequences:
2308                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2309                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2310                        _ => {
2311                            if !self.config.escape_follow_chars.is_empty() {
2312                                // MySQL-style: discard backslash for unrecognized escapes
2313                                value.push(escaped);
2314                            } else {
2315                                // Standard: preserve backslash + char
2316                                value.push('\\');
2317                                value.push(escaped);
2318                            }
2319                        }
2320                    }
2321                }
2322            } else {
2323                value.push(self.advance());
2324            }
2325        }
2326
2327        if self.is_at_end() {
2328            if self.config.recover_unterminated_string {
2329                self.add_token_with_text(TokenType::String, value);
2330                return Ok(());
2331            }
2332
2333            return Err(Error::tokenize(
2334                "Unterminated string",
2335                self.line,
2336                self.column,
2337                self.start,
2338                self.current,
2339            ));
2340        }
2341
2342        self.advance(); // Closing quote
2343        self.add_token_with_text(TokenType::String, value);
2344        Ok(())
2345    }
2346
2347    /// Scan a double-quoted string (for dialects like BigQuery where " is a string delimiter)
2348    fn scan_double_quoted_string(&mut self) -> Result<()> {
2349        self.advance(); // Opening quote
2350        let mut value = String::new();
2351
2352        while !self.is_at_end() {
2353            let c = self.peek();
2354            if c == '"' {
2355                if self.peek_next() == '"' {
2356                    // Escaped quote
2357                    value.push('"');
2358                    self.advance();
2359                    self.advance();
2360                } else {
2361                    break;
2362                }
2363            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2364                // Handle escape sequences
2365                self.advance(); // Consume the backslash
2366                if !self.is_at_end() {
2367                    let escaped = self.advance();
2368                    match escaped {
2369                        'n' => value.push('\n'),
2370                        'r' => value.push('\r'),
2371                        't' => value.push('\t'),
2372                        '0' => value.push('\0'),
2373                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2374                        'a' => value.push('\x07'), // Alert/bell
2375                        'b' => value.push('\x08'), // Backspace
2376                        'f' => value.push('\x0C'), // Form feed
2377                        'v' => value.push('\x0B'), // Vertical tab
2378                        'x' => {
2379                            // Hex escape: \xNN (exactly 2 hex digits)
2380                            let mut hex = String::with_capacity(2);
2381                            for _ in 0..2 {
2382                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2383                                    hex.push(self.advance());
2384                                }
2385                            }
2386                            if hex.len() == 2 {
2387                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2388                                    value.push(byte as char);
2389                                } else {
2390                                    value.push('\\');
2391                                    value.push('x');
2392                                    value.push_str(&hex);
2393                                }
2394                            } else {
2395                                // Not enough hex digits, preserve literally
2396                                value.push('\\');
2397                                value.push('x');
2398                                value.push_str(&hex);
2399                            }
2400                        }
2401                        '\\' => value.push('\\'),
2402                        '\'' => value.push('\''),
2403                        '"' => value.push('"'),
2404                        '%' => {
2405                            // MySQL: \% in LIKE patterns
2406                            value.push('%');
2407                        }
2408                        '_' => {
2409                            // MySQL: \_ in LIKE patterns
2410                            value.push('_');
2411                        }
2412                        // For unrecognized escape sequences:
2413                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2414                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2415                        _ => {
2416                            if !self.config.escape_follow_chars.is_empty() {
2417                                // MySQL-style: discard backslash for unrecognized escapes
2418                                value.push(escaped);
2419                            } else {
2420                                // Standard: preserve backslash + char
2421                                value.push('\\');
2422                                value.push(escaped);
2423                            }
2424                        }
2425                    }
2426                }
2427            } else {
2428                value.push(self.advance());
2429            }
2430        }
2431
2432        if self.is_at_end() {
2433            return Err(Error::tokenize(
2434                "Unterminated double-quoted string",
2435                self.line,
2436                self.column,
2437                self.start,
2438                self.current,
2439            ));
2440        }
2441
2442        self.advance(); // Closing quote
2443        self.add_token_with_text(TokenType::String, value);
2444        Ok(())
2445    }
2446
2447    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2448        // Advance past the three opening quotes
2449        self.advance();
2450        self.advance();
2451        self.advance();
2452        let mut value = String::new();
2453
2454        while !self.is_at_end() {
2455            // Check for closing triple quote
2456            if self.peek() == quote_char
2457                && self.current + 1 < self.size
2458                && self.chars[self.current + 1] == quote_char
2459                && self.current + 2 < self.size
2460                && self.chars[self.current + 2] == quote_char
2461            {
2462                // Found closing """
2463                break;
2464            }
2465            value.push(self.advance());
2466        }
2467
2468        if self.is_at_end() {
2469            return Err(Error::tokenize(
2470                "Unterminated triple-quoted string",
2471                self.line,
2472                self.column,
2473                self.start,
2474                self.current,
2475            ));
2476        }
2477
2478        // Advance past the three closing quotes
2479        self.advance();
2480        self.advance();
2481        self.advance();
2482        let token_type = if quote_char == '"' {
2483            TokenType::TripleDoubleQuotedString
2484        } else {
2485            TokenType::TripleSingleQuotedString
2486        };
2487        self.add_token_with_text(token_type, value);
2488        Ok(())
2489    }
2490
2491    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2492        self.advance(); // Opening quote
2493        let mut value = String::new();
2494
2495        loop {
2496            if self.is_at_end() {
2497                return Err(Error::tokenize(
2498                    "Unterminated identifier",
2499                    self.line,
2500                    self.column,
2501                    self.start,
2502                    self.current,
2503                ));
2504            }
2505            if self.peek() == end_quote {
2506                if self.peek_next() == end_quote {
2507                    // Escaped quote (e.g., "" inside "x""y") -> store single quote
2508                    value.push(end_quote);
2509                    self.advance(); // skip first quote
2510                    self.advance(); // skip second quote
2511                } else {
2512                    // End of identifier
2513                    break;
2514                }
2515            } else {
2516                value.push(self.peek());
2517                self.advance();
2518            }
2519        }
2520
2521        self.advance(); // Closing quote
2522        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2523        Ok(())
2524    }
2525
2526    /// Scan a string delimited by Unicode curly single quotes (U+2018/U+2019).
2527    /// Content between curly quotes is literal (no escape processing).
2528    /// When opened with \u{2018} (left), close with \u{2019} (right) only.
2529    /// When opened with \u{2019} (right), close with \u{2019} (right) — self-closing.
2530    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2531        self.advance(); // Opening curly quote
2532        let start = self.current;
2533        // Determine closing quote: left opens -> right closes; right opens -> right closes
2534        let close_quote = if open_quote == '\u{2018}' {
2535            '\u{2019}' // left opens, right closes
2536        } else {
2537            '\u{2019}' // right quote also closes with right quote
2538        };
2539        while !self.is_at_end() && self.peek() != close_quote {
2540            self.advance();
2541        }
2542        let value = self.text_from_range(start, self.current);
2543        if !self.is_at_end() {
2544            self.advance(); // Closing quote
2545        }
2546        self.add_token_with_text(TokenType::String, value);
2547        Ok(())
2548    }
2549
2550    /// Scan an identifier delimited by Unicode curly double quotes (U+201C/U+201D).
2551    /// When opened with \u{201C} (left), close with \u{201D} (right) only.
2552    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2553        self.advance(); // Opening curly quote
2554        let start = self.current;
2555        let close_quote = if open_quote == '\u{201C}' {
2556            '\u{201D}' // left opens, right closes
2557        } else {
2558            '\u{201D}' // right also closes with right
2559        };
2560        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2561            self.advance();
2562        }
2563        let value = self.text_from_range(start, self.current);
2564        if !self.is_at_end() {
2565            self.advance(); // Closing quote
2566        }
2567        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2568        Ok(())
2569    }
2570
2571    fn scan_number(&mut self) -> Result<()> {
2572        // Check for 0x/0X hex number prefix (SQLite-style)
2573        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
2574            let next = if self.current + 1 < self.size {
2575                self.chars[self.current + 1]
2576            } else {
2577                '\0'
2578            };
2579            if next == 'x' || next == 'X' {
2580                // Advance past '0' and 'x'/'X'
2581                self.advance();
2582                self.advance();
2583                // Collect hex digits (allow underscores as separators, e.g., 0xbad_cafe)
2584                let hex_start = self.current;
2585                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
2586                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
2587                        break;
2588                    }
2589                    self.advance();
2590                }
2591                if self.current > hex_start {
2592                    // Check for hex float: 0xABC.DEFpEXP or 0xABCpEXP
2593                    let mut is_hex_float = false;
2594                    // Optional fractional part: .hexdigits
2595                    if !self.is_at_end() && self.peek() == '.' {
2596                        let after_dot = if self.current + 1 < self.size {
2597                            self.chars[self.current + 1]
2598                        } else {
2599                            '\0'
2600                        };
2601                        if after_dot.is_ascii_hexdigit() {
2602                            is_hex_float = true;
2603                            self.advance(); // consume '.'
2604                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2605                                self.advance();
2606                            }
2607                        }
2608                    }
2609                    // Optional binary exponent: p/P [+/-] digits
2610                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
2611                        is_hex_float = true;
2612                        self.advance(); // consume p/P
2613                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
2614                            self.advance();
2615                        }
2616                        while !self.is_at_end() && self.peek().is_ascii_digit() {
2617                            self.advance();
2618                        }
2619                    }
2620                    if is_hex_float {
2621                        // Hex float literal — emit as regular Number token with full text
2622                        let raw_text = self.text_from_range(self.start, self.current);
2623                        let full_text = if self.config.numbers_can_be_underscore_separated
2624                            && raw_text.contains('_')
2625                        {
2626                            raw_text.replace('_', "")
2627                        } else {
2628                            raw_text
2629                        };
2630                        self.add_token_with_text(TokenType::Number, full_text);
2631                    } else if self.config.hex_string_is_integer_type {
2632                        // BigQuery/ClickHouse: 0xA represents an integer in hex notation
2633                        let raw_value = self.text_from_range(hex_start, self.current);
2634                        let hex_value = if self.config.numbers_can_be_underscore_separated
2635                            && raw_value.contains('_')
2636                        {
2637                            raw_value.replace('_', "")
2638                        } else {
2639                            raw_value
2640                        };
2641                        self.add_token_with_text(TokenType::HexNumber, hex_value);
2642                    } else {
2643                        // SQLite/Teradata: 0xCC represents a binary/blob hex string
2644                        let raw_value = self.text_from_range(hex_start, self.current);
2645                        let hex_value = if self.config.numbers_can_be_underscore_separated
2646                            && raw_value.contains('_')
2647                        {
2648                            raw_value.replace('_', "")
2649                        } else {
2650                            raw_value
2651                        };
2652                        self.add_token_with_text(TokenType::HexString, hex_value);
2653                    }
2654                    return Ok(());
2655                }
2656                // No hex digits after 0x - fall through to normal number parsing
2657                // (reset current back to after '0')
2658                self.current = self.start + 1;
2659            }
2660        }
2661
2662        // Allow underscores as digit separators (e.g., 20_000, 1_000_000)
2663        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2664            // Don't allow underscore at the end (must be followed by digit)
2665            if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
2666                break;
2667            }
2668            self.advance();
2669        }
2670
2671        // Look for decimal part - allow trailing dot (e.g., "1.")
2672        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
2673        // So we always consume the dot as part of the number, even if followed by an identifier
2674        if self.peek() == '.' {
2675            let next = self.peek_next();
2676            // Only consume the dot if:
2677            // 1. Followed by a digit (normal decimal like 1.5)
2678            // 2. Followed by an identifier start (like 1.x -> becomes 1. with alias x)
2679            // 3. End of input or other non-dot character (trailing decimal like "1.")
2680            // Do NOT consume if it's a double dot (..) which is a range operator
2681            if next != '.' {
2682                self.advance(); // consume the .
2683                                // Only consume digits after the decimal point (not identifiers)
2684                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2685                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2686                        break;
2687                    }
2688                    self.advance();
2689                }
2690            }
2691        }
2692
2693        // Look for exponent
2694        if self.peek() == 'e' || self.peek() == 'E' {
2695            self.advance();
2696            if self.peek() == '+' || self.peek() == '-' {
2697                self.advance();
2698            }
2699            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2700                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2701                    break;
2702                }
2703                self.advance();
2704            }
2705        }
2706
2707        let raw_text = self.text_from_range(self.start, self.current);
2708        // Strip underscore digit separators (e.g., 20_000 -> 20000, 1_2E+1_0 -> 12E+10)
2709        // Only for dialects that support this (ClickHouse, DuckDB)
2710        let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2711            raw_text.replace('_', "")
2712        } else {
2713            raw_text
2714        };
2715
2716        // Check for numeric literal suffixes (e.g., 1L -> BIGINT, 1s -> SMALLINT in Hive/Spark)
2717        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2718            let next_char: String = self.peek().to_ascii_uppercase().to_string();
2719            // Try 2-char suffix first (e.g., "BD"), then 1-char
2720            let suffix_match = if self.current + 1 < self.size {
2721                let two_char: String = [
2722                    self.chars[self.current].to_ascii_uppercase(),
2723                    self.chars[self.current + 1].to_ascii_uppercase(),
2724                ]
2725                .iter()
2726                .collect();
2727                if self.config.numeric_literals.contains_key(&two_char) {
2728                    // Make sure the 2-char suffix is not followed by more identifier chars
2729                    let after_suffix = if self.current + 2 < self.size {
2730                        self.chars[self.current + 2]
2731                    } else {
2732                        ' '
2733                    };
2734                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2735                        Some((two_char, 2))
2736                    } else {
2737                        None
2738                    }
2739                } else if self.config.numeric_literals.contains_key(&next_char) {
2740                    // 1-char suffix - make sure not followed by more identifier chars
2741                    let after_suffix = if self.current + 1 < self.size {
2742                        self.chars[self.current + 1]
2743                    } else {
2744                        ' '
2745                    };
2746                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2747                        Some((next_char, 1))
2748                    } else {
2749                        None
2750                    }
2751                } else {
2752                    None
2753                }
2754            } else if self.config.numeric_literals.contains_key(&next_char) {
2755                // At end of input, 1-char suffix
2756                Some((next_char, 1))
2757            } else {
2758                None
2759            };
2760
2761            if let Some((suffix, len)) = suffix_match {
2762                // Consume the suffix characters
2763                for _ in 0..len {
2764                    self.advance();
2765                }
2766                // Emit as a special number-with-suffix token
2767                // We'll encode as "number::TYPE" so the parser can split it
2768                let type_name = self
2769                    .config
2770                    .numeric_literals
2771                    .get(&suffix)
2772                    .expect("suffix verified by contains_key above")
2773                    .clone();
2774                let combined = format!("{}::{}", text, type_name);
2775                self.add_token_with_text(TokenType::Number, combined);
2776                return Ok(());
2777            }
2778        }
2779
2780        // Check for identifiers that start with a digit (e.g., 1a, 1_a, 1a_1a)
2781        // In Hive/Spark/MySQL/ClickHouse, these are valid unquoted identifiers
2782        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2783            let next = self.peek();
2784            if next.is_alphabetic() || next == '_' {
2785                // Continue scanning as an identifier
2786                while !self.is_at_end() {
2787                    let ch = self.peek();
2788                    if ch.is_alphanumeric() || ch == '_' {
2789                        self.advance();
2790                    } else {
2791                        break;
2792                    }
2793                }
2794                let ident_text = self.text_from_range(self.start, self.current);
2795                self.add_token_with_text(TokenType::Identifier, ident_text);
2796                return Ok(());
2797            }
2798        }
2799
2800        self.add_token_with_text(TokenType::Number, text);
2801        Ok(())
2802    }
2803
2804    /// Scan a number that starts with a dot (e.g., .25, .5, .123e10)
2805    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2806        // Consume the leading dot
2807        self.advance();
2808
2809        // Consume the fractional digits
2810        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2811            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2812                break;
2813            }
2814            self.advance();
2815        }
2816
2817        // Look for exponent
2818        if self.peek() == 'e' || self.peek() == 'E' {
2819            self.advance();
2820            if self.peek() == '+' || self.peek() == '-' {
2821                self.advance();
2822            }
2823            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2824                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2825                    break;
2826                }
2827                self.advance();
2828            }
2829        }
2830
2831        let raw_text = self.text_from_range(self.start, self.current);
2832        // Strip underscore digit separators (e.g., .1_5 -> .15)
2833        // Only for dialects that support this (ClickHouse, DuckDB)
2834        let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2835            raw_text.replace('_', "")
2836        } else {
2837            raw_text
2838        };
2839        self.add_token_with_text(TokenType::Number, text);
2840        Ok(())
2841    }
2842
2843    /// Look up a keyword using a stack buffer for ASCII uppercasing, avoiding heap allocation.
2844    /// Returns `TokenType::Var` for texts longer than 128 bytes or non-UTF-8 results.
2845    #[inline]
2846    fn lookup_keyword_ascii(keywords: &HashMap<String, TokenType>, text: &str) -> TokenType {
2847        if text.len() > 128 {
2848            return TokenType::Var;
2849        }
2850        let mut buf = [0u8; 128];
2851        for (i, b) in text.bytes().enumerate() {
2852            buf[i] = b.to_ascii_uppercase();
2853        }
2854        if let Ok(upper) = std::str::from_utf8(&buf[..text.len()]) {
2855            keywords.get(upper).copied().unwrap_or(TokenType::Var)
2856        } else {
2857            TokenType::Var
2858        }
2859    }
2860
2861    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
2862        // Guard against unrecognized characters that could cause infinite loops
2863        let first_char = self.peek();
2864        if !first_char.is_alphanumeric() && first_char != '_' {
2865            // Unknown character - skip it and return an error
2866            let c = self.advance();
2867            return Err(Error::tokenize(
2868                format!("Unexpected character: '{}'", c),
2869                self.line,
2870                self.column,
2871                self.start,
2872                self.current,
2873            ));
2874        }
2875
2876        while !self.is_at_end() {
2877            let c = self.peek();
2878            // Allow alphanumeric, underscore, $, # and @ in identifiers
2879            // PostgreSQL allows $, TSQL allows # and @
2880            // But stop consuming # if followed by > or >> (PostgreSQL #> and #>> operators)
2881            if c == '#' {
2882                let next_c = if self.current + 1 < self.size {
2883                    self.chars[self.current + 1]
2884                } else {
2885                    '\0'
2886                };
2887                if next_c == '>' || next_c == '-' {
2888                    break; // Don't consume # — it's part of #>, #>>, or #- operator
2889                }
2890                self.advance();
2891            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
2892                self.advance();
2893            } else {
2894                break;
2895            }
2896        }
2897
2898        let text = self.text_from_range(self.start, self.current);
2899
2900        // Special-case NOT= (Teradata and other dialects)
2901        if text.eq_ignore_ascii_case("NOT") && self.peek() == '=' {
2902            self.advance(); // consume '='
2903            self.add_token(TokenType::Neq);
2904            return Ok(());
2905        }
2906
2907        // Check for special string prefixes like N'...', X'...', B'...', U&'...', r'...', b'...'
2908        // Also handle double-quoted variants for dialects that support them (e.g., BigQuery)
2909        let next_char = self.peek();
2910        let is_single_quote = next_char == '\'';
2911        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
2912        // For raw strings (r"..." or r'...'), we allow double quotes even if " is not in quotes config
2913        // because raw strings are a special case used in Spark/Databricks where " is for identifiers
2914        let is_double_quote_for_raw = next_char == '"';
2915
2916        // Handle raw strings first - they're special because they work with both ' and "
2917        // even in dialects where " is normally an identifier delimiter (like Databricks)
2918        if text.eq_ignore_ascii_case("R") && (is_single_quote || is_double_quote_for_raw) {
2919            // Raw string r'...' or r"..." or r'''...''' or r"""...""" (BigQuery style)
2920            // In raw strings, backslashes are treated literally (no escape processing)
2921            let quote_char = if is_single_quote { '\'' } else { '"' };
2922            self.advance(); // consume the first opening quote
2923
2924            // Check for triple-quoted raw string (r"""...""" or r'''...''')
2925            if self.peek() == quote_char && self.peek_next() == quote_char {
2926                // Triple-quoted raw string
2927                self.advance(); // consume second quote
2928                self.advance(); // consume third quote
2929                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
2930                self.add_token_with_text(TokenType::RawString, string_value);
2931            } else {
2932                let string_value = self.scan_raw_string_content(quote_char)?;
2933                self.add_token_with_text(TokenType::RawString, string_value);
2934            }
2935            return Ok(());
2936        }
2937
2938        if is_single_quote || is_double_quote {
2939            if text.eq_ignore_ascii_case("N") {
2940                // National string N'...'
2941                self.advance(); // consume the opening quote
2942                let string_value = if is_single_quote {
2943                    self.scan_string_content()?
2944                } else {
2945                    self.scan_double_quoted_string_content()?
2946                };
2947                self.add_token_with_text(TokenType::NationalString, string_value);
2948                return Ok(());
2949            } else if text.eq_ignore_ascii_case("E") {
2950                // PostgreSQL escape string E'...' or e'...'
2951                // Preserve the case by prefixing with "e:" or "E:"
2952                // Always use backslash escapes for escape strings (e.g., \' is an escaped quote)
2953                let lowercase = text == "e";
2954                let prefix = if lowercase { "e:" } else { "E:" };
2955                self.advance(); // consume the opening quote
2956                let string_value = self.scan_string_content_with_escapes(true)?;
2957                self.add_token_with_text(
2958                    TokenType::EscapeString,
2959                    format!("{}{}", prefix, string_value),
2960                );
2961                return Ok(());
2962            } else if text.eq_ignore_ascii_case("X") {
2963                // Hex string X'...'
2964                self.advance(); // consume the opening quote
2965                let string_value = if is_single_quote {
2966                    self.scan_string_content()?
2967                } else {
2968                    self.scan_double_quoted_string_content()?
2969                };
2970                self.add_token_with_text(TokenType::HexString, string_value);
2971                return Ok(());
2972            } else if text.eq_ignore_ascii_case("B") && is_double_quote {
2973                // Byte string b"..." (BigQuery style) - MUST check before single quote B'...'
2974                self.advance(); // consume the opening quote
2975                let string_value = self.scan_double_quoted_string_content()?;
2976                self.add_token_with_text(TokenType::ByteString, string_value);
2977                return Ok(());
2978            } else if text.eq_ignore_ascii_case("B") && is_single_quote {
2979                // For BigQuery: b'...' is a byte string (bytes data)
2980                // For standard SQL: B'...' is a bit string (binary digits)
2981                self.advance(); // consume the opening quote
2982                let string_value = self.scan_string_content()?;
2983                if self.config.b_prefix_is_byte_string {
2984                    self.add_token_with_text(TokenType::ByteString, string_value);
2985                } else {
2986                    self.add_token_with_text(TokenType::BitString, string_value);
2987                }
2988                return Ok(());
2989            }
2990        }
2991
2992        // Check for U&'...' Unicode string syntax (SQL standard)
2993        if text.eq_ignore_ascii_case("U")
2994            && self.peek() == '&'
2995            && self.current + 1 < self.size
2996            && self.chars[self.current + 1] == '\''
2997        {
2998            self.advance(); // consume '&'
2999            self.advance(); // consume opening quote
3000            let string_value = self.scan_string_content()?;
3001            self.add_token_with_text(TokenType::UnicodeString, string_value);
3002            return Ok(());
3003        }
3004
3005        let token_type = Self::lookup_keyword_ascii(&self.config.keywords, &text);
3006
3007        self.add_token_with_text(token_type, text);
3008        Ok(())
3009    }
3010
3011    /// Scan string content (everything between quotes)
3012    /// If `force_backslash_escapes` is true, backslash is always treated as an escape character
3013    /// (used for PostgreSQL E'...' escape strings)
3014    fn scan_string_content_with_escapes(
3015        &mut self,
3016        force_backslash_escapes: bool,
3017    ) -> Result<String> {
3018        let mut value = String::new();
3019        let use_backslash_escapes =
3020            force_backslash_escapes || self.config.string_escapes.contains(&'\\');
3021
3022        while !self.is_at_end() {
3023            let c = self.peek();
3024            if c == '\'' {
3025                if self.peek_next() == '\'' {
3026                    // Escaped quote ''
3027                    value.push('\'');
3028                    self.advance();
3029                    self.advance();
3030                } else {
3031                    break;
3032                }
3033            } else if c == '\\' && use_backslash_escapes {
3034                // Preserve escape sequences literally (including \' for escape strings)
3035                value.push(self.advance());
3036                if !self.is_at_end() {
3037                    value.push(self.advance());
3038                }
3039            } else {
3040                value.push(self.advance());
3041            }
3042        }
3043
3044        if self.is_at_end() {
3045            return Err(Error::tokenize(
3046                "Unterminated string",
3047                self.line,
3048                self.column,
3049                self.start,
3050                self.current,
3051            ));
3052        }
3053
3054        self.advance(); // Closing quote
3055        Ok(value)
3056    }
3057
3058    /// Scan string content (everything between quotes)
3059    fn scan_string_content(&mut self) -> Result<String> {
3060        self.scan_string_content_with_escapes(false)
3061    }
3062
3063    /// Scan double-quoted string content (for dialects like BigQuery where " is a string delimiter)
3064    /// This is used for prefixed strings like b"..." or N"..."
3065    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
3066        let mut value = String::new();
3067        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');
3068
3069        while !self.is_at_end() {
3070            let c = self.peek();
3071            if c == '"' {
3072                if self.peek_next() == '"' {
3073                    // Escaped quote ""
3074                    value.push('"');
3075                    self.advance();
3076                    self.advance();
3077                } else {
3078                    break;
3079                }
3080            } else if c == '\\' && use_backslash_escapes {
3081                // Handle escape sequences
3082                self.advance(); // Consume backslash
3083                if !self.is_at_end() {
3084                    let escaped = self.advance();
3085                    match escaped {
3086                        'n' => value.push('\n'),
3087                        'r' => value.push('\r'),
3088                        't' => value.push('\t'),
3089                        '0' => value.push('\0'),
3090                        '\\' => value.push('\\'),
3091                        '"' => value.push('"'),
3092                        '\'' => value.push('\''),
3093                        'x' => {
3094                            // Hex escape \xNN - collect hex digits
3095                            let mut hex = String::new();
3096                            for _ in 0..2 {
3097                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
3098                                    hex.push(self.advance());
3099                                }
3100                            }
3101                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
3102                                value.push(byte as char);
3103                            } else {
3104                                // Invalid hex escape, keep it literal
3105                                value.push('\\');
3106                                value.push('x');
3107                                value.push_str(&hex);
3108                            }
3109                        }
3110                        _ => {
3111                            // For unrecognized escapes, preserve backslash + char
3112                            value.push('\\');
3113                            value.push(escaped);
3114                        }
3115                    }
3116                }
3117            } else {
3118                value.push(self.advance());
3119            }
3120        }
3121
3122        if self.is_at_end() {
3123            return Err(Error::tokenize(
3124                "Unterminated double-quoted string",
3125                self.line,
3126                self.column,
3127                self.start,
3128                self.current,
3129            ));
3130        }
3131
3132        self.advance(); // Closing quote
3133        Ok(value)
3134    }
3135
3136    /// Scan raw string content (limited escape processing for quotes)
3137    /// Used for BigQuery r'...' and r"..." strings
3138    /// In raw strings, backslashes are literal EXCEPT that escape sequences for the
3139    /// quote character still work (e.g., \' in r'...' escapes the quote, '' also works)
3140    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3141        let mut value = String::new();
3142
3143        while !self.is_at_end() {
3144            let c = self.peek();
3145            if c == quote_char {
3146                if self.peek_next() == quote_char {
3147                    // Escaped quote (doubled) - e.g., '' inside r'...'
3148                    value.push(quote_char);
3149                    self.advance();
3150                    self.advance();
3151                } else {
3152                    break;
3153                }
3154            } else if c == '\\'
3155                && self.peek_next() == quote_char
3156                && self.config.string_escapes_allowed_in_raw_strings
3157            {
3158                // Backslash-escaped quote - works in raw strings when string_escapes_allowed_in_raw_strings is true
3159                // e.g., \' inside r'...' becomes literal ' (BigQuery behavior)
3160                // Spark/Databricks has this set to false, so backslash is always literal there
3161                value.push(quote_char);
3162                self.advance(); // consume backslash
3163                self.advance(); // consume quote
3164            } else {
3165                // In raw strings, everything including backslashes is literal
3166                value.push(self.advance());
3167            }
3168        }
3169
3170        if self.is_at_end() {
3171            return Err(Error::tokenize(
3172                "Unterminated raw string",
3173                self.line,
3174                self.column,
3175                self.start,
3176                self.current,
3177            ));
3178        }
3179
3180        self.advance(); // Closing quote
3181        Ok(value)
3182    }
3183
3184    /// Scan raw triple-quoted string content (r"""...""" or r'''...''')
3185    /// Terminates when three consecutive quote_chars are found
3186    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3187        let mut value = String::new();
3188
3189        while !self.is_at_end() {
3190            let c = self.peek();
3191            if c == quote_char && self.peek_next() == quote_char {
3192                // Check for third quote
3193                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3194                    // Found three consecutive quotes - end of string
3195                    self.advance(); // first closing quote
3196                    self.advance(); // second closing quote
3197                    self.advance(); // third closing quote
3198                    return Ok(value);
3199                }
3200            }
3201            // In raw strings, everything including backslashes is literal
3202            let ch = self.advance();
3203            value.push(ch);
3204        }
3205
3206        Err(Error::tokenize(
3207            "Unterminated raw triple-quoted string",
3208            self.line,
3209            self.column,
3210            self.start,
3211            self.current,
3212        ))
3213    }
3214
3215    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables)
3216    /// Examples: #temp, ##global_temp, @variable
3217    /// Scan an identifier that starts with `$` (ClickHouse).
3218    /// Examples: `$alias$name$`, `$x`
3219    fn scan_dollar_identifier(&mut self) -> Result<()> {
3220        // Consume the leading $
3221        self.advance();
3222
3223        // Consume alphanumeric, _, and $ continuation chars
3224        while !self.is_at_end() {
3225            let c = self.peek();
3226            if c.is_alphanumeric() || c == '_' || c == '$' {
3227                self.advance();
3228            } else {
3229                break;
3230            }
3231        }
3232
3233        let text = self.text_from_range(self.start, self.current);
3234        self.add_token_with_text(TokenType::Var, text);
3235        Ok(())
3236    }
3237
3238    fn scan_tsql_identifier(&mut self) -> Result<()> {
3239        // Consume the leading # or @ (or ##)
3240        let first = self.advance();
3241
3242        // For ##, consume the second #
3243        if first == '#' && self.peek() == '#' {
3244            self.advance();
3245        }
3246
3247        // Now scan the rest of the identifier
3248        while !self.is_at_end() {
3249            let c = self.peek();
3250            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3251                self.advance();
3252            } else {
3253                break;
3254            }
3255        }
3256
3257        let text = self.text_from_range(self.start, self.current);
3258        // These are always identifiers (variables or temp table names), never keywords
3259        self.add_token_with_text(TokenType::Var, text);
3260        Ok(())
3261    }
3262
3263    /// Check if the last tokens match INSERT ... FORMAT <name> (not VALUES).
3264    /// If so, consume everything until the next blank line (two consecutive newlines)
3265    /// or end of input as raw data.
3266    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
3267        let len = self.tokens.len();
3268        if len < 3 {
3269            return None;
3270        }
3271
3272        // Last token should be the format name (Identifier or Var, not VALUES)
3273        let last = &self.tokens[len - 1];
3274        if last.text.eq_ignore_ascii_case("VALUES") {
3275            return None;
3276        }
3277        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
3278            return None;
3279        }
3280
3281        // Second-to-last should be FORMAT
3282        let format_tok = &self.tokens[len - 2];
3283        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
3284            return None;
3285        }
3286
3287        // Check that there's an INSERT somewhere earlier in the tokens
3288        let has_insert = self.tokens[..len - 2]
3289            .iter()
3290            .rev()
3291            .take(20)
3292            .any(|t| t.token_type == TokenType::Insert);
3293        if !has_insert {
3294            return None;
3295        }
3296
3297        // We're in INSERT ... FORMAT <name> context. Consume everything until:
3298        // - A blank line (two consecutive newlines, possibly with whitespace between)
3299        // - End of input
3300        let raw_start = self.current;
3301        while !self.is_at_end() {
3302            let c = self.peek();
3303            if c == '\n' {
3304                // Check for blank line: \n followed by optional \r and \n
3305                let saved = self.current;
3306                self.advance(); // consume first \n
3307                                // Skip \r if present
3308                while !self.is_at_end() && self.peek() == '\r' {
3309                    self.advance();
3310                }
3311                if self.is_at_end() || self.peek() == '\n' {
3312                    // Found blank line or end of input - stop here
3313                    // Don't consume the second \n so subsequent SQL can be tokenized
3314                    let raw = self.text_from_range(raw_start, saved);
3315                    return Some(raw.trim().to_string());
3316                }
3317                // Not a blank line, continue scanning
3318            } else {
3319                self.advance();
3320            }
3321        }
3322
3323        // Reached end of input
3324        let raw = self.text_from_range(raw_start, self.current);
3325        let trimmed = raw.trim().to_string();
3326        if trimmed.is_empty() {
3327            None
3328        } else {
3329            Some(trimmed)
3330        }
3331    }
3332
3333    fn add_token(&mut self, token_type: TokenType) {
3334        let text = self.text_from_range(self.start, self.current);
3335        self.add_token_with_text(token_type, text);
3336    }
3337
3338    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3339        let span = Span::new(self.start, self.current, self.line, self.column);
3340        let mut token = Token::new(token_type, text, span);
3341        token.comments.append(&mut self.comments);
3342        self.tokens.push(token);
3343    }
3344}
3345
3346#[cfg(test)]
3347mod tests {
3348    use super::*;
3349
3350    #[test]
3351    fn test_simple_select() {
3352        let tokenizer = Tokenizer::default();
3353        let tokens = tokenizer.tokenize("SELECT 1").unwrap();
3354
3355        assert_eq!(tokens.len(), 2);
3356        assert_eq!(tokens[0].token_type, TokenType::Select);
3357        assert_eq!(tokens[1].token_type, TokenType::Number);
3358        assert_eq!(tokens[1].text, "1");
3359    }
3360
3361    #[test]
3362    fn test_select_with_identifier() {
3363        let tokenizer = Tokenizer::default();
3364        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();
3365
3366        assert_eq!(tokens.len(), 6);
3367        assert_eq!(tokens[0].token_type, TokenType::Select);
3368        assert_eq!(tokens[1].token_type, TokenType::Var);
3369        assert_eq!(tokens[1].text, "a");
3370        assert_eq!(tokens[2].token_type, TokenType::Comma);
3371        assert_eq!(tokens[3].token_type, TokenType::Var);
3372        assert_eq!(tokens[3].text, "b");
3373        assert_eq!(tokens[4].token_type, TokenType::From);
3374        assert_eq!(tokens[5].token_type, TokenType::Var);
3375        assert_eq!(tokens[5].text, "t");
3376    }
3377
3378    #[test]
3379    fn test_string_literal() {
3380        let tokenizer = Tokenizer::default();
3381        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();
3382
3383        assert_eq!(tokens.len(), 2);
3384        assert_eq!(tokens[1].token_type, TokenType::String);
3385        assert_eq!(tokens[1].text, "hello");
3386    }
3387
3388    #[test]
3389    fn test_escaped_string() {
3390        let tokenizer = Tokenizer::default();
3391        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();
3392
3393        assert_eq!(tokens.len(), 2);
3394        assert_eq!(tokens[1].token_type, TokenType::String);
3395        assert_eq!(tokens[1].text, "it's");
3396    }
3397
3398    #[test]
3399    fn test_terminal_backslash_quote_recovery() {
3400        let mut config = TokenizerConfig::default();
3401        config.string_escapes.push('\\');
3402        config.recover_terminal_backslash_quote = true;
3403        let tokenizer = Tokenizer::new(config);
3404        let tokens = tokenizer
3405            .tokenize("SHOW FUNCTIONS LIKE 'a\\' OR 1=1")
3406            .unwrap();
3407
3408        assert_eq!(tokens.len(), 8);
3409        assert_eq!(tokens[3].token_type, TokenType::String);
3410        assert_eq!(tokens[3].text, "a\\");
3411        assert_eq!(tokens[4].token_type, TokenType::Or);
3412    }
3413
3414    #[test]
3415    fn test_comments() {
3416        let tokenizer = Tokenizer::default();
3417        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();
3418
3419        assert_eq!(tokens.len(), 2);
3420        // Comments are attached to the PREVIOUS token as trailing_comments
3421        // This is better for round-trip fidelity (e.g., SELECT c /* comment */ FROM)
3422        assert_eq!(tokens[0].trailing_comments.len(), 1);
3423        assert_eq!(tokens[0].trailing_comments[0], " comment");
3424    }
3425
3426    #[test]
3427    fn test_comment_in_and_chain() {
3428        use crate::generator::Generator;
3429        use crate::parser::Parser;
3430
3431        // Line comments between AND clauses should appear after the AND operator
3432        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
3433        let ast = Parser::parse_sql(sql).unwrap();
3434        let mut gen = Generator::default();
3435        let output = gen.generate(&ast[0]).unwrap();
3436        assert_eq!(
3437            output,
3438            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
3439        );
3440    }
3441
3442    #[test]
3443    fn test_operators() {
3444        let tokenizer = Tokenizer::default();
3445        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();
3446
3447        assert_eq!(tokens.len(), 5);
3448        assert_eq!(tokens[0].token_type, TokenType::Number);
3449        assert_eq!(tokens[1].token_type, TokenType::Plus);
3450        assert_eq!(tokens[2].token_type, TokenType::Number);
3451        assert_eq!(tokens[3].token_type, TokenType::Star);
3452        assert_eq!(tokens[4].token_type, TokenType::Number);
3453    }
3454
3455    #[test]
3456    fn test_comparison_operators() {
3457        let tokenizer = Tokenizer::default();
3458        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();
3459
3460        assert_eq!(tokens[1].token_type, TokenType::Lte);
3461        assert_eq!(tokens[3].token_type, TokenType::Gte);
3462        assert_eq!(tokens[5].token_type, TokenType::Neq);
3463    }
3464
3465    #[test]
3466    fn test_national_string() {
3467        let tokenizer = Tokenizer::default();
3468        let tokens = tokenizer.tokenize("N'abc'").unwrap();
3469
3470        assert_eq!(
3471            tokens.len(),
3472            1,
3473            "Expected 1 token for N'abc', got {:?}",
3474            tokens
3475        );
3476        assert_eq!(tokens[0].token_type, TokenType::NationalString);
3477        assert_eq!(tokens[0].text, "abc");
3478    }
3479
3480    #[test]
3481    fn test_hex_string() {
3482        let tokenizer = Tokenizer::default();
3483        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();
3484
3485        assert_eq!(
3486            tokens.len(),
3487            1,
3488            "Expected 1 token for X'ABCD', got {:?}",
3489            tokens
3490        );
3491        assert_eq!(tokens[0].token_type, TokenType::HexString);
3492        assert_eq!(tokens[0].text, "ABCD");
3493    }
3494
3495    #[test]
3496    fn test_bit_string() {
3497        let tokenizer = Tokenizer::default();
3498        let tokens = tokenizer.tokenize("B'01010'").unwrap();
3499
3500        assert_eq!(
3501            tokens.len(),
3502            1,
3503            "Expected 1 token for B'01010', got {:?}",
3504            tokens
3505        );
3506        assert_eq!(tokens[0].token_type, TokenType::BitString);
3507        assert_eq!(tokens[0].text, "01010");
3508    }
3509
3510    #[test]
3511    fn test_trailing_dot_number() {
3512        let tokenizer = Tokenizer::default();
3513
3514        // Test trailing dot
3515        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
3516        assert_eq!(
3517            tokens.len(),
3518            2,
3519            "Expected 2 tokens for 'SELECT 1.', got {:?}",
3520            tokens
3521        );
3522        assert_eq!(tokens[1].token_type, TokenType::Number);
3523        assert_eq!(tokens[1].text, "1.");
3524
3525        // Test normal decimal
3526        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
3527        assert_eq!(tokens[1].text, "1.5");
3528
3529        // Test number followed by dot and identifier
3530        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
3531        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
3532        assert_eq!(
3533            tokens.len(),
3534            3,
3535            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
3536            tokens
3537        );
3538        assert_eq!(tokens[1].token_type, TokenType::Number);
3539        assert_eq!(tokens[1].text, "1.");
3540        assert_eq!(tokens[2].token_type, TokenType::Var);
3541
3542        // Test two dots (range operator) - dot is NOT consumed when followed by another dot
3543        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
3544        assert_eq!(tokens[1].token_type, TokenType::Number);
3545        assert_eq!(tokens[1].text, "1");
3546        assert_eq!(tokens[2].token_type, TokenType::Dot);
3547        assert_eq!(tokens[3].token_type, TokenType::Dot);
3548        assert_eq!(tokens[4].token_type, TokenType::Number);
3549        assert_eq!(tokens[4].text, "2");
3550    }
3551
3552    #[test]
3553    fn test_leading_dot_number() {
3554        let tokenizer = Tokenizer::default();
3555
3556        // Test leading dot number (e.g., .25 for 0.25)
3557        let tokens = tokenizer.tokenize(".25").unwrap();
3558        assert_eq!(
3559            tokens.len(),
3560            1,
3561            "Expected 1 token for '.25', got {:?}",
3562            tokens
3563        );
3564        assert_eq!(tokens[0].token_type, TokenType::Number);
3565        assert_eq!(tokens[0].text, ".25");
3566
3567        // Test leading dot in context (Oracle SAMPLE clause)
3568        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
3569        assert_eq!(
3570            tokens.len(),
3571            4,
3572            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
3573            tokens
3574        );
3575        assert_eq!(tokens[0].token_type, TokenType::Sample);
3576        assert_eq!(tokens[1].token_type, TokenType::LParen);
3577        assert_eq!(tokens[2].token_type, TokenType::Number);
3578        assert_eq!(tokens[2].text, ".25");
3579        assert_eq!(tokens[3].token_type, TokenType::RParen);
3580
3581        // Test leading dot with exponent
3582        let tokens = tokenizer.tokenize(".5e10").unwrap();
3583        assert_eq!(
3584            tokens.len(),
3585            1,
3586            "Expected 1 token for '.5e10', got {:?}",
3587            tokens
3588        );
3589        assert_eq!(tokens[0].token_type, TokenType::Number);
3590        assert_eq!(tokens[0].text, ".5e10");
3591
3592        // Test that plain dot is still a Dot token
3593        let tokens = tokenizer.tokenize("a.b").unwrap();
3594        assert_eq!(
3595            tokens.len(),
3596            3,
3597            "Expected 3 tokens for 'a.b', got {:?}",
3598            tokens
3599        );
3600        assert_eq!(tokens[1].token_type, TokenType::Dot);
3601    }
3602
3603    #[test]
3604    fn test_unrecognized_character() {
3605        let tokenizer = Tokenizer::default();
3606
3607        // Unicode curly quotes are now handled as string delimiters
3608        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
3609        assert!(
3610            result.is_ok(),
3611            "Curly quotes should be tokenized as strings"
3612        );
3613
3614        // Unicode bullet character should still error
3615        let result = tokenizer.tokenize("SELECT • FROM t");
3616        assert!(result.is_err());
3617    }
3618
3619    #[test]
3620    fn test_colon_eq_tokenization() {
3621        let tokenizer = Tokenizer::default();
3622
3623        // := should be a single ColonEq token
3624        let tokens = tokenizer.tokenize("a := 1").unwrap();
3625        assert_eq!(tokens.len(), 3);
3626        assert_eq!(tokens[0].token_type, TokenType::Var);
3627        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
3628        assert_eq!(tokens[2].token_type, TokenType::Number);
3629
3630        // : followed by non-= should still be Colon
3631        let tokens = tokenizer.tokenize("a:b").unwrap();
3632        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
3633        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
3634
3635        // :: should still be DColon
3636        let tokens = tokenizer.tokenize("a::INT").unwrap();
3637        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
3638    }
3639
3640    #[test]
3641    fn test_colon_eq_parsing() {
3642        use crate::generator::Generator;
3643        use crate::parser::Parser;
3644
3645        // MySQL @var := value in SELECT
3646        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
3647            .expect("Failed to parse MySQL @var := expr");
3648        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3649        assert_eq!(output, "SELECT @var1 := 1, @var2");
3650
3651        // MySQL @var := @var in SELECT
3652        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
3653            .expect("Failed to parse MySQL @var2 := @var1");
3654        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3655        assert_eq!(output, "SELECT @var1, @var2 := @var1");
3656
3657        // MySQL @var := COUNT(*)
3658        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
3659            .expect("Failed to parse MySQL @var := COUNT(*)");
3660        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3661        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");
3662
3663        // MySQL SET @var := 1 (should normalize to = in output)
3664        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
3665        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3666        assert_eq!(output, "SET @var1 = 1");
3667
3668        // Function named args with :=
3669        let ast =
3670            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
3671        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3672        assert_eq!(output, "UNION_VALUE(k1 := 1)");
3673
3674        // UNNEST with recursive := TRUE
3675        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
3676            .expect("Failed to parse UNNEST with :=");
3677        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3678        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");
3679
3680        // DuckDB prefix alias: foo: 1 means 1 AS foo
3681        let ast =
3682            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
3683        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3684        assert_eq!(output, "SELECT 1 AS foo");
3685
3686        // DuckDB prefix alias with multiple columns
3687        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
3688            .expect("Failed to parse DuckDB multiple prefix aliases");
3689        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3690        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
3691    }
3692
3693    #[test]
3694    fn test_colon_eq_dialect_roundtrip() {
3695        use crate::dialects::{Dialect, DialectType};
3696
3697        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
3698            let d = Dialect::get(dialect);
3699            let ast = d
3700                .parse(sql)
3701                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3702            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3703            let transformed = d
3704                .transform(ast[0].clone())
3705                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3706            let output = d
3707                .generate(&transformed)
3708                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3709            let expected = expected.unwrap_or(sql);
3710            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3711        }
3712
3713        // MySQL := tests
3714        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
3715        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
3716        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
3717        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));
3718
3719        // DuckDB := tests
3720        check(
3721            DialectType::DuckDB,
3722            "SELECT UNNEST(col, recursive := TRUE) FROM t",
3723            None,
3724        );
3725        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);
3726
3727        // STRUCT_PACK(a := 'b')::json should at least parse without error
3728        // (The STRUCT_PACK -> Struct transformation is a separate feature)
3729        {
3730            let d = Dialect::get(DialectType::DuckDB);
3731            let ast = d
3732                .parse("STRUCT_PACK(a := 'b')::json")
3733                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
3734            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
3735        }
3736
3737        // DuckDB prefix alias tests
3738        check(
3739            DialectType::DuckDB,
3740            "SELECT foo: 1",
3741            Some("SELECT 1 AS foo"),
3742        );
3743        check(
3744            DialectType::DuckDB,
3745            "SELECT foo: 1, bar: 2, baz: 3",
3746            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
3747        );
3748    }
3749
3750    #[test]
3751    fn test_comment_roundtrip() {
3752        use crate::generator::Generator;
3753        use crate::parser::Parser;
3754
3755        fn check_roundtrip(sql: &str) -> Option<String> {
3756            let ast = match Parser::parse_sql(sql) {
3757                Ok(a) => a,
3758                Err(e) => return Some(format!("Parse error: {:?}", e)),
3759            };
3760            if ast.is_empty() {
3761                return Some("Empty AST".to_string());
3762            }
3763            let mut generator = Generator::default();
3764            let output = match generator.generate(&ast[0]) {
3765                Ok(o) => o,
3766                Err(e) => return Some(format!("Gen error: {:?}", e)),
3767            };
3768            if output == sql {
3769                None
3770            } else {
3771                Some(format!(
3772                    "Mismatch:\n  input:  {}\n  output: {}",
3773                    sql, output
3774                ))
3775            }
3776        }
3777
3778        let tests = vec![
3779            // Nested comments are sanitized: inner /* and */ are escaped
3780            // These no longer round-trip exactly (by design, matches Python sqlglot)
3781            // "SELECT c /* c1 /* c2 */ c3 */",        // becomes /* c1 / * c2 * / c3 */
3782            // "SELECT c /* c1 /* c2 /* c3 */ */ */",   // becomes /* c1 / * c2 / * c3 * / * / */
3783            // Simple alias with comments
3784            "SELECT c /* c1 */ AS alias /* c2 */",
3785            // Multiple columns with comments
3786            "SELECT a /* x */, b /* x */",
3787            // Multiple comments after column
3788            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
3789            // FROM tables with comments
3790            "SELECT * FROM foo /* x */, bla /* x */",
3791            // Arithmetic with comments
3792            "SELECT 1 /* comment */ + 1",
3793            "SELECT 1 /* c1 */ + 2 /* c2 */",
3794            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
3795            // CAST with comments
3796            "SELECT CAST(x AS INT) /* comment */ FROM foo",
3797            // Function arguments with comments
3798            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
3799            // Multi-part table names with comments
3800            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
3801            // INSERT with comments
3802            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
3803            // Leading comments on statements
3804            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
3805            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
3806            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
3807            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
3808            "/* comment */ CREATE TABLE foo AS SELECT 1",
3809            // Trailing comments on statements
3810            "INSERT INTO foo SELECT * FROM bar /* comment */",
3811            // Complex nested expressions with comments
3812            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
3813        ];
3814
3815        let mut failures = Vec::new();
3816        for sql in tests {
3817            if let Some(e) = check_roundtrip(sql) {
3818                failures.push(e);
3819            }
3820        }
3821
3822        if !failures.is_empty() {
3823            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
3824        }
3825    }
3826
3827    #[test]
3828    fn test_dollar_quoted_string_parsing() {
3829        use crate::dialects::{Dialect, DialectType};
3830
3831        // Test dollar string token parsing utility function
3832        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
3833        assert_eq!(tag, Some("FOO".to_string()));
3834        assert_eq!(content, "content here");
3835
3836        let (tag, content) = super::parse_dollar_string_token("just content");
3837        assert_eq!(tag, None);
3838        assert_eq!(content, "just content");
3839
3840        // Test roundtrip for Databricks dialect with dollar-quoted function body
3841        fn check_databricks(sql: &str, expected: Option<&str>) {
3842            let d = Dialect::get(DialectType::Databricks);
3843            let ast = d
3844                .parse(sql)
3845                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3846            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3847            let transformed = d
3848                .transform(ast[0].clone())
3849                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3850            let output = d
3851                .generate(&transformed)
3852                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3853            let expected = expected.unwrap_or(sql);
3854            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3855        }
3856
3857        // Test [42]: $$...$$ heredoc
3858        check_databricks(
3859            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n  return x+1$$",
3860            None
3861        );
3862
3863        // Test [43]: $FOO$...$FOO$ tagged heredoc
3864        check_databricks(
3865            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n  return x+1$FOO$",
3866            None
3867        );
3868    }
3869
3870    #[test]
3871    fn test_numeric_underscore_stripping() {
3872        // Underscore stripping only happens when numbers_can_be_underscore_separated is true
3873        let mut config = TokenizerConfig::default();
3874        config.numbers_can_be_underscore_separated = true;
3875        let tokenizer = Tokenizer::new(config);
3876
3877        // Simple integer with underscores
3878        let tokens = tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
3879        assert_eq!(tokens[1].token_type, TokenType::Number);
3880        assert_eq!(tokens[1].text, "12345");
3881
3882        // Thousands separator
3883        let tokens = tokenizer.tokenize("SELECT 20_000").unwrap();
3884        assert_eq!(tokens[1].token_type, TokenType::Number);
3885        assert_eq!(tokens[1].text, "20000");
3886
3887        // Scientific notation with underscores
3888        let tokens = tokenizer.tokenize("SELECT 1_2E+1_0").unwrap();
3889        assert_eq!(tokens[1].token_type, TokenType::Number);
3890        assert_eq!(tokens[1].text, "12E+10");
3891
3892        // Default tokenizer should NOT strip underscores
3893        let default_tokenizer = Tokenizer::default();
3894        let tokens = default_tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
3895        assert_eq!(tokens[1].token_type, TokenType::Number);
3896        assert_eq!(tokens[1].text, "1_2_3_4_5");
3897    }
3898}
polyglot_sql/tokens.rs

polyglot_sql/
tokens.rs