polyglot_sql/
tokens.rs

1//! Token types and tokenization for SQL parsing
2//!
3//! This module defines all SQL token types and the tokenizer that converts
4//! SQL strings into token streams.
5
6use crate::error::{Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::fmt;
10use std::sync::LazyLock;
11#[cfg(feature = "bindings")]
12use ts_rs::TS;
13
/// Split the text of a `DollarString` token into its optional tag and its content.
///
/// The first `'\x00'` in `text` acts as the separator: everything before it is
/// returned as the tag and everything after it as the content. When `text`
/// contains no `'\x00'`, the tag is `None` and the whole text is the content.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_owned()), content.to_owned()),
        None => (None, text.to_owned()),
    }
}
26
/// Represents a position in the source SQL
///
/// All four fields are plain `usize` values, so the type is `Copy`, and the
/// derived `Default` yields a span whose fields are all zero.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
// The TypeScript binding derive is only compiled in with the `bindings` feature.
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct Span {
    /// Starting byte offset
    pub start: usize,
    /// Ending byte offset (exclusive)
    pub end: usize,
    /// Line number (1-based)
    pub line: usize,
    /// Column number (1-based)
    pub column: usize,
}
40
41impl Span {
42    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
43        Self {
44            start,
45            end,
46            line,
47            column,
48        }
49    }
50}
51
/// A token in the SQL token stream
///
/// Bundles the token's type, its text, its source position, and any comments
/// attached to it.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The type of token
    pub token_type: TokenType,
    /// The raw text of the token
    pub text: String,
    /// Position information
    pub span: Span,
    /// Leading comments (comments that appeared before this token)
    // `serde(default)`: a missing field deserializes to an empty vector.
    #[serde(default)]
    pub comments: Vec<String>,
    /// Trailing comments (comments that appeared after this token, before the next one)
    // `serde(default)`: a missing field deserializes to an empty vector.
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
68
69impl Token {
70    /// Create a new token
71    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
72        Self {
73            token_type,
74            text: text.into(),
75            span,
76            comments: Vec::new(),
77            trailing_comments: Vec::new(),
78        }
79    }
80
81    /// Create a NUMBER token
82    pub fn number(n: i64) -> Self {
83        Self::new(TokenType::Number, n.to_string(), Span::default())
84    }
85
86    /// Create a STRING token
87    pub fn string(s: impl Into<String>) -> Self {
88        Self::new(TokenType::String, s, Span::default())
89    }
90
91    /// Create an IDENTIFIER token
92    pub fn identifier(s: impl Into<String>) -> Self {
93        Self::new(TokenType::Identifier, s, Span::default())
94    }
95
96    /// Create a VAR token
97    pub fn var(s: impl Into<String>) -> Self {
98        Self::new(TokenType::Var, s, Span::default())
99    }
100
101    /// Add a comment to this token
102    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
103        self.comments.push(comment.into());
104        self
105    }
106}
107
108impl fmt::Display for Token {
109    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110        write!(f, "{:?}({})", self.token_type, self.text)
111    }
112}
113
/// All possible token types in SQL
///
/// This is a fieldless, `Copy` enum. With
/// `rename_all = "SCREAMING_SNAKE_CASE"`, serde reads and writes each variant
/// under its upper-snake-case name (for example `GroupBy` as `GROUP_BY`).
///
/// No explicit discriminants are given, so under `repr(u16)` the numeric value
/// of a variant is its position in this declaration. Inserting or reordering
/// variants therefore changes those values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt, // <<
    GtGt, // >>
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    AtQMark,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    // NOTE(review): this group has no heading in the original; the names
    // suggest structural/whitespace markers — confirm against the tokenizer.
    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments (emitted as tokens for round-trip fidelity)
    BlockComment, // /* ... */
    LineComment,  // -- ...

    // Literals
    String,
    DollarString,             // $$...$$
    TripleDoubleQuotedString, // """..."""
    TripleSingleQuotedString, // '''...'''
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    /// Hex number: 0xA, 0xFF (BigQuery, SQLite style) - represents an integer in hex notation
    HexNumber,
    ByteString,
    NationalString,
    EscapeString, // PostgreSQL E'...' escape string
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data Types
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Prepare,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,   // !~~ operator (PostgreSQL)
    NotILike,  // !~~* operator (PostgreSQL)
    NotRLike,  // !~ operator (PostgreSQL)
    NotIRLike, // !~* operator (PostgreSQL)
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window function keywords
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // DDL-specific keywords (Phase 4)
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    // MATCH_RECOGNIZE tokens
    Measures,
    Pattern,
    Define,
    Running,
    // NOTE(review): the variants from `Owned` to `Restart` do not look
    // MATCH_RECOGNIZE-specific (the names suggest trigger/sequence DDL) —
    // confirm the intended grouping. They cannot be moved without changing
    // discriminant values.
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // Special
    Eof,
}
672
673impl TokenType {
674    /// Check if this token type is a keyword that can be used as an identifier in certain contexts
675    pub fn is_keyword(&self) -> bool {
676        matches!(
677            self,
678            TokenType::Select
679                | TokenType::From
680                | TokenType::Where
681                | TokenType::And
682                | TokenType::Or
683                | TokenType::Not
684                | TokenType::In
685                | TokenType::Is
686                | TokenType::Null
687                | TokenType::True
688                | TokenType::False
689                | TokenType::As
690                | TokenType::On
691                | TokenType::Join
692                | TokenType::Left
693                | TokenType::Right
694                | TokenType::Inner
695                | TokenType::Outer
696                | TokenType::Full
697                | TokenType::Cross
698                | TokenType::Semi
699                | TokenType::Anti
700                | TokenType::Union
701                | TokenType::Except
702                | TokenType::Intersect
703                | TokenType::GroupBy
704                | TokenType::OrderBy
705                | TokenType::Having
706                | TokenType::Limit
707                | TokenType::Offset
708                | TokenType::Case
709                | TokenType::When
710                | TokenType::Then
711                | TokenType::Else
712                | TokenType::End
713                | TokenType::Create
714                | TokenType::Drop
715                | TokenType::Alter
716                | TokenType::Insert
717                | TokenType::Update
718                | TokenType::Delete
719                | TokenType::Into
720                | TokenType::Values
721                | TokenType::Set
722                | TokenType::With
723                | TokenType::Distinct
724                | TokenType::All
725                | TokenType::Exists
726                | TokenType::Between
727                | TokenType::Like
728                | TokenType::ILike
729                // Additional keywords that can be used as identifiers
730                | TokenType::Filter
731                | TokenType::Date
732                | TokenType::Timestamp
733                | TokenType::TimestampTz
734                | TokenType::Interval
735                | TokenType::Time
736                | TokenType::Table
737                | TokenType::Index
738                | TokenType::Column
739                | TokenType::Database
740                | TokenType::Schema
741                | TokenType::View
742                | TokenType::Function
743                | TokenType::Procedure
744                | TokenType::Trigger
745                | TokenType::Sequence
746                | TokenType::Over
747                | TokenType::Partition
748                | TokenType::Window
749                | TokenType::Rows
750                | TokenType::Range
751                | TokenType::First
752                | TokenType::Last
753                | TokenType::Preceding
754                | TokenType::Following
755                | TokenType::Current
756                | TokenType::Row
757                | TokenType::Unbounded
758                | TokenType::Array
759                | TokenType::Struct
760                | TokenType::Map
761                | TokenType::PrimaryKey
762                | TokenType::Key
763                | TokenType::ForeignKey
764                | TokenType::References
765                | TokenType::Unique
766                | TokenType::Check
767                | TokenType::Default
768                | TokenType::Constraint
769                | TokenType::Comment
770                | TokenType::Rollup
771                | TokenType::Cube
772                | TokenType::Grant
773                | TokenType::Revoke
774                | TokenType::Type
775                | TokenType::Use
776                | TokenType::Cache
777                | TokenType::Uncache
778                | TokenType::Load
779                | TokenType::Any
780                | TokenType::Some
781                | TokenType::Asc
782                | TokenType::Desc
783                | TokenType::Nulls
784                | TokenType::Lateral
785                | TokenType::Natural
786                | TokenType::Escape
787                | TokenType::Glob
788                | TokenType::Match
789                | TokenType::Recursive
790                | TokenType::Replace
791                | TokenType::Returns
792                | TokenType::If
793                | TokenType::Pivot
794                | TokenType::Unpivot
795                | TokenType::Json
796                | TokenType::Blob
797                | TokenType::Text
798                | TokenType::Int
799                | TokenType::BigInt
800                | TokenType::SmallInt
801                | TokenType::TinyInt
802                | TokenType::Int128
803                | TokenType::UInt128
804                | TokenType::Int256
805                | TokenType::UInt256
806                | TokenType::UInt
807                | TokenType::UBigInt
808                | TokenType::Float
809                | TokenType::Double
810                | TokenType::Decimal
811                | TokenType::Boolean
812                | TokenType::VarChar
813                | TokenType::Char
814                | TokenType::Binary
815                | TokenType::VarBinary
816                | TokenType::No
817                | TokenType::DateTime
818                | TokenType::Truncate
819                | TokenType::Execute
820                | TokenType::Merge
821                | TokenType::Top
822                | TokenType::Begin
823                | TokenType::Generated
824                | TokenType::Identity
825                | TokenType::Always
826                | TokenType::Extract
827                // Keywords that can be identifiers in certain contexts
828                | TokenType::AsOf
829                | TokenType::Prior
830                | TokenType::After
831                | TokenType::Restrict
832                | TokenType::Cascade
833                | TokenType::Local
834                | TokenType::Rename
835                | TokenType::Enum
836                | TokenType::Within
837                | TokenType::Format
838                | TokenType::Final
839                | TokenType::FileFormat
840                | TokenType::Input
841                | TokenType::InputFormat
842                | TokenType::Copy
843                | TokenType::Put
844                | TokenType::Get
845                | TokenType::Show
846                | TokenType::Serde
847                | TokenType::Sample
848                | TokenType::Sort
849                | TokenType::Collate
850                | TokenType::Ties
851                | TokenType::IsNull
852                | TokenType::NotNull
853                | TokenType::Exclude
854                | TokenType::Temporary
855                | TokenType::Add
856                | TokenType::Ordinality
857                | TokenType::Overlaps
858                | TokenType::Block
859                | TokenType::Pattern
860                | TokenType::Group
861                | TokenType::Cluster
862                | TokenType::Repeatable
863                | TokenType::Groups
864                | TokenType::Commit
865                | TokenType::Warehouse
866                | TokenType::System
867                | TokenType::By
868                | TokenType::To
869                | TokenType::Fetch
870                | TokenType::For
871                | TokenType::Only
872                | TokenType::Next
873                | TokenType::Lock
874                | TokenType::Refresh
875                | TokenType::Settings
876                | TokenType::Operator
877                | TokenType::Overwrite
878                | TokenType::StraightJoin
879                | TokenType::Start
880                // Additional keywords registered in tokenizer but previously missing from is_keyword()
881                | TokenType::Ignore
882                | TokenType::Domain
883                | TokenType::Apply
884                | TokenType::Respect
885                | TokenType::Materialized
886                | TokenType::Prewhere
887                | TokenType::Old
888                | TokenType::New
889                | TokenType::Cast
890                | TokenType::TryCast
891                | TokenType::SafeCast
892                | TokenType::Transaction
893                | TokenType::Describe
894                | TokenType::Kill
895                | TokenType::Lambda
896                | TokenType::Declare
897                | TokenType::Keep
898                | TokenType::Output
899                | TokenType::Percent
900                | TokenType::Qualify
901                | TokenType::Returning
902                | TokenType::Language
903                | TokenType::Prepare
904                | TokenType::Preserve
905                | TokenType::Savepoint
906                | TokenType::Rollback
907                | TokenType::Body
908                | TokenType::Increment
909                | TokenType::Minvalue
910                | TokenType::Maxvalue
911                | TokenType::Cycle
912                | TokenType::NoCycle
913                | TokenType::Seed
914                | TokenType::Namespace
915                | TokenType::Authorization
916                | TokenType::Order
917                | TokenType::Restart
918                | TokenType::Before
919                | TokenType::Instead
920                | TokenType::Each
921                | TokenType::Statement
922                | TokenType::Referencing
923                | TokenType::Of
924                | TokenType::Separator
925                | TokenType::Others
926                | TokenType::Placing
927                | TokenType::Owned
928                | TokenType::Running
929                | TokenType::Define
930                | TokenType::Measures
931                | TokenType::MatchRecognize
932                | TokenType::AutoIncrement
933                | TokenType::Connect
934                | TokenType::Distribute
935                | TokenType::Bernoulli
936                | TokenType::TableSample
937                | TokenType::Inpath
938                | TokenType::Pragma
939                | TokenType::Siblings
940                | TokenType::SerdeProperties
941                | TokenType::RLike
942        )
943    }
944
945    /// Check if this token type is a comparison operator
946    pub fn is_comparison(&self) -> bool {
947        matches!(
948            self,
949            TokenType::Eq
950                | TokenType::Neq
951                | TokenType::Lt
952                | TokenType::Lte
953                | TokenType::Gt
954                | TokenType::Gte
955                | TokenType::NullsafeEq
956        )
957    }
958
959    /// Check if this token type is an arithmetic operator
960    pub fn is_arithmetic(&self) -> bool {
961        matches!(
962            self,
963            TokenType::Plus
964                | TokenType::Dash
965                | TokenType::Star
966                | TokenType::Slash
967                | TokenType::Percent
968                | TokenType::Mod
969                | TokenType::Div
970        )
971    }
972}
973
974impl fmt::Display for TokenType {
975    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
976        write!(f, "{:?}", self)
977    }
978}
979
980// ── Cached default maps for TokenizerConfig ─────────────────────────────────
981
982static DEFAULT_KEYWORDS: LazyLock<HashMap<String, TokenType>> = LazyLock::new(|| {
983    let mut keywords = HashMap::with_capacity(300);
984    // Add basic SQL keywords
985    keywords.insert("SELECT".to_string(), TokenType::Select);
986    keywords.insert("FROM".to_string(), TokenType::From);
987    keywords.insert("WHERE".to_string(), TokenType::Where);
988    keywords.insert("AND".to_string(), TokenType::And);
989    keywords.insert("OR".to_string(), TokenType::Or);
990    keywords.insert("NOT".to_string(), TokenType::Not);
991    keywords.insert("AS".to_string(), TokenType::As);
992    keywords.insert("ON".to_string(), TokenType::On);
993    keywords.insert("JOIN".to_string(), TokenType::Join);
994    keywords.insert("LEFT".to_string(), TokenType::Left);
995    keywords.insert("RIGHT".to_string(), TokenType::Right);
996    keywords.insert("INNER".to_string(), TokenType::Inner);
997    keywords.insert("OUTER".to_string(), TokenType::Outer);
998    keywords.insert("OUTPUT".to_string(), TokenType::Output);
999    keywords.insert("FULL".to_string(), TokenType::Full);
1000    keywords.insert("CROSS".to_string(), TokenType::Cross);
1001    keywords.insert("SEMI".to_string(), TokenType::Semi);
1002    keywords.insert("ANTI".to_string(), TokenType::Anti);
1003    keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1004    keywords.insert("UNION".to_string(), TokenType::Union);
1005    keywords.insert("EXCEPT".to_string(), TokenType::Except);
1006    keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle/Redshift alias for EXCEPT
1007    keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1008    keywords.insert("GROUP".to_string(), TokenType::Group);
1009    keywords.insert("CUBE".to_string(), TokenType::Cube);
1010    keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1011    keywords.insert("WITHIN".to_string(), TokenType::Within);
1012    keywords.insert("ORDER".to_string(), TokenType::Order);
1013    keywords.insert("BY".to_string(), TokenType::By);
1014    keywords.insert("HAVING".to_string(), TokenType::Having);
1015    keywords.insert("LIMIT".to_string(), TokenType::Limit);
1016    keywords.insert("OFFSET".to_string(), TokenType::Offset);
1017    keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1018    keywords.insert("FETCH".to_string(), TokenType::Fetch);
1019    keywords.insert("FIRST".to_string(), TokenType::First);
1020    keywords.insert("NEXT".to_string(), TokenType::Next);
1021    keywords.insert("ONLY".to_string(), TokenType::Only);
1022    keywords.insert("KEEP".to_string(), TokenType::Keep);
1023    keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1024    keywords.insert("INPUT".to_string(), TokenType::Input);
1025    keywords.insert("CASE".to_string(), TokenType::Case);
1026    keywords.insert("WHEN".to_string(), TokenType::When);
1027    keywords.insert("THEN".to_string(), TokenType::Then);
1028    keywords.insert("ELSE".to_string(), TokenType::Else);
1029    keywords.insert("END".to_string(), TokenType::End);
1030    keywords.insert("ENDIF".to_string(), TokenType::End); // Exasol alias for END
1031    keywords.insert("NULL".to_string(), TokenType::Null);
1032    keywords.insert("TRUE".to_string(), TokenType::True);
1033    keywords.insert("FALSE".to_string(), TokenType::False);
1034    keywords.insert("IS".to_string(), TokenType::Is);
1035    keywords.insert("IN".to_string(), TokenType::In);
1036    keywords.insert("BETWEEN".to_string(), TokenType::Between);
1037    keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1038    keywords.insert("LIKE".to_string(), TokenType::Like);
1039    keywords.insert("ILIKE".to_string(), TokenType::ILike);
1040    keywords.insert("RLIKE".to_string(), TokenType::RLike);
1041    keywords.insert("REGEXP".to_string(), TokenType::RLike);
1042    keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1043    keywords.insert("EXISTS".to_string(), TokenType::Exists);
1044    keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1045    keywords.insert("ALL".to_string(), TokenType::All);
1046    keywords.insert("WITH".to_string(), TokenType::With);
1047    keywords.insert("CREATE".to_string(), TokenType::Create);
1048    keywords.insert("DROP".to_string(), TokenType::Drop);
1049    keywords.insert("ALTER".to_string(), TokenType::Alter);
1050    keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1051    keywords.insert("TABLE".to_string(), TokenType::Table);
1052    keywords.insert("VIEW".to_string(), TokenType::View);
1053    keywords.insert("INDEX".to_string(), TokenType::Index);
1054    keywords.insert("COLUMN".to_string(), TokenType::Column);
1055    keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1056    keywords.insert("ADD".to_string(), TokenType::Add);
1057    keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1058    keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1059    keywords.insert("RENAME".to_string(), TokenType::Rename);
1060    keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1061    keywords.insert("TEMP".to_string(), TokenType::Temporary);
1062    keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1063    keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1064    keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1065    keywords.insert("KEY".to_string(), TokenType::Key);
1066    keywords.insert("KILL".to_string(), TokenType::Kill);
1067    keywords.insert("REFERENCES".to_string(), TokenType::References);
1068    keywords.insert("DEFAULT".to_string(), TokenType::Default);
1069    keywords.insert("DECLARE".to_string(), TokenType::Declare);
1070    keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1071    keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); // Snowflake style
1072    keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1073    keywords.insert("REPLACE".to_string(), TokenType::Replace);
1074    keywords.insert("TO".to_string(), TokenType::To);
1075    keywords.insert("INSERT".to_string(), TokenType::Insert);
1076    keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1077    keywords.insert("UPDATE".to_string(), TokenType::Update);
1078    keywords.insert("USE".to_string(), TokenType::Use);
1079    keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1080    keywords.insert("GLOB".to_string(), TokenType::Glob);
1081    keywords.insert("DELETE".to_string(), TokenType::Delete);
1082    keywords.insert("MERGE".to_string(), TokenType::Merge);
1083    keywords.insert("CACHE".to_string(), TokenType::Cache);
1084    keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1085    keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1086    keywords.insert("GRANT".to_string(), TokenType::Grant);
1087    keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1088    keywords.insert("COMMENT".to_string(), TokenType::Comment);
1089    keywords.insert("COLLATE".to_string(), TokenType::Collate);
1090    keywords.insert("INTO".to_string(), TokenType::Into);
1091    keywords.insert("VALUES".to_string(), TokenType::Values);
1092    keywords.insert("SET".to_string(), TokenType::Set);
1093    keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1094    keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1095    keywords.insert("ASC".to_string(), TokenType::Asc);
1096    keywords.insert("DESC".to_string(), TokenType::Desc);
1097    keywords.insert("NULLS".to_string(), TokenType::Nulls);
1098    keywords.insert("RESPECT".to_string(), TokenType::Respect);
1099    keywords.insert("FIRST".to_string(), TokenType::First);
1100    keywords.insert("LAST".to_string(), TokenType::Last);
1101    keywords.insert("IF".to_string(), TokenType::If);
1102    keywords.insert("CAST".to_string(), TokenType::Cast);
1103    keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1104    keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1105    keywords.insert("OVER".to_string(), TokenType::Over);
1106    keywords.insert("PARTITION".to_string(), TokenType::Partition);
1107    keywords.insert("PLACING".to_string(), TokenType::Placing);
1108    keywords.insert("WINDOW".to_string(), TokenType::Window);
1109    keywords.insert("ROWS".to_string(), TokenType::Rows);
1110    keywords.insert("RANGE".to_string(), TokenType::Range);
1111    keywords.insert("FILTER".to_string(), TokenType::Filter);
1112    keywords.insert("NATURAL".to_string(), TokenType::Natural);
1113    keywords.insert("USING".to_string(), TokenType::Using);
1114    keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1115    keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1116    keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1117    keywords.insert("CURRENT".to_string(), TokenType::Current);
1118    keywords.insert("ROW".to_string(), TokenType::Row);
1119    keywords.insert("GROUPS".to_string(), TokenType::Groups);
1120    keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1121    // TRIM function position keywords
1122    keywords.insert("BOTH".to_string(), TokenType::Both);
1123    keywords.insert("LEADING".to_string(), TokenType::Leading);
1124    keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1125    keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1126    // Phase 3: Additional keywords
1127    keywords.insert("TOP".to_string(), TokenType::Top);
1128    keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1129    keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1130    keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1131    keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1132    keywords.insert("SYSTEM".to_string(), TokenType::System);
1133    keywords.insert("BLOCK".to_string(), TokenType::Block);
1134    keywords.insert("SEED".to_string(), TokenType::Seed);
1135    keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1136    keywords.insert("TIES".to_string(), TokenType::Ties);
1137    keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1138    keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1139    keywords.insert("APPLY".to_string(), TokenType::Apply);
1140    // Oracle CONNECT BY keywords
1141    keywords.insert("CONNECT".to_string(), TokenType::Connect);
1142    // Hive/Spark specific keywords
1143    keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1144    keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1145    keywords.insert("SORT".to_string(), TokenType::Sort);
1146    keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1147    keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1148    keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1149    keywords.insert("FOR".to_string(), TokenType::For);
1150    keywords.insert("ANY".to_string(), TokenType::Any);
1151    keywords.insert("SOME".to_string(), TokenType::Some);
1152    keywords.insert("ASOF".to_string(), TokenType::AsOf);
1153    keywords.insert("PERCENT".to_string(), TokenType::Percent);
1154    keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1155    keywords.insert("NO".to_string(), TokenType::No);
1156    keywords.insert("OTHERS".to_string(), TokenType::Others);
1157    // PostgreSQL OPERATOR() syntax for schema-qualified operators
1158    keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1159    // Phase 4: DDL keywords
1160    keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1161    keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1162    keywords.insert("DATABASE".to_string(), TokenType::Database);
1163    keywords.insert("FUNCTION".to_string(), TokenType::Function);
1164    keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1165    keywords.insert("PROC".to_string(), TokenType::Procedure);
1166    keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1167    keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1168    keywords.insert("TYPE".to_string(), TokenType::Type);
1169    keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1170    keywords.insert("RETURNS".to_string(), TokenType::Returns);
1171    keywords.insert("RETURNING".to_string(), TokenType::Returning);
1172    keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1173    keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1174    keywords.insert("COMMIT".to_string(), TokenType::Commit);
1175    keywords.insert("BEGIN".to_string(), TokenType::Begin);
1176    keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1177    keywords.insert("PREPARE".to_string(), TokenType::Prepare);
1178    keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1179    keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1180    keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1181    keywords.insert("BODY".to_string(), TokenType::Body);
1182    keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1183    keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1184    keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1185    keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1186    keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1187    keywords.insert("PRIOR".to_string(), TokenType::Prior);
1188    // MATCH_RECOGNIZE keywords
1189    keywords.insert("MATCH".to_string(), TokenType::Match);
1190    keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1191    keywords.insert("MEASURES".to_string(), TokenType::Measures);
1192    keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1193    keywords.insert("DEFINE".to_string(), TokenType::Define);
1194    keywords.insert("RUNNING".to_string(), TokenType::Running);
1195    keywords.insert("FINAL".to_string(), TokenType::Final);
1196    keywords.insert("OWNED".to_string(), TokenType::Owned);
1197    keywords.insert("AFTER".to_string(), TokenType::After);
1198    keywords.insert("BEFORE".to_string(), TokenType::Before);
1199    keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1200    keywords.insert("EACH".to_string(), TokenType::Each);
1201    keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1202    keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1203    keywords.insert("OLD".to_string(), TokenType::Old);
1204    keywords.insert("NEW".to_string(), TokenType::New);
1205    keywords.insert("OF".to_string(), TokenType::Of);
1206    keywords.insert("CHECK".to_string(), TokenType::Check);
1207    keywords.insert("START".to_string(), TokenType::Start);
1208    keywords.insert("ENUM".to_string(), TokenType::Enum);
1209    keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1210    keywords.insert("RESTART".to_string(), TokenType::Restart);
1211    // Date/time literal keywords
1212    keywords.insert("DATE".to_string(), TokenType::Date);
1213    keywords.insert("TIME".to_string(), TokenType::Time);
1214    keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1215    keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1216    keywords.insert("GENERATED".to_string(), TokenType::Generated);
1217    keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1218    keywords.insert("ALWAYS".to_string(), TokenType::Always);
1219    // LOAD DATA keywords
1220    keywords.insert("LOAD".to_string(), TokenType::Load);
1221    keywords.insert("LOCAL".to_string(), TokenType::Local);
1222    keywords.insert("INPATH".to_string(), TokenType::Inpath);
1223    keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1224    keywords.insert("SERDE".to_string(), TokenType::Serde);
1225    keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1226    keywords.insert("FORMAT".to_string(), TokenType::Format);
1227    // SQLite
1228    keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1229    // SHOW statement
1230    keywords.insert("SHOW".to_string(), TokenType::Show);
1231    // Oracle ORDER SIBLINGS BY (hierarchical queries)
1232    keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1233    // COPY and PUT statements (Snowflake, PostgreSQL)
1234    keywords.insert("COPY".to_string(), TokenType::Copy);
1235    keywords.insert("PUT".to_string(), TokenType::Put);
1236    keywords.insert("GET".to_string(), TokenType::Get);
1237    // EXEC/EXECUTE statement (TSQL, etc.)
1238    keywords.insert("EXEC".to_string(), TokenType::Execute);
1239    keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1240    // Postfix null check operators (PostgreSQL/SQLite)
1241    keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1242    keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1243    keywords
1244});
1245
1246static DEFAULT_SINGLE_TOKENS: LazyLock<HashMap<char, TokenType>> = LazyLock::new(|| {
1247    let mut single_tokens = HashMap::with_capacity(30);
1248    single_tokens.insert('(', TokenType::LParen);
1249    single_tokens.insert(')', TokenType::RParen);
1250    single_tokens.insert('[', TokenType::LBracket);
1251    single_tokens.insert(']', TokenType::RBracket);
1252    single_tokens.insert('{', TokenType::LBrace);
1253    single_tokens.insert('}', TokenType::RBrace);
1254    single_tokens.insert(',', TokenType::Comma);
1255    single_tokens.insert('.', TokenType::Dot);
1256    single_tokens.insert(';', TokenType::Semicolon);
1257    single_tokens.insert('+', TokenType::Plus);
1258    single_tokens.insert('-', TokenType::Dash);
1259    single_tokens.insert('*', TokenType::Star);
1260    single_tokens.insert('/', TokenType::Slash);
1261    single_tokens.insert('%', TokenType::Percent);
1262    single_tokens.insert('&', TokenType::Amp);
1263    single_tokens.insert('|', TokenType::Pipe);
1264    single_tokens.insert('^', TokenType::Caret);
1265    single_tokens.insert('~', TokenType::Tilde);
1266    single_tokens.insert('<', TokenType::Lt);
1267    single_tokens.insert('>', TokenType::Gt);
1268    single_tokens.insert('=', TokenType::Eq);
1269    single_tokens.insert('!', TokenType::Exclamation);
1270    single_tokens.insert(':', TokenType::Colon);
1271    single_tokens.insert('@', TokenType::DAt);
1272    single_tokens.insert('#', TokenType::Hash);
1273    single_tokens.insert('$', TokenType::Dollar);
1274    single_tokens.insert('?', TokenType::Parameter);
1275    single_tokens
1276});
1277
/// Default string-literal delimiters, keyed by opening sequence with the
/// closing sequence as value.
static DEFAULT_QUOTES: LazyLock<HashMap<String, String>> = LazyLock::new(|| {
    let mut delimiters: HashMap<String, String> = HashMap::with_capacity(4);
    // Both defaults close with the same sequence that opens them:
    // the single quote, and the triple double-quote (e.g., """x""").
    for delimiter in ["'", "\"\"\""] {
        delimiters.insert(delimiter.to_owned(), delimiter.to_owned());
    }
    delimiters
});
1285
/// Default quoted-identifier delimiters, keyed by opening character with the
/// closing character as value.
static DEFAULT_IDENTIFIERS: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
    let mut pairs: HashMap<char, char> = HashMap::with_capacity(4);
    // Double quote and backtick each close with themselves.
    // Note: TSQL bracket-quoted identifiers [name] are handled in the parser
    // because [ is also used for arrays and subscripts.
    pairs.extend(['"', '`'].map(|quote| (quote, quote)));
    pairs
});
1294
/// Default comment markers, keyed by opening sequence. The value is the
/// closing sequence, or `None` when the entry has no closing sequence.
static DEFAULT_COMMENTS: LazyLock<HashMap<String, Option<String>>> = LazyLock::new(|| {
    let mut markers: HashMap<String, Option<String>> = HashMap::with_capacity(4);
    markers.extend([
        (String::from("--"), None),
        (String::from("/*"), Some(String::from("*/"))),
    ]);
    markers
});
1301
/// Tokenizer configuration for a dialect.
///
/// All fields are public, so a dialect can start from `TokenizerConfig::default()`
/// and override only the settings that differ. The defaults mentioned below are the
/// values set by the `Default` implementation in this file.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keywords mapping (uppercase keyword -> token type)
    pub keywords: HashMap<String, TokenType>,
    /// Single character tokens (character -> token type)
    pub single_tokens: HashMap<char, TokenType>,
    /// Quote characters (start -> end).
    /// Default: `'` and the triple double-quote `"""`, each closing with itself.
    pub quotes: HashMap<String, String>,
    /// Identifier quote characters (start -> end).
    /// Default: `"` and the backtick, each closing with itself.
    pub identifiers: HashMap<char, char>,
    /// Comment definitions (start -> optional end).
    /// Default: `--` (no end marker) and `/*` ending with `*/`.
    /// The whitespace skipper also checks this map for a `//` entry to enable
    /// dialect-specific `//` line comments.
    pub comments: HashMap<String, Option<String>>,
    /// String escape characters.
    /// Default: only `'`, i.e. a doubled quote escapes a quote.
    pub string_escapes: Vec<char>,
    /// Whether to support nested comments.
    /// When true, a `/*` inside a block comment opens another nesting level that
    /// needs its own `*/`. Default is true.
    pub nested_comments: bool,
    /// Valid escape follow characters (for MySQL-style escaping).
    /// When a backslash is followed by a character NOT in this list,
    /// the backslash is discarded. When empty, all backslash escapes
    /// preserve the backslash for unrecognized sequences.
    /// Default is empty.
    pub escape_follow_chars: Vec<char>,
    /// Whether b'...' is a byte string (true for BigQuery) or bit string (false for standard SQL).
    /// Default is false (bit string).
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}
    /// Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT).
    /// Default is an empty map.
    pub numeric_literals: HashMap<String, String>,
    /// Whether unquoted identifiers can start with a digit (e.g., `1a`, `1_a`).
    /// When true, a number followed by letters/underscore is treated as an identifier.
    /// Used by Hive, Spark, MySQL, ClickHouse.
    /// Default is false.
    pub identifiers_can_start_with_digit: bool,
    /// Whether 0x/0X prefix should be treated as hex literals.
    /// When true, `0XCC` is tokenized instead of Number("0") + Identifier("XCC").
    /// Used by BigQuery, SQLite, Teradata.
    /// Default is false.
    pub hex_number_strings: bool,
    /// Whether hex string literals from 0x prefix represent integer values.
    /// When true (BigQuery), 0xA is tokenized as HexNumber (integer in hex notation).
    /// When false (SQLite, Teradata), 0xCC is tokenized as HexString (binary/blob value).
    /// Default is false.
    pub hex_string_is_integer_type: bool,
    /// Whether string escape sequences (like \') are allowed in raw strings.
    /// When true (BigQuery default), \' inside r'...' escapes the quote.
    /// When false (Spark/Databricks), backslashes in raw strings are always literal.
    /// Python sqlglot: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True)
    /// Default is true.
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether # starts a single-line comment (ClickHouse, MySQL).
    /// The whitespace skipper also treats `//` as a single-line comment when this is set.
    /// Default is false.
    pub hash_comments: bool,
    /// Whether $ can start/continue an identifier (ClickHouse).
    /// When true, a bare `$` that is not part of a dollar-quoted string or positional
    /// parameter is treated as an identifier character.
    /// Default is false.
    pub dollar_sign_is_identifier: bool,
    /// Whether INSERT ... FORMAT <name> should treat subsequent data as raw (ClickHouse).
    /// When true, after tokenizing `INSERT ... FORMAT <non-VALUES-name>`, all text until
    /// the next blank line or end of input is consumed as a raw data token.
    /// Default is false.
    pub insert_format_raw_data: bool,
    /// Whether numeric literals can contain underscores as digit separators.
    /// When true, `1_000` is tokenized as `1000`. Used by ClickHouse and DuckDB.
    /// Python sqlglot: NUMBERS_CAN_BE_UNDERSCORE_SEPARATED (default False)
    /// Default is false.
    pub numbers_can_be_underscore_separated: bool,
    /// Recover strings like `'a\' or 1=1` by treating the escaped quote as the
    /// closing quote when no later quote exists. This matches SQLGlot's permissive
    /// handling for a few malformed ClickHouse SHOW LIKE fixtures.
    /// Default is false.
    pub recover_terminal_backslash_quote: bool,
    /// Recover a terminal single-quoted string without a closing quote by treating
    /// end-of-input as the close. This is only enabled for ClickHouse fixture
    /// coverage, where some extracted corpus rows contain partial string probes.
    /// Default is false.
    pub recover_unterminated_string: bool,
}
1370
1371impl Default for TokenizerConfig {
1372    fn default() -> Self {
1373        Self {
1374            keywords: DEFAULT_KEYWORDS.clone(),
1375            single_tokens: DEFAULT_SINGLE_TOKENS.clone(),
1376            quotes: DEFAULT_QUOTES.clone(),
1377            identifiers: DEFAULT_IDENTIFIERS.clone(),
1378            comments: DEFAULT_COMMENTS.clone(),
1379            // Standard SQL: only '' (doubled quote) escapes a quote
1380            // Backslash escapes are dialect-specific (MySQL, etc.)
1381            string_escapes: vec!['\''],
1382            nested_comments: true,
1383            // By default, no escape_follow_chars means preserve backslash for unrecognized escapes
1384            escape_follow_chars: vec![],
1385            // Default: b'...' is bit string (standard SQL), not byte string (BigQuery)
1386            b_prefix_is_byte_string: false,
1387            numeric_literals: HashMap::new(),
1388            identifiers_can_start_with_digit: false,
1389            hex_number_strings: false,
1390            hex_string_is_integer_type: false,
1391            // Default: backslash escapes ARE allowed in raw strings (sqlglot default)
1392            // Spark/Databricks set this to false
1393            string_escapes_allowed_in_raw_strings: true,
1394            hash_comments: false,
1395            dollar_sign_is_identifier: false,
1396            insert_format_raw_data: false,
1397            numbers_can_be_underscore_separated: false,
1398            recover_terminal_backslash_quote: false,
1399            recover_unterminated_string: false,
1400        }
1401    }
1402}
1403
/// SQL Tokenizer.
///
/// Holds a [`TokenizerConfig`] and turns SQL text into a `Vec<Token>` via
/// `tokenize`. Tokenizing takes `&self`, and each call builds its own scanning state.
pub struct Tokenizer {
    /// Dialect settings applied to every `tokenize` call.
    config: TokenizerConfig,
}
1408
1409impl Tokenizer {
1410    /// Create a new tokenizer with the given configuration
1411    pub fn new(config: TokenizerConfig) -> Self {
1412        Self { config }
1413    }
1414
1415    /// Create a tokenizer with default configuration
1416    pub fn default_config() -> Self {
1417        Self::new(TokenizerConfig::default())
1418    }
1419
1420    /// Tokenize a SQL string
1421    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1422        let mut state = TokenizerState::new(sql, &self.config);
1423        state.tokenize()
1424    }
1425}
1426
1427impl Default for Tokenizer {
1428    fn default() -> Self {
1429        Self::default_config()
1430    }
1431}
1432
/// Internal state for tokenization.
///
/// One instance is created per `Tokenizer::tokenize` call. Positions (`start`,
/// `current`, `size`) are indices into `chars`, i.e. they count characters.
struct TokenizerState<'a> {
    /// The SQL text being tokenized.
    source: &'a str,
    /// True when `source` is pure ASCII; token text can then be sliced straight
    /// from `source` instead of being rebuilt from `chars`.
    source_is_ascii: bool,
    /// `source` decoded into characters for random access.
    chars: Vec<char>,
    /// Number of entries in `chars`, captured at construction.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Index in `chars` where the token currently being scanned begins.
    start: usize,
    /// Index in `chars` of the next character to read.
    current: usize,
    /// Current line number (1-based); incremented when a `\n` is consumed.
    line: usize,
    /// Current column number (1-based); reset to 1 after a `\n` is consumed.
    column: usize,
    /// Comments collected but not yet attached to a token.
    comments: Vec<String>,
    /// Dialect settings that drive scanning.
    config: &'a TokenizerConfig,
}
1447
1448impl<'a> TokenizerState<'a> {
1449    fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1450        let chars: Vec<char> = sql.chars().collect();
1451        let size = chars.len();
1452        Self {
1453            source: sql,
1454            source_is_ascii: sql.is_ascii(),
1455            chars,
1456            size,
1457            tokens: Vec::new(),
1458            start: 0,
1459            current: 0,
1460            line: 1,
1461            column: 1,
1462            comments: Vec::new(),
1463            config,
1464        }
1465    }
1466
1467    fn tokenize(&mut self) -> Result<Vec<Token>> {
1468        while !self.is_at_end() {
1469            self.skip_whitespace();
1470            if self.is_at_end() {
1471                break;
1472            }
1473
1474            self.start = self.current;
1475            self.scan_token()?;
1476
1477            // ClickHouse: After INSERT ... FORMAT <name> (where name != VALUES),
1478            // the rest until the next blank line or end of input is raw data.
1479            if self.config.insert_format_raw_data {
1480                if let Some(raw) = self.try_scan_insert_format_raw_data() {
1481                    if !raw.is_empty() {
1482                        self.start = self.current;
1483                        self.add_token_with_text(TokenType::Var, raw);
1484                    }
1485                }
1486            }
1487        }
1488
1489        // Handle leftover leading comments at end of input.
1490        // These are comments on a new line after the last token that couldn't be attached
1491        // as leading comments to a subsequent token (because there is none).
1492        // Attach them as trailing comments on the last token so they're preserved.
1493        if !self.comments.is_empty() {
1494            if let Some(last) = self.tokens.last_mut() {
1495                last.trailing_comments.extend(self.comments.drain(..));
1496            }
1497        }
1498
1499        Ok(std::mem::take(&mut self.tokens))
1500    }
1501
1502    #[inline]
1503    fn is_at_end(&self) -> bool {
1504        self.current >= self.size
1505    }
1506
1507    #[inline]
1508    fn text_from_range(&self, start: usize, end: usize) -> String {
1509        if self.source_is_ascii {
1510            self.source[start..end].to_string()
1511        } else {
1512            self.chars[start..end].iter().collect()
1513        }
1514    }
1515
1516    #[inline]
1517    fn peek(&self) -> char {
1518        if self.is_at_end() {
1519            '\0'
1520        } else {
1521            self.chars[self.current]
1522        }
1523    }
1524
1525    #[inline]
1526    fn peek_next(&self) -> char {
1527        if self.current + 1 >= self.size {
1528            '\0'
1529        } else {
1530            self.chars[self.current + 1]
1531        }
1532    }
1533
1534    #[inline]
1535    fn advance(&mut self) -> char {
1536        let c = self.peek();
1537        self.current += 1;
1538        if c == '\n' {
1539            self.line += 1;
1540            self.column = 1;
1541        } else {
1542            self.column += 1;
1543        }
1544        c
1545    }
1546
    /// Advances past whitespace and comments that precede the next token.
    ///
    /// Comment text is not discarded: the comment scanners called from here record
    /// it either as a pending comment or as a trailing comment of the last token.
    /// Scanning stops at the first character that is neither whitespace nor the
    /// start of a skippable comment. A `/*+` hint opener also stops the scan so it
    /// can be handled as a token.
    fn skip_whitespace(&mut self) {
        // Track whether we've seen a newline since the last token.
        // Comments on a new line (after a newline) are leading comments on the next token,
        // while comments on the same line are trailing comments on the previous token.
        // This matches Python sqlglot's behavior.
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            // Arm order matters: guarded arms are tried top to bottom, so the
            // `hash_comments` form of `//` wins over the `comments`-map form.
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                '\u{00A0}' // non-breaking space
                | '\u{2000}'..='\u{200B}' // various Unicode spaces + zero-width space
                | '\u{3000}' // ideographic (full-width) space
                | '\u{FEFF}' // BOM / zero-width no-break space
                => {
                    self.advance();
                }
                // `--` line comments are recognized here regardless of configuration.
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    // ClickHouse: // single-line comments (same dialects that support # comments)
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // Check if this is a hint comment /*+ ... */
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        // This is a hint comment, handle it as a token instead of skipping
                        break;
                    }
                    // NOTE(review): the error from an unterminated block comment is
                    // dropped here. By then the scanner has consumed the rest of the
                    // input, so tokenization ends without reporting the problem.
                    if self.scan_block_comment(saw_newline).is_err() {
                        return;
                    }
                    // Don't reset saw_newline - it carries forward
                }
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Dialect-specific // line comment (e.g., Snowflake)
                    // But NOT inside URIs like file:// or paths with consecutive slashes
                    // Check that previous non-whitespace char is not ':' or '/'
                    let prev_non_ws = if self.current > 0 {
                        // Walk back over spaces and tabs only; the walk stops at
                        // index 0 even if that character is itself a space or tab.
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        // This is likely a URI (file://, http://) or path, not a comment
                        break;
                    }
                    // `scan_line_comment` skips two marker characters, so it serves
                    // `//` as well as `--`.
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                // Anything else starts a real token.
                _ => break,
            }
        }
    }
1618
1619    fn scan_hash_line_comment(&mut self) {
1620        self.advance(); // #
1621        let start = self.current;
1622        while !self.is_at_end() && self.peek() != '\n' {
1623            self.advance();
1624        }
1625        let comment = self.text_from_range(start, self.current);
1626        let comment_text = comment.trim().to_string();
1627        if let Some(last) = self.tokens.last_mut() {
1628            last.trailing_comments.push(comment_text);
1629        } else {
1630            self.comments.push(comment_text);
1631        }
1632    }
1633
1634    fn scan_double_slash_comment(&mut self) {
1635        self.advance(); // /
1636        self.advance(); // /
1637        let start = self.current;
1638        while !self.is_at_end() && self.peek() != '\n' {
1639            self.advance();
1640        }
1641        let comment = self.text_from_range(start, self.current);
1642        let comment_text = comment.trim().to_string();
1643        if let Some(last) = self.tokens.last_mut() {
1644            last.trailing_comments.push(comment_text);
1645        } else {
1646            self.comments.push(comment_text);
1647        }
1648    }
1649
1650    fn scan_line_comment(&mut self, after_newline: bool) {
1651        self.advance(); // -
1652        self.advance(); // -
1653        let start = self.current;
1654        while !self.is_at_end() && self.peek() != '\n' {
1655            self.advance();
1656        }
1657        let comment_text = self.text_from_range(start, self.current);
1658
1659        // If the comment starts on a new line (after_newline), it's a leading comment
1660        // on the next token. Otherwise, it's a trailing comment on the previous token.
1661        if after_newline || self.tokens.is_empty() {
1662            self.comments.push(comment_text);
1663        } else if let Some(last) = self.tokens.last_mut() {
1664            last.trailing_comments.push(comment_text);
1665        }
1666    }
1667
1668    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
1669        self.advance(); // /
1670        self.advance(); // *
1671        let content_start = self.current;
1672        let mut depth = 1;
1673
1674        while !self.is_at_end() && depth > 0 {
1675            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
1676                self.advance();
1677                self.advance();
1678                depth += 1;
1679            } else if self.peek() == '*' && self.peek_next() == '/' {
1680                depth -= 1;
1681                if depth > 0 {
1682                    self.advance();
1683                    self.advance();
1684                }
1685            } else {
1686                self.advance();
1687            }
1688        }
1689
1690        if depth > 0 {
1691            return Err(Error::tokenize(
1692                "Unterminated block comment",
1693                self.line,
1694                self.column,
1695                self.start,
1696                self.current,
1697            ));
1698        }
1699
1700        // Get the content between /* and */ (preserving internal whitespace for nested comments)
1701        let content = self.text_from_range(content_start, self.current);
1702        self.advance(); // *
1703        self.advance(); // /
1704
1705        // For round-trip fidelity, preserve the exact comment content including nested comments
1706        let comment_text = format!("/*{}*/", content);
1707
1708        // If the comment starts on a new line (after_newline), it's a leading comment
1709        // on the next token. Otherwise, it's a trailing comment on the previous token.
1710        if after_newline || self.tokens.is_empty() {
1711            self.comments.push(comment_text);
1712        } else if let Some(last) = self.tokens.last_mut() {
1713            last.trailing_comments.push(comment_text);
1714        }
1715
1716        Ok(())
1717    }
1718
1719    /// Scan a hint comment /*+ ... */ and return it as a Hint token
1720    fn scan_hint(&mut self) -> Result<()> {
1721        self.advance(); // /
1722        self.advance(); // *
1723        self.advance(); // +
1724        let hint_start = self.current;
1725
1726        // Scan until we find */
1727        while !self.is_at_end() {
1728            if self.peek() == '*' && self.peek_next() == '/' {
1729                break;
1730            }
1731            self.advance();
1732        }
1733
1734        if self.is_at_end() {
1735            return Err(Error::tokenize(
1736                "Unterminated hint comment",
1737                self.line,
1738                self.column,
1739                self.start,
1740                self.current,
1741            ));
1742        }
1743
1744        let hint_text = self.text_from_range(hint_start, self.current);
1745        self.advance(); // *
1746        self.advance(); // /
1747
1748        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1749
1750        Ok(())
1751    }
1752
1753    /// Scan a positional parameter: $1, $2, etc.
1754    fn scan_positional_parameter(&mut self) -> Result<()> {
1755        self.advance(); // consume $
1756        let start = self.current;
1757
1758        while !self.is_at_end() && self.peek().is_ascii_digit() {
1759            self.advance();
1760        }
1761
1762        let number = self.text_from_range(start, self.current);
1763        self.add_token_with_text(TokenType::Parameter, number);
1764        Ok(())
1765    }
1766
1767    /// Try to scan a tagged dollar-quoted string: $tag$content$tag$
1768    /// Returns Some(()) if successful, None if this isn't a tagged dollar string.
1769    ///
1770    /// The token text is stored as "tag\x00content" to preserve the tag for later use.
1771    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
1772        let saved_pos = self.current;
1773
1774        // We're at '$', next char is alphabetic
1775        self.advance(); // consume opening $
1776
1777        // Scan the tag (identifier: alphanumeric + underscore, including Unicode)
1778        // Tags can contain Unicode characters like emojis (e.g., $🦆$)
1779        let tag_start = self.current;
1780        while !self.is_at_end()
1781            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1782        {
1783            self.advance();
1784        }
1785        let tag = self.text_from_range(tag_start, self.current);
1786
1787        // Must have a closing $ after the tag
1788        if self.is_at_end() || self.peek() != '$' {
1789            // Not a tagged dollar string - restore position
1790            self.current = saved_pos;
1791            return Ok(None);
1792        }
1793        self.advance(); // consume closing $ of opening tag
1794
1795        // Now scan content until we find $tag$
1796        let content_start = self.current;
1797        let closing_tag = format!("${}$", tag);
1798        let closing_chars: Vec<char> = closing_tag.chars().collect();
1799
1800        loop {
1801            if self.is_at_end() {
1802                // Unterminated - restore and fall through
1803                self.current = saved_pos;
1804                return Ok(None);
1805            }
1806
1807            // Check if we've reached the closing tag
1808            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1809                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1810                    self.current + j < self.size && self.chars[self.current + j] == ch
1811                });
1812                if matches {
1813                    let content = self.text_from_range(content_start, self.current);
1814                    // Consume closing tag
1815                    for _ in 0..closing_chars.len() {
1816                        self.advance();
1817                    }
1818                    // Store as "tag\x00content" to preserve the tag
1819                    let token_text = format!("{}\x00{}", tag, content);
1820                    self.add_token_with_text(TokenType::DollarString, token_text);
1821                    return Ok(Some(()));
1822                }
1823            }
1824            self.advance();
1825        }
1826    }
1827
1828    /// Scan a dollar-quoted string: $$content$$ or $tag$content$tag$
1829    ///
1830    /// For $$...$$ (no tag), the token text is just the content.
1831    /// For $tag$...$tag$, use try_scan_tagged_dollar_string instead.
1832    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1833        self.advance(); // consume first $
1834        self.advance(); // consume second $
1835
1836        // For $$...$$ (no tag), just scan until closing $$
1837        let start = self.current;
1838        while !self.is_at_end() {
1839            if self.peek() == '$'
1840                && self.current + 1 < self.size
1841                && self.chars[self.current + 1] == '$'
1842            {
1843                break;
1844            }
1845            self.advance();
1846        }
1847
1848        let content = self.text_from_range(start, self.current);
1849
1850        if !self.is_at_end() {
1851            self.advance(); // consume first $
1852            self.advance(); // consume second $
1853        }
1854
1855        self.add_token_with_text(TokenType::DollarString, content);
1856        Ok(())
1857    }
1858
1859    fn scan_token(&mut self) -> Result<()> {
1860        let c = self.peek();
1861
1862        // Check for string literal
1863        if c == '\'' {
1864            // Check for triple-quoted string '''...''' if configured
1865            if self.config.quotes.contains_key("'''")
1866                && self.peek_next() == '\''
1867                && self.current + 2 < self.size
1868                && self.chars[self.current + 2] == '\''
1869            {
1870                return self.scan_triple_quoted_string('\'');
1871            }
1872            return self.scan_string();
1873        }
1874
1875        // Check for triple-quoted string """...""" if configured
1876        if c == '"'
1877            && self.config.quotes.contains_key("\"\"\"")
1878            && self.peek_next() == '"'
1879            && self.current + 2 < self.size
1880            && self.chars[self.current + 2] == '"'
1881        {
1882            return self.scan_triple_quoted_string('"');
1883        }
1884
1885        // Check for double-quoted strings when dialect supports them (e.g., BigQuery)
1886        // This must come before identifier quotes check
1887        if c == '"'
1888            && self.config.quotes.contains_key("\"")
1889            && !self.config.identifiers.contains_key(&'"')
1890        {
1891            return self.scan_double_quoted_string();
1892        }
1893
1894        // Check for identifier quotes
1895        if let Some(&end_quote) = self.config.identifiers.get(&c) {
1896            return self.scan_quoted_identifier(end_quote);
1897        }
1898
1899        // Check for numbers (including numbers starting with a dot like .25)
1900        if c.is_ascii_digit() {
1901            return self.scan_number();
1902        }
1903
1904        // Check for numbers starting with a dot (e.g., .25, .5)
1905        // This must come before single character token handling
1906        // Don't treat as a number if:
1907        // - Previous char was also a dot (e.g., 1..2 should be 1, ., ., 2)
1908        // - Previous char is an identifier character (e.g., foo.25 should be foo, ., 25)
1909        //   This handles BigQuery numeric table parts like project.dataset.25
1910        if c == '.' && self.peek_next().is_ascii_digit() {
1911            let prev_char = if self.current > 0 {
1912                self.chars[self.current - 1]
1913            } else {
1914                '\0'
1915            };
1916            let is_after_ident = prev_char.is_alphanumeric()
1917                || prev_char == '_'
1918                || prev_char == '`'
1919                || prev_char == '"'
1920                || prev_char == ']'
1921                || prev_char == ')';
1922            if prev_char != '.' && !is_after_ident {
1923                return self.scan_number_starting_with_dot();
1924            }
1925        }
1926
1927        // Check for hint comment /*+ ... */
1928        if c == '/'
1929            && self.peek_next() == '*'
1930            && self.current + 2 < self.size
1931            && self.chars[self.current + 2] == '+'
1932        {
1933            return self.scan_hint();
1934        }
1935
1936        // Check for multi-character operators first
1937        if let Some(token_type) = self.try_scan_multi_char_operator() {
1938            self.add_token(token_type);
1939            return Ok(());
1940        }
1941
1942        // Check for tagged dollar-quoted strings: $tag$content$tag$
1943        // Tags can contain Unicode characters (including emojis like 🦆) and digits (e.g., $1$)
1944        if c == '$'
1945            && (self.peek_next().is_alphanumeric()
1946                || self.peek_next() == '_'
1947                || !self.peek_next().is_ascii())
1948        {
1949            if let Some(()) = self.try_scan_tagged_dollar_string()? {
1950                return Ok(());
1951            }
1952            // If tagged dollar string didn't match and dollar_sign_is_identifier is set,
1953            // treat the $ and following chars as an identifier (e.g., ClickHouse $alias$name$).
1954            if self.config.dollar_sign_is_identifier {
1955                return self.scan_dollar_identifier();
1956            }
1957        }
1958
1959        // Check for dollar-quoted strings: $$...$$
1960        if c == '$' && self.peek_next() == '$' {
1961            return self.scan_dollar_quoted_string();
1962        }
1963
1964        // Check for positional parameters: $1, $2, etc.
1965        if c == '$' && self.peek_next().is_ascii_digit() {
1966            return self.scan_positional_parameter();
1967        }
1968
1969        // ClickHouse: bare $ (not followed by alphanumeric/underscore) as identifier
1970        if c == '$' && self.config.dollar_sign_is_identifier {
1971            return self.scan_dollar_identifier();
1972        }
1973
1974        // TSQL: Check for identifiers starting with # (temp tables) or @ (variables)
1975        // e.g., #temp, ##global_temp, @variable
1976        if (c == '#' || c == '@')
1977            && (self.peek_next().is_alphanumeric()
1978                || self.peek_next() == '_'
1979                || self.peek_next() == '#')
1980        {
1981            return self.scan_tsql_identifier();
1982        }
1983
1984        // Check for single character tokens
1985        if let Some(&token_type) = self.config.single_tokens.get(&c) {
1986            self.advance();
1987            self.add_token(token_type);
1988            return Ok(());
1989        }
1990
1991        // Unicode minus (U+2212) → treat as regular minus
1992        if c == '\u{2212}' {
1993            self.advance();
1994            self.add_token(TokenType::Dash);
1995            return Ok(());
1996        }
1997
1998        // Unicode fraction slash (U+2044) → treat as regular slash
1999        if c == '\u{2044}' {
2000            self.advance();
2001            self.add_token(TokenType::Slash);
2002            return Ok(());
2003        }
2004
2005        // Unicode curly/smart quotes → treat as regular string quotes
2006        if c == '\u{2018}' || c == '\u{2019}' {
2007            // Left/right single quotation marks → scan as string with matching end
2008            return self.scan_unicode_quoted_string(c);
2009        }
2010        if c == '\u{201C}' || c == '\u{201D}' {
2011            // Left/right double quotation marks → scan as quoted identifier
2012            return self.scan_unicode_quoted_identifier(c);
2013        }
2014
2015        // Must be an identifier or keyword
2016        self.scan_identifier_or_keyword()
2017    }
2018
2019    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
2020        let c = self.peek();
2021        let next = self.peek_next();
2022        let third = if self.current + 2 < self.size {
2023            self.chars[self.current + 2]
2024        } else {
2025            '\0'
2026        };
2027
2028        // Check for three-character operators first
2029        // -|- (Adjacent - PostgreSQL range adjacency)
2030        if c == '-' && next == '|' && third == '-' {
2031            self.advance();
2032            self.advance();
2033            self.advance();
2034            return Some(TokenType::Adjacent);
2035        }
2036
2037        // ||/ (Cube root - PostgreSQL)
2038        if c == '|' && next == '|' && third == '/' {
2039            self.advance();
2040            self.advance();
2041            self.advance();
2042            return Some(TokenType::DPipeSlash);
2043        }
2044
2045        // #>> (JSONB path text extraction - PostgreSQL)
2046        if c == '#' && next == '>' && third == '>' {
2047            self.advance();
2048            self.advance();
2049            self.advance();
2050            return Some(TokenType::DHashArrow);
2051        }
2052
2053        // ->> (JSON text extraction - PostgreSQL/MySQL)
2054        if c == '-' && next == '>' && third == '>' {
2055            self.advance();
2056            self.advance();
2057            self.advance();
2058            return Some(TokenType::DArrow);
2059        }
2060
2061        // <=> (NULL-safe equality - MySQL)
2062        if c == '<' && next == '=' && third == '>' {
2063            self.advance();
2064            self.advance();
2065            self.advance();
2066            return Some(TokenType::NullsafeEq);
2067        }
2068
2069        // <-> (Distance operator - PostgreSQL)
2070        if c == '<' && next == '-' && third == '>' {
2071            self.advance();
2072            self.advance();
2073            self.advance();
2074            return Some(TokenType::LrArrow);
2075        }
2076
2077        // <@ (Contained by - PostgreSQL)
2078        if c == '<' && next == '@' {
2079            self.advance();
2080            self.advance();
2081            return Some(TokenType::LtAt);
2082        }
2083
2084        // @> (Contains - PostgreSQL)
2085        if c == '@' && next == '>' {
2086            self.advance();
2087            self.advance();
2088            return Some(TokenType::AtGt);
2089        }
2090
2091        // ~~~ (Glob - PostgreSQL)
2092        if c == '~' && next == '~' && third == '~' {
2093            self.advance();
2094            self.advance();
2095            self.advance();
2096            return Some(TokenType::Glob);
2097        }
2098
2099        // ~~* (ILike - PostgreSQL)
2100        if c == '~' && next == '~' && third == '*' {
2101            self.advance();
2102            self.advance();
2103            self.advance();
2104            return Some(TokenType::ILike);
2105        }
2106
2107        // !~~* (Not ILike - PostgreSQL)
2108        let fourth = if self.current + 3 < self.size {
2109            self.chars[self.current + 3]
2110        } else {
2111            '\0'
2112        };
2113        if c == '!' && next == '~' && third == '~' && fourth == '*' {
2114            self.advance();
2115            self.advance();
2116            self.advance();
2117            self.advance();
2118            return Some(TokenType::NotILike);
2119        }
2120
2121        // !~~ (Not Like - PostgreSQL)
2122        if c == '!' && next == '~' && third == '~' {
2123            self.advance();
2124            self.advance();
2125            self.advance();
2126            return Some(TokenType::NotLike);
2127        }
2128
2129        // !~* (Not Regexp ILike - PostgreSQL)
2130        if c == '!' && next == '~' && third == '*' {
2131            self.advance();
2132            self.advance();
2133            self.advance();
2134            return Some(TokenType::NotIRLike);
2135        }
2136
2137        // !:> (Not cast / Try cast - SingleStore)
2138        if c == '!' && next == ':' && third == '>' {
2139            self.advance();
2140            self.advance();
2141            self.advance();
2142            return Some(TokenType::NColonGt);
2143        }
2144
2145        // ?:: (TRY_CAST shorthand - Databricks)
2146        if c == '?' && next == ':' && third == ':' {
2147            self.advance();
2148            self.advance();
2149            self.advance();
2150            return Some(TokenType::QDColon);
2151        }
2152
2153        // !~ (Not Regexp - PostgreSQL)
2154        if c == '!' && next == '~' {
2155            self.advance();
2156            self.advance();
2157            return Some(TokenType::NotRLike);
2158        }
2159
2160        // ~~ (Like - PostgreSQL)
2161        if c == '~' && next == '~' {
2162            self.advance();
2163            self.advance();
2164            return Some(TokenType::Like);
2165        }
2166
2167        // ~* (Regexp ILike - PostgreSQL)
2168        if c == '~' && next == '*' {
2169            self.advance();
2170            self.advance();
2171            return Some(TokenType::IRLike);
2172        }
2173
2174        // SingleStore three-character JSON path operators (must be checked before :: two-char)
2175        // ::$ (JSON extract string), ::% (JSON extract double), ::? (JSON match)
2176        if c == ':' && next == ':' && third == '$' {
2177            self.advance();
2178            self.advance();
2179            self.advance();
2180            return Some(TokenType::DColonDollar);
2181        }
2182        if c == ':' && next == ':' && third == '%' {
2183            self.advance();
2184            self.advance();
2185            self.advance();
2186            return Some(TokenType::DColonPercent);
2187        }
2188        if c == ':' && next == ':' && third == '?' {
2189            self.advance();
2190            self.advance();
2191            self.advance();
2192            return Some(TokenType::DColonQMark);
2193        }
2194
2195        // Two-character operators
2196        let token_type = match (c, next) {
2197            ('.', ':') => Some(TokenType::DotColon),
2198            ('=', '=') => Some(TokenType::Eq), // Hive/Spark == equality operator
2199            ('<', '=') => Some(TokenType::Lte),
2200            ('>', '=') => Some(TokenType::Gte),
2201            ('!', '=') => Some(TokenType::Neq),
2202            ('<', '>') => Some(TokenType::Neq),
2203            ('^', '=') => Some(TokenType::Neq),
2204            ('<', '<') => Some(TokenType::LtLt),
2205            ('>', '>') => Some(TokenType::GtGt),
2206            ('|', '|') => Some(TokenType::DPipe),
2207            ('|', '/') => Some(TokenType::PipeSlash), // Square root - PostgreSQL
2208            (':', ':') => Some(TokenType::DColon),
2209            (':', '=') => Some(TokenType::ColonEq), // := (assignment, named args)
2210            (':', '>') => Some(TokenType::ColonGt), // ::> (TSQL)
2211            ('-', '>') => Some(TokenType::Arrow),   // JSON object access
2212            ('=', '>') => Some(TokenType::FArrow),  // Fat arrow (lambda)
2213            ('&', '&') => Some(TokenType::DAmp),
2214            ('&', '<') => Some(TokenType::AmpLt), // PostgreSQL range operator
2215            ('&', '>') => Some(TokenType::AmpGt), // PostgreSQL range operator
2216            ('@', '@') => Some(TokenType::AtAt),  // Text search match
2217            ('@', '?') => Some(TokenType::AtQMark), // JSON path exists - PostgreSQL
2218            ('?', '|') => Some(TokenType::QMarkPipe), // JSONB contains any key
2219            ('?', '&') => Some(TokenType::QMarkAmp), // JSONB contains all keys
2220            ('?', '?') => Some(TokenType::DQMark), // Double question mark
2221            ('#', '>') => Some(TokenType::HashArrow), // JSONB path extraction
2222            ('#', '-') => Some(TokenType::HashDash), // JSONB delete
2223            ('^', '@') => Some(TokenType::CaretAt), // PostgreSQL starts-with operator
2224            ('*', '*') => Some(TokenType::DStar), // Power operator
2225            ('|', '>') => Some(TokenType::PipeGt), // Pipe-greater (some dialects)
2226            _ => None,
2227        };
2228
2229        if token_type.is_some() {
2230            self.advance();
2231            self.advance();
2232        }
2233
2234        token_type
2235    }
2236
2237    fn scan_string(&mut self) -> Result<()> {
2238        self.advance(); // Opening quote
2239        let mut value = String::new();
2240
2241        while !self.is_at_end() {
2242            let c = self.peek();
2243            if c == '\'' {
2244                if self.peek_next() == '\'' {
2245                    // Escaped quote
2246                    value.push('\'');
2247                    self.advance();
2248                    self.advance();
2249                } else {
2250                    break;
2251                }
2252            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2253                if self.config.recover_terminal_backslash_quote
2254                    && self.peek_next() == '\''
2255                    && !self.chars[self.current + 2..].contains(&'\'')
2256                {
2257                    value.push(self.advance());
2258                    break;
2259                }
2260
2261                // Handle escape sequences
2262                self.advance(); // Consume the backslash
2263                if !self.is_at_end() {
2264                    let escaped = self.advance();
2265                    match escaped {
2266                        'n' => value.push('\n'),
2267                        'r' => value.push('\r'),
2268                        't' => value.push('\t'),
2269                        '0' => value.push('\0'),
2270                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2271                        'a' => value.push('\x07'), // Alert/bell
2272                        'b' => value.push('\x08'), // Backspace
2273                        'f' => value.push('\x0C'), // Form feed
2274                        'v' => value.push('\x0B'), // Vertical tab
2275                        'x' => {
2276                            // Hex escape: \xNN (exactly 2 hex digits)
2277                            let mut hex = String::with_capacity(2);
2278                            for _ in 0..2 {
2279                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2280                                    hex.push(self.advance());
2281                                }
2282                            }
2283                            if hex.len() == 2 {
2284                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2285                                    value.push(byte as char);
2286                                } else {
2287                                    value.push('\\');
2288                                    value.push('x');
2289                                    value.push_str(&hex);
2290                                }
2291                            } else {
2292                                // Not enough hex digits, preserve literally
2293                                value.push('\\');
2294                                value.push('x');
2295                                value.push_str(&hex);
2296                            }
2297                        }
2298                        '\\' => value.push('\\'),
2299                        '\'' => value.push('\''),
2300                        '"' => value.push('"'),
2301                        '%' => {
2302                            // MySQL: \% in LIKE patterns
2303                            value.push('%');
2304                        }
2305                        '_' => {
2306                            // MySQL: \_ in LIKE patterns
2307                            value.push('_');
2308                        }
2309                        // For unrecognized escape sequences:
2310                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2311                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2312                        _ => {
2313                            if !self.config.escape_follow_chars.is_empty() {
2314                                // MySQL-style: discard backslash for unrecognized escapes
2315                                value.push(escaped);
2316                            } else {
2317                                // Standard: preserve backslash + char
2318                                value.push('\\');
2319                                value.push(escaped);
2320                            }
2321                        }
2322                    }
2323                }
2324            } else {
2325                value.push(self.advance());
2326            }
2327        }
2328
2329        if self.is_at_end() {
2330            if self.config.recover_unterminated_string {
2331                self.add_token_with_text(TokenType::String, value);
2332                return Ok(());
2333            }
2334
2335            return Err(Error::tokenize(
2336                "Unterminated string",
2337                self.line,
2338                self.column,
2339                self.start,
2340                self.current,
2341            ));
2342        }
2343
2344        self.advance(); // Closing quote
2345        self.add_token_with_text(TokenType::String, value);
2346        Ok(())
2347    }
2348
2349    /// Scan a double-quoted string (for dialects like BigQuery where " is a string delimiter)
2350    fn scan_double_quoted_string(&mut self) -> Result<()> {
2351        self.advance(); // Opening quote
2352        let mut value = String::new();
2353
2354        while !self.is_at_end() {
2355            let c = self.peek();
2356            if c == '"' {
2357                if self.peek_next() == '"' {
2358                    // Escaped quote
2359                    value.push('"');
2360                    self.advance();
2361                    self.advance();
2362                } else {
2363                    break;
2364                }
2365            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2366                // Handle escape sequences
2367                self.advance(); // Consume the backslash
2368                if !self.is_at_end() {
2369                    let escaped = self.advance();
2370                    match escaped {
2371                        'n' => value.push('\n'),
2372                        'r' => value.push('\r'),
2373                        't' => value.push('\t'),
2374                        '0' => value.push('\0'),
2375                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2376                        'a' => value.push('\x07'), // Alert/bell
2377                        'b' => value.push('\x08'), // Backspace
2378                        'f' => value.push('\x0C'), // Form feed
2379                        'v' => value.push('\x0B'), // Vertical tab
2380                        'x' => {
2381                            // Hex escape: \xNN (exactly 2 hex digits)
2382                            let mut hex = String::with_capacity(2);
2383                            for _ in 0..2 {
2384                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2385                                    hex.push(self.advance());
2386                                }
2387                            }
2388                            if hex.len() == 2 {
2389                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2390                                    value.push(byte as char);
2391                                } else {
2392                                    value.push('\\');
2393                                    value.push('x');
2394                                    value.push_str(&hex);
2395                                }
2396                            } else {
2397                                // Not enough hex digits, preserve literally
2398                                value.push('\\');
2399                                value.push('x');
2400                                value.push_str(&hex);
2401                            }
2402                        }
2403                        '\\' => value.push('\\'),
2404                        '\'' => value.push('\''),
2405                        '"' => value.push('"'),
2406                        '%' => {
2407                            // MySQL: \% in LIKE patterns
2408                            value.push('%');
2409                        }
2410                        '_' => {
2411                            // MySQL: \_ in LIKE patterns
2412                            value.push('_');
2413                        }
2414                        // For unrecognized escape sequences:
2415                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2416                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2417                        _ => {
2418                            if !self.config.escape_follow_chars.is_empty() {
2419                                // MySQL-style: discard backslash for unrecognized escapes
2420                                value.push(escaped);
2421                            } else {
2422                                // Standard: preserve backslash + char
2423                                value.push('\\');
2424                                value.push(escaped);
2425                            }
2426                        }
2427                    }
2428                }
2429            } else {
2430                value.push(self.advance());
2431            }
2432        }
2433
2434        if self.is_at_end() {
2435            return Err(Error::tokenize(
2436                "Unterminated double-quoted string",
2437                self.line,
2438                self.column,
2439                self.start,
2440                self.current,
2441            ));
2442        }
2443
2444        self.advance(); // Closing quote
2445        self.add_token_with_text(TokenType::String, value);
2446        Ok(())
2447    }
2448
2449    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2450        // Advance past the three opening quotes
2451        self.advance();
2452        self.advance();
2453        self.advance();
2454        let mut value = String::new();
2455
2456        while !self.is_at_end() {
2457            // Check for closing triple quote
2458            if self.peek() == quote_char
2459                && self.current + 1 < self.size
2460                && self.chars[self.current + 1] == quote_char
2461                && self.current + 2 < self.size
2462                && self.chars[self.current + 2] == quote_char
2463            {
2464                // Found closing """
2465                break;
2466            }
2467            value.push(self.advance());
2468        }
2469
2470        if self.is_at_end() {
2471            return Err(Error::tokenize(
2472                "Unterminated triple-quoted string",
2473                self.line,
2474                self.column,
2475                self.start,
2476                self.current,
2477            ));
2478        }
2479
2480        // Advance past the three closing quotes
2481        self.advance();
2482        self.advance();
2483        self.advance();
2484        let token_type = if quote_char == '"' {
2485            TokenType::TripleDoubleQuotedString
2486        } else {
2487            TokenType::TripleSingleQuotedString
2488        };
2489        self.add_token_with_text(token_type, value);
2490        Ok(())
2491    }
2492
2493    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2494        self.advance(); // Opening quote
2495        let mut value = String::new();
2496
2497        loop {
2498            if self.is_at_end() {
2499                return Err(Error::tokenize(
2500                    "Unterminated identifier",
2501                    self.line,
2502                    self.column,
2503                    self.start,
2504                    self.current,
2505                ));
2506            }
2507            if end_quote == '`' && self.peek() == '\\' && self.peek_next() == end_quote {
2508                // ClickHouse allows escaped backticks inside backtick-quoted identifiers.
2509                value.push(end_quote);
2510                self.advance(); // skip backslash
2511                self.advance(); // skip escaped quote
2512                continue;
2513            }
2514            if self.peek() == end_quote {
2515                if self.peek_next() == end_quote {
2516                    // Escaped quote (e.g., "" inside "x""y") -> store single quote
2517                    value.push(end_quote);
2518                    self.advance(); // skip first quote
2519                    self.advance(); // skip second quote
2520                } else {
2521                    // End of identifier
2522                    break;
2523                }
2524            } else {
2525                value.push(self.peek());
2526                self.advance();
2527            }
2528        }
2529
2530        self.advance(); // Closing quote
2531        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2532        Ok(())
2533    }
2534
2535    /// Scan a string delimited by Unicode curly single quotes (U+2018/U+2019).
2536    /// Content between curly quotes is literal (no escape processing).
2537    /// When opened with \u{2018} (left), close with \u{2019} (right) only.
2538    /// When opened with \u{2019} (right), close with \u{2019} (right) — self-closing.
2539    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2540        self.advance(); // Opening curly quote
2541        let start = self.current;
2542        // Determine closing quote: left opens -> right closes; right opens -> right closes
2543        let close_quote = if open_quote == '\u{2018}' {
2544            '\u{2019}' // left opens, right closes
2545        } else {
2546            '\u{2019}' // right quote also closes with right quote
2547        };
2548        while !self.is_at_end() && self.peek() != close_quote {
2549            self.advance();
2550        }
2551        let value = self.text_from_range(start, self.current);
2552        if !self.is_at_end() {
2553            self.advance(); // Closing quote
2554        }
2555        self.add_token_with_text(TokenType::String, value);
2556        Ok(())
2557    }
2558
2559    /// Scan an identifier delimited by Unicode curly double quotes (U+201C/U+201D).
2560    /// When opened with \u{201C} (left), close with \u{201D} (right) only.
2561    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2562        self.advance(); // Opening curly quote
2563        let start = self.current;
2564        let close_quote = if open_quote == '\u{201C}' {
2565            '\u{201D}' // left opens, right closes
2566        } else {
2567            '\u{201D}' // right also closes with right
2568        };
2569        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2570            self.advance();
2571        }
2572        let value = self.text_from_range(start, self.current);
2573        if !self.is_at_end() {
2574            self.advance(); // Closing quote
2575        }
2576        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2577        Ok(())
2578    }
2579
2580    fn scan_number(&mut self) -> Result<()> {
2581        // Check for 0x/0X hex number prefix (SQLite-style)
2582        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
2583            let next = if self.current + 1 < self.size {
2584                self.chars[self.current + 1]
2585            } else {
2586                '\0'
2587            };
2588            if next == 'x' || next == 'X' {
2589                // Advance past '0' and 'x'/'X'
2590                self.advance();
2591                self.advance();
2592                // Collect hex digits (allow underscores as separators, e.g., 0xbad_cafe)
2593                let hex_start = self.current;
2594                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
2595                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
2596                        break;
2597                    }
2598                    self.advance();
2599                }
2600                if self.current > hex_start {
2601                    // Check for hex float: 0xABC.DEFpEXP or 0xABCpEXP
2602                    let mut is_hex_float = false;
2603                    // Optional fractional part: .hexdigits
2604                    if !self.is_at_end() && self.peek() == '.' {
2605                        let after_dot = if self.current + 1 < self.size {
2606                            self.chars[self.current + 1]
2607                        } else {
2608                            '\0'
2609                        };
2610                        if after_dot.is_ascii_hexdigit() {
2611                            is_hex_float = true;
2612                            self.advance(); // consume '.'
2613                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2614                                self.advance();
2615                            }
2616                        }
2617                    }
2618                    // Optional binary exponent: p/P [+/-] digits
2619                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
2620                        is_hex_float = true;
2621                        self.advance(); // consume p/P
2622                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
2623                            self.advance();
2624                        }
2625                        while !self.is_at_end() && self.peek().is_ascii_digit() {
2626                            self.advance();
2627                        }
2628                    }
2629                    if is_hex_float {
2630                        // Hex float literal — emit as regular Number token with full text
2631                        let raw_text = self.text_from_range(self.start, self.current);
2632                        let full_text = if self.config.numbers_can_be_underscore_separated
2633                            && raw_text.contains('_')
2634                        {
2635                            raw_text.replace('_', "")
2636                        } else {
2637                            raw_text
2638                        };
2639                        self.add_token_with_text(TokenType::Number, full_text);
2640                    } else if self.config.hex_string_is_integer_type {
2641                        // BigQuery/ClickHouse: 0xA represents an integer in hex notation
2642                        let raw_value = self.text_from_range(hex_start, self.current);
2643                        let hex_value = if self.config.numbers_can_be_underscore_separated
2644                            && raw_value.contains('_')
2645                        {
2646                            raw_value.replace('_', "")
2647                        } else {
2648                            raw_value
2649                        };
2650                        self.add_token_with_text(TokenType::HexNumber, hex_value);
2651                    } else {
2652                        // SQLite/Teradata: 0xCC represents a binary/blob hex string
2653                        let raw_value = self.text_from_range(hex_start, self.current);
2654                        let hex_value = if self.config.numbers_can_be_underscore_separated
2655                            && raw_value.contains('_')
2656                        {
2657                            raw_value.replace('_', "")
2658                        } else {
2659                            raw_value
2660                        };
2661                        self.add_token_with_text(TokenType::HexString, hex_value);
2662                    }
2663                    return Ok(());
2664                }
2665                // No hex digits after 0x - fall through to normal number parsing
2666                // (reset current back to after '0')
2667                self.current = self.start + 1;
2668            }
2669        }
2670
2671        // Allow underscores as digit separators (e.g., 20_000, 1_000_000)
2672        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2673            // Don't allow underscore at the end (must be followed by digit)
2674            if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
2675                break;
2676            }
2677            self.advance();
2678        }
2679
2680        // Look for decimal part - allow trailing dot (e.g., "1.")
2681        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
2682        // So we always consume the dot as part of the number, even if followed by an identifier
2683        if self.peek() == '.' {
2684            let next = self.peek_next();
2685            // Only consume the dot if:
2686            // 1. Followed by a digit (normal decimal like 1.5)
2687            // 2. Followed by an identifier start (like 1.x -> becomes 1. with alias x)
2688            // 3. End of input or other non-dot character (trailing decimal like "1.")
2689            // Do NOT consume if it's a double dot (..) which is a range operator
2690            if next != '.' {
2691                self.advance(); // consume the .
2692                                // Only consume digits after the decimal point (not identifiers)
2693                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2694                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2695                        break;
2696                    }
2697                    self.advance();
2698                }
2699            }
2700        }
2701
2702        // Look for exponent
2703        if self.peek() == 'e' || self.peek() == 'E' {
2704            self.advance();
2705            if self.peek() == '+' || self.peek() == '-' {
2706                self.advance();
2707            }
2708            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2709                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2710                    break;
2711                }
2712                self.advance();
2713            }
2714        }
2715
2716        let raw_text = self.text_from_range(self.start, self.current);
2717        // Strip underscore digit separators (e.g., 20_000 -> 20000, 1_2E+1_0 -> 12E+10)
2718        // Only for dialects that support this (ClickHouse, DuckDB)
2719        let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2720            raw_text.replace('_', "")
2721        } else {
2722            raw_text
2723        };
2724
2725        // Check for numeric literal suffixes (e.g., 1L -> BIGINT, 1s -> SMALLINT in Hive/Spark)
2726        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2727            let next_char: String = self.peek().to_ascii_uppercase().to_string();
2728            // Try 2-char suffix first (e.g., "BD"), then 1-char
2729            let suffix_match = if self.current + 1 < self.size {
2730                let two_char: String = [
2731                    self.chars[self.current].to_ascii_uppercase(),
2732                    self.chars[self.current + 1].to_ascii_uppercase(),
2733                ]
2734                .iter()
2735                .collect();
2736                if self.config.numeric_literals.contains_key(&two_char) {
2737                    // Make sure the 2-char suffix is not followed by more identifier chars
2738                    let after_suffix = if self.current + 2 < self.size {
2739                        self.chars[self.current + 2]
2740                    } else {
2741                        ' '
2742                    };
2743                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2744                        Some((two_char, 2))
2745                    } else {
2746                        None
2747                    }
2748                } else if self.config.numeric_literals.contains_key(&next_char) {
2749                    // 1-char suffix - make sure not followed by more identifier chars
2750                    let after_suffix = if self.current + 1 < self.size {
2751                        self.chars[self.current + 1]
2752                    } else {
2753                        ' '
2754                    };
2755                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2756                        Some((next_char, 1))
2757                    } else {
2758                        None
2759                    }
2760                } else {
2761                    None
2762                }
2763            } else if self.config.numeric_literals.contains_key(&next_char) {
2764                // At end of input, 1-char suffix
2765                Some((next_char, 1))
2766            } else {
2767                None
2768            };
2769
2770            if let Some((suffix, len)) = suffix_match {
2771                // Consume the suffix characters
2772                for _ in 0..len {
2773                    self.advance();
2774                }
2775                // Emit as a special number-with-suffix token
2776                // We'll encode as "number::TYPE" so the parser can split it
2777                let type_name = self
2778                    .config
2779                    .numeric_literals
2780                    .get(&suffix)
2781                    .expect("suffix verified by contains_key above")
2782                    .clone();
2783                let combined = format!("{}::{}", text, type_name);
2784                self.add_token_with_text(TokenType::Number, combined);
2785                return Ok(());
2786            }
2787        }
2788
2789        // Check for identifiers that start with a digit (e.g., 1a, 1_a, 1a_1a)
2790        // In Hive/Spark/MySQL/ClickHouse, these are valid unquoted identifiers
2791        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2792            let next = self.peek();
2793            if next.is_alphabetic() || next == '_' {
2794                // Continue scanning as an identifier
2795                while !self.is_at_end() {
2796                    let ch = self.peek();
2797                    if ch.is_alphanumeric() || ch == '_' {
2798                        self.advance();
2799                    } else {
2800                        break;
2801                    }
2802                }
2803                let ident_text = self.text_from_range(self.start, self.current);
2804                self.add_token_with_text(TokenType::Identifier, ident_text);
2805                return Ok(());
2806            }
2807        }
2808
2809        self.add_token_with_text(TokenType::Number, text);
2810        Ok(())
2811    }
2812
2813    /// Scan a number that starts with a dot (e.g., .25, .5, .123e10)
2814    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2815        // Consume the leading dot
2816        self.advance();
2817
2818        // Consume the fractional digits
2819        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2820            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2821                break;
2822            }
2823            self.advance();
2824        }
2825
2826        // Look for exponent
2827        if self.peek() == 'e' || self.peek() == 'E' {
2828            self.advance();
2829            if self.peek() == '+' || self.peek() == '-' {
2830                self.advance();
2831            }
2832            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2833                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2834                    break;
2835                }
2836                self.advance();
2837            }
2838        }
2839
2840        let raw_text = self.text_from_range(self.start, self.current);
2841        // Strip underscore digit separators (e.g., .1_5 -> .15)
2842        // Only for dialects that support this (ClickHouse, DuckDB)
2843        let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2844            raw_text.replace('_', "")
2845        } else {
2846            raw_text
2847        };
2848        self.add_token_with_text(TokenType::Number, text);
2849        Ok(())
2850    }
2851
2852    /// Look up a keyword using a stack buffer for ASCII uppercasing, avoiding heap allocation.
2853    /// Returns `TokenType::Var` for texts longer than 128 bytes or non-UTF-8 results.
2854    #[inline]
2855    fn lookup_keyword_ascii(keywords: &HashMap<String, TokenType>, text: &str) -> TokenType {
2856        if text.len() > 128 {
2857            return TokenType::Var;
2858        }
2859        let mut buf = [0u8; 128];
2860        for (i, b) in text.bytes().enumerate() {
2861            buf[i] = b.to_ascii_uppercase();
2862        }
2863        if let Ok(upper) = std::str::from_utf8(&buf[..text.len()]) {
2864            keywords.get(upper).copied().unwrap_or(TokenType::Var)
2865        } else {
2866            TokenType::Var
2867        }
2868    }
2869
2870    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
2871        // Guard against unrecognized characters that could cause infinite loops
2872        let first_char = self.peek();
2873        if !first_char.is_alphanumeric() && first_char != '_' {
2874            // Unknown character - skip it and return an error
2875            let c = self.advance();
2876            return Err(Error::tokenize(
2877                format!("Unexpected character: '{}'", c),
2878                self.line,
2879                self.column,
2880                self.start,
2881                self.current,
2882            ));
2883        }
2884
2885        while !self.is_at_end() {
2886            let c = self.peek();
2887            // Allow alphanumeric, underscore, $, # and @ in identifiers
2888            // PostgreSQL allows $, TSQL allows # and @
2889            // But stop consuming # if followed by > or >> (PostgreSQL #> and #>> operators)
2890            if c == '#' {
2891                let next_c = if self.current + 1 < self.size {
2892                    self.chars[self.current + 1]
2893                } else {
2894                    '\0'
2895                };
2896                if next_c == '>' || next_c == '-' {
2897                    break; // Don't consume # — it's part of #>, #>>, or #- operator
2898                }
2899                self.advance();
2900            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
2901                self.advance();
2902            } else {
2903                break;
2904            }
2905        }
2906
2907        let text = self.text_from_range(self.start, self.current);
2908
2909        // Special-case NOT= (Teradata and other dialects)
2910        if text.eq_ignore_ascii_case("NOT") && self.peek() == '=' {
2911            self.advance(); // consume '='
2912            self.add_token(TokenType::Neq);
2913            return Ok(());
2914        }
2915
2916        // Check for special string prefixes like N'...', X'...', B'...', U&'...', r'...', b'...'
2917        // Also handle double-quoted variants for dialects that support them (e.g., BigQuery)
2918        let next_char = self.peek();
2919        let is_single_quote = next_char == '\'';
2920        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
2921        // For raw strings (r"..." or r'...'), we allow double quotes even if " is not in quotes config
2922        // because raw strings are a special case used in Spark/Databricks where " is for identifiers
2923        let is_double_quote_for_raw = next_char == '"';
2924
2925        // Handle raw strings first - they're special because they work with both ' and "
2926        // even in dialects where " is normally an identifier delimiter (like Databricks)
2927        if text.eq_ignore_ascii_case("R") && (is_single_quote || is_double_quote_for_raw) {
2928            // Raw string r'...' or r"..." or r'''...''' or r"""...""" (BigQuery style)
2929            // In raw strings, backslashes are treated literally (no escape processing)
2930            let quote_char = if is_single_quote { '\'' } else { '"' };
2931            self.advance(); // consume the first opening quote
2932
2933            // Check for triple-quoted raw string (r"""...""" or r'''...''')
2934            if self.peek() == quote_char && self.peek_next() == quote_char {
2935                // Triple-quoted raw string
2936                self.advance(); // consume second quote
2937                self.advance(); // consume third quote
2938                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
2939                self.add_token_with_text(TokenType::RawString, string_value);
2940            } else {
2941                let string_value = self.scan_raw_string_content(quote_char)?;
2942                self.add_token_with_text(TokenType::RawString, string_value);
2943            }
2944            return Ok(());
2945        }
2946
2947        if is_single_quote || is_double_quote {
2948            if text.eq_ignore_ascii_case("N") {
2949                // National string N'...'
2950                self.advance(); // consume the opening quote
2951                let string_value = if is_single_quote {
2952                    self.scan_string_content()?
2953                } else {
2954                    self.scan_double_quoted_string_content()?
2955                };
2956                self.add_token_with_text(TokenType::NationalString, string_value);
2957                return Ok(());
2958            } else if text.eq_ignore_ascii_case("E") {
2959                // PostgreSQL escape string E'...' or e'...'
2960                // Preserve the case by prefixing with "e:" or "E:"
2961                // Always use backslash escapes for escape strings (e.g., \' is an escaped quote)
2962                let lowercase = text == "e";
2963                let prefix = if lowercase { "e:" } else { "E:" };
2964                self.advance(); // consume the opening quote
2965                let string_value = self.scan_string_content_with_escapes(true)?;
2966                self.add_token_with_text(
2967                    TokenType::EscapeString,
2968                    format!("{}{}", prefix, string_value),
2969                );
2970                return Ok(());
2971            } else if text.eq_ignore_ascii_case("X") {
2972                // Hex string X'...'
2973                self.advance(); // consume the opening quote
2974                let string_value = if is_single_quote {
2975                    self.scan_string_content()?
2976                } else {
2977                    self.scan_double_quoted_string_content()?
2978                };
2979                self.add_token_with_text(TokenType::HexString, string_value);
2980                return Ok(());
2981            } else if text.eq_ignore_ascii_case("B") && is_double_quote {
2982                // Byte string b"..." (BigQuery style) - MUST check before single quote B'...'
2983                self.advance(); // consume the opening quote
2984                let string_value = self.scan_double_quoted_string_content()?;
2985                self.add_token_with_text(TokenType::ByteString, string_value);
2986                return Ok(());
2987            } else if text.eq_ignore_ascii_case("B") && is_single_quote {
2988                // For BigQuery: b'...' is a byte string (bytes data)
2989                // For standard SQL: B'...' is a bit string (binary digits)
2990                self.advance(); // consume the opening quote
2991                let string_value = self.scan_string_content()?;
2992                if self.config.b_prefix_is_byte_string {
2993                    self.add_token_with_text(TokenType::ByteString, string_value);
2994                } else {
2995                    self.add_token_with_text(TokenType::BitString, string_value);
2996                }
2997                return Ok(());
2998            }
2999        }
3000
3001        // Check for U&'...' Unicode string syntax (SQL standard)
3002        if text.eq_ignore_ascii_case("U")
3003            && self.peek() == '&'
3004            && self.current + 1 < self.size
3005            && self.chars[self.current + 1] == '\''
3006        {
3007            self.advance(); // consume '&'
3008            self.advance(); // consume opening quote
3009            let string_value = self.scan_string_content()?;
3010            self.add_token_with_text(TokenType::UnicodeString, string_value);
3011            return Ok(());
3012        }
3013
3014        let token_type = Self::lookup_keyword_ascii(&self.config.keywords, &text);
3015
3016        self.add_token_with_text(token_type, text);
3017        Ok(())
3018    }
3019
3020    /// Scan string content (everything between quotes)
3021    /// If `force_backslash_escapes` is true, backslash is always treated as an escape character
3022    /// (used for PostgreSQL E'...' escape strings)
3023    fn scan_string_content_with_escapes(
3024        &mut self,
3025        force_backslash_escapes: bool,
3026    ) -> Result<String> {
3027        let mut value = String::new();
3028        let use_backslash_escapes =
3029            force_backslash_escapes || self.config.string_escapes.contains(&'\\');
3030
3031        while !self.is_at_end() {
3032            let c = self.peek();
3033            if c == '\'' {
3034                if self.peek_next() == '\'' {
3035                    // Escaped quote ''
3036                    value.push('\'');
3037                    self.advance();
3038                    self.advance();
3039                } else {
3040                    break;
3041                }
3042            } else if c == '\\' && use_backslash_escapes {
3043                // Preserve escape sequences literally (including \' for escape strings)
3044                value.push(self.advance());
3045                if !self.is_at_end() {
3046                    value.push(self.advance());
3047                }
3048            } else {
3049                value.push(self.advance());
3050            }
3051        }
3052
3053        if self.is_at_end() {
3054            return Err(Error::tokenize(
3055                "Unterminated string",
3056                self.line,
3057                self.column,
3058                self.start,
3059                self.current,
3060            ));
3061        }
3062
3063        self.advance(); // Closing quote
3064        Ok(value)
3065    }
3066
3067    /// Scan string content (everything between quotes)
3068    fn scan_string_content(&mut self) -> Result<String> {
3069        self.scan_string_content_with_escapes(false)
3070    }
3071
3072    /// Scan double-quoted string content (for dialects like BigQuery where " is a string delimiter)
3073    /// This is used for prefixed strings like b"..." or N"..."
3074    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
3075        let mut value = String::new();
3076        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');
3077
3078        while !self.is_at_end() {
3079            let c = self.peek();
3080            if c == '"' {
3081                if self.peek_next() == '"' {
3082                    // Escaped quote ""
3083                    value.push('"');
3084                    self.advance();
3085                    self.advance();
3086                } else {
3087                    break;
3088                }
3089            } else if c == '\\' && use_backslash_escapes {
3090                // Handle escape sequences
3091                self.advance(); // Consume backslash
3092                if !self.is_at_end() {
3093                    let escaped = self.advance();
3094                    match escaped {
3095                        'n' => value.push('\n'),
3096                        'r' => value.push('\r'),
3097                        't' => value.push('\t'),
3098                        '0' => value.push('\0'),
3099                        '\\' => value.push('\\'),
3100                        '"' => value.push('"'),
3101                        '\'' => value.push('\''),
3102                        'x' => {
3103                            // Hex escape \xNN - collect hex digits
3104                            let mut hex = String::new();
3105                            for _ in 0..2 {
3106                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
3107                                    hex.push(self.advance());
3108                                }
3109                            }
3110                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
3111                                value.push(byte as char);
3112                            } else {
3113                                // Invalid hex escape, keep it literal
3114                                value.push('\\');
3115                                value.push('x');
3116                                value.push_str(&hex);
3117                            }
3118                        }
3119                        _ => {
3120                            // For unrecognized escapes, preserve backslash + char
3121                            value.push('\\');
3122                            value.push(escaped);
3123                        }
3124                    }
3125                }
3126            } else {
3127                value.push(self.advance());
3128            }
3129        }
3130
3131        if self.is_at_end() {
3132            return Err(Error::tokenize(
3133                "Unterminated double-quoted string",
3134                self.line,
3135                self.column,
3136                self.start,
3137                self.current,
3138            ));
3139        }
3140
3141        self.advance(); // Closing quote
3142        Ok(value)
3143    }
3144
3145    /// Scan raw string content (limited escape processing for quotes)
3146    /// Used for BigQuery r'...' and r"..." strings
3147    /// In raw strings, backslashes are literal EXCEPT that escape sequences for the
3148    /// quote character still work (e.g., \' in r'...' escapes the quote, '' also works)
3149    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3150        let mut value = String::new();
3151
3152        while !self.is_at_end() {
3153            let c = self.peek();
3154            if c == quote_char {
3155                if self.peek_next() == quote_char {
3156                    // Escaped quote (doubled) - e.g., '' inside r'...'
3157                    value.push(quote_char);
3158                    self.advance();
3159                    self.advance();
3160                } else {
3161                    break;
3162                }
3163            } else if c == '\\'
3164                && self.peek_next() == quote_char
3165                && self.config.string_escapes_allowed_in_raw_strings
3166            {
3167                // Backslash-escaped quote - works in raw strings when string_escapes_allowed_in_raw_strings is true
3168                // e.g., \' inside r'...' becomes literal ' (BigQuery behavior)
3169                // Spark/Databricks has this set to false, so backslash is always literal there
3170                value.push(quote_char);
3171                self.advance(); // consume backslash
3172                self.advance(); // consume quote
3173            } else {
3174                // In raw strings, everything including backslashes is literal
3175                value.push(self.advance());
3176            }
3177        }
3178
3179        if self.is_at_end() {
3180            return Err(Error::tokenize(
3181                "Unterminated raw string",
3182                self.line,
3183                self.column,
3184                self.start,
3185                self.current,
3186            ));
3187        }
3188
3189        self.advance(); // Closing quote
3190        Ok(value)
3191    }
3192
3193    /// Scan raw triple-quoted string content (r"""...""" or r'''...''')
3194    /// Terminates when three consecutive quote_chars are found
3195    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3196        let mut value = String::new();
3197
3198        while !self.is_at_end() {
3199            let c = self.peek();
3200            if c == quote_char && self.peek_next() == quote_char {
3201                // Check for third quote
3202                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3203                    // Found three consecutive quotes - end of string
3204                    self.advance(); // first closing quote
3205                    self.advance(); // second closing quote
3206                    self.advance(); // third closing quote
3207                    return Ok(value);
3208                }
3209            }
3210            // In raw strings, everything including backslashes is literal
3211            let ch = self.advance();
3212            value.push(ch);
3213        }
3214
3215        Err(Error::tokenize(
3216            "Unterminated raw triple-quoted string",
3217            self.line,
3218            self.column,
3219            self.start,
3220            self.current,
3221        ))
3222    }
3223
3224    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables)
3225    /// Examples: #temp, ##global_temp, @variable
3226    /// Scan an identifier that starts with `$` (ClickHouse).
3227    /// Examples: `$alias$name$`, `$x`
3228    fn scan_dollar_identifier(&mut self) -> Result<()> {
3229        // Consume the leading $
3230        self.advance();
3231
3232        // Consume alphanumeric, _, and $ continuation chars
3233        while !self.is_at_end() {
3234            let c = self.peek();
3235            if c.is_alphanumeric() || c == '_' || c == '$' {
3236                self.advance();
3237            } else {
3238                break;
3239            }
3240        }
3241
3242        let text = self.text_from_range(self.start, self.current);
3243        self.add_token_with_text(TokenType::Var, text);
3244        Ok(())
3245    }
3246
3247    fn scan_tsql_identifier(&mut self) -> Result<()> {
3248        // Consume the leading # or @ (or ##)
3249        let first = self.advance();
3250
3251        // For ##, consume the second #
3252        if first == '#' && self.peek() == '#' {
3253            self.advance();
3254        }
3255
3256        // Now scan the rest of the identifier
3257        while !self.is_at_end() {
3258            let c = self.peek();
3259            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3260                self.advance();
3261            } else {
3262                break;
3263            }
3264        }
3265
3266        let text = self.text_from_range(self.start, self.current);
3267        // These are always identifiers (variables or temp table names), never keywords
3268        self.add_token_with_text(TokenType::Var, text);
3269        Ok(())
3270    }
3271
3272    /// Check if the last tokens match INSERT ... FORMAT <name> (not VALUES).
3273    /// If so, consume everything until the next blank line (two consecutive newlines)
3274    /// or end of input as raw data.
3275    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
3276        let len = self.tokens.len();
3277        if len < 3 {
3278            return None;
3279        }
3280
3281        // Last token should be the format name (Identifier or Var, not VALUES)
3282        let last = &self.tokens[len - 1];
3283        if last.text.eq_ignore_ascii_case("VALUES") {
3284            return None;
3285        }
3286        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
3287            return None;
3288        }
3289
3290        // Second-to-last should be FORMAT
3291        let format_tok = &self.tokens[len - 2];
3292        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
3293            return None;
3294        }
3295
3296        // Check that there's an INSERT somewhere earlier in the tokens
3297        let has_insert = self.tokens[..len - 2]
3298            .iter()
3299            .rev()
3300            .take(20)
3301            .any(|t| t.token_type == TokenType::Insert);
3302        if !has_insert {
3303            return None;
3304        }
3305
3306        // We're in INSERT ... FORMAT <name> context. Consume everything until:
3307        // - A blank line (two consecutive newlines, possibly with whitespace between)
3308        // - End of input
3309        let raw_start = self.current;
3310        while !self.is_at_end() {
3311            let c = self.peek();
3312            if c == '\n' {
3313                // Check for blank line: \n followed by optional \r and \n
3314                let saved = self.current;
3315                self.advance(); // consume first \n
3316                                // Skip \r if present
3317                while !self.is_at_end() && self.peek() == '\r' {
3318                    self.advance();
3319                }
3320                if self.is_at_end() || self.peek() == '\n' {
3321                    // Found blank line or end of input - stop here
3322                    // Don't consume the second \n so subsequent SQL can be tokenized
3323                    let raw = self.text_from_range(raw_start, saved);
3324                    return Some(raw.trim().to_string());
3325                }
3326                // Not a blank line, continue scanning
3327            } else {
3328                self.advance();
3329            }
3330        }
3331
3332        // Reached end of input
3333        let raw = self.text_from_range(raw_start, self.current);
3334        let trimmed = raw.trim().to_string();
3335        if trimmed.is_empty() {
3336            None
3337        } else {
3338            Some(trimmed)
3339        }
3340    }
3341
3342    fn add_token(&mut self, token_type: TokenType) {
3343        let text = self.text_from_range(self.start, self.current);
3344        self.add_token_with_text(token_type, text);
3345    }
3346
3347    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3348        let span = Span::new(self.start, self.current, self.line, self.column);
3349        let mut token = Token::new(token_type, text, span);
3350        token.comments.append(&mut self.comments);
3351        self.tokens.push(token);
3352    }
3353}
3354
3355#[cfg(test)]
3356mod tests {
3357    use super::*;
3358
3359    #[test]
3360    fn test_simple_select() {
3361        let tokenizer = Tokenizer::default();
3362        let tokens = tokenizer.tokenize("SELECT 1").unwrap();
3363
3364        assert_eq!(tokens.len(), 2);
3365        assert_eq!(tokens[0].token_type, TokenType::Select);
3366        assert_eq!(tokens[1].token_type, TokenType::Number);
3367        assert_eq!(tokens[1].text, "1");
3368    }
3369
3370    #[test]
3371    fn test_select_with_identifier() {
3372        let tokenizer = Tokenizer::default();
3373        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();
3374
3375        assert_eq!(tokens.len(), 6);
3376        assert_eq!(tokens[0].token_type, TokenType::Select);
3377        assert_eq!(tokens[1].token_type, TokenType::Var);
3378        assert_eq!(tokens[1].text, "a");
3379        assert_eq!(tokens[2].token_type, TokenType::Comma);
3380        assert_eq!(tokens[3].token_type, TokenType::Var);
3381        assert_eq!(tokens[3].text, "b");
3382        assert_eq!(tokens[4].token_type, TokenType::From);
3383        assert_eq!(tokens[5].token_type, TokenType::Var);
3384        assert_eq!(tokens[5].text, "t");
3385    }
3386
3387    #[test]
3388    fn test_string_literal() {
3389        let tokenizer = Tokenizer::default();
3390        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();
3391
3392        assert_eq!(tokens.len(), 2);
3393        assert_eq!(tokens[1].token_type, TokenType::String);
3394        assert_eq!(tokens[1].text, "hello");
3395    }
3396
3397    #[test]
3398    fn test_escaped_string() {
3399        let tokenizer = Tokenizer::default();
3400        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();
3401
3402        assert_eq!(tokens.len(), 2);
3403        assert_eq!(tokens[1].token_type, TokenType::String);
3404        assert_eq!(tokens[1].text, "it's");
3405    }
3406
3407    #[test]
3408    fn test_terminal_backslash_quote_recovery() {
3409        let mut config = TokenizerConfig::default();
3410        config.string_escapes.push('\\');
3411        config.recover_terminal_backslash_quote = true;
3412        let tokenizer = Tokenizer::new(config);
3413        let tokens = tokenizer
3414            .tokenize("SHOW FUNCTIONS LIKE 'a\\' OR 1=1")
3415            .unwrap();
3416
3417        assert_eq!(tokens.len(), 8);
3418        assert_eq!(tokens[3].token_type, TokenType::String);
3419        assert_eq!(tokens[3].text, "a\\");
3420        assert_eq!(tokens[4].token_type, TokenType::Or);
3421    }
3422
3423    #[test]
3424    fn test_comments() {
3425        let tokenizer = Tokenizer::default();
3426        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();
3427
3428        assert_eq!(tokens.len(), 2);
3429        // Comments are attached to the PREVIOUS token as trailing_comments
3430        // This is better for round-trip fidelity (e.g., SELECT c /* comment */ FROM)
3431        assert_eq!(tokens[0].trailing_comments.len(), 1);
3432        assert_eq!(tokens[0].trailing_comments[0], " comment");
3433    }
3434
3435    #[test]
3436    fn test_comment_in_and_chain() {
3437        use crate::generator::Generator;
3438        use crate::parser::Parser;
3439
3440        // Line comments between AND clauses should appear after the AND operator
3441        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
3442        let ast = Parser::parse_sql(sql).unwrap();
3443        let mut gen = Generator::default();
3444        let output = gen.generate(&ast[0]).unwrap();
3445        assert_eq!(
3446            output,
3447            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
3448        );
3449    }
3450
3451    #[test]
3452    fn test_operators() {
3453        let tokenizer = Tokenizer::default();
3454        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();
3455
3456        assert_eq!(tokens.len(), 5);
3457        assert_eq!(tokens[0].token_type, TokenType::Number);
3458        assert_eq!(tokens[1].token_type, TokenType::Plus);
3459        assert_eq!(tokens[2].token_type, TokenType::Number);
3460        assert_eq!(tokens[3].token_type, TokenType::Star);
3461        assert_eq!(tokens[4].token_type, TokenType::Number);
3462    }
3463
3464    #[test]
3465    fn test_comparison_operators() {
3466        let tokenizer = Tokenizer::default();
3467        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();
3468
3469        assert_eq!(tokens[1].token_type, TokenType::Lte);
3470        assert_eq!(tokens[3].token_type, TokenType::Gte);
3471        assert_eq!(tokens[5].token_type, TokenType::Neq);
3472    }
3473
3474    #[test]
3475    fn test_national_string() {
3476        let tokenizer = Tokenizer::default();
3477        let tokens = tokenizer.tokenize("N'abc'").unwrap();
3478
3479        assert_eq!(
3480            tokens.len(),
3481            1,
3482            "Expected 1 token for N'abc', got {:?}",
3483            tokens
3484        );
3485        assert_eq!(tokens[0].token_type, TokenType::NationalString);
3486        assert_eq!(tokens[0].text, "abc");
3487    }
3488
3489    #[test]
3490    fn test_hex_string() {
3491        let tokenizer = Tokenizer::default();
3492        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();
3493
3494        assert_eq!(
3495            tokens.len(),
3496            1,
3497            "Expected 1 token for X'ABCD', got {:?}",
3498            tokens
3499        );
3500        assert_eq!(tokens[0].token_type, TokenType::HexString);
3501        assert_eq!(tokens[0].text, "ABCD");
3502    }
3503
3504    #[test]
3505    fn test_bit_string() {
3506        let tokenizer = Tokenizer::default();
3507        let tokens = tokenizer.tokenize("B'01010'").unwrap();
3508
3509        assert_eq!(
3510            tokens.len(),
3511            1,
3512            "Expected 1 token for B'01010', got {:?}",
3513            tokens
3514        );
3515        assert_eq!(tokens[0].token_type, TokenType::BitString);
3516        assert_eq!(tokens[0].text, "01010");
3517    }
3518
3519    #[test]
3520    fn test_trailing_dot_number() {
3521        let tokenizer = Tokenizer::default();
3522
3523        // Test trailing dot
3524        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
3525        assert_eq!(
3526            tokens.len(),
3527            2,
3528            "Expected 2 tokens for 'SELECT 1.', got {:?}",
3529            tokens
3530        );
3531        assert_eq!(tokens[1].token_type, TokenType::Number);
3532        assert_eq!(tokens[1].text, "1.");
3533
3534        // Test normal decimal
3535        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
3536        assert_eq!(tokens[1].text, "1.5");
3537
3538        // Test number followed by dot and identifier
3539        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
3540        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
3541        assert_eq!(
3542            tokens.len(),
3543            3,
3544            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
3545            tokens
3546        );
3547        assert_eq!(tokens[1].token_type, TokenType::Number);
3548        assert_eq!(tokens[1].text, "1.");
3549        assert_eq!(tokens[2].token_type, TokenType::Var);
3550
3551        // Test two dots (range operator) - dot is NOT consumed when followed by another dot
3552        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
3553        assert_eq!(tokens[1].token_type, TokenType::Number);
3554        assert_eq!(tokens[1].text, "1");
3555        assert_eq!(tokens[2].token_type, TokenType::Dot);
3556        assert_eq!(tokens[3].token_type, TokenType::Dot);
3557        assert_eq!(tokens[4].token_type, TokenType::Number);
3558        assert_eq!(tokens[4].text, "2");
3559    }
3560
3561    #[test]
3562    fn test_leading_dot_number() {
3563        let tokenizer = Tokenizer::default();
3564
3565        // Test leading dot number (e.g., .25 for 0.25)
3566        let tokens = tokenizer.tokenize(".25").unwrap();
3567        assert_eq!(
3568            tokens.len(),
3569            1,
3570            "Expected 1 token for '.25', got {:?}",
3571            tokens
3572        );
3573        assert_eq!(tokens[0].token_type, TokenType::Number);
3574        assert_eq!(tokens[0].text, ".25");
3575
3576        // Test leading dot in context (Oracle SAMPLE clause)
3577        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
3578        assert_eq!(
3579            tokens.len(),
3580            4,
3581            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
3582            tokens
3583        );
3584        assert_eq!(tokens[0].token_type, TokenType::Sample);
3585        assert_eq!(tokens[1].token_type, TokenType::LParen);
3586        assert_eq!(tokens[2].token_type, TokenType::Number);
3587        assert_eq!(tokens[2].text, ".25");
3588        assert_eq!(tokens[3].token_type, TokenType::RParen);
3589
3590        // Test leading dot with exponent
3591        let tokens = tokenizer.tokenize(".5e10").unwrap();
3592        assert_eq!(
3593            tokens.len(),
3594            1,
3595            "Expected 1 token for '.5e10', got {:?}",
3596            tokens
3597        );
3598        assert_eq!(tokens[0].token_type, TokenType::Number);
3599        assert_eq!(tokens[0].text, ".5e10");
3600
3601        // Test that plain dot is still a Dot token
3602        let tokens = tokenizer.tokenize("a.b").unwrap();
3603        assert_eq!(
3604            tokens.len(),
3605            3,
3606            "Expected 3 tokens for 'a.b', got {:?}",
3607            tokens
3608        );
3609        assert_eq!(tokens[1].token_type, TokenType::Dot);
3610    }
3611
3612    #[test]
3613    fn test_unrecognized_character() {
3614        let tokenizer = Tokenizer::default();
3615
3616        // Unicode curly quotes are now handled as string delimiters
3617        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
3618        assert!(
3619            result.is_ok(),
3620            "Curly quotes should be tokenized as strings"
3621        );
3622
3623        // Unicode bullet character should still error
3624        let result = tokenizer.tokenize("SELECT • FROM t");
3625        assert!(result.is_err());
3626    }
3627
3628    #[test]
3629    fn test_colon_eq_tokenization() {
3630        let tokenizer = Tokenizer::default();
3631
3632        // := should be a single ColonEq token
3633        let tokens = tokenizer.tokenize("a := 1").unwrap();
3634        assert_eq!(tokens.len(), 3);
3635        assert_eq!(tokens[0].token_type, TokenType::Var);
3636        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
3637        assert_eq!(tokens[2].token_type, TokenType::Number);
3638
3639        // : followed by non-= should still be Colon
3640        let tokens = tokenizer.tokenize("a:b").unwrap();
3641        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
3642        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
3643
3644        // :: should still be DColon
3645        let tokens = tokenizer.tokenize("a::INT").unwrap();
3646        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
3647    }
3648
3649    #[test]
3650    fn test_colon_eq_parsing() {
3651        use crate::generator::Generator;
3652        use crate::parser::Parser;
3653
3654        // MySQL @var := value in SELECT
3655        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
3656            .expect("Failed to parse MySQL @var := expr");
3657        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3658        assert_eq!(output, "SELECT @var1 := 1, @var2");
3659
3660        // MySQL @var := @var in SELECT
3661        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
3662            .expect("Failed to parse MySQL @var2 := @var1");
3663        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3664        assert_eq!(output, "SELECT @var1, @var2 := @var1");
3665
3666        // MySQL @var := COUNT(*)
3667        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
3668            .expect("Failed to parse MySQL @var := COUNT(*)");
3669        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3670        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");
3671
3672        // MySQL SET @var := 1 (should normalize to = in output)
3673        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
3674        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3675        assert_eq!(output, "SET @var1 = 1");
3676
3677        // Function named args with :=
3678        let ast =
3679            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
3680        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3681        assert_eq!(output, "UNION_VALUE(k1 := 1)");
3682
3683        // UNNEST with recursive := TRUE
3684        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
3685            .expect("Failed to parse UNNEST with :=");
3686        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3687        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");
3688
3689        // DuckDB prefix alias: foo: 1 means 1 AS foo
3690        let ast =
3691            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
3692        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3693        assert_eq!(output, "SELECT 1 AS foo");
3694
3695        // DuckDB prefix alias with multiple columns
3696        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
3697            .expect("Failed to parse DuckDB multiple prefix aliases");
3698        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3699        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
3700    }
3701
3702    #[test]
3703    fn test_colon_eq_dialect_roundtrip() {
3704        use crate::dialects::{Dialect, DialectType};
3705
3706        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
3707            let d = Dialect::get(dialect);
3708            let ast = d
3709                .parse(sql)
3710                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3711            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3712            let transformed = d
3713                .transform(ast[0].clone())
3714                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3715            let output = d
3716                .generate(&transformed)
3717                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3718            let expected = expected.unwrap_or(sql);
3719            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3720        }
3721
3722        // MySQL := tests
3723        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
3724        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
3725        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
3726        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));
3727
3728        // DuckDB := tests
3729        check(
3730            DialectType::DuckDB,
3731            "SELECT UNNEST(col, recursive := TRUE) FROM t",
3732            None,
3733        );
3734        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);
3735
3736        // STRUCT_PACK(a := 'b')::json should at least parse without error
3737        // (The STRUCT_PACK -> Struct transformation is a separate feature)
3738        {
3739            let d = Dialect::get(DialectType::DuckDB);
3740            let ast = d
3741                .parse("STRUCT_PACK(a := 'b')::json")
3742                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
3743            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
3744        }
3745
3746        // DuckDB prefix alias tests
3747        check(
3748            DialectType::DuckDB,
3749            "SELECT foo: 1",
3750            Some("SELECT 1 AS foo"),
3751        );
3752        check(
3753            DialectType::DuckDB,
3754            "SELECT foo: 1, bar: 2, baz: 3",
3755            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
3756        );
3757    }
3758
3759    #[test]
3760    fn test_comment_roundtrip() {
3761        use crate::generator::Generator;
3762        use crate::parser::Parser;
3763
3764        fn check_roundtrip(sql: &str) -> Option<String> {
3765            let ast = match Parser::parse_sql(sql) {
3766                Ok(a) => a,
3767                Err(e) => return Some(format!("Parse error: {:?}", e)),
3768            };
3769            if ast.is_empty() {
3770                return Some("Empty AST".to_string());
3771            }
3772            let mut generator = Generator::default();
3773            let output = match generator.generate(&ast[0]) {
3774                Ok(o) => o,
3775                Err(e) => return Some(format!("Gen error: {:?}", e)),
3776            };
3777            if output == sql {
3778                None
3779            } else {
3780                Some(format!(
3781                    "Mismatch:\n  input:  {}\n  output: {}",
3782                    sql, output
3783                ))
3784            }
3785        }
3786
3787        let tests = vec![
3788            // Nested comments are sanitized: inner /* and */ are escaped
3789            // These no longer round-trip exactly (by design, matches Python sqlglot)
3790            // "SELECT c /* c1 /* c2 */ c3 */",        // becomes /* c1 / * c2 * / c3 */
3791            // "SELECT c /* c1 /* c2 /* c3 */ */ */",   // becomes /* c1 / * c2 / * c3 * / * / */
3792            // Simple alias with comments
3793            "SELECT c /* c1 */ AS alias /* c2 */",
3794            // Multiple columns with comments
3795            "SELECT a /* x */, b /* x */",
3796            // Multiple comments after column
3797            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
3798            // FROM tables with comments
3799            "SELECT * FROM foo /* x */, bla /* x */",
3800            // Arithmetic with comments
3801            "SELECT 1 /* comment */ + 1",
3802            "SELECT 1 /* c1 */ + 2 /* c2 */",
3803            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
3804            // CAST with comments
3805            "SELECT CAST(x AS INT) /* comment */ FROM foo",
3806            // Function arguments with comments
3807            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
3808            // Multi-part table names with comments
3809            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
3810            // INSERT with comments
3811            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
3812            // Leading comments on statements
3813            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
3814            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
3815            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
3816            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
3817            "/* comment */ CREATE TABLE foo AS SELECT 1",
3818            // Trailing comments on statements
3819            "INSERT INTO foo SELECT * FROM bar /* comment */",
3820            // Complex nested expressions with comments
3821            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
3822        ];
3823
3824        let mut failures = Vec::new();
3825        for sql in tests {
3826            if let Some(e) = check_roundtrip(sql) {
3827                failures.push(e);
3828            }
3829        }
3830
3831        if !failures.is_empty() {
3832            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
3833        }
3834    }
3835
3836    #[test]
3837    fn test_dollar_quoted_string_parsing() {
3838        use crate::dialects::{Dialect, DialectType};
3839
3840        // Test dollar string token parsing utility function
3841        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
3842        assert_eq!(tag, Some("FOO".to_string()));
3843        assert_eq!(content, "content here");
3844
3845        let (tag, content) = super::parse_dollar_string_token("just content");
3846        assert_eq!(tag, None);
3847        assert_eq!(content, "just content");
3848
3849        // Test roundtrip for Databricks dialect with dollar-quoted function body
3850        fn check_databricks(sql: &str, expected: Option<&str>) {
3851            let d = Dialect::get(DialectType::Databricks);
3852            let ast = d
3853                .parse(sql)
3854                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3855            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3856            let transformed = d
3857                .transform(ast[0].clone())
3858                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3859            let output = d
3860                .generate(&transformed)
3861                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3862            let expected = expected.unwrap_or(sql);
3863            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3864        }
3865
3866        // Test [42]: $$...$$ heredoc
3867        check_databricks(
3868            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n  return x+1$$",
3869            None
3870        );
3871
3872        // Test [43]: $FOO$...$FOO$ tagged heredoc
3873        check_databricks(
3874            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n  return x+1$FOO$",
3875            None
3876        );
3877    }
3878
3879    #[test]
3880    fn test_numeric_underscore_stripping() {
3881        // Underscore stripping only happens when numbers_can_be_underscore_separated is true
3882        let mut config = TokenizerConfig::default();
3883        config.numbers_can_be_underscore_separated = true;
3884        let tokenizer = Tokenizer::new(config);
3885
3886        // Simple integer with underscores
3887        let tokens = tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
3888        assert_eq!(tokens[1].token_type, TokenType::Number);
3889        assert_eq!(tokens[1].text, "12345");
3890
3891        // Thousands separator
3892        let tokens = tokenizer.tokenize("SELECT 20_000").unwrap();
3893        assert_eq!(tokens[1].token_type, TokenType::Number);
3894        assert_eq!(tokens[1].text, "20000");
3895
3896        // Scientific notation with underscores
3897        let tokens = tokenizer.tokenize("SELECT 1_2E+1_0").unwrap();
3898        assert_eq!(tokens[1].token_type, TokenType::Number);
3899        assert_eq!(tokens[1].text, "12E+10");
3900
3901        // Default tokenizer should NOT strip underscores
3902        let default_tokenizer = Tokenizer::default();
3903        let tokens = default_tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
3904        assert_eq!(tokens[1].token_type, TokenType::Number);
3905        assert_eq!(tokens[1].text, "1_2_3_4_5");
3906    }
3907}
polyglot_sql/tokens.rs

polyglot_sql/
tokens.rs