polyglot_sql/
tokens.rs

//! Token types and tokenization for SQL parsing
//!
//! This module defines all SQL token types and the tokenizer that converts
//! SQL strings into token streams.
5
6use crate::error::{Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::fmt;
10use std::sync::LazyLock;
11#[cfg(feature = "bindings")]
12use ts_rs::TS;
13
/// Split the text of a `DollarString` token into its optional tag and its content.
///
/// The first `'\x00'` in `text` is treated as the separator: everything before
/// it is returned as the tag (`Some`, possibly an empty string) and everything
/// after it as the content. Any further `'\x00'` characters stay in the content.
/// When `text` contains no `'\x00'`, the tag is `None` and the whole text is the
/// content.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_string()), content.to_string()),
        None => (None, text.to_string()),
    }
}
26
27/// Represents a position in the source SQL
28#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
29#[cfg_attr(feature = "bindings", derive(TS))]
30pub struct Span {
31    /// Starting byte offset
32    pub start: usize,
33    /// Ending byte offset (exclusive)
34    pub end: usize,
35    /// Line number (1-based)
36    pub line: usize,
37    /// Column number (1-based)
38    pub column: usize,
39}
40
41impl Span {
42    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
43        Self {
44            start,
45            end,
46            line,
47            column,
48        }
49    }
50}
51
52/// A token in the SQL token stream
53#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
54pub struct Token {
55    /// The type of token
56    pub token_type: TokenType,
57    /// The raw text of the token
58    pub text: String,
59    /// Position information
60    pub span: Span,
61    /// Leading comments (comments that appeared before this token)
62    #[serde(default)]
63    pub comments: Vec<String>,
64    /// Trailing comments (comments that appeared after this token, before the next one)
65    #[serde(default)]
66    pub trailing_comments: Vec<String>,
67}
68
69impl Token {
70    /// Create a new token
71    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
72        Self {
73            token_type,
74            text: text.into(),
75            span,
76            comments: Vec::new(),
77            trailing_comments: Vec::new(),
78        }
79    }
80
81    /// Create a NUMBER token
82    pub fn number(n: i64) -> Self {
83        Self::new(TokenType::Number, n.to_string(), Span::default())
84    }
85
86    /// Create a STRING token
87    pub fn string(s: impl Into<String>) -> Self {
88        Self::new(TokenType::String, s, Span::default())
89    }
90
91    /// Create an IDENTIFIER token
92    pub fn identifier(s: impl Into<String>) -> Self {
93        Self::new(TokenType::Identifier, s, Span::default())
94    }
95
96    /// Create a VAR token
97    pub fn var(s: impl Into<String>) -> Self {
98        Self::new(TokenType::Var, s, Span::default())
99    }
100
101    /// Add a comment to this token
102    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
103        self.comments.push(comment.into());
104        self
105    }
106}
107
108impl fmt::Display for Token {
109    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110        write!(f, "{:?}({})", self.token_type, self.text)
111    }
112}
113
114/// All possible token types in SQL
115#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
116#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
117#[repr(u16)]
118pub enum TokenType {
119    // Punctuation
120    LParen,
121    RParen,
122    LBracket,
123    RBracket,
124    LBrace,
125    RBrace,
126    Comma,
127    Dot,
128    Dash,
129    Plus,
130    Colon,
131    DotColon,
132    DColon,
133    DColonDollar,
134    DColonPercent,
135    DColonQMark,
136    DQMark,
137    Semicolon,
138    Star,
139    Backslash,
140    Slash,
141    Lt,
142    Lte,
143    Gt,
144    Gte,
145    Not,
146    Eq,
147    Neq,
148    NullsafeEq,
149    ColonEq,
150    ColonGt,
151    NColonGt,
152    And,
153    Or,
154    Amp,
155    DPipe,
156    PipeGt,
157    Pipe,
158    PipeSlash,
159    DPipeSlash,
160    Caret,
161    CaretAt,
162    LtLt, // <<
163    GtGt, // >>
164    Tilde,
165    Arrow,
166    DArrow,
167    FArrow,
168    Hash,
169    HashArrow,
170    DHashArrow,
171    LrArrow,
172    DAt,
173    AtAt,
174    LtAt,
175    AtGt,
176    Dollar,
177    Parameter,
178    Session,
179    SessionParameter,
180    SessionUser,
181    DAmp,
182    AmpLt,
183    AmpGt,
184    Adjacent,
185    Xor,
186    DStar,
187    QMarkAmp,
188    QMarkPipe,
189    HashDash,
190    Exclamation,
191
192    UriStart,
193    BlockStart,
194    BlockEnd,
195    Space,
196    Break,
197
198    // Comments (emitted as tokens for round-trip fidelity)
199    BlockComment, // /* ... */
200    LineComment,  // -- ...
201
202    // Literals
203    String,
204    DollarString,             // $$...$$
205    TripleDoubleQuotedString, // """..."""
206    TripleSingleQuotedString, // '''...'''
207    Number,
208    Identifier,
209    QuotedIdentifier,
210    Database,
211    Column,
212    ColumnDef,
213    Schema,
214    Table,
215    Warehouse,
216    Stage,
217    Streamlit,
218    Var,
219    BitString,
220    HexString,
221    /// Hex number: 0xA, 0xFF (BigQuery, SQLite style) - represents an integer in hex notation
222    HexNumber,
223    ByteString,
224    NationalString,
225    EscapeString, // PostgreSQL E'...' escape string
226    RawString,
227    HeredocString,
228    HeredocStringAlternative,
229    UnicodeString,
230
231    // Data Types
232    Bit,
233    Boolean,
234    TinyInt,
235    UTinyInt,
236    SmallInt,
237    USmallInt,
238    MediumInt,
239    UMediumInt,
240    Int,
241    UInt,
242    BigInt,
243    UBigInt,
244    BigNum,
245    Int128,
246    UInt128,
247    Int256,
248    UInt256,
249    Float,
250    Double,
251    UDouble,
252    Decimal,
253    Decimal32,
254    Decimal64,
255    Decimal128,
256    Decimal256,
257    DecFloat,
258    UDecimal,
259    BigDecimal,
260    Char,
261    NChar,
262    VarChar,
263    NVarChar,
264    BpChar,
265    Text,
266    MediumText,
267    LongText,
268    Blob,
269    MediumBlob,
270    LongBlob,
271    TinyBlob,
272    TinyText,
273    Name,
274    Binary,
275    VarBinary,
276    Json,
277    JsonB,
278    Time,
279    TimeTz,
280    TimeNs,
281    Timestamp,
282    TimestampTz,
283    TimestampLtz,
284    TimestampNtz,
285    TimestampS,
286    TimestampMs,
287    TimestampNs,
288    DateTime,
289    DateTime2,
290    DateTime64,
291    SmallDateTime,
292    Date,
293    Date32,
294    Int4Range,
295    Int4MultiRange,
296    Int8Range,
297    Int8MultiRange,
298    NumRange,
299    NumMultiRange,
300    TsRange,
301    TsMultiRange,
302    TsTzRange,
303    TsTzMultiRange,
304    DateRange,
305    DateMultiRange,
306    Uuid,
307    Geography,
308    GeographyPoint,
309    Nullable,
310    Geometry,
311    Point,
312    Ring,
313    LineString,
314    LocalTime,
315    LocalTimestamp,
316    SysTimestamp,
317    MultiLineString,
318    Polygon,
319    MultiPolygon,
320    HllSketch,
321    HStore,
322    Super,
323    Serial,
324    SmallSerial,
325    BigSerial,
326    Xml,
327    Year,
328    UserDefined,
329    Money,
330    SmallMoney,
331    RowVersion,
332    Image,
333    Variant,
334    Object,
335    Inet,
336    IpAddress,
337    IpPrefix,
338    Ipv4,
339    Ipv6,
340    Enum,
341    Enum8,
342    Enum16,
343    FixedString,
344    LowCardinality,
345    Nested,
346    AggregateFunction,
347    SimpleAggregateFunction,
348    TDigest,
349    Unknown,
350    Vector,
351    Dynamic,
352    Void,
353
354    // Keywords
355    Add,
356    Alias,
357    Alter,
358    All,
359    Anti,
360    Any,
361    Apply,
362    Array,
363    Asc,
364    AsOf,
365    Attach,
366    AutoIncrement,
367    Begin,
368    Between,
369    BulkCollectInto,
370    Cache,
371    Cascade,
372    Case,
373    CharacterSet,
374    Cluster,
375    ClusterBy,
376    Collate,
377    Command,
378    Comment,
379    Commit,
380    Preserve,
381    Connect,
382    ConnectBy,
383    Constraint,
384    Copy,
385    Create,
386    Cross,
387    Cube,
388    CurrentDate,
389    CurrentDateTime,
390    CurrentSchema,
391    CurrentTime,
392    CurrentTimestamp,
393    CurrentUser,
394    CurrentRole,
395    CurrentCatalog,
396    Declare,
397    Default,
398    Delete,
399    Desc,
400    Describe,
401    Detach,
402    Dictionary,
403    Distinct,
404    Distribute,
405    DistributeBy,
406    Div,
407    Drop,
408    Else,
409    End,
410    Escape,
411    Except,
412    Execute,
413    Exists,
414    False,
415    Fetch,
416    File,
417    FileFormat,
418    Filter,
419    Final,
420    First,
421    For,
422    Force,
423    ForeignKey,
424    Format,
425    From,
426    Full,
427    Function,
428    Get,
429    Glob,
430    Global,
431    Grant,
432    GroupBy,
433    GroupingSets,
434    Having,
435    Hint,
436    Ignore,
437    ILike,
438    In,
439    Index,
440    IndexedBy,
441    Inner,
442    Input,
443    Insert,
444    Install,
445    Intersect,
446    Interval,
447    Into,
448    Inpath,
449    InputFormat,
450    Introducer,
451    IRLike,
452    Is,
453    IsNull,
454    Join,
455    JoinMarker,
456    Keep,
457    Key,
458    Kill,
459    Lambda,
460    Language,
461    Lateral,
462    Left,
463    Like,
464    NotLike,   // !~~ operator (PostgreSQL)
465    NotILike,  // !~~* operator (PostgreSQL)
466    NotRLike,  // !~ operator (PostgreSQL)
467    NotIRLike, // !~* operator (PostgreSQL)
468    Limit,
469    List,
470    Load,
471    Local,
472    Lock,
473    Map,
474    Match,
475    MatchCondition,
476    MatchRecognize,
477    MemberOf,
478    Materialized,
479    Merge,
480    Mod,
481    Model,
482    Natural,
483    Next,
484    NoAction,
485    Nothing,
486    NotNull,
487    Null,
488    ObjectIdentifier,
489    Offset,
490    On,
491    Only,
492    Operator,
493    OrderBy,
494    OrderSiblingsBy,
495    Ordered,
496    Ordinality,
497    Out,
498    Outer,
499    Output,
500    Over,
501    Overlaps,
502    Overwrite,
503    Partition,
504    PartitionBy,
505    Percent,
506    Pivot,
507    Placeholder,
508    Positional,
509    Pragma,
510    Prewhere,
511    PrimaryKey,
512    Procedure,
513    Properties,
514    PseudoType,
515    Put,
516    Qualify,
517    Quote,
518    QDColon,
519    Range,
520    Recursive,
521    Refresh,
522    Rename,
523    Replace,
524    Returning,
525    Revoke,
526    References,
527    Restrict,
528    Right,
529    RLike,
530    Rollback,
531    Rollup,
532    Row,
533    Rows,
534    Select,
535    Semi,
536    Savepoint,
537    Separator,
538    Sequence,
539    Serde,
540    SerdeProperties,
541    Set,
542    Settings,
543    Show,
544    Siblings,
545    SimilarTo,
546    Some,
547    Sort,
548    SortBy,
549    SoundsLike,
550    StartWith,
551    StorageIntegration,
552    StraightJoin,
553    Struct,
554    Summarize,
555    TableSample,
556    Sample,
557    Bernoulli,
558    System,
559    Block,
560    Seed,
561    Repeatable,
562    Tag,
563    Temporary,
564    Transaction,
565    To,
566    Top,
567    Then,
568    True,
569    Truncate,
570    Uncache,
571    Union,
572    Unnest,
573    Unpivot,
574    Update,
575    Use,
576    Using,
577    Values,
578    View,
579    SemanticView,
580    Volatile,
581    When,
582    Where,
583    Window,
584    With,
585    Ties,
586    Exclude,
587    No,
588    Others,
589    Unique,
590    UtcDate,
591    UtcTime,
592    UtcTimestamp,
593    VersionSnapshot,
594    TimestampSnapshot,
595    Option,
596    Sink,
597    Source,
598    Analyze,
599    Namespace,
600    Export,
601    As,
602    By,
603    Nulls,
604    Respect,
605    Last,
606    If,
607    Cast,
608    TryCast,
609    SafeCast,
610    Count,
611    Extract,
612    Substring,
613    Trim,
614    Leading,
615    Trailing,
616    Both,
617    Position,
618    Overlaying,
619    Placing,
620    Treat,
621    Within,
622    Group,
623    Order,
624
625    // Window function keywords
626    Unbounded,
627    Preceding,
628    Following,
629    Current,
630    Groups,
631
632    // DDL-specific keywords (Phase 4)
633    Trigger,
634    Type,
635    Domain,
636    Returns,
637    Body,
638    Increment,
639    Minvalue,
640    Maxvalue,
641    Start,
642    Cycle,
643    NoCycle,
644    Prior,
645    Generated,
646    Identity,
647    Always,
648    // MATCH_RECOGNIZE tokens
649    Measures,
650    Pattern,
651    Define,
652    Running,
653    Owned,
654    After,
655    Before,
656    Instead,
657    Each,
658    Statement,
659    Referencing,
660    Old,
661    New,
662    Of,
663    Check,
664    Authorization,
665    Restart,
666
667    // Special
668    Eof,
669}
670
671impl TokenType {
672    /// Check if this token type is a keyword that can be used as an identifier in certain contexts
673    pub fn is_keyword(&self) -> bool {
674        matches!(
675            self,
676            TokenType::Select
677                | TokenType::From
678                | TokenType::Where
679                | TokenType::And
680                | TokenType::Or
681                | TokenType::Not
682                | TokenType::In
683                | TokenType::Is
684                | TokenType::Null
685                | TokenType::True
686                | TokenType::False
687                | TokenType::As
688                | TokenType::On
689                | TokenType::Join
690                | TokenType::Left
691                | TokenType::Right
692                | TokenType::Inner
693                | TokenType::Outer
694                | TokenType::Full
695                | TokenType::Cross
696                | TokenType::Semi
697                | TokenType::Anti
698                | TokenType::Union
699                | TokenType::Except
700                | TokenType::Intersect
701                | TokenType::GroupBy
702                | TokenType::OrderBy
703                | TokenType::Having
704                | TokenType::Limit
705                | TokenType::Offset
706                | TokenType::Case
707                | TokenType::When
708                | TokenType::Then
709                | TokenType::Else
710                | TokenType::End
711                | TokenType::Create
712                | TokenType::Drop
713                | TokenType::Alter
714                | TokenType::Insert
715                | TokenType::Update
716                | TokenType::Delete
717                | TokenType::Into
718                | TokenType::Values
719                | TokenType::Set
720                | TokenType::With
721                | TokenType::Distinct
722                | TokenType::All
723                | TokenType::Exists
724                | TokenType::Between
725                | TokenType::Like
726                | TokenType::ILike
727                // Additional keywords that can be used as identifiers
728                | TokenType::Filter
729                | TokenType::Date
730                | TokenType::Timestamp
731                | TokenType::TimestampTz
732                | TokenType::Interval
733                | TokenType::Time
734                | TokenType::Table
735                | TokenType::Index
736                | TokenType::Column
737                | TokenType::Database
738                | TokenType::Schema
739                | TokenType::View
740                | TokenType::Function
741                | TokenType::Procedure
742                | TokenType::Trigger
743                | TokenType::Sequence
744                | TokenType::Over
745                | TokenType::Partition
746                | TokenType::Window
747                | TokenType::Rows
748                | TokenType::Range
749                | TokenType::First
750                | TokenType::Last
751                | TokenType::Preceding
752                | TokenType::Following
753                | TokenType::Current
754                | TokenType::Row
755                | TokenType::Unbounded
756                | TokenType::Array
757                | TokenType::Struct
758                | TokenType::Map
759                | TokenType::PrimaryKey
760                | TokenType::Key
761                | TokenType::ForeignKey
762                | TokenType::References
763                | TokenType::Unique
764                | TokenType::Check
765                | TokenType::Default
766                | TokenType::Constraint
767                | TokenType::Comment
768                | TokenType::Rollup
769                | TokenType::Cube
770                | TokenType::Grant
771                | TokenType::Revoke
772                | TokenType::Type
773                | TokenType::Use
774                | TokenType::Cache
775                | TokenType::Uncache
776                | TokenType::Load
777                | TokenType::Any
778                | TokenType::Some
779                | TokenType::Asc
780                | TokenType::Desc
781                | TokenType::Nulls
782                | TokenType::Lateral
783                | TokenType::Natural
784                | TokenType::Escape
785                | TokenType::Glob
786                | TokenType::Match
787                | TokenType::Recursive
788                | TokenType::Replace
789                | TokenType::Returns
790                | TokenType::If
791                | TokenType::Pivot
792                | TokenType::Unpivot
793                | TokenType::Json
794                | TokenType::Blob
795                | TokenType::Text
796                | TokenType::Int
797                | TokenType::BigInt
798                | TokenType::SmallInt
799                | TokenType::TinyInt
800                | TokenType::Int128
801                | TokenType::UInt128
802                | TokenType::Int256
803                | TokenType::UInt256
804                | TokenType::UInt
805                | TokenType::UBigInt
806                | TokenType::Float
807                | TokenType::Double
808                | TokenType::Decimal
809                | TokenType::Boolean
810                | TokenType::VarChar
811                | TokenType::Char
812                | TokenType::Binary
813                | TokenType::VarBinary
814                | TokenType::No
815                | TokenType::DateTime
816                | TokenType::Truncate
817                | TokenType::Execute
818                | TokenType::Merge
819                | TokenType::Top
820                | TokenType::Begin
821                | TokenType::Generated
822                | TokenType::Identity
823                | TokenType::Always
824                | TokenType::Extract
825                // Keywords that can be identifiers in certain contexts
826                | TokenType::AsOf
827                | TokenType::Prior
828                | TokenType::After
829                | TokenType::Restrict
830                | TokenType::Cascade
831                | TokenType::Local
832                | TokenType::Rename
833                | TokenType::Enum
834                | TokenType::Within
835                | TokenType::Format
836                | TokenType::Final
837                | TokenType::FileFormat
838                | TokenType::Input
839                | TokenType::InputFormat
840                | TokenType::Copy
841                | TokenType::Put
842                | TokenType::Get
843                | TokenType::Show
844                | TokenType::Serde
845                | TokenType::Sample
846                | TokenType::Sort
847                | TokenType::Collate
848                | TokenType::Ties
849                | TokenType::IsNull
850                | TokenType::NotNull
851                | TokenType::Exclude
852                | TokenType::Temporary
853                | TokenType::Add
854                | TokenType::Ordinality
855                | TokenType::Overlaps
856                | TokenType::Block
857                | TokenType::Pattern
858                | TokenType::Group
859                | TokenType::Cluster
860                | TokenType::Repeatable
861                | TokenType::Groups
862                | TokenType::Commit
863                | TokenType::Warehouse
864                | TokenType::System
865                | TokenType::By
866                | TokenType::To
867                | TokenType::Fetch
868                | TokenType::For
869                | TokenType::Only
870                | TokenType::Next
871                | TokenType::Lock
872                | TokenType::Refresh
873                | TokenType::Settings
874                | TokenType::Operator
875                | TokenType::Overwrite
876                | TokenType::StraightJoin
877                | TokenType::Start
878                // Additional keywords registered in tokenizer but previously missing from is_keyword()
879                | TokenType::Ignore
880                | TokenType::Domain
881                | TokenType::Apply
882                | TokenType::Respect
883                | TokenType::Materialized
884                | TokenType::Prewhere
885                | TokenType::Old
886                | TokenType::New
887                | TokenType::Cast
888                | TokenType::TryCast
889                | TokenType::SafeCast
890                | TokenType::Transaction
891                | TokenType::Describe
892                | TokenType::Kill
893                | TokenType::Lambda
894                | TokenType::Declare
895                | TokenType::Keep
896                | TokenType::Output
897                | TokenType::Percent
898                | TokenType::Qualify
899                | TokenType::Returning
900                | TokenType::Language
901                | TokenType::Preserve
902                | TokenType::Savepoint
903                | TokenType::Rollback
904                | TokenType::Body
905                | TokenType::Increment
906                | TokenType::Minvalue
907                | TokenType::Maxvalue
908                | TokenType::Cycle
909                | TokenType::NoCycle
910                | TokenType::Seed
911                | TokenType::Namespace
912                | TokenType::Authorization
913                | TokenType::Order
914                | TokenType::Restart
915                | TokenType::Before
916                | TokenType::Instead
917                | TokenType::Each
918                | TokenType::Statement
919                | TokenType::Referencing
920                | TokenType::Of
921                | TokenType::Separator
922                | TokenType::Others
923                | TokenType::Placing
924                | TokenType::Owned
925                | TokenType::Running
926                | TokenType::Define
927                | TokenType::Measures
928                | TokenType::MatchRecognize
929                | TokenType::AutoIncrement
930                | TokenType::Connect
931                | TokenType::Distribute
932                | TokenType::Bernoulli
933                | TokenType::TableSample
934                | TokenType::Inpath
935                | TokenType::Pragma
936                | TokenType::Siblings
937                | TokenType::SerdeProperties
938                | TokenType::RLike
939        )
940    }
941
942    /// Check if this token type is a comparison operator
943    pub fn is_comparison(&self) -> bool {
944        matches!(
945            self,
946            TokenType::Eq
947                | TokenType::Neq
948                | TokenType::Lt
949                | TokenType::Lte
950                | TokenType::Gt
951                | TokenType::Gte
952                | TokenType::NullsafeEq
953        )
954    }
955
956    /// Check if this token type is an arithmetic operator
957    pub fn is_arithmetic(&self) -> bool {
958        matches!(
959            self,
960            TokenType::Plus
961                | TokenType::Dash
962                | TokenType::Star
963                | TokenType::Slash
964                | TokenType::Percent
965                | TokenType::Mod
966                | TokenType::Div
967        )
968    }
969}
970
971impl fmt::Display for TokenType {
972    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
973        write!(f, "{:?}", self)
974    }
975}
976
977// ── Cached default maps for TokenizerConfig ─────────────────────────────────
978
979static DEFAULT_KEYWORDS: LazyLock<HashMap<String, TokenType>> = LazyLock::new(|| {
980    let mut keywords = HashMap::with_capacity(300);
981    // Add basic SQL keywords
982    keywords.insert("SELECT".to_string(), TokenType::Select);
983    keywords.insert("FROM".to_string(), TokenType::From);
984    keywords.insert("WHERE".to_string(), TokenType::Where);
985    keywords.insert("AND".to_string(), TokenType::And);
986    keywords.insert("OR".to_string(), TokenType::Or);
987    keywords.insert("NOT".to_string(), TokenType::Not);
988    keywords.insert("AS".to_string(), TokenType::As);
989    keywords.insert("ON".to_string(), TokenType::On);
990    keywords.insert("JOIN".to_string(), TokenType::Join);
991    keywords.insert("LEFT".to_string(), TokenType::Left);
992    keywords.insert("RIGHT".to_string(), TokenType::Right);
993    keywords.insert("INNER".to_string(), TokenType::Inner);
994    keywords.insert("OUTER".to_string(), TokenType::Outer);
995    keywords.insert("OUTPUT".to_string(), TokenType::Output);
996    keywords.insert("FULL".to_string(), TokenType::Full);
997    keywords.insert("CROSS".to_string(), TokenType::Cross);
998    keywords.insert("SEMI".to_string(), TokenType::Semi);
999    keywords.insert("ANTI".to_string(), TokenType::Anti);
1000    keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1001    keywords.insert("UNION".to_string(), TokenType::Union);
1002    keywords.insert("EXCEPT".to_string(), TokenType::Except);
1003    keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle/Redshift alias for EXCEPT
1004    keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1005    keywords.insert("GROUP".to_string(), TokenType::Group);
1006    keywords.insert("CUBE".to_string(), TokenType::Cube);
1007    keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1008    keywords.insert("WITHIN".to_string(), TokenType::Within);
1009    keywords.insert("ORDER".to_string(), TokenType::Order);
1010    keywords.insert("BY".to_string(), TokenType::By);
1011    keywords.insert("HAVING".to_string(), TokenType::Having);
1012    keywords.insert("LIMIT".to_string(), TokenType::Limit);
1013    keywords.insert("OFFSET".to_string(), TokenType::Offset);
1014    keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1015    keywords.insert("FETCH".to_string(), TokenType::Fetch);
1016    keywords.insert("FIRST".to_string(), TokenType::First);
1017    keywords.insert("NEXT".to_string(), TokenType::Next);
1018    keywords.insert("ONLY".to_string(), TokenType::Only);
1019    keywords.insert("KEEP".to_string(), TokenType::Keep);
1020    keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1021    keywords.insert("INPUT".to_string(), TokenType::Input);
1022    keywords.insert("CASE".to_string(), TokenType::Case);
1023    keywords.insert("WHEN".to_string(), TokenType::When);
1024    keywords.insert("THEN".to_string(), TokenType::Then);
1025    keywords.insert("ELSE".to_string(), TokenType::Else);
1026    keywords.insert("END".to_string(), TokenType::End);
1027    keywords.insert("ENDIF".to_string(), TokenType::End); // Exasol alias for END
1028    keywords.insert("NULL".to_string(), TokenType::Null);
1029    keywords.insert("TRUE".to_string(), TokenType::True);
1030    keywords.insert("FALSE".to_string(), TokenType::False);
1031    keywords.insert("IS".to_string(), TokenType::Is);
1032    keywords.insert("IN".to_string(), TokenType::In);
1033    keywords.insert("BETWEEN".to_string(), TokenType::Between);
1034    keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1035    keywords.insert("LIKE".to_string(), TokenType::Like);
1036    keywords.insert("ILIKE".to_string(), TokenType::ILike);
1037    keywords.insert("RLIKE".to_string(), TokenType::RLike);
1038    keywords.insert("REGEXP".to_string(), TokenType::RLike);
1039    keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1040    keywords.insert("EXISTS".to_string(), TokenType::Exists);
1041    keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1042    keywords.insert("ALL".to_string(), TokenType::All);
1043    keywords.insert("WITH".to_string(), TokenType::With);
1044    keywords.insert("CREATE".to_string(), TokenType::Create);
1045    keywords.insert("DROP".to_string(), TokenType::Drop);
1046    keywords.insert("ALTER".to_string(), TokenType::Alter);
1047    keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1048    keywords.insert("TABLE".to_string(), TokenType::Table);
1049    keywords.insert("VIEW".to_string(), TokenType::View);
1050    keywords.insert("INDEX".to_string(), TokenType::Index);
1051    keywords.insert("COLUMN".to_string(), TokenType::Column);
1052    keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1053    keywords.insert("ADD".to_string(), TokenType::Add);
1054    keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1055    keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1056    keywords.insert("RENAME".to_string(), TokenType::Rename);
1057    keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1058    keywords.insert("TEMP".to_string(), TokenType::Temporary);
1059    keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1060    keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1061    keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1062    keywords.insert("KEY".to_string(), TokenType::Key);
1063    keywords.insert("KILL".to_string(), TokenType::Kill);
1064    keywords.insert("REFERENCES".to_string(), TokenType::References);
1065    keywords.insert("DEFAULT".to_string(), TokenType::Default);
1066    keywords.insert("DECLARE".to_string(), TokenType::Declare);
1067    keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1068    keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); // Snowflake style
1069    keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1070    keywords.insert("REPLACE".to_string(), TokenType::Replace);
1071    keywords.insert("TO".to_string(), TokenType::To);
1072    keywords.insert("INSERT".to_string(), TokenType::Insert);
1073    keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1074    keywords.insert("UPDATE".to_string(), TokenType::Update);
1075    keywords.insert("USE".to_string(), TokenType::Use);
1076    keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1077    keywords.insert("GLOB".to_string(), TokenType::Glob);
1078    keywords.insert("DELETE".to_string(), TokenType::Delete);
1079    keywords.insert("MERGE".to_string(), TokenType::Merge);
1080    keywords.insert("CACHE".to_string(), TokenType::Cache);
1081    keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1082    keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1083    keywords.insert("GRANT".to_string(), TokenType::Grant);
1084    keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1085    keywords.insert("COMMENT".to_string(), TokenType::Comment);
1086    keywords.insert("COLLATE".to_string(), TokenType::Collate);
1087    keywords.insert("INTO".to_string(), TokenType::Into);
1088    keywords.insert("VALUES".to_string(), TokenType::Values);
1089    keywords.insert("SET".to_string(), TokenType::Set);
1090    keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1091    keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1092    keywords.insert("ASC".to_string(), TokenType::Asc);
1093    keywords.insert("DESC".to_string(), TokenType::Desc);
1094    keywords.insert("NULLS".to_string(), TokenType::Nulls);
1095    keywords.insert("RESPECT".to_string(), TokenType::Respect);
1096    keywords.insert("FIRST".to_string(), TokenType::First);
1097    keywords.insert("LAST".to_string(), TokenType::Last);
1098    keywords.insert("IF".to_string(), TokenType::If);
1099    keywords.insert("CAST".to_string(), TokenType::Cast);
1100    keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1101    keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1102    keywords.insert("OVER".to_string(), TokenType::Over);
1103    keywords.insert("PARTITION".to_string(), TokenType::Partition);
1104    keywords.insert("PLACING".to_string(), TokenType::Placing);
1105    keywords.insert("WINDOW".to_string(), TokenType::Window);
1106    keywords.insert("ROWS".to_string(), TokenType::Rows);
1107    keywords.insert("RANGE".to_string(), TokenType::Range);
1108    keywords.insert("FILTER".to_string(), TokenType::Filter);
1109    keywords.insert("NATURAL".to_string(), TokenType::Natural);
1110    keywords.insert("USING".to_string(), TokenType::Using);
1111    keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1112    keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1113    keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1114    keywords.insert("CURRENT".to_string(), TokenType::Current);
1115    keywords.insert("ROW".to_string(), TokenType::Row);
1116    keywords.insert("GROUPS".to_string(), TokenType::Groups);
1117    keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1118    // TRIM function position keywords
1119    keywords.insert("BOTH".to_string(), TokenType::Both);
1120    keywords.insert("LEADING".to_string(), TokenType::Leading);
1121    keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1122    keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1123    // Phase 3: Additional keywords
1124    keywords.insert("TOP".to_string(), TokenType::Top);
1125    keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1126    keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1127    keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1128    keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1129    keywords.insert("SYSTEM".to_string(), TokenType::System);
1130    keywords.insert("BLOCK".to_string(), TokenType::Block);
1131    keywords.insert("SEED".to_string(), TokenType::Seed);
1132    keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1133    keywords.insert("TIES".to_string(), TokenType::Ties);
1134    keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1135    keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1136    keywords.insert("APPLY".to_string(), TokenType::Apply);
1137    // Oracle CONNECT BY keywords
1138    keywords.insert("CONNECT".to_string(), TokenType::Connect);
1139    // Hive/Spark specific keywords
1140    keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1141    keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1142    keywords.insert("SORT".to_string(), TokenType::Sort);
1143    keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1144    keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1145    keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1146    keywords.insert("FOR".to_string(), TokenType::For);
1147    keywords.insert("ANY".to_string(), TokenType::Any);
1148    keywords.insert("SOME".to_string(), TokenType::Some);
1149    keywords.insert("ASOF".to_string(), TokenType::AsOf);
1150    keywords.insert("PERCENT".to_string(), TokenType::Percent);
1151    keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1152    keywords.insert("NO".to_string(), TokenType::No);
1153    keywords.insert("OTHERS".to_string(), TokenType::Others);
1154    // PostgreSQL OPERATOR() syntax for schema-qualified operators
1155    keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1156    // Phase 4: DDL keywords
1157    keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1158    keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1159    keywords.insert("DATABASE".to_string(), TokenType::Database);
1160    keywords.insert("FUNCTION".to_string(), TokenType::Function);
1161    keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1162    keywords.insert("PROC".to_string(), TokenType::Procedure);
1163    keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1164    keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1165    keywords.insert("TYPE".to_string(), TokenType::Type);
1166    keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1167    keywords.insert("RETURNS".to_string(), TokenType::Returns);
1168    keywords.insert("RETURNING".to_string(), TokenType::Returning);
1169    keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1170    keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1171    keywords.insert("COMMIT".to_string(), TokenType::Commit);
1172    keywords.insert("BEGIN".to_string(), TokenType::Begin);
1173    keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1174    keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1175    keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1176    keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1177    keywords.insert("BODY".to_string(), TokenType::Body);
1178    keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1179    keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1180    keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1181    keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1182    keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1183    keywords.insert("PRIOR".to_string(), TokenType::Prior);
1184    // MATCH_RECOGNIZE keywords
1185    keywords.insert("MATCH".to_string(), TokenType::Match);
1186    keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1187    keywords.insert("MEASURES".to_string(), TokenType::Measures);
1188    keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1189    keywords.insert("DEFINE".to_string(), TokenType::Define);
1190    keywords.insert("RUNNING".to_string(), TokenType::Running);
1191    keywords.insert("FINAL".to_string(), TokenType::Final);
1192    keywords.insert("OWNED".to_string(), TokenType::Owned);
1193    keywords.insert("AFTER".to_string(), TokenType::After);
1194    keywords.insert("BEFORE".to_string(), TokenType::Before);
1195    keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1196    keywords.insert("EACH".to_string(), TokenType::Each);
1197    keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1198    keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1199    keywords.insert("OLD".to_string(), TokenType::Old);
1200    keywords.insert("NEW".to_string(), TokenType::New);
1201    keywords.insert("OF".to_string(), TokenType::Of);
1202    keywords.insert("CHECK".to_string(), TokenType::Check);
1203    keywords.insert("START".to_string(), TokenType::Start);
1204    keywords.insert("ENUM".to_string(), TokenType::Enum);
1205    keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1206    keywords.insert("RESTART".to_string(), TokenType::Restart);
1207    // Date/time literal keywords
1208    keywords.insert("DATE".to_string(), TokenType::Date);
1209    keywords.insert("TIME".to_string(), TokenType::Time);
1210    keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1211    keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1212    keywords.insert("GENERATED".to_string(), TokenType::Generated);
1213    keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1214    keywords.insert("ALWAYS".to_string(), TokenType::Always);
1215    // LOAD DATA keywords
1216    keywords.insert("LOAD".to_string(), TokenType::Load);
1217    keywords.insert("LOCAL".to_string(), TokenType::Local);
1218    keywords.insert("INPATH".to_string(), TokenType::Inpath);
1219    keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1220    keywords.insert("SERDE".to_string(), TokenType::Serde);
1221    keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1222    keywords.insert("FORMAT".to_string(), TokenType::Format);
1223    // SQLite
1224    keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1225    // SHOW statement
1226    keywords.insert("SHOW".to_string(), TokenType::Show);
1227    // Oracle ORDER SIBLINGS BY (hierarchical queries)
1228    keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1229    // COPY and PUT statements (Snowflake, PostgreSQL)
1230    keywords.insert("COPY".to_string(), TokenType::Copy);
1231    keywords.insert("PUT".to_string(), TokenType::Put);
1232    keywords.insert("GET".to_string(), TokenType::Get);
1233    // EXEC/EXECUTE statement (TSQL, etc.)
1234    keywords.insert("EXEC".to_string(), TokenType::Execute);
1235    keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1236    // Postfix null check operators (PostgreSQL/SQLite)
1237    keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1238    keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1239    keywords
1240});
1241
1242static DEFAULT_SINGLE_TOKENS: LazyLock<HashMap<char, TokenType>> = LazyLock::new(|| {
1243    let mut single_tokens = HashMap::with_capacity(30);
1244    single_tokens.insert('(', TokenType::LParen);
1245    single_tokens.insert(')', TokenType::RParen);
1246    single_tokens.insert('[', TokenType::LBracket);
1247    single_tokens.insert(']', TokenType::RBracket);
1248    single_tokens.insert('{', TokenType::LBrace);
1249    single_tokens.insert('}', TokenType::RBrace);
1250    single_tokens.insert(',', TokenType::Comma);
1251    single_tokens.insert('.', TokenType::Dot);
1252    single_tokens.insert(';', TokenType::Semicolon);
1253    single_tokens.insert('+', TokenType::Plus);
1254    single_tokens.insert('-', TokenType::Dash);
1255    single_tokens.insert('*', TokenType::Star);
1256    single_tokens.insert('/', TokenType::Slash);
1257    single_tokens.insert('%', TokenType::Percent);
1258    single_tokens.insert('&', TokenType::Amp);
1259    single_tokens.insert('|', TokenType::Pipe);
1260    single_tokens.insert('^', TokenType::Caret);
1261    single_tokens.insert('~', TokenType::Tilde);
1262    single_tokens.insert('<', TokenType::Lt);
1263    single_tokens.insert('>', TokenType::Gt);
1264    single_tokens.insert('=', TokenType::Eq);
1265    single_tokens.insert('!', TokenType::Exclamation);
1266    single_tokens.insert(':', TokenType::Colon);
1267    single_tokens.insert('@', TokenType::DAt);
1268    single_tokens.insert('#', TokenType::Hash);
1269    single_tokens.insert('$', TokenType::Dollar);
1270    single_tokens.insert('?', TokenType::Parameter);
1271    single_tokens
1272});
1273
/// Default string-literal delimiters, mapping each opening delimiter to the
/// delimiter that closes it.
static DEFAULT_QUOTES: LazyLock<HashMap<String, String>> = LazyLock::new(|| {
    // (opening, closing) pairs: the plain single quote, then the
    // triple-quoted form (e.g., """x""").
    let pairs = [("'", "'"), ("\"\"\"", "\"\"\"")];

    let mut table = HashMap::with_capacity(4);
    table.extend(
        pairs
            .iter()
            .map(|&(open, close)| (open.to_string(), close.to_string())),
    );
    table
});
1281
/// Default quoted-identifier delimiters, mapping each opening character to
/// its closing character. Both defaults (`"` and `` ` ``) close with the same
/// character that opens them.
static DEFAULT_IDENTIFIERS: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
    // Note: TSQL bracket-quoted identifiers [name] are handled in the parser
    // because [ is also used for arrays and subscripts
    let mut table = HashMap::with_capacity(4);
    table.extend(['"', '`'].map(|quote| (quote, quote)));
    table
});
1290
/// Default comment markers, mapping each opening marker to its optional
/// closing marker. `--` has no closing marker (`None`); `/*` closes with `*/`.
static DEFAULT_COMMENTS: LazyLock<HashMap<String, Option<String>>> = LazyLock::new(|| {
    let markers: [(&str, Option<&str>); 2] = [("--", None), ("/*", Some("*/"))];

    let mut table = HashMap::with_capacity(4);
    table.extend(
        markers
            .iter()
            .map(|&(open, close)| (open.to_string(), close.map(String::from))),
    );
    table
});
1297
/// Tokenizer configuration for a dialect.
///
/// The `Default` implementation supplies the baseline values; each field's
/// documentation below notes the value that implementation assigns.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keywords mapping (uppercase keyword -> token type).
    /// Default: a clone of `DEFAULT_KEYWORDS`.
    pub keywords: HashMap<String, TokenType>,
    /// Single character tokens (character -> token type).
    /// Default: a clone of `DEFAULT_SINGLE_TOKENS`.
    pub single_tokens: HashMap<char, TokenType>,
    /// String quote delimiters (start -> end).
    /// Default: `'` and the triple double quote `"""`, each closing with itself.
    pub quotes: HashMap<String, String>,
    /// Identifier quote characters (start -> end).
    /// Default: `"` and `` ` ``, each closing with itself.
    pub identifiers: HashMap<char, char>,
    /// Comment definitions (start -> optional end).
    /// Default: `--` with no end marker and `/*` ending with `*/`.
    /// The whitespace skipper treats `//` as a line comment only when this map
    /// contains the key `"//"` (or when `hash_comments` is set).
    pub comments: HashMap<String, Option<String>>,
    /// String escape characters.
    /// Default: only `'` (a doubled quote escapes a quote).
    pub string_escapes: Vec<char>,
    /// Whether to support nested comments. When true, a `/*` encountered inside
    /// a block comment opens a further nesting level that needs its own `*/`.
    /// Default: true.
    pub nested_comments: bool,
    /// Valid escape follow characters (for MySQL-style escaping).
    /// When a backslash is followed by a character NOT in this list,
    /// the backslash is discarded. When empty, all backslash escapes
    /// preserve the backslash for unrecognized sequences.
    /// Default: empty.
    pub escape_follow_chars: Vec<char>,
    /// Whether b'...' is a byte string (true for BigQuery) or bit string (false for standard SQL).
    /// Default is false (bit string).
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}
    /// Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT).
    /// Default: empty map.
    pub numeric_literals: HashMap<String, String>,
    /// Whether unquoted identifiers can start with a digit (e.g., `1a`, `1_a`).
    /// When true, a number followed by letters/underscore is treated as an identifier.
    /// Used by Hive, Spark, MySQL, ClickHouse.
    /// Default: false.
    pub identifiers_can_start_with_digit: bool,
    /// Whether 0x/0X prefix should be treated as hex literals.
    /// When true, `0XCC` is tokenized instead of Number("0") + Identifier("XCC").
    /// Used by BigQuery, SQLite, Teradata.
    /// Default: false.
    pub hex_number_strings: bool,
    /// Whether hex string literals from 0x prefix represent integer values.
    /// When true (BigQuery), 0xA is tokenized as HexNumber (integer in hex notation).
    /// When false (SQLite, Teradata), 0xCC is tokenized as HexString (binary/blob value).
    /// Default: false.
    pub hex_string_is_integer_type: bool,
    /// Whether string escape sequences (like \') are allowed in raw strings.
    /// When true (BigQuery default), \' inside r'...' escapes the quote.
    /// When false (Spark/Databricks), backslashes in raw strings are always literal.
    /// Python sqlglot: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True).
    /// Default: true.
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether # starts a single-line comment (ClickHouse, MySQL).
    /// The whitespace skipper also treats `//` as a single-line comment when
    /// this flag is set.
    /// Default: false.
    pub hash_comments: bool,
    /// Whether $ can start/continue an identifier (ClickHouse).
    /// When true, a bare `$` that is not part of a dollar-quoted string or positional
    /// parameter is treated as an identifier character.
    /// Default: false.
    pub dollar_sign_is_identifier: bool,
    /// Whether INSERT ... FORMAT <name> should treat subsequent data as raw (ClickHouse).
    /// When true, after tokenizing `INSERT ... FORMAT <non-VALUES-name>`, all text until
    /// the next blank line or end of input is consumed as a raw data token.
    /// Default: false.
    pub insert_format_raw_data: bool,
    /// Whether numeric literals can contain underscores as digit separators.
    /// When true, `1_000` is tokenized as `1000`. Used by ClickHouse and DuckDB.
    /// Python sqlglot: NUMBERS_CAN_BE_UNDERSCORE_SEPARATED (default False).
    /// Default: false.
    pub numbers_can_be_underscore_separated: bool,
    /// Recover strings like `'a\' or 1=1` by treating the escaped quote as the
    /// closing quote when no later quote exists. This matches SQLGlot's permissive
    /// handling for a few malformed ClickHouse SHOW LIKE fixtures.
    /// Default: false.
    pub recover_terminal_backslash_quote: bool,
    /// Recover a terminal single-quoted string without a closing quote by treating
    /// end-of-input as the close. This is only enabled for ClickHouse fixture
    /// coverage, where some extracted corpus rows contain partial string probes.
    /// Default: false.
    pub recover_unterminated_string: bool,
}
1366
1367impl Default for TokenizerConfig {
1368    fn default() -> Self {
1369        Self {
1370            keywords: DEFAULT_KEYWORDS.clone(),
1371            single_tokens: DEFAULT_SINGLE_TOKENS.clone(),
1372            quotes: DEFAULT_QUOTES.clone(),
1373            identifiers: DEFAULT_IDENTIFIERS.clone(),
1374            comments: DEFAULT_COMMENTS.clone(),
1375            // Standard SQL: only '' (doubled quote) escapes a quote
1376            // Backslash escapes are dialect-specific (MySQL, etc.)
1377            string_escapes: vec!['\''],
1378            nested_comments: true,
1379            // By default, no escape_follow_chars means preserve backslash for unrecognized escapes
1380            escape_follow_chars: vec![],
1381            // Default: b'...' is bit string (standard SQL), not byte string (BigQuery)
1382            b_prefix_is_byte_string: false,
1383            numeric_literals: HashMap::new(),
1384            identifiers_can_start_with_digit: false,
1385            hex_number_strings: false,
1386            hex_string_is_integer_type: false,
1387            // Default: backslash escapes ARE allowed in raw strings (sqlglot default)
1388            // Spark/Databricks set this to false
1389            string_escapes_allowed_in_raw_strings: true,
1390            hash_comments: false,
1391            dollar_sign_is_identifier: false,
1392            insert_format_raw_data: false,
1393            numbers_can_be_underscore_separated: false,
1394            recover_terminal_backslash_quote: false,
1395            recover_unterminated_string: false,
1396        }
1397    }
1398}
1399
/// SQL Tokenizer.
///
/// Owns a [`TokenizerConfig`] and borrows it for every call to `tokenize`;
/// no scanning state is stored on the tokenizer itself.
pub struct Tokenizer {
    /// Settings applied to every tokenization run.
    config: TokenizerConfig,
}
1404
1405impl Tokenizer {
1406    /// Create a new tokenizer with the given configuration
1407    pub fn new(config: TokenizerConfig) -> Self {
1408        Self { config }
1409    }
1410
1411    /// Create a tokenizer with default configuration
1412    pub fn default_config() -> Self {
1413        Self::new(TokenizerConfig::default())
1414    }
1415
1416    /// Tokenize a SQL string
1417    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1418        let mut state = TokenizerState::new(sql, &self.config);
1419        state.tokenize()
1420    }
1421}
1422
1423impl Default for Tokenizer {
1424    fn default() -> Self {
1425        Self::default_config()
1426    }
1427}
1428
/// Internal state for tokenization.
///
/// One instance is created per input string. Positions (`start`, `current`)
/// are indices into `chars`, i.e. they count characters rather than bytes.
struct TokenizerState<'a> {
    /// The SQL text being tokenized, as passed in.
    source: &'a str,
    /// Whether `source` is pure ASCII. When it is, text can be sliced straight
    /// out of `source`, because character and byte offsets coincide.
    source_is_ascii: bool,
    /// `source` decoded into individual characters for indexed lookahead.
    chars: Vec<char>,
    /// Number of characters in `chars` (set from `chars.len()` on construction).
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Index of the first character of the token currently being scanned.
    start: usize,
    /// Index of the next character to be read.
    current: usize,
    /// Current line number (1-based); incremented whenever a `\n` is consumed.
    line: usize,
    /// Current column number (1-based); reset to 1 after a `\n`, otherwise
    /// incremented per consumed character.
    column: usize,
    /// Comments collected but not yet attached to a token. Whatever is left at
    /// end of input is moved onto the last token's trailing comments.
    comments: Vec<String>,
    /// Dialect settings that drive the scan.
    config: &'a TokenizerConfig,
}
1443
1444impl<'a> TokenizerState<'a> {
1445    fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1446        let chars: Vec<char> = sql.chars().collect();
1447        let size = chars.len();
1448        Self {
1449            source: sql,
1450            source_is_ascii: sql.is_ascii(),
1451            chars,
1452            size,
1453            tokens: Vec::new(),
1454            start: 0,
1455            current: 0,
1456            line: 1,
1457            column: 1,
1458            comments: Vec::new(),
1459            config,
1460        }
1461    }
1462
1463    fn tokenize(&mut self) -> Result<Vec<Token>> {
1464        while !self.is_at_end() {
1465            self.skip_whitespace();
1466            if self.is_at_end() {
1467                break;
1468            }
1469
1470            self.start = self.current;
1471            self.scan_token()?;
1472
1473            // ClickHouse: After INSERT ... FORMAT <name> (where name != VALUES),
1474            // the rest until the next blank line or end of input is raw data.
1475            if self.config.insert_format_raw_data {
1476                if let Some(raw) = self.try_scan_insert_format_raw_data() {
1477                    if !raw.is_empty() {
1478                        self.start = self.current;
1479                        self.add_token_with_text(TokenType::Var, raw);
1480                    }
1481                }
1482            }
1483        }
1484
1485        // Handle leftover leading comments at end of input.
1486        // These are comments on a new line after the last token that couldn't be attached
1487        // as leading comments to a subsequent token (because there is none).
1488        // Attach them as trailing comments on the last token so they're preserved.
1489        if !self.comments.is_empty() {
1490            if let Some(last) = self.tokens.last_mut() {
1491                last.trailing_comments.extend(self.comments.drain(..));
1492            }
1493        }
1494
1495        Ok(std::mem::take(&mut self.tokens))
1496    }
1497
1498    #[inline]
1499    fn is_at_end(&self) -> bool {
1500        self.current >= self.size
1501    }
1502
1503    #[inline]
1504    fn text_from_range(&self, start: usize, end: usize) -> String {
1505        if self.source_is_ascii {
1506            self.source[start..end].to_string()
1507        } else {
1508            self.chars[start..end].iter().collect()
1509        }
1510    }
1511
1512    #[inline]
1513    fn peek(&self) -> char {
1514        if self.is_at_end() {
1515            '\0'
1516        } else {
1517            self.chars[self.current]
1518        }
1519    }
1520
1521    #[inline]
1522    fn peek_next(&self) -> char {
1523        if self.current + 1 >= self.size {
1524            '\0'
1525        } else {
1526            self.chars[self.current + 1]
1527        }
1528    }
1529
1530    #[inline]
1531    fn advance(&mut self) -> char {
1532        let c = self.peek();
1533        self.current += 1;
1534        if c == '\n' {
1535            self.line += 1;
1536            self.column = 1;
1537        } else {
1538            self.column += 1;
1539        }
1540        c
1541    }
1542
1543    fn skip_whitespace(&mut self) {
1544        // Track whether we've seen a newline since the last token.
1545        // Comments on a new line (after a newline) are leading comments on the next token,
1546        // while comments on the same line are trailing comments on the previous token.
1547        // This matches Python sqlglot's behavior.
1548        let mut saw_newline = false;
1549        while !self.is_at_end() {
1550            let c = self.peek();
1551            match c {
1552                ' ' | '\t' | '\r' => {
1553                    self.advance();
1554                }
1555                '\n' => {
1556                    saw_newline = true;
1557                    self.advance();
1558                }
1559                '\u{00A0}' // non-breaking space
1560                | '\u{2000}'..='\u{200B}' // various Unicode spaces + zero-width space
1561                | '\u{3000}' // ideographic (full-width) space
1562                | '\u{FEFF}' // BOM / zero-width no-break space
1563                => {
1564                    self.advance();
1565                }
1566                '-' if self.peek_next() == '-' => {
1567                    self.scan_line_comment(saw_newline);
1568                    // After a line comment, we're always on a new line
1569                    saw_newline = true;
1570                }
1571                '/' if self.peek_next() == '/' && self.config.hash_comments => {
1572                    // ClickHouse: // single-line comments (same dialects that support # comments)
1573                    self.scan_double_slash_comment();
1574                }
1575                '/' if self.peek_next() == '*' => {
1576                    // Check if this is a hint comment /*+ ... */
1577                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
1578                        // This is a hint comment, handle it as a token instead of skipping
1579                        break;
1580                    }
1581                    if self.scan_block_comment(saw_newline).is_err() {
1582                        return;
1583                    }
1584                    // Don't reset saw_newline - it carries forward
1585                }
1586                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
1587                    // Dialect-specific // line comment (e.g., Snowflake)
1588                    // But NOT inside URIs like file:// or paths with consecutive slashes
1589                    // Check that previous non-whitespace char is not ':' or '/'
1590                    let prev_non_ws = if self.current > 0 {
1591                        let mut i = self.current - 1;
1592                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
1593                            i -= 1;
1594                        }
1595                        self.chars[i]
1596                    } else {
1597                        '\0'
1598                    };
1599                    if prev_non_ws == ':' || prev_non_ws == '/' {
1600                        // This is likely a URI (file://, http://) or path, not a comment
1601                        break;
1602                    }
1603                    self.scan_line_comment(saw_newline);
1604                    // After a line comment, we're always on a new line
1605                    saw_newline = true;
1606                }
1607                '#' if self.config.hash_comments => {
1608                    self.scan_hash_line_comment();
1609                }
1610                _ => break,
1611            }
1612        }
1613    }
1614
1615    fn scan_hash_line_comment(&mut self) {
1616        self.advance(); // #
1617        let start = self.current;
1618        while !self.is_at_end() && self.peek() != '\n' {
1619            self.advance();
1620        }
1621        let comment = self.text_from_range(start, self.current);
1622        let comment_text = comment.trim().to_string();
1623        if let Some(last) = self.tokens.last_mut() {
1624            last.trailing_comments.push(comment_text);
1625        } else {
1626            self.comments.push(comment_text);
1627        }
1628    }
1629
1630    fn scan_double_slash_comment(&mut self) {
1631        self.advance(); // /
1632        self.advance(); // /
1633        let start = self.current;
1634        while !self.is_at_end() && self.peek() != '\n' {
1635            self.advance();
1636        }
1637        let comment = self.text_from_range(start, self.current);
1638        let comment_text = comment.trim().to_string();
1639        if let Some(last) = self.tokens.last_mut() {
1640            last.trailing_comments.push(comment_text);
1641        } else {
1642            self.comments.push(comment_text);
1643        }
1644    }
1645
1646    fn scan_line_comment(&mut self, after_newline: bool) {
1647        self.advance(); // -
1648        self.advance(); // -
1649        let start = self.current;
1650        while !self.is_at_end() && self.peek() != '\n' {
1651            self.advance();
1652        }
1653        let comment_text = self.text_from_range(start, self.current);
1654
1655        // If the comment starts on a new line (after_newline), it's a leading comment
1656        // on the next token. Otherwise, it's a trailing comment on the previous token.
1657        if after_newline || self.tokens.is_empty() {
1658            self.comments.push(comment_text);
1659        } else if let Some(last) = self.tokens.last_mut() {
1660            last.trailing_comments.push(comment_text);
1661        }
1662    }
1663
1664    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
1665        self.advance(); // /
1666        self.advance(); // *
1667        let content_start = self.current;
1668        let mut depth = 1;
1669
1670        while !self.is_at_end() && depth > 0 {
1671            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
1672                self.advance();
1673                self.advance();
1674                depth += 1;
1675            } else if self.peek() == '*' && self.peek_next() == '/' {
1676                depth -= 1;
1677                if depth > 0 {
1678                    self.advance();
1679                    self.advance();
1680                }
1681            } else {
1682                self.advance();
1683            }
1684        }
1685
1686        if depth > 0 {
1687            return Err(Error::tokenize(
1688                "Unterminated block comment",
1689                self.line,
1690                self.column,
1691                self.start,
1692                self.current,
1693            ));
1694        }
1695
1696        // Get the content between /* and */ (preserving internal whitespace for nested comments)
1697        let content = self.text_from_range(content_start, self.current);
1698        self.advance(); // *
1699        self.advance(); // /
1700
1701        // For round-trip fidelity, preserve the exact comment content including nested comments
1702        let comment_text = format!("/*{}*/", content);
1703
1704        // If the comment starts on a new line (after_newline), it's a leading comment
1705        // on the next token. Otherwise, it's a trailing comment on the previous token.
1706        if after_newline || self.tokens.is_empty() {
1707            self.comments.push(comment_text);
1708        } else if let Some(last) = self.tokens.last_mut() {
1709            last.trailing_comments.push(comment_text);
1710        }
1711
1712        Ok(())
1713    }
1714
1715    /// Scan a hint comment /*+ ... */ and return it as a Hint token
1716    fn scan_hint(&mut self) -> Result<()> {
1717        self.advance(); // /
1718        self.advance(); // *
1719        self.advance(); // +
1720        let hint_start = self.current;
1721
1722        // Scan until we find */
1723        while !self.is_at_end() {
1724            if self.peek() == '*' && self.peek_next() == '/' {
1725                break;
1726            }
1727            self.advance();
1728        }
1729
1730        if self.is_at_end() {
1731            return Err(Error::tokenize(
1732                "Unterminated hint comment",
1733                self.line,
1734                self.column,
1735                self.start,
1736                self.current,
1737            ));
1738        }
1739
1740        let hint_text = self.text_from_range(hint_start, self.current);
1741        self.advance(); // *
1742        self.advance(); // /
1743
1744        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1745
1746        Ok(())
1747    }
1748
1749    /// Scan a positional parameter: $1, $2, etc.
1750    fn scan_positional_parameter(&mut self) -> Result<()> {
1751        self.advance(); // consume $
1752        let start = self.current;
1753
1754        while !self.is_at_end() && self.peek().is_ascii_digit() {
1755            self.advance();
1756        }
1757
1758        let number = self.text_from_range(start, self.current);
1759        self.add_token_with_text(TokenType::Parameter, number);
1760        Ok(())
1761    }
1762
1763    /// Try to scan a tagged dollar-quoted string: $tag$content$tag$
1764    /// Returns Some(()) if successful, None if this isn't a tagged dollar string.
1765    ///
1766    /// The token text is stored as "tag\x00content" to preserve the tag for later use.
1767    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
1768        let saved_pos = self.current;
1769
1770        // We're at '$', next char is alphabetic
1771        self.advance(); // consume opening $
1772
1773        // Scan the tag (identifier: alphanumeric + underscore, including Unicode)
1774        // Tags can contain Unicode characters like emojis (e.g., $🦆$)
1775        let tag_start = self.current;
1776        while !self.is_at_end()
1777            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1778        {
1779            self.advance();
1780        }
1781        let tag = self.text_from_range(tag_start, self.current);
1782
1783        // Must have a closing $ after the tag
1784        if self.is_at_end() || self.peek() != '$' {
1785            // Not a tagged dollar string - restore position
1786            self.current = saved_pos;
1787            return Ok(None);
1788        }
1789        self.advance(); // consume closing $ of opening tag
1790
1791        // Now scan content until we find $tag$
1792        let content_start = self.current;
1793        let closing_tag = format!("${}$", tag);
1794        let closing_chars: Vec<char> = closing_tag.chars().collect();
1795
1796        loop {
1797            if self.is_at_end() {
1798                // Unterminated - restore and fall through
1799                self.current = saved_pos;
1800                return Ok(None);
1801            }
1802
1803            // Check if we've reached the closing tag
1804            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1805                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1806                    self.current + j < self.size && self.chars[self.current + j] == ch
1807                });
1808                if matches {
1809                    let content = self.text_from_range(content_start, self.current);
1810                    // Consume closing tag
1811                    for _ in 0..closing_chars.len() {
1812                        self.advance();
1813                    }
1814                    // Store as "tag\x00content" to preserve the tag
1815                    let token_text = format!("{}\x00{}", tag, content);
1816                    self.add_token_with_text(TokenType::DollarString, token_text);
1817                    return Ok(Some(()));
1818                }
1819            }
1820            self.advance();
1821        }
1822    }
1823
1824    /// Scan a dollar-quoted string: $$content$$ or $tag$content$tag$
1825    ///
1826    /// For $$...$$ (no tag), the token text is just the content.
1827    /// For $tag$...$tag$, use try_scan_tagged_dollar_string instead.
1828    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1829        self.advance(); // consume first $
1830        self.advance(); // consume second $
1831
1832        // For $$...$$ (no tag), just scan until closing $$
1833        let start = self.current;
1834        while !self.is_at_end() {
1835            if self.peek() == '$'
1836                && self.current + 1 < self.size
1837                && self.chars[self.current + 1] == '$'
1838            {
1839                break;
1840            }
1841            self.advance();
1842        }
1843
1844        let content = self.text_from_range(start, self.current);
1845
1846        if !self.is_at_end() {
1847            self.advance(); // consume first $
1848            self.advance(); // consume second $
1849        }
1850
1851        self.add_token_with_text(TokenType::DollarString, content);
1852        Ok(())
1853    }
1854
1855    fn scan_token(&mut self) -> Result<()> {
1856        let c = self.peek();
1857
1858        // Check for string literal
1859        if c == '\'' {
1860            // Check for triple-quoted string '''...''' if configured
1861            if self.config.quotes.contains_key("'''")
1862                && self.peek_next() == '\''
1863                && self.current + 2 < self.size
1864                && self.chars[self.current + 2] == '\''
1865            {
1866                return self.scan_triple_quoted_string('\'');
1867            }
1868            return self.scan_string();
1869        }
1870
1871        // Check for triple-quoted string """...""" if configured
1872        if c == '"'
1873            && self.config.quotes.contains_key("\"\"\"")
1874            && self.peek_next() == '"'
1875            && self.current + 2 < self.size
1876            && self.chars[self.current + 2] == '"'
1877        {
1878            return self.scan_triple_quoted_string('"');
1879        }
1880
1881        // Check for double-quoted strings when dialect supports them (e.g., BigQuery)
1882        // This must come before identifier quotes check
1883        if c == '"'
1884            && self.config.quotes.contains_key("\"")
1885            && !self.config.identifiers.contains_key(&'"')
1886        {
1887            return self.scan_double_quoted_string();
1888        }
1889
1890        // Check for identifier quotes
1891        if let Some(&end_quote) = self.config.identifiers.get(&c) {
1892            return self.scan_quoted_identifier(end_quote);
1893        }
1894
1895        // Check for numbers (including numbers starting with a dot like .25)
1896        if c.is_ascii_digit() {
1897            return self.scan_number();
1898        }
1899
1900        // Check for numbers starting with a dot (e.g., .25, .5)
1901        // This must come before single character token handling
1902        // Don't treat as a number if:
1903        // - Previous char was also a dot (e.g., 1..2 should be 1, ., ., 2)
1904        // - Previous char is an identifier character (e.g., foo.25 should be foo, ., 25)
1905        //   This handles BigQuery numeric table parts like project.dataset.25
1906        if c == '.' && self.peek_next().is_ascii_digit() {
1907            let prev_char = if self.current > 0 {
1908                self.chars[self.current - 1]
1909            } else {
1910                '\0'
1911            };
1912            let is_after_ident = prev_char.is_alphanumeric()
1913                || prev_char == '_'
1914                || prev_char == '`'
1915                || prev_char == '"'
1916                || prev_char == ']'
1917                || prev_char == ')';
1918            if prev_char != '.' && !is_after_ident {
1919                return self.scan_number_starting_with_dot();
1920            }
1921        }
1922
1923        // Check for hint comment /*+ ... */
1924        if c == '/'
1925            && self.peek_next() == '*'
1926            && self.current + 2 < self.size
1927            && self.chars[self.current + 2] == '+'
1928        {
1929            return self.scan_hint();
1930        }
1931
1932        // Check for multi-character operators first
1933        if let Some(token_type) = self.try_scan_multi_char_operator() {
1934            self.add_token(token_type);
1935            return Ok(());
1936        }
1937
1938        // Check for tagged dollar-quoted strings: $tag$content$tag$
1939        // Tags can contain Unicode characters (including emojis like 🦆) and digits (e.g., $1$)
1940        if c == '$'
1941            && (self.peek_next().is_alphanumeric()
1942                || self.peek_next() == '_'
1943                || !self.peek_next().is_ascii())
1944        {
1945            if let Some(()) = self.try_scan_tagged_dollar_string()? {
1946                return Ok(());
1947            }
1948            // If tagged dollar string didn't match and dollar_sign_is_identifier is set,
1949            // treat the $ and following chars as an identifier (e.g., ClickHouse $alias$name$).
1950            if self.config.dollar_sign_is_identifier {
1951                return self.scan_dollar_identifier();
1952            }
1953        }
1954
1955        // Check for dollar-quoted strings: $$...$$
1956        if c == '$' && self.peek_next() == '$' {
1957            return self.scan_dollar_quoted_string();
1958        }
1959
1960        // Check for positional parameters: $1, $2, etc.
1961        if c == '$' && self.peek_next().is_ascii_digit() {
1962            return self.scan_positional_parameter();
1963        }
1964
1965        // ClickHouse: bare $ (not followed by alphanumeric/underscore) as identifier
1966        if c == '$' && self.config.dollar_sign_is_identifier {
1967            return self.scan_dollar_identifier();
1968        }
1969
1970        // TSQL: Check for identifiers starting with # (temp tables) or @ (variables)
1971        // e.g., #temp, ##global_temp, @variable
1972        if (c == '#' || c == '@')
1973            && (self.peek_next().is_alphanumeric()
1974                || self.peek_next() == '_'
1975                || self.peek_next() == '#')
1976        {
1977            return self.scan_tsql_identifier();
1978        }
1979
1980        // Check for single character tokens
1981        if let Some(&token_type) = self.config.single_tokens.get(&c) {
1982            self.advance();
1983            self.add_token(token_type);
1984            return Ok(());
1985        }
1986
1987        // Unicode minus (U+2212) → treat as regular minus
1988        if c == '\u{2212}' {
1989            self.advance();
1990            self.add_token(TokenType::Dash);
1991            return Ok(());
1992        }
1993
1994        // Unicode fraction slash (U+2044) → treat as regular slash
1995        if c == '\u{2044}' {
1996            self.advance();
1997            self.add_token(TokenType::Slash);
1998            return Ok(());
1999        }
2000
2001        // Unicode curly/smart quotes → treat as regular string quotes
2002        if c == '\u{2018}' || c == '\u{2019}' {
2003            // Left/right single quotation marks → scan as string with matching end
2004            return self.scan_unicode_quoted_string(c);
2005        }
2006        if c == '\u{201C}' || c == '\u{201D}' {
2007            // Left/right double quotation marks → scan as quoted identifier
2008            return self.scan_unicode_quoted_identifier(c);
2009        }
2010
2011        // Must be an identifier or keyword
2012        self.scan_identifier_or_keyword()
2013    }
2014
2015    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
2016        let c = self.peek();
2017        let next = self.peek_next();
2018        let third = if self.current + 2 < self.size {
2019            self.chars[self.current + 2]
2020        } else {
2021            '\0'
2022        };
2023
2024        // Check for three-character operators first
2025        // -|- (Adjacent - PostgreSQL range adjacency)
2026        if c == '-' && next == '|' && third == '-' {
2027            self.advance();
2028            self.advance();
2029            self.advance();
2030            return Some(TokenType::Adjacent);
2031        }
2032
2033        // ||/ (Cube root - PostgreSQL)
2034        if c == '|' && next == '|' && third == '/' {
2035            self.advance();
2036            self.advance();
2037            self.advance();
2038            return Some(TokenType::DPipeSlash);
2039        }
2040
2041        // #>> (JSONB path text extraction - PostgreSQL)
2042        if c == '#' && next == '>' && third == '>' {
2043            self.advance();
2044            self.advance();
2045            self.advance();
2046            return Some(TokenType::DHashArrow);
2047        }
2048
2049        // ->> (JSON text extraction - PostgreSQL/MySQL)
2050        if c == '-' && next == '>' && third == '>' {
2051            self.advance();
2052            self.advance();
2053            self.advance();
2054            return Some(TokenType::DArrow);
2055        }
2056
2057        // <=> (NULL-safe equality - MySQL)
2058        if c == '<' && next == '=' && third == '>' {
2059            self.advance();
2060            self.advance();
2061            self.advance();
2062            return Some(TokenType::NullsafeEq);
2063        }
2064
2065        // <-> (Distance operator - PostgreSQL)
2066        if c == '<' && next == '-' && third == '>' {
2067            self.advance();
2068            self.advance();
2069            self.advance();
2070            return Some(TokenType::LrArrow);
2071        }
2072
2073        // <@ (Contained by - PostgreSQL)
2074        if c == '<' && next == '@' {
2075            self.advance();
2076            self.advance();
2077            return Some(TokenType::LtAt);
2078        }
2079
2080        // @> (Contains - PostgreSQL)
2081        if c == '@' && next == '>' {
2082            self.advance();
2083            self.advance();
2084            return Some(TokenType::AtGt);
2085        }
2086
2087        // ~~~ (Glob - PostgreSQL)
2088        if c == '~' && next == '~' && third == '~' {
2089            self.advance();
2090            self.advance();
2091            self.advance();
2092            return Some(TokenType::Glob);
2093        }
2094
2095        // ~~* (ILike - PostgreSQL)
2096        if c == '~' && next == '~' && third == '*' {
2097            self.advance();
2098            self.advance();
2099            self.advance();
2100            return Some(TokenType::ILike);
2101        }
2102
2103        // !~~* (Not ILike - PostgreSQL)
2104        let fourth = if self.current + 3 < self.size {
2105            self.chars[self.current + 3]
2106        } else {
2107            '\0'
2108        };
2109        if c == '!' && next == '~' && third == '~' && fourth == '*' {
2110            self.advance();
2111            self.advance();
2112            self.advance();
2113            self.advance();
2114            return Some(TokenType::NotILike);
2115        }
2116
2117        // !~~ (Not Like - PostgreSQL)
2118        if c == '!' && next == '~' && third == '~' {
2119            self.advance();
2120            self.advance();
2121            self.advance();
2122            return Some(TokenType::NotLike);
2123        }
2124
2125        // !~* (Not Regexp ILike - PostgreSQL)
2126        if c == '!' && next == '~' && third == '*' {
2127            self.advance();
2128            self.advance();
2129            self.advance();
2130            return Some(TokenType::NotIRLike);
2131        }
2132
2133        // !:> (Not cast / Try cast - SingleStore)
2134        if c == '!' && next == ':' && third == '>' {
2135            self.advance();
2136            self.advance();
2137            self.advance();
2138            return Some(TokenType::NColonGt);
2139        }
2140
2141        // ?:: (TRY_CAST shorthand - Databricks)
2142        if c == '?' && next == ':' && third == ':' {
2143            self.advance();
2144            self.advance();
2145            self.advance();
2146            return Some(TokenType::QDColon);
2147        }
2148
2149        // !~ (Not Regexp - PostgreSQL)
2150        if c == '!' && next == '~' {
2151            self.advance();
2152            self.advance();
2153            return Some(TokenType::NotRLike);
2154        }
2155
2156        // ~~ (Like - PostgreSQL)
2157        if c == '~' && next == '~' {
2158            self.advance();
2159            self.advance();
2160            return Some(TokenType::Like);
2161        }
2162
2163        // ~* (Regexp ILike - PostgreSQL)
2164        if c == '~' && next == '*' {
2165            self.advance();
2166            self.advance();
2167            return Some(TokenType::IRLike);
2168        }
2169
2170        // SingleStore three-character JSON path operators (must be checked before :: two-char)
2171        // ::$ (JSON extract string), ::% (JSON extract double), ::? (JSON match)
2172        if c == ':' && next == ':' && third == '$' {
2173            self.advance();
2174            self.advance();
2175            self.advance();
2176            return Some(TokenType::DColonDollar);
2177        }
2178        if c == ':' && next == ':' && third == '%' {
2179            self.advance();
2180            self.advance();
2181            self.advance();
2182            return Some(TokenType::DColonPercent);
2183        }
2184        if c == ':' && next == ':' && third == '?' {
2185            self.advance();
2186            self.advance();
2187            self.advance();
2188            return Some(TokenType::DColonQMark);
2189        }
2190
2191        // Two-character operators
2192        let token_type = match (c, next) {
2193            ('.', ':') => Some(TokenType::DotColon),
2194            ('=', '=') => Some(TokenType::Eq), // Hive/Spark == equality operator
2195            ('<', '=') => Some(TokenType::Lte),
2196            ('>', '=') => Some(TokenType::Gte),
2197            ('!', '=') => Some(TokenType::Neq),
2198            ('<', '>') => Some(TokenType::Neq),
2199            ('^', '=') => Some(TokenType::Neq),
2200            ('<', '<') => Some(TokenType::LtLt),
2201            ('>', '>') => Some(TokenType::GtGt),
2202            ('|', '|') => Some(TokenType::DPipe),
2203            ('|', '/') => Some(TokenType::PipeSlash), // Square root - PostgreSQL
2204            (':', ':') => Some(TokenType::DColon),
2205            (':', '=') => Some(TokenType::ColonEq), // := (assignment, named args)
2206            (':', '>') => Some(TokenType::ColonGt), // ::> (TSQL)
2207            ('-', '>') => Some(TokenType::Arrow),   // JSON object access
2208            ('=', '>') => Some(TokenType::FArrow),  // Fat arrow (lambda)
2209            ('&', '&') => Some(TokenType::DAmp),
2210            ('&', '<') => Some(TokenType::AmpLt), // PostgreSQL range operator
2211            ('&', '>') => Some(TokenType::AmpGt), // PostgreSQL range operator
2212            ('@', '@') => Some(TokenType::AtAt),  // Text search match
2213            ('?', '|') => Some(TokenType::QMarkPipe), // JSONB contains any key
2214            ('?', '&') => Some(TokenType::QMarkAmp), // JSONB contains all keys
2215            ('?', '?') => Some(TokenType::DQMark), // Double question mark
2216            ('#', '>') => Some(TokenType::HashArrow), // JSONB path extraction
2217            ('#', '-') => Some(TokenType::HashDash), // JSONB delete
2218            ('^', '@') => Some(TokenType::CaretAt), // PostgreSQL starts-with operator
2219            ('*', '*') => Some(TokenType::DStar), // Power operator
2220            ('|', '>') => Some(TokenType::PipeGt), // Pipe-greater (some dialects)
2221            _ => None,
2222        };
2223
2224        if token_type.is_some() {
2225            self.advance();
2226            self.advance();
2227        }
2228
2229        token_type
2230    }
2231
2232    fn scan_string(&mut self) -> Result<()> {
2233        self.advance(); // Opening quote
2234        let mut value = String::new();
2235
2236        while !self.is_at_end() {
2237            let c = self.peek();
2238            if c == '\'' {
2239                if self.peek_next() == '\'' {
2240                    // Escaped quote
2241                    value.push('\'');
2242                    self.advance();
2243                    self.advance();
2244                } else {
2245                    break;
2246                }
2247            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2248                if self.config.recover_terminal_backslash_quote
2249                    && self.peek_next() == '\''
2250                    && !self.chars[self.current + 2..].contains(&'\'')
2251                {
2252                    value.push(self.advance());
2253                    break;
2254                }
2255
2256                // Handle escape sequences
2257                self.advance(); // Consume the backslash
2258                if !self.is_at_end() {
2259                    let escaped = self.advance();
2260                    match escaped {
2261                        'n' => value.push('\n'),
2262                        'r' => value.push('\r'),
2263                        't' => value.push('\t'),
2264                        '0' => value.push('\0'),
2265                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2266                        'a' => value.push('\x07'), // Alert/bell
2267                        'b' => value.push('\x08'), // Backspace
2268                        'f' => value.push('\x0C'), // Form feed
2269                        'v' => value.push('\x0B'), // Vertical tab
2270                        'x' => {
2271                            // Hex escape: \xNN (exactly 2 hex digits)
2272                            let mut hex = String::with_capacity(2);
2273                            for _ in 0..2 {
2274                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2275                                    hex.push(self.advance());
2276                                }
2277                            }
2278                            if hex.len() == 2 {
2279                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2280                                    value.push(byte as char);
2281                                } else {
2282                                    value.push('\\');
2283                                    value.push('x');
2284                                    value.push_str(&hex);
2285                                }
2286                            } else {
2287                                // Not enough hex digits, preserve literally
2288                                value.push('\\');
2289                                value.push('x');
2290                                value.push_str(&hex);
2291                            }
2292                        }
2293                        '\\' => value.push('\\'),
2294                        '\'' => value.push('\''),
2295                        '"' => value.push('"'),
2296                        '%' => {
2297                            // MySQL: \% in LIKE patterns
2298                            value.push('%');
2299                        }
2300                        '_' => {
2301                            // MySQL: \_ in LIKE patterns
2302                            value.push('_');
2303                        }
2304                        // For unrecognized escape sequences:
2305                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2306                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2307                        _ => {
2308                            if !self.config.escape_follow_chars.is_empty() {
2309                                // MySQL-style: discard backslash for unrecognized escapes
2310                                value.push(escaped);
2311                            } else {
2312                                // Standard: preserve backslash + char
2313                                value.push('\\');
2314                                value.push(escaped);
2315                            }
2316                        }
2317                    }
2318                }
2319            } else {
2320                value.push(self.advance());
2321            }
2322        }
2323
2324        if self.is_at_end() {
2325            if self.config.recover_unterminated_string {
2326                self.add_token_with_text(TokenType::String, value);
2327                return Ok(());
2328            }
2329
2330            return Err(Error::tokenize(
2331                "Unterminated string",
2332                self.line,
2333                self.column,
2334                self.start,
2335                self.current,
2336            ));
2337        }
2338
2339        self.advance(); // Closing quote
2340        self.add_token_with_text(TokenType::String, value);
2341        Ok(())
2342    }
2343
2344    /// Scan a double-quoted string (for dialects like BigQuery where " is a string delimiter)
2345    fn scan_double_quoted_string(&mut self) -> Result<()> {
2346        self.advance(); // Opening quote
2347        let mut value = String::new();
2348
2349        while !self.is_at_end() {
2350            let c = self.peek();
2351            if c == '"' {
2352                if self.peek_next() == '"' {
2353                    // Escaped quote
2354                    value.push('"');
2355                    self.advance();
2356                    self.advance();
2357                } else {
2358                    break;
2359                }
2360            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2361                // Handle escape sequences
2362                self.advance(); // Consume the backslash
2363                if !self.is_at_end() {
2364                    let escaped = self.advance();
2365                    match escaped {
2366                        'n' => value.push('\n'),
2367                        'r' => value.push('\r'),
2368                        't' => value.push('\t'),
2369                        '0' => value.push('\0'),
2370                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2371                        'a' => value.push('\x07'), // Alert/bell
2372                        'b' => value.push('\x08'), // Backspace
2373                        'f' => value.push('\x0C'), // Form feed
2374                        'v' => value.push('\x0B'), // Vertical tab
2375                        'x' => {
2376                            // Hex escape: \xNN (exactly 2 hex digits)
2377                            let mut hex = String::with_capacity(2);
2378                            for _ in 0..2 {
2379                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2380                                    hex.push(self.advance());
2381                                }
2382                            }
2383                            if hex.len() == 2 {
2384                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2385                                    value.push(byte as char);
2386                                } else {
2387                                    value.push('\\');
2388                                    value.push('x');
2389                                    value.push_str(&hex);
2390                                }
2391                            } else {
2392                                // Not enough hex digits, preserve literally
2393                                value.push('\\');
2394                                value.push('x');
2395                                value.push_str(&hex);
2396                            }
2397                        }
2398                        '\\' => value.push('\\'),
2399                        '\'' => value.push('\''),
2400                        '"' => value.push('"'),
2401                        '%' => {
2402                            // MySQL: \% in LIKE patterns
2403                            value.push('%');
2404                        }
2405                        '_' => {
2406                            // MySQL: \_ in LIKE patterns
2407                            value.push('_');
2408                        }
2409                        // For unrecognized escape sequences:
2410                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2411                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2412                        _ => {
2413                            if !self.config.escape_follow_chars.is_empty() {
2414                                // MySQL-style: discard backslash for unrecognized escapes
2415                                value.push(escaped);
2416                            } else {
2417                                // Standard: preserve backslash + char
2418                                value.push('\\');
2419                                value.push(escaped);
2420                            }
2421                        }
2422                    }
2423                }
2424            } else {
2425                value.push(self.advance());
2426            }
2427        }
2428
2429        if self.is_at_end() {
2430            return Err(Error::tokenize(
2431                "Unterminated double-quoted string",
2432                self.line,
2433                self.column,
2434                self.start,
2435                self.current,
2436            ));
2437        }
2438
2439        self.advance(); // Closing quote
2440        self.add_token_with_text(TokenType::String, value);
2441        Ok(())
2442    }
2443
2444    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2445        // Advance past the three opening quotes
2446        self.advance();
2447        self.advance();
2448        self.advance();
2449        let mut value = String::new();
2450
2451        while !self.is_at_end() {
2452            // Check for closing triple quote
2453            if self.peek() == quote_char
2454                && self.current + 1 < self.size
2455                && self.chars[self.current + 1] == quote_char
2456                && self.current + 2 < self.size
2457                && self.chars[self.current + 2] == quote_char
2458            {
2459                // Found closing """
2460                break;
2461            }
2462            value.push(self.advance());
2463        }
2464
2465        if self.is_at_end() {
2466            return Err(Error::tokenize(
2467                "Unterminated triple-quoted string",
2468                self.line,
2469                self.column,
2470                self.start,
2471                self.current,
2472            ));
2473        }
2474
2475        // Advance past the three closing quotes
2476        self.advance();
2477        self.advance();
2478        self.advance();
2479        let token_type = if quote_char == '"' {
2480            TokenType::TripleDoubleQuotedString
2481        } else {
2482            TokenType::TripleSingleQuotedString
2483        };
2484        self.add_token_with_text(token_type, value);
2485        Ok(())
2486    }
2487
2488    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2489        self.advance(); // Opening quote
2490        let mut value = String::new();
2491
2492        loop {
2493            if self.is_at_end() {
2494                return Err(Error::tokenize(
2495                    "Unterminated identifier",
2496                    self.line,
2497                    self.column,
2498                    self.start,
2499                    self.current,
2500                ));
2501            }
2502            if self.peek() == end_quote {
2503                if self.peek_next() == end_quote {
2504                    // Escaped quote (e.g., "" inside "x""y") -> store single quote
2505                    value.push(end_quote);
2506                    self.advance(); // skip first quote
2507                    self.advance(); // skip second quote
2508                } else {
2509                    // End of identifier
2510                    break;
2511                }
2512            } else {
2513                value.push(self.peek());
2514                self.advance();
2515            }
2516        }
2517
2518        self.advance(); // Closing quote
2519        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2520        Ok(())
2521    }
2522
2523    /// Scan a string delimited by Unicode curly single quotes (U+2018/U+2019).
2524    /// Content between curly quotes is literal (no escape processing).
2525    /// When opened with \u{2018} (left), close with \u{2019} (right) only.
2526    /// When opened with \u{2019} (right), close with \u{2019} (right) — self-closing.
2527    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2528        self.advance(); // Opening curly quote
2529        let start = self.current;
2530        // Determine closing quote: left opens -> right closes; right opens -> right closes
2531        let close_quote = if open_quote == '\u{2018}' {
2532            '\u{2019}' // left opens, right closes
2533        } else {
2534            '\u{2019}' // right quote also closes with right quote
2535        };
2536        while !self.is_at_end() && self.peek() != close_quote {
2537            self.advance();
2538        }
2539        let value = self.text_from_range(start, self.current);
2540        if !self.is_at_end() {
2541            self.advance(); // Closing quote
2542        }
2543        self.add_token_with_text(TokenType::String, value);
2544        Ok(())
2545    }
2546
2547    /// Scan an identifier delimited by Unicode curly double quotes (U+201C/U+201D).
2548    /// When opened with \u{201C} (left), close with \u{201D} (right) only.
2549    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2550        self.advance(); // Opening curly quote
2551        let start = self.current;
2552        let close_quote = if open_quote == '\u{201C}' {
2553            '\u{201D}' // left opens, right closes
2554        } else {
2555            '\u{201D}' // right also closes with right
2556        };
2557        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2558            self.advance();
2559        }
2560        let value = self.text_from_range(start, self.current);
2561        if !self.is_at_end() {
2562            self.advance(); // Closing quote
2563        }
2564        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2565        Ok(())
2566    }
2567
    /// Scan a numeric literal beginning at the current position and emit one token.
    ///
    /// Depending on the tokenizer configuration and the input, the emitted token is:
    /// - `TokenType::Number` for decimal literals (optional fraction and exponent),
    ///   for hex float literals, and for suffixed literals, which are encoded as
    ///   `"<number>::<TYPE>"` using the `numeric_literals` mapping;
    /// - `TokenType::HexNumber` or `TokenType::HexString` for `0x…` literals when
    ///   `hex_number_strings` is enabled (chosen by `hex_string_is_integer_type`);
    /// - `TokenType::Identifier` when `identifiers_can_start_with_digit` is enabled
    ///   and the digits are directly followed by a letter or underscore.
    ///
    /// Underscores are consumed as digit separators only when followed by a digit
    /// (hex digit inside `0x…`); they are stripped from the token text only when
    /// `numbers_can_be_underscore_separated` is enabled.
    ///
    /// All paths visible in this function return `Ok(())`.
    fn scan_number(&mut self) -> Result<()> {
        // Check for 0x/0X hex number prefix (SQLite-style)
        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
            // Look one character past the '0' without consuming anything
            let next = if self.current + 1 < self.size {
                self.chars[self.current + 1]
            } else {
                '\0'
            };
            if next == 'x' || next == 'X' {
                // Advance past '0' and 'x'/'X'
                self.advance();
                self.advance();
                // Collect hex digits (allow underscores as separators, e.g., 0xbad_cafe)
                let hex_start = self.current;
                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
                    // An underscore only counts as a separator when a hex digit follows it
                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
                        break;
                    }
                    self.advance();
                }
                if self.current > hex_start {
                    // Check for hex float: 0xABC.DEFpEXP or 0xABCpEXP
                    let mut is_hex_float = false;
                    // Optional fractional part: .hexdigits
                    if !self.is_at_end() && self.peek() == '.' {
                        let after_dot = if self.current + 1 < self.size {
                            self.chars[self.current + 1]
                        } else {
                            '\0'
                        };
                        // The dot is consumed only when a hex digit follows it
                        if after_dot.is_ascii_hexdigit() {
                            is_hex_float = true;
                            self.advance(); // consume '.'
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                self.advance();
                            }
                        }
                    }
                    // Optional binary exponent: p/P [+/-] digits
                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
                        is_hex_float = true;
                        self.advance(); // consume p/P
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
                            self.advance();
                        }
                        while !self.is_at_end() && self.peek().is_ascii_digit() {
                            self.advance();
                        }
                    }
                    if is_hex_float {
                        // Hex float literal — emit as regular Number token with full text
                        // (text starts at self.start, so it includes the 0x prefix)
                        let raw_text = self.text_from_range(self.start, self.current);
                        let full_text = if self.config.numbers_can_be_underscore_separated
                            && raw_text.contains('_')
                        {
                            raw_text.replace('_', "")
                        } else {
                            raw_text
                        };
                        self.add_token_with_text(TokenType::Number, full_text);
                    } else if self.config.hex_string_is_integer_type {
                        // BigQuery/ClickHouse: 0xA represents an integer in hex notation
                        // (text starts at hex_start, so the 0x prefix is excluded)
                        let raw_value = self.text_from_range(hex_start, self.current);
                        let hex_value = if self.config.numbers_can_be_underscore_separated
                            && raw_value.contains('_')
                        {
                            raw_value.replace('_', "")
                        } else {
                            raw_value
                        };
                        self.add_token_with_text(TokenType::HexNumber, hex_value);
                    } else {
                        // SQLite/Teradata: 0xCC represents a binary/blob hex string
                        // (text starts at hex_start, so the 0x prefix is excluded)
                        let raw_value = self.text_from_range(hex_start, self.current);
                        let hex_value = if self.config.numbers_can_be_underscore_separated
                            && raw_value.contains('_')
                        {
                            raw_value.replace('_', "")
                        } else {
                            raw_value
                        };
                        self.add_token_with_text(TokenType::HexString, hex_value);
                    }
                    return Ok(());
                }
                // No hex digits after 0x - fall through to normal number parsing
                // (reset current back to after '0')
                self.current = self.start + 1;
            }
        }

        // Allow underscores as digit separators (e.g., 20_000, 1_000_000)
        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            // Don't allow underscore at the end (must be followed by digit)
            if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
                break;
            }
            self.advance();
        }

        // Look for decimal part - allow trailing dot (e.g., "1.")
        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
        // So we always consume the dot as part of the number, even if followed by an identifier
        if self.peek() == '.' {
            let next = self.peek_next();
            // Only consume the dot if:
            // 1. Followed by a digit (normal decimal like 1.5)
            // 2. Followed by an identifier start (like 1.x -> becomes 1. with alias x)
            // 3. End of input or other non-dot character (trailing decimal like "1.")
            // Do NOT consume if it's a double dot (..) which is a range operator
            if next != '.' {
                self.advance(); // consume the .
                                // Only consume digits after the decimal point (not identifiers)
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                        break;
                    }
                    self.advance();
                }
            }
        }

        // Look for exponent
        // Note: the 'e'/'E' is consumed even when no exponent digits follow it
        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let raw_text = self.text_from_range(self.start, self.current);
        // Strip underscore digit separators (e.g., 20_000 -> 20000, 1_2E+1_0 -> 12E+10)
        // Only for dialects that support this (ClickHouse, DuckDB)
        let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
            raw_text.replace('_', "")
        } else {
            raw_text
        };

        // Check for numeric literal suffixes (e.g., 1L -> BIGINT, 1s -> SMALLINT in Hive/Spark)
        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
            // Suffix lookup keys are built from ASCII-uppercased characters
            let next_char: String = self.peek().to_ascii_uppercase().to_string();
            // Try 2-char suffix first (e.g., "BD"), then 1-char
            // suffix_match is Some((suffix key, number of characters to consume))
            let suffix_match = if self.current + 1 < self.size {
                let two_char: String = [
                    self.chars[self.current].to_ascii_uppercase(),
                    self.chars[self.current + 1].to_ascii_uppercase(),
                ]
                .iter()
                .collect();
                if self.config.numeric_literals.contains_key(&two_char) {
                    // Make sure the 2-char suffix is not followed by more identifier chars
                    let after_suffix = if self.current + 2 < self.size {
                        self.chars[self.current + 2]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((two_char, 2))
                    } else {
                        None
                    }
                } else if self.config.numeric_literals.contains_key(&next_char) {
                    // 1-char suffix - make sure not followed by more identifier chars
                    let after_suffix = if self.current + 1 < self.size {
                        self.chars[self.current + 1]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((next_char, 1))
                    } else {
                        None
                    }
                } else {
                    None
                }
            } else if self.config.numeric_literals.contains_key(&next_char) {
                // At end of input, 1-char suffix
                Some((next_char, 1))
            } else {
                None
            };

            if let Some((suffix, len)) = suffix_match {
                // Consume the suffix characters
                for _ in 0..len {
                    self.advance();
                }
                // Emit as a special number-with-suffix token
                // We'll encode as "number::TYPE" so the parser can split it
                let type_name = self
                    .config
                    .numeric_literals
                    .get(&suffix)
                    .expect("suffix verified by contains_key above")
                    .clone();
                let combined = format!("{}::{}", text, type_name);
                self.add_token_with_text(TokenType::Number, combined);
                return Ok(());
            }
        }

        // Check for identifiers that start with a digit (e.g., 1a, 1_a, 1a_1a)
        // In Hive/Spark/MySQL/ClickHouse, these are valid unquoted identifiers
        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
            let next = self.peek();
            if next.is_alphabetic() || next == '_' {
                // Continue scanning as an identifier
                while !self.is_at_end() {
                    let ch = self.peek();
                    if ch.is_alphanumeric() || ch == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                // The identifier text is the raw source range, so underscores are kept
                let ident_text = self.text_from_range(self.start, self.current);
                self.add_token_with_text(TokenType::Identifier, ident_text);
                return Ok(());
            }
        }

        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }
2800
2801    /// Scan a number that starts with a dot (e.g., .25, .5, .123e10)
2802    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2803        // Consume the leading dot
2804        self.advance();
2805
2806        // Consume the fractional digits
2807        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2808            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2809                break;
2810            }
2811            self.advance();
2812        }
2813
2814        // Look for exponent
2815        if self.peek() == 'e' || self.peek() == 'E' {
2816            self.advance();
2817            if self.peek() == '+' || self.peek() == '-' {
2818                self.advance();
2819            }
2820            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2821                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2822                    break;
2823                }
2824                self.advance();
2825            }
2826        }
2827
2828        let raw_text = self.text_from_range(self.start, self.current);
2829        // Strip underscore digit separators (e.g., .1_5 -> .15)
2830        // Only for dialects that support this (ClickHouse, DuckDB)
2831        let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2832            raw_text.replace('_', "")
2833        } else {
2834            raw_text
2835        };
2836        self.add_token_with_text(TokenType::Number, text);
2837        Ok(())
2838    }
2839
2840    /// Look up a keyword using a stack buffer for ASCII uppercasing, avoiding heap allocation.
2841    /// Returns `TokenType::Var` for texts longer than 128 bytes or non-UTF-8 results.
2842    #[inline]
2843    fn lookup_keyword_ascii(keywords: &HashMap<String, TokenType>, text: &str) -> TokenType {
2844        if text.len() > 128 {
2845            return TokenType::Var;
2846        }
2847        let mut buf = [0u8; 128];
2848        for (i, b) in text.bytes().enumerate() {
2849            buf[i] = b.to_ascii_uppercase();
2850        }
2851        if let Ok(upper) = std::str::from_utf8(&buf[..text.len()]) {
2852            keywords.get(upper).copied().unwrap_or(TokenType::Var)
2853        } else {
2854            TokenType::Var
2855        }
2856    }
2857
2858    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
2859        // Guard against unrecognized characters that could cause infinite loops
2860        let first_char = self.peek();
2861        if !first_char.is_alphanumeric() && first_char != '_' {
2862            // Unknown character - skip it and return an error
2863            let c = self.advance();
2864            return Err(Error::tokenize(
2865                format!("Unexpected character: '{}'", c),
2866                self.line,
2867                self.column,
2868                self.start,
2869                self.current,
2870            ));
2871        }
2872
2873        while !self.is_at_end() {
2874            let c = self.peek();
2875            // Allow alphanumeric, underscore, $, # and @ in identifiers
2876            // PostgreSQL allows $, TSQL allows # and @
2877            // But stop consuming # if followed by > or >> (PostgreSQL #> and #>> operators)
2878            if c == '#' {
2879                let next_c = if self.current + 1 < self.size {
2880                    self.chars[self.current + 1]
2881                } else {
2882                    '\0'
2883                };
2884                if next_c == '>' || next_c == '-' {
2885                    break; // Don't consume # — it's part of #>, #>>, or #- operator
2886                }
2887                self.advance();
2888            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
2889                self.advance();
2890            } else {
2891                break;
2892            }
2893        }
2894
2895        let text = self.text_from_range(self.start, self.current);
2896
2897        // Special-case NOT= (Teradata and other dialects)
2898        if text.eq_ignore_ascii_case("NOT") && self.peek() == '=' {
2899            self.advance(); // consume '='
2900            self.add_token(TokenType::Neq);
2901            return Ok(());
2902        }
2903
2904        // Check for special string prefixes like N'...', X'...', B'...', U&'...', r'...', b'...'
2905        // Also handle double-quoted variants for dialects that support them (e.g., BigQuery)
2906        let next_char = self.peek();
2907        let is_single_quote = next_char == '\'';
2908        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
2909        // For raw strings (r"..." or r'...'), we allow double quotes even if " is not in quotes config
2910        // because raw strings are a special case used in Spark/Databricks where " is for identifiers
2911        let is_double_quote_for_raw = next_char == '"';
2912
2913        // Handle raw strings first - they're special because they work with both ' and "
2914        // even in dialects where " is normally an identifier delimiter (like Databricks)
2915        if text.eq_ignore_ascii_case("R") && (is_single_quote || is_double_quote_for_raw) {
2916            // Raw string r'...' or r"..." or r'''...''' or r"""...""" (BigQuery style)
2917            // In raw strings, backslashes are treated literally (no escape processing)
2918            let quote_char = if is_single_quote { '\'' } else { '"' };
2919            self.advance(); // consume the first opening quote
2920
2921            // Check for triple-quoted raw string (r"""...""" or r'''...''')
2922            if self.peek() == quote_char && self.peek_next() == quote_char {
2923                // Triple-quoted raw string
2924                self.advance(); // consume second quote
2925                self.advance(); // consume third quote
2926                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
2927                self.add_token_with_text(TokenType::RawString, string_value);
2928            } else {
2929                let string_value = self.scan_raw_string_content(quote_char)?;
2930                self.add_token_with_text(TokenType::RawString, string_value);
2931            }
2932            return Ok(());
2933        }
2934
2935        if is_single_quote || is_double_quote {
2936            if text.eq_ignore_ascii_case("N") {
2937                // National string N'...'
2938                self.advance(); // consume the opening quote
2939                let string_value = if is_single_quote {
2940                    self.scan_string_content()?
2941                } else {
2942                    self.scan_double_quoted_string_content()?
2943                };
2944                self.add_token_with_text(TokenType::NationalString, string_value);
2945                return Ok(());
2946            } else if text.eq_ignore_ascii_case("E") {
2947                // PostgreSQL escape string E'...' or e'...'
2948                // Preserve the case by prefixing with "e:" or "E:"
2949                // Always use backslash escapes for escape strings (e.g., \' is an escaped quote)
2950                let lowercase = text == "e";
2951                let prefix = if lowercase { "e:" } else { "E:" };
2952                self.advance(); // consume the opening quote
2953                let string_value = self.scan_string_content_with_escapes(true)?;
2954                self.add_token_with_text(
2955                    TokenType::EscapeString,
2956                    format!("{}{}", prefix, string_value),
2957                );
2958                return Ok(());
2959            } else if text.eq_ignore_ascii_case("X") {
2960                // Hex string X'...'
2961                self.advance(); // consume the opening quote
2962                let string_value = if is_single_quote {
2963                    self.scan_string_content()?
2964                } else {
2965                    self.scan_double_quoted_string_content()?
2966                };
2967                self.add_token_with_text(TokenType::HexString, string_value);
2968                return Ok(());
2969            } else if text.eq_ignore_ascii_case("B") && is_double_quote {
2970                // Byte string b"..." (BigQuery style) - MUST check before single quote B'...'
2971                self.advance(); // consume the opening quote
2972                let string_value = self.scan_double_quoted_string_content()?;
2973                self.add_token_with_text(TokenType::ByteString, string_value);
2974                return Ok(());
2975            } else if text.eq_ignore_ascii_case("B") && is_single_quote {
2976                // For BigQuery: b'...' is a byte string (bytes data)
2977                // For standard SQL: B'...' is a bit string (binary digits)
2978                self.advance(); // consume the opening quote
2979                let string_value = self.scan_string_content()?;
2980                if self.config.b_prefix_is_byte_string {
2981                    self.add_token_with_text(TokenType::ByteString, string_value);
2982                } else {
2983                    self.add_token_with_text(TokenType::BitString, string_value);
2984                }
2985                return Ok(());
2986            }
2987        }
2988
2989        // Check for U&'...' Unicode string syntax (SQL standard)
2990        if text.eq_ignore_ascii_case("U")
2991            && self.peek() == '&'
2992            && self.current + 1 < self.size
2993            && self.chars[self.current + 1] == '\''
2994        {
2995            self.advance(); // consume '&'
2996            self.advance(); // consume opening quote
2997            let string_value = self.scan_string_content()?;
2998            self.add_token_with_text(TokenType::UnicodeString, string_value);
2999            return Ok(());
3000        }
3001
3002        let token_type = Self::lookup_keyword_ascii(&self.config.keywords, &text);
3003
3004        self.add_token_with_text(token_type, text);
3005        Ok(())
3006    }
3007
3008    /// Scan string content (everything between quotes)
3009    /// If `force_backslash_escapes` is true, backslash is always treated as an escape character
3010    /// (used for PostgreSQL E'...' escape strings)
3011    fn scan_string_content_with_escapes(
3012        &mut self,
3013        force_backslash_escapes: bool,
3014    ) -> Result<String> {
3015        let mut value = String::new();
3016        let use_backslash_escapes =
3017            force_backslash_escapes || self.config.string_escapes.contains(&'\\');
3018
3019        while !self.is_at_end() {
3020            let c = self.peek();
3021            if c == '\'' {
3022                if self.peek_next() == '\'' {
3023                    // Escaped quote ''
3024                    value.push('\'');
3025                    self.advance();
3026                    self.advance();
3027                } else {
3028                    break;
3029                }
3030            } else if c == '\\' && use_backslash_escapes {
3031                // Preserve escape sequences literally (including \' for escape strings)
3032                value.push(self.advance());
3033                if !self.is_at_end() {
3034                    value.push(self.advance());
3035                }
3036            } else {
3037                value.push(self.advance());
3038            }
3039        }
3040
3041        if self.is_at_end() {
3042            return Err(Error::tokenize(
3043                "Unterminated string",
3044                self.line,
3045                self.column,
3046                self.start,
3047                self.current,
3048            ));
3049        }
3050
3051        self.advance(); // Closing quote
3052        Ok(value)
3053    }
3054
3055    /// Scan string content (everything between quotes)
3056    fn scan_string_content(&mut self) -> Result<String> {
3057        self.scan_string_content_with_escapes(false)
3058    }
3059
3060    /// Scan double-quoted string content (for dialects like BigQuery where " is a string delimiter)
3061    /// This is used for prefixed strings like b"..." or N"..."
3062    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
3063        let mut value = String::new();
3064        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');
3065
3066        while !self.is_at_end() {
3067            let c = self.peek();
3068            if c == '"' {
3069                if self.peek_next() == '"' {
3070                    // Escaped quote ""
3071                    value.push('"');
3072                    self.advance();
3073                    self.advance();
3074                } else {
3075                    break;
3076                }
3077            } else if c == '\\' && use_backslash_escapes {
3078                // Handle escape sequences
3079                self.advance(); // Consume backslash
3080                if !self.is_at_end() {
3081                    let escaped = self.advance();
3082                    match escaped {
3083                        'n' => value.push('\n'),
3084                        'r' => value.push('\r'),
3085                        't' => value.push('\t'),
3086                        '0' => value.push('\0'),
3087                        '\\' => value.push('\\'),
3088                        '"' => value.push('"'),
3089                        '\'' => value.push('\''),
3090                        'x' => {
3091                            // Hex escape \xNN - collect hex digits
3092                            let mut hex = String::new();
3093                            for _ in 0..2 {
3094                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
3095                                    hex.push(self.advance());
3096                                }
3097                            }
3098                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
3099                                value.push(byte as char);
3100                            } else {
3101                                // Invalid hex escape, keep it literal
3102                                value.push('\\');
3103                                value.push('x');
3104                                value.push_str(&hex);
3105                            }
3106                        }
3107                        _ => {
3108                            // For unrecognized escapes, preserve backslash + char
3109                            value.push('\\');
3110                            value.push(escaped);
3111                        }
3112                    }
3113                }
3114            } else {
3115                value.push(self.advance());
3116            }
3117        }
3118
3119        if self.is_at_end() {
3120            return Err(Error::tokenize(
3121                "Unterminated double-quoted string",
3122                self.line,
3123                self.column,
3124                self.start,
3125                self.current,
3126            ));
3127        }
3128
3129        self.advance(); // Closing quote
3130        Ok(value)
3131    }
3132
3133    /// Scan raw string content (limited escape processing for quotes)
3134    /// Used for BigQuery r'...' and r"..." strings
3135    /// In raw strings, backslashes are literal EXCEPT that escape sequences for the
3136    /// quote character still work (e.g., \' in r'...' escapes the quote, '' also works)
3137    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3138        let mut value = String::new();
3139
3140        while !self.is_at_end() {
3141            let c = self.peek();
3142            if c == quote_char {
3143                if self.peek_next() == quote_char {
3144                    // Escaped quote (doubled) - e.g., '' inside r'...'
3145                    value.push(quote_char);
3146                    self.advance();
3147                    self.advance();
3148                } else {
3149                    break;
3150                }
3151            } else if c == '\\'
3152                && self.peek_next() == quote_char
3153                && self.config.string_escapes_allowed_in_raw_strings
3154            {
3155                // Backslash-escaped quote - works in raw strings when string_escapes_allowed_in_raw_strings is true
3156                // e.g., \' inside r'...' becomes literal ' (BigQuery behavior)
3157                // Spark/Databricks has this set to false, so backslash is always literal there
3158                value.push(quote_char);
3159                self.advance(); // consume backslash
3160                self.advance(); // consume quote
3161            } else {
3162                // In raw strings, everything including backslashes is literal
3163                value.push(self.advance());
3164            }
3165        }
3166
3167        if self.is_at_end() {
3168            return Err(Error::tokenize(
3169                "Unterminated raw string",
3170                self.line,
3171                self.column,
3172                self.start,
3173                self.current,
3174            ));
3175        }
3176
3177        self.advance(); // Closing quote
3178        Ok(value)
3179    }
3180
3181    /// Scan raw triple-quoted string content (r"""...""" or r'''...''')
3182    /// Terminates when three consecutive quote_chars are found
3183    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3184        let mut value = String::new();
3185
3186        while !self.is_at_end() {
3187            let c = self.peek();
3188            if c == quote_char && self.peek_next() == quote_char {
3189                // Check for third quote
3190                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3191                    // Found three consecutive quotes - end of string
3192                    self.advance(); // first closing quote
3193                    self.advance(); // second closing quote
3194                    self.advance(); // third closing quote
3195                    return Ok(value);
3196                }
3197            }
3198            // In raw strings, everything including backslashes is literal
3199            let ch = self.advance();
3200            value.push(ch);
3201        }
3202
3203        Err(Error::tokenize(
3204            "Unterminated raw triple-quoted string",
3205            self.line,
3206            self.column,
3207            self.start,
3208            self.current,
3209        ))
3210    }
3211
3212    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables)
3213    /// Examples: #temp, ##global_temp, @variable
3214    /// Scan an identifier that starts with `$` (ClickHouse).
3215    /// Examples: `$alias$name$`, `$x`
3216    fn scan_dollar_identifier(&mut self) -> Result<()> {
3217        // Consume the leading $
3218        self.advance();
3219
3220        // Consume alphanumeric, _, and $ continuation chars
3221        while !self.is_at_end() {
3222            let c = self.peek();
3223            if c.is_alphanumeric() || c == '_' || c == '$' {
3224                self.advance();
3225            } else {
3226                break;
3227            }
3228        }
3229
3230        let text = self.text_from_range(self.start, self.current);
3231        self.add_token_with_text(TokenType::Var, text);
3232        Ok(())
3233    }
3234
3235    fn scan_tsql_identifier(&mut self) -> Result<()> {
3236        // Consume the leading # or @ (or ##)
3237        let first = self.advance();
3238
3239        // For ##, consume the second #
3240        if first == '#' && self.peek() == '#' {
3241            self.advance();
3242        }
3243
3244        // Now scan the rest of the identifier
3245        while !self.is_at_end() {
3246            let c = self.peek();
3247            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3248                self.advance();
3249            } else {
3250                break;
3251            }
3252        }
3253
3254        let text = self.text_from_range(self.start, self.current);
3255        // These are always identifiers (variables or temp table names), never keywords
3256        self.add_token_with_text(TokenType::Var, text);
3257        Ok(())
3258    }
3259
3260    /// Check if the last tokens match INSERT ... FORMAT <name> (not VALUES).
3261    /// If so, consume everything until the next blank line (two consecutive newlines)
3262    /// or end of input as raw data.
3263    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
3264        let len = self.tokens.len();
3265        if len < 3 {
3266            return None;
3267        }
3268
3269        // Last token should be the format name (Identifier or Var, not VALUES)
3270        let last = &self.tokens[len - 1];
3271        if last.text.eq_ignore_ascii_case("VALUES") {
3272            return None;
3273        }
3274        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
3275            return None;
3276        }
3277
3278        // Second-to-last should be FORMAT
3279        let format_tok = &self.tokens[len - 2];
3280        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
3281            return None;
3282        }
3283
3284        // Check that there's an INSERT somewhere earlier in the tokens
3285        let has_insert = self.tokens[..len - 2]
3286            .iter()
3287            .rev()
3288            .take(20)
3289            .any(|t| t.token_type == TokenType::Insert);
3290        if !has_insert {
3291            return None;
3292        }
3293
3294        // We're in INSERT ... FORMAT <name> context. Consume everything until:
3295        // - A blank line (two consecutive newlines, possibly with whitespace between)
3296        // - End of input
3297        let raw_start = self.current;
3298        while !self.is_at_end() {
3299            let c = self.peek();
3300            if c == '\n' {
3301                // Check for blank line: \n followed by optional \r and \n
3302                let saved = self.current;
3303                self.advance(); // consume first \n
3304                                // Skip \r if present
3305                while !self.is_at_end() && self.peek() == '\r' {
3306                    self.advance();
3307                }
3308                if self.is_at_end() || self.peek() == '\n' {
3309                    // Found blank line or end of input - stop here
3310                    // Don't consume the second \n so subsequent SQL can be tokenized
3311                    let raw = self.text_from_range(raw_start, saved);
3312                    return Some(raw.trim().to_string());
3313                }
3314                // Not a blank line, continue scanning
3315            } else {
3316                self.advance();
3317            }
3318        }
3319
3320        // Reached end of input
3321        let raw = self.text_from_range(raw_start, self.current);
3322        let trimmed = raw.trim().to_string();
3323        if trimmed.is_empty() {
3324            None
3325        } else {
3326            Some(trimmed)
3327        }
3328    }
3329
3330    fn add_token(&mut self, token_type: TokenType) {
3331        let text = self.text_from_range(self.start, self.current);
3332        self.add_token_with_text(token_type, text);
3333    }
3334
3335    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3336        let span = Span::new(self.start, self.current, self.line, self.column);
3337        let mut token = Token::new(token_type, text, span);
3338        token.comments.append(&mut self.comments);
3339        self.tokens.push(token);
3340    }
3341}
3342
3343#[cfg(test)]
3344mod tests {
3345    use super::*;
3346
3347    #[test]
3348    fn test_simple_select() {
3349        let tokenizer = Tokenizer::default();
3350        let tokens = tokenizer.tokenize("SELECT 1").unwrap();
3351
3352        assert_eq!(tokens.len(), 2);
3353        assert_eq!(tokens[0].token_type, TokenType::Select);
3354        assert_eq!(tokens[1].token_type, TokenType::Number);
3355        assert_eq!(tokens[1].text, "1");
3356    }
3357
3358    #[test]
3359    fn test_select_with_identifier() {
3360        let tokenizer = Tokenizer::default();
3361        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();
3362
3363        assert_eq!(tokens.len(), 6);
3364        assert_eq!(tokens[0].token_type, TokenType::Select);
3365        assert_eq!(tokens[1].token_type, TokenType::Var);
3366        assert_eq!(tokens[1].text, "a");
3367        assert_eq!(tokens[2].token_type, TokenType::Comma);
3368        assert_eq!(tokens[3].token_type, TokenType::Var);
3369        assert_eq!(tokens[3].text, "b");
3370        assert_eq!(tokens[4].token_type, TokenType::From);
3371        assert_eq!(tokens[5].token_type, TokenType::Var);
3372        assert_eq!(tokens[5].text, "t");
3373    }
3374
3375    #[test]
3376    fn test_string_literal() {
3377        let tokenizer = Tokenizer::default();
3378        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();
3379
3380        assert_eq!(tokens.len(), 2);
3381        assert_eq!(tokens[1].token_type, TokenType::String);
3382        assert_eq!(tokens[1].text, "hello");
3383    }
3384
3385    #[test]
3386    fn test_escaped_string() {
3387        let tokenizer = Tokenizer::default();
3388        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();
3389
3390        assert_eq!(tokens.len(), 2);
3391        assert_eq!(tokens[1].token_type, TokenType::String);
3392        assert_eq!(tokens[1].text, "it's");
3393    }
3394
3395    #[test]
3396    fn test_terminal_backslash_quote_recovery() {
3397        let mut config = TokenizerConfig::default();
3398        config.string_escapes.push('\\');
3399        config.recover_terminal_backslash_quote = true;
3400        let tokenizer = Tokenizer::new(config);
3401        let tokens = tokenizer
3402            .tokenize("SHOW FUNCTIONS LIKE 'a\\' OR 1=1")
3403            .unwrap();
3404
3405        assert_eq!(tokens.len(), 8);
3406        assert_eq!(tokens[3].token_type, TokenType::String);
3407        assert_eq!(tokens[3].text, "a\\");
3408        assert_eq!(tokens[4].token_type, TokenType::Or);
3409    }
3410
3411    #[test]
3412    fn test_comments() {
3413        let tokenizer = Tokenizer::default();
3414        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();
3415
3416        assert_eq!(tokens.len(), 2);
3417        // Comments are attached to the PREVIOUS token as trailing_comments
3418        // This is better for round-trip fidelity (e.g., SELECT c /* comment */ FROM)
3419        assert_eq!(tokens[0].trailing_comments.len(), 1);
3420        assert_eq!(tokens[0].trailing_comments[0], " comment");
3421    }
3422
3423    #[test]
3424    fn test_comment_in_and_chain() {
3425        use crate::generator::Generator;
3426        use crate::parser::Parser;
3427
3428        // Line comments between AND clauses should appear after the AND operator
3429        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
3430        let ast = Parser::parse_sql(sql).unwrap();
3431        let mut gen = Generator::default();
3432        let output = gen.generate(&ast[0]).unwrap();
3433        assert_eq!(
3434            output,
3435            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
3436        );
3437    }
3438
3439    #[test]
3440    fn test_operators() {
3441        let tokenizer = Tokenizer::default();
3442        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();
3443
3444        assert_eq!(tokens.len(), 5);
3445        assert_eq!(tokens[0].token_type, TokenType::Number);
3446        assert_eq!(tokens[1].token_type, TokenType::Plus);
3447        assert_eq!(tokens[2].token_type, TokenType::Number);
3448        assert_eq!(tokens[3].token_type, TokenType::Star);
3449        assert_eq!(tokens[4].token_type, TokenType::Number);
3450    }
3451
3452    #[test]
3453    fn test_comparison_operators() {
3454        let tokenizer = Tokenizer::default();
3455        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();
3456
3457        assert_eq!(tokens[1].token_type, TokenType::Lte);
3458        assert_eq!(tokens[3].token_type, TokenType::Gte);
3459        assert_eq!(tokens[5].token_type, TokenType::Neq);
3460    }
3461
3462    #[test]
3463    fn test_national_string() {
3464        let tokenizer = Tokenizer::default();
3465        let tokens = tokenizer.tokenize("N'abc'").unwrap();
3466
3467        assert_eq!(
3468            tokens.len(),
3469            1,
3470            "Expected 1 token for N'abc', got {:?}",
3471            tokens
3472        );
3473        assert_eq!(tokens[0].token_type, TokenType::NationalString);
3474        assert_eq!(tokens[0].text, "abc");
3475    }
3476
3477    #[test]
3478    fn test_hex_string() {
3479        let tokenizer = Tokenizer::default();
3480        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();
3481
3482        assert_eq!(
3483            tokens.len(),
3484            1,
3485            "Expected 1 token for X'ABCD', got {:?}",
3486            tokens
3487        );
3488        assert_eq!(tokens[0].token_type, TokenType::HexString);
3489        assert_eq!(tokens[0].text, "ABCD");
3490    }
3491
3492    #[test]
3493    fn test_bit_string() {
3494        let tokenizer = Tokenizer::default();
3495        let tokens = tokenizer.tokenize("B'01010'").unwrap();
3496
3497        assert_eq!(
3498            tokens.len(),
3499            1,
3500            "Expected 1 token for B'01010', got {:?}",
3501            tokens
3502        );
3503        assert_eq!(tokens[0].token_type, TokenType::BitString);
3504        assert_eq!(tokens[0].text, "01010");
3505    }
3506
3507    #[test]
3508    fn test_trailing_dot_number() {
3509        let tokenizer = Tokenizer::default();
3510
3511        // Test trailing dot
3512        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
3513        assert_eq!(
3514            tokens.len(),
3515            2,
3516            "Expected 2 tokens for 'SELECT 1.', got {:?}",
3517            tokens
3518        );
3519        assert_eq!(tokens[1].token_type, TokenType::Number);
3520        assert_eq!(tokens[1].text, "1.");
3521
3522        // Test normal decimal
3523        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
3524        assert_eq!(tokens[1].text, "1.5");
3525
3526        // Test number followed by dot and identifier
3527        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
3528        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
3529        assert_eq!(
3530            tokens.len(),
3531            3,
3532            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
3533            tokens
3534        );
3535        assert_eq!(tokens[1].token_type, TokenType::Number);
3536        assert_eq!(tokens[1].text, "1.");
3537        assert_eq!(tokens[2].token_type, TokenType::Var);
3538
3539        // Test two dots (range operator) - dot is NOT consumed when followed by another dot
3540        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
3541        assert_eq!(tokens[1].token_type, TokenType::Number);
3542        assert_eq!(tokens[1].text, "1");
3543        assert_eq!(tokens[2].token_type, TokenType::Dot);
3544        assert_eq!(tokens[3].token_type, TokenType::Dot);
3545        assert_eq!(tokens[4].token_type, TokenType::Number);
3546        assert_eq!(tokens[4].text, "2");
3547    }
3548
3549    #[test]
3550    fn test_leading_dot_number() {
3551        let tokenizer = Tokenizer::default();
3552
3553        // Test leading dot number (e.g., .25 for 0.25)
3554        let tokens = tokenizer.tokenize(".25").unwrap();
3555        assert_eq!(
3556            tokens.len(),
3557            1,
3558            "Expected 1 token for '.25', got {:?}",
3559            tokens
3560        );
3561        assert_eq!(tokens[0].token_type, TokenType::Number);
3562        assert_eq!(tokens[0].text, ".25");
3563
3564        // Test leading dot in context (Oracle SAMPLE clause)
3565        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
3566        assert_eq!(
3567            tokens.len(),
3568            4,
3569            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
3570            tokens
3571        );
3572        assert_eq!(tokens[0].token_type, TokenType::Sample);
3573        assert_eq!(tokens[1].token_type, TokenType::LParen);
3574        assert_eq!(tokens[2].token_type, TokenType::Number);
3575        assert_eq!(tokens[2].text, ".25");
3576        assert_eq!(tokens[3].token_type, TokenType::RParen);
3577
3578        // Test leading dot with exponent
3579        let tokens = tokenizer.tokenize(".5e10").unwrap();
3580        assert_eq!(
3581            tokens.len(),
3582            1,
3583            "Expected 1 token for '.5e10', got {:?}",
3584            tokens
3585        );
3586        assert_eq!(tokens[0].token_type, TokenType::Number);
3587        assert_eq!(tokens[0].text, ".5e10");
3588
3589        // Test that plain dot is still a Dot token
3590        let tokens = tokenizer.tokenize("a.b").unwrap();
3591        assert_eq!(
3592            tokens.len(),
3593            3,
3594            "Expected 3 tokens for 'a.b', got {:?}",
3595            tokens
3596        );
3597        assert_eq!(tokens[1].token_type, TokenType::Dot);
3598    }
3599
3600    #[test]
3601    fn test_unrecognized_character() {
3602        let tokenizer = Tokenizer::default();
3603
3604        // Unicode curly quotes are now handled as string delimiters
3605        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
3606        assert!(
3607            result.is_ok(),
3608            "Curly quotes should be tokenized as strings"
3609        );
3610
3611        // Unicode bullet character should still error
3612        let result = tokenizer.tokenize("SELECT • FROM t");
3613        assert!(result.is_err());
3614    }
3615
3616    #[test]
3617    fn test_colon_eq_tokenization() {
3618        let tokenizer = Tokenizer::default();
3619
3620        // := should be a single ColonEq token
3621        let tokens = tokenizer.tokenize("a := 1").unwrap();
3622        assert_eq!(tokens.len(), 3);
3623        assert_eq!(tokens[0].token_type, TokenType::Var);
3624        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
3625        assert_eq!(tokens[2].token_type, TokenType::Number);
3626
3627        // : followed by non-= should still be Colon
3628        let tokens = tokenizer.tokenize("a:b").unwrap();
3629        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
3630        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
3631
3632        // :: should still be DColon
3633        let tokens = tokenizer.tokenize("a::INT").unwrap();
3634        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
3635    }
3636
3637    #[test]
3638    fn test_colon_eq_parsing() {
3639        use crate::generator::Generator;
3640        use crate::parser::Parser;
3641
3642        // MySQL @var := value in SELECT
3643        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
3644            .expect("Failed to parse MySQL @var := expr");
3645        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3646        assert_eq!(output, "SELECT @var1 := 1, @var2");
3647
3648        // MySQL @var := @var in SELECT
3649        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
3650            .expect("Failed to parse MySQL @var2 := @var1");
3651        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3652        assert_eq!(output, "SELECT @var1, @var2 := @var1");
3653
3654        // MySQL @var := COUNT(*)
3655        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
3656            .expect("Failed to parse MySQL @var := COUNT(*)");
3657        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3658        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");
3659
3660        // MySQL SET @var := 1 (should normalize to = in output)
3661        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
3662        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3663        assert_eq!(output, "SET @var1 = 1");
3664
3665        // Function named args with :=
3666        let ast =
3667            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
3668        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3669        assert_eq!(output, "UNION_VALUE(k1 := 1)");
3670
3671        // UNNEST with recursive := TRUE
3672        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
3673            .expect("Failed to parse UNNEST with :=");
3674        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3675        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");
3676
3677        // DuckDB prefix alias: foo: 1 means 1 AS foo
3678        let ast =
3679            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
3680        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3681        assert_eq!(output, "SELECT 1 AS foo");
3682
3683        // DuckDB prefix alias with multiple columns
3684        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
3685            .expect("Failed to parse DuckDB multiple prefix aliases");
3686        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3687        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
3688    }
3689
3690    #[test]
3691    fn test_colon_eq_dialect_roundtrip() {
3692        use crate::dialects::{Dialect, DialectType};
3693
3694        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
3695            let d = Dialect::get(dialect);
3696            let ast = d
3697                .parse(sql)
3698                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3699            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3700            let transformed = d
3701                .transform(ast[0].clone())
3702                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3703            let output = d
3704                .generate(&transformed)
3705                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3706            let expected = expected.unwrap_or(sql);
3707            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3708        }
3709
3710        // MySQL := tests
3711        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
3712        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
3713        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
3714        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));
3715
3716        // DuckDB := tests
3717        check(
3718            DialectType::DuckDB,
3719            "SELECT UNNEST(col, recursive := TRUE) FROM t",
3720            None,
3721        );
3722        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);
3723
3724        // STRUCT_PACK(a := 'b')::json should at least parse without error
3725        // (The STRUCT_PACK -> Struct transformation is a separate feature)
3726        {
3727            let d = Dialect::get(DialectType::DuckDB);
3728            let ast = d
3729                .parse("STRUCT_PACK(a := 'b')::json")
3730                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
3731            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
3732        }
3733
3734        // DuckDB prefix alias tests
3735        check(
3736            DialectType::DuckDB,
3737            "SELECT foo: 1",
3738            Some("SELECT 1 AS foo"),
3739        );
3740        check(
3741            DialectType::DuckDB,
3742            "SELECT foo: 1, bar: 2, baz: 3",
3743            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
3744        );
3745    }
3746
3747    #[test]
3748    fn test_comment_roundtrip() {
3749        use crate::generator::Generator;
3750        use crate::parser::Parser;
3751
3752        fn check_roundtrip(sql: &str) -> Option<String> {
3753            let ast = match Parser::parse_sql(sql) {
3754                Ok(a) => a,
3755                Err(e) => return Some(format!("Parse error: {:?}", e)),
3756            };
3757            if ast.is_empty() {
3758                return Some("Empty AST".to_string());
3759            }
3760            let mut generator = Generator::default();
3761            let output = match generator.generate(&ast[0]) {
3762                Ok(o) => o,
3763                Err(e) => return Some(format!("Gen error: {:?}", e)),
3764            };
3765            if output == sql {
3766                None
3767            } else {
3768                Some(format!(
3769                    "Mismatch:\n  input:  {}\n  output: {}",
3770                    sql, output
3771                ))
3772            }
3773        }
3774
3775        let tests = vec![
3776            // Nested comments are sanitized: inner /* and */ are escaped
3777            // These no longer round-trip exactly (by design, matches Python sqlglot)
3778            // "SELECT c /* c1 /* c2 */ c3 */",        // becomes /* c1 / * c2 * / c3 */
3779            // "SELECT c /* c1 /* c2 /* c3 */ */ */",   // becomes /* c1 / * c2 / * c3 * / * / */
3780            // Simple alias with comments
3781            "SELECT c /* c1 */ AS alias /* c2 */",
3782            // Multiple columns with comments
3783            "SELECT a /* x */, b /* x */",
3784            // Multiple comments after column
3785            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
3786            // FROM tables with comments
3787            "SELECT * FROM foo /* x */, bla /* x */",
3788            // Arithmetic with comments
3789            "SELECT 1 /* comment */ + 1",
3790            "SELECT 1 /* c1 */ + 2 /* c2 */",
3791            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
3792            // CAST with comments
3793            "SELECT CAST(x AS INT) /* comment */ FROM foo",
3794            // Function arguments with comments
3795            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
3796            // Multi-part table names with comments
3797            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
3798            // INSERT with comments
3799            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
3800            // Leading comments on statements
3801            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
3802            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
3803            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
3804            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
3805            "/* comment */ CREATE TABLE foo AS SELECT 1",
3806            // Trailing comments on statements
3807            "INSERT INTO foo SELECT * FROM bar /* comment */",
3808            // Complex nested expressions with comments
3809            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
3810        ];
3811
3812        let mut failures = Vec::new();
3813        for sql in tests {
3814            if let Some(e) = check_roundtrip(sql) {
3815                failures.push(e);
3816            }
3817        }
3818
3819        if !failures.is_empty() {
3820            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
3821        }
3822    }
3823
3824    #[test]
3825    fn test_dollar_quoted_string_parsing() {
3826        use crate::dialects::{Dialect, DialectType};
3827
3828        // Test dollar string token parsing utility function
3829        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
3830        assert_eq!(tag, Some("FOO".to_string()));
3831        assert_eq!(content, "content here");
3832
3833        let (tag, content) = super::parse_dollar_string_token("just content");
3834        assert_eq!(tag, None);
3835        assert_eq!(content, "just content");
3836
3837        // Test roundtrip for Databricks dialect with dollar-quoted function body
3838        fn check_databricks(sql: &str, expected: Option<&str>) {
3839            let d = Dialect::get(DialectType::Databricks);
3840            let ast = d
3841                .parse(sql)
3842                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3843            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3844            let transformed = d
3845                .transform(ast[0].clone())
3846                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3847            let output = d
3848                .generate(&transformed)
3849                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3850            let expected = expected.unwrap_or(sql);
3851            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3852        }
3853
3854        // Test [42]: $$...$$ heredoc
3855        check_databricks(
3856            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n  return x+1$$",
3857            None
3858        );
3859
3860        // Test [43]: $FOO$...$FOO$ tagged heredoc
3861        check_databricks(
3862            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n  return x+1$FOO$",
3863            None
3864        );
3865    }
3866
3867    #[test]
3868    fn test_numeric_underscore_stripping() {
3869        // Underscore stripping only happens when numbers_can_be_underscore_separated is true
3870        let mut config = TokenizerConfig::default();
3871        config.numbers_can_be_underscore_separated = true;
3872        let tokenizer = Tokenizer::new(config);
3873
3874        // Simple integer with underscores
3875        let tokens = tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
3876        assert_eq!(tokens[1].token_type, TokenType::Number);
3877        assert_eq!(tokens[1].text, "12345");
3878
3879        // Thousands separator
3880        let tokens = tokenizer.tokenize("SELECT 20_000").unwrap();
3881        assert_eq!(tokens[1].token_type, TokenType::Number);
3882        assert_eq!(tokens[1].text, "20000");
3883
3884        // Scientific notation with underscores
3885        let tokens = tokenizer.tokenize("SELECT 1_2E+1_0").unwrap();
3886        assert_eq!(tokens[1].token_type, TokenType::Number);
3887        assert_eq!(tokens[1].text, "12E+10");
3888
3889        // Default tokenizer should NOT strip underscores
3890        let default_tokenizer = Tokenizer::default();
3891        let tokens = default_tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
3892        assert_eq!(tokens[1].token_type, TokenType::Number);
3893        assert_eq!(tokens[1].text, "1_2_3_4_5");
3894    }
3895}
polyglot_sql/tokens.rs

polyglot_sql/
tokens.rs