// polyglot_sql/tokens.rs

//! Token types and tokenization for SQL parsing
//!
//! This module defines all SQL token types and the tokenizer that converts
//! SQL strings into token streams.

use crate::error::{Error, Result};
use serde::{Deserialize, Serialize};
use std::fmt;
9
/// Parse a `DollarString` token text into `(tag, content)`.
///
/// If the text contains a `'\x00'` separator, the part before the first
/// separator is the tag and the part after it is the content. Otherwise the
/// whole text is the content and there is no tag. Only the first `'\x00'`
/// splits, so the content may itself contain NUL bytes.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    // `split_once` expresses "split at the first occurrence" directly,
    // replacing the manual `find` + byte-slicing of the original.
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_string()), content.to_string()),
        None => (None, text.to_string()),
    }
}
22
23/// Represents a position in the source SQL
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
25pub struct Span {
26    /// Starting byte offset
27    pub start: usize,
28    /// Ending byte offset (exclusive)
29    pub end: usize,
30    /// Line number (1-based)
31    pub line: usize,
32    /// Column number (1-based)
33    pub column: usize,
34}
35
36impl Span {
37    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
38        Self {
39            start,
40            end,
41            line,
42            column,
43        }
44    }
45}
46
/// A token in the SQL token stream.
///
/// Carries the token kind, the raw source text, its source position, and any
/// comments attached to it by the tokenizer for round-trip fidelity.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The type of token
    pub token_type: TokenType,
    /// The raw text of the token
    pub text: String,
    /// Position information
    pub span: Span,
    /// Leading comments (comments that appeared before this token)
    #[serde(default)]
    pub comments: Vec<String>,
    /// Trailing comments (comments that appeared after this token, before the next one)
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
63
64impl Token {
65    /// Create a new token
66    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
67        Self {
68            token_type,
69            text: text.into(),
70            span,
71            comments: Vec::new(),
72            trailing_comments: Vec::new(),
73        }
74    }
75
76    /// Create a NUMBER token
77    pub fn number(n: i64) -> Self {
78        Self::new(TokenType::Number, n.to_string(), Span::default())
79    }
80
81    /// Create a STRING token
82    pub fn string(s: impl Into<String>) -> Self {
83        Self::new(TokenType::String, s, Span::default())
84    }
85
86    /// Create an IDENTIFIER token
87    pub fn identifier(s: impl Into<String>) -> Self {
88        Self::new(TokenType::Identifier, s, Span::default())
89    }
90
91    /// Create a VAR token
92    pub fn var(s: impl Into<String>) -> Self {
93        Self::new(TokenType::Var, s, Span::default())
94    }
95
96    /// Add a comment to this token
97    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
98        self.comments.push(comment.into());
99        self
100    }
101}
102
103impl fmt::Display for Token {
104    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
105        write!(f, "{:?}({})", self.token_type, self.text)
106    }
107}
108
/// All possible token types in SQL
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
// NOTE(review): `repr(u16)` fixes the discriminant size, and variant order
// determines the discriminant values — do not reorder or insert variants
// mid-list if discriminants are relied on anywhere.
#[repr(u16)]
pub enum TokenType {
    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt, // <<
    GtGt, // >>
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments (emitted as tokens for round-trip fidelity)
    BlockComment, // /* ... */
    LineComment,  // -- ...

    // Literals
    String,
    DollarString,             // $$...$$
    TripleDoubleQuotedString, // """..."""
    TripleSingleQuotedString, // '''...'''
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    /// Hex number: 0xA, 0xFF (BigQuery, SQLite style) - represents an integer in hex notation
    HexNumber,
    ByteString,
    NationalString,
    EscapeString, // PostgreSQL E'...' escape string
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data Types
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,   // !~~ operator (PostgreSQL)
    NotILike,  // !~~* operator (PostgreSQL)
    NotRLike,  // !~ operator (PostgreSQL)
    NotIRLike, // !~* operator (PostgreSQL)
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window function keywords
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // DDL-specific keywords (Phase 4)
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    // MATCH_RECOGNIZE tokens
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // Special
    /// End of input.
    Eof,
}
665
666impl TokenType {
667    /// Check if this token type is a keyword that can be used as an identifier in certain contexts
668    pub fn is_keyword(&self) -> bool {
669        matches!(
670            self,
671            TokenType::Select
672                | TokenType::From
673                | TokenType::Where
674                | TokenType::And
675                | TokenType::Or
676                | TokenType::Not
677                | TokenType::In
678                | TokenType::Is
679                | TokenType::Null
680                | TokenType::True
681                | TokenType::False
682                | TokenType::As
683                | TokenType::On
684                | TokenType::Join
685                | TokenType::Left
686                | TokenType::Right
687                | TokenType::Inner
688                | TokenType::Outer
689                | TokenType::Full
690                | TokenType::Cross
691                | TokenType::Semi
692                | TokenType::Anti
693                | TokenType::Union
694                | TokenType::Except
695                | TokenType::Intersect
696                | TokenType::GroupBy
697                | TokenType::OrderBy
698                | TokenType::Having
699                | TokenType::Limit
700                | TokenType::Offset
701                | TokenType::Case
702                | TokenType::When
703                | TokenType::Then
704                | TokenType::Else
705                | TokenType::End
706                | TokenType::Create
707                | TokenType::Drop
708                | TokenType::Alter
709                | TokenType::Insert
710                | TokenType::Update
711                | TokenType::Delete
712                | TokenType::Into
713                | TokenType::Values
714                | TokenType::Set
715                | TokenType::With
716                | TokenType::Distinct
717                | TokenType::All
718                | TokenType::Exists
719                | TokenType::Between
720                | TokenType::Like
721                | TokenType::ILike
722                // Additional keywords that can be used as identifiers
723                | TokenType::Filter
724                | TokenType::Date
725                | TokenType::Timestamp
726                | TokenType::TimestampTz
727                | TokenType::Interval
728                | TokenType::Time
729                | TokenType::Table
730                | TokenType::Index
731                | TokenType::Column
732                | TokenType::Database
733                | TokenType::Schema
734                | TokenType::View
735                | TokenType::Function
736                | TokenType::Procedure
737                | TokenType::Trigger
738                | TokenType::Sequence
739                | TokenType::Over
740                | TokenType::Partition
741                | TokenType::Window
742                | TokenType::Rows
743                | TokenType::Range
744                | TokenType::First
745                | TokenType::Last
746                | TokenType::Preceding
747                | TokenType::Following
748                | TokenType::Current
749                | TokenType::Row
750                | TokenType::Unbounded
751                | TokenType::Array
752                | TokenType::Struct
753                | TokenType::Map
754                | TokenType::PrimaryKey
755                | TokenType::Key
756                | TokenType::ForeignKey
757                | TokenType::References
758                | TokenType::Unique
759                | TokenType::Check
760                | TokenType::Default
761                | TokenType::Constraint
762                | TokenType::Comment
763                | TokenType::Rollup
764                | TokenType::Cube
765                | TokenType::Grant
766                | TokenType::Revoke
767                | TokenType::Type
768                | TokenType::Use
769                | TokenType::Cache
770                | TokenType::Uncache
771                | TokenType::Load
772                | TokenType::Any
773                | TokenType::Some
774                | TokenType::Asc
775                | TokenType::Desc
776                | TokenType::Nulls
777                | TokenType::Lateral
778                | TokenType::Natural
779                | TokenType::Escape
780                | TokenType::Glob
781                | TokenType::Match
782                | TokenType::Recursive
783                | TokenType::Replace
784                | TokenType::Returns
785                | TokenType::If
786                | TokenType::Pivot
787                | TokenType::Unpivot
788                | TokenType::Json
789                | TokenType::Blob
790                | TokenType::Text
791                | TokenType::Int
792                | TokenType::BigInt
793                | TokenType::SmallInt
794                | TokenType::TinyInt
795                | TokenType::Int128
796                | TokenType::UInt128
797                | TokenType::Int256
798                | TokenType::UInt256
799                | TokenType::UInt
800                | TokenType::UBigInt
801                | TokenType::Float
802                | TokenType::Double
803                | TokenType::Decimal
804                | TokenType::Boolean
805                | TokenType::VarChar
806                | TokenType::Char
807                | TokenType::Binary
808                | TokenType::VarBinary
809                | TokenType::No
810                | TokenType::DateTime
811                | TokenType::Truncate
812                | TokenType::Execute
813                | TokenType::Merge
814                | TokenType::Top
815                | TokenType::Begin
816                | TokenType::Generated
817                | TokenType::Identity
818                | TokenType::Always
819                | TokenType::Extract
820                // Keywords that can be identifiers in certain contexts
821                | TokenType::AsOf
822                | TokenType::Prior
823                | TokenType::After
824                | TokenType::Restrict
825                | TokenType::Cascade
826                | TokenType::Local
827                | TokenType::Rename
828                | TokenType::Enum
829                | TokenType::Within
830                | TokenType::Format
831                | TokenType::Final
832                | TokenType::FileFormat
833                | TokenType::Input
834                | TokenType::InputFormat
835                | TokenType::Copy
836                | TokenType::Put
837                | TokenType::Get
838                | TokenType::Show
839                | TokenType::Serde
840                | TokenType::Sample
841                | TokenType::Sort
842                | TokenType::Collate
843                | TokenType::Ties
844                | TokenType::IsNull
845                | TokenType::NotNull
846                | TokenType::Exclude
847                | TokenType::Temporary
848                | TokenType::Add
849                | TokenType::Ordinality
850                | TokenType::Overlaps
851                | TokenType::Block
852                | TokenType::Pattern
853                | TokenType::Group
854                | TokenType::Cluster
855                | TokenType::Repeatable
856                | TokenType::Groups
857                | TokenType::Commit
858                | TokenType::Warehouse
859                | TokenType::System
860                | TokenType::By
861                | TokenType::To
862                | TokenType::Fetch
863                | TokenType::For
864                | TokenType::Only
865                | TokenType::Next
866                | TokenType::Lock
867                | TokenType::Refresh
868                | TokenType::Settings
869                | TokenType::Operator
870                | TokenType::Overwrite
871                | TokenType::StraightJoin
872                | TokenType::Start
873                // Additional keywords registered in tokenizer but previously missing from is_keyword()
874                | TokenType::Ignore
875                | TokenType::Domain
876                | TokenType::Apply
877                | TokenType::Respect
878                | TokenType::Materialized
879                | TokenType::Prewhere
880                | TokenType::Old
881                | TokenType::New
882                | TokenType::Cast
883                | TokenType::TryCast
884                | TokenType::SafeCast
885                | TokenType::Transaction
886                | TokenType::Describe
887                | TokenType::Kill
888                | TokenType::Lambda
889                | TokenType::Declare
890                | TokenType::Keep
891                | TokenType::Output
892                | TokenType::Percent
893                | TokenType::Qualify
894                | TokenType::Returning
895                | TokenType::Language
896                | TokenType::Preserve
897                | TokenType::Savepoint
898                | TokenType::Rollback
899                | TokenType::Body
900                | TokenType::Increment
901                | TokenType::Minvalue
902                | TokenType::Maxvalue
903                | TokenType::Cycle
904                | TokenType::NoCycle
905                | TokenType::Seed
906                | TokenType::Namespace
907                | TokenType::Authorization
908                | TokenType::Order
909                | TokenType::Restart
910                | TokenType::Before
911                | TokenType::Instead
912                | TokenType::Each
913                | TokenType::Statement
914                | TokenType::Referencing
915                | TokenType::Of
916                | TokenType::Separator
917                | TokenType::Others
918                | TokenType::Placing
919                | TokenType::Owned
920                | TokenType::Running
921                | TokenType::Define
922                | TokenType::Measures
923                | TokenType::MatchRecognize
924                | TokenType::AutoIncrement
925                | TokenType::Connect
926                | TokenType::Distribute
927                | TokenType::Bernoulli
928                | TokenType::TableSample
929                | TokenType::Inpath
930                | TokenType::Pragma
931                | TokenType::Siblings
932                | TokenType::SerdeProperties
933                | TokenType::RLike
934        )
935    }
936
937    /// Check if this token type is a comparison operator
938    pub fn is_comparison(&self) -> bool {
939        matches!(
940            self,
941            TokenType::Eq
942                | TokenType::Neq
943                | TokenType::Lt
944                | TokenType::Lte
945                | TokenType::Gt
946                | TokenType::Gte
947                | TokenType::NullsafeEq
948        )
949    }
950
951    /// Check if this token type is an arithmetic operator
952    pub fn is_arithmetic(&self) -> bool {
953        matches!(
954            self,
955            TokenType::Plus
956                | TokenType::Dash
957                | TokenType::Star
958                | TokenType::Slash
959                | TokenType::Percent
960                | TokenType::Mod
961                | TokenType::Div
962        )
963    }
964}
965
966impl fmt::Display for TokenType {
967    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
968        write!(f, "{:?}", self)
969    }
970}
971
/// Tokenizer configuration for a dialect
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keywords mapping (uppercase keyword -> token type)
    pub keywords: std::collections::HashMap<String, TokenType>,
    /// Single character tokens
    pub single_tokens: std::collections::HashMap<char, TokenType>,
    /// Quote delimiters (start -> end); `String`-keyed, so delimiters may be multi-character
    pub quotes: std::collections::HashMap<String, String>,
    /// Identifier quote characters (start -> end)
    pub identifiers: std::collections::HashMap<char, char>,
    /// Comment definitions (start -> optional end; `None` presumably means line comment — TODO confirm against tokenizer)
    pub comments: std::collections::HashMap<String, Option<String>>,
    /// String escape characters
    pub string_escapes: Vec<char>,
    /// Whether to support nested comments
    pub nested_comments: bool,
    /// Valid escape follow characters (for MySQL-style escaping).
    /// When a backslash is followed by a character NOT in this list,
    /// the backslash is discarded. When empty, all backslash escapes
    /// preserve the backslash for unrecognized sequences.
    pub escape_follow_chars: Vec<char>,
    /// Whether b'...' is a byte string (true for BigQuery) or bit string (false for standard SQL).
    /// Default is false (bit string).
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}
    /// Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT)
    pub numeric_literals: std::collections::HashMap<String, String>,
    /// Whether unquoted identifiers can start with a digit (e.g., `1a`, `1_a`).
    /// When true, a number followed by letters/underscore is treated as an identifier.
    /// Used by Hive, Spark, MySQL, ClickHouse.
    pub identifiers_can_start_with_digit: bool,
    /// Whether 0x/0X prefix should be treated as hex literals.
    /// When true, `0XCC` is tokenized instead of Number("0") + Identifier("XCC").
    /// Used by BigQuery, SQLite, Teradata.
    pub hex_number_strings: bool,
    /// Whether hex string literals from 0x prefix represent integer values.
    /// When true (BigQuery), 0xA is tokenized as HexNumber (integer in hex notation).
    /// When false (SQLite, Teradata), 0xCC is tokenized as HexString (binary/blob value).
    pub hex_string_is_integer_type: bool,
    /// Whether string escape sequences (like \') are allowed in raw strings.
    /// When true (BigQuery default), \' inside r'...' escapes the quote.
    /// When false (Spark/Databricks), backslashes in raw strings are always literal.
    /// Python sqlglot: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True)
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether # starts a single-line comment (ClickHouse, MySQL)
    pub hash_comments: bool,
    /// Whether $ can start/continue an identifier (ClickHouse).
    /// When true, a bare `$` that is not part of a dollar-quoted string or positional
    /// parameter is treated as an identifier character.
    pub dollar_sign_is_identifier: bool,
    /// Whether INSERT ... FORMAT <name> should treat subsequent data as raw (ClickHouse).
    /// When true, after tokenizing `INSERT ... FORMAT <non-VALUES-name>`, all text until
    /// the next blank line or end of input is consumed as a raw data token.
    pub insert_format_raw_data: bool,
}
1028
1029impl Default for TokenizerConfig {
1030    fn default() -> Self {
1031        let mut keywords = std::collections::HashMap::new();
1032        // Add basic SQL keywords
1033        keywords.insert("SELECT".to_string(), TokenType::Select);
1034        keywords.insert("FROM".to_string(), TokenType::From);
1035        keywords.insert("WHERE".to_string(), TokenType::Where);
1036        keywords.insert("AND".to_string(), TokenType::And);
1037        keywords.insert("OR".to_string(), TokenType::Or);
1038        keywords.insert("NOT".to_string(), TokenType::Not);
1039        keywords.insert("AS".to_string(), TokenType::As);
1040        keywords.insert("ON".to_string(), TokenType::On);
1041        keywords.insert("JOIN".to_string(), TokenType::Join);
1042        keywords.insert("LEFT".to_string(), TokenType::Left);
1043        keywords.insert("RIGHT".to_string(), TokenType::Right);
1044        keywords.insert("INNER".to_string(), TokenType::Inner);
1045        keywords.insert("OUTER".to_string(), TokenType::Outer);
1046        keywords.insert("OUTPUT".to_string(), TokenType::Output);
1047        keywords.insert("FULL".to_string(), TokenType::Full);
1048        keywords.insert("CROSS".to_string(), TokenType::Cross);
1049        keywords.insert("SEMI".to_string(), TokenType::Semi);
1050        keywords.insert("ANTI".to_string(), TokenType::Anti);
1051        keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1052        keywords.insert("UNION".to_string(), TokenType::Union);
1053        keywords.insert("EXCEPT".to_string(), TokenType::Except);
1054        keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle/Redshift alias for EXCEPT
1055        keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1056        keywords.insert("GROUP".to_string(), TokenType::Group);
1057        keywords.insert("CUBE".to_string(), TokenType::Cube);
1058        keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1059        keywords.insert("WITHIN".to_string(), TokenType::Within);
1060        keywords.insert("ORDER".to_string(), TokenType::Order);
1061        keywords.insert("BY".to_string(), TokenType::By);
1062        keywords.insert("HAVING".to_string(), TokenType::Having);
1063        keywords.insert("LIMIT".to_string(), TokenType::Limit);
1064        keywords.insert("OFFSET".to_string(), TokenType::Offset);
1065        keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1066        keywords.insert("FETCH".to_string(), TokenType::Fetch);
1067        keywords.insert("FIRST".to_string(), TokenType::First);
1068        keywords.insert("NEXT".to_string(), TokenType::Next);
1069        keywords.insert("ONLY".to_string(), TokenType::Only);
1070        keywords.insert("KEEP".to_string(), TokenType::Keep);
1071        keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1072        keywords.insert("INPUT".to_string(), TokenType::Input);
1073        keywords.insert("CASE".to_string(), TokenType::Case);
1074        keywords.insert("WHEN".to_string(), TokenType::When);
1075        keywords.insert("THEN".to_string(), TokenType::Then);
1076        keywords.insert("ELSE".to_string(), TokenType::Else);
1077        keywords.insert("END".to_string(), TokenType::End);
1078        keywords.insert("ENDIF".to_string(), TokenType::End); // Exasol alias for END
1079        keywords.insert("NULL".to_string(), TokenType::Null);
1080        keywords.insert("TRUE".to_string(), TokenType::True);
1081        keywords.insert("FALSE".to_string(), TokenType::False);
1082        keywords.insert("IS".to_string(), TokenType::Is);
1083        keywords.insert("IN".to_string(), TokenType::In);
1084        keywords.insert("BETWEEN".to_string(), TokenType::Between);
1085        keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1086        keywords.insert("LIKE".to_string(), TokenType::Like);
1087        keywords.insert("ILIKE".to_string(), TokenType::ILike);
1088        keywords.insert("RLIKE".to_string(), TokenType::RLike);
1089        keywords.insert("REGEXP".to_string(), TokenType::RLike);
1090        keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1091        keywords.insert("EXISTS".to_string(), TokenType::Exists);
1092        keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1093        keywords.insert("ALL".to_string(), TokenType::All);
1094        keywords.insert("WITH".to_string(), TokenType::With);
1095        keywords.insert("CREATE".to_string(), TokenType::Create);
1096        keywords.insert("DROP".to_string(), TokenType::Drop);
1097        keywords.insert("ALTER".to_string(), TokenType::Alter);
1098        keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1099        keywords.insert("TABLE".to_string(), TokenType::Table);
1100        keywords.insert("VIEW".to_string(), TokenType::View);
1101        keywords.insert("INDEX".to_string(), TokenType::Index);
1102        keywords.insert("COLUMN".to_string(), TokenType::Column);
1103        keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1104        keywords.insert("ADD".to_string(), TokenType::Add);
1105        keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1106        keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1107        keywords.insert("RENAME".to_string(), TokenType::Rename);
1108        keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1109        keywords.insert("TEMP".to_string(), TokenType::Temporary);
1110        keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1111        keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1112        keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1113        keywords.insert("KEY".to_string(), TokenType::Key);
1114        keywords.insert("KILL".to_string(), TokenType::Kill);
1115        keywords.insert("REFERENCES".to_string(), TokenType::References);
1116        keywords.insert("DEFAULT".to_string(), TokenType::Default);
1117        keywords.insert("DECLARE".to_string(), TokenType::Declare);
1118        keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1119        keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); // Snowflake style
1120        keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1121        keywords.insert("REPLACE".to_string(), TokenType::Replace);
1122        keywords.insert("TO".to_string(), TokenType::To);
1123        keywords.insert("INSERT".to_string(), TokenType::Insert);
1124        keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1125        keywords.insert("UPDATE".to_string(), TokenType::Update);
1126        keywords.insert("USE".to_string(), TokenType::Use);
1127        keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1128        keywords.insert("GLOB".to_string(), TokenType::Glob);
1129        keywords.insert("DELETE".to_string(), TokenType::Delete);
1130        keywords.insert("MERGE".to_string(), TokenType::Merge);
1131        keywords.insert("CACHE".to_string(), TokenType::Cache);
1132        keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1133        keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1134        keywords.insert("GRANT".to_string(), TokenType::Grant);
1135        keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1136        keywords.insert("COMMENT".to_string(), TokenType::Comment);
1137        keywords.insert("COLLATE".to_string(), TokenType::Collate);
1138        keywords.insert("INTO".to_string(), TokenType::Into);
1139        keywords.insert("VALUES".to_string(), TokenType::Values);
1140        keywords.insert("SET".to_string(), TokenType::Set);
1141        keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1142        keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1143        keywords.insert("ASC".to_string(), TokenType::Asc);
1144        keywords.insert("DESC".to_string(), TokenType::Desc);
1145        keywords.insert("NULLS".to_string(), TokenType::Nulls);
1146        keywords.insert("RESPECT".to_string(), TokenType::Respect);
1147        keywords.insert("FIRST".to_string(), TokenType::First);
1148        keywords.insert("LAST".to_string(), TokenType::Last);
1149        keywords.insert("IF".to_string(), TokenType::If);
1150        keywords.insert("CAST".to_string(), TokenType::Cast);
1151        keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1152        keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1153        keywords.insert("OVER".to_string(), TokenType::Over);
1154        keywords.insert("PARTITION".to_string(), TokenType::Partition);
1155        keywords.insert("PLACING".to_string(), TokenType::Placing);
1156        keywords.insert("WINDOW".to_string(), TokenType::Window);
1157        keywords.insert("ROWS".to_string(), TokenType::Rows);
1158        keywords.insert("RANGE".to_string(), TokenType::Range);
1159        keywords.insert("FILTER".to_string(), TokenType::Filter);
1160        keywords.insert("NATURAL".to_string(), TokenType::Natural);
1161        keywords.insert("USING".to_string(), TokenType::Using);
1162        keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1163        keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1164        keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1165        keywords.insert("CURRENT".to_string(), TokenType::Current);
1166        keywords.insert("ROW".to_string(), TokenType::Row);
1167        keywords.insert("GROUPS".to_string(), TokenType::Groups);
1168        keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1169        // TRIM function position keywords
1170        keywords.insert("BOTH".to_string(), TokenType::Both);
1171        keywords.insert("LEADING".to_string(), TokenType::Leading);
1172        keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1173        keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1174        // Phase 3: Additional keywords
1175        keywords.insert("TOP".to_string(), TokenType::Top);
1176        keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1177        keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1178        keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1179        keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1180        keywords.insert("SYSTEM".to_string(), TokenType::System);
1181        keywords.insert("BLOCK".to_string(), TokenType::Block);
1182        keywords.insert("SEED".to_string(), TokenType::Seed);
1183        keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1184        keywords.insert("TIES".to_string(), TokenType::Ties);
1185        keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1186        keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1187        keywords.insert("APPLY".to_string(), TokenType::Apply);
1188        // Oracle CONNECT BY keywords
1189        keywords.insert("CONNECT".to_string(), TokenType::Connect);
1190        // Hive/Spark specific keywords
1191        keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1192        keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1193        keywords.insert("SORT".to_string(), TokenType::Sort);
1194        keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1195        keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1196        keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1197        keywords.insert("FOR".to_string(), TokenType::For);
1198        keywords.insert("ANY".to_string(), TokenType::Any);
1199        keywords.insert("SOME".to_string(), TokenType::Some);
1200        keywords.insert("ASOF".to_string(), TokenType::AsOf);
1201        keywords.insert("PERCENT".to_string(), TokenType::Percent);
1202        keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1203        keywords.insert("NO".to_string(), TokenType::No);
1204        keywords.insert("OTHERS".to_string(), TokenType::Others);
1205        // PostgreSQL OPERATOR() syntax for schema-qualified operators
1206        keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1207        // Phase 4: DDL keywords
1208        keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1209        keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1210        keywords.insert("DATABASE".to_string(), TokenType::Database);
1211        keywords.insert("FUNCTION".to_string(), TokenType::Function);
1212        keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1213        keywords.insert("PROC".to_string(), TokenType::Procedure);
1214        keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1215        keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1216        keywords.insert("TYPE".to_string(), TokenType::Type);
1217        keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1218        keywords.insert("RETURNS".to_string(), TokenType::Returns);
1219        keywords.insert("RETURNING".to_string(), TokenType::Returning);
1220        keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1221        keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1222        keywords.insert("COMMIT".to_string(), TokenType::Commit);
1223        keywords.insert("BEGIN".to_string(), TokenType::Begin);
1224        keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1225        keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1226        keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1227        keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1228        keywords.insert("BODY".to_string(), TokenType::Body);
1229        keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1230        keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1231        keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1232        keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1233        keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1234        keywords.insert("PRIOR".to_string(), TokenType::Prior);
1235        // MATCH_RECOGNIZE keywords
1236        keywords.insert("MATCH".to_string(), TokenType::Match);
1237        keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1238        keywords.insert("MEASURES".to_string(), TokenType::Measures);
1239        keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1240        keywords.insert("DEFINE".to_string(), TokenType::Define);
1241        keywords.insert("RUNNING".to_string(), TokenType::Running);
1242        keywords.insert("FINAL".to_string(), TokenType::Final);
1243        keywords.insert("OWNED".to_string(), TokenType::Owned);
1244        keywords.insert("AFTER".to_string(), TokenType::After);
1245        keywords.insert("BEFORE".to_string(), TokenType::Before);
1246        keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1247        keywords.insert("EACH".to_string(), TokenType::Each);
1248        keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1249        keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1250        keywords.insert("OLD".to_string(), TokenType::Old);
1251        keywords.insert("NEW".to_string(), TokenType::New);
1252        keywords.insert("OF".to_string(), TokenType::Of);
1253        keywords.insert("CHECK".to_string(), TokenType::Check);
1254        keywords.insert("START".to_string(), TokenType::Start);
1255        keywords.insert("ENUM".to_string(), TokenType::Enum);
1256        keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1257        keywords.insert("RESTART".to_string(), TokenType::Restart);
1258        // Date/time literal keywords
1259        keywords.insert("DATE".to_string(), TokenType::Date);
1260        keywords.insert("TIME".to_string(), TokenType::Time);
1261        keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1262        keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1263        keywords.insert("GENERATED".to_string(), TokenType::Generated);
1264        keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1265        keywords.insert("ALWAYS".to_string(), TokenType::Always);
1266        // LOAD DATA keywords
1267        keywords.insert("LOAD".to_string(), TokenType::Load);
1268        keywords.insert("LOCAL".to_string(), TokenType::Local);
1269        keywords.insert("INPATH".to_string(), TokenType::Inpath);
1270        keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1271        keywords.insert("SERDE".to_string(), TokenType::Serde);
1272        keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1273        keywords.insert("FORMAT".to_string(), TokenType::Format);
1274        // SQLite
1275        keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1276        // SHOW statement
1277        keywords.insert("SHOW".to_string(), TokenType::Show);
1278        // Oracle ORDER SIBLINGS BY (hierarchical queries)
1279        keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1280        // COPY and PUT statements (Snowflake, PostgreSQL)
1281        keywords.insert("COPY".to_string(), TokenType::Copy);
1282        keywords.insert("PUT".to_string(), TokenType::Put);
1283        keywords.insert("GET".to_string(), TokenType::Get);
1284        // EXEC/EXECUTE statement (TSQL, etc.)
1285        keywords.insert("EXEC".to_string(), TokenType::Execute);
1286        keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1287        // Postfix null check operators (PostgreSQL/SQLite)
1288        keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1289        keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1290
1291        let mut single_tokens = std::collections::HashMap::new();
1292        single_tokens.insert('(', TokenType::LParen);
1293        single_tokens.insert(')', TokenType::RParen);
1294        single_tokens.insert('[', TokenType::LBracket);
1295        single_tokens.insert(']', TokenType::RBracket);
1296        single_tokens.insert('{', TokenType::LBrace);
1297        single_tokens.insert('}', TokenType::RBrace);
1298        single_tokens.insert(',', TokenType::Comma);
1299        single_tokens.insert('.', TokenType::Dot);
1300        single_tokens.insert(';', TokenType::Semicolon);
1301        single_tokens.insert('+', TokenType::Plus);
1302        single_tokens.insert('-', TokenType::Dash);
1303        single_tokens.insert('*', TokenType::Star);
1304        single_tokens.insert('/', TokenType::Slash);
1305        single_tokens.insert('%', TokenType::Percent);
1306        single_tokens.insert('&', TokenType::Amp);
1307        single_tokens.insert('|', TokenType::Pipe);
1308        single_tokens.insert('^', TokenType::Caret);
1309        single_tokens.insert('~', TokenType::Tilde);
1310        single_tokens.insert('<', TokenType::Lt);
1311        single_tokens.insert('>', TokenType::Gt);
1312        single_tokens.insert('=', TokenType::Eq);
1313        single_tokens.insert('!', TokenType::Exclamation);
1314        single_tokens.insert(':', TokenType::Colon);
1315        single_tokens.insert('@', TokenType::DAt);
1316        single_tokens.insert('#', TokenType::Hash);
1317        single_tokens.insert('$', TokenType::Dollar);
1318        single_tokens.insert('?', TokenType::Parameter);
1319
1320        let mut quotes = std::collections::HashMap::new();
1321        quotes.insert("'".to_string(), "'".to_string());
1322        // Triple-quoted strings (e.g., """x""")
1323        quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());
1324
1325        let mut identifiers = std::collections::HashMap::new();
1326        identifiers.insert('"', '"');
1327        identifiers.insert('`', '`');
1328        // Note: TSQL bracket-quoted identifiers [name] are handled in the parser
1329        // because [ is also used for arrays and subscripts
1330
1331        let mut comments = std::collections::HashMap::new();
1332        comments.insert("--".to_string(), None);
1333        comments.insert("/*".to_string(), Some("*/".to_string()));
1334
1335        Self {
1336            keywords,
1337            single_tokens,
1338            quotes,
1339            identifiers,
1340            comments,
1341            // Standard SQL: only '' (doubled quote) escapes a quote
1342            // Backslash escapes are dialect-specific (MySQL, etc.)
1343            string_escapes: vec!['\''],
1344            nested_comments: true,
1345            // By default, no escape_follow_chars means preserve backslash for unrecognized escapes
1346            escape_follow_chars: vec![],
1347            // Default: b'...' is bit string (standard SQL), not byte string (BigQuery)
1348            b_prefix_is_byte_string: false,
1349            numeric_literals: std::collections::HashMap::new(),
1350            identifiers_can_start_with_digit: false,
1351            hex_number_strings: false,
1352            hex_string_is_integer_type: false,
1353            // Default: backslash escapes ARE allowed in raw strings (sqlglot default)
1354            // Spark/Databricks set this to false
1355            string_escapes_allowed_in_raw_strings: true,
1356            hash_comments: false,
1357            dollar_sign_is_identifier: false,
1358            insert_format_raw_data: false,
1359        }
1360    }
1361}
1362
/// SQL Tokenizer
///
/// Thin wrapper around a [`TokenizerConfig`]; converts SQL text into a
/// stream of [`Token`]s via [`Tokenizer::tokenize`].
pub struct Tokenizer {
    // Dialect-specific settings: keyword map, single-char tokens,
    // quote/identifier/comment syntax, escape behavior, etc.
    config: TokenizerConfig,
}
1367
1368impl Tokenizer {
1369    /// Create a new tokenizer with the given configuration
1370    pub fn new(config: TokenizerConfig) -> Self {
1371        Self { config }
1372    }
1373
1374    /// Create a tokenizer with default configuration
1375    pub fn default_config() -> Self {
1376        Self::new(TokenizerConfig::default())
1377    }
1378
1379    /// Tokenize a SQL string
1380    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1381        let mut state = TokenizerState::new(sql, &self.config);
1382        state.tokenize()
1383    }
1384}
1385
1386impl Default for Tokenizer {
1387    fn default() -> Self {
1388        Self::default_config()
1389    }
1390}
1391
/// Internal state for tokenization
struct TokenizerState<'a> {
    /// Input SQL decomposed into chars for O(1) indexed access
    chars: Vec<char>,
    /// Total number of chars in `chars`
    size: usize,
    /// Tokens produced so far
    tokens: Vec<Token>,
    /// Index (into `chars`) where the token currently being scanned began
    start: usize,
    /// Index of the next char to examine
    current: usize,
    /// Current line number (1-based)
    line: usize,
    /// Current column number (1-based)
    column: usize,
    /// Leading comments buffered for attachment to the next token
    comments: Vec<String>,
    /// Tokenizer configuration (keywords, quotes, comment syntax, ...)
    config: &'a TokenizerConfig,
}
1404
1405impl<'a> TokenizerState<'a> {
1406    fn new(sql: &str, config: &'a TokenizerConfig) -> Self {
1407        let chars: Vec<char> = sql.chars().collect();
1408        let size = chars.len();
1409        Self {
1410            chars,
1411            size,
1412            tokens: Vec::new(),
1413            start: 0,
1414            current: 0,
1415            line: 1,
1416            column: 1,
1417            comments: Vec::new(),
1418            config,
1419        }
1420    }
1421
1422    fn tokenize(&mut self) -> Result<Vec<Token>> {
1423        while !self.is_at_end() {
1424            self.skip_whitespace();
1425            if self.is_at_end() {
1426                break;
1427            }
1428
1429            self.start = self.current;
1430            self.scan_token()?;
1431
1432            // ClickHouse: After INSERT ... FORMAT <name> (where name != VALUES),
1433            // the rest until the next blank line or end of input is raw data.
1434            if self.config.insert_format_raw_data {
1435                if let Some(raw) = self.try_scan_insert_format_raw_data() {
1436                    if !raw.is_empty() {
1437                        self.start = self.current;
1438                        self.add_token_with_text(TokenType::Var, raw);
1439                    }
1440                }
1441            }
1442        }
1443
1444        // Handle leftover leading comments at end of input.
1445        // These are comments on a new line after the last token that couldn't be attached
1446        // as leading comments to a subsequent token (because there is none).
1447        // Attach them as trailing comments on the last token so they're preserved.
1448        if !self.comments.is_empty() {
1449            if let Some(last) = self.tokens.last_mut() {
1450                last.trailing_comments.extend(self.comments.drain(..));
1451            }
1452        }
1453
1454        Ok(std::mem::take(&mut self.tokens))
1455    }
1456
1457    fn is_at_end(&self) -> bool {
1458        self.current >= self.size
1459    }
1460
1461    fn peek(&self) -> char {
1462        if self.is_at_end() {
1463            '\0'
1464        } else {
1465            self.chars[self.current]
1466        }
1467    }
1468
1469    fn peek_next(&self) -> char {
1470        if self.current + 1 >= self.size {
1471            '\0'
1472        } else {
1473            self.chars[self.current + 1]
1474        }
1475    }
1476
1477    fn advance(&mut self) -> char {
1478        let c = self.peek();
1479        self.current += 1;
1480        if c == '\n' {
1481            self.line += 1;
1482            self.column = 1;
1483        } else {
1484            self.column += 1;
1485        }
1486        c
1487    }
1488
    /// Consume whitespace and comments between tokens, routing each comment's
    /// text to either the previous token (trailing) or the next token (leading).
    ///
    /// NOTE: the order of the `'/'` match arms below is significant — the
    /// hash-comment `//` arm, the `/*` arm (with its hint check), and the
    /// dialect-`//` arm are distinguished only by their guards.
    fn skip_whitespace(&mut self) {
        // Track whether we've seen a newline since the last token.
        // Comments on a new line (after a newline) are leading comments on the next token,
        // while comments on the same line are trailing comments on the previous token.
        // This matches Python sqlglot's behavior.
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                '\u{00A0}' // non-breaking space
                | '\u{2000}'..='\u{200B}' // various Unicode spaces + zero-width space
                | '\u{3000}' // ideographic (full-width) space
                | '\u{FEFF}' // BOM / zero-width no-break space
                => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    // ClickHouse: // single-line comments (same dialects that support # comments)
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // Check if this is a hint comment /*+ ... */
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        // This is a hint comment, handle it as a token instead of skipping
                        break;
                    }
                    // NOTE(review): a tokenize error (unterminated block comment)
                    // is swallowed here; by then scan_block_comment has consumed
                    // to end-of-input, so returning simply ends the skip.
                    if self.scan_block_comment(saw_newline).is_err() {
                        return;
                    }
                    // Don't reset saw_newline - it carries forward
                }
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Dialect-specific // line comment (e.g., Snowflake)
                    // But NOT inside URIs like file:// or paths with consecutive slashes
                    // Check that previous non-whitespace char is not ':' or '/'
                    let prev_non_ws = if self.current > 0 {
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        // This is likely a URI (file://, http://) or path, not a comment
                        break;
                    }
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                _ => break,
            }
        }
    }
1560
1561    fn scan_hash_line_comment(&mut self) {
1562        self.advance(); // #
1563        let start = self.current;
1564        while !self.is_at_end() && self.peek() != '\n' {
1565            self.advance();
1566        }
1567        let comment: String = self.chars[start..self.current].iter().collect();
1568        let comment_text = comment.trim().to_string();
1569        if let Some(last) = self.tokens.last_mut() {
1570            last.trailing_comments.push(comment_text);
1571        } else {
1572            self.comments.push(comment_text);
1573        }
1574    }
1575
1576    fn scan_double_slash_comment(&mut self) {
1577        self.advance(); // /
1578        self.advance(); // /
1579        let start = self.current;
1580        while !self.is_at_end() && self.peek() != '\n' {
1581            self.advance();
1582        }
1583        let comment: String = self.chars[start..self.current].iter().collect();
1584        let comment_text = comment.trim().to_string();
1585        if let Some(last) = self.tokens.last_mut() {
1586            last.trailing_comments.push(comment_text);
1587        } else {
1588            self.comments.push(comment_text);
1589        }
1590    }
1591
1592    fn scan_line_comment(&mut self, after_newline: bool) {
1593        self.advance(); // -
1594        self.advance(); // -
1595        let start = self.current;
1596        while !self.is_at_end() && self.peek() != '\n' {
1597            self.advance();
1598        }
1599        let comment_text: String = self.chars[start..self.current].iter().collect();
1600
1601        // If the comment starts on a new line (after_newline), it's a leading comment
1602        // on the next token. Otherwise, it's a trailing comment on the previous token.
1603        if after_newline || self.tokens.is_empty() {
1604            self.comments.push(comment_text);
1605        } else if let Some(last) = self.tokens.last_mut() {
1606            last.trailing_comments.push(comment_text);
1607        }
1608    }
1609
    /// Scan a block comment `/* ... */`, honoring nesting when
    /// `config.nested_comments` is set, and attach the reconstructed
    /// `/*...*/` text to the previous token (trailing) or buffer it as a
    /// leading comment for the next token.
    ///
    /// Returns an error if end of input is reached with unbalanced `/*`.
    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // /
        self.advance(); // *
        let content_start = self.current;
        // Depth of nested /* ... */ pairs; starts at 1 for the one just opened.
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                if depth > 0 {
                    // Inner terminator: keep it in the content and move past it.
                    // When depth hits 0 we deliberately do NOT advance, so the
                    // content slice below excludes the outermost "*/".
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
            ));
        }

        // Get the content between /* and */ (preserving internal whitespace for nested comments)
        let content: String = self.chars[content_start..self.current].iter().collect();
        self.advance(); // *
        self.advance(); // /

        // For round-trip fidelity, preserve the exact comment content including nested comments
        let comment_text = format!("/*{}*/", content);

        // If the comment starts on a new line (after_newline), it's a leading comment
        // on the next token. Otherwise, it's a trailing comment on the previous token.
        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }

        Ok(())
    }
1658
1659    /// Scan a hint comment /*+ ... */ and return it as a Hint token
1660    fn scan_hint(&mut self) -> Result<()> {
1661        self.advance(); // /
1662        self.advance(); // *
1663        self.advance(); // +
1664        let hint_start = self.current;
1665
1666        // Scan until we find */
1667        while !self.is_at_end() {
1668            if self.peek() == '*' && self.peek_next() == '/' {
1669                break;
1670            }
1671            self.advance();
1672        }
1673
1674        if self.is_at_end() {
1675            return Err(Error::tokenize(
1676                "Unterminated hint comment",
1677                self.line,
1678                self.column,
1679            ));
1680        }
1681
1682        let hint_text: String = self.chars[hint_start..self.current].iter().collect();
1683        self.advance(); // *
1684        self.advance(); // /
1685
1686        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1687
1688        Ok(())
1689    }
1690
1691    /// Scan a positional parameter: $1, $2, etc.
1692    fn scan_positional_parameter(&mut self) -> Result<()> {
1693        self.advance(); // consume $
1694        let start = self.current;
1695
1696        while !self.is_at_end() && self.peek().is_ascii_digit() {
1697            self.advance();
1698        }
1699
1700        let number: String = self.chars[start..self.current].iter().collect();
1701        self.add_token_with_text(TokenType::Parameter, number);
1702        Ok(())
1703    }
1704
    /// Try to scan a tagged dollar-quoted string: $tag$content$tag$
    /// Returns Some(()) if successful, None if this isn't a tagged dollar string.
    ///
    /// The token text is stored as "tag\x00content" to preserve the tag for later use.
    /// The companion `parse_dollar_string_token` splits that encoding back into
    /// (tag, content).
    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
        // Remember the starting offset so we can backtrack when this turns out
        // not to be a tagged dollar string.
        // NOTE(review): only `current` is saved/restored on backtrack; if
        // advance() also tracks line/column, those are not rewound — confirm
        // whether the failure paths can scan across newlines and skew spans.
        let saved_pos = self.current;

        // We're at '$'; the caller guarantees the next char is an
        // identifier-ish char (alphanumeric, '_', or non-ASCII).
        self.advance(); // consume opening $

        // Scan the tag (identifier: alphanumeric + underscore, including Unicode)
        // Tags can contain Unicode characters like emojis (e.g., $🦆$)
        let tag_start = self.current;
        while !self.is_at_end()
            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
        {
            self.advance();
        }
        let tag: String = self.chars[tag_start..self.current].iter().collect();

        // Must have a closing $ after the tag
        if self.is_at_end() || self.peek() != '$' {
            // Not a tagged dollar string - restore position
            self.current = saved_pos;
            return Ok(None);
        }
        self.advance(); // consume closing $ of opening tag

        // Now scan content until we find $tag$
        let content_start = self.current;
        let closing_tag = format!("${}$", tag);
        let closing_chars: Vec<char> = closing_tag.chars().collect();

        loop {
            if self.is_at_end() {
                // Unterminated - restore so the caller can try other
                // interpretations of the leading '$'
                self.current = saved_pos;
                return Ok(None);
            }

            // Check if we've reached the closing tag ($tag$): the cheap '$'
            // peek gates the full element-by-element comparison below
            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
                    self.current + j < self.size && self.chars[self.current + j] == ch
                });
                if matches {
                    let content: String = self.chars[content_start..self.current].iter().collect();
                    // Consume closing tag
                    for _ in 0..closing_chars.len() {
                        self.advance();
                    }
                    // Store as "tag\x00content" to preserve the tag
                    let token_text = format!("{}\x00{}", tag, content);
                    self.add_token_with_text(TokenType::DollarString, token_text);
                    return Ok(Some(()));
                }
            }
            self.advance();
        }
    }
1765
1766    /// Scan a dollar-quoted string: $$content$$ or $tag$content$tag$
1767    ///
1768    /// For $$...$$ (no tag), the token text is just the content.
1769    /// For $tag$...$tag$, use try_scan_tagged_dollar_string instead.
1770    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1771        self.advance(); // consume first $
1772        self.advance(); // consume second $
1773
1774        // For $$...$$ (no tag), just scan until closing $$
1775        let start = self.current;
1776        while !self.is_at_end() {
1777            if self.peek() == '$'
1778                && self.current + 1 < self.size
1779                && self.chars[self.current + 1] == '$'
1780            {
1781                break;
1782            }
1783            self.advance();
1784        }
1785
1786        let content: String = self.chars[start..self.current].iter().collect();
1787
1788        if !self.is_at_end() {
1789            self.advance(); // consume first $
1790            self.advance(); // consume second $
1791        }
1792
1793        self.add_token_with_text(TokenType::DollarString, content);
1794        Ok(())
1795    }
1796
    /// Dispatch on the current character and scan exactly one token.
    ///
    /// The branch ORDER is significant: longer / more specific constructs are
    /// tried before their shorter prefixes — triple quotes before plain quotes,
    /// hint comments before plain operators, multi-char operators before
    /// single-char tokens, and tagged dollar strings before $$-strings before
    /// $N parameters. Reordering branches changes how input is tokenized.
    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        // Check for string literal
        if c == '\'' {
            // Check for triple-quoted string '''...''' if configured
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        // Check for triple-quoted string """...""" if configured
        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        // Check for double-quoted strings when dialect supports them (e.g., BigQuery)
        // This must come before identifier quotes check
        if c == '"'
            && self.config.quotes.contains_key("\"")
            && !self.config.identifiers.contains_key(&'"')
        {
            return self.scan_double_quoted_string();
        }

        // Check for identifier quotes
        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        // Check for numbers (including numbers starting with a dot like .25)
        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // Check for numbers starting with a dot (e.g., .25, .5)
        // This must come before single character token handling
        // Don't treat as a number if:
        // - Previous char was also a dot (e.g., 1..2 should be 1, ., ., 2)
        // - Previous char is an identifier character (e.g., foo.25 should be foo, ., 25)
        //   This handles BigQuery numeric table parts like project.dataset.25
        if c == '.' && self.peek_next().is_ascii_digit() {
            // '\0' sentinel stands in for "no previous char" at offset 0
            let prev_char = if self.current > 0 {
                self.chars[self.current - 1]
            } else {
                '\0'
            };
            // `, ", ], ) cover quoted identifiers / closing parens that can
            // directly precede a qualified-name dot
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // Check for hint comment /*+ ... */
        if c == '/'
            && self.peek_next() == '*'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '+'
        {
            return self.scan_hint();
        }

        // Check for multi-character operators first
        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // Check for tagged dollar-quoted strings: $tag$content$tag$
        // Tags can contain Unicode characters (including emojis like 🦆) and digits (e.g., $1$)
        if c == '$'
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || !self.peek_next().is_ascii())
        {
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
            // If tagged dollar string didn't match and dollar_sign_is_identifier is set,
            // treat the $ and following chars as an identifier (e.g., ClickHouse $alias$name$).
            if self.config.dollar_sign_is_identifier {
                return self.scan_dollar_identifier();
            }
        }

        // Check for dollar-quoted strings: $$...$$
        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // Check for positional parameters: $1, $2, etc.
        // (Reached only when the tagged-dollar attempt above backtracked.)
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        // ClickHouse: bare $ (not followed by alphanumeric/underscore) as identifier
        if c == '$' && self.config.dollar_sign_is_identifier {
            return self.scan_dollar_identifier();
        }

        // TSQL: Check for identifiers starting with # (temp tables) or @ (variables)
        // e.g., #temp, ##global_temp, @variable
        if (c == '#' || c == '@')
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || self.peek_next() == '#')
        {
            return self.scan_tsql_identifier();
        }

        // Check for single character tokens
        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        // Unicode minus (U+2212) → treat as regular minus
        if c == '\u{2212}' {
            self.advance();
            self.add_token(TokenType::Dash);
            return Ok(());
        }

        // Unicode fraction slash (U+2044) → treat as regular slash
        if c == '\u{2044}' {
            self.advance();
            self.add_token(TokenType::Slash);
            return Ok(());
        }

        // Unicode curly/smart quotes → treat as regular string quotes
        if c == '\u{2018}' || c == '\u{2019}' {
            // Left/right single quotation marks → scan as string with matching end
            return self.scan_unicode_quoted_string(c);
        }
        if c == '\u{201C}' || c == '\u{201D}' {
            // Left/right double quotation marks → scan as quoted identifier
            return self.scan_unicode_quoted_identifier(c);
        }

        // Must be an identifier or keyword
        self.scan_identifier_or_keyword()
    }
1956
    /// Try to consume a multi-character operator at the current position.
    ///
    /// Returns the operator's token type after advancing past it, or `None`
    /// with the position unchanged. The check ORDER implements longest-match:
    /// four-char operators (`!~~*`) before three-char (`!~~`, `->>`, `::$`)
    /// before two-char (`!~`, `->`, `::`). Reordering these checks would make
    /// a shorter operator shadow its longer extensions.
    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
        // Peek up to three chars; '\0' is a sentinel for "past end of input"
        let c = self.peek();
        let next = self.peek_next();
        let third = if self.current + 2 < self.size {
            self.chars[self.current + 2]
        } else {
            '\0'
        };

        // Check for three-character operators first
        // -|- (Adjacent - PostgreSQL range adjacency)
        if c == '-' && next == '|' && third == '-' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::Adjacent);
        }

        // ||/ (Cube root - PostgreSQL)
        if c == '|' && next == '|' && third == '/' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DPipeSlash);
        }

        // #>> (JSONB path text extraction - PostgreSQL)
        if c == '#' && next == '>' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DHashArrow);
        }

        // ->> (JSON text extraction - PostgreSQL/MySQL)
        if c == '-' && next == '>' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DArrow);
        }

        // <=> (NULL-safe equality - MySQL)
        if c == '<' && next == '=' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NullsafeEq);
        }

        // <-> (Distance operator - PostgreSQL)
        if c == '<' && next == '-' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::LrArrow);
        }

        // <@ (Contained by - PostgreSQL)
        if c == '<' && next == '@' {
            self.advance();
            self.advance();
            return Some(TokenType::LtAt);
        }

        // @> (Contains - PostgreSQL)
        if c == '@' && next == '>' {
            self.advance();
            self.advance();
            return Some(TokenType::AtGt);
        }

        // ~~~ (Glob - PostgreSQL)
        if c == '~' && next == '~' && third == '~' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::Glob);
        }

        // ~~* (ILike - PostgreSQL)
        if c == '~' && next == '~' && third == '*' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::ILike);
        }

        // !~~* (Not ILike - PostgreSQL) — the only four-char operator,
        // so `fourth` is looked up lazily here
        let fourth = if self.current + 3 < self.size {
            self.chars[self.current + 3]
        } else {
            '\0'
        };
        if c == '!' && next == '~' && third == '~' && fourth == '*' {
            self.advance();
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotILike);
        }

        // !~~ (Not Like - PostgreSQL)
        if c == '!' && next == '~' && third == '~' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotLike);
        }

        // !~* (Not Regexp ILike - PostgreSQL)
        if c == '!' && next == '~' && third == '*' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotIRLike);
        }

        // !:> (Not cast / Try cast - SingleStore)
        if c == '!' && next == ':' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NColonGt);
        }

        // ?:: (TRY_CAST shorthand - Databricks)
        if c == '?' && next == ':' && third == ':' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::QDColon);
        }

        // !~ (Not Regexp - PostgreSQL)
        if c == '!' && next == '~' {
            self.advance();
            self.advance();
            return Some(TokenType::NotRLike);
        }

        // ~~ (Like - PostgreSQL)
        if c == '~' && next == '~' {
            self.advance();
            self.advance();
            return Some(TokenType::Like);
        }

        // ~* (Regexp ILike - PostgreSQL)
        if c == '~' && next == '*' {
            self.advance();
            self.advance();
            return Some(TokenType::IRLike);
        }

        // SingleStore three-character JSON path operators (must be checked before :: two-char)
        // ::$ (JSON extract string), ::% (JSON extract double), ::? (JSON match)
        if c == ':' && next == ':' && third == '$' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonDollar);
        }
        if c == ':' && next == ':' && third == '%' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonPercent);
        }
        if c == ':' && next == ':' && third == '?' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonQMark);
        }

        // Two-character operators
        let token_type = match (c, next) {
            ('.', ':') => Some(TokenType::DotColon),
            ('=', '=') => Some(TokenType::Eq), // Hive/Spark == equality operator
            ('<', '=') => Some(TokenType::Lte),
            ('>', '=') => Some(TokenType::Gte),
            ('!', '=') => Some(TokenType::Neq),
            ('<', '>') => Some(TokenType::Neq),
            ('^', '=') => Some(TokenType::Neq),
            ('<', '<') => Some(TokenType::LtLt),
            ('>', '>') => Some(TokenType::GtGt),
            ('|', '|') => Some(TokenType::DPipe),
            ('|', '/') => Some(TokenType::PipeSlash), // Square root - PostgreSQL
            (':', ':') => Some(TokenType::DColon),
            (':', '=') => Some(TokenType::ColonEq), // := (assignment, named args)
            (':', '>') => Some(TokenType::ColonGt), // ::> (TSQL)
            ('-', '>') => Some(TokenType::Arrow),   // JSON object access
            ('=', '>') => Some(TokenType::FArrow),  // Fat arrow (lambda)
            ('&', '&') => Some(TokenType::DAmp),
            ('&', '<') => Some(TokenType::AmpLt), // PostgreSQL range operator
            ('&', '>') => Some(TokenType::AmpGt), // PostgreSQL range operator
            ('@', '@') => Some(TokenType::AtAt),  // Text search match
            ('?', '|') => Some(TokenType::QMarkPipe), // JSONB contains any key
            ('?', '&') => Some(TokenType::QMarkAmp), // JSONB contains all keys
            ('?', '?') => Some(TokenType::DQMark), // Double question mark
            ('#', '>') => Some(TokenType::HashArrow), // JSONB path extraction
            ('#', '-') => Some(TokenType::HashDash), // JSONB delete
            ('^', '@') => Some(TokenType::CaretAt), // PostgreSQL starts-with operator
            ('*', '*') => Some(TokenType::DStar), // Power operator
            ('|', '>') => Some(TokenType::PipeGt), // Pipe-greater (some dialects)
            _ => None,
        };

        // Only consume input when a two-char operator actually matched
        if token_type.is_some() {
            self.advance();
            self.advance();
        }

        token_type
    }
2173
2174    fn scan_string(&mut self) -> Result<()> {
2175        self.advance(); // Opening quote
2176        let mut value = String::new();
2177
2178        while !self.is_at_end() {
2179            let c = self.peek();
2180            if c == '\'' {
2181                if self.peek_next() == '\'' {
2182                    // Escaped quote
2183                    value.push('\'');
2184                    self.advance();
2185                    self.advance();
2186                } else {
2187                    break;
2188                }
2189            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2190                // Handle escape sequences
2191                self.advance(); // Consume the backslash
2192                if !self.is_at_end() {
2193                    let escaped = self.advance();
2194                    match escaped {
2195                        'n' => value.push('\n'),
2196                        'r' => value.push('\r'),
2197                        't' => value.push('\t'),
2198                        '0' => value.push('\0'),
2199                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2200                        'a' => value.push('\x07'), // Alert/bell
2201                        'b' => value.push('\x08'), // Backspace
2202                        'f' => value.push('\x0C'), // Form feed
2203                        'v' => value.push('\x0B'), // Vertical tab
2204                        '\\' => value.push('\\'),
2205                        '\'' => value.push('\''),
2206                        '"' => value.push('"'),
2207                        '%' => {
2208                            // MySQL: \% in LIKE patterns
2209                            value.push('%');
2210                        }
2211                        '_' => {
2212                            // MySQL: \_ in LIKE patterns
2213                            value.push('_');
2214                        }
2215                        // For unrecognized escape sequences:
2216                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2217                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2218                        _ => {
2219                            if !self.config.escape_follow_chars.is_empty() {
2220                                // MySQL-style: discard backslash for unrecognized escapes
2221                                value.push(escaped);
2222                            } else {
2223                                // Standard: preserve backslash + char
2224                                value.push('\\');
2225                                value.push(escaped);
2226                            }
2227                        }
2228                    }
2229                }
2230            } else {
2231                value.push(self.advance());
2232            }
2233        }
2234
2235        if self.is_at_end() {
2236            return Err(Error::tokenize(
2237                "Unterminated string",
2238                self.line,
2239                self.column,
2240            ));
2241        }
2242
2243        self.advance(); // Closing quote
2244        self.add_token_with_text(TokenType::String, value);
2245        Ok(())
2246    }
2247
2248    /// Scan a double-quoted string (for dialects like BigQuery where " is a string delimiter)
2249    fn scan_double_quoted_string(&mut self) -> Result<()> {
2250        self.advance(); // Opening quote
2251        let mut value = String::new();
2252
2253        while !self.is_at_end() {
2254            let c = self.peek();
2255            if c == '"' {
2256                if self.peek_next() == '"' {
2257                    // Escaped quote
2258                    value.push('"');
2259                    self.advance();
2260                    self.advance();
2261                } else {
2262                    break;
2263                }
2264            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2265                // Handle escape sequences
2266                self.advance(); // Consume the backslash
2267                if !self.is_at_end() {
2268                    let escaped = self.advance();
2269                    match escaped {
2270                        'n' => value.push('\n'),
2271                        'r' => value.push('\r'),
2272                        't' => value.push('\t'),
2273                        '0' => value.push('\0'),
2274                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
2275                        'a' => value.push('\x07'), // Alert/bell
2276                        'b' => value.push('\x08'), // Backspace
2277                        'f' => value.push('\x0C'), // Form feed
2278                        'v' => value.push('\x0B'), // Vertical tab
2279                        '\\' => value.push('\\'),
2280                        '\'' => value.push('\''),
2281                        '"' => value.push('"'),
2282                        '%' => {
2283                            // MySQL: \% in LIKE patterns
2284                            value.push('%');
2285                        }
2286                        '_' => {
2287                            // MySQL: \_ in LIKE patterns
2288                            value.push('_');
2289                        }
2290                        // For unrecognized escape sequences:
2291                        // If escape_follow_chars is set, only preserve backslash for chars in that list
2292                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
2293                        _ => {
2294                            if !self.config.escape_follow_chars.is_empty() {
2295                                // MySQL-style: discard backslash for unrecognized escapes
2296                                value.push(escaped);
2297                            } else {
2298                                // Standard: preserve backslash + char
2299                                value.push('\\');
2300                                value.push(escaped);
2301                            }
2302                        }
2303                    }
2304                }
2305            } else {
2306                value.push(self.advance());
2307            }
2308        }
2309
2310        if self.is_at_end() {
2311            return Err(Error::tokenize(
2312                "Unterminated double-quoted string",
2313                self.line,
2314                self.column,
2315            ));
2316        }
2317
2318        self.advance(); // Closing quote
2319        self.add_token_with_text(TokenType::String, value);
2320        Ok(())
2321    }
2322
2323    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2324        // Advance past the three opening quotes
2325        self.advance();
2326        self.advance();
2327        self.advance();
2328        let mut value = String::new();
2329
2330        while !self.is_at_end() {
2331            // Check for closing triple quote
2332            if self.peek() == quote_char
2333                && self.current + 1 < self.size
2334                && self.chars[self.current + 1] == quote_char
2335                && self.current + 2 < self.size
2336                && self.chars[self.current + 2] == quote_char
2337            {
2338                // Found closing """
2339                break;
2340            }
2341            value.push(self.advance());
2342        }
2343
2344        if self.is_at_end() {
2345            return Err(Error::tokenize(
2346                "Unterminated triple-quoted string",
2347                self.line,
2348                self.column,
2349            ));
2350        }
2351
2352        // Advance past the three closing quotes
2353        self.advance();
2354        self.advance();
2355        self.advance();
2356        let token_type = if quote_char == '"' {
2357            TokenType::TripleDoubleQuotedString
2358        } else {
2359            TokenType::TripleSingleQuotedString
2360        };
2361        self.add_token_with_text(token_type, value);
2362        Ok(())
2363    }
2364
2365    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2366        self.advance(); // Opening quote
2367        let mut value = String::new();
2368
2369        loop {
2370            if self.is_at_end() {
2371                return Err(Error::tokenize(
2372                    "Unterminated identifier",
2373                    self.line,
2374                    self.column,
2375                ));
2376            }
2377            if self.peek() == end_quote {
2378                if self.peek_next() == end_quote {
2379                    // Escaped quote (e.g., "" inside "x""y") -> store single quote
2380                    value.push(end_quote);
2381                    self.advance(); // skip first quote
2382                    self.advance(); // skip second quote
2383                } else {
2384                    // End of identifier
2385                    break;
2386                }
2387            } else {
2388                value.push(self.peek());
2389                self.advance();
2390            }
2391        }
2392
2393        self.advance(); // Closing quote
2394        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2395        Ok(())
2396    }
2397
2398    /// Scan a string delimited by Unicode curly single quotes (U+2018/U+2019).
2399    /// Content between curly quotes is literal (no escape processing).
2400    /// When opened with \u{2018} (left), close with \u{2019} (right) only.
2401    /// When opened with \u{2019} (right), close with \u{2019} (right) — self-closing.
2402    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2403        self.advance(); // Opening curly quote
2404        let start = self.current;
2405        // Determine closing quote: left opens -> right closes; right opens -> right closes
2406        let close_quote = if open_quote == '\u{2018}' {
2407            '\u{2019}' // left opens, right closes
2408        } else {
2409            '\u{2019}' // right quote also closes with right quote
2410        };
2411        while !self.is_at_end() && self.peek() != close_quote {
2412            self.advance();
2413        }
2414        let value: String = self.chars[start..self.current].iter().collect();
2415        if !self.is_at_end() {
2416            self.advance(); // Closing quote
2417        }
2418        self.add_token_with_text(TokenType::String, value);
2419        Ok(())
2420    }
2421
2422    /// Scan an identifier delimited by Unicode curly double quotes (U+201C/U+201D).
2423    /// When opened with \u{201C} (left), close with \u{201D} (right) only.
2424    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2425        self.advance(); // Opening curly quote
2426        let start = self.current;
2427        let close_quote = if open_quote == '\u{201C}' {
2428            '\u{201D}' // left opens, right closes
2429        } else {
2430            '\u{201D}' // right also closes with right
2431        };
2432        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2433            self.advance();
2434        }
2435        let value: String = self.chars[start..self.current].iter().collect();
2436        if !self.is_at_end() {
2437            self.advance(); // Closing quote
2438        }
2439        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2440        Ok(())
2441    }
2442
2443    fn scan_number(&mut self) -> Result<()> {
2444        // Check for 0x/0X hex number prefix (SQLite-style)
2445        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
2446            let next = if self.current + 1 < self.size {
2447                self.chars[self.current + 1]
2448            } else {
2449                '\0'
2450            };
2451            if next == 'x' || next == 'X' {
2452                // Advance past '0' and 'x'/'X'
2453                self.advance();
2454                self.advance();
2455                // Collect hex digits (allow underscores as separators, e.g., 0xbad_cafe)
2456                let hex_start = self.current;
2457                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
2458                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
2459                        break;
2460                    }
2461                    self.advance();
2462                }
2463                if self.current > hex_start {
2464                    // Check for hex float: 0xABC.DEFpEXP or 0xABCpEXP
2465                    let mut is_hex_float = false;
2466                    // Optional fractional part: .hexdigits
2467                    if !self.is_at_end() && self.peek() == '.' {
2468                        let after_dot = if self.current + 1 < self.size {
2469                            self.chars[self.current + 1]
2470                        } else {
2471                            '\0'
2472                        };
2473                        if after_dot.is_ascii_hexdigit() {
2474                            is_hex_float = true;
2475                            self.advance(); // consume '.'
2476                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2477                                self.advance();
2478                            }
2479                        }
2480                    }
2481                    // Optional binary exponent: p/P [+/-] digits
2482                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
2483                        is_hex_float = true;
2484                        self.advance(); // consume p/P
2485                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
2486                            self.advance();
2487                        }
2488                        while !self.is_at_end() && self.peek().is_ascii_digit() {
2489                            self.advance();
2490                        }
2491                    }
2492                    if is_hex_float {
2493                        // Hex float literal — emit as regular Number token with full text
2494                        let full_text: String =
2495                            self.chars[self.start..self.current].iter().collect();
2496                        self.add_token_with_text(TokenType::Number, full_text);
2497                    } else if self.config.hex_string_is_integer_type {
2498                        // BigQuery/ClickHouse: 0xA represents an integer in hex notation
2499                        let hex_value: String =
2500                            self.chars[hex_start..self.current].iter().collect();
2501                        self.add_token_with_text(TokenType::HexNumber, hex_value);
2502                    } else {
2503                        // SQLite/Teradata: 0xCC represents a binary/blob hex string
2504                        let hex_value: String =
2505                            self.chars[hex_start..self.current].iter().collect();
2506                        self.add_token_with_text(TokenType::HexString, hex_value);
2507                    }
2508                    return Ok(());
2509                }
2510                // No hex digits after 0x - fall through to normal number parsing
2511                // (reset current back to after '0')
2512                self.current = self.start + 1;
2513            }
2514        }
2515
2516        // Allow underscores as digit separators (e.g., 20_000, 1_000_000)
2517        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2518            // Don't allow underscore at the end (must be followed by digit)
2519            if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
2520                break;
2521            }
2522            self.advance();
2523        }
2524
2525        // Look for decimal part - allow trailing dot (e.g., "1.")
2526        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
2527        // So we always consume the dot as part of the number, even if followed by an identifier
2528        if self.peek() == '.' {
2529            let next = self.peek_next();
2530            // Only consume the dot if:
2531            // 1. Followed by a digit (normal decimal like 1.5)
2532            // 2. Followed by an identifier start (like 1.x -> becomes 1. with alias x)
2533            // 3. End of input or other non-dot character (trailing decimal like "1.")
2534            // Do NOT consume if it's a double dot (..) which is a range operator
2535            if next != '.' {
2536                self.advance(); // consume the .
2537                                // Only consume digits after the decimal point (not identifiers)
2538                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2539                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2540                        break;
2541                    }
2542                    self.advance();
2543                }
2544            }
2545        }
2546
2547        // Look for exponent
2548        if self.peek() == 'e' || self.peek() == 'E' {
2549            self.advance();
2550            if self.peek() == '+' || self.peek() == '-' {
2551                self.advance();
2552            }
2553            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2554                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2555                    break;
2556                }
2557                self.advance();
2558            }
2559        }
2560
2561        let text: String = self.chars[self.start..self.current].iter().collect();
2562
2563        // Check for numeric literal suffixes (e.g., 1L -> BIGINT, 1s -> SMALLINT in Hive/Spark)
2564        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2565            let next_char = self.peek().to_uppercase().to_string();
2566            // Try 2-char suffix first (e.g., "BD"), then 1-char
2567            let suffix_match = if self.current + 1 < self.size {
2568                let two_char: String = vec![self.chars[self.current], self.chars[self.current + 1]]
2569                    .iter()
2570                    .collect::<String>()
2571                    .to_uppercase();
2572                if self.config.numeric_literals.contains_key(&two_char) {
2573                    // Make sure the 2-char suffix is not followed by more identifier chars
2574                    let after_suffix = if self.current + 2 < self.size {
2575                        self.chars[self.current + 2]
2576                    } else {
2577                        ' '
2578                    };
2579                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2580                        Some((two_char, 2))
2581                    } else {
2582                        None
2583                    }
2584                } else if self.config.numeric_literals.contains_key(&next_char) {
2585                    // 1-char suffix - make sure not followed by more identifier chars
2586                    let after_suffix = if self.current + 1 < self.size {
2587                        self.chars[self.current + 1]
2588                    } else {
2589                        ' '
2590                    };
2591                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2592                        Some((next_char, 1))
2593                    } else {
2594                        None
2595                    }
2596                } else {
2597                    None
2598                }
2599            } else if self.config.numeric_literals.contains_key(&next_char) {
2600                // At end of input, 1-char suffix
2601                Some((next_char, 1))
2602            } else {
2603                None
2604            };
2605
2606            if let Some((suffix, len)) = suffix_match {
2607                // Consume the suffix characters
2608                for _ in 0..len {
2609                    self.advance();
2610                }
2611                // Emit as a special number-with-suffix token
2612                // We'll encode as "number::TYPE" so the parser can split it
2613                let type_name = self
2614                    .config
2615                    .numeric_literals
2616                    .get(&suffix)
2617                    .expect("suffix verified by contains_key above")
2618                    .clone();
2619                let combined = format!("{}::{}", text, type_name);
2620                self.add_token_with_text(TokenType::Number, combined);
2621                return Ok(());
2622            }
2623        }
2624
2625        // Check for identifiers that start with a digit (e.g., 1a, 1_a, 1a_1a)
2626        // In Hive/Spark/MySQL/ClickHouse, these are valid unquoted identifiers
2627        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2628            let next = self.peek();
2629            if next.is_alphabetic() || next == '_' {
2630                // Continue scanning as an identifier
2631                while !self.is_at_end() {
2632                    let ch = self.peek();
2633                    if ch.is_alphanumeric() || ch == '_' {
2634                        self.advance();
2635                    } else {
2636                        break;
2637                    }
2638                }
2639                let ident_text: String = self.chars[self.start..self.current].iter().collect();
2640                self.add_token_with_text(TokenType::Identifier, ident_text);
2641                return Ok(());
2642            }
2643        }
2644
2645        self.add_token_with_text(TokenType::Number, text);
2646        Ok(())
2647    }
2648
2649    /// Scan a number that starts with a dot (e.g., .25, .5, .123e10)
2650    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2651        // Consume the leading dot
2652        self.advance();
2653
2654        // Consume the fractional digits
2655        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2656            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2657                break;
2658            }
2659            self.advance();
2660        }
2661
2662        // Look for exponent
2663        if self.peek() == 'e' || self.peek() == 'E' {
2664            self.advance();
2665            if self.peek() == '+' || self.peek() == '-' {
2666                self.advance();
2667            }
2668            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2669                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2670                    break;
2671                }
2672                self.advance();
2673            }
2674        }
2675
2676        let text: String = self.chars[self.start..self.current].iter().collect();
2677        self.add_token_with_text(TokenType::Number, text);
2678        Ok(())
2679    }
2680
    /// Scan an identifier or keyword starting at the current position.
    ///
    /// Beyond plain identifiers/keywords, this function also dispatches the
    /// many "prefixed string" forms where an identifier-looking prefix is
    /// immediately followed by a quote: `R'...'`/`r"..."` (raw strings),
    /// `N'...'` (national), `E'...'` (PostgreSQL escape), `X'...'` (hex),
    /// `B'...'`/`b"..."` (bit/byte), and `U&'...'` (Unicode). The order of
    /// these checks matters: raw strings are handled before the quote-config
    /// check because they accept `"` even in dialects where `"` delimits
    /// identifiers. Falls back to a keyword lookup, and to `Var` when the
    /// text is not a keyword.
    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
        // Guard against unrecognized characters that could cause infinite loops:
        // the caller expects us to always make progress, so an unexpected first
        // character is consumed and reported rather than looped on.
        let first_char = self.peek();
        if !first_char.is_alphanumeric() && first_char != '_' {
            // Unknown character - skip it and return an error
            let c = self.advance();
            return Err(Error::tokenize(
                format!("Unexpected character: '{}'", c),
                self.line,
                self.column,
            ));
        }

        while !self.is_at_end() {
            let c = self.peek();
            // Allow alphanumeric, underscore, $, # and @ in identifiers
            // PostgreSQL allows $, TSQL allows # and @
            // But stop consuming # if followed by > or >> (PostgreSQL #> and #>> operators)
            if c == '#' {
                // Peek one past the '#' without advancing; '\0' stands in for EOF.
                let next_c = if self.current + 1 < self.size {
                    self.chars[self.current + 1]
                } else {
                    '\0'
                };
                if next_c == '>' || next_c == '-' {
                    break; // Don't consume # — it's part of #>, #>>, or #- operator
                }
                self.advance();
            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        let upper = text.to_uppercase();

        // Special-case NOT= (Teradata and other dialects): fuse into a single
        // not-equal token instead of emitting NOT followed by '='.
        if upper == "NOT" && self.peek() == '=' {
            self.advance(); // consume '='
            self.add_token(TokenType::Neq);
            return Ok(());
        }

        // Check for special string prefixes like N'...', X'...', B'...', U&'...', r'...', b'...'
        // Also handle double-quoted variants for dialects that support them (e.g., BigQuery)
        let next_char = self.peek();
        let is_single_quote = next_char == '\'';
        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
        // For raw strings (r"..." or r'...'), we allow double quotes even if " is not in quotes config
        // because raw strings are a special case used in Spark/Databricks where " is for identifiers
        let is_double_quote_for_raw = next_char == '"';

        // Handle raw strings first - they're special because they work with both ' and "
        // even in dialects where " is normally an identifier delimiter (like Databricks)
        if upper == "R" && (is_single_quote || is_double_quote_for_raw) {
            // Raw string r'...' or r"..." or r'''...''' or r"""...""" (BigQuery style)
            // In raw strings, backslashes are treated literally (no escape processing)
            let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the first opening quote

            // Check for triple-quoted raw string (r"""...""" or r'''...''')
            if self.peek() == quote_char && self.peek_next() == quote_char {
                // Triple-quoted raw string
                self.advance(); // consume second quote
                self.advance(); // consume third quote
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            } else {
                let string_value = self.scan_raw_string_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            }
            return Ok(());
        }

        if is_single_quote || is_double_quote {
            match upper.as_str() {
                "N" => {
                    // National string N'...'
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::NationalString, string_value);
                    return Ok(());
                }
                "E" => {
                    // PostgreSQL escape string E'...' or e'...'
                    // Preserve the case by prefixing with "e:" or "E:"
                    // Always use backslash escapes for escape strings (e.g., \' is an escaped quote)
                    // NOTE(review): this branch always scans single-quote content,
                    // even when reached via a double quote — confirm whether any
                    // dialect accepts E"..." and expects " to terminate it.
                    let lowercase = text == "e";
                    let prefix = if lowercase { "e:" } else { "E:" };
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content_with_escapes(true)?;
                    self.add_token_with_text(
                        TokenType::EscapeString,
                        format!("{}{}", prefix, string_value),
                    );
                    return Ok(());
                }
                "X" => {
                    // Hex string X'...'
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::HexString, string_value);
                    return Ok(());
                }
                "B" if is_double_quote => {
                    // Byte string b"..." (BigQuery style) - MUST check before single quote B'...'
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_double_quoted_string_content()?;
                    self.add_token_with_text(TokenType::ByteString, string_value);
                    return Ok(());
                }
                "B" if is_single_quote => {
                    // For BigQuery: b'...' is a byte string (bytes data)
                    // For standard SQL: B'...' is a bit string (binary digits)
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content()?;
                    if self.config.b_prefix_is_byte_string {
                        self.add_token_with_text(TokenType::ByteString, string_value);
                    } else {
                        self.add_token_with_text(TokenType::BitString, string_value);
                    }
                    return Ok(());
                }
                _ => {}
            }
        }

        // Check for U&'...' Unicode string syntax (SQL standard)
        if upper == "U"
            && self.peek() == '&'
            && self.current + 1 < self.size
            && self.chars[self.current + 1] == '\''
        {
            self.advance(); // consume '&'
            self.advance(); // consume opening quote
            let string_value = self.scan_string_content()?;
            self.add_token_with_text(TokenType::UnicodeString, string_value);
            return Ok(());
        }

        // Keyword lookup is case-insensitive (via `upper`); the token keeps the
        // original-case text. Unknown words become Var tokens.
        let token_type = self
            .config
            .keywords
            .get(&upper)
            .copied()
            .unwrap_or(TokenType::Var);

        self.add_token_with_text(token_type, text);
        Ok(())
    }
2841
2842    /// Scan string content (everything between quotes)
2843    /// If `force_backslash_escapes` is true, backslash is always treated as an escape character
2844    /// (used for PostgreSQL E'...' escape strings)
2845    fn scan_string_content_with_escapes(
2846        &mut self,
2847        force_backslash_escapes: bool,
2848    ) -> Result<String> {
2849        let mut value = String::new();
2850        let use_backslash_escapes =
2851            force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2852
2853        while !self.is_at_end() {
2854            let c = self.peek();
2855            if c == '\'' {
2856                if self.peek_next() == '\'' {
2857                    // Escaped quote ''
2858                    value.push('\'');
2859                    self.advance();
2860                    self.advance();
2861                } else {
2862                    break;
2863                }
2864            } else if c == '\\' && use_backslash_escapes {
2865                // Preserve escape sequences literally (including \' for escape strings)
2866                value.push(self.advance());
2867                if !self.is_at_end() {
2868                    value.push(self.advance());
2869                }
2870            } else {
2871                value.push(self.advance());
2872            }
2873        }
2874
2875        if self.is_at_end() {
2876            return Err(Error::tokenize(
2877                "Unterminated string",
2878                self.line,
2879                self.column,
2880            ));
2881        }
2882
2883        self.advance(); // Closing quote
2884        Ok(value)
2885    }
2886
    /// Scan string content (everything between quotes).
    ///
    /// Thin wrapper around `scan_string_content_with_escapes` that only honors
    /// backslash escapes when the dialect config enables them (i.e., does not
    /// force them the way the E'...' path does).
    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }
2891
    /// Scan double-quoted string content (for dialects like BigQuery where " is a string delimiter)
    /// This is used for prefixed strings like b"..." or N"...".
    ///
    /// Unlike the single-quote scanner, this one *interprets* backslash escape
    /// sequences (when the dialect lists `\` as an escape char) instead of
    /// preserving them literally: `\n`, `\r`, `\t`, `\0`, `\\`, `\"`, `\'`,
    /// and `\xNN` are decoded; unrecognized escapes keep the backslash.
    /// Consumes the closing quote; errors on unterminated input.
    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Escaped quote "" -> one literal quote
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                // Handle escape sequences
                self.advance(); // Consume backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        '\\' => value.push('\\'),
                        '"' => value.push('"'),
                        '\'' => value.push('\''),
                        'x' => {
                            // Hex escape \xNN - collect up to two hex digits
                            let mut hex = String::new();
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                // `u8 as char` maps the byte to the Unicode code
                                // point U+00NN, so \x80..\xFF become U+0080..U+00FF
                                // (not raw bytes) in the resulting String.
                                value.push(byte as char);
                            } else {
                                // Invalid hex escape (e.g., no hex digits), keep it literal
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        _ => {
                            // For unrecognized escapes, preserve backslash + char
                            value.push('\\');
                            value.push(escaped);
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // Closing quote
        Ok(value)
    }
2962
2963    /// Scan raw string content (limited escape processing for quotes)
2964    /// Used for BigQuery r'...' and r"..." strings
2965    /// In raw strings, backslashes are literal EXCEPT that escape sequences for the
2966    /// quote character still work (e.g., \' in r'...' escapes the quote, '' also works)
2967    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
2968        let mut value = String::new();
2969
2970        while !self.is_at_end() {
2971            let c = self.peek();
2972            if c == quote_char {
2973                if self.peek_next() == quote_char {
2974                    // Escaped quote (doubled) - e.g., '' inside r'...'
2975                    value.push(quote_char);
2976                    self.advance();
2977                    self.advance();
2978                } else {
2979                    break;
2980                }
2981            } else if c == '\\'
2982                && self.peek_next() == quote_char
2983                && self.config.string_escapes_allowed_in_raw_strings
2984            {
2985                // Backslash-escaped quote - works in raw strings when string_escapes_allowed_in_raw_strings is true
2986                // e.g., \' inside r'...' becomes literal ' (BigQuery behavior)
2987                // Spark/Databricks has this set to false, so backslash is always literal there
2988                value.push(quote_char);
2989                self.advance(); // consume backslash
2990                self.advance(); // consume quote
2991            } else {
2992                // In raw strings, everything including backslashes is literal
2993                value.push(self.advance());
2994            }
2995        }
2996
2997        if self.is_at_end() {
2998            return Err(Error::tokenize(
2999                "Unterminated raw string",
3000                self.line,
3001                self.column,
3002            ));
3003        }
3004
3005        self.advance(); // Closing quote
3006        Ok(value)
3007    }
3008
3009    /// Scan raw triple-quoted string content (r"""...""" or r'''...''')
3010    /// Terminates when three consecutive quote_chars are found
3011    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3012        let mut value = String::new();
3013
3014        while !self.is_at_end() {
3015            let c = self.peek();
3016            if c == quote_char && self.peek_next() == quote_char {
3017                // Check for third quote
3018                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3019                    // Found three consecutive quotes - end of string
3020                    self.advance(); // first closing quote
3021                    self.advance(); // second closing quote
3022                    self.advance(); // third closing quote
3023                    return Ok(value);
3024                }
3025            }
3026            // In raw strings, everything including backslashes is literal
3027            let ch = self.advance();
3028            value.push(ch);
3029        }
3030
3031        Err(Error::tokenize(
3032            "Unterminated raw triple-quoted string",
3033            self.line,
3034            self.column,
3035        ))
3036    }
3037
    /// Scan an identifier that starts with `$` (ClickHouse).
    /// Examples: `$alias$name$`, `$x`
    ///
    /// The leading `$` is kept in the token text, and the result is always a
    /// `Var` token — dollar identifiers never match keywords.
    /// (The TSQL #/@ description that previously sat here belongs to
    /// `scan_tsql_identifier` below.)
    fn scan_dollar_identifier(&mut self) -> Result<()> {
        // Consume the leading $
        self.advance();

        // Consume alphanumeric, _, and $ continuation chars
        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' {
                self.advance();
            } else {
                break;
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }
3060
    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables).
    /// Examples: #temp, ##global_temp, @variable
    fn scan_tsql_identifier(&mut self) -> Result<()> {
        // Consume the leading # or @ (or ##)
        let first = self.advance();

        // Global temp tables use a ## prefix; consume the second #.
        if first == '#' && self.peek() == '#' {
            self.advance();
        }

        // Now scan the rest of the identifier
        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        // These are always identifiers (variables or temp table names), never keywords
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }
3085
    /// Check if the last tokens match INSERT ... FORMAT <name> (not VALUES).
    /// If so, consume everything until the next blank line (two consecutive newlines)
    /// or end of input as raw data (ClickHouse-style inline INSERT data).
    ///
    /// Returns `Some(trimmed_raw_data)` when the pattern matched and non-empty
    /// data was consumed, `None` otherwise (in which case nothing is consumed).
    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
        let len = self.tokens.len();
        // Need at least INSERT ... FORMAT <name> — three tokens minimum.
        if len < 3 {
            return None;
        }

        // Last token should be the format name (Identifier or Var, not VALUES)
        let last = &self.tokens[len - 1];
        if last.text.eq_ignore_ascii_case("VALUES") {
            return None;
        }
        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
            return None;
        }

        // Second-to-last should be FORMAT
        let format_tok = &self.tokens[len - 2];
        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
            return None;
        }

        // Check that there's an INSERT somewhere earlier in the tokens.
        // The lookback is capped at 20 tokens to keep the heuristic cheap.
        let has_insert = self.tokens[..len - 2]
            .iter()
            .rev()
            .take(20)
            .any(|t| t.token_type == TokenType::Insert);
        if !has_insert {
            return None;
        }

        // We're in INSERT ... FORMAT <name> context. Consume everything until:
        // - A blank line (two consecutive newlines, possibly with \r between)
        // - End of input
        let raw_start = self.current;
        while !self.is_at_end() {
            let c = self.peek();
            if c == '\n' {
                // Check for blank line: \n followed by optional \r and \n.
                // `saved` marks the position of the first \n so the raw slice
                // excludes the line break itself.
                let saved = self.current;
                self.advance(); // consume first \n
                // Skip any \r characters between the two newlines
                while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                if self.is_at_end() || self.peek() == '\n' {
                    // Found blank line or end of input - stop here
                    // Don't consume the second \n so subsequent SQL can be tokenized
                    let raw: String = self.chars[raw_start..saved].iter().collect();
                    return Some(raw.trim().to_string());
                }
                // Not a blank line, continue scanning
            } else {
                self.advance();
            }
        }

        // Reached end of input without a blank line: the remainder is the data.
        let raw: String = self.chars[raw_start..self.current].iter().collect();
        let trimmed = raw.trim().to_string();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed)
        }
    }
3155
3156    fn add_token(&mut self, token_type: TokenType) {
3157        let text: String = self.chars[self.start..self.current].iter().collect();
3158        self.add_token_with_text(token_type, text);
3159    }
3160
    /// Emit a token with an explicit text, spanning `start..current` at the
    /// current line/column.
    ///
    /// Any comments buffered in `self.comments` since the previous token are
    /// moved onto this token as its leading comments (the buffer is drained).
    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
        let span = Span::new(self.start, self.current, self.line, self.column);
        let mut token = Token::new(token_type, text, span);
        // `append` moves the pending comments and leaves the buffer empty.
        token.comments.append(&mut self.comments);
        self.tokens.push(token);
    }
3167}
3168
3169#[cfg(test)]
3170mod tests {
3171    use super::*;
3172
3173    #[test]
3174    fn test_simple_select() {
3175        let tokenizer = Tokenizer::default();
3176        let tokens = tokenizer.tokenize("SELECT 1").unwrap();
3177
3178        assert_eq!(tokens.len(), 2);
3179        assert_eq!(tokens[0].token_type, TokenType::Select);
3180        assert_eq!(tokens[1].token_type, TokenType::Number);
3181        assert_eq!(tokens[1].text, "1");
3182    }
3183
3184    #[test]
3185    fn test_select_with_identifier() {
3186        let tokenizer = Tokenizer::default();
3187        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();
3188
3189        assert_eq!(tokens.len(), 6);
3190        assert_eq!(tokens[0].token_type, TokenType::Select);
3191        assert_eq!(tokens[1].token_type, TokenType::Var);
3192        assert_eq!(tokens[1].text, "a");
3193        assert_eq!(tokens[2].token_type, TokenType::Comma);
3194        assert_eq!(tokens[3].token_type, TokenType::Var);
3195        assert_eq!(tokens[3].text, "b");
3196        assert_eq!(tokens[4].token_type, TokenType::From);
3197        assert_eq!(tokens[5].token_type, TokenType::Var);
3198        assert_eq!(tokens[5].text, "t");
3199    }
3200
3201    #[test]
3202    fn test_string_literal() {
3203        let tokenizer = Tokenizer::default();
3204        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();
3205
3206        assert_eq!(tokens.len(), 2);
3207        assert_eq!(tokens[1].token_type, TokenType::String);
3208        assert_eq!(tokens[1].text, "hello");
3209    }
3210
3211    #[test]
3212    fn test_escaped_string() {
3213        let tokenizer = Tokenizer::default();
3214        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();
3215
3216        assert_eq!(tokens.len(), 2);
3217        assert_eq!(tokens[1].token_type, TokenType::String);
3218        assert_eq!(tokens[1].text, "it's");
3219    }
3220
3221    #[test]
3222    fn test_comments() {
3223        let tokenizer = Tokenizer::default();
3224        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();
3225
3226        assert_eq!(tokens.len(), 2);
3227        // Comments are attached to the PREVIOUS token as trailing_comments
3228        // This is better for round-trip fidelity (e.g., SELECT c /* comment */ FROM)
3229        assert_eq!(tokens[0].trailing_comments.len(), 1);
3230        assert_eq!(tokens[0].trailing_comments[0], " comment");
3231    }
3232
3233    #[test]
3234    fn test_comment_in_and_chain() {
3235        use crate::generator::Generator;
3236        use crate::parser::Parser;
3237
3238        // Line comments between AND clauses should appear after the AND operator
3239        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
3240        let ast = Parser::parse_sql(sql).unwrap();
3241        let mut gen = Generator::default();
3242        let output = gen.generate(&ast[0]).unwrap();
3243        assert_eq!(
3244            output,
3245            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
3246        );
3247    }
3248
3249    #[test]
3250    fn test_operators() {
3251        let tokenizer = Tokenizer::default();
3252        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();
3253
3254        assert_eq!(tokens.len(), 5);
3255        assert_eq!(tokens[0].token_type, TokenType::Number);
3256        assert_eq!(tokens[1].token_type, TokenType::Plus);
3257        assert_eq!(tokens[2].token_type, TokenType::Number);
3258        assert_eq!(tokens[3].token_type, TokenType::Star);
3259        assert_eq!(tokens[4].token_type, TokenType::Number);
3260    }
3261
3262    #[test]
3263    fn test_comparison_operators() {
3264        let tokenizer = Tokenizer::default();
3265        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();
3266
3267        assert_eq!(tokens[1].token_type, TokenType::Lte);
3268        assert_eq!(tokens[3].token_type, TokenType::Gte);
3269        assert_eq!(tokens[5].token_type, TokenType::Neq);
3270    }
3271
3272    #[test]
3273    fn test_national_string() {
3274        let tokenizer = Tokenizer::default();
3275        let tokens = tokenizer.tokenize("N'abc'").unwrap();
3276
3277        assert_eq!(
3278            tokens.len(),
3279            1,
3280            "Expected 1 token for N'abc', got {:?}",
3281            tokens
3282        );
3283        assert_eq!(tokens[0].token_type, TokenType::NationalString);
3284        assert_eq!(tokens[0].text, "abc");
3285    }
3286
3287    #[test]
3288    fn test_hex_string() {
3289        let tokenizer = Tokenizer::default();
3290        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();
3291
3292        assert_eq!(
3293            tokens.len(),
3294            1,
3295            "Expected 1 token for X'ABCD', got {:?}",
3296            tokens
3297        );
3298        assert_eq!(tokens[0].token_type, TokenType::HexString);
3299        assert_eq!(tokens[0].text, "ABCD");
3300    }
3301
3302    #[test]
3303    fn test_bit_string() {
3304        let tokenizer = Tokenizer::default();
3305        let tokens = tokenizer.tokenize("B'01010'").unwrap();
3306
3307        assert_eq!(
3308            tokens.len(),
3309            1,
3310            "Expected 1 token for B'01010', got {:?}",
3311            tokens
3312        );
3313        assert_eq!(tokens[0].token_type, TokenType::BitString);
3314        assert_eq!(tokens[0].text, "01010");
3315    }
3316
3317    #[test]
3318    fn test_trailing_dot_number() {
3319        let tokenizer = Tokenizer::default();
3320
3321        // Test trailing dot
3322        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
3323        assert_eq!(
3324            tokens.len(),
3325            2,
3326            "Expected 2 tokens for 'SELECT 1.', got {:?}",
3327            tokens
3328        );
3329        assert_eq!(tokens[1].token_type, TokenType::Number);
3330        assert_eq!(tokens[1].text, "1.");
3331
3332        // Test normal decimal
3333        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
3334        assert_eq!(tokens[1].text, "1.5");
3335
3336        // Test number followed by dot and identifier
3337        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
3338        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
3339        assert_eq!(
3340            tokens.len(),
3341            3,
3342            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
3343            tokens
3344        );
3345        assert_eq!(tokens[1].token_type, TokenType::Number);
3346        assert_eq!(tokens[1].text, "1.");
3347        assert_eq!(tokens[2].token_type, TokenType::Var);
3348
3349        // Test two dots (range operator) - dot is NOT consumed when followed by another dot
3350        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
3351        assert_eq!(tokens[1].token_type, TokenType::Number);
3352        assert_eq!(tokens[1].text, "1");
3353        assert_eq!(tokens[2].token_type, TokenType::Dot);
3354        assert_eq!(tokens[3].token_type, TokenType::Dot);
3355        assert_eq!(tokens[4].token_type, TokenType::Number);
3356        assert_eq!(tokens[4].text, "2");
3357    }
3358
3359    #[test]
3360    fn test_leading_dot_number() {
3361        let tokenizer = Tokenizer::default();
3362
3363        // Test leading dot number (e.g., .25 for 0.25)
3364        let tokens = tokenizer.tokenize(".25").unwrap();
3365        assert_eq!(
3366            tokens.len(),
3367            1,
3368            "Expected 1 token for '.25', got {:?}",
3369            tokens
3370        );
3371        assert_eq!(tokens[0].token_type, TokenType::Number);
3372        assert_eq!(tokens[0].text, ".25");
3373
3374        // Test leading dot in context (Oracle SAMPLE clause)
3375        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
3376        assert_eq!(
3377            tokens.len(),
3378            4,
3379            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
3380            tokens
3381        );
3382        assert_eq!(tokens[0].token_type, TokenType::Sample);
3383        assert_eq!(tokens[1].token_type, TokenType::LParen);
3384        assert_eq!(tokens[2].token_type, TokenType::Number);
3385        assert_eq!(tokens[2].text, ".25");
3386        assert_eq!(tokens[3].token_type, TokenType::RParen);
3387
3388        // Test leading dot with exponent
3389        let tokens = tokenizer.tokenize(".5e10").unwrap();
3390        assert_eq!(
3391            tokens.len(),
3392            1,
3393            "Expected 1 token for '.5e10', got {:?}",
3394            tokens
3395        );
3396        assert_eq!(tokens[0].token_type, TokenType::Number);
3397        assert_eq!(tokens[0].text, ".5e10");
3398
3399        // Test that plain dot is still a Dot token
3400        let tokens = tokenizer.tokenize("a.b").unwrap();
3401        assert_eq!(
3402            tokens.len(),
3403            3,
3404            "Expected 3 tokens for 'a.b', got {:?}",
3405            tokens
3406        );
3407        assert_eq!(tokens[1].token_type, TokenType::Dot);
3408    }
3409
3410    #[test]
3411    fn test_unrecognized_character() {
3412        let tokenizer = Tokenizer::default();
3413
3414        // Unicode curly quotes are now handled as string delimiters
3415        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
3416        assert!(
3417            result.is_ok(),
3418            "Curly quotes should be tokenized as strings"
3419        );
3420
3421        // Unicode bullet character should still error
3422        let result = tokenizer.tokenize("SELECT • FROM t");
3423        assert!(result.is_err());
3424    }
3425
3426    #[test]
3427    fn test_colon_eq_tokenization() {
3428        let tokenizer = Tokenizer::default();
3429
3430        // := should be a single ColonEq token
3431        let tokens = tokenizer.tokenize("a := 1").unwrap();
3432        assert_eq!(tokens.len(), 3);
3433        assert_eq!(tokens[0].token_type, TokenType::Var);
3434        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
3435        assert_eq!(tokens[2].token_type, TokenType::Number);
3436
3437        // : followed by non-= should still be Colon
3438        let tokens = tokenizer.tokenize("a:b").unwrap();
3439        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
3440        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
3441
3442        // :: should still be DColon
3443        let tokens = tokenizer.tokenize("a::INT").unwrap();
3444        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
3445    }
3446
3447    #[test]
3448    fn test_colon_eq_parsing() {
3449        use crate::generator::Generator;
3450        use crate::parser::Parser;
3451
3452        // MySQL @var := value in SELECT
3453        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
3454            .expect("Failed to parse MySQL @var := expr");
3455        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3456        assert_eq!(output, "SELECT @var1 := 1, @var2");
3457
3458        // MySQL @var := @var in SELECT
3459        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
3460            .expect("Failed to parse MySQL @var2 := @var1");
3461        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3462        assert_eq!(output, "SELECT @var1, @var2 := @var1");
3463
3464        // MySQL @var := COUNT(*)
3465        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
3466            .expect("Failed to parse MySQL @var := COUNT(*)");
3467        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3468        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");
3469
3470        // MySQL SET @var := 1 (should normalize to = in output)
3471        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
3472        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3473        assert_eq!(output, "SET @var1 = 1");
3474
3475        // Function named args with :=
3476        let ast =
3477            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
3478        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3479        assert_eq!(output, "UNION_VALUE(k1 := 1)");
3480
3481        // UNNEST with recursive := TRUE
3482        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
3483            .expect("Failed to parse UNNEST with :=");
3484        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3485        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");
3486
3487        // DuckDB prefix alias: foo: 1 means 1 AS foo
3488        let ast =
3489            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
3490        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3491        assert_eq!(output, "SELECT 1 AS foo");
3492
3493        // DuckDB prefix alias with multiple columns
3494        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
3495            .expect("Failed to parse DuckDB multiple prefix aliases");
3496        let output = Generator::sql(&ast[0]).expect("Failed to generate");
3497        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
3498    }
3499
3500    #[test]
3501    fn test_colon_eq_dialect_roundtrip() {
3502        use crate::dialects::{Dialect, DialectType};
3503
3504        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
3505            let d = Dialect::get(dialect);
3506            let ast = d
3507                .parse(sql)
3508                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3509            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3510            let transformed = d
3511                .transform(ast[0].clone())
3512                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3513            let output = d
3514                .generate(&transformed)
3515                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3516            let expected = expected.unwrap_or(sql);
3517            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3518        }
3519
3520        // MySQL := tests
3521        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
3522        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
3523        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
3524        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));
3525
3526        // DuckDB := tests
3527        check(
3528            DialectType::DuckDB,
3529            "SELECT UNNEST(col, recursive := TRUE) FROM t",
3530            None,
3531        );
3532        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);
3533
3534        // STRUCT_PACK(a := 'b')::json should at least parse without error
3535        // (The STRUCT_PACK -> Struct transformation is a separate feature)
3536        {
3537            let d = Dialect::get(DialectType::DuckDB);
3538            let ast = d
3539                .parse("STRUCT_PACK(a := 'b')::json")
3540                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
3541            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
3542        }
3543
3544        // DuckDB prefix alias tests
3545        check(
3546            DialectType::DuckDB,
3547            "SELECT foo: 1",
3548            Some("SELECT 1 AS foo"),
3549        );
3550        check(
3551            DialectType::DuckDB,
3552            "SELECT foo: 1, bar: 2, baz: 3",
3553            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
3554        );
3555    }
3556
3557    #[test]
3558    fn test_comment_roundtrip() {
3559        use crate::generator::Generator;
3560        use crate::parser::Parser;
3561
3562        fn check_roundtrip(sql: &str) -> Option<String> {
3563            let ast = match Parser::parse_sql(sql) {
3564                Ok(a) => a,
3565                Err(e) => return Some(format!("Parse error: {:?}", e)),
3566            };
3567            if ast.is_empty() {
3568                return Some("Empty AST".to_string());
3569            }
3570            let mut generator = Generator::default();
3571            let output = match generator.generate(&ast[0]) {
3572                Ok(o) => o,
3573                Err(e) => return Some(format!("Gen error: {:?}", e)),
3574            };
3575            if output == sql {
3576                None
3577            } else {
3578                Some(format!(
3579                    "Mismatch:\n  input:  {}\n  output: {}",
3580                    sql, output
3581                ))
3582            }
3583        }
3584
3585        let tests = vec![
3586            // Nested comments
3587            "SELECT c /* c1 /* c2 */ c3 */",
3588            "SELECT c /* c1 /* c2 /* c3 */ */ */",
3589            // Simple alias with comments
3590            "SELECT c /* c1 */ AS alias /* c2 */",
3591            // Multiple columns with comments
3592            "SELECT a /* x */, b /* x */",
3593            // Multiple comments after column
3594            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
3595            // FROM tables with comments
3596            "SELECT * FROM foo /* x */, bla /* x */",
3597            // Arithmetic with comments
3598            "SELECT 1 /* comment */ + 1",
3599            "SELECT 1 /* c1 */ + 2 /* c2 */",
3600            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
3601            // CAST with comments
3602            "SELECT CAST(x AS INT) /* comment */ FROM foo",
3603            // Function arguments with comments
3604            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
3605            // Multi-part table names with comments
3606            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
3607            // INSERT with comments
3608            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
3609            // Leading comments on statements
3610            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
3611            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
3612            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
3613            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
3614            "/* comment */ CREATE TABLE foo AS SELECT 1",
3615            // Trailing comments on statements
3616            "INSERT INTO foo SELECT * FROM bar /* comment */",
3617            // Complex nested expressions with comments
3618            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
3619        ];
3620
3621        let mut failures = Vec::new();
3622        for sql in tests {
3623            if let Some(e) = check_roundtrip(sql) {
3624                failures.push(e);
3625            }
3626        }
3627
3628        if !failures.is_empty() {
3629            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
3630        }
3631    }
3632
3633    #[test]
3634    fn test_dollar_quoted_string_parsing() {
3635        use crate::dialects::{Dialect, DialectType};
3636
3637        // Test dollar string token parsing utility function
3638        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
3639        assert_eq!(tag, Some("FOO".to_string()));
3640        assert_eq!(content, "content here");
3641
3642        let (tag, content) = super::parse_dollar_string_token("just content");
3643        assert_eq!(tag, None);
3644        assert_eq!(content, "just content");
3645
3646        // Test roundtrip for Databricks dialect with dollar-quoted function body
3647        fn check_databricks(sql: &str, expected: Option<&str>) {
3648            let d = Dialect::get(DialectType::Databricks);
3649            let ast = d
3650                .parse(sql)
3651                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3652            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3653            let transformed = d
3654                .transform(ast[0].clone())
3655                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3656            let output = d
3657                .generate(&transformed)
3658                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3659            let expected = expected.unwrap_or(sql);
3660            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3661        }
3662
3663        // Test [42]: $$...$$ heredoc
3664        check_databricks(
3665            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n  return x+1$$",
3666            None
3667        );
3668
3669        // Test [43]: $FOO$...$FOO$ tagged heredoc
3670        check_databricks(
3671            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n  return x+1$FOO$",
3672            None
3673        );
3674    }
3675}