// polyglot_sql/tokens.rs
1//! Token types and tokenization for SQL parsing
2//!
3//! This module defines all SQL token types and the tokenizer that converts
4//! SQL strings into token streams.
5
6use crate::error::{Error, Result};
7use serde::{Deserialize, Serialize};
8use std::fmt;
9
/// Parse a DollarString token text into `(tag, content)`.
///
/// If the text contains a `'\x00'` separator, the part before it is the tag
/// and the part after is the content. Otherwise, the whole text is the
/// content with no tag.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    // `split_once` splits on the first NUL, matching the original
    // `find` + byte-slice logic ('\x00' is ASCII, so slicing was safe too).
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_string()), content.to_string()),
        None => (None, text.to_string()),
    }
}
22
/// Represents a position in the source SQL.
///
/// `start`/`end` are byte offsets into the original SQL text (not char
/// indices); `line`/`column` are 1-based coordinates for diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct Span {
    /// Starting byte offset
    pub start: usize,
    /// Ending byte offset (exclusive)
    pub end: usize,
    /// Line number (1-based)
    pub line: usize,
    /// Column number (1-based)
    pub column: usize,
}
35
36impl Span {
37    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
38        Self {
39            start,
40            end,
41            line,
42            column,
43        }
44    }
45}
46
/// A token in the SQL token stream.
///
/// Comments attached to a token are carried along so the original SQL can be
/// reproduced faithfully (round-trip fidelity).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The type of token
    pub token_type: TokenType,
    /// The raw text of the token
    pub text: String,
    /// Position information
    pub span: Span,
    /// Leading comments (comments that appeared before this token)
    #[serde(default)]
    pub comments: Vec<String>,
    /// Trailing comments (comments that appeared after this token, before the next one)
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
63
64impl Token {
65    /// Create a new token
66    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
67        Self {
68            token_type,
69            text: text.into(),
70            span,
71            comments: Vec::new(),
72            trailing_comments: Vec::new(),
73        }
74    }
75
76    /// Create a NUMBER token
77    pub fn number(n: i64) -> Self {
78        Self::new(TokenType::Number, n.to_string(), Span::default())
79    }
80
81    /// Create a STRING token
82    pub fn string(s: impl Into<String>) -> Self {
83        Self::new(TokenType::String, s, Span::default())
84    }
85
86    /// Create an IDENTIFIER token
87    pub fn identifier(s: impl Into<String>) -> Self {
88        Self::new(TokenType::Identifier, s, Span::default())
89    }
90
91    /// Create a VAR token
92    pub fn var(s: impl Into<String>) -> Self {
93        Self::new(TokenType::Var, s, Span::default())
94    }
95
96    /// Add a comment to this token
97    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
98        self.comments.push(comment.into());
99        self
100    }
101}
102
103impl fmt::Display for Token {
104    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
105        write!(f, "{:?}({})", self.token_type, self.text)
106    }
107}
108
/// All possible token types in SQL.
///
/// Serialized with SCREAMING_SNAKE_CASE names (e.g. `GroupBy` ->
/// `"GROUP_BY"`). `#[repr(u16)]` keeps the discriminant compact.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt, // <<
    GtGt, // >>
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments (emitted as tokens for round-trip fidelity)
    BlockComment, // /* ... */
    LineComment,  // -- ...

    // Literals
    String,
    DollarString,             // $$...$$
    TripleDoubleQuotedString, // """..."""
    TripleSingleQuotedString, // '''...'''
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    /// Hex number: 0xA, 0xFF (BigQuery, SQLite style) - represents an integer in hex notation
    HexNumber,
    ByteString,
    NationalString,
    EscapeString, // PostgreSQL E'...' escape string
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data Types
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,   // !~~ operator (PostgreSQL)
    NotILike,  // !~~* operator (PostgreSQL)
    NotRLike,  // !~ operator (PostgreSQL)
    NotIRLike, // !~* operator (PostgreSQL)
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window function keywords
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // DDL-specific keywords (Phase 4)
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    // MATCH_RECOGNIZE tokens
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // Special
    Eof,
}
665
impl TokenType {
    /// Check if this token type is a keyword that can be used as an identifier in certain contexts.
    ///
    /// This set is dialect-curated: parsers consult it to decide whether a
    /// reserved-looking word may still name a column/table/alias. Adding or
    /// removing a variant here changes parser behavior, so keep the list in
    /// sync with the tokenizer's keyword registrations.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            TokenType::Select
                | TokenType::From
                | TokenType::Where
                | TokenType::And
                | TokenType::Or
                | TokenType::Not
                | TokenType::In
                | TokenType::Is
                | TokenType::Null
                | TokenType::True
                | TokenType::False
                | TokenType::As
                | TokenType::On
                | TokenType::Join
                | TokenType::Left
                | TokenType::Right
                | TokenType::Inner
                | TokenType::Outer
                | TokenType::Full
                | TokenType::Cross
                | TokenType::Semi
                | TokenType::Anti
                | TokenType::Union
                | TokenType::Except
                | TokenType::Intersect
                | TokenType::GroupBy
                | TokenType::OrderBy
                | TokenType::Having
                | TokenType::Limit
                | TokenType::Offset
                | TokenType::Case
                | TokenType::When
                | TokenType::Then
                | TokenType::Else
                | TokenType::End
                | TokenType::Create
                | TokenType::Drop
                | TokenType::Alter
                | TokenType::Insert
                | TokenType::Update
                | TokenType::Delete
                | TokenType::Into
                | TokenType::Values
                | TokenType::Set
                | TokenType::With
                | TokenType::Distinct
                | TokenType::All
                | TokenType::Exists
                | TokenType::Between
                | TokenType::Like
                | TokenType::ILike
                // Additional keywords that can be used as identifiers
                | TokenType::Filter
                | TokenType::Date
                | TokenType::Timestamp
                | TokenType::TimestampTz
                | TokenType::Interval
                | TokenType::Time
                | TokenType::Table
                | TokenType::Index
                | TokenType::Column
                | TokenType::Database
                | TokenType::Schema
                | TokenType::View
                | TokenType::Function
                | TokenType::Procedure
                | TokenType::Trigger
                | TokenType::Sequence
                | TokenType::Over
                | TokenType::Partition
                | TokenType::Window
                | TokenType::Rows
                | TokenType::Range
                | TokenType::First
                | TokenType::Last
                | TokenType::Preceding
                | TokenType::Following
                | TokenType::Current
                | TokenType::Row
                | TokenType::Unbounded
                | TokenType::Array
                | TokenType::Struct
                | TokenType::Map
                | TokenType::PrimaryKey
                | TokenType::Key
                | TokenType::ForeignKey
                | TokenType::References
                | TokenType::Unique
                | TokenType::Check
                | TokenType::Default
                | TokenType::Constraint
                | TokenType::Comment
                | TokenType::Rollup
                | TokenType::Cube
                | TokenType::Grant
                | TokenType::Revoke
                | TokenType::Type
                | TokenType::Use
                | TokenType::Cache
                | TokenType::Uncache
                | TokenType::Load
                | TokenType::Any
                | TokenType::Some
                | TokenType::Asc
                | TokenType::Desc
                | TokenType::Nulls
                | TokenType::Lateral
                | TokenType::Natural
                | TokenType::Escape
                | TokenType::Glob
                | TokenType::Match
                | TokenType::Recursive
                | TokenType::Replace
                | TokenType::Returns
                | TokenType::If
                | TokenType::Pivot
                | TokenType::Unpivot
                | TokenType::Json
                | TokenType::Blob
                | TokenType::Text
                | TokenType::Int
                | TokenType::BigInt
                | TokenType::SmallInt
                | TokenType::TinyInt
                | TokenType::Int128
                | TokenType::UInt128
                | TokenType::Int256
                | TokenType::UInt256
                | TokenType::UInt
                | TokenType::UBigInt
                | TokenType::Float
                | TokenType::Double
                | TokenType::Decimal
                | TokenType::Boolean
                | TokenType::VarChar
                | TokenType::Char
                | TokenType::Binary
                | TokenType::VarBinary
                | TokenType::No
                | TokenType::DateTime
                | TokenType::Truncate
                | TokenType::Execute
                | TokenType::Merge
                | TokenType::Top
                | TokenType::Begin
                | TokenType::Generated
                | TokenType::Identity
                | TokenType::Always
                | TokenType::Extract
                // Keywords that can be identifiers in certain contexts
                | TokenType::AsOf
                | TokenType::Prior
                | TokenType::After
                | TokenType::Restrict
                | TokenType::Cascade
                | TokenType::Local
                | TokenType::Rename
                | TokenType::Enum
                | TokenType::Within
                | TokenType::Format
                | TokenType::Final
                | TokenType::FileFormat
                | TokenType::Input
                | TokenType::InputFormat
                | TokenType::Copy
                | TokenType::Put
                | TokenType::Get
                | TokenType::Show
                | TokenType::Serde
                | TokenType::Sample
                | TokenType::Sort
                | TokenType::Collate
                | TokenType::Ties
                | TokenType::IsNull
                | TokenType::NotNull
                | TokenType::Exclude
                | TokenType::Temporary
                | TokenType::Add
                | TokenType::Ordinality
                | TokenType::Overlaps
                | TokenType::Block
                | TokenType::Pattern
                | TokenType::Group
                | TokenType::Cluster
                | TokenType::Repeatable
                | TokenType::Groups
                | TokenType::Commit
                | TokenType::Warehouse
                | TokenType::System
                | TokenType::By
                | TokenType::To
                | TokenType::Fetch
                | TokenType::For
                | TokenType::Only
                | TokenType::Next
                | TokenType::Lock
                | TokenType::Refresh
                | TokenType::Settings
                | TokenType::Operator
                | TokenType::Overwrite
                | TokenType::StraightJoin
                | TokenType::Start
                // Additional keywords registered in tokenizer but previously missing from is_keyword()
                | TokenType::Ignore
                | TokenType::Domain
                | TokenType::Apply
                | TokenType::Respect
                | TokenType::Materialized
                | TokenType::Prewhere
                | TokenType::Old
                | TokenType::New
                | TokenType::Cast
                | TokenType::TryCast
                | TokenType::SafeCast
                | TokenType::Transaction
                | TokenType::Describe
                | TokenType::Kill
                | TokenType::Lambda
                | TokenType::Declare
                | TokenType::Keep
                | TokenType::Output
                | TokenType::Percent
                | TokenType::Qualify
                | TokenType::Returning
                | TokenType::Language
                | TokenType::Preserve
                | TokenType::Savepoint
                | TokenType::Rollback
                | TokenType::Body
                | TokenType::Increment
                | TokenType::Minvalue
                | TokenType::Maxvalue
                | TokenType::Cycle
                | TokenType::NoCycle
                | TokenType::Seed
                | TokenType::Namespace
                | TokenType::Authorization
                | TokenType::Order
                | TokenType::Restart
                | TokenType::Before
                | TokenType::Instead
                | TokenType::Each
                | TokenType::Statement
                | TokenType::Referencing
                | TokenType::Of
                | TokenType::Separator
                | TokenType::Others
                | TokenType::Placing
                | TokenType::Owned
                | TokenType::Running
                | TokenType::Define
                | TokenType::Measures
                | TokenType::MatchRecognize
                | TokenType::AutoIncrement
                | TokenType::Connect
                | TokenType::Distribute
                | TokenType::Bernoulli
                | TokenType::TableSample
                | TokenType::Inpath
                | TokenType::Pragma
                | TokenType::Siblings
                | TokenType::SerdeProperties
                | TokenType::RLike
        )
    }

    /// Check if this token type is a comparison operator
    /// (`=`, `<>`, `<`, `<=`, `>`, `>=`, `<=>`).
    pub fn is_comparison(&self) -> bool {
        matches!(
            self,
            TokenType::Eq
                | TokenType::Neq
                | TokenType::Lt
                | TokenType::Lte
                | TokenType::Gt
                | TokenType::Gte
                | TokenType::NullsafeEq
        )
    }

    /// Check if this token type is an arithmetic operator
    /// (`+`, `-`, `*`, `/`, `%`, and the MOD/DIV keyword forms).
    pub fn is_arithmetic(&self) -> bool {
        matches!(
            self,
            TokenType::Plus
                | TokenType::Dash
                | TokenType::Star
                | TokenType::Slash
                | TokenType::Percent
                | TokenType::Mod
                | TokenType::Div
        )
    }
}
965
966impl fmt::Display for TokenType {
967    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
968        write!(f, "{:?}", self)
969    }
970}
971
/// Tokenizer configuration for a dialect.
///
/// Each field customizes one aspect of lexing so a single tokenizer can
/// serve multiple SQL dialects; defaults are supplied by the `Default` impl.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keywords mapping (uppercase keyword -> token type)
    pub keywords: std::collections::HashMap<String, TokenType>,
    /// Single character tokens
    pub single_tokens: std::collections::HashMap<char, TokenType>,
    /// Quote characters (start -> end)
    pub quotes: std::collections::HashMap<String, String>,
    /// Identifier quote characters (start -> end)
    pub identifiers: std::collections::HashMap<char, char>,
    /// Comment definitions (start -> optional end)
    pub comments: std::collections::HashMap<String, Option<String>>,
    /// String escape characters
    pub string_escapes: Vec<char>,
    /// Whether to support nested comments
    pub nested_comments: bool,
    /// Valid escape follow characters (for MySQL-style escaping).
    /// When a backslash is followed by a character NOT in this list,
    /// the backslash is discarded. When empty, all backslash escapes
    /// preserve the backslash for unrecognized sequences.
    pub escape_follow_chars: Vec<char>,
    /// Whether b'...' is a byte string (true for BigQuery) or bit string (false for standard SQL).
    /// Default is false (bit string).
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}
    /// Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT)
    pub numeric_literals: std::collections::HashMap<String, String>,
    /// Whether unquoted identifiers can start with a digit (e.g., `1a`, `1_a`).
    /// When true, a number followed by letters/underscore is treated as an identifier.
    /// Used by Hive, Spark, MySQL, ClickHouse.
    pub identifiers_can_start_with_digit: bool,
    /// Whether 0x/0X prefix should be treated as hex literals.
    /// When true, `0XCC` is tokenized instead of Number("0") + Identifier("XCC").
    /// Used by BigQuery, SQLite, Teradata.
    pub hex_number_strings: bool,
    /// Whether hex string literals from 0x prefix represent integer values.
    /// When true (BigQuery), 0xA is tokenized as HexNumber (integer in hex notation).
    /// When false (SQLite, Teradata), 0xCC is tokenized as HexString (binary/blob value).
    pub hex_string_is_integer_type: bool,
    /// Whether string escape sequences (like \') are allowed in raw strings.
    /// When true (BigQuery default), \' inside r'...' escapes the quote.
    /// When false (Spark/Databricks), backslashes in raw strings are always literal.
    /// Python sqlglot: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True)
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether # starts a single-line comment (ClickHouse, MySQL)
    pub hash_comments: bool,
    /// Whether $ can start/continue an identifier (ClickHouse).
    /// When true, a bare `$` that is not part of a dollar-quoted string or positional
    /// parameter is treated as an identifier character.
    pub dollar_sign_is_identifier: bool,
    /// Whether INSERT ... FORMAT <name> should treat subsequent data as raw (ClickHouse).
    /// When true, after tokenizing `INSERT ... FORMAT <non-VALUES-name>`, all text until
    /// the next blank line or end of input is consumed as a raw data token.
    pub insert_format_raw_data: bool,
}
1028
1029impl Default for TokenizerConfig {
1030    fn default() -> Self {
1031        let mut keywords = std::collections::HashMap::new();
1032        // Add basic SQL keywords
1033        keywords.insert("SELECT".to_string(), TokenType::Select);
1034        keywords.insert("FROM".to_string(), TokenType::From);
1035        keywords.insert("WHERE".to_string(), TokenType::Where);
1036        keywords.insert("AND".to_string(), TokenType::And);
1037        keywords.insert("OR".to_string(), TokenType::Or);
1038        keywords.insert("NOT".to_string(), TokenType::Not);
1039        keywords.insert("AS".to_string(), TokenType::As);
1040        keywords.insert("ON".to_string(), TokenType::On);
1041        keywords.insert("JOIN".to_string(), TokenType::Join);
1042        keywords.insert("LEFT".to_string(), TokenType::Left);
1043        keywords.insert("RIGHT".to_string(), TokenType::Right);
1044        keywords.insert("INNER".to_string(), TokenType::Inner);
1045        keywords.insert("OUTER".to_string(), TokenType::Outer);
1046        keywords.insert("OUTPUT".to_string(), TokenType::Output);
1047        keywords.insert("FULL".to_string(), TokenType::Full);
1048        keywords.insert("CROSS".to_string(), TokenType::Cross);
1049        keywords.insert("SEMI".to_string(), TokenType::Semi);
1050        keywords.insert("ANTI".to_string(), TokenType::Anti);
1051        keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1052        keywords.insert("UNION".to_string(), TokenType::Union);
1053        keywords.insert("EXCEPT".to_string(), TokenType::Except);
1054        keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle/Redshift alias for EXCEPT
1055        keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1056        keywords.insert("GROUP".to_string(), TokenType::Group);
1057        keywords.insert("CUBE".to_string(), TokenType::Cube);
1058        keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1059        keywords.insert("WITHIN".to_string(), TokenType::Within);
1060        keywords.insert("ORDER".to_string(), TokenType::Order);
1061        keywords.insert("BY".to_string(), TokenType::By);
1062        keywords.insert("HAVING".to_string(), TokenType::Having);
1063        keywords.insert("LIMIT".to_string(), TokenType::Limit);
1064        keywords.insert("OFFSET".to_string(), TokenType::Offset);
1065        keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1066        keywords.insert("FETCH".to_string(), TokenType::Fetch);
1067        keywords.insert("FIRST".to_string(), TokenType::First);
1068        keywords.insert("NEXT".to_string(), TokenType::Next);
1069        keywords.insert("ONLY".to_string(), TokenType::Only);
1070        keywords.insert("KEEP".to_string(), TokenType::Keep);
1071        keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1072        keywords.insert("INPUT".to_string(), TokenType::Input);
1073        keywords.insert("CASE".to_string(), TokenType::Case);
1074        keywords.insert("WHEN".to_string(), TokenType::When);
1075        keywords.insert("THEN".to_string(), TokenType::Then);
1076        keywords.insert("ELSE".to_string(), TokenType::Else);
1077        keywords.insert("END".to_string(), TokenType::End);
1078        keywords.insert("ENDIF".to_string(), TokenType::End); // Exasol alias for END
1079        keywords.insert("NULL".to_string(), TokenType::Null);
1080        keywords.insert("TRUE".to_string(), TokenType::True);
1081        keywords.insert("FALSE".to_string(), TokenType::False);
1082        keywords.insert("IS".to_string(), TokenType::Is);
1083        keywords.insert("IN".to_string(), TokenType::In);
1084        keywords.insert("BETWEEN".to_string(), TokenType::Between);
1085        keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1086        keywords.insert("LIKE".to_string(), TokenType::Like);
1087        keywords.insert("ILIKE".to_string(), TokenType::ILike);
1088        keywords.insert("RLIKE".to_string(), TokenType::RLike);
1089        keywords.insert("REGEXP".to_string(), TokenType::RLike);
1090        keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1091        keywords.insert("EXISTS".to_string(), TokenType::Exists);
1092        keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1093        keywords.insert("ALL".to_string(), TokenType::All);
1094        keywords.insert("WITH".to_string(), TokenType::With);
1095        keywords.insert("CREATE".to_string(), TokenType::Create);
1096        keywords.insert("DROP".to_string(), TokenType::Drop);
1097        keywords.insert("ALTER".to_string(), TokenType::Alter);
1098        keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1099        keywords.insert("TABLE".to_string(), TokenType::Table);
1100        keywords.insert("VIEW".to_string(), TokenType::View);
1101        keywords.insert("INDEX".to_string(), TokenType::Index);
1102        keywords.insert("COLUMN".to_string(), TokenType::Column);
1103        keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1104        keywords.insert("ADD".to_string(), TokenType::Add);
1105        keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1106        keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1107        keywords.insert("RENAME".to_string(), TokenType::Rename);
1108        keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1109        keywords.insert("TEMP".to_string(), TokenType::Temporary);
1110        keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1111        keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1112        keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1113        keywords.insert("KEY".to_string(), TokenType::Key);
1114        keywords.insert("KILL".to_string(), TokenType::Kill);
1115        keywords.insert("REFERENCES".to_string(), TokenType::References);
1116        keywords.insert("DEFAULT".to_string(), TokenType::Default);
1117        keywords.insert("DECLARE".to_string(), TokenType::Declare);
1118        keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1119        keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); // Snowflake style
1120        keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1121        keywords.insert("REPLACE".to_string(), TokenType::Replace);
1122        keywords.insert("TO".to_string(), TokenType::To);
1123        keywords.insert("INSERT".to_string(), TokenType::Insert);
1124        keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1125        keywords.insert("UPDATE".to_string(), TokenType::Update);
1126        keywords.insert("USE".to_string(), TokenType::Use);
1127        keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1128        keywords.insert("GLOB".to_string(), TokenType::Glob);
1129        keywords.insert("DELETE".to_string(), TokenType::Delete);
1130        keywords.insert("MERGE".to_string(), TokenType::Merge);
1131        keywords.insert("CACHE".to_string(), TokenType::Cache);
1132        keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1133        keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1134        keywords.insert("GRANT".to_string(), TokenType::Grant);
1135        keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1136        keywords.insert("COMMENT".to_string(), TokenType::Comment);
1137        keywords.insert("COLLATE".to_string(), TokenType::Collate);
1138        keywords.insert("INTO".to_string(), TokenType::Into);
1139        keywords.insert("VALUES".to_string(), TokenType::Values);
1140        keywords.insert("SET".to_string(), TokenType::Set);
1141        keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1142        keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1143        keywords.insert("ASC".to_string(), TokenType::Asc);
1144        keywords.insert("DESC".to_string(), TokenType::Desc);
1145        keywords.insert("NULLS".to_string(), TokenType::Nulls);
1146        keywords.insert("RESPECT".to_string(), TokenType::Respect);
1147        keywords.insert("FIRST".to_string(), TokenType::First);
1148        keywords.insert("LAST".to_string(), TokenType::Last);
1149        keywords.insert("IF".to_string(), TokenType::If);
1150        keywords.insert("CAST".to_string(), TokenType::Cast);
1151        keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1152        keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1153        keywords.insert("OVER".to_string(), TokenType::Over);
1154        keywords.insert("PARTITION".to_string(), TokenType::Partition);
1155        keywords.insert("PLACING".to_string(), TokenType::Placing);
1156        keywords.insert("WINDOW".to_string(), TokenType::Window);
1157        keywords.insert("ROWS".to_string(), TokenType::Rows);
1158        keywords.insert("RANGE".to_string(), TokenType::Range);
1159        keywords.insert("FILTER".to_string(), TokenType::Filter);
1160        keywords.insert("NATURAL".to_string(), TokenType::Natural);
1161        keywords.insert("USING".to_string(), TokenType::Using);
1162        keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1163        keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1164        keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1165        keywords.insert("CURRENT".to_string(), TokenType::Current);
1166        keywords.insert("ROW".to_string(), TokenType::Row);
1167        keywords.insert("GROUPS".to_string(), TokenType::Groups);
1168        keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1169        // TRIM function position keywords
1170        keywords.insert("BOTH".to_string(), TokenType::Both);
1171        keywords.insert("LEADING".to_string(), TokenType::Leading);
1172        keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1173        keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1174        // Phase 3: Additional keywords
1175        keywords.insert("TOP".to_string(), TokenType::Top);
1176        keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1177        keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1178        keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1179        keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1180        keywords.insert("SYSTEM".to_string(), TokenType::System);
1181        keywords.insert("BLOCK".to_string(), TokenType::Block);
1182        keywords.insert("SEED".to_string(), TokenType::Seed);
1183        keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1184        keywords.insert("TIES".to_string(), TokenType::Ties);
1185        keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1186        keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1187        keywords.insert("APPLY".to_string(), TokenType::Apply);
1188        // Oracle CONNECT BY keywords
1189        keywords.insert("CONNECT".to_string(), TokenType::Connect);
1190        // Hive/Spark specific keywords
1191        keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1192        keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1193        keywords.insert("SORT".to_string(), TokenType::Sort);
1194        keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1195        keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1196        keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1197        keywords.insert("FOR".to_string(), TokenType::For);
1198        keywords.insert("ANY".to_string(), TokenType::Any);
1199        keywords.insert("SOME".to_string(), TokenType::Some);
1200        keywords.insert("ASOF".to_string(), TokenType::AsOf);
1201        keywords.insert("PERCENT".to_string(), TokenType::Percent);
1202        keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1203        keywords.insert("NO".to_string(), TokenType::No);
1204        keywords.insert("OTHERS".to_string(), TokenType::Others);
1205        // PostgreSQL OPERATOR() syntax for schema-qualified operators
1206        keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1207        // Phase 4: DDL keywords
1208        keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1209        keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1210        keywords.insert("DATABASE".to_string(), TokenType::Database);
1211        keywords.insert("FUNCTION".to_string(), TokenType::Function);
1212        keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1213        keywords.insert("PROC".to_string(), TokenType::Procedure);
1214        keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1215        keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1216        keywords.insert("TYPE".to_string(), TokenType::Type);
1217        keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1218        keywords.insert("RETURNS".to_string(), TokenType::Returns);
1219        keywords.insert("RETURNING".to_string(), TokenType::Returning);
1220        keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1221        keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1222        keywords.insert("COMMIT".to_string(), TokenType::Commit);
1223        keywords.insert("BEGIN".to_string(), TokenType::Begin);
1224        keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1225        keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1226        keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1227        keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1228        keywords.insert("BODY".to_string(), TokenType::Body);
1229        keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1230        keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1231        keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1232        keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1233        keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1234        keywords.insert("PRIOR".to_string(), TokenType::Prior);
1235        // MATCH_RECOGNIZE keywords
1236        keywords.insert("MATCH".to_string(), TokenType::Match);
1237        keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1238        keywords.insert("MEASURES".to_string(), TokenType::Measures);
1239        keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1240        keywords.insert("DEFINE".to_string(), TokenType::Define);
1241        keywords.insert("RUNNING".to_string(), TokenType::Running);
1242        keywords.insert("FINAL".to_string(), TokenType::Final);
1243        keywords.insert("OWNED".to_string(), TokenType::Owned);
1244        keywords.insert("AFTER".to_string(), TokenType::After);
1245        keywords.insert("BEFORE".to_string(), TokenType::Before);
1246        keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1247        keywords.insert("EACH".to_string(), TokenType::Each);
1248        keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1249        keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1250        keywords.insert("OLD".to_string(), TokenType::Old);
1251        keywords.insert("NEW".to_string(), TokenType::New);
1252        keywords.insert("OF".to_string(), TokenType::Of);
1253        keywords.insert("CHECK".to_string(), TokenType::Check);
1254        keywords.insert("START".to_string(), TokenType::Start);
1255        keywords.insert("ENUM".to_string(), TokenType::Enum);
1256        keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1257        keywords.insert("RESTART".to_string(), TokenType::Restart);
1258        // Date/time literal keywords
1259        keywords.insert("DATE".to_string(), TokenType::Date);
1260        keywords.insert("TIME".to_string(), TokenType::Time);
1261        keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1262        keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1263        keywords.insert("GENERATED".to_string(), TokenType::Generated);
1264        keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1265        keywords.insert("ALWAYS".to_string(), TokenType::Always);
1266        // LOAD DATA keywords
1267        keywords.insert("LOAD".to_string(), TokenType::Load);
1268        keywords.insert("LOCAL".to_string(), TokenType::Local);
1269        keywords.insert("INPATH".to_string(), TokenType::Inpath);
1270        keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1271        keywords.insert("SERDE".to_string(), TokenType::Serde);
1272        keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1273        keywords.insert("FORMAT".to_string(), TokenType::Format);
1274        // SQLite
1275        keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1276        // SHOW statement
1277        keywords.insert("SHOW".to_string(), TokenType::Show);
1278        // Oracle ORDER SIBLINGS BY (hierarchical queries)
1279        keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1280        // COPY and PUT statements (Snowflake, PostgreSQL)
1281        keywords.insert("COPY".to_string(), TokenType::Copy);
1282        keywords.insert("PUT".to_string(), TokenType::Put);
1283        keywords.insert("GET".to_string(), TokenType::Get);
1284        // EXEC/EXECUTE statement (TSQL, etc.)
1285        keywords.insert("EXEC".to_string(), TokenType::Execute);
1286        keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1287        // Postfix null check operators (PostgreSQL/SQLite)
1288        keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1289        keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1290
1291        let mut single_tokens = std::collections::HashMap::new();
1292        single_tokens.insert('(', TokenType::LParen);
1293        single_tokens.insert(')', TokenType::RParen);
1294        single_tokens.insert('[', TokenType::LBracket);
1295        single_tokens.insert(']', TokenType::RBracket);
1296        single_tokens.insert('{', TokenType::LBrace);
1297        single_tokens.insert('}', TokenType::RBrace);
1298        single_tokens.insert(',', TokenType::Comma);
1299        single_tokens.insert('.', TokenType::Dot);
1300        single_tokens.insert(';', TokenType::Semicolon);
1301        single_tokens.insert('+', TokenType::Plus);
1302        single_tokens.insert('-', TokenType::Dash);
1303        single_tokens.insert('*', TokenType::Star);
1304        single_tokens.insert('/', TokenType::Slash);
1305        single_tokens.insert('%', TokenType::Percent);
1306        single_tokens.insert('&', TokenType::Amp);
1307        single_tokens.insert('|', TokenType::Pipe);
1308        single_tokens.insert('^', TokenType::Caret);
1309        single_tokens.insert('~', TokenType::Tilde);
1310        single_tokens.insert('<', TokenType::Lt);
1311        single_tokens.insert('>', TokenType::Gt);
1312        single_tokens.insert('=', TokenType::Eq);
1313        single_tokens.insert('!', TokenType::Exclamation);
1314        single_tokens.insert(':', TokenType::Colon);
1315        single_tokens.insert('@', TokenType::DAt);
1316        single_tokens.insert('#', TokenType::Hash);
1317        single_tokens.insert('$', TokenType::Dollar);
1318        single_tokens.insert('?', TokenType::Parameter);
1319
1320        let mut quotes = std::collections::HashMap::new();
1321        quotes.insert("'".to_string(), "'".to_string());
1322        // Triple-quoted strings (e.g., """x""")
1323        quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());
1324
1325        let mut identifiers = std::collections::HashMap::new();
1326        identifiers.insert('"', '"');
1327        identifiers.insert('`', '`');
1328        // Note: TSQL bracket-quoted identifiers [name] are handled in the parser
1329        // because [ is also used for arrays and subscripts
1330
1331        let mut comments = std::collections::HashMap::new();
1332        comments.insert("--".to_string(), None);
1333        comments.insert("/*".to_string(), Some("*/".to_string()));
1334
1335        Self {
1336            keywords,
1337            single_tokens,
1338            quotes,
1339            identifiers,
1340            comments,
1341            // Standard SQL: only '' (doubled quote) escapes a quote
1342            // Backslash escapes are dialect-specific (MySQL, etc.)
1343            string_escapes: vec!['\''],
1344            nested_comments: true,
1345            // By default, no escape_follow_chars means preserve backslash for unrecognized escapes
1346            escape_follow_chars: vec![],
1347            // Default: b'...' is bit string (standard SQL), not byte string (BigQuery)
1348            b_prefix_is_byte_string: false,
1349            numeric_literals: std::collections::HashMap::new(),
1350            identifiers_can_start_with_digit: false,
1351            hex_number_strings: false,
1352            hex_string_is_integer_type: false,
1353            // Default: backslash escapes ARE allowed in raw strings (sqlglot default)
1354            // Spark/Databricks set this to false
1355            string_escapes_allowed_in_raw_strings: true,
1356            hash_comments: false,
1357            dollar_sign_is_identifier: false,
1358            insert_format_raw_data: false,
1359        }
1360    }
1361}
1362
/// SQL Tokenizer
///
/// Wraps a [`TokenizerConfig`] and converts SQL text into a stream of
/// [`Token`]s via [`Tokenizer::tokenize`].
pub struct Tokenizer {
    /// Dialect-specific tokenization settings (keyword map, quote styles,
    /// comment markers, escape rules, ...).
    config: TokenizerConfig,
}
1367
1368impl Tokenizer {
1369    /// Create a new tokenizer with the given configuration
1370    pub fn new(config: TokenizerConfig) -> Self {
1371        Self { config }
1372    }
1373
1374    /// Create a tokenizer with default configuration
1375    pub fn default_config() -> Self {
1376        Self::new(TokenizerConfig::default())
1377    }
1378
1379    /// Tokenize a SQL string
1380    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1381        let mut state = TokenizerState::new(sql, &self.config);
1382        state.tokenize()
1383    }
1384}
1385
impl Default for Tokenizer {
    /// Equivalent to [`Tokenizer::default_config`]: a tokenizer built from the
    /// default `TokenizerConfig`.
    fn default() -> Self {
        Self::default_config()
    }
}
1391
/// Internal state for tokenization
///
/// Holds the decoded character buffer plus all cursor and position bookkeeping
/// for a single scan of one SQL string.
struct TokenizerState<'a> {
    /// The input SQL decoded into `char`s for O(1) indexed access.
    chars: Vec<char>,
    /// Total number of chars in `chars`.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Index of the first char of the token currently being scanned.
    start: usize,
    /// Index of the next char to consume.
    current: usize,
    /// Current line number (1-based).
    line: usize,
    /// Current column number (1-based).
    column: usize,
    /// Pending leading comments to be attached to the next token.
    comments: Vec<String>,
    /// Dialect configuration driving keyword/quote/comment recognition.
    config: &'a TokenizerConfig,
}
1404
1405impl<'a> TokenizerState<'a> {
1406    fn new(sql: &str, config: &'a TokenizerConfig) -> Self {
1407        let chars: Vec<char> = sql.chars().collect();
1408        let size = chars.len();
1409        Self {
1410            chars,
1411            size,
1412            tokens: Vec::new(),
1413            start: 0,
1414            current: 0,
1415            line: 1,
1416            column: 1,
1417            comments: Vec::new(),
1418            config,
1419        }
1420    }
1421
1422    fn tokenize(&mut self) -> Result<Vec<Token>> {
1423        while !self.is_at_end() {
1424            self.skip_whitespace();
1425            if self.is_at_end() {
1426                break;
1427            }
1428
1429            self.start = self.current;
1430            self.scan_token()?;
1431
1432            // ClickHouse: After INSERT ... FORMAT <name> (where name != VALUES),
1433            // the rest until the next blank line or end of input is raw data.
1434            if self.config.insert_format_raw_data {
1435                if let Some(raw) = self.try_scan_insert_format_raw_data() {
1436                    if !raw.is_empty() {
1437                        self.start = self.current;
1438                        self.add_token_with_text(TokenType::Var, raw);
1439                    }
1440                }
1441            }
1442        }
1443
1444        // Handle leftover leading comments at end of input.
1445        // These are comments on a new line after the last token that couldn't be attached
1446        // as leading comments to a subsequent token (because there is none).
1447        // Attach them as trailing comments on the last token so they're preserved.
1448        if !self.comments.is_empty() {
1449            if let Some(last) = self.tokens.last_mut() {
1450                last.trailing_comments.extend(self.comments.drain(..));
1451            }
1452        }
1453
1454        Ok(std::mem::take(&mut self.tokens))
1455    }
1456
1457    fn is_at_end(&self) -> bool {
1458        self.current >= self.size
1459    }
1460
1461    fn peek(&self) -> char {
1462        if self.is_at_end() {
1463            '\0'
1464        } else {
1465            self.chars[self.current]
1466        }
1467    }
1468
1469    fn peek_next(&self) -> char {
1470        if self.current + 1 >= self.size {
1471            '\0'
1472        } else {
1473            self.chars[self.current + 1]
1474        }
1475    }
1476
1477    fn advance(&mut self) -> char {
1478        let c = self.peek();
1479        self.current += 1;
1480        if c == '\n' {
1481            self.line += 1;
1482            self.column = 1;
1483        } else {
1484            self.column += 1;
1485        }
1486        c
1487    }
1488
    /// Skip whitespace and comments between tokens.
    ///
    /// Comments found here are attached rather than discarded: a comment that
    /// starts on a fresh line becomes a *leading* comment of the next token,
    /// while a comment on the same line as preceding code becomes a *trailing*
    /// comment of the previous token.
    fn skip_whitespace(&mut self) {
        // Track whether we've seen a newline since the last token.
        // Comments on a new line (after a newline) are leading comments on the next token,
        // while comments on the same line are trailing comments on the previous token.
        // This matches Python sqlglot's behavior.
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            // NOTE: match-arm order is significant — the hash_comments `//` arm
            // must be tried before the dialect-specific `//` arm further down.
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                '\u{00A0}' // non-breaking space
                | '\u{2000}'..='\u{200B}' // various Unicode spaces + zero-width space
                | '\u{3000}' // ideographic (full-width) space
                | '\u{FEFF}' // BOM / zero-width no-break space
                => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    // ClickHouse: // single-line comments (same dialects that support # comments)
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // Check if this is a hint comment /*+ ... */
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        // This is a hint comment, handle it as a token instead of skipping
                        // (presumably dispatched to scan_hint by the main scanner — confirm).
                        break;
                    }
                    if self.scan_block_comment(saw_newline).is_err() {
                        // Unterminated block comment: stop skipping here and let the
                        // caller continue from the current position.
                        return;
                    }
                    // Don't reset saw_newline - it carries forward
                }
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Dialect-specific // line comment (e.g., Snowflake)
                    // But NOT inside URIs like file:// or paths with consecutive slashes
                    // Check that previous non-whitespace char is not ':' or '/'
                    let prev_non_ws = if self.current > 0 {
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        // This is likely a URI (file://, http://) or path, not a comment
                        break;
                    }
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                _ => break,
            }
        }
    }
1560
1561    fn scan_hash_line_comment(&mut self) {
1562        self.advance(); // #
1563        let start = self.current;
1564        while !self.is_at_end() && self.peek() != '\n' {
1565            self.advance();
1566        }
1567        let comment: String = self.chars[start..self.current].iter().collect();
1568        let comment_text = comment.trim().to_string();
1569        if let Some(last) = self.tokens.last_mut() {
1570            last.trailing_comments.push(comment_text);
1571        } else {
1572            self.comments.push(comment_text);
1573        }
1574    }
1575
1576    fn scan_double_slash_comment(&mut self) {
1577        self.advance(); // /
1578        self.advance(); // /
1579        let start = self.current;
1580        while !self.is_at_end() && self.peek() != '\n' {
1581            self.advance();
1582        }
1583        let comment: String = self.chars[start..self.current].iter().collect();
1584        let comment_text = comment.trim().to_string();
1585        if let Some(last) = self.tokens.last_mut() {
1586            last.trailing_comments.push(comment_text);
1587        } else {
1588            self.comments.push(comment_text);
1589        }
1590    }
1591
1592    fn scan_line_comment(&mut self, after_newline: bool) {
1593        self.advance(); // -
1594        self.advance(); // -
1595        let start = self.current;
1596        while !self.is_at_end() && self.peek() != '\n' {
1597            self.advance();
1598        }
1599        let comment_text: String = self.chars[start..self.current].iter().collect();
1600
1601        // If the comment starts on a new line (after_newline), it's a leading comment
1602        // on the next token. Otherwise, it's a trailing comment on the previous token.
1603        if after_newline || self.tokens.is_empty() {
1604            self.comments.push(comment_text);
1605        } else if let Some(last) = self.tokens.last_mut() {
1606            last.trailing_comments.push(comment_text);
1607        }
1608    }
1609
    /// Consume a block comment `/* ... */`, honoring nesting when
    /// `config.nested_comments` is set, and attach its full text (delimiters
    /// included) as a leading or trailing comment.
    ///
    /// `after_newline`: true when the comment starts on its own line, making
    /// it a leading comment of the next token; otherwise it trails the
    /// previous token. Returns an error if the comment is unterminated.
    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // /
        self.advance(); // *
        let content_start = self.current;
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                // Only inner `*/` pairs are consumed here; the outermost
                // terminator is left in place so the content slice below
                // excludes it (it is consumed explicitly after slicing).
                if depth > 0 {
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
            ));
        }

        // Get the content between /* and */ (preserving internal whitespace for nested comments)
        let content: String = self.chars[content_start..self.current].iter().collect();
        self.advance(); // *
        self.advance(); // /

        // For round-trip fidelity, preserve the exact comment content including nested comments
        let comment_text = format!("/*{}*/", content);

        // If the comment starts on a new line (after_newline), it's a leading comment
        // on the next token. Otherwise, it's a trailing comment on the previous token.
        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }

        Ok(())
    }
1658
1659    /// Scan a hint comment /*+ ... */ and return it as a Hint token
1660    fn scan_hint(&mut self) -> Result<()> {
1661        self.advance(); // /
1662        self.advance(); // *
1663        self.advance(); // +
1664        let hint_start = self.current;
1665
1666        // Scan until we find */
1667        while !self.is_at_end() {
1668            if self.peek() == '*' && self.peek_next() == '/' {
1669                break;
1670            }
1671            self.advance();
1672        }
1673
1674        if self.is_at_end() {
1675            return Err(Error::tokenize(
1676                "Unterminated hint comment",
1677                self.line,
1678                self.column,
1679            ));
1680        }
1681
1682        let hint_text: String = self.chars[hint_start..self.current].iter().collect();
1683        self.advance(); // *
1684        self.advance(); // /
1685
1686        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1687
1688        Ok(())
1689    }
1690
1691    /// Scan a positional parameter: $1, $2, etc.
1692    fn scan_positional_parameter(&mut self) -> Result<()> {
1693        self.advance(); // consume $
1694        let start = self.current;
1695
1696        while !self.is_at_end() && self.peek().is_ascii_digit() {
1697            self.advance();
1698        }
1699
1700        let number: String = self.chars[start..self.current].iter().collect();
1701        self.add_token_with_text(TokenType::Parameter, number);
1702        Ok(())
1703    }
1704
    /// Try to scan a tagged dollar-quoted string: $tag$content$tag$
    /// Returns Some(()) if successful, None if this isn't a tagged dollar string.
    ///
    /// The token text is stored as "tag\x00content" to preserve the tag for later use.
    /// On failure (no '$' closing the opening tag, or the closing tag is never
    /// found before end of input) the scan position is rewound so the caller
    /// can retry the '$' under a different rule.
    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
        let saved_pos = self.current;

        // We're at '$', next char is alphabetic
        self.advance(); // consume opening $

        // Scan the tag (identifier: alphanumeric + underscore, including Unicode)
        // Tags can contain Unicode characters like emojis (e.g., $🦆$)
        let tag_start = self.current;
        while !self.is_at_end()
            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
        {
            self.advance();
        }
        let tag: String = self.chars[tag_start..self.current].iter().collect();

        // Must have a closing $ after the tag
        if self.is_at_end() || self.peek() != '$' {
            // Not a tagged dollar string - restore position
            // NOTE(review): only `current` is rewound; if advance() also tracks
            // line/column counters, they may drift after this rewind — confirm
            // against the advance() implementation.
            self.current = saved_pos;
            return Ok(None);
        }
        self.advance(); // consume closing $ of opening tag

        // Now scan content until we find $tag$
        let content_start = self.current;
        let closing_tag = format!("${}$", tag);
        let closing_chars: Vec<char> = closing_tag.chars().collect();

        loop {
            if self.is_at_end() {
                // Unterminated - restore and fall through
                self.current = saved_pos;
                return Ok(None);
            }

            // Check if we've reached the closing tag
            // (cheap '$' probe first, then a full char-by-char comparison)
            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
                    self.current + j < self.size && self.chars[self.current + j] == ch
                });
                if matches {
                    let content: String = self.chars[content_start..self.current].iter().collect();
                    // Consume closing tag
                    for _ in 0..closing_chars.len() {
                        self.advance();
                    }
                    // Store as "tag\x00content" to preserve the tag
                    let token_text = format!("{}\x00{}", tag, content);
                    self.add_token_with_text(TokenType::DollarString, token_text);
                    return Ok(Some(()));
                }
            }
            self.advance();
        }
    }
1765
1766    /// Scan a dollar-quoted string: $$content$$ or $tag$content$tag$
1767    ///
1768    /// For $$...$$ (no tag), the token text is just the content.
1769    /// For $tag$...$tag$, use try_scan_tagged_dollar_string instead.
1770    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1771        self.advance(); // consume first $
1772        self.advance(); // consume second $
1773
1774        // For $$...$$ (no tag), just scan until closing $$
1775        let start = self.current;
1776        while !self.is_at_end() {
1777            if self.peek() == '$'
1778                && self.current + 1 < self.size
1779                && self.chars[self.current + 1] == '$'
1780            {
1781                break;
1782            }
1783            self.advance();
1784        }
1785
1786        let content: String = self.chars[start..self.current].iter().collect();
1787
1788        if !self.is_at_end() {
1789            self.advance(); // consume first $
1790            self.advance(); // consume second $
1791        }
1792
1793        self.add_token_with_text(TokenType::DollarString, content);
1794        Ok(())
1795    }
1796
    /// Dispatch on the current character and scan exactly one token.
    ///
    /// Ordering is significant throughout: longer / more specific constructs
    /// (triple quotes, tagged dollar strings, hint comments, multi-character
    /// operators) must be tried before their shorter prefixes, and several
    /// checks are gated on dialect configuration (`self.config`).
    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        // Check for string literal
        if c == '\'' {
            // Check for triple-quoted string '''...''' if configured
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        // Check for triple-quoted string """...""" if configured
        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        // Check for double-quoted strings when dialect supports them (e.g., BigQuery)
        // This must come before identifier quotes check
        if c == '"'
            && self.config.quotes.contains_key("\"")
            && !self.config.identifiers.contains_key(&'"')
        {
            return self.scan_double_quoted_string();
        }

        // Check for identifier quotes
        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        // Check for numbers (including numbers starting with a dot like .25)
        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // Check for numbers starting with a dot (e.g., .25, .5)
        // This must come before single character token handling
        // Don't treat as a number if:
        // - Previous char was also a dot (e.g., 1..2 should be 1, ., ., 2)
        // - Previous char is an identifier character (e.g., foo.25 should be foo, ., 25)
        //   This handles BigQuery numeric table parts like project.dataset.25
        if c == '.' && self.peek_next().is_ascii_digit() {
            // '\0' sentinel stands in for "no previous char" at input start.
            let prev_char = if self.current > 0 {
                self.chars[self.current - 1]
            } else {
                '\0'
            };
            // Closing quote/bracket/paren also ends an identifier-like unit.
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // Check for hint comment /*+ ... */
        if c == '/'
            && self.peek_next() == '*'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '+'
        {
            return self.scan_hint();
        }

        // Check for multi-character operators first
        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // Check for tagged dollar-quoted strings: $tag$content$tag$
        // Tags can contain Unicode characters (including emojis like 🦆) and digits (e.g., $1$)
        if c == '$'
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || !self.peek_next().is_ascii())
        {
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
            // If tagged dollar string didn't match and dollar_sign_is_identifier is set,
            // treat the $ and following chars as an identifier (e.g., ClickHouse $alias$name$).
            if self.config.dollar_sign_is_identifier {
                return self.scan_dollar_identifier();
            }
        }

        // Check for dollar-quoted strings: $$...$$
        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // Check for positional parameters: $1, $2, etc.
        // (Reached only when the tagged-dollar attempt above rewound, e.g. "$1" with no closing tag.)
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        // ClickHouse: bare $ (not followed by alphanumeric/underscore) as identifier
        if c == '$' && self.config.dollar_sign_is_identifier {
            return self.scan_dollar_identifier();
        }

        // TSQL: Check for identifiers starting with # (temp tables) or @ (variables)
        // e.g., #temp, ##global_temp, @variable
        if (c == '#' || c == '@')
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || self.peek_next() == '#')
        {
            return self.scan_tsql_identifier();
        }

        // Check for single character tokens
        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        // Unicode minus (U+2212) → treat as regular minus
        if c == '\u{2212}' {
            self.advance();
            self.add_token(TokenType::Dash);
            return Ok(());
        }

        // Unicode fraction slash (U+2044) → treat as regular slash
        if c == '\u{2044}' {
            self.advance();
            self.add_token(TokenType::Slash);
            return Ok(());
        }

        // Unicode curly/smart quotes → treat as regular string quotes
        if c == '\u{2018}' || c == '\u{2019}' {
            // Left/right single quotation marks → scan as string with matching end
            return self.scan_unicode_quoted_string(c);
        }
        if c == '\u{201C}' || c == '\u{201D}' {
            // Left/right double quotation marks → scan as quoted identifier
            return self.scan_unicode_quoted_identifier(c);
        }

        // Must be an identifier or keyword
        self.scan_identifier_or_keyword()
    }
1956
    /// Try to scan a multi-character operator at the current position.
    ///
    /// On success, consumes the operator's characters and returns its token
    /// type; otherwise returns None with the position unchanged. Longer
    /// operators are checked before their prefixes (e.g. `->>` before `->`,
    /// `::$`/`::%`/`::?` before `::`, `!~~*` before `!~~` before `!~`).
    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
        let c = self.peek();
        let next = self.peek_next();
        // Out-of-range lookahead reads as '\0', which matches no operator.
        let third = if self.current + 2 < self.size {
            self.chars[self.current + 2]
        } else {
            '\0'
        };

        // Check for three-character operators first
        // -|- (Adjacent - PostgreSQL range adjacency)
        if c == '-' && next == '|' && third == '-' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::Adjacent);
        }

        // ||/ (Cube root - PostgreSQL)
        if c == '|' && next == '|' && third == '/' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DPipeSlash);
        }

        // #>> (JSONB path text extraction - PostgreSQL)
        if c == '#' && next == '>' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DHashArrow);
        }

        // ->> (JSON text extraction - PostgreSQL/MySQL)
        if c == '-' && next == '>' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DArrow);
        }

        // <=> (NULL-safe equality - MySQL)
        if c == '<' && next == '=' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NullsafeEq);
        }

        // <-> (Distance operator - PostgreSQL)
        if c == '<' && next == '-' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::LrArrow);
        }

        // <@ (Contained by - PostgreSQL)
        if c == '<' && next == '@' {
            self.advance();
            self.advance();
            return Some(TokenType::LtAt);
        }

        // @> (Contains - PostgreSQL)
        if c == '@' && next == '>' {
            self.advance();
            self.advance();
            return Some(TokenType::AtGt);
        }

        // ~~~ (Glob - PostgreSQL)
        if c == '~' && next == '~' && third == '~' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::Glob);
        }

        // ~~* (ILike - PostgreSQL)
        if c == '~' && next == '~' && third == '*' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::ILike);
        }

        // !~~* (Not ILike - PostgreSQL)
        // Four-char lookahead is only needed for this one operator family.
        let fourth = if self.current + 3 < self.size {
            self.chars[self.current + 3]
        } else {
            '\0'
        };
        if c == '!' && next == '~' && third == '~' && fourth == '*' {
            self.advance();
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotILike);
        }

        // !~~ (Not Like - PostgreSQL)
        if c == '!' && next == '~' && third == '~' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotLike);
        }

        // !~* (Not Regexp ILike - PostgreSQL)
        if c == '!' && next == '~' && third == '*' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotIRLike);
        }

        // !:> (Not cast / Try cast - SingleStore)
        if c == '!' && next == ':' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NColonGt);
        }

        // ?:: (TRY_CAST shorthand - Databricks)
        if c == '?' && next == ':' && third == ':' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::QDColon);
        }

        // !~ (Not Regexp - PostgreSQL)
        if c == '!' && next == '~' {
            self.advance();
            self.advance();
            return Some(TokenType::NotRLike);
        }

        // ~~ (Like - PostgreSQL)
        if c == '~' && next == '~' {
            self.advance();
            self.advance();
            return Some(TokenType::Like);
        }

        // ~* (Regexp ILike - PostgreSQL)
        if c == '~' && next == '*' {
            self.advance();
            self.advance();
            return Some(TokenType::IRLike);
        }

        // SingleStore three-character JSON path operators (must be checked before :: two-char)
        // ::$ (JSON extract string), ::% (JSON extract double), ::? (JSON match)
        if c == ':' && next == ':' && third == '$' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonDollar);
        }
        if c == ':' && next == ':' && third == '%' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonPercent);
        }
        if c == ':' && next == ':' && third == '?' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonQMark);
        }

        // Two-character operators
        let token_type = match (c, next) {
            ('.', ':') => Some(TokenType::DotColon),
            ('=', '=') => Some(TokenType::Eq), // Hive/Spark == equality operator
            ('<', '=') => Some(TokenType::Lte),
            ('>', '=') => Some(TokenType::Gte),
            ('!', '=') => Some(TokenType::Neq),
            ('<', '>') => Some(TokenType::Neq),
            ('^', '=') => Some(TokenType::Neq),
            ('<', '<') => Some(TokenType::LtLt),
            ('>', '>') => Some(TokenType::GtGt),
            ('|', '|') => Some(TokenType::DPipe),
            ('|', '/') => Some(TokenType::PipeSlash), // Square root - PostgreSQL
            (':', ':') => Some(TokenType::DColon),
            (':', '=') => Some(TokenType::ColonEq), // := (assignment, named args)
            (':', '>') => Some(TokenType::ColonGt), // ::> (TSQL)
            ('-', '>') => Some(TokenType::Arrow),   // JSON object access
            ('=', '>') => Some(TokenType::FArrow),  // Fat arrow (lambda)
            ('&', '&') => Some(TokenType::DAmp),
            ('&', '<') => Some(TokenType::AmpLt), // PostgreSQL range operator
            ('&', '>') => Some(TokenType::AmpGt), // PostgreSQL range operator
            ('@', '@') => Some(TokenType::AtAt),  // Text search match
            ('?', '|') => Some(TokenType::QMarkPipe), // JSONB contains any key
            ('?', '&') => Some(TokenType::QMarkAmp), // JSONB contains all keys
            ('?', '?') => Some(TokenType::DQMark), // Double question mark
            ('#', '>') => Some(TokenType::HashArrow), // JSONB path extraction
            ('#', '-') => Some(TokenType::HashDash), // JSONB delete
            ('^', '@') => Some(TokenType::CaretAt), // PostgreSQL starts-with operator
            ('*', '*') => Some(TokenType::DStar), // Power operator
            ('|', '>') => Some(TokenType::PipeGt), // Pipe-greater (some dialects)
            _ => None,
        };

        // Consume both characters only when a two-char operator matched.
        if token_type.is_some() {
            self.advance();
            self.advance();
        }

        token_type
    }
2173
    /// Scan a single-quoted string literal.
    ///
    /// Handles `''` quote doubling, and — when the dialect lists `\` in
    /// `string_escapes` — C-style backslash escapes (`\n`, `\t`, `\xNN`, …).
    ///
    /// # Errors
    /// Returns a tokenize error if the closing quote is never found.
    fn scan_string(&mut self) -> Result<()> {
        self.advance(); // Opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // Escaped quote
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                // Handle escape sequences
                self.advance(); // Consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
                        'a' => value.push('\x07'), // Alert/bell
                        'b' => value.push('\x08'), // Backspace
                        'f' => value.push('\x0C'), // Form feed
                        'v' => value.push('\x0B'), // Vertical tab
                        'x' => {
                            // Hex escape: \xNN (exactly 2 hex digits)
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    // NOTE: `byte as char` maps 0x80..=0xFF to Unicode
                                    // code points U+0080..U+00FF, not raw bytes.
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                // Not enough hex digits, preserve literally
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            // MySQL: \% in LIKE patterns
                            value.push('%');
                        }
                        '_' => {
                            // MySQL: \_ in LIKE patterns
                            value.push('_');
                        }
                        // For unrecognized escape sequences:
                        // If escape_follow_chars is set, only preserve backslash for chars in that list
                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
                        _ => {
                            // NOTE(review): the comment above says backslash is preserved
                            // for chars in escape_follow_chars, but this arm never checks
                            // membership — it discards the backslash for every
                            // unrecognized escape whenever the list is non-empty.
                            // Confirm intended semantics.
                            if !self.config.escape_follow_chars.is_empty() {
                                // MySQL-style: discard backslash for unrecognized escapes
                                value.push(escaped);
                            } else {
                                // Standard: preserve backslash + char
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // Closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2270
    /// Scan a double-quoted string (for dialects like BigQuery where " is a string delimiter)
    ///
    /// Mirrors `scan_string`: `""` quote doubling, plus C-style backslash
    /// escapes when the dialect lists `\` in `string_escapes`.
    ///
    /// # Errors
    /// Returns a tokenize error if the closing quote is never found.
    fn scan_double_quoted_string(&mut self) -> Result<()> {
        self.advance(); // Opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Escaped quote
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                // Handle escape sequences
                self.advance(); // Consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
                        'a' => value.push('\x07'), // Alert/bell
                        'b' => value.push('\x08'), // Backspace
                        'f' => value.push('\x0C'), // Form feed
                        'v' => value.push('\x0B'), // Vertical tab
                        'x' => {
                            // Hex escape: \xNN (exactly 2 hex digits)
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    // NOTE: `byte as char` maps 0x80..=0xFF to Unicode
                                    // code points U+0080..U+00FF, not raw bytes.
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                // Not enough hex digits, preserve literally
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            // MySQL: \% in LIKE patterns
                            value.push('%');
                        }
                        '_' => {
                            // MySQL: \_ in LIKE patterns
                            value.push('_');
                        }
                        // For unrecognized escape sequences:
                        // If escape_follow_chars is set, only preserve backslash for chars in that list
                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
                        _ => {
                            // NOTE(review): as in scan_string, this arm never checks
                            // membership in escape_follow_chars despite the comment
                            // above — the backslash is discarded for every
                            // unrecognized escape whenever the list is non-empty.
                            // Confirm intended semantics.
                            if !self.config.escape_follow_chars.is_empty() {
                                // MySQL-style: discard backslash for unrecognized escapes
                                value.push(escaped);
                            } else {
                                // Standard: preserve backslash + char
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // Closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2368
2369    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2370        // Advance past the three opening quotes
2371        self.advance();
2372        self.advance();
2373        self.advance();
2374        let mut value = String::new();
2375
2376        while !self.is_at_end() {
2377            // Check for closing triple quote
2378            if self.peek() == quote_char
2379                && self.current + 1 < self.size
2380                && self.chars[self.current + 1] == quote_char
2381                && self.current + 2 < self.size
2382                && self.chars[self.current + 2] == quote_char
2383            {
2384                // Found closing """
2385                break;
2386            }
2387            value.push(self.advance());
2388        }
2389
2390        if self.is_at_end() {
2391            return Err(Error::tokenize(
2392                "Unterminated triple-quoted string",
2393                self.line,
2394                self.column,
2395            ));
2396        }
2397
2398        // Advance past the three closing quotes
2399        self.advance();
2400        self.advance();
2401        self.advance();
2402        let token_type = if quote_char == '"' {
2403            TokenType::TripleDoubleQuotedString
2404        } else {
2405            TokenType::TripleSingleQuotedString
2406        };
2407        self.add_token_with_text(token_type, value);
2408        Ok(())
2409    }
2410
2411    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2412        self.advance(); // Opening quote
2413        let mut value = String::new();
2414
2415        loop {
2416            if self.is_at_end() {
2417                return Err(Error::tokenize(
2418                    "Unterminated identifier",
2419                    self.line,
2420                    self.column,
2421                ));
2422            }
2423            if self.peek() == end_quote {
2424                if self.peek_next() == end_quote {
2425                    // Escaped quote (e.g., "" inside "x""y") -> store single quote
2426                    value.push(end_quote);
2427                    self.advance(); // skip first quote
2428                    self.advance(); // skip second quote
2429                } else {
2430                    // End of identifier
2431                    break;
2432                }
2433            } else {
2434                value.push(self.peek());
2435                self.advance();
2436            }
2437        }
2438
2439        self.advance(); // Closing quote
2440        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2441        Ok(())
2442    }
2443
2444    /// Scan a string delimited by Unicode curly single quotes (U+2018/U+2019).
2445    /// Content between curly quotes is literal (no escape processing).
2446    /// When opened with \u{2018} (left), close with \u{2019} (right) only.
2447    /// When opened with \u{2019} (right), close with \u{2019} (right) — self-closing.
2448    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2449        self.advance(); // Opening curly quote
2450        let start = self.current;
2451        // Determine closing quote: left opens -> right closes; right opens -> right closes
2452        let close_quote = if open_quote == '\u{2018}' {
2453            '\u{2019}' // left opens, right closes
2454        } else {
2455            '\u{2019}' // right quote also closes with right quote
2456        };
2457        while !self.is_at_end() && self.peek() != close_quote {
2458            self.advance();
2459        }
2460        let value: String = self.chars[start..self.current].iter().collect();
2461        if !self.is_at_end() {
2462            self.advance(); // Closing quote
2463        }
2464        self.add_token_with_text(TokenType::String, value);
2465        Ok(())
2466    }
2467
2468    /// Scan an identifier delimited by Unicode curly double quotes (U+201C/U+201D).
2469    /// When opened with \u{201C} (left), close with \u{201D} (right) only.
2470    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2471        self.advance(); // Opening curly quote
2472        let start = self.current;
2473        let close_quote = if open_quote == '\u{201C}' {
2474            '\u{201D}' // left opens, right closes
2475        } else {
2476            '\u{201D}' // right also closes with right
2477        };
2478        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2479            self.advance();
2480        }
2481        let value: String = self.chars[start..self.current].iter().collect();
2482        if !self.is_at_end() {
2483            self.advance(); // Closing quote
2484        }
2485        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2486        Ok(())
2487    }
2488
    /// Scan a numeric literal starting at `self.start`.
    ///
    /// Handles, in order:
    /// - `0x`/`0X` hex literals when `config.hex_number_strings` is set,
    ///   including hex floats (`0xA.Bp3`) and `_` digit separators; emitted as
    ///   `HexNumber` or `HexString` depending on `config.hex_string_is_integer_type`;
    /// - decimal digits with `_` separators, an optional fraction (including a
    ///   trailing dot, `1.`), and an optional `e`/`E` exponent;
    /// - dialect numeric type suffixes from `config.numeric_literals`
    ///   (e.g. `1L`), emitted as a `Number` token with text `"<number>::<TYPE>"`;
    /// - identifiers that begin with a digit (e.g. `1a`) when
    ///   `config.identifiers_can_start_with_digit` is set.
    fn scan_number(&mut self) -> Result<()> {
        // Check for 0x/0X hex number prefix (SQLite-style)
        // NOTE(review): peek() is evaluated before the is_at_end() guard here —
        // assumes peek() is safe at EOF (returns a sentinel); confirm.
        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
            let next = if self.current + 1 < self.size {
                self.chars[self.current + 1]
            } else {
                '\0'
            };
            if next == 'x' || next == 'X' {
                // Advance past '0' and 'x'/'X'
                self.advance();
                self.advance();
                // Collect hex digits (allow underscores as separators, e.g., 0xbad_cafe)
                let hex_start = self.current;
                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
                        break;
                    }
                    self.advance();
                }
                if self.current > hex_start {
                    // Check for hex float: 0xABC.DEFpEXP or 0xABCpEXP
                    let mut is_hex_float = false;
                    // Optional fractional part: .hexdigits
                    if !self.is_at_end() && self.peek() == '.' {
                        let after_dot = if self.current + 1 < self.size {
                            self.chars[self.current + 1]
                        } else {
                            '\0'
                        };
                        if after_dot.is_ascii_hexdigit() {
                            is_hex_float = true;
                            self.advance(); // consume '.'
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                self.advance();
                            }
                        }
                    }
                    // Optional binary exponent: p/P [+/-] digits
                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
                        is_hex_float = true;
                        self.advance(); // consume p/P
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
                            self.advance();
                        }
                        while !self.is_at_end() && self.peek().is_ascii_digit() {
                            self.advance();
                        }
                    }
                    if is_hex_float {
                        // Hex float literal — emit as regular Number token with full text
                        let full_text: String =
                            self.chars[self.start..self.current].iter().collect();
                        self.add_token_with_text(TokenType::Number, full_text);
                    } else if self.config.hex_string_is_integer_type {
                        // BigQuery/ClickHouse: 0xA represents an integer in hex notation
                        // (token text excludes the "0x" prefix)
                        let hex_value: String =
                            self.chars[hex_start..self.current].iter().collect();
                        self.add_token_with_text(TokenType::HexNumber, hex_value);
                    } else {
                        // SQLite/Teradata: 0xCC represents a binary/blob hex string
                        let hex_value: String =
                            self.chars[hex_start..self.current].iter().collect();
                        self.add_token_with_text(TokenType::HexString, hex_value);
                    }
                    return Ok(());
                }
                // No hex digits after 0x - fall through to normal number parsing
                // (reset current back to after '0')
                self.current = self.start + 1;
            }
        }

        // Allow underscores as digit separators (e.g., 20_000, 1_000_000)
        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            // Don't allow underscore at the end (must be followed by digit)
            if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
                break;
            }
            self.advance();
        }

        // Look for decimal part - allow trailing dot (e.g., "1.")
        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
        // So we always consume the dot as part of the number, even if followed by an identifier
        if self.peek() == '.' {
            let next = self.peek_next();
            // Only consume the dot if:
            // 1. Followed by a digit (normal decimal like 1.5)
            // 2. Followed by an identifier start (like 1.x -> becomes 1. with alias x)
            // 3. End of input or other non-dot character (trailing decimal like "1.")
            // Do NOT consume if it's a double dot (..) which is a range operator
            if next != '.' {
                self.advance(); // consume the .
                                // Only consume digits after the decimal point (not identifiers)
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                        break;
                    }
                    self.advance();
                }
            }
        }

        // Look for exponent
        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();

        // Check for numeric literal suffixes (e.g., 1L -> BIGINT, 1s -> SMALLINT in Hive/Spark)
        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
            let next_char = self.peek().to_uppercase().to_string();
            // Try 2-char suffix first (e.g., "BD"), then 1-char
            let suffix_match = if self.current + 1 < self.size {
                let two_char: String = vec![self.chars[self.current], self.chars[self.current + 1]]
                    .iter()
                    .collect::<String>()
                    .to_uppercase();
                if self.config.numeric_literals.contains_key(&two_char) {
                    // Make sure the 2-char suffix is not followed by more identifier chars
                    let after_suffix = if self.current + 2 < self.size {
                        self.chars[self.current + 2]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((two_char, 2))
                    } else {
                        None
                    }
                } else if self.config.numeric_literals.contains_key(&next_char) {
                    // 1-char suffix - make sure not followed by more identifier chars
                    let after_suffix = if self.current + 1 < self.size {
                        self.chars[self.current + 1]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((next_char, 1))
                    } else {
                        None
                    }
                } else {
                    None
                }
            } else if self.config.numeric_literals.contains_key(&next_char) {
                // At end of input, 1-char suffix
                Some((next_char, 1))
            } else {
                None
            };

            if let Some((suffix, len)) = suffix_match {
                // Consume the suffix characters
                for _ in 0..len {
                    self.advance();
                }
                // Emit as a special number-with-suffix token
                // We'll encode as "number::TYPE" so the parser can split it
                let type_name = self
                    .config
                    .numeric_literals
                    .get(&suffix)
                    .expect("suffix verified by contains_key above")
                    .clone();
                let combined = format!("{}::{}", text, type_name);
                self.add_token_with_text(TokenType::Number, combined);
                return Ok(());
            }
        }

        // Check for identifiers that start with a digit (e.g., 1a, 1_a, 1a_1a)
        // In Hive/Spark/MySQL/ClickHouse, these are valid unquoted identifiers
        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
            let next = self.peek();
            if next.is_alphabetic() || next == '_' {
                // Continue scanning as an identifier
                while !self.is_at_end() {
                    let ch = self.peek();
                    if ch.is_alphanumeric() || ch == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                let ident_text: String = self.chars[self.start..self.current].iter().collect();
                self.add_token_with_text(TokenType::Identifier, ident_text);
                return Ok(());
            }
        }

        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }
2694
2695    /// Scan a number that starts with a dot (e.g., .25, .5, .123e10)
2696    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2697        // Consume the leading dot
2698        self.advance();
2699
2700        // Consume the fractional digits
2701        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2702            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2703                break;
2704            }
2705            self.advance();
2706        }
2707
2708        // Look for exponent
2709        if self.peek() == 'e' || self.peek() == 'E' {
2710            self.advance();
2711            if self.peek() == '+' || self.peek() == '-' {
2712                self.advance();
2713            }
2714            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2715                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2716                    break;
2717                }
2718                self.advance();
2719            }
2720        }
2721
2722        let text: String = self.chars[self.start..self.current].iter().collect();
2723        self.add_token_with_text(TokenType::Number, text);
2724        Ok(())
2725    }
2726
    /// Scan an unquoted identifier or keyword starting at `self.start`.
    ///
    /// Beyond plain identifiers/keywords, this dispatches literals whose prefix
    /// looks like an identifier: `N'...'`, `E'...'`, `X'...'`, `B'...'`,
    /// `U&'...'`, and raw strings `r'...'` / `r"..."` (including triple-quoted
    /// variants), plus the Teradata-style `NOT=` operator. Anything else is
    /// looked up in `config.keywords`, falling back to a `Var` token.
    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
        // Guard against unrecognized characters that could cause infinite loops
        let first_char = self.peek();
        if !first_char.is_alphanumeric() && first_char != '_' {
            // Unknown character - skip it and return an error
            let c = self.advance();
            return Err(Error::tokenize(
                format!("Unexpected character: '{}'", c),
                self.line,
                self.column,
            ));
        }

        while !self.is_at_end() {
            let c = self.peek();
            // Allow alphanumeric, underscore, $, # and @ in identifiers
            // PostgreSQL allows $, TSQL allows # and @
            // But stop consuming # if followed by > or >> (PostgreSQL #> and #>> operators)
            if c == '#' {
                let next_c = if self.current + 1 < self.size {
                    self.chars[self.current + 1]
                } else {
                    '\0'
                };
                if next_c == '>' || next_c == '-' {
                    break; // Don't consume # — it's part of #>, #>>, or #- operator
                }
                self.advance();
            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        let upper = text.to_uppercase();

        // Special-case NOT= (Teradata and other dialects)
        if upper == "NOT" && self.peek() == '=' {
            self.advance(); // consume '='
            self.add_token(TokenType::Neq);
            return Ok(());
        }

        // Check for special string prefixes like N'...', X'...', B'...', U&'...', r'...', b'...'
        // Also handle double-quoted variants for dialects that support them (e.g., BigQuery)
        let next_char = self.peek();
        let is_single_quote = next_char == '\'';
        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
        // For raw strings (r"..." or r'...'), we allow double quotes even if " is not in quotes config
        // because raw strings are a special case used in Spark/Databricks where " is for identifiers
        let is_double_quote_for_raw = next_char == '"';

        // Handle raw strings first - they're special because they work with both ' and "
        // even in dialects where " is normally an identifier delimiter (like Databricks)
        if upper == "R" && (is_single_quote || is_double_quote_for_raw) {
            // Raw string r'...' or r"..." or r'''...''' or r"""...""" (BigQuery style)
            // In raw strings, backslashes are treated literally (no escape processing)
            let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the first opening quote

            // Check for triple-quoted raw string (r"""...""" or r'''...''')
            if self.peek() == quote_char && self.peek_next() == quote_char {
                // Triple-quoted raw string
                self.advance(); // consume second quote
                self.advance(); // consume third quote
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            } else {
                let string_value = self.scan_raw_string_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            }
            return Ok(());
        }

        if is_single_quote || is_double_quote {
            match upper.as_str() {
                "N" => {
                    // National string N'...'
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::NationalString, string_value);
                    return Ok(());
                }
                "E" => {
                    // PostgreSQL escape string E'...' or e'...'
                    // Preserve the case by prefixing with "e:" or "E:"
                    // Always use backslash escapes for escape strings (e.g., \' is an escaped quote)
                    let lowercase = text == "e";
                    let prefix = if lowercase { "e:" } else { "E:" };
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content_with_escapes(true)?;
                    self.add_token_with_text(
                        TokenType::EscapeString,
                        format!("{}{}", prefix, string_value),
                    );
                    return Ok(());
                }
                "X" => {
                    // Hex string X'...'
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::HexString, string_value);
                    return Ok(());
                }
                "B" if is_double_quote => {
                    // Byte string b"..." (BigQuery style) - MUST check before single quote B'...'
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_double_quoted_string_content()?;
                    self.add_token_with_text(TokenType::ByteString, string_value);
                    return Ok(());
                }
                "B" if is_single_quote => {
                    // For BigQuery: b'...' is a byte string (bytes data)
                    // For standard SQL: B'...' is a bit string (binary digits)
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content()?;
                    if self.config.b_prefix_is_byte_string {
                        self.add_token_with_text(TokenType::ByteString, string_value);
                    } else {
                        self.add_token_with_text(TokenType::BitString, string_value);
                    }
                    return Ok(());
                }
                _ => {}
            }
        }

        // Check for U&'...' Unicode string syntax (SQL standard)
        if upper == "U"
            && self.peek() == '&'
            && self.current + 1 < self.size
            && self.chars[self.current + 1] == '\''
        {
            self.advance(); // consume '&'
            self.advance(); // consume opening quote
            let string_value = self.scan_string_content()?;
            self.add_token_with_text(TokenType::UnicodeString, string_value);
            return Ok(());
        }

        // Not a prefixed literal: keyword lookup (case-insensitive), else Var.
        let token_type = self
            .config
            .keywords
            .get(&upper)
            .copied()
            .unwrap_or(TokenType::Var);

        self.add_token_with_text(token_type, text);
        Ok(())
    }
2887
2888    /// Scan string content (everything between quotes)
2889    /// If `force_backslash_escapes` is true, backslash is always treated as an escape character
2890    /// (used for PostgreSQL E'...' escape strings)
2891    fn scan_string_content_with_escapes(
2892        &mut self,
2893        force_backslash_escapes: bool,
2894    ) -> Result<String> {
2895        let mut value = String::new();
2896        let use_backslash_escapes =
2897            force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2898
2899        while !self.is_at_end() {
2900            let c = self.peek();
2901            if c == '\'' {
2902                if self.peek_next() == '\'' {
2903                    // Escaped quote ''
2904                    value.push('\'');
2905                    self.advance();
2906                    self.advance();
2907                } else {
2908                    break;
2909                }
2910            } else if c == '\\' && use_backslash_escapes {
2911                // Preserve escape sequences literally (including \' for escape strings)
2912                value.push(self.advance());
2913                if !self.is_at_end() {
2914                    value.push(self.advance());
2915                }
2916            } else {
2917                value.push(self.advance());
2918            }
2919        }
2920
2921        if self.is_at_end() {
2922            return Err(Error::tokenize(
2923                "Unterminated string",
2924                self.line,
2925                self.column,
2926            ));
2927        }
2928
2929        self.advance(); // Closing quote
2930        Ok(value)
2931    }
2932
    /// Scan string content (everything between quotes)
    ///
    /// Convenience wrapper: backslash escapes are honored only when the
    /// dialect's `string_escapes` config includes `\` (see
    /// `scan_string_content_with_escapes`).
    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }
2937
    /// Scan double-quoted string content (for dialects like BigQuery where " is a string delimiter)
    /// This is used for prefixed strings like b"..." or N"..."
    ///
    /// Handles `""` as an escaped quote. When `\` is in `config.string_escapes`,
    /// decodes \n, \r, \t, \0, \\, \", \', and \xNN; unrecognized escapes are
    /// kept literally (backslash plus the following character).
    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Escaped quote ""
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                // Handle escape sequences
                self.advance(); // Consume backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        '\\' => value.push('\\'),
                        '"' => value.push('"'),
                        '\'' => value.push('\''),
                        'x' => {
                            // Hex escape \xNN - collect hex digits
                            let mut hex = String::new();
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                // NOTE(review): pushes the byte as a char, so
                                // \x80..\xFF decode to codepoints U+0080..U+00FF
                                // rather than raw bytes — confirm this is intended.
                                value.push(byte as char);
                            } else {
                                // Invalid hex escape, keep it literal
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        _ => {
                            // For unrecognized escapes, preserve backslash + char
                            value.push('\\');
                            value.push(escaped);
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // Closing quote
        Ok(value)
    }
3008
3009    /// Scan raw string content (limited escape processing for quotes)
3010    /// Used for BigQuery r'...' and r"..." strings
3011    /// In raw strings, backslashes are literal EXCEPT that escape sequences for the
3012    /// quote character still work (e.g., \' in r'...' escapes the quote, '' also works)
3013    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3014        let mut value = String::new();
3015
3016        while !self.is_at_end() {
3017            let c = self.peek();
3018            if c == quote_char {
3019                if self.peek_next() == quote_char {
3020                    // Escaped quote (doubled) - e.g., '' inside r'...'
3021                    value.push(quote_char);
3022                    self.advance();
3023                    self.advance();
3024                } else {
3025                    break;
3026                }
3027            } else if c == '\\'
3028                && self.peek_next() == quote_char
3029                && self.config.string_escapes_allowed_in_raw_strings
3030            {
3031                // Backslash-escaped quote - works in raw strings when string_escapes_allowed_in_raw_strings is true
3032                // e.g., \' inside r'...' becomes literal ' (BigQuery behavior)
3033                // Spark/Databricks has this set to false, so backslash is always literal there
3034                value.push(quote_char);
3035                self.advance(); // consume backslash
3036                self.advance(); // consume quote
3037            } else {
3038                // In raw strings, everything including backslashes is literal
3039                value.push(self.advance());
3040            }
3041        }
3042
3043        if self.is_at_end() {
3044            return Err(Error::tokenize(
3045                "Unterminated raw string",
3046                self.line,
3047                self.column,
3048            ));
3049        }
3050
3051        self.advance(); // Closing quote
3052        Ok(value)
3053    }
3054
3055    /// Scan raw triple-quoted string content (r"""...""" or r'''...''')
3056    /// Terminates when three consecutive quote_chars are found
3057    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3058        let mut value = String::new();
3059
3060        while !self.is_at_end() {
3061            let c = self.peek();
3062            if c == quote_char && self.peek_next() == quote_char {
3063                // Check for third quote
3064                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3065                    // Found three consecutive quotes - end of string
3066                    self.advance(); // first closing quote
3067                    self.advance(); // second closing quote
3068                    self.advance(); // third closing quote
3069                    return Ok(value);
3070                }
3071            }
3072            // In raw strings, everything including backslashes is literal
3073            let ch = self.advance();
3074            value.push(ch);
3075        }
3076
3077        Err(Error::tokenize(
3078            "Unterminated raw triple-quoted string",
3079            self.line,
3080            self.column,
3081        ))
3082    }
3083
3084    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables)
3085    /// Examples: #temp, ##global_temp, @variable
3086    /// Scan an identifier that starts with `$` (ClickHouse).
3087    /// Examples: `$alias$name$`, `$x`
3088    fn scan_dollar_identifier(&mut self) -> Result<()> {
3089        // Consume the leading $
3090        self.advance();
3091
3092        // Consume alphanumeric, _, and $ continuation chars
3093        while !self.is_at_end() {
3094            let c = self.peek();
3095            if c.is_alphanumeric() || c == '_' || c == '$' {
3096                self.advance();
3097            } else {
3098                break;
3099            }
3100        }
3101
3102        let text: String = self.chars[self.start..self.current].iter().collect();
3103        self.add_token_with_text(TokenType::Var, text);
3104        Ok(())
3105    }
3106
    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables).
    /// Examples: #temp, ##global_temp, @variable
    ///
    /// The result is always emitted as a `Var` token — these names are never
    /// treated as keywords.
    fn scan_tsql_identifier(&mut self) -> Result<()> {
        // Consume the leading # or @ (or ##)
        let first = self.advance();

        // For ##, consume the second #
        if first == '#' && self.peek() == '#' {
            self.advance();
        }

        // Now scan the rest of the identifier
        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        // These are always identifiers (variables or temp table names), never keywords
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }
3131
    /// Check if the last tokens match INSERT ... FORMAT <name> (not VALUES).
    /// If so, consume everything until the next blank line (two consecutive newlines)
    /// or end of input as raw data.
    ///
    /// Returns `Some(trimmed_raw_data)` when the INSERT/FORMAT context is
    /// detected; the early guard checks return `None` without consuming input.
    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
        let len = self.tokens.len();
        // Need at least INSERT + FORMAT + <name> for the pattern to be possible.
        if len < 3 {
            return None;
        }

        // Last token should be the format name (Identifier or Var, not VALUES)
        let last = &self.tokens[len - 1];
        if last.text.eq_ignore_ascii_case("VALUES") {
            return None;
        }
        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
            return None;
        }

        // Second-to-last should be FORMAT
        let format_tok = &self.tokens[len - 2];
        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
            return None;
        }

        // Check that there's an INSERT somewhere earlier in the tokens
        // (bounded lookback: only the 20 most recent tokens are inspected)
        let has_insert = self.tokens[..len - 2]
            .iter()
            .rev()
            .take(20)
            .any(|t| t.token_type == TokenType::Insert);
        if !has_insert {
            return None;
        }

        // We're in INSERT ... FORMAT <name> context. Consume everything until:
        // - A blank line (two consecutive newlines, possibly with whitespace between)
        // - End of input
        let raw_start = self.current;
        while !self.is_at_end() {
            let c = self.peek();
            if c == '\n' {
                // Check for blank line: \n followed by optional \r and \n
                let saved = self.current;
                self.advance(); // consume first \n
                                // Skip \r if present
                while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                if self.is_at_end() || self.peek() == '\n' {
                    // Found blank line or end of input - stop here
                    // Don't consume the second \n so subsequent SQL can be tokenized
                    // NOTE(review): this branch returns Some even when the trimmed
                    // text is empty, while the end-of-input branch below maps empty
                    // to None — confirm the asymmetry is intentional.
                    let raw: String = self.chars[raw_start..saved].iter().collect();
                    return Some(raw.trim().to_string());
                }
                // Not a blank line, continue scanning
            } else {
                self.advance();
            }
        }

        // Reached end of input
        let raw: String = self.chars[raw_start..self.current].iter().collect();
        let trimmed = raw.trim().to_string();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed)
        }
    }
3201
3202    fn add_token(&mut self, token_type: TokenType) {
3203        let text: String = self.chars[self.start..self.current].iter().collect();
3204        self.add_token_with_text(token_type, text);
3205    }
3206
3207    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3208        let span = Span::new(self.start, self.current, self.line, self.column);
3209        let mut token = Token::new(token_type, text, span);
3210        token.comments.append(&mut self.comments);
3211        self.tokens.push(token);
3212    }
3213}
3214
#[cfg(test)]
mod tests {
    use super::*;

    // These tests cover three layers:
    //   * raw tokenization (keywords, literals, operators, comment capture),
    //   * parser/generator round-trips (comment fidelity, `:=` handling),
    //   * dialect-specific round-trips (MySQL, DuckDB, Databricks).

    // Minimal statement: keyword followed by a numeric literal.
    #[test]
    fn test_simple_select() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
    }

    // Bare identifiers tokenize as Var (not Identifier, which is reserved
    // for quoted identifiers elsewhere in this tokenizer).
    #[test]
    fn test_select_with_identifier() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();

        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Var);
        assert_eq!(tokens[1].text, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Var);
        assert_eq!(tokens[3].text, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Var);
        assert_eq!(tokens[5].text, "t");
    }

    // String token text is the unquoted content, not the raw lexeme.
    #[test]
    fn test_string_literal() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "hello");
    }

    // A doubled quote inside a string collapses to a single quote.
    #[test]
    fn test_escaped_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "it's");
    }

    #[test]
    fn test_comments() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();

        assert_eq!(tokens.len(), 2);
        // Comments are attached to the PREVIOUS token as trailing_comments
        // This is better for round-trip fidelity (e.g., SELECT c /* comment */ FROM)
        assert_eq!(tokens[0].trailing_comments.len(), 1);
        assert_eq!(tokens[0].trailing_comments[0], " comment");
    }

    #[test]
    fn test_comment_in_and_chain() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // Line comments between AND clauses should appear after the AND operator
        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
        let ast = Parser::parse_sql(sql).unwrap();
        let mut gen = Generator::default();
        let output = gen.generate(&ast[0]).unwrap();
        assert_eq!(
            output,
            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
        );
    }

    // Arithmetic operators each produce their own token.
    #[test]
    fn test_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();

        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[1].token_type, TokenType::Plus);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[3].token_type, TokenType::Star);
        assert_eq!(tokens[4].token_type, TokenType::Number);
    }

    // Two-character comparison operators are consumed as single tokens.
    #[test]
    fn test_comparison_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();

        assert_eq!(tokens[1].token_type, TokenType::Lte);
        assert_eq!(tokens[3].token_type, TokenType::Gte);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    // N'...' is one NationalString token, with the prefix and quotes stripped.
    #[test]
    fn test_national_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("N'abc'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for N'abc', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].text, "abc");
    }

    // X'...' is one HexString token, with the prefix and quotes stripped.
    #[test]
    fn test_hex_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for X'ABCD', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::HexString);
        assert_eq!(tokens[0].text, "ABCD");
    }

    // B'...' is one BitString token, with the prefix and quotes stripped.
    #[test]
    fn test_bit_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("B'01010'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for B'01010', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::BitString);
        assert_eq!(tokens[0].text, "01010");
    }

    #[test]
    fn test_trailing_dot_number() {
        let tokenizer = Tokenizer::default();

        // Test trailing dot
        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
        assert_eq!(
            tokens.len(),
            2,
            "Expected 2 tokens for 'SELECT 1.', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");

        // Test normal decimal
        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
        assert_eq!(tokens[1].text, "1.5");

        // Test number followed by dot and identifier
        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");
        assert_eq!(tokens[2].token_type, TokenType::Var);

        // Test two dots (range operator) - dot is NOT consumed when followed by another dot
        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
        assert_eq!(tokens[2].token_type, TokenType::Dot);
        assert_eq!(tokens[3].token_type, TokenType::Dot);
        assert_eq!(tokens[4].token_type, TokenType::Number);
        assert_eq!(tokens[4].text, "2");
    }

    #[test]
    fn test_leading_dot_number() {
        let tokenizer = Tokenizer::default();

        // Test leading dot number (e.g., .25 for 0.25)
        let tokens = tokenizer.tokenize(".25").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.25', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".25");

        // Test leading dot in context (Oracle SAMPLE clause)
        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
        assert_eq!(
            tokens.len(),
            4,
            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Sample);
        assert_eq!(tokens[1].token_type, TokenType::LParen);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[2].text, ".25");
        assert_eq!(tokens[3].token_type, TokenType::RParen);

        // Test leading dot with exponent
        let tokens = tokenizer.tokenize(".5e10").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.5e10', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".5e10");

        // Test that plain dot is still a Dot token
        let tokens = tokenizer.tokenize("a.b").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'a.b', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Dot);
    }

    #[test]
    fn test_unrecognized_character() {
        let tokenizer = Tokenizer::default();

        // Unicode curly quotes are now handled as string delimiters
        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
        assert!(
            result.is_ok(),
            "Curly quotes should be tokenized as strings"
        );

        // Unicode bullet character should still error
        let result = tokenizer.tokenize("SELECT • FROM t");
        assert!(result.is_err());
    }

    #[test]
    fn test_colon_eq_tokenization() {
        let tokenizer = Tokenizer::default();

        // := should be a single ColonEq token
        let tokens = tokenizer.tokenize("a := 1").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].token_type, TokenType::Var);
        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
        assert_eq!(tokens[2].token_type, TokenType::Number);

        // : followed by non-= should still be Colon
        let tokens = tokenizer.tokenize("a:b").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));

        // :: should still be DColon
        let tokens = tokenizer.tokenize("a::INT").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
    }

    // End-to-end parse + generate checks for the various `:=` contexts
    // (MySQL session variables, named function args, DuckDB prefix aliases).
    #[test]
    fn test_colon_eq_parsing() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // MySQL @var := value in SELECT
        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
            .expect("Failed to parse MySQL @var := expr");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := 1, @var2");

        // MySQL @var := @var in SELECT
        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
            .expect("Failed to parse MySQL @var2 := @var1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1, @var2 := @var1");

        // MySQL @var := COUNT(*)
        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
            .expect("Failed to parse MySQL @var := COUNT(*)");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");

        // MySQL SET @var := 1 (should normalize to = in output)
        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SET @var1 = 1");

        // Function named args with :=
        let ast =
            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "UNION_VALUE(k1 := 1)");

        // UNNEST with recursive := TRUE
        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
            .expect("Failed to parse UNNEST with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");

        // DuckDB prefix alias: foo: 1 means 1 AS foo
        let ast =
            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo");

        // DuckDB prefix alias with multiple columns
        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
            .expect("Failed to parse DuckDB multiple prefix aliases");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
    }

    #[test]
    fn test_colon_eq_dialect_roundtrip() {
        use crate::dialects::{Dialect, DialectType};

        // Parse -> transform -> generate under the given dialect.
        // `expected = None` means the SQL must round-trip unchanged.
        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
            let d = Dialect::get(dialect);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // MySQL := tests
        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

        // DuckDB := tests
        check(
            DialectType::DuckDB,
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
            None,
        );
        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

        // STRUCT_PACK(a := 'b')::json should at least parse without error
        // (The STRUCT_PACK -> Struct transformation is a separate feature)
        {
            let d = Dialect::get(DialectType::DuckDB);
            let ast = d
                .parse("STRUCT_PACK(a := 'b')::json")
                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
        }

        // DuckDB prefix alias tests
        check(
            DialectType::DuckDB,
            "SELECT foo: 1",
            Some("SELECT 1 AS foo"),
        );
        check(
            DialectType::DuckDB,
            "SELECT foo: 1, bar: 2, baz: 3",
            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
        );
    }

    #[test]
    fn test_comment_roundtrip() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // Returns None if `sql` round-trips exactly through parse + generate,
        // otherwise a human-readable description of the failure.
        fn check_roundtrip(sql: &str) -> Option<String> {
            let ast = match Parser::parse_sql(sql) {
                Ok(a) => a,
                Err(e) => return Some(format!("Parse error: {:?}", e)),
            };
            if ast.is_empty() {
                return Some("Empty AST".to_string());
            }
            let mut generator = Generator::default();
            let output = match generator.generate(&ast[0]) {
                Ok(o) => o,
                Err(e) => return Some(format!("Gen error: {:?}", e)),
            };
            if output == sql {
                None
            } else {
                Some(format!(
                    "Mismatch:\n  input:  {}\n  output: {}",
                    sql, output
                ))
            }
        }

        let tests = vec![
            // Nested comments
            "SELECT c /* c1 /* c2 */ c3 */",
            "SELECT c /* c1 /* c2 /* c3 */ */ */",
            // Simple alias with comments
            "SELECT c /* c1 */ AS alias /* c2 */",
            // Multiple columns with comments
            "SELECT a /* x */, b /* x */",
            // Multiple comments after column
            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
            // FROM tables with comments
            "SELECT * FROM foo /* x */, bla /* x */",
            // Arithmetic with comments
            "SELECT 1 /* comment */ + 1",
            "SELECT 1 /* c1 */ + 2 /* c2 */",
            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
            // CAST with comments
            "SELECT CAST(x AS INT) /* comment */ FROM foo",
            // Function arguments with comments
            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
            // Multi-part table names with comments
            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
            // INSERT with comments
            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
            // Leading comments on statements
            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
            "/* comment */ CREATE TABLE foo AS SELECT 1",
            // Trailing comments on statements
            "INSERT INTO foo SELECT * FROM bar /* comment */",
            // Complex nested expressions with comments
            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
        ];

        // Collect all failures so one report shows every broken case at once.
        let mut failures = Vec::new();
        for sql in tests {
            if let Some(e) = check_roundtrip(sql) {
                failures.push(e);
            }
        }

        if !failures.is_empty() {
            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
        }
    }

    #[test]
    fn test_dollar_quoted_string_parsing() {
        use crate::dialects::{Dialect, DialectType};

        // Test dollar string token parsing utility function
        // ('\x00' separates the optional tag from the content).
        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
        assert_eq!(tag, Some("FOO".to_string()));
        assert_eq!(content, "content here");

        let (tag, content) = super::parse_dollar_string_token("just content");
        assert_eq!(tag, None);
        assert_eq!(content, "just content");

        // Test roundtrip for Databricks dialect with dollar-quoted function body
        // (`expected = None` means the SQL must round-trip unchanged).
        fn check_databricks(sql: &str, expected: Option<&str>) {
            let d = Dialect::get(DialectType::Databricks);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // Test [42]: $$...$$ heredoc
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n  return x+1$$",
            None
        );

        // Test [43]: $FOO$...$FOO$ tagged heredoc
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n  return x+1$FOO$",
            None
        );
    }
}