// polyglot_sql/tokens.rs
//! Token types and tokenization for SQL parsing
//!
//! This module defines all SQL token types and the tokenizer that converts
//! SQL strings into token streams.

use crate::error::{Error, Result};
use serde::{Deserialize, Serialize};
use std::fmt;

/// Parse a DollarString token text into (tag, content).
///
/// The tokenizer stores dollar-quoted strings as `"<tag>\x00<content>"`.
/// If the text contains a `'\x00'` separator, the part before it is the tag
/// and the part after is the content. Otherwise, the whole text is the
/// content with no tag.
///
/// Only the first `'\x00'` is treated as the separator; any later NUL bytes
/// remain part of the content.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    // `split_once` splits at the first occurrence, matching the original
    // `find` + manual slicing without index arithmetic.
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_string()), content.to_string()),
        None => (None, text.to_string()),
    }
}
22
/// Represents a position in the source SQL
///
/// `Default` yields an all-zero span; since `line`/`column` are 1-based,
/// a zeroed span effectively means "no real source position" (the
/// `Token::number`/`string`/... convenience constructors use `Span::default()`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct Span {
    /// Starting byte offset
    pub start: usize,
    /// Ending byte offset (exclusive)
    pub end: usize,
    /// Line number (1-based)
    pub line: usize,
    /// Column number (1-based)
    pub column: usize,
}
35
36impl Span {
37    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
38        Self {
39            start,
40            end,
41            line,
42            column,
43        }
44    }
45}
46
/// A token in the SQL token stream
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The type of token
    pub token_type: TokenType,
    /// The raw text of the token
    pub text: String,
    /// Position information
    pub span: Span,
    /// Leading comments (comments that appeared before this token).
    /// `#[serde(default)]` lets serialized tokens lacking this field
    /// deserialize with an empty list.
    #[serde(default)]
    pub comments: Vec<String>,
    /// Trailing comments (comments that appeared after this token, before the next one)
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
63
64impl Token {
65    /// Create a new token
66    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
67        Self {
68            token_type,
69            text: text.into(),
70            span,
71            comments: Vec::new(),
72            trailing_comments: Vec::new(),
73        }
74    }
75
76    /// Create a NUMBER token
77    pub fn number(n: i64) -> Self {
78        Self::new(TokenType::Number, n.to_string(), Span::default())
79    }
80
81    /// Create a STRING token
82    pub fn string(s: impl Into<String>) -> Self {
83        Self::new(TokenType::String, s, Span::default())
84    }
85
86    /// Create an IDENTIFIER token
87    pub fn identifier(s: impl Into<String>) -> Self {
88        Self::new(TokenType::Identifier, s, Span::default())
89    }
90
91    /// Create a VAR token
92    pub fn var(s: impl Into<String>) -> Self {
93        Self::new(TokenType::Var, s, Span::default())
94    }
95
96    /// Add a comment to this token
97    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
98        self.comments.push(comment.into());
99        self
100    }
101}
102
103impl fmt::Display for Token {
104    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
105        write!(f, "{:?}({})", self.token_type, self.text)
106    }
107}
108
/// All possible token types in SQL
///
/// Variants are serialized (via serde) as SCREAMING_SNAKE_CASE names,
/// e.g. `Select` <-> `"SELECT"`, `GroupBy` <-> `"GROUP_BY"`.
// NOTE(review): `#[repr(u16)]` ties each discriminant to declaration order, so
// inserting a variant mid-list shifts every later discriminant. Serde uses the
// variant *names*, not the numbers — but if any code relies on the numeric
// values, append new variants at the end (TODO confirm whether discriminants
// are persisted anywhere).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt, // <<
    GtGt, // >>
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments (emitted as tokens for round-trip fidelity)
    BlockComment, // /* ... */
    LineComment,  // -- ...

    // Literals
    String,
    DollarString,             // $$...$$
    TripleDoubleQuotedString, // """..."""
    TripleSingleQuotedString, // '''...'''
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    /// Hex number: 0xA, 0xFF (BigQuery, SQLite style) - represents an integer in hex notation
    HexNumber,
    ByteString,
    NationalString,
    EscapeString, // PostgreSQL E'...' escape string
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data Types
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,   // !~~ operator (PostgreSQL)
    NotILike,  // !~~* operator (PostgreSQL)
    NotRLike,  // !~ operator (PostgreSQL)
    NotIRLike, // !~* operator (PostgreSQL)
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window function keywords
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // DDL-specific keywords (Phase 4)
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    // MATCH_RECOGNIZE tokens
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // Special
    /// End-of-input marker.
    // NOTE(review): presumably appended once at the end of the token stream —
    // confirm against the tokenizer implementation.
    Eof,
}
665
impl TokenType {
    /// Check if this token type is a keyword that can be used as an identifier in certain contexts
    ///
    /// NOTE(review): this is a hand-maintained allowlist. When a new keyword is
    /// registered in the tokenizer, decide explicitly whether it belongs here
    /// too — the trailing section below exists because some were missed before.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            TokenType::Select
                | TokenType::From
                | TokenType::Where
                | TokenType::And
                | TokenType::Or
                | TokenType::Not
                | TokenType::In
                | TokenType::Is
                | TokenType::Null
                | TokenType::True
                | TokenType::False
                | TokenType::As
                | TokenType::On
                | TokenType::Join
                | TokenType::Left
                | TokenType::Right
                | TokenType::Inner
                | TokenType::Outer
                | TokenType::Full
                | TokenType::Cross
                | TokenType::Semi
                | TokenType::Anti
                | TokenType::Union
                | TokenType::Except
                | TokenType::Intersect
                | TokenType::GroupBy
                | TokenType::OrderBy
                | TokenType::Having
                | TokenType::Limit
                | TokenType::Offset
                | TokenType::Case
                | TokenType::When
                | TokenType::Then
                | TokenType::Else
                | TokenType::End
                | TokenType::Create
                | TokenType::Drop
                | TokenType::Alter
                | TokenType::Insert
                | TokenType::Update
                | TokenType::Delete
                | TokenType::Into
                | TokenType::Values
                | TokenType::Set
                | TokenType::With
                | TokenType::Distinct
                | TokenType::All
                | TokenType::Exists
                | TokenType::Between
                | TokenType::Like
                | TokenType::ILike
                // Additional keywords that can be used as identifiers
                | TokenType::Filter
                | TokenType::Date
                | TokenType::Timestamp
                | TokenType::TimestampTz
                | TokenType::Interval
                | TokenType::Time
                | TokenType::Table
                | TokenType::Index
                | TokenType::Column
                | TokenType::Database
                | TokenType::Schema
                | TokenType::View
                | TokenType::Function
                | TokenType::Procedure
                | TokenType::Trigger
                | TokenType::Sequence
                | TokenType::Over
                | TokenType::Partition
                | TokenType::Window
                | TokenType::Rows
                | TokenType::Range
                | TokenType::First
                | TokenType::Last
                | TokenType::Preceding
                | TokenType::Following
                | TokenType::Current
                | TokenType::Row
                | TokenType::Unbounded
                | TokenType::Array
                | TokenType::Struct
                | TokenType::Map
                | TokenType::PrimaryKey
                | TokenType::Key
                | TokenType::ForeignKey
                | TokenType::References
                | TokenType::Unique
                | TokenType::Check
                | TokenType::Default
                | TokenType::Constraint
                | TokenType::Comment
                | TokenType::Rollup
                | TokenType::Cube
                | TokenType::Grant
                | TokenType::Revoke
                | TokenType::Type
                | TokenType::Use
                | TokenType::Cache
                | TokenType::Uncache
                | TokenType::Load
                | TokenType::Any
                | TokenType::Some
                | TokenType::Asc
                | TokenType::Desc
                | TokenType::Nulls
                | TokenType::Lateral
                | TokenType::Natural
                | TokenType::Escape
                | TokenType::Glob
                | TokenType::Match
                | TokenType::Recursive
                | TokenType::Replace
                | TokenType::Returns
                | TokenType::If
                | TokenType::Pivot
                | TokenType::Unpivot
                | TokenType::Json
                | TokenType::Blob
                | TokenType::Text
                | TokenType::Int
                | TokenType::BigInt
                | TokenType::SmallInt
                | TokenType::TinyInt
                | TokenType::Int128
                | TokenType::UInt128
                | TokenType::Int256
                | TokenType::UInt256
                | TokenType::UInt
                | TokenType::UBigInt
                | TokenType::Float
                | TokenType::Double
                | TokenType::Decimal
                | TokenType::Boolean
                | TokenType::VarChar
                | TokenType::Char
                | TokenType::Binary
                | TokenType::VarBinary
                | TokenType::No
                | TokenType::DateTime
                | TokenType::Truncate
                | TokenType::Execute
                | TokenType::Merge
                | TokenType::Top
                | TokenType::Begin
                | TokenType::Generated
                | TokenType::Identity
                | TokenType::Always
                | TokenType::Extract
                // Keywords that can be identifiers in certain contexts
                | TokenType::AsOf
                | TokenType::Prior
                | TokenType::After
                | TokenType::Restrict
                | TokenType::Cascade
                | TokenType::Local
                | TokenType::Rename
                | TokenType::Enum
                | TokenType::Within
                | TokenType::Format
                | TokenType::Final
                | TokenType::FileFormat
                | TokenType::Input
                | TokenType::InputFormat
                | TokenType::Copy
                | TokenType::Put
                | TokenType::Get
                | TokenType::Show
                | TokenType::Serde
                | TokenType::Sample
                | TokenType::Sort
                | TokenType::Collate
                | TokenType::Ties
                | TokenType::IsNull
                | TokenType::NotNull
                | TokenType::Exclude
                | TokenType::Temporary
                | TokenType::Add
                | TokenType::Ordinality
                | TokenType::Overlaps
                | TokenType::Block
                | TokenType::Pattern
                | TokenType::Group
                | TokenType::Cluster
                | TokenType::Repeatable
                | TokenType::Groups
                | TokenType::Commit
                | TokenType::Warehouse
                | TokenType::System
                | TokenType::By
                | TokenType::To
                | TokenType::Fetch
                | TokenType::For
                | TokenType::Only
                | TokenType::Next
                | TokenType::Lock
                | TokenType::Refresh
                | TokenType::Settings
                | TokenType::Operator
                | TokenType::Overwrite
                | TokenType::StraightJoin
                | TokenType::Start
                // Additional keywords registered in tokenizer but previously missing from is_keyword()
                | TokenType::Ignore
                | TokenType::Domain
                | TokenType::Apply
                | TokenType::Respect
                | TokenType::Materialized
                | TokenType::Prewhere
                | TokenType::Old
                | TokenType::New
                | TokenType::Cast
                | TokenType::TryCast
                | TokenType::SafeCast
                | TokenType::Transaction
                | TokenType::Describe
                | TokenType::Kill
                | TokenType::Lambda
                | TokenType::Declare
                | TokenType::Keep
                | TokenType::Output
                | TokenType::Percent
                | TokenType::Qualify
                | TokenType::Returning
                | TokenType::Language
                | TokenType::Preserve
                | TokenType::Savepoint
                | TokenType::Rollback
                | TokenType::Body
                | TokenType::Increment
                | TokenType::Minvalue
                | TokenType::Maxvalue
                | TokenType::Cycle
                | TokenType::NoCycle
                | TokenType::Seed
                | TokenType::Namespace
                | TokenType::Authorization
                | TokenType::Order
                | TokenType::Restart
                | TokenType::Before
                | TokenType::Instead
                | TokenType::Each
                | TokenType::Statement
                | TokenType::Referencing
                | TokenType::Of
                | TokenType::Separator
                | TokenType::Others
                | TokenType::Placing
                | TokenType::Owned
                | TokenType::Running
                | TokenType::Define
                | TokenType::Measures
                | TokenType::MatchRecognize
                | TokenType::AutoIncrement
                | TokenType::Connect
                | TokenType::Distribute
                | TokenType::Bernoulli
                | TokenType::TableSample
                | TokenType::Inpath
                | TokenType::Pragma
                | TokenType::Siblings
                | TokenType::SerdeProperties
                | TokenType::RLike
        )
    }

    /// Check if this token type is a comparison operator
    /// (`Eq`, `Neq`, `Lt`, `Lte`, `Gt`, `Gte`, `NullsafeEq`).
    pub fn is_comparison(&self) -> bool {
        matches!(
            self,
            TokenType::Eq
                | TokenType::Neq
                | TokenType::Lt
                | TokenType::Lte
                | TokenType::Gt
                | TokenType::Gte
                | TokenType::NullsafeEq
        )
    }

    /// Check if this token type is an arithmetic operator
    /// (`Plus`, `Dash`, `Star`, `Slash`, `Percent`, `Mod`, `Div`).
    pub fn is_arithmetic(&self) -> bool {
        matches!(
            self,
            TokenType::Plus
                | TokenType::Dash
                | TokenType::Star
                | TokenType::Slash
                | TokenType::Percent
                | TokenType::Mod
                | TokenType::Div
        )
    }
}
965
966impl fmt::Display for TokenType {
967    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
968        write!(f, "{:?}", self)
969    }
970}
971
/// Tokenizer configuration for a dialect
///
/// `Default` (implemented below) pre-populates `keywords` with the base SQL
/// keyword set; dialect constructors presumably extend or override those
/// entries — confirm against the dialect modules.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keywords mapping (uppercase keyword -> token type)
    pub keywords: std::collections::HashMap<String, TokenType>,
    /// Single character tokens
    pub single_tokens: std::collections::HashMap<char, TokenType>,
    /// Quote characters (start -> end)
    pub quotes: std::collections::HashMap<String, String>,
    /// Identifier quote characters (start -> end)
    pub identifiers: std::collections::HashMap<char, char>,
    /// Comment definitions (start -> optional end; `None` means line comment)
    pub comments: std::collections::HashMap<String, Option<String>>,
    /// String escape characters
    pub string_escapes: Vec<char>,
    /// Whether to support nested comments
    pub nested_comments: bool,
    /// Valid escape follow characters (for MySQL-style escaping).
    /// When a backslash is followed by a character NOT in this list,
    /// the backslash is discarded. When empty, all backslash escapes
    /// preserve the backslash for unrecognized sequences.
    pub escape_follow_chars: Vec<char>,
    /// Whether b'...' is a byte string (true for BigQuery) or bit string (false for standard SQL).
    /// Default is false (bit string).
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}
    /// Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT)
    pub numeric_literals: std::collections::HashMap<String, String>,
    /// Whether unquoted identifiers can start with a digit (e.g., `1a`, `1_a`).
    /// When true, a number followed by letters/underscore is treated as an identifier.
    /// Used by Hive, Spark, MySQL, ClickHouse.
    pub identifiers_can_start_with_digit: bool,
    /// Whether 0x/0X prefix should be treated as hex literals.
    /// When true, `0XCC` is tokenized instead of Number("0") + Identifier("XCC").
    /// Used by BigQuery, SQLite, Teradata.
    pub hex_number_strings: bool,
    /// Whether hex string literals from 0x prefix represent integer values.
    /// When true (BigQuery), 0xA is tokenized as HexNumber (integer in hex notation).
    /// When false (SQLite, Teradata), 0xCC is tokenized as HexString (binary/blob value).
    pub hex_string_is_integer_type: bool,
    /// Whether string escape sequences (like \') are allowed in raw strings.
    /// When true (BigQuery default), \' inside r'...' escapes the quote.
    /// When false (Spark/Databricks), backslashes in raw strings are always literal.
    /// Python sqlglot: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True)
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether # starts a single-line comment (ClickHouse, MySQL)
    pub hash_comments: bool,
    /// Whether $ can start/continue an identifier (ClickHouse).
    /// When true, a bare `$` that is not part of a dollar-quoted string or positional
    /// parameter is treated as an identifier character.
    pub dollar_sign_is_identifier: bool,
    /// Whether INSERT ... FORMAT <name> should treat subsequent data as raw (ClickHouse).
    /// When true, after tokenizing `INSERT ... FORMAT <non-VALUES-name>`, all text until
    /// the next blank line or end of input is consumed as a raw data token.
    pub insert_format_raw_data: bool,
}
1028
1029impl Default for TokenizerConfig {
1030    fn default() -> Self {
1031        let mut keywords = std::collections::HashMap::new();
1032        // Add basic SQL keywords
1033        keywords.insert("SELECT".to_string(), TokenType::Select);
1034        keywords.insert("FROM".to_string(), TokenType::From);
1035        keywords.insert("WHERE".to_string(), TokenType::Where);
1036        keywords.insert("AND".to_string(), TokenType::And);
1037        keywords.insert("OR".to_string(), TokenType::Or);
1038        keywords.insert("NOT".to_string(), TokenType::Not);
1039        keywords.insert("AS".to_string(), TokenType::As);
1040        keywords.insert("ON".to_string(), TokenType::On);
1041        keywords.insert("JOIN".to_string(), TokenType::Join);
1042        keywords.insert("LEFT".to_string(), TokenType::Left);
1043        keywords.insert("RIGHT".to_string(), TokenType::Right);
1044        keywords.insert("INNER".to_string(), TokenType::Inner);
1045        keywords.insert("OUTER".to_string(), TokenType::Outer);
1046        keywords.insert("OUTPUT".to_string(), TokenType::Output);
1047        keywords.insert("FULL".to_string(), TokenType::Full);
1048        keywords.insert("CROSS".to_string(), TokenType::Cross);
1049        keywords.insert("SEMI".to_string(), TokenType::Semi);
1050        keywords.insert("ANTI".to_string(), TokenType::Anti);
1051        keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1052        keywords.insert("UNION".to_string(), TokenType::Union);
1053        keywords.insert("EXCEPT".to_string(), TokenType::Except);
1054        keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle/Redshift alias for EXCEPT
1055        keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1056        keywords.insert("GROUP".to_string(), TokenType::Group);
1057        keywords.insert("CUBE".to_string(), TokenType::Cube);
1058        keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1059        keywords.insert("WITHIN".to_string(), TokenType::Within);
1060        keywords.insert("ORDER".to_string(), TokenType::Order);
1061        keywords.insert("BY".to_string(), TokenType::By);
1062        keywords.insert("HAVING".to_string(), TokenType::Having);
1063        keywords.insert("LIMIT".to_string(), TokenType::Limit);
1064        keywords.insert("OFFSET".to_string(), TokenType::Offset);
1065        keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1066        keywords.insert("FETCH".to_string(), TokenType::Fetch);
1067        keywords.insert("FIRST".to_string(), TokenType::First);
1068        keywords.insert("NEXT".to_string(), TokenType::Next);
1069        keywords.insert("ONLY".to_string(), TokenType::Only);
1070        keywords.insert("KEEP".to_string(), TokenType::Keep);
1071        keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1072        keywords.insert("INPUT".to_string(), TokenType::Input);
1073        keywords.insert("CASE".to_string(), TokenType::Case);
1074        keywords.insert("WHEN".to_string(), TokenType::When);
1075        keywords.insert("THEN".to_string(), TokenType::Then);
1076        keywords.insert("ELSE".to_string(), TokenType::Else);
1077        keywords.insert("END".to_string(), TokenType::End);
1078        keywords.insert("ENDIF".to_string(), TokenType::End); // Exasol alias for END
1079        keywords.insert("NULL".to_string(), TokenType::Null);
1080        keywords.insert("TRUE".to_string(), TokenType::True);
1081        keywords.insert("FALSE".to_string(), TokenType::False);
1082        keywords.insert("IS".to_string(), TokenType::Is);
1083        keywords.insert("IN".to_string(), TokenType::In);
1084        keywords.insert("BETWEEN".to_string(), TokenType::Between);
1085        keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1086        keywords.insert("LIKE".to_string(), TokenType::Like);
1087        keywords.insert("ILIKE".to_string(), TokenType::ILike);
1088        keywords.insert("RLIKE".to_string(), TokenType::RLike);
1089        keywords.insert("REGEXP".to_string(), TokenType::RLike);
1090        keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1091        keywords.insert("EXISTS".to_string(), TokenType::Exists);
1092        keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1093        keywords.insert("ALL".to_string(), TokenType::All);
1094        keywords.insert("WITH".to_string(), TokenType::With);
1095        keywords.insert("CREATE".to_string(), TokenType::Create);
1096        keywords.insert("DROP".to_string(), TokenType::Drop);
1097        keywords.insert("ALTER".to_string(), TokenType::Alter);
1098        keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1099        keywords.insert("TABLE".to_string(), TokenType::Table);
1100        keywords.insert("VIEW".to_string(), TokenType::View);
1101        keywords.insert("INDEX".to_string(), TokenType::Index);
1102        keywords.insert("COLUMN".to_string(), TokenType::Column);
1103        keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1104        keywords.insert("ADD".to_string(), TokenType::Add);
1105        keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1106        keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1107        keywords.insert("RENAME".to_string(), TokenType::Rename);
1108        keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1109        keywords.insert("TEMP".to_string(), TokenType::Temporary);
1110        keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1111        keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1112        keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1113        keywords.insert("KEY".to_string(), TokenType::Key);
1114        keywords.insert("KILL".to_string(), TokenType::Kill);
1115        keywords.insert("REFERENCES".to_string(), TokenType::References);
1116        keywords.insert("DEFAULT".to_string(), TokenType::Default);
1117        keywords.insert("DECLARE".to_string(), TokenType::Declare);
1118        keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1119        keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); // Snowflake style
1120        keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1121        keywords.insert("REPLACE".to_string(), TokenType::Replace);
1122        keywords.insert("TO".to_string(), TokenType::To);
1123        keywords.insert("INSERT".to_string(), TokenType::Insert);
1124        keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1125        keywords.insert("UPDATE".to_string(), TokenType::Update);
1126        keywords.insert("USE".to_string(), TokenType::Use);
1127        keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1128        keywords.insert("GLOB".to_string(), TokenType::Glob);
1129        keywords.insert("DELETE".to_string(), TokenType::Delete);
1130        keywords.insert("MERGE".to_string(), TokenType::Merge);
1131        keywords.insert("CACHE".to_string(), TokenType::Cache);
1132        keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1133        keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1134        keywords.insert("GRANT".to_string(), TokenType::Grant);
1135        keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1136        keywords.insert("COMMENT".to_string(), TokenType::Comment);
1137        keywords.insert("COLLATE".to_string(), TokenType::Collate);
1138        keywords.insert("INTO".to_string(), TokenType::Into);
1139        keywords.insert("VALUES".to_string(), TokenType::Values);
1140        keywords.insert("SET".to_string(), TokenType::Set);
1141        keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1142        keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1143        keywords.insert("ASC".to_string(), TokenType::Asc);
1144        keywords.insert("DESC".to_string(), TokenType::Desc);
1145        keywords.insert("NULLS".to_string(), TokenType::Nulls);
1146        keywords.insert("RESPECT".to_string(), TokenType::Respect);
1147        keywords.insert("FIRST".to_string(), TokenType::First);
1148        keywords.insert("LAST".to_string(), TokenType::Last);
1149        keywords.insert("IF".to_string(), TokenType::If);
1150        keywords.insert("CAST".to_string(), TokenType::Cast);
1151        keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1152        keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1153        keywords.insert("OVER".to_string(), TokenType::Over);
1154        keywords.insert("PARTITION".to_string(), TokenType::Partition);
1155        keywords.insert("PLACING".to_string(), TokenType::Placing);
1156        keywords.insert("WINDOW".to_string(), TokenType::Window);
1157        keywords.insert("ROWS".to_string(), TokenType::Rows);
1158        keywords.insert("RANGE".to_string(), TokenType::Range);
1159        keywords.insert("FILTER".to_string(), TokenType::Filter);
1160        keywords.insert("NATURAL".to_string(), TokenType::Natural);
1161        keywords.insert("USING".to_string(), TokenType::Using);
1162        keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1163        keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1164        keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1165        keywords.insert("CURRENT".to_string(), TokenType::Current);
1166        keywords.insert("ROW".to_string(), TokenType::Row);
1167        keywords.insert("GROUPS".to_string(), TokenType::Groups);
1168        keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1169        // TRIM function position keywords
1170        keywords.insert("BOTH".to_string(), TokenType::Both);
1171        keywords.insert("LEADING".to_string(), TokenType::Leading);
1172        keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1173        keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1174        // Phase 3: Additional keywords
1175        keywords.insert("TOP".to_string(), TokenType::Top);
1176        keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1177        keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1178        keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1179        keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1180        keywords.insert("SYSTEM".to_string(), TokenType::System);
1181        keywords.insert("BLOCK".to_string(), TokenType::Block);
1182        keywords.insert("SEED".to_string(), TokenType::Seed);
1183        keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1184        keywords.insert("TIES".to_string(), TokenType::Ties);
1185        keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1186        keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1187        keywords.insert("APPLY".to_string(), TokenType::Apply);
1188        // Oracle CONNECT BY keywords
1189        keywords.insert("CONNECT".to_string(), TokenType::Connect);
1190        // Hive/Spark specific keywords
1191        keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1192        keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1193        keywords.insert("SORT".to_string(), TokenType::Sort);
1194        keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1195        keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1196        keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1197        keywords.insert("FOR".to_string(), TokenType::For);
1198        keywords.insert("ANY".to_string(), TokenType::Any);
1199        keywords.insert("SOME".to_string(), TokenType::Some);
1200        keywords.insert("ASOF".to_string(), TokenType::AsOf);
1201        keywords.insert("PERCENT".to_string(), TokenType::Percent);
1202        keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1203        keywords.insert("NO".to_string(), TokenType::No);
1204        keywords.insert("OTHERS".to_string(), TokenType::Others);
1205        // PostgreSQL OPERATOR() syntax for schema-qualified operators
1206        keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1207        // Phase 4: DDL keywords
1208        keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1209        keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1210        keywords.insert("DATABASE".to_string(), TokenType::Database);
1211        keywords.insert("FUNCTION".to_string(), TokenType::Function);
1212        keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1213        keywords.insert("PROC".to_string(), TokenType::Procedure);
1214        keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1215        keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1216        keywords.insert("TYPE".to_string(), TokenType::Type);
1217        keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1218        keywords.insert("RETURNS".to_string(), TokenType::Returns);
1219        keywords.insert("RETURNING".to_string(), TokenType::Returning);
1220        keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1221        keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1222        keywords.insert("COMMIT".to_string(), TokenType::Commit);
1223        keywords.insert("BEGIN".to_string(), TokenType::Begin);
1224        keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1225        keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1226        keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1227        keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1228        keywords.insert("BODY".to_string(), TokenType::Body);
1229        keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1230        keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1231        keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1232        keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1233        keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1234        keywords.insert("PRIOR".to_string(), TokenType::Prior);
1235        // MATCH_RECOGNIZE keywords
1236        keywords.insert("MATCH".to_string(), TokenType::Match);
1237        keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1238        keywords.insert("MEASURES".to_string(), TokenType::Measures);
1239        keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1240        keywords.insert("DEFINE".to_string(), TokenType::Define);
1241        keywords.insert("RUNNING".to_string(), TokenType::Running);
1242        keywords.insert("FINAL".to_string(), TokenType::Final);
1243        keywords.insert("OWNED".to_string(), TokenType::Owned);
1244        keywords.insert("AFTER".to_string(), TokenType::After);
1245        keywords.insert("BEFORE".to_string(), TokenType::Before);
1246        keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1247        keywords.insert("EACH".to_string(), TokenType::Each);
1248        keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1249        keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1250        keywords.insert("OLD".to_string(), TokenType::Old);
1251        keywords.insert("NEW".to_string(), TokenType::New);
1252        keywords.insert("OF".to_string(), TokenType::Of);
1253        keywords.insert("CHECK".to_string(), TokenType::Check);
1254        keywords.insert("START".to_string(), TokenType::Start);
1255        keywords.insert("ENUM".to_string(), TokenType::Enum);
1256        keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1257        keywords.insert("RESTART".to_string(), TokenType::Restart);
1258        // Date/time literal keywords
1259        keywords.insert("DATE".to_string(), TokenType::Date);
1260        keywords.insert("TIME".to_string(), TokenType::Time);
1261        keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1262        keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1263        keywords.insert("GENERATED".to_string(), TokenType::Generated);
1264        keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1265        keywords.insert("ALWAYS".to_string(), TokenType::Always);
1266        // LOAD DATA keywords
1267        keywords.insert("LOAD".to_string(), TokenType::Load);
1268        keywords.insert("LOCAL".to_string(), TokenType::Local);
1269        keywords.insert("INPATH".to_string(), TokenType::Inpath);
1270        keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1271        keywords.insert("SERDE".to_string(), TokenType::Serde);
1272        keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1273        keywords.insert("FORMAT".to_string(), TokenType::Format);
1274        // SQLite
1275        keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1276        // SHOW statement
1277        keywords.insert("SHOW".to_string(), TokenType::Show);
1278        // Oracle ORDER SIBLINGS BY (hierarchical queries)
1279        keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1280        // COPY and PUT statements (Snowflake, PostgreSQL)
1281        keywords.insert("COPY".to_string(), TokenType::Copy);
1282        keywords.insert("PUT".to_string(), TokenType::Put);
1283        keywords.insert("GET".to_string(), TokenType::Get);
1284        // EXEC/EXECUTE statement (TSQL, etc.)
1285        keywords.insert("EXEC".to_string(), TokenType::Execute);
1286        keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1287        // Postfix null check operators (PostgreSQL/SQLite)
1288        keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1289        keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1290
1291        let mut single_tokens = std::collections::HashMap::new();
1292        single_tokens.insert('(', TokenType::LParen);
1293        single_tokens.insert(')', TokenType::RParen);
1294        single_tokens.insert('[', TokenType::LBracket);
1295        single_tokens.insert(']', TokenType::RBracket);
1296        single_tokens.insert('{', TokenType::LBrace);
1297        single_tokens.insert('}', TokenType::RBrace);
1298        single_tokens.insert(',', TokenType::Comma);
1299        single_tokens.insert('.', TokenType::Dot);
1300        single_tokens.insert(';', TokenType::Semicolon);
1301        single_tokens.insert('+', TokenType::Plus);
1302        single_tokens.insert('-', TokenType::Dash);
1303        single_tokens.insert('*', TokenType::Star);
1304        single_tokens.insert('/', TokenType::Slash);
1305        single_tokens.insert('%', TokenType::Percent);
1306        single_tokens.insert('&', TokenType::Amp);
1307        single_tokens.insert('|', TokenType::Pipe);
1308        single_tokens.insert('^', TokenType::Caret);
1309        single_tokens.insert('~', TokenType::Tilde);
1310        single_tokens.insert('<', TokenType::Lt);
1311        single_tokens.insert('>', TokenType::Gt);
1312        single_tokens.insert('=', TokenType::Eq);
1313        single_tokens.insert('!', TokenType::Exclamation);
1314        single_tokens.insert(':', TokenType::Colon);
1315        single_tokens.insert('@', TokenType::DAt);
1316        single_tokens.insert('#', TokenType::Hash);
1317        single_tokens.insert('$', TokenType::Dollar);
1318        single_tokens.insert('?', TokenType::Parameter);
1319
1320        let mut quotes = std::collections::HashMap::new();
1321        quotes.insert("'".to_string(), "'".to_string());
1322        // Triple-quoted strings (e.g., """x""")
1323        quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());
1324
1325        let mut identifiers = std::collections::HashMap::new();
1326        identifiers.insert('"', '"');
1327        identifiers.insert('`', '`');
1328        // Note: TSQL bracket-quoted identifiers [name] are handled in the parser
1329        // because [ is also used for arrays and subscripts
1330
1331        let mut comments = std::collections::HashMap::new();
1332        comments.insert("--".to_string(), None);
1333        comments.insert("/*".to_string(), Some("*/".to_string()));
1334
1335        Self {
1336            keywords,
1337            single_tokens,
1338            quotes,
1339            identifiers,
1340            comments,
1341            // Standard SQL: only '' (doubled quote) escapes a quote
1342            // Backslash escapes are dialect-specific (MySQL, etc.)
1343            string_escapes: vec!['\''],
1344            nested_comments: true,
1345            // By default, no escape_follow_chars means preserve backslash for unrecognized escapes
1346            escape_follow_chars: vec![],
1347            // Default: b'...' is bit string (standard SQL), not byte string (BigQuery)
1348            b_prefix_is_byte_string: false,
1349            numeric_literals: std::collections::HashMap::new(),
1350            identifiers_can_start_with_digit: false,
1351            hex_number_strings: false,
1352            hex_string_is_integer_type: false,
1353            // Default: backslash escapes ARE allowed in raw strings (sqlglot default)
1354            // Spark/Databricks set this to false
1355            string_escapes_allowed_in_raw_strings: true,
1356            hash_comments: false,
1357            dollar_sign_is_identifier: false,
1358            insert_format_raw_data: false,
1359        }
1360    }
1361}
1362
/// SQL Tokenizer
///
/// Holds a [`TokenizerConfig`] and converts SQL text into a `Vec<Token>`
/// via [`Tokenizer::tokenize`].
pub struct Tokenizer {
    /// Dialect-specific settings (keyword table, single-char tokens, quote
    /// styles, identifier delimiters, comment markers, escape behavior, ...).
    config: TokenizerConfig,
}
1367
1368impl Tokenizer {
1369    /// Create a new tokenizer with the given configuration
1370    pub fn new(config: TokenizerConfig) -> Self {
1371        Self { config }
1372    }
1373
1374    /// Create a tokenizer with default configuration
1375    pub fn default_config() -> Self {
1376        Self::new(TokenizerConfig::default())
1377    }
1378
1379    /// Tokenize a SQL string
1380    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1381        let mut state = TokenizerState::new(sql, &self.config);
1382        state.tokenize()
1383    }
1384}
1385
1386impl Default for Tokenizer {
1387    fn default() -> Self {
1388        Self::default_config()
1389    }
1390}
1391
/// Internal state for tokenization
struct TokenizerState<'a> {
    /// The raw SQL text being tokenized.
    source: &'a str,
    /// True when `source` is pure ASCII, which lets `text_from_range` slice
    /// the source by byte offset instead of rebuilding from `chars`.
    source_is_ascii: bool,
    /// The input decomposed into chars; `start`/`current` index into this.
    chars: Vec<char>,
    /// Total number of chars in the input.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Char index where the token currently being scanned started.
    start: usize,
    /// Char index of the next character to consume.
    current: usize,
    /// Current line (1-based); maintained by `advance`.
    line: usize,
    /// Current column (1-based); reset to 1 on newline by `advance`.
    column: usize,
    /// Pending leading comments waiting to be attached to the next token.
    comments: Vec<String>,
    /// Dialect-specific tokenizer settings.
    config: &'a TokenizerConfig,
}
1406
1407impl<'a> TokenizerState<'a> {
1408    fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1409        let chars: Vec<char> = sql.chars().collect();
1410        let size = chars.len();
1411        Self {
1412            source: sql,
1413            source_is_ascii: sql.is_ascii(),
1414            chars,
1415            size,
1416            tokens: Vec::new(),
1417            start: 0,
1418            current: 0,
1419            line: 1,
1420            column: 1,
1421            comments: Vec::new(),
1422            config,
1423        }
1424    }
1425
    /// Main tokenization loop: repeatedly skip whitespace/comments and scan
    /// one token until the input is exhausted, then drain any leftover
    /// leading comments onto the last token.
    fn tokenize(&mut self) -> Result<Vec<Token>> {
        while !self.is_at_end() {
            self.skip_whitespace();
            if self.is_at_end() {
                break;
            }

            // Mark the start of the next token before scanning it.
            self.start = self.current;
            self.scan_token()?;

            // ClickHouse: After INSERT ... FORMAT <name> (where name != VALUES),
            // the rest until the next blank line or end of input is raw data.
            if self.config.insert_format_raw_data {
                if let Some(raw) = self.try_scan_insert_format_raw_data() {
                    if !raw.is_empty() {
                        self.start = self.current;
                        self.add_token_with_text(TokenType::Var, raw);
                    }
                }
            }
        }

        // Handle leftover leading comments at end of input.
        // These are comments on a new line after the last token that couldn't be attached
        // as leading comments to a subsequent token (because there is none).
        // Attach them as trailing comments on the last token so they're preserved.
        if !self.comments.is_empty() {
            if let Some(last) = self.tokens.last_mut() {
                last.trailing_comments.extend(self.comments.drain(..));
            }
        }

        Ok(std::mem::take(&mut self.tokens))
    }
1460
1461    fn is_at_end(&self) -> bool {
1462        self.current >= self.size
1463    }
1464
1465    #[inline]
1466    fn text_from_range(&self, start: usize, end: usize) -> String {
1467        if self.source_is_ascii {
1468            self.source[start..end].to_string()
1469        } else {
1470            self.chars[start..end].iter().collect()
1471        }
1472    }
1473
1474    fn peek(&self) -> char {
1475        if self.is_at_end() {
1476            '\0'
1477        } else {
1478            self.chars[self.current]
1479        }
1480    }
1481
1482    fn peek_next(&self) -> char {
1483        if self.current + 1 >= self.size {
1484            '\0'
1485        } else {
1486            self.chars[self.current + 1]
1487        }
1488    }
1489
1490    fn advance(&mut self) -> char {
1491        let c = self.peek();
1492        self.current += 1;
1493        if c == '\n' {
1494            self.line += 1;
1495            self.column = 1;
1496        } else {
1497            self.column += 1;
1498        }
1499        c
1500    }
1501
    /// Skip whitespace and consume comments between tokens.
    ///
    /// Comments that begin on a fresh line become leading comments queued for
    /// the next token; comments on the same line trail the previous token.
    /// Hint comments (`/*+ ... */`) are NOT consumed here — the loop breaks so
    /// they can be emitted as Hint tokens (see `scan_hint`).
    /// NOTE: the match-arm order is load-bearing — the hash-comment `//` arm
    /// shadows the dialect-configured `//` arm when both conditions hold.
    fn skip_whitespace(&mut self) {
        // Track whether we've seen a newline since the last token.
        // Comments on a new line (after a newline) are leading comments on the next token,
        // while comments on the same line are trailing comments on the previous token.
        // This matches Python sqlglot's behavior.
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                '\u{00A0}' // non-breaking space
                | '\u{2000}'..='\u{200B}' // various Unicode spaces + zero-width space
                | '\u{3000}' // ideographic (full-width) space
                | '\u{FEFF}' // BOM / zero-width no-break space
                => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    // ClickHouse: // single-line comments (same dialects that support # comments)
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // Check if this is a hint comment /*+ ... */
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        // This is a hint comment, handle it as a token instead of skipping
                        break;
                    }
                    if self.scan_block_comment(saw_newline).is_err() {
                        // NOTE(review): the unterminated-comment error is discarded
                        // here — presumably rediscovered by the token scanner; confirm.
                        return;
                    }
                    // Don't reset saw_newline - it carries forward
                }
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Dialect-specific // line comment (e.g., Snowflake)
                    // But NOT inside URIs like file:// or paths with consecutive slashes
                    // Check that previous non-whitespace char is not ':' or '/'
                    let prev_non_ws = if self.current > 0 {
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        // This is likely a URI (file://, http://) or path, not a comment
                        break;
                    }
                    self.scan_line_comment(saw_newline);
                    // After a line comment, we're always on a new line
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                _ => break,
            }
        }
    }
1573
1574    fn scan_hash_line_comment(&mut self) {
1575        self.advance(); // #
1576        let start = self.current;
1577        while !self.is_at_end() && self.peek() != '\n' {
1578            self.advance();
1579        }
1580        let comment = self.text_from_range(start, self.current);
1581        let comment_text = comment.trim().to_string();
1582        if let Some(last) = self.tokens.last_mut() {
1583            last.trailing_comments.push(comment_text);
1584        } else {
1585            self.comments.push(comment_text);
1586        }
1587    }
1588
1589    fn scan_double_slash_comment(&mut self) {
1590        self.advance(); // /
1591        self.advance(); // /
1592        let start = self.current;
1593        while !self.is_at_end() && self.peek() != '\n' {
1594            self.advance();
1595        }
1596        let comment = self.text_from_range(start, self.current);
1597        let comment_text = comment.trim().to_string();
1598        if let Some(last) = self.tokens.last_mut() {
1599            last.trailing_comments.push(comment_text);
1600        } else {
1601            self.comments.push(comment_text);
1602        }
1603    }
1604
1605    fn scan_line_comment(&mut self, after_newline: bool) {
1606        self.advance(); // -
1607        self.advance(); // -
1608        let start = self.current;
1609        while !self.is_at_end() && self.peek() != '\n' {
1610            self.advance();
1611        }
1612        let comment_text = self.text_from_range(start, self.current);
1613
1614        // If the comment starts on a new line (after_newline), it's a leading comment
1615        // on the next token. Otherwise, it's a trailing comment on the previous token.
1616        if after_newline || self.tokens.is_empty() {
1617            self.comments.push(comment_text);
1618        } else if let Some(last) = self.tokens.last_mut() {
1619            last.trailing_comments.push(comment_text);
1620        }
1621    }
1622
    /// Consume a block comment `/* ... */`, honoring nesting when
    /// `config.nested_comments` is set, and attach the full comment text
    /// (delimiters included) as a leading or trailing comment.
    ///
    /// Returns an error if the comment is unterminated.
    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // /
        self.advance(); // *
        let content_start = self.current;
        // Nesting depth: starts at 1 for the comment we just opened.
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                // When depth reaches 0, the final "*/" is deliberately left
                // unconsumed so `content` below excludes it; inner "*/"
                // markers are consumed as part of the content.
                if depth > 0 {
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
            ));
        }

        // Get the content between /* and */ (preserving internal whitespace for nested comments)
        let content = self.text_from_range(content_start, self.current);
        self.advance(); // *
        self.advance(); // /

        // For round-trip fidelity, preserve the exact comment content including nested comments
        let comment_text = format!("/*{}*/", content);

        // If the comment starts on a new line (after_newline), it's a leading comment
        // on the next token. Otherwise, it's a trailing comment on the previous token.
        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }

        Ok(())
    }
1671
1672    /// Scan a hint comment /*+ ... */ and return it as a Hint token
1673    fn scan_hint(&mut self) -> Result<()> {
1674        self.advance(); // /
1675        self.advance(); // *
1676        self.advance(); // +
1677        let hint_start = self.current;
1678
1679        // Scan until we find */
1680        while !self.is_at_end() {
1681            if self.peek() == '*' && self.peek_next() == '/' {
1682                break;
1683            }
1684            self.advance();
1685        }
1686
1687        if self.is_at_end() {
1688            return Err(Error::tokenize(
1689                "Unterminated hint comment",
1690                self.line,
1691                self.column,
1692            ));
1693        }
1694
1695        let hint_text = self.text_from_range(hint_start, self.current);
1696        self.advance(); // *
1697        self.advance(); // /
1698
1699        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1700
1701        Ok(())
1702    }
1703
1704    /// Scan a positional parameter: $1, $2, etc.
1705    fn scan_positional_parameter(&mut self) -> Result<()> {
1706        self.advance(); // consume $
1707        let start = self.current;
1708
1709        while !self.is_at_end() && self.peek().is_ascii_digit() {
1710            self.advance();
1711        }
1712
1713        let number = self.text_from_range(start, self.current);
1714        self.add_token_with_text(TokenType::Parameter, number);
1715        Ok(())
1716    }
1717
1718    /// Try to scan a tagged dollar-quoted string: $tag$content$tag$
1719    /// Returns Some(()) if successful, None if this isn't a tagged dollar string.
1720    ///
1721    /// The token text is stored as "tag\x00content" to preserve the tag for later use.
1722    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
1723        let saved_pos = self.current;
1724
1725        // We're at '$', next char is alphabetic
1726        self.advance(); // consume opening $
1727
1728        // Scan the tag (identifier: alphanumeric + underscore, including Unicode)
1729        // Tags can contain Unicode characters like emojis (e.g., $🦆$)
1730        let tag_start = self.current;
1731        while !self.is_at_end()
1732            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1733        {
1734            self.advance();
1735        }
1736        let tag = self.text_from_range(tag_start, self.current);
1737
1738        // Must have a closing $ after the tag
1739        if self.is_at_end() || self.peek() != '$' {
1740            // Not a tagged dollar string - restore position
1741            self.current = saved_pos;
1742            return Ok(None);
1743        }
1744        self.advance(); // consume closing $ of opening tag
1745
1746        // Now scan content until we find $tag$
1747        let content_start = self.current;
1748        let closing_tag = format!("${}$", tag);
1749        let closing_chars: Vec<char> = closing_tag.chars().collect();
1750
1751        loop {
1752            if self.is_at_end() {
1753                // Unterminated - restore and fall through
1754                self.current = saved_pos;
1755                return Ok(None);
1756            }
1757
1758            // Check if we've reached the closing tag
1759            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1760                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1761                    self.current + j < self.size && self.chars[self.current + j] == ch
1762                });
1763                if matches {
1764                    let content = self.text_from_range(content_start, self.current);
1765                    // Consume closing tag
1766                    for _ in 0..closing_chars.len() {
1767                        self.advance();
1768                    }
1769                    // Store as "tag\x00content" to preserve the tag
1770                    let token_text = format!("{}\x00{}", tag, content);
1771                    self.add_token_with_text(TokenType::DollarString, token_text);
1772                    return Ok(Some(()));
1773                }
1774            }
1775            self.advance();
1776        }
1777    }
1778
1779    /// Scan a dollar-quoted string: $$content$$ or $tag$content$tag$
1780    ///
1781    /// For $$...$$ (no tag), the token text is just the content.
1782    /// For $tag$...$tag$, use try_scan_tagged_dollar_string instead.
1783    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1784        self.advance(); // consume first $
1785        self.advance(); // consume second $
1786
1787        // For $$...$$ (no tag), just scan until closing $$
1788        let start = self.current;
1789        while !self.is_at_end() {
1790            if self.peek() == '$'
1791                && self.current + 1 < self.size
1792                && self.chars[self.current + 1] == '$'
1793            {
1794                break;
1795            }
1796            self.advance();
1797        }
1798
1799        let content = self.text_from_range(start, self.current);
1800
1801        if !self.is_at_end() {
1802            self.advance(); // consume first $
1803            self.advance(); // consume second $
1804        }
1805
1806        self.add_token_with_text(TokenType::DollarString, content);
1807        Ok(())
1808    }
1809
    /// Dispatch on the current character and scan exactly one token.
    ///
    /// NOTE(review): the ordering of the checks below is load-bearing.
    /// Longer or more specific constructs (triple quotes, hints, tagged
    /// dollar strings, multi-character operators) must be tried before
    /// shorter constructs that share the same leading character, and the
    /// `$`-prefixed cases must be tried in exactly this sequence.
    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        // Check for string literal
        if c == '\'' {
            // Check for triple-quoted string '''...''' if configured
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        // Check for triple-quoted string """...""" if configured
        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        // Check for double-quoted strings when dialect supports them (e.g., BigQuery)
        // This must come before identifier quotes check
        if c == '"'
            && self.config.quotes.contains_key("\"")
            && !self.config.identifiers.contains_key(&'"')
        {
            return self.scan_double_quoted_string();
        }

        // Check for identifier quotes
        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        // Check for numbers (including numbers starting with a dot like .25)
        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // Check for numbers starting with a dot (e.g., .25, .5)
        // This must come before single character token handling
        // Don't treat as a number if:
        // - Previous char was also a dot (e.g., 1..2 should be 1, ., ., 2)
        // - Previous char is an identifier character (e.g., foo.25 should be foo, ., 25)
        //   This handles BigQuery numeric table parts like project.dataset.25
        if c == '.' && self.peek_next().is_ascii_digit() {
            // Peek one char *behind* the cursor to decide how the dot binds.
            let prev_char = if self.current > 0 {
                self.chars[self.current - 1]
            } else {
                '\0'
            };
            // '`', '"', ']', ')' cover quoted identifiers and closing
            // parens, which also force the dot to be a separate token.
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // Check for hint comment /*+ ... */
        if c == '/'
            && self.peek_next() == '*'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '+'
        {
            return self.scan_hint();
        }

        // Check for multi-character operators first
        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // Check for tagged dollar-quoted strings: $tag$content$tag$
        // Tags can contain Unicode characters (including emojis like 🦆) and digits (e.g., $1$)
        if c == '$'
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || !self.peek_next().is_ascii())
        {
            // try_scan_tagged_dollar_string restores the cursor on failure,
            // so falling through here is safe.
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
            // If tagged dollar string didn't match and dollar_sign_is_identifier is set,
            // treat the $ and following chars as an identifier (e.g., ClickHouse $alias$name$).
            if self.config.dollar_sign_is_identifier {
                return self.scan_dollar_identifier();
            }
        }

        // Check for dollar-quoted strings: $$...$$
        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // Check for positional parameters: $1, $2, etc.
        // (Only reached when the tagged-dollar-string attempt above failed,
        // e.g. "$1" with no closing "$1$".)
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        // ClickHouse: bare $ (not followed by alphanumeric/underscore) as identifier
        if c == '$' && self.config.dollar_sign_is_identifier {
            return self.scan_dollar_identifier();
        }

        // TSQL: Check for identifiers starting with # (temp tables) or @ (variables)
        // e.g., #temp, ##global_temp, @variable
        if (c == '#' || c == '@')
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || self.peek_next() == '#')
        {
            return self.scan_tsql_identifier();
        }

        // Check for single character tokens
        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        // Unicode minus (U+2212) → treat as regular minus
        if c == '\u{2212}' {
            self.advance();
            self.add_token(TokenType::Dash);
            return Ok(());
        }

        // Unicode fraction slash (U+2044) → treat as regular slash
        if c == '\u{2044}' {
            self.advance();
            self.add_token(TokenType::Slash);
            return Ok(());
        }

        // Unicode curly/smart quotes → treat as regular string quotes
        if c == '\u{2018}' || c == '\u{2019}' {
            // Left/right single quotation marks → scan as string with matching end
            return self.scan_unicode_quoted_string(c);
        }
        if c == '\u{201C}' || c == '\u{201D}' {
            // Left/right double quotation marks → scan as quoted identifier
            return self.scan_unicode_quoted_identifier(c);
        }

        // Must be an identifier or keyword
        self.scan_identifier_or_keyword()
    }
1969
1970    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
1971        let c = self.peek();
1972        let next = self.peek_next();
1973        let third = if self.current + 2 < self.size {
1974            self.chars[self.current + 2]
1975        } else {
1976            '\0'
1977        };
1978
1979        // Check for three-character operators first
1980        // -|- (Adjacent - PostgreSQL range adjacency)
1981        if c == '-' && next == '|' && third == '-' {
1982            self.advance();
1983            self.advance();
1984            self.advance();
1985            return Some(TokenType::Adjacent);
1986        }
1987
1988        // ||/ (Cube root - PostgreSQL)
1989        if c == '|' && next == '|' && third == '/' {
1990            self.advance();
1991            self.advance();
1992            self.advance();
1993            return Some(TokenType::DPipeSlash);
1994        }
1995
1996        // #>> (JSONB path text extraction - PostgreSQL)
1997        if c == '#' && next == '>' && third == '>' {
1998            self.advance();
1999            self.advance();
2000            self.advance();
2001            return Some(TokenType::DHashArrow);
2002        }
2003
2004        // ->> (JSON text extraction - PostgreSQL/MySQL)
2005        if c == '-' && next == '>' && third == '>' {
2006            self.advance();
2007            self.advance();
2008            self.advance();
2009            return Some(TokenType::DArrow);
2010        }
2011
2012        // <=> (NULL-safe equality - MySQL)
2013        if c == '<' && next == '=' && third == '>' {
2014            self.advance();
2015            self.advance();
2016            self.advance();
2017            return Some(TokenType::NullsafeEq);
2018        }
2019
2020        // <-> (Distance operator - PostgreSQL)
2021        if c == '<' && next == '-' && third == '>' {
2022            self.advance();
2023            self.advance();
2024            self.advance();
2025            return Some(TokenType::LrArrow);
2026        }
2027
2028        // <@ (Contained by - PostgreSQL)
2029        if c == '<' && next == '@' {
2030            self.advance();
2031            self.advance();
2032            return Some(TokenType::LtAt);
2033        }
2034
2035        // @> (Contains - PostgreSQL)
2036        if c == '@' && next == '>' {
2037            self.advance();
2038            self.advance();
2039            return Some(TokenType::AtGt);
2040        }
2041
2042        // ~~~ (Glob - PostgreSQL)
2043        if c == '~' && next == '~' && third == '~' {
2044            self.advance();
2045            self.advance();
2046            self.advance();
2047            return Some(TokenType::Glob);
2048        }
2049
2050        // ~~* (ILike - PostgreSQL)
2051        if c == '~' && next == '~' && third == '*' {
2052            self.advance();
2053            self.advance();
2054            self.advance();
2055            return Some(TokenType::ILike);
2056        }
2057
2058        // !~~* (Not ILike - PostgreSQL)
2059        let fourth = if self.current + 3 < self.size {
2060            self.chars[self.current + 3]
2061        } else {
2062            '\0'
2063        };
2064        if c == '!' && next == '~' && third == '~' && fourth == '*' {
2065            self.advance();
2066            self.advance();
2067            self.advance();
2068            self.advance();
2069            return Some(TokenType::NotILike);
2070        }
2071
2072        // !~~ (Not Like - PostgreSQL)
2073        if c == '!' && next == '~' && third == '~' {
2074            self.advance();
2075            self.advance();
2076            self.advance();
2077            return Some(TokenType::NotLike);
2078        }
2079
2080        // !~* (Not Regexp ILike - PostgreSQL)
2081        if c == '!' && next == '~' && third == '*' {
2082            self.advance();
2083            self.advance();
2084            self.advance();
2085            return Some(TokenType::NotIRLike);
2086        }
2087
2088        // !:> (Not cast / Try cast - SingleStore)
2089        if c == '!' && next == ':' && third == '>' {
2090            self.advance();
2091            self.advance();
2092            self.advance();
2093            return Some(TokenType::NColonGt);
2094        }
2095
2096        // ?:: (TRY_CAST shorthand - Databricks)
2097        if c == '?' && next == ':' && third == ':' {
2098            self.advance();
2099            self.advance();
2100            self.advance();
2101            return Some(TokenType::QDColon);
2102        }
2103
2104        // !~ (Not Regexp - PostgreSQL)
2105        if c == '!' && next == '~' {
2106            self.advance();
2107            self.advance();
2108            return Some(TokenType::NotRLike);
2109        }
2110
2111        // ~~ (Like - PostgreSQL)
2112        if c == '~' && next == '~' {
2113            self.advance();
2114            self.advance();
2115            return Some(TokenType::Like);
2116        }
2117
2118        // ~* (Regexp ILike - PostgreSQL)
2119        if c == '~' && next == '*' {
2120            self.advance();
2121            self.advance();
2122            return Some(TokenType::IRLike);
2123        }
2124
2125        // SingleStore three-character JSON path operators (must be checked before :: two-char)
2126        // ::$ (JSON extract string), ::% (JSON extract double), ::? (JSON match)
2127        if c == ':' && next == ':' && third == '$' {
2128            self.advance();
2129            self.advance();
2130            self.advance();
2131            return Some(TokenType::DColonDollar);
2132        }
2133        if c == ':' && next == ':' && third == '%' {
2134            self.advance();
2135            self.advance();
2136            self.advance();
2137            return Some(TokenType::DColonPercent);
2138        }
2139        if c == ':' && next == ':' && third == '?' {
2140            self.advance();
2141            self.advance();
2142            self.advance();
2143            return Some(TokenType::DColonQMark);
2144        }
2145
2146        // Two-character operators
2147        let token_type = match (c, next) {
2148            ('.', ':') => Some(TokenType::DotColon),
2149            ('=', '=') => Some(TokenType::Eq), // Hive/Spark == equality operator
2150            ('<', '=') => Some(TokenType::Lte),
2151            ('>', '=') => Some(TokenType::Gte),
2152            ('!', '=') => Some(TokenType::Neq),
2153            ('<', '>') => Some(TokenType::Neq),
2154            ('^', '=') => Some(TokenType::Neq),
2155            ('<', '<') => Some(TokenType::LtLt),
2156            ('>', '>') => Some(TokenType::GtGt),
2157            ('|', '|') => Some(TokenType::DPipe),
2158            ('|', '/') => Some(TokenType::PipeSlash), // Square root - PostgreSQL
2159            (':', ':') => Some(TokenType::DColon),
2160            (':', '=') => Some(TokenType::ColonEq), // := (assignment, named args)
2161            (':', '>') => Some(TokenType::ColonGt), // ::> (TSQL)
2162            ('-', '>') => Some(TokenType::Arrow),   // JSON object access
2163            ('=', '>') => Some(TokenType::FArrow),  // Fat arrow (lambda)
2164            ('&', '&') => Some(TokenType::DAmp),
2165            ('&', '<') => Some(TokenType::AmpLt), // PostgreSQL range operator
2166            ('&', '>') => Some(TokenType::AmpGt), // PostgreSQL range operator
2167            ('@', '@') => Some(TokenType::AtAt),  // Text search match
2168            ('?', '|') => Some(TokenType::QMarkPipe), // JSONB contains any key
2169            ('?', '&') => Some(TokenType::QMarkAmp), // JSONB contains all keys
2170            ('?', '?') => Some(TokenType::DQMark), // Double question mark
2171            ('#', '>') => Some(TokenType::HashArrow), // JSONB path extraction
2172            ('#', '-') => Some(TokenType::HashDash), // JSONB delete
2173            ('^', '@') => Some(TokenType::CaretAt), // PostgreSQL starts-with operator
2174            ('*', '*') => Some(TokenType::DStar), // Power operator
2175            ('|', '>') => Some(TokenType::PipeGt), // Pipe-greater (some dialects)
2176            _ => None,
2177        };
2178
2179        if token_type.is_some() {
2180            self.advance();
2181            self.advance();
2182        }
2183
2184        token_type
2185    }
2186
    /// Scan a single-quoted string literal into a `TokenType::String` token
    /// whose text is the *unescaped* value (delimiters stripped, escape
    /// sequences resolved).
    ///
    /// Doubled quotes (`''`) always encode one literal quote. Backslash
    /// escapes are processed only when the dialect lists `\` in
    /// `config.string_escapes`; otherwise backslashes pass through verbatim.
    ///
    /// # Errors
    /// Returns a tokenize error when the closing quote is never found.
    fn scan_string(&mut self) -> Result<()> {
        self.advance(); // Opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // Escaped quote
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                // Handle escape sequences
                self.advance(); // Consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
                        'a' => value.push('\x07'), // Alert/bell
                        'b' => value.push('\x08'), // Backspace
                        'f' => value.push('\x0C'), // Form feed
                        'v' => value.push('\x0B'), // Vertical tab
                        'x' => {
                            // Hex escape: \xNN (exactly 2 hex digits)
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    // NOTE(review): \xNN for NN > 0x7F yields the
                                    // Unicode code point U+0080..U+00FF (Latin-1),
                                    // not a raw byte — confirm this is intended.
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                // Not enough hex digits, preserve literally
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            // MySQL: \% in LIKE patterns
                            value.push('%');
                        }
                        '_' => {
                            // MySQL: \_ in LIKE patterns
                            value.push('_');
                        }
                        // For unrecognized escape sequences:
                        // If escape_follow_chars is set, only preserve backslash for chars in that list
                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
                        _ => {
                            if !self.config.escape_follow_chars.is_empty() {
                                // MySQL-style: discard backslash for unrecognized escapes
                                value.push(escaped);
                            } else {
                                // Standard: preserve backslash + char
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // Closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2283
    /// Scan a double-quoted string (for dialects like BigQuery where " is a string delimiter)
    ///
    /// Mirrors `scan_string` exactly except that the delimiter is `"` and
    /// the doubled-quote escape is `""`. The emitted token is a plain
    /// `TokenType::String` with the unescaped value as its text.
    ///
    /// # Errors
    /// Returns a tokenize error when the closing quote is never found.
    fn scan_double_quoted_string(&mut self) -> Result<()> {
        self.advance(); // Opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Escaped quote
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                // Handle escape sequences
                self.advance(); // Consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // Ctrl+Z (MySQL)
                        'a' => value.push('\x07'), // Alert/bell
                        'b' => value.push('\x08'), // Backspace
                        'f' => value.push('\x0C'), // Form feed
                        'v' => value.push('\x0B'), // Vertical tab
                        'x' => {
                            // Hex escape: \xNN (exactly 2 hex digits)
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    // NOTE(review): \xNN above 0x7F maps to the
                                    // Unicode code point (Latin-1), not a raw
                                    // byte — confirm this is intended.
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                // Not enough hex digits, preserve literally
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            // MySQL: \% in LIKE patterns
                            value.push('%');
                        }
                        '_' => {
                            // MySQL: \_ in LIKE patterns
                            value.push('_');
                        }
                        // For unrecognized escape sequences:
                        // If escape_follow_chars is set, only preserve backslash for chars in that list
                        // Otherwise (empty list), preserve backslash + char for unrecognized escapes
                        _ => {
                            if !self.config.escape_follow_chars.is_empty() {
                                // MySQL-style: discard backslash for unrecognized escapes
                                value.push(escaped);
                            } else {
                                // Standard: preserve backslash + char
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // Closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2381
2382    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2383        // Advance past the three opening quotes
2384        self.advance();
2385        self.advance();
2386        self.advance();
2387        let mut value = String::new();
2388
2389        while !self.is_at_end() {
2390            // Check for closing triple quote
2391            if self.peek() == quote_char
2392                && self.current + 1 < self.size
2393                && self.chars[self.current + 1] == quote_char
2394                && self.current + 2 < self.size
2395                && self.chars[self.current + 2] == quote_char
2396            {
2397                // Found closing """
2398                break;
2399            }
2400            value.push(self.advance());
2401        }
2402
2403        if self.is_at_end() {
2404            return Err(Error::tokenize(
2405                "Unterminated triple-quoted string",
2406                self.line,
2407                self.column,
2408            ));
2409        }
2410
2411        // Advance past the three closing quotes
2412        self.advance();
2413        self.advance();
2414        self.advance();
2415        let token_type = if quote_char == '"' {
2416            TokenType::TripleDoubleQuotedString
2417        } else {
2418            TokenType::TripleSingleQuotedString
2419        };
2420        self.add_token_with_text(token_type, value);
2421        Ok(())
2422    }
2423
2424    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2425        self.advance(); // Opening quote
2426        let mut value = String::new();
2427
2428        loop {
2429            if self.is_at_end() {
2430                return Err(Error::tokenize(
2431                    "Unterminated identifier",
2432                    self.line,
2433                    self.column,
2434                ));
2435            }
2436            if self.peek() == end_quote {
2437                if self.peek_next() == end_quote {
2438                    // Escaped quote (e.g., "" inside "x""y") -> store single quote
2439                    value.push(end_quote);
2440                    self.advance(); // skip first quote
2441                    self.advance(); // skip second quote
2442                } else {
2443                    // End of identifier
2444                    break;
2445                }
2446            } else {
2447                value.push(self.peek());
2448                self.advance();
2449            }
2450        }
2451
2452        self.advance(); // Closing quote
2453        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2454        Ok(())
2455    }
2456
2457    /// Scan a string delimited by Unicode curly single quotes (U+2018/U+2019).
2458    /// Content between curly quotes is literal (no escape processing).
2459    /// When opened with \u{2018} (left), close with \u{2019} (right) only.
2460    /// When opened with \u{2019} (right), close with \u{2019} (right) — self-closing.
2461    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2462        self.advance(); // Opening curly quote
2463        let start = self.current;
2464        // Determine closing quote: left opens -> right closes; right opens -> right closes
2465        let close_quote = if open_quote == '\u{2018}' {
2466            '\u{2019}' // left opens, right closes
2467        } else {
2468            '\u{2019}' // right quote also closes with right quote
2469        };
2470        while !self.is_at_end() && self.peek() != close_quote {
2471            self.advance();
2472        }
2473        let value = self.text_from_range(start, self.current);
2474        if !self.is_at_end() {
2475            self.advance(); // Closing quote
2476        }
2477        self.add_token_with_text(TokenType::String, value);
2478        Ok(())
2479    }
2480
2481    /// Scan an identifier delimited by Unicode curly double quotes (U+201C/U+201D).
2482    /// When opened with \u{201C} (left), close with \u{201D} (right) only.
2483    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2484        self.advance(); // Opening curly quote
2485        let start = self.current;
2486        let close_quote = if open_quote == '\u{201C}' {
2487            '\u{201D}' // left opens, right closes
2488        } else {
2489            '\u{201D}' // right also closes with right
2490        };
2491        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2492            self.advance();
2493        }
2494        let value = self.text_from_range(start, self.current);
2495        if !self.is_at_end() {
2496            self.advance(); // Closing quote
2497        }
2498        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2499        Ok(())
2500    }
2501
    /// Scan a numeric literal starting at a digit.
    ///
    /// Handles, in this order:
    /// - `0x`/`0X` hex literals when `config.hex_number_strings` is set,
    ///   including hex floats (`0xA.Bp2`) and `_` digit separators; whether a
    ///   plain hex literal becomes a `HexNumber` (integer) or a `HexString`
    ///   (blob) is decided by `config.hex_string_is_integer_type`;
    /// - decimal digits with `_` separators, an optional fractional part
    ///   (a trailing dot such as `1.` is consumed, but `..` is left for the
    ///   range operator), and an optional `e`/`E` exponent;
    /// - dialect numeric type suffixes from `config.numeric_literals`
    ///   (e.g. Hive `1L`), emitted as one token whose text is `number::TYPE`
    ///   so the parser can split it;
    /// - digit-leading identifiers (e.g. `1a`) when
    ///   `config.identifiers_can_start_with_digit` is set.
    fn scan_number(&mut self) -> Result<()> {
        // Check for 0x/0X hex number prefix (SQLite-style)
        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
            let next = if self.current + 1 < self.size {
                self.chars[self.current + 1]
            } else {
                '\0'
            };
            if next == 'x' || next == 'X' {
                // Advance past '0' and 'x'/'X'
                self.advance();
                self.advance();
                // Collect hex digits (allow underscores as separators, e.g., 0xbad_cafe)
                let hex_start = self.current;
                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
                    // A '_' must be followed by another hex digit to count as a separator.
                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
                        break;
                    }
                    self.advance();
                }
                if self.current > hex_start {
                    // Check for hex float: 0xABC.DEFpEXP or 0xABCpEXP
                    let mut is_hex_float = false;
                    // Optional fractional part: .hexdigits
                    if !self.is_at_end() && self.peek() == '.' {
                        let after_dot = if self.current + 1 < self.size {
                            self.chars[self.current + 1]
                        } else {
                            '\0'
                        };
                        if after_dot.is_ascii_hexdigit() {
                            is_hex_float = true;
                            self.advance(); // consume '.'
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                self.advance();
                            }
                        }
                    }
                    // Optional binary exponent: p/P [+/-] digits
                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
                        is_hex_float = true;
                        self.advance(); // consume p/P
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
                            self.advance();
                        }
                        while !self.is_at_end() && self.peek().is_ascii_digit() {
                            self.advance();
                        }
                    }
                    if is_hex_float {
                        // Hex float literal — emit as regular Number token with full text
                        let full_text = self.text_from_range(self.start, self.current);
                        self.add_token_with_text(TokenType::Number, full_text);
                    } else if self.config.hex_string_is_integer_type {
                        // BigQuery/ClickHouse: 0xA represents an integer in hex notation
                        let hex_value = self.text_from_range(hex_start, self.current);
                        self.add_token_with_text(TokenType::HexNumber, hex_value);
                    } else {
                        // SQLite/Teradata: 0xCC represents a binary/blob hex string
                        let hex_value = self.text_from_range(hex_start, self.current);
                        self.add_token_with_text(TokenType::HexString, hex_value);
                    }
                    return Ok(());
                }
                // No hex digits after 0x - fall through to normal number parsing
                // (reset current back to after '0')
                // NOTE(review): this rewinds `current` without touching whatever
                // line/column bookkeeping advance() performed for the consumed
                // 'x'/'X' — confirm column tracking stays consistent here.
                self.current = self.start + 1;
            }
        }

        // Allow underscores as digit separators (e.g., 20_000, 1_000_000)
        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            // Don't allow underscore at the end (must be followed by digit)
            if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
                break;
            }
            self.advance();
        }

        // Look for decimal part - allow trailing dot (e.g., "1.")
        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
        // So we always consume the dot as part of the number, even if followed by an identifier
        if self.peek() == '.' {
            let next = self.peek_next();
            // Only consume the dot if:
            // 1. Followed by a digit (normal decimal like 1.5)
            // 2. Followed by an identifier start (like 1.x -> becomes 1. with alias x)
            // 3. End of input or other non-dot character (trailing decimal like "1.")
            // Do NOT consume if it's a double dot (..) which is a range operator
            if next != '.' {
                self.advance(); // consume the .
                                // Only consume digits after the decimal point (not identifiers)
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                        break;
                    }
                    self.advance();
                }
            }
        }

        // Look for exponent
        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let text = self.text_from_range(self.start, self.current);

        // Check for numeric literal suffixes (e.g., 1L -> BIGINT, 1s -> SMALLINT in Hive/Spark)
        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
            let next_char = self.peek().to_uppercase().to_string();
            // Try 2-char suffix first (e.g., "BD"), then 1-char
            let suffix_match = if self.current + 1 < self.size {
                let two_char: String = vec![self.chars[self.current], self.chars[self.current + 1]]
                    .iter()
                    .collect::<String>()
                    .to_uppercase();
                if self.config.numeric_literals.contains_key(&two_char) {
                    // Make sure the 2-char suffix is not followed by more identifier chars
                    let after_suffix = if self.current + 2 < self.size {
                        self.chars[self.current + 2]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((two_char, 2))
                    } else {
                        None
                    }
                } else if self.config.numeric_literals.contains_key(&next_char) {
                    // 1-char suffix - make sure not followed by more identifier chars
                    let after_suffix = if self.current + 1 < self.size {
                        self.chars[self.current + 1]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((next_char, 1))
                    } else {
                        None
                    }
                } else {
                    None
                }
            } else if self.config.numeric_literals.contains_key(&next_char) {
                // At end of input, 1-char suffix
                Some((next_char, 1))
            } else {
                None
            };

            if let Some((suffix, len)) = suffix_match {
                // Consume the suffix characters
                for _ in 0..len {
                    self.advance();
                }
                // Emit as a special number-with-suffix token
                // We'll encode as "number::TYPE" so the parser can split it
                let type_name = self
                    .config
                    .numeric_literals
                    .get(&suffix)
                    .expect("suffix verified by contains_key above")
                    .clone();
                let combined = format!("{}::{}", text, type_name);
                self.add_token_with_text(TokenType::Number, combined);
                return Ok(());
            }
        }

        // Check for identifiers that start with a digit (e.g., 1a, 1_a, 1a_1a)
        // In Hive/Spark/MySQL/ClickHouse, these are valid unquoted identifiers
        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
            let next = self.peek();
            if next.is_alphabetic() || next == '_' {
                // Continue scanning as an identifier
                while !self.is_at_end() {
                    let ch = self.peek();
                    if ch.is_alphanumeric() || ch == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                let ident_text = self.text_from_range(self.start, self.current);
                self.add_token_with_text(TokenType::Identifier, ident_text);
                return Ok(());
            }
        }

        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }
2704
2705    /// Scan a number that starts with a dot (e.g., .25, .5, .123e10)
2706    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2707        // Consume the leading dot
2708        self.advance();
2709
2710        // Consume the fractional digits
2711        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2712            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2713                break;
2714            }
2715            self.advance();
2716        }
2717
2718        // Look for exponent
2719        if self.peek() == 'e' || self.peek() == 'E' {
2720            self.advance();
2721            if self.peek() == '+' || self.peek() == '-' {
2722                self.advance();
2723            }
2724            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2725                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2726                    break;
2727                }
2728                self.advance();
2729            }
2730        }
2731
2732        let text = self.text_from_range(self.start, self.current);
2733        self.add_token_with_text(TokenType::Number, text);
2734        Ok(())
2735    }
2736
    /// Scan an unquoted word and classify it.
    ///
    /// The word may turn out to be:
    /// - a keyword (looked up case-insensitively in `config.keywords`),
    ///   otherwise a `Var`;
    /// - `NOT=` → a single `Neq` token;
    /// - a string-literal prefix (`R`, `N`, `E`, `X`, `B`, `U&`) immediately
    ///   followed by a quote, in which case the whole literal is scanned and
    ///   emitted here.
    ///
    /// Returns an error for a character that can start neither an identifier
    /// nor a number (prevents infinite loops in the caller).
    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
        // Guard against unrecognized characters that could cause infinite loops
        let first_char = self.peek();
        if !first_char.is_alphanumeric() && first_char != '_' {
            // Unknown character - skip it and return an error
            let c = self.advance();
            return Err(Error::tokenize(
                format!("Unexpected character: '{}'", c),
                self.line,
                self.column,
            ));
        }

        while !self.is_at_end() {
            let c = self.peek();
            // Allow alphanumeric, underscore, $, # and @ in identifiers
            // PostgreSQL allows $, TSQL allows # and @
            // But stop consuming # if followed by > or >> (PostgreSQL #> and #>> operators)
            if c == '#' {
                let next_c = if self.current + 1 < self.size {
                    self.chars[self.current + 1]
                } else {
                    '\0'
                };
                if next_c == '>' || next_c == '-' {
                    break; // Don't consume # — it's part of #>, #>>, or #- operator
                }
                self.advance();
            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text = self.text_from_range(self.start, self.current);
        let upper = text.to_uppercase();

        // Special-case NOT= (Teradata and other dialects)
        // NOTE(review): this applies in every dialect, not just Teradata —
        // confirm it should not be gated by a config flag.
        if upper == "NOT" && self.peek() == '=' {
            self.advance(); // consume '='
            self.add_token(TokenType::Neq);
            return Ok(());
        }

        // Check for special string prefixes like N'...', X'...', B'...', U&'...', r'...', b'...'
        // Also handle double-quoted variants for dialects that support them (e.g., BigQuery)
        let next_char = self.peek();
        let is_single_quote = next_char == '\'';
        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
        // For raw strings (r"..." or r'...'), we allow double quotes even if " is not in quotes config
        // because raw strings are a special case used in Spark/Databricks where " is for identifiers
        let is_double_quote_for_raw = next_char == '"';

        // Handle raw strings first - they're special because they work with both ' and "
        // even in dialects where " is normally an identifier delimiter (like Databricks)
        if upper == "R" && (is_single_quote || is_double_quote_for_raw) {
            // Raw string r'...' or r"..." or r'''...''' or r"""...""" (BigQuery style)
            // In raw strings, backslashes are treated literally (no escape processing)
            let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the first opening quote

            // Check for triple-quoted raw string (r"""...""" or r'''...''')
            if self.peek() == quote_char && self.peek_next() == quote_char {
                // Triple-quoted raw string
                self.advance(); // consume second quote
                self.advance(); // consume third quote
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            } else {
                let string_value = self.scan_raw_string_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            }
            return Ok(());
        }

        if is_single_quote || is_double_quote {
            match upper.as_str() {
                "N" => {
                    // National string N'...'
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::NationalString, string_value);
                    return Ok(());
                }
                "E" => {
                    // PostgreSQL escape string E'...' or e'...'
                    // Preserve the case by prefixing with "e:" or "E:"
                    // Always use backslash escapes for escape strings (e.g., \' is an escaped quote)
                    let lowercase = text == "e";
                    let prefix = if lowercase { "e:" } else { "E:" };
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content_with_escapes(true)?;
                    self.add_token_with_text(
                        TokenType::EscapeString,
                        format!("{}{}", prefix, string_value),
                    );
                    return Ok(());
                }
                "X" => {
                    // Hex string X'...'
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::HexString, string_value);
                    return Ok(());
                }
                "B" if is_double_quote => {
                    // Byte string b"..." (BigQuery style) - MUST check before single quote B'...'
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_double_quoted_string_content()?;
                    self.add_token_with_text(TokenType::ByteString, string_value);
                    return Ok(());
                }
                "B" if is_single_quote => {
                    // For BigQuery: b'...' is a byte string (bytes data)
                    // For standard SQL: B'...' is a bit string (binary digits)
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content()?;
                    if self.config.b_prefix_is_byte_string {
                        self.add_token_with_text(TokenType::ByteString, string_value);
                    } else {
                        self.add_token_with_text(TokenType::BitString, string_value);
                    }
                    return Ok(());
                }
                _ => {}
            }
        }

        // Check for U&'...' Unicode string syntax (SQL standard)
        if upper == "U"
            && self.peek() == '&'
            && self.current + 1 < self.size
            && self.chars[self.current + 1] == '\''
        {
            self.advance(); // consume '&'
            self.advance(); // consume opening quote
            let string_value = self.scan_string_content()?;
            self.add_token_with_text(TokenType::UnicodeString, string_value);
            return Ok(());
        }

        // Keyword lookup is by uppercased text; unknown words become Var.
        let token_type = self
            .config
            .keywords
            .get(&upper)
            .copied()
            .unwrap_or(TokenType::Var);

        self.add_token_with_text(token_type, text);
        Ok(())
    }
2897
2898    /// Scan string content (everything between quotes)
2899    /// If `force_backslash_escapes` is true, backslash is always treated as an escape character
2900    /// (used for PostgreSQL E'...' escape strings)
2901    fn scan_string_content_with_escapes(
2902        &mut self,
2903        force_backslash_escapes: bool,
2904    ) -> Result<String> {
2905        let mut value = String::new();
2906        let use_backslash_escapes =
2907            force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2908
2909        while !self.is_at_end() {
2910            let c = self.peek();
2911            if c == '\'' {
2912                if self.peek_next() == '\'' {
2913                    // Escaped quote ''
2914                    value.push('\'');
2915                    self.advance();
2916                    self.advance();
2917                } else {
2918                    break;
2919                }
2920            } else if c == '\\' && use_backslash_escapes {
2921                // Preserve escape sequences literally (including \' for escape strings)
2922                value.push(self.advance());
2923                if !self.is_at_end() {
2924                    value.push(self.advance());
2925                }
2926            } else {
2927                value.push(self.advance());
2928            }
2929        }
2930
2931        if self.is_at_end() {
2932            return Err(Error::tokenize(
2933                "Unterminated string",
2934                self.line,
2935                self.column,
2936            ));
2937        }
2938
2939        self.advance(); // Closing quote
2940        Ok(value)
2941    }
2942
    /// Scan string content (everything between quotes)
    ///
    /// Convenience wrapper around `scan_string_content_with_escapes` that
    /// applies backslash escapes only when the dialect configures them.
    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }
2947
    /// Scan double-quoted string content (for dialects like BigQuery where " is a string delimiter)
    /// This is used for prefixed strings like b"..." or N"..."
    ///
    /// The opening quote has already been consumed; this consumes up to and
    /// including the closing `"`. A doubled `""` is an escaped quote. When the
    /// dialect lists `\` among its string escapes, C-style sequences
    /// (\n, \r, \t, \0, \\, \", \', \xNN) are decoded; unrecognized escapes
    /// are kept verbatim as backslash + character.
    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Escaped quote ""
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                // Handle escape sequences
                self.advance(); // Consume backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        '\\' => value.push('\\'),
                        '"' => value.push('"'),
                        '\'' => value.push('\''),
                        'x' => {
                            // Hex escape \xNN - collect hex digits
                            let mut hex = String::new();
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                // NOTE(review): `byte as char` maps values >= 0x80
                                // to code points U+0080..U+00FF (Latin-1), which
                                // re-encode as two UTF-8 bytes in the String —
                                // confirm this is intended for byte-string data.
                                value.push(byte as char);
                            } else {
                                // Invalid hex escape, keep it literal
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        _ => {
                            // For unrecognized escapes, preserve backslash + char
                            value.push('\\');
                            value.push(escaped);
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // Closing quote
        Ok(value)
    }
3018
3019    /// Scan raw string content (limited escape processing for quotes)
3020    /// Used for BigQuery r'...' and r"..." strings
3021    /// In raw strings, backslashes are literal EXCEPT that escape sequences for the
3022    /// quote character still work (e.g., \' in r'...' escapes the quote, '' also works)
3023    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3024        let mut value = String::new();
3025
3026        while !self.is_at_end() {
3027            let c = self.peek();
3028            if c == quote_char {
3029                if self.peek_next() == quote_char {
3030                    // Escaped quote (doubled) - e.g., '' inside r'...'
3031                    value.push(quote_char);
3032                    self.advance();
3033                    self.advance();
3034                } else {
3035                    break;
3036                }
3037            } else if c == '\\'
3038                && self.peek_next() == quote_char
3039                && self.config.string_escapes_allowed_in_raw_strings
3040            {
3041                // Backslash-escaped quote - works in raw strings when string_escapes_allowed_in_raw_strings is true
3042                // e.g., \' inside r'...' becomes literal ' (BigQuery behavior)
3043                // Spark/Databricks has this set to false, so backslash is always literal there
3044                value.push(quote_char);
3045                self.advance(); // consume backslash
3046                self.advance(); // consume quote
3047            } else {
3048                // In raw strings, everything including backslashes is literal
3049                value.push(self.advance());
3050            }
3051        }
3052
3053        if self.is_at_end() {
3054            return Err(Error::tokenize(
3055                "Unterminated raw string",
3056                self.line,
3057                self.column,
3058            ));
3059        }
3060
3061        self.advance(); // Closing quote
3062        Ok(value)
3063    }
3064
3065    /// Scan raw triple-quoted string content (r"""...""" or r'''...''')
3066    /// Terminates when three consecutive quote_chars are found
3067    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3068        let mut value = String::new();
3069
3070        while !self.is_at_end() {
3071            let c = self.peek();
3072            if c == quote_char && self.peek_next() == quote_char {
3073                // Check for third quote
3074                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3075                    // Found three consecutive quotes - end of string
3076                    self.advance(); // first closing quote
3077                    self.advance(); // second closing quote
3078                    self.advance(); // third closing quote
3079                    return Ok(value);
3080                }
3081            }
3082            // In raw strings, everything including backslashes is literal
3083            let ch = self.advance();
3084            value.push(ch);
3085        }
3086
3087        Err(Error::tokenize(
3088            "Unterminated raw triple-quoted string",
3089            self.line,
3090            self.column,
3091        ))
3092    }
3093
3094    /// Scan TSQL identifiers that start with # (temp tables) or @ (variables)
3095    /// Examples: #temp, ##global_temp, @variable
3096    /// Scan an identifier that starts with `$` (ClickHouse).
3097    /// Examples: `$alias$name$`, `$x`
3098    fn scan_dollar_identifier(&mut self) -> Result<()> {
3099        // Consume the leading $
3100        self.advance();
3101
3102        // Consume alphanumeric, _, and $ continuation chars
3103        while !self.is_at_end() {
3104            let c = self.peek();
3105            if c.is_alphanumeric() || c == '_' || c == '$' {
3106                self.advance();
3107            } else {
3108                break;
3109            }
3110        }
3111
3112        let text = self.text_from_range(self.start, self.current);
3113        self.add_token_with_text(TokenType::Var, text);
3114        Ok(())
3115    }
3116
3117    fn scan_tsql_identifier(&mut self) -> Result<()> {
3118        // Consume the leading # or @ (or ##)
3119        let first = self.advance();
3120
3121        // For ##, consume the second #
3122        if first == '#' && self.peek() == '#' {
3123            self.advance();
3124        }
3125
3126        // Now scan the rest of the identifier
3127        while !self.is_at_end() {
3128            let c = self.peek();
3129            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3130                self.advance();
3131            } else {
3132                break;
3133            }
3134        }
3135
3136        let text = self.text_from_range(self.start, self.current);
3137        // These are always identifiers (variables or temp table names), never keywords
3138        self.add_token_with_text(TokenType::Var, text);
3139        Ok(())
3140    }
3141
    /// Check if the last tokens match INSERT ... FORMAT <name> (not VALUES).
    /// If so, consume everything until the next blank line (two consecutive newlines)
    /// or end of input as raw data.
    ///
    /// Returns the trimmed raw payload, or `None` when the recent token tail
    /// does not look like a ClickHouse-style `INSERT ... FORMAT <name>`
    /// clause, or the payload at end of input is empty.
    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
        let len = self.tokens.len();
        if len < 3 {
            return None;
        }

        // Last token should be the format name (Identifier or Var, not VALUES)
        let last = &self.tokens[len - 1];
        if last.text.eq_ignore_ascii_case("VALUES") {
            return None;
        }
        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
            return None;
        }

        // Second-to-last should be FORMAT
        let format_tok = &self.tokens[len - 2];
        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
            return None;
        }

        // Check that there's an INSERT somewhere earlier in the tokens
        // (only the 20 most recent tokens are examined, bounding the scan)
        let has_insert = self.tokens[..len - 2]
            .iter()
            .rev()
            .take(20)
            .any(|t| t.token_type == TokenType::Insert);
        if !has_insert {
            return None;
        }

        // We're in INSERT ... FORMAT <name> context. Consume everything until:
        // - A blank line (two consecutive newlines, possibly with whitespace between)
        // - End of input
        let raw_start = self.current;
        while !self.is_at_end() {
            let c = self.peek();
            if c == '\n' {
                // Check for blank line: \n followed by optional \r and \n
                let saved = self.current;
                self.advance(); // consume first \n
                                // Skip \r if present
                while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                if self.is_at_end() || self.peek() == '\n' {
                    // Found blank line or end of input - stop here
                    // Don't consume the second \n so subsequent SQL can be tokenized
                    // NOTE(review): this path can return Some("") for an empty
                    // payload, while the end-of-input path below returns None —
                    // confirm the inconsistency is intentional.
                    let raw = self.text_from_range(raw_start, saved);
                    return Some(raw.trim().to_string());
                }
                // Not a blank line, continue scanning
            } else {
                self.advance();
            }
        }

        // Reached end of input
        let raw = self.text_from_range(raw_start, self.current);
        let trimmed = raw.trim().to_string();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed)
        }
    }
3211
3212    fn add_token(&mut self, token_type: TokenType) {
3213        let text = self.text_from_range(self.start, self.current);
3214        self.add_token_with_text(token_type, text);
3215    }
3216
3217    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3218        let span = Span::new(self.start, self.current, self.line, self.column);
3219        let mut token = Token::new(token_type, text, span);
3220        token.comments.append(&mut self.comments);
3221        self.tokens.push(token);
3222    }
3223}
3224
#[cfg(test)]
mod tests {
    use super::*;

    /// A minimal statement tokenizes to exactly a keyword and a number.
    #[test]
    fn test_simple_select() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
    }

    /// Bare identifiers tokenize as `Var`; punctuation and keywords keep
    /// their dedicated token types.
    #[test]
    fn test_select_with_identifier() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();

        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Var);
        assert_eq!(tokens[1].text, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Var);
        assert_eq!(tokens[3].text, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Var);
        assert_eq!(tokens[5].text, "t");
    }

    /// Single-quoted strings become `String` tokens with the quotes stripped.
    #[test]
    fn test_string_literal() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "hello");
    }

    /// A doubled single quote inside a string literal is unescaped to a
    /// single quote in the token text.
    #[test]
    fn test_escaped_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "it's");
    }

    /// Line comments do not produce tokens of their own; they are attached
    /// to neighboring tokens instead.
    #[test]
    fn test_comments() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();

        assert_eq!(tokens.len(), 2);
        // Comments are attached to the PREVIOUS token as trailing_comments
        // This is better for round-trip fidelity (e.g., SELECT c /* comment */ FROM)
        assert_eq!(tokens[0].trailing_comments.len(), 1);
        assert_eq!(tokens[0].trailing_comments[0], " comment");
    }

    /// Full parse/generate round trip: comments interleaved with AND chains
    /// re-emit as block comments after each AND operator.
    #[test]
    fn test_comment_in_and_chain() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // Line comments between AND clauses should appear after the AND operator
        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
        let ast = Parser::parse_sql(sql).unwrap();
        let mut gen = Generator::default();
        let output = gen.generate(&ast[0]).unwrap();
        assert_eq!(
            output,
            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
        );
    }

    /// Arithmetic operators get dedicated token types (`Plus`, `Star`).
    #[test]
    fn test_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();

        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[1].token_type, TokenType::Plus);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[3].token_type, TokenType::Star);
        assert_eq!(tokens[4].token_type, TokenType::Number);
    }

    /// Two-character comparison operators tokenize as single tokens.
    #[test]
    fn test_comparison_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();

        assert_eq!(tokens[1].token_type, TokenType::Lte);
        assert_eq!(tokens[3].token_type, TokenType::Gte);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    /// `N'...'` is one NationalString token (prefix and quotes stripped),
    /// not an identifier `N` followed by a string.
    #[test]
    fn test_national_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("N'abc'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for N'abc', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].text, "abc");
    }

    /// `X'...'` is one HexString token with only the hex digits as text.
    #[test]
    fn test_hex_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for X'ABCD', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::HexString);
        assert_eq!(tokens[0].text, "ABCD");
    }

    /// `B'...'` is one BitString token with only the binary digits as text.
    #[test]
    fn test_bit_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("B'01010'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for B'01010', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::BitString);
        assert_eq!(tokens[0].text, "01010");
    }

    /// Numbers with a trailing dot: `1.` is a single Number token, `1.a` is
    /// a Number followed by a Var, and `1..2` keeps both dots as Dot tokens.
    #[test]
    fn test_trailing_dot_number() {
        let tokenizer = Tokenizer::default();

        // Test trailing dot
        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
        assert_eq!(
            tokens.len(),
            2,
            "Expected 2 tokens for 'SELECT 1.', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");

        // Test normal decimal
        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
        assert_eq!(tokens[1].text, "1.5");

        // Test number followed by dot and identifier
        // In PostgreSQL (and sqlglot), "1.x" parses as float "1." with alias "x"
        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");
        assert_eq!(tokens[2].token_type, TokenType::Var);

        // Test two dots (range operator) - dot is NOT consumed when followed by another dot
        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
        assert_eq!(tokens[2].token_type, TokenType::Dot);
        assert_eq!(tokens[3].token_type, TokenType::Dot);
        assert_eq!(tokens[4].token_type, TokenType::Number);
        assert_eq!(tokens[4].text, "2");
    }

    /// Numbers with a leading dot (`.25`, `.5e10`) are single Number tokens,
    /// while a dot between identifiers (`a.b`) stays a Dot token.
    #[test]
    fn test_leading_dot_number() {
        let tokenizer = Tokenizer::default();

        // Test leading dot number (e.g., .25 for 0.25)
        let tokens = tokenizer.tokenize(".25").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.25', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".25");

        // Test leading dot in context (Oracle SAMPLE clause)
        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
        assert_eq!(
            tokens.len(),
            4,
            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Sample);
        assert_eq!(tokens[1].token_type, TokenType::LParen);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[2].text, ".25");
        assert_eq!(tokens[3].token_type, TokenType::RParen);

        // Test leading dot with exponent
        let tokens = tokenizer.tokenize(".5e10").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.5e10', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".5e10");

        // Test that plain dot is still a Dot token
        let tokens = tokenizer.tokenize("a.b").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'a.b', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Dot);
    }

    /// Unicode curly quotes are accepted as string delimiters, but other
    /// unsupported characters still produce a tokenizer error.
    #[test]
    fn test_unrecognized_character() {
        let tokenizer = Tokenizer::default();

        // Unicode curly quotes are now handled as string delimiters
        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
        assert!(
            result.is_ok(),
            "Curly quotes should be tokenized as strings"
        );

        // Unicode bullet character should still error
        let result = tokenizer.tokenize("SELECT • FROM t");
        assert!(result.is_err());
    }

    /// `:=` is one ColonEq token; `:` alone stays Colon and `::` stays DColon.
    #[test]
    fn test_colon_eq_tokenization() {
        let tokenizer = Tokenizer::default();

        // := should be a single ColonEq token
        let tokens = tokenizer.tokenize("a := 1").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].token_type, TokenType::Var);
        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
        assert_eq!(tokens[2].token_type, TokenType::Number);

        // : followed by non-= should still be Colon
        let tokens = tokenizer.tokenize("a:b").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));

        // :: should still be DColon
        let tokens = tokenizer.tokenize("a::INT").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
    }

    /// Parse/generate round trips for the `:=` operator: MySQL user-variable
    /// assignment, named function arguments, and DuckDB prefix aliases.
    #[test]
    fn test_colon_eq_parsing() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // MySQL @var := value in SELECT
        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
            .expect("Failed to parse MySQL @var := expr");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := 1, @var2");

        // MySQL @var := @var in SELECT
        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
            .expect("Failed to parse MySQL @var2 := @var1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1, @var2 := @var1");

        // MySQL @var := COUNT(*)
        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
            .expect("Failed to parse MySQL @var := COUNT(*)");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");

        // MySQL SET @var := 1 (should normalize to = in output)
        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SET @var1 = 1");

        // Function named args with :=
        let ast =
            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "UNION_VALUE(k1 := 1)");

        // UNNEST with recursive := TRUE
        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
            .expect("Failed to parse UNNEST with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");

        // DuckDB prefix alias: foo: 1 means 1 AS foo
        let ast =
            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo");

        // DuckDB prefix alias with multiple columns
        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
            .expect("Failed to parse DuckDB multiple prefix aliases");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
    }

    /// Dialect-aware round trips for `:=` through the full
    /// parse -> transform -> generate pipeline (MySQL and DuckDB).
    #[test]
    fn test_colon_eq_dialect_roundtrip() {
        use crate::dialects::{Dialect, DialectType};

        // Round-trip `sql` through the given dialect; `expected` of None
        // means the output must equal the input verbatim.
        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
            let d = Dialect::get(dialect);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // MySQL := tests
        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

        // DuckDB := tests
        check(
            DialectType::DuckDB,
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
            None,
        );
        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

        // STRUCT_PACK(a := 'b')::json should at least parse without error
        // (The STRUCT_PACK -> Struct transformation is a separate feature)
        {
            let d = Dialect::get(DialectType::DuckDB);
            let ast = d
                .parse("STRUCT_PACK(a := 'b')::json")
                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
        }

        // DuckDB prefix alias tests
        check(
            DialectType::DuckDB,
            "SELECT foo: 1",
            Some("SELECT 1 AS foo"),
        );
        check(
            DialectType::DuckDB,
            "SELECT foo: 1, bar: 2, baz: 3",
            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
        );
    }

    /// Comment round-trip fidelity: a battery of SQL strings with comments in
    /// varied positions must regenerate byte-for-byte. Failures are collected
    /// and reported together so one mismatch doesn't hide the rest.
    #[test]
    fn test_comment_roundtrip() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // Returns None on an exact round trip, or Some(description) on any
        // parse/generate error or output mismatch.
        fn check_roundtrip(sql: &str) -> Option<String> {
            let ast = match Parser::parse_sql(sql) {
                Ok(a) => a,
                Err(e) => return Some(format!("Parse error: {:?}", e)),
            };
            if ast.is_empty() {
                return Some("Empty AST".to_string());
            }
            let mut generator = Generator::default();
            let output = match generator.generate(&ast[0]) {
                Ok(o) => o,
                Err(e) => return Some(format!("Gen error: {:?}", e)),
            };
            if output == sql {
                None
            } else {
                Some(format!(
                    "Mismatch:\n  input:  {}\n  output: {}",
                    sql, output
                ))
            }
        }

        let tests = vec![
            // Nested comments
            "SELECT c /* c1 /* c2 */ c3 */",
            "SELECT c /* c1 /* c2 /* c3 */ */ */",
            // Simple alias with comments
            "SELECT c /* c1 */ AS alias /* c2 */",
            // Multiple columns with comments
            "SELECT a /* x */, b /* x */",
            // Multiple comments after column
            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
            // FROM tables with comments
            "SELECT * FROM foo /* x */, bla /* x */",
            // Arithmetic with comments
            "SELECT 1 /* comment */ + 1",
            "SELECT 1 /* c1 */ + 2 /* c2 */",
            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
            // CAST with comments
            "SELECT CAST(x AS INT) /* comment */ FROM foo",
            // Function arguments with comments
            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
            // Multi-part table names with comments
            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
            // INSERT with comments
            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
            // Leading comments on statements
            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
            "/* comment */ CREATE TABLE foo AS SELECT 1",
            // Trailing comments on statements
            "INSERT INTO foo SELECT * FROM bar /* comment */",
            // Complex nested expressions with comments
            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
        ];

        let mut failures = Vec::new();
        for sql in tests {
            if let Some(e) = check_roundtrip(sql) {
                failures.push(e);
            }
        }

        if !failures.is_empty() {
            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
        }
    }

    /// Dollar-quoted strings: the `parse_dollar_string_token` helper splits
    /// tag/content on the '\x00' separator, and $$/$TAG$ heredocs round-trip
    /// through the Databricks dialect.
    #[test]
    fn test_dollar_quoted_string_parsing() {
        use crate::dialects::{Dialect, DialectType};

        // Test dollar string token parsing utility function
        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
        assert_eq!(tag, Some("FOO".to_string()));
        assert_eq!(content, "content here");

        let (tag, content) = super::parse_dollar_string_token("just content");
        assert_eq!(tag, None);
        assert_eq!(content, "just content");

        // Test roundtrip for Databricks dialect with dollar-quoted function body
        fn check_databricks(sql: &str, expected: Option<&str>) {
            let d = Dialect::get(DialectType::Databricks);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // Test [42]: $$...$$ heredoc
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n  return x+1$$",
            None
        );

        // Test [43]: $FOO$...$FOO$ tagged heredoc
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n  return x+1$FOO$",
            None
        );
    }
}