1use crate::error::{Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::fmt;
10use std::sync::LazyLock;
11#[cfg(feature = "bindings")]
12use ts_rs::TS;
13
/// Splits a dollar-string token's text into its optional tag and content.
///
/// The tokenizer encodes a tagged dollar-quoted string as `tag\0content`,
/// using a NUL byte as the separator. If a NUL is present, everything
/// before the first NUL is the tag and everything after it is the content;
/// otherwise the whole input is the content and the tag is `None`.
///
/// Note: an input beginning with NUL yields `Some("")` — an empty tag —
/// which is distinct from the untagged `None` case.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    // `split_once` splits on the first NUL only, so content may itself
    // contain further NUL bytes.
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_string()), content.to_string()),
        None => (None, text.to_string()),
    }
}
26
/// Source location of a token: byte offsets plus a line/column position.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct Span {
    // Byte offset where the token starts.
    pub start: usize,
    // Byte offset where the token ends — presumably exclusive; confirm
    // against the tokenizer that produces these spans.
    pub end: usize,
    // Line of the token; whether it is 0- or 1-based is not established
    // here — confirm at call sites.
    pub line: usize,
    // Column of the token; base (0 or 1) not established here — confirm
    // at call sites.
    pub column: usize,
}
40
41impl Span {
42 pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
43 Self {
44 start,
45 end,
46 line,
47 column,
48 }
49 }
50}
51
52#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
/// A single lexical token: its kind, raw text, source span, and any
/// comments the tokenizer attached to it.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    // The kind of token (see `TokenType`).
    pub token_type: TokenType,
    // The raw token text as it appeared in the source.
    pub text: String,
    // Where in the source this token was found.
    pub span: Span,
    // Comments attached to the token (leading, by contrast with
    // `trailing_comments`); defaults to empty when deserializing.
    #[serde(default)]
    pub comments: Vec<String>,
    // Comments attached after the token; defaults to empty when
    // deserializing.
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
68
69impl Token {
70 pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
72 Self {
73 token_type,
74 text: text.into(),
75 span,
76 comments: Vec::new(),
77 trailing_comments: Vec::new(),
78 }
79 }
80
81 pub fn number(n: i64) -> Self {
83 Self::new(TokenType::Number, n.to_string(), Span::default())
84 }
85
86 pub fn string(s: impl Into<String>) -> Self {
88 Self::new(TokenType::String, s, Span::default())
89 }
90
91 pub fn identifier(s: impl Into<String>) -> Self {
93 Self::new(TokenType::Identifier, s, Span::default())
94 }
95
96 pub fn var(s: impl Into<String>) -> Self {
98 Self::new(TokenType::Var, s, Span::default())
99 }
100
101 pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
103 self.comments.push(comment.into());
104 self
105 }
106}
107
108impl fmt::Display for Token {
109 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110 write!(f, "{:?}({})", self.token_type, self.text)
111 }
112}
113
/// Every kind of token the tokenizer can emit: punctuation and operators,
/// structural markers, literal/string kinds, data-type names, and keywords.
///
/// NOTE(review): `#[repr(u16)]` means declaration order fixes each
/// variant's discriminant — do not reorder or insert variants mid-list
/// without checking consumers of the numeric values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation and operator tokens.
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt,
    GtGt,
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    // Structural / tokenizer-internal markers.
    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments, strings, identifiers, and literal tokens.
    BlockComment,
    LineComment,
    String,
    DollarString,
    TripleDoubleQuotedString,
    TripleSingleQuotedString,
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    HexNumber,
    ByteString,
    NationalString,
    EscapeString,
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data-type tokens.
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keyword tokens.
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,
    NotILike,
    NotRLike,
    NotIRLike,
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window-frame keywords.
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // DDL, trigger, and sequence keywords.
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // End of input.
    Eof,
}
670
impl TokenType {
    /// Reports whether this token type is a SQL keyword (as opposed to
    /// punctuation, an operator, or a literal token).
    ///
    /// NOTE(review): this list is maintained by hand and does not cover
    /// every keyword-like variant of `TokenType`; confirm membership
    /// before relying on it for a newly added variant.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            TokenType::Select
                | TokenType::From
                | TokenType::Where
                | TokenType::And
                | TokenType::Or
                | TokenType::Not
                | TokenType::In
                | TokenType::Is
                | TokenType::Null
                | TokenType::True
                | TokenType::False
                | TokenType::As
                | TokenType::On
                | TokenType::Join
                | TokenType::Left
                | TokenType::Right
                | TokenType::Inner
                | TokenType::Outer
                | TokenType::Full
                | TokenType::Cross
                | TokenType::Semi
                | TokenType::Anti
                | TokenType::Union
                | TokenType::Except
                | TokenType::Intersect
                | TokenType::GroupBy
                | TokenType::OrderBy
                | TokenType::Having
                | TokenType::Limit
                | TokenType::Offset
                | TokenType::Case
                | TokenType::When
                | TokenType::Then
                | TokenType::Else
                | TokenType::End
                | TokenType::Create
                | TokenType::Drop
                | TokenType::Alter
                | TokenType::Insert
                | TokenType::Update
                | TokenType::Delete
                | TokenType::Into
                | TokenType::Values
                | TokenType::Set
                | TokenType::With
                | TokenType::Distinct
                | TokenType::All
                | TokenType::Exists
                | TokenType::Between
                | TokenType::Like
                | TokenType::ILike
                | TokenType::Filter
                | TokenType::Date
                | TokenType::Timestamp
                | TokenType::TimestampTz
                | TokenType::Interval
                | TokenType::Time
                | TokenType::Table
                | TokenType::Index
                | TokenType::Column
                | TokenType::Database
                | TokenType::Schema
                | TokenType::View
                | TokenType::Function
                | TokenType::Procedure
                | TokenType::Trigger
                | TokenType::Sequence
                | TokenType::Over
                | TokenType::Partition
                | TokenType::Window
                | TokenType::Rows
                | TokenType::Range
                | TokenType::First
                | TokenType::Last
                | TokenType::Preceding
                | TokenType::Following
                | TokenType::Current
                | TokenType::Row
                | TokenType::Unbounded
                | TokenType::Array
                | TokenType::Struct
                | TokenType::Map
                | TokenType::PrimaryKey
                | TokenType::Key
                | TokenType::ForeignKey
                | TokenType::References
                | TokenType::Unique
                | TokenType::Check
                | TokenType::Default
                | TokenType::Constraint
                | TokenType::Comment
                | TokenType::Rollup
                | TokenType::Cube
                | TokenType::Grant
                | TokenType::Revoke
                | TokenType::Type
                | TokenType::Use
                | TokenType::Cache
                | TokenType::Uncache
                | TokenType::Load
                | TokenType::Any
                | TokenType::Some
                | TokenType::Asc
                | TokenType::Desc
                | TokenType::Nulls
                | TokenType::Lateral
                | TokenType::Natural
                | TokenType::Escape
                | TokenType::Glob
                | TokenType::Match
                | TokenType::Recursive
                | TokenType::Replace
                | TokenType::Returns
                | TokenType::If
                | TokenType::Pivot
                | TokenType::Unpivot
                | TokenType::Json
                | TokenType::Blob
                | TokenType::Text
                | TokenType::Int
                | TokenType::BigInt
                | TokenType::SmallInt
                | TokenType::TinyInt
                | TokenType::Int128
                | TokenType::UInt128
                | TokenType::Int256
                | TokenType::UInt256
                | TokenType::UInt
                | TokenType::UBigInt
                | TokenType::Float
                | TokenType::Double
                | TokenType::Decimal
                | TokenType::Boolean
                | TokenType::VarChar
                | TokenType::Char
                | TokenType::Binary
                | TokenType::VarBinary
                | TokenType::No
                | TokenType::DateTime
                | TokenType::Truncate
                | TokenType::Execute
                | TokenType::Merge
                | TokenType::Top
                | TokenType::Begin
                | TokenType::Generated
                | TokenType::Identity
                | TokenType::Always
                | TokenType::Extract
                | TokenType::AsOf
                | TokenType::Prior
                | TokenType::After
                | TokenType::Restrict
                | TokenType::Cascade
                | TokenType::Local
                | TokenType::Rename
                | TokenType::Enum
                | TokenType::Within
                | TokenType::Format
                | TokenType::Final
                | TokenType::FileFormat
                | TokenType::Input
                | TokenType::InputFormat
                | TokenType::Copy
                | TokenType::Put
                | TokenType::Get
                | TokenType::Show
                | TokenType::Serde
                | TokenType::Sample
                | TokenType::Sort
                | TokenType::Collate
                | TokenType::Ties
                | TokenType::IsNull
                | TokenType::NotNull
                | TokenType::Exclude
                | TokenType::Temporary
                | TokenType::Add
                | TokenType::Ordinality
                | TokenType::Overlaps
                | TokenType::Block
                | TokenType::Pattern
                | TokenType::Group
                | TokenType::Cluster
                | TokenType::Repeatable
                | TokenType::Groups
                | TokenType::Commit
                | TokenType::Warehouse
                | TokenType::System
                | TokenType::By
                | TokenType::To
                | TokenType::Fetch
                | TokenType::For
                | TokenType::Only
                | TokenType::Next
                | TokenType::Lock
                | TokenType::Refresh
                | TokenType::Settings
                | TokenType::Operator
                | TokenType::Overwrite
                | TokenType::StraightJoin
                | TokenType::Start
                | TokenType::Ignore
                | TokenType::Domain
                | TokenType::Apply
                | TokenType::Respect
                | TokenType::Materialized
                | TokenType::Prewhere
                | TokenType::Old
                | TokenType::New
                | TokenType::Cast
                | TokenType::TryCast
                | TokenType::SafeCast
                | TokenType::Transaction
                | TokenType::Describe
                | TokenType::Kill
                | TokenType::Lambda
                | TokenType::Declare
                | TokenType::Keep
                | TokenType::Output
                | TokenType::Percent
                | TokenType::Qualify
                | TokenType::Returning
                | TokenType::Language
                | TokenType::Preserve
                | TokenType::Savepoint
                | TokenType::Rollback
                | TokenType::Body
                | TokenType::Increment
                | TokenType::Minvalue
                | TokenType::Maxvalue
                | TokenType::Cycle
                | TokenType::NoCycle
                | TokenType::Seed
                | TokenType::Namespace
                | TokenType::Authorization
                | TokenType::Order
                | TokenType::Restart
                | TokenType::Before
                | TokenType::Instead
                | TokenType::Each
                | TokenType::Statement
                | TokenType::Referencing
                | TokenType::Of
                | TokenType::Separator
                | TokenType::Others
                | TokenType::Placing
                | TokenType::Owned
                | TokenType::Running
                | TokenType::Define
                | TokenType::Measures
                | TokenType::MatchRecognize
                | TokenType::AutoIncrement
                | TokenType::Connect
                | TokenType::Distribute
                | TokenType::Bernoulli
                | TokenType::TableSample
                | TokenType::Inpath
                | TokenType::Pragma
                | TokenType::Siblings
                | TokenType::SerdeProperties
                | TokenType::RLike
        )
    }

    /// Reports whether this token type is a comparison operator
    /// (equality, inequality, ordering, or null-safe equality).
    pub fn is_comparison(&self) -> bool {
        matches!(
            self,
            TokenType::Eq
                | TokenType::Neq
                | TokenType::Lt
                | TokenType::Lte
                | TokenType::Gt
                | TokenType::Gte
                | TokenType::NullsafeEq
        )
    }

    /// Reports whether this token type is an arithmetic operator
    /// (`+`, `-`, `*`, `/`, `%`, or the `MOD`/`DIV` keywords).
    pub fn is_arithmetic(&self) -> bool {
        matches!(
            self,
            TokenType::Plus
                | TokenType::Dash
                | TokenType::Star
                | TokenType::Slash
                | TokenType::Percent
                | TokenType::Mod
                | TokenType::Div
        )
    }
}
970
971impl fmt::Display for TokenType {
972 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
973 write!(f, "{:?}", self)
974 }
975}
976
// Map from upper-cased keyword text to its token type. All keys are upper
// case, so lookups presumably upper-case the candidate text first —
// confirm against the tokenizer that consults this table.
static DEFAULT_KEYWORDS: LazyLock<HashMap<String, TokenType>> = LazyLock::new(|| {
    let mut keywords = HashMap::with_capacity(300);
    keywords.insert("SELECT".to_string(), TokenType::Select);
    keywords.insert("FROM".to_string(), TokenType::From);
    keywords.insert("WHERE".to_string(), TokenType::Where);
    keywords.insert("AND".to_string(), TokenType::And);
    keywords.insert("OR".to_string(), TokenType::Or);
    keywords.insert("NOT".to_string(), TokenType::Not);
    keywords.insert("AS".to_string(), TokenType::As);
    keywords.insert("ON".to_string(), TokenType::On);
    keywords.insert("JOIN".to_string(), TokenType::Join);
    keywords.insert("LEFT".to_string(), TokenType::Left);
    keywords.insert("RIGHT".to_string(), TokenType::Right);
    keywords.insert("INNER".to_string(), TokenType::Inner);
    keywords.insert("OUTER".to_string(), TokenType::Outer);
    keywords.insert("OUTPUT".to_string(), TokenType::Output);
    keywords.insert("FULL".to_string(), TokenType::Full);
    keywords.insert("CROSS".to_string(), TokenType::Cross);
    keywords.insert("SEMI".to_string(), TokenType::Semi);
    keywords.insert("ANTI".to_string(), TokenType::Anti);
    keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
    keywords.insert("UNION".to_string(), TokenType::Union);
    keywords.insert("EXCEPT".to_string(), TokenType::Except);
    // MINUS maps to the same token as EXCEPT.
    keywords.insert("MINUS".to_string(), TokenType::Except);
    keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
    keywords.insert("GROUP".to_string(), TokenType::Group);
    keywords.insert("CUBE".to_string(), TokenType::Cube);
    keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
    keywords.insert("WITHIN".to_string(), TokenType::Within);
    keywords.insert("ORDER".to_string(), TokenType::Order);
    keywords.insert("BY".to_string(), TokenType::By);
    keywords.insert("HAVING".to_string(), TokenType::Having);
    keywords.insert("LIMIT".to_string(), TokenType::Limit);
    keywords.insert("OFFSET".to_string(), TokenType::Offset);
    keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
    keywords.insert("FETCH".to_string(), TokenType::Fetch);
    keywords.insert("FIRST".to_string(), TokenType::First);
    keywords.insert("NEXT".to_string(), TokenType::Next);
    keywords.insert("ONLY".to_string(), TokenType::Only);
    keywords.insert("KEEP".to_string(), TokenType::Keep);
    keywords.insert("IGNORE".to_string(), TokenType::Ignore);
    keywords.insert("INPUT".to_string(), TokenType::Input);
    keywords.insert("CASE".to_string(), TokenType::Case);
    keywords.insert("WHEN".to_string(), TokenType::When);
    keywords.insert("THEN".to_string(), TokenType::Then);
    keywords.insert("ELSE".to_string(), TokenType::Else);
    keywords.insert("END".to_string(), TokenType::End);
    // ENDIF maps to the same token as END.
    keywords.insert("ENDIF".to_string(), TokenType::End);
    keywords.insert("NULL".to_string(), TokenType::Null);
    keywords.insert("TRUE".to_string(), TokenType::True);
    keywords.insert("FALSE".to_string(), TokenType::False);
    keywords.insert("IS".to_string(), TokenType::Is);
    keywords.insert("IN".to_string(), TokenType::In);
    keywords.insert("BETWEEN".to_string(), TokenType::Between);
    keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
    keywords.insert("LIKE".to_string(), TokenType::Like);
    keywords.insert("ILIKE".to_string(), TokenType::ILike);
    keywords.insert("RLIKE".to_string(), TokenType::RLike);
    // REGEXP maps to the same token as RLIKE.
    keywords.insert("REGEXP".to_string(), TokenType::RLike);
    keywords.insert("ESCAPE".to_string(), TokenType::Escape);
    keywords.insert("EXISTS".to_string(), TokenType::Exists);
    keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
    keywords.insert("ALL".to_string(), TokenType::All);
    keywords.insert("WITH".to_string(), TokenType::With);
    keywords.insert("CREATE".to_string(), TokenType::Create);
    keywords.insert("DROP".to_string(), TokenType::Drop);
    keywords.insert("ALTER".to_string(), TokenType::Alter);
    keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
    keywords.insert("TABLE".to_string(), TokenType::Table);
    keywords.insert("VIEW".to_string(), TokenType::View);
    keywords.insert("INDEX".to_string(), TokenType::Index);
    keywords.insert("COLUMN".to_string(), TokenType::Column);
    keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
    keywords.insert("ADD".to_string(), TokenType::Add);
    keywords.insert("CASCADE".to_string(), TokenType::Cascade);
    keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
    keywords.insert("RENAME".to_string(), TokenType::Rename);
    keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
    // TEMP is a shorthand for TEMPORARY.
    keywords.insert("TEMP".to_string(), TokenType::Temporary);
    keywords.insert("UNIQUE".to_string(), TokenType::Unique);
    keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
    keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
    keywords.insert("KEY".to_string(), TokenType::Key);
    keywords.insert("KILL".to_string(), TokenType::Kill);
    keywords.insert("REFERENCES".to_string(), TokenType::References);
    keywords.insert("DEFAULT".to_string(), TokenType::Default);
    keywords.insert("DECLARE".to_string(), TokenType::Declare);
    keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
    keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement);
    keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
    keywords.insert("REPLACE".to_string(), TokenType::Replace);
    keywords.insert("TO".to_string(), TokenType::To);
    keywords.insert("INSERT".to_string(), TokenType::Insert);
    keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
    keywords.insert("UPDATE".to_string(), TokenType::Update);
    keywords.insert("USE".to_string(), TokenType::Use);
    keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
    keywords.insert("GLOB".to_string(), TokenType::Glob);
    keywords.insert("DELETE".to_string(), TokenType::Delete);
    keywords.insert("MERGE".to_string(), TokenType::Merge);
    keywords.insert("CACHE".to_string(), TokenType::Cache);
    keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
    keywords.insert("REFRESH".to_string(), TokenType::Refresh);
    keywords.insert("GRANT".to_string(), TokenType::Grant);
    keywords.insert("REVOKE".to_string(), TokenType::Revoke);
    keywords.insert("COMMENT".to_string(), TokenType::Comment);
    keywords.insert("COLLATE".to_string(), TokenType::Collate);
    keywords.insert("INTO".to_string(), TokenType::Into);
    keywords.insert("VALUES".to_string(), TokenType::Values);
    keywords.insert("SET".to_string(), TokenType::Set);
    keywords.insert("SETTINGS".to_string(), TokenType::Settings);
    keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
    keywords.insert("ASC".to_string(), TokenType::Asc);
    keywords.insert("DESC".to_string(), TokenType::Desc);
    keywords.insert("NULLS".to_string(), TokenType::Nulls);
    keywords.insert("RESPECT".to_string(), TokenType::Respect);
    // NOTE(review): duplicate of the earlier "FIRST" entry with the same
    // value — harmless, but could be removed.
    keywords.insert("FIRST".to_string(), TokenType::First);
    keywords.insert("LAST".to_string(), TokenType::Last);
    keywords.insert("IF".to_string(), TokenType::If);
    keywords.insert("CAST".to_string(), TokenType::Cast);
    keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
    keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
    keywords.insert("OVER".to_string(), TokenType::Over);
    keywords.insert("PARTITION".to_string(), TokenType::Partition);
    keywords.insert("PLACING".to_string(), TokenType::Placing);
    keywords.insert("WINDOW".to_string(), TokenType::Window);
    keywords.insert("ROWS".to_string(), TokenType::Rows);
    keywords.insert("RANGE".to_string(), TokenType::Range);
    keywords.insert("FILTER".to_string(), TokenType::Filter);
    keywords.insert("NATURAL".to_string(), TokenType::Natural);
    keywords.insert("USING".to_string(), TokenType::Using);
    keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
    keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
    keywords.insert("FOLLOWING".to_string(), TokenType::Following);
    keywords.insert("CURRENT".to_string(), TokenType::Current);
    keywords.insert("ROW".to_string(), TokenType::Row);
    keywords.insert("GROUPS".to_string(), TokenType::Groups);
    keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
    keywords.insert("BOTH".to_string(), TokenType::Both);
    keywords.insert("LEADING".to_string(), TokenType::Leading);
    keywords.insert("TRAILING".to_string(), TokenType::Trailing);
    keywords.insert("INTERVAL".to_string(), TokenType::Interval);
    keywords.insert("TOP".to_string(), TokenType::Top);
    keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
    keywords.insert("SAMPLE".to_string(), TokenType::Sample);
    keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
    keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
    keywords.insert("SYSTEM".to_string(), TokenType::System);
    keywords.insert("BLOCK".to_string(), TokenType::Block);
    keywords.insert("SEED".to_string(), TokenType::Seed);
    keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
    keywords.insert("TIES".to_string(), TokenType::Ties);
    keywords.insert("LATERAL".to_string(), TokenType::Lateral);
    keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
    keywords.insert("APPLY".to_string(), TokenType::Apply);
    keywords.insert("CONNECT".to_string(), TokenType::Connect);
    keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
    keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
    keywords.insert("SORT".to_string(), TokenType::Sort);
    keywords.insert("PIVOT".to_string(), TokenType::Pivot);
    keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
    keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
    keywords.insert("FOR".to_string(), TokenType::For);
    keywords.insert("ANY".to_string(), TokenType::Any);
    keywords.insert("SOME".to_string(), TokenType::Some);
    keywords.insert("ASOF".to_string(), TokenType::AsOf);
    keywords.insert("PERCENT".to_string(), TokenType::Percent);
    keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
    keywords.insert("NO".to_string(), TokenType::No);
    keywords.insert("OTHERS".to_string(), TokenType::Others);
    keywords.insert("OPERATOR".to_string(), TokenType::Operator);
    keywords.insert("SCHEMA".to_string(), TokenType::Schema);
    keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
    keywords.insert("DATABASE".to_string(), TokenType::Database);
    keywords.insert("FUNCTION".to_string(), TokenType::Function);
    keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
    // PROC is a shorthand for PROCEDURE.
    keywords.insert("PROC".to_string(), TokenType::Procedure);
    keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
    keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
    keywords.insert("TYPE".to_string(), TokenType::Type);
    keywords.insert("DOMAIN".to_string(), TokenType::Domain);
    keywords.insert("RETURNS".to_string(), TokenType::Returns);
    keywords.insert("RETURNING".to_string(), TokenType::Returning);
    keywords.insert("LANGUAGE".to_string(), TokenType::Language);
    keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
    keywords.insert("COMMIT".to_string(), TokenType::Commit);
    keywords.insert("BEGIN".to_string(), TokenType::Begin);
    keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
    keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
    keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
    keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
    keywords.insert("BODY".to_string(), TokenType::Body);
    keywords.insert("INCREMENT".to_string(), TokenType::Increment);
    keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
    keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
    keywords.insert("CYCLE".to_string(), TokenType::Cycle);
    keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
    keywords.insert("PRIOR".to_string(), TokenType::Prior);
    keywords.insert("MATCH".to_string(), TokenType::Match);
    keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
    keywords.insert("MEASURES".to_string(), TokenType::Measures);
    keywords.insert("PATTERN".to_string(), TokenType::Pattern);
    keywords.insert("DEFINE".to_string(), TokenType::Define);
    keywords.insert("RUNNING".to_string(), TokenType::Running);
    keywords.insert("FINAL".to_string(), TokenType::Final);
    keywords.insert("OWNED".to_string(), TokenType::Owned);
    keywords.insert("AFTER".to_string(), TokenType::After);
    keywords.insert("BEFORE".to_string(), TokenType::Before);
    keywords.insert("INSTEAD".to_string(), TokenType::Instead);
    keywords.insert("EACH".to_string(), TokenType::Each);
    keywords.insert("STATEMENT".to_string(), TokenType::Statement);
    keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
    keywords.insert("OLD".to_string(), TokenType::Old);
    keywords.insert("NEW".to_string(), TokenType::New);
    keywords.insert("OF".to_string(), TokenType::Of);
    keywords.insert("CHECK".to_string(), TokenType::Check);
    keywords.insert("START".to_string(), TokenType::Start);
    keywords.insert("ENUM".to_string(), TokenType::Enum);
    keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
    keywords.insert("RESTART".to_string(), TokenType::Restart);
    keywords.insert("DATE".to_string(), TokenType::Date);
    keywords.insert("TIME".to_string(), TokenType::Time);
    keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
    keywords.insert("DATETIME".to_string(), TokenType::DateTime);
    keywords.insert("GENERATED".to_string(), TokenType::Generated);
    keywords.insert("IDENTITY".to_string(), TokenType::Identity);
    keywords.insert("ALWAYS".to_string(), TokenType::Always);
    keywords.insert("LOAD".to_string(), TokenType::Load);
    keywords.insert("LOCAL".to_string(), TokenType::Local);
    keywords.insert("INPATH".to_string(), TokenType::Inpath);
    keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
    keywords.insert("SERDE".to_string(), TokenType::Serde);
    keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
    keywords.insert("FORMAT".to_string(), TokenType::Format);
    keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
    keywords.insert("SHOW".to_string(), TokenType::Show);
    keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
    keywords.insert("COPY".to_string(), TokenType::Copy);
    keywords.insert("PUT".to_string(), TokenType::Put);
    keywords.insert("GET".to_string(), TokenType::Get);
    // EXEC is a shorthand for EXECUTE.
    keywords.insert("EXEC".to_string(), TokenType::Execute);
    keywords.insert("EXECUTE".to_string(), TokenType::Execute);
    keywords.insert("ISNULL".to_string(), TokenType::IsNull);
    keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
    keywords
});
1241
1242static DEFAULT_SINGLE_TOKENS: LazyLock<HashMap<char, TokenType>> = LazyLock::new(|| {
1243 let mut single_tokens = HashMap::with_capacity(30);
1244 single_tokens.insert('(', TokenType::LParen);
1245 single_tokens.insert(')', TokenType::RParen);
1246 single_tokens.insert('[', TokenType::LBracket);
1247 single_tokens.insert(']', TokenType::RBracket);
1248 single_tokens.insert('{', TokenType::LBrace);
1249 single_tokens.insert('}', TokenType::RBrace);
1250 single_tokens.insert(',', TokenType::Comma);
1251 single_tokens.insert('.', TokenType::Dot);
1252 single_tokens.insert(';', TokenType::Semicolon);
1253 single_tokens.insert('+', TokenType::Plus);
1254 single_tokens.insert('-', TokenType::Dash);
1255 single_tokens.insert('*', TokenType::Star);
1256 single_tokens.insert('/', TokenType::Slash);
1257 single_tokens.insert('%', TokenType::Percent);
1258 single_tokens.insert('&', TokenType::Amp);
1259 single_tokens.insert('|', TokenType::Pipe);
1260 single_tokens.insert('^', TokenType::Caret);
1261 single_tokens.insert('~', TokenType::Tilde);
1262 single_tokens.insert('<', TokenType::Lt);
1263 single_tokens.insert('>', TokenType::Gt);
1264 single_tokens.insert('=', TokenType::Eq);
1265 single_tokens.insert('!', TokenType::Exclamation);
1266 single_tokens.insert(':', TokenType::Colon);
1267 single_tokens.insert('@', TokenType::DAt);
1268 single_tokens.insert('#', TokenType::Hash);
1269 single_tokens.insert('$', TokenType::Dollar);
1270 single_tokens.insert('?', TokenType::Parameter);
1271 single_tokens
1272});
1273
/// Default string-literal delimiters: opening quote -> matching closing quote.
static DEFAULT_QUOTES: LazyLock<HashMap<String, String>> = LazyLock::new(|| {
    [("'", "'"), ("\"\"\"", "\"\"\"")]
        .into_iter()
        .map(|(open, close)| (open.to_string(), close.to_string()))
        .collect()
});
1281
/// Default quoted-identifier delimiters: opening char -> closing char.
static DEFAULT_IDENTIFIERS: LazyLock<HashMap<char, char>> =
    LazyLock::new(|| [('"', '"'), ('`', '`')].into_iter().collect());
1290
/// Default comment markers: opener -> optional closer (`None` = line comment).
static DEFAULT_COMMENTS: LazyLock<HashMap<String, Option<String>>> = LazyLock::new(|| {
    [("--", None), ("/*", Some("*/"))]
        .into_iter()
        .map(|(open, close)| (open.to_string(), close.map(str::to_string)))
        .collect()
});
1297
/// Dialect-tunable lexing settings consumed by [`Tokenizer`].
///
/// `Default` yields a generic configuration built from the `DEFAULT_*` tables
/// in this module; dialects override individual fields.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keyword text (uppercased) -> keyword token type.
    pub keywords: HashMap<String, TokenType>,
    /// Single punctuation character -> token type.
    pub single_tokens: HashMap<char, TokenType>,
    /// String-literal delimiters: opening quote -> closing quote.
    pub quotes: HashMap<String, String>,
    /// Quoted-identifier delimiters: opening char -> closing char.
    pub identifiers: HashMap<char, char>,
    /// Comment markers: opener -> optional closer (`None` = line comment).
    pub comments: HashMap<String, Option<String>>,
    /// Characters that act as escape characters inside string literals.
    pub string_escapes: Vec<char>,
    /// Whether `/* ... /* ... */ ... */` block comments nest.
    pub nested_comments: bool,
    /// When non-empty, unknown backslash escapes drop the backslash instead
    /// of preserving it (see the string scanners).
    pub escape_follow_chars: Vec<char>,
    /// Presumably: whether a `b'...'` prefix denotes a byte string — used by
    /// scanning code outside this chunk; TODO confirm.
    pub b_prefix_is_byte_string: bool,
    /// Numeric-literal suffix (uppercased, 1-2 chars) -> type name appended
    /// as `value::TYPE` (e.g. `1.5d` -> `1.5::DOUBLE`).
    pub numeric_literals: HashMap<String, String>,
    /// Whether tokens like `1abc` lex as identifiers instead of number+ident.
    pub identifiers_can_start_with_digit: bool,
    /// Whether `0x...` literals are recognized at all.
    pub hex_number_strings: bool,
    /// When hex literals are recognized: emit `HexNumber` (integer semantics)
    /// instead of `HexString`.
    pub hex_string_is_integer_type: bool,
    /// Presumably: whether escapes apply inside raw strings — used by code
    /// outside this chunk; TODO confirm.
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether `#` and `//` start line comments (MySQL-style).
    pub hash_comments: bool,
    /// Whether a `$` may begin/compose an identifier.
    pub dollar_sign_is_identifier: bool,
    /// Whether to capture raw data after tokens for `INSERT ... FORMAT`
    /// style statements (see `tokenize`).
    pub insert_format_raw_data: bool,
}
1354
1355impl Default for TokenizerConfig {
1356 fn default() -> Self {
1357 Self {
1358 keywords: DEFAULT_KEYWORDS.clone(),
1359 single_tokens: DEFAULT_SINGLE_TOKENS.clone(),
1360 quotes: DEFAULT_QUOTES.clone(),
1361 identifiers: DEFAULT_IDENTIFIERS.clone(),
1362 comments: DEFAULT_COMMENTS.clone(),
1363 string_escapes: vec!['\''],
1366 nested_comments: true,
1367 escape_follow_chars: vec![],
1369 b_prefix_is_byte_string: false,
1371 numeric_literals: HashMap::new(),
1372 identifiers_can_start_with_digit: false,
1373 hex_number_strings: false,
1374 hex_string_is_integer_type: false,
1375 string_escapes_allowed_in_raw_strings: true,
1378 hash_comments: false,
1379 dollar_sign_is_identifier: false,
1380 insert_format_raw_data: false,
1381 }
1382 }
1383}
1384
/// SQL tokenizer: holds a dialect configuration and lexes source strings
/// into [`Token`] streams via [`Tokenizer::tokenize`].
pub struct Tokenizer {
    /// Dialect configuration applied to every `tokenize` call.
    config: TokenizerConfig,
}
1389
1390impl Tokenizer {
1391 pub fn new(config: TokenizerConfig) -> Self {
1393 Self { config }
1394 }
1395
1396 pub fn default_config() -> Self {
1398 Self::new(TokenizerConfig::default())
1399 }
1400
1401 pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1403 let mut state = TokenizerState::new(sql, &self.config);
1404 state.tokenize()
1405 }
1406}
1407
1408impl Default for Tokenizer {
1409 fn default() -> Self {
1410 Self::default_config()
1411 }
1412}
1413
/// Internal single-use scanning cursor over one SQL string.
struct TokenizerState<'a> {
    /// Original input text (enables cheap byte slicing when ASCII).
    source: &'a str,
    /// Cached `source.is_ascii()`: when true, char indices == byte offsets.
    source_is_ascii: bool,
    /// Input decoded to chars so indexing is O(1) even for non-ASCII text.
    chars: Vec<char>,
    /// Total number of chars in the input.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Char index where the token currently being scanned started.
    start: usize,
    /// Char index of the cursor (next unread char).
    current: usize,
    /// 1-based line of the cursor (maintained by `advance`).
    line: usize,
    /// 1-based column of the cursor (maintained by `advance`).
    column: usize,
    /// Comment texts buffered until they can be attached to a token.
    comments: Vec<String>,
    /// Dialect configuration driving the scanners.
    config: &'a TokenizerConfig,
}
1428
1429impl<'a> TokenizerState<'a> {
1430 fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1431 let chars: Vec<char> = sql.chars().collect();
1432 let size = chars.len();
1433 Self {
1434 source: sql,
1435 source_is_ascii: sql.is_ascii(),
1436 chars,
1437 size,
1438 tokens: Vec::new(),
1439 start: 0,
1440 current: 0,
1441 line: 1,
1442 column: 1,
1443 comments: Vec::new(),
1444 config,
1445 }
1446 }
1447
    /// Main scanning loop: repeatedly skip whitespace/comments, then lex one
    /// token, until the input is exhausted.
    ///
    /// Comments still buffered after the final token are attached to it as
    /// trailing comments. Returns the accumulated token list.
    fn tokenize(&mut self) -> Result<Vec<Token>> {
        while !self.is_at_end() {
            self.skip_whitespace();
            if self.is_at_end() {
                break;
            }

            // Record where this token starts for span/text extraction.
            self.start = self.current;
            self.scan_token()?;

            // Dialect hook: after each token, optionally capture raw trailing
            // data for `INSERT ... FORMAT ...`-style statements.
            if self.config.insert_format_raw_data {
                if let Some(raw) = self.try_scan_insert_format_raw_data() {
                    if !raw.is_empty() {
                        self.start = self.current;
                        self.add_token_with_text(TokenType::Var, raw);
                    }
                }
            }
        }

        // Leftover buffered comments trail the last token (dropped if the
        // input produced no tokens at all).
        if !self.comments.is_empty() {
            if let Some(last) = self.tokens.last_mut() {
                last.trailing_comments.extend(self.comments.drain(..));
            }
        }

        Ok(std::mem::take(&mut self.tokens))
    }
1482
    /// True once the cursor has consumed every character of the input.
    #[inline]
    fn is_at_end(&self) -> bool {
        self.current >= self.size
    }
1487
1488 #[inline]
1489 fn text_from_range(&self, start: usize, end: usize) -> String {
1490 if self.source_is_ascii {
1491 self.source[start..end].to_string()
1492 } else {
1493 self.chars[start..end].iter().collect()
1494 }
1495 }
1496
1497 #[inline]
1498 fn peek(&self) -> char {
1499 if self.is_at_end() {
1500 '\0'
1501 } else {
1502 self.chars[self.current]
1503 }
1504 }
1505
1506 #[inline]
1507 fn peek_next(&self) -> char {
1508 if self.current + 1 >= self.size {
1509 '\0'
1510 } else {
1511 self.chars[self.current + 1]
1512 }
1513 }
1514
1515 #[inline]
1516 fn advance(&mut self) -> char {
1517 let c = self.peek();
1518 self.current += 1;
1519 if c == '\n' {
1520 self.line += 1;
1521 self.column = 1;
1522 } else {
1523 self.column += 1;
1524 }
1525 c
1526 }
1527
    /// Skip whitespace and lex any comments encountered along the way.
    ///
    /// `saw_newline` tracks whether a newline was crossed, which decides
    /// whether a comment is buffered as a leading comment for the next token
    /// or attached as a trailing comment to the previous one.
    fn skip_whitespace(&mut self) {
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                // Non-breaking space, typographic spaces, ideographic space,
                // and BOM/zero-width no-break space are skipped as whitespace.
                '\u{00A0}' | '\u{2000}'..='\u{200B}' | '\u{3000}' | '\u{FEFF}' => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    // A line comment consumes to end of line, so whatever
                    // follows starts on a new line.
                    saw_newline = true;
                }
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // `/*+` is an optimizer hint, not a comment: stop so
                    // scan_token can lex it via scan_hint.
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        break;
                    }
                    // NOTE(review): an unterminated-block-comment error is
                    // swallowed here (the cursor is at EOF, so tokenize just
                    // stops) — confirm this lenience is intended.
                    if self.scan_block_comment(saw_newline).is_err() {
                        return;
                    }
                }
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Look back past spaces/tabs for the previous character.
                    let prev_non_ws = if self.current > 0 {
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    // Presumably so `://` (URLs/paths) and `///`-like runs are
                    // not eaten as comments — TODO confirm.
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        break;
                    }
                    self.scan_line_comment(saw_newline);
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                _ => break,
            }
        }
    }
1599
1600 fn scan_hash_line_comment(&mut self) {
1601 self.advance(); let start = self.current;
1603 while !self.is_at_end() && self.peek() != '\n' {
1604 self.advance();
1605 }
1606 let comment = self.text_from_range(start, self.current);
1607 let comment_text = comment.trim().to_string();
1608 if let Some(last) = self.tokens.last_mut() {
1609 last.trailing_comments.push(comment_text);
1610 } else {
1611 self.comments.push(comment_text);
1612 }
1613 }
1614
1615 fn scan_double_slash_comment(&mut self) {
1616 self.advance(); self.advance(); let start = self.current;
1619 while !self.is_at_end() && self.peek() != '\n' {
1620 self.advance();
1621 }
1622 let comment = self.text_from_range(start, self.current);
1623 let comment_text = comment.trim().to_string();
1624 if let Some(last) = self.tokens.last_mut() {
1625 last.trailing_comments.push(comment_text);
1626 } else {
1627 self.comments.push(comment_text);
1628 }
1629 }
1630
1631 fn scan_line_comment(&mut self, after_newline: bool) {
1632 self.advance(); self.advance(); let start = self.current;
1635 while !self.is_at_end() && self.peek() != '\n' {
1636 self.advance();
1637 }
1638 let comment_text = self.text_from_range(start, self.current);
1639
1640 if after_newline || self.tokens.is_empty() {
1643 self.comments.push(comment_text);
1644 } else if let Some(last) = self.tokens.last_mut() {
1645 last.trailing_comments.push(comment_text);
1646 }
1647 }
1648
    /// Lex a `/* ... */` block comment, honoring nesting when configured.
    ///
    /// The stored text is re-wrapped as `/*content*/` (inner nested markers
    /// stay inside `content`). Errors if the comment never closes.
    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        // Consume the "/*" opener.
        self.advance();
        self.advance();
        let content_start = self.current;
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                // The outermost "*/" is deliberately NOT consumed here: the
                // loop exits with the cursor on '*', so `content` below
                // excludes it; it is skipped after content extraction.
                if depth > 0 {
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        let content = self.text_from_range(content_start, self.current);
        // Skip the closing "*/".
        self.advance();
        self.advance();
        let comment_text = format!("/*{}*/", content);

        // Leading comment if it began a line (or no token yet), else trailing.
        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }

        Ok(())
    }
1699
1700 fn scan_hint(&mut self) -> Result<()> {
1702 self.advance(); self.advance(); self.advance(); let hint_start = self.current;
1706
1707 while !self.is_at_end() {
1709 if self.peek() == '*' && self.peek_next() == '/' {
1710 break;
1711 }
1712 self.advance();
1713 }
1714
1715 if self.is_at_end() {
1716 return Err(Error::tokenize(
1717 "Unterminated hint comment",
1718 self.line,
1719 self.column,
1720 self.start,
1721 self.current,
1722 ));
1723 }
1724
1725 let hint_text = self.text_from_range(hint_start, self.current);
1726 self.advance(); self.advance(); self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1730
1731 Ok(())
1732 }
1733
1734 fn scan_positional_parameter(&mut self) -> Result<()> {
1736 self.advance(); let start = self.current;
1738
1739 while !self.is_at_end() && self.peek().is_ascii_digit() {
1740 self.advance();
1741 }
1742
1743 let number = self.text_from_range(start, self.current);
1744 self.add_token_with_text(TokenType::Parameter, number);
1745 Ok(())
1746 }
1747
1748 fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
1753 let saved_pos = self.current;
1754
1755 self.advance(); let tag_start = self.current;
1761 while !self.is_at_end()
1762 && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1763 {
1764 self.advance();
1765 }
1766 let tag = self.text_from_range(tag_start, self.current);
1767
1768 if self.is_at_end() || self.peek() != '$' {
1770 self.current = saved_pos;
1772 return Ok(None);
1773 }
1774 self.advance(); let content_start = self.current;
1778 let closing_tag = format!("${}$", tag);
1779 let closing_chars: Vec<char> = closing_tag.chars().collect();
1780
1781 loop {
1782 if self.is_at_end() {
1783 self.current = saved_pos;
1785 return Ok(None);
1786 }
1787
1788 if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1790 let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1791 self.current + j < self.size && self.chars[self.current + j] == ch
1792 });
1793 if matches {
1794 let content = self.text_from_range(content_start, self.current);
1795 for _ in 0..closing_chars.len() {
1797 self.advance();
1798 }
1799 let token_text = format!("{}\x00{}", tag, content);
1801 self.add_token_with_text(TokenType::DollarString, token_text);
1802 return Ok(Some(()));
1803 }
1804 }
1805 self.advance();
1806 }
1807 }
1808
    /// Lex an untagged dollar-quoted string `$$...$$`; the token text is the
    /// raw content between the delimiters.
    ///
    /// NOTE(review): an unterminated `$$` swallows the rest of the input and
    /// still emits a `DollarString` token instead of erroring — confirm this
    /// lenience is intended.
    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
        // Skip the opening "$$".
        self.advance();
        self.advance();
        let start = self.current;
        // Scan until the closing "$$" (or end of input).
        while !self.is_at_end() {
            if self.peek() == '$'
                && self.current + 1 < self.size
                && self.chars[self.current + 1] == '$'
            {
                break;
            }
            self.advance();
        }

        let content = self.text_from_range(start, self.current);

        if !self.is_at_end() {
            // Skip the closing "$$".
            self.advance();
            self.advance();
        }

        self.add_token_with_text(TokenType::DollarString, content);
        Ok(())
    }
1839
    /// Dispatch on the current character and lex exactly one token.
    ///
    /// The branch ORDER is load-bearing: e.g. triple quotes before plain
    /// quotes, multi-char operators before single tokens, tagged dollar
    /// strings before `$$`/`$N`. Falls through to keyword/identifier
    /// scanning when nothing else matches.
    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        // Single-quoted strings, with optional ''' ... ''' triple form.
        if c == '\'' {
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        // """ ... """ triple-quoted strings (when the dialect supports them).
        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        // Double-quoted STRING only when '"' is a quote char and NOT an
        // identifier delimiter for this dialect.
        if c == '"'
            && self.config.quotes.contains_key("\"")
            && !self.config.identifiers.contains_key(&'"')
        {
            return self.scan_double_quoted_string();
        }

        // Quoted identifiers (e.g. "ident" or `ident`).
        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // `.5`-style numbers — but not when the dot follows an identifier,
        // closing bracket/paren, or another dot (then it is member access /
        // range syntax).
        if c == '.' && self.peek_next().is_ascii_digit() {
            let prev_char = if self.current > 0 {
                self.chars[self.current - 1]
            } else {
                '\0'
            };
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // Optimizer hints `/*+ ... */`.
        if c == '/'
            && self.peek_next() == '*'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '+'
        {
            return self.scan_hint();
        }

        // Multi-character operators (checked before single-char tokens).
        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // `$tag$...$tag$` strings, falling back to `$ident` when allowed.
        if c == '$'
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || !self.peek_next().is_ascii())
        {
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
            if self.config.dollar_sign_is_identifier {
                return self.scan_dollar_identifier();
            }
        }

        // Untagged `$$...$$` strings.
        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // Positional parameters `$1`, `$2`, ...
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        // Bare `$` identifiers when the dialect allows them.
        if c == '$' && self.config.dollar_sign_is_identifier {
            return self.scan_dollar_identifier();
        }

        // T-SQL style `#temp` / `@var` / `##global` identifiers.
        if (c == '#' || c == '@')
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || self.peek_next() == '#')
        {
            return self.scan_tsql_identifier();
        }

        // Single-character punctuation/operators.
        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        // Unicode minus sign -> Dash.
        if c == '\u{2212}' {
            self.advance();
            self.add_token(TokenType::Dash);
            return Ok(());
        }

        // Unicode fraction slash -> Slash.
        if c == '\u{2044}' {
            self.advance();
            self.add_token(TokenType::Slash);
            return Ok(());
        }

        // Curly quotes: single -> string, double -> quoted identifier.
        if c == '\u{2018}' || c == '\u{2019}' {
            return self.scan_unicode_quoted_string(c);
        }
        if c == '\u{201C}' || c == '\u{201D}' {
            return self.scan_unicode_quoted_identifier(c);
        }

        self.scan_identifier_or_keyword()
    }
1999
2000 fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
2001 let c = self.peek();
2002 let next = self.peek_next();
2003 let third = if self.current + 2 < self.size {
2004 self.chars[self.current + 2]
2005 } else {
2006 '\0'
2007 };
2008
2009 if c == '-' && next == '|' && third == '-' {
2012 self.advance();
2013 self.advance();
2014 self.advance();
2015 return Some(TokenType::Adjacent);
2016 }
2017
2018 if c == '|' && next == '|' && third == '/' {
2020 self.advance();
2021 self.advance();
2022 self.advance();
2023 return Some(TokenType::DPipeSlash);
2024 }
2025
2026 if c == '#' && next == '>' && third == '>' {
2028 self.advance();
2029 self.advance();
2030 self.advance();
2031 return Some(TokenType::DHashArrow);
2032 }
2033
2034 if c == '-' && next == '>' && third == '>' {
2036 self.advance();
2037 self.advance();
2038 self.advance();
2039 return Some(TokenType::DArrow);
2040 }
2041
2042 if c == '<' && next == '=' && third == '>' {
2044 self.advance();
2045 self.advance();
2046 self.advance();
2047 return Some(TokenType::NullsafeEq);
2048 }
2049
2050 if c == '<' && next == '-' && third == '>' {
2052 self.advance();
2053 self.advance();
2054 self.advance();
2055 return Some(TokenType::LrArrow);
2056 }
2057
2058 if c == '<' && next == '@' {
2060 self.advance();
2061 self.advance();
2062 return Some(TokenType::LtAt);
2063 }
2064
2065 if c == '@' && next == '>' {
2067 self.advance();
2068 self.advance();
2069 return Some(TokenType::AtGt);
2070 }
2071
2072 if c == '~' && next == '~' && third == '~' {
2074 self.advance();
2075 self.advance();
2076 self.advance();
2077 return Some(TokenType::Glob);
2078 }
2079
2080 if c == '~' && next == '~' && third == '*' {
2082 self.advance();
2083 self.advance();
2084 self.advance();
2085 return Some(TokenType::ILike);
2086 }
2087
2088 let fourth = if self.current + 3 < self.size {
2090 self.chars[self.current + 3]
2091 } else {
2092 '\0'
2093 };
2094 if c == '!' && next == '~' && third == '~' && fourth == '*' {
2095 self.advance();
2096 self.advance();
2097 self.advance();
2098 self.advance();
2099 return Some(TokenType::NotILike);
2100 }
2101
2102 if c == '!' && next == '~' && third == '~' {
2104 self.advance();
2105 self.advance();
2106 self.advance();
2107 return Some(TokenType::NotLike);
2108 }
2109
2110 if c == '!' && next == '~' && third == '*' {
2112 self.advance();
2113 self.advance();
2114 self.advance();
2115 return Some(TokenType::NotIRLike);
2116 }
2117
2118 if c == '!' && next == ':' && third == '>' {
2120 self.advance();
2121 self.advance();
2122 self.advance();
2123 return Some(TokenType::NColonGt);
2124 }
2125
2126 if c == '?' && next == ':' && third == ':' {
2128 self.advance();
2129 self.advance();
2130 self.advance();
2131 return Some(TokenType::QDColon);
2132 }
2133
2134 if c == '!' && next == '~' {
2136 self.advance();
2137 self.advance();
2138 return Some(TokenType::NotRLike);
2139 }
2140
2141 if c == '~' && next == '~' {
2143 self.advance();
2144 self.advance();
2145 return Some(TokenType::Like);
2146 }
2147
2148 if c == '~' && next == '*' {
2150 self.advance();
2151 self.advance();
2152 return Some(TokenType::IRLike);
2153 }
2154
2155 if c == ':' && next == ':' && third == '$' {
2158 self.advance();
2159 self.advance();
2160 self.advance();
2161 return Some(TokenType::DColonDollar);
2162 }
2163 if c == ':' && next == ':' && third == '%' {
2164 self.advance();
2165 self.advance();
2166 self.advance();
2167 return Some(TokenType::DColonPercent);
2168 }
2169 if c == ':' && next == ':' && third == '?' {
2170 self.advance();
2171 self.advance();
2172 self.advance();
2173 return Some(TokenType::DColonQMark);
2174 }
2175
2176 let token_type = match (c, next) {
2178 ('.', ':') => Some(TokenType::DotColon),
2179 ('=', '=') => Some(TokenType::Eq), ('<', '=') => Some(TokenType::Lte),
2181 ('>', '=') => Some(TokenType::Gte),
2182 ('!', '=') => Some(TokenType::Neq),
2183 ('<', '>') => Some(TokenType::Neq),
2184 ('^', '=') => Some(TokenType::Neq),
2185 ('<', '<') => Some(TokenType::LtLt),
2186 ('>', '>') => Some(TokenType::GtGt),
2187 ('|', '|') => Some(TokenType::DPipe),
2188 ('|', '/') => Some(TokenType::PipeSlash), (':', ':') => Some(TokenType::DColon),
2190 (':', '=') => Some(TokenType::ColonEq), (':', '>') => Some(TokenType::ColonGt), ('-', '>') => Some(TokenType::Arrow), ('=', '>') => Some(TokenType::FArrow), ('&', '&') => Some(TokenType::DAmp),
2195 ('&', '<') => Some(TokenType::AmpLt), ('&', '>') => Some(TokenType::AmpGt), ('@', '@') => Some(TokenType::AtAt), ('?', '|') => Some(TokenType::QMarkPipe), ('?', '&') => Some(TokenType::QMarkAmp), ('?', '?') => Some(TokenType::DQMark), ('#', '>') => Some(TokenType::HashArrow), ('#', '-') => Some(TokenType::HashDash), ('^', '@') => Some(TokenType::CaretAt), ('*', '*') => Some(TokenType::DStar), ('|', '>') => Some(TokenType::PipeGt), _ => None,
2207 };
2208
2209 if token_type.is_some() {
2210 self.advance();
2211 self.advance();
2212 }
2213
2214 token_type
2215 }
2216
    /// Lex a single-quoted string literal into a `String` token whose text
    /// is the *unescaped* content.
    ///
    /// Handles `''` doubling always, and C-style backslash escapes only when
    /// `\` is registered in `config.string_escapes`. Errors on an
    /// unterminated literal.
    fn scan_string(&mut self) -> Result<()> {
        self.advance(); // consume opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // '' -> one literal single quote.
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    break; // closing quote
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        // \Z = SUB (0x1A), plus bell/backspace/formfeed/vtab.
                        'Z' => value.push('\x1A'),
                        'a' => value.push('\x07'),
                        'b' => value.push('\x08'),
                        'f' => value.push('\x0C'),
                        'v' => value.push('\x0B'),
                        'x' => {
                            // \xHH: up to two hex digits; anything short or
                            // invalid is preserved verbatim as `\x…`.
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    // NOTE(review): bytes >= 0x80 map to the
                                    // Latin-1 code point here — confirm.
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        // LIKE wildcards: `\%` / `\_` collapse to the bare
                        // character. NOTE(review): this drops the backslash;
                        // confirm downstream LIKE handling expects that.
                        '%' => {
                            value.push('%');
                        }
                        '_' => {
                            value.push('_');
                        }
                        _ => {
                            // Unknown escape: dialects with explicit
                            // escape_follow_chars swallow the backslash,
                            // otherwise it is preserved.
                            if !self.config.escape_follow_chars.is_empty() {
                                value.push(escaped);
                            } else {
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // consume closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2315
    /// Lex a double-quoted STRING literal (dialects where `"` quotes strings
    /// rather than identifiers). Mirror of `scan_string` with `"` delimiters:
    /// `""` doubling plus optional backslash escapes.
    fn scan_double_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // "" -> one literal double quote.
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break; // closing quote
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        // \Z = SUB (0x1A), plus bell/backspace/formfeed/vtab.
                        'Z' => value.push('\x1A'),
                        'a' => value.push('\x07'),
                        'b' => value.push('\x08'),
                        'f' => value.push('\x0C'),
                        'v' => value.push('\x0B'),
                        'x' => {
                            // \xHH: up to two hex digits; anything short or
                            // invalid is preserved verbatim as `\x…`.
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        // LIKE wildcards collapse to the bare character
                        // (see NOTE in scan_string).
                        '%' => {
                            value.push('%');
                        }
                        '_' => {
                            value.push('_');
                        }
                        _ => {
                            // Unknown escape: swallow or keep the backslash
                            // depending on escape_follow_chars.
                            if !self.config.escape_follow_chars.is_empty() {
                                value.push(escaped);
                            } else {
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // consume closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2415
2416 fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2417 self.advance();
2419 self.advance();
2420 self.advance();
2421 let mut value = String::new();
2422
2423 while !self.is_at_end() {
2424 if self.peek() == quote_char
2426 && self.current + 1 < self.size
2427 && self.chars[self.current + 1] == quote_char
2428 && self.current + 2 < self.size
2429 && self.chars[self.current + 2] == quote_char
2430 {
2431 break;
2433 }
2434 value.push(self.advance());
2435 }
2436
2437 if self.is_at_end() {
2438 return Err(Error::tokenize(
2439 "Unterminated triple-quoted string",
2440 self.line,
2441 self.column,
2442 self.start,
2443 self.current,
2444 ));
2445 }
2446
2447 self.advance();
2449 self.advance();
2450 self.advance();
2451 let token_type = if quote_char == '"' {
2452 TokenType::TripleDoubleQuotedString
2453 } else {
2454 TokenType::TripleSingleQuotedString
2455 };
2456 self.add_token_with_text(token_type, value);
2457 Ok(())
2458 }
2459
2460 fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2461 self.advance(); let mut value = String::new();
2463
2464 loop {
2465 if self.is_at_end() {
2466 return Err(Error::tokenize(
2467 "Unterminated identifier",
2468 self.line,
2469 self.column,
2470 self.start,
2471 self.current,
2472 ));
2473 }
2474 if self.peek() == end_quote {
2475 if self.peek_next() == end_quote {
2476 value.push(end_quote);
2478 self.advance(); self.advance(); } else {
2481 break;
2483 }
2484 } else {
2485 value.push(self.peek());
2486 self.advance();
2487 }
2488 }
2489
2490 self.advance(); self.add_token_with_text(TokenType::QuotedIdentifier, value);
2492 Ok(())
2493 }
2494
2495 fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2500 self.advance(); let start = self.current;
2502 let close_quote = if open_quote == '\u{2018}' {
2504 '\u{2019}' } else {
2506 '\u{2019}' };
2508 while !self.is_at_end() && self.peek() != close_quote {
2509 self.advance();
2510 }
2511 let value = self.text_from_range(start, self.current);
2512 if !self.is_at_end() {
2513 self.advance(); }
2515 self.add_token_with_text(TokenType::String, value);
2516 Ok(())
2517 }
2518
2519 fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2522 self.advance(); let start = self.current;
2524 let close_quote = if open_quote == '\u{201C}' {
2525 '\u{201D}' } else {
2527 '\u{201D}' };
2529 while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2530 self.advance();
2531 }
2532 let value = self.text_from_range(start, self.current);
2533 if !self.is_at_end() {
2534 self.advance(); }
2536 self.add_token_with_text(TokenType::QuotedIdentifier, value);
2537 Ok(())
2538 }
2539
2540 fn scan_number(&mut self) -> Result<()> {
2541 if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
2543 let next = if self.current + 1 < self.size {
2544 self.chars[self.current + 1]
2545 } else {
2546 '\0'
2547 };
2548 if next == 'x' || next == 'X' {
2549 self.advance();
2551 self.advance();
2552 let hex_start = self.current;
2554 while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
2555 if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
2556 break;
2557 }
2558 self.advance();
2559 }
2560 if self.current > hex_start {
2561 let mut is_hex_float = false;
2563 if !self.is_at_end() && self.peek() == '.' {
2565 let after_dot = if self.current + 1 < self.size {
2566 self.chars[self.current + 1]
2567 } else {
2568 '\0'
2569 };
2570 if after_dot.is_ascii_hexdigit() {
2571 is_hex_float = true;
2572 self.advance(); while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2574 self.advance();
2575 }
2576 }
2577 }
2578 if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
2580 is_hex_float = true;
2581 self.advance(); if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
2583 self.advance();
2584 }
2585 while !self.is_at_end() && self.peek().is_ascii_digit() {
2586 self.advance();
2587 }
2588 }
2589 if is_hex_float {
2590 let full_text = self.text_from_range(self.start, self.current);
2592 self.add_token_with_text(TokenType::Number, full_text);
2593 } else if self.config.hex_string_is_integer_type {
2594 let hex_value = self.text_from_range(hex_start, self.current);
2596 self.add_token_with_text(TokenType::HexNumber, hex_value);
2597 } else {
2598 let hex_value = self.text_from_range(hex_start, self.current);
2600 self.add_token_with_text(TokenType::HexString, hex_value);
2601 }
2602 return Ok(());
2603 }
2604 self.current = self.start + 1;
2607 }
2608 }
2609
2610 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2612 if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
2614 break;
2615 }
2616 self.advance();
2617 }
2618
2619 if self.peek() == '.' {
2623 let next = self.peek_next();
2624 if next != '.' {
2630 self.advance(); while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2633 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2634 break;
2635 }
2636 self.advance();
2637 }
2638 }
2639 }
2640
2641 if self.peek() == 'e' || self.peek() == 'E' {
2643 self.advance();
2644 if self.peek() == '+' || self.peek() == '-' {
2645 self.advance();
2646 }
2647 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2648 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2649 break;
2650 }
2651 self.advance();
2652 }
2653 }
2654
2655 let text = self.text_from_range(self.start, self.current);
2656
2657 if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2659 let next_char: String = self.peek().to_ascii_uppercase().to_string();
2660 let suffix_match = if self.current + 1 < self.size {
2662 let two_char: String = [
2663 self.chars[self.current].to_ascii_uppercase(),
2664 self.chars[self.current + 1].to_ascii_uppercase(),
2665 ]
2666 .iter()
2667 .collect();
2668 if self.config.numeric_literals.contains_key(&two_char) {
2669 let after_suffix = if self.current + 2 < self.size {
2671 self.chars[self.current + 2]
2672 } else {
2673 ' '
2674 };
2675 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2676 Some((two_char, 2))
2677 } else {
2678 None
2679 }
2680 } else if self.config.numeric_literals.contains_key(&next_char) {
2681 let after_suffix = if self.current + 1 < self.size {
2683 self.chars[self.current + 1]
2684 } else {
2685 ' '
2686 };
2687 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2688 Some((next_char, 1))
2689 } else {
2690 None
2691 }
2692 } else {
2693 None
2694 }
2695 } else if self.config.numeric_literals.contains_key(&next_char) {
2696 Some((next_char, 1))
2698 } else {
2699 None
2700 };
2701
2702 if let Some((suffix, len)) = suffix_match {
2703 for _ in 0..len {
2705 self.advance();
2706 }
2707 let type_name = self
2710 .config
2711 .numeric_literals
2712 .get(&suffix)
2713 .expect("suffix verified by contains_key above")
2714 .clone();
2715 let combined = format!("{}::{}", text, type_name);
2716 self.add_token_with_text(TokenType::Number, combined);
2717 return Ok(());
2718 }
2719 }
2720
2721 if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2724 let next = self.peek();
2725 if next.is_alphabetic() || next == '_' {
2726 while !self.is_at_end() {
2728 let ch = self.peek();
2729 if ch.is_alphanumeric() || ch == '_' {
2730 self.advance();
2731 } else {
2732 break;
2733 }
2734 }
2735 let ident_text = self.text_from_range(self.start, self.current);
2736 self.add_token_with_text(TokenType::Identifier, ident_text);
2737 return Ok(());
2738 }
2739 }
2740
2741 self.add_token_with_text(TokenType::Number, text);
2742 Ok(())
2743 }
2744
2745 fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2747 self.advance();
2749
2750 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2752 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2753 break;
2754 }
2755 self.advance();
2756 }
2757
2758 if self.peek() == 'e' || self.peek() == 'E' {
2760 self.advance();
2761 if self.peek() == '+' || self.peek() == '-' {
2762 self.advance();
2763 }
2764 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2765 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2766 break;
2767 }
2768 self.advance();
2769 }
2770 }
2771
2772 let text = self.text_from_range(self.start, self.current);
2773 self.add_token_with_text(TokenType::Number, text);
2774 Ok(())
2775 }
2776
2777 #[inline]
2780 fn lookup_keyword_ascii(keywords: &HashMap<String, TokenType>, text: &str) -> TokenType {
2781 if text.len() > 128 {
2782 return TokenType::Var;
2783 }
2784 let mut buf = [0u8; 128];
2785 for (i, b) in text.bytes().enumerate() {
2786 buf[i] = b.to_ascii_uppercase();
2787 }
2788 if let Ok(upper) = std::str::from_utf8(&buf[..text.len()]) {
2789 keywords.get(upper).copied().unwrap_or(TokenType::Var)
2790 } else {
2791 TokenType::Var
2792 }
2793 }
2794
    /// Scans a word and classifies it as an identifier, a keyword, or a
    /// prefixed string literal (R'…', N'…', E'…', X'…', B'…', U&'…').
    ///
    /// The scanner is positioned on the first character of the word. After
    /// the word is consumed, the character immediately following it decides
    /// whether the word is actually a string-literal prefix.
    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
        let first_char = self.peek();
        if !first_char.is_alphanumeric() && first_char != '_' {
            // Not a valid word start: consume the character so the error
            // span covers it, then report.
            let c = self.advance();
            return Err(Error::tokenize(
                format!("Unexpected character: '{}'", c),
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        // Consume the word body. '$' and '@' are allowed inside words here;
        // '#' is allowed unless it begins a '#>' or '#-' operator, in which
        // case the word ends before it.
        while !self.is_at_end() {
            let c = self.peek();
            if c == '#' {
                let next_c = if self.current + 1 < self.size {
                    self.chars[self.current + 1]
                } else {
                    '\0'
                };
                if next_c == '>' || next_c == '-' {
                    break; }
                self.advance();
            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text = self.text_from_range(self.start, self.current);

        // `NOT=` collapses to a single inequality token.
        if text.eq_ignore_ascii_case("NOT") && self.peek() == '=' {
            self.advance(); self.add_token(TokenType::Neq);
            return Ok(());
        }

        // A quote directly after the word may turn it into a string prefix.
        let next_char = self.peek();
        let is_single_quote = next_char == '\'';
        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
        // Raw strings accept '"' regardless of the configured quote pairs.
        let is_double_quote_for_raw = next_char == '"';

        // R'...' / R"..." raw strings; a doubled opener means triple-quoted.
        if text.eq_ignore_ascii_case("R") && (is_single_quote || is_double_quote_for_raw) {
            let quote_char = if is_single_quote { '\'' } else { '"' };
            // Consume the opening quote, then look for two more of the same.
            self.advance(); if self.peek() == quote_char && self.peek_next() == quote_char {
                // Consume the 2nd and 3rd quotes of the triple opener.
                self.advance(); self.advance(); let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            } else {
                let string_value = self.scan_raw_string_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            }
            return Ok(());
        }

        if is_single_quote || is_double_quote {
            if text.eq_ignore_ascii_case("N") {
                // N'...' national (Unicode) string literal.
                self.advance(); let string_value = if is_single_quote {
                    self.scan_string_content()?
                } else {
                    self.scan_double_quoted_string_content()?
                };
                self.add_token_with_text(TokenType::NationalString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("E") {
                // E'...' escape string. The original case of the prefix is
                // recorded in the token text ("e:" vs "E:") so generation
                // can reproduce it.
                let lowercase = text == "e";
                let prefix = if lowercase { "e:" } else { "E:" };
                self.advance(); let string_value = self.scan_string_content_with_escapes(true)?;
                self.add_token_with_text(
                    TokenType::EscapeString,
                    format!("{}{}", prefix, string_value),
                );
                return Ok(());
            } else if text.eq_ignore_ascii_case("X") {
                // X'...' hex string literal.
                self.advance(); let string_value = if is_single_quote {
                    self.scan_string_content()?
                } else {
                    self.scan_double_quoted_string_content()?
                };
                self.add_token_with_text(TokenType::HexString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("B") && is_double_quote {
                // B"..." is always a byte string.
                self.advance(); let string_value = self.scan_double_quoted_string_content()?;
                self.add_token_with_text(TokenType::ByteString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("B") && is_single_quote {
                // B'...' is dialect-dependent: byte string vs bit string.
                self.advance(); let string_value = self.scan_string_content()?;
                if self.config.b_prefix_is_byte_string {
                    self.add_token_with_text(TokenType::ByteString, string_value);
                } else {
                    self.add_token_with_text(TokenType::BitString, string_value);
                }
                return Ok(());
            }
        }

        // U&'...' Unicode string literal (requires '&' then a single quote).
        if text.eq_ignore_ascii_case("U")
            && self.peek() == '&'
            && self.current + 1 < self.size
            && self.chars[self.current + 1] == '\''
        {
            // Consume '&' and the opening quote.
            self.advance(); self.advance(); let string_value = self.scan_string_content()?;
            self.add_token_with_text(TokenType::UnicodeString, string_value);
            return Ok(());
        }

        // Plain word: case-insensitive keyword lookup, falling back to Var.
        let token_type = Self::lookup_keyword_ascii(&self.config.keywords, &text);

        self.add_token_with_text(token_type, text);
        Ok(())
    }
2944
2945 fn scan_string_content_with_escapes(
2949 &mut self,
2950 force_backslash_escapes: bool,
2951 ) -> Result<String> {
2952 let mut value = String::new();
2953 let use_backslash_escapes =
2954 force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2955
2956 while !self.is_at_end() {
2957 let c = self.peek();
2958 if c == '\'' {
2959 if self.peek_next() == '\'' {
2960 value.push('\'');
2962 self.advance();
2963 self.advance();
2964 } else {
2965 break;
2966 }
2967 } else if c == '\\' && use_backslash_escapes {
2968 value.push(self.advance());
2970 if !self.is_at_end() {
2971 value.push(self.advance());
2972 }
2973 } else {
2974 value.push(self.advance());
2975 }
2976 }
2977
2978 if self.is_at_end() {
2979 return Err(Error::tokenize(
2980 "Unterminated string",
2981 self.line,
2982 self.column,
2983 self.start,
2984 self.current,
2985 ));
2986 }
2987
2988 self.advance(); Ok(value)
2990 }
2991
    /// Consumes a single-quoted string body with the dialect's default
    /// escape behavior (backslash escapes only when configured).
    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }
2996
    /// Consumes the body of a double-quoted string up to and including the
    /// closing quote, returning the decoded content.
    ///
    /// A doubled `""` collapses to a literal `"`. When the dialect enables
    /// backslash escapes, C-style escapes (\n, \r, \t, \0, \\, \", \', \xHH)
    /// are decoded here; unknown escapes are kept verbatim.
    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Doubled quote -> single literal quote.
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                // Consume the backslash, then decode the escaped character.
                self.advance(); if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        '\\' => value.push('\\'),
                        '"' => value.push('"'),
                        '\'' => value.push('\''),
                        'x' => {
                            // \xHH hex escape: read up to two hex digits.
                            let mut hex = String::new();
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                value.push(byte as char);
                            } else {
                                // No valid hex digits: keep the escape as-is.
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        _ => {
                            // Unknown escape: keep backslash + character.
                            value.push('\\');
                            value.push(escaped);
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        // Consume the closing quote.
        self.advance(); Ok(value)
    }
3069
3070 fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3075 let mut value = String::new();
3076
3077 while !self.is_at_end() {
3078 let c = self.peek();
3079 if c == quote_char {
3080 if self.peek_next() == quote_char {
3081 value.push(quote_char);
3083 self.advance();
3084 self.advance();
3085 } else {
3086 break;
3087 }
3088 } else if c == '\\'
3089 && self.peek_next() == quote_char
3090 && self.config.string_escapes_allowed_in_raw_strings
3091 {
3092 value.push(quote_char);
3096 self.advance(); self.advance(); } else {
3099 value.push(self.advance());
3101 }
3102 }
3103
3104 if self.is_at_end() {
3105 return Err(Error::tokenize(
3106 "Unterminated raw string",
3107 self.line,
3108 self.column,
3109 self.start,
3110 self.current,
3111 ));
3112 }
3113
3114 self.advance(); Ok(value)
3116 }
3117
3118 fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3121 let mut value = String::new();
3122
3123 while !self.is_at_end() {
3124 let c = self.peek();
3125 if c == quote_char && self.peek_next() == quote_char {
3126 if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3128 self.advance(); self.advance(); self.advance(); return Ok(value);
3133 }
3134 }
3135 let ch = self.advance();
3137 value.push(ch);
3138 }
3139
3140 Err(Error::tokenize(
3141 "Unterminated raw triple-quoted string",
3142 self.line,
3143 self.column,
3144 self.start,
3145 self.current,
3146 ))
3147 }
3148
3149 fn scan_dollar_identifier(&mut self) -> Result<()> {
3154 self.advance();
3156
3157 while !self.is_at_end() {
3159 let c = self.peek();
3160 if c.is_alphanumeric() || c == '_' || c == '$' {
3161 self.advance();
3162 } else {
3163 break;
3164 }
3165 }
3166
3167 let text = self.text_from_range(self.start, self.current);
3168 self.add_token_with_text(TokenType::Var, text);
3169 Ok(())
3170 }
3171
3172 fn scan_tsql_identifier(&mut self) -> Result<()> {
3173 let first = self.advance();
3175
3176 if first == '#' && self.peek() == '#' {
3178 self.advance();
3179 }
3180
3181 while !self.is_at_end() {
3183 let c = self.peek();
3184 if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3185 self.advance();
3186 } else {
3187 break;
3188 }
3189 }
3190
3191 let text = self.text_from_range(self.start, self.current);
3192 self.add_token_with_text(TokenType::Var, text);
3194 Ok(())
3195 }
3196
    /// Heuristic for `INSERT ... FORMAT <name>` followed by inline raw data
    /// (NOTE(review): this matches ClickHouse's INSERT FORMAT syntax —
    /// confirm against the dialect that triggers this path).
    ///
    /// When the most recent tokens look like `INSERT ... FORMAT <ident>`,
    /// consumes the rest of the data block — up to a blank line or end of
    /// input — and returns the trimmed payload. Returns `None` without
    /// consuming anything when the token pattern does not match; may also
    /// return `None` after consuming whitespace-only trailing input.
    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
        let len = self.tokens.len();
        if len < 3 {
            return None;
        }

        // The last token must be the format name: identifier-like and not
        // the VALUES keyword (which would be a regular INSERT).
        let last = &self.tokens[len - 1];
        if last.text.eq_ignore_ascii_case("VALUES") {
            return None;
        }
        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
            return None;
        }

        // The token before it must literally spell FORMAT.
        let format_tok = &self.tokens[len - 2];
        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
            return None;
        }

        // An INSERT must appear shortly before FORMAT; the lookbehind is
        // bounded to 20 tokens to keep this check cheap.
        let has_insert = self.tokens[..len - 2]
            .iter()
            .rev()
            .take(20)
            .any(|t| t.token_type == TokenType::Insert);
        if !has_insert {
            return None;
        }

        // Consume raw data until a blank line: a newline, any number of
        // '\r' characters, then another newline (or end of input).
        let raw_start = self.current;
        while !self.is_at_end() {
            let c = self.peek();
            if c == '\n' {
                // Remember where the candidate terminator starts so it can
                // be excluded from the payload.
                let saved = self.current;
                self.advance(); while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                if self.is_at_end() || self.peek() == '\n' {
                    // Blank line found: payload ends before it.
                    let raw = self.text_from_range(raw_start, saved);
                    return Some(raw.trim().to_string());
                }
            } else {
                self.advance();
            }
        }

        // Hit end of input: everything scanned is the payload.
        let raw = self.text_from_range(raw_start, self.current);
        let trimmed = raw.trim().to_string();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed)
        }
    }
3266
    /// Emits a token whose text is the raw source slice covering the
    /// current lexeme (`self.start..self.current`).
    fn add_token(&mut self, token_type: TokenType) {
        let text = self.text_from_range(self.start, self.current);
        self.add_token_with_text(token_type, text);
    }
3271
3272 fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3273 let span = Span::new(self.start, self.current, self.line, self.column);
3274 let mut token = Token::new(token_type, text, span);
3275 token.comments.append(&mut self.comments);
3276 self.tokens.push(token);
3277 }
3278}
3279
3280#[cfg(test)]
3281mod tests {
3282 use super::*;
3283
    // Smallest possible statement: a keyword followed by a number literal.
    #[test]
    fn test_simple_select() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
    }

    // Identifiers, commas and FROM each become their own token.
    #[test]
    fn test_select_with_identifier() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();

        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Var);
        assert_eq!(tokens[1].text, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Var);
        assert_eq!(tokens[3].text, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Var);
        assert_eq!(tokens[5].text, "t");
    }

    // A string token's text excludes the surrounding quotes.
    #[test]
    fn test_string_literal() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "hello");
    }

    // A doubled quote inside a string collapses to one literal quote.
    #[test]
    fn test_escaped_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "it's");
    }

    // A `--` comment after a token is attached as a trailing comment, with
    // the leading space of the comment body preserved.
    #[test]
    fn test_comments() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].trailing_comments.len(), 1);
        assert_eq!(tokens[0].trailing_comments[0], " comment");
    }
3343
    // Line comments between AND operands survive a parse/generate
    // roundtrip as inline block comments.
    #[test]
    fn test_comment_in_and_chain() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
        let ast = Parser::parse_sql(sql).unwrap();
        let mut gen = Generator::default();
        let output = gen.generate(&ast[0]).unwrap();
        assert_eq!(
            output,
            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
        );
    }

    // Arithmetic operators tokenize individually, interleaved with numbers.
    #[test]
    fn test_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();

        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[1].token_type, TokenType::Plus);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[3].token_type, TokenType::Star);
        assert_eq!(tokens[4].token_type, TokenType::Number);
    }

    // Two-character comparison operators each become a single token.
    #[test]
    fn test_comparison_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();

        assert_eq!(tokens[1].token_type, TokenType::Lte);
        assert_eq!(tokens[3].token_type, TokenType::Gte);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }
3382
    // N'...' collapses to one NationalString token; the prefix is dropped
    // from the token text.
    #[test]
    fn test_national_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("N'abc'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for N'abc', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].text, "abc");
    }

    // X'...' collapses to one HexString token; the prefix is dropped.
    #[test]
    fn test_hex_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for X'ABCD', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::HexString);
        assert_eq!(tokens[0].text, "ABCD");
    }

    // B'...' is a BitString under the default dialect configuration
    // (i.e. `b_prefix_is_byte_string` is off).
    #[test]
    fn test_bit_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("B'01010'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for B'01010', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::BitString);
        assert_eq!(tokens[0].text, "01010");
    }
3427
    // Trailing-dot numbers: `1.` is one Number token, `1.a` splits into
    // Number + Var, and `1..2` keeps both dots as separate tokens.
    #[test]
    fn test_trailing_dot_number() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
        assert_eq!(
            tokens.len(),
            2,
            "Expected 2 tokens for 'SELECT 1.', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");

        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
        assert_eq!(tokens[1].text, "1.5");

        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");
        assert_eq!(tokens[2].token_type, TokenType::Var);

        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
        assert_eq!(tokens[2].token_type, TokenType::Dot);
        assert_eq!(tokens[3].token_type, TokenType::Dot);
        assert_eq!(tokens[4].token_type, TokenType::Number);
        assert_eq!(tokens[4].text, "2");
    }

    // Leading-dot numbers (`.25`, `.5e10`) are single Number tokens, while
    // `a.b` stays identifier-dot-identifier.
    #[test]
    fn test_leading_dot_number() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize(".25").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.25', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".25");

        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
        assert_eq!(
            tokens.len(),
            4,
            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Sample);
        assert_eq!(tokens[1].token_type, TokenType::LParen);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[2].text, ".25");
        assert_eq!(tokens[3].token_type, TokenType::RParen);

        let tokens = tokenizer.tokenize(".5e10").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.5e10', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".5e10");

        let tokens = tokenizer.tokenize("a.b").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'a.b', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Dot);
    }
3520
    // Curly quotes are accepted as string delimiters, but a character with
    // no token rule at all (a bullet) is a tokenize error.
    #[test]
    fn test_unrecognized_character() {
        let tokenizer = Tokenizer::default();

        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
        assert!(
            result.is_ok(),
            "Curly quotes should be tokenized as strings"
        );

        let result = tokenizer.tokenize("SELECT • FROM t");
        assert!(result.is_err());
    }

    // `:=`, bare `:` and `::` are distinguished during tokenization.
    #[test]
    fn test_colon_eq_tokenization() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("a := 1").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].token_type, TokenType::Var);
        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
        assert_eq!(tokens[2].token_type, TokenType::Number);

        let tokens = tokenizer.tokenize("a:b").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));

        let tokens = tokenizer.tokenize("a::INT").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
    }
3557
    // `:=` parsing end-to-end: MySQL user-variable assignment, named
    // function arguments, and DuckDB prefix aliases (`foo: 1`).
    #[test]
    fn test_colon_eq_parsing() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
            .expect("Failed to parse MySQL @var := expr");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := 1, @var2");

        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
            .expect("Failed to parse MySQL @var2 := @var1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1, @var2 := @var1");

        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
            .expect("Failed to parse MySQL @var := COUNT(*)");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");

        // In SET statements `:=` normalizes to `=`.
        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SET @var1 = 1");

        let ast =
            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "UNION_VALUE(k1 := 1)");

        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
            .expect("Failed to parse UNNEST with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");

        // DuckDB prefix aliases regenerate as standard `AS` aliases.
        let ast =
            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo");

        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
            .expect("Failed to parse DuckDB multiple prefix aliases");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
    }
3610
    // Dialect-specific parse -> transform -> generate roundtrips for `:=`.
    #[test]
    fn test_colon_eq_dialect_roundtrip() {
        use crate::dialects::{Dialect, DialectType};

        // Roundtrips `sql` through the dialect; `expected` overrides the
        // expected output when it differs from the input.
        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
            let d = Dialect::get(dialect);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

        check(
            DialectType::DuckDB,
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
            None,
        );
        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

        {
            // Parse-only check: the generated form isn't pinned down here.
            let d = Dialect::get(DialectType::DuckDB);
            let ast = d
                .parse("STRUCT_PACK(a := 'b')::json")
                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
        }

        check(
            DialectType::DuckDB,
            "SELECT foo: 1",
            Some("SELECT 1 AS foo"),
        );
        check(
            DialectType::DuckDB,
            "SELECT foo: 1, bar: 2, baz: 3",
            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
        );
    }
3667
    // Comments in many syntactic positions must survive a parse/generate
    // roundtrip unchanged; failures are collected so one report covers all
    // failing inputs.
    #[test]
    fn test_comment_roundtrip() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // Returns None on success, or a description of the failure.
        fn check_roundtrip(sql: &str) -> Option<String> {
            let ast = match Parser::parse_sql(sql) {
                Ok(a) => a,
                Err(e) => return Some(format!("Parse error: {:?}", e)),
            };
            if ast.is_empty() {
                return Some("Empty AST".to_string());
            }
            let mut generator = Generator::default();
            let output = match generator.generate(&ast[0]) {
                Ok(o) => o,
                Err(e) => return Some(format!("Gen error: {:?}", e)),
            };
            if output == sql {
                None
            } else {
                Some(format!(
                    "Mismatch:\n input: {}\n output: {}",
                    sql, output
                ))
            }
        }

        let tests = vec![
            "SELECT c /* c1 /* c2 */ c3 */",
            "SELECT c /* c1 /* c2 /* c3 */ */ */",
            "SELECT c /* c1 */ AS alias /* c2 */",
            "SELECT a /* x */, b /* x */",
            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
            "SELECT * FROM foo /* x */, bla /* x */",
            "SELECT 1 /* comment */ + 1",
            "SELECT 1 /* c1 */ + 2 /* c2 */",
            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
            "SELECT CAST(x AS INT) /* comment */ FROM foo",
            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
            "/* comment */ CREATE TABLE foo AS SELECT 1",
            "INSERT INTO foo SELECT * FROM bar /* comment */",
            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
        ];

        let mut failures = Vec::new();
        for sql in tests {
            if let Some(e) = check_roundtrip(sql) {
                failures.push(e);
            }
        }

        if !failures.is_empty() {
            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
        }
    }
3743
    // Dollar-quoted strings: tag/content splitting on the NUL separator,
    // plus Databricks $$ / $TAG$ function-body roundtrips.
    #[test]
    fn test_dollar_quoted_string_parsing() {
        use crate::dialects::{Dialect, DialectType};

        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
        assert_eq!(tag, Some("FOO".to_string()));
        assert_eq!(content, "content here");

        // No NUL separator -> no tag, the whole text is content.
        let (tag, content) = super::parse_dollar_string_token("just content");
        assert_eq!(tag, None);
        assert_eq!(content, "just content");

        // Roundtrips `sql` through the Databricks dialect; `expected`
        // overrides the expected output when it differs from the input.
        fn check_databricks(sql: &str, expected: Option<&str>) {
            let d = Dialect::get(DialectType::Databricks);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n    return x+1$$",
            None
        );

        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n    return x+1$FOO$",
            None
        );
    }
3786}