use crate::error::{Error, Result};
use serde::{Deserialize, Serialize};
use std::fmt;

/// Splits the stored text of a `DollarString` token into its optional tag and
/// its content. Tagged dollar-quoted strings (`$tag$...$tag$`) are stored as
/// `tag\x00content`; untagged ones (`$$...$$`) carry only the content.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    if let Some(pos) = text.find('\x00') {
        let tag = &text[..pos];
        let content = &text[pos + 1..];
        (Some(tag.to_string()), content.to_string())
    } else {
        (None, text.to_string())
    }
}
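
// A minimal round-trip sketch for the `tag\x00content` encoding produced by
// `try_scan_tagged_dollar_string` below (hypothetical test, not part of the
// original module):
#[cfg(test)]
mod parse_dollar_string_token_example {
    use super::parse_dollar_string_token;

    #[test]
    fn splits_tag_and_content() {
        // Tagged form: the text of "$fn$SELECT 1$fn$" is stored as "fn\x00SELECT 1".
        assert_eq!(
            parse_dollar_string_token("fn\x00SELECT 1"),
            (Some("fn".to_string()), "SELECT 1".to_string())
        );
        // Untagged form: "$$body$$" stores only the content.
        assert_eq!(parse_dollar_string_token("body"), (None, "body".to_string()));
    }
}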

/// Source position of a token within the original SQL text.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct Span {
    pub start: usize,
    pub end: usize,
    pub line: usize,
    pub column: usize,
}

impl Span {
    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
        Self {
            start,
            end,
            line,
            column,
        }
    }
}

/// A lexed token: its type, raw text, source span, and any comments attached
/// to it while scanning.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    pub token_type: TokenType,
    pub text: String,
    pub span: Span,
    /// Comments that precede the token.
    #[serde(default)]
    pub comments: Vec<String>,
    /// Comments that trail the token on the same line.
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}

impl Token {
    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
        Self {
            token_type,
            text: text.into(),
            span,
            comments: Vec::new(),
            trailing_comments: Vec::new(),
        }
    }

    /// Convenience constructor for a `Number` token with a default span.
    pub fn number(n: i64) -> Self {
        Self::new(TokenType::Number, n.to_string(), Span::default())
    }

    /// Convenience constructor for a `String` token with a default span.
    pub fn string(s: impl Into<String>) -> Self {
        Self::new(TokenType::String, s, Span::default())
    }

    /// Convenience constructor for an `Identifier` token with a default span.
    pub fn identifier(s: impl Into<String>) -> Self {
        Self::new(TokenType::Identifier, s, Span::default())
    }

    /// Convenience constructor for a `Var` token with a default span.
    pub fn var(s: impl Into<String>) -> Self {
        Self::new(TokenType::Var, s, Span::default())
    }

    /// Attaches a leading comment and returns the token.
    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
        self.comments.push(comment.into());
        self
    }
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}({})", self.token_type, self.text)
    }
}
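
// A small construction sketch using the helper constructors above
// (hypothetical example, not part of the original module):
#[cfg(test)]
mod token_construction_example {
    use super::*;

    #[test]
    fn builds_tokens_with_default_spans() {
        let tok = Token::identifier("users").with_comment("source table");
        assert_eq!(tok.token_type, TokenType::Identifier);
        assert_eq!(tok.comments, vec!["source table".to_string()]);
        assert_eq!(Token::number(42).text, "42");
        // `Display` prints the debug name of the type followed by the text.
        assert_eq!(Token::string("x").to_string(), "String(x)");
    }
}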

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
114 LParen,
116 RParen,
117 LBracket,
118 RBracket,
119 LBrace,
120 RBrace,
121 Comma,
122 Dot,
123 Dash,
124 Plus,
125 Colon,
126 DotColon,
127 DColon,
128 DColonDollar,
129 DColonPercent,
130 DColonQMark,
131 DQMark,
132 Semicolon,
133 Star,
134 Backslash,
135 Slash,
136 Lt,
137 Lte,
138 Gt,
139 Gte,
140 Not,
141 Eq,
142 Neq,
143 NullsafeEq,
144 ColonEq,
145 ColonGt,
146 NColonGt,
147 And,
148 Or,
149 Amp,
150 DPipe,
151 PipeGt,
152 Pipe,
153 PipeSlash,
154 DPipeSlash,
155 Caret,
156 CaretAt,
    LtLt,
    GtGt,
    Tilde,
160 Arrow,
161 DArrow,
162 FArrow,
163 Hash,
164 HashArrow,
165 DHashArrow,
166 LrArrow,
167 DAt,
168 AtAt,
169 LtAt,
170 AtGt,
171 Dollar,
172 Parameter,
173 Session,
174 SessionParameter,
175 SessionUser,
176 DAmp,
177 AmpLt,
178 AmpGt,
179 Adjacent,
180 Xor,
181 DStar,
182 QMarkAmp,
183 QMarkPipe,
184 HashDash,
185 Exclamation,
186
187 UriStart,
188 BlockStart,
189 BlockEnd,
190 Space,
191 Break,
192
    BlockComment,
    LineComment,
    String,
    DollarString,
    TripleDoubleQuotedString,
    TripleSingleQuotedString,
    Number,
203 Identifier,
204 QuotedIdentifier,
205 Database,
206 Column,
207 ColumnDef,
208 Schema,
209 Table,
210 Warehouse,
211 Stage,
212 Streamlit,
213 Var,
214 BitString,
215 HexString,
216 HexNumber,
218 ByteString,
219 NationalString,
    EscapeString,
    RawString,
222 HeredocString,
223 HeredocStringAlternative,
224 UnicodeString,
225
226 Bit,
228 Boolean,
229 TinyInt,
230 UTinyInt,
231 SmallInt,
232 USmallInt,
233 MediumInt,
234 UMediumInt,
235 Int,
236 UInt,
237 BigInt,
238 UBigInt,
239 BigNum,
240 Int128,
241 UInt128,
242 Int256,
243 UInt256,
244 Float,
245 Double,
246 UDouble,
247 Decimal,
248 Decimal32,
249 Decimal64,
250 Decimal128,
251 Decimal256,
252 DecFloat,
253 UDecimal,
254 BigDecimal,
255 Char,
256 NChar,
257 VarChar,
258 NVarChar,
259 BpChar,
260 Text,
261 MediumText,
262 LongText,
263 Blob,
264 MediumBlob,
265 LongBlob,
266 TinyBlob,
267 TinyText,
268 Name,
269 Binary,
270 VarBinary,
271 Json,
272 JsonB,
273 Time,
274 TimeTz,
275 TimeNs,
276 Timestamp,
277 TimestampTz,
278 TimestampLtz,
279 TimestampNtz,
280 TimestampS,
281 TimestampMs,
282 TimestampNs,
283 DateTime,
284 DateTime2,
285 DateTime64,
286 SmallDateTime,
287 Date,
288 Date32,
289 Int4Range,
290 Int4MultiRange,
291 Int8Range,
292 Int8MultiRange,
293 NumRange,
294 NumMultiRange,
295 TsRange,
296 TsMultiRange,
297 TsTzRange,
298 TsTzMultiRange,
299 DateRange,
300 DateMultiRange,
301 Uuid,
302 Geography,
303 GeographyPoint,
304 Nullable,
305 Geometry,
306 Point,
307 Ring,
308 LineString,
309 LocalTime,
310 LocalTimestamp,
311 SysTimestamp,
312 MultiLineString,
313 Polygon,
314 MultiPolygon,
315 HllSketch,
316 HStore,
317 Super,
318 Serial,
319 SmallSerial,
320 BigSerial,
321 Xml,
322 Year,
323 UserDefined,
324 Money,
325 SmallMoney,
326 RowVersion,
327 Image,
328 Variant,
329 Object,
330 Inet,
331 IpAddress,
332 IpPrefix,
333 Ipv4,
334 Ipv6,
335 Enum,
336 Enum8,
337 Enum16,
338 FixedString,
339 LowCardinality,
340 Nested,
341 AggregateFunction,
342 SimpleAggregateFunction,
343 TDigest,
344 Unknown,
345 Vector,
346 Dynamic,
347 Void,
348
349 Add,
351 Alias,
352 Alter,
353 All,
354 Anti,
355 Any,
356 Apply,
357 Array,
358 Asc,
359 AsOf,
360 Attach,
361 AutoIncrement,
362 Begin,
363 Between,
364 BulkCollectInto,
365 Cache,
366 Cascade,
367 Case,
368 CharacterSet,
369 Cluster,
370 ClusterBy,
371 Collate,
372 Command,
373 Comment,
374 Commit,
375 Preserve,
376 Connect,
377 ConnectBy,
378 Constraint,
379 Copy,
380 Create,
381 Cross,
382 Cube,
383 CurrentDate,
384 CurrentDateTime,
385 CurrentSchema,
386 CurrentTime,
387 CurrentTimestamp,
388 CurrentUser,
389 CurrentRole,
390 CurrentCatalog,
391 Declare,
392 Default,
393 Delete,
394 Desc,
395 Describe,
396 Detach,
397 Dictionary,
398 Distinct,
399 Distribute,
400 DistributeBy,
401 Div,
402 Drop,
403 Else,
404 End,
405 Escape,
406 Except,
407 Execute,
408 Exists,
409 False,
410 Fetch,
411 File,
412 FileFormat,
413 Filter,
414 Final,
415 First,
416 For,
417 Force,
418 ForeignKey,
419 Format,
420 From,
421 Full,
422 Function,
423 Get,
424 Glob,
425 Global,
426 Grant,
427 GroupBy,
428 GroupingSets,
429 Having,
430 Hint,
431 Ignore,
432 ILike,
433 In,
434 Index,
435 IndexedBy,
436 Inner,
437 Input,
438 Insert,
439 Install,
440 Intersect,
441 Interval,
442 Into,
443 Inpath,
444 InputFormat,
445 Introducer,
446 IRLike,
447 Is,
448 IsNull,
449 Join,
450 JoinMarker,
451 Keep,
452 Key,
453 Kill,
454 Lambda,
455 Language,
456 Lateral,
457 Left,
458 Like,
    NotLike,
    NotILike,
    NotRLike,
    NotIRLike,
    Limit,
464 List,
465 Load,
466 Local,
467 Lock,
468 Map,
469 Match,
470 MatchCondition,
471 MatchRecognize,
472 MemberOf,
473 Materialized,
474 Merge,
475 Mod,
476 Model,
477 Natural,
478 Next,
479 NoAction,
480 Nothing,
481 NotNull,
482 Null,
483 ObjectIdentifier,
484 Offset,
485 On,
486 Only,
487 Operator,
488 OrderBy,
489 OrderSiblingsBy,
490 Ordered,
491 Ordinality,
492 Out,
493 Outer,
494 Output,
495 Over,
496 Overlaps,
497 Overwrite,
498 Partition,
499 PartitionBy,
500 Percent,
501 Pivot,
502 Placeholder,
503 Positional,
504 Pragma,
505 Prewhere,
506 PrimaryKey,
507 Procedure,
508 Properties,
509 PseudoType,
510 Put,
511 Qualify,
512 Quote,
513 QDColon,
514 Range,
515 Recursive,
516 Refresh,
517 Rename,
518 Replace,
519 Returning,
520 Revoke,
521 References,
522 Restrict,
523 Right,
524 RLike,
525 Rollback,
526 Rollup,
527 Row,
528 Rows,
529 Select,
530 Semi,
531 Savepoint,
532 Separator,
533 Sequence,
534 Serde,
535 SerdeProperties,
536 Set,
537 Settings,
538 Show,
539 Siblings,
540 SimilarTo,
541 Some,
542 Sort,
543 SortBy,
544 SoundsLike,
545 StartWith,
546 StorageIntegration,
547 StraightJoin,
548 Struct,
549 Summarize,
550 TableSample,
551 Sample,
552 Bernoulli,
553 System,
554 Block,
555 Seed,
556 Repeatable,
557 Tag,
558 Temporary,
559 Transaction,
560 To,
561 Top,
562 Then,
563 True,
564 Truncate,
565 Uncache,
566 Union,
567 Unnest,
568 Unpivot,
569 Update,
570 Use,
571 Using,
572 Values,
573 View,
574 SemanticView,
575 Volatile,
576 When,
577 Where,
578 Window,
579 With,
580 Ties,
581 Exclude,
582 No,
583 Others,
584 Unique,
585 UtcDate,
586 UtcTime,
587 UtcTimestamp,
588 VersionSnapshot,
589 TimestampSnapshot,
590 Option,
591 Sink,
592 Source,
593 Analyze,
594 Namespace,
595 Export,
596 As,
597 By,
598 Nulls,
599 Respect,
600 Last,
601 If,
602 Cast,
603 TryCast,
604 SafeCast,
605 Count,
606 Extract,
607 Substring,
608 Trim,
609 Leading,
610 Trailing,
611 Both,
612 Position,
613 Overlaying,
614 Placing,
615 Treat,
616 Within,
617 Group,
618 Order,
619
620 Unbounded,
622 Preceding,
623 Following,
624 Current,
625 Groups,
626
627 Trigger,
629 Type,
630 Domain,
631 Returns,
632 Body,
633 Increment,
634 Minvalue,
635 Maxvalue,
636 Start,
637 Cycle,
638 NoCycle,
639 Prior,
640 Generated,
641 Identity,
642 Always,
643 Measures,
645 Pattern,
646 Define,
647 Running,
648 Owned,
649 After,
650 Before,
651 Instead,
652 Each,
653 Statement,
654 Referencing,
655 Old,
656 New,
657 Of,
658 Check,
659 Authorization,
660 Restart,
661
662 Eof,
664}
665
666impl TokenType {
667 pub fn is_keyword(&self) -> bool {
669 matches!(
670 self,
671 TokenType::Select
672 | TokenType::From
673 | TokenType::Where
674 | TokenType::And
675 | TokenType::Or
676 | TokenType::Not
677 | TokenType::In
678 | TokenType::Is
679 | TokenType::Null
680 | TokenType::True
681 | TokenType::False
682 | TokenType::As
683 | TokenType::On
684 | TokenType::Join
685 | TokenType::Left
686 | TokenType::Right
687 | TokenType::Inner
688 | TokenType::Outer
689 | TokenType::Full
690 | TokenType::Cross
691 | TokenType::Semi
692 | TokenType::Anti
693 | TokenType::Union
694 | TokenType::Except
695 | TokenType::Intersect
696 | TokenType::GroupBy
697 | TokenType::OrderBy
698 | TokenType::Having
699 | TokenType::Limit
700 | TokenType::Offset
701 | TokenType::Case
702 | TokenType::When
703 | TokenType::Then
704 | TokenType::Else
705 | TokenType::End
706 | TokenType::Create
707 | TokenType::Drop
708 | TokenType::Alter
709 | TokenType::Insert
710 | TokenType::Update
711 | TokenType::Delete
712 | TokenType::Into
713 | TokenType::Values
714 | TokenType::Set
715 | TokenType::With
716 | TokenType::Distinct
717 | TokenType::All
718 | TokenType::Exists
719 | TokenType::Between
720 | TokenType::Like
721 | TokenType::ILike
722 | TokenType::Filter
724 | TokenType::Date
725 | TokenType::Timestamp
726 | TokenType::TimestampTz
727 | TokenType::Interval
728 | TokenType::Time
729 | TokenType::Table
730 | TokenType::Index
731 | TokenType::Column
732 | TokenType::Database
733 | TokenType::Schema
734 | TokenType::View
735 | TokenType::Function
736 | TokenType::Procedure
737 | TokenType::Trigger
738 | TokenType::Sequence
739 | TokenType::Over
740 | TokenType::Partition
741 | TokenType::Window
742 | TokenType::Rows
743 | TokenType::Range
744 | TokenType::First
745 | TokenType::Last
746 | TokenType::Preceding
747 | TokenType::Following
748 | TokenType::Current
749 | TokenType::Row
750 | TokenType::Unbounded
751 | TokenType::Array
752 | TokenType::Struct
753 | TokenType::Map
754 | TokenType::PrimaryKey
755 | TokenType::Key
756 | TokenType::ForeignKey
757 | TokenType::References
758 | TokenType::Unique
759 | TokenType::Check
760 | TokenType::Default
761 | TokenType::Constraint
762 | TokenType::Comment
763 | TokenType::Rollup
764 | TokenType::Cube
765 | TokenType::Grant
766 | TokenType::Revoke
767 | TokenType::Type
768 | TokenType::Use
769 | TokenType::Cache
770 | TokenType::Uncache
771 | TokenType::Load
772 | TokenType::Any
773 | TokenType::Some
774 | TokenType::Asc
775 | TokenType::Desc
776 | TokenType::Nulls
777 | TokenType::Lateral
778 | TokenType::Natural
779 | TokenType::Escape
780 | TokenType::Glob
781 | TokenType::Match
782 | TokenType::Recursive
783 | TokenType::Replace
784 | TokenType::Returns
785 | TokenType::If
786 | TokenType::Pivot
787 | TokenType::Unpivot
788 | TokenType::Json
789 | TokenType::Blob
790 | TokenType::Text
791 | TokenType::Int
792 | TokenType::BigInt
793 | TokenType::SmallInt
794 | TokenType::TinyInt
795 | TokenType::Int128
796 | TokenType::UInt128
797 | TokenType::Int256
798 | TokenType::UInt256
799 | TokenType::UInt
800 | TokenType::UBigInt
801 | TokenType::Float
802 | TokenType::Double
803 | TokenType::Decimal
804 | TokenType::Boolean
805 | TokenType::VarChar
806 | TokenType::Char
807 | TokenType::Binary
808 | TokenType::VarBinary
809 | TokenType::No
810 | TokenType::DateTime
811 | TokenType::Truncate
812 | TokenType::Execute
813 | TokenType::Merge
814 | TokenType::Top
815 | TokenType::Begin
816 | TokenType::Generated
817 | TokenType::Identity
818 | TokenType::Always
819 | TokenType::Extract
820 | TokenType::AsOf
822 | TokenType::Prior
823 | TokenType::After
824 | TokenType::Restrict
825 | TokenType::Cascade
826 | TokenType::Local
827 | TokenType::Rename
828 | TokenType::Enum
829 | TokenType::Within
830 | TokenType::Format
831 | TokenType::Final
832 | TokenType::FileFormat
833 | TokenType::Input
834 | TokenType::InputFormat
835 | TokenType::Copy
836 | TokenType::Put
837 | TokenType::Get
838 | TokenType::Show
839 | TokenType::Serde
840 | TokenType::Sample
841 | TokenType::Sort
842 | TokenType::Collate
843 | TokenType::Ties
844 | TokenType::IsNull
845 | TokenType::NotNull
846 | TokenType::Exclude
847 | TokenType::Temporary
848 | TokenType::Add
849 | TokenType::Ordinality
850 | TokenType::Overlaps
851 | TokenType::Block
852 | TokenType::Pattern
853 | TokenType::Group
854 | TokenType::Cluster
855 | TokenType::Repeatable
856 | TokenType::Groups
857 | TokenType::Commit
858 | TokenType::Warehouse
859 | TokenType::System
860 | TokenType::By
861 | TokenType::To
862 | TokenType::Fetch
863 | TokenType::For
864 | TokenType::Only
865 | TokenType::Next
866 | TokenType::Lock
867 | TokenType::Refresh
868 | TokenType::Settings
869 | TokenType::Operator
870 | TokenType::Overwrite
871 | TokenType::StraightJoin
872 | TokenType::Start
873 | TokenType::Ignore
875 | TokenType::Domain
876 | TokenType::Apply
877 | TokenType::Respect
878 | TokenType::Materialized
879 | TokenType::Prewhere
880 | TokenType::Old
881 | TokenType::New
882 | TokenType::Cast
883 | TokenType::TryCast
884 | TokenType::SafeCast
885 | TokenType::Transaction
886 | TokenType::Describe
887 | TokenType::Kill
888 | TokenType::Lambda
889 | TokenType::Declare
890 | TokenType::Keep
891 | TokenType::Output
892 | TokenType::Percent
893 | TokenType::Qualify
894 | TokenType::Returning
895 | TokenType::Language
896 | TokenType::Preserve
897 | TokenType::Savepoint
898 | TokenType::Rollback
899 | TokenType::Body
900 | TokenType::Increment
901 | TokenType::Minvalue
902 | TokenType::Maxvalue
903 | TokenType::Cycle
904 | TokenType::NoCycle
905 | TokenType::Seed
906 | TokenType::Namespace
907 | TokenType::Authorization
908 | TokenType::Order
909 | TokenType::Restart
910 | TokenType::Before
911 | TokenType::Instead
912 | TokenType::Each
913 | TokenType::Statement
914 | TokenType::Referencing
915 | TokenType::Of
916 | TokenType::Separator
917 | TokenType::Others
918 | TokenType::Placing
919 | TokenType::Owned
920 | TokenType::Running
921 | TokenType::Define
922 | TokenType::Measures
923 | TokenType::MatchRecognize
924 | TokenType::AutoIncrement
925 | TokenType::Connect
926 | TokenType::Distribute
927 | TokenType::Bernoulli
928 | TokenType::TableSample
929 | TokenType::Inpath
930 | TokenType::Pragma
931 | TokenType::Siblings
932 | TokenType::SerdeProperties
933 | TokenType::RLike
934 )
935 }

    pub fn is_comparison(&self) -> bool {
        matches!(
            self,
            TokenType::Eq
                | TokenType::Neq
                | TokenType::Lt
                | TokenType::Lte
                | TokenType::Gt
                | TokenType::Gte
                | TokenType::NullsafeEq
        )
    }

    pub fn is_arithmetic(&self) -> bool {
        matches!(
            self,
            TokenType::Plus
                | TokenType::Dash
                | TokenType::Star
                | TokenType::Slash
                | TokenType::Percent
                | TokenType::Mod
                | TokenType::Div
        )
    }
}

impl fmt::Display for TokenType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self)
    }
}
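
// A short sketch of the classification helpers above (hypothetical example,
// not part of the original module):
#[cfg(test)]
mod token_type_classification_example {
    use super::*;

    #[test]
    fn classifies_token_types() {
        assert!(TokenType::Select.is_keyword());
        assert!(!TokenType::LParen.is_keyword());
        assert!(TokenType::Lte.is_comparison());
        assert!(TokenType::Slash.is_arithmetic());
        // `Display` for `TokenType` reuses the `Debug` name, not the
        // SCREAMING_SNAKE_CASE serde form.
        assert_eq!(TokenType::From.to_string(), "From");
    }
}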

/// Dialect-tunable settings for the [`Tokenizer`].
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Reserved words mapped to their token types (keys are uppercase).
    pub keywords: std::collections::HashMap<String, TokenType>,
    /// Single-character tokens such as `(`, `,`, and `+`.
    pub single_tokens: std::collections::HashMap<char, TokenType>,
    /// String quote pairs, e.g. `'` -> `'` and `"""` -> `"""`.
    pub quotes: std::collections::HashMap<String, String>,
    /// Identifier quote pairs, e.g. double quotes and backticks.
    pub identifiers: std::collections::HashMap<char, char>,
    /// Comment openers mapped to their closers (`None` for line comments).
    pub comments: std::collections::HashMap<String, Option<String>>,
    /// Characters that escape the following character inside strings.
    pub string_escapes: Vec<char>,
    /// Whether `/* ... /* ... */ ... */` comments may nest.
    pub nested_comments: bool,
    pub escape_follow_chars: Vec<char>,
    /// Whether a leading `b`/`B` prefix marks a byte string literal.
    pub b_prefix_is_byte_string: bool,
    pub numeric_literals: std::collections::HashMap<String, String>,
    pub identifiers_can_start_with_digit: bool,
    /// Whether `0x...` literals are tokenized as hex values.
    pub hex_number_strings: bool,
    /// Whether `0x...` literals produce `HexNumber` tokens instead of `HexString`.
    pub hex_string_is_integer_type: bool,
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether `#` and `//` start line comments.
    pub hash_comments: bool,
    /// Whether a bare `$` may begin an identifier.
    pub dollar_sign_is_identifier: bool,
    pub insert_format_raw_data: bool,
}

impl Default for TokenizerConfig {
    fn default() -> Self {
        let mut keywords = std::collections::HashMap::new();
        keywords.insert("SELECT".to_string(), TokenType::Select);
1034 keywords.insert("FROM".to_string(), TokenType::From);
1035 keywords.insert("WHERE".to_string(), TokenType::Where);
1036 keywords.insert("AND".to_string(), TokenType::And);
1037 keywords.insert("OR".to_string(), TokenType::Or);
1038 keywords.insert("NOT".to_string(), TokenType::Not);
1039 keywords.insert("AS".to_string(), TokenType::As);
1040 keywords.insert("ON".to_string(), TokenType::On);
1041 keywords.insert("JOIN".to_string(), TokenType::Join);
1042 keywords.insert("LEFT".to_string(), TokenType::Left);
1043 keywords.insert("RIGHT".to_string(), TokenType::Right);
1044 keywords.insert("INNER".to_string(), TokenType::Inner);
1045 keywords.insert("OUTER".to_string(), TokenType::Outer);
1046 keywords.insert("OUTPUT".to_string(), TokenType::Output);
1047 keywords.insert("FULL".to_string(), TokenType::Full);
1048 keywords.insert("CROSS".to_string(), TokenType::Cross);
1049 keywords.insert("SEMI".to_string(), TokenType::Semi);
1050 keywords.insert("ANTI".to_string(), TokenType::Anti);
1051 keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1052 keywords.insert("UNION".to_string(), TokenType::Union);
1053 keywords.insert("EXCEPT".to_string(), TokenType::Except);
1054 keywords.insert("MINUS".to_string(), TokenType::Except); keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1056 keywords.insert("GROUP".to_string(), TokenType::Group);
1057 keywords.insert("CUBE".to_string(), TokenType::Cube);
1058 keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1059 keywords.insert("WITHIN".to_string(), TokenType::Within);
1060 keywords.insert("ORDER".to_string(), TokenType::Order);
1061 keywords.insert("BY".to_string(), TokenType::By);
1062 keywords.insert("HAVING".to_string(), TokenType::Having);
1063 keywords.insert("LIMIT".to_string(), TokenType::Limit);
1064 keywords.insert("OFFSET".to_string(), TokenType::Offset);
1065 keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1066 keywords.insert("FETCH".to_string(), TokenType::Fetch);
1067 keywords.insert("FIRST".to_string(), TokenType::First);
1068 keywords.insert("NEXT".to_string(), TokenType::Next);
1069 keywords.insert("ONLY".to_string(), TokenType::Only);
1070 keywords.insert("KEEP".to_string(), TokenType::Keep);
1071 keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1072 keywords.insert("INPUT".to_string(), TokenType::Input);
1073 keywords.insert("CASE".to_string(), TokenType::Case);
1074 keywords.insert("WHEN".to_string(), TokenType::When);
1075 keywords.insert("THEN".to_string(), TokenType::Then);
1076 keywords.insert("ELSE".to_string(), TokenType::Else);
1077 keywords.insert("END".to_string(), TokenType::End);
1078 keywords.insert("ENDIF".to_string(), TokenType::End); keywords.insert("NULL".to_string(), TokenType::Null);
1080 keywords.insert("TRUE".to_string(), TokenType::True);
1081 keywords.insert("FALSE".to_string(), TokenType::False);
1082 keywords.insert("IS".to_string(), TokenType::Is);
1083 keywords.insert("IN".to_string(), TokenType::In);
1084 keywords.insert("BETWEEN".to_string(), TokenType::Between);
1085 keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1086 keywords.insert("LIKE".to_string(), TokenType::Like);
1087 keywords.insert("ILIKE".to_string(), TokenType::ILike);
1088 keywords.insert("RLIKE".to_string(), TokenType::RLike);
1089 keywords.insert("REGEXP".to_string(), TokenType::RLike);
1090 keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1091 keywords.insert("EXISTS".to_string(), TokenType::Exists);
1092 keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1093 keywords.insert("ALL".to_string(), TokenType::All);
1094 keywords.insert("WITH".to_string(), TokenType::With);
1095 keywords.insert("CREATE".to_string(), TokenType::Create);
1096 keywords.insert("DROP".to_string(), TokenType::Drop);
1097 keywords.insert("ALTER".to_string(), TokenType::Alter);
1098 keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1099 keywords.insert("TABLE".to_string(), TokenType::Table);
1100 keywords.insert("VIEW".to_string(), TokenType::View);
1101 keywords.insert("INDEX".to_string(), TokenType::Index);
1102 keywords.insert("COLUMN".to_string(), TokenType::Column);
1103 keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1104 keywords.insert("ADD".to_string(), TokenType::Add);
1105 keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1106 keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1107 keywords.insert("RENAME".to_string(), TokenType::Rename);
1108 keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1109 keywords.insert("TEMP".to_string(), TokenType::Temporary);
1110 keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1111 keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1112 keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1113 keywords.insert("KEY".to_string(), TokenType::Key);
1114 keywords.insert("KILL".to_string(), TokenType::Kill);
1115 keywords.insert("REFERENCES".to_string(), TokenType::References);
1116 keywords.insert("DEFAULT".to_string(), TokenType::Default);
1117 keywords.insert("DECLARE".to_string(), TokenType::Declare);
1118 keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1119 keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1121 keywords.insert("REPLACE".to_string(), TokenType::Replace);
1122 keywords.insert("TO".to_string(), TokenType::To);
1123 keywords.insert("INSERT".to_string(), TokenType::Insert);
1124 keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1125 keywords.insert("UPDATE".to_string(), TokenType::Update);
1126 keywords.insert("USE".to_string(), TokenType::Use);
1127 keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1128 keywords.insert("GLOB".to_string(), TokenType::Glob);
1129 keywords.insert("DELETE".to_string(), TokenType::Delete);
1130 keywords.insert("MERGE".to_string(), TokenType::Merge);
1131 keywords.insert("CACHE".to_string(), TokenType::Cache);
1132 keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1133 keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1134 keywords.insert("GRANT".to_string(), TokenType::Grant);
1135 keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1136 keywords.insert("COMMENT".to_string(), TokenType::Comment);
1137 keywords.insert("COLLATE".to_string(), TokenType::Collate);
1138 keywords.insert("INTO".to_string(), TokenType::Into);
1139 keywords.insert("VALUES".to_string(), TokenType::Values);
1140 keywords.insert("SET".to_string(), TokenType::Set);
1141 keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1142 keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1143 keywords.insert("ASC".to_string(), TokenType::Asc);
1144 keywords.insert("DESC".to_string(), TokenType::Desc);
1145 keywords.insert("NULLS".to_string(), TokenType::Nulls);
1146 keywords.insert("RESPECT".to_string(), TokenType::Respect);
1147 keywords.insert("FIRST".to_string(), TokenType::First);
1148 keywords.insert("LAST".to_string(), TokenType::Last);
1149 keywords.insert("IF".to_string(), TokenType::If);
1150 keywords.insert("CAST".to_string(), TokenType::Cast);
1151 keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1152 keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1153 keywords.insert("OVER".to_string(), TokenType::Over);
1154 keywords.insert("PARTITION".to_string(), TokenType::Partition);
1155 keywords.insert("PLACING".to_string(), TokenType::Placing);
1156 keywords.insert("WINDOW".to_string(), TokenType::Window);
1157 keywords.insert("ROWS".to_string(), TokenType::Rows);
1158 keywords.insert("RANGE".to_string(), TokenType::Range);
1159 keywords.insert("FILTER".to_string(), TokenType::Filter);
1160 keywords.insert("NATURAL".to_string(), TokenType::Natural);
1161 keywords.insert("USING".to_string(), TokenType::Using);
1162 keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1163 keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1164 keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1165 keywords.insert("CURRENT".to_string(), TokenType::Current);
1166 keywords.insert("ROW".to_string(), TokenType::Row);
1167 keywords.insert("GROUPS".to_string(), TokenType::Groups);
1168 keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1169 keywords.insert("BOTH".to_string(), TokenType::Both);
1171 keywords.insert("LEADING".to_string(), TokenType::Leading);
1172 keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1173 keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1174 keywords.insert("TOP".to_string(), TokenType::Top);
1176 keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1177 keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1178 keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1179 keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1180 keywords.insert("SYSTEM".to_string(), TokenType::System);
1181 keywords.insert("BLOCK".to_string(), TokenType::Block);
1182 keywords.insert("SEED".to_string(), TokenType::Seed);
1183 keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1184 keywords.insert("TIES".to_string(), TokenType::Ties);
1185 keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1186 keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1187 keywords.insert("APPLY".to_string(), TokenType::Apply);
1188 keywords.insert("CONNECT".to_string(), TokenType::Connect);
1190 keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1192 keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1193 keywords.insert("SORT".to_string(), TokenType::Sort);
1194 keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1195 keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1196 keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1197 keywords.insert("FOR".to_string(), TokenType::For);
1198 keywords.insert("ANY".to_string(), TokenType::Any);
1199 keywords.insert("SOME".to_string(), TokenType::Some);
1200 keywords.insert("ASOF".to_string(), TokenType::AsOf);
1201 keywords.insert("PERCENT".to_string(), TokenType::Percent);
1202 keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1203 keywords.insert("NO".to_string(), TokenType::No);
1204 keywords.insert("OTHERS".to_string(), TokenType::Others);
1205 keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1207 keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1209 keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1210 keywords.insert("DATABASE".to_string(), TokenType::Database);
1211 keywords.insert("FUNCTION".to_string(), TokenType::Function);
1212 keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1213 keywords.insert("PROC".to_string(), TokenType::Procedure);
1214 keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1215 keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1216 keywords.insert("TYPE".to_string(), TokenType::Type);
1217 keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1218 keywords.insert("RETURNS".to_string(), TokenType::Returns);
1219 keywords.insert("RETURNING".to_string(), TokenType::Returning);
1220 keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1221 keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1222 keywords.insert("COMMIT".to_string(), TokenType::Commit);
1223 keywords.insert("BEGIN".to_string(), TokenType::Begin);
1224 keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1225 keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1226 keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1227 keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1228 keywords.insert("BODY".to_string(), TokenType::Body);
1229 keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1230 keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1231 keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1232 keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1233 keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1234 keywords.insert("PRIOR".to_string(), TokenType::Prior);
1235 keywords.insert("MATCH".to_string(), TokenType::Match);
1237 keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1238 keywords.insert("MEASURES".to_string(), TokenType::Measures);
1239 keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1240 keywords.insert("DEFINE".to_string(), TokenType::Define);
1241 keywords.insert("RUNNING".to_string(), TokenType::Running);
1242 keywords.insert("FINAL".to_string(), TokenType::Final);
1243 keywords.insert("OWNED".to_string(), TokenType::Owned);
1244 keywords.insert("AFTER".to_string(), TokenType::After);
1245 keywords.insert("BEFORE".to_string(), TokenType::Before);
1246 keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1247 keywords.insert("EACH".to_string(), TokenType::Each);
1248 keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1249 keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1250 keywords.insert("OLD".to_string(), TokenType::Old);
1251 keywords.insert("NEW".to_string(), TokenType::New);
1252 keywords.insert("OF".to_string(), TokenType::Of);
1253 keywords.insert("CHECK".to_string(), TokenType::Check);
1254 keywords.insert("START".to_string(), TokenType::Start);
1255 keywords.insert("ENUM".to_string(), TokenType::Enum);
1256 keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1257 keywords.insert("RESTART".to_string(), TokenType::Restart);
1258 keywords.insert("DATE".to_string(), TokenType::Date);
1260 keywords.insert("TIME".to_string(), TokenType::Time);
1261 keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1262 keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1263 keywords.insert("GENERATED".to_string(), TokenType::Generated);
1264 keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1265 keywords.insert("ALWAYS".to_string(), TokenType::Always);
1266 keywords.insert("LOAD".to_string(), TokenType::Load);
1268 keywords.insert("LOCAL".to_string(), TokenType::Local);
1269 keywords.insert("INPATH".to_string(), TokenType::Inpath);
1270 keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1271 keywords.insert("SERDE".to_string(), TokenType::Serde);
1272 keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1273 keywords.insert("FORMAT".to_string(), TokenType::Format);
1274 keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1276 keywords.insert("SHOW".to_string(), TokenType::Show);
1278 keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1280 keywords.insert("COPY".to_string(), TokenType::Copy);
1282 keywords.insert("PUT".to_string(), TokenType::Put);
1283 keywords.insert("GET".to_string(), TokenType::Get);
1284 keywords.insert("EXEC".to_string(), TokenType::Execute);
1286 keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1287 keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1289 keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1290
1291 let mut single_tokens = std::collections::HashMap::new();
1292 single_tokens.insert('(', TokenType::LParen);
1293 single_tokens.insert(')', TokenType::RParen);
1294 single_tokens.insert('[', TokenType::LBracket);
1295 single_tokens.insert(']', TokenType::RBracket);
1296 single_tokens.insert('{', TokenType::LBrace);
1297 single_tokens.insert('}', TokenType::RBrace);
1298 single_tokens.insert(',', TokenType::Comma);
1299 single_tokens.insert('.', TokenType::Dot);
1300 single_tokens.insert(';', TokenType::Semicolon);
1301 single_tokens.insert('+', TokenType::Plus);
1302 single_tokens.insert('-', TokenType::Dash);
1303 single_tokens.insert('*', TokenType::Star);
1304 single_tokens.insert('/', TokenType::Slash);
1305 single_tokens.insert('%', TokenType::Percent);
1306 single_tokens.insert('&', TokenType::Amp);
1307 single_tokens.insert('|', TokenType::Pipe);
1308 single_tokens.insert('^', TokenType::Caret);
1309 single_tokens.insert('~', TokenType::Tilde);
1310 single_tokens.insert('<', TokenType::Lt);
1311 single_tokens.insert('>', TokenType::Gt);
1312 single_tokens.insert('=', TokenType::Eq);
1313 single_tokens.insert('!', TokenType::Exclamation);
1314 single_tokens.insert(':', TokenType::Colon);
1315 single_tokens.insert('@', TokenType::DAt);
1316 single_tokens.insert('#', TokenType::Hash);
1317 single_tokens.insert('$', TokenType::Dollar);
1318 single_tokens.insert('?', TokenType::Parameter);
1319
1320 let mut quotes = std::collections::HashMap::new();
1321 quotes.insert("'".to_string(), "'".to_string());
1322 quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());
1324
1325 let mut identifiers = std::collections::HashMap::new();
1326 identifiers.insert('"', '"');
1327 identifiers.insert('`', '`');
1328 let mut comments = std::collections::HashMap::new();
1332 comments.insert("--".to_string(), None);
1333 comments.insert("/*".to_string(), Some("*/".to_string()));
1334
1335 Self {
1336 keywords,
1337 single_tokens,
1338 quotes,
1339 identifiers,
1340 comments,
1341 string_escapes: vec!['\''],
1344 nested_comments: true,
1345 escape_follow_chars: vec![],
1347 b_prefix_is_byte_string: false,
1349 numeric_literals: std::collections::HashMap::new(),
1350 identifiers_can_start_with_digit: false,
1351 hex_number_strings: false,
1352 hex_string_is_integer_type: false,
1353 string_escapes_allowed_in_raw_strings: true,
1356 hash_comments: false,
1357 dollar_sign_is_identifier: false,
1358 insert_format_raw_data: false,
1359 }
1360 }
1361}
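
// A configuration sketch: start from the defaults and opt into extra syntax
// (hypothetical customization, not part of the original module):
#[cfg(test)]
mod tokenizer_config_example {
    use super::*;

    #[test]
    fn customizes_the_default_config() {
        let mut config = TokenizerConfig::default();
        // Allow MySQL-style `#` line comments and backslash escapes in strings.
        config.hash_comments = true;
        config.string_escapes.push('\\');
        let tokenizer = Tokenizer::new(config);
        assert!(tokenizer.tokenize("SELECT 'a\\'b' # note").is_ok());
    }
}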

/// SQL tokenizer driven by a [`TokenizerConfig`].
pub struct Tokenizer {
    config: TokenizerConfig,
}

impl Tokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self { config }
    }

    /// Creates a tokenizer with the default (generic SQL) configuration.
    pub fn default_config() -> Self {
        Self::new(TokenizerConfig::default())
    }

    /// Tokenizes `sql` into a flat list of tokens.
    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
        let mut state = TokenizerState::new(sql, &self.config);
        state.tokenize()
    }
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::default_config()
    }
}
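
// A minimal end-to-end sketch, assuming `scan_identifier_or_keyword` (later in
// this module) resolves keywords through the default keyword table
// (hypothetical test, not part of the original module):
#[cfg(test)]
mod tokenizer_usage_example {
    use super::*;

    #[test]
    fn tokenizes_a_simple_statement() {
        let tokens = Tokenizer::default()
            .tokenize("SELECT 1 -- answer")
            .expect("tokenization should succeed");
        assert!(!tokens.is_empty());
        // The trailing line comment is attached to the last token instead of
        // being emitted as a token of its own.
        assert!(tokens
            .last()
            .unwrap()
            .trailing_comments
            .iter()
            .any(|c| c.contains("answer")));
    }
}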

/// Internal scanning state for a single `tokenize` call.
struct TokenizerState<'a> {
    chars: Vec<char>,
    size: usize,
    tokens: Vec<Token>,
    start: usize,
    current: usize,
    line: usize,
    column: usize,
    /// Comments collected before the next token is emitted.
    comments: Vec<String>,
    config: &'a TokenizerConfig,
}

impl<'a> TokenizerState<'a> {
    fn new(sql: &str, config: &'a TokenizerConfig) -> Self {
        let chars: Vec<char> = sql.chars().collect();
        let size = chars.len();
        Self {
            chars,
            size,
            tokens: Vec::new(),
            start: 0,
            current: 0,
            line: 1,
            column: 1,
            comments: Vec::new(),
            config,
        }
    }

    fn tokenize(&mut self) -> Result<Vec<Token>> {
        while !self.is_at_end() {
            self.skip_whitespace();
            if self.is_at_end() {
                break;
            }

            self.start = self.current;
            self.scan_token()?;

            // When enabled, capture any raw inline data (e.g. after an
            // INSERT ... FORMAT style statement) as a single Var token.
            if self.config.insert_format_raw_data {
                if let Some(raw) = self.try_scan_insert_format_raw_data() {
                    if !raw.is_empty() {
                        self.start = self.current;
                        self.add_token_with_text(TokenType::Var, raw);
                    }
                }
            }
        }

        // Any comments gathered after the last token become its trailing
        // comments.
        if !self.comments.is_empty() {
            if let Some(last) = self.tokens.last_mut() {
                last.trailing_comments.extend(self.comments.drain(..));
            }
        }

        Ok(std::mem::take(&mut self.tokens))
    }

    fn is_at_end(&self) -> bool {
        self.current >= self.size
    }

    fn peek(&self) -> char {
        if self.is_at_end() {
            '\0'
        } else {
            self.chars[self.current]
        }
    }

    fn peek_next(&self) -> char {
        if self.current + 1 >= self.size {
            '\0'
        } else {
            self.chars[self.current + 1]
        }
    }

    fn advance(&mut self) -> char {
        let c = self.peek();
        self.current += 1;
        if c == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        c
    }

    fn skip_whitespace(&mut self) {
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                // Non-breaking and other Unicode whitespace.
                '\u{00A0}' | '\u{2000}'..='\u{200B}' | '\u{3000}' | '\u{FEFF}' => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    saw_newline = true;
                }
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // `/*+ ... */` is an optimizer hint, handled in scan_token.
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        break;
                    }
                    if self.scan_block_comment(saw_newline).is_err() {
                        return;
                    }
                }
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Don't treat `//` as a comment when it directly follows
                    // `:` or `/` (e.g. inside a URL).
                    let prev_non_ws = if self.current > 0 {
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        break;
                    }
                    self.scan_line_comment(saw_newline);
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                _ => break,
            }
        }
    }

    fn scan_hash_line_comment(&mut self) {
        self.advance(); // consume '#'
        let start = self.current;
        while !self.is_at_end() && self.peek() != '\n' {
            self.advance();
        }
        let comment: String = self.chars[start..self.current].iter().collect();
        let comment_text = comment.trim().to_string();
        if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        } else {
            self.comments.push(comment_text);
        }
    }

    fn scan_double_slash_comment(&mut self) {
        self.advance(); // consume first '/'
        self.advance(); // consume second '/'
        let start = self.current;
        while !self.is_at_end() && self.peek() != '\n' {
            self.advance();
        }
        let comment: String = self.chars[start..self.current].iter().collect();
        let comment_text = comment.trim().to_string();
        if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        } else {
            self.comments.push(comment_text);
        }
    }

    fn scan_line_comment(&mut self, after_newline: bool) {
        self.advance(); // consume first '-'
        self.advance(); // consume second '-'
        let start = self.current;
        while !self.is_at_end() && self.peek() != '\n' {
            self.advance();
        }
        let comment_text: String = self.chars[start..self.current].iter().collect();

        // A comment on its own line leads the next token; otherwise it trails
        // the previous one.
        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }
    }

    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // consume '/'
        self.advance(); // consume '*'
        let content_start = self.current;
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                if depth > 0 {
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
            ));
        }

        let content: String = self.chars[content_start..self.current].iter().collect();
        self.advance(); // consume '*'
        self.advance(); // consume '/'
        let comment_text = format!("/*{}*/", content);

        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }

        Ok(())
    }

    fn scan_hint(&mut self) -> Result<()> {
        self.advance(); // consume '/'
        self.advance(); // consume '*'
        self.advance(); // consume '+'
        let hint_start = self.current;

        while !self.is_at_end() {
            if self.peek() == '*' && self.peek_next() == '/' {
                break;
            }
            self.advance();
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated hint comment",
                self.line,
                self.column,
            ));
        }

        let hint_text: String = self.chars[hint_start..self.current].iter().collect();
        self.advance(); // consume '*'
        self.advance(); // consume '/'
        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());

        Ok(())
    }

    fn scan_positional_parameter(&mut self) -> Result<()> {
        self.advance(); // consume '$'
        let start = self.current;

        while !self.is_at_end() && self.peek().is_ascii_digit() {
            self.advance();
        }

        let number: String = self.chars[start..self.current].iter().collect();
        self.add_token_with_text(TokenType::Parameter, number);
        Ok(())
    }

    /// Attempts to scan a tagged dollar-quoted string such as `$tag$...$tag$`.
    /// Returns `Ok(None)` and restores the position if the input does not
    /// match that form.
    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
        let saved_pos = self.current;

        self.advance(); // consume opening '$'
        let tag_start = self.current;
        while !self.is_at_end()
            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
        {
            self.advance();
        }
        let tag: String = self.chars[tag_start..self.current].iter().collect();

        if self.is_at_end() || self.peek() != '$' {
            self.current = saved_pos;
            return Ok(None);
        }
        self.advance(); // consume the '$' closing the tag
        let content_start = self.current;
        let closing_tag = format!("${}$", tag);
        let closing_chars: Vec<char> = closing_tag.chars().collect();

        loop {
            if self.is_at_end() {
                self.current = saved_pos;
                return Ok(None);
            }

            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
                    self.current + j < self.size && self.chars[self.current + j] == ch
                });
                if matches {
                    let content: String = self.chars[content_start..self.current].iter().collect();
                    for _ in 0..closing_chars.len() {
                        self.advance();
                    }
                    // Store the tag and content separated by NUL; see
                    // `parse_dollar_string_token`.
                    let token_text = format!("{}\x00{}", tag, content);
                    self.add_token_with_text(TokenType::DollarString, token_text);
                    return Ok(Some(()));
                }
            }
            self.advance();
        }
    }

    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume first '$'
        self.advance(); // consume second '$'
        let start = self.current;
        while !self.is_at_end() {
            if self.peek() == '$'
                && self.current + 1 < self.size
                && self.chars[self.current + 1] == '$'
            {
                break;
            }
            self.advance();
        }

        let content: String = self.chars[start..self.current].iter().collect();

        if !self.is_at_end() {
            self.advance(); // consume first closing '$'
            self.advance(); // consume second closing '$'
        }

        self.add_token_with_text(TokenType::DollarString, content);
        Ok(())
    }

    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        if c == '\'' {
            // Triple-quoted '''...''' strings when the dialect supports them.
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        // Double quotes are strings only when they are not identifier quotes.
        if c == '"'
            && self.config.quotes.contains_key("\"")
            && !self.config.identifiers.contains_key(&'"')
        {
            return self.scan_double_quoted_string();
        }

        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // A leading dot can start a number (e.g. `.5`) unless it follows an
        // identifier-like character, in which case it is a member access.
        if c == '.' && self.peek_next().is_ascii_digit() {
            let prev_char = if self.current > 0 {
                self.chars[self.current - 1]
            } else {
                '\0'
            };
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // Optimizer hints: /*+ ... */
        if c == '/'
            && self.peek_next() == '*'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '+'
        {
            return self.scan_hint();
        }

        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // `$tag$...$tag$` strings or dollar-prefixed identifiers.
        if c == '$'
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || !self.peek_next().is_ascii())
        {
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
            if self.config.dollar_sign_is_identifier {
                return self.scan_dollar_identifier();
            }
        }

        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // Positional parameters such as $1.
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        if c == '$' && self.config.dollar_sign_is_identifier {
            return self.scan_dollar_identifier();
        }

        // T-SQL style #temp and @variable identifiers.
        if (c == '#' || c == '@')
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || self.peek_next() == '#')
        {
            return self.scan_tsql_identifier();
        }

        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        // Unicode minus sign.
        if c == '\u{2212}' {
            self.advance();
            self.add_token(TokenType::Dash);
            return Ok(());
        }

        // Unicode fraction slash.
        if c == '\u{2044}' {
            self.advance();
            self.add_token(TokenType::Slash);
            return Ok(());
        }

        // Curly single quotes open a string, curly double quotes an identifier.
        if c == '\u{2018}' || c == '\u{2019}' {
            return self.scan_unicode_quoted_string(c);
        }
        if c == '\u{201C}' || c == '\u{201D}' {
            return self.scan_unicode_quoted_identifier(c);
        }

        self.scan_identifier_or_keyword()
    }
1956
1957 fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
1958 let c = self.peek();
1959 let next = self.peek_next();
1960 let third = if self.current + 2 < self.size {
1961 self.chars[self.current + 2]
1962 } else {
1963 '\0'
1964 };
1965
1966 if c == '-' && next == '|' && third == '-' {
1969 self.advance();
1970 self.advance();
1971 self.advance();
1972 return Some(TokenType::Adjacent);
1973 }
1974
1975 if c == '|' && next == '|' && third == '/' {
1977 self.advance();
1978 self.advance();
1979 self.advance();
1980 return Some(TokenType::DPipeSlash);
1981 }
1982
1983 if c == '#' && next == '>' && third == '>' {
1985 self.advance();
1986 self.advance();
1987 self.advance();
1988 return Some(TokenType::DHashArrow);
1989 }
1990
1991 if c == '-' && next == '>' && third == '>' {
1993 self.advance();
1994 self.advance();
1995 self.advance();
1996 return Some(TokenType::DArrow);
1997 }
1998
1999 if c == '<' && next == '=' && third == '>' {
2001 self.advance();
2002 self.advance();
2003 self.advance();
2004 return Some(TokenType::NullsafeEq);
2005 }
2006
2007 if c == '<' && next == '-' && third == '>' {
2009 self.advance();
2010 self.advance();
2011 self.advance();
2012 return Some(TokenType::LrArrow);
2013 }
2014
2015 if c == '<' && next == '@' {
2017 self.advance();
2018 self.advance();
2019 return Some(TokenType::LtAt);
2020 }
2021
2022 if c == '@' && next == '>' {
2024 self.advance();
2025 self.advance();
2026 return Some(TokenType::AtGt);
2027 }
2028
2029 if c == '~' && next == '~' && third == '~' {
2031 self.advance();
2032 self.advance();
2033 self.advance();
2034 return Some(TokenType::Glob);
2035 }
2036
2037 if c == '~' && next == '~' && third == '*' {
2039 self.advance();
2040 self.advance();
2041 self.advance();
2042 return Some(TokenType::ILike);
2043 }
2044
2045 let fourth = if self.current + 3 < self.size {
2047 self.chars[self.current + 3]
2048 } else {
2049 '\0'
2050 };
2051 if c == '!' && next == '~' && third == '~' && fourth == '*' {
2052 self.advance();
2053 self.advance();
2054 self.advance();
2055 self.advance();
2056 return Some(TokenType::NotILike);
2057 }
2058
2059 if c == '!' && next == '~' && third == '~' {
2061 self.advance();
2062 self.advance();
2063 self.advance();
2064 return Some(TokenType::NotLike);
2065 }
2066
2067 if c == '!' && next == '~' && third == '*' {
2069 self.advance();
2070 self.advance();
2071 self.advance();
2072 return Some(TokenType::NotIRLike);
2073 }
2074
2075 if c == '!' && next == ':' && third == '>' {
2077 self.advance();
2078 self.advance();
2079 self.advance();
2080 return Some(TokenType::NColonGt);
2081 }
2082
2083 if c == '?' && next == ':' && third == ':' {
2085 self.advance();
2086 self.advance();
2087 self.advance();
2088 return Some(TokenType::QDColon);
2089 }
2090
2091 if c == '!' && next == '~' {
2093 self.advance();
2094 self.advance();
2095 return Some(TokenType::NotRLike);
2096 }
2097
2098 if c == '~' && next == '~' {
2100 self.advance();
2101 self.advance();
2102 return Some(TokenType::Like);
2103 }
2104
2105 if c == '~' && next == '*' {
2107 self.advance();
2108 self.advance();
2109 return Some(TokenType::IRLike);
2110 }
2111
2112 if c == ':' && next == ':' && third == '$' {
2115 self.advance();
2116 self.advance();
2117 self.advance();
2118 return Some(TokenType::DColonDollar);
2119 }
2120 if c == ':' && next == ':' && third == '%' {
2121 self.advance();
2122 self.advance();
2123 self.advance();
2124 return Some(TokenType::DColonPercent);
2125 }
2126 if c == ':' && next == ':' && third == '?' {
2127 self.advance();
2128 self.advance();
2129 self.advance();
2130 return Some(TokenType::DColonQMark);
2131 }
2132
        let token_type = match (c, next) {
            ('.', ':') => Some(TokenType::DotColon),
            ('=', '=') => Some(TokenType::Eq),
            ('<', '=') => Some(TokenType::Lte),
            ('>', '=') => Some(TokenType::Gte),
            ('!', '=') => Some(TokenType::Neq),
            ('<', '>') => Some(TokenType::Neq),
            ('^', '=') => Some(TokenType::Neq),
            ('<', '<') => Some(TokenType::LtLt),
            ('>', '>') => Some(TokenType::GtGt),
            ('|', '|') => Some(TokenType::DPipe),
            ('|', '/') => Some(TokenType::PipeSlash),
            (':', ':') => Some(TokenType::DColon),
            (':', '=') => Some(TokenType::ColonEq),
            (':', '>') => Some(TokenType::ColonGt),
            ('-', '>') => Some(TokenType::Arrow),
            ('=', '>') => Some(TokenType::FArrow),
            ('&', '&') => Some(TokenType::DAmp),
            ('&', '<') => Some(TokenType::AmpLt),
            ('&', '>') => Some(TokenType::AmpGt),
            ('@', '@') => Some(TokenType::AtAt),
            ('?', '|') => Some(TokenType::QMarkPipe),
            ('?', '&') => Some(TokenType::QMarkAmp),
            ('?', '?') => Some(TokenType::DQMark),
            ('#', '>') => Some(TokenType::HashArrow),
            ('#', '-') => Some(TokenType::HashDash),
            ('^', '@') => Some(TokenType::CaretAt),
            ('*', '*') => Some(TokenType::DStar),
            ('|', '>') => Some(TokenType::PipeGt),
            _ => None,
        };
2165
2166 if token_type.is_some() {
2167 self.advance();
2168 self.advance();
2169 }
2170
2171 token_type
2172 }

    fn scan_string(&mut self) -> Result<()> {
        self.advance(); // consume opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // Doubled quote escapes a literal quote.
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'),
                        'a' => value.push('\x07'),
                        'b' => value.push('\x08'),
                        'f' => value.push('\x0C'),
                        'v' => value.push('\x0B'),
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            value.push('%');
                        }
                        '_' => {
                            value.push('_');
                        }
                        _ => {
                            // Unknown escape: drop the backslash only for
                            // dialects with explicit escape-follow characters.
                            if !self.config.escape_follow_chars.is_empty() {
                                value.push(escaped);
                            } else {
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }

    fn scan_double_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // Doubled quote escapes a literal quote.
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'),
                        'a' => value.push('\x07'),
                        'b' => value.push('\x08'),
                        'f' => value.push('\x0C'),
                        'v' => value.push('\x0B'),
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            value.push('%');
                        }
                        '_' => {
                            value.push('_');
                        }
                        _ => {
                            if !self.config.escape_follow_chars.is_empty() {
                                value.push(escaped);
                            } else {
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2322
2323 fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2324 self.advance();
2326 self.advance();
2327 self.advance();
2328 let mut value = String::new();
2329
2330 while !self.is_at_end() {
2331 if self.peek() == quote_char
2333 && self.current + 1 < self.size
2334 && self.chars[self.current + 1] == quote_char
2335 && self.current + 2 < self.size
2336 && self.chars[self.current + 2] == quote_char
2337 {
2338 break;
2340 }
2341 value.push(self.advance());
2342 }
2343
2344 if self.is_at_end() {
2345 return Err(Error::tokenize(
2346 "Unterminated triple-quoted string",
2347 self.line,
2348 self.column,
2349 ));
2350 }
2351
2352 self.advance();
2354 self.advance();
2355 self.advance();
2356 let token_type = if quote_char == '"' {
2357 TokenType::TripleDoubleQuotedString
2358 } else {
2359 TokenType::TripleSingleQuotedString
2360 };
2361 self.add_token_with_text(token_type, value);
2362 Ok(())
2363 }

    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
        self.advance(); // consume opening quote
        let mut value = String::new();

        loop {
            if self.is_at_end() {
                return Err(Error::tokenize(
                    "Unterminated identifier",
                    self.line,
                    self.column,
                ));
            }
            if self.peek() == end_quote {
                if self.peek_next() == end_quote {
                    // Doubled quote escapes a literal quote character.
                    value.push(end_quote);
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else {
                value.push(self.peek());
                self.advance();
            }
        }

        self.advance(); // consume closing quote
        self.add_token_with_text(TokenType::QuotedIdentifier, value);
        Ok(())
    }
2397
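    /// Scan a string wrapped in curly (typographic) single quotes, e.g. ‘...’.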
    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let start = self.current;
        // Both curly single-quote openers currently close with U+2019.
        let close_quote = match open_quote {
            '\u{2018}' => '\u{2019}',
            _ => '\u{2019}',
        };
        while !self.is_at_end() && self.peek() != close_quote {
            self.advance();
        }
        let value: String = self.chars[start..self.current].iter().collect();
        if !self.is_at_end() {
            self.advance(); // consume the closing quote
        }
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }

    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let start = self.current;
        // Both curly double-quote openers currently close with U+201D.
        let close_quote = match open_quote {
            '\u{201C}' => '\u{201D}',
            _ => '\u{201D}',
        };
        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
            self.advance();
        }
        let value: String = self.chars[start..self.current].iter().collect();
        if !self.is_at_end() {
            self.advance(); // consume the closing quote
        }
        self.add_token_with_text(TokenType::QuotedIdentifier, value);
        Ok(())
    }

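    /// Scan a numeric literal: an optional 0x hex form (integer or hex float),
    /// decimal digits with optional '_' separators, a fraction, an exponent,
    /// and dialect-specific numeric-literal suffixes.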
    fn scan_number(&mut self) -> Result<()> {
        if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
            let next = if self.current + 1 < self.size {
                self.chars[self.current + 1]
            } else {
                '\0'
            };
            if next == 'x' || next == 'X' {
                // Consume "0x" and scan the hex digits.
                self.advance();
                self.advance();
                let hex_start = self.current;
                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
                        break;
                    }
                    self.advance();
                }
                if self.current > hex_start {
                    let mut is_hex_float = false;
                    if !self.is_at_end() && self.peek() == '.' {
                        let after_dot = if self.current + 1 < self.size {
                            self.chars[self.current + 1]
                        } else {
                            '\0'
                        };
                        if after_dot.is_ascii_hexdigit() {
                            is_hex_float = true;
                            self.advance(); // consume the '.'
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                self.advance();
                            }
                        }
                    }
                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
                        // Binary exponent of a hex float, e.g. 0x1.8p3.
                        is_hex_float = true;
                        self.advance();
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
                            self.advance();
                        }
                        while !self.is_at_end() && self.peek().is_ascii_digit() {
                            self.advance();
                        }
                    }
                    if is_hex_float {
                        let full_text: String =
                            self.chars[self.start..self.current].iter().collect();
                        self.add_token_with_text(TokenType::Number, full_text);
                    } else if self.config.hex_string_is_integer_type {
                        let hex_value: String =
                            self.chars[hex_start..self.current].iter().collect();
                        self.add_token_with_text(TokenType::HexNumber, hex_value);
                    } else {
                        let hex_value: String =
                            self.chars[hex_start..self.current].iter().collect();
                        self.add_token_with_text(TokenType::HexString, hex_value);
                    }
                    return Ok(());
                }
                // No hex digits followed "0x": rewind to just past the leading zero.
                self.current = self.start + 1;
            }
        }

        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
                break;
            }
            self.advance();
        }

        if self.peek() == '.' {
            let next = self.peek_next();
            if next != '.' {
                self.advance(); // consume the '.'
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                        break;
                    }
                    self.advance();
                }
            }
        }

        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();

        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
            let next_char = self.peek().to_uppercase().to_string();
            let suffix_match = if self.current + 1 < self.size {
                let two_char: String = vec![self.chars[self.current], self.chars[self.current + 1]]
                    .iter()
                    .collect::<String>()
                    .to_uppercase();
                if self.config.numeric_literals.contains_key(&two_char) {
                    let after_suffix = if self.current + 2 < self.size {
                        self.chars[self.current + 2]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((two_char, 2))
                    } else {
                        None
                    }
                } else if self.config.numeric_literals.contains_key(&next_char) {
                    let after_suffix = if self.current + 1 < self.size {
                        self.chars[self.current + 1]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((next_char, 1))
                    } else {
                        None
                    }
                } else {
                    None
                }
            } else if self.config.numeric_literals.contains_key(&next_char) {
                Some((next_char, 1))
            } else {
                None
            };

            if let Some((suffix, len)) = suffix_match {
                for _ in 0..len {
                    self.advance();
                }
                let type_name = self
                    .config
                    .numeric_literals
                    .get(&suffix)
                    .expect("suffix verified by contains_key above")
                    .clone();
                let combined = format!("{}::{}", text, type_name);
                self.add_token_with_text(TokenType::Number, combined);
                return Ok(());
            }
        }

        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
            let next = self.peek();
            if next.is_alphabetic() || next == '_' {
                while !self.is_at_end() {
                    let ch = self.peek();
                    if ch.is_alphanumeric() || ch == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                let ident_text: String = self.chars[self.start..self.current].iter().collect();
                self.add_token_with_text(TokenType::Identifier, ident_text);
                return Ok(());
            }
        }

        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }

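    /// Scan a numeric literal that begins with a decimal point, e.g. .25 or .5e10.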
    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
        self.advance(); // consume the leading '.'

        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                break;
            }
            self.advance();
        }

        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }

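    /// Scan an identifier, keyword, or prefixed string literal (N'', E'', X'',
    /// B'', R'', U&''). Words that are not keywords become `TokenType::Var`.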
    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
        let first_char = self.peek();
        if !first_char.is_alphanumeric() && first_char != '_' {
            let c = self.advance();
            return Err(Error::tokenize(
                format!("Unexpected character: '{}'", c),
                self.line,
                self.column,
            ));
        }

        while !self.is_at_end() {
            let c = self.peek();
            if c == '#' {
                // '#' only continues an identifier when it does not start
                // the '#>' or '#-' operators.
                let next_c = if self.current + 1 < self.size {
                    self.chars[self.current + 1]
                } else {
                    '\0'
                };
                if next_c == '>' || next_c == '-' {
                    break;
                }
                self.advance();
            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        let upper = text.to_uppercase();

        if upper == "NOT" && self.peek() == '=' {
            self.advance(); // consume the '='
            self.add_token(TokenType::Neq);
            return Ok(());
        }

        // Prefixed string literals such as N'...', E'...', X'...', B'...' and R'...'.
        let next_char = self.peek();
        let is_single_quote = next_char == '\'';
        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
        let is_double_quote_for_raw = next_char == '"';

        if upper == "R" && (is_single_quote || is_double_quote_for_raw) {
            let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the opening quote
            if self.peek() == quote_char && self.peek_next() == quote_char {
                // Raw triple-quoted string, e.g. r'''...'''.
                self.advance();
                self.advance();
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            } else {
                let string_value = self.scan_raw_string_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            }
            return Ok(());
        }

        if is_single_quote || is_double_quote {
            match upper.as_str() {
                "N" => {
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::NationalString, string_value);
                    return Ok(());
                }
                "E" => {
                    // Preserve the original casing of the prefix in the token text.
                    let lowercase = text == "e";
                    let prefix = if lowercase { "e:" } else { "E:" };
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content_with_escapes(true)?;
                    self.add_token_with_text(
                        TokenType::EscapeString,
                        format!("{}{}", prefix, string_value),
                    );
                    return Ok(());
                }
                "X" => {
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::HexString, string_value);
                    return Ok(());
                }
                "B" if is_double_quote => {
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_double_quoted_string_content()?;
                    self.add_token_with_text(TokenType::ByteString, string_value);
                    return Ok(());
                }
                "B" if is_single_quote => {
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content()?;
                    if self.config.b_prefix_is_byte_string {
                        self.add_token_with_text(TokenType::ByteString, string_value);
                    } else {
                        self.add_token_with_text(TokenType::BitString, string_value);
                    }
                    return Ok(());
                }
                _ => {}
            }
        }

        if upper == "U"
            && self.peek() == '&'
            && self.current + 1 < self.size
            && self.chars[self.current + 1] == '\''
        {
            // Unicode string literal U&'...'.
            self.advance(); // consume the '&'
            self.advance(); // consume the opening quote
            let string_value = self.scan_string_content()?;
            self.add_token_with_text(TokenType::UnicodeString, string_value);
            return Ok(());
        }

        let token_type = self
            .config
            .keywords
            .get(&upper)
            .copied()
            .unwrap_or(TokenType::Var);

        self.add_token_with_text(token_type, text);
        Ok(())
    }

    fn scan_string_content_with_escapes(
        &mut self,
        force_backslash_escapes: bool,
    ) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes =
            force_backslash_escapes || self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // A doubled quote is an escaped quote.
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                // Keep the escape sequence verbatim.
                value.push(self.advance());
                if !self.is_at_end() {
                    value.push(self.advance());
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume the closing quote
        Ok(value)
    }

    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }

    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // A doubled quote is an escaped quote.
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        '\\' => value.push('\\'),
                        '"' => value.push('"'),
                        '\'' => value.push('\''),
                        'x' => {
                            // Two-digit hex escape, e.g. \x41.
                            let mut hex = String::new();
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                value.push(byte as char);
                            } else {
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        _ => {
                            value.push('\\');
                            value.push(escaped);
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume the closing quote
        Ok(value)
    }

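    /// Scan the body of a raw string (r'...' or r"..."), leaving backslashes
    /// untouched except where the dialect allows escaping the quote itself.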
    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == quote_char {
                if self.peek_next() == quote_char {
                    // A doubled quote is an escaped quote.
                    value.push(quote_char);
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\'
                && self.peek_next() == quote_char
                && self.config.string_escapes_allowed_in_raw_strings
            {
                // Backslash-escaped quote, only if the dialect allows it in raw strings.
                value.push(quote_char);
                self.advance();
                self.advance();
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated raw string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume the closing quote
        Ok(value)
    }

    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == quote_char && self.peek_next() == quote_char {
                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
                    // Consume the three closing quotes.
                    self.advance();
                    self.advance();
                    self.advance();
                    return Ok(value);
                }
            }
            let ch = self.advance();
            value.push(ch);
        }

        Err(Error::tokenize(
            "Unterminated raw triple-quoted string",
            self.line,
            self.column,
        ))
    }

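    /// Scan a '$'-prefixed variable or identifier, e.g. $1 or $my_var.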
    fn scan_dollar_identifier(&mut self) -> Result<()> {
        self.advance(); // consume the leading '$'

        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' {
                self.advance();
            } else {
                break;
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }

    fn scan_tsql_identifier(&mut self) -> Result<()> {
        let first = self.advance();

        // Global temporary objects start with '##'.
        if first == '#' && self.peek() == '#' {
            self.advance();
        }

        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }

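    /// After `INSERT ... FORMAT <name>`, the remainder of the statement up to
    /// a blank line is raw data rather than SQL; capture it as a single string.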
    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
        let len = self.tokens.len();
        if len < 3 {
            return None;
        }

        // The last token must be a format name (not VALUES).
        let last = &self.tokens[len - 1];
        if last.text.eq_ignore_ascii_case("VALUES") {
            return None;
        }
        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
            return None;
        }

        // The token before it must be FORMAT.
        let format_tok = &self.tokens[len - 2];
        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
            return None;
        }

        // There must be a recent INSERT token.
        let has_insert = self.tokens[..len - 2]
            .iter()
            .rev()
            .take(20)
            .any(|t| t.token_type == TokenType::Insert);
        if !has_insert {
            return None;
        }

        // Everything up to a blank line (or the end of input) is raw data.
        let raw_start = self.current;
        while !self.is_at_end() {
            let c = self.peek();
            if c == '\n' {
                let saved = self.current;
                self.advance(); // consume the newline
                while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                if self.is_at_end() || self.peek() == '\n' {
                    let raw: String = self.chars[raw_start..saved].iter().collect();
                    return Some(raw.trim().to_string());
                }
            } else {
                self.advance();
            }
        }

        let raw: String = self.chars[raw_start..self.current].iter().collect();
        let trimmed = raw.trim().to_string();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed)
        }
    }

    fn add_token(&mut self, token_type: TokenType) {
        let text: String = self.chars[self.start..self.current].iter().collect();
        self.add_token_with_text(token_type, text);
    }

    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
        let span = Span::new(self.start, self.current, self.line, self.column);
        let mut token = Token::new(token_type, text, span);
        token.comments.append(&mut self.comments);
        self.tokens.push(token);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_select() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
    }

    #[test]
    fn test_select_with_identifier() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();

        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Var);
        assert_eq!(tokens[1].text, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Var);
        assert_eq!(tokens[3].text, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Var);
        assert_eq!(tokens[5].text, "t");
    }

    #[test]
    fn test_string_literal() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "hello");
    }

    #[test]
    fn test_escaped_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "it's");
    }

    #[test]
    fn test_comments() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].trailing_comments.len(), 1);
        assert_eq!(tokens[0].trailing_comments[0], " comment");
    }

    #[test]
    fn test_comment_in_and_chain() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
        let ast = Parser::parse_sql(sql).unwrap();
        let mut gen = Generator::default();
        let output = gen.generate(&ast[0]).unwrap();
        assert_eq!(
            output,
            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
        );
    }

    #[test]
    fn test_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();

        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[1].token_type, TokenType::Plus);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[3].token_type, TokenType::Star);
        assert_eq!(tokens[4].token_type, TokenType::Number);
    }

    #[test]
    fn test_comparison_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();

        assert_eq!(tokens[1].token_type, TokenType::Lte);
        assert_eq!(tokens[3].token_type, TokenType::Gte);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

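    // Editor's addition (sketch): scan_identifier_or_keyword folds a "NOT"
    // immediately followed by '=' into a single Neq token. This assumes the
    // default dispatcher routes alphabetic input through that function.
    #[test]
    fn test_not_eq_operator() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("a NOT= b").unwrap();

        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].token_type, TokenType::Var);
        assert_eq!(tokens[1].token_type, TokenType::Neq);
        assert_eq!(tokens[2].token_type, TokenType::Var);
    }
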
    #[test]
    fn test_national_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("N'abc'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for N'abc', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].text, "abc");
    }

    #[test]
    fn test_hex_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for X'ABCD', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::HexString);
        assert_eq!(tokens[0].text, "ABCD");
    }

    #[test]
    fn test_bit_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("B'01010'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for B'01010', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::BitString);
        assert_eq!(tokens[0].text, "01010");
    }

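    // Editor's addition (sketch): the R/r prefix routes into
    // scan_raw_string_content, so backslashes that do not precede the quote
    // are preserved verbatim. This assumes the default config leaves the
    // prefix handling in scan_identifier_or_keyword unchanged.
    #[test]
    fn test_raw_string_prefix() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("r'a\\nb'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for r'a\\nb', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::RawString);
        assert_eq!(tokens[0].text, "a\\nb");
    }
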
    #[test]
    fn test_trailing_dot_number() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
        assert_eq!(
            tokens.len(),
            2,
            "Expected 2 tokens for 'SELECT 1.', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");

        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
        assert_eq!(tokens[1].text, "1.5");

        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");
        assert_eq!(tokens[2].token_type, TokenType::Var);

        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
        assert_eq!(tokens[2].token_type, TokenType::Dot);
        assert_eq!(tokens[3].token_type, TokenType::Dot);
        assert_eq!(tokens[4].token_type, TokenType::Number);
        assert_eq!(tokens[4].text, "2");
    }

    #[test]
    fn test_leading_dot_number() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize(".25").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.25', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".25");

        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
        assert_eq!(
            tokens.len(),
            4,
            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Sample);
        assert_eq!(tokens[1].token_type, TokenType::LParen);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[2].text, ".25");
        assert_eq!(tokens[3].token_type, TokenType::RParen);

        let tokens = tokenizer.tokenize(".5e10").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.5e10', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".5e10");

        let tokens = tokenizer.tokenize("a.b").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'a.b', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Dot);
    }

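    // Editor's addition (sketch): scan_number keeps '_' digit separators in the
    // token text and consumes decimal exponents with an optional sign. Both
    // inputs end at the number, so no dialect-specific suffix handling applies.
    #[test]
    fn test_number_separators_and_exponents() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("SELECT 1_000").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1_000");

        let tokens = tokenizer.tokenize("SELECT 1.5e-3").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.5e-3");
    }
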
    #[test]
    fn test_unrecognized_character() {
        let tokenizer = Tokenizer::default();

        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
        assert!(
            result.is_ok(),
            "Curly quotes should be tokenized as strings"
        );

        let result = tokenizer.tokenize("SELECT • FROM t");
        assert!(result.is_err());
    }

    #[test]
    fn test_colon_eq_tokenization() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("a := 1").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].token_type, TokenType::Var);
        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
        assert_eq!(tokens[2].token_type, TokenType::Number);

        let tokens = tokenizer.tokenize("a:b").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));

        let tokens = tokenizer.tokenize("a::INT").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
    }

    #[test]
    fn test_colon_eq_parsing() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
            .expect("Failed to parse MySQL @var := expr");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := 1, @var2");

        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
            .expect("Failed to parse MySQL @var2 := @var1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1, @var2 := @var1");

        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
            .expect("Failed to parse MySQL @var := COUNT(*)");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");

        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SET @var1 = 1");

        let ast =
            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "UNION_VALUE(k1 := 1)");

        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
            .expect("Failed to parse UNNEST with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");

        let ast =
            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo");

        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
            .expect("Failed to parse DuckDB multiple prefix aliases");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
    }

    #[test]
    fn test_colon_eq_dialect_roundtrip() {
        use crate::dialects::{Dialect, DialectType};

        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
            let d = Dialect::get(dialect);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

        check(
            DialectType::DuckDB,
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
            None,
        );
        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

        {
            // Only check that this parses; the roundtrip is covered elsewhere.
            let d = Dialect::get(DialectType::DuckDB);
            let ast = d
                .parse("STRUCT_PACK(a := 'b')::json")
                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
        }

        check(
            DialectType::DuckDB,
            "SELECT foo: 1",
            Some("SELECT 1 AS foo"),
        );
        check(
            DialectType::DuckDB,
            "SELECT foo: 1, bar: 2, baz: 3",
            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
        );
    }

    #[test]
    fn test_comment_roundtrip() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        fn check_roundtrip(sql: &str) -> Option<String> {
            let ast = match Parser::parse_sql(sql) {
                Ok(a) => a,
                Err(e) => return Some(format!("Parse error: {:?}", e)),
            };
            if ast.is_empty() {
                return Some("Empty AST".to_string());
            }
            let mut generator = Generator::default();
            let output = match generator.generate(&ast[0]) {
                Ok(o) => o,
                Err(e) => return Some(format!("Gen error: {:?}", e)),
            };
            if output == sql {
                None
            } else {
                Some(format!(
                    "Mismatch:\n input: {}\n output: {}",
                    sql, output
                ))
            }
        }

        let tests = vec![
            "SELECT c /* c1 /* c2 */ c3 */",
            "SELECT c /* c1 /* c2 /* c3 */ */ */",
            "SELECT c /* c1 */ AS alias /* c2 */",
            "SELECT a /* x */, b /* x */",
            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
            "SELECT * FROM foo /* x */, bla /* x */",
            "SELECT 1 /* comment */ + 1",
            "SELECT 1 /* c1 */ + 2 /* c2 */",
            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
            "SELECT CAST(x AS INT) /* comment */ FROM foo",
            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
            "/* comment */ CREATE TABLE foo AS SELECT 1",
            "INSERT INTO foo SELECT * FROM bar /* comment */",
            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
        ];

        let mut failures = Vec::new();
        for sql in tests {
            if let Some(e) = check_roundtrip(sql) {
                failures.push(e);
            }
        }

        if !failures.is_empty() {
            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
        }
    }

    #[test]
    fn test_dollar_quoted_string_parsing() {
        use crate::dialects::{Dialect, DialectType};

        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
        assert_eq!(tag, Some("FOO".to_string()));
        assert_eq!(content, "content here");

        let (tag, content) = super::parse_dollar_string_token("just content");
        assert_eq!(tag, None);
        assert_eq!(content, "just content");

        fn check_databricks(sql: &str, expected: Option<&str>) {
            let d = Dialect::get(DialectType::Databricks);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n return x+1$$",
            None
        );

        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n return x+1$FOO$",
            None
        );
    }
}