use crate::error::{Error, Result};
use serde::{Deserialize, Serialize};
use std::fmt;

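/// Splits the stored text of a tagged dollar-quoted string back into its tag
/// and body. Tagged dollar strings are tokenized as `"{tag}\x00{content}"`
/// (see `try_scan_tagged_dollar_string`), so the first NUL byte is the
/// separator; text without a NUL is returned as-is with no tag.
///
/// Illustrative sketch (not compiled as a doctest):
///
/// ```ignore
/// let (tag, body) = parse_dollar_string_token("fn\0SELECT 1");
/// assert_eq!(tag.as_deref(), Some("fn"));
/// assert_eq!(body, "SELECT 1");
/// ```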
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    if let Some(pos) = text.find('\x00') {
        let tag = &text[..pos];
        let content = &text[pos + 1..];
        (Some(tag.to_string()), content.to_string())
    } else {
        (None, text.to_string())
    }
}
22
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct Span {
    /// Start offset of the token in the source text.
    pub start: usize,
    /// End offset of the token in the source text.
    pub end: usize,
    /// 1-based line number where the token starts.
    pub line: usize,
    /// 1-based column number where the token starts.
    pub column: usize,
}
35
impl Span {
    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
        Self {
            start,
            end,
            line,
            column,
        }
    }
}
46
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The kind of token.
    pub token_type: TokenType,
    /// The token's text, with quotes stripped and escapes resolved where applicable.
    pub text: String,
    /// Location of the token in the source.
    pub span: Span,
    /// Comments attached before this token.
    #[serde(default)]
    pub comments: Vec<String>,
    /// Comments attached after this token.
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
63
64impl Token {
65 pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
67 Self {
68 token_type,
69 text: text.into(),
70 span,
71 comments: Vec::new(),
72 trailing_comments: Vec::new(),
73 }
74 }
75
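    /// Convenience constructor for a numeric literal token with a default span.
    ///
    /// Illustrative sketch (not compiled as a doctest):
    ///
    /// ```ignore
    /// let token = Token::number(42);
    /// assert_eq!(token.token_type, TokenType::Number);
    /// assert_eq!(token.text, "42");
    /// ```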
76 pub fn number(n: i64) -> Self {
78 Self::new(TokenType::Number, n.to_string(), Span::default())
79 }
80
81 pub fn string(s: impl Into<String>) -> Self {
83 Self::new(TokenType::String, s, Span::default())
84 }
85
86 pub fn identifier(s: impl Into<String>) -> Self {
88 Self::new(TokenType::Identifier, s, Span::default())
89 }
90
91 pub fn var(s: impl Into<String>) -> Self {
93 Self::new(TokenType::Var, s, Span::default())
94 }
95
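    /// Builder-style helper that appends a comment to the token's `comments` list.
    ///
    /// Illustrative sketch (not compiled as a doctest):
    ///
    /// ```ignore
    /// let token = Token::identifier("price").with_comment("unit: USD");
    /// assert_eq!(token.comments, vec!["unit: USD".to_string()]);
    /// ```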
96 pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
98 self.comments.push(comment.into());
99 self
100 }
101}
102
103impl fmt::Display for Token {
104 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
105 write!(f, "{:?}({})", self.token_type, self.text)
106 }
107}
108
109#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
111#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
112#[repr(u16)]
113pub enum TokenType {
114 LParen,
116 RParen,
117 LBracket,
118 RBracket,
119 LBrace,
120 RBrace,
121 Comma,
122 Dot,
123 Dash,
124 Plus,
125 Colon,
126 DotColon,
127 DColon,
128 DColonDollar,
129 DColonPercent,
130 DColonQMark,
131 DQMark,
132 Semicolon,
133 Star,
134 Backslash,
135 Slash,
136 Lt,
137 Lte,
138 Gt,
139 Gte,
140 Not,
141 Eq,
142 Neq,
143 NullsafeEq,
144 ColonEq,
145 ColonGt,
146 NColonGt,
147 And,
148 Or,
149 Amp,
150 DPipe,
151 PipeGt,
152 Pipe,
153 PipeSlash,
154 DPipeSlash,
155 Caret,
156 CaretAt,
    LtLt,
    GtGt,
    Tilde,
160 Arrow,
161 DArrow,
162 FArrow,
163 Hash,
164 HashArrow,
165 DHashArrow,
166 LrArrow,
167 DAt,
168 AtAt,
169 LtAt,
170 AtGt,
171 Dollar,
172 Parameter,
173 Session,
174 SessionParameter,
175 SessionUser,
176 DAmp,
177 AmpLt,
178 AmpGt,
179 Adjacent,
180 Xor,
181 DStar,
182 QMarkAmp,
183 QMarkPipe,
184 HashDash,
185 Exclamation,
186
187 UriStart,
188 BlockStart,
189 BlockEnd,
190 Space,
191 Break,
192
    BlockComment,
    LineComment,
    String,
    DollarString,
    TripleDoubleQuotedString,
    TripleSingleQuotedString,
    Number,
203 Identifier,
204 QuotedIdentifier,
205 Database,
206 Column,
207 ColumnDef,
208 Schema,
209 Table,
210 Warehouse,
211 Stage,
212 Streamlit,
213 Var,
214 BitString,
215 HexString,
216 HexNumber,
218 ByteString,
219 NationalString,
    EscapeString,
    RawString,
222 HeredocString,
223 HeredocStringAlternative,
224 UnicodeString,
225
226 Bit,
228 Boolean,
229 TinyInt,
230 UTinyInt,
231 SmallInt,
232 USmallInt,
233 MediumInt,
234 UMediumInt,
235 Int,
236 UInt,
237 BigInt,
238 UBigInt,
239 BigNum,
240 Int128,
241 UInt128,
242 Int256,
243 UInt256,
244 Float,
245 Double,
246 UDouble,
247 Decimal,
248 Decimal32,
249 Decimal64,
250 Decimal128,
251 Decimal256,
252 DecFloat,
253 UDecimal,
254 BigDecimal,
255 Char,
256 NChar,
257 VarChar,
258 NVarChar,
259 BpChar,
260 Text,
261 MediumText,
262 LongText,
263 Blob,
264 MediumBlob,
265 LongBlob,
266 TinyBlob,
267 TinyText,
268 Name,
269 Binary,
270 VarBinary,
271 Json,
272 JsonB,
273 Time,
274 TimeTz,
275 TimeNs,
276 Timestamp,
277 TimestampTz,
278 TimestampLtz,
279 TimestampNtz,
280 TimestampS,
281 TimestampMs,
282 TimestampNs,
283 DateTime,
284 DateTime2,
285 DateTime64,
286 SmallDateTime,
287 Date,
288 Date32,
289 Int4Range,
290 Int4MultiRange,
291 Int8Range,
292 Int8MultiRange,
293 NumRange,
294 NumMultiRange,
295 TsRange,
296 TsMultiRange,
297 TsTzRange,
298 TsTzMultiRange,
299 DateRange,
300 DateMultiRange,
301 Uuid,
302 Geography,
303 GeographyPoint,
304 Nullable,
305 Geometry,
306 Point,
307 Ring,
308 LineString,
309 LocalTime,
310 LocalTimestamp,
311 SysTimestamp,
312 MultiLineString,
313 Polygon,
314 MultiPolygon,
315 HllSketch,
316 HStore,
317 Super,
318 Serial,
319 SmallSerial,
320 BigSerial,
321 Xml,
322 Year,
323 UserDefined,
324 Money,
325 SmallMoney,
326 RowVersion,
327 Image,
328 Variant,
329 Object,
330 Inet,
331 IpAddress,
332 IpPrefix,
333 Ipv4,
334 Ipv6,
335 Enum,
336 Enum8,
337 Enum16,
338 FixedString,
339 LowCardinality,
340 Nested,
341 AggregateFunction,
342 SimpleAggregateFunction,
343 TDigest,
344 Unknown,
345 Vector,
346 Dynamic,
347 Void,
348
349 Add,
351 Alias,
352 Alter,
353 All,
354 Anti,
355 Any,
356 Apply,
357 Array,
358 Asc,
359 AsOf,
360 Attach,
361 AutoIncrement,
362 Begin,
363 Between,
364 BulkCollectInto,
365 Cache,
366 Cascade,
367 Case,
368 CharacterSet,
369 Cluster,
370 ClusterBy,
371 Collate,
372 Command,
373 Comment,
374 Commit,
375 Preserve,
376 Connect,
377 ConnectBy,
378 Constraint,
379 Copy,
380 Create,
381 Cross,
382 Cube,
383 CurrentDate,
384 CurrentDateTime,
385 CurrentSchema,
386 CurrentTime,
387 CurrentTimestamp,
388 CurrentUser,
389 CurrentRole,
390 CurrentCatalog,
391 Declare,
392 Default,
393 Delete,
394 Desc,
395 Describe,
396 Detach,
397 Dictionary,
398 Distinct,
399 Distribute,
400 DistributeBy,
401 Div,
402 Drop,
403 Else,
404 End,
405 Escape,
406 Except,
407 Execute,
408 Exists,
409 False,
410 Fetch,
411 File,
412 FileFormat,
413 Filter,
414 Final,
415 First,
416 For,
417 Force,
418 ForeignKey,
419 Format,
420 From,
421 Full,
422 Function,
423 Get,
424 Glob,
425 Global,
426 Grant,
427 GroupBy,
428 GroupingSets,
429 Having,
430 Hint,
431 Ignore,
432 ILike,
433 In,
434 Index,
435 IndexedBy,
436 Inner,
437 Input,
438 Insert,
439 Install,
440 Intersect,
441 Interval,
442 Into,
443 Inpath,
444 InputFormat,
445 Introducer,
446 IRLike,
447 Is,
448 IsNull,
449 Join,
450 JoinMarker,
451 Keep,
452 Key,
453 Kill,
454 Lambda,
455 Language,
456 Lateral,
457 Left,
458 Like,
    NotLike,
    NotILike,
    NotRLike,
    NotIRLike,
    Limit,
464 List,
465 Load,
466 Local,
467 Lock,
468 Map,
469 Match,
470 MatchCondition,
471 MatchRecognize,
472 MemberOf,
473 Materialized,
474 Merge,
475 Mod,
476 Model,
477 Natural,
478 Next,
479 NoAction,
480 Nothing,
481 NotNull,
482 Null,
483 ObjectIdentifier,
484 Offset,
485 On,
486 Only,
487 Operator,
488 OrderBy,
489 OrderSiblingsBy,
490 Ordered,
491 Ordinality,
492 Out,
493 Outer,
494 Output,
495 Over,
496 Overlaps,
497 Overwrite,
498 Partition,
499 PartitionBy,
500 Percent,
501 Pivot,
502 Placeholder,
503 Positional,
504 Pragma,
505 Prewhere,
506 PrimaryKey,
507 Procedure,
508 Properties,
509 PseudoType,
510 Put,
511 Qualify,
512 Quote,
513 QDColon,
514 Range,
515 Recursive,
516 Refresh,
517 Rename,
518 Replace,
519 Returning,
520 Revoke,
521 References,
522 Restrict,
523 Right,
524 RLike,
525 Rollback,
526 Rollup,
527 Row,
528 Rows,
529 Select,
530 Semi,
531 Savepoint,
532 Separator,
533 Sequence,
534 Serde,
535 SerdeProperties,
536 Set,
537 Settings,
538 Show,
539 Siblings,
540 SimilarTo,
541 Some,
542 Sort,
543 SortBy,
544 SoundsLike,
545 StartWith,
546 StorageIntegration,
547 StraightJoin,
548 Struct,
549 Summarize,
550 TableSample,
551 Sample,
552 Bernoulli,
553 System,
554 Block,
555 Seed,
556 Repeatable,
557 Tag,
558 Temporary,
559 Transaction,
560 To,
561 Top,
562 Then,
563 True,
564 Truncate,
565 Uncache,
566 Union,
567 Unnest,
568 Unpivot,
569 Update,
570 Use,
571 Using,
572 Values,
573 View,
574 SemanticView,
575 Volatile,
576 When,
577 Where,
578 Window,
579 With,
580 Ties,
581 Exclude,
582 No,
583 Others,
584 Unique,
585 UtcDate,
586 UtcTime,
587 UtcTimestamp,
588 VersionSnapshot,
589 TimestampSnapshot,
590 Option,
591 Sink,
592 Source,
593 Analyze,
594 Namespace,
595 Export,
596 As,
597 By,
598 Nulls,
599 Respect,
600 Last,
601 If,
602 Cast,
603 TryCast,
604 SafeCast,
605 Count,
606 Extract,
607 Substring,
608 Trim,
609 Leading,
610 Trailing,
611 Both,
612 Position,
613 Overlaying,
614 Placing,
615 Treat,
616 Within,
617 Group,
618 Order,
619
620 Unbounded,
622 Preceding,
623 Following,
624 Current,
625 Groups,
626
627 Trigger,
629 Type,
630 Domain,
631 Returns,
632 Body,
633 Increment,
634 Minvalue,
635 Maxvalue,
636 Start,
637 Cycle,
638 NoCycle,
639 Prior,
640 Generated,
641 Identity,
642 Always,
643 Measures,
645 Pattern,
646 Define,
647 Running,
648 Owned,
649 After,
650 Before,
651 Instead,
652 Each,
653 Statement,
654 Referencing,
655 Old,
656 New,
657 Of,
658 Check,
659 Authorization,
660 Restart,
661
662 Eof,
664}
665
666impl TokenType {
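    /// Returns `true` for token types that represent SQL keywords, as opposed
    /// to operators, literals, or punctuation.
    ///
    /// Illustrative sketch (not compiled as a doctest):
    ///
    /// ```ignore
    /// assert!(TokenType::Select.is_keyword());
    /// assert!(!TokenType::Comma.is_keyword());
    /// ```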
667 pub fn is_keyword(&self) -> bool {
669 matches!(
670 self,
671 TokenType::Select
672 | TokenType::From
673 | TokenType::Where
674 | TokenType::And
675 | TokenType::Or
676 | TokenType::Not
677 | TokenType::In
678 | TokenType::Is
679 | TokenType::Null
680 | TokenType::True
681 | TokenType::False
682 | TokenType::As
683 | TokenType::On
684 | TokenType::Join
685 | TokenType::Left
686 | TokenType::Right
687 | TokenType::Inner
688 | TokenType::Outer
689 | TokenType::Full
690 | TokenType::Cross
691 | TokenType::Semi
692 | TokenType::Anti
693 | TokenType::Union
694 | TokenType::Except
695 | TokenType::Intersect
696 | TokenType::GroupBy
697 | TokenType::OrderBy
698 | TokenType::Having
699 | TokenType::Limit
700 | TokenType::Offset
701 | TokenType::Case
702 | TokenType::When
703 | TokenType::Then
704 | TokenType::Else
705 | TokenType::End
706 | TokenType::Create
707 | TokenType::Drop
708 | TokenType::Alter
709 | TokenType::Insert
710 | TokenType::Update
711 | TokenType::Delete
712 | TokenType::Into
713 | TokenType::Values
714 | TokenType::Set
715 | TokenType::With
716 | TokenType::Distinct
717 | TokenType::All
718 | TokenType::Exists
719 | TokenType::Between
720 | TokenType::Like
721 | TokenType::ILike
722 | TokenType::Filter
724 | TokenType::Date
725 | TokenType::Timestamp
726 | TokenType::TimestampTz
727 | TokenType::Interval
728 | TokenType::Time
729 | TokenType::Table
730 | TokenType::Index
731 | TokenType::Column
732 | TokenType::Database
733 | TokenType::Schema
734 | TokenType::View
735 | TokenType::Function
736 | TokenType::Procedure
737 | TokenType::Trigger
738 | TokenType::Sequence
739 | TokenType::Over
740 | TokenType::Partition
741 | TokenType::Window
742 | TokenType::Rows
743 | TokenType::Range
744 | TokenType::First
745 | TokenType::Last
746 | TokenType::Preceding
747 | TokenType::Following
748 | TokenType::Current
749 | TokenType::Row
750 | TokenType::Unbounded
751 | TokenType::Array
752 | TokenType::Struct
753 | TokenType::Map
754 | TokenType::PrimaryKey
755 | TokenType::Key
756 | TokenType::ForeignKey
757 | TokenType::References
758 | TokenType::Unique
759 | TokenType::Check
760 | TokenType::Default
761 | TokenType::Constraint
762 | TokenType::Comment
763 | TokenType::Rollup
764 | TokenType::Cube
765 | TokenType::Grant
766 | TokenType::Revoke
767 | TokenType::Type
768 | TokenType::Use
769 | TokenType::Cache
770 | TokenType::Uncache
771 | TokenType::Load
772 | TokenType::Any
773 | TokenType::Some
774 | TokenType::Asc
775 | TokenType::Desc
776 | TokenType::Nulls
777 | TokenType::Lateral
778 | TokenType::Natural
779 | TokenType::Escape
780 | TokenType::Glob
781 | TokenType::Match
782 | TokenType::Recursive
783 | TokenType::Replace
784 | TokenType::Returns
785 | TokenType::If
786 | TokenType::Pivot
787 | TokenType::Unpivot
788 | TokenType::Json
789 | TokenType::Blob
790 | TokenType::Text
791 | TokenType::Int
792 | TokenType::BigInt
793 | TokenType::SmallInt
794 | TokenType::TinyInt
795 | TokenType::Int128
796 | TokenType::UInt128
797 | TokenType::Int256
798 | TokenType::UInt256
799 | TokenType::UInt
800 | TokenType::UBigInt
801 | TokenType::Float
802 | TokenType::Double
803 | TokenType::Decimal
804 | TokenType::Boolean
805 | TokenType::VarChar
806 | TokenType::Char
807 | TokenType::Binary
808 | TokenType::VarBinary
809 | TokenType::No
810 | TokenType::DateTime
811 | TokenType::Truncate
812 | TokenType::Execute
813 | TokenType::Merge
814 | TokenType::Top
815 | TokenType::Begin
816 | TokenType::Generated
817 | TokenType::Identity
818 | TokenType::Always
819 | TokenType::Extract
820 | TokenType::AsOf
822 | TokenType::Prior
823 | TokenType::After
824 | TokenType::Restrict
825 | TokenType::Cascade
826 | TokenType::Local
827 | TokenType::Rename
828 | TokenType::Enum
829 | TokenType::Within
830 | TokenType::Format
831 | TokenType::Final
832 | TokenType::FileFormat
833 | TokenType::Input
834 | TokenType::InputFormat
835 | TokenType::Copy
836 | TokenType::Put
837 | TokenType::Get
838 | TokenType::Show
839 | TokenType::Serde
840 | TokenType::Sample
841 | TokenType::Sort
842 | TokenType::Collate
843 | TokenType::Ties
844 | TokenType::IsNull
845 | TokenType::NotNull
846 | TokenType::Exclude
847 | TokenType::Temporary
848 | TokenType::Add
849 | TokenType::Ordinality
850 | TokenType::Overlaps
851 | TokenType::Block
852 | TokenType::Pattern
853 | TokenType::Group
854 | TokenType::Cluster
855 | TokenType::Repeatable
856 | TokenType::Groups
857 | TokenType::Commit
858 | TokenType::Warehouse
859 | TokenType::System
860 | TokenType::By
861 | TokenType::To
862 | TokenType::Fetch
863 | TokenType::For
864 | TokenType::Only
865 | TokenType::Next
866 | TokenType::Lock
867 | TokenType::Refresh
868 | TokenType::Settings
869 | TokenType::Operator
870 | TokenType::Overwrite
871 | TokenType::StraightJoin
872 | TokenType::Start
873 | TokenType::Ignore
875 | TokenType::Domain
876 | TokenType::Apply
877 | TokenType::Respect
878 | TokenType::Materialized
879 | TokenType::Prewhere
880 | TokenType::Old
881 | TokenType::New
882 | TokenType::Cast
883 | TokenType::TryCast
884 | TokenType::SafeCast
885 | TokenType::Transaction
886 | TokenType::Describe
887 | TokenType::Kill
888 | TokenType::Lambda
889 | TokenType::Declare
890 | TokenType::Keep
891 | TokenType::Output
892 | TokenType::Percent
893 | TokenType::Qualify
894 | TokenType::Returning
895 | TokenType::Language
896 | TokenType::Preserve
897 | TokenType::Savepoint
898 | TokenType::Rollback
899 | TokenType::Body
900 | TokenType::Increment
901 | TokenType::Minvalue
902 | TokenType::Maxvalue
903 | TokenType::Cycle
904 | TokenType::NoCycle
905 | TokenType::Seed
906 | TokenType::Namespace
907 | TokenType::Authorization
908 | TokenType::Order
909 | TokenType::Restart
910 | TokenType::Before
911 | TokenType::Instead
912 | TokenType::Each
913 | TokenType::Statement
914 | TokenType::Referencing
915 | TokenType::Of
916 | TokenType::Separator
917 | TokenType::Others
918 | TokenType::Placing
919 | TokenType::Owned
920 | TokenType::Running
921 | TokenType::Define
922 | TokenType::Measures
923 | TokenType::MatchRecognize
924 | TokenType::AutoIncrement
925 | TokenType::Connect
926 | TokenType::Distribute
927 | TokenType::Bernoulli
928 | TokenType::TableSample
929 | TokenType::Inpath
930 | TokenType::Pragma
931 | TokenType::Siblings
932 | TokenType::SerdeProperties
933 | TokenType::RLike
934 )
935 }
936
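    /// Returns `true` for comparison operators (`=`, `!=`/`<>`, `<`, `<=`, `>`, `>=`, `<=>`).
    ///
    /// Illustrative sketch (not compiled as a doctest):
    ///
    /// ```ignore
    /// assert!(TokenType::Gte.is_comparison());
    /// assert!(!TokenType::Plus.is_comparison());
    /// ```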
937 pub fn is_comparison(&self) -> bool {
939 matches!(
940 self,
941 TokenType::Eq
942 | TokenType::Neq
943 | TokenType::Lt
944 | TokenType::Lte
945 | TokenType::Gt
946 | TokenType::Gte
947 | TokenType::NullsafeEq
948 )
949 }
950
951 pub fn is_arithmetic(&self) -> bool {
953 matches!(
954 self,
955 TokenType::Plus
956 | TokenType::Dash
957 | TokenType::Star
958 | TokenType::Slash
959 | TokenType::Percent
960 | TokenType::Mod
961 | TokenType::Div
962 )
963 }
964}
965
966impl fmt::Display for TokenType {
967 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
968 write!(f, "{:?}", self)
969 }
970}
971
/// Dialect-specific settings that drive the tokenizer.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keyword text (uppercased) mapped to its token type.
    pub keywords: std::collections::HashMap<String, TokenType>,
    /// Single-character tokens such as `(`, `,`, `+`.
    pub single_tokens: std::collections::HashMap<char, TokenType>,
    /// String quote delimiters, mapping the opening sequence to the closing one.
    pub quotes: std::collections::HashMap<String, String>,
    /// Identifier quote characters, mapping the opening char to the closing one.
    pub identifiers: std::collections::HashMap<char, char>,
    /// Comment markers; the value holds the closing marker for block comments
    /// and `None` for line comments.
    pub comments: std::collections::HashMap<String, Option<String>>,
    /// Characters that introduce an escape sequence inside strings.
    pub string_escapes: Vec<char>,
    /// Whether block comments may nest.
    pub nested_comments: bool,
    /// Characters that may follow an escape character; when non-empty,
    /// unrecognized escapes drop the backslash and keep the escaped character.
    pub escape_follow_chars: Vec<char>,
    /// Whether a `b'...'` prefix denotes a byte string.
    pub b_prefix_is_byte_string: bool,
    /// Dialect-specific numeric literal suffixes.
    pub numeric_literals: std::collections::HashMap<String, String>,
    /// Whether identifiers may start with a digit.
    pub identifiers_can_start_with_digit: bool,
    /// Whether `0x...` literals are tokenized as hex numbers.
    pub hex_number_strings: bool,
    /// Whether hex strings are treated as integer-typed values.
    pub hex_string_is_integer_type: bool,
    /// Whether escape sequences are honored inside raw strings.
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether `#` starts a line comment.
    pub hash_comments: bool,
    /// Whether `$` may appear in identifiers.
    pub dollar_sign_is_identifier: bool,
    /// Whether raw data following `INSERT ... FORMAT` is captured as a single token.
    pub insert_format_raw_data: bool,
}
1028
1029impl Default for TokenizerConfig {
1030 fn default() -> Self {
1031 let mut keywords = std::collections::HashMap::new();
1032 keywords.insert("SELECT".to_string(), TokenType::Select);
1034 keywords.insert("FROM".to_string(), TokenType::From);
1035 keywords.insert("WHERE".to_string(), TokenType::Where);
1036 keywords.insert("AND".to_string(), TokenType::And);
1037 keywords.insert("OR".to_string(), TokenType::Or);
1038 keywords.insert("NOT".to_string(), TokenType::Not);
1039 keywords.insert("AS".to_string(), TokenType::As);
1040 keywords.insert("ON".to_string(), TokenType::On);
1041 keywords.insert("JOIN".to_string(), TokenType::Join);
1042 keywords.insert("LEFT".to_string(), TokenType::Left);
1043 keywords.insert("RIGHT".to_string(), TokenType::Right);
1044 keywords.insert("INNER".to_string(), TokenType::Inner);
1045 keywords.insert("OUTER".to_string(), TokenType::Outer);
1046 keywords.insert("OUTPUT".to_string(), TokenType::Output);
1047 keywords.insert("FULL".to_string(), TokenType::Full);
1048 keywords.insert("CROSS".to_string(), TokenType::Cross);
1049 keywords.insert("SEMI".to_string(), TokenType::Semi);
1050 keywords.insert("ANTI".to_string(), TokenType::Anti);
1051 keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1052 keywords.insert("UNION".to_string(), TokenType::Union);
1053 keywords.insert("EXCEPT".to_string(), TokenType::Except);
        keywords.insert("MINUS".to_string(), TokenType::Except); // Oracle alias for EXCEPT
        keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1056 keywords.insert("GROUP".to_string(), TokenType::Group);
1057 keywords.insert("CUBE".to_string(), TokenType::Cube);
1058 keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1059 keywords.insert("WITHIN".to_string(), TokenType::Within);
1060 keywords.insert("ORDER".to_string(), TokenType::Order);
1061 keywords.insert("BY".to_string(), TokenType::By);
1062 keywords.insert("HAVING".to_string(), TokenType::Having);
1063 keywords.insert("LIMIT".to_string(), TokenType::Limit);
1064 keywords.insert("OFFSET".to_string(), TokenType::Offset);
1065 keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1066 keywords.insert("FETCH".to_string(), TokenType::Fetch);
1067 keywords.insert("FIRST".to_string(), TokenType::First);
1068 keywords.insert("NEXT".to_string(), TokenType::Next);
1069 keywords.insert("ONLY".to_string(), TokenType::Only);
1070 keywords.insert("KEEP".to_string(), TokenType::Keep);
1071 keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1072 keywords.insert("INPUT".to_string(), TokenType::Input);
1073 keywords.insert("CASE".to_string(), TokenType::Case);
1074 keywords.insert("WHEN".to_string(), TokenType::When);
1075 keywords.insert("THEN".to_string(), TokenType::Then);
1076 keywords.insert("ELSE".to_string(), TokenType::Else);
1077 keywords.insert("END".to_string(), TokenType::End);
        keywords.insert("ENDIF".to_string(), TokenType::End);
        keywords.insert("NULL".to_string(), TokenType::Null);
1080 keywords.insert("TRUE".to_string(), TokenType::True);
1081 keywords.insert("FALSE".to_string(), TokenType::False);
1082 keywords.insert("IS".to_string(), TokenType::Is);
1083 keywords.insert("IN".to_string(), TokenType::In);
1084 keywords.insert("BETWEEN".to_string(), TokenType::Between);
1085 keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1086 keywords.insert("LIKE".to_string(), TokenType::Like);
1087 keywords.insert("ILIKE".to_string(), TokenType::ILike);
1088 keywords.insert("RLIKE".to_string(), TokenType::RLike);
1089 keywords.insert("REGEXP".to_string(), TokenType::RLike);
1090 keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1091 keywords.insert("EXISTS".to_string(), TokenType::Exists);
1092 keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1093 keywords.insert("ALL".to_string(), TokenType::All);
1094 keywords.insert("WITH".to_string(), TokenType::With);
1095 keywords.insert("CREATE".to_string(), TokenType::Create);
1096 keywords.insert("DROP".to_string(), TokenType::Drop);
1097 keywords.insert("ALTER".to_string(), TokenType::Alter);
1098 keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1099 keywords.insert("TABLE".to_string(), TokenType::Table);
1100 keywords.insert("VIEW".to_string(), TokenType::View);
1101 keywords.insert("INDEX".to_string(), TokenType::Index);
1102 keywords.insert("COLUMN".to_string(), TokenType::Column);
1103 keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1104 keywords.insert("ADD".to_string(), TokenType::Add);
1105 keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1106 keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1107 keywords.insert("RENAME".to_string(), TokenType::Rename);
1108 keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1109 keywords.insert("TEMP".to_string(), TokenType::Temporary);
1110 keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1111 keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1112 keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1113 keywords.insert("KEY".to_string(), TokenType::Key);
1114 keywords.insert("KILL".to_string(), TokenType::Kill);
1115 keywords.insert("REFERENCES".to_string(), TokenType::References);
1116 keywords.insert("DEFAULT".to_string(), TokenType::Default);
1117 keywords.insert("DECLARE".to_string(), TokenType::Declare);
1118 keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
        keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement);
        keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1121 keywords.insert("REPLACE".to_string(), TokenType::Replace);
1122 keywords.insert("TO".to_string(), TokenType::To);
1123 keywords.insert("INSERT".to_string(), TokenType::Insert);
1124 keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1125 keywords.insert("UPDATE".to_string(), TokenType::Update);
1126 keywords.insert("USE".to_string(), TokenType::Use);
1127 keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1128 keywords.insert("GLOB".to_string(), TokenType::Glob);
1129 keywords.insert("DELETE".to_string(), TokenType::Delete);
1130 keywords.insert("MERGE".to_string(), TokenType::Merge);
1131 keywords.insert("CACHE".to_string(), TokenType::Cache);
1132 keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1133 keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1134 keywords.insert("GRANT".to_string(), TokenType::Grant);
1135 keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1136 keywords.insert("COMMENT".to_string(), TokenType::Comment);
1137 keywords.insert("COLLATE".to_string(), TokenType::Collate);
1138 keywords.insert("INTO".to_string(), TokenType::Into);
1139 keywords.insert("VALUES".to_string(), TokenType::Values);
1140 keywords.insert("SET".to_string(), TokenType::Set);
1141 keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1142 keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1143 keywords.insert("ASC".to_string(), TokenType::Asc);
1144 keywords.insert("DESC".to_string(), TokenType::Desc);
1145 keywords.insert("NULLS".to_string(), TokenType::Nulls);
1146 keywords.insert("RESPECT".to_string(), TokenType::Respect);
1147 keywords.insert("FIRST".to_string(), TokenType::First);
1148 keywords.insert("LAST".to_string(), TokenType::Last);
1149 keywords.insert("IF".to_string(), TokenType::If);
1150 keywords.insert("CAST".to_string(), TokenType::Cast);
1151 keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1152 keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1153 keywords.insert("OVER".to_string(), TokenType::Over);
1154 keywords.insert("PARTITION".to_string(), TokenType::Partition);
1155 keywords.insert("PLACING".to_string(), TokenType::Placing);
1156 keywords.insert("WINDOW".to_string(), TokenType::Window);
1157 keywords.insert("ROWS".to_string(), TokenType::Rows);
1158 keywords.insert("RANGE".to_string(), TokenType::Range);
1159 keywords.insert("FILTER".to_string(), TokenType::Filter);
1160 keywords.insert("NATURAL".to_string(), TokenType::Natural);
1161 keywords.insert("USING".to_string(), TokenType::Using);
1162 keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1163 keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1164 keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1165 keywords.insert("CURRENT".to_string(), TokenType::Current);
1166 keywords.insert("ROW".to_string(), TokenType::Row);
1167 keywords.insert("GROUPS".to_string(), TokenType::Groups);
1168 keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1169 keywords.insert("BOTH".to_string(), TokenType::Both);
1171 keywords.insert("LEADING".to_string(), TokenType::Leading);
1172 keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1173 keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1174 keywords.insert("TOP".to_string(), TokenType::Top);
1176 keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1177 keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1178 keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1179 keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1180 keywords.insert("SYSTEM".to_string(), TokenType::System);
1181 keywords.insert("BLOCK".to_string(), TokenType::Block);
1182 keywords.insert("SEED".to_string(), TokenType::Seed);
1183 keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1184 keywords.insert("TIES".to_string(), TokenType::Ties);
1185 keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1186 keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1187 keywords.insert("APPLY".to_string(), TokenType::Apply);
1188 keywords.insert("CONNECT".to_string(), TokenType::Connect);
1190 keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1192 keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1193 keywords.insert("SORT".to_string(), TokenType::Sort);
1194 keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1195 keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1196 keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1197 keywords.insert("FOR".to_string(), TokenType::For);
1198 keywords.insert("ANY".to_string(), TokenType::Any);
1199 keywords.insert("SOME".to_string(), TokenType::Some);
1200 keywords.insert("ASOF".to_string(), TokenType::AsOf);
1201 keywords.insert("PERCENT".to_string(), TokenType::Percent);
1202 keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1203 keywords.insert("NO".to_string(), TokenType::No);
1204 keywords.insert("OTHERS".to_string(), TokenType::Others);
1205 keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1207 keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1209 keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1210 keywords.insert("DATABASE".to_string(), TokenType::Database);
1211 keywords.insert("FUNCTION".to_string(), TokenType::Function);
1212 keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1213 keywords.insert("PROC".to_string(), TokenType::Procedure);
1214 keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1215 keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1216 keywords.insert("TYPE".to_string(), TokenType::Type);
1217 keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1218 keywords.insert("RETURNS".to_string(), TokenType::Returns);
1219 keywords.insert("RETURNING".to_string(), TokenType::Returning);
1220 keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1221 keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1222 keywords.insert("COMMIT".to_string(), TokenType::Commit);
1223 keywords.insert("BEGIN".to_string(), TokenType::Begin);
1224 keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1225 keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1226 keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1227 keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1228 keywords.insert("BODY".to_string(), TokenType::Body);
1229 keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1230 keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1231 keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1232 keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1233 keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1234 keywords.insert("PRIOR".to_string(), TokenType::Prior);
1235 keywords.insert("MATCH".to_string(), TokenType::Match);
1237 keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1238 keywords.insert("MEASURES".to_string(), TokenType::Measures);
1239 keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1240 keywords.insert("DEFINE".to_string(), TokenType::Define);
1241 keywords.insert("RUNNING".to_string(), TokenType::Running);
1242 keywords.insert("FINAL".to_string(), TokenType::Final);
1243 keywords.insert("OWNED".to_string(), TokenType::Owned);
1244 keywords.insert("AFTER".to_string(), TokenType::After);
1245 keywords.insert("BEFORE".to_string(), TokenType::Before);
1246 keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1247 keywords.insert("EACH".to_string(), TokenType::Each);
1248 keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1249 keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1250 keywords.insert("OLD".to_string(), TokenType::Old);
1251 keywords.insert("NEW".to_string(), TokenType::New);
1252 keywords.insert("OF".to_string(), TokenType::Of);
1253 keywords.insert("CHECK".to_string(), TokenType::Check);
1254 keywords.insert("START".to_string(), TokenType::Start);
1255 keywords.insert("ENUM".to_string(), TokenType::Enum);
1256 keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1257 keywords.insert("RESTART".to_string(), TokenType::Restart);
1258 keywords.insert("DATE".to_string(), TokenType::Date);
1260 keywords.insert("TIME".to_string(), TokenType::Time);
1261 keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1262 keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1263 keywords.insert("GENERATED".to_string(), TokenType::Generated);
1264 keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1265 keywords.insert("ALWAYS".to_string(), TokenType::Always);
1266 keywords.insert("LOAD".to_string(), TokenType::Load);
1268 keywords.insert("LOCAL".to_string(), TokenType::Local);
1269 keywords.insert("INPATH".to_string(), TokenType::Inpath);
1270 keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1271 keywords.insert("SERDE".to_string(), TokenType::Serde);
1272 keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1273 keywords.insert("FORMAT".to_string(), TokenType::Format);
1274 keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1276 keywords.insert("SHOW".to_string(), TokenType::Show);
1278 keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1280 keywords.insert("COPY".to_string(), TokenType::Copy);
1282 keywords.insert("PUT".to_string(), TokenType::Put);
1283 keywords.insert("GET".to_string(), TokenType::Get);
1284 keywords.insert("EXEC".to_string(), TokenType::Execute);
1286 keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1287 keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1289 keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1290
1291 let mut single_tokens = std::collections::HashMap::new();
1292 single_tokens.insert('(', TokenType::LParen);
1293 single_tokens.insert(')', TokenType::RParen);
1294 single_tokens.insert('[', TokenType::LBracket);
1295 single_tokens.insert(']', TokenType::RBracket);
1296 single_tokens.insert('{', TokenType::LBrace);
1297 single_tokens.insert('}', TokenType::RBrace);
1298 single_tokens.insert(',', TokenType::Comma);
1299 single_tokens.insert('.', TokenType::Dot);
1300 single_tokens.insert(';', TokenType::Semicolon);
1301 single_tokens.insert('+', TokenType::Plus);
1302 single_tokens.insert('-', TokenType::Dash);
1303 single_tokens.insert('*', TokenType::Star);
1304 single_tokens.insert('/', TokenType::Slash);
1305 single_tokens.insert('%', TokenType::Percent);
1306 single_tokens.insert('&', TokenType::Amp);
1307 single_tokens.insert('|', TokenType::Pipe);
1308 single_tokens.insert('^', TokenType::Caret);
1309 single_tokens.insert('~', TokenType::Tilde);
1310 single_tokens.insert('<', TokenType::Lt);
1311 single_tokens.insert('>', TokenType::Gt);
1312 single_tokens.insert('=', TokenType::Eq);
1313 single_tokens.insert('!', TokenType::Exclamation);
1314 single_tokens.insert(':', TokenType::Colon);
1315 single_tokens.insert('@', TokenType::DAt);
1316 single_tokens.insert('#', TokenType::Hash);
1317 single_tokens.insert('$', TokenType::Dollar);
1318 single_tokens.insert('?', TokenType::Parameter);
1319
1320 let mut quotes = std::collections::HashMap::new();
1321 quotes.insert("'".to_string(), "'".to_string());
1322 quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());
1324
1325 let mut identifiers = std::collections::HashMap::new();
1326 identifiers.insert('"', '"');
1327 identifiers.insert('`', '`');
1328 let mut comments = std::collections::HashMap::new();
1332 comments.insert("--".to_string(), None);
1333 comments.insert("/*".to_string(), Some("*/".to_string()));
1334
1335 Self {
1336 keywords,
1337 single_tokens,
1338 quotes,
1339 identifiers,
1340 comments,
1341 string_escapes: vec!['\''],
1344 nested_comments: true,
1345 escape_follow_chars: vec![],
1347 b_prefix_is_byte_string: false,
1349 numeric_literals: std::collections::HashMap::new(),
1350 identifiers_can_start_with_digit: false,
1351 hex_number_strings: false,
1352 hex_string_is_integer_type: false,
1353 string_escapes_allowed_in_raw_strings: true,
1356 hash_comments: false,
1357 dollar_sign_is_identifier: false,
1358 insert_format_raw_data: false,
1359 }
1360 }
1361}
1362
1363pub struct Tokenizer {
1365 config: TokenizerConfig,
1366}
1367
1368impl Tokenizer {
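    /// Builds a tokenizer from an explicit configuration, which is how
    /// dialect-specific behavior (keywords, quoting, comment markers, ...) is
    /// selected.
    ///
    /// Illustrative sketch (not compiled as a doctest); it only touches fields
    /// that exist on `TokenizerConfig`:
    ///
    /// ```ignore
    /// let mut config = TokenizerConfig::default();
    /// config.hash_comments = true; // let `#` start a line comment
    /// let tokenizer = Tokenizer::new(config);
    /// ```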
1369 pub fn new(config: TokenizerConfig) -> Self {
1371 Self { config }
1372 }
1373
1374 pub fn default_config() -> Self {
1376 Self::new(TokenizerConfig::default())
1377 }
1378
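    /// Tokenizes a SQL string into a flat list of tokens; any comments left
    /// over at the end of the input are attached to the last token as
    /// trailing comments.
    ///
    /// Illustrative sketch (not compiled as a doctest):
    ///
    /// ```ignore
    /// let tokens = Tokenizer::default().tokenize("SELECT 1 FROM t")?;
    /// assert_eq!(tokens[0].token_type, TokenType::Select);
    /// ```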
1379 pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1381 let mut state = TokenizerState::new(sql, &self.config);
1382 state.tokenize()
1383 }
1384}
1385
1386impl Default for Tokenizer {
1387 fn default() -> Self {
1388 Self::default_config()
1389 }
1390}
1391
1392struct TokenizerState<'a> {
1394 chars: Vec<char>,
1395 size: usize,
1396 tokens: Vec<Token>,
1397 start: usize,
1398 current: usize,
1399 line: usize,
1400 column: usize,
1401 comments: Vec<String>,
1402 config: &'a TokenizerConfig,
1403}
1404
1405impl<'a> TokenizerState<'a> {
1406 fn new(sql: &str, config: &'a TokenizerConfig) -> Self {
1407 let chars: Vec<char> = sql.chars().collect();
1408 let size = chars.len();
1409 Self {
1410 chars,
1411 size,
1412 tokens: Vec::new(),
1413 start: 0,
1414 current: 0,
1415 line: 1,
1416 column: 1,
1417 comments: Vec::new(),
1418 config,
1419 }
1420 }
1421
1422 fn tokenize(&mut self) -> Result<Vec<Token>> {
1423 while !self.is_at_end() {
1424 self.skip_whitespace();
1425 if self.is_at_end() {
1426 break;
1427 }
1428
1429 self.start = self.current;
1430 self.scan_token()?;
1431
1432 if self.config.insert_format_raw_data {
1435 if let Some(raw) = self.try_scan_insert_format_raw_data() {
1436 if !raw.is_empty() {
1437 self.start = self.current;
1438 self.add_token_with_text(TokenType::Var, raw);
1439 }
1440 }
1441 }
1442 }
1443
1444 if !self.comments.is_empty() {
1449 if let Some(last) = self.tokens.last_mut() {
1450 last.trailing_comments.extend(self.comments.drain(..));
1451 }
1452 }
1453
1454 Ok(std::mem::take(&mut self.tokens))
1455 }
1456
1457 fn is_at_end(&self) -> bool {
1458 self.current >= self.size
1459 }
1460
1461 fn peek(&self) -> char {
1462 if self.is_at_end() {
1463 '\0'
1464 } else {
1465 self.chars[self.current]
1466 }
1467 }
1468
1469 fn peek_next(&self) -> char {
1470 if self.current + 1 >= self.size {
1471 '\0'
1472 } else {
1473 self.chars[self.current + 1]
1474 }
1475 }
1476
1477 fn advance(&mut self) -> char {
1478 let c = self.peek();
1479 self.current += 1;
1480 if c == '\n' {
1481 self.line += 1;
1482 self.column = 1;
1483 } else {
1484 self.column += 1;
1485 }
1486 c
1487 }
1488
1489 fn skip_whitespace(&mut self) {
1490 let mut saw_newline = false;
1495 while !self.is_at_end() {
1496 let c = self.peek();
1497 match c {
1498 ' ' | '\t' | '\r' => {
1499 self.advance();
1500 }
1501 '\n' => {
1502 saw_newline = true;
1503 self.advance();
1504 }
1505 '\u{00A0}' | '\u{2000}'..='\u{200B}' | '\u{3000}' | '\u{FEFF}' => {
1510 self.advance();
1511 }
1512 '-' if self.peek_next() == '-' => {
1513 self.scan_line_comment(saw_newline);
1514 saw_newline = true;
1516 }
1517 '/' if self.peek_next() == '/' && self.config.hash_comments => {
1518 self.scan_double_slash_comment();
1520 }
1521 '/' if self.peek_next() == '*' => {
1522 if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
1524 break;
1526 }
1527 if self.scan_block_comment(saw_newline).is_err() {
1528 return;
1529 }
1530 }
1532 '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
1533 let prev_non_ws = if self.current > 0 {
1537 let mut i = self.current - 1;
1538 while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
1539 i -= 1;
1540 }
1541 self.chars[i]
1542 } else {
1543 '\0'
1544 };
1545 if prev_non_ws == ':' || prev_non_ws == '/' {
1546 break;
1548 }
1549 self.scan_line_comment(saw_newline);
1550 saw_newline = true;
1552 }
1553 '#' if self.config.hash_comments => {
1554 self.scan_hash_line_comment();
1555 }
1556 _ => break,
1557 }
1558 }
1559 }
1560
1561 fn scan_hash_line_comment(&mut self) {
        self.advance(); // consume '#'
        let start = self.current;
1564 while !self.is_at_end() && self.peek() != '\n' {
1565 self.advance();
1566 }
1567 let comment: String = self.chars[start..self.current].iter().collect();
1568 let comment_text = comment.trim().to_string();
1569 if let Some(last) = self.tokens.last_mut() {
1570 last.trailing_comments.push(comment_text);
1571 } else {
1572 self.comments.push(comment_text);
1573 }
1574 }
1575
1576 fn scan_double_slash_comment(&mut self) {
        self.advance(); // consume the first '/'
        self.advance(); // consume the second '/'
        let start = self.current;
1580 while !self.is_at_end() && self.peek() != '\n' {
1581 self.advance();
1582 }
1583 let comment: String = self.chars[start..self.current].iter().collect();
1584 let comment_text = comment.trim().to_string();
1585 if let Some(last) = self.tokens.last_mut() {
1586 last.trailing_comments.push(comment_text);
1587 } else {
1588 self.comments.push(comment_text);
1589 }
1590 }
1591
1592 fn scan_line_comment(&mut self, after_newline: bool) {
        self.advance(); // consume the two-character comment marker
        self.advance();
        let start = self.current;
1596 while !self.is_at_end() && self.peek() != '\n' {
1597 self.advance();
1598 }
1599 let comment_text: String = self.chars[start..self.current].iter().collect();
1600
1601 if after_newline || self.tokens.is_empty() {
1604 self.comments.push(comment_text);
1605 } else if let Some(last) = self.tokens.last_mut() {
1606 last.trailing_comments.push(comment_text);
1607 }
1608 }
1609
1610 fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // consume '/*'
        self.advance();
        let content_start = self.current;
1614 let mut depth = 1;
1615
1616 while !self.is_at_end() && depth > 0 {
1617 if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
1618 self.advance();
1619 self.advance();
1620 depth += 1;
1621 } else if self.peek() == '*' && self.peek_next() == '/' {
1622 depth -= 1;
1623 if depth > 0 {
1624 self.advance();
1625 self.advance();
1626 }
1627 } else {
1628 self.advance();
1629 }
1630 }
1631
1632 if depth > 0 {
1633 return Err(Error::tokenize(
1634 "Unterminated block comment",
1635 self.line,
1636 self.column,
1637 ));
1638 }
1639
1640 let content: String = self.chars[content_start..self.current].iter().collect();
        self.advance(); // consume closing '*/'
        self.advance();
        let comment_text = format!("/*{}*/", content);
1647
1648 if after_newline || self.tokens.is_empty() {
1651 self.comments.push(comment_text);
1652 } else if let Some(last) = self.tokens.last_mut() {
1653 last.trailing_comments.push(comment_text);
1654 }
1655
1656 Ok(())
1657 }
1658
1659 fn scan_hint(&mut self) -> Result<()> {
        self.advance(); // consume '/', '*', '+'
        self.advance();
        self.advance();
        let hint_start = self.current;
1665
1666 while !self.is_at_end() {
1668 if self.peek() == '*' && self.peek_next() == '/' {
1669 break;
1670 }
1671 self.advance();
1672 }
1673
1674 if self.is_at_end() {
1675 return Err(Error::tokenize(
1676 "Unterminated hint comment",
1677 self.line,
1678 self.column,
1679 ));
1680 }
1681
1682 let hint_text: String = self.chars[hint_start..self.current].iter().collect();
        self.advance(); // consume closing '*/'
        self.advance();
        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1687
1688 Ok(())
1689 }
1690
1691 fn scan_positional_parameter(&mut self) -> Result<()> {
        self.advance(); // consume '$'
        let start = self.current;
1695
1696 while !self.is_at_end() && self.peek().is_ascii_digit() {
1697 self.advance();
1698 }
1699
1700 let number: String = self.chars[start..self.current].iter().collect();
1701 self.add_token_with_text(TokenType::Parameter, number);
1702 Ok(())
1703 }
1704
1705 fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
1710 let saved_pos = self.current;
1711
        self.advance(); // consume the opening '$'
        let tag_start = self.current;
1718 while !self.is_at_end()
1719 && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1720 {
1721 self.advance();
1722 }
1723 let tag: String = self.chars[tag_start..self.current].iter().collect();
1724
1725 if self.is_at_end() || self.peek() != '$' {
1727 self.current = saved_pos;
1729 return Ok(None);
1730 }
        self.advance(); // consume the '$' that closes the tag
        let content_start = self.current;
1735 let closing_tag = format!("${}$", tag);
1736 let closing_chars: Vec<char> = closing_tag.chars().collect();
1737
1738 loop {
1739 if self.is_at_end() {
1740 self.current = saved_pos;
1742 return Ok(None);
1743 }
1744
1745 if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1747 let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1748 self.current + j < self.size && self.chars[self.current + j] == ch
1749 });
1750 if matches {
1751 let content: String = self.chars[content_start..self.current].iter().collect();
1752 for _ in 0..closing_chars.len() {
1754 self.advance();
1755 }
                    // Store the tag and body joined by a NUL byte so that
                    // `parse_dollar_string_token` can split them apart again.
                    let token_text = format!("{}\x00{}", tag, content);
1758 self.add_token_with_text(TokenType::DollarString, token_text);
1759 return Ok(Some(()));
1760 }
1761 }
1762 self.advance();
1763 }
1764 }
1765
1766 fn scan_dollar_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume opening '$$'
        self.advance();
        let start = self.current;
1776 while !self.is_at_end() {
1777 if self.peek() == '$'
1778 && self.current + 1 < self.size
1779 && self.chars[self.current + 1] == '$'
1780 {
1781 break;
1782 }
1783 self.advance();
1784 }
1785
1786 let content: String = self.chars[start..self.current].iter().collect();
1787
1788 if !self.is_at_end() {
            self.advance(); // consume closing '$$'
            self.advance();
        }
1792
1793 self.add_token_with_text(TokenType::DollarString, content);
1794 Ok(())
1795 }
1796
1797 fn scan_token(&mut self) -> Result<()> {
1798 let c = self.peek();
1799
1800 if c == '\'' {
1802 if self.config.quotes.contains_key("'''")
1804 && self.peek_next() == '\''
1805 && self.current + 2 < self.size
1806 && self.chars[self.current + 2] == '\''
1807 {
1808 return self.scan_triple_quoted_string('\'');
1809 }
1810 return self.scan_string();
1811 }
1812
1813 if c == '"'
1815 && self.config.quotes.contains_key("\"\"\"")
1816 && self.peek_next() == '"'
1817 && self.current + 2 < self.size
1818 && self.chars[self.current + 2] == '"'
1819 {
1820 return self.scan_triple_quoted_string('"');
1821 }
1822
1823 if c == '"'
1826 && self.config.quotes.contains_key("\"")
1827 && !self.config.identifiers.contains_key(&'"')
1828 {
1829 return self.scan_double_quoted_string();
1830 }
1831
1832 if let Some(&end_quote) = self.config.identifiers.get(&c) {
1834 return self.scan_quoted_identifier(end_quote);
1835 }
1836
1837 if c.is_ascii_digit() {
1839 return self.scan_number();
1840 }
1841
1842 if c == '.' && self.peek_next().is_ascii_digit() {
1849 let prev_char = if self.current > 0 {
1850 self.chars[self.current - 1]
1851 } else {
1852 '\0'
1853 };
1854 let is_after_ident = prev_char.is_alphanumeric()
1855 || prev_char == '_'
1856 || prev_char == '`'
1857 || prev_char == '"'
1858 || prev_char == ']'
1859 || prev_char == ')';
1860 if prev_char != '.' && !is_after_ident {
1861 return self.scan_number_starting_with_dot();
1862 }
1863 }
1864
1865 if c == '/'
1867 && self.peek_next() == '*'
1868 && self.current + 2 < self.size
1869 && self.chars[self.current + 2] == '+'
1870 {
1871 return self.scan_hint();
1872 }
1873
1874 if let Some(token_type) = self.try_scan_multi_char_operator() {
1876 self.add_token(token_type);
1877 return Ok(());
1878 }
1879
1880 if c == '$'
1883 && (self.peek_next().is_alphanumeric()
1884 || self.peek_next() == '_'
1885 || !self.peek_next().is_ascii())
1886 {
1887 if let Some(()) = self.try_scan_tagged_dollar_string()? {
1888 return Ok(());
1889 }
1890 if self.config.dollar_sign_is_identifier {
1893 return self.scan_dollar_identifier();
1894 }
1895 }
1896
1897 if c == '$' && self.peek_next() == '$' {
1899 return self.scan_dollar_quoted_string();
1900 }
1901
1902 if c == '$' && self.peek_next().is_ascii_digit() {
1904 return self.scan_positional_parameter();
1905 }
1906
1907 if c == '$' && self.config.dollar_sign_is_identifier {
1909 return self.scan_dollar_identifier();
1910 }
1911
1912 if (c == '#' || c == '@')
1915 && (self.peek_next().is_alphanumeric()
1916 || self.peek_next() == '_'
1917 || self.peek_next() == '#')
1918 {
1919 return self.scan_tsql_identifier();
1920 }
1921
1922 if let Some(&token_type) = self.config.single_tokens.get(&c) {
1924 self.advance();
1925 self.add_token(token_type);
1926 return Ok(());
1927 }
1928
1929 if c == '\u{2212}' {
1931 self.advance();
1932 self.add_token(TokenType::Dash);
1933 return Ok(());
1934 }
1935
1936 if c == '\u{2044}' {
1938 self.advance();
1939 self.add_token(TokenType::Slash);
1940 return Ok(());
1941 }
1942
1943 if c == '\u{2018}' || c == '\u{2019}' {
1945 return self.scan_unicode_quoted_string(c);
1947 }
1948 if c == '\u{201C}' || c == '\u{201D}' {
1949 return self.scan_unicode_quoted_identifier(c);
1951 }
1952
1953 self.scan_identifier_or_keyword()
1955 }
1956
1957 fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
1958 let c = self.peek();
1959 let next = self.peek_next();
1960 let third = if self.current + 2 < self.size {
1961 self.chars[self.current + 2]
1962 } else {
1963 '\0'
1964 };
1965
1966 if c == '-' && next == '|' && third == '-' {
1969 self.advance();
1970 self.advance();
1971 self.advance();
1972 return Some(TokenType::Adjacent);
1973 }
1974
1975 if c == '|' && next == '|' && third == '/' {
1977 self.advance();
1978 self.advance();
1979 self.advance();
1980 return Some(TokenType::DPipeSlash);
1981 }
1982
1983 if c == '#' && next == '>' && third == '>' {
1985 self.advance();
1986 self.advance();
1987 self.advance();
1988 return Some(TokenType::DHashArrow);
1989 }
1990
1991 if c == '-' && next == '>' && third == '>' {
1993 self.advance();
1994 self.advance();
1995 self.advance();
1996 return Some(TokenType::DArrow);
1997 }
1998
1999 if c == '<' && next == '=' && third == '>' {
2001 self.advance();
2002 self.advance();
2003 self.advance();
2004 return Some(TokenType::NullsafeEq);
2005 }
2006
2007 if c == '<' && next == '-' && third == '>' {
2009 self.advance();
2010 self.advance();
2011 self.advance();
2012 return Some(TokenType::LrArrow);
2013 }
2014
2015 if c == '<' && next == '@' {
2017 self.advance();
2018 self.advance();
2019 return Some(TokenType::LtAt);
2020 }
2021
2022 if c == '@' && next == '>' {
2024 self.advance();
2025 self.advance();
2026 return Some(TokenType::AtGt);
2027 }
2028
2029 if c == '~' && next == '~' && third == '~' {
2031 self.advance();
2032 self.advance();
2033 self.advance();
2034 return Some(TokenType::Glob);
2035 }
2036
2037 if c == '~' && next == '~' && third == '*' {
2039 self.advance();
2040 self.advance();
2041 self.advance();
2042 return Some(TokenType::ILike);
2043 }
2044
2045 let fourth = if self.current + 3 < self.size {
2047 self.chars[self.current + 3]
2048 } else {
2049 '\0'
2050 };
2051 if c == '!' && next == '~' && third == '~' && fourth == '*' {
2052 self.advance();
2053 self.advance();
2054 self.advance();
2055 self.advance();
2056 return Some(TokenType::NotILike);
2057 }
2058
2059 if c == '!' && next == '~' && third == '~' {
2061 self.advance();
2062 self.advance();
2063 self.advance();
2064 return Some(TokenType::NotLike);
2065 }
2066
2067 if c == '!' && next == '~' && third == '*' {
2069 self.advance();
2070 self.advance();
2071 self.advance();
2072 return Some(TokenType::NotIRLike);
2073 }
2074
2075 if c == '!' && next == ':' && third == '>' {
2077 self.advance();
2078 self.advance();
2079 self.advance();
2080 return Some(TokenType::NColonGt);
2081 }
2082
2083 if c == '?' && next == ':' && third == ':' {
2085 self.advance();
2086 self.advance();
2087 self.advance();
2088 return Some(TokenType::QDColon);
2089 }
2090
2091 if c == '!' && next == '~' {
2093 self.advance();
2094 self.advance();
2095 return Some(TokenType::NotRLike);
2096 }
2097
2098 if c == '~' && next == '~' {
2100 self.advance();
2101 self.advance();
2102 return Some(TokenType::Like);
2103 }
2104
2105 if c == '~' && next == '*' {
2107 self.advance();
2108 self.advance();
2109 return Some(TokenType::IRLike);
2110 }
2111
2112 if c == ':' && next == ':' && third == '$' {
2115 self.advance();
2116 self.advance();
2117 self.advance();
2118 return Some(TokenType::DColonDollar);
2119 }
2120 if c == ':' && next == ':' && third == '%' {
2121 self.advance();
2122 self.advance();
2123 self.advance();
2124 return Some(TokenType::DColonPercent);
2125 }
2126 if c == ':' && next == ':' && third == '?' {
2127 self.advance();
2128 self.advance();
2129 self.advance();
2130 return Some(TokenType::DColonQMark);
2131 }
2132
        let token_type = match (c, next) {
            ('.', ':') => Some(TokenType::DotColon),
            ('=', '=') => Some(TokenType::Eq),
            ('<', '=') => Some(TokenType::Lte),
            ('>', '=') => Some(TokenType::Gte),
            ('!', '=') => Some(TokenType::Neq),
            ('<', '>') => Some(TokenType::Neq),
            ('^', '=') => Some(TokenType::Neq),
            ('<', '<') => Some(TokenType::LtLt),
            ('>', '>') => Some(TokenType::GtGt),
            ('|', '|') => Some(TokenType::DPipe),
            ('|', '/') => Some(TokenType::PipeSlash),
            (':', ':') => Some(TokenType::DColon),
            (':', '=') => Some(TokenType::ColonEq),
            (':', '>') => Some(TokenType::ColonGt),
            ('-', '>') => Some(TokenType::Arrow),
            ('=', '>') => Some(TokenType::FArrow),
            ('&', '&') => Some(TokenType::DAmp),
            ('&', '<') => Some(TokenType::AmpLt),
            ('&', '>') => Some(TokenType::AmpGt),
            ('@', '@') => Some(TokenType::AtAt),
            ('?', '|') => Some(TokenType::QMarkPipe),
            ('?', '&') => Some(TokenType::QMarkAmp),
            ('?', '?') => Some(TokenType::DQMark),
            ('#', '>') => Some(TokenType::HashArrow),
            ('#', '-') => Some(TokenType::HashDash),
            ('^', '@') => Some(TokenType::CaretAt),
            ('*', '*') => Some(TokenType::DStar),
            ('|', '>') => Some(TokenType::PipeGt),
            _ => None,
        };
2165
2166 if token_type.is_some() {
2167 self.advance();
2168 self.advance();
2169 }
2170
2171 token_type
2172 }
2173
2174 fn scan_string(&mut self) -> Result<()> {
        self.advance(); // consume the opening single quote
        let mut value = String::new();
2177
2178 while !self.is_at_end() {
2179 let c = self.peek();
2180 if c == '\'' {
2181 if self.peek_next() == '\'' {
2182 value.push('\'');
2184 self.advance();
2185 self.advance();
2186 } else {
2187 break;
2188 }
2189 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
2193 let escaped = self.advance();
2194 match escaped {
2195 'n' => value.push('\n'),
2196 'r' => value.push('\r'),
2197 't' => value.push('\t'),
2198 '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // substitute (Ctrl-Z)
                        'a' => value.push('\x07'), // bell
                        'b' => value.push('\x08'), // backspace
                        'f' => value.push('\x0C'), // form feed
                        'v' => value.push('\x0B'), // vertical tab
                        'x' => {
2205 let mut hex = String::with_capacity(2);
2207 for _ in 0..2 {
2208 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2209 hex.push(self.advance());
2210 }
2211 }
2212 if hex.len() == 2 {
2213 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2214 value.push(byte as char);
2215 } else {
2216 value.push('\\');
2217 value.push('x');
2218 value.push_str(&hex);
2219 }
2220 } else {
2221 value.push('\\');
2223 value.push('x');
2224 value.push_str(&hex);
2225 }
2226 }
2227 '\\' => value.push('\\'),
2228 '\'' => value.push('\''),
2229 '"' => value.push('"'),
2230 '%' => {
2231 value.push('%');
2233 }
2234 '_' => {
2235 value.push('_');
2237 }
2238 _ => {
2242 if !self.config.escape_follow_chars.is_empty() {
2243 value.push(escaped);
2245 } else {
2246 value.push('\\');
2248 value.push(escaped);
2249 }
2250 }
2251 }
2252 }
2253 } else {
2254 value.push(self.advance());
2255 }
2256 }
2257
2258 if self.is_at_end() {
2259 return Err(Error::tokenize(
2260 "Unterminated string",
2261 self.line,
2262 self.column,
2263 ));
2264 }
2265
        self.advance(); // consume the closing single quote
        self.add_token_with_text(TokenType::String, value);
2268 Ok(())
2269 }
2270
2271 fn scan_double_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume the opening double quote
        let mut value = String::new();
2275
2276 while !self.is_at_end() {
2277 let c = self.peek();
2278 if c == '"' {
2279 if self.peek_next() == '"' {
2280 value.push('"');
2282 self.advance();
2283 self.advance();
2284 } else {
2285 break;
2286 }
2287 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
2291 let escaped = self.advance();
2292 match escaped {
2293 'n' => value.push('\n'),
2294 'r' => value.push('\r'),
2295 't' => value.push('\t'),
2296 '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // substitute (Ctrl-Z)
                        'a' => value.push('\x07'), // bell
                        'b' => value.push('\x08'), // backspace
                        'f' => value.push('\x0C'), // form feed
                        'v' => value.push('\x0B'), // vertical tab
                        'x' => {
2303 let mut hex = String::with_capacity(2);
2305 for _ in 0..2 {
2306 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2307 hex.push(self.advance());
2308 }
2309 }
2310 if hex.len() == 2 {
2311 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2312 value.push(byte as char);
2313 } else {
2314 value.push('\\');
2315 value.push('x');
2316 value.push_str(&hex);
2317 }
2318 } else {
2319 value.push('\\');
2321 value.push('x');
2322 value.push_str(&hex);
2323 }
2324 }
2325 '\\' => value.push('\\'),
2326 '\'' => value.push('\''),
2327 '"' => value.push('"'),
2328 '%' => {
2329 value.push('%');
2331 }
2332 '_' => {
2333 value.push('_');
2335 }
2336 _ => {
2340 if !self.config.escape_follow_chars.is_empty() {
2341 value.push(escaped);
2343 } else {
2344 value.push('\\');
2346 value.push(escaped);
2347 }
2348 }
2349 }
2350 }
2351 } else {
2352 value.push(self.advance());
2353 }
2354 }
2355
2356 if self.is_at_end() {
2357 return Err(Error::tokenize(
2358 "Unterminated double-quoted string",
2359 self.line,
2360 self.column,
2361 ));
2362 }
2363
        self.advance(); // consume the closing double quote
        self.add_token_with_text(TokenType::String, value);
2366 Ok(())
2367 }
2368
2369 fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2370 self.advance();
2372 self.advance();
2373 self.advance();
2374 let mut value = String::new();
2375
2376 while !self.is_at_end() {
2377 if self.peek() == quote_char
2379 && self.current + 1 < self.size
2380 && self.chars[self.current + 1] == quote_char
2381 && self.current + 2 < self.size
2382 && self.chars[self.current + 2] == quote_char
2383 {
2384 break;
2386 }
2387 value.push(self.advance());
2388 }
2389
2390 if self.is_at_end() {
2391 return Err(Error::tokenize(
2392 "Unterminated triple-quoted string",
2393 self.line,
2394 self.column,
2395 ));
2396 }
2397
        // consume the three closing quote characters
        self.advance();
        self.advance();
        self.advance();
2402 let token_type = if quote_char == '"' {
2403 TokenType::TripleDoubleQuotedString
2404 } else {
2405 TokenType::TripleSingleQuotedString
2406 };
2407 self.add_token_with_text(token_type, value);
2408 Ok(())
2409 }
2410
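    // Scans a quoted identifier delimited by `end_quote`; a doubled delimiter inside
    // the body is an escape for the delimiter itself.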
2411 fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let mut value = String::new();
2414
2415 loop {
2416 if self.is_at_end() {
2417 return Err(Error::tokenize(
2418 "Unterminated identifier",
2419 self.line,
2420 self.column,
2421 ));
2422 }
2423 if self.peek() == end_quote {
            if self.peek_next() == end_quote {
                // doubled quote escapes the delimiter
                value.push(end_quote);
                self.advance();
                self.advance();
            } else {
                break;
            }
2433 } else {
2434 value.push(self.peek());
2435 self.advance();
2436 }
2437 }
2438
        self.advance(); // consume the closing quote
        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2441 Ok(())
2442 }
2443
    fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
        self.advance(); // consume the opening curly quote
        let start = self.current;
        let close_quote = if open_quote == '\u{2018}' {
            '\u{2019}'
        } else {
            // currently every accepted opening quote closes with U+2019
            '\u{2019}'
        };
        while !self.is_at_end() && self.peek() != close_quote {
            self.advance();
        }
        let value: String = self.chars[start..self.current].iter().collect();
        if !self.is_at_end() {
            self.advance(); // consume the closing quote
        }
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2467
    fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
        self.advance(); // consume the opening curly quote
        let start = self.current;
        let close_quote = if open_quote == '\u{201C}' {
            '\u{201D}'
        } else {
            // currently every accepted opening quote closes with U+201D
            '\u{201D}'
        };
        // Also stop at a plain '"' so a mismatched ASCII quote still terminates the identifier.
        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
            self.advance();
        }
        let value: String = self.chars[start..self.current].iter().collect();
        if !self.is_at_end() {
            self.advance(); // consume the closing quote
        }
        self.add_token_with_text(TokenType::QuotedIdentifier, value);
        Ok(())
    }
2488
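    // Numbers: optional "0x" hex literals (including hex floats), digit runs with '_'
    // separators, an optional fraction and exponent, and dialect-specific literal
    // suffixes configured in numeric_literals.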
2489 fn scan_number(&mut self) -> Result<()> {
2490 if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
2492 let next = if self.current + 1 < self.size {
2493 self.chars[self.current + 1]
2494 } else {
2495 '\0'
2496 };
            if next == 'x' || next == 'X' {
                // consume the "0x" prefix
                self.advance();
                self.advance();
                let hex_start = self.current;
2503 while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
2504 if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
2505 break;
2506 }
2507 self.advance();
2508 }
2509 if self.current > hex_start {
2510 let mut is_hex_float = false;
2512 if !self.is_at_end() && self.peek() == '.' {
2514 let after_dot = if self.current + 1 < self.size {
2515 self.chars[self.current + 1]
2516 } else {
2517 '\0'
2518 };
                        if after_dot.is_ascii_hexdigit() {
                            is_hex_float = true;
                            self.advance(); // consume the '.'
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                self.advance();
                            }
                        }
                    }
                    // a binary exponent (e.g. 0x1.8p3) marks a hex float
                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
                        is_hex_float = true;
                        self.advance(); // consume 'p' / 'P'
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
                            self.advance();
                        }
                        while !self.is_at_end() && self.peek().is_ascii_digit() {
                            self.advance();
                        }
                    }
2538 if is_hex_float {
2539 let full_text: String =
2541 self.chars[self.start..self.current].iter().collect();
2542 self.add_token_with_text(TokenType::Number, full_text);
2543 } else if self.config.hex_string_is_integer_type {
2544 let hex_value: String =
2546 self.chars[hex_start..self.current].iter().collect();
2547 self.add_token_with_text(TokenType::HexNumber, hex_value);
2548 } else {
2549 let hex_value: String =
2551 self.chars[hex_start..self.current].iter().collect();
2552 self.add_token_with_text(TokenType::HexString, hex_value);
2553 }
2554 return Ok(());
                }
                // No hex digits followed "0x": rewind to just after the '0' and fall
                // through to normal number scanning.
                self.current = self.start + 1;
            }
2560 }
2561
2562 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2564 if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
2566 break;
2567 }
2568 self.advance();
2569 }
2570
        if self.peek() == '.' {
            let next = self.peek_next();
            // Don't consume the dot for range syntax like "1..2"; leave it for the Dot token.
            if next != '.' {
                self.advance(); // consume the '.'
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                        break;
                    }
                    self.advance();
                }
            }
        }
2592
2593 if self.peek() == 'e' || self.peek() == 'E' {
2595 self.advance();
2596 if self.peek() == '+' || self.peek() == '-' {
2597 self.advance();
2598 }
2599 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2600 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2601 break;
2602 }
2603 self.advance();
2604 }
2605 }
2606
2607 let text: String = self.chars[self.start..self.current].iter().collect();
2608
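        // Dialect-specific numeric literal suffixes (one or two characters) are folded
        // into the token text as "<number>::<TYPE>" when they directly follow the digits.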
2609 if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2611 let next_char = self.peek().to_uppercase().to_string();
2612 let suffix_match = if self.current + 1 < self.size {
2614 let two_char: String = vec![self.chars[self.current], self.chars[self.current + 1]]
2615 .iter()
2616 .collect::<String>()
2617 .to_uppercase();
2618 if self.config.numeric_literals.contains_key(&two_char) {
2619 let after_suffix = if self.current + 2 < self.size {
2621 self.chars[self.current + 2]
2622 } else {
2623 ' '
2624 };
2625 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2626 Some((two_char, 2))
2627 } else {
2628 None
2629 }
2630 } else if self.config.numeric_literals.contains_key(&next_char) {
2631 let after_suffix = if self.current + 1 < self.size {
2633 self.chars[self.current + 1]
2634 } else {
2635 ' '
2636 };
2637 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2638 Some((next_char, 1))
2639 } else {
2640 None
2641 }
2642 } else {
2643 None
2644 }
2645 } else if self.config.numeric_literals.contains_key(&next_char) {
2646 Some((next_char, 1))
2648 } else {
2649 None
2650 };
2651
2652 if let Some((suffix, len)) = suffix_match {
2653 for _ in 0..len {
2655 self.advance();
2656 }
2657 let type_name = self
2660 .config
2661 .numeric_literals
2662 .get(&suffix)
2663 .expect("suffix verified by contains_key above")
2664 .clone();
2665 let combined = format!("{}::{}", text, type_name);
2666 self.add_token_with_text(TokenType::Number, combined);
2667 return Ok(());
2668 }
2669 }
2670
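        // Some dialects (e.g. Hive-style) allow identifiers that start with a digit;
        // in that case keep consuming word characters and emit an Identifier instead.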
2671 if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2674 let next = self.peek();
2675 if next.is_alphabetic() || next == '_' {
2676 while !self.is_at_end() {
2678 let ch = self.peek();
2679 if ch.is_alphanumeric() || ch == '_' {
2680 self.advance();
2681 } else {
2682 break;
2683 }
2684 }
2685 let ident_text: String = self.chars[self.start..self.current].iter().collect();
2686 self.add_token_with_text(TokenType::Identifier, ident_text);
2687 return Ok(());
2688 }
2689 }
2690
2691 self.add_token_with_text(TokenType::Number, text);
2692 Ok(())
2693 }
2694
2695 fn scan_number_starting_with_dot(&mut self) -> Result<()> {
        self.advance(); // consume the leading '.'
2699
2700 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2702 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2703 break;
2704 }
2705 self.advance();
2706 }
2707
2708 if self.peek() == 'e' || self.peek() == 'E' {
2710 self.advance();
2711 if self.peek() == '+' || self.peek() == '-' {
2712 self.advance();
2713 }
2714 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2715 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2716 break;
2717 }
2718 self.advance();
2719 }
2720 }
2721
2722 let text: String = self.chars[self.start..self.current].iter().collect();
2723 self.add_token_with_text(TokenType::Number, text);
2724 Ok(())
2725 }
2726
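    // Scans a word: keywords, identifiers, and quote-prefixed literals such as R'...',
    // N'...', E'...', X'...', B'...' and U&'...'.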
2727 fn scan_identifier_or_keyword(&mut self) -> Result<()> {
2728 let first_char = self.peek();
2730 if !first_char.is_alphanumeric() && first_char != '_' {
2731 let c = self.advance();
2733 return Err(Error::tokenize(
2734 format!("Unexpected character: '{}'", c),
2735 self.line,
2736 self.column,
2737 ));
2738 }
2739
2740 while !self.is_at_end() {
2741 let c = self.peek();
            if c == '#' {
                let next_c = if self.current + 1 < self.size {
                    self.chars[self.current + 1]
                } else {
                    '\0'
                };
                // Stop before "#>" / "#-" so they tokenize as operators instead of
                // being swallowed into the identifier.
                if next_c == '>' || next_c == '-' {
                    break;
                }
                self.advance();
2755 } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
2756 self.advance();
2757 } else {
2758 break;
2759 }
2760 }
2761
2762 let text: String = self.chars[self.start..self.current].iter().collect();
2763 let upper = text.to_uppercase();
2764
2765 if upper == "NOT" && self.peek() == '=' {
2767 self.advance(); self.add_token(TokenType::Neq);
2769 return Ok(());
2770 }
2771
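        // Prefixed string literals attach directly to the following quote character.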
2772 let next_char = self.peek();
2775 let is_single_quote = next_char == '\'';
2776 let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
2777 let is_double_quote_for_raw = next_char == '"';
2780
2781 if upper == "R" && (is_single_quote || is_double_quote_for_raw) {
2784 let quote_char = if is_single_quote { '\'' } else { '"' };
2787 self.advance(); if self.peek() == quote_char && self.peek_next() == quote_char {
2791 self.advance(); self.advance(); let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
2795 self.add_token_with_text(TokenType::RawString, string_value);
2796 } else {
2797 let string_value = self.scan_raw_string_content(quote_char)?;
2798 self.add_token_with_text(TokenType::RawString, string_value);
2799 }
2800 return Ok(());
2801 }
2802
2803 if is_single_quote || is_double_quote {
2804 match upper.as_str() {
2805 "N" => {
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
2809 self.scan_string_content()?
2810 } else {
2811 self.scan_double_quoted_string_content()?
2812 };
2813 self.add_token_with_text(TokenType::NationalString, string_value);
2814 return Ok(());
2815 }
2816 "E" => {
2817 let lowercase = text == "e";
2821 let prefix = if lowercase { "e:" } else { "E:" };
2822 self.advance(); let string_value = self.scan_string_content_with_escapes(true)?;
2824 self.add_token_with_text(
2825 TokenType::EscapeString,
2826 format!("{}{}", prefix, string_value),
2827 );
2828 return Ok(());
2829 }
2830 "X" => {
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
2834 self.scan_string_content()?
2835 } else {
2836 self.scan_double_quoted_string_content()?
2837 };
2838 self.add_token_with_text(TokenType::HexString, string_value);
2839 return Ok(());
2840 }
2841 "B" if is_double_quote => {
                    self.advance();
                    let string_value = self.scan_double_quoted_string_content()?;
2845 self.add_token_with_text(TokenType::ByteString, string_value);
2846 return Ok(());
2847 }
2848 "B" if is_single_quote => {
                    self.advance();
                    let string_value = self.scan_string_content()?;
2853 if self.config.b_prefix_is_byte_string {
2854 self.add_token_with_text(TokenType::ByteString, string_value);
2855 } else {
2856 self.add_token_with_text(TokenType::BitString, string_value);
2857 }
2858 return Ok(());
2859 }
2860 _ => {}
2861 }
2862 }
2863
2864 if upper == "U"
2866 && self.peek() == '&'
2867 && self.current + 1 < self.size
2868 && self.chars[self.current + 1] == '\''
2869 {
2870 self.advance(); self.advance(); let string_value = self.scan_string_content()?;
2873 self.add_token_with_text(TokenType::UnicodeString, string_value);
2874 return Ok(());
2875 }
2876
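        // Case-insensitive keyword lookup; unknown words fall back to a Var token.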
2877 let token_type = self
2878 .config
2879 .keywords
2880 .get(&upper)
2881 .copied()
2882 .unwrap_or(TokenType::Var);
2883
2884 self.add_token_with_text(token_type, text);
2885 Ok(())
2886 }
2887
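    // Scans the body of a single-quoted string whose opening quote has already been
    // consumed. Doubled quotes ('') collapse to one quote; when backslash escapes are
    // enabled, the backslash and the following character are preserved verbatim.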
2888 fn scan_string_content_with_escapes(
2892 &mut self,
2893 force_backslash_escapes: bool,
2894 ) -> Result<String> {
2895 let mut value = String::new();
2896 let use_backslash_escapes =
2897 force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2898
2899 while !self.is_at_end() {
2900 let c = self.peek();
2901 if c == '\'' {
2902 if self.peek_next() == '\'' {
2903 value.push('\'');
2905 self.advance();
2906 self.advance();
2907 } else {
2908 break;
2909 }
2910 } else if c == '\\' && use_backslash_escapes {
2911 value.push(self.advance());
2913 if !self.is_at_end() {
2914 value.push(self.advance());
2915 }
2916 } else {
2917 value.push(self.advance());
2918 }
2919 }
2920
2921 if self.is_at_end() {
2922 return Err(Error::tokenize(
2923 "Unterminated string",
2924 self.line,
2925 self.column,
2926 ));
2927 }
2928
        self.advance(); // consume the closing quote
        Ok(value)
2931 }
2932
2933 fn scan_string_content(&mut self) -> Result<String> {
2935 self.scan_string_content_with_escapes(false)
2936 }
2937
2938 fn scan_double_quoted_string_content(&mut self) -> Result<String> {
2941 let mut value = String::new();
2942 let use_backslash_escapes = self.config.string_escapes.contains(&'\\');
2943
2944 while !self.is_at_end() {
2945 let c = self.peek();
2946 if c == '"' {
2947 if self.peek_next() == '"' {
2948 value.push('"');
2950 self.advance();
2951 self.advance();
2952 } else {
2953 break;
2954 }
2955 } else if c == '\\' && use_backslash_escapes {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
2959 let escaped = self.advance();
2960 match escaped {
2961 'n' => value.push('\n'),
2962 'r' => value.push('\r'),
2963 't' => value.push('\t'),
2964 '0' => value.push('\0'),
2965 '\\' => value.push('\\'),
2966 '"' => value.push('"'),
2967 '\'' => value.push('\''),
2968 'x' => {
2969 let mut hex = String::new();
2971 for _ in 0..2 {
2972 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2973 hex.push(self.advance());
2974 }
2975 }
2976 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2977 value.push(byte as char);
2978 } else {
2979 value.push('\\');
2981 value.push('x');
2982 value.push_str(&hex);
2983 }
2984 }
2985 _ => {
2986 value.push('\\');
2988 value.push(escaped);
2989 }
2990 }
2991 }
2992 } else {
2993 value.push(self.advance());
2994 }
2995 }
2996
2997 if self.is_at_end() {
2998 return Err(Error::tokenize(
2999 "Unterminated double-quoted string",
3000 self.line,
3001 self.column,
3002 ));
3003 }
3004
        self.advance();
        Ok(value)
3007 }
3008
3009 fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3014 let mut value = String::new();
3015
3016 while !self.is_at_end() {
3017 let c = self.peek();
3018 if c == quote_char {
3019 if self.peek_next() == quote_char {
3020 value.push(quote_char);
3022 self.advance();
3023 self.advance();
3024 } else {
3025 break;
3026 }
            } else if c == '\\'
                && self.peek_next() == quote_char
                && self.config.string_escapes_allowed_in_raw_strings
            {
                // backslash-escaped quote inside a raw string: keep only the quote character
                value.push(quote_char);
                self.advance();
                self.advance();
            } else {
                value.push(self.advance());
            }
3041 }
3042
3043 if self.is_at_end() {
3044 return Err(Error::tokenize(
3045 "Unterminated raw string",
3046 self.line,
3047 self.column,
3048 ));
3049 }
3050
        self.advance();
        Ok(value)
3053 }
3054
3055 fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3058 let mut value = String::new();
3059
3060 while !self.is_at_end() {
3061 let c = self.peek();
            if c == quote_char && self.peek_next() == quote_char {
                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
                    // consume the three closing quote characters
                    self.advance();
                    self.advance();
                    self.advance();
                    return Ok(value);
                }
3071 }
3072 let ch = self.advance();
3074 value.push(ch);
3075 }
3076
3077 Err(Error::tokenize(
3078 "Unterminated raw triple-quoted string",
3079 self.line,
3080 self.column,
3081 ))
3082 }
3083
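    // Scans a '$'-prefixed name (e.g. $1 or $my_var) into a single Var token.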
3084 fn scan_dollar_identifier(&mut self) -> Result<()> {
3089 self.advance();
3091
3092 while !self.is_at_end() {
3094 let c = self.peek();
3095 if c.is_alphanumeric() || c == '_' || c == '$' {
3096 self.advance();
3097 } else {
3098 break;
3099 }
3100 }
3101
3102 let text: String = self.chars[self.start..self.current].iter().collect();
3103 self.add_token_with_text(TokenType::Var, text);
3104 Ok(())
3105 }
3106
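    // Scans a T-SQL style name such as @variable, #temp or ##global_temp into a single Var token.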
3107 fn scan_tsql_identifier(&mut self) -> Result<()> {
3108 let first = self.advance();
3110
3111 if first == '#' && self.peek() == '#' {
3113 self.advance();
3114 }
3115
3116 while !self.is_at_end() {
3118 let c = self.peek();
3119 if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3120 self.advance();
3121 } else {
3122 break;
3123 }
3124 }
3125
3126 let text: String = self.chars[self.start..self.current].iter().collect();
3127 self.add_token_with_text(TokenType::Var, text);
3129 Ok(())
3130 }
3131
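    // Looks back for "INSERT ... FORMAT <name>" (ClickHouse-style inline data) and, if it
    // matches, captures the raw payload that follows, terminated by a blank line or end of input.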
3132 fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
3136 let len = self.tokens.len();
3137 if len < 3 {
3138 return None;
3139 }
3140
3141 let last = &self.tokens[len - 1];
3143 if last.text.eq_ignore_ascii_case("VALUES") {
3144 return None;
3145 }
3146 if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
3147 return None;
3148 }
3149
3150 let format_tok = &self.tokens[len - 2];
3152 if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
3153 return None;
3154 }
3155
3156 let has_insert = self.tokens[..len - 2]
3158 .iter()
3159 .rev()
3160 .take(20)
3161 .any(|t| t.token_type == TokenType::Insert);
3162 if !has_insert {
3163 return None;
3164 }
3165
3166 let raw_start = self.current;
3170 while !self.is_at_end() {
3171 let c = self.peek();
            if c == '\n' {
                let saved = self.current;
                self.advance(); // consume the newline
                while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                // A blank line (or end of input) terminates the raw data block.
                if self.is_at_end() || self.peek() == '\n' {
                    let raw: String = self.chars[raw_start..saved].iter().collect();
                    return Some(raw.trim().to_string());
                }
3186 } else {
3188 self.advance();
3189 }
3190 }
3191
3192 let raw: String = self.chars[raw_start..self.current].iter().collect();
3194 let trimmed = raw.trim().to_string();
3195 if trimmed.is_empty() {
3196 None
3197 } else {
3198 Some(trimmed)
3199 }
3200 }
3201
3202 fn add_token(&mut self, token_type: TokenType) {
3203 let text: String = self.chars[self.start..self.current].iter().collect();
3204 self.add_token_with_text(token_type, text);
3205 }
3206
3207 fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3208 let span = Span::new(self.start, self.current, self.line, self.column);
3209 let mut token = Token::new(token_type, text, span);
        // Attach any comments collected since the previous token, clearing the buffer.
        token.comments.append(&mut self.comments);
3211 self.tokens.push(token);
3212 }
3213}
3214
3215#[cfg(test)]
3216mod tests {
3217 use super::*;
3218
3219 #[test]
3220 fn test_simple_select() {
3221 let tokenizer = Tokenizer::default();
3222 let tokens = tokenizer.tokenize("SELECT 1").unwrap();
3223
3224 assert_eq!(tokens.len(), 2);
3225 assert_eq!(tokens[0].token_type, TokenType::Select);
3226 assert_eq!(tokens[1].token_type, TokenType::Number);
3227 assert_eq!(tokens[1].text, "1");
3228 }
3229
3230 #[test]
3231 fn test_select_with_identifier() {
3232 let tokenizer = Tokenizer::default();
3233 let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();
3234
3235 assert_eq!(tokens.len(), 6);
3236 assert_eq!(tokens[0].token_type, TokenType::Select);
3237 assert_eq!(tokens[1].token_type, TokenType::Var);
3238 assert_eq!(tokens[1].text, "a");
3239 assert_eq!(tokens[2].token_type, TokenType::Comma);
3240 assert_eq!(tokens[3].token_type, TokenType::Var);
3241 assert_eq!(tokens[3].text, "b");
3242 assert_eq!(tokens[4].token_type, TokenType::From);
3243 assert_eq!(tokens[5].token_type, TokenType::Var);
3244 assert_eq!(tokens[5].text, "t");
3245 }
3246
3247 #[test]
3248 fn test_string_literal() {
3249 let tokenizer = Tokenizer::default();
3250 let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();
3251
3252 assert_eq!(tokens.len(), 2);
3253 assert_eq!(tokens[1].token_type, TokenType::String);
3254 assert_eq!(tokens[1].text, "hello");
3255 }
3256
3257 #[test]
3258 fn test_escaped_string() {
3259 let tokenizer = Tokenizer::default();
3260 let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();
3261
3262 assert_eq!(tokens.len(), 2);
3263 assert_eq!(tokens[1].token_type, TokenType::String);
3264 assert_eq!(tokens[1].text, "it's");
3265 }
3266
3267 #[test]
3268 fn test_comments() {
3269 let tokenizer = Tokenizer::default();
3270 let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();
3271
3272 assert_eq!(tokens.len(), 2);
3273 assert_eq!(tokens[0].trailing_comments.len(), 1);
3276 assert_eq!(tokens[0].trailing_comments[0], " comment");
3277 }
3278
3279 #[test]
3280 fn test_comment_in_and_chain() {
3281 use crate::generator::Generator;
3282 use crate::parser::Parser;
3283
3284 let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
3286 let ast = Parser::parse_sql(sql).unwrap();
3287 let mut gen = Generator::default();
3288 let output = gen.generate(&ast[0]).unwrap();
3289 assert_eq!(
3290 output,
3291 "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
3292 );
3293 }
3294
3295 #[test]
3296 fn test_operators() {
3297 let tokenizer = Tokenizer::default();
3298 let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();
3299
3300 assert_eq!(tokens.len(), 5);
3301 assert_eq!(tokens[0].token_type, TokenType::Number);
3302 assert_eq!(tokens[1].token_type, TokenType::Plus);
3303 assert_eq!(tokens[2].token_type, TokenType::Number);
3304 assert_eq!(tokens[3].token_type, TokenType::Star);
3305 assert_eq!(tokens[4].token_type, TokenType::Number);
3306 }
3307
3308 #[test]
3309 fn test_comparison_operators() {
3310 let tokenizer = Tokenizer::default();
3311 let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();
3312
3313 assert_eq!(tokens[1].token_type, TokenType::Lte);
3314 assert_eq!(tokens[3].token_type, TokenType::Gte);
3315 assert_eq!(tokens[5].token_type, TokenType::Neq);
3316 }
3317
3318 #[test]
3319 fn test_national_string() {
3320 let tokenizer = Tokenizer::default();
3321 let tokens = tokenizer.tokenize("N'abc'").unwrap();
3322
3323 assert_eq!(
3324 tokens.len(),
3325 1,
3326 "Expected 1 token for N'abc', got {:?}",
3327 tokens
3328 );
3329 assert_eq!(tokens[0].token_type, TokenType::NationalString);
3330 assert_eq!(tokens[0].text, "abc");
3331 }
3332
3333 #[test]
3334 fn test_hex_string() {
3335 let tokenizer = Tokenizer::default();
3336 let tokens = tokenizer.tokenize("X'ABCD'").unwrap();
3337
3338 assert_eq!(
3339 tokens.len(),
3340 1,
3341 "Expected 1 token for X'ABCD', got {:?}",
3342 tokens
3343 );
3344 assert_eq!(tokens[0].token_type, TokenType::HexString);
3345 assert_eq!(tokens[0].text, "ABCD");
3346 }
3347
3348 #[test]
3349 fn test_bit_string() {
3350 let tokenizer = Tokenizer::default();
3351 let tokens = tokenizer.tokenize("B'01010'").unwrap();
3352
3353 assert_eq!(
3354 tokens.len(),
3355 1,
3356 "Expected 1 token for B'01010', got {:?}",
3357 tokens
3358 );
3359 assert_eq!(tokens[0].token_type, TokenType::BitString);
3360 assert_eq!(tokens[0].text, "01010");
3361 }
3362
3363 #[test]
3364 fn test_trailing_dot_number() {
3365 let tokenizer = Tokenizer::default();
3366
3367 let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
3369 assert_eq!(
3370 tokens.len(),
3371 2,
3372 "Expected 2 tokens for 'SELECT 1.', got {:?}",
3373 tokens
3374 );
3375 assert_eq!(tokens[1].token_type, TokenType::Number);
3376 assert_eq!(tokens[1].text, "1.");
3377
3378 let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
3380 assert_eq!(tokens[1].text, "1.5");
3381
3382 let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
3385 assert_eq!(
3386 tokens.len(),
3387 3,
3388 "Expected 3 tokens for 'SELECT 1.a', got {:?}",
3389 tokens
3390 );
3391 assert_eq!(tokens[1].token_type, TokenType::Number);
3392 assert_eq!(tokens[1].text, "1.");
3393 assert_eq!(tokens[2].token_type, TokenType::Var);
3394
3395 let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
3397 assert_eq!(tokens[1].token_type, TokenType::Number);
3398 assert_eq!(tokens[1].text, "1");
3399 assert_eq!(tokens[2].token_type, TokenType::Dot);
3400 assert_eq!(tokens[3].token_type, TokenType::Dot);
3401 assert_eq!(tokens[4].token_type, TokenType::Number);
3402 assert_eq!(tokens[4].text, "2");
3403 }
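
    // Additional sketch: exercises underscore digit separators and scientific notation.
    // Assumes, like the tests above, that the default Tokenizer config routes digit-led
    // tokens through the number scanner and keeps the raw text in the token.
    #[test]
    fn test_number_separators_and_exponent() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("SELECT 1_000").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1_000");

        let tokens = tokenizer.tokenize("SELECT 1.5e-3").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.5e-3");
    }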
3404
3405 #[test]
3406 fn test_leading_dot_number() {
3407 let tokenizer = Tokenizer::default();
3408
3409 let tokens = tokenizer.tokenize(".25").unwrap();
3411 assert_eq!(
3412 tokens.len(),
3413 1,
3414 "Expected 1 token for '.25', got {:?}",
3415 tokens
3416 );
3417 assert_eq!(tokens[0].token_type, TokenType::Number);
3418 assert_eq!(tokens[0].text, ".25");
3419
3420 let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
3422 assert_eq!(
3423 tokens.len(),
3424 4,
3425 "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
3426 tokens
3427 );
3428 assert_eq!(tokens[0].token_type, TokenType::Sample);
3429 assert_eq!(tokens[1].token_type, TokenType::LParen);
3430 assert_eq!(tokens[2].token_type, TokenType::Number);
3431 assert_eq!(tokens[2].text, ".25");
3432 assert_eq!(tokens[3].token_type, TokenType::RParen);
3433
3434 let tokens = tokenizer.tokenize(".5e10").unwrap();
3436 assert_eq!(
3437 tokens.len(),
3438 1,
3439 "Expected 1 token for '.5e10', got {:?}",
3440 tokens
3441 );
3442 assert_eq!(tokens[0].token_type, TokenType::Number);
3443 assert_eq!(tokens[0].text, ".5e10");
3444
3445 let tokens = tokenizer.tokenize("a.b").unwrap();
3447 assert_eq!(
3448 tokens.len(),
3449 3,
3450 "Expected 3 tokens for 'a.b', got {:?}",
3451 tokens
3452 );
3453 assert_eq!(tokens[1].token_type, TokenType::Dot);
3454 }
3455
3456 #[test]
3457 fn test_unrecognized_character() {
3458 let tokenizer = Tokenizer::default();
3459
3460 let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
3462 assert!(
3463 result.is_ok(),
3464 "Curly quotes should be tokenized as strings"
3465 );
3466
3467 let result = tokenizer.tokenize("SELECT • FROM t");
3469 assert!(result.is_err());
3470 }
3471
3472 #[test]
3473 fn test_colon_eq_tokenization() {
3474 let tokenizer = Tokenizer::default();
3475
3476 let tokens = tokenizer.tokenize("a := 1").unwrap();
3478 assert_eq!(tokens.len(), 3);
3479 assert_eq!(tokens[0].token_type, TokenType::Var);
3480 assert_eq!(tokens[1].token_type, TokenType::ColonEq);
3481 assert_eq!(tokens[2].token_type, TokenType::Number);
3482
3483 let tokens = tokenizer.tokenize("a:b").unwrap();
3485 assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
3486 assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
3487
3488 let tokens = tokenizer.tokenize("a::INT").unwrap();
3490 assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
3491 }
3492
3493 #[test]
3494 fn test_colon_eq_parsing() {
3495 use crate::generator::Generator;
3496 use crate::parser::Parser;
3497
3498 let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
3500 .expect("Failed to parse MySQL @var := expr");
3501 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3502 assert_eq!(output, "SELECT @var1 := 1, @var2");
3503
3504 let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
3506 .expect("Failed to parse MySQL @var2 := @var1");
3507 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3508 assert_eq!(output, "SELECT @var1, @var2 := @var1");
3509
3510 let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
3512 .expect("Failed to parse MySQL @var := COUNT(*)");
3513 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3514 assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");
3515
3516 let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
3518 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3519 assert_eq!(output, "SET @var1 = 1");
3520
3521 let ast =
3523 Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
3524 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3525 assert_eq!(output, "UNION_VALUE(k1 := 1)");
3526
3527 let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
3529 .expect("Failed to parse UNNEST with :=");
3530 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3531 assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");
3532
3533 let ast =
3535 Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
3536 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3537 assert_eq!(output, "SELECT 1 AS foo");
3538
3539 let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
3541 .expect("Failed to parse DuckDB multiple prefix aliases");
3542 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3543 assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
3544 }
3545
3546 #[test]
3547 fn test_colon_eq_dialect_roundtrip() {
3548 use crate::dialects::{Dialect, DialectType};
3549
3550 fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
3551 let d = Dialect::get(dialect);
3552 let ast = d
3553 .parse(sql)
3554 .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3555 assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3556 let transformed = d
3557 .transform(ast[0].clone())
3558 .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3559 let output = d
3560 .generate(&transformed)
3561 .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3562 let expected = expected.unwrap_or(sql);
3563 assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3564 }
3565
3566 check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
3568 check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
3569 check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
3570 check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));
3571
3572 check(
3574 DialectType::DuckDB,
3575 "SELECT UNNEST(col, recursive := TRUE) FROM t",
3576 None,
3577 );
3578 check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);
3579
3580 {
3583 let d = Dialect::get(DialectType::DuckDB);
3584 let ast = d
3585 .parse("STRUCT_PACK(a := 'b')::json")
3586 .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
3587 assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
3588 }
3589
3590 check(
3592 DialectType::DuckDB,
3593 "SELECT foo: 1",
3594 Some("SELECT 1 AS foo"),
3595 );
3596 check(
3597 DialectType::DuckDB,
3598 "SELECT foo: 1, bar: 2, baz: 3",
3599 Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
3600 );
3601 }
3602
3603 #[test]
3604 fn test_comment_roundtrip() {
3605 use crate::generator::Generator;
3606 use crate::parser::Parser;
3607
3608 fn check_roundtrip(sql: &str) -> Option<String> {
3609 let ast = match Parser::parse_sql(sql) {
3610 Ok(a) => a,
3611 Err(e) => return Some(format!("Parse error: {:?}", e)),
3612 };
3613 if ast.is_empty() {
3614 return Some("Empty AST".to_string());
3615 }
3616 let mut generator = Generator::default();
3617 let output = match generator.generate(&ast[0]) {
3618 Ok(o) => o,
3619 Err(e) => return Some(format!("Gen error: {:?}", e)),
3620 };
3621 if output == sql {
3622 None
3623 } else {
3624 Some(format!(
3625 "Mismatch:\n input: {}\n output: {}",
3626 sql, output
3627 ))
3628 }
3629 }
3630
3631 let tests = vec![
3632 "SELECT c /* c1 /* c2 */ c3 */",
3634 "SELECT c /* c1 /* c2 /* c3 */ */ */",
3635 "SELECT c /* c1 */ AS alias /* c2 */",
3637 "SELECT a /* x */, b /* x */",
3639 "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
3641 "SELECT * FROM foo /* x */, bla /* x */",
3643 "SELECT 1 /* comment */ + 1",
3645 "SELECT 1 /* c1 */ + 2 /* c2 */",
3646 "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
3647 "SELECT CAST(x AS INT) /* comment */ FROM foo",
3649 "SELECT FOO(x /* c */) /* FOO */, b /* b */",
3651 "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
3653 "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
3655 "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
3657 "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
3658 "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
3659 "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
3660 "/* comment */ CREATE TABLE foo AS SELECT 1",
3661 "INSERT INTO foo SELECT * FROM bar /* comment */",
3663 "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
3665 ];
3666
3667 let mut failures = Vec::new();
3668 for sql in tests {
3669 if let Some(e) = check_roundtrip(sql) {
3670 failures.push(e);
3671 }
3672 }
3673
3674 if !failures.is_empty() {
3675 panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
3676 }
3677 }
3678
3679 #[test]
3680 fn test_dollar_quoted_string_parsing() {
3681 use crate::dialects::{Dialect, DialectType};
3682
3683 let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
3685 assert_eq!(tag, Some("FOO".to_string()));
3686 assert_eq!(content, "content here");
3687
3688 let (tag, content) = super::parse_dollar_string_token("just content");
3689 assert_eq!(tag, None);
3690 assert_eq!(content, "just content");
3691
3692 fn check_databricks(sql: &str, expected: Option<&str>) {
3694 let d = Dialect::get(DialectType::Databricks);
3695 let ast = d
3696 .parse(sql)
3697 .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3698 assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3699 let transformed = d
3700 .transform(ast[0].clone())
3701 .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3702 let output = d
3703 .generate(&transformed)
3704 .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3705 let expected = expected.unwrap_or(sql);
3706 assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3707 }
3708
3709 check_databricks(
3711 "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n return x+1$$",
3712 None
3713 );
3714
3715 check_databricks(
3717 "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n return x+1$FOO$",
3718 None
3719 );
3720 }
3721}