use crate::error::{Error, Result};
use serde::{Deserialize, Serialize};
use std::fmt;

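/// Splits a dollar-quoted string token back into its optional tag and body.
///
/// `try_scan_tagged_dollar_string` below stores a tagged dollar string such as
/// `$tag$ ... $tag$` as a single token whose text is `"<tag>\x00<body>"`; this helper
/// reverses that encoding. Untagged text comes back unchanged with a `None` tag.
/// Illustrative sketch:
///
/// ```text
/// let (tag, body) = parse_dollar_string_token("fn\u{0}SELECT 1");
/// assert_eq!(tag.as_deref(), Some("fn"));
/// assert_eq!(body, "SELECT 1");
/// ```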
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    if let Some(pos) = text.find('\x00') {
        let tag = &text[..pos];
        let content = &text[pos + 1..];
        (Some(tag.to_string()), content.to_string())
    } else {
        (None, text.to_string())
    }
}
22
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct Span {
    pub start: usize,
    pub end: usize,
    pub line: usize,
    pub column: usize,
}

impl Span {
    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
        Self {
            start,
            end,
            line,
            column,
        }
    }
}
46
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    pub token_type: TokenType,
    pub text: String,
    pub span: Span,
    #[serde(default)]
    pub comments: Vec<String>,
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
63
impl Token {
    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
        Self {
            token_type,
            text: text.into(),
            span,
            comments: Vec::new(),
            trailing_comments: Vec::new(),
        }
    }

    pub fn number(n: i64) -> Self {
        Self::new(TokenType::Number, n.to_string(), Span::default())
    }

    pub fn string(s: impl Into<String>) -> Self {
        Self::new(TokenType::String, s, Span::default())
    }

    pub fn identifier(s: impl Into<String>) -> Self {
        Self::new(TokenType::Identifier, s, Span::default())
    }

    pub fn var(s: impl Into<String>) -> Self {
        Self::new(TokenType::Var, s, Span::default())
    }

    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
        self.comments.push(comment.into());
        self
    }
}
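
// Illustrative sketch: the helper constructors above build detached tokens with a
// default (zeroed) `Span`, which is convenient when synthesizing tokens outside the
// tokenizer, e.g.
//
//     let tok = Token::identifier("users").with_comment("synthesized");
//     assert_eq!(tok.token_type, TokenType::Identifier);
//     assert_eq!(tok.comments, vec!["synthesized".to_string()]);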
102
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}({})", self.token_type, self.text)
    }
}
108
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
114 LParen,
116 RParen,
117 LBracket,
118 RBracket,
119 LBrace,
120 RBrace,
121 Comma,
122 Dot,
123 Dash,
124 Plus,
125 Colon,
126 DotColon,
127 DColon,
128 DColonDollar,
129 DColonPercent,
130 DColonQMark,
131 DQMark,
132 Semicolon,
133 Star,
134 Backslash,
135 Slash,
136 Lt,
137 Lte,
138 Gt,
139 Gte,
140 Not,
141 Eq,
142 Neq,
143 NullsafeEq,
144 ColonEq,
145 ColonGt,
146 NColonGt,
147 And,
148 Or,
149 Amp,
150 DPipe,
151 PipeGt,
152 Pipe,
153 PipeSlash,
154 DPipeSlash,
155 Caret,
156 CaretAt,
    LtLt,
    GtGt,
    Tilde,
160 Arrow,
161 DArrow,
162 FArrow,
163 Hash,
164 HashArrow,
165 DHashArrow,
166 LrArrow,
167 DAt,
168 AtAt,
169 LtAt,
170 AtGt,
171 Dollar,
172 Parameter,
173 Session,
174 SessionParameter,
175 SessionUser,
176 DAmp,
177 AmpLt,
178 AmpGt,
179 Adjacent,
180 Xor,
181 DStar,
182 QMarkAmp,
183 QMarkPipe,
184 HashDash,
185 Exclamation,
186
187 UriStart,
188 BlockStart,
189 BlockEnd,
190 Space,
191 Break,
192
    BlockComment,
    LineComment,
    String,
    DollarString,
    TripleDoubleQuotedString,
    TripleSingleQuotedString,
    Number,
203 Identifier,
204 QuotedIdentifier,
205 Database,
206 Column,
207 ColumnDef,
208 Schema,
209 Table,
210 Warehouse,
211 Stage,
212 Streamlit,
213 Var,
214 BitString,
215 HexString,
216 HexNumber,
218 ByteString,
219 NationalString,
    EscapeString,
    RawString,
222 HeredocString,
223 HeredocStringAlternative,
224 UnicodeString,
225
226 Bit,
228 Boolean,
229 TinyInt,
230 UTinyInt,
231 SmallInt,
232 USmallInt,
233 MediumInt,
234 UMediumInt,
235 Int,
236 UInt,
237 BigInt,
238 UBigInt,
239 BigNum,
240 Int128,
241 UInt128,
242 Int256,
243 UInt256,
244 Float,
245 Double,
246 UDouble,
247 Decimal,
248 Decimal32,
249 Decimal64,
250 Decimal128,
251 Decimal256,
252 DecFloat,
253 UDecimal,
254 BigDecimal,
255 Char,
256 NChar,
257 VarChar,
258 NVarChar,
259 BpChar,
260 Text,
261 MediumText,
262 LongText,
263 Blob,
264 MediumBlob,
265 LongBlob,
266 TinyBlob,
267 TinyText,
268 Name,
269 Binary,
270 VarBinary,
271 Json,
272 JsonB,
273 Time,
274 TimeTz,
275 TimeNs,
276 Timestamp,
277 TimestampTz,
278 TimestampLtz,
279 TimestampNtz,
280 TimestampS,
281 TimestampMs,
282 TimestampNs,
283 DateTime,
284 DateTime2,
285 DateTime64,
286 SmallDateTime,
287 Date,
288 Date32,
289 Int4Range,
290 Int4MultiRange,
291 Int8Range,
292 Int8MultiRange,
293 NumRange,
294 NumMultiRange,
295 TsRange,
296 TsMultiRange,
297 TsTzRange,
298 TsTzMultiRange,
299 DateRange,
300 DateMultiRange,
301 Uuid,
302 Geography,
303 GeographyPoint,
304 Nullable,
305 Geometry,
306 Point,
307 Ring,
308 LineString,
309 LocalTime,
310 LocalTimestamp,
311 SysTimestamp,
312 MultiLineString,
313 Polygon,
314 MultiPolygon,
315 HllSketch,
316 HStore,
317 Super,
318 Serial,
319 SmallSerial,
320 BigSerial,
321 Xml,
322 Year,
323 UserDefined,
324 Money,
325 SmallMoney,
326 RowVersion,
327 Image,
328 Variant,
329 Object,
330 Inet,
331 IpAddress,
332 IpPrefix,
333 Ipv4,
334 Ipv6,
335 Enum,
336 Enum8,
337 Enum16,
338 FixedString,
339 LowCardinality,
340 Nested,
341 AggregateFunction,
342 SimpleAggregateFunction,
343 TDigest,
344 Unknown,
345 Vector,
346 Dynamic,
347 Void,
348
349 Add,
351 Alias,
352 Alter,
353 All,
354 Anti,
355 Any,
356 Apply,
357 Array,
358 Asc,
359 AsOf,
360 Attach,
361 AutoIncrement,
362 Begin,
363 Between,
364 BulkCollectInto,
365 Cache,
366 Cascade,
367 Case,
368 CharacterSet,
369 Cluster,
370 ClusterBy,
371 Collate,
372 Command,
373 Comment,
374 Commit,
375 Preserve,
376 Connect,
377 ConnectBy,
378 Constraint,
379 Copy,
380 Create,
381 Cross,
382 Cube,
383 CurrentDate,
384 CurrentDateTime,
385 CurrentSchema,
386 CurrentTime,
387 CurrentTimestamp,
388 CurrentUser,
389 CurrentRole,
390 CurrentCatalog,
391 Declare,
392 Default,
393 Delete,
394 Desc,
395 Describe,
396 Detach,
397 Dictionary,
398 Distinct,
399 Distribute,
400 DistributeBy,
401 Div,
402 Drop,
403 Else,
404 End,
405 Escape,
406 Except,
407 Execute,
408 Exists,
409 False,
410 Fetch,
411 File,
412 FileFormat,
413 Filter,
414 Final,
415 First,
416 For,
417 Force,
418 ForeignKey,
419 Format,
420 From,
421 Full,
422 Function,
423 Get,
424 Glob,
425 Global,
426 Grant,
427 GroupBy,
428 GroupingSets,
429 Having,
430 Hint,
431 Ignore,
432 ILike,
433 In,
434 Index,
435 IndexedBy,
436 Inner,
437 Input,
438 Insert,
439 Install,
440 Intersect,
441 Interval,
442 Into,
443 Inpath,
444 InputFormat,
445 Introducer,
446 IRLike,
447 Is,
448 IsNull,
449 Join,
450 JoinMarker,
451 Keep,
452 Key,
453 Kill,
454 Lambda,
455 Language,
456 Lateral,
457 Left,
458 Like,
    NotLike,
    NotILike,
    NotRLike,
    NotIRLike,
    Limit,
464 List,
465 Load,
466 Local,
467 Lock,
468 Map,
469 Match,
470 MatchCondition,
471 MatchRecognize,
472 MemberOf,
473 Materialized,
474 Merge,
475 Mod,
476 Model,
477 Natural,
478 Next,
479 NoAction,
480 Nothing,
481 NotNull,
482 Null,
483 ObjectIdentifier,
484 Offset,
485 On,
486 Only,
487 Operator,
488 OrderBy,
489 OrderSiblingsBy,
490 Ordered,
491 Ordinality,
492 Out,
493 Outer,
494 Output,
495 Over,
496 Overlaps,
497 Overwrite,
498 Partition,
499 PartitionBy,
500 Percent,
501 Pivot,
502 Placeholder,
503 Positional,
504 Pragma,
505 Prewhere,
506 PrimaryKey,
507 Procedure,
508 Properties,
509 PseudoType,
510 Put,
511 Qualify,
512 Quote,
513 QDColon,
514 Range,
515 Recursive,
516 Refresh,
517 Rename,
518 Replace,
519 Returning,
520 Revoke,
521 References,
522 Restrict,
523 Right,
524 RLike,
525 Rollback,
526 Rollup,
527 Row,
528 Rows,
529 Select,
530 Semi,
531 Savepoint,
532 Separator,
533 Sequence,
534 Serde,
535 SerdeProperties,
536 Set,
537 Settings,
538 Show,
539 Siblings,
540 SimilarTo,
541 Some,
542 Sort,
543 SortBy,
544 SoundsLike,
545 StartWith,
546 StorageIntegration,
547 StraightJoin,
548 Struct,
549 Summarize,
550 TableSample,
551 Sample,
552 Bernoulli,
553 System,
554 Block,
555 Seed,
556 Repeatable,
557 Tag,
558 Temporary,
559 Transaction,
560 To,
561 Top,
562 Then,
563 True,
564 Truncate,
565 Uncache,
566 Union,
567 Unnest,
568 Unpivot,
569 Update,
570 Use,
571 Using,
572 Values,
573 View,
574 SemanticView,
575 Volatile,
576 When,
577 Where,
578 Window,
579 With,
580 Ties,
581 Exclude,
582 No,
583 Others,
584 Unique,
585 UtcDate,
586 UtcTime,
587 UtcTimestamp,
588 VersionSnapshot,
589 TimestampSnapshot,
590 Option,
591 Sink,
592 Source,
593 Analyze,
594 Namespace,
595 Export,
596 As,
597 By,
598 Nulls,
599 Respect,
600 Last,
601 If,
602 Cast,
603 TryCast,
604 SafeCast,
605 Count,
606 Extract,
607 Substring,
608 Trim,
609 Leading,
610 Trailing,
611 Both,
612 Position,
613 Overlaying,
614 Placing,
615 Treat,
616 Within,
617 Group,
618 Order,
619
620 Unbounded,
622 Preceding,
623 Following,
624 Current,
625 Groups,
626
627 Trigger,
629 Type,
630 Domain,
631 Returns,
632 Body,
633 Increment,
634 Minvalue,
635 Maxvalue,
636 Start,
637 Cycle,
638 NoCycle,
639 Prior,
640 Generated,
641 Identity,
642 Always,
643 Measures,
645 Pattern,
646 Define,
647 Running,
648 Owned,
649 After,
650 Before,
651 Instead,
652 Each,
653 Statement,
654 Referencing,
655 Old,
656 New,
657 Of,
658 Check,
659 Authorization,
660 Restart,
661
662 Eof,
664}
665
impl TokenType {
667 pub fn is_keyword(&self) -> bool {
669 matches!(
670 self,
671 TokenType::Select
672 | TokenType::From
673 | TokenType::Where
674 | TokenType::And
675 | TokenType::Or
676 | TokenType::Not
677 | TokenType::In
678 | TokenType::Is
679 | TokenType::Null
680 | TokenType::True
681 | TokenType::False
682 | TokenType::As
683 | TokenType::On
684 | TokenType::Join
685 | TokenType::Left
686 | TokenType::Right
687 | TokenType::Inner
688 | TokenType::Outer
689 | TokenType::Full
690 | TokenType::Cross
691 | TokenType::Semi
692 | TokenType::Anti
693 | TokenType::Union
694 | TokenType::Except
695 | TokenType::Intersect
696 | TokenType::GroupBy
697 | TokenType::OrderBy
698 | TokenType::Having
699 | TokenType::Limit
700 | TokenType::Offset
701 | TokenType::Case
702 | TokenType::When
703 | TokenType::Then
704 | TokenType::Else
705 | TokenType::End
706 | TokenType::Create
707 | TokenType::Drop
708 | TokenType::Alter
709 | TokenType::Insert
710 | TokenType::Update
711 | TokenType::Delete
712 | TokenType::Into
713 | TokenType::Values
714 | TokenType::Set
715 | TokenType::With
716 | TokenType::Distinct
717 | TokenType::All
718 | TokenType::Exists
719 | TokenType::Between
720 | TokenType::Like
721 | TokenType::ILike
722 | TokenType::Filter
724 | TokenType::Date
725 | TokenType::Timestamp
726 | TokenType::TimestampTz
727 | TokenType::Interval
728 | TokenType::Time
729 | TokenType::Table
730 | TokenType::Index
731 | TokenType::Column
732 | TokenType::Database
733 | TokenType::Schema
734 | TokenType::View
735 | TokenType::Function
736 | TokenType::Procedure
737 | TokenType::Trigger
738 | TokenType::Sequence
739 | TokenType::Over
740 | TokenType::Partition
741 | TokenType::Window
742 | TokenType::Rows
743 | TokenType::Range
744 | TokenType::First
745 | TokenType::Last
746 | TokenType::Preceding
747 | TokenType::Following
748 | TokenType::Current
749 | TokenType::Row
750 | TokenType::Unbounded
751 | TokenType::Array
752 | TokenType::Struct
753 | TokenType::Map
754 | TokenType::PrimaryKey
755 | TokenType::Key
756 | TokenType::ForeignKey
757 | TokenType::References
758 | TokenType::Unique
759 | TokenType::Check
760 | TokenType::Default
761 | TokenType::Constraint
762 | TokenType::Comment
763 | TokenType::Rollup
764 | TokenType::Cube
765 | TokenType::Grant
766 | TokenType::Revoke
767 | TokenType::Type
768 | TokenType::Use
769 | TokenType::Cache
770 | TokenType::Uncache
771 | TokenType::Load
772 | TokenType::Any
773 | TokenType::Some
774 | TokenType::Asc
775 | TokenType::Desc
776 | TokenType::Nulls
777 | TokenType::Lateral
778 | TokenType::Natural
779 | TokenType::Escape
780 | TokenType::Glob
781 | TokenType::Match
782 | TokenType::Recursive
783 | TokenType::Replace
784 | TokenType::Returns
785 | TokenType::If
786 | TokenType::Pivot
787 | TokenType::Unpivot
788 | TokenType::Json
789 | TokenType::Blob
790 | TokenType::Text
791 | TokenType::Int
792 | TokenType::BigInt
793 | TokenType::SmallInt
794 | TokenType::TinyInt
795 | TokenType::Int128
796 | TokenType::UInt128
797 | TokenType::Int256
798 | TokenType::UInt256
799 | TokenType::UInt
800 | TokenType::UBigInt
801 | TokenType::Float
802 | TokenType::Double
803 | TokenType::Decimal
804 | TokenType::Boolean
805 | TokenType::VarChar
806 | TokenType::Char
807 | TokenType::Binary
808 | TokenType::VarBinary
809 | TokenType::No
810 | TokenType::DateTime
811 | TokenType::Truncate
812 | TokenType::Execute
813 | TokenType::Merge
814 | TokenType::Top
815 | TokenType::Begin
816 | TokenType::Generated
817 | TokenType::Identity
818 | TokenType::Always
819 | TokenType::Extract
820 | TokenType::AsOf
822 | TokenType::Prior
823 | TokenType::After
824 | TokenType::Restrict
825 | TokenType::Cascade
826 | TokenType::Local
827 | TokenType::Rename
828 | TokenType::Enum
829 | TokenType::Within
830 | TokenType::Format
831 | TokenType::Final
832 | TokenType::FileFormat
833 | TokenType::Input
834 | TokenType::InputFormat
835 | TokenType::Copy
836 | TokenType::Put
837 | TokenType::Get
838 | TokenType::Show
839 | TokenType::Serde
840 | TokenType::Sample
841 | TokenType::Sort
842 | TokenType::Collate
843 | TokenType::Ties
844 | TokenType::IsNull
845 | TokenType::NotNull
846 | TokenType::Exclude
847 | TokenType::Temporary
848 | TokenType::Add
849 | TokenType::Ordinality
850 | TokenType::Overlaps
851 | TokenType::Block
852 | TokenType::Pattern
853 | TokenType::Group
854 | TokenType::Cluster
855 | TokenType::Repeatable
856 | TokenType::Groups
857 | TokenType::Commit
858 | TokenType::Warehouse
859 | TokenType::System
860 | TokenType::By
861 | TokenType::To
862 | TokenType::Fetch
863 | TokenType::For
864 | TokenType::Only
865 | TokenType::Next
866 | TokenType::Lock
867 | TokenType::Refresh
868 | TokenType::Settings
869 | TokenType::Operator
870 | TokenType::Overwrite
871 | TokenType::StraightJoin
872 | TokenType::Start
873 | TokenType::Ignore
875 | TokenType::Domain
876 | TokenType::Apply
877 | TokenType::Respect
878 | TokenType::Materialized
879 | TokenType::Prewhere
880 | TokenType::Old
881 | TokenType::New
882 | TokenType::Cast
883 | TokenType::TryCast
884 | TokenType::SafeCast
885 | TokenType::Transaction
886 | TokenType::Describe
887 | TokenType::Kill
888 | TokenType::Lambda
889 | TokenType::Declare
890 | TokenType::Keep
891 | TokenType::Output
892 | TokenType::Percent
893 | TokenType::Qualify
894 | TokenType::Returning
895 | TokenType::Language
896 | TokenType::Preserve
897 | TokenType::Savepoint
898 | TokenType::Rollback
899 | TokenType::Body
900 | TokenType::Increment
901 | TokenType::Minvalue
902 | TokenType::Maxvalue
903 | TokenType::Cycle
904 | TokenType::NoCycle
905 | TokenType::Seed
906 | TokenType::Namespace
907 | TokenType::Authorization
908 | TokenType::Order
909 | TokenType::Restart
910 | TokenType::Before
911 | TokenType::Instead
912 | TokenType::Each
913 | TokenType::Statement
914 | TokenType::Referencing
915 | TokenType::Of
916 | TokenType::Separator
917 | TokenType::Others
918 | TokenType::Placing
919 | TokenType::Owned
920 | TokenType::Running
921 | TokenType::Define
922 | TokenType::Measures
923 | TokenType::MatchRecognize
924 | TokenType::AutoIncrement
925 | TokenType::Connect
926 | TokenType::Distribute
927 | TokenType::Bernoulli
928 | TokenType::TableSample
929 | TokenType::Inpath
930 | TokenType::Pragma
931 | TokenType::Siblings
932 | TokenType::SerdeProperties
933 | TokenType::RLike
934 )
935 }
936
937 pub fn is_comparison(&self) -> bool {
939 matches!(
940 self,
941 TokenType::Eq
942 | TokenType::Neq
943 | TokenType::Lt
944 | TokenType::Lte
945 | TokenType::Gt
946 | TokenType::Gte
947 | TokenType::NullsafeEq
948 )
949 }
950
951 pub fn is_arithmetic(&self) -> bool {
953 matches!(
954 self,
955 TokenType::Plus
956 | TokenType::Dash
957 | TokenType::Star
958 | TokenType::Slash
959 | TokenType::Percent
960 | TokenType::Mod
961 | TokenType::Div
962 )
963 }
964}
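
// Illustrative sketch: these predicates let a parser branch on broad token classes
// without enumerating variants, e.g.
//
//     assert!(TokenType::Select.is_keyword());
//     assert!(TokenType::Gte.is_comparison());
//     assert!(TokenType::Star.is_arithmetic());
//     assert!(!TokenType::Comma.is_comparison());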
965
impl fmt::Display for TokenType {
967 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
968 write!(f, "{:?}", self)
969 }
970}
971
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    pub keywords: std::collections::HashMap<String, TokenType>,
    pub single_tokens: std::collections::HashMap<char, TokenType>,
    pub quotes: std::collections::HashMap<String, String>,
    pub identifiers: std::collections::HashMap<char, char>,
    pub comments: std::collections::HashMap<String, Option<String>>,
    pub string_escapes: Vec<char>,
    pub nested_comments: bool,
    pub escape_follow_chars: Vec<char>,
    pub b_prefix_is_byte_string: bool,
    pub numeric_literals: std::collections::HashMap<String, String>,
    pub identifiers_can_start_with_digit: bool,
    pub hex_number_strings: bool,
    pub hex_string_is_integer_type: bool,
    pub string_escapes_allowed_in_raw_strings: bool,
    pub hash_comments: bool,
    pub dollar_sign_is_identifier: bool,
    pub insert_format_raw_data: bool,
}
1028
impl Default for TokenizerConfig {
1030 fn default() -> Self {
1031 let mut keywords = std::collections::HashMap::new();
1032 keywords.insert("SELECT".to_string(), TokenType::Select);
1034 keywords.insert("FROM".to_string(), TokenType::From);
1035 keywords.insert("WHERE".to_string(), TokenType::Where);
1036 keywords.insert("AND".to_string(), TokenType::And);
1037 keywords.insert("OR".to_string(), TokenType::Or);
1038 keywords.insert("NOT".to_string(), TokenType::Not);
1039 keywords.insert("AS".to_string(), TokenType::As);
1040 keywords.insert("ON".to_string(), TokenType::On);
1041 keywords.insert("JOIN".to_string(), TokenType::Join);
1042 keywords.insert("LEFT".to_string(), TokenType::Left);
1043 keywords.insert("RIGHT".to_string(), TokenType::Right);
1044 keywords.insert("INNER".to_string(), TokenType::Inner);
1045 keywords.insert("OUTER".to_string(), TokenType::Outer);
1046 keywords.insert("OUTPUT".to_string(), TokenType::Output);
1047 keywords.insert("FULL".to_string(), TokenType::Full);
1048 keywords.insert("CROSS".to_string(), TokenType::Cross);
1049 keywords.insert("SEMI".to_string(), TokenType::Semi);
1050 keywords.insert("ANTI".to_string(), TokenType::Anti);
1051 keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1052 keywords.insert("UNION".to_string(), TokenType::Union);
1053 keywords.insert("EXCEPT".to_string(), TokenType::Except);
1054 keywords.insert("MINUS".to_string(), TokenType::Except); keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1056 keywords.insert("GROUP".to_string(), TokenType::Group);
1057 keywords.insert("CUBE".to_string(), TokenType::Cube);
1058 keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1059 keywords.insert("WITHIN".to_string(), TokenType::Within);
1060 keywords.insert("ORDER".to_string(), TokenType::Order);
1061 keywords.insert("BY".to_string(), TokenType::By);
1062 keywords.insert("HAVING".to_string(), TokenType::Having);
1063 keywords.insert("LIMIT".to_string(), TokenType::Limit);
1064 keywords.insert("OFFSET".to_string(), TokenType::Offset);
1065 keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1066 keywords.insert("FETCH".to_string(), TokenType::Fetch);
1067 keywords.insert("FIRST".to_string(), TokenType::First);
1068 keywords.insert("NEXT".to_string(), TokenType::Next);
1069 keywords.insert("ONLY".to_string(), TokenType::Only);
1070 keywords.insert("KEEP".to_string(), TokenType::Keep);
1071 keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1072 keywords.insert("INPUT".to_string(), TokenType::Input);
1073 keywords.insert("CASE".to_string(), TokenType::Case);
1074 keywords.insert("WHEN".to_string(), TokenType::When);
1075 keywords.insert("THEN".to_string(), TokenType::Then);
1076 keywords.insert("ELSE".to_string(), TokenType::Else);
1077 keywords.insert("END".to_string(), TokenType::End);
1078 keywords.insert("ENDIF".to_string(), TokenType::End); keywords.insert("NULL".to_string(), TokenType::Null);
1080 keywords.insert("TRUE".to_string(), TokenType::True);
1081 keywords.insert("FALSE".to_string(), TokenType::False);
1082 keywords.insert("IS".to_string(), TokenType::Is);
1083 keywords.insert("IN".to_string(), TokenType::In);
1084 keywords.insert("BETWEEN".to_string(), TokenType::Between);
1085 keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1086 keywords.insert("LIKE".to_string(), TokenType::Like);
1087 keywords.insert("ILIKE".to_string(), TokenType::ILike);
1088 keywords.insert("RLIKE".to_string(), TokenType::RLike);
1089 keywords.insert("REGEXP".to_string(), TokenType::RLike);
1090 keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1091 keywords.insert("EXISTS".to_string(), TokenType::Exists);
1092 keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1093 keywords.insert("ALL".to_string(), TokenType::All);
1094 keywords.insert("WITH".to_string(), TokenType::With);
1095 keywords.insert("CREATE".to_string(), TokenType::Create);
1096 keywords.insert("DROP".to_string(), TokenType::Drop);
1097 keywords.insert("ALTER".to_string(), TokenType::Alter);
1098 keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1099 keywords.insert("TABLE".to_string(), TokenType::Table);
1100 keywords.insert("VIEW".to_string(), TokenType::View);
1101 keywords.insert("INDEX".to_string(), TokenType::Index);
1102 keywords.insert("COLUMN".to_string(), TokenType::Column);
1103 keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1104 keywords.insert("ADD".to_string(), TokenType::Add);
1105 keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1106 keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1107 keywords.insert("RENAME".to_string(), TokenType::Rename);
1108 keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1109 keywords.insert("TEMP".to_string(), TokenType::Temporary);
1110 keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1111 keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1112 keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1113 keywords.insert("KEY".to_string(), TokenType::Key);
1114 keywords.insert("KILL".to_string(), TokenType::Kill);
1115 keywords.insert("REFERENCES".to_string(), TokenType::References);
1116 keywords.insert("DEFAULT".to_string(), TokenType::Default);
1117 keywords.insert("DECLARE".to_string(), TokenType::Declare);
1118 keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1119 keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1121 keywords.insert("REPLACE".to_string(), TokenType::Replace);
1122 keywords.insert("TO".to_string(), TokenType::To);
1123 keywords.insert("INSERT".to_string(), TokenType::Insert);
1124 keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1125 keywords.insert("UPDATE".to_string(), TokenType::Update);
1126 keywords.insert("USE".to_string(), TokenType::Use);
1127 keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1128 keywords.insert("GLOB".to_string(), TokenType::Glob);
1129 keywords.insert("DELETE".to_string(), TokenType::Delete);
1130 keywords.insert("MERGE".to_string(), TokenType::Merge);
1131 keywords.insert("CACHE".to_string(), TokenType::Cache);
1132 keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1133 keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1134 keywords.insert("GRANT".to_string(), TokenType::Grant);
1135 keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1136 keywords.insert("COMMENT".to_string(), TokenType::Comment);
1137 keywords.insert("COLLATE".to_string(), TokenType::Collate);
1138 keywords.insert("INTO".to_string(), TokenType::Into);
1139 keywords.insert("VALUES".to_string(), TokenType::Values);
1140 keywords.insert("SET".to_string(), TokenType::Set);
1141 keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1142 keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1143 keywords.insert("ASC".to_string(), TokenType::Asc);
1144 keywords.insert("DESC".to_string(), TokenType::Desc);
1145 keywords.insert("NULLS".to_string(), TokenType::Nulls);
1146 keywords.insert("RESPECT".to_string(), TokenType::Respect);
1147 keywords.insert("FIRST".to_string(), TokenType::First);
1148 keywords.insert("LAST".to_string(), TokenType::Last);
1149 keywords.insert("IF".to_string(), TokenType::If);
1150 keywords.insert("CAST".to_string(), TokenType::Cast);
1151 keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1152 keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1153 keywords.insert("OVER".to_string(), TokenType::Over);
1154 keywords.insert("PARTITION".to_string(), TokenType::Partition);
1155 keywords.insert("PLACING".to_string(), TokenType::Placing);
1156 keywords.insert("WINDOW".to_string(), TokenType::Window);
1157 keywords.insert("ROWS".to_string(), TokenType::Rows);
1158 keywords.insert("RANGE".to_string(), TokenType::Range);
1159 keywords.insert("FILTER".to_string(), TokenType::Filter);
1160 keywords.insert("NATURAL".to_string(), TokenType::Natural);
1161 keywords.insert("USING".to_string(), TokenType::Using);
1162 keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1163 keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1164 keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1165 keywords.insert("CURRENT".to_string(), TokenType::Current);
1166 keywords.insert("ROW".to_string(), TokenType::Row);
1167 keywords.insert("GROUPS".to_string(), TokenType::Groups);
1168 keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1169 keywords.insert("BOTH".to_string(), TokenType::Both);
1171 keywords.insert("LEADING".to_string(), TokenType::Leading);
1172 keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1173 keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1174 keywords.insert("TOP".to_string(), TokenType::Top);
1176 keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1177 keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1178 keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1179 keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1180 keywords.insert("SYSTEM".to_string(), TokenType::System);
1181 keywords.insert("BLOCK".to_string(), TokenType::Block);
1182 keywords.insert("SEED".to_string(), TokenType::Seed);
1183 keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1184 keywords.insert("TIES".to_string(), TokenType::Ties);
1185 keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1186 keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1187 keywords.insert("APPLY".to_string(), TokenType::Apply);
1188 keywords.insert("CONNECT".to_string(), TokenType::Connect);
1190 keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1192 keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1193 keywords.insert("SORT".to_string(), TokenType::Sort);
1194 keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1195 keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1196 keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1197 keywords.insert("FOR".to_string(), TokenType::For);
1198 keywords.insert("ANY".to_string(), TokenType::Any);
1199 keywords.insert("SOME".to_string(), TokenType::Some);
1200 keywords.insert("ASOF".to_string(), TokenType::AsOf);
1201 keywords.insert("PERCENT".to_string(), TokenType::Percent);
1202 keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1203 keywords.insert("NO".to_string(), TokenType::No);
1204 keywords.insert("OTHERS".to_string(), TokenType::Others);
1205 keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1207 keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1209 keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1210 keywords.insert("DATABASE".to_string(), TokenType::Database);
1211 keywords.insert("FUNCTION".to_string(), TokenType::Function);
1212 keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1213 keywords.insert("PROC".to_string(), TokenType::Procedure);
1214 keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1215 keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1216 keywords.insert("TYPE".to_string(), TokenType::Type);
1217 keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1218 keywords.insert("RETURNS".to_string(), TokenType::Returns);
1219 keywords.insert("RETURNING".to_string(), TokenType::Returning);
1220 keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1221 keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1222 keywords.insert("COMMIT".to_string(), TokenType::Commit);
1223 keywords.insert("BEGIN".to_string(), TokenType::Begin);
1224 keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1225 keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1226 keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1227 keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1228 keywords.insert("BODY".to_string(), TokenType::Body);
1229 keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1230 keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1231 keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1232 keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1233 keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1234 keywords.insert("PRIOR".to_string(), TokenType::Prior);
1235 keywords.insert("MATCH".to_string(), TokenType::Match);
1237 keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1238 keywords.insert("MEASURES".to_string(), TokenType::Measures);
1239 keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1240 keywords.insert("DEFINE".to_string(), TokenType::Define);
1241 keywords.insert("RUNNING".to_string(), TokenType::Running);
1242 keywords.insert("FINAL".to_string(), TokenType::Final);
1243 keywords.insert("OWNED".to_string(), TokenType::Owned);
1244 keywords.insert("AFTER".to_string(), TokenType::After);
1245 keywords.insert("BEFORE".to_string(), TokenType::Before);
1246 keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1247 keywords.insert("EACH".to_string(), TokenType::Each);
1248 keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1249 keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1250 keywords.insert("OLD".to_string(), TokenType::Old);
1251 keywords.insert("NEW".to_string(), TokenType::New);
1252 keywords.insert("OF".to_string(), TokenType::Of);
1253 keywords.insert("CHECK".to_string(), TokenType::Check);
1254 keywords.insert("START".to_string(), TokenType::Start);
1255 keywords.insert("ENUM".to_string(), TokenType::Enum);
1256 keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1257 keywords.insert("RESTART".to_string(), TokenType::Restart);
1258 keywords.insert("DATE".to_string(), TokenType::Date);
1260 keywords.insert("TIME".to_string(), TokenType::Time);
1261 keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1262 keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1263 keywords.insert("GENERATED".to_string(), TokenType::Generated);
1264 keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1265 keywords.insert("ALWAYS".to_string(), TokenType::Always);
1266 keywords.insert("LOAD".to_string(), TokenType::Load);
1268 keywords.insert("LOCAL".to_string(), TokenType::Local);
1269 keywords.insert("INPATH".to_string(), TokenType::Inpath);
1270 keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1271 keywords.insert("SERDE".to_string(), TokenType::Serde);
1272 keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1273 keywords.insert("FORMAT".to_string(), TokenType::Format);
1274 keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1276 keywords.insert("SHOW".to_string(), TokenType::Show);
1278 keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1280 keywords.insert("COPY".to_string(), TokenType::Copy);
1282 keywords.insert("PUT".to_string(), TokenType::Put);
1283 keywords.insert("GET".to_string(), TokenType::Get);
1284 keywords.insert("EXEC".to_string(), TokenType::Execute);
1286 keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1287 keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1289 keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1290
1291 let mut single_tokens = std::collections::HashMap::new();
1292 single_tokens.insert('(', TokenType::LParen);
1293 single_tokens.insert(')', TokenType::RParen);
1294 single_tokens.insert('[', TokenType::LBracket);
1295 single_tokens.insert(']', TokenType::RBracket);
1296 single_tokens.insert('{', TokenType::LBrace);
1297 single_tokens.insert('}', TokenType::RBrace);
1298 single_tokens.insert(',', TokenType::Comma);
1299 single_tokens.insert('.', TokenType::Dot);
1300 single_tokens.insert(';', TokenType::Semicolon);
1301 single_tokens.insert('+', TokenType::Plus);
1302 single_tokens.insert('-', TokenType::Dash);
1303 single_tokens.insert('*', TokenType::Star);
1304 single_tokens.insert('/', TokenType::Slash);
1305 single_tokens.insert('%', TokenType::Percent);
1306 single_tokens.insert('&', TokenType::Amp);
1307 single_tokens.insert('|', TokenType::Pipe);
1308 single_tokens.insert('^', TokenType::Caret);
1309 single_tokens.insert('~', TokenType::Tilde);
1310 single_tokens.insert('<', TokenType::Lt);
1311 single_tokens.insert('>', TokenType::Gt);
1312 single_tokens.insert('=', TokenType::Eq);
1313 single_tokens.insert('!', TokenType::Exclamation);
1314 single_tokens.insert(':', TokenType::Colon);
1315 single_tokens.insert('@', TokenType::DAt);
1316 single_tokens.insert('#', TokenType::Hash);
1317 single_tokens.insert('$', TokenType::Dollar);
1318 single_tokens.insert('?', TokenType::Parameter);
1319
1320 let mut quotes = std::collections::HashMap::new();
1321 quotes.insert("'".to_string(), "'".to_string());
1322 quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());
1324
1325 let mut identifiers = std::collections::HashMap::new();
1326 identifiers.insert('"', '"');
1327 identifiers.insert('`', '`');
1328 let mut comments = std::collections::HashMap::new();
1332 comments.insert("--".to_string(), None);
1333 comments.insert("/*".to_string(), Some("*/".to_string()));
1334
1335 Self {
1336 keywords,
1337 single_tokens,
1338 quotes,
1339 identifiers,
1340 comments,
1341 string_escapes: vec!['\''],
1344 nested_comments: true,
1345 escape_follow_chars: vec![],
1347 b_prefix_is_byte_string: false,
1349 numeric_literals: std::collections::HashMap::new(),
1350 identifiers_can_start_with_digit: false,
1351 hex_number_strings: false,
1352 hex_string_is_integer_type: false,
1353 string_escapes_allowed_in_raw_strings: true,
1356 hash_comments: false,
1357 dollar_sign_is_identifier: false,
1358 insert_format_raw_data: false,
1359 }
1360 }
1361}
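
// Illustrative sketch of how a dialect-specific setup might customize the default
// configuration (the particular overrides below are hypothetical):
//
//     let mut config = TokenizerConfig::default();
//     config.hash_comments = true;          // allow `#` line comments
//     config.string_escapes.push('\\');     // allow backslash escapes in strings
//     let tokenizer = Tokenizer::new(config);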
1362
pub struct Tokenizer {
    config: TokenizerConfig,
}

impl Tokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self { config }
    }

    pub fn default_config() -> Self {
        Self::new(TokenizerConfig::default())
    }

    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
        let mut state = TokenizerState::new(sql, &self.config);
        state.tokenize()
    }
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::default_config()
    }
}
1391
struct TokenizerState<'a> {
    source: &'a str,
    source_is_ascii: bool,
    chars: Vec<char>,
    size: usize,
    tokens: Vec<Token>,
    start: usize,
    current: usize,
    line: usize,
    column: usize,
    comments: Vec<String>,
    config: &'a TokenizerConfig,
}
1406
1407impl<'a> TokenizerState<'a> {
1408 fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1409 let chars: Vec<char> = sql.chars().collect();
1410 let size = chars.len();
1411 Self {
1412 source: sql,
1413 source_is_ascii: sql.is_ascii(),
1414 chars,
1415 size,
1416 tokens: Vec::new(),
1417 start: 0,
1418 current: 0,
1419 line: 1,
1420 column: 1,
1421 comments: Vec::new(),
1422 config,
1423 }
1424 }
1425
1426 fn tokenize(&mut self) -> Result<Vec<Token>> {
1427 while !self.is_at_end() {
1428 self.skip_whitespace();
1429 if self.is_at_end() {
1430 break;
1431 }
1432
1433 self.start = self.current;
1434 self.scan_token()?;
1435
1436 if self.config.insert_format_raw_data {
1439 if let Some(raw) = self.try_scan_insert_format_raw_data() {
1440 if !raw.is_empty() {
1441 self.start = self.current;
1442 self.add_token_with_text(TokenType::Var, raw);
1443 }
1444 }
1445 }
1446 }
1447
1448 if !self.comments.is_empty() {
1453 if let Some(last) = self.tokens.last_mut() {
1454 last.trailing_comments.extend(self.comments.drain(..));
1455 }
1456 }
1457
1458 Ok(std::mem::take(&mut self.tokens))
1459 }
1460
1461 fn is_at_end(&self) -> bool {
1462 self.current >= self.size
1463 }
1464
1465 #[inline]
1466 fn text_from_range(&self, start: usize, end: usize) -> String {
1467 if self.source_is_ascii {
1468 self.source[start..end].to_string()
1469 } else {
1470 self.chars[start..end].iter().collect()
1471 }
1472 }
1473
1474 fn peek(&self) -> char {
1475 if self.is_at_end() {
1476 '\0'
1477 } else {
1478 self.chars[self.current]
1479 }
1480 }
1481
1482 fn peek_next(&self) -> char {
1483 if self.current + 1 >= self.size {
1484 '\0'
1485 } else {
1486 self.chars[self.current + 1]
1487 }
1488 }
1489
1490 fn advance(&mut self) -> char {
1491 let c = self.peek();
1492 self.current += 1;
1493 if c == '\n' {
1494 self.line += 1;
1495 self.column = 1;
1496 } else {
1497 self.column += 1;
1498 }
1499 c
1500 }
1501
1502 fn skip_whitespace(&mut self) {
1503 let mut saw_newline = false;
1508 while !self.is_at_end() {
1509 let c = self.peek();
1510 match c {
1511 ' ' | '\t' | '\r' => {
1512 self.advance();
1513 }
1514 '\n' => {
1515 saw_newline = true;
1516 self.advance();
1517 }
1518 '\u{00A0}' | '\u{2000}'..='\u{200B}' | '\u{3000}' | '\u{FEFF}' => {
1523 self.advance();
1524 }
1525 '-' if self.peek_next() == '-' => {
1526 self.scan_line_comment(saw_newline);
1527 saw_newline = true;
1529 }
1530 '/' if self.peek_next() == '/' && self.config.hash_comments => {
1531 self.scan_double_slash_comment();
1533 }
1534 '/' if self.peek_next() == '*' => {
1535 if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
1537 break;
1539 }
1540 if self.scan_block_comment(saw_newline).is_err() {
1541 return;
1542 }
1543 }
1545 '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
1546 let prev_non_ws = if self.current > 0 {
1550 let mut i = self.current - 1;
1551 while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
1552 i -= 1;
1553 }
1554 self.chars[i]
1555 } else {
1556 '\0'
1557 };
1558 if prev_non_ws == ':' || prev_non_ws == '/' {
1559 break;
1561 }
1562 self.scan_line_comment(saw_newline);
1563 saw_newline = true;
1565 }
1566 '#' if self.config.hash_comments => {
1567 self.scan_hash_line_comment();
1568 }
1569 _ => break,
1570 }
1571 }
1572 }
1573
1574 fn scan_hash_line_comment(&mut self) {
        self.advance(); // consume '#'
        let start = self.current;
1577 while !self.is_at_end() && self.peek() != '\n' {
1578 self.advance();
1579 }
1580 let comment = self.text_from_range(start, self.current);
1581 let comment_text = comment.trim().to_string();
1582 if let Some(last) = self.tokens.last_mut() {
1583 last.trailing_comments.push(comment_text);
1584 } else {
1585 self.comments.push(comment_text);
1586 }
1587 }
1588
1589 fn scan_double_slash_comment(&mut self) {
        self.advance(); // consume first '/'
        self.advance(); // consume second '/'
        let start = self.current;
1593 while !self.is_at_end() && self.peek() != '\n' {
1594 self.advance();
1595 }
1596 let comment = self.text_from_range(start, self.current);
1597 let comment_text = comment.trim().to_string();
1598 if let Some(last) = self.tokens.last_mut() {
1599 last.trailing_comments.push(comment_text);
1600 } else {
1601 self.comments.push(comment_text);
1602 }
1603 }
1604
1605 fn scan_line_comment(&mut self, after_newline: bool) {
        self.advance(); // consume the first marker character ('-' or '/')
        self.advance(); // consume the second marker character
        let start = self.current;
1609 while !self.is_at_end() && self.peek() != '\n' {
1610 self.advance();
1611 }
1612 let comment_text = self.text_from_range(start, self.current);
1613
1614 if after_newline || self.tokens.is_empty() {
1617 self.comments.push(comment_text);
1618 } else if let Some(last) = self.tokens.last_mut() {
1619 last.trailing_comments.push(comment_text);
1620 }
1621 }
1622
1623 fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // consume '/'
        self.advance(); // consume '*'
        let content_start = self.current;
1627 let mut depth = 1;
1628
1629 while !self.is_at_end() && depth > 0 {
1630 if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
1631 self.advance();
1632 self.advance();
1633 depth += 1;
1634 } else if self.peek() == '*' && self.peek_next() == '/' {
1635 depth -= 1;
1636 if depth > 0 {
1637 self.advance();
1638 self.advance();
1639 }
1640 } else {
1641 self.advance();
1642 }
1643 }
1644
1645 if depth > 0 {
1646 return Err(Error::tokenize(
1647 "Unterminated block comment",
1648 self.line,
1649 self.column,
1650 ));
1651 }
1652
1653 let content = self.text_from_range(content_start, self.current);
        self.advance(); // consume '*'
        self.advance(); // consume '/'
        let comment_text = format!("/*{}*/", content);
1660
1661 if after_newline || self.tokens.is_empty() {
1664 self.comments.push(comment_text);
1665 } else if let Some(last) = self.tokens.last_mut() {
1666 last.trailing_comments.push(comment_text);
1667 }
1668
1669 Ok(())
1670 }
1671
1672 fn scan_hint(&mut self) -> Result<()> {
        self.advance(); // consume '/'
        self.advance(); // consume '*'
        self.advance(); // consume '+'
        let hint_start = self.current;
1678
1679 while !self.is_at_end() {
1681 if self.peek() == '*' && self.peek_next() == '/' {
1682 break;
1683 }
1684 self.advance();
1685 }
1686
1687 if self.is_at_end() {
1688 return Err(Error::tokenize(
1689 "Unterminated hint comment",
1690 self.line,
1691 self.column,
1692 ));
1693 }
1694
1695 let hint_text = self.text_from_range(hint_start, self.current);
        self.advance(); // consume '*'
        self.advance(); // consume '/'
        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1700
1701 Ok(())
1702 }
1703
1704 fn scan_positional_parameter(&mut self) -> Result<()> {
        self.advance(); // consume '$'
        let start = self.current;
1708
1709 while !self.is_at_end() && self.peek().is_ascii_digit() {
1710 self.advance();
1711 }
1712
1713 let number = self.text_from_range(start, self.current);
1714 self.add_token_with_text(TokenType::Parameter, number);
1715 Ok(())
1716 }
1717
1718 fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
1723 let saved_pos = self.current;
1724
        self.advance(); // consume the opening '$'
        let tag_start = self.current;
1731 while !self.is_at_end()
1732 && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1733 {
1734 self.advance();
1735 }
1736 let tag = self.text_from_range(tag_start, self.current);
1737
1738 if self.is_at_end() || self.peek() != '$' {
1740 self.current = saved_pos;
1742 return Ok(None);
1743 }
        self.advance(); // consume the '$' that closes the tag
        let content_start = self.current;
1748 let closing_tag = format!("${}$", tag);
1749 let closing_chars: Vec<char> = closing_tag.chars().collect();
1750
1751 loop {
1752 if self.is_at_end() {
1753 self.current = saved_pos;
1755 return Ok(None);
1756 }
1757
1758 if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1760 let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1761 self.current + j < self.size && self.chars[self.current + j] == ch
1762 });
1763 if matches {
1764 let content = self.text_from_range(content_start, self.current);
1765 for _ in 0..closing_chars.len() {
1767 self.advance();
1768 }
1769 let token_text = format!("{}\x00{}", tag, content);
1771 self.add_token_with_text(TokenType::DollarString, token_text);
1772 return Ok(Some(()));
1773 }
1774 }
1775 self.advance();
1776 }
1777 }
1778
1779 fn scan_dollar_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume first '$'
        self.advance(); // consume second '$'
        let start = self.current;
1789 while !self.is_at_end() {
1790 if self.peek() == '$'
1791 && self.current + 1 < self.size
1792 && self.chars[self.current + 1] == '$'
1793 {
1794 break;
1795 }
1796 self.advance();
1797 }
1798
1799 let content = self.text_from_range(start, self.current);
1800
1801 if !self.is_at_end() {
            self.advance(); // consume first closing '$'
            self.advance(); // consume second closing '$'
        }
1805
1806 self.add_token_with_text(TokenType::DollarString, content);
1807 Ok(())
1808 }
1809
1810 fn scan_token(&mut self) -> Result<()> {
1811 let c = self.peek();
1812
1813 if c == '\'' {
1815 if self.config.quotes.contains_key("'''")
1817 && self.peek_next() == '\''
1818 && self.current + 2 < self.size
1819 && self.chars[self.current + 2] == '\''
1820 {
1821 return self.scan_triple_quoted_string('\'');
1822 }
1823 return self.scan_string();
1824 }
1825
1826 if c == '"'
1828 && self.config.quotes.contains_key("\"\"\"")
1829 && self.peek_next() == '"'
1830 && self.current + 2 < self.size
1831 && self.chars[self.current + 2] == '"'
1832 {
1833 return self.scan_triple_quoted_string('"');
1834 }
1835
1836 if c == '"'
1839 && self.config.quotes.contains_key("\"")
1840 && !self.config.identifiers.contains_key(&'"')
1841 {
1842 return self.scan_double_quoted_string();
1843 }
1844
1845 if let Some(&end_quote) = self.config.identifiers.get(&c) {
1847 return self.scan_quoted_identifier(end_quote);
1848 }
1849
1850 if c.is_ascii_digit() {
1852 return self.scan_number();
1853 }
1854
1855 if c == '.' && self.peek_next().is_ascii_digit() {
1862 let prev_char = if self.current > 0 {
1863 self.chars[self.current - 1]
1864 } else {
1865 '\0'
1866 };
1867 let is_after_ident = prev_char.is_alphanumeric()
1868 || prev_char == '_'
1869 || prev_char == '`'
1870 || prev_char == '"'
1871 || prev_char == ']'
1872 || prev_char == ')';
1873 if prev_char != '.' && !is_after_ident {
1874 return self.scan_number_starting_with_dot();
1875 }
1876 }
1877
1878 if c == '/'
1880 && self.peek_next() == '*'
1881 && self.current + 2 < self.size
1882 && self.chars[self.current + 2] == '+'
1883 {
1884 return self.scan_hint();
1885 }
1886
1887 if let Some(token_type) = self.try_scan_multi_char_operator() {
1889 self.add_token(token_type);
1890 return Ok(());
1891 }
1892
1893 if c == '$'
1896 && (self.peek_next().is_alphanumeric()
1897 || self.peek_next() == '_'
1898 || !self.peek_next().is_ascii())
1899 {
1900 if let Some(()) = self.try_scan_tagged_dollar_string()? {
1901 return Ok(());
1902 }
1903 if self.config.dollar_sign_is_identifier {
1906 return self.scan_dollar_identifier();
1907 }
1908 }
1909
1910 if c == '$' && self.peek_next() == '$' {
1912 return self.scan_dollar_quoted_string();
1913 }
1914
1915 if c == '$' && self.peek_next().is_ascii_digit() {
1917 return self.scan_positional_parameter();
1918 }
1919
1920 if c == '$' && self.config.dollar_sign_is_identifier {
1922 return self.scan_dollar_identifier();
1923 }
1924
1925 if (c == '#' || c == '@')
1928 && (self.peek_next().is_alphanumeric()
1929 || self.peek_next() == '_'
1930 || self.peek_next() == '#')
1931 {
1932 return self.scan_tsql_identifier();
1933 }
1934
1935 if let Some(&token_type) = self.config.single_tokens.get(&c) {
1937 self.advance();
1938 self.add_token(token_type);
1939 return Ok(());
1940 }
1941
1942 if c == '\u{2212}' {
1944 self.advance();
1945 self.add_token(TokenType::Dash);
1946 return Ok(());
1947 }
1948
1949 if c == '\u{2044}' {
1951 self.advance();
1952 self.add_token(TokenType::Slash);
1953 return Ok(());
1954 }
1955
1956 if c == '\u{2018}' || c == '\u{2019}' {
1958 return self.scan_unicode_quoted_string(c);
1960 }
1961 if c == '\u{201C}' || c == '\u{201D}' {
1962 return self.scan_unicode_quoted_identifier(c);
1964 }
1965
1966 self.scan_identifier_or_keyword()
1968 }
1969
1970 fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
1971 let c = self.peek();
1972 let next = self.peek_next();
1973 let third = if self.current + 2 < self.size {
1974 self.chars[self.current + 2]
1975 } else {
1976 '\0'
1977 };
1978
1979 if c == '-' && next == '|' && third == '-' {
1982 self.advance();
1983 self.advance();
1984 self.advance();
1985 return Some(TokenType::Adjacent);
1986 }
1987
1988 if c == '|' && next == '|' && third == '/' {
1990 self.advance();
1991 self.advance();
1992 self.advance();
1993 return Some(TokenType::DPipeSlash);
1994 }
1995
1996 if c == '#' && next == '>' && third == '>' {
1998 self.advance();
1999 self.advance();
2000 self.advance();
2001 return Some(TokenType::DHashArrow);
2002 }
2003
2004 if c == '-' && next == '>' && third == '>' {
2006 self.advance();
2007 self.advance();
2008 self.advance();
2009 return Some(TokenType::DArrow);
2010 }
2011
2012 if c == '<' && next == '=' && third == '>' {
2014 self.advance();
2015 self.advance();
2016 self.advance();
2017 return Some(TokenType::NullsafeEq);
2018 }
2019
2020 if c == '<' && next == '-' && third == '>' {
2022 self.advance();
2023 self.advance();
2024 self.advance();
2025 return Some(TokenType::LrArrow);
2026 }
2027
2028 if c == '<' && next == '@' {
2030 self.advance();
2031 self.advance();
2032 return Some(TokenType::LtAt);
2033 }
2034
2035 if c == '@' && next == '>' {
2037 self.advance();
2038 self.advance();
2039 return Some(TokenType::AtGt);
2040 }
2041
2042 if c == '~' && next == '~' && third == '~' {
2044 self.advance();
2045 self.advance();
2046 self.advance();
2047 return Some(TokenType::Glob);
2048 }
2049
2050 if c == '~' && next == '~' && third == '*' {
2052 self.advance();
2053 self.advance();
2054 self.advance();
2055 return Some(TokenType::ILike);
2056 }
2057
2058 let fourth = if self.current + 3 < self.size {
2060 self.chars[self.current + 3]
2061 } else {
2062 '\0'
2063 };
2064 if c == '!' && next == '~' && third == '~' && fourth == '*' {
2065 self.advance();
2066 self.advance();
2067 self.advance();
2068 self.advance();
2069 return Some(TokenType::NotILike);
2070 }
2071
2072 if c == '!' && next == '~' && third == '~' {
2074 self.advance();
2075 self.advance();
2076 self.advance();
2077 return Some(TokenType::NotLike);
2078 }
2079
2080 if c == '!' && next == '~' && third == '*' {
2082 self.advance();
2083 self.advance();
2084 self.advance();
2085 return Some(TokenType::NotIRLike);
2086 }
2087
2088 if c == '!' && next == ':' && third == '>' {
2090 self.advance();
2091 self.advance();
2092 self.advance();
2093 return Some(TokenType::NColonGt);
2094 }
2095
2096 if c == '?' && next == ':' && third == ':' {
2098 self.advance();
2099 self.advance();
2100 self.advance();
2101 return Some(TokenType::QDColon);
2102 }
2103
2104 if c == '!' && next == '~' {
2106 self.advance();
2107 self.advance();
2108 return Some(TokenType::NotRLike);
2109 }
2110
2111 if c == '~' && next == '~' {
2113 self.advance();
2114 self.advance();
2115 return Some(TokenType::Like);
2116 }
2117
2118 if c == '~' && next == '*' {
2120 self.advance();
2121 self.advance();
2122 return Some(TokenType::IRLike);
2123 }
2124
2125 if c == ':' && next == ':' && third == '$' {
2128 self.advance();
2129 self.advance();
2130 self.advance();
2131 return Some(TokenType::DColonDollar);
2132 }
2133 if c == ':' && next == ':' && third == '%' {
2134 self.advance();
2135 self.advance();
2136 self.advance();
2137 return Some(TokenType::DColonPercent);
2138 }
2139 if c == ':' && next == ':' && third == '?' {
2140 self.advance();
2141 self.advance();
2142 self.advance();
2143 return Some(TokenType::DColonQMark);
2144 }
2145
        let token_type = match (c, next) {
            ('.', ':') => Some(TokenType::DotColon),
            ('=', '=') => Some(TokenType::Eq),
            ('<', '=') => Some(TokenType::Lte),
            ('>', '=') => Some(TokenType::Gte),
            ('!', '=') => Some(TokenType::Neq),
            ('<', '>') => Some(TokenType::Neq),
            ('^', '=') => Some(TokenType::Neq),
            ('<', '<') => Some(TokenType::LtLt),
            ('>', '>') => Some(TokenType::GtGt),
            ('|', '|') => Some(TokenType::DPipe),
            ('|', '/') => Some(TokenType::PipeSlash),
            (':', ':') => Some(TokenType::DColon),
            (':', '=') => Some(TokenType::ColonEq),
            (':', '>') => Some(TokenType::ColonGt),
            ('-', '>') => Some(TokenType::Arrow),
            ('=', '>') => Some(TokenType::FArrow),
            ('&', '&') => Some(TokenType::DAmp),
            ('&', '<') => Some(TokenType::AmpLt),
            ('&', '>') => Some(TokenType::AmpGt),
            ('@', '@') => Some(TokenType::AtAt),
            ('?', '|') => Some(TokenType::QMarkPipe),
            ('?', '&') => Some(TokenType::QMarkAmp),
            ('?', '?') => Some(TokenType::DQMark),
            ('#', '>') => Some(TokenType::HashArrow),
            ('#', '-') => Some(TokenType::HashDash),
            ('^', '@') => Some(TokenType::CaretAt),
            ('*', '*') => Some(TokenType::DStar),
            ('|', '>') => Some(TokenType::PipeGt),
            _ => None,
        };
2178
2179 if token_type.is_some() {
2180 self.advance();
2181 self.advance();
2182 }
2183
2184 token_type
2185 }
2186
2187 fn scan_string(&mut self) -> Result<()> {
        self.advance(); // consume opening quote
        let mut value = String::new();
2190
2191 while !self.is_at_end() {
2192 let c = self.peek();
2193 if c == '\'' {
2194 if self.peek_next() == '\'' {
2195 value.push('\'');
2197 self.advance();
2198 self.advance();
2199 } else {
2200 break;
2201 }
2202 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
2206 let escaped = self.advance();
2207 match escaped {
2208 'n' => value.push('\n'),
2209 'r' => value.push('\r'),
2210 't' => value.push('\t'),
2211 '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // SUB / Ctrl-Z
                        'a' => value.push('\x07'), // bell
                        'b' => value.push('\x08'), // backspace
                        'f' => value.push('\x0C'), // form feed
                        'v' => value.push('\x0B'), // vertical tab
                        'x' => {
2218 let mut hex = String::with_capacity(2);
2220 for _ in 0..2 {
2221 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2222 hex.push(self.advance());
2223 }
2224 }
2225 if hex.len() == 2 {
2226 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2227 value.push(byte as char);
2228 } else {
2229 value.push('\\');
2230 value.push('x');
2231 value.push_str(&hex);
2232 }
2233 } else {
2234 value.push('\\');
2236 value.push('x');
2237 value.push_str(&hex);
2238 }
2239 }
2240 '\\' => value.push('\\'),
2241 '\'' => value.push('\''),
2242 '"' => value.push('"'),
2243 '%' => {
2244 value.push('%');
2246 }
2247 '_' => {
2248 value.push('_');
2250 }
2251 _ => {
2255 if !self.config.escape_follow_chars.is_empty() {
2256 value.push(escaped);
2258 } else {
2259 value.push('\\');
2261 value.push(escaped);
2262 }
2263 }
2264 }
2265 }
2266 } else {
2267 value.push(self.advance());
2268 }
2269 }
2270
2271 if self.is_at_end() {
2272 return Err(Error::tokenize(
2273 "Unterminated string",
2274 self.line,
2275 self.column,
2276 ));
2277 }
2278
        self.advance(); // consume closing quote
        self.add_token_with_text(TokenType::String, value);
2281 Ok(())
2282 }
2283
2284 fn scan_double_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume opening '"'
        let mut value = String::new();
2288
2289 while !self.is_at_end() {
2290 let c = self.peek();
2291 if c == '"' {
2292 if self.peek_next() == '"' {
2293 value.push('"');
2295 self.advance();
2296 self.advance();
2297 } else {
2298 break;
2299 }
2300 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
2304 let escaped = self.advance();
2305 match escaped {
2306 'n' => value.push('\n'),
2307 'r' => value.push('\r'),
2308 't' => value.push('\t'),
2309 '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // SUB / Ctrl-Z
                        'a' => value.push('\x07'), // bell
                        'b' => value.push('\x08'), // backspace
                        'f' => value.push('\x0C'), // form feed
                        'v' => value.push('\x0B'), // vertical tab
                        'x' => {
2316 let mut hex = String::with_capacity(2);
2318 for _ in 0..2 {
2319 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2320 hex.push(self.advance());
2321 }
2322 }
2323 if hex.len() == 2 {
2324 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2325 value.push(byte as char);
2326 } else {
2327 value.push('\\');
2328 value.push('x');
2329 value.push_str(&hex);
2330 }
2331 } else {
2332 value.push('\\');
2334 value.push('x');
2335 value.push_str(&hex);
2336 }
2337 }
2338 '\\' => value.push('\\'),
2339 '\'' => value.push('\''),
2340 '"' => value.push('"'),
2341 '%' => {
2342 value.push('%');
2344 }
2345 '_' => {
2346 value.push('_');
2348 }
2349 _ => {
2353 if !self.config.escape_follow_chars.is_empty() {
2354 value.push(escaped);
2356 } else {
2357 value.push('\\');
2359 value.push(escaped);
2360 }
2361 }
2362 }
2363 }
2364 } else {
2365 value.push(self.advance());
2366 }
2367 }
2368
2369 if self.is_at_end() {
2370 return Err(Error::tokenize(
2371 "Unterminated double-quoted string",
2372 self.line,
2373 self.column,
2374 ));
2375 }
2376
        self.advance(); // consume closing '"'
        self.add_token_with_text(TokenType::String, value);
2379 Ok(())
2380 }
2381
2382 fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2383 self.advance();
2385 self.advance();
2386 self.advance();
2387 let mut value = String::new();
2388
2389 while !self.is_at_end() {
2390 if self.peek() == quote_char
2392 && self.current + 1 < self.size
2393 && self.chars[self.current + 1] == quote_char
2394 && self.current + 2 < self.size
2395 && self.chars[self.current + 2] == quote_char
2396 {
2397 break;
2399 }
2400 value.push(self.advance());
2401 }
2402
2403 if self.is_at_end() {
2404 return Err(Error::tokenize(
2405 "Unterminated triple-quoted string",
2406 self.line,
2407 self.column,
2408 ));
2409 }
2410
2411 self.advance();
2413 self.advance();
2414 self.advance();
2415 let token_type = if quote_char == '"' {
2416 TokenType::TripleDoubleQuotedString
2417 } else {
2418 TokenType::TripleSingleQuotedString
2419 };
2420 self.add_token_with_text(token_type, value);
2421 Ok(())
2422 }
2423
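    /// Scans a quoted identifier delimited by `end_quote`, treating a doubled
    /// delimiter as an escaped quote character.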
2424 fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let mut value = String::new();
2427
2428 loop {
2429 if self.is_at_end() {
2430 return Err(Error::tokenize(
2431 "Unterminated identifier",
2432 self.line,
2433 self.column,
2434 ));
2435 }
2436 if self.peek() == end_quote {
2437 if self.peek_next() == end_quote {
2438 value.push(end_quote);
                    self.advance(); // consume both quote characters
                    self.advance();
                } else {
2443 break;
2445 }
2446 } else {
2447 value.push(self.peek());
2448 self.advance();
2449 }
2450 }
2451
        self.advance(); // consume the closing quote
        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2454 Ok(())
2455 }
2456
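    /// Scans a string wrapped in typographic single quotes (U+2018 / U+2019),
    /// emitting it as an ordinary string token.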
2457 fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
        self.advance(); // consume the opening curly quote
        let start = self.current;
        // Both supported opening variants close with U+2019 (right single quotation mark).
        let close_quote = if open_quote == '\u{2018}' {
            '\u{2019}'
        } else {
            '\u{2019}'
        };
2470 while !self.is_at_end() && self.peek() != close_quote {
2471 self.advance();
2472 }
2473 let value = self.text_from_range(start, self.current);
2474 if !self.is_at_end() {
            self.advance(); // consume the closing quote
        }
2477 self.add_token_with_text(TokenType::String, value);
2478 Ok(())
2479 }
2480
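    /// Scans an identifier wrapped in typographic double quotes (U+201C / U+201D),
    /// emitting it as a quoted identifier token.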
2481 fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
        self.advance(); // consume the opening curly quote
        let start = self.current;
        // Both supported opening variants close with U+201D (right double quotation mark).
        let close_quote = if open_quote == '\u{201C}' {
            '\u{201D}'
        } else {
            '\u{201D}'
        };
2491 while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2492 self.advance();
2493 }
2494 let value = self.text_from_range(start, self.current);
2495 if !self.is_at_end() {
2496 self.advance(); }
2498 self.add_token_with_text(TokenType::QuotedIdentifier, value);
2499 Ok(())
2500 }
2501
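    /// Scans a numeric literal. Handles hex literals (`0x...`) and hex floats
    /// (`0x1.8p3`) when enabled, digit-group underscores, decimal points,
    /// exponents, dialect-specific numeric suffixes (mapped through
    /// `config.numeric_literals`), and identifiers that start with a digit
    /// when the dialect allows them.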
2502 fn scan_number(&mut self) -> Result<()> {
        if self.config.hex_number_strings && !self.is_at_end() && self.peek() == '0' {
2505 let next = if self.current + 1 < self.size {
2506 self.chars[self.current + 1]
2507 } else {
2508 '\0'
2509 };
2510 if next == 'x' || next == 'X' {
2511 self.advance();
2513 self.advance();
2514 let hex_start = self.current;
2516 while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
2517 if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
2518 break;
2519 }
2520 self.advance();
2521 }
2522 if self.current > hex_start {
2523 let mut is_hex_float = false;
2525 if !self.is_at_end() && self.peek() == '.' {
2527 let after_dot = if self.current + 1 < self.size {
2528 self.chars[self.current + 1]
2529 } else {
2530 '\0'
2531 };
2532 if after_dot.is_ascii_hexdigit() {
2533 is_hex_float = true;
                            self.advance(); // consume the '.'
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2536 self.advance();
2537 }
2538 }
2539 }
2540 if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
2542 is_hex_float = true;
                        self.advance(); // consume the 'p' / 'P'
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
2545 self.advance();
2546 }
2547 while !self.is_at_end() && self.peek().is_ascii_digit() {
2548 self.advance();
2549 }
2550 }
2551 if is_hex_float {
2552 let full_text = self.text_from_range(self.start, self.current);
2554 self.add_token_with_text(TokenType::Number, full_text);
2555 } else if self.config.hex_string_is_integer_type {
2556 let hex_value = self.text_from_range(hex_start, self.current);
2558 self.add_token_with_text(TokenType::HexNumber, hex_value);
2559 } else {
2560 let hex_value = self.text_from_range(hex_start, self.current);
2562 self.add_token_with_text(TokenType::HexString, hex_value);
2563 }
2564 return Ok(());
2565 }
                // No hex digits followed "0x"; backtrack to just past the
                // leading '0' and scan it as an ordinary number.
                self.current = self.start + 1;
2569 }
2570 }
2571
2572 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2574 if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
2576 break;
2577 }
2578 self.advance();
2579 }
2580
2581 if self.peek() == '.' {
2585 let next = self.peek_next();
2586 if next != '.' {
                self.advance(); // consume the '.'
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2595 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2596 break;
2597 }
2598 self.advance();
2599 }
2600 }
2601 }
2602
2603 if self.peek() == 'e' || self.peek() == 'E' {
2605 self.advance();
2606 if self.peek() == '+' || self.peek() == '-' {
2607 self.advance();
2608 }
2609 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2610 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2611 break;
2612 }
2613 self.advance();
2614 }
2615 }
2616
2617 let text = self.text_from_range(self.start, self.current);
2618
2619 if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2621 let next_char = self.peek().to_uppercase().to_string();
2622 let suffix_match = if self.current + 1 < self.size {
                let two_char: String = self.chars[self.current..self.current + 2]
                    .iter()
                    .collect::<String>()
                    .to_uppercase();
2628 if self.config.numeric_literals.contains_key(&two_char) {
2629 let after_suffix = if self.current + 2 < self.size {
2631 self.chars[self.current + 2]
2632 } else {
2633 ' '
2634 };
2635 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2636 Some((two_char, 2))
2637 } else {
2638 None
2639 }
2640 } else if self.config.numeric_literals.contains_key(&next_char) {
2641 let after_suffix = if self.current + 1 < self.size {
2643 self.chars[self.current + 1]
2644 } else {
2645 ' '
2646 };
2647 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2648 Some((next_char, 1))
2649 } else {
2650 None
2651 }
2652 } else {
2653 None
2654 }
2655 } else if self.config.numeric_literals.contains_key(&next_char) {
2656 Some((next_char, 1))
2658 } else {
2659 None
2660 };
2661
2662 if let Some((suffix, len)) = suffix_match {
2663 for _ in 0..len {
2665 self.advance();
2666 }
2667 let type_name = self
2670 .config
2671 .numeric_literals
2672 .get(&suffix)
2673 .expect("suffix verified by contains_key above")
2674 .clone();
2675 let combined = format!("{}::{}", text, type_name);
2676 self.add_token_with_text(TokenType::Number, combined);
2677 return Ok(());
2678 }
2679 }
2680
2681 if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2684 let next = self.peek();
2685 if next.is_alphabetic() || next == '_' {
2686 while !self.is_at_end() {
2688 let ch = self.peek();
2689 if ch.is_alphanumeric() || ch == '_' {
2690 self.advance();
2691 } else {
2692 break;
2693 }
2694 }
2695 let ident_text = self.text_from_range(self.start, self.current);
2696 self.add_token_with_text(TokenType::Identifier, ident_text);
2697 return Ok(());
2698 }
2699 }
2700
2701 self.add_token_with_text(TokenType::Number, text);
2702 Ok(())
2703 }
2704
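    /// Scans a numeric literal that begins with a decimal point (e.g. `.25`, `.5e10`).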
2705 fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2707 self.advance();
2709
2710 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2712 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2713 break;
2714 }
2715 self.advance();
2716 }
2717
2718 if self.peek() == 'e' || self.peek() == 'E' {
2720 self.advance();
2721 if self.peek() == '+' || self.peek() == '-' {
2722 self.advance();
2723 }
2724 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2725 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2726 break;
2727 }
2728 self.advance();
2729 }
2730 }
2731
2732 let text = self.text_from_range(self.start, self.current);
2733 self.add_token_with_text(TokenType::Number, text);
2734 Ok(())
2735 }
2736
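    /// Scans an identifier or keyword. Also recognizes prefixed string literals
    /// (N'...', E'...', X'...', B'...', R'...', U&'...') and treats `NOT=` as
    /// the inequality operator before falling back to a keyword lookup.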
2737 fn scan_identifier_or_keyword(&mut self) -> Result<()> {
2738 let first_char = self.peek();
2740 if !first_char.is_alphanumeric() && first_char != '_' {
2741 let c = self.advance();
2743 return Err(Error::tokenize(
2744 format!("Unexpected character: '{}'", c),
2745 self.line,
2746 self.column,
2747 ));
2748 }
2749
2750 while !self.is_at_end() {
2751 let c = self.peek();
2752 if c == '#' {
2756 let next_c = if self.current + 1 < self.size {
2757 self.chars[self.current + 1]
2758 } else {
2759 '\0'
2760 };
2761 if next_c == '>' || next_c == '-' {
                    break; // stop so "#>" / "#-" are tokenized as operators
                }
2764 self.advance();
2765 } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
2766 self.advance();
2767 } else {
2768 break;
2769 }
2770 }
2771
2772 let text = self.text_from_range(self.start, self.current);
2773 let upper = text.to_uppercase();
2774
2775 if upper == "NOT" && self.peek() == '=' {
            self.advance(); // consume the '='
            self.add_token(TokenType::Neq);
2779 return Ok(());
2780 }
2781
2782 let next_char = self.peek();
2785 let is_single_quote = next_char == '\'';
2786 let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
2787 let is_double_quote_for_raw = next_char == '"';
2790
2791 if upper == "R" && (is_single_quote || is_double_quote_for_raw) {
2794 let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the opening quote
            if self.peek() == quote_char && self.peek_next() == quote_char {
                // Triple-quoted raw string: consume the remaining two opening quotes.
                self.advance();
                self.advance();
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
2805 self.add_token_with_text(TokenType::RawString, string_value);
2806 } else {
2807 let string_value = self.scan_raw_string_content(quote_char)?;
2808 self.add_token_with_text(TokenType::RawString, string_value);
2809 }
2810 return Ok(());
2811 }
2812
2813 if is_single_quote || is_double_quote {
2814 match upper.as_str() {
2815 "N" => {
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
2819 self.scan_string_content()?
2820 } else {
2821 self.scan_double_quoted_string_content()?
2822 };
2823 self.add_token_with_text(TokenType::NationalString, string_value);
2824 return Ok(());
2825 }
2826 "E" => {
2827 let lowercase = text == "e";
2831 let prefix = if lowercase { "e:" } else { "E:" };
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content_with_escapes(true)?;
2834 self.add_token_with_text(
2835 TokenType::EscapeString,
2836 format!("{}{}", prefix, string_value),
2837 );
2838 return Ok(());
2839 }
2840 "X" => {
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
2844 self.scan_string_content()?
2845 } else {
2846 self.scan_double_quoted_string_content()?
2847 };
2848 self.add_token_with_text(TokenType::HexString, string_value);
2849 return Ok(());
2850 }
2851 "B" if is_double_quote => {
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_double_quoted_string_content()?;
2855 self.add_token_with_text(TokenType::ByteString, string_value);
2856 return Ok(());
2857 }
2858 "B" if is_single_quote => {
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content()?;
2863 if self.config.b_prefix_is_byte_string {
2864 self.add_token_with_text(TokenType::ByteString, string_value);
2865 } else {
2866 self.add_token_with_text(TokenType::BitString, string_value);
2867 }
2868 return Ok(());
2869 }
2870 _ => {}
2871 }
2872 }
2873
2874 if upper == "U"
2876 && self.peek() == '&'
2877 && self.current + 1 < self.size
2878 && self.chars[self.current + 1] == '\''
2879 {
            self.advance(); // consume the '&'
            self.advance(); // consume the opening quote
            let string_value = self.scan_string_content()?;
2883 self.add_token_with_text(TokenType::UnicodeString, string_value);
2884 return Ok(());
2885 }
2886
2887 let token_type = self
2888 .config
2889 .keywords
2890 .get(&upper)
2891 .copied()
2892 .unwrap_or(TokenType::Var);
2893
2894 self.add_token_with_text(token_type, text);
2895 Ok(())
2896 }
2897
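    /// Consumes the body of a single-quoted string up to and including the
    /// closing quote, returning its contents. Doubled quotes ('') collapse to a
    /// single quote; backslash escape pairs are preserved verbatim when enabled.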
2898 fn scan_string_content_with_escapes(
2902 &mut self,
2903 force_backslash_escapes: bool,
2904 ) -> Result<String> {
2905 let mut value = String::new();
2906 let use_backslash_escapes =
2907 force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2908
2909 while !self.is_at_end() {
2910 let c = self.peek();
2911 if c == '\'' {
2912 if self.peek_next() == '\'' {
2913 value.push('\'');
2915 self.advance();
2916 self.advance();
2917 } else {
2918 break;
2919 }
2920 } else if c == '\\' && use_backslash_escapes {
2921 value.push(self.advance());
2923 if !self.is_at_end() {
2924 value.push(self.advance());
2925 }
2926 } else {
2927 value.push(self.advance());
2928 }
2929 }
2930
2931 if self.is_at_end() {
2932 return Err(Error::tokenize(
2933 "Unterminated string",
2934 self.line,
2935 self.column,
2936 ));
2937 }
2938
        self.advance(); // consume the closing quote
        Ok(value)
2941 }
2942
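    /// Consumes the body of a single-quoted string using the dialect's default
    /// escape settings.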
2943 fn scan_string_content(&mut self) -> Result<String> {
2945 self.scan_string_content_with_escapes(false)
2946 }
2947
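    /// Consumes the body of a double-quoted string up to and including the
    /// closing quote, decoding common backslash escapes when enabled.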
2948 fn scan_double_quoted_string_content(&mut self) -> Result<String> {
2951 let mut value = String::new();
2952 let use_backslash_escapes = self.config.string_escapes.contains(&'\\');
2953
2954 while !self.is_at_end() {
2955 let c = self.peek();
2956 if c == '"' {
2957 if self.peek_next() == '"' {
2958 value.push('"');
2960 self.advance();
2961 self.advance();
2962 } else {
2963 break;
2964 }
2965 } else if c == '\\' && use_backslash_escapes {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
2969 let escaped = self.advance();
2970 match escaped {
2971 'n' => value.push('\n'),
2972 'r' => value.push('\r'),
2973 't' => value.push('\t'),
2974 '0' => value.push('\0'),
2975 '\\' => value.push('\\'),
2976 '"' => value.push('"'),
2977 '\'' => value.push('\''),
2978 'x' => {
2979 let mut hex = String::new();
2981 for _ in 0..2 {
2982 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2983 hex.push(self.advance());
2984 }
2985 }
2986 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2987 value.push(byte as char);
2988 } else {
2989 value.push('\\');
2991 value.push('x');
2992 value.push_str(&hex);
2993 }
2994 }
2995 _ => {
2996 value.push('\\');
2998 value.push(escaped);
2999 }
3000 }
3001 }
3002 } else {
3003 value.push(self.advance());
3004 }
3005 }
3006
3007 if self.is_at_end() {
3008 return Err(Error::tokenize(
3009 "Unterminated double-quoted string",
3010 self.line,
3011 self.column,
3012 ));
3013 }
3014
        self.advance(); // consume the closing double quote
        Ok(value)
3017 }
3018
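    /// Consumes the body of a raw string. Escape sequences are left untouched,
    /// except that a backslash-escaped quote collapses to the quote character
    /// when the dialect allows escapes inside raw strings.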
3019 fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3024 let mut value = String::new();
3025
3026 while !self.is_at_end() {
3027 let c = self.peek();
3028 if c == quote_char {
3029 if self.peek_next() == quote_char {
3030 value.push(quote_char);
3032 self.advance();
3033 self.advance();
3034 } else {
3035 break;
3036 }
3037 } else if c == '\\'
3038 && self.peek_next() == quote_char
3039 && self.config.string_escapes_allowed_in_raw_strings
3040 {
3041 value.push(quote_char);
                self.advance(); // consume the backslash
                self.advance(); // consume the escaped quote
            } else {
3048 value.push(self.advance());
3050 }
3051 }
3052
3053 if self.is_at_end() {
3054 return Err(Error::tokenize(
3055 "Unterminated raw string",
3056 self.line,
3057 self.column,
3058 ));
3059 }
3060
        self.advance(); // consume the closing quote
        Ok(value)
3063 }
3064
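    /// Consumes the body of a raw triple-quoted string up to and including the
    /// closing run of three quote characters.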
3065 fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3068 let mut value = String::new();
3069
3070 while !self.is_at_end() {
3071 let c = self.peek();
3072 if c == quote_char && self.peek_next() == quote_char {
3073 if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
                    // Consume the three closing quotes.
                    self.advance();
                    self.advance();
                    self.advance();
                    return Ok(value);
3080 }
3081 }
3082 let ch = self.advance();
3084 value.push(ch);
3085 }
3086
3087 Err(Error::tokenize(
3088 "Unterminated raw triple-quoted string",
3089 self.line,
3090 self.column,
3091 ))
3092 }
3093
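    /// Scans a `$`-prefixed name such as `$var` or `$1`, emitting it as a
    /// variable token.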
3094 fn scan_dollar_identifier(&mut self) -> Result<()> {
3099 self.advance();
3101
3102 while !self.is_at_end() {
3104 let c = self.peek();
3105 if c.is_alphanumeric() || c == '_' || c == '$' {
3106 self.advance();
3107 } else {
3108 break;
3109 }
3110 }
3111
3112 let text = self.text_from_range(self.start, self.current);
3113 self.add_token_with_text(TokenType::Var, text);
3114 Ok(())
3115 }
3116
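    /// Scans a T-SQL style variable or temp-object name (e.g. `@name`, `#temp`,
    /// `##global_temp`), emitting it as a variable token.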
3117 fn scan_tsql_identifier(&mut self) -> Result<()> {
3118 let first = self.advance();
3120
3121 if first == '#' && self.peek() == '#' {
3123 self.advance();
3124 }
3125
3126 while !self.is_at_end() {
3128 let c = self.peek();
3129 if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3130 self.advance();
3131 } else {
3132 break;
3133 }
3134 }
3135
3136 let text = self.text_from_range(self.start, self.current);
3137 self.add_token_with_text(TokenType::Var, text);
3139 Ok(())
3140 }
3141
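    /// Detects the inline data section of a ClickHouse-style
    /// `INSERT INTO t FORMAT <format>` statement (e.g. `INSERT INTO t FORMAT
    /// JSONEachRow` followed by raw rows). When the preceding tokens look like
    /// `INSERT ... FORMAT <name>`, the rest of the input is consumed as raw
    /// data, terminated by a blank line or end of input, and returned trimmed.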
3142 fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
3146 let len = self.tokens.len();
3147 if len < 3 {
3148 return None;
3149 }
3150
3151 let last = &self.tokens[len - 1];
3153 if last.text.eq_ignore_ascii_case("VALUES") {
3154 return None;
3155 }
3156 if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
3157 return None;
3158 }
3159
3160 let format_tok = &self.tokens[len - 2];
3162 if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
3163 return None;
3164 }
3165
3166 let has_insert = self.tokens[..len - 2]
3168 .iter()
3169 .rev()
3170 .take(20)
3171 .any(|t| t.token_type == TokenType::Insert);
3172 if !has_insert {
3173 return None;
3174 }
3175
3176 let raw_start = self.current;
3180 while !self.is_at_end() {
3181 let c = self.peek();
3182 if c == '\n' {
3183 let saved = self.current;
                self.advance(); // consume the newline
                while !self.is_at_end() && self.peek() == '\r' {
3188 self.advance();
3189 }
3190 if self.is_at_end() || self.peek() == '\n' {
3191 let raw = self.text_from_range(raw_start, saved);
3194 return Some(raw.trim().to_string());
3195 }
3196 } else {
3198 self.advance();
3199 }
3200 }
3201
3202 let raw = self.text_from_range(raw_start, self.current);
3204 let trimmed = raw.trim().to_string();
3205 if trimmed.is_empty() {
3206 None
3207 } else {
3208 Some(trimmed)
3209 }
3210 }
3211
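    /// Emits a token whose text is the raw source slice from `self.start` to
    /// `self.current`.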
3212 fn add_token(&mut self, token_type: TokenType) {
3213 let text = self.text_from_range(self.start, self.current);
3214 self.add_token_with_text(token_type, text);
3215 }
3216
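    /// Emits a token with explicit text, attaching any pending leading comments
    /// and the span covering the current scan range.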
3217 fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3218 let span = Span::new(self.start, self.current, self.line, self.column);
3219 let mut token = Token::new(token_type, text, span);
3220 token.comments.append(&mut self.comments);
3221 self.tokens.push(token);
3222 }
3223}
3224
3225#[cfg(test)]
3226mod tests {
3227 use super::*;
3228
3229 #[test]
3230 fn test_simple_select() {
3231 let tokenizer = Tokenizer::default();
3232 let tokens = tokenizer.tokenize("SELECT 1").unwrap();
3233
3234 assert_eq!(tokens.len(), 2);
3235 assert_eq!(tokens[0].token_type, TokenType::Select);
3236 assert_eq!(tokens[1].token_type, TokenType::Number);
3237 assert_eq!(tokens[1].text, "1");
3238 }
3239
3240 #[test]
3241 fn test_select_with_identifier() {
3242 let tokenizer = Tokenizer::default();
3243 let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();
3244
3245 assert_eq!(tokens.len(), 6);
3246 assert_eq!(tokens[0].token_type, TokenType::Select);
3247 assert_eq!(tokens[1].token_type, TokenType::Var);
3248 assert_eq!(tokens[1].text, "a");
3249 assert_eq!(tokens[2].token_type, TokenType::Comma);
3250 assert_eq!(tokens[3].token_type, TokenType::Var);
3251 assert_eq!(tokens[3].text, "b");
3252 assert_eq!(tokens[4].token_type, TokenType::From);
3253 assert_eq!(tokens[5].token_type, TokenType::Var);
3254 assert_eq!(tokens[5].text, "t");
3255 }
3256
3257 #[test]
3258 fn test_string_literal() {
3259 let tokenizer = Tokenizer::default();
3260 let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();
3261
3262 assert_eq!(tokens.len(), 2);
3263 assert_eq!(tokens[1].token_type, TokenType::String);
3264 assert_eq!(tokens[1].text, "hello");
3265 }
3266
3267 #[test]
3268 fn test_escaped_string() {
3269 let tokenizer = Tokenizer::default();
3270 let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();
3271
3272 assert_eq!(tokens.len(), 2);
3273 assert_eq!(tokens[1].token_type, TokenType::String);
3274 assert_eq!(tokens[1].text, "it's");
3275 }
3276
3277 #[test]
3278 fn test_comments() {
3279 let tokenizer = Tokenizer::default();
3280 let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();
3281
3282 assert_eq!(tokens.len(), 2);
3283 assert_eq!(tokens[0].trailing_comments.len(), 1);
3286 assert_eq!(tokens[0].trailing_comments[0], " comment");
3287 }
3288
3289 #[test]
3290 fn test_comment_in_and_chain() {
3291 use crate::generator::Generator;
3292 use crate::parser::Parser;
3293
3294 let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
3296 let ast = Parser::parse_sql(sql).unwrap();
3297 let mut gen = Generator::default();
3298 let output = gen.generate(&ast[0]).unwrap();
3299 assert_eq!(
3300 output,
3301 "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
3302 );
3303 }
3304
3305 #[test]
3306 fn test_operators() {
3307 let tokenizer = Tokenizer::default();
3308 let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();
3309
3310 assert_eq!(tokens.len(), 5);
3311 assert_eq!(tokens[0].token_type, TokenType::Number);
3312 assert_eq!(tokens[1].token_type, TokenType::Plus);
3313 assert_eq!(tokens[2].token_type, TokenType::Number);
3314 assert_eq!(tokens[3].token_type, TokenType::Star);
3315 assert_eq!(tokens[4].token_type, TokenType::Number);
3316 }
3317
3318 #[test]
3319 fn test_comparison_operators() {
3320 let tokenizer = Tokenizer::default();
3321 let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();
3322
3323 assert_eq!(tokens[1].token_type, TokenType::Lte);
3324 assert_eq!(tokens[3].token_type, TokenType::Gte);
3325 assert_eq!(tokens[5].token_type, TokenType::Neq);
3326 }
3327
3328 #[test]
3329 fn test_national_string() {
3330 let tokenizer = Tokenizer::default();
3331 let tokens = tokenizer.tokenize("N'abc'").unwrap();
3332
3333 assert_eq!(
3334 tokens.len(),
3335 1,
3336 "Expected 1 token for N'abc', got {:?}",
3337 tokens
3338 );
3339 assert_eq!(tokens[0].token_type, TokenType::NationalString);
3340 assert_eq!(tokens[0].text, "abc");
3341 }
3342
3343 #[test]
3344 fn test_hex_string() {
3345 let tokenizer = Tokenizer::default();
3346 let tokens = tokenizer.tokenize("X'ABCD'").unwrap();
3347
3348 assert_eq!(
3349 tokens.len(),
3350 1,
3351 "Expected 1 token for X'ABCD', got {:?}",
3352 tokens
3353 );
3354 assert_eq!(tokens[0].token_type, TokenType::HexString);
3355 assert_eq!(tokens[0].text, "ABCD");
3356 }
3357
3358 #[test]
3359 fn test_bit_string() {
3360 let tokenizer = Tokenizer::default();
3361 let tokens = tokenizer.tokenize("B'01010'").unwrap();
3362
3363 assert_eq!(
3364 tokens.len(),
3365 1,
3366 "Expected 1 token for B'01010', got {:?}",
3367 tokens
3368 );
3369 assert_eq!(tokens[0].token_type, TokenType::BitString);
3370 assert_eq!(tokens[0].text, "01010");
3371 }
3372
3373 #[test]
3374 fn test_trailing_dot_number() {
3375 let tokenizer = Tokenizer::default();
3376
3377 let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
3379 assert_eq!(
3380 tokens.len(),
3381 2,
3382 "Expected 2 tokens for 'SELECT 1.', got {:?}",
3383 tokens
3384 );
3385 assert_eq!(tokens[1].token_type, TokenType::Number);
3386 assert_eq!(tokens[1].text, "1.");
3387
3388 let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
3390 assert_eq!(tokens[1].text, "1.5");
3391
3392 let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
3395 assert_eq!(
3396 tokens.len(),
3397 3,
3398 "Expected 3 tokens for 'SELECT 1.a', got {:?}",
3399 tokens
3400 );
3401 assert_eq!(tokens[1].token_type, TokenType::Number);
3402 assert_eq!(tokens[1].text, "1.");
3403 assert_eq!(tokens[2].token_type, TokenType::Var);
3404
3405 let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
3407 assert_eq!(tokens[1].token_type, TokenType::Number);
3408 assert_eq!(tokens[1].text, "1");
3409 assert_eq!(tokens[2].token_type, TokenType::Dot);
3410 assert_eq!(tokens[3].token_type, TokenType::Dot);
3411 assert_eq!(tokens[4].token_type, TokenType::Number);
3412 assert_eq!(tokens[4].text, "2");
3413 }
3414
3415 #[test]
3416 fn test_leading_dot_number() {
3417 let tokenizer = Tokenizer::default();
3418
3419 let tokens = tokenizer.tokenize(".25").unwrap();
3421 assert_eq!(
3422 tokens.len(),
3423 1,
3424 "Expected 1 token for '.25', got {:?}",
3425 tokens
3426 );
3427 assert_eq!(tokens[0].token_type, TokenType::Number);
3428 assert_eq!(tokens[0].text, ".25");
3429
3430 let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
3432 assert_eq!(
3433 tokens.len(),
3434 4,
3435 "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
3436 tokens
3437 );
3438 assert_eq!(tokens[0].token_type, TokenType::Sample);
3439 assert_eq!(tokens[1].token_type, TokenType::LParen);
3440 assert_eq!(tokens[2].token_type, TokenType::Number);
3441 assert_eq!(tokens[2].text, ".25");
3442 assert_eq!(tokens[3].token_type, TokenType::RParen);
3443
3444 let tokens = tokenizer.tokenize(".5e10").unwrap();
3446 assert_eq!(
3447 tokens.len(),
3448 1,
3449 "Expected 1 token for '.5e10', got {:?}",
3450 tokens
3451 );
3452 assert_eq!(tokens[0].token_type, TokenType::Number);
3453 assert_eq!(tokens[0].text, ".5e10");
3454
3455 let tokens = tokenizer.tokenize("a.b").unwrap();
3457 assert_eq!(
3458 tokens.len(),
3459 3,
3460 "Expected 3 tokens for 'a.b', got {:?}",
3461 tokens
3462 );
3463 assert_eq!(tokens[1].token_type, TokenType::Dot);
3464 }
3465
3466 #[test]
3467 fn test_unrecognized_character() {
3468 let tokenizer = Tokenizer::default();
3469
3470 let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
3472 assert!(
3473 result.is_ok(),
3474 "Curly quotes should be tokenized as strings"
3475 );
3476
3477 let result = tokenizer.tokenize("SELECT • FROM t");
3479 assert!(result.is_err());
3480 }
3481
3482 #[test]
3483 fn test_colon_eq_tokenization() {
3484 let tokenizer = Tokenizer::default();
3485
3486 let tokens = tokenizer.tokenize("a := 1").unwrap();
3488 assert_eq!(tokens.len(), 3);
3489 assert_eq!(tokens[0].token_type, TokenType::Var);
3490 assert_eq!(tokens[1].token_type, TokenType::ColonEq);
3491 assert_eq!(tokens[2].token_type, TokenType::Number);
3492
3493 let tokens = tokenizer.tokenize("a:b").unwrap();
3495 assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
3496 assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
3497
3498 let tokens = tokenizer.tokenize("a::INT").unwrap();
3500 assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
3501 }
3502
3503 #[test]
3504 fn test_colon_eq_parsing() {
3505 use crate::generator::Generator;
3506 use crate::parser::Parser;
3507
3508 let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
3510 .expect("Failed to parse MySQL @var := expr");
3511 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3512 assert_eq!(output, "SELECT @var1 := 1, @var2");
3513
3514 let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
3516 .expect("Failed to parse MySQL @var2 := @var1");
3517 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3518 assert_eq!(output, "SELECT @var1, @var2 := @var1");
3519
3520 let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
3522 .expect("Failed to parse MySQL @var := COUNT(*)");
3523 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3524 assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");
3525
3526 let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
3528 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3529 assert_eq!(output, "SET @var1 = 1");
3530
3531 let ast =
3533 Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
3534 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3535 assert_eq!(output, "UNION_VALUE(k1 := 1)");
3536
3537 let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
3539 .expect("Failed to parse UNNEST with :=");
3540 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3541 assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");
3542
3543 let ast =
3545 Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
3546 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3547 assert_eq!(output, "SELECT 1 AS foo");
3548
3549 let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
3551 .expect("Failed to parse DuckDB multiple prefix aliases");
3552 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3553 assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
3554 }
3555
3556 #[test]
3557 fn test_colon_eq_dialect_roundtrip() {
3558 use crate::dialects::{Dialect, DialectType};
3559
3560 fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
3561 let d = Dialect::get(dialect);
3562 let ast = d
3563 .parse(sql)
3564 .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3565 assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3566 let transformed = d
3567 .transform(ast[0].clone())
3568 .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3569 let output = d
3570 .generate(&transformed)
3571 .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3572 let expected = expected.unwrap_or(sql);
3573 assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3574 }
3575
3576 check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
3578 check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
3579 check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
3580 check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));
3581
3582 check(
3584 DialectType::DuckDB,
3585 "SELECT UNNEST(col, recursive := TRUE) FROM t",
3586 None,
3587 );
3588 check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);
3589
3590 {
3593 let d = Dialect::get(DialectType::DuckDB);
3594 let ast = d
3595 .parse("STRUCT_PACK(a := 'b')::json")
3596 .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
3597 assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
3598 }
3599
3600 check(
3602 DialectType::DuckDB,
3603 "SELECT foo: 1",
3604 Some("SELECT 1 AS foo"),
3605 );
3606 check(
3607 DialectType::DuckDB,
3608 "SELECT foo: 1, bar: 2, baz: 3",
3609 Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
3610 );
3611 }
3612
3613 #[test]
3614 fn test_comment_roundtrip() {
3615 use crate::generator::Generator;
3616 use crate::parser::Parser;
3617
3618 fn check_roundtrip(sql: &str) -> Option<String> {
3619 let ast = match Parser::parse_sql(sql) {
3620 Ok(a) => a,
3621 Err(e) => return Some(format!("Parse error: {:?}", e)),
3622 };
3623 if ast.is_empty() {
3624 return Some("Empty AST".to_string());
3625 }
3626 let mut generator = Generator::default();
3627 let output = match generator.generate(&ast[0]) {
3628 Ok(o) => o,
3629 Err(e) => return Some(format!("Gen error: {:?}", e)),
3630 };
3631 if output == sql {
3632 None
3633 } else {
3634 Some(format!(
3635 "Mismatch:\n input: {}\n output: {}",
3636 sql, output
3637 ))
3638 }
3639 }
3640
3641 let tests = vec![
3642 "SELECT c /* c1 /* c2 */ c3 */",
3644 "SELECT c /* c1 /* c2 /* c3 */ */ */",
3645 "SELECT c /* c1 */ AS alias /* c2 */",
3647 "SELECT a /* x */, b /* x */",
3649 "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
3651 "SELECT * FROM foo /* x */, bla /* x */",
3653 "SELECT 1 /* comment */ + 1",
3655 "SELECT 1 /* c1 */ + 2 /* c2 */",
3656 "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
3657 "SELECT CAST(x AS INT) /* comment */ FROM foo",
3659 "SELECT FOO(x /* c */) /* FOO */, b /* b */",
3661 "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
3663 "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
3665 "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
3667 "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
3668 "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
3669 "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
3670 "/* comment */ CREATE TABLE foo AS SELECT 1",
3671 "INSERT INTO foo SELECT * FROM bar /* comment */",
3673 "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
3675 ];
3676
3677 let mut failures = Vec::new();
3678 for sql in tests {
3679 if let Some(e) = check_roundtrip(sql) {
3680 failures.push(e);
3681 }
3682 }
3683
3684 if !failures.is_empty() {
3685 panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
3686 }
3687 }
3688
3689 #[test]
3690 fn test_dollar_quoted_string_parsing() {
3691 use crate::dialects::{Dialect, DialectType};
3692
3693 let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
3695 assert_eq!(tag, Some("FOO".to_string()));
3696 assert_eq!(content, "content here");
3697
3698 let (tag, content) = super::parse_dollar_string_token("just content");
3699 assert_eq!(tag, None);
3700 assert_eq!(content, "just content");
3701
3702 fn check_databricks(sql: &str, expected: Option<&str>) {
3704 let d = Dialect::get(DialectType::Databricks);
3705 let ast = d
3706 .parse(sql)
3707 .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3708 assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3709 let transformed = d
3710 .transform(ast[0].clone())
3711 .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3712 let output = d
3713 .generate(&transformed)
3714 .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3715 let expected = expected.unwrap_or(sql);
3716 assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3717 }
3718
3719 check_databricks(
3721 "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n return x+1$$",
3722 None
3723 );
3724
3725 check_databricks(
3727 "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n return x+1$FOO$",
3728 None
3729 );
3730 }
3731}