1use crate::error::{Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::fmt;
10use std::sync::LazyLock;
11#[cfg(feature = "bindings")]
12use ts_rs::TS;
13
/// Splits `text` at its first NUL (`\x00`) character into an optional tag and the content.
///
/// Returns `(Some(tag), content)` when a NUL is present, where `tag` is everything before the
/// first NUL and `content` is everything after it (later NULs stay in the content). When there
/// is no NUL, returns `(None, text)` with the whole input as content.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_owned()), content.to_owned()),
        None => (None, text.to_owned()),
    }
}
26
/// Position information for a token, stored as four plain `usize` values.
///
/// `Default` yields an all-zero span; the `Token` convenience constructors use that
/// default for tokens that are built without a source position.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct Span {
    /// Start position — presumably an offset into the source text
    /// (TODO confirm byte vs. char units against the tokenizer).
    pub start: usize,
    /// End position, in the same units as `start`
    /// (NOTE(review): inclusive vs. exclusive is not visible here — confirm).
    pub end: usize,
    /// Line of the token (NOTE(review): 0- vs. 1-based is not visible here — confirm).
    pub line: usize,
    /// Column of the token (NOTE(review): 0- vs. 1-based is not visible here — confirm).
    pub column: usize,
}
40
41impl Span {
42 pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
43 Self {
44 start,
45 end,
46 line,
47 column,
48 }
49 }
50}
51
/// A single token: its type, its text, its span, and any comments attached to it.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// Which kind of token this is.
    pub token_type: TokenType,
    /// Text carried by the token.
    pub text: String,
    /// Position information for the token.
    pub span: Span,
    /// Comments attached to this token. A missing field deserializes to an empty list.
    #[serde(default)]
    pub comments: Vec<String>,
    /// Trailing comments attached to this token. A missing field deserializes to an empty list.
    /// NOTE(review): how these differ from `comments` is decided by code outside this view.
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
68
69impl Token {
70 pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
72 Self {
73 token_type,
74 text: text.into(),
75 span,
76 comments: Vec::new(),
77 trailing_comments: Vec::new(),
78 }
79 }
80
81 pub fn number(n: i64) -> Self {
83 Self::new(TokenType::Number, n.to_string(), Span::default())
84 }
85
86 pub fn string(s: impl Into<String>) -> Self {
88 Self::new(TokenType::String, s, Span::default())
89 }
90
91 pub fn identifier(s: impl Into<String>) -> Self {
93 Self::new(TokenType::Identifier, s, Span::default())
94 }
95
96 pub fn var(s: impl Into<String>) -> Self {
98 Self::new(TokenType::Var, s, Span::default())
99 }
100
101 pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
103 self.comments.push(comment.into());
104 self
105 }
106}
107
108impl fmt::Display for Token {
109 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110 write!(f, "{:?}({})", self.token_type, self.text)
111 }
112}
113
/// Every kind of token: punctuation, operators, literals, type names and keywords.
///
/// Variants are serialized with `SCREAMING_SNAKE_CASE` names. The enum is `#[repr(u16)]`
/// with implicit discriminants, so a variant's numeric value is its position in this
/// list — inserting or reordering variants changes those values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation and operators. The single-character ones are the targets of the
    // `DEFAULT_SINGLE_TOKENS` table (e.g. '(' -> LParen, '@' -> DAt, '?' -> Parameter).
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt, GtGt, Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    // Structural / whitespace markers (grouping inferred from the variant names).
    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments, literals, identifiers and related tokens. `Number`, `String`,
    // `Identifier` and `Var` are the types produced by the `Token` convenience constructors.
    BlockComment, LineComment, String,
    DollarString, TripleDoubleQuotedString, TripleSingleQuotedString, Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    HexNumber,
    ByteString,
    NationalString,
    EscapeString, RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data-type names (grouping inferred from the variant names).
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords. Many of these are the targets of `DEFAULT_KEYWORDS` entries and/or are
    // reported by `TokenType::is_keyword`.
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Prepare,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike, NotILike, NotRLike, NotIRLike, Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Targets of the UNBOUNDED / PRECEDING / FOLLOWING / CURRENT / GROUPS keyword entries.
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // Further keywords (trigger, sequence, identity and pattern-matching related, judging
    // by the names); all of these are targets of `DEFAULT_KEYWORDS` entries.
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // Last variant — presumably the end-of-input marker (TODO confirm against the tokenizer).
    Eof,
}
671
672impl TokenType {
673 pub fn is_keyword(&self) -> bool {
675 matches!(
676 self,
677 TokenType::Select
678 | TokenType::From
679 | TokenType::Where
680 | TokenType::And
681 | TokenType::Or
682 | TokenType::Not
683 | TokenType::In
684 | TokenType::Is
685 | TokenType::Null
686 | TokenType::True
687 | TokenType::False
688 | TokenType::As
689 | TokenType::On
690 | TokenType::Join
691 | TokenType::Left
692 | TokenType::Right
693 | TokenType::Inner
694 | TokenType::Outer
695 | TokenType::Full
696 | TokenType::Cross
697 | TokenType::Semi
698 | TokenType::Anti
699 | TokenType::Union
700 | TokenType::Except
701 | TokenType::Intersect
702 | TokenType::GroupBy
703 | TokenType::OrderBy
704 | TokenType::Having
705 | TokenType::Limit
706 | TokenType::Offset
707 | TokenType::Case
708 | TokenType::When
709 | TokenType::Then
710 | TokenType::Else
711 | TokenType::End
712 | TokenType::Create
713 | TokenType::Drop
714 | TokenType::Alter
715 | TokenType::Insert
716 | TokenType::Update
717 | TokenType::Delete
718 | TokenType::Into
719 | TokenType::Values
720 | TokenType::Set
721 | TokenType::With
722 | TokenType::Distinct
723 | TokenType::All
724 | TokenType::Exists
725 | TokenType::Between
726 | TokenType::Like
727 | TokenType::ILike
728 | TokenType::Filter
730 | TokenType::Date
731 | TokenType::Timestamp
732 | TokenType::TimestampTz
733 | TokenType::Interval
734 | TokenType::Time
735 | TokenType::Table
736 | TokenType::Index
737 | TokenType::Column
738 | TokenType::Database
739 | TokenType::Schema
740 | TokenType::View
741 | TokenType::Function
742 | TokenType::Procedure
743 | TokenType::Trigger
744 | TokenType::Sequence
745 | TokenType::Over
746 | TokenType::Partition
747 | TokenType::Window
748 | TokenType::Rows
749 | TokenType::Range
750 | TokenType::First
751 | TokenType::Last
752 | TokenType::Preceding
753 | TokenType::Following
754 | TokenType::Current
755 | TokenType::Row
756 | TokenType::Unbounded
757 | TokenType::Array
758 | TokenType::Struct
759 | TokenType::Map
760 | TokenType::PrimaryKey
761 | TokenType::Key
762 | TokenType::ForeignKey
763 | TokenType::References
764 | TokenType::Unique
765 | TokenType::Check
766 | TokenType::Default
767 | TokenType::Constraint
768 | TokenType::Comment
769 | TokenType::Rollup
770 | TokenType::Cube
771 | TokenType::Grant
772 | TokenType::Revoke
773 | TokenType::Type
774 | TokenType::Use
775 | TokenType::Cache
776 | TokenType::Uncache
777 | TokenType::Load
778 | TokenType::Any
779 | TokenType::Some
780 | TokenType::Asc
781 | TokenType::Desc
782 | TokenType::Nulls
783 | TokenType::Lateral
784 | TokenType::Natural
785 | TokenType::Escape
786 | TokenType::Glob
787 | TokenType::Match
788 | TokenType::Recursive
789 | TokenType::Replace
790 | TokenType::Returns
791 | TokenType::If
792 | TokenType::Pivot
793 | TokenType::Unpivot
794 | TokenType::Json
795 | TokenType::Blob
796 | TokenType::Text
797 | TokenType::Int
798 | TokenType::BigInt
799 | TokenType::SmallInt
800 | TokenType::TinyInt
801 | TokenType::Int128
802 | TokenType::UInt128
803 | TokenType::Int256
804 | TokenType::UInt256
805 | TokenType::UInt
806 | TokenType::UBigInt
807 | TokenType::Float
808 | TokenType::Double
809 | TokenType::Decimal
810 | TokenType::Boolean
811 | TokenType::VarChar
812 | TokenType::Char
813 | TokenType::Binary
814 | TokenType::VarBinary
815 | TokenType::No
816 | TokenType::DateTime
817 | TokenType::Truncate
818 | TokenType::Execute
819 | TokenType::Merge
820 | TokenType::Top
821 | TokenType::Begin
822 | TokenType::Generated
823 | TokenType::Identity
824 | TokenType::Always
825 | TokenType::Extract
826 | TokenType::AsOf
828 | TokenType::Prior
829 | TokenType::After
830 | TokenType::Restrict
831 | TokenType::Cascade
832 | TokenType::Local
833 | TokenType::Rename
834 | TokenType::Enum
835 | TokenType::Within
836 | TokenType::Format
837 | TokenType::Final
838 | TokenType::FileFormat
839 | TokenType::Input
840 | TokenType::InputFormat
841 | TokenType::Copy
842 | TokenType::Put
843 | TokenType::Get
844 | TokenType::Show
845 | TokenType::Serde
846 | TokenType::Sample
847 | TokenType::Sort
848 | TokenType::Collate
849 | TokenType::Ties
850 | TokenType::IsNull
851 | TokenType::NotNull
852 | TokenType::Exclude
853 | TokenType::Temporary
854 | TokenType::Add
855 | TokenType::Ordinality
856 | TokenType::Overlaps
857 | TokenType::Block
858 | TokenType::Pattern
859 | TokenType::Group
860 | TokenType::Cluster
861 | TokenType::Repeatable
862 | TokenType::Groups
863 | TokenType::Commit
864 | TokenType::Warehouse
865 | TokenType::System
866 | TokenType::By
867 | TokenType::To
868 | TokenType::Fetch
869 | TokenType::For
870 | TokenType::Only
871 | TokenType::Next
872 | TokenType::Lock
873 | TokenType::Refresh
874 | TokenType::Settings
875 | TokenType::Operator
876 | TokenType::Overwrite
877 | TokenType::StraightJoin
878 | TokenType::Start
879 | TokenType::Ignore
881 | TokenType::Domain
882 | TokenType::Apply
883 | TokenType::Respect
884 | TokenType::Materialized
885 | TokenType::Prewhere
886 | TokenType::Old
887 | TokenType::New
888 | TokenType::Cast
889 | TokenType::TryCast
890 | TokenType::SafeCast
891 | TokenType::Transaction
892 | TokenType::Describe
893 | TokenType::Kill
894 | TokenType::Lambda
895 | TokenType::Declare
896 | TokenType::Keep
897 | TokenType::Output
898 | TokenType::Percent
899 | TokenType::Qualify
900 | TokenType::Returning
901 | TokenType::Language
902 | TokenType::Prepare
903 | TokenType::Preserve
904 | TokenType::Savepoint
905 | TokenType::Rollback
906 | TokenType::Body
907 | TokenType::Increment
908 | TokenType::Minvalue
909 | TokenType::Maxvalue
910 | TokenType::Cycle
911 | TokenType::NoCycle
912 | TokenType::Seed
913 | TokenType::Namespace
914 | TokenType::Authorization
915 | TokenType::Order
916 | TokenType::Restart
917 | TokenType::Before
918 | TokenType::Instead
919 | TokenType::Each
920 | TokenType::Statement
921 | TokenType::Referencing
922 | TokenType::Of
923 | TokenType::Separator
924 | TokenType::Others
925 | TokenType::Placing
926 | TokenType::Owned
927 | TokenType::Running
928 | TokenType::Define
929 | TokenType::Measures
930 | TokenType::MatchRecognize
931 | TokenType::AutoIncrement
932 | TokenType::Connect
933 | TokenType::Distribute
934 | TokenType::Bernoulli
935 | TokenType::TableSample
936 | TokenType::Inpath
937 | TokenType::Pragma
938 | TokenType::Siblings
939 | TokenType::SerdeProperties
940 | TokenType::RLike
941 )
942 }
943
944 pub fn is_comparison(&self) -> bool {
946 matches!(
947 self,
948 TokenType::Eq
949 | TokenType::Neq
950 | TokenType::Lt
951 | TokenType::Lte
952 | TokenType::Gt
953 | TokenType::Gte
954 | TokenType::NullsafeEq
955 )
956 }
957
958 pub fn is_arithmetic(&self) -> bool {
960 matches!(
961 self,
962 TokenType::Plus
963 | TokenType::Dash
964 | TokenType::Star
965 | TokenType::Slash
966 | TokenType::Percent
967 | TokenType::Mod
968 | TokenType::Div
969 )
970 }
971}
972
973impl fmt::Display for TokenType {
974 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
975 write!(f, "{:?}", self)
976 }
977}
978
979static DEFAULT_KEYWORDS: LazyLock<HashMap<String, TokenType>> = LazyLock::new(|| {
982 let mut keywords = HashMap::with_capacity(300);
983 keywords.insert("SELECT".to_string(), TokenType::Select);
985 keywords.insert("FROM".to_string(), TokenType::From);
986 keywords.insert("WHERE".to_string(), TokenType::Where);
987 keywords.insert("AND".to_string(), TokenType::And);
988 keywords.insert("OR".to_string(), TokenType::Or);
989 keywords.insert("NOT".to_string(), TokenType::Not);
990 keywords.insert("AS".to_string(), TokenType::As);
991 keywords.insert("ON".to_string(), TokenType::On);
992 keywords.insert("JOIN".to_string(), TokenType::Join);
993 keywords.insert("LEFT".to_string(), TokenType::Left);
994 keywords.insert("RIGHT".to_string(), TokenType::Right);
995 keywords.insert("INNER".to_string(), TokenType::Inner);
996 keywords.insert("OUTER".to_string(), TokenType::Outer);
997 keywords.insert("OUTPUT".to_string(), TokenType::Output);
998 keywords.insert("FULL".to_string(), TokenType::Full);
999 keywords.insert("CROSS".to_string(), TokenType::Cross);
1000 keywords.insert("SEMI".to_string(), TokenType::Semi);
1001 keywords.insert("ANTI".to_string(), TokenType::Anti);
1002 keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1003 keywords.insert("UNION".to_string(), TokenType::Union);
1004 keywords.insert("EXCEPT".to_string(), TokenType::Except);
1005 keywords.insert("MINUS".to_string(), TokenType::Except); keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1007 keywords.insert("GROUP".to_string(), TokenType::Group);
1008 keywords.insert("CUBE".to_string(), TokenType::Cube);
1009 keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1010 keywords.insert("WITHIN".to_string(), TokenType::Within);
1011 keywords.insert("ORDER".to_string(), TokenType::Order);
1012 keywords.insert("BY".to_string(), TokenType::By);
1013 keywords.insert("HAVING".to_string(), TokenType::Having);
1014 keywords.insert("LIMIT".to_string(), TokenType::Limit);
1015 keywords.insert("OFFSET".to_string(), TokenType::Offset);
1016 keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1017 keywords.insert("FETCH".to_string(), TokenType::Fetch);
1018 keywords.insert("FIRST".to_string(), TokenType::First);
1019 keywords.insert("NEXT".to_string(), TokenType::Next);
1020 keywords.insert("ONLY".to_string(), TokenType::Only);
1021 keywords.insert("KEEP".to_string(), TokenType::Keep);
1022 keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1023 keywords.insert("INPUT".to_string(), TokenType::Input);
1024 keywords.insert("CASE".to_string(), TokenType::Case);
1025 keywords.insert("WHEN".to_string(), TokenType::When);
1026 keywords.insert("THEN".to_string(), TokenType::Then);
1027 keywords.insert("ELSE".to_string(), TokenType::Else);
1028 keywords.insert("END".to_string(), TokenType::End);
1029 keywords.insert("ENDIF".to_string(), TokenType::End); keywords.insert("NULL".to_string(), TokenType::Null);
1031 keywords.insert("TRUE".to_string(), TokenType::True);
1032 keywords.insert("FALSE".to_string(), TokenType::False);
1033 keywords.insert("IS".to_string(), TokenType::Is);
1034 keywords.insert("IN".to_string(), TokenType::In);
1035 keywords.insert("BETWEEN".to_string(), TokenType::Between);
1036 keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1037 keywords.insert("LIKE".to_string(), TokenType::Like);
1038 keywords.insert("ILIKE".to_string(), TokenType::ILike);
1039 keywords.insert("RLIKE".to_string(), TokenType::RLike);
1040 keywords.insert("REGEXP".to_string(), TokenType::RLike);
1041 keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1042 keywords.insert("EXISTS".to_string(), TokenType::Exists);
1043 keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1044 keywords.insert("ALL".to_string(), TokenType::All);
1045 keywords.insert("WITH".to_string(), TokenType::With);
1046 keywords.insert("CREATE".to_string(), TokenType::Create);
1047 keywords.insert("DROP".to_string(), TokenType::Drop);
1048 keywords.insert("ALTER".to_string(), TokenType::Alter);
1049 keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1050 keywords.insert("TABLE".to_string(), TokenType::Table);
1051 keywords.insert("VIEW".to_string(), TokenType::View);
1052 keywords.insert("INDEX".to_string(), TokenType::Index);
1053 keywords.insert("COLUMN".to_string(), TokenType::Column);
1054 keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1055 keywords.insert("ADD".to_string(), TokenType::Add);
1056 keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1057 keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1058 keywords.insert("RENAME".to_string(), TokenType::Rename);
1059 keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1060 keywords.insert("TEMP".to_string(), TokenType::Temporary);
1061 keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1062 keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1063 keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1064 keywords.insert("KEY".to_string(), TokenType::Key);
1065 keywords.insert("KILL".to_string(), TokenType::Kill);
1066 keywords.insert("REFERENCES".to_string(), TokenType::References);
1067 keywords.insert("DEFAULT".to_string(), TokenType::Default);
1068 keywords.insert("DECLARE".to_string(), TokenType::Declare);
1069 keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1070 keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1072 keywords.insert("REPLACE".to_string(), TokenType::Replace);
1073 keywords.insert("TO".to_string(), TokenType::To);
1074 keywords.insert("INSERT".to_string(), TokenType::Insert);
1075 keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1076 keywords.insert("UPDATE".to_string(), TokenType::Update);
1077 keywords.insert("USE".to_string(), TokenType::Use);
1078 keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1079 keywords.insert("GLOB".to_string(), TokenType::Glob);
1080 keywords.insert("DELETE".to_string(), TokenType::Delete);
1081 keywords.insert("MERGE".to_string(), TokenType::Merge);
1082 keywords.insert("CACHE".to_string(), TokenType::Cache);
1083 keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1084 keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1085 keywords.insert("GRANT".to_string(), TokenType::Grant);
1086 keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1087 keywords.insert("COMMENT".to_string(), TokenType::Comment);
1088 keywords.insert("COLLATE".to_string(), TokenType::Collate);
1089 keywords.insert("INTO".to_string(), TokenType::Into);
1090 keywords.insert("VALUES".to_string(), TokenType::Values);
1091 keywords.insert("SET".to_string(), TokenType::Set);
1092 keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1093 keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1094 keywords.insert("ASC".to_string(), TokenType::Asc);
1095 keywords.insert("DESC".to_string(), TokenType::Desc);
1096 keywords.insert("NULLS".to_string(), TokenType::Nulls);
1097 keywords.insert("RESPECT".to_string(), TokenType::Respect);
1098 keywords.insert("FIRST".to_string(), TokenType::First);
1099 keywords.insert("LAST".to_string(), TokenType::Last);
1100 keywords.insert("IF".to_string(), TokenType::If);
1101 keywords.insert("CAST".to_string(), TokenType::Cast);
1102 keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1103 keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1104 keywords.insert("OVER".to_string(), TokenType::Over);
1105 keywords.insert("PARTITION".to_string(), TokenType::Partition);
1106 keywords.insert("PLACING".to_string(), TokenType::Placing);
1107 keywords.insert("WINDOW".to_string(), TokenType::Window);
1108 keywords.insert("ROWS".to_string(), TokenType::Rows);
1109 keywords.insert("RANGE".to_string(), TokenType::Range);
1110 keywords.insert("FILTER".to_string(), TokenType::Filter);
1111 keywords.insert("NATURAL".to_string(), TokenType::Natural);
1112 keywords.insert("USING".to_string(), TokenType::Using);
1113 keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1114 keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1115 keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1116 keywords.insert("CURRENT".to_string(), TokenType::Current);
1117 keywords.insert("ROW".to_string(), TokenType::Row);
1118 keywords.insert("GROUPS".to_string(), TokenType::Groups);
1119 keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1120 keywords.insert("BOTH".to_string(), TokenType::Both);
1122 keywords.insert("LEADING".to_string(), TokenType::Leading);
1123 keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1124 keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1125 keywords.insert("TOP".to_string(), TokenType::Top);
1127 keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1128 keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1129 keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1130 keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1131 keywords.insert("SYSTEM".to_string(), TokenType::System);
1132 keywords.insert("BLOCK".to_string(), TokenType::Block);
1133 keywords.insert("SEED".to_string(), TokenType::Seed);
1134 keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1135 keywords.insert("TIES".to_string(), TokenType::Ties);
1136 keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1137 keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1138 keywords.insert("APPLY".to_string(), TokenType::Apply);
1139 keywords.insert("CONNECT".to_string(), TokenType::Connect);
1141 keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1143 keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1144 keywords.insert("SORT".to_string(), TokenType::Sort);
1145 keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1146 keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1147 keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1148 keywords.insert("FOR".to_string(), TokenType::For);
1149 keywords.insert("ANY".to_string(), TokenType::Any);
1150 keywords.insert("SOME".to_string(), TokenType::Some);
1151 keywords.insert("ASOF".to_string(), TokenType::AsOf);
1152 keywords.insert("PERCENT".to_string(), TokenType::Percent);
1153 keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1154 keywords.insert("NO".to_string(), TokenType::No);
1155 keywords.insert("OTHERS".to_string(), TokenType::Others);
1156 keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1158 keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1160 keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1161 keywords.insert("DATABASE".to_string(), TokenType::Database);
1162 keywords.insert("FUNCTION".to_string(), TokenType::Function);
1163 keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1164 keywords.insert("PROC".to_string(), TokenType::Procedure);
1165 keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1166 keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1167 keywords.insert("TYPE".to_string(), TokenType::Type);
1168 keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1169 keywords.insert("RETURNS".to_string(), TokenType::Returns);
1170 keywords.insert("RETURNING".to_string(), TokenType::Returning);
1171 keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1172 keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1173 keywords.insert("COMMIT".to_string(), TokenType::Commit);
1174 keywords.insert("BEGIN".to_string(), TokenType::Begin);
1175 keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1176 keywords.insert("PREPARE".to_string(), TokenType::Prepare);
1177 keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1178 keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1179 keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1180 keywords.insert("BODY".to_string(), TokenType::Body);
1181 keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1182 keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1183 keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1184 keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1185 keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1186 keywords.insert("PRIOR".to_string(), TokenType::Prior);
1187 keywords.insert("MATCH".to_string(), TokenType::Match);
1189 keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1190 keywords.insert("MEASURES".to_string(), TokenType::Measures);
1191 keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1192 keywords.insert("DEFINE".to_string(), TokenType::Define);
1193 keywords.insert("RUNNING".to_string(), TokenType::Running);
1194 keywords.insert("FINAL".to_string(), TokenType::Final);
1195 keywords.insert("OWNED".to_string(), TokenType::Owned);
1196 keywords.insert("AFTER".to_string(), TokenType::After);
1197 keywords.insert("BEFORE".to_string(), TokenType::Before);
1198 keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1199 keywords.insert("EACH".to_string(), TokenType::Each);
1200 keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1201 keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1202 keywords.insert("OLD".to_string(), TokenType::Old);
1203 keywords.insert("NEW".to_string(), TokenType::New);
1204 keywords.insert("OF".to_string(), TokenType::Of);
1205 keywords.insert("CHECK".to_string(), TokenType::Check);
1206 keywords.insert("START".to_string(), TokenType::Start);
1207 keywords.insert("ENUM".to_string(), TokenType::Enum);
1208 keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1209 keywords.insert("RESTART".to_string(), TokenType::Restart);
1210 keywords.insert("DATE".to_string(), TokenType::Date);
1212 keywords.insert("TIME".to_string(), TokenType::Time);
1213 keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1214 keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1215 keywords.insert("GENERATED".to_string(), TokenType::Generated);
1216 keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1217 keywords.insert("ALWAYS".to_string(), TokenType::Always);
1218 keywords.insert("LOAD".to_string(), TokenType::Load);
1220 keywords.insert("LOCAL".to_string(), TokenType::Local);
1221 keywords.insert("INPATH".to_string(), TokenType::Inpath);
1222 keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1223 keywords.insert("SERDE".to_string(), TokenType::Serde);
1224 keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1225 keywords.insert("FORMAT".to_string(), TokenType::Format);
1226 keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1228 keywords.insert("SHOW".to_string(), TokenType::Show);
1230 keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1232 keywords.insert("COPY".to_string(), TokenType::Copy);
1234 keywords.insert("PUT".to_string(), TokenType::Put);
1235 keywords.insert("GET".to_string(), TokenType::Get);
1236 keywords.insert("EXEC".to_string(), TokenType::Execute);
1238 keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1239 keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1241 keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1242 keywords
1243});
1244
1245static DEFAULT_SINGLE_TOKENS: LazyLock<HashMap<char, TokenType>> = LazyLock::new(|| {
1246 let mut single_tokens = HashMap::with_capacity(30);
1247 single_tokens.insert('(', TokenType::LParen);
1248 single_tokens.insert(')', TokenType::RParen);
1249 single_tokens.insert('[', TokenType::LBracket);
1250 single_tokens.insert(']', TokenType::RBracket);
1251 single_tokens.insert('{', TokenType::LBrace);
1252 single_tokens.insert('}', TokenType::RBrace);
1253 single_tokens.insert(',', TokenType::Comma);
1254 single_tokens.insert('.', TokenType::Dot);
1255 single_tokens.insert(';', TokenType::Semicolon);
1256 single_tokens.insert('+', TokenType::Plus);
1257 single_tokens.insert('-', TokenType::Dash);
1258 single_tokens.insert('*', TokenType::Star);
1259 single_tokens.insert('/', TokenType::Slash);
1260 single_tokens.insert('%', TokenType::Percent);
1261 single_tokens.insert('&', TokenType::Amp);
1262 single_tokens.insert('|', TokenType::Pipe);
1263 single_tokens.insert('^', TokenType::Caret);
1264 single_tokens.insert('~', TokenType::Tilde);
1265 single_tokens.insert('<', TokenType::Lt);
1266 single_tokens.insert('>', TokenType::Gt);
1267 single_tokens.insert('=', TokenType::Eq);
1268 single_tokens.insert('!', TokenType::Exclamation);
1269 single_tokens.insert(':', TokenType::Colon);
1270 single_tokens.insert('@', TokenType::DAt);
1271 single_tokens.insert('#', TokenType::Hash);
1272 single_tokens.insert('$', TokenType::Dollar);
1273 single_tokens.insert('?', TokenType::Parameter);
1274 single_tokens
1275});
1276
/// Default string-literal delimiters, opening text mapped to closing text:
/// the single quote and the triple double quote.
static DEFAULT_QUOTES: LazyLock<HashMap<String, String>> = LazyLock::new(|| {
    let pairs = [("'", "'"), ("\"\"\"", "\"\"\"")];

    let mut table = HashMap::with_capacity(4);
    table.extend(
        pairs
            .iter()
            .map(|&(open, close)| (open.to_string(), close.to_string())),
    );
    table
});
1284
/// Default quoted-identifier delimiters (opening character mapped to closing
/// character): the double quote and the backtick, each closing itself.
static DEFAULT_IDENTIFIERS: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
    let mut table = HashMap::with_capacity(4);
    for delimiter in ['"', '`'] {
        table.insert(delimiter, delimiter);
    }
    table
});
1293
/// Default comment styles: opener mapped to its terminator, or `None` when
/// the style has no terminator (`--`). `/*` is terminated by `*/`.
static DEFAULT_COMMENTS: LazyLock<HashMap<String, Option<String>>> = LazyLock::new(|| {
    let styles: [(&str, Option<&str>); 2] = [("--", None), ("/*", Some("*/"))];

    let mut table = HashMap::with_capacity(4);
    table.extend(
        styles
            .iter()
            .map(|&(open, close)| (open.to_string(), close.map(String::from))),
    );
    table
});
1300
/// Lookup tables and dialect switches that drive the tokenizer.
///
/// `TokenizerConfig::default()` provides the baseline configuration; a dialect
/// can start from it and override individual tables or flags.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keyword text mapped to the token type it produces.
    pub keywords: HashMap<String, TokenType>,
    /// Characters that form a token on their own. Consulted only after
    /// strings, numbers, multi-character operators and `$`/`#`/`@` forms
    /// have been ruled out.
    pub single_tokens: HashMap<char, TokenType>,
    /// String-literal delimiters, opening text mapped to closing text. The
    /// scanner looks for the keys `'''`, `"""` and `"` to enable
    /// triple-quoted and double-quoted string literals.
    pub quotes: HashMap<String, String>,
    /// Quoted-identifier delimiters: opening character mapped to closing
    /// character.
    pub identifiers: HashMap<char, char>,
    /// Comment openers mapped to their terminator (`None` when the style has
    /// no terminator, as for the default `--`). The presence of a `//` key
    /// enables `//` line comments.
    pub comments: HashMap<String, Option<String>>,
    /// Escape characters recognised inside string literals. Backslash escape
    /// sequences are decoded only when this contains `\`.
    pub string_escapes: Vec<char>,
    /// Whether a `/*` inside a block comment opens a nested level that needs
    /// its own `*/`.
    pub nested_comments: bool,
    /// NOTE(review): the string scanners only test whether this list is
    /// empty. When it is non-empty, an unrecognised escape `\c` yields `c`
    /// alone; when empty, the backslash is kept. The listed characters
    /// themselves are not consulted in the code visible here — confirm.
    pub escape_follow_chars: Vec<char>,
    /// Presumably makes a `b'...'` prefix denote a byte string — TODO confirm
    /// against the identifier scanner, which is outside this section.
    pub b_prefix_is_byte_string: bool,
    /// Upper-case numeric suffix (one or two characters) mapped to a type
    /// name. A matching suffix directly after a number produces a number
    /// token whose text is `<number>::<type name>`.
    pub numeric_literals: HashMap<String, String>,
    /// Presumably allows identifiers to begin with a digit; consulted by the
    /// number scanner — TODO confirm the exact rule.
    pub identifiers_can_start_with_digit: bool,
    /// Enables `0x…` / `0X…` hexadecimal literals in the number scanner.
    pub hex_number_strings: bool,
    /// When set, a plain `0x…` literal becomes a `HexNumber` token instead of
    /// a `HexString` token.
    pub hex_string_is_integer_type: bool,
    /// Presumably controls whether escape sequences are decoded inside raw
    /// strings — TODO confirm; not used in the scanners visible here.
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Treats `#` as the start of a line comment. As currently written this
    /// flag also enables `//` line comments.
    pub hash_comments: bool,
    /// When set, a `$` that does not start a dollar-quoted string is scanned
    /// as a dollar identifier.
    pub dollar_sign_is_identifier: bool,
    /// When set, after every token the tokenizer calls
    /// `try_scan_insert_format_raw_data` and emits any non-empty result as a
    /// `Var` token.
    pub insert_format_raw_data: bool,
    /// When set, `_` separators are removed from the text of numeric tokens.
    pub numbers_can_be_underscore_separated: bool,
    /// When set, a `\'` with no later `'` anywhere in the input is read as a
    /// literal backslash followed by the closing quote.
    pub recover_terminal_backslash_quote: bool,
    /// When set, a single-quoted string that runs to the end of the input is
    /// emitted as a string token instead of raising an error.
    pub recover_unterminated_string: bool,
}
1369
1370impl Default for TokenizerConfig {
1371 fn default() -> Self {
1372 Self {
1373 keywords: DEFAULT_KEYWORDS.clone(),
1374 single_tokens: DEFAULT_SINGLE_TOKENS.clone(),
1375 quotes: DEFAULT_QUOTES.clone(),
1376 identifiers: DEFAULT_IDENTIFIERS.clone(),
1377 comments: DEFAULT_COMMENTS.clone(),
1378 string_escapes: vec!['\''],
1381 nested_comments: true,
1382 escape_follow_chars: vec![],
1384 b_prefix_is_byte_string: false,
1386 numeric_literals: HashMap::new(),
1387 identifiers_can_start_with_digit: false,
1388 hex_number_strings: false,
1389 hex_string_is_integer_type: false,
1390 string_escapes_allowed_in_raw_strings: true,
1393 hash_comments: false,
1394 dollar_sign_is_identifier: false,
1395 insert_format_raw_data: false,
1396 numbers_can_be_underscore_separated: false,
1397 recover_terminal_backslash_quote: false,
1398 recover_unterminated_string: false,
1399 }
1400 }
1401}
1402
/// SQL tokenizer: turns source text into a list of `Token`s according to a
/// `TokenizerConfig`.
pub struct Tokenizer {
    /// Tables and flags applied to every `tokenize` call.
    config: TokenizerConfig,
}
1407
1408impl Tokenizer {
1409 pub fn new(config: TokenizerConfig) -> Self {
1411 Self { config }
1412 }
1413
1414 pub fn default_config() -> Self {
1416 Self::new(TokenizerConfig::default())
1417 }
1418
1419 pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1421 let mut state = TokenizerState::new(sql, &self.config);
1422 state.tokenize()
1423 }
1424}
1425
1426impl Default for Tokenizer {
1427 fn default() -> Self {
1428 Self::default_config()
1429 }
1430}
1431
/// Mutable scanning state for a single tokenization run over one input.
struct TokenizerState<'a> {
    /// The input text being tokenized.
    source: &'a str,
    /// Whether `source` is pure ASCII. If so, char indices equal byte offsets
    /// and token text can be sliced straight out of `source`.
    source_is_ascii: bool,
    /// `source` decoded into chars; `start` and `current` index into this.
    chars: Vec<char>,
    /// Number of entries in `chars`.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Char index at which the token currently being scanned begins.
    start: usize,
    /// Char index of the next unread character.
    current: usize,
    /// Current line, starting at 1; incremented each time a `\n` is consumed.
    line: usize,
    /// Current column, starting at 1; reset to 1 after each `\n`.
    column: usize,
    /// Comments collected but not yet attached to a token.
    comments: Vec<String>,
    /// Configuration in effect for this run.
    config: &'a TokenizerConfig,
}
1446
1447impl<'a> TokenizerState<'a> {
1448 fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1449 let chars: Vec<char> = sql.chars().collect();
1450 let size = chars.len();
1451 Self {
1452 source: sql,
1453 source_is_ascii: sql.is_ascii(),
1454 chars,
1455 size,
1456 tokens: Vec::new(),
1457 start: 0,
1458 current: 0,
1459 line: 1,
1460 column: 1,
1461 comments: Vec::new(),
1462 config,
1463 }
1464 }
1465
1466 fn tokenize(&mut self) -> Result<Vec<Token>> {
1467 while !self.is_at_end() {
1468 self.skip_whitespace();
1469 if self.is_at_end() {
1470 break;
1471 }
1472
1473 self.start = self.current;
1474 self.scan_token()?;
1475
1476 if self.config.insert_format_raw_data {
1479 if let Some(raw) = self.try_scan_insert_format_raw_data() {
1480 if !raw.is_empty() {
1481 self.start = self.current;
1482 self.add_token_with_text(TokenType::Var, raw);
1483 }
1484 }
1485 }
1486 }
1487
1488 if !self.comments.is_empty() {
1493 if let Some(last) = self.tokens.last_mut() {
1494 last.trailing_comments.extend(self.comments.drain(..));
1495 }
1496 }
1497
1498 Ok(std::mem::take(&mut self.tokens))
1499 }
1500
1501 #[inline]
1502 fn is_at_end(&self) -> bool {
1503 self.current >= self.size
1504 }
1505
1506 #[inline]
1507 fn text_from_range(&self, start: usize, end: usize) -> String {
1508 if self.source_is_ascii {
1509 self.source[start..end].to_string()
1510 } else {
1511 self.chars[start..end].iter().collect()
1512 }
1513 }
1514
1515 #[inline]
1516 fn peek(&self) -> char {
1517 if self.is_at_end() {
1518 '\0'
1519 } else {
1520 self.chars[self.current]
1521 }
1522 }
1523
1524 #[inline]
1525 fn peek_next(&self) -> char {
1526 if self.current + 1 >= self.size {
1527 '\0'
1528 } else {
1529 self.chars[self.current + 1]
1530 }
1531 }
1532
1533 #[inline]
1534 fn advance(&mut self) -> char {
1535 let c = self.peek();
1536 self.current += 1;
1537 if c == '\n' {
1538 self.line += 1;
1539 self.column = 1;
1540 } else {
1541 self.column += 1;
1542 }
1543 c
1544 }
1545
    /// Skips whitespace and comments ahead of the next token.
    ///
    /// Comment text is not discarded: the comment scanners store it either on
    /// the previous token or in the pending `comments` list. Scanning stops
    /// at the first character that starts a real token, including an
    /// optimizer hint (`/*+`), which is tokenized by `scan_token` instead.
    fn skip_whitespace(&mut self) {
        // Tracks whether a newline was crossed since the previous token; the
        // comment scanners use it to decide where the comment is attached.
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                // Unicode spacing: no-break space, U+2000..=U+200B, ideographic
                // space, and the byte-order mark / zero-width no-break space.
                '\u{00A0}' | '\u{2000}'..='\u{200B}' | '\u{3000}' | '\u{FEFF}' => {
                    self.advance();
                }
                // `--` line comment (recognised regardless of `config.comments`).
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    // A line comment runs to the end of the line, so whatever
                    // follows is treated as being after a newline.
                    saw_newline = true;
                }
                // `//` line comment enabled through the `hash_comments` flag.
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // `/*+` starts a hint; leave it for `scan_token`.
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        break;
                    }
                    // An unterminated block comment has consumed the rest of
                    // the input; the error is dropped here and scanning stops.
                    if self.scan_block_comment(saw_newline).is_err() {
                        return;
                    }
                }
                // `//` line comment enabled through a `//` entry in `config.comments`.
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Find the nearest preceding character that is not a space or tab.
                    let prev_non_ws = if self.current > 0 {
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    // NOTE(review): presumably keeps `://` and `///` sequences
                    // (e.g. in URLs or paths) from being read as comments — confirm.
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        break;
                    }
                    self.scan_line_comment(saw_newline);
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                _ => break,
            }
        }
    }
1617
1618 fn scan_hash_line_comment(&mut self) {
1619 self.advance(); let start = self.current;
1621 while !self.is_at_end() && self.peek() != '\n' {
1622 self.advance();
1623 }
1624 let comment = self.text_from_range(start, self.current);
1625 let comment_text = comment.trim().to_string();
1626 if let Some(last) = self.tokens.last_mut() {
1627 last.trailing_comments.push(comment_text);
1628 } else {
1629 self.comments.push(comment_text);
1630 }
1631 }
1632
1633 fn scan_double_slash_comment(&mut self) {
1634 self.advance(); self.advance(); let start = self.current;
1637 while !self.is_at_end() && self.peek() != '\n' {
1638 self.advance();
1639 }
1640 let comment = self.text_from_range(start, self.current);
1641 let comment_text = comment.trim().to_string();
1642 if let Some(last) = self.tokens.last_mut() {
1643 last.trailing_comments.push(comment_text);
1644 } else {
1645 self.comments.push(comment_text);
1646 }
1647 }
1648
1649 fn scan_line_comment(&mut self, after_newline: bool) {
1650 self.advance(); self.advance(); let start = self.current;
1653 while !self.is_at_end() && self.peek() != '\n' {
1654 self.advance();
1655 }
1656 let comment_text = self.text_from_range(start, self.current);
1657
1658 if after_newline || self.tokens.is_empty() {
1661 self.comments.push(comment_text);
1662 } else if let Some(last) = self.tokens.last_mut() {
1663 last.trailing_comments.push(comment_text);
1664 }
1665 }
1666
1667 fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
1668 self.advance(); self.advance(); let content_start = self.current;
1671 let mut depth = 1;
1672
1673 while !self.is_at_end() && depth > 0 {
1674 if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
1675 self.advance();
1676 self.advance();
1677 depth += 1;
1678 } else if self.peek() == '*' && self.peek_next() == '/' {
1679 depth -= 1;
1680 if depth > 0 {
1681 self.advance();
1682 self.advance();
1683 }
1684 } else {
1685 self.advance();
1686 }
1687 }
1688
1689 if depth > 0 {
1690 return Err(Error::tokenize(
1691 "Unterminated block comment",
1692 self.line,
1693 self.column,
1694 self.start,
1695 self.current,
1696 ));
1697 }
1698
1699 let content = self.text_from_range(content_start, self.current);
1701 self.advance(); self.advance(); let comment_text = format!("/*{}*/", content);
1706
1707 if after_newline || self.tokens.is_empty() {
1710 self.comments.push(comment_text);
1711 } else if let Some(last) = self.tokens.last_mut() {
1712 last.trailing_comments.push(comment_text);
1713 }
1714
1715 Ok(())
1716 }
1717
1718 fn scan_hint(&mut self) -> Result<()> {
1720 self.advance(); self.advance(); self.advance(); let hint_start = self.current;
1724
1725 while !self.is_at_end() {
1727 if self.peek() == '*' && self.peek_next() == '/' {
1728 break;
1729 }
1730 self.advance();
1731 }
1732
1733 if self.is_at_end() {
1734 return Err(Error::tokenize(
1735 "Unterminated hint comment",
1736 self.line,
1737 self.column,
1738 self.start,
1739 self.current,
1740 ));
1741 }
1742
1743 let hint_text = self.text_from_range(hint_start, self.current);
1744 self.advance(); self.advance(); self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1748
1749 Ok(())
1750 }
1751
1752 fn scan_positional_parameter(&mut self) -> Result<()> {
1754 self.advance(); let start = self.current;
1756
1757 while !self.is_at_end() && self.peek().is_ascii_digit() {
1758 self.advance();
1759 }
1760
1761 let number = self.text_from_range(start, self.current);
1762 self.add_token_with_text(TokenType::Parameter, number);
1763 Ok(())
1764 }
1765
1766 fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
1771 let saved_pos = self.current;
1772
1773 self.advance(); let tag_start = self.current;
1779 while !self.is_at_end()
1780 && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1781 {
1782 self.advance();
1783 }
1784 let tag = self.text_from_range(tag_start, self.current);
1785
1786 if self.is_at_end() || self.peek() != '$' {
1788 self.current = saved_pos;
1790 return Ok(None);
1791 }
1792 self.advance(); let content_start = self.current;
1796 let closing_tag = format!("${}$", tag);
1797 let closing_chars: Vec<char> = closing_tag.chars().collect();
1798
1799 loop {
1800 if self.is_at_end() {
1801 self.current = saved_pos;
1803 return Ok(None);
1804 }
1805
1806 if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1808 let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1809 self.current + j < self.size && self.chars[self.current + j] == ch
1810 });
1811 if matches {
1812 let content = self.text_from_range(content_start, self.current);
1813 for _ in 0..closing_chars.len() {
1815 self.advance();
1816 }
1817 let token_text = format!("{}\x00{}", tag, content);
1819 self.add_token_with_text(TokenType::DollarString, token_text);
1820 return Ok(Some(()));
1821 }
1822 }
1823 self.advance();
1824 }
1825 }
1826
1827 fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1832 self.advance(); self.advance(); let start = self.current;
1837 while !self.is_at_end() {
1838 if self.peek() == '$'
1839 && self.current + 1 < self.size
1840 && self.chars[self.current + 1] == '$'
1841 {
1842 break;
1843 }
1844 self.advance();
1845 }
1846
1847 let content = self.text_from_range(start, self.current);
1848
1849 if !self.is_at_end() {
1850 self.advance(); self.advance(); }
1853
1854 self.add_token_with_text(TokenType::DollarString, content);
1855 Ok(())
1856 }
1857
    /// Scans exactly one token starting at the current position.
    ///
    /// Acts as a dispatcher: the first character (plus a little lookahead)
    /// selects a specialised scanner. The checks are order-sensitive — each
    /// one assumes the earlier ones did not match.
    ///
    /// # Errors
    ///
    /// Propagates whatever error the selected scanner returns.
    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        // Single-quoted strings; `'''` opens a triple-quoted string when the
        // configuration lists that delimiter.
        if c == '\'' {
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        // `"""` triple-quoted string, when configured.
        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        // `"` is a string delimiter only if it is configured as a quote and
        // is not also an identifier delimiter.
        if c == '"'
            && self.config.quotes.contains_key("\"")
            && !self.config.identifiers.contains_key(&'"')
        {
            return self.scan_double_quoted_string();
        }

        // Quoted identifier; the table supplies the matching closing character.
        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // A dot followed by a digit is a number (`.5`) unless the preceding
        // character is another dot or looks like the end of an identifier or
        // expression, in which case the dot is left to be scanned as a token
        // of its own.
        if c == '.' && self.peek_next().is_ascii_digit() {
            let prev_char = if self.current > 0 {
                self.chars[self.current - 1]
            } else {
                '\0'
            };
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // `/*+` hint (deliberately left unconsumed by `skip_whitespace`).
        if c == '/'
            && self.peek_next() == '*'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '+'
        {
            return self.scan_hint();
        }

        // Multi-character operators take precedence over single-character tokens.
        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // `$` followed by a tag-like character: try `$tag$...$tag$` first,
        // then fall back to a dollar identifier if the dialect allows one.
        if c == '$'
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || !self.peek_next().is_ascii())
        {
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
            if self.config.dollar_sign_is_identifier {
                return self.scan_dollar_identifier();
            }
        }

        // `$$ ... $$` untagged dollar-quoted string.
        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // `$1`-style positional parameter.
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        if c == '$' && self.config.dollar_sign_is_identifier {
            return self.scan_dollar_identifier();
        }

        // `#name`, `##name`, `@name`: handed to the T-SQL identifier scanner.
        if (c == '#' || c == '@')
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || self.peek_next() == '#')
        {
            return self.scan_tsql_identifier();
        }

        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        // U+2212 MINUS SIGN is accepted as a dash.
        if c == '\u{2212}' {
            self.advance();
            self.add_token(TokenType::Dash);
            return Ok(());
        }

        // U+2044 FRACTION SLASH is accepted as a slash.
        if c == '\u{2044}' {
            self.advance();
            self.add_token(TokenType::Slash);
            return Ok(());
        }

        // Curly single quotes delimit a string, curly double quotes an identifier.
        if c == '\u{2018}' || c == '\u{2019}' {
            return self.scan_unicode_quoted_string(c);
        }
        if c == '\u{201C}' || c == '\u{201D}' {
            return self.scan_unicode_quoted_identifier(c);
        }

        // Anything else is an identifier or keyword.
        self.scan_identifier_or_keyword()
    }
2017
2018 fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
2019 let c = self.peek();
2020 let next = self.peek_next();
2021 let third = if self.current + 2 < self.size {
2022 self.chars[self.current + 2]
2023 } else {
2024 '\0'
2025 };
2026
2027 if c == '-' && next == '|' && third == '-' {
2030 self.advance();
2031 self.advance();
2032 self.advance();
2033 return Some(TokenType::Adjacent);
2034 }
2035
2036 if c == '|' && next == '|' && third == '/' {
2038 self.advance();
2039 self.advance();
2040 self.advance();
2041 return Some(TokenType::DPipeSlash);
2042 }
2043
2044 if c == '#' && next == '>' && third == '>' {
2046 self.advance();
2047 self.advance();
2048 self.advance();
2049 return Some(TokenType::DHashArrow);
2050 }
2051
2052 if c == '-' && next == '>' && third == '>' {
2054 self.advance();
2055 self.advance();
2056 self.advance();
2057 return Some(TokenType::DArrow);
2058 }
2059
2060 if c == '<' && next == '=' && third == '>' {
2062 self.advance();
2063 self.advance();
2064 self.advance();
2065 return Some(TokenType::NullsafeEq);
2066 }
2067
2068 if c == '<' && next == '-' && third == '>' {
2070 self.advance();
2071 self.advance();
2072 self.advance();
2073 return Some(TokenType::LrArrow);
2074 }
2075
2076 if c == '<' && next == '@' {
2078 self.advance();
2079 self.advance();
2080 return Some(TokenType::LtAt);
2081 }
2082
2083 if c == '@' && next == '>' {
2085 self.advance();
2086 self.advance();
2087 return Some(TokenType::AtGt);
2088 }
2089
2090 if c == '~' && next == '~' && third == '~' {
2092 self.advance();
2093 self.advance();
2094 self.advance();
2095 return Some(TokenType::Glob);
2096 }
2097
2098 if c == '~' && next == '~' && third == '*' {
2100 self.advance();
2101 self.advance();
2102 self.advance();
2103 return Some(TokenType::ILike);
2104 }
2105
2106 let fourth = if self.current + 3 < self.size {
2108 self.chars[self.current + 3]
2109 } else {
2110 '\0'
2111 };
2112 if c == '!' && next == '~' && third == '~' && fourth == '*' {
2113 self.advance();
2114 self.advance();
2115 self.advance();
2116 self.advance();
2117 return Some(TokenType::NotILike);
2118 }
2119
2120 if c == '!' && next == '~' && third == '~' {
2122 self.advance();
2123 self.advance();
2124 self.advance();
2125 return Some(TokenType::NotLike);
2126 }
2127
2128 if c == '!' && next == '~' && third == '*' {
2130 self.advance();
2131 self.advance();
2132 self.advance();
2133 return Some(TokenType::NotIRLike);
2134 }
2135
2136 if c == '!' && next == ':' && third == '>' {
2138 self.advance();
2139 self.advance();
2140 self.advance();
2141 return Some(TokenType::NColonGt);
2142 }
2143
2144 if c == '?' && next == ':' && third == ':' {
2146 self.advance();
2147 self.advance();
2148 self.advance();
2149 return Some(TokenType::QDColon);
2150 }
2151
2152 if c == '!' && next == '~' {
2154 self.advance();
2155 self.advance();
2156 return Some(TokenType::NotRLike);
2157 }
2158
2159 if c == '~' && next == '~' {
2161 self.advance();
2162 self.advance();
2163 return Some(TokenType::Like);
2164 }
2165
2166 if c == '~' && next == '*' {
2168 self.advance();
2169 self.advance();
2170 return Some(TokenType::IRLike);
2171 }
2172
2173 if c == ':' && next == ':' && third == '$' {
2176 self.advance();
2177 self.advance();
2178 self.advance();
2179 return Some(TokenType::DColonDollar);
2180 }
2181 if c == ':' && next == ':' && third == '%' {
2182 self.advance();
2183 self.advance();
2184 self.advance();
2185 return Some(TokenType::DColonPercent);
2186 }
2187 if c == ':' && next == ':' && third == '?' {
2188 self.advance();
2189 self.advance();
2190 self.advance();
2191 return Some(TokenType::DColonQMark);
2192 }
2193
2194 let token_type = match (c, next) {
2196 ('.', ':') => Some(TokenType::DotColon),
2197 ('=', '=') => Some(TokenType::Eq), ('<', '=') => Some(TokenType::Lte),
2199 ('>', '=') => Some(TokenType::Gte),
2200 ('!', '=') => Some(TokenType::Neq),
2201 ('<', '>') => Some(TokenType::Neq),
2202 ('^', '=') => Some(TokenType::Neq),
2203 ('<', '<') => Some(TokenType::LtLt),
2204 ('>', '>') => Some(TokenType::GtGt),
2205 ('|', '|') => Some(TokenType::DPipe),
2206 ('|', '/') => Some(TokenType::PipeSlash), (':', ':') => Some(TokenType::DColon),
2208 (':', '=') => Some(TokenType::ColonEq), (':', '>') => Some(TokenType::ColonGt), ('-', '>') => Some(TokenType::Arrow), ('=', '>') => Some(TokenType::FArrow), ('&', '&') => Some(TokenType::DAmp),
2213 ('&', '<') => Some(TokenType::AmpLt), ('&', '>') => Some(TokenType::AmpGt), ('@', '@') => Some(TokenType::AtAt), ('?', '|') => Some(TokenType::QMarkPipe), ('?', '&') => Some(TokenType::QMarkAmp), ('?', '?') => Some(TokenType::DQMark), ('#', '>') => Some(TokenType::HashArrow), ('#', '-') => Some(TokenType::HashDash), ('^', '@') => Some(TokenType::CaretAt), ('*', '*') => Some(TokenType::DStar), ('|', '>') => Some(TokenType::PipeGt), _ => None,
2225 };
2226
2227 if token_type.is_some() {
2228 self.advance();
2229 self.advance();
2230 }
2231
2232 token_type
2233 }
2234
    /// Scans a single-quoted string literal and emits a `String` token
    /// holding the decoded value (delimiters removed, escapes resolved).
    ///
    /// A doubled quote (`''`) always stands for one literal quote. Backslash
    /// escapes are decoded only when `config.string_escapes` contains `\`.
    ///
    /// # Errors
    ///
    /// Returns an "Unterminated string" error when the input ends before the
    /// closing quote, unless `config.recover_unterminated_string` is set, in
    /// which case the text gathered so far is emitted as the token.
    fn scan_string(&mut self) -> Result<()> {
        // Step over the opening quote.
        self.advance();
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // `''` is an escaped quote.
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    // Closing quote: left unconsumed until after the loop.
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                // Recovery mode: `\'` with no further quote anywhere in the
                // input cannot be an escaped quote, so keep the backslash
                // literally and let the quote terminate the string.
                if self.config.recover_terminal_backslash_quote
                    && self.peek_next() == '\''
                    && !self.chars[self.current + 2..].contains(&'\'')
                {
                    value.push(self.advance());
                    break;
                }

                // Consume the backslash, then decode the escaped character.
                self.advance();
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        // Control-character escapes: 0x1A, bell, backspace,
                        // form feed, vertical tab.
                        'Z' => value.push('\x1A'),
                        'a' => value.push('\x07'),
                        'b' => value.push('\x08'),
                        'f' => value.push('\x0C'),
                        'v' => value.push('\x0B'),
                        'x' => {
                            // `\xHH`: read up to two hex digits.
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                // Fewer than two hex digits: keep the
                                // sequence as written.
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            value.push('%');
                        }
                        '_' => {
                            value.push('_');
                        }
                        _ => {
                            // Unrecognised escape: drop the backslash when
                            // `escape_follow_chars` is non-empty, keep it otherwise.
                            if !self.config.escape_follow_chars.is_empty() {
                                value.push(escaped);
                            } else {
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            if self.config.recover_unterminated_string {
                self.add_token_with_text(TokenType::String, value);
                return Ok(());
            }

            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        // Consume the closing quote.
        self.advance();
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2346
    /// Scans a double-quoted string literal and emits a `String` token
    /// holding the decoded value (delimiters removed, escapes resolved).
    ///
    /// A doubled quote (`""`) stands for one literal double quote. Backslash
    /// escapes are decoded only when `config.string_escapes` contains `\`.
    ///
    /// # Errors
    ///
    /// Returns an "Unterminated double-quoted string" error when the input
    /// ends before the closing quote.
    fn scan_double_quoted_string(&mut self) -> Result<()> {
        // Step over the opening quote.
        self.advance();
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // `""` is an escaped quote.
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    // Closing quote: left unconsumed until after the loop.
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                // Consume the backslash, then decode the escaped character.
                self.advance();
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        // Control-character escapes: 0x1A, bell, backspace,
                        // form feed, vertical tab.
                        'Z' => value.push('\x1A'),
                        'a' => value.push('\x07'),
                        'b' => value.push('\x08'),
                        'f' => value.push('\x0C'),
                        'v' => value.push('\x0B'),
                        'x' => {
                            // `\xHH`: read up to two hex digits.
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                // Fewer than two hex digits: keep the
                                // sequence as written.
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            value.push('%');
                        }
                        '_' => {
                            value.push('_');
                        }
                        _ => {
                            // Unrecognised escape: drop the backslash when
                            // `escape_follow_chars` is non-empty, keep it otherwise.
                            if !self.config.escape_follow_chars.is_empty() {
                                value.push(escaped);
                            } else {
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        // Consume the closing quote.
        self.advance();
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }
2446
2447 fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2448 self.advance();
2450 self.advance();
2451 self.advance();
2452 let mut value = String::new();
2453
2454 while !self.is_at_end() {
2455 if self.peek() == quote_char
2457 && self.current + 1 < self.size
2458 && self.chars[self.current + 1] == quote_char
2459 && self.current + 2 < self.size
2460 && self.chars[self.current + 2] == quote_char
2461 {
2462 break;
2464 }
2465 value.push(self.advance());
2466 }
2467
2468 if self.is_at_end() {
2469 return Err(Error::tokenize(
2470 "Unterminated triple-quoted string",
2471 self.line,
2472 self.column,
2473 self.start,
2474 self.current,
2475 ));
2476 }
2477
2478 self.advance();
2480 self.advance();
2481 self.advance();
2482 let token_type = if quote_char == '"' {
2483 TokenType::TripleDoubleQuotedString
2484 } else {
2485 TokenType::TripleSingleQuotedString
2486 };
2487 self.add_token_with_text(token_type, value);
2488 Ok(())
2489 }
2490
2491 fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2492 self.advance(); let mut value = String::new();
2494
2495 loop {
2496 if self.is_at_end() {
2497 return Err(Error::tokenize(
2498 "Unterminated identifier",
2499 self.line,
2500 self.column,
2501 self.start,
2502 self.current,
2503 ));
2504 }
2505 if self.peek() == end_quote {
2506 if self.peek_next() == end_quote {
2507 value.push(end_quote);
2509 self.advance(); self.advance(); } else {
2512 break;
2514 }
2515 } else {
2516 value.push(self.peek());
2517 self.advance();
2518 }
2519 }
2520
2521 self.advance(); self.add_token_with_text(TokenType::QuotedIdentifier, value);
2523 Ok(())
2524 }
2525
2526 fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2531 self.advance(); let start = self.current;
2533 let close_quote = if open_quote == '\u{2018}' {
2535 '\u{2019}' } else {
2537 '\u{2019}' };
2539 while !self.is_at_end() && self.peek() != close_quote {
2540 self.advance();
2541 }
2542 let value = self.text_from_range(start, self.current);
2543 if !self.is_at_end() {
2544 self.advance(); }
2546 self.add_token_with_text(TokenType::String, value);
2547 Ok(())
2548 }
2549
2550 fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2553 self.advance(); let start = self.current;
2555 let close_quote = if open_quote == '\u{201C}' {
2556 '\u{201D}' } else {
2558 '\u{201D}' };
2560 while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2561 self.advance();
2562 }
2563 let value = self.text_from_range(start, self.current);
2564 if !self.is_at_end() {
2565 self.advance(); }
2567 self.add_token_with_text(TokenType::QuotedIdentifier, value);
2568 Ok(())
2569 }
2570
2571 fn scan_number(&mut self) -> Result<()> {
2572 if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
2574 let next = if self.current + 1 < self.size {
2575 self.chars[self.current + 1]
2576 } else {
2577 '\0'
2578 };
2579 if next == 'x' || next == 'X' {
2580 self.advance();
2582 self.advance();
2583 let hex_start = self.current;
2585 while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
2586 if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
2587 break;
2588 }
2589 self.advance();
2590 }
2591 if self.current > hex_start {
2592 let mut is_hex_float = false;
2594 if !self.is_at_end() && self.peek() == '.' {
2596 let after_dot = if self.current + 1 < self.size {
2597 self.chars[self.current + 1]
2598 } else {
2599 '\0'
2600 };
2601 if after_dot.is_ascii_hexdigit() {
2602 is_hex_float = true;
2603 self.advance(); while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2605 self.advance();
2606 }
2607 }
2608 }
2609 if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
2611 is_hex_float = true;
2612 self.advance(); if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
2614 self.advance();
2615 }
2616 while !self.is_at_end() && self.peek().is_ascii_digit() {
2617 self.advance();
2618 }
2619 }
2620 if is_hex_float {
2621 let raw_text = self.text_from_range(self.start, self.current);
2623 let full_text = if self.config.numbers_can_be_underscore_separated
2624 && raw_text.contains('_')
2625 {
2626 raw_text.replace('_', "")
2627 } else {
2628 raw_text
2629 };
2630 self.add_token_with_text(TokenType::Number, full_text);
2631 } else if self.config.hex_string_is_integer_type {
2632 let raw_value = self.text_from_range(hex_start, self.current);
2634 let hex_value = if self.config.numbers_can_be_underscore_separated
2635 && raw_value.contains('_')
2636 {
2637 raw_value.replace('_', "")
2638 } else {
2639 raw_value
2640 };
2641 self.add_token_with_text(TokenType::HexNumber, hex_value);
2642 } else {
2643 let raw_value = self.text_from_range(hex_start, self.current);
2645 let hex_value = if self.config.numbers_can_be_underscore_separated
2646 && raw_value.contains('_')
2647 {
2648 raw_value.replace('_', "")
2649 } else {
2650 raw_value
2651 };
2652 self.add_token_with_text(TokenType::HexString, hex_value);
2653 }
2654 return Ok(());
2655 }
2656 self.current = self.start + 1;
2659 }
2660 }
2661
2662 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2664 if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
2666 break;
2667 }
2668 self.advance();
2669 }
2670
2671 if self.peek() == '.' {
2675 let next = self.peek_next();
2676 if next != '.' {
2682 self.advance(); while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2685 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2686 break;
2687 }
2688 self.advance();
2689 }
2690 }
2691 }
2692
2693 if self.peek() == 'e' || self.peek() == 'E' {
2695 self.advance();
2696 if self.peek() == '+' || self.peek() == '-' {
2697 self.advance();
2698 }
2699 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2700 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2701 break;
2702 }
2703 self.advance();
2704 }
2705 }
2706
2707 let raw_text = self.text_from_range(self.start, self.current);
2708 let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2711 raw_text.replace('_', "")
2712 } else {
2713 raw_text
2714 };
2715
2716 if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2718 let next_char: String = self.peek().to_ascii_uppercase().to_string();
2719 let suffix_match = if self.current + 1 < self.size {
2721 let two_char: String = [
2722 self.chars[self.current].to_ascii_uppercase(),
2723 self.chars[self.current + 1].to_ascii_uppercase(),
2724 ]
2725 .iter()
2726 .collect();
2727 if self.config.numeric_literals.contains_key(&two_char) {
2728 let after_suffix = if self.current + 2 < self.size {
2730 self.chars[self.current + 2]
2731 } else {
2732 ' '
2733 };
2734 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2735 Some((two_char, 2))
2736 } else {
2737 None
2738 }
2739 } else if self.config.numeric_literals.contains_key(&next_char) {
2740 let after_suffix = if self.current + 1 < self.size {
2742 self.chars[self.current + 1]
2743 } else {
2744 ' '
2745 };
2746 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2747 Some((next_char, 1))
2748 } else {
2749 None
2750 }
2751 } else {
2752 None
2753 }
2754 } else if self.config.numeric_literals.contains_key(&next_char) {
2755 Some((next_char, 1))
2757 } else {
2758 None
2759 };
2760
2761 if let Some((suffix, len)) = suffix_match {
2762 for _ in 0..len {
2764 self.advance();
2765 }
2766 let type_name = self
2769 .config
2770 .numeric_literals
2771 .get(&suffix)
2772 .expect("suffix verified by contains_key above")
2773 .clone();
2774 let combined = format!("{}::{}", text, type_name);
2775 self.add_token_with_text(TokenType::Number, combined);
2776 return Ok(());
2777 }
2778 }
2779
2780 if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2783 let next = self.peek();
2784 if next.is_alphabetic() || next == '_' {
2785 while !self.is_at_end() {
2787 let ch = self.peek();
2788 if ch.is_alphanumeric() || ch == '_' {
2789 self.advance();
2790 } else {
2791 break;
2792 }
2793 }
2794 let ident_text = self.text_from_range(self.start, self.current);
2795 self.add_token_with_text(TokenType::Identifier, ident_text);
2796 return Ok(());
2797 }
2798 }
2799
2800 self.add_token_with_text(TokenType::Number, text);
2801 Ok(())
2802 }
2803
2804 fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2806 self.advance();
2808
2809 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2811 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2812 break;
2813 }
2814 self.advance();
2815 }
2816
2817 if self.peek() == 'e' || self.peek() == 'E' {
2819 self.advance();
2820 if self.peek() == '+' || self.peek() == '-' {
2821 self.advance();
2822 }
2823 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2824 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2825 break;
2826 }
2827 self.advance();
2828 }
2829 }
2830
2831 let raw_text = self.text_from_range(self.start, self.current);
2832 let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2835 raw_text.replace('_', "")
2836 } else {
2837 raw_text
2838 };
2839 self.add_token_with_text(TokenType::Number, text);
2840 Ok(())
2841 }
2842
2843 #[inline]
2846 fn lookup_keyword_ascii(keywords: &HashMap<String, TokenType>, text: &str) -> TokenType {
2847 if text.len() > 128 {
2848 return TokenType::Var;
2849 }
2850 let mut buf = [0u8; 128];
2851 for (i, b) in text.bytes().enumerate() {
2852 buf[i] = b.to_ascii_uppercase();
2853 }
2854 if let Ok(upper) = std::str::from_utf8(&buf[..text.len()]) {
2855 keywords.get(upper).copied().unwrap_or(TokenType::Var)
2856 } else {
2857 TokenType::Var
2858 }
2859 }
2860
2861 fn scan_identifier_or_keyword(&mut self) -> Result<()> {
2862 let first_char = self.peek();
2864 if !first_char.is_alphanumeric() && first_char != '_' {
2865 let c = self.advance();
2867 return Err(Error::tokenize(
2868 format!("Unexpected character: '{}'", c),
2869 self.line,
2870 self.column,
2871 self.start,
2872 self.current,
2873 ));
2874 }
2875
2876 while !self.is_at_end() {
2877 let c = self.peek();
2878 if c == '#' {
2882 let next_c = if self.current + 1 < self.size {
2883 self.chars[self.current + 1]
2884 } else {
2885 '\0'
2886 };
2887 if next_c == '>' || next_c == '-' {
2888 break; }
2890 self.advance();
2891 } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
2892 self.advance();
2893 } else {
2894 break;
2895 }
2896 }
2897
2898 let text = self.text_from_range(self.start, self.current);
2899
2900 if text.eq_ignore_ascii_case("NOT") && self.peek() == '=' {
2902 self.advance(); self.add_token(TokenType::Neq);
2904 return Ok(());
2905 }
2906
2907 let next_char = self.peek();
2910 let is_single_quote = next_char == '\'';
2911 let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
2912 let is_double_quote_for_raw = next_char == '"';
2915
2916 if text.eq_ignore_ascii_case("R") && (is_single_quote || is_double_quote_for_raw) {
2919 let quote_char = if is_single_quote { '\'' } else { '"' };
2922 self.advance(); if self.peek() == quote_char && self.peek_next() == quote_char {
2926 self.advance(); self.advance(); let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
2930 self.add_token_with_text(TokenType::RawString, string_value);
2931 } else {
2932 let string_value = self.scan_raw_string_content(quote_char)?;
2933 self.add_token_with_text(TokenType::RawString, string_value);
2934 }
2935 return Ok(());
2936 }
2937
2938 if is_single_quote || is_double_quote {
2939 if text.eq_ignore_ascii_case("N") {
2940 self.advance(); let string_value = if is_single_quote {
2943 self.scan_string_content()?
2944 } else {
2945 self.scan_double_quoted_string_content()?
2946 };
2947 self.add_token_with_text(TokenType::NationalString, string_value);
2948 return Ok(());
2949 } else if text.eq_ignore_ascii_case("E") {
2950 let lowercase = text == "e";
2954 let prefix = if lowercase { "e:" } else { "E:" };
2955 self.advance(); let string_value = self.scan_string_content_with_escapes(true)?;
2957 self.add_token_with_text(
2958 TokenType::EscapeString,
2959 format!("{}{}", prefix, string_value),
2960 );
2961 return Ok(());
2962 } else if text.eq_ignore_ascii_case("X") {
2963 self.advance(); let string_value = if is_single_quote {
2966 self.scan_string_content()?
2967 } else {
2968 self.scan_double_quoted_string_content()?
2969 };
2970 self.add_token_with_text(TokenType::HexString, string_value);
2971 return Ok(());
2972 } else if text.eq_ignore_ascii_case("B") && is_double_quote {
2973 self.advance(); let string_value = self.scan_double_quoted_string_content()?;
2976 self.add_token_with_text(TokenType::ByteString, string_value);
2977 return Ok(());
2978 } else if text.eq_ignore_ascii_case("B") && is_single_quote {
2979 self.advance(); let string_value = self.scan_string_content()?;
2983 if self.config.b_prefix_is_byte_string {
2984 self.add_token_with_text(TokenType::ByteString, string_value);
2985 } else {
2986 self.add_token_with_text(TokenType::BitString, string_value);
2987 }
2988 return Ok(());
2989 }
2990 }
2991
2992 if text.eq_ignore_ascii_case("U")
2994 && self.peek() == '&'
2995 && self.current + 1 < self.size
2996 && self.chars[self.current + 1] == '\''
2997 {
2998 self.advance(); self.advance(); let string_value = self.scan_string_content()?;
3001 self.add_token_with_text(TokenType::UnicodeString, string_value);
3002 return Ok(());
3003 }
3004
3005 let token_type = Self::lookup_keyword_ascii(&self.config.keywords, &text);
3006
3007 self.add_token_with_text(token_type, text);
3008 Ok(())
3009 }
3010
3011 fn scan_string_content_with_escapes(
3015 &mut self,
3016 force_backslash_escapes: bool,
3017 ) -> Result<String> {
3018 let mut value = String::new();
3019 let use_backslash_escapes =
3020 force_backslash_escapes || self.config.string_escapes.contains(&'\\');
3021
3022 while !self.is_at_end() {
3023 let c = self.peek();
3024 if c == '\'' {
3025 if self.peek_next() == '\'' {
3026 value.push('\'');
3028 self.advance();
3029 self.advance();
3030 } else {
3031 break;
3032 }
3033 } else if c == '\\' && use_backslash_escapes {
3034 value.push(self.advance());
3036 if !self.is_at_end() {
3037 value.push(self.advance());
3038 }
3039 } else {
3040 value.push(self.advance());
3041 }
3042 }
3043
3044 if self.is_at_end() {
3045 return Err(Error::tokenize(
3046 "Unterminated string",
3047 self.line,
3048 self.column,
3049 self.start,
3050 self.current,
3051 ));
3052 }
3053
3054 self.advance(); Ok(value)
3056 }
3057
3058 fn scan_string_content(&mut self) -> Result<String> {
3060 self.scan_string_content_with_escapes(false)
3061 }
3062
3063 fn scan_double_quoted_string_content(&mut self) -> Result<String> {
3066 let mut value = String::new();
3067 let use_backslash_escapes = self.config.string_escapes.contains(&'\\');
3068
3069 while !self.is_at_end() {
3070 let c = self.peek();
3071 if c == '"' {
3072 if self.peek_next() == '"' {
3073 value.push('"');
3075 self.advance();
3076 self.advance();
3077 } else {
3078 break;
3079 }
3080 } else if c == '\\' && use_backslash_escapes {
3081 self.advance(); if !self.is_at_end() {
3084 let escaped = self.advance();
3085 match escaped {
3086 'n' => value.push('\n'),
3087 'r' => value.push('\r'),
3088 't' => value.push('\t'),
3089 '0' => value.push('\0'),
3090 '\\' => value.push('\\'),
3091 '"' => value.push('"'),
3092 '\'' => value.push('\''),
3093 'x' => {
3094 let mut hex = String::new();
3096 for _ in 0..2 {
3097 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
3098 hex.push(self.advance());
3099 }
3100 }
3101 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
3102 value.push(byte as char);
3103 } else {
3104 value.push('\\');
3106 value.push('x');
3107 value.push_str(&hex);
3108 }
3109 }
3110 _ => {
3111 value.push('\\');
3113 value.push(escaped);
3114 }
3115 }
3116 }
3117 } else {
3118 value.push(self.advance());
3119 }
3120 }
3121
3122 if self.is_at_end() {
3123 return Err(Error::tokenize(
3124 "Unterminated double-quoted string",
3125 self.line,
3126 self.column,
3127 self.start,
3128 self.current,
3129 ));
3130 }
3131
3132 self.advance(); Ok(value)
3134 }
3135
3136 fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3141 let mut value = String::new();
3142
3143 while !self.is_at_end() {
3144 let c = self.peek();
3145 if c == quote_char {
3146 if self.peek_next() == quote_char {
3147 value.push(quote_char);
3149 self.advance();
3150 self.advance();
3151 } else {
3152 break;
3153 }
3154 } else if c == '\\'
3155 && self.peek_next() == quote_char
3156 && self.config.string_escapes_allowed_in_raw_strings
3157 {
3158 value.push(quote_char);
3162 self.advance(); self.advance(); } else {
3165 value.push(self.advance());
3167 }
3168 }
3169
3170 if self.is_at_end() {
3171 return Err(Error::tokenize(
3172 "Unterminated raw string",
3173 self.line,
3174 self.column,
3175 self.start,
3176 self.current,
3177 ));
3178 }
3179
3180 self.advance(); Ok(value)
3182 }
3183
3184 fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3187 let mut value = String::new();
3188
3189 while !self.is_at_end() {
3190 let c = self.peek();
3191 if c == quote_char && self.peek_next() == quote_char {
3192 if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3194 self.advance(); self.advance(); self.advance(); return Ok(value);
3199 }
3200 }
3201 let ch = self.advance();
3203 value.push(ch);
3204 }
3205
3206 Err(Error::tokenize(
3207 "Unterminated raw triple-quoted string",
3208 self.line,
3209 self.column,
3210 self.start,
3211 self.current,
3212 ))
3213 }
3214
3215 fn scan_dollar_identifier(&mut self) -> Result<()> {
3220 self.advance();
3222
3223 while !self.is_at_end() {
3225 let c = self.peek();
3226 if c.is_alphanumeric() || c == '_' || c == '$' {
3227 self.advance();
3228 } else {
3229 break;
3230 }
3231 }
3232
3233 let text = self.text_from_range(self.start, self.current);
3234 self.add_token_with_text(TokenType::Var, text);
3235 Ok(())
3236 }
3237
3238 fn scan_tsql_identifier(&mut self) -> Result<()> {
3239 let first = self.advance();
3241
3242 if first == '#' && self.peek() == '#' {
3244 self.advance();
3245 }
3246
3247 while !self.is_at_end() {
3249 let c = self.peek();
3250 if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3251 self.advance();
3252 } else {
3253 break;
3254 }
3255 }
3256
3257 let text = self.text_from_range(self.start, self.current);
3258 self.add_token_with_text(TokenType::Var, text);
3260 Ok(())
3261 }
3262
3263 fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
3267 let len = self.tokens.len();
3268 if len < 3 {
3269 return None;
3270 }
3271
3272 let last = &self.tokens[len - 1];
3274 if last.text.eq_ignore_ascii_case("VALUES") {
3275 return None;
3276 }
3277 if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
3278 return None;
3279 }
3280
3281 let format_tok = &self.tokens[len - 2];
3283 if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
3284 return None;
3285 }
3286
3287 let has_insert = self.tokens[..len - 2]
3289 .iter()
3290 .rev()
3291 .take(20)
3292 .any(|t| t.token_type == TokenType::Insert);
3293 if !has_insert {
3294 return None;
3295 }
3296
3297 let raw_start = self.current;
3301 while !self.is_at_end() {
3302 let c = self.peek();
3303 if c == '\n' {
3304 let saved = self.current;
3306 self.advance(); while !self.is_at_end() && self.peek() == '\r' {
3309 self.advance();
3310 }
3311 if self.is_at_end() || self.peek() == '\n' {
3312 let raw = self.text_from_range(raw_start, saved);
3315 return Some(raw.trim().to_string());
3316 }
3317 } else {
3319 self.advance();
3320 }
3321 }
3322
3323 let raw = self.text_from_range(raw_start, self.current);
3325 let trimmed = raw.trim().to_string();
3326 if trimmed.is_empty() {
3327 None
3328 } else {
3329 Some(trimmed)
3330 }
3331 }
3332
3333 fn add_token(&mut self, token_type: TokenType) {
3334 let text = self.text_from_range(self.start, self.current);
3335 self.add_token_with_text(token_type, text);
3336 }
3337
3338 fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3339 let span = Span::new(self.start, self.current, self.line, self.column);
3340 let mut token = Token::new(token_type, text, span);
3341 token.comments.append(&mut self.comments);
3342 self.tokens.push(token);
3343 }
3344}
3345
#[cfg(test)]
mod tests {
    use super::*;

    /// A keyword followed by an integer literal.
    #[test]
    fn test_simple_select() {
        let lexer = Tokenizer::default();
        let tokens = lexer.tokenize("SELECT 1").unwrap();

        let kinds: Vec<TokenType> = tokens.iter().map(|tok| tok.token_type).collect();
        assert_eq!(kinds, vec![TokenType::Select, TokenType::Number]);
        assert_eq!(tokens[1].text, "1");
    }

    /// Plain identifiers are emitted as `Var` tokens between keywords and commas.
    #[test]
    fn test_select_with_identifier() {
        let lexer = Tokenizer::default();
        let tokens = lexer.tokenize("SELECT a, b FROM t").unwrap();

        let kinds: Vec<TokenType> = tokens.iter().map(|tok| tok.token_type).collect();
        assert_eq!(
            kinds,
            vec![
                TokenType::Select,
                TokenType::Var,
                TokenType::Comma,
                TokenType::Var,
                TokenType::From,
                TokenType::Var,
            ]
        );
        for (index, name) in [(1, "a"), (3, "b"), (5, "t")] {
            assert_eq!(tokens[index].text, name);
        }
    }

    /// The token text of a string literal excludes the surrounding quotes.
    #[test]
    fn test_string_literal() {
        let lexer = Tokenizer::default();
        let tokens = lexer.tokenize("SELECT 'hello'").unwrap();

        assert_eq!(tokens.len(), 2);
        let literal = &tokens[1];
        assert_eq!(literal.token_type, TokenType::String);
        assert_eq!(literal.text, "hello");
    }

    /// A doubled single quote inside a string becomes one quote.
    #[test]
    fn test_escaped_string() {
        let lexer = Tokenizer::default();
        let tokens = lexer.tokenize("SELECT 'it''s'").unwrap();

        assert_eq!(tokens.len(), 2);
        let literal = &tokens[1];
        assert_eq!(literal.token_type, TokenType::String);
        assert_eq!(literal.text, "it's");
    }

    /// With `recover_terminal_backslash_quote` enabled, a string ending in a
    /// backslash before the closing quote is still terminated by that quote.
    #[test]
    fn test_terminal_backslash_quote_recovery() {
        let mut config = TokenizerConfig::default();
        config.string_escapes.push('\\');
        config.recover_terminal_backslash_quote = true;

        let lexer = Tokenizer::new(config);
        let tokens = lexer
            .tokenize("SHOW FUNCTIONS LIKE 'a\\' OR 1=1")
            .unwrap();

        assert_eq!(tokens.len(), 8);
        assert_eq!(tokens[3].token_type, TokenType::String);
        assert_eq!(tokens[3].text, "a\\");
        assert_eq!(tokens[4].token_type, TokenType::Or);
    }

    /// A line comment after a token is attached to it as a trailing comment.
    #[test]
    fn test_comments() {
        let lexer = Tokenizer::default();
        let tokens = lexer.tokenize("SELECT -- comment\n1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].trailing_comments, vec![" comment".to_string()]);
    }

    /// Line comments between `AND` operands are regenerated as block comments.
    #[test]
    fn test_comment_in_and_chain() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
        let ast = Parser::parse_sql(sql).unwrap();

        let mut generator = Generator::default();
        let output = generator.generate(&ast[0]).unwrap();

        let expected = "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla";
        assert_eq!(output, expected);
    }

    /// Arithmetic operators are tokenized between number literals.
    #[test]
    fn test_operators() {
        let lexer = Tokenizer::default();
        let tokens = lexer.tokenize("1 + 2 * 3").unwrap();

        let kinds: Vec<TokenType> = tokens.iter().map(|tok| tok.token_type).collect();
        assert_eq!(
            kinds,
            vec![
                TokenType::Number,
                TokenType::Plus,
                TokenType::Number,
                TokenType::Star,
                TokenType::Number,
            ]
        );
    }

    /// Two-character comparison operators are single tokens.
    #[test]
    fn test_comparison_operators() {
        let lexer = Tokenizer::default();
        let tokens = lexer.tokenize("a <= b >= c != d").unwrap();

        let expectations = [
            (1, TokenType::Lte),
            (3, TokenType::Gte),
            (5, TokenType::Neq),
        ];
        for (index, expected) in expectations {
            assert_eq!(tokens[index].token_type, expected);
        }
    }

    /// `N'...'` is a single national-string token.
    #[test]
    fn test_national_string() {
        let lexer = Tokenizer::default();
        let tokens = lexer.tokenize("N'abc'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for N'abc', got {:?}",
            tokens
        );
        let only = &tokens[0];
        assert_eq!(only.token_type, TokenType::NationalString);
        assert_eq!(only.text, "abc");
    }

    /// `X'...'` is a single hex-string token.
    #[test]
    fn test_hex_string() {
        let lexer = Tokenizer::default();
        let tokens = lexer.tokenize("X'ABCD'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for X'ABCD', got {:?}",
            tokens
        );
        let only = &tokens[0];
        assert_eq!(only.token_type, TokenType::HexString);
        assert_eq!(only.text, "ABCD");
    }

    /// `B'...'` is a single bit-string token under the default configuration.
    #[test]
    fn test_bit_string() {
        let lexer = Tokenizer::default();
        let tokens = lexer.tokenize("B'01010'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for B'01010', got {:?}",
            tokens
        );
        let only = &tokens[0];
        assert_eq!(only.token_type, TokenType::BitString);
        assert_eq!(only.text, "01010");
    }

    /// A dot after digits belongs to the number unless another dot follows.
    #[test]
    fn test_trailing_dot_number() {
        let lexer = Tokenizer::default();

        // Trailing dot with nothing after it.
        let tokens = lexer.tokenize("SELECT 1.").unwrap();
        assert_eq!(
            tokens.len(),
            2,
            "Expected 2 tokens for 'SELECT 1.', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");

        // Ordinary decimal.
        let tokens = lexer.tokenize("SELECT 1.5").unwrap();
        assert_eq!(tokens[1].text, "1.5");

        // Dot followed by a letter: the dot stays with the number.
        let tokens = lexer.tokenize("SELECT 1.a").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");
        assert_eq!(tokens[2].token_type, TokenType::Var);

        // Two dots in a row: neither is absorbed into a number.
        let tokens = lexer.tokenize("SELECT 1..2").unwrap();
        let expected_tail = [
            TokenType::Number,
            TokenType::Dot,
            TokenType::Dot,
            TokenType::Number,
        ];
        for (offset, expected) in expected_tail.into_iter().enumerate() {
            assert_eq!(tokens[offset + 1].token_type, expected);
        }
        assert_eq!(tokens[1].text, "1");
        assert_eq!(tokens[4].text, "2");
    }

    /// A dot directly followed by digits starts a number; otherwise it is `Dot`.
    #[test]
    fn test_leading_dot_number() {
        let lexer = Tokenizer::default();

        let tokens = lexer.tokenize(".25").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.25', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".25");

        let tokens = lexer.tokenize("SAMPLE (.25)").unwrap();
        assert_eq!(
            tokens.len(),
            4,
            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
            tokens
        );
        let expected_kinds = [
            TokenType::Sample,
            TokenType::LParen,
            TokenType::Number,
            TokenType::RParen,
        ];
        for (index, expected) in expected_kinds.into_iter().enumerate() {
            assert_eq!(tokens[index].token_type, expected);
        }
        assert_eq!(tokens[2].text, ".25");

        let tokens = lexer.tokenize(".5e10").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.5e10', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".5e10");

        let tokens = lexer.tokenize("a.b").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'a.b', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Dot);
    }

    /// Curly quotes are accepted; an unknown symbol is a tokenize error.
    #[test]
    fn test_unrecognized_character() {
        let lexer = Tokenizer::default();

        let curly_quoted = lexer.tokenize("SELECT \u{2018}hello\u{2019}");
        assert!(
            curly_quoted.is_ok(),
            "Curly quotes should be tokenized as strings"
        );

        let bullet = lexer.tokenize("SELECT • FROM t");
        assert!(bullet.is_err());
    }

    /// `:=`, `:` and `::` are distinguished from one another.
    #[test]
    fn test_colon_eq_tokenization() {
        let lexer = Tokenizer::default();

        let tokens = lexer.tokenize("a := 1").unwrap();
        let kinds: Vec<TokenType> = tokens.iter().map(|tok| tok.token_type).collect();
        assert_eq!(
            kinds,
            vec![TokenType::Var, TokenType::ColonEq, TokenType::Number]
        );

        let tokens = lexer.tokenize("a:b").unwrap();
        let kinds: Vec<TokenType> = tokens.iter().map(|tok| tok.token_type).collect();
        assert!(kinds.contains(&TokenType::Colon));
        assert!(!kinds.contains(&TokenType::ColonEq));

        let tokens = lexer.tokenize("a::INT").unwrap();
        let kinds: Vec<TokenType> = tokens.iter().map(|tok| tok.token_type).collect();
        assert!(kinds.contains(&TokenType::DColon));
    }

    /// `:=` assignments, named arguments and prefix aliases parse and regenerate
    /// to the expected SQL.
    #[test]
    fn test_colon_eq_parsing() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        // (input SQL, message used if parsing fails, expected generated SQL)
        let cases = [
            (
                "SELECT @var1 := 1, @var2",
                "Failed to parse MySQL @var := expr",
                "SELECT @var1 := 1, @var2",
            ),
            (
                "SELECT @var1, @var2 := @var1",
                "Failed to parse MySQL @var2 := @var1",
                "SELECT @var1, @var2 := @var1",
            ),
            (
                "SELECT @var1 := COUNT(*) FROM t1",
                "Failed to parse MySQL @var := COUNT(*)",
                "SELECT @var1 := COUNT(*) FROM t1",
            ),
            (
                "SET @var1 := 1",
                "Failed to parse SET @var1 := 1",
                "SET @var1 = 1",
            ),
            (
                "UNION_VALUE(k1 := 1)",
                "Failed to parse named arg with :=",
                "UNION_VALUE(k1 := 1)",
            ),
            (
                "SELECT UNNEST(col, recursive := TRUE) FROM t",
                "Failed to parse UNNEST with :=",
                "SELECT UNNEST(col, recursive := TRUE) FROM t",
            ),
            (
                "SELECT foo: 1",
                "Failed to parse DuckDB prefix alias foo: 1",
                "SELECT 1 AS foo",
            ),
            (
                "SELECT foo: 1, bar: 2, baz: 3",
                "Failed to parse DuckDB multiple prefix aliases",
                "SELECT 1 AS foo, 2 AS bar, 3 AS baz",
            ),
        ];

        for (sql, parse_failure, expected) in cases {
            let ast = Parser::parse_sql(sql).expect(parse_failure);
            let output = Generator::sql(&ast[0]).expect("Failed to generate");
            assert_eq!(output, expected);
        }
    }

    /// Parse -> transform -> generate round trips for `:=` in MySQL and DuckDB.
    #[test]
    fn test_colon_eq_dialect_roundtrip() {
        use crate::dialects::{Dialect, DialectType};

        /// Runs `sql` through the dialect pipeline and compares the output with
        /// `expected`, or with `sql` itself when `expected` is `None`.
        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
            let d = Dialect::get(dialect);
            let ast = match d.parse(sql) {
                Ok(ast) => ast,
                Err(e) => panic!("Parse error for '{}': {}", sql, e),
            };
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = match d.transform(ast[0].clone()) {
                Ok(node) => node,
                Err(e) => panic!("Transform error for '{}': {}", sql, e),
            };
            let output = match d.generate(&transformed) {
                Ok(text) => text,
                Err(e) => panic!("Generate error for '{}': {}", sql, e),
            };
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // MySQL user-variable assignments.
        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

        // DuckDB named arguments.
        check(
            DialectType::DuckDB,
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
            None,
        );
        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

        // Parse-only check: the generated form is not asserted here.
        {
            let d = Dialect::get(DialectType::DuckDB);
            let ast = d
                .parse("STRUCT_PACK(a := 'b')::json")
                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
        }

        // DuckDB prefix aliases are rewritten to `AS` aliases.
        check(
            DialectType::DuckDB,
            "SELECT foo: 1",
            Some("SELECT 1 AS foo"),
        );
        check(
            DialectType::DuckDB,
            "SELECT foo: 1, bar: 2, baz: 3",
            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
        );
    }

    /// Block comments survive a parse/generate round trip unchanged.
    #[test]
    fn test_comment_roundtrip() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        /// Returns `None` when `sql` round-trips exactly, otherwise a
        /// description of what went wrong.
        fn check_roundtrip(sql: &str) -> Option<String> {
            let ast = match Parser::parse_sql(sql) {
                Ok(ast) => ast,
                Err(e) => return Some(format!("Parse error: {:?}", e)),
            };
            if ast.is_empty() {
                return Some("Empty AST".to_string());
            }

            let mut generator = Generator::default();
            let output = match generator.generate(&ast[0]) {
                Ok(text) => text,
                Err(e) => return Some(format!("Gen error: {:?}", e)),
            };

            if output != sql {
                return Some(format!(
                    "Mismatch:\n input: {}\n output: {}",
                    sql, output
                ));
            }
            None
        }

        let tests = [
            "SELECT c /* c1 */ AS alias /* c2 */",
            "SELECT a /* x */, b /* x */",
            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
            "SELECT * FROM foo /* x */, bla /* x */",
            "SELECT 1 /* comment */ + 1",
            "SELECT 1 /* c1 */ + 2 /* c2 */",
            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
            "SELECT CAST(x AS INT) /* comment */ FROM foo",
            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
            "/* comment */ CREATE TABLE foo AS SELECT 1",
            "INSERT INTO foo SELECT * FROM bar /* comment */",
            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
        ];

        // Collect every failure so one run reports all mismatches.
        let failures: Vec<String> = tests
            .iter()
            .filter_map(|sql| check_roundtrip(sql))
            .collect();

        if !failures.is_empty() {
            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
        }
    }

    /// Dollar-quoted string tokens split into tag and content, and round-trip
    /// through the Databricks dialect.
    #[test]
    fn test_dollar_quoted_string_parsing() {
        use crate::dialects::{Dialect, DialectType};

        // Tag and content are separated by a NUL byte.
        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
        assert_eq!(tag, Some("FOO".to_string()));
        assert_eq!(content, "content here");

        // Without a NUL byte the whole text is content.
        let (tag, content) = super::parse_dollar_string_token("just content");
        assert_eq!(tag, None);
        assert_eq!(content, "just content");

        /// Round-trips `sql` through the Databricks dialect and compares the
        /// output with `expected`, or with `sql` when `expected` is `None`.
        fn check_databricks(sql: &str, expected: Option<&str>) {
            let d = Dialect::get(DialectType::Databricks);
            let ast = match d.parse(sql) {
                Ok(ast) => ast,
                Err(e) => panic!("Parse error for '{}': {}", sql, e),
            };
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = match d.transform(ast[0].clone()) {
                Ok(node) => node,
                Err(e) => panic!("Transform error for '{}': {}", sql, e),
            };
            let output = match d.generate(&transformed) {
                Ok(text) => text,
                Err(e) => panic!("Generate error for '{}': {}", sql, e),
            };
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        // Untagged `$$ ... $$` body.
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n return x+1$$",
            None,
        );

        // Tagged `$FOO$ ... $FOO$` body.
        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n return x+1$FOO$",
            None,
        );
    }

    /// Underscore separators are stripped only when the option is enabled.
    #[test]
    fn test_numeric_underscore_stripping() {
        let mut config = TokenizerConfig::default();
        config.numbers_can_be_underscore_separated = true;
        let lexer = Tokenizer::new(config);

        let stripped_cases = [
            ("SELECT 1_2_3_4_5", "12345"),
            ("SELECT 20_000", "20000"),
            ("SELECT 1_2E+1_0", "12E+10"),
        ];
        for (sql, expected_text) in stripped_cases {
            let tokens = lexer.tokenize(sql).unwrap();
            assert_eq!(tokens[1].token_type, TokenType::Number);
            assert_eq!(tokens[1].text, expected_text);
        }

        // With the default configuration the underscores are kept.
        let default_lexer = Tokenizer::default();
        let tokens = default_lexer.tokenize("SELECT 1_2_3_4_5").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1_2_3_4_5");
    }
}