1use crate::error::{Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::fmt;
10use std::sync::LazyLock;
11#[cfg(feature = "bindings")]
12use ts_rs::TS;
13
/// Splits `text` at its first NUL (`'\x00'`) character.
///
/// Returns `(Some(tag), content)` where `tag` is everything before the first
/// NUL and `content` is everything after it. When `text` contains no NUL the
/// result is `(None, text)`. Only the first NUL acts as a separator; any
/// later ones stay in `content`.
///
/// NOTE(review): presumably the tokenizer stores tagged dollar-quoted strings
/// as `tag\0content` — confirm against the code that produces these tokens.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    match text.split_once('\x00') {
        Some((tag, content)) => (Some(tag.to_owned()), content.to_owned()),
        None => (None, text.to_owned()),
    }
}
26
27#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
29#[cfg_attr(feature = "bindings", derive(TS))]
30pub struct Span {
31 pub start: usize,
33 pub end: usize,
35 pub line: usize,
37 pub column: usize,
39}
40
41impl Span {
42 pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
43 Self {
44 start,
45 end,
46 line,
47 column,
48 }
49 }
50}
51
52#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
54pub struct Token {
55 pub token_type: TokenType,
57 pub text: String,
59 pub span: Span,
61 #[serde(default)]
63 pub comments: Vec<String>,
64 #[serde(default)]
66 pub trailing_comments: Vec<String>,
67}
68
69impl Token {
70 pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
72 Self {
73 token_type,
74 text: text.into(),
75 span,
76 comments: Vec::new(),
77 trailing_comments: Vec::new(),
78 }
79 }
80
81 pub fn number(n: i64) -> Self {
83 Self::new(TokenType::Number, n.to_string(), Span::default())
84 }
85
86 pub fn string(s: impl Into<String>) -> Self {
88 Self::new(TokenType::String, s, Span::default())
89 }
90
91 pub fn identifier(s: impl Into<String>) -> Self {
93 Self::new(TokenType::Identifier, s, Span::default())
94 }
95
96 pub fn var(s: impl Into<String>) -> Self {
98 Self::new(TokenType::Var, s, Span::default())
99 }
100
101 pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
103 self.comments.push(comment.into());
104 self
105 }
106}
107
108impl fmt::Display for Token {
109 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110 write!(f, "{:?}({})", self.token_type, self.text)
111 }
112}
113
114#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
116#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
117#[repr(u16)]
118pub enum TokenType {
119 LParen,
121 RParen,
122 LBracket,
123 RBracket,
124 LBrace,
125 RBrace,
126 Comma,
127 Dot,
128 Dash,
129 Plus,
130 Colon,
131 DotColon,
132 DColon,
133 DColonDollar,
134 DColonPercent,
135 DColonQMark,
136 DQMark,
137 Semicolon,
138 Star,
139 Backslash,
140 Slash,
141 Lt,
142 Lte,
143 Gt,
144 Gte,
145 Not,
146 Eq,
147 Neq,
148 NullsafeEq,
149 ColonEq,
150 ColonGt,
151 NColonGt,
152 And,
153 Or,
154 Amp,
155 DPipe,
156 PipeGt,
157 Pipe,
158 PipeSlash,
159 DPipeSlash,
160 Caret,
161 CaretAt,
162 LtLt, GtGt, Tilde,
165 Arrow,
166 DArrow,
167 FArrow,
168 Hash,
169 HashArrow,
170 DHashArrow,
171 LrArrow,
172 DAt,
173 AtAt,
174 AtQMark,
175 LtAt,
176 AtGt,
177 Dollar,
178 Parameter,
179 Session,
180 SessionParameter,
181 SessionUser,
182 DAmp,
183 AmpLt,
184 AmpGt,
185 Adjacent,
186 Xor,
187 DStar,
188 QMarkAmp,
189 QMarkPipe,
190 HashDash,
191 Exclamation,
192
193 UriStart,
194 BlockStart,
195 BlockEnd,
196 Space,
197 Break,
198
199 BlockComment, LineComment, String,
205 DollarString, TripleDoubleQuotedString, TripleSingleQuotedString, Number,
209 Identifier,
210 QuotedIdentifier,
211 Database,
212 Column,
213 ColumnDef,
214 Schema,
215 Table,
216 Warehouse,
217 Stage,
218 Streamlit,
219 Var,
220 BitString,
221 HexString,
222 HexNumber,
224 ByteString,
225 NationalString,
226 EscapeString, RawString,
228 HeredocString,
229 HeredocStringAlternative,
230 UnicodeString,
231
232 Bit,
234 Boolean,
235 TinyInt,
236 UTinyInt,
237 SmallInt,
238 USmallInt,
239 MediumInt,
240 UMediumInt,
241 Int,
242 UInt,
243 BigInt,
244 UBigInt,
245 BigNum,
246 Int128,
247 UInt128,
248 Int256,
249 UInt256,
250 Float,
251 Double,
252 UDouble,
253 Decimal,
254 Decimal32,
255 Decimal64,
256 Decimal128,
257 Decimal256,
258 DecFloat,
259 UDecimal,
260 BigDecimal,
261 Char,
262 NChar,
263 VarChar,
264 NVarChar,
265 BpChar,
266 Text,
267 MediumText,
268 LongText,
269 Blob,
270 MediumBlob,
271 LongBlob,
272 TinyBlob,
273 TinyText,
274 Name,
275 Binary,
276 VarBinary,
277 Json,
278 JsonB,
279 Time,
280 TimeTz,
281 TimeNs,
282 Timestamp,
283 TimestampTz,
284 TimestampLtz,
285 TimestampNtz,
286 TimestampS,
287 TimestampMs,
288 TimestampNs,
289 DateTime,
290 DateTime2,
291 DateTime64,
292 SmallDateTime,
293 Date,
294 Date32,
295 Int4Range,
296 Int4MultiRange,
297 Int8Range,
298 Int8MultiRange,
299 NumRange,
300 NumMultiRange,
301 TsRange,
302 TsMultiRange,
303 TsTzRange,
304 TsTzMultiRange,
305 DateRange,
306 DateMultiRange,
307 Uuid,
308 Geography,
309 GeographyPoint,
310 Nullable,
311 Geometry,
312 Point,
313 Ring,
314 LineString,
315 LocalTime,
316 LocalTimestamp,
317 SysTimestamp,
318 MultiLineString,
319 Polygon,
320 MultiPolygon,
321 HllSketch,
322 HStore,
323 Super,
324 Serial,
325 SmallSerial,
326 BigSerial,
327 Xml,
328 Year,
329 UserDefined,
330 Money,
331 SmallMoney,
332 RowVersion,
333 Image,
334 Variant,
335 Object,
336 Inet,
337 IpAddress,
338 IpPrefix,
339 Ipv4,
340 Ipv6,
341 Enum,
342 Enum8,
343 Enum16,
344 FixedString,
345 LowCardinality,
346 Nested,
347 AggregateFunction,
348 SimpleAggregateFunction,
349 TDigest,
350 Unknown,
351 Vector,
352 Dynamic,
353 Void,
354
355 Add,
357 Alias,
358 Alter,
359 All,
360 Anti,
361 Any,
362 Apply,
363 Array,
364 Asc,
365 AsOf,
366 Attach,
367 AutoIncrement,
368 Begin,
369 Between,
370 BulkCollectInto,
371 Cache,
372 Cascade,
373 Case,
374 CharacterSet,
375 Cluster,
376 ClusterBy,
377 Collate,
378 Command,
379 Comment,
380 Commit,
381 Prepare,
382 Preserve,
383 Connect,
384 ConnectBy,
385 Constraint,
386 Copy,
387 Create,
388 Cross,
389 Cube,
390 CurrentDate,
391 CurrentDateTime,
392 CurrentSchema,
393 CurrentTime,
394 CurrentTimestamp,
395 CurrentUser,
396 CurrentRole,
397 CurrentCatalog,
398 Declare,
399 Default,
400 Delete,
401 Desc,
402 Describe,
403 Detach,
404 Dictionary,
405 Distinct,
406 Distribute,
407 DistributeBy,
408 Div,
409 Drop,
410 Else,
411 End,
412 Escape,
413 Except,
414 Execute,
415 Exists,
416 False,
417 Fetch,
418 File,
419 FileFormat,
420 Filter,
421 Final,
422 First,
423 For,
424 Force,
425 ForeignKey,
426 Format,
427 From,
428 Full,
429 Function,
430 Get,
431 Glob,
432 Global,
433 Grant,
434 GroupBy,
435 GroupingSets,
436 Having,
437 Hint,
438 Ignore,
439 ILike,
440 In,
441 Index,
442 IndexedBy,
443 Inner,
444 Input,
445 Insert,
446 Install,
447 Intersect,
448 Interval,
449 Into,
450 Inpath,
451 InputFormat,
452 Introducer,
453 IRLike,
454 Is,
455 IsNull,
456 Join,
457 JoinMarker,
458 Keep,
459 Key,
460 Kill,
461 Lambda,
462 Language,
463 Lateral,
464 Left,
465 Like,
466 NotLike, NotILike, NotRLike, NotIRLike, Limit,
471 List,
472 Load,
473 Local,
474 Lock,
475 Map,
476 Match,
477 MatchCondition,
478 MatchRecognize,
479 MemberOf,
480 Materialized,
481 Merge,
482 Mod,
483 Model,
484 Natural,
485 Next,
486 NoAction,
487 Nothing,
488 NotNull,
489 Null,
490 ObjectIdentifier,
491 Offset,
492 On,
493 Only,
494 Operator,
495 OrderBy,
496 OrderSiblingsBy,
497 Ordered,
498 Ordinality,
499 Out,
500 Outer,
501 Output,
502 Over,
503 Overlaps,
504 Overwrite,
505 Partition,
506 PartitionBy,
507 Percent,
508 Pivot,
509 Placeholder,
510 Positional,
511 Pragma,
512 Prewhere,
513 PrimaryKey,
514 Procedure,
515 Properties,
516 PseudoType,
517 Put,
518 Qualify,
519 Quote,
520 QDColon,
521 Range,
522 Recursive,
523 Refresh,
524 Rename,
525 Replace,
526 Returning,
527 Revoke,
528 References,
529 Restrict,
530 Right,
531 RLike,
532 Rollback,
533 Rollup,
534 Row,
535 Rows,
536 Select,
537 Semi,
538 Savepoint,
539 Separator,
540 Sequence,
541 Serde,
542 SerdeProperties,
543 Set,
544 Settings,
545 Show,
546 Siblings,
547 SimilarTo,
548 Some,
549 Sort,
550 SortBy,
551 SoundsLike,
552 StartWith,
553 StorageIntegration,
554 StraightJoin,
555 Struct,
556 Summarize,
557 TableSample,
558 Sample,
559 Bernoulli,
560 System,
561 Block,
562 Seed,
563 Repeatable,
564 Tag,
565 Temporary,
566 Transaction,
567 To,
568 Top,
569 Then,
570 True,
571 Truncate,
572 Uncache,
573 Union,
574 Unnest,
575 Unpivot,
576 Update,
577 Use,
578 Using,
579 Values,
580 View,
581 SemanticView,
582 Volatile,
583 When,
584 Where,
585 Window,
586 With,
587 Ties,
588 Exclude,
589 No,
590 Others,
591 Unique,
592 UtcDate,
593 UtcTime,
594 UtcTimestamp,
595 VersionSnapshot,
596 TimestampSnapshot,
597 Option,
598 Sink,
599 Source,
600 Analyze,
601 Namespace,
602 Export,
603 As,
604 By,
605 Nulls,
606 Respect,
607 Last,
608 If,
609 Cast,
610 TryCast,
611 SafeCast,
612 Count,
613 Extract,
614 Substring,
615 Trim,
616 Leading,
617 Trailing,
618 Both,
619 Position,
620 Overlaying,
621 Placing,
622 Treat,
623 Within,
624 Group,
625 Order,
626
627 Unbounded,
629 Preceding,
630 Following,
631 Current,
632 Groups,
633
634 Trigger,
636 Type,
637 Domain,
638 Returns,
639 Body,
640 Increment,
641 Minvalue,
642 Maxvalue,
643 Start,
644 Cycle,
645 NoCycle,
646 Prior,
647 Generated,
648 Identity,
649 Always,
650 Measures,
652 Pattern,
653 Define,
654 Running,
655 Owned,
656 After,
657 Before,
658 Instead,
659 Each,
660 Statement,
661 Referencing,
662 Old,
663 New,
664 Of,
665 Check,
666 Authorization,
667 Restart,
668
669 Eof,
671}
672
673impl TokenType {
674 pub fn is_keyword(&self) -> bool {
676 matches!(
677 self,
678 TokenType::Select
679 | TokenType::From
680 | TokenType::Where
681 | TokenType::And
682 | TokenType::Or
683 | TokenType::Not
684 | TokenType::In
685 | TokenType::Is
686 | TokenType::Null
687 | TokenType::True
688 | TokenType::False
689 | TokenType::As
690 | TokenType::On
691 | TokenType::Join
692 | TokenType::Left
693 | TokenType::Right
694 | TokenType::Inner
695 | TokenType::Outer
696 | TokenType::Full
697 | TokenType::Cross
698 | TokenType::Semi
699 | TokenType::Anti
700 | TokenType::Union
701 | TokenType::Except
702 | TokenType::Intersect
703 | TokenType::GroupBy
704 | TokenType::OrderBy
705 | TokenType::Having
706 | TokenType::Limit
707 | TokenType::Offset
708 | TokenType::Case
709 | TokenType::When
710 | TokenType::Then
711 | TokenType::Else
712 | TokenType::End
713 | TokenType::Create
714 | TokenType::Drop
715 | TokenType::Alter
716 | TokenType::Insert
717 | TokenType::Update
718 | TokenType::Delete
719 | TokenType::Into
720 | TokenType::Values
721 | TokenType::Set
722 | TokenType::With
723 | TokenType::Distinct
724 | TokenType::All
725 | TokenType::Exists
726 | TokenType::Between
727 | TokenType::Like
728 | TokenType::ILike
729 | TokenType::Filter
731 | TokenType::Date
732 | TokenType::Timestamp
733 | TokenType::TimestampTz
734 | TokenType::Interval
735 | TokenType::Time
736 | TokenType::Table
737 | TokenType::Index
738 | TokenType::Column
739 | TokenType::Database
740 | TokenType::Schema
741 | TokenType::View
742 | TokenType::Function
743 | TokenType::Procedure
744 | TokenType::Trigger
745 | TokenType::Sequence
746 | TokenType::Over
747 | TokenType::Partition
748 | TokenType::Window
749 | TokenType::Rows
750 | TokenType::Range
751 | TokenType::First
752 | TokenType::Last
753 | TokenType::Preceding
754 | TokenType::Following
755 | TokenType::Current
756 | TokenType::Row
757 | TokenType::Unbounded
758 | TokenType::Array
759 | TokenType::Struct
760 | TokenType::Map
761 | TokenType::PrimaryKey
762 | TokenType::Key
763 | TokenType::ForeignKey
764 | TokenType::References
765 | TokenType::Unique
766 | TokenType::Check
767 | TokenType::Default
768 | TokenType::Constraint
769 | TokenType::Comment
770 | TokenType::Rollup
771 | TokenType::Cube
772 | TokenType::Grant
773 | TokenType::Revoke
774 | TokenType::Type
775 | TokenType::Use
776 | TokenType::Cache
777 | TokenType::Uncache
778 | TokenType::Load
779 | TokenType::Any
780 | TokenType::Some
781 | TokenType::Asc
782 | TokenType::Desc
783 | TokenType::Nulls
784 | TokenType::Lateral
785 | TokenType::Natural
786 | TokenType::Escape
787 | TokenType::Glob
788 | TokenType::Match
789 | TokenType::Recursive
790 | TokenType::Replace
791 | TokenType::Returns
792 | TokenType::If
793 | TokenType::Pivot
794 | TokenType::Unpivot
795 | TokenType::Json
796 | TokenType::Blob
797 | TokenType::Text
798 | TokenType::Int
799 | TokenType::BigInt
800 | TokenType::SmallInt
801 | TokenType::TinyInt
802 | TokenType::Int128
803 | TokenType::UInt128
804 | TokenType::Int256
805 | TokenType::UInt256
806 | TokenType::UInt
807 | TokenType::UBigInt
808 | TokenType::Float
809 | TokenType::Double
810 | TokenType::Decimal
811 | TokenType::Boolean
812 | TokenType::VarChar
813 | TokenType::Char
814 | TokenType::Binary
815 | TokenType::VarBinary
816 | TokenType::No
817 | TokenType::DateTime
818 | TokenType::Truncate
819 | TokenType::Execute
820 | TokenType::Merge
821 | TokenType::Top
822 | TokenType::Begin
823 | TokenType::Generated
824 | TokenType::Identity
825 | TokenType::Always
826 | TokenType::Extract
827 | TokenType::AsOf
829 | TokenType::Prior
830 | TokenType::After
831 | TokenType::Restrict
832 | TokenType::Cascade
833 | TokenType::Local
834 | TokenType::Rename
835 | TokenType::Enum
836 | TokenType::Within
837 | TokenType::Format
838 | TokenType::Final
839 | TokenType::FileFormat
840 | TokenType::Input
841 | TokenType::InputFormat
842 | TokenType::Copy
843 | TokenType::Put
844 | TokenType::Get
845 | TokenType::Show
846 | TokenType::Serde
847 | TokenType::Sample
848 | TokenType::Sort
849 | TokenType::Collate
850 | TokenType::Ties
851 | TokenType::IsNull
852 | TokenType::NotNull
853 | TokenType::Exclude
854 | TokenType::Temporary
855 | TokenType::Add
856 | TokenType::Ordinality
857 | TokenType::Overlaps
858 | TokenType::Block
859 | TokenType::Pattern
860 | TokenType::Group
861 | TokenType::Cluster
862 | TokenType::Repeatable
863 | TokenType::Groups
864 | TokenType::Commit
865 | TokenType::Warehouse
866 | TokenType::System
867 | TokenType::By
868 | TokenType::To
869 | TokenType::Fetch
870 | TokenType::For
871 | TokenType::Only
872 | TokenType::Next
873 | TokenType::Lock
874 | TokenType::Refresh
875 | TokenType::Settings
876 | TokenType::Operator
877 | TokenType::Overwrite
878 | TokenType::StraightJoin
879 | TokenType::Start
880 | TokenType::Ignore
882 | TokenType::Domain
883 | TokenType::Apply
884 | TokenType::Respect
885 | TokenType::Materialized
886 | TokenType::Prewhere
887 | TokenType::Old
888 | TokenType::New
889 | TokenType::Cast
890 | TokenType::TryCast
891 | TokenType::SafeCast
892 | TokenType::Transaction
893 | TokenType::Describe
894 | TokenType::Kill
895 | TokenType::Lambda
896 | TokenType::Declare
897 | TokenType::Keep
898 | TokenType::Output
899 | TokenType::Percent
900 | TokenType::Qualify
901 | TokenType::Returning
902 | TokenType::Language
903 | TokenType::Prepare
904 | TokenType::Preserve
905 | TokenType::Savepoint
906 | TokenType::Rollback
907 | TokenType::Body
908 | TokenType::Increment
909 | TokenType::Minvalue
910 | TokenType::Maxvalue
911 | TokenType::Cycle
912 | TokenType::NoCycle
913 | TokenType::Seed
914 | TokenType::Namespace
915 | TokenType::Authorization
916 | TokenType::Order
917 | TokenType::Restart
918 | TokenType::Before
919 | TokenType::Instead
920 | TokenType::Each
921 | TokenType::Statement
922 | TokenType::Referencing
923 | TokenType::Of
924 | TokenType::Separator
925 | TokenType::Others
926 | TokenType::Placing
927 | TokenType::Owned
928 | TokenType::Running
929 | TokenType::Define
930 | TokenType::Measures
931 | TokenType::MatchRecognize
932 | TokenType::AutoIncrement
933 | TokenType::Connect
934 | TokenType::Distribute
935 | TokenType::Bernoulli
936 | TokenType::TableSample
937 | TokenType::Inpath
938 | TokenType::Pragma
939 | TokenType::Siblings
940 | TokenType::SerdeProperties
941 | TokenType::RLike
942 )
943 }
944
945 pub fn is_comparison(&self) -> bool {
947 matches!(
948 self,
949 TokenType::Eq
950 | TokenType::Neq
951 | TokenType::Lt
952 | TokenType::Lte
953 | TokenType::Gt
954 | TokenType::Gte
955 | TokenType::NullsafeEq
956 )
957 }
958
959 pub fn is_arithmetic(&self) -> bool {
961 matches!(
962 self,
963 TokenType::Plus
964 | TokenType::Dash
965 | TokenType::Star
966 | TokenType::Slash
967 | TokenType::Percent
968 | TokenType::Mod
969 | TokenType::Div
970 )
971 }
972}
973
974impl fmt::Display for TokenType {
975 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
976 write!(f, "{:?}", self)
977 }
978}
979
980static DEFAULT_KEYWORDS: LazyLock<HashMap<String, TokenType>> = LazyLock::new(|| {
983 let mut keywords = HashMap::with_capacity(300);
984 keywords.insert("SELECT".to_string(), TokenType::Select);
986 keywords.insert("FROM".to_string(), TokenType::From);
987 keywords.insert("WHERE".to_string(), TokenType::Where);
988 keywords.insert("AND".to_string(), TokenType::And);
989 keywords.insert("OR".to_string(), TokenType::Or);
990 keywords.insert("NOT".to_string(), TokenType::Not);
991 keywords.insert("AS".to_string(), TokenType::As);
992 keywords.insert("ON".to_string(), TokenType::On);
993 keywords.insert("JOIN".to_string(), TokenType::Join);
994 keywords.insert("LEFT".to_string(), TokenType::Left);
995 keywords.insert("RIGHT".to_string(), TokenType::Right);
996 keywords.insert("INNER".to_string(), TokenType::Inner);
997 keywords.insert("OUTER".to_string(), TokenType::Outer);
998 keywords.insert("OUTPUT".to_string(), TokenType::Output);
999 keywords.insert("FULL".to_string(), TokenType::Full);
1000 keywords.insert("CROSS".to_string(), TokenType::Cross);
1001 keywords.insert("SEMI".to_string(), TokenType::Semi);
1002 keywords.insert("ANTI".to_string(), TokenType::Anti);
1003 keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1004 keywords.insert("UNION".to_string(), TokenType::Union);
1005 keywords.insert("EXCEPT".to_string(), TokenType::Except);
1006 keywords.insert("MINUS".to_string(), TokenType::Except); keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1008 keywords.insert("GROUP".to_string(), TokenType::Group);
1009 keywords.insert("CUBE".to_string(), TokenType::Cube);
1010 keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1011 keywords.insert("WITHIN".to_string(), TokenType::Within);
1012 keywords.insert("ORDER".to_string(), TokenType::Order);
1013 keywords.insert("BY".to_string(), TokenType::By);
1014 keywords.insert("HAVING".to_string(), TokenType::Having);
1015 keywords.insert("LIMIT".to_string(), TokenType::Limit);
1016 keywords.insert("OFFSET".to_string(), TokenType::Offset);
1017 keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1018 keywords.insert("FETCH".to_string(), TokenType::Fetch);
1019 keywords.insert("FIRST".to_string(), TokenType::First);
1020 keywords.insert("NEXT".to_string(), TokenType::Next);
1021 keywords.insert("ONLY".to_string(), TokenType::Only);
1022 keywords.insert("KEEP".to_string(), TokenType::Keep);
1023 keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1024 keywords.insert("INPUT".to_string(), TokenType::Input);
1025 keywords.insert("CASE".to_string(), TokenType::Case);
1026 keywords.insert("WHEN".to_string(), TokenType::When);
1027 keywords.insert("THEN".to_string(), TokenType::Then);
1028 keywords.insert("ELSE".to_string(), TokenType::Else);
1029 keywords.insert("END".to_string(), TokenType::End);
1030 keywords.insert("ENDIF".to_string(), TokenType::End); keywords.insert("NULL".to_string(), TokenType::Null);
1032 keywords.insert("TRUE".to_string(), TokenType::True);
1033 keywords.insert("FALSE".to_string(), TokenType::False);
1034 keywords.insert("IS".to_string(), TokenType::Is);
1035 keywords.insert("IN".to_string(), TokenType::In);
1036 keywords.insert("BETWEEN".to_string(), TokenType::Between);
1037 keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1038 keywords.insert("LIKE".to_string(), TokenType::Like);
1039 keywords.insert("ILIKE".to_string(), TokenType::ILike);
1040 keywords.insert("RLIKE".to_string(), TokenType::RLike);
1041 keywords.insert("REGEXP".to_string(), TokenType::RLike);
1042 keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1043 keywords.insert("EXISTS".to_string(), TokenType::Exists);
1044 keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1045 keywords.insert("ALL".to_string(), TokenType::All);
1046 keywords.insert("WITH".to_string(), TokenType::With);
1047 keywords.insert("CREATE".to_string(), TokenType::Create);
1048 keywords.insert("DROP".to_string(), TokenType::Drop);
1049 keywords.insert("ALTER".to_string(), TokenType::Alter);
1050 keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1051 keywords.insert("TABLE".to_string(), TokenType::Table);
1052 keywords.insert("VIEW".to_string(), TokenType::View);
1053 keywords.insert("INDEX".to_string(), TokenType::Index);
1054 keywords.insert("COLUMN".to_string(), TokenType::Column);
1055 keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1056 keywords.insert("ADD".to_string(), TokenType::Add);
1057 keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1058 keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1059 keywords.insert("RENAME".to_string(), TokenType::Rename);
1060 keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1061 keywords.insert("TEMP".to_string(), TokenType::Temporary);
1062 keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1063 keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1064 keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1065 keywords.insert("KEY".to_string(), TokenType::Key);
1066 keywords.insert("KILL".to_string(), TokenType::Kill);
1067 keywords.insert("REFERENCES".to_string(), TokenType::References);
1068 keywords.insert("DEFAULT".to_string(), TokenType::Default);
1069 keywords.insert("DECLARE".to_string(), TokenType::Declare);
1070 keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1071 keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1073 keywords.insert("REPLACE".to_string(), TokenType::Replace);
1074 keywords.insert("TO".to_string(), TokenType::To);
1075 keywords.insert("INSERT".to_string(), TokenType::Insert);
1076 keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1077 keywords.insert("UPDATE".to_string(), TokenType::Update);
1078 keywords.insert("USE".to_string(), TokenType::Use);
1079 keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1080 keywords.insert("GLOB".to_string(), TokenType::Glob);
1081 keywords.insert("DELETE".to_string(), TokenType::Delete);
1082 keywords.insert("MERGE".to_string(), TokenType::Merge);
1083 keywords.insert("CACHE".to_string(), TokenType::Cache);
1084 keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1085 keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1086 keywords.insert("GRANT".to_string(), TokenType::Grant);
1087 keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1088 keywords.insert("COMMENT".to_string(), TokenType::Comment);
1089 keywords.insert("COLLATE".to_string(), TokenType::Collate);
1090 keywords.insert("INTO".to_string(), TokenType::Into);
1091 keywords.insert("VALUES".to_string(), TokenType::Values);
1092 keywords.insert("SET".to_string(), TokenType::Set);
1093 keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1094 keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1095 keywords.insert("ASC".to_string(), TokenType::Asc);
1096 keywords.insert("DESC".to_string(), TokenType::Desc);
1097 keywords.insert("NULLS".to_string(), TokenType::Nulls);
1098 keywords.insert("RESPECT".to_string(), TokenType::Respect);
1099 keywords.insert("FIRST".to_string(), TokenType::First);
1100 keywords.insert("LAST".to_string(), TokenType::Last);
1101 keywords.insert("IF".to_string(), TokenType::If);
1102 keywords.insert("CAST".to_string(), TokenType::Cast);
1103 keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1104 keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1105 keywords.insert("OVER".to_string(), TokenType::Over);
1106 keywords.insert("PARTITION".to_string(), TokenType::Partition);
1107 keywords.insert("PLACING".to_string(), TokenType::Placing);
1108 keywords.insert("WINDOW".to_string(), TokenType::Window);
1109 keywords.insert("ROWS".to_string(), TokenType::Rows);
1110 keywords.insert("RANGE".to_string(), TokenType::Range);
1111 keywords.insert("FILTER".to_string(), TokenType::Filter);
1112 keywords.insert("NATURAL".to_string(), TokenType::Natural);
1113 keywords.insert("USING".to_string(), TokenType::Using);
1114 keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1115 keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1116 keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1117 keywords.insert("CURRENT".to_string(), TokenType::Current);
1118 keywords.insert("ROW".to_string(), TokenType::Row);
1119 keywords.insert("GROUPS".to_string(), TokenType::Groups);
1120 keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1121 keywords.insert("BOTH".to_string(), TokenType::Both);
1123 keywords.insert("LEADING".to_string(), TokenType::Leading);
1124 keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1125 keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1126 keywords.insert("TOP".to_string(), TokenType::Top);
1128 keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1129 keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1130 keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1131 keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1132 keywords.insert("SYSTEM".to_string(), TokenType::System);
1133 keywords.insert("BLOCK".to_string(), TokenType::Block);
1134 keywords.insert("SEED".to_string(), TokenType::Seed);
1135 keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1136 keywords.insert("TIES".to_string(), TokenType::Ties);
1137 keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1138 keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1139 keywords.insert("APPLY".to_string(), TokenType::Apply);
1140 keywords.insert("CONNECT".to_string(), TokenType::Connect);
1142 keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1144 keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1145 keywords.insert("SORT".to_string(), TokenType::Sort);
1146 keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1147 keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1148 keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1149 keywords.insert("FOR".to_string(), TokenType::For);
1150 keywords.insert("ANY".to_string(), TokenType::Any);
1151 keywords.insert("SOME".to_string(), TokenType::Some);
1152 keywords.insert("ASOF".to_string(), TokenType::AsOf);
1153 keywords.insert("PERCENT".to_string(), TokenType::Percent);
1154 keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1155 keywords.insert("NO".to_string(), TokenType::No);
1156 keywords.insert("OTHERS".to_string(), TokenType::Others);
1157 keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1159 keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1161 keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1162 keywords.insert("DATABASE".to_string(), TokenType::Database);
1163 keywords.insert("FUNCTION".to_string(), TokenType::Function);
1164 keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1165 keywords.insert("PROC".to_string(), TokenType::Procedure);
1166 keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1167 keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1168 keywords.insert("TYPE".to_string(), TokenType::Type);
1169 keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1170 keywords.insert("RETURNS".to_string(), TokenType::Returns);
1171 keywords.insert("RETURNING".to_string(), TokenType::Returning);
1172 keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1173 keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1174 keywords.insert("COMMIT".to_string(), TokenType::Commit);
1175 keywords.insert("BEGIN".to_string(), TokenType::Begin);
1176 keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1177 keywords.insert("PREPARE".to_string(), TokenType::Prepare);
1178 keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1179 keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1180 keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1181 keywords.insert("BODY".to_string(), TokenType::Body);
1182 keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1183 keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1184 keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1185 keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1186 keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1187 keywords.insert("PRIOR".to_string(), TokenType::Prior);
1188 keywords.insert("MATCH".to_string(), TokenType::Match);
1190 keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1191 keywords.insert("MEASURES".to_string(), TokenType::Measures);
1192 keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1193 keywords.insert("DEFINE".to_string(), TokenType::Define);
1194 keywords.insert("RUNNING".to_string(), TokenType::Running);
1195 keywords.insert("FINAL".to_string(), TokenType::Final);
1196 keywords.insert("OWNED".to_string(), TokenType::Owned);
1197 keywords.insert("AFTER".to_string(), TokenType::After);
1198 keywords.insert("BEFORE".to_string(), TokenType::Before);
1199 keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1200 keywords.insert("EACH".to_string(), TokenType::Each);
1201 keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1202 keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1203 keywords.insert("OLD".to_string(), TokenType::Old);
1204 keywords.insert("NEW".to_string(), TokenType::New);
1205 keywords.insert("OF".to_string(), TokenType::Of);
1206 keywords.insert("CHECK".to_string(), TokenType::Check);
1207 keywords.insert("START".to_string(), TokenType::Start);
1208 keywords.insert("ENUM".to_string(), TokenType::Enum);
1209 keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1210 keywords.insert("RESTART".to_string(), TokenType::Restart);
1211 keywords.insert("DATE".to_string(), TokenType::Date);
1213 keywords.insert("TIME".to_string(), TokenType::Time);
1214 keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1215 keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1216 keywords.insert("GENERATED".to_string(), TokenType::Generated);
1217 keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1218 keywords.insert("ALWAYS".to_string(), TokenType::Always);
1219 keywords.insert("LOAD".to_string(), TokenType::Load);
1221 keywords.insert("LOCAL".to_string(), TokenType::Local);
1222 keywords.insert("INPATH".to_string(), TokenType::Inpath);
1223 keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1224 keywords.insert("SERDE".to_string(), TokenType::Serde);
1225 keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1226 keywords.insert("FORMAT".to_string(), TokenType::Format);
1227 keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1229 keywords.insert("SHOW".to_string(), TokenType::Show);
1231 keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1233 keywords.insert("COPY".to_string(), TokenType::Copy);
1235 keywords.insert("PUT".to_string(), TokenType::Put);
1236 keywords.insert("GET".to_string(), TokenType::Get);
1237 keywords.insert("EXEC".to_string(), TokenType::Execute);
1239 keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1240 keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1242 keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1243 keywords
1244});
1245
1246static DEFAULT_SINGLE_TOKENS: LazyLock<HashMap<char, TokenType>> = LazyLock::new(|| {
1247 let mut single_tokens = HashMap::with_capacity(30);
1248 single_tokens.insert('(', TokenType::LParen);
1249 single_tokens.insert(')', TokenType::RParen);
1250 single_tokens.insert('[', TokenType::LBracket);
1251 single_tokens.insert(']', TokenType::RBracket);
1252 single_tokens.insert('{', TokenType::LBrace);
1253 single_tokens.insert('}', TokenType::RBrace);
1254 single_tokens.insert(',', TokenType::Comma);
1255 single_tokens.insert('.', TokenType::Dot);
1256 single_tokens.insert(';', TokenType::Semicolon);
1257 single_tokens.insert('+', TokenType::Plus);
1258 single_tokens.insert('-', TokenType::Dash);
1259 single_tokens.insert('*', TokenType::Star);
1260 single_tokens.insert('/', TokenType::Slash);
1261 single_tokens.insert('%', TokenType::Percent);
1262 single_tokens.insert('&', TokenType::Amp);
1263 single_tokens.insert('|', TokenType::Pipe);
1264 single_tokens.insert('^', TokenType::Caret);
1265 single_tokens.insert('~', TokenType::Tilde);
1266 single_tokens.insert('<', TokenType::Lt);
1267 single_tokens.insert('>', TokenType::Gt);
1268 single_tokens.insert('=', TokenType::Eq);
1269 single_tokens.insert('!', TokenType::Exclamation);
1270 single_tokens.insert(':', TokenType::Colon);
1271 single_tokens.insert('@', TokenType::DAt);
1272 single_tokens.insert('#', TokenType::Hash);
1273 single_tokens.insert('$', TokenType::Dollar);
1274 single_tokens.insert('?', TokenType::Parameter);
1275 single_tokens
1276});
1277
/// Default string delimiters. Each entry maps a delimiter to itself: the
/// single quote and the triple double quote.
static DEFAULT_QUOTES: LazyLock<HashMap<String, String>> = LazyLock::new(|| {
    ["'", "\"\"\""]
        .iter()
        .map(|&delim| (delim.to_owned(), delim.to_owned()))
        .collect()
});
1285
/// Default identifier quote characters. Each opening character maps to the
/// identical closing character: double quote and backtick.
static DEFAULT_IDENTIFIERS: LazyLock<HashMap<char, char>> =
    LazyLock::new(|| ['"', '`'].iter().map(|&quote| (quote, quote)).collect());
1294
/// Default comment delimiters: start marker mapped to an optional end marker.
/// `--` has no end marker; `/*` is closed by `*/`.
static DEFAULT_COMMENTS: LazyLock<HashMap<String, Option<String>>> = LazyLock::new(|| {
    HashMap::from([
        ("--".to_string(), None),
        ("/*".to_string(), Some("*/".to_string())),
    ])
});
1301
/// Settings that control how a [`Tokenizer`] splits SQL text into tokens.
///
/// `TokenizerConfig::default()` starts from the `DEFAULT_*` lookup tables with
/// every optional behavior switched off, except `nested_comments` and
/// `string_escapes_allowed_in_raw_strings`, which default to `true`.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keyword text mapped to its token type (defaults to a copy of
    /// `DEFAULT_KEYWORDS`).
    pub keywords: HashMap<String, TokenType>,
    /// Single-character tokens; `scan_token` looks the current character up
    /// here once the multi-character checks have failed.
    pub single_tokens: HashMap<char, TokenType>,
    /// String delimiters. `scan_token` tests for the keys `'''`, `"""` and `"`
    /// to decide whether triple-quoted / double-quoted strings are enabled.
    /// NOTE(review): values are presumably the matching closing delimiter.
    pub quotes: HashMap<String, String>,
    /// Identifier quote characters: opening character mapped to the closing
    /// character handed to `scan_quoted_identifier`.
    pub identifiers: HashMap<char, char>,
    /// Comment delimiters: start marker mapped to an optional end marker.
    /// `skip_whitespace` consults this map for the `//` key; `--` and `/*`
    /// are recognized there unconditionally.
    pub comments: HashMap<String, Option<String>>,
    /// Escape characters accepted inside strings. Backslash escape sequences
    /// are only decoded when this contains `\`.
    pub string_escapes: Vec<char>,
    /// When `true`, `/*` inside a block comment opens a nested comment that
    /// needs its own `*/`.
    pub nested_comments: bool,
    /// When non-empty, an unrecognized `\c` escape yields just `c`; when
    /// empty the backslash is kept as well. Only emptiness is checked by the
    /// string scanners visible here.
    pub escape_follow_chars: Vec<char>,
    /// NOTE(review): not read in this part of the file; presumably makes a
    /// `b'...'` prefix denote a byte string — confirm against the identifier
    /// scanner.
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes (looked up upper-cased, one or two characters)
    /// mapped to a type name; consulted by `scan_number`.
    pub numeric_literals: HashMap<String, String>,
    /// NOTE(review): not read in this part of the file; presumably allows
    /// identifiers such as `1abc` — confirm.
    pub identifiers_can_start_with_digit: bool,
    /// Enables `0x…` hexadecimal literal scanning in `scan_number`.
    pub hex_number_strings: bool,
    /// When `true`, a `0x…` literal becomes a `HexNumber` token instead of a
    /// `HexString` token.
    pub hex_string_is_integer_type: bool,
    /// NOTE(review): not read in this part of the file; presumably controls
    /// escape processing inside raw strings — confirm.
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Treat `#` and `//` as line-comment starts in `skip_whitespace`.
    pub hash_comments: bool,
    /// Scan `$`-prefixed text with `scan_dollar_identifier` when it is not a
    /// dollar-quoted string or positional parameter.
    pub dollar_sign_is_identifier: bool,
    /// After each token, try `try_scan_insert_format_raw_data` and emit any
    /// non-empty result as a `Var` token.
    pub insert_format_raw_data: bool,
    /// Strip `_` separators from the text of numeric tokens.
    pub numbers_can_be_underscore_separated: bool,
    /// In `scan_string`, treat `\'` as a literal backslash followed by the
    /// closing quote when no further `'` exists in the input.
    pub recover_terminal_backslash_quote: bool,
    /// In `scan_string`, emit the partial string instead of failing when the
    /// input ends before the closing quote.
    pub recover_unterminated_string: bool,
}
1370
1371impl Default for TokenizerConfig {
1372 fn default() -> Self {
1373 Self {
1374 keywords: DEFAULT_KEYWORDS.clone(),
1375 single_tokens: DEFAULT_SINGLE_TOKENS.clone(),
1376 quotes: DEFAULT_QUOTES.clone(),
1377 identifiers: DEFAULT_IDENTIFIERS.clone(),
1378 comments: DEFAULT_COMMENTS.clone(),
1379 string_escapes: vec!['\''],
1382 nested_comments: true,
1383 escape_follow_chars: vec![],
1385 b_prefix_is_byte_string: false,
1387 numeric_literals: HashMap::new(),
1388 identifiers_can_start_with_digit: false,
1389 hex_number_strings: false,
1390 hex_string_is_integer_type: false,
1391 string_escapes_allowed_in_raw_strings: true,
1394 hash_comments: false,
1395 dollar_sign_is_identifier: false,
1396 insert_format_raw_data: false,
1397 numbers_can_be_underscore_separated: false,
1398 recover_terminal_backslash_quote: false,
1399 recover_unterminated_string: false,
1400 }
1401 }
1402}
1403
/// SQL tokenizer: holds a [`TokenizerConfig`] and turns SQL text into tokens
/// via [`Tokenizer::tokenize`].
pub struct Tokenizer {
    /// Settings applied to every `tokenize` call.
    config: TokenizerConfig,
}
1408
1409impl Tokenizer {
1410 pub fn new(config: TokenizerConfig) -> Self {
1412 Self { config }
1413 }
1414
1415 pub fn default_config() -> Self {
1417 Self::new(TokenizerConfig::default())
1418 }
1419
1420 pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1422 let mut state = TokenizerState::new(sql, &self.config);
1423 state.tokenize()
1424 }
1425}
1426
1427impl Default for Tokenizer {
1428 fn default() -> Self {
1429 Self::default_config()
1430 }
1431}
1432
/// Mutable scanning state for a single `tokenize` run over one SQL string.
struct TokenizerState<'a> {
    /// The original SQL text.
    source: &'a str,
    /// Whether `source` is pure ASCII; lets `text_from_range` slice `source`
    /// directly because char and byte offsets then coincide.
    source_is_ascii: bool,
    /// `source` decoded into chars; all positions below index into this.
    chars: Vec<char>,
    /// Number of chars in `chars`.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Char index where the token currently being scanned starts.
    start: usize,
    /// Char index of the next character to read.
    current: usize,
    /// Current line, starting at 1 and bumped by `advance` on `\n`.
    line: usize,
    /// Current column, starting at 1 and reset to 1 after each `\n`.
    column: usize,
    /// Comments collected but not yet attached to a token.
    comments: Vec<String>,
    /// Tokenizer settings in effect for this run.
    config: &'a TokenizerConfig,
}
1447
1448impl<'a> TokenizerState<'a> {
1449 fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1450 let chars: Vec<char> = sql.chars().collect();
1451 let size = chars.len();
1452 Self {
1453 source: sql,
1454 source_is_ascii: sql.is_ascii(),
1455 chars,
1456 size,
1457 tokens: Vec::new(),
1458 start: 0,
1459 current: 0,
1460 line: 1,
1461 column: 1,
1462 comments: Vec::new(),
1463 config,
1464 }
1465 }
1466
1467 fn tokenize(&mut self) -> Result<Vec<Token>> {
1468 while !self.is_at_end() {
1469 self.skip_whitespace();
1470 if self.is_at_end() {
1471 break;
1472 }
1473
1474 self.start = self.current;
1475 self.scan_token()?;
1476
1477 if self.config.insert_format_raw_data {
1480 if let Some(raw) = self.try_scan_insert_format_raw_data() {
1481 if !raw.is_empty() {
1482 self.start = self.current;
1483 self.add_token_with_text(TokenType::Var, raw);
1484 }
1485 }
1486 }
1487 }
1488
1489 if !self.comments.is_empty() {
1494 if let Some(last) = self.tokens.last_mut() {
1495 last.trailing_comments.extend(self.comments.drain(..));
1496 }
1497 }
1498
1499 Ok(std::mem::take(&mut self.tokens))
1500 }
1501
1502 #[inline]
1503 fn is_at_end(&self) -> bool {
1504 self.current >= self.size
1505 }
1506
1507 #[inline]
1508 fn text_from_range(&self, start: usize, end: usize) -> String {
1509 if self.source_is_ascii {
1510 self.source[start..end].to_string()
1511 } else {
1512 self.chars[start..end].iter().collect()
1513 }
1514 }
1515
1516 #[inline]
1517 fn peek(&self) -> char {
1518 if self.is_at_end() {
1519 '\0'
1520 } else {
1521 self.chars[self.current]
1522 }
1523 }
1524
1525 #[inline]
1526 fn peek_next(&self) -> char {
1527 if self.current + 1 >= self.size {
1528 '\0'
1529 } else {
1530 self.chars[self.current + 1]
1531 }
1532 }
1533
1534 #[inline]
1535 fn advance(&mut self) -> char {
1536 let c = self.peek();
1537 self.current += 1;
1538 if c == '\n' {
1539 self.line += 1;
1540 self.column = 1;
1541 } else {
1542 self.column += 1;
1543 }
1544 c
1545 }
1546
1547 fn skip_whitespace(&mut self) {
1548 let mut saw_newline = false;
1553 while !self.is_at_end() {
1554 let c = self.peek();
1555 match c {
1556 ' ' | '\t' | '\r' => {
1557 self.advance();
1558 }
1559 '\n' => {
1560 saw_newline = true;
1561 self.advance();
1562 }
1563 '\u{00A0}' | '\u{2000}'..='\u{200B}' | '\u{3000}' | '\u{FEFF}' => {
1568 self.advance();
1569 }
1570 '-' if self.peek_next() == '-' => {
1571 self.scan_line_comment(saw_newline);
1572 saw_newline = true;
1574 }
1575 '/' if self.peek_next() == '/' && self.config.hash_comments => {
1576 self.scan_double_slash_comment();
1578 }
1579 '/' if self.peek_next() == '*' => {
1580 if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
1582 break;
1584 }
1585 if self.scan_block_comment(saw_newline).is_err() {
1586 return;
1587 }
1588 }
1590 '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
1591 let prev_non_ws = if self.current > 0 {
1595 let mut i = self.current - 1;
1596 while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
1597 i -= 1;
1598 }
1599 self.chars[i]
1600 } else {
1601 '\0'
1602 };
1603 if prev_non_ws == ':' || prev_non_ws == '/' {
1604 break;
1606 }
1607 self.scan_line_comment(saw_newline);
1608 saw_newline = true;
1610 }
1611 '#' if self.config.hash_comments => {
1612 self.scan_hash_line_comment();
1613 }
1614 _ => break,
1615 }
1616 }
1617 }
1618
1619 fn scan_hash_line_comment(&mut self) {
1620 self.advance(); let start = self.current;
1622 while !self.is_at_end() && self.peek() != '\n' {
1623 self.advance();
1624 }
1625 let comment = self.text_from_range(start, self.current);
1626 let comment_text = comment.trim().to_string();
1627 if let Some(last) = self.tokens.last_mut() {
1628 last.trailing_comments.push(comment_text);
1629 } else {
1630 self.comments.push(comment_text);
1631 }
1632 }
1633
1634 fn scan_double_slash_comment(&mut self) {
1635 self.advance(); self.advance(); let start = self.current;
1638 while !self.is_at_end() && self.peek() != '\n' {
1639 self.advance();
1640 }
1641 let comment = self.text_from_range(start, self.current);
1642 let comment_text = comment.trim().to_string();
1643 if let Some(last) = self.tokens.last_mut() {
1644 last.trailing_comments.push(comment_text);
1645 } else {
1646 self.comments.push(comment_text);
1647 }
1648 }
1649
1650 fn scan_line_comment(&mut self, after_newline: bool) {
1651 self.advance(); self.advance(); let start = self.current;
1654 while !self.is_at_end() && self.peek() != '\n' {
1655 self.advance();
1656 }
1657 let comment_text = self.text_from_range(start, self.current);
1658
1659 if after_newline || self.tokens.is_empty() {
1662 self.comments.push(comment_text);
1663 } else if let Some(last) = self.tokens.last_mut() {
1664 last.trailing_comments.push(comment_text);
1665 }
1666 }
1667
1668 fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
1669 self.advance(); self.advance(); let content_start = self.current;
1672 let mut depth = 1;
1673
1674 while !self.is_at_end() && depth > 0 {
1675 if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
1676 self.advance();
1677 self.advance();
1678 depth += 1;
1679 } else if self.peek() == '*' && self.peek_next() == '/' {
1680 depth -= 1;
1681 if depth > 0 {
1682 self.advance();
1683 self.advance();
1684 }
1685 } else {
1686 self.advance();
1687 }
1688 }
1689
1690 if depth > 0 {
1691 return Err(Error::tokenize(
1692 "Unterminated block comment",
1693 self.line,
1694 self.column,
1695 self.start,
1696 self.current,
1697 ));
1698 }
1699
1700 let content = self.text_from_range(content_start, self.current);
1702 self.advance(); self.advance(); let comment_text = format!("/*{}*/", content);
1707
1708 if after_newline || self.tokens.is_empty() {
1711 self.comments.push(comment_text);
1712 } else if let Some(last) = self.tokens.last_mut() {
1713 last.trailing_comments.push(comment_text);
1714 }
1715
1716 Ok(())
1717 }
1718
1719 fn scan_hint(&mut self) -> Result<()> {
1721 self.advance(); self.advance(); self.advance(); let hint_start = self.current;
1725
1726 while !self.is_at_end() {
1728 if self.peek() == '*' && self.peek_next() == '/' {
1729 break;
1730 }
1731 self.advance();
1732 }
1733
1734 if self.is_at_end() {
1735 return Err(Error::tokenize(
1736 "Unterminated hint comment",
1737 self.line,
1738 self.column,
1739 self.start,
1740 self.current,
1741 ));
1742 }
1743
1744 let hint_text = self.text_from_range(hint_start, self.current);
1745 self.advance(); self.advance(); self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1749
1750 Ok(())
1751 }
1752
1753 fn scan_positional_parameter(&mut self) -> Result<()> {
1755 self.advance(); let start = self.current;
1757
1758 while !self.is_at_end() && self.peek().is_ascii_digit() {
1759 self.advance();
1760 }
1761
1762 let number = self.text_from_range(start, self.current);
1763 self.add_token_with_text(TokenType::Parameter, number);
1764 Ok(())
1765 }
1766
1767 fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
1772 let saved_pos = self.current;
1773
1774 self.advance(); let tag_start = self.current;
1780 while !self.is_at_end()
1781 && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1782 {
1783 self.advance();
1784 }
1785 let tag = self.text_from_range(tag_start, self.current);
1786
1787 if self.is_at_end() || self.peek() != '$' {
1789 self.current = saved_pos;
1791 return Ok(None);
1792 }
1793 self.advance(); let content_start = self.current;
1797 let closing_tag = format!("${}$", tag);
1798 let closing_chars: Vec<char> = closing_tag.chars().collect();
1799
1800 loop {
1801 if self.is_at_end() {
1802 self.current = saved_pos;
1804 return Ok(None);
1805 }
1806
1807 if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1809 let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1810 self.current + j < self.size && self.chars[self.current + j] == ch
1811 });
1812 if matches {
1813 let content = self.text_from_range(content_start, self.current);
1814 for _ in 0..closing_chars.len() {
1816 self.advance();
1817 }
1818 let token_text = format!("{}\x00{}", tag, content);
1820 self.add_token_with_text(TokenType::DollarString, token_text);
1821 return Ok(Some(()));
1822 }
1823 }
1824 self.advance();
1825 }
1826 }
1827
1828 fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1833 self.advance(); self.advance(); let start = self.current;
1838 while !self.is_at_end() {
1839 if self.peek() == '$'
1840 && self.current + 1 < self.size
1841 && self.chars[self.current + 1] == '$'
1842 {
1843 break;
1844 }
1845 self.advance();
1846 }
1847
1848 let content = self.text_from_range(start, self.current);
1849
1850 if !self.is_at_end() {
1851 self.advance(); self.advance(); }
1854
1855 self.add_token_with_text(TokenType::DollarString, content);
1856 Ok(())
1857 }
1858
1859 fn scan_token(&mut self) -> Result<()> {
1860 let c = self.peek();
1861
1862 if c == '\'' {
1864 if self.config.quotes.contains_key("'''")
1866 && self.peek_next() == '\''
1867 && self.current + 2 < self.size
1868 && self.chars[self.current + 2] == '\''
1869 {
1870 return self.scan_triple_quoted_string('\'');
1871 }
1872 return self.scan_string();
1873 }
1874
1875 if c == '"'
1877 && self.config.quotes.contains_key("\"\"\"")
1878 && self.peek_next() == '"'
1879 && self.current + 2 < self.size
1880 && self.chars[self.current + 2] == '"'
1881 {
1882 return self.scan_triple_quoted_string('"');
1883 }
1884
1885 if c == '"'
1888 && self.config.quotes.contains_key("\"")
1889 && !self.config.identifiers.contains_key(&'"')
1890 {
1891 return self.scan_double_quoted_string();
1892 }
1893
1894 if let Some(&end_quote) = self.config.identifiers.get(&c) {
1896 return self.scan_quoted_identifier(end_quote);
1897 }
1898
1899 if c.is_ascii_digit() {
1901 return self.scan_number();
1902 }
1903
1904 if c == '.' && self.peek_next().is_ascii_digit() {
1911 let prev_char = if self.current > 0 {
1912 self.chars[self.current - 1]
1913 } else {
1914 '\0'
1915 };
1916 let is_after_ident = prev_char.is_alphanumeric()
1917 || prev_char == '_'
1918 || prev_char == '`'
1919 || prev_char == '"'
1920 || prev_char == ']'
1921 || prev_char == ')';
1922 if prev_char != '.' && !is_after_ident {
1923 return self.scan_number_starting_with_dot();
1924 }
1925 }
1926
1927 if c == '/'
1929 && self.peek_next() == '*'
1930 && self.current + 2 < self.size
1931 && self.chars[self.current + 2] == '+'
1932 {
1933 return self.scan_hint();
1934 }
1935
1936 if let Some(token_type) = self.try_scan_multi_char_operator() {
1938 self.add_token(token_type);
1939 return Ok(());
1940 }
1941
1942 if c == '$'
1945 && (self.peek_next().is_alphanumeric()
1946 || self.peek_next() == '_'
1947 || !self.peek_next().is_ascii())
1948 {
1949 if let Some(()) = self.try_scan_tagged_dollar_string()? {
1950 return Ok(());
1951 }
1952 if self.config.dollar_sign_is_identifier {
1955 return self.scan_dollar_identifier();
1956 }
1957 }
1958
1959 if c == '$' && self.peek_next() == '$' {
1961 return self.scan_dollar_quoted_string();
1962 }
1963
1964 if c == '$' && self.peek_next().is_ascii_digit() {
1966 return self.scan_positional_parameter();
1967 }
1968
1969 if c == '$' && self.config.dollar_sign_is_identifier {
1971 return self.scan_dollar_identifier();
1972 }
1973
1974 if (c == '#' || c == '@')
1977 && (self.peek_next().is_alphanumeric()
1978 || self.peek_next() == '_'
1979 || self.peek_next() == '#')
1980 {
1981 return self.scan_tsql_identifier();
1982 }
1983
1984 if let Some(&token_type) = self.config.single_tokens.get(&c) {
1986 self.advance();
1987 self.add_token(token_type);
1988 return Ok(());
1989 }
1990
1991 if c == '\u{2212}' {
1993 self.advance();
1994 self.add_token(TokenType::Dash);
1995 return Ok(());
1996 }
1997
1998 if c == '\u{2044}' {
2000 self.advance();
2001 self.add_token(TokenType::Slash);
2002 return Ok(());
2003 }
2004
2005 if c == '\u{2018}' || c == '\u{2019}' {
2007 return self.scan_unicode_quoted_string(c);
2009 }
2010 if c == '\u{201C}' || c == '\u{201D}' {
2011 return self.scan_unicode_quoted_identifier(c);
2013 }
2014
2015 self.scan_identifier_or_keyword()
2017 }
2018
2019 fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
2020 let c = self.peek();
2021 let next = self.peek_next();
2022 let third = if self.current + 2 < self.size {
2023 self.chars[self.current + 2]
2024 } else {
2025 '\0'
2026 };
2027
2028 if c == '-' && next == '|' && third == '-' {
2031 self.advance();
2032 self.advance();
2033 self.advance();
2034 return Some(TokenType::Adjacent);
2035 }
2036
2037 if c == '|' && next == '|' && third == '/' {
2039 self.advance();
2040 self.advance();
2041 self.advance();
2042 return Some(TokenType::DPipeSlash);
2043 }
2044
2045 if c == '#' && next == '>' && third == '>' {
2047 self.advance();
2048 self.advance();
2049 self.advance();
2050 return Some(TokenType::DHashArrow);
2051 }
2052
2053 if c == '-' && next == '>' && third == '>' {
2055 self.advance();
2056 self.advance();
2057 self.advance();
2058 return Some(TokenType::DArrow);
2059 }
2060
2061 if c == '<' && next == '=' && third == '>' {
2063 self.advance();
2064 self.advance();
2065 self.advance();
2066 return Some(TokenType::NullsafeEq);
2067 }
2068
2069 if c == '<' && next == '-' && third == '>' {
2071 self.advance();
2072 self.advance();
2073 self.advance();
2074 return Some(TokenType::LrArrow);
2075 }
2076
2077 if c == '<' && next == '@' {
2079 self.advance();
2080 self.advance();
2081 return Some(TokenType::LtAt);
2082 }
2083
2084 if c == '@' && next == '>' {
2086 self.advance();
2087 self.advance();
2088 return Some(TokenType::AtGt);
2089 }
2090
2091 if c == '~' && next == '~' && third == '~' {
2093 self.advance();
2094 self.advance();
2095 self.advance();
2096 return Some(TokenType::Glob);
2097 }
2098
2099 if c == '~' && next == '~' && third == '*' {
2101 self.advance();
2102 self.advance();
2103 self.advance();
2104 return Some(TokenType::ILike);
2105 }
2106
2107 let fourth = if self.current + 3 < self.size {
2109 self.chars[self.current + 3]
2110 } else {
2111 '\0'
2112 };
2113 if c == '!' && next == '~' && third == '~' && fourth == '*' {
2114 self.advance();
2115 self.advance();
2116 self.advance();
2117 self.advance();
2118 return Some(TokenType::NotILike);
2119 }
2120
2121 if c == '!' && next == '~' && third == '~' {
2123 self.advance();
2124 self.advance();
2125 self.advance();
2126 return Some(TokenType::NotLike);
2127 }
2128
2129 if c == '!' && next == '~' && third == '*' {
2131 self.advance();
2132 self.advance();
2133 self.advance();
2134 return Some(TokenType::NotIRLike);
2135 }
2136
2137 if c == '!' && next == ':' && third == '>' {
2139 self.advance();
2140 self.advance();
2141 self.advance();
2142 return Some(TokenType::NColonGt);
2143 }
2144
2145 if c == '?' && next == ':' && third == ':' {
2147 self.advance();
2148 self.advance();
2149 self.advance();
2150 return Some(TokenType::QDColon);
2151 }
2152
2153 if c == '!' && next == '~' {
2155 self.advance();
2156 self.advance();
2157 return Some(TokenType::NotRLike);
2158 }
2159
2160 if c == '~' && next == '~' {
2162 self.advance();
2163 self.advance();
2164 return Some(TokenType::Like);
2165 }
2166
2167 if c == '~' && next == '*' {
2169 self.advance();
2170 self.advance();
2171 return Some(TokenType::IRLike);
2172 }
2173
2174 if c == ':' && next == ':' && third == '$' {
2177 self.advance();
2178 self.advance();
2179 self.advance();
2180 return Some(TokenType::DColonDollar);
2181 }
2182 if c == ':' && next == ':' && third == '%' {
2183 self.advance();
2184 self.advance();
2185 self.advance();
2186 return Some(TokenType::DColonPercent);
2187 }
2188 if c == ':' && next == ':' && third == '?' {
2189 self.advance();
2190 self.advance();
2191 self.advance();
2192 return Some(TokenType::DColonQMark);
2193 }
2194
2195 let token_type = match (c, next) {
2197 ('.', ':') => Some(TokenType::DotColon),
2198 ('=', '=') => Some(TokenType::Eq), ('<', '=') => Some(TokenType::Lte),
2200 ('>', '=') => Some(TokenType::Gte),
2201 ('!', '=') => Some(TokenType::Neq),
2202 ('<', '>') => Some(TokenType::Neq),
2203 ('^', '=') => Some(TokenType::Neq),
2204 ('<', '<') => Some(TokenType::LtLt),
2205 ('>', '>') => Some(TokenType::GtGt),
2206 ('|', '|') => Some(TokenType::DPipe),
2207 ('|', '/') => Some(TokenType::PipeSlash), (':', ':') => Some(TokenType::DColon),
2209 (':', '=') => Some(TokenType::ColonEq), (':', '>') => Some(TokenType::ColonGt), ('-', '>') => Some(TokenType::Arrow), ('=', '>') => Some(TokenType::FArrow), ('&', '&') => Some(TokenType::DAmp),
2214 ('&', '<') => Some(TokenType::AmpLt), ('&', '>') => Some(TokenType::AmpGt), ('@', '@') => Some(TokenType::AtAt), ('@', '?') => Some(TokenType::AtQMark), ('?', '|') => Some(TokenType::QMarkPipe), ('?', '&') => Some(TokenType::QMarkAmp), ('?', '?') => Some(TokenType::DQMark), ('#', '>') => Some(TokenType::HashArrow), ('#', '-') => Some(TokenType::HashDash), ('^', '@') => Some(TokenType::CaretAt), ('*', '*') => Some(TokenType::DStar), ('|', '>') => Some(TokenType::PipeGt), _ => None,
2227 };
2228
2229 if token_type.is_some() {
2230 self.advance();
2231 self.advance();
2232 }
2233
2234 token_type
2235 }
2236
2237 fn scan_string(&mut self) -> Result<()> {
2238 self.advance(); let mut value = String::new();
2240
2241 while !self.is_at_end() {
2242 let c = self.peek();
2243 if c == '\'' {
2244 if self.peek_next() == '\'' {
2245 value.push('\'');
2247 self.advance();
2248 self.advance();
2249 } else {
2250 break;
2251 }
2252 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2253 if self.config.recover_terminal_backslash_quote
2254 && self.peek_next() == '\''
2255 && !self.chars[self.current + 2..].contains(&'\'')
2256 {
2257 value.push(self.advance());
2258 break;
2259 }
2260
2261 self.advance(); if !self.is_at_end() {
2264 let escaped = self.advance();
2265 match escaped {
2266 'n' => value.push('\n'),
2267 'r' => value.push('\r'),
2268 't' => value.push('\t'),
2269 '0' => value.push('\0'),
2270 'Z' => value.push('\x1A'), 'a' => value.push('\x07'), 'b' => value.push('\x08'), 'f' => value.push('\x0C'), 'v' => value.push('\x0B'), 'x' => {
2276 let mut hex = String::with_capacity(2);
2278 for _ in 0..2 {
2279 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2280 hex.push(self.advance());
2281 }
2282 }
2283 if hex.len() == 2 {
2284 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2285 value.push(byte as char);
2286 } else {
2287 value.push('\\');
2288 value.push('x');
2289 value.push_str(&hex);
2290 }
2291 } else {
2292 value.push('\\');
2294 value.push('x');
2295 value.push_str(&hex);
2296 }
2297 }
2298 '\\' => value.push('\\'),
2299 '\'' => value.push('\''),
2300 '"' => value.push('"'),
2301 '%' => {
2302 value.push('%');
2304 }
2305 '_' => {
2306 value.push('_');
2308 }
2309 _ => {
2313 if !self.config.escape_follow_chars.is_empty() {
2314 value.push(escaped);
2316 } else {
2317 value.push('\\');
2319 value.push(escaped);
2320 }
2321 }
2322 }
2323 }
2324 } else {
2325 value.push(self.advance());
2326 }
2327 }
2328
2329 if self.is_at_end() {
2330 if self.config.recover_unterminated_string {
2331 self.add_token_with_text(TokenType::String, value);
2332 return Ok(());
2333 }
2334
2335 return Err(Error::tokenize(
2336 "Unterminated string",
2337 self.line,
2338 self.column,
2339 self.start,
2340 self.current,
2341 ));
2342 }
2343
2344 self.advance(); self.add_token_with_text(TokenType::String, value);
2346 Ok(())
2347 }
2348
2349 fn scan_double_quoted_string(&mut self) -> Result<()> {
2351 self.advance(); let mut value = String::new();
2353
2354 while !self.is_at_end() {
2355 let c = self.peek();
2356 if c == '"' {
2357 if self.peek_next() == '"' {
2358 value.push('"');
2360 self.advance();
2361 self.advance();
2362 } else {
2363 break;
2364 }
2365 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2366 self.advance(); if !self.is_at_end() {
2369 let escaped = self.advance();
2370 match escaped {
2371 'n' => value.push('\n'),
2372 'r' => value.push('\r'),
2373 't' => value.push('\t'),
2374 '0' => value.push('\0'),
2375 'Z' => value.push('\x1A'), 'a' => value.push('\x07'), 'b' => value.push('\x08'), 'f' => value.push('\x0C'), 'v' => value.push('\x0B'), 'x' => {
2381 let mut hex = String::with_capacity(2);
2383 for _ in 0..2 {
2384 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2385 hex.push(self.advance());
2386 }
2387 }
2388 if hex.len() == 2 {
2389 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2390 value.push(byte as char);
2391 } else {
2392 value.push('\\');
2393 value.push('x');
2394 value.push_str(&hex);
2395 }
2396 } else {
2397 value.push('\\');
2399 value.push('x');
2400 value.push_str(&hex);
2401 }
2402 }
2403 '\\' => value.push('\\'),
2404 '\'' => value.push('\''),
2405 '"' => value.push('"'),
2406 '%' => {
2407 value.push('%');
2409 }
2410 '_' => {
2411 value.push('_');
2413 }
2414 _ => {
2418 if !self.config.escape_follow_chars.is_empty() {
2419 value.push(escaped);
2421 } else {
2422 value.push('\\');
2424 value.push(escaped);
2425 }
2426 }
2427 }
2428 }
2429 } else {
2430 value.push(self.advance());
2431 }
2432 }
2433
2434 if self.is_at_end() {
2435 return Err(Error::tokenize(
2436 "Unterminated double-quoted string",
2437 self.line,
2438 self.column,
2439 self.start,
2440 self.current,
2441 ));
2442 }
2443
2444 self.advance(); self.add_token_with_text(TokenType::String, value);
2446 Ok(())
2447 }
2448
2449 fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2450 self.advance();
2452 self.advance();
2453 self.advance();
2454 let mut value = String::new();
2455
2456 while !self.is_at_end() {
2457 if self.peek() == quote_char
2459 && self.current + 1 < self.size
2460 && self.chars[self.current + 1] == quote_char
2461 && self.current + 2 < self.size
2462 && self.chars[self.current + 2] == quote_char
2463 {
2464 break;
2466 }
2467 value.push(self.advance());
2468 }
2469
2470 if self.is_at_end() {
2471 return Err(Error::tokenize(
2472 "Unterminated triple-quoted string",
2473 self.line,
2474 self.column,
2475 self.start,
2476 self.current,
2477 ));
2478 }
2479
2480 self.advance();
2482 self.advance();
2483 self.advance();
2484 let token_type = if quote_char == '"' {
2485 TokenType::TripleDoubleQuotedString
2486 } else {
2487 TokenType::TripleSingleQuotedString
2488 };
2489 self.add_token_with_text(token_type, value);
2490 Ok(())
2491 }
2492
2493 fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2494 self.advance(); let mut value = String::new();
2496
2497 loop {
2498 if self.is_at_end() {
2499 return Err(Error::tokenize(
2500 "Unterminated identifier",
2501 self.line,
2502 self.column,
2503 self.start,
2504 self.current,
2505 ));
2506 }
2507 if end_quote == '`' && self.peek() == '\\' && self.peek_next() == end_quote {
2508 value.push(end_quote);
2510 self.advance(); self.advance(); continue;
2513 }
2514 if self.peek() == end_quote {
2515 if self.peek_next() == end_quote {
2516 value.push(end_quote);
2518 self.advance(); self.advance(); } else {
2521 break;
2523 }
2524 } else {
2525 value.push(self.peek());
2526 self.advance();
2527 }
2528 }
2529
2530 self.advance(); self.add_token_with_text(TokenType::QuotedIdentifier, value);
2532 Ok(())
2533 }
2534
2535 fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2540 self.advance(); let start = self.current;
2542 let close_quote = if open_quote == '\u{2018}' {
2544 '\u{2019}' } else {
2546 '\u{2019}' };
2548 while !self.is_at_end() && self.peek() != close_quote {
2549 self.advance();
2550 }
2551 let value = self.text_from_range(start, self.current);
2552 if !self.is_at_end() {
2553 self.advance(); }
2555 self.add_token_with_text(TokenType::String, value);
2556 Ok(())
2557 }
2558
2559 fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2562 self.advance(); let start = self.current;
2564 let close_quote = if open_quote == '\u{201C}' {
2565 '\u{201D}' } else {
2567 '\u{201D}' };
2569 while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2570 self.advance();
2571 }
2572 let value = self.text_from_range(start, self.current);
2573 if !self.is_at_end() {
2574 self.advance(); }
2576 self.add_token_with_text(TokenType::QuotedIdentifier, value);
2577 Ok(())
2578 }
2579
2580 fn scan_number(&mut self) -> Result<()> {
2581 if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
2583 let next = if self.current + 1 < self.size {
2584 self.chars[self.current + 1]
2585 } else {
2586 '\0'
2587 };
2588 if next == 'x' || next == 'X' {
2589 self.advance();
2591 self.advance();
2592 let hex_start = self.current;
2594 while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
2595 if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
2596 break;
2597 }
2598 self.advance();
2599 }
2600 if self.current > hex_start {
2601 let mut is_hex_float = false;
2603 if !self.is_at_end() && self.peek() == '.' {
2605 let after_dot = if self.current + 1 < self.size {
2606 self.chars[self.current + 1]
2607 } else {
2608 '\0'
2609 };
2610 if after_dot.is_ascii_hexdigit() {
2611 is_hex_float = true;
2612 self.advance(); while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2614 self.advance();
2615 }
2616 }
2617 }
2618 if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
2620 is_hex_float = true;
2621 self.advance(); if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
2623 self.advance();
2624 }
2625 while !self.is_at_end() && self.peek().is_ascii_digit() {
2626 self.advance();
2627 }
2628 }
2629 if is_hex_float {
2630 let raw_text = self.text_from_range(self.start, self.current);
2632 let full_text = if self.config.numbers_can_be_underscore_separated
2633 && raw_text.contains('_')
2634 {
2635 raw_text.replace('_', "")
2636 } else {
2637 raw_text
2638 };
2639 self.add_token_with_text(TokenType::Number, full_text);
2640 } else if self.config.hex_string_is_integer_type {
2641 let raw_value = self.text_from_range(hex_start, self.current);
2643 let hex_value = if self.config.numbers_can_be_underscore_separated
2644 && raw_value.contains('_')
2645 {
2646 raw_value.replace('_', "")
2647 } else {
2648 raw_value
2649 };
2650 self.add_token_with_text(TokenType::HexNumber, hex_value);
2651 } else {
2652 let raw_value = self.text_from_range(hex_start, self.current);
2654 let hex_value = if self.config.numbers_can_be_underscore_separated
2655 && raw_value.contains('_')
2656 {
2657 raw_value.replace('_', "")
2658 } else {
2659 raw_value
2660 };
2661 self.add_token_with_text(TokenType::HexString, hex_value);
2662 }
2663 return Ok(());
2664 }
2665 self.current = self.start + 1;
2668 }
2669 }
2670
2671 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2673 if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
2675 break;
2676 }
2677 self.advance();
2678 }
2679
2680 if self.peek() == '.' {
2684 let next = self.peek_next();
2685 if next != '.' {
2691 self.advance(); while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2694 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2695 break;
2696 }
2697 self.advance();
2698 }
2699 }
2700 }
2701
2702 if self.peek() == 'e' || self.peek() == 'E' {
2704 self.advance();
2705 if self.peek() == '+' || self.peek() == '-' {
2706 self.advance();
2707 }
2708 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2709 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2710 break;
2711 }
2712 self.advance();
2713 }
2714 }
2715
2716 let raw_text = self.text_from_range(self.start, self.current);
2717 let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2720 raw_text.replace('_', "")
2721 } else {
2722 raw_text
2723 };
2724
2725 if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2727 let next_char: String = self.peek().to_ascii_uppercase().to_string();
2728 let suffix_match = if self.current + 1 < self.size {
2730 let two_char: String = [
2731 self.chars[self.current].to_ascii_uppercase(),
2732 self.chars[self.current + 1].to_ascii_uppercase(),
2733 ]
2734 .iter()
2735 .collect();
2736 if self.config.numeric_literals.contains_key(&two_char) {
2737 let after_suffix = if self.current + 2 < self.size {
2739 self.chars[self.current + 2]
2740 } else {
2741 ' '
2742 };
2743 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2744 Some((two_char, 2))
2745 } else {
2746 None
2747 }
2748 } else if self.config.numeric_literals.contains_key(&next_char) {
2749 let after_suffix = if self.current + 1 < self.size {
2751 self.chars[self.current + 1]
2752 } else {
2753 ' '
2754 };
2755 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2756 Some((next_char, 1))
2757 } else {
2758 None
2759 }
2760 } else {
2761 None
2762 }
2763 } else if self.config.numeric_literals.contains_key(&next_char) {
2764 Some((next_char, 1))
2766 } else {
2767 None
2768 };
2769
2770 if let Some((suffix, len)) = suffix_match {
2771 for _ in 0..len {
2773 self.advance();
2774 }
2775 let type_name = self
2778 .config
2779 .numeric_literals
2780 .get(&suffix)
2781 .expect("suffix verified by contains_key above")
2782 .clone();
2783 let combined = format!("{}::{}", text, type_name);
2784 self.add_token_with_text(TokenType::Number, combined);
2785 return Ok(());
2786 }
2787 }
2788
2789 if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2792 let next = self.peek();
2793 if next.is_alphabetic() || next == '_' {
2794 while !self.is_at_end() {
2796 let ch = self.peek();
2797 if ch.is_alphanumeric() || ch == '_' {
2798 self.advance();
2799 } else {
2800 break;
2801 }
2802 }
2803 let ident_text = self.text_from_range(self.start, self.current);
2804 self.add_token_with_text(TokenType::Identifier, ident_text);
2805 return Ok(());
2806 }
2807 }
2808
2809 self.add_token_with_text(TokenType::Number, text);
2810 Ok(())
2811 }
2812
2813 fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2815 self.advance();
2817
2818 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2820 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2821 break;
2822 }
2823 self.advance();
2824 }
2825
2826 if self.peek() == 'e' || self.peek() == 'E' {
2828 self.advance();
2829 if self.peek() == '+' || self.peek() == '-' {
2830 self.advance();
2831 }
2832 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2833 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2834 break;
2835 }
2836 self.advance();
2837 }
2838 }
2839
2840 let raw_text = self.text_from_range(self.start, self.current);
2841 let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2844 raw_text.replace('_', "")
2845 } else {
2846 raw_text
2847 };
2848 self.add_token_with_text(TokenType::Number, text);
2849 Ok(())
2850 }
2851
2852 #[inline]
2855 fn lookup_keyword_ascii(keywords: &HashMap<String, TokenType>, text: &str) -> TokenType {
2856 if text.len() > 128 {
2857 return TokenType::Var;
2858 }
2859 let mut buf = [0u8; 128];
2860 for (i, b) in text.bytes().enumerate() {
2861 buf[i] = b.to_ascii_uppercase();
2862 }
2863 if let Ok(upper) = std::str::from_utf8(&buf[..text.len()]) {
2864 keywords.get(upper).copied().unwrap_or(TokenType::Var)
2865 } else {
2866 TokenType::Var
2867 }
2868 }
2869
2870 fn scan_identifier_or_keyword(&mut self) -> Result<()> {
2871 let first_char = self.peek();
2873 if !first_char.is_alphanumeric() && first_char != '_' {
2874 let c = self.advance();
2876 return Err(Error::tokenize(
2877 format!("Unexpected character: '{}'", c),
2878 self.line,
2879 self.column,
2880 self.start,
2881 self.current,
2882 ));
2883 }
2884
2885 while !self.is_at_end() {
2886 let c = self.peek();
2887 if c == '#' {
2891 let next_c = if self.current + 1 < self.size {
2892 self.chars[self.current + 1]
2893 } else {
2894 '\0'
2895 };
2896 if next_c == '>' || next_c == '-' {
2897 break; }
2899 self.advance();
2900 } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
2901 self.advance();
2902 } else {
2903 break;
2904 }
2905 }
2906
2907 let text = self.text_from_range(self.start, self.current);
2908
2909 if text.eq_ignore_ascii_case("NOT") && self.peek() == '=' {
2911 self.advance(); self.add_token(TokenType::Neq);
2913 return Ok(());
2914 }
2915
2916 let next_char = self.peek();
2919 let is_single_quote = next_char == '\'';
2920 let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
2921 let is_double_quote_for_raw = next_char == '"';
2924
2925 if text.eq_ignore_ascii_case("R") && (is_single_quote || is_double_quote_for_raw) {
2928 let quote_char = if is_single_quote { '\'' } else { '"' };
2931 self.advance(); if self.peek() == quote_char && self.peek_next() == quote_char {
2935 self.advance(); self.advance(); let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
2939 self.add_token_with_text(TokenType::RawString, string_value);
2940 } else {
2941 let string_value = self.scan_raw_string_content(quote_char)?;
2942 self.add_token_with_text(TokenType::RawString, string_value);
2943 }
2944 return Ok(());
2945 }
2946
2947 if is_single_quote || is_double_quote {
2948 if text.eq_ignore_ascii_case("N") {
2949 self.advance(); let string_value = if is_single_quote {
2952 self.scan_string_content()?
2953 } else {
2954 self.scan_double_quoted_string_content()?
2955 };
2956 self.add_token_with_text(TokenType::NationalString, string_value);
2957 return Ok(());
2958 } else if text.eq_ignore_ascii_case("E") {
2959 let lowercase = text == "e";
2963 let prefix = if lowercase { "e:" } else { "E:" };
2964 self.advance(); let string_value = self.scan_string_content_with_escapes(true)?;
2966 self.add_token_with_text(
2967 TokenType::EscapeString,
2968 format!("{}{}", prefix, string_value),
2969 );
2970 return Ok(());
2971 } else if text.eq_ignore_ascii_case("X") {
2972 self.advance(); let string_value = if is_single_quote {
2975 self.scan_string_content()?
2976 } else {
2977 self.scan_double_quoted_string_content()?
2978 };
2979 self.add_token_with_text(TokenType::HexString, string_value);
2980 return Ok(());
2981 } else if text.eq_ignore_ascii_case("B") && is_double_quote {
2982 self.advance(); let string_value = self.scan_double_quoted_string_content()?;
2985 self.add_token_with_text(TokenType::ByteString, string_value);
2986 return Ok(());
2987 } else if text.eq_ignore_ascii_case("B") && is_single_quote {
2988 self.advance(); let string_value = self.scan_string_content()?;
2992 if self.config.b_prefix_is_byte_string {
2993 self.add_token_with_text(TokenType::ByteString, string_value);
2994 } else {
2995 self.add_token_with_text(TokenType::BitString, string_value);
2996 }
2997 return Ok(());
2998 }
2999 }
3000
3001 if text.eq_ignore_ascii_case("U")
3003 && self.peek() == '&'
3004 && self.current + 1 < self.size
3005 && self.chars[self.current + 1] == '\''
3006 {
3007 self.advance(); self.advance(); let string_value = self.scan_string_content()?;
3010 self.add_token_with_text(TokenType::UnicodeString, string_value);
3011 return Ok(());
3012 }
3013
3014 let token_type = Self::lookup_keyword_ascii(&self.config.keywords, &text);
3015
3016 self.add_token_with_text(token_type, text);
3017 Ok(())
3018 }
3019
3020 fn scan_string_content_with_escapes(
3024 &mut self,
3025 force_backslash_escapes: bool,
3026 ) -> Result<String> {
3027 let mut value = String::new();
3028 let use_backslash_escapes =
3029 force_backslash_escapes || self.config.string_escapes.contains(&'\\');
3030
3031 while !self.is_at_end() {
3032 let c = self.peek();
3033 if c == '\'' {
3034 if self.peek_next() == '\'' {
3035 value.push('\'');
3037 self.advance();
3038 self.advance();
3039 } else {
3040 break;
3041 }
3042 } else if c == '\\' && use_backslash_escapes {
3043 value.push(self.advance());
3045 if !self.is_at_end() {
3046 value.push(self.advance());
3047 }
3048 } else {
3049 value.push(self.advance());
3050 }
3051 }
3052
3053 if self.is_at_end() {
3054 return Err(Error::tokenize(
3055 "Unterminated string",
3056 self.line,
3057 self.column,
3058 self.start,
3059 self.current,
3060 ));
3061 }
3062
3063 self.advance(); Ok(value)
3065 }
3066
    /// Reads the body of a single-quoted string using the configured escape rules.
    ///
    /// Delegates to `scan_string_content_with_escapes` without forcing backslash escapes, so
    /// a backslash is treated as an escape only when `config.string_escapes` contains `'\\'`.
    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }
3071
3072 fn scan_double_quoted_string_content(&mut self) -> Result<String> {
3075 let mut value = String::new();
3076 let use_backslash_escapes = self.config.string_escapes.contains(&'\\');
3077
3078 while !self.is_at_end() {
3079 let c = self.peek();
3080 if c == '"' {
3081 if self.peek_next() == '"' {
3082 value.push('"');
3084 self.advance();
3085 self.advance();
3086 } else {
3087 break;
3088 }
3089 } else if c == '\\' && use_backslash_escapes {
3090 self.advance(); if !self.is_at_end() {
3093 let escaped = self.advance();
3094 match escaped {
3095 'n' => value.push('\n'),
3096 'r' => value.push('\r'),
3097 't' => value.push('\t'),
3098 '0' => value.push('\0'),
3099 '\\' => value.push('\\'),
3100 '"' => value.push('"'),
3101 '\'' => value.push('\''),
3102 'x' => {
3103 let mut hex = String::new();
3105 for _ in 0..2 {
3106 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
3107 hex.push(self.advance());
3108 }
3109 }
3110 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
3111 value.push(byte as char);
3112 } else {
3113 value.push('\\');
3115 value.push('x');
3116 value.push_str(&hex);
3117 }
3118 }
3119 _ => {
3120 value.push('\\');
3122 value.push(escaped);
3123 }
3124 }
3125 }
3126 } else {
3127 value.push(self.advance());
3128 }
3129 }
3130
3131 if self.is_at_end() {
3132 return Err(Error::tokenize(
3133 "Unterminated double-quoted string",
3134 self.line,
3135 self.column,
3136 self.start,
3137 self.current,
3138 ));
3139 }
3140
3141 self.advance(); Ok(value)
3143 }
3144
3145 fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3150 let mut value = String::new();
3151
3152 while !self.is_at_end() {
3153 let c = self.peek();
3154 if c == quote_char {
3155 if self.peek_next() == quote_char {
3156 value.push(quote_char);
3158 self.advance();
3159 self.advance();
3160 } else {
3161 break;
3162 }
3163 } else if c == '\\'
3164 && self.peek_next() == quote_char
3165 && self.config.string_escapes_allowed_in_raw_strings
3166 {
3167 value.push(quote_char);
3171 self.advance(); self.advance(); } else {
3174 value.push(self.advance());
3176 }
3177 }
3178
3179 if self.is_at_end() {
3180 return Err(Error::tokenize(
3181 "Unterminated raw string",
3182 self.line,
3183 self.column,
3184 self.start,
3185 self.current,
3186 ));
3187 }
3188
3189 self.advance(); Ok(value)
3191 }
3192
3193 fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3196 let mut value = String::new();
3197
3198 while !self.is_at_end() {
3199 let c = self.peek();
3200 if c == quote_char && self.peek_next() == quote_char {
3201 if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3203 self.advance(); self.advance(); self.advance(); return Ok(value);
3208 }
3209 }
3210 let ch = self.advance();
3212 value.push(ch);
3213 }
3214
3215 Err(Error::tokenize(
3216 "Unterminated raw triple-quoted string",
3217 self.line,
3218 self.column,
3219 self.start,
3220 self.current,
3221 ))
3222 }
3223
3224 fn scan_dollar_identifier(&mut self) -> Result<()> {
3229 self.advance();
3231
3232 while !self.is_at_end() {
3234 let c = self.peek();
3235 if c.is_alphanumeric() || c == '_' || c == '$' {
3236 self.advance();
3237 } else {
3238 break;
3239 }
3240 }
3241
3242 let text = self.text_from_range(self.start, self.current);
3243 self.add_token_with_text(TokenType::Var, text);
3244 Ok(())
3245 }
3246
3247 fn scan_tsql_identifier(&mut self) -> Result<()> {
3248 let first = self.advance();
3250
3251 if first == '#' && self.peek() == '#' {
3253 self.advance();
3254 }
3255
3256 while !self.is_at_end() {
3258 let c = self.peek();
3259 if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3260 self.advance();
3261 } else {
3262 break;
3263 }
3264 }
3265
3266 let text = self.text_from_range(self.start, self.current);
3267 self.add_token_with_text(TokenType::Var, text);
3269 Ok(())
3270 }
3271
    /// Tries to capture the raw text that follows an `INSERT ... FORMAT <name>` clause.
    ///
    /// The tokens produced so far must end with a token whose text is `FORMAT` followed by a
    /// `Var`/`Identifier` token other than `VALUES`, and an `Insert` token must appear among
    /// the (up to) 20 tokens before `FORMAT`. If any of those checks fails, `None` is returned
    /// and nothing is consumed.
    ///
    /// Otherwise input is consumed from the current position until a newline that is followed
    /// (after optional `\r` characters) by another newline or by the end of input. The text
    /// before that newline is returned trimmed.
    ///
    /// NOTE(review): the blank-line path can return `Some("")` when nothing but whitespace
    /// precedes the terminating newline, while the end-of-input path below returns `None` for
    /// empty data. Confirm with the caller whether that difference is intended.
    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
        let len = self.tokens.len();
        if len < 3 {
            return None;
        }

        // The most recent token is the format name: a Var/Identifier that is not VALUES.
        let last = &self.tokens[len - 1];
        if last.text.eq_ignore_ascii_case("VALUES") {
            return None;
        }
        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
            return None;
        }

        // The token before it must read FORMAT (case-insensitive).
        let format_tok = &self.tokens[len - 2];
        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
            return None;
        }

        // Look back at most 20 tokens before FORMAT for an INSERT keyword.
        let has_insert = self.tokens[..len - 2]
            .iter()
            .rev()
            .take(20)
            .any(|t| t.token_type == TokenType::Insert);
        if !has_insert {
            return None;
        }

        let raw_start = self.current;
        while !self.is_at_end() {
            let c = self.peek();
            if c == '\n' {
                // Remember where this newline starts; the raw data ends here if the next
                // line is blank or the input ends.
                let saved = self.current;
                self.advance();
                // Skip carriage returns between the two newlines.
                while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                if self.is_at_end() || self.peek() == '\n' {
                    let raw = self.text_from_range(raw_start, saved);
                    return Some(raw.trim().to_string());
                }
            } else {
                self.advance();
            }
        }

        // Input ended without a terminating newline: return what was collected, if anything.
        let raw = self.text_from_range(raw_start, self.current);
        let trimmed = raw.trim().to_string();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed)
        }
    }
3341
    /// Emits a token of `token_type` whose text is the source range from `self.start` to
    /// `self.current`.
    fn add_token(&mut self, token_type: TokenType) {
        let text = self.text_from_range(self.start, self.current);
        self.add_token_with_text(token_type, text);
    }
3346
3347 fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3348 let span = Span::new(self.start, self.current, self.line, self.column);
3349 let mut token = Token::new(token_type, text, span);
3350 token.comments.append(&mut self.comments);
3351 self.tokens.push(token);
3352 }
3353}
3354
3355#[cfg(test)]
3356mod tests {
3357 use super::*;
3358
3359 #[test]
3360 fn test_simple_select() {
3361 let tokenizer = Tokenizer::default();
3362 let tokens = tokenizer.tokenize("SELECT 1").unwrap();
3363
3364 assert_eq!(tokens.len(), 2);
3365 assert_eq!(tokens[0].token_type, TokenType::Select);
3366 assert_eq!(tokens[1].token_type, TokenType::Number);
3367 assert_eq!(tokens[1].text, "1");
3368 }
3369
3370 #[test]
3371 fn test_select_with_identifier() {
3372 let tokenizer = Tokenizer::default();
3373 let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();
3374
3375 assert_eq!(tokens.len(), 6);
3376 assert_eq!(tokens[0].token_type, TokenType::Select);
3377 assert_eq!(tokens[1].token_type, TokenType::Var);
3378 assert_eq!(tokens[1].text, "a");
3379 assert_eq!(tokens[2].token_type, TokenType::Comma);
3380 assert_eq!(tokens[3].token_type, TokenType::Var);
3381 assert_eq!(tokens[3].text, "b");
3382 assert_eq!(tokens[4].token_type, TokenType::From);
3383 assert_eq!(tokens[5].token_type, TokenType::Var);
3384 assert_eq!(tokens[5].text, "t");
3385 }
3386
3387 #[test]
3388 fn test_string_literal() {
3389 let tokenizer = Tokenizer::default();
3390 let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();
3391
3392 assert_eq!(tokens.len(), 2);
3393 assert_eq!(tokens[1].token_type, TokenType::String);
3394 assert_eq!(tokens[1].text, "hello");
3395 }
3396
3397 #[test]
3398 fn test_escaped_string() {
3399 let tokenizer = Tokenizer::default();
3400 let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();
3401
3402 assert_eq!(tokens.len(), 2);
3403 assert_eq!(tokens[1].token_type, TokenType::String);
3404 assert_eq!(tokens[1].text, "it's");
3405 }
3406
3407 #[test]
3408 fn test_terminal_backslash_quote_recovery() {
3409 let mut config = TokenizerConfig::default();
3410 config.string_escapes.push('\\');
3411 config.recover_terminal_backslash_quote = true;
3412 let tokenizer = Tokenizer::new(config);
3413 let tokens = tokenizer
3414 .tokenize("SHOW FUNCTIONS LIKE 'a\\' OR 1=1")
3415 .unwrap();
3416
3417 assert_eq!(tokens.len(), 8);
3418 assert_eq!(tokens[3].token_type, TokenType::String);
3419 assert_eq!(tokens[3].text, "a\\");
3420 assert_eq!(tokens[4].token_type, TokenType::Or);
3421 }
3422
3423 #[test]
3424 fn test_comments() {
3425 let tokenizer = Tokenizer::default();
3426 let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();
3427
3428 assert_eq!(tokens.len(), 2);
3429 assert_eq!(tokens[0].trailing_comments.len(), 1);
3432 assert_eq!(tokens[0].trailing_comments[0], " comment");
3433 }
3434
3435 #[test]
3436 fn test_comment_in_and_chain() {
3437 use crate::generator::Generator;
3438 use crate::parser::Parser;
3439
3440 let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
3442 let ast = Parser::parse_sql(sql).unwrap();
3443 let mut gen = Generator::default();
3444 let output = gen.generate(&ast[0]).unwrap();
3445 assert_eq!(
3446 output,
3447 "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
3448 );
3449 }
3450
3451 #[test]
3452 fn test_operators() {
3453 let tokenizer = Tokenizer::default();
3454 let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();
3455
3456 assert_eq!(tokens.len(), 5);
3457 assert_eq!(tokens[0].token_type, TokenType::Number);
3458 assert_eq!(tokens[1].token_type, TokenType::Plus);
3459 assert_eq!(tokens[2].token_type, TokenType::Number);
3460 assert_eq!(tokens[3].token_type, TokenType::Star);
3461 assert_eq!(tokens[4].token_type, TokenType::Number);
3462 }
3463
3464 #[test]
3465 fn test_comparison_operators() {
3466 let tokenizer = Tokenizer::default();
3467 let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();
3468
3469 assert_eq!(tokens[1].token_type, TokenType::Lte);
3470 assert_eq!(tokens[3].token_type, TokenType::Gte);
3471 assert_eq!(tokens[5].token_type, TokenType::Neq);
3472 }
3473
3474 #[test]
3475 fn test_national_string() {
3476 let tokenizer = Tokenizer::default();
3477 let tokens = tokenizer.tokenize("N'abc'").unwrap();
3478
3479 assert_eq!(
3480 tokens.len(),
3481 1,
3482 "Expected 1 token for N'abc', got {:?}",
3483 tokens
3484 );
3485 assert_eq!(tokens[0].token_type, TokenType::NationalString);
3486 assert_eq!(tokens[0].text, "abc");
3487 }
3488
3489 #[test]
3490 fn test_hex_string() {
3491 let tokenizer = Tokenizer::default();
3492 let tokens = tokenizer.tokenize("X'ABCD'").unwrap();
3493
3494 assert_eq!(
3495 tokens.len(),
3496 1,
3497 "Expected 1 token for X'ABCD', got {:?}",
3498 tokens
3499 );
3500 assert_eq!(tokens[0].token_type, TokenType::HexString);
3501 assert_eq!(tokens[0].text, "ABCD");
3502 }
3503
3504 #[test]
3505 fn test_bit_string() {
3506 let tokenizer = Tokenizer::default();
3507 let tokens = tokenizer.tokenize("B'01010'").unwrap();
3508
3509 assert_eq!(
3510 tokens.len(),
3511 1,
3512 "Expected 1 token for B'01010', got {:?}",
3513 tokens
3514 );
3515 assert_eq!(tokens[0].token_type, TokenType::BitString);
3516 assert_eq!(tokens[0].text, "01010");
3517 }
3518
3519 #[test]
3520 fn test_trailing_dot_number() {
3521 let tokenizer = Tokenizer::default();
3522
3523 let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
3525 assert_eq!(
3526 tokens.len(),
3527 2,
3528 "Expected 2 tokens for 'SELECT 1.', got {:?}",
3529 tokens
3530 );
3531 assert_eq!(tokens[1].token_type, TokenType::Number);
3532 assert_eq!(tokens[1].text, "1.");
3533
3534 let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
3536 assert_eq!(tokens[1].text, "1.5");
3537
3538 let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
3541 assert_eq!(
3542 tokens.len(),
3543 3,
3544 "Expected 3 tokens for 'SELECT 1.a', got {:?}",
3545 tokens
3546 );
3547 assert_eq!(tokens[1].token_type, TokenType::Number);
3548 assert_eq!(tokens[1].text, "1.");
3549 assert_eq!(tokens[2].token_type, TokenType::Var);
3550
3551 let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
3553 assert_eq!(tokens[1].token_type, TokenType::Number);
3554 assert_eq!(tokens[1].text, "1");
3555 assert_eq!(tokens[2].token_type, TokenType::Dot);
3556 assert_eq!(tokens[3].token_type, TokenType::Dot);
3557 assert_eq!(tokens[4].token_type, TokenType::Number);
3558 assert_eq!(tokens[4].text, "2");
3559 }
3560
3561 #[test]
3562 fn test_leading_dot_number() {
3563 let tokenizer = Tokenizer::default();
3564
3565 let tokens = tokenizer.tokenize(".25").unwrap();
3567 assert_eq!(
3568 tokens.len(),
3569 1,
3570 "Expected 1 token for '.25', got {:?}",
3571 tokens
3572 );
3573 assert_eq!(tokens[0].token_type, TokenType::Number);
3574 assert_eq!(tokens[0].text, ".25");
3575
3576 let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
3578 assert_eq!(
3579 tokens.len(),
3580 4,
3581 "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
3582 tokens
3583 );
3584 assert_eq!(tokens[0].token_type, TokenType::Sample);
3585 assert_eq!(tokens[1].token_type, TokenType::LParen);
3586 assert_eq!(tokens[2].token_type, TokenType::Number);
3587 assert_eq!(tokens[2].text, ".25");
3588 assert_eq!(tokens[3].token_type, TokenType::RParen);
3589
3590 let tokens = tokenizer.tokenize(".5e10").unwrap();
3592 assert_eq!(
3593 tokens.len(),
3594 1,
3595 "Expected 1 token for '.5e10', got {:?}",
3596 tokens
3597 );
3598 assert_eq!(tokens[0].token_type, TokenType::Number);
3599 assert_eq!(tokens[0].text, ".5e10");
3600
3601 let tokens = tokenizer.tokenize("a.b").unwrap();
3603 assert_eq!(
3604 tokens.len(),
3605 3,
3606 "Expected 3 tokens for 'a.b', got {:?}",
3607 tokens
3608 );
3609 assert_eq!(tokens[1].token_type, TokenType::Dot);
3610 }
3611
3612 #[test]
3613 fn test_unrecognized_character() {
3614 let tokenizer = Tokenizer::default();
3615
3616 let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
3618 assert!(
3619 result.is_ok(),
3620 "Curly quotes should be tokenized as strings"
3621 );
3622
3623 let result = tokenizer.tokenize("SELECT • FROM t");
3625 assert!(result.is_err());
3626 }
3627
3628 #[test]
3629 fn test_colon_eq_tokenization() {
3630 let tokenizer = Tokenizer::default();
3631
3632 let tokens = tokenizer.tokenize("a := 1").unwrap();
3634 assert_eq!(tokens.len(), 3);
3635 assert_eq!(tokens[0].token_type, TokenType::Var);
3636 assert_eq!(tokens[1].token_type, TokenType::ColonEq);
3637 assert_eq!(tokens[2].token_type, TokenType::Number);
3638
3639 let tokens = tokenizer.tokenize("a:b").unwrap();
3641 assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
3642 assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
3643
3644 let tokens = tokenizer.tokenize("a::INT").unwrap();
3646 assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
3647 }
3648
3649 #[test]
3650 fn test_colon_eq_parsing() {
3651 use crate::generator::Generator;
3652 use crate::parser::Parser;
3653
3654 let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
3656 .expect("Failed to parse MySQL @var := expr");
3657 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3658 assert_eq!(output, "SELECT @var1 := 1, @var2");
3659
3660 let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
3662 .expect("Failed to parse MySQL @var2 := @var1");
3663 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3664 assert_eq!(output, "SELECT @var1, @var2 := @var1");
3665
3666 let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
3668 .expect("Failed to parse MySQL @var := COUNT(*)");
3669 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3670 assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");
3671
3672 let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
3674 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3675 assert_eq!(output, "SET @var1 = 1");
3676
3677 let ast =
3679 Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
3680 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3681 assert_eq!(output, "UNION_VALUE(k1 := 1)");
3682
3683 let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
3685 .expect("Failed to parse UNNEST with :=");
3686 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3687 assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");
3688
3689 let ast =
3691 Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
3692 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3693 assert_eq!(output, "SELECT 1 AS foo");
3694
3695 let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
3697 .expect("Failed to parse DuckDB multiple prefix aliases");
3698 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3699 assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
3700 }
3701
3702 #[test]
3703 fn test_colon_eq_dialect_roundtrip() {
3704 use crate::dialects::{Dialect, DialectType};
3705
3706 fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
3707 let d = Dialect::get(dialect);
3708 let ast = d
3709 .parse(sql)
3710 .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3711 assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3712 let transformed = d
3713 .transform(ast[0].clone())
3714 .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3715 let output = d
3716 .generate(&transformed)
3717 .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3718 let expected = expected.unwrap_or(sql);
3719 assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3720 }
3721
3722 check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
3724 check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
3725 check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
3726 check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));
3727
3728 check(
3730 DialectType::DuckDB,
3731 "SELECT UNNEST(col, recursive := TRUE) FROM t",
3732 None,
3733 );
3734 check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);
3735
3736 {
3739 let d = Dialect::get(DialectType::DuckDB);
3740 let ast = d
3741 .parse("STRUCT_PACK(a := 'b')::json")
3742 .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
3743 assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
3744 }
3745
3746 check(
3748 DialectType::DuckDB,
3749 "SELECT foo: 1",
3750 Some("SELECT 1 AS foo"),
3751 );
3752 check(
3753 DialectType::DuckDB,
3754 "SELECT foo: 1, bar: 2, baz: 3",
3755 Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
3756 );
3757 }
3758
3759 #[test]
3760 fn test_comment_roundtrip() {
3761 use crate::generator::Generator;
3762 use crate::parser::Parser;
3763
3764 fn check_roundtrip(sql: &str) -> Option<String> {
3765 let ast = match Parser::parse_sql(sql) {
3766 Ok(a) => a,
3767 Err(e) => return Some(format!("Parse error: {:?}", e)),
3768 };
3769 if ast.is_empty() {
3770 return Some("Empty AST".to_string());
3771 }
3772 let mut generator = Generator::default();
3773 let output = match generator.generate(&ast[0]) {
3774 Ok(o) => o,
3775 Err(e) => return Some(format!("Gen error: {:?}", e)),
3776 };
3777 if output == sql {
3778 None
3779 } else {
3780 Some(format!(
3781 "Mismatch:\n input: {}\n output: {}",
3782 sql, output
3783 ))
3784 }
3785 }
3786
3787 let tests = vec![
3788 "SELECT c /* c1 */ AS alias /* c2 */",
3794 "SELECT a /* x */, b /* x */",
3796 "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
3798 "SELECT * FROM foo /* x */, bla /* x */",
3800 "SELECT 1 /* comment */ + 1",
3802 "SELECT 1 /* c1 */ + 2 /* c2 */",
3803 "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
3804 "SELECT CAST(x AS INT) /* comment */ FROM foo",
3806 "SELECT FOO(x /* c */) /* FOO */, b /* b */",
3808 "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
3810 "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
3812 "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
3814 "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
3815 "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
3816 "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
3817 "/* comment */ CREATE TABLE foo AS SELECT 1",
3818 "INSERT INTO foo SELECT * FROM bar /* comment */",
3820 "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
3822 ];
3823
3824 let mut failures = Vec::new();
3825 for sql in tests {
3826 if let Some(e) = check_roundtrip(sql) {
3827 failures.push(e);
3828 }
3829 }
3830
3831 if !failures.is_empty() {
3832 panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
3833 }
3834 }
3835
3836 #[test]
3837 fn test_dollar_quoted_string_parsing() {
3838 use crate::dialects::{Dialect, DialectType};
3839
3840 let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
3842 assert_eq!(tag, Some("FOO".to_string()));
3843 assert_eq!(content, "content here");
3844
3845 let (tag, content) = super::parse_dollar_string_token("just content");
3846 assert_eq!(tag, None);
3847 assert_eq!(content, "just content");
3848
3849 fn check_databricks(sql: &str, expected: Option<&str>) {
3851 let d = Dialect::get(DialectType::Databricks);
3852 let ast = d
3853 .parse(sql)
3854 .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3855 assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3856 let transformed = d
3857 .transform(ast[0].clone())
3858 .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3859 let output = d
3860 .generate(&transformed)
3861 .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3862 let expected = expected.unwrap_or(sql);
3863 assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3864 }
3865
3866 check_databricks(
3868 "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n return x+1$$",
3869 None
3870 );
3871
3872 check_databricks(
3874 "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n return x+1$FOO$",
3875 None
3876 );
3877 }
3878
3879 #[test]
3880 fn test_numeric_underscore_stripping() {
3881 let mut config = TokenizerConfig::default();
3883 config.numbers_can_be_underscore_separated = true;
3884 let tokenizer = Tokenizer::new(config);
3885
3886 let tokens = tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
3888 assert_eq!(tokens[1].token_type, TokenType::Number);
3889 assert_eq!(tokens[1].text, "12345");
3890
3891 let tokens = tokenizer.tokenize("SELECT 20_000").unwrap();
3893 assert_eq!(tokens[1].token_type, TokenType::Number);
3894 assert_eq!(tokens[1].text, "20000");
3895
3896 let tokens = tokenizer.tokenize("SELECT 1_2E+1_0").unwrap();
3898 assert_eq!(tokens[1].token_type, TokenType::Number);
3899 assert_eq!(tokens[1].text, "12E+10");
3900
3901 let default_tokenizer = Tokenizer::default();
3903 let tokens = default_tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
3904 assert_eq!(tokens[1].token_type, TokenType::Number);
3905 assert_eq!(tokens[1].text, "1_2_3_4_5");
3906 }
3907}