1use crate::error::{Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::fmt;
10use std::sync::LazyLock;
11#[cfg(feature = "bindings")]
12use ts_rs::TS;
13
/// Splits `text` at its first NUL (`\x00`) character into an optional tag and
/// the remaining content.
///
/// Returns `(Some(tag), content)` when a NUL is present, where `tag` is
/// everything before the first NUL and `content` is everything after it
/// (later NULs stay inside `content`). Returns `(None, text)` when `text`
/// contains no NUL.
///
/// NOTE(review): presumably the tokenizer encodes dollar-quoted strings as
/// `tag NUL body` — confirm against the code that produces these tokens.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    match text.split_once('\x00') {
        Some((tag, body)) => (Some(tag.to_owned()), body.to_owned()),
        None => (None, text.to_owned()),
    }
}
26
/// Position information attached to a [`Token`].
///
/// The derived `Default` yields a span with every field set to `0`.
///
/// NOTE(review): the units and index base of these fields are not visible
/// here — `start`/`end` are presumably offsets into the input and
/// `line`/`column` a human-readable position; confirm against the tokenizer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct Span {
    /// Start position of the span.
    pub start: usize,
    /// End position of the span.
    pub end: usize,
    /// Line associated with the span.
    pub line: usize,
    /// Column associated with the span.
    pub column: usize,
}
40
41impl Span {
42 pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
43 Self {
44 start,
45 end,
46 line,
47 column,
48 }
49 }
50}
51
/// A single token: its classification, its text, its span, and any comments
/// attached to it.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// Classification of this token.
    pub token_type: TokenType,
    /// Text carried by the token.
    pub text: String,
    /// Span recorded for the token.
    pub span: Span,
    /// Comments attached to the token. Deserializes to an empty list when the
    /// field is absent from the input.
    #[serde(default)]
    pub comments: Vec<String>,
    /// Trailing comments attached to the token. Deserializes to an empty list
    /// when the field is absent from the input.
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
68
69impl Token {
70 pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
72 Self {
73 token_type,
74 text: text.into(),
75 span,
76 comments: Vec::new(),
77 trailing_comments: Vec::new(),
78 }
79 }
80
81 pub fn number(n: i64) -> Self {
83 Self::new(TokenType::Number, n.to_string(), Span::default())
84 }
85
86 pub fn string(s: impl Into<String>) -> Self {
88 Self::new(TokenType::String, s, Span::default())
89 }
90
91 pub fn identifier(s: impl Into<String>) -> Self {
93 Self::new(TokenType::Identifier, s, Span::default())
94 }
95
96 pub fn var(s: impl Into<String>) -> Self {
98 Self::new(TokenType::Var, s, Span::default())
99 }
100
101 pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
103 self.comments.push(comment.into());
104 self
105 }
106}
107
108impl fmt::Display for Token {
109 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110 write!(f, "{:?}({})", self.token_type, self.text)
111 }
112}
113
/// Every kind of token: punctuation and operators, literals and identifiers,
/// data-type names, and keywords.
///
/// Serialized variant names use `SCREAMING_SNAKE_CASE`.
///
/// The enum is `#[repr(u16)]` and no variant has an explicit discriminant, so
/// each variant's numeric value follows its declaration position. Inserting,
/// removing or reordering variants therefore changes those values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation and operator tokens. A few word-like entries (`And`, `Or`,
    // `Not`, `Xor`, `Session`, ...) are interleaved in this group.
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt, GtGt, Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    // NOTE(review): presumably structural / whitespace markers — confirm
    // against the tokenizer that emits them.
    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments, literals, identifiers and related name-like tokens.
    BlockComment, LineComment, String,
    DollarString, TripleDoubleQuotedString, TripleSingleQuotedString, Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    HexNumber,
    ByteString,
    NationalString,
    EscapeString, RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data-type names.
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords.
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike, NotILike, NotRLike, NotIRLike, Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window-frame keywords.
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // Further keywords (trigger, sequence, identity-column and
    // pattern-matching related words, among others).
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // NOTE(review): presumably the end-of-input marker — confirm against the
    // tokenizer.
    Eof,
}
670
671impl TokenType {
672 pub fn is_keyword(&self) -> bool {
674 matches!(
675 self,
676 TokenType::Select
677 | TokenType::From
678 | TokenType::Where
679 | TokenType::And
680 | TokenType::Or
681 | TokenType::Not
682 | TokenType::In
683 | TokenType::Is
684 | TokenType::Null
685 | TokenType::True
686 | TokenType::False
687 | TokenType::As
688 | TokenType::On
689 | TokenType::Join
690 | TokenType::Left
691 | TokenType::Right
692 | TokenType::Inner
693 | TokenType::Outer
694 | TokenType::Full
695 | TokenType::Cross
696 | TokenType::Semi
697 | TokenType::Anti
698 | TokenType::Union
699 | TokenType::Except
700 | TokenType::Intersect
701 | TokenType::GroupBy
702 | TokenType::OrderBy
703 | TokenType::Having
704 | TokenType::Limit
705 | TokenType::Offset
706 | TokenType::Case
707 | TokenType::When
708 | TokenType::Then
709 | TokenType::Else
710 | TokenType::End
711 | TokenType::Create
712 | TokenType::Drop
713 | TokenType::Alter
714 | TokenType::Insert
715 | TokenType::Update
716 | TokenType::Delete
717 | TokenType::Into
718 | TokenType::Values
719 | TokenType::Set
720 | TokenType::With
721 | TokenType::Distinct
722 | TokenType::All
723 | TokenType::Exists
724 | TokenType::Between
725 | TokenType::Like
726 | TokenType::ILike
727 | TokenType::Filter
729 | TokenType::Date
730 | TokenType::Timestamp
731 | TokenType::TimestampTz
732 | TokenType::Interval
733 | TokenType::Time
734 | TokenType::Table
735 | TokenType::Index
736 | TokenType::Column
737 | TokenType::Database
738 | TokenType::Schema
739 | TokenType::View
740 | TokenType::Function
741 | TokenType::Procedure
742 | TokenType::Trigger
743 | TokenType::Sequence
744 | TokenType::Over
745 | TokenType::Partition
746 | TokenType::Window
747 | TokenType::Rows
748 | TokenType::Range
749 | TokenType::First
750 | TokenType::Last
751 | TokenType::Preceding
752 | TokenType::Following
753 | TokenType::Current
754 | TokenType::Row
755 | TokenType::Unbounded
756 | TokenType::Array
757 | TokenType::Struct
758 | TokenType::Map
759 | TokenType::PrimaryKey
760 | TokenType::Key
761 | TokenType::ForeignKey
762 | TokenType::References
763 | TokenType::Unique
764 | TokenType::Check
765 | TokenType::Default
766 | TokenType::Constraint
767 | TokenType::Comment
768 | TokenType::Rollup
769 | TokenType::Cube
770 | TokenType::Grant
771 | TokenType::Revoke
772 | TokenType::Type
773 | TokenType::Use
774 | TokenType::Cache
775 | TokenType::Uncache
776 | TokenType::Load
777 | TokenType::Any
778 | TokenType::Some
779 | TokenType::Asc
780 | TokenType::Desc
781 | TokenType::Nulls
782 | TokenType::Lateral
783 | TokenType::Natural
784 | TokenType::Escape
785 | TokenType::Glob
786 | TokenType::Match
787 | TokenType::Recursive
788 | TokenType::Replace
789 | TokenType::Returns
790 | TokenType::If
791 | TokenType::Pivot
792 | TokenType::Unpivot
793 | TokenType::Json
794 | TokenType::Blob
795 | TokenType::Text
796 | TokenType::Int
797 | TokenType::BigInt
798 | TokenType::SmallInt
799 | TokenType::TinyInt
800 | TokenType::Int128
801 | TokenType::UInt128
802 | TokenType::Int256
803 | TokenType::UInt256
804 | TokenType::UInt
805 | TokenType::UBigInt
806 | TokenType::Float
807 | TokenType::Double
808 | TokenType::Decimal
809 | TokenType::Boolean
810 | TokenType::VarChar
811 | TokenType::Char
812 | TokenType::Binary
813 | TokenType::VarBinary
814 | TokenType::No
815 | TokenType::DateTime
816 | TokenType::Truncate
817 | TokenType::Execute
818 | TokenType::Merge
819 | TokenType::Top
820 | TokenType::Begin
821 | TokenType::Generated
822 | TokenType::Identity
823 | TokenType::Always
824 | TokenType::Extract
825 | TokenType::AsOf
827 | TokenType::Prior
828 | TokenType::After
829 | TokenType::Restrict
830 | TokenType::Cascade
831 | TokenType::Local
832 | TokenType::Rename
833 | TokenType::Enum
834 | TokenType::Within
835 | TokenType::Format
836 | TokenType::Final
837 | TokenType::FileFormat
838 | TokenType::Input
839 | TokenType::InputFormat
840 | TokenType::Copy
841 | TokenType::Put
842 | TokenType::Get
843 | TokenType::Show
844 | TokenType::Serde
845 | TokenType::Sample
846 | TokenType::Sort
847 | TokenType::Collate
848 | TokenType::Ties
849 | TokenType::IsNull
850 | TokenType::NotNull
851 | TokenType::Exclude
852 | TokenType::Temporary
853 | TokenType::Add
854 | TokenType::Ordinality
855 | TokenType::Overlaps
856 | TokenType::Block
857 | TokenType::Pattern
858 | TokenType::Group
859 | TokenType::Cluster
860 | TokenType::Repeatable
861 | TokenType::Groups
862 | TokenType::Commit
863 | TokenType::Warehouse
864 | TokenType::System
865 | TokenType::By
866 | TokenType::To
867 | TokenType::Fetch
868 | TokenType::For
869 | TokenType::Only
870 | TokenType::Next
871 | TokenType::Lock
872 | TokenType::Refresh
873 | TokenType::Settings
874 | TokenType::Operator
875 | TokenType::Overwrite
876 | TokenType::StraightJoin
877 | TokenType::Start
878 | TokenType::Ignore
880 | TokenType::Domain
881 | TokenType::Apply
882 | TokenType::Respect
883 | TokenType::Materialized
884 | TokenType::Prewhere
885 | TokenType::Old
886 | TokenType::New
887 | TokenType::Cast
888 | TokenType::TryCast
889 | TokenType::SafeCast
890 | TokenType::Transaction
891 | TokenType::Describe
892 | TokenType::Kill
893 | TokenType::Lambda
894 | TokenType::Declare
895 | TokenType::Keep
896 | TokenType::Output
897 | TokenType::Percent
898 | TokenType::Qualify
899 | TokenType::Returning
900 | TokenType::Language
901 | TokenType::Preserve
902 | TokenType::Savepoint
903 | TokenType::Rollback
904 | TokenType::Body
905 | TokenType::Increment
906 | TokenType::Minvalue
907 | TokenType::Maxvalue
908 | TokenType::Cycle
909 | TokenType::NoCycle
910 | TokenType::Seed
911 | TokenType::Namespace
912 | TokenType::Authorization
913 | TokenType::Order
914 | TokenType::Restart
915 | TokenType::Before
916 | TokenType::Instead
917 | TokenType::Each
918 | TokenType::Statement
919 | TokenType::Referencing
920 | TokenType::Of
921 | TokenType::Separator
922 | TokenType::Others
923 | TokenType::Placing
924 | TokenType::Owned
925 | TokenType::Running
926 | TokenType::Define
927 | TokenType::Measures
928 | TokenType::MatchRecognize
929 | TokenType::AutoIncrement
930 | TokenType::Connect
931 | TokenType::Distribute
932 | TokenType::Bernoulli
933 | TokenType::TableSample
934 | TokenType::Inpath
935 | TokenType::Pragma
936 | TokenType::Siblings
937 | TokenType::SerdeProperties
938 | TokenType::RLike
939 )
940 }
941
942 pub fn is_comparison(&self) -> bool {
944 matches!(
945 self,
946 TokenType::Eq
947 | TokenType::Neq
948 | TokenType::Lt
949 | TokenType::Lte
950 | TokenType::Gt
951 | TokenType::Gte
952 | TokenType::NullsafeEq
953 )
954 }
955
956 pub fn is_arithmetic(&self) -> bool {
958 matches!(
959 self,
960 TokenType::Plus
961 | TokenType::Dash
962 | TokenType::Star
963 | TokenType::Slash
964 | TokenType::Percent
965 | TokenType::Mod
966 | TokenType::Div
967 )
968 }
969}
970
971impl fmt::Display for TokenType {
972 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
973 write!(f, "{:?}", self)
974 }
975}
976
977static DEFAULT_KEYWORDS: LazyLock<HashMap<String, TokenType>> = LazyLock::new(|| {
980 let mut keywords = HashMap::with_capacity(300);
981 keywords.insert("SELECT".to_string(), TokenType::Select);
983 keywords.insert("FROM".to_string(), TokenType::From);
984 keywords.insert("WHERE".to_string(), TokenType::Where);
985 keywords.insert("AND".to_string(), TokenType::And);
986 keywords.insert("OR".to_string(), TokenType::Or);
987 keywords.insert("NOT".to_string(), TokenType::Not);
988 keywords.insert("AS".to_string(), TokenType::As);
989 keywords.insert("ON".to_string(), TokenType::On);
990 keywords.insert("JOIN".to_string(), TokenType::Join);
991 keywords.insert("LEFT".to_string(), TokenType::Left);
992 keywords.insert("RIGHT".to_string(), TokenType::Right);
993 keywords.insert("INNER".to_string(), TokenType::Inner);
994 keywords.insert("OUTER".to_string(), TokenType::Outer);
995 keywords.insert("OUTPUT".to_string(), TokenType::Output);
996 keywords.insert("FULL".to_string(), TokenType::Full);
997 keywords.insert("CROSS".to_string(), TokenType::Cross);
998 keywords.insert("SEMI".to_string(), TokenType::Semi);
999 keywords.insert("ANTI".to_string(), TokenType::Anti);
1000 keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1001 keywords.insert("UNION".to_string(), TokenType::Union);
1002 keywords.insert("EXCEPT".to_string(), TokenType::Except);
1003 keywords.insert("MINUS".to_string(), TokenType::Except); keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1005 keywords.insert("GROUP".to_string(), TokenType::Group);
1006 keywords.insert("CUBE".to_string(), TokenType::Cube);
1007 keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1008 keywords.insert("WITHIN".to_string(), TokenType::Within);
1009 keywords.insert("ORDER".to_string(), TokenType::Order);
1010 keywords.insert("BY".to_string(), TokenType::By);
1011 keywords.insert("HAVING".to_string(), TokenType::Having);
1012 keywords.insert("LIMIT".to_string(), TokenType::Limit);
1013 keywords.insert("OFFSET".to_string(), TokenType::Offset);
1014 keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1015 keywords.insert("FETCH".to_string(), TokenType::Fetch);
1016 keywords.insert("FIRST".to_string(), TokenType::First);
1017 keywords.insert("NEXT".to_string(), TokenType::Next);
1018 keywords.insert("ONLY".to_string(), TokenType::Only);
1019 keywords.insert("KEEP".to_string(), TokenType::Keep);
1020 keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1021 keywords.insert("INPUT".to_string(), TokenType::Input);
1022 keywords.insert("CASE".to_string(), TokenType::Case);
1023 keywords.insert("WHEN".to_string(), TokenType::When);
1024 keywords.insert("THEN".to_string(), TokenType::Then);
1025 keywords.insert("ELSE".to_string(), TokenType::Else);
1026 keywords.insert("END".to_string(), TokenType::End);
1027 keywords.insert("ENDIF".to_string(), TokenType::End); keywords.insert("NULL".to_string(), TokenType::Null);
1029 keywords.insert("TRUE".to_string(), TokenType::True);
1030 keywords.insert("FALSE".to_string(), TokenType::False);
1031 keywords.insert("IS".to_string(), TokenType::Is);
1032 keywords.insert("IN".to_string(), TokenType::In);
1033 keywords.insert("BETWEEN".to_string(), TokenType::Between);
1034 keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1035 keywords.insert("LIKE".to_string(), TokenType::Like);
1036 keywords.insert("ILIKE".to_string(), TokenType::ILike);
1037 keywords.insert("RLIKE".to_string(), TokenType::RLike);
1038 keywords.insert("REGEXP".to_string(), TokenType::RLike);
1039 keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1040 keywords.insert("EXISTS".to_string(), TokenType::Exists);
1041 keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1042 keywords.insert("ALL".to_string(), TokenType::All);
1043 keywords.insert("WITH".to_string(), TokenType::With);
1044 keywords.insert("CREATE".to_string(), TokenType::Create);
1045 keywords.insert("DROP".to_string(), TokenType::Drop);
1046 keywords.insert("ALTER".to_string(), TokenType::Alter);
1047 keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1048 keywords.insert("TABLE".to_string(), TokenType::Table);
1049 keywords.insert("VIEW".to_string(), TokenType::View);
1050 keywords.insert("INDEX".to_string(), TokenType::Index);
1051 keywords.insert("COLUMN".to_string(), TokenType::Column);
1052 keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1053 keywords.insert("ADD".to_string(), TokenType::Add);
1054 keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1055 keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1056 keywords.insert("RENAME".to_string(), TokenType::Rename);
1057 keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1058 keywords.insert("TEMP".to_string(), TokenType::Temporary);
1059 keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1060 keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1061 keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1062 keywords.insert("KEY".to_string(), TokenType::Key);
1063 keywords.insert("KILL".to_string(), TokenType::Kill);
1064 keywords.insert("REFERENCES".to_string(), TokenType::References);
1065 keywords.insert("DEFAULT".to_string(), TokenType::Default);
1066 keywords.insert("DECLARE".to_string(), TokenType::Declare);
1067 keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
1068 keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement); keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1070 keywords.insert("REPLACE".to_string(), TokenType::Replace);
1071 keywords.insert("TO".to_string(), TokenType::To);
1072 keywords.insert("INSERT".to_string(), TokenType::Insert);
1073 keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1074 keywords.insert("UPDATE".to_string(), TokenType::Update);
1075 keywords.insert("USE".to_string(), TokenType::Use);
1076 keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1077 keywords.insert("GLOB".to_string(), TokenType::Glob);
1078 keywords.insert("DELETE".to_string(), TokenType::Delete);
1079 keywords.insert("MERGE".to_string(), TokenType::Merge);
1080 keywords.insert("CACHE".to_string(), TokenType::Cache);
1081 keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1082 keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1083 keywords.insert("GRANT".to_string(), TokenType::Grant);
1084 keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1085 keywords.insert("COMMENT".to_string(), TokenType::Comment);
1086 keywords.insert("COLLATE".to_string(), TokenType::Collate);
1087 keywords.insert("INTO".to_string(), TokenType::Into);
1088 keywords.insert("VALUES".to_string(), TokenType::Values);
1089 keywords.insert("SET".to_string(), TokenType::Set);
1090 keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1091 keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1092 keywords.insert("ASC".to_string(), TokenType::Asc);
1093 keywords.insert("DESC".to_string(), TokenType::Desc);
1094 keywords.insert("NULLS".to_string(), TokenType::Nulls);
1095 keywords.insert("RESPECT".to_string(), TokenType::Respect);
1096 keywords.insert("FIRST".to_string(), TokenType::First);
1097 keywords.insert("LAST".to_string(), TokenType::Last);
1098 keywords.insert("IF".to_string(), TokenType::If);
1099 keywords.insert("CAST".to_string(), TokenType::Cast);
1100 keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1101 keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1102 keywords.insert("OVER".to_string(), TokenType::Over);
1103 keywords.insert("PARTITION".to_string(), TokenType::Partition);
1104 keywords.insert("PLACING".to_string(), TokenType::Placing);
1105 keywords.insert("WINDOW".to_string(), TokenType::Window);
1106 keywords.insert("ROWS".to_string(), TokenType::Rows);
1107 keywords.insert("RANGE".to_string(), TokenType::Range);
1108 keywords.insert("FILTER".to_string(), TokenType::Filter);
1109 keywords.insert("NATURAL".to_string(), TokenType::Natural);
1110 keywords.insert("USING".to_string(), TokenType::Using);
1111 keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1112 keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1113 keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1114 keywords.insert("CURRENT".to_string(), TokenType::Current);
1115 keywords.insert("ROW".to_string(), TokenType::Row);
1116 keywords.insert("GROUPS".to_string(), TokenType::Groups);
1117 keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1118 keywords.insert("BOTH".to_string(), TokenType::Both);
1120 keywords.insert("LEADING".to_string(), TokenType::Leading);
1121 keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1122 keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1123 keywords.insert("TOP".to_string(), TokenType::Top);
1125 keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1126 keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1127 keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1128 keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1129 keywords.insert("SYSTEM".to_string(), TokenType::System);
1130 keywords.insert("BLOCK".to_string(), TokenType::Block);
1131 keywords.insert("SEED".to_string(), TokenType::Seed);
1132 keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1133 keywords.insert("TIES".to_string(), TokenType::Ties);
1134 keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1135 keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1136 keywords.insert("APPLY".to_string(), TokenType::Apply);
1137 keywords.insert("CONNECT".to_string(), TokenType::Connect);
1139 keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1141 keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1142 keywords.insert("SORT".to_string(), TokenType::Sort);
1143 keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1144 keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1145 keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1146 keywords.insert("FOR".to_string(), TokenType::For);
1147 keywords.insert("ANY".to_string(), TokenType::Any);
1148 keywords.insert("SOME".to_string(), TokenType::Some);
1149 keywords.insert("ASOF".to_string(), TokenType::AsOf);
1150 keywords.insert("PERCENT".to_string(), TokenType::Percent);
1151 keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1152 keywords.insert("NO".to_string(), TokenType::No);
1153 keywords.insert("OTHERS".to_string(), TokenType::Others);
1154 keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1156 keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1158 keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1159 keywords.insert("DATABASE".to_string(), TokenType::Database);
1160 keywords.insert("FUNCTION".to_string(), TokenType::Function);
1161 keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1162 keywords.insert("PROC".to_string(), TokenType::Procedure);
1163 keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1164 keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1165 keywords.insert("TYPE".to_string(), TokenType::Type);
1166 keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1167 keywords.insert("RETURNS".to_string(), TokenType::Returns);
1168 keywords.insert("RETURNING".to_string(), TokenType::Returning);
1169 keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1170 keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1171 keywords.insert("COMMIT".to_string(), TokenType::Commit);
1172 keywords.insert("BEGIN".to_string(), TokenType::Begin);
1173 keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1174 keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1175 keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1176 keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1177 keywords.insert("BODY".to_string(), TokenType::Body);
1178 keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1179 keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1180 keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1181 keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1182 keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1183 keywords.insert("PRIOR".to_string(), TokenType::Prior);
1184 keywords.insert("MATCH".to_string(), TokenType::Match);
1186 keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1187 keywords.insert("MEASURES".to_string(), TokenType::Measures);
1188 keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1189 keywords.insert("DEFINE".to_string(), TokenType::Define);
1190 keywords.insert("RUNNING".to_string(), TokenType::Running);
1191 keywords.insert("FINAL".to_string(), TokenType::Final);
1192 keywords.insert("OWNED".to_string(), TokenType::Owned);
1193 keywords.insert("AFTER".to_string(), TokenType::After);
1194 keywords.insert("BEFORE".to_string(), TokenType::Before);
1195 keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1196 keywords.insert("EACH".to_string(), TokenType::Each);
1197 keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1198 keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1199 keywords.insert("OLD".to_string(), TokenType::Old);
1200 keywords.insert("NEW".to_string(), TokenType::New);
1201 keywords.insert("OF".to_string(), TokenType::Of);
1202 keywords.insert("CHECK".to_string(), TokenType::Check);
1203 keywords.insert("START".to_string(), TokenType::Start);
1204 keywords.insert("ENUM".to_string(), TokenType::Enum);
1205 keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1206 keywords.insert("RESTART".to_string(), TokenType::Restart);
1207 keywords.insert("DATE".to_string(), TokenType::Date);
1209 keywords.insert("TIME".to_string(), TokenType::Time);
1210 keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1211 keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1212 keywords.insert("GENERATED".to_string(), TokenType::Generated);
1213 keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1214 keywords.insert("ALWAYS".to_string(), TokenType::Always);
1215 keywords.insert("LOAD".to_string(), TokenType::Load);
1217 keywords.insert("LOCAL".to_string(), TokenType::Local);
1218 keywords.insert("INPATH".to_string(), TokenType::Inpath);
1219 keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1220 keywords.insert("SERDE".to_string(), TokenType::Serde);
1221 keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1222 keywords.insert("FORMAT".to_string(), TokenType::Format);
1223 keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1225 keywords.insert("SHOW".to_string(), TokenType::Show);
1227 keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1229 keywords.insert("COPY".to_string(), TokenType::Copy);
1231 keywords.insert("PUT".to_string(), TokenType::Put);
1232 keywords.insert("GET".to_string(), TokenType::Get);
1233 keywords.insert("EXEC".to_string(), TokenType::Execute);
1235 keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1236 keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1238 keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1239 keywords
1240});
1241
1242static DEFAULT_SINGLE_TOKENS: LazyLock<HashMap<char, TokenType>> = LazyLock::new(|| {
1243 let mut single_tokens = HashMap::with_capacity(30);
1244 single_tokens.insert('(', TokenType::LParen);
1245 single_tokens.insert(')', TokenType::RParen);
1246 single_tokens.insert('[', TokenType::LBracket);
1247 single_tokens.insert(']', TokenType::RBracket);
1248 single_tokens.insert('{', TokenType::LBrace);
1249 single_tokens.insert('}', TokenType::RBrace);
1250 single_tokens.insert(',', TokenType::Comma);
1251 single_tokens.insert('.', TokenType::Dot);
1252 single_tokens.insert(';', TokenType::Semicolon);
1253 single_tokens.insert('+', TokenType::Plus);
1254 single_tokens.insert('-', TokenType::Dash);
1255 single_tokens.insert('*', TokenType::Star);
1256 single_tokens.insert('/', TokenType::Slash);
1257 single_tokens.insert('%', TokenType::Percent);
1258 single_tokens.insert('&', TokenType::Amp);
1259 single_tokens.insert('|', TokenType::Pipe);
1260 single_tokens.insert('^', TokenType::Caret);
1261 single_tokens.insert('~', TokenType::Tilde);
1262 single_tokens.insert('<', TokenType::Lt);
1263 single_tokens.insert('>', TokenType::Gt);
1264 single_tokens.insert('=', TokenType::Eq);
1265 single_tokens.insert('!', TokenType::Exclamation);
1266 single_tokens.insert(':', TokenType::Colon);
1267 single_tokens.insert('@', TokenType::DAt);
1268 single_tokens.insert('#', TokenType::Hash);
1269 single_tokens.insert('$', TokenType::Dollar);
1270 single_tokens.insert('?', TokenType::Parameter);
1271 single_tokens
1272});
1273
/// Default string-literal delimiters: each opening delimiter maps to the
/// delimiter that closes it (here every delimiter closes itself).
static DEFAULT_QUOTES: LazyLock<HashMap<String, String>> = LazyLock::new(|| {
    let mut table = HashMap::with_capacity(4);
    // Single quote and triple double quote.
    for delimiter in ["'", "\"\"\""] {
        table.insert(delimiter.to_owned(), delimiter.to_owned());
    }
    table
});
1281
/// Default quoted-identifier delimiters: opening character mapped to the
/// character that closes the identifier (double quote and backtick).
static DEFAULT_IDENTIFIERS: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
    let mut table = HashMap::with_capacity(4);
    // Both default delimiters are closed by the same character that opens them.
    table.extend(['"', '`'].map(|quote| (quote, quote)));
    table
});
1290
/// Default comment delimiters: the opening marker maps to its closing marker,
/// or to `None` when the comment has no closing marker (`--`).
static DEFAULT_COMMENTS: LazyLock<HashMap<String, Option<String>>> = LazyLock::new(|| {
    let mut table = HashMap::with_capacity(4);
    for (opener, closer) in [("--", None), ("/*", Some("*/"))] {
        table.insert(opener.to_owned(), closer.map(str::to_owned));
    }
    table
});
1297
#[derive(Debug, Clone)]
/// Settings that control how [`Tokenizer`] splits SQL text into tokens.
///
/// The `Default` implementation fills the lookup tables from the module-level
/// `DEFAULT_*` statics and turns every flag off except `nested_comments` and
/// `string_escapes_allowed_in_raw_strings`.
pub struct TokenizerConfig {
    /// Keyword spellings mapped to the token type they produce
    /// (the default table uses upper-case keys).
    pub keywords: HashMap<String, TokenType>,
    /// Characters that form a complete token on their own.
    pub single_tokens: HashMap<char, TokenType>,
    /// String-literal delimiters. The scanner probes this map for the keys
    /// `'''`, `"""` and `"` to decide whether triple-quoted and double-quoted
    /// strings are enabled; the defaults map each delimiter to itself.
    pub quotes: HashMap<String, String>,
    /// Opening character of a quoted identifier mapped to its closing character.
    pub identifiers: HashMap<char, char>,
    /// Comment openers mapped to their closing marker (`None` for `--` in the
    /// defaults). The whitespace skipper also checks for a `//` key to enable
    /// `//` line comments.
    pub comments: HashMap<String, Option<String>>,
    /// Escape characters for string literals. Backslash escape sequences are
    /// only decoded when this list contains `\`.
    pub string_escapes: Vec<char>,
    /// Whether a `/*` inside a block comment opens a nested comment.
    pub nested_comments: bool,
    /// When non-empty, a backslash before an otherwise unrecognised character
    /// is dropped and only the character is kept; when empty the backslash is
    /// preserved. NOTE(review): only the emptiness is consulted in the string
    /// scanners visible here — confirm whether the contents are used elsewhere.
    pub escape_follow_chars: Vec<char>,
    /// NOTE(review): not referenced in the reviewed part of the file;
    /// presumably selects whether a `b'...'` literal is a byte string — confirm.
    pub b_prefix_is_byte_string: bool,
    /// Numeric-literal suffixes (one or two characters, looked up upper-cased)
    /// mapped to a type name; a matching suffix turns the number token text
    /// into `<number>::<type name>`.
    pub numeric_literals: HashMap<String, String>,
    /// Enables special handling in the number scanner when digits are directly
    /// followed by a letter or `_`. NOTE(review): presumably the text is then
    /// re-read as an identifier — confirm in `scan_number`.
    pub identifiers_can_start_with_digit: bool,
    /// Whether the number scanner recognises `0x`/`0X` prefixed literals.
    pub hex_number_strings: bool,
    /// When set, a `0x…` literal is emitted as a `HexNumber` token instead of a
    /// `HexString` token.
    pub hex_string_is_integer_type: bool,
    /// NOTE(review): not referenced in the reviewed part of the file;
    /// presumably controls escape decoding inside raw strings — confirm.
    pub string_escapes_allowed_in_raw_strings: bool,
    /// When set, `#` starts a line comment; the same flag also enables the
    /// dedicated `//` line-comment scanner.
    pub hash_comments: bool,
    /// When set, a `$` that does not start a dollar-quoted string or positional
    /// parameter is handed to the dollar-identifier scanner.
    pub dollar_sign_is_identifier: bool,
    /// When set, the tokenizer attempts after every token to capture trailing
    /// raw data as a single `Var` token.
    pub insert_format_raw_data: bool,
    /// When set, `_` separators are stripped from the text of numeric literals.
    pub numbers_can_be_underscore_separated: bool,
    /// When set, a `\'` that is not followed by any later `'` in the input is
    /// read as a literal backslash followed by the closing quote.
    pub recover_terminal_backslash_quote: bool,
    /// When set, a single-quoted string that reaches end of input without a
    /// closing quote is emitted as a string token instead of raising an error.
    pub recover_unterminated_string: bool,
}
1366
1367impl Default for TokenizerConfig {
1368 fn default() -> Self {
1369 Self {
1370 keywords: DEFAULT_KEYWORDS.clone(),
1371 single_tokens: DEFAULT_SINGLE_TOKENS.clone(),
1372 quotes: DEFAULT_QUOTES.clone(),
1373 identifiers: DEFAULT_IDENTIFIERS.clone(),
1374 comments: DEFAULT_COMMENTS.clone(),
1375 string_escapes: vec!['\''],
1378 nested_comments: true,
1379 escape_follow_chars: vec![],
1381 b_prefix_is_byte_string: false,
1383 numeric_literals: HashMap::new(),
1384 identifiers_can_start_with_digit: false,
1385 hex_number_strings: false,
1386 hex_string_is_integer_type: false,
1387 string_escapes_allowed_in_raw_strings: true,
1390 hash_comments: false,
1391 dollar_sign_is_identifier: false,
1392 insert_format_raw_data: false,
1393 numbers_can_be_underscore_separated: false,
1394 recover_terminal_backslash_quote: false,
1395 recover_unterminated_string: false,
1396 }
1397 }
1398}
1399
/// SQL tokenizer: turns source text into a list of [`Token`]s according to a
/// [`TokenizerConfig`].
pub struct Tokenizer {
    /// Settings applied to every call to `tokenize`.
    config: TokenizerConfig,
}
1404
1405impl Tokenizer {
1406 pub fn new(config: TokenizerConfig) -> Self {
1408 Self { config }
1409 }
1410
1411 pub fn default_config() -> Self {
1413 Self::new(TokenizerConfig::default())
1414 }
1415
1416 pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1418 let mut state = TokenizerState::new(sql, &self.config);
1419 state.tokenize()
1420 }
1421}
1422
1423impl Default for Tokenizer {
1424 fn default() -> Self {
1425 Self::default_config()
1426 }
1427}
1428
/// Mutable scanning state for a single tokenization run over one input string.
struct TokenizerState<'a> {
    /// The original input text.
    source: &'a str,
    /// Whether `source` is pure ASCII; when true, character offsets can be used
    /// directly as byte offsets into `source`.
    source_is_ascii: bool,
    /// The input decoded into characters; all positions below index this vector.
    chars: Vec<char>,
    /// Number of characters in `chars`.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Character index at which the token currently being scanned starts.
    start: usize,
    /// Character index of the next unread character.
    current: usize,
    /// Current line number; starts at 1 and is bumped on every consumed `\n`.
    line: usize,
    /// Current column; starts at 1 and is reset to 1 after every consumed `\n`.
    column: usize,
    /// Comments collected but not yet attached to a token.
    comments: Vec<String>,
    /// Configuration driving the scan.
    config: &'a TokenizerConfig,
}
1443
1444impl<'a> TokenizerState<'a> {
1445 fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1446 let chars: Vec<char> = sql.chars().collect();
1447 let size = chars.len();
1448 Self {
1449 source: sql,
1450 source_is_ascii: sql.is_ascii(),
1451 chars,
1452 size,
1453 tokens: Vec::new(),
1454 start: 0,
1455 current: 0,
1456 line: 1,
1457 column: 1,
1458 comments: Vec::new(),
1459 config,
1460 }
1461 }
1462
1463 fn tokenize(&mut self) -> Result<Vec<Token>> {
1464 while !self.is_at_end() {
1465 self.skip_whitespace();
1466 if self.is_at_end() {
1467 break;
1468 }
1469
1470 self.start = self.current;
1471 self.scan_token()?;
1472
1473 if self.config.insert_format_raw_data {
1476 if let Some(raw) = self.try_scan_insert_format_raw_data() {
1477 if !raw.is_empty() {
1478 self.start = self.current;
1479 self.add_token_with_text(TokenType::Var, raw);
1480 }
1481 }
1482 }
1483 }
1484
1485 if !self.comments.is_empty() {
1490 if let Some(last) = self.tokens.last_mut() {
1491 last.trailing_comments.extend(self.comments.drain(..));
1492 }
1493 }
1494
1495 Ok(std::mem::take(&mut self.tokens))
1496 }
1497
1498 #[inline]
1499 fn is_at_end(&self) -> bool {
1500 self.current >= self.size
1501 }
1502
1503 #[inline]
1504 fn text_from_range(&self, start: usize, end: usize) -> String {
1505 if self.source_is_ascii {
1506 self.source[start..end].to_string()
1507 } else {
1508 self.chars[start..end].iter().collect()
1509 }
1510 }
1511
1512 #[inline]
1513 fn peek(&self) -> char {
1514 if self.is_at_end() {
1515 '\0'
1516 } else {
1517 self.chars[self.current]
1518 }
1519 }
1520
1521 #[inline]
1522 fn peek_next(&self) -> char {
1523 if self.current + 1 >= self.size {
1524 '\0'
1525 } else {
1526 self.chars[self.current + 1]
1527 }
1528 }
1529
1530 #[inline]
1531 fn advance(&mut self) -> char {
1532 let c = self.peek();
1533 self.current += 1;
1534 if c == '\n' {
1535 self.line += 1;
1536 self.column = 1;
1537 } else {
1538 self.column += 1;
1539 }
1540 c
1541 }
1542
1543 fn skip_whitespace(&mut self) {
1544 let mut saw_newline = false;
1549 while !self.is_at_end() {
1550 let c = self.peek();
1551 match c {
1552 ' ' | '\t' | '\r' => {
1553 self.advance();
1554 }
1555 '\n' => {
1556 saw_newline = true;
1557 self.advance();
1558 }
1559 '\u{00A0}' | '\u{2000}'..='\u{200B}' | '\u{3000}' | '\u{FEFF}' => {
1564 self.advance();
1565 }
1566 '-' if self.peek_next() == '-' => {
1567 self.scan_line_comment(saw_newline);
1568 saw_newline = true;
1570 }
1571 '/' if self.peek_next() == '/' && self.config.hash_comments => {
1572 self.scan_double_slash_comment();
1574 }
1575 '/' if self.peek_next() == '*' => {
1576 if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
1578 break;
1580 }
1581 if self.scan_block_comment(saw_newline).is_err() {
1582 return;
1583 }
1584 }
1586 '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
1587 let prev_non_ws = if self.current > 0 {
1591 let mut i = self.current - 1;
1592 while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
1593 i -= 1;
1594 }
1595 self.chars[i]
1596 } else {
1597 '\0'
1598 };
1599 if prev_non_ws == ':' || prev_non_ws == '/' {
1600 break;
1602 }
1603 self.scan_line_comment(saw_newline);
1604 saw_newline = true;
1606 }
1607 '#' if self.config.hash_comments => {
1608 self.scan_hash_line_comment();
1609 }
1610 _ => break,
1611 }
1612 }
1613 }
1614
1615 fn scan_hash_line_comment(&mut self) {
1616 self.advance(); let start = self.current;
1618 while !self.is_at_end() && self.peek() != '\n' {
1619 self.advance();
1620 }
1621 let comment = self.text_from_range(start, self.current);
1622 let comment_text = comment.trim().to_string();
1623 if let Some(last) = self.tokens.last_mut() {
1624 last.trailing_comments.push(comment_text);
1625 } else {
1626 self.comments.push(comment_text);
1627 }
1628 }
1629
1630 fn scan_double_slash_comment(&mut self) {
1631 self.advance(); self.advance(); let start = self.current;
1634 while !self.is_at_end() && self.peek() != '\n' {
1635 self.advance();
1636 }
1637 let comment = self.text_from_range(start, self.current);
1638 let comment_text = comment.trim().to_string();
1639 if let Some(last) = self.tokens.last_mut() {
1640 last.trailing_comments.push(comment_text);
1641 } else {
1642 self.comments.push(comment_text);
1643 }
1644 }
1645
1646 fn scan_line_comment(&mut self, after_newline: bool) {
1647 self.advance(); self.advance(); let start = self.current;
1650 while !self.is_at_end() && self.peek() != '\n' {
1651 self.advance();
1652 }
1653 let comment_text = self.text_from_range(start, self.current);
1654
1655 if after_newline || self.tokens.is_empty() {
1658 self.comments.push(comment_text);
1659 } else if let Some(last) = self.tokens.last_mut() {
1660 last.trailing_comments.push(comment_text);
1661 }
1662 }
1663
1664 fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
1665 self.advance(); self.advance(); let content_start = self.current;
1668 let mut depth = 1;
1669
1670 while !self.is_at_end() && depth > 0 {
1671 if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
1672 self.advance();
1673 self.advance();
1674 depth += 1;
1675 } else if self.peek() == '*' && self.peek_next() == '/' {
1676 depth -= 1;
1677 if depth > 0 {
1678 self.advance();
1679 self.advance();
1680 }
1681 } else {
1682 self.advance();
1683 }
1684 }
1685
1686 if depth > 0 {
1687 return Err(Error::tokenize(
1688 "Unterminated block comment",
1689 self.line,
1690 self.column,
1691 self.start,
1692 self.current,
1693 ));
1694 }
1695
1696 let content = self.text_from_range(content_start, self.current);
1698 self.advance(); self.advance(); let comment_text = format!("/*{}*/", content);
1703
1704 if after_newline || self.tokens.is_empty() {
1707 self.comments.push(comment_text);
1708 } else if let Some(last) = self.tokens.last_mut() {
1709 last.trailing_comments.push(comment_text);
1710 }
1711
1712 Ok(())
1713 }
1714
1715 fn scan_hint(&mut self) -> Result<()> {
1717 self.advance(); self.advance(); self.advance(); let hint_start = self.current;
1721
1722 while !self.is_at_end() {
1724 if self.peek() == '*' && self.peek_next() == '/' {
1725 break;
1726 }
1727 self.advance();
1728 }
1729
1730 if self.is_at_end() {
1731 return Err(Error::tokenize(
1732 "Unterminated hint comment",
1733 self.line,
1734 self.column,
1735 self.start,
1736 self.current,
1737 ));
1738 }
1739
1740 let hint_text = self.text_from_range(hint_start, self.current);
1741 self.advance(); self.advance(); self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1745
1746 Ok(())
1747 }
1748
1749 fn scan_positional_parameter(&mut self) -> Result<()> {
1751 self.advance(); let start = self.current;
1753
1754 while !self.is_at_end() && self.peek().is_ascii_digit() {
1755 self.advance();
1756 }
1757
1758 let number = self.text_from_range(start, self.current);
1759 self.add_token_with_text(TokenType::Parameter, number);
1760 Ok(())
1761 }
1762
1763 fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
1768 let saved_pos = self.current;
1769
1770 self.advance(); let tag_start = self.current;
1776 while !self.is_at_end()
1777 && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1778 {
1779 self.advance();
1780 }
1781 let tag = self.text_from_range(tag_start, self.current);
1782
1783 if self.is_at_end() || self.peek() != '$' {
1785 self.current = saved_pos;
1787 return Ok(None);
1788 }
1789 self.advance(); let content_start = self.current;
1793 let closing_tag = format!("${}$", tag);
1794 let closing_chars: Vec<char> = closing_tag.chars().collect();
1795
1796 loop {
1797 if self.is_at_end() {
1798 self.current = saved_pos;
1800 return Ok(None);
1801 }
1802
1803 if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1805 let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1806 self.current + j < self.size && self.chars[self.current + j] == ch
1807 });
1808 if matches {
1809 let content = self.text_from_range(content_start, self.current);
1810 for _ in 0..closing_chars.len() {
1812 self.advance();
1813 }
1814 let token_text = format!("{}\x00{}", tag, content);
1816 self.add_token_with_text(TokenType::DollarString, token_text);
1817 return Ok(Some(()));
1818 }
1819 }
1820 self.advance();
1821 }
1822 }
1823
1824 fn scan_dollar_quoted_string(&mut self) -> Result<()> {
1829 self.advance(); self.advance(); let start = self.current;
1834 while !self.is_at_end() {
1835 if self.peek() == '$'
1836 && self.current + 1 < self.size
1837 && self.chars[self.current + 1] == '$'
1838 {
1839 break;
1840 }
1841 self.advance();
1842 }
1843
1844 let content = self.text_from_range(start, self.current);
1845
1846 if !self.is_at_end() {
1847 self.advance(); self.advance(); }
1850
1851 self.add_token_with_text(TokenType::DollarString, content);
1852 Ok(())
1853 }
1854
1855 fn scan_token(&mut self) -> Result<()> {
1856 let c = self.peek();
1857
1858 if c == '\'' {
1860 if self.config.quotes.contains_key("'''")
1862 && self.peek_next() == '\''
1863 && self.current + 2 < self.size
1864 && self.chars[self.current + 2] == '\''
1865 {
1866 return self.scan_triple_quoted_string('\'');
1867 }
1868 return self.scan_string();
1869 }
1870
1871 if c == '"'
1873 && self.config.quotes.contains_key("\"\"\"")
1874 && self.peek_next() == '"'
1875 && self.current + 2 < self.size
1876 && self.chars[self.current + 2] == '"'
1877 {
1878 return self.scan_triple_quoted_string('"');
1879 }
1880
1881 if c == '"'
1884 && self.config.quotes.contains_key("\"")
1885 && !self.config.identifiers.contains_key(&'"')
1886 {
1887 return self.scan_double_quoted_string();
1888 }
1889
1890 if let Some(&end_quote) = self.config.identifiers.get(&c) {
1892 return self.scan_quoted_identifier(end_quote);
1893 }
1894
1895 if c.is_ascii_digit() {
1897 return self.scan_number();
1898 }
1899
1900 if c == '.' && self.peek_next().is_ascii_digit() {
1907 let prev_char = if self.current > 0 {
1908 self.chars[self.current - 1]
1909 } else {
1910 '\0'
1911 };
1912 let is_after_ident = prev_char.is_alphanumeric()
1913 || prev_char == '_'
1914 || prev_char == '`'
1915 || prev_char == '"'
1916 || prev_char == ']'
1917 || prev_char == ')';
1918 if prev_char != '.' && !is_after_ident {
1919 return self.scan_number_starting_with_dot();
1920 }
1921 }
1922
1923 if c == '/'
1925 && self.peek_next() == '*'
1926 && self.current + 2 < self.size
1927 && self.chars[self.current + 2] == '+'
1928 {
1929 return self.scan_hint();
1930 }
1931
1932 if let Some(token_type) = self.try_scan_multi_char_operator() {
1934 self.add_token(token_type);
1935 return Ok(());
1936 }
1937
1938 if c == '$'
1941 && (self.peek_next().is_alphanumeric()
1942 || self.peek_next() == '_'
1943 || !self.peek_next().is_ascii())
1944 {
1945 if let Some(()) = self.try_scan_tagged_dollar_string()? {
1946 return Ok(());
1947 }
1948 if self.config.dollar_sign_is_identifier {
1951 return self.scan_dollar_identifier();
1952 }
1953 }
1954
1955 if c == '$' && self.peek_next() == '$' {
1957 return self.scan_dollar_quoted_string();
1958 }
1959
1960 if c == '$' && self.peek_next().is_ascii_digit() {
1962 return self.scan_positional_parameter();
1963 }
1964
1965 if c == '$' && self.config.dollar_sign_is_identifier {
1967 return self.scan_dollar_identifier();
1968 }
1969
1970 if (c == '#' || c == '@')
1973 && (self.peek_next().is_alphanumeric()
1974 || self.peek_next() == '_'
1975 || self.peek_next() == '#')
1976 {
1977 return self.scan_tsql_identifier();
1978 }
1979
1980 if let Some(&token_type) = self.config.single_tokens.get(&c) {
1982 self.advance();
1983 self.add_token(token_type);
1984 return Ok(());
1985 }
1986
1987 if c == '\u{2212}' {
1989 self.advance();
1990 self.add_token(TokenType::Dash);
1991 return Ok(());
1992 }
1993
1994 if c == '\u{2044}' {
1996 self.advance();
1997 self.add_token(TokenType::Slash);
1998 return Ok(());
1999 }
2000
2001 if c == '\u{2018}' || c == '\u{2019}' {
2003 return self.scan_unicode_quoted_string(c);
2005 }
2006 if c == '\u{201C}' || c == '\u{201D}' {
2007 return self.scan_unicode_quoted_identifier(c);
2009 }
2010
2011 self.scan_identifier_or_keyword()
2013 }
2014
2015 fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
2016 let c = self.peek();
2017 let next = self.peek_next();
2018 let third = if self.current + 2 < self.size {
2019 self.chars[self.current + 2]
2020 } else {
2021 '\0'
2022 };
2023
2024 if c == '-' && next == '|' && third == '-' {
2027 self.advance();
2028 self.advance();
2029 self.advance();
2030 return Some(TokenType::Adjacent);
2031 }
2032
2033 if c == '|' && next == '|' && third == '/' {
2035 self.advance();
2036 self.advance();
2037 self.advance();
2038 return Some(TokenType::DPipeSlash);
2039 }
2040
2041 if c == '#' && next == '>' && third == '>' {
2043 self.advance();
2044 self.advance();
2045 self.advance();
2046 return Some(TokenType::DHashArrow);
2047 }
2048
2049 if c == '-' && next == '>' && third == '>' {
2051 self.advance();
2052 self.advance();
2053 self.advance();
2054 return Some(TokenType::DArrow);
2055 }
2056
2057 if c == '<' && next == '=' && third == '>' {
2059 self.advance();
2060 self.advance();
2061 self.advance();
2062 return Some(TokenType::NullsafeEq);
2063 }
2064
2065 if c == '<' && next == '-' && third == '>' {
2067 self.advance();
2068 self.advance();
2069 self.advance();
2070 return Some(TokenType::LrArrow);
2071 }
2072
2073 if c == '<' && next == '@' {
2075 self.advance();
2076 self.advance();
2077 return Some(TokenType::LtAt);
2078 }
2079
2080 if c == '@' && next == '>' {
2082 self.advance();
2083 self.advance();
2084 return Some(TokenType::AtGt);
2085 }
2086
2087 if c == '~' && next == '~' && third == '~' {
2089 self.advance();
2090 self.advance();
2091 self.advance();
2092 return Some(TokenType::Glob);
2093 }
2094
2095 if c == '~' && next == '~' && third == '*' {
2097 self.advance();
2098 self.advance();
2099 self.advance();
2100 return Some(TokenType::ILike);
2101 }
2102
2103 let fourth = if self.current + 3 < self.size {
2105 self.chars[self.current + 3]
2106 } else {
2107 '\0'
2108 };
2109 if c == '!' && next == '~' && third == '~' && fourth == '*' {
2110 self.advance();
2111 self.advance();
2112 self.advance();
2113 self.advance();
2114 return Some(TokenType::NotILike);
2115 }
2116
2117 if c == '!' && next == '~' && third == '~' {
2119 self.advance();
2120 self.advance();
2121 self.advance();
2122 return Some(TokenType::NotLike);
2123 }
2124
2125 if c == '!' && next == '~' && third == '*' {
2127 self.advance();
2128 self.advance();
2129 self.advance();
2130 return Some(TokenType::NotIRLike);
2131 }
2132
2133 if c == '!' && next == ':' && third == '>' {
2135 self.advance();
2136 self.advance();
2137 self.advance();
2138 return Some(TokenType::NColonGt);
2139 }
2140
2141 if c == '?' && next == ':' && third == ':' {
2143 self.advance();
2144 self.advance();
2145 self.advance();
2146 return Some(TokenType::QDColon);
2147 }
2148
2149 if c == '!' && next == '~' {
2151 self.advance();
2152 self.advance();
2153 return Some(TokenType::NotRLike);
2154 }
2155
2156 if c == '~' && next == '~' {
2158 self.advance();
2159 self.advance();
2160 return Some(TokenType::Like);
2161 }
2162
2163 if c == '~' && next == '*' {
2165 self.advance();
2166 self.advance();
2167 return Some(TokenType::IRLike);
2168 }
2169
2170 if c == ':' && next == ':' && third == '$' {
2173 self.advance();
2174 self.advance();
2175 self.advance();
2176 return Some(TokenType::DColonDollar);
2177 }
2178 if c == ':' && next == ':' && third == '%' {
2179 self.advance();
2180 self.advance();
2181 self.advance();
2182 return Some(TokenType::DColonPercent);
2183 }
2184 if c == ':' && next == ':' && third == '?' {
2185 self.advance();
2186 self.advance();
2187 self.advance();
2188 return Some(TokenType::DColonQMark);
2189 }
2190
2191 let token_type = match (c, next) {
2193 ('.', ':') => Some(TokenType::DotColon),
2194 ('=', '=') => Some(TokenType::Eq), ('<', '=') => Some(TokenType::Lte),
2196 ('>', '=') => Some(TokenType::Gte),
2197 ('!', '=') => Some(TokenType::Neq),
2198 ('<', '>') => Some(TokenType::Neq),
2199 ('^', '=') => Some(TokenType::Neq),
2200 ('<', '<') => Some(TokenType::LtLt),
2201 ('>', '>') => Some(TokenType::GtGt),
2202 ('|', '|') => Some(TokenType::DPipe),
2203 ('|', '/') => Some(TokenType::PipeSlash), (':', ':') => Some(TokenType::DColon),
2205 (':', '=') => Some(TokenType::ColonEq), (':', '>') => Some(TokenType::ColonGt), ('-', '>') => Some(TokenType::Arrow), ('=', '>') => Some(TokenType::FArrow), ('&', '&') => Some(TokenType::DAmp),
2210 ('&', '<') => Some(TokenType::AmpLt), ('&', '>') => Some(TokenType::AmpGt), ('@', '@') => Some(TokenType::AtAt), ('?', '|') => Some(TokenType::QMarkPipe), ('?', '&') => Some(TokenType::QMarkAmp), ('?', '?') => Some(TokenType::DQMark), ('#', '>') => Some(TokenType::HashArrow), ('#', '-') => Some(TokenType::HashDash), ('^', '@') => Some(TokenType::CaretAt), ('*', '*') => Some(TokenType::DStar), ('|', '>') => Some(TokenType::PipeGt), _ => None,
2222 };
2223
2224 if token_type.is_some() {
2225 self.advance();
2226 self.advance();
2227 }
2228
2229 token_type
2230 }
2231
2232 fn scan_string(&mut self) -> Result<()> {
2233 self.advance(); let mut value = String::new();
2235
2236 while !self.is_at_end() {
2237 let c = self.peek();
2238 if c == '\'' {
2239 if self.peek_next() == '\'' {
2240 value.push('\'');
2242 self.advance();
2243 self.advance();
2244 } else {
2245 break;
2246 }
2247 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2248 if self.config.recover_terminal_backslash_quote
2249 && self.peek_next() == '\''
2250 && !self.chars[self.current + 2..].contains(&'\'')
2251 {
2252 value.push(self.advance());
2253 break;
2254 }
2255
2256 self.advance(); if !self.is_at_end() {
2259 let escaped = self.advance();
2260 match escaped {
2261 'n' => value.push('\n'),
2262 'r' => value.push('\r'),
2263 't' => value.push('\t'),
2264 '0' => value.push('\0'),
2265 'Z' => value.push('\x1A'), 'a' => value.push('\x07'), 'b' => value.push('\x08'), 'f' => value.push('\x0C'), 'v' => value.push('\x0B'), 'x' => {
2271 let mut hex = String::with_capacity(2);
2273 for _ in 0..2 {
2274 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2275 hex.push(self.advance());
2276 }
2277 }
2278 if hex.len() == 2 {
2279 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2280 value.push(byte as char);
2281 } else {
2282 value.push('\\');
2283 value.push('x');
2284 value.push_str(&hex);
2285 }
2286 } else {
2287 value.push('\\');
2289 value.push('x');
2290 value.push_str(&hex);
2291 }
2292 }
2293 '\\' => value.push('\\'),
2294 '\'' => value.push('\''),
2295 '"' => value.push('"'),
2296 '%' => {
2297 value.push('%');
2299 }
2300 '_' => {
2301 value.push('_');
2303 }
2304 _ => {
2308 if !self.config.escape_follow_chars.is_empty() {
2309 value.push(escaped);
2311 } else {
2312 value.push('\\');
2314 value.push(escaped);
2315 }
2316 }
2317 }
2318 }
2319 } else {
2320 value.push(self.advance());
2321 }
2322 }
2323
2324 if self.is_at_end() {
2325 if self.config.recover_unterminated_string {
2326 self.add_token_with_text(TokenType::String, value);
2327 return Ok(());
2328 }
2329
2330 return Err(Error::tokenize(
2331 "Unterminated string",
2332 self.line,
2333 self.column,
2334 self.start,
2335 self.current,
2336 ));
2337 }
2338
2339 self.advance(); self.add_token_with_text(TokenType::String, value);
2341 Ok(())
2342 }
2343
2344 fn scan_double_quoted_string(&mut self) -> Result<()> {
2346 self.advance(); let mut value = String::new();
2348
2349 while !self.is_at_end() {
2350 let c = self.peek();
2351 if c == '"' {
2352 if self.peek_next() == '"' {
2353 value.push('"');
2355 self.advance();
2356 self.advance();
2357 } else {
2358 break;
2359 }
2360 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2361 self.advance(); if !self.is_at_end() {
2364 let escaped = self.advance();
2365 match escaped {
2366 'n' => value.push('\n'),
2367 'r' => value.push('\r'),
2368 't' => value.push('\t'),
2369 '0' => value.push('\0'),
2370 'Z' => value.push('\x1A'), 'a' => value.push('\x07'), 'b' => value.push('\x08'), 'f' => value.push('\x0C'), 'v' => value.push('\x0B'), 'x' => {
2376 let mut hex = String::with_capacity(2);
2378 for _ in 0..2 {
2379 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2380 hex.push(self.advance());
2381 }
2382 }
2383 if hex.len() == 2 {
2384 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2385 value.push(byte as char);
2386 } else {
2387 value.push('\\');
2388 value.push('x');
2389 value.push_str(&hex);
2390 }
2391 } else {
2392 value.push('\\');
2394 value.push('x');
2395 value.push_str(&hex);
2396 }
2397 }
2398 '\\' => value.push('\\'),
2399 '\'' => value.push('\''),
2400 '"' => value.push('"'),
2401 '%' => {
2402 value.push('%');
2404 }
2405 '_' => {
2406 value.push('_');
2408 }
2409 _ => {
2413 if !self.config.escape_follow_chars.is_empty() {
2414 value.push(escaped);
2416 } else {
2417 value.push('\\');
2419 value.push(escaped);
2420 }
2421 }
2422 }
2423 }
2424 } else {
2425 value.push(self.advance());
2426 }
2427 }
2428
2429 if self.is_at_end() {
2430 return Err(Error::tokenize(
2431 "Unterminated double-quoted string",
2432 self.line,
2433 self.column,
2434 self.start,
2435 self.current,
2436 ));
2437 }
2438
2439 self.advance(); self.add_token_with_text(TokenType::String, value);
2441 Ok(())
2442 }
2443
2444 fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2445 self.advance();
2447 self.advance();
2448 self.advance();
2449 let mut value = String::new();
2450
2451 while !self.is_at_end() {
2452 if self.peek() == quote_char
2454 && self.current + 1 < self.size
2455 && self.chars[self.current + 1] == quote_char
2456 && self.current + 2 < self.size
2457 && self.chars[self.current + 2] == quote_char
2458 {
2459 break;
2461 }
2462 value.push(self.advance());
2463 }
2464
2465 if self.is_at_end() {
2466 return Err(Error::tokenize(
2467 "Unterminated triple-quoted string",
2468 self.line,
2469 self.column,
2470 self.start,
2471 self.current,
2472 ));
2473 }
2474
2475 self.advance();
2477 self.advance();
2478 self.advance();
2479 let token_type = if quote_char == '"' {
2480 TokenType::TripleDoubleQuotedString
2481 } else {
2482 TokenType::TripleSingleQuotedString
2483 };
2484 self.add_token_with_text(token_type, value);
2485 Ok(())
2486 }
2487
2488 fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
2489 self.advance(); let mut value = String::new();
2491
2492 loop {
2493 if self.is_at_end() {
2494 return Err(Error::tokenize(
2495 "Unterminated identifier",
2496 self.line,
2497 self.column,
2498 self.start,
2499 self.current,
2500 ));
2501 }
2502 if self.peek() == end_quote {
2503 if self.peek_next() == end_quote {
2504 value.push(end_quote);
2506 self.advance(); self.advance(); } else {
2509 break;
2511 }
2512 } else {
2513 value.push(self.peek());
2514 self.advance();
2515 }
2516 }
2517
2518 self.advance(); self.add_token_with_text(TokenType::QuotedIdentifier, value);
2520 Ok(())
2521 }
2522
2523 fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
2528 self.advance(); let start = self.current;
2530 let close_quote = if open_quote == '\u{2018}' {
2532 '\u{2019}' } else {
2534 '\u{2019}' };
2536 while !self.is_at_end() && self.peek() != close_quote {
2537 self.advance();
2538 }
2539 let value = self.text_from_range(start, self.current);
2540 if !self.is_at_end() {
2541 self.advance(); }
2543 self.add_token_with_text(TokenType::String, value);
2544 Ok(())
2545 }
2546
2547 fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
2550 self.advance(); let start = self.current;
2552 let close_quote = if open_quote == '\u{201C}' {
2553 '\u{201D}' } else {
2555 '\u{201D}' };
2557 while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
2558 self.advance();
2559 }
2560 let value = self.text_from_range(start, self.current);
2561 if !self.is_at_end() {
2562 self.advance(); }
2564 self.add_token_with_text(TokenType::QuotedIdentifier, value);
2565 Ok(())
2566 }
2567
2568 fn scan_number(&mut self) -> Result<()> {
2569 if self.config.hex_number_strings && self.peek() == '0' && !self.is_at_end() {
2571 let next = if self.current + 1 < self.size {
2572 self.chars[self.current + 1]
2573 } else {
2574 '\0'
2575 };
2576 if next == 'x' || next == 'X' {
2577 self.advance();
2579 self.advance();
2580 let hex_start = self.current;
2582 while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
2583 if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
2584 break;
2585 }
2586 self.advance();
2587 }
2588 if self.current > hex_start {
2589 let mut is_hex_float = false;
2591 if !self.is_at_end() && self.peek() == '.' {
2593 let after_dot = if self.current + 1 < self.size {
2594 self.chars[self.current + 1]
2595 } else {
2596 '\0'
2597 };
2598 if after_dot.is_ascii_hexdigit() {
2599 is_hex_float = true;
2600 self.advance(); while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2602 self.advance();
2603 }
2604 }
2605 }
2606 if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
2608 is_hex_float = true;
2609 self.advance(); if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
2611 self.advance();
2612 }
2613 while !self.is_at_end() && self.peek().is_ascii_digit() {
2614 self.advance();
2615 }
2616 }
2617 if is_hex_float {
2618 let raw_text = self.text_from_range(self.start, self.current);
2620 let full_text = if self.config.numbers_can_be_underscore_separated
2621 && raw_text.contains('_')
2622 {
2623 raw_text.replace('_', "")
2624 } else {
2625 raw_text
2626 };
2627 self.add_token_with_text(TokenType::Number, full_text);
2628 } else if self.config.hex_string_is_integer_type {
2629 let raw_value = self.text_from_range(hex_start, self.current);
2631 let hex_value = if self.config.numbers_can_be_underscore_separated
2632 && raw_value.contains('_')
2633 {
2634 raw_value.replace('_', "")
2635 } else {
2636 raw_value
2637 };
2638 self.add_token_with_text(TokenType::HexNumber, hex_value);
2639 } else {
2640 let raw_value = self.text_from_range(hex_start, self.current);
2642 let hex_value = if self.config.numbers_can_be_underscore_separated
2643 && raw_value.contains('_')
2644 {
2645 raw_value.replace('_', "")
2646 } else {
2647 raw_value
2648 };
2649 self.add_token_with_text(TokenType::HexString, hex_value);
2650 }
2651 return Ok(());
2652 }
2653 self.current = self.start + 1;
2656 }
2657 }
2658
2659 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2661 if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
2663 break;
2664 }
2665 self.advance();
2666 }
2667
2668 if self.peek() == '.' {
2672 let next = self.peek_next();
2673 if next != '.' {
2679 self.advance(); while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2682 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2683 break;
2684 }
2685 self.advance();
2686 }
2687 }
2688 }
2689
2690 if self.peek() == 'e' || self.peek() == 'E' {
2692 self.advance();
2693 if self.peek() == '+' || self.peek() == '-' {
2694 self.advance();
2695 }
2696 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2697 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2698 break;
2699 }
2700 self.advance();
2701 }
2702 }
2703
2704 let raw_text = self.text_from_range(self.start, self.current);
2705 let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2708 raw_text.replace('_', "")
2709 } else {
2710 raw_text
2711 };
2712
2713 if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2715 let next_char: String = self.peek().to_ascii_uppercase().to_string();
2716 let suffix_match = if self.current + 1 < self.size {
2718 let two_char: String = [
2719 self.chars[self.current].to_ascii_uppercase(),
2720 self.chars[self.current + 1].to_ascii_uppercase(),
2721 ]
2722 .iter()
2723 .collect();
2724 if self.config.numeric_literals.contains_key(&two_char) {
2725 let after_suffix = if self.current + 2 < self.size {
2727 self.chars[self.current + 2]
2728 } else {
2729 ' '
2730 };
2731 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2732 Some((two_char, 2))
2733 } else {
2734 None
2735 }
2736 } else if self.config.numeric_literals.contains_key(&next_char) {
2737 let after_suffix = if self.current + 1 < self.size {
2739 self.chars[self.current + 1]
2740 } else {
2741 ' '
2742 };
2743 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2744 Some((next_char, 1))
2745 } else {
2746 None
2747 }
2748 } else {
2749 None
2750 }
2751 } else if self.config.numeric_literals.contains_key(&next_char) {
2752 Some((next_char, 1))
2754 } else {
2755 None
2756 };
2757
2758 if let Some((suffix, len)) = suffix_match {
2759 for _ in 0..len {
2761 self.advance();
2762 }
2763 let type_name = self
2766 .config
2767 .numeric_literals
2768 .get(&suffix)
2769 .expect("suffix verified by contains_key above")
2770 .clone();
2771 let combined = format!("{}::{}", text, type_name);
2772 self.add_token_with_text(TokenType::Number, combined);
2773 return Ok(());
2774 }
2775 }
2776
2777 if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2780 let next = self.peek();
2781 if next.is_alphabetic() || next == '_' {
2782 while !self.is_at_end() {
2784 let ch = self.peek();
2785 if ch.is_alphanumeric() || ch == '_' {
2786 self.advance();
2787 } else {
2788 break;
2789 }
2790 }
2791 let ident_text = self.text_from_range(self.start, self.current);
2792 self.add_token_with_text(TokenType::Identifier, ident_text);
2793 return Ok(());
2794 }
2795 }
2796
2797 self.add_token_with_text(TokenType::Number, text);
2798 Ok(())
2799 }
2800
2801 fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2803 self.advance();
2805
2806 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2808 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2809 break;
2810 }
2811 self.advance();
2812 }
2813
2814 if self.peek() == 'e' || self.peek() == 'E' {
2816 self.advance();
2817 if self.peek() == '+' || self.peek() == '-' {
2818 self.advance();
2819 }
2820 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2821 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2822 break;
2823 }
2824 self.advance();
2825 }
2826 }
2827
2828 let raw_text = self.text_from_range(self.start, self.current);
2829 let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
2832 raw_text.replace('_', "")
2833 } else {
2834 raw_text
2835 };
2836 self.add_token_with_text(TokenType::Number, text);
2837 Ok(())
2838 }
2839
2840 #[inline]
2843 fn lookup_keyword_ascii(keywords: &HashMap<String, TokenType>, text: &str) -> TokenType {
2844 if text.len() > 128 {
2845 return TokenType::Var;
2846 }
2847 let mut buf = [0u8; 128];
2848 for (i, b) in text.bytes().enumerate() {
2849 buf[i] = b.to_ascii_uppercase();
2850 }
2851 if let Ok(upper) = std::str::from_utf8(&buf[..text.len()]) {
2852 keywords.get(upper).copied().unwrap_or(TokenType::Var)
2853 } else {
2854 TokenType::Var
2855 }
2856 }
2857
2858 fn scan_identifier_or_keyword(&mut self) -> Result<()> {
2859 let first_char = self.peek();
2861 if !first_char.is_alphanumeric() && first_char != '_' {
2862 let c = self.advance();
2864 return Err(Error::tokenize(
2865 format!("Unexpected character: '{}'", c),
2866 self.line,
2867 self.column,
2868 self.start,
2869 self.current,
2870 ));
2871 }
2872
2873 while !self.is_at_end() {
2874 let c = self.peek();
2875 if c == '#' {
2879 let next_c = if self.current + 1 < self.size {
2880 self.chars[self.current + 1]
2881 } else {
2882 '\0'
2883 };
2884 if next_c == '>' || next_c == '-' {
2885 break; }
2887 self.advance();
2888 } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
2889 self.advance();
2890 } else {
2891 break;
2892 }
2893 }
2894
2895 let text = self.text_from_range(self.start, self.current);
2896
2897 if text.eq_ignore_ascii_case("NOT") && self.peek() == '=' {
2899 self.advance(); self.add_token(TokenType::Neq);
2901 return Ok(());
2902 }
2903
2904 let next_char = self.peek();
2907 let is_single_quote = next_char == '\'';
2908 let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
2909 let is_double_quote_for_raw = next_char == '"';
2912
2913 if text.eq_ignore_ascii_case("R") && (is_single_quote || is_double_quote_for_raw) {
2916 let quote_char = if is_single_quote { '\'' } else { '"' };
2919 self.advance(); if self.peek() == quote_char && self.peek_next() == quote_char {
2923 self.advance(); self.advance(); let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
2927 self.add_token_with_text(TokenType::RawString, string_value);
2928 } else {
2929 let string_value = self.scan_raw_string_content(quote_char)?;
2930 self.add_token_with_text(TokenType::RawString, string_value);
2931 }
2932 return Ok(());
2933 }
2934
2935 if is_single_quote || is_double_quote {
2936 if text.eq_ignore_ascii_case("N") {
2937 self.advance(); let string_value = if is_single_quote {
2940 self.scan_string_content()?
2941 } else {
2942 self.scan_double_quoted_string_content()?
2943 };
2944 self.add_token_with_text(TokenType::NationalString, string_value);
2945 return Ok(());
2946 } else if text.eq_ignore_ascii_case("E") {
2947 let lowercase = text == "e";
2951 let prefix = if lowercase { "e:" } else { "E:" };
2952 self.advance(); let string_value = self.scan_string_content_with_escapes(true)?;
2954 self.add_token_with_text(
2955 TokenType::EscapeString,
2956 format!("{}{}", prefix, string_value),
2957 );
2958 return Ok(());
2959 } else if text.eq_ignore_ascii_case("X") {
2960 self.advance(); let string_value = if is_single_quote {
2963 self.scan_string_content()?
2964 } else {
2965 self.scan_double_quoted_string_content()?
2966 };
2967 self.add_token_with_text(TokenType::HexString, string_value);
2968 return Ok(());
2969 } else if text.eq_ignore_ascii_case("B") && is_double_quote {
2970 self.advance(); let string_value = self.scan_double_quoted_string_content()?;
2973 self.add_token_with_text(TokenType::ByteString, string_value);
2974 return Ok(());
2975 } else if text.eq_ignore_ascii_case("B") && is_single_quote {
2976 self.advance(); let string_value = self.scan_string_content()?;
2980 if self.config.b_prefix_is_byte_string {
2981 self.add_token_with_text(TokenType::ByteString, string_value);
2982 } else {
2983 self.add_token_with_text(TokenType::BitString, string_value);
2984 }
2985 return Ok(());
2986 }
2987 }
2988
2989 if text.eq_ignore_ascii_case("U")
2991 && self.peek() == '&'
2992 && self.current + 1 < self.size
2993 && self.chars[self.current + 1] == '\''
2994 {
2995 self.advance(); self.advance(); let string_value = self.scan_string_content()?;
2998 self.add_token_with_text(TokenType::UnicodeString, string_value);
2999 return Ok(());
3000 }
3001
3002 let token_type = Self::lookup_keyword_ascii(&self.config.keywords, &text);
3003
3004 self.add_token_with_text(token_type, text);
3005 Ok(())
3006 }
3007
3008 fn scan_string_content_with_escapes(
3012 &mut self,
3013 force_backslash_escapes: bool,
3014 ) -> Result<String> {
3015 let mut value = String::new();
3016 let use_backslash_escapes =
3017 force_backslash_escapes || self.config.string_escapes.contains(&'\\');
3018
3019 while !self.is_at_end() {
3020 let c = self.peek();
3021 if c == '\'' {
3022 if self.peek_next() == '\'' {
3023 value.push('\'');
3025 self.advance();
3026 self.advance();
3027 } else {
3028 break;
3029 }
3030 } else if c == '\\' && use_backslash_escapes {
3031 value.push(self.advance());
3033 if !self.is_at_end() {
3034 value.push(self.advance());
3035 }
3036 } else {
3037 value.push(self.advance());
3038 }
3039 }
3040
3041 if self.is_at_end() {
3042 return Err(Error::tokenize(
3043 "Unterminated string",
3044 self.line,
3045 self.column,
3046 self.start,
3047 self.current,
3048 ));
3049 }
3050
3051 self.advance(); Ok(value)
3053 }
3054
3055 fn scan_string_content(&mut self) -> Result<String> {
3057 self.scan_string_content_with_escapes(false)
3058 }
3059
3060 fn scan_double_quoted_string_content(&mut self) -> Result<String> {
3063 let mut value = String::new();
3064 let use_backslash_escapes = self.config.string_escapes.contains(&'\\');
3065
3066 while !self.is_at_end() {
3067 let c = self.peek();
3068 if c == '"' {
3069 if self.peek_next() == '"' {
3070 value.push('"');
3072 self.advance();
3073 self.advance();
3074 } else {
3075 break;
3076 }
3077 } else if c == '\\' && use_backslash_escapes {
3078 self.advance(); if !self.is_at_end() {
3081 let escaped = self.advance();
3082 match escaped {
3083 'n' => value.push('\n'),
3084 'r' => value.push('\r'),
3085 't' => value.push('\t'),
3086 '0' => value.push('\0'),
3087 '\\' => value.push('\\'),
3088 '"' => value.push('"'),
3089 '\'' => value.push('\''),
3090 'x' => {
3091 let mut hex = String::new();
3093 for _ in 0..2 {
3094 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
3095 hex.push(self.advance());
3096 }
3097 }
3098 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
3099 value.push(byte as char);
3100 } else {
3101 value.push('\\');
3103 value.push('x');
3104 value.push_str(&hex);
3105 }
3106 }
3107 _ => {
3108 value.push('\\');
3110 value.push(escaped);
3111 }
3112 }
3113 }
3114 } else {
3115 value.push(self.advance());
3116 }
3117 }
3118
3119 if self.is_at_end() {
3120 return Err(Error::tokenize(
3121 "Unterminated double-quoted string",
3122 self.line,
3123 self.column,
3124 self.start,
3125 self.current,
3126 ));
3127 }
3128
3129 self.advance(); Ok(value)
3131 }
3132
3133 fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3138 let mut value = String::new();
3139
3140 while !self.is_at_end() {
3141 let c = self.peek();
3142 if c == quote_char {
3143 if self.peek_next() == quote_char {
3144 value.push(quote_char);
3146 self.advance();
3147 self.advance();
3148 } else {
3149 break;
3150 }
3151 } else if c == '\\'
3152 && self.peek_next() == quote_char
3153 && self.config.string_escapes_allowed_in_raw_strings
3154 {
3155 value.push(quote_char);
3159 self.advance(); self.advance(); } else {
3162 value.push(self.advance());
3164 }
3165 }
3166
3167 if self.is_at_end() {
3168 return Err(Error::tokenize(
3169 "Unterminated raw string",
3170 self.line,
3171 self.column,
3172 self.start,
3173 self.current,
3174 ));
3175 }
3176
3177 self.advance(); Ok(value)
3179 }
3180
3181 fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3184 let mut value = String::new();
3185
3186 while !self.is_at_end() {
3187 let c = self.peek();
3188 if c == quote_char && self.peek_next() == quote_char {
3189 if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
3191 self.advance(); self.advance(); self.advance(); return Ok(value);
3196 }
3197 }
3198 let ch = self.advance();
3200 value.push(ch);
3201 }
3202
3203 Err(Error::tokenize(
3204 "Unterminated raw triple-quoted string",
3205 self.line,
3206 self.column,
3207 self.start,
3208 self.current,
3209 ))
3210 }
3211
3212 fn scan_dollar_identifier(&mut self) -> Result<()> {
3217 self.advance();
3219
3220 while !self.is_at_end() {
3222 let c = self.peek();
3223 if c.is_alphanumeric() || c == '_' || c == '$' {
3224 self.advance();
3225 } else {
3226 break;
3227 }
3228 }
3229
3230 let text = self.text_from_range(self.start, self.current);
3231 self.add_token_with_text(TokenType::Var, text);
3232 Ok(())
3233 }
3234
3235 fn scan_tsql_identifier(&mut self) -> Result<()> {
3236 let first = self.advance();
3238
3239 if first == '#' && self.peek() == '#' {
3241 self.advance();
3242 }
3243
3244 while !self.is_at_end() {
3246 let c = self.peek();
3247 if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3248 self.advance();
3249 } else {
3250 break;
3251 }
3252 }
3253
3254 let text = self.text_from_range(self.start, self.current);
3255 self.add_token_with_text(TokenType::Var, text);
3257 Ok(())
3258 }
3259
3260 fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
3264 let len = self.tokens.len();
3265 if len < 3 {
3266 return None;
3267 }
3268
3269 let last = &self.tokens[len - 1];
3271 if last.text.eq_ignore_ascii_case("VALUES") {
3272 return None;
3273 }
3274 if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
3275 return None;
3276 }
3277
3278 let format_tok = &self.tokens[len - 2];
3280 if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
3281 return None;
3282 }
3283
3284 let has_insert = self.tokens[..len - 2]
3286 .iter()
3287 .rev()
3288 .take(20)
3289 .any(|t| t.token_type == TokenType::Insert);
3290 if !has_insert {
3291 return None;
3292 }
3293
3294 let raw_start = self.current;
3298 while !self.is_at_end() {
3299 let c = self.peek();
3300 if c == '\n' {
3301 let saved = self.current;
3303 self.advance(); while !self.is_at_end() && self.peek() == '\r' {
3306 self.advance();
3307 }
3308 if self.is_at_end() || self.peek() == '\n' {
3309 let raw = self.text_from_range(raw_start, saved);
3312 return Some(raw.trim().to_string());
3313 }
3314 } else {
3316 self.advance();
3317 }
3318 }
3319
3320 let raw = self.text_from_range(raw_start, self.current);
3322 let trimmed = raw.trim().to_string();
3323 if trimmed.is_empty() {
3324 None
3325 } else {
3326 Some(trimmed)
3327 }
3328 }
3329
3330 fn add_token(&mut self, token_type: TokenType) {
3331 let text = self.text_from_range(self.start, self.current);
3332 self.add_token_with_text(token_type, text);
3333 }
3334
3335 fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3336 let span = Span::new(self.start, self.current, self.line, self.column);
3337 let mut token = Token::new(token_type, text, span);
3338 token.comments.append(&mut self.comments);
3339 self.tokens.push(token);
3340 }
3341}
3342
3343#[cfg(test)]
3344mod tests {
3345 use super::*;
3346
// A keyword followed by an integer literal yields exactly two tokens.
#[test]
fn test_simple_select() {
    let tokenizer = Tokenizer::default();
    let tokens = tokenizer.tokenize("SELECT 1").unwrap();

    assert_eq!(tokens.len(), 2);

    let (keyword, literal) = (&tokens[0], &tokens[1]);
    assert_eq!(keyword.token_type, TokenType::Select);
    assert_eq!(literal.token_type, TokenType::Number);
    assert_eq!(literal.text, "1");
}
3357
// Unquoted names tokenize as `Var`, interleaved with keywords and punctuation.
#[test]
fn test_select_with_identifier() {
    let tokenizer = Tokenizer::default();
    let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();

    // Expected token type, plus the expected text where the test pins it.
    let expected = [
        (TokenType::Select, None),
        (TokenType::Var, Some("a")),
        (TokenType::Comma, None),
        (TokenType::Var, Some("b")),
        (TokenType::From, None),
        (TokenType::Var, Some("t")),
    ];

    assert_eq!(tokens.len(), 6);
    for (token, &(kind, text)) in tokens.iter().zip(expected.iter()) {
        assert_eq!(token.token_type, kind);
        if let Some(text) = text {
            assert_eq!(token.text, text);
        }
    }
}
3374
// A single-quoted literal becomes a `String` token without its quotes.
#[test]
fn test_string_literal() {
    let tokenizer = Tokenizer::default();
    let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();

    assert_eq!(tokens.len(), 2);

    let literal = &tokens[1];
    assert_eq!(literal.token_type, TokenType::String);
    assert_eq!(literal.text, "hello");
}
3384
// A doubled single quote inside a literal collapses to one quote.
#[test]
fn test_escaped_string() {
    let tokenizer = Tokenizer::default();
    let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();

    assert_eq!(tokens.len(), 2);

    let literal = &tokens[1];
    assert_eq!(literal.token_type, TokenType::String);
    assert_eq!(literal.text, "it's");
}
3394
// With backslash escapes and `recover_terminal_backslash_quote` enabled, a
// literal ending in `\'` is closed there and the rest tokenizes normally.
#[test]
fn test_terminal_backslash_quote_recovery() {
    let tokenizer = {
        let mut config = TokenizerConfig::default();
        config.string_escapes.push('\\');
        config.recover_terminal_backslash_quote = true;
        Tokenizer::new(config)
    };

    let sql = "SHOW FUNCTIONS LIKE 'a\\' OR 1=1";
    let tokens = tokenizer.tokenize(sql).unwrap();

    assert_eq!(tokens.len(), 8);

    let literal = &tokens[3];
    assert_eq!(literal.token_type, TokenType::String);
    assert_eq!(literal.text, "a\\");
    assert_eq!(tokens[4].token_type, TokenType::Or);
}
3410
// A line comment after a token is recorded in that token's trailing comments.
#[test]
fn test_comments() {
    let tokenizer = Tokenizer::default();
    let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();

    assert_eq!(tokens.len(), 2);

    let trailing = &tokens[0].trailing_comments;
    assert_eq!(trailing.len(), 1);
    assert_eq!(trailing[0], " comment");
}
3422
// Line comments between AND operands are regenerated as block comments placed
// after each AND.
#[test]
fn test_comment_in_and_chain() {
    use crate::generator::Generator;
    use crate::parser::Parser;

    let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
    let expected = "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla";

    let ast = Parser::parse_sql(sql).unwrap();
    let mut generator = Generator::default();
    let output = generator.generate(&ast[0]).unwrap();

    assert_eq!(output, expected);
}
3438
// Arithmetic operators and their numeric operands tokenize one by one.
#[test]
fn test_operators() {
    let tokenizer = Tokenizer::default();
    let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();

    let expected = [
        TokenType::Number,
        TokenType::Plus,
        TokenType::Number,
        TokenType::Star,
        TokenType::Number,
    ];

    assert_eq!(tokens.len(), 5);
    for (token, &kind) in tokens.iter().zip(expected.iter()) {
        assert_eq!(token.token_type, kind);
    }
}
3451
// Two-character comparison operators are recognized as single tokens.
#[test]
fn test_comparison_operators() {
    let tokenizer = Tokenizer::default();
    let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();

    // Operators sit at the odd positions, between the operands.
    let operators = [
        (1, TokenType::Lte),
        (3, TokenType::Gte),
        (5, TokenType::Neq),
    ];
    for &(index, kind) in operators.iter() {
        assert_eq!(tokens[index].token_type, kind);
    }
}
3461
// An `N'...'` literal is a single `NationalString` token holding the content.
#[test]
fn test_national_string() {
    let tokenizer = Tokenizer::default();
    let tokens = tokenizer.tokenize("N'abc'").unwrap();

    let count = tokens.len();
    assert_eq!(count, 1, "Expected 1 token for N'abc', got {:?}", tokens);

    let literal = &tokens[0];
    assert_eq!(literal.token_type, TokenType::NationalString);
    assert_eq!(literal.text, "abc");
}
3476
// An `X'...'` literal is a single `HexString` token holding the digits.
#[test]
fn test_hex_string() {
    let tokenizer = Tokenizer::default();
    let tokens = tokenizer.tokenize("X'ABCD'").unwrap();

    let count = tokens.len();
    assert_eq!(count, 1, "Expected 1 token for X'ABCD', got {:?}", tokens);

    let literal = &tokens[0];
    assert_eq!(literal.token_type, TokenType::HexString);
    assert_eq!(literal.text, "ABCD");
}
3491
// With the default config, a `B'...'` literal is a single `BitString` token.
#[test]
fn test_bit_string() {
    let tokenizer = Tokenizer::default();
    let tokens = tokenizer.tokenize("B'01010'").unwrap();

    let count = tokens.len();
    assert_eq!(count, 1, "Expected 1 token for B'01010', got {:?}", tokens);

    let literal = &tokens[0];
    assert_eq!(literal.token_type, TokenType::BitString);
    assert_eq!(literal.text, "01010");
}
3506
// Covers how a dot after an integer is attached to, or split from, the number.
#[test]
fn test_trailing_dot_number() {
    let tokenizer = Tokenizer::default();

    // A bare trailing dot stays part of the number.
    let trailing = tokenizer.tokenize("SELECT 1.").unwrap();
    assert_eq!(
        trailing.len(),
        2,
        "Expected 2 tokens for 'SELECT 1.', got {:?}",
        trailing
    );
    assert_eq!(trailing[1].token_type, TokenType::Number);
    assert_eq!(trailing[1].text, "1.");

    // An ordinary decimal.
    let decimal = tokenizer.tokenize("SELECT 1.5").unwrap();
    assert_eq!(decimal[1].text, "1.5");

    // A dot followed by a letter: the dot stays with the number and the letter
    // becomes its own token.
    let dotted = tokenizer.tokenize("SELECT 1.a").unwrap();
    assert_eq!(
        dotted.len(),
        3,
        "Expected 3 tokens for 'SELECT 1.a', got {:?}",
        dotted
    );
    assert_eq!(dotted[1].token_type, TokenType::Number);
    assert_eq!(dotted[1].text, "1.");
    assert_eq!(dotted[2].token_type, TokenType::Var);

    // Two consecutive dots are never absorbed into the number.
    let range = tokenizer.tokenize("SELECT 1..2").unwrap();
    let (low, high) = (&range[1], &range[4]);
    assert_eq!(low.token_type, TokenType::Number);
    assert_eq!(low.text, "1");
    assert_eq!(range[2].token_type, TokenType::Dot);
    assert_eq!(range[3].token_type, TokenType::Dot);
    assert_eq!(high.token_type, TokenType::Number);
    assert_eq!(high.text, "2");
}
3548
// Covers numbers that begin with a dot, and a dot used as a member separator.
#[test]
fn test_leading_dot_number() {
    let tokenizer = Tokenizer::default();

    // A leading-dot decimal on its own.
    let bare = tokenizer.tokenize(".25").unwrap();
    assert_eq!(
        bare.len(),
        1,
        "Expected 1 token for '.25', got {:?}",
        bare
    );
    assert_eq!(bare[0].token_type, TokenType::Number);
    assert_eq!(bare[0].text, ".25");

    // The same literal inside parentheses.
    let wrapped = tokenizer.tokenize("SAMPLE (.25)").unwrap();
    assert_eq!(
        wrapped.len(),
        4,
        "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
        wrapped
    );
    assert_eq!(wrapped[0].token_type, TokenType::Sample);
    assert_eq!(wrapped[1].token_type, TokenType::LParen);
    assert_eq!(wrapped[2].token_type, TokenType::Number);
    assert_eq!(wrapped[2].text, ".25");
    assert_eq!(wrapped[3].token_type, TokenType::RParen);

    // A leading-dot literal with an exponent.
    let scientific = tokenizer.tokenize(".5e10").unwrap();
    assert_eq!(
        scientific.len(),
        1,
        "Expected 1 token for '.5e10', got {:?}",
        scientific
    );
    assert_eq!(scientific[0].token_type, TokenType::Number);
    assert_eq!(scientific[0].text, ".5e10");

    // Between identifiers the dot is its own token.
    let member = tokenizer.tokenize("a.b").unwrap();
    assert_eq!(
        member.len(),
        3,
        "Expected 3 tokens for 'a.b', got {:?}",
        member
    );
    assert_eq!(member[1].token_type, TokenType::Dot);
}
3599
// Curly quotes are accepted, while a stray bullet character is an error.
#[test]
fn test_unrecognized_character() {
    let tokenizer = Tokenizer::default();

    let curly_quoted = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
    assert!(
        curly_quoted.is_ok(),
        "Curly quotes should be tokenized as strings"
    );

    let bullet = tokenizer.tokenize("SELECT • FROM t");
    assert!(bullet.is_err());
}
3615
// `:=`, `:` and `::` must each produce their own distinct token type.
#[test]
fn test_colon_eq_tokenization() {
    let tokenizer = Tokenizer::default();

    // `:=` is one token.
    let assignment = tokenizer.tokenize("a := 1").unwrap();
    assert_eq!(assignment.len(), 3);
    assert_eq!(assignment[0].token_type, TokenType::Var);
    assert_eq!(assignment[1].token_type, TokenType::ColonEq);
    assert_eq!(assignment[2].token_type, TokenType::Number);

    // A lone colon is `Colon` and never `ColonEq`.
    let single = tokenizer.tokenize("a:b").unwrap();
    assert!(single.iter().any(|t| t.token_type == TokenType::Colon));
    assert!(single.iter().all(|t| t.token_type != TokenType::ColonEq));

    // A double colon is `DColon`.
    let cast = tokenizer.tokenize("a::INT").unwrap();
    assert!(cast.iter().any(|t| t.token_type == TokenType::DColon));
}
3636
// Parses statements using `:=` (and the `name: expr` prefix alias) and checks
// the SQL generated from the first parsed statement.
#[test]
fn test_colon_eq_parsing() {
    use crate::generator::Generator;
    use crate::parser::Parser;

    // (input SQL, message used if parsing fails, expected generated SQL)
    let cases = [
        (
            "SELECT @var1 := 1, @var2",
            "Failed to parse MySQL @var := expr",
            "SELECT @var1 := 1, @var2",
        ),
        (
            "SELECT @var1, @var2 := @var1",
            "Failed to parse MySQL @var2 := @var1",
            "SELECT @var1, @var2 := @var1",
        ),
        (
            "SELECT @var1 := COUNT(*) FROM t1",
            "Failed to parse MySQL @var := COUNT(*)",
            "SELECT @var1 := COUNT(*) FROM t1",
        ),
        (
            "SET @var1 := 1",
            "Failed to parse SET @var1 := 1",
            "SET @var1 = 1",
        ),
        (
            "UNION_VALUE(k1 := 1)",
            "Failed to parse named arg with :=",
            "UNION_VALUE(k1 := 1)",
        ),
        (
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
            "Failed to parse UNNEST with :=",
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
        ),
        (
            "SELECT foo: 1",
            "Failed to parse DuckDB prefix alias foo: 1",
            "SELECT 1 AS foo",
        ),
        (
            "SELECT foo: 1, bar: 2, baz: 3",
            "Failed to parse DuckDB multiple prefix aliases",
            "SELECT 1 AS foo, 2 AS bar, 3 AS baz",
        ),
    ];

    for &(sql, parse_failure, expected) in cases.iter() {
        let ast = Parser::parse_sql(sql).expect(parse_failure);
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, expected);
    }
}
3689
// Runs `:=` and prefix-alias statements through parse -> transform -> generate
// for specific dialects and compares the result with the expected SQL.
#[test]
fn test_colon_eq_dialect_roundtrip() {
    use crate::dialects::{Dialect, DialectType};

    // Round-trips `sql` through `dialect`; `expected` defaults to the input.
    fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
        let d = Dialect::get(dialect);
        let ast = match d.parse(sql) {
            Ok(ast) => ast,
            Err(e) => panic!("Parse error for '{}': {}", sql, e),
        };
        assert!(!ast.is_empty(), "Empty AST for: {}", sql);
        let transformed = match d.transform(ast[0].clone()) {
            Ok(transformed) => transformed,
            Err(e) => panic!("Transform error for '{}': {}", sql, e),
        };
        let output = match d.generate(&transformed) {
            Ok(output) => output,
            Err(e) => panic!("Generate error for '{}': {}", sql, e),
        };
        let expected = expected.unwrap_or(sql);
        assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
    }

    // MySQL user-variable assignment.
    check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
    check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
    check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
    check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

    // DuckDB named arguments.
    check(
        DialectType::DuckDB,
        "SELECT UNNEST(col, recursive := TRUE) FROM t",
        None,
    );
    check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

    // This statement is only checked for successful, non-empty parsing.
    {
        let duckdb = Dialect::get(DialectType::DuckDB);
        let ast = duckdb
            .parse("STRUCT_PACK(a := 'b')::json")
            .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
        assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
    }

    // DuckDB prefix aliases are rewritten to `AS` aliases.
    check(
        DialectType::DuckDB,
        "SELECT foo: 1",
        Some("SELECT 1 AS foo"),
    );
    check(
        DialectType::DuckDB,
        "SELECT foo: 1, bar: 2, baz: 3",
        Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
    );
}
3746
// Every statement below must survive parse -> generate unchanged, including
// the position of each block comment. All failures are reported together.
#[test]
fn test_comment_roundtrip() {
    use crate::generator::Generator;
    use crate::parser::Parser;

    // Returns a failure description, or `None` when `sql` round-trips exactly.
    fn check_roundtrip(sql: &str) -> Option<String> {
        let ast = match Parser::parse_sql(sql) {
            Err(e) => return Some(format!("Parse error: {:?}", e)),
            Ok(parsed) => parsed,
        };
        if ast.is_empty() {
            return Some("Empty AST".to_string());
        }

        let mut generator = Generator::default();
        let output = match generator.generate(&ast[0]) {
            Err(e) => return Some(format!("Gen error: {:?}", e)),
            Ok(generated) => generated,
        };

        if output != sql {
            return Some(format!(
                "Mismatch:\n input: {}\n output: {}",
                sql, output
            ));
        }
        None
    }

    let tests = vec![
        "SELECT c /* c1 */ AS alias /* c2 */",
        "SELECT a /* x */, b /* x */",
        "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
        "SELECT * FROM foo /* x */, bla /* x */",
        "SELECT 1 /* comment */ + 1",
        "SELECT 1 /* c1 */ + 2 /* c2 */",
        "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
        "SELECT CAST(x AS INT) /* comment */ FROM foo",
        "SELECT FOO(x /* c */) /* FOO */, b /* b */",
        "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
        "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
        "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
        "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
        "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
        "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
        "/* comment */ CREATE TABLE foo AS SELECT 1",
        "INSERT INTO foo SELECT * FROM bar /* comment */",
        "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
    ];

    let failures: Vec<String> = tests.into_iter().filter_map(check_roundtrip).collect();

    if !failures.is_empty() {
        panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
    }
}
3823
// Checks the tag/content split done by `parse_dollar_string_token`, then
// round-trips dollar-quoted function bodies through the Databricks dialect.
#[test]
fn test_dollar_quoted_string_parsing() {
    use crate::dialects::{Dialect, DialectType};

    // A NUL byte separates the tag from the content.
    let tagged = super::parse_dollar_string_token("FOO\x00content here");
    assert_eq!(tagged.0, Some("FOO".to_string()));
    assert_eq!(tagged.1, "content here");

    // Without a NUL byte there is no tag.
    let untagged = super::parse_dollar_string_token("just content");
    assert_eq!(untagged.0, None);
    assert_eq!(untagged.1, "just content");

    // Round-trips `sql` through Databricks; `expected` defaults to the input.
    fn check_databricks(sql: &str, expected: Option<&str>) {
        let d = Dialect::get(DialectType::Databricks);
        let ast = match d.parse(sql) {
            Ok(ast) => ast,
            Err(e) => panic!("Parse error for '{}': {}", sql, e),
        };
        assert!(!ast.is_empty(), "Empty AST for: {}", sql);
        let transformed = match d.transform(ast[0].clone()) {
            Ok(transformed) => transformed,
            Err(e) => panic!("Transform error for '{}': {}", sql, e),
        };
        let output = match d.generate(&transformed) {
            Ok(output) => output,
            Err(e) => panic!("Generate error for '{}': {}", sql, e),
        };
        let expected = expected.unwrap_or(sql);
        assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
    }

    // Untagged `$$ ... $$` body.
    check_databricks(
        "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n return x+1$$",
        None,
    );

    // Tagged `$FOO$ ... $FOO$` body.
    check_databricks(
        "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n return x+1$FOO$",
        None,
    );
}
3866
// Underscore separators are stripped from numbers only when the config flag
// `numbers_can_be_underscore_separated` is set.
#[test]
fn test_numeric_underscore_stripping() {
    let stripping_tokenizer = {
        let mut config = TokenizerConfig::default();
        config.numbers_can_be_underscore_separated = true;
        Tokenizer::new(config)
    };

    // (input SQL, expected text of the number token)
    let stripped_cases = [
        ("SELECT 1_2_3_4_5", "12345"),
        ("SELECT 20_000", "20000"),
        ("SELECT 1_2E+1_0", "12E+10"),
    ];
    for &(sql, expected_text) in stripped_cases.iter() {
        let tokens = stripping_tokenizer.tokenize(sql).unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, expected_text);
    }

    // With the default config the underscores are kept in the token text.
    let default_tokenizer = Tokenizer::default();
    let tokens = default_tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
    assert_eq!(tokens[1].token_type, TokenType::Number);
    assert_eq!(tokens[1].text, "1_2_3_4_5");
}
3895}