use crate::error::{Error, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
use std::sync::LazyLock;
#[cfg(feature = "bindings")]
use ts_rs::TS;

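/// Splits the stored text of a [`TokenType::DollarString`] token into its
/// optional tag and its content. Tagged dollar-quoted strings such as
/// `$tag$...$tag$` are stored as `"{tag}\x00{content}"`, so everything before
/// the first NUL byte is the tag; untagged strings contain no NUL and yield
/// `(None, content)`.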
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    if let Some(pos) = text.find('\x00') {
        let tag = &text[..pos];
        let content = &text[pos + 1..];
        (Some(tag.to_string()), content.to_string())
    } else {
        (None, text.to_string())
    }
}

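/// Source location of a token: character offsets into the input, plus the
/// 1-based line and column where the token starts.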
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct Span {
    /// Start offset of the token (character index).
    pub start: usize,
    /// End offset of the token (character index).
    pub end: usize,
    /// 1-based line number where the token starts.
    pub line: usize,
    /// 1-based column number where the token starts.
    pub column: usize,
}

impl Span {
    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
        Self {
            start,
            end,
            line,
            column,
        }
    }
}

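/// A single lexed token: its type, raw text, source span, and any comments
/// collected while scanning around it.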
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct Token {
    pub token_type: TokenType,
    pub text: String,
    pub span: Span,
    /// Comments attached before this token.
    #[serde(default)]
    pub comments: Vec<String>,
    /// Comments attached after this token (e.g. trailing line comments).
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}

impl Token {
    pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
        Self {
            token_type,
            text: text.into(),
            span,
            comments: Vec::new(),
            trailing_comments: Vec::new(),
        }
    }

    pub fn number(n: i64) -> Self {
        Self::new(TokenType::Number, n.to_string(), Span::default())
    }

    pub fn string(s: impl Into<String>) -> Self {
        Self::new(TokenType::String, s, Span::default())
    }

    pub fn identifier(s: impl Into<String>) -> Self {
        Self::new(TokenType::Identifier, s, Span::default())
    }

    pub fn var(s: impl Into<String>) -> Self {
        Self::new(TokenType::Var, s, Span::default())
    }

    /// Attaches a leading comment to this token (builder style).
    pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
        self.comments.push(comment.into());
        self
    }
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}({})", self.token_type, self.text)
    }
}

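/// Every kind of token the tokenizer can produce: punctuation and operators,
/// literals, data type names, and keywords.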
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[cfg_attr(feature = "bindings", derive(TS))]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt,
    GtGt,
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    BlockComment,
    LineComment,
    String,
    DollarString,
    TripleDoubleQuotedString,
    TripleSingleQuotedString,
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    HexNumber,
    ByteString,
    NationalString,
    EscapeString,
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data types.
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords.
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,
    NotILike,
    NotRLike,
    NotIRLike,
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window frame bounds.
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    Eof,
}

impl TokenType {
    /// Returns `true` for token types that correspond to SQL keywords.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            TokenType::Select
                | TokenType::From
                | TokenType::Where
                | TokenType::And
                | TokenType::Or
                | TokenType::Not
                | TokenType::In
                | TokenType::Is
                | TokenType::Null
                | TokenType::True
                | TokenType::False
                | TokenType::As
                | TokenType::On
                | TokenType::Join
                | TokenType::Left
                | TokenType::Right
                | TokenType::Inner
                | TokenType::Outer
                | TokenType::Full
                | TokenType::Cross
                | TokenType::Semi
                | TokenType::Anti
                | TokenType::Union
                | TokenType::Except
                | TokenType::Intersect
                | TokenType::GroupBy
                | TokenType::OrderBy
                | TokenType::Having
                | TokenType::Limit
                | TokenType::Offset
                | TokenType::Case
                | TokenType::When
                | TokenType::Then
                | TokenType::Else
                | TokenType::End
                | TokenType::Create
                | TokenType::Drop
                | TokenType::Alter
                | TokenType::Insert
                | TokenType::Update
                | TokenType::Delete
                | TokenType::Into
                | TokenType::Values
                | TokenType::Set
                | TokenType::With
                | TokenType::Distinct
                | TokenType::All
                | TokenType::Exists
                | TokenType::Between
                | TokenType::Like
                | TokenType::ILike
                | TokenType::Filter
                | TokenType::Date
                | TokenType::Timestamp
                | TokenType::TimestampTz
                | TokenType::Interval
                | TokenType::Time
                | TokenType::Table
                | TokenType::Index
                | TokenType::Column
                | TokenType::Database
                | TokenType::Schema
                | TokenType::View
                | TokenType::Function
                | TokenType::Procedure
                | TokenType::Trigger
                | TokenType::Sequence
                | TokenType::Over
                | TokenType::Partition
                | TokenType::Window
                | TokenType::Rows
                | TokenType::Range
                | TokenType::First
                | TokenType::Last
                | TokenType::Preceding
                | TokenType::Following
                | TokenType::Current
                | TokenType::Row
                | TokenType::Unbounded
                | TokenType::Array
                | TokenType::Struct
                | TokenType::Map
                | TokenType::PrimaryKey
                | TokenType::Key
                | TokenType::ForeignKey
                | TokenType::References
                | TokenType::Unique
                | TokenType::Check
                | TokenType::Default
                | TokenType::Constraint
                | TokenType::Comment
                | TokenType::Rollup
                | TokenType::Cube
                | TokenType::Grant
                | TokenType::Revoke
                | TokenType::Type
                | TokenType::Use
                | TokenType::Cache
                | TokenType::Uncache
                | TokenType::Load
                | TokenType::Any
                | TokenType::Some
                | TokenType::Asc
                | TokenType::Desc
                | TokenType::Nulls
                | TokenType::Lateral
                | TokenType::Natural
                | TokenType::Escape
                | TokenType::Glob
                | TokenType::Match
                | TokenType::Recursive
                | TokenType::Replace
                | TokenType::Returns
                | TokenType::If
                | TokenType::Pivot
                | TokenType::Unpivot
                | TokenType::Json
                | TokenType::Blob
                | TokenType::Text
                | TokenType::Int
                | TokenType::BigInt
                | TokenType::SmallInt
                | TokenType::TinyInt
                | TokenType::Int128
                | TokenType::UInt128
                | TokenType::Int256
                | TokenType::UInt256
                | TokenType::UInt
                | TokenType::UBigInt
                | TokenType::Float
                | TokenType::Double
                | TokenType::Decimal
                | TokenType::Boolean
                | TokenType::VarChar
                | TokenType::Char
                | TokenType::Binary
                | TokenType::VarBinary
                | TokenType::No
                | TokenType::DateTime
                | TokenType::Truncate
                | TokenType::Execute
                | TokenType::Merge
                | TokenType::Top
                | TokenType::Begin
                | TokenType::Generated
                | TokenType::Identity
                | TokenType::Always
                | TokenType::Extract
                | TokenType::AsOf
                | TokenType::Prior
                | TokenType::After
                | TokenType::Restrict
                | TokenType::Cascade
                | TokenType::Local
                | TokenType::Rename
                | TokenType::Enum
                | TokenType::Within
                | TokenType::Format
                | TokenType::Final
                | TokenType::FileFormat
                | TokenType::Input
                | TokenType::InputFormat
                | TokenType::Copy
                | TokenType::Put
                | TokenType::Get
                | TokenType::Show
                | TokenType::Serde
                | TokenType::Sample
                | TokenType::Sort
                | TokenType::Collate
                | TokenType::Ties
                | TokenType::IsNull
                | TokenType::NotNull
                | TokenType::Exclude
                | TokenType::Temporary
                | TokenType::Add
                | TokenType::Ordinality
                | TokenType::Overlaps
                | TokenType::Block
                | TokenType::Pattern
                | TokenType::Group
                | TokenType::Cluster
                | TokenType::Repeatable
                | TokenType::Groups
                | TokenType::Commit
                | TokenType::Warehouse
                | TokenType::System
                | TokenType::By
                | TokenType::To
                | TokenType::Fetch
                | TokenType::For
                | TokenType::Only
                | TokenType::Next
                | TokenType::Lock
                | TokenType::Refresh
                | TokenType::Settings
                | TokenType::Operator
                | TokenType::Overwrite
                | TokenType::StraightJoin
                | TokenType::Start
                | TokenType::Ignore
                | TokenType::Domain
                | TokenType::Apply
                | TokenType::Respect
                | TokenType::Materialized
                | TokenType::Prewhere
                | TokenType::Old
                | TokenType::New
                | TokenType::Cast
                | TokenType::TryCast
                | TokenType::SafeCast
                | TokenType::Transaction
                | TokenType::Describe
                | TokenType::Kill
                | TokenType::Lambda
                | TokenType::Declare
                | TokenType::Keep
                | TokenType::Output
                | TokenType::Percent
                | TokenType::Qualify
                | TokenType::Returning
                | TokenType::Language
                | TokenType::Preserve
                | TokenType::Savepoint
                | TokenType::Rollback
                | TokenType::Body
                | TokenType::Increment
                | TokenType::Minvalue
                | TokenType::Maxvalue
                | TokenType::Cycle
                | TokenType::NoCycle
                | TokenType::Seed
                | TokenType::Namespace
                | TokenType::Authorization
                | TokenType::Order
                | TokenType::Restart
                | TokenType::Before
                | TokenType::Instead
                | TokenType::Each
                | TokenType::Statement
                | TokenType::Referencing
                | TokenType::Of
                | TokenType::Separator
                | TokenType::Others
                | TokenType::Placing
                | TokenType::Owned
                | TokenType::Running
                | TokenType::Define
                | TokenType::Measures
                | TokenType::MatchRecognize
                | TokenType::AutoIncrement
                | TokenType::Connect
                | TokenType::Distribute
                | TokenType::Bernoulli
                | TokenType::TableSample
                | TokenType::Inpath
                | TokenType::Pragma
                | TokenType::Siblings
                | TokenType::SerdeProperties
                | TokenType::RLike
        )
    }

    /// Returns `true` for comparison operators.
    pub fn is_comparison(&self) -> bool {
        matches!(
            self,
            TokenType::Eq
                | TokenType::Neq
                | TokenType::Lt
                | TokenType::Lte
                | TokenType::Gt
                | TokenType::Gte
                | TokenType::NullsafeEq
        )
    }

    /// Returns `true` for arithmetic operators.
    pub fn is_arithmetic(&self) -> bool {
        matches!(
            self,
            TokenType::Plus
                | TokenType::Dash
                | TokenType::Star
                | TokenType::Slash
                | TokenType::Percent
                | TokenType::Mod
                | TokenType::Div
        )
    }
}

impl fmt::Display for TokenType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self)
    }
}

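/// Default keyword table: uppercase SQL keywords mapped to their token types.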
static DEFAULT_KEYWORDS: LazyLock<HashMap<String, TokenType>> = LazyLock::new(|| {
    let mut keywords = HashMap::with_capacity(300);
    keywords.insert("SELECT".to_string(), TokenType::Select);
    keywords.insert("FROM".to_string(), TokenType::From);
    keywords.insert("WHERE".to_string(), TokenType::Where);
    keywords.insert("AND".to_string(), TokenType::And);
    keywords.insert("OR".to_string(), TokenType::Or);
    keywords.insert("NOT".to_string(), TokenType::Not);
    keywords.insert("AS".to_string(), TokenType::As);
    keywords.insert("ON".to_string(), TokenType::On);
    keywords.insert("JOIN".to_string(), TokenType::Join);
    keywords.insert("LEFT".to_string(), TokenType::Left);
    keywords.insert("RIGHT".to_string(), TokenType::Right);
    keywords.insert("INNER".to_string(), TokenType::Inner);
    keywords.insert("OUTER".to_string(), TokenType::Outer);
    keywords.insert("OUTPUT".to_string(), TokenType::Output);
    keywords.insert("FULL".to_string(), TokenType::Full);
    keywords.insert("CROSS".to_string(), TokenType::Cross);
    keywords.insert("SEMI".to_string(), TokenType::Semi);
    keywords.insert("ANTI".to_string(), TokenType::Anti);
    keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
    keywords.insert("UNION".to_string(), TokenType::Union);
    keywords.insert("EXCEPT".to_string(), TokenType::Except);
    // MINUS is the Oracle spelling of EXCEPT.
    keywords.insert("MINUS".to_string(), TokenType::Except);
    keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
    keywords.insert("GROUP".to_string(), TokenType::Group);
    keywords.insert("CUBE".to_string(), TokenType::Cube);
    keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
    keywords.insert("WITHIN".to_string(), TokenType::Within);
    keywords.insert("ORDER".to_string(), TokenType::Order);
    keywords.insert("BY".to_string(), TokenType::By);
    keywords.insert("HAVING".to_string(), TokenType::Having);
    keywords.insert("LIMIT".to_string(), TokenType::Limit);
    keywords.insert("OFFSET".to_string(), TokenType::Offset);
    keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
    keywords.insert("FETCH".to_string(), TokenType::Fetch);
    keywords.insert("FIRST".to_string(), TokenType::First);
    keywords.insert("NEXT".to_string(), TokenType::Next);
    keywords.insert("ONLY".to_string(), TokenType::Only);
    keywords.insert("KEEP".to_string(), TokenType::Keep);
    keywords.insert("IGNORE".to_string(), TokenType::Ignore);
    keywords.insert("INPUT".to_string(), TokenType::Input);
    keywords.insert("CASE".to_string(), TokenType::Case);
    keywords.insert("WHEN".to_string(), TokenType::When);
    keywords.insert("THEN".to_string(), TokenType::Then);
    keywords.insert("ELSE".to_string(), TokenType::Else);
    keywords.insert("END".to_string(), TokenType::End);
    keywords.insert("ENDIF".to_string(), TokenType::End);
    keywords.insert("NULL".to_string(), TokenType::Null);
    keywords.insert("TRUE".to_string(), TokenType::True);
    keywords.insert("FALSE".to_string(), TokenType::False);
    keywords.insert("IS".to_string(), TokenType::Is);
    keywords.insert("IN".to_string(), TokenType::In);
    keywords.insert("BETWEEN".to_string(), TokenType::Between);
    keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
    keywords.insert("LIKE".to_string(), TokenType::Like);
    keywords.insert("ILIKE".to_string(), TokenType::ILike);
    keywords.insert("RLIKE".to_string(), TokenType::RLike);
    keywords.insert("REGEXP".to_string(), TokenType::RLike);
    keywords.insert("ESCAPE".to_string(), TokenType::Escape);
    keywords.insert("EXISTS".to_string(), TokenType::Exists);
    keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
    keywords.insert("ALL".to_string(), TokenType::All);
    keywords.insert("WITH".to_string(), TokenType::With);
    keywords.insert("CREATE".to_string(), TokenType::Create);
    keywords.insert("DROP".to_string(), TokenType::Drop);
    keywords.insert("ALTER".to_string(), TokenType::Alter);
    keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
    keywords.insert("TABLE".to_string(), TokenType::Table);
    keywords.insert("VIEW".to_string(), TokenType::View);
    keywords.insert("INDEX".to_string(), TokenType::Index);
    keywords.insert("COLUMN".to_string(), TokenType::Column);
    keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
    keywords.insert("ADD".to_string(), TokenType::Add);
    keywords.insert("CASCADE".to_string(), TokenType::Cascade);
    keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
    keywords.insert("RENAME".to_string(), TokenType::Rename);
    keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
    keywords.insert("TEMP".to_string(), TokenType::Temporary);
    keywords.insert("UNIQUE".to_string(), TokenType::Unique);
    keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
    keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
    keywords.insert("KEY".to_string(), TokenType::Key);
    keywords.insert("KILL".to_string(), TokenType::Kill);
    keywords.insert("REFERENCES".to_string(), TokenType::References);
    keywords.insert("DEFAULT".to_string(), TokenType::Default);
    keywords.insert("DECLARE".to_string(), TokenType::Declare);
    keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
    keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement);
    keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
    keywords.insert("REPLACE".to_string(), TokenType::Replace);
    keywords.insert("TO".to_string(), TokenType::To);
    keywords.insert("INSERT".to_string(), TokenType::Insert);
    keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
    keywords.insert("UPDATE".to_string(), TokenType::Update);
    keywords.insert("USE".to_string(), TokenType::Use);
    keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
    keywords.insert("GLOB".to_string(), TokenType::Glob);
    keywords.insert("DELETE".to_string(), TokenType::Delete);
    keywords.insert("MERGE".to_string(), TokenType::Merge);
    keywords.insert("CACHE".to_string(), TokenType::Cache);
    keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
    keywords.insert("REFRESH".to_string(), TokenType::Refresh);
    keywords.insert("GRANT".to_string(), TokenType::Grant);
    keywords.insert("REVOKE".to_string(), TokenType::Revoke);
    keywords.insert("COMMENT".to_string(), TokenType::Comment);
    keywords.insert("COLLATE".to_string(), TokenType::Collate);
    keywords.insert("INTO".to_string(), TokenType::Into);
    keywords.insert("VALUES".to_string(), TokenType::Values);
    keywords.insert("SET".to_string(), TokenType::Set);
    keywords.insert("SETTINGS".to_string(), TokenType::Settings);
    keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
    keywords.insert("ASC".to_string(), TokenType::Asc);
    keywords.insert("DESC".to_string(), TokenType::Desc);
    keywords.insert("NULLS".to_string(), TokenType::Nulls);
    keywords.insert("RESPECT".to_string(), TokenType::Respect);
    keywords.insert("FIRST".to_string(), TokenType::First);
    keywords.insert("LAST".to_string(), TokenType::Last);
    keywords.insert("IF".to_string(), TokenType::If);
    keywords.insert("CAST".to_string(), TokenType::Cast);
    keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
    keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
    keywords.insert("OVER".to_string(), TokenType::Over);
    keywords.insert("PARTITION".to_string(), TokenType::Partition);
    keywords.insert("PLACING".to_string(), TokenType::Placing);
    keywords.insert("WINDOW".to_string(), TokenType::Window);
    keywords.insert("ROWS".to_string(), TokenType::Rows);
    keywords.insert("RANGE".to_string(), TokenType::Range);
    keywords.insert("FILTER".to_string(), TokenType::Filter);
    keywords.insert("NATURAL".to_string(), TokenType::Natural);
    keywords.insert("USING".to_string(), TokenType::Using);
    keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
    keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
    keywords.insert("FOLLOWING".to_string(), TokenType::Following);
    keywords.insert("CURRENT".to_string(), TokenType::Current);
    keywords.insert("ROW".to_string(), TokenType::Row);
    keywords.insert("GROUPS".to_string(), TokenType::Groups);
    keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
    keywords.insert("BOTH".to_string(), TokenType::Both);
    keywords.insert("LEADING".to_string(), TokenType::Leading);
    keywords.insert("TRAILING".to_string(), TokenType::Trailing);
    keywords.insert("INTERVAL".to_string(), TokenType::Interval);
    keywords.insert("TOP".to_string(), TokenType::Top);
    keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
    keywords.insert("SAMPLE".to_string(), TokenType::Sample);
    keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
    keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
    keywords.insert("SYSTEM".to_string(), TokenType::System);
    keywords.insert("BLOCK".to_string(), TokenType::Block);
    keywords.insert("SEED".to_string(), TokenType::Seed);
    keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
    keywords.insert("TIES".to_string(), TokenType::Ties);
    keywords.insert("LATERAL".to_string(), TokenType::Lateral);
    keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
    keywords.insert("APPLY".to_string(), TokenType::Apply);
    keywords.insert("CONNECT".to_string(), TokenType::Connect);
    keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
    keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
    keywords.insert("SORT".to_string(), TokenType::Sort);
    keywords.insert("PIVOT".to_string(), TokenType::Pivot);
    keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
    keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
    keywords.insert("FOR".to_string(), TokenType::For);
    keywords.insert("ANY".to_string(), TokenType::Any);
    keywords.insert("SOME".to_string(), TokenType::Some);
    keywords.insert("ASOF".to_string(), TokenType::AsOf);
    keywords.insert("PERCENT".to_string(), TokenType::Percent);
    keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
    keywords.insert("NO".to_string(), TokenType::No);
    keywords.insert("OTHERS".to_string(), TokenType::Others);
    keywords.insert("OPERATOR".to_string(), TokenType::Operator);
    keywords.insert("SCHEMA".to_string(), TokenType::Schema);
    keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
    keywords.insert("DATABASE".to_string(), TokenType::Database);
    keywords.insert("FUNCTION".to_string(), TokenType::Function);
    keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
    keywords.insert("PROC".to_string(), TokenType::Procedure);
    keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
    keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
    keywords.insert("TYPE".to_string(), TokenType::Type);
    keywords.insert("DOMAIN".to_string(), TokenType::Domain);
    keywords.insert("RETURNS".to_string(), TokenType::Returns);
    keywords.insert("RETURNING".to_string(), TokenType::Returning);
    keywords.insert("LANGUAGE".to_string(), TokenType::Language);
    keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
    keywords.insert("COMMIT".to_string(), TokenType::Commit);
    keywords.insert("BEGIN".to_string(), TokenType::Begin);
    keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
    keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
    keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
    keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
    keywords.insert("BODY".to_string(), TokenType::Body);
    keywords.insert("INCREMENT".to_string(), TokenType::Increment);
    keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
    keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
    keywords.insert("CYCLE".to_string(), TokenType::Cycle);
    keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
    keywords.insert("PRIOR".to_string(), TokenType::Prior);
    keywords.insert("MATCH".to_string(), TokenType::Match);
    keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
    keywords.insert("MEASURES".to_string(), TokenType::Measures);
    keywords.insert("PATTERN".to_string(), TokenType::Pattern);
    keywords.insert("DEFINE".to_string(), TokenType::Define);
    keywords.insert("RUNNING".to_string(), TokenType::Running);
    keywords.insert("FINAL".to_string(), TokenType::Final);
    keywords.insert("OWNED".to_string(), TokenType::Owned);
    keywords.insert("AFTER".to_string(), TokenType::After);
    keywords.insert("BEFORE".to_string(), TokenType::Before);
    keywords.insert("INSTEAD".to_string(), TokenType::Instead);
    keywords.insert("EACH".to_string(), TokenType::Each);
    keywords.insert("STATEMENT".to_string(), TokenType::Statement);
    keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
    keywords.insert("OLD".to_string(), TokenType::Old);
    keywords.insert("NEW".to_string(), TokenType::New);
    keywords.insert("OF".to_string(), TokenType::Of);
    keywords.insert("CHECK".to_string(), TokenType::Check);
    keywords.insert("START".to_string(), TokenType::Start);
    keywords.insert("ENUM".to_string(), TokenType::Enum);
    keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
    keywords.insert("RESTART".to_string(), TokenType::Restart);
    keywords.insert("DATE".to_string(), TokenType::Date);
    keywords.insert("TIME".to_string(), TokenType::Time);
    keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
    keywords.insert("DATETIME".to_string(), TokenType::DateTime);
    keywords.insert("GENERATED".to_string(), TokenType::Generated);
    keywords.insert("IDENTITY".to_string(), TokenType::Identity);
    keywords.insert("ALWAYS".to_string(), TokenType::Always);
    keywords.insert("LOAD".to_string(), TokenType::Load);
    keywords.insert("LOCAL".to_string(), TokenType::Local);
    keywords.insert("INPATH".to_string(), TokenType::Inpath);
    keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
    keywords.insert("SERDE".to_string(), TokenType::Serde);
    keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
    keywords.insert("FORMAT".to_string(), TokenType::Format);
    keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
    keywords.insert("SHOW".to_string(), TokenType::Show);
    keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
    keywords.insert("COPY".to_string(), TokenType::Copy);
    keywords.insert("PUT".to_string(), TokenType::Put);
    keywords.insert("GET".to_string(), TokenType::Get);
    keywords.insert("EXEC".to_string(), TokenType::Execute);
    keywords.insert("EXECUTE".to_string(), TokenType::Execute);
    keywords.insert("ISNULL".to_string(), TokenType::IsNull);
    keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
    keywords
});

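/// Default single-character tokens.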
static DEFAULT_SINGLE_TOKENS: LazyLock<HashMap<char, TokenType>> = LazyLock::new(|| {
    let mut single_tokens = HashMap::with_capacity(30);
    single_tokens.insert('(', TokenType::LParen);
    single_tokens.insert(')', TokenType::RParen);
    single_tokens.insert('[', TokenType::LBracket);
    single_tokens.insert(']', TokenType::RBracket);
    single_tokens.insert('{', TokenType::LBrace);
    single_tokens.insert('}', TokenType::RBrace);
    single_tokens.insert(',', TokenType::Comma);
    single_tokens.insert('.', TokenType::Dot);
    single_tokens.insert(';', TokenType::Semicolon);
    single_tokens.insert('+', TokenType::Plus);
    single_tokens.insert('-', TokenType::Dash);
    single_tokens.insert('*', TokenType::Star);
    single_tokens.insert('/', TokenType::Slash);
    single_tokens.insert('%', TokenType::Percent);
    single_tokens.insert('&', TokenType::Amp);
    single_tokens.insert('|', TokenType::Pipe);
    single_tokens.insert('^', TokenType::Caret);
    single_tokens.insert('~', TokenType::Tilde);
    single_tokens.insert('<', TokenType::Lt);
    single_tokens.insert('>', TokenType::Gt);
    single_tokens.insert('=', TokenType::Eq);
    single_tokens.insert('!', TokenType::Exclamation);
    single_tokens.insert(':', TokenType::Colon);
    single_tokens.insert('@', TokenType::DAt);
    single_tokens.insert('#', TokenType::Hash);
    single_tokens.insert('$', TokenType::Dollar);
    single_tokens.insert('?', TokenType::Parameter);
    single_tokens
});

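/// Default string quotes, mapping each opening delimiter to its closing
/// delimiter.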
static DEFAULT_QUOTES: LazyLock<HashMap<String, String>> = LazyLock::new(|| {
    let mut quotes = HashMap::with_capacity(4);
    quotes.insert("'".to_string(), "'".to_string());
    quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());
    quotes
});

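/// Default identifier quotes, mapping each opening character to its closing
/// character.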
static DEFAULT_IDENTIFIERS: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
    let mut identifiers = HashMap::with_capacity(4);
    identifiers.insert('"', '"');
    identifiers.insert('`', '`');
    identifiers
});

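/// Default comment delimiters; a `None` value marks a line comment that runs
/// to the end of the line.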
static DEFAULT_COMMENTS: LazyLock<HashMap<String, Option<String>>> = LazyLock::new(|| {
    let mut comments = HashMap::with_capacity(4);
    comments.insert("--".to_string(), None);
    comments.insert("/*".to_string(), Some("*/".to_string()));
    comments
});

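/// Dialect-tunable tokenizer settings. The defaults model generic SQL;
/// individual dialects override fields as needed.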
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keyword table: uppercase keyword text to token type.
    pub keywords: HashMap<String, TokenType>,
    /// Single-character tokens.
    pub single_tokens: HashMap<char, TokenType>,
    /// String quote pairs (opening delimiter to closing delimiter).
    pub quotes: HashMap<String, String>,
    /// Identifier quote pairs (opening character to closing character).
    pub identifiers: HashMap<char, char>,
    /// Comment delimiters; a `None` value marks a line comment.
    pub comments: HashMap<String, Option<String>>,
    /// Characters that act as escapes inside string literals (e.g. `'\\'`
    /// enables backslash escapes).
    pub string_escapes: Vec<char>,
    /// Whether `/* ... /* ... */ ... */` block comments may nest.
    pub nested_comments: bool,
    /// Characters allowed to follow an escape character. When non-empty, a
    /// backslash before an otherwise-unrecognized character is dropped.
    pub escape_follow_chars: Vec<char>,
    /// Whether a `b'...'` prefix denotes a byte string rather than a bit string.
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes and the type names they map to.
    pub numeric_literals: HashMap<String, String>,
    /// Whether identifiers may start with a digit.
    pub identifiers_can_start_with_digit: bool,
    /// Whether `0x...` literals are tokenized as hex number strings.
    pub hex_number_strings: bool,
    /// Whether hex strings represent integer values rather than binary data.
    pub hex_string_is_integer_type: bool,
    /// Whether escape sequences are honored inside raw strings.
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether `#` (and `//`) start line comments.
    pub hash_comments: bool,
    /// Whether `$` may appear in identifiers.
    pub dollar_sign_is_identifier: bool,
    /// Whether raw inline data following `INSERT ... FORMAT` should be
    /// captured verbatim as a single token.
    pub insert_format_raw_data: bool,
    /// Whether numbers may contain `_` separators.
    pub numbers_can_be_underscore_separated: bool,
}

impl Default for TokenizerConfig {
    fn default() -> Self {
        Self {
            keywords: DEFAULT_KEYWORDS.clone(),
            single_tokens: DEFAULT_SINGLE_TOKENS.clone(),
            quotes: DEFAULT_QUOTES.clone(),
            identifiers: DEFAULT_IDENTIFIERS.clone(),
            comments: DEFAULT_COMMENTS.clone(),
            string_escapes: vec!['\''],
            nested_comments: true,
            escape_follow_chars: vec![],
            b_prefix_is_byte_string: false,
            numeric_literals: HashMap::new(),
            identifiers_can_start_with_digit: false,
            hex_number_strings: false,
            hex_string_is_integer_type: false,
            string_escapes_allowed_in_raw_strings: true,
            hash_comments: false,
            dollar_sign_is_identifier: false,
            insert_format_raw_data: false,
            numbers_can_be_underscore_separated: false,
        }
    }
}

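/// The tokenizer entry point: wraps a [`TokenizerConfig`] and turns SQL text
/// into a stream of [`Token`]s. A minimal usage sketch (illustrative; the
/// exact import path depends on how this module is re-exported):
///
/// ```ignore
/// let tokens = Tokenizer::default().tokenize("SELECT 1")?;
/// assert_eq!(tokens[0].token_type, TokenType::Select);
/// assert_eq!(tokens[1].token_type, TokenType::Number);
/// ```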
pub struct Tokenizer {
    config: TokenizerConfig,
}

impl Tokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self { config }
    }

    pub fn default_config() -> Self {
        Self::new(TokenizerConfig::default())
    }

    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
        let mut state = TokenizerState::new(sql, &self.config);
        state.tokenize()
    }
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::default_config()
    }
}

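/// Internal scanning state for a single `tokenize` call: the source text and
/// its characters, the tokens produced so far, and the current cursor
/// position (offset, line, column).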
struct TokenizerState<'a> {
    source: &'a str,
    /// True when `source` is pure ASCII, enabling direct byte slicing.
    source_is_ascii: bool,
    chars: Vec<char>,
    size: usize,
    tokens: Vec<Token>,
    start: usize,
    current: usize,
    line: usize,
    column: usize,
    comments: Vec<String>,
    config: &'a TokenizerConfig,
}

impl<'a> TokenizerState<'a> {
    fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
        let chars: Vec<char> = sql.chars().collect();
        let size = chars.len();
        Self {
            source: sql,
            source_is_ascii: sql.is_ascii(),
            chars,
            size,
            tokens: Vec::new(),
            start: 0,
            current: 0,
            line: 1,
            column: 1,
            comments: Vec::new(),
            config,
        }
    }

    fn tokenize(&mut self) -> Result<Vec<Token>> {
        while !self.is_at_end() {
            self.skip_whitespace();
            if self.is_at_end() {
                break;
            }

            self.start = self.current;
            self.scan_token()?;

            // After statements like INSERT ... FORMAT, raw inline data may
            // follow; capture it verbatim as a single Var token.
            if self.config.insert_format_raw_data {
                if let Some(raw) = self.try_scan_insert_format_raw_data() {
                    if !raw.is_empty() {
                        self.start = self.current;
                        self.add_token_with_text(TokenType::Var, raw);
                    }
                }
            }
        }

        // Attach any comments that were never claimed by a token to the last
        // token as trailing comments.
        if !self.comments.is_empty() {
            if let Some(last) = self.tokens.last_mut() {
                last.trailing_comments.extend(self.comments.drain(..));
            }
        }

        Ok(std::mem::take(&mut self.tokens))
    }

    #[inline]
    fn is_at_end(&self) -> bool {
        self.current >= self.size
    }

    /// Extracts the source text for a character range, slicing the source
    /// directly when it is pure ASCII (byte offsets equal char offsets).
    #[inline]
    fn text_from_range(&self, start: usize, end: usize) -> String {
        if self.source_is_ascii {
            self.source[start..end].to_string()
        } else {
            self.chars[start..end].iter().collect()
        }
    }

    #[inline]
    fn peek(&self) -> char {
        if self.is_at_end() {
            '\0'
        } else {
            self.chars[self.current]
        }
    }

    #[inline]
    fn peek_next(&self) -> char {
        if self.current + 1 >= self.size {
            '\0'
        } else {
            self.chars[self.current + 1]
        }
    }

    #[inline]
    fn advance(&mut self) -> char {
        let c = self.peek();
        self.current += 1;
        if c == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        c
    }

    fn skip_whitespace(&mut self) {
        let mut saw_newline = false;
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' => {
                    self.advance();
                }
                '\n' => {
                    saw_newline = true;
                    self.advance();
                }
                // Unicode whitespace: NBSP, en quad through zero-width space,
                // ideographic space, and the BOM.
                '\u{00A0}' | '\u{2000}'..='\u{200B}' | '\u{3000}' | '\u{FEFF}' => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment(saw_newline);
                    saw_newline = true;
                }
                '/' if self.peek_next() == '/' && self.config.hash_comments => {
                    self.scan_double_slash_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // "/*+" is an optimizer hint; leave it for scan_token.
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        break;
                    }
                    // An unterminated comment consumed the rest of the input;
                    // stop scanning.
                    if self.scan_block_comment(saw_newline).is_err() {
                        return;
                    }
                }
                '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
                    // Don't treat "//" as a comment right after ':' or '/',
                    // e.g. inside URLs such as "http://".
                    let prev_non_ws = if self.current > 0 {
                        let mut i = self.current - 1;
                        while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
                            i -= 1;
                        }
                        self.chars[i]
                    } else {
                        '\0'
                    };
                    if prev_non_ws == ':' || prev_non_ws == '/' {
                        break;
                    }
                    self.scan_line_comment(saw_newline);
                    saw_newline = true;
                }
                '#' if self.config.hash_comments => {
                    self.scan_hash_line_comment();
                }
                _ => break,
            }
        }
    }

    fn scan_hash_line_comment(&mut self) {
        self.advance(); // consume '#'
        let start = self.current;
        while !self.is_at_end() && self.peek() != '\n' {
            self.advance();
        }
        let comment = self.text_from_range(start, self.current);
        let comment_text = comment.trim().to_string();
        if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        } else {
            self.comments.push(comment_text);
        }
    }

    fn scan_double_slash_comment(&mut self) {
        self.advance(); // consume first '/'
        self.advance(); // consume second '/'
        let start = self.current;
        while !self.is_at_end() && self.peek() != '\n' {
            self.advance();
        }
        let comment = self.text_from_range(start, self.current);
        let comment_text = comment.trim().to_string();
        if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        } else {
            self.comments.push(comment_text);
        }
    }

    fn scan_line_comment(&mut self, after_newline: bool) {
        self.advance(); // consume first comment char
        self.advance(); // consume second comment char
        let start = self.current;
        while !self.is_at_end() && self.peek() != '\n' {
            self.advance();
        }
        let comment_text = self.text_from_range(start, self.current);

        // A comment on its own line leads the next token; otherwise it
        // trails the previous token.
        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }
    }

    fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // consume '/'
        self.advance(); // consume '*'
        let content_start = self.current;
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                // Leave the outermost "*/" unconsumed; it is consumed below.
                if depth > 0 {
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        let content = self.text_from_range(content_start, self.current);
        self.advance(); // consume '*'
        self.advance(); // consume '/'
        let comment_text = format!("/*{}*/", content);

        if after_newline || self.tokens.is_empty() {
            self.comments.push(comment_text);
        } else if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        }

        Ok(())
    }

    fn scan_hint(&mut self) -> Result<()> {
        self.advance(); // consume '/'
        self.advance(); // consume '*'
        self.advance(); // consume '+'
        let hint_start = self.current;

        while !self.is_at_end() {
            if self.peek() == '*' && self.peek_next() == '/' {
                break;
            }
            self.advance();
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated hint comment",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        let hint_text = self.text_from_range(hint_start, self.current);
        self.advance(); // consume '*'
        self.advance(); // consume '/'
        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());

        Ok(())
    }

    fn scan_positional_parameter(&mut self) -> Result<()> {
        self.advance(); // consume '$'
        let start = self.current;

        while !self.is_at_end() && self.peek().is_ascii_digit() {
            self.advance();
        }

        let number = self.text_from_range(start, self.current);
        self.add_token_with_text(TokenType::Parameter, number);
        Ok(())
    }

    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
        // Try to scan a tagged dollar-quoted string like $tag$...$tag$,
        // rewinding (position, line, and column) if it does not parse as one.
        let saved_pos = self.current;
        let saved_line = self.line;
        let saved_column = self.column;

        self.advance(); // consume '$'
        let tag_start = self.current;
        while !self.is_at_end()
            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
        {
            self.advance();
        }
        let tag = self.text_from_range(tag_start, self.current);

        if self.is_at_end() || self.peek() != '$' {
            // Not a tagged dollar string; rewind.
            self.current = saved_pos;
            self.line = saved_line;
            self.column = saved_column;
            return Ok(None);
        }
        self.advance(); // consume the '$' closing the opening tag
        let content_start = self.current;
        let closing_tag = format!("${}$", tag);
        let closing_chars: Vec<char> = closing_tag.chars().collect();

        loop {
            if self.is_at_end() {
                // Unterminated; rewind and let the caller handle '$'.
                self.current = saved_pos;
                self.line = saved_line;
                self.column = saved_column;
                return Ok(None);
            }

            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
                    self.current + j < self.size && self.chars[self.current + j] == ch
                });
                if matches {
                    let content = self.text_from_range(content_start, self.current);
                    for _ in 0..closing_chars.len() {
                        self.advance();
                    }
                    // Store as "{tag}\x00{content}"; see parse_dollar_string_token.
                    let token_text = format!("{}\x00{}", tag, content);
                    self.add_token_with_text(TokenType::DollarString, token_text);
                    return Ok(Some(()));
                }
            }
            self.advance();
        }
    }

    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
        // Scan an untagged dollar-quoted string: $$...$$.
        self.advance(); // consume first '$'
        self.advance(); // consume second '$'
        let start = self.current;
        while !self.is_at_end() {
            if self.peek() == '$'
                && self.current + 1 < self.size
                && self.chars[self.current + 1] == '$'
            {
                break;
            }
            self.advance();
        }

        let content = self.text_from_range(start, self.current);

        if !self.is_at_end() {
            self.advance(); // consume first '$'
            self.advance(); // consume second '$'
        }

        self.add_token_with_text(TokenType::DollarString, content);
        Ok(())
    }

    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        if c == '\'' {
            // Triple-quoted string: '''...'''.
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        // Triple-quoted string: """...""".
        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        // Double quotes start a string only when the dialect does not use
        // them for identifiers.
        if c == '"'
            && self.config.quotes.contains_key("\"")
            && !self.config.identifiers.contains_key(&'"')
        {
            return self.scan_double_quoted_string();
        }

        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // A leading-dot number like ".5", but not the "." in "tbl.5" or
        // "1.2.3": only when the previous character cannot end an identifier
        // or a number.
        if c == '.' && self.peek_next().is_ascii_digit() {
            let prev_char = if self.current > 0 {
                self.chars[self.current - 1]
            } else {
                '\0'
            };
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // Optimizer hint: /*+ ... */.
        if c == '/'
            && self.peek_next() == '*'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '+'
        {
            return self.scan_hint();
        }

        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // '$' followed by an identifier-ish character: either a tagged dollar
        // string ($tag$...$tag$) or, in some dialects, a '$' identifier.
        if c == '$'
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || !self.peek_next().is_ascii())
        {
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
            if self.config.dollar_sign_is_identifier {
                return self.scan_dollar_identifier();
            }
        }

        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // Positional parameter like $1.
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        if c == '$' && self.config.dollar_sign_is_identifier {
            return self.scan_dollar_identifier();
        }

        // T-SQL style identifiers and variables beginning with '#' or '@'.
        if (c == '#' || c == '@')
            && (self.peek_next().is_alphanumeric()
                || self.peek_next() == '_'
                || self.peek_next() == '#')
        {
            return self.scan_tsql_identifier();
        }

        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        // Unicode minus sign.
        if c == '\u{2212}' {
            self.advance();
            self.add_token(TokenType::Dash);
            return Ok(());
        }

        // Unicode fraction slash.
        if c == '\u{2044}' {
            self.advance();
            self.add_token(TokenType::Slash);
            return Ok(());
        }

        // Curly quotes: treat like string and identifier quotes.
        if c == '\u{2018}' || c == '\u{2019}' {
            return self.scan_unicode_quoted_string(c);
        }
        if c == '\u{201C}' || c == '\u{201D}' {
            return self.scan_unicode_quoted_identifier(c);
        }

        self.scan_identifier_or_keyword()
    }

    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
        let c = self.peek();
        let next = self.peek_next();
        let third = if self.current + 2 < self.size {
            self.chars[self.current + 2]
        } else {
            '\0'
        };

        // Three-character operators.
        if c == '-' && next == '|' && third == '-' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::Adjacent);
        }

        if c == '|' && next == '|' && third == '/' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DPipeSlash);
        }

        if c == '#' && next == '>' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DHashArrow);
        }

        if c == '-' && next == '>' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DArrow);
        }

        if c == '<' && next == '=' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NullsafeEq);
        }

        if c == '<' && next == '-' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::LrArrow);
        }

        if c == '<' && next == '@' {
            self.advance();
            self.advance();
            return Some(TokenType::LtAt);
        }

        if c == '@' && next == '>' {
            self.advance();
            self.advance();
            return Some(TokenType::AtGt);
        }

        // Pattern-matching operators in the PostgreSQL/DuckDB style:
        // ~~ is LIKE, ~~* is ILIKE, ~~~ is GLOB, and the !-prefixed
        // forms are their negations.
        if c == '~' && next == '~' && third == '~' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::Glob);
        }

        if c == '~' && next == '~' && third == '*' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::ILike);
        }

        let fourth = if self.current + 3 < self.size {
            self.chars[self.current + 3]
        } else {
            '\0'
        };
        if c == '!' && next == '~' && third == '~' && fourth == '*' {
            self.advance();
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotILike);
        }

        if c == '!' && next == '~' && third == '~' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotLike);
        }

        if c == '!' && next == '~' && third == '*' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotIRLike);
        }

        if c == '!' && next == ':' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NColonGt);
        }

        if c == '?' && next == ':' && third == ':' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::QDColon);
        }

        if c == '!' && next == '~' {
            self.advance();
            self.advance();
            return Some(TokenType::NotRLike);
        }

        if c == '~' && next == '~' {
            self.advance();
            self.advance();
            return Some(TokenType::Like);
        }

        if c == '~' && next == '*' {
            self.advance();
            self.advance();
            return Some(TokenType::IRLike);
        }

        if c == ':' && next == ':' && third == '$' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonDollar);
        }
        if c == ':' && next == ':' && third == '%' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonPercent);
        }
        if c == ':' && next == ':' && third == '?' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonQMark);
        }

        // Two-character operators.
        let token_type = match (c, next) {
            ('.', ':') => Some(TokenType::DotColon),
            ('=', '=') => Some(TokenType::Eq),
            ('<', '=') => Some(TokenType::Lte),
            ('>', '=') => Some(TokenType::Gte),
            ('!', '=') => Some(TokenType::Neq),
            ('<', '>') => Some(TokenType::Neq),
            ('^', '=') => Some(TokenType::Neq),
            ('<', '<') => Some(TokenType::LtLt),
            ('>', '>') => Some(TokenType::GtGt),
            ('|', '|') => Some(TokenType::DPipe),
            ('|', '/') => Some(TokenType::PipeSlash),
            (':', ':') => Some(TokenType::DColon),
            (':', '=') => Some(TokenType::ColonEq),
            (':', '>') => Some(TokenType::ColonGt),
            ('-', '>') => Some(TokenType::Arrow),
            ('=', '>') => Some(TokenType::FArrow),
            ('&', '&') => Some(TokenType::DAmp),
            ('&', '<') => Some(TokenType::AmpLt),
            ('&', '>') => Some(TokenType::AmpGt),
            ('@', '@') => Some(TokenType::AtAt),
            ('?', '|') => Some(TokenType::QMarkPipe),
            ('?', '&') => Some(TokenType::QMarkAmp),
            ('?', '?') => Some(TokenType::DQMark),
            ('#', '>') => Some(TokenType::HashArrow),
            ('#', '-') => Some(TokenType::HashDash),
            ('^', '@') => Some(TokenType::CaretAt),
            ('*', '*') => Some(TokenType::DStar),
            ('|', '>') => Some(TokenType::PipeGt),
            _ => None,
        };

        if token_type.is_some() {
            self.advance();
            self.advance();
        }

        token_type
    }

    fn scan_string(&mut self) -> Result<()> {
        self.advance(); // consume opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // A doubled quote is an escaped quote.
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // ctrl-Z
                        'a' => value.push('\x07'), // bell
                        'b' => value.push('\x08'), // backspace
                        'f' => value.push('\x0C'), // form feed
                        'v' => value.push('\x0B'), // vertical tab
                        'x' => {
                            // \xNN hex escape; fall back to the literal text
                            // if fewer than two hex digits follow.
                            let mut hex = String::with_capacity(2);
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if hex.len() == 2 {
                                if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                    value.push(byte as char);
                                } else {
                                    value.push('\\');
                                    value.push('x');
                                    value.push_str(&hex);
                                }
                            } else {
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            value.push('%');
                        }
                        '_' => {
                            value.push('_');
                        }
                        _ => {
                            // Unknown escape: drop the backslash only when the
                            // dialect restricts what may follow an escape.
                            if !self.config.escape_follow_chars.is_empty() {
                                value.push(escaped);
                            } else {
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // consume closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }

2321 fn scan_double_quoted_string(&mut self) -> Result<()> {
2323 self.advance(); let mut value = String::new();
2325
2326 while !self.is_at_end() {
2327 let c = self.peek();
2328 if c == '"' {
2329 if self.peek_next() == '"' {
2330 value.push('"');
2332 self.advance();
2333 self.advance();
2334 } else {
2335 break;
2336 }
2337 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
2338 self.advance(); if !self.is_at_end() {
2341 let escaped = self.advance();
2342 match escaped {
2343 'n' => value.push('\n'),
2344 'r' => value.push('\r'),
2345 't' => value.push('\t'),
2346 '0' => value.push('\0'),
2347 'Z' => value.push('\x1A'), 'a' => value.push('\x07'), 'b' => value.push('\x08'), 'f' => value.push('\x0C'), 'v' => value.push('\x0B'), 'x' => {
2353 let mut hex = String::with_capacity(2);
2355 for _ in 0..2 {
2356 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2357 hex.push(self.advance());
2358 }
2359 }
2360 if hex.len() == 2 {
2361 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2362 value.push(byte as char);
2363 } else {
2364 value.push('\\');
2365 value.push('x');
2366 value.push_str(&hex);
2367 }
2368 } else {
2369 value.push('\\');
2371 value.push('x');
2372 value.push_str(&hex);
2373 }
2374 }
2375 '\\' => value.push('\\'),
2376 '\'' => value.push('\''),
2377 '"' => value.push('"'),
2378 '%' => {
2379 value.push('%');
2381 }
2382 '_' => {
2383 value.push('_');
2385 }
2386 _ => {
2390 if !self.config.escape_follow_chars.is_empty() {
2391 value.push(escaped);
2393 } else {
2394 value.push('\\');
2396 value.push(escaped);
2397 }
2398 }
2399 }
2400 }
2401 } else {
2402 value.push(self.advance());
2403 }
2404 }
2405
2406 if self.is_at_end() {
2407 return Err(Error::tokenize(
2408 "Unterminated double-quoted string",
2409 self.line,
2410 self.column,
2411 self.start,
2412 self.current,
2413 ));
2414 }
2415
2416 self.advance(); self.add_token_with_text(TokenType::String, value);
2418 Ok(())
2419 }

    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
        // Consume the three opening quote characters.
        self.advance();
        self.advance();
        self.advance();
        let mut value = String::new();

        while !self.is_at_end() {
            // Stop at the first run of three closing quotes.
            if self.peek() == quote_char
                && self.current + 1 < self.size
                && self.chars[self.current + 1] == quote_char
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == quote_char
            {
                break;
            }
            value.push(self.advance());
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated triple-quoted string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        // Consume the three closing quote characters.
        self.advance();
        self.advance();
        self.advance();
        let token_type = if quote_char == '"' {
            TokenType::TripleDoubleQuotedString
        } else {
            TokenType::TripleSingleQuotedString
        };
        self.add_token_with_text(token_type, value);
        Ok(())
    }
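
    // Example (a sketch; whether a dialect routes ''' or """ here depends on
    // its TokenizerConfig):
    //   '''abc'''   -> TripleSingleQuotedString("abc")
    //   """ab"cd""" -> TripleDoubleQuotedString("ab\"cd"), lone quotes pass through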

    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let mut value = String::new();

        loop {
            if self.is_at_end() {
                return Err(Error::tokenize(
                    "Unterminated identifier",
                    self.line,
                    self.column,
                    self.start,
                    self.current,
                ));
            }
            if self.peek() == end_quote {
                if self.peek_next() == end_quote {
                    // A doubled quote is an escaped quote.
                    value.push(end_quote);
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else {
                value.push(self.peek());
                self.advance();
            }
        }

        self.advance(); // consume the closing quote
        self.add_token_with_text(TokenType::QuotedIdentifier, value);
        Ok(())
    }

    fn scan_unicode_quoted_string(&mut self, _open_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let start = self.current;
        // Both curly single-quote openers (U+2018 and U+2019) close with
        // U+2019, so the closer does not depend on the opener.
        let close_quote = '\u{2019}';
        while !self.is_at_end() && self.peek() != close_quote {
            self.advance();
        }
        let value = self.text_from_range(start, self.current);
        if !self.is_at_end() {
            self.advance(); // consume the closing quote
        }
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }

    fn scan_unicode_quoted_identifier(&mut self, _open_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let start = self.current;
        // Both curly double-quote openers (U+201C and U+201D) close with
        // U+201D; a straight double quote also terminates the identifier.
        let close_quote = '\u{201D}';
        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
            self.advance();
        }
        let value = self.text_from_range(start, self.current);
        if !self.is_at_end() {
            self.advance(); // consume the closing quote
        }
        self.add_token_with_text(TokenType::QuotedIdentifier, value);
        Ok(())
    }
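
    // Example (a sketch): curly quotes usually arrive from SQL pasted out of
    // word processors.
    //   ‘hello’ -> String("hello")
    //   “name”  -> QuotedIdentifier("name")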

    fn scan_number(&mut self) -> Result<()> {
        // Hex literals: 0x... (optionally a hex float such as 0x1.8p3).
        if self.config.hex_number_strings && !self.is_at_end() && self.peek() == '0' {
            let next = if self.current + 1 < self.size {
                self.chars[self.current + 1]
            } else {
                '\0'
            };
            if next == 'x' || next == 'X' {
                // Consume "0x".
                self.advance();
                self.advance();
                let hex_start = self.current;
                while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
                        break;
                    }
                    self.advance();
                }
                if self.current > hex_start {
                    let mut is_hex_float = false;
                    // Fractional part: 0x1.8...
                    if !self.is_at_end() && self.peek() == '.' {
                        let after_dot = if self.current + 1 < self.size {
                            self.chars[self.current + 1]
                        } else {
                            '\0'
                        };
                        if after_dot.is_ascii_hexdigit() {
                            is_hex_float = true;
                            self.advance(); // consume the dot
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                self.advance();
                            }
                        }
                    }
                    // Binary exponent: p or P with an optional sign.
                    if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
                        is_hex_float = true;
                        self.advance(); // consume p/P
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
                            self.advance();
                        }
                        while !self.is_at_end() && self.peek().is_ascii_digit() {
                            self.advance();
                        }
                    }
                    if is_hex_float {
                        // Hex floats keep their full text, including "0x".
                        let raw_text = self.text_from_range(self.start, self.current);
                        let full_text = if self.config.numbers_can_be_underscore_separated
                            && raw_text.contains('_')
                        {
                            raw_text.replace('_', "")
                        } else {
                            raw_text
                        };
                        self.add_token_with_text(TokenType::Number, full_text);
                    } else {
                        // Plain hex literals carry only the digits after "0x".
                        let raw_value = self.text_from_range(hex_start, self.current);
                        let hex_value = if self.config.numbers_can_be_underscore_separated
                            && raw_value.contains('_')
                        {
                            raw_value.replace('_', "")
                        } else {
                            raw_value
                        };
                        let token_type = if self.config.hex_string_is_integer_type {
                            TokenType::HexNumber
                        } else {
                            TokenType::HexString
                        };
                        self.add_token_with_text(token_type, hex_value);
                    }
                    return Ok(());
                }
                // "0x" with no hex digits: rewind so only the leading 0 is consumed.
                self.current = self.start + 1;
            }
        }

        // Integer part, allowing underscore separators between digits.
        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                break;
            }
            self.advance();
        }

        // Fractional part, but never a `..` range operator.
        if self.peek() == '.' {
            let next = self.peek_next();
            if next != '.' {
                self.advance(); // consume the dot
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                        break;
                    }
                    self.advance();
                }
            }
        }

        // Scientific notation exponent.
        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let raw_text = self.text_from_range(self.start, self.current);
        let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
            raw_text.replace('_', "")
        } else {
            raw_text
        };

        // Dialect-specific numeric suffixes, longest match first; a suffix only
        // counts when it is not followed by more identifier characters.
        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
            let next_char: String = self.peek().to_ascii_uppercase().to_string();
            let suffix_match = if self.current + 1 < self.size {
                let two_char: String = [
                    self.chars[self.current].to_ascii_uppercase(),
                    self.chars[self.current + 1].to_ascii_uppercase(),
                ]
                .iter()
                .collect();
                if self.config.numeric_literals.contains_key(&two_char) {
                    let after_suffix = if self.current + 2 < self.size {
                        self.chars[self.current + 2]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((two_char, 2))
                    } else {
                        None
                    }
                } else if self.config.numeric_literals.contains_key(&next_char) {
                    let after_suffix = if self.current + 1 < self.size {
                        self.chars[self.current + 1]
                    } else {
                        ' '
                    };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((next_char, 1))
                    } else {
                        None
                    }
                } else {
                    None
                }
            } else if self.config.numeric_literals.contains_key(&next_char) {
                Some((next_char, 1))
            } else {
                None
            };

            if let Some((suffix, len)) = suffix_match {
                for _ in 0..len {
                    self.advance();
                }
                let type_name = self
                    .config
                    .numeric_literals
                    .get(&suffix)
                    .expect("suffix verified by contains_key above")
                    .clone();
                // Encode the suffix as a cast, e.g. "10::BIGINT".
                let combined = format!("{}::{}", text, type_name);
                self.add_token_with_text(TokenType::Number, combined);
                return Ok(());
            }
        }

        // Dialects where identifiers may start with a digit (e.g. 1foo).
        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
            let next = self.peek();
            if next.is_alphabetic() || next == '_' {
                while !self.is_at_end() {
                    let ch = self.peek();
                    if ch.is_alphanumeric() || ch == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                let ident_text = self.text_from_range(self.start, self.current);
                self.add_token_with_text(TokenType::Identifier, ident_text);
                return Ok(());
            }
        }

        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }
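
    // Example outcomes for `scan_number` (a sketch; the flags are the
    // TokenizerConfig fields used above):
    //   123     -> Number("123")
    //   1.5e-3  -> Number("1.5e-3")
    //   1_000   -> Number("1000") with numbers_can_be_underscore_separated
    //   0x1F    -> HexNumber/HexString("1F") with hex_number_strings
    //   0x1.8p3 -> Number("0x1.8p3"), hex float kept verbatim
    //   1.5d    -> Number("1.5::DOUBLE") if numeric_literals maps "D"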

    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
        self.advance(); // consume the leading dot

        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                break;
            }
            self.advance();
        }

        // Scientific notation exponent.
        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let raw_text = self.text_from_range(self.start, self.current);
        let text = if self.config.numbers_can_be_underscore_separated && raw_text.contains('_') {
            raw_text.replace('_', "")
        } else {
            raw_text
        };
        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }

    /// Uppercase an ASCII identifier into a stack buffer and look it up in
    /// the keyword table, falling back to `Var` for non-keywords.
    #[inline]
    fn lookup_keyword_ascii(keywords: &HashMap<String, TokenType>, text: &str) -> TokenType {
        if text.len() > 128 {
            return TokenType::Var;
        }
        let mut buf = [0u8; 128];
        for (i, b) in text.bytes().enumerate() {
            buf[i] = b.to_ascii_uppercase();
        }
        // Per-byte ASCII uppercasing never breaks UTF-8, but re-validate anyway.
        if let Ok(upper) = std::str::from_utf8(&buf[..text.len()]) {
            keywords.get(upper).copied().unwrap_or(TokenType::Var)
        } else {
            TokenType::Var
        }
    }

    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
        let first_char = self.peek();
        if !first_char.is_alphanumeric() && first_char != '_' {
            let c = self.advance(); // consume it so the error span is accurate
            return Err(Error::tokenize(
                format!("Unexpected character: '{}'", c),
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        while !self.is_at_end() {
            let c = self.peek();
            if c == '#' {
                // Stop before the JSON operators #> and #-, which would
                // otherwise be swallowed into the identifier.
                let next_c = if self.current + 1 < self.size {
                    self.chars[self.current + 1]
                } else {
                    '\0'
                };
                if next_c == '>' || next_c == '-' {
                    break;
                }
                self.advance();
            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text = self.text_from_range(self.start, self.current);

        // `NOT=` is treated as the inequality operator.
        if text.eq_ignore_ascii_case("NOT") && self.peek() == '=' {
            self.advance(); // consume '='
            self.add_token(TokenType::Neq);
            return Ok(());
        }

        // A quote directly after the identifier may make it a string prefix
        // (R'...', N'...', E'...', X'...', B'...', U&'...').
        let next_char = self.peek();
        let is_single_quote = next_char == '\'';
        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
        // Raw strings accept a double quote even when '"' is not a
        // configured string quote.
        let is_double_quote_for_raw = next_char == '"';

        if text.eq_ignore_ascii_case("R") && (is_single_quote || is_double_quote_for_raw) {
            let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the opening quote
            if self.peek() == quote_char && self.peek_next() == quote_char {
                // Triple-quoted raw string: consume the remaining two quotes.
                self.advance();
                self.advance();
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            } else {
                let string_value = self.scan_raw_string_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            }
            return Ok(());
        }

        if is_single_quote || is_double_quote {
            if text.eq_ignore_ascii_case("N") {
                self.advance(); // consume the opening quote
                let string_value = if is_single_quote {
                    self.scan_string_content()?
                } else {
                    self.scan_double_quoted_string_content()?
                };
                self.add_token_with_text(TokenType::NationalString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("E") {
                // Record the prefix case in the token text so the generator
                // can reproduce it.
                let lowercase = text == "e";
                let prefix = if lowercase { "e:" } else { "E:" };
                self.advance(); // consume the opening quote
                let string_value = self.scan_string_content_with_escapes(true)?;
                self.add_token_with_text(
                    TokenType::EscapeString,
                    format!("{}{}", prefix, string_value),
                );
                return Ok(());
            } else if text.eq_ignore_ascii_case("X") {
                self.advance(); // consume the opening quote
                let string_value = if is_single_quote {
                    self.scan_string_content()?
                } else {
                    self.scan_double_quoted_string_content()?
                };
                self.add_token_with_text(TokenType::HexString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("B") && is_double_quote {
                self.advance(); // consume the opening quote
                let string_value = self.scan_double_quoted_string_content()?;
                self.add_token_with_text(TokenType::ByteString, string_value);
                return Ok(());
            } else if text.eq_ignore_ascii_case("B") && is_single_quote {
                self.advance(); // consume the opening quote
                let string_value = self.scan_string_content()?;
                if self.config.b_prefix_is_byte_string {
                    self.add_token_with_text(TokenType::ByteString, string_value);
                } else {
                    self.add_token_with_text(TokenType::BitString, string_value);
                }
                return Ok(());
            }
        }

        // Unicode string literal: U&'...'.
        if text.eq_ignore_ascii_case("U")
            && self.peek() == '&'
            && self.current + 1 < self.size
            && self.chars[self.current + 1] == '\''
        {
            self.advance(); // consume '&'
            self.advance(); // consume the opening quote
            let string_value = self.scan_string_content()?;
            self.add_token_with_text(TokenType::UnicodeString, string_value);
            return Ok(());
        }

        let token_type = Self::lookup_keyword_ascii(&self.config.keywords, &text);

        self.add_token_with_text(token_type, text);
        Ok(())
    }
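
    // String-prefix dispatch summary (a sketch of the branches above, under
    // the default quote config):
    //   N'abc'  -> NationalString("abc")
    //   E'a\nb' -> EscapeString("E:a\nb"), backslash pair kept verbatim
    //   X'AB'   -> HexString("AB")
    //   B'01'   -> BitString or ByteString, per `b_prefix_is_byte_string`
    //   R'a\nb' -> RawString("a\nb")
    //   U&'x'   -> UnicodeString("x")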

    /// Scan single-quoted string content up to and including the closing
    /// quote, keeping backslash escape pairs verbatim.
    fn scan_string_content_with_escapes(
        &mut self,
        force_backslash_escapes: bool,
    ) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes =
            force_backslash_escapes || self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // A doubled quote is an escaped quote.
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                // Keep the backslash and the escaped character as-is.
                value.push(self.advance());
                if !self.is_at_end() {
                    value.push(self.advance());
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // consume the closing quote
        Ok(value)
    }

    /// Scan single-quoted string content honoring the configured escapes.
    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }

    /// Scan double-quoted string content up to and including the closing
    /// quote, decoding backslash escapes when enabled.
    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // A doubled quote is an escaped quote.
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        '\\' => value.push('\\'),
                        '"' => value.push('"'),
                        '\'' => value.push('\''),
                        'x' => {
                            // \xNN hex escape: read up to two hex digits.
                            let mut hex = String::new();
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                value.push(byte as char);
                            } else {
                                // Not a valid hex pair: keep the escape verbatim.
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        _ => {
                            // Unknown escape: preserve the backslash.
                            value.push('\\');
                            value.push(escaped);
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // consume the closing quote
        Ok(value)
    }

    /// Scan raw string content: backslashes are preserved, except that some
    /// dialects allow a backslash to escape the quote character itself.
    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == quote_char {
                if self.peek_next() == quote_char {
                    // A doubled quote is an escaped quote.
                    value.push(quote_char);
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\'
                && self.peek_next() == quote_char
                && self.config.string_escapes_allowed_in_raw_strings
            {
                // Backslash-escaped quote: keep only the quote.
                value.push(quote_char);
                self.advance();
                self.advance();
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated raw string",
                self.line,
                self.column,
                self.start,
                self.current,
            ));
        }

        self.advance(); // consume the closing quote
        Ok(value)
    }

    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == quote_char && self.peek_next() == quote_char {
                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
                    // Consume the three closing quote characters.
                    self.advance();
                    self.advance();
                    self.advance();
                    return Ok(value);
                }
            }
            let ch = self.advance();
            value.push(ch);
        }

        Err(Error::tokenize(
            "Unterminated raw triple-quoted string",
            self.line,
            self.column,
            self.start,
            self.current,
        ))
    }

    fn scan_dollar_identifier(&mut self) -> Result<()> {
        self.advance(); // consume '$'

        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' {
                self.advance();
            } else {
                break;
            }
        }

        let text = self.text_from_range(self.start, self.current);
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }

    fn scan_tsql_identifier(&mut self) -> Result<()> {
        let first = self.advance(); // consume '@' or '#'

        // ## prefixes a global temporary object.
        if first == '#' && self.peek() == '#' {
            self.advance();
        }

        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text = self.text_from_range(self.start, self.current);
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }

    /// Detect `INSERT ... FORMAT <name>` followed by inline raw data and
    /// return that data if present.
    fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
        let len = self.tokens.len();
        if len < 3 {
            return None;
        }

        // The previous token must be a format name, not VALUES.
        let last = &self.tokens[len - 1];
        if last.text.eq_ignore_ascii_case("VALUES") {
            return None;
        }
        if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
            return None;
        }

        // It must be preceded by the FORMAT keyword...
        let format_tok = &self.tokens[len - 2];
        if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
            return None;
        }

        // ...and an INSERT must appear shortly before that.
        let has_insert = self.tokens[..len - 2]
            .iter()
            .rev()
            .take(20)
            .any(|t| t.token_type == TokenType::Insert);
        if !has_insert {
            return None;
        }

        // Consume everything up to a blank line (or end of input) as raw data.
        let raw_start = self.current;
        while !self.is_at_end() {
            let c = self.peek();
            if c == '\n' {
                let saved = self.current;
                self.advance(); // consume the newline
                while !self.is_at_end() && self.peek() == '\r' {
                    self.advance();
                }
                if self.is_at_end() || self.peek() == '\n' {
                    // A blank line terminates the raw data block.
                    let raw = self.text_from_range(raw_start, saved);
                    return Some(raw.trim().to_string());
                }
            } else {
                self.advance();
            }
        }

        let raw = self.text_from_range(raw_start, self.current);
        let trimmed = raw.trim().to_string();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed)
        }
    }
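
    // Example shape (a sketch): given "INSERT INTO t FORMAT CSV\n1,2\n3,4",
    // the tokens through the format name CSV are already emitted, and this
    // returns Some("1,2\n3,4"), stopping at a blank line or end of input.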

    fn add_token(&mut self, token_type: TokenType) {
        let text = self.text_from_range(self.start, self.current);
        self.add_token_with_text(token_type, text);
    }

    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
        let span = Span::new(self.start, self.current, self.line, self.column);
        let mut token = Token::new(token_type, text, span);
        // Attach any pending leading comments to the new token.
        token.comments.append(&mut self.comments);
        self.tokens.push(token);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_select() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
    }

    #[test]
    fn test_select_with_identifier() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();

        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Var);
        assert_eq!(tokens[1].text, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Var);
        assert_eq!(tokens[3].text, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Var);
        assert_eq!(tokens[5].text, "t");
    }

    #[test]
    fn test_string_literal() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "hello");
    }

    #[test]
    fn test_escaped_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "it's");
    }
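
    // A sketch of the E'...' escape-string prefix: the scanner keeps the
    // backslash pair verbatim and records the prefix case via the internal
    // "E:"/"e:" text encoding used by scan_identifier_or_keyword.
    #[test]
    fn test_escape_string_prefix() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("E'a\\nb'").unwrap();

        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token_type, TokenType::EscapeString);
        assert_eq!(tokens[0].text, "E:a\\nb");
    }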

    #[test]
    fn test_comments() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();

        assert_eq!(tokens.len(), 2);
        // The line comment attaches to the preceding SELECT token.
        assert_eq!(tokens[0].trailing_comments.len(), 1);
        assert_eq!(tokens[0].trailing_comments[0], " comment");
    }

    #[test]
    fn test_comment_in_and_chain() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
        let ast = Parser::parse_sql(sql).unwrap();
        let mut gen = Generator::default();
        let output = gen.generate(&ast[0]).unwrap();
        assert_eq!(
            output,
            "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
        );
    }

    #[test]
    fn test_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();

        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[1].token_type, TokenType::Plus);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[3].token_type, TokenType::Star);
        assert_eq!(tokens[4].token_type, TokenType::Number);
    }

    #[test]
    fn test_comparison_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();

        assert_eq!(tokens[1].token_type, TokenType::Lte);
        assert_eq!(tokens[3].token_type, TokenType::Gte);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    #[test]
    fn test_national_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("N'abc'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for N'abc', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].text, "abc");
    }

    #[test]
    fn test_hex_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for X'ABCD', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::HexString);
        assert_eq!(tokens[0].text, "ABCD");
    }
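
    // Raw-string prefix sketch: R'...' keeps backslashes verbatim, so the
    // two-character sequence \n stays two characters in the token text.
    #[test]
    fn test_raw_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("R'a\\nb'").unwrap();

        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token_type, TokenType::RawString);
        assert_eq!(tokens[0].text, "a\\nb");
    }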

    #[test]
    fn test_bit_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("B'01010'").unwrap();

        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for B'01010', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::BitString);
        assert_eq!(tokens[0].text, "01010");
    }
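
    // Unicode-string prefix sketch: U&'...' is scanned like a plain string
    // but tagged UnicodeString; escape processing is left to the parser.
    #[test]
    fn test_unicode_string_prefix() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("U&'abc'").unwrap();

        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token_type, TokenType::UnicodeString);
        assert_eq!(tokens[0].text, "abc");
    }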

    #[test]
    fn test_trailing_dot_number() {
        let tokenizer = Tokenizer::default();

        // A trailing dot stays attached to the number.
        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
        assert_eq!(
            tokens.len(),
            2,
            "Expected 2 tokens for 'SELECT 1.', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");

        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
        assert_eq!(tokens[1].text, "1.5");

        // A letter after the dot becomes a separate identifier token.
        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'SELECT 1.a', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");
        assert_eq!(tokens[2].token_type, TokenType::Var);

        // `..` is never folded into a number.
        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
        assert_eq!(tokens[2].token_type, TokenType::Dot);
        assert_eq!(tokens[3].token_type, TokenType::Dot);
        assert_eq!(tokens[4].token_type, TokenType::Number);
        assert_eq!(tokens[4].text, "2");
    }

    #[test]
    fn test_leading_dot_number() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize(".25").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.25', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".25");

        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
        assert_eq!(
            tokens.len(),
            4,
            "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Sample);
        assert_eq!(tokens[1].token_type, TokenType::LParen);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[2].text, ".25");
        assert_eq!(tokens[3].token_type, TokenType::RParen);

        let tokens = tokenizer.tokenize(".5e10").unwrap();
        assert_eq!(
            tokens.len(),
            1,
            "Expected 1 token for '.5e10', got {:?}",
            tokens
        );
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".5e10");

        let tokens = tokenizer.tokenize("a.b").unwrap();
        assert_eq!(
            tokens.len(),
            3,
            "Expected 3 tokens for 'a.b', got {:?}",
            tokens
        );
        assert_eq!(tokens[1].token_type, TokenType::Dot);
    }
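
    // Hex-literal sketch: with `hex_number_strings` enabled, 0x1F is scanned
    // as a HexNumber or HexString (per `hex_string_is_integer_type`, whose
    // default is not assumed here); either way the token carries the digits
    // without the "0x".
    #[test]
    fn test_hex_number_literal() {
        let mut config = TokenizerConfig::default();
        config.hex_number_strings = true;
        let tokenizer = Tokenizer::new(config);

        let tokens = tokenizer.tokenize("SELECT 0x1F").unwrap();
        assert_eq!(tokens.len(), 2);
        assert!(matches!(
            tokens[1].token_type,
            TokenType::HexNumber | TokenType::HexString
        ));
        assert_eq!(tokens[1].text, "1F");
    }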

    #[test]
    fn test_unrecognized_character() {
        let tokenizer = Tokenizer::default();

        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
        assert!(
            result.is_ok(),
            "Curly quotes should be tokenized as strings"
        );

        let result = tokenizer.tokenize("SELECT • FROM t");
        assert!(result.is_err());
    }

    #[test]
    fn test_colon_eq_tokenization() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("a := 1").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].token_type, TokenType::Var);
        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
        assert_eq!(tokens[2].token_type, TokenType::Number);

        // A bare colon must not fuse with the following identifier.
        let tokens = tokenizer.tokenize("a:b").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));

        let tokens = tokenizer.tokenize("a::INT").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
    }

    #[test]
    fn test_colon_eq_parsing() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
            .expect("Failed to parse MySQL @var := expr");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := 1, @var2");

        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
            .expect("Failed to parse MySQL @var2 := @var1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1, @var2 := @var1");

        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
            .expect("Failed to parse MySQL @var := COUNT(*)");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");

        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SET @var1 = 1");

        let ast =
            Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "UNION_VALUE(k1 := 1)");

        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
            .expect("Failed to parse UNNEST with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");

        let ast =
            Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo");

        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
            .expect("Failed to parse DuckDB multiple prefix aliases");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
    }

    #[test]
    fn test_colon_eq_dialect_roundtrip() {
        use crate::dialects::{Dialect, DialectType};

        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
            let d = Dialect::get(dialect);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

        check(
            DialectType::DuckDB,
            "SELECT UNNEST(col, recursive := TRUE) FROM t",
            None,
        );
        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

        {
            // Parse-only: verifies a := named argument composes with a :: cast.
            let d = Dialect::get(DialectType::DuckDB);
            let ast = d
                .parse("STRUCT_PACK(a := 'b')::json")
                .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
        }

        check(
            DialectType::DuckDB,
            "SELECT foo: 1",
            Some("SELECT 1 AS foo"),
        );
        check(
            DialectType::DuckDB,
            "SELECT foo: 1, bar: 2, baz: 3",
            Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
        );
    }
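
    // Numeric-suffix sketch: a configured suffix is folded into the token
    // text as a cast, e.g. 10L -> "10::BIGINT". The "L" -> "BIGINT" mapping
    // here is illustrative, not a dialect default.
    #[test]
    fn test_numeric_literal_suffix() {
        let mut config = TokenizerConfig::default();
        config
            .numeric_literals
            .insert("L".to_string(), "BIGINT".to_string());
        let tokenizer = Tokenizer::new(config);

        let tokens = tokenizer.tokenize("SELECT 10L").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "10::BIGINT");
    }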

    #[test]
    fn test_comment_roundtrip() {
        use crate::generator::Generator;
        use crate::parser::Parser;

        fn check_roundtrip(sql: &str) -> Option<String> {
            let ast = match Parser::parse_sql(sql) {
                Ok(a) => a,
                Err(e) => return Some(format!("Parse error: {:?}", e)),
            };
            if ast.is_empty() {
                return Some("Empty AST".to_string());
            }
            let mut generator = Generator::default();
            let output = match generator.generate(&ast[0]) {
                Ok(o) => o,
                Err(e) => return Some(format!("Gen error: {:?}", e)),
            };
            if output == sql {
                None
            } else {
                Some(format!(
                    "Mismatch:\n input: {}\n output: {}",
                    sql, output
                ))
            }
        }

        let tests = vec![
            "SELECT c /* c1 /* c2 */ c3 */",
            "SELECT c /* c1 /* c2 /* c3 */ */ */",
            "SELECT c /* c1 */ AS alias /* c2 */",
            "SELECT a /* x */, b /* x */",
            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
            "SELECT * FROM foo /* x */, bla /* x */",
            "SELECT 1 /* comment */ + 1",
            "SELECT 1 /* c1 */ + 2 /* c2 */",
            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
            "SELECT CAST(x AS INT) /* comment */ FROM foo",
            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
            "/* comment */ CREATE TABLE foo AS SELECT 1",
            "INSERT INTO foo SELECT * FROM bar /* comment */",
            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
        ];

        let mut failures = Vec::new();
        for sql in tests {
            if let Some(e) = check_roundtrip(sql) {
                failures.push(e);
            }
        }

        if !failures.is_empty() {
            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
        }
    }

    #[test]
    fn test_dollar_quoted_string_parsing() {
        use crate::dialects::{Dialect, DialectType};

        // Dollar-quoted tokens store the tag before a NUL separator.
        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
        assert_eq!(tag, Some("FOO".to_string()));
        assert_eq!(content, "content here");

        let (tag, content) = super::parse_dollar_string_token("just content");
        assert_eq!(tag, None);
        assert_eq!(content, "just content");

        fn check_databricks(sql: &str, expected: Option<&str>) {
            let d = Dialect::get(DialectType::Databricks);
            let ast = d
                .parse(sql)
                .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d
                .transform(ast[0].clone())
                .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d
                .generate(&transformed)
                .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n return x+1$$",
            None,
        );

        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n return x+1$FOO$",
            None,
        );
    }

    #[test]
    fn test_numeric_underscore_stripping() {
        let mut config = TokenizerConfig::default();
        config.numbers_can_be_underscore_separated = true;
        let tokenizer = Tokenizer::new(config);

        let tokens = tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "12345");

        let tokens = tokenizer.tokenize("SELECT 20_000").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "20000");

        let tokens = tokenizer.tokenize("SELECT 1_2E+1_0").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "12E+10");

        // Without the flag, underscores are kept verbatim.
        let default_tokenizer = Tokenizer::default();
        let tokens = default_tokenizer.tokenize("SELECT 1_2_3_4_5").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1_2_3_4_5");
    }
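
    // Leading-dot numbers also honor underscore separators when the flag is
    // set; a sketch complementing test_numeric_underscore_stripping above.
    #[test]
    fn test_leading_dot_number_with_underscores() {
        let mut config = TokenizerConfig::default();
        config.numbers_can_be_underscore_separated = true;
        let tokenizer = Tokenizer::new(config);

        let tokens = tokenizer.tokenize(".2_5").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".25");
    }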
}