use crate::error::{Error, Result};
use serde::{Deserialize, Serialize};
use std::fmt;
#[cfg(feature = "bindings")]
use ts_rs::TS;

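/// Splits the internal `tag\x00content` encoding produced for tagged
/// dollar-quoted strings back into its tag and content parts.
///
/// A minimal sketch of the round trip (the literals are illustrative, not
/// fixtures from this crate):
///
/// ```ignore
/// let (tag, body) = parse_dollar_string_token("fn\u{0}SELECT 1");
/// assert_eq!(tag.as_deref(), Some("fn"));
/// assert_eq!(body, "SELECT 1");
///
/// // Untagged `$$...$$` bodies carry no NUL separator.
/// assert_eq!(parse_dollar_string_token("SELECT 1"), (None, "SELECT 1".to_string()));
/// ```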
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    if let Some(pos) = text.find('\x00') {
        let tag = &text[..pos];
        let content = &text[pos + 1..];
        (Some(tag.to_string()), content.to_string())
    } else {
        (None, text.to_string())
    }
}
24
/// Source location of a token.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct Span {
    /// Start offset of the token in the source text.
    pub start: usize,
    /// End offset of the token in the source text.
    pub end: usize,
    /// 1-based line on which the token starts.
    pub line: usize,
    /// 1-based column at which the token starts.
    pub column: usize,
}
38
39impl Span {
40 pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
41 Self {
42 start,
43 end,
44 line,
45 column,
46 }
47 }
48}
49
/// A single lexical token together with its source span and attached comments.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The kind of token.
    pub token_type: TokenType,
    /// The token's text (for string tokens this is the unquoted, unescaped value).
    pub text: String,
    /// Where the token appears in the source.
    pub span: Span,
    /// Comments attached before the token.
    #[serde(default)]
    pub comments: Vec<String>,
    /// Comments attached after the token.
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
66
67impl Token {
68 pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
70 Self {
71 token_type,
72 text: text.into(),
73 span,
74 comments: Vec::new(),
75 trailing_comments: Vec::new(),
76 }
77 }
78
79 pub fn number(n: i64) -> Self {
81 Self::new(TokenType::Number, n.to_string(), Span::default())
82 }
83
84 pub fn string(s: impl Into<String>) -> Self {
86 Self::new(TokenType::String, s, Span::default())
87 }
88
89 pub fn identifier(s: impl Into<String>) -> Self {
91 Self::new(TokenType::Identifier, s, Span::default())
92 }
93
94 pub fn var(s: impl Into<String>) -> Self {
96 Self::new(TokenType::Var, s, Span::default())
97 }
98
99 pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
101 self.comments.push(comment.into());
102 self
103 }
104}
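
// A minimal sketch of the constructor helpers above (values are illustrative):
//
//     let tok = Token::identifier("orders").with_comment("source table");
//     assert_eq!(tok.token_type, TokenType::Identifier);
//     assert_eq!(tok.comments, vec!["source table".to_string()]);
//
//     let num = Token::number(42); // text is "42", span stays at its default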
105
106impl fmt::Display for Token {
107 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
108 write!(f, "{:?}({})", self.token_type, self.text)
109 }
110}
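
// With the Display impl above, `format!("{}", Token::number(42))` renders as
// "Number(42)": the Debug form of the token type followed by the token text
// in parentheses.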
111
112#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
114#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
115#[repr(u16)]
116pub enum TokenType {
117 LParen,
119 RParen,
120 LBracket,
121 RBracket,
122 LBrace,
123 RBrace,
124 Comma,
125 Dot,
126 Dash,
127 Plus,
128 Colon,
129 DotColon,
130 DColon,
131 DColonDollar,
132 DColonPercent,
133 DColonQMark,
134 DQMark,
135 Semicolon,
136 Star,
137 Backslash,
138 Slash,
139 Lt,
140 Lte,
141 Gt,
142 Gte,
143 Not,
144 Eq,
145 Neq,
146 NullsafeEq,
147 ColonEq,
148 ColonGt,
149 NColonGt,
150 And,
151 Or,
152 Amp,
153 DPipe,
154 PipeGt,
155 Pipe,
156 PipeSlash,
157 DPipeSlash,
158 Caret,
159 CaretAt,
    LtLt,
    GtGt,
    Tilde,
163 Arrow,
164 DArrow,
165 FArrow,
166 Hash,
167 HashArrow,
168 DHashArrow,
169 LrArrow,
170 DAt,
171 AtAt,
172 LtAt,
173 AtGt,
174 Dollar,
175 Parameter,
176 Session,
177 SessionParameter,
178 SessionUser,
179 DAmp,
180 AmpLt,
181 AmpGt,
182 Adjacent,
183 Xor,
184 DStar,
185 QMarkAmp,
186 QMarkPipe,
187 HashDash,
188 Exclamation,
189
190 UriStart,
191 BlockStart,
192 BlockEnd,
193 Space,
194 Break,
195
    BlockComment,
    LineComment,
    String,
    DollarString,
    TripleDoubleQuotedString,
    TripleSingleQuotedString,
    Number,
206 Identifier,
207 QuotedIdentifier,
208 Database,
209 Column,
210 ColumnDef,
211 Schema,
212 Table,
213 Warehouse,
214 Stage,
215 Streamlit,
216 Var,
217 BitString,
218 HexString,
219 HexNumber,
221 ByteString,
222 NationalString,
    EscapeString,
    RawString,
225 HeredocString,
226 HeredocStringAlternative,
227 UnicodeString,
228
229 Bit,
231 Boolean,
232 TinyInt,
233 UTinyInt,
234 SmallInt,
235 USmallInt,
236 MediumInt,
237 UMediumInt,
238 Int,
239 UInt,
240 BigInt,
241 UBigInt,
242 BigNum,
243 Int128,
244 UInt128,
245 Int256,
246 UInt256,
247 Float,
248 Double,
249 UDouble,
250 Decimal,
251 Decimal32,
252 Decimal64,
253 Decimal128,
254 Decimal256,
255 DecFloat,
256 UDecimal,
257 BigDecimal,
258 Char,
259 NChar,
260 VarChar,
261 NVarChar,
262 BpChar,
263 Text,
264 MediumText,
265 LongText,
266 Blob,
267 MediumBlob,
268 LongBlob,
269 TinyBlob,
270 TinyText,
271 Name,
272 Binary,
273 VarBinary,
274 Json,
275 JsonB,
276 Time,
277 TimeTz,
278 TimeNs,
279 Timestamp,
280 TimestampTz,
281 TimestampLtz,
282 TimestampNtz,
283 TimestampS,
284 TimestampMs,
285 TimestampNs,
286 DateTime,
287 DateTime2,
288 DateTime64,
289 SmallDateTime,
290 Date,
291 Date32,
292 Int4Range,
293 Int4MultiRange,
294 Int8Range,
295 Int8MultiRange,
296 NumRange,
297 NumMultiRange,
298 TsRange,
299 TsMultiRange,
300 TsTzRange,
301 TsTzMultiRange,
302 DateRange,
303 DateMultiRange,
304 Uuid,
305 Geography,
306 GeographyPoint,
307 Nullable,
308 Geometry,
309 Point,
310 Ring,
311 LineString,
312 LocalTime,
313 LocalTimestamp,
314 SysTimestamp,
315 MultiLineString,
316 Polygon,
317 MultiPolygon,
318 HllSketch,
319 HStore,
320 Super,
321 Serial,
322 SmallSerial,
323 BigSerial,
324 Xml,
325 Year,
326 UserDefined,
327 Money,
328 SmallMoney,
329 RowVersion,
330 Image,
331 Variant,
332 Object,
333 Inet,
334 IpAddress,
335 IpPrefix,
336 Ipv4,
337 Ipv6,
338 Enum,
339 Enum8,
340 Enum16,
341 FixedString,
342 LowCardinality,
343 Nested,
344 AggregateFunction,
345 SimpleAggregateFunction,
346 TDigest,
347 Unknown,
348 Vector,
349 Dynamic,
350 Void,
351
352 Add,
354 Alias,
355 Alter,
356 All,
357 Anti,
358 Any,
359 Apply,
360 Array,
361 Asc,
362 AsOf,
363 Attach,
364 AutoIncrement,
365 Begin,
366 Between,
367 BulkCollectInto,
368 Cache,
369 Cascade,
370 Case,
371 CharacterSet,
372 Cluster,
373 ClusterBy,
374 Collate,
375 Command,
376 Comment,
377 Commit,
378 Preserve,
379 Connect,
380 ConnectBy,
381 Constraint,
382 Copy,
383 Create,
384 Cross,
385 Cube,
386 CurrentDate,
387 CurrentDateTime,
388 CurrentSchema,
389 CurrentTime,
390 CurrentTimestamp,
391 CurrentUser,
392 CurrentRole,
393 CurrentCatalog,
394 Declare,
395 Default,
396 Delete,
397 Desc,
398 Describe,
399 Detach,
400 Dictionary,
401 Distinct,
402 Distribute,
403 DistributeBy,
404 Div,
405 Drop,
406 Else,
407 End,
408 Escape,
409 Except,
410 Execute,
411 Exists,
412 False,
413 Fetch,
414 File,
415 FileFormat,
416 Filter,
417 Final,
418 First,
419 For,
420 Force,
421 ForeignKey,
422 Format,
423 From,
424 Full,
425 Function,
426 Get,
427 Glob,
428 Global,
429 Grant,
430 GroupBy,
431 GroupingSets,
432 Having,
433 Hint,
434 Ignore,
435 ILike,
436 In,
437 Index,
438 IndexedBy,
439 Inner,
440 Input,
441 Insert,
442 Install,
443 Intersect,
444 Interval,
445 Into,
446 Inpath,
447 InputFormat,
448 Introducer,
449 IRLike,
450 Is,
451 IsNull,
452 Join,
453 JoinMarker,
454 Keep,
455 Key,
456 Kill,
457 Lambda,
458 Language,
459 Lateral,
460 Left,
461 Like,
    NotLike,
    NotILike,
    NotRLike,
    NotIRLike,
    Limit,
467 List,
468 Load,
469 Local,
470 Lock,
471 Map,
472 Match,
473 MatchCondition,
474 MatchRecognize,
475 MemberOf,
476 Materialized,
477 Merge,
478 Mod,
479 Model,
480 Natural,
481 Next,
482 NoAction,
483 Nothing,
484 NotNull,
485 Null,
486 ObjectIdentifier,
487 Offset,
488 On,
489 Only,
490 Operator,
491 OrderBy,
492 OrderSiblingsBy,
493 Ordered,
494 Ordinality,
495 Out,
496 Outer,
497 Output,
498 Over,
499 Overlaps,
500 Overwrite,
501 Partition,
502 PartitionBy,
503 Percent,
504 Pivot,
505 Placeholder,
506 Positional,
507 Pragma,
508 Prewhere,
509 PrimaryKey,
510 Procedure,
511 Properties,
512 PseudoType,
513 Put,
514 Qualify,
515 Quote,
516 QDColon,
517 Range,
518 Recursive,
519 Refresh,
520 Rename,
521 Replace,
522 Returning,
523 Revoke,
524 References,
525 Restrict,
526 Right,
527 RLike,
528 Rollback,
529 Rollup,
530 Row,
531 Rows,
532 Select,
533 Semi,
534 Savepoint,
535 Separator,
536 Sequence,
537 Serde,
538 SerdeProperties,
539 Set,
540 Settings,
541 Show,
542 Siblings,
543 SimilarTo,
544 Some,
545 Sort,
546 SortBy,
547 SoundsLike,
548 StartWith,
549 StorageIntegration,
550 StraightJoin,
551 Struct,
552 Summarize,
553 TableSample,
554 Sample,
555 Bernoulli,
556 System,
557 Block,
558 Seed,
559 Repeatable,
560 Tag,
561 Temporary,
562 Transaction,
563 To,
564 Top,
565 Then,
566 True,
567 Truncate,
568 Uncache,
569 Union,
570 Unnest,
571 Unpivot,
572 Update,
573 Use,
574 Using,
575 Values,
576 View,
577 SemanticView,
578 Volatile,
579 When,
580 Where,
581 Window,
582 With,
583 Ties,
584 Exclude,
585 No,
586 Others,
587 Unique,
588 UtcDate,
589 UtcTime,
590 UtcTimestamp,
591 VersionSnapshot,
592 TimestampSnapshot,
593 Option,
594 Sink,
595 Source,
596 Analyze,
597 Namespace,
598 Export,
599 As,
600 By,
601 Nulls,
602 Respect,
603 Last,
604 If,
605 Cast,
606 TryCast,
607 SafeCast,
608 Count,
609 Extract,
610 Substring,
611 Trim,
612 Leading,
613 Trailing,
614 Both,
615 Position,
616 Overlaying,
617 Placing,
618 Treat,
619 Within,
620 Group,
621 Order,
622
623 Unbounded,
625 Preceding,
626 Following,
627 Current,
628 Groups,
629
630 Trigger,
632 Type,
633 Domain,
634 Returns,
635 Body,
636 Increment,
637 Minvalue,
638 Maxvalue,
639 Start,
640 Cycle,
641 NoCycle,
642 Prior,
643 Generated,
644 Identity,
645 Always,
646 Measures,
648 Pattern,
649 Define,
650 Running,
651 Owned,
652 After,
653 Before,
654 Instead,
655 Each,
656 Statement,
657 Referencing,
658 Old,
659 New,
660 Of,
661 Check,
662 Authorization,
663 Restart,
664
665 Eof,
667}
668
669impl TokenType {
670 pub fn is_keyword(&self) -> bool {
672 matches!(
673 self,
674 TokenType::Select
675 | TokenType::From
676 | TokenType::Where
677 | TokenType::And
678 | TokenType::Or
679 | TokenType::Not
680 | TokenType::In
681 | TokenType::Is
682 | TokenType::Null
683 | TokenType::True
684 | TokenType::False
685 | TokenType::As
686 | TokenType::On
687 | TokenType::Join
688 | TokenType::Left
689 | TokenType::Right
690 | TokenType::Inner
691 | TokenType::Outer
692 | TokenType::Full
693 | TokenType::Cross
694 | TokenType::Semi
695 | TokenType::Anti
696 | TokenType::Union
697 | TokenType::Except
698 | TokenType::Intersect
699 | TokenType::GroupBy
700 | TokenType::OrderBy
701 | TokenType::Having
702 | TokenType::Limit
703 | TokenType::Offset
704 | TokenType::Case
705 | TokenType::When
706 | TokenType::Then
707 | TokenType::Else
708 | TokenType::End
709 | TokenType::Create
710 | TokenType::Drop
711 | TokenType::Alter
712 | TokenType::Insert
713 | TokenType::Update
714 | TokenType::Delete
715 | TokenType::Into
716 | TokenType::Values
717 | TokenType::Set
718 | TokenType::With
719 | TokenType::Distinct
720 | TokenType::All
721 | TokenType::Exists
722 | TokenType::Between
723 | TokenType::Like
724 | TokenType::ILike
725 | TokenType::Filter
727 | TokenType::Date
728 | TokenType::Timestamp
729 | TokenType::TimestampTz
730 | TokenType::Interval
731 | TokenType::Time
732 | TokenType::Table
733 | TokenType::Index
734 | TokenType::Column
735 | TokenType::Database
736 | TokenType::Schema
737 | TokenType::View
738 | TokenType::Function
739 | TokenType::Procedure
740 | TokenType::Trigger
741 | TokenType::Sequence
742 | TokenType::Over
743 | TokenType::Partition
744 | TokenType::Window
745 | TokenType::Rows
746 | TokenType::Range
747 | TokenType::First
748 | TokenType::Last
749 | TokenType::Preceding
750 | TokenType::Following
751 | TokenType::Current
752 | TokenType::Row
753 | TokenType::Unbounded
754 | TokenType::Array
755 | TokenType::Struct
756 | TokenType::Map
757 | TokenType::PrimaryKey
758 | TokenType::Key
759 | TokenType::ForeignKey
760 | TokenType::References
761 | TokenType::Unique
762 | TokenType::Check
763 | TokenType::Default
764 | TokenType::Constraint
765 | TokenType::Comment
766 | TokenType::Rollup
767 | TokenType::Cube
768 | TokenType::Grant
769 | TokenType::Revoke
770 | TokenType::Type
771 | TokenType::Use
772 | TokenType::Cache
773 | TokenType::Uncache
774 | TokenType::Load
775 | TokenType::Any
776 | TokenType::Some
777 | TokenType::Asc
778 | TokenType::Desc
779 | TokenType::Nulls
780 | TokenType::Lateral
781 | TokenType::Natural
782 | TokenType::Escape
783 | TokenType::Glob
784 | TokenType::Match
785 | TokenType::Recursive
786 | TokenType::Replace
787 | TokenType::Returns
788 | TokenType::If
789 | TokenType::Pivot
790 | TokenType::Unpivot
791 | TokenType::Json
792 | TokenType::Blob
793 | TokenType::Text
794 | TokenType::Int
795 | TokenType::BigInt
796 | TokenType::SmallInt
797 | TokenType::TinyInt
798 | TokenType::Int128
799 | TokenType::UInt128
800 | TokenType::Int256
801 | TokenType::UInt256
802 | TokenType::UInt
803 | TokenType::UBigInt
804 | TokenType::Float
805 | TokenType::Double
806 | TokenType::Decimal
807 | TokenType::Boolean
808 | TokenType::VarChar
809 | TokenType::Char
810 | TokenType::Binary
811 | TokenType::VarBinary
812 | TokenType::No
813 | TokenType::DateTime
814 | TokenType::Truncate
815 | TokenType::Execute
816 | TokenType::Merge
817 | TokenType::Top
818 | TokenType::Begin
819 | TokenType::Generated
820 | TokenType::Identity
821 | TokenType::Always
822 | TokenType::Extract
823 | TokenType::AsOf
825 | TokenType::Prior
826 | TokenType::After
827 | TokenType::Restrict
828 | TokenType::Cascade
829 | TokenType::Local
830 | TokenType::Rename
831 | TokenType::Enum
832 | TokenType::Within
833 | TokenType::Format
834 | TokenType::Final
835 | TokenType::FileFormat
836 | TokenType::Input
837 | TokenType::InputFormat
838 | TokenType::Copy
839 | TokenType::Put
840 | TokenType::Get
841 | TokenType::Show
842 | TokenType::Serde
843 | TokenType::Sample
844 | TokenType::Sort
845 | TokenType::Collate
846 | TokenType::Ties
847 | TokenType::IsNull
848 | TokenType::NotNull
849 | TokenType::Exclude
850 | TokenType::Temporary
851 | TokenType::Add
852 | TokenType::Ordinality
853 | TokenType::Overlaps
854 | TokenType::Block
855 | TokenType::Pattern
856 | TokenType::Group
857 | TokenType::Cluster
858 | TokenType::Repeatable
859 | TokenType::Groups
860 | TokenType::Commit
861 | TokenType::Warehouse
862 | TokenType::System
863 | TokenType::By
864 | TokenType::To
865 | TokenType::Fetch
866 | TokenType::For
867 | TokenType::Only
868 | TokenType::Next
869 | TokenType::Lock
870 | TokenType::Refresh
871 | TokenType::Settings
872 | TokenType::Operator
873 | TokenType::Overwrite
874 | TokenType::StraightJoin
875 | TokenType::Start
876 | TokenType::Ignore
878 | TokenType::Domain
879 | TokenType::Apply
880 | TokenType::Respect
881 | TokenType::Materialized
882 | TokenType::Prewhere
883 | TokenType::Old
884 | TokenType::New
885 | TokenType::Cast
886 | TokenType::TryCast
887 | TokenType::SafeCast
888 | TokenType::Transaction
889 | TokenType::Describe
890 | TokenType::Kill
891 | TokenType::Lambda
892 | TokenType::Declare
893 | TokenType::Keep
894 | TokenType::Output
895 | TokenType::Percent
896 | TokenType::Qualify
897 | TokenType::Returning
898 | TokenType::Language
899 | TokenType::Preserve
900 | TokenType::Savepoint
901 | TokenType::Rollback
902 | TokenType::Body
903 | TokenType::Increment
904 | TokenType::Minvalue
905 | TokenType::Maxvalue
906 | TokenType::Cycle
907 | TokenType::NoCycle
908 | TokenType::Seed
909 | TokenType::Namespace
910 | TokenType::Authorization
911 | TokenType::Order
912 | TokenType::Restart
913 | TokenType::Before
914 | TokenType::Instead
915 | TokenType::Each
916 | TokenType::Statement
917 | TokenType::Referencing
918 | TokenType::Of
919 | TokenType::Separator
920 | TokenType::Others
921 | TokenType::Placing
922 | TokenType::Owned
923 | TokenType::Running
924 | TokenType::Define
925 | TokenType::Measures
926 | TokenType::MatchRecognize
927 | TokenType::AutoIncrement
928 | TokenType::Connect
929 | TokenType::Distribute
930 | TokenType::Bernoulli
931 | TokenType::TableSample
932 | TokenType::Inpath
933 | TokenType::Pragma
934 | TokenType::Siblings
935 | TokenType::SerdeProperties
936 | TokenType::RLike
937 )
938 }
939
940 pub fn is_comparison(&self) -> bool {
942 matches!(
943 self,
944 TokenType::Eq
945 | TokenType::Neq
946 | TokenType::Lt
947 | TokenType::Lte
948 | TokenType::Gt
949 | TokenType::Gte
950 | TokenType::NullsafeEq
951 )
952 }
953
954 pub fn is_arithmetic(&self) -> bool {
956 matches!(
957 self,
958 TokenType::Plus
959 | TokenType::Dash
960 | TokenType::Star
961 | TokenType::Slash
962 | TokenType::Percent
963 | TokenType::Mod
964 | TokenType::Div
965 )
966 }
967}
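
// Quick illustrative checks for the classification helpers above:
//
//     assert!(TokenType::Select.is_keyword());
//     assert!(TokenType::Gte.is_comparison());
//     assert!(TokenType::Star.is_arithmetic());
//     assert!(!TokenType::Comma.is_keyword());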
968
969impl fmt::Display for TokenType {
970 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
971 write!(f, "{:?}", self)
972 }
973}
974
/// Dialect-specific configuration for the [`Tokenizer`].
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keyword text (uppercased) mapped to its token type.
    pub keywords: std::collections::HashMap<String, TokenType>,
    /// Single-character tokens such as `(`, `,` and `;`.
    pub single_tokens: std::collections::HashMap<char, TokenType>,
    /// String-literal delimiters: opening quote mapped to its closing quote.
    pub quotes: std::collections::HashMap<String, String>,
    /// Quoted-identifier delimiters: opening character mapped to its closing character.
    pub identifiers: std::collections::HashMap<char, char>,
    /// Comment openers mapped to their closer, or `None` for line comments.
    pub comments: std::collections::HashMap<String, Option<String>>,
    /// Characters that introduce an escape sequence inside string literals.
    pub string_escapes: Vec<char>,
    /// Whether `/* ... */` comments may nest.
    pub nested_comments: bool,
    pub escape_follow_chars: Vec<char>,
    pub b_prefix_is_byte_string: bool,
    pub numeric_literals: std::collections::HashMap<String, String>,
    pub identifiers_can_start_with_digit: bool,
    /// Whether `0x...` literals receive dedicated hex-number handling.
    pub hex_number_strings: bool,
    pub hex_string_is_integer_type: bool,
    pub string_escapes_allowed_in_raw_strings: bool,
    /// Whether `#` (and `//`) start line comments.
    pub hash_comments: bool,
    /// Whether `$` may start an identifier.
    pub dollar_sign_is_identifier: bool,
    /// Whether raw data following `INSERT ... FORMAT` is captured as a raw Var token.
    pub insert_format_raw_data: bool,
}
1031
1032impl Default for TokenizerConfig {
1033 fn default() -> Self {
1034 let mut keywords = std::collections::HashMap::new();
1035 keywords.insert("SELECT".to_string(), TokenType::Select);
1037 keywords.insert("FROM".to_string(), TokenType::From);
1038 keywords.insert("WHERE".to_string(), TokenType::Where);
1039 keywords.insert("AND".to_string(), TokenType::And);
1040 keywords.insert("OR".to_string(), TokenType::Or);
1041 keywords.insert("NOT".to_string(), TokenType::Not);
1042 keywords.insert("AS".to_string(), TokenType::As);
1043 keywords.insert("ON".to_string(), TokenType::On);
1044 keywords.insert("JOIN".to_string(), TokenType::Join);
1045 keywords.insert("LEFT".to_string(), TokenType::Left);
1046 keywords.insert("RIGHT".to_string(), TokenType::Right);
1047 keywords.insert("INNER".to_string(), TokenType::Inner);
1048 keywords.insert("OUTER".to_string(), TokenType::Outer);
1049 keywords.insert("OUTPUT".to_string(), TokenType::Output);
1050 keywords.insert("FULL".to_string(), TokenType::Full);
1051 keywords.insert("CROSS".to_string(), TokenType::Cross);
1052 keywords.insert("SEMI".to_string(), TokenType::Semi);
1053 keywords.insert("ANTI".to_string(), TokenType::Anti);
1054 keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
1055 keywords.insert("UNION".to_string(), TokenType::Union);
1056 keywords.insert("EXCEPT".to_string(), TokenType::Except);
        keywords.insert("MINUS".to_string(), TokenType::Except); // MINUS is an alias for EXCEPT
        keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
1059 keywords.insert("GROUP".to_string(), TokenType::Group);
1060 keywords.insert("CUBE".to_string(), TokenType::Cube);
1061 keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
1062 keywords.insert("WITHIN".to_string(), TokenType::Within);
1063 keywords.insert("ORDER".to_string(), TokenType::Order);
1064 keywords.insert("BY".to_string(), TokenType::By);
1065 keywords.insert("HAVING".to_string(), TokenType::Having);
1066 keywords.insert("LIMIT".to_string(), TokenType::Limit);
1067 keywords.insert("OFFSET".to_string(), TokenType::Offset);
1068 keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
1069 keywords.insert("FETCH".to_string(), TokenType::Fetch);
1070 keywords.insert("FIRST".to_string(), TokenType::First);
1071 keywords.insert("NEXT".to_string(), TokenType::Next);
1072 keywords.insert("ONLY".to_string(), TokenType::Only);
1073 keywords.insert("KEEP".to_string(), TokenType::Keep);
1074 keywords.insert("IGNORE".to_string(), TokenType::Ignore);
1075 keywords.insert("INPUT".to_string(), TokenType::Input);
1076 keywords.insert("CASE".to_string(), TokenType::Case);
1077 keywords.insert("WHEN".to_string(), TokenType::When);
1078 keywords.insert("THEN".to_string(), TokenType::Then);
1079 keywords.insert("ELSE".to_string(), TokenType::Else);
1080 keywords.insert("END".to_string(), TokenType::End);
        keywords.insert("ENDIF".to_string(), TokenType::End); // ENDIF is tokenized like END
        keywords.insert("NULL".to_string(), TokenType::Null);
1083 keywords.insert("TRUE".to_string(), TokenType::True);
1084 keywords.insert("FALSE".to_string(), TokenType::False);
1085 keywords.insert("IS".to_string(), TokenType::Is);
1086 keywords.insert("IN".to_string(), TokenType::In);
1087 keywords.insert("BETWEEN".to_string(), TokenType::Between);
1088 keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
1089 keywords.insert("LIKE".to_string(), TokenType::Like);
1090 keywords.insert("ILIKE".to_string(), TokenType::ILike);
1091 keywords.insert("RLIKE".to_string(), TokenType::RLike);
1092 keywords.insert("REGEXP".to_string(), TokenType::RLike);
1093 keywords.insert("ESCAPE".to_string(), TokenType::Escape);
1094 keywords.insert("EXISTS".to_string(), TokenType::Exists);
1095 keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
1096 keywords.insert("ALL".to_string(), TokenType::All);
1097 keywords.insert("WITH".to_string(), TokenType::With);
1098 keywords.insert("CREATE".to_string(), TokenType::Create);
1099 keywords.insert("DROP".to_string(), TokenType::Drop);
1100 keywords.insert("ALTER".to_string(), TokenType::Alter);
1101 keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
1102 keywords.insert("TABLE".to_string(), TokenType::Table);
1103 keywords.insert("VIEW".to_string(), TokenType::View);
1104 keywords.insert("INDEX".to_string(), TokenType::Index);
1105 keywords.insert("COLUMN".to_string(), TokenType::Column);
1106 keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
1107 keywords.insert("ADD".to_string(), TokenType::Add);
1108 keywords.insert("CASCADE".to_string(), TokenType::Cascade);
1109 keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
1110 keywords.insert("RENAME".to_string(), TokenType::Rename);
1111 keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
1112 keywords.insert("TEMP".to_string(), TokenType::Temporary);
1113 keywords.insert("UNIQUE".to_string(), TokenType::Unique);
1114 keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
1115 keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
1116 keywords.insert("KEY".to_string(), TokenType::Key);
1117 keywords.insert("KILL".to_string(), TokenType::Kill);
1118 keywords.insert("REFERENCES".to_string(), TokenType::References);
1119 keywords.insert("DEFAULT".to_string(), TokenType::Default);
1120 keywords.insert("DECLARE".to_string(), TokenType::Declare);
1121 keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
        keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement);
        keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
1124 keywords.insert("REPLACE".to_string(), TokenType::Replace);
1125 keywords.insert("TO".to_string(), TokenType::To);
1126 keywords.insert("INSERT".to_string(), TokenType::Insert);
1127 keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
1128 keywords.insert("UPDATE".to_string(), TokenType::Update);
1129 keywords.insert("USE".to_string(), TokenType::Use);
1130 keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
1131 keywords.insert("GLOB".to_string(), TokenType::Glob);
1132 keywords.insert("DELETE".to_string(), TokenType::Delete);
1133 keywords.insert("MERGE".to_string(), TokenType::Merge);
1134 keywords.insert("CACHE".to_string(), TokenType::Cache);
1135 keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
1136 keywords.insert("REFRESH".to_string(), TokenType::Refresh);
1137 keywords.insert("GRANT".to_string(), TokenType::Grant);
1138 keywords.insert("REVOKE".to_string(), TokenType::Revoke);
1139 keywords.insert("COMMENT".to_string(), TokenType::Comment);
1140 keywords.insert("COLLATE".to_string(), TokenType::Collate);
1141 keywords.insert("INTO".to_string(), TokenType::Into);
1142 keywords.insert("VALUES".to_string(), TokenType::Values);
1143 keywords.insert("SET".to_string(), TokenType::Set);
1144 keywords.insert("SETTINGS".to_string(), TokenType::Settings);
1145 keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
1146 keywords.insert("ASC".to_string(), TokenType::Asc);
1147 keywords.insert("DESC".to_string(), TokenType::Desc);
1148 keywords.insert("NULLS".to_string(), TokenType::Nulls);
1149 keywords.insert("RESPECT".to_string(), TokenType::Respect);
1150 keywords.insert("FIRST".to_string(), TokenType::First);
1151 keywords.insert("LAST".to_string(), TokenType::Last);
1152 keywords.insert("IF".to_string(), TokenType::If);
1153 keywords.insert("CAST".to_string(), TokenType::Cast);
1154 keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
1155 keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
1156 keywords.insert("OVER".to_string(), TokenType::Over);
1157 keywords.insert("PARTITION".to_string(), TokenType::Partition);
1158 keywords.insert("PLACING".to_string(), TokenType::Placing);
1159 keywords.insert("WINDOW".to_string(), TokenType::Window);
1160 keywords.insert("ROWS".to_string(), TokenType::Rows);
1161 keywords.insert("RANGE".to_string(), TokenType::Range);
1162 keywords.insert("FILTER".to_string(), TokenType::Filter);
1163 keywords.insert("NATURAL".to_string(), TokenType::Natural);
1164 keywords.insert("USING".to_string(), TokenType::Using);
1165 keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
1166 keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
1167 keywords.insert("FOLLOWING".to_string(), TokenType::Following);
1168 keywords.insert("CURRENT".to_string(), TokenType::Current);
1169 keywords.insert("ROW".to_string(), TokenType::Row);
1170 keywords.insert("GROUPS".to_string(), TokenType::Groups);
1171 keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
1172 keywords.insert("BOTH".to_string(), TokenType::Both);
1174 keywords.insert("LEADING".to_string(), TokenType::Leading);
1175 keywords.insert("TRAILING".to_string(), TokenType::Trailing);
1176 keywords.insert("INTERVAL".to_string(), TokenType::Interval);
1177 keywords.insert("TOP".to_string(), TokenType::Top);
1179 keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
1180 keywords.insert("SAMPLE".to_string(), TokenType::Sample);
1181 keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
1182 keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
1183 keywords.insert("SYSTEM".to_string(), TokenType::System);
1184 keywords.insert("BLOCK".to_string(), TokenType::Block);
1185 keywords.insert("SEED".to_string(), TokenType::Seed);
1186 keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
1187 keywords.insert("TIES".to_string(), TokenType::Ties);
1188 keywords.insert("LATERAL".to_string(), TokenType::Lateral);
1189 keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
1190 keywords.insert("APPLY".to_string(), TokenType::Apply);
1191 keywords.insert("CONNECT".to_string(), TokenType::Connect);
1193 keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
1195 keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
1196 keywords.insert("SORT".to_string(), TokenType::Sort);
1197 keywords.insert("PIVOT".to_string(), TokenType::Pivot);
1198 keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
1199 keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
1200 keywords.insert("FOR".to_string(), TokenType::For);
1201 keywords.insert("ANY".to_string(), TokenType::Any);
1202 keywords.insert("SOME".to_string(), TokenType::Some);
1203 keywords.insert("ASOF".to_string(), TokenType::AsOf);
1204 keywords.insert("PERCENT".to_string(), TokenType::Percent);
1205 keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
1206 keywords.insert("NO".to_string(), TokenType::No);
1207 keywords.insert("OTHERS".to_string(), TokenType::Others);
1208 keywords.insert("OPERATOR".to_string(), TokenType::Operator);
1210 keywords.insert("SCHEMA".to_string(), TokenType::Schema);
1212 keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
1213 keywords.insert("DATABASE".to_string(), TokenType::Database);
1214 keywords.insert("FUNCTION".to_string(), TokenType::Function);
1215 keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
1216 keywords.insert("PROC".to_string(), TokenType::Procedure);
1217 keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
1218 keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
1219 keywords.insert("TYPE".to_string(), TokenType::Type);
1220 keywords.insert("DOMAIN".to_string(), TokenType::Domain);
1221 keywords.insert("RETURNS".to_string(), TokenType::Returns);
1222 keywords.insert("RETURNING".to_string(), TokenType::Returning);
1223 keywords.insert("LANGUAGE".to_string(), TokenType::Language);
1224 keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
1225 keywords.insert("COMMIT".to_string(), TokenType::Commit);
1226 keywords.insert("BEGIN".to_string(), TokenType::Begin);
1227 keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
1228 keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
1229 keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
1230 keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
1231 keywords.insert("BODY".to_string(), TokenType::Body);
1232 keywords.insert("INCREMENT".to_string(), TokenType::Increment);
1233 keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
1234 keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
1235 keywords.insert("CYCLE".to_string(), TokenType::Cycle);
1236 keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
1237 keywords.insert("PRIOR".to_string(), TokenType::Prior);
1238 keywords.insert("MATCH".to_string(), TokenType::Match);
1240 keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
1241 keywords.insert("MEASURES".to_string(), TokenType::Measures);
1242 keywords.insert("PATTERN".to_string(), TokenType::Pattern);
1243 keywords.insert("DEFINE".to_string(), TokenType::Define);
1244 keywords.insert("RUNNING".to_string(), TokenType::Running);
1245 keywords.insert("FINAL".to_string(), TokenType::Final);
1246 keywords.insert("OWNED".to_string(), TokenType::Owned);
1247 keywords.insert("AFTER".to_string(), TokenType::After);
1248 keywords.insert("BEFORE".to_string(), TokenType::Before);
1249 keywords.insert("INSTEAD".to_string(), TokenType::Instead);
1250 keywords.insert("EACH".to_string(), TokenType::Each);
1251 keywords.insert("STATEMENT".to_string(), TokenType::Statement);
1252 keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
1253 keywords.insert("OLD".to_string(), TokenType::Old);
1254 keywords.insert("NEW".to_string(), TokenType::New);
1255 keywords.insert("OF".to_string(), TokenType::Of);
1256 keywords.insert("CHECK".to_string(), TokenType::Check);
1257 keywords.insert("START".to_string(), TokenType::Start);
1258 keywords.insert("ENUM".to_string(), TokenType::Enum);
1259 keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
1260 keywords.insert("RESTART".to_string(), TokenType::Restart);
1261 keywords.insert("DATE".to_string(), TokenType::Date);
1263 keywords.insert("TIME".to_string(), TokenType::Time);
1264 keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
1265 keywords.insert("DATETIME".to_string(), TokenType::DateTime);
1266 keywords.insert("GENERATED".to_string(), TokenType::Generated);
1267 keywords.insert("IDENTITY".to_string(), TokenType::Identity);
1268 keywords.insert("ALWAYS".to_string(), TokenType::Always);
1269 keywords.insert("LOAD".to_string(), TokenType::Load);
1271 keywords.insert("LOCAL".to_string(), TokenType::Local);
1272 keywords.insert("INPATH".to_string(), TokenType::Inpath);
1273 keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
1274 keywords.insert("SERDE".to_string(), TokenType::Serde);
1275 keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
1276 keywords.insert("FORMAT".to_string(), TokenType::Format);
1277 keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
1279 keywords.insert("SHOW".to_string(), TokenType::Show);
1281 keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
1283 keywords.insert("COPY".to_string(), TokenType::Copy);
1285 keywords.insert("PUT".to_string(), TokenType::Put);
1286 keywords.insert("GET".to_string(), TokenType::Get);
1287 keywords.insert("EXEC".to_string(), TokenType::Execute);
1289 keywords.insert("EXECUTE".to_string(), TokenType::Execute);
1290 keywords.insert("ISNULL".to_string(), TokenType::IsNull);
1292 keywords.insert("NOTNULL".to_string(), TokenType::NotNull);
1293
1294 let mut single_tokens = std::collections::HashMap::new();
1295 single_tokens.insert('(', TokenType::LParen);
1296 single_tokens.insert(')', TokenType::RParen);
1297 single_tokens.insert('[', TokenType::LBracket);
1298 single_tokens.insert(']', TokenType::RBracket);
1299 single_tokens.insert('{', TokenType::LBrace);
1300 single_tokens.insert('}', TokenType::RBrace);
1301 single_tokens.insert(',', TokenType::Comma);
1302 single_tokens.insert('.', TokenType::Dot);
1303 single_tokens.insert(';', TokenType::Semicolon);
1304 single_tokens.insert('+', TokenType::Plus);
1305 single_tokens.insert('-', TokenType::Dash);
1306 single_tokens.insert('*', TokenType::Star);
1307 single_tokens.insert('/', TokenType::Slash);
1308 single_tokens.insert('%', TokenType::Percent);
1309 single_tokens.insert('&', TokenType::Amp);
1310 single_tokens.insert('|', TokenType::Pipe);
1311 single_tokens.insert('^', TokenType::Caret);
1312 single_tokens.insert('~', TokenType::Tilde);
1313 single_tokens.insert('<', TokenType::Lt);
1314 single_tokens.insert('>', TokenType::Gt);
1315 single_tokens.insert('=', TokenType::Eq);
1316 single_tokens.insert('!', TokenType::Exclamation);
1317 single_tokens.insert(':', TokenType::Colon);
1318 single_tokens.insert('@', TokenType::DAt);
1319 single_tokens.insert('#', TokenType::Hash);
1320 single_tokens.insert('$', TokenType::Dollar);
1321 single_tokens.insert('?', TokenType::Parameter);
1322
1323 let mut quotes = std::collections::HashMap::new();
1324 quotes.insert("'".to_string(), "'".to_string());
1325 quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());
1327
1328 let mut identifiers = std::collections::HashMap::new();
1329 identifiers.insert('"', '"');
1330 identifiers.insert('`', '`');
1331 let mut comments = std::collections::HashMap::new();
1335 comments.insert("--".to_string(), None);
1336 comments.insert("/*".to_string(), Some("*/".to_string()));
1337
1338 Self {
1339 keywords,
1340 single_tokens,
1341 quotes,
1342 identifiers,
1343 comments,
1344 string_escapes: vec!['\''],
1347 nested_comments: true,
1348 escape_follow_chars: vec![],
1350 b_prefix_is_byte_string: false,
1352 numeric_literals: std::collections::HashMap::new(),
1353 identifiers_can_start_with_digit: false,
1354 hex_number_strings: false,
1355 hex_string_is_integer_type: false,
1356 string_escapes_allowed_in_raw_strings: true,
1359 hash_comments: false,
1360 dollar_sign_is_identifier: false,
1361 insert_format_raw_data: false,
1362 }
1363 }
1364}
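
// A hedged sketch of adapting the default configuration toward a MySQL-like
// dialect; the exact flag combination is illustrative, not a shipped preset:
//
//     let mut config = TokenizerConfig::default();
//     config.hash_comments = true;        // accept `# ...` line comments
//     config.string_escapes.push('\\');   // treat backslash as a string escape
//     let tokenizer = Tokenizer::new(config);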
1365
1366pub struct Tokenizer {
1368 config: TokenizerConfig,
1369}
1370
1371impl Tokenizer {
1372 pub fn new(config: TokenizerConfig) -> Self {
1374 Self { config }
1375 }
1376
1377 pub fn default_config() -> Self {
1379 Self::new(TokenizerConfig::default())
1380 }
1381
1382 pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
1384 let mut state = TokenizerState::new(sql, &self.config);
1385 state.tokenize()
1386 }
1387}
1388
1389impl Default for Tokenizer {
1390 fn default() -> Self {
1391 Self::default_config()
1392 }
1393}
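
// Minimal usage sketch (the token shapes are paraphrased, not a captured run):
//
//     let tokens = Tokenizer::default().tokenize("SELECT 1 FROM t")?;
//     // Roughly: [Select, Number("1"), From, <identifier-like token for "t">]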
1394
1395struct TokenizerState<'a> {
1397 source: &'a str,
1398 source_is_ascii: bool,
1399 chars: Vec<char>,
1400 size: usize,
1401 tokens: Vec<Token>,
1402 start: usize,
1403 current: usize,
1404 line: usize,
1405 column: usize,
1406 comments: Vec<String>,
1407 config: &'a TokenizerConfig,
1408}
1409
1410impl<'a> TokenizerState<'a> {
1411 fn new(sql: &'a str, config: &'a TokenizerConfig) -> Self {
1412 let chars: Vec<char> = sql.chars().collect();
1413 let size = chars.len();
1414 Self {
1415 source: sql,
1416 source_is_ascii: sql.is_ascii(),
1417 chars,
1418 size,
1419 tokens: Vec::new(),
1420 start: 0,
1421 current: 0,
1422 line: 1,
1423 column: 1,
1424 comments: Vec::new(),
1425 config,
1426 }
1427 }
1428
1429 fn tokenize(&mut self) -> Result<Vec<Token>> {
1430 while !self.is_at_end() {
1431 self.skip_whitespace();
1432 if self.is_at_end() {
1433 break;
1434 }
1435
1436 self.start = self.current;
1437 self.scan_token()?;
1438
1439 if self.config.insert_format_raw_data {
1442 if let Some(raw) = self.try_scan_insert_format_raw_data() {
1443 if !raw.is_empty() {
1444 self.start = self.current;
1445 self.add_token_with_text(TokenType::Var, raw);
1446 }
1447 }
1448 }
1449 }
1450
1451 if !self.comments.is_empty() {
1456 if let Some(last) = self.tokens.last_mut() {
1457 last.trailing_comments.extend(self.comments.drain(..));
1458 }
1459 }
1460
1461 Ok(std::mem::take(&mut self.tokens))
1462 }
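
    // Note on the comment buffers used above: comments that trail a token on
    // the same line are pushed straight onto that token's `trailing_comments`,
    // comments on their own line are staged in `self.comments`, and anything
    // still staged at end of input is flushed onto the last token.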
1463
1464 fn is_at_end(&self) -> bool {
1465 self.current >= self.size
1466 }
1467
1468 #[inline]
1469 fn text_from_range(&self, start: usize, end: usize) -> String {
1470 if self.source_is_ascii {
1471 self.source[start..end].to_string()
1472 } else {
1473 self.chars[start..end].iter().collect()
1474 }
1475 }
1476
1477 fn peek(&self) -> char {
1478 if self.is_at_end() {
1479 '\0'
1480 } else {
1481 self.chars[self.current]
1482 }
1483 }
1484
1485 fn peek_next(&self) -> char {
1486 if self.current + 1 >= self.size {
1487 '\0'
1488 } else {
1489 self.chars[self.current + 1]
1490 }
1491 }
1492
1493 fn advance(&mut self) -> char {
1494 let c = self.peek();
1495 self.current += 1;
1496 if c == '\n' {
1497 self.line += 1;
1498 self.column = 1;
1499 } else {
1500 self.column += 1;
1501 }
1502 c
1503 }
1504
1505 fn skip_whitespace(&mut self) {
1506 let mut saw_newline = false;
1511 while !self.is_at_end() {
1512 let c = self.peek();
1513 match c {
1514 ' ' | '\t' | '\r' => {
1515 self.advance();
1516 }
1517 '\n' => {
1518 saw_newline = true;
1519 self.advance();
1520 }
1521 '\u{00A0}' | '\u{2000}'..='\u{200B}' | '\u{3000}' | '\u{FEFF}' => {
1526 self.advance();
1527 }
1528 '-' if self.peek_next() == '-' => {
1529 self.scan_line_comment(saw_newline);
1530 saw_newline = true;
1532 }
1533 '/' if self.peek_next() == '/' && self.config.hash_comments => {
1534 self.scan_double_slash_comment();
1536 }
1537 '/' if self.peek_next() == '*' => {
1538 if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
1540 break;
1542 }
1543 if self.scan_block_comment(saw_newline).is_err() {
1544 return;
1545 }
1546 }
1548 '/' if self.peek_next() == '/' && self.config.comments.contains_key("//") => {
1549 let prev_non_ws = if self.current > 0 {
1553 let mut i = self.current - 1;
1554 while i > 0 && (self.chars[i] == ' ' || self.chars[i] == '\t') {
1555 i -= 1;
1556 }
1557 self.chars[i]
1558 } else {
1559 '\0'
1560 };
1561 if prev_non_ws == ':' || prev_non_ws == '/' {
1562 break;
1564 }
1565 self.scan_line_comment(saw_newline);
1566 saw_newline = true;
1568 }
1569 '#' if self.config.hash_comments => {
1570 self.scan_hash_line_comment();
1571 }
1572 _ => break,
1573 }
1574 }
1575 }
1576
1577 fn scan_hash_line_comment(&mut self) {
        self.advance(); // consume '#'
        let start = self.current;
1580 while !self.is_at_end() && self.peek() != '\n' {
1581 self.advance();
1582 }
1583 let comment = self.text_from_range(start, self.current);
1584 let comment_text = comment.trim().to_string();
1585 if let Some(last) = self.tokens.last_mut() {
1586 last.trailing_comments.push(comment_text);
1587 } else {
1588 self.comments.push(comment_text);
1589 }
1590 }
1591
1592 fn scan_double_slash_comment(&mut self) {
        self.advance(); // consume first '/'
        self.advance(); // consume second '/'
        let start = self.current;
1596 while !self.is_at_end() && self.peek() != '\n' {
1597 self.advance();
1598 }
1599 let comment = self.text_from_range(start, self.current);
1600 let comment_text = comment.trim().to_string();
1601 if let Some(last) = self.tokens.last_mut() {
1602 last.trailing_comments.push(comment_text);
1603 } else {
1604 self.comments.push(comment_text);
1605 }
1606 }
1607
1608 fn scan_line_comment(&mut self, after_newline: bool) {
        self.advance(); // consume first '-'
        self.advance(); // consume second '-'
        let start = self.current;
1612 while !self.is_at_end() && self.peek() != '\n' {
1613 self.advance();
1614 }
1615 let comment_text = self.text_from_range(start, self.current);
1616
1617 if after_newline || self.tokens.is_empty() {
1620 self.comments.push(comment_text);
1621 } else if let Some(last) = self.tokens.last_mut() {
1622 last.trailing_comments.push(comment_text);
1623 }
1624 }
1625
1626 fn scan_block_comment(&mut self, after_newline: bool) -> Result<()> {
        self.advance(); // consume '/'
        self.advance(); // consume '*'
        let content_start = self.current;
1630 let mut depth = 1;
1631
1632 while !self.is_at_end() && depth > 0 {
1633 if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
1634 self.advance();
1635 self.advance();
1636 depth += 1;
1637 } else if self.peek() == '*' && self.peek_next() == '/' {
1638 depth -= 1;
1639 if depth > 0 {
1640 self.advance();
1641 self.advance();
1642 }
1643 } else {
1644 self.advance();
1645 }
1646 }
1647
1648 if depth > 0 {
1649 return Err(Error::tokenize(
1650 "Unterminated block comment",
1651 self.line,
1652 self.column,
1653 self.start,
1654 self.current,
1655 ));
1656 }
1657
1658 let content = self.text_from_range(content_start, self.current);
        self.advance(); // consume '*'
        self.advance(); // consume '/'
        let comment_text = format!("/*{}*/", content);
1665
1666 if after_newline || self.tokens.is_empty() {
1669 self.comments.push(comment_text);
1670 } else if let Some(last) = self.tokens.last_mut() {
1671 last.trailing_comments.push(comment_text);
1672 }
1673
1674 Ok(())
1675 }
1676
1677 fn scan_hint(&mut self) -> Result<()> {
        self.advance(); // consume '/'
        self.advance(); // consume '*'
        self.advance(); // consume '+'
        let hint_start = self.current;
1683
1684 while !self.is_at_end() {
1686 if self.peek() == '*' && self.peek_next() == '/' {
1687 break;
1688 }
1689 self.advance();
1690 }
1691
1692 if self.is_at_end() {
1693 return Err(Error::tokenize(
1694 "Unterminated hint comment",
1695 self.line,
1696 self.column,
1697 self.start,
1698 self.current,
1699 ));
1700 }
1701
1702 let hint_text = self.text_from_range(hint_start, self.current);
        self.advance(); // consume '*'
        self.advance(); // consume '/'
        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());
1707
1708 Ok(())
1709 }
1710
1711 fn scan_positional_parameter(&mut self) -> Result<()> {
        self.advance(); // consume '$'
        let start = self.current;
1715
1716 while !self.is_at_end() && self.peek().is_ascii_digit() {
1717 self.advance();
1718 }
1719
1720 let number = self.text_from_range(start, self.current);
1721 self.add_token_with_text(TokenType::Parameter, number);
1722 Ok(())
1723 }
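
    // E.g. `$1` becomes a Parameter token whose text is "1": the leading `$`
    // is consumed and only the digits are kept.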
1724
1725 fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
        let saved_pos = self.current;
        let saved_line = self.line;
        let saved_column = self.column;
1731
        self.advance(); // consume the opening '$'
        let tag_start = self.current;
1738 while !self.is_at_end()
1739 && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
1740 {
1741 self.advance();
1742 }
1743 let tag = self.text_from_range(tag_start, self.current);
1744
1745 if self.is_at_end() || self.peek() != '$' {
            self.current = saved_pos;
            self.line = saved_line;
            self.column = saved_column;
            return Ok(None);
1750 }
        self.advance(); // consume the '$' that closes the tag
        let content_start = self.current;
1755 let closing_tag = format!("${}$", tag);
1756 let closing_chars: Vec<char> = closing_tag.chars().collect();
1757
1758 loop {
1759 if self.is_at_end() {
                self.current = saved_pos;
                self.line = saved_line;
                self.column = saved_column;
                return Ok(None);
1763 }
1764
1765 if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
1767 let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
1768 self.current + j < self.size && self.chars[self.current + j] == ch
1769 });
1770 if matches {
1771 let content = self.text_from_range(content_start, self.current);
1772 for _ in 0..closing_chars.len() {
1774 self.advance();
1775 }
1776 let token_text = format!("{}\x00{}", tag, content);
1778 self.add_token_with_text(TokenType::DollarString, token_text);
1779 return Ok(Some(()));
1780 }
1781 }
1782 self.advance();
1783 }
1784 }
1785
1786 fn scan_dollar_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume first '$'
        self.advance(); // consume second '$'
        let start = self.current;
1796 while !self.is_at_end() {
1797 if self.peek() == '$'
1798 && self.current + 1 < self.size
1799 && self.chars[self.current + 1] == '$'
1800 {
1801 break;
1802 }
1803 self.advance();
1804 }
1805
1806 let content = self.text_from_range(start, self.current);
1807
1808 if !self.is_at_end() {
            self.advance(); // consume first closing '$'
            self.advance(); // consume second closing '$'
        }
1812
1813 self.add_token_with_text(TokenType::DollarString, content);
1814 Ok(())
1815 }
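
    // Together with `try_scan_tagged_dollar_string`, dollar quoting yields
    // DollarString tokens whose text is either the raw body (`$$SELECT 1$$`
    // -> "SELECT 1") or the `tag\x00body` encoding for tagged forms
    // (`$fn$SELECT 1$fn$` -> "fn\x00SELECT 1"); `parse_dollar_string_token`
    // splits the latter back apart.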
1816
1817 fn scan_token(&mut self) -> Result<()> {
1818 let c = self.peek();
1819
1820 if c == '\'' {
1822 if self.config.quotes.contains_key("'''")
1824 && self.peek_next() == '\''
1825 && self.current + 2 < self.size
1826 && self.chars[self.current + 2] == '\''
1827 {
1828 return self.scan_triple_quoted_string('\'');
1829 }
1830 return self.scan_string();
1831 }
1832
1833 if c == '"'
1835 && self.config.quotes.contains_key("\"\"\"")
1836 && self.peek_next() == '"'
1837 && self.current + 2 < self.size
1838 && self.chars[self.current + 2] == '"'
1839 {
1840 return self.scan_triple_quoted_string('"');
1841 }
1842
1843 if c == '"'
1846 && self.config.quotes.contains_key("\"")
1847 && !self.config.identifiers.contains_key(&'"')
1848 {
1849 return self.scan_double_quoted_string();
1850 }
1851
1852 if let Some(&end_quote) = self.config.identifiers.get(&c) {
1854 return self.scan_quoted_identifier(end_quote);
1855 }
1856
1857 if c.is_ascii_digit() {
1859 return self.scan_number();
1860 }
1861
1862 if c == '.' && self.peek_next().is_ascii_digit() {
1869 let prev_char = if self.current > 0 {
1870 self.chars[self.current - 1]
1871 } else {
1872 '\0'
1873 };
1874 let is_after_ident = prev_char.is_alphanumeric()
1875 || prev_char == '_'
1876 || prev_char == '`'
1877 || prev_char == '"'
1878 || prev_char == ']'
1879 || prev_char == ')';
1880 if prev_char != '.' && !is_after_ident {
1881 return self.scan_number_starting_with_dot();
1882 }
1883 }
1884
1885 if c == '/'
1887 && self.peek_next() == '*'
1888 && self.current + 2 < self.size
1889 && self.chars[self.current + 2] == '+'
1890 {
1891 return self.scan_hint();
1892 }
1893
1894 if let Some(token_type) = self.try_scan_multi_char_operator() {
1896 self.add_token(token_type);
1897 return Ok(());
1898 }
1899
1900 if c == '$'
1903 && (self.peek_next().is_alphanumeric()
1904 || self.peek_next() == '_'
1905 || !self.peek_next().is_ascii())
1906 {
1907 if let Some(()) = self.try_scan_tagged_dollar_string()? {
1908 return Ok(());
1909 }
1910 if self.config.dollar_sign_is_identifier {
1913 return self.scan_dollar_identifier();
1914 }
1915 }
1916
1917 if c == '$' && self.peek_next() == '$' {
1919 return self.scan_dollar_quoted_string();
1920 }
1921
1922 if c == '$' && self.peek_next().is_ascii_digit() {
1924 return self.scan_positional_parameter();
1925 }
1926
1927 if c == '$' && self.config.dollar_sign_is_identifier {
1929 return self.scan_dollar_identifier();
1930 }
1931
1932 if (c == '#' || c == '@')
1935 && (self.peek_next().is_alphanumeric()
1936 || self.peek_next() == '_'
1937 || self.peek_next() == '#')
1938 {
1939 return self.scan_tsql_identifier();
1940 }
1941
1942 if let Some(&token_type) = self.config.single_tokens.get(&c) {
1944 self.advance();
1945 self.add_token(token_type);
1946 return Ok(());
1947 }
1948
1949 if c == '\u{2212}' {
1951 self.advance();
1952 self.add_token(TokenType::Dash);
1953 return Ok(());
1954 }
1955
1956 if c == '\u{2044}' {
1958 self.advance();
1959 self.add_token(TokenType::Slash);
1960 return Ok(());
1961 }
1962
1963 if c == '\u{2018}' || c == '\u{2019}' {
1965 return self.scan_unicode_quoted_string(c);
1967 }
1968 if c == '\u{201C}' || c == '\u{201D}' {
1969 return self.scan_unicode_quoted_identifier(c);
1971 }
1972
1973 self.scan_identifier_or_keyword()
1975 }
1976
1977 fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
1978 let c = self.peek();
1979 let next = self.peek_next();
1980 let third = if self.current + 2 < self.size {
1981 self.chars[self.current + 2]
1982 } else {
1983 '\0'
1984 };
1985
1986 if c == '-' && next == '|' && third == '-' {
1989 self.advance();
1990 self.advance();
1991 self.advance();
1992 return Some(TokenType::Adjacent);
1993 }
1994
1995 if c == '|' && next == '|' && third == '/' {
1997 self.advance();
1998 self.advance();
1999 self.advance();
2000 return Some(TokenType::DPipeSlash);
2001 }
2002
2003 if c == '#' && next == '>' && third == '>' {
2005 self.advance();
2006 self.advance();
2007 self.advance();
2008 return Some(TokenType::DHashArrow);
2009 }
2010
2011 if c == '-' && next == '>' && third == '>' {
2013 self.advance();
2014 self.advance();
2015 self.advance();
2016 return Some(TokenType::DArrow);
2017 }
2018
2019 if c == '<' && next == '=' && third == '>' {
2021 self.advance();
2022 self.advance();
2023 self.advance();
2024 return Some(TokenType::NullsafeEq);
2025 }
2026
2027 if c == '<' && next == '-' && third == '>' {
2029 self.advance();
2030 self.advance();
2031 self.advance();
2032 return Some(TokenType::LrArrow);
2033 }
2034
2035 if c == '<' && next == '@' {
2037 self.advance();
2038 self.advance();
2039 return Some(TokenType::LtAt);
2040 }
2041
2042 if c == '@' && next == '>' {
2044 self.advance();
2045 self.advance();
2046 return Some(TokenType::AtGt);
2047 }
2048
2049 if c == '~' && next == '~' && third == '~' {
2051 self.advance();
2052 self.advance();
2053 self.advance();
2054 return Some(TokenType::Glob);
2055 }
2056
2057 if c == '~' && next == '~' && third == '*' {
2059 self.advance();
2060 self.advance();
2061 self.advance();
2062 return Some(TokenType::ILike);
2063 }
2064
2065 let fourth = if self.current + 3 < self.size {
2067 self.chars[self.current + 3]
2068 } else {
2069 '\0'
2070 };
2071 if c == '!' && next == '~' && third == '~' && fourth == '*' {
2072 self.advance();
2073 self.advance();
2074 self.advance();
2075 self.advance();
2076 return Some(TokenType::NotILike);
2077 }
2078
2079 if c == '!' && next == '~' && third == '~' {
2081 self.advance();
2082 self.advance();
2083 self.advance();
2084 return Some(TokenType::NotLike);
2085 }
2086
2087 if c == '!' && next == '~' && third == '*' {
2089 self.advance();
2090 self.advance();
2091 self.advance();
2092 return Some(TokenType::NotIRLike);
2093 }
2094
2095 if c == '!' && next == ':' && third == '>' {
2097 self.advance();
2098 self.advance();
2099 self.advance();
2100 return Some(TokenType::NColonGt);
2101 }
2102
2103 if c == '?' && next == ':' && third == ':' {
2105 self.advance();
2106 self.advance();
2107 self.advance();
2108 return Some(TokenType::QDColon);
2109 }
2110
2111 if c == '!' && next == '~' {
2113 self.advance();
2114 self.advance();
2115 return Some(TokenType::NotRLike);
2116 }
2117
2118 if c == '~' && next == '~' {
2120 self.advance();
2121 self.advance();
2122 return Some(TokenType::Like);
2123 }
2124
2125 if c == '~' && next == '*' {
2127 self.advance();
2128 self.advance();
2129 return Some(TokenType::IRLike);
2130 }
2131
2132 if c == ':' && next == ':' && third == '$' {
2135 self.advance();
2136 self.advance();
2137 self.advance();
2138 return Some(TokenType::DColonDollar);
2139 }
2140 if c == ':' && next == ':' && third == '%' {
2141 self.advance();
2142 self.advance();
2143 self.advance();
2144 return Some(TokenType::DColonPercent);
2145 }
2146 if c == ':' && next == ':' && third == '?' {
2147 self.advance();
2148 self.advance();
2149 self.advance();
2150 return Some(TokenType::DColonQMark);
2151 }
2152
        let token_type = match (c, next) {
            ('.', ':') => Some(TokenType::DotColon),
            ('=', '=') => Some(TokenType::Eq),
            ('<', '=') => Some(TokenType::Lte),
            ('>', '=') => Some(TokenType::Gte),
            ('!', '=') => Some(TokenType::Neq),
            ('<', '>') => Some(TokenType::Neq),
            ('^', '=') => Some(TokenType::Neq),
            ('<', '<') => Some(TokenType::LtLt),
            ('>', '>') => Some(TokenType::GtGt),
            ('|', '|') => Some(TokenType::DPipe),
            ('|', '/') => Some(TokenType::PipeSlash),
            (':', ':') => Some(TokenType::DColon),
            (':', '=') => Some(TokenType::ColonEq),
            (':', '>') => Some(TokenType::ColonGt),
            ('-', '>') => Some(TokenType::Arrow),
            ('=', '>') => Some(TokenType::FArrow),
            ('&', '&') => Some(TokenType::DAmp),
            ('&', '<') => Some(TokenType::AmpLt),
            ('&', '>') => Some(TokenType::AmpGt),
            ('@', '@') => Some(TokenType::AtAt),
            ('?', '|') => Some(TokenType::QMarkPipe),
            ('?', '&') => Some(TokenType::QMarkAmp),
            ('?', '?') => Some(TokenType::DQMark),
            ('#', '>') => Some(TokenType::HashArrow),
            ('#', '-') => Some(TokenType::HashDash),
            ('^', '@') => Some(TokenType::CaretAt),
            ('*', '*') => Some(TokenType::DStar),
            ('|', '>') => Some(TokenType::PipeGt),
            _ => None,
        };
2185
2186 if token_type.is_some() {
2187 self.advance();
2188 self.advance();
2189 }
2190
2191 token_type
2192 }
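
    // Illustrative mappings handled above: `<=>` -> NullsafeEq, `->>` -> DArrow,
    // `||` -> DPipe, `::` -> DColon, `!~~*` -> NotILike, `|>` -> PipeGt.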
2193
2194 fn scan_string(&mut self) -> Result<()> {
        self.advance(); // consume the opening quote
        let mut value = String::new();
2197
2198 while !self.is_at_end() {
2199 let c = self.peek();
2200 if c == '\'' {
2201 if self.peek_next() == '\'' {
2202 value.push('\'');
2204 self.advance();
2205 self.advance();
2206 } else {
2207 break;
2208 }
2209 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
2213 let escaped = self.advance();
2214 match escaped {
2215 'n' => value.push('\n'),
2216 'r' => value.push('\r'),
2217 't' => value.push('\t'),
2218 '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // Ctrl-Z (0x1A)
                        'a' => value.push('\x07'), // bell
                        'b' => value.push('\x08'), // backspace
                        'f' => value.push('\x0C'), // form feed
                        'v' => value.push('\x0B'), // vertical tab
                        'x' => {
2225 let mut hex = String::with_capacity(2);
2227 for _ in 0..2 {
2228 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2229 hex.push(self.advance());
2230 }
2231 }
2232 if hex.len() == 2 {
2233 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2234 value.push(byte as char);
2235 } else {
2236 value.push('\\');
2237 value.push('x');
2238 value.push_str(&hex);
2239 }
2240 } else {
2241 value.push('\\');
2243 value.push('x');
2244 value.push_str(&hex);
2245 }
2246 }
2247 '\\' => value.push('\\'),
2248 '\'' => value.push('\''),
2249 '"' => value.push('"'),
2250 '%' => {
2251 value.push('%');
2253 }
2254 '_' => {
2255 value.push('_');
2257 }
2258 _ => {
2262 if !self.config.escape_follow_chars.is_empty() {
2263 value.push(escaped);
2265 } else {
2266 value.push('\\');
2268 value.push(escaped);
2269 }
2270 }
2271 }
2272 }
2273 } else {
2274 value.push(self.advance());
2275 }
2276 }
2277
2278 if self.is_at_end() {
2279 return Err(Error::tokenize(
2280 "Unterminated string",
2281 self.line,
2282 self.column,
2283 self.start,
2284 self.current,
2285 ));
2286 }
2287
        self.advance(); // consume the closing quote
        self.add_token_with_text(TokenType::String, value);
2290 Ok(())
2291 }
2292
2293 fn scan_double_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume the opening double quote
        let mut value = String::new();
2297
2298 while !self.is_at_end() {
2299 let c = self.peek();
2300 if c == '"' {
2301 if self.peek_next() == '"' {
2302 value.push('"');
2304 self.advance();
2305 self.advance();
2306 } else {
2307 break;
2308 }
2309 } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
2313 let escaped = self.advance();
2314 match escaped {
2315 'n' => value.push('\n'),
2316 'r' => value.push('\r'),
2317 't' => value.push('\t'),
2318 '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // Ctrl-Z (0x1A)
                        'a' => value.push('\x07'), // bell
                        'b' => value.push('\x08'), // backspace
                        'f' => value.push('\x0C'), // form feed
                        'v' => value.push('\x0B'), // vertical tab
                        'x' => {
2325 let mut hex = String::with_capacity(2);
2327 for _ in 0..2 {
2328 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2329 hex.push(self.advance());
2330 }
2331 }
2332 if hex.len() == 2 {
2333 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
2334 value.push(byte as char);
2335 } else {
2336 value.push('\\');
2337 value.push('x');
2338 value.push_str(&hex);
2339 }
2340 } else {
2341 value.push('\\');
2343 value.push('x');
2344 value.push_str(&hex);
2345 }
2346 }
2347 '\\' => value.push('\\'),
2348 '\'' => value.push('\''),
2349 '"' => value.push('"'),
2350 '%' => {
2351 value.push('%');
2353 }
2354 '_' => {
2355 value.push('_');
2357 }
2358 _ => {
2362 if !self.config.escape_follow_chars.is_empty() {
2363 value.push(escaped);
2365 } else {
2366 value.push('\\');
2368 value.push(escaped);
2369 }
2370 }
2371 }
2372 }
2373 } else {
2374 value.push(self.advance());
2375 }
2376 }
2377
2378 if self.is_at_end() {
2379 return Err(Error::tokenize(
2380 "Unterminated double-quoted string",
2381 self.line,
2382 self.column,
2383 self.start,
2384 self.current,
2385 ));
2386 }
2387
        self.advance(); // consume the closing double quote
        self.add_token_with_text(TokenType::String, value);
2390 Ok(())
2391 }
2392
2393 fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
2394 self.advance();
2396 self.advance();
2397 self.advance();
2398 let mut value = String::new();
2399
2400 while !self.is_at_end() {
2401 if self.peek() == quote_char
2403 && self.current + 1 < self.size
2404 && self.chars[self.current + 1] == quote_char
2405 && self.current + 2 < self.size
2406 && self.chars[self.current + 2] == quote_char
2407 {
2408 break;
2410 }
2411 value.push(self.advance());
2412 }
2413
2414 if self.is_at_end() {
2415 return Err(Error::tokenize(
2416 "Unterminated triple-quoted string",
2417 self.line,
2418 self.column,
2419 self.start,
2420 self.current,
2421 ));
2422 }
2423
        self.advance(); // consume the three closing quotes
        self.advance();
        self.advance();
2428 let token_type = if quote_char == '"' {
2429 TokenType::TripleDoubleQuotedString
2430 } else {
2431 TokenType::TripleSingleQuotedString
2432 };
2433 self.add_token_with_text(token_type, value);
2434 Ok(())
2435 }
2436
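    /// Scans an identifier delimited by `end_quote`, treating a doubled delimiter as an
    /// escaped quote character.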
2437 fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let mut value = String::new();
2440
2441 loop {
2442 if self.is_at_end() {
2443 return Err(Error::tokenize(
2444 "Unterminated identifier",
2445 self.line,
2446 self.column,
2447 self.start,
2448 self.current,
2449 ));
2450 }
2451 if self.peek() == end_quote {
2452 if self.peek_next() == end_quote {
2453 value.push(end_quote);
                    self.advance(); // skip the first of the doubled quotes
                    self.advance(); // skip the second
                } else {
2458 break;
2460 }
2461 } else {
2462 value.push(self.peek());
2463 self.advance();
2464 }
2465 }
2466
        self.advance(); // consume the closing quote
        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2469 Ok(())
2470 }
2471
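    /// Scans a string delimited by typographic (curly) single quotes and emits it as a
    /// String token.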
2472 fn scan_unicode_quoted_string(&mut self, open_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let start = self.current;
        // U+2018 (left single quotation mark) closes with U+2019 (right single quotation mark).
        let close_quote = if open_quote == '\u{2018}' {
            '\u{2019}'
        } else {
            '\u{2019}'
        };
        while !self.is_at_end() && self.peek() != close_quote {
            self.advance();
        }
        let value = self.text_from_range(start, self.current);
        if !self.is_at_end() {
            self.advance(); // consume the closing quote
        }
        self.add_token_with_text(TokenType::String, value);
2493 Ok(())
2494 }
2495
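    /// Scans an identifier delimited by typographic (curly) double quotes and emits it as a
    /// QuotedIdentifier token; a plain '"' also terminates it.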
2496 fn scan_unicode_quoted_identifier(&mut self, open_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let start = self.current;
        // U+201C (left double quotation mark) closes with U+201D (right double quotation mark).
        let close_quote = if open_quote == '\u{201C}' {
            '\u{201D}'
        } else {
            '\u{201D}'
        };
        while !self.is_at_end() && self.peek() != close_quote && self.peek() != '"' {
            self.advance();
        }
        let value = self.text_from_range(start, self.current);
        if !self.is_at_end() {
            self.advance(); // consume the closing quote
        }
        self.add_token_with_text(TokenType::QuotedIdentifier, value);
2514 Ok(())
2515 }
2516
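    /// Scans a numeric literal: integers, decimals, scientific notation, underscore digit
    /// separators, optional `0x` hex forms, and dialect-specific numeric type suffixes.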
2517 fn scan_number(&mut self) -> Result<()> {
        if self.config.hex_number_strings && !self.is_at_end() && self.peek() == '0' {
2520 let next = if self.current + 1 < self.size {
2521 self.chars[self.current + 1]
2522 } else {
2523 '\0'
2524 };
2525 if next == 'x' || next == 'X' {
2526 self.advance();
2528 self.advance();
2529 let hex_start = self.current;
2531 while !self.is_at_end() && (self.peek().is_ascii_hexdigit() || self.peek() == '_') {
2532 if self.peek() == '_' && !self.peek_next().is_ascii_hexdigit() {
2533 break;
2534 }
2535 self.advance();
2536 }
2537 if self.current > hex_start {
2538 let mut is_hex_float = false;
2540 if !self.is_at_end() && self.peek() == '.' {
2542 let after_dot = if self.current + 1 < self.size {
2543 self.chars[self.current + 1]
2544 } else {
2545 '\0'
2546 };
2547 if after_dot.is_ascii_hexdigit() {
2548 is_hex_float = true;
                            self.advance(); // consume '.'
                            while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
2551 self.advance();
2552 }
2553 }
2554 }
2555 if !self.is_at_end() && (self.peek() == 'p' || self.peek() == 'P') {
2557 is_hex_float = true;
                        self.advance(); // consume 'p' / 'P'
                        if !self.is_at_end() && (self.peek() == '+' || self.peek() == '-') {
2560 self.advance();
2561 }
2562 while !self.is_at_end() && self.peek().is_ascii_digit() {
2563 self.advance();
2564 }
2565 }
2566 if is_hex_float {
2567 let full_text = self.text_from_range(self.start, self.current);
2569 self.add_token_with_text(TokenType::Number, full_text);
2570 } else if self.config.hex_string_is_integer_type {
2571 let hex_value = self.text_from_range(hex_start, self.current);
2573 self.add_token_with_text(TokenType::HexNumber, hex_value);
2574 } else {
2575 let hex_value = self.text_from_range(hex_start, self.current);
2577 self.add_token_with_text(TokenType::HexString, hex_value);
2578 }
2579 return Ok(());
2580 }
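                // No hex digits followed the "0x" prefix; rewind to just after the '0'
                // and fall through to normal number scanning.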
2581 self.current = self.start + 1;
2584 }
2585 }
2586
2587 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2589 if self.peek() == '_' && (self.is_at_end() || !self.peek_next().is_ascii_digit()) {
2591 break;
2592 }
2593 self.advance();
2594 }
2595
2596 if self.peek() == '.' {
2600 let next = self.peek_next();
2601 if next != '.' {
                self.advance(); // consume '.'
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2610 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2611 break;
2612 }
2613 self.advance();
2614 }
2615 }
2616 }
2617
2618 if self.peek() == 'e' || self.peek() == 'E' {
2620 self.advance();
2621 if self.peek() == '+' || self.peek() == '-' {
2622 self.advance();
2623 }
2624 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2625 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2626 break;
2627 }
2628 self.advance();
2629 }
2630 }
2631
2632 let text = self.text_from_range(self.start, self.current);
2633
2634 if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
2636 let next_char = self.peek().to_uppercase().to_string();
2637 let suffix_match = if self.current + 1 < self.size {
                let two_char: String = [self.chars[self.current], self.chars[self.current + 1]]
                    .iter()
                    .collect::<String>()
                    .to_uppercase();
2643 if self.config.numeric_literals.contains_key(&two_char) {
2644 let after_suffix = if self.current + 2 < self.size {
2646 self.chars[self.current + 2]
2647 } else {
2648 ' '
2649 };
2650 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2651 Some((two_char, 2))
2652 } else {
2653 None
2654 }
2655 } else if self.config.numeric_literals.contains_key(&next_char) {
2656 let after_suffix = if self.current + 1 < self.size {
2658 self.chars[self.current + 1]
2659 } else {
2660 ' '
2661 };
2662 if !after_suffix.is_alphanumeric() && after_suffix != '_' {
2663 Some((next_char, 1))
2664 } else {
2665 None
2666 }
2667 } else {
2668 None
2669 }
2670 } else if self.config.numeric_literals.contains_key(&next_char) {
2671 Some((next_char, 1))
2673 } else {
2674 None
2675 };
2676
2677 if let Some((suffix, len)) = suffix_match {
2678 for _ in 0..len {
2680 self.advance();
2681 }
2682 let type_name = self
2685 .config
2686 .numeric_literals
2687 .get(&suffix)
2688 .expect("suffix verified by contains_key above")
2689 .clone();
2690 let combined = format!("{}::{}", text, type_name);
2691 self.add_token_with_text(TokenType::Number, combined);
2692 return Ok(());
2693 }
2694 }
2695
2696 if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
2699 let next = self.peek();
2700 if next.is_alphabetic() || next == '_' {
2701 while !self.is_at_end() {
2703 let ch = self.peek();
2704 if ch.is_alphanumeric() || ch == '_' {
2705 self.advance();
2706 } else {
2707 break;
2708 }
2709 }
2710 let ident_text = self.text_from_range(self.start, self.current);
2711 self.add_token_with_text(TokenType::Identifier, ident_text);
2712 return Ok(());
2713 }
2714 }
2715
2716 self.add_token_with_text(TokenType::Number, text);
2717 Ok(())
2718 }
2719
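    /// Scans a numeric literal that starts with a decimal point, e.g. `.25` or `.5e10`.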
2720 fn scan_number_starting_with_dot(&mut self) -> Result<()> {
2722 self.advance();
2724
2725 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2727 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2728 break;
2729 }
2730 self.advance();
2731 }
2732
2733 if self.peek() == 'e' || self.peek() == 'E' {
2735 self.advance();
2736 if self.peek() == '+' || self.peek() == '-' {
2737 self.advance();
2738 }
2739 while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
2740 if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
2741 break;
2742 }
2743 self.advance();
2744 }
2745 }
2746
2747 let text = self.text_from_range(self.start, self.current);
2748 self.add_token_with_text(TokenType::Number, text);
2749 Ok(())
2750 }
2751
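    /// Scans an identifier, mapping it to a keyword token when it appears in the dialect's
    /// keyword table, and dispatching prefixed literals such as N'...', E'...', X'...',
    /// B'...', R'...', and U&'...'.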
2752 fn scan_identifier_or_keyword(&mut self) -> Result<()> {
2753 let first_char = self.peek();
2755 if !first_char.is_alphanumeric() && first_char != '_' {
2756 let c = self.advance();
2758 return Err(Error::tokenize(
2759 format!("Unexpected character: '{}'", c),
2760 self.line,
2761 self.column,
2762 self.start,
2763 self.current,
2764 ));
2765 }
2766
2767 while !self.is_at_end() {
2768 let c = self.peek();
2769 if c == '#' {
2773 let next_c = if self.current + 1 < self.size {
2774 self.chars[self.current + 1]
2775 } else {
2776 '\0'
2777 };
                if next_c == '>' || next_c == '-' {
                    // Stop so that '#>' / '#-' can be tokenized as operators.
                    break;
                }
2781 self.advance();
2782 } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
2783 self.advance();
2784 } else {
2785 break;
2786 }
2787 }
2788
2789 let text = self.text_from_range(self.start, self.current);
2790 let upper = text.to_uppercase();
2791
2792 if upper == "NOT" && self.peek() == '=' {
            self.advance(); // consume '='
            self.add_token(TokenType::Neq);
2796 return Ok(());
2797 }
2798
2799 let next_char = self.peek();
2802 let is_single_quote = next_char == '\'';
2803 let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
2804 let is_double_quote_for_raw = next_char == '"';
2807
2808 if upper == "R" && (is_single_quote || is_double_quote_for_raw) {
2811 let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the opening quote
            if self.peek() == quote_char && self.peek_next() == quote_char {
                // Two more quotes follow: this is a triple-quoted raw string.
                self.advance();
                self.advance();
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
2822 self.add_token_with_text(TokenType::RawString, string_value);
2823 } else {
2824 let string_value = self.scan_raw_string_content(quote_char)?;
2825 self.add_token_with_text(TokenType::RawString, string_value);
2826 }
2827 return Ok(());
2828 }
2829
2830 if is_single_quote || is_double_quote {
2831 match upper.as_str() {
2832 "N" => {
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
2836 self.scan_string_content()?
2837 } else {
2838 self.scan_double_quoted_string_content()?
2839 };
2840 self.add_token_with_text(TokenType::NationalString, string_value);
2841 return Ok(());
2842 }
2843 "E" => {
2844 let lowercase = text == "e";
2848 let prefix = if lowercase { "e:" } else { "E:" };
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content_with_escapes(true)?;
2851 self.add_token_with_text(
2852 TokenType::EscapeString,
2853 format!("{}{}", prefix, string_value),
2854 );
2855 return Ok(());
2856 }
2857 "X" => {
                    self.advance(); // consume the opening quote
                    let string_value = if is_single_quote {
2861 self.scan_string_content()?
2862 } else {
2863 self.scan_double_quoted_string_content()?
2864 };
2865 self.add_token_with_text(TokenType::HexString, string_value);
2866 return Ok(());
2867 }
2868 "B" if is_double_quote => {
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_double_quoted_string_content()?;
2872 self.add_token_with_text(TokenType::ByteString, string_value);
2873 return Ok(());
2874 }
2875 "B" if is_single_quote => {
                    self.advance(); // consume the opening quote
                    let string_value = self.scan_string_content()?;
2880 if self.config.b_prefix_is_byte_string {
2881 self.add_token_with_text(TokenType::ByteString, string_value);
2882 } else {
2883 self.add_token_with_text(TokenType::BitString, string_value);
2884 }
2885 return Ok(());
2886 }
2887 _ => {}
2888 }
2889 }
2890
2891 if upper == "U"
2893 && self.peek() == '&'
2894 && self.current + 1 < self.size
2895 && self.chars[self.current + 1] == '\''
2896 {
            self.advance(); // consume '&'
            self.advance(); // consume the opening quote
            let string_value = self.scan_string_content()?;
2900 self.add_token_with_text(TokenType::UnicodeString, string_value);
2901 return Ok(());
2902 }
2903
2904 let token_type = self
2905 .config
2906 .keywords
2907 .get(&upper)
2908 .copied()
2909 .unwrap_or(TokenType::Var);
2910
2911 self.add_token_with_text(token_type, text);
2912 Ok(())
2913 }
2914
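    /// Consumes the body of a single-quoted string through its closing quote. Doubled quotes
    /// ('') collapse to a single quote; backslash escapes (when enabled) are passed through
    /// verbatim so an escaped quote does not terminate the string.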
2915 fn scan_string_content_with_escapes(
2919 &mut self,
2920 force_backslash_escapes: bool,
2921 ) -> Result<String> {
2922 let mut value = String::new();
2923 let use_backslash_escapes =
2924 force_backslash_escapes || self.config.string_escapes.contains(&'\\');
2925
2926 while !self.is_at_end() {
2927 let c = self.peek();
2928 if c == '\'' {
2929 if self.peek_next() == '\'' {
2930 value.push('\'');
2932 self.advance();
2933 self.advance();
2934 } else {
2935 break;
2936 }
2937 } else if c == '\\' && use_backslash_escapes {
2938 value.push(self.advance());
2940 if !self.is_at_end() {
2941 value.push(self.advance());
2942 }
2943 } else {
2944 value.push(self.advance());
2945 }
2946 }
2947
2948 if self.is_at_end() {
2949 return Err(Error::tokenize(
2950 "Unterminated string",
2951 self.line,
2952 self.column,
2953 self.start,
2954 self.current,
2955 ));
2956 }
2957
        self.advance(); // consume the closing quote
        Ok(value)
2960 }
2961
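    /// Consumes a single-quoted string body using the dialect's configured escape behavior.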
2962 fn scan_string_content(&mut self) -> Result<String> {
2964 self.scan_string_content_with_escapes(false)
2965 }
2966
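    /// Consumes the body of a double-quoted string through its closing quote, collapsing
    /// doubled quotes ("") and decoding backslash escapes when the dialect enables them.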
2967 fn scan_double_quoted_string_content(&mut self) -> Result<String> {
2970 let mut value = String::new();
2971 let use_backslash_escapes = self.config.string_escapes.contains(&'\\');
2972
2973 while !self.is_at_end() {
2974 let c = self.peek();
2975 if c == '"' {
2976 if self.peek_next() == '"' {
2977 value.push('"');
2979 self.advance();
2980 self.advance();
2981 } else {
2982 break;
2983 }
2984 } else if c == '\\' && use_backslash_escapes {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
2988 let escaped = self.advance();
2989 match escaped {
2990 'n' => value.push('\n'),
2991 'r' => value.push('\r'),
2992 't' => value.push('\t'),
2993 '0' => value.push('\0'),
2994 '\\' => value.push('\\'),
2995 '"' => value.push('"'),
2996 '\'' => value.push('\''),
2997 'x' => {
2998 let mut hex = String::new();
3000 for _ in 0..2 {
3001 if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
3002 hex.push(self.advance());
3003 }
3004 }
3005 if let Ok(byte) = u8::from_str_radix(&hex, 16) {
3006 value.push(byte as char);
3007 } else {
3008 value.push('\\');
3010 value.push('x');
3011 value.push_str(&hex);
3012 }
3013 }
3014 _ => {
3015 value.push('\\');
3017 value.push(escaped);
3018 }
3019 }
3020 }
3021 } else {
3022 value.push(self.advance());
3023 }
3024 }
3025
3026 if self.is_at_end() {
3027 return Err(Error::tokenize(
3028 "Unterminated double-quoted string",
3029 self.line,
3030 self.column,
3031 self.start,
3032 self.current,
3033 ));
3034 }
3035
        self.advance(); // consume the closing quote
        Ok(value)
3038 }
3039
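    /// Consumes the body of a raw string through its closing quote. Content is kept verbatim
    /// except for doubled quotes and, when the dialect allows it, backslash-escaped quotes.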
3040 fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
3045 let mut value = String::new();
3046
3047 while !self.is_at_end() {
3048 let c = self.peek();
3049 if c == quote_char {
3050 if self.peek_next() == quote_char {
3051 value.push(quote_char);
3053 self.advance();
3054 self.advance();
3055 } else {
3056 break;
3057 }
3058 } else if c == '\\'
3059 && self.peek_next() == quote_char
3060 && self.config.string_escapes_allowed_in_raw_strings
3061 {
3062 value.push(quote_char);
                self.advance(); // consume the backslash
                self.advance(); // consume the escaped quote
            } else {
3069 value.push(self.advance());
3071 }
3072 }
3073
3074 if self.is_at_end() {
3075 return Err(Error::tokenize(
3076 "Unterminated raw string",
3077 self.line,
3078 self.column,
3079 self.start,
3080 self.current,
3081 ));
3082 }
3083
        self.advance(); // consume the closing quote
        Ok(value)
3086 }
3087
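    /// Consumes the body of a raw triple-quoted string up to and including the closing run
    /// of three quote characters.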
3088 fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
3091 let mut value = String::new();
3092
3093 while !self.is_at_end() {
3094 let c = self.peek();
3095 if c == quote_char && self.peek_next() == quote_char {
3096 if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
                    self.advance(); // consume the three closing quotes
                    self.advance();
                    self.advance();
                    return Ok(value);
3103 }
3104 }
3105 let ch = self.advance();
3107 value.push(ch);
3108 }
3109
3110 Err(Error::tokenize(
3111 "Unterminated raw triple-quoted string",
3112 self.line,
3113 self.column,
3114 self.start,
3115 self.current,
3116 ))
3117 }
3118
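    /// Scans a '$'-prefixed identifier and emits it as a Var token.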
3119 fn scan_dollar_identifier(&mut self) -> Result<()> {
3124 self.advance();
3126
3127 while !self.is_at_end() {
3129 let c = self.peek();
3130 if c.is_alphanumeric() || c == '_' || c == '$' {
3131 self.advance();
3132 } else {
3133 break;
3134 }
3135 }
3136
3137 let text = self.text_from_range(self.start, self.current);
3138 self.add_token_with_text(TokenType::Var, text);
3139 Ok(())
3140 }
3141
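    /// Scans a T-SQL style identifier such as `@var`, `#temp`, or `##global_temp` and emits
    /// it as a Var token.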
3142 fn scan_tsql_identifier(&mut self) -> Result<()> {
3143 let first = self.advance();
3145
3146 if first == '#' && self.peek() == '#' {
3148 self.advance();
3149 }
3150
3151 while !self.is_at_end() {
3153 let c = self.peek();
3154 if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
3155 self.advance();
3156 } else {
3157 break;
3158 }
3159 }
3160
3161 let text = self.text_from_range(self.start, self.current);
3162 self.add_token_with_text(TokenType::Var, text);
3164 Ok(())
3165 }
3166
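    /// After `INSERT ... FORMAT <name>` (ClickHouse-style inline data), captures the raw data
    /// that follows, stopping at a blank line or the end of the input. Returns `None` when the
    /// preceding tokens do not match that pattern.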
3167 fn try_scan_insert_format_raw_data(&mut self) -> Option<String> {
3171 let len = self.tokens.len();
3172 if len < 3 {
3173 return None;
3174 }
3175
3176 let last = &self.tokens[len - 1];
3178 if last.text.eq_ignore_ascii_case("VALUES") {
3179 return None;
3180 }
3181 if !matches!(last.token_type, TokenType::Var | TokenType::Identifier) {
3182 return None;
3183 }
3184
3185 let format_tok = &self.tokens[len - 2];
3187 if !format_tok.text.eq_ignore_ascii_case("FORMAT") {
3188 return None;
3189 }
3190
3191 let has_insert = self.tokens[..len - 2]
3193 .iter()
3194 .rev()
3195 .take(20)
3196 .any(|t| t.token_type == TokenType::Insert);
3197 if !has_insert {
3198 return None;
3199 }
3200
3201 let raw_start = self.current;
3205 while !self.is_at_end() {
3206 let c = self.peek();
3207 if c == '\n' {
3208 let saved = self.current;
                self.advance(); // consume '\n'
                while !self.is_at_end() && self.peek() == '\r' {
3213 self.advance();
3214 }
3215 if self.is_at_end() || self.peek() == '\n' {
3216 let raw = self.text_from_range(raw_start, saved);
3219 return Some(raw.trim().to_string());
3220 }
3221 } else {
3223 self.advance();
3224 }
3225 }
3226
3227 let raw = self.text_from_range(raw_start, self.current);
3229 let trimmed = raw.trim().to_string();
3230 if trimmed.is_empty() {
3231 None
3232 } else {
3233 Some(trimmed)
3234 }
3235 }
3236
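    /// Emits a token whose text is the source slice from `self.start` to `self.current`.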
3237 fn add_token(&mut self, token_type: TokenType) {
3238 let text = self.text_from_range(self.start, self.current);
3239 self.add_token_with_text(token_type, text);
3240 }
3241
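    /// Emits a token with the given text, attaching any buffered leading comments to it.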
3242 fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
3243 let span = Span::new(self.start, self.current, self.line, self.column);
3244 let mut token = Token::new(token_type, text, span);
3245 token.comments.append(&mut self.comments);
3246 self.tokens.push(token);
3247 }
3248}
3249
3250#[cfg(test)]
3251mod tests {
3252 use super::*;
3253
3254 #[test]
3255 fn test_simple_select() {
3256 let tokenizer = Tokenizer::default();
3257 let tokens = tokenizer.tokenize("SELECT 1").unwrap();
3258
3259 assert_eq!(tokens.len(), 2);
3260 assert_eq!(tokens[0].token_type, TokenType::Select);
3261 assert_eq!(tokens[1].token_type, TokenType::Number);
3262 assert_eq!(tokens[1].text, "1");
3263 }
3264
3265 #[test]
3266 fn test_select_with_identifier() {
3267 let tokenizer = Tokenizer::default();
3268 let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();
3269
3270 assert_eq!(tokens.len(), 6);
3271 assert_eq!(tokens[0].token_type, TokenType::Select);
3272 assert_eq!(tokens[1].token_type, TokenType::Var);
3273 assert_eq!(tokens[1].text, "a");
3274 assert_eq!(tokens[2].token_type, TokenType::Comma);
3275 assert_eq!(tokens[3].token_type, TokenType::Var);
3276 assert_eq!(tokens[3].text, "b");
3277 assert_eq!(tokens[4].token_type, TokenType::From);
3278 assert_eq!(tokens[5].token_type, TokenType::Var);
3279 assert_eq!(tokens[5].text, "t");
3280 }
3281
3282 #[test]
3283 fn test_string_literal() {
3284 let tokenizer = Tokenizer::default();
3285 let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();
3286
3287 assert_eq!(tokens.len(), 2);
3288 assert_eq!(tokens[1].token_type, TokenType::String);
3289 assert_eq!(tokens[1].text, "hello");
3290 }
3291
3292 #[test]
3293 fn test_escaped_string() {
3294 let tokenizer = Tokenizer::default();
3295 let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();
3296
3297 assert_eq!(tokens.len(), 2);
3298 assert_eq!(tokens[1].token_type, TokenType::String);
3299 assert_eq!(tokens[1].text, "it's");
3300 }
3301
3302 #[test]
3303 fn test_comments() {
3304 let tokenizer = Tokenizer::default();
3305 let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();
3306
3307 assert_eq!(tokens.len(), 2);
3308 assert_eq!(tokens[0].trailing_comments.len(), 1);
3311 assert_eq!(tokens[0].trailing_comments[0], " comment");
3312 }
3313
3314 #[test]
3315 fn test_comment_in_and_chain() {
3316 use crate::generator::Generator;
3317 use crate::parser::Parser;
3318
3319 let sql = "SELECT a FROM b WHERE foo\n-- c1\nAND bar\n-- c2\nAND bla";
3321 let ast = Parser::parse_sql(sql).unwrap();
3322 let mut gen = Generator::default();
3323 let output = gen.generate(&ast[0]).unwrap();
3324 assert_eq!(
3325 output,
3326 "SELECT a FROM b WHERE foo AND /* c1 */ bar AND /* c2 */ bla"
3327 );
3328 }
3329
3330 #[test]
3331 fn test_operators() {
3332 let tokenizer = Tokenizer::default();
3333 let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();
3334
3335 assert_eq!(tokens.len(), 5);
3336 assert_eq!(tokens[0].token_type, TokenType::Number);
3337 assert_eq!(tokens[1].token_type, TokenType::Plus);
3338 assert_eq!(tokens[2].token_type, TokenType::Number);
3339 assert_eq!(tokens[3].token_type, TokenType::Star);
3340 assert_eq!(tokens[4].token_type, TokenType::Number);
3341 }
3342
3343 #[test]
3344 fn test_comparison_operators() {
3345 let tokenizer = Tokenizer::default();
3346 let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();
3347
3348 assert_eq!(tokens[1].token_type, TokenType::Lte);
3349 assert_eq!(tokens[3].token_type, TokenType::Gte);
3350 assert_eq!(tokens[5].token_type, TokenType::Neq);
3351 }
3352
3353 #[test]
3354 fn test_national_string() {
3355 let tokenizer = Tokenizer::default();
3356 let tokens = tokenizer.tokenize("N'abc'").unwrap();
3357
3358 assert_eq!(
3359 tokens.len(),
3360 1,
3361 "Expected 1 token for N'abc', got {:?}",
3362 tokens
3363 );
3364 assert_eq!(tokens[0].token_type, TokenType::NationalString);
3365 assert_eq!(tokens[0].text, "abc");
3366 }
3367
3368 #[test]
3369 fn test_hex_string() {
3370 let tokenizer = Tokenizer::default();
3371 let tokens = tokenizer.tokenize("X'ABCD'").unwrap();
3372
3373 assert_eq!(
3374 tokens.len(),
3375 1,
3376 "Expected 1 token for X'ABCD', got {:?}",
3377 tokens
3378 );
3379 assert_eq!(tokens[0].token_type, TokenType::HexString);
3380 assert_eq!(tokens[0].text, "ABCD");
3381 }
3382
3383 #[test]
3384 fn test_bit_string() {
3385 let tokenizer = Tokenizer::default();
3386 let tokens = tokenizer.tokenize("B'01010'").unwrap();
3387
3388 assert_eq!(
3389 tokens.len(),
3390 1,
3391 "Expected 1 token for B'01010', got {:?}",
3392 tokens
3393 );
3394 assert_eq!(tokens[0].token_type, TokenType::BitString);
3395 assert_eq!(tokens[0].text, "01010");
3396 }
3397
3398 #[test]
3399 fn test_trailing_dot_number() {
3400 let tokenizer = Tokenizer::default();
3401
3402 let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
3404 assert_eq!(
3405 tokens.len(),
3406 2,
3407 "Expected 2 tokens for 'SELECT 1.', got {:?}",
3408 tokens
3409 );
3410 assert_eq!(tokens[1].token_type, TokenType::Number);
3411 assert_eq!(tokens[1].text, "1.");
3412
3413 let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
3415 assert_eq!(tokens[1].text, "1.5");
3416
3417 let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
3420 assert_eq!(
3421 tokens.len(),
3422 3,
3423 "Expected 3 tokens for 'SELECT 1.a', got {:?}",
3424 tokens
3425 );
3426 assert_eq!(tokens[1].token_type, TokenType::Number);
3427 assert_eq!(tokens[1].text, "1.");
3428 assert_eq!(tokens[2].token_type, TokenType::Var);
3429
3430 let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
3432 assert_eq!(tokens[1].token_type, TokenType::Number);
3433 assert_eq!(tokens[1].text, "1");
3434 assert_eq!(tokens[2].token_type, TokenType::Dot);
3435 assert_eq!(tokens[3].token_type, TokenType::Dot);
3436 assert_eq!(tokens[4].token_type, TokenType::Number);
3437 assert_eq!(tokens[4].text, "2");
3438 }
3439
3440 #[test]
3441 fn test_leading_dot_number() {
3442 let tokenizer = Tokenizer::default();
3443
3444 let tokens = tokenizer.tokenize(".25").unwrap();
3446 assert_eq!(
3447 tokens.len(),
3448 1,
3449 "Expected 1 token for '.25', got {:?}",
3450 tokens
3451 );
3452 assert_eq!(tokens[0].token_type, TokenType::Number);
3453 assert_eq!(tokens[0].text, ".25");
3454
3455 let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
3457 assert_eq!(
3458 tokens.len(),
3459 4,
3460 "Expected 4 tokens for 'SAMPLE (.25)', got {:?}",
3461 tokens
3462 );
3463 assert_eq!(tokens[0].token_type, TokenType::Sample);
3464 assert_eq!(tokens[1].token_type, TokenType::LParen);
3465 assert_eq!(tokens[2].token_type, TokenType::Number);
3466 assert_eq!(tokens[2].text, ".25");
3467 assert_eq!(tokens[3].token_type, TokenType::RParen);
3468
3469 let tokens = tokenizer.tokenize(".5e10").unwrap();
3471 assert_eq!(
3472 tokens.len(),
3473 1,
3474 "Expected 1 token for '.5e10', got {:?}",
3475 tokens
3476 );
3477 assert_eq!(tokens[0].token_type, TokenType::Number);
3478 assert_eq!(tokens[0].text, ".5e10");
3479
3480 let tokens = tokenizer.tokenize("a.b").unwrap();
3482 assert_eq!(
3483 tokens.len(),
3484 3,
3485 "Expected 3 tokens for 'a.b', got {:?}",
3486 tokens
3487 );
3488 assert_eq!(tokens[1].token_type, TokenType::Dot);
3489 }
3490
3491 #[test]
3492 fn test_unrecognized_character() {
3493 let tokenizer = Tokenizer::default();
3494
3495 let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
3497 assert!(
3498 result.is_ok(),
3499 "Curly quotes should be tokenized as strings"
3500 );
3501
3502 let result = tokenizer.tokenize("SELECT • FROM t");
3504 assert!(result.is_err());
3505 }
3506
3507 #[test]
3508 fn test_colon_eq_tokenization() {
3509 let tokenizer = Tokenizer::default();
3510
3511 let tokens = tokenizer.tokenize("a := 1").unwrap();
3513 assert_eq!(tokens.len(), 3);
3514 assert_eq!(tokens[0].token_type, TokenType::Var);
3515 assert_eq!(tokens[1].token_type, TokenType::ColonEq);
3516 assert_eq!(tokens[2].token_type, TokenType::Number);
3517
3518 let tokens = tokenizer.tokenize("a:b").unwrap();
3520 assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
3521 assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));
3522
3523 let tokens = tokenizer.tokenize("a::INT").unwrap();
3525 assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
3526 }
3527
3528 #[test]
3529 fn test_colon_eq_parsing() {
3530 use crate::generator::Generator;
3531 use crate::parser::Parser;
3532
3533 let ast = Parser::parse_sql("SELECT @var1 := 1, @var2")
3535 .expect("Failed to parse MySQL @var := expr");
3536 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3537 assert_eq!(output, "SELECT @var1 := 1, @var2");
3538
3539 let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1")
3541 .expect("Failed to parse MySQL @var2 := @var1");
3542 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3543 assert_eq!(output, "SELECT @var1, @var2 := @var1");
3544
3545 let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1")
3547 .expect("Failed to parse MySQL @var := COUNT(*)");
3548 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3549 assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");
3550
3551 let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
3553 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3554 assert_eq!(output, "SET @var1 = 1");
3555
3556 let ast =
3558 Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
3559 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3560 assert_eq!(output, "UNION_VALUE(k1 := 1)");
3561
3562 let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t")
3564 .expect("Failed to parse UNNEST with :=");
3565 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3566 assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");
3567
3568 let ast =
3570 Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
3571 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3572 assert_eq!(output, "SELECT 1 AS foo");
3573
3574 let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3")
3576 .expect("Failed to parse DuckDB multiple prefix aliases");
3577 let output = Generator::sql(&ast[0]).expect("Failed to generate");
3578 assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
3579 }
3580
3581 #[test]
3582 fn test_colon_eq_dialect_roundtrip() {
3583 use crate::dialects::{Dialect, DialectType};
3584
3585 fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
3586 let d = Dialect::get(dialect);
3587 let ast = d
3588 .parse(sql)
3589 .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3590 assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3591 let transformed = d
3592 .transform(ast[0].clone())
3593 .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3594 let output = d
3595 .generate(&transformed)
3596 .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3597 let expected = expected.unwrap_or(sql);
3598 assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3599 }
3600
3601 check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
3603 check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
3604 check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
3605 check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));
3606
3607 check(
3609 DialectType::DuckDB,
3610 "SELECT UNNEST(col, recursive := TRUE) FROM t",
3611 None,
3612 );
3613 check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);
3614
3615 {
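            // ':=' as a named-argument separator combined with a '::json' cast;
            // only successful parsing is asserted here.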
3618 let d = Dialect::get(DialectType::DuckDB);
3619 let ast = d
3620 .parse("STRUCT_PACK(a := 'b')::json")
3621 .expect("Failed to parse STRUCT_PACK(a := 'b')::json");
3622 assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
3623 }
3624
3625 check(
3627 DialectType::DuckDB,
3628 "SELECT foo: 1",
3629 Some("SELECT 1 AS foo"),
3630 );
3631 check(
3632 DialectType::DuckDB,
3633 "SELECT foo: 1, bar: 2, baz: 3",
3634 Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"),
3635 );
3636 }
3637
3638 #[test]
3639 fn test_comment_roundtrip() {
3640 use crate::generator::Generator;
3641 use crate::parser::Parser;
3642
3643 fn check_roundtrip(sql: &str) -> Option<String> {
3644 let ast = match Parser::parse_sql(sql) {
3645 Ok(a) => a,
3646 Err(e) => return Some(format!("Parse error: {:?}", e)),
3647 };
3648 if ast.is_empty() {
3649 return Some("Empty AST".to_string());
3650 }
3651 let mut generator = Generator::default();
3652 let output = match generator.generate(&ast[0]) {
3653 Ok(o) => o,
3654 Err(e) => return Some(format!("Gen error: {:?}", e)),
3655 };
3656 if output == sql {
3657 None
3658 } else {
3659 Some(format!(
3660 "Mismatch:\n input: {}\n output: {}",
3661 sql, output
3662 ))
3663 }
3664 }
3665
3666 let tests = vec![
3667 "SELECT c /* c1 /* c2 */ c3 */",
3669 "SELECT c /* c1 /* c2 /* c3 */ */ */",
3670 "SELECT c /* c1 */ AS alias /* c2 */",
3672 "SELECT a /* x */, b /* x */",
3674 "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
3676 "SELECT * FROM foo /* x */, bla /* x */",
3678 "SELECT 1 /* comment */ + 1",
3680 "SELECT 1 /* c1 */ + 2 /* c2 */",
3681 "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
3682 "SELECT CAST(x AS INT) /* comment */ FROM foo",
3684 "SELECT FOO(x /* c */) /* FOO */, b /* b */",
3686 "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
3688 "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
3690 "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
3692 "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
3693 "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
3694 "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
3695 "/* comment */ CREATE TABLE foo AS SELECT 1",
3696 "INSERT INTO foo SELECT * FROM bar /* comment */",
3698 "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
3700 ];
3701
3702 let mut failures = Vec::new();
3703 for sql in tests {
3704 if let Some(e) = check_roundtrip(sql) {
3705 failures.push(e);
3706 }
3707 }
3708
3709 if !failures.is_empty() {
3710 panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
3711 }
3712 }
3713
3714 #[test]
3715 fn test_dollar_quoted_string_parsing() {
3716 use crate::dialects::{Dialect, DialectType};
3717
3718 let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
3720 assert_eq!(tag, Some("FOO".to_string()));
3721 assert_eq!(content, "content here");
3722
3723 let (tag, content) = super::parse_dollar_string_token("just content");
3724 assert_eq!(tag, None);
3725 assert_eq!(content, "just content");
3726
3727 fn check_databricks(sql: &str, expected: Option<&str>) {
3729 let d = Dialect::get(DialectType::Databricks);
3730 let ast = d
3731 .parse(sql)
3732 .unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
3733 assert!(!ast.is_empty(), "Empty AST for: {}", sql);
3734 let transformed = d
3735 .transform(ast[0].clone())
3736 .unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
3737 let output = d
3738 .generate(&transformed)
3739 .unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
3740 let expected = expected.unwrap_or(sql);
3741 assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
3742 }
3743
3744 check_databricks(
3746 "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n return x+1$$",
3747 None
3748 );
3749
3750 check_databricks(
3752 "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n return x+1$FOO$",
3753 None
3754 );
3755 }
3756}