use crate::error::{Error, Result};
use serde::{Deserialize, Serialize};
use std::fmt;

/// Splits the text of a [`TokenType::DollarString`] token into its optional
/// tag and its content. Tagged dollar strings are stored as `"tag\x00content"`;
/// untagged ones carry the content alone.
pub fn parse_dollar_string_token(text: &str) -> (Option<String>, String) {
    if let Some(pos) = text.find('\x00') {
        let tag = &text[..pos];
        let content = &text[pos + 1..];
        (Some(tag.to_string()), content.to_string())
    } else {
        (None, text.to_string())
    }
}
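
// A quick illustration of the round-trip (tag and content joined by NUL):
//
//     let (tag, content) = parse_dollar_string_token("tag\x00SELECT 1");
//     assert_eq!(tag.as_deref(), Some("tag"));
//     assert_eq!(content, "SELECT 1");
//
//     let (tag, content) = parse_dollar_string_token("no tag here");
//     assert_eq!(tag, None);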

/// The location of a token in the source text: character offsets plus the
/// 1-based line and column where the token begins.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct Span {
    /// Offset (in characters) where the token starts.
    pub start: usize,
    /// Offset (in characters) where the token ends.
    pub end: usize,
    /// 1-based line number.
    pub line: usize,
    /// 1-based column number.
    pub column: usize,
}

impl Span {
    pub fn new(start: usize, end: usize, line: usize, column: usize) -> Self {
        Self { start, end, line, column }
    }
}

/// A single lexical token together with any comments attached to it.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The kind of token.
    pub token_type: TokenType,
    /// The token's text (decoded for string-like tokens).
    pub text: String,
    /// Where the token appears in the source.
    pub span: Span,
    /// Comments attached before this token.
    #[serde(default)]
    pub comments: Vec<String>,
    /// Comments attached after this token.
    #[serde(default)]
    pub trailing_comments: Vec<String>,
}
58
59impl Token {
60 pub fn new(token_type: TokenType, text: impl Into<String>, span: Span) -> Self {
62 Self {
63 token_type,
64 text: text.into(),
65 span,
66 comments: Vec::new(),
67 trailing_comments: Vec::new(),
68 }
69 }
70
71 pub fn number(n: i64) -> Self {
73 Self::new(TokenType::Number, n.to_string(), Span::default())
74 }
75
76 pub fn string(s: impl Into<String>) -> Self {
78 Self::new(TokenType::String, s, Span::default())
79 }
80
81 pub fn identifier(s: impl Into<String>) -> Self {
83 Self::new(TokenType::Identifier, s, Span::default())
84 }
85
86 pub fn var(s: impl Into<String>) -> Self {
88 Self::new(TokenType::Var, s, Span::default())
89 }
90
91 pub fn with_comment(mut self, comment: impl Into<String>) -> Self {
93 self.comments.push(comment.into());
94 self
95 }
96}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}({})", self.token_type, self.text)
    }
}
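
// Illustrative usage of the convenience constructors:
//
//     let t = Token::identifier("users").with_comment("the users table");
//     assert_eq!(t.token_type, TokenType::Identifier);
//     assert_eq!(t.to_string(), "Identifier(users)");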

/// Every kind of token the tokenizer can produce.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[repr(u16)]
pub enum TokenType {
    // Punctuation and operators.
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Dash,
    Plus,
    Colon,
    DotColon,
    DColon,
    DColonDollar,
    DColonPercent,
    DColonQMark,
    DQMark,
    Semicolon,
    Star,
    Backslash,
    Slash,
    Lt,
    Lte,
    Gt,
    Gte,
    Not,
    Eq,
    Neq,
    NullsafeEq,
    ColonEq,
    ColonGt,
    NColonGt,
    And,
    Or,
    Amp,
    DPipe,
    PipeGt,
    Pipe,
    PipeSlash,
    DPipeSlash,
    Caret,
    CaretAt,
    LtLt,
    GtGt,
    Tilde,
    Arrow,
    DArrow,
    FArrow,
    Hash,
    HashArrow,
    DHashArrow,
    LrArrow,
    DAt,
    AtAt,
    LtAt,
    AtGt,
    Dollar,
    Parameter,
    Session,
    SessionParameter,
    SessionUser,
    DAmp,
    AmpLt,
    AmpGt,
    Adjacent,
    Xor,
    DStar,
    QMarkAmp,
    QMarkPipe,
    HashDash,
    Exclamation,

    // Structural tokens.
    UriStart,
    BlockStart,
    BlockEnd,
    Space,
    Break,

    // Comments, string literals, and identifiers.
    BlockComment,
    LineComment,
    String,
    DollarString,
    TripleDoubleQuotedString,
    TripleSingleQuotedString,
    Number,
    Identifier,
    QuotedIdentifier,
    Database,
    Column,
    ColumnDef,
    Schema,
    Table,
    Warehouse,
    Stage,
    Streamlit,
    Var,
    BitString,
    HexString,
    HexNumber,
    ByteString,
    NationalString,
    EscapeString,
    RawString,
    HeredocString,
    HeredocStringAlternative,
    UnicodeString,

    // Data types.
    Bit,
    Boolean,
    TinyInt,
    UTinyInt,
    SmallInt,
    USmallInt,
    MediumInt,
    UMediumInt,
    Int,
    UInt,
    BigInt,
    UBigInt,
    BigNum,
    Int128,
    UInt128,
    Int256,
    UInt256,
    Float,
    Double,
    UDouble,
    Decimal,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    DecFloat,
    UDecimal,
    BigDecimal,
    Char,
    NChar,
    VarChar,
    NVarChar,
    BpChar,
    Text,
    MediumText,
    LongText,
    Blob,
    MediumBlob,
    LongBlob,
    TinyBlob,
    TinyText,
    Name,
    Binary,
    VarBinary,
    Json,
    JsonB,
    Time,
    TimeTz,
    TimeNs,
    Timestamp,
    TimestampTz,
    TimestampLtz,
    TimestampNtz,
    TimestampS,
    TimestampMs,
    TimestampNs,
    DateTime,
    DateTime2,
    DateTime64,
    SmallDateTime,
    Date,
    Date32,
    Int4Range,
    Int4MultiRange,
    Int8Range,
    Int8MultiRange,
    NumRange,
    NumMultiRange,
    TsRange,
    TsMultiRange,
    TsTzRange,
    TsTzMultiRange,
    DateRange,
    DateMultiRange,
    Uuid,
    Geography,
    GeographyPoint,
    Nullable,
    Geometry,
    Point,
    Ring,
    LineString,
    LocalTime,
    LocalTimestamp,
    SysTimestamp,
    MultiLineString,
    Polygon,
    MultiPolygon,
    HllSketch,
    HStore,
    Super,
    Serial,
    SmallSerial,
    BigSerial,
    Xml,
    Year,
    UserDefined,
    Money,
    SmallMoney,
    RowVersion,
    Image,
    Variant,
    Object,
    Inet,
    IpAddress,
    IpPrefix,
    Ipv4,
    Ipv6,
    Enum,
    Enum8,
    Enum16,
    FixedString,
    LowCardinality,
    Nested,
    AggregateFunction,
    SimpleAggregateFunction,
    TDigest,
    Unknown,
    Vector,
    Dynamic,
    Void,

    // Keywords.
    Add,
    Alias,
    Alter,
    All,
    Anti,
    Any,
    Apply,
    Array,
    Asc,
    AsOf,
    Attach,
    AutoIncrement,
    Begin,
    Between,
    BulkCollectInto,
    Cache,
    Cascade,
    Case,
    CharacterSet,
    Cluster,
    ClusterBy,
    Collate,
    Command,
    Comment,
    Commit,
    Preserve,
    Connect,
    ConnectBy,
    Constraint,
    Copy,
    Create,
    Cross,
    Cube,
    CurrentDate,
    CurrentDateTime,
    CurrentSchema,
    CurrentTime,
    CurrentTimestamp,
    CurrentUser,
    CurrentRole,
    CurrentCatalog,
    Declare,
    Default,
    Delete,
    Desc,
    Describe,
    Detach,
    Dictionary,
    Distinct,
    Distribute,
    DistributeBy,
    Div,
    Drop,
    Else,
    End,
    Escape,
    Except,
    Execute,
    Exists,
    False,
    Fetch,
    File,
    FileFormat,
    Filter,
    Final,
    First,
    For,
    Force,
    ForeignKey,
    Format,
    From,
    Full,
    Function,
    Get,
    Glob,
    Global,
    Grant,
    GroupBy,
    GroupingSets,
    Having,
    Hint,
    Ignore,
    ILike,
    In,
    Index,
    IndexedBy,
    Inner,
    Input,
    Insert,
    Install,
    Intersect,
    Interval,
    Into,
    Inpath,
    InputFormat,
    Introducer,
    IRLike,
    Is,
    IsNull,
    Join,
    JoinMarker,
    Keep,
    Key,
    Kill,
    Lambda,
    Language,
    Lateral,
    Left,
    Like,
    NotLike,
    NotILike,
    NotRLike,
    NotIRLike,
    Limit,
    List,
    Load,
    Local,
    Lock,
    Map,
    Match,
    MatchCondition,
    MatchRecognize,
    MemberOf,
    Materialized,
    Merge,
    Mod,
    Model,
    Natural,
    Next,
    NoAction,
    Nothing,
    NotNull,
    Null,
    ObjectIdentifier,
    Offset,
    On,
    Only,
    Operator,
    OrderBy,
    OrderSiblingsBy,
    Ordered,
    Ordinality,
    Out,
    Outer,
    Output,
    Over,
    Overlaps,
    Overwrite,
    Partition,
    PartitionBy,
    Percent,
    Pivot,
    Placeholder,
    Positional,
    Pragma,
    Prewhere,
    PrimaryKey,
    Procedure,
    Properties,
    PseudoType,
    Put,
    Qualify,
    Quote,
    QDColon,
    Range,
    Recursive,
    Refresh,
    Rename,
    Replace,
    Returning,
    Revoke,
    References,
    Restrict,
    Right,
    RLike,
    Rollback,
    Rollup,
    Row,
    Rows,
    Select,
    Semi,
    Savepoint,
    Separator,
    Sequence,
    Serde,
    SerdeProperties,
    Set,
    Settings,
    Show,
    Siblings,
    SimilarTo,
    Some,
    Sort,
    SortBy,
    SoundsLike,
    StartWith,
    StorageIntegration,
    StraightJoin,
    Struct,
    Summarize,
    TableSample,
    Sample,
    Bernoulli,
    System,
    Block,
    Seed,
    Repeatable,
    Tag,
    Temporary,
    Transaction,
    To,
    Top,
    Then,
    True,
    Truncate,
    Uncache,
    Union,
    Unnest,
    Unpivot,
    Update,
    Use,
    Using,
    Values,
    View,
    SemanticView,
    Volatile,
    When,
    Where,
    Window,
    With,
    Ties,
    Exclude,
    No,
    Others,
    Unique,
    UtcDate,
    UtcTime,
    UtcTimestamp,
    VersionSnapshot,
    TimestampSnapshot,
    Option,
    Sink,
    Source,
    Analyze,
    Namespace,
    Export,
    As,
    By,
    Nulls,
    Respect,
    Last,
    If,
    Cast,
    TryCast,
    SafeCast,
    Count,
    Extract,
    Substring,
    Trim,
    Leading,
    Trailing,
    Both,
    Position,
    Overlaying,
    Placing,
    Treat,
    Within,
    Group,
    Order,

    // Window frame keywords.
    Unbounded,
    Preceding,
    Following,
    Current,
    Groups,

    // Sequence, trigger, and other DDL keywords.
    Trigger,
    Type,
    Domain,
    Returns,
    Body,
    Increment,
    Minvalue,
    Maxvalue,
    Start,
    Cycle,
    NoCycle,
    Prior,
    Generated,
    Identity,
    Always,
    Measures,
    Pattern,
    Define,
    Running,
    Owned,
    After,
    Before,
    Instead,
    Each,
    Statement,
    Referencing,
    Old,
    New,
    Of,
    Check,
    Authorization,
    Restart,

    // End of input.
    Eof,
}

impl TokenType {
    /// Returns `true` if this token type is treated as a keyword.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            TokenType::Select
                | TokenType::From
                | TokenType::Where
                | TokenType::And
                | TokenType::Or
                | TokenType::Not
                | TokenType::In
                | TokenType::Is
                | TokenType::Null
                | TokenType::True
                | TokenType::False
                | TokenType::As
                | TokenType::On
                | TokenType::Join
                | TokenType::Left
                | TokenType::Right
                | TokenType::Inner
                | TokenType::Outer
                | TokenType::Full
                | TokenType::Cross
                | TokenType::Semi
                | TokenType::Anti
                | TokenType::Union
                | TokenType::Except
                | TokenType::Intersect
                | TokenType::GroupBy
                | TokenType::OrderBy
                | TokenType::Having
                | TokenType::Limit
                | TokenType::Offset
                | TokenType::Case
                | TokenType::When
                | TokenType::Then
                | TokenType::Else
                | TokenType::End
                | TokenType::Create
                | TokenType::Drop
                | TokenType::Alter
                | TokenType::Insert
                | TokenType::Update
                | TokenType::Delete
                | TokenType::Into
                | TokenType::Values
                | TokenType::Set
                | TokenType::With
                | TokenType::Distinct
                | TokenType::All
                | TokenType::Exists
                | TokenType::Between
                | TokenType::Like
                | TokenType::ILike
                | TokenType::Filter
                | TokenType::Date
                | TokenType::Timestamp
                | TokenType::TimestampTz
                | TokenType::Interval
                | TokenType::Time
                | TokenType::Table
                | TokenType::Index
                | TokenType::Column
                | TokenType::Database
                | TokenType::Schema
                | TokenType::View
                | TokenType::Function
                | TokenType::Procedure
                | TokenType::Trigger
                | TokenType::Sequence
                | TokenType::Over
                | TokenType::Partition
                | TokenType::Window
                | TokenType::Rows
                | TokenType::Range
                | TokenType::First
                | TokenType::Last
                | TokenType::Preceding
                | TokenType::Following
                | TokenType::Current
                | TokenType::Row
                | TokenType::Unbounded
                | TokenType::Array
                | TokenType::Struct
                | TokenType::Map
                | TokenType::PrimaryKey
                | TokenType::Key
                | TokenType::ForeignKey
                | TokenType::References
                | TokenType::Unique
                | TokenType::Check
                | TokenType::Default
                | TokenType::Constraint
                | TokenType::Comment
                | TokenType::Rollup
                | TokenType::Cube
                | TokenType::Grant
                | TokenType::Revoke
                | TokenType::Type
                | TokenType::Use
                | TokenType::Cache
                | TokenType::Uncache
                | TokenType::Load
                | TokenType::Any
                | TokenType::Some
                | TokenType::Asc
                | TokenType::Desc
                | TokenType::Nulls
                | TokenType::Lateral
                | TokenType::Natural
                | TokenType::Escape
                | TokenType::Glob
                | TokenType::Match
                | TokenType::Recursive
                | TokenType::Replace
                | TokenType::Returns
                | TokenType::If
                | TokenType::Pivot
                | TokenType::Unpivot
                | TokenType::Json
                | TokenType::Blob
                | TokenType::Text
                | TokenType::Int
                | TokenType::BigInt
                | TokenType::SmallInt
                | TokenType::TinyInt
                | TokenType::Int128
                | TokenType::UInt128
                | TokenType::Int256
                | TokenType::UInt256
                | TokenType::UInt
                | TokenType::UBigInt
                | TokenType::Float
                | TokenType::Double
                | TokenType::Decimal
                | TokenType::Boolean
                | TokenType::VarChar
                | TokenType::Char
                | TokenType::Binary
                | TokenType::VarBinary
                | TokenType::No
                | TokenType::DateTime
                | TokenType::Truncate
                | TokenType::Execute
                | TokenType::Merge
                | TokenType::Top
                | TokenType::Begin
                | TokenType::Generated
                | TokenType::Identity
                | TokenType::Always
                | TokenType::Extract
                | TokenType::AsOf
                | TokenType::Prior
                | TokenType::After
                | TokenType::Restrict
                | TokenType::Cascade
                | TokenType::Local
                | TokenType::Rename
                | TokenType::Enum
                | TokenType::Within
                | TokenType::Format
                | TokenType::Final
                | TokenType::FileFormat
                | TokenType::Input
                | TokenType::InputFormat
                | TokenType::Copy
                | TokenType::Put
                | TokenType::Get
                | TokenType::Show
                | TokenType::Serde
                | TokenType::Sample
                | TokenType::Sort
                | TokenType::Collate
                | TokenType::Ties
                | TokenType::IsNull
                | TokenType::NotNull
                | TokenType::Exclude
                | TokenType::Temporary
                | TokenType::Add
                | TokenType::Ordinality
                | TokenType::Overlaps
                | TokenType::Block
                | TokenType::Pattern
                | TokenType::Group
                | TokenType::Cluster
                | TokenType::Repeatable
                | TokenType::Groups
                | TokenType::Commit
                | TokenType::Warehouse
                | TokenType::System
                | TokenType::By
                | TokenType::To
                | TokenType::Fetch
                | TokenType::For
                | TokenType::Only
                | TokenType::Next
                | TokenType::Lock
                | TokenType::Refresh
                | TokenType::Settings
                | TokenType::Operator
                | TokenType::Overwrite
                | TokenType::StraightJoin
                | TokenType::Start
        )
    }

    /// Returns `true` if this token type is a comparison operator.
    pub fn is_comparison(&self) -> bool {
        matches!(
            self,
            TokenType::Eq
                | TokenType::Neq
                | TokenType::Lt
                | TokenType::Lte
                | TokenType::Gt
                | TokenType::Gte
                | TokenType::NullsafeEq
        )
    }

    /// Returns `true` if this token type is an arithmetic operator.
    pub fn is_arithmetic(&self) -> bool {
        matches!(
            self,
            TokenType::Plus
                | TokenType::Dash
                | TokenType::Star
                | TokenType::Slash
                | TokenType::Percent
                | TokenType::Mod
                | TokenType::Div
        )
    }
}

impl fmt::Display for TokenType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self)
    }
}
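
// Illustrative: the classification helpers drive parser decisions.
//
//     assert!(TokenType::Select.is_keyword());
//     assert!(TokenType::Lte.is_comparison());
//     assert!(TokenType::Star.is_arithmetic());
//     assert!(!TokenType::Comma.is_comparison());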

/// Dialect-specific tokenizer settings.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Keyword text (uppercased) mapped to its token type.
    pub keywords: std::collections::HashMap<String, TokenType>,
    /// Single-character tokens.
    pub single_tokens: std::collections::HashMap<char, TokenType>,
    /// String quote delimiters, opening mapped to closing.
    pub quotes: std::collections::HashMap<String, String>,
    /// Identifier quote characters, opening mapped to closing.
    pub identifiers: std::collections::HashMap<char, char>,
    /// Comment delimiters; `None` marks a line comment.
    pub comments: std::collections::HashMap<String, Option<String>>,
    /// Characters that act as escapes inside strings.
    pub string_escapes: Vec<char>,
    /// Whether block comments may nest.
    pub nested_comments: bool,
    /// Characters allowed to follow an escape character.
    pub escape_follow_chars: Vec<char>,
    /// Whether `b'...'` denotes a byte string rather than a bit string.
    pub b_prefix_is_byte_string: bool,
    /// Numeric literal suffixes mapped to type names.
    pub numeric_literals: std::collections::HashMap<String, String>,
    /// Whether identifiers may start with a digit.
    pub identifiers_can_start_with_digit: bool,
    /// Whether `0x...` hex literals are recognized.
    pub hex_number_strings: bool,
    /// Whether hex literals are integers (`HexNumber`) rather than strings.
    pub hex_string_is_integer_type: bool,
    /// Whether escapes are honored inside raw strings.
    pub string_escapes_allowed_in_raw_strings: bool,
}

impl Default for TokenizerConfig {
    fn default() -> Self {
        let mut keywords = std::collections::HashMap::new();
        keywords.insert("SELECT".to_string(), TokenType::Select);
        keywords.insert("FROM".to_string(), TokenType::From);
        keywords.insert("WHERE".to_string(), TokenType::Where);
        keywords.insert("AND".to_string(), TokenType::And);
        keywords.insert("OR".to_string(), TokenType::Or);
        keywords.insert("NOT".to_string(), TokenType::Not);
        keywords.insert("AS".to_string(), TokenType::As);
        keywords.insert("ON".to_string(), TokenType::On);
        keywords.insert("JOIN".to_string(), TokenType::Join);
        keywords.insert("LEFT".to_string(), TokenType::Left);
        keywords.insert("RIGHT".to_string(), TokenType::Right);
        keywords.insert("INNER".to_string(), TokenType::Inner);
        keywords.insert("OUTER".to_string(), TokenType::Outer);
        keywords.insert("OUTPUT".to_string(), TokenType::Output);
        keywords.insert("FULL".to_string(), TokenType::Full);
        keywords.insert("CROSS".to_string(), TokenType::Cross);
        keywords.insert("SEMI".to_string(), TokenType::Semi);
        keywords.insert("ANTI".to_string(), TokenType::Anti);
        keywords.insert("STRAIGHT_JOIN".to_string(), TokenType::StraightJoin);
        keywords.insert("UNION".to_string(), TokenType::Union);
        keywords.insert("EXCEPT".to_string(), TokenType::Except);
        keywords.insert("MINUS".to_string(), TokenType::Except);
        keywords.insert("INTERSECT".to_string(), TokenType::Intersect);
        keywords.insert("GROUP".to_string(), TokenType::Group);
        keywords.insert("CUBE".to_string(), TokenType::Cube);
        keywords.insert("ROLLUP".to_string(), TokenType::Rollup);
        keywords.insert("WITHIN".to_string(), TokenType::Within);
        keywords.insert("ORDER".to_string(), TokenType::Order);
        keywords.insert("BY".to_string(), TokenType::By);
        keywords.insert("HAVING".to_string(), TokenType::Having);
        keywords.insert("LIMIT".to_string(), TokenType::Limit);
        keywords.insert("OFFSET".to_string(), TokenType::Offset);
        keywords.insert("ORDINALITY".to_string(), TokenType::Ordinality);
        keywords.insert("FETCH".to_string(), TokenType::Fetch);
        keywords.insert("FIRST".to_string(), TokenType::First);
        keywords.insert("NEXT".to_string(), TokenType::Next);
        keywords.insert("ONLY".to_string(), TokenType::Only);
        keywords.insert("KEEP".to_string(), TokenType::Keep);
        keywords.insert("IGNORE".to_string(), TokenType::Ignore);
        keywords.insert("INPUT".to_string(), TokenType::Input);
        keywords.insert("CASE".to_string(), TokenType::Case);
        keywords.insert("WHEN".to_string(), TokenType::When);
        keywords.insert("THEN".to_string(), TokenType::Then);
        keywords.insert("ELSE".to_string(), TokenType::Else);
        keywords.insert("END".to_string(), TokenType::End);
        keywords.insert("ENDIF".to_string(), TokenType::End);
        keywords.insert("NULL".to_string(), TokenType::Null);
        keywords.insert("TRUE".to_string(), TokenType::True);
        keywords.insert("FALSE".to_string(), TokenType::False);
        keywords.insert("IS".to_string(), TokenType::Is);
        keywords.insert("IN".to_string(), TokenType::In);
        keywords.insert("BETWEEN".to_string(), TokenType::Between);
        keywords.insert("OVERLAPS".to_string(), TokenType::Overlaps);
        keywords.insert("LIKE".to_string(), TokenType::Like);
        keywords.insert("ILIKE".to_string(), TokenType::ILike);
        keywords.insert("RLIKE".to_string(), TokenType::RLike);
        keywords.insert("REGEXP".to_string(), TokenType::RLike);
        keywords.insert("ESCAPE".to_string(), TokenType::Escape);
        keywords.insert("EXISTS".to_string(), TokenType::Exists);
        keywords.insert("DISTINCT".to_string(), TokenType::Distinct);
        keywords.insert("ALL".to_string(), TokenType::All);
        keywords.insert("WITH".to_string(), TokenType::With);
        keywords.insert("CREATE".to_string(), TokenType::Create);
        keywords.insert("DROP".to_string(), TokenType::Drop);
        keywords.insert("ALTER".to_string(), TokenType::Alter);
        keywords.insert("TRUNCATE".to_string(), TokenType::Truncate);
        keywords.insert("TABLE".to_string(), TokenType::Table);
        keywords.insert("VIEW".to_string(), TokenType::View);
        keywords.insert("INDEX".to_string(), TokenType::Index);
        keywords.insert("COLUMN".to_string(), TokenType::Column);
        keywords.insert("CONSTRAINT".to_string(), TokenType::Constraint);
        keywords.insert("ADD".to_string(), TokenType::Add);
        keywords.insert("CASCADE".to_string(), TokenType::Cascade);
        keywords.insert("RESTRICT".to_string(), TokenType::Restrict);
        keywords.insert("RENAME".to_string(), TokenType::Rename);
        keywords.insert("TEMPORARY".to_string(), TokenType::Temporary);
        keywords.insert("TEMP".to_string(), TokenType::Temporary);
        keywords.insert("UNIQUE".to_string(), TokenType::Unique);
        keywords.insert("PRIMARY".to_string(), TokenType::PrimaryKey);
        keywords.insert("FOREIGN".to_string(), TokenType::ForeignKey);
        keywords.insert("KEY".to_string(), TokenType::Key);
        keywords.insert("KILL".to_string(), TokenType::Kill);
        keywords.insert("REFERENCES".to_string(), TokenType::References);
        keywords.insert("DEFAULT".to_string(), TokenType::Default);
        keywords.insert("DECLARE".to_string(), TokenType::Declare);
        keywords.insert("AUTO_INCREMENT".to_string(), TokenType::AutoIncrement);
        keywords.insert("AUTOINCREMENT".to_string(), TokenType::AutoIncrement);
        keywords.insert("MATERIALIZED".to_string(), TokenType::Materialized);
        keywords.insert("REPLACE".to_string(), TokenType::Replace);
        keywords.insert("TO".to_string(), TokenType::To);
        keywords.insert("INSERT".to_string(), TokenType::Insert);
        keywords.insert("OVERWRITE".to_string(), TokenType::Overwrite);
        keywords.insert("UPDATE".to_string(), TokenType::Update);
        keywords.insert("USE".to_string(), TokenType::Use);
        keywords.insert("WAREHOUSE".to_string(), TokenType::Warehouse);
        keywords.insert("GLOB".to_string(), TokenType::Glob);
        keywords.insert("DELETE".to_string(), TokenType::Delete);
        keywords.insert("MERGE".to_string(), TokenType::Merge);
        keywords.insert("CACHE".to_string(), TokenType::Cache);
        keywords.insert("UNCACHE".to_string(), TokenType::Uncache);
        keywords.insert("REFRESH".to_string(), TokenType::Refresh);
        keywords.insert("GRANT".to_string(), TokenType::Grant);
        keywords.insert("REVOKE".to_string(), TokenType::Revoke);
        keywords.insert("COMMENT".to_string(), TokenType::Comment);
        keywords.insert("COLLATE".to_string(), TokenType::Collate);
        keywords.insert("INTO".to_string(), TokenType::Into);
        keywords.insert("VALUES".to_string(), TokenType::Values);
        keywords.insert("SET".to_string(), TokenType::Set);
        keywords.insert("SETTINGS".to_string(), TokenType::Settings);
        keywords.insert("SEPARATOR".to_string(), TokenType::Separator);
        keywords.insert("ASC".to_string(), TokenType::Asc);
        keywords.insert("DESC".to_string(), TokenType::Desc);
        keywords.insert("NULLS".to_string(), TokenType::Nulls);
        keywords.insert("RESPECT".to_string(), TokenType::Respect);
        keywords.insert("FIRST".to_string(), TokenType::First);
        keywords.insert("LAST".to_string(), TokenType::Last);
        keywords.insert("IF".to_string(), TokenType::If);
        keywords.insert("CAST".to_string(), TokenType::Cast);
        keywords.insert("TRY_CAST".to_string(), TokenType::TryCast);
        keywords.insert("SAFE_CAST".to_string(), TokenType::SafeCast);
        keywords.insert("OVER".to_string(), TokenType::Over);
        keywords.insert("PARTITION".to_string(), TokenType::Partition);
        keywords.insert("PLACING".to_string(), TokenType::Placing);
        keywords.insert("WINDOW".to_string(), TokenType::Window);
        keywords.insert("ROWS".to_string(), TokenType::Rows);
        keywords.insert("RANGE".to_string(), TokenType::Range);
        keywords.insert("FILTER".to_string(), TokenType::Filter);
        keywords.insert("NATURAL".to_string(), TokenType::Natural);
        keywords.insert("USING".to_string(), TokenType::Using);
        keywords.insert("UNBOUNDED".to_string(), TokenType::Unbounded);
        keywords.insert("PRECEDING".to_string(), TokenType::Preceding);
        keywords.insert("FOLLOWING".to_string(), TokenType::Following);
        keywords.insert("CURRENT".to_string(), TokenType::Current);
        keywords.insert("ROW".to_string(), TokenType::Row);
        keywords.insert("GROUPS".to_string(), TokenType::Groups);
        keywords.insert("RECURSIVE".to_string(), TokenType::Recursive);
        keywords.insert("BOTH".to_string(), TokenType::Both);
        keywords.insert("LEADING".to_string(), TokenType::Leading);
        keywords.insert("TRAILING".to_string(), TokenType::Trailing);
        keywords.insert("INTERVAL".to_string(), TokenType::Interval);
        keywords.insert("TOP".to_string(), TokenType::Top);
        keywords.insert("QUALIFY".to_string(), TokenType::Qualify);
        keywords.insert("SAMPLE".to_string(), TokenType::Sample);
        keywords.insert("TABLESAMPLE".to_string(), TokenType::TableSample);
        keywords.insert("BERNOULLI".to_string(), TokenType::Bernoulli);
        keywords.insert("SYSTEM".to_string(), TokenType::System);
        keywords.insert("BLOCK".to_string(), TokenType::Block);
        keywords.insert("SEED".to_string(), TokenType::Seed);
        keywords.insert("REPEATABLE".to_string(), TokenType::Repeatable);
        keywords.insert("TIES".to_string(), TokenType::Ties);
        keywords.insert("LATERAL".to_string(), TokenType::Lateral);
        keywords.insert("LAMBDA".to_string(), TokenType::Lambda);
        keywords.insert("APPLY".to_string(), TokenType::Apply);
        keywords.insert("CONNECT".to_string(), TokenType::Connect);
        keywords.insert("CLUSTER".to_string(), TokenType::Cluster);
        keywords.insert("DISTRIBUTE".to_string(), TokenType::Distribute);
        keywords.insert("SORT".to_string(), TokenType::Sort);
        keywords.insert("PIVOT".to_string(), TokenType::Pivot);
        keywords.insert("PREWHERE".to_string(), TokenType::Prewhere);
        keywords.insert("UNPIVOT".to_string(), TokenType::Unpivot);
        keywords.insert("FOR".to_string(), TokenType::For);
        keywords.insert("ANY".to_string(), TokenType::Any);
        keywords.insert("SOME".to_string(), TokenType::Some);
        keywords.insert("ASOF".to_string(), TokenType::AsOf);
        keywords.insert("PERCENT".to_string(), TokenType::Percent);
        keywords.insert("EXCLUDE".to_string(), TokenType::Exclude);
        keywords.insert("NO".to_string(), TokenType::No);
        keywords.insert("OTHERS".to_string(), TokenType::Others);
        keywords.insert("OPERATOR".to_string(), TokenType::Operator);
        keywords.insert("SCHEMA".to_string(), TokenType::Schema);
        keywords.insert("NAMESPACE".to_string(), TokenType::Namespace);
        keywords.insert("DATABASE".to_string(), TokenType::Database);
        keywords.insert("FUNCTION".to_string(), TokenType::Function);
        keywords.insert("PROCEDURE".to_string(), TokenType::Procedure);
        keywords.insert("PROC".to_string(), TokenType::Procedure);
        keywords.insert("SEQUENCE".to_string(), TokenType::Sequence);
        keywords.insert("TRIGGER".to_string(), TokenType::Trigger);
        keywords.insert("TYPE".to_string(), TokenType::Type);
        keywords.insert("DOMAIN".to_string(), TokenType::Domain);
        keywords.insert("RETURNS".to_string(), TokenType::Returns);
        keywords.insert("RETURNING".to_string(), TokenType::Returning);
        keywords.insert("LANGUAGE".to_string(), TokenType::Language);
        keywords.insert("ROLLBACK".to_string(), TokenType::Rollback);
        keywords.insert("COMMIT".to_string(), TokenType::Commit);
        keywords.insert("BEGIN".to_string(), TokenType::Begin);
        keywords.insert("DESCRIBE".to_string(), TokenType::Describe);
        keywords.insert("PRESERVE".to_string(), TokenType::Preserve);
        keywords.insert("TRANSACTION".to_string(), TokenType::Transaction);
        keywords.insert("SAVEPOINT".to_string(), TokenType::Savepoint);
        keywords.insert("BODY".to_string(), TokenType::Body);
        keywords.insert("INCREMENT".to_string(), TokenType::Increment);
        keywords.insert("MINVALUE".to_string(), TokenType::Minvalue);
        keywords.insert("MAXVALUE".to_string(), TokenType::Maxvalue);
        keywords.insert("CYCLE".to_string(), TokenType::Cycle);
        keywords.insert("NOCYCLE".to_string(), TokenType::NoCycle);
        keywords.insert("PRIOR".to_string(), TokenType::Prior);
        keywords.insert("MATCH".to_string(), TokenType::Match);
        keywords.insert("MATCH_RECOGNIZE".to_string(), TokenType::MatchRecognize);
        keywords.insert("MEASURES".to_string(), TokenType::Measures);
        keywords.insert("PATTERN".to_string(), TokenType::Pattern);
        keywords.insert("DEFINE".to_string(), TokenType::Define);
        keywords.insert("RUNNING".to_string(), TokenType::Running);
        keywords.insert("FINAL".to_string(), TokenType::Final);
        keywords.insert("OWNED".to_string(), TokenType::Owned);
        keywords.insert("AFTER".to_string(), TokenType::After);
        keywords.insert("BEFORE".to_string(), TokenType::Before);
        keywords.insert("INSTEAD".to_string(), TokenType::Instead);
        keywords.insert("EACH".to_string(), TokenType::Each);
        keywords.insert("STATEMENT".to_string(), TokenType::Statement);
        keywords.insert("REFERENCING".to_string(), TokenType::Referencing);
        keywords.insert("OLD".to_string(), TokenType::Old);
        keywords.insert("NEW".to_string(), TokenType::New);
        keywords.insert("OF".to_string(), TokenType::Of);
        keywords.insert("CHECK".to_string(), TokenType::Check);
        keywords.insert("START".to_string(), TokenType::Start);
        keywords.insert("ENUM".to_string(), TokenType::Enum);
        keywords.insert("AUTHORIZATION".to_string(), TokenType::Authorization);
        keywords.insert("RESTART".to_string(), TokenType::Restart);
        keywords.insert("DATE".to_string(), TokenType::Date);
        keywords.insert("TIME".to_string(), TokenType::Time);
        keywords.insert("TIMESTAMP".to_string(), TokenType::Timestamp);
        keywords.insert("DATETIME".to_string(), TokenType::DateTime);
        keywords.insert("GENERATED".to_string(), TokenType::Generated);
        keywords.insert("IDENTITY".to_string(), TokenType::Identity);
        keywords.insert("ALWAYS".to_string(), TokenType::Always);
        keywords.insert("LOAD".to_string(), TokenType::Load);
        keywords.insert("LOCAL".to_string(), TokenType::Local);
        keywords.insert("INPATH".to_string(), TokenType::Inpath);
        keywords.insert("INPUTFORMAT".to_string(), TokenType::InputFormat);
        keywords.insert("SERDE".to_string(), TokenType::Serde);
        keywords.insert("SERDEPROPERTIES".to_string(), TokenType::SerdeProperties);
        keywords.insert("FORMAT".to_string(), TokenType::Format);
        keywords.insert("PRAGMA".to_string(), TokenType::Pragma);
        keywords.insert("SHOW".to_string(), TokenType::Show);
        keywords.insert("SIBLINGS".to_string(), TokenType::Siblings);
        keywords.insert("COPY".to_string(), TokenType::Copy);
        keywords.insert("PUT".to_string(), TokenType::Put);
        keywords.insert("GET".to_string(), TokenType::Get);
        keywords.insert("EXEC".to_string(), TokenType::Execute);
        keywords.insert("EXECUTE".to_string(), TokenType::Execute);
        keywords.insert("ISNULL".to_string(), TokenType::IsNull);
        keywords.insert("NOTNULL".to_string(), TokenType::NotNull);

        let mut single_tokens = std::collections::HashMap::new();
        single_tokens.insert('(', TokenType::LParen);
        single_tokens.insert(')', TokenType::RParen);
        single_tokens.insert('[', TokenType::LBracket);
        single_tokens.insert(']', TokenType::RBracket);
        single_tokens.insert('{', TokenType::LBrace);
        single_tokens.insert('}', TokenType::RBrace);
        single_tokens.insert(',', TokenType::Comma);
        single_tokens.insert('.', TokenType::Dot);
        single_tokens.insert(';', TokenType::Semicolon);
        single_tokens.insert('+', TokenType::Plus);
        single_tokens.insert('-', TokenType::Dash);
        single_tokens.insert('*', TokenType::Star);
        single_tokens.insert('/', TokenType::Slash);
        single_tokens.insert('%', TokenType::Percent);
        single_tokens.insert('&', TokenType::Amp);
        single_tokens.insert('|', TokenType::Pipe);
        single_tokens.insert('^', TokenType::Caret);
        single_tokens.insert('~', TokenType::Tilde);
        single_tokens.insert('<', TokenType::Lt);
        single_tokens.insert('>', TokenType::Gt);
        single_tokens.insert('=', TokenType::Eq);
        single_tokens.insert('!', TokenType::Exclamation);
        single_tokens.insert(':', TokenType::Colon);
        single_tokens.insert('@', TokenType::DAt);
        single_tokens.insert('#', TokenType::Hash);
        single_tokens.insert('$', TokenType::Dollar);
        single_tokens.insert('?', TokenType::Parameter);

        let mut quotes = std::collections::HashMap::new();
        quotes.insert("'".to_string(), "'".to_string());
        quotes.insert("\"\"\"".to_string(), "\"\"\"".to_string());

        let mut identifiers = std::collections::HashMap::new();
        identifiers.insert('"', '"');
        identifiers.insert('`', '`');

        let mut comments = std::collections::HashMap::new();
        comments.insert("--".to_string(), None);
        comments.insert("/*".to_string(), Some("*/".to_string()));

        Self {
            keywords,
            single_tokens,
            quotes,
            identifiers,
            comments,
            string_escapes: vec!['\''],
            nested_comments: true,
            escape_follow_chars: vec![],
            b_prefix_is_byte_string: false,
            numeric_literals: std::collections::HashMap::new(),
            identifiers_can_start_with_digit: false,
            hex_number_strings: false,
            hex_string_is_integer_type: false,
            string_escapes_allowed_in_raw_strings: true,
        }
    }
}
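
// A sketch of customizing the config for a MySQL-flavored dialect: enable
// backslash escapes and treat double quotes as string quotes instead of
// identifier quotes (the exact adjustments depend on the target dialect).
//
//     let mut config = TokenizerConfig::default();
//     config.string_escapes.push('\\');
//     config.identifiers.remove(&'"');
//     config.quotes.insert("\"".to_string(), "\"".to_string());
//     let tokenizer = Tokenizer::new(config);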

/// Converts SQL text into a stream of [`Token`]s according to a
/// [`TokenizerConfig`].
pub struct Tokenizer {
    config: TokenizerConfig,
}

impl Tokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self { config }
    }

    /// Creates a tokenizer with the default configuration.
    pub fn default_config() -> Self {
        Self::new(TokenizerConfig::default())
    }

    /// Tokenizes `sql`, returning the tokens or the first tokenize error.
    pub fn tokenize(&self, sql: &str) -> Result<Vec<Token>> {
        let mut state = TokenizerState::new(sql, &self.config);
        state.tokenize()
    }
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::default_config()
    }
}
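
// Illustrative usage with the default configuration:
//
//     let tokens = Tokenizer::default().tokenize("SELECT a + 1 FROM t").unwrap();
//     let kinds: Vec<TokenType> = tokens.iter().map(|t| t.token_type).collect();
//     assert_eq!(
//         kinds,
//         vec![
//             TokenType::Select,
//             TokenType::Var,
//             TokenType::Plus,
//             TokenType::Number,
//             TokenType::From,
//             TokenType::Var,
//         ],
//     );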

/// Mutable state for a single tokenize run.
struct TokenizerState<'a> {
    /// The source text as a vector of characters.
    chars: Vec<char>,
    /// Total number of characters.
    size: usize,
    /// Tokens produced so far.
    tokens: Vec<Token>,
    /// Index where the current token started.
    start: usize,
    /// Index of the next character to consume.
    current: usize,
    /// Current line (1-based).
    line: usize,
    /// Current column (1-based).
    column: usize,
    /// Comments seen before the first token was produced.
    comments: Vec<String>,
    config: &'a TokenizerConfig,
}

impl<'a> TokenizerState<'a> {
    fn new(sql: &str, config: &'a TokenizerConfig) -> Self {
        let chars: Vec<char> = sql.chars().collect();
        let size = chars.len();
        Self {
            chars,
            size,
            tokens: Vec::new(),
            start: 0,
            current: 0,
            line: 1,
            column: 1,
            comments: Vec::new(),
            config,
        }
    }

    fn tokenize(&mut self) -> Result<Vec<Token>> {
        while !self.is_at_end() {
            self.skip_whitespace()?;
            if self.is_at_end() {
                break;
            }

            self.start = self.current;
            self.scan_token()?;
        }

        Ok(std::mem::take(&mut self.tokens))
    }

    fn is_at_end(&self) -> bool {
        self.current >= self.size
    }

    fn peek(&self) -> char {
        if self.is_at_end() {
            '\0'
        } else {
            self.chars[self.current]
        }
    }

    fn peek_next(&self) -> char {
        if self.current + 1 >= self.size {
            '\0'
        } else {
            self.chars[self.current + 1]
        }
    }

    fn advance(&mut self) -> char {
        let c = self.peek();
        self.current += 1;
        if c == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        c
    }

    fn skip_whitespace(&mut self) -> Result<()> {
        while !self.is_at_end() {
            let c = self.peek();
            match c {
                ' ' | '\t' | '\r' | '\n' => {
                    self.advance();
                }
                '-' if self.peek_next() == '-' => {
                    self.scan_line_comment();
                }
                '/' if self.peek_next() == '*' => {
                    // Leave optimizer hints (`/*+ ... */`) for scan_token.
                    if self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
                        break;
                    }
                    self.scan_block_comment()?;
                }
                _ => break,
            }
        }
        Ok(())
    }

    fn scan_line_comment(&mut self) {
        self.advance(); // consume "--"
        self.advance();
        let start = self.current;
        while !self.is_at_end() && self.peek() != '\n' {
            self.advance();
        }
        let comment: String = self.chars[start..self.current].iter().collect();
        let comment_text = comment.trim().to_string();

        // Attach to the previous token if there is one; otherwise hold the
        // comment until the first token is produced.
        if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        } else {
            self.comments.push(comment_text);
        }
    }

    fn scan_block_comment(&mut self) -> Result<()> {
        self.advance(); // consume "/*"
        self.advance();
        let content_start = self.current;
        let mut depth = 1;

        while !self.is_at_end() && depth > 0 {
            if self.peek() == '/' && self.peek_next() == '*' && self.config.nested_comments {
                self.advance();
                self.advance();
                depth += 1;
            } else if self.peek() == '*' && self.peek_next() == '/' {
                depth -= 1;
                // Leave the outermost "*/" unconsumed so the content slice
                // below excludes it.
                if depth > 0 {
                    self.advance();
                    self.advance();
                }
            } else {
                self.advance();
            }
        }

        if depth > 0 {
            return Err(Error::tokenize(
                "Unterminated block comment",
                self.line,
                self.column,
            ));
        }

        let content: String = self.chars[content_start..self.current].iter().collect();
        self.advance(); // consume the closing "*/"
        self.advance();
        let comment_text = format!("/*{}*/", content);

        if let Some(last) = self.tokens.last_mut() {
            last.trailing_comments.push(comment_text);
        } else {
            self.comments.push(comment_text);
        }

        Ok(())
    }

    fn scan_hint(&mut self) -> Result<()> {
        self.advance(); // consume "/*+"
        self.advance();
        self.advance();
        let hint_start = self.current;

        while !self.is_at_end() {
            if self.peek() == '*' && self.peek_next() == '/' {
                break;
            }
            self.advance();
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated hint comment",
                self.line,
                self.column,
            ));
        }

        let hint_text: String = self.chars[hint_start..self.current].iter().collect();
        self.advance(); // consume "*/"
        self.advance();
        self.add_token_with_text(TokenType::Hint, hint_text.trim().to_string());

        Ok(())
    }

    fn scan_positional_parameter(&mut self) -> Result<()> {
        self.advance(); // consume '$'
        let start = self.current;

        while !self.is_at_end() && self.peek().is_ascii_digit() {
            self.advance();
        }

        let number: String = self.chars[start..self.current].iter().collect();
        self.add_token_with_text(TokenType::Parameter, number);
        Ok(())
    }

    fn try_scan_tagged_dollar_string(&mut self) -> Result<Option<()>> {
        // Save the full position so a failed attempt can rewind cleanly,
        // including the line and column counters.
        let saved_pos = self.current;
        let saved_line = self.line;
        let saved_column = self.column;

        self.advance(); // consume the opening '$'
        let tag_start = self.current;
        while !self.is_at_end()
            && (self.peek().is_alphanumeric() || self.peek() == '_' || !self.peek().is_ascii())
        {
            self.advance();
        }
        let tag: String = self.chars[tag_start..self.current].iter().collect();

        if self.is_at_end() || self.peek() != '$' {
            // Not a tagged dollar string; rewind and let the caller retry.
            self.current = saved_pos;
            self.line = saved_line;
            self.column = saved_column;
            return Ok(None);
        }
        self.advance(); // consume the '$' that closes the tag

        let content_start = self.current;
        let closing_tag = format!("${}$", tag);
        let closing_chars: Vec<char> = closing_tag.chars().collect();

        loop {
            if self.is_at_end() {
                // Unterminated; rewind and let the caller retry.
                self.current = saved_pos;
                self.line = saved_line;
                self.column = saved_column;
                return Ok(None);
            }

            if self.peek() == '$' && self.current + closing_chars.len() <= self.size {
                let matches = closing_chars.iter().enumerate().all(|(j, &ch)| {
                    self.current + j < self.size && self.chars[self.current + j] == ch
                });
                if matches {
                    let content: String = self.chars[content_start..self.current].iter().collect();
                    for _ in 0..closing_chars.len() {
                        self.advance();
                    }
                    // Store the tag and content joined by NUL; see
                    // `parse_dollar_string_token`.
                    let token_text = format!("{}\x00{}", tag, content);
                    self.add_token_with_text(TokenType::DollarString, token_text);
                    return Ok(Some(()));
                }
            }
            self.advance();
        }
    }

    fn scan_dollar_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume the opening "$$"
        self.advance();
        let start = self.current;
        while !self.is_at_end() {
            if self.peek() == '$' && self.current + 1 < self.size && self.chars[self.current + 1] == '$' {
                break;
            }
            self.advance();
        }

        let content: String = self.chars[start..self.current].iter().collect();

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated dollar-quoted string",
                self.line,
                self.column,
            ));
        }
        self.advance(); // consume the closing "$$"
        self.advance();

        self.add_token_with_text(TokenType::DollarString, content);
        Ok(())
    }

    fn scan_token(&mut self) -> Result<()> {
        let c = self.peek();

        if c == '\'' {
            if self.config.quotes.contains_key("'''")
                && self.peek_next() == '\''
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == '\''
            {
                return self.scan_triple_quoted_string('\'');
            }
            return self.scan_string();
        }

        if c == '"'
            && self.config.quotes.contains_key("\"\"\"")
            && self.peek_next() == '"'
            && self.current + 2 < self.size
            && self.chars[self.current + 2] == '"'
        {
            return self.scan_triple_quoted_string('"');
        }

        if c == '"' && self.config.quotes.contains_key("\"") && !self.config.identifiers.contains_key(&'"') {
            return self.scan_double_quoted_string();
        }

        if let Some(&end_quote) = self.config.identifiers.get(&c) {
            return self.scan_quoted_identifier(end_quote);
        }

        if c.is_ascii_digit() {
            return self.scan_number();
        }

        // A leading dot can start a number (".5") unless it follows something
        // that makes it a member access (e.g. "t.5").
        if c == '.' && self.peek_next().is_ascii_digit() {
            let prev_char = if self.current > 0 { self.chars[self.current - 1] } else { '\0' };
            let is_after_ident = prev_char.is_alphanumeric()
                || prev_char == '_'
                || prev_char == '`'
                || prev_char == '"'
                || prev_char == ']'
                || prev_char == ')';
            if prev_char != '.' && !is_after_ident {
                return self.scan_number_starting_with_dot();
            }
        }

        // Optimizer hints: /*+ ... */
        if c == '/' && self.peek_next() == '*' && self.current + 2 < self.size && self.chars[self.current + 2] == '+' {
            return self.scan_hint();
        }

        if let Some(token_type) = self.try_scan_multi_char_operator() {
            self.add_token(token_type);
            return Ok(());
        }

        // Tagged dollar strings: $tag$...$tag$.
        if c == '$'
            && (self.peek_next().is_alphanumeric() || self.peek_next() == '_' || !self.peek_next().is_ascii())
        {
            if let Some(()) = self.try_scan_tagged_dollar_string()? {
                return Ok(());
            }
        }

        // Untagged dollar strings: $$...$$.
        if c == '$' && self.peek_next() == '$' {
            return self.scan_dollar_quoted_string();
        }

        // Positional parameters: $1, $2, ...
        if c == '$' && self.peek_next().is_ascii_digit() {
            return self.scan_positional_parameter();
        }

        // T-SQL style identifiers: #temp, @variable, ##global.
        if (c == '#' || c == '@') && (self.peek_next().is_alphanumeric() || self.peek_next() == '_' || self.peek_next() == '#') {
            return self.scan_tsql_identifier();
        }

        if let Some(&token_type) = self.config.single_tokens.get(&c) {
            self.advance();
            self.add_token(token_type);
            return Ok(());
        }

        self.scan_identifier_or_keyword()
    }

    fn try_scan_multi_char_operator(&mut self) -> Option<TokenType> {
        let c = self.peek();
        let next = self.peek_next();
        let third = if self.current + 2 < self.size { self.chars[self.current + 2] } else { '\0' };

        if c == '-' && next == '|' && third == '-' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::Adjacent);
        }

        if c == '|' && next == '|' && third == '/' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DPipeSlash);
        }

        if c == '#' && next == '>' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DHashArrow);
        }

        if c == '-' && next == '>' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DArrow);
        }

        if c == '<' && next == '=' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NullsafeEq);
        }

        if c == '<' && next == '-' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::LrArrow);
        }

        if c == '<' && next == '@' {
            self.advance();
            self.advance();
            return Some(TokenType::LtAt);
        }

        if c == '@' && next == '>' {
            self.advance();
            self.advance();
            return Some(TokenType::AtGt);
        }

        if c == '~' && next == '~' && third == '~' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::Glob);
        }

        if c == '~' && next == '~' && third == '*' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::ILike);
        }

        let fourth = if self.current + 3 < self.size { self.chars[self.current + 3] } else { '\0' };
        if c == '!' && next == '~' && third == '~' && fourth == '*' {
            self.advance();
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotILike);
        }

        if c == '!' && next == '~' && third == '~' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotLike);
        }

        if c == '!' && next == '~' && third == '*' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NotIRLike);
        }

        if c == '!' && next == ':' && third == '>' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::NColonGt);
        }

        if c == '?' && next == ':' && third == ':' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::QDColon);
        }

        if c == '!' && next == '~' {
            self.advance();
            self.advance();
            return Some(TokenType::NotRLike);
        }

        if c == '~' && next == '~' {
            self.advance();
            self.advance();
            return Some(TokenType::Like);
        }

        if c == '~' && next == '*' {
            self.advance();
            self.advance();
            return Some(TokenType::IRLike);
        }

        if c == ':' && next == ':' && third == '$' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonDollar);
        }
        if c == ':' && next == ':' && third == '%' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonPercent);
        }
        if c == ':' && next == ':' && third == '?' {
            self.advance();
            self.advance();
            self.advance();
            return Some(TokenType::DColonQMark);
        }

        let token_type = match (c, next) {
            ('.', ':') => Some(TokenType::DotColon),
            ('=', '=') => Some(TokenType::Eq),
            ('<', '=') => Some(TokenType::Lte),
            ('>', '=') => Some(TokenType::Gte),
            ('!', '=') => Some(TokenType::Neq),
            ('<', '>') => Some(TokenType::Neq),
            ('^', '=') => Some(TokenType::Neq),
            ('<', '<') => Some(TokenType::LtLt),
            ('>', '>') => Some(TokenType::GtGt),
            ('|', '|') => Some(TokenType::DPipe),
            ('|', '/') => Some(TokenType::PipeSlash),
            (':', ':') => Some(TokenType::DColon),
            (':', '=') => Some(TokenType::ColonEq),
            (':', '>') => Some(TokenType::ColonGt),
            ('-', '>') => Some(TokenType::Arrow),
            ('=', '>') => Some(TokenType::FArrow),
            ('&', '&') => Some(TokenType::DAmp),
            ('&', '<') => Some(TokenType::AmpLt),
            ('&', '>') => Some(TokenType::AmpGt),
            ('@', '@') => Some(TokenType::AtAt),
            ('?', '|') => Some(TokenType::QMarkPipe),
            ('?', '&') => Some(TokenType::QMarkAmp),
            ('?', '?') => Some(TokenType::DQMark),
            ('#', '>') => Some(TokenType::HashArrow),
            ('#', '-') => Some(TokenType::HashDash),
            ('^', '@') => Some(TokenType::CaretAt),
            ('*', '*') => Some(TokenType::DStar),
            ('|', '>') => Some(TokenType::PipeGt),
            _ => None,
        };

        if token_type.is_some() {
            self.advance();
            self.advance();
        }

        token_type
    }

    fn scan_string(&mut self) -> Result<()> {
        self.advance(); // consume the opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // A doubled quote is an escaped quote.
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // substitute (Ctrl+Z)
                        'a' => value.push('\x07'), // bell
                        'b' => value.push('\x08'), // backspace
                        'f' => value.push('\x0C'), // form feed
                        'v' => value.push('\x0B'), // vertical tab
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            // Keep the bare wildcard character.
                            value.push('%');
                        }
                        '_' => {
                            value.push('_');
                        }
                        _ => {
                            // Unknown escape: drop the backslash only if the
                            // dialect restricts which characters may follow
                            // an escape.
                            if !self.config.escape_follow_chars.is_empty() {
                                value.push(escaped);
                            } else {
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume the closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }

    fn scan_double_quoted_string(&mut self) -> Result<()> {
        self.advance(); // consume the opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // A doubled quote is an escaped quote.
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && self.config.string_escapes.contains(&'\\') {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        'Z' => value.push('\x1A'), // substitute (Ctrl+Z)
                        'a' => value.push('\x07'), // bell
                        'b' => value.push('\x08'), // backspace
                        'f' => value.push('\x0C'), // form feed
                        'v' => value.push('\x0B'), // vertical tab
                        '\\' => value.push('\\'),
                        '\'' => value.push('\''),
                        '"' => value.push('"'),
                        '%' => {
                            // Keep the bare wildcard character.
                            value.push('%');
                        }
                        '_' => {
                            value.push('_');
                        }
                        _ => {
                            // Unknown escape: drop the backslash only if the
                            // dialect restricts which characters may follow
                            // an escape.
                            if !self.config.escape_follow_chars.is_empty() {
                                value.push(escaped);
                            } else {
                                value.push('\\');
                                value.push(escaped);
                            }
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume the closing quote
        self.add_token_with_text(TokenType::String, value);
        Ok(())
    }

    fn scan_triple_quoted_string(&mut self, quote_char: char) -> Result<()> {
        self.advance(); // consume the three opening quotes
        self.advance();
        self.advance();
        let mut value = String::new();

        while !self.is_at_end() {
            if self.peek() == quote_char
                && self.current + 1 < self.size
                && self.chars[self.current + 1] == quote_char
                && self.current + 2 < self.size
                && self.chars[self.current + 2] == quote_char
            {
                break;
            }
            value.push(self.advance());
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated triple-quoted string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume the three closing quotes
        self.advance();
        self.advance();
        let token_type = if quote_char == '"' {
            TokenType::TripleDoubleQuotedString
        } else {
            TokenType::TripleSingleQuotedString
        };
        self.add_token_with_text(token_type, value);
        Ok(())
    }

    fn scan_quoted_identifier(&mut self, end_quote: char) -> Result<()> {
        self.advance(); // consume the opening quote
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == end_quote {
                // A doubled end quote is an escaped quote character.
                if self.peek_next() == end_quote {
                    value.push(end_quote);
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated identifier",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume the closing quote
        self.add_token_with_text(TokenType::QuotedIdentifier, value);
        Ok(())
    }

    fn scan_number(&mut self) -> Result<()> {
        // Hex literals: 0x1F (when the dialect supports them).
        if self.config.hex_number_strings && self.peek() == '0' {
            let next = if self.current + 1 < self.size { self.chars[self.current + 1] } else { '\0' };
            if next == 'x' || next == 'X' {
                self.advance(); // consume "0x"
                self.advance();
                let hex_start = self.current;
                while !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                    self.advance();
                }
                if self.current > hex_start {
                    let hex_value: String = self.chars[hex_start..self.current].iter().collect();
                    if self.config.hex_string_is_integer_type {
                        self.add_token_with_text(TokenType::HexNumber, hex_value);
                    } else {
                        self.add_token_with_text(TokenType::HexString, hex_value);
                    }
                    return Ok(());
                }
                // No hex digits followed "0x"; rewind to just past the '0',
                // keeping the column counter in sync.
                self.current = self.start + 1;
                self.column -= 1;
            }
        }

        // Integer part; underscores are allowed between digits.
        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                break;
            }
            self.advance();
        }

        // Fractional part, unless the dot begins a ".." sequence.
        if self.peek() == '.' {
            let next = self.peek_next();
            if next != '.' {
                self.advance(); // consume '.'
                while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                    if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                        break;
                    }
                    self.advance();
                }
            }
        }

        // Exponent part: 1e10, 2.5E-3.
        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();

        // Dialect-specific numeric literal suffixes.
        if !self.config.numeric_literals.is_empty() && !self.is_at_end() {
            let next_char = self.peek().to_uppercase().to_string();
            let suffix_match = if self.current + 1 < self.size {
                let two_char: String = vec![self.chars[self.current], self.chars[self.current + 1]]
                    .iter()
                    .collect::<String>()
                    .to_uppercase();
                if self.config.numeric_literals.contains_key(&two_char) {
                    let after_suffix = if self.current + 2 < self.size { self.chars[self.current + 2] } else { ' ' };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((two_char, 2))
                    } else {
                        None
                    }
                } else if self.config.numeric_literals.contains_key(&next_char) {
                    let after_suffix = if self.current + 1 < self.size { self.chars[self.current + 1] } else { ' ' };
                    if !after_suffix.is_alphanumeric() && after_suffix != '_' {
                        Some((next_char, 1))
                    } else {
                        None
                    }
                } else {
                    None
                }
            } else if self.config.numeric_literals.contains_key(&next_char) {
                Some((next_char, 1))
            } else {
                None
            };

            if let Some((suffix, len)) = suffix_match {
                for _ in 0..len {
                    self.advance();
                }
                let type_name = self
                    .config
                    .numeric_literals
                    .get(&suffix)
                    .expect("suffix verified by contains_key above")
                    .clone();
                let combined = format!("{}::{}", text, type_name);
                self.add_token_with_text(TokenType::Number, combined);
                return Ok(());
            }
        }

        // Some dialects allow identifiers that start with digits.
        if self.config.identifiers_can_start_with_digit && !self.is_at_end() {
            let next = self.peek();
            if next.is_alphabetic() || next == '_' {
                while !self.is_at_end() {
                    let ch = self.peek();
                    if ch.is_alphanumeric() || ch == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                let ident_text: String = self.chars[self.start..self.current].iter().collect();
                self.add_token_with_text(TokenType::Identifier, ident_text);
                return Ok(());
            }
        }

        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }

    fn scan_number_starting_with_dot(&mut self) -> Result<()> {
        self.advance();

        while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                break;
            }
            self.advance();
        }

        if self.peek() == 'e' || self.peek() == 'E' {
            self.advance();
            if self.peek() == '+' || self.peek() == '-' {
                self.advance();
            }
            while !self.is_at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                if self.peek() == '_' && !self.peek_next().is_ascii_digit() {
                    break;
                }
                self.advance();
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        self.add_token_with_text(TokenType::Number, text);
        Ok(())
    }

    fn scan_identifier_or_keyword(&mut self) -> Result<()> {
        let first_char = self.peek();
        if !first_char.is_alphanumeric() && first_char != '_' {
            let c = self.advance();
            return Err(Error::tokenize(
                format!("Unexpected character: '{}'", c),
                self.line,
                self.column,
            ));
        }

        while !self.is_at_end() {
            let c = self.peek();
            if c == '#' {
                // Stop before the JSON operators #> and #-.
                let next_c = if self.current + 1 < self.size { self.chars[self.current + 1] } else { '\0' };
                if next_c == '>' || next_c == '-' {
                    break;
                }
                self.advance();
            } else if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        let upper = text.to_uppercase();

        if upper == "NOT" && self.peek() == '=' {
            self.advance(); // consume '='
            self.add_token(TokenType::Neq);
            return Ok(());
        }

        // String prefixes (r'...', N'...', E'...', X'...', B'...') are
        // identifiers immediately followed by a quote.
        let next_char = self.peek();
        let is_single_quote = next_char == '\'';
        let is_double_quote = next_char == '"' && self.config.quotes.contains_key("\"");
        let is_double_quote_for_raw = next_char == '"';

        if upper == "R" && (is_single_quote || is_double_quote_for_raw) {
            let quote_char = if is_single_quote { '\'' } else { '"' };
            self.advance(); // consume the quote
            if self.peek() == quote_char && self.peek_next() == quote_char {
                self.advance(); // consume the remaining opening quotes
                self.advance();
                let string_value = self.scan_raw_triple_quoted_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            } else {
                let string_value = self.scan_raw_string_content(quote_char)?;
                self.add_token_with_text(TokenType::RawString, string_value);
            }
            return Ok(());
        }

        if is_single_quote || is_double_quote {
            match upper.as_str() {
                "N" => {
                    self.advance(); // consume the quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::NationalString, string_value);
                    return Ok(());
                }
                "E" => {
                    // Remember whether the prefix was lowercase.
                    let lowercase = text == "e";
                    let prefix = if lowercase { "e:" } else { "E:" };
                    self.advance(); // consume the quote
                    let string_value = self.scan_string_content_with_escapes(true)?;
                    self.add_token_with_text(TokenType::EscapeString, format!("{}{}", prefix, string_value));
                    return Ok(());
                }
                "X" => {
                    self.advance(); // consume the quote
                    let string_value = if is_single_quote {
                        self.scan_string_content()?
                    } else {
                        self.scan_double_quoted_string_content()?
                    };
                    self.add_token_with_text(TokenType::HexString, string_value);
                    return Ok(());
                }
                "B" if is_double_quote => {
                    self.advance(); // consume the quote
                    let string_value = self.scan_double_quoted_string_content()?;
                    self.add_token_with_text(TokenType::ByteString, string_value);
                    return Ok(());
                }
                "B" if is_single_quote => {
                    self.advance(); // consume the quote
                    let string_value = self.scan_string_content()?;
                    if self.config.b_prefix_is_byte_string {
                        self.add_token_with_text(TokenType::ByteString, string_value);
                    } else {
                        self.add_token_with_text(TokenType::BitString, string_value);
                    }
                    return Ok(());
                }
                _ => {}
            }
        }

        // Unicode strings: U&'...'.
        if upper == "U" && self.peek() == '&' && self.current + 1 < self.size && self.chars[self.current + 1] == '\'' {
            self.advance(); // consume "&'"
            self.advance();
            let string_value = self.scan_string_content()?;
            self.add_token_with_text(TokenType::UnicodeString, string_value);
            return Ok(());
        }

        let token_type = self
            .config
            .keywords
            .get(&upper)
            .copied()
            .unwrap_or(TokenType::Var);

        self.add_token_with_text(token_type, text);
        Ok(())
    }
2473
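    /// Consumes a single-quoted string body up to and including the closing
    /// quote. Doubled quotes are collapsed; when backslash escapes are active
    /// (forced or configured), `\x` pairs are kept verbatim.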
    fn scan_string_content_with_escapes(&mut self, force_backslash_escapes: bool) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = force_backslash_escapes || self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '\'' {
                if self.peek_next() == '\'' {
                    // A doubled quote is an escaped single quote.
                    value.push('\'');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                // Keep the backslash pair verbatim so an escaped quote
                // does not terminate the string.
                value.push(self.advance());
                if !self.is_at_end() {
                    value.push(self.advance());
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume the closing quote
        Ok(value)
    }

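    /// Scans a plain single-quoted string, honoring the dialect's configured
    /// escape characters.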
    fn scan_string_content(&mut self) -> Result<String> {
        self.scan_string_content_with_escapes(false)
    }

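    /// Consumes a double-quoted string body, collapsing doubled quotes and
    /// decoding backslash escapes (`\n`, `\t`, `\xNN`, ...) when enabled.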
    fn scan_double_quoted_string_content(&mut self) -> Result<String> {
        let mut value = String::new();
        let use_backslash_escapes = self.config.string_escapes.contains(&'\\');

        while !self.is_at_end() {
            let c = self.peek();
            if c == '"' {
                if self.peek_next() == '"' {
                    // A doubled quote is an escaped double quote.
                    value.push('"');
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\' && use_backslash_escapes {
                self.advance(); // consume the backslash
                if !self.is_at_end() {
                    let escaped = self.advance();
                    match escaped {
                        'n' => value.push('\n'),
                        'r' => value.push('\r'),
                        't' => value.push('\t'),
                        '0' => value.push('\0'),
                        '\\' => value.push('\\'),
                        '"' => value.push('"'),
                        '\'' => value.push('\''),
                        'x' => {
                            // \xNN hex escape: read up to two hex digits.
                            let mut hex = String::new();
                            for _ in 0..2 {
                                if !self.is_at_end() && self.peek().is_ascii_hexdigit() {
                                    hex.push(self.advance());
                                }
                            }
                            if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                                value.push(byte as char);
                            } else {
                                // Not a valid hex escape; keep it verbatim.
                                value.push('\\');
                                value.push('x');
                                value.push_str(&hex);
                            }
                        }
                        _ => {
                            // Unknown escape: keep the backslash and the character.
                            value.push('\\');
                            value.push(escaped);
                        }
                    }
                }
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated double-quoted string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume the closing quote
        Ok(value)
    }

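    /// Scans a raw string body: no escape processing beyond doubled quotes
    /// and, when the dialect allows it, a backslash-escaped quote.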
    fn scan_raw_string_content(&mut self, quote_char: char) -> Result<String> {
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == quote_char {
                if self.peek_next() == quote_char {
                    // A doubled quote is still an escaped quote in raw strings.
                    value.push(quote_char);
                    self.advance();
                    self.advance();
                } else {
                    break;
                }
            } else if c == '\\'
                && self.peek_next() == quote_char
                && self.config.string_escapes_allowed_in_raw_strings
            {
                // Some dialects allow a backslash-escaped quote even in raw strings.
                value.push(quote_char);
                self.advance();
                self.advance();
            } else {
                value.push(self.advance());
            }
        }

        if self.is_at_end() {
            return Err(Error::tokenize(
                "Unterminated raw string",
                self.line,
                self.column,
            ));
        }

        self.advance(); // consume the closing quote
        Ok(value)
    }

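    /// Scans the body of a raw triple-quoted string, returning once three
    /// consecutive quote characters are found.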
    fn scan_raw_triple_quoted_content(&mut self, quote_char: char) -> Result<String> {
        let mut value = String::new();

        while !self.is_at_end() {
            let c = self.peek();
            if c == quote_char && self.peek_next() == quote_char {
                if self.current + 2 < self.size && self.chars[self.current + 2] == quote_char {
                    // Found the closing triple quote.
                    self.advance();
                    self.advance();
                    self.advance();
                    return Ok(value);
                }
            }
            let ch = self.advance();
            value.push(ch);
        }

        Err(Error::tokenize(
            "Unterminated raw triple-quoted string",
            self.line,
            self.column,
        ))
    }

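    /// Scans a T-SQL style identifier such as `@variable`, `#temp`, or
    /// `##global_temp`, emitting it as `TokenType::Var`.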
    fn scan_tsql_identifier(&mut self) -> Result<()> {
        let first = self.advance();

        // '##' marks a global temporary object in T-SQL.
        if first == '#' && self.peek() == '#' {
            self.advance();
        }

        while !self.is_at_end() {
            let c = self.peek();
            if c.is_alphanumeric() || c == '_' || c == '$' || c == '#' || c == '@' {
                self.advance();
            } else {
                break;
            }
        }

        let text: String = self.chars[self.start..self.current].iter().collect();
        self.add_token_with_text(TokenType::Var, text);
        Ok(())
    }

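    /// Emits a token whose text is the raw source slice `start..current`.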
    fn add_token(&mut self, token_type: TokenType) {
        let text: String = self.chars[self.start..self.current].iter().collect();
        self.add_token_with_text(token_type, text);
    }

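    /// Pushes a token with explicit text, attaching any buffered leading
    /// comments to it.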
    fn add_token_with_text(&mut self, token_type: TokenType, text: String) {
        let span = Span::new(self.start, self.current, self.line, self.column);
        let mut token = Token::new(token_type, text, span);
        token.comments.append(&mut self.comments);
        self.tokens.push(token);
    }
}

2703#[cfg(test)]
2704mod tests {
2705 use super::*;
2706
2707 #[test]
2708 fn test_simple_select() {
2709 let tokenizer = Tokenizer::default();
2710 let tokens = tokenizer.tokenize("SELECT 1").unwrap();
2711
2712 assert_eq!(tokens.len(), 2);
2713 assert_eq!(tokens[0].token_type, TokenType::Select);
2714 assert_eq!(tokens[1].token_type, TokenType::Number);
2715 assert_eq!(tokens[1].text, "1");
2716 }
2717
    #[test]
    fn test_select_with_identifier() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT a, b FROM t").unwrap();

        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Var);
        assert_eq!(tokens[1].text, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Var);
        assert_eq!(tokens[3].text, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Var);
        assert_eq!(tokens[5].text, "t");
    }

    #[test]
    fn test_string_literal() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'hello'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "hello");
    }

    #[test]
    fn test_escaped_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT 'it''s'").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].token_type, TokenType::String);
        assert_eq!(tokens[1].text, "it's");
    }

    #[test]
    fn test_comments() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("SELECT -- comment\n1").unwrap();

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].trailing_comments.len(), 1);
        assert_eq!(tokens[0].trailing_comments[0], "comment");
    }

    #[test]
    fn test_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("1 + 2 * 3").unwrap();

        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[1].token_type, TokenType::Plus);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[3].token_type, TokenType::Star);
        assert_eq!(tokens[4].token_type, TokenType::Number);
    }

    #[test]
    fn test_comparison_operators() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("a <= b >= c != d").unwrap();

        assert_eq!(tokens[1].token_type, TokenType::Lte);
        assert_eq!(tokens[3].token_type, TokenType::Gte);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    #[test]
    fn test_national_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("N'abc'").unwrap();

        assert_eq!(tokens.len(), 1, "Expected 1 token for N'abc', got {:?}", tokens);
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].text, "abc");
    }

    #[test]
    fn test_hex_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("X'ABCD'").unwrap();

        assert_eq!(tokens.len(), 1, "Expected 1 token for X'ABCD', got {:?}", tokens);
        assert_eq!(tokens[0].token_type, TokenType::HexString);
        assert_eq!(tokens[0].text, "ABCD");
    }

    #[test]
    fn test_bit_string() {
        let tokenizer = Tokenizer::default();
        let tokens = tokenizer.tokenize("B'01010'").unwrap();

        assert_eq!(tokens.len(), 1, "Expected 1 token for B'01010', got {:?}", tokens);
        assert_eq!(tokens[0].token_type, TokenType::BitString);
        assert_eq!(tokens[0].text, "01010");
    }

    #[test]
    fn test_trailing_dot_number() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("SELECT 1.").unwrap();
        assert_eq!(tokens.len(), 2, "Expected 2 tokens for 'SELECT 1.', got {:?}", tokens);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");

        let tokens = tokenizer.tokenize("SELECT 1.5").unwrap();
        assert_eq!(tokens[1].text, "1.5");

        let tokens = tokenizer.tokenize("SELECT 1.a").unwrap();
        assert_eq!(tokens.len(), 3, "Expected 3 tokens for 'SELECT 1.a', got {:?}", tokens);
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1.");
        assert_eq!(tokens[2].token_type, TokenType::Var);

        let tokens = tokenizer.tokenize("SELECT 1..2").unwrap();
        assert_eq!(tokens[1].token_type, TokenType::Number);
        assert_eq!(tokens[1].text, "1");
        assert_eq!(tokens[2].token_type, TokenType::Dot);
        assert_eq!(tokens[3].token_type, TokenType::Dot);
        assert_eq!(tokens[4].token_type, TokenType::Number);
        assert_eq!(tokens[4].text, "2");
    }

    #[test]
    fn test_leading_dot_number() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize(".25").unwrap();
        assert_eq!(tokens.len(), 1, "Expected 1 token for '.25', got {:?}", tokens);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".25");

        let tokens = tokenizer.tokenize("SAMPLE (.25)").unwrap();
        assert_eq!(tokens.len(), 4, "Expected 4 tokens for 'SAMPLE (.25)', got {:?}", tokens);
        assert_eq!(tokens[0].token_type, TokenType::Sample);
        assert_eq!(tokens[1].token_type, TokenType::LParen);
        assert_eq!(tokens[2].token_type, TokenType::Number);
        assert_eq!(tokens[2].text, ".25");
        assert_eq!(tokens[3].token_type, TokenType::RParen);

        let tokens = tokenizer.tokenize(".5e10").unwrap();
        assert_eq!(tokens.len(), 1, "Expected 1 token for '.5e10', got {:?}", tokens);
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].text, ".5e10");

        let tokens = tokenizer.tokenize("a.b").unwrap();
        assert_eq!(tokens.len(), 3, "Expected 3 tokens for 'a.b', got {:?}", tokens);
        assert_eq!(tokens[1].token_type, TokenType::Dot);
    }

    #[test]
    fn test_unrecognized_character() {
        let tokenizer = Tokenizer::default();

        let result = tokenizer.tokenize("SELECT \u{2018}hello\u{2019}");
        assert!(result.is_err(), "Should error on unrecognized character, got: {:?}", result);

        let result = tokenizer.tokenize("SELECT • FROM t");
        assert!(result.is_err());
    }

    #[test]
    fn test_colon_eq_tokenization() {
        let tokenizer = Tokenizer::default();

        let tokens = tokenizer.tokenize("a := 1").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].token_type, TokenType::Var);
        assert_eq!(tokens[1].token_type, TokenType::ColonEq);
        assert_eq!(tokens[2].token_type, TokenType::Number);

        let tokens = tokenizer.tokenize("a:b").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Colon));
        assert!(!tokens.iter().any(|t| t.token_type == TokenType::ColonEq));

        let tokens = tokenizer.tokenize("a::INT").unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::DColon));
    }

    #[test]
    fn test_colon_eq_parsing() {
        use crate::parser::Parser;
        use crate::generator::Generator;

        let ast = Parser::parse_sql("SELECT @var1 := 1, @var2").expect("Failed to parse MySQL @var := expr");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := 1, @var2");

        let ast = Parser::parse_sql("SELECT @var1, @var2 := @var1").expect("Failed to parse MySQL @var2 := @var1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1, @var2 := @var1");

        let ast = Parser::parse_sql("SELECT @var1 := COUNT(*) FROM t1").expect("Failed to parse MySQL @var := COUNT(*)");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT @var1 := COUNT(*) FROM t1");

        let ast = Parser::parse_sql("SET @var1 := 1").expect("Failed to parse SET @var1 := 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SET @var1 = 1");

        let ast = Parser::parse_sql("UNION_VALUE(k1 := 1)").expect("Failed to parse named arg with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "UNION_VALUE(k1 := 1)");

        let ast = Parser::parse_sql("SELECT UNNEST(col, recursive := TRUE) FROM t").expect("Failed to parse UNNEST with :=");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT UNNEST(col, recursive := TRUE) FROM t");

        let ast = Parser::parse_sql("SELECT foo: 1").expect("Failed to parse DuckDB prefix alias foo: 1");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo");

        let ast = Parser::parse_sql("SELECT foo: 1, bar: 2, baz: 3").expect("Failed to parse DuckDB multiple prefix aliases");
        let output = Generator::sql(&ast[0]).expect("Failed to generate");
        assert_eq!(output, "SELECT 1 AS foo, 2 AS bar, 3 AS baz");
    }

    #[test]
    fn test_colon_eq_dialect_roundtrip() {
        use crate::dialects::{Dialect, DialectType};

        fn check(dialect: DialectType, sql: &str, expected: Option<&str>) {
            let d = Dialect::get(dialect);
            let ast = d.parse(sql).unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d.transform(ast[0].clone()).unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d.generate(&transformed).unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        check(DialectType::MySQL, "SELECT @var1 := 1, @var2", None);
        check(DialectType::MySQL, "SELECT @var1, @var2 := @var1", None);
        check(DialectType::MySQL, "SELECT @var1 := COUNT(*) FROM t1", None);
        check(DialectType::MySQL, "SET @var1 := 1", Some("SET @var1 = 1"));

        check(DialectType::DuckDB, "SELECT UNNEST(col, recursive := TRUE) FROM t", None);
        check(DialectType::DuckDB, "UNION_VALUE(k1 := 1)", None);

        {
            // Parse-only check: a named argument inside a cast should not error.
            let d = Dialect::get(DialectType::DuckDB);
            let ast = d.parse("STRUCT_PACK(a := 'b')::json").expect("Failed to parse STRUCT_PACK(a := 'b')::json");
            assert!(!ast.is_empty(), "Empty AST for STRUCT_PACK(a := 'b')::json");
        }

        check(DialectType::DuckDB, "SELECT foo: 1", Some("SELECT 1 AS foo"));
        check(DialectType::DuckDB, "SELECT foo: 1, bar: 2, baz: 3", Some("SELECT 1 AS foo, 2 AS bar, 3 AS baz"));
    }

    #[test]
    fn test_comment_roundtrip() {
        use crate::parser::Parser;
        use crate::generator::Generator;

        fn check_roundtrip(sql: &str) -> Option<String> {
            let ast = match Parser::parse_sql(sql) {
                Ok(a) => a,
                Err(e) => return Some(format!("Parse error: {:?}", e)),
            };
            if ast.is_empty() {
                return Some("Empty AST".to_string());
            }
            let mut generator = Generator::default();
            let output = match generator.generate(&ast[0]) {
                Ok(o) => o,
                Err(e) => return Some(format!("Gen error: {:?}", e)),
            };
            if output == sql {
                None
            } else {
                Some(format!("Mismatch:\n input: {}\n output: {}", sql, output))
            }
        }

        let tests = vec![
            "SELECT c /* c1 /* c2 */ c3 */",
            "SELECT c /* c1 /* c2 /* c3 */ */ */",
            "SELECT c /* c1 */ AS alias /* c2 */",
            "SELECT a /* x */, b /* x */",
            "SELECT a /* x */ /* y */ /* z */, b /* k */ /* m */",
            "SELECT * FROM foo /* x */, bla /* x */",
            "SELECT 1 /* comment */ + 1",
            "SELECT 1 /* c1 */ + 2 /* c2 */",
            "SELECT 1 /* c1 */ + /* c2 */ 2 /* c3 */",
            "SELECT CAST(x AS INT) /* comment */ FROM foo",
            "SELECT FOO(x /* c */) /* FOO */, b /* b */",
            "SELECT x FROM a.b.c /* x */, e.f.g /* x */",
            "INSERT INTO t1 (tc1 /* tc1 */, tc2 /* tc2 */) SELECT c1 /* sc1 */, c2 /* sc2 */ FROM t",
            "/* c */ WITH x AS (SELECT 1) SELECT * FROM x",
            "/* comment1 */ INSERT INTO x /* comment2 */ VALUES (1, 2, 3)",
            "/* comment1 */ UPDATE tbl /* comment2 */ SET x = 2 WHERE x < 2",
            "/* comment1 */ DELETE FROM x /* comment2 */ WHERE y > 1",
            "/* comment */ CREATE TABLE foo AS SELECT 1",
            "INSERT INTO foo SELECT * FROM bar /* comment */",
            "SELECT FOO(x /* c1 */ + y /* c2 */ + BLA(5 /* c3 */)) FROM (VALUES (1 /* c4 */, \"test\" /* c5 */)) /* c6 */",
        ];

        let mut failures = Vec::new();
        for sql in tests {
            if let Some(e) = check_roundtrip(sql) {
                failures.push(e);
            }
        }

        if !failures.is_empty() {
            panic!("Comment roundtrip failures:\n{}", failures.join("\n\n"));
        }
    }

    #[test]
    fn test_dollar_quoted_string_parsing() {
        use crate::dialects::{Dialect, DialectType};

        let (tag, content) = super::parse_dollar_string_token("FOO\x00content here");
        assert_eq!(tag, Some("FOO".to_string()));
        assert_eq!(content, "content here");

        let (tag, content) = super::parse_dollar_string_token("just content");
        assert_eq!(tag, None);
        assert_eq!(content, "just content");

        fn check_databricks(sql: &str, expected: Option<&str>) {
            let d = Dialect::get(DialectType::Databricks);
            let ast = d.parse(sql).unwrap_or_else(|e| panic!("Parse error for '{}': {}", sql, e));
            assert!(!ast.is_empty(), "Empty AST for: {}", sql);
            let transformed = d.transform(ast[0].clone()).unwrap_or_else(|e| panic!("Transform error for '{}': {}", sql, e));
            let output = d.generate(&transformed).unwrap_or_else(|e| panic!("Generate error for '{}': {}", sql, e));
            let expected = expected.unwrap_or(sql);
            assert_eq!(output, expected, "Roundtrip failed for: {}", sql);
        }

        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $$def add_one(x):\n return x+1$$",
            None,
        );

        check_databricks(
            "CREATE FUNCTION add_one(x INT) RETURNS INT LANGUAGE PYTHON AS $FOO$def add_one(x):\n return x+1$FOO$",
            None,
        );
    }
}