erg_parser/
token.rs

1//! defines `Token` (The minimum unit in the Erg source code that serves as input to the parser).
2//!
3//! Token(パーサーへの入力となる、Ergソースコードにおける最小単位)を定義する
4use std::collections::VecDeque;
5use std::fmt;
6use std::hash::{Hash, Hasher};
7
8use erg_common::error::Location;
9use erg_common::impl_displayable_deque_stream_for_wrapper;
10use erg_common::opcode311::BinOpCode;
11use erg_common::str::Str;
12use erg_common::traits::{DequeStream, Locational};
13// use erg_common::ty::Type;
14// use erg_common::typaram::OpKind;
15// use erg_common::value::ValueObj;
16
17#[cfg(not(feature = "pylib"))]
18use erg_proc_macros::pyclass;
19#[cfg(feature = "pylib")]
20use pyo3::prelude::*;
21
/// The kind of a [`Token`].
///
/// Semantic names and names of the symbols themselves are mixed here;
/// this is a remnant of Python.
#[pyclass]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum TokenKind {
    /// e.g. i, p!, $s, T, `+`, `and`, 'd/dx'
    Symbol,
    /// e.g. 0, 1
    NatLit,
    /// e.g. -1, -2
    IntLit,
    /// e.g. 0b101
    BinLit,
    /// e.g. 0o777
    OctLit,
    /// e.g. 0xdeadbeef
    HexLit,
    RatioLit,
    BoolLit,
    StrLit,
    /// e.g. "abc\{
    StrInterpLeft,
    /// e.g. }abc\{
    StrInterpMid,
    /// e.g. }def"
    StrInterpRight,
    NoneLit,
    /// `...` (== Ellipsis)
    EllipsisLit,
    InfLit,
    DocComment,
    /// `+` (unary)
    PrePlus,
    /// `-` (unary)
    PreMinus,
    /// `~` (unary)
    PreBitNot,
    // PreAmp,    // & (unary)
    // PreAt,     // @ (unary)
    /// `!` (unary)
    Mutate,
    /// `*` (unary)
    PreStar,
    /// `**` (unary)
    PreDblStar,
    /// `?` (postfix)
    Try,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `//`
    FloorDiv,
    /// `**`
    Pow,
    /// `%`
    Mod,
    /// `..`
    Closed,
    /// `..<`
    RightOpen,
    /// `<..`
    LeftOpen,
    /// `<..<`
    Open,
    /// `&&`
    BitAnd,
    /// `||`
    BitOr,
    /// `^^`
    BitXor,
    /// `<<`
    Shl,
    /// `>>`
    Shr,
    /// `<`
    Less,
    /// `>`
    Gre,
    /// `<=`
    LessEq,
    /// `>=`
    GreEq,
    /// `==`
    DblEq,
    /// `!=`
    NotEq,
    /// `in`
    InOp,
    /// `notin`
    NotInOp,
    /// `contains`
    ContainsOp,
    /// `sub` (subtype of)
    SubOp,
    /// `is!`
    IsOp,
    /// `isnot!`
    IsNotOp,
    /// `and`
    AndOp,
    /// `or`
    OrOp,
    /// `ref` (special unary)
    RefOp,
    /// `ref!` (special unary)
    RefMutOp,
    /// `=`
    Assign,
    /// `<-`
    Inclusion,
    /// `:=`
    Walrus,
    /// `->`
    FuncArrow,
    /// `=>`
    ProcArrow,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LSqBr,
    /// `]`
    RSqBr,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// Categorized together with `(`, `[` and `{` by `category()`.
    Indent,
    /// Categorized together with `)`, `]` and `}` by `category()`.
    Dedent,
    /// `.`
    Dot,
    /// `|>`
    Pipe,
    /// `:`
    Colon,
    /// `::`
    DblColon,
    /// `:>`
    SupertypeOf,
    /// `<:`
    SubtypeOf,
    /// `as`
    As,
    /// `,`
    Comma,
    /// `^`
    Caret,
    /// `&`
    Amper,
    /// `@`
    AtSign,
    /// `|`
    VBar,
    /// `_`
    UBar,
    /// `\n`
    Newline,
    /// `;`
    Semi,
    Illegal,
    /// Beginning Of File
    BOF,
    /// End Of File
    EOF,
}
190
191use TokenKind::*;
192
/// Coarse grouping of [`TokenKind`]s, as returned by `TokenKind::category`.
///
/// The symbols listed on each variant are the kinds that `TokenKind::category`
/// maps to it.
#[pyclass]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenCategory {
    Symbol,
    Literal,
    StrInterpLeft,
    StrInterpMid,
    StrInterpRight,
    /// Every kind that is not assigned to one of the other categories
    /// (arithmetic, bitwise, range, comparison and logical operators).
    BinOp,
    /// (unary) `+` `-` `~` `!` `*` `**` `ref` `ref!`
    UnaryOp,
    /// `?`
    // NOTE(review): this used to list `<..` and `..` as well, but
    // `TokenKind::category` only maps `Try` here.
    PostfixOp,
    /// `(` `[` `{` Indent
    LEnclosure,
    /// `)` `]` `}` Dedent
    REnclosure,
    /// `,` `:` `::` `:>` `<:` `as` `.` `|>` `:=` `<-`
    SpecialBinOp,
    /// `=`
    DefOp,
    /// `->` `=>`
    LambdaOp,
    /// `\n` `;`
    Separator,
    /// `^` `&`
    Reserved,
    /// `@`
    AtSign,
    /// `|`
    VBar,
    /// `_`
    UBar,
    BOF,
    EOF,
    Illegal,
}
229
230impl fmt::Display for TokenCategory {
231    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
232        write!(f, "{self:?}")
233    }
234}
235
236impl TokenCategory {
237    pub const fn is_block_op(&self) -> bool {
238        matches!(self, Self::DefOp | Self::LambdaOp)
239    }
240}
241
242impl TokenKind {
243    pub const fn category(&self) -> TokenCategory {
244        match self {
245            Symbol => TokenCategory::Symbol,
246            NatLit | BinLit | OctLit | HexLit | IntLit | RatioLit | StrLit | BoolLit | NoneLit
247            | EllipsisLit | InfLit | DocComment => TokenCategory::Literal,
248            StrInterpLeft => TokenCategory::StrInterpLeft,
249            StrInterpMid => TokenCategory::StrInterpMid,
250            StrInterpRight => TokenCategory::StrInterpRight,
251            PrePlus | PreMinus | PreBitNot | Mutate | PreStar | PreDblStar | RefOp | RefMutOp => {
252                TokenCategory::UnaryOp
253            }
254            Try => TokenCategory::PostfixOp,
255            Comma | Colon | DblColon | SupertypeOf | SubtypeOf | As | Dot | Pipe | Walrus
256            | Inclusion => TokenCategory::SpecialBinOp,
257            Assign => TokenCategory::DefOp,
258            FuncArrow | ProcArrow => TokenCategory::LambdaOp,
259            Semi | Newline => TokenCategory::Separator,
260            LParen | LBrace | LSqBr | Indent => TokenCategory::LEnclosure,
261            RParen | RBrace | RSqBr | Dedent => TokenCategory::REnclosure,
262            Caret | Amper => TokenCategory::Reserved,
263            AtSign => TokenCategory::AtSign,
264            VBar => TokenCategory::VBar,
265            UBar => TokenCategory::UBar,
266            BOF => TokenCategory::BOF,
267            EOF => TokenCategory::EOF,
268            Illegal => TokenCategory::Illegal,
269            _ => TokenCategory::BinOp,
270        }
271    }
272
273    pub const fn precedence(&self) -> Option<usize> {
274        let prec = match self {
275            Dot | DblColon => 200,                                    // .
276            Pow => 190,                                               // **
277            PrePlus | PreMinus | PreBitNot | RefOp | RefMutOp => 180, // (unary) + - * ~ ref ref!
278            Star | Slash | FloorDiv | Mod => 170,                     // * / // %
279            Plus | Minus => 160,                                      // + -
280            Shl | Shr => 150,                                         // << >>
281            BitAnd => 140,                                            // &&
282            BitXor => 130,                                            // ^^
283            BitOr => 120,                                             // ||
284            Closed | LeftOpen | RightOpen | Open => 100,              // range operators
285            Less | Gre | LessEq | GreEq | DblEq | NotEq | InOp | NotInOp | ContainsOp | IsOp
286            | IsNotOp => 90, // < > <= >= == != in notin contains is isnot
287            AndOp => 80,                                              // and
288            OrOp => 70,                                               // or
289            FuncArrow | ProcArrow | Inclusion => 60,                  // -> => <-
290            Colon | SupertypeOf | SubtypeOf | As => 50,               // : :> <: as
291            Comma => 40,                                              // ,
292            Assign | Walrus => 20,                                    // = :=
293            Newline | Semi => 10,                                     // \n ;
294            LParen | LBrace | LSqBr | Indent => 0,                    // ( { [ Indent
295            _ => return None,
296        };
297        Some(prec)
298    }
299
300    pub const fn is_right_associative(&self) -> bool {
301        matches!(
302            self,
303            FuncArrow | ProcArrow | Assign /* | PreDollar | PreAt */
304        )
305    }
306
307    pub const fn is_range_op(&self) -> bool {
308        matches!(self, Closed | LeftOpen | RightOpen | Open)
309    }
310}
311
312impl fmt::Display for TokenKind {
313    #[inline]
314    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
315        write!(f, "{self:?}")
316    }
317}
318
319impl From<TokenKind> for BinOpCode {
320    fn from(tk: TokenKind) -> Self {
321        match tk {
322            Plus => BinOpCode::Add,
323            Minus => BinOpCode::Subtract,
324            Star => BinOpCode::Multiply,
325            Slash => BinOpCode::TrueDivide,
326            FloorDiv => BinOpCode::FloorDiv,
327            Mod => BinOpCode::Remainder,
328            Pow => BinOpCode::Power,
329            BitAnd => BinOpCode::And,
330            BitOr => BinOpCode::Or,
331            BitXor => BinOpCode::Xor,
332            Shl => BinOpCode::LShift,
333            Shr => BinOpCode::RShift,
334            _ => panic!("invalid token kind for binop"),
335        }
336    }
337}
338
/// A single token: its kind, its text and its position in the source.
///
/// `PartialEq` and `Hash` are implemented by hand further down in this file
/// and only look at `kind` and `content`; the position fields are ignored.
#[pyclass(get_all, set_all)]
#[derive(Clone, Eq)]
pub struct Token {
    /// The kind of this token.
    pub kind: TokenKind,
    /// The text of this token.
    pub content: Str,
    /// 1 origin
    // (the position-less constructors in this file store 0 here)
    // TODO: multi-line string literals exist too, so would a tuple be more appropriate?
    pub lineno: u32,
    /// A pointer from which the token starts (0 origin)
    pub col_begin: u32,
    /// A pointer to the end position of the token.
    /// `col_end - col_begin` does not necessarily equal `content.len()`
    pub col_end: u32,
}
353
// Ready-made tokens built with `Token::dummy`, i.e. without a source position
// (line and columns are all 0).

/// The `:` token.
pub const COLON: Token = Token::dummy(TokenKind::Colon, ":");
/// The `as` token.
pub const AS: Token = Token::dummy(TokenKind::As, "as");
/// The `.` token.
pub const DOT: Token = Token::dummy(TokenKind::Dot, ".");
/// The `=` token.
pub const EQUAL: Token = Token::dummy(TokenKind::Assign, "=");
358
359impl fmt::Debug for Token {
360    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
361        f.debug_struct("Token")
362            .field("kind", &self.kind)
363            .field("content", &self.content.replace('\n', "\\n"))
364            .field("lineno", &self.lineno)
365            .field("col_begin", &self.col_begin)
366            .field("col_end", &self.col_end)
367            .finish()
368    }
369}
370
371impl fmt::Display for Token {
372    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
373        write!(f, "{:?} {}", self.kind, self.content.replace('\n', "\\n"))
374    }
375}
376
377// the values of lineno and col are not relevant for comparison
378// use `deep_eq` if you want to compare them
379impl PartialEq for Token {
380    #[inline]
381    fn eq(&self, other: &Self) -> bool {
382        self.is(other.kind) && self.content == other.content
383    }
384}
385
386impl Hash for Token {
387    fn hash<H: Hasher>(&self, state: &mut H) {
388        self.kind.hash(state);
389        self.content.hash(state);
390    }
391}
392
393impl Locational for Token {
394    fn loc(&self) -> Location {
395        if self.lineno == 0 {
396            Location::Unknown
397        } else {
398            Location::range(self.lineno, self.col_begin, self.lineno, self.col_end)
399        }
400    }
401
402    #[inline]
403    fn col_end(&self) -> Option<u32> {
404        Some(self.col_begin + self.content.len() as u32)
405    }
406}
407
408impl Token {
409    pub const DUMMY: Token = Token {
410        kind: TokenKind::Illegal,
411        content: Str::ever("DUMMY"),
412        lineno: 0,
413        col_begin: 0,
414        col_end: 0,
415    };
416
417    pub const fn dummy(kind: TokenKind, content: &'static str) -> Self {
418        Self {
419            kind,
420            content: Str::ever(content),
421            lineno: 0,
422            col_begin: 0,
423            col_end: 0,
424        }
425    }
426
427    #[inline]
428    pub fn new<S: Into<Str>>(kind: TokenKind, cont: S, lineno: u32, col_begin: u32) -> Self {
429        let content = cont.into();
430        let col_end = col_begin + content.chars().count() as u32;
431        Token {
432            kind,
433            content,
434            lineno,
435            col_begin,
436            col_end,
437        }
438    }
439
440    #[inline]
441    pub fn new_fake<S: Into<Str>>(
442        kind: TokenKind,
443        cont: S,
444        lineno: u32,
445        col_begin: u32,
446        col_end: u32,
447    ) -> Self {
448        Token {
449            kind,
450            content: cont.into(),
451            lineno,
452            col_begin,
453            col_end,
454        }
455    }
456
457    pub fn new_with_loc(kind: TokenKind, cont: impl Into<Str>, loc: Location) -> Self {
458        Token {
459            kind,
460            content: cont.into(),
461            lineno: loc.ln_begin().unwrap_or(0),
462            col_begin: loc.col_begin().unwrap_or(0),
463            col_end: loc.col_end().unwrap_or(1),
464        }
465    }
466
467    #[inline]
468    pub fn from_str(kind: TokenKind, cont: &str) -> Self {
469        Token {
470            kind,
471            content: Str::rc(cont),
472            lineno: 0,
473            col_begin: 0,
474            col_end: 0,
475        }
476    }
477
478    #[inline]
479    pub fn symbol(cont: &str) -> Self {
480        Self::from_str(TokenKind::Symbol, cont)
481    }
482
483    #[inline]
484    pub fn symbol_with_line(cont: &str, lineno: u32) -> Self {
485        Token {
486            kind: TokenKind::Symbol,
487            content: Str::rc(cont),
488            lineno,
489            col_begin: 0,
490            col_end: 1,
491        }
492    }
493
494    pub fn symbol_with_loc<S: Into<Str>>(cont: S, loc: Location) -> Self {
495        Token {
496            kind: TokenKind::Symbol,
497            content: cont.into(),
498            lineno: loc.ln_begin().unwrap_or(0),
499            col_begin: loc.col_begin().unwrap_or(0),
500            col_end: loc.col_end().unwrap_or(1),
501        }
502    }
503
504    pub const fn static_symbol(s: &'static str) -> Self {
505        Token {
506            kind: TokenKind::Symbol,
507            content: Str::ever(s),
508            lineno: 0,
509            col_begin: 0,
510            col_end: 1,
511        }
512    }
513
514    pub fn deep_eq(&self, other: &Self) -> bool {
515        self.kind == other.kind
516            && self.content == other.content
517            && self.lineno == other.lineno
518            && self.col_begin == other.col_begin
519    }
520
521    pub fn loc(&self) -> Location {
522        Locational::loc(self)
523    }
524
525    pub const fn category(&self) -> TokenCategory {
526        self.kind.category()
527    }
528
529    pub fn category_is(&self, category: TokenCategory) -> bool {
530        self.kind.category() == category
531    }
532
533    pub fn is(&self, kind: TokenKind) -> bool {
534        self.kind == kind
535    }
536
537    pub fn is_number(&self) -> bool {
538        matches!(
539            self.kind,
540            NatLit | IntLit | BinLit | OctLit | HexLit | RatioLit | InfLit
541        )
542    }
543
544    pub fn is_str(&self) -> bool {
545        matches!(self.kind, StrLit | DocComment)
546    }
547
548    pub const fn is_block_op(&self) -> bool {
549        self.category().is_block_op()
550    }
551
552    pub const fn inspect(&self) -> &Str {
553        &self.content
554    }
555
556    pub fn is_procedural(&self) -> bool {
557        self.inspect().ends_with('!')
558    }
559
560    pub fn is_const(&self) -> bool {
561        self.inspect().is_uppercase()
562    }
563}
564
/// A sequence of [`Token`]s, stored in a `VecDeque`.
#[pyclass]
#[derive(Debug, Clone)]
pub struct TokenStream(VecDeque<Token>);

// The trait impls for the wrapper are generated by this `erg_common` macro.
// NOTE(review): judging by its name it provides the `DequeStream` and display
// impls — confirm against the macro definition.
impl_displayable_deque_stream_for_wrapper!(TokenStream, Token);