regen/core/token.rs

//! Core logic of tokens and token rules

use crate::core::ExtractFromLiteral;
use crate::grammar::{pt, Ctx, Tok};
use crate::sdk::Error;

/// Definition for a token type used by the tokenizer
///
/// Each token definition has a name and a flag indicating whether it should
/// be extracted. Extracted tokens (for example, comments) are not used in
/// AST generation, while ignored tokens are removed entirely from the rest
/// of the parsing process.
///
/// Note that this should not be confused with `Token` in the SDK, which is
/// the runtime token data structure produced while parsing.
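///
/// # Example
///
/// A minimal sketch of constructing a definition for an extracted token
/// type (the name is illustrative):
///
/// ```ignore
/// let comment = TokenDef {
///     name: "Comment".to_owned(),
///     is_extract: true, // kept out of AST generation
/// };
/// ```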
#[derive(Debug, Clone)]
pub struct TokenDef {
    /// Token type name
    pub name: String,
    /// If the token type should be extracted
    pub is_extract: bool,
}

/// Definition for a token rule used by the tokenizer
///
/// At each tokenization step, the tokenizer tries to match the input against
/// the token rules; the longest match produces the next token.
///
/// If no rule matches, one character is skipped and added to the list of
/// unmatchable tokens.
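///
/// # Example
///
/// A minimal sketch of the longest-match policy described above.
/// `match_len` is a hypothetical helper (not part of this module) that
/// returns how many input bytes a rule matches, if any:
///
/// ```ignore
/// fn longest_match(rules: &[TokenRule], input: &str) -> Option<usize> {
///     rules
///         .iter()
///         .filter_map(|rule| match_len(rule, input))
///         .max()
/// }
/// ```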
#[derive(Debug, Clone)]
pub enum TokenRule {
    /// Ignore rule matching a literal
    IgnoreLiteral(String /* literal */),
    /// Ignore rule matching a regular expression
    IgnoreRegExp(String /* regex */),
    /// Token rule matching a literal
    Literal(String /* token_type */, String /* literal */),
    /// Token rule matching a regular expression
    RegExp(String /* token_type */, String /* regex */),
}

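/// Parse a token type definition from the parse tree.
///
/// Registers a [`TokenDef`] with the context. Extracted token types are
/// additionally tagged as `unused`, since they do not participate in AST
/// generation. Defining the same token type twice produces an error.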
pub fn parse_token_def(pt: &pt::DefineTokenTypeStatement, ctx: &mut Ctx) -> Option<()> {
    if pt.m_kw_extract {
        // Extracted tokens are not used in AST generation; tag the token
        // type so downstream stages can treat it as unused.
        ctx.tbs.set(
            &pt.ast.m_token_type,
            Tok::Decor {
                tag: "unused".to_owned(),
                base: Box::new(Tok::SToken),
            },
        )
    }

    if !ctx.val.add_token(TokenDef {
        name: pt.m_token_type.clone(),
        is_extract: pt.m_kw_extract,
    }) {
        let name = &pt.m_token_type;
        let msg = format!("Duplicate token definition: {name}");
        let help = "Remove or rename the duplicate definition".to_owned();
        ctx.err
            .push(Error::from_token(&pt.ast.m_token_type, msg, help));
    }
    None
}

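/// Parse an ignore token rule from the parse tree.
///
/// Converts the matched literal or regular expression into the
/// corresponding ignore variant of [`TokenRule`] and registers it with the
/// context.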
pub fn parse_token_ignore_rule(
    pt: &pt::DefineIgnoreTokenRuleStatement,
    ctx: &mut Ctx,
) -> Option<()> {
    let token_rule = match pt.m_value.as_ref() {
        pt::LiteralOrRegExp::TokenLiteral(literal) => {
            TokenRule::IgnoreLiteral(literal.m_t.strip_quotes())
        }
        pt::LiteralOrRegExp::TokenRegExp(regexp) => {
            TokenRule::IgnoreRegExp(regexp.m_t.strip_and_escape_regex())
        }
    };

    ctx.val.add_token_rule(token_rule);
    None
}

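/// Parse a token rule for a named token type from the parse tree.
///
/// Converts the matched literal or regular expression into the
/// corresponding [`TokenRule`] variant for `pt.m_token_type` and registers
/// it with the context.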
pub fn parse_token_rule(pt: &pt::DefineTokenRuleStatement, ctx: &mut Ctx) -> Option<()> {
    let token_rule = match pt.m_value.as_ref() {
        pt::LiteralOrRegExp::TokenLiteral(literal) => {
            TokenRule::Literal(pt.m_token_type.clone(), literal.m_t.strip_quotes())
        }
        pt::LiteralOrRegExp::TokenRegExp(regexp) => {
            TokenRule::RegExp(pt.m_token_type.clone(), regexp.m_t.strip_and_escape_regex())
        }
    };

    ctx.val.add_token_rule(token_rule);
    None
}