Skip to main content

perl_lexer/
token.rs

1//! Token types and structures for the Perl lexer
2
3use std::sync::Arc;
4
/// Parts of an interpolated string
///
/// Each variant carries its text as a shared `Arc<str>`, so cloning a part
/// only bumps a reference count. `Eq` and `Hash` are derived alongside
/// `PartialEq` so parts can be stored in hashed collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum StringPart {
    /// Literal text
    Literal(Arc<str>),
    /// Variable interpolation: $var, @array, %hash
    Variable(Arc<str>),
    /// Expression interpolation: ${expr}, @{expr}
    Expression(Arc<str>),
    /// Method call: `->method()`
    MethodCall(Arc<str>),
    /// Array slice: [1..3]
    ArraySlice(Arc<str>),
}

/// Token types for Perl
///
/// `Eq` and `Hash` are derived alongside `PartialEq` so token kinds can be
/// used as keys in `HashMap`/`HashSet`; every payload type used here
/// (`Arc<str>`, `Vec<StringPart>`) supports both.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TokenType {
    // Slash-derived tokens
    /// Division operator: /
    Division,
    /// Regex match: m// or //
    RegexMatch,
    /// Substitution: s///
    Substitution,
    /// Transliteration: tr/// or y///
    Transliteration,
    /// Quote regex: qr//
    QuoteRegex,

    // String and quote tokens
    /// String literal: "string" or 'string'
    StringLiteral,
    /// Single quote: q//
    QuoteSingle,
    /// Double quote: qq//
    QuoteDouble,
    /// Quote words: qw//
    QuoteWords,
    /// Quote command: qx// or `backticks`
    QuoteCommand,

    // String interpolation tokens
    /// String with interpolated parts, in the order stored in the vector
    InterpolatedString(Vec<StringPart>),

    // Heredoc tokens
    /// Heredoc start: <<EOF or <<'EOF'
    HeredocStart,
    /// Heredoc body content
    HeredocBody(Arc<str>),

    // Format declarations
    /// Format body content
    FormatBody(Arc<str>),

    // Version strings
    /// Version string: v5.32.0
    Version(Arc<str>),

    // POD documentation
    /// POD documentation block
    Pod,

    // Data sections
    /// Data section marker: __DATA__ or __END__
    DataMarker(Arc<str>),
    /// Data section body content
    DataBody(Arc<str>),

    // Error recovery
    /// Unknown rest of input (used when budget exceeded)
    UnknownRest,

    // Identifiers and literals
    /// Identifier or variable name
    Identifier(Arc<str>),
    /// Numeric literal
    Number(Arc<str>),
    /// Operator
    Operator(Arc<str>),
    /// Keyword
    Keyword(Arc<str>),

    // Delimiters
    /// Left parenthesis: (
    LeftParen,
    /// Right parenthesis: )
    RightParen,
    /// Left bracket: [
    LeftBracket,
    /// Right bracket: ]
    RightBracket,
    /// Left brace: {
    LeftBrace,
    /// Right brace: }
    RightBrace,

    // Punctuation
    /// Semicolon: ;
    Semicolon,
    /// Comma: ,
    Comma,
    /// Colon: :
    Colon,
    /// Arrow: ->
    Arrow,
    /// Fat comma: =>
    FatComma,

    // Whitespace and comments
    /// Whitespace (usually not returned)
    Whitespace,
    /// Newline character
    Newline,
    /// Comment text
    Comment(Arc<str>),

    // Special tokens
    /// End of file
    EOF,
    /// Error token for invalid input
    Error(Arc<str>),
}
129
130/// Token with position information
131#[derive(Debug, Clone)]
132pub struct Token {
133    /// The type of token
134    pub token_type: TokenType,
135    /// The actual text of the token
136    pub text: Arc<str>,
137    /// Start position in the input
138    pub start: usize,
139    /// End position in the input
140    pub end: usize,
141}
142
143impl Token {
144    /// Create a new token
145    pub fn new(token_type: TokenType, text: impl Into<Arc<str>>, start: usize, end: usize) -> Self {
146        Self { token_type, text: text.into(), start, end }
147    }
148
149    /// Get the length of the token
150    pub fn len(&self) -> usize {
151        self.end - self.start
152    }
153
154    /// Check if the token is empty
155    pub fn is_empty(&self) -> bool {
156        self.start == self.end
157    }
158}