perl_token/lib.rs
1//! Perl token definitions shared across the parser ecosystem.
2//!
3//! This crate defines [`Token`] and [`TokenKind`], the fundamental types that
4//! flow from the lexer (`perl-lexer`) into the parser (`perl-parser-core`).
5//! Downstream crates re-export these types so consumers rarely need to depend
6//! on `perl-token` directly.
7//!
8//! # Examples
9//!
10//! Create and inspect tokens:
11//!
12//! ```rust
13//! use perl_token::{Token, TokenKind};
14//!
15//! // Create a keyword token for `my`
16//! let token = Token::new(TokenKind::My, "my", 0, 2);
17//! assert_eq!(token.kind, TokenKind::My);
18//! assert_eq!(&*token.text, "my");
19//! assert_eq!(token.start, 0);
20//! assert_eq!(token.end, 2);
21//!
22//! // Create a numeric literal token
23//! let num = Token::new(TokenKind::Number, "42", 7, 9);
24//! assert_eq!(num.kind, TokenKind::Number);
25//! assert_eq!(&*num.text, "42");
26//! ```
27//!
28//! Use [`TokenKind::display_name`] for user-facing error messages:
29//!
30//! ```rust
31//! use perl_token::TokenKind;
32//!
33//! assert_eq!(TokenKind::LeftBrace.display_name(), "'{'");
34//! assert_eq!(TokenKind::Identifier.display_name(), "identifier");
35//! assert_eq!(TokenKind::Eof.display_name(), "end of input");
36//! ```
37
38use std::sync::Arc;
39
40/// Token produced by the lexer and consumed by the parser.
41///
42/// Stores the token kind, original source text, and byte span. The text is kept
43/// in an `Arc<str>` so buffering and lookahead can clone tokens cheaply.
44#[derive(Debug, Clone, PartialEq)]
45pub struct Token {
46 /// Token classification for parser decision making
47 pub kind: TokenKind,
48 /// Original source text for precise reconstruction
49 pub text: Arc<str>,
50 /// Starting byte position for error reporting and location tracking
51 pub start: usize,
52 /// Ending byte position for span calculation and navigation
53 pub end: usize,
54}
55
56impl Token {
57 /// Create a new token with the given kind, source text, and byte span.
58 ///
59 /// # Examples
60 ///
61 /// ```rust
62 /// use perl_token::{Token, TokenKind};
63 ///
64 /// let tok = Token::new(TokenKind::Sub, "sub", 0, 3);
65 /// assert_eq!(tok.kind, TokenKind::Sub);
66 /// assert_eq!(&*tok.text, "sub");
67 /// ```
68 pub fn new(kind: TokenKind, text: impl Into<Arc<str>>, start: usize, end: usize) -> Self {
69 Token { kind, text: text.into(), start, end }
70 }
71}
72
/// Token classification for Perl parsing.
///
/// The set is intentionally simplified for fast parser matching while covering
/// keywords, operators, delimiters, literals, identifiers, and special tokens.
///
/// Use [`TokenKind::display_name`] to get a human-readable string suitable for
/// error messages shown to the user.
///
/// The type is `Copy`, `Eq`, and `Hash`, so kinds can be passed by value,
/// compared directly, and used as keys in `HashMap` / `HashSet`.
///
/// # Categories
///
/// | Group | Examples |
/// |-------|----------|
/// | Keywords | [`My`](Self::My), [`Sub`](Self::Sub), [`If`](Self::If), ... |
/// | Operators | [`Plus`](Self::Plus), [`Arrow`](Self::Arrow), [`And`](Self::And), ... |
/// | Delimiters | [`LeftParen`](Self::LeftParen), [`LeftBrace`](Self::LeftBrace), ... |
/// | Literals | [`Number`](Self::Number), [`String`](Self::String), [`Regex`](Self::Regex), ... |
/// | Identifiers | [`Identifier`](Self::Identifier), [`ScalarSigil`](Self::ScalarSigil), ... |
/// | Special | [`Eof`](Self::Eof), [`Unknown`](Self::Unknown) |
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
    // ===== Keywords =====
    /// Lexical variable declaration: `my $x`
    My,
    /// Package variable declaration: `our $x`
    Our,
    /// Dynamic scoping: `local $x`
    Local,
    /// Persistent variable: `state $x`
    State,
    /// Subroutine declaration: `sub foo`
    Sub,
    /// Conditional: `if (cond)`
    If,
    /// Else-if conditional: `elsif (cond)`
    Elsif,
    /// Else branch: `else { }`
    Else,
    /// Negated conditional: `unless (cond)`
    Unless,
    /// While loop: `while (cond)`
    While,
    /// Until loop: `until (cond)`
    Until,
    /// C-style for loop: `for (init; cond; update)`
    For,
    /// Iterator loop: `foreach $x (@list)`
    Foreach,
    /// Return statement: `return $value`
    Return,
    /// Package declaration: `package Foo`
    Package,
    /// Module import: `use Module`
    Use,
    /// Disable pragma/module: `no strict`
    No,
    /// Compile-time block: `BEGIN { }`
    Begin,
    /// Exit-time block: `END { }`
    End,
    /// Check phase block: `CHECK { }`
    Check,
    /// Init phase block: `INIT { }`
    Init,
    /// Unit check block: `UNITCHECK { }`
    Unitcheck,
    /// Exception handling: `eval { }`
    Eval,
    /// Block execution: `do { }` or `do "file"`
    Do,
    /// Switch expression: `given ($x)`
    Given,
    /// Case clause: `when ($pattern)`
    When,
    /// Default case: `default { }`
    Default,
    /// Try block: `try { }`
    Try,
    /// Catch block: `catch ($e) { }`
    Catch,
    /// Finally block: `finally { }`
    Finally,
    /// Continue block: `continue { }`
    Continue,
    /// Loop control: `next`
    Next,
    /// Loop control: `last`
    Last,
    /// Loop control: `redo`
    Redo,
    /// Goto statement: `goto LABEL`, `goto &sub`, `goto EXPR`
    Goto,
    /// Class declaration (5.38+): `class Foo`
    Class,
    /// Method declaration (5.38+): `method foo`
    Method,
    /// Class field declaration (5.38+): `field $name`
    Field,
    /// Format declaration: `format STDOUT =`
    Format,
    /// Undefined value: `undef`
    Undef,

    // ===== Operators =====
    /// Assignment: `=`
    Assign,
    /// Addition: `+`
    Plus,
    /// Subtraction: `-`
    Minus,
    /// Multiplication: `*`
    Star,
    /// Division: `/`
    Slash,
    /// Modulo: `%`
    Percent,
    /// Exponentiation: `**`
    Power,
    /// Left bit shift: `<<`
    LeftShift,
    /// Right bit shift: `>>`
    RightShift,
    /// Bitwise AND: `&`
    BitwiseAnd,
    /// Bitwise OR: `|`
    BitwiseOr,
    /// Bitwise XOR: `^`
    BitwiseXor,
    /// Bitwise NOT: `~`
    BitwiseNot,
    /// Add and assign: `+=`
    PlusAssign,
    /// Subtract and assign: `-=`
    MinusAssign,
    /// Multiply and assign: `*=`
    StarAssign,
    /// Divide and assign: `/=`
    SlashAssign,
    /// Modulo and assign: `%=`
    PercentAssign,
    /// Concatenate and assign: `.=`
    DotAssign,
    /// Bitwise AND and assign: `&=`
    AndAssign,
    /// Bitwise OR and assign: `|=`
    OrAssign,
    /// Bitwise XOR and assign: `^=`
    XorAssign,
    /// Power and assign: `**=`
    PowerAssign,
    /// Left shift and assign: `<<=`
    LeftShiftAssign,
    /// Right shift and assign: `>>=`
    RightShiftAssign,
    /// Logical AND and assign: `&&=`
    LogicalAndAssign,
    /// Logical OR and assign: `||=`
    LogicalOrAssign,
    /// Defined-or and assign: `//=`
    DefinedOrAssign,
    /// Numeric equality: `==`
    Equal,
    /// Numeric inequality: `!=`
    NotEqual,
    /// Pattern match binding: `=~`
    Match,
    /// Negated pattern match: `!~`
    NotMatch,
    /// Smart match: `~~`
    SmartMatch,
    /// Less than: `<`
    Less,
    /// Greater than: `>`
    Greater,
    /// Less than or equal: `<=`
    LessEqual,
    /// Greater than or equal: `>=`
    GreaterEqual,
    /// Numeric comparison (spaceship): `<=>`
    Spaceship,
    /// String comparison: `cmp`
    StringCompare,
    /// Logical AND: `&&`
    And,
    /// Logical OR: `||`
    Or,
    /// Logical NOT: `!`
    Not,
    /// Defined-or: `//`
    DefinedOr,
    /// Word AND operator: `and`
    WordAnd,
    /// Word OR operator: `or`
    WordOr,
    /// Word NOT operator: `not`
    WordNot,
    /// Word XOR operator: `xor`
    WordXor,
    /// Method/dereference arrow: `->`
    Arrow,
    /// Hash key separator: `=>`
    FatArrow,
    /// String concatenation: `.`
    Dot,
    /// Range operator: `..`
    Range,
    /// Yada-yada (unimplemented): `...`
    Ellipsis,
    /// Increment: `++`
    Increment,
    /// Decrement: `--`
    Decrement,
    /// Package separator: `::`
    DoubleColon,
    /// Ternary condition: `?`
    Question,
    /// Ternary/label separator: `:`
    Colon,
    /// Reference operator: `\`
    Backslash,

    // ===== Delimiters =====
    /// Left parenthesis: `(`
    LeftParen,
    /// Right parenthesis: `)`
    RightParen,
    /// Left brace: `{`
    LeftBrace,
    /// Right brace: `}`
    RightBrace,
    /// Left bracket: `[`
    LeftBracket,
    /// Right bracket: `]`
    RightBracket,
    /// Statement terminator: `;`
    Semicolon,
    /// List separator: `,`
    Comma,

    // ===== Literals =====
    /// Numeric literal: `42`, `3.14`, `0xFF`
    Number,
    /// String literal: `"hello"` or `'world'`
    String,
    /// Regular expression: `/pattern/flags`
    Regex,
    /// Substitution: `s/pattern/replacement/flags`
    Substitution,
    /// Transliteration: `tr/abc/xyz/` or `y///`
    Transliteration,
    /// Single-quoted string: `q/text/`
    QuoteSingle,
    /// Double-quoted string: `qq/text/`
    QuoteDouble,
    /// Quote words: `qw(list of words)`
    QuoteWords,
    /// Backtick command: `` `cmd` `` or `qx/cmd/`
    QuoteCommand,
    /// Heredoc start marker: `<<EOF`
    HeredocStart,
    /// Heredoc content body
    HeredocBody,
    /// Format specification body
    FormatBody,
    /// Data section marker: `__DATA__` or `__END__`
    DataMarker,
    /// Data section content
    DataBody,
    /// Version string literal: `v5.26.0`, `v5.10`
    VString,
    /// Unparsed remainder (budget exceeded)
    UnknownRest,
    /// Heredoc depth limit exceeded (special error token)
    HeredocDepthLimit,

    // ===== Identifiers and Variables =====
    /// Bareword identifier or function name
    Identifier,
    /// Scalar sigil: `$`
    ScalarSigil,
    /// Array sigil: `@`
    ArraySigil,
    /// Hash sigil: `%`
    HashSigil,
    /// Subroutine sigil: `&`
    SubSigil,
    /// Glob/typeglob sigil: `*`
    GlobSigil,

    // ===== Special =====
    /// End of file/input
    Eof,
    /// Unknown/unrecognized token
    Unknown,
}
367
368impl TokenKind {
369 /// Return a user-friendly display name for this token kind.
370 ///
371 /// These names appear in parser error messages shown in the editor.
372 /// They use the actual Perl syntax (e.g. `}` instead of `RightBrace`)
373 /// so users can immediately understand what the parser expected.
374 ///
375 /// # Examples
376 ///
377 /// ```rust
378 /// use perl_token::TokenKind;
379 ///
380 /// assert_eq!(TokenKind::Semicolon.display_name(), "';'");
381 /// assert_eq!(TokenKind::Sub.display_name(), "'sub'");
382 /// assert_eq!(TokenKind::Number.display_name(), "number");
383 /// ```
384 pub fn display_name(self) -> &'static str {
385 match self {
386 // Keywords
387 TokenKind::My => "'my'",
388 TokenKind::Our => "'our'",
389 TokenKind::Local => "'local'",
390 TokenKind::State => "'state'",
391 TokenKind::Sub => "'sub'",
392 TokenKind::If => "'if'",
393 TokenKind::Elsif => "'elsif'",
394 TokenKind::Else => "'else'",
395 TokenKind::Unless => "'unless'",
396 TokenKind::While => "'while'",
397 TokenKind::Until => "'until'",
398 TokenKind::For => "'for'",
399 TokenKind::Foreach => "'foreach'",
400 TokenKind::Return => "'return'",
401 TokenKind::Package => "'package'",
402 TokenKind::Use => "'use'",
403 TokenKind::No => "'no'",
404 TokenKind::Begin => "'BEGIN'",
405 TokenKind::End => "'END'",
406 TokenKind::Check => "'CHECK'",
407 TokenKind::Init => "'INIT'",
408 TokenKind::Unitcheck => "'UNITCHECK'",
409 TokenKind::Eval => "'eval'",
410 TokenKind::Do => "'do'",
411 TokenKind::Given => "'given'",
412 TokenKind::When => "'when'",
413 TokenKind::Default => "'default'",
414 TokenKind::Try => "'try'",
415 TokenKind::Catch => "'catch'",
416 TokenKind::Finally => "'finally'",
417 TokenKind::Continue => "'continue'",
418 TokenKind::Next => "'next'",
419 TokenKind::Last => "'last'",
420 TokenKind::Redo => "'redo'",
421 TokenKind::Goto => "'goto'",
422 TokenKind::Class => "'class'",
423 TokenKind::Method => "'method'",
424 TokenKind::Field => "'field'",
425 TokenKind::Format => "'format'",
426 TokenKind::Undef => "'undef'",
427
428 // Operators
429 TokenKind::Assign => "'='",
430 TokenKind::Plus => "'+'",
431 TokenKind::Minus => "'-'",
432 TokenKind::Star => "'*'",
433 TokenKind::Slash => "'/'",
434 TokenKind::Percent => "'%'",
435 TokenKind::Power => "'**'",
436 TokenKind::LeftShift => "'<<'",
437 TokenKind::RightShift => "'>>'",
438 TokenKind::BitwiseAnd => "'&'",
439 TokenKind::BitwiseOr => "'|'",
440 TokenKind::BitwiseXor => "'^'",
441 TokenKind::BitwiseNot => "'~'",
442 TokenKind::PlusAssign => "'+='",
443 TokenKind::MinusAssign => "'-='",
444 TokenKind::StarAssign => "'*='",
445 TokenKind::SlashAssign => "'/='",
446 TokenKind::PercentAssign => "'%='",
447 TokenKind::DotAssign => "'.='",
448 TokenKind::AndAssign => "'&='",
449 TokenKind::OrAssign => "'|='",
450 TokenKind::XorAssign => "'^='",
451 TokenKind::PowerAssign => "'**='",
452 TokenKind::LeftShiftAssign => "'<<='",
453 TokenKind::RightShiftAssign => "'>>='",
454 TokenKind::LogicalAndAssign => "'&&='",
455 TokenKind::LogicalOrAssign => "'||='",
456 TokenKind::DefinedOrAssign => "'//='",
457 TokenKind::Equal => "'=='",
458 TokenKind::NotEqual => "'!='",
459 TokenKind::Match => "'=~'",
460 TokenKind::NotMatch => "'!~'",
461 TokenKind::SmartMatch => "'~~'",
462 TokenKind::Less => "'<'",
463 TokenKind::Greater => "'>'",
464 TokenKind::LessEqual => "'<='",
465 TokenKind::GreaterEqual => "'>='",
466 TokenKind::Spaceship => "'<=>'",
467 TokenKind::StringCompare => "'cmp'",
468 TokenKind::And => "'&&'",
469 TokenKind::Or => "'||'",
470 TokenKind::Not => "'!'",
471 TokenKind::DefinedOr => "'//'",
472 TokenKind::WordAnd => "'and'",
473 TokenKind::WordOr => "'or'",
474 TokenKind::WordNot => "'not'",
475 TokenKind::WordXor => "'xor'",
476 TokenKind::Arrow => "'->'",
477 TokenKind::FatArrow => "'=>'",
478 TokenKind::Dot => "'.'",
479 TokenKind::Range => "'..'",
480 TokenKind::Ellipsis => "'...'",
481 TokenKind::Increment => "'++'",
482 TokenKind::Decrement => "'--'",
483 TokenKind::DoubleColon => "'::'",
484 TokenKind::Question => "'?'",
485 TokenKind::Colon => "':'",
486 TokenKind::Backslash => "'\\'",
487
488 // Delimiters
489 TokenKind::LeftParen => "'('",
490 TokenKind::RightParen => "')'",
491 TokenKind::LeftBrace => "'{'",
492 TokenKind::RightBrace => "'}'",
493 TokenKind::LeftBracket => "'['",
494 TokenKind::RightBracket => "']'",
495 TokenKind::Semicolon => "';'",
496 TokenKind::Comma => "','",
497
498 // Literals
499 TokenKind::Number => "number",
500 TokenKind::String => "string",
501 TokenKind::Regex => "regex",
502 TokenKind::Substitution => "substitution (s///)",
503 TokenKind::Transliteration => "transliteration (tr///)",
504 TokenKind::QuoteSingle => "q// string",
505 TokenKind::QuoteDouble => "qq// string",
506 TokenKind::QuoteWords => "qw() word list",
507 TokenKind::QuoteCommand => "qx// command",
508 TokenKind::HeredocStart => "heredoc (<<)",
509 TokenKind::HeredocBody => "heredoc body",
510 TokenKind::FormatBody => "format body",
511 TokenKind::DataMarker => "__DATA__",
512 TokenKind::DataBody => "data section",
513 TokenKind::VString => "version string",
514 TokenKind::UnknownRest => "unparsed content",
515 TokenKind::HeredocDepthLimit => "heredoc depth limit",
516
517 // Identifiers and variables
518 TokenKind::Identifier => "identifier",
519 TokenKind::ScalarSigil => "'$'",
520 TokenKind::ArraySigil => "'@'",
521 TokenKind::HashSigil => "'%'",
522 TokenKind::SubSigil => "'&'",
523 TokenKind::GlobSigil => "'*'",
524
525 // Special
526 TokenKind::Eof => "end of input",
527 TokenKind::Unknown => "unknown token",
528 }
529 }
530}