Skip to main content

zsh/
tokens.rs

1//! Zsh token definitions - Direct port from zsh/Src/zsh.h
2//!
3//! This module defines all character tokens and lexical tokens
4//! used by the zsh lexer and parser.
5
/// Single-character tokens used inside lexed strings.
///
/// Every constant is the tokenized stand-in for one shell metacharacter; the
/// trailing comment on each line names the literal character it replaces.
/// The code points are fixed and run contiguously from 0x83 to 0xa2 (they are
/// meant to mirror the definitions in zsh's `zsh.h`).
pub mod char_tokens {
    /// Code point 0x83. It sits just below the token range, so
    /// [`is_token`] returns `false` for it.
    pub const META: char = '\u{83}';

    // Ordinary tokens, in ascending code-point order.
    pub const POUND: char = '\u{84}'; // #
    pub const STRING: char = '\u{85}'; // $
    pub const HAT: char = '\u{86}'; // ^
    pub const STAR: char = '\u{87}'; // *
    pub const INPAR: char = '\u{88}'; // (
    pub const INPARMATH: char = '\u{89}'; // ((
    pub const OUTPAR: char = '\u{8a}'; // )
    pub const OUTPARMATH: char = '\u{8b}'; // ))
    pub const QSTRING: char = '\u{8c}'; // $ in double quotes
    pub const EQUALS: char = '\u{8d}'; // =
    pub const BAR: char = '\u{8e}'; // |
    pub const INBRACE: char = '\u{8f}'; // {
    pub const OUTBRACE: char = '\u{90}'; // }
    pub const INBRACK: char = '\u{91}'; // [
    pub const OUTBRACK: char = '\u{92}'; // ]
    pub const TICK: char = '\u{93}'; // `
    pub const INANG: char = '\u{94}'; // <
    pub const OUTANG: char = '\u{95}'; // >
    pub const OUTANGPROC: char = '\u{96}'; // > for process sub
    pub const QUEST: char = '\u{97}'; // ?
    pub const TILDE: char = '\u{98}'; // ~
    pub const QTICK: char = '\u{99}'; // ` in double quotes
    pub const COMMA: char = '\u{9a}'; // ,
    pub const DASH: char = '\u{9b}'; // - in patterns
    pub const BANG: char = '\u{9c}'; // ! in patterns

    /// Alias for the highest-valued ordinary token ([`BANG`]).
    pub const LAST_NORMAL_TOK: char = BANG;

    // Placeholders left behind by quoting constructs.
    pub const SNULL: char = '\u{9d}'; // single quote marker
    pub const DNULL: char = '\u{9e}'; // double quote marker
    pub const BNULL: char = '\u{9f}'; // backslash null

    pub const BNULLKEEP: char = '\u{a0}'; // backslash to keep as \
    pub const NULARG: char = '\u{a1}'; // null argument
    pub const MARKER: char = '\u{a2}'; // special marker

    /// Reports whether `c` falls in the token range, i.e. anywhere from
    /// [`POUND`] (0x84) through [`MARKER`] (0xa2) inclusive.
    #[inline]
    pub fn is_token(c: char) -> bool {
        (POUND..=MARKER).contains(&c)
    }

    /// Maps an ordinary token to the literal character it stands for.
    ///
    /// Returns `None` for the quote/backslash placeholders, for [`NULARG`],
    /// [`MARKER`] and [`META`], and for any character that is not a token.
    pub fn untokenize(c: char) -> Option<char> {
        let literal = match c {
            POUND => '#',
            STRING | QSTRING => '$',
            HAT => '^',
            STAR => '*',
            INPAR | INPARMATH => '(',
            OUTPAR | OUTPARMATH => ')',
            EQUALS => '=',
            BAR => '|',
            INBRACE => '{',
            OUTBRACE => '}',
            INBRACK => '[',
            OUTBRACK => ']',
            TICK | QTICK => '`',
            INANG => '<',
            OUTANG | OUTANGPROC => '>',
            QUEST => '?',
            TILDE => '~',
            COMMA => ',',
            DASH => '-',
            BANG => '!',
            // Null markers and non-token characters have no literal form here.
            _ => return None,
        };
        Some(literal)
    }

    /// Literal characters for the tokens, laid out in token order starting at
    /// [`POUND`]. Intended to match `ztokens[]` from zsh's lex.c:
    /// "#$^*(())$=|{}[]`<>>?~`,-!'\"\\\\"
    pub const ZTOKENS: &str = "#$^*(())$=|{}[]`<>>?~`,-!'\"\\\\";
}
87
/// Token kinds handed from the lexer to the parser.
///
/// The discriminants are spelled out because they are meant to line up with
/// `enum lextok` in zsh's `zsh.h`; reordering variants would silently change them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum LexTok {
    // Punctuation
    Nulltok = 0,
    Seper = 1,           // ;  (separator, not necessarily literal semicolon)
    Newlin = 2,          // \n
    Semi = 3,            // ;
    Dsemi = 4,           // ;;
    Amper = 5,           // &
    Inpar = 6,           // (
    Outpar = 7,          // )
    Dbar = 8,            // ||
    Damper = 9,          // &&
    Outang = 10,         // >
    Outangbang = 11,     // >|
    Doutang = 12,        // >>
    Doutangbang = 13,    // >>|
    Inang = 14,          // <
    Inoutang = 15,       // <>
    Dinang = 16,         // <<
    Dinangdash = 17,     // <<-
    Inangamp = 18,       // <&
    Outangamp = 19,      // >&
    Ampoutang = 20,      // &>
    Outangampbang = 21,  // &>|
    Doutangamp = 22,     // >>&
    Doutangampbang = 23, // >>&|
    Trinang = 24,        // <<<
    Bar = 25,            // |
    Baramp = 26,         // |&
    Inoutpar = 27,       // ()
    Dinpar = 28,         // ((
    Doutpar = 29,        // ))
    Amperbang = 30,      // &| or &!
    Semiamp = 31,        // ;&
    Semibar = 32,        // ;|

    // Non-punctuation tokens
    Doutbrack = 33, // ]]
    String = 34,    // word/string
    Envstring = 35, // VAR=value
    Envarray = 36,  // VAR=(...)
    Endinput = 37,  // end of input
    Lexerr = 38,    // lexer error

    // Reserved words
    Bang = 39,      // !
    Dinbrack = 40,  // [[
    Inbrace = 41,   // {
    Outbrace = 42,  // }
    Case = 43,      // case
    Coproc = 44,    // coproc
    Doloop = 45,    // do
    Done = 46,      // done
    Elif = 47,      // elif
    Else = 48,      // else
    Zend = 49,      // end
    Esac = 50,      // esac
    Fi = 51,        // fi
    For = 52,       // for
    Foreach = 53,   // foreach
    Func = 54,      // function
    If = 55,        // if
    Nocorrect = 56, // nocorrect
    Repeat = 57,    // repeat
    Select = 58,    // select
    Then = 59,      // then
    Time = 60,      // time
    Until = 61,     // until
    While = 62,     // while
    Typeset = 63,   // typeset or similar
}

impl LexTok {
    /// Returns `true` for the redirection operators, which occupy the
    /// contiguous discriminant run `Outang..=Trinang` (10 through 24).
    pub fn is_redirop(self) -> bool {
        let code = self as u8;
        (LexTok::Outang as u8..=LexTok::Trinang as u8).contains(&code)
    }

    /// Returns the display spelling of a punctuation token.
    ///
    /// Only `Seper` through `Semibar` have a spelling; `Nulltok` and every
    /// non-punctuation or reserved-word token yield `None`. `Newlin` is
    /// rendered as the two characters backslash + `n`, not as a line feed.
    pub fn as_str(self) -> Option<&'static str> {
        // Spellings for discriminants 1..=32; slot 0 belongs to `Seper`.
        const PUNCTUATION: [&str; 32] = [
            ";",    // Seper
            "\\n",  // Newlin
            ";",    // Semi
            ";;",   // Dsemi
            "&",    // Amper
            "(",    // Inpar
            ")",    // Outpar
            "||",   // Dbar
            "&&",   // Damper
            ">",    // Outang
            ">|",   // Outangbang
            ">>",   // Doutang
            ">>|",  // Doutangbang
            "<",    // Inang
            "<>",   // Inoutang
            "<<",   // Dinang
            "<<-",  // Dinangdash
            "<&",   // Inangamp
            ">&",   // Outangamp
            "&>",   // Ampoutang
            "&>|",  // Outangampbang
            ">>&",  // Doutangamp
            ">>&|", // Doutangampbang
            "<<<",  // Trinang
            "|",    // Bar
            "|&",   // Baramp
            "()",   // Inoutpar
            "((",   // Dinpar
            "))",   // Doutpar
            "&|",   // Amperbang
            ";&",   // Semiamp
            ";|",   // Semibar
        ];

        // `Nulltok` (0) underflows the subtraction and anything past
        // `Semibar` misses the table, so both fall out as `None`.
        (self as usize)
            .checked_sub(1)
            .and_then(|slot| PUNCTUATION.get(slot).copied())
    }
}
226
/// Redirection kinds, numbered to line up with the `REDIR_*` enum in zsh's `zsh.h`.
///
/// The order is significant: the file-writing kinds form the leading run
/// `Write..=Readwrite`, which is what [`RedirType::is_write_file`] relies on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum RedirType {
    Write = 0,   // >
    Writenow,    // >|
    App,         // >>
    Appnow,      // >>|
    Errwrite,    // &>, >&
    Errwritenow, // >&|
    Errapp,      // >>&
    Errappnow,   // >>&|
    Readwrite,   // <>
    Read,        // <
    Heredoc,     // <<
    Heredocdash, // <<-
    Herestr,     // <<<
    Mergein,     // <&n
    Mergeout,    // >&n
    Close,       // >&-, <&-
    Inpipe,      // < <(...)
    Outpipe,     // > >(...)
}

impl RedirType {
    /// Returns `true` for redirections that supply input: `<>`, `<`, the
    /// here-document/here-string forms, `<&n`, and `< <(...)`.
    pub fn is_read(self) -> bool {
        matches!(
            self,
            RedirType::Read
                | RedirType::Readwrite
                | RedirType::Heredoc
                | RedirType::Heredocdash
                | RedirType::Herestr
                | RedirType::Mergein
                | RedirType::Inpipe
        )
    }

    /// Returns `true` for redirections that open a file for writing.
    ///
    /// This covers every kind from `Write` through `Readwrite`, including the
    /// combined stdout+stderr forms (`&>`, `>&|`, `>>&`, `>>&|`), which also
    /// write to a named file. The set mirrors zsh's `IS_WRITE_FILE` macro,
    /// defined as `REDIR_WRITE <= x <= REDIR_READWRITE`.
    pub fn is_write_file(self) -> bool {
        matches!(
            self,
            RedirType::Write
                | RedirType::Writenow
                | RedirType::App
                | RedirType::Appnow
                | RedirType::Errwrite
                | RedirType::Errwritenow
                | RedirType::Errapp
                | RedirType::Errappnow
                | RedirType::Readwrite
        )
    }
}
278
/// Operators that can appear inside a `[[ ... ]]` conditional expression.
///
/// Discriminants are written out so the numbering is visible at a glance and
/// cannot shift if a variant is moved.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum CondType {
    // Logical connectives
    Not = 0,
    And = 1,
    Or = 2,

    // String comparisons
    Streq = 3,  // =
    Strdeq = 4, // ==
    Strneq = 5, // !=
    Strlt = 6,  // <
    Strgtr = 7, // >

    // File comparisons
    Nt = 8,  // -nt (newer than)
    Ot = 9,  // -ot (older than)
    Ef = 10, // -ef (same file)

    // Numeric comparisons
    Eq = 11, // -eq
    Ne = 12, // -ne
    Lt = 13, // -lt
    Gt = 14, // -gt
    Le = 15, // -le
    Ge = 16, // -ge

    Regex = 17, // =~
    Mod = 18,   // module test
    Modi = 19,  // module test with infix
}
304
/// Characters that must be quoted to be taken literally.
///
/// Assembled from three groups: shell metacharacters, whitespace (newline,
/// tab, space), and the quoting characters themselves (backslash, single
/// quote, double quote).
pub const SPECCHARS: &str = concat!("#$^*()=|{}[]`<>?~;&", "\n\t ", "\\'\"");

/// Characters that must be quoted to be taken literally in a pattern.
/// The final character is a single backslash.
pub const PATCHARS: &str = r"#^*()|[]<>?~\";
310
311/// Check if character is a dash (literal or tokenized)
312#[inline]
313pub fn is_dash(c: char) -> bool {
314    c == '-' || c == char_tokens::DASH
315}
316
/// Action selected by the first character of a token.
///
/// The numbering has holes: 4 and 9 through 12 are not assigned to any
/// variant. Only the first variant of each consecutive run carries an
/// explicit value; the rest follow on by one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum LexAct1 {
    // 0..=3
    Bkslash = 0,
    Comment,
    Newlin,
    Semi,

    // 5..=8 (4 is unused)
    Amper = 5,
    Bar,
    Inpar,
    Outpar,

    // 13..=15 (9..=12 are unused)
    Inang = 13,
    Outang,
    Other,
}
333
/// Action selected by each character after the first one in a token.
///
/// Variants are numbered consecutively from 0 (`Break`) to 21 (`Meta`), so
/// only the starting value is spelled out; the order below fixes the rest.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum LexAct2 {
    Break = 0,
    Outpar,   // 1
    Bar,      // 2
    String,   // 3
    Inbrack,  // 4
    Outbrack, // 5
    Tilde,    // 6
    Inpar,    // 7
    Inbrace,  // 8
    Outbrace, // 9
    Outang,   // 10
    Inang,    // 11
    Equals,   // 12
    Bkslash,  // 13
    Quote,    // 14
    Dquote,   // 15
    Bquote,   // 16
    Comma,    // 17
    Dash,     // 18
    Bang,     // 19
    Other,    // 20
    Meta,     // 21
}
361
362/// Reserved words table
363pub static RESERVED_WORDS: &[(&str, LexTok)] = &[
364    ("!", LexTok::Bang),
365    ("[[", LexTok::Dinbrack),
366    ("{", LexTok::Inbrace),
367    ("}", LexTok::Outbrace),
368    ("case", LexTok::Case),
369    ("coproc", LexTok::Coproc),
370    ("do", LexTok::Doloop),
371    ("done", LexTok::Done),
372    ("elif", LexTok::Elif),
373    ("else", LexTok::Else),
374    ("end", LexTok::Zend),
375    ("esac", LexTok::Esac),
376    ("fi", LexTok::Fi),
377    ("for", LexTok::For),
378    ("foreach", LexTok::Foreach),
379    ("function", LexTok::Func),
380    ("if", LexTok::If),
381    ("nocorrect", LexTok::Nocorrect),
382    ("repeat", LexTok::Repeat),
383    ("select", LexTok::Select),
384    ("then", LexTok::Then),
385    ("time", LexTok::Time),
386    ("until", LexTok::Until),
387    ("while", LexTok::While),
388];
389
390/// Lookup a reserved word
391pub fn lookup_reserved_word(s: &str) -> Option<LexTok> {
392    RESERVED_WORDS
393        .iter()
394        .find(|(word, _)| *word == s)
395        .map(|(_, tok)| *tok)
396}
397
/// Names of the typeset-family builtins, which affect how the rest of the
/// command line is parsed.
pub static TYPESET_COMMANDS: &[&str] = &[
    "declare",
    "export",
    "float",
    "integer",
    "local",
    "readonly",
    "typeset",
];

/// Returns `true` when `s` exactly matches (case-sensitively) one of the
/// names in [`TYPESET_COMMANDS`].
pub fn is_typeset_command(s: &str) -> bool {
    TYPESET_COMMANDS.iter().any(|&name| name == s)
}
407
#[cfg(test)]
mod tests {
    use super::*;

    /// The quote/backslash placeholder tokens keep their fixed code points.
    #[test]
    fn test_token_values() {
        let expected = [
            (char_tokens::SNULL, 0x9d_u32),
            (char_tokens::DNULL, 0x9e_u32),
            (char_tokens::BNULL, 0x9f_u32),
        ];
        for &(token, code_point) in expected.iter() {
            assert_eq!(token as u32, code_point);
        }
    }

    /// Known keywords resolve to their tokens; unknown words resolve to nothing.
    #[test]
    fn test_reserved_words() {
        let cases = [
            ("if", Some(LexTok::If)),
            ("then", Some(LexTok::Then)),
            ("notakeyword", None),
        ];
        for &(word, expected) in cases.iter() {
            assert_eq!(lookup_reserved_word(word), expected);
        }
    }

    /// Redirection operators are recognised; keywords and words are not.
    #[test]
    fn test_redirop() {
        for tok in [LexTok::Outang, LexTok::Dinang].iter() {
            assert!(tok.is_redirop());
        }
        for tok in [LexTok::If, LexTok::String].iter() {
            assert!(!tok.is_redirop());
        }
    }
}
433}