//! Byte-oriented lexer for bash input.
//!
//! Operates on `&[u8]` with a position cursor. No token enum — the parser
//! calls methods directly (`peek`/`eat`/`read`). Every read method yields a
//! `&'a str` — a zero-copy slice of the input.

use std::fmt;

/// Byte-oriented scanner over a bash source string.
///
/// The parser drives it directly through `peek`/`eat`/`read_*` calls; there is
/// no token enum. Text handed back to the parser is borrowed from the input
/// for `'a`, so scanning never allocates.
#[derive(Debug, Clone)]
pub(crate) struct Lexer<'a> {
    /// Raw bytes of `input`; all scanning and lookahead index into this.
    src: &'a [u8],
    /// The original text, kept so results can be returned as `&'a str`.
    input: &'a str,
    /// Byte offset of the cursor into `src` / `input`.
    pos: usize,
}

/// Error produced when the parser encounters invalid or unsupported bash syntax.
///
/// Carries the byte offset at which the problem was detected together with a
/// static, human-readable description. The type is `Copy`, so it is cheap to
/// return and propagate.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct ParseError {
    /// Byte offset into the input where the error was detected.
    pos: usize,
    /// Static description of what went wrong.
    msg: &'static str,
}

impl ParseError {
    /// Create a new parse error at the given byte offset.
    pub(crate) const fn new(pos: usize, msg: &'static str) -> Self {
        Self { pos, msg }
    }

    /// Byte offset in the input where the error occurred.
    ///
    /// # Examples
    ///
    /// ```
    /// use reef::parser::Parser;
    /// let err = Parser::new("echo $(").parse().unwrap_err();
    /// assert!(err.position() <= 7);
    /// ```
    #[must_use]
    #[allow(dead_code)] // public API for downstream consumers
    pub const fn position(&self) -> usize {
        self.pos
    }

    /// Human-readable description of the error.
    ///
    /// # Examples
    ///
    /// ```
    /// use reef::parser::Parser;
    /// let err = Parser::new("echo $(").parse().unwrap_err();
    /// assert!(!err.message().is_empty());
    /// ```
    #[must_use]
    #[allow(dead_code)] // public API for downstream consumers
    pub const fn message(&self) -> &'static str {
        self.msg
    }
}

impl fmt::Display for ParseError {
    /// Formats the error as `parse error at byte <offset>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "parse error at byte {}: {}", self.pos, self.msg)
    }
}

impl std::error::Error for ParseError {}

70impl<'a> Lexer<'a> {
71    /// Create a new lexer for the given input string.
72    pub(crate) fn new(input: &'a str) -> Self {
73        Lexer {
74            src: input.as_bytes(),
75            input,
76            pos: 0,
77        }
78    }
79
80    // -----------------------------------------------------------------------
81    // Position / lookahead
82    // -----------------------------------------------------------------------
83
84    /// Return the current byte offset.
85    #[inline]
86    #[must_use]
87    pub(crate) fn pos(&self) -> usize {
88        self.pos
89    }
90
91    /// Return true if the cursor is at or past the end of input.
92    #[inline]
93    #[must_use]
94    pub(crate) fn is_eof(&self) -> bool {
95        self.pos >= self.src.len()
96    }
97
98    /// Peek current byte. Returns 0 at EOF — NUL never appears in shell input.
99    #[inline]
100    #[must_use]
101    pub(crate) fn peek(&self) -> u8 {
102        if self.pos < self.src.len() {
103            self.src[self.pos]
104        } else {
105            0
106        }
107    }
108
109    /// Peek at `pos + offset`.
110    #[inline]
111    #[must_use]
112    pub(crate) fn peek_at(&self, offset: usize) -> u8 {
113        let i = self.pos + offset;
114        if i < self.src.len() { self.src[i] } else { 0 }
115    }
116
117    /// Slice of the original input from `start` to current position.
118    #[inline]
119    #[must_use]
120    pub(crate) fn slice(&self, start: usize) -> &'a str {
121        &self.input[start..self.pos]
122    }
123
124    /// Slice of the original input from `start` to `end`.
125    ///
126    /// # Panics
127    ///
128    /// Panics in debug mode if `start > end` or `end > input length`.
129    #[inline]
130    #[must_use]
131    pub(crate) fn slice_range(&self, start: usize, end: usize) -> &'a str {
132        debug_assert!(
133            start <= end && end <= self.src.len(),
134            "slice_range({start}, {end}): len={}",
135            self.src.len()
136        );
137        &self.input[start..end]
138    }
139
140    /// Remaining input from current position to end.
141    #[inline]
142    #[must_use]
143    pub(crate) fn remaining(&self) -> &'a str {
144        &self.input[self.pos..]
145    }
146
147    // -----------------------------------------------------------------------
148    // Advance
149    // -----------------------------------------------------------------------
150
151    /// Set position directly — used for backtracking.
152    #[inline]
153    pub(crate) fn set_pos(&mut self, pos: usize) {
154        self.pos = pos;
155    }
156
157    /// Advance the cursor by one byte.
158    #[inline]
159    pub(crate) fn bump(&mut self) {
160        self.pos += 1;
161    }
162
163    /// Advance the cursor by `n` bytes.
164    #[inline]
165    pub(crate) fn bump_n(&mut self, n: usize) {
166        self.pos += n;
167    }
168
169    /// Advance if current byte matches. Returns true if consumed.
170    #[inline]
171    pub(crate) fn eat(&mut self, b: u8) -> bool {
172        if self.peek() == b {
173            self.pos += 1;
174            true
175        } else {
176            false
177        }
178    }
179
180    /// Advance if the upcoming bytes match a string. Returns true if consumed.
181    pub(crate) fn eat_str(&mut self, s: &[u8]) -> bool {
182        if self.pos + s.len() <= self.src.len() && &self.src[self.pos..self.pos + s.len()] == s {
183            self.pos += s.len();
184            true
185        } else {
186            false
187        }
188    }
189
190    // -----------------------------------------------------------------------
191    // Skip
192    // -----------------------------------------------------------------------
193
194    /// Skip spaces and tabs (not newlines).
195    pub(crate) fn skip_blanks(&mut self) {
196        while self.pos < self.src.len() {
197            match self.src[self.pos] {
198                b' ' | b'\t' => self.pos += 1,
199                _ => break,
200            }
201        }
202    }
203
204    /// Skip a `#` comment through end of line.
205    pub(crate) fn skip_comment(&mut self) {
206        if self.peek() == b'#' {
207            while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
208                self.pos += 1;
209            }
210        }
211    }
212
213    // -----------------------------------------------------------------------
214    // Read — all return &'a str, zero allocation
215    // -----------------------------------------------------------------------
216
217    /// Read a shell variable name: `[a-zA-Z_][a-zA-Z_0-9]*`.
218    /// Returns empty string if no valid name at current position.
219    #[must_use]
220    pub(crate) fn read_name(&mut self) -> &'a str {
221        let start = self.pos;
222        if self.pos < self.src.len()
223            && (self.src[self.pos].is_ascii_alphabetic() || self.src[self.pos] == b'_')
224        {
225            self.pos += 1;
226            while self.pos < self.src.len()
227                && (self.src[self.pos].is_ascii_alphanumeric() || self.src[self.pos] == b'_')
228            {
229                self.pos += 1;
230            }
231        }
232        self.slice(start)
233    }
234
235    /// Read a digit sequence: `[0-9]+`. Returns empty string if no digits.
236    #[must_use]
237    pub(crate) fn read_number(&mut self) -> &'a str {
238        let start = self.pos;
239        while self.pos < self.src.len() && self.src[self.pos].is_ascii_digit() {
240            self.pos += 1;
241        }
242        self.slice(start)
243    }
244
245    // -----------------------------------------------------------------------
246    // Balanced extraction
247    // -----------------------------------------------------------------------
248
249    /// Read content inside single quotes. Cursor starts after `'`.
250    /// No escaping — ends at next `'`. Returns content, cursor after closing `'`.
251    pub(crate) fn scan_squote(&mut self) -> Result<&'a str, ParseError> {
252        let start = self.pos;
253        while self.pos < self.src.len() {
254            if self.src[self.pos] == b'\'' {
255                let content = self.slice(start);
256                self.pos += 1;
257                return Ok(content);
258            }
259            self.pos += 1;
260        }
261        Err(self.err("unterminated single quote"))
262    }
263
264    // -----------------------------------------------------------------------
265    // Keyword detection — does NOT consume
266    // -----------------------------------------------------------------------
267
268    /// Check if the next word matches `kw` and is followed by a word boundary.
269    #[must_use]
270    pub(crate) fn at_keyword(&self, kw: &[u8]) -> bool {
271        let end = self.pos + kw.len();
272        if end > self.src.len() {
273            return false;
274        }
275        if &self.src[self.pos..end] != kw {
276            return false;
277        }
278        // Single-byte metacharacters are self-delimiting — no boundary needed
279        if kw.len() == 1 && is_meta(kw[0]) {
280            return true;
281        }
282        // Multi-byte keywords need a word boundary after them
283        end >= self.src.len() || is_meta(self.src[end])
284    }
285
286    /// Check if any of the given keywords match at the current position.
287    #[must_use]
288    pub(crate) fn at_any_keyword(&self, keywords: &[&[u8]]) -> bool {
289        keywords.iter().any(|kw| self.at_keyword(kw))
290    }
291
292    // -----------------------------------------------------------------------
293    // Helpers
294    // -----------------------------------------------------------------------
295
296    /// Create a [`ParseError`] at the current position.
297    pub(crate) fn err(&self, msg: &'static str) -> ParseError {
298        ParseError::new(self.pos, msg)
299    }
300}
301
/// Shell metacharacters — terminate words and act as delimiters.
///
/// NUL is included so that the 0 returned by `Lexer::peek` at end of input
/// is treated as a word boundary.
#[inline]
#[must_use]
pub(crate) const fn is_meta(b: u8) -> bool {
    matches!(
        b,
        b' ' | b'\t' | b'\n' | b';' | b'&' | b'|' | b'(' | b')' | b'<' | b'>' | b'\0'
    )
}

#[cfg(test)]
mod tests {
    //! Unit tests for the lexer primitives and `ParseError` accessors.

    use super::*;

    #[test]
    fn peek_and_eof() {
        let lex = Lexer::new("");
        assert!(lex.is_eof());
        assert_eq!(lex.peek(), 0);

        let lex = Lexer::new("a");
        assert!(!lex.is_eof());
        assert_eq!(lex.peek(), b'a');
    }

    #[test]
    fn peek_at_lookahead() {
        let lex = Lexer::new("ab");
        assert_eq!(lex.peek_at(0), b'a');
        assert_eq!(lex.peek_at(1), b'b');
        // Past the end reads as NUL, same as `peek` at EOF.
        assert_eq!(lex.peek_at(2), 0);
    }

    #[test]
    fn eat_and_bump() {
        let mut lex = Lexer::new("ab");
        assert!(lex.eat(b'a'));
        assert!(!lex.eat(b'a'));
        assert!(lex.eat(b'b'));
        assert!(lex.is_eof());
    }

    #[test]
    fn eat_str() {
        let mut lex = Lexer::new("then done");
        assert!(lex.eat_str(b"then"));
        assert_eq!(lex.peek(), b' ');
        lex.bump();
        assert!(lex.eat_str(b"done"));
        assert!(lex.is_eof());
    }

    #[test]
    fn eat_str_mismatch_leaves_cursor() {
        let mut lex = Lexer::new("the");
        // Longer than the remaining input: must not match or move.
        assert!(!lex.eat_str(b"then"));
        assert!(!lex.eat_str(b"tha"));
        assert_eq!(lex.pos(), 0);
    }

    #[test]
    fn skip_blanks_not_newlines() {
        let mut lex = Lexer::new("  \t\nfoo");
        lex.skip_blanks();
        assert_eq!(lex.peek(), b'\n');
    }

    #[test]
    fn read_name() {
        let mut lex = Lexer::new("FOO_bar123 rest");
        assert_eq!(lex.read_name(), "FOO_bar123");
        assert_eq!(lex.peek(), b' ');
    }

    #[test]
    fn read_name_underscore_start() {
        let mut lex = Lexer::new("_private");
        assert_eq!(lex.read_name(), "_private");
    }

    #[test]
    fn read_name_no_match() {
        let mut lex = Lexer::new("123abc");
        assert_eq!(lex.read_name(), "");
        assert_eq!(lex.pos(), 0);
    }

    #[test]
    fn read_number() {
        let mut lex = Lexer::new("42rest");
        assert_eq!(lex.read_number(), "42");
    }

    #[test]
    fn slice_and_remaining() {
        let mut lex = Lexer::new("abc def");
        lex.bump_n(3);
        assert_eq!(lex.slice(0), "abc");
        assert_eq!(lex.slice_range(1, 3), "bc");
        assert_eq!(lex.remaining(), " def");
        lex.set_pos(0);
        assert_eq!(lex.remaining(), "abc def");
    }

    #[test]
    fn scan_squote() {
        let mut lex = Lexer::new("hello world'rest");
        let content = lex.scan_squote().unwrap();
        assert_eq!(content, "hello world");
        assert_eq!(lex.peek(), b'r');
    }

    #[test]
    fn scan_squote_unterminated() {
        let mut lex = Lexer::new("abc");
        let err = lex.scan_squote().unwrap_err();
        assert_eq!(err.position(), 3);
        assert_eq!(err.message(), "unterminated single quote");
    }

    #[test]
    fn at_keyword() {
        let lex = Lexer::new("then ");
        assert!(lex.at_keyword(b"then"));
        assert!(!lex.at_keyword(b"the"));
    }

    #[test]
    fn at_keyword_eof() {
        let lex = Lexer::new("fi");
        assert!(lex.at_keyword(b"fi"));
    }

    #[test]
    fn at_keyword_no_boundary() {
        let lex = Lexer::new("done_stuff");
        assert!(!lex.at_keyword(b"done"));
    }

    #[test]
    fn at_keyword_single_meta_is_self_delimiting() {
        assert!(Lexer::new("(x").at_keyword(b"("));
        // `{` is not a metacharacter, so it still needs a boundary.
        assert!(!Lexer::new("{x").at_keyword(b"{"));
    }

    #[test]
    fn at_any_keyword() {
        let lex = Lexer::new("do ");
        assert!(lex.at_any_keyword(&[b"then", b"do"]));
        assert!(!lex.at_any_keyword(&[b"then", b"done"]));
    }

    #[test]
    fn skip_comment() {
        let mut lex = Lexer::new("# this is a comment\nnext");
        lex.skip_comment();
        assert_eq!(lex.peek(), b'\n');
    }

    #[test]
    fn is_meta_classification() {
        assert!(is_meta(b' '));
        assert!(is_meta(b';'));
        assert!(is_meta(0));
        assert!(!is_meta(b'a'));
        assert!(!is_meta(b'{'));
    }

    #[test]
    fn parse_error_accessors() {
        let err = ParseError::new(42, "test error");
        assert_eq!(err.position(), 42);
        assert_eq!(err.message(), "test error");
    }

    #[test]
    fn parse_error_display() {
        let err = ParseError::new(7, "boom");
        assert_eq!(err.to_string(), "parse error at byte 7: boom");
    }
}