use super::token::{Token, WordPart};
use super::BashError;
/// Tokenizes `src` by running a [`Lexer`] over its bytes from offset 0.
///
/// # Errors
///
/// Returns whatever error `Lexer::run` reports for the input.
pub fn lex(src: &str) -> Result<Vec<Token>, BashError> {
    let bytes = src.as_bytes();
    let mut lexer = Lexer { pos: 0, src: bytes };
    lexer.run()
}
/// Byte-oriented cursor over the input being lexed.
struct Lexer<'a> {
/// The input, viewed as raw bytes.
src: &'a [u8],
/// Index into `src` of the next byte to be read.
pos: usize,
}
impl Lexer<'_> {
    /// Lexes the whole input into a token stream that always ends with [`Token::Eof`].
    ///
    /// Newlines and `;` both produce [`Token::Semi`]. A separator is dropped when it
    /// directly follows another `Semi` or when no token has been emitted yet, so runs
    /// of separators collapse into one and leading separators disappear.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while lexing a word (unterminated quotes or
    /// `$`-expansions, empty `${}`).
    fn run(&mut self) -> Result<Vec<Token>, BashError> {
        let mut out = Vec::new();
        loop {
            self.skip_blanks_and_comments();
            let Some(b) = self.peek() else {
                out.push(Token::Eof);
                return Ok(out);
            };
            match b {
                b'\n' | b';' => {
                    self.pos += 1;
                    if !matches!(out.last(), Some(Token::Semi) | None) {
                        out.push(Token::Semi);
                    }
                }
                b'|' => {
                    self.pos += 1;
                    out.push(Token::Pipe);
                }
                _ => out.push(self.word()?),
            }
        }
    }

    /// Returns the byte at the cursor without consuming it, or `None` at end of input.
    fn peek(&self) -> Option<u8> {
        self.src.get(self.pos).copied()
    }

    /// Skips spaces, tabs, carriage returns and `#` comments.
    ///
    /// A comment runs up to, but not including, the next newline so that the newline
    /// is still seen by `run` as a separator. This is only called between tokens, so
    /// a `#` in the middle of a word is not treated as a comment.
    fn skip_blanks_and_comments(&mut self) {
        loop {
            match self.peek() {
                Some(b' ' | b'\t' | b'\r') => self.pos += 1,
                Some(b'#') => {
                    while !matches!(self.peek(), None | Some(b'\n')) {
                        self.pos += 1;
                    }
                }
                _ => return,
            }
        }
    }

    /// Appends the pending literal bytes in `lit` to `parts` and empties `lit`.
    ///
    /// The bytes are decoded as UTF-8 in one go, so multi-byte characters survive
    /// intact. If the last part is already a literal the text is appended to it
    /// instead of starting a new part.
    ///
    /// # Errors
    ///
    /// Fails if the pending bytes are not valid UTF-8.
    fn flush_literal(parts: &mut Vec<WordPart>, lit: &mut Vec<u8>) -> Result<(), BashError> {
        if lit.is_empty() {
            return Ok(());
        }
        let text = String::from_utf8(std::mem::take(lit))
            .map_err(|_| BashError::parse("non-utf8 in word"))?;
        match parts.last_mut() {
            Some(WordPart::Lit(prev)) => prev.push_str(&text),
            _ => parts.push(WordPart::Lit(text)),
        }
        Ok(())
    }

    /// Lexes one word starting at the cursor and returns it as [`Token::Word`].
    ///
    /// A word ends at an unquoted blank, newline, `;`, `|` or end of input. Within a
    /// word:
    /// - `'...'` is copied verbatim;
    /// - `"..."` allows `$`-expansions and backslash escapes (see `double_quoted`);
    /// - an unquoted backslash makes the next byte literal, backslash-newline is
    ///   dropped as a line continuation, and a trailing backslash is kept as-is;
    /// - `$` starts an expansion (see `dollar`).
    ///
    /// # Errors
    ///
    /// Fails on an unterminated quote or expansion.
    fn word(&mut self) -> Result<Token, BashError> {
        let mut parts: Vec<WordPart> = Vec::new();
        // Literal text is collected as raw bytes and decoded on flush; converting
        // each byte to a `char` on its own would corrupt multi-byte UTF-8 sequences.
        let mut lit: Vec<u8> = Vec::new();
        while let Some(b) = self.peek() {
            match b {
                b' ' | b'\t' | b'\r' | b'\n' | b';' | b'|' => break,
                b'\'' => {
                    self.pos += 1;
                    let start = self.pos;
                    while matches!(self.peek(), Some(c) if c != b'\'') {
                        self.pos += 1;
                    }
                    if self.peek().is_none() {
                        return Err(BashError::parse("unterminated single quote"));
                    }
                    lit.extend_from_slice(&self.src[start..self.pos]);
                    self.pos += 1; // closing quote
                }
                b'"' => {
                    self.pos += 1;
                    self.double_quoted(&mut parts, &mut lit)?;
                }
                b'\\' => {
                    self.pos += 1;
                    match self.peek() {
                        None => lit.push(b'\\'),
                        Some(b'\n') => self.pos += 1,
                        Some(c) => {
                            lit.push(c);
                            self.pos += 1;
                        }
                    }
                }
                b'$' => {
                    Self::flush_literal(&mut parts, &mut lit)?;
                    parts.push(self.dollar()?);
                }
                c => {
                    lit.push(c);
                    self.pos += 1;
                }
            }
        }
        Self::flush_literal(&mut parts, &mut lit)?;
        Ok(Token::Word(parts))
    }

    /// Lexes the inside of a double-quoted string; the cursor must be just past the
    /// opening `"`. Consumes through the closing `"`.
    ///
    /// A backslash escapes only `"`, `\`, `$` and `` ` ``; backslash-newline is
    /// dropped as a line continuation; before any other byte the backslash is kept
    /// literally. `$` starts an expansion.
    ///
    /// # Errors
    ///
    /// Fails if the input ends before the closing quote, or if a nested expansion
    /// fails.
    fn double_quoted(
        &mut self,
        parts: &mut Vec<WordPart>,
        lit: &mut Vec<u8>,
    ) -> Result<(), BashError> {
        loop {
            match self.peek() {
                None => return Err(BashError::parse("unterminated double quote")),
                Some(b'"') => {
                    self.pos += 1;
                    return Ok(());
                }
                Some(b'\\') => {
                    self.pos += 1;
                    match self.peek() {
                        Some(c @ (b'"' | b'\\' | b'$' | b'`')) => {
                            lit.push(c);
                            self.pos += 1;
                        }
                        // Line continuation, matching the unquoted case in `word`.
                        Some(b'\n') => self.pos += 1,
                        // Not an escape: keep the backslash; the following byte is
                        // handled by the next loop iteration.
                        _ => lit.push(b'\\'),
                    }
                }
                Some(b'$') => {
                    Self::flush_literal(parts, lit)?;
                    parts.push(self.dollar()?);
                }
                Some(c) => {
                    lit.push(c);
                    self.pos += 1;
                }
            }
        }
    }

    /// Lexes a `$`-introduced expansion; the cursor must be on the `$`.
    ///
    /// Recognized forms:
    /// - `$( ... )` yields [`WordPart::Subst`] with the raw inner text. Parentheses
    ///   are balanced by counting only; quotes inside are not interpreted.
    /// - `${name}` yields [`WordPart::Var`] with everything up to the first `}`.
    /// - `$?` yields `Var("?")`.
    /// - `$name`, where `name` is `[A-Za-z_][A-Za-z0-9_]*`, yields `Var(name)`.
    ///
    /// Anything else yields a literal `$`, and the following byte is left unconsumed.
    ///
    /// # Errors
    ///
    /// Fails on an unterminated `$(` or `${`, an empty `${}`, or non-UTF-8 contents.
    fn dollar(&mut self) -> Result<WordPart, BashError> {
        debug_assert_eq!(self.peek(), Some(b'$'));
        self.pos += 1;
        match self.peek() {
            Some(b'(') => {
                self.pos += 1;
                let start = self.pos;
                let mut depth = 1usize;
                while let Some(c) = self.peek() {
                    match c {
                        b'(' => depth += 1,
                        b')' => {
                            depth -= 1;
                            if depth == 0 {
                                break;
                            }
                        }
                        _ => {}
                    }
                    self.pos += 1;
                }
                // The loop stops either on the matching `)` or at end of input.
                if self.peek() != Some(b')') {
                    return Err(BashError::parse("unterminated $( ) substitution"));
                }
                let inner = std::str::from_utf8(&self.src[start..self.pos])
                    .map_err(|_| BashError::parse("non-utf8 in substitution"))?
                    .to_string();
                self.pos += 1; // closing `)`
                Ok(WordPart::Subst(inner))
            }
            Some(b'{') => {
                self.pos += 1;
                let start = self.pos;
                while matches!(self.peek(), Some(c) if c != b'}') {
                    self.pos += 1;
                }
                if self.peek() != Some(b'}') {
                    return Err(BashError::parse("unterminated ${ } expansion"));
                }
                let name = std::str::from_utf8(&self.src[start..self.pos])
                    .map_err(|_| BashError::parse("non-utf8 var name"))?
                    .to_string();
                self.pos += 1; // closing `}`
                if name.is_empty() {
                    return Err(BashError::parse("empty ${} variable name"));
                }
                Ok(WordPart::Var(name))
            }
            Some(b'?') => {
                self.pos += 1;
                Ok(WordPart::Var("?".to_string()))
            }
            Some(c) if c == b'_' || c.is_ascii_alphabetic() => {
                let start = self.pos;
                while matches!(self.peek(), Some(c) if c == b'_' || c.is_ascii_alphanumeric()) {
                    self.pos += 1;
                }
                let name = std::str::from_utf8(&self.src[start..self.pos])
                    .expect("identifier bytes are ASCII")
                    .to_string();
                Ok(WordPart::Var(name))
            }
            _ => Ok(WordPart::Lit("$".to_string())),
        }
    }
}