use super::tokens::{Token, TokenType, KEYWORDS, NAMED_OPERATORS};
/// Reports whether `c` can open an identifier: an ASCII letter or an underscore.
fn is_word_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
/// Reports whether `c` can continue an identifier: an ASCII letter, ASCII digit, or underscore.
fn is_word_cont(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
}
/// Reports whether `c` is one of the ASCII decimal digits `0` through `9`.
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
/// Reports whether `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
fn is_hex(c: char) -> bool {
    matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F')
}
/// Scanner state for a single pass over one source text.
///
/// The input is held as a `Vec<char>` so the cursor can look ahead by character
/// index, while a separate byte counter is maintained for token positions.
struct Lexer {
// `chars`: the input decoded into individual `char`s. `n`: cached `chars.len()`.
chars: Vec<char>, n: usize,
// `i`: cursor as an index into `chars`. `byte`: UTF-8 byte offset of the cursor,
// advanced by `len_utf8()` of every consumed character. `line`: current line, starting at 1.
i: usize, byte: usize, line: u32,
// Current column, starting at 1; reset to 1 after a `\n` is consumed, otherwise
// incremented once per consumed character.
col: u32,
// Tokens emitted so far, in the order they were produced.
tokens: Vec<Token>,
}
impl Lexer {
fn new(source: &str) -> Self {
let chars: Vec<char> = source.chars().collect();
let n = chars.len();
Lexer {
chars,
n,
i: 0,
byte: 0,
line: 1,
col: 1,
tokens: Vec::new(),
}
}
/// Returns the character `ahead` positions past the cursor without consuming it.
///
/// Looking beyond the end of the input yields `'\0'`.
fn peek(&self, ahead: usize) -> char {
    match self.chars.get(self.i + ahead) {
        Some(&found) => found,
        None => '\0',
    }
}
/// Consumes and returns the character under the cursor, updating the
/// character index, byte offset, line and column.
///
/// Indexes `chars` directly, so it panics if the cursor is already at the end
/// of the input.
fn advance(&mut self) -> char {
    let current = self.chars[self.i];
    self.i += 1;
    self.byte += current.len_utf8();
    match current {
        // Only `\n` starts a new line; a lone `\r` just counts as a column.
        '\n' => {
            self.line += 1;
            self.col = 1;
        }
        _ => self.col += 1,
    }
    current
}
/// Consumes `s` if the input at the cursor begins with it.
///
/// Returns `true` when `s` was matched and consumed; otherwise the cursor is
/// left untouched and `false` is returned.
fn match_str(&mut self, s: &str) -> bool {
    if !self.starts_with(s) {
        return false;
    }
    let char_count = s.chars().count();
    for _ in 0..char_count {
        self.advance();
    }
    true
}
/// Reports whether the input at the cursor begins with `s`, compared
/// character by character. An empty `s` always matches.
fn starts_with(&self, s: &str) -> bool {
    let mut at = self.i;
    for expected in s.chars() {
        if self.chars.get(at) != Some(&expected) {
            return false;
        }
        at += 1;
    }
    true
}
/// Copies the characters in `start..end` (character indices) into a new `String`.
///
/// Panics if the range is out of bounds or `start > end`.
fn slice_str(&self, start: usize, end: usize) -> String {
    let span = &self.chars[start..end];
    let mut text = String::with_capacity(span.len());
    text.extend(span);
    text
}
/// Appends a new token and returns a mutable reference to it so the caller
/// can fill in optional fields.
fn add(&mut self, ty: TokenType, value: String, line: u32, col: u32, pos: usize) -> &mut Token {
    let token = Token::new(ty, value, line, col, pos);
    self.tokens.push(token);
    let newest = self.tokens.len() - 1;
    &mut self.tokens[newest]
}
/// Consumes the lexer and returns every token of the input, always ending
/// with a single `Eof` token.
///
/// Each loop iteration looks at the character under the cursor and hands off
/// to the first reader whose condition matches. The order of the checks is
/// significant: earlier checks take priority over later ones.
fn tokenize(mut self) -> Vec<Token> {
while self.i < self.n {
let ch = self.peek(0);
// A backtick directly followed by a line break is dropped together with
// that break (`\r`, `\n` or `\r\n`); no token is emitted for it.
if ch == '`' && (self.peek(1) == '\n' || self.peek(1) == '\r') {
self.advance(); if self.peek(0) == '\r' {
self.advance();
}
if self.peek(0) == '\n' {
self.advance();
}
continue;
}
// Horizontal whitespace (space, tab, form feed, vertical tab, NBSP) is skipped silently.
if " \t\x0c\x0b\u{00a0}".contains(ch) {
self.advance();
continue;
}
if ch == '\r' || ch == '\n' {
self.read_newline();
continue;
}
if ch == '#' {
self.read_line_comment();
continue;
}
// `<#` opens a block comment; this must be tested before `<` is treated as a redirection.
if ch == '<' && self.peek(1) == '#' {
self.read_block_comment();
continue;
}
if ch == '$' {
// `$(` is its own token; every other `$` starts a variable.
if self.peek(1) == '(' {
self.read_simple("$(", TokenType::DollarParen);
continue;
}
self.read_variable();
continue;
}
if ch == '@' {
let nxt = self.peek(1);
// `@'` / `@"` open a here-string.
if nxt == '\'' || nxt == '"' {
self.read_here_string(nxt);
continue;
}
if nxt == '(' {
self.read_simple("@(", TokenType::AtParen);
continue;
}
if nxt == '{' {
self.read_simple("@{", TokenType::AtBrace);
continue;
}
// `@name` / `@$name` is lexed as a splatted variable.
if nxt == '$' || is_word_start(nxt) {
self.read_splat();
continue;
}
// A bare `@` that fits none of the forms above.
self.read_simple("@", TokenType::Unknown);
continue;
}
if ch == '\'' {
self.read_single_quoted();
continue;
}
if ch == '"' {
self.read_double_quoted();
continue;
}
// Redirections: `>`, `<`, or a `*`/digit stream prefix immediately followed by `>`.
// Checked before numbers so that e.g. `2>` is not read as the number 2.
if ch == '>' || ch == '<' || ((ch == '*' || is_digit(ch)) && self.peek(1) == '>') {
self.read_redirection();
continue;
}
// A leading `.` starts a number only when a digit follows and the previous
// token is not one that `prev_allows_member` accepts.
if is_digit(ch) || (ch == '.' && is_digit(self.peek(1)) && !self.prev_allows_member()) {
self.read_number();
continue;
}
// Every `-` goes to `read_hyphen`, so `read_operator_or_punct` never sees one.
if ch == '-' {
self.read_hyphen();
continue;
}
if self.read_operator_or_punct() {
continue;
}
if is_word_start(ch) {
self.read_word();
continue;
}
// Nothing matched: emit the single character as an `Unknown` token so the loop always makes progress.
let (line, col, pos) = (self.line, self.col, self.byte);
let c = self.advance();
self.add(TokenType::Unknown, c.to_string(), line, col, pos);
}
// The end-of-input marker carries the final cursor position and an empty value.
let (line, col, pos) = (self.line, self.col, self.byte);
self.add(TokenType::Eof, String::new(), line, col, pos);
self.tokens
}
/// Emits one `Newline` token for the line break at the cursor.
///
/// `\r\n`, a lone `\r` and a lone `\n` each produce a single token whose
/// value is the exact break text consumed.
fn read_newline(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let mut text = String::new();
    // Take an optional `\r`, then an optional `\n`, in that order.
    for &expected in &['\r', '\n'] {
        if self.peek(0) == expected {
            text.push(self.advance());
        }
    }
    // Fallback: always consume at least one character so the caller progresses.
    if text.is_empty() {
        text.push(self.advance());
    }
    self.add(TokenType::Newline, text, line, col, pos);
}
/// Emits a `Comment` token running from the cursor up to, but not including,
/// the next `\r` or `\n` (or the end of input).
fn read_line_comment(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let begin = self.i;
    while self.i < self.n && !matches!(self.peek(0), '\r' | '\n') {
        self.advance();
    }
    let text = self.slice_str(begin, self.i);
    self.add(TokenType::Comment, text, line, col, pos);
}
/// Emits a `Comment` token for a `<# ... #>` block comment.
///
/// The two opening characters are consumed unconditionally, so the caller
/// must have verified them. An unterminated comment runs to the end of input.
fn read_block_comment(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let begin = self.i;
    // Opening `<#`.
    self.advance();
    self.advance();
    loop {
        if self.i >= self.n {
            break;
        }
        if self.peek(0) == '#' && self.peek(1) == '>' {
            break;
        }
        self.advance();
    }
    // Closing `#>`, if present.
    self.match_str("#>");
    let text = self.slice_str(begin, self.i);
    self.add(TokenType::Comment, text, line, col, pos);
}
/// Consumes the fixed text `lit` (when it is present at the cursor) and emits
/// a token of type `ty` whose value is `lit`.
fn read_simple(&mut self, lit: &str, ty: TokenType) {
    let start_line = self.line;
    let start_col = self.col;
    let start_byte = self.byte;
    self.match_str(lit);
    self.add(ty, String::from(lit), start_line, start_col, start_byte);
}
/// Lexes a variable reference starting at `$` and emits a `Variable` token.
///
/// Three forms are recognised:
/// * `${...}`: everything up to the closing `}` is the name;
/// * the one-character specials `$?`, `$^`, `$$`, `$_`;
/// * `$name` or `$scope:name`, built from word characters and `:`.
///
/// The token's `value` is the raw source text, `text` the bare name, and
/// `scope` the part before the first `:` when one is present.
///
/// Two cases are handled explicitly:
/// * `$_` is only the one-character special when no word character follows,
///   so `$_foo` is a single variable named `_foo`;
/// * a `::` ends the variable, so `$type::Member` yields the variable `$type`
///   and leaves `::` for the operator reader.
fn read_variable(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let start = self.i;
    self.advance(); // the leading `$`
    let mut scope: Option<String> = None;
    let name: String;
    let first = self.peek(0);
    // `_` followed by a word character begins an ordinary identifier.
    let is_special = "?^$_".contains(first) && !(first == '_' && is_word_cont(self.peek(1)));
    if first == '{' {
        self.advance();
        let mut buf = String::new();
        while self.i < self.n && self.peek(0) != '}' {
            buf.push(self.advance());
        }
        self.match_str("}");
        name = buf;
    } else if is_special {
        name = self.advance().to_string();
    } else {
        let mut buf = String::new();
        while self.i < self.n {
            let c = self.peek(0);
            if c == ':' {
                // `::` is the static-member operator, not a scope separator.
                if self.peek(1) == ':' {
                    break;
                }
            } else if !is_word_cont(c) {
                break;
            }
            let c = self.advance();
            buf.push(c);
            // Only the first `:` splits scope from name; later ones stay in the name.
            if c == ':' && scope.is_none() {
                scope = Some(buf[..buf.len() - 1].to_owned());
                buf.clear();
            }
        }
        name = buf;
    }
    let val = self.slice_str(start, self.i);
    let tok = self.add(TokenType::Variable, val, line, col, pos);
    tok.text = Some(name);
    tok.scope = scope;
}
/// Lexes a splatted variable (`@name` or `@$name`) and emits a `Variable`
/// token with `splat` set.
///
/// `value` keeps the raw text; `text` is that text with all leading `@` and
/// `$` characters removed.
fn read_splat(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let begin = self.i;
    self.advance(); // the `@`
    if self.peek(0) == '$' {
        self.advance();
    }
    while self.i < self.n && is_word_cont(self.peek(0)) {
        self.advance();
    }
    let raw = self.slice_str(begin, self.i);
    let bare = raw
        .trim_start_matches(|c: char| c == '@' || c == '$')
        .to_owned();
    let tok = self.add(TokenType::Variable, raw, line, col, pos);
    tok.splat = true;
    tok.text = Some(bare);
}
/// Lexes a here-string (`@'` ... `'@` or `@"` ... `"@`) and emits a
/// `HereStringSq` or `HereStringDq` token.
///
/// `quote` is the quote character that followed the `@`. The rest of the
/// header line is skipped, and the body then runs until a line that begins
/// with the terminator (`quote` followed by `@`) or the end of input.
///
/// `value` is the raw source text; `text` is the body. Only the line break
/// after the header and the one before the terminator are removed from the
/// body, because they are delimiters rather than content. Blank lines the
/// author wrote at the start or end of the body are kept.
fn read_here_string(&mut self, quote: char) {
    /// Removes at most one leading line break (`\r\n`, `\n` or `\r`).
    fn strip_leading_break(s: &str) -> &str {
        s.strip_prefix("\r\n")
            .or_else(|| s.strip_prefix('\n'))
            .or_else(|| s.strip_prefix('\r'))
            .unwrap_or(s)
    }
    /// Removes at most one trailing line break (`\r\n`, `\n` or `\r`).
    fn strip_trailing_break(s: &str) -> &str {
        s.strip_suffix("\r\n")
            .or_else(|| s.strip_suffix('\n'))
            .or_else(|| s.strip_suffix('\r'))
            .unwrap_or(s)
    }

    let (line, col, pos) = (self.line, self.col, self.byte);
    let start = self.i;
    // Opening `@` and quote; the caller has already verified both.
    self.advance();
    self.advance();
    // Skip whatever remains on the header line.
    while self.i < self.n && self.peek(0) != '\r' && self.peek(0) != '\n' {
        self.advance();
    }
    let terminator: String = format!("{quote}@");
    let term_chars: Vec<char> = terminator.chars().collect();
    let body_start = self.i;
    while self.i < self.n {
        // The terminator only counts at the very beginning of a line.
        let at_line_start = self.i == 0 || matches!(self.chars[self.i - 1], '\r' | '\n');
        if at_line_start && self.chars[self.i..].starts_with(&term_chars) {
            break;
        }
        self.advance();
    }
    let raw: String = self.chars[body_start..self.i].iter().collect();
    let body = strip_trailing_break(strip_leading_break(raw.as_str())).to_owned();
    self.match_str(&terminator);
    let val = self.slice_str(start, self.i);
    let ty = if quote == '\'' {
        TokenType::HereStringSq
    } else {
        TokenType::HereStringDq
    };
    let tok = self.add(ty, val, line, col, pos);
    tok.text = Some(body);
}
/// Lexes a single-quoted string and emits a `StringSq` token.
///
/// A doubled quote (`''`) inside the string stands for one literal `'`.
/// `value` is the raw source text including the quotes; `text` is the decoded
/// content. An unterminated string runs to the end of input.
fn read_single_quoted(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let begin = self.i;
    self.advance(); // opening quote
    let mut decoded = String::new();
    while self.i < self.n {
        if self.peek(0) != '\'' {
            decoded.push(self.advance());
            continue;
        }
        if self.peek(1) != '\'' {
            // A lone quote closes the string.
            self.advance();
            break;
        }
        // `''` is an escaped quote.
        self.advance();
        self.advance();
        decoded.push('\'');
    }
    let raw = self.slice_str(begin, self.i);
    let tok = self.add(TokenType::StringSq, raw, line, col, pos);
    tok.text = Some(decoded);
}
/// Lexes a double-quoted string and emits a `StringDq` token.
///
/// `value` is the raw source text including the quotes; `text` is the decoded
/// content, in which:
/// * a backtick escapes the next character: `` `n `` `` `t `` `` `r `` `` `0 ``
///   `` `a `` `` `b `` `` `f `` `` `v `` `` `e `` become the corresponding
///   control characters, and any other escaped character stands for itself;
/// * a doubled quote (`""`) stands for one literal `"`.
///
/// A trailing backtick at end of input is dropped, and an unterminated string
/// runs to the end of input.
fn read_double_quoted(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let start = self.i;
    self.advance(); // opening quote
    let mut buf = String::new();
    while self.i < self.n {
        let c = self.peek(0);
        if c == '`' {
            self.advance();
            if self.i < self.n {
                let esc = self.advance();
                buf.push(match esc {
                    'n' => '\n',
                    't' => '\t',
                    'r' => '\r',
                    '0' => '\0',
                    'a' => '\x07',
                    'b' => '\x08',
                    // Form feed, vertical tab and ESC are also documented escapes.
                    'f' => '\x0c',
                    'v' => '\x0b',
                    'e' => '\x1b',
                    other => other,
                });
            }
        } else if c == '"' {
            if self.peek(1) == '"' {
                // `""` is an escaped quote.
                self.advance();
                self.advance();
                buf.push('"');
            } else {
                // A lone quote closes the string.
                self.advance();
                break;
            }
        } else {
            buf.push(self.advance());
        }
    }
    let val = self.slice_str(start, self.i);
    let tok = self.add(TokenType::StringDq, val, line, col, pos);
    tok.text = Some(buf);
}
/// Lexes a numeric literal and emits a `Number` token whose value is the raw
/// source text.
///
/// Accepted shapes:
/// * `0x` / `0X` followed by any run of hex digits;
/// * decimal digits with at most one `.` (only when a digit follows it) and
///   `e`/`E` exponent markers with an optional sign;
/// * either form may be followed by a two-letter size suffix (`kb`, `mb`,
///   `gb`, `tb`, `pb`, case-insensitive) or, failing that, one of the single
///   letters `l`, `d`, `u` (case-insensitive).
fn read_number(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let begin = self.i;
    let hex_prefix = self.peek(0) == '0' && matches!(self.peek(1), 'x' | 'X');
    if hex_prefix {
        self.advance();
        self.advance();
        while self.i < self.n && is_hex(self.peek(0)) {
            self.advance();
        }
    } else {
        let mut seen_dot = false;
        while self.i < self.n {
            let c = self.peek(0);
            let after = self.peek(1);
            if is_digit(c) {
                self.advance();
            } else if c == '.' && !seen_dot && is_digit(after) {
                seen_dot = true;
                self.advance();
            } else if matches!(c, 'e' | 'E') && (is_digit(after) || matches!(after, '+' | '-')) {
                self.advance();
                if matches!(self.peek(0), '+' | '-') {
                    self.advance();
                }
            } else {
                break;
            }
        }
    }
    // Look at (up to) the next two characters for a size suffix.
    let tail_end = (self.i + 2).min(self.n);
    let tail = self.slice_str(self.i, tail_end).to_lowercase();
    if matches!(tail.as_str(), "kb" | "mb" | "gb" | "tb" | "pb") {
        self.advance();
        self.advance();
    } else if self.i < self.n
        && matches!(self.chars[self.i].to_ascii_lowercase(), 'l' | 'd' | 'u')
    {
        self.advance();
    }
    let text = self.slice_str(begin, self.i);
    self.add(TokenType::Number, text, line, col, pos);
}
/// Lexes a token that starts with `-`.
///
/// * When the next character cannot start a word, the result is an `Operator`
///   token: `--`, `-=`, or a plain `-`.
/// * Otherwise the following word is read. If `is_named_operator_word`
///   accepts its lowercase form, an `Operator` token is emitted with the
///   lowercase word in `text`; if not, a `Parameter` token is emitted with the
///   word, in its original case, in `text`. In both cases `value` is `-`
///   followed by the word as written.
///
/// `-=` has to be recognised here because the main loop routes every `-` to
/// this method before the general operator reader runs. Without this branch a
/// compound subtraction-assignment would be split into `-` and `=`.
fn read_hyphen(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let nxt = self.peek(1);
    if !is_word_start(nxt) {
        self.advance();
        let op = match self.peek(0) {
            '-' => {
                self.advance();
                "--"
            }
            '=' => {
                self.advance();
                "-="
            }
            _ => "-",
        };
        self.add(TokenType::Operator, op.into(), line, col, pos);
        return;
    }
    self.advance(); // the `-`
    let wstart = self.i;
    while self.i < self.n && is_word_cont(self.peek(0)) {
        self.advance();
    }
    let word = self.slice_str(wstart, self.i);
    let lower = word.to_lowercase();
    if crate::ops::is_named_operator_word(&lower, NAMED_OPERATORS) {
        let tok = self.add(TokenType::Operator, format!("-{word}"), line, col, pos);
        tok.text = Some(lower);
    } else {
        let tok = self.add(TokenType::Parameter, format!("-{word}"), line, col, pos);
        tok.text = Some(word);
    }
}
/// Lexes a redirection operator and emits a `Redirect` token whose value is
/// the text consumed.
///
/// Shape: an optional stream prefix (`*` or one digit, only when `>` follows),
/// then either a single `<`, or `>` optionally doubled to `>>` and optionally
/// followed by `&` plus one digit.
fn read_redirection(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let mut text = String::new();
    let first = self.peek(0);
    let has_stream_prefix = (first == '*' || is_digit(first)) && self.peek(1) == '>';
    if has_stream_prefix {
        text.push(self.advance());
    }
    match self.peek(0) {
        '<' => text.push(self.advance()),
        _ => {
            text.push(self.advance());
            // Append form `>>`.
            if self.peek(0) == '>' {
                text.push(self.advance());
            }
            // Merge target such as `&1`.
            if self.peek(0) == '&' && is_digit(self.peek(1)) {
                text.push(self.advance());
                text.push(self.advance());
            }
        }
    }
    self.add(TokenType::Redirect, text, line, col, pos);
}
/// Tries to lex an operator or punctuation token at the cursor.
///
/// Returns `true` when a token was emitted, and `false` (consuming nothing)
/// when the character at the cursor is not an operator or punctuation handled
/// here. Longer spellings are tried before shorter ones, so `??=` wins over
/// `??`, which wins over `?`.
fn read_operator_or_punct(&mut self) -> bool {
    let (line, col, pos) = (self.line, self.col, self.byte);

    // Multi-character tokens, longest first.
    let multi: [(&str, TokenType); 14] = [
        ("??=", TokenType::Operator),
        ("?.", TokenType::Operator),
        ("?[", TokenType::Operator),
        ("::", TokenType::DoubleColon),
        ("++", TokenType::Operator),
        ("..", TokenType::Operator),
        ("+=", TokenType::Operator),
        ("-=", TokenType::Operator),
        ("*=", TokenType::Operator),
        ("/=", TokenType::Operator),
        ("%=", TokenType::Operator),
        ("&&", TokenType::Operator),
        ("||", TokenType::Operator),
        ("??", TokenType::Operator),
    ];
    for &(lit, ty) in multi.iter() {
        // `match_str` consumes `lit` only when the input starts with it.
        if self.match_str(lit) {
            self.add(ty, lit.to_owned(), line, col, pos);
            return true;
        }
    }

    // Single-character punctuation with dedicated token types.
    let current = self.peek(0);
    let punct = match current {
        '|' => Some(TokenType::Pipe),
        '&' => Some(TokenType::Amp),
        ';' => Some(TokenType::Semicolon),
        ',' => Some(TokenType::Comma),
        '.' => Some(TokenType::Dot),
        '(' => Some(TokenType::LParen),
        ')' => Some(TokenType::RParen),
        '{' => Some(TokenType::LBrace),
        '}' => Some(TokenType::RBrace),
        '[' => Some(TokenType::LBracket),
        ']' => Some(TokenType::RBracket),
        _ => None,
    };
    let ty = match punct {
        Some(ty) => ty,
        // Remaining single-character operators.
        None if "+*/%=?:".contains(current) => TokenType::Operator,
        None => return false,
    };
    self.advance();
    self.add(ty, current.to_string(), line, col, pos);
    true
}
/// Lexes a bare word and emits either a `Keyword` or a `Generic` token.
///
/// A word is a run of word characters; a `-` is included only when a
/// word-start character follows it, which keeps `Get-Process` in one piece.
/// If the lowercase form is in `KEYWORDS`, the token is a `Keyword` with the
/// lowercase form in `text`; otherwise it is `Generic` with the original
/// spelling in `text`. `value` is always the word as written.
fn read_word(&mut self) {
    let (line, col, pos) = (self.line, self.col, self.byte);
    let begin = self.i;
    while self.i < self.n {
        let c = self.peek(0);
        let joins_word = is_word_cont(c) || (c == '-' && is_word_start(self.peek(1)));
        if !joins_word {
            break;
        }
        self.advance();
    }
    let word = self.slice_str(begin, self.i);
    let lower = word.to_lowercase();
    let (ty, text) = if KEYWORDS.contains(&lower.as_str()) {
        (TokenType::Keyword, lower)
    } else {
        (TokenType::Generic, word.clone())
    };
    let tok = self.add(ty, word, line, col, pos);
    tok.text = Some(text);
}
/// Reports whether the most recently emitted token is one after which a `.`
/// is treated as member access rather than the start of a number: a variable,
/// `)`, `]`, a quoted string, or a generic word.
///
/// Returns `false` when no token has been emitted yet.
fn prev_allows_member(&self) -> bool {
    self.tokens.last().map_or(false, |prev| {
        matches!(
            prev.ty,
            TokenType::Variable
                | TokenType::RParen
                | TokenType::RBracket
                | TokenType::StringDq
                | TokenType::StringSq
                | TokenType::Generic
        )
    })
}
}
/// Lexes `source` into tokens, ending with an `Eof` token.
///
/// The text is passed through `crate::encoding::strip_bom` first, so token
/// positions are relative to the text that function returns.
pub fn tokenize(source: &str) -> Vec<Token> {
    let lexer = Lexer::new(crate::encoding::strip_bom(source));
    lexer.tokenize()
}
pub fn strip_comments(source: &str) -> String {
strip_comments_tokens(&tokenize(source), source)
}
/// Returns `source` with the bytes of every `Comment` token in `tokens`
/// replaced by spaces. `\r` and `\n` bytes inside comments are kept, so both
/// the byte length and the line structure are preserved.
///
/// `tokens` is expected to come from lexing `source`, with each token's `pos`
/// being a byte offset into it. Ranges are clamped to the length of `source`,
/// so a token that lies partly or wholly past the end is ignored instead of
/// causing a panic. If blanking would leave invalid UTF-8 (possible only when
/// a range does not line up with character boundaries), `source` is returned
/// unchanged.
pub fn strip_comments_tokens(tokens: &[Token], source: &str) -> String {
    let mut bytes = source.as_bytes().to_vec();
    let len = bytes.len();
    for tok in tokens.iter().filter(|t| t.ty == TokenType::Comment) {
        // Clamp both ends so that `start <= end <= len` always holds.
        let start = tok.pos.min(len);
        let end = tok.pos.saturating_add(tok.value.len()).min(len);
        for b in &mut bytes[start..end] {
            if !matches!(*b, b'\r' | b'\n') {
                *b = b' ';
            }
        }
    }
    String::from_utf8(bytes).unwrap_or_else(|_| source.to_owned())
}
// Unit tests for the lexer's public entry points.
#[cfg(test)]
mod tests {
use super::*;
// Lexes `src` and returns only the token types, including the trailing `Eof`.
fn types(src: &str) -> Vec<TokenType> {
tokenize(src).into_iter().map(|t| t.ty).collect()
}
// Pins the classification, raw `value` and normalised `text` of named
// operators, parameters, keywords, hyphenated words, redirections and ranges.
#[test]
fn operators_keywords_and_redirections_unchanged() {
// Lexes `src` and drops the `Eof` token so indices refer to real tokens.
let one = |src: &str| {
let mut t = tokenize(src);
t.retain(|t| !matches!(t.ty, TokenType::Eof));
t
};
// A named operator keeps its spelling in `value`; `text` is lowercased.
let t = one("-Eq");
assert_eq!(t.len(), 1);
assert_eq!(t[0].ty, TokenType::Operator);
assert_eq!(
(t[0].value.as_str(), t[0].text.as_deref()),
("-Eq", Some("eq"))
);
// Any other `-Word` is a parameter; `text` keeps the original case.
let t = one("-Path");
assert_eq!(t[0].ty, TokenType::Parameter);
assert_eq!(
(t[0].value.as_str(), t[0].text.as_deref()),
("-Path", Some("Path"))
);
// Keywords are matched case-insensitively.
let t = one("Function");
assert_eq!(t[0].ty, TokenType::Keyword);
assert_eq!(
(t[0].value.as_str(), t[0].text.as_deref()),
("Function", Some("function"))
);
// A hyphen inside a word does not split it.
let t = one("Get-Process");
assert_eq!(t[0].ty, TokenType::Generic);
assert_eq!(t[0].value, "Get-Process");
let t = one(">>");
assert_eq!((t[0].ty, t[0].value.as_str()), (TokenType::Redirect, ">>"));
// `1..3` is number, range operator, number; the dots are not a decimal point.
let t = one("1..3");
assert_eq!((t[1].ty, t[1].value.as_str()), (TokenType::Operator, ".."));
}
// `$(` must be its own token and must not be read as a variable.
#[test]
fn dollar_paren_is_a_distinct_token() {
let toks = tokenize("$(Get-Date)");
assert_eq!(toks[0].ty, TokenType::DollarParen, "got {:?}", toks[0]);
assert!(!types("$(Get-Date)").contains(&TokenType::Variable));
}
// A scoped variable is split into `scope` and `text` at the colon.
#[test]
fn plain_variable_still_lexes() {
let toks = tokenize("$env:PATH");
assert_eq!(toks[0].ty, TokenType::Variable);
assert_eq!(toks[0].scope.as_deref(), Some("env"));
assert_eq!(toks[0].text.as_deref(), Some("PATH"));
}
// `pos` counts bytes, not characters: `é` occupies two bytes before `$x`.
#[test]
fn token_pos_is_a_byte_offset() {
let toks = tokenize("é $x");
let var = toks.iter().find(|t| t.ty == TokenType::Variable).unwrap();
assert_eq!(&"é $x"[var.pos..var.pos + 2], "$x");
}
// Blanking a comment that contains multi-byte characters keeps the byte
// length of the text and leaves the surrounding code untouched.
#[test]
fn strip_comments_preserves_byte_length_with_unicode() {
let src = "café # cömment\nWrite-Output 1\n";
let stripped = strip_comments(src);
assert_eq!(stripped.len(), src.len());
assert!(stripped.starts_with("café "));
assert!(!stripped.contains("cömment"));
assert!(stripped.contains("Write-Output 1"));
}
}