use crate::ast::{Alternatives, BaseFactor, Factor, IxmlGrammar, Mark, Repetition, Rule, Sequence};
use crate::lexer::Token;
pub struct Parser {
tokens: Vec<Token>,
pos: usize,
}
impl Parser {
pub fn new(tokens: Vec<Token>) -> Self {
Parser { tokens, pos: 0 }
}
fn peek(&self) -> Option<&Token> {
self.tokens.get(self.pos)
}
#[allow(dead_code)]
fn peek_ahead(&self, offset: usize) -> Option<&Token> {
self.tokens.get(self.pos + offset)
}
fn consume(&mut self) -> Option<Token> {
if self.pos < self.tokens.len() {
let token = self.tokens[self.pos].clone();
self.pos += 1;
Some(token)
} else {
None
}
}
fn expect(&mut self, description: &str) -> Result<Token, String> {
self.consume()
.ok_or_else(|| format!("Expected {} but reached end of input", description))
}
fn matches(&self, expected: &Token) -> bool {
self.peek()
.is_some_and(|t| std::mem::discriminant(t) == std::mem::discriminant(expected))
}
fn at_end(&self) -> bool {
self.pos >= self.tokens.len()
}
fn hex_to_char(hex: &str) -> Result<char, String> {
let code_point =
u32::from_str_radix(hex, 16).map_err(|e| format!("Invalid hex value: {}", e))?;
char::from_u32(code_point).ok_or_else(|| format!("Invalid Unicode code point: #{}", hex))
}
pub fn parse_grammar(&mut self) -> Result<IxmlGrammar, String> {
if self.tokens.get(self.pos) == Some(&Token::Ident("ixml".to_string()))
&& self.tokens.get(self.pos + 1) == Some(&Token::Ident("version".to_string()))
{
self.consume(); self.consume();
match self.expect("version string")? {
Token::String(_) => {
}
other => return Err(format!("Expected version string, got {:?}", other)),
}
if !self.matches(&Token::Period) {
return Err("Expected '.' after version declaration".to_string());
}
self.consume(); }
let mut rules = Vec::new();
while !self.at_end() {
rules.push(self.parse_rule()?);
}
if rules.is_empty() {
return Err("Grammar must contain at least one rule".to_string());
}
Ok(IxmlGrammar::new(rules))
}
fn parse_rule(&mut self) -> Result<Rule, String> {
let mark = if self.matches(&Token::At) {
self.consume();
Mark::Attribute
} else if self.matches(&Token::Minus) {
self.consume();
Mark::Hidden
} else if self.matches(&Token::Caret) {
self.consume();
Mark::Promoted
} else {
Mark::None
};
let name = match self.expect("identifier")? {
Token::Ident(s) => s,
other => return Err(format!("Expected identifier, got {:?}", other)),
};
if !self.matches(&Token::Colon) && !self.matches(&Token::Equals) {
return Err(format!("Expected ':' or '=' after rule name '{}'", name));
}
self.consume();
let alternatives = self.parse_alternatives()?;
if !self.matches(&Token::Period) {
return Err(format!("Expected '.' at end of rule '{}'", name));
}
self.consume();
Ok(Rule::new(name, mark, alternatives))
}
fn parse_alternatives(&mut self) -> Result<Alternatives, String> {
let mut alts = vec![self.parse_sequence()?];
while self.matches(&Token::Pipe) || self.matches(&Token::Semicolon) {
self.consume();
alts.push(self.parse_sequence()?);
}
Ok(Alternatives::new(alts))
}
fn parse_sequence(&mut self) -> Result<Sequence, String> {
if self.matches(&Token::Period)
|| self.matches(&Token::Pipe)
|| self.matches(&Token::Semicolon)
|| self.matches(&Token::RParen)
{
return Ok(Sequence::new(vec![]));
}
let mut factors = vec![self.parse_factor()?];
let comma_separated = self.matches(&Token::Comma);
if comma_separated {
while self.matches(&Token::Comma) {
self.consume();
factors.push(self.parse_factor()?);
}
} else {
while !self.at_end()
&& !self.matches(&Token::Period)
&& !self.matches(&Token::Pipe)
&& !self.matches(&Token::Semicolon)
&& !self.matches(&Token::RParen)
{
factors.push(self.parse_factor()?);
}
}
Ok(Sequence::new(factors))
}
fn parse_factor(&mut self) -> Result<Factor, String> {
let base = self.parse_base_factor()?;
let repetition = if self.matches(&Token::Question) {
self.consume();
Some(Repetition::Optional)
} else if self.matches(&Token::DoubleStar) {
self.consume();
let sep = if self.matches(&Token::LParen) {
self.consume();
let s = self.parse_sequence()?;
if !self.matches(&Token::RParen) {
return Err("Expected ')' after separator".to_string());
}
self.consume();
s
} else {
let factor = self.parse_base_factor()?;
Sequence::new(vec![Factor::simple(factor)])
};
Some(Repetition::SeparatedZeroOrMore(Box::new(sep)))
} else if self.matches(&Token::DoublePlus) {
self.consume();
let sep = if self.matches(&Token::LParen) {
self.consume();
let s = self.parse_sequence()?;
if !self.matches(&Token::RParen) {
return Err("Expected ')' after separator".to_string());
}
self.consume();
s
} else {
let factor = self.parse_base_factor()?;
Sequence::new(vec![Factor::simple(factor)])
};
Some(Repetition::SeparatedOneOrMore(Box::new(sep)))
} else if self.matches(&Token::Star) {
self.consume();
Some(Repetition::ZeroOrMore)
} else if self.matches(&Token::Plus) {
self.consume();
Some(Repetition::OneOrMore)
} else {
None
};
Ok(if let Some(rep) = repetition {
Factor::new(base, rep)
} else {
Factor::simple(base)
})
}
fn parse_base_factor(&mut self) -> Result<BaseFactor, String> {
if self.matches(&Token::At) || self.matches(&Token::Minus) || self.matches(&Token::Caret) {
let mark = if self.matches(&Token::At) {
self.consume();
Mark::Attribute
} else if self.matches(&Token::Minus) {
self.consume();
Mark::Hidden
} else {
self.consume(); Mark::Promoted
};
match self.peek() {
Some(Token::String(s)) => {
let s = s.clone();
self.consume();
Ok(BaseFactor::marked_literal(s, mark))
}
Some(Token::HexChar(h)) => {
let hex_str = h.clone();
self.consume();
let ch = Self::hex_to_char(&hex_str)?;
Ok(BaseFactor::marked_literal(ch.to_string(), mark))
}
Some(Token::CharClass(s)) => {
let s = s.clone();
self.consume();
Ok(BaseFactor::marked_charclass(s, false, mark))
}
Some(Token::Ident(s)) => {
let s = s.clone();
self.consume();
Ok(BaseFactor::marked_nonterminal(s, mark))
}
other => Err(format!("Expected string, hex char, character class, or identifier after mark, got {:?}", other)),
}
} else if self.matches(&Token::Plus) {
self.consume();
match self.expect("string or hex char after '+'")? {
Token::String(s) => Ok(BaseFactor::insertion(s)),
Token::HexChar(h) => {
let ch = Self::hex_to_char(&h)?;
Ok(BaseFactor::insertion(ch.to_string()))
}
other => Err(format!(
"Expected string or hex char after '+', got {:?}",
other
)),
}
} else if self.matches(&Token::Tilde) {
self.consume();
match self.expect("character class after '~'")? {
Token::CharClass(s) => Ok(BaseFactor::negated_charclass(s)),
other => Err(format!(
"Expected character class after '~', got {:?}",
other
)),
}
} else {
match self.peek() {
Some(Token::Ident(s)) => {
let s = s.clone();
self.consume();
Ok(BaseFactor::nonterminal(s))
}
Some(Token::String(s)) => {
let s = s.clone();
self.consume();
Ok(BaseFactor::literal(s))
}
Some(Token::CharClass(s)) => {
let s = s.clone();
self.consume();
Ok(BaseFactor::charclass(s))
}
Some(Token::HexChar(h)) => {
let hex_str = h.clone();
self.consume();
let ch = Self::hex_to_char(&hex_str)?;
Ok(BaseFactor::literal(ch.to_string()))
}
Some(Token::LParen) => {
self.consume();
let alts = self.parse_alternatives()?;
if !self.matches(&Token::RParen) {
return Err("Expected ')' after grouped alternatives".to_string());
}
self.consume();
Ok(BaseFactor::group(alts))
}
other => Err(format!("Expected factor, got {:?}", other)),
}
}
}
}
pub fn parse_ixml_grammar(input: &str) -> Result<IxmlGrammar, String> {
use crate::lexer::Lexer;
let mut lexer = Lexer::new(input);
let tokens = lexer
.tokenize()
.map_err(|e| format!("Lexer error: {}", e))?;
let tokens: Vec<Token> = tokens
.into_iter()
.filter(|t| !matches!(t, Token::Eof))
.collect();
let mut parser = Parser::new(tokens);
parser.parse_grammar()
}