use std::collections::HashMap;
use std::error::Error;
use std::fmt;
/// Parsed form of the pattern part of a lexer rule.
///
/// The variant descriptions below state which textual form `parse_pattern`
/// turns into each variant. How a variant is ultimately matched against input
/// is decided by whatever consumes this enum and is not defined here.
///
/// `PartialEq`/`Eq` are derived so parsed patterns can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RulePattern {
    /// A single character written between single quotes, e.g. `'a'`.
    CharLiteral(char),
    /// The text written between double quotes, without the quotes.
    StringLiteral(String),
    /// The text written between slashes (without the slashes), or the raw
    /// pattern text when no other form was recognised.
    Regex(String),
    /// A bracket expression kept verbatim, including the brackets and
    /// anything that follows the closing bracket.
    CharSet(String),
    /// A `[start-end]+` range: the two endpoints, in source order.
    CharRangeMatch1(char, char),
    /// A `[start-end]*` range: the two endpoints, in source order.
    CharRangeMatch0(char, char),
    /// A parenthesised list of alternatives separated by `|`.
    Choice(Vec<RulePattern>),
    /// A backslash escape such as `\n` or `\+`, already resolved to the
    /// character it denotes.
    EscapedChar(char),
    /// The pattern `?`.
    AnyChar,
    /// The pattern `?+`.
    AnyCharPlus,
}
/// One rule of a lexer specification: a pattern together with the token it
/// yields or the action attached to it.
#[derive(Debug, Clone)]
pub struct LexerRule {
    /// Pattern the rule is written with.
    pub pattern: RulePattern,
    /// Numeric identifier supplied by whoever builds the rule.
    pub kind: u32,
    /// Token name; the empty string for rules built by `new_with_action`.
    pub name: String,
    /// Name of a context token, when the rule was built by
    /// `new_with_context`.
    pub context_token: Option<String>,
    /// Action code, when the rule was built by `new_with_action`.
    pub action_code: Option<String>,
}

impl LexerRule {
    /// Builds a plain token rule with neither a context token nor an action.
    pub fn new(pattern: RulePattern, kind: u32, name: String) -> Self {
        Self {
            pattern,
            kind,
            name,
            context_token: None,
            action_code: None,
        }
    }

    /// Builds a token rule that additionally records `context_token`.
    pub fn new_with_context(
        pattern: RulePattern,
        kind: u32,
        name: String,
        context_token: String,
    ) -> Self {
        let mut rule = Self::new(pattern, kind, name);
        rule.context_token = Some(context_token);
        rule
    }

    /// Builds an action rule: `kind` is `0`, `name` is empty and
    /// `action_code` holds the given code.
    pub fn new_with_action(pattern: RulePattern, action_code: String) -> Self {
        let mut rule = Self::new(pattern, 0, String::new());
        rule.action_code = Some(action_code);
        rule
    }
}
/// In-memory form of a whole lexer specification.
#[derive(Debug)]
pub struct LexerSpec {
    /// Code placed before the rules section.
    pub prefix_code: String,
    /// The rules, in the order they were added.
    pub rules: Vec<LexerRule>,
    /// Code placed after the rules section.
    pub suffix_code: String,
    /// Extra token names collected alongside the rules.
    pub custom_tokens: Vec<String>,
}

impl LexerSpec {
    /// Creates a specification with empty code sections, no rules and no
    /// custom tokens.
    pub fn new() -> Self {
        Self::default()
    }
}

impl Default for LexerSpec {
    /// Same value as [`LexerSpec::new`]: every field empty.
    fn default() -> Self {
        let prefix_code = String::new();
        let suffix_code = String::new();
        LexerSpec {
            prefix_code,
            rules: Vec::new(),
            suffix_code,
            custom_tokens: Vec::new(),
        }
    }
}
/// Error carrying a human-readable description of what went wrong while
/// parsing.
#[derive(Debug)]
pub struct ParseError {
    message: String,
}

impl ParseError {
    /// Wraps `message` in a `ParseError`.
    pub fn new(message: String) -> Self {
        Self { message }
    }
}

impl fmt::Display for ParseError {
    /// Renders the error as the message prefixed with `Parse error: `.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let ParseError { message } = self;
        write!(f, "Parse error: {}", message)
    }
}

// The default `Error` methods are sufficient: there is no underlying source.
impl Error for ParseError {}
/// Parses the textual form of one pattern into a [`RulePattern`].
///
/// Surrounding whitespace is ignored. The forms are tried in this order and
/// the first one that applies wins:
///
/// 1. `?+` -> `AnyCharPlus`, `?` -> `AnyChar`
/// 2. a backslash followed by exactly one character -> `EscapedChar`
///    (`\n`, `\t` and `\r` become the control characters; any other
///    character stands for itself)
/// 3. `'c'` -> `CharLiteral`
/// 4. `"text"` -> `StringLiteral` (quotes removed)
/// 5. `/text/` -> `Regex` (slashes removed)
/// 6. text starting with `[` and containing `]`: `[a-b]+` -> `CharRangeMatch1`,
///    `[a-b]*` -> `CharRangeMatch0`, anything else -> `CharSet` holding the
///    whole text verbatim. Range endpoints may be a single character,
///    `\xHH` or `\u{H...}`.
/// 7. `(a|b|...)` -> `Choice`, each alternative parsed recursively
/// 8. anything else -> `Regex` holding the trimmed text
///
/// Single characters are recognised by counting characters rather than
/// bytes, so non-ASCII input such as `'é'` or `[а-я]+` takes the same path
/// as its ASCII counterpart.
///
/// # Errors
///
/// No branch of this function currently constructs a `ParseError`; the
/// `Result` return type is part of the signature callers rely on.
fn parse_pattern(input: &str) -> Result<RulePattern, ParseError> {
    // Decodes one endpoint of a `[start-end]` range: `\u{H...}`, `\xHH`, or
    // exactly one literal character. Returns `None` for anything else.
    fn range_endpoint(text: &str) -> Option<char> {
        if let Some(hex) = text
            .strip_prefix("\\u{")
            .and_then(|rest| rest.strip_suffix('}'))
        {
            return u32::from_str_radix(hex, 16).ok().and_then(char::from_u32);
        }
        if text.len() == 4 {
            if let Some(hex) = text.strip_prefix("\\x") {
                return u8::from_str_radix(hex, 16).ok().map(char::from);
            }
        }
        // Count characters, not bytes, so a multi-byte character is accepted.
        let mut chars = text.chars();
        match (chars.next(), chars.next()) {
            (Some(only), None) => Some(only),
            _ => None,
        }
    }

    let trimmed = input.trim();

    match trimmed {
        "?+" => return Ok(RulePattern::AnyCharPlus),
        "?" => return Ok(RulePattern::AnyChar),
        _ => {}
    }

    // `\c`: exactly two characters, the first being a backslash.
    let mut chars = trimmed.chars();
    if let (Some('\\'), Some(escaped), None) = (chars.next(), chars.next(), chars.next()) {
        let actual = match escaped {
            'n' => '\n',
            't' => '\t',
            'r' => '\r',
            // Every other escaped character (`\\`, `\+`, `\(`, ...) denotes itself.
            other => other,
        };
        return Ok(RulePattern::EscapedChar(actual));
    }

    // `'c'`: exactly three characters, quote / any character / quote.
    let mut chars = trimmed.chars();
    if let (Some('\''), Some(ch), Some('\''), None) =
        (chars.next(), chars.next(), chars.next(), chars.next())
    {
        return Ok(RulePattern::CharLiteral(ch));
    }

    if let Some(text) = trimmed
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
    {
        return Ok(RulePattern::StringLiteral(text.to_string()));
    }

    if let Some(text) = trimmed
        .strip_prefix('/')
        .and_then(|rest| rest.strip_suffix('/'))
    {
        return Ok(RulePattern::Regex(text.to_string()));
    }

    if trimmed.starts_with('[') {
        // Only the first `]` is considered the end of the bracket expression.
        if let Some(close) = trimmed.find(']') {
            let inside = &trimmed[1..close];
            let quantifier = &trimmed[close + 1..];
            // The first `-` separates the two endpoints.
            if let Some((low, high)) = inside.split_once('-') {
                if let (Some(start), Some(end)) = (range_endpoint(low), range_endpoint(high)) {
                    match quantifier {
                        "+" => return Ok(RulePattern::CharRangeMatch1(start, end)),
                        "*" => return Ok(RulePattern::CharRangeMatch0(start, end)),
                        // Any other suffix: keep the text as an opaque set.
                        _ => {}
                    }
                }
            }
            return Ok(RulePattern::CharSet(trimmed.to_string()));
        }
    }

    if let Some(content) = trimmed
        .strip_prefix('(')
        .and_then(|rest| rest.strip_suffix(')'))
    {
        // Parentheses without any `|` are not a choice; fall through below.
        if content.contains('|') {
            let alternatives = content
                .split('|')
                .map(parse_pattern)
                .collect::<Result<Vec<_>, _>>()?;
            return Ok(RulePattern::Choice(alternatives));
        }
    }

    Ok(RulePattern::Regex(trimmed.to_string()))
}
/// Parses a complete lexer specification.
///
/// `input` must consist of exactly three sections separated by `%%`:
/// prefix code, rules, suffix code. The prefix and suffix are stored trimmed
/// and otherwise untouched. Each line of the rules section is trimmed and
/// then interpreted as one of:
///
/// * empty line or line starting with `//` - skipped
/// * `%token A, B C` - appends the names (separated by whitespace and/or
///   commas) to `custom_tokens`
/// * `%Context pattern -> Name` - context rule; `Context` must be the name of
///   a named rule defined on an earlier line (names declared with `%token`
///   are not accepted here). `Context` and `pattern` may be separated by any
///   whitespace, not only a space.
/// * `pattern -> { code }` - action rule with an empty name
/// * `pattern -> Name` - token rule; the name `_` is stored as `Whitespace`
/// * `pattern` - token rule named `TOKEN_<kind>`
///
/// The first `->` on a line is taken as the separator. Rules receive
/// consecutive `kind` values starting at 0; directives and skipped lines do
/// not consume a value.
///
/// # Errors
///
/// Returns a boxed [`ParseError`] when the input does not have exactly three
/// sections, when a `%` line has no `->`, when a context rule has no pattern
/// after the context token, or when the context token is unknown.
pub fn parse_spec(input: &str) -> Result<LexerSpec, Box<dyn Error>> {
    // Wraps a message in the boxed error type this function returns.
    fn fail(message: String) -> Box<dyn Error> {
        Box::new(ParseError::new(message))
    }

    let sections: Vec<&str> = input.split("%%").collect();
    if sections.len() != 3 {
        return Err(fail(
            "Input must have exactly 3 sections separated by %%".to_string(),
        ));
    }

    let mut spec = LexerSpec::new();
    spec.prefix_code = sections[0].trim().to_string();
    spec.suffix_code = sections[2].trim().to_string();

    // Names of the token rules seen so far, used to validate context rules.
    let mut token_names: HashMap<String, u32> = HashMap::new();
    let mut kind_counter = 0u32;

    for raw_line in sections[1].trim().lines() {
        let line = raw_line.trim();
        if line.is_empty() || line.starts_with("//") {
            continue;
        }

        // Must be checked before the generic `%` case below.
        if let Some(declared) = line.strip_prefix("%token") {
            spec.custom_tokens.extend(
                declared
                    .split(|c: char| c.is_whitespace() || c == ',')
                    .filter(|name| !name.is_empty())
                    .map(String::from),
            );
            continue;
        }

        let rule = if let Some(body) = line.strip_prefix('%') {
            let (left, token_name) = match body.find("->") {
                Some(pos) => (body[..pos].trim(), body[pos + 2..].trim()),
                None => {
                    return Err(fail(format!(
                        "Context rule must have -> operator: {}",
                        line
                    )))
                }
            };

            // Split on any whitespace so a tab between the context token and
            // the pattern works the same as a space.
            let mut pieces = left.splitn(2, char::is_whitespace);
            let (context_token, pattern_text) = match (pieces.next(), pieces.next()) {
                (Some(context), Some(pattern)) => (context, pattern),
                _ => {
                    return Err(fail(format!(
                        "Invalid context rule format: {}",
                        line
                    )))
                }
            };

            if !token_names.contains_key(context_token) {
                return Err(fail(format!(
                    "Unknown context token '{}' in rule: {}",
                    context_token, line
                )));
            }

            let pattern = parse_pattern(pattern_text)?;
            LexerRule::new_with_context(
                pattern,
                kind_counter,
                token_name.to_string(),
                context_token.to_string(),
            )
        } else if let Some(pos) = line.find("->") {
            let pattern = parse_pattern(&line[..pos])?;
            let target = line[pos + 2..].trim();
            match target
                .strip_prefix('{')
                .and_then(|rest| rest.strip_suffix('}'))
            {
                Some(code) => {
                    // `new_with_action` uses kind 0; give the rule its real
                    // position in the sequence.
                    let mut rule = LexerRule::new_with_action(pattern, code.trim().to_string());
                    rule.kind = kind_counter;
                    rule
                }
                None => {
                    let name = if target == "_" { "Whitespace" } else { target };
                    LexerRule::new(pattern, kind_counter, name.to_string())
                }
            }
        } else {
            let pattern = parse_pattern(line)?;
            LexerRule::new(pattern, kind_counter, format!("TOKEN_{}", kind_counter))
        };

        // Only named, non-action rules can later serve as context tokens.
        if rule.action_code.is_none() && !rule.name.is_empty() {
            token_names.insert(rule.name.clone(), rule.kind);
        }
        spec.rules.push(rule);
        kind_counter += 1;
    }

    Ok(spec)
}