use crate::parser::{LexerSpec, RulePattern};
use std::collections::HashSet;
include!(concat!(env!("OUT_DIR"), "/template.rs"));
/// Collects the token names referenced as `TokenKind::<Name>` inside a rule's action code.
///
/// A name is the longest run of alphanumeric characters or underscores that
/// directly follows the text `TokenKind::`. The names `Unknown` and `Eof` are
/// skipped, as are occurrences of `TokenKind::` that are not followed by any
/// name character.
///
/// # Returns
///
/// The set of distinct names found, as owned strings.
fn extract_custom_tokens(action_code: &str) -> HashSet<String> {
    const PREFIX: &str = "TokenKind::";

    action_code
        .match_indices(PREFIX)
        .filter_map(|(idx, _)| {
            let rest = &action_code[idx + PREFIX.len()..];
            // `find` reports a *byte* offset, so the slice below always lands on
            // a char boundary even when the name contains multi-byte letters
            // (`is_alphanumeric` accepts non-ASCII alphanumerics).
            let end = rest
                .find(|c: char| !(c.is_alphanumeric() || c == '_'))
                .unwrap_or(rest.len());
            let name = &rest[..end];
            let keep = !name.is_empty() && name != "Unknown" && name != "Eof";
            keep.then(|| name.to_string())
        })
        .collect()
}
/// Renders a rule pattern as regular-expression source text.
///
/// * `CharLiteral`, `EscapedChar` and `StringLiteral` are passed through
///   `regex::escape`.
/// * `Regex` and `CharSet` already hold pattern text and are returned as-is.
/// * `CharRangeMatch1` / `CharRangeMatch0` become a bracket class over the two
///   bounds followed by `+` / `*` respectively.
/// * `Choice` becomes a parenthesized alternation of its members, each rendered
///   recursively.
/// * `AnyChar` is `.` and `AnyCharPlus` is `.+`.
fn pattern_to_regex(pattern: &RulePattern) -> String {
    match pattern {
        RulePattern::CharLiteral(c) | RulePattern::EscapedChar(c) => {
            let literal = c.to_string();
            regex::escape(&literal)
        }
        RulePattern::StringLiteral(text) => regex::escape(text),
        RulePattern::Regex(source) | RulePattern::CharSet(source) => source.clone(),
        RulePattern::CharRangeMatch1(lo, hi) => format!("[{}-{}]+", lo, hi),
        RulePattern::CharRangeMatch0(lo, hi) => format!("[{}-{}]*", lo, hi),
        RulePattern::Choice(options) => {
            let mut rendered: Vec<String> = Vec::new();
            for option in options.iter() {
                rendered.push(pattern_to_regex(option));
            }
            let joined = rendered.join("|");
            format!("({})", joined)
        }
        RulePattern::AnyChar => String::from("."),
        RulePattern::AnyCharPlus => String::from(".+"),
    }
}
/// Escapes `text` so it can be pasted between single quotes as a Rust `char` literal
/// in generated code: newline, tab, carriage return, backslash and single quote are
/// replaced by their backslash escapes; every other character is copied unchanged.
fn escape_for_char_literal(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '\n' => escaped.push_str("\\n"),
            '\t' => escaped.push_str("\\t"),
            '\r' => escaped.push_str("\\r"),
            '\\' => escaped.push_str("\\\\"),
            '\'' => escaped.push_str("\\'"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Produces the Rust expression text that tries to match `pattern` at the start of
/// the generated lexer's `remaining` input.
///
/// The emitted expression evaluates to `Some(<matched text>)` on success and `None`
/// otherwise.
///
/// # Returns
///
/// `(code, needs_regex)`. `needs_regex` is `true` when `code` is a call to
/// `self.match_cached_pattern(remaining, TokenKind::<rule_name>)` instead of inline
/// matching; `generate_lexer` uses that flag to decide whether to emit a
/// `regex_cache.insert` line for the rule.
fn generate_pattern_match_code(pattern: &RulePattern, rule_name: &str) -> (String, bool) {
    match pattern {
        // Single characters are matched inline with `starts_with`.
        RulePattern::CharLiteral(ch) | RulePattern::EscapedChar(ch) => {
            let code = format!(
                "if remaining.starts_with('{}') {{\n Some(remaining.chars().next().unwrap().to_string())\n }} else {{\n None\n }}",
                escape_for_char_literal(&ch.to_string())
            );
            (code, false)
        }
        RulePattern::StringLiteral(s) => {
            // Escape for a double-quoted Rust string literal; backslashes go first
            // so the escapes added afterwards are not escaped again.
            let escaped_s = s
                .replace("\\", "\\\\")
                .replace("\"", "\\\"")
                .replace("\n", "\\n")
                .replace("\t", "\\t")
                .replace("\r", "\\r");
            let code = format!(
                "if remaining.starts_with(\"{}\") {{\n Some(\"{}\".to_string())\n }} else {{\n None\n }}",
                escaped_s, escaped_s
            );
            (code, false)
        }
        RulePattern::AnyChar => {
            // Any single character except a newline.
            let code = "if let Some(ch) = remaining.chars().next() {\n if ch != '\\n' {\n Some(ch.to_string())\n } else {\n None\n }\n } else {\n None\n }".to_string();
            (code, false)
        }
        RulePattern::CharRangeMatch1(start, end) => {
            // The bounds end up inside `'…'` in the generated source, so they need
            // the same escaping as a single-character literal; otherwise a `'` or
            // `\` bound would produce code that does not compile.
            let code = format!(
                "{{
let mut matched = String::new();
let mut chars = remaining.chars();
while let Some(ch) = chars.next() {{
if ch >= '{}' && ch <= '{}' {{
matched.push(ch);
}} else {{
break;
}}
}}
if !matched.is_empty() {{
Some(matched)
}} else {{
None
}}
}}",
                escape_for_char_literal(&start.to_string()),
                escape_for_char_literal(&end.to_string())
            );
            (code, false)
        }
        // Everything else is delegated to the regex registered for this rule.
        RulePattern::AnyCharPlus
        | RulePattern::CharRangeMatch0(..)
        | RulePattern::Regex(_)
        | RulePattern::CharSet(_)
        | RulePattern::Choice(_) => (
            format!("self.match_cached_pattern(remaining, TokenKind::{})", rule_name),
            true,
        ),
    }
}
pub fn generate_lexer(spec: &LexerSpec, source_file: &str) -> String {
let template = LEXER_TEMPLATE;
let mut output = template.to_string();
if !spec.prefix_code.is_empty() {
let prefix_with_newlines = format!("{}\n\n", spec.prefix_code);
output = output.replace(
"// This file is auto-generated.",
&format!("// This file is auto-generated.\n{}", prefix_with_newlines),
);
}
let mut token_kind_variants = String::new();
let mut all_token_names = HashSet::new();
for rule in &spec.rules {
if rule.action_code.is_none() && !rule.name.is_empty() {
if rule.name != "Unknown" && rule.name != "Eof" {
all_token_names.insert(rule.name.clone());
}
}
}
for token_name in &spec.custom_tokens {
if token_name != "Unknown" && token_name != "Eof" {
all_token_names.insert(token_name.clone());
}
}
for rule in &spec.rules {
if let Some(action_code) = &rule.action_code {
let custom_tokens = extract_custom_tokens(action_code);
all_token_names.extend(custom_tokens);
}
}
for token_name in &all_token_names {
if let Some(rule) = spec.rules.iter().find(|r| &r.name == token_name) {
let pattern_desc = pattern_to_regex(&rule.pattern)
.replace('\n', "\\n")
.replace('\t', "\\t")
.replace('\r', "\\r");
token_kind_variants.push_str(&format!("\t{}, // {}\n", token_name, pattern_desc));
} else {
token_kind_variants.push_str(&format!("\t{}, // Custom token\n", token_name));
}
}
let mut regex_code = String::new();
regex_code.push_str(" // Pre-compile patterns that require regex\n");
for rule in &spec.rules {
let (_match_code, needs_regex) = generate_pattern_match_code(&rule.pattern, &rule.name);
if needs_regex {
let regex_pattern = pattern_to_regex(&rule.pattern);
let escaped_pattern = regex_pattern.replace("\\", "\\\\").replace("\"", "\\\"");
regex_code.push_str(&format!(
" regex_cache.insert(TokenKind::{} as u32, Regex::new(\"^{}\").unwrap());\n",
rule.name, escaped_pattern
));
}
}
regex_code.push_str(" ");
let mut rule_match_code = String::new();
for rule in &spec.rules {
if let Some(context_token) = &rule.context_token {
let context_token_name = spec
.rules
.iter()
.find(|r| r.name == *context_token)
.map(|r| r.name.clone())
.unwrap_or_else(|| panic!("Context token '{}' not found", context_token));
let (match_code, _needs_regex) = generate_pattern_match_code(&rule.pattern, &rule.name);
let pattern_desc = pattern_to_regex(&rule.pattern)
.replace('\n', "\\n")
.replace('\t', "\\t")
.replace('\r', "\\r");
rule_match_code.push_str(&format!(
r#" // Context-dependent rule: {} -> {} (after {})
if self.last_token_kind == Some(TokenKind::{}) {{
let matched_opt = {{{}}};
if let Some(matched) = matched_opt {{
let token = Token::new(
TokenKind::{},
matched.clone(),
self.pos,
start_row,
start_col,
matched.len(),
indent,
);
self.advance(&matched);
self.last_token_kind = Some(token.kind.clone());
return Some(token);
}}
}}
"#,
pattern_desc, rule.name, context_token, context_token_name, match_code, rule.name
));
}
}
for rule in &spec.rules {
if rule.context_token.is_none() && rule.action_code.is_some() {
let action_code = rule.action_code.as_ref().unwrap();
let (match_code, _needs_regex) = generate_pattern_match_code(&rule.pattern, &rule.name);
let pattern_desc = pattern_to_regex(&rule.pattern)
.replace('\n', "\\n")
.replace('\t', "\\t")
.replace('\r', "\\r");
rule_match_code.push_str(&format!(
r#" // Action rule: {} -> {{ {} }}
{{
let matched_opt = {{{}}};
if let Some(matched) = matched_opt {{
let matched_str = matched.clone();
// Create token for action code to use
let test_t = Token::new(
TokenKind::Unknown,
matched_str.clone(),
self.pos,
start_row,
start_col,
matched_str.len(),
indent,
);
self.advance(&matched_str);
// Execute action code with available variables
let action_result: Option<Token> = {{
{}
}};
if let Some(token) = action_result {{
self.last_token_kind = Some(token.kind.clone());
return Some(token);
}} else {{
// Continue to next iteration if no token was returned from action
return self.next_token();
}}
}}
}}
"#,
pattern_desc, action_code, match_code, action_code
));
}
}
for rule in &spec.rules {
if rule.context_token.is_none() && rule.action_code.is_none() {
let update_context = if rule.name == "WHITESPACE" || rule.name == "Whitespace" || rule.name == "NEWLINE" || rule.name == "Newline" {
"// Whitespace tokens don't update context"
} else {
"self.last_token_kind = Some(token.kind.clone())"
};
let (match_code, _needs_regex) = generate_pattern_match_code(&rule.pattern, &rule.name);
let pattern_desc = pattern_to_regex(&rule.pattern)
.replace('\n', "\\n")
.replace('\t', "\\t")
.replace('\r', "\\r");
rule_match_code.push_str(&format!(
r#" // Rule: {} -> {}
{{
let matched_opt = {{{}}};
if let Some(matched) = matched_opt {{
let token = Token::new(
TokenKind::{},
matched.clone(),
self.pos,
start_row,
start_col,
matched.len(),
indent,
);
self.advance(&matched);
{};
return Some(token);
}}
}}
"#,
pattern_desc, rule.name, match_code, rule.name, update_context
));
}
}
let mut to_string_method = String::new();
to_string_method.push_str("\t/// Returns a string representation of the token kind for debugging purposes.\n");
to_string_method.push_str("\t///\n");
to_string_method.push_str("\t/// # Returns\n");
to_string_method.push_str("\t///\n");
to_string_method.push_str("\t/// A human-readable string representation of the token kind\n");
to_string_method.push_str("\tpub fn to_string(&self) -> String {\n");
to_string_method.push_str("\t\tmatch self.kind {\n");
for token_name in &all_token_names {
to_string_method.push_str(&format!("\t\t\tTokenKind::{} => \"{}\".to_string(),\n", token_name, token_name));
}
to_string_method.push_str("\t\t\tTokenKind::Unknown => \"UNKNOWN\".to_string(),\n");
to_string_method.push_str("\t\t}\n");
to_string_method.push_str("\t}");
output = output.replace(
"//----<GENERATED_BY>----",
&format!("// Generated from: {}", source_file),
);
output = output.replace("//----<TOKEN_KIND>----", &token_kind_variants);
output = output.replace("//----<REG_EX_CODE>----", ®ex_code);
output = output.replace("//----<RULE_MATCH_CODE>----", &rule_match_code);
output = output.replace("//----<TO_STRING_METHOD>----", &to_string_method);
if !spec.suffix_code.is_empty() {
output.push_str(&format!("\n{}\n", spec.suffix_code));
}
output
}