use crate::CommonGeneratorConfig;
use crate::generators::lexer_ir::build_scanner_mode_data;
use crate::generators::{GrammarConfig, NamingHelper};
use anyhow::Result;
use scnr2_generate::character_classes::CharacterClasses;
use scnr2_generate::dfa::Dfa;
use scnr2_generate::nfa::Nfa;
use scnr2_generate::pattern::{Lookahead as ScnrLookahead, Pattern};
use scnr2_generate::scanner_data::TransitionToNumericMode;
use scnr2_generate::scanner_mode::ScannerMode as ScnrScannerMode;
use std::fmt::Write;
/// Generates the complete lexer source text for the given grammar.
///
/// The terminal names are derived from `grammar_config` and then passed on to
/// [`generate_lexer_source_with_terminal_names`], which produces the output.
///
/// # Errors
///
/// Propagates any error reported by [`generate_lexer_source_with_terminal_names`].
pub fn generate_lexer_source<C: CommonGeneratorConfig>(
    grammar_config: &GrammarConfig,
    config: &C,
) -> Result<String> {
    use crate::generators::lexer_generator::generate_terminal_names;

    let names = generate_terminal_names(grammar_config);
    generate_lexer_source_with_terminal_names(grammar_config, config, &names)
}
/// Generates the complete lexer source text using already computed terminal names.
///
/// The output consists of the `using` directives, a `namespace` declaration named
/// after `config.user_type_name()`, and inside it the scanner data produced by
/// [`generate_scanner_data_with_terminal_names`].
///
/// # Errors
///
/// Propagates any error reported while generating the scanner data.
pub(crate) fn generate_lexer_source_with_terminal_names<C: CommonGeneratorConfig>(
    grammar_config: &GrammarConfig,
    config: &C,
    terminal_names: &[String],
) -> Result<String> {
    let mut source = String::new();

    // File header: the namespaces the emitted scanner data depends on.
    writeln!(source, "using System;")?;
    writeln!(source, "using System.Collections.Generic;")?;
    writeln!(source, "using Parol.Runtime.Scanner;")?;
    writeln!(source)?;

    // The scanner data class is wrapped in a namespace named after the user type.
    writeln!(source, "namespace {} {{", config.user_type_name())?;
    source.push_str(&generate_scanner_data_with_terminal_names(
        grammar_config,
        config,
        terminal_names,
    )?);
    writeln!(source, "}}")?;

    Ok(source)
}
/// Generates the scanner data class for the given grammar.
///
/// The terminal names are derived from `grammar_config` and then passed on to
/// [`generate_scanner_data_with_terminal_names`], which produces the output.
///
/// # Errors
///
/// Propagates any error reported by [`generate_scanner_data_with_terminal_names`].
pub fn generate_scanner_data<C: CommonGeneratorConfig>(
    grammar_config: &GrammarConfig,
    config: &C,
) -> Result<String> {
    use crate::generators::lexer_generator::generate_terminal_names;

    let names = generate_terminal_names(grammar_config);
    generate_scanner_data_with_terminal_names(grammar_config, config, &names)
}
/// Generates the `<UserType>ScannerData` static class using already computed terminal names.
///
/// The function
/// 1. builds one scanner mode (patterns plus mode transitions) per entry returned by
///    `build_scanner_mode_data`,
/// 2. builds an NFA per mode, computes disjoint character classes shared by all modes
///    and converts every NFA into a DFA,
/// 3. emits the terminal name table, the character classification function, the
///    scanner mode table and the per-mode skip token table.
///
/// # Errors
///
/// Returns an error if the scanner mode data cannot be built, if a lookahead pattern
/// is rejected, if a transition refers to a scanner mode that does not exist, or if
/// NFA/DFA construction fails.
pub(crate) fn generate_scanner_data_with_terminal_names<C: CommonGeneratorConfig>(
    grammar_config: &GrammarConfig,
    config: &C,
    terminal_names: &[String],
) -> Result<String> {
    use crate::parser::parol_grammar::ScannerStateSwitch;

    let mode_data = build_scanner_mode_data(grammar_config, terminal_names)
        .map_err(|e| anyhow::anyhow!(e.to_string()))?;

    // Scanner mode name -> position of the mode in the emitted mode table.
    let mode_indices = mode_data
        .iter()
        .enumerate()
        .map(|(i, mode)| (mode.scanner_name.clone(), i))
        .collect::<std::collections::HashMap<_, _>>();
    // Resolves a mode name to its index; an unknown name is reported as an error
    // instead of aborting the generator with a panic.
    let resolve_mode = |mode_name: &str| -> Result<usize> {
        mode_indices.get(mode_name).copied().ok_or_else(|| {
            anyhow::anyhow!(
                "Scanner transition refers to unknown scanner mode '{}'",
                mode_name
            )
        })
    };

    let mut scanner_modes = Vec::new();
    for mode in mode_data {
        let sc_name = &mode.scanner_name;

        let mut patterns = Vec::new();
        for (rx, terminal_index, lookahead, _) in mode.terminal_mappings {
            // The boolean flag selects positive (`true`) or negative (`false`) lookahead.
            let scnr_lookahead = match lookahead {
                Some((true, pattern)) => ScnrLookahead::positive(pattern)
                    .map_err(|e| anyhow::anyhow!(e.to_string()))?,
                Some((false, pattern)) => ScnrLookahead::negative(pattern)
                    .map_err(|e| anyhow::anyhow!(e.to_string()))?,
                None => ScnrLookahead::None,
            };
            patterns.push(
                Pattern::new(rx, (terminal_index as u32).into()).with_lookahead(scnr_lookahead),
            );
        }

        let mut transitions = Vec::new();
        for (terminal_index, state_switch) in mode.transitions {
            let transition = match state_switch {
                ScannerStateSwitch::SwitchPush(mode_name, _) => {
                    let mode_index = resolve_mode(mode_name.as_str())?;
                    TransitionToNumericMode::PushMode(terminal_index as usize, mode_index)
                }
                ScannerStateSwitch::SwitchPop(_) => {
                    TransitionToNumericMode::PopMode(terminal_index as usize)
                }
                ScannerStateSwitch::Switch(mode_name, _) => {
                    let mode_index = resolve_mode(mode_name.as_str())?;
                    TransitionToNumericMode::SetMode(terminal_index as usize, mode_index)
                }
            };
            transitions.push(transition);
        }
        transitions.sort_by_key(|t| t.token_type());
        scanner_modes.push(ScnrScannerMode::new(sc_name, patterns, transitions));
    }

    // Skip tokens are looked up by scanner name; a mode without a matching scanner
    // configuration gets an empty list.
    let skip_tokens_by_mode = scanner_modes
        .iter()
        .map(|mode| {
            grammar_config
                .scanner_configurations
                .iter()
                .find(|sc| sc.scanner_name == mode.name)
                .map(|sc| sc.skip_tokens.clone())
                .unwrap_or_default()
        })
        .collect::<Vec<Vec<parol_runtime::TerminalIndex>>>();

    let mut nfas = scanner_modes
        .iter()
        .map(|mode| {
            Nfa::build_from_patterns(&mode.patterns).map_err(|e| anyhow::anyhow!(e.to_string()))
        })
        .collect::<Result<Vec<_>>>()?;

    // Character classes are collected over ALL modes first, so that every DFA uses
    // the same disjoint classes and a single classification function suffices.
    let mut character_classes = CharacterClasses::new();
    for nfa in &nfas {
        nfa.collect_character_classes(&mut character_classes);
    }
    character_classes.create_disjoint_character_classes();
    for nfa in &mut nfas {
        nfa.convert_to_disjoint_character_classes(&character_classes);
    }

    let dfas = nfas
        .into_iter()
        .map(|nfa| Dfa::try_from(&nfa).map_err(|e| anyhow::anyhow!(e.to_string())))
        .collect::<Result<Vec<_>>>()?;

    let mut source = String::new();
    let scanner_type_name = NamingHelper::to_upper_camel_case(config.user_type_name()) + "Scanner";

    writeln!(source, " /// <summary>")?;
    writeln!(
        source,
        " /// Scanner tables and helper functions generated by parol for this grammar."
    )?;
    writeln!(source, " /// </summary>")?;
    writeln!(
        source,
        " public static class {}Data {{",
        scanner_type_name
    )?;

    // Terminal name table.
    writeln!(source, " /// <summary>")?;
    writeln!(
        source,
        " /// Ordered terminal names used by scanner and parser diagnostics."
    )?;
    writeln!(source, " /// </summary>")?;
    writeln!(
        source,
        " public static readonly string[] TerminalNames = ["
    )?;
    for name in terminal_names {
        writeln!(source, " \"{}\",", name)?;
    }
    writeln!(source, " ];")?;
    writeln!(source)?;

    // Character classification function.
    generate_match_function(&mut source, &character_classes)?;

    // Scanner mode table; `dfas` was built from `scanner_modes` in order, so both
    // sequences have the same length and can be walked in lockstep.
    writeln!(source, " /// <summary>")?;
    writeln!(
        source,
        " /// Scanner mode table consumed by the scanner runtime."
    )?;
    writeln!(source, " /// </summary>")?;
    writeln!(
        source,
        " public static readonly ScannerMode[] ScannerModes = {{"
    )?;
    for (mode, dfa) in scanner_modes.iter().zip(&dfas) {
        generate_scanner_mode(&mut source, mode, dfa, character_classes.intervals.len())?;
    }
    writeln!(source, " }};")?;
    writeln!(source)?;

    // Per-mode skip token table.
    writeln!(source, " /// <summary>")?;
    writeln!(
        source,
        " /// Scanner-mode-specific token type indices that are skipped by the scanner runtime."
    )?;
    writeln!(source, " /// </summary>")?;
    writeln!(
        source,
        " public static readonly int[][] SkipTokensByScannerMode = {{"
    )?;
    for skip_tokens in &skip_tokens_by_mode {
        let tokens = skip_tokens
            .iter()
            .map(std::string::ToString::to_string)
            .collect::<Vec<_>>()
            .join(", ");
        writeln!(source, " [{}],", tokens)?;
    }
    writeln!(source, " }};")?;
    writeln!(source, " }}")?;

    Ok(source)
}
/// Appends the `MatchFunction` method, which maps a character to its character
/// class index, to `source`.
///
/// One table row is emitted per elementary interval. The class index of a row is
/// the position of the first entry in `character_classes.intervals` that contains
/// that elementary interval. The emitted method performs a binary search over the
/// rows and returns `null` when no row covers the character.
///
/// # Panics
///
/// Panics if an elementary interval is not contained in any entry of
/// `character_classes.intervals`.
fn generate_match_function(
    source: &mut String,
    character_classes: &CharacterClasses,
) -> Result<()> {
    // Fixed lines emitted before the interval rows.
    const PROLOGUE: &[&str] = &[
        " /// <summary>",
        " /// Maps an input character to its scanner character class index.",
        " /// </summary>",
        " /// <param name=\"c\">Character to classify.</param>",
        " /// <returns>The class index, or <c>null</c> if no class matches.</returns>",
        " public static int? MatchFunction(char c) {",
        " var intervals = new (char Start, char End, int ClassIdx)[] {",
    ];
    // Fixed lines emitted after the interval rows: the binary search itself.
    const EPILOGUE: &[&str] = &[
        " };",
        "",
        " int low = 0, high = intervals.Length - 1;",
        " while (low <= high) {",
        " int mid = low + (high - low) / 2;",
        " if (c >= intervals[mid].Start && c <= intervals[mid].End) return intervals[mid].ClassIdx;",
        " if (c < intervals[mid].Start) high = mid - 1;",
        " else low = mid + 1;",
        " }",
        " return null;",
        " }",
    ];

    for line in PROLOGUE {
        source.push_str(line);
        source.push('\n');
    }

    for elementary in &character_classes.elementary_intervals {
        let class_idx = character_classes
            .intervals
            .iter()
            .position(|group| group.contains(elementary))
            .unwrap();
        let first = escape_char(*elementary.start());
        let last = escape_char(*elementary.end());
        writeln!(source, " ('{}', '{}', {}),", first, last, class_idx)?;
    }

    for line in EPILOGUE {
        source.push_str(line);
        source.push('\n');
    }
    Ok(())
}
/// Appends one scanner mode entry to `source`.
///
/// The entry consists of the mode name, the list of mode transitions
/// (`SetMode`, `PushMode`, `PopMode`) and the DFA of the mode as emitted by
/// [`generate_dfa`].
fn generate_scanner_mode(
    source: &mut String,
    mode: &ScnrScannerMode,
    dfa: &Dfa,
    num_classes: usize,
) -> Result<()> {
    source.push_str(" new(\n");
    writeln!(source, " \"{}\",", mode.name)?;

    // Transition list.
    source.push_str(" [\n");
    for transition in &mode.transitions {
        let entry = match transition {
            TransitionToNumericMode::SetMode(token_type, target) => {
                format!(" new(TransitionType.SetMode, {}, {}),", token_type, target)
            }
            TransitionToNumericMode::PushMode(token_type, target) => {
                format!(" new(TransitionType.PushMode, {}, {}),", token_type, target)
            }
            TransitionToNumericMode::PopMode(token_type) => {
                format!(" new(TransitionType.PopMode, {}),", token_type)
            }
        };
        source.push_str(&entry);
        source.push('\n');
    }
    source.push_str(" ],\n");

    generate_dfa(source, dfa, num_classes)?;
    source.push_str(" ),\n");
    Ok(())
}
/// Appends a `new Dfa([...])` expression describing `dfa` to `source`.
///
/// For every state a transition row with exactly `num_classes` slots is emitted:
/// a slot holds `new(<target>)` if the state has a transition for that index and
/// `null` otherwise. The row is followed by the accept data of the state, each
/// entry carrying terminal type, priority and lookahead.
///
/// # Panics
///
/// Panics if a transition carries an index that is not smaller than `num_classes`.
fn generate_dfa(source: &mut String, dfa: &Dfa, num_classes: usize) -> Result<()> {
    source.push_str(" new Dfa([\n");
    for state in &dfa.states {
        source.push_str(" new DfaState(\n");

        // Dense transition row: one slot per class, `None` where no transition exists.
        let mut targets = vec![None; num_classes];
        for transition in &state.transitions {
            targets[transition.elementary_interval_index.as_usize()] =
                Some(transition.target.as_usize());
        }
        let row = targets
            .iter()
            .map(|slot| match slot {
                Some(target) => format!("new({})", target),
                None => String::from("null"),
            })
            .collect::<Vec<_>>()
            .join(", ");
        writeln!(source, " [ {} ],", row)?;

        // Accept data of the state.
        source.push_str(" [\n");
        for accept in &state.accept_data {
            write!(
                source,
                " new({}, {}, ",
                accept.terminal_type.as_usize(),
                accept.priority
            )?;
            generate_lookahead(source, &accept.lookahead, num_classes)?;
            source.push_str("),\n");
        }
        source.push_str(" ]\n");
        source.push_str(" ),\n");
    }
    source.push_str(" ])\n");
    Ok(())
}
fn generate_lookahead(
source: &mut String,
lookahead: &scnr2_generate::pattern::Lookahead,
num_classes: usize,
) -> Result<()> {
match lookahead {
scnr2_generate::pattern::Lookahead::None => write!(source, "new Lookahead.None()")?,
scnr2_generate::pattern::Lookahead::Positive(
scnr2_generate::pattern::AutomatonType::Dfa(d),
) => {
write!(source, "new Lookahead.Positive(")?;
generate_dfa(source, d, num_classes)?;
write!(source, ")")?;
}
scnr2_generate::pattern::Lookahead::Negative(
scnr2_generate::pattern::AutomatonType::Dfa(d),
) => {
write!(source, "new Lookahead.Negative(")?;
generate_dfa(source, d, num_classes)?;
write!(source, ")")?;
}
_ => panic!("Unexpected lookahead type"),
}
Ok(())
}
/// Returns the text to place between the single quotes of a C# `char` literal
/// in order to denote `c`.
///
/// * `'`, `\`, newline, carriage return, tab and NUL use their short escapes.
/// * Printable ASCII characters and the space are emitted verbatim.
/// * Everything else is emitted as a four-digit `\uXXXX` escape.
///
/// A `\uXXXX` escape can express at most U+FFFF. Code points above that are
/// clamped to `\uffff` rather than masked to their low 16 bits: masking would map
/// them to unrelated low characters (U+10041 would become `A`) and so break the
/// ascending order of the interval bounds emitted by the caller.
fn escape_char(c: char) -> String {
    // Largest code point expressible with a `\uXXXX` escape.
    const MAX_UTF16_UNIT: u32 = 0xFFFF;

    match c {
        '\'' => "\\'".to_string(),
        '\\' => "\\\\".to_string(),
        '\n' => "\\n".to_string(),
        '\r' => "\\r".to_string(),
        '\t' => "\\t".to_string(),
        '\0' => "\\0".to_string(),
        _ if c.is_ascii_graphic() || c == ' ' => c.to_string(),
        _ => format!("\\u{:04x}", (c as u32).min(MAX_UTF16_UNIT)),
    }
}