use bincode::Encode;
use cfgrammar::{
header::{
GrmtoolsSectionParser, Header, HeaderError, HeaderErrorKind, HeaderValue, Namespaced,
Setting, Value,
},
markmap::MergeBehavior,
span::{Location, Span},
};
use glob::glob;
use lrpar::{
CTParserBuilder, LexerTypes,
diagnostics::{DiagnosticFormatter, SpannedDiagnosticFormatter},
};
use num_traits::{AsPrimitive, PrimInt, Unsigned};
use proc_macro2::{Ident, TokenStream};
use quote::{ToTokens, TokenStreamExt, format_ident, quote};
use regex::Regex;
use std::{
    any::type_name,
    borrow::Borrow,
    collections::{HashMap, HashSet},
    env::{current_dir, var},
    error::Error,
    fmt::{self, Debug, Display, Write as _},
    fs::{self, File, create_dir_all, read_to_string},
    hash::Hash,
    io::Write,
    marker::PhantomData,
    path::{Path, PathBuf},
    sync::{LazyLock, Mutex},
};
use crate::{DefaultLexerTypes, LRNonStreamingLexer, LRNonStreamingLexerDef, LexFlags, LexerDef};
const RUST_FILE_EXT: &str = "rs";
const ERROR: &str = "[Error]";
const WARNING: &str = "[Warning]";
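// Token names must match this pattern (i.e. be valid Rust identifiers) in order to be
// emitted as constants in the generated module.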
static RE_TOKEN_ID: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^[a-zA-Z_][a-zA-Z_0-9]*$").unwrap());
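// Output paths already generated to during this build, used to reject two lexers being
// written to the same file.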
static GENERATED_PATHS: LazyLock<Mutex<HashSet<PathBuf>>> =
LazyLock::new(|| Mutex::new(HashSet::new()));
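/// The kind of lexer to generate. In a `%grmtools` header this is set via e.g.
/// `lexerkind: LRNonStreamingLexer` (case-insensitive, optionally prefixed with
/// `LexerKind::`).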
#[non_exhaustive]
pub enum LexerKind {
LRNonStreamingLexer,
}
impl<T: Clone> TryFrom<&Value<T>> for LexerKind {
type Error = cfgrammar::header::HeaderError<T>;
fn try_from(it: &Value<T>) -> Result<LexerKind, Self::Error> {
match it {
Value::Flag(_, loc) => Err(HeaderError {
kind: HeaderErrorKind::ConversionError(
"LexerKind",
"Expected `LexerKind` found bool",
),
locations: vec![loc.clone()],
}),
Value::Setting(Setting::Num(_, loc)) => Err(HeaderError {
kind: HeaderErrorKind::ConversionError(
"LexerKind",
"Expected `LexerKind` found numeric",
),
locations: vec![loc.clone()],
}),
Value::Setting(Setting::String(_, loc)) => Err(HeaderError {
kind: HeaderErrorKind::ConversionError(
"LexerKind",
"Expected `LexerKind` found string",
),
locations: vec![loc.clone()],
}),
Value::Setting(Setting::Constructor {
ctor:
Namespaced {
namespace: _,
member: (_, loc),
},
arg: _,
}) => Err(HeaderError {
kind: HeaderErrorKind::ConversionError(
"LexerKind",
"Expected `LexerKind` found constructor",
),
locations: vec![loc.clone()],
}),
Value::Setting(Setting::Array(_, arr_loc, _)) => Err(HeaderError {
kind: HeaderErrorKind::ConversionError(
"LexerKind",
"Expected `LexerKind` found array",
),
locations: vec![arr_loc.clone()],
}),
Value::Setting(Setting::Unitary(Namespaced {
namespace,
member: (member, member_loc),
})) => {
if let Some((ns, loc)) = namespace {
if ns.to_lowercase() != "lexerkind" {
return Err(HeaderError {
kind: HeaderErrorKind::ConversionError(
"LexerKind",
"Expected namespace `LexerKind`",
),
locations: vec![loc.clone()],
});
}
}
if member.to_lowercase() != "lrnonstreaminglexer" {
return Err(HeaderError {
kind: HeaderErrorKind::ConversionError(
"LexerKind",
"Unknown `LexerKind` Variant",
),
locations: vec![member_loc.clone()],
});
}
Ok(LexerKind::LRNonStreamingLexer)
}
}
}
}
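/// The Rust visibility that the generated module is declared with.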
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum Visibility {
Private,
Public,
PublicSuper,
PublicSelf,
PublicCrate,
PublicIn(String),
}
impl ToTokens for Visibility {
fn to_tokens(&self, tokens: &mut TokenStream) {
tokens.extend(match self {
Visibility::Private => quote!(),
Visibility::Public => quote! {pub},
Visibility::PublicSuper => quote! {pub(super)},
Visibility::PublicSelf => quote! {pub(self)},
Visibility::PublicCrate => quote! {pub(crate)},
Visibility::PublicIn(data) => {
let other = str::parse::<TokenStream>(data).unwrap();
quote! {pub(in #other)}
}
})
}
}
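/// The Rust edition that the generated code will target.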
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum RustEdition {
Rust2015,
Rust2018,
Rust2021,
}
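// Small wrappers which control how values are interpolated by `quote!`: an `Option` as
// `::std::option::Option`, a 2-tuple as `(a, b)`, and a `&str` as a `String` expression.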
struct QuoteOption<T>(Option<T>);
impl<T: ToTokens> ToTokens for QuoteOption<T> {
fn to_tokens(&self, tokens: &mut TokenStream) {
tokens.append_all(match self.0 {
Some(ref t) => quote! { ::std::option::Option::Some(#t) },
None => quote! { ::std::option::Option::None },
});
}
}
struct QuoteTuple<T>(T);
impl<A: ToTokens, B: ToTokens> ToTokens for QuoteTuple<(A, B)> {
fn to_tokens(&self, tokens: &mut TokenStream) {
let (a, b) = &self.0;
tokens.append_all(quote!((#a, #b)));
}
}
struct QuoteToString<'a>(&'a str);
impl ToTokens for QuoteToString<'_> {
fn to_tokens(&self, tokens: &mut TokenStream) {
let x = &self.0;
tokens.append_all(quote! { #x.to_string() });
}
}
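// A string-only error type whose `Debug` output matches its `Display` output, so that
// messages propagated via `Box<dyn Error>` print cleanly.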
struct ErrorString(String);
impl fmt::Display for ErrorString {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let ErrorString(s) = self;
write!(f, "{}", s)
}
}
impl fmt::Debug for ErrorString {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let ErrorString(s) = self;
write!(f, "{}", s)
}
}
impl Error for ErrorString {}
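/// A `CTLexerBuilder` allows one to specify the criteria for building a statically
/// generated lexer at compile time, typically from a `build.rs` script.
///
/// A minimal `build.rs` sketch (the path `calc.l`, relative to `src/`, is illustrative):
///
/// ```text
/// CTLexerBuilder::new()
///     .lexer_in_src_dir("calc.l")?
///     .build()?;
/// ```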
pub struct CTLexerBuilder<'a, LexerTypesT: LexerTypes = DefaultLexerTypes<u32>>
where
LexerTypesT::StorageT: Debug + Eq + Hash + ToTokens,
usize: num_traits::AsPrimitive<LexerTypesT::StorageT>,
{
lrpar_config:
Option<Box<dyn Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT> + 'a>>,
lexer_path: Option<PathBuf>,
output_path: Option<PathBuf>,
lexerkind: Option<LexerKind>,
mod_name: Option<&'a str>,
visibility: Visibility,
rust_edition: RustEdition,
rule_ids_map: Option<HashMap<String, LexerTypesT::StorageT>>,
allow_missing_terms_in_lexer: bool,
allow_missing_tokens_in_parser: bool,
warnings_are_errors: bool,
show_warnings: bool,
header: Header<Location>,
#[cfg(test)]
inspect_lexerkind_cb: Option<Box<dyn Fn(&LexerKind) -> Result<(), Box<dyn Error>>>>,
}
impl CTLexerBuilder<'_, DefaultLexerTypes<u32>> {
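    /// Create a new [`CTLexerBuilder`] using the default storage type (`u32`).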
pub fn new() -> Self {
CTLexerBuilder::<DefaultLexerTypes<u32>>::new_with_lexemet()
}
}
impl<'a, LexerTypesT: LexerTypes<LexErrorT = crate::LRLexError> + 'static>
CTLexerBuilder<'a, LexerTypesT>
where
LexerTypesT::StorageT:
'static + Debug + Eq + Hash + PrimInt + Encode + TryFrom<usize> + Unsigned + ToTokens,
usize: AsPrimitive<LexerTypesT::StorageT>,
{
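    /// Create a new [`CTLexerBuilder`] with a user-defined [`LexerTypes`] instance.
    /// `LexerTypesT::StorageT` must be an unsigned integer type large enough to index
    /// every token.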
pub fn new_with_lexemet() -> Self {
let mut header = Header::new();
header.set_default_merge_behavior(MergeBehavior::Ours);
CTLexerBuilder {
lrpar_config: None,
lexer_path: None,
output_path: None,
lexerkind: None,
mod_name: None,
visibility: Visibility::Private,
rust_edition: RustEdition::Rust2021,
rule_ids_map: None,
allow_missing_terms_in_lexer: false,
allow_missing_tokens_in_parser: false,
warnings_are_errors: false,
show_warnings: true,
header,
#[cfg(test)]
inspect_lexerkind_cb: None,
}
}
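    /// Set a callback which will be used to configure the [`CTParserBuilder`] that is
    /// built alongside this lexer, so that the lexer's rule IDs are taken automatically
    /// from the parser's token map. A rough sketch (the grammar path is illustrative):
    ///
    /// ```text
    /// CTLexerBuilder::new()
    ///     .lrpar_config(|ctp| ctp.grammar_in_src_dir("calc.y").unwrap())
    ///     .lexer_in_src_dir("calc.l")?
    ///     .build()?;
    /// ```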
pub fn lrpar_config<F>(mut self, config_func: F) -> Self
where
F: Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT> + 'a,
{
self.lrpar_config = Some(Box::new(config_func));
self
}
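    /// Set the input lexer path to a file relative to this project's `src` directory,
    /// and derive the output path from it automatically: `src/a/b/c.l` is compiled to
    /// `$OUT_DIR/a/b/c.l.rs`. The given path must be relative.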
pub fn lexer_in_src_dir<P>(mut self, srcp: P) -> Result<Self, Box<dyn Error>>
where
P: AsRef<Path>,
{
if !srcp.as_ref().is_relative() {
return Err(format!(
"Lexer path '{}' must be a relative path.",
srcp.as_ref().to_str().unwrap_or("<invalid UTF-8>")
)
.into());
}
let mut lexp = current_dir()?;
lexp.push("src");
lexp.push(srcp.as_ref());
self.lexer_path = Some(lexp);
let mut outp = PathBuf::new();
outp.push(var("OUT_DIR").unwrap());
outp.push(srcp.as_ref().parent().unwrap().to_str().unwrap());
create_dir_all(&outp)?;
let mut leaf = srcp
.as_ref()
.file_name()
.unwrap()
.to_str()
.unwrap()
.to_owned();
write!(leaf, ".{}", RUST_FILE_EXT).ok();
outp.push(leaf);
Ok(self.output_path(outp))
}
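    /// Set the input lexer path to an arbitrary path. In most cases
    /// [`Self::lexer_in_src_dir`] is more convenient, since it also derives the output
    /// path.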
pub fn lexer_path<P>(mut self, inp: P) -> Self
where
P: AsRef<Path>,
{
self.lexer_path = Some(inp.as_ref().to_owned());
self
}
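    /// Set the path the generated Rust source will be written to.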
pub fn output_path<P>(mut self, outp: P) -> Self
where
P: AsRef<Path>,
{
self.output_path = Some(outp.as_ref().to_owned());
self
}
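    /// Force a particular [`LexerKind`], overriding any `lexerkind` entry in the
    /// lexer's `%grmtools` header.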
pub fn lexerkind(mut self, lexerkind: LexerKind) -> Self {
self.lexerkind = Some(lexerkind);
self
}
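    /// Set the name of the generated module. If unset, the name defaults to the lexer
    /// file's stem followed by `_l` (e.g. `calc.l` becomes `calc_l`).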
pub fn mod_name(mut self, mod_name: &'a str) -> Self {
self.mod_name = Some(mod_name);
self
}
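    /// Set the visibility of the generated module. Defaults to
    /// [`Visibility::Private`].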
pub fn visibility(mut self, vis: Visibility) -> Self {
self.visibility = vis;
self
}
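    /// Set the Rust edition the generated code targets. Defaults to
    /// [`RustEdition::Rust2021`].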
pub fn rust_edition(mut self, edition: RustEdition) -> Self {
self.rust_edition = edition;
self
}
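    /// Set this lexer's rule IDs from a map of token names to IDs, typically obtained
    /// from lrpar's token map. When [`Self::lrpar_config`] is used, this map is filled
    /// in automatically from the built parser.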
pub fn rule_ids_map<T: std::borrow::Borrow<HashMap<String, LexerTypesT::StorageT>> + Clone>(
mut self,
rule_ids_map: T,
) -> Self {
self.rule_ids_map = Some(rule_ids_map.borrow().to_owned());
self
}
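    /// Statically compile the `.l` file specified by [`Self::lexer_path`] into Rust,
    /// placing the output in the file specified by [`Self::output_path`]. Returns a
    /// [`CTLexer`] recording which tokens, if any, were missing from the lexer or
    /// unused by the parser.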
pub fn build(mut self) -> Result<CTLexer, Box<dyn Error>> {
let lexerp = self
.lexer_path
.as_ref()
.expect("lexer_path must be specified before processing.");
let outp = self
.output_path
.as_ref()
.expect("output_path must be specified before processing.");
{
let mut lk = GENERATED_PATHS.lock().unwrap();
if lk.contains(outp.as_path()) {
return Err(format!("Generating two lexers to the same path ('{}') is not allowed: use CTLexerBuilder::output_path (and, optionally, CTLexerBuilder::mod_name) to differentiate them.", &outp.to_str().unwrap()).into());
}
lk.insert(outp.clone());
}
let lex_src = read_to_string(lexerp)
.map_err(|e| format!("When reading '{}': {e}", lexerp.display()))?;
let lex_diag = SpannedDiagnosticFormatter::new(&lex_src, lexerp);
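        // Parse the lexer's optional `%grmtools` header and merge it with the settings
        // supplied via the builder.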
let mut header = self.header;
let (parsed_header, _) = GrmtoolsSectionParser::new(&lex_src, false)
.parse()
.map_err(|es| {
let mut out = String::new();
out.push_str(&format!(
"\n{ERROR}{}\n",
lex_diag.file_location_msg(" parsing the `%grmtools` section", None)
));
for e in es {
out.push_str(&indent(" ", &lex_diag.format_error(e).to_string()));
out.push('\n');
}
ErrorString(out)
})?;
header.merge_from(parsed_header)?;
header.mark_used(&"lexerkind".to_string());
let lexerkind = match self.lexerkind {
Some(lexerkind) => lexerkind,
None => {
if let Some(HeaderValue(_, lk_val)) = header.get("lexerkind") {
LexerKind::try_from(lk_val)?
} else {
LexerKind::LRNonStreamingLexer
}
}
};
#[cfg(test)]
if let Some(inspect_lexerkind_cb) = self.inspect_lexerkind_cb {
inspect_lexerkind_cb(&lexerkind)?
}
let (lexerdef, lex_flags): (LRNonStreamingLexerDef<LexerTypesT>, LexFlags) =
match lexerkind {
LexerKind::LRNonStreamingLexer => {
let lex_flags = LexFlags::try_from(&mut header)?;
let lexerdef = LRNonStreamingLexerDef::<LexerTypesT>::new_with_options(
&lex_src, lex_flags,
)
.map_err(|errs| {
let mut out = String::new();
out.push_str(&format!(
"\n{ERROR}{}\n",
lex_diag.file_location_msg("", None)
));
for e in errs {
out.push_str(&indent(" ", &lex_diag.format_error(e).to_string()));
out.push('\n');
}
ErrorString(out)
})?;
let lex_flags = lexerdef.lex_flags().cloned();
(lexerdef, lex_flags.unwrap())
}
};
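        // If an lrpar configuration callback was supplied, build the parser alongside
        // the lexer. The `inspect_rt` hook lexes and parses every file matched by the
        // grammar header's `test_files` globs, collecting any parse errors.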
let ct_parser = if let Some(ref lrcfg) = self.lrpar_config {
let mut closure_lexerdef = lexerdef.clone();
let mut ctp = CTParserBuilder::<LexerTypesT>::new().inspect_rt(Box::new(
move |yacc_header, rtpb, rule_ids_map, grm_path| {
let owned_map = rule_ids_map
.iter()
.map(|(x, y)| (&**x, *y))
.collect::<HashMap<_, _>>();
closure_lexerdef.set_rule_ids(&owned_map);
yacc_header.mark_used(&"test_files".to_string());
let grammar = rtpb.grammar();
let test_glob = yacc_header.get("test_files");
let mut err_str = None;
                    let add_error_line = |err_str: &mut Option<String>, line: String| {
                        err_str
                            .get_or_insert_with(String::new)
                            .push_str(&format!("{}\n", line));
                    };
match test_glob {
Some(HeaderValue(_, Value::Setting(Setting::Array(test_globs, _, _)))) => {
for setting in test_globs {
match setting {
Setting::String(test_files, _) => {
let path_joined = grm_path.parent().unwrap().join(test_files);
let path_str = &path_joined.to_string_lossy();
let mut glob_paths = glob(path_str).map_err(|e| e.to_string())?.peekable();
if glob_paths.peek().is_none() {
return Err(format!("'test_files' glob '{}' matched no paths", path_str)
.to_string()
.into(),
);
}
for path in glob_paths {
let path = path?;
if let Some(ext) = path.extension() {
if let Some(ext) = ext.to_str() {
if ext.starts_with("grm") {
add_error_line(&mut err_str, "test_files extensions beginning with `grm` are reserved.".into());
}
}
}
let input = fs::read_to_string(&path)?;
let l: LRNonStreamingLexer<LexerTypesT> =
closure_lexerdef.lexer(&input);
let errs = rtpb.parse_map(&l, &|_| (), &|_, _| ()).1;
if !errs.is_empty() {
add_error_line(&mut err_str, format!("While parsing {}:", path.display()));
for e in errs {
let e_pp = e.pp(&l, &|t| grammar.token_epp(t));
                                                let e_lines = e_pp.split('\n');
for e in e_lines {
add_error_line(&mut err_str, format!("\t{}", e));
}
}
}
}
}
_ => return Err("Invalid value for setting 'test_files'".into()),
}
}
if let Some(err_str) = err_str {
Err(ErrorString(err_str))?
} else {
Ok(())
}
}
Some(_) => Err("Invalid value for setting 'test_files'".into()),
None => Ok(()),
}
},
));
ctp = lrcfg(ctp);
let ct_parser = ctp.build()?;
self.rule_ids_map = Some(ct_parser.token_map().to_owned());
Some(ct_parser)
} else {
None
};
let mut lexerdef = Box::new(lexerdef);
let unused_header_values = header.unused();
if !unused_header_values.is_empty() {
return Err(
format!("Unused header values: {}", unused_header_values.join(", ")).into(),
);
}
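        // Apply the rule IDs to the lexer, recording tokens used by the grammar but
        // missing from the lexer, and lexer rules never referenced by the grammar.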
let (missing_from_lexer, missing_from_parser) = match self.rule_ids_map {
Some(ref rim) => {
let owned_map = rim
.iter()
.map(|(x, y)| (&**x, *y))
.collect::<HashMap<_, _>>();
let (x, y) = lexerdef.set_rule_ids_spanned(&owned_map);
(
x.map(|a| a.iter().map(|&b| b.to_string()).collect::<HashSet<_>>()),
y.map(|a| {
a.iter()
.map(|(b, span)| (b.to_string(), *span))
.collect::<HashSet<_>>()
}),
)
}
None => (None, None),
};
let mut has_unallowed_missing = false;
let err_indent = " ".repeat(ERROR.len());
if !self.allow_missing_terms_in_lexer {
if let Some(ref mfl) = missing_from_lexer {
if let Some(ct_parser) = &ct_parser {
let grm = ct_parser.yacc_grammar();
let token_spans = mfl
.iter()
.map(|name| {
ct_parser
.yacc_grammar()
.token_span(*grm.tokens_map().get(name.as_str()).unwrap())
.expect("Given token should have a span")
})
.collect::<Vec<_>>();
let yacc_diag = SpannedDiagnosticFormatter::new(
ct_parser.grammar_src(),
ct_parser.grammar_path(),
);
eprintln!(
"{ERROR} these tokens are not referenced in the lexer but defined as follows"
);
eprintln!(
"{err_indent} {}",
yacc_diag.file_location_msg("in the grammar", None)
);
for span in token_spans {
eprintln!(
"{}",
yacc_diag.underline_span_with_text(
span,
"Missing from lexer".to_string(),
'^'
)
);
}
eprintln!();
} else {
eprintln!(
"{ERROR} the following tokens are used in the grammar but are not defined in the lexer:"
);
for n in mfl {
eprintln!(" {}", n);
}
}
has_unallowed_missing = true;
}
}
if !self.allow_missing_tokens_in_parser && self.show_warnings {
if let Some(ref mfp) = missing_from_parser {
let error_prefix = if self.warnings_are_errors {
ERROR
} else {
WARNING
};
let err_indent = " ".repeat(error_prefix.len());
let mut outs = Vec::new();
outs.push(format!("{error_prefix} these tokens are not referenced in the grammar but defined as follows"));
outs.push(format!(
"{err_indent} {}",
lex_diag.file_location_msg("in the lexer", None)
));
for (_, span) in mfp {
let error_contents = lex_diag.underline_span_with_text(
*span,
"Missing from parser".to_string(),
'^',
);
outs.extend(error_contents.lines().map(|s| s.to_string()));
}
for s in outs {
if !self.warnings_are_errors && std::env::var("OUT_DIR").is_ok() {
println!("cargo:warning={}", s)
} else {
eprintln!("{}", s);
}
}
has_unallowed_missing |= self.warnings_are_errors;
}
}
if has_unallowed_missing {
fs::remove_file(outp).ok();
            panic!("Lexer generation failed: errors are listed above.");
}
let mod_name = match self.mod_name {
Some(s) => s.to_owned(),
None => {
let mut stem = lexerp.to_str().unwrap();
loop {
let new_stem = Path::new(stem).file_stem().unwrap().to_str().unwrap();
if stem == new_stem {
break;
}
stem = new_stem;
}
format!("{}_l", stem)
}
};
let mod_name =
match syn::parse_str::<proc_macro2::Ident>(&mod_name) {
Ok(s) => s,
Err(e) => return Err(format!(
"CTLexerBuilder::mod_name(\"{}\") is not a valid rust identifier due to '{}'",
mod_name, e
)
.into()),
};
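        // Generate the body of the `lexerdef()` function, starting with code which
        // rebuilds the lex flags, falling back to the defaults for any flag left unset.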
let mut lexerdef_func_impl = {
let LexFlags {
allow_wholeline_comments,
dot_matches_new_line,
multi_line,
octal,
posix_escapes,
case_insensitive,
unicode,
swap_greed,
ignore_whitespace,
size_limit,
dfa_size_limit,
nest_limit,
} = lex_flags;
let allow_wholeline_comments = QuoteOption(allow_wholeline_comments);
let dot_matches_new_line = QuoteOption(dot_matches_new_line);
let multi_line = QuoteOption(multi_line);
let octal = QuoteOption(octal);
let posix_escapes = QuoteOption(posix_escapes);
let case_insensitive = QuoteOption(case_insensitive);
let unicode = QuoteOption(unicode);
let swap_greed = QuoteOption(swap_greed);
let ignore_whitespace = QuoteOption(ignore_whitespace);
let size_limit = QuoteOption(size_limit);
let dfa_size_limit = QuoteOption(dfa_size_limit);
let nest_limit = QuoteOption(nest_limit);
quote! {
let mut lex_flags = ::lrlex::DEFAULT_LEX_FLAGS;
lex_flags.allow_wholeline_comments = #allow_wholeline_comments.or(::lrlex::DEFAULT_LEX_FLAGS.allow_wholeline_comments);
lex_flags.dot_matches_new_line = #dot_matches_new_line.or(::lrlex::DEFAULT_LEX_FLAGS.dot_matches_new_line);
lex_flags.multi_line = #multi_line.or(::lrlex::DEFAULT_LEX_FLAGS.multi_line);
lex_flags.octal = #octal.or(::lrlex::DEFAULT_LEX_FLAGS.octal);
lex_flags.posix_escapes = #posix_escapes.or(::lrlex::DEFAULT_LEX_FLAGS.posix_escapes);
lex_flags.case_insensitive = #case_insensitive.or(::lrlex::DEFAULT_LEX_FLAGS.case_insensitive);
lex_flags.unicode = #unicode.or(::lrlex::DEFAULT_LEX_FLAGS.unicode);
lex_flags.swap_greed = #swap_greed.or(::lrlex::DEFAULT_LEX_FLAGS.swap_greed);
lex_flags.ignore_whitespace = #ignore_whitespace.or(::lrlex::DEFAULT_LEX_FLAGS.ignore_whitespace);
lex_flags.size_limit = #size_limit.or(::lrlex::DEFAULT_LEX_FLAGS.size_limit);
lex_flags.dfa_size_limit = #dfa_size_limit.or(::lrlex::DEFAULT_LEX_FLAGS.dfa_size_limit);
lex_flags.nest_limit = #nest_limit.or(::lrlex::DEFAULT_LEX_FLAGS.nest_limit);
let lex_flags = lex_flags;
}
};
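        // Quote the start states and rules into the generated source.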
{
let start_states = lexerdef.iter_start_states();
let rules = lexerdef.iter_rules().map(|r| {
let tok_id = QuoteOption(r.tok_id);
let n = QuoteOption(r.name().map(QuoteToString));
let target_state =
QuoteOption(r.target_state().map(|(x, y)| QuoteTuple((x, y))));
let n_span = r.name_span();
let regex = QuoteToString(&r.re_str);
let start_states = r.start_states();
                quote! {
                    Rule::new(::lrlex::unstable_api::InternalPublicApi, #tok_id, #n, #n_span, #regex,
                        vec![#(#start_states),*], #target_state, &lex_flags).unwrap()
                }
});
lexerdef_func_impl.append_all(quote! {
let start_states: Vec<StartState> = vec![#(#start_states),*];
let rules = vec![#(#rules),*];
});
}
let lexerdef_ty = match lexerkind {
LexerKind::LRNonStreamingLexer => {
quote!(::lrlex::LRNonStreamingLexerDef)
}
};
lexerdef_func_impl.append_all(quote! {
#lexerdef_ty::from_rules(start_states, rules)
});
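        // Emit a `pub const N_<NAME>` token ID constant for every token whose name is a
        // valid Rust identifier.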
let mut token_consts = TokenStream::new();
if let Some(rim) = self.rule_ids_map {
let mut rim_sorted = Vec::from_iter(rim.iter());
rim_sorted.sort_by_key(|(k, _)| *k);
for (name, id) in rim_sorted {
if RE_TOKEN_ID.is_match(name) {
let tok_ident = format_ident!("N_{}", name.to_ascii_uppercase());
let storaget =
str::parse::<TokenStream>(type_name::<LexerTypesT::StorageT>()).unwrap();
let tok_const = quote! {
#[allow(dead_code)]
pub const #tok_ident: #storaget = #id;
};
token_consts.extend(tok_const)
}
}
}
let token_consts = token_consts.into_iter();
let out_tokens = {
let lexerdef_param = str::parse::<TokenStream>(type_name::<LexerTypesT>()).unwrap();
let mod_vis = self.visibility;
quote! {
#mod_vis mod #mod_name {
use ::lrlex::{LexerDef, Rule, StartState};
#[allow(dead_code)]
pub fn lexerdef() -> #lexerdef_ty<#lexerdef_param> {
#lexerdef_func_impl
}
#(#token_consts)*
}
}
};
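        // Pretty-print the generated module if possible, and avoid rewriting the output
        // file when its contents are unchanged so that builds stay incremental.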
let unformatted = out_tokens.to_string();
let mut outs = String::new();
let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
outs.push_str(
&syn::parse_str(&unformatted)
.map(|syntax_tree| prettyplease::unparse(&syntax_tree))
.unwrap_or(unformatted),
);
if let Ok(curs) = read_to_string(outp) {
if curs == outs {
return Ok(CTLexer {
missing_from_lexer,
missing_from_parser,
});
}
}
let mut f = File::create(outp)?;
f.write_all(outs.as_bytes())?;
Ok(CTLexer {
missing_from_lexer,
missing_from_parser,
})
}
#[deprecated(
since = "0.11.0",
note = "Please use lexer_in_src_dir() and build() instead"
)]
#[allow(deprecated)]
pub fn process_file_in_src(
self,
srcp: &str,
) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>> {
let mut inp = current_dir()?;
inp.push("src");
inp.push(srcp);
let mut outp = PathBuf::new();
outp.push(var("OUT_DIR").unwrap());
outp.push(Path::new(srcp).parent().unwrap().to_str().unwrap());
create_dir_all(&outp)?;
let mut leaf = Path::new(srcp)
.file_name()
.unwrap()
.to_str()
.unwrap()
.to_owned();
write!(leaf, ".{}", RUST_FILE_EXT).ok();
outp.push(leaf);
self.process_file(inp, outp)
}
#[deprecated(
since = "0.11.0",
note = "Please use lexer_in_src_dir() and build() instead"
)]
pub fn process_file<P, Q>(
mut self,
inp: P,
outp: Q,
) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>>
where
P: AsRef<Path>,
Q: AsRef<Path>,
{
self.lexer_path = Some(inp.as_ref().to_owned());
self.output_path = Some(outp.as_ref().to_owned());
let cl = self.build()?;
Ok((
cl.missing_from_lexer().map(|x| x.to_owned()),
cl.missing_from_parser()
.map(|x| x.iter().map(|(n, _)| n.to_owned()).collect::<HashSet<_>>()),
))
}
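    /// If `true`, tokens used in the grammar but not defined in the lexer do not cause
    /// [`Self::build`] to fail. Defaults to `false`.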
pub fn allow_missing_terms_in_lexer(mut self, allow: bool) -> Self {
self.allow_missing_terms_in_lexer = allow;
self
}
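    /// If `true`, lexer rules not referenced by the grammar are accepted without
    /// warning. Defaults to `false`.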
pub fn allow_missing_tokens_in_parser(mut self, allow: bool) -> Self {
self.allow_missing_tokens_in_parser = allow;
self
}
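    /// If `true`, warnings are treated as errors. Defaults to `false`.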
pub fn warnings_are_errors(mut self, flag: bool) -> Self {
self.warnings_are_errors = flag;
self
}
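    /// If `false`, suppress warnings. Defaults to `true`.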
pub fn show_warnings(mut self, flag: bool) -> Self {
self.show_warnings = flag;
self
}
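    // Each of the following settings mirrors a lex flag: the value is recorded in the
    // header under the matching key, where it takes part in the usual header merging
    // with the lexer's `%grmtools` section.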
pub fn allow_wholeline_comments(mut self, flag: bool) -> Self {
let key = "allow_wholeline_comments".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
),
);
self
}
pub fn dot_matches_new_line(mut self, flag: bool) -> Self {
let key = "dot_matches_new_line".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
),
);
self
}
pub fn multi_line(mut self, flag: bool) -> Self {
let key = "multi_line".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
),
);
self
}
pub fn posix_escapes(mut self, flag: bool) -> Self {
let key = "posix_escapes".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
),
);
self
}
pub fn octal(mut self, flag: bool) -> Self {
let key = "octal".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
),
);
self
}
pub fn swap_greed(mut self, flag: bool) -> Self {
let key = "swap_greed".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
),
);
self
}
pub fn ignore_whitespace(mut self, flag: bool) -> Self {
let key = "ignore_whitespace".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
),
);
self
}
pub fn unicode(mut self, flag: bool) -> Self {
let key = "unicode".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
),
);
self
}
pub fn case_insensitive(mut self, flag: bool) -> Self {
let key = "case_insensitive".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
),
);
self
}
pub fn size_limit(mut self, sz: usize) -> Self {
let key = "size_limit".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Setting(Setting::Num(
sz as u64,
Location::Other("CTLexerBuilder".to_string()),
)),
),
);
self
}
pub fn dfa_size_limit(mut self, sz: usize) -> Self {
let key = "dfa_size_limit".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Setting(Setting::Num(
sz as u64,
Location::Other("CTLexerBuilder".to_string()),
)),
),
);
self
}
pub fn nest_limit(mut self, lim: u32) -> Self {
let key = "nest_limit".to_string();
self.header.insert(
key,
HeaderValue(
Location::Other("CTLexerBuilder".to_string()),
Value::Setting(Setting::Num(
lim as u64,
Location::Other("CTLexerBuilder".to_string()),
)),
),
);
self
}
#[cfg(test)]
pub fn inspect_lexerkind(
mut self,
cb: Box<dyn Fn(&LexerKind) -> Result<(), Box<dyn Error>>>,
) -> Self {
self.inspect_lexerkind_cb = Some(cb);
self
}
}
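/// The result of [`CTLexerBuilder::build`]: records tokens referenced by the grammar
/// but missing from the lexer, and lexer rules never referenced by the grammar.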
pub struct CTLexer {
missing_from_lexer: Option<HashSet<String>>,
missing_from_parser: Option<HashSet<(String, Span)>>,
}
impl CTLexer {
fn missing_from_lexer(&self) -> Option<&HashSet<String>> {
self.missing_from_lexer.as_ref()
}
fn missing_from_parser(&self) -> Option<&HashSet<(String, Span)>> {
self.missing_from_parser.as_ref()
}
}
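/// A builder which generates a Rust module exposing each token's ID as a `T_*`
/// constant, written to `$OUT_DIR/<mod_name>.rs`.
///
/// A rough sketch (the module name is illustrative; `ct_parser` is a built parser):
///
/// ```text
/// CTTokenMapBuilder::<u32>::new("token_map", ct_parser.token_map())
///     .build()?;
/// ```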
#[derive(Debug, Clone)]
pub struct CTTokenMapBuilder<StorageT: Display + ToTokens> {
mod_name: String,
token_map: Vec<(String, TokenStream)>,
rename_map: Option<HashMap<String, String>>,
allow_dead_code: bool,
_marker: PhantomData<StorageT>,
}
impl<StorageT: Display + ToTokens> CTTokenMapBuilder<StorageT> {
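    /// Create a builder for a module named `mod_name` from a map of token names to
    /// token IDs.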
pub fn new(
mod_name: impl Into<String>,
token_map: impl Borrow<HashMap<String, StorageT>>,
) -> Self {
Self {
mod_name: mod_name.into(),
token_map: token_map
.borrow()
.iter()
.map(|(tok_name, tok_value)| (tok_name.clone(), tok_value.to_token_stream()))
.collect(),
rename_map: None,
allow_dead_code: false,
_marker: PhantomData,
}
}
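    /// Provide a map of replacement names for tokens whose names are not valid Rust
    /// identifiers (e.g. mapping `+` to `PLUS`). Unmapped tokens keep their original
    /// names.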
pub fn rename_map<M, I, K, V>(mut self, rename_map: Option<M>) -> Self
where
M: IntoIterator<Item = I>,
I: Borrow<(K, V)>,
K: AsRef<str>,
V: AsRef<str>,
{
self.rename_map = rename_map.map(|rename_map| {
rename_map
.into_iter()
.map(|it| {
let (k, v) = it.borrow();
let k = k.as_ref().into();
let v = v.as_ref().into();
(k, v)
})
.collect()
});
self
}
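    /// If `true`, annotate the generated module with `#[allow(dead_code)]` so that
    /// unreferenced constants do not produce warnings. Defaults to `false`.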
pub fn allow_dead_code(mut self, allow_dead_code: bool) -> Self {
self.allow_dead_code = allow_dead_code;
self
}
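    /// Generate the module, writing it to `$OUT_DIR/<mod_name>.rs`. The file is only
    /// rewritten if its contents have changed.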
pub fn build(&self) -> Result<(), Box<dyn Error>> {
let mut outs = String::new();
let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
let mod_ident = format_ident!("{}", self.mod_name);
write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
let mut token_map_sorted = self.token_map.clone();
token_map_sorted.sort_by(|(l, _), (r, _)| l.cmp(r));
let (token_array, tokens) = token_map_sorted
.iter()
.map(|(k, id)| {
let name = match &self.rename_map {
Some(rmap) => rmap.get(k).unwrap_or(k),
_ => k,
};
let tok_ident: Ident = syn::parse_str(&format!("T_{}", name.to_ascii_uppercase()))
.map_err(|e| {
format!(
"token name {:?} is not a valid Rust identifier: {}; \
consider renaming it via `CTTokenMapBuilder::rename_map`.",
name, e
)
})?;
Ok((
quote! {
#id,
},
quote! {
pub const #tok_ident: #storaget = #id;
},
))
})
.collect::<Result<(TokenStream, TokenStream), Box<dyn Error>>>()?;
let unused_annotation = if self.allow_dead_code {
quote! {#[allow(dead_code)]}
} else {
quote! {}
};
let unformatted = quote! {
#unused_annotation
mod #mod_ident {
#tokens
#[allow(dead_code)]
pub const TOK_IDS: &[#storaget] = &[#token_array];
}
}
.to_string();
let out_mod = syn::parse_str(&unformatted)
.map(|syntax_tree| prettyplease::unparse(&syntax_tree))
.unwrap_or(unformatted);
outs.push_str(&out_mod);
let mut outp = PathBuf::from(var("OUT_DIR")?);
outp.push(&self.mod_name);
outp.set_extension("rs");
if let Ok(curs) = read_to_string(&outp) {
if curs == outs {
return Ok(());
}
}
let mut f = File::create(outp)?;
f.write_all(outs.as_bytes())?;
Ok(())
}
}
#[deprecated(since = "0.14.0", note = "use `lrlex::CTTokenMapBuilder` instead")]
pub fn ct_token_map<StorageT: Display + ToTokens>(
mod_name: &str,
token_map: impl Borrow<HashMap<String, StorageT>>,
rename_map: Option<&HashMap<&str, &str>>,
) -> Result<(), Box<dyn Error>> {
CTTokenMapBuilder::new(mod_name, token_map)
.rename_map(rename_map)
.allow_dead_code(true)
.build()
}
/// Indent each line of `s` with `indent`, dropping any trailing newline from `s` and
/// terminating every line (including the last) with `\n`.
fn indent(indent: &str, s: &str) -> String {
    s.trim_end_matches('\n')
        .lines()
        .map(|line| format!("{indent}{line}\n"))
        .collect()
}
#[cfg(all(not(target_arch = "wasm32"), test))]
mod test {
use std::fs::File;
use std::io::Write;
use super::{CTLexerBuilder, LexerKind};
#[test]
fn test_grmtools_section_lexerkind() {
let lexerkinds = [
"LRNonStreamingLexer",
"lrnonstreaminglexer",
"LexerKind::lrnonstreaminglexer",
"lexerkind::LRNonStreamingLexer",
];
for (i, kind) in lexerkinds.iter().enumerate() {
let lex_src = format!(
"
%grmtools{{lexerkind: {}}}
%%
. ;
",
kind
);
let lex_path = format!(
"{}/test_grmtools_section_lexerkind_{}.l",
env!("OUT_DIR"),
i
);
let mut l_file = File::create(lex_path.clone()).unwrap();
l_file.write_all(lex_src.as_bytes()).unwrap();
CTLexerBuilder::new()
.output_path(format!("{}.rs", lex_path.clone()))
.lexer_path(lex_path.clone())
.inspect_lexerkind(Box::new(move |lexerkind| {
assert!(matches!(lexerkind, &LexerKind::LRNonStreamingLexer));
Ok(())
}))
.build()
.unwrap();
}
}
#[test]
fn test_invalid_identifier_in_derived_mod_name() {
let mut lex_path = std::path::PathBuf::from(env!("OUT_DIR"));
lex_path.push("contains-a-dash.l");
let mut f = File::create(&lex_path).unwrap();
let _ = f.write_all(
r#"
%%
A "A"
"#
.as_bytes(),
);
match CTLexerBuilder::new()
.output_path(format!("{}.rs", lex_path.display()))
.lexer_path(lex_path.clone())
.build()
{
Ok(_) => panic!("Expected error"),
Err(e) => {
let err_string = e.to_string();
assert_eq!(
err_string,
"CTLexerBuilder::mod_name(\"contains-a-dash_l\") is not a valid rust identifier due to 'unexpected token'"
);
}
}
}
}