use crate::{
parse::{
context::{FlagId, FlagMaskWord},
insn::{FlagMaskTable, Insn, LiteralTable, ParseGraph},
string_table::{StringInterner, StringTable, SymbolId},
},
types::{CharClass, FieldId, InsnId, IntoSyntaxKind, RuleId, Tag},
};
use std::collections::{BTreeMap, HashMap, HashSet};
pub type GrammarChoiceFn = Box<dyn FnOnce(&mut GrammarBuilder)>;
type ByteDispatchArm = (CharClass, GrammarChoiceFn);
#[macro_export]
macro_rules! choices {
($g:expr, $first:expr $(, $rest:expr)* $(,)?) => {
$g.choices(vec![
Box::new($first),
$(Box::new($rest),)*
])
};
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Repeat {
Exact(u32),
AtLeast(u32),
AtMost(u32),
Between(u32, u32),
}
impl From<u32> for Repeat {
fn from(n: u32) -> Self {
Self::Exact(n)
}
}
impl From<std::ops::RangeFrom<u32>> for Repeat {
fn from(r: std::ops::RangeFrom<u32>) -> Self {
Self::AtLeast(r.start)
}
}
impl From<std::ops::RangeToInclusive<u32>> for Repeat {
fn from(r: std::ops::RangeToInclusive<u32>) -> Self {
Self::AtMost(r.end)
}
}
impl From<std::ops::RangeInclusive<u32>> for Repeat {
fn from(r: std::ops::RangeInclusive<u32>) -> Self {
Self::Between(*r.start(), *r.end())
}
}
struct LiteralInterner {
data: Vec<u8>,
offsets: Vec<u32>,
}
impl LiteralInterner {
fn new() -> Self {
Self {
data: Vec::new(),
offsets: vec![0],
}
}
fn intern(&mut self, bytes: &[u8]) -> u32 {
let id = u32::try_from(self.offsets.len().saturating_sub(1)).unwrap_or(0);
self.data.extend_from_slice(bytes);
self.offsets
.push(u32::try_from(self.data.len()).unwrap_or(0));
id
}
fn seal(&mut self) {
let len_u32 = u32::try_from(self.data.len()).unwrap_or(0);
if self.offsets.last() != Some(&len_u32) {
self.offsets.push(len_u32);
}
}
}
struct FlagMaskInterner {
data: Vec<FlagMaskWord>,
offsets: Vec<u32>,
}
impl FlagMaskInterner {
fn new() -> Self {
Self {
data: Vec::new(),
offsets: vec![0],
}
}
fn intern(&mut self, set_ids: &[FlagId], clear_ids: &[FlagId]) -> u32 {
let mut by_word: BTreeMap<u32, (u64, u64)> = BTreeMap::new();
for &id in set_ids {
by_word.entry(u32::from(id >> 6)).or_default().0 |= 1u64 << (id & 63);
}
for &id in clear_ids {
by_word.entry(u32::from(id >> 6)).or_default().1 |= 1u64 << (id & 63);
}
let id = u32::try_from(self.offsets.len().saturating_sub(1)).unwrap_or(0);
for (word, (set_bits, clear_bits)) in by_word {
self.data.push(FlagMaskWord {
word,
set_bits,
clear_bits,
});
}
self.offsets
.push(u32::try_from(self.data.len()).unwrap_or(0));
id
}
fn seal(&mut self) {
let len_u32 = u32::try_from(self.data.len()).unwrap_or(0);
if self.offsets.last() != Some(&len_u32) {
self.offsets.push(len_u32);
}
}
}
pub struct BuiltGraph {
pub insns: Vec<Insn>,
pub rule_entry: Vec<InsnId>,
pub literal_data: Vec<u8>,
pub literal_offsets: Vec<u32>,
pub jump_tables: Vec<[u32; 256]>,
pub flag_mask_data: Vec<FlagMaskWord>,
pub flag_mask_offsets: Vec<u32>,
pub strings: StringTable,
pub rule_names: Vec<SymbolId>,
pub tag_names: Vec<SymbolId>,
pub class_labels: Vec<SymbolId>,
pub expected_labels: Vec<SymbolId>,
pub field_names: Vec<SymbolId>,
}
impl BuiltGraph {
#[must_use]
pub fn as_graph(&self) -> ParseGraph<'_> {
ParseGraph {
insns: &self.insns,
rule_entry: &self.rule_entry,
jump_tables: &self.jump_tables,
literals: LiteralTable {
data: &self.literal_data,
offsets: &self.literal_offsets,
},
flag_masks: FlagMaskTable {
data: &self.flag_mask_data,
offsets: &self.flag_mask_offsets,
},
strings: &self.strings,
rule_names: &self.rule_names,
tag_names: &self.tag_names,
class_labels: &self.class_labels,
expected_labels: &self.expected_labels,
field_names: &self.field_names,
}
}
#[must_use]
pub fn field_id(&self, name: &str) -> Option<FieldId> {
self.field_names.iter().enumerate().find_map(|(i, &sym)| {
(self.strings.resolve(sym) == name)
.then(|| FieldId::try_from(i).ok())
.flatten()
})
}
#[must_use]
#[inline]
pub fn rule_name_at(&self, i: usize) -> &str {
self.rule_names
.get(i)
.map_or("?", |&sym| self.strings.resolve(sym))
}
}
pub struct GrammarBuilder {
insns: Vec<Insn>,
rule_entry: Vec<InsnId>,
rule_by_name: HashMap<String, RuleId>,
rule_names: Vec<SymbolId>,
pending_calls: Vec<(usize, String)>,
pending_recover: Vec<(usize, String)>,
tag_by_name: HashMap<String, Tag>,
tag_names: Vec<SymbolId>,
field_by_name: HashMap<String, FieldId>,
strings: StringInterner,
literals: LiteralInterner,
flag_masks: FlagMaskInterner,
jump_tables: Vec<[u32; 256]>,
trivia_rule: Option<String>,
auto_trivia: bool,
class_labels: Vec<SymbolId>,
expected_labels: Vec<SymbolId>,
field_names: Vec<SymbolId>,
allow_rule_cycles: bool,
allow_unreachable_rules: bool,
trace_mode: bool,
}
impl GrammarBuilder {
#[must_use]
pub fn new() -> Self {
let mut strings = StringInterner::new();
let default_class = strings.intern("character class");
Self {
insns: Vec::with_capacity(1024),
rule_entry: Vec::new(),
rule_by_name: HashMap::new(),
rule_names: Vec::new(),
pending_calls: Vec::new(),
pending_recover: Vec::new(),
tag_by_name: HashMap::new(),
tag_names: Vec::new(),
field_by_name: HashMap::new(),
strings,
literals: LiteralInterner::new(),
flag_masks: FlagMaskInterner::new(),
jump_tables: Vec::new(),
trivia_rule: None,
auto_trivia: false,
class_labels: vec![default_class],
expected_labels: Vec::new(),
field_names: Vec::new(),
allow_rule_cycles: false,
allow_unreachable_rules: true,
trace_mode: false,
}
}
pub const fn set_trace_mode(&mut self, enabled: bool) -> &mut Self {
self.trace_mode = enabled;
self
}
pub const fn allow_rule_cycles(&mut self, allow: bool) -> &mut Self {
self.allow_rule_cycles = allow;
self
}
pub const fn allow_unreachable_rules(&mut self, allow: bool) -> &mut Self {
self.allow_unreachable_rules = allow;
self
}
pub fn set_trivia_rule(&mut self, name: impl Into<String>) {
self.trivia_rule = Some(name.into());
}
pub fn skip(&mut self) {
if let Some(name) = self.trivia_rule.clone() {
let prev = self.auto_trivia;
self.auto_trivia = false;
self.call(name);
self.auto_trivia = prev;
}
}
pub fn parser_rule<F>(&mut self, name: &str, body: F) -> RuleId
where
F: FnOnce(&mut Self),
{
self.rule_impl(name, body, true)
}
pub fn lexer_rule<F>(&mut self, name: &str, body: F) -> RuleId
where
F: FnOnce(&mut Self),
{
self.rule_impl(name, body, false)
}
pub fn rule<F>(&mut self, name: &str, body: F) -> RuleId
where
F: FnOnce(&mut Self),
{
let id = self.open_rule(name);
body(self);
self.close_rule();
id
}
fn open_rule(&mut self, name: &str) -> RuleId {
let id = RuleId::try_from(self.rule_entry.len()).unwrap_or(0);
self.rule_entry
.push(InsnId::try_from(self.insns.len()).unwrap_or(0));
self.rule_by_name.insert(name.to_string(), id);
let sym = self.strings.intern(name);
self.rule_names.push(sym);
id
}
fn close_rule(&mut self) {
self.emit(Insn::Return);
}
pub fn no_skip<F>(&mut self, body: F)
where
F: FnOnce(&mut Self),
{
let prev = self.auto_trivia;
self.auto_trivia = false;
body(self);
self.auto_trivia = prev;
}
pub fn tag(&mut self, name: &str) -> Tag {
if let Some(&id) = self.tag_by_name.get(name) {
return id;
}
let id = Tag::try_from(self.tag_names.len()).unwrap_or(0);
self.tag_by_name.insert(name.to_string(), id);
let sym = self.strings.intern(name);
self.tag_names.push(sym);
id
}
pub fn emit(&mut self, insn: Insn) -> InsnId {
let id = InsnId::try_from(self.insns.len()).unwrap_or(0);
self.insns.push(insn);
id
}
#[must_use]
pub fn current_ip(&self) -> InsnId {
u32::try_from(self.insns.len()).unwrap_or(u32::MAX)
}
fn patch(&mut self, addr: InsnId, target: InsnId) {
match &mut self.insns[addr as usize] {
Insn::Byte { on_fail, .. }
| Insn::ByteRange { on_fail, .. }
| Insn::Class { on_fail, .. } | Insn::Literal { on_fail, .. }
| Insn::EndOfInput { on_fail }
| Insn::AnyChar { on_fail }
| Insn::Char { on_fail, .. }
| Insn::CharRange { on_fail, .. }
| Insn::IfFlag { on_fail, .. }
| Insn::IfNotFlag { on_fail, .. }
=> *on_fail = target,
Insn::Jump { target: t }
| Insn::Commit { target: t }
| Insn::BackCommit { target: t }
| Insn::NegBackCommit { target: t }
| Insn::PartialCommit { target: t }
=> *t = target,
Insn::Choice { alt } => *alt = target,
Insn::RecoverUntil { resume, .. } => *resume = target,
other => panic!("patch: {other:?} at {addr} has no patchable field"),
}
}
fn patch_table_id(&mut self, addr: InsnId, table_id: u32) {
match &mut self.insns[addr as usize] {
Insn::ByteDispatch { table_id: t } => *t = table_id,
other => panic!("patch_table_id: expected ByteDispatch, got {other:?} at {addr}"),
}
}
pub fn byte(&mut self, b: u8) -> InsnId {
self.emit(Insn::Byte {
byte: b,
on_fail: u32::MAX,
})
}
pub fn byte_range(&mut self, lo: u8, hi: u8) -> InsnId {
self.emit(Insn::ByteRange {
lo,
hi,
on_fail: u32::MAX,
})
}
pub fn class(&mut self, class: CharClass) -> InsnId {
self.emit(Insn::Class {
class,
label_id: 0,
on_fail: u32::MAX,
})
}
pub fn class_with_label(&mut self, class: CharClass, label: &str) -> InsnId {
let sym = self.strings.intern(label);
let label_id = if let Some((i, _)) = self
.class_labels
.iter()
.enumerate()
.find(|&(_, &s)| s == sym)
{
u32::try_from(i).unwrap_or(0)
} else {
let id = u32::try_from(self.class_labels.len()).unwrap_or(0);
self.class_labels.push(sym);
id
};
self.emit(Insn::Class {
class,
label_id,
on_fail: u32::MAX,
})
}
pub fn literal(&mut self, bytes: &[u8]) -> InsnId {
if bytes.len() <= 8 {
let mut buf = [0u8; 8];
for (i, &b) in bytes.iter().enumerate() {
buf[i] = b;
}
return self.emit(Insn::LiteralSmall {
len: u8::try_from(bytes.len()).unwrap_or(0),
bytes: buf,
on_fail: u32::MAX,
});
}
let lit_id = self.literals.intern(bytes);
self.emit(Insn::Literal {
lit_id,
on_fail: u32::MAX,
})
}
pub fn byte_either(&mut self, a: u8, b: u8) -> InsnId {
self.emit(Insn::ByteEither {
a,
b,
on_fail: u32::MAX,
})
}
pub fn byte_in3(&mut self, a: u8, b: u8, c: u8) -> InsnId {
self.emit(Insn::ByteIn3 {
a,
b,
c,
on_fail: u32::MAX,
})
}
pub fn consume_while_class(&mut self, class: CharClass, min: u32) -> InsnId {
self.emit(Insn::ConsumeWhileClass {
class,
label_id: 0,
min,
on_fail: u32::MAX,
})
}
pub fn consume_while_class_with_label(
&mut self,
class: CharClass,
label: &str,
min: u32,
) -> InsnId {
let sym = self.strings.intern(label);
let label_id = if let Some((i, _)) = self
.class_labels
.iter()
.enumerate()
.find(|&(_, &s)| s == sym)
{
u32::try_from(i).unwrap_or(0)
} else {
let id = u32::try_from(self.class_labels.len()).unwrap_or(0);
self.class_labels.push(sym);
id
};
self.emit(Insn::ConsumeWhileClass {
class,
label_id,
min,
on_fail: u32::MAX,
})
}
#[inline]
pub fn consume_while_class1_with_label(&mut self, class: CharClass, label: &str) -> InsnId {
self.consume_while_class_with_label(class, label, 1)
}
pub fn end_of_input(&mut self) -> InsnId {
self.emit(Insn::EndOfInput { on_fail: u32::MAX })
}
pub fn any_char(&mut self) -> InsnId {
self.emit(Insn::AnyChar { on_fail: u32::MAX })
}
pub fn char(&mut self, c: char) -> InsnId {
self.emit(Insn::Char {
codepoint: u32::from(c),
on_fail: u32::MAX,
})
}
pub fn char_range(&mut self, lo: char, hi: char) -> InsnId {
self.emit(Insn::CharRange {
lo: u32::from(lo),
hi: u32::from(hi),
on_fail: u32::MAX,
})
}
pub fn fail(&mut self) -> InsnId {
self.emit(Insn::Fail)
}
pub fn accept(&mut self) {
self.emit(Insn::Accept);
}
pub fn call(&mut self, rule_name: impl Into<String>) -> InsnId {
if self.auto_trivia {
self.skip();
}
let addr = self.current_ip();
self.pending_calls.push((addr as usize, rule_name.into()));
self.emit(Insn::Call { rule: u16::MAX })
}
pub fn call_id(&mut self, rule: RuleId) -> InsnId {
if self.auto_trivia {
self.skip();
}
self.emit(Insn::Call { rule })
}
pub fn capture<F>(&mut self, tag: Tag, body: F)
where
F: FnOnce(&mut Self),
{
self.emit(Insn::CaptureBegin { tag });
body(self);
self.emit(Insn::CaptureEnd { tag });
}
pub fn node<K: IntoSyntaxKind, F>(&mut self, kind: K, body: F)
where
F: FnOnce(&mut Self),
{
self.emit(Insn::NodeBegin {
kind: kind.into_syntax_kind(),
field: None,
});
body(self);
self.emit(Insn::NodeEnd);
}
pub fn node_with_field<K: IntoSyntaxKind, F>(&mut self, kind: K, field: &str, body: F)
where
F: FnOnce(&mut Self),
{
let field_id = self.intern_field(field);
self.emit(Insn::NodeBegin {
kind: kind.into_syntax_kind(),
field: Some(field_id),
});
body(self);
self.emit(Insn::NodeEnd);
}
pub fn intern_field(&mut self, name: &str) -> FieldId {
if let Some(&id) = self.field_by_name.get(name) {
return id;
}
let id = FieldId::try_from(self.field_names.len()).unwrap_or(0);
let sym = self.strings.intern(name);
self.field_names.push(sym);
self.field_by_name.insert(name.to_string(), id);
id
}
pub fn token<K: IntoSyntaxKind, F>(&mut self, kind: K, body: F)
where
F: FnOnce(&mut Self),
{
if self.auto_trivia {
self.skip();
}
self.emit(Insn::TokenBegin {
kind: kind.into_syntax_kind(),
is_trivia: false,
});
let prev = self.auto_trivia;
self.auto_trivia = false;
body(self);
self.auto_trivia = prev;
self.emit(Insn::TokenEnd);
}
pub fn trivia<K: IntoSyntaxKind, F>(&mut self, kind: K, body: F)
where
F: FnOnce(&mut Self),
{
self.emit(Insn::TokenBegin {
kind: kind.into_syntax_kind(),
is_trivia: true,
});
let prev = self.auto_trivia;
self.auto_trivia = false;
body(self);
self.auto_trivia = prev;
self.emit(Insn::TokenEnd);
}
pub fn keyword<K: IntoSyntaxKind>(&mut self, kind: K, word: &'static [u8]) {
self.token(kind, |g| {
g.literal(word);
});
}
pub fn choice<F1, F2>(&mut self, e1: F1, e2: F2)
where
F1: FnOnce(&mut Self),
F2: FnOnce(&mut Self),
{
let choice_ip = self.emit(Insn::Choice { alt: u32::MAX });
e1(self);
let commit_ip = self.emit(Insn::Commit { target: u32::MAX });
let e2_start = self.current_ip();
self.patch(choice_ip, e2_start);
e2(self);
let after = self.current_ip();
self.patch(commit_ip, after);
}
pub fn choices(&mut self, mut alternatives: Vec<GrammarChoiceFn>) {
match alternatives.len() {
0 => {}
1 => {
let f = alternatives.pop().unwrap();
f(self);
}
_ => {
let first = alternatives.remove(0);
self.choice(first, |g| g.choices(alternatives));
}
}
}
pub fn choice_literals(&mut self, words: &[&'static [u8]]) {
match words {
[] => {}
[w] => {
self.literal(w);
}
[w, rest @ ..] => self.choice(
|g| {
g.literal(w);
},
|g| g.choice_literals(rest),
),
}
}
pub fn choice_rule_calls(&mut self, rules: &[&'static str]) {
match rules {
[] => {}
[r] => {
self.call(*r);
}
[r, rest @ ..] => self.choice(
|g| {
g.call(*r);
},
|g| g.choice_rule_calls(rest),
),
}
}
pub fn choice_rules(&mut self, rules: &[&'static str]) {
let alternatives: Vec<GrammarChoiceFn> = rules
.iter()
.map(|&r| {
Box::new(move |g: &mut Self| {
g.call(r);
}) as GrammarChoiceFn
})
.collect();
self.choices(alternatives);
}
pub fn choice3<F1, F2, F3>(&mut self, e1: F1, e2: F2, e3: F3)
where
F1: FnOnce(&mut Self),
F2: FnOnce(&mut Self),
F3: FnOnce(&mut Self),
{
self.choice(e1, |g| g.choice(e2, e3));
}
pub fn choice4<F1, F2, F3, F4>(&mut self, e1: F1, e2: F2, e3: F3, e4: F4)
where
F1: FnOnce(&mut Self),
F2: FnOnce(&mut Self),
F3: FnOnce(&mut Self),
F4: FnOnce(&mut Self),
{
self.choice(e1, |g| g.choice3(e2, e3, e4));
}
pub fn choice5<F1, F2, F3, F4, F5>(&mut self, e1: F1, e2: F2, e3: F3, e4: F4, e5: F5)
where
F1: FnOnce(&mut Self),
F2: FnOnce(&mut Self),
F3: FnOnce(&mut Self),
F4: FnOnce(&mut Self),
F5: FnOnce(&mut Self),
{
self.choice(e1, |g| g.choice4(e2, e3, e4, e5));
}
pub fn choice6<F1, F2, F3, F4, F5, F6>(
&mut self,
e1: F1,
e2: F2,
e3: F3,
e4: F4,
e5: F5,
e6: F6,
) where
F1: FnOnce(&mut Self),
F2: FnOnce(&mut Self),
F3: FnOnce(&mut Self),
F4: FnOnce(&mut Self),
F5: FnOnce(&mut Self),
F6: FnOnce(&mut Self),
{
self.choice(e1, |g| g.choice5(e2, e3, e4, e5, e6));
}
#[doc(alias = "?")]
pub fn optional<F>(&mut self, body: F)
where
F: Fn(&mut Self),
{
let choice_ip = self.emit(Insn::Choice { alt: u32::MAX });
body(self);
let commit_ip = self.emit(Insn::Commit { target: u32::MAX });
let after = self.current_ip();
self.patch(choice_ip, after);
self.patch(commit_ip, after);
}
#[doc(alias = "*")]
pub fn zero_or_more<F>(&mut self, body: F)
where
F: Fn(&mut Self),
{
let choice_ip = self.emit(Insn::Choice { alt: u32::MAX });
let loop_start = self.current_ip();
body(self);
self.emit(Insn::PartialCommit { target: loop_start });
let after = self.current_ip();
self.patch(choice_ip, after);
}
#[doc(alias = "+")]
pub fn one_or_more<F>(&mut self, body: F)
where
F: Fn(&mut Self),
{
body(self);
self.zero_or_more(body);
}
pub fn separated1<E, S>(&mut self, elem: E, sep: S)
where
E: Fn(&mut Self),
S: Fn(&mut Self),
{
elem(self);
self.zero_or_more(|g| {
sep(g);
elem(g);
});
}
pub fn separated<E, S>(&mut self, elem: E, sep: S)
where
E: Fn(&mut Self),
S: Fn(&mut Self),
{
self.optional(|g| {
elem(g);
g.zero_or_more(|g2| {
sep(g2);
elem(g2);
});
});
}
pub fn separated1_trailing_sep<E, S>(&mut self, elem: E, sep: S)
where
E: Fn(&mut Self),
S: Fn(&mut Self) + Clone,
{
self.separated1(elem, sep.clone());
self.optional(|g| sep.clone()(g));
}
pub fn separated_trailing_sep<E, S>(&mut self, elem: E, sep: S)
where
E: Fn(&mut Self),
S: Fn(&mut Self) + Clone,
{
self.optional(|g| {
elem(g);
g.zero_or_more(|g2| {
sep.clone()(g2);
elem(g2);
});
g.optional(|g2| sep.clone()(g2));
});
}
pub fn delimited<O, B, C>(&mut self, open: O, body: B, close: C)
where
O: FnOnce(&mut Self),
B: FnOnce(&mut Self),
C: FnOnce(&mut Self),
{
open(self);
body(self);
close(self);
}
pub fn preceded<P, B>(&mut self, prefix: P, body: B)
where
P: FnOnce(&mut Self),
B: FnOnce(&mut Self),
{
prefix(self);
body(self);
}
pub fn terminated<B, S>(&mut self, body: B, suffix: S)
where
B: FnOnce(&mut Self),
S: FnOnce(&mut Self),
{
body(self);
suffix(self);
}
pub fn lookahead<F>(&mut self, body: F)
where
F: FnOnce(&mut Self),
{
let choice_ip = self.emit(Insn::Choice { alt: u32::MAX });
body(self);
let backcommit = self.emit(Insn::BackCommit { target: u32::MAX });
let fail_ip = self.current_ip();
self.patch(choice_ip, fail_ip);
self.emit(Insn::Fail);
let after = self.current_ip();
self.patch(backcommit, after);
}
pub fn neg_lookahead<F>(&mut self, body: F)
where
F: FnOnce(&mut Self),
{
let choice_ip = self.emit(Insn::Choice { alt: u32::MAX });
body(self);
let neg_ip = self.emit(Insn::NegBackCommit { target: u32::MAX });
let after = self.current_ip();
self.patch(choice_ip, after);
self.patch(neg_ip, after);
}
pub fn not_followed_by<F>(&mut self, probe: F)
where
F: FnOnce(&mut Self),
{
self.neg_lookahead(probe);
}
pub fn expect_label<F>(&mut self, label: &str, body: F)
where
F: FnOnce(&mut Self),
{
let sym = self.strings.intern(label);
let label_id = if let Some((i, _)) = self
.expected_labels
.iter()
.enumerate()
.find(|&(_, &s)| s == sym)
{
u32::try_from(i).unwrap_or(0)
} else {
let id = u32::try_from(self.expected_labels.len()).unwrap_or(0);
self.expected_labels.push(sym);
id
};
self.emit(Insn::RecordExpectedLabel { label_id });
body(self);
}
pub fn context_rule<F>(&mut self, label: &str, body: F)
where
F: FnOnce(&mut Self),
{
let sym = self.strings.intern(label);
let label_id = if let Some((i, _)) = self
.expected_labels
.iter()
.enumerate()
.find(|&(_, &s)| s == sym)
{
u32::try_from(i).unwrap_or(0)
} else {
let id = u32::try_from(self.expected_labels.len()).unwrap_or(0);
self.expected_labels.push(sym);
id
};
self.emit(Insn::PushDiagnosticContext { label_id });
body(self);
self.emit(Insn::PopDiagnosticContext);
}
pub fn hint(&mut self, text: &str) -> InsnId {
let hint_id = self.strings.intern(text);
self.emit(Insn::SetHint { hint_id })
}
pub fn trace(&mut self, label: &str) {
if !self.trace_mode {
return;
}
let label_id = self.strings.intern(label);
self.emit(Insn::TracePoint { label_id });
}
pub fn cut<F>(&mut self, body: F)
where
F: FnOnce(&mut Self),
{
body(self);
let commit_ip = self.emit(Insn::Commit { target: u32::MAX });
self.patch(commit_ip, self.current_ip()); }
pub fn infix_left(&mut self, operand_rule: &str, op_rule: &str) {
self.call(operand_rule);
self.zero_or_more(|g| {
g.call(op_rule);
g.call(operand_rule);
});
}
pub fn recover_until<F>(&mut self, sync_rule: &str, body: F)
where
F: FnOnce(&mut Self),
{
let recover_ip = self.current_ip();
self.emit(Insn::RecoverUntil {
sync_rule: 0, resume: u32::MAX,
});
self.pending_recover
.push((recover_ip as usize, sync_rule.to_string()));
body(self);
let resume_ip = self.current_ip();
self.patch(recover_ip, resume_ip);
self.emit(Insn::RecoveryResume);
}
pub fn repeat<F, R>(&mut self, range: R, body: F)
where
F: Fn(&mut Self),
R: Into<Repeat>,
{
match range.into() {
Repeat::Exact(n) => {
for _ in 0..n {
body(self);
}
}
Repeat::AtLeast(n) => {
for _ in 0..n {
body(self);
}
self.zero_or_more(body);
}
Repeat::AtMost(n) => {
for _ in 0..n {
self.optional(&body);
}
}
Repeat::Between(lo, hi) => {
for _ in 0..lo {
body(self);
}
for _ in 0..hi.saturating_sub(lo) {
self.optional(&body);
}
}
}
}
pub fn if_flag(&mut self, id: FlagId) -> InsnId {
self.emit(Insn::IfFlag {
flag_id: id,
on_fail: u32::MAX,
})
}
pub fn if_not_flag(&mut self, id: FlagId) -> InsnId {
self.emit(Insn::IfNotFlag {
flag_id: id,
on_fail: u32::MAX,
})
}
pub fn with_flags<F>(&mut self, set_ids: &[FlagId], clear_ids: &[FlagId], body: F)
where
F: FnOnce(&mut Self),
{
let mask_id = self.flag_masks.intern(set_ids, clear_ids);
self.emit(Insn::PushFlags { mask_id });
body(self);
self.emit(Insn::PopFlags);
}
pub fn byte_dispatch(&mut self, arms: Vec<ByteDispatchArm>, fallback: Option<GrammarChoiceFn>) {
if self.auto_trivia {
self.skip();
}
let prev = self.auto_trivia;
self.auto_trivia = false;
let dispatch_ip = self.emit(Insn::ByteDispatch { table_id: u32::MAX });
let mut table = [u32::MAX; 256];
let mut exits = Vec::new();
for (class, body) in arms {
let arm_start = self.current_ip();
for b in 0u8..=255 {
if class.contains(b) {
table[b as usize] = arm_start;
}
}
body(self);
exits.push(self.emit(Insn::Jump { target: u32::MAX }));
}
if let Some(body) = fallback {
let fallback_start = self.current_ip();
for slot in &mut table {
if *slot == u32::MAX {
*slot = fallback_start;
}
}
body(self);
exits.push(self.emit(Insn::Jump { target: u32::MAX }));
}
let after = self.current_ip();
let table_id = u32::try_from(self.jump_tables.len()).unwrap_or(0);
for jmp in exits {
self.patch(jmp, after);
}
self.jump_tables.push(table);
self.patch_table_id(dispatch_ip, table_id);
self.auto_trivia = prev;
}
pub fn byte_dispatch_bytes(
&mut self,
arms: Vec<(u8, GrammarChoiceFn)>,
fallback: Option<GrammarChoiceFn>,
) {
#[cfg(debug_assertions)]
{
let mut seen = [false; 256];
for (b, _) in &arms {
let i = *b as usize;
assert!(
!seen[i],
"byte_dispatch_bytes: duplicate leading byte {b:?}"
);
seen[i] = true;
}
}
let arms: Vec<ByteDispatchArm> = arms
.into_iter()
.map(|(b, body)| (CharClass::from_byte(b), body))
.collect();
self.byte_dispatch(arms, fallback);
}
pub fn byte_dispatch_rules(
&mut self,
arms: &[(u8, &'static str)],
fallback_rule: Option<&'static str>,
) {
let arms: Vec<(u8, GrammarChoiceFn)> = arms
.iter()
.map(|&(b, name)| {
(
b,
Box::new(move |g: &mut Self| {
g.call(name);
}) as GrammarChoiceFn,
)
})
.collect();
let fallback = fallback_rule.map(|name| {
Box::new(move |g: &mut Self| {
g.call(name);
}) as GrammarChoiceFn
});
self.byte_dispatch_bytes(arms, fallback);
}
pub fn finish(mut self) -> Result<BuiltGraph, String> {
for (addr, rule_name) in self.pending_calls.drain(..) {
let rule_id = self
.rule_by_name
.get(&rule_name)
.copied()
.ok_or_else(|| format!("undefined rule: {rule_name}"))?;
if let Insn::Call { rule } = &mut self.insns[addr] {
*rule = rule_id;
}
}
for (addr, sync_rule_name) in self.pending_recover.drain(..) {
let rule_id = self
.rule_by_name
.get(&sync_rule_name)
.copied()
.ok_or_else(|| format!("undefined sync rule: {sync_rule_name}"))?;
if let Insn::RecoverUntil { sync_rule, .. } = &mut self.insns[addr] {
*sync_rule = rule_id;
}
}
if !self.allow_rule_cycles {
self.check_rule_cycles()?;
}
if !self.allow_unreachable_rules {
self.check_unreachable_rules()?;
}
self.literals.seal();
self.flag_masks.seal();
Ok(BuiltGraph {
insns: self.insns,
rule_entry: self.rule_entry,
literal_data: self.literals.data,
literal_offsets: self.literals.offsets,
jump_tables: self.jump_tables,
flag_mask_data: self.flag_masks.data,
flag_mask_offsets: self.flag_masks.offsets,
strings: self.strings.finish(),
rule_names: self.rule_names,
tag_names: self.tag_names,
class_labels: self.class_labels,
expected_labels: self.expected_labels,
field_names: self.field_names,
})
}
fn check_rule_cycles(&self) -> Result<(), String> {
struct CycleDfsState<'a> {
edges: &'a [Vec<RuleId>],
in_stack: &'a mut [bool],
order: &'a mut [Option<u32>],
cycle: &'a mut Vec<RuleId>,
counter: &'a mut u32,
}
fn visit(state: &mut CycleDfsState<'_>, r_usize: usize) -> bool {
let r = RuleId::try_from(r_usize).unwrap_or(0);
if state.order[r_usize].is_some() {
if state.in_stack[r_usize] {
state.cycle.push(r);
return true;
}
return false;
}
state.in_stack[r_usize] = true;
*state.counter += 1;
state.order[r_usize] = Some(*state.counter);
for &s in &state.edges[r_usize] {
let s_usize = s as usize;
if visit(state, s_usize) {
if state.cycle.len() == 1 || state.cycle[0] != r {
state.cycle.push(r);
}
state.in_stack[r_usize] = false;
return true;
}
}
state.in_stack[r_usize] = false;
false
}
let num_rules = self.rule_entry.len();
if num_rules == 0 {
return Ok(());
}
let edges = self.build_rule_call_edges();
let mut in_stack = vec![false; num_rules];
let mut order = vec![None; num_rules];
let mut cycle = Vec::new();
let mut counter = 0u32;
for r in 0..num_rules {
if order[r].is_some() {
continue;
}
let mut state = CycleDfsState {
edges: &edges,
in_stack: &mut in_stack,
order: &mut order,
cycle: &mut cycle,
counter: &mut counter,
};
if visit(&mut state, r) {
state.cycle.reverse();
let names: Vec<&str> = state
.cycle
.iter()
.map(|&id| {
self.rule_names
.get(id as usize)
.map_or("?", |&sym| self.strings.resolve(sym))
})
.collect();
let closed: Vec<&str> = names
.iter()
.copied()
.chain(names.first().copied())
.collect();
return Err(format!(
"left-recursive or mutually recursive rules: {}",
closed.join(" -> ")
));
}
}
Ok(())
}
fn check_unreachable_rules(&self) -> Result<(), String> {
let num_rules = self.rule_entry.len();
if num_rules <= 1 {
return Ok(());
}
let edges = self.build_rule_call_edges();
let mut reachable = vec![false; num_rules];
let mut stack: Vec<RuleId> = vec![0];
reachable[0] = true;
while let Some(r) = stack.pop() {
let r_usize = r as usize;
for &s in &edges[r_usize] {
let s_usize = s as usize;
if s_usize < num_rules && !reachable[s_usize] {
reachable[s_usize] = true;
stack.push(s);
}
}
}
for (r, &ok) in reachable.iter().enumerate() {
if !ok {
let name = self
.rule_names
.get(r)
.map_or("?", |&sym| self.strings.resolve(sym));
return Err(format!("unreachable rule: {name}"));
}
}
Ok(())
}
fn build_rule_call_edges(&self) -> Vec<Vec<RuleId>> {
let num_rules = self.rule_entry.len();
if num_rules == 0 {
return vec![];
}
let mut edges: Vec<Vec<RuleId>> = (0..num_rules).map(|_| Vec::new()).collect();
let insns = &self.insns;
let rule_entry = &self.rule_entry;
let jump_tables = &self.jump_tables;
for r in 0..num_rules {
let start = rule_entry[r] as usize;
let mut stack = vec![start];
let mut visited = HashSet::new();
while let Some(ip) = stack.pop() {
if ip >= insns.len() || !visited.insert(ip) {
continue;
}
match &insns[ip] {
Insn::Call { rule } => {
let callee = *rule;
if (callee as usize) < num_rules && !edges[r].contains(&callee) {
edges[r].push(callee);
}
stack.push(ip + 1);
}
Insn::Return | Insn::Fail | Insn::Accept => {}
Insn::Jump { target }
| Insn::Commit { target }
| Insn::BackCommit { target }
| Insn::NegBackCommit { target }
| Insn::PartialCommit { target } => stack.push(*target as usize),
Insn::Choice { alt } => {
stack.push(ip + 1);
stack.push(*alt as usize);
}
Insn::ByteDispatch { table_id } => {
let tid = *table_id as usize;
if tid < jump_tables.len() {
for target in &jump_tables[tid] {
if *target != u32::MAX {
stack.push(*target as usize);
}
}
}
stack.push(ip + 1);
}
Insn::RecoverUntil { resume, .. } => {
stack.push(ip + 1);
if *resume != u32::MAX {
stack.push(*resume as usize);
}
}
Insn::RecoveryResume => stack.push(ip + 1),
_ => {
stack.push(ip + 1);
let ip_u32 = u32::try_from(ip).unwrap_or(0);
if let Some(on_fail) = self.insn_on_fail(ip_u32) {
if on_fail != u32::MAX {
stack.push(on_fail as usize);
}
}
}
}
}
}
edges
}
fn insn_on_fail(&self, addr: u32) -> Option<u32> {
let ip = addr as usize;
if ip >= self.insns.len() {
return None;
}
match &self.insns[ip] {
Insn::Byte { on_fail, .. }
| Insn::ByteRange { on_fail, .. }
| Insn::Class { on_fail, .. } | Insn::Literal { on_fail, .. }
| Insn::EndOfInput { on_fail }
| Insn::AnyChar { on_fail }
| Insn::Char { on_fail, .. }
| Insn::CharRange { on_fail, .. }
| Insn::IfFlag { on_fail, .. }
| Insn::IfNotFlag { on_fail, .. } => Some(*on_fail),
_ => None,
}
}
fn rule_impl<F>(&mut self, name: &str, body: F, with_trivia: bool) -> RuleId
where
F: FnOnce(&mut Self),
{
let prev = self.auto_trivia;
self.auto_trivia = with_trivia;
let id = self.open_rule(name);
body(self);
self.close_rule();
self.auto_trivia = prev;
id
}
}
impl Default for GrammarBuilder {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn finish_rejects_left_recursive_cycle_by_default() {
let mut g = GrammarBuilder::new();
g.parser_rule("a", |g| {
g.call("a");
g.byte(b'x');
});
let res = g.finish();
assert!(res.is_err());
let err = res.err().unwrap();
assert!(err.contains("left-recursive") && err.contains("a"));
}
#[test]
fn finish_accepts_cycle_when_allow_rule_cycles() {
let mut g = GrammarBuilder::new();
g.allow_rule_cycles(true);
g.parser_rule("a", |g| {
g.call("a");
g.byte(b'x');
});
assert!(g.finish().is_ok());
}
#[test]
fn choices_macro() {
use crate::parse::engine::Engine;
let mut g = GrammarBuilder::new();
g.parser_rule("start", |g| {
crate::choices!(
g,
|g| {
g.byte(b'a');
},
|g| {
g.byte(b'b');
},
|g| {
g.byte(b'c');
}
);
g.end_of_input();
g.accept();
});
let built = g.finish().unwrap();
let graph = built.as_graph();
let mut engine = Engine::new();
assert!(engine.parse(&graph, b"a").is_ok());
assert!(engine.parse(&graph, b"b").is_ok());
assert!(engine.parse(&graph, b"c").is_ok());
assert!(engine.parse(&graph, b"d").is_err());
}
#[test]
fn literal_small_is_used_for_short_literals() {
let mut g = GrammarBuilder::new();
g.rule("start", |g| {
g.literal(b"abc");
g.end_of_input();
g.accept();
});
let built = g.finish().unwrap();
assert!(
built
.insns
.iter()
.any(|i| matches!(i, Insn::LiteralSmall { .. })),
"expected LiteralSmall to be emitted for <=8 byte literals"
);
}
}