pub use bumpalo;
use bumpalo::Bump;
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Token {
Start { kind: u32, pos: u32 },
End { kind: u32, pos: u32, start_idx: u32 },
Leaf { kind: u32, start: u32, end: u32 },
}
pub struct ParserState<'a> {
pub input: &'a [u8],
pub tokens: Vec<Token>,
pub arena: Option<&'a Bump>,
pub furthest: u32,
pub lr_memo: std::collections::HashMap<(u32, u32), LrMemoEntry>,
}
impl<'a> ParserState<'a> {
#[inline]
pub fn new(input: &'a [u8]) -> Self {
let cap = input.len().saturating_mul(16).max(64);
Self {
input,
tokens: Vec::with_capacity(cap),
arena: None,
furthest: 0,
lr_memo: std::collections::HashMap::new(),
}
}
#[inline]
pub fn new_check(input: &'a [u8]) -> Self {
Self {
input,
tokens: Vec::new(),
arena: None,
furthest: 0,
lr_memo: std::collections::HashMap::new(),
}
}
#[inline]
pub fn new_in(input: &'a [u8], arena: &'a Bump) -> Self {
let cap = input.len().saturating_mul(16).max(64);
Self {
input,
tokens: Vec::with_capacity(cap),
arena: Some(arena),
furthest: 0,
lr_memo: std::collections::HashMap::new(),
}
}
#[inline(always)]
pub fn checkpoint(&self) -> usize {
self.tokens.len()
}
#[inline(always)]
pub fn truncate(&mut self, checkpoint: usize) {
self.tokens.truncate(checkpoint);
}
#[inline(always)]
unsafe fn push_unchecked(&mut self, t: Token) {
let len = self.tokens.len();
debug_assert!(len < self.tokens.capacity());
let ptr = self.tokens.as_mut_ptr().add(len);
core::ptr::write(ptr, t);
self.tokens.set_len(len + 1);
}
#[inline(always)]
fn push_safe(&mut self, t: Token) {
if self.tokens.len() < self.tokens.capacity() {
unsafe { self.push_unchecked(t) }
} else {
self.tokens.push(t);
}
}
#[inline(always)]
pub fn push_start(&mut self, kind: u32, pos: u32) -> u32 {
let idx = self.tokens.len() as u32;
self.push_safe(Token::Start { kind, pos });
idx
}
#[inline(always)]
pub fn push_end(&mut self, kind: u32, pos: u32, start_idx: u32) {
self.push_safe(Token::End {
kind,
pos,
start_idx,
});
}
#[inline(always)]
pub fn push_leaf(&mut self, kind: u32, start: u32, end: u32) {
self.push_safe(Token::Leaf { kind, start, end });
}
#[inline(always)]
pub fn note_failure_at(&mut self, pos: u32) {
if pos > self.furthest {
self.furthest = pos;
}
}
}
pub struct ParseTree<'a> {
tokens: &'a [Token],
input: &'a [u8],
}
impl<'a> ParseTree<'a> {
#[inline]
pub fn new(tokens: &'a [Token], input: &'a [u8]) -> Self {
Self { tokens, input }
}
pub fn tokens(&self) -> &'a [Token] {
self.tokens
}
pub fn input(&self) -> &'a [u8] {
self.input
}
pub fn root(&self) -> Option<NodeView<'_>> {
node_view_at(self.tokens, self.input, 0)
}
}
#[derive(Clone, Copy)]
pub enum NodeView<'a> {
Pair(Pair<'a>),
Leaf(Leaf<'a>),
}
impl<'a> NodeView<'a> {
pub fn kind(&self) -> u32 {
match self {
NodeView::Pair(p) => p.kind(),
NodeView::Leaf(l) => l.kind(),
}
}
pub fn span(&self) -> (u32, u32) {
match self {
NodeView::Pair(p) => p.span(),
NodeView::Leaf(l) => l.span(),
}
}
pub fn text(&self) -> &'a [u8] {
match self {
NodeView::Pair(p) => p.text(),
NodeView::Leaf(l) => l.text(),
}
}
}
#[derive(Clone, Copy)]
pub struct Pair<'a> {
tokens: &'a [Token],
input: &'a [u8],
start_idx: usize,
end_idx: usize,
}
impl<'a> Pair<'a> {
pub fn kind(&self) -> u32 {
match self.tokens[self.start_idx] {
Token::Start { kind, .. } => kind,
_ => unreachable!("Pair start_idx must point at a Start token"),
}
}
pub fn span(&self) -> (u32, u32) {
let start = match self.tokens[self.start_idx] {
Token::Start { pos, .. } => pos,
_ => unreachable!(),
};
let end = match self.tokens[self.end_idx] {
Token::End { pos, .. } => pos,
_ => unreachable!(),
};
(start, end)
}
pub fn text(&self) -> &'a [u8] {
let (s, e) = self.span();
&self.input[s as usize..e as usize]
}
pub fn children(&self) -> Children<'a> {
Children {
tokens: self.tokens,
input: self.input,
cursor: self.start_idx + 1,
end: self.end_idx,
}
}
}
#[derive(Clone, Copy)]
pub struct Leaf<'a> {
tokens: &'a [Token],
input: &'a [u8],
idx: usize,
}
impl<'a> Leaf<'a> {
pub fn kind(&self) -> u32 {
match self.tokens[self.idx] {
Token::Leaf { kind, .. } => kind,
_ => unreachable!(),
}
}
pub fn span(&self) -> (u32, u32) {
match self.tokens[self.idx] {
Token::Leaf { start, end, .. } => (start, end),
_ => unreachable!(),
}
}
pub fn text(&self) -> &'a [u8] {
let (s, e) = self.span();
&self.input[s as usize..e as usize]
}
}
pub struct Children<'a> {
tokens: &'a [Token],
input: &'a [u8],
cursor: usize,
end: usize, }
impl<'a> Iterator for Children<'a> {
type Item = NodeView<'a>;
fn next(&mut self) -> Option<Self::Item> {
if self.cursor >= self.end {
return None;
}
match self.tokens[self.cursor] {
Token::Leaf { .. } => {
let n = NodeView::Leaf(Leaf {
tokens: self.tokens,
input: self.input,
idx: self.cursor,
});
self.cursor += 1;
Some(n)
}
Token::Start { .. } => {
let s = self.cursor;
let e = find_matching_end(self.tokens, s);
let n = NodeView::Pair(Pair {
tokens: self.tokens,
input: self.input,
start_idx: s,
end_idx: e,
});
self.cursor = e + 1;
Some(n)
}
Token::End { .. } => None,
}
}
}
fn find_matching_end(tokens: &[Token], start_idx: usize) -> usize {
let mut depth: u32 = 1;
let mut i = start_idx + 1;
while i < tokens.len() {
match tokens[i] {
Token::Start { .. } => depth += 1,
Token::End { .. } => {
depth -= 1;
if depth == 0 {
return i;
}
}
Token::Leaf { .. } => {}
}
i += 1;
}
panic!("token buffer unbalanced: no matching End for Start at {start_idx}");
}
fn node_view_at<'a>(tokens: &'a [Token], input: &'a [u8], idx: usize) -> Option<NodeView<'a>> {
if idx >= tokens.len() {
return None;
}
match tokens[idx] {
Token::Start { .. } => {
let e = find_matching_end(tokens, idx);
Some(NodeView::Pair(Pair {
tokens,
input,
start_idx: idx,
end_idx: e,
}))
}
Token::Leaf { .. } => Some(NodeView::Leaf(Leaf { tokens, input, idx })),
Token::End { .. } => None,
}
}
#[derive(Copy, Clone, Debug)]
pub enum LrMemoEntry {
InProgress { seed: Option<u32> },
Done { result: Option<u32> },
}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct ParseError {
pub pos: u32,
}
impl ParseError {
#[inline]
pub fn at(pos: u32) -> Self {
Self { pos }
}
pub fn row_col(&self, input: &[u8]) -> (u32, u32) {
let cap = self.pos as usize;
let stop = cap.min(input.len());
let mut line: u32 = 1;
let mut col: u32 = 1;
for &b in &input[..stop] {
if b == b'\n' {
line += 1;
col = 1;
} else {
col += 1;
}
}
(line, col)
}
}
impl core::fmt::Display for ParseError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "parse error at byte offset {}", self.pos)
}
}
impl std::error::Error for ParseError {}
pub trait ParseState {
fn input(&self) -> &[u8];
fn checkpoint(&self) -> usize;
fn truncate(&mut self, cp: usize);
fn push_start(&mut self, kind: u32, pos: u32) -> u32;
fn push_end(&mut self, kind: u32, pos: u32, start_idx: u32);
fn push_leaf(&mut self, kind: u32, start: u32, end: u32);
fn note_failure_at(&mut self, pos: u32);
fn furthest(&self) -> u32;
fn lr_memo_get(&self, kind: u32, pos: u32) -> Option<LrMemoEntry>;
fn lr_memo_set(&mut self, kind: u32, pos: u32, entry: LrMemoEntry);
}
impl<'a> ParseState for ParserState<'a> {
#[inline(always)]
fn input(&self) -> &[u8] {
self.input
}
#[inline(always)]
fn checkpoint(&self) -> usize {
ParserState::checkpoint(self)
}
#[inline(always)]
fn truncate(&mut self, cp: usize) {
ParserState::truncate(self, cp)
}
#[inline(always)]
fn push_start(&mut self, kind: u32, pos: u32) -> u32 {
ParserState::push_start(self, kind, pos)
}
#[inline(always)]
fn push_end(&mut self, kind: u32, pos: u32, start_idx: u32) {
ParserState::push_end(self, kind, pos, start_idx)
}
#[inline(always)]
fn push_leaf(&mut self, kind: u32, start: u32, end: u32) {
ParserState::push_leaf(self, kind, start, end)
}
#[inline(always)]
fn note_failure_at(&mut self, pos: u32) {
ParserState::note_failure_at(self, pos)
}
#[inline(always)]
fn furthest(&self) -> u32 {
self.furthest
}
#[inline(always)]
fn lr_memo_get(&self, kind: u32, pos: u32) -> Option<LrMemoEntry> {
self.lr_memo.get(&(kind, pos)).copied()
}
#[inline(always)]
fn lr_memo_set(&mut self, kind: u32, pos: u32, entry: LrMemoEntry) {
self.lr_memo.insert((kind, pos), entry);
}
}
pub struct CheckState<'a> {
pub input: &'a [u8],
pub furthest: u32,
pub lr_memo: std::collections::HashMap<(u32, u32), LrMemoEntry>,
}
impl<'a> CheckState<'a> {
#[inline]
pub fn new(input: &'a [u8]) -> Self {
Self {
input,
furthest: 0,
lr_memo: std::collections::HashMap::new(),
}
}
}
impl<'a> ParseState for CheckState<'a> {
#[inline(always)]
fn input(&self) -> &[u8] {
self.input
}
#[inline(always)]
fn checkpoint(&self) -> usize {
0
}
#[inline(always)]
fn truncate(&mut self, _cp: usize) {}
#[inline(always)]
fn push_start(&mut self, _kind: u32, _pos: u32) -> u32 {
0
}
#[inline(always)]
fn push_end(&mut self, _kind: u32, _pos: u32, _start_idx: u32) {}
#[inline(always)]
fn push_leaf(&mut self, _kind: u32, _start: u32, _end: u32) {}
#[inline(always)]
fn note_failure_at(&mut self, pos: u32) {
if pos > self.furthest {
self.furthest = pos;
}
}
#[inline(always)]
fn furthest(&self) -> u32 {
self.furthest
}
#[inline(always)]
fn lr_memo_get(&self, kind: u32, pos: u32) -> Option<LrMemoEntry> {
self.lr_memo.get(&(kind, pos)).copied()
}
#[inline(always)]
fn lr_memo_set(&mut self, kind: u32, pos: u32, entry: LrMemoEntry) {
self.lr_memo.insert((kind, pos), entry);
}
}
pub mod kind {
pub const TERMINAL: u32 = 0xFFFF_FF00;
pub const EMPTY: u32 = 0xFFFF_FF01;
pub const FAILURE: u32 = 0xFFFF_FF02;
pub const ANY: u32 = 0xFFFF_FF03;
pub const ALL: u32 = 0xFFFF_FF04;
pub const USER_BASE: u32 = 0;
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn checkpoint_and_truncate_round_trip() {
let mut state = ParserState::new(b"hello");
let cp = state.checkpoint();
state.push_leaf(kind::TERMINAL, 0, 1);
state.push_leaf(kind::TERMINAL, 1, 2);
assert_eq!(state.tokens.len(), 2);
state.truncate(cp);
assert_eq!(state.tokens.len(), 0);
}
#[test]
fn start_end_pair_links_back() {
let mut state = ParserState::new(b"x");
let s = state.push_start(7, 0);
state.push_leaf(kind::TERMINAL, 0, 1);
state.push_end(7, 1, s);
match state.tokens[2] {
Token::End { start_idx, .. } => assert_eq!(start_idx, 0),
_ => panic!("expected End token"),
}
}
#[test]
fn state_with_arena_holds_reference() {
let arena = Bump::new();
let state = ParserState::new_in(b"x", &arena);
assert!(state.arena.is_some());
}
#[test]
fn ast_view_walks_a_simple_tree() {
let tokens = vec![
Token::Start { kind: 1, pos: 0 }, Token::Leaf {
kind: 100,
start: 0,
end: 1,
}, Token::Start { kind: 2, pos: 1 }, Token::Leaf {
kind: 100,
start: 1,
end: 2,
}, Token::End {
kind: 2,
pos: 2,
start_idx: 2,
},
Token::End {
kind: 1,
pos: 2,
start_idx: 0,
},
];
let input = b"xy";
let tree = ParseTree::new(&tokens, input);
let root = tree.root().expect("root");
let pair = match root {
NodeView::Pair(p) => p,
_ => panic!("expected Pair"),
};
assert_eq!(pair.kind(), 1);
assert_eq!(pair.span(), (0, 2));
assert_eq!(pair.text(), b"xy");
let mut children = pair.children();
let first = children.next().expect("first child");
assert_eq!(first.kind(), 100);
assert_eq!(first.text(), b"x");
let second = match children.next().expect("second child") {
NodeView::Pair(p) => p,
_ => panic!("expected nested Pair"),
};
assert_eq!(second.kind(), 2);
assert_eq!(second.text(), b"y");
assert_eq!(second.children().count(), 1);
assert!(children.next().is_none());
}
}