use crate::lex::parsing::ir::{NodeType, ParseNode};
use crate::lex::token::{LineContainer, LineType};
use regex::Regex;
use std::ops::Range;
mod builder;
mod grammar;
use builder::{
blank_line_node_from_range, container_starts_with_pipe_row, convert_pattern_to_node,
PatternMatch,
};
use grammar::{GRAMMAR_PATTERNS, LIST_ITEM_REGEX};
/// Stateless unit type that groups the grammar-matching routines.
///
/// All of its functions are associated functions (none take `self`); they
/// inspect a slice of [`LineContainer`]s and report which grammar element, if
/// any, begins at a given index.
pub struct GrammarMatcher;
impl GrammarMatcher {
fn try_match(
tokens: &[LineContainer],
start_idx: usize,
allow_sessions: bool,
is_first_item: bool,
has_preceding_blank: bool,
has_preceding_boundary: bool,
prev_was_session: bool,
) -> Option<(PatternMatch, Range<usize>)> {
if start_idx >= tokens.len() {
return None;
}
if let Some(result) = Self::match_verbatim_block(tokens, start_idx) {
return Some(result);
}
if let Some(result) = Self::match_table(tokens, start_idx) {
return Some(result);
}
let remaining_tokens = &tokens[start_idx..];
let token_string = Self::tokens_to_grammar_string(remaining_tokens)?;
for (pattern_name, pattern_regex_str) in GRAMMAR_PATTERNS {
if *pattern_name == "verbatim_block" {
continue;
}
if let Ok(regex) = Regex::new(pattern_regex_str) {
if let Some(caps) = regex.captures(&token_string) {
let full_match = caps.get(0)?;
let consumed_count = Self::count_consumed_tokens(full_match.as_str());
let pattern = match *pattern_name {
"annotation_block" => PatternMatch::AnnotationBlock {
start_idx: 0,
content_idx: 1,
},
"annotation_single" => PatternMatch::AnnotationSingle { start_idx: 0 },
"list_no_blank" | "list_single_with_container" => {
let items_str = caps.name("items")?.as_str();
let mut items = Vec::new();
let mut token_idx = 0; for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
let has_container = item_cap.as_str().contains("<container>");
items.push((
token_idx,
if has_container {
Some(token_idx + 1)
} else {
None
},
));
token_idx += if has_container { 2 } else { 1 };
}
let trailing_blank_count = caps
.name("trailing_blank")
.map(|m| Self::count_consumed_tokens(m.as_str()))
.unwrap_or(0);
let trailing_blank_range = if trailing_blank_count > 0 {
Some(
start_idx + consumed_count - trailing_blank_count
..start_idx + consumed_count,
)
} else {
None
};
PatternMatch::List {
items,
preceding_blank_range: None,
trailing_blank_range,
}
}
"list" => {
let blank_count = caps
.name("blank")
.map(|m| Self::count_consumed_tokens(m.as_str()))
.unwrap_or(0);
let items_str = caps.name("items")?.as_str();
let mut items = Vec::new();
let mut token_idx = blank_count;
for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
let has_container = item_cap.as_str().contains("<container>");
items.push((
token_idx,
if has_container {
Some(token_idx + 1)
} else {
None
},
));
token_idx += if has_container { 2 } else { 1 };
}
let trailing_blank_count = caps
.name("trailing_blank")
.map(|m| Self::count_consumed_tokens(m.as_str()))
.unwrap_or(0);
let preceding_blank_range = if blank_count > 0 {
Some(start_idx..start_idx + blank_count)
} else {
None
};
let trailing_blank_range = if trailing_blank_count > 0 {
Some(
start_idx + consumed_count - trailing_blank_count
..start_idx + consumed_count,
)
} else {
None
};
PatternMatch::List {
items,
preceding_blank_range,
trailing_blank_range,
}
}
"session" => {
if !allow_sessions {
continue; }
if !(is_first_item
|| start_idx == 0
|| has_preceding_blank
|| has_preceding_boundary
|| prev_was_session)
{
continue; }
let blank_str = caps.name("blank")?.as_str();
let blank_count = Self::count_consumed_tokens(blank_str);
PatternMatch::Session {
subject_idx: 0,
content_idx: 1 + blank_count,
preceding_blank_range: None,
}
}
"definition" => PatternMatch::Definition {
subject_idx: 0,
content_idx: 1,
},
"blank_line_group" => PatternMatch::BlankLineGroup,
"document_title_with_subtitle" => {
PatternMatch::DocumentTitle {
title_idx: 1,
subtitle_idx: Some(2),
}
}
"document_title" => {
let next_idx = start_idx + consumed_count;
if next_idx < tokens.len()
&& matches!(&tokens[next_idx], LineContainer::Container { .. })
{
continue;
}
PatternMatch::DocumentTitle {
title_idx: 1,
subtitle_idx: None,
}
}
"document_start" => PatternMatch::DocumentStart,
_ => continue,
};
return Some((pattern, start_idx..start_idx + consumed_count));
}
}
}
Self::match_paragraph(tokens, start_idx)
}
/// Serialises `tokens` into the flat string the grammar regexes run against.
///
/// Every line token contributes `line_type.to_grammar_string()` and every
/// nested container contributes the literal `<container>`. Returns `None`
/// when the resulting string is empty.
fn tokens_to_grammar_string(tokens: &[LineContainer]) -> Option<String> {
    let mut grammar = String::new();
    tokens.iter().for_each(|entry| match entry {
        LineContainer::Container { .. } => grammar.push_str("<container>"),
        LineContainer::Token(line) => grammar.push_str(&line.line_type.to_grammar_string()),
    });
    if grammar.is_empty() {
        return None;
    }
    Some(grammar)
}
/// Counts how many grammar symbols a (sub)string of the grammar string covers.
///
/// The count is simply the number of `<` characters, so it relies on every
/// symbol opening with exactly one `<` (as `<container>` does).
fn count_consumed_tokens(grammar_str: &str) -> usize {
    // `<` is ASCII, so scanning bytes is equivalent to scanning characters.
    grammar_str.bytes().filter(|&byte| byte == b'<').count()
}
fn match_paragraph(
tokens: &[LineContainer],
start_idx: usize,
) -> Option<(PatternMatch, Range<usize>)> {
use LineType::*;
let len = tokens.len();
let mut idx = start_idx;
while idx < len {
match &tokens[idx] {
LineContainer::Token(t) => match t.line_type {
ParagraphLine | DialogLine => {
idx += 1;
}
SubjectLine => {
if Self::next_is_container(tokens, idx) {
break;
}
idx += 1;
}
SubjectOrListItemLine => {
if Self::next_is_container(tokens, idx) {
break;
}
if Self::next_is_list_like(tokens, idx) {
break;
}
idx += 1;
}
ListLine => {
if Self::next_is_list_continuation(tokens, idx) {
break;
}
idx += 1;
}
_ => break, },
LineContainer::Container { .. } => break,
}
}
if idx > start_idx {
Some((
PatternMatch::Paragraph {
start_idx: 0,
end_idx: idx - start_idx - 1,
},
start_idx..idx,
))
} else {
None
}
}
/// Reports whether the element right after `idx` exists and is a container.
fn next_is_container(tokens: &[LineContainer], idx: usize) -> bool {
    matches!(tokens.get(idx + 1), Some(LineContainer::Container { .. }))
}
/// Reports whether the element right after `idx` is a line token whose type is
/// `ListLine` or `SubjectOrListItemLine`.
fn next_is_list_like(tokens: &[LineContainer], idx: usize) -> bool {
    match tokens.get(idx + 1) {
        Some(LineContainer::Token(line)) => matches!(
            line.line_type,
            LineType::ListLine | LineType::SubjectOrListItemLine
        ),
        _ => false,
    }
}
/// Reports whether a list keeps going after the line at `idx`.
///
/// That is the case when the next element is a list-like line
/// (`ListLine` / `SubjectOrListItemLine`), or when the next element is a
/// container that is itself immediately followed by such a line.
fn next_is_list_continuation(tokens: &[LineContainer], idx: usize) -> bool {
    let is_list_marker = |entry: Option<&LineContainer>| {
        matches!(
            entry,
            Some(LineContainer::Token(line))
                if matches!(line.line_type, LineType::ListLine | LineType::SubjectOrListItemLine)
        )
    };
    match tokens.get(idx + 1) {
        None => false,
        // Look past the container belonging to the current item.
        Some(LineContainer::Container { .. }) => is_list_marker(tokens.get(idx + 2)),
        following => is_list_marker(following),
    }
}
fn match_table(
tokens: &[LineContainer],
start_idx: usize,
) -> Option<(PatternMatch, Range<usize>)> {
use LineType::{SubjectLine, SubjectOrListItemLine};
if start_idx >= tokens.len() {
return None;
}
let is_subject = matches!(
&tokens[start_idx],
LineContainer::Token(line) if matches!(line.line_type, SubjectLine | SubjectOrListItemLine)
);
if !is_subject {
return None;
}
let content_idx = start_idx + 1;
if content_idx >= tokens.len() {
return None;
}
let container = &tokens[content_idx];
if !matches!(container, LineContainer::Container { .. }) {
return None;
}
if !container_starts_with_pipe_row(container) {
return None;
}
Some((
PatternMatch::Table {
subject_idx: 0,
content_idx: 1,
},
start_idx..content_idx + 1,
))
}
fn match_verbatim_block(
tokens: &[LineContainer],
start_idx: usize,
) -> Option<(PatternMatch, Range<usize>)> {
use LineType::{
BlankLine, DataMarkerLine, DocumentStart, SubjectLine, SubjectOrListItemLine,
};
let len = tokens.len();
if start_idx >= len {
return None;
}
let mut idx = start_idx;
while idx < len {
if let LineContainer::Token(line) = &tokens[idx] {
if line.line_type == BlankLine || line.line_type == DocumentStart {
idx += 1;
continue;
}
}
break;
}
if idx >= len {
return None;
}
let first_subject_idx = match &tokens[idx] {
LineContainer::Token(line)
if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) =>
{
idx
}
_ => return None,
};
let mut cursor = first_subject_idx + 1;
loop {
while cursor < len {
if let LineContainer::Token(line) = &tokens[cursor] {
if line.line_type == BlankLine {
cursor += 1;
continue;
}
}
break;
}
if cursor >= len {
return None;
}
match &tokens[cursor] {
LineContainer::Container { .. } => {
cursor += 1;
while cursor < len {
if let LineContainer::Token(line) = &tokens[cursor] {
if line.line_type == BlankLine {
cursor += 1;
continue;
}
}
break;
}
if cursor >= len {
return None; }
match &tokens[cursor] {
LineContainer::Token(line) => {
if matches!(line.line_type, DataMarkerLine) {
continue;
}
if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
continue;
}
return None;
}
LineContainer::Container { .. } => {
return None;
}
}
}
LineContainer::Token(line) => {
if matches!(line.line_type, DataMarkerLine) {
return Some((
PatternMatch::VerbatimBlock {
subject_idx: first_subject_idx,
content_range: (first_subject_idx + 1)..cursor,
closing_idx: cursor,
},
start_idx..(cursor + 1),
));
}
if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
cursor += 1;
continue;
}
cursor += 1;
}
}
}
}
}
/// Entry point of the declarative-grammar parser.
///
/// First folds prose continuation containers back into their preceding prose
/// lines, then parses the result at document level (sessions allowed, and the
/// first element treated as the document start).
///
/// # Errors
/// Propagates the `String` error produced by the internal parser.
pub fn parse_with_declarative_grammar(
    tokens: Vec<LineContainer>,
    source: &str,
) -> Result<Vec<ParseNode>, String> {
    let allow_sessions = true;
    let is_doc_start = true;
    let folded = fold_prose_continuations(tokens);
    parse_with_declarative_grammar_internal(folded, source, allow_sessions, is_doc_start)
}
/// Returns `true` when `token` is a paragraph line or a dialog line.
fn is_prose_line(token: &crate::lex::token::LineToken) -> bool {
    match token.line_type {
        LineType::ParagraphLine | LineType::DialogLine => true,
        _ => false,
    }
}
/// Checks whether `children` (recursively) contain nothing but prose.
///
/// Accepted elements are paragraph, dialog and blank lines, subject lines that
/// are not directly followed by a container, and nested containers that are
/// themselves prose-only. Any other line type makes the run non-prose. An
/// empty slice counts as prose-only.
fn is_prose_only_run(children: &[LineContainer]) -> bool {
    children.iter().enumerate().all(|(pos, child)| match child {
        LineContainer::Container { children: nested } => is_prose_only_run(nested),
        LineContainer::Token(line) => match line.line_type {
            LineType::ParagraphLine | LineType::DialogLine | LineType::BlankLine => true,
            // A subject that owns a container is structural, not prose.
            LineType::SubjectLine => !matches!(
                children.get(pos + 1),
                Some(LineContainer::Container { .. })
            ),
            _ => false,
        },
    })
}
/// Flattens `container` into `out`: line tokens are appended as-is, while
/// containers are replaced by their (recursively flattened) children, in
/// their original order.
fn dissolve_prose_into(container: LineContainer, out: &mut Vec<LineContainer>) {
    match container {
        LineContainer::Container { children } => children
            .into_iter()
            .for_each(|child| dissolve_prose_into(child, out)),
        leaf => out.push(leaf),
    }
}
/// Rewrites a token tree so that prose-only containers following a prose line
/// are merged into the surrounding level.
///
/// For each element, in order:
/// * a container whose content is prose-only (see `is_prose_only_run`) and
///   whose predecessor in the output is a prose line is flattened in place;
/// * any other container is kept, with its children folded recursively;
/// * line tokens are copied through unchanged.
fn fold_prose_continuations(children: Vec<LineContainer>) -> Vec<LineContainer> {
    let mut folded: Vec<LineContainer> = Vec::with_capacity(children.len());
    for entry in children {
        let follows_prose = match folded.last() {
            Some(LineContainer::Token(previous)) => is_prose_line(previous),
            _ => false,
        };
        match entry {
            LineContainer::Container { children: nested } => {
                if follows_prose && is_prose_only_run(&nested) {
                    for child in nested {
                        dissolve_prose_into(child, &mut folded);
                    }
                } else {
                    folded.push(LineContainer::Container {
                        children: fold_prose_continuations(nested),
                    });
                }
            }
            leaf => folded.push(leaf),
        }
    }
    folded
}
/// Parses one nesting level of `tokens` into a list of [`ParseNode`]s.
///
/// The loop repeatedly asks `GrammarMatcher::try_match` for the element that
/// starts at the current index, passing along context derived from the node
/// emitted last (was it a blank-line group, did it have children or mark the
/// document start/title, was it a session).
///
/// * On a match, the pattern is converted to a node; blank-line nodes are
///   emitted before and after it when the pattern carries such ranges. Nested
///   content is parsed recursively, with sessions allowed only inside a
///   session.
/// * Without a match, a non-empty container at the current index has its
///   children parsed and spliced into the output, and the index advances by one.
///
/// # Parameters
/// * `allow_sessions` - forwarded to the matcher to enable session patterns.
/// * `is_doc_start` - marks the first element of this level as the first item
///   of the document.
///
/// # Errors
/// Propagates errors from node construction and from recursive parsing.
fn parse_with_declarative_grammar_internal(
    tokens: Vec<LineContainer>,
    source: &str,
    allow_sessions: bool,
    is_doc_start: bool,
) -> Result<Vec<ParseNode>, String> {
    let mut items: Vec<ParseNode> = Vec::new();
    let mut idx = 0;

    while idx < tokens.len() {
        // Context flags describing the most recently emitted node.
        let (has_preceding_blank, has_preceding_boundary, prev_was_session) = match items.last() {
            None => (false, false, false),
            Some(previous) => {
                let after_blank = matches!(previous.node_type, NodeType::BlankLineGroup);
                let after_boundary = !previous.children.is_empty()
                    || matches!(
                        previous.node_type,
                        NodeType::DocumentStart | NodeType::DocumentTitle
                    );
                let after_session = matches!(previous.node_type, NodeType::Session);
                (after_blank, after_boundary, after_session)
            }
        };

        let matched = GrammarMatcher::try_match(
            &tokens,
            idx,
            allow_sessions,
            idx == 0 && is_doc_start,
            has_preceding_blank,
            has_preceding_boundary,
            prev_was_session,
        );

        let (pattern, range) = match matched {
            Some(found) => found,
            None => {
                // Nothing matched here: salvage the content of an orphaned container.
                if let LineContainer::Container {
                    children: inner, ..
                } = &tokens[idx]
                {
                    if !inner.is_empty() {
                        items.extend(parse_with_declarative_grammar_internal(
                            inner.clone(),
                            source,
                            allow_sessions,
                            false,
                        )?);
                    }
                }
                idx += 1;
                continue;
            }
        };

        if let PatternMatch::List {
            preceding_blank_range: Some(blank_range),
            ..
        } = &pattern
        {
            items.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
        }
        if let PatternMatch::Session {
            preceding_blank_range: Some(blank_range),
            ..
        } = &pattern
        {
            items.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
        }

        // Sessions may only nest inside sessions.
        let is_session = matches!(&pattern, PatternMatch::Session { .. });
        let node = convert_pattern_to_node(
            &tokens,
            &pattern,
            range.clone(),
            source,
            &move |children, src| {
                parse_with_declarative_grammar_internal(children, src, is_session, false)
            },
        )?;
        items.push(node);

        if let PatternMatch::List {
            trailing_blank_range: Some(blank_range),
            ..
        } = &pattern
        {
            items.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
        }

        idx = range.end;
    }

    Ok(items)
}
#[cfg(test)]
mod prose_continuation_tests {
    use super::*;
    use crate::lex::token::LineToken;

    /// Builds a line token of the given type with no source tokens or spans.
    fn line(line_type: LineType) -> LineContainer {
        let token = LineToken {
            source_tokens: Vec::new(),
            token_spans: Vec::new(),
            line_type,
        };
        LineContainer::Token(token)
    }

    /// Wraps `children` in a container.
    fn container(children: Vec<LineContainer>) -> LineContainer {
        LineContainer::Container { children }
    }

    #[test]
    fn prose_run_accepts_paragraph_and_lone_subject() {
        let run = [
            line(LineType::ParagraphLine),
            line(LineType::SubjectLine),
            line(LineType::BlankLine),
        ];
        assert!(is_prose_only_run(&run));
    }

    #[test]
    fn prose_run_rejects_list_markers() {
        // Two consecutive subject-or-list-item lines.
        let ambiguous_items = [
            line(LineType::SubjectOrListItemLine),
            line(LineType::SubjectOrListItemLine),
        ];
        assert!(!is_prose_only_run(&ambiguous_items));

        // A plain list line.
        let list_only = [line(LineType::ListLine)];
        assert!(!is_prose_only_run(&list_only));

        // A subject line that owns a container.
        let subject_with_body = [
            line(LineType::SubjectLine),
            container(vec![line(LineType::ParagraphLine)]),
        ];
        assert!(!is_prose_only_run(&subject_with_body));
    }
}