meta-language 0.46.0

A self-describing links-network core for lossless language representation
use std::collections::{BTreeSet, HashMap, HashSet};

use crate::grammar::{CharClassItem, Grammar, GrammarExpr};

pub(super) fn accepts(grammar: &Grammar, text: &str) -> bool {
    Recognizer::new(grammar, text).accepts()
}

struct Recognizer<'g, 't> {
    grammar: &'g Grammar,
    text: &'t str,
    memo: HashMap<(String, usize), BTreeSet<usize>>,
}

impl<'g, 't> Recognizer<'g, 't> {
    fn new(grammar: &'g Grammar, text: &'t str) -> Self {
        Self {
            grammar,
            text,
            memo: HashMap::new(),
        }
    }

    fn accepts(&mut self) -> bool {
        let Some(start) = self.grammar.start_rule() else {
            return false;
        };
        self.match_expr(start.expr(), 0, &mut HashSet::new())
            .contains(&self.text.len())
    }

    fn match_rule(
        &mut self,
        name: &str,
        position: usize,
        stack: &mut HashSet<(String, usize)>,
    ) -> BTreeSet<usize> {
        let key = (name.to_string(), position);
        if let Some(cached) = self.memo.get(&key) {
            return cached.clone();
        }
        if stack.contains(&key) {
            return BTreeSet::new();
        }
        let Some(rule) = self.grammar.rule(name) else {
            return BTreeSet::new();
        };

        stack.insert(key.clone());
        let matches = self.match_expr(rule.expr(), position, stack);
        stack.remove(&key);
        self.memo.insert(key, matches.clone());
        matches
    }

    fn match_expr(
        &mut self,
        expr: &GrammarExpr,
        position: usize,
        stack: &mut HashSet<(String, usize)>,
    ) -> BTreeSet<usize> {
        if position > self.text.len() || !self.text.is_char_boundary(position) {
            return BTreeSet::new();
        }

        match expr {
            GrammarExpr::Empty => singleton(position),
            GrammarExpr::Terminal(value) => {
                if self.text[position..].starts_with(value) {
                    singleton(position + value.len())
                } else {
                    BTreeSet::new()
                }
            }
            GrammarExpr::TerminalInsensitive(value) => {
                if starts_with_ascii_insensitive(&self.text[position..], value) {
                    singleton(position + value.len())
                } else {
                    BTreeSet::new()
                }
            }
            GrammarExpr::CharRange(start, end) => self
                .char_at(position)
                .filter(|(value, _)| start <= value && value <= end)
                .map_or_else(BTreeSet::new, |(_, next)| singleton(next)),
            GrammarExpr::CharClass { negated, items } => self
                .char_at(position)
                .filter(|(value, _)| class_accepts(*value, *negated, items))
                .map_or_else(BTreeSet::new, |(_, next)| singleton(next)),
            GrammarExpr::AnyChar => self
                .char_at(position)
                .map_or_else(BTreeSet::new, |(_, next)| singleton(next)),
            GrammarExpr::NonTerminal(name) => self.match_rule(name, position, stack),
            GrammarExpr::Choice {
                ordered,
                alternatives,
            } => self.match_choice(*ordered, alternatives, position, stack),
            GrammarExpr::Sequence(items) => self.match_sequence(items, position, stack),
            GrammarExpr::Optional(inner) => {
                let mut matches = singleton(position);
                matches.extend(self.match_expr(inner, position, stack));
                matches
            }
            GrammarExpr::ZeroOrMore(inner) => {
                self.repeat_closure(inner, singleton(position), stack)
            }
            GrammarExpr::OneOrMore(inner) => {
                let first = self.match_expr(inner, position, stack);
                self.repeat_closure(inner, first, stack)
            }
            GrammarExpr::Repeat { expr, min, max } => {
                self.repeat_bounds(expr, position, *min, *max, stack)
            }
            GrammarExpr::And(inner) => {
                if self.match_expr(inner, position, stack).is_empty() {
                    BTreeSet::new()
                } else {
                    singleton(position)
                }
            }
            GrammarExpr::Not(inner) => {
                if self.match_expr(inner, position, stack).is_empty() {
                    singleton(position)
                } else {
                    BTreeSet::new()
                }
            }
            GrammarExpr::Capture { expr, .. } => self.match_expr(expr, position, stack),
        }
    }

    fn match_choice(
        &mut self,
        ordered: bool,
        alternatives: &[GrammarExpr],
        position: usize,
        stack: &mut HashSet<(String, usize)>,
    ) -> BTreeSet<usize> {
        if ordered {
            for alternative in alternatives {
                let matches = self.match_expr(alternative, position, stack);
                if !matches.is_empty() {
                    return matches;
                }
            }
            BTreeSet::new()
        } else {
            let mut matches = BTreeSet::new();
            for alternative in alternatives {
                matches.extend(self.match_expr(alternative, position, stack));
            }
            matches
        }
    }

    fn match_sequence(
        &mut self,
        items: &[GrammarExpr],
        position: usize,
        stack: &mut HashSet<(String, usize)>,
    ) -> BTreeSet<usize> {
        let mut positions = singleton(position);
        for item in items {
            let mut next_positions = BTreeSet::new();
            for current in positions {
                next_positions.extend(self.match_expr(item, current, stack));
            }
            if next_positions.is_empty() {
                return BTreeSet::new();
            }
            positions = next_positions;
        }
        positions
    }

    fn repeat_closure(
        &mut self,
        inner: &GrammarExpr,
        starts: BTreeSet<usize>,
        stack: &mut HashSet<(String, usize)>,
    ) -> BTreeSet<usize> {
        let mut results = starts.clone();
        let mut frontier = starts;

        while !frontier.is_empty() {
            let mut next_frontier = BTreeSet::new();
            for position in frontier {
                for next in self.match_expr(inner, position, stack) {
                    if next != position && results.insert(next) {
                        next_frontier.insert(next);
                    }
                }
            }
            frontier = next_frontier;
        }

        results
    }

    fn repeat_bounds(
        &mut self,
        inner: &GrammarExpr,
        position: usize,
        min: usize,
        max: Option<usize>,
        stack: &mut HashSet<(String, usize)>,
    ) -> BTreeSet<usize> {
        let max = max.unwrap_or_else(|| self.remaining_chars(position).saturating_add(min));
        if max < min {
            return BTreeSet::new();
        }

        let mut matches = BTreeSet::new();
        let mut positions = singleton(position);
        for count in 0..=max {
            if count >= min {
                matches.extend(positions.iter().copied());
            }
            if count == max {
                break;
            }
            let mut next_positions = BTreeSet::new();
            for current in positions {
                next_positions.extend(self.match_expr(inner, current, stack));
            }
            if next_positions.is_empty() {
                break;
            }
            positions = next_positions;
        }
        matches
    }

    fn char_at(&self, position: usize) -> Option<(char, usize)> {
        self.text[position..]
            .chars()
            .next()
            .map(|value| (value, position + value.len_utf8()))
    }

    fn remaining_chars(&self, position: usize) -> usize {
        self.text[position..].chars().count()
    }
}

fn starts_with_ascii_insensitive(text: &str, value: &str) -> bool {
    text.get(..value.len())
        .is_some_and(|prefix| prefix.eq_ignore_ascii_case(value))
}

fn class_accepts(value: char, negated: bool, items: &[CharClassItem]) -> bool {
    let contains = items.iter().any(|item| match item {
        CharClassItem::Char(item) => *item == value,
        CharClassItem::Range(start, end) => *start <= value && value <= *end,
    });
    contains != negated
}

fn singleton(value: usize) -> BTreeSet<usize> {
    BTreeSet::from([value])
}