meta-language 0.40.0

A self-describing links-network core for lossless language representation
use std::collections::BTreeSet;

use crate::{LinkId, LinkNetwork, LinkType};

use super::{
    normalize_capture_name, structural_children, LinkRuleCaptures, LinkRuleMatch,
    LinkRuleParseError,
};

/// A text pattern parsed from a template string: an ordered sequence of literal
/// fragments and named `{{placeholder}}` captures.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(super) struct TextPattern {
    /// Pattern parts in the order they appear in the template source.
    parts: Vec<TextPatternPart>,
}

impl TextPattern {
    /// Parses a template such as `fn {{name}}()` into literal and placeholder parts.
    ///
    /// Text outside `{{` … `}}` becomes a literal part. The text between the braces
    /// is trimmed and passed through `normalize_capture_name` to form the
    /// placeholder name.
    ///
    /// # Errors
    ///
    /// Returns a `LinkRuleParseError` when a `{{` has no closing `}}`, or when a
    /// placeholder name is empty after trimming.
    pub(super) fn parse(source: String) -> Result<Self, LinkRuleParseError> {
        let mut parts = Vec::new();
        let mut remaining = source.as_str();

        // Each iteration consumes one `literal {{ name }}` segment from the front.
        while let Some((literal, tail)) = remaining.split_once("{{") {
            if !literal.is_empty() {
                parts.push(TextPatternPart::Literal(literal.to_string()));
            }
            let Some((raw_name, after_close)) = tail.split_once("}}") else {
                return Err(LinkRuleParseError::new("unterminated text placeholder"));
            };
            let name = raw_name.trim();
            if name.is_empty() {
                return Err(LinkRuleParseError::new("text placeholder is empty"));
            }
            parts.push(TextPatternPart::Placeholder(normalize_capture_name(name)));
            remaining = after_close;
        }

        // Whatever follows the last placeholder is a trailing literal.
        if !remaining.is_empty() {
            parts.push(TextPatternPart::Literal(remaining.to_string()));
        }
        // Only reachable for an empty `source`: keep a single (empty) literal so the
        // pattern always has at least one part.
        if parts.is_empty() {
            parts.push(TextPatternPart::Literal(source));
        }
        Ok(Self { parts })
    }

    /// Runs the pattern against every `LinkType::Document` link in `network`.
    ///
    /// A document's text is the concatenation of the terms of its tokens, in the
    /// order returned by `source_tokens`; tokens without a term contribute nothing.
    /// One `LinkRuleMatch` is returned for each document whose text matches the
    /// pattern, carrying the document's link id and the captures.
    pub(super) fn matches(&self, network: &LinkNetwork) -> Vec<LinkRuleMatch> {
        let mut found = Vec::new();
        for document in network.links() {
            if document.metadata().link_type() != Some(LinkType::Document) {
                continue;
            }
            let link_id = document.id();
            let tokens = source_tokens(network, link_id);

            // Rebuild the document text from the token terms.
            let mut text = String::new();
            for (_, _, term) in &tokens {
                if let Some(term) = term {
                    text.push_str(term);
                }
            }

            if let Some(captures) = self.match_text(&text, &tokens) {
                found.push(LinkRuleMatch { link_id, captures });
            }
        }
        found
    }

    /// Matches the whole of `text` against the pattern parts, left to right.
    ///
    /// A literal must appear exactly at the current position. A placeholder captures
    /// everything up to the next non-empty literal in the pattern, or to the end of
    /// `text` when no literal follows.
    ///
    /// When that next literal is the final part of the pattern it is anchored to the
    /// end of `text`: a successful match has to consume the whole text, so stopping
    /// at the literal's first occurrence would wrongly reject inputs where the
    /// literal also occurs inside the captured region (for example `{{name}}.rs`
    /// against `lib.rs.rs`). Literals that are not last are still located by their
    /// first occurrence.
    ///
    /// Each capture records the captured text and the ids of the `tokens` whose
    /// range lies entirely inside the captured range.
    ///
    /// Returns `None` if any part fails to match or if text is left over.
    fn match_text(
        &self,
        text: &str,
        tokens: &[(LinkId, std::ops::Range<usize>, Option<String>)],
    ) -> Option<LinkRuleCaptures> {
        let mut captures = LinkRuleCaptures::default();
        let mut position = 0;
        for (index, part) in self.parts.iter().enumerate() {
            match part {
                TextPatternPart::Literal(literal) => {
                    let remaining = text.get(position..)?;
                    if !remaining.starts_with(literal) {
                        return None;
                    }
                    position += literal.len();
                }
                TextPatternPart::Placeholder(name) => {
                    let capture_start = position;
                    let following = &self.parts[index + 1..];
                    let capture_end = match next_literal(following) {
                        // No literal after this placeholder: capture the rest.
                        None => text.len(),
                        Some(literal) => {
                            let remaining = text.get(capture_start..)?;
                            // `following` contains `literal`, so it is non-empty. If
                            // no non-empty literal precedes its last element, then
                            // `literal` is that last element, i.e. the pattern's end.
                            let literal_is_last =
                                next_literal(&following[..following.len() - 1]).is_none();
                            let captured_len = if literal_is_last {
                                // The final literal must end exactly at the end of `text`.
                                remaining.strip_suffix(literal)?.len()
                            } else {
                                remaining.find(literal)?
                            };
                            capture_start + captured_len
                        }
                    };
                    let captured_text_value = text.get(capture_start..capture_end)?.to_string();
                    // NOTE(review): token ranges are compared with offsets into `text`;
                    // this assumes both use the same coordinates — confirm against the
                    // producer of `tokens`.
                    let link_ids = tokens
                        .iter()
                        .filter(|(_, range, _)| {
                            range.start >= capture_start && range.end <= capture_end
                        })
                        .map(|(link_id, _, _)| *link_id)
                        .collect::<Vec<_>>();
                    captures = captures.with_text(name, captured_text_value, link_ids);
                    position = capture_end;
                }
            }
        }
        (position == text.len()).then_some(captures)
    }
}

/// One element of a [`TextPattern`].
#[derive(Clone, Debug, PartialEq, Eq)]
enum TextPatternPart {
    /// Text that must appear verbatim in the matched text.
    Literal(String),
    /// A named capture; holds the placeholder name as returned by
    /// `normalize_capture_name`.
    Placeholder(String),
}

/// Returns the text of the first non-empty literal in `parts`, skipping
/// placeholders and empty literals, or `None` when there is none.
fn next_literal(parts: &[TextPatternPart]) -> Option<&str> {
    for part in parts {
        if let TextPatternPart::Literal(text) = part {
            if !text.is_empty() {
                return Some(text.as_str());
            }
        }
    }
    None
}

fn source_tokens(
    network: &LinkNetwork,
    root: LinkId,
) -> Vec<(LinkId, std::ops::Range<usize>, Option<String>)> {
    let mut tokens = Vec::new();
    collect_tokens(network, root, &mut BTreeSet::new(), &mut tokens);
    tokens.sort_by_key(|(link_id, range, _)| (range.start, link_id.as_u64()));
    tokens
}

/// Walks the structure below `link_id` depth-first and appends every token found
/// to `tokens`.
///
/// A link counts as a token when its type is `LinkType::Token` and it is not
/// flagged as missing. Such a link is recorded only if it has a span, and its
/// children are never visited. Any other link is expanded through
/// `structural_children`. `visited` ensures each link is processed at most once;
/// ids that do not resolve in `network` are ignored.
fn collect_tokens(
    network: &LinkNetwork,
    link_id: LinkId,
    visited: &mut BTreeSet<LinkId>,
    tokens: &mut Vec<(LinkId, std::ops::Range<usize>, Option<String>)>,
) {
    // `insert` returns false when the id was already present.
    if !visited.insert(link_id) {
        return;
    }
    let Some(link) = network.link(link_id) else {
        return;
    };

    let metadata = link.metadata();
    let is_present_token =
        metadata.link_type() == Some(LinkType::Token) && !metadata.flags().is_missing();

    if !is_present_token {
        // Not a token: keep descending.
        for child in structural_children(network, link_id) {
            collect_tokens(network, child, visited, tokens);
        }
        return;
    }

    // Tokens are leaves for this walk; one without a span is dropped silently.
    if let Some(span) = metadata.span() {
        let bytes = span.byte_range().start()..span.byte_range().end();
        let term = metadata.term().map(str::to_string);
        tokens.push((link_id, bytes, term));
    }
}