meta-language 0.22.0

A self-describing links-network core for lossless language representation
Documentation
use tree_sitter::{Language, Node, Parser};

use crate::{
    ByteRange, LinkFlags, LinkId, LinkMetadata, LinkNetwork, LinkType, ParseConfiguration, Point,
    SourceSpan,
};

/// Parses `text` with the tree-sitter grammar selected by `language` and
/// converts the resulting syntax tree into a [`LinkNetwork`].
///
/// The network is created with `LinkNetwork::new_parse_document`, the syntax
/// tree is converted underneath the returned document link, and embedded
/// regions are attached afterwards using the region detection policy taken
/// from `configuration`.
///
/// Returns `None` when no grammar is available for `language`, when the parser
/// refuses the grammar, or when tree-sitter does not produce a tree.
///
/// NOTE(review): conversion only walks the byte range reported by the root
/// node itself, so text lying outside that range is not turned into tokens
/// by this function — confirm whether that text is captured elsewhere.
pub fn parse(text: &str, language: &str, configuration: ParseConfiguration) -> Option<LinkNetwork> {
    let grammar = grammar_for_language(language)?;

    let mut parser = Parser::new();
    parser.set_language(grammar).ok()?;
    let tree = parser.parse(text, None)?;

    let (mut network, document) = LinkNetwork::new_parse_document(text, language);
    convert_node(
        &mut network,
        document,
        tree.root_node(),
        text,
        language,
        configuration,
    );

    let region_policy = configuration.region_detection_policy();
    network.attach_embedded_regions(document, text, language, region_policy);
    Some(network)
}

/// Looks up the tree-sitter grammar for a language name.
///
/// Names are matched without regard to ASCII case. Recognised names are
/// `python`, `c`, `java`, `c++`/`cpp`, `c#`/`csharp`, `javascript`/`js` and
/// `r`; any other name yields `None`.
fn grammar_for_language(language: &str) -> Option<Language> {
    // Lowercasing once lets every alias be compared against a plain literal.
    let grammar = match language.to_ascii_lowercase().as_str() {
        "python" => tree_sitter_python::language(),
        "c" => tree_sitter_c::language(),
        "java" => tree_sitter_java::language(),
        "c++" | "cpp" => tree_sitter_cpp::language(),
        "c#" | "csharp" => tree_sitter_c_sharp::language(),
        "javascript" | "js" => tree_sitter_javascript::language(),
        "r" => tree_sitter_r::language(),
        _ => return None,
    };
    Some(grammar)
}

/// Recursively converts a tree-sitter `node` into links inside `network`.
///
/// A `Syntax` link is inserted for `node` under `parent`, carrying the node's
/// kind, named-ness, language, span and flags. A node without children is
/// handed to `insert_leaf_token`. Otherwise every child is converted in
/// order, and any source bytes between consecutive children (or between the
/// node's boundaries and its first/last child) are emitted through
/// `insert_gap_token`. Children that occupy a grammar field are additionally
/// registered with `LinkNetwork::insert_field`.
///
/// Returns the id of the link created for `node`.
///
/// # Panics
///
/// Panics if a child index does not fit in `u32`, which is the index type
/// `Node::field_name_for_child` expects.
fn convert_node(
    network: &mut LinkNetwork,
    parent: LinkId,
    node: Node<'_>,
    text: &str,
    language: &str,
    configuration: ParseConfiguration,
) -> LinkId {
    let node_id = network.insert_link(
        [parent],
        LinkMetadata::new()
            .with_link_type(LinkType::Syntax)
            .with_named(node.is_named())
            .with_term(node.kind())
            .with_language(language)
            .with_span(span_for_node(node))
            .with_flags(flags_for_node(node)),
    );

    if node.child_count() == 0 {
        insert_leaf_token(network, node_id, node, text, language, configuration);
        return node_id;
    }

    // End offset of the last child handled so far; anything between it and the
    // next child's start is source text no child accounts for.
    let mut covered_until = node.start_byte();

    // Walk the children through a cursor instead of looking each one up by
    // index: no per-child lookup and no `expect` on the result.
    let mut cursor = node.walk();
    for (child_index, child) in node.children(&mut cursor).enumerate() {
        insert_gap_token(
            network,
            node_id,
            text,
            covered_until,
            child.start_byte(),
            language,
            configuration,
        );

        let child_id = convert_node(network, node_id, child, text, language, configuration);

        let field_index =
            u32::try_from(child_index).expect("tree-sitter child index fits in u32");
        if let Some(label) = node.field_name_for_child(field_index) {
            network.insert_field(node_id, label, child_id);
        }
        covered_until = child.end_byte();
    }

    // Bytes after the last child but still inside this node.
    insert_gap_token(
        network,
        node_id,
        text,
        covered_until,
        node.end_byte(),
        language,
        configuration,
    );
    node_id
}

/// Inserts a `Token` link for a childless `node` beneath `owner`.
///
/// Nothing is inserted for nodes tree-sitter marks as missing or for nodes
/// with an empty byte range. The token's term is the exact slice of `text`
/// covered by the node. When the node's flags report it as extra, the token
/// is also attached as trivia using the trivia attachment policy from
/// `configuration`.
fn insert_leaf_token(
    network: &mut LinkNetwork,
    owner: LinkId,
    node: Node<'_>,
    text: &str,
    language: &str,
    configuration: ParseConfiguration,
) {
    let (start, end) = (node.start_byte(), node.end_byte());
    if node.is_missing() || start == end {
        return;
    }

    let span = span_for_node(node);
    let flags = flags_for_node(node);
    let metadata = LinkMetadata::new()
        .with_link_type(LinkType::Token)
        .with_named(node.is_named())
        .with_term(&text[start..end])
        .with_language(language)
        .with_span(span)
        .with_flags(flags);
    let token = network.insert_link([owner], metadata);

    if flags.is_extra() {
        let policy = configuration.trivia_attachment_policy();
        network.attach_trivia(owner, token, span, policy);
    }
}

/// Inserts an unnamed, extra-flagged `Token` link for the bytes
/// `start..end` of `text` beneath `owner` and attaches it as trivia.
///
/// An empty range (`start == end`) inserts nothing. The row/column positions
/// of both ends are derived from `text` with `point_at_byte`, which scans the
/// text from its beginning on each call.
fn insert_gap_token(
    network: &mut LinkNetwork,
    owner: LinkId,
    text: &str,
    start: usize,
    end: usize,
    language: &str,
    configuration: ParseConfiguration,
) {
    if start != end {
        let start_point = point_at_byte(text, start);
        let end_point = point_at_byte(text, end);
        let span = SourceSpan::new(ByteRange::new(start, end), start_point, end_point);

        let metadata = LinkMetadata::new()
            .with_link_type(LinkType::Token)
            .with_named(false)
            .with_term(&text[start..end])
            .with_language(language)
            .with_span(span)
            .with_flags(LinkFlags::extra());
        let token = network.insert_link([owner], metadata);

        let policy = configuration.trivia_attachment_policy();
        network.attach_trivia(owner, token, span, policy);
    }
}

/// Translates tree-sitter's status bits for `node` into [`LinkFlags`].
///
/// Starting from `LinkFlags::clean()`, the result is marked as:
/// - error, when the node itself is an error node;
/// - containing-error, when the node has an error but is neither an error
///   node nor a missing node itself;
/// - missing, when the node is a missing node;
/// - extra, when the node is an extra node.
fn flags_for_node(node: Node<'_>) -> LinkFlags {
    let is_error = node.is_error();
    let is_missing = node.is_missing();
    let contains_error = !is_error && !is_missing && node.has_error();

    let flags = LinkFlags::clean();
    let flags = if is_error { flags.with_error() } else { flags };
    let flags = if contains_error {
        flags.with_containing_error()
    } else {
        flags
    };
    let flags = if is_missing { flags.with_missing() } else { flags };
    if node.is_extra() {
        flags.with_extra()
    } else {
        flags
    }
}

/// Builds the [`SourceSpan`] of `node` from its byte range and its start and
/// end positions as reported by tree-sitter.
fn span_for_node(node: Node<'_>) -> SourceSpan {
    let bytes = ByteRange::new(node.start_byte(), node.end_byte());
    let start = point_from_tree_sitter(node.start_position());
    let end = point_from_tree_sitter(node.end_position());
    SourceSpan::new(bytes, start, end)
}

/// Converts a tree-sitter position into this crate's [`Point`], carrying the
/// row and column over unchanged.
const fn point_from_tree_sitter(point: tree_sitter::Point) -> Point {
    let tree_sitter::Point { row, column } = point;
    Point::new(row, column)
}

/// Computes the row/column [`Point`] of the byte offset `byte` within `text`.
///
/// The row is the number of `\n` bytes that occur before `byte`; the column
/// is the distance in bytes from the start of that line (the byte after the
/// last preceding `\n`, or offset 0 when there is none) to `byte`.
///
/// Only the bytes that actually exist before `byte` are inspected, so an
/// offset past the end of `text` is measured from the last line start found.
fn point_at_byte(text: &str, byte: usize) -> Point {
    let bytes = text.as_bytes();
    let prefix = &bytes[..byte.min(bytes.len())];

    let row = prefix.iter().filter(|&&value| value == b'\n').count();
    let line_start = prefix
        .iter()
        .rposition(|&value| value == b'\n')
        .map_or(0, |newline| newline + 1);

    Point::new(row, byte - line_start)
}