use std::borrow::Cow;
use tree_sitter::{Language, Node, Parser};
use crate::{
ByteRange, LinkFlags, LinkId, LinkMetadata, LinkNetwork, LinkType, ParseConfiguration, Point,
SourceSpan,
};
/// Parses `text` with the grammar registered for `language` and converts the
/// resulting syntax tree into a fresh [`LinkNetwork`].
///
/// The tree is converted beneath the document link created by
/// `LinkNetwork::new_parse_document`; afterwards embedded regions are attached
/// via `attach_embedded_regions`.
///
/// Returns `None` when no grammar is known for `language`, when the parser
/// refuses the grammar, or when tree-sitter yields no tree.
pub fn parse(text: &str, language: &str, configuration: ParseConfiguration) -> Option<LinkNetwork> {
    let grammar = grammar_for_language(language)?;
    let mut parser = Parser::new();
    parser.set_language(&grammar).ok()?;
    let tree = parser.parse(text, None)?;

    let (mut network, document) = LinkNetwork::new_parse_document(text, language);
    // A top-level document is parsed verbatim: spans need no offset and the
    // source length equals the parsed text length (no synthetic suffix).
    let whole_document =
        ConvertContext::new(text, language, configuration, SpanOffset::zero(), text.len());
    convert_node(&mut network, document, tree.root_node(), whole_document);
    network.attach_embedded_regions(document, text, language, configuration);
    Some(network)
}
/// Parses the text of an embedded region and converts its syntax tree beneath
/// `region` inside an already existing network.
///
/// `span` locates the region in the enclosing document: its starting byte and
/// starting point become the offset applied to every span produced here. The
/// text given to the parser comes from `embedded_parse_text` and may be longer
/// than `text`, so `text.len()` is recorded as the source length used for
/// clamping.
///
/// Returns the id of the link created for the root node, or `None` when no
/// grammar is known for `language`, when the parser refuses the grammar, or
/// when tree-sitter yields no tree.
pub fn parse_embedded_region_into(
    network: &mut LinkNetwork,
    region: LinkId,
    text: &str,
    language: &str,
    span: SourceSpan,
    configuration: ParseConfiguration,
) -> Option<LinkId> {
    let grammar = grammar_for_language(language)?;
    let prepared = embedded_parse_text(text, language);
    let parser_input: &str = prepared.as_ref();

    let mut parser = Parser::new();
    parser.set_language(&grammar).ok()?;
    let tree = parser.parse(parser_input, None)?;

    let region_origin = SpanOffset::new(span.byte_range().start(), span.start_point());
    let context = ConvertContext::new(
        parser_input,
        language,
        configuration,
        region_origin,
        text.len(),
    );
    let root_id = convert_node(network, region, tree.root_node(), context);
    Some(root_id)
}
/// Maps a language name to its tree-sitter grammar.
///
/// Every name is compared ASCII case-insensitively against the aliases listed
/// below. Returns `None` for a name that matches none of them.
fn grammar_for_language(language: &str) -> Option<Language> {
    // True when `language` equals any of `aliases`, ignoring ASCII case.
    let named = |aliases: &[&str]| {
        aliases
            .iter()
            .any(|alias| language.eq_ignore_ascii_case(alias))
    };

    let grammar: Language = if named(&["python"]) {
        tree_sitter_python::LANGUAGE.into()
    } else if named(&["c"]) {
        tree_sitter_c::LANGUAGE.into()
    } else if named(&["java"]) {
        tree_sitter_java::LANGUAGE.into()
    } else if named(&["c++", "cpp"]) {
        tree_sitter_cpp::LANGUAGE.into()
    } else if named(&["c#", "csharp"]) {
        tree_sitter_c_sharp::LANGUAGE.into()
    } else if named(&["javascript", "js"]) {
        tree_sitter_javascript::LANGUAGE.into()
    } else if named(&["visual basic", "vb", "vb.net", "vbnet"]) {
        tree_sitter_vb_dotnet::LANGUAGE.into()
    } else if named(&["delphi/object pascal", "delphi", "object pascal", "pascal"]) {
        tree_sitter_pascal::LANGUAGE.into()
    } else if named(&["rust"]) {
        tree_sitter_rust::LANGUAGE.into()
    } else if named(&["r"]) {
        tree_sitter_r::LANGUAGE.into()
    } else if named(&["sql-ansi"]) {
        tree_sitter_sequel::LANGUAGE.into()
    } else if named(&["html"]) {
        tree_sitter_html::LANGUAGE.into()
    } else if named(&["css"]) {
        tree_sitter_css::LANGUAGE.into()
    } else {
        return None;
    };
    Some(grammar)
}
/// Recursively converts `node` and its subtree into links beneath `parent`.
///
/// A `Syntax` link is inserted for `node` itself. A node without children is
/// passed to `insert_leaf_token`. Otherwise each child is converted in order,
/// text between children that no child covers becomes a gap token, and a
/// child that has a grammar field name is registered through `insert_field`.
/// When the parsed text carries a synthetic suffix, children starting at or
/// after `context.source_len` are not converted.
///
/// Returns the id of the link created for `node`.
///
/// # Panics
///
/// Panics if tree-sitter returns no child for an index below `child_count()`,
/// or if a child index does not fit in `u32`.
fn convert_node(
    network: &mut LinkNetwork,
    parent: LinkId,
    node: Node<'_>,
    context: ConvertContext<'_>,
) -> LinkId {
    let span = span_for_node(node, context.text, context.source_len, context.offset);
    let metadata = LinkMetadata::new()
        .with_link_type(LinkType::Syntax)
        .with_named(node.is_named())
        .with_term(node.kind())
        .with_language(context.language)
        .with_span(span)
        .with_flags(flags_for_node(node));
    let node_id = network.insert_link([parent], metadata);

    let child_total = node.child_count();
    if child_total == 0 {
        insert_leaf_token(network, node_id, node, context);
        return node_id;
    }

    // Byte position up to which the node's text has been emitted so far.
    let mut gap_start = node.start_byte();
    for position in 0..child_total {
        let child = node
            .child(position)
            .expect("tree-sitter child index should be valid");

        // Children lying entirely inside the synthetic suffix are dropped.
        let starts_past_source = child.start_byte() >= context.source_len;
        if context.has_synthetic_suffix() && starts_past_source {
            break;
        }

        insert_gap_token(network, node_id, gap_start, child.start_byte(), context);
        let child_id = convert_node(network, node_id, child, context);

        let field_index =
            u32::try_from(position).expect("tree-sitter child index fits in u32");
        if let Some(label) = node.field_name_for_child(field_index) {
            network.insert_field(node_id, label, child_id);
        }

        gap_start = child.end_byte().min(context.source_len);
    }

    // Trailing text after the last converted child.
    insert_gap_token(network, node_id, gap_start, node.end_byte(), context);
    node_id
}
/// Inserts a `Token` link beneath `owner` carrying the source text of the
/// childless `node`.
///
/// The end of the token is clamped to `context.source_len`. Nothing is
/// inserted for a missing node or when the clamped range is empty. A token
/// whose flags mark it as extra is additionally attached to `owner` as trivia
/// using the configured attachment policy.
fn insert_leaf_token(
    network: &mut LinkNetwork,
    owner: LinkId,
    node: Node<'_>,
    context: ConvertContext<'_>,
) {
    let first = node.start_byte();
    let last = node.end_byte().min(context.source_len);
    let has_text = first < last;
    if node.is_missing() || !has_text {
        return;
    }

    let span = span_for_range(context.text, first, last, context.offset);
    let flags = flags_for_node(node);
    let metadata = LinkMetadata::new()
        .with_link_type(LinkType::Token)
        .with_named(node.is_named())
        .with_term(&context.text[first..last])
        .with_language(context.language)
        .with_span(span)
        .with_flags(flags);
    let token = network.insert_link([owner], metadata);

    if !flags.is_extra() {
        return;
    }
    let policy = context.configuration.trivia_attachment_policy();
    network.attach_trivia(owner, token, span, policy);
}
/// Inserts an anonymous `Token` link for the text in `start..end` and attaches
/// it to `owner` as trivia.
///
/// Both bounds are clamped to `context.source_len` before use. The token is
/// flagged as extra and attached with the configured trivia attachment policy.
/// Nothing is inserted when the clamped range is empty or inverted.
fn insert_gap_token(
    network: &mut LinkNetwork,
    owner: LinkId,
    start: usize,
    end: usize,
    context: ConvertContext<'_>,
) {
    let start = start.min(context.source_len);
    let end = end.min(context.source_len);
    // `>=` rather than `==`: an inverted range would make the slice below
    // panic. This is the same guard `insert_leaf_token` applies.
    if start >= end {
        return;
    }
    let span = span_for_range(context.text, start, end, context.offset);
    let token = network.insert_link(
        [owner],
        LinkMetadata::new()
            .with_link_type(LinkType::Token)
            .with_named(false)
            .with_term(&context.text[start..end])
            .with_language(context.language)
            .with_span(span)
            .with_flags(LinkFlags::extra()),
    );
    network.attach_trivia(
        owner,
        token,
        span,
        context.configuration.trivia_attachment_policy(),
    );
}
/// Translates the status predicates of a tree-sitter node into [`LinkFlags`].
///
/// Starting from `LinkFlags::clean()`, the error, containing-error, missing
/// and extra flags are applied in that order. The containing-error flag is
/// set only for a node that has an error without itself being an error node
/// or a missing node.
fn flags_for_node(node: Node<'_>) -> LinkFlags {
    let is_error = node.is_error();
    let is_missing = node.is_missing();
    let wraps_error = node.has_error() && !is_error && !is_missing;

    let flags = LinkFlags::clean();
    let flags = if is_error { flags.with_error() } else { flags };
    let flags = if wraps_error {
        flags.with_containing_error()
    } else {
        flags
    };
    let flags = if is_missing { flags.with_missing() } else { flags };
    if node.is_extra() {
        flags.with_extra()
    } else {
        flags
    }
}
/// Builds the span of `node`, with both of its byte bounds clamped to
/// `source_len` before they are converted by `span_for_range`.
fn span_for_node(node: Node<'_>, text: &str, source_len: usize, offset: SpanOffset) -> SourceSpan {
    let clamp = |byte: usize| byte.min(source_len);
    span_for_range(
        text,
        clamp(node.start_byte()),
        clamp(node.end_byte()),
        offset,
    )
}
/// Builds a [`SourceSpan`] for the byte range `start..end` of `text`.
///
/// The byte range is shifted by `offset.byte`; the start and end points are
/// computed within `text` and then translated through `offset.point`.
fn span_for_range(text: &str, start: usize, end: usize, offset: SpanOffset) -> SourceSpan {
    let bytes = ByteRange::new(offset.byte + start, offset.byte + end);
    let start_point = offset.point(point_at_byte(text, start));
    let end_point = offset.point(point_at_byte(text, end));
    SourceSpan::new(bytes, start_point, end_point)
}
/// Computes the row/column position of byte offset `byte` within `text`.
///
/// The row is the number of `\n` bytes located before `byte`. The column is
/// the distance in bytes from the start of that line (the byte after the last
/// preceding `\n`, or the start of `text`) to `byte`. Only bytes that exist in
/// `text` are scanned, so `byte` may exceed `text.len()`.
fn point_at_byte(text: &str, byte: usize) -> Point {
    let scanned = &text.as_bytes()[..byte.min(text.len())];
    let row = scanned
        .iter()
        .filter(|&&value| value == b'\n')
        .fold(0, |rows, _| rows + 1);
    let line_start = scanned
        .iter()
        .rposition(|&value| value == b'\n')
        .map_or(0, |newline| newline + 1);
    Point::new(row, byte - line_start)
}
/// Prepares the text of an embedded region for parsing.
///
/// For CSS (name compared ASCII case-insensitively) whose text
/// `css_declaration_list_needs_semicolon` reports as unterminated, an owned
/// copy with a trailing `;` is returned. In every other case `text` is
/// borrowed unchanged.
fn embedded_parse_text<'a>(text: &'a str, language: &str) -> Cow<'a, str> {
    let is_css = language.eq_ignore_ascii_case("css");
    if !(is_css && css_declaration_list_needs_semicolon(text)) {
        return Cow::Borrowed(text);
    }
    Cow::Owned(format!("{text};"))
}
/// Reports whether `text`, ignoring trailing whitespace, is non-empty, holds
/// no `{` anywhere, and ends with neither `;` nor `}`.
fn css_declaration_list_needs_semicolon(text: &str) -> bool {
    let body = text.trim_end();
    if body.is_empty() || body.contains('{') {
        return false;
    }
    let already_closed = body.ends_with(|last: char| last == ';' || last == '}');
    !already_closed
}
/// Position of parsed text inside an enclosing document, used to translate
/// locally computed positions into positions of that document.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct SpanOffset {
    /// Amount added to local byte offsets.
    byte: usize,
    /// Point against which local points are translated.
    point: Point,
}

impl SpanOffset {
    /// Creates an offset from a byte offset and a point.
    const fn new(byte: usize, point: Point) -> Self {
        SpanOffset { byte, point }
    }

    /// The offset with byte `0` and point `(0, 0)`.
    const fn zero() -> Self {
        SpanOffset {
            byte: 0,
            point: Point::new(0, 0),
        }
    }

    /// Translates a local `point` by this offset.
    ///
    /// Rows are always added. The offset's column is added only when the
    /// local point is on row `0`; on any later row the local column is kept
    /// as it is.
    const fn point(self, point: Point) -> Point {
        let row = self.point.row() + point.row();
        if point.row() != 0 {
            return Point::new(row, point.column());
        }
        Point::new(row, self.point.column() + point.column())
    }
}
/// Read-only state shared by every step of a tree conversion; cheap to copy.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct ConvertContext<'a> {
    /// Text the tree was parsed from; token terms are sliced out of it.
    text: &'a str,
    /// Language name recorded on every inserted link.
    language: &'a str,
    /// Parse configuration; supplies the trivia attachment policy.
    configuration: ParseConfiguration,
    /// Offset applied to every span that is produced.
    offset: SpanOffset,
    /// Upper bound to which node and token byte offsets are clamped.
    source_len: usize,
}

impl<'a> ConvertContext<'a> {
    /// Bundles the given values into a context without altering them.
    const fn new(
        text: &'a str,
        language: &'a str,
        configuration: ParseConfiguration,
        offset: SpanOffset,
        source_len: usize,
    ) -> Self {
        ConvertContext {
            text,
            language,
            configuration,
            offset,
            source_len,
        }
    }

    /// Whether `text` is longer than `source_len`, i.e. extends past the
    /// recorded source length.
    const fn has_synthetic_suffix(self) -> bool {
        self.text.len() > self.source_len
    }
}