use harper_core::{
parsers::{Markdown, Parser},
Span,
};
use tree_sitter::{Language, TreeCursor};
/// A [`Parser`] that uses tree-sitter to locate the comments of a source
/// file, so that only comment text (treated as Markdown) gets linted.
pub struct TreeSitterParser {
    // The tree-sitter grammar used to parse the file (Rust, TypeScript, …).
    language: Language,
}
impl TreeSitterParser {
    /// Construct a parser from an already-resolved tree-sitter [`Language`].
    pub fn new(language: Language) -> Self {
        Self { language }
    }

    /// Resolve a tree-sitter grammar from a file extension.
    ///
    /// Returns [`None`] when the extension does not correspond to a
    /// supported language.
    pub fn new_from_extension(file_extension: &str) -> Option<Self> {
        let language = match file_extension {
            "rs" => tree_sitter_rust::language(),
            "tsx" => tree_sitter_typescript::language_tsx(),
            "ts" => tree_sitter_typescript::language_typescript(),
            "py" => tree_sitter_python::language(),
            "js" => tree_sitter_javascript::language(),
            "go" => tree_sitter_go::language(),
            "c" => tree_sitter_c::language(),
            // The C++ grammar handles C++ sources and both header styles.
            "cpp" | "h" | "hpp" => tree_sitter_cpp::language(),
            "rb" => tree_sitter_ruby::language(),
            "swift" => tree_sitter_swift::language(),
            "cs" => tree_sitter_c_sharp::language(),
            "toml" => tree_sitter_toml::language(),
            "lua" => tree_sitter_lua::language(),
            _ => return None,
        };

        Some(Self { language })
    }
}
impl Parser for TreeSitterParser {
    /// Tokenize only the comments of `source`: find comment nodes with
    /// tree-sitter, then run each comment's text through the [`Markdown`]
    /// parser and shift the resulting token spans back into `source`
    /// coordinates.
    fn parse(&mut self, source: &[char]) -> Vec<harper_core::Token> {
        let text: String = source.iter().collect();

        // Comment bodies are treated as Markdown.
        let mut markdown_parser = Markdown;

        let mut parser = tree_sitter::Parser::new();
        // NOTE(review): unwrap assumes the grammar's ABI matches the
        // tree-sitter runtime — a build-time invariant, not a runtime input.
        parser.set_language(self.language).unwrap();

        // TODO: consider incremental parsing (pass the previous tree
        // instead of `None`).
        let Some(root) = parser.parse(&text, None) else {
            return vec![];
        };

        let mut comments_spans = Vec::new();

        extract_comments(&mut root.walk(), &mut comments_spans);
        // tree-sitter reports byte offsets; the rest of this function
        // indexes into `source` by char, so convert first.
        byte_spans_to_char_spans(&mut comments_spans, &text);

        let mut tokens = Vec::new();

        for (s_index, span) in comments_spans.iter().enumerate() {
            // Skip past leading comment-delimiter characters ('#', '/', '-').
            let actual_start = source[span.start..span.end]
                .iter()
                .position(|c| !is_comment_character(*c))
                .unwrap_or(0)
                + span.start;

            // Nothing lintable remains in this span.
            if span.end <= actual_start {
                continue;
            }

            let mut new_tokens = markdown_parser.parse(&source[actual_start..span.end]);

            // If only whitespace separates this comment from the next,
            // drop the trailing token — presumably so consecutive comment
            // lines read as one continuous block; confirm against Markdown's
            // trailing-newline tokenization.
            if let Some(next_start) = comments_spans.get(s_index + 1).map(|v| v.start) {
                if is_span_whitespace(Span::new(span.end, next_start), source) {
                    new_tokens.pop();
                }
            }

            // Token spans are relative to the comment slice; shift them
            // back into whole-`source` coordinates.
            new_tokens
                .iter_mut()
                .for_each(|t| t.span.offset(actual_start));

            tokens.append(&mut new_tokens);
        }

        tokens
    }
}
/// Returns `true` when every character covered by `span` in `source` is
/// whitespace (vacuously true for an empty span).
///
/// Replaces `filter(|c| !c.is_whitespace()).count() == 0`, which always
/// walked the entire span; `all` expresses the intent directly and
/// short-circuits at the first non-whitespace character.
fn is_span_whitespace(span: Span, source: &[char]) -> bool {
    span.get_content(source)
        .iter()
        .all(|c| c.is_whitespace())
}
/// Returns `true` for characters used as comment delimiters in the
/// supported languages: `#` (Python, Ruby, TOML), `-` (Lua, SQL-style
/// `--`), and `/` (C-family `//` and `/* */`).
fn is_comment_character(c: char) -> bool {
    c == '#' || c == '-' || c == '/'
}
/// Rewrite byte-indexed [`Span`]s into char-indexed ones, in place.
///
/// Sorts the spans by start position first, then walks `source` once,
/// carrying a running byte→char position so each gap and span is counted
/// exactly one time. Assumes the spans do not overlap.
fn byte_spans_to_char_spans(byte_spans: &mut [Span], source: &str) {
    byte_spans.sort_by_key(|s| s.start);

    let mut last_byte_pos = 0;
    let mut last_char_pos = 0;

    for span in byte_spans.iter_mut() {
        let (byte_start, byte_end) = (span.start, span.end);

        // Chars in the gap between the previous span and this one.
        last_char_pos += source[last_byte_pos..byte_start].chars().count();
        span.start = last_char_pos;

        // Chars inside the span itself.
        last_char_pos += source[byte_start..byte_end].chars().count();
        span.end = last_char_pos;

        last_byte_pos = byte_end;
    }
}
/// Recursively walk the syntax tree under the cursor, appending the byte
/// range of every comment node to `comments`.
///
/// Bug fix: the previous version did `goto_first_child()` and then looped
/// on `while goto_next_sibling()`, which advanced off the first child
/// before ever inspecting it — a comment appearing as the first child of
/// any node was silently dropped. The loop below visits the current node
/// (including the first child) before moving to the next sibling.
fn extract_comments(cursor: &mut TreeCursor, comments: &mut Vec<Span>) {
    // Leaf node: nothing beneath to inspect.
    if !cursor.goto_first_child() {
        return;
    }

    loop {
        let node = cursor.node();

        // Grammars name comment nodes differently ("comment",
        // "line_comment", "block_comment", …); substring match covers them.
        if node.kind().contains("comment") {
            comments.push(node.byte_range().into());
        }

        extract_comments(cursor, comments);

        if !cursor.goto_next_sibling() {
            break;
        }
    }

    // Restore the cursor to the node we were handed.
    cursor.goto_parent();
}