use std::collections::HashSet;
use harper_core::{
parsers::{Markdown, Parser},
FullDictionary, Span,
};
use tree_sitter::{Language, Node, Tree, TreeCursor};
/// A parser backed by a tree-sitter grammar.
///
/// Its [`Parser`] implementation locates the comment nodes of a source file
/// and hands their text to the [`Markdown`] parser; `create_ident_dict`
/// gathers the identifiers of a source file into a dictionary.
#[derive(Debug, Clone)]
pub struct TreeSitterParser {
/// The tree-sitter grammar used to parse source text.
language: Language,
}
impl TreeSitterParser {
pub fn new(language: Language) -> Self {
Self { language }
}
pub fn new_from_extension(file_extension: &str) -> Option<Self> {
let language = match file_extension {
"rs" => tree_sitter_rust::language(),
"tsx" => tree_sitter_typescript::language_tsx(),
"ts" => tree_sitter_typescript::language_typescript(),
"py" => tree_sitter_python::language(),
"js" => tree_sitter_javascript::language(),
"go" => tree_sitter_go::language(),
"c" => tree_sitter_c::language(),
"cpp" => tree_sitter_cpp::language(),
"h" => tree_sitter_cpp::language(),
"hpp" => tree_sitter_cpp::language(),
"rb" => tree_sitter_ruby::language(),
"swift" => tree_sitter_swift::language(),
"cs" => tree_sitter_c_sharp::language(),
"toml" => tree_sitter_toml::language(),
"lua" => tree_sitter_lua::language(),
_ => return None,
};
Some(Self { language })
}
fn parse_root(&self, text: &str) -> Option<Tree> {
let mut parser = tree_sitter::Parser::new();
parser.set_language(self.language).unwrap();
parser.parse(text, None)
}
pub fn create_ident_dict(&self, source: &[char]) -> Option<FullDictionary> {
let text: String = source.iter().collect();
let mut ident_spans = Vec::new();
let tree = self.parse_root(&text)?;
Self::visit_nodes(&mut tree.walk(), &mut |node: &Node| {
if node.child_count() == 0 && node.kind().contains("ident") {
ident_spans.push(node.byte_range().into())
}
});
byte_spans_to_char_spans(&mut ident_spans, &text);
let mut idents = HashSet::new();
for span in ident_spans {
idents.insert(span.get_content(source));
}
let idents: Vec<_> = idents.into_iter().collect();
let mut dictionary = FullDictionary::new();
dictionary.extend_words(idents);
Some(dictionary)
}
fn extract_comments(cursor: &mut TreeCursor, comments: &mut Vec<Span>) {
Self::visit_nodes(cursor, &mut |node: &Node| {
if node.kind().contains("comment") {
comments.push(node.byte_range().into());
}
});
}
fn visit_nodes(cursor: &mut TreeCursor, visit: &mut impl FnMut(&Node)) {
if !cursor.goto_first_child() {
return;
}
while cursor.goto_next_sibling() {
let node = cursor.node();
visit(&node);
Self::visit_nodes(cursor, visit);
}
cursor.goto_parent();
}
}
impl Parser for TreeSitterParser {
    /// Tokenizes the comments of `source`.
    ///
    /// The source is parsed with the configured grammar, every comment node
    /// is located, its leading marker characters are skipped, and the rest
    /// of the comment is handed to the [`Markdown`] parser. The resulting
    /// token spans are shifted so they index into `source`.
    ///
    /// Returns an empty vector when the source cannot be parsed into a tree.
    fn parse(&mut self, source: &[char]) -> Vec<harper_core::Token> {
        let text: String = source.iter().collect();

        let Some(root) = self.parse_root(&text) else {
            return vec![];
        };

        let mut comments_spans = Vec::new();
        Self::extract_comments(&mut root.walk(), &mut comments_spans);
        // Tree-sitter reports byte offsets, while `source` is indexed by char.
        byte_spans_to_char_spans(&mut comments_spans, &text);

        let mut markdown_parser = Markdown;
        let mut tokens = Vec::new();

        for (s_index, span) in comments_spans.iter().enumerate() {
            let comment = &source[span.start..span.end];

            // Skip the leading marker characters (`//`, `#`, `--`, ...). When
            // the comment holds nothing else, the whole comment is skipped
            // instead of passing the bare marker on as text.
            let marker_len = comment
                .iter()
                .position(|c| !is_comment_character(*c))
                .unwrap_or(comment.len());
            let actual_start = span.start + marker_len;

            if span.end <= actual_start {
                continue;
            }

            let mut new_tokens = markdown_parser.parse(&source[actual_start..span.end]);

            // When only whitespace separates this comment from the next one,
            // drop the final token so the two comments read as continuous
            // text. NOTE(review): presumably that token is a trailing
            // newline/break emitted by the Markdown parser; confirm.
            if let Some(next_start) = comments_spans.get(s_index + 1).map(|v| v.start) {
                if is_span_whitespace(Span::new(span.end, next_start), source) {
                    new_tokens.pop();
                }
            }

            // Markdown spans are relative to the slice it was given.
            new_tokens
                .iter_mut()
                .for_each(|t| t.span.offset(actual_start));

            tokens.append(&mut new_tokens);
        }

        tokens
    }
}
/// Reports whether every character of `source` covered by `span` is
/// whitespace. An empty span counts as whitespace.
fn is_span_whitespace(span: Span, source: &[char]) -> bool {
    span.get_content(source).iter().all(|c| c.is_whitespace())
}
/// Reports whether `c` is one of the characters treated as part of a leading
/// comment marker: `#`, `-` or `/`.
fn is_comment_character(c: char) -> bool {
    const COMMENT_MARKERS: [char; 3] = ['#', '-', '/'];

    COMMENT_MARKERS.contains(&c)
}
fn byte_spans_to_char_spans(byte_spans: &mut Vec<Span>, source: &str) {
byte_spans.sort_by_key(|s| s.start);
let cloned = byte_spans.clone();
let mut i = 0;
byte_spans.retain(|cur| {
i += 1;
if let Some(prev) = cloned.get(i - 2) {
!cur.overlaps_with(*prev)
} else {
true
}
});
let mut last_byte_pos = 0;
let mut last_char_pos = 0;
byte_spans.iter_mut().for_each(|span| {
let byte_span = *span;
last_char_pos += source[last_byte_pos..byte_span.start].chars().count();
span.start = last_char_pos;
last_char_pos += source[byte_span.start..byte_span.end].chars().count();
span.end = last_char_pos;
last_byte_pos = byte_span.end;
})
}