use std::collections::HashSet;
use tree_sitter::Language;
/// Category assigned to a single token.
///
/// The two predicates on this type are written as exhaustive matches so that
/// adding a variant forces a decision about how it should be treated.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// A keyword.
    Keyword,
    /// An identifier.
    Identifier,
    /// The name of a type.
    TypeName,
    /// An operator.
    Operator,
    /// A punctuation token.
    Punctuation,
    /// A string literal.
    StringLiteral,
    /// A numeric literal.
    NumericLiteral,
    /// A boolean literal.
    BooleanLiteral,
    /// A comment.
    Comment,
    /// Whitespace.
    Whitespace,
    /// A special token.
    Special,
    /// A token with no known category.
    Unknown,
}

impl TokenType {
    /// Returns `true` for every category except [`TokenType::Comment`] and
    /// [`TokenType::Whitespace`].
    pub fn is_correctable(&self) -> bool {
        match *self {
            Self::Comment | Self::Whitespace => false,
            Self::Keyword
            | Self::Identifier
            | Self::TypeName
            | Self::Operator
            | Self::Punctuation
            | Self::StringLiteral
            | Self::NumericLiteral
            | Self::BooleanLiteral
            | Self::Special
            | Self::Unknown => true,
        }
    }

    /// Returns `true` only for [`TokenType::Keyword`], [`TokenType::Operator`],
    /// [`TokenType::Punctuation`] and [`TokenType::BooleanLiteral`].
    pub fn has_fixed_vocabulary(&self) -> bool {
        match *self {
            Self::Keyword | Self::Operator | Self::Punctuation | Self::BooleanLiteral => true,
            Self::Identifier
            | Self::TypeName
            | Self::StringLiteral
            | Self::NumericLiteral
            | Self::Comment
            | Self::Whitespace
            | Self::Special
            | Self::Unknown => false,
        }
    }
}
/// A token's category together with optional information about its
/// surroundings.
///
/// Build a value with [`TokenContext::new`] and refine it with the chained
/// `with_*` / [`in_error`](TokenContext::in_error) methods. All fields are
/// public and may also be set directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenContext {
    /// Category of the token this context describes.
    pub token_type: TokenType,
    /// Node type of the parent, or `None` when none has been recorded.
    pub parent_node_type: Option<String>,
    /// Node types of the siblings; empty by default.
    pub sibling_types: Vec<String>,
    /// Depth value; `0` by default.
    /// NOTE(review): presumably the nesting depth in the syntax tree — confirm with callers.
    pub depth: usize,
    /// Whether the token has been flagged as lying in an error region.
    pub in_error_region: bool,
    /// Token categories expected at this position; empty by default.
    pub expected_types: Vec<TokenType>,
}

impl TokenContext {
    /// Creates a context for `token_type` with no parent, no siblings,
    /// depth `0`, not in an error region, and no expected types.
    #[must_use]
    pub fn new(token_type: TokenType) -> Self {
        Self {
            token_type,
            parent_node_type: None,
            sibling_types: Vec::new(),
            depth: 0,
            in_error_region: false,
            expected_types: Vec::new(),
        }
    }

    /// Records `parent` as the parent node type.
    #[must_use]
    pub fn with_parent(mut self, parent: impl Into<String>) -> Self {
        self.parent_node_type = Some(parent.into());
        self
    }

    /// Replaces the sibling node types with the items of `siblings`.
    #[must_use]
    pub fn with_siblings<I, S>(mut self, siblings: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        self.sibling_types = siblings.into_iter().map(Into::into).collect();
        self
    }

    /// Sets the depth value.
    #[must_use]
    pub fn with_depth(mut self, depth: usize) -> Self {
        self.depth = depth;
        self
    }

    /// Marks the context as lying in an error region.
    #[must_use]
    pub fn in_error(mut self) -> Self {
        self.in_error_region = true;
        self
    }

    /// Replaces the expected token categories with the items of `expected`.
    #[must_use]
    pub fn with_expected_types<I>(mut self, expected: I) -> Self
    where
        I: IntoIterator<Item = TokenType>,
    {
        self.expected_types = expected.into_iter().collect();
        self
    }
}
/// Per-language facts and classification hooks.
///
/// Implementors must supply the name, grammar, keyword list, file extensions,
/// token classifier and identifier check. Every other method has a default
/// that can be overridden.
pub trait CodeLanguage: Send + Sync {
    /// Name of the language.
    fn name(&self) -> &str;

    /// Display name of the language; defaults to [`name`](Self::name).
    fn display_name(&self) -> &str {
        self.name()
    }

    /// The tree-sitter [`Language`] for this language.
    fn tree_sitter_language(&self) -> Language;

    /// Keywords of the language.
    fn keywords(&self) -> &[&str];

    /// Special tokens of the language; defaults to an empty slice.
    fn special_tokens(&self) -> &[&str] {
        &[]
    }

    /// File extensions associated with the language.
    fn file_extensions(&self) -> &[&str];

    /// Classifies `token`, given the kind of the node it came from.
    fn classify_token(&self, token: &str, node_kind: &str) -> TokenType;

    /// Returns whether `s` is a valid identifier in this language.
    fn is_valid_identifier(&self, s: &str) -> bool;

    /// Built-in type names; defaults to an empty slice.
    fn builtin_types(&self) -> &[&str] {
        &[]
    }

    /// Standard-library function names; defaults to an empty slice.
    fn stdlib_functions(&self) -> &[&str] {
        &[]
    }

    /// Comment delimiters; defaults to `CommentSyntax::default()`.
    fn comment_syntax(&self) -> CommentSyntax {
        CommentSyntax::default()
    }

    /// Whether whitespace is significant; defaults to `false`.
    fn is_whitespace_significant(&self) -> bool {
        false
    }

    /// Collects [`keywords`](Self::keywords) into a set.
    ///
    /// A new set is built on every call, so callers doing repeated lookups
    /// should keep the result.
    fn keyword_set(&self) -> HashSet<&str> {
        let words = self.keywords();
        let mut set = HashSet::with_capacity(words.len());
        set.extend(words.iter().copied());
        set
    }
}
/// Comment delimiters of a language.
///
/// Each field is `None` when the corresponding comment form is absent from
/// the preset. All presets are `const fn`, so they can initialise `const` and
/// `static` items.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CommentSyntax {
    /// Prefix of a line comment.
    pub line_comment: Option<&'static str>,
    /// `(open, close)` delimiters of a block comment.
    pub block_comment: Option<(&'static str, &'static str)>,
    /// Prefix of a documentation comment.
    pub doc_comment: Option<&'static str>,
}

impl Default for CommentSyntax {
    /// Same as [`CommentSyntax::c_style`].
    fn default() -> Self {
        Self::c_style()
    }
}

impl CommentSyntax {
    /// `//` line comments, `/* */` block comments and `///` doc comments.
    #[must_use]
    pub const fn c_style() -> Self {
        Self {
            line_comment: Some("//"),
            block_comment: Some(("/*", "*/")),
            doc_comment: Some("///"),
        }
    }

    /// `#` for both line and doc comments, with triple double-quotes as the
    /// block delimiters.
    #[must_use]
    pub const fn python_style() -> Self {
        Self {
            line_comment: Some("#"),
            block_comment: Some(("\"\"\"", "\"\"\"")),
            doc_comment: Some("#"),
        }
    }

    /// `#` line comments only; no block or doc comment form.
    #[must_use]
    pub const fn shell_style() -> Self {
        Self {
            line_comment: Some("#"),
            block_comment: None,
            doc_comment: None,
        }
    }

    /// `;` line comments, `#| |#` block comments and `;;` doc comments.
    #[must_use]
    pub const fn lisp_style() -> Self {
        Self {
            line_comment: Some(";"),
            block_comment: Some(("#|", "|#")),
            doc_comment: Some(";;"),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Keywords and identifiers are correctable; comments and whitespace are not.
    #[test]
    fn test_token_type_is_correctable() {
        for correctable in &[TokenType::Keyword, TokenType::Identifier] {
            assert!(correctable.is_correctable());
        }
        for skipped in &[TokenType::Comment, TokenType::Whitespace] {
            assert!(!skipped.is_correctable());
        }
    }

    /// Keywords and operators have a fixed vocabulary; identifiers and string
    /// literals do not.
    #[test]
    fn test_token_type_has_fixed_vocabulary() {
        for fixed in &[TokenType::Keyword, TokenType::Operator] {
            assert!(fixed.has_fixed_vocabulary());
        }
        for open in &[TokenType::Identifier, TokenType::StringLiteral] {
            assert!(!open.has_fixed_vocabulary());
        }
    }

    /// Each chained builder call must land in the matching field.
    #[test]
    fn test_token_context_builder() {
        let built = TokenContext::new(TokenType::Identifier)
            .with_parent("function_definition")
            .with_depth(3)
            .in_error();

        assert_eq!(built.token_type, TokenType::Identifier);
        assert_eq!(
            built.parent_node_type.as_deref(),
            Some("function_definition")
        );
        assert_eq!(built.depth, 3);
        assert!(built.in_error_region);
    }
}