use std::path::Path;
/// Source languages and data formats this module can tell apart.
///
/// [`Language::Unknown`] is the catch-all for anything that is not recognized.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Language {
    /// Rust source code.
    Rust,
    /// TypeScript source code.
    TypeScript,
    /// TSX source code (kept distinct from plain TypeScript).
    Tsx,
    /// JavaScript source code.
    JavaScript,
    /// JSX source code (kept distinct from plain JavaScript).
    Jsx,
    /// Python source code.
    Python,
    /// Go source code.
    Go,
    /// Java source code.
    Java,
    /// C source code.
    C,
    /// C++ source code.
    Cpp,
    /// JSON documents.
    Json,
    /// TOML documents.
    Toml,
    /// YAML documents.
    Yaml,
    /// Markdown documents.
    Markdown,
    /// Anything that could not be identified.
    Unknown,
}
impl Language {
pub fn as_str(&self) -> &'static str {
match self {
Language::Rust => "rust",
Language::TypeScript => "typescript",
Language::Tsx => "tsx",
Language::JavaScript => "javascript",
Language::Jsx => "jsx",
Language::Python => "python",
Language::Go => "go",
Language::Java => "java",
Language::C => "c",
Language::Cpp => "cpp",
Language::Json => "json",
Language::Toml => "toml",
Language::Yaml => "yaml",
Language::Markdown => "markdown",
Language::Unknown => "unknown",
}
}
#[cfg(feature = "semantic-chunking")]
pub fn has_tree_sitter_grammar(&self) -> bool {
!matches!(self, Language::Unknown)
}
#[cfg(not(feature = "semantic-chunking"))]
pub fn has_tree_sitter_grammar(&self) -> bool {
false
}
#[cfg(feature = "semantic-chunking")]
pub fn tree_sitter_language(&self) -> Option<tree_sitter::Language> {
match self {
Language::Rust => Some(tree_sitter_rust::LANGUAGE.into()),
Language::TypeScript => Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()),
Language::Tsx => Some(tree_sitter_typescript::LANGUAGE_TSX.into()),
Language::JavaScript => Some(tree_sitter_javascript::LANGUAGE.into()),
Language::Jsx => Some(tree_sitter_javascript::LANGUAGE.into()), Language::Python => Some(tree_sitter_python::LANGUAGE.into()),
Language::Go => Some(tree_sitter_go::LANGUAGE.into()),
Language::Java => Some(tree_sitter_java::LANGUAGE.into()),
Language::C => Some(tree_sitter_c::LANGUAGE.into()),
Language::Cpp => Some(tree_sitter_cpp::LANGUAGE.into()),
Language::Json => Some(tree_sitter_json::LANGUAGE.into()),
Language::Toml => Some(tree_sitter_toml_ng::LANGUAGE.into()),
Language::Yaml => Some(tree_sitter_yaml::LANGUAGE.into()),
Language::Markdown => Some(tree_sitter_md::LANGUAGE.into()),
Language::Unknown => None,
}
}
#[cfg(not(feature = "semantic-chunking"))]
pub fn tree_sitter_language(&self) -> Option<()> {
None
}
pub fn semantic_node_types(&self) -> &'static [&'static str] {
match self {
Language::Rust => &[
"function_item",
"impl_item",
"struct_item",
"enum_item",
"mod_item",
"trait_item",
"type_item",
"const_item",
"static_item",
"macro_definition",
],
Language::TypeScript | Language::Tsx => &[
"function_declaration",
"class_declaration",
"method_definition",
"arrow_function",
"interface_declaration",
"type_alias_declaration",
"enum_declaration",
"export_statement",
],
Language::JavaScript | Language::Jsx => &[
"function_declaration",
"class_declaration",
"method_definition",
"arrow_function",
"export_statement",
],
Language::Python => &[
"function_definition",
"class_definition",
"decorated_definition",
],
Language::Go => &[
"function_declaration",
"method_declaration",
"type_declaration",
"const_declaration",
"var_declaration",
],
Language::Java => &[
"class_declaration",
"method_declaration",
"interface_declaration",
"enum_declaration",
"constructor_declaration",
],
Language::C => &[
"function_definition",
"struct_specifier",
"enum_specifier",
"type_definition",
],
Language::Cpp => &[
"function_definition",
"class_specifier",
"struct_specifier",
"enum_specifier",
"namespace_definition",
"template_declaration",
],
Language::Json | Language::Toml | Language::Yaml => &[],
Language::Markdown => &["section", "atx_heading"],
Language::Unknown => &[],
}
}
}
/// Determines the [`Language`] of a file from its path alone.
///
/// Only the text of `path` is inspected; the file is never opened. A small set
/// of well-known file names is checked first, then the extension is looked up.
/// Both comparisons ignore case.
///
/// Returns [`Language::Unknown`] when the path has no extension, when the
/// extension is not recognized, or when the name is not valid UTF-8.
///
/// The TypeScript module extensions `.mts` and `.cts` are recognized alongside
/// `.ts`, mirroring the `.mjs`/`.cjs` handling for JavaScript.
pub fn detect_language(path: &Path) -> Language {
    // Well-known file names take precedence over the extension table. Every
    // name listed here currently resolves to the same language through its
    // extension as well.
    let file_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .unwrap_or_default()
        .to_lowercase();
    match file_name.as_str() {
        "cargo.toml" | "pyproject.toml" => return Language::Toml,
        "package.json" | "tsconfig.json" => return Language::Json,
        "readme.md" | "changelog.md" => return Language::Markdown,
        _ => {}
    }

    // A missing or non-UTF-8 extension becomes "" and falls through to `Unknown`.
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or_default()
        .to_lowercase();
    match extension.as_str() {
        "rs" => Language::Rust,
        "ts" | "mts" | "cts" => Language::TypeScript,
        "tsx" => Language::Tsx,
        "js" | "mjs" | "cjs" => Language::JavaScript,
        "jsx" => Language::Jsx,
        "py" | "pyi" | "pyw" => Language::Python,
        "go" => Language::Go,
        "java" => Language::Java,
        "c" | "h" => Language::C,
        "cpp" | "cxx" | "cc" | "hpp" | "hxx" | "hh" => Language::Cpp,
        "json" | "jsonc" => Language::Json,
        "toml" => Language::Toml,
        "yaml" | "yml" => Language::Yaml,
        "md" | "markdown" => Language::Markdown,
        _ => Language::Unknown,
    }
}
/// Convenience wrapper around [`detect_language`] for callers that hold the
/// path as a string slice rather than a [`Path`].
pub fn detect_language_from_str(path: &str) -> Language {
    let as_path: &Path = path.as_ref();
    detect_language(as_path)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: runs `detect_language` on a path written as a string literal.
    fn lang_of(path: &str) -> Language {
        detect_language(Path::new(path))
    }

    #[test]
    fn test_detect_rust() {
        assert_eq!(lang_of("src/main.rs"), Language::Rust);
        assert_eq!(lang_of("lib.rs"), Language::Rust);
    }

    #[test]
    fn test_detect_typescript() {
        assert_eq!(lang_of("app.ts"), Language::TypeScript);
        assert_eq!(lang_of("component.tsx"), Language::Tsx);
    }

    #[test]
    fn test_detect_javascript() {
        assert_eq!(lang_of("index.js"), Language::JavaScript);
        assert_eq!(lang_of("utils.mjs"), Language::JavaScript);
        assert_eq!(lang_of("component.jsx"), Language::Jsx);
    }

    #[test]
    fn test_detect_python() {
        assert_eq!(lang_of("script.py"), Language::Python);
        assert_eq!(lang_of("types.pyi"), Language::Python);
    }

    #[test]
    fn test_detect_go() {
        assert_eq!(lang_of("main.go"), Language::Go);
    }

    #[test]
    fn test_detect_java() {
        assert_eq!(lang_of("Main.java"), Language::Java);
    }

    #[test]
    fn test_detect_c_cpp() {
        assert_eq!(lang_of("main.c"), Language::C);
        assert_eq!(lang_of("header.h"), Language::C);
        assert_eq!(lang_of("app.cpp"), Language::Cpp);
        assert_eq!(lang_of("app.hpp"), Language::Cpp);
    }

    #[test]
    fn test_detect_config_files() {
        assert_eq!(lang_of("config.json"), Language::Json);
        assert_eq!(lang_of("config.toml"), Language::Toml);
        assert_eq!(lang_of("config.yaml"), Language::Yaml);
        assert_eq!(lang_of("config.yml"), Language::Yaml);
    }

    #[test]
    fn test_detect_markdown() {
        assert_eq!(lang_of("README.md"), Language::Markdown);
        assert_eq!(lang_of("docs.markdown"), Language::Markdown);
    }

    #[test]
    fn test_detect_unknown() {
        assert_eq!(lang_of("file.xyz"), Language::Unknown);
        assert_eq!(lang_of("no_extension"), Language::Unknown);
    }

    // Upper-case extensions must resolve exactly like their lower-case forms.
    #[test]
    fn test_case_insensitive() {
        assert_eq!(lang_of("file.RS"), Language::Rust);
        assert_eq!(lang_of("file.PY"), Language::Python);
    }

    // Spot-checks a couple of entries per language rather than the full tables.
    #[test]
    fn test_semantic_node_types() {
        let rust_nodes = Language::Rust.semantic_node_types();
        for expected in ["function_item", "impl_item"].iter() {
            assert!(rust_nodes.contains(expected));
        }

        let python_nodes = Language::Python.semantic_node_types();
        for expected in ["function_definition", "class_definition"].iter() {
            assert!(python_nodes.contains(expected));
        }
    }
}