// vyctor 0.1.0
//
// A fast CLI tool for semantic file search using vector embeddings.
// Documentation
//! Language detection and tree-sitter grammar loading
//!
//! This module maps file extensions to language identifiers and provides
//! access to tree-sitter grammars when available.

use std::path::Path;

/// Supported languages for semantic chunking.
///
/// Every variant except [`Language::Unknown`] is produced by
/// [`detect_language`] for at least one file extension. The extensions
/// mentioned on each variant are examples, not an exhaustive list.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// Rust source (e.g. `.rs`).
    Rust,
    /// TypeScript source (e.g. `.ts`).
    TypeScript,
    /// TypeScript with JSX (e.g. `.tsx`); uses the dedicated TSX grammar.
    Tsx,
    /// JavaScript source (e.g. `.js`).
    JavaScript,
    /// JavaScript with JSX (e.g. `.jsx`); shares the JavaScript grammar.
    Jsx,
    /// Python source (e.g. `.py`).
    Python,
    /// Go source (e.g. `.go`).
    Go,
    /// Java source (e.g. `.java`).
    Java,
    /// C source and headers (e.g. `.c`, `.h`).
    C,
    /// C++ source and headers (e.g. `.cpp`, `.hpp`).
    Cpp,
    /// JSON documents (e.g. `.json`); no semantic node types are defined.
    Json,
    /// TOML documents (e.g. `.toml`); no semantic node types are defined.
    Toml,
    /// YAML documents (e.g. `.yaml`); no semantic node types are defined.
    Yaml,
    /// Markdown documents (e.g. `.md`).
    Markdown,
    /// Fallback for files whose name and extension are not recognised.
    /// It has no tree-sitter grammar and no semantic node types.
    Unknown,
}

impl Language {
    /// Get the language identifier string
    pub fn as_str(&self) -> &'static str {
        match self {
            Language::Rust => "rust",
            Language::TypeScript => "typescript",
            Language::Tsx => "tsx",
            Language::JavaScript => "javascript",
            Language::Jsx => "jsx",
            Language::Python => "python",
            Language::Go => "go",
            Language::Java => "java",
            Language::C => "c",
            Language::Cpp => "cpp",
            Language::Json => "json",
            Language::Toml => "toml",
            Language::Yaml => "yaml",
            Language::Markdown => "markdown",
            Language::Unknown => "unknown",
        }
    }

    /// Check if tree-sitter grammar is available for this language
    #[cfg(feature = "semantic-chunking")]
    pub fn has_tree_sitter_grammar(&self) -> bool {
        !matches!(self, Language::Unknown)
    }

    #[cfg(not(feature = "semantic-chunking"))]
    pub fn has_tree_sitter_grammar(&self) -> bool {
        false
    }

    /// Get tree-sitter language if available
    #[cfg(feature = "semantic-chunking")]
    pub fn tree_sitter_language(&self) -> Option<tree_sitter::Language> {
        match self {
            Language::Rust => Some(tree_sitter_rust::LANGUAGE.into()),
            Language::TypeScript => Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()),
            Language::Tsx => Some(tree_sitter_typescript::LANGUAGE_TSX.into()),
            Language::JavaScript => Some(tree_sitter_javascript::LANGUAGE.into()),
            Language::Jsx => Some(tree_sitter_javascript::LANGUAGE.into()), // JSX uses same grammar
            Language::Python => Some(tree_sitter_python::LANGUAGE.into()),
            Language::Go => Some(tree_sitter_go::LANGUAGE.into()),
            Language::Java => Some(tree_sitter_java::LANGUAGE.into()),
            Language::C => Some(tree_sitter_c::LANGUAGE.into()),
            Language::Cpp => Some(tree_sitter_cpp::LANGUAGE.into()),
            Language::Json => Some(tree_sitter_json::LANGUAGE.into()),
            Language::Toml => Some(tree_sitter_toml_ng::LANGUAGE.into()),
            Language::Yaml => Some(tree_sitter_yaml::LANGUAGE.into()),
            Language::Markdown => Some(tree_sitter_md::LANGUAGE.into()),
            Language::Unknown => None,
        }
    }

    #[cfg(not(feature = "semantic-chunking"))]
    pub fn tree_sitter_language(&self) -> Option<()> {
        None
    }

    /// Get the semantic node types for this language (functions, classes, etc.)
    pub fn semantic_node_types(&self) -> &'static [&'static str] {
        match self {
            Language::Rust => &[
                "function_item",
                "impl_item",
                "struct_item",
                "enum_item",
                "mod_item",
                "trait_item",
                "type_item",
                "const_item",
                "static_item",
                "macro_definition",
            ],
            Language::TypeScript | Language::Tsx => &[
                "function_declaration",
                "class_declaration",
                "method_definition",
                "arrow_function",
                "interface_declaration",
                "type_alias_declaration",
                "enum_declaration",
                "export_statement",
            ],
            Language::JavaScript | Language::Jsx => &[
                "function_declaration",
                "class_declaration",
                "method_definition",
                "arrow_function",
                "export_statement",
            ],
            Language::Python => &[
                "function_definition",
                "class_definition",
                "decorated_definition",
            ],
            Language::Go => &[
                "function_declaration",
                "method_declaration",
                "type_declaration",
                "const_declaration",
                "var_declaration",
            ],
            Language::Java => &[
                "class_declaration",
                "method_declaration",
                "interface_declaration",
                "enum_declaration",
                "constructor_declaration",
            ],
            Language::C => &[
                "function_definition",
                "struct_specifier",
                "enum_specifier",
                "type_definition",
            ],
            Language::Cpp => &[
                "function_definition",
                "class_specifier",
                "struct_specifier",
                "enum_specifier",
                "namespace_definition",
                "template_declaration",
            ],
            Language::Json | Language::Toml | Language::Yaml => &[],
            Language::Markdown => &["section", "atx_heading"],
            Language::Unknown => &[],
        }
    }
}

/// Detect the language of a file from its path.
///
/// Only the final path component is inspected; the file itself is never
/// opened. Matching is case-insensitive. A small set of well-known file
/// names is checked first, then the extension is looked up in the table
/// below.
///
/// Returns [`Language::Unknown`] when the path has no file name or
/// extension, when either is not valid UTF-8, or when nothing matches.
pub fn detect_language(path: &Path) -> Language {
    let file_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    // Well-known file names take precedence over the extension table.
    match file_name.as_str() {
        "cargo.toml" | "pyproject.toml" => return Language::Toml,
        "package.json" | "tsconfig.json" => return Language::Json,
        "readme.md" | "changelog.md" => return Language::Markdown,
        _ => {}
    }

    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    match extension.as_str() {
        "rs" => Language::Rust,
        // `.mts` / `.cts` are the ES-module / CommonJS variants of `.ts`,
        // mirroring `.mjs` / `.cjs` on the JavaScript side.
        "ts" | "mts" | "cts" => Language::TypeScript,
        "tsx" => Language::Tsx,
        "js" | "mjs" | "cjs" => Language::JavaScript,
        "jsx" => Language::Jsx,
        "py" | "pyi" | "pyw" => Language::Python,
        "go" => Language::Go,
        "java" => Language::Java,
        // `.h` is ambiguous between C and C++; it is treated as C.
        "c" | "h" => Language::C,
        "cpp" | "cxx" | "cc" | "hpp" | "hxx" | "hh" => Language::Cpp,
        "json" | "jsonc" => Language::Json,
        "toml" => Language::Toml,
        "yaml" | "yml" => Language::Yaml,
        "md" | "markdown" => Language::Markdown,
        _ => Language::Unknown,
    }
}

/// Convenience wrapper around [`detect_language`] for callers that hold the
/// path as a string slice rather than a [`Path`].
pub fn detect_language_from_str(path: &str) -> Language {
    let as_path: &Path = path.as_ref();
    detect_language(as_path)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that each `(path, expected)` pair is detected as expected.
    fn assert_detected(cases: &[(&str, Language)]) {
        for &(path, expected) in cases {
            assert_eq!(
                detect_language(Path::new(path)),
                expected,
                "unexpected language for path {:?}",
                path
            );
        }
    }

    #[test]
    fn test_detect_rust() {
        assert_detected(&[("src/main.rs", Language::Rust), ("lib.rs", Language::Rust)]);
    }

    #[test]
    fn test_detect_typescript() {
        assert_detected(&[
            ("app.ts", Language::TypeScript),
            ("component.tsx", Language::Tsx),
        ]);
    }

    #[test]
    fn test_detect_javascript() {
        assert_detected(&[
            ("index.js", Language::JavaScript),
            ("utils.mjs", Language::JavaScript),
            ("component.jsx", Language::Jsx),
        ]);
    }

    #[test]
    fn test_detect_python() {
        assert_detected(&[
            ("script.py", Language::Python),
            ("types.pyi", Language::Python),
        ]);
    }

    #[test]
    fn test_detect_go() {
        assert_detected(&[("main.go", Language::Go)]);
    }

    #[test]
    fn test_detect_java() {
        assert_detected(&[("Main.java", Language::Java)]);
    }

    #[test]
    fn test_detect_c_cpp() {
        assert_detected(&[
            ("main.c", Language::C),
            ("header.h", Language::C),
            ("app.cpp", Language::Cpp),
            ("app.hpp", Language::Cpp),
        ]);
    }

    #[test]
    fn test_detect_config_files() {
        assert_detected(&[
            ("config.json", Language::Json),
            ("config.toml", Language::Toml),
            ("config.yaml", Language::Yaml),
            ("config.yml", Language::Yaml),
        ]);
    }

    #[test]
    fn test_detect_markdown() {
        assert_detected(&[
            ("README.md", Language::Markdown),
            ("docs.markdown", Language::Markdown),
        ]);
    }

    #[test]
    fn test_detect_unknown() {
        assert_detected(&[
            ("file.xyz", Language::Unknown),
            ("no_extension", Language::Unknown),
        ]);
    }

    #[test]
    fn test_case_insensitive() {
        assert_detected(&[("file.RS", Language::Rust), ("file.PY", Language::Python)]);
    }

    #[test]
    fn test_semantic_node_types() {
        // Each language must expose at least these node types.
        let expectations = [
            (Language::Rust, ["function_item", "impl_item"]),
            (Language::Python, ["function_definition", "class_definition"]),
        ];

        for (language, required) in expectations.iter() {
            let node_types = language.semantic_node_types();
            for name in required.iter() {
                assert!(
                    node_types.contains(name),
                    "{:?} is missing node type {:?}",
                    language,
                    name
                );
            }
        }
    }
}