opengrep 1.1.0

Advanced AST-aware code search tool with tree-sitter parsing and AI integration capabilities
Documentation
//! Language detection and parser management
//!
//! This module handles detecting file languages and managing tree-sitter parsers
//! for different programming languages.

use anyhow::Result;
use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::path::Path;
use tree_sitter::Language;

/// Metadata describing one language known to the registry.
///
/// Instances are created once while the global `LANGUAGES` map is being
/// built and are handed out by reference through [`get_language`].
#[derive(Debug, Clone)]
pub struct LanguageInfo {
    /// Canonical language name; the same string is used as the key in `LANGUAGES`.
    pub name: &'static str,
    /// File extensions (written without a leading dot, e.g. `"rs"`) that map to this language.
    pub extensions: &'static [&'static str],
    /// Tree-sitter grammar handed to the AST analyzer when parsing this language.
    pub language: Language,
    /// File names/globs commonly found alongside this language.
    ///
    /// NOTE(review): within this module these are only printed by
    /// `list_supported_languages`; `detect_language` does not consult them.
    pub patterns: &'static [&'static str],
}

/// Global language registry
static LANGUAGES: Lazy<HashMap<&'static str, LanguageInfo>> = Lazy::new(|| {
    let mut languages = HashMap::new();
    
    // Register languages with their tree-sitter parsers
    macro_rules! register_language {
        ($name:expr, $lang:expr, $exts:expr, $patterns:expr) => {
            languages.insert($name, LanguageInfo {
                name: $name,
                extensions: $exts,
                language: $lang,
                patterns: $patterns,
            });
        };
    }
    
    register_language!(
        "rust", 
        tree_sitter_rust::language(), 
        &["rs"], 
        &["Cargo.toml", "Cargo.lock"]
    );
    
    register_language!(
        "python", 
        tree_sitter_python::language(), 
        &["py", "pyw", "pyi"], 
        &["setup.py", "pyproject.toml", "requirements.txt"]
    );
    
    register_language!(
        "javascript", 
        tree_sitter_javascript::language(), 
        &["js", "mjs", "cjs"], 
        &["package.json", ".eslintrc.js"]
    );
    
    register_language!(
        "typescript", 
        tree_sitter_typescript::language_typescript(), 
        &["ts"], 
        &["tsconfig.json"]
    );
    
    register_language!(
        "tsx", 
        tree_sitter_typescript::language_tsx(), 
        &["tsx"], 
        &[]
    );
    
    register_language!(
        "go", 
        tree_sitter_go::language(), 
        &["go"], 
        &["go.mod", "go.sum"]
    );
    
    register_language!(
        "java", 
        tree_sitter_java::language(), 
        &["java"], 
        &["pom.xml", "build.gradle"]
    );
    
    register_language!(
        "c", 
        tree_sitter_c::language(), 
        &["c", "h"], 
        &["Makefile", "CMakeLists.txt"]
    );
    
    register_language!(
        "cpp", 
        tree_sitter_cpp::language(), 
        &["cpp", "cc", "cxx", "hpp", "hxx", "h++"], 
        &["CMakeLists.txt"]
    );
    
    register_language!(
        "csharp", 
        tree_sitter_c_sharp::language(), 
        &["cs"], 
        &["*.csproj", "*.sln"]
    );
    
    register_language!(
        "ruby", 
        tree_sitter_ruby::language(), 
        &["rb"], 
        &["Gemfile", "Rakefile"]
    );
    
    register_language!(
        "bash", 
        tree_sitter_bash::language(), 
        &["sh", "bash", "zsh"], 
        &[".bashrc", ".zshrc"]
    );
    
    // YAML temporarily disabled due to dependency conflicts
    // register_language!(
    //     "yaml", 
    //     tree_sitter_yaml::language(), 
    //     &["yaml", "yml"], 
    //     &[".github/workflows/*.yml"]
    // );
    
    register_language!(
        "json", 
        tree_sitter_json::language(), 
        &["json"], 
        &["package.json", "tsconfig.json"]
    );
    
    register_language!(
        "toml", 
        tree_sitter_toml::language(), 
        &["toml"], 
        &["Cargo.toml", "pyproject.toml"]
    );
    
    // HTML temporarily disabled due to dependency conflicts
    // register_language!(
    //     "html", 
    //     tree_sitter_html::language(), 
    //     &["html", "htm"], 
    //     &["index.html"]
    // );
    
    register_language!(
        "css", 
        tree_sitter_css::language(), 
        &["css"], 
        &[]
    );
    
    // SQL support temporarily disabled - requires additional dependency
    // register_language!(
    //     "sql", 
    //     tree_sitter_sql::language(), 
    //     &["sql"], 
    //     &[]
    // );
    
    languages
});

/// Reverse index from file extension (no leading dot) to the name of the
/// registered language that declares it. Derived from `LANGUAGES` on first use.
static EXTENSION_MAP: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
    LANGUAGES
        .iter()
        .flat_map(|(name, info)| info.extensions.iter().map(move |ext| (*ext, *name)))
        .collect()
});

/// Exact file names that identify a language regardless of extension.
///
/// The language labels here are plain strings; they are not required to have
/// an entry in `LANGUAGES`.
static SPECIAL_FILES: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
    const ENTRIES: &[(&str, &str)] = &[
        // Dockerfile variants
        ("Dockerfile", "dockerfile"),
        ("Dockerfile.dev", "dockerfile"),
        ("Dockerfile.prod", "dockerfile"),
        // Build files
        ("Makefile", "make"),
        ("makefile", "make"),
        ("GNUmakefile", "make"),
        ("CMakeLists.txt", "cmake"),
        // Config files
        (".gitignore", "gitignore"),
        (".dockerignore", "dockerignore"),
        ("README.md", "markdown"),
        ("README.rst", "rst"),
    ];

    ENTRIES.iter().copied().collect()
});

/// Detect the language of the file at `path`.
///
/// The checks run in this order and the first match wins:
/// 1. the exact file name is looked up in `SPECIAL_FILES`;
/// 2. the extension (case-sensitive, no leading dot) is looked up in
///    `EXTENSION_MAP`;
/// 3. the file's first line is read from disk and, if it is a `#!` line, the
///    interpreter it names is mapped to a language.
///
/// Returns `None` when nothing matches. Step 3 performs file I/O; any failure
/// to open or read the file is treated as "no match" rather than an error.
pub fn detect_language(path: &Path) -> Option<String> {
    // Check file name for special cases
    if let Some(file_name) = path.file_name().and_then(|n| n.to_str()) {
        if let Some(lang) = SPECIAL_FILES.get(file_name) {
            return Some(lang.to_string());
        }
    }

    // Check extension
    if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
        if let Some(lang) = EXTENSION_MAP.get(extension) {
            return Some(lang.to_string());
        }
    }

    // Fall back to the shebang line for extension-less scripts
    read_first_line(path)
        .and_then(|line| language_from_shebang(&line))
        .map(str::to_string)
}

/// Read at most the first 256 bytes of the first line of `path`.
///
/// Only the start of the file is read, so large or binary files are not
/// loaded into memory just to look for a shebang. Returns `None` if the file
/// cannot be opened or read, or if the bytes read are not valid UTF-8.
fn read_first_line(path: &Path) -> Option<String> {
    use std::io::{BufRead, BufReader, Read};

    // Generous upper bound for a shebang line; anything longer is not inspected.
    const MAX_FIRST_LINE_LEN: u64 = 256;

    let file = std::fs::File::open(path).ok()?;
    let mut raw = Vec::new();
    BufReader::new(file.take(MAX_FIRST_LINE_LEN))
        .read_until(b'\n', &mut raw)
        .ok()?;
    String::from_utf8(raw).ok()
}

/// Map a `#!` line to a language name based on the interpreter's base name.
///
/// Handles both direct interpreters (`#!/bin/bash`) and the `env` form
/// (`#!/usr/bin/env -S python3 -u`), skipping `env` options and `VAR=value`
/// assignments. Matching on the interpreter name rather than on substrings of
/// the whole line avoids classifying e.g. `fish` or `/usr/share/...` as bash.
fn language_from_shebang(first_line: &str) -> Option<&'static str> {
    fn base_name(token: &str) -> &str {
        token.rsplit('/').next().unwrap_or(token)
    }

    let command = first_line.strip_prefix("#!")?;
    let mut tokens = command.split_whitespace();

    let mut interpreter = base_name(tokens.next()?);
    if interpreter == "env" {
        interpreter = tokens
            .find(|token| !token.starts_with('-') && !token.contains('='))
            .map(base_name)?;
    }

    if interpreter.starts_with("python") {
        // Covers python, python3, python3.11, ...
        Some("python")
    } else if matches!(interpreter, "sh" | "bash" | "zsh" | "dash" | "ksh") {
        Some("bash")
    } else if matches!(interpreter, "node" | "nodejs") {
        Some("javascript")
    } else {
        None
    }
}

/// Get language info by name
pub fn get_language(name: &str) -> Option<&'static LanguageInfo> {
    LANGUAGES.get(name)
}

/// Parse `source` with the grammar registered under `language`.
///
/// # Errors
///
/// Fails with `Unknown language: <name>` when `language` is not registered,
/// and forwards any error from constructing the analyzer or from parsing.
pub fn parse_ast(source: &str, language: &str) -> Result<crate::ast::ParsedAst> {
    let grammar = match get_language(language) {
        Some(info) => info.language.clone(),
        None => return Err(anyhow::anyhow!("Unknown language: {}", language)),
    };

    let mut parser = crate::ast::AstAnalyzer::new(grammar)?;
    parser.parse(source)
}

/// Print every registered language and every special file name to stdout.
///
/// Languages are listed alphabetically by name with their extensions and,
/// when present, their associated file patterns. Special files are listed
/// alphabetically by file name so the output is identical from run to run
/// (`HashMap` iteration order is otherwise unspecified).
pub fn list_supported_languages() {
    println!("Supported languages:");
    println!();

    let mut languages: Vec<_> = LANGUAGES.values().collect();
    languages.sort_by_key(|info| info.name);

    for info in languages {
        let extensions = info.extensions.join(", ");
        println!("  {:<15} Extensions: {}", info.name, extensions);

        if !info.patterns.is_empty() {
            let patterns = info.patterns.join(", ");
            println!("  {:<15} Patterns:   {}", "", patterns);
        }
        println!();
    }

    println!("Special files:");
    // Sort by (file name, language) to make the listing deterministic.
    let mut special_files: Vec<_> = SPECIAL_FILES.iter().collect();
    special_files.sort_unstable();
    for (file, lang) in special_files {
        println!("  {:<15} -> {}", file, lang);
    }
}

/// Build a path predicate that accepts only files of the given languages.
///
/// An empty `languages` slice yields a predicate that accepts every path.
/// Otherwise a path is accepted only when `detect_language` identifies it as
/// one of the listed names; paths with no detectable language are rejected.
pub fn get_language_filter(languages: &[String]) -> Box<dyn Fn(&Path) -> bool + Send + Sync> {
    let wanted: Vec<String> = languages.to_vec();

    Box::new(move |path: &Path| {
        // Short-circuit keeps detection (and its file I/O) off the no-filter path.
        wanted.is_empty()
            || detect_language(path).map_or(false, |found| wanted.contains(&found))
    })
}

/// Heuristically decide whether the file at `path` is binary.
///
/// A file is considered binary when either:
/// - its extension (compared case-insensitively) is one of a fixed list of
///   executable, image, media, archive, or office-document extensions; or
/// - a NUL byte occurs within its first 8 KiB.
///
/// Only the first 8 KiB are read from disk, so large files are not loaded
/// into memory. Returns `false` if the file cannot be opened or read.
pub fn is_binary_file(path: &Path) -> bool {
    use std::io::Read;

    /// Number of leading bytes inspected for NUL bytes.
    const SNIFF_LEN: u64 = 8192;

    // Check extension first: no I/O needed for well-known binary formats.
    let has_binary_extension = path
        .extension()
        .and_then(|e| e.to_str())
        .map_or(false, |ext| {
            matches!(
                ext.to_lowercase().as_str(),
                "exe" | "dll" | "so" | "dylib" | "a" | "lib" | "obj" | "o"
                    | "jpg" | "jpeg" | "png" | "gif" | "bmp" | "ico" | "svg"
                    | "mp3" | "mp4" | "wav" | "avi" | "mov" | "mkv"
                    | "zip" | "tar" | "gz" | "rar" | "7z"
                    | "pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx"
            )
        });
    if has_binary_extension {
        return true;
    }

    // Check the head of the file for NUL bytes, reading at most SNIFF_LEN bytes.
    let file = match std::fs::File::open(path) {
        Ok(file) => file,
        Err(_) => return false,
    };
    let mut head = Vec::new();
    match file.take(SNIFF_LEN).read_to_end(&mut head) {
        Ok(_) => head.contains(&0),
        Err(_) => false,
    }
}

/// Count files per detected language under `path`, recursively.
///
/// Symbolic links are not followed. Only regular files are counted, and files
/// whose language cannot be detected are skipped.
///
/// # Errors
///
/// Returns the first directory-walk error encountered.
pub fn get_language_stats(path: &Path) -> Result<HashMap<String, usize>> {
    use walkdir::WalkDir;

    let mut counts: HashMap<String, usize> = HashMap::new();
    let walker = WalkDir::new(path).follow_links(false);

    for item in walker {
        let item = item?;

        if !item.file_type().is_file() {
            continue;
        }

        match detect_language(item.path()) {
            Some(language) => *counts.entry(language).or_insert(0) += 1,
            None => {}
        }
    }

    Ok(counts)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// File names and extensions resolve to the expected language labels.
    #[test]
    fn test_detect_language() {
        let cases = [
            ("test.rs", "rust"),
            ("test.py", "python"),
            ("test.js", "javascript"),
            ("test.ts", "typescript"),
            ("Dockerfile", "dockerfile"),
            ("Makefile", "make"),
        ];

        for &(file, expected) in cases.iter() {
            assert_eq!(
                detect_language(&PathBuf::from(file)),
                Some(expected.to_string())
            );
        }
    }

    /// Registered names are found; unregistered names are not.
    #[test]
    fn test_get_language() {
        for &known in ["rust", "python", "javascript"].iter() {
            assert!(get_language(known).is_some());
        }
        assert!(get_language("unknown").is_none());
    }

    /// A non-empty filter restricts by language; an empty filter accepts all.
    #[test]
    fn test_language_filter() {
        let rust_file = PathBuf::from("test.rs");
        let python_file = PathBuf::from("test.py");

        let rust_only = get_language_filter(&["rust".to_string()]);
        assert!(rust_only(&rust_file));
        assert!(!rust_only(&python_file));

        let accept_all = get_language_filter(&[]);
        assert!(accept_all(&rust_file));
        assert!(accept_all(&python_file));
    }

    /// Known binary extensions are flagged; source/text extensions are not.
    #[test]
    fn test_is_binary_file() {
        for &binary in ["test.exe", "image.jpg"].iter() {
            assert!(is_binary_file(&PathBuf::from(binary)));
        }
        for &text in ["test.rs", "README.md"].iter() {
            assert!(!is_binary_file(&PathBuf::from(text)));
        }
    }
}