// loc-rs 0.2.2
//
// Advanced Lines of Code counter with function extraction, git integration, and parallel processing
// Author: kelexine (https://github.com/kelexine)
// language.rs — Language-to-extension mapping and resolution

use once_cell::sync::Lazy;
use std::collections::HashMap;

/// Canonical language name → every file extension (leading dot included)
/// that belongs to that language.
///
/// Keys are lowercase; the table is built once, on first access.
pub static LANGUAGE_MAP: Lazy<HashMap<&'static str, Vec<&'static str>>> = Lazy::new(|| {
    HashMap::from([
        ("python", vec![".py", ".pyw", ".pyi"]),
        ("javascript", vec![".js", ".mjs", ".cjs"]),
        ("typescript", vec![".ts", ".tsx", ".mts"]),
        ("rust", vec![".rs"]),
        ("go", vec![".go"]),
        ("java", vec![".java"]),
        ("kotlin", vec![".kt", ".kts"]),
        ("swift", vec![".swift"]),
        ("c", vec![".c", ".h"]),
        ("cpp", vec![".cpp", ".cc", ".cxx", ".hpp", ".hxx", ".h++"]),
        ("csharp", vec![".cs"]),
        ("ruby", vec![".rb", ".rake", ".gemspec"]),
        ("php", vec![".php", ".php3", ".php4", ".php5", ".phtml"]),
        ("html", vec![".html", ".htm"]),
        ("css", vec![".css", ".scss", ".sass", ".less"]),
        ("shell", vec![".sh", ".bash", ".zsh", ".fish"]),
        ("sql", vec![".sql"]),
        ("markdown", vec![".md", ".markdown", ".mdx"]),
        ("json", vec![".json", ".jsonl", ".json5"]),
        ("yaml", vec![".yml", ".yaml"]),
        ("xml", vec![".xml", ".xsl", ".xslt"]),
        ("jsx", vec![".jsx"]),
        ("vue", vec![".vue"]),
        ("svelte", vec![".svelte"]),
        ("toml", vec![".toml"]),
        ("scala", vec![".scala", ".sc"]),
        ("haskell", vec![".hs", ".lhs"]),
        ("elixir", vec![".ex", ".exs"]),
        ("lua", vec![".lua"]),
        ("zig", vec![".zig"]),
        ("nim", vec![".nim", ".nims"]),
    ])
});

/// Short names and alternative spellings, each mapped to the canonical
/// language name used as a key in `LANGUAGE_MAP`.
static ALIASES: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
    // (alias, canonical name)
    const PAIRS: &[(&str, &str)] = &[
        ("py", "python"),
        ("js", "javascript"),
        ("ts", "typescript"),
        ("tsx", "typescript"),
        ("rs", "rust"),
        ("c++", "cpp"),
        ("cxx", "cpp"),
        ("cc", "cpp"),
        ("cs", "csharp"),
        ("rb", "ruby"),
        ("sh", "shell"),
        ("bash", "shell"),
        ("zsh", "shell"),
        ("md", "markdown"),
        ("yml", "yaml"),
        ("kt", "kotlin"),
        ("hs", "haskell"),
    ];

    PAIRS.iter().copied().collect()
});

/// Resolve a user-provided language name or extension to a list of file extensions.
///
/// Matching is case-insensitive and ignores leading/trailing whitespace, so
/// `" Rust "` resolves exactly like `"rust"`.
///
/// Accepts:
/// - Full language name: `"python"` → `[".py", ".pyw", ".pyi"]`
/// - Alias: `"py"` → same
/// - Dotted extension: `".rs"` → `[".rs"]` (returned lowercased, no lookup)
/// - Anything else is treated as a bare extension: `"xyzzy"` → `[".xyzzy"]`
///
/// Every returned extension starts with a dot. The result always contains at
/// least one element; an empty (or whitespace-only) input yields `["."]`.
pub fn resolve_extensions(input: &str) -> Vec<String> {
    // Surrounding whitespace is never part of a language name or an extension;
    // without trimming, " rust" would fall through to the bare-extension branch
    // and produce the useless extension ". rust".
    let lower = input.trim().to_lowercase();

    // An explicit leading dot means the caller already supplied an extension.
    if lower.starts_with('.') {
        return vec![lower];
    }

    // Aliases map onto canonical names; anything that is not an alias is
    // tried as a canonical name directly.
    let canonical = ALIASES
        .get(lower.as_str())
        .copied()
        .unwrap_or(lower.as_str());

    match LANGUAGE_MAP.get(canonical) {
        Some(exts) => exts.iter().map(|&ext| ext.to_string()).collect(),
        // Neither alias nor language: treat the input as a bare extension.
        None => vec![format!(".{lower}")],
    }
}

/// Extensions (leading dot, lowercase) of binary file formats; files carrying
/// one of these are skipped for line counting.
pub static BINARY_EXTENSIONS: Lazy<std::collections::HashSet<&'static str>> = Lazy::new(|| {
    std::collections::HashSet::from([
        // Images and documents
        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".pdf",
        // Archives
        ".zip", ".tar", ".gz", ".bz2", ".xz", ".rar", ".7z",
        // Executables and libraries
        ".exe", ".dll", ".so", ".dylib", ".bin", ".wasm",
        // Audio and video
        ".mp3", ".mp4", ".avi", ".mov", ".wav", ".flac", ".ogg",
        // Fonts
        ".ttf", ".otf", ".woff", ".woff2", ".eot",
        // Compiled artifacts
        ".pyc", ".pyo", ".class", ".o", ".a", ".lib",
        // Databases
        ".db", ".sqlite", ".sqlite3",
    ])
});

/// Specification for comment markers in a language.
///
/// Either field may be `None` when the language has no comment form of that
/// kind. The type is a small `Copy` value and supports equality comparison,
/// so specs can be compared directly (for example with `assert_eq!`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CommentSpec {
    /// Single-line comment marker (e.g. "//" or "#")
    pub single: Option<&'static str>,
    /// Multi-line comment delimiters as `(start, end)` (e.g. `("/*", "*/")`)
    pub multi: Option<(&'static str, &'static str)>,
}

/// Registry for comment specifications by extension.
///
/// Keys are lowercase file extensions with a leading dot. Each extension is
/// registered exactly once. Extensions without an entry (e.g. `.json`, `.md`)
/// have no comment spec in this table.
pub static COMMENT_REGISTRY: Lazy<HashMap<&'static str, CommentSpec>> = Lazy::new(|| {
    let c_style = CommentSpec {
        single: Some("//"),
        multi: Some(("/*", "*/")),
    };
    let hash_style = CommentSpec {
        single: Some("#"),
        multi: None,
    };
    let py_style = CommentSpec {
        single: Some("#"),
        // Triple quotes are often used as docstrings but act like block comments
        multi: Some(("\"\"\"", "\"\"\"")),
    };
    let html_style = CommentSpec {
        single: None,
        multi: Some(("<!--", "-->")),
    };
    let sql_style = CommentSpec {
        single: Some("--"),
        multi: Some(("/*", "*/")),
    };
    let lua_style = CommentSpec {
        single: Some("--"),
        multi: Some(("--[[", "]]")),
    };
    let haskell_style = CommentSpec {
        single: Some("--"),
        multi: Some(("{-", "-}")),
    };
    let ruby_style = CommentSpec {
        single: Some("#"),
        multi: Some(("=begin", "=end")),
    };

    let mut registry: HashMap<&'static str, CommentSpec> = HashMap::new();
    // Registers one spec for a whole group of extensions.
    let mut register = |exts: &[&'static str], spec: CommentSpec| {
        registry.extend(exts.iter().map(|&ext| (ext, spec)));
    };

    // `.jsx` and the PHP variants are listed alongside their siblings so every
    // JavaScript/PHP extension known to the language map has a spec.
    register(
        &[
            ".rs", ".go", ".java", ".kt", ".kts", ".swift", ".c", ".h", ".cpp", ".cc", ".cxx",
            ".hpp", ".hxx", ".h++", ".cs", ".js", ".mjs", ".cjs", ".jsx", ".ts", ".tsx", ".mts",
            ".php", ".php3", ".php4", ".php5", ".phtml", ".scala", ".sc", ".zig",
        ],
        c_style,
    );
    register(&[".py", ".pyw", ".pyi"], py_style);
    register(
        &[
            ".sh", ".bash", ".zsh", ".fish", ".yaml", ".yml", ".toml", ".ex", ".exs", ".nim",
            ".nims",
        ],
        hash_style,
    );
    // All Ruby extensions share the same spec, including `=begin`/`=end` blocks.
    register(&[".rb", ".rake", ".gemspec"], ruby_style);
    register(
        &[".html", ".htm", ".xml", ".xsl", ".xslt", ".vue", ".svelte"],
        html_style,
    );
    register(&[".sql"], sql_style);
    register(&[".lua"], lua_style);
    register(&[".hs", ".lhs"], haskell_style);

    registry
});

/// Directory names that are excluded by default in non-git mode.
///
/// Entries are plain directory names, not paths.
pub static EXCLUDED_DIRS: Lazy<std::collections::HashSet<&'static str>> = Lazy::new(|| {
    const NAMES: &[&str] = &[
        "node_modules",
        ".git",
        "vendor",
        ".venv",
        "venv",
        "__pycache__",
        "dist",
        "build",
        ".next",
        ".nuxt",
        "target",
        "bin",
        "obj",
        ".gradle",
        ".idea",
        ".vscode",
        "coverage",
        ".pytest_cache",
        ".mypy_cache",
        ".tox",
        "eggs",
        ".eggs",
        ".cargo",
    ];

    NAMES.iter().copied().collect()
});

/// List all known language names (for help text).
#[allow(dead_code)]
pub fn all_languages() -> Vec<&'static str> {
    let mut langs: Vec<_> = LANGUAGE_MAP.keys().copied().collect();
    langs.sort_unstable();
    langs
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the owned `Vec<String>` form that `resolve_extensions` returns.
    fn owned(exts: &[&str]) -> Vec<String> {
        exts.iter().map(|&ext| ext.to_owned()).collect()
    }

    #[test]
    fn test_resolve_extensions_basic() {
        // Full name, alias, and dotted extension all land on the same result.
        for input in ["rust", "rs", ".rs"] {
            assert_eq!(resolve_extensions(input), owned(&[".rs"]));
        }
    }

    #[test]
    fn test_resolve_extensions_aliases() {
        let python = owned(&[".py", ".pyw", ".pyi"]);
        let javascript = owned(&[".js", ".mjs", ".cjs"]);

        assert_eq!(resolve_extensions("py"), python);
        assert_eq!(resolve_extensions("javascript"), javascript);
        assert_eq!(resolve_extensions("js"), javascript);
    }

    #[test]
    fn test_resolve_extensions_case_insensitive() {
        assert_eq!(resolve_extensions("RUST"), owned(&[".rs"]));
        assert_eq!(resolve_extensions("Py"), owned(&[".py", ".pyw", ".pyi"]));
    }

    #[test]
    fn test_resolve_extensions_unknown() {
        // Unknown language should be treated as a bare extension
        assert_eq!(resolve_extensions("xyzzy"), owned(&[".xyzzy"]));
    }

    #[test]
    fn test_all_languages() {
        let names = all_languages();
        for expected in ["rust", "python"] {
            assert!(names.contains(&expected));
        }
        assert!(names.is_sorted());
    }
}