use regex::Regex;
use std::sync::LazyLock;
/// Lookup table pairing every recognised language keyword or alias with the
/// canonical language name it stands for.
///
/// Each tuple is `(keyword, canonical_name)`. Keywords are stored in lowercase.
/// Entry order is significant to code that scans the table front to back, so
/// new aliases should be added next to the language they belong to.
const LANGUAGE_MAP: &[(&str, &str)] = &[
    // Rust
    ("rust", "rust"),
    ("rs", "rust"),
    // Python
    ("python", "python"),
    ("py", "python"),
    // JavaScript / TypeScript
    ("javascript", "javascript"),
    ("js", "javascript"),
    ("typescript", "typescript"),
    ("ts", "typescript"),
    // Go
    ("go", "go"),
    ("golang", "go"),
    // JVM and C family
    ("java", "java"),
    ("cpp", "cpp"),
    ("c++", "cpp"),
    ("cxx", "cpp"),
    ("c", "c"),
    // Ruby, PHP, SQL
    ("ruby", "ruby"),
    ("rb", "ruby"),
    ("php", "php"),
    ("sql", "sql"),
    // Terraform
    ("terraform", "terraform"),
    ("tf", "terraform"),
    // Kotlin, Swift, Scala
    ("kotlin", "kotlin"),
    ("kt", "kotlin"),
    ("swift", "swift"),
    ("scala", "scala"),
    // Elixir, Haskell, Perl
    ("elixir", "elixir"),
    ("ex", "elixir"),
    ("haskell", "haskell"),
    ("hs", "haskell"),
    ("perl", "perl"),
    ("pl", "perl"),
    // Lua, R, Dart, Zig, Groovy
    ("lua", "lua"),
    ("r", "r"),
    ("dart", "dart"),
    ("zig", "zig"),
    ("groovy", "groovy"),
    // Shell dialects
    ("shell", "shell"),
    ("bash", "shell"),
    ("sh", "shell"),
    // Web markup, styling and component formats
    ("html", "html"),
    ("css", "css"),
    ("vue", "vue"),
    ("svelte", "svelte"),
];
/// Compiled, case-insensitive matcher for every keyword in [`LANGUAGE_MAP`],
/// paired with the canonical language name that keyword maps to. The vector is
/// built on first access and keeps the order of [`LANGUAGE_MAP`].
///
/// A `\b` word-boundary assertion is placed on a side of the keyword only when
/// the keyword's character on that side is a word character. `\b` holds only
/// where exactly one neighbour is a word character, so writing it after the
/// final `+` of `c++` would demand a word character immediately after the
/// keyword, and the alias would fail to match input such as
/// `"find foo in c++"`.
///
/// NOTE: the pattern for `c` still matches the leading `c` of `c++`, because
/// `+` is a non-word character and therefore forms a boundary after `c`.
///
/// # Panics
///
/// Panics on first access if a generated pattern fails to compile.
static LANGUAGE_PATTERNS: LazyLock<Vec<(Regex, &'static str)>> = LazyLock::new(|| {
    /// Approximates the regex `\w` class; exact for the ASCII keywords in the table.
    fn is_word_char(c: char) -> bool {
        c.is_alphanumeric() || c == '_'
    }

    LANGUAGE_MAP
        .iter()
        .map(|(keyword, normalized)| {
            // Only anchor a side that actually starts/ends with a word character.
            let leading = if keyword.starts_with(is_word_char) {
                r"\b"
            } else {
                ""
            };
            let trailing = if keyword.ends_with(is_word_char) {
                r"\b"
            } else {
                ""
            };
            let pattern = format!("(?i){leading}{}{trailing}", regex::escape(keyword));
            (
                Regex::new(&pattern).expect("Invalid language regex"),
                *normalized,
            )
        })
        .collect()
});
/// Returns the canonical names of all languages mentioned in `input`.
///
/// Every entry of `LANGUAGE_PATTERNS` is tried against `input`. Each canonical
/// name appears at most once in the result, in the order in which its first
/// matching pattern occurs in `LANGUAGE_PATTERNS`. An input that matches no
/// pattern yields an empty vector.
#[must_use]
pub fn extract_languages(input: &str) -> Vec<String> {
    // De-duplicate on the borrowed static names so no `String` is allocated
    // merely to perform the membership check.
    let mut found: Vec<&'static str> = Vec::new();
    for (pattern, normalized) in LANGUAGE_PATTERNS.iter() {
        // Check membership first: it is cheaper than running the regex and
        // both checks are side-effect free, so the result is unchanged.
        if !found.contains(normalized) && pattern.is_match(input) {
            found.push(*normalized);
        }
    }
    found.into_iter().map(String::from).collect()
}
/// Resolves a language keyword or alias to its canonical name.
///
/// `lang` is lowercased and compared for exact equality against the keywords
/// in `LANGUAGE_MAP`; the canonical name of the first matching entry is
/// returned. Returns `None` when no keyword equals the lowercased input.
#[must_use]
pub fn normalize_language(lang: &str) -> Option<&'static str> {
    let needle = lang.to_lowercase();
    for &(alias, canonical) in LANGUAGE_MAP {
        if alias == needle {
            return Some(canonical);
        }
    }
    None
}
/// Reports whether `lang` is a recognised language keyword or alias.
///
/// Returns `true` exactly when [`normalize_language`] resolves `lang` to a
/// canonical name.
#[must_use]
pub fn is_known_language(lang: &str) -> bool {
    let canonical = normalize_language(lang);
    canonical.is_some()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A full language name inside a query is reported on its own.
    #[test]
    fn test_extract_single_language() {
        assert_eq!(extract_languages("find foo in rust"), ["rust"]);
    }

    /// Two different languages in one query are both reported.
    #[test]
    fn test_extract_multiple_languages() {
        let found = extract_languages("find foo in typescript and javascript");
        for expected in ["typescript", "javascript"] {
            assert!(found.iter().any(|lang| lang == expected));
        }
    }

    /// An alias is reported under its canonical name.
    #[test]
    fn test_extract_alias() {
        assert_eq!(extract_languages("find foo in py"), ["python"]);
    }

    /// Matching ignores the case of the input.
    #[test]
    fn test_extract_case_insensitive() {
        assert_eq!(extract_languages("find foo in RUST"), ["rust"]);
    }

    /// A name and its alias in the same query produce a single entry.
    #[test]
    fn test_no_duplicates() {
        assert_eq!(extract_languages("find foo in rust and rs"), ["rust"]);
    }

    /// Aliases resolve to canonical names; unknown names resolve to `None`.
    #[test]
    fn test_normalize_language() {
        let cases = [
            ("py", Some("python")),
            ("golang", Some("go")),
            ("unknown", None),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_language(input), expected);
        }
    }

    /// Known names are accepted regardless of case; unknown names are rejected.
    #[test]
    fn test_is_known_language() {
        for known in ["rust", "JS"] {
            assert!(is_known_language(known));
        }
        assert!(!is_known_language("foobar"));
    }
}