Skip to main content

rumdl_lib/code_block_tools/
linguist.rs

//! Language alias resolution based on a curated subset of GitHub Linguist data.
//!
//! This module provides a mapping from language aliases (e.g., "py", "bash")
//! to canonical language names (e.g., "python", "shell") so that tool
//! configuration can be looked up consistently.
6
7use std::collections::HashMap;
8use std::sync::LazyLock;
9
/// Resolver that maps language aliases to canonical language names.
///
/// The resolver only holds a shared reference to a `'static` lookup table,
/// so it is cheap to copy and can be formatted with `{:?}` for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct LinguistResolver {
    /// Lookup table: language tag (alias or canonical name) -> canonical name.
    alias_map: &'static HashMap<&'static str, &'static str>,
}
15
16impl LinguistResolver {
17    /// Create a new resolver using embedded Linguist data.
18    pub fn new() -> Self {
19        Self {
20            alias_map: &LANGUAGE_ALIASES,
21        }
22    }
23
24    /// Resolve a language tag to its canonical name.
25    ///
26    /// Returns the canonical name if the input is a known alias,
27    /// otherwise returns the input lowercased.
28    pub fn resolve(&self, language: &str) -> String {
29        let lower = language.to_lowercase();
30        self.alias_map
31            .get(lower.as_str())
32            .map(|&s| s.to_string())
33            .unwrap_or(lower)
34    }
35
36    /// Check if a language (or alias) is known.
37    pub fn is_known(&self, language: &str) -> bool {
38        let lower = language.to_lowercase();
39        self.alias_map.contains_key(lower.as_str())
40    }
41}
42
43impl Default for LinguistResolver {
44    fn default() -> Self {
45        Self::new()
46    }
47}
48
/// Lazily built lookup table from language tag to canonical language name.
///
/// A curated subset inspired by GitHub Linguist's `languages.yml`.
/// Every key is lowercase and belongs to one of three categories:
/// - canonical names, mapped to themselves (identity entries)
/// - aliases, mapped to their canonical name
/// - common file extensions (without the dot), mapped to their canonical name
static LANGUAGE_ALIASES: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    // Each row reads `(canonical name, every tag that resolves to it)`.
    // The canonical name is listed among its own tags wherever the table
    // carries an identity entry for it.
    const GROUPS: &[(&str, &[&str])] = &[
        ("python", &["python", "py", "python3", "py3", "pyw"]),
        ("javascript", &["javascript", "js", "node", "nodejs", "mjs", "cjs"]),
        ("typescript", &["typescript", "ts", "mts", "cts"]),
        (
            "shell",
            &["shell", "bash", "sh", "zsh", "ksh", "fish", "shellscript", "shell-script"],
        ),
        ("rust", &["rust", "rs"]),
        ("go", &["go", "golang"]),
        ("ruby", &["ruby", "rb", "jruby"]),
        ("java", &["java"]),
        ("kotlin", &["kotlin", "kt", "kts"]),
        ("scala", &["scala"]),
        ("c", &["c", "h"]),
        ("cpp", &["c++", "cpp", "cxx", "cc", "hpp", "hxx"]),
        ("csharp", &["c#", "csharp", "cs"]),
        ("fsharp", &["f#", "fsharp", "fs"]),
        ("swift", &["swift"]),
        ("objective-c", &["objective-c", "objc", "obj-c"]),
        ("php", &["php"]),
        ("perl", &["perl", "pl"]),
        ("r", &["r"]),
        ("lua", &["lua"]),
        ("haskell", &["haskell", "hs"]),
        ("elixir", &["elixir", "ex", "exs"]),
        ("erlang", &["erlang", "erl"]),
        ("clojure", &["clojure", "clj", "cljs", "cljc"]),
        ("html", &["html", "htm", "xhtml"]),
        ("css", &["css"]),
        // SCSS and Sass are kept as two distinct canonical names.
        ("scss", &["scss"]),
        ("sass", &["sass"]),
        ("less", &["less"]),
        ("json", &["json", "jsonc", "json5"]),
        ("yaml", &["yaml", "yml"]),
        ("toml", &["toml"]),
        ("xml", &["xml", "xsd", "xsl", "xslt"]),
        ("markdown", &["markdown", "md", "mkd", "mdx"]),
        (
            "sql",
            &["sql", "mysql", "postgresql", "postgres", "sqlite", "plsql", "tsql"],
        ),
        ("graphql", &["graphql", "gql"]),
        ("protobuf", &["protobuf", "proto"]),
        // Terraform and HCL are kept as two distinct canonical names.
        ("terraform", &["terraform", "tf"]),
        ("hcl", &["hcl"]),
        ("dockerfile", &["dockerfile", "docker"]),
        ("makefile", &["makefile", "make"]),
        ("nix", &["nix"]),
        ("vim", &["vim", "viml", "vimscript"]),
        ("zig", &["zig"]),
        ("nim", &["nim"]),
        ("julia", &["julia", "jl"]),
        ("ocaml", &["ocaml", "ml"]),
        ("reason", &["reason", "re"]),
        ("dart", &["dart"]),
        ("v", &["v", "vlang"]),
        ("awk", &["awk", "gawk"]),
        ("sed", &["sed"]),
        ("powershell", &["powershell", "pwsh", "ps1"]),
        ("batch", &["batch", "bat", "cmd"]),
        ("diff", &["diff", "patch"]),
        ("ini", &["ini", "cfg", "conf"]),
        ("applescript", &["applescript"]),
        ("groovy", &["groovy"]),
        ("latex", &["latex", "tex"]),
        ("text", &["text", "txt", "plaintext", "plain"]),
    ];

    // Flatten the grouped rows into individual `tag -> canonical` entries.
    GROUPS
        .iter()
        .flat_map(|&(canonical, tags)| tags.iter().map(move |&tag| (tag, canonical)))
        .collect()
});
324
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that each `(input, expected)` pair resolves as stated.
    fn assert_resolves(cases: &[(&str, &str)]) {
        let resolver = LinguistResolver::new();
        for &(input, expected) in cases {
            assert_eq!(resolver.resolve(input), expected, "resolving {:?}", input);
        }
    }

    #[test]
    fn test_resolve_known_alias() {
        assert_resolves(&[
            // Python aliases
            ("py", "python"),
            ("python3", "python"),
            ("Python", "python"),
            ("PY", "python"),
            // Shell aliases
            ("bash", "shell"),
            ("sh", "shell"),
            ("zsh", "shell"),
            // JavaScript aliases
            ("js", "javascript"),
            ("node", "javascript"),
            // Rust
            ("rs", "rust"),
            ("Rust", "rust"),
        ]);
    }

    #[test]
    fn test_resolve_unknown_language() {
        // Tags missing from the table come back lowercased.
        assert_resolves(&[("UnknownLang", "unknownlang"), ("CUSTOM", "custom")]);
    }

    #[test]
    fn test_resolve_canonical_name() {
        // A canonical name maps to itself.
        for name in ["python", "javascript", "rust"] {
            assert_resolves(&[(name, name)]);
        }
    }

    #[test]
    fn test_is_known() {
        let resolver = LinguistResolver::new();

        for tag in ["python", "py", "bash", "JavaScript"] {
            assert!(resolver.is_known(tag), "{:?} should be known", tag);
        }

        for tag in ["unknownlang", "customformat"] {
            assert!(!resolver.is_known(tag), "{:?} should be unknown", tag);
        }
    }

    #[test]
    fn test_case_insensitivity() {
        assert_resolves(&[
            ("PYTHON", "python"),
            ("Python", "python"),
            ("pYtHoN", "python"),
            ("JAVASCRIPT", "javascript"),
            ("JavaScript", "javascript"),
        ]);
    }
}