difft_lib/parse/
guess_language.rs

//! Guess which programming language a file is written in.
//!
//! This is heavily based on GitHub's
//! [linguist](https://github.com/github/linguist/blob/master/docs/how-linguist-works.md),
//! particularly its
//! [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml).
//!
//! Difftastic does not reuse languages.yml directly, for
//! implementation simplicity and to avoid parsing very large files
//! (e.g. package.lock) that can't be handled in a reasonable time
//! yet.
12
use lazy_static::lazy_static;
use regex::Regex;
use std::{borrow::Borrow, ffi::OsStr, path::Path};
16
/// Languages supported by difftastic. Each language here has a
/// corresponding tree-sitter parser.
///
/// Only [`Language::Yaml`] is currently enabled. The other variants are
/// kept as comments so they can be restored together with their entries
/// in the lookup tables of this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Language {
    // Bash,
    // C,
    // Clojure,
    // CMake,
    // CommonLisp,
    // CPlusPlus,
    // CSharp,
    // Css,
    // Dart,
    // Elixir,
    // Elm,
    // Elvish,
    // EmacsLisp,
    // Gleam,
    // Go,
    // Hack,
    // Haskell,
    // Hcl,
    // Html,
    // Janet,
    // Java,
    // JavaScript,
    // Json,
    // Julia,
    // Jsx,
    // Kotlin,
    // Lua,
    // Make,
    // Nix,
    // OCaml,
    // OCamlInterface,
    // Php,
    // Perl,
    // Python,
    // Qml,
    // Ruby,
    // Rust,
    // Scala,
    // Sql,
    // Swift,
    // Toml,
    // Tsx,
    // TypeScript,
    Yaml,
    // Zig,
}
67
68/// The language name shown to the user.
69pub fn language_name(language: Language) -> &'static str {
70    match language {
71        // Bash => "Bash",
72        // C => "C",
73        // Clojure => "Clojure",
74        // CMake => "CMake",
75        // CommonLisp => "Common Lisp",
76        // CPlusPlus => "C++",
77        // CSharp => "C#",
78        // Css => "CSS",
79        // Dart => "Dart",
80        // Elixir => "Elixir",
81        // Elm => "Elm",
82        // Elvish => "Elvish",
83        // EmacsLisp => "Emacs Lisp",
84        // Gleam => "Gleam",
85        // Go => "Go",
86        // Hack => "Hack",
87        // Haskell => "Haskell",
88        // Hcl => "HCL",
89        // Html => "HTML",
90        // Janet => "Janet",
91        // Java => "Java",
92        // JavaScript => "JavaScript",
93        // Json => "JSON",
94        // Julia => "Julia",
95        // Jsx => "JavaScript JSX",
96        // Kotlin => "Kotlin",
97        // Lua => "Lua",
98        // Make => "Make",
99        // Nix => "Nix",
100        // OCaml => "OCaml",
101        // OCamlInterface => "OCaml Interface",
102        // Php => "PHP",
103        // Perl => "Perl",
104        // Python => "Python",
105        // Qml => "QML",
106        // Ruby => "Ruby",
107        // Rust => "Rust",
108        // Scala => "Scala",
109        // Sql => "SQL",
110        // Swift => "Swift",
111        // Toml => "TOML",
112        // Tsx => "TypeScript TSX",
113        // TypeScript => "TypeScript",
114        Yaml => "YAML",
115        // Zig => "Zig",
116    }
117}
118
119pub const LANG_EXTENSIONS: &'static [(Language, &[&str])] = &[
120    // (
121    //     Bash,
122    //     &[
123    //         "sh", "bash", "bats", "cgi", "command", "env", "fcgi", "ksh", "sh.in", "tmux", "tool",
124    //         "zsh",
125    //     ],
126    // ),
127    // (C, &["c"]),
128    // (
129    //     Clojure,
130    //     &[
131    //         "bb", "boot", "clj", "cljc", "clje", "cljs", "cljx", "edn", "joke", "joker",
132    //     ],
133    // ),
134    // (CMake, &["cmake", "cmake.in"]),
135    // (CommonLisp, &["lisp", "lsp", "asd"]),
136    // // Treat .h as C++ rather than C. This is an arbitrary choice, but
137    // // C++ is more widely used than C according to
138    // // https://madnight.github.io/githut/
139    // (CPlusPlus, &["cc", "cpp", "h", "hh", "hpp", "cxx"]),
140    // (CSharp, &["cs"]),
141    // (Css, &["css"]),
142    // (Dart, &["dart"]),
143    // (Elm, &["elm"]),
144    // (EmacsLisp, &["el"]),
145    // (Elixir, &["ex", "exs"]),
146    // (Elvish, &["elv"]),
147    // (Gleam, &["gleam"]),
148    // (Go, &["go"]),
149    // (Hack, &["hack", "hck", "hhi"]),
150    // (Haskell, &["hs"]),
151    // (Hcl, &["hcl", "nomad", "tf", "tfvars", "workflow"]),
152    // (Html, &["html", "htm", "xhtml"]),
153    // (Janet, &["janet", "jdn"]),
154    // (Java, &["java"]),
155    // (JavaScript, &["cjs", "js", "mjs"]),
156    // (
157    //     Json,
158    //     &[
159    //         "json",
160    //         "avsc",
161    //         "geojson",
162    //         "gltf",
163    //         "har",
164    //         "ice",
165    //         "JSON-tmLanguage",
166    //         "jsonl",
167    //         "mcmeta",
168    //         "tfstate",
169    //         "tfstate.backup",
170    //         "topojson",
171    //         "webapp",
172    //         "webmanifest",
173    //     ],
174    // ),
175    // (Jsx, &["jsx"]),
176    // (Julia, &["jl"]),
177    // (Kotlin, &["kt", "ktm", "kts"]),
178    // (Lua, &["lua"]),
179    // (Make, &["mak", "d", "make", "makefile", "mk", "mkfile"]),
180    // (Nix, &["nix"]),
181    // (OCaml, &["ml"]),
182    // (OCamlInterface, &["mli"]),
183    // (Php, &["php"]),
184    // (Perl, &["pm", "pl"]),
185    // (Python, &["py", "py3", "pyi", "bzl"]),
186    // (Qml, &["qml"]),
187    // (Ruby, &["rb", "builder", "spec", "rake"]),
188    // (Rust, &["rs"]),
189    // (Scala, &["scala", "sbt", "sc"]),
190    // (Sql, &["sql", "pgsql"]),
191    // (Swift, &["swift"]),
192    // (Toml, &["toml"]),
193    // (TypeScript, &["ts"]),
194    // (Tsx, &["tsx"]),
195    (Yaml, &["yaml", "yml"]),
196    // (Zig, &["zig"]),
197];
198
199use Language::*;
200
201pub fn guess(path: &Path, src: &str) -> Option<Language> {
202    if let Some(lang) = from_emacs_mode_header(src) {
203        return Some(lang);
204    }
205    // if let Some(lang) = from_shebang(src) {
206    //     return Some(lang);
207    // }
208    // if let Some(lang) = from_name(path) {
209    //     return Some(lang);
210    // }
211
212    match path.extension() {
213        Some(extension) => match from_extension(extension) {
214            // Some(Language::Php) if src.starts_with("<?hh") => None,
215            language => language,
216        },
217        None => None,
218    }
219}
220
221/// Try to guess the language based on an Emacs mode comment at the
222/// beginning of the file.
223///
224/// <https://www.gnu.org/software/emacs/manual/html_node/emacs/Choosing-Modes.html>
225/// <https://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html>
226fn from_emacs_mode_header(src: &str) -> Option<Language> {
227    lazy_static! {
228        static ref MODE_RE: Regex = Regex::new(r"-\*-.*mode:([^;]+?);.*-\*-").unwrap();
229        static ref SHORTHAND_RE: Regex = Regex::new(r"-\*-(.+)-\*-").unwrap();
230    }
231
232    // Emacs allows the mode header to occur on the second line if the
233    // first line is a shebang.
234    for line in src.lines().take(2) {
235        let mode_name: String = match (MODE_RE.captures(line), SHORTHAND_RE.captures(line)) {
236            (Some(cap), _) | (_, Some(cap)) => cap[1].into(),
237            _ => "".into(),
238        };
239        let lang = match mode_name.to_ascii_lowercase().trim().borrow() {
240            // "c" => Some(C),
241            // "clojure" => Some(Clojure),
242            // "csharp" => Some(CSharp),
243            // "css" => Some(Css),
244            // "dart" => Some(Dart),
245            // "c++" => Some(CPlusPlus),
246            // "elixir" => Some(Elixir),
247            // "elm" => Some(Elm),
248            // "elvish" => Some(Elvish),
249            // "emacs-lisp" => Some(EmacsLisp),
250            // "gleam" => Some(Gleam),
251            // "go" => Some(Go),
252            // "haskell" => Some(Haskell),
253            // "hcl" => Some(Hcl),
254            // "html" => Some(Html),
255            // "janet" => Some(Janet),
256            // "java" => Some(Java),
257            // "js" | "js2" => Some(JavaScript),
258            // "lisp" => Some(CommonLisp),
259            // "perl" => Some(Perl),
260            // "python" => Some(Python),
261            // "rjsx" => Some(Jsx),
262            // "ruby" => Some(Ruby),
263            // "rust" => Some(Rust),
264            // "scala" => Some(Scala),
265            // "sh" => Some(Bash),
266            // "sql" => Some(Sql),
267            // "swift" => Some(Swift),
268            // "toml" => Some(Toml),
269            // "tuareg" => Some(OCaml),
270            // "typescript" => Some(TypeScript),
271            "yaml" => Some(Yaml),
272            // "zig" => Some(Zig),
273            _ => None,
274        };
275        if lang.is_some() {
276            return lang;
277        }
278    }
279
280    None
281}
282
// Try to guess the language based on a shebang present in the source (currently disabled).
284// fn from_shebang(src: &str) -> Option<Language> {
285//     lazy_static! {
286//         static ref RE: Regex = Regex::new(r"#!(?:/usr/bin/env )?([^ ]+)").unwrap();
287//     }
288//     if let Some(first_line) = src.lines().next() {
289//         if let Some(cap) = RE.captures(first_line) {
290//             let interpreter_path = Path::new(&cap[1]);
291//             if let Some(name) = interpreter_path.file_name() {
292//                 match name.to_string_lossy().borrow() {
293//                     "ash" | "bash" | "dash" | "ksh" | "mksh" | "pdksh" | "rc" | "sh" | "zsh" => {
294//                         return Some(Bash)
295//                     }
296//                     "tcc" => return Some(C),
297//                     "lisp" | "sbc" | "ccl" | "clisp" | "ecl" => return Some(CommonLisp),
298//                     "elixir" => return Some(Elixir),
299//                     "elvish" => return Some(Elvish),
300//                     "hhvm" => return Some(Hack),
301//                     "runghc" | "runhaskell" | "runhugs" => return Some(Haskell),
302//                     "chakra" | "d8" | "gjs" | "js" | "node" | "nodejs" | "qjs" | "rhino" | "v8"
303//                     | "v8-shell" => return Some(JavaScript),
304//                     "ocaml" | "ocamlrun" | "ocamlscript" => return Some(OCaml),
305//                     "perl" => return Some(Perl),
306//                     "python" | "python2" | "python3" => return Some(Python),
307//                     "ruby" | "macruby" | "rake" | "jruby" | "rbx" => return Some(Ruby),
308//                     "swift" => return Some(Swift),
309//                     "deno" | "ts-node" => return Some(TypeScript),
310//                     _ => {}
311//                 }
312//             }
313//         }
314
315//         // Hack can use <?hh in files with a .php extension.
316//         if first_line.starts_with("<?hh") {
317//             return Some(Hack);
318//         }
319//     }
320
321//     None
322// }
323
324// fn from_name(path: &Path) -> Option<Language> {
325//     match path.file_name() {
326//         Some(name) => match name.to_string_lossy().borrow() {
327//             ".bash_aliases" | ".bash_history" | ".bash_logout" | ".bash_profile" | ".bashrc"
328//             | ".cshrc" | ".env" | ".env.example" | ".flaskenv" | ".kshrc" | ".login"
329//             | ".profile" | ".zlogin" | ".zlogout" | ".zprofile" | ".zshenv" | ".zshrc" | "9fs"
330//             | "PKGBUILD" | "bash_aliases" | "bash_logout" | "bash_profile" | "bashrc" | "cshrc"
331//             | "gradlew" | "kshrc" | "login" | "man" | "profile" | "zlogin" | "zlogout"
332//             | "zprofile" | "zshenv" | "zshrc" => Some(Bash),
333//             "CMakeLists.txt" => Some(CMake),
334//             ".emacs" | "_emacs" | "Cask" => Some(EmacsLisp),
335//             ".arcconfig" | ".auto-changelog" | ".c8rc" | ".htmlhintrc" | ".imgbotconfig"
336//             | ".nycrc" | ".tern-config" | ".tern-project" | ".watchmanconfig" | "Pipfile.lock"
337//             | "composer.lock" | "mcmod.info" => Some(Json),
338//             "BSDmakefile" | "GNUmakefile" | "Kbuild" | "Makefile" | "Makefile.am"
339//             | "Makefile.boot" | "Makefile.frag" | "Makefile.in" | "Makefile.inc"
340//             | "Makefile.wat" | "makefile" | "makefile.sco" | "mkfile" => Some(Make),
341//             "TARGETS" | "BUCK" | "DEPS" => Some(Python),
342//             "Gemfile" | "Rakefile" => Some(Ruby),
343//             "Cargo.lock" | "Gopkg.lock" | "Pipfile" | "poetry.lock" => Some(Toml),
344//             _ => None,
345//         },
346//         None => None,
347//     }
348// }
349
350pub fn from_extension(current_extension: &OsStr) -> Option<Language> {
351    let current_extension = current_extension.to_string_lossy();
352
353    for (language, extensions) in LANG_EXTENSIONS {
354        for extension in *extensions {
355            if &*current_extension == *extension {
356                return Some(*language);
357            }
358        }
359    }
360    None
361}
362
363// #[cfg(test)]
364// mod tests {
365//     use super::*;
366//     use pretty_assertions::assert_eq;
367
368//     #[test]
369//     fn test_guess_by_extension() {
370//         let path = Path::new("foo.el");
371//         assert_eq!(guess(path, ""), Some(EmacsLisp));
372//     }
373
374//     #[test]
375//     fn test_guess_by_whole_name() {
376//         let path = Path::new("foo/.bashrc");
377//         assert_eq!(guess(path, ""), Some(Bash));
378//     }
379
380//     #[test]
381//     fn test_guess_by_shebang() {
382//         let path = Path::new("foo");
383//         assert_eq!(guess(path, "#!/bin/bash"), Some(Bash));
384//     }
385
386//     #[test]
387//     fn test_guess_by_env_shebang() {
388//         let path = Path::new("foo");
389//         assert_eq!(guess(path, "#!/usr/bin/env python"), Some(Python));
390//     }
391
392//     #[test]
393//     fn test_guess_by_emacs_mode() {
394//         let path = Path::new("foo");
395//         assert_eq!(
396//             guess(path, "; -*- mode: Lisp; eval: (auto-fill-mode 1); -*-"),
397//             Some(CommonLisp)
398//         );
399//     }
400
401//     #[test]
402//     fn test_guess_by_emacs_mode_second_line() {
403//         let path = Path::new("foo");
404//         assert_eq!(
405//             guess(path, "#!/bin/bash\n; -*- mode: Lisp; -*-"),
406//             Some(CommonLisp)
407//         );
408//     }
409
410//     #[test]
411//     fn test_guess_by_emacs_mode_shorthand() {
412//         let path = Path::new("foo");
413//         assert_eq!(guess(path, "(* -*- tuareg -*- *)"), Some(OCaml));
414//     }
415
416//     #[test]
417//     fn test_guess_by_emacs_mode_shorthand_no_spaces() {
418//         let path = Path::new("foo");
419//         assert_eq!(guess(path, "# -*-python-*-"), Some(Python));
420//     }
421
422//     #[test]
423//     fn test_guess_unknown() {
424//         let path = Path::new("jfkdlsjfkdsljfkdsljf");
425//         assert_eq!(guess(path, ""), None);
426//     }
427// }