difft_lib/parse/guess_language.rs
1//! Guess which programming language a file is written in.
2//!
3//! This is heavily based on GitHub's
4//! [linguist](https://github.com/github/linguist/blob/master/docs/how-linguist-works.md),
5//! particularly its
6//! [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml).
7//!
8//! Difftastic does not reuse languages.yml directly, for
9//! implementation simplicity and to avoid parsing very large files
10//! (e.g. package.lock) that can't be handled in a reasonable time
11//! yet.
12
13use lazy_static::lazy_static;
14use regex::Regex;
15use std::{borrow::Borrow, ffi::OsStr, path::Path};
16
/// The languages difftastic knows how to parse. Every variant listed
/// here is backed by a tree-sitter parser.
///
/// Only `Yaml` is active in this build; the other variants are
/// commented out below.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Language {
    // Bash,
    // C,
    // Clojure,
    // CMake,
    // CommonLisp,
    // CPlusPlus,
    // CSharp,
    // Css,
    // Dart,
    // Elixir,
    // Elm,
    // Elvish,
    // EmacsLisp,
    // Gleam,
    // Go,
    // Hack,
    // Haskell,
    // Hcl,
    // Html,
    // Janet,
    // Java,
    // JavaScript,
    // Json,
    // Julia,
    // Jsx,
    // Kotlin,
    // Lua,
    // Make,
    // Nix,
    // OCaml,
    // OCamlInterface,
    // Php,
    // Perl,
    // Python,
    // Qml,
    // Ruby,
    // Rust,
    // Scala,
    // Sql,
    // Swift,
    // Toml,
    // Tsx,
    // TypeScript,
    Yaml,
    // Zig,
}
67
68/// The language name shown to the user.
69pub fn language_name(language: Language) -> &'static str {
70 match language {
71 // Bash => "Bash",
72 // C => "C",
73 // Clojure => "Clojure",
74 // CMake => "CMake",
75 // CommonLisp => "Common Lisp",
76 // CPlusPlus => "C++",
77 // CSharp => "C#",
78 // Css => "CSS",
79 // Dart => "Dart",
80 // Elixir => "Elixir",
81 // Elm => "Elm",
82 // Elvish => "Elvish",
83 // EmacsLisp => "Emacs Lisp",
84 // Gleam => "Gleam",
85 // Go => "Go",
86 // Hack => "Hack",
87 // Haskell => "Haskell",
88 // Hcl => "HCL",
89 // Html => "HTML",
90 // Janet => "Janet",
91 // Java => "Java",
92 // JavaScript => "JavaScript",
93 // Json => "JSON",
94 // Julia => "Julia",
95 // Jsx => "JavaScript JSX",
96 // Kotlin => "Kotlin",
97 // Lua => "Lua",
98 // Make => "Make",
99 // Nix => "Nix",
100 // OCaml => "OCaml",
101 // OCamlInterface => "OCaml Interface",
102 // Php => "PHP",
103 // Perl => "Perl",
104 // Python => "Python",
105 // Qml => "QML",
106 // Ruby => "Ruby",
107 // Rust => "Rust",
108 // Scala => "Scala",
109 // Sql => "SQL",
110 // Swift => "Swift",
111 // Toml => "TOML",
112 // Tsx => "TypeScript TSX",
113 // TypeScript => "TypeScript",
114 Yaml => "YAML",
115 // Zig => "Zig",
116 }
117}
118
119pub const LANG_EXTENSIONS: &'static [(Language, &[&str])] = &[
120 // (
121 // Bash,
122 // &[
123 // "sh", "bash", "bats", "cgi", "command", "env", "fcgi", "ksh", "sh.in", "tmux", "tool",
124 // "zsh",
125 // ],
126 // ),
127 // (C, &["c"]),
128 // (
129 // Clojure,
130 // &[
131 // "bb", "boot", "clj", "cljc", "clje", "cljs", "cljx", "edn", "joke", "joker",
132 // ],
133 // ),
134 // (CMake, &["cmake", "cmake.in"]),
135 // (CommonLisp, &["lisp", "lsp", "asd"]),
136 // // Treat .h as C++ rather than C. This is an arbitrary choice, but
137 // // C++ is more widely used than C according to
138 // // https://madnight.github.io/githut/
139 // (CPlusPlus, &["cc", "cpp", "h", "hh", "hpp", "cxx"]),
140 // (CSharp, &["cs"]),
141 // (Css, &["css"]),
142 // (Dart, &["dart"]),
143 // (Elm, &["elm"]),
144 // (EmacsLisp, &["el"]),
145 // (Elixir, &["ex", "exs"]),
146 // (Elvish, &["elv"]),
147 // (Gleam, &["gleam"]),
148 // (Go, &["go"]),
149 // (Hack, &["hack", "hck", "hhi"]),
150 // (Haskell, &["hs"]),
151 // (Hcl, &["hcl", "nomad", "tf", "tfvars", "workflow"]),
152 // (Html, &["html", "htm", "xhtml"]),
153 // (Janet, &["janet", "jdn"]),
154 // (Java, &["java"]),
155 // (JavaScript, &["cjs", "js", "mjs"]),
156 // (
157 // Json,
158 // &[
159 // "json",
160 // "avsc",
161 // "geojson",
162 // "gltf",
163 // "har",
164 // "ice",
165 // "JSON-tmLanguage",
166 // "jsonl",
167 // "mcmeta",
168 // "tfstate",
169 // "tfstate.backup",
170 // "topojson",
171 // "webapp",
172 // "webmanifest",
173 // ],
174 // ),
175 // (Jsx, &["jsx"]),
176 // (Julia, &["jl"]),
177 // (Kotlin, &["kt", "ktm", "kts"]),
178 // (Lua, &["lua"]),
179 // (Make, &["mak", "d", "make", "makefile", "mk", "mkfile"]),
180 // (Nix, &["nix"]),
181 // (OCaml, &["ml"]),
182 // (OCamlInterface, &["mli"]),
183 // (Php, &["php"]),
184 // (Perl, &["pm", "pl"]),
185 // (Python, &["py", "py3", "pyi", "bzl"]),
186 // (Qml, &["qml"]),
187 // (Ruby, &["rb", "builder", "spec", "rake"]),
188 // (Rust, &["rs"]),
189 // (Scala, &["scala", "sbt", "sc"]),
190 // (Sql, &["sql", "pgsql"]),
191 // (Swift, &["swift"]),
192 // (Toml, &["toml"]),
193 // (TypeScript, &["ts"]),
194 // (Tsx, &["tsx"]),
195 (Yaml, &["yaml", "yml"]),
196 // (Zig, &["zig"]),
197];
198
199use Language::*;
200
201pub fn guess(path: &Path, src: &str) -> Option<Language> {
202 if let Some(lang) = from_emacs_mode_header(src) {
203 return Some(lang);
204 }
205 // if let Some(lang) = from_shebang(src) {
206 // return Some(lang);
207 // }
208 // if let Some(lang) = from_name(path) {
209 // return Some(lang);
210 // }
211
212 match path.extension() {
213 Some(extension) => match from_extension(extension) {
214 // Some(Language::Php) if src.starts_with("<?hh") => None,
215 language => language,
216 },
217 None => None,
218 }
219}
220
221/// Try to guess the language based on an Emacs mode comment at the
222/// beginning of the file.
223///
224/// <https://www.gnu.org/software/emacs/manual/html_node/emacs/Choosing-Modes.html>
225/// <https://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html>
226fn from_emacs_mode_header(src: &str) -> Option<Language> {
227 lazy_static! {
228 static ref MODE_RE: Regex = Regex::new(r"-\*-.*mode:([^;]+?);.*-\*-").unwrap();
229 static ref SHORTHAND_RE: Regex = Regex::new(r"-\*-(.+)-\*-").unwrap();
230 }
231
232 // Emacs allows the mode header to occur on the second line if the
233 // first line is a shebang.
234 for line in src.lines().take(2) {
235 let mode_name: String = match (MODE_RE.captures(line), SHORTHAND_RE.captures(line)) {
236 (Some(cap), _) | (_, Some(cap)) => cap[1].into(),
237 _ => "".into(),
238 };
239 let lang = match mode_name.to_ascii_lowercase().trim().borrow() {
240 // "c" => Some(C),
241 // "clojure" => Some(Clojure),
242 // "csharp" => Some(CSharp),
243 // "css" => Some(Css),
244 // "dart" => Some(Dart),
245 // "c++" => Some(CPlusPlus),
246 // "elixir" => Some(Elixir),
247 // "elm" => Some(Elm),
248 // "elvish" => Some(Elvish),
249 // "emacs-lisp" => Some(EmacsLisp),
250 // "gleam" => Some(Gleam),
251 // "go" => Some(Go),
252 // "haskell" => Some(Haskell),
253 // "hcl" => Some(Hcl),
254 // "html" => Some(Html),
255 // "janet" => Some(Janet),
256 // "java" => Some(Java),
257 // "js" | "js2" => Some(JavaScript),
258 // "lisp" => Some(CommonLisp),
259 // "perl" => Some(Perl),
260 // "python" => Some(Python),
261 // "rjsx" => Some(Jsx),
262 // "ruby" => Some(Ruby),
263 // "rust" => Some(Rust),
264 // "scala" => Some(Scala),
265 // "sh" => Some(Bash),
266 // "sql" => Some(Sql),
267 // "swift" => Some(Swift),
268 // "toml" => Some(Toml),
269 // "tuareg" => Some(OCaml),
270 // "typescript" => Some(TypeScript),
271 "yaml" => Some(Yaml),
272 // "zig" => Some(Zig),
273 _ => None,
274 };
275 if lang.is_some() {
276 return lang;
277 }
278 }
279
280 None
281}
282
// Try to guess the language based on a shebang present in the source.
284// fn from_shebang(src: &str) -> Option<Language> {
285// lazy_static! {
286// static ref RE: Regex = Regex::new(r"#!(?:/usr/bin/env )?([^ ]+)").unwrap();
287// }
288// if let Some(first_line) = src.lines().next() {
289// if let Some(cap) = RE.captures(first_line) {
290// let interpreter_path = Path::new(&cap[1]);
291// if let Some(name) = interpreter_path.file_name() {
292// match name.to_string_lossy().borrow() {
293// "ash" | "bash" | "dash" | "ksh" | "mksh" | "pdksh" | "rc" | "sh" | "zsh" => {
294// return Some(Bash)
295// }
296// "tcc" => return Some(C),
297// "lisp" | "sbc" | "ccl" | "clisp" | "ecl" => return Some(CommonLisp),
298// "elixir" => return Some(Elixir),
299// "elvish" => return Some(Elvish),
300// "hhvm" => return Some(Hack),
301// "runghc" | "runhaskell" | "runhugs" => return Some(Haskell),
302// "chakra" | "d8" | "gjs" | "js" | "node" | "nodejs" | "qjs" | "rhino" | "v8"
303// | "v8-shell" => return Some(JavaScript),
304// "ocaml" | "ocamlrun" | "ocamlscript" => return Some(OCaml),
305// "perl" => return Some(Perl),
306// "python" | "python2" | "python3" => return Some(Python),
307// "ruby" | "macruby" | "rake" | "jruby" | "rbx" => return Some(Ruby),
308// "swift" => return Some(Swift),
309// "deno" | "ts-node" => return Some(TypeScript),
310// _ => {}
311// }
312// }
313// }
314
315// // Hack can use <?hh in files with a .php extension.
316// if first_line.starts_with("<?hh") {
317// return Some(Hack);
318// }
319// }
320
321// None
322// }
323
324// fn from_name(path: &Path) -> Option<Language> {
325// match path.file_name() {
326// Some(name) => match name.to_string_lossy().borrow() {
327// ".bash_aliases" | ".bash_history" | ".bash_logout" | ".bash_profile" | ".bashrc"
328// | ".cshrc" | ".env" | ".env.example" | ".flaskenv" | ".kshrc" | ".login"
329// | ".profile" | ".zlogin" | ".zlogout" | ".zprofile" | ".zshenv" | ".zshrc" | "9fs"
330// | "PKGBUILD" | "bash_aliases" | "bash_logout" | "bash_profile" | "bashrc" | "cshrc"
331// | "gradlew" | "kshrc" | "login" | "man" | "profile" | "zlogin" | "zlogout"
332// | "zprofile" | "zshenv" | "zshrc" => Some(Bash),
333// "CMakeLists.txt" => Some(CMake),
334// ".emacs" | "_emacs" | "Cask" => Some(EmacsLisp),
335// ".arcconfig" | ".auto-changelog" | ".c8rc" | ".htmlhintrc" | ".imgbotconfig"
336// | ".nycrc" | ".tern-config" | ".tern-project" | ".watchmanconfig" | "Pipfile.lock"
337// | "composer.lock" | "mcmod.info" => Some(Json),
338// "BSDmakefile" | "GNUmakefile" | "Kbuild" | "Makefile" | "Makefile.am"
339// | "Makefile.boot" | "Makefile.frag" | "Makefile.in" | "Makefile.inc"
340// | "Makefile.wat" | "makefile" | "makefile.sco" | "mkfile" => Some(Make),
341// "TARGETS" | "BUCK" | "DEPS" => Some(Python),
342// "Gemfile" | "Rakefile" => Some(Ruby),
343// "Cargo.lock" | "Gopkg.lock" | "Pipfile" | "poetry.lock" => Some(Toml),
344// _ => None,
345// },
346// None => None,
347// }
348// }
349
350pub fn from_extension(current_extension: &OsStr) -> Option<Language> {
351 let current_extension = current_extension.to_string_lossy();
352
353 for (language, extensions) in LANG_EXTENSIONS {
354 for extension in *extensions {
355 if &*current_extension == *extension {
356 return Some(*language);
357 }
358 }
359 }
360 None
361}
362
363// #[cfg(test)]
364// mod tests {
365// use super::*;
366// use pretty_assertions::assert_eq;
367
368// #[test]
369// fn test_guess_by_extension() {
370// let path = Path::new("foo.el");
371// assert_eq!(guess(path, ""), Some(EmacsLisp));
372// }
373
374// #[test]
375// fn test_guess_by_whole_name() {
376// let path = Path::new("foo/.bashrc");
377// assert_eq!(guess(path, ""), Some(Bash));
378// }
379
380// #[test]
381// fn test_guess_by_shebang() {
382// let path = Path::new("foo");
383// assert_eq!(guess(path, "#!/bin/bash"), Some(Bash));
384// }
385
386// #[test]
387// fn test_guess_by_env_shebang() {
388// let path = Path::new("foo");
389// assert_eq!(guess(path, "#!/usr/bin/env python"), Some(Python));
390// }
391
392// #[test]
393// fn test_guess_by_emacs_mode() {
394// let path = Path::new("foo");
395// assert_eq!(
396// guess(path, "; -*- mode: Lisp; eval: (auto-fill-mode 1); -*-"),
397// Some(CommonLisp)
398// );
399// }
400
401// #[test]
402// fn test_guess_by_emacs_mode_second_line() {
403// let path = Path::new("foo");
404// assert_eq!(
405// guess(path, "#!/bin/bash\n; -*- mode: Lisp; -*-"),
406// Some(CommonLisp)
407// );
408// }
409
410// #[test]
411// fn test_guess_by_emacs_mode_shorthand() {
412// let path = Path::new("foo");
413// assert_eq!(guess(path, "(* -*- tuareg -*- *)"), Some(OCaml));
414// }
415
416// #[test]
417// fn test_guess_by_emacs_mode_shorthand_no_spaces() {
418// let path = Path::new("foo");
419// assert_eq!(guess(path, "# -*-python-*-"), Some(Python));
420// }
421
422// #[test]
423// fn test_guess_unknown() {
424// let path = Path::new("jfkdlsjfkdsljfkdsljf");
425// assert_eq!(guess(path, ""), None);
426// }
427// }