big_code_analysis/langs.rs
1// Per-language metric and AST modules deliberately consume the macro-
2// generated tree-sitter token enums via `use crate::*` and `use Foo::*`
3// inside match expressions — explicit imports would list dozens of
4// variants per arm and obscure the per-language token sets that are the
5// point of these files. Allowed at the module level rather than per
6// function so the per-language impl blocks stay readable.
7#![allow(clippy::wildcard_imports, clippy::enum_glob_use)]
8
9use std::path::Path;
10use std::sync::Arc;
11use tree_sitter::Language;
12
// `get_language` is referenced from feature-gated arms inside the
// `mk_lang!` expansion; a `--no-default-features` build with no
// language features compiles every arm out, leaving the import
// nominally unused. The macro itself carries the same allow.
17#[allow(unused_imports)]
18use crate::macros::{
19 get_language, mk_action, mk_code, mk_emacs_mode, mk_extensions, mk_lang, mk_langs,
20};
21use crate::preproc::PreprocResults;
22use crate::*;
23
24mk_langs!(
25 // 1) Cargo feature name that enables this variant's grammar
26 // 2) Name for enum
27 // 3) Language description
28 // 4) Display name
29 // 5) Empty struct name to implement
30 // 6) Parser name
31 // 7) tree-sitter function to call to get a Language
32 // 8) file extensions
33 // 9) emacs modes
34 //
35 // Per #252, each variant carries a Cargo feature that gates the
36 // grammar crate references in `mk_lang!` / `mk_action!`. The enum
37 // surface (variants, file-extension lookup, emacs-mode lookup,
38 // per-language `*Code` / `*Parser` tags) is always compiled in;
39 // disabling a feature only strips the grammar crate from the dep
40 // graph and turns every dispatcher into
41 // `Err(MetricsError::LanguageDisabled(_))`.
42 //
43 // `Ccomment` and `Preproc` ride the `cpp` feature because they
44 // are internal helpers for the C/C++ pipeline; they share the
45 // `tree-sitter-ccomment` / `tree-sitter-preproc` crates that
46 // `cpp` (and `mozcpp`) pull in. `Tsx` rides `typescript` because
47 // both variants resolve to the `tree-sitter-typescript` crate
48 // (TSX vs TypeScript is a per-grammar `LANGUAGE_*` constant
49 // inside that one crate, see `get_language!` in `src/macros.rs`).
50 (
51 "mozjs",
52 Mozjs,
53 "The `Mozjs` language is variant of the `JavaScript` language",
54 "javascript",
55 MozjsCode,
56 MozjsParser,
57 tree_sitter_mozjs,
58 [js, jsm, mjs, jsx],
59 ["js", "js2"]
60 ),
61 (
62 "javascript",
63 Javascript,
64 "The `JavaScript` language",
65 "javascript",
66 JavascriptCode,
67 JavascriptParser,
68 tree_sitter_javascript,
69 [],
70 []
71 ),
72 (
73 "java",
74 Java,
75 "The `Java` language",
76 "java",
77 JavaCode,
78 JavaParser,
79 tree_sitter_java,
80 [java],
81 ["java"]
82 ),
83 (
84 "go",
85 Go,
86 "The `Go` language",
87 "go",
88 GoCode,
89 GoParser,
90 tree_sitter_go,
91 [go],
92 ["go"]
93 ),
94 (
95 "kotlin",
96 Kotlin,
97 "The `Kotlin` language",
98 "kotlin",
99 KotlinCode,
100 KotlinParser,
101 tree_sitter_kotlin_ng,
102 [kt, kts],
103 ["kotlin"]
104 ),
105 (
106 "lua",
107 Lua,
108 "The `Lua` language",
109 "lua",
110 LuaCode,
111 LuaParser,
112 tree_sitter_lua,
113 [lua],
114 ["lua"]
115 ),
116 (
117 "rust",
118 Rust,
119 "The `Rust` language",
120 "rust",
121 RustCode,
122 RustParser,
123 tree_sitter_rust,
124 [rs],
125 ["rust"]
126 ),
127 (
128 "tcl",
129 Tcl,
130 "The `Tcl` language",
131 "tcl",
132 TclCode,
133 TclParser,
134 tree_sitter_tcl,
135 [tcl, tk, tm],
136 ["tcl"]
137 ),
138 (
139 "cpp",
140 Cpp,
141 "The `C/C++` language",
142 "c/c++",
143 CppCode,
144 CppParser,
145 tree_sitter_cpp,
146 [cpp, cxx, cc, hxx, hpp, c, h, hh, inc, mm, m],
147 ["c++", "c", "objc", "objc++", "objective-c++", "objective-c"]
148 ),
149 (
150 "csharp",
151 Csharp,
152 "The `C#` language",
153 "c#",
154 CsharpCode,
155 CsharpParser,
156 tree_sitter_c_sharp,
157 [cs, csx, cake],
158 ["csharp"]
159 ),
160 (
161 "elixir",
162 Elixir,
163 "The `Elixir` language",
164 "elixir",
165 ElixirCode,
166 ElixirParser,
167 tree_sitter_elixir,
168 [ex, exs],
169 ["elixir"]
170 ),
171 (
172 "python",
173 Python,
174 "The `Python` language",
175 "python",
176 PythonCode,
177 PythonParser,
178 tree_sitter_python,
179 [py],
180 ["python"]
181 ),
182 (
183 "typescript",
184 Tsx,
185 "The `Tsx` language incorporates the `JSX` syntax inside `TypeScript`",
186 "typescript",
187 TsxCode,
188 TsxParser,
189 tree_sitter_tsx,
190 [tsx],
191 []
192 ),
193 (
194 "typescript",
195 Typescript,
196 "The `TypeScript` language",
197 "typescript",
198 TypescriptCode,
199 TypescriptParser,
200 tree_sitter_typescript,
201 [ts, jsw, jsmw],
202 ["typescript"]
203 ),
204 (
205 "bash",
206 Bash,
207 "The `Bash` language",
208 "bash",
209 BashCode,
210 BashParser,
211 tree_sitter_bash,
212 [sh, bash],
213 ["sh"]
214 ),
215 (
216 "cpp",
217 Ccomment,
218 "The `Ccomment` language is a variant of the `C` language focused on comments",
219 "ccomment",
220 CcommentCode,
221 CcommentParser,
222 tree_sitter_ccomment,
223 [],
224 []
225 ),
226 (
227 "cpp",
228 Preproc,
229 "The `PreProc` language is a variant of the `C/C++` language focused on macros",
230 "preproc",
231 PreprocCode,
232 PreprocParser,
233 tree_sitter_preproc,
234 [],
235 []
236 ),
237 (
238 "perl",
239 Perl,
240 "The `Perl` language",
241 "perl",
242 PerlCode,
243 PerlParser,
244 tree_sitter_perl,
245 [pl, pm, t],
246 ["perl", "cperl"]
247 ),
248 (
249 "php",
250 Php,
251 "The `Php` language",
252 "php",
253 PhpCode,
254 PhpParser,
255 tree_sitter_php,
256 [php, phtml, php3, php4, php5, php7, phps],
257 ["php"]
258 ),
259 (
260 "ruby",
261 Ruby,
262 "The `Ruby` language",
263 "ruby",
264 RubyCode,
265 RubyParser,
266 tree_sitter_ruby,
267 [rb, rake, gemspec],
268 ["ruby"]
269 ),
270 (
271 "groovy",
272 Groovy,
273 "The `Groovy` language",
274 "groovy",
275 GroovyCode,
276 GroovyParser,
277 dekobon_tree_sitter_groovy,
278 [groovy, gradle, gvy, gy, gsh],
279 ["groovy"]
280 )
281);
282
pub(crate) mod fake {
    /// Maps an Objective-C file extension or emacs mode to a fixed label.
    ///
    /// Returns `Some("obj-c/c++")` when `ext` is `m` or `mm`, or when `mode`
    /// is one of `objc`, `objc++`, `objective-c++` or `objective-c`.
    /// Returns `None` for every other combination. Matching is exact and
    /// case-sensitive; a hit on either argument is enough.
    pub(crate) fn get_true<'a>(ext: &str, mode: &str) -> Option<&'a str> {
        let objc_ext = matches!(ext, "m" | "mm");
        let objc_mode = matches!(
            mode,
            "objc" | "objc++" | "objective-c++" | "objective-c"
        );

        if objc_ext || objc_mode {
            Some("obj-c/c++")
        } else {
            None
        }
    }
}
298
#[cfg(test)]
mod tests {
    use super::*;
    use crate::MetricsError;

    // The test suite normally runs under the workspace default
    // feature set (`all-languages` is on, see `Cargo.toml`), so
    // every variant must report itself as enabled. A regression in
    // the cfg-gating of `is_enabled` would flip individual arms to
    // `false` even when the matching grammar crate is in the dep
    // graph; this test would catch that without needing a separate
    // `--no-default-features` build matrix entry. Gated on
    // `feature = "all-languages"` so the CI minimal-langs matrix
    // entry (`--no-default-features --features rust,typescript`)
    // still compiles cleanly without a runtime failure.
    #[cfg(feature = "all-languages")]
    #[test]
    fn every_lang_variant_is_enabled_under_all_languages() {
        for lang in LANG::into_enum_iter() {
            assert!(
                lang.is_enabled(),
                "{} should be enabled under the default `all-languages` feature set",
                lang.get_name(),
            );
        }
    }

    // Smoke test for the `LanguageDisabled` contract on a build
    // without the `javascript` feature: every dispatch entry point
    // (here, `get_tree_sitter_language`) must hand back
    // `Err(LanguageDisabled(LANG::Javascript))`. Gated on
    // `not(feature = "javascript")` so it only runs in a feature-
    // subset build where the language is actually disabled — the
    // `all-languages` default would have `is_enabled` return true
    // and `get_tree_sitter_language` succeed.
    #[cfg(not(feature = "javascript"))]
    #[test]
    fn disabled_language_dispatch_returns_language_disabled() {
        assert!(!LANG::Javascript.is_enabled());
        match LANG::Javascript.get_tree_sitter_language() {
            Err(MetricsError::LanguageDisabled(LANG::Javascript)) => {}
            other => panic!(
                "expected Err(LanguageDisabled(Javascript)) for disabled `javascript` feature, got {other:?}",
            ),
        }
    }

    // `is_enabled` and `get_tree_sitter_language` must agree: a
    // variant that reports itself enabled must hand back a usable
    // `Language`, never `Err(LanguageDisabled)`. The pairing exists
    // so callers that branch on `is_enabled` (rather than match on
    // the error) can rely on the language lookup succeeding.
    #[test]
    fn is_enabled_matches_get_tree_sitter_language() {
        for lang in LANG::into_enum_iter() {
            let lookup = lang.get_tree_sitter_language();
            assert_eq!(
                lang.is_enabled(),
                lookup.is_ok(),
                "{} disagrees: is_enabled={}, get_tree_sitter_language={:?}",
                lang.get_name(),
                lang.is_enabled(),
                // Only rendered on failure; the `Ok` payload is replaced
                // with a placeholder string for the message.
                lookup.map(|_| "Ok"),
            );
        }
    }

    // Regression guard for issue #262: the `MetricsError::EmptyRoot`
    // variant is documented as "Reserved — not produced today".
    // `metrics_with_options` pushes a synthetic top-level Unit
    // `FuncSpace` before walking, so every parse — including empty,
    // whitespace-only, and comment-only input — currently returns
    // `Ok(FuncSpace { kind: Unit, .. })`. If the walker is ever
    // changed to legitimately drain its state stack (e.g. by
    // dropping the synthetic root), this test will start failing
    // and the variant docs must be revisited.
    #[test]
    fn empty_and_comment_only_input_never_returns_empty_root() {
        use crate::{MetricsOptions, Source, SpaceKind, analyze};

        // Pair every enabled language with sources that would, by
        // the old (false) variant doc, surface `EmptyRoot`. The
        // comment syntaxes cover the C-family line and block forms
        // (`//`, `/* */`), the `#` line form (Python, Ruby, Bash,
        // Perl, Tcl, Elixir) and the `--` line form (Lua), so each
        // language family gets at least one genuinely comment-only
        // input. Every input is fed to every language, so each
        // language also sees text that is not a comment in its own
        // grammar; the Unit-root invariant must hold for those too.
        let inputs: &[&[u8]] = &[
            b"",
            b" \n\t\n",
            b"// just a comment\n",
            b"/* block */\n",
            b"# just a comment\n",
            b"-- just a comment\n",
        ];

        for lang in LANG::into_enum_iter() {
            // Disabled languages return `LanguageDisabled`, which is
            // covered by its own tests above.
            if !lang.is_enabled() {
                continue;
            }
            for src in inputs {
                let space = analyze(Source::new(lang, src), MetricsOptions::default())
                    .unwrap_or_else(|err| {
                        panic!(
                            "{} on input {:?} unexpectedly returned {err:?}; \
                             EmptyRoot is documented as not produced today",
                            lang.get_name(),
                            String::from_utf8_lossy(src),
                        )
                    });
                assert_eq!(
                    space.kind,
                    SpaceKind::Unit,
                    "{} on input {:?} produced a non-Unit top-level FuncSpace",
                    lang.get_name(),
                    String::from_utf8_lossy(src),
                );
            }
        }
    }

    // The error variant carries the originating `LANG` so callers
    // can distinguish "X is disabled" from "Y is disabled" in a
    // mixed batch. Verifies the `Display` impl mentions the
    // language name as documented in `src/error.rs`.
    #[test]
    fn language_disabled_display_includes_language_name() {
        let err = MetricsError::LanguageDisabled(LANG::Rust);
        let rendered = err.to_string();
        assert!(
            rendered.contains("rust"),
            "expected LanguageDisabled display to mention `rust`, got {rendered:?}",
        );
    }
}