Skip to main content

big_code_analysis/
langs.rs

1// Per-language metric and AST modules deliberately consume the macro-
2// generated tree-sitter token enums via `use crate::*` and `use Foo::*`
3// inside match expressions — explicit imports would list dozens of
4// variants per arm and obscure the per-language token sets that are the
5// point of these files. Allowed at the module level rather than per
6// function so the per-language impl blocks stay readable.
7#![allow(clippy::wildcard_imports, clippy::enum_glob_use)]
8
9use std::path::Path;
10use std::sync::Arc;
11use tree_sitter::Language;
12
// `get_language` is referenced from feature-gated arms inside the
// `mk_lang!` expansion; a `--no-default-features` build with no
// language features compiles every arm out, leaving the import
// nominally unused. The macro itself carries the same allow.
17#[allow(unused_imports)]
18use crate::macros::{
19    get_language, mk_action, mk_code, mk_emacs_mode, mk_extensions, mk_lang, mk_langs,
20};
21use crate::preproc::PreprocResults;
22use crate::*;
23
24mk_langs!(
25    // 1) Cargo feature name that enables this variant's grammar
26    // 2) Name for enum
27    // 3) Language description
28    // 4) Display name
29    // 5) Empty struct name to implement
30    // 6) Parser name
31    // 7) tree-sitter function to call to get a Language
32    // 8) file extensions
33    // 9) emacs modes
34    //
35    // Per #252, each variant carries a Cargo feature that gates the
36    // grammar crate references in `mk_lang!` / `mk_action!`. The enum
37    // surface (variants, file-extension lookup, emacs-mode lookup,
38    // per-language `*Code` / `*Parser` tags) is always compiled in;
39    // disabling a feature only strips the grammar crate from the dep
40    // graph and turns every dispatcher into
41    // `Err(MetricsError::LanguageDisabled(_))`.
42    //
43    // `Ccomment` and `Preproc` ride the `cpp` feature because they
44    // are internal helpers for the C/C++ pipeline; they share the
45    // `tree-sitter-ccomment` / `tree-sitter-preproc` crates that
46    // `cpp` (and `mozcpp`) pull in. `Tsx` rides `typescript` because
47    // both variants resolve to the `tree-sitter-typescript` crate
48    // (TSX vs TypeScript is a per-grammar `LANGUAGE_*` constant
49    // inside that one crate, see `get_language!` in `src/macros.rs`).
50    (
51        "mozjs",
52        Mozjs,
53        "The `Mozjs` language is variant of the `JavaScript` language",
54        "javascript",
55        MozjsCode,
56        MozjsParser,
57        tree_sitter_mozjs,
58        [js, jsm, mjs, jsx],
59        ["js", "js2"]
60    ),
61    (
62        "javascript",
63        Javascript,
64        "The `JavaScript` language",
65        "javascript",
66        JavascriptCode,
67        JavascriptParser,
68        tree_sitter_javascript,
69        [],
70        []
71    ),
72    (
73        "java",
74        Java,
75        "The `Java` language",
76        "java",
77        JavaCode,
78        JavaParser,
79        tree_sitter_java,
80        [java],
81        ["java"]
82    ),
83    (
84        "go",
85        Go,
86        "The `Go` language",
87        "go",
88        GoCode,
89        GoParser,
90        tree_sitter_go,
91        [go],
92        ["go"]
93    ),
94    (
95        "kotlin",
96        Kotlin,
97        "The `Kotlin` language",
98        "kotlin",
99        KotlinCode,
100        KotlinParser,
101        tree_sitter_kotlin_ng,
102        [kt, kts],
103        ["kotlin"]
104    ),
105    (
106        "lua",
107        Lua,
108        "The `Lua` language",
109        "lua",
110        LuaCode,
111        LuaParser,
112        tree_sitter_lua,
113        [lua],
114        ["lua"]
115    ),
116    (
117        "rust",
118        Rust,
119        "The `Rust` language",
120        "rust",
121        RustCode,
122        RustParser,
123        tree_sitter_rust,
124        [rs],
125        ["rust"]
126    ),
127    (
128        "tcl",
129        Tcl,
130        "The `Tcl` language",
131        "tcl",
132        TclCode,
133        TclParser,
134        tree_sitter_tcl,
135        [tcl, tk, tm],
136        ["tcl"]
137    ),
138    (
139        "cpp",
140        Cpp,
141        "The `C/C++` language",
142        "c/c++",
143        CppCode,
144        CppParser,
145        tree_sitter_cpp,
146        [cpp, cxx, cc, hxx, hpp, c, h, hh, inc, mm, m],
147        ["c++", "c", "objc", "objc++", "objective-c++", "objective-c"]
148    ),
149    (
150        "csharp",
151        Csharp,
152        "The `C#` language",
153        "c#",
154        CsharpCode,
155        CsharpParser,
156        tree_sitter_c_sharp,
157        [cs, csx, cake],
158        ["csharp"]
159    ),
160    (
161        "elixir",
162        Elixir,
163        "The `Elixir` language",
164        "elixir",
165        ElixirCode,
166        ElixirParser,
167        tree_sitter_elixir,
168        [ex, exs],
169        ["elixir"]
170    ),
171    (
172        "python",
173        Python,
174        "The `Python` language",
175        "python",
176        PythonCode,
177        PythonParser,
178        tree_sitter_python,
179        [py],
180        ["python"]
181    ),
182    (
183        "typescript",
184        Tsx,
185        "The `Tsx` language incorporates the `JSX` syntax inside `TypeScript`",
186        "typescript",
187        TsxCode,
188        TsxParser,
189        tree_sitter_tsx,
190        [tsx],
191        []
192    ),
193    (
194        "typescript",
195        Typescript,
196        "The `TypeScript` language",
197        "typescript",
198        TypescriptCode,
199        TypescriptParser,
200        tree_sitter_typescript,
201        [ts, jsw, jsmw],
202        ["typescript"]
203    ),
204    (
205        "bash",
206        Bash,
207        "The `Bash` language",
208        "bash",
209        BashCode,
210        BashParser,
211        tree_sitter_bash,
212        [sh, bash],
213        ["sh"]
214    ),
215    (
216        "cpp",
217        Ccomment,
218        "The `Ccomment` language is a variant of the `C` language focused on comments",
219        "ccomment",
220        CcommentCode,
221        CcommentParser,
222        tree_sitter_ccomment,
223        [],
224        []
225    ),
226    (
227        "cpp",
228        Preproc,
229        "The `PreProc` language is a variant of the `C/C++` language focused on macros",
230        "preproc",
231        PreprocCode,
232        PreprocParser,
233        tree_sitter_preproc,
234        [],
235        []
236    ),
237    (
238        "perl",
239        Perl,
240        "The `Perl` language",
241        "perl",
242        PerlCode,
243        PerlParser,
244        tree_sitter_perl,
245        [pl, pm, t],
246        ["perl", "cperl"]
247    ),
248    (
249        "php",
250        Php,
251        "The `Php` language",
252        "php",
253        PhpCode,
254        PhpParser,
255        tree_sitter_php,
256        [php, phtml, php3, php4, php5, php7, phps],
257        ["php"]
258    ),
259    (
260        "ruby",
261        Ruby,
262        "The `Ruby` language",
263        "ruby",
264        RubyCode,
265        RubyParser,
266        tree_sitter_ruby,
267        [rb, rake, gemspec],
268        ["ruby"]
269    ),
270    (
271        "groovy",
272        Groovy,
273        "The `Groovy` language",
274        "groovy",
275        GroovyCode,
276        GroovyParser,
277        dekobon_tree_sitter_groovy,
278        [groovy, gradle, gvy, gy, gsh],
279        ["groovy"]
280    )
281);
282
pub(crate) mod fake {
    /// Returns the label `"obj-c/c++"` when either the file extension or
    /// the emacs mode identifies Objective-C / Objective-C++ input, and
    /// `None` otherwise.
    ///
    /// * `ext` — file extension without the leading dot; `"m"` and `"mm"`
    ///   match.
    /// * `mode` — emacs mode name; `"objc"`, `"objc++"`, `"objective-c++"`
    ///   and `"objective-c"` match.
    ///
    /// Both comparisons are exact and case-sensitive, and one matching
    /// argument is enough. These are the same Objective-C extensions and
    /// modes that the `Cpp` entry of the `mk_langs!` table lists.
    pub(crate) fn get_true<'a>(ext: &str, mode: &str) -> Option<&'a str> {
        match (ext, mode) {
            ("m" | "mm", _) | (_, "objc" | "objc++" | "objective-c++" | "objective-c") => {
                Some("obj-c/c++")
            }
            _ => None,
        }
    }
}
298
#[cfg(test)]
mod tests {
    use super::*;
    use crate::MetricsError;

    // With the workspace default feature set (`all-languages` on, see
    // `Cargo.toml`) every `LANG` variant has to report itself enabled.
    // If the cfg-gating of `is_enabled` regressed, individual variants
    // would flip to `false` even though their grammar crate is in the
    // dep graph; this catches that without a dedicated
    // `--no-default-features` matrix entry. The test is compiled only
    // when `all-languages` is on, so the CI minimal-langs entry
    // (`--no-default-features --features rust,typescript`) still builds
    // and does not fail at runtime.
    #[cfg(feature = "all-languages")]
    #[test]
    fn every_lang_variant_is_enabled_under_all_languages() {
        for variant in LANG::into_enum_iter() {
            let enabled = variant.is_enabled();
            assert!(
                enabled,
                "{} should be enabled under the default `all-languages` feature set",
                variant.get_name(),
            );
        }
    }

    // Smoke test of the `LanguageDisabled` contract in a build that
    // lacks the `javascript` feature: the dispatch entry point checked
    // here, `get_tree_sitter_language`, must return
    // `Err(LanguageDisabled(LANG::Javascript))`. Compiled only under
    // `not(feature = "javascript")`, because with the `all-languages`
    // default `is_enabled` is true and the lookup succeeds.
    #[cfg(not(feature = "javascript"))]
    #[test]
    fn disabled_language_dispatch_returns_language_disabled() {
        let javascript = LANG::Javascript;
        assert!(!javascript.is_enabled());

        // The pattern binds nothing, so `other` is not moved and can
        // still be rendered in the failure message.
        let other = javascript.get_tree_sitter_language();
        let is_expected_error = matches!(
            other,
            Err(MetricsError::LanguageDisabled(LANG::Javascript))
        );
        assert!(
            is_expected_error,
            "expected Err(LanguageDisabled(Javascript)) for disabled `javascript` feature, got {other:?}",
        );
    }

    // `is_enabled` and `get_tree_sitter_language` have to tell the same
    // story for every variant: enabled means the lookup is `Ok`,
    // disabled means it is `Err`. Callers that branch on `is_enabled`
    // instead of matching on the error rely on that pairing.
    #[test]
    fn is_enabled_matches_get_tree_sitter_language() {
        for variant in LANG::into_enum_iter() {
            let lookup = variant.get_tree_sitter_language();
            let enabled = variant.is_enabled();
            let resolved = lookup.is_ok();
            assert_eq!(
                enabled,
                resolved,
                "{} disagrees: is_enabled={}, get_tree_sitter_language={:?}",
                variant.get_name(),
                enabled,
                // Collapse the `Ok` payload to a fixed marker so only
                // the Ok/Err shape shows up in the failure message.
                lookup.map(|_| "Ok"),
            );
        }
    }

    // Regression guard for issue #262. `MetricsError::EmptyRoot` is
    // documented as "Reserved — not produced today":
    // `metrics_with_options` pushes a synthetic top-level Unit
    // `FuncSpace` before walking, so every parse — empty,
    // whitespace-only and comment-only input included — currently
    // yields `Ok(FuncSpace { kind: Unit, .. })`. Should the walker ever
    // legitimately drain its state stack (e.g. by dropping the
    // synthetic root), this test starts failing and the variant docs
    // need another look.
    #[test]
    fn empty_and_comment_only_input_never_returns_empty_root() {
        use crate::{MetricsOptions, Source, SpaceKind, analyze};

        // Inputs that the old (false) variant doc claimed would surface
        // `EmptyRoot`: empty, whitespace-only, a `//` line comment and a
        // `/* */` block comment. The same four byte strings are fed to
        // every enabled language, whether or not that language treats
        // `//` or `/* */` as comment syntax.
        let inputs: &[&[u8]] = &[b"", b"   \n\t\n", b"// just a comment\n", b"/* block */\n"];

        for lang in LANG::into_enum_iter() {
            // Disabled languages cannot be analyzed at all; only the
            // enabled ones are exercised.
            if lang.is_enabled() {
                for src in inputs {
                    let analysis = analyze(Source::new(lang, src), MetricsOptions::default());
                    let space = match analysis {
                        Ok(space) => space,
                        Err(err) => panic!(
                            "{} on input {:?} unexpectedly returned {err:?}; \
                             EmptyRoot is documented as not produced today",
                            lang.get_name(),
                            String::from_utf8_lossy(src),
                        ),
                    };
                    assert_eq!(
                        space.kind,
                        SpaceKind::Unit,
                        "{} on input {:?} produced a non-Unit top-level FuncSpace",
                        lang.get_name(),
                        String::from_utf8_lossy(src),
                    );
                }
            }
        }
    }

    // `LanguageDisabled` carries the originating `LANG` so that callers
    // handling a mixed batch can tell "X is disabled" apart from "Y is
    // disabled". Checks that the `Display` output mentions the language
    // name, as documented in `src/error.rs`.
    #[test]
    fn language_disabled_display_includes_language_name() {
        let rendered = MetricsError::LanguageDisabled(LANG::Rust).to_string();
        let mentions_language = rendered.contains("rust");
        assert!(
            mentions_language,
            "expected LanguageDisabled display to mention `rust`, got {rendered:?}",
        );
    }
}