Skip to main content

sem_core/parser/plugins/code/
languages.rs

1use tree_sitter::Language;
2
/// A parent/child pair of node types: while inside a node of the parent type,
/// nodes of the child type are not extracted as nested entities.
///
/// The type is plain `'static` data, so it derives the common value traits to
/// make rules printable, copyable and comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SuppressedNestedEntity {
    /// Node type of the enclosing entity.
    pub parent_entity_node_type: &'static str,
    /// Node type that is suppressed when nested inside the parent.
    pub child_entity_node_type: &'static str,
}
7
/// Static description of one supported language: the file extensions that
/// select it, the node types of interest, and a function returning its grammar.
// NOTE(review): the blanket `dead_code` allowance presumably exists because not
// every field is read by every consumer — confirm, and narrow it if possible.
#[allow(dead_code)]
pub struct LanguageConfig {
    /// Short identifier of the language, e.g. `"typescript"`.
    pub id: &'static str,
    /// File extensions, including the leading dot, that select this config.
    pub extensions: &'static [&'static str],
    /// Node types that are extracted as entities.
    /// (Presumably tree-sitter node kinds — confirm against the visitor.)
    pub entity_node_types: &'static [&'static str],
    /// Node types treated as containers of further entities.
    /// (Traversal code is outside this file — confirm exact semantics.)
    pub container_node_types: &'static [&'static str],
    /// Identifiers such as `defmodule` or `def`; in this file only the Elixir
    /// config populates the list. Presumably matched against call nodes to
    /// recognise entities — confirm against the visitor.
    pub call_entity_identifiers: &'static [&'static str],
    /// Parent/child node-type pairs whose child should not be extracted as a
    /// nested entity of the parent.
    pub suppressed_nested_entities: &'static [SuppressedNestedEntity],
    /// Node types that introduce a new scope. The general (non-container) recursion
    /// in visit_node will not descend into these nodes, preventing local variables
    /// inside function bodies from being extracted as top-level entities.
    pub scope_boundary_types: &'static [&'static str],
    /// Returns the grammar for this language; every getter in this file
    /// returns `Some`.
    pub get_language: fn() -> Option<Language>,
}
22
23fn get_typescript() -> Option<Language> {
24    Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into())
25}
26
27fn get_tsx() -> Option<Language> {
28    Some(tree_sitter_typescript::LANGUAGE_TSX.into())
29}
30
31fn get_javascript() -> Option<Language> {
32    Some(tree_sitter_javascript::LANGUAGE.into())
33}
34
35fn get_python() -> Option<Language> {
36    Some(tree_sitter_python::LANGUAGE.into())
37}
38
39fn get_go() -> Option<Language> {
40    Some(tree_sitter_go::LANGUAGE.into())
41}
42
43fn get_rust() -> Option<Language> {
44    Some(tree_sitter_rust::LANGUAGE.into())
45}
46
47fn get_java() -> Option<Language> {
48    Some(tree_sitter_java::LANGUAGE.into())
49}
50
51fn get_c() -> Option<Language> {
52    Some(tree_sitter_c::LANGUAGE.into())
53}
54
55fn get_cpp() -> Option<Language> {
56    Some(tree_sitter_cpp::LANGUAGE.into())
57}
58
59fn get_ruby() -> Option<Language> {
60    Some(tree_sitter_ruby::LANGUAGE.into())
61}
62
63fn get_csharp() -> Option<Language> {
64    Some(tree_sitter_c_sharp::LANGUAGE.into())
65}
66
67fn get_php() -> Option<Language> {
68    Some(tree_sitter_php::LANGUAGE_PHP.into())
69}
70
71fn get_fortran() -> Option<Language> {
72    Some(tree_sitter_fortran::LANGUAGE.into())
73}
74
75fn get_swift() -> Option<Language> {
76    Some(tree_sitter_swift::LANGUAGE.into())
77}
78
79fn get_elixir() -> Option<Language> {
80    Some(tree_sitter_elixir::LANGUAGE.into())
81}
82
83fn get_bash() -> Option<Language> {
84    Some(tree_sitter_bash::LANGUAGE.into())
85}
86
87fn get_hcl() -> Option<Language> {
88    Some(tree_sitter_hcl::LANGUAGE.into())
89}
90
91fn get_kotlin() -> Option<Language> {
92    Some(tree_sitter_kotlin_ng::LANGUAGE.into())
93}
94
95fn get_xml() -> Option<Language> {
96    Some(tree_sitter_xml::LANGUAGE_XML.into())
97}
98
99fn get_dart() -> Option<Language> {
100    Some(tree_sitter_dart::LANGUAGE.into())
101}
102
103fn get_perl() -> Option<Language> {
104    Some(tree_sitter_perl_next::LANGUAGE.into())
105}
106
107fn get_ocaml() -> Option<Language> {
108    Some(tree_sitter_ocaml::LANGUAGE_OCAML.into())
109}
110
111fn get_ocaml_interface() -> Option<Language> {
112    Some(tree_sitter_ocaml::LANGUAGE_OCAML_INTERFACE.into())
113}
114
115fn get_scala() -> Option<Language> {
116    Some(tree_sitter_scala::LANGUAGE.into())
117}
118
119fn get_zig() -> Option<Language> {
120    Some(tree_sitter_zig::LANGUAGE.into())
121}
122
123/// Inside JS/TS function bodies, suppress variable declarations so that local
124/// variables are not extracted as nested entities. Inner function/class
125/// declarations are still extracted for diff granularity.
126const JS_TS_SUPPRESSED_NESTED: &[SuppressedNestedEntity] = &[
127    SuppressedNestedEntity {
128        parent_entity_node_type: "function_declaration",
129        child_entity_node_type: "lexical_declaration",
130    },
131    SuppressedNestedEntity {
132        parent_entity_node_type: "function_declaration",
133        child_entity_node_type: "variable_declaration",
134    },
135    SuppressedNestedEntity {
136        parent_entity_node_type: "generator_function_declaration",
137        child_entity_node_type: "lexical_declaration",
138    },
139    SuppressedNestedEntity {
140        parent_entity_node_type: "generator_function_declaration",
141        child_entity_node_type: "variable_declaration",
142    },
143    SuppressedNestedEntity {
144        parent_entity_node_type: "method_definition",
145        child_entity_node_type: "lexical_declaration",
146    },
147    SuppressedNestedEntity {
148        parent_entity_node_type: "method_definition",
149        child_entity_node_type: "variable_declaration",
150    },
151    // Scope boundaries: suppress local variables inside arrow functions,
152    // function expressions, and generator functions, while still allowing
153    // inner class/function declarations to be extracted.
154    SuppressedNestedEntity {
155        parent_entity_node_type: "arrow_function",
156        child_entity_node_type: "lexical_declaration",
157    },
158    SuppressedNestedEntity {
159        parent_entity_node_type: "arrow_function",
160        child_entity_node_type: "variable_declaration",
161    },
162    SuppressedNestedEntity {
163        parent_entity_node_type: "function_expression",
164        child_entity_node_type: "lexical_declaration",
165    },
166    SuppressedNestedEntity {
167        parent_entity_node_type: "function_expression",
168        child_entity_node_type: "variable_declaration",
169    },
170    SuppressedNestedEntity {
171        parent_entity_node_type: "generator_function",
172        child_entity_node_type: "lexical_declaration",
173    },
174    SuppressedNestedEntity {
175        parent_entity_node_type: "generator_function",
176        child_entity_node_type: "variable_declaration",
177    },
178];
179
/// Expression-level function node types used as `scope_boundary_types` by the
/// JavaScript, TypeScript and TSX configs.
const JS_TS_SCOPE_BOUNDARIES: &[&str] =
    &["arrow_function", "function_expression", "generator_function"];
185
186static TYPESCRIPT_CONFIG: LanguageConfig = LanguageConfig {
187    id: "typescript",
188    extensions: &[".ts", ".mts", ".cts"],
189    entity_node_types: &[
190        "function_declaration",
191        "generator_function_declaration",
192        "class_declaration",
193        "interface_declaration",
194        "type_alias_declaration",
195        "enum_declaration",
196        "export_statement",
197        "lexical_declaration",
198        "variable_declaration",
199        "method_definition",
200        "public_field_definition",
201    ],
202    container_node_types: &["class_body", "interface_body", "enum_body", "statement_block"],
203    call_entity_identifiers: &[],
204    suppressed_nested_entities: JS_TS_SUPPRESSED_NESTED,
205    scope_boundary_types: JS_TS_SCOPE_BOUNDARIES,
206    get_language: get_typescript,
207};
208
209static TSX_CONFIG: LanguageConfig = LanguageConfig {
210    id: "tsx",
211    extensions: &[".tsx"],
212    entity_node_types: &[
213        "function_declaration",
214        "generator_function_declaration",
215        "class_declaration",
216        "interface_declaration",
217        "type_alias_declaration",
218        "enum_declaration",
219        "export_statement",
220        "lexical_declaration",
221        "variable_declaration",
222        "method_definition",
223        "public_field_definition",
224    ],
225    container_node_types: &["class_body", "interface_body", "enum_body", "statement_block"],
226    call_entity_identifiers: &[],
227    suppressed_nested_entities: JS_TS_SUPPRESSED_NESTED,
228    scope_boundary_types: JS_TS_SCOPE_BOUNDARIES,
229    get_language: get_tsx,
230};
231
232static JAVASCRIPT_CONFIG: LanguageConfig = LanguageConfig {
233    id: "javascript",
234    extensions: &[".js", ".jsx", ".mjs", ".cjs", ".es6"],
235    entity_node_types: &[
236        "function_declaration",
237        "generator_function_declaration",
238        "class_declaration",
239        "export_statement",
240        "lexical_declaration",
241        "variable_declaration",
242        "method_definition",
243        "field_definition",
244    ],
245    container_node_types: &["class_body", "statement_block"],
246    call_entity_identifiers: &[],
247    suppressed_nested_entities: JS_TS_SUPPRESSED_NESTED,
248    scope_boundary_types: JS_TS_SCOPE_BOUNDARIES,
249    get_language: get_javascript,
250};
251
252static PYTHON_CONFIG: LanguageConfig = LanguageConfig {
253    id: "python",
254    extensions: &[".py", ".pyi"],
255    entity_node_types: &[
256        "function_definition",
257        "class_definition",
258        "decorated_definition",
259    ],
260    container_node_types: &["block"],
261    call_entity_identifiers: &[],
262    suppressed_nested_entities: &[],
263    scope_boundary_types: &[],
264    get_language: get_python,
265};
266
267static GO_CONFIG: LanguageConfig = LanguageConfig {
268    id: "go",
269    extensions: &[".go"],
270    entity_node_types: &[
271        "function_declaration",
272        "method_declaration",
273        "type_declaration",
274        "var_declaration",
275        "const_declaration",
276    ],
277    container_node_types: &["block"],
278    call_entity_identifiers: &[],
279    suppressed_nested_entities: &[],
280    scope_boundary_types: &[],
281    get_language: get_go,
282};
283
284static RUST_CONFIG: LanguageConfig = LanguageConfig {
285    id: "rust",
286    extensions: &[".rs"],
287    entity_node_types: &[
288        "function_item",
289        "struct_item",
290        "enum_item",
291        "impl_item",
292        "trait_item",
293        "mod_item",
294        "const_item",
295        "static_item",
296        "type_item",
297    ],
298    container_node_types: &["declaration_list", "block"],
299    call_entity_identifiers: &[],
300    suppressed_nested_entities: &[],
301    scope_boundary_types: &[],
302    get_language: get_rust,
303};
304
305static JAVA_CONFIG: LanguageConfig = LanguageConfig {
306    id: "java",
307    extensions: &[".java"],
308    entity_node_types: &[
309        "class_declaration",
310        "method_declaration",
311        "interface_declaration",
312        "enum_declaration",
313        "field_declaration",
314        "constructor_declaration",
315        "annotation_type_declaration",
316    ],
317    container_node_types: &["class_body", "interface_body", "enum_body", "block"],
318    call_entity_identifiers: &[],
319    suppressed_nested_entities: &[],
320    scope_boundary_types: &[],
321    get_language: get_java,
322};
323
324static C_CONFIG: LanguageConfig = LanguageConfig {
325    id: "c",
326    extensions: &[".c", ".h"],
327    entity_node_types: &[
328        "function_definition",
329        "struct_specifier",
330        "enum_specifier",
331        "union_specifier",
332        "type_definition",
333        "declaration",
334    ],
335    container_node_types: &["compound_statement"],
336    call_entity_identifiers: &[],
337    suppressed_nested_entities: &[],
338    scope_boundary_types: &[],
339    get_language: get_c,
340};
341
342static CPP_CONFIG: LanguageConfig = LanguageConfig {
343    id: "cpp",
344    extensions: &[".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx"],
345    entity_node_types: &[
346        "function_definition",
347        "class_specifier",
348        "struct_specifier",
349        "enum_specifier",
350        "namespace_definition",
351        "template_declaration",
352        "declaration",
353        "type_definition",
354    ],
355    container_node_types: &["field_declaration_list", "declaration_list", "compound_statement"],
356    call_entity_identifiers: &[],
357    suppressed_nested_entities: &[],
358    scope_boundary_types: &[],
359    get_language: get_cpp,
360};
361
362static RUBY_CONFIG: LanguageConfig = LanguageConfig {
363    id: "ruby",
364    extensions: &[".rb"],
365    entity_node_types: &[
366        "method",
367        "singleton_method",
368        "class",
369        "module",
370    ],
371    container_node_types: &["body_statement"],
372    call_entity_identifiers: &[],
373    suppressed_nested_entities: &[],
374    scope_boundary_types: &[],
375    get_language: get_ruby,
376};
377
378static CSHARP_CONFIG: LanguageConfig = LanguageConfig {
379    id: "csharp",
380    extensions: &[".cs"],
381    entity_node_types: &[
382        "method_declaration",
383        "class_declaration",
384        "interface_declaration",
385        "enum_declaration",
386        "struct_declaration",
387        "namespace_declaration",
388        "property_declaration",
389        "constructor_declaration",
390        "field_declaration",
391    ],
392    container_node_types: &["declaration_list", "block"],
393    call_entity_identifiers: &[],
394    suppressed_nested_entities: &[],
395    scope_boundary_types: &[],
396    get_language: get_csharp,
397};
398
399static PHP_CONFIG: LanguageConfig = LanguageConfig {
400    id: "php",
401    extensions: &[".php"],
402    entity_node_types: &[
403        "function_definition",
404        "class_declaration",
405        "method_declaration",
406        "interface_declaration",
407        "trait_declaration",
408        "enum_declaration",
409        "namespace_definition",
410    ],
411    container_node_types: &["declaration_list", "enum_declaration_list", "compound_statement"],
412    call_entity_identifiers: &[],
413    suppressed_nested_entities: &[],
414    scope_boundary_types: &[],
415    get_language: get_php,
416};
417
418static FORTRAN_CONFIG: LanguageConfig = LanguageConfig {
419    id: "fortran",
420    extensions: &[".f90", ".f95", ".f03", ".f08", ".f", ".for"],
421    entity_node_types: &[
422        "function",
423        "subroutine",
424        "module",
425        "program",
426        "interface",
427        "type_declaration",
428    ],
429    container_node_types: &["module", "program", "internal_procedures"],
430    call_entity_identifiers: &[],
431    suppressed_nested_entities: &[],
432    scope_boundary_types: &[],
433    get_language: get_fortran,
434};
435
436static SWIFT_CONFIG: LanguageConfig = LanguageConfig {
437    id: "swift",
438    extensions: &[".swift"],
439    entity_node_types: &[
440        "function_declaration",
441        "class_declaration",
442        "protocol_declaration",
443        "init_declaration",
444        "deinit_declaration",
445        "subscript_declaration",
446        "typealias_declaration",
447        "property_declaration",
448        "operator_declaration",
449        "associatedtype_declaration",
450    ],
451    container_node_types: &["class_body", "protocol_body", "enum_class_body", "function_body"],
452    call_entity_identifiers: &[],
453    suppressed_nested_entities: &[],
454    scope_boundary_types: &[],
455    get_language: get_swift,
456};
457
458static ELIXIR_CONFIG: LanguageConfig = LanguageConfig {
459    id: "elixir",
460    extensions: &[".ex", ".exs"],
461    entity_node_types: &[],
462    container_node_types: &["do_block"],
463    call_entity_identifiers: &[
464        "defmodule", "def", "defp", "defmacro", "defmacrop",
465        "defguard", "defguardp", "defprotocol", "defimpl",
466        "defstruct", "defexception", "defdelegate",
467    ],
468    suppressed_nested_entities: &[],
469    scope_boundary_types: &[],
470    get_language: get_elixir,
471};
472
473static BASH_CONFIG: LanguageConfig = LanguageConfig {
474    id: "bash",
475    extensions: &[".sh"],
476    entity_node_types: &["function_definition"],
477    container_node_types: &["compound_statement"],
478    call_entity_identifiers: &[],
479    suppressed_nested_entities: &[],
480    scope_boundary_types: &[],
481    get_language: get_bash,
482};
483
484static HCL_CONFIG: LanguageConfig = LanguageConfig {
485    id: "hcl",
486    extensions: &[".hcl", ".tf", ".tfvars"],
487    entity_node_types: &["block", "attribute"],
488    container_node_types: &["body"],
489    call_entity_identifiers: &[],
490    suppressed_nested_entities: &[SuppressedNestedEntity {
491        parent_entity_node_type: "block",
492        child_entity_node_type: "attribute",
493    }],
494    scope_boundary_types: &[],
495    get_language: get_hcl,
496};
497
498static KOTLIN_CONFIG: LanguageConfig = LanguageConfig {
499    id: "kotlin",
500    extensions: &[".kt", ".kts"],
501    entity_node_types: &[
502        "function_declaration",
503        "class_declaration",
504        "object_declaration",
505        "property_declaration",
506        "companion_object",
507        "secondary_constructor",
508        "type_alias",
509    ],
510    container_node_types: &["class_body", "enum_class_body"],
511    call_entity_identifiers: &[],
512    suppressed_nested_entities: &[],
513    scope_boundary_types: &[],
514    get_language: get_kotlin,
515};
516
517static XML_CONFIG: LanguageConfig = LanguageConfig {
518    id: "xml",
519    extensions: &[".xml", ".plist", ".svg", ".xhtml", ".csproj", ".fsproj", ".vbproj", ".props", ".targets", ".nuspec", ".resx", ".xaml", ".axml"],
520    entity_node_types: &["element"],
521    container_node_types: &["content"],
522    call_entity_identifiers: &[],
523    suppressed_nested_entities: &[],
524    scope_boundary_types: &[],
525    get_language: get_xml,
526};
527
528static DART_CONFIG: LanguageConfig = LanguageConfig {
529    id: "dart",
530    extensions: &[".dart"],
531    entity_node_types: &[
532        "class_declaration",
533        "mixin_declaration",
534        "extension_declaration",
535        "extension_type_declaration",
536        "enum_declaration",
537        "type_alias",
538        "class_member",
539        "function_signature",
540        "getter_signature",
541        "setter_signature",
542    ],
543    container_node_types: &["class_body", "enum_body", "extension_body"],
544    call_entity_identifiers: &[],
545    suppressed_nested_entities: &[],
546    scope_boundary_types: &[],
547    get_language: get_dart,
548};
549  
550static PERL_CONFIG: LanguageConfig = LanguageConfig {
551    id: "perl",
552    extensions: &[".pl", ".pm", ".t"],
553    entity_node_types: &[
554        "subroutine_declaration_statement",
555        "package_statement",
556    ],
557    container_node_types: &["block"],
558    call_entity_identifiers: &[],
559    suppressed_nested_entities: &[],
560    scope_boundary_types: &[],
561    get_language: get_perl,
562};
563
564static OCAML_CONFIG: LanguageConfig = LanguageConfig {
565    id: "ocaml",
566    extensions: &[".ml"],
567    entity_node_types: &[
568        "value_definition",
569        "module_definition",
570        "module_type_definition",
571        "type_definition",
572        "exception_definition",
573        "class_definition",
574        "class_type_definition",
575        "external",
576    ],
577    container_node_types: &["structure", "module_binding"],
578    call_entity_identifiers: &[],
579    suppressed_nested_entities: &[],
580    scope_boundary_types: &[],
581    get_language: get_ocaml,
582};
583
584static OCAML_INTERFACE_CONFIG: LanguageConfig = LanguageConfig {
585    id: "ocaml_interface",
586    extensions: &[".mli"],
587    entity_node_types: &[
588        "value_specification",
589        "module_definition",
590        "module_type_definition",
591        "type_definition",
592        "exception_definition",
593        "class_definition",
594        "class_type_definition",
595        "external",
596    ],
597    container_node_types: &["signature", "module_binding"],
598    call_entity_identifiers: &[],
599    suppressed_nested_entities: &[],
600    scope_boundary_types: &[],
601    get_language: get_ocaml_interface,
602};
603
604static SCALA_CONFIG: LanguageConfig = LanguageConfig {
605    id: "scala",
606    extensions: &[".scala", ".sc", ".sbt", ".kojo", ".mill"],
607    entity_node_types: &[
608        "class_definition",
609        "object_definition",
610        "trait_definition",
611        "enum_definition",
612        "function_definition",
613        "function_declaration",
614        "val_definition",
615        "given_definition",
616        "extension_definition",
617        "type_definition",
618        "package_object",
619    ],
620    container_node_types: &["template_body", "enum_body", "with_template_body"],
621    call_entity_identifiers: &[],
622    suppressed_nested_entities: &[],
623    scope_boundary_types: &[],
624    get_language: get_scala,
625};
626
627static ZIG_CONFIG: LanguageConfig = LanguageConfig {
628    id: "zig",
629    extensions: &[".zig"],
630    entity_node_types: &[
631        "function_declaration",
632        "test_declaration",
633        "variable_declaration",
634    ],
635    container_node_types: &["block"],
636    call_entity_identifiers: &[],
637    suppressed_nested_entities: &[
638        SuppressedNestedEntity {
639            parent_entity_node_type: "function_declaration",
640            child_entity_node_type: "variable_declaration",
641        },
642    ],
643    scope_boundary_types: &[],
644    get_language: get_zig,
645};
646
647static ALL_CONFIGS: &[&LanguageConfig] = &[
648    &TYPESCRIPT_CONFIG,
649    &TSX_CONFIG,
650    &JAVASCRIPT_CONFIG,
651    &PYTHON_CONFIG,
652    &GO_CONFIG,
653    &RUST_CONFIG,
654    &JAVA_CONFIG,
655    &C_CONFIG,
656    &CPP_CONFIG,
657    &RUBY_CONFIG,
658    &CSHARP_CONFIG,
659    &PHP_CONFIG,
660    &FORTRAN_CONFIG,
661    &SWIFT_CONFIG,
662    &ELIXIR_CONFIG,
663    &BASH_CONFIG,
664    &HCL_CONFIG,
665    &KOTLIN_CONFIG,
666    &XML_CONFIG,
667    &DART_CONFIG,
668    &PERL_CONFIG,
669    &OCAML_CONFIG,
670    &OCAML_INTERFACE_CONFIG,
671    &SCALA_CONFIG,
672    &ZIG_CONFIG,
673];
674
675pub fn get_language_config(extension: &str) -> Option<&'static LanguageConfig> {
676    ALL_CONFIGS
677        .iter()
678        .find(|c| c.extensions.contains(&extension))
679        .copied()
680}
681
/// Returns every file extension (with leading dot) handled by a language
/// config in this file.
///
/// The list is maintained by hand and must mirror the `extensions` arrays of
/// the individual configs. `.es6` is included because the JavaScript config
/// declares it; leaving it out would make this list disagree with
/// `get_language_config`.
pub fn get_all_code_extensions() -> &'static [&'static str] {
    // All unique extensions across all language configs, grouped per language.
    static EXTENSIONS: &[&str] = &[
        // TypeScript / TSX
        ".ts", ".tsx", ".mts", ".cts",
        // JavaScript
        ".js", ".jsx", ".mjs", ".cjs", ".es6",
        // Python, Go, Rust, Java
        ".py", ".pyi", ".go", ".rs", ".java",
        // C / C++
        ".c", ".h", ".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx",
        // Ruby, C#, PHP
        ".rb", ".cs", ".php",
        // Fortran
        ".f90", ".f95", ".f03", ".f08", ".f", ".for",
        // Swift, Elixir, Bash
        ".swift", ".ex", ".exs", ".sh",
        // HCL
        ".hcl", ".tf", ".tfvars",
        // Kotlin
        ".kt", ".kts",
        // XML
        ".xml", ".plist", ".svg", ".xhtml", ".csproj", ".fsproj", ".vbproj", ".props", ".targets",
        ".nuspec", ".resx", ".xaml", ".axml",
        // Dart
        ".dart",
        // Perl
        ".pl", ".pm", ".t",
        // OCaml
        ".ml", ".mli",
        // Scala
        ".scala", ".sc", ".sbt", ".kojo", ".mill",
        // Zig
        ".zig",
    ];
    EXTENSIONS
}