Skip to main content

sem_core/parser/plugins/code/
languages.rs

1use tree_sitter::Language;
2
/// A parent/child pair of node type names describing a nesting that should
/// not produce a separate entity.
///
/// Judging by the language tables in this file, a `child_entity_node_type`
/// node found inside a `parent_entity_node_type` entity is skipped instead of
/// being reported as a nested entity (e.g. local variable declarations inside
/// a JS/TS function).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SuppressedNestedEntity {
    /// Node type name of the enclosing entity.
    pub parent_entity_node_type: &'static str,
    /// Node type name of the nested entity to suppress inside that parent.
    pub child_entity_node_type: &'static str,
}
7
8#[allow(dead_code)]
9pub struct LanguageConfig {
10    pub id: &'static str,
11    pub extensions: &'static [&'static str],
12    pub entity_node_types: &'static [&'static str],
13    pub container_node_types: &'static [&'static str],
14    pub call_entity_identifiers: &'static [&'static str],
15    pub suppressed_nested_entities: &'static [SuppressedNestedEntity],
16    /// Node types that introduce a new scope. The general (non-container) recursion
17    /// in visit_node will not descend into these nodes, preventing local variables
18    /// inside function bodies from being extracted as top-level entities.
19    pub scope_boundary_types: &'static [&'static str],
20    pub get_language: fn() -> Option<Language>,
21}
22
23fn get_typescript() -> Option<Language> {
24    Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into())
25}
26
27fn get_tsx() -> Option<Language> {
28    Some(tree_sitter_typescript::LANGUAGE_TSX.into())
29}
30
31fn get_javascript() -> Option<Language> {
32    Some(tree_sitter_javascript::LANGUAGE.into())
33}
34
35fn get_python() -> Option<Language> {
36    Some(tree_sitter_python::LANGUAGE.into())
37}
38
39fn get_go() -> Option<Language> {
40    Some(tree_sitter_go::LANGUAGE.into())
41}
42
43fn get_rust() -> Option<Language> {
44    Some(tree_sitter_rust::LANGUAGE.into())
45}
46
47fn get_java() -> Option<Language> {
48    Some(tree_sitter_java::LANGUAGE.into())
49}
50
51fn get_c() -> Option<Language> {
52    Some(tree_sitter_c::LANGUAGE.into())
53}
54
55fn get_cpp() -> Option<Language> {
56    Some(tree_sitter_cpp::LANGUAGE.into())
57}
58
59fn get_ruby() -> Option<Language> {
60    Some(tree_sitter_ruby::LANGUAGE.into())
61}
62
63fn get_csharp() -> Option<Language> {
64    Some(tree_sitter_c_sharp::LANGUAGE.into())
65}
66
67fn get_php() -> Option<Language> {
68    Some(tree_sitter_php::LANGUAGE_PHP.into())
69}
70
71fn get_fortran() -> Option<Language> {
72    Some(tree_sitter_fortran::LANGUAGE.into())
73}
74
75fn get_swift() -> Option<Language> {
76    Some(tree_sitter_swift::LANGUAGE.into())
77}
78
79fn get_elixir() -> Option<Language> {
80    Some(tree_sitter_elixir::LANGUAGE.into())
81}
82
83fn get_bash() -> Option<Language> {
84    Some(tree_sitter_bash::LANGUAGE.into())
85}
86
87fn get_hcl() -> Option<Language> {
88    Some(tree_sitter_hcl::LANGUAGE.into())
89}
90
91fn get_kotlin() -> Option<Language> {
92    Some(tree_sitter_kotlin_ng::LANGUAGE.into())
93}
94
95fn get_xml() -> Option<Language> {
96    Some(tree_sitter_xml::LANGUAGE_XML.into())
97}
98
99fn get_dart() -> Option<Language> {
100    Some(tree_sitter_dart::LANGUAGE.into())
101}
102
103fn get_perl() -> Option<Language> {
104    Some(tree_sitter_perl_next::LANGUAGE.into())
105}
106
107fn get_ocaml() -> Option<Language> {
108    Some(tree_sitter_ocaml::LANGUAGE_OCAML.into())
109}
110
111fn get_ocaml_interface() -> Option<Language> {
112    Some(tree_sitter_ocaml::LANGUAGE_OCAML_INTERFACE.into())
113}
114
115fn get_scala() -> Option<Language> {
116    Some(tree_sitter_scala::LANGUAGE.into())
117}
118
119/// Inside JS/TS function bodies, suppress variable declarations so that local
120/// variables are not extracted as nested entities. Inner function/class
121/// declarations are still extracted for diff granularity.
122const JS_TS_SUPPRESSED_NESTED: &[SuppressedNestedEntity] = &[
123    SuppressedNestedEntity {
124        parent_entity_node_type: "function_declaration",
125        child_entity_node_type: "lexical_declaration",
126    },
127    SuppressedNestedEntity {
128        parent_entity_node_type: "function_declaration",
129        child_entity_node_type: "variable_declaration",
130    },
131    SuppressedNestedEntity {
132        parent_entity_node_type: "generator_function_declaration",
133        child_entity_node_type: "lexical_declaration",
134    },
135    SuppressedNestedEntity {
136        parent_entity_node_type: "generator_function_declaration",
137        child_entity_node_type: "variable_declaration",
138    },
139    SuppressedNestedEntity {
140        parent_entity_node_type: "method_definition",
141        child_entity_node_type: "lexical_declaration",
142    },
143    SuppressedNestedEntity {
144        parent_entity_node_type: "method_definition",
145        child_entity_node_type: "variable_declaration",
146    },
147    // Scope boundaries: suppress local variables inside arrow functions,
148    // function expressions, and generator functions, while still allowing
149    // inner class/function declarations to be extracted.
150    SuppressedNestedEntity {
151        parent_entity_node_type: "arrow_function",
152        child_entity_node_type: "lexical_declaration",
153    },
154    SuppressedNestedEntity {
155        parent_entity_node_type: "arrow_function",
156        child_entity_node_type: "variable_declaration",
157    },
158    SuppressedNestedEntity {
159        parent_entity_node_type: "function_expression",
160        child_entity_node_type: "lexical_declaration",
161    },
162    SuppressedNestedEntity {
163        parent_entity_node_type: "function_expression",
164        child_entity_node_type: "variable_declaration",
165    },
166    SuppressedNestedEntity {
167        parent_entity_node_type: "generator_function",
168        child_entity_node_type: "lexical_declaration",
169    },
170    SuppressedNestedEntity {
171        parent_entity_node_type: "generator_function",
172        child_entity_node_type: "variable_declaration",
173    },
174];
175
/// Function-like expression node types used as `scope_boundary_types` by the
/// JavaScript, TypeScript and TSX configs.
const JS_TS_SCOPE_BOUNDARIES: &[&str] =
    &["arrow_function", "function_expression", "generator_function"];
181
182static TYPESCRIPT_CONFIG: LanguageConfig = LanguageConfig {
183    id: "typescript",
184    extensions: &[".ts", ".mts", ".cts"],
185    entity_node_types: &[
186        "function_declaration",
187        "generator_function_declaration",
188        "class_declaration",
189        "interface_declaration",
190        "type_alias_declaration",
191        "enum_declaration",
192        "export_statement",
193        "lexical_declaration",
194        "variable_declaration",
195        "method_definition",
196        "public_field_definition",
197    ],
198    container_node_types: &["class_body", "interface_body", "enum_body", "statement_block"],
199    call_entity_identifiers: &[],
200    suppressed_nested_entities: JS_TS_SUPPRESSED_NESTED,
201    scope_boundary_types: JS_TS_SCOPE_BOUNDARIES,
202    get_language: get_typescript,
203};
204
205static TSX_CONFIG: LanguageConfig = LanguageConfig {
206    id: "tsx",
207    extensions: &[".tsx"],
208    entity_node_types: &[
209        "function_declaration",
210        "generator_function_declaration",
211        "class_declaration",
212        "interface_declaration",
213        "type_alias_declaration",
214        "enum_declaration",
215        "export_statement",
216        "lexical_declaration",
217        "variable_declaration",
218        "method_definition",
219        "public_field_definition",
220    ],
221    container_node_types: &["class_body", "interface_body", "enum_body", "statement_block"],
222    call_entity_identifiers: &[],
223    suppressed_nested_entities: JS_TS_SUPPRESSED_NESTED,
224    scope_boundary_types: JS_TS_SCOPE_BOUNDARIES,
225    get_language: get_tsx,
226};
227
228static JAVASCRIPT_CONFIG: LanguageConfig = LanguageConfig {
229    id: "javascript",
230    extensions: &[".js", ".jsx", ".mjs", ".cjs", ".es6"],
231    entity_node_types: &[
232        "function_declaration",
233        "generator_function_declaration",
234        "class_declaration",
235        "export_statement",
236        "lexical_declaration",
237        "variable_declaration",
238        "method_definition",
239        "field_definition",
240    ],
241    container_node_types: &["class_body", "statement_block"],
242    call_entity_identifiers: &[],
243    suppressed_nested_entities: JS_TS_SUPPRESSED_NESTED,
244    scope_boundary_types: JS_TS_SCOPE_BOUNDARIES,
245    get_language: get_javascript,
246};
247
248static PYTHON_CONFIG: LanguageConfig = LanguageConfig {
249    id: "python",
250    extensions: &[".py", ".pyi"],
251    entity_node_types: &[
252        "function_definition",
253        "class_definition",
254        "decorated_definition",
255    ],
256    container_node_types: &["block"],
257    call_entity_identifiers: &[],
258    suppressed_nested_entities: &[],
259    scope_boundary_types: &[],
260    get_language: get_python,
261};
262
263static GO_CONFIG: LanguageConfig = LanguageConfig {
264    id: "go",
265    extensions: &[".go"],
266    entity_node_types: &[
267        "function_declaration",
268        "method_declaration",
269        "type_declaration",
270        "var_declaration",
271        "const_declaration",
272    ],
273    container_node_types: &["block"],
274    call_entity_identifiers: &[],
275    suppressed_nested_entities: &[],
276    scope_boundary_types: &[],
277    get_language: get_go,
278};
279
280static RUST_CONFIG: LanguageConfig = LanguageConfig {
281    id: "rust",
282    extensions: &[".rs"],
283    entity_node_types: &[
284        "function_item",
285        "struct_item",
286        "enum_item",
287        "impl_item",
288        "trait_item",
289        "mod_item",
290        "const_item",
291        "static_item",
292        "type_item",
293    ],
294    container_node_types: &["declaration_list", "block"],
295    call_entity_identifiers: &[],
296    suppressed_nested_entities: &[],
297    scope_boundary_types: &[],
298    get_language: get_rust,
299};
300
301static JAVA_CONFIG: LanguageConfig = LanguageConfig {
302    id: "java",
303    extensions: &[".java"],
304    entity_node_types: &[
305        "class_declaration",
306        "method_declaration",
307        "interface_declaration",
308        "enum_declaration",
309        "field_declaration",
310        "constructor_declaration",
311        "annotation_type_declaration",
312    ],
313    container_node_types: &["class_body", "interface_body", "enum_body", "block"],
314    call_entity_identifiers: &[],
315    suppressed_nested_entities: &[],
316    scope_boundary_types: &[],
317    get_language: get_java,
318};
319
320static C_CONFIG: LanguageConfig = LanguageConfig {
321    id: "c",
322    extensions: &[".c", ".h"],
323    entity_node_types: &[
324        "function_definition",
325        "struct_specifier",
326        "enum_specifier",
327        "union_specifier",
328        "type_definition",
329        "declaration",
330    ],
331    container_node_types: &["compound_statement"],
332    call_entity_identifiers: &[],
333    suppressed_nested_entities: &[],
334    scope_boundary_types: &[],
335    get_language: get_c,
336};
337
338static CPP_CONFIG: LanguageConfig = LanguageConfig {
339    id: "cpp",
340    extensions: &[".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx"],
341    entity_node_types: &[
342        "function_definition",
343        "class_specifier",
344        "struct_specifier",
345        "enum_specifier",
346        "namespace_definition",
347        "template_declaration",
348        "declaration",
349        "type_definition",
350    ],
351    container_node_types: &["field_declaration_list", "declaration_list", "compound_statement"],
352    call_entity_identifiers: &[],
353    suppressed_nested_entities: &[],
354    scope_boundary_types: &[],
355    get_language: get_cpp,
356};
357
358static RUBY_CONFIG: LanguageConfig = LanguageConfig {
359    id: "ruby",
360    extensions: &[".rb"],
361    entity_node_types: &[
362        "method",
363        "singleton_method",
364        "class",
365        "module",
366    ],
367    container_node_types: &["body_statement"],
368    call_entity_identifiers: &[],
369    suppressed_nested_entities: &[],
370    scope_boundary_types: &[],
371    get_language: get_ruby,
372};
373
374static CSHARP_CONFIG: LanguageConfig = LanguageConfig {
375    id: "csharp",
376    extensions: &[".cs"],
377    entity_node_types: &[
378        "method_declaration",
379        "class_declaration",
380        "interface_declaration",
381        "enum_declaration",
382        "struct_declaration",
383        "namespace_declaration",
384        "property_declaration",
385        "constructor_declaration",
386        "field_declaration",
387    ],
388    container_node_types: &["declaration_list", "block"],
389    call_entity_identifiers: &[],
390    suppressed_nested_entities: &[],
391    scope_boundary_types: &[],
392    get_language: get_csharp,
393};
394
395static PHP_CONFIG: LanguageConfig = LanguageConfig {
396    id: "php",
397    extensions: &[".php"],
398    entity_node_types: &[
399        "function_definition",
400        "class_declaration",
401        "method_declaration",
402        "interface_declaration",
403        "trait_declaration",
404        "enum_declaration",
405        "namespace_definition",
406    ],
407    container_node_types: &["declaration_list", "enum_declaration_list", "compound_statement"],
408    call_entity_identifiers: &[],
409    suppressed_nested_entities: &[],
410    scope_boundary_types: &[],
411    get_language: get_php,
412};
413
414static FORTRAN_CONFIG: LanguageConfig = LanguageConfig {
415    id: "fortran",
416    extensions: &[".f90", ".f95", ".f03", ".f08", ".f", ".for"],
417    entity_node_types: &[
418        "function",
419        "subroutine",
420        "module",
421        "program",
422        "interface",
423        "type_declaration",
424    ],
425    container_node_types: &["module", "program", "internal_procedures"],
426    call_entity_identifiers: &[],
427    suppressed_nested_entities: &[],
428    scope_boundary_types: &[],
429    get_language: get_fortran,
430};
431
432static SWIFT_CONFIG: LanguageConfig = LanguageConfig {
433    id: "swift",
434    extensions: &[".swift"],
435    entity_node_types: &[
436        "function_declaration",
437        "class_declaration",
438        "protocol_declaration",
439        "init_declaration",
440        "deinit_declaration",
441        "subscript_declaration",
442        "typealias_declaration",
443        "property_declaration",
444        "operator_declaration",
445        "associatedtype_declaration",
446    ],
447    container_node_types: &["class_body", "protocol_body", "enum_class_body", "function_body"],
448    call_entity_identifiers: &[],
449    suppressed_nested_entities: &[],
450    scope_boundary_types: &[],
451    get_language: get_swift,
452};
453
454static ELIXIR_CONFIG: LanguageConfig = LanguageConfig {
455    id: "elixir",
456    extensions: &[".ex", ".exs"],
457    entity_node_types: &[],
458    container_node_types: &["do_block"],
459    call_entity_identifiers: &[
460        "defmodule", "def", "defp", "defmacro", "defmacrop",
461        "defguard", "defguardp", "defprotocol", "defimpl",
462        "defstruct", "defexception", "defdelegate",
463    ],
464    suppressed_nested_entities: &[],
465    scope_boundary_types: &[],
466    get_language: get_elixir,
467};
468
469static BASH_CONFIG: LanguageConfig = LanguageConfig {
470    id: "bash",
471    extensions: &[".sh"],
472    entity_node_types: &["function_definition"],
473    container_node_types: &["compound_statement"],
474    call_entity_identifiers: &[],
475    suppressed_nested_entities: &[],
476    scope_boundary_types: &[],
477    get_language: get_bash,
478};
479
480static HCL_CONFIG: LanguageConfig = LanguageConfig {
481    id: "hcl",
482    extensions: &[".hcl", ".tf", ".tfvars"],
483    entity_node_types: &["block", "attribute"],
484    container_node_types: &["body"],
485    call_entity_identifiers: &[],
486    suppressed_nested_entities: &[SuppressedNestedEntity {
487        parent_entity_node_type: "block",
488        child_entity_node_type: "attribute",
489    }],
490    scope_boundary_types: &[],
491    get_language: get_hcl,
492};
493
494static KOTLIN_CONFIG: LanguageConfig = LanguageConfig {
495    id: "kotlin",
496    extensions: &[".kt", ".kts"],
497    entity_node_types: &[
498        "function_declaration",
499        "class_declaration",
500        "object_declaration",
501        "property_declaration",
502        "companion_object",
503        "secondary_constructor",
504        "type_alias",
505    ],
506    container_node_types: &["class_body", "enum_class_body"],
507    call_entity_identifiers: &[],
508    suppressed_nested_entities: &[],
509    scope_boundary_types: &[],
510    get_language: get_kotlin,
511};
512
513static XML_CONFIG: LanguageConfig = LanguageConfig {
514    id: "xml",
515    extensions: &[".xml", ".plist", ".svg", ".xhtml", ".csproj", ".fsproj", ".vbproj", ".props", ".targets", ".nuspec", ".resx", ".xaml", ".axml"],
516    entity_node_types: &["element"],
517    container_node_types: &["content"],
518    call_entity_identifiers: &[],
519    suppressed_nested_entities: &[],
520    scope_boundary_types: &[],
521    get_language: get_xml,
522};
523
524static DART_CONFIG: LanguageConfig = LanguageConfig {
525    id: "dart",
526    extensions: &[".dart"],
527    entity_node_types: &[
528        "class_declaration",
529        "mixin_declaration",
530        "extension_declaration",
531        "extension_type_declaration",
532        "enum_declaration",
533        "type_alias",
534        "class_member",
535        "function_signature",
536        "getter_signature",
537        "setter_signature",
538    ],
539    container_node_types: &["class_body", "enum_body", "extension_body"],
540    call_entity_identifiers: &[],
541    suppressed_nested_entities: &[],
542    scope_boundary_types: &[],
543    get_language: get_dart,
544};
545  
546static PERL_CONFIG: LanguageConfig = LanguageConfig {
547    id: "perl",
548    extensions: &[".pl", ".pm", ".t"],
549    entity_node_types: &[
550        "subroutine_declaration_statement",
551        "package_statement",
552    ],
553    container_node_types: &["block"],
554    call_entity_identifiers: &[],
555    suppressed_nested_entities: &[],
556    scope_boundary_types: &[],
557    get_language: get_perl,
558};
559
560static OCAML_CONFIG: LanguageConfig = LanguageConfig {
561    id: "ocaml",
562    extensions: &[".ml"],
563    entity_node_types: &[
564        "value_definition",
565        "module_definition",
566        "module_type_definition",
567        "type_definition",
568        "exception_definition",
569        "class_definition",
570        "class_type_definition",
571        "external",
572    ],
573    container_node_types: &["structure", "module_binding"],
574    call_entity_identifiers: &[],
575    suppressed_nested_entities: &[],
576    scope_boundary_types: &[],
577    get_language: get_ocaml,
578};
579
580static OCAML_INTERFACE_CONFIG: LanguageConfig = LanguageConfig {
581    id: "ocaml_interface",
582    extensions: &[".mli"],
583    entity_node_types: &[
584        "value_specification",
585        "module_definition",
586        "module_type_definition",
587        "type_definition",
588        "exception_definition",
589        "class_definition",
590        "class_type_definition",
591        "external",
592    ],
593    container_node_types: &["signature", "module_binding"],
594    call_entity_identifiers: &[],
595    suppressed_nested_entities: &[],
596    scope_boundary_types: &[],
597    get_language: get_ocaml_interface,
598};
599
600static SCALA_CONFIG: LanguageConfig = LanguageConfig {
601    id: "scala",
602    extensions: &[".scala", ".sc", ".sbt", ".kojo", ".mill"],
603    entity_node_types: &[
604        "class_definition",
605        "object_definition",
606        "trait_definition",
607        "enum_definition",
608        "function_definition",
609        "function_declaration",
610        "val_definition",
611        "given_definition",
612        "extension_definition",
613        "type_definition",
614        "package_object",
615    ],
616    container_node_types: &["template_body", "enum_body", "with_template_body"],
617    call_entity_identifiers: &[],
618    suppressed_nested_entities: &[],
619    scope_boundary_types: &[],
620    get_language: get_scala,
621};
622
623static ALL_CONFIGS: &[&LanguageConfig] = &[
624    &TYPESCRIPT_CONFIG,
625    &TSX_CONFIG,
626    &JAVASCRIPT_CONFIG,
627    &PYTHON_CONFIG,
628    &GO_CONFIG,
629    &RUST_CONFIG,
630    &JAVA_CONFIG,
631    &C_CONFIG,
632    &CPP_CONFIG,
633    &RUBY_CONFIG,
634    &CSHARP_CONFIG,
635    &PHP_CONFIG,
636    &FORTRAN_CONFIG,
637    &SWIFT_CONFIG,
638    &ELIXIR_CONFIG,
639    &BASH_CONFIG,
640    &HCL_CONFIG,
641    &KOTLIN_CONFIG,
642    &XML_CONFIG,
643    &DART_CONFIG,
644    &PERL_CONFIG,
645    &OCAML_CONFIG,
646    &OCAML_INTERFACE_CONFIG,
647    &SCALA_CONFIG,
648];
649
650pub fn get_language_config(extension: &str) -> Option<&'static LanguageConfig> {
651    ALL_CONFIGS
652        .iter()
653        .find(|c| c.extensions.contains(&extension))
654        .copied()
655}
656
/// Returns every file extension (leading dot included) claimed by a language
/// table in this file.
///
/// The list is maintained by hand, so it must be updated whenever an
/// `extensions` array in one of the `*_CONFIG` statics changes. `.es6` was
/// previously missing even though the JavaScript table declares it.
pub fn get_all_code_extensions() -> &'static [&'static str] {
    // All unique extensions across all language configs, grouped by language.
    static EXTENSIONS: &[&str] = &[
        // TypeScript / TSX
        ".ts", ".tsx", ".mts", ".cts",
        // JavaScript
        ".js", ".jsx", ".mjs", ".cjs", ".es6",
        // Python, Go, Rust, Java
        ".py", ".pyi", ".go", ".rs", ".java",
        // C / C++
        ".c", ".h", ".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx",
        // Ruby, C#, PHP
        ".rb", ".cs", ".php",
        // Fortran
        ".f90", ".f95", ".f03", ".f08", ".f", ".for",
        // Swift, Elixir, shell
        ".swift", ".ex", ".exs", ".sh",
        // HCL / Terraform
        ".hcl", ".tf", ".tfvars",
        // Kotlin
        ".kt", ".kts",
        // XML and XML-based formats
        ".xml", ".plist", ".svg", ".xhtml", ".csproj", ".fsproj", ".vbproj", ".props", ".targets",
        ".nuspec", ".resx", ".xaml", ".axml",
        // Dart
        ".dart",
        // Perl
        ".pl", ".pm", ".t",
        // OCaml
        ".ml", ".mli",
        // Scala
        ".scala", ".sc", ".sbt", ".kojo", ".mill",
    ];
    EXTENSIONS
}