recoco_splitters/
prog_langs.rs

1// ReCoco is a Rust-only fork of CocoIndex, by [CocoIndex](https://CocoIndex)
2// Original code from CocoIndex is copyrighted by CocoIndex
3// SPDX-FileCopyrightText: 2025-2026 CocoIndex (upstream)
4// SPDX-FileContributor: CocoIndex Contributors
5//
6// All modifications from the upstream for ReCoco are copyrighted by Knitli Inc.
7// SPDX-FileCopyrightText: 2026 Knitli Inc. (ReCoco)
8// SPDX-FileContributor: Adam Poulemanos <adam@knit.li>
9//
10// Both the upstream CocoIndex code and the ReCoco modifications are licensed under the Apache-2.0 License.
11// SPDX-License-Identifier: Apache-2.0
12
13//! Programming language detection and tree-sitter support.
14
15use std::collections::{HashMap, HashSet};
16use std::sync::{Arc, LazyLock};
17use unicase::UniCase;
18
/// Tree-sitter language information for syntax-aware parsing.
///
/// Values are built through `TreeSitterLanguageInfo::new`, which converts the
/// grammar handle and resolves node-kind names into the numeric ids kept here.
pub struct TreeSitterLanguageInfo {
    /// The tree-sitter grammar used to parse source text of this language.
    pub tree_sitter_lang: tree_sitter::Language,
    /// Numeric ids of the node kinds configured as "terminal" for this language.
    /// NOTE(review): presumably consumers treat these nodes as indivisible when
    /// splitting text — confirm against the code that reads this set.
    pub terminal_node_kind_ids: HashSet<u16>,
}
24
25impl TreeSitterLanguageInfo {
26    fn new(
27        lang_fn: impl Into<tree_sitter::Language>,
28        terminal_node_kinds: impl IntoIterator<Item = &'static str>,
29    ) -> Self {
30        let tree_sitter_lang: tree_sitter::Language = lang_fn.into();
31        let terminal_node_kind_ids = terminal_node_kinds
32            .into_iter()
33            .filter_map(|kind| {
34                let id = tree_sitter_lang.id_for_node_kind(kind, true);
35                if id != 0 {
36                    Some(id)
37                } else {
38                    // Node kind not found - this is a configuration issue
39                    None
40                }
41            })
42            .collect();
43        Self {
44            tree_sitter_lang,
45            terminal_node_kind_ids,
46        }
47    }
48}
49
/// Information about a programming language.
///
/// One instance exists per language; the language registry in this module
/// stores it behind an `Arc` under the main name and under every alias.
pub struct ProgrammingLanguageInfo {
    /// The main name of the language.
    /// It's expected to be consistent with the language names listed at:
    /// <https://github.com/Goldziher/tree-sitter-language-pack/tree/main?tab=readme-ov-file#available-languages>
    pub name: Arc<str>,

    /// Optional tree-sitter language info for syntax-aware parsing.
    /// `None` for languages registered without a tree-sitter grammar.
    pub treesitter_info: Option<TreeSitterLanguageInfo>,
}
60
/// Global registry mapping language names and aliases to their language info.
///
/// Keys are wrapped in `UniCase`, so lookups ignore case. A language's main
/// name and all of its aliases map to the same shared `Arc`. Aliases come in
/// three forms: dotted file extensions (`.rs`, `.cmake.in`), bare short names
/// (`js`, `golang`, `c#`) and whole file names (`go.mod`, `meson.build`).
///
/// The table is built lazily on first access. Building it panics if the same
/// key (compared case-insensitively) is registered twice.
static LANGUAGE_INFO_BY_NAME: LazyLock<
    HashMap<UniCase<&'static str>, Arc<ProgrammingLanguageInfo>>,
> = LazyLock::new(|| {
    let mut map = HashMap::new();

    // Adds a language to the global map of languages.
    // `name` is the main name of the language, used to set the `name` field of the `ProgrammingLanguageInfo`.
    // `aliases` are the other names of the language, which can be language names or file extensions (e.g. `.js`, `.py`).
    // `treesitter_info` is the optional tree-sitter grammar info stored alongside the name.
    let mut add = |name: &'static str,
                   aliases: &[&'static str],
                   treesitter_info: Option<TreeSitterLanguageInfo>| {
        let config = Arc::new(ProgrammingLanguageInfo {
            name: Arc::from(name),
            treesitter_info,
        });
        // Register the main name first, then every alias, all pointing at the same `Arc`.
        for name in std::iter::once(name).chain(aliases.iter().copied()) {
            // `insert` returning `Some` means the key was already taken: fail loudly
            // rather than letting one language silently shadow another.
            if map.insert(name.into(), config.clone()).is_some() {
                panic!("Language `{name}` already exists");
            }
        }
    };

    // Languages sorted alphabetically by name
    add("actionscript", &[".as"], None);
    add("ada", &[".ada", ".adb", ".ads"], None);
    add("agda", &[".agda"], None);
    add("apex", &[".cls", ".trigger"], None);
    add("arduino", &[".ino"], None);
    add("asm", &[".asm", ".a51", ".i", ".nas", ".nasm", ".s"], None);
    add("astro", &[".astro"], None);
    add("bash", &[".sh", ".bash"], None);
    add("beancount", &[".beancount"], None);
    add("bibtex", &[".bib", ".bibtex"], None);
    add("bicep", &[".bicep", ".bicepparam"], None);
    add("bitbake", &[".bb", ".bbappend", ".bbclass"], None);
    add(
        "c",
        &[".c", ".cats", ".h.in", ".idc"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_c::LANGUAGE, [])),
    );
    add("cairo", &[".cairo"], None);
    add("capnp", &[".capnp"], None);
    add("chatito", &[".chatito"], None);
    add("clarity", &[".clar"], None);
    add(
        "clojure",
        &[
            ".clj", ".boot", ".cl2", ".cljc", ".cljs", ".cljs.hl", ".cljscm", ".cljx", ".hic",
        ],
        None,
    );
    add("cmake", &[".cmake", ".cmake.in"], None);
    add(
        "commonlisp",
        &[
            ".lisp", ".asd", ".cl", ".l", ".lsp", ".ny", ".podsl", ".sexp",
        ],
        None,
    );
    add(
        "cpp",
        &[
            ".cpp", ".h", ".c++", ".cc", ".cp", ".cppm", ".cxx", ".h++", ".hh", ".hpp", ".hxx",
            ".inl", ".ipp", ".ixx", ".tcc", ".tpp", ".txx", "c++",
        ],
        Some(TreeSitterLanguageInfo::new(tree_sitter_cpp::LANGUAGE, [])),
    );
    add("cpon", &[".cpon"], None);
    add(
        "csharp",
        &[".cs", ".cake", ".cs.pp", ".csx", ".linq", "cs", "c#"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_c_sharp::LANGUAGE,
            [],
        )),
    );
    add(
        "css",
        &[".css", ".scss"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_css::LANGUAGE, [])),
    );
    add("csv", &[".csv"], None);
    add("cuda", &[".cu", ".cuh"], None);
    add("d", &[".d", ".di"], None);
    add("dart", &[".dart"], None);
    add("dockerfile", &[".dockerfile", ".containerfile"], None);
    add(
        "dtd",
        &[".dtd"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_xml::LANGUAGE_DTD,
            [],
        )),
    );
    add("elisp", &[".el"], None);
    add("elixir", &[".ex", ".exs"], None);
    add("elm", &[".elm"], None);
    add("embeddedtemplate", &[".ets"], None);
    add(
        "erlang",
        &[
            ".erl", ".app", ".app.src", ".escript", ".hrl", ".xrl", ".yrl",
        ],
        None,
    );
    add("fennel", &[".fnl"], None);
    add("firrtl", &[".fir"], None);
    add("fish", &[".fish"], None);
    add(
        "fortran",
        &[".f", ".f90", ".f95", ".f03", "f", "f90", "f95", "f03"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_fortran::LANGUAGE,
            [],
        )),
    );
    add("fsharp", &[".fs", ".fsi", ".fsx"], None);
    add("func", &[".func"], None);
    add("gdscript", &[".gd"], None);
    add("gitattributes", &[".gitattributes"], None);
    add("gitignore", &[".gitignore"], None);
    add("gleam", &[".gleam"], None);
    add("glsl", &[".glsl", ".vert", ".frag"], None);
    add("gn", &[".gn", ".gni"], None);
    add(
        "go",
        &[".go", "golang"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_go::LANGUAGE, [])),
    );
    add("gomod", &["go.mod"], None);
    add("gosum", &["go.sum"], None);
    add("graphql", &[".graphql", ".gql"], None);
    add(
        "groovy",
        &[".groovy", ".grt", ".gtpl", ".gvy", ".gradle"],
        None,
    );
    add("hack", &[".hack"], None);
    add("hare", &[".ha"], None);
    add("haskell", &[".hs", ".hs-boot", ".hsc"], None);
    add("haxe", &[".hx"], None);
    add("hcl", &[".hcl", ".tf"], None);
    add("heex", &[".heex"], None);
    add("hlsl", &[".hlsl"], None);
    add(
        "html",
        &[".html", ".htm", ".hta", ".html.hl", ".xht", ".xhtml"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_html::LANGUAGE, [])),
    );
    add("hyprlang", &[".hl"], None);
    add("ini", &[".ini", ".cfg"], None);
    add("ispc", &[".ispc"], None);
    add("janet", &[".janet"], None);
    add(
        "java",
        &[".java", ".jav", ".jsh"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_java::LANGUAGE, [])),
    );
    add(
        "javascript",
        &[
            ".js",
            "._js",
            ".bones",
            ".cjs",
            ".es",
            ".es6",
            ".gs",
            ".jake",
            ".javascript",
            ".jsb",
            ".jscad",
            ".jsfl",
            ".jslib",
            ".jsm",
            ".jspre",
            ".jss",
            ".jsx",
            ".mjs",
            ".njs",
            ".pac",
            ".sjs",
            ".ssjs",
            ".xsjs",
            ".xsjslib",
            "js",
        ],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_javascript::LANGUAGE,
            [],
        )),
    );
    add(
        "json",
        &[
            ".json",
            ".4DForm",
            ".4DProject",
            ".avsc",
            ".geojson",
            ".gltf",
            ".har",
            ".ice",
            ".JSON-tmLanguage",
            ".json.example",
            ".jsonl",
            ".mcmeta",
            ".sarif",
            ".tact",
            ".tfstate",
            ".tfstate.backup",
            ".topojson",
            ".webapp",
            ".webmanifest",
            ".yy",
            ".yyp",
        ],
        Some(TreeSitterLanguageInfo::new(tree_sitter_json::LANGUAGE, [])),
    );
    add("jsonnet", &[".jsonnet"], None);
    add("julia", &[".jl"], None);
    add("kdl", &[".kdl"], None);
    add(
        "kotlin",
        &[".kt", ".ktm", ".kts"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_kotlin_ng::LANGUAGE,
            [],
        )),
    );
    add("latex", &[".tex"], None);
    add("linkerscript", &[".ld"], None);
    add("llvm", &[".ll"], None);
    add(
        "lua",
        &[
            ".lua",
            ".nse",
            ".p8",
            ".pd_lua",
            ".rbxs",
            ".rockspec",
            ".wlua",
        ],
        None,
    );
    add("luau", &[".luau"], None);
    add("magik", &[".magik"], None);
    add(
        "make",
        &[".mak", ".make", ".makefile", ".mk", ".mkfile"],
        None,
    );
    add(
        "markdown",
        &[
            ".md",
            ".livemd",
            ".markdown",
            ".mdown",
            ".mdwn",
            ".mdx",
            ".mkd",
            ".mkdn",
            ".mkdown",
            ".ronn",
            ".scd",
            ".workbook",
            "md",
        ],
        // Markdown is the only entry that supplies terminal node kinds.
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_md::LANGUAGE,
            ["inline", "indented_code_block", "fenced_code_block"],
        )),
    );
    add("mermaid", &[".mmd"], None);
    add("meson", &["meson.build"], None);
    add("netlinx", &[".axi"], None);
    add(
        "nim",
        &[".nim", ".nim.cfg", ".nimble", ".nimrod", ".nims"],
        None,
    );
    add("ninja", &[".ninja"], None);
    add("nix", &[".nix"], None);
    add("nqc", &[".nqc"], None);
    add(
        "pascal",
        &[
            ".pas", ".dfm", ".dpr", ".lpr", ".pascal", "pas", "dpr", "delphi",
        ],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_pascal::LANGUAGE,
            [],
        )),
    );
    add("pem", &[".pem"], None);
    add(
        "perl",
        &[
            ".pl", ".al", ".cgi", ".fcgi", ".perl", ".ph", ".plx", ".pm", ".psgi", ".t",
        ],
        None,
    );
    add("pgn", &[".pgn"], None);
    add(
        "php",
        &[".php"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_php::LANGUAGE_PHP,
            [],
        )),
    );
    add("po", &[".po"], None);
    add("pony", &[".pony"], None);
    add("powershell", &[".ps1"], None);
    add("prisma", &[".prisma"], None);
    add("properties", &[".properties"], None);
    add("proto", &[".proto"], None);
    add("psv", &[".psv"], None);
    add("puppet", &[".pp"], None);
    add("purescript", &[".purs"], None);
    add(
        "python",
        &[".py", ".pyw", ".pyi", ".pyx", ".pxd", ".pxi"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_python::LANGUAGE,
            [],
        )),
    );
    add("qmljs", &[".qml"], None);
    add(
        "r",
        &[".r"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_r::LANGUAGE, [])),
    );
    add("racket", &[".rkt"], None);
    add("rbs", &[".rbs"], None);
    add("re2c", &[".re"], None);
    add("rego", &[".rego"], None);
    add("requirements", &["requirements.txt"], None);
    add("ron", &[".ron"], None);
    add("rst", &[".rst"], None);
    add(
        "ruby",
        &[".rb"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_ruby::LANGUAGE, [])),
    );
    add(
        "rust",
        &[".rs", "rs"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_rust::LANGUAGE, [])),
    );
    add(
        "scala",
        &[".scala"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_scala::LANGUAGE, [])),
    );
    add("scheme", &[".ss"], None);
    add("slang", &[".slang"], None);
    add("smali", &[".smali"], None);
    add("smithy", &[".smithy"], None);
    add(
        "solidity",
        &[".sol"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_solidity::LANGUAGE,
            [],
        )),
    );
    add("sparql", &[".sparql"], None);
    add(
        "sql",
        &[".sql"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_sequel::LANGUAGE,
            [],
        )),
    );
    add("squirrel", &[".nut"], None);
    add("starlark", &[".star", ".bzl"], None);
    add("svelte", &[".svelte"], None);
    add(
        "swift",
        &[".swift"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_swift::LANGUAGE, [])),
    );
    add("tablegen", &[".td"], None);
    add("tcl", &[".tcl"], None);
    add("thrift", &[".thrift"], None);
    add(
        "toml",
        &[".toml"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_toml_ng::LANGUAGE,
            [],
        )),
    );
    add("tsv", &[".tsv"], None);
    add(
        "tsx",
        &[".tsx"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_typescript::LANGUAGE_TSX,
            [],
        )),
    );
    add("twig", &[".twig"], None);
    add(
        "typescript",
        &[".ts", "ts"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_typescript::LANGUAGE_TYPESCRIPT,
            [],
        )),
    );
    add("typst", &[".typ"], None);
    add("udev", &[".rules"], None);
    add("ungrammar", &[".ungram"], None);
    add("uxntal", &[".tal"], None);
    add("verilog", &[".vh"], None);
    add("vhdl", &[".vhd", ".vhdl"], None);
    add("vim", &[".vim"], None);
    add("vue", &[".vue"], None);
    add("wast", &[".wast"], None);
    add("wat", &[".wat"], None);
    add("wgsl", &[".wgsl"], None);
    add("xcompose", &[".xcompose"], None);
    add(
        "xml",
        &[".xml"],
        Some(TreeSitterLanguageInfo::new(
            tree_sitter_xml::LANGUAGE_XML,
            [],
        )),
    );
    add(
        "yaml",
        &[".yaml", ".yml"],
        Some(TreeSitterLanguageInfo::new(tree_sitter_yaml::LANGUAGE, [])),
    );
    add("yuck", &[".yuck"], None);
    add("zig", &[".zig"], None);

    map
});
507
508/// Get programming language info by name or file extension.
509///
510/// The lookup is case-insensitive and supports both language names
511/// (e.g., "rust", "python") and file extensions (e.g., ".rs", ".py").
512pub fn get_language_info(name: &str) -> Option<&ProgrammingLanguageInfo> {
513    LANGUAGE_INFO_BY_NAME
514        .get(&UniCase::new(name))
515        .map(|info| info.as_ref())
516}
517
518/// Detect programming language from a filename.
519///
520/// Returns the language name if the file extension is recognized.
521pub fn detect_language(filename: &str) -> Option<&str> {
522    let last_dot = filename.rfind('.')?;
523    let extension = &filename[last_dot..];
524    get_language_info(extension).map(|info| info.name.as_ref())
525}
526
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension keys resolve to the main language name, matching ignores
    /// case, and unregistered keys yield `None`.
    #[test]
    fn test_get_language_info() {
        // (lookup key, expected main name); ".RS" covers case-insensitivity.
        let expectations = [(".rs", "rust"), (".py", "python"), (".RS", "rust")];
        for (key, expected_name) in expectations {
            let info = get_language_info(key).unwrap();
            assert_eq!(info.name.as_ref(), expected_name);
        }

        // Rust is registered together with a tree-sitter grammar.
        assert!(get_language_info(".rs").unwrap().treesitter_info.is_some());

        // Unknown extension
        assert!(get_language_info(".unknown").is_none());
    }

    /// File names map to a language through their extension; names with no
    /// extension or an unregistered one produce `None`.
    #[test]
    fn test_detect_language() {
        let expectations = [
            ("test.rs", Some("rust")),
            ("main.py", Some("python")),
            ("app.js", Some("javascript")),
            ("noextension", None),
            ("unknown.xyz", None),
        ];
        for (filename, expected) in expectations {
            assert_eq!(detect_language(filename), expected);
        }
    }
}