Skip to main content

normalize_languages/
registry.rs

1//! Language support registry with extension-based lookup.
2
3use crate::Language;
4use std::collections::HashMap;
5use std::path::Path;
6use std::sync::{OnceLock, RwLock};
7
8/// Global language registry.
9static LANGUAGES: RwLock<Vec<&'static dyn Language>> = RwLock::new(Vec::new());
10static INITIALIZED: OnceLock<()> = OnceLock::new();
11
12/// Cached extension → language lookup table.
13static EXTENSION_MAP: OnceLock<HashMap<&'static str, &'static dyn Language>> = OnceLock::new();
14
15/// Cached grammar_name → language lookup table.
16static GRAMMAR_MAP: OnceLock<HashMap<&'static str, &'static dyn Language>> = OnceLock::new();
17
18/// Register a language in the global registry.
19/// Called internally by language modules.
20pub fn register(lang: &'static dyn Language) {
21    LANGUAGES.write().unwrap().push(lang);
22}
23
/// Initialize built-in languages (called once).
///
/// Registers every built-in language whose Cargo feature (`lang-*`) is
/// enabled. The work is wrapped in `INITIALIZED.get_or_init`, so repeated
/// calls are cheap no-ops after the first one has completed.
///
/// Registration order is significant: the extension and grammar lookup
/// tables are filled by iterating the registry in order and inserting into a
/// `HashMap`, so if two languages claim the same key, the one registered
/// later wins.
fn init_builtin() {
    INITIALIZED.get_or_init(|| {
        #[cfg(feature = "lang-python")]
        register(&crate::python::Python);
        #[cfg(feature = "lang-rust")]
        register(&crate::rust::Rust);
        #[cfg(feature = "lang-javascript")]
        register(&crate::javascript::JavaScript);
        // One feature gates two registrations, hence the block.
        #[cfg(feature = "lang-typescript")]
        {
            register(&crate::typescript::TypeScript);
            register(&crate::typescript::Tsx);
        }
        #[cfg(feature = "lang-go")]
        register(&crate::go::Go);
        #[cfg(feature = "lang-java")]
        register(&crate::java::Java);
        #[cfg(feature = "lang-kotlin")]
        register(&crate::kotlin::Kotlin);
        #[cfg(feature = "lang-csharp")]
        register(&crate::csharp::CSharp);
        #[cfg(feature = "lang-swift")]
        register(&crate::swift::Swift);
        #[cfg(feature = "lang-php")]
        register(&crate::php::Php);
        #[cfg(feature = "lang-dockerfile")]
        register(&crate::dockerfile::Dockerfile);
        #[cfg(feature = "lang-c")]
        register(&crate::c::C);
        #[cfg(feature = "lang-cpp")]
        register(&crate::cpp::Cpp);
        #[cfg(feature = "lang-ruby")]
        register(&crate::ruby::Ruby);
        #[cfg(feature = "lang-scala")]
        register(&crate::scala::Scala);
        #[cfg(feature = "lang-vue")]
        register(&crate::vue::Vue);
        #[cfg(feature = "lang-markdown")]
        register(&crate::markdown::Markdown);
        #[cfg(feature = "lang-json")]
        register(&crate::json::Json);
        #[cfg(feature = "lang-yaml")]
        register(&crate::yaml::Yaml);
        #[cfg(feature = "lang-toml")]
        register(&crate::toml::Toml);
        #[cfg(feature = "lang-html")]
        register(&crate::html::Html);
        #[cfg(feature = "lang-css")]
        register(&crate::css::Css);
        #[cfg(feature = "lang-bash")]
        register(&crate::bash::Bash);
        #[cfg(feature = "lang-lua")]
        register(&crate::lua::Lua);
        #[cfg(feature = "lang-zig")]
        register(&crate::zig::Zig);
        #[cfg(feature = "lang-elixir")]
        register(&crate::elixir::Elixir);
        #[cfg(feature = "lang-erlang")]
        register(&crate::erlang::Erlang);
        #[cfg(feature = "lang-dart")]
        register(&crate::dart::Dart);
        #[cfg(feature = "lang-fsharp")]
        register(&crate::fsharp::FSharp);
        #[cfg(feature = "lang-sql")]
        register(&crate::sql::Sql);
        #[cfg(feature = "lang-graphql")]
        register(&crate::graphql::GraphQL);
        #[cfg(feature = "lang-hcl")]
        register(&crate::hcl::Hcl);
        #[cfg(feature = "lang-scss")]
        register(&crate::scss::Scss);
        #[cfg(feature = "lang-svelte")]
        register(&crate::svelte::Svelte);
        #[cfg(feature = "lang-xml")]
        register(&crate::xml::Xml);
        #[cfg(feature = "lang-clojure")]
        register(&crate::clojure::Clojure);
        #[cfg(feature = "lang-haskell")]
        register(&crate::haskell::Haskell);
        #[cfg(feature = "lang-ocaml")]
        register(&crate::ocaml::OCaml);
        #[cfg(feature = "lang-nix")]
        register(&crate::nix::Nix);
        #[cfg(feature = "lang-perl")]
        register(&crate::perl::Perl);
        #[cfg(feature = "lang-r")]
        register(&crate::r::R);
        #[cfg(feature = "lang-julia")]
        register(&crate::julia::Julia);
        #[cfg(feature = "lang-elm")]
        register(&crate::elm::Elm);
        #[cfg(feature = "lang-cmake")]
        register(&crate::cmake::CMake);
        #[cfg(feature = "lang-vim")]
        register(&crate::vim::Vim);
        #[cfg(feature = "lang-awk")]
        register(&crate::awk::Awk);
        #[cfg(feature = "lang-fish")]
        register(&crate::fish::Fish);
        #[cfg(feature = "lang-jq")]
        register(&crate::jq::Jq);
        #[cfg(feature = "lang-powershell")]
        register(&crate::powershell::PowerShell);
        #[cfg(feature = "lang-zsh")]
        register(&crate::zsh::Zsh);
        #[cfg(feature = "lang-groovy")]
        register(&crate::groovy::Groovy);
        #[cfg(feature = "lang-glsl")]
        register(&crate::glsl::Glsl);
        #[cfg(feature = "lang-hlsl")]
        register(&crate::hlsl::Hlsl);
        #[cfg(feature = "lang-commonlisp")]
        register(&crate::commonlisp::CommonLisp);
        #[cfg(feature = "lang-elisp")]
        register(&crate::elisp::Elisp);
        #[cfg(feature = "lang-gleam")]
        register(&crate::gleam::Gleam);
        #[cfg(feature = "lang-scheme")]
        register(&crate::scheme::Scheme);
        #[cfg(feature = "lang-ini")]
        register(&crate::ini::Ini);
        #[cfg(feature = "lang-diff")]
        register(&crate::diff::Diff);
        #[cfg(feature = "lang-dot")]
        register(&crate::dot::Dot);
        #[cfg(feature = "lang-kdl")]
        register(&crate::kdl::Kdl);
        #[cfg(feature = "lang-ada")]
        register(&crate::ada::Ada);
        #[cfg(feature = "lang-agda")]
        register(&crate::agda::Agda);
        #[cfg(feature = "lang-d")]
        register(&crate::d::D);
        #[cfg(feature = "lang-matlab")]
        register(&crate::matlab::Matlab);
        #[cfg(feature = "lang-meson")]
        register(&crate::meson::Meson);
        #[cfg(feature = "lang-nginx")]
        register(&crate::nginx::Nginx);
        #[cfg(feature = "lang-prolog")]
        register(&crate::prolog::Prolog);
        #[cfg(feature = "lang-batch")]
        register(&crate::batch::Batch);
        #[cfg(feature = "lang-asm")]
        register(&crate::asm::Asm);
        #[cfg(feature = "lang-objc")]
        register(&crate::objc::ObjC);
        #[cfg(feature = "lang-typst")]
        register(&crate::typst::Typst);
        #[cfg(feature = "lang-asciidoc")]
        register(&crate::asciidoc::AsciiDoc);
        #[cfg(feature = "lang-vb")]
        register(&crate::vb::VB);
        #[cfg(feature = "lang-idris")]
        register(&crate::idris::Idris);
        #[cfg(feature = "lang-rescript")]
        register(&crate::rescript::ReScript);
        #[cfg(feature = "lang-lean")]
        register(&crate::lean::Lean);
        #[cfg(feature = "lang-caddy")]
        register(&crate::caddy::Caddy);
        #[cfg(feature = "lang-capnp")]
        register(&crate::capnp::Capnp);
        #[cfg(feature = "lang-devicetree")]
        register(&crate::devicetree::DeviceTree);
        #[cfg(feature = "lang-jinja2")]
        register(&crate::jinja2::Jinja2);
        #[cfg(feature = "lang-ninja")]
        register(&crate::ninja::Ninja);
        #[cfg(feature = "lang-postscript")]
        register(&crate::postscript::PostScript);
        #[cfg(feature = "lang-query")]
        register(&crate::query::Query);
        #[cfg(feature = "lang-ron")]
        register(&crate::ron::Ron);
        #[cfg(feature = "lang-sparql")]
        register(&crate::sparql::Sparql);
        #[cfg(feature = "lang-sshconfig")]
        register(&crate::sshconfig::SshConfig);
        #[cfg(feature = "lang-starlark")]
        register(&crate::starlark::Starlark);
        #[cfg(feature = "lang-textproto")]
        register(&crate::textproto::TextProto);
        #[cfg(feature = "lang-thrift")]
        register(&crate::thrift::Thrift);
        #[cfg(feature = "lang-tlaplus")]
        register(&crate::tlaplus::TlaPlus);
        #[cfg(feature = "lang-uiua")]
        register(&crate::uiua::Uiua);
        #[cfg(feature = "lang-verilog")]
        register(&crate::verilog::Verilog);
        #[cfg(feature = "lang-vhdl")]
        register(&crate::vhdl::Vhdl);
        #[cfg(feature = "lang-wit")]
        register(&crate::wit::Wit);
        #[cfg(feature = "lang-x86asm")]
        register(&crate::x86asm::X86Asm);
        #[cfg(feature = "lang-yuri")]
        register(&crate::yuri::Yuri);
    });
}
226
227fn extension_map() -> &'static HashMap<&'static str, &'static dyn Language> {
228    init_builtin();
229    EXTENSION_MAP.get_or_init(|| {
230        let mut map = HashMap::new();
231        let langs = LANGUAGES.read().unwrap();
232        for lang in langs.iter() {
233            for ext in lang.extensions() {
234                map.insert(*ext, *lang);
235            }
236        }
237        map
238    })
239}
240
241fn grammar_map() -> &'static HashMap<&'static str, &'static dyn Language> {
242    init_builtin();
243    GRAMMAR_MAP.get_or_init(|| {
244        let mut map = HashMap::new();
245        let langs = LANGUAGES.read().unwrap();
246        for lang in langs.iter() {
247            map.insert(lang.grammar_name(), *lang);
248        }
249        map
250    })
251}
252
253/// Get language support for a file extension.
254///
255/// Returns `None` if the extension is not recognized or the feature is not enabled.
256pub fn support_for_extension(ext: &str) -> Option<&'static dyn Language> {
257    extension_map()
258        .get(ext)
259        .or_else(|| extension_map().get(ext.to_lowercase().as_str()))
260        .copied()
261}
262
263/// Get language support by grammar name.
264///
265/// Returns `None` if the grammar is not recognized or the feature is not enabled.
266pub fn support_for_grammar(grammar: &str) -> Option<&'static dyn Language> {
267    grammar_map().get(grammar).copied()
268}
269
270/// Get language support from a file path.
271///
272/// Returns `None` if the file has no extension, the extension is not recognized,
273/// or the feature is not enabled.
274pub fn support_for_path(path: &Path) -> Option<&'static dyn Language> {
275    path.extension()
276        .and_then(|e| e.to_str())
277        .and_then(support_for_extension)
278}
279
280/// Get all supported languages.
281pub fn supported_languages() -> Vec<&'static dyn Language> {
282    init_builtin();
283    LANGUAGES.read().unwrap().clone()
284}
285
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GrammarLoader;

    /// Print every named, non-hidden node kind of one grammar (handy when
    /// fixing invalid kinds). The grammar is taken from the `DUMP_GRAMMAR`
    /// environment variable and falls back to `python`.
    /// Run with: cargo test -p rhizome-moss-languages dump_node_kinds -- --nocapture
    #[test]
    #[ignore]
    fn dump_node_kinds() {
        let loader = GrammarLoader::new();
        let grammar_name = std::env::var("DUMP_GRAMMAR").unwrap_or_else(|_| "python".to_string());
        let ts_lang = loader.get(&grammar_name).expect("grammar not found");

        println!("\n=== Valid node kinds for '{}' ===\n", grammar_name);
        let total = ts_lang.node_kind_count() as u16;
        (0..total)
            .filter_map(|id| {
                // Keep named kinds only, and drop the ones starting with '_'.
                ts_lang
                    .node_kind_for_id(id)
                    .filter(|kind| ts_lang.node_kind_is_named(id) && !kind.starts_with('_'))
            })
            .for_each(|kind| println!("{}", kind));
    }

    /// Check that every node kind returned by the `Language` trait methods
    /// really exists in the corresponding tree-sitter grammar.
    #[test]
    fn validate_node_kinds() {
        let loader = GrammarLoader::new();
        let mut errors: Vec<String> = Vec::new();

        for lang in supported_languages() {
            // Grammars that are not available in the search paths are skipped.
            let Some(ts_lang) = loader.get(lang.grammar_name()) else {
                continue;
            };

            // Each trait method paired with the kinds it reports.
            let groups: [(&str, &[&str]); 9] = [
                ("container_kinds", lang.container_kinds()),
                ("function_kinds", lang.function_kinds()),
                ("type_kinds", lang.type_kinds()),
                ("import_kinds", lang.import_kinds()),
                ("public_symbol_kinds", lang.public_symbol_kinds()),
                ("scope_creating_kinds", lang.scope_creating_kinds()),
                ("control_flow_kinds", lang.control_flow_kinds()),
                ("complexity_nodes", lang.complexity_nodes()),
                ("nesting_nodes", lang.nesting_nodes()),
            ];

            for (method, kinds) in groups {
                for kind in kinds {
                    // id_for_node_kind returns 0 for an unknown kind. Try the
                    // named form first, then the unnamed one (e.g. operators).
                    let known = ts_lang.id_for_node_kind(kind, true) != 0
                        || ts_lang.id_for_node_kind(kind, false) != 0;
                    if !known {
                        errors.push(format!(
                            "{}: {}() contains invalid node kind '{}'",
                            lang.name(),
                            method,
                            kind
                        ));
                    }
                }
            }
        }

        assert!(
            errors.is_empty(),
            "Found {} invalid node kinds:\n{}",
            errors.len(),
            errors.join("\n")
        );
    }

    /// Compare each grammar's node kinds with what its `Language`
    /// implementation uses, and print kinds that look useful but are unused.
    /// Run with: cargo test -p rhizome-moss-languages cross_check_node_kinds -- --nocapture --ignored
    #[test]
    #[ignore]
    fn cross_check_node_kinds() {
        use std::collections::HashSet;

        // Substrings that suggest a node kind might be useful.
        const INTERESTING_PATTERNS: &[&str] = &[
            "statement",
            "expression",
            "definition",
            "declaration",
            "clause",
            "block",
            "body",
            "import",
            "export",
            "function",
            "method",
            "class",
            "struct",
            "enum",
            "interface",
            "trait",
            "module",
            "type",
            "return",
            "if",
            "else",
            "for",
            "while",
            "loop",
            "match",
            "case",
            "try",
            "catch",
            "except",
            "throw",
            "raise",
            "with",
            "async",
            "await",
            "yield",
            "lambda",
            "comprehension",
            "generator",
            "operator",
        ];

        let loader = GrammarLoader::new();

        for lang in supported_languages() {
            let grammar_name = lang.grammar_name();
            let Some(ts_lang) = loader.get(grammar_name) else {
                continue;
            };

            // Union of every kind the language implementation reports.
            let groups: [&[&str]; 9] = [
                lang.container_kinds(),
                lang.function_kinds(),
                lang.type_kinds(),
                lang.import_kinds(),
                lang.public_symbol_kinds(),
                lang.scope_creating_kinds(),
                lang.control_flow_kinds(),
                lang.complexity_nodes(),
                lang.nesting_nodes(),
            ];
            let used_kinds: HashSet<&str> = groups
                .iter()
                .flat_map(|group| group.iter().copied())
                .collect();

            // Named grammar kinds that are unused and match an interesting pattern.
            let total = ts_lang.node_kind_count() as u16;
            let mut unused_interesting: Vec<&str> = (0..total)
                .filter_map(|id| {
                    ts_lang
                        .node_kind_for_id(id)
                        .filter(|kind| ts_lang.node_kind_is_named(id) && !kind.starts_with('_'))
                })
                .filter(|kind| !used_kinds.contains(*kind))
                .filter(|kind| {
                    let lower = kind.to_lowercase();
                    INTERESTING_PATTERNS.iter().any(|p| lower.contains(p))
                })
                .collect();
            unused_interesting.sort();

            if unused_interesting.is_empty() {
                continue;
            }

            println!(
                "\n=== {} ({}) - {} potentially useful unused kinds ===",
                lang.name(),
                grammar_name,
                unused_interesting.len()
            );
            for kind in &unused_interesting {
                println!("  {}", kind);
            }
        }
    }
}
501
502/// Validate that a language's unused node kinds audit is complete and accurate.
503///
504/// This function checks:
505/// 1. All kinds in `documented_unused` actually exist in the grammar
506/// 2. All potentially useful kinds from the grammar are either used or documented
507///
508/// Call this from each language's `unused_node_kinds_audit` test.
509pub fn validate_unused_kinds_audit(
510    lang: &dyn Language,
511    documented_unused: &[&str],
512) -> Result<(), String> {
513    use crate::GrammarLoader;
514    use std::collections::HashSet;
515
516    let loader = GrammarLoader::new();
517    let ts_lang = loader
518        .get(lang.grammar_name())
519        .ok_or_else(|| format!("Grammar '{}' not found", lang.grammar_name()))?;
520
521    // Keywords that suggest a node kind might be useful (same as cross_check_node_kinds)
522    let interesting_patterns = [
523        "statement",
524        "expression",
525        "definition",
526        "declaration",
527        "clause",
528        "block",
529        "body",
530        "import",
531        "export",
532        "function",
533        "method",
534        "class",
535        "struct",
536        "enum",
537        "interface",
538        "trait",
539        "module",
540        "type",
541        "return",
542        "if",
543        "else",
544        "for",
545        "while",
546        "loop",
547        "match",
548        "case",
549        "try",
550        "catch",
551        "except",
552        "throw",
553        "raise",
554        "with",
555        "async",
556        "await",
557        "yield",
558        "lambda",
559        "comprehension",
560        "generator",
561        "operator",
562    ];
563
564    // Collect all kinds used by Language trait methods
565    let mut used_kinds: HashSet<&str> = HashSet::new();
566    for kind in lang.container_kinds() {
567        used_kinds.insert(kind);
568    }
569    for kind in lang.function_kinds() {
570        used_kinds.insert(kind);
571    }
572    for kind in lang.type_kinds() {
573        used_kinds.insert(kind);
574    }
575    for kind in lang.import_kinds() {
576        used_kinds.insert(kind);
577    }
578    for kind in lang.public_symbol_kinds() {
579        used_kinds.insert(kind);
580    }
581    for kind in lang.scope_creating_kinds() {
582        used_kinds.insert(kind);
583    }
584    for kind in lang.control_flow_kinds() {
585        used_kinds.insert(kind);
586    }
587    for kind in lang.complexity_nodes() {
588        used_kinds.insert(kind);
589    }
590    for kind in lang.nesting_nodes() {
591        used_kinds.insert(kind);
592    }
593
594    let documented_set: HashSet<&str> = documented_unused.iter().copied().collect();
595
596    // Get all valid named node kinds from grammar
597    let mut grammar_kinds: HashSet<&str> = HashSet::new();
598    let count = ts_lang.node_kind_count();
599    for id in 0..count as u16 {
600        if let Some(kind) = ts_lang.node_kind_for_id(id) {
601            let named = ts_lang.node_kind_is_named(id);
602            if named && !kind.starts_with('_') {
603                grammar_kinds.insert(kind);
604            }
605        }
606    }
607
608    let mut errors: Vec<String> = Vec::new();
609
610    // Check 1: All documented unused kinds must exist in grammar
611    for kind in documented_unused {
612        if !grammar_kinds.contains(*kind) {
613            errors.push(format!(
614                "Documented kind '{}' doesn't exist in grammar",
615                kind
616            ));
617        }
618        // Also check it's not actually being used
619        if used_kinds.contains(*kind) {
620            errors.push(format!(
621                "Documented kind '{}' is actually used in trait methods",
622                kind
623            ));
624        }
625    }
626
627    // Check 2: All potentially useful grammar kinds must be used or documented
628    for kind in &grammar_kinds {
629        let lower = kind.to_lowercase();
630        let is_interesting = interesting_patterns.iter().any(|p| lower.contains(p));
631
632        if is_interesting && !used_kinds.contains(*kind) && !documented_set.contains(*kind) {
633            errors.push(format!(
634                "Potentially useful kind '{}' is neither used nor documented",
635                kind
636            ));
637        }
638    }
639
640    if errors.is_empty() {
641        Ok(())
642    } else {
643        Err(format!(
644            "{} validation errors:\n  - {}",
645            errors.len(),
646            errors.join("\n  - ")
647        ))
648    }
649}