Skip to main content

normalize_languages/
registry.rs

1//! Language support registry with extension-based lookup.
2
3use crate::Language;
4use std::collections::HashMap;
5use std::path::Path;
6use std::sync::{OnceLock, RwLock};
7
/// Global language registry.
///
/// Holds every language passed to `register`, in registration order. Wrapped
/// in an `RwLock` so registration (write) and lookups (read) can share it.
static LANGUAGES: RwLock<Vec<&'static dyn Language>> = RwLock::new(Vec::new());
/// One-shot guard used by `init_builtin` so built-in languages are registered only once.
static INITIALIZED: OnceLock<()> = OnceLock::new();

/// Cached extension → language lookup table.
///
/// Built on first use from the contents of `LANGUAGES` and not rebuilt afterwards.
static EXTENSION_MAP: OnceLock<HashMap<&'static str, &'static dyn Language>> = OnceLock::new();

/// Cached grammar_name → language lookup table.
///
/// Built on first use from the contents of `LANGUAGES` and not rebuilt afterwards.
static GRAMMAR_MAP: OnceLock<HashMap<&'static str, &'static dyn Language>> = OnceLock::new();
17
18/// Register a language in the global registry.
19/// Called internally by language modules.
20pub fn register(lang: &'static dyn Language) {
21    LANGUAGES
22        .write()
23        .unwrap_or_else(|e| e.into_inner())
24        .push(lang);
25}
26
/// Initialize built-in languages (called once).
///
/// Registers every language whose `lang-*` Cargo feature is enabled. The body
/// runs inside `INITIALIZED.get_or_init`, so calls after the first do nothing.
///
/// Registration order matters: `extension_map` inserts languages in this
/// order, so when two languages claim the same extension the one registered
/// later wins (see the Scheme/Query note below).
fn init_builtin() {
    INITIALIZED.get_or_init(|| {
        #[cfg(feature = "lang-python")]
        register(&crate::python::Python);
        #[cfg(feature = "lang-rust")]
        register(&crate::rust::Rust);
        #[cfg(feature = "lang-javascript")]
        register(&crate::javascript::JavaScript);
        // The TypeScript feature provides two languages: TypeScript and Tsx.
        #[cfg(feature = "lang-typescript")]
        {
            register(&crate::typescript::TypeScript);
            register(&crate::typescript::Tsx);
        }
        #[cfg(feature = "lang-go")]
        register(&crate::go::Go);
        #[cfg(feature = "lang-java")]
        register(&crate::java::Java);
        #[cfg(feature = "lang-kotlin")]
        register(&crate::kotlin::Kotlin);
        #[cfg(feature = "lang-csharp")]
        register(&crate::csharp::CSharp);
        #[cfg(feature = "lang-swift")]
        register(&crate::swift::Swift);
        #[cfg(feature = "lang-php")]
        register(&crate::php::Php);
        #[cfg(feature = "lang-dockerfile")]
        register(&crate::dockerfile::Dockerfile);
        #[cfg(feature = "lang-c")]
        register(&crate::c::C);
        #[cfg(feature = "lang-cpp")]
        register(&crate::cpp::Cpp);
        #[cfg(feature = "lang-ruby")]
        register(&crate::ruby::Ruby);
        #[cfg(feature = "lang-scala")]
        register(&crate::scala::Scala);
        #[cfg(feature = "lang-vue")]
        register(&crate::vue::Vue);
        #[cfg(feature = "lang-markdown")]
        register(&crate::markdown::Markdown);
        #[cfg(feature = "lang-json")]
        register(&crate::json::Json);
        #[cfg(feature = "lang-yaml")]
        register(&crate::yaml::Yaml);
        #[cfg(feature = "lang-toml")]
        register(&crate::toml::Toml);
        #[cfg(feature = "lang-html")]
        register(&crate::html::Html);
        #[cfg(feature = "lang-css")]
        register(&crate::css::Css);
        #[cfg(feature = "lang-bash")]
        register(&crate::bash::Bash);
        #[cfg(feature = "lang-lua")]
        register(&crate::lua::Lua);
        #[cfg(feature = "lang-zig")]
        register(&crate::zig::Zig);
        #[cfg(feature = "lang-elixir")]
        register(&crate::elixir::Elixir);
        #[cfg(feature = "lang-erlang")]
        register(&crate::erlang::Erlang);
        #[cfg(feature = "lang-dart")]
        register(&crate::dart::Dart);
        #[cfg(feature = "lang-fsharp")]
        register(&crate::fsharp::FSharp);
        #[cfg(feature = "lang-sql")]
        register(&crate::sql::Sql);
        #[cfg(feature = "lang-graphql")]
        register(&crate::graphql::GraphQL);
        #[cfg(feature = "lang-hcl")]
        register(&crate::hcl::Hcl);
        #[cfg(feature = "lang-scss")]
        register(&crate::scss::Scss);
        #[cfg(feature = "lang-svelte")]
        register(&crate::svelte::Svelte);
        #[cfg(feature = "lang-xml")]
        register(&crate::xml::Xml);
        #[cfg(feature = "lang-clojure")]
        register(&crate::clojure::Clojure);
        #[cfg(feature = "lang-haskell")]
        register(&crate::haskell::Haskell);
        #[cfg(feature = "lang-ocaml")]
        register(&crate::ocaml::OCaml);
        #[cfg(feature = "lang-nix")]
        register(&crate::nix::Nix);
        #[cfg(feature = "lang-perl")]
        register(&crate::perl::Perl);
        #[cfg(feature = "lang-r")]
        register(&crate::r::R);
        #[cfg(feature = "lang-julia")]
        register(&crate::julia::Julia);
        #[cfg(feature = "lang-elm")]
        register(&crate::elm::Elm);
        #[cfg(feature = "lang-cmake")]
        register(&crate::cmake::CMake);
        #[cfg(feature = "lang-vim")]
        register(&crate::vim::Vim);
        #[cfg(feature = "lang-awk")]
        register(&crate::awk::Awk);
        #[cfg(feature = "lang-fish")]
        register(&crate::fish::Fish);
        #[cfg(feature = "lang-jq")]
        register(&crate::jq::Jq);
        #[cfg(feature = "lang-powershell")]
        register(&crate::powershell::PowerShell);
        #[cfg(feature = "lang-zsh")]
        register(&crate::zsh::Zsh);
        #[cfg(feature = "lang-groovy")]
        register(&crate::groovy::Groovy);
        #[cfg(feature = "lang-glsl")]
        register(&crate::glsl::Glsl);
        #[cfg(feature = "lang-hlsl")]
        register(&crate::hlsl::Hlsl);
        #[cfg(feature = "lang-commonlisp")]
        register(&crate::commonlisp::CommonLisp);
        #[cfg(feature = "lang-elisp")]
        register(&crate::elisp::Elisp);
        #[cfg(feature = "lang-gleam")]
        register(&crate::gleam::Gleam);
        #[cfg(feature = "lang-ini")]
        register(&crate::ini::Ini);
        #[cfg(feature = "lang-diff")]
        register(&crate::diff::Diff);
        #[cfg(feature = "lang-dot")]
        register(&crate::dot::Dot);
        #[cfg(feature = "lang-kdl")]
        register(&crate::kdl::Kdl);
        #[cfg(feature = "lang-ada")]
        register(&crate::ada::Ada);
        #[cfg(feature = "lang-agda")]
        register(&crate::agda::Agda);
        #[cfg(feature = "lang-d")]
        register(&crate::d::D);
        #[cfg(feature = "lang-matlab")]
        register(&crate::matlab::Matlab);
        #[cfg(feature = "lang-meson")]
        register(&crate::meson::Meson);
        #[cfg(feature = "lang-nginx")]
        register(&crate::nginx::Nginx);
        #[cfg(feature = "lang-prolog")]
        register(&crate::prolog::Prolog);
        #[cfg(feature = "lang-batch")]
        register(&crate::batch::Batch);
        #[cfg(feature = "lang-asm")]
        register(&crate::asm::Asm);
        #[cfg(feature = "lang-objc")]
        register(&crate::objc::ObjC);
        #[cfg(feature = "lang-typst")]
        register(&crate::typst::Typst);
        #[cfg(feature = "lang-asciidoc")]
        register(&crate::asciidoc::AsciiDoc);
        #[cfg(feature = "lang-vb")]
        register(&crate::vb::VB);
        #[cfg(feature = "lang-idris")]
        register(&crate::idris::Idris);
        #[cfg(feature = "lang-rescript")]
        register(&crate::rescript::ReScript);
        #[cfg(feature = "lang-lean")]
        register(&crate::lean::Lean);
        #[cfg(feature = "lang-caddy")]
        register(&crate::caddy::Caddy);
        #[cfg(feature = "lang-capnp")]
        register(&crate::capnp::Capnp);
        #[cfg(feature = "lang-devicetree")]
        register(&crate::devicetree::DeviceTree);
        #[cfg(feature = "lang-jinja2")]
        register(&crate::jinja2::Jinja2);
        #[cfg(feature = "lang-ninja")]
        register(&crate::ninja::Ninja);
        #[cfg(feature = "lang-postscript")]
        register(&crate::postscript::PostScript);
        #[cfg(feature = "lang-query")]
        register(&crate::query::Query);
        // Scheme registered after Query so .scm → Scheme (not Query) in extension_map
        #[cfg(feature = "lang-scheme")]
        register(&crate::scheme::Scheme);
        #[cfg(feature = "lang-ron")]
        register(&crate::ron::Ron);
        #[cfg(feature = "lang-sparql")]
        register(&crate::sparql::Sparql);
        #[cfg(feature = "lang-sshconfig")]
        register(&crate::sshconfig::SshConfig);
        #[cfg(feature = "lang-starlark")]
        register(&crate::starlark::Starlark);
        #[cfg(feature = "lang-textproto")]
        register(&crate::textproto::TextProto);
        #[cfg(feature = "lang-thrift")]
        register(&crate::thrift::Thrift);
        #[cfg(feature = "lang-tlaplus")]
        register(&crate::tlaplus::TlaPlus);
        #[cfg(feature = "lang-uiua")]
        register(&crate::uiua::Uiua);
        #[cfg(feature = "lang-verilog")]
        register(&crate::verilog::Verilog);
        #[cfg(feature = "lang-vhdl")]
        register(&crate::vhdl::Vhdl);
        #[cfg(feature = "lang-wit")]
        register(&crate::wit::Wit);
        #[cfg(feature = "lang-x86asm")]
        register(&crate::x86asm::X86Asm);
        #[cfg(feature = "lang-yuri")]
        register(&crate::yuri::Yuri);
    });
}
230
231fn extension_map() -> &'static HashMap<&'static str, &'static dyn Language> {
232    init_builtin();
233    EXTENSION_MAP.get_or_init(|| {
234        let mut map = HashMap::new();
235        let langs = LANGUAGES.read().unwrap_or_else(|e| e.into_inner());
236        for lang in langs.iter() {
237            for ext in lang.extensions() {
238                map.insert(*ext, *lang);
239            }
240        }
241        map
242    })
243}
244
245fn grammar_map() -> &'static HashMap<&'static str, &'static dyn Language> {
246    init_builtin();
247    GRAMMAR_MAP.get_or_init(|| {
248        let mut map = HashMap::new();
249        let langs = LANGUAGES.read().unwrap_or_else(|e| e.into_inner());
250        for lang in langs.iter() {
251            map.insert(lang.grammar_name(), *lang);
252        }
253        map
254    })
255}
256
257/// Get language support for a file extension.
258///
259/// Returns `None` if the extension is not recognized or the feature is not enabled.
260pub fn support_for_extension(ext: &str) -> Option<&'static dyn Language> {
261    extension_map()
262        .get(ext)
263        .or_else(|| extension_map().get(ext.to_lowercase().as_str()))
264        .copied()
265}
266
267/// Get language support by grammar name.
268///
269/// Returns `None` if the grammar is not recognized or the feature is not enabled.
270pub fn support_for_grammar(grammar: &str) -> Option<&'static dyn Language> {
271    grammar_map().get(grammar).copied()
272}
273
274/// Get language support from a file path.
275///
276/// Returns `None` if the file has no extension, the extension is not recognized,
277/// or the feature is not enabled.
278pub fn support_for_path(path: &Path) -> Option<&'static dyn Language> {
279    path.extension()
280        .and_then(|e| e.to_str())
281        .and_then(support_for_extension)
282}
283
284/// Check if a file path is a dedicated test file for its language.
285///
286/// Returns false for unknown file types or languages that use inline tests.
287/// Matches against the language's `test_file_globs()` patterns.
288pub fn is_test_path(path: &Path) -> bool {
289    let lang = match support_for_path(path) {
290        Some(l) => l,
291        None => return false,
292    };
293    let globs = lang.test_file_globs();
294    if globs.is_empty() {
295        return false;
296    }
297    let mut builder = globset::GlobSetBuilder::new();
298    for g in globs {
299        if let Ok(glob) = globset::Glob::new(g) {
300            builder.add(glob);
301        }
302    }
303    let Ok(set) = builder.build() else {
304        return false;
305    };
306    set.is_match(path)
307}
308
309/// Get all glob patterns that identify test files for a given language extension.
310pub fn test_file_globs_for_path(path: &Path) -> &'static [&'static str] {
311    support_for_path(path)
312        .map(|lang| lang.test_file_globs())
313        .unwrap_or(&[])
314}
315
316/// Get all supported languages.
317pub fn supported_languages() -> Vec<&'static dyn Language> {
318    init_builtin();
319    LANGUAGES.read().unwrap_or_else(|e| e.into_inner()).clone()
320}
321
322/// Check if a path is a programming language (not a data/config format).
323///
324/// Returns false for data formats like JSON, YAML, TOML, Markdown, etc.
325/// even though normalize-languages can parse them for syntax highlighting.
326///
327/// Useful for architecture analysis where only "code" files are relevant.
328/// Uses `normalize_language_meta::capabilities_for()` to determine if a
329/// language is executable code.
330pub fn is_programming_language(path: &Path) -> bool {
331    let lang = match support_for_path(path) {
332        Some(l) => l,
333        None => return false,
334    };
335
336    let caps = normalize_language_meta::capabilities_for(lang.name());
337    caps.executable
338}
339
340/// Validate that a language's unused node kinds audit is complete and accurate.
341///
342/// This function checks:
343/// 1. All kinds in `documented_unused` actually exist in the grammar
344/// 2. All potentially useful kinds from the grammar are either used or documented
345///
346/// Call this from each language's `unused_node_kinds_audit` test.
347pub fn validate_unused_kinds_audit(
348    lang: &dyn Language,
349    documented_unused: &[&str],
350) -> Result<(), String> {
351    use crate::GrammarLoader;
352    use crate::grammar_loader::GrammarLoadError;
353    use std::collections::HashSet;
354
355    let loader = GrammarLoader::new();
356    let ts_lang = match loader.get(lang.grammar_name()) {
357        Ok(l) => l,
358        // Grammar `.so` not present — typical in `cargo test` without
359        // `cargo xtask build-grammars`. Skip the audit instead of panicking.
360        // Real audit failures (loaded but mismatched) still surface.
361        Err(GrammarLoadError::NotFound(_)) => return Ok(()),
362        Err(e) => return Err(format!("Grammar '{}' not found: {e}", lang.grammar_name())),
363    };
364
365    // Keywords that suggest a node kind might be useful (same as cross_check_node_kinds)
366    let interesting_patterns = [
367        "statement",
368        "expression",
369        "definition",
370        "declaration",
371        "clause",
372        "block",
373        "body",
374        "import",
375        "export",
376        "function",
377        "method",
378        "class",
379        "struct",
380        "enum",
381        "interface",
382        "trait",
383        "module",
384        "type",
385        "return",
386        "if",
387        "else",
388        "for",
389        "while",
390        "loop",
391        "match",
392        "case",
393        "try",
394        "catch",
395        "except",
396        "throw",
397        "raise",
398        "with",
399        "async",
400        "await",
401        "yield",
402        "lambda",
403        "comprehension",
404        "generator",
405        "operator",
406    ];
407
408    // Collect kinds referenced in tags.scm
409    let tags_kinds: HashSet<String> = {
410        let mut kinds = HashSet::new();
411        if let Some(tags_content) = loader.get_tags(lang.grammar_name()) {
412            // Extract top-level node kind names: lines starting with "(<identifier>"
413            // These are the patterns like "(function_definition ..." in the query
414            for line in tags_content.lines() {
415                let trimmed = line.trim_start();
416                if trimmed.starts_with('(')
417                    && !trimmed.starts_with(";;")
418                    && !trimmed.starts_with(";")
419                {
420                    // Extract the first word after the opening paren
421                    let inner = &trimmed[1..];
422                    let kind_name: String = inner
423                        .chars()
424                        .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '-')
425                        .collect();
426                    if !kind_name.is_empty() && !kind_name.starts_with('@') {
427                        kinds.insert(kind_name);
428                    }
429                }
430            }
431        }
432        kinds
433    };
434
435    let documented_set: HashSet<&str> = documented_unused.iter().copied().collect();
436
437    // Get all valid named node kinds from grammar
438    let mut grammar_kinds: HashSet<&str> = HashSet::new();
439    let count = ts_lang.node_kind_count();
440    for id in 0..count as u16 {
441        if let Some(kind) = ts_lang.node_kind_for_id(id) {
442            let named = ts_lang.node_kind_is_named(id);
443            if named && !kind.starts_with('_') {
444                grammar_kinds.insert(kind);
445            }
446        }
447    }
448
449    let mut errors: Vec<String> = Vec::new();
450
451    // Check 1: All documented unused kinds must exist in grammar
452    for kind in documented_unused {
453        if !grammar_kinds.contains(*kind) {
454            errors.push(format!(
455                "Documented kind '{}' doesn't exist in grammar",
456                kind
457            ));
458        }
459        // Also check it's not actually being used (in tags.scm)
460        if tags_kinds.contains(*kind) {
461            errors.push(format!(
462                "Documented kind '{}' is actually used in tags.scm",
463                kind
464            ));
465        }
466    }
467
468    // Check 2: All potentially useful grammar kinds must be used or documented
469    for kind in &grammar_kinds {
470        let lower = kind.to_lowercase();
471        let is_interesting = interesting_patterns.iter().any(|p| lower.contains(p));
472
473        if is_interesting && !tags_kinds.contains(*kind) && !documented_set.contains(*kind) {
474            errors.push(format!(
475                "Potentially useful kind '{}' is neither used nor documented",
476                kind
477            ));
478        }
479    }
480
481    if errors.is_empty() {
482        Ok(())
483    } else {
484        Err(format!(
485            "{} validation errors:\n  - {}",
486            errors.len(),
487            errors.join("\n  - ")
488        ))
489    }
490}
491
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GrammarLoader;

    /// Substrings that hint a node kind may be worth handling.
    const INTERESTING_PATTERNS: &[&str] = &[
        "statement",
        "expression",
        "definition",
        "declaration",
        "clause",
        "block",
        "body",
        "import",
        "export",
        "function",
        "method",
        "class",
        "struct",
        "enum",
        "interface",
        "trait",
        "module",
        "type",
        "return",
        "if",
        "else",
        "for",
        "while",
        "loop",
        "match",
        "case",
        "try",
        "catch",
        "except",
        "throw",
        "raise",
        "with",
        "async",
        "await",
        "yield",
        "lambda",
        "comprehension",
        "generator",
        "operator",
    ];

    /// Print every named node kind of one grammar, skipping kinds that start
    /// with `_` (handy when fixing invalid kinds).
    ///
    /// The grammar is taken from the `DUMP_GRAMMAR` environment variable and
    /// defaults to `python`. The test is `#[ignore]`d, so run it with:
    /// cargo test -p rhizome-normalize-languages dump_node_kinds -- --nocapture --ignored
    #[test]
    #[ignore]
    fn dump_node_kinds() {
        let loader = GrammarLoader::new();
        let grammar_name = match std::env::var("DUMP_GRAMMAR") {
            Ok(name) => name,
            Err(_) => "python".to_string(),
        };

        let ts_lang = loader.get(&grammar_name).expect("grammar not found");

        println!("\n=== Valid node kinds for '{}' ===\n", grammar_name);
        let count = ts_lang.node_kind_count();
        (0..count as u16)
            .filter_map(|id| {
                let kind = ts_lang.node_kind_for_id(id)?;
                (ts_lang.node_kind_is_named(id) && !kind.starts_with('_')).then_some(kind)
            })
            .for_each(|kind| println!("{}", kind));
    }

    /// Placeholder for validating node kinds returned by Language trait methods
    /// against the tree-sitter grammar.
    ///
    /// No trait methods return node kind lists any more —
    /// export detection now uses tags.scm queries exclusively —
    /// so this test is intentionally empty.
    #[test]
    fn validate_node_kinds() {
        // Nothing to validate — node kind lists were removed from the Language trait.
    }

    /// Cross-check grammar node kinds against Language implementations and
    /// print the potentially useful kinds that exist in a grammar but aren't used.
    ///
    /// Languages whose grammar cannot be loaded are skipped.
    /// Run with: cargo test -p rhizome-normalize-languages cross_check_node_kinds -- --nocapture --ignored
    #[test]
    #[ignore]
    fn cross_check_node_kinds() {
        use std::collections::HashSet;

        let loader = GrammarLoader::new();

        for lang in supported_languages() {
            let grammar_name = lang.grammar_name();
            let Ok(ts_lang) = loader.get(grammar_name) else {
                continue;
            };

            // Kinds currently used by the language. Always empty now:
            // public_symbol_kinds() was removed — export detection uses tags.scm exclusively.
            let used_kinds: HashSet<&str> = HashSet::new();

            // Walk all named node kinds of the grammar (skipping `_`-prefixed
            // ones) and keep those that are unused and look interesting.
            let count = ts_lang.node_kind_count();
            let mut unused_interesting: Vec<&str> = (0..count as u16)
                .filter_map(|id| {
                    let kind = ts_lang.node_kind_for_id(id)?;
                    (ts_lang.node_kind_is_named(id) && !kind.starts_with('_')).then_some(kind)
                })
                .filter(|kind| !used_kinds.contains(*kind))
                .filter(|kind| {
                    let lower = kind.to_lowercase();
                    INTERESTING_PATTERNS.iter().any(|p| lower.contains(p))
                })
                .collect();

            unused_interesting.sort();

            if unused_interesting.is_empty() {
                continue;
            }

            println!(
                "\n=== {} ({}) - {} potentially useful unused kinds ===",
                lang.name(),
                grammar_name,
                unused_interesting.len()
            );
            for kind in &unused_interesting {
                println!("  {}", kind);
            }
        }
    }
}