Skip to main content

fresh/primitives/grammar/
types.rs

1//! Pure grammar registry types without I/O operations.
2//!
3//! This module contains the `GrammarRegistry` struct and all syntax lookup methods
4//! that don't require filesystem access. This enables WASM compatibility and easier testing.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::sync::Arc;
10use syntect::parsing::{SyntaxDefinition, SyntaxReference, SyntaxSet, SyntaxSetBuilder};
11
12// Re-export glob matching utilities for use by other modules
13pub use crate::primitives::glob_match::{
14    filename_glob_matches, is_glob_pattern, is_path_pattern, path_glob_matches,
15};
16
/// Describes one grammar to load: the language it serves, the grammar file,
/// and the file extensions that should resolve to it.
///
/// A named struct is used (rather than an anonymous tuple) so the plugin
/// layer, the loader, and the registry can exchange grammar information
/// without relying on positional fields.
#[derive(Debug, Clone)]
pub struct GrammarSpec {
    /// Language identifier, e.g. `"elixir"`.
    pub language: String,
    /// Location of the grammar file (`.sublime-syntax`).
    pub path: PathBuf,
    /// File extensions associated with this grammar, e.g. `["ex", "exs"]`.
    pub extensions: Vec<String>,
}
30
31/// Where a grammar was loaded from.
32#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
33#[serde(tag = "type")]
34pub enum GrammarSource {
35    /// Built-in to Fresh (pre-compiled syntect defaults + embedded grammars)
36    #[serde(rename = "built-in")]
37    BuiltIn,
38    /// Installed from a user grammar directory (~/.config/fresh/grammars/)
39    #[serde(rename = "user")]
40    User { path: PathBuf },
41    /// From a language pack (~/.config/fresh/languages/packages/)
42    #[serde(rename = "language-pack")]
43    LanguagePack { name: String, path: PathBuf },
44    /// From a bundle package (~/.config/fresh/bundles/packages/)
45    #[serde(rename = "bundle")]
46    Bundle { name: String, path: PathBuf },
47    /// Registered by a plugin at runtime
48    #[serde(rename = "plugin")]
49    Plugin { plugin: String, path: PathBuf },
50}
51
52impl std::fmt::Display for GrammarSource {
53    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
54        match self {
55            GrammarSource::BuiltIn => write!(f, "built-in"),
56            GrammarSource::User { path } => write!(f, "user ({})", path.display()),
57            GrammarSource::LanguagePack { name, .. } => write!(f, "language-pack ({})", name),
58            GrammarSource::Bundle { name, .. } => write!(f, "bundle ({})", name),
59            GrammarSource::Plugin { plugin, .. } => write!(f, "plugin ({})", plugin),
60        }
61    }
62}
63
64/// Information about an available grammar, including its provenance.
65#[derive(Clone, Debug, Serialize, Deserialize)]
66pub struct GrammarInfo {
67    /// The grammar name as used in config files (case-insensitive matching)
68    pub name: String,
69    /// Where this grammar was loaded from
70    pub source: GrammarSource,
71    /// File extensions associated with this grammar
72    pub file_extensions: Vec<String>,
73    /// Optional short name alias (e.g., "bash" for "Bourne Again Shell (bash)")
74    #[serde(default, skip_serializing_if = "Option::is_none")]
75    pub short_name: Option<String>,
76}
77
78/// Bridge between syntect display names and `fresh_languages::Language`.
79///
80/// Most syntect grammars map one-to-one: "Rust" → `Language::Rust`. A few
81/// have verbose display names that don't match the tree-sitter enum's
82/// `display_name()`, and `Language::from_name` has fuzzy "contains shell"
83/// fallbacks that would wrongly tag Nushell as tree-sitter Bash. This is
84/// the one place we spell the exceptions out explicitly.
85const SYNTECT_TO_TREE_SITTER_ALIASES: &[(&str, fresh_languages::Language)] =
86    &[("Bourne Again Shell (bash)", fresh_languages::Language::Bash)];
87
88/// Resolve a syntect syntax display name to a tree-sitter language, using
89/// strict equality against the alias table and `Language::display_name()`.
90fn tree_sitter_for_syntect_name(display_name: &str) -> Option<fresh_languages::Language> {
91    for (syntect_name, lang) in SYNTECT_TO_TREE_SITTER_ALIASES {
92        if *syntect_name == display_name {
93            return Some(*lang);
94        }
95    }
96    fresh_languages::Language::all()
97        .iter()
98        .find(|l| l.display_name() == display_name)
99        .copied()
100}
101
102/// Which highlighters can serve a given `GrammarEntry`.
103///
104/// A catalog entry may come from syntect (a TextMate grammar indexed into
105/// `SyntaxSet`), tree-sitter (a `fresh_languages::Language`), or both.
106#[derive(Clone, Debug, Default)]
107pub struct GrammarEngines {
108    /// Index into `GrammarRegistry::syntax_set().syntaxes()`, if a syntect
109    /// grammar is available.
110    pub syntect: Option<usize>,
111    /// Tree-sitter language, if one is registered for this grammar.
112    pub tree_sitter: Option<fresh_languages::Language>,
113}
114
115/// A single entry in the unified grammar catalog.
116///
117/// Each entry represents one logical language (e.g. "Rust", "TypeScript") and
118/// records which highlighting engines can serve it, plus the names/extensions
119/// used to look it up. The catalog is the single source of truth for grammar
120/// lookups — `find_by_name`, `find_by_path`, `find_by_extension` all return
121/// entries from here, and both `HighlightEngine::from_entry` and
122/// `DetectedLanguage::from_entry` consume them.
123#[derive(Clone, Debug)]
124pub struct GrammarEntry {
125    /// Human-readable display name (e.g. "TypeScript", "Bourne Again Shell (bash)").
126    pub display_name: String,
127    /// Canonical language ID used in config and LSP (e.g. "typescript", "csharp").
128    pub language_id: String,
129    /// Short alias, if one exists (e.g. "ts" for TypeScript).
130    pub short_name: Option<String>,
131    /// File extensions (without leading dot).
132    pub extensions: Vec<String>,
133    /// Exact filenames that map to this grammar (e.g. "Dockerfile").
134    pub filenames: Vec<String>,
135    /// Filename globs from user config (e.g. "*.conf", "/etc/**/rc.*").
136    pub filename_globs: Vec<String>,
137    /// Where this grammar was loaded from.
138    pub source: GrammarSource,
139    /// Highlighters that can serve this entry.
140    pub engines: GrammarEngines,
141}
142
143/// Embedded TOML grammar (syntect doesn't include one)
144pub const TOML_GRAMMAR: &str = include_str!("../../grammars/toml.sublime-syntax");
145
146/// Embedded Odin grammar (syntect doesn't include one)
147/// From: https://github.com/Tetralux/sublime-odin (MIT License)
148pub const ODIN_GRAMMAR: &str = include_str!("../../grammars/odin/Odin.sublime-syntax");
149
150/// Embedded Zig grammar (syntect doesn't include one)
151pub const ZIG_GRAMMAR: &str = include_str!("../../grammars/zig.sublime-syntax");
152
153/// Embedded Git Rebase Todo grammar for interactive rebase
154pub const GIT_REBASE_GRAMMAR: &str = include_str!("../../grammars/git-rebase.sublime-syntax");
155
156/// Embedded Git Commit Message grammar for COMMIT_EDITMSG, MERGE_MSG, etc.
157pub const GIT_COMMIT_GRAMMAR: &str = include_str!("../../grammars/git-commit.sublime-syntax");
158
159/// Embedded Gitignore grammar for .gitignore and similar files
160pub const GITIGNORE_GRAMMAR: &str = include_str!("../../grammars/gitignore.sublime-syntax");
161
162/// Embedded Git Config grammar for .gitconfig, .gitmodules
163pub const GITCONFIG_GRAMMAR: &str = include_str!("../../grammars/gitconfig.sublime-syntax");
164
165/// Embedded Git Attributes grammar for .gitattributes
166pub const GITATTRIBUTES_GRAMMAR: &str = include_str!("../../grammars/gitattributes.sublime-syntax");
167
168/// Embedded Typst grammar (syntect doesn't include one)
169pub const TYPST_GRAMMAR: &str = include_str!("../../grammars/typst.sublime-syntax");
170
171/// Embedded Dockerfile grammar
172pub const DOCKERFILE_GRAMMAR: &str = include_str!("../../grammars/dockerfile.sublime-syntax");
173/// Embedded INI grammar (also handles .env, .cfg, .editorconfig, etc.)
174pub const INI_GRAMMAR: &str = include_str!("../../grammars/ini.sublime-syntax");
175/// Embedded CMake grammar
176pub const CMAKE_GRAMMAR: &str = include_str!("../../grammars/cmake.sublime-syntax");
177/// Embedded SCSS grammar
178pub const SCSS_GRAMMAR: &str = include_str!("../../grammars/scss.sublime-syntax");
179/// Embedded LESS grammar
180pub const LESS_GRAMMAR: &str = include_str!("../../grammars/less.sublime-syntax");
181/// Embedded PowerShell grammar
182pub const POWERSHELL_GRAMMAR: &str = include_str!("../../grammars/powershell.sublime-syntax");
183/// Embedded Kotlin grammar
184pub const KOTLIN_GRAMMAR: &str = include_str!("../../grammars/kotlin.sublime-syntax");
185/// Embedded Swift grammar
186pub const SWIFT_GRAMMAR: &str = include_str!("../../grammars/swift.sublime-syntax");
187/// Embedded Dart grammar
188pub const DART_GRAMMAR: &str = include_str!("../../grammars/dart.sublime-syntax");
189/// Embedded Elixir grammar
190pub const ELIXIR_GRAMMAR: &str = include_str!("../../grammars/elixir.sublime-syntax");
191/// Embedded F# grammar
192pub const FSHARP_GRAMMAR: &str = include_str!("../../grammars/fsharp.sublime-syntax");
193/// Embedded Nix grammar
194pub const NIX_GRAMMAR: &str = include_str!("../../grammars/nix.sublime-syntax");
195/// Embedded HCL/Terraform grammar
196pub const HCL_GRAMMAR: &str = include_str!("../../grammars/hcl.sublime-syntax");
197/// Embedded Protocol Buffers grammar
198pub const PROTOBUF_GRAMMAR: &str = include_str!("../../grammars/protobuf.sublime-syntax");
199/// Embedded GraphQL grammar
200pub const GRAPHQL_GRAMMAR: &str = include_str!("../../grammars/graphql.sublime-syntax");
201/// Embedded Julia grammar
202pub const JULIA_GRAMMAR: &str = include_str!("../../grammars/julia.sublime-syntax");
203/// Embedded Nim grammar
204pub const NIM_GRAMMAR: &str = include_str!("../../grammars/nim.sublime-syntax");
205/// Embedded Gleam grammar
206pub const GLEAM_GRAMMAR: &str = include_str!("../../grammars/gleam.sublime-syntax");
207/// Embedded V language grammar
208pub const VLANG_GRAMMAR: &str = include_str!("../../grammars/vlang.sublime-syntax");
209/// Embedded Solidity grammar
210pub const SOLIDITY_GRAMMAR: &str = include_str!("../../grammars/solidity.sublime-syntax");
211/// Embedded KDL grammar
212pub const KDL_GRAMMAR: &str = include_str!("../../grammars/kdl.sublime-syntax");
213/// Embedded Nushell grammar
214pub const NUSHELL_GRAMMAR: &str = include_str!("../../grammars/nushell.sublime-syntax");
215/// Embedded Starlark/Bazel grammar
216pub const STARLARK_GRAMMAR: &str = include_str!("../../grammars/starlark.sublime-syntax");
217/// Embedded Justfile grammar
218pub const JUSTFILE_GRAMMAR: &str = include_str!("../../grammars/justfile.sublime-syntax");
219/// Embedded Earthfile grammar
220pub const EARTHFILE_GRAMMAR: &str = include_str!("../../grammars/earthfile.sublime-syntax");
221/// Embedded Go Module grammar
222pub const GOMOD_GRAMMAR: &str = include_str!("../../grammars/gomod.sublime-syntax");
223/// Embedded Vue grammar
224pub const VUE_GRAMMAR: &str = include_str!("../../grammars/vue.sublime-syntax");
225/// Embedded Svelte grammar
226pub const SVELTE_GRAMMAR: &str = include_str!("../../grammars/svelte.sublime-syntax");
227/// Embedded Astro grammar
228pub const ASTRO_GRAMMAR: &str = include_str!("../../grammars/astro.sublime-syntax");
229/// Embedded Hyprlang grammar (Hyprland config)
230pub const HYPRLANG_GRAMMAR: &str = include_str!("../../grammars/hyprlang.sublime-syntax");
231/// Embedded AutoHotkey grammar
232/// From: https://github.com/SALZKARTOFFEEEL/ahk-sublime-syntax (MIT License)
233pub const AUTOHOTKEY_GRAMMAR: &str =
234    include_str!("../../grammars/autohotkey/AutoHotkey.sublime-syntax");
235
236/// Registry of all available TextMate grammars.
237///
238/// This struct holds the compiled syntax set and provides lookup methods.
239/// It does not perform I/O directly - use `GrammarLoader` for loading grammars.
240impl std::fmt::Debug for GrammarRegistry {
241    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
242        f.debug_struct("GrammarRegistry")
243            .field("syntax_count", &self.syntax_set.syntaxes().len())
244            .finish()
245    }
246}
247
248pub struct GrammarRegistry {
249    /// Combined syntax set (built-in + embedded + user grammars)
250    syntax_set: Arc<SyntaxSet>,
251    /// Extension -> scope name mapping for user grammars (takes priority)
252    user_extensions: HashMap<String, String>,
253    /// Filename -> scope name mapping for dotfiles and special files
254    filename_scopes: HashMap<String, String>,
255    /// Paths to dynamically loaded grammar files (for reloading when adding more)
256    loaded_grammar_paths: Vec<GrammarSpec>,
257    /// Provenance info for each grammar (keyed by grammar name)
258    grammar_sources: HashMap<String, GrammarInfo>,
259    /// Short name aliases: lowercase short_name -> full syntect grammar name.
260    /// Provides a deterministic, one-to-one mapping so users can write
261    /// `grammar = "bash"` instead of `grammar = "Bourne Again Shell (bash)"`.
262    aliases: HashMap<String, String>,
263    /// Unified catalog of every known grammar. Rebuilt whenever the syntax set
264    /// or alias table changes. Lookups (`find_by_name`, `find_by_path`, ...)
265    /// all resolve against this.
266    catalog: Vec<GrammarEntry>,
267    /// Index from lowercased lookup keys (display name, language_id, short_name)
268    /// to catalog index.
269    catalog_by_name: HashMap<String, usize>,
270    /// Index from file extension (without dot) to catalog index.
271    catalog_by_extension: HashMap<String, usize>,
272    /// Index from filename to catalog index.
273    catalog_by_filename: HashMap<String, usize>,
274    /// The most recent language config handed to `apply_language_config`.
275    /// Retained so `rebuild_catalog` can replay it — otherwise a rebuild
276    /// (triggered by e.g. `populate_built_in_aliases`) silently wipes user
277    /// `[languages]` config that was merged on top.
278    applied_language_config: HashMap<String, crate::config::LanguageConfig>,
279}
280
281impl GrammarRegistry {
282    /// Create a new GrammarRegistry from pre-built components.
283    ///
284    /// This is typically called by `GrammarLoader` implementations after
285    /// loading grammars from various sources.
286    pub(crate) fn new(
287        syntax_set: SyntaxSet,
288        user_extensions: HashMap<String, String>,
289        filename_scopes: HashMap<String, String>,
290    ) -> Self {
291        Self::new_with_loaded_paths(
292            syntax_set,
293            user_extensions,
294            filename_scopes,
295            Vec::new(),
296            HashMap::new(),
297        )
298    }
299
300    /// Create a GrammarRegistry with pre-loaded grammar path tracking.
301    ///
302    /// Used by the loader when plugin grammars were included in the initial build,
303    /// so that `loaded_grammar_paths()` reflects what was actually loaded.
304    pub(crate) fn new_with_loaded_paths(
305        syntax_set: SyntaxSet,
306        user_extensions: HashMap<String, String>,
307        filename_scopes: HashMap<String, String>,
308        loaded_grammar_paths: Vec<GrammarSpec>,
309        grammar_sources: HashMap<String, GrammarInfo>,
310    ) -> Self {
311        let mut reg = Self {
312            syntax_set: Arc::new(syntax_set),
313            user_extensions,
314            filename_scopes,
315            loaded_grammar_paths,
316            grammar_sources,
317            aliases: HashMap::new(),
318            catalog: Vec::new(),
319            catalog_by_name: HashMap::new(),
320            catalog_by_extension: HashMap::new(),
321            catalog_by_filename: HashMap::new(),
322            applied_language_config: HashMap::new(),
323        };
324        reg.rebuild_catalog();
325        reg
326    }
327
328    /// Create an empty grammar registry (fast, for tests that don't need syntax highlighting)
329    pub fn empty() -> Arc<Self> {
330        let mut builder = SyntaxSetBuilder::new();
331        builder.add_plain_text_syntax();
332        let mut reg = Self {
333            syntax_set: Arc::new(builder.build()),
334            user_extensions: HashMap::new(),
335            filename_scopes: HashMap::new(),
336            loaded_grammar_paths: Vec::new(),
337            grammar_sources: HashMap::new(),
338            aliases: HashMap::new(),
339            catalog: Vec::new(),
340            catalog_by_name: HashMap::new(),
341            catalog_by_extension: HashMap::new(),
342            catalog_by_filename: HashMap::new(),
343            applied_language_config: HashMap::new(),
344        };
345        reg.rebuild_catalog();
346        Arc::new(reg)
347    }
348
349    /// Create a registry with only syntect's pre-compiled defaults (~0ms).
350    ///
351    /// This provides instant syntax highlighting for ~50 common languages
352    /// (Rust, Python, JS/TS, C/C++, Go, Java, HTML, CSS, Markdown, etc.)
353    /// without any `SyntaxSetBuilder::build()` call. Use this at startup,
354    /// then swap in a full registry built on a background thread.
355    pub fn defaults_only() -> Arc<Self> {
356        // Load pre-compiled syntax set (defaults + embedded grammars) from
357        // build-time packdump. This avoids the expensive into_builder() + build()
358        // cycle at runtime (~12s → ~300ms).
359        tracing::info!("defaults_only: loading pre-compiled syntax packdump...");
360        let syntax_set: SyntaxSet = syntect::dumps::from_uncompressed_data(include_bytes!(
361            concat!(env!("OUT_DIR"), "/default_syntaxes.packdump")
362        ))
363        .expect("Failed to load pre-compiled syntax packdump");
364        tracing::info!(
365            "defaults_only: loaded ({} syntaxes)",
366            syntax_set.syntaxes().len()
367        );
368        let grammar_sources = Self::build_grammar_sources_from_syntax_set(&syntax_set);
369        let filename_scopes = Self::build_filename_scopes();
370        let extra_extensions = Self::build_extra_extensions();
371        let mut registry = Self {
372            syntax_set: Arc::new(syntax_set),
373            user_extensions: extra_extensions,
374            filename_scopes,
375            loaded_grammar_paths: Vec::new(),
376            grammar_sources,
377            aliases: HashMap::new(),
378            catalog: Vec::new(),
379            catalog_by_name: HashMap::new(),
380            catalog_by_extension: HashMap::new(),
381            catalog_by_filename: HashMap::new(),
382            applied_language_config: HashMap::new(),
383        };
384        registry.populate_built_in_aliases();
385        registry.rebuild_catalog();
386        Arc::new(registry)
387    }
388
389    /// Build extra extension -> scope mappings for extensions not covered by syntect defaults.
390    ///
391    /// These map common file extensions to existing syntect grammar scopes,
392    /// filling gaps where syntect's built-in extension lists are incomplete.
393    pub(crate) fn build_extra_extensions() -> HashMap<String, String> {
394        let mut map = HashMap::new();
395
396        // JavaScript variants not in syntect defaults (["js", "htc"])
397        let js_scope = "source.js".to_string();
398        map.insert("cjs".to_string(), js_scope.clone());
399        map.insert("mjs".to_string(), js_scope);
400
401        // Dockerfile variants (e.g. Dockerfile.dev -> .dev extension)
402        // These won't match by extension, handled by filename_scopes and first_line_match
403
404        map
405    }
406
407    /// Build the default filename -> scope mappings for dotfiles and special files.
408    pub(crate) fn build_filename_scopes() -> HashMap<String, String> {
409        let mut map = HashMap::new();
410
411        // Shell configuration files -> Bash/Shell script scope
412        let shell_scope = "source.shell.bash".to_string();
413        for filename in [
414            ".zshrc",
415            ".zprofile",
416            ".zshenv",
417            ".zlogin",
418            ".zlogout",
419            ".bash_aliases",
420            // .bashrc and .bash_profile are already recognized by syntect
421            // Common shell script files without extensions
422            "PKGBUILD",
423            "APKBUILD",
424        ] {
425            map.insert(filename.to_string(), shell_scope.clone());
426        }
427
428        // Git rebase todo files
429        let git_rebase_scope = "source.git-rebase-todo".to_string();
430        map.insert("git-rebase-todo".to_string(), git_rebase_scope);
431
432        // Git commit message files
433        let git_commit_scope = "source.git-commit".to_string();
434        for filename in ["COMMIT_EDITMSG", "MERGE_MSG", "SQUASH_MSG", "TAG_EDITMSG"] {
435            map.insert(filename.to_string(), git_commit_scope.clone());
436        }
437
438        // Gitignore and similar files
439        let gitignore_scope = "source.gitignore".to_string();
440        for filename in [".gitignore", ".dockerignore", ".npmignore", ".hgignore"] {
441            map.insert(filename.to_string(), gitignore_scope.clone());
442        }
443
444        // Git config files
445        let gitconfig_scope = "source.gitconfig".to_string();
446        for filename in [".gitconfig", ".gitmodules"] {
447            map.insert(filename.to_string(), gitconfig_scope.clone());
448        }
449
450        // Git attributes files
451        let gitattributes_scope = "source.gitattributes".to_string();
452        map.insert(".gitattributes".to_string(), gitattributes_scope);
453
454        // Jenkinsfile -> Groovy
455        let groovy_scope = "source.groovy".to_string();
456        map.insert("Jenkinsfile".to_string(), groovy_scope);
457
458        // Vagrantfile -> Ruby (syntect already handles this, but be explicit)
459        // Brewfile -> Ruby
460        let ruby_scope = "source.ruby".to_string();
461        map.insert("Brewfile".to_string(), ruby_scope);
462
463        // Dockerfile and variants (exact names; Dockerfile.* handled via prefix check)
464        let dockerfile_scope = "source.dockerfile".to_string();
465        map.insert("Dockerfile".to_string(), dockerfile_scope.clone());
466        map.insert("Containerfile".to_string(), dockerfile_scope.clone());
467        // Common Dockerfile variants
468        map.insert("Dockerfile.dev".to_string(), dockerfile_scope.clone());
469        map.insert("Dockerfile.prod".to_string(), dockerfile_scope.clone());
470        map.insert("Dockerfile.test".to_string(), dockerfile_scope.clone());
471        map.insert("Dockerfile.build".to_string(), dockerfile_scope.clone());
472
473        // CMake
474        let cmake_scope = "source.cmake".to_string();
475        map.insert("CMakeLists.txt".to_string(), cmake_scope);
476
477        // Starlark/Bazel
478        let starlark_scope = "source.starlark".to_string();
479        map.insert("BUILD".to_string(), starlark_scope.clone());
480        map.insert("BUILD.bazel".to_string(), starlark_scope.clone());
481        map.insert("WORKSPACE".to_string(), starlark_scope.clone());
482        map.insert("WORKSPACE.bazel".to_string(), starlark_scope.clone());
483        map.insert("Tiltfile".to_string(), starlark_scope);
484
485        // Justfile (various casings)
486        let justfile_scope = "source.justfile".to_string();
487        map.insert("justfile".to_string(), justfile_scope.clone());
488        map.insert("Justfile".to_string(), justfile_scope.clone());
489        map.insert(".justfile".to_string(), justfile_scope);
490
491        // EditorConfig -> INI
492        let ini_scope = "source.ini".to_string();
493        map.insert(".editorconfig".to_string(), ini_scope);
494
495        // Earthfile
496        let earthfile_scope = "source.earthfile".to_string();
497        map.insert("Earthfile".to_string(), earthfile_scope);
498
499        // Hyprlang (Hyprland config files)
500        let hyprlang_scope = "source.hyprlang".to_string();
501        map.insert("hyprland.conf".to_string(), hyprlang_scope.clone());
502        map.insert("hyprpaper.conf".to_string(), hyprlang_scope.clone());
503        map.insert("hyprlock.conf".to_string(), hyprlang_scope);
504
505        // go.mod / go.sum
506        let gomod_scope = "source.gomod".to_string();
507        map.insert("go.mod".to_string(), gomod_scope.clone());
508        map.insert("go.sum".to_string(), gomod_scope);
509
510        map
511    }
512
513    /// Add embedded grammars (TOML, Odin, etc.) to a syntax set builder.
514    pub(crate) fn add_embedded_grammars(builder: &mut SyntaxSetBuilder) {
515        // TOML grammar
516        match SyntaxDefinition::load_from_str(TOML_GRAMMAR, true, Some("TOML")) {
517            Ok(syntax) => {
518                builder.add(syntax);
519                tracing::debug!("Loaded embedded TOML grammar");
520            }
521            Err(e) => {
522                tracing::warn!("Failed to load embedded TOML grammar: {}", e);
523            }
524        }
525
526        // Odin grammar
527        match SyntaxDefinition::load_from_str(ODIN_GRAMMAR, true, Some("Odin")) {
528            Ok(syntax) => {
529                builder.add(syntax);
530                tracing::debug!("Loaded embedded Odin grammar");
531            }
532            Err(e) => {
533                tracing::warn!("Failed to load embedded Odin grammar: {}", e);
534            }
535        }
536
537        // Zig grammar
538        match SyntaxDefinition::load_from_str(ZIG_GRAMMAR, true, Some("Zig")) {
539            Ok(syntax) => {
540                builder.add(syntax);
541                tracing::debug!("Loaded embedded Zig grammar");
542            }
543            Err(e) => {
544                tracing::warn!("Failed to load embedded Zig grammar: {}", e);
545            }
546        }
547
548        // Git Rebase Todo grammar
549        match SyntaxDefinition::load_from_str(GIT_REBASE_GRAMMAR, true, Some("Git Rebase Todo")) {
550            Ok(syntax) => {
551                builder.add(syntax);
552                tracing::debug!("Loaded embedded Git Rebase Todo grammar");
553            }
554            Err(e) => {
555                tracing::warn!("Failed to load embedded Git Rebase Todo grammar: {}", e);
556            }
557        }
558
559        // Git Commit Message grammar
560        match SyntaxDefinition::load_from_str(GIT_COMMIT_GRAMMAR, true, Some("Git Commit Message"))
561        {
562            Ok(syntax) => {
563                builder.add(syntax);
564                tracing::debug!("Loaded embedded Git Commit Message grammar");
565            }
566            Err(e) => {
567                tracing::warn!("Failed to load embedded Git Commit Message grammar: {}", e);
568            }
569        }
570
571        // Gitignore grammar
572        match SyntaxDefinition::load_from_str(GITIGNORE_GRAMMAR, true, Some("Gitignore")) {
573            Ok(syntax) => {
574                builder.add(syntax);
575                tracing::debug!("Loaded embedded Gitignore grammar");
576            }
577            Err(e) => {
578                tracing::warn!("Failed to load embedded Gitignore grammar: {}", e);
579            }
580        }
581
582        // Git Config grammar
583        match SyntaxDefinition::load_from_str(GITCONFIG_GRAMMAR, true, Some("Git Config")) {
584            Ok(syntax) => {
585                builder.add(syntax);
586                tracing::debug!("Loaded embedded Git Config grammar");
587            }
588            Err(e) => {
589                tracing::warn!("Failed to load embedded Git Config grammar: {}", e);
590            }
591        }
592
593        // Git Attributes grammar
594        match SyntaxDefinition::load_from_str(GITATTRIBUTES_GRAMMAR, true, Some("Git Attributes")) {
595            Ok(syntax) => {
596                builder.add(syntax);
597                tracing::debug!("Loaded embedded Git Attributes grammar");
598            }
599            Err(e) => {
600                tracing::warn!("Failed to load embedded Git Attributes grammar: {}", e);
601            }
602        }
603
604        // Typst grammar
605        match SyntaxDefinition::load_from_str(TYPST_GRAMMAR, true, Some("Typst")) {
606            Ok(syntax) => {
607                builder.add(syntax);
608                tracing::debug!("Loaded embedded Typst grammar");
609            }
610            Err(e) => {
611                tracing::warn!("Failed to load embedded Typst grammar: {}", e);
612            }
613        }
614
615        // Additional embedded grammars for languages not in syntect defaults
616        let additional_grammars: &[(&str, &str)] = &[
617            (DOCKERFILE_GRAMMAR, "Dockerfile"),
618            (INI_GRAMMAR, "INI"),
619            (CMAKE_GRAMMAR, "CMake"),
620            (SCSS_GRAMMAR, "SCSS"),
621            (LESS_GRAMMAR, "LESS"),
622            (POWERSHELL_GRAMMAR, "PowerShell"),
623            (KOTLIN_GRAMMAR, "Kotlin"),
624            (SWIFT_GRAMMAR, "Swift"),
625            (DART_GRAMMAR, "Dart"),
626            (ELIXIR_GRAMMAR, "Elixir"),
627            (FSHARP_GRAMMAR, "FSharp"),
628            (NIX_GRAMMAR, "Nix"),
629            (HCL_GRAMMAR, "HCL"),
630            (PROTOBUF_GRAMMAR, "Protocol Buffers"),
631            (GRAPHQL_GRAMMAR, "GraphQL"),
632            (JULIA_GRAMMAR, "Julia"),
633            (NIM_GRAMMAR, "Nim"),
634            (GLEAM_GRAMMAR, "Gleam"),
635            (VLANG_GRAMMAR, "V"),
636            (SOLIDITY_GRAMMAR, "Solidity"),
637            (KDL_GRAMMAR, "KDL"),
638            (NUSHELL_GRAMMAR, "Nushell"),
639            (STARLARK_GRAMMAR, "Starlark"),
640            (JUSTFILE_GRAMMAR, "Justfile"),
641            (EARTHFILE_GRAMMAR, "Earthfile"),
642            (GOMOD_GRAMMAR, "Go Module"),
643            (VUE_GRAMMAR, "Vue"),
644            (SVELTE_GRAMMAR, "Svelte"),
645            (ASTRO_GRAMMAR, "Astro"),
646            (HYPRLANG_GRAMMAR, "Hyprlang"),
647            (AUTOHOTKEY_GRAMMAR, "AutoHotkey"),
648        ];
649
650        for (grammar_str, name) in additional_grammars {
651            match SyntaxDefinition::load_from_str(grammar_str, true, Some(name)) {
652                Ok(syntax) => {
653                    builder.add(syntax);
654                    tracing::debug!("Loaded embedded {} grammar", name);
655                }
656                Err(e) => {
657                    tracing::warn!("Failed to load embedded {} grammar: {}", name, e);
658                }
659            }
660        }
661    }
662
663    /// Find syntax for a file by path/extension/filename.
664    ///
665    /// Purely metadata-based — does not read the file. For first-line
666    /// (shebang) fallback, use [`find_by_path`] with a `first_line` argument
667    /// and resolve the returned entry's syntect index.
668    pub fn find_syntax_for_file(&self, path: &Path) -> Option<&SyntaxReference> {
669        let entry = self.find_by_path(path, None)?;
670        entry
671            .engines
672            .syntect
673            .map(|i| &self.syntax_set.syntaxes()[i])
674    }
675
676    /// Find syntax by name, with alias resolution.
677    ///
678    /// Thin wrapper around `find_by_name` that returns the associated syntect
679    /// `SyntaxReference`. Tree-sitter-only entries return `None`.
680    ///
681    /// Falls back to a direct syntect lookup for "Plain Text", which the
682    /// catalog deliberately omits but syntect still exposes.
683    pub fn find_syntax_by_name(&self, name: &str) -> Option<&SyntaxReference> {
684        if let Some(entry) = self.find_by_name(name) {
685            if let Some(idx) = entry.engines.syntect {
686                return Some(&self.syntax_set.syntaxes()[idx]);
687            }
688        }
689        // Plain Text is excluded from the catalog (it's not a "grammar" a user
690        // would ever pick), but syntect still stores it and a handful of
691        // callers still ask for it by name.
692        self.syntax_set.find_syntax_by_name(name)
693    }
694
695    // === Alias management ===
696
697    /// Hardcoded short-name aliases for built-in and embedded grammars.
698    ///
699    /// Each entry maps a short name (lowercase) to the exact syntect grammar name.
700    /// Only grammars whose full name differs significantly from a natural short
701    /// form need an entry here. Grammars already short (e.g., "Rust", "Go") are
702    /// reachable via case-insensitive matching and don't need aliases.
703    fn built_in_aliases() -> Vec<(&'static str, &'static str)> {
704        vec![
705            // Syntect built-in grammars with verbose names
706            ("bash", "Bourne Again Shell (bash)"),
707            ("shell", "Bourne Again Shell (bash)"),
708            ("sh", "Bourne Again Shell (bash)"),
709            ("c++", "C++"),
710            ("cpp", "C++"),
711            ("csharp", "C#"),
712            ("objc", "Objective-C"),
713            ("objcpp", "Objective-C++"),
714            ("regex", "Regular Expressions (Python)"),
715            ("regexp", "Regular Expressions (Python)"),
716            // Embedded grammars with multi-word or non-obvious names
717            ("proto", "Protocol Buffers"),
718            ("protobuf", "Protocol Buffers"),
719            ("gomod", "Go Module"),
720            ("git-rebase", "Git Rebase Todo"),
721            ("git-commit", "Git Commit Message"),
722            ("git-config", "Git Config"),
723            ("git-attributes", "Git Attributes"),
724            ("gitignore", "Gitignore"),
725            ("fsharp", "FSharp"),
726            ("f#", "FSharp"),
727            ("terraform", "HCL"),
728            ("tf", "HCL"),
729            ("ts", "TypeScript"),
730            ("js", "JavaScript"),
731            ("py", "Python"),
732            ("rb", "Ruby"),
733            ("rs", "Rust"),
734            ("md", "Markdown"),
735            ("yml", "YAML"),
736            ("dockerfile", "Dockerfile"),
737        ]
738    }
739
740    /// Populate aliases from the built-in table.
741    ///
742    /// Validates that:
743    /// - Each alias target (full name) exists in the syntax set
744    /// - No alias collides (case-insensitive) with an existing grammar full name
745    /// - No duplicate aliases exist
746    pub(crate) fn populate_built_in_aliases(&mut self) {
747        for (short, full) in Self::built_in_aliases() {
748            self.register_alias_inner(short, full, true);
749        }
750        self.rebuild_catalog();
751    }
752
753    /// Register a short-name alias for a grammar.
754    ///
755    /// Returns `true` if the alias was registered, `false` if rejected due to
756    /// collision or missing target. For built-in aliases, collisions panic
757    /// (they indicate a bug). For dynamic aliases, collisions log a warning.
758    ///
759    /// Splices the alias directly into the catalog rather than rebuilding, so
760    /// any user config previously merged via `apply_language_config` is
761    /// preserved. A full rebuild would wipe those entries.
762    pub(crate) fn register_alias(&mut self, short_name: &str, full_name: &str) -> bool {
763        if !self.register_alias_inner(short_name, full_name, false) {
764            return false;
765        }
766        let short_lower = short_name.to_lowercase();
767        let full_lower = full_name.to_lowercase();
768        if let Some(&idx) = self.catalog_by_name.get(&full_lower) {
769            self.catalog_by_name
770                .entry(short_lower.clone())
771                .or_insert(idx);
772            let entry = &mut self.catalog[idx];
773            let replace = match &entry.short_name {
774                None => true,
775                Some(existing) => short_name.len() < existing.len(),
776            };
777            if replace {
778                entry.short_name = Some(short_lower);
779            }
780        }
781        true
782    }
783
784    fn register_alias_inner(
785        &mut self,
786        short_name: &str,
787        full_name: &str,
788        is_built_in: bool,
789    ) -> bool {
790        let short_lower = short_name.to_lowercase();
791
792        // Validate: target grammar must exist in the syntax set
793        let target_exists = self
794            .syntax_set
795            .syntaxes()
796            .iter()
797            .any(|s| s.name.eq_ignore_ascii_case(full_name));
798        if !target_exists {
799            // Tree-sitter-only targets (e.g. TypeScript) are expected to be
800            // absent from the syntect set. `rebuild_catalog` attaches their
801            // short names via a separate pass over `built_in_aliases()`.
802            if tree_sitter_for_syntect_name(full_name).is_some() {
803                return false;
804            }
805            if is_built_in {
806                // Built-in alias targets should always exist; warn but don't panic
807                // (grammar might have been removed from syntect upstream)
808                tracing::warn!(
809                    "[grammar-alias] Built-in alias '{}' -> '{}': target grammar not found, skipping",
810                    short_name, full_name
811                );
812            } else {
813                tracing::warn!(
814                    "[grammar-alias] Alias '{}' -> '{}': target grammar not found, skipping",
815                    short_name,
816                    full_name
817                );
818            }
819            return false;
820        }
821
822        // Validate: short name must not collide (case-insensitive) with any grammar full name
823        let collides_with_full_name = self
824            .syntax_set
825            .syntaxes()
826            .iter()
827            .any(|s| s.name.eq_ignore_ascii_case(&short_lower));
828        if collides_with_full_name {
829            // This is actually fine — the short name matches a full name directly,
830            // so find_syntax_by_name's case-insensitive search will find it.
831            // No alias needed.
832            tracing::debug!(
833                "[grammar-alias] Alias '{}' matches an existing grammar name, skipping (not needed)",
834                short_name
835            );
836            return false;
837        }
838
839        // Validate: no duplicate alias (case-insensitive)
840        if let Some(existing_target) = self.aliases.get(&short_lower) {
841            if existing_target.eq_ignore_ascii_case(full_name) {
842                // Same mapping, no-op
843                return true;
844            }
845            let msg = format!(
846                "Alias '{}' already maps to '{}', cannot remap to '{}'",
847                short_name, existing_target, full_name
848            );
849            if is_built_in {
850                panic!("[grammar-alias] Built-in alias collision: {}", msg);
851            } else {
852                tracing::warn!("[grammar-alias] {}", msg);
853                return false;
854            }
855        }
856
857        // Resolve the exact syntect name (preserving original case)
858        let exact_name = self
859            .syntax_set
860            .syntaxes()
861            .iter()
862            .find(|s| s.name.eq_ignore_ascii_case(full_name))
863            .map(|s| s.name.clone())
864            .unwrap();
865
866        self.aliases.insert(short_lower, exact_name);
867        true
868    }
869
870    // === Unified catalog ===
871
872    /// Rebuild the flat catalog of grammar entries.
873    ///
874    /// Called after the syntax set, aliases, or filename scopes change.
875    /// Produces one entry per logical language by merging:
876    /// 1. Every `SyntaxReference` in the syntax set (except "Plain Text")
877    /// 2. Every `fresh_languages::Language` not already covered by a syntect entry
878    /// 3. Alias short-names attached to their target entry
879    /// 4. Filename mappings from `filename_scopes` attached to their scope's entry
880    /// 5. Extra extensions from `user_extensions` attached to their scope's entry
881    ///
882    /// Automatically replays the last `apply_language_config` at the end, so
883    /// user `[languages]` config survives any rebuild.
884    pub(crate) fn rebuild_catalog(&mut self) {
885        // Reverse-map: full_name (lowercase) -> shortest alias.
886        //
887        // Seed from the built-in alias table as well as the live `aliases`
888        // HashMap: the live map only contains aliases whose target exists in
889        // the syntect set, so tree-sitter-only entries (TypeScript) would
890        // otherwise never get their short name ("ts").
891        let mut short_by_full: HashMap<String, String> = HashMap::new();
892        let record = |map: &mut HashMap<String, String>, short: &str, full: &str| {
893            let key = full.to_lowercase();
894            let keep = match map.get(&key) {
895                None => true,
896                Some(existing) => short.len() < existing.len(),
897            };
898            if keep {
899                map.insert(key, short.to_string());
900            }
901        };
902        for (short, full) in Self::built_in_aliases() {
903            record(&mut short_by_full, short, full);
904        }
905        for (short, full) in &self.aliases {
906            record(&mut short_by_full, short, full);
907        }
908
909        let derive_language_id =
910            |display_name: &str| -> (String, Option<fresh_languages::Language>) {
911                let ts = tree_sitter_for_syntect_name(display_name);
912                let id = ts
913                    .map(|l| l.id().to_string())
914                    .unwrap_or_else(|| display_name.to_lowercase());
915                (id, ts)
916            };
917
918        let mut catalog: Vec<GrammarEntry> = Vec::new();
919        let mut scope_to_index: HashMap<String, usize> = HashMap::new();
920
921        // Syntect-backed entries (skip Plain Text).
922        //
923        // Syntect's `file_extensions` is a hybrid list: real extensions like
924        // "rb" sit alongside bare filenames like "Gemfile", "Rakefile",
925        // "Makefile". Syntect's own `find_syntax_for_file` tries each entry
926        // against the whole filename AND against the path's extension, and
927        // the catalog has to preserve that semantics. We keep everything in
928        // `extensions` here and index each entry as *both* an extension and
929        // a filename at the bottom of this method.
930        for (idx, syntax) in self.syntax_set.syntaxes().iter().enumerate() {
931            if syntax.name == "Plain Text" {
932                continue;
933            }
934            let (language_id, tree_sitter) = derive_language_id(&syntax.name);
935            let short_name = short_by_full.get(&syntax.name.to_lowercase()).cloned();
936            let source = self
937                .grammar_sources
938                .get(&syntax.name)
939                .map(|info| info.source.clone())
940                .unwrap_or(GrammarSource::BuiltIn);
941            let entry_index = catalog.len();
942            scope_to_index.insert(syntax.scope.to_string(), entry_index);
943
944            // Union syntect's file_extensions with tree-sitter's own
945            // extension list when the entry carries both engines.
946            // tree-sitter-javascript handles `.jsx`/`.mjs`/`.cjs` that
947            // syntect's JS grammar doesn't list, and the old code used to
948            // route those paths to tree-sitter via a separate lookup.
949            let mut extensions = syntax.file_extensions.clone();
950            if let Some(lang) = tree_sitter {
951                for ext in lang.extensions() {
952                    let ext = ext.to_string();
953                    if !extensions.iter().any(|e| e == &ext) {
954                        extensions.push(ext);
955                    }
956                }
957            }
958
959            catalog.push(GrammarEntry {
960                display_name: syntax.name.clone(),
961                language_id,
962                short_name,
963                extensions,
964                filenames: Vec::new(),
965                filename_globs: Vec::new(),
966                source,
967                engines: GrammarEngines {
968                    syntect: Some(idx),
969                    tree_sitter,
970                },
971            });
972        }
973
974        // Attach filename_scopes to their entries.
975        for (filename, scope) in &self.filename_scopes {
976            if let Some(&idx) = scope_to_index.get(scope) {
977                if !catalog[idx].filenames.iter().any(|f| f == filename) {
978                    catalog[idx].filenames.push(filename.clone());
979                }
980            }
981        }
982
983        // Attach user_extensions (extra → scope) to their entries.
984        for (ext, scope) in &self.user_extensions {
985            if let Some(&idx) = scope_to_index.get(scope) {
986                if !catalog[idx].extensions.iter().any(|e| e == ext) {
987                    catalog[idx].extensions.push(ext.clone());
988                }
989            }
990        }
991
992        // Ensure every tree-sitter language has an entry. If a syntect entry
993        // already maps to the same tree-sitter language, skip it; otherwise
994        // add a tree-sitter-only entry so the catalog is complete (TypeScript
995        // being the motivating example — syntect ships no grammar for it).
996        let mut ts_covered: std::collections::HashSet<fresh_languages::Language> =
997            std::collections::HashSet::new();
998        for entry in &catalog {
999            if let Some(lang) = entry.engines.tree_sitter {
1000                ts_covered.insert(lang);
1001            }
1002        }
1003        for lang in fresh_languages::Language::all() {
1004            if ts_covered.contains(lang) {
1005                continue;
1006            }
1007            let display_name = lang.display_name().to_string();
1008            let language_id = lang.id().to_string();
1009            let short_name = short_by_full.get(&display_name.to_lowercase()).cloned();
1010            let extensions: Vec<String> = lang.extensions().iter().map(|s| s.to_string()).collect();
1011            catalog.push(GrammarEntry {
1012                display_name,
1013                language_id,
1014                short_name,
1015                extensions,
1016                filenames: Vec::new(),
1017                filename_globs: Vec::new(),
1018                source: GrammarSource::BuiltIn,
1019                engines: GrammarEngines {
1020                    syntect: None,
1021                    tree_sitter: Some(*lang),
1022                },
1023            });
1024        }
1025
1026        // Build name / extension / filename indices.
1027        //
1028        // Every entry in `extensions` gets indexed in BOTH `by_extension`
1029        // (lowercased) AND `by_filename` (exact case) — syntect's
1030        // `file_extensions` list holds both real extensions ("rb") and bare
1031        // filenames ("Gemfile", "Rakefile", "Makefile"). Indexing both ways
1032        // matches syntect's own `find_syntax_for_file` semantics.
1033        let mut by_name: HashMap<String, usize> = HashMap::new();
1034        let mut by_extension: HashMap<String, usize> = HashMap::new();
1035        let mut by_filename: HashMap<String, usize> = HashMap::new();
1036        for (idx, entry) in catalog.iter().enumerate() {
1037            by_name.insert(entry.display_name.to_lowercase(), idx);
1038            by_name.insert(entry.language_id.to_lowercase(), idx);
1039            if let Some(short) = &entry.short_name {
1040                by_name.insert(short.to_lowercase(), idx);
1041            }
1042            for ext in &entry.extensions {
1043                by_extension.entry(ext.to_lowercase()).or_insert(idx);
1044                by_filename.entry(ext.clone()).or_insert(idx);
1045            }
1046            for filename in &entry.filenames {
1047                by_filename.entry(filename.clone()).or_insert(idx);
1048            }
1049        }
1050
1051        self.catalog = catalog;
1052        self.catalog_by_name = by_name;
1053        self.catalog_by_extension = by_extension;
1054        self.catalog_by_filename = by_filename;
1055
1056        // Replay the most recent user config so a rebuild doesn't silently
1057        // wipe out user `[languages]` rules. `take` + restore avoids both a
1058        // clone and a borrow checker fight with `apply_language_config_inner`.
1059        if !self.applied_language_config.is_empty() {
1060            let cfg = std::mem::take(&mut self.applied_language_config);
1061            self.apply_language_config_inner(&cfg);
1062            self.applied_language_config = cfg;
1063        }
1064    }
1065
1066    /// Return the full catalog of grammar entries.
1067    pub fn catalog(&self) -> &[GrammarEntry] {
1068        &self.catalog
1069    }
1070
1071    /// Look up a grammar entry by display name, language ID, or short alias
1072    /// (case-insensitive). All aliases — built-in and user-config-declared —
1073    /// are indexed directly in `catalog_by_name` during `rebuild_catalog` /
1074    /// `register_alias` / `apply_language_config`, so a single lookup covers
1075    /// every case.
1076    pub fn find_by_name(&self, name: &str) -> Option<&GrammarEntry> {
1077        self.catalog_by_name
1078            .get(&name.to_lowercase())
1079            .map(|&idx| &self.catalog[idx])
1080    }
1081
1082    /// Look up a grammar entry by file path, with optional first-line content
1083    /// for shebang / `first_line_match` detection.
1084    ///
1085    /// Resolution order:
1086    /// 1. Exact filename (config-declared filenames and filename_scopes live here)
1087    /// 2. Glob patterns from user config (e.g. "*.conf", "/etc/**/rc.*")
1088    /// 3. File extension
1089    /// 4. Shebang / first-line regex match on `first_line` if supplied
1090    ///
1091    /// Globs take priority over extension so a user rule like `*.conf → bash`
1092    /// wins over any built-in extension match on `.conf`. The first-line
1093    /// fallback (#4) is last so catalog matches stay authoritative — syntect
1094    /// might otherwise misclassify `.fish` as bash via its first-line
1095    /// regexes.
1096    ///
1097    /// The first-line fallback is pure: it runs syntect's
1098    /// `find_syntax_by_first_line` regex cache against the caller-supplied
1099    /// string. The registry never touches the filesystem — the caller (who
1100    /// already loaded the buffer via the `FileSystem` trait) must extract
1101    /// the first line and pass it in.
1102    pub fn find_by_path(&self, path: &Path, first_line: Option<&str>) -> Option<&GrammarEntry> {
1103        let filename = path.file_name().and_then(|n| n.to_str());
1104        let path_str = path.to_str().unwrap_or("");
1105
1106        if let Some(name) = filename {
1107            if let Some(&idx) = self.catalog_by_filename.get(name) {
1108                return Some(&self.catalog[idx]);
1109            }
1110        }
1111
1112        // Glob walk — filenames with globs are rare so linear scan is fine.
1113        if let Some(name) = filename {
1114            for entry in &self.catalog {
1115                for pattern in &entry.filename_globs {
1116                    let matched = if is_path_pattern(pattern) {
1117                        path_glob_matches(pattern, path_str)
1118                    } else {
1119                        filename_glob_matches(pattern, name)
1120                    };
1121                    if matched {
1122                        return Some(entry);
1123                    }
1124                }
1125            }
1126        }
1127
1128        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
1129            if let Some(entry) = self.find_by_extension(ext) {
1130                return Some(entry);
1131            }
1132        }
1133
1134        // Last resort: shebang / first-line regex match against the
1135        // caller-supplied content. Map the matched syntect grammar back to a
1136        // catalog entry by name — every syntect syntax has a catalog entry,
1137        // so this round-trip preserves tree-sitter attachment.
1138        let line = first_line?;
1139        let syntax = self.syntax_set.find_syntax_by_first_line(line)?;
1140        self.find_by_name(&syntax.name)
1141    }
1142
1143    /// Look up a grammar entry by file extension (case-insensitive, without dot).
1144    pub fn find_by_extension(&self, ext: &str) -> Option<&GrammarEntry> {
1145        self.catalog_by_extension
1146            .get(&ext.to_lowercase())
1147            .map(|&idx| &self.catalog[idx])
1148    }
1149
1150    /// Merge user `[languages]` config into the catalog.
1151    ///
1152    /// For each config entry, resolves its grammar to an existing catalog entry
1153    /// (by grammar name or by language id). Extensions are added and override
1154    /// the ext→entry index so config wins over built-in mappings. Filenames are
1155    /// split into exact matches (indexed) and globs (walked at lookup time).
1156    ///
1157    /// If no existing entry matches, a new engine-less entry is created so the
1158    /// language still appears in the palette.
1159    ///
1160    /// Idempotent. The config is cached on the registry so `rebuild_catalog`
1161    /// can replay it — callers don't need to re-apply after a rebuild.
1162    pub fn apply_language_config(
1163        &mut self,
1164        languages: &HashMap<String, crate::config::LanguageConfig>,
1165    ) {
1166        self.applied_language_config = languages.clone();
1167        self.apply_language_config_inner(languages);
1168    }
1169
1170    /// Do the actual catalog splicing without touching
1171    /// `applied_language_config`. Called from `apply_language_config` (which
1172    /// records the input) and from `rebuild_catalog` (which replays the
1173    /// cached input after wiping the catalog).
1174    fn apply_language_config_inner(
1175        &mut self,
1176        languages: &HashMap<String, crate::config::LanguageConfig>,
1177    ) {
1178        for (lang_id, lang_cfg) in languages {
1179            let grammar_name = if lang_cfg.grammar.is_empty() {
1180                lang_id.as_str()
1181            } else {
1182                lang_cfg.grammar.as_str()
1183            };
1184
1185            // Resolve to an existing entry; fall back to creating one.
1186            let idx = self
1187                .catalog_by_name
1188                .get(&grammar_name.to_lowercase())
1189                .copied()
1190                .or_else(|| self.catalog_by_name.get(&lang_id.to_lowercase()).copied())
1191                .unwrap_or_else(|| {
1192                    let idx = self.catalog.len();
1193                    self.catalog.push(GrammarEntry {
1194                        display_name: lang_id.clone(),
1195                        language_id: lang_id.clone(),
1196                        short_name: None,
1197                        extensions: Vec::new(),
1198                        filenames: Vec::new(),
1199                        filename_globs: Vec::new(),
1200                        source: GrammarSource::BuiltIn,
1201                        engines: GrammarEngines::default(),
1202                    });
1203                    idx
1204                });
1205
1206            // Always index the config key so `find_by_name("mylang")` resolves
1207            // even when `mylang` aliases an existing grammar (e.g.
1208            // `[languages.mylang] grammar = "Rust"`). `or_insert` preserves
1209            // any existing mapping — won't clobber the canonical entry.
1210            self.catalog_by_name
1211                .entry(lang_id.to_lowercase())
1212                .or_insert(idx);
1213
1214            for ext in &lang_cfg.extensions {
1215                if !self.catalog[idx].extensions.iter().any(|e| e == ext) {
1216                    self.catalog[idx].extensions.push(ext.clone());
1217                }
1218                // Config-declared extensions override any previous mapping.
1219                self.catalog_by_extension.insert(ext.to_lowercase(), idx);
1220            }
1221            for filename in &lang_cfg.filenames {
1222                if is_glob_pattern(filename) {
1223                    if !self.catalog[idx]
1224                        .filename_globs
1225                        .iter()
1226                        .any(|f| f == filename)
1227                    {
1228                        self.catalog[idx].filename_globs.push(filename.clone());
1229                    }
1230                } else {
1231                    if !self.catalog[idx].filenames.iter().any(|f| f == filename) {
1232                        self.catalog[idx].filenames.push(filename.clone());
1233                    }
1234                    self.catalog_by_filename.insert(filename.clone(), idx);
1235                }
1236            }
1237        }
1238    }
1239
1240    /// Get the underlying syntax set
1241    pub fn syntax_set(&self) -> &Arc<SyntaxSet> {
1242        &self.syntax_set
1243    }
1244
1245    /// Get a clone of the Arc for sharing
1246    pub fn syntax_set_arc(&self) -> Arc<SyntaxSet> {
1247        Arc::clone(&self.syntax_set)
1248    }
1249
1250    /// List all available syntax names
1251    pub fn available_syntaxes(&self) -> Vec<&str> {
1252        self.syntax_set
1253            .syntaxes()
1254            .iter()
1255            .map(|s| s.name.as_str())
1256            .collect()
1257    }
1258
1259    /// List all available grammars with provenance information.
1260    ///
1261    /// Returns a sorted list of `GrammarInfo` entries derived from the unified
1262    /// catalog — this includes both syntect grammars and tree-sitter-only
1263    /// languages (like TypeScript). Each entry is listed exactly once even
1264    /// when both engines can serve it.
1265    pub fn available_grammar_info(&self) -> Vec<GrammarInfo> {
1266        let mut result: Vec<GrammarInfo> = self
1267            .catalog
1268            .iter()
1269            .map(|entry| GrammarInfo {
1270                name: entry.display_name.clone(),
1271                source: entry.source.clone(),
1272                file_extensions: entry.extensions.clone(),
1273                short_name: entry.short_name.clone(),
1274            })
1275            .collect();
1276        result.sort_by(|a, b| a.name.to_lowercase().cmp(&b.name.to_lowercase()));
1277        result
1278    }
1279
1280    /// Get the grammar sources map.
1281    pub(crate) fn grammar_sources(&self) -> &HashMap<String, GrammarInfo> {
1282        &self.grammar_sources
1283    }
1284
1285    /// Build grammar source info from a pre-compiled syntax set.
1286    ///
1287    /// All grammars in the packdump (syntect defaults + embedded) are tagged as built-in.
1288    pub(crate) fn build_grammar_sources_from_syntax_set(
1289        syntax_set: &SyntaxSet,
1290    ) -> HashMap<String, GrammarInfo> {
1291        let mut sources = HashMap::new();
1292        for syntax in syntax_set.syntaxes() {
1293            sources.insert(
1294                syntax.name.clone(),
1295                GrammarInfo {
1296                    name: syntax.name.clone(),
1297                    source: GrammarSource::BuiltIn,
1298                    file_extensions: syntax.file_extensions.clone(),
1299                    short_name: None,
1300                },
1301            );
1302        }
1303        sources
1304    }
1305
1306    /// Get the user extensions mapping (extension -> scope name).
1307    #[cfg(test)]
1308    pub(crate) fn user_extensions(&self) -> &HashMap<String, String> {
1309        &self.user_extensions
1310    }
1311
1312    /// Get the loaded grammar paths (for deduplication in flush_pending_grammars).
1313    #[cfg(test)]
1314    pub(crate) fn loaded_grammar_paths(&self) -> &[GrammarSpec] {
1315        &self.loaded_grammar_paths
1316    }
1317
1318    /// Create a new registry with additional grammar files
1319    ///
1320    /// This builds a new GrammarRegistry that includes all grammars from
1321    /// the base registry plus the additional grammars specified.
1322    /// Uses the base registry's syntax_set as the builder base, preserving
1323    /// all existing grammars (user grammars, language packs, etc.).
1324    ///
1325    /// # Arguments
1326    /// * `base` - The base registry to extend
1327    /// * `additional` - List of (language, path, extensions) tuples for new grammars
1328    ///
1329    /// # Returns
1330    /// A new GrammarRegistry with the additional grammars, or None if rebuilding fails
1331    pub fn with_additional_grammars(
1332        base: &GrammarRegistry,
1333        additional: &[GrammarSpec],
1334    ) -> Option<Self> {
1335        tracing::info!(
1336            "[SYNTAX DEBUG] with_additional_grammars: adding {} grammars to base with {} syntaxes",
1337            additional.len(),
1338            base.syntax_set.syntaxes().len()
1339        );
1340
1341        // Use the base registry's syntax_set as builder base — this preserves
1342        // ALL existing grammars (defaults, embedded, user, language packs)
1343        // without needing to reload them from disk.
1344        let mut builder = (*base.syntax_set).clone().into_builder();
1345
1346        // Preserve existing user extensions and add new ones
1347        let mut user_extensions = base.user_extensions.clone();
1348
1349        // Track loaded grammar paths (existing + new)
1350        let mut loaded_grammar_paths = base.loaded_grammar_paths.clone();
1351
1352        // Preserve existing grammar sources
1353        let mut grammar_sources = base.grammar_sources.clone();
1354
1355        // Add each new grammar
1356        for spec in additional {
1357            tracing::info!(
1358                "[SYNTAX DEBUG] loading new grammar file: lang='{}', path={:?}, extensions={:?}",
1359                spec.language,
1360                spec.path,
1361                spec.extensions
1362            );
1363            match Self::load_grammar_file(&spec.path) {
1364                Ok(syntax) => {
1365                    let scope = syntax.scope.to_string();
1366                    let syntax_name = syntax.name.clone();
1367                    tracing::info!(
1368                        "[SYNTAX DEBUG] grammar loaded successfully: name='{}', scope='{}'",
1369                        syntax_name,
1370                        scope
1371                    );
1372                    builder.add(syntax);
1373                    tracing::info!(
1374                        "Loaded grammar for '{}' from {:?} with extensions {:?}",
1375                        spec.language,
1376                        spec.path,
1377                        spec.extensions
1378                    );
1379                    // Register extensions for this grammar
1380                    for ext in &spec.extensions {
1381                        user_extensions.insert(ext.clone(), scope.clone());
1382                    }
1383                    // Track provenance
1384                    grammar_sources.insert(
1385                        syntax_name.clone(),
1386                        GrammarInfo {
1387                            name: syntax_name,
1388                            source: GrammarSource::Plugin {
1389                                plugin: spec.language.clone(),
1390                                path: spec.path.clone(),
1391                            },
1392                            file_extensions: spec.extensions.clone(),
1393                            short_name: None,
1394                        },
1395                    );
1396                    // Track this grammar path for future reloads
1397                    loaded_grammar_paths.push(spec.clone());
1398                }
1399                Err(e) => {
1400                    tracing::warn!(
1401                        "Failed to load grammar for '{}' from {:?}: {}",
1402                        spec.language,
1403                        spec.path,
1404                        e
1405                    );
1406                }
1407            }
1408        }
1409
1410        let mut reg = Self {
1411            syntax_set: Arc::new(builder.build()),
1412            user_extensions,
1413            filename_scopes: base.filename_scopes.clone(),
1414            loaded_grammar_paths,
1415            grammar_sources,
1416            aliases: base.aliases.clone(),
1417            catalog: Vec::new(),
1418            catalog_by_name: HashMap::new(),
1419            catalog_by_extension: HashMap::new(),
1420            catalog_by_filename: HashMap::new(),
1421            applied_language_config: HashMap::new(),
1422        };
1423        reg.rebuild_catalog();
1424        Some(reg)
1425    }
1426
1427    /// Load a grammar file from disk
1428    ///
1429    /// Only Sublime Text (.sublime-syntax) format is supported.
1430    /// TextMate (.tmLanguage) grammars use a completely different format
1431    /// and cannot be loaded by syntect's yaml-load feature.
1432    pub(crate) fn load_grammar_file(path: &Path) -> Result<SyntaxDefinition, String> {
1433        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
1434
1435        match ext {
1436            "sublime-syntax" => {
1437                let content = std::fs::read_to_string(path)
1438                    .map_err(|e| format!("Failed to read file: {}", e))?;
1439                SyntaxDefinition::load_from_str(
1440                    &content,
1441                    true,
1442                    path.file_stem().and_then(|s| s.to_str()),
1443                )
1444                .map_err(|e| format!("Failed to parse sublime-syntax: {}", e))
1445            }
1446            _ => Err(format!(
1447                "Unsupported grammar format: .{}. Only .sublime-syntax is supported.",
1448                ext
1449            )),
1450        }
1451    }
1452}
1453
1454impl Default for GrammarRegistry {
1455    fn default() -> Self {
1456        // Create with defaults and embedded grammars only (no user grammars)
1457        let defaults = SyntaxSet::load_defaults_newlines();
1458        let mut builder = defaults.into_builder();
1459        Self::add_embedded_grammars(&mut builder);
1460        let syntax_set = builder.build();
1461        let filename_scopes = Self::build_filename_scopes();
1462        let extra_extensions = Self::build_extra_extensions();
1463
1464        let mut registry = Self::new(syntax_set, extra_extensions, filename_scopes);
1465        registry.populate_built_in_aliases();
1466        registry.rebuild_catalog();
1467        registry
1468    }
1469}
1470
1471// VSCode package.json structures for parsing grammar manifests
1472
/// Top-level shape of a VSCode `package.json`, reduced to the one section
/// deserialized here: `contributes`.
#[derive(Debug, Deserialize)]
pub struct PackageManifest {
    /// The `contributes` section; `None` when the key is absent.
    #[serde(default)]
    pub contributes: Option<Contributes>,
}
1478
/// The `contributes` section of a manifest: its language and grammar
/// declarations.
#[derive(Debug, Deserialize, Default)]
pub struct Contributes {
    /// Language declarations; empty when the key is absent.
    #[serde(default)]
    pub languages: Vec<LanguageContribution>,
    /// Grammar declarations; empty when the key is absent.
    #[serde(default)]
    pub grammars: Vec<GrammarContribution>,
}
1486
/// One entry of `contributes.languages`.
#[derive(Debug, Deserialize)]
pub struct LanguageContribution {
    /// Language identifier (required key).
    pub id: String,
    /// File extensions listed for the language; empty when the key is absent.
    // NOTE(review): whether entries carry a leading dot is not visible here —
    // confirm against the code that consumes this field.
    #[serde(default)]
    pub extensions: Vec<String>,
}
1493
/// One entry of `contributes.grammars`. All three keys are required.
#[derive(Debug, Deserialize)]
pub struct GrammarContribution {
    /// Language this grammar is declared for.
    // NOTE(review): presumably matches a `LanguageContribution::id` — confirm.
    pub language: String,
    /// Scope name, read from the manifest's `scopeName` key.
    #[serde(rename = "scopeName")]
    pub scope_name: String,
    /// Path to the grammar file, kept as the raw manifest string.
    // NOTE(review): presumably relative to the manifest's directory — confirm.
    pub path: String,
}
1501
1502#[cfg(test)]
1503mod tests {
1504    use super::*;
1505
1506    #[test]
1507    fn test_empty_registry() {
1508        let registry = GrammarRegistry::empty();
1509        // Should have at least plain text
1510        assert!(!registry.available_syntaxes().is_empty());
1511    }
1512
1513    #[test]
1514    fn test_default_registry() {
1515        let registry = GrammarRegistry::default();
1516        // Should have built-in syntaxes
1517        assert!(!registry.available_syntaxes().is_empty());
1518    }
1519
1520    #[test]
1521    fn test_find_syntax_for_common_extensions() {
1522        let registry = GrammarRegistry::default();
1523
1524        // Test common extensions that syntect should support
1525        let test_cases = [
1526            ("test.py", true),
1527            ("test.rs", true),
1528            ("test.js", true),
1529            ("test.json", true),
1530            ("test.md", true),
1531            ("test.html", true),
1532            ("test.css", true),
1533            ("test.unknown_extension_xyz", false),
1534        ];
1535
1536        for (filename, should_exist) in test_cases {
1537            let path = Path::new(filename);
1538            let result = registry.find_syntax_for_file(path);
1539            assert_eq!(
1540                result.is_some(),
1541                should_exist,
1542                "Expected {:?} for {}",
1543                should_exist,
1544                filename
1545            );
1546        }
1547    }
1548
1549    #[test]
1550    fn test_syntax_set_arc() {
1551        let registry = GrammarRegistry::default();
1552        let arc1 = registry.syntax_set_arc();
1553        let arc2 = registry.syntax_set_arc();
1554        // Both should point to the same data
1555        assert!(Arc::ptr_eq(&arc1, &arc2));
1556    }
1557
1558    #[test]
1559    fn test_shell_dotfiles_detection() {
1560        let registry = GrammarRegistry::default();
1561
1562        // All these should be detected as shell scripts
1563        let shell_files = [".zshrc", ".zprofile", ".zshenv", ".bash_aliases"];
1564
1565        for filename in shell_files {
1566            let path = Path::new(filename);
1567            let result = registry.find_syntax_for_file(path);
1568            assert!(
1569                result.is_some(),
1570                "{} should be detected as a syntax",
1571                filename
1572            );
1573            let syntax = result.unwrap();
1574            // Should be detected as Bash/Shell
1575            assert!(
1576                syntax.name.to_lowercase().contains("bash")
1577                    || syntax.name.to_lowercase().contains("shell"),
1578                "{} should be detected as shell/bash, got: {}",
1579                filename,
1580                syntax.name
1581            );
1582        }
1583    }
1584
1585    #[test]
1586    fn test_pkgbuild_detection() {
1587        let registry = GrammarRegistry::default();
1588
1589        // PKGBUILD and APKBUILD should be detected as shell scripts
1590        for filename in ["PKGBUILD", "APKBUILD"] {
1591            let path = Path::new(filename);
1592            let result = registry.find_syntax_for_file(path);
1593            assert!(
1594                result.is_some(),
1595                "{} should be detected as a syntax",
1596                filename
1597            );
1598            let syntax = result.unwrap();
1599            // Should be detected as Bash/Shell
1600            assert!(
1601                syntax.name.to_lowercase().contains("bash")
1602                    || syntax.name.to_lowercase().contains("shell"),
1603                "{} should be detected as shell/bash, got: {}",
1604                filename,
1605                syntax.name
1606            );
1607        }
1608    }
1609
1610    #[test]
1611    fn test_find_syntax_with_glob_filenames() {
1612        let mut registry = GrammarRegistry::default();
1613        let mut languages = std::collections::HashMap::new();
1614        languages.insert(
1615            "shell-configs".to_string(),
1616            crate::config::LanguageConfig {
1617                extensions: vec!["sh".to_string()],
1618                filenames: vec!["*.conf".to_string(), "*rc".to_string()],
1619                grammar: "bash".to_string(),
1620                comment_prefix: Some("#".to_string()),
1621                auto_indent: true,
1622                auto_close: None,
1623                auto_surround: None,
1624                textmate_grammar: None,
1625                show_whitespace_tabs: true,
1626                line_wrap: None,
1627                wrap_column: None,
1628                page_view: None,
1629                page_width: None,
1630                use_tabs: None,
1631                tab_size: None,
1632                formatter: None,
1633                format_on_save: false,
1634                on_save: vec![],
1635                word_characters: None,
1636            },
1637        );
1638        registry.apply_language_config(&languages);
1639
1640        assert!(
1641            registry
1642                .find_by_path(Path::new("nftables.conf"), None)
1643                .is_some(),
1644            "*.conf should match nftables.conf"
1645        );
1646        assert!(
1647            registry.find_by_path(Path::new("lfrc"), None).is_some(),
1648            "*rc should match lfrc"
1649        );
1650        // Unrelated file shouldn't panic.
1651        let _ = registry.find_by_path(Path::new("randomfile"), None);
1652    }
1653
1654    #[test]
1655    fn test_find_syntax_with_path_glob_filenames() {
1656        let mut registry = GrammarRegistry::default();
1657        let mut languages = std::collections::HashMap::new();
1658        languages.insert(
1659            "shell-configs".to_string(),
1660            crate::config::LanguageConfig {
1661                extensions: vec!["sh".to_string()],
1662                filenames: vec!["/etc/**/rc.*".to_string()],
1663                grammar: "bash".to_string(),
1664                comment_prefix: Some("#".to_string()),
1665                auto_indent: true,
1666                auto_close: None,
1667                auto_surround: None,
1668                textmate_grammar: None,
1669                show_whitespace_tabs: true,
1670                line_wrap: None,
1671                wrap_column: None,
1672                page_view: None,
1673                page_width: None,
1674                use_tabs: None,
1675                tab_size: None,
1676                formatter: None,
1677                format_on_save: false,
1678                on_save: vec![],
1679                word_characters: None,
1680            },
1681        );
1682        registry.apply_language_config(&languages);
1683
1684        assert!(
1685            registry
1686                .find_by_path(Path::new("/etc/rc.conf"), None)
1687                .is_some(),
1688            "/etc/**/rc.* should match /etc/rc.conf"
1689        );
1690        assert!(
1691            registry
1692                .find_by_path(Path::new("/etc/init/rc.local"), None)
1693                .is_some(),
1694            "/etc/**/rc.* should match /etc/init/rc.local"
1695        );
1696        let _ = registry.find_by_path(Path::new("/var/rc.conf"), None);
1697    }
1698
1699    #[test]
1700    fn test_exact_filename_takes_priority_over_glob() {
1701        let mut registry = GrammarRegistry::default();
1702        let mut languages = std::collections::HashMap::new();
1703
1704        // A language with exact filename "lfrc" -> python grammar
1705        languages.insert(
1706            "custom-lfrc".to_string(),
1707            crate::config::LanguageConfig {
1708                extensions: vec![],
1709                filenames: vec!["lfrc".to_string()],
1710                grammar: "python".to_string(),
1711                comment_prefix: Some("#".to_string()),
1712                auto_indent: true,
1713                auto_close: None,
1714                auto_surround: None,
1715                textmate_grammar: None,
1716                show_whitespace_tabs: true,
1717                line_wrap: None,
1718                wrap_column: None,
1719                page_view: None,
1720                page_width: None,
1721                use_tabs: None,
1722                tab_size: None,
1723                formatter: None,
1724                format_on_save: false,
1725                on_save: vec![],
1726                word_characters: None,
1727            },
1728        );
1729
1730        // A language with glob "*rc" -> bash grammar
1731        languages.insert(
1732            "rc-files".to_string(),
1733            crate::config::LanguageConfig {
1734                extensions: vec![],
1735                filenames: vec!["*rc".to_string()],
1736                grammar: "bash".to_string(),
1737                comment_prefix: Some("#".to_string()),
1738                auto_indent: true,
1739                auto_close: None,
1740                auto_surround: None,
1741                textmate_grammar: None,
1742                show_whitespace_tabs: true,
1743                line_wrap: None,
1744                wrap_column: None,
1745                page_view: None,
1746                page_width: None,
1747                use_tabs: None,
1748                tab_size: None,
1749                formatter: None,
1750                format_on_save: false,
1751                on_save: vec![],
1752                word_characters: None,
1753            },
1754        );
1755
1756        registry.apply_language_config(&languages);
1757
1758        // "lfrc" should match the exact rule (python), not the glob (bash)
1759        let entry = registry.find_by_path(Path::new("lfrc"), None).unwrap();
1760        assert!(
1761            entry.display_name.to_lowercase().contains("python"),
1762            "exact match should win over glob, got: {}",
1763            entry.display_name
1764        );
1765    }
1766
1767    #[test]
1768    fn test_built_in_aliases_resolve() {
1769        let registry = GrammarRegistry::default();
1770
1771        // "bash" should resolve to "Bourne Again Shell (bash)" via alias
1772        let syntax = registry.find_syntax_by_name("bash");
1773        assert!(syntax.is_some(), "alias 'bash' should resolve");
1774        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1775
1776        // "cpp" should resolve to "C++"
1777        let syntax = registry.find_syntax_by_name("cpp");
1778        assert!(syntax.is_some(), "alias 'cpp' should resolve");
1779        assert_eq!(syntax.unwrap().name, "C++");
1780
1781        // "csharp" should resolve to "C#"
1782        let syntax = registry.find_syntax_by_name("csharp");
1783        assert!(syntax.is_some(), "alias 'csharp' should resolve");
1784        assert_eq!(syntax.unwrap().name, "C#");
1785
1786        // "sh" should also resolve to bash
1787        let syntax = registry.find_syntax_by_name("sh");
1788        assert!(syntax.is_some(), "alias 'sh' should resolve");
1789        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1790
1791        // "proto" should resolve to "Protocol Buffers"
1792        let syntax = registry.find_syntax_by_name("proto");
1793        assert!(syntax.is_some(), "alias 'proto' should resolve");
1794        assert_eq!(syntax.unwrap().name, "Protocol Buffers");
1795    }
1796
1797    #[test]
1798    fn test_alias_case_insensitive_input() {
1799        let registry = GrammarRegistry::default();
1800
1801        // Aliases should be case-insensitive on input
1802        let syntax = registry.find_syntax_by_name("BASH");
1803        assert!(
1804            syntax.is_some(),
1805            "alias 'BASH' should resolve case-insensitively"
1806        );
1807        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1808
1809        let syntax = registry.find_syntax_by_name("Cpp");
1810        assert!(
1811            syntax.is_some(),
1812            "alias 'Cpp' should resolve case-insensitively"
1813        );
1814        assert_eq!(syntax.unwrap().name, "C++");
1815    }
1816
1817    #[test]
1818    fn test_full_name_still_works() {
1819        let registry = GrammarRegistry::default();
1820
1821        // Full names should still work (exact match)
1822        let syntax = registry.find_syntax_by_name("Bourne Again Shell (bash)");
1823        assert!(syntax.is_some(), "full name should still resolve");
1824        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1825
1826        // Case-insensitive full name should still work
1827        let syntax = registry.find_syntax_by_name("bourne again shell (bash)");
1828        assert!(
1829            syntax.is_some(),
1830            "case-insensitive full name should resolve"
1831        );
1832        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1833    }
1834
1835    #[test]
1836    fn test_alias_does_not_shadow_full_names() {
1837        let registry = GrammarRegistry::default();
1838
1839        // "Rust" should resolve directly via case-insensitive match, not via alias
1840        let syntax = registry.find_syntax_by_name("rust");
1841        assert!(syntax.is_some());
1842        assert_eq!(syntax.unwrap().name, "Rust");
1843
1844        // "Go" should resolve directly
1845        let syntax = registry.find_syntax_by_name("go");
1846        assert!(syntax.is_some());
1847        assert_eq!(syntax.unwrap().name, "Go");
1848    }
1849
1850    #[test]
1851    fn test_register_alias_rejects_collision() {
1852        let mut registry = GrammarRegistry::default();
1853
1854        // Trying to register an alias that maps to two different targets should fail
1855        assert!(registry.register_alias("myalias", "Rust"));
1856        assert!(!registry.register_alias("myalias", "Go"));
1857
1858        // Same mapping is fine (idempotent)
1859        assert!(registry.register_alias("myalias", "Rust"));
1860    }
1861
1862    #[test]
1863    fn test_register_alias_rejects_nonexistent_target() {
1864        let mut registry = GrammarRegistry::default();
1865        assert!(!registry.register_alias("nope", "Nonexistent Grammar"));
1866    }
1867
1868    #[test]
1869    fn test_register_alias_skips_existing_grammar_name() {
1870        let mut registry = GrammarRegistry::default();
1871
1872        // "rust" case-insensitively matches the grammar "Rust", so no alias needed
1873        assert!(!registry.register_alias("rust", "Rust"));
1874        // Should still be resolvable via case-insensitive match
1875        assert!(registry.find_syntax_by_name("rust").is_some());
1876    }
1877
1878    #[test]
1879    fn test_available_grammar_info_includes_short_names() {
1880        let registry = GrammarRegistry::default();
1881        let infos = registry.available_grammar_info();
1882
1883        let bash_info = infos.iter().find(|g| g.name == "Bourne Again Shell (bash)");
1884        assert!(bash_info.is_some(), "bash grammar should be in the list");
1885        let bash_info = bash_info.unwrap();
1886        assert!(
1887            bash_info.short_name.is_some(),
1888            "bash grammar should have a short_name"
1889        );
1890        // The shortest alias for bash is "sh"
1891        assert_eq!(bash_info.short_name.as_deref(), Some("sh"));
1892    }
1893
1894    #[test]
1895    fn test_catalog_contains_each_language_once() {
1896        let registry = GrammarRegistry::default();
1897        let catalog = registry.catalog();
1898
1899        // Every catalog entry must have a unique (case-insensitive) display name.
1900        let mut seen = std::collections::HashSet::new();
1901        for entry in catalog {
1902            let key = entry.display_name.to_lowercase();
1903            assert!(
1904                seen.insert(key.clone()),
1905                "duplicate catalog entry for display_name={:?}",
1906                entry.display_name
1907            );
1908        }
1909
1910        // TypeScript is tree-sitter-only (syntect ships no grammar for it) yet
1911        // must still appear in the catalog.
1912        let ts = registry
1913            .find_by_name("TypeScript")
1914            .expect("TypeScript must be in the catalog");
1915        assert!(ts.engines.syntect.is_none());
1916        assert_eq!(
1917            ts.engines.tree_sitter,
1918            Some(fresh_languages::Language::TypeScript)
1919        );
1920        assert_eq!(ts.language_id, "typescript");
1921        assert!(ts.extensions.iter().any(|e| e == "ts"));
1922
1923        // Languages that exist in both syntect and tree-sitter (Rust, Python,
1924        // JavaScript) must appear exactly once and prefer the syntect engine.
1925        for name in ["Rust", "Python", "JavaScript"] {
1926            let entry = registry
1927                .find_by_name(name)
1928                .unwrap_or_else(|| panic!("{} must be in the catalog", name));
1929            assert!(
1930                entry.engines.syntect.is_some(),
1931                "{} should have a syntect index",
1932                name
1933            );
1934            assert!(
1935                entry.engines.tree_sitter.is_some(),
1936                "{} should also have a tree-sitter language",
1937                name
1938            );
1939            // Only one entry with this display name (already checked above),
1940            // but also verify language_id lookup lands on the same entry.
1941            let by_id = registry
1942                .find_by_name(&entry.language_id)
1943                .expect("language_id should resolve");
1944            assert_eq!(by_id.display_name, entry.display_name);
1945        }
1946    }
1947
1948    #[test]
1949    fn test_catalog_find_by_path_and_extension() {
1950        let registry = GrammarRegistry::default();
1951        let ts = registry
1952            .find_by_path(Path::new("foo.ts"), None)
1953            .expect("foo.ts should resolve");
1954        assert_eq!(ts.display_name, "TypeScript");
1955        let rs = registry.find_by_extension("rs").expect("rs should resolve");
1956        assert_eq!(rs.display_name, "Rust");
1957    }
1958
1959    /// Build a minimal LanguageConfig for tests.
1960    fn lang_cfg(
1961        grammar: &str,
1962        extensions: &[&str],
1963        filenames: &[&str],
1964    ) -> crate::config::LanguageConfig {
1965        crate::config::LanguageConfig {
1966            extensions: extensions.iter().map(|s| s.to_string()).collect(),
1967            filenames: filenames.iter().map(|s| s.to_string()).collect(),
1968            grammar: grammar.to_string(),
1969            comment_prefix: None,
1970            auto_indent: true,
1971            auto_close: None,
1972            auto_surround: None,
1973            textmate_grammar: None,
1974            show_whitespace_tabs: true,
1975            line_wrap: None,
1976            wrap_column: None,
1977            page_view: None,
1978            page_width: None,
1979            use_tabs: None,
1980            tab_size: None,
1981            formatter: None,
1982            format_on_save: false,
1983            on_save: vec![],
1984            word_characters: None,
1985        }
1986    }
1987
1988    /// Bug #1: a user-declared config key that aliases an existing grammar
1989    /// (e.g. `[languages.mylang] grammar = "Rust"`) must resolve via
1990    /// `find_by_name("mylang")` so the language palette can select it.
1991    #[test]
1992    fn test_user_alias_resolves_via_find_by_name() {
1993        let mut registry = GrammarRegistry::default();
1994        let mut languages = std::collections::HashMap::new();
1995        languages.insert("mylang".to_string(), lang_cfg("Rust", &[], &[]));
1996        registry.apply_language_config(&languages);
1997
1998        let entry = registry
1999            .find_by_name("mylang")
2000            .expect("user-declared alias 'mylang' must resolve");
2001        assert_eq!(entry.display_name, "Rust");
2002    }
2003
2004    /// Bug #2: `register_alias` used to rebuild the catalog from scratch,
2005    /// wiping out everything `apply_language_config` had merged. Registering
2006    /// an alias afterwards must not lose user config.
2007    #[test]
2008    fn test_register_alias_preserves_applied_language_config() {
2009        let mut registry = GrammarRegistry::default();
2010        let mut languages = std::collections::HashMap::new();
2011        languages.insert(
2012            "shell-configs".to_string(),
2013            lang_cfg("bash", &["myconf"], &["*.myconf"]),
2014        );
2015        registry.apply_language_config(&languages);
2016
2017        // Sanity: config applied.
2018        assert!(registry.find_by_extension("myconf").is_some());
2019        assert!(
2020            registry
2021                .find_by_path(Path::new("foo.myconf"), None)
2022                .is_some(),
2023            "glob should match before register_alias"
2024        );
2025
2026        // Registering an alias must not erase the config we just applied.
2027        registry.register_alias("mycustom", "Rust");
2028
2029        assert!(
2030            registry.find_by_extension("myconf").is_some(),
2031            "config extension must survive register_alias"
2032        );
2033        assert!(
2034            registry
2035                .find_by_path(Path::new("foo.myconf"), None)
2036                .is_some(),
2037            "glob must survive register_alias"
2038        );
2039    }
2040
2041    /// Bug #4: `from_syntax_name` used to unconditionally overwrite the
2042    /// catalog's canonical display name with whatever the user typed (e.g.
2043    /// "BASH") — that string ended up in the status bar.
2044    #[test]
2045    fn test_from_syntax_name_preserves_canonical_display_name() {
2046        use crate::primitives::detected_language::DetectedLanguage;
2047        let registry = GrammarRegistry::default();
2048        let languages = std::collections::HashMap::new();
2049
2050        let detected = DetectedLanguage::from_syntax_name("BASH", &registry, &languages)
2051            .expect("BASH should resolve via alias");
2052        assert_eq!(
2053            detected.display_name, "Bourne Again Shell (bash)",
2054            "display_name must be canonical, not user-typed"
2055        );
2056    }
2057
2058    /// A config-only language (no matching syntect grammar) must still appear
2059    /// in the catalog so the language palette can offer it — the old
2060    /// `DetectedLanguage::from_config_language` branch was load-bearing.
2061    #[test]
2062    fn test_config_only_language_appears_in_catalog() {
2063        let mut registry = GrammarRegistry::default();
2064        let mut languages = std::collections::HashMap::new();
2065        // "fish" isn't in syntect; grammar="fish" doesn't resolve either.
2066        languages.insert("fish".to_string(), lang_cfg("fish", &["fish"], &[]));
2067        registry.apply_language_config(&languages);
2068
2069        let entry = registry
2070            .find_by_name("fish")
2071            .expect("fish should be in the catalog after apply_language_config");
2072        assert!(entry.engines.syntect.is_none());
2073        assert!(entry.engines.tree_sitter.is_none());
2074        assert_eq!(entry.language_id, "fish");
2075        assert!(entry.extensions.iter().any(|e| e == "fish"));
2076    }
2077
2078    /// Config-declared extensions must override the built-in mapping. If the
2079    /// user says `[languages.typescript-overlay] extensions = ["js"] grammar
2080    /// = "TypeScript"`, then `foo.js` must resolve to TypeScript, not
2081    /// JavaScript.
2082    #[test]
2083    fn test_config_extension_overrides_builtin() {
2084        let mut registry = GrammarRegistry::default();
2085        // Sanity: default mapping is JavaScript.
2086        assert_eq!(
2087            registry.find_by_extension("js").unwrap().display_name,
2088            "JavaScript"
2089        );
2090
2091        let mut languages = std::collections::HashMap::new();
2092        languages.insert(
2093            "ts-overlay".to_string(),
2094            lang_cfg("TypeScript", &["js"], &[]),
2095        );
2096        registry.apply_language_config(&languages);
2097
2098        assert_eq!(
2099            registry.find_by_extension("js").unwrap().display_name,
2100            "TypeScript",
2101            "user-config extension must win over built-in"
2102        );
2103    }
2104
2105    /// Bare filenames listed by syntect grammars (e.g. "Gemfile", "Makefile",
2106    /// "Rakefile") must resolve through `find_by_path`. Syntect stores these
2107    /// in each grammar's `file_extensions` field alongside real extensions
2108    /// like "rb"; its own `find_syntax_for_file` treats them as either. The
2109    /// catalog has to do the same or `HighlightEngine::for_file` breaks for
2110    /// every extensionless config file.
2111    #[test]
2112    fn test_bare_filename_resolves_via_find_by_path() {
2113        let registry = GrammarRegistry::default();
2114        for (filename, expected_substr) in [
2115            ("Gemfile", "ruby"),
2116            ("Rakefile", "ruby"),
2117            ("Vagrantfile", "ruby"),
2118            ("Makefile", "makefile"),
2119            ("GNUmakefile", "makefile"),
2120        ] {
2121            let entry = registry
2122                .find_by_path(Path::new(filename), None)
2123                .unwrap_or_else(|| panic!("{} must resolve via catalog", filename));
2124            assert!(
2125                entry.display_name.to_lowercase().contains(expected_substr),
2126                "{} should resolve to {} grammar, got {}",
2127                filename,
2128                expected_substr,
2129                entry.display_name
2130            );
2131        }
2132    }
2133
2134    /// Languages that have both syntect and tree-sitter (e.g. JavaScript) must
2135    /// expose the union of both engines' extensions. Tree-sitter-javascript
2136    /// knows `.jsx`; syntect's JavaScript grammar does not. Both should route
2137    /// through the JavaScript catalog entry.
2138    #[test]
2139    fn test_jsx_resolves_to_javascript() {
2140        let registry = GrammarRegistry::default();
2141        let entry = registry
2142            .find_by_path(Path::new("foo.jsx"), None)
2143            .expect("foo.jsx must resolve");
2144        assert_eq!(entry.display_name, "JavaScript");
2145    }
2146
2147    /// `rebuild_catalog` must replay the last-applied language config so it
2148    /// can never silently wipe user `[languages]` rules. This is the invariant
2149    /// that keeps `register_alias`, `populate_built_in_aliases`, and any
2150    /// future rebuild callsite safe-by-construction.
2151    #[test]
2152    fn test_rebuild_catalog_replays_language_config() {
2153        let mut registry = GrammarRegistry::default();
2154        let mut languages = std::collections::HashMap::new();
2155        languages.insert(
2156            "myshell".to_string(),
2157            lang_cfg("bash", &["myext"], &["*.myglob"]),
2158        );
2159        registry.apply_language_config(&languages);
2160        assert!(registry.find_by_extension("myext").is_some());
2161        assert!(registry
2162            .find_by_path(Path::new("foo.myglob"), None)
2163            .is_some());
2164
2165        // Force a rebuild — the catalog gets wiped and re-populated from
2166        // syntect / tree-sitter, but user config must come back on top.
2167        registry.rebuild_catalog();
2168        assert!(
2169            registry.find_by_extension("myext").is_some(),
2170            "rebuild_catalog must replay applied user config"
2171        );
2172        assert!(
2173            registry
2174                .find_by_path(Path::new("foo.myglob"), None)
2175                .is_some(),
2176            "rebuild_catalog must replay user globs"
2177        );
2178    }
2179
2180    /// `apply_language_config` must be idempotent: calling it twice with the
2181    /// same config yields the same catalog state.
2182    #[test]
2183    fn test_apply_language_config_idempotent() {
2184        let mut registry = GrammarRegistry::default();
2185        let mut languages = std::collections::HashMap::new();
2186        languages.insert(
2187            "shell-cfg".to_string(),
2188            lang_cfg("bash", &["myconf"], &["*.myconf"]),
2189        );
2190
2191        registry.apply_language_config(&languages);
2192        let first_extensions = registry
2193            .find_by_name("bash")
2194            .unwrap()
2195            .extensions
2196            .iter()
2197            .filter(|e| e == &"myconf")
2198            .count();
2199        let first_globs = registry
2200            .find_by_name("bash")
2201            .unwrap()
2202            .filename_globs
2203            .iter()
2204            .filter(|g| g == &"*.myconf")
2205            .count();
2206        assert_eq!(first_extensions, 1);
2207        assert_eq!(first_globs, 1);
2208
2209        // Second call must not duplicate anything.
2210        registry.apply_language_config(&languages);
2211        let second_extensions = registry
2212            .find_by_name("bash")
2213            .unwrap()
2214            .extensions
2215            .iter()
2216            .filter(|e| e == &"myconf")
2217            .count();
2218        let second_globs = registry
2219            .find_by_name("bash")
2220            .unwrap()
2221            .filename_globs
2222            .iter()
2223            .filter(|g| g == &"*.myconf")
2224            .count();
2225        assert_eq!(second_extensions, 1, "extensions must not duplicate");
2226        assert_eq!(second_globs, 1, "globs must not duplicate");
2227    }
2228
2229    /// `tree_sitter_for_syntect_name` handles the alias table + strict
2230    /// display-name match. The alias table catches syntect's verbose names;
2231    /// the strict match handles the common case.
2232    #[test]
2233    fn test_tree_sitter_bridge() {
2234        assert_eq!(
2235            tree_sitter_for_syntect_name("Bourne Again Shell (bash)"),
2236            Some(fresh_languages::Language::Bash)
2237        );
2238        assert_eq!(
2239            tree_sitter_for_syntect_name("Rust"),
2240            Some(fresh_languages::Language::Rust)
2241        );
2242        // Must NOT fuzzy-match Nushell to Bash.
2243        assert_eq!(tree_sitter_for_syntect_name("Nushell"), None);
2244        // Must NOT match arbitrary strings.
2245        assert_eq!(tree_sitter_for_syntect_name("does-not-exist"), None);
2246    }
2247}