Skip to main content

fresh/primitives/grammar/
types.rs

1//! Pure grammar registry types without I/O operations.
2//!
3//! This module contains the `GrammarRegistry` struct and all syntax lookup methods
4//! that don't require filesystem access. This enables WASM compatibility and easier testing.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::sync::Arc;
10use syntect::parsing::{SyntaxDefinition, SyntaxReference, SyntaxSet, SyntaxSetBuilder};
11
12// Re-export glob matching utilities for use by other modules
13pub use crate::primitives::glob_match::{
14    filename_glob_matches, is_glob_pattern, is_path_pattern, path_glob_matches,
15};
16
/// Describes one grammar to load: the language it serves, the file holding its
/// definition, and the file extensions that should resolve to it.
///
/// A named struct is used (rather than an anonymous tuple) so the plugin layer,
/// the loader, and the registry can hand grammar information to each other
/// with self-describing fields.
#[derive(Debug, Clone)]
pub struct GrammarSpec {
    /// Identifier of the language, e.g. `"elixir"`.
    pub language: String,
    /// Location of the grammar definition file (`.sublime-syntax`).
    pub path: PathBuf,
    /// Extensions that should map to this grammar, e.g. `["ex", "exs"]`.
    pub extensions: Vec<String>,
}
30
31/// Where a grammar was loaded from.
32#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
33#[serde(tag = "type")]
34pub enum GrammarSource {
35    /// Built-in to Fresh (pre-compiled syntect defaults + embedded grammars)
36    #[serde(rename = "built-in")]
37    BuiltIn,
38    /// Installed from a user grammar directory (~/.config/fresh/grammars/)
39    #[serde(rename = "user")]
40    User { path: PathBuf },
41    /// From a language pack (~/.config/fresh/languages/packages/)
42    #[serde(rename = "language-pack")]
43    LanguagePack { name: String, path: PathBuf },
44    /// From a bundle package (~/.config/fresh/bundles/packages/)
45    #[serde(rename = "bundle")]
46    Bundle { name: String, path: PathBuf },
47    /// Registered by a plugin at runtime
48    #[serde(rename = "plugin")]
49    Plugin { plugin: String, path: PathBuf },
50}
51
52impl std::fmt::Display for GrammarSource {
53    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
54        match self {
55            GrammarSource::BuiltIn => write!(f, "built-in"),
56            GrammarSource::User { path } => write!(f, "user ({})", path.display()),
57            GrammarSource::LanguagePack { name, .. } => write!(f, "language-pack ({})", name),
58            GrammarSource::Bundle { name, .. } => write!(f, "bundle ({})", name),
59            GrammarSource::Plugin { plugin, .. } => write!(f, "plugin ({})", plugin),
60        }
61    }
62}
63
64/// Information about an available grammar, including its provenance.
65#[derive(Clone, Debug, Serialize, Deserialize)]
66pub struct GrammarInfo {
67    /// The grammar name as used in config files (case-insensitive matching)
68    pub name: String,
69    /// Where this grammar was loaded from
70    pub source: GrammarSource,
71    /// File extensions associated with this grammar
72    pub file_extensions: Vec<String>,
73    /// Optional short name alias (e.g., "bash" for "Bourne Again Shell (bash)")
74    #[serde(default, skip_serializing_if = "Option::is_none")]
75    pub short_name: Option<String>,
76}
77
78/// Bridge between syntect display names and `fresh_languages::Language`.
79///
80/// Most syntect grammars map one-to-one: "Rust" → `Language::Rust`. A few
81/// have verbose display names that don't match the tree-sitter enum's
82/// `display_name()`, and `Language::from_name` has fuzzy "contains shell"
83/// fallbacks that would wrongly tag Nushell as tree-sitter Bash. This is
84/// the one place we spell the exceptions out explicitly.
85const SYNTECT_TO_TREE_SITTER_ALIASES: &[(&str, fresh_languages::Language)] =
86    &[("Bourne Again Shell (bash)", fresh_languages::Language::Bash)];
87
88/// Resolve a syntect syntax display name to a tree-sitter language, using
89/// strict equality against the alias table and `Language::display_name()`.
90fn tree_sitter_for_syntect_name(display_name: &str) -> Option<fresh_languages::Language> {
91    for (syntect_name, lang) in SYNTECT_TO_TREE_SITTER_ALIASES {
92        if *syntect_name == display_name {
93            return Some(*lang);
94        }
95    }
96    fresh_languages::Language::all()
97        .iter()
98        .find(|l| l.display_name() == display_name)
99        .copied()
100}
101
102/// Which highlighters can serve a given `GrammarEntry`.
103///
104/// A catalog entry may come from syntect (a TextMate grammar indexed into
105/// `SyntaxSet`), tree-sitter (a `fresh_languages::Language`), or both.
106#[derive(Clone, Debug, Default)]
107pub struct GrammarEngines {
108    /// Index into `GrammarRegistry::syntax_set().syntaxes()`, if a syntect
109    /// grammar is available.
110    pub syntect: Option<usize>,
111    /// Tree-sitter language, if one is registered for this grammar.
112    pub tree_sitter: Option<fresh_languages::Language>,
113}
114
115/// A single entry in the unified grammar catalog.
116///
117/// Each entry represents one logical language (e.g. "Rust", "TypeScript") and
118/// records which highlighting engines can serve it, plus the names/extensions
119/// used to look it up. The catalog is the single source of truth for grammar
120/// lookups — `find_by_name`, `find_by_path`, `find_by_extension` all return
121/// entries from here, and both `HighlightEngine::from_entry` and
122/// `DetectedLanguage::from_entry` consume them.
123#[derive(Clone, Debug)]
124pub struct GrammarEntry {
125    /// Human-readable display name (e.g. "TypeScript", "Bourne Again Shell (bash)").
126    pub display_name: String,
127    /// Canonical language ID used in config and LSP (e.g. "typescript", "csharp").
128    pub language_id: String,
129    /// Short alias, if one exists (e.g. "ts" for TypeScript).
130    pub short_name: Option<String>,
131    /// File extensions (without leading dot).
132    pub extensions: Vec<String>,
133    /// Exact filenames that map to this grammar (e.g. "Dockerfile").
134    pub filenames: Vec<String>,
135    /// Filename globs from user config (e.g. "*.conf", "/etc/**/rc.*").
136    pub filename_globs: Vec<String>,
137    /// Where this grammar was loaded from.
138    pub source: GrammarSource,
139    /// Highlighters that can serve this entry.
140    pub engines: GrammarEngines,
141}
142
// Embedded grammar sources.
//
// Each constant below holds the text of a `.sublime-syntax` file, pulled into
// the binary at compile time with `include_str!`. They are parsed and
// registered by `GrammarRegistry::add_embedded_grammars`.

/// Embedded TOML grammar source (syntect doesn't include one).
pub const TOML_GRAMMAR: &str = include_str!("../../grammars/toml.sublime-syntax");

/// Embedded Odin grammar source (syntect doesn't include one).
/// From: https://github.com/Tetralux/sublime-odin (MIT License)
pub const ODIN_GRAMMAR: &str = include_str!("../../grammars/odin/Odin.sublime-syntax");

/// Embedded Zig grammar source (syntect doesn't include one).
pub const ZIG_GRAMMAR: &str = include_str!("../../grammars/zig.sublime-syntax");

/// Embedded Git Rebase Todo grammar source, used for interactive rebase files.
pub const GIT_REBASE_GRAMMAR: &str = include_str!("../../grammars/git-rebase.sublime-syntax");

/// Embedded Git Commit Message grammar source, for COMMIT_EDITMSG, MERGE_MSG, etc.
pub const GIT_COMMIT_GRAMMAR: &str = include_str!("../../grammars/git-commit.sublime-syntax");

/// Embedded Gitignore grammar source, for .gitignore and similar files.
pub const GITIGNORE_GRAMMAR: &str = include_str!("../../grammars/gitignore.sublime-syntax");

/// Embedded Git Config grammar source, for .gitconfig and .gitmodules.
pub const GITCONFIG_GRAMMAR: &str = include_str!("../../grammars/gitconfig.sublime-syntax");

/// Embedded Git Attributes grammar source, for .gitattributes.
pub const GITATTRIBUTES_GRAMMAR: &str = include_str!("../../grammars/gitattributes.sublime-syntax");

/// Embedded Typst grammar source (syntect doesn't include one).
pub const TYPST_GRAMMAR: &str = include_str!("../../grammars/typst.sublime-syntax");

/// Embedded Dockerfile grammar source.
pub const DOCKERFILE_GRAMMAR: &str = include_str!("../../grammars/dockerfile.sublime-syntax");
/// Embedded INI grammar source (also handles .env, .cfg, .editorconfig, etc.).
pub const INI_GRAMMAR: &str = include_str!("../../grammars/ini.sublime-syntax");
/// Embedded CMake grammar source.
pub const CMAKE_GRAMMAR: &str = include_str!("../../grammars/cmake.sublime-syntax");
/// Embedded SCSS grammar source.
pub const SCSS_GRAMMAR: &str = include_str!("../../grammars/scss.sublime-syntax");
/// Embedded LESS grammar source.
pub const LESS_GRAMMAR: &str = include_str!("../../grammars/less.sublime-syntax");
/// Embedded PowerShell grammar source.
pub const POWERSHELL_GRAMMAR: &str = include_str!("../../grammars/powershell.sublime-syntax");
/// Embedded Kotlin grammar source.
pub const KOTLIN_GRAMMAR: &str = include_str!("../../grammars/kotlin.sublime-syntax");
/// Embedded Swift grammar source.
pub const SWIFT_GRAMMAR: &str = include_str!("../../grammars/swift.sublime-syntax");
/// Embedded Dart grammar source.
pub const DART_GRAMMAR: &str = include_str!("../../grammars/dart.sublime-syntax");
/// Embedded Elixir grammar source.
pub const ELIXIR_GRAMMAR: &str = include_str!("../../grammars/elixir.sublime-syntax");
/// Embedded F# grammar source.
pub const FSHARP_GRAMMAR: &str = include_str!("../../grammars/fsharp.sublime-syntax");
/// Embedded Nix grammar source.
pub const NIX_GRAMMAR: &str = include_str!("../../grammars/nix.sublime-syntax");
/// Embedded HCL/Terraform grammar source.
pub const HCL_GRAMMAR: &str = include_str!("../../grammars/hcl.sublime-syntax");
/// Embedded Protocol Buffers grammar source.
pub const PROTOBUF_GRAMMAR: &str = include_str!("../../grammars/protobuf.sublime-syntax");
/// Embedded GraphQL grammar source.
pub const GRAPHQL_GRAMMAR: &str = include_str!("../../grammars/graphql.sublime-syntax");
/// Embedded Julia grammar source.
pub const JULIA_GRAMMAR: &str = include_str!("../../grammars/julia.sublime-syntax");
/// Embedded Nim grammar source.
pub const NIM_GRAMMAR: &str = include_str!("../../grammars/nim.sublime-syntax");
/// Embedded Gleam grammar source.
pub const GLEAM_GRAMMAR: &str = include_str!("../../grammars/gleam.sublime-syntax");
/// Embedded V language grammar source.
pub const VLANG_GRAMMAR: &str = include_str!("../../grammars/vlang.sublime-syntax");
/// Embedded Solidity grammar source.
pub const SOLIDITY_GRAMMAR: &str = include_str!("../../grammars/solidity.sublime-syntax");
/// Embedded KDL grammar source.
pub const KDL_GRAMMAR: &str = include_str!("../../grammars/kdl.sublime-syntax");
/// Embedded Nushell grammar source.
pub const NUSHELL_GRAMMAR: &str = include_str!("../../grammars/nushell.sublime-syntax");
/// Embedded Starlark/Bazel grammar source.
pub const STARLARK_GRAMMAR: &str = include_str!("../../grammars/starlark.sublime-syntax");
/// Embedded Justfile grammar source.
pub const JUSTFILE_GRAMMAR: &str = include_str!("../../grammars/justfile.sublime-syntax");
/// Embedded Earthfile grammar source.
pub const EARTHFILE_GRAMMAR: &str = include_str!("../../grammars/earthfile.sublime-syntax");
/// Embedded Go Module grammar source.
pub const GOMOD_GRAMMAR: &str = include_str!("../../grammars/gomod.sublime-syntax");
/// Embedded Vue grammar source.
pub const VUE_GRAMMAR: &str = include_str!("../../grammars/vue.sublime-syntax");
/// Embedded Svelte grammar source.
pub const SVELTE_GRAMMAR: &str = include_str!("../../grammars/svelte.sublime-syntax");
/// Embedded Astro grammar source.
pub const ASTRO_GRAMMAR: &str = include_str!("../../grammars/astro.sublime-syntax");
/// Embedded Hyprlang grammar source (Hyprland config).
pub const HYPRLANG_GRAMMAR: &str = include_str!("../../grammars/hyprlang.sublime-syntax");
/// Embedded AutoHotkey grammar source.
/// From: https://github.com/SALZKARTOFFEEEL/ahk-sublime-syntax (MIT License)
pub const AUTOHOTKEY_GRAMMAR: &str =
    include_str!("../../grammars/autohotkey/AutoHotkey.sublime-syntax");
235
236/// Registry of all available TextMate grammars.
237///
238/// This struct holds the compiled syntax set and provides lookup methods.
239/// It does not perform I/O directly - use `GrammarLoader` for loading grammars.
240impl std::fmt::Debug for GrammarRegistry {
241    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
242        f.debug_struct("GrammarRegistry")
243            .field("syntax_count", &self.syntax_set.syntaxes().len())
244            .finish()
245    }
246}
247
248pub struct GrammarRegistry {
249    /// Combined syntax set (built-in + embedded + user grammars)
250    syntax_set: Arc<SyntaxSet>,
251    /// Extension -> scope name mapping for user grammars (takes priority)
252    user_extensions: HashMap<String, String>,
253    /// Filename -> scope name mapping for dotfiles and special files
254    filename_scopes: HashMap<String, String>,
255    /// Paths to dynamically loaded grammar files (for reloading when adding more)
256    loaded_grammar_paths: Vec<GrammarSpec>,
257    /// Provenance info for each grammar (keyed by grammar name)
258    grammar_sources: HashMap<String, GrammarInfo>,
259    /// Short name aliases: lowercase short_name -> full syntect grammar name.
260    /// Provides a deterministic, one-to-one mapping so users can write
261    /// `grammar = "bash"` instead of `grammar = "Bourne Again Shell (bash)"`.
262    aliases: HashMap<String, String>,
263    /// Unified catalog of every known grammar. Rebuilt whenever the syntax set
264    /// or alias table changes. Lookups (`find_by_name`, `find_by_path`, ...)
265    /// all resolve against this.
266    catalog: Vec<GrammarEntry>,
267    /// Index from lowercased lookup keys (display name, language_id, short_name)
268    /// to catalog index.
269    catalog_by_name: HashMap<String, usize>,
270    /// Index from file extension (without dot) to catalog index.
271    catalog_by_extension: HashMap<String, usize>,
272    /// Index from filename to catalog index.
273    catalog_by_filename: HashMap<String, usize>,
274    /// The most recent language config handed to `apply_language_config`.
275    /// Retained so `rebuild_catalog` can replay it — otherwise a rebuild
276    /// (triggered by e.g. `populate_built_in_aliases`) silently wipes user
277    /// `[languages]` config that was merged on top.
278    applied_language_config: HashMap<String, crate::config::LanguageConfig>,
279}
280
impl GrammarRegistry {
    /// Build a registry from components that have already been assembled.
    ///
    /// Usually invoked by `GrammarLoader` implementations once they have
    /// collected grammars from their various sources. The registry starts with
    /// no tracked grammar paths and no provenance information; everything else
    /// is delegated to `new_with_loaded_paths`.
    pub(crate) fn new(
        syntax_set: SyntaxSet,
        user_extensions: HashMap<String, String>,
        filename_scopes: HashMap<String, String>,
    ) -> Self {
        let no_loaded_paths: Vec<GrammarSpec> = Vec::new();
        let no_sources: HashMap<String, GrammarInfo> = HashMap::new();
        Self::new_with_loaded_paths(
            syntax_set,
            user_extensions,
            filename_scopes,
            no_loaded_paths,
            no_sources,
        )
    }
299
300    /// Create a GrammarRegistry with pre-loaded grammar path tracking.
301    ///
302    /// Used by the loader when plugin grammars were included in the initial build,
303    /// so that `loaded_grammar_paths()` reflects what was actually loaded.
304    pub(crate) fn new_with_loaded_paths(
305        syntax_set: SyntaxSet,
306        user_extensions: HashMap<String, String>,
307        filename_scopes: HashMap<String, String>,
308        loaded_grammar_paths: Vec<GrammarSpec>,
309        grammar_sources: HashMap<String, GrammarInfo>,
310    ) -> Self {
311        let mut reg = Self {
312            syntax_set: Arc::new(syntax_set),
313            user_extensions,
314            filename_scopes,
315            loaded_grammar_paths,
316            grammar_sources,
317            aliases: HashMap::new(),
318            catalog: Vec::new(),
319            catalog_by_name: HashMap::new(),
320            catalog_by_extension: HashMap::new(),
321            catalog_by_filename: HashMap::new(),
322            applied_language_config: HashMap::new(),
323        };
324        reg.rebuild_catalog();
325        reg
326    }
327
328    /// Create an empty grammar registry (fast, for tests that don't need syntax highlighting)
329    pub fn empty() -> Arc<Self> {
330        let mut builder = SyntaxSetBuilder::new();
331        builder.add_plain_text_syntax();
332        let mut reg = Self {
333            syntax_set: Arc::new(builder.build()),
334            user_extensions: HashMap::new(),
335            filename_scopes: HashMap::new(),
336            loaded_grammar_paths: Vec::new(),
337            grammar_sources: HashMap::new(),
338            aliases: HashMap::new(),
339            catalog: Vec::new(),
340            catalog_by_name: HashMap::new(),
341            catalog_by_extension: HashMap::new(),
342            catalog_by_filename: HashMap::new(),
343            applied_language_config: HashMap::new(),
344        };
345        reg.rebuild_catalog();
346        Arc::new(reg)
347    }
348
349    /// Create a registry with only syntect's pre-compiled defaults (~0ms).
350    ///
351    /// This provides instant syntax highlighting for ~50 common languages
352    /// (Rust, Python, JS/TS, C/C++, Go, Java, HTML, CSS, Markdown, etc.)
353    /// without any `SyntaxSetBuilder::build()` call. Use this at startup,
354    /// then swap in a full registry built on a background thread.
355    pub fn defaults_only() -> Arc<Self> {
356        // Load pre-compiled syntax set (defaults + embedded grammars) from
357        // build-time packdump. This avoids the expensive into_builder() + build()
358        // cycle at runtime (~12s → ~300ms).
359        tracing::info!("defaults_only: loading pre-compiled syntax packdump...");
360        let syntax_set: SyntaxSet = syntect::dumps::from_uncompressed_data(include_bytes!(
361            concat!(env!("OUT_DIR"), "/default_syntaxes.packdump")
362        ))
363        .expect("Failed to load pre-compiled syntax packdump");
364        tracing::info!(
365            "defaults_only: loaded ({} syntaxes)",
366            syntax_set.syntaxes().len()
367        );
368        let grammar_sources = Self::build_grammar_sources_from_syntax_set(&syntax_set);
369        let filename_scopes = Self::build_filename_scopes();
370        let extra_extensions = Self::build_extra_extensions();
371        let mut registry = Self {
372            syntax_set: Arc::new(syntax_set),
373            user_extensions: extra_extensions,
374            filename_scopes,
375            loaded_grammar_paths: Vec::new(),
376            grammar_sources,
377            aliases: HashMap::new(),
378            catalog: Vec::new(),
379            catalog_by_name: HashMap::new(),
380            catalog_by_extension: HashMap::new(),
381            catalog_by_filename: HashMap::new(),
382            applied_language_config: HashMap::new(),
383        };
384        registry.populate_built_in_aliases();
385        registry.rebuild_catalog();
386        Arc::new(registry)
387    }
388
389    /// Build extra extension -> scope mappings for extensions not covered by syntect defaults.
390    ///
391    /// These map common file extensions to existing syntect grammar scopes,
392    /// filling gaps where syntect's built-in extension lists are incomplete.
393    pub(crate) fn build_extra_extensions() -> HashMap<String, String> {
394        let mut map = HashMap::new();
395
396        // JavaScript variants not in syntect defaults (["js", "htc"])
397        let js_scope = "source.js".to_string();
398        map.insert("cjs".to_string(), js_scope.clone());
399        map.insert("mjs".to_string(), js_scope);
400
401        // Dockerfile variants (e.g. Dockerfile.dev -> .dev extension)
402        // These won't match by extension, handled by filename_scopes and first_line_match
403
404        map
405    }
406
407    /// Build the default filename -> scope mappings for dotfiles and special files.
408    pub(crate) fn build_filename_scopes() -> HashMap<String, String> {
409        let mut map = HashMap::new();
410
411        // Shell configuration files -> Bash/Shell script scope
412        let shell_scope = "source.shell.bash".to_string();
413        for filename in [
414            ".zshrc",
415            ".zprofile",
416            ".zshenv",
417            ".zlogin",
418            ".zlogout",
419            ".bash_aliases",
420            // .bashrc and .bash_profile are already recognized by syntect
421            // Common shell script files without extensions
422            "PKGBUILD",
423            "APKBUILD",
424        ] {
425            map.insert(filename.to_string(), shell_scope.clone());
426        }
427
428        // Git rebase todo files
429        let git_rebase_scope = "source.git-rebase-todo".to_string();
430        map.insert("git-rebase-todo".to_string(), git_rebase_scope);
431
432        // Git commit message files
433        let git_commit_scope = "source.git-commit".to_string();
434        for filename in ["COMMIT_EDITMSG", "MERGE_MSG", "SQUASH_MSG", "TAG_EDITMSG"] {
435            map.insert(filename.to_string(), git_commit_scope.clone());
436        }
437
438        // Gitignore and similar files
439        let gitignore_scope = "source.gitignore".to_string();
440        for filename in [".gitignore", ".dockerignore", ".npmignore", ".hgignore"] {
441            map.insert(filename.to_string(), gitignore_scope.clone());
442        }
443
444        // Git config files
445        let gitconfig_scope = "source.gitconfig".to_string();
446        for filename in [".gitconfig", ".gitmodules"] {
447            map.insert(filename.to_string(), gitconfig_scope.clone());
448        }
449
450        // Git attributes files
451        let gitattributes_scope = "source.gitattributes".to_string();
452        map.insert(".gitattributes".to_string(), gitattributes_scope);
453
454        // Jenkinsfile -> Groovy
455        let groovy_scope = "source.groovy".to_string();
456        map.insert("Jenkinsfile".to_string(), groovy_scope);
457
458        // Vagrantfile -> Ruby (syntect already handles this, but be explicit)
459        // Brewfile -> Ruby
460        let ruby_scope = "source.ruby".to_string();
461        map.insert("Brewfile".to_string(), ruby_scope);
462
463        // Dockerfile and variants (exact names; Dockerfile.* handled via prefix check)
464        let dockerfile_scope = "source.dockerfile".to_string();
465        map.insert("Dockerfile".to_string(), dockerfile_scope.clone());
466        map.insert("Containerfile".to_string(), dockerfile_scope.clone());
467        // Common Dockerfile variants
468        map.insert("Dockerfile.dev".to_string(), dockerfile_scope.clone());
469        map.insert("Dockerfile.prod".to_string(), dockerfile_scope.clone());
470        map.insert("Dockerfile.test".to_string(), dockerfile_scope.clone());
471        map.insert("Dockerfile.build".to_string(), dockerfile_scope.clone());
472
473        // CMake
474        let cmake_scope = "source.cmake".to_string();
475        map.insert("CMakeLists.txt".to_string(), cmake_scope);
476
477        // Starlark/Bazel
478        let starlark_scope = "source.starlark".to_string();
479        map.insert("BUILD".to_string(), starlark_scope.clone());
480        map.insert("BUILD.bazel".to_string(), starlark_scope.clone());
481        map.insert("WORKSPACE".to_string(), starlark_scope.clone());
482        map.insert("WORKSPACE.bazel".to_string(), starlark_scope.clone());
483        map.insert("Tiltfile".to_string(), starlark_scope);
484
485        // Justfile (various casings)
486        let justfile_scope = "source.justfile".to_string();
487        map.insert("justfile".to_string(), justfile_scope.clone());
488        map.insert("Justfile".to_string(), justfile_scope.clone());
489        map.insert(".justfile".to_string(), justfile_scope);
490
491        // EditorConfig -> INI
492        let ini_scope = "source.ini".to_string();
493        map.insert(".editorconfig".to_string(), ini_scope);
494
495        // Earthfile
496        let earthfile_scope = "source.earthfile".to_string();
497        map.insert("Earthfile".to_string(), earthfile_scope);
498
499        // Hyprlang (Hyprland config files)
500        let hyprlang_scope = "source.hyprlang".to_string();
501        map.insert("hyprland.conf".to_string(), hyprlang_scope.clone());
502        map.insert("hyprpaper.conf".to_string(), hyprlang_scope.clone());
503        map.insert("hyprlock.conf".to_string(), hyprlang_scope);
504
505        // go.mod / go.sum
506        let gomod_scope = "source.gomod".to_string();
507        map.insert("go.mod".to_string(), gomod_scope.clone());
508        map.insert("go.sum".to_string(), gomod_scope);
509
510        map
511    }
512
513    /// Add embedded grammars (TOML, Odin, etc.) to a syntax set builder.
514    pub(crate) fn add_embedded_grammars(builder: &mut SyntaxSetBuilder) {
515        // TOML grammar
516        match SyntaxDefinition::load_from_str(TOML_GRAMMAR, true, Some("TOML")) {
517            Ok(syntax) => {
518                builder.add(syntax);
519                tracing::debug!("Loaded embedded TOML grammar");
520            }
521            Err(e) => {
522                tracing::warn!("Failed to load embedded TOML grammar: {}", e);
523            }
524        }
525
526        // Odin grammar
527        match SyntaxDefinition::load_from_str(ODIN_GRAMMAR, true, Some("Odin")) {
528            Ok(syntax) => {
529                builder.add(syntax);
530                tracing::debug!("Loaded embedded Odin grammar");
531            }
532            Err(e) => {
533                tracing::warn!("Failed to load embedded Odin grammar: {}", e);
534            }
535        }
536
537        // Zig grammar
538        match SyntaxDefinition::load_from_str(ZIG_GRAMMAR, true, Some("Zig")) {
539            Ok(syntax) => {
540                builder.add(syntax);
541                tracing::debug!("Loaded embedded Zig grammar");
542            }
543            Err(e) => {
544                tracing::warn!("Failed to load embedded Zig grammar: {}", e);
545            }
546        }
547
548        // Git Rebase Todo grammar
549        match SyntaxDefinition::load_from_str(GIT_REBASE_GRAMMAR, true, Some("Git Rebase Todo")) {
550            Ok(syntax) => {
551                builder.add(syntax);
552                tracing::debug!("Loaded embedded Git Rebase Todo grammar");
553            }
554            Err(e) => {
555                tracing::warn!("Failed to load embedded Git Rebase Todo grammar: {}", e);
556            }
557        }
558
559        // Git Commit Message grammar
560        match SyntaxDefinition::load_from_str(GIT_COMMIT_GRAMMAR, true, Some("Git Commit Message"))
561        {
562            Ok(syntax) => {
563                builder.add(syntax);
564                tracing::debug!("Loaded embedded Git Commit Message grammar");
565            }
566            Err(e) => {
567                tracing::warn!("Failed to load embedded Git Commit Message grammar: {}", e);
568            }
569        }
570
571        // Gitignore grammar
572        match SyntaxDefinition::load_from_str(GITIGNORE_GRAMMAR, true, Some("Gitignore")) {
573            Ok(syntax) => {
574                builder.add(syntax);
575                tracing::debug!("Loaded embedded Gitignore grammar");
576            }
577            Err(e) => {
578                tracing::warn!("Failed to load embedded Gitignore grammar: {}", e);
579            }
580        }
581
582        // Git Config grammar
583        match SyntaxDefinition::load_from_str(GITCONFIG_GRAMMAR, true, Some("Git Config")) {
584            Ok(syntax) => {
585                builder.add(syntax);
586                tracing::debug!("Loaded embedded Git Config grammar");
587            }
588            Err(e) => {
589                tracing::warn!("Failed to load embedded Git Config grammar: {}", e);
590            }
591        }
592
593        // Git Attributes grammar
594        match SyntaxDefinition::load_from_str(GITATTRIBUTES_GRAMMAR, true, Some("Git Attributes")) {
595            Ok(syntax) => {
596                builder.add(syntax);
597                tracing::debug!("Loaded embedded Git Attributes grammar");
598            }
599            Err(e) => {
600                tracing::warn!("Failed to load embedded Git Attributes grammar: {}", e);
601            }
602        }
603
604        // Typst grammar
605        match SyntaxDefinition::load_from_str(TYPST_GRAMMAR, true, Some("Typst")) {
606            Ok(syntax) => {
607                builder.add(syntax);
608                tracing::debug!("Loaded embedded Typst grammar");
609            }
610            Err(e) => {
611                tracing::warn!("Failed to load embedded Typst grammar: {}", e);
612            }
613        }
614
615        // Additional embedded grammars for languages not in syntect defaults
616        let additional_grammars: &[(&str, &str)] = &[
617            (DOCKERFILE_GRAMMAR, "Dockerfile"),
618            (INI_GRAMMAR, "INI"),
619            (CMAKE_GRAMMAR, "CMake"),
620            (SCSS_GRAMMAR, "SCSS"),
621            (LESS_GRAMMAR, "LESS"),
622            (POWERSHELL_GRAMMAR, "PowerShell"),
623            (KOTLIN_GRAMMAR, "Kotlin"),
624            (SWIFT_GRAMMAR, "Swift"),
625            (DART_GRAMMAR, "Dart"),
626            (ELIXIR_GRAMMAR, "Elixir"),
627            (FSHARP_GRAMMAR, "FSharp"),
628            (NIX_GRAMMAR, "Nix"),
629            (HCL_GRAMMAR, "HCL"),
630            (PROTOBUF_GRAMMAR, "Protocol Buffers"),
631            (GRAPHQL_GRAMMAR, "GraphQL"),
632            (JULIA_GRAMMAR, "Julia"),
633            (NIM_GRAMMAR, "Nim"),
634            (GLEAM_GRAMMAR, "Gleam"),
635            (VLANG_GRAMMAR, "V"),
636            (SOLIDITY_GRAMMAR, "Solidity"),
637            (KDL_GRAMMAR, "KDL"),
638            (NUSHELL_GRAMMAR, "Nushell"),
639            (STARLARK_GRAMMAR, "Starlark"),
640            (JUSTFILE_GRAMMAR, "Justfile"),
641            (EARTHFILE_GRAMMAR, "Earthfile"),
642            (GOMOD_GRAMMAR, "Go Module"),
643            (VUE_GRAMMAR, "Vue"),
644            (SVELTE_GRAMMAR, "Svelte"),
645            (ASTRO_GRAMMAR, "Astro"),
646            (HYPRLANG_GRAMMAR, "Hyprlang"),
647            (AUTOHOTKEY_GRAMMAR, "AutoHotkey"),
648        ];
649
650        for (grammar_str, name) in additional_grammars {
651            match SyntaxDefinition::load_from_str(grammar_str, true, Some(name)) {
652                Ok(syntax) => {
653                    builder.add(syntax);
654                    tracing::debug!("Loaded embedded {} grammar", name);
655                }
656                Err(e) => {
657                    tracing::warn!("Failed to load embedded {} grammar: {}", name, e);
658                }
659            }
660        }
661    }
662
663    /// Find syntax for a file by path/extension/filename.
664    ///
665    /// Checks in order:
666    /// 1. User-configured grammar extensions (by scope)
667    /// 2. By extension (includes built-in + embedded grammars)
668    /// 3. By filename (custom dotfile mappings like .zshrc)
669    /// 4. By filename via syntect (handles Makefile, .bashrc, etc.)
670    pub fn find_syntax_for_file(&self, path: &Path) -> Option<&SyntaxReference> {
671        if let Some(entry) = self.find_by_path(path) {
672            // Return the syntect grammar if one is attached; otherwise bail.
673            // We must NOT fall through to syntect's own detection here — the
674            // catalog match (e.g. a user-declared "fish" or tree-sitter-only
675            // TypeScript) is authoritative, and syntect might misclassify
676            // `.fish` as bash via its built-in heuristics.
677            return entry
678                .engines
679                .syntect
680                .map(|i| &self.syntax_set.syntaxes()[i]);
681        }
682        // No catalog match — try syntect's file detection (first-line /
683        // Makefile-style filenames that aren't in filename_scopes).
684        self.syntax_set.find_syntax_for_file(path).ok().flatten()
685    }
686
687    /// Find syntax by name, with alias resolution.
688    ///
689    /// Thin wrapper around `find_by_name` that returns the associated syntect
690    /// `SyntaxReference`. Tree-sitter-only entries return `None`.
691    ///
692    /// Falls back to a direct syntect lookup for "Plain Text", which the
693    /// catalog deliberately omits but syntect still exposes.
694    pub fn find_syntax_by_name(&self, name: &str) -> Option<&SyntaxReference> {
695        if let Some(entry) = self.find_by_name(name) {
696            if let Some(idx) = entry.engines.syntect {
697                return Some(&self.syntax_set.syntaxes()[idx]);
698            }
699        }
700        // Plain Text is excluded from the catalog (it's not a "grammar" a user
701        // would ever pick), but syntect still stores it and a handful of
702        // callers still ask for it by name.
703        self.syntax_set.find_syntax_by_name(name)
704    }
705
706    // === Alias management ===
707
708    /// Hardcoded short-name aliases for built-in and embedded grammars.
709    ///
710    /// Each entry maps a short name (lowercase) to the exact syntect grammar name.
711    /// Only grammars whose full name differs significantly from a natural short
712    /// form need an entry here. Grammars already short (e.g., "Rust", "Go") are
713    /// reachable via case-insensitive matching and don't need aliases.
714    fn built_in_aliases() -> Vec<(&'static str, &'static str)> {
715        vec![
716            // Syntect built-in grammars with verbose names
717            ("bash", "Bourne Again Shell (bash)"),
718            ("shell", "Bourne Again Shell (bash)"),
719            ("sh", "Bourne Again Shell (bash)"),
720            ("c++", "C++"),
721            ("cpp", "C++"),
722            ("csharp", "C#"),
723            ("objc", "Objective-C"),
724            ("objcpp", "Objective-C++"),
725            ("regex", "Regular Expressions (Python)"),
726            ("regexp", "Regular Expressions (Python)"),
727            // Embedded grammars with multi-word or non-obvious names
728            ("proto", "Protocol Buffers"),
729            ("protobuf", "Protocol Buffers"),
730            ("gomod", "Go Module"),
731            ("git-rebase", "Git Rebase Todo"),
732            ("git-commit", "Git Commit Message"),
733            ("git-config", "Git Config"),
734            ("git-attributes", "Git Attributes"),
735            ("gitignore", "Gitignore"),
736            ("fsharp", "FSharp"),
737            ("f#", "FSharp"),
738            ("terraform", "HCL"),
739            ("tf", "HCL"),
740            ("ts", "TypeScript"),
741            ("js", "JavaScript"),
742            ("py", "Python"),
743            ("rb", "Ruby"),
744            ("rs", "Rust"),
745            ("md", "Markdown"),
746            ("yml", "YAML"),
747            ("dockerfile", "Dockerfile"),
748        ]
749    }
750
751    /// Populate aliases from the built-in table.
752    ///
753    /// Validates that:
754    /// - Each alias target (full name) exists in the syntax set
755    /// - No alias collides (case-insensitive) with an existing grammar full name
756    /// - No duplicate aliases exist
757    pub(crate) fn populate_built_in_aliases(&mut self) {
758        for (short, full) in Self::built_in_aliases() {
759            self.register_alias_inner(short, full, true);
760        }
761        self.rebuild_catalog();
762    }
763
764    /// Register a short-name alias for a grammar.
765    ///
766    /// Returns `true` if the alias was registered, `false` if rejected due to
767    /// collision or missing target. For built-in aliases, collisions panic
768    /// (they indicate a bug). For dynamic aliases, collisions log a warning.
769    ///
770    /// Splices the alias directly into the catalog rather than rebuilding, so
771    /// any user config previously merged via `apply_language_config` is
772    /// preserved. A full rebuild would wipe those entries.
773    pub(crate) fn register_alias(&mut self, short_name: &str, full_name: &str) -> bool {
774        if !self.register_alias_inner(short_name, full_name, false) {
775            return false;
776        }
777        let short_lower = short_name.to_lowercase();
778        let full_lower = full_name.to_lowercase();
779        if let Some(&idx) = self.catalog_by_name.get(&full_lower) {
780            self.catalog_by_name
781                .entry(short_lower.clone())
782                .or_insert(idx);
783            let entry = &mut self.catalog[idx];
784            let replace = match &entry.short_name {
785                None => true,
786                Some(existing) => short_name.len() < existing.len(),
787            };
788            if replace {
789                entry.short_name = Some(short_lower);
790            }
791        }
792        true
793    }
794
795    fn register_alias_inner(
796        &mut self,
797        short_name: &str,
798        full_name: &str,
799        is_built_in: bool,
800    ) -> bool {
801        let short_lower = short_name.to_lowercase();
802
803        // Validate: target grammar must exist in the syntax set
804        let target_exists = self
805            .syntax_set
806            .syntaxes()
807            .iter()
808            .any(|s| s.name.eq_ignore_ascii_case(full_name));
809        if !target_exists {
810            if is_built_in {
811                // Built-in alias targets should always exist; warn but don't panic
812                // (grammar might have been removed from syntect upstream)
813                tracing::warn!(
814                    "[grammar-alias] Built-in alias '{}' -> '{}': target grammar not found, skipping",
815                    short_name, full_name
816                );
817            } else {
818                tracing::warn!(
819                    "[grammar-alias] Alias '{}' -> '{}': target grammar not found, skipping",
820                    short_name,
821                    full_name
822                );
823            }
824            return false;
825        }
826
827        // Validate: short name must not collide (case-insensitive) with any grammar full name
828        let collides_with_full_name = self
829            .syntax_set
830            .syntaxes()
831            .iter()
832            .any(|s| s.name.eq_ignore_ascii_case(&short_lower));
833        if collides_with_full_name {
834            // This is actually fine — the short name matches a full name directly,
835            // so find_syntax_by_name's case-insensitive search will find it.
836            // No alias needed.
837            tracing::debug!(
838                "[grammar-alias] Alias '{}' matches an existing grammar name, skipping (not needed)",
839                short_name
840            );
841            return false;
842        }
843
844        // Validate: no duplicate alias (case-insensitive)
845        if let Some(existing_target) = self.aliases.get(&short_lower) {
846            if existing_target.eq_ignore_ascii_case(full_name) {
847                // Same mapping, no-op
848                return true;
849            }
850            let msg = format!(
851                "Alias '{}' already maps to '{}', cannot remap to '{}'",
852                short_name, existing_target, full_name
853            );
854            if is_built_in {
855                panic!("[grammar-alias] Built-in alias collision: {}", msg);
856            } else {
857                tracing::warn!("[grammar-alias] {}", msg);
858                return false;
859            }
860        }
861
862        // Resolve the exact syntect name (preserving original case)
863        let exact_name = self
864            .syntax_set
865            .syntaxes()
866            .iter()
867            .find(|s| s.name.eq_ignore_ascii_case(full_name))
868            .map(|s| s.name.clone())
869            .unwrap();
870
871        self.aliases.insert(short_lower, exact_name);
872        true
873    }
874
875    // === Unified catalog ===
876
    /// Rebuild the flat catalog of grammar entries and its lookup indices.
    ///
    /// Called after the syntax set, aliases, or filename scopes change.
    /// Produces one entry per logical language by merging:
    /// 1. Every `SyntaxReference` in the syntax set (except "Plain Text")
    /// 2. Every `fresh_languages::Language` not already covered by a syntect entry
    /// 3. Alias short-names attached to their target entry
    /// 4. Filename mappings from `filename_scopes` attached to their scope's entry
    /// 5. Extra extensions from `user_extensions` attached to their scope's entry
    ///
    /// The previous `catalog`, `catalog_by_name`, `catalog_by_extension` and
    /// `catalog_by_filename` are replaced wholesale.
    ///
    /// Automatically replays the last `apply_language_config` at the end, so
    /// user `[languages]` config survives any rebuild.
    pub(crate) fn rebuild_catalog(&mut self) {
        // Reverse-map: full_name (lowercase) -> shortest alias.
        //
        // Seed from the built-in alias table as well as the live `aliases`
        // HashMap: the live map only contains aliases whose target exists in
        // the syntect set, so tree-sitter-only entries (TypeScript) would
        // otherwise never get their short name ("ts").
        let mut short_by_full: HashMap<String, String> = HashMap::new();
        // Keeps the shortest alias per grammar. The comparison is strict, so
        // on equal length the alias recorded first is kept.
        let record = |map: &mut HashMap<String, String>, short: &str, full: &str| {
            let key = full.to_lowercase();
            let keep = match map.get(&key) {
                None => true,
                Some(existing) => short.len() < existing.len(),
            };
            if keep {
                map.insert(key, short.to_string());
            }
        };
        // Built-in table first (vector order is deterministic) ...
        for (short, full) in Self::built_in_aliases() {
            record(&mut short_by_full, short, full);
        }
        // ... then the live aliases.
        // NOTE(review): `self.aliases` is a HashMap, so its iteration order is
        // unspecified; two equally short live aliases for the same grammar
        // that both beat the built-in candidate could win in either order.
        for (short, full) in &self.aliases {
            record(&mut short_by_full, short, full);
        }

        // Language id for a syntect grammar: the id of the tree-sitter
        // language that `tree_sitter_for_syntect_name` returns for the display
        // name, or else the lowercased display name.
        let derive_language_id =
            |display_name: &str| -> (String, Option<fresh_languages::Language>) {
                let ts = tree_sitter_for_syntect_name(display_name);
                let id = ts
                    .map(|l| l.id().to_string())
                    .unwrap_or_else(|| display_name.to_lowercase());
                (id, ts)
            };

        let mut catalog: Vec<GrammarEntry> = Vec::new();
        // Syntect scope string -> catalog index, used below to attach
        // filename scopes and user extensions.
        let mut scope_to_index: HashMap<String, usize> = HashMap::new();

        // Syntect-backed entries (skip Plain Text).
        //
        // Syntect's `file_extensions` is a hybrid list: real extensions like
        // "rb" sit alongside bare filenames like "Gemfile", "Rakefile",
        // "Makefile". Syntect's own `find_syntax_for_file` tries each entry
        // against the whole filename AND against the path's extension, and
        // the catalog has to preserve that semantics. We keep everything in
        // `extensions` here and index each entry as *both* an extension and
        // a filename at the bottom of this method.
        for (idx, syntax) in self.syntax_set.syntaxes().iter().enumerate() {
            if syntax.name == "Plain Text" {
                continue;
            }
            let (language_id, tree_sitter) = derive_language_id(&syntax.name);
            let short_name = short_by_full.get(&syntax.name.to_lowercase()).cloned();
            // Grammars without recorded provenance are treated as built-in.
            let source = self
                .grammar_sources
                .get(&syntax.name)
                .map(|info| info.source.clone())
                .unwrap_or(GrammarSource::BuiltIn);
            // Catalog index differs from `idx` (the syntect index) because
            // Plain Text is skipped.
            let entry_index = catalog.len();
            scope_to_index.insert(syntax.scope.to_string(), entry_index);

            // Union syntect's file_extensions with tree-sitter's own
            // extension list when the entry carries both engines.
            // tree-sitter-javascript handles `.jsx`/`.mjs`/`.cjs` that
            // syntect's JS grammar doesn't list, and the old code used to
            // route those paths to tree-sitter via a separate lookup.
            let mut extensions = syntax.file_extensions.clone();
            if let Some(lang) = tree_sitter {
                for ext in lang.extensions() {
                    let ext = ext.to_string();
                    if !extensions.iter().any(|e| e == &ext) {
                        extensions.push(ext);
                    }
                }
            }

            catalog.push(GrammarEntry {
                display_name: syntax.name.clone(),
                language_id,
                short_name,
                extensions,
                filenames: Vec::new(),
                filename_globs: Vec::new(),
                source,
                engines: GrammarEngines {
                    // Index into `self.syntax_set.syntaxes()`.
                    syntect: Some(idx),
                    tree_sitter,
                },
            });
        }

        // Attach filename_scopes to their entries. Mappings whose scope has
        // no syntect-backed entry are silently dropped.
        for (filename, scope) in &self.filename_scopes {
            if let Some(&idx) = scope_to_index.get(scope) {
                if !catalog[idx].filenames.iter().any(|f| f == filename) {
                    catalog[idx].filenames.push(filename.clone());
                }
            }
        }

        // Attach user_extensions (extra → scope) to their entries. As above,
        // unknown scopes are silently dropped.
        for (ext, scope) in &self.user_extensions {
            if let Some(&idx) = scope_to_index.get(scope) {
                if !catalog[idx].extensions.iter().any(|e| e == ext) {
                    catalog[idx].extensions.push(ext.clone());
                }
            }
        }

        // Ensure every tree-sitter language has an entry. If a syntect entry
        // already maps to the same tree-sitter language, skip it; otherwise
        // add a tree-sitter-only entry so the catalog is complete (TypeScript
        // being the motivating example — syntect ships no grammar for it).
        let mut ts_covered: std::collections::HashSet<fresh_languages::Language> =
            std::collections::HashSet::new();
        for entry in &catalog {
            if let Some(lang) = entry.engines.tree_sitter {
                ts_covered.insert(lang);
            }
        }
        for lang in fresh_languages::Language::all() {
            if ts_covered.contains(lang) {
                continue;
            }
            let display_name = lang.display_name().to_string();
            let language_id = lang.id().to_string();
            let short_name = short_by_full.get(&display_name.to_lowercase()).cloned();
            let extensions: Vec<String> = lang.extensions().iter().map(|s| s.to_string()).collect();
            catalog.push(GrammarEntry {
                display_name,
                language_id,
                short_name,
                extensions,
                filenames: Vec::new(),
                filename_globs: Vec::new(),
                source: GrammarSource::BuiltIn,
                engines: GrammarEngines {
                    syntect: None,
                    tree_sitter: Some(*lang),
                },
            });
        }

        // Build name / extension / filename indices.
        //
        // Every entry in `extensions` gets indexed in BOTH `by_extension`
        // (lowercased) AND `by_filename` (exact case) — syntect's
        // `file_extensions` list holds both real extensions ("rb") and bare
        // filenames ("Gemfile", "Rakefile", "Makefile"). Indexing both ways
        // matches syntect's own `find_syntax_for_file` semantics.
        //
        // NOTE(review): precedence differs between the indices. `by_name`
        // uses plain `insert`, so a later catalog entry overwrites an earlier
        // one that shares a key; `by_extension` and `by_filename` use
        // `or_insert`, so the earliest entry wins. Confirm that grammars
        // appended later to the syntax set are meant to lose extension
        // conflicts against earlier ones.
        let mut by_name: HashMap<String, usize> = HashMap::new();
        let mut by_extension: HashMap<String, usize> = HashMap::new();
        let mut by_filename: HashMap<String, usize> = HashMap::new();
        for (idx, entry) in catalog.iter().enumerate() {
            by_name.insert(entry.display_name.to_lowercase(), idx);
            by_name.insert(entry.language_id.to_lowercase(), idx);
            if let Some(short) = &entry.short_name {
                by_name.insert(short.to_lowercase(), idx);
            }
            for ext in &entry.extensions {
                by_extension.entry(ext.to_lowercase()).or_insert(idx);
                by_filename.entry(ext.clone()).or_insert(idx);
            }
            for filename in &entry.filenames {
                by_filename.entry(filename.clone()).or_insert(idx);
            }
        }

        self.catalog = catalog;
        self.catalog_by_name = by_name;
        self.catalog_by_extension = by_extension;
        self.catalog_by_filename = by_filename;

        // Replay the most recent user config so a rebuild doesn't silently
        // wipe out user `[languages]` rules. `take` + restore avoids both a
        // clone and a borrow checker fight with `apply_language_config_inner`.
        if !self.applied_language_config.is_empty() {
            let cfg = std::mem::take(&mut self.applied_language_config);
            self.apply_language_config_inner(&cfg);
            self.applied_language_config = cfg;
        }
    }
1070
1071    /// Return the full catalog of grammar entries.
1072    pub fn catalog(&self) -> &[GrammarEntry] {
1073        &self.catalog
1074    }
1075
1076    /// Look up a grammar entry by display name, language ID, or short alias
1077    /// (case-insensitive). All aliases — built-in and user-config-declared —
1078    /// are indexed directly in `catalog_by_name` during `rebuild_catalog` /
1079    /// `register_alias` / `apply_language_config`, so a single lookup covers
1080    /// every case.
1081    pub fn find_by_name(&self, name: &str) -> Option<&GrammarEntry> {
1082        self.catalog_by_name
1083            .get(&name.to_lowercase())
1084            .map(|&idx| &self.catalog[idx])
1085    }
1086
1087    /// Look up a grammar entry by file path.
1088    ///
1089    /// Resolution order:
1090    /// 1. Exact filename (config-declared filenames and filename_scopes live here)
1091    /// 2. Glob patterns from user config (e.g. "*.conf", "/etc/**/rc.*")
1092    /// 3. File extension
1093    ///
1094    /// Globs take priority over extension so a user rule like `*.conf → bash`
1095    /// wins over any built-in extension match on `.conf`.
1096    pub fn find_by_path(&self, path: &Path) -> Option<&GrammarEntry> {
1097        let filename = path.file_name().and_then(|n| n.to_str());
1098        let path_str = path.to_str().unwrap_or("");
1099
1100        if let Some(name) = filename {
1101            if let Some(&idx) = self.catalog_by_filename.get(name) {
1102                return Some(&self.catalog[idx]);
1103            }
1104        }
1105
1106        // Glob walk — filenames with globs are rare so linear scan is fine.
1107        if let Some(name) = filename {
1108            for entry in &self.catalog {
1109                for pattern in &entry.filename_globs {
1110                    let matched = if is_path_pattern(pattern) {
1111                        path_glob_matches(pattern, path_str)
1112                    } else {
1113                        filename_glob_matches(pattern, name)
1114                    };
1115                    if matched {
1116                        return Some(entry);
1117                    }
1118                }
1119            }
1120        }
1121
1122        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
1123            return self.find_by_extension(ext);
1124        }
1125        None
1126    }
1127
1128    /// Look up a grammar entry by file extension (case-insensitive, without dot).
1129    pub fn find_by_extension(&self, ext: &str) -> Option<&GrammarEntry> {
1130        self.catalog_by_extension
1131            .get(&ext.to_lowercase())
1132            .map(|&idx| &self.catalog[idx])
1133    }
1134
1135    /// Merge user `[languages]` config into the catalog.
1136    ///
1137    /// For each config entry, resolves its grammar to an existing catalog entry
1138    /// (by grammar name or by language id). Extensions are added and override
1139    /// the ext→entry index so config wins over built-in mappings. Filenames are
1140    /// split into exact matches (indexed) and globs (walked at lookup time).
1141    ///
1142    /// If no existing entry matches, a new engine-less entry is created so the
1143    /// language still appears in the palette.
1144    ///
1145    /// Idempotent. The config is cached on the registry so `rebuild_catalog`
1146    /// can replay it — callers don't need to re-apply after a rebuild.
1147    pub fn apply_language_config(
1148        &mut self,
1149        languages: &HashMap<String, crate::config::LanguageConfig>,
1150    ) {
1151        self.applied_language_config = languages.clone();
1152        self.apply_language_config_inner(languages);
1153    }
1154
1155    /// Do the actual catalog splicing without touching
1156    /// `applied_language_config`. Called from `apply_language_config` (which
1157    /// records the input) and from `rebuild_catalog` (which replays the
1158    /// cached input after wiping the catalog).
1159    fn apply_language_config_inner(
1160        &mut self,
1161        languages: &HashMap<String, crate::config::LanguageConfig>,
1162    ) {
1163        for (lang_id, lang_cfg) in languages {
1164            let grammar_name = if lang_cfg.grammar.is_empty() {
1165                lang_id.as_str()
1166            } else {
1167                lang_cfg.grammar.as_str()
1168            };
1169
1170            // Resolve to an existing entry; fall back to creating one.
1171            let idx = self
1172                .catalog_by_name
1173                .get(&grammar_name.to_lowercase())
1174                .copied()
1175                .or_else(|| self.catalog_by_name.get(&lang_id.to_lowercase()).copied())
1176                .unwrap_or_else(|| {
1177                    let idx = self.catalog.len();
1178                    self.catalog.push(GrammarEntry {
1179                        display_name: lang_id.clone(),
1180                        language_id: lang_id.clone(),
1181                        short_name: None,
1182                        extensions: Vec::new(),
1183                        filenames: Vec::new(),
1184                        filename_globs: Vec::new(),
1185                        source: GrammarSource::BuiltIn,
1186                        engines: GrammarEngines::default(),
1187                    });
1188                    idx
1189                });
1190
1191            // Always index the config key so `find_by_name("mylang")` resolves
1192            // even when `mylang` aliases an existing grammar (e.g.
1193            // `[languages.mylang] grammar = "Rust"`). `or_insert` preserves
1194            // any existing mapping — won't clobber the canonical entry.
1195            self.catalog_by_name
1196                .entry(lang_id.to_lowercase())
1197                .or_insert(idx);
1198
1199            for ext in &lang_cfg.extensions {
1200                if !self.catalog[idx].extensions.iter().any(|e| e == ext) {
1201                    self.catalog[idx].extensions.push(ext.clone());
1202                }
1203                // Config-declared extensions override any previous mapping.
1204                self.catalog_by_extension.insert(ext.to_lowercase(), idx);
1205            }
1206            for filename in &lang_cfg.filenames {
1207                if is_glob_pattern(filename) {
1208                    if !self.catalog[idx]
1209                        .filename_globs
1210                        .iter()
1211                        .any(|f| f == filename)
1212                    {
1213                        self.catalog[idx].filename_globs.push(filename.clone());
1214                    }
1215                } else {
1216                    if !self.catalog[idx].filenames.iter().any(|f| f == filename) {
1217                        self.catalog[idx].filenames.push(filename.clone());
1218                    }
1219                    self.catalog_by_filename.insert(filename.clone(), idx);
1220                }
1221            }
1222        }
1223    }
1224
1225    /// Get the underlying syntax set
1226    pub fn syntax_set(&self) -> &Arc<SyntaxSet> {
1227        &self.syntax_set
1228    }
1229
1230    /// Get a clone of the Arc for sharing
1231    pub fn syntax_set_arc(&self) -> Arc<SyntaxSet> {
1232        Arc::clone(&self.syntax_set)
1233    }
1234
1235    /// List all available syntax names
1236    pub fn available_syntaxes(&self) -> Vec<&str> {
1237        self.syntax_set
1238            .syntaxes()
1239            .iter()
1240            .map(|s| s.name.as_str())
1241            .collect()
1242    }
1243
1244    /// List all available grammars with provenance information.
1245    ///
1246    /// Returns a sorted list of `GrammarInfo` entries derived from the unified
1247    /// catalog — this includes both syntect grammars and tree-sitter-only
1248    /// languages (like TypeScript). Each entry is listed exactly once even
1249    /// when both engines can serve it.
1250    pub fn available_grammar_info(&self) -> Vec<GrammarInfo> {
1251        let mut result: Vec<GrammarInfo> = self
1252            .catalog
1253            .iter()
1254            .map(|entry| GrammarInfo {
1255                name: entry.display_name.clone(),
1256                source: entry.source.clone(),
1257                file_extensions: entry.extensions.clone(),
1258                short_name: entry.short_name.clone(),
1259            })
1260            .collect();
1261        result.sort_by(|a, b| a.name.to_lowercase().cmp(&b.name.to_lowercase()));
1262        result
1263    }
1264
1265    /// Get the grammar sources map.
1266    pub(crate) fn grammar_sources(&self) -> &HashMap<String, GrammarInfo> {
1267        &self.grammar_sources
1268    }
1269
1270    /// Build grammar source info from a pre-compiled syntax set.
1271    ///
1272    /// All grammars in the packdump (syntect defaults + embedded) are tagged as built-in.
1273    pub(crate) fn build_grammar_sources_from_syntax_set(
1274        syntax_set: &SyntaxSet,
1275    ) -> HashMap<String, GrammarInfo> {
1276        let mut sources = HashMap::new();
1277        for syntax in syntax_set.syntaxes() {
1278            sources.insert(
1279                syntax.name.clone(),
1280                GrammarInfo {
1281                    name: syntax.name.clone(),
1282                    source: GrammarSource::BuiltIn,
1283                    file_extensions: syntax.file_extensions.clone(),
1284                    short_name: None,
1285                },
1286            );
1287        }
1288        sources
1289    }
1290
1291    /// Get the user extensions mapping (extension -> scope name).
1292    #[cfg(test)]
1293    pub(crate) fn user_extensions(&self) -> &HashMap<String, String> {
1294        &self.user_extensions
1295    }
1296
1297    /// Get the loaded grammar paths (for deduplication in flush_pending_grammars).
1298    #[cfg(test)]
1299    pub(crate) fn loaded_grammar_paths(&self) -> &[GrammarSpec] {
1300        &self.loaded_grammar_paths
1301    }
1302
1303    /// Create a new registry with additional grammar files
1304    ///
1305    /// This builds a new GrammarRegistry that includes all grammars from
1306    /// the base registry plus the additional grammars specified.
1307    /// Uses the base registry's syntax_set as the builder base, preserving
1308    /// all existing grammars (user grammars, language packs, etc.).
1309    ///
1310    /// # Arguments
1311    /// * `base` - The base registry to extend
1312    /// * `additional` - List of (language, path, extensions) tuples for new grammars
1313    ///
1314    /// # Returns
1315    /// A new GrammarRegistry with the additional grammars, or None if rebuilding fails
1316    pub fn with_additional_grammars(
1317        base: &GrammarRegistry,
1318        additional: &[GrammarSpec],
1319    ) -> Option<Self> {
1320        tracing::info!(
1321            "[SYNTAX DEBUG] with_additional_grammars: adding {} grammars to base with {} syntaxes",
1322            additional.len(),
1323            base.syntax_set.syntaxes().len()
1324        );
1325
1326        // Use the base registry's syntax_set as builder base — this preserves
1327        // ALL existing grammars (defaults, embedded, user, language packs)
1328        // without needing to reload them from disk.
1329        let mut builder = (*base.syntax_set).clone().into_builder();
1330
1331        // Preserve existing user extensions and add new ones
1332        let mut user_extensions = base.user_extensions.clone();
1333
1334        // Track loaded grammar paths (existing + new)
1335        let mut loaded_grammar_paths = base.loaded_grammar_paths.clone();
1336
1337        // Preserve existing grammar sources
1338        let mut grammar_sources = base.grammar_sources.clone();
1339
1340        // Add each new grammar
1341        for spec in additional {
1342            tracing::info!(
1343                "[SYNTAX DEBUG] loading new grammar file: lang='{}', path={:?}, extensions={:?}",
1344                spec.language,
1345                spec.path,
1346                spec.extensions
1347            );
1348            match Self::load_grammar_file(&spec.path) {
1349                Ok(syntax) => {
1350                    let scope = syntax.scope.to_string();
1351                    let syntax_name = syntax.name.clone();
1352                    tracing::info!(
1353                        "[SYNTAX DEBUG] grammar loaded successfully: name='{}', scope='{}'",
1354                        syntax_name,
1355                        scope
1356                    );
1357                    builder.add(syntax);
1358                    tracing::info!(
1359                        "Loaded grammar for '{}' from {:?} with extensions {:?}",
1360                        spec.language,
1361                        spec.path,
1362                        spec.extensions
1363                    );
1364                    // Register extensions for this grammar
1365                    for ext in &spec.extensions {
1366                        user_extensions.insert(ext.clone(), scope.clone());
1367                    }
1368                    // Track provenance
1369                    grammar_sources.insert(
1370                        syntax_name.clone(),
1371                        GrammarInfo {
1372                            name: syntax_name,
1373                            source: GrammarSource::Plugin {
1374                                plugin: spec.language.clone(),
1375                                path: spec.path.clone(),
1376                            },
1377                            file_extensions: spec.extensions.clone(),
1378                            short_name: None,
1379                        },
1380                    );
1381                    // Track this grammar path for future reloads
1382                    loaded_grammar_paths.push(spec.clone());
1383                }
1384                Err(e) => {
1385                    tracing::warn!(
1386                        "Failed to load grammar for '{}' from {:?}: {}",
1387                        spec.language,
1388                        spec.path,
1389                        e
1390                    );
1391                }
1392            }
1393        }
1394
1395        let mut reg = Self {
1396            syntax_set: Arc::new(builder.build()),
1397            user_extensions,
1398            filename_scopes: base.filename_scopes.clone(),
1399            loaded_grammar_paths,
1400            grammar_sources,
1401            aliases: base.aliases.clone(),
1402            catalog: Vec::new(),
1403            catalog_by_name: HashMap::new(),
1404            catalog_by_extension: HashMap::new(),
1405            catalog_by_filename: HashMap::new(),
1406            applied_language_config: HashMap::new(),
1407        };
1408        reg.rebuild_catalog();
1409        Some(reg)
1410    }
1411
1412    /// Load a grammar file from disk
1413    ///
1414    /// Only Sublime Text (.sublime-syntax) format is supported.
1415    /// TextMate (.tmLanguage) grammars use a completely different format
1416    /// and cannot be loaded by syntect's yaml-load feature.
1417    pub(crate) fn load_grammar_file(path: &Path) -> Result<SyntaxDefinition, String> {
1418        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
1419
1420        match ext {
1421            "sublime-syntax" => {
1422                let content = std::fs::read_to_string(path)
1423                    .map_err(|e| format!("Failed to read file: {}", e))?;
1424                SyntaxDefinition::load_from_str(
1425                    &content,
1426                    true,
1427                    path.file_stem().and_then(|s| s.to_str()),
1428                )
1429                .map_err(|e| format!("Failed to parse sublime-syntax: {}", e))
1430            }
1431            _ => Err(format!(
1432                "Unsupported grammar format: .{}. Only .sublime-syntax is supported.",
1433                ext
1434            )),
1435        }
1436    }
1437}
1438
1439impl Default for GrammarRegistry {
1440    fn default() -> Self {
1441        // Create with defaults and embedded grammars only (no user grammars)
1442        let defaults = SyntaxSet::load_defaults_newlines();
1443        let mut builder = defaults.into_builder();
1444        Self::add_embedded_grammars(&mut builder);
1445        let syntax_set = builder.build();
1446        let filename_scopes = Self::build_filename_scopes();
1447        let extra_extensions = Self::build_extra_extensions();
1448
1449        let mut registry = Self::new(syntax_set, extra_extensions, filename_scopes);
1450        registry.populate_built_in_aliases();
1451        registry.rebuild_catalog();
1452        registry
1453    }
1454}
1455
1456// VSCode package.json structures for parsing grammar manifests
1457
1458#[derive(Debug, Deserialize)]
1459pub struct PackageManifest {
1460    #[serde(default)]
1461    pub contributes: Option<Contributes>,
1462}
1463
1464#[derive(Debug, Deserialize, Default)]
1465pub struct Contributes {
1466    #[serde(default)]
1467    pub languages: Vec<LanguageContribution>,
1468    #[serde(default)]
1469    pub grammars: Vec<GrammarContribution>,
1470}
1471
1472#[derive(Debug, Deserialize)]
1473pub struct LanguageContribution {
1474    pub id: String,
1475    #[serde(default)]
1476    pub extensions: Vec<String>,
1477}
1478
1479#[derive(Debug, Deserialize)]
1480pub struct GrammarContribution {
1481    pub language: String,
1482    #[serde(rename = "scopeName")]
1483    pub scope_name: String,
1484    pub path: String,
1485}
1486
1487#[cfg(test)]
1488mod tests {
1489    use super::*;
1490
1491    #[test]
1492    fn test_empty_registry() {
1493        let registry = GrammarRegistry::empty();
1494        // Should have at least plain text
1495        assert!(!registry.available_syntaxes().is_empty());
1496    }
1497
1498    #[test]
1499    fn test_default_registry() {
1500        let registry = GrammarRegistry::default();
1501        // Should have built-in syntaxes
1502        assert!(!registry.available_syntaxes().is_empty());
1503    }
1504
1505    #[test]
1506    fn test_find_syntax_for_common_extensions() {
1507        let registry = GrammarRegistry::default();
1508
1509        // Test common extensions that syntect should support
1510        let test_cases = [
1511            ("test.py", true),
1512            ("test.rs", true),
1513            ("test.js", true),
1514            ("test.json", true),
1515            ("test.md", true),
1516            ("test.html", true),
1517            ("test.css", true),
1518            ("test.unknown_extension_xyz", false),
1519        ];
1520
1521        for (filename, should_exist) in test_cases {
1522            let path = Path::new(filename);
1523            let result = registry.find_syntax_for_file(path);
1524            assert_eq!(
1525                result.is_some(),
1526                should_exist,
1527                "Expected {:?} for {}",
1528                should_exist,
1529                filename
1530            );
1531        }
1532    }
1533
1534    #[test]
1535    fn test_syntax_set_arc() {
1536        let registry = GrammarRegistry::default();
1537        let arc1 = registry.syntax_set_arc();
1538        let arc2 = registry.syntax_set_arc();
1539        // Both should point to the same data
1540        assert!(Arc::ptr_eq(&arc1, &arc2));
1541    }
1542
1543    #[test]
1544    fn test_shell_dotfiles_detection() {
1545        let registry = GrammarRegistry::default();
1546
1547        // All these should be detected as shell scripts
1548        let shell_files = [".zshrc", ".zprofile", ".zshenv", ".bash_aliases"];
1549
1550        for filename in shell_files {
1551            let path = Path::new(filename);
1552            let result = registry.find_syntax_for_file(path);
1553            assert!(
1554                result.is_some(),
1555                "{} should be detected as a syntax",
1556                filename
1557            );
1558            let syntax = result.unwrap();
1559            // Should be detected as Bash/Shell
1560            assert!(
1561                syntax.name.to_lowercase().contains("bash")
1562                    || syntax.name.to_lowercase().contains("shell"),
1563                "{} should be detected as shell/bash, got: {}",
1564                filename,
1565                syntax.name
1566            );
1567        }
1568    }
1569
1570    #[test]
1571    fn test_pkgbuild_detection() {
1572        let registry = GrammarRegistry::default();
1573
1574        // PKGBUILD and APKBUILD should be detected as shell scripts
1575        for filename in ["PKGBUILD", "APKBUILD"] {
1576            let path = Path::new(filename);
1577            let result = registry.find_syntax_for_file(path);
1578            assert!(
1579                result.is_some(),
1580                "{} should be detected as a syntax",
1581                filename
1582            );
1583            let syntax = result.unwrap();
1584            // Should be detected as Bash/Shell
1585            assert!(
1586                syntax.name.to_lowercase().contains("bash")
1587                    || syntax.name.to_lowercase().contains("shell"),
1588                "{} should be detected as shell/bash, got: {}",
1589                filename,
1590                syntax.name
1591            );
1592        }
1593    }
1594
1595    #[test]
1596    fn test_find_syntax_with_glob_filenames() {
1597        let mut registry = GrammarRegistry::default();
1598        let mut languages = std::collections::HashMap::new();
1599        languages.insert(
1600            "shell-configs".to_string(),
1601            crate::config::LanguageConfig {
1602                extensions: vec!["sh".to_string()],
1603                filenames: vec!["*.conf".to_string(), "*rc".to_string()],
1604                grammar: "bash".to_string(),
1605                comment_prefix: Some("#".to_string()),
1606                auto_indent: true,
1607                auto_close: None,
1608                auto_surround: None,
1609                textmate_grammar: None,
1610                show_whitespace_tabs: true,
1611                line_wrap: None,
1612                wrap_column: None,
1613                page_view: None,
1614                page_width: None,
1615                use_tabs: None,
1616                tab_size: None,
1617                formatter: None,
1618                format_on_save: false,
1619                on_save: vec![],
1620                word_characters: None,
1621            },
1622        );
1623        registry.apply_language_config(&languages);
1624
1625        assert!(
1626            registry.find_by_path(Path::new("nftables.conf")).is_some(),
1627            "*.conf should match nftables.conf"
1628        );
1629        assert!(
1630            registry.find_by_path(Path::new("lfrc")).is_some(),
1631            "*rc should match lfrc"
1632        );
1633        // Unrelated file shouldn't panic.
1634        let _ = registry.find_by_path(Path::new("randomfile"));
1635    }
1636
1637    #[test]
1638    fn test_find_syntax_with_path_glob_filenames() {
1639        let mut registry = GrammarRegistry::default();
1640        let mut languages = std::collections::HashMap::new();
1641        languages.insert(
1642            "shell-configs".to_string(),
1643            crate::config::LanguageConfig {
1644                extensions: vec!["sh".to_string()],
1645                filenames: vec!["/etc/**/rc.*".to_string()],
1646                grammar: "bash".to_string(),
1647                comment_prefix: Some("#".to_string()),
1648                auto_indent: true,
1649                auto_close: None,
1650                auto_surround: None,
1651                textmate_grammar: None,
1652                show_whitespace_tabs: true,
1653                line_wrap: None,
1654                wrap_column: None,
1655                page_view: None,
1656                page_width: None,
1657                use_tabs: None,
1658                tab_size: None,
1659                formatter: None,
1660                format_on_save: false,
1661                on_save: vec![],
1662                word_characters: None,
1663            },
1664        );
1665        registry.apply_language_config(&languages);
1666
1667        assert!(
1668            registry.find_by_path(Path::new("/etc/rc.conf")).is_some(),
1669            "/etc/**/rc.* should match /etc/rc.conf"
1670        );
1671        assert!(
1672            registry
1673                .find_by_path(Path::new("/etc/init/rc.local"))
1674                .is_some(),
1675            "/etc/**/rc.* should match /etc/init/rc.local"
1676        );
1677        let _ = registry.find_by_path(Path::new("/var/rc.conf"));
1678    }
1679
1680    #[test]
1681    fn test_exact_filename_takes_priority_over_glob() {
1682        let mut registry = GrammarRegistry::default();
1683        let mut languages = std::collections::HashMap::new();
1684
1685        // A language with exact filename "lfrc" -> python grammar
1686        languages.insert(
1687            "custom-lfrc".to_string(),
1688            crate::config::LanguageConfig {
1689                extensions: vec![],
1690                filenames: vec!["lfrc".to_string()],
1691                grammar: "python".to_string(),
1692                comment_prefix: Some("#".to_string()),
1693                auto_indent: true,
1694                auto_close: None,
1695                auto_surround: None,
1696                textmate_grammar: None,
1697                show_whitespace_tabs: true,
1698                line_wrap: None,
1699                wrap_column: None,
1700                page_view: None,
1701                page_width: None,
1702                use_tabs: None,
1703                tab_size: None,
1704                formatter: None,
1705                format_on_save: false,
1706                on_save: vec![],
1707                word_characters: None,
1708            },
1709        );
1710
1711        // A language with glob "*rc" -> bash grammar
1712        languages.insert(
1713            "rc-files".to_string(),
1714            crate::config::LanguageConfig {
1715                extensions: vec![],
1716                filenames: vec!["*rc".to_string()],
1717                grammar: "bash".to_string(),
1718                comment_prefix: Some("#".to_string()),
1719                auto_indent: true,
1720                auto_close: None,
1721                auto_surround: None,
1722                textmate_grammar: None,
1723                show_whitespace_tabs: true,
1724                line_wrap: None,
1725                wrap_column: None,
1726                page_view: None,
1727                page_width: None,
1728                use_tabs: None,
1729                tab_size: None,
1730                formatter: None,
1731                format_on_save: false,
1732                on_save: vec![],
1733                word_characters: None,
1734            },
1735        );
1736
1737        registry.apply_language_config(&languages);
1738
1739        // "lfrc" should match the exact rule (python), not the glob (bash)
1740        let entry = registry.find_by_path(Path::new("lfrc")).unwrap();
1741        assert!(
1742            entry.display_name.to_lowercase().contains("python"),
1743            "exact match should win over glob, got: {}",
1744            entry.display_name
1745        );
1746    }
1747
1748    #[test]
1749    fn test_built_in_aliases_resolve() {
1750        let registry = GrammarRegistry::default();
1751
1752        // "bash" should resolve to "Bourne Again Shell (bash)" via alias
1753        let syntax = registry.find_syntax_by_name("bash");
1754        assert!(syntax.is_some(), "alias 'bash' should resolve");
1755        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1756
1757        // "cpp" should resolve to "C++"
1758        let syntax = registry.find_syntax_by_name("cpp");
1759        assert!(syntax.is_some(), "alias 'cpp' should resolve");
1760        assert_eq!(syntax.unwrap().name, "C++");
1761
1762        // "csharp" should resolve to "C#"
1763        let syntax = registry.find_syntax_by_name("csharp");
1764        assert!(syntax.is_some(), "alias 'csharp' should resolve");
1765        assert_eq!(syntax.unwrap().name, "C#");
1766
1767        // "sh" should also resolve to bash
1768        let syntax = registry.find_syntax_by_name("sh");
1769        assert!(syntax.is_some(), "alias 'sh' should resolve");
1770        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1771
1772        // "proto" should resolve to "Protocol Buffers"
1773        let syntax = registry.find_syntax_by_name("proto");
1774        assert!(syntax.is_some(), "alias 'proto' should resolve");
1775        assert_eq!(syntax.unwrap().name, "Protocol Buffers");
1776    }
1777
1778    #[test]
1779    fn test_alias_case_insensitive_input() {
1780        let registry = GrammarRegistry::default();
1781
1782        // Aliases should be case-insensitive on input
1783        let syntax = registry.find_syntax_by_name("BASH");
1784        assert!(
1785            syntax.is_some(),
1786            "alias 'BASH' should resolve case-insensitively"
1787        );
1788        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1789
1790        let syntax = registry.find_syntax_by_name("Cpp");
1791        assert!(
1792            syntax.is_some(),
1793            "alias 'Cpp' should resolve case-insensitively"
1794        );
1795        assert_eq!(syntax.unwrap().name, "C++");
1796    }
1797
1798    #[test]
1799    fn test_full_name_still_works() {
1800        let registry = GrammarRegistry::default();
1801
1802        // Full names should still work (exact match)
1803        let syntax = registry.find_syntax_by_name("Bourne Again Shell (bash)");
1804        assert!(syntax.is_some(), "full name should still resolve");
1805        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1806
1807        // Case-insensitive full name should still work
1808        let syntax = registry.find_syntax_by_name("bourne again shell (bash)");
1809        assert!(
1810            syntax.is_some(),
1811            "case-insensitive full name should resolve"
1812        );
1813        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1814    }
1815
1816    #[test]
1817    fn test_alias_does_not_shadow_full_names() {
1818        let registry = GrammarRegistry::default();
1819
1820        // "Rust" should resolve directly via case-insensitive match, not via alias
1821        let syntax = registry.find_syntax_by_name("rust");
1822        assert!(syntax.is_some());
1823        assert_eq!(syntax.unwrap().name, "Rust");
1824
1825        // "Go" should resolve directly
1826        let syntax = registry.find_syntax_by_name("go");
1827        assert!(syntax.is_some());
1828        assert_eq!(syntax.unwrap().name, "Go");
1829    }
1830
1831    #[test]
1832    fn test_register_alias_rejects_collision() {
1833        let mut registry = GrammarRegistry::default();
1834
1835        // Trying to register an alias that maps to two different targets should fail
1836        assert!(registry.register_alias("myalias", "Rust"));
1837        assert!(!registry.register_alias("myalias", "Go"));
1838
1839        // Same mapping is fine (idempotent)
1840        assert!(registry.register_alias("myalias", "Rust"));
1841    }
1842
1843    #[test]
1844    fn test_register_alias_rejects_nonexistent_target() {
1845        let mut registry = GrammarRegistry::default();
1846        assert!(!registry.register_alias("nope", "Nonexistent Grammar"));
1847    }
1848
1849    #[test]
1850    fn test_register_alias_skips_existing_grammar_name() {
1851        let mut registry = GrammarRegistry::default();
1852
1853        // "rust" case-insensitively matches the grammar "Rust", so no alias needed
1854        assert!(!registry.register_alias("rust", "Rust"));
1855        // Should still be resolvable via case-insensitive match
1856        assert!(registry.find_syntax_by_name("rust").is_some());
1857    }
1858
1859    #[test]
1860    fn test_available_grammar_info_includes_short_names() {
1861        let registry = GrammarRegistry::default();
1862        let infos = registry.available_grammar_info();
1863
1864        let bash_info = infos.iter().find(|g| g.name == "Bourne Again Shell (bash)");
1865        assert!(bash_info.is_some(), "bash grammar should be in the list");
1866        let bash_info = bash_info.unwrap();
1867        assert!(
1868            bash_info.short_name.is_some(),
1869            "bash grammar should have a short_name"
1870        );
1871        // The shortest alias for bash is "sh"
1872        assert_eq!(bash_info.short_name.as_deref(), Some("sh"));
1873    }
1874
1875    #[test]
1876    fn test_catalog_contains_each_language_once() {
1877        let registry = GrammarRegistry::default();
1878        let catalog = registry.catalog();
1879
1880        // Every catalog entry must have a unique (case-insensitive) display name.
1881        let mut seen = std::collections::HashSet::new();
1882        for entry in catalog {
1883            let key = entry.display_name.to_lowercase();
1884            assert!(
1885                seen.insert(key.clone()),
1886                "duplicate catalog entry for display_name={:?}",
1887                entry.display_name
1888            );
1889        }
1890
1891        // TypeScript is tree-sitter-only (syntect ships no grammar for it) yet
1892        // must still appear in the catalog.
1893        let ts = registry
1894            .find_by_name("TypeScript")
1895            .expect("TypeScript must be in the catalog");
1896        assert!(ts.engines.syntect.is_none());
1897        assert_eq!(
1898            ts.engines.tree_sitter,
1899            Some(fresh_languages::Language::TypeScript)
1900        );
1901        assert_eq!(ts.language_id, "typescript");
1902        assert!(ts.extensions.iter().any(|e| e == "ts"));
1903
1904        // Languages that exist in both syntect and tree-sitter (Rust, Python,
1905        // JavaScript) must appear exactly once and prefer the syntect engine.
1906        for name in ["Rust", "Python", "JavaScript"] {
1907            let entry = registry
1908                .find_by_name(name)
1909                .unwrap_or_else(|| panic!("{} must be in the catalog", name));
1910            assert!(
1911                entry.engines.syntect.is_some(),
1912                "{} should have a syntect index",
1913                name
1914            );
1915            assert!(
1916                entry.engines.tree_sitter.is_some(),
1917                "{} should also have a tree-sitter language",
1918                name
1919            );
1920            // Only one entry with this display name (already checked above),
1921            // but also verify language_id lookup lands on the same entry.
1922            let by_id = registry
1923                .find_by_name(&entry.language_id)
1924                .expect("language_id should resolve");
1925            assert_eq!(by_id.display_name, entry.display_name);
1926        }
1927    }
1928
1929    #[test]
1930    fn test_catalog_find_by_path_and_extension() {
1931        let registry = GrammarRegistry::default();
1932        let ts = registry
1933            .find_by_path(Path::new("foo.ts"))
1934            .expect("foo.ts should resolve");
1935        assert_eq!(ts.display_name, "TypeScript");
1936        let rs = registry.find_by_extension("rs").expect("rs should resolve");
1937        assert_eq!(rs.display_name, "Rust");
1938    }
1939
1940    /// Build a minimal LanguageConfig for tests.
1941    fn lang_cfg(
1942        grammar: &str,
1943        extensions: &[&str],
1944        filenames: &[&str],
1945    ) -> crate::config::LanguageConfig {
1946        crate::config::LanguageConfig {
1947            extensions: extensions.iter().map(|s| s.to_string()).collect(),
1948            filenames: filenames.iter().map(|s| s.to_string()).collect(),
1949            grammar: grammar.to_string(),
1950            comment_prefix: None,
1951            auto_indent: true,
1952            auto_close: None,
1953            auto_surround: None,
1954            textmate_grammar: None,
1955            show_whitespace_tabs: true,
1956            line_wrap: None,
1957            wrap_column: None,
1958            page_view: None,
1959            page_width: None,
1960            use_tabs: None,
1961            tab_size: None,
1962            formatter: None,
1963            format_on_save: false,
1964            on_save: vec![],
1965            word_characters: None,
1966        }
1967    }
1968
1969    /// Bug #1: a user-declared config key that aliases an existing grammar
1970    /// (e.g. `[languages.mylang] grammar = "Rust"`) must resolve via
1971    /// `find_by_name("mylang")` so the language palette can select it.
1972    #[test]
1973    fn test_user_alias_resolves_via_find_by_name() {
1974        let mut registry = GrammarRegistry::default();
1975        let mut languages = std::collections::HashMap::new();
1976        languages.insert("mylang".to_string(), lang_cfg("Rust", &[], &[]));
1977        registry.apply_language_config(&languages);
1978
1979        let entry = registry
1980            .find_by_name("mylang")
1981            .expect("user-declared alias 'mylang' must resolve");
1982        assert_eq!(entry.display_name, "Rust");
1983    }
1984
1985    /// Bug #2: `register_alias` used to rebuild the catalog from scratch,
1986    /// wiping out everything `apply_language_config` had merged. Registering
1987    /// an alias afterwards must not lose user config.
1988    #[test]
1989    fn test_register_alias_preserves_applied_language_config() {
1990        let mut registry = GrammarRegistry::default();
1991        let mut languages = std::collections::HashMap::new();
1992        languages.insert(
1993            "shell-configs".to_string(),
1994            lang_cfg("bash", &["myconf"], &["*.myconf"]),
1995        );
1996        registry.apply_language_config(&languages);
1997
1998        // Sanity: config applied.
1999        assert!(registry.find_by_extension("myconf").is_some());
2000        assert!(
2001            registry.find_by_path(Path::new("foo.myconf")).is_some(),
2002            "glob should match before register_alias"
2003        );
2004
2005        // Registering an alias must not erase the config we just applied.
2006        registry.register_alias("mycustom", "Rust");
2007
2008        assert!(
2009            registry.find_by_extension("myconf").is_some(),
2010            "config extension must survive register_alias"
2011        );
2012        assert!(
2013            registry.find_by_path(Path::new("foo.myconf")).is_some(),
2014            "glob must survive register_alias"
2015        );
2016    }
2017
2018    /// Bug #4: `from_syntax_name` used to unconditionally overwrite the
2019    /// catalog's canonical display name with whatever the user typed (e.g.
2020    /// "BASH") — that string ended up in the status bar.
2021    #[test]
2022    fn test_from_syntax_name_preserves_canonical_display_name() {
2023        use crate::primitives::detected_language::DetectedLanguage;
2024        let registry = GrammarRegistry::default();
2025        let languages = std::collections::HashMap::new();
2026
2027        let detected = DetectedLanguage::from_syntax_name("BASH", &registry, &languages)
2028            .expect("BASH should resolve via alias");
2029        assert_eq!(
2030            detected.display_name, "Bourne Again Shell (bash)",
2031            "display_name must be canonical, not user-typed"
2032        );
2033    }
2034
2035    /// A config-only language (no matching syntect grammar) must still appear
2036    /// in the catalog so the language palette can offer it — the old
2037    /// `DetectedLanguage::from_config_language` branch was load-bearing.
2038    #[test]
2039    fn test_config_only_language_appears_in_catalog() {
2040        let mut registry = GrammarRegistry::default();
2041        let mut languages = std::collections::HashMap::new();
2042        // "fish" isn't in syntect; grammar="fish" doesn't resolve either.
2043        languages.insert("fish".to_string(), lang_cfg("fish", &["fish"], &[]));
2044        registry.apply_language_config(&languages);
2045
2046        let entry = registry
2047            .find_by_name("fish")
2048            .expect("fish should be in the catalog after apply_language_config");
2049        assert!(entry.engines.syntect.is_none());
2050        assert!(entry.engines.tree_sitter.is_none());
2051        assert_eq!(entry.language_id, "fish");
2052        assert!(entry.extensions.iter().any(|e| e == "fish"));
2053    }
2054
2055    /// Config-declared extensions must override the built-in mapping. If the
2056    /// user says `[languages.typescript-overlay] extensions = ["js"] grammar
2057    /// = "TypeScript"`, then `foo.js` must resolve to TypeScript, not
2058    /// JavaScript.
2059    #[test]
2060    fn test_config_extension_overrides_builtin() {
2061        let mut registry = GrammarRegistry::default();
2062        // Sanity: default mapping is JavaScript.
2063        assert_eq!(
2064            registry.find_by_extension("js").unwrap().display_name,
2065            "JavaScript"
2066        );
2067
2068        let mut languages = std::collections::HashMap::new();
2069        languages.insert(
2070            "ts-overlay".to_string(),
2071            lang_cfg("TypeScript", &["js"], &[]),
2072        );
2073        registry.apply_language_config(&languages);
2074
2075        assert_eq!(
2076            registry.find_by_extension("js").unwrap().display_name,
2077            "TypeScript",
2078            "user-config extension must win over built-in"
2079        );
2080    }
2081
2082    /// Bare filenames listed by syntect grammars (e.g. "Gemfile", "Makefile",
2083    /// "Rakefile") must resolve through `find_by_path`. Syntect stores these
2084    /// in each grammar's `file_extensions` field alongside real extensions
2085    /// like "rb"; its own `find_syntax_for_file` treats them as either. The
2086    /// catalog has to do the same or `HighlightEngine::for_file` breaks for
2087    /// every extensionless config file.
2088    #[test]
2089    fn test_bare_filename_resolves_via_find_by_path() {
2090        let registry = GrammarRegistry::default();
2091        for (filename, expected_substr) in [
2092            ("Gemfile", "ruby"),
2093            ("Rakefile", "ruby"),
2094            ("Vagrantfile", "ruby"),
2095            ("Makefile", "makefile"),
2096            ("GNUmakefile", "makefile"),
2097        ] {
2098            let entry = registry
2099                .find_by_path(Path::new(filename))
2100                .unwrap_or_else(|| panic!("{} must resolve via catalog", filename));
2101            assert!(
2102                entry.display_name.to_lowercase().contains(expected_substr),
2103                "{} should resolve to {} grammar, got {}",
2104                filename,
2105                expected_substr,
2106                entry.display_name
2107            );
2108        }
2109    }
2110
2111    /// Languages that have both syntect and tree-sitter (e.g. JavaScript) must
2112    /// expose the union of both engines' extensions. Tree-sitter-javascript
2113    /// knows `.jsx`; syntect's JavaScript grammar does not. Both should route
2114    /// through the JavaScript catalog entry.
2115    #[test]
2116    fn test_jsx_resolves_to_javascript() {
2117        let registry = GrammarRegistry::default();
2118        let entry = registry
2119            .find_by_path(Path::new("foo.jsx"))
2120            .expect("foo.jsx must resolve");
2121        assert_eq!(entry.display_name, "JavaScript");
2122    }
2123
2124    /// `rebuild_catalog` must replay the last-applied language config so it
2125    /// can never silently wipe user `[languages]` rules. This is the invariant
2126    /// that keeps `register_alias`, `populate_built_in_aliases`, and any
2127    /// future rebuild callsite safe-by-construction.
2128    #[test]
2129    fn test_rebuild_catalog_replays_language_config() {
2130        let mut registry = GrammarRegistry::default();
2131        let mut languages = std::collections::HashMap::new();
2132        languages.insert(
2133            "myshell".to_string(),
2134            lang_cfg("bash", &["myext"], &["*.myglob"]),
2135        );
2136        registry.apply_language_config(&languages);
2137        assert!(registry.find_by_extension("myext").is_some());
2138        assert!(registry.find_by_path(Path::new("foo.myglob")).is_some());
2139
2140        // Force a rebuild — the catalog gets wiped and re-populated from
2141        // syntect / tree-sitter, but user config must come back on top.
2142        registry.rebuild_catalog();
2143        assert!(
2144            registry.find_by_extension("myext").is_some(),
2145            "rebuild_catalog must replay applied user config"
2146        );
2147        assert!(
2148            registry.find_by_path(Path::new("foo.myglob")).is_some(),
2149            "rebuild_catalog must replay user globs"
2150        );
2151    }
2152
2153    /// `apply_language_config` must be idempotent: calling it twice with the
2154    /// same config yields the same catalog state.
2155    #[test]
2156    fn test_apply_language_config_idempotent() {
2157        let mut registry = GrammarRegistry::default();
2158        let mut languages = std::collections::HashMap::new();
2159        languages.insert(
2160            "shell-cfg".to_string(),
2161            lang_cfg("bash", &["myconf"], &["*.myconf"]),
2162        );
2163
2164        registry.apply_language_config(&languages);
2165        let first_extensions = registry
2166            .find_by_name("bash")
2167            .unwrap()
2168            .extensions
2169            .iter()
2170            .filter(|e| e == &"myconf")
2171            .count();
2172        let first_globs = registry
2173            .find_by_name("bash")
2174            .unwrap()
2175            .filename_globs
2176            .iter()
2177            .filter(|g| g == &"*.myconf")
2178            .count();
2179        assert_eq!(first_extensions, 1);
2180        assert_eq!(first_globs, 1);
2181
2182        // Second call must not duplicate anything.
2183        registry.apply_language_config(&languages);
2184        let second_extensions = registry
2185            .find_by_name("bash")
2186            .unwrap()
2187            .extensions
2188            .iter()
2189            .filter(|e| e == &"myconf")
2190            .count();
2191        let second_globs = registry
2192            .find_by_name("bash")
2193            .unwrap()
2194            .filename_globs
2195            .iter()
2196            .filter(|g| g == &"*.myconf")
2197            .count();
2198        assert_eq!(second_extensions, 1, "extensions must not duplicate");
2199        assert_eq!(second_globs, 1, "globs must not duplicate");
2200    }
2201
2202    /// `tree_sitter_for_syntect_name` handles the alias table + strict
2203    /// display-name match. The alias table catches syntect's verbose names;
2204    /// the strict match handles the common case.
2205    #[test]
2206    fn test_tree_sitter_bridge() {
2207        assert_eq!(
2208            tree_sitter_for_syntect_name("Bourne Again Shell (bash)"),
2209            Some(fresh_languages::Language::Bash)
2210        );
2211        assert_eq!(
2212            tree_sitter_for_syntect_name("Rust"),
2213            Some(fresh_languages::Language::Rust)
2214        );
2215        // Must NOT fuzzy-match Nushell to Bash.
2216        assert_eq!(tree_sitter_for_syntect_name("Nushell"), None);
2217        // Must NOT match arbitrary strings.
2218        assert_eq!(tree_sitter_for_syntect_name("does-not-exist"), None);
2219    }
2220}