Skip to main content

fresh/primitives/grammar/
types.rs

1//! Pure grammar registry types without I/O operations.
2//!
3//! This module contains the `GrammarRegistry` struct and all syntax lookup methods
4//! that don't require filesystem access. This enables WASM compatibility and easier testing.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::sync::Arc;
10use syntect::parsing::{SyntaxDefinition, SyntaxReference, SyntaxSet, SyntaxSetBuilder};
11
12// Re-export glob matching utilities for use by other modules
13pub use crate::primitives::glob_match::{
14    filename_glob_matches, is_glob_pattern, is_path_pattern, path_glob_matches,
15};
16
/// A grammar specification: language name, path to grammar file, and associated file extensions.
///
/// Used to pass grammar information between the plugin layer, loader, and registry
/// without relying on anonymous tuples. `GrammarRegistry` stores a list of these in
/// its `loaded_grammar_paths` field.
#[derive(Clone, Debug)]
pub struct GrammarSpec {
    /// Language identifier (e.g., "elixir").
    pub language: String,
    /// Path to the grammar file (`.sublime-syntax`).
    pub path: PathBuf,
    /// File extensions to associate with this grammar (e.g., `["ex", "exs"]`).
    // NOTE(review): the examples carry no leading dot — presumably required; confirm against the loader.
    pub extensions: Vec<String>,
}
30
31/// Where a grammar was loaded from.
32#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
33#[serde(tag = "type")]
34pub enum GrammarSource {
35    /// Built-in to Fresh (pre-compiled syntect defaults + embedded grammars)
36    #[serde(rename = "built-in")]
37    BuiltIn,
38    /// Installed from a user grammar directory (~/.config/fresh/grammars/)
39    #[serde(rename = "user")]
40    User { path: PathBuf },
41    /// From a language pack (~/.config/fresh/languages/packages/)
42    #[serde(rename = "language-pack")]
43    LanguagePack { name: String, path: PathBuf },
44    /// From a bundle package (~/.config/fresh/bundles/packages/)
45    #[serde(rename = "bundle")]
46    Bundle { name: String, path: PathBuf },
47    /// Registered by a plugin at runtime
48    #[serde(rename = "plugin")]
49    Plugin { plugin: String, path: PathBuf },
50}
51
52impl std::fmt::Display for GrammarSource {
53    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
54        match self {
55            GrammarSource::BuiltIn => write!(f, "built-in"),
56            GrammarSource::User { path } => write!(f, "user ({})", path.display()),
57            GrammarSource::LanguagePack { name, .. } => write!(f, "language-pack ({})", name),
58            GrammarSource::Bundle { name, .. } => write!(f, "bundle ({})", name),
59            GrammarSource::Plugin { plugin, .. } => write!(f, "plugin ({})", plugin),
60        }
61    }
62}
63
/// Information about an available grammar, including its provenance.
///
/// `GrammarRegistry` keeps one of these per grammar in its `grammar_sources`
/// map, keyed by grammar name.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GrammarInfo {
    /// The grammar name as used in config files (case-insensitive matching).
    pub name: String,
    /// Where this grammar was loaded from.
    pub source: GrammarSource,
    /// File extensions associated with this grammar.
    pub file_extensions: Vec<String>,
    /// Optional short name alias (e.g., "bash" for "Bourne Again Shell (bash)").
    ///
    /// Omitted from serialized output when `None`, and defaults to `None`
    /// when the field is missing on deserialization.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub short_name: Option<String>,
}
77
/// Bridge between syntect display names and `fresh_languages::Language`.
///
/// Most syntect grammars map one-to-one: "Rust" → `Language::Rust`. A few
/// have verbose display names that don't match the tree-sitter enum's
/// `display_name()`, and `Language::from_name` has fuzzy "contains shell"
/// fallbacks that would wrongly tag Nushell as tree-sitter Bash. This is
/// the one place we spell the exceptions out explicitly.
///
/// Each pair is `(exact syntect display name, tree-sitter language)`; the
/// table is consulted by `tree_sitter_for_syntect_name` before it falls back
/// to comparing against `Language::display_name()`.
const SYNTECT_TO_TREE_SITTER_ALIASES: &[(&str, fresh_languages::Language)] =
    &[("Bourne Again Shell (bash)", fresh_languages::Language::Bash)];
87
88/// Resolve a syntect syntax display name to a tree-sitter language, using
89/// strict equality against the alias table and `Language::display_name()`.
90fn tree_sitter_for_syntect_name(display_name: &str) -> Option<fresh_languages::Language> {
91    for (syntect_name, lang) in SYNTECT_TO_TREE_SITTER_ALIASES {
92        if *syntect_name == display_name {
93            return Some(*lang);
94        }
95    }
96    fresh_languages::Language::all()
97        .iter()
98        .find(|l| l.display_name() == display_name)
99        .copied()
100}
101
/// Which highlighters can serve a given `GrammarEntry`.
///
/// A catalog entry may come from syntect (a TextMate grammar indexed into
/// `SyntaxSet`), tree-sitter (a `fresh_languages::Language`), or both.
/// The derived `Default` leaves both engines unset.
#[derive(Clone, Debug, Default)]
pub struct GrammarEngines {
    /// Index into `GrammarRegistry::syntax_set().syntaxes()`, if a syntect
    /// grammar is available. The index refers to the syntax set the catalog
    /// was built from.
    pub syntect: Option<usize>,
    /// Tree-sitter language, if one is registered for this grammar.
    pub tree_sitter: Option<fresh_languages::Language>,
}
114
/// A single entry in the unified grammar catalog.
///
/// Each entry represents one logical language (e.g. "Rust", "TypeScript") and
/// records which highlighting engines can serve it, plus the names/extensions
/// used to look it up. The catalog is the single source of truth for grammar
/// lookups — `find_by_name`, `find_by_path`, `find_by_extension` all return
/// entries from here, and both `HighlightEngine::from_entry` and
/// `DetectedLanguage::from_entry` consume them.
#[derive(Clone, Debug)]
pub struct GrammarEntry {
    /// Human-readable display name (e.g. "TypeScript", "Bourne Again Shell (bash)").
    pub display_name: String,
    /// Canonical language ID used in config and LSP (e.g. "typescript", "csharp").
    pub language_id: String,
    /// Short alias, if one exists (e.g. "ts" for TypeScript).
    pub short_name: Option<String>,
    /// File extensions (without leading dot).
    pub extensions: Vec<String>,
    /// Exact filenames that map to this grammar (e.g. "Dockerfile").
    pub filenames: Vec<String>,
    /// Filename globs from user config (e.g. "*.conf", "/etc/**/rc.*").
    // NOTE(review): presumably evaluated with the glob helpers re-exported at
    // the top of this module — confirm in the lookup code.
    pub filename_globs: Vec<String>,
    /// Where this grammar was loaded from.
    pub source: GrammarSource,
    /// Highlighters that can serve this entry (syntect, tree-sitter, or both).
    pub engines: GrammarEngines,
}
142
// Embedded `.sublime-syntax` grammar sources.
//
// Each constant holds the full text of a grammar file, pulled in at compile
// time with `include_str!` (paths are resolved relative to this source file).
// `GrammarRegistry::add_embedded_grammars` parses them and adds them to a
// `SyntaxSetBuilder`.

/// Embedded TOML grammar (syntect doesn't include one)
pub const TOML_GRAMMAR: &str = include_str!("../../grammars/toml.sublime-syntax");

/// Embedded Odin grammar (syntect doesn't include one)
/// From: https://github.com/Tetralux/sublime-odin (MIT License)
pub const ODIN_GRAMMAR: &str = include_str!("../../grammars/odin/Odin.sublime-syntax");

/// Embedded Zig grammar (syntect doesn't include one)
pub const ZIG_GRAMMAR: &str = include_str!("../../grammars/zig.sublime-syntax");

/// Embedded Git Rebase Todo grammar for interactive rebase
pub const GIT_REBASE_GRAMMAR: &str = include_str!("../../grammars/git-rebase.sublime-syntax");

/// Embedded Git Commit Message grammar for COMMIT_EDITMSG, MERGE_MSG, etc.
pub const GIT_COMMIT_GRAMMAR: &str = include_str!("../../grammars/git-commit.sublime-syntax");

/// Embedded Gitignore grammar for .gitignore and similar files
pub const GITIGNORE_GRAMMAR: &str = include_str!("../../grammars/gitignore.sublime-syntax");

/// Embedded Git Config grammar for .gitconfig, .gitmodules
pub const GITCONFIG_GRAMMAR: &str = include_str!("../../grammars/gitconfig.sublime-syntax");

/// Embedded Git Attributes grammar for .gitattributes
pub const GITATTRIBUTES_GRAMMAR: &str = include_str!("../../grammars/gitattributes.sublime-syntax");

/// Embedded Typst grammar (syntect doesn't include one)
pub const TYPST_GRAMMAR: &str = include_str!("../../grammars/typst.sublime-syntax");

/// Embedded Dockerfile grammar
pub const DOCKERFILE_GRAMMAR: &str = include_str!("../../grammars/dockerfile.sublime-syntax");
/// Embedded INI grammar (also handles .env, .cfg, .editorconfig, etc.)
pub const INI_GRAMMAR: &str = include_str!("../../grammars/ini.sublime-syntax");
/// Embedded CMake grammar
pub const CMAKE_GRAMMAR: &str = include_str!("../../grammars/cmake.sublime-syntax");
/// Embedded SCSS grammar
pub const SCSS_GRAMMAR: &str = include_str!("../../grammars/scss.sublime-syntax");
/// Embedded LESS grammar
pub const LESS_GRAMMAR: &str = include_str!("../../grammars/less.sublime-syntax");
/// Embedded PowerShell grammar
pub const POWERSHELL_GRAMMAR: &str = include_str!("../../grammars/powershell.sublime-syntax");
/// Embedded Kotlin grammar
pub const KOTLIN_GRAMMAR: &str = include_str!("../../grammars/kotlin.sublime-syntax");
/// Embedded Swift grammar
pub const SWIFT_GRAMMAR: &str = include_str!("../../grammars/swift.sublime-syntax");
/// Embedded Dart grammar
pub const DART_GRAMMAR: &str = include_str!("../../grammars/dart.sublime-syntax");
/// Embedded Elixir grammar
pub const ELIXIR_GRAMMAR: &str = include_str!("../../grammars/elixir.sublime-syntax");
/// Embedded F# grammar
pub const FSHARP_GRAMMAR: &str = include_str!("../../grammars/fsharp.sublime-syntax");
/// Embedded Nix grammar
pub const NIX_GRAMMAR: &str = include_str!("../../grammars/nix.sublime-syntax");
/// Embedded HCL/Terraform grammar
pub const HCL_GRAMMAR: &str = include_str!("../../grammars/hcl.sublime-syntax");
/// Embedded Protocol Buffers grammar
pub const PROTOBUF_GRAMMAR: &str = include_str!("../../grammars/protobuf.sublime-syntax");
/// Embedded GraphQL grammar
pub const GRAPHQL_GRAMMAR: &str = include_str!("../../grammars/graphql.sublime-syntax");
/// Embedded Julia grammar
pub const JULIA_GRAMMAR: &str = include_str!("../../grammars/julia.sublime-syntax");
/// Embedded Nim grammar
pub const NIM_GRAMMAR: &str = include_str!("../../grammars/nim.sublime-syntax");
/// Embedded Gleam grammar
pub const GLEAM_GRAMMAR: &str = include_str!("../../grammars/gleam.sublime-syntax");
/// Embedded V language grammar
pub const VLANG_GRAMMAR: &str = include_str!("../../grammars/vlang.sublime-syntax");
/// Embedded Solidity grammar
pub const SOLIDITY_GRAMMAR: &str = include_str!("../../grammars/solidity.sublime-syntax");
/// Embedded KDL grammar
pub const KDL_GRAMMAR: &str = include_str!("../../grammars/kdl.sublime-syntax");
/// Embedded Nushell grammar
pub const NUSHELL_GRAMMAR: &str = include_str!("../../grammars/nushell.sublime-syntax");
/// Embedded Starlark/Bazel grammar
pub const STARLARK_GRAMMAR: &str = include_str!("../../grammars/starlark.sublime-syntax");
/// Embedded Justfile grammar
pub const JUSTFILE_GRAMMAR: &str = include_str!("../../grammars/justfile.sublime-syntax");
/// Embedded Earthfile grammar
pub const EARTHFILE_GRAMMAR: &str = include_str!("../../grammars/earthfile.sublime-syntax");
/// Embedded Go Module grammar
pub const GOMOD_GRAMMAR: &str = include_str!("../../grammars/gomod.sublime-syntax");
/// Embedded Vue grammar
pub const VUE_GRAMMAR: &str = include_str!("../../grammars/vue.sublime-syntax");
/// Embedded Svelte grammar
pub const SVELTE_GRAMMAR: &str = include_str!("../../grammars/svelte.sublime-syntax");
/// Embedded Astro grammar
pub const ASTRO_GRAMMAR: &str = include_str!("../../grammars/astro.sublime-syntax");
/// Embedded Hyprlang grammar (Hyprland config)
pub const HYPRLANG_GRAMMAR: &str = include_str!("../../grammars/hyprlang.sublime-syntax");
/// Embedded AutoHotkey grammar
/// From: https://github.com/SALZKARTOFFEEEL/ahk-sublime-syntax (MIT License)
pub const AUTOHOTKEY_GRAMMAR: &str =
    include_str!("../../grammars/autohotkey/AutoHotkey.sublime-syntax");
/// Embedded Racket grammar (syntect doesn't include one)
pub const RACKET_GRAMMAR: &str = include_str!("../../grammars/racket.sublime-syntax");
237
238/// Registry of all available TextMate grammars.
239///
240/// This struct holds the compiled syntax set and provides lookup methods.
241/// It does not perform I/O directly - use `GrammarLoader` for loading grammars.
242impl std::fmt::Debug for GrammarRegistry {
243    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
244        f.debug_struct("GrammarRegistry")
245            .field("syntax_count", &self.syntax_set.syntaxes().len())
246            .finish()
247    }
248}
249
250pub struct GrammarRegistry {
251    /// Combined syntax set (built-in + embedded + user grammars)
252    syntax_set: Arc<SyntaxSet>,
253    /// Extension -> scope name mapping for user grammars (takes priority)
254    user_extensions: HashMap<String, String>,
255    /// Filename -> scope name mapping for dotfiles and special files
256    filename_scopes: HashMap<String, String>,
257    /// Paths to dynamically loaded grammar files (for reloading when adding more)
258    loaded_grammar_paths: Vec<GrammarSpec>,
259    /// Provenance info for each grammar (keyed by grammar name)
260    grammar_sources: HashMap<String, GrammarInfo>,
261    /// Short name aliases: lowercase short_name -> full syntect grammar name.
262    /// Provides a deterministic, one-to-one mapping so users can write
263    /// `grammar = "bash"` instead of `grammar = "Bourne Again Shell (bash)"`.
264    aliases: HashMap<String, String>,
265    /// Unified catalog of every known grammar. Rebuilt whenever the syntax set
266    /// or alias table changes. Lookups (`find_by_name`, `find_by_path`, ...)
267    /// all resolve against this.
268    catalog: Vec<GrammarEntry>,
269    /// Index from lowercased lookup keys (display name, language_id, short_name)
270    /// to catalog index.
271    catalog_by_name: HashMap<String, usize>,
272    /// Index from file extension (without dot) to catalog index.
273    catalog_by_extension: HashMap<String, usize>,
274    /// Index from filename to catalog index.
275    catalog_by_filename: HashMap<String, usize>,
276    /// The most recent language config handed to `apply_language_config`.
277    /// Retained so `rebuild_catalog` can replay it — otherwise a rebuild
278    /// (triggered by e.g. `populate_built_in_aliases`) silently wipes user
279    /// `[languages]` config that was merged on top.
280    applied_language_config: HashMap<String, crate::config::LanguageConfig>,
281}
282
283impl GrammarRegistry {
284    /// Create a new GrammarRegistry from pre-built components.
285    ///
286    /// This is typically called by `GrammarLoader` implementations after
287    /// loading grammars from various sources.
288    pub(crate) fn new(
289        syntax_set: SyntaxSet,
290        user_extensions: HashMap<String, String>,
291        filename_scopes: HashMap<String, String>,
292    ) -> Self {
293        Self::new_with_loaded_paths(
294            syntax_set,
295            user_extensions,
296            filename_scopes,
297            Vec::new(),
298            HashMap::new(),
299        )
300    }
301
302    /// Create a GrammarRegistry with pre-loaded grammar path tracking.
303    ///
304    /// Used by the loader when plugin grammars were included in the initial build,
305    /// so that `loaded_grammar_paths()` reflects what was actually loaded.
306    pub(crate) fn new_with_loaded_paths(
307        syntax_set: SyntaxSet,
308        user_extensions: HashMap<String, String>,
309        filename_scopes: HashMap<String, String>,
310        loaded_grammar_paths: Vec<GrammarSpec>,
311        grammar_sources: HashMap<String, GrammarInfo>,
312    ) -> Self {
313        let mut reg = Self {
314            syntax_set: Arc::new(syntax_set),
315            user_extensions,
316            filename_scopes,
317            loaded_grammar_paths,
318            grammar_sources,
319            aliases: HashMap::new(),
320            catalog: Vec::new(),
321            catalog_by_name: HashMap::new(),
322            catalog_by_extension: HashMap::new(),
323            catalog_by_filename: HashMap::new(),
324            applied_language_config: HashMap::new(),
325        };
326        reg.rebuild_catalog();
327        reg
328    }
329
330    /// Create an empty grammar registry (fast, for tests that don't need syntax highlighting)
331    pub fn empty() -> Arc<Self> {
332        let mut builder = SyntaxSetBuilder::new();
333        builder.add_plain_text_syntax();
334        let mut reg = Self {
335            syntax_set: Arc::new(builder.build()),
336            user_extensions: HashMap::new(),
337            filename_scopes: HashMap::new(),
338            loaded_grammar_paths: Vec::new(),
339            grammar_sources: HashMap::new(),
340            aliases: HashMap::new(),
341            catalog: Vec::new(),
342            catalog_by_name: HashMap::new(),
343            catalog_by_extension: HashMap::new(),
344            catalog_by_filename: HashMap::new(),
345            applied_language_config: HashMap::new(),
346        };
347        reg.rebuild_catalog();
348        Arc::new(reg)
349    }
350
351    /// Create a registry with only syntect's pre-compiled defaults (~0ms).
352    ///
353    /// This provides instant syntax highlighting for ~50 common languages
354    /// (Rust, Python, JS/TS, C/C++, Go, Java, HTML, CSS, Markdown, etc.)
355    /// without any `SyntaxSetBuilder::build()` call. Use this at startup,
356    /// then swap in a full registry built on a background thread.
357    pub fn defaults_only() -> Arc<Self> {
358        // Load pre-compiled syntax set (defaults + embedded grammars) from
359        // build-time packdump. This avoids the expensive into_builder() + build()
360        // cycle at runtime (~12s → ~300ms).
361        tracing::info!("defaults_only: loading pre-compiled syntax packdump...");
362        let syntax_set: SyntaxSet = syntect::dumps::from_uncompressed_data(include_bytes!(
363            concat!(env!("OUT_DIR"), "/default_syntaxes.packdump")
364        ))
365        .expect("Failed to load pre-compiled syntax packdump");
366        tracing::info!(
367            "defaults_only: loaded ({} syntaxes)",
368            syntax_set.syntaxes().len()
369        );
370        let grammar_sources = Self::build_grammar_sources_from_syntax_set(&syntax_set);
371        let filename_scopes = Self::build_filename_scopes();
372        let extra_extensions = Self::build_extra_extensions();
373        let mut registry = Self {
374            syntax_set: Arc::new(syntax_set),
375            user_extensions: extra_extensions,
376            filename_scopes,
377            loaded_grammar_paths: Vec::new(),
378            grammar_sources,
379            aliases: HashMap::new(),
380            catalog: Vec::new(),
381            catalog_by_name: HashMap::new(),
382            catalog_by_extension: HashMap::new(),
383            catalog_by_filename: HashMap::new(),
384            applied_language_config: HashMap::new(),
385        };
386        registry.populate_built_in_aliases();
387        registry.rebuild_catalog();
388        Arc::new(registry)
389    }
390
391    /// Build extra extension -> scope mappings for extensions not covered by syntect defaults.
392    ///
393    /// These map common file extensions to existing syntect grammar scopes,
394    /// filling gaps where syntect's built-in extension lists are incomplete.
395    pub(crate) fn build_extra_extensions() -> HashMap<String, String> {
396        let mut map = HashMap::new();
397
398        // JavaScript variants not in syntect defaults (["js", "htc"])
399        let js_scope = "source.js".to_string();
400        map.insert("cjs".to_string(), js_scope.clone());
401        map.insert("mjs".to_string(), js_scope);
402
403        // Dockerfile variants (e.g. Dockerfile.dev -> .dev extension)
404        // These won't match by extension, handled by filename_scopes and first_line_match
405
406        map
407    }
408
409    /// Build the default filename -> scope mappings for dotfiles and special files.
410    pub(crate) fn build_filename_scopes() -> HashMap<String, String> {
411        let mut map = HashMap::new();
412
413        // Shell configuration files -> Bash/Shell script scope
414        let shell_scope = "source.shell.bash".to_string();
415        for filename in [
416            ".zshrc",
417            ".zprofile",
418            ".zshenv",
419            ".zlogin",
420            ".zlogout",
421            ".bash_aliases",
422            // .bashrc and .bash_profile are already recognized by syntect
423            // Common shell script files without extensions
424            "PKGBUILD",
425            "APKBUILD",
426        ] {
427            map.insert(filename.to_string(), shell_scope.clone());
428        }
429
430        // Git rebase todo files
431        let git_rebase_scope = "source.git-rebase-todo".to_string();
432        map.insert("git-rebase-todo".to_string(), git_rebase_scope);
433
434        // Git commit message files
435        let git_commit_scope = "source.git-commit".to_string();
436        for filename in ["COMMIT_EDITMSG", "MERGE_MSG", "SQUASH_MSG", "TAG_EDITMSG"] {
437            map.insert(filename.to_string(), git_commit_scope.clone());
438        }
439
440        // Gitignore and similar files
441        let gitignore_scope = "source.gitignore".to_string();
442        for filename in [".gitignore", ".dockerignore", ".npmignore", ".hgignore"] {
443            map.insert(filename.to_string(), gitignore_scope.clone());
444        }
445
446        // Git config files
447        let gitconfig_scope = "source.gitconfig".to_string();
448        for filename in [".gitconfig", ".gitmodules"] {
449            map.insert(filename.to_string(), gitconfig_scope.clone());
450        }
451
452        // Git attributes files
453        let gitattributes_scope = "source.gitattributes".to_string();
454        map.insert(".gitattributes".to_string(), gitattributes_scope);
455
456        // Jenkinsfile -> Groovy
457        let groovy_scope = "source.groovy".to_string();
458        map.insert("Jenkinsfile".to_string(), groovy_scope);
459
460        // Vagrantfile -> Ruby (syntect already handles this, but be explicit)
461        // Brewfile -> Ruby
462        let ruby_scope = "source.ruby".to_string();
463        map.insert("Brewfile".to_string(), ruby_scope);
464
465        // Dockerfile and variants (exact names; Dockerfile.* handled via prefix check)
466        let dockerfile_scope = "source.dockerfile".to_string();
467        map.insert("Dockerfile".to_string(), dockerfile_scope.clone());
468        map.insert("Containerfile".to_string(), dockerfile_scope.clone());
469        // Common Dockerfile variants
470        map.insert("Dockerfile.dev".to_string(), dockerfile_scope.clone());
471        map.insert("Dockerfile.prod".to_string(), dockerfile_scope.clone());
472        map.insert("Dockerfile.test".to_string(), dockerfile_scope.clone());
473        map.insert("Dockerfile.build".to_string(), dockerfile_scope.clone());
474
475        // CMake
476        let cmake_scope = "source.cmake".to_string();
477        map.insert("CMakeLists.txt".to_string(), cmake_scope);
478
479        // Starlark/Bazel
480        let starlark_scope = "source.starlark".to_string();
481        map.insert("BUILD".to_string(), starlark_scope.clone());
482        map.insert("BUILD.bazel".to_string(), starlark_scope.clone());
483        map.insert("WORKSPACE".to_string(), starlark_scope.clone());
484        map.insert("WORKSPACE.bazel".to_string(), starlark_scope.clone());
485        map.insert("Tiltfile".to_string(), starlark_scope);
486
487        // Justfile (various casings)
488        let justfile_scope = "source.justfile".to_string();
489        map.insert("justfile".to_string(), justfile_scope.clone());
490        map.insert("Justfile".to_string(), justfile_scope.clone());
491        map.insert(".justfile".to_string(), justfile_scope);
492
493        // EditorConfig -> INI
494        let ini_scope = "source.ini".to_string();
495        map.insert(".editorconfig".to_string(), ini_scope);
496
497        // Earthfile
498        let earthfile_scope = "source.earthfile".to_string();
499        map.insert("Earthfile".to_string(), earthfile_scope);
500
501        // Hyprlang (Hyprland config files)
502        let hyprlang_scope = "source.hyprlang".to_string();
503        map.insert("hyprland.conf".to_string(), hyprlang_scope.clone());
504        map.insert("hyprpaper.conf".to_string(), hyprlang_scope.clone());
505        map.insert("hyprlock.conf".to_string(), hyprlang_scope);
506
507        // go.mod / go.sum
508        let gomod_scope = "source.gomod".to_string();
509        map.insert("go.mod".to_string(), gomod_scope.clone());
510        map.insert("go.sum".to_string(), gomod_scope);
511
512        map
513    }
514
515    /// Add embedded grammars (TOML, Odin, etc.) to a syntax set builder.
516    pub(crate) fn add_embedded_grammars(builder: &mut SyntaxSetBuilder) {
517        // TOML grammar
518        match SyntaxDefinition::load_from_str(TOML_GRAMMAR, true, Some("TOML")) {
519            Ok(syntax) => {
520                builder.add(syntax);
521                tracing::debug!("Loaded embedded TOML grammar");
522            }
523            Err(e) => {
524                tracing::warn!("Failed to load embedded TOML grammar: {}", e);
525            }
526        }
527
528        // Odin grammar
529        match SyntaxDefinition::load_from_str(ODIN_GRAMMAR, true, Some("Odin")) {
530            Ok(syntax) => {
531                builder.add(syntax);
532                tracing::debug!("Loaded embedded Odin grammar");
533            }
534            Err(e) => {
535                tracing::warn!("Failed to load embedded Odin grammar: {}", e);
536            }
537        }
538
539        // Zig grammar
540        match SyntaxDefinition::load_from_str(ZIG_GRAMMAR, true, Some("Zig")) {
541            Ok(syntax) => {
542                builder.add(syntax);
543                tracing::debug!("Loaded embedded Zig grammar");
544            }
545            Err(e) => {
546                tracing::warn!("Failed to load embedded Zig grammar: {}", e);
547            }
548        }
549
550        // Git Rebase Todo grammar
551        match SyntaxDefinition::load_from_str(GIT_REBASE_GRAMMAR, true, Some("Git Rebase Todo")) {
552            Ok(syntax) => {
553                builder.add(syntax);
554                tracing::debug!("Loaded embedded Git Rebase Todo grammar");
555            }
556            Err(e) => {
557                tracing::warn!("Failed to load embedded Git Rebase Todo grammar: {}", e);
558            }
559        }
560
561        // Git Commit Message grammar
562        match SyntaxDefinition::load_from_str(GIT_COMMIT_GRAMMAR, true, Some("Git Commit Message"))
563        {
564            Ok(syntax) => {
565                builder.add(syntax);
566                tracing::debug!("Loaded embedded Git Commit Message grammar");
567            }
568            Err(e) => {
569                tracing::warn!("Failed to load embedded Git Commit Message grammar: {}", e);
570            }
571        }
572
573        // Gitignore grammar
574        match SyntaxDefinition::load_from_str(GITIGNORE_GRAMMAR, true, Some("Gitignore")) {
575            Ok(syntax) => {
576                builder.add(syntax);
577                tracing::debug!("Loaded embedded Gitignore grammar");
578            }
579            Err(e) => {
580                tracing::warn!("Failed to load embedded Gitignore grammar: {}", e);
581            }
582        }
583
584        // Git Config grammar
585        match SyntaxDefinition::load_from_str(GITCONFIG_GRAMMAR, true, Some("Git Config")) {
586            Ok(syntax) => {
587                builder.add(syntax);
588                tracing::debug!("Loaded embedded Git Config grammar");
589            }
590            Err(e) => {
591                tracing::warn!("Failed to load embedded Git Config grammar: {}", e);
592            }
593        }
594
595        // Git Attributes grammar
596        match SyntaxDefinition::load_from_str(GITATTRIBUTES_GRAMMAR, true, Some("Git Attributes")) {
597            Ok(syntax) => {
598                builder.add(syntax);
599                tracing::debug!("Loaded embedded Git Attributes grammar");
600            }
601            Err(e) => {
602                tracing::warn!("Failed to load embedded Git Attributes grammar: {}", e);
603            }
604        }
605
606        // Typst grammar
607        match SyntaxDefinition::load_from_str(TYPST_GRAMMAR, true, Some("Typst")) {
608            Ok(syntax) => {
609                builder.add(syntax);
610                tracing::debug!("Loaded embedded Typst grammar");
611            }
612            Err(e) => {
613                tracing::warn!("Failed to load embedded Typst grammar: {}", e);
614            }
615        }
616
617        // Additional embedded grammars for languages not in syntect defaults
618        let additional_grammars: &[(&str, &str)] = &[
619            (DOCKERFILE_GRAMMAR, "Dockerfile"),
620            (INI_GRAMMAR, "INI"),
621            (CMAKE_GRAMMAR, "CMake"),
622            (SCSS_GRAMMAR, "SCSS"),
623            (LESS_GRAMMAR, "LESS"),
624            (POWERSHELL_GRAMMAR, "PowerShell"),
625            (KOTLIN_GRAMMAR, "Kotlin"),
626            (SWIFT_GRAMMAR, "Swift"),
627            (DART_GRAMMAR, "Dart"),
628            (ELIXIR_GRAMMAR, "Elixir"),
629            (FSHARP_GRAMMAR, "FSharp"),
630            (NIX_GRAMMAR, "Nix"),
631            (HCL_GRAMMAR, "HCL"),
632            (PROTOBUF_GRAMMAR, "Protocol Buffers"),
633            (GRAPHQL_GRAMMAR, "GraphQL"),
634            (JULIA_GRAMMAR, "Julia"),
635            (NIM_GRAMMAR, "Nim"),
636            (GLEAM_GRAMMAR, "Gleam"),
637            (VLANG_GRAMMAR, "V"),
638            (SOLIDITY_GRAMMAR, "Solidity"),
639            (KDL_GRAMMAR, "KDL"),
640            (NUSHELL_GRAMMAR, "Nushell"),
641            (STARLARK_GRAMMAR, "Starlark"),
642            (JUSTFILE_GRAMMAR, "Justfile"),
643            (EARTHFILE_GRAMMAR, "Earthfile"),
644            (GOMOD_GRAMMAR, "Go Module"),
645            (VUE_GRAMMAR, "Vue"),
646            (SVELTE_GRAMMAR, "Svelte"),
647            (ASTRO_GRAMMAR, "Astro"),
648            (HYPRLANG_GRAMMAR, "Hyprlang"),
649            (AUTOHOTKEY_GRAMMAR, "AutoHotkey"),
650            (RACKET_GRAMMAR, "Racket"),
651        ];
652
653        for (grammar_str, name) in additional_grammars {
654            match SyntaxDefinition::load_from_str(grammar_str, true, Some(name)) {
655                Ok(syntax) => {
656                    builder.add(syntax);
657                    tracing::debug!("Loaded embedded {} grammar", name);
658                }
659                Err(e) => {
660                    tracing::warn!("Failed to load embedded {} grammar: {}", name, e);
661                }
662            }
663        }
664    }
665
666    /// Find syntax for a file by path/extension/filename.
667    ///
668    /// Purely metadata-based — does not read the file. For first-line
669    /// (shebang) fallback, use [`find_by_path`] with a `first_line` argument
670    /// and resolve the returned entry's syntect index.
671    pub fn find_syntax_for_file(&self, path: &Path) -> Option<&SyntaxReference> {
672        let entry = self.find_by_path(path, None)?;
673        entry
674            .engines
675            .syntect
676            .map(|i| &self.syntax_set.syntaxes()[i])
677    }
678
679    /// Find syntax by name, with alias resolution.
680    ///
681    /// Thin wrapper around `find_by_name` that returns the associated syntect
682    /// `SyntaxReference`. Tree-sitter-only entries return `None`.
683    ///
684    /// Falls back to a direct syntect lookup for "Plain Text", which the
685    /// catalog deliberately omits but syntect still exposes.
686    pub fn find_syntax_by_name(&self, name: &str) -> Option<&SyntaxReference> {
687        if let Some(entry) = self.find_by_name(name) {
688            if let Some(idx) = entry.engines.syntect {
689                return Some(&self.syntax_set.syntaxes()[idx]);
690            }
691        }
692        // Plain Text is excluded from the catalog (it's not a "grammar" a user
693        // would ever pick), but syntect still stores it and a handful of
694        // callers still ask for it by name.
695        self.syntax_set.find_syntax_by_name(name)
696    }
697
698    // === Alias management ===
699
700    /// Hardcoded short-name aliases for built-in and embedded grammars.
701    ///
702    /// Each entry maps a short name (lowercase) to the exact syntect grammar name.
703    /// Only grammars whose full name differs significantly from a natural short
704    /// form need an entry here. Grammars already short (e.g., "Rust", "Go") are
705    /// reachable via case-insensitive matching and don't need aliases.
706    fn built_in_aliases() -> Vec<(&'static str, &'static str)> {
707        vec![
708            // Syntect built-in grammars with verbose names
709            ("bash", "Bourne Again Shell (bash)"),
710            ("shell", "Bourne Again Shell (bash)"),
711            ("sh", "Bourne Again Shell (bash)"),
712            ("c++", "C++"),
713            ("cpp", "C++"),
714            ("csharp", "C#"),
715            ("objc", "Objective-C"),
716            ("objcpp", "Objective-C++"),
717            ("regex", "Regular Expressions (Python)"),
718            ("regexp", "Regular Expressions (Python)"),
719            // Embedded grammars with multi-word or non-obvious names
720            ("proto", "Protocol Buffers"),
721            ("protobuf", "Protocol Buffers"),
722            ("gomod", "Go Module"),
723            ("git-rebase", "Git Rebase Todo"),
724            ("git-commit", "Git Commit Message"),
725            ("git-config", "Git Config"),
726            ("git-attributes", "Git Attributes"),
727            ("gitignore", "Gitignore"),
728            ("fsharp", "FSharp"),
729            ("f#", "FSharp"),
730            ("terraform", "HCL"),
731            ("tf", "HCL"),
732            ("ts", "TypeScript"),
733            ("js", "JavaScript"),
734            ("py", "Python"),
735            ("rb", "Ruby"),
736            ("rs", "Rust"),
737            ("md", "Markdown"),
738            ("yml", "YAML"),
739            ("dockerfile", "Dockerfile"),
740        ]
741    }
742
743    /// Populate aliases from the built-in table.
744    ///
745    /// Validates that:
746    /// - Each alias target (full name) exists in the syntax set
747    /// - No alias collides (case-insensitive) with an existing grammar full name
748    /// - No duplicate aliases exist
749    pub(crate) fn populate_built_in_aliases(&mut self) {
750        for (short, full) in Self::built_in_aliases() {
751            self.register_alias_inner(short, full, true);
752        }
753        self.rebuild_catalog();
754    }
755
756    /// Register a short-name alias for a grammar.
757    ///
758    /// Returns `true` if the alias was registered, `false` if rejected due to
759    /// collision or missing target. For built-in aliases, collisions panic
760    /// (they indicate a bug). For dynamic aliases, collisions log a warning.
761    ///
762    /// Splices the alias directly into the catalog rather than rebuilding, so
763    /// any user config previously merged via `apply_language_config` is
764    /// preserved. A full rebuild would wipe those entries.
765    pub(crate) fn register_alias(&mut self, short_name: &str, full_name: &str) -> bool {
766        if !self.register_alias_inner(short_name, full_name, false) {
767            return false;
768        }
769        let short_lower = short_name.to_lowercase();
770        let full_lower = full_name.to_lowercase();
771        if let Some(&idx) = self.catalog_by_name.get(&full_lower) {
772            self.catalog_by_name
773                .entry(short_lower.clone())
774                .or_insert(idx);
775            let entry = &mut self.catalog[idx];
776            let replace = match &entry.short_name {
777                None => true,
778                Some(existing) => short_name.len() < existing.len(),
779            };
780            if replace {
781                entry.short_name = Some(short_lower);
782            }
783        }
784        true
785    }
786
787    fn register_alias_inner(
788        &mut self,
789        short_name: &str,
790        full_name: &str,
791        is_built_in: bool,
792    ) -> bool {
793        let short_lower = short_name.to_lowercase();
794
795        // Validate: target grammar must exist in the syntax set
796        let target_exists = self
797            .syntax_set
798            .syntaxes()
799            .iter()
800            .any(|s| s.name.eq_ignore_ascii_case(full_name));
801        if !target_exists {
802            // Tree-sitter-only targets (e.g. TypeScript) are expected to be
803            // absent from the syntect set. `rebuild_catalog` attaches their
804            // short names via a separate pass over `built_in_aliases()`.
805            if tree_sitter_for_syntect_name(full_name).is_some() {
806                return false;
807            }
808            if is_built_in {
809                // Built-in alias targets should always exist; warn but don't panic
810                // (grammar might have been removed from syntect upstream)
811                tracing::warn!(
812                    "[grammar-alias] Built-in alias '{}' -> '{}': target grammar not found, skipping",
813                    short_name, full_name
814                );
815            } else {
816                tracing::warn!(
817                    "[grammar-alias] Alias '{}' -> '{}': target grammar not found, skipping",
818                    short_name,
819                    full_name
820                );
821            }
822            return false;
823        }
824
825        // Validate: short name must not collide (case-insensitive) with any grammar full name
826        let collides_with_full_name = self
827            .syntax_set
828            .syntaxes()
829            .iter()
830            .any(|s| s.name.eq_ignore_ascii_case(&short_lower));
831        if collides_with_full_name {
832            // This is actually fine — the short name matches a full name directly,
833            // so find_syntax_by_name's case-insensitive search will find it.
834            // No alias needed.
835            tracing::debug!(
836                "[grammar-alias] Alias '{}' matches an existing grammar name, skipping (not needed)",
837                short_name
838            );
839            return false;
840        }
841
842        // Validate: no duplicate alias (case-insensitive)
843        if let Some(existing_target) = self.aliases.get(&short_lower) {
844            if existing_target.eq_ignore_ascii_case(full_name) {
845                // Same mapping, no-op
846                return true;
847            }
848            let msg = format!(
849                "Alias '{}' already maps to '{}', cannot remap to '{}'",
850                short_name, existing_target, full_name
851            );
852            if is_built_in {
853                panic!("[grammar-alias] Built-in alias collision: {}", msg);
854            } else {
855                tracing::warn!("[grammar-alias] {}", msg);
856                return false;
857            }
858        }
859
860        // Resolve the exact syntect name (preserving original case)
861        let exact_name = self
862            .syntax_set
863            .syntaxes()
864            .iter()
865            .find(|s| s.name.eq_ignore_ascii_case(full_name))
866            .map(|s| s.name.clone())
867            .unwrap();
868
869        self.aliases.insert(short_lower, exact_name);
870        true
871    }
872
873    // === Unified catalog ===
874
875    /// Rebuild the flat catalog of grammar entries.
876    ///
877    /// Called after the syntax set, aliases, or filename scopes change.
878    /// Produces one entry per logical language by merging:
879    /// 1. Every `SyntaxReference` in the syntax set (except "Plain Text")
880    /// 2. Every `fresh_languages::Language` not already covered by a syntect entry
881    /// 3. Alias short-names attached to their target entry
882    /// 4. Filename mappings from `filename_scopes` attached to their scope's entry
883    /// 5. Extra extensions from `user_extensions` attached to their scope's entry
884    ///
885    /// Automatically replays the last `apply_language_config` at the end, so
886    /// user `[languages]` config survives any rebuild.
887    pub(crate) fn rebuild_catalog(&mut self) {
888        // Reverse-map: full_name (lowercase) -> shortest alias.
889        //
890        // Seed from the built-in alias table as well as the live `aliases`
891        // HashMap: the live map only contains aliases whose target exists in
892        // the syntect set, so tree-sitter-only entries (TypeScript) would
893        // otherwise never get their short name ("ts").
894        let mut short_by_full: HashMap<String, String> = HashMap::new();
895        let record = |map: &mut HashMap<String, String>, short: &str, full: &str| {
896            let key = full.to_lowercase();
897            let keep = match map.get(&key) {
898                None => true,
899                Some(existing) => short.len() < existing.len(),
900            };
901            if keep {
902                map.insert(key, short.to_string());
903            }
904        };
905        for (short, full) in Self::built_in_aliases() {
906            record(&mut short_by_full, short, full);
907        }
908        for (short, full) in &self.aliases {
909            record(&mut short_by_full, short, full);
910        }
911
912        let derive_language_id =
913            |display_name: &str| -> (String, Option<fresh_languages::Language>) {
914                let ts = tree_sitter_for_syntect_name(display_name);
915                let id = ts
916                    .map(|l| l.id().to_string())
917                    .unwrap_or_else(|| display_name.to_lowercase());
918                (id, ts)
919            };
920
921        let mut catalog: Vec<GrammarEntry> = Vec::new();
922        let mut scope_to_index: HashMap<String, usize> = HashMap::new();
923
924        // Syntect-backed entries (skip Plain Text).
925        //
926        // Syntect's `file_extensions` is a hybrid list: real extensions like
927        // "rb" sit alongside bare filenames like "Gemfile", "Rakefile",
928        // "Makefile". Syntect's own `find_syntax_for_file` tries each entry
929        // against the whole filename AND against the path's extension, and
930        // the catalog has to preserve that semantics. We keep everything in
931        // `extensions` here and index each entry as *both* an extension and
932        // a filename at the bottom of this method.
933        for (idx, syntax) in self.syntax_set.syntaxes().iter().enumerate() {
934            if syntax.name == "Plain Text" {
935                continue;
936            }
937            let (language_id, tree_sitter) = derive_language_id(&syntax.name);
938            let short_name = short_by_full.get(&syntax.name.to_lowercase()).cloned();
939            let source = self
940                .grammar_sources
941                .get(&syntax.name)
942                .map(|info| info.source.clone())
943                .unwrap_or(GrammarSource::BuiltIn);
944            let entry_index = catalog.len();
945            scope_to_index.insert(syntax.scope.to_string(), entry_index);
946
947            // Union syntect's file_extensions with tree-sitter's own
948            // extension list when the entry carries both engines.
949            // tree-sitter-javascript handles `.jsx`/`.mjs`/`.cjs` that
950            // syntect's JS grammar doesn't list, and the old code used to
951            // route those paths to tree-sitter via a separate lookup.
952            let mut extensions = syntax.file_extensions.clone();
953            if let Some(lang) = tree_sitter {
954                for ext in lang.extensions() {
955                    let ext = ext.to_string();
956                    if !extensions.iter().any(|e| e == &ext) {
957                        extensions.push(ext);
958                    }
959                }
960            }
961
962            catalog.push(GrammarEntry {
963                display_name: syntax.name.clone(),
964                language_id,
965                short_name,
966                extensions,
967                filenames: Vec::new(),
968                filename_globs: Vec::new(),
969                source,
970                engines: GrammarEngines {
971                    syntect: Some(idx),
972                    tree_sitter,
973                },
974            });
975        }
976
977        // Attach filename_scopes to their entries.
978        for (filename, scope) in &self.filename_scopes {
979            if let Some(&idx) = scope_to_index.get(scope) {
980                if !catalog[idx].filenames.iter().any(|f| f == filename) {
981                    catalog[idx].filenames.push(filename.clone());
982                }
983            }
984        }
985
986        // Attach user_extensions (extra → scope) to their entries.
987        for (ext, scope) in &self.user_extensions {
988            if let Some(&idx) = scope_to_index.get(scope) {
989                if !catalog[idx].extensions.iter().any(|e| e == ext) {
990                    catalog[idx].extensions.push(ext.clone());
991                }
992            }
993        }
994
995        // Ensure every tree-sitter language has an entry. If a syntect entry
996        // already maps to the same tree-sitter language, skip it; otherwise
997        // add a tree-sitter-only entry so the catalog is complete (TypeScript
998        // being the motivating example — syntect ships no grammar for it).
999        let mut ts_covered: std::collections::HashSet<fresh_languages::Language> =
1000            std::collections::HashSet::new();
1001        for entry in &catalog {
1002            if let Some(lang) = entry.engines.tree_sitter {
1003                ts_covered.insert(lang);
1004            }
1005        }
1006        for lang in fresh_languages::Language::all() {
1007            if ts_covered.contains(lang) {
1008                continue;
1009            }
1010            let display_name = lang.display_name().to_string();
1011            let language_id = lang.id().to_string();
1012            let short_name = short_by_full.get(&display_name.to_lowercase()).cloned();
1013            let extensions: Vec<String> = lang.extensions().iter().map(|s| s.to_string()).collect();
1014            catalog.push(GrammarEntry {
1015                display_name,
1016                language_id,
1017                short_name,
1018                extensions,
1019                filenames: Vec::new(),
1020                filename_globs: Vec::new(),
1021                source: GrammarSource::BuiltIn,
1022                engines: GrammarEngines {
1023                    syntect: None,
1024                    tree_sitter: Some(*lang),
1025                },
1026            });
1027        }
1028
1029        // Build name / extension / filename indices.
1030        //
1031        // Every entry in `extensions` gets indexed in BOTH `by_extension`
1032        // (lowercased) AND `by_filename` (exact case) — syntect's
1033        // `file_extensions` list holds both real extensions ("rb") and bare
1034        // filenames ("Gemfile", "Rakefile", "Makefile"). Indexing both ways
1035        // matches syntect's own `find_syntax_for_file` semantics.
1036        let mut by_name: HashMap<String, usize> = HashMap::new();
1037        let mut by_extension: HashMap<String, usize> = HashMap::new();
1038        let mut by_filename: HashMap<String, usize> = HashMap::new();
1039        for (idx, entry) in catalog.iter().enumerate() {
1040            by_name.insert(entry.display_name.to_lowercase(), idx);
1041            by_name.insert(entry.language_id.to_lowercase(), idx);
1042            if let Some(short) = &entry.short_name {
1043                by_name.insert(short.to_lowercase(), idx);
1044            }
1045            for ext in &entry.extensions {
1046                by_extension.entry(ext.to_lowercase()).or_insert(idx);
1047                by_filename.entry(ext.clone()).or_insert(idx);
1048            }
1049            for filename in &entry.filenames {
1050                by_filename.entry(filename.clone()).or_insert(idx);
1051            }
1052        }
1053
1054        self.catalog = catalog;
1055        self.catalog_by_name = by_name;
1056        self.catalog_by_extension = by_extension;
1057        self.catalog_by_filename = by_filename;
1058
1059        // Replay the most recent user config so a rebuild doesn't silently
1060        // wipe out user `[languages]` rules. `take` + restore avoids both a
1061        // clone and a borrow checker fight with `apply_language_config_inner`.
1062        if !self.applied_language_config.is_empty() {
1063            let cfg = std::mem::take(&mut self.applied_language_config);
1064            self.apply_language_config_inner(&cfg);
1065            self.applied_language_config = cfg;
1066        }
1067    }
1068
1069    /// Return the full catalog of grammar entries.
1070    pub fn catalog(&self) -> &[GrammarEntry] {
1071        &self.catalog
1072    }
1073
1074    /// Look up a grammar entry by display name, language ID, or short alias
1075    /// (case-insensitive). All aliases — built-in and user-config-declared —
1076    /// are indexed directly in `catalog_by_name` during `rebuild_catalog` /
1077    /// `register_alias` / `apply_language_config`, so a single lookup covers
1078    /// every case.
1079    pub fn find_by_name(&self, name: &str) -> Option<&GrammarEntry> {
1080        self.catalog_by_name
1081            .get(&name.to_lowercase())
1082            .map(|&idx| &self.catalog[idx])
1083    }
1084
1085    /// Look up a grammar entry by file path, with optional first-line content
1086    /// for shebang / `first_line_match` detection.
1087    ///
1088    /// Resolution order:
1089    /// 1. Exact filename (config-declared filenames and filename_scopes live here)
1090    /// 2. Glob patterns from user config (e.g. "*.conf", "/etc/**/rc.*")
1091    /// 3. File extension
1092    /// 4. Shebang / first-line regex match on `first_line` if supplied
1093    ///
1094    /// Globs take priority over extension so a user rule like `*.conf → bash`
1095    /// wins over any built-in extension match on `.conf`. The first-line
1096    /// fallback (#4) is last so catalog matches stay authoritative — syntect
1097    /// might otherwise misclassify `.fish` as bash via its first-line
1098    /// regexes.
1099    ///
1100    /// The first-line fallback is pure: it runs syntect's
1101    /// `find_syntax_by_first_line` regex cache against the caller-supplied
1102    /// string. The registry never touches the filesystem — the caller (who
1103    /// already loaded the buffer via the `FileSystem` trait) must extract
1104    /// the first line and pass it in.
1105    pub fn find_by_path(&self, path: &Path, first_line: Option<&str>) -> Option<&GrammarEntry> {
1106        let filename = path.file_name().and_then(|n| n.to_str());
1107        let path_str = path.to_str().unwrap_or("");
1108
1109        if let Some(name) = filename {
1110            if let Some(&idx) = self.catalog_by_filename.get(name) {
1111                return Some(&self.catalog[idx]);
1112            }
1113        }
1114
1115        // Glob walk — filenames with globs are rare so linear scan is fine.
1116        if let Some(name) = filename {
1117            for entry in &self.catalog {
1118                for pattern in &entry.filename_globs {
1119                    let matched = if is_path_pattern(pattern) {
1120                        path_glob_matches(pattern, path_str)
1121                    } else {
1122                        filename_glob_matches(pattern, name)
1123                    };
1124                    if matched {
1125                        return Some(entry);
1126                    }
1127                }
1128            }
1129        }
1130
1131        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
1132            if let Some(entry) = self.find_by_extension(ext) {
1133                return Some(entry);
1134            }
1135        }
1136
1137        // Last resort: shebang / first-line regex match against the
1138        // caller-supplied content. Map the matched syntect grammar back to a
1139        // catalog entry by name — every syntect syntax has a catalog entry,
1140        // so this round-trip preserves tree-sitter attachment.
1141        let line = first_line?;
1142        let syntax = self.syntax_set.find_syntax_by_first_line(line)?;
1143        self.find_by_name(&syntax.name)
1144    }
1145
1146    /// Look up a grammar entry by file extension (case-insensitive, without dot).
1147    pub fn find_by_extension(&self, ext: &str) -> Option<&GrammarEntry> {
1148        self.catalog_by_extension
1149            .get(&ext.to_lowercase())
1150            .map(|&idx| &self.catalog[idx])
1151    }
1152
1153    /// Merge user `[languages]` config into the catalog.
1154    ///
1155    /// For each config entry, resolves its grammar to an existing catalog entry
1156    /// (by grammar name or by language id). Extensions are added and override
1157    /// the ext→entry index so config wins over built-in mappings. Filenames are
1158    /// split into exact matches (indexed) and globs (walked at lookup time).
1159    ///
1160    /// If no existing entry matches, a new engine-less entry is created so the
1161    /// language still appears in the palette.
1162    ///
1163    /// Idempotent. The config is cached on the registry so `rebuild_catalog`
1164    /// can replay it — callers don't need to re-apply after a rebuild.
1165    pub fn apply_language_config(
1166        &mut self,
1167        languages: &HashMap<String, crate::config::LanguageConfig>,
1168    ) {
1169        self.applied_language_config = languages.clone();
1170        self.apply_language_config_inner(languages);
1171    }
1172
1173    /// Do the actual catalog splicing without touching
1174    /// `applied_language_config`. Called from `apply_language_config` (which
1175    /// records the input) and from `rebuild_catalog` (which replays the
1176    /// cached input after wiping the catalog).
1177    fn apply_language_config_inner(
1178        &mut self,
1179        languages: &HashMap<String, crate::config::LanguageConfig>,
1180    ) {
1181        for (lang_id, lang_cfg) in languages {
1182            let grammar_name = if lang_cfg.grammar.is_empty() {
1183                lang_id.as_str()
1184            } else {
1185                lang_cfg.grammar.as_str()
1186            };
1187
1188            // Resolve to an existing entry; fall back to creating one.
1189            let idx = self
1190                .catalog_by_name
1191                .get(&grammar_name.to_lowercase())
1192                .copied()
1193                .or_else(|| self.catalog_by_name.get(&lang_id.to_lowercase()).copied())
1194                .unwrap_or_else(|| {
1195                    let idx = self.catalog.len();
1196                    self.catalog.push(GrammarEntry {
1197                        display_name: lang_id.clone(),
1198                        language_id: lang_id.clone(),
1199                        short_name: None,
1200                        extensions: Vec::new(),
1201                        filenames: Vec::new(),
1202                        filename_globs: Vec::new(),
1203                        source: GrammarSource::BuiltIn,
1204                        engines: GrammarEngines::default(),
1205                    });
1206                    idx
1207                });
1208
1209            // Always index the config key so `find_by_name("mylang")` resolves
1210            // even when `mylang` aliases an existing grammar (e.g.
1211            // `[languages.mylang] grammar = "Rust"`). `or_insert` preserves
1212            // any existing mapping — won't clobber the canonical entry.
1213            self.catalog_by_name
1214                .entry(lang_id.to_lowercase())
1215                .or_insert(idx);
1216
1217            for ext in &lang_cfg.extensions {
1218                if !self.catalog[idx].extensions.iter().any(|e| e == ext) {
1219                    self.catalog[idx].extensions.push(ext.clone());
1220                }
1221                // Config-declared extensions override any previous mapping.
1222                self.catalog_by_extension.insert(ext.to_lowercase(), idx);
1223            }
1224            for filename in &lang_cfg.filenames {
1225                if is_glob_pattern(filename) {
1226                    if !self.catalog[idx]
1227                        .filename_globs
1228                        .iter()
1229                        .any(|f| f == filename)
1230                    {
1231                        self.catalog[idx].filename_globs.push(filename.clone());
1232                    }
1233                } else {
1234                    if !self.catalog[idx].filenames.iter().any(|f| f == filename) {
1235                        self.catalog[idx].filenames.push(filename.clone());
1236                    }
1237                    self.catalog_by_filename.insert(filename.clone(), idx);
1238                }
1239            }
1240        }
1241    }
1242
1243    /// Get the underlying syntax set
1244    pub fn syntax_set(&self) -> &Arc<SyntaxSet> {
1245        &self.syntax_set
1246    }
1247
1248    /// Get a clone of the Arc for sharing
1249    pub fn syntax_set_arc(&self) -> Arc<SyntaxSet> {
1250        Arc::clone(&self.syntax_set)
1251    }
1252
1253    /// List all available syntax names
1254    pub fn available_syntaxes(&self) -> Vec<&str> {
1255        self.syntax_set
1256            .syntaxes()
1257            .iter()
1258            .map(|s| s.name.as_str())
1259            .collect()
1260    }
1261
1262    /// List all available grammars with provenance information.
1263    ///
1264    /// Returns a sorted list of `GrammarInfo` entries derived from the unified
1265    /// catalog — this includes both syntect grammars and tree-sitter-only
1266    /// languages (like TypeScript). Each entry is listed exactly once even
1267    /// when both engines can serve it.
1268    pub fn available_grammar_info(&self) -> Vec<GrammarInfo> {
1269        let mut result: Vec<GrammarInfo> = self
1270            .catalog
1271            .iter()
1272            .map(|entry| GrammarInfo {
1273                name: entry.display_name.clone(),
1274                source: entry.source.clone(),
1275                file_extensions: entry.extensions.clone(),
1276                short_name: entry.short_name.clone(),
1277            })
1278            .collect();
1279        result.sort_by(|a, b| a.name.to_lowercase().cmp(&b.name.to_lowercase()));
1280        result
1281    }
1282
1283    /// Get the grammar sources map.
1284    pub(crate) fn grammar_sources(&self) -> &HashMap<String, GrammarInfo> {
1285        &self.grammar_sources
1286    }
1287
1288    /// Build grammar source info from a pre-compiled syntax set.
1289    ///
1290    /// All grammars in the packdump (syntect defaults + embedded) are tagged as built-in.
1291    pub(crate) fn build_grammar_sources_from_syntax_set(
1292        syntax_set: &SyntaxSet,
1293    ) -> HashMap<String, GrammarInfo> {
1294        let mut sources = HashMap::new();
1295        for syntax in syntax_set.syntaxes() {
1296            sources.insert(
1297                syntax.name.clone(),
1298                GrammarInfo {
1299                    name: syntax.name.clone(),
1300                    source: GrammarSource::BuiltIn,
1301                    file_extensions: syntax.file_extensions.clone(),
1302                    short_name: None,
1303                },
1304            );
1305        }
1306        sources
1307    }
1308
1309    /// Get the user extensions mapping (extension -> scope name).
1310    #[cfg(test)]
1311    pub(crate) fn user_extensions(&self) -> &HashMap<String, String> {
1312        &self.user_extensions
1313    }
1314
1315    /// Get the loaded grammar paths (for deduplication in flush_pending_grammars).
1316    #[cfg(test)]
1317    pub(crate) fn loaded_grammar_paths(&self) -> &[GrammarSpec] {
1318        &self.loaded_grammar_paths
1319    }
1320
1321    /// Create a new registry with additional grammar files
1322    ///
1323    /// This builds a new GrammarRegistry that includes all grammars from
1324    /// the base registry plus the additional grammars specified.
1325    /// Uses the base registry's syntax_set as the builder base, preserving
1326    /// all existing grammars (user grammars, language packs, etc.).
1327    ///
1328    /// # Arguments
1329    /// * `base` - The base registry to extend
1330    /// * `additional` - List of (language, path, extensions) tuples for new grammars
1331    ///
1332    /// # Returns
1333    /// A new GrammarRegistry with the additional grammars, or None if rebuilding fails
1334    pub fn with_additional_grammars(
1335        base: &GrammarRegistry,
1336        additional: &[GrammarSpec],
1337    ) -> Option<Self> {
1338        tracing::info!(
1339            "[SYNTAX DEBUG] with_additional_grammars: adding {} grammars to base with {} syntaxes",
1340            additional.len(),
1341            base.syntax_set.syntaxes().len()
1342        );
1343
1344        // Use the base registry's syntax_set as builder base — this preserves
1345        // ALL existing grammars (defaults, embedded, user, language packs)
1346        // without needing to reload them from disk.
1347        let mut builder = (*base.syntax_set).clone().into_builder();
1348
1349        // Preserve existing user extensions and add new ones
1350        let mut user_extensions = base.user_extensions.clone();
1351
1352        // Track loaded grammar paths (existing + new)
1353        let mut loaded_grammar_paths = base.loaded_grammar_paths.clone();
1354
1355        // Preserve existing grammar sources
1356        let mut grammar_sources = base.grammar_sources.clone();
1357
1358        // Add each new grammar
1359        for spec in additional {
1360            tracing::info!(
1361                "[SYNTAX DEBUG] loading new grammar file: lang='{}', path={:?}, extensions={:?}",
1362                spec.language,
1363                spec.path,
1364                spec.extensions
1365            );
1366            match Self::load_grammar_file(&spec.path) {
1367                Ok(syntax) => {
1368                    let scope = syntax.scope.to_string();
1369                    let syntax_name = syntax.name.clone();
1370                    tracing::info!(
1371                        "[SYNTAX DEBUG] grammar loaded successfully: name='{}', scope='{}'",
1372                        syntax_name,
1373                        scope
1374                    );
1375                    builder.add(syntax);
1376                    tracing::info!(
1377                        "Loaded grammar for '{}' from {:?} with extensions {:?}",
1378                        spec.language,
1379                        spec.path,
1380                        spec.extensions
1381                    );
1382                    // Register extensions for this grammar
1383                    for ext in &spec.extensions {
1384                        user_extensions.insert(ext.clone(), scope.clone());
1385                    }
1386                    // Track provenance
1387                    grammar_sources.insert(
1388                        syntax_name.clone(),
1389                        GrammarInfo {
1390                            name: syntax_name,
1391                            source: GrammarSource::Plugin {
1392                                plugin: spec.language.clone(),
1393                                path: spec.path.clone(),
1394                            },
1395                            file_extensions: spec.extensions.clone(),
1396                            short_name: None,
1397                        },
1398                    );
1399                    // Track this grammar path for future reloads
1400                    loaded_grammar_paths.push(spec.clone());
1401                }
1402                Err(e) => {
1403                    tracing::warn!(
1404                        "Failed to load grammar for '{}' from {:?}: {}",
1405                        spec.language,
1406                        spec.path,
1407                        e
1408                    );
1409                }
1410            }
1411        }
1412
1413        let mut reg = Self {
1414            syntax_set: Arc::new(builder.build()),
1415            user_extensions,
1416            filename_scopes: base.filename_scopes.clone(),
1417            loaded_grammar_paths,
1418            grammar_sources,
1419            aliases: base.aliases.clone(),
1420            catalog: Vec::new(),
1421            catalog_by_name: HashMap::new(),
1422            catalog_by_extension: HashMap::new(),
1423            catalog_by_filename: HashMap::new(),
1424            applied_language_config: HashMap::new(),
1425        };
1426        reg.rebuild_catalog();
1427        Some(reg)
1428    }
1429
1430    /// Load a grammar file from disk
1431    ///
1432    /// Only Sublime Text (.sublime-syntax) format is supported.
1433    /// TextMate (.tmLanguage) grammars use a completely different format
1434    /// and cannot be loaded by syntect's yaml-load feature.
1435    pub(crate) fn load_grammar_file(path: &Path) -> Result<SyntaxDefinition, String> {
1436        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
1437
1438        match ext {
1439            "sublime-syntax" => {
1440                let content = std::fs::read_to_string(path)
1441                    .map_err(|e| format!("Failed to read file: {}", e))?;
1442                SyntaxDefinition::load_from_str(
1443                    &content,
1444                    true,
1445                    path.file_stem().and_then(|s| s.to_str()),
1446                )
1447                .map_err(|e| format!("Failed to parse sublime-syntax: {}", e))
1448            }
1449            _ => Err(format!(
1450                "Unsupported grammar format: .{}. Only .sublime-syntax is supported.",
1451                ext
1452            )),
1453        }
1454    }
1455}
1456
1457impl Default for GrammarRegistry {
1458    fn default() -> Self {
1459        // Create with defaults and embedded grammars only (no user grammars)
1460        let defaults = SyntaxSet::load_defaults_newlines();
1461        let mut builder = defaults.into_builder();
1462        Self::add_embedded_grammars(&mut builder);
1463        let syntax_set = builder.build();
1464        let filename_scopes = Self::build_filename_scopes();
1465        let extra_extensions = Self::build_extra_extensions();
1466
1467        let mut registry = Self::new(syntax_set, extra_extensions, filename_scopes);
1468        registry.populate_built_in_aliases();
1469        registry.rebuild_catalog();
1470        registry
1471    }
1472}
1473
1474// VSCode package.json structures for parsing grammar manifests
1475
1476#[derive(Debug, Deserialize)]
1477pub struct PackageManifest {
1478    #[serde(default)]
1479    pub contributes: Option<Contributes>,
1480}
1481
1482#[derive(Debug, Deserialize, Default)]
1483pub struct Contributes {
1484    #[serde(default)]
1485    pub languages: Vec<LanguageContribution>,
1486    #[serde(default)]
1487    pub grammars: Vec<GrammarContribution>,
1488}
1489
1490#[derive(Debug, Deserialize)]
1491pub struct LanguageContribution {
1492    pub id: String,
1493    #[serde(default)]
1494    pub extensions: Vec<String>,
1495}
1496
1497#[derive(Debug, Deserialize)]
1498pub struct GrammarContribution {
1499    pub language: String,
1500    #[serde(rename = "scopeName")]
1501    pub scope_name: String,
1502    pub path: String,
1503}
1504
1505#[cfg(test)]
1506mod tests {
1507    use super::*;
1508
1509    #[test]
1510    fn test_empty_registry() {
1511        let registry = GrammarRegistry::empty();
1512        // Should have at least plain text
1513        assert!(!registry.available_syntaxes().is_empty());
1514    }
1515
1516    #[test]
1517    fn test_default_registry() {
1518        let registry = GrammarRegistry::default();
1519        // Should have built-in syntaxes
1520        assert!(!registry.available_syntaxes().is_empty());
1521    }
1522
1523    #[test]
1524    fn test_find_syntax_for_common_extensions() {
1525        let registry = GrammarRegistry::default();
1526
1527        // Test common extensions that syntect should support
1528        let test_cases = [
1529            ("test.py", true),
1530            ("test.rs", true),
1531            ("test.js", true),
1532            ("test.json", true),
1533            ("test.md", true),
1534            ("test.html", true),
1535            ("test.css", true),
1536            ("test.unknown_extension_xyz", false),
1537        ];
1538
1539        for (filename, should_exist) in test_cases {
1540            let path = Path::new(filename);
1541            let result = registry.find_syntax_for_file(path);
1542            assert_eq!(
1543                result.is_some(),
1544                should_exist,
1545                "Expected {:?} for {}",
1546                should_exist,
1547                filename
1548            );
1549        }
1550    }
1551
1552    #[test]
1553    fn test_racket_grammar_loaded() {
1554        let registry = GrammarRegistry::default();
1555        for filename in ["main.rkt", "data.rktd", "info.rktl", "doc.scrbl"] {
1556            let result = registry.find_syntax_for_file(Path::new(filename));
1557            assert!(
1558                result.is_some(),
1559                "Racket grammar should be available for {}",
1560                filename
1561            );
1562            let entry = registry.find_by_path(Path::new(filename), None).unwrap();
1563            assert_eq!(entry.display_name, "Racket", "for {}", filename);
1564        }
1565    }
1566
1567    #[test]
1568    fn test_syntax_set_arc() {
1569        let registry = GrammarRegistry::default();
1570        let arc1 = registry.syntax_set_arc();
1571        let arc2 = registry.syntax_set_arc();
1572        // Both should point to the same data
1573        assert!(Arc::ptr_eq(&arc1, &arc2));
1574    }
1575
1576    #[test]
1577    fn test_shell_dotfiles_detection() {
1578        let registry = GrammarRegistry::default();
1579
1580        // All these should be detected as shell scripts
1581        let shell_files = [".zshrc", ".zprofile", ".zshenv", ".bash_aliases"];
1582
1583        for filename in shell_files {
1584            let path = Path::new(filename);
1585            let result = registry.find_syntax_for_file(path);
1586            assert!(
1587                result.is_some(),
1588                "{} should be detected as a syntax",
1589                filename
1590            );
1591            let syntax = result.unwrap();
1592            // Should be detected as Bash/Shell
1593            assert!(
1594                syntax.name.to_lowercase().contains("bash")
1595                    || syntax.name.to_lowercase().contains("shell"),
1596                "{} should be detected as shell/bash, got: {}",
1597                filename,
1598                syntax.name
1599            );
1600        }
1601    }
1602
1603    #[test]
1604    fn test_pkgbuild_detection() {
1605        let registry = GrammarRegistry::default();
1606
1607        // PKGBUILD and APKBUILD should be detected as shell scripts
1608        for filename in ["PKGBUILD", "APKBUILD"] {
1609            let path = Path::new(filename);
1610            let result = registry.find_syntax_for_file(path);
1611            assert!(
1612                result.is_some(),
1613                "{} should be detected as a syntax",
1614                filename
1615            );
1616            let syntax = result.unwrap();
1617            // Should be detected as Bash/Shell
1618            assert!(
1619                syntax.name.to_lowercase().contains("bash")
1620                    || syntax.name.to_lowercase().contains("shell"),
1621                "{} should be detected as shell/bash, got: {}",
1622                filename,
1623                syntax.name
1624            );
1625        }
1626    }
1627
1628    #[test]
1629    fn test_find_syntax_with_glob_filenames() {
1630        let mut registry = GrammarRegistry::default();
1631        let mut languages = std::collections::HashMap::new();
1632        languages.insert(
1633            "shell-configs".to_string(),
1634            crate::config::LanguageConfig {
1635                extensions: vec!["sh".to_string()],
1636                filenames: vec!["*.conf".to_string(), "*rc".to_string()],
1637                grammar: "bash".to_string(),
1638                comment_prefix: Some("#".to_string()),
1639                auto_indent: true,
1640                auto_close: None,
1641                auto_surround: None,
1642                textmate_grammar: None,
1643                show_whitespace_tabs: true,
1644                line_wrap: None,
1645                wrap_column: None,
1646                page_view: None,
1647                page_width: None,
1648                use_tabs: None,
1649                tab_size: None,
1650                formatter: None,
1651                format_on_save: false,
1652                on_save: vec![],
1653                word_characters: None,
1654            },
1655        );
1656        registry.apply_language_config(&languages);
1657
1658        assert!(
1659            registry
1660                .find_by_path(Path::new("nftables.conf"), None)
1661                .is_some(),
1662            "*.conf should match nftables.conf"
1663        );
1664        assert!(
1665            registry.find_by_path(Path::new("lfrc"), None).is_some(),
1666            "*rc should match lfrc"
1667        );
1668        // Unrelated file shouldn't panic.
1669        let _ = registry.find_by_path(Path::new("randomfile"), None);
1670    }
1671
1672    #[test]
1673    fn test_find_syntax_with_path_glob_filenames() {
1674        let mut registry = GrammarRegistry::default();
1675        let mut languages = std::collections::HashMap::new();
1676        languages.insert(
1677            "shell-configs".to_string(),
1678            crate::config::LanguageConfig {
1679                extensions: vec!["sh".to_string()],
1680                filenames: vec!["/etc/**/rc.*".to_string()],
1681                grammar: "bash".to_string(),
1682                comment_prefix: Some("#".to_string()),
1683                auto_indent: true,
1684                auto_close: None,
1685                auto_surround: None,
1686                textmate_grammar: None,
1687                show_whitespace_tabs: true,
1688                line_wrap: None,
1689                wrap_column: None,
1690                page_view: None,
1691                page_width: None,
1692                use_tabs: None,
1693                tab_size: None,
1694                formatter: None,
1695                format_on_save: false,
1696                on_save: vec![],
1697                word_characters: None,
1698            },
1699        );
1700        registry.apply_language_config(&languages);
1701
1702        assert!(
1703            registry
1704                .find_by_path(Path::new("/etc/rc.conf"), None)
1705                .is_some(),
1706            "/etc/**/rc.* should match /etc/rc.conf"
1707        );
1708        assert!(
1709            registry
1710                .find_by_path(Path::new("/etc/init/rc.local"), None)
1711                .is_some(),
1712            "/etc/**/rc.* should match /etc/init/rc.local"
1713        );
1714        let _ = registry.find_by_path(Path::new("/var/rc.conf"), None);
1715    }
1716
1717    #[test]
1718    fn test_exact_filename_takes_priority_over_glob() {
1719        let mut registry = GrammarRegistry::default();
1720        let mut languages = std::collections::HashMap::new();
1721
1722        // A language with exact filename "lfrc" -> python grammar
1723        languages.insert(
1724            "custom-lfrc".to_string(),
1725            crate::config::LanguageConfig {
1726                extensions: vec![],
1727                filenames: vec!["lfrc".to_string()],
1728                grammar: "python".to_string(),
1729                comment_prefix: Some("#".to_string()),
1730                auto_indent: true,
1731                auto_close: None,
1732                auto_surround: None,
1733                textmate_grammar: None,
1734                show_whitespace_tabs: true,
1735                line_wrap: None,
1736                wrap_column: None,
1737                page_view: None,
1738                page_width: None,
1739                use_tabs: None,
1740                tab_size: None,
1741                formatter: None,
1742                format_on_save: false,
1743                on_save: vec![],
1744                word_characters: None,
1745            },
1746        );
1747
1748        // A language with glob "*rc" -> bash grammar
1749        languages.insert(
1750            "rc-files".to_string(),
1751            crate::config::LanguageConfig {
1752                extensions: vec![],
1753                filenames: vec!["*rc".to_string()],
1754                grammar: "bash".to_string(),
1755                comment_prefix: Some("#".to_string()),
1756                auto_indent: true,
1757                auto_close: None,
1758                auto_surround: None,
1759                textmate_grammar: None,
1760                show_whitespace_tabs: true,
1761                line_wrap: None,
1762                wrap_column: None,
1763                page_view: None,
1764                page_width: None,
1765                use_tabs: None,
1766                tab_size: None,
1767                formatter: None,
1768                format_on_save: false,
1769                on_save: vec![],
1770                word_characters: None,
1771            },
1772        );
1773
1774        registry.apply_language_config(&languages);
1775
1776        // "lfrc" should match the exact rule (python), not the glob (bash)
1777        let entry = registry.find_by_path(Path::new("lfrc"), None).unwrap();
1778        assert!(
1779            entry.display_name.to_lowercase().contains("python"),
1780            "exact match should win over glob, got: {}",
1781            entry.display_name
1782        );
1783    }
1784
1785    #[test]
1786    fn test_built_in_aliases_resolve() {
1787        let registry = GrammarRegistry::default();
1788
1789        // "bash" should resolve to "Bourne Again Shell (bash)" via alias
1790        let syntax = registry.find_syntax_by_name("bash");
1791        assert!(syntax.is_some(), "alias 'bash' should resolve");
1792        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1793
1794        // "cpp" should resolve to "C++"
1795        let syntax = registry.find_syntax_by_name("cpp");
1796        assert!(syntax.is_some(), "alias 'cpp' should resolve");
1797        assert_eq!(syntax.unwrap().name, "C++");
1798
1799        // "csharp" should resolve to "C#"
1800        let syntax = registry.find_syntax_by_name("csharp");
1801        assert!(syntax.is_some(), "alias 'csharp' should resolve");
1802        assert_eq!(syntax.unwrap().name, "C#");
1803
1804        // "sh" should also resolve to bash
1805        let syntax = registry.find_syntax_by_name("sh");
1806        assert!(syntax.is_some(), "alias 'sh' should resolve");
1807        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1808
1809        // "proto" should resolve to "Protocol Buffers"
1810        let syntax = registry.find_syntax_by_name("proto");
1811        assert!(syntax.is_some(), "alias 'proto' should resolve");
1812        assert_eq!(syntax.unwrap().name, "Protocol Buffers");
1813    }
1814
1815    #[test]
1816    fn test_alias_case_insensitive_input() {
1817        let registry = GrammarRegistry::default();
1818
1819        // Aliases should be case-insensitive on input
1820        let syntax = registry.find_syntax_by_name("BASH");
1821        assert!(
1822            syntax.is_some(),
1823            "alias 'BASH' should resolve case-insensitively"
1824        );
1825        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1826
1827        let syntax = registry.find_syntax_by_name("Cpp");
1828        assert!(
1829            syntax.is_some(),
1830            "alias 'Cpp' should resolve case-insensitively"
1831        );
1832        assert_eq!(syntax.unwrap().name, "C++");
1833    }
1834
1835    #[test]
1836    fn test_full_name_still_works() {
1837        let registry = GrammarRegistry::default();
1838
1839        // Full names should still work (exact match)
1840        let syntax = registry.find_syntax_by_name("Bourne Again Shell (bash)");
1841        assert!(syntax.is_some(), "full name should still resolve");
1842        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1843
1844        // Case-insensitive full name should still work
1845        let syntax = registry.find_syntax_by_name("bourne again shell (bash)");
1846        assert!(
1847            syntax.is_some(),
1848            "case-insensitive full name should resolve"
1849        );
1850        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1851    }
1852
1853    #[test]
1854    fn test_alias_does_not_shadow_full_names() {
1855        let registry = GrammarRegistry::default();
1856
1857        // "Rust" should resolve directly via case-insensitive match, not via alias
1858        let syntax = registry.find_syntax_by_name("rust");
1859        assert!(syntax.is_some());
1860        assert_eq!(syntax.unwrap().name, "Rust");
1861
1862        // "Go" should resolve directly
1863        let syntax = registry.find_syntax_by_name("go");
1864        assert!(syntax.is_some());
1865        assert_eq!(syntax.unwrap().name, "Go");
1866    }
1867
1868    #[test]
1869    fn test_register_alias_rejects_collision() {
1870        let mut registry = GrammarRegistry::default();
1871
1872        // Trying to register an alias that maps to two different targets should fail
1873        assert!(registry.register_alias("myalias", "Rust"));
1874        assert!(!registry.register_alias("myalias", "Go"));
1875
1876        // Same mapping is fine (idempotent)
1877        assert!(registry.register_alias("myalias", "Rust"));
1878    }
1879
1880    #[test]
1881    fn test_register_alias_rejects_nonexistent_target() {
1882        let mut registry = GrammarRegistry::default();
1883        assert!(!registry.register_alias("nope", "Nonexistent Grammar"));
1884    }
1885
1886    #[test]
1887    fn test_register_alias_skips_existing_grammar_name() {
1888        let mut registry = GrammarRegistry::default();
1889
1890        // "rust" case-insensitively matches the grammar "Rust", so no alias needed
1891        assert!(!registry.register_alias("rust", "Rust"));
1892        // Should still be resolvable via case-insensitive match
1893        assert!(registry.find_syntax_by_name("rust").is_some());
1894    }
1895
1896    #[test]
1897    fn test_available_grammar_info_includes_short_names() {
1898        let registry = GrammarRegistry::default();
1899        let infos = registry.available_grammar_info();
1900
1901        let bash_info = infos.iter().find(|g| g.name == "Bourne Again Shell (bash)");
1902        assert!(bash_info.is_some(), "bash grammar should be in the list");
1903        let bash_info = bash_info.unwrap();
1904        assert!(
1905            bash_info.short_name.is_some(),
1906            "bash grammar should have a short_name"
1907        );
1908        // The shortest alias for bash is "sh"
1909        assert_eq!(bash_info.short_name.as_deref(), Some("sh"));
1910    }
1911
1912    #[test]
1913    fn test_catalog_contains_each_language_once() {
1914        let registry = GrammarRegistry::default();
1915        let catalog = registry.catalog();
1916
1917        // Every catalog entry must have a unique (case-insensitive) display name.
1918        let mut seen = std::collections::HashSet::new();
1919        for entry in catalog {
1920            let key = entry.display_name.to_lowercase();
1921            assert!(
1922                seen.insert(key.clone()),
1923                "duplicate catalog entry for display_name={:?}",
1924                entry.display_name
1925            );
1926        }
1927
1928        // TypeScript is tree-sitter-only (syntect ships no grammar for it) yet
1929        // must still appear in the catalog.
1930        let ts = registry
1931            .find_by_name("TypeScript")
1932            .expect("TypeScript must be in the catalog");
1933        assert!(ts.engines.syntect.is_none());
1934        assert_eq!(
1935            ts.engines.tree_sitter,
1936            Some(fresh_languages::Language::TypeScript)
1937        );
1938        assert_eq!(ts.language_id, "typescript");
1939        assert!(ts.extensions.iter().any(|e| e == "ts"));
1940
1941        // Languages that exist in both syntect and tree-sitter (Rust, Python,
1942        // JavaScript) must appear exactly once and prefer the syntect engine.
1943        for name in ["Rust", "Python", "JavaScript"] {
1944            let entry = registry
1945                .find_by_name(name)
1946                .unwrap_or_else(|| panic!("{} must be in the catalog", name));
1947            assert!(
1948                entry.engines.syntect.is_some(),
1949                "{} should have a syntect index",
1950                name
1951            );
1952            assert!(
1953                entry.engines.tree_sitter.is_some(),
1954                "{} should also have a tree-sitter language",
1955                name
1956            );
1957            // Only one entry with this display name (already checked above),
1958            // but also verify language_id lookup lands on the same entry.
1959            let by_id = registry
1960                .find_by_name(&entry.language_id)
1961                .expect("language_id should resolve");
1962            assert_eq!(by_id.display_name, entry.display_name);
1963        }
1964    }
1965
1966    #[test]
1967    fn test_catalog_find_by_path_and_extension() {
1968        let registry = GrammarRegistry::default();
1969        let ts = registry
1970            .find_by_path(Path::new("foo.ts"), None)
1971            .expect("foo.ts should resolve");
1972        assert_eq!(ts.display_name, "TypeScript");
1973        let rs = registry.find_by_extension("rs").expect("rs should resolve");
1974        assert_eq!(rs.display_name, "Rust");
1975    }
1976
1977    /// Build a minimal LanguageConfig for tests.
1978    fn lang_cfg(
1979        grammar: &str,
1980        extensions: &[&str],
1981        filenames: &[&str],
1982    ) -> crate::config::LanguageConfig {
1983        crate::config::LanguageConfig {
1984            extensions: extensions.iter().map(|s| s.to_string()).collect(),
1985            filenames: filenames.iter().map(|s| s.to_string()).collect(),
1986            grammar: grammar.to_string(),
1987            comment_prefix: None,
1988            auto_indent: true,
1989            auto_close: None,
1990            auto_surround: None,
1991            textmate_grammar: None,
1992            show_whitespace_tabs: true,
1993            line_wrap: None,
1994            wrap_column: None,
1995            page_view: None,
1996            page_width: None,
1997            use_tabs: None,
1998            tab_size: None,
1999            formatter: None,
2000            format_on_save: false,
2001            on_save: vec![],
2002            word_characters: None,
2003        }
2004    }
2005
2006    /// Bug #1: a user-declared config key that aliases an existing grammar
2007    /// (e.g. `[languages.mylang] grammar = "Rust"`) must resolve via
2008    /// `find_by_name("mylang")` so the language palette can select it.
2009    #[test]
2010    fn test_user_alias_resolves_via_find_by_name() {
2011        let mut registry = GrammarRegistry::default();
2012        let mut languages = std::collections::HashMap::new();
2013        languages.insert("mylang".to_string(), lang_cfg("Rust", &[], &[]));
2014        registry.apply_language_config(&languages);
2015
2016        let entry = registry
2017            .find_by_name("mylang")
2018            .expect("user-declared alias 'mylang' must resolve");
2019        assert_eq!(entry.display_name, "Rust");
2020    }
2021
2022    /// Bug #2: `register_alias` used to rebuild the catalog from scratch,
2023    /// wiping out everything `apply_language_config` had merged. Registering
2024    /// an alias afterwards must not lose user config.
2025    #[test]
2026    fn test_register_alias_preserves_applied_language_config() {
2027        let mut registry = GrammarRegistry::default();
2028        let mut languages = std::collections::HashMap::new();
2029        languages.insert(
2030            "shell-configs".to_string(),
2031            lang_cfg("bash", &["myconf"], &["*.myconf"]),
2032        );
2033        registry.apply_language_config(&languages);
2034
2035        // Sanity: config applied.
2036        assert!(registry.find_by_extension("myconf").is_some());
2037        assert!(
2038            registry
2039                .find_by_path(Path::new("foo.myconf"), None)
2040                .is_some(),
2041            "glob should match before register_alias"
2042        );
2043
2044        // Registering an alias must not erase the config we just applied.
2045        registry.register_alias("mycustom", "Rust");
2046
2047        assert!(
2048            registry.find_by_extension("myconf").is_some(),
2049            "config extension must survive register_alias"
2050        );
2051        assert!(
2052            registry
2053                .find_by_path(Path::new("foo.myconf"), None)
2054                .is_some(),
2055            "glob must survive register_alias"
2056        );
2057    }
2058
2059    /// Bug #4: `from_syntax_name` used to unconditionally overwrite the
2060    /// catalog's canonical display name with whatever the user typed (e.g.
2061    /// "BASH") — that string ended up in the status bar.
2062    #[test]
2063    fn test_from_syntax_name_preserves_canonical_display_name() {
2064        use crate::primitives::detected_language::DetectedLanguage;
2065        let registry = GrammarRegistry::default();
2066        let languages = std::collections::HashMap::new();
2067
2068        let detected = DetectedLanguage::from_syntax_name("BASH", &registry, &languages)
2069            .expect("BASH should resolve via alias");
2070        assert_eq!(
2071            detected.display_name, "Bourne Again Shell (bash)",
2072            "display_name must be canonical, not user-typed"
2073        );
2074    }
2075
2076    /// A config-only language (no matching syntect grammar) must still appear
2077    /// in the catalog so the language palette can offer it — the old
2078    /// `DetectedLanguage::from_config_language` branch was load-bearing.
2079    #[test]
2080    fn test_config_only_language_appears_in_catalog() {
2081        let mut registry = GrammarRegistry::default();
2082        let mut languages = std::collections::HashMap::new();
2083        // "fish" isn't in syntect; grammar="fish" doesn't resolve either.
2084        languages.insert("fish".to_string(), lang_cfg("fish", &["fish"], &[]));
2085        registry.apply_language_config(&languages);
2086
2087        let entry = registry
2088            .find_by_name("fish")
2089            .expect("fish should be in the catalog after apply_language_config");
2090        assert!(entry.engines.syntect.is_none());
2091        assert!(entry.engines.tree_sitter.is_none());
2092        assert_eq!(entry.language_id, "fish");
2093        assert!(entry.extensions.iter().any(|e| e == "fish"));
2094    }
2095
2096    /// Config-declared extensions must override the built-in mapping. If the
2097    /// user says `[languages.typescript-overlay] extensions = ["js"] grammar
2098    /// = "TypeScript"`, then `foo.js` must resolve to TypeScript, not
2099    /// JavaScript.
2100    #[test]
2101    fn test_config_extension_overrides_builtin() {
2102        let mut registry = GrammarRegistry::default();
2103        // Sanity: default mapping is JavaScript.
2104        assert_eq!(
2105            registry.find_by_extension("js").unwrap().display_name,
2106            "JavaScript"
2107        );
2108
2109        let mut languages = std::collections::HashMap::new();
2110        languages.insert(
2111            "ts-overlay".to_string(),
2112            lang_cfg("TypeScript", &["js"], &[]),
2113        );
2114        registry.apply_language_config(&languages);
2115
2116        assert_eq!(
2117            registry.find_by_extension("js").unwrap().display_name,
2118            "TypeScript",
2119            "user-config extension must win over built-in"
2120        );
2121    }
2122
2123    /// Bare filenames listed by syntect grammars (e.g. "Gemfile", "Makefile",
2124    /// "Rakefile") must resolve through `find_by_path`. Syntect stores these
2125    /// in each grammar's `file_extensions` field alongside real extensions
2126    /// like "rb"; its own `find_syntax_for_file` treats them as either. The
2127    /// catalog has to do the same or `HighlightEngine::for_file` breaks for
2128    /// every extensionless config file.
2129    #[test]
2130    fn test_bare_filename_resolves_via_find_by_path() {
2131        let registry = GrammarRegistry::default();
2132        for (filename, expected_substr) in [
2133            ("Gemfile", "ruby"),
2134            ("Rakefile", "ruby"),
2135            ("Vagrantfile", "ruby"),
2136            ("Makefile", "makefile"),
2137            ("GNUmakefile", "makefile"),
2138        ] {
2139            let entry = registry
2140                .find_by_path(Path::new(filename), None)
2141                .unwrap_or_else(|| panic!("{} must resolve via catalog", filename));
2142            assert!(
2143                entry.display_name.to_lowercase().contains(expected_substr),
2144                "{} should resolve to {} grammar, got {}",
2145                filename,
2146                expected_substr,
2147                entry.display_name
2148            );
2149        }
2150    }
2151
2152    /// Languages that have both syntect and tree-sitter (e.g. JavaScript) must
2153    /// expose the union of both engines' extensions. Tree-sitter-javascript
2154    /// knows `.jsx`; syntect's JavaScript grammar does not. Both should route
2155    /// through the JavaScript catalog entry.
2156    #[test]
2157    fn test_jsx_resolves_to_javascript() {
2158        let registry = GrammarRegistry::default();
2159        let entry = registry
2160            .find_by_path(Path::new("foo.jsx"), None)
2161            .expect("foo.jsx must resolve");
2162        assert_eq!(entry.display_name, "JavaScript");
2163    }
2164
2165    /// `rebuild_catalog` must replay the last-applied language config so it
2166    /// can never silently wipe user `[languages]` rules. This is the invariant
2167    /// that keeps `register_alias`, `populate_built_in_aliases`, and any
2168    /// future rebuild callsite safe-by-construction.
2169    #[test]
2170    fn test_rebuild_catalog_replays_language_config() {
2171        let mut registry = GrammarRegistry::default();
2172        let mut languages = std::collections::HashMap::new();
2173        languages.insert(
2174            "myshell".to_string(),
2175            lang_cfg("bash", &["myext"], &["*.myglob"]),
2176        );
2177        registry.apply_language_config(&languages);
2178        assert!(registry.find_by_extension("myext").is_some());
2179        assert!(registry
2180            .find_by_path(Path::new("foo.myglob"), None)
2181            .is_some());
2182
2183        // Force a rebuild — the catalog gets wiped and re-populated from
2184        // syntect / tree-sitter, but user config must come back on top.
2185        registry.rebuild_catalog();
2186        assert!(
2187            registry.find_by_extension("myext").is_some(),
2188            "rebuild_catalog must replay applied user config"
2189        );
2190        assert!(
2191            registry
2192                .find_by_path(Path::new("foo.myglob"), None)
2193                .is_some(),
2194            "rebuild_catalog must replay user globs"
2195        );
2196    }
2197
2198    /// `apply_language_config` must be idempotent: calling it twice with the
2199    /// same config yields the same catalog state.
2200    #[test]
2201    fn test_apply_language_config_idempotent() {
2202        let mut registry = GrammarRegistry::default();
2203        let mut languages = std::collections::HashMap::new();
2204        languages.insert(
2205            "shell-cfg".to_string(),
2206            lang_cfg("bash", &["myconf"], &["*.myconf"]),
2207        );
2208
2209        registry.apply_language_config(&languages);
2210        let first_extensions = registry
2211            .find_by_name("bash")
2212            .unwrap()
2213            .extensions
2214            .iter()
2215            .filter(|e| e == &"myconf")
2216            .count();
2217        let first_globs = registry
2218            .find_by_name("bash")
2219            .unwrap()
2220            .filename_globs
2221            .iter()
2222            .filter(|g| g == &"*.myconf")
2223            .count();
2224        assert_eq!(first_extensions, 1);
2225        assert_eq!(first_globs, 1);
2226
2227        // Second call must not duplicate anything.
2228        registry.apply_language_config(&languages);
2229        let second_extensions = registry
2230            .find_by_name("bash")
2231            .unwrap()
2232            .extensions
2233            .iter()
2234            .filter(|e| e == &"myconf")
2235            .count();
2236        let second_globs = registry
2237            .find_by_name("bash")
2238            .unwrap()
2239            .filename_globs
2240            .iter()
2241            .filter(|g| g == &"*.myconf")
2242            .count();
2243        assert_eq!(second_extensions, 1, "extensions must not duplicate");
2244        assert_eq!(second_globs, 1, "globs must not duplicate");
2245    }
2246
2247    /// `tree_sitter_for_syntect_name` handles the alias table + strict
2248    /// display-name match. The alias table catches syntect's verbose names;
2249    /// the strict match handles the common case.
2250    #[test]
2251    fn test_tree_sitter_bridge() {
2252        assert_eq!(
2253            tree_sitter_for_syntect_name("Bourne Again Shell (bash)"),
2254            Some(fresh_languages::Language::Bash)
2255        );
2256        assert_eq!(
2257            tree_sitter_for_syntect_name("Rust"),
2258            Some(fresh_languages::Language::Rust)
2259        );
2260        // Must NOT fuzzy-match Nushell to Bash.
2261        assert_eq!(tree_sitter_for_syntect_name("Nushell"), None);
2262        // Must NOT match arbitrary strings.
2263        assert_eq!(tree_sitter_for_syntect_name("does-not-exist"), None);
2264    }
2265}