Skip to main content

fresh/primitives/grammar/
types.rs

1//! Pure grammar registry types without I/O operations.
2//!
3//! This module contains the `GrammarRegistry` struct and all syntax lookup methods
4//! that don't require filesystem access. This enables WASM compatibility and easier testing.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::sync::Arc;
10use syntect::parsing::{SyntaxDefinition, SyntaxReference, SyntaxSet, SyntaxSetBuilder};
11
12// Re-export glob matching utilities for use by other modules
13pub use crate::primitives::glob_match::{
14    filename_glob_matches, is_glob_pattern, is_path_pattern, path_glob_matches,
15};
16
/// A grammar specification: language name, path to grammar file, and associated file extensions.
///
/// Used to pass grammar information between the plugin layer, loader, and registry
/// without relying on anonymous tuples. This type only carries the data; nothing
/// in its definition reads from `path`.
#[derive(Clone, Debug)]
pub struct GrammarSpec {
    /// Language identifier (e.g., "elixir")
    pub language: String,
    /// Path to the grammar file (.sublime-syntax)
    pub path: PathBuf,
    /// File extensions to associate with this grammar (e.g., ["ex", "exs"]).
    /// NOTE(review): the examples carry no leading dot — confirm producers follow that.
    pub extensions: Vec<String>,
}
30
31/// Where a grammar was loaded from.
32#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
33#[serde(tag = "type")]
34pub enum GrammarSource {
35    /// Built-in to Fresh (pre-compiled syntect defaults + embedded grammars)
36    #[serde(rename = "built-in")]
37    BuiltIn,
38    /// Installed from a user grammar directory (~/.config/fresh/grammars/)
39    #[serde(rename = "user")]
40    User { path: PathBuf },
41    /// From a language pack (~/.config/fresh/languages/packages/)
42    #[serde(rename = "language-pack")]
43    LanguagePack { name: String, path: PathBuf },
44    /// From a bundle package (~/.config/fresh/bundles/packages/)
45    #[serde(rename = "bundle")]
46    Bundle { name: String, path: PathBuf },
47    /// Registered by a plugin at runtime
48    #[serde(rename = "plugin")]
49    Plugin { plugin: String, path: PathBuf },
50}
51
52impl std::fmt::Display for GrammarSource {
53    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
54        match self {
55            GrammarSource::BuiltIn => write!(f, "built-in"),
56            GrammarSource::User { path } => write!(f, "user ({})", path.display()),
57            GrammarSource::LanguagePack { name, .. } => write!(f, "language-pack ({})", name),
58            GrammarSource::Bundle { name, .. } => write!(f, "bundle ({})", name),
59            GrammarSource::Plugin { plugin, .. } => write!(f, "plugin ({})", plugin),
60        }
61    }
62}
63
/// Information about an available grammar, including its provenance.
///
/// Derives `Serialize`/`Deserialize`, so the field names below are also the
/// serialized keys.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GrammarInfo {
    /// The grammar name as used in config files (case-insensitive matching)
    pub name: String,
    /// Where this grammar was loaded from
    pub source: GrammarSource,
    /// File extensions associated with this grammar
    pub file_extensions: Vec<String>,
    /// Optional short name alias (e.g., "bash" for "Bourne Again Shell (bash)").
    /// Omitted from serialized output when `None`; defaults to `None` when the
    /// key is absent on deserialization.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub short_name: Option<String>,
}
77
/// Bridge between syntect display names and `fresh_languages::Language`.
///
/// Most syntect grammars map one-to-one: "Rust" → `Language::Rust`. A few
/// have verbose display names that don't match the tree-sitter enum's
/// `display_name()`, and `Language::from_name` has fuzzy "contains shell"
/// fallbacks that would wrongly tag Nushell as tree-sitter Bash. This is
/// the one place we spell the exceptions out explicitly.
///
/// Each tuple is `(syntect display name, tree-sitter language)`. The table is
/// consulted by `tree_sitter_for_syntect_name`, which compares the first
/// element by exact string equality.
const SYNTECT_TO_TREE_SITTER_ALIASES: &[(&str, fresh_languages::Language)] =
    &[("Bourne Again Shell (bash)", fresh_languages::Language::Bash)];
87
88/// Resolve a syntect syntax display name to a tree-sitter language, using
89/// strict equality against the alias table and `Language::display_name()`.
90fn tree_sitter_for_syntect_name(display_name: &str) -> Option<fresh_languages::Language> {
91    for (syntect_name, lang) in SYNTECT_TO_TREE_SITTER_ALIASES {
92        if *syntect_name == display_name {
93            return Some(*lang);
94        }
95    }
96    fresh_languages::Language::all()
97        .iter()
98        .find(|l| l.display_name() == display_name)
99        .copied()
100}
101
/// Which highlighters can serve a given `GrammarEntry`.
///
/// A catalog entry may come from syntect (a TextMate grammar indexed into
/// `SyntaxSet`), tree-sitter (a `fresh_languages::Language`), or both.
/// The derived `Default` leaves both engines unset (`None`).
#[derive(Clone, Debug, Default)]
pub struct GrammarEngines {
    /// Index into `GrammarRegistry::syntax_set().syntaxes()`, if a syntect
    /// grammar is available.
    /// NOTE(review): the index is only meaningful for the syntax set it was
    /// computed against — confirm it is refreshed whenever the set is rebuilt.
    pub syntect: Option<usize>,
    /// Tree-sitter language, if one is registered for this grammar.
    pub tree_sitter: Option<fresh_languages::Language>,
}
114
/// A single entry in the unified grammar catalog.
///
/// Each entry represents one logical language (e.g. "Rust", "TypeScript") and
/// records which highlighting engines can serve it, plus the names/extensions
/// used to look it up. The catalog is the single source of truth for grammar
/// lookups — `find_by_name`, `find_by_path`, `find_by_extension` all return
/// entries from here, and both `HighlightEngine::from_entry` and
/// `DetectedLanguage::from_entry` consume them.
#[derive(Clone, Debug)]
pub struct GrammarEntry {
    /// Human-readable display name (e.g. "TypeScript", "Bourne Again Shell (bash)").
    pub display_name: String,
    /// Canonical language ID used in config and LSP (e.g. "typescript", "csharp").
    pub language_id: String,
    /// Short alias, if one exists (e.g. "ts" for TypeScript).
    pub short_name: Option<String>,
    /// File extensions (without leading dot).
    pub extensions: Vec<String>,
    /// Exact filenames that map to this grammar (e.g. "Dockerfile").
    pub filenames: Vec<String>,
    /// Filename globs from user config (e.g. "*.conf", "/etc/**/rc.*").
    /// NOTE(review): presumably evaluated with the glob helpers re-exported at
    /// the top of this module — confirm against the lookup code.
    pub filename_globs: Vec<String>,
    /// Where this grammar was loaded from.
    pub source: GrammarSource,
    /// Highlighters that can serve this entry.
    pub engines: GrammarEngines,
}
142
// Embedded `.sublime-syntax` grammar sources.
//
// `include_str!` resolves each path relative to this source file and embeds
// the file's text into the binary at compile time, so a missing grammar file
// is a build error rather than a runtime failure. The constants are parsed
// into syntect definitions by `GrammarRegistry::add_embedded_grammars`.

/// Embedded TOML grammar (syntect doesn't include one)
pub const TOML_GRAMMAR: &str = include_str!("../../grammars/toml.sublime-syntax");

/// Embedded Odin grammar (syntect doesn't include one)
/// From: https://github.com/Tetralux/sublime-odin (MIT License)
pub const ODIN_GRAMMAR: &str = include_str!("../../grammars/odin/Odin.sublime-syntax");

/// Embedded Zig grammar (syntect doesn't include one)
pub const ZIG_GRAMMAR: &str = include_str!("../../grammars/zig.sublime-syntax");

/// Embedded Git Rebase Todo grammar for interactive rebase
pub const GIT_REBASE_GRAMMAR: &str = include_str!("../../grammars/git-rebase.sublime-syntax");

/// Embedded Git Commit Message grammar for COMMIT_EDITMSG, MERGE_MSG, etc.
pub const GIT_COMMIT_GRAMMAR: &str = include_str!("../../grammars/git-commit.sublime-syntax");

/// Embedded Gitignore grammar for .gitignore and similar files
pub const GITIGNORE_GRAMMAR: &str = include_str!("../../grammars/gitignore.sublime-syntax");

/// Embedded Git Config grammar for .gitconfig, .gitmodules
pub const GITCONFIG_GRAMMAR: &str = include_str!("../../grammars/gitconfig.sublime-syntax");

/// Embedded Git Attributes grammar for .gitattributes
pub const GITATTRIBUTES_GRAMMAR: &str = include_str!("../../grammars/gitattributes.sublime-syntax");

/// Embedded Typst grammar (syntect doesn't include one)
pub const TYPST_GRAMMAR: &str = include_str!("../../grammars/typst.sublime-syntax");

// The constants below are the ones loaded through the name table in
// `add_embedded_grammars`.

/// Embedded Dockerfile grammar
pub const DOCKERFILE_GRAMMAR: &str = include_str!("../../grammars/dockerfile.sublime-syntax");
/// Embedded INI grammar (also handles .env, .cfg, .editorconfig, etc.)
pub const INI_GRAMMAR: &str = include_str!("../../grammars/ini.sublime-syntax");
/// Embedded CMake grammar
pub const CMAKE_GRAMMAR: &str = include_str!("../../grammars/cmake.sublime-syntax");
/// Embedded SCSS grammar
pub const SCSS_GRAMMAR: &str = include_str!("../../grammars/scss.sublime-syntax");
/// Embedded LESS grammar
pub const LESS_GRAMMAR: &str = include_str!("../../grammars/less.sublime-syntax");
/// Embedded PowerShell grammar
pub const POWERSHELL_GRAMMAR: &str = include_str!("../../grammars/powershell.sublime-syntax");
/// Embedded Kotlin grammar
pub const KOTLIN_GRAMMAR: &str = include_str!("../../grammars/kotlin.sublime-syntax");
/// Embedded Swift grammar
pub const SWIFT_GRAMMAR: &str = include_str!("../../grammars/swift.sublime-syntax");
/// Embedded Dart grammar
pub const DART_GRAMMAR: &str = include_str!("../../grammars/dart.sublime-syntax");
/// Embedded Elixir grammar
pub const ELIXIR_GRAMMAR: &str = include_str!("../../grammars/elixir.sublime-syntax");
/// Embedded F# grammar
pub const FSHARP_GRAMMAR: &str = include_str!("../../grammars/fsharp.sublime-syntax");
/// Embedded Nix grammar
pub const NIX_GRAMMAR: &str = include_str!("../../grammars/nix.sublime-syntax");
/// Embedded HCL/Terraform grammar
pub const HCL_GRAMMAR: &str = include_str!("../../grammars/hcl.sublime-syntax");
/// Embedded Protocol Buffers grammar
pub const PROTOBUF_GRAMMAR: &str = include_str!("../../grammars/protobuf.sublime-syntax");
/// Embedded GraphQL grammar
pub const GRAPHQL_GRAMMAR: &str = include_str!("../../grammars/graphql.sublime-syntax");
/// Embedded Julia grammar
pub const JULIA_GRAMMAR: &str = include_str!("../../grammars/julia.sublime-syntax");
/// Embedded Nim grammar
pub const NIM_GRAMMAR: &str = include_str!("../../grammars/nim.sublime-syntax");
/// Embedded Gleam grammar
pub const GLEAM_GRAMMAR: &str = include_str!("../../grammars/gleam.sublime-syntax");
/// Embedded V language grammar
pub const VLANG_GRAMMAR: &str = include_str!("../../grammars/vlang.sublime-syntax");
/// Embedded Solidity grammar
pub const SOLIDITY_GRAMMAR: &str = include_str!("../../grammars/solidity.sublime-syntax");
/// Embedded KDL grammar
pub const KDL_GRAMMAR: &str = include_str!("../../grammars/kdl.sublime-syntax");
/// Embedded Nushell grammar
pub const NUSHELL_GRAMMAR: &str = include_str!("../../grammars/nushell.sublime-syntax");
/// Embedded Starlark/Bazel grammar
pub const STARLARK_GRAMMAR: &str = include_str!("../../grammars/starlark.sublime-syntax");
/// Embedded Justfile grammar
pub const JUSTFILE_GRAMMAR: &str = include_str!("../../grammars/justfile.sublime-syntax");
/// Embedded Earthfile grammar
pub const EARTHFILE_GRAMMAR: &str = include_str!("../../grammars/earthfile.sublime-syntax");
/// Embedded Go Module grammar
pub const GOMOD_GRAMMAR: &str = include_str!("../../grammars/gomod.sublime-syntax");
/// Embedded Vue grammar
pub const VUE_GRAMMAR: &str = include_str!("../../grammars/vue.sublime-syntax");
/// Embedded Svelte grammar
pub const SVELTE_GRAMMAR: &str = include_str!("../../grammars/svelte.sublime-syntax");
/// Embedded Astro grammar
pub const ASTRO_GRAMMAR: &str = include_str!("../../grammars/astro.sublime-syntax");
/// Embedded Hyprlang grammar (Hyprland config)
pub const HYPRLANG_GRAMMAR: &str = include_str!("../../grammars/hyprlang.sublime-syntax");
/// Embedded AutoHotkey grammar
/// From: https://github.com/SALZKARTOFFEEEL/ahk-sublime-syntax (MIT License)
pub const AUTOHOTKEY_GRAMMAR: &str =
    include_str!("../../grammars/autohotkey/AutoHotkey.sublime-syntax");
/// Embedded Racket grammar (syntect doesn't include one)
pub const RACKET_GRAMMAR: &str = include_str!("../../grammars/racket.sublime-syntax");
237
238/// Registry of all available TextMate grammars.
239///
240/// This struct holds the compiled syntax set and provides lookup methods.
241/// It does not perform I/O directly - use `GrammarLoader` for loading grammars.
242impl std::fmt::Debug for GrammarRegistry {
243    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
244        f.debug_struct("GrammarRegistry")
245            .field("syntax_count", &self.syntax_set.syntaxes().len())
246            .finish()
247    }
248}
249
250pub struct GrammarRegistry {
251    /// Combined syntax set (built-in + embedded + user grammars)
252    syntax_set: Arc<SyntaxSet>,
253    /// Extension -> scope name mapping for user grammars (takes priority)
254    user_extensions: HashMap<String, String>,
255    /// Filename -> scope name mapping for dotfiles and special files
256    filename_scopes: HashMap<String, String>,
257    /// Paths to dynamically loaded grammar files (for reloading when adding more)
258    loaded_grammar_paths: Vec<GrammarSpec>,
259    /// Provenance info for each grammar (keyed by grammar name)
260    grammar_sources: HashMap<String, GrammarInfo>,
261    /// Short name aliases: lowercase short_name -> full syntect grammar name.
262    /// Provides a deterministic, one-to-one mapping so users can write
263    /// `grammar = "bash"` instead of `grammar = "Bourne Again Shell (bash)"`.
264    aliases: HashMap<String, String>,
265    /// Unified catalog of every known grammar. Rebuilt whenever the syntax set
266    /// or alias table changes. Lookups (`find_by_name`, `find_by_path`, ...)
267    /// all resolve against this.
268    catalog: Vec<GrammarEntry>,
269    /// Index from lowercased lookup keys (display name, language_id, short_name)
270    /// to catalog index.
271    catalog_by_name: HashMap<String, usize>,
272    /// Index from file extension (without dot) to catalog index.
273    catalog_by_extension: HashMap<String, usize>,
274    /// Index from filename to catalog index.
275    catalog_by_filename: HashMap<String, usize>,
276    /// The most recent language config handed to `apply_language_config`.
277    /// Retained so `rebuild_catalog` can replay it — otherwise a rebuild
278    /// (triggered by e.g. `populate_built_in_aliases`) silently wipes user
279    /// `[languages]` config that was merged on top.
280    applied_language_config: HashMap<String, crate::config::LanguageConfig>,
281    /// Monotonic generation, bumped on every catalog mutation. Lets
282    /// observers (plugin state snapshot) detect changes with one integer
283    /// compare instead of recounting entries.
284    catalog_gen: u64,
285}
286
287impl GrammarRegistry {
288    /// Create a new GrammarRegistry from pre-built components.
289    ///
290    /// This is typically called by `GrammarLoader` implementations after
291    /// loading grammars from various sources.
292    pub(crate) fn new(
293        syntax_set: SyntaxSet,
294        user_extensions: HashMap<String, String>,
295        filename_scopes: HashMap<String, String>,
296    ) -> Self {
297        Self::new_with_loaded_paths(
298            syntax_set,
299            user_extensions,
300            filename_scopes,
301            Vec::new(),
302            HashMap::new(),
303        )
304    }
305
306    /// Create a GrammarRegistry with pre-loaded grammar path tracking.
307    ///
308    /// Used by the loader when plugin grammars were included in the initial build,
309    /// so that `loaded_grammar_paths()` reflects what was actually loaded.
310    pub(crate) fn new_with_loaded_paths(
311        syntax_set: SyntaxSet,
312        user_extensions: HashMap<String, String>,
313        filename_scopes: HashMap<String, String>,
314        loaded_grammar_paths: Vec<GrammarSpec>,
315        grammar_sources: HashMap<String, GrammarInfo>,
316    ) -> Self {
317        let mut reg = Self {
318            syntax_set: Arc::new(syntax_set),
319            user_extensions,
320            filename_scopes,
321            loaded_grammar_paths,
322            grammar_sources,
323            aliases: HashMap::new(),
324            catalog: Vec::new(),
325            catalog_by_name: HashMap::new(),
326            catalog_by_extension: HashMap::new(),
327            catalog_by_filename: HashMap::new(),
328            applied_language_config: HashMap::new(),
329            catalog_gen: 0,
330        };
331        reg.rebuild_catalog();
332        reg
333    }
334
335    /// Create an empty grammar registry (fast, for tests that don't need syntax highlighting)
336    pub fn empty() -> Arc<Self> {
337        let mut builder = SyntaxSetBuilder::new();
338        builder.add_plain_text_syntax();
339        let mut reg = Self {
340            syntax_set: Arc::new(builder.build()),
341            user_extensions: HashMap::new(),
342            filename_scopes: HashMap::new(),
343            loaded_grammar_paths: Vec::new(),
344            grammar_sources: HashMap::new(),
345            aliases: HashMap::new(),
346            catalog: Vec::new(),
347            catalog_by_name: HashMap::new(),
348            catalog_by_extension: HashMap::new(),
349            catalog_by_filename: HashMap::new(),
350            applied_language_config: HashMap::new(),
351            catalog_gen: 0,
352        };
353        reg.rebuild_catalog();
354        Arc::new(reg)
355    }
356
357    /// Create a registry with only syntect's pre-compiled defaults (~0ms).
358    ///
359    /// This provides instant syntax highlighting for ~50 common languages
360    /// (Rust, Python, JS/TS, C/C++, Go, Java, HTML, CSS, Markdown, etc.)
361    /// without any `SyntaxSetBuilder::build()` call. Use this at startup,
362    /// then swap in a full registry built on a background thread.
363    pub fn defaults_only() -> Arc<Self> {
364        // Load pre-compiled syntax set (defaults + embedded grammars) from
365        // build-time packdump. This avoids the expensive into_builder() + build()
366        // cycle at runtime (~12s → ~300ms).
367        tracing::info!("defaults_only: loading pre-compiled syntax packdump...");
368        let syntax_set: SyntaxSet = syntect::dumps::from_uncompressed_data(include_bytes!(
369            concat!(env!("OUT_DIR"), "/default_syntaxes.packdump")
370        ))
371        .expect("Failed to load pre-compiled syntax packdump");
372        tracing::info!(
373            "defaults_only: loaded ({} syntaxes)",
374            syntax_set.syntaxes().len()
375        );
376        let grammar_sources = Self::build_grammar_sources_from_syntax_set(&syntax_set);
377        let filename_scopes = Self::build_filename_scopes();
378        let extra_extensions = Self::build_extra_extensions();
379        let mut registry = Self {
380            syntax_set: Arc::new(syntax_set),
381            user_extensions: extra_extensions,
382            filename_scopes,
383            loaded_grammar_paths: Vec::new(),
384            grammar_sources,
385            aliases: HashMap::new(),
386            catalog: Vec::new(),
387            catalog_by_name: HashMap::new(),
388            catalog_by_extension: HashMap::new(),
389            catalog_by_filename: HashMap::new(),
390            applied_language_config: HashMap::new(),
391            catalog_gen: 0,
392        };
393        registry.populate_built_in_aliases();
394        registry.rebuild_catalog();
395        Arc::new(registry)
396    }
397
398    /// Build extra extension -> scope mappings for extensions not covered by syntect defaults.
399    ///
400    /// These map common file extensions to existing syntect grammar scopes,
401    /// filling gaps where syntect's built-in extension lists are incomplete.
402    pub(crate) fn build_extra_extensions() -> HashMap<String, String> {
403        let mut map = HashMap::new();
404
405        // JavaScript variants not in syntect defaults (["js", "htc"])
406        let js_scope = "source.js".to_string();
407        map.insert("cjs".to_string(), js_scope.clone());
408        map.insert("mjs".to_string(), js_scope);
409
410        // Dockerfile variants (e.g. Dockerfile.dev -> .dev extension)
411        // These won't match by extension, handled by filename_scopes and first_line_match
412
413        map
414    }
415
416    /// Build the default filename -> scope mappings for dotfiles and special files.
417    pub(crate) fn build_filename_scopes() -> HashMap<String, String> {
418        let mut map = HashMap::new();
419
420        // Shell configuration files -> Bash/Shell script scope
421        let shell_scope = "source.shell.bash".to_string();
422        for filename in [
423            ".zshrc",
424            ".zprofile",
425            ".zshenv",
426            ".zlogin",
427            ".zlogout",
428            ".bash_aliases",
429            // .bashrc and .bash_profile are already recognized by syntect
430            // Common shell script files without extensions
431            "PKGBUILD",
432            "APKBUILD",
433        ] {
434            map.insert(filename.to_string(), shell_scope.clone());
435        }
436
437        // Git rebase todo files
438        let git_rebase_scope = "source.git-rebase-todo".to_string();
439        map.insert("git-rebase-todo".to_string(), git_rebase_scope);
440
441        // Git commit message files
442        let git_commit_scope = "source.git-commit".to_string();
443        for filename in ["COMMIT_EDITMSG", "MERGE_MSG", "SQUASH_MSG", "TAG_EDITMSG"] {
444            map.insert(filename.to_string(), git_commit_scope.clone());
445        }
446
447        // Gitignore and similar files
448        let gitignore_scope = "source.gitignore".to_string();
449        for filename in [".gitignore", ".dockerignore", ".npmignore", ".hgignore"] {
450            map.insert(filename.to_string(), gitignore_scope.clone());
451        }
452
453        // Git config files
454        let gitconfig_scope = "source.gitconfig".to_string();
455        for filename in [".gitconfig", ".gitmodules"] {
456            map.insert(filename.to_string(), gitconfig_scope.clone());
457        }
458
459        // Git attributes files
460        let gitattributes_scope = "source.gitattributes".to_string();
461        map.insert(".gitattributes".to_string(), gitattributes_scope);
462
463        // Jenkinsfile -> Groovy
464        let groovy_scope = "source.groovy".to_string();
465        map.insert("Jenkinsfile".to_string(), groovy_scope);
466
467        // Vagrantfile -> Ruby (syntect already handles this, but be explicit)
468        // Brewfile -> Ruby
469        let ruby_scope = "source.ruby".to_string();
470        map.insert("Brewfile".to_string(), ruby_scope);
471
472        // Dockerfile and variants (exact names; Dockerfile.* handled via prefix check)
473        let dockerfile_scope = "source.dockerfile".to_string();
474        map.insert("Dockerfile".to_string(), dockerfile_scope.clone());
475        map.insert("Containerfile".to_string(), dockerfile_scope.clone());
476        // Common Dockerfile variants
477        map.insert("Dockerfile.dev".to_string(), dockerfile_scope.clone());
478        map.insert("Dockerfile.prod".to_string(), dockerfile_scope.clone());
479        map.insert("Dockerfile.test".to_string(), dockerfile_scope.clone());
480        map.insert("Dockerfile.build".to_string(), dockerfile_scope.clone());
481
482        // CMake
483        let cmake_scope = "source.cmake".to_string();
484        map.insert("CMakeLists.txt".to_string(), cmake_scope);
485
486        // Starlark/Bazel
487        let starlark_scope = "source.starlark".to_string();
488        map.insert("BUILD".to_string(), starlark_scope.clone());
489        map.insert("BUILD.bazel".to_string(), starlark_scope.clone());
490        map.insert("WORKSPACE".to_string(), starlark_scope.clone());
491        map.insert("WORKSPACE.bazel".to_string(), starlark_scope.clone());
492        map.insert("Tiltfile".to_string(), starlark_scope);
493
494        // Justfile (various casings)
495        let justfile_scope = "source.justfile".to_string();
496        map.insert("justfile".to_string(), justfile_scope.clone());
497        map.insert("Justfile".to_string(), justfile_scope.clone());
498        map.insert(".justfile".to_string(), justfile_scope);
499
500        // EditorConfig -> INI
501        let ini_scope = "source.ini".to_string();
502        map.insert(".editorconfig".to_string(), ini_scope);
503
504        // Earthfile
505        let earthfile_scope = "source.earthfile".to_string();
506        map.insert("Earthfile".to_string(), earthfile_scope);
507
508        // Hyprlang (Hyprland config files)
509        let hyprlang_scope = "source.hyprlang".to_string();
510        map.insert("hyprland.conf".to_string(), hyprlang_scope.clone());
511        map.insert("hyprpaper.conf".to_string(), hyprlang_scope.clone());
512        map.insert("hyprlock.conf".to_string(), hyprlang_scope);
513
514        // go.mod / go.sum
515        let gomod_scope = "source.gomod".to_string();
516        map.insert("go.mod".to_string(), gomod_scope.clone());
517        map.insert("go.sum".to_string(), gomod_scope);
518
519        map
520    }
521
522    /// Add embedded grammars (TOML, Odin, etc.) to a syntax set builder.
523    pub(crate) fn add_embedded_grammars(builder: &mut SyntaxSetBuilder) {
524        // TOML grammar
525        match SyntaxDefinition::load_from_str(TOML_GRAMMAR, true, Some("TOML")) {
526            Ok(syntax) => {
527                builder.add(syntax);
528                tracing::debug!("Loaded embedded TOML grammar");
529            }
530            Err(e) => {
531                tracing::warn!("Failed to load embedded TOML grammar: {}", e);
532            }
533        }
534
535        // Odin grammar
536        match SyntaxDefinition::load_from_str(ODIN_GRAMMAR, true, Some("Odin")) {
537            Ok(syntax) => {
538                builder.add(syntax);
539                tracing::debug!("Loaded embedded Odin grammar");
540            }
541            Err(e) => {
542                tracing::warn!("Failed to load embedded Odin grammar: {}", e);
543            }
544        }
545
546        // Zig grammar
547        match SyntaxDefinition::load_from_str(ZIG_GRAMMAR, true, Some("Zig")) {
548            Ok(syntax) => {
549                builder.add(syntax);
550                tracing::debug!("Loaded embedded Zig grammar");
551            }
552            Err(e) => {
553                tracing::warn!("Failed to load embedded Zig grammar: {}", e);
554            }
555        }
556
557        // Git Rebase Todo grammar
558        match SyntaxDefinition::load_from_str(GIT_REBASE_GRAMMAR, true, Some("Git Rebase Todo")) {
559            Ok(syntax) => {
560                builder.add(syntax);
561                tracing::debug!("Loaded embedded Git Rebase Todo grammar");
562            }
563            Err(e) => {
564                tracing::warn!("Failed to load embedded Git Rebase Todo grammar: {}", e);
565            }
566        }
567
568        // Git Commit Message grammar
569        match SyntaxDefinition::load_from_str(GIT_COMMIT_GRAMMAR, true, Some("Git Commit Message"))
570        {
571            Ok(syntax) => {
572                builder.add(syntax);
573                tracing::debug!("Loaded embedded Git Commit Message grammar");
574            }
575            Err(e) => {
576                tracing::warn!("Failed to load embedded Git Commit Message grammar: {}", e);
577            }
578        }
579
580        // Gitignore grammar
581        match SyntaxDefinition::load_from_str(GITIGNORE_GRAMMAR, true, Some("Gitignore")) {
582            Ok(syntax) => {
583                builder.add(syntax);
584                tracing::debug!("Loaded embedded Gitignore grammar");
585            }
586            Err(e) => {
587                tracing::warn!("Failed to load embedded Gitignore grammar: {}", e);
588            }
589        }
590
591        // Git Config grammar
592        match SyntaxDefinition::load_from_str(GITCONFIG_GRAMMAR, true, Some("Git Config")) {
593            Ok(syntax) => {
594                builder.add(syntax);
595                tracing::debug!("Loaded embedded Git Config grammar");
596            }
597            Err(e) => {
598                tracing::warn!("Failed to load embedded Git Config grammar: {}", e);
599            }
600        }
601
602        // Git Attributes grammar
603        match SyntaxDefinition::load_from_str(GITATTRIBUTES_GRAMMAR, true, Some("Git Attributes")) {
604            Ok(syntax) => {
605                builder.add(syntax);
606                tracing::debug!("Loaded embedded Git Attributes grammar");
607            }
608            Err(e) => {
609                tracing::warn!("Failed to load embedded Git Attributes grammar: {}", e);
610            }
611        }
612
613        // Typst grammar
614        match SyntaxDefinition::load_from_str(TYPST_GRAMMAR, true, Some("Typst")) {
615            Ok(syntax) => {
616                builder.add(syntax);
617                tracing::debug!("Loaded embedded Typst grammar");
618            }
619            Err(e) => {
620                tracing::warn!("Failed to load embedded Typst grammar: {}", e);
621            }
622        }
623
624        // Additional embedded grammars for languages not in syntect defaults
625        let additional_grammars: &[(&str, &str)] = &[
626            (DOCKERFILE_GRAMMAR, "Dockerfile"),
627            (INI_GRAMMAR, "INI"),
628            (CMAKE_GRAMMAR, "CMake"),
629            (SCSS_GRAMMAR, "SCSS"),
630            (LESS_GRAMMAR, "LESS"),
631            (POWERSHELL_GRAMMAR, "PowerShell"),
632            (KOTLIN_GRAMMAR, "Kotlin"),
633            (SWIFT_GRAMMAR, "Swift"),
634            (DART_GRAMMAR, "Dart"),
635            (ELIXIR_GRAMMAR, "Elixir"),
636            (FSHARP_GRAMMAR, "FSharp"),
637            (NIX_GRAMMAR, "Nix"),
638            (HCL_GRAMMAR, "HCL"),
639            (PROTOBUF_GRAMMAR, "Protocol Buffers"),
640            (GRAPHQL_GRAMMAR, "GraphQL"),
641            (JULIA_GRAMMAR, "Julia"),
642            (NIM_GRAMMAR, "Nim"),
643            (GLEAM_GRAMMAR, "Gleam"),
644            (VLANG_GRAMMAR, "V"),
645            (SOLIDITY_GRAMMAR, "Solidity"),
646            (KDL_GRAMMAR, "KDL"),
647            (NUSHELL_GRAMMAR, "Nushell"),
648            (STARLARK_GRAMMAR, "Starlark"),
649            (JUSTFILE_GRAMMAR, "Justfile"),
650            (EARTHFILE_GRAMMAR, "Earthfile"),
651            (GOMOD_GRAMMAR, "Go Module"),
652            (VUE_GRAMMAR, "Vue"),
653            (SVELTE_GRAMMAR, "Svelte"),
654            (ASTRO_GRAMMAR, "Astro"),
655            (HYPRLANG_GRAMMAR, "Hyprlang"),
656            (AUTOHOTKEY_GRAMMAR, "AutoHotkey"),
657            (RACKET_GRAMMAR, "Racket"),
658        ];
659
660        for (grammar_str, name) in additional_grammars {
661            match SyntaxDefinition::load_from_str(grammar_str, true, Some(name)) {
662                Ok(syntax) => {
663                    builder.add(syntax);
664                    tracing::debug!("Loaded embedded {} grammar", name);
665                }
666                Err(e) => {
667                    tracing::warn!("Failed to load embedded {} grammar: {}", name, e);
668                }
669            }
670        }
671    }
672
673    /// Find syntax for a file by path/extension/filename.
674    ///
675    /// Purely metadata-based — does not read the file. For first-line
676    /// (shebang) fallback, use [`find_by_path`] with a `first_line` argument
677    /// and resolve the returned entry's syntect index.
678    pub fn find_syntax_for_file(&self, path: &Path) -> Option<&SyntaxReference> {
679        let entry = self.find_by_path(path, None)?;
680        entry
681            .engines
682            .syntect
683            .map(|i| &self.syntax_set.syntaxes()[i])
684    }
685
686    /// Find syntax by name, with alias resolution.
687    ///
688    /// Thin wrapper around `find_by_name` that returns the associated syntect
689    /// `SyntaxReference`. Tree-sitter-only entries return `None`.
690    ///
691    /// Falls back to a direct syntect lookup for "Plain Text", which the
692    /// catalog deliberately omits but syntect still exposes.
693    pub fn find_syntax_by_name(&self, name: &str) -> Option<&SyntaxReference> {
694        if let Some(entry) = self.find_by_name(name) {
695            if let Some(idx) = entry.engines.syntect {
696                return Some(&self.syntax_set.syntaxes()[idx]);
697            }
698        }
699        // Plain Text is excluded from the catalog (it's not a "grammar" a user
700        // would ever pick), but syntect still stores it and a handful of
701        // callers still ask for it by name.
702        self.syntax_set.find_syntax_by_name(name)
703    }
704
705    // === Alias management ===
706
707    /// Hardcoded short-name aliases for built-in and embedded grammars.
708    ///
709    /// Each entry maps a short name (lowercase) to the exact syntect grammar name.
710    /// Only grammars whose full name differs significantly from a natural short
711    /// form need an entry here. Grammars already short (e.g., "Rust", "Go") are
712    /// reachable via case-insensitive matching and don't need aliases.
713    fn built_in_aliases() -> Vec<(&'static str, &'static str)> {
714        vec![
715            // Syntect built-in grammars with verbose names
716            ("bash", "Bourne Again Shell (bash)"),
717            ("shell", "Bourne Again Shell (bash)"),
718            ("sh", "Bourne Again Shell (bash)"),
719            ("c++", "C++"),
720            ("cpp", "C++"),
721            ("csharp", "C#"),
722            ("objc", "Objective-C"),
723            ("objcpp", "Objective-C++"),
724            ("regex", "Regular Expressions (Python)"),
725            ("regexp", "Regular Expressions (Python)"),
726            // Embedded grammars with multi-word or non-obvious names
727            ("proto", "Protocol Buffers"),
728            ("protobuf", "Protocol Buffers"),
729            ("gomod", "Go Module"),
730            ("git-rebase", "Git Rebase Todo"),
731            ("git-commit", "Git Commit Message"),
732            ("git-config", "Git Config"),
733            ("git-attributes", "Git Attributes"),
734            ("gitignore", "Gitignore"),
735            ("fsharp", "FSharp"),
736            ("f#", "FSharp"),
737            ("terraform", "HCL"),
738            ("tf", "HCL"),
739            ("ts", "TypeScript"),
740            ("js", "JavaScript"),
741            ("py", "Python"),
742            ("rb", "Ruby"),
743            ("rs", "Rust"),
744            ("md", "Markdown"),
745            ("yml", "YAML"),
746            ("dockerfile", "Dockerfile"),
747        ]
748    }
749
750    /// Populate aliases from the built-in table.
751    ///
752    /// Validates that:
753    /// - Each alias target (full name) exists in the syntax set
754    /// - No alias collides (case-insensitive) with an existing grammar full name
755    /// - No duplicate aliases exist
756    pub(crate) fn populate_built_in_aliases(&mut self) {
757        for (short, full) in Self::built_in_aliases() {
758            self.register_alias_inner(short, full, true);
759        }
760        self.rebuild_catalog();
761    }
762
763    /// Register a short-name alias for a grammar.
764    ///
765    /// Returns `true` if the alias was registered, `false` if rejected due to
766    /// collision or missing target. For built-in aliases, collisions panic
767    /// (they indicate a bug). For dynamic aliases, collisions log a warning.
768    ///
769    /// Splices the alias directly into the catalog rather than rebuilding, so
770    /// any user config previously merged via `apply_language_config` is
771    /// preserved. A full rebuild would wipe those entries.
772    pub(crate) fn register_alias(&mut self, short_name: &str, full_name: &str) -> bool {
773        if !self.register_alias_inner(short_name, full_name, false) {
774            return false;
775        }
776        let short_lower = short_name.to_lowercase();
777        let full_lower = full_name.to_lowercase();
778        if let Some(&idx) = self.catalog_by_name.get(&full_lower) {
779            self.catalog_by_name
780                .entry(short_lower.clone())
781                .or_insert(idx);
782            let entry = &mut self.catalog[idx];
783            let replace = match &entry.short_name {
784                None => true,
785                Some(existing) => short_name.len() < existing.len(),
786            };
787            if replace {
788                entry.short_name = Some(short_lower);
789            }
790        }
791        true
792    }
793
794    fn register_alias_inner(
795        &mut self,
796        short_name: &str,
797        full_name: &str,
798        is_built_in: bool,
799    ) -> bool {
800        let short_lower = short_name.to_lowercase();
801
802        // Validate: target grammar must exist in the syntax set
803        let target_exists = self
804            .syntax_set
805            .syntaxes()
806            .iter()
807            .any(|s| s.name.eq_ignore_ascii_case(full_name));
808        if !target_exists {
809            // Tree-sitter-only targets (e.g. TypeScript) are expected to be
810            // absent from the syntect set. `rebuild_catalog` attaches their
811            // short names via a separate pass over `built_in_aliases()`.
812            if tree_sitter_for_syntect_name(full_name).is_some() {
813                return false;
814            }
815            if is_built_in {
816                // Built-in alias targets should always exist; warn but don't panic
817                // (grammar might have been removed from syntect upstream)
818                tracing::warn!(
819                    "[grammar-alias] Built-in alias '{}' -> '{}': target grammar not found, skipping",
820                    short_name, full_name
821                );
822            } else {
823                tracing::warn!(
824                    "[grammar-alias] Alias '{}' -> '{}': target grammar not found, skipping",
825                    short_name,
826                    full_name
827                );
828            }
829            return false;
830        }
831
832        // Validate: short name must not collide (case-insensitive) with any grammar full name
833        let collides_with_full_name = self
834            .syntax_set
835            .syntaxes()
836            .iter()
837            .any(|s| s.name.eq_ignore_ascii_case(&short_lower));
838        if collides_with_full_name {
839            // This is actually fine — the short name matches a full name directly,
840            // so find_syntax_by_name's case-insensitive search will find it.
841            // No alias needed.
842            tracing::debug!(
843                "[grammar-alias] Alias '{}' matches an existing grammar name, skipping (not needed)",
844                short_name
845            );
846            return false;
847        }
848
849        // Validate: no duplicate alias (case-insensitive)
850        if let Some(existing_target) = self.aliases.get(&short_lower) {
851            if existing_target.eq_ignore_ascii_case(full_name) {
852                // Same mapping, no-op
853                return true;
854            }
855            let msg = format!(
856                "Alias '{}' already maps to '{}', cannot remap to '{}'",
857                short_name, existing_target, full_name
858            );
859            if is_built_in {
860                panic!("[grammar-alias] Built-in alias collision: {}", msg);
861            } else {
862                tracing::warn!("[grammar-alias] {}", msg);
863                return false;
864            }
865        }
866
867        // Resolve the exact syntect name (preserving original case)
868        let exact_name = self
869            .syntax_set
870            .syntaxes()
871            .iter()
872            .find(|s| s.name.eq_ignore_ascii_case(full_name))
873            .map(|s| s.name.clone())
874            .unwrap();
875
876        self.aliases.insert(short_lower, exact_name);
877        true
878    }
879
880    // === Unified catalog ===
881
882    /// Rebuild the flat catalog of grammar entries.
883    ///
884    /// Called after the syntax set, aliases, or filename scopes change.
885    /// Produces one entry per logical language by merging:
886    /// 1. Every `SyntaxReference` in the syntax set (except "Plain Text")
887    /// 2. Every `fresh_languages::Language` not already covered by a syntect entry
888    /// 3. Alias short-names attached to their target entry
889    /// 4. Filename mappings from `filename_scopes` attached to their scope's entry
890    /// 5. Extra extensions from `user_extensions` attached to their scope's entry
891    ///
892    /// Automatically replays the last `apply_language_config` at the end, so
893    /// user `[languages]` config survives any rebuild.
894    pub(crate) fn rebuild_catalog(&mut self) {
895        // Reverse-map: full_name (lowercase) -> shortest alias.
896        //
897        // Seed from the built-in alias table as well as the live `aliases`
898        // HashMap: the live map only contains aliases whose target exists in
899        // the syntect set, so tree-sitter-only entries (TypeScript) would
900        // otherwise never get their short name ("ts").
901        let mut short_by_full: HashMap<String, String> = HashMap::new();
902        let record = |map: &mut HashMap<String, String>, short: &str, full: &str| {
903            let key = full.to_lowercase();
904            let keep = match map.get(&key) {
905                None => true,
906                Some(existing) => short.len() < existing.len(),
907            };
908            if keep {
909                map.insert(key, short.to_string());
910            }
911        };
912        for (short, full) in Self::built_in_aliases() {
913            record(&mut short_by_full, short, full);
914        }
915        for (short, full) in &self.aliases {
916            record(&mut short_by_full, short, full);
917        }
918
919        let derive_language_id =
920            |display_name: &str| -> (String, Option<fresh_languages::Language>) {
921                let ts = tree_sitter_for_syntect_name(display_name);
922                let id = ts
923                    .map(|l| l.id().to_string())
924                    .unwrap_or_else(|| display_name.to_lowercase());
925                (id, ts)
926            };
927
928        let mut catalog: Vec<GrammarEntry> = Vec::new();
929        let mut scope_to_index: HashMap<String, usize> = HashMap::new();
930
931        // Syntect-backed entries (skip Plain Text).
932        //
933        // Syntect's `file_extensions` is a hybrid list: real extensions like
934        // "rb" sit alongside bare filenames like "Gemfile", "Rakefile",
935        // "Makefile". Syntect's own `find_syntax_for_file` tries each entry
936        // against the whole filename AND against the path's extension, and
937        // the catalog has to preserve that semantics. We keep everything in
938        // `extensions` here and index each entry as *both* an extension and
939        // a filename at the bottom of this method.
940        for (idx, syntax) in self.syntax_set.syntaxes().iter().enumerate() {
941            if syntax.name == "Plain Text" {
942                continue;
943            }
944            let (language_id, tree_sitter) = derive_language_id(&syntax.name);
945            let short_name = short_by_full.get(&syntax.name.to_lowercase()).cloned();
946            let source = self
947                .grammar_sources
948                .get(&syntax.name)
949                .map(|info| info.source.clone())
950                .unwrap_or(GrammarSource::BuiltIn);
951            let entry_index = catalog.len();
952            scope_to_index.insert(syntax.scope.to_string(), entry_index);
953
954            // Union syntect's file_extensions with tree-sitter's own
955            // extension list when the entry carries both engines.
956            // tree-sitter-javascript handles `.jsx`/`.mjs`/`.cjs` that
957            // syntect's JS grammar doesn't list, and the old code used to
958            // route those paths to tree-sitter via a separate lookup.
959            let mut extensions = syntax.file_extensions.clone();
960            if let Some(lang) = tree_sitter {
961                for ext in lang.extensions() {
962                    let ext = ext.to_string();
963                    if !extensions.iter().any(|e| e == &ext) {
964                        extensions.push(ext);
965                    }
966                }
967            }
968
969            catalog.push(GrammarEntry {
970                display_name: syntax.name.clone(),
971                language_id,
972                short_name,
973                extensions,
974                filenames: Vec::new(),
975                filename_globs: Vec::new(),
976                source,
977                engines: GrammarEngines {
978                    syntect: Some(idx),
979                    tree_sitter,
980                },
981            });
982        }
983
984        // Attach filename_scopes to their entries.
985        for (filename, scope) in &self.filename_scopes {
986            if let Some(&idx) = scope_to_index.get(scope) {
987                if !catalog[idx].filenames.iter().any(|f| f == filename) {
988                    catalog[idx].filenames.push(filename.clone());
989                }
990            }
991        }
992
993        // Attach user_extensions (extra → scope) to their entries.
994        for (ext, scope) in &self.user_extensions {
995            if let Some(&idx) = scope_to_index.get(scope) {
996                if !catalog[idx].extensions.iter().any(|e| e == ext) {
997                    catalog[idx].extensions.push(ext.clone());
998                }
999            }
1000        }
1001
1002        // Ensure every tree-sitter language has an entry. If a syntect entry
1003        // already maps to the same tree-sitter language, skip it; otherwise
1004        // add a tree-sitter-only entry so the catalog is complete (TypeScript
1005        // being the motivating example — syntect ships no grammar for it).
1006        let mut ts_covered: std::collections::HashSet<fresh_languages::Language> =
1007            std::collections::HashSet::new();
1008        for entry in &catalog {
1009            if let Some(lang) = entry.engines.tree_sitter {
1010                ts_covered.insert(lang);
1011            }
1012        }
1013        for lang in fresh_languages::Language::all() {
1014            if ts_covered.contains(lang) {
1015                continue;
1016            }
1017            let display_name = lang.display_name().to_string();
1018            let language_id = lang.id().to_string();
1019            let short_name = short_by_full.get(&display_name.to_lowercase()).cloned();
1020            let extensions: Vec<String> = lang.extensions().iter().map(|s| s.to_string()).collect();
1021            catalog.push(GrammarEntry {
1022                display_name,
1023                language_id,
1024                short_name,
1025                extensions,
1026                filenames: Vec::new(),
1027                filename_globs: Vec::new(),
1028                source: GrammarSource::BuiltIn,
1029                engines: GrammarEngines {
1030                    syntect: None,
1031                    tree_sitter: Some(*lang),
1032                },
1033            });
1034        }
1035
1036        // Build name / extension / filename indices.
1037        //
1038        // Every entry in `extensions` gets indexed in BOTH `by_extension`
1039        // (lowercased) AND `by_filename` (exact case) — syntect's
1040        // `file_extensions` list holds both real extensions ("rb") and bare
1041        // filenames ("Gemfile", "Rakefile", "Makefile"). Indexing both ways
1042        // matches syntect's own `find_syntax_for_file` semantics.
1043        let mut by_name: HashMap<String, usize> = HashMap::new();
1044        let mut by_extension: HashMap<String, usize> = HashMap::new();
1045        let mut by_filename: HashMap<String, usize> = HashMap::new();
1046        for (idx, entry) in catalog.iter().enumerate() {
1047            by_name.insert(entry.display_name.to_lowercase(), idx);
1048            by_name.insert(entry.language_id.to_lowercase(), idx);
1049            if let Some(short) = &entry.short_name {
1050                by_name.insert(short.to_lowercase(), idx);
1051            }
1052            for ext in &entry.extensions {
1053                by_extension.entry(ext.to_lowercase()).or_insert(idx);
1054                by_filename.entry(ext.clone()).or_insert(idx);
1055            }
1056            for filename in &entry.filenames {
1057                by_filename.entry(filename.clone()).or_insert(idx);
1058            }
1059        }
1060
1061        self.catalog = catalog;
1062        self.catalog_by_name = by_name;
1063        self.catalog_by_extension = by_extension;
1064        self.catalog_by_filename = by_filename;
1065
1066        // Replay the most recent user config so a rebuild doesn't silently
1067        // wipe out user `[languages]` rules. `take` + restore avoids both a
1068        // clone and a borrow checker fight with `apply_language_config_inner`.
1069        if !self.applied_language_config.is_empty() {
1070            let cfg = std::mem::take(&mut self.applied_language_config);
1071            self.apply_language_config_inner(&cfg);
1072            self.applied_language_config = cfg;
1073        }
1074        self.catalog_gen = self.catalog_gen.wrapping_add(1);
1075    }
1076
1077    /// Return the full catalog of grammar entries.
1078    pub fn catalog(&self) -> &[GrammarEntry] {
1079        &self.catalog
1080    }
1081
1082    /// Monotonic generation, bumped on every catalog mutation. Compare against
1083    /// a previously-observed value to decide whether to recompute derived
1084    /// state.
1085    pub fn catalog_gen(&self) -> u64 {
1086        self.catalog_gen
1087    }
1088
1089    /// Look up a grammar entry by display name, language ID, or short alias
1090    /// (case-insensitive). All aliases — built-in and user-config-declared —
1091    /// are indexed directly in `catalog_by_name` during `rebuild_catalog` /
1092    /// `register_alias` / `apply_language_config`, so a single lookup covers
1093    /// every case.
1094    pub fn find_by_name(&self, name: &str) -> Option<&GrammarEntry> {
1095        self.catalog_by_name
1096            .get(&name.to_lowercase())
1097            .map(|&idx| &self.catalog[idx])
1098    }
1099
1100    /// Look up a grammar entry by file path, with optional first-line content
1101    /// for shebang / `first_line_match` detection.
1102    ///
1103    /// Resolution order:
1104    /// 1. Exact filename (config-declared filenames and filename_scopes live here)
1105    /// 2. Glob patterns from user config (e.g. "*.conf", "/etc/**/rc.*")
1106    /// 3. File extension
1107    /// 4. Shebang / first-line regex match on `first_line` if supplied
1108    ///
1109    /// Globs take priority over extension so a user rule like `*.conf → bash`
1110    /// wins over any built-in extension match on `.conf`. The first-line
1111    /// fallback (#4) is last so catalog matches stay authoritative — syntect
1112    /// might otherwise misclassify `.fish` as bash via its first-line
1113    /// regexes.
1114    ///
1115    /// The first-line fallback is pure: it runs syntect's
1116    /// `find_syntax_by_first_line` regex cache against the caller-supplied
1117    /// string. The registry never touches the filesystem — the caller (who
1118    /// already loaded the buffer via the `FileSystem` trait) must extract
1119    /// the first line and pass it in.
1120    pub fn find_by_path(&self, path: &Path, first_line: Option<&str>) -> Option<&GrammarEntry> {
1121        let filename = path.file_name().and_then(|n| n.to_str());
1122        let path_str = path.to_str().unwrap_or("");
1123
1124        if let Some(name) = filename {
1125            if let Some(&idx) = self.catalog_by_filename.get(name) {
1126                return Some(&self.catalog[idx]);
1127            }
1128        }
1129
1130        // Glob walk — filenames with globs are rare so linear scan is fine.
1131        if let Some(name) = filename {
1132            for entry in &self.catalog {
1133                for pattern in &entry.filename_globs {
1134                    let matched = if is_path_pattern(pattern) {
1135                        path_glob_matches(pattern, path_str)
1136                    } else {
1137                        filename_glob_matches(pattern, name)
1138                    };
1139                    if matched {
1140                        return Some(entry);
1141                    }
1142                }
1143            }
1144        }
1145
1146        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
1147            if let Some(entry) = self.find_by_extension(ext) {
1148                return Some(entry);
1149            }
1150        }
1151
1152        // Last resort: shebang / first-line regex match against the
1153        // caller-supplied content. Map the matched syntect grammar back to a
1154        // catalog entry by name — every syntect syntax has a catalog entry,
1155        // so this round-trip preserves tree-sitter attachment.
1156        let line = first_line?;
1157        let syntax = self.syntax_set.find_syntax_by_first_line(line)?;
1158        self.find_by_name(&syntax.name)
1159    }
1160
1161    /// Look up a grammar entry by file extension (case-insensitive, without dot).
1162    pub fn find_by_extension(&self, ext: &str) -> Option<&GrammarEntry> {
1163        self.catalog_by_extension
1164            .get(&ext.to_lowercase())
1165            .map(|&idx| &self.catalog[idx])
1166    }
1167
1168    /// Merge user `[languages]` config into the catalog.
1169    ///
1170    /// For each config entry, resolves its grammar to an existing catalog entry
1171    /// (by grammar name or by language id). Extensions are added and override
1172    /// the ext→entry index so config wins over built-in mappings. Filenames are
1173    /// split into exact matches (indexed) and globs (walked at lookup time).
1174    ///
1175    /// If no existing entry matches, a new engine-less entry is created so the
1176    /// language still appears in the palette.
1177    ///
1178    /// Idempotent. The config is cached on the registry so `rebuild_catalog`
1179    /// can replay it — callers don't need to re-apply after a rebuild.
1180    pub fn apply_language_config(
1181        &mut self,
1182        languages: &HashMap<String, crate::config::LanguageConfig>,
1183    ) {
1184        self.applied_language_config = languages.clone();
1185        self.apply_language_config_inner(languages);
1186        self.catalog_gen = self.catalog_gen.wrapping_add(1);
1187    }
1188
1189    /// Do the actual catalog splicing without touching
1190    /// `applied_language_config`. Called from `apply_language_config` (which
1191    /// records the input) and from `rebuild_catalog` (which replays the
1192    /// cached input after wiping the catalog).
1193    fn apply_language_config_inner(
1194        &mut self,
1195        languages: &HashMap<String, crate::config::LanguageConfig>,
1196    ) {
1197        for (lang_id, lang_cfg) in languages {
1198            let grammar_name = if lang_cfg.grammar.is_empty() {
1199                lang_id.as_str()
1200            } else {
1201                lang_cfg.grammar.as_str()
1202            };
1203
1204            // Resolve to an existing entry; fall back to creating one.
1205            let idx = self
1206                .catalog_by_name
1207                .get(&grammar_name.to_lowercase())
1208                .copied()
1209                .or_else(|| self.catalog_by_name.get(&lang_id.to_lowercase()).copied())
1210                .unwrap_or_else(|| {
1211                    let idx = self.catalog.len();
1212                    self.catalog.push(GrammarEntry {
1213                        display_name: lang_id.clone(),
1214                        language_id: lang_id.clone(),
1215                        short_name: None,
1216                        extensions: Vec::new(),
1217                        filenames: Vec::new(),
1218                        filename_globs: Vec::new(),
1219                        source: GrammarSource::BuiltIn,
1220                        engines: GrammarEngines::default(),
1221                    });
1222                    idx
1223                });
1224
1225            // Always index the config key so `find_by_name("mylang")` resolves
1226            // even when `mylang` aliases an existing grammar (e.g.
1227            // `[languages.mylang] grammar = "Rust"`). `or_insert` preserves
1228            // any existing mapping — won't clobber the canonical entry.
1229            self.catalog_by_name
1230                .entry(lang_id.to_lowercase())
1231                .or_insert(idx);
1232
1233            for ext in &lang_cfg.extensions {
1234                if !self.catalog[idx].extensions.iter().any(|e| e == ext) {
1235                    self.catalog[idx].extensions.push(ext.clone());
1236                }
1237                // Config-declared extensions override any previous mapping.
1238                self.catalog_by_extension.insert(ext.to_lowercase(), idx);
1239            }
1240            for filename in &lang_cfg.filenames {
1241                if is_glob_pattern(filename) {
1242                    if !self.catalog[idx]
1243                        .filename_globs
1244                        .iter()
1245                        .any(|f| f == filename)
1246                    {
1247                        self.catalog[idx].filename_globs.push(filename.clone());
1248                    }
1249                } else {
1250                    if !self.catalog[idx].filenames.iter().any(|f| f == filename) {
1251                        self.catalog[idx].filenames.push(filename.clone());
1252                    }
1253                    self.catalog_by_filename.insert(filename.clone(), idx);
1254                }
1255            }
1256        }
1257    }
1258
1259    /// Get the underlying syntax set
1260    pub fn syntax_set(&self) -> &Arc<SyntaxSet> {
1261        &self.syntax_set
1262    }
1263
1264    /// Get a clone of the Arc for sharing
1265    pub fn syntax_set_arc(&self) -> Arc<SyntaxSet> {
1266        Arc::clone(&self.syntax_set)
1267    }
1268
1269    /// List all available syntax names
1270    pub fn available_syntaxes(&self) -> Vec<&str> {
1271        self.syntax_set
1272            .syntaxes()
1273            .iter()
1274            .map(|s| s.name.as_str())
1275            .collect()
1276    }
1277
1278    /// List all available grammars with provenance information.
1279    ///
1280    /// Returns a sorted list of `GrammarInfo` entries derived from the unified
1281    /// catalog — this includes both syntect grammars and tree-sitter-only
1282    /// languages (like TypeScript). Each entry is listed exactly once even
1283    /// when both engines can serve it.
1284    pub fn available_grammar_info(&self) -> Vec<GrammarInfo> {
1285        let mut result: Vec<GrammarInfo> = self
1286            .catalog
1287            .iter()
1288            .map(|entry| GrammarInfo {
1289                name: entry.display_name.clone(),
1290                source: entry.source.clone(),
1291                file_extensions: entry.extensions.clone(),
1292                short_name: entry.short_name.clone(),
1293            })
1294            .collect();
1295        result.sort_by(|a, b| a.name.to_lowercase().cmp(&b.name.to_lowercase()));
1296        result
1297    }
1298
1299    /// Get the grammar sources map.
1300    pub(crate) fn grammar_sources(&self) -> &HashMap<String, GrammarInfo> {
1301        &self.grammar_sources
1302    }
1303
1304    /// Build grammar source info from a pre-compiled syntax set.
1305    ///
1306    /// All grammars in the packdump (syntect defaults + embedded) are tagged as built-in.
1307    pub(crate) fn build_grammar_sources_from_syntax_set(
1308        syntax_set: &SyntaxSet,
1309    ) -> HashMap<String, GrammarInfo> {
1310        let mut sources = HashMap::new();
1311        for syntax in syntax_set.syntaxes() {
1312            sources.insert(
1313                syntax.name.clone(),
1314                GrammarInfo {
1315                    name: syntax.name.clone(),
1316                    source: GrammarSource::BuiltIn,
1317                    file_extensions: syntax.file_extensions.clone(),
1318                    short_name: None,
1319                },
1320            );
1321        }
1322        sources
1323    }
1324
1325    /// Get the user extensions mapping (extension -> scope name).
1326    #[cfg(test)]
1327    pub(crate) fn user_extensions(&self) -> &HashMap<String, String> {
1328        &self.user_extensions
1329    }
1330
1331    /// Get the loaded grammar paths (for deduplication in flush_pending_grammars).
1332    #[cfg(test)]
1333    pub(crate) fn loaded_grammar_paths(&self) -> &[GrammarSpec] {
1334        &self.loaded_grammar_paths
1335    }
1336
1337    /// Create a new registry with additional grammar files
1338    ///
1339    /// This builds a new GrammarRegistry that includes all grammars from
1340    /// the base registry plus the additional grammars specified.
1341    /// Uses the base registry's syntax_set as the builder base, preserving
1342    /// all existing grammars (user grammars, language packs, etc.).
1343    ///
1344    /// # Arguments
1345    /// * `base` - The base registry to extend
1346    /// * `additional` - List of (language, path, extensions) tuples for new grammars
1347    ///
1348    /// # Returns
1349    /// A new GrammarRegistry with the additional grammars, or None if rebuilding fails
1350    pub fn with_additional_grammars(
1351        base: &GrammarRegistry,
1352        additional: &[GrammarSpec],
1353    ) -> Option<Self> {
1354        tracing::info!(
1355            "[SYNTAX DEBUG] with_additional_grammars: adding {} grammars to base with {} syntaxes",
1356            additional.len(),
1357            base.syntax_set.syntaxes().len()
1358        );
1359
1360        // Use the base registry's syntax_set as builder base — this preserves
1361        // ALL existing grammars (defaults, embedded, user, language packs)
1362        // without needing to reload them from disk.
1363        let mut builder = (*base.syntax_set).clone().into_builder();
1364
1365        // Preserve existing user extensions and add new ones
1366        let mut user_extensions = base.user_extensions.clone();
1367
1368        // Track loaded grammar paths (existing + new)
1369        let mut loaded_grammar_paths = base.loaded_grammar_paths.clone();
1370
1371        // Preserve existing grammar sources
1372        let mut grammar_sources = base.grammar_sources.clone();
1373
1374        // Add each new grammar
1375        for spec in additional {
1376            tracing::info!(
1377                "[SYNTAX DEBUG] loading new grammar file: lang='{}', path={:?}, extensions={:?}",
1378                spec.language,
1379                spec.path,
1380                spec.extensions
1381            );
1382            match Self::load_grammar_file(&spec.path) {
1383                Ok(syntax) => {
1384                    let scope = syntax.scope.to_string();
1385                    let syntax_name = syntax.name.clone();
1386                    tracing::info!(
1387                        "[SYNTAX DEBUG] grammar loaded successfully: name='{}', scope='{}'",
1388                        syntax_name,
1389                        scope
1390                    );
1391                    builder.add(syntax);
1392                    tracing::info!(
1393                        "Loaded grammar for '{}' from {:?} with extensions {:?}",
1394                        spec.language,
1395                        spec.path,
1396                        spec.extensions
1397                    );
1398                    // Register extensions for this grammar
1399                    for ext in &spec.extensions {
1400                        user_extensions.insert(ext.clone(), scope.clone());
1401                    }
1402                    // Track provenance
1403                    grammar_sources.insert(
1404                        syntax_name.clone(),
1405                        GrammarInfo {
1406                            name: syntax_name,
1407                            source: GrammarSource::Plugin {
1408                                plugin: spec.language.clone(),
1409                                path: spec.path.clone(),
1410                            },
1411                            file_extensions: spec.extensions.clone(),
1412                            short_name: None,
1413                        },
1414                    );
1415                    // Track this grammar path for future reloads
1416                    loaded_grammar_paths.push(spec.clone());
1417                }
1418                Err(e) => {
1419                    tracing::warn!(
1420                        "Failed to load grammar for '{}' from {:?}: {}",
1421                        spec.language,
1422                        spec.path,
1423                        e
1424                    );
1425                }
1426            }
1427        }
1428
1429        let mut reg = Self {
1430            syntax_set: Arc::new(builder.build()),
1431            user_extensions,
1432            filename_scopes: base.filename_scopes.clone(),
1433            loaded_grammar_paths,
1434            grammar_sources,
1435            aliases: base.aliases.clone(),
1436            catalog: Vec::new(),
1437            catalog_by_name: HashMap::new(),
1438            catalog_by_extension: HashMap::new(),
1439            catalog_by_filename: HashMap::new(),
1440            applied_language_config: HashMap::new(),
1441            catalog_gen: 0,
1442        };
1443        reg.rebuild_catalog();
1444        Some(reg)
1445    }
1446
1447    /// Load a grammar file from disk
1448    ///
1449    /// Only Sublime Text (.sublime-syntax) format is supported.
1450    /// TextMate (.tmLanguage) grammars use a completely different format
1451    /// and cannot be loaded by syntect's yaml-load feature.
1452    pub(crate) fn load_grammar_file(path: &Path) -> Result<SyntaxDefinition, String> {
1453        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
1454
1455        match ext {
1456            "sublime-syntax" => {
1457                let content = std::fs::read_to_string(path)
1458                    .map_err(|e| format!("Failed to read file: {}", e))?;
1459                SyntaxDefinition::load_from_str(
1460                    &content,
1461                    true,
1462                    path.file_stem().and_then(|s| s.to_str()),
1463                )
1464                .map_err(|e| format!("Failed to parse sublime-syntax: {}", e))
1465            }
1466            _ => Err(format!(
1467                "Unsupported grammar format: .{}. Only .sublime-syntax is supported.",
1468                ext
1469            )),
1470        }
1471    }
1472}
1473
1474impl Default for GrammarRegistry {
1475    fn default() -> Self {
1476        // Create with defaults and embedded grammars only (no user grammars)
1477        let defaults = SyntaxSet::load_defaults_newlines();
1478        let mut builder = defaults.into_builder();
1479        Self::add_embedded_grammars(&mut builder);
1480        let syntax_set = builder.build();
1481        let filename_scopes = Self::build_filename_scopes();
1482        let extra_extensions = Self::build_extra_extensions();
1483
1484        let mut registry = Self::new(syntax_set, extra_extensions, filename_scopes);
1485        registry.populate_built_in_aliases();
1486        registry.rebuild_catalog();
1487        registry
1488    }
1489}
1490
1491// VSCode package.json structures for parsing grammar manifests
1492
1493#[derive(Debug, Deserialize)]
1494pub struct PackageManifest {
1495    #[serde(default)]
1496    pub contributes: Option<Contributes>,
1497}
1498
1499#[derive(Debug, Deserialize, Default)]
1500pub struct Contributes {
1501    #[serde(default)]
1502    pub languages: Vec<LanguageContribution>,
1503    #[serde(default)]
1504    pub grammars: Vec<GrammarContribution>,
1505}
1506
1507#[derive(Debug, Deserialize)]
1508pub struct LanguageContribution {
1509    pub id: String,
1510    #[serde(default)]
1511    pub extensions: Vec<String>,
1512}
1513
1514#[derive(Debug, Deserialize)]
1515pub struct GrammarContribution {
1516    pub language: String,
1517    #[serde(rename = "scopeName")]
1518    pub scope_name: String,
1519    pub path: String,
1520}
1521
1522#[cfg(test)]
1523mod tests {
1524    use super::*;
1525
#[test]
fn test_empty_registry() {
    // Even the empty registry is expected to expose at least plain text.
    let registry = GrammarRegistry::empty();
    let syntaxes = registry.available_syntaxes();
    assert!(!syntaxes.is_empty());
}
1532
#[test]
fn test_default_registry() {
    // The default registry must come with built-in syntaxes.
    let registry = GrammarRegistry::default();
    let syntaxes = registry.available_syntaxes();
    assert!(!syntaxes.is_empty());
}
1539
#[test]
fn test_find_syntax_for_common_extensions() {
    let registry = GrammarRegistry::default();

    // Common extensions that syntect should support...
    let recognised = [
        "test.py",
        "test.rs",
        "test.js",
        "test.json",
        "test.md",
        "test.html",
        "test.css",
    ];
    // ...followed by one extension nothing should claim.
    let expectations = recognised
        .iter()
        .map(|name| (*name, true))
        .chain(std::iter::once(("test.unknown_extension_xyz", false)));

    for (filename, should_exist) in expectations {
        let found = registry
            .find_syntax_for_file(Path::new(filename))
            .is_some();
        assert_eq!(
            found, should_exist,
            "Expected {:?} for {}",
            should_exist, filename
        );
    }
}
1568
#[test]
fn test_racket_grammar_loaded() {
    let registry = GrammarRegistry::default();
    for filename in ["main.rkt", "data.rktd", "info.rktl", "doc.scrbl"] {
        let path = Path::new(filename);

        // The syntect-level lookup must find something...
        let syntax = registry.find_syntax_for_file(path);
        assert!(
            syntax.is_some(),
            "Racket grammar should be available for {}",
            filename
        );

        // ...and the catalog lookup must name it "Racket".
        let entry = registry.find_by_path(path, None).unwrap();
        assert_eq!(entry.display_name, "Racket", "for {}", filename);
    }
}
1583
#[test]
fn test_syntax_set_arc() {
    let registry = GrammarRegistry::default();
    let first = registry.syntax_set_arc();
    let second = registry.syntax_set_arc();
    // Repeated calls must hand out pointers to the same allocation.
    assert!(Arc::ptr_eq(&first, &second));
}
1592
#[test]
fn test_shell_dotfiles_detection() {
    let registry = GrammarRegistry::default();

    // Every one of these dotfiles should be detected as a shell script.
    for filename in [".zshrc", ".zprofile", ".zshenv", ".bash_aliases"] {
        let syntax = registry
            .find_syntax_for_file(Path::new(filename))
            .unwrap_or_else(|| panic!("{} should be detected as a syntax", filename));

        // Accept either a "bash" or a "shell" flavoured syntax name.
        let lowered = syntax.name.to_lowercase();
        assert!(
            lowered.contains("bash") || lowered.contains("shell"),
            "{} should be detected as shell/bash, got: {}",
            filename,
            syntax.name
        );
    }
}
1619
#[test]
fn test_pkgbuild_detection() {
    let registry = GrammarRegistry::default();

    // PKGBUILD and APKBUILD should both be detected as shell scripts.
    for filename in ["PKGBUILD", "APKBUILD"] {
        let syntax = registry
            .find_syntax_for_file(Path::new(filename))
            .unwrap_or_else(|| panic!("{} should be detected as a syntax", filename));

        // Accept either a "bash" or a "shell" flavoured syntax name.
        let lowered = syntax.name.to_lowercase();
        assert!(
            lowered.contains("bash") || lowered.contains("shell"),
            "{} should be detected as shell/bash, got: {}",
            filename,
            syntax.name
        );
    }
}
1644
/// Filename globs declared in `[languages]` config (`*.conf`, `*rc`) must be
/// honoured by `find_by_path`.
#[test]
fn test_find_syntax_with_glob_filenames() {
    let mut registry = GrammarRegistry::default();
    let mut languages = std::collections::HashMap::new();
    languages.insert(
        "shell-configs".to_string(),
        // Start from the shared `lang_cfg` helper and override only the one
        // field that differs, so this test does not need editing whenever
        // `LanguageConfig` gains a field.
        crate::config::LanguageConfig {
            comment_prefix: Some("#".to_string()),
            ..lang_cfg("bash", &["sh"], &["*.conf", "*rc"])
        },
    );
    registry.apply_language_config(&languages);

    assert!(
        registry
            .find_by_path(Path::new("nftables.conf"), None)
            .is_some(),
        "*.conf should match nftables.conf"
    );
    assert!(
        registry.find_by_path(Path::new("lfrc"), None).is_some(),
        "*rc should match lfrc"
    );
    // Unrelated file shouldn't panic.
    let _ = registry.find_by_path(Path::new("randomfile"), None);
}
1688
/// Path-style globs declared in `[languages]` config (`/etc/**/rc.*`) must be
/// honoured by `find_by_path`.
#[test]
fn test_find_syntax_with_path_glob_filenames() {
    let mut registry = GrammarRegistry::default();
    let mut languages = std::collections::HashMap::new();
    languages.insert(
        "shell-configs".to_string(),
        // Start from the shared `lang_cfg` helper and override only the one
        // field that differs, so this test does not need editing whenever
        // `LanguageConfig` gains a field.
        crate::config::LanguageConfig {
            comment_prefix: Some("#".to_string()),
            ..lang_cfg("bash", &["sh"], &["/etc/**/rc.*"])
        },
    );
    registry.apply_language_config(&languages);

    assert!(
        registry
            .find_by_path(Path::new("/etc/rc.conf"), None)
            .is_some(),
        "/etc/**/rc.* should match /etc/rc.conf"
    );
    assert!(
        registry
            .find_by_path(Path::new("/etc/init/rc.local"), None)
            .is_some(),
        "/etc/**/rc.* should match /etc/init/rc.local"
    );
    // A path outside /etc is only exercised for the absence of a panic.
    let _ = registry.find_by_path(Path::new("/var/rc.conf"), None);
}
1733
/// When one language lists a filename exactly and another matches it through
/// a glob, the exact rule must win.
#[test]
fn test_exact_filename_takes_priority_over_glob() {
    let mut registry = GrammarRegistry::default();
    let mut languages = std::collections::HashMap::new();

    // Both configs are the shared `lang_cfg` baseline plus a "#" comment
    // prefix; building them through the helper keeps this test in step with
    // `LanguageConfig` as fields are added.
    let with_hash_comment = |cfg: crate::config::LanguageConfig| crate::config::LanguageConfig {
        comment_prefix: Some("#".to_string()),
        ..cfg
    };

    // A language with exact filename "lfrc" -> python grammar
    languages.insert(
        "custom-lfrc".to_string(),
        with_hash_comment(lang_cfg("python", &[], &["lfrc"])),
    );

    // A language with glob "*rc" -> bash grammar
    languages.insert(
        "rc-files".to_string(),
        with_hash_comment(lang_cfg("bash", &[], &["*rc"])),
    );

    registry.apply_language_config(&languages);

    // "lfrc" should match the exact rule (python), not the glob (bash)
    let entry = registry.find_by_path(Path::new("lfrc"), None).unwrap();
    assert!(
        entry.display_name.to_lowercase().contains("python"),
        "exact match should win over glob, got: {}",
        entry.display_name
    );
}
1801
#[test]
fn test_built_in_aliases_resolve() {
    let registry = GrammarRegistry::default();

    // (short alias, full syntax name it must resolve to)
    let expectations = [
        ("bash", "Bourne Again Shell (bash)"),
        ("cpp", "C++"),
        ("csharp", "C#"),
        ("sh", "Bourne Again Shell (bash)"),
        ("proto", "Protocol Buffers"),
    ];

    for (alias, canonical) in expectations {
        let syntax = registry.find_syntax_by_name(alias);
        assert!(syntax.is_some(), "alias '{}' should resolve", alias);
        assert_eq!(syntax.unwrap().name, canonical);
    }
}
1831
#[test]
fn test_alias_case_insensitive_input() {
    let registry = GrammarRegistry::default();

    // Alias lookup must ignore the case of what the caller typed.
    for (typed, canonical) in [("BASH", "Bourne Again Shell (bash)"), ("Cpp", "C++")] {
        let syntax = registry.find_syntax_by_name(typed);
        assert!(
            syntax.is_some(),
            "alias '{}' should resolve case-insensitively",
            typed
        );
        assert_eq!(syntax.unwrap().name, canonical);
    }
}
1851
#[test]
fn test_full_name_still_works() {
    let registry = GrammarRegistry::default();

    // The full syntax name must keep resolving, both exactly as written and
    // with different casing.
    let lookups = [
        ("Bourne Again Shell (bash)", "full name should still resolve"),
        (
            "bourne again shell (bash)",
            "case-insensitive full name should resolve",
        ),
    ];

    for (query, failure_message) in lookups {
        let syntax = registry.find_syntax_by_name(query);
        assert!(syntax.is_some(), "{}", failure_message);
        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
    }
}
1869
#[test]
fn test_alias_does_not_shadow_full_names() {
    let registry = GrammarRegistry::default();

    // Lower-cased grammar names ("rust", "go") must land on the grammar of
    // that name.
    for (query, expected) in [("rust", "Rust"), ("go", "Go")] {
        let syntax = registry.find_syntax_by_name(query);
        assert!(syntax.is_some());
        assert_eq!(syntax.unwrap().name, expected);
    }
}
1884
#[test]
fn test_register_alias_rejects_collision() {
    let mut registry = GrammarRegistry::default();

    // The first registration succeeds...
    let registered = registry.register_alias("myalias", "Rust");
    assert!(registered);

    // ...re-pointing the same alias at a different grammar is refused...
    let repointed = registry.register_alias("myalias", "Go");
    assert!(!repointed);

    // ...while repeating the original mapping is accepted (idempotent).
    let repeated = registry.register_alias("myalias", "Rust");
    assert!(repeated);
}
1896
#[test]
fn test_register_alias_rejects_nonexistent_target() {
    let mut registry = GrammarRegistry::default();
    // An alias whose target grammar does not exist must be refused.
    let accepted = registry.register_alias("nope", "Nonexistent Grammar");
    assert!(!accepted);
}
1902
#[test]
fn test_register_alias_skips_existing_grammar_name() {
    let mut registry = GrammarRegistry::default();

    // "rust" already matches the grammar "Rust" case-insensitively, so
    // registering it as an alias reports false.
    let registered = registry.register_alias("rust", "Rust");
    assert!(!registered);

    // The name must nevertheless stay resolvable.
    let syntax = registry.find_syntax_by_name("rust");
    assert!(syntax.is_some());
}
1912
#[test]
fn test_available_grammar_info_includes_short_names() {
    let registry = GrammarRegistry::default();
    let infos = registry.available_grammar_info();

    let bash_info = infos
        .iter()
        .find(|g| g.name == "Bourne Again Shell (bash)")
        .unwrap_or_else(|| panic!("bash grammar should be in the list"));

    assert!(
        bash_info.short_name.is_some(),
        "bash grammar should have a short_name"
    );
    // The shortest alias for bash is "sh".
    assert_eq!(bash_info.short_name.as_deref(), Some("sh"));
}
1928
#[test]
fn test_catalog_contains_each_language_once() {
    let registry = GrammarRegistry::default();

    // No two catalog entries may share a display name, ignoring case.
    let mut seen_names = std::collections::HashSet::new();
    for entry in registry.catalog() {
        let first_occurrence = seen_names.insert(entry.display_name.to_lowercase());
        assert!(
            first_occurrence,
            "duplicate catalog entry for display_name={:?}",
            entry.display_name
        );
    }

    // TypeScript has no syntect grammar (tree-sitter only) but still has to
    // be listed in the catalog.
    let typescript = registry
        .find_by_name("TypeScript")
        .expect("TypeScript must be in the catalog");
    assert!(typescript.engines.syntect.is_none());
    assert_eq!(
        typescript.engines.tree_sitter,
        Some(fresh_languages::Language::TypeScript)
    );
    assert_eq!(typescript.language_id, "typescript");
    assert!(typescript.extensions.iter().any(|e| e == "ts"));

    // Rust, Python and JavaScript exist in both engines: each must be a
    // single entry carrying a syntect index and a tree-sitter language.
    for name in ["Rust", "Python", "JavaScript"] {
        let entry = match registry.find_by_name(name) {
            Some(found) => found,
            None => panic!("{} must be in the catalog", name),
        };
        assert!(
            entry.engines.syntect.is_some(),
            "{} should have a syntect index",
            name
        );
        assert!(
            entry.engines.tree_sitter.is_some(),
            "{} should also have a tree-sitter language",
            name
        );
        // Uniqueness was checked above; additionally, looking the entry up
        // by its language_id must land on the same display name.
        let by_id = registry
            .find_by_name(&entry.language_id)
            .expect("language_id should resolve");
        assert_eq!(by_id.display_name, entry.display_name);
    }
}
1982
#[test]
fn test_catalog_find_by_path_and_extension() {
    let registry = GrammarRegistry::default();

    // Lookup by path.
    let by_path = registry
        .find_by_path(Path::new("foo.ts"), None)
        .expect("foo.ts should resolve");
    assert_eq!(by_path.display_name, "TypeScript");

    // Lookup by bare extension.
    let by_ext = registry.find_by_extension("rs").expect("rs should resolve");
    assert_eq!(by_ext.display_name, "Rust");
}
1993
1994    /// Build a minimal LanguageConfig for tests.
1995    fn lang_cfg(
1996        grammar: &str,
1997        extensions: &[&str],
1998        filenames: &[&str],
1999    ) -> crate::config::LanguageConfig {
2000        crate::config::LanguageConfig {
2001            extensions: extensions.iter().map(|s| s.to_string()).collect(),
2002            filenames: filenames.iter().map(|s| s.to_string()).collect(),
2003            grammar: grammar.to_string(),
2004            comment_prefix: None,
2005            auto_indent: true,
2006            auto_close: None,
2007            auto_surround: None,
2008            textmate_grammar: None,
2009            show_whitespace_tabs: true,
2010            line_wrap: None,
2011            wrap_column: None,
2012            page_view: None,
2013            page_width: None,
2014            use_tabs: None,
2015            tab_size: None,
2016            formatter: None,
2017            format_on_save: false,
2018            on_save: vec![],
2019            word_characters: None,
2020        }
2021    }
2022
/// Bug #1: when the user declares a config key that points at an existing
/// grammar (e.g. `[languages.mylang] grammar = "Rust"`), that key must be
/// resolvable through `find_by_name("mylang")` so the language palette can
/// select it.
#[test]
fn test_user_alias_resolves_via_find_by_name() {
    let mut registry = GrammarRegistry::default();
    let languages = std::collections::HashMap::from([(
        "mylang".to_string(),
        lang_cfg("Rust", &[], &[]),
    )]);
    registry.apply_language_config(&languages);

    let resolved = registry
        .find_by_name("mylang")
        .expect("user-declared alias 'mylang' must resolve");
    assert_eq!(resolved.display_name, "Rust");
}
2038
/// Bug #2: `register_alias` used to rebuild the catalog from scratch, which
/// discarded everything `apply_language_config` had merged in. Registering an
/// alias after applying config must leave that config intact.
#[test]
fn test_register_alias_preserves_applied_language_config() {
    let mut registry = GrammarRegistry::default();
    let languages = std::collections::HashMap::from([(
        "shell-configs".to_string(),
        lang_cfg("bash", &["myconf"], &["*.myconf"]),
    )]);
    registry.apply_language_config(&languages);

    let glob_target = Path::new("foo.myconf");

    // Sanity: config applied.
    assert!(registry.find_by_extension("myconf").is_some());
    assert!(
        registry.find_by_path(glob_target, None).is_some(),
        "glob should match before register_alias"
    );

    // The alias registration itself is not under test; only its side
    // effects on the previously applied config are.
    registry.register_alias("mycustom", "Rust");

    assert!(
        registry.find_by_extension("myconf").is_some(),
        "config extension must survive register_alias"
    );
    assert!(
        registry.find_by_path(glob_target, None).is_some(),
        "glob must survive register_alias"
    );
}
2075
/// Bug #4: `from_syntax_name` used to replace the catalog's canonical display
/// name with whatever the user typed (e.g. "BASH"), and that string then
/// showed up in the status bar.
#[test]
fn test_from_syntax_name_preserves_canonical_display_name() {
    use crate::primitives::detected_language::DetectedLanguage;

    let registry = GrammarRegistry::default();
    // No user language config is needed for this lookup.
    let no_config = std::collections::HashMap::new();

    let detected = DetectedLanguage::from_syntax_name("BASH", &registry, &no_config)
        .expect("BASH should resolve via alias");
    assert_eq!(
        detected.display_name, "Bourne Again Shell (bash)",
        "display_name must be canonical, not user-typed"
    );
}
2092
/// A language that exists only in user config (no matching syntect grammar)
/// must still show up in the catalog so the language palette can offer it —
/// the old `DetectedLanguage::from_config_language` branch was load-bearing.
#[test]
fn test_config_only_language_appears_in_catalog() {
    let mut registry = GrammarRegistry::default();
    // "fish" isn't in syntect; grammar="fish" doesn't resolve either.
    let languages = std::collections::HashMap::from([(
        "fish".to_string(),
        lang_cfg("fish", &["fish"], &[]),
    )]);
    registry.apply_language_config(&languages);

    let fish = registry
        .find_by_name("fish")
        .expect("fish should be in the catalog after apply_language_config");

    // Neither highlighting engine backs the entry...
    assert!(fish.engines.syntect.is_none());
    assert!(fish.engines.tree_sitter.is_none());
    // ...but its id and configured extension are recorded.
    assert_eq!(fish.language_id, "fish");
    assert!(fish.extensions.iter().any(|e| e == "fish"));
}
2112
/// Extensions declared in user config take precedence over the built-in
/// mapping. With `[languages.typescript-overlay] extensions = ["js"] grammar
/// = "TypeScript"`, `foo.js` must resolve to TypeScript rather than
/// JavaScript.
#[test]
fn test_config_extension_overrides_builtin() {
    let mut registry = GrammarRegistry::default();

    // Sanity: default mapping is JavaScript.
    let builtin = registry.find_by_extension("js").unwrap();
    assert_eq!(builtin.display_name, "JavaScript");

    let languages = std::collections::HashMap::from([(
        "ts-overlay".to_string(),
        lang_cfg("TypeScript", &["js"], &[]),
    )]);
    registry.apply_language_config(&languages);

    let overridden = registry.find_by_extension("js").unwrap();
    assert_eq!(
        overridden.display_name, "TypeScript",
        "user-config extension must win over built-in"
    );
}
2139
/// Bare filenames that syntect grammars list (e.g. "Gemfile", "Makefile",
/// "Rakefile") have to resolve through `find_by_path`. Syntect keeps them in
/// each grammar's `file_extensions` field next to real extensions such as
/// "rb", and its own `find_syntax_for_file` treats an entry as either. The
/// catalog must do likewise, otherwise `HighlightEngine::for_file` breaks for
/// every extensionless config file.
#[test]
fn test_bare_filename_resolves_via_find_by_path() {
    let registry = GrammarRegistry::default();

    // (bare filename, substring expected in the lower-cased display name)
    let cases = [
        ("Gemfile", "ruby"),
        ("Rakefile", "ruby"),
        ("Vagrantfile", "ruby"),
        ("Makefile", "makefile"),
        ("GNUmakefile", "makefile"),
    ];

    for (filename, expected_substr) in cases {
        let entry = match registry.find_by_path(Path::new(filename), None) {
            Some(found) => found,
            None => panic!("{} must resolve via catalog", filename),
        };
        let lowered = entry.display_name.to_lowercase();
        assert!(
            lowered.contains(expected_substr),
            "{} should resolve to {} grammar, got {}",
            filename,
            expected_substr,
            entry.display_name
        );
    }
}
2168
/// A language backed by both syntect and tree-sitter (e.g. JavaScript) must
/// expose the union of both engines' extensions. Tree-sitter-javascript knows
/// `.jsx` while syntect's JavaScript grammar does not; either way the file
/// should route through the JavaScript catalog entry.
#[test]
fn test_jsx_resolves_to_javascript() {
    let registry = GrammarRegistry::default();
    let jsx_path = Path::new("foo.jsx");
    let entry = registry
        .find_by_path(jsx_path, None)
        .expect("foo.jsx must resolve");
    assert_eq!(entry.display_name, "JavaScript");
}
2181
2182    /// `rebuild_catalog` must replay the last-applied language config so it
2183    /// can never silently wipe user `[languages]` rules. This is the invariant
2184    /// that keeps `register_alias`, `populate_built_in_aliases`, and any
2185    /// future rebuild callsite safe-by-construction.
2186    #[test]
2187    fn test_rebuild_catalog_replays_language_config() {
2188        let mut registry = GrammarRegistry::default();
2189        let mut languages = std::collections::HashMap::new();
2190        languages.insert(
2191            "myshell".to_string(),
2192            lang_cfg("bash", &["myext"], &["*.myglob"]),
2193        );
2194        registry.apply_language_config(&languages);
2195        assert!(registry.find_by_extension("myext").is_some());
2196        assert!(registry
2197            .find_by_path(Path::new("foo.myglob"), None)
2198            .is_some());
2199
2200        // Force a rebuild — the catalog gets wiped and re-populated from
2201        // syntect / tree-sitter, but user config must come back on top.
2202        registry.rebuild_catalog();
2203        assert!(
2204            registry.find_by_extension("myext").is_some(),
2205            "rebuild_catalog must replay applied user config"
2206        );
2207        assert!(
2208            registry
2209                .find_by_path(Path::new("foo.myglob"), None)
2210                .is_some(),
2211            "rebuild_catalog must replay user globs"
2212        );
2213    }
2214
2215    /// `apply_language_config` must be idempotent: calling it twice with the
2216    /// same config yields the same catalog state.
2217    #[test]
2218    fn test_apply_language_config_idempotent() {
2219        let mut registry = GrammarRegistry::default();
2220        let mut languages = std::collections::HashMap::new();
2221        languages.insert(
2222            "shell-cfg".to_string(),
2223            lang_cfg("bash", &["myconf"], &["*.myconf"]),
2224        );
2225
2226        registry.apply_language_config(&languages);
2227        let first_extensions = registry
2228            .find_by_name("bash")
2229            .unwrap()
2230            .extensions
2231            .iter()
2232            .filter(|e| e == &"myconf")
2233            .count();
2234        let first_globs = registry
2235            .find_by_name("bash")
2236            .unwrap()
2237            .filename_globs
2238            .iter()
2239            .filter(|g| g == &"*.myconf")
2240            .count();
2241        assert_eq!(first_extensions, 1);
2242        assert_eq!(first_globs, 1);
2243
2244        // Second call must not duplicate anything.
2245        registry.apply_language_config(&languages);
2246        let second_extensions = registry
2247            .find_by_name("bash")
2248            .unwrap()
2249            .extensions
2250            .iter()
2251            .filter(|e| e == &"myconf")
2252            .count();
2253        let second_globs = registry
2254            .find_by_name("bash")
2255            .unwrap()
2256            .filename_globs
2257            .iter()
2258            .filter(|g| g == &"*.myconf")
2259            .count();
2260        assert_eq!(second_extensions, 1, "extensions must not duplicate");
2261        assert_eq!(second_globs, 1, "globs must not duplicate");
2262    }
2263
2264    /// `tree_sitter_for_syntect_name` handles the alias table + strict
2265    /// display-name match. The alias table catches syntect's verbose names;
2266    /// the strict match handles the common case.
2267    #[test]
2268    fn test_tree_sitter_bridge() {
2269        assert_eq!(
2270            tree_sitter_for_syntect_name("Bourne Again Shell (bash)"),
2271            Some(fresh_languages::Language::Bash)
2272        );
2273        assert_eq!(
2274            tree_sitter_for_syntect_name("Rust"),
2275            Some(fresh_languages::Language::Rust)
2276        );
2277        // Must NOT fuzzy-match Nushell to Bash.
2278        assert_eq!(tree_sitter_for_syntect_name("Nushell"), None);
2279        // Must NOT match arbitrary strings.
2280        assert_eq!(tree_sitter_for_syntect_name("does-not-exist"), None);
2281    }
2282}