Skip to main content

fresh/primitives/grammar/
types.rs

1//! Pure grammar registry types without I/O operations.
2//!
3//! This module contains the `GrammarRegistry` struct and all syntax lookup methods
4//! that don't require filesystem access. This enables WASM compatibility and easier testing.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::sync::Arc;
10use syntect::parsing::{SyntaxDefinition, SyntaxReference, SyntaxSet, SyntaxSetBuilder};
11
12// Re-export glob matching utilities for use by other modules
13pub use crate::primitives::glob_match::{
14    filename_glob_matches, is_glob_pattern, is_path_pattern, path_glob_matches,
15};
16
/// Describes one grammar to load: the language it serves, where its
/// definition lives, and which file extensions should resolve to it.
///
/// A named struct is used instead of an anonymous tuple so that the plugin
/// layer, the loader, and the registry can exchange grammar information
/// through self-describing fields.
#[derive(Debug, Clone)]
pub struct GrammarSpec {
    /// Identifier of the language, e.g. `"elixir"`.
    pub language: String,
    /// Location of the grammar definition (a `.sublime-syntax` file).
    pub path: PathBuf,
    /// Extensions that should map to this grammar, e.g. `["ex", "exs"]`.
    pub extensions: Vec<String>,
}
30
31/// Where a grammar was loaded from.
32#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
33#[serde(tag = "type")]
34pub enum GrammarSource {
35    /// Built-in to Fresh (pre-compiled syntect defaults + embedded grammars)
36    #[serde(rename = "built-in")]
37    BuiltIn,
38    /// Installed from a user grammar directory (~/.config/fresh/grammars/)
39    #[serde(rename = "user")]
40    User { path: PathBuf },
41    /// From a language pack (~/.config/fresh/languages/packages/)
42    #[serde(rename = "language-pack")]
43    LanguagePack { name: String, path: PathBuf },
44    /// From a bundle package (~/.config/fresh/bundles/packages/)
45    #[serde(rename = "bundle")]
46    Bundle { name: String, path: PathBuf },
47    /// Registered by a plugin at runtime
48    #[serde(rename = "plugin")]
49    Plugin { plugin: String, path: PathBuf },
50}
51
52impl std::fmt::Display for GrammarSource {
53    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
54        match self {
55            GrammarSource::BuiltIn => write!(f, "built-in"),
56            GrammarSource::User { path } => write!(f, "user ({})", path.display()),
57            GrammarSource::LanguagePack { name, .. } => write!(f, "language-pack ({})", name),
58            GrammarSource::Bundle { name, .. } => write!(f, "bundle ({})", name),
59            GrammarSource::Plugin { plugin, .. } => write!(f, "plugin ({})", plugin),
60        }
61    }
62}
63
64/// Information about an available grammar, including its provenance.
65#[derive(Clone, Debug, Serialize, Deserialize)]
66pub struct GrammarInfo {
67    /// The grammar name as used in config files (case-insensitive matching)
68    pub name: String,
69    /// Where this grammar was loaded from
70    pub source: GrammarSource,
71    /// File extensions associated with this grammar
72    pub file_extensions: Vec<String>,
73    /// Optional short name alias (e.g., "bash" for "Bourne Again Shell (bash)")
74    #[serde(default, skip_serializing_if = "Option::is_none")]
75    pub short_name: Option<String>,
76}
77
78/// Bridge between syntect display names and `fresh_languages::Language`.
79///
80/// Most syntect grammars map one-to-one: "Rust" → `Language::Rust`. A few
81/// have verbose display names that don't match the tree-sitter enum's
82/// `display_name()`, and `Language::from_name` has fuzzy "contains shell"
83/// fallbacks that would wrongly tag Nushell as tree-sitter Bash. This is
84/// the one place we spell the exceptions out explicitly.
85const SYNTECT_TO_TREE_SITTER_ALIASES: &[(&str, fresh_languages::Language)] =
86    &[("Bourne Again Shell (bash)", fresh_languages::Language::Bash)];
87
88/// Resolve a syntect syntax display name to a tree-sitter language, using
89/// strict equality against the alias table and `Language::display_name()`.
90fn tree_sitter_for_syntect_name(display_name: &str) -> Option<fresh_languages::Language> {
91    for (syntect_name, lang) in SYNTECT_TO_TREE_SITTER_ALIASES {
92        if *syntect_name == display_name {
93            return Some(*lang);
94        }
95    }
96    fresh_languages::Language::all()
97        .iter()
98        .find(|l| l.display_name() == display_name)
99        .copied()
100}
101
102/// Which highlighters can serve a given `GrammarEntry`.
103///
104/// A catalog entry may come from syntect (a TextMate grammar indexed into
105/// `SyntaxSet`), tree-sitter (a `fresh_languages::Language`), or both.
106#[derive(Clone, Debug, Default)]
107pub struct GrammarEngines {
108    /// Index into `GrammarRegistry::syntax_set().syntaxes()`, if a syntect
109    /// grammar is available.
110    pub syntect: Option<usize>,
111    /// Tree-sitter language, if one is registered for this grammar.
112    pub tree_sitter: Option<fresh_languages::Language>,
113}
114
115/// A single entry in the unified grammar catalog.
116///
117/// Each entry represents one logical language (e.g. "Rust", "TypeScript") and
118/// records which highlighting engines can serve it, plus the names/extensions
119/// used to look it up. The catalog is the single source of truth for grammar
120/// lookups — `find_by_name`, `find_by_path`, `find_by_extension` all return
121/// entries from here, and both `HighlightEngine::from_entry` and
122/// `DetectedLanguage::from_entry` consume them.
123#[derive(Clone, Debug)]
124pub struct GrammarEntry {
125    /// Human-readable display name (e.g. "TypeScript", "Bourne Again Shell (bash)").
126    pub display_name: String,
127    /// Canonical language ID used in config and LSP (e.g. "typescript", "csharp").
128    pub language_id: String,
129    /// Short alias, if one exists (e.g. "ts" for TypeScript).
130    pub short_name: Option<String>,
131    /// File extensions (without leading dot).
132    pub extensions: Vec<String>,
133    /// Exact filenames that map to this grammar (e.g. "Dockerfile").
134    pub filenames: Vec<String>,
135    /// Filename globs from user config (e.g. "*.conf", "/etc/**/rc.*").
136    pub filename_globs: Vec<String>,
137    /// Where this grammar was loaded from.
138    pub source: GrammarSource,
139    /// Highlighters that can serve this entry.
140    pub engines: GrammarEngines,
141}
142
143/// Embedded TOML grammar (syntect doesn't include one)
144pub const TOML_GRAMMAR: &str = include_str!("../../grammars/toml.sublime-syntax");
145
146/// Embedded Odin grammar (syntect doesn't include one)
147/// From: https://github.com/Tetralux/sublime-odin (MIT License)
148pub const ODIN_GRAMMAR: &str = include_str!("../../grammars/odin/Odin.sublime-syntax");
149
150/// Embedded Zig grammar (syntect doesn't include one)
151pub const ZIG_GRAMMAR: &str = include_str!("../../grammars/zig.sublime-syntax");
152
153/// Embedded GDScript grammar
154/// Based on https://github.com/beefsack/GDScript-sublime (MIT License)
155pub const GDSCRIPT_GRAMMAR: &str = include_str!("../../grammars/gdscript.sublime-syntax");
156
157/// Embedded Git Rebase Todo grammar for interactive rebase
158pub const GIT_REBASE_GRAMMAR: &str = include_str!("../../grammars/git-rebase.sublime-syntax");
159
160/// Embedded Git Commit Message grammar for COMMIT_EDITMSG, MERGE_MSG, etc.
161pub const GIT_COMMIT_GRAMMAR: &str = include_str!("../../grammars/git-commit.sublime-syntax");
162
163/// Embedded Gitignore grammar for .gitignore and similar files
164pub const GITIGNORE_GRAMMAR: &str = include_str!("../../grammars/gitignore.sublime-syntax");
165
166/// Embedded Git Config grammar for .gitconfig, .gitmodules
167pub const GITCONFIG_GRAMMAR: &str = include_str!("../../grammars/gitconfig.sublime-syntax");
168
169/// Embedded Git Attributes grammar for .gitattributes
170pub const GITATTRIBUTES_GRAMMAR: &str = include_str!("../../grammars/gitattributes.sublime-syntax");
171
172/// Embedded Typst grammar (syntect doesn't include one)
173pub const TYPST_GRAMMAR: &str = include_str!("../../grammars/typst.sublime-syntax");
174
175/// Embedded Dockerfile grammar
176pub const DOCKERFILE_GRAMMAR: &str = include_str!("../../grammars/dockerfile.sublime-syntax");
177/// Embedded INI grammar (also handles .env, .cfg, .editorconfig, etc.)
178pub const INI_GRAMMAR: &str = include_str!("../../grammars/ini.sublime-syntax");
179/// Embedded CMake grammar
180pub const CMAKE_GRAMMAR: &str = include_str!("../../grammars/cmake.sublime-syntax");
181/// Embedded SCSS grammar
182pub const SCSS_GRAMMAR: &str = include_str!("../../grammars/scss.sublime-syntax");
183/// Embedded LESS grammar
184pub const LESS_GRAMMAR: &str = include_str!("../../grammars/less.sublime-syntax");
185/// Embedded PowerShell grammar
186pub const POWERSHELL_GRAMMAR: &str = include_str!("../../grammars/powershell.sublime-syntax");
187/// Embedded Kotlin grammar
188pub const KOTLIN_GRAMMAR: &str = include_str!("../../grammars/kotlin.sublime-syntax");
189/// Embedded Swift grammar
190pub const SWIFT_GRAMMAR: &str = include_str!("../../grammars/swift.sublime-syntax");
191/// Embedded Dart grammar
192pub const DART_GRAMMAR: &str = include_str!("../../grammars/dart.sublime-syntax");
193/// Embedded Elixir grammar
194pub const ELIXIR_GRAMMAR: &str = include_str!("../../grammars/elixir.sublime-syntax");
195/// Embedded F# grammar
196pub const FSHARP_GRAMMAR: &str = include_str!("../../grammars/fsharp.sublime-syntax");
197/// Embedded Nix grammar
198pub const NIX_GRAMMAR: &str = include_str!("../../grammars/nix.sublime-syntax");
199/// Embedded HCL/Terraform grammar
200pub const HCL_GRAMMAR: &str = include_str!("../../grammars/hcl.sublime-syntax");
201/// Embedded Protocol Buffers grammar
202pub const PROTOBUF_GRAMMAR: &str = include_str!("../../grammars/protobuf.sublime-syntax");
203/// Embedded GraphQL grammar
204pub const GRAPHQL_GRAMMAR: &str = include_str!("../../grammars/graphql.sublime-syntax");
205/// Embedded Julia grammar
206pub const JULIA_GRAMMAR: &str = include_str!("../../grammars/julia.sublime-syntax");
207/// Embedded Nim grammar
208pub const NIM_GRAMMAR: &str = include_str!("../../grammars/nim.sublime-syntax");
209/// Embedded Gleam grammar
210pub const GLEAM_GRAMMAR: &str = include_str!("../../grammars/gleam.sublime-syntax");
211/// Embedded V language grammar
212pub const VLANG_GRAMMAR: &str = include_str!("../../grammars/vlang.sublime-syntax");
213/// Embedded Solidity grammar
214pub const SOLIDITY_GRAMMAR: &str = include_str!("../../grammars/solidity.sublime-syntax");
215/// Embedded KDL grammar
216pub const KDL_GRAMMAR: &str = include_str!("../../grammars/kdl.sublime-syntax");
217/// Embedded Nushell grammar
218pub const NUSHELL_GRAMMAR: &str = include_str!("../../grammars/nushell.sublime-syntax");
219/// Embedded Smali grammar
220pub const SMALI_GRAMMAR: &str = include_str!("../../grammars/smali.sublime-syntax");
221/// Embedded Fish shell grammar
222pub const FISH_GRAMMAR: &str = include_str!("../../grammars/fish.sublime-syntax");
223/// Embedded Starlark/Bazel grammar
224pub const STARLARK_GRAMMAR: &str = include_str!("../../grammars/starlark.sublime-syntax");
225/// Embedded Justfile grammar
226pub const JUSTFILE_GRAMMAR: &str = include_str!("../../grammars/justfile.sublime-syntax");
227/// Embedded Earthfile grammar
228pub const EARTHFILE_GRAMMAR: &str = include_str!("../../grammars/earthfile.sublime-syntax");
229/// Embedded Go Module grammar
230pub const GOMOD_GRAMMAR: &str = include_str!("../../grammars/gomod.sublime-syntax");
231/// Embedded Vue grammar
232pub const VUE_GRAMMAR: &str = include_str!("../../grammars/vue.sublime-syntax");
233/// Embedded Svelte grammar
234pub const SVELTE_GRAMMAR: &str = include_str!("../../grammars/svelte.sublime-syntax");
235/// Embedded Astro grammar
236pub const ASTRO_GRAMMAR: &str = include_str!("../../grammars/astro.sublime-syntax");
237/// Embedded Hyprlang grammar (Hyprland config)
238pub const HYPRLANG_GRAMMAR: &str = include_str!("../../grammars/hyprlang.sublime-syntax");
239/// Embedded AutoHotkey grammar
240/// From: https://github.com/SALZKARTOFFEEEL/ahk-sublime-syntax (MIT License)
241pub const AUTOHOTKEY_GRAMMAR: &str =
242    include_str!("../../grammars/autohotkey/AutoHotkey.sublime-syntax");
243/// Embedded Racket grammar (syntect doesn't include one)
244pub const RACKET_GRAMMAR: &str = include_str!("../../grammars/racket.sublime-syntax");
245/// Embedded Verilog grammar (HDL)
246pub const VERILOG_GRAMMAR: &str = include_str!("../../grammars/verilog.sublime-syntax");
247/// Embedded SystemVerilog grammar (HDL)
248pub const SYSTEMVERILOG_GRAMMAR: &str = include_str!("../../grammars/systemverilog.sublime-syntax");
249/// Embedded VHDL grammar (HDL)
250pub const VHDL_GRAMMAR: &str = include_str!("../../grammars/vhdl.sublime-syntax");
251
252pub const C3_GRAMMAR: &str = include_str!("../../grammars/c3.sublime-syntax");
253
254/// Embedded Assembly grammar (GAS/AT&T and Intel/NASM dialects; syntect
255/// doesn't include one)
256pub const ASM_GRAMMAR: &str = include_str!("../../grammars/asm.sublime-syntax");
257
258/// Registry of all available TextMate grammars.
259///
260/// This struct holds the compiled syntax set and provides lookup methods.
261/// It does not perform I/O directly - use `GrammarLoader` for loading grammars.
262impl std::fmt::Debug for GrammarRegistry {
263    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
264        f.debug_struct("GrammarRegistry")
265            .field("syntax_count", &self.syntax_set.syntaxes().len())
266            .finish()
267    }
268}
269
270pub struct GrammarRegistry {
271    /// Combined syntax set (built-in + embedded + user grammars)
272    syntax_set: Arc<SyntaxSet>,
273    /// Extension -> scope name mapping for user grammars (takes priority)
274    user_extensions: HashMap<String, String>,
275    /// Filename -> scope name mapping for dotfiles and special files
276    filename_scopes: HashMap<String, String>,
277    /// Paths to dynamically loaded grammar files (for reloading when adding more)
278    loaded_grammar_paths: Vec<GrammarSpec>,
279    /// Provenance info for each grammar (keyed by grammar name)
280    grammar_sources: HashMap<String, GrammarInfo>,
281    /// Short name aliases: lowercase short_name -> full syntect grammar name.
282    /// Provides a deterministic, one-to-one mapping so users can write
283    /// `grammar = "bash"` instead of `grammar = "Bourne Again Shell (bash)"`.
284    aliases: HashMap<String, String>,
285    /// Unified catalog of every known grammar. Rebuilt whenever the syntax set
286    /// or alias table changes. Lookups (`find_by_name`, `find_by_path`, ...)
287    /// all resolve against this.
288    catalog: Vec<GrammarEntry>,
289    /// Index from lowercased lookup keys (display name, language_id, short_name)
290    /// to catalog index.
291    catalog_by_name: HashMap<String, usize>,
292    /// Index from file extension (without dot) to catalog index.
293    catalog_by_extension: HashMap<String, usize>,
294    /// Index from filename to catalog index.
295    catalog_by_filename: HashMap<String, usize>,
296    /// The most recent language config handed to `apply_language_config`.
297    /// Retained so `rebuild_catalog` can replay it — otherwise a rebuild
298    /// (triggered by e.g. `populate_built_in_aliases`) silently wipes user
299    /// `[languages]` config that was merged on top.
300    applied_language_config: HashMap<String, crate::config::LanguageConfig>,
301    /// Monotonic generation, bumped on every catalog mutation. Lets
302    /// observers (plugin state snapshot) detect changes with one integer
303    /// compare instead of recounting entries.
304    catalog_gen: u64,
305}
306
307impl GrammarRegistry {
308    /// Create a new GrammarRegistry from pre-built components.
309    ///
310    /// This is typically called by `GrammarLoader` implementations after
311    /// loading grammars from various sources.
312    pub(crate) fn new(
313        syntax_set: SyntaxSet,
314        user_extensions: HashMap<String, String>,
315        filename_scopes: HashMap<String, String>,
316    ) -> Self {
317        Self::new_with_loaded_paths(
318            syntax_set,
319            user_extensions,
320            filename_scopes,
321            Vec::new(),
322            HashMap::new(),
323        )
324    }
325
326    /// Create a GrammarRegistry with pre-loaded grammar path tracking.
327    ///
328    /// Used by the loader when plugin grammars were included in the initial build,
329    /// so that `loaded_grammar_paths()` reflects what was actually loaded.
330    pub(crate) fn new_with_loaded_paths(
331        syntax_set: SyntaxSet,
332        user_extensions: HashMap<String, String>,
333        filename_scopes: HashMap<String, String>,
334        loaded_grammar_paths: Vec<GrammarSpec>,
335        grammar_sources: HashMap<String, GrammarInfo>,
336    ) -> Self {
337        let mut reg = Self {
338            syntax_set: Arc::new(syntax_set),
339            user_extensions,
340            filename_scopes,
341            loaded_grammar_paths,
342            grammar_sources,
343            aliases: HashMap::new(),
344            catalog: Vec::new(),
345            catalog_by_name: HashMap::new(),
346            catalog_by_extension: HashMap::new(),
347            catalog_by_filename: HashMap::new(),
348            applied_language_config: HashMap::new(),
349            catalog_gen: 0,
350        };
351        reg.rebuild_catalog();
352        reg
353    }
354
355    /// Create an empty grammar registry (fast, for tests that don't need syntax highlighting)
356    pub fn empty() -> Arc<Self> {
357        let mut builder = SyntaxSetBuilder::new();
358        builder.add_plain_text_syntax();
359        let mut reg = Self {
360            syntax_set: Arc::new(builder.build()),
361            user_extensions: HashMap::new(),
362            filename_scopes: HashMap::new(),
363            loaded_grammar_paths: Vec::new(),
364            grammar_sources: HashMap::new(),
365            aliases: HashMap::new(),
366            catalog: Vec::new(),
367            catalog_by_name: HashMap::new(),
368            catalog_by_extension: HashMap::new(),
369            catalog_by_filename: HashMap::new(),
370            applied_language_config: HashMap::new(),
371            catalog_gen: 0,
372        };
373        reg.rebuild_catalog();
374        Arc::new(reg)
375    }
376
377    /// Create a registry with only syntect's pre-compiled defaults (~0ms).
378    ///
379    /// This provides instant syntax highlighting for ~50 common languages
380    /// (Rust, Python, JS/TS, C/C++, Go, Java, HTML, CSS, Markdown, etc.)
381    /// without any `SyntaxSetBuilder::build()` call. Use this at startup,
382    /// then swap in a full registry built on a background thread.
383    pub fn defaults_only() -> Arc<Self> {
384        // Load pre-compiled syntax set (defaults + embedded grammars) from
385        // build-time packdump. This avoids the expensive into_builder() + build()
386        // cycle at runtime (~12s → ~300ms).
387        tracing::info!("defaults_only: loading pre-compiled syntax packdump...");
388        let syntax_set: SyntaxSet = syntect::dumps::from_uncompressed_data(include_bytes!(
389            concat!(env!("OUT_DIR"), "/default_syntaxes.packdump")
390        ))
391        .expect("Failed to load pre-compiled syntax packdump");
392        tracing::info!(
393            "defaults_only: loaded ({} syntaxes)",
394            syntax_set.syntaxes().len()
395        );
396        let grammar_sources = Self::build_grammar_sources_from_syntax_set(&syntax_set);
397        let filename_scopes = Self::build_filename_scopes();
398        let extra_extensions = Self::build_extra_extensions();
399        let mut registry = Self {
400            syntax_set: Arc::new(syntax_set),
401            user_extensions: extra_extensions,
402            filename_scopes,
403            loaded_grammar_paths: Vec::new(),
404            grammar_sources,
405            aliases: HashMap::new(),
406            catalog: Vec::new(),
407            catalog_by_name: HashMap::new(),
408            catalog_by_extension: HashMap::new(),
409            catalog_by_filename: HashMap::new(),
410            applied_language_config: HashMap::new(),
411            catalog_gen: 0,
412        };
413        registry.populate_built_in_aliases();
414        registry.rebuild_catalog();
415        Arc::new(registry)
416    }
417
418    /// Build extra extension -> scope mappings for extensions not covered by syntect defaults.
419    ///
420    /// These map common file extensions to existing syntect grammar scopes,
421    /// filling gaps where syntect's built-in extension lists are incomplete.
422    pub(crate) fn build_extra_extensions() -> HashMap<String, String> {
423        let mut map = HashMap::new();
424
425        // JavaScript variants not in syntect defaults (["js", "htc"])
426        let js_scope = "source.js".to_string();
427        map.insert("cjs".to_string(), js_scope.clone());
428        map.insert("mjs".to_string(), js_scope);
429
430        // Dockerfile variants (e.g. Dockerfile.dev -> .dev extension)
431        // These won't match by extension, handled by filename_scopes and first_line_match
432
433        map
434    }
435
436    /// Build the default filename -> scope mappings for dotfiles and special files.
437    pub(crate) fn build_filename_scopes() -> HashMap<String, String> {
438        let mut map = HashMap::new();
439
440        // Shell configuration files -> Bash/Shell script scope
441        let shell_scope = "source.shell.bash".to_string();
442        for filename in [
443            ".zshrc",
444            ".zprofile",
445            ".zshenv",
446            ".zlogin",
447            ".zlogout",
448            ".bash_aliases",
449            // .bashrc and .bash_profile are already recognized by syntect
450            // Common shell script files without extensions
451            "PKGBUILD",
452            "APKBUILD",
453        ] {
454            map.insert(filename.to_string(), shell_scope.clone());
455        }
456
457        // Git rebase todo files
458        let git_rebase_scope = "source.git-rebase-todo".to_string();
459        map.insert("git-rebase-todo".to_string(), git_rebase_scope);
460
461        // Git commit message files
462        let git_commit_scope = "source.git-commit".to_string();
463        for filename in ["COMMIT_EDITMSG", "MERGE_MSG", "SQUASH_MSG", "TAG_EDITMSG"] {
464            map.insert(filename.to_string(), git_commit_scope.clone());
465        }
466
467        // Gitignore and similar files
468        let gitignore_scope = "source.gitignore".to_string();
469        for filename in [".gitignore", ".dockerignore", ".npmignore", ".hgignore"] {
470            map.insert(filename.to_string(), gitignore_scope.clone());
471        }
472
473        // Git config files
474        let gitconfig_scope = "source.gitconfig".to_string();
475        for filename in [".gitconfig", ".gitmodules"] {
476            map.insert(filename.to_string(), gitconfig_scope.clone());
477        }
478
479        // Git attributes files
480        let gitattributes_scope = "source.gitattributes".to_string();
481        map.insert(".gitattributes".to_string(), gitattributes_scope);
482
483        // Jenkinsfile -> Groovy
484        let groovy_scope = "source.groovy".to_string();
485        map.insert("Jenkinsfile".to_string(), groovy_scope);
486
487        // Vagrantfile -> Ruby (syntect already handles this, but be explicit)
488        // Brewfile -> Ruby
489        let ruby_scope = "source.ruby".to_string();
490        map.insert("Brewfile".to_string(), ruby_scope);
491
492        // Dockerfile and variants (exact names; Dockerfile.* handled via prefix check)
493        let dockerfile_scope = "source.dockerfile".to_string();
494        map.insert("Dockerfile".to_string(), dockerfile_scope.clone());
495        map.insert("Containerfile".to_string(), dockerfile_scope.clone());
496        // Common Dockerfile variants
497        map.insert("Dockerfile.dev".to_string(), dockerfile_scope.clone());
498        map.insert("Dockerfile.prod".to_string(), dockerfile_scope.clone());
499        map.insert("Dockerfile.test".to_string(), dockerfile_scope.clone());
500        map.insert("Dockerfile.build".to_string(), dockerfile_scope.clone());
501
502        // CMake
503        let cmake_scope = "source.cmake".to_string();
504        map.insert("CMakeLists.txt".to_string(), cmake_scope);
505
506        // Starlark/Bazel
507        let starlark_scope = "source.starlark".to_string();
508        map.insert("BUILD".to_string(), starlark_scope.clone());
509        map.insert("BUILD.bazel".to_string(), starlark_scope.clone());
510        map.insert("WORKSPACE".to_string(), starlark_scope.clone());
511        map.insert("WORKSPACE.bazel".to_string(), starlark_scope.clone());
512        map.insert("Tiltfile".to_string(), starlark_scope);
513
514        // Justfile (various casings)
515        let justfile_scope = "source.justfile".to_string();
516        map.insert("justfile".to_string(), justfile_scope.clone());
517        map.insert("Justfile".to_string(), justfile_scope.clone());
518        map.insert(".justfile".to_string(), justfile_scope);
519
520        // EditorConfig -> INI
521        let ini_scope = "source.ini".to_string();
522        map.insert(".editorconfig".to_string(), ini_scope);
523
524        // Earthfile
525        let earthfile_scope = "source.earthfile".to_string();
526        map.insert("Earthfile".to_string(), earthfile_scope);
527
528        // Hyprlang (Hyprland config files)
529        let hyprlang_scope = "source.hyprlang".to_string();
530        map.insert("hyprland.conf".to_string(), hyprlang_scope.clone());
531        map.insert("hyprpaper.conf".to_string(), hyprlang_scope.clone());
532        map.insert("hyprlock.conf".to_string(), hyprlang_scope);
533
534        // go.mod / go.sum
535        let gomod_scope = "source.gomod".to_string();
536        map.insert("go.mod".to_string(), gomod_scope.clone());
537        map.insert("go.sum".to_string(), gomod_scope);
538
539        // YAML(-ish) files without a .yaml/.yml extension (#2326). These are
540        // formats whose content is unambiguously YAML:
541        //   - yarn v1 lockfiles use a YAML-compatible format.
542        //   - .clang-format / _clang-format and .clang-tidy are YAML (LLVM).
543        //   - .yamllint is yamllint's own YAML config.
544        //   - Podfile.lock (CocoaPods) and pubspec.lock (Dart pub) are YAML.
545        let yaml_scope = "source.yaml".to_string();
546        for filename in [
547            "yarn.lock",
548            ".clang-format",
549            "_clang-format",
550            ".clang-tidy",
551            ".yamllint",
552            "Podfile.lock",
553            "pubspec.lock",
554        ] {
555            map.insert(filename.to_string(), yaml_scope.clone());
556        }
557
558        // Lock files whose content is TOML. Cargo.lock is also matched by
559        // syntect's first-line regex, but mapping it explicitly is robust
560        // regardless of the file's first line.
561        let toml_scope = "source.toml".to_string();
562        for filename in ["Cargo.lock", "poetry.lock", "uv.lock"] {
563            map.insert(filename.to_string(), toml_scope.clone());
564        }
565
566        // Lock files whose content is JSON. The JSON grammar has no first-line
567        // regex for a leading `{`, so these need an explicit filename mapping.
568        let json_scope = "source.json".to_string();
569        for filename in ["composer.lock", "Pipfile.lock", "flake.lock", "deno.lock"] {
570            map.insert(filename.to_string(), json_scope.clone());
571        }
572
573        map
574    }
575
576    /// Add embedded grammars (TOML, Odin, etc.) to a syntax set builder.
577    pub(crate) fn add_embedded_grammars(builder: &mut SyntaxSetBuilder) {
578        // TOML grammar
579        match SyntaxDefinition::load_from_str(TOML_GRAMMAR, true, Some("TOML")) {
580            Ok(syntax) => {
581                builder.add(syntax);
582                tracing::debug!("Loaded embedded TOML grammar");
583            }
584            Err(e) => {
585                tracing::warn!("Failed to load embedded TOML grammar: {}", e);
586            }
587        }
588
589        // Odin grammar
590        match SyntaxDefinition::load_from_str(ODIN_GRAMMAR, true, Some("Odin")) {
591            Ok(syntax) => {
592                builder.add(syntax);
593                tracing::debug!("Loaded embedded Odin grammar");
594            }
595            Err(e) => {
596                tracing::warn!("Failed to load embedded Odin grammar: {}", e);
597            }
598        }
599
600        // Zig grammar
601        match SyntaxDefinition::load_from_str(ZIG_GRAMMAR, true, Some("Zig")) {
602            Ok(syntax) => {
603                builder.add(syntax);
604                tracing::debug!("Loaded embedded Zig grammar");
605            }
606            Err(e) => {
607                tracing::warn!("Failed to load embedded Zig grammar: {}", e);
608            }
609        }
610
611        // GDScript grammar
612        match SyntaxDefinition::load_from_str(GDSCRIPT_GRAMMAR, true, Some("GDScript")) {
613            Ok(syntax) => {
614                builder.add(syntax);
615                tracing::debug!("Loaded embedded GDScript grammar");
616            }
617            Err(e) => {
618                tracing::warn!("Failed to load embedded GDScript grammar: {}", e);
619            }
620        }
621
622        // Git Rebase Todo grammar
623        match SyntaxDefinition::load_from_str(GIT_REBASE_GRAMMAR, true, Some("Git Rebase Todo")) {
624            Ok(syntax) => {
625                builder.add(syntax);
626                tracing::debug!("Loaded embedded Git Rebase Todo grammar");
627            }
628            Err(e) => {
629                tracing::warn!("Failed to load embedded Git Rebase Todo grammar: {}", e);
630            }
631        }
632
633        // Git Commit Message grammar
634        match SyntaxDefinition::load_from_str(GIT_COMMIT_GRAMMAR, true, Some("Git Commit Message"))
635        {
636            Ok(syntax) => {
637                builder.add(syntax);
638                tracing::debug!("Loaded embedded Git Commit Message grammar");
639            }
640            Err(e) => {
641                tracing::warn!("Failed to load embedded Git Commit Message grammar: {}", e);
642            }
643        }
644
645        // Gitignore grammar
646        match SyntaxDefinition::load_from_str(GITIGNORE_GRAMMAR, true, Some("Gitignore")) {
647            Ok(syntax) => {
648                builder.add(syntax);
649                tracing::debug!("Loaded embedded Gitignore grammar");
650            }
651            Err(e) => {
652                tracing::warn!("Failed to load embedded Gitignore grammar: {}", e);
653            }
654        }
655
656        // Git Config grammar
657        match SyntaxDefinition::load_from_str(GITCONFIG_GRAMMAR, true, Some("Git Config")) {
658            Ok(syntax) => {
659                builder.add(syntax);
660                tracing::debug!("Loaded embedded Git Config grammar");
661            }
662            Err(e) => {
663                tracing::warn!("Failed to load embedded Git Config grammar: {}", e);
664            }
665        }
666
667        // Git Attributes grammar
668        match SyntaxDefinition::load_from_str(GITATTRIBUTES_GRAMMAR, true, Some("Git Attributes")) {
669            Ok(syntax) => {
670                builder.add(syntax);
671                tracing::debug!("Loaded embedded Git Attributes grammar");
672            }
673            Err(e) => {
674                tracing::warn!("Failed to load embedded Git Attributes grammar: {}", e);
675            }
676        }
677
678        // Typst grammar
679        match SyntaxDefinition::load_from_str(TYPST_GRAMMAR, true, Some("Typst")) {
680            Ok(syntax) => {
681                builder.add(syntax);
682                tracing::debug!("Loaded embedded Typst grammar");
683            }
684            Err(e) => {
685                tracing::warn!("Failed to load embedded Typst grammar: {}", e);
686            }
687        }
688
689        // Additional embedded grammars for languages not in syntect defaults
690        let additional_grammars: &[(&str, &str)] = &[
691            (DOCKERFILE_GRAMMAR, "Dockerfile"),
692            (INI_GRAMMAR, "INI"),
693            (CMAKE_GRAMMAR, "CMake"),
694            (SCSS_GRAMMAR, "SCSS"),
695            (LESS_GRAMMAR, "LESS"),
696            (POWERSHELL_GRAMMAR, "PowerShell"),
697            (KOTLIN_GRAMMAR, "Kotlin"),
698            (SWIFT_GRAMMAR, "Swift"),
699            (DART_GRAMMAR, "Dart"),
700            (ELIXIR_GRAMMAR, "Elixir"),
701            (FSHARP_GRAMMAR, "FSharp"),
702            (NIX_GRAMMAR, "Nix"),
703            (HCL_GRAMMAR, "HCL"),
704            (PROTOBUF_GRAMMAR, "Protocol Buffers"),
705            (GRAPHQL_GRAMMAR, "GraphQL"),
706            (JULIA_GRAMMAR, "Julia"),
707            (NIM_GRAMMAR, "Nim"),
708            (GLEAM_GRAMMAR, "Gleam"),
709            (VLANG_GRAMMAR, "V"),
710            (SOLIDITY_GRAMMAR, "Solidity"),
711            (KDL_GRAMMAR, "KDL"),
712            (NUSHELL_GRAMMAR, "Nushell"),
713            (SMALI_GRAMMAR, "Smali"),
714            (FISH_GRAMMAR, "Fish"),
715            (STARLARK_GRAMMAR, "Starlark"),
716            (JUSTFILE_GRAMMAR, "Justfile"),
717            (EARTHFILE_GRAMMAR, "Earthfile"),
718            (GOMOD_GRAMMAR, "Go Module"),
719            (VUE_GRAMMAR, "Vue"),
720            (SVELTE_GRAMMAR, "Svelte"),
721            (ASTRO_GRAMMAR, "Astro"),
722            (HYPRLANG_GRAMMAR, "Hyprlang"),
723            (AUTOHOTKEY_GRAMMAR, "AutoHotkey"),
724            (RACKET_GRAMMAR, "Racket"),
725            (VERILOG_GRAMMAR, "Verilog"),
726            (SYSTEMVERILOG_GRAMMAR, "SystemVerilog"),
727            (VHDL_GRAMMAR, "VHDL"),
728            (C3_GRAMMAR, "C3"),
729            (ASM_GRAMMAR, "Assembly"),
730        ];
731
732        for (grammar_str, name) in additional_grammars {
733            match SyntaxDefinition::load_from_str(grammar_str, true, Some(name)) {
734                Ok(syntax) => {
735                    builder.add(syntax);
736                    tracing::debug!("Loaded embedded {} grammar", name);
737                }
738                Err(e) => {
739                    tracing::warn!("Failed to load embedded {} grammar: {}", name, e);
740                }
741            }
742        }
743    }
744
745    /// Find syntax for a file by path/extension/filename.
746    ///
747    /// Purely metadata-based — does not read the file. For first-line
748    /// (shebang) fallback, use [`find_by_path`] with a `first_line` argument
749    /// and resolve the returned entry's syntect index.
750    pub fn find_syntax_for_file(&self, path: &Path) -> Option<&SyntaxReference> {
751        let entry = self.find_by_path(path, None)?;
752        entry
753            .engines
754            .syntect
755            .map(|i| &self.syntax_set.syntaxes()[i])
756    }
757
758    /// Find syntax by name, with alias resolution.
759    ///
760    /// Thin wrapper around `find_by_name` that returns the associated syntect
761    /// `SyntaxReference`. Tree-sitter-only entries return `None`.
762    ///
763    /// Falls back to a direct syntect lookup for "Plain Text", which the
764    /// catalog deliberately omits but syntect still exposes.
765    pub fn find_syntax_by_name(&self, name: &str) -> Option<&SyntaxReference> {
766        if let Some(entry) = self.find_by_name(name) {
767            if let Some(idx) = entry.engines.syntect {
768                return Some(&self.syntax_set.syntaxes()[idx]);
769            }
770        }
771        // Plain Text is excluded from the catalog (it's not a "grammar" a user
772        // would ever pick), but syntect still stores it and a handful of
773        // callers still ask for it by name.
774        self.syntax_set.find_syntax_by_name(name)
775    }
776
777    // === Alias management ===
778
/// The fixed table of short-name aliases for built-in and embedded grammars.
///
/// Every pair is `(alias, grammar name)`: the alias is lowercase, the
/// grammar name is spelled the way the grammar itself is named. The table
/// is returned in declaration order; callers that break ties between
/// equally short aliases rely on that order, so keep it stable.
fn built_in_aliases() -> Vec<(&'static str, &'static str)> {
    // Targets that several aliases share.
    const BASH: &str = "Bourne Again Shell (bash)";
    const PY_REGEX: &str = "Regular Expressions (Python)";
    const PROTOBUF: &str = "Protocol Buffers";

    const ALIASES: [(&str, &str); 30] = [
        // Grammars bundled with syntect whose names are verbose.
        ("bash", BASH),
        ("shell", BASH),
        ("sh", BASH),
        ("c++", "C++"),
        ("cpp", "C++"),
        ("csharp", "C#"),
        ("objc", "Objective-C"),
        ("objcpp", "Objective-C++"),
        ("regex", PY_REGEX),
        ("regexp", PY_REGEX),
        // Embedded grammars with multi-word or non-obvious names.
        ("proto", PROTOBUF),
        ("protobuf", PROTOBUF),
        ("gomod", "Go Module"),
        ("git-rebase", "Git Rebase Todo"),
        ("git-commit", "Git Commit Message"),
        ("git-config", "Git Config"),
        ("git-attributes", "Git Attributes"),
        ("gitignore", "Gitignore"),
        ("fsharp", "FSharp"),
        ("f#", "FSharp"),
        ("terraform", "HCL"),
        ("tf", "HCL"),
        ("ts", "TypeScript"),
        ("js", "JavaScript"),
        ("py", "Python"),
        ("rb", "Ruby"),
        ("rs", "Rust"),
        ("md", "Markdown"),
        ("yml", "YAML"),
        ("dockerfile", "Dockerfile"),
    ];

    ALIASES.to_vec()
}
821
822    /// Populate aliases from the built-in table.
823    ///
824    /// Validates that:
825    /// - Each alias target (full name) exists in the syntax set
826    /// - No alias collides (case-insensitive) with an existing grammar full name
827    /// - No duplicate aliases exist
828    pub(crate) fn populate_built_in_aliases(&mut self) {
829        for (short, full) in Self::built_in_aliases() {
830            self.register_alias_inner(short, full, true);
831        }
832        self.rebuild_catalog();
833    }
834
835    /// Register a short-name alias for a grammar.
836    ///
837    /// Returns `true` if the alias was registered, `false` if rejected due to
838    /// collision or missing target. For built-in aliases, collisions panic
839    /// (they indicate a bug). For dynamic aliases, collisions log a warning.
840    ///
841    /// Splices the alias directly into the catalog rather than rebuilding, so
842    /// any user config previously merged via `apply_language_config` is
843    /// preserved. A full rebuild would wipe those entries.
844    pub(crate) fn register_alias(&mut self, short_name: &str, full_name: &str) -> bool {
845        if !self.register_alias_inner(short_name, full_name, false) {
846            return false;
847        }
848        let short_lower = short_name.to_lowercase();
849        let full_lower = full_name.to_lowercase();
850        if let Some(&idx) = self.catalog_by_name.get(&full_lower) {
851            self.catalog_by_name
852                .entry(short_lower.clone())
853                .or_insert(idx);
854            let entry = &mut self.catalog[idx];
855            let replace = match &entry.short_name {
856                None => true,
857                Some(existing) => short_name.len() < existing.len(),
858            };
859            if replace {
860                entry.short_name = Some(short_lower);
861            }
862        }
863        true
864    }
865
866    fn register_alias_inner(
867        &mut self,
868        short_name: &str,
869        full_name: &str,
870        is_built_in: bool,
871    ) -> bool {
872        let short_lower = short_name.to_lowercase();
873
874        // Validate: target grammar must exist in the syntax set
875        let target_exists = self
876            .syntax_set
877            .syntaxes()
878            .iter()
879            .any(|s| s.name.eq_ignore_ascii_case(full_name));
880        if !target_exists {
881            // Tree-sitter-only targets (e.g. TypeScript) are expected to be
882            // absent from the syntect set. `rebuild_catalog` attaches their
883            // short names via a separate pass over `built_in_aliases()`.
884            if tree_sitter_for_syntect_name(full_name).is_some() {
885                return false;
886            }
887            if is_built_in {
888                // Built-in alias targets should always exist; warn but don't panic
889                // (grammar might have been removed from syntect upstream)
890                tracing::warn!(
891                    "[grammar-alias] Built-in alias '{}' -> '{}': target grammar not found, skipping",
892                    short_name, full_name
893                );
894            } else {
895                tracing::warn!(
896                    "[grammar-alias] Alias '{}' -> '{}': target grammar not found, skipping",
897                    short_name,
898                    full_name
899                );
900            }
901            return false;
902        }
903
904        // Validate: short name must not collide (case-insensitive) with any grammar full name
905        let collides_with_full_name = self
906            .syntax_set
907            .syntaxes()
908            .iter()
909            .any(|s| s.name.eq_ignore_ascii_case(&short_lower));
910        if collides_with_full_name {
911            // This is actually fine — the short name matches a full name directly,
912            // so find_syntax_by_name's case-insensitive search will find it.
913            // No alias needed.
914            tracing::debug!(
915                "[grammar-alias] Alias '{}' matches an existing grammar name, skipping (not needed)",
916                short_name
917            );
918            return false;
919        }
920
921        // Validate: no duplicate alias (case-insensitive)
922        if let Some(existing_target) = self.aliases.get(&short_lower) {
923            if existing_target.eq_ignore_ascii_case(full_name) {
924                // Same mapping, no-op
925                return true;
926            }
927            let msg = format!(
928                "Alias '{}' already maps to '{}', cannot remap to '{}'",
929                short_name, existing_target, full_name
930            );
931            if is_built_in {
932                panic!("[grammar-alias] Built-in alias collision: {}", msg);
933            } else {
934                tracing::warn!("[grammar-alias] {}", msg);
935                return false;
936            }
937        }
938
939        // Resolve the exact syntect name (preserving original case)
940        let exact_name = self
941            .syntax_set
942            .syntaxes()
943            .iter()
944            .find(|s| s.name.eq_ignore_ascii_case(full_name))
945            .map(|s| s.name.clone())
946            .unwrap();
947
948        self.aliases.insert(short_lower, exact_name);
949        true
950    }
951
952    // === Unified catalog ===
953
954    /// Rebuild the flat catalog of grammar entries.
955    ///
956    /// Called after the syntax set, aliases, or filename scopes change.
957    /// Produces one entry per logical language by merging:
958    /// 1. Every `SyntaxReference` in the syntax set (except "Plain Text")
959    /// 2. Every `fresh_languages::Language` not already covered by a syntect entry
960    /// 3. Alias short-names attached to their target entry
961    /// 4. Filename mappings from `filename_scopes` attached to their scope's entry
962    /// 5. Extra extensions from `user_extensions` attached to their scope's entry
963    ///
964    /// Automatically replays the last `apply_language_config` at the end, so
965    /// user `[languages]` config survives any rebuild.
966    pub(crate) fn rebuild_catalog(&mut self) {
967        // Reverse-map: full_name (lowercase) -> shortest alias.
968        //
969        // Seed from the built-in alias table as well as the live `aliases`
970        // HashMap: the live map only contains aliases whose target exists in
971        // the syntect set, so tree-sitter-only entries (TypeScript) would
972        // otherwise never get their short name ("ts").
973        let mut short_by_full: HashMap<String, String> = HashMap::new();
974        let record = |map: &mut HashMap<String, String>, short: &str, full: &str| {
975            let key = full.to_lowercase();
976            let keep = match map.get(&key) {
977                None => true,
978                Some(existing) => short.len() < existing.len(),
979            };
980            if keep {
981                map.insert(key, short.to_string());
982            }
983        };
984        for (short, full) in Self::built_in_aliases() {
985            record(&mut short_by_full, short, full);
986        }
987        for (short, full) in &self.aliases {
988            record(&mut short_by_full, short, full);
989        }
990
991        let derive_language_id =
992            |display_name: &str| -> (String, Option<fresh_languages::Language>) {
993                let ts = tree_sitter_for_syntect_name(display_name);
994                let id = ts
995                    .map(|l| l.id().to_string())
996                    .unwrap_or_else(|| display_name.to_lowercase());
997                (id, ts)
998            };
999
1000        let mut catalog: Vec<GrammarEntry> = Vec::new();
1001        let mut scope_to_index: HashMap<String, usize> = HashMap::new();
1002
1003        // Syntect-backed entries (skip Plain Text and JavaScript).
1004        //
1005        // Syntect's `file_extensions` is a hybrid list: real extensions like
1006        // "rb" sit alongside bare filenames like "Gemfile", "Rakefile",
1007        // "Makefile". Syntect's own `find_syntax_for_file` tries each entry
1008        // against the whole filename AND against the path's extension, and
1009        // the catalog has to preserve that semantics. We keep everything in
1010        // `extensions` here and index each entry as *both* an extension and
1011        // a filename at the bottom of this method.
1012        //
1013        // JavaScript is skipped here so the catalog falls through to the
1014        // tree-sitter-only fallback below — the bundled syntect JS grammar
1015        // mishandles class fields whose initialiser is an arrow function
1016        // returning a template literal (issue #899: state leaks past the
1017        // closing backtick and paints the rest of the file as a string).
1018        // tree-sitter-javascript parses template literals from the AST and
1019        // does not have this failure mode. `find_syntax_by_name("JavaScript")`
1020        // still returns syntect's grammar via the catalog's fallback path,
1021        // so markdown popup rendering and other code-string highlighters
1022        // are unaffected.
1023        for (idx, syntax) in self.syntax_set.syntaxes().iter().enumerate() {
1024            if syntax.name == "Plain Text" || syntax.name == "JavaScript" {
1025                continue;
1026            }
1027            let (language_id, tree_sitter) = derive_language_id(&syntax.name);
1028            let short_name = short_by_full.get(&syntax.name.to_lowercase()).cloned();
1029            let source = self
1030                .grammar_sources
1031                .get(&syntax.name)
1032                .map(|info| info.source.clone())
1033                .unwrap_or(GrammarSource::BuiltIn);
1034            let entry_index = catalog.len();
1035            scope_to_index.insert(syntax.scope.to_string(), entry_index);
1036
1037            // Union syntect's file_extensions with tree-sitter's own
1038            // extension list when the entry carries both engines.
1039            // tree-sitter-javascript handles `.jsx`/`.mjs`/`.cjs` that
1040            // syntect's JS grammar doesn't list, and the old code used to
1041            // route those paths to tree-sitter via a separate lookup.
1042            let mut extensions = syntax.file_extensions.clone();
1043            if let Some(lang) = tree_sitter {
1044                for ext in lang.extensions() {
1045                    let ext = ext.to_string();
1046                    if !extensions.iter().any(|e| e == &ext) {
1047                        extensions.push(ext);
1048                    }
1049                }
1050            }
1051
1052            // Only Fresh's dedicated Fish grammar may own `.fish`. Syntect's
1053            // stock Bash grammar also advertises the extension (a quirk of its
1054            // upstream definition), which would otherwise shadow Fish in the
1055            // first-wins extension index since Bash precedes it in the packdump.
1056            if syntax.name != "Fish" {
1057                extensions.retain(|e| e != "fish");
1058            }
1059
1060            catalog.push(GrammarEntry {
1061                display_name: syntax.name.clone(),
1062                language_id,
1063                short_name,
1064                extensions,
1065                filenames: Vec::new(),
1066                filename_globs: Vec::new(),
1067                source,
1068                engines: GrammarEngines {
1069                    syntect: Some(idx),
1070                    tree_sitter,
1071                },
1072            });
1073        }
1074
1075        // Attach filename_scopes to their entries.
1076        for (filename, scope) in &self.filename_scopes {
1077            if let Some(&idx) = scope_to_index.get(scope) {
1078                if !catalog[idx].filenames.iter().any(|f| f == filename) {
1079                    catalog[idx].filenames.push(filename.clone());
1080                }
1081            }
1082        }
1083
1084        // Attach user_extensions (extra → scope) to their entries.
1085        for (ext, scope) in &self.user_extensions {
1086            if let Some(&idx) = scope_to_index.get(scope) {
1087                if !catalog[idx].extensions.iter().any(|e| e == ext) {
1088                    catalog[idx].extensions.push(ext.clone());
1089                }
1090            }
1091        }
1092
1093        // Ensure every tree-sitter language has an entry. If a syntect entry
1094        // already maps to the same tree-sitter language, skip it; otherwise
1095        // add a tree-sitter-only entry so the catalog is complete (TypeScript
1096        // being the motivating example — syntect ships no grammar for it).
1097        let mut ts_covered: std::collections::HashSet<fresh_languages::Language> =
1098            std::collections::HashSet::new();
1099        for entry in &catalog {
1100            if let Some(lang) = entry.engines.tree_sitter {
1101                ts_covered.insert(lang);
1102            }
1103        }
1104        for lang in fresh_languages::Language::all() {
1105            if ts_covered.contains(lang) {
1106                continue;
1107            }
1108            let display_name = lang.display_name().to_string();
1109            let language_id = lang.id().to_string();
1110            let short_name = short_by_full.get(&display_name.to_lowercase()).cloned();
1111            let extensions: Vec<String> = lang.extensions().iter().map(|s| s.to_string()).collect();
1112            catalog.push(GrammarEntry {
1113                display_name,
1114                language_id,
1115                short_name,
1116                extensions,
1117                filenames: Vec::new(),
1118                filename_globs: Vec::new(),
1119                source: GrammarSource::BuiltIn,
1120                engines: GrammarEngines {
1121                    syntect: None,
1122                    tree_sitter: Some(*lang),
1123                },
1124            });
1125        }
1126
1127        // Build name / extension / filename indices.
1128        //
1129        // Every entry in `extensions` gets indexed in BOTH `by_extension`
1130        // (lowercased) AND `by_filename` (exact case) — syntect's
1131        // `file_extensions` list holds both real extensions ("rb") and bare
1132        // filenames ("Gemfile", "Rakefile", "Makefile"). Indexing both ways
1133        // matches syntect's own `find_syntax_for_file` semantics.
1134        let mut by_name: HashMap<String, usize> = HashMap::new();
1135        let mut by_extension: HashMap<String, usize> = HashMap::new();
1136        let mut by_filename: HashMap<String, usize> = HashMap::new();
1137        for (idx, entry) in catalog.iter().enumerate() {
1138            by_name.insert(entry.display_name.to_lowercase(), idx);
1139            by_name.insert(entry.language_id.to_lowercase(), idx);
1140            if let Some(short) = &entry.short_name {
1141                by_name.insert(short.to_lowercase(), idx);
1142            }
1143            for ext in &entry.extensions {
1144                by_extension.entry(ext.to_lowercase()).or_insert(idx);
1145                by_filename.entry(ext.clone()).or_insert(idx);
1146            }
1147            for filename in &entry.filenames {
1148                by_filename.entry(filename.clone()).or_insert(idx);
1149            }
1150        }
1151
1152        self.catalog = catalog;
1153        self.catalog_by_name = by_name;
1154        self.catalog_by_extension = by_extension;
1155        self.catalog_by_filename = by_filename;
1156
1157        // Replay the most recent user config so a rebuild doesn't silently
1158        // wipe out user `[languages]` rules. `take` + restore avoids both a
1159        // clone and a borrow checker fight with `apply_language_config_inner`.
1160        if !self.applied_language_config.is_empty() {
1161            let cfg = std::mem::take(&mut self.applied_language_config);
1162            self.apply_language_config_inner(&cfg);
1163            self.applied_language_config = cfg;
1164        }
1165        self.catalog_gen = self.catalog_gen.wrapping_add(1);
1166    }
1167
1168    /// Return the full catalog of grammar entries.
1169    pub fn catalog(&self) -> &[GrammarEntry] {
1170        &self.catalog
1171    }
1172
1173    /// Monotonic generation, bumped on every catalog mutation. Compare against
1174    /// a previously-observed value to decide whether to recompute derived
1175    /// state.
1176    pub fn catalog_gen(&self) -> u64 {
1177        self.catalog_gen
1178    }
1179
1180    /// Look up a grammar entry by display name, language ID, or short alias
1181    /// (case-insensitive). All aliases — built-in and user-config-declared —
1182    /// are indexed directly in `catalog_by_name` during `rebuild_catalog` /
1183    /// `register_alias` / `apply_language_config`, so a single lookup covers
1184    /// every case.
1185    pub fn find_by_name(&self, name: &str) -> Option<&GrammarEntry> {
1186        self.catalog_by_name
1187            .get(&name.to_lowercase())
1188            .map(|&idx| &self.catalog[idx])
1189    }
1190
1191    /// Look up a grammar entry by file path, with optional first-line content
1192    /// for shebang / `first_line_match` detection.
1193    ///
1194    /// Resolution order:
1195    /// 1. Exact filename (config-declared filenames and filename_scopes live here)
1196    /// 2. Glob patterns from user config (e.g. "*.conf", "/etc/**/rc.*")
1197    /// 3. File extension
1198    /// 4. Shebang / first-line regex match on `first_line` if supplied
1199    ///
1200    /// Globs take priority over extension so a user rule like `*.conf → bash`
1201    /// wins over any built-in extension match on `.conf`. The first-line
1202    /// fallback (#4) is last so catalog matches stay authoritative — syntect
1203    /// might otherwise misclassify `.fish` as bash via its first-line
1204    /// regexes.
1205    ///
1206    /// The first-line fallback is pure: it runs syntect's
1207    /// `find_syntax_by_first_line` regex cache against the caller-supplied
1208    /// string. The registry never touches the filesystem — the caller (who
1209    /// already loaded the buffer via the `FileSystem` trait) must extract
1210    /// the first line and pass it in.
1211    pub fn find_by_path(&self, path: &Path, first_line: Option<&str>) -> Option<&GrammarEntry> {
1212        let filename = path.file_name().and_then(|n| n.to_str());
1213        let path_str = path.to_str().unwrap_or("");
1214
1215        if let Some(name) = filename {
1216            if let Some(&idx) = self.catalog_by_filename.get(name) {
1217                return Some(&self.catalog[idx]);
1218            }
1219        }
1220
1221        // Glob walk — filenames with globs are rare so linear scan is fine.
1222        if let Some(name) = filename {
1223            for entry in &self.catalog {
1224                for pattern in &entry.filename_globs {
1225                    let matched = if is_path_pattern(pattern) {
1226                        path_glob_matches(pattern, path_str)
1227                    } else {
1228                        filename_glob_matches(pattern, name)
1229                    };
1230                    if matched {
1231                        return Some(entry);
1232                    }
1233                }
1234            }
1235        }
1236
1237        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
1238            if let Some(entry) = self.find_by_extension(ext) {
1239                return Some(entry);
1240            }
1241        }
1242
1243        // Last resort: shebang / first-line regex match against the
1244        // caller-supplied content. Map the matched syntect grammar back to a
1245        // catalog entry by name — every syntect syntax has a catalog entry,
1246        // so this round-trip preserves tree-sitter attachment.
1247        let line = first_line?;
1248        if let Some(syntax) = self.syntax_set.find_syntax_by_first_line(line) {
1249            if let Some(entry) = self.find_by_name(&syntax.name) {
1250                return Some(entry);
1251            }
1252        }
1253
1254        // Final fallback: map the shebang interpreter directly to a known
1255        // grammar, covering interpreters whose grammars ship no syntect
1256        // `first_line_match` regex (fish, Lua, PowerShell, …) — see `shebang`.
1257        let lang = super::shebang::language_for_shebang(line)?;
1258        self.find_by_name(lang)
1259    }
1260
1261    /// Look up a grammar entry by file extension (case-insensitive, without dot).
1262    pub fn find_by_extension(&self, ext: &str) -> Option<&GrammarEntry> {
1263        self.catalog_by_extension
1264            .get(&ext.to_lowercase())
1265            .map(|&idx| &self.catalog[idx])
1266    }
1267
1268    /// Merge user `[languages]` config into the catalog.
1269    ///
1270    /// For each config entry, resolves its grammar to an existing catalog entry
1271    /// (by grammar name or by language id). Extensions are added and override
1272    /// the ext→entry index so config wins over built-in mappings. Filenames are
1273    /// split into exact matches (indexed) and globs (walked at lookup time).
1274    ///
1275    /// If no existing entry matches, a new engine-less entry is created so the
1276    /// language still appears in the palette.
1277    ///
1278    /// Idempotent. The config is cached on the registry so `rebuild_catalog`
1279    /// can replay it — callers don't need to re-apply after a rebuild.
1280    pub fn apply_language_config(
1281        &mut self,
1282        languages: &HashMap<String, crate::config::LanguageConfig>,
1283    ) {
1284        self.applied_language_config = languages.clone();
1285        self.apply_language_config_inner(languages);
1286        self.catalog_gen = self.catalog_gen.wrapping_add(1);
1287    }
1288
1289    /// Do the actual catalog splicing without touching
1290    /// `applied_language_config`. Called from `apply_language_config` (which
1291    /// records the input) and from `rebuild_catalog` (which replays the
1292    /// cached input after wiping the catalog).
1293    fn apply_language_config_inner(
1294        &mut self,
1295        languages: &HashMap<String, crate::config::LanguageConfig>,
1296    ) {
1297        for (lang_id, lang_cfg) in languages {
1298            let grammar_name = if lang_cfg.grammar.is_empty() {
1299                lang_id.as_str()
1300            } else {
1301                lang_cfg.grammar.as_str()
1302            };
1303
1304            // Resolve to an existing entry; fall back to creating one.
1305            let idx = self
1306                .catalog_by_name
1307                .get(&grammar_name.to_lowercase())
1308                .copied()
1309                .or_else(|| self.catalog_by_name.get(&lang_id.to_lowercase()).copied())
1310                .unwrap_or_else(|| {
1311                    let idx = self.catalog.len();
1312                    self.catalog.push(GrammarEntry {
1313                        display_name: lang_id.clone(),
1314                        language_id: lang_id.clone(),
1315                        short_name: None,
1316                        extensions: Vec::new(),
1317                        filenames: Vec::new(),
1318                        filename_globs: Vec::new(),
1319                        source: GrammarSource::BuiltIn,
1320                        engines: GrammarEngines::default(),
1321                    });
1322                    idx
1323                });
1324
1325            // Always index the config key so `find_by_name("mylang")` resolves
1326            // even when `mylang` aliases an existing grammar (e.g.
1327            // `[languages.mylang] grammar = "Rust"`). `or_insert` preserves
1328            // any existing mapping — won't clobber the canonical entry.
1329            self.catalog_by_name
1330                .entry(lang_id.to_lowercase())
1331                .or_insert(idx);
1332
1333            for ext in &lang_cfg.extensions {
1334                if !self.catalog[idx].extensions.iter().any(|e| e == ext) {
1335                    self.catalog[idx].extensions.push(ext.clone());
1336                }
1337                // Config-declared extensions override any previous mapping.
1338                self.catalog_by_extension.insert(ext.to_lowercase(), idx);
1339            }
1340            for filename in &lang_cfg.filenames {
1341                if is_glob_pattern(filename) {
1342                    if !self.catalog[idx]
1343                        .filename_globs
1344                        .iter()
1345                        .any(|f| f == filename)
1346                    {
1347                        self.catalog[idx].filename_globs.push(filename.clone());
1348                    }
1349                } else {
1350                    if !self.catalog[idx].filenames.iter().any(|f| f == filename) {
1351                        self.catalog[idx].filenames.push(filename.clone());
1352                    }
1353                    self.catalog_by_filename.insert(filename.clone(), idx);
1354                }
1355            }
1356        }
1357    }
1358
1359    /// Get the underlying syntax set
1360    pub fn syntax_set(&self) -> &Arc<SyntaxSet> {
1361        &self.syntax_set
1362    }
1363
1364    /// Get a clone of the Arc for sharing
1365    pub fn syntax_set_arc(&self) -> Arc<SyntaxSet> {
1366        Arc::clone(&self.syntax_set)
1367    }
1368
1369    /// List all available syntax names
1370    pub fn available_syntaxes(&self) -> Vec<&str> {
1371        self.syntax_set
1372            .syntaxes()
1373            .iter()
1374            .map(|s| s.name.as_str())
1375            .collect()
1376    }
1377
1378    /// List all available grammars with provenance information.
1379    ///
1380    /// Returns a sorted list of `GrammarInfo` entries derived from the unified
1381    /// catalog — this includes both syntect grammars and tree-sitter-only
1382    /// languages (like TypeScript). Each entry is listed exactly once even
1383    /// when both engines can serve it.
1384    pub fn available_grammar_info(&self) -> Vec<GrammarInfo> {
1385        let mut result: Vec<GrammarInfo> = self
1386            .catalog
1387            .iter()
1388            .map(|entry| GrammarInfo {
1389                name: entry.display_name.clone(),
1390                source: entry.source.clone(),
1391                file_extensions: entry.extensions.clone(),
1392                short_name: entry.short_name.clone(),
1393            })
1394            .collect();
1395        result.sort_by(|a, b| a.name.to_lowercase().cmp(&b.name.to_lowercase()));
1396        result
1397    }
1398
    /// Borrow the map of per-grammar provenance records.
    ///
    /// Where this file populates the map
    /// (`build_grammar_sources_from_syntax_set`, `with_additional_grammars`)
    /// the key is the syntax name and the value is the matching `GrammarInfo`.
    pub(crate) fn grammar_sources(&self) -> &HashMap<String, GrammarInfo> {
        &self.grammar_sources
    }
1403
1404    /// Build grammar source info from a pre-compiled syntax set.
1405    ///
1406    /// All grammars in the packdump (syntect defaults + embedded) are tagged as built-in.
1407    pub(crate) fn build_grammar_sources_from_syntax_set(
1408        syntax_set: &SyntaxSet,
1409    ) -> HashMap<String, GrammarInfo> {
1410        let mut sources = HashMap::new();
1411        for syntax in syntax_set.syntaxes() {
1412            sources.insert(
1413                syntax.name.clone(),
1414                GrammarInfo {
1415                    name: syntax.name.clone(),
1416                    source: GrammarSource::BuiltIn,
1417                    file_extensions: syntax.file_extensions.clone(),
1418                    short_name: None,
1419                },
1420            );
1421        }
1422        sources
1423    }
1424
    /// Borrow the user extension mapping (extension -> scope name).
    ///
    /// Test-only accessor: it is compiled solely under `cfg(test)`.
    #[cfg(test)]
    pub(crate) fn user_extensions(&self) -> &HashMap<String, String> {
        &self.user_extensions
    }
1430
    /// Borrow the specs of the grammars recorded as loaded (used for
    /// deduplication in flush_pending_grammars).
    ///
    /// `with_additional_grammars` appends one `GrammarSpec` per grammar file it
    /// loads successfully. Test-only accessor (compiled under `cfg(test)`).
    #[cfg(test)]
    pub(crate) fn loaded_grammar_paths(&self) -> &[GrammarSpec] {
        &self.loaded_grammar_paths
    }
1436
1437    /// Create a new registry with additional grammar files
1438    ///
1439    /// This builds a new GrammarRegistry that includes all grammars from
1440    /// the base registry plus the additional grammars specified.
1441    /// Uses the base registry's syntax_set as the builder base, preserving
1442    /// all existing grammars (user grammars, language packs, etc.).
1443    ///
1444    /// # Arguments
1445    /// * `base` - The base registry to extend
1446    /// * `additional` - List of (language, path, extensions) tuples for new grammars
1447    ///
1448    /// # Returns
1449    /// A new GrammarRegistry with the additional grammars, or None if rebuilding fails
1450    pub fn with_additional_grammars(
1451        base: &GrammarRegistry,
1452        additional: &[GrammarSpec],
1453    ) -> Option<Self> {
1454        tracing::info!(
1455            "[SYNTAX DEBUG] with_additional_grammars: adding {} grammars to base with {} syntaxes",
1456            additional.len(),
1457            base.syntax_set.syntaxes().len()
1458        );
1459
1460        // Use the base registry's syntax_set as builder base — this preserves
1461        // ALL existing grammars (defaults, embedded, user, language packs)
1462        // without needing to reload them from disk.
1463        let mut builder = (*base.syntax_set).clone().into_builder();
1464
1465        // Preserve existing user extensions and add new ones
1466        let mut user_extensions = base.user_extensions.clone();
1467
1468        // Track loaded grammar paths (existing + new)
1469        let mut loaded_grammar_paths = base.loaded_grammar_paths.clone();
1470
1471        // Preserve existing grammar sources
1472        let mut grammar_sources = base.grammar_sources.clone();
1473
1474        // Add each new grammar
1475        for spec in additional {
1476            tracing::info!(
1477                "[SYNTAX DEBUG] loading new grammar file: lang='{}', path={:?}, extensions={:?}",
1478                spec.language,
1479                spec.path,
1480                spec.extensions
1481            );
1482            match Self::load_grammar_file(&spec.path) {
1483                Ok(syntax) => {
1484                    let scope = syntax.scope.to_string();
1485                    let syntax_name = syntax.name.clone();
1486                    tracing::info!(
1487                        "[SYNTAX DEBUG] grammar loaded successfully: name='{}', scope='{}'",
1488                        syntax_name,
1489                        scope
1490                    );
1491                    builder.add(syntax);
1492                    tracing::info!(
1493                        "Loaded grammar for '{}' from {:?} with extensions {:?}",
1494                        spec.language,
1495                        spec.path,
1496                        spec.extensions
1497                    );
1498                    // Register extensions for this grammar
1499                    for ext in &spec.extensions {
1500                        user_extensions.insert(ext.clone(), scope.clone());
1501                    }
1502                    // Track provenance
1503                    grammar_sources.insert(
1504                        syntax_name.clone(),
1505                        GrammarInfo {
1506                            name: syntax_name,
1507                            source: GrammarSource::Plugin {
1508                                plugin: spec.language.clone(),
1509                                path: spec.path.clone(),
1510                            },
1511                            file_extensions: spec.extensions.clone(),
1512                            short_name: None,
1513                        },
1514                    );
1515                    // Track this grammar path for future reloads
1516                    loaded_grammar_paths.push(spec.clone());
1517                }
1518                Err(e) => {
1519                    tracing::warn!(
1520                        "Failed to load grammar for '{}' from {:?}: {}",
1521                        spec.language,
1522                        spec.path,
1523                        e
1524                    );
1525                }
1526            }
1527        }
1528
1529        let mut reg = Self {
1530            syntax_set: Arc::new(builder.build()),
1531            user_extensions,
1532            filename_scopes: base.filename_scopes.clone(),
1533            loaded_grammar_paths,
1534            grammar_sources,
1535            aliases: base.aliases.clone(),
1536            catalog: Vec::new(),
1537            catalog_by_name: HashMap::new(),
1538            catalog_by_extension: HashMap::new(),
1539            catalog_by_filename: HashMap::new(),
1540            applied_language_config: HashMap::new(),
1541            catalog_gen: 0,
1542        };
1543        reg.rebuild_catalog();
1544        Some(reg)
1545    }
1546
1547    /// Load a grammar file from disk
1548    ///
1549    /// Only Sublime Text (.sublime-syntax) format is supported.
1550    /// TextMate (.tmLanguage) grammars use a completely different format
1551    /// and cannot be loaded by syntect's yaml-load feature.
1552    pub(crate) fn load_grammar_file(path: &Path) -> Result<SyntaxDefinition, String> {
1553        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
1554
1555        match ext {
1556            "sublime-syntax" => {
1557                let content = std::fs::read_to_string(path)
1558                    .map_err(|e| format!("Failed to read file: {}", e))?;
1559                SyntaxDefinition::load_from_str(
1560                    &content,
1561                    true,
1562                    path.file_stem().and_then(|s| s.to_str()),
1563                )
1564                .map_err(|e| format!("Failed to parse sublime-syntax: {}", e))
1565            }
1566            _ => Err(format!(
1567                "Unsupported grammar format: .{}. Only .sublime-syntax is supported.",
1568                ext
1569            )),
1570        }
1571    }
1572}
1573
1574impl Default for GrammarRegistry {
1575    fn default() -> Self {
1576        // Create with defaults and embedded grammars only (no user grammars)
1577        let defaults = SyntaxSet::load_defaults_newlines();
1578        let mut builder = defaults.into_builder();
1579        Self::add_embedded_grammars(&mut builder);
1580        let syntax_set = builder.build();
1581        let filename_scopes = Self::build_filename_scopes();
1582        let extra_extensions = Self::build_extra_extensions();
1583
1584        let mut registry = Self::new(syntax_set, extra_extensions, filename_scopes);
1585        registry.populate_built_in_aliases();
1586        registry.rebuild_catalog();
1587        registry
1588    }
1589}
1590
1591// VSCode package.json structures for parsing grammar manifests
1592
/// Subset of a VSCode `package.json` manifest that is deserialized here.
///
/// Only the `contributes` section is modelled.
#[derive(Debug, Deserialize)]
pub struct PackageManifest {
    /// The `contributes` section; `None` when the manifest omits it.
    #[serde(default)]
    pub contributes: Option<Contributes>,
}
1598
/// The `contributes` section of a VSCode package manifest.
///
/// Both lists fall back to empty when the corresponding key is absent.
#[derive(Debug, Deserialize, Default)]
pub struct Contributes {
    /// Language declarations (`contributes.languages`).
    #[serde(default)]
    pub languages: Vec<LanguageContribution>,
    /// Grammar declarations (`contributes.grammars`).
    #[serde(default)]
    pub grammars: Vec<GrammarContribution>,
}
1606
/// One entry of `contributes.languages` in a VSCode package manifest.
#[derive(Debug, Deserialize)]
pub struct LanguageContribution {
    /// Language identifier (required key).
    pub id: String,
    /// Extensions declared for the language; empty when the key is absent.
    // NOTE(review): presumably these carry a leading dot as in VSCode
    // manifests — confirm against the code that consumes them.
    #[serde(default)]
    pub extensions: Vec<String>,
}
1613
/// One entry of `contributes.grammars` in a VSCode package manifest.
#[derive(Debug, Deserialize)]
pub struct GrammarContribution {
    /// Language the grammar applies to.
    // NOTE(review): presumably matches a `LanguageContribution::id` — verify.
    pub language: String,
    /// Grammar scope name, read from the manifest's `scopeName` key.
    #[serde(rename = "scopeName")]
    pub scope_name: String,
    /// Path to the grammar file as written in the manifest.
    // NOTE(review): likely relative to the package root — confirm with the loader.
    pub path: String,
}
1621
1622#[cfg(test)]
1623mod tests {
1624    use super::*;
1625
    #[test]
    fn test_empty_registry() {
        // An "empty" registry must still expose at least one syntax.
        let registry = GrammarRegistry::empty();
        // Should have at least plain text
        assert!(!registry.available_syntaxes().is_empty());
    }
1632
    #[test]
    fn test_default_registry() {
        // The default registry is built from syntect defaults plus embedded
        // grammars, so its syntax list can never be empty.
        let registry = GrammarRegistry::default();
        // Should have built-in syntaxes
        assert!(!registry.available_syntaxes().is_empty());
    }
1639
1640    #[test]
1641    fn test_find_syntax_for_common_extensions() {
1642        let registry = GrammarRegistry::default();
1643
1644        // Test common extensions that resolve to a syntect (TextMate) grammar
1645        // via the catalog. JavaScript is intentionally NOT here — it is routed
1646        // exclusively to tree-sitter (issue #899) and so has no catalog-level
1647        // syntect entry. Code-block highlighting in popups still finds the
1648        // syntect JS grammar through `SyntaxSet::find_syntax_by_token`, which
1649        // bypasses the catalog.
1650        let test_cases = [
1651            ("test.py", true),
1652            ("test.rs", true),
1653            ("test.js", false),
1654            ("test.json", true),
1655            ("test.md", true),
1656            ("test.html", true),
1657            ("test.css", true),
1658            ("test.gd", true),
1659            ("test.unknown_extension_xyz", false),
1660        ];
1661
1662        for (filename, should_exist) in test_cases {
1663            let path = Path::new(filename);
1664            let result = registry.find_syntax_for_file(path);
1665            assert_eq!(
1666                result.is_some(),
1667                should_exist,
1668                "Expected {:?} for {}",
1669                should_exist,
1670                filename
1671            );
1672        }
1673    }
1674
1675    #[test]
1676    fn test_racket_grammar_loaded() {
1677        let registry = GrammarRegistry::default();
1678        for filename in ["main.rkt", "data.rktd", "info.rktl", "doc.scrbl"] {
1679            let result = registry.find_syntax_for_file(Path::new(filename));
1680            assert!(
1681                result.is_some(),
1682                "Racket grammar should be available for {}",
1683                filename
1684            );
1685            let entry = registry.find_by_path(Path::new(filename), None).unwrap();
1686            assert_eq!(entry.display_name, "Racket", "for {}", filename);
1687        }
1688    }
1689
1690    #[test]
1691    fn test_syntax_set_arc() {
1692        let registry = GrammarRegistry::default();
1693        let arc1 = registry.syntax_set_arc();
1694        let arc2 = registry.syntax_set_arc();
1695        // Both should point to the same data
1696        assert!(Arc::ptr_eq(&arc1, &arc2));
1697    }
1698
1699    #[test]
1700    fn test_shell_dotfiles_detection() {
1701        let registry = GrammarRegistry::default();
1702
1703        // All these should be detected as shell scripts
1704        let shell_files = [".zshrc", ".zprofile", ".zshenv", ".bash_aliases"];
1705
1706        for filename in shell_files {
1707            let path = Path::new(filename);
1708            let result = registry.find_syntax_for_file(path);
1709            assert!(
1710                result.is_some(),
1711                "{} should be detected as a syntax",
1712                filename
1713            );
1714            let syntax = result.unwrap();
1715            // Should be detected as Bash/Shell
1716            assert!(
1717                syntax.name.to_lowercase().contains("bash")
1718                    || syntax.name.to_lowercase().contains("shell"),
1719                "{} should be detected as shell/bash, got: {}",
1720                filename,
1721                syntax.name
1722            );
1723        }
1724    }
1725
1726    #[test]
1727    fn test_shebang_interpreter_targets_resolve() {
1728        // Every language id the shebang table can return must exist in the
1729        // built-in catalog, otherwise the fallback would silently no-op.
1730        let registry = GrammarRegistry::default();
1731        for name in [
1732            "/bin/sh",
1733            "/usr/bin/fish",
1734            "/usr/bin/env python3",
1735            "/usr/bin/env ruby",
1736            "/usr/bin/perl",
1737            "/usr/bin/env php",
1738            "/usr/bin/env node",
1739            "/usr/bin/env deno",
1740            "/usr/bin/lua",
1741            "/usr/bin/pwsh",
1742            "/usr/bin/tclsh",
1743            "/usr/bin/env groovy",
1744            "/usr/bin/env elixir",
1745            "/usr/bin/env Rscript",
1746            "/usr/bin/env julia",
1747            "/usr/bin/nu",
1748            "/usr/bin/dart",
1749        ] {
1750            let line = format!("#!{name}\n");
1751            let lang = super::super::shebang::language_for_shebang(&line)
1752                .unwrap_or_else(|| panic!("expected an interpreter mapping for {line:?}"));
1753            assert!(
1754                registry.find_by_name(lang).is_some(),
1755                "interpreter mapping {line:?} → {lang:?} must resolve to a catalog grammar",
1756            );
1757        }
1758    }
1759
1760    #[test]
1761    fn test_find_by_path_detects_shebang_only_interpreters() {
1762        // These interpreters have no syntect first-line regex; before the
1763        // interpreter-table fallback an extensionless script fell through to
1764        // plain text. Now `find_by_path` resolves them from the shebang alone.
1765        let registry = GrammarRegistry::default();
1766        let cases = [
1767            ("#!/usr/bin/fish\n", "fish"),
1768            ("#!/usr/bin/lua\n", "lua"),
1769            ("#!/usr/bin/pwsh\n", "powershell"),
1770            ("#!/usr/bin/tclsh\n", "tcl"),
1771            ("#!/usr/bin/env groovy\n", "groovy"),
1772            ("#!/usr/bin/env elixir\n", "elixir"),
1773            ("#!/usr/bin/env Rscript\n", "r"),
1774        ];
1775        for (first_line, expected_id) in cases {
1776            let entry = registry
1777                .find_by_path(Path::new("scriptfile"), Some(first_line))
1778                .unwrap_or_else(|| panic!("no grammar for {first_line:?}"));
1779            assert_eq!(
1780                entry.language_id, expected_id,
1781                "shebang {first_line:?} should detect {expected_id:?}, got {:?}",
1782                entry.language_id,
1783            );
1784        }
1785    }
1786
1787    #[test]
1788    fn test_find_by_path_extension_still_wins_over_shebang() {
1789        // The interpreter fallback must not override an explicit extension
1790        // match — a `.py` file stays Python even with a shell shebang.
1791        let registry = GrammarRegistry::default();
1792        let entry = registry
1793            .find_by_path(Path::new("script.py"), Some("#!/bin/sh\n"))
1794            .unwrap();
1795        assert_eq!(entry.language_id, "python");
1796    }
1797
1798    #[test]
1799    fn test_pkgbuild_detection() {
1800        let registry = GrammarRegistry::default();
1801
1802        // PKGBUILD and APKBUILD should be detected as shell scripts
1803        for filename in ["PKGBUILD", "APKBUILD"] {
1804            let path = Path::new(filename);
1805            let result = registry.find_syntax_for_file(path);
1806            assert!(
1807                result.is_some(),
1808                "{} should be detected as a syntax",
1809                filename
1810            );
1811            let syntax = result.unwrap();
1812            // Should be detected as Bash/Shell
1813            assert!(
1814                syntax.name.to_lowercase().contains("bash")
1815                    || syntax.name.to_lowercase().contains("shell"),
1816                "{} should be detected as shell/bash, got: {}",
1817                filename,
1818                syntax.name
1819            );
1820        }
1821    }
1822
1823    #[test]
1824    fn test_find_syntax_with_glob_filenames() {
1825        let mut registry = GrammarRegistry::default();
1826        let mut languages = std::collections::HashMap::new();
1827        languages.insert(
1828            "shell-configs".to_string(),
1829            crate::config::LanguageConfig {
1830                extensions: vec!["sh".to_string()],
1831                filenames: vec!["*.conf".to_string(), "*rc".to_string()],
1832                grammar: "bash".to_string(),
1833                comment_prefix: Some("#".to_string()),
1834                auto_indent: true,
1835                auto_close: None,
1836                auto_surround: None,
1837                textmate_grammar: None,
1838                show_whitespace_tabs: true,
1839                line_wrap: None,
1840                wrap_column: None,
1841                page_view: None,
1842                page_width: None,
1843                use_tabs: None,
1844                tab_size: None,
1845                formatter: None,
1846                format_on_save: false,
1847                on_save: vec![],
1848                word_characters: None,
1849                indent: None,
1850            },
1851        );
1852        registry.apply_language_config(&languages);
1853
1854        assert!(
1855            registry
1856                .find_by_path(Path::new("nftables.conf"), None)
1857                .is_some(),
1858            "*.conf should match nftables.conf"
1859        );
1860        assert!(
1861            registry.find_by_path(Path::new("lfrc"), None).is_some(),
1862            "*rc should match lfrc"
1863        );
1864        // Unrelated file shouldn't panic.
1865        let _ = registry.find_by_path(Path::new("randomfile"), None);
1866    }
1867
1868    #[test]
1869    fn test_find_syntax_with_path_glob_filenames() {
1870        let mut registry = GrammarRegistry::default();
1871        let mut languages = std::collections::HashMap::new();
1872        languages.insert(
1873            "shell-configs".to_string(),
1874            crate::config::LanguageConfig {
1875                extensions: vec!["sh".to_string()],
1876                filenames: vec!["/etc/**/rc.*".to_string()],
1877                grammar: "bash".to_string(),
1878                comment_prefix: Some("#".to_string()),
1879                auto_indent: true,
1880                auto_close: None,
1881                auto_surround: None,
1882                textmate_grammar: None,
1883                show_whitespace_tabs: true,
1884                line_wrap: None,
1885                wrap_column: None,
1886                page_view: None,
1887                page_width: None,
1888                use_tabs: None,
1889                tab_size: None,
1890                formatter: None,
1891                format_on_save: false,
1892                on_save: vec![],
1893                word_characters: None,
1894                indent: None,
1895            },
1896        );
1897        registry.apply_language_config(&languages);
1898
1899        assert!(
1900            registry
1901                .find_by_path(Path::new("/etc/rc.conf"), None)
1902                .is_some(),
1903            "/etc/**/rc.* should match /etc/rc.conf"
1904        );
1905        assert!(
1906            registry
1907                .find_by_path(Path::new("/etc/init/rc.local"), None)
1908                .is_some(),
1909            "/etc/**/rc.* should match /etc/init/rc.local"
1910        );
1911        let _ = registry.find_by_path(Path::new("/var/rc.conf"), None);
1912    }
1913
1914    #[test]
1915    fn test_exact_filename_takes_priority_over_glob() {
1916        let mut registry = GrammarRegistry::default();
1917        let mut languages = std::collections::HashMap::new();
1918
1919        // A language with exact filename "lfrc" -> python grammar
1920        languages.insert(
1921            "custom-lfrc".to_string(),
1922            crate::config::LanguageConfig {
1923                extensions: vec![],
1924                filenames: vec!["lfrc".to_string()],
1925                grammar: "python".to_string(),
1926                comment_prefix: Some("#".to_string()),
1927                auto_indent: true,
1928                auto_close: None,
1929                auto_surround: None,
1930                textmate_grammar: None,
1931                show_whitespace_tabs: true,
1932                line_wrap: None,
1933                wrap_column: None,
1934                page_view: None,
1935                page_width: None,
1936                use_tabs: None,
1937                tab_size: None,
1938                formatter: None,
1939                format_on_save: false,
1940                on_save: vec![],
1941                word_characters: None,
1942                indent: None,
1943            },
1944        );
1945
1946        // A language with glob "*rc" -> bash grammar
1947        languages.insert(
1948            "rc-files".to_string(),
1949            crate::config::LanguageConfig {
1950                extensions: vec![],
1951                filenames: vec!["*rc".to_string()],
1952                grammar: "bash".to_string(),
1953                comment_prefix: Some("#".to_string()),
1954                auto_indent: true,
1955                auto_close: None,
1956                auto_surround: None,
1957                textmate_grammar: None,
1958                show_whitespace_tabs: true,
1959                line_wrap: None,
1960                wrap_column: None,
1961                page_view: None,
1962                page_width: None,
1963                use_tabs: None,
1964                tab_size: None,
1965                formatter: None,
1966                format_on_save: false,
1967                on_save: vec![],
1968                word_characters: None,
1969                indent: None,
1970            },
1971        );
1972
1973        registry.apply_language_config(&languages);
1974
1975        // "lfrc" should match the exact rule (python), not the glob (bash)
1976        let entry = registry.find_by_path(Path::new("lfrc"), None).unwrap();
1977        assert!(
1978            entry.display_name.to_lowercase().contains("python"),
1979            "exact match should win over glob, got: {}",
1980            entry.display_name
1981        );
1982    }
1983
1984    #[test]
1985    fn test_built_in_aliases_resolve() {
1986        let registry = GrammarRegistry::default();
1987
1988        // "bash" should resolve to "Bourne Again Shell (bash)" via alias
1989        let syntax = registry.find_syntax_by_name("bash");
1990        assert!(syntax.is_some(), "alias 'bash' should resolve");
1991        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
1992
1993        // "cpp" should resolve to "C++"
1994        let syntax = registry.find_syntax_by_name("cpp");
1995        assert!(syntax.is_some(), "alias 'cpp' should resolve");
1996        assert_eq!(syntax.unwrap().name, "C++");
1997
1998        // "csharp" should resolve to "C#"
1999        let syntax = registry.find_syntax_by_name("csharp");
2000        assert!(syntax.is_some(), "alias 'csharp' should resolve");
2001        assert_eq!(syntax.unwrap().name, "C#");
2002
2003        // "sh" should also resolve to bash
2004        let syntax = registry.find_syntax_by_name("sh");
2005        assert!(syntax.is_some(), "alias 'sh' should resolve");
2006        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
2007
2008        // "proto" should resolve to "Protocol Buffers"
2009        let syntax = registry.find_syntax_by_name("proto");
2010        assert!(syntax.is_some(), "alias 'proto' should resolve");
2011        assert_eq!(syntax.unwrap().name, "Protocol Buffers");
2012    }
2013
2014    #[test]
2015    fn test_alias_case_insensitive_input() {
2016        let registry = GrammarRegistry::default();
2017
2018        // Aliases should be case-insensitive on input
2019        let syntax = registry.find_syntax_by_name("BASH");
2020        assert!(
2021            syntax.is_some(),
2022            "alias 'BASH' should resolve case-insensitively"
2023        );
2024        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
2025
2026        let syntax = registry.find_syntax_by_name("Cpp");
2027        assert!(
2028            syntax.is_some(),
2029            "alias 'Cpp' should resolve case-insensitively"
2030        );
2031        assert_eq!(syntax.unwrap().name, "C++");
2032    }
2033
2034    #[test]
2035    fn test_full_name_still_works() {
2036        let registry = GrammarRegistry::default();
2037
2038        // Full names should still work (exact match)
2039        let syntax = registry.find_syntax_by_name("Bourne Again Shell (bash)");
2040        assert!(syntax.is_some(), "full name should still resolve");
2041        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
2042
2043        // Case-insensitive full name should still work
2044        let syntax = registry.find_syntax_by_name("bourne again shell (bash)");
2045        assert!(
2046            syntax.is_some(),
2047            "case-insensitive full name should resolve"
2048        );
2049        assert_eq!(syntax.unwrap().name, "Bourne Again Shell (bash)");
2050    }
2051
2052    #[test]
2053    fn test_alias_does_not_shadow_full_names() {
2054        let registry = GrammarRegistry::default();
2055
2056        // "Rust" should resolve directly via case-insensitive match, not via alias
2057        let syntax = registry.find_syntax_by_name("rust");
2058        assert!(syntax.is_some());
2059        assert_eq!(syntax.unwrap().name, "Rust");
2060
2061        // "Go" should resolve directly
2062        let syntax = registry.find_syntax_by_name("go");
2063        assert!(syntax.is_some());
2064        assert_eq!(syntax.unwrap().name, "Go");
2065    }
2066
    #[test]
    fn test_register_alias_rejects_collision() {
        // The three calls below are order-dependent: the first establishes
        // the mapping that the second conflicts with and the third repeats.
        let mut registry = GrammarRegistry::default();

        // Trying to register an alias that maps to two different targets should fail
        assert!(registry.register_alias("myalias", "Rust"));
        assert!(!registry.register_alias("myalias", "Go"));

        // Same mapping is fine (idempotent)
        assert!(registry.register_alias("myalias", "Rust"));
    }
2078
    #[test]
    fn test_register_alias_rejects_nonexistent_target() {
        // An alias may only point at a grammar the registry knows about;
        // registering one for an unknown grammar name must report `false`.
        let mut registry = GrammarRegistry::default();
        assert!(!registry.register_alias("nope", "Nonexistent Grammar"));
    }
2084
    #[test]
    fn test_register_alias_skips_existing_grammar_name() {
        // Registering an alias identical (ignoring case) to a grammar name is
        // refused, yet lookup by that name must keep working.
        let mut registry = GrammarRegistry::default();

        // "rust" case-insensitively matches the grammar "Rust", so no alias needed
        assert!(!registry.register_alias("rust", "Rust"));
        // Should still be resolvable via case-insensitive match
        assert!(registry.find_syntax_by_name("rust").is_some());
    }
2094
2095    #[test]
2096    fn test_available_grammar_info_includes_short_names() {
2097        let registry = GrammarRegistry::default();
2098        let infos = registry.available_grammar_info();
2099
2100        let bash_info = infos.iter().find(|g| g.name == "Bourne Again Shell (bash)");
2101        assert!(bash_info.is_some(), "bash grammar should be in the list");
2102        let bash_info = bash_info.unwrap();
2103        assert!(
2104            bash_info.short_name.is_some(),
2105            "bash grammar should have a short_name"
2106        );
2107        // The shortest alias for bash is "sh"
2108        assert_eq!(bash_info.short_name.as_deref(), Some("sh"));
2109    }
2110
2111    #[test]
2112    fn test_catalog_contains_each_language_once() {
2113        let registry = GrammarRegistry::default();
2114        let catalog = registry.catalog();
2115
2116        // Every catalog entry must have a unique (case-insensitive) display name.
2117        let mut seen = std::collections::HashSet::new();
2118        for entry in catalog {
2119            let key = entry.display_name.to_lowercase();
2120            assert!(
2121                seen.insert(key.clone()),
2122                "duplicate catalog entry for display_name={:?}",
2123                entry.display_name
2124            );
2125        }
2126
2127        // TypeScript is tree-sitter-only (syntect ships no grammar for it) yet
2128        // must still appear in the catalog.
2129        let ts = registry
2130            .find_by_name("TypeScript")
2131            .expect("TypeScript must be in the catalog");
2132        assert!(ts.engines.syntect.is_none());
2133        assert_eq!(
2134            ts.engines.tree_sitter,
2135            Some(fresh_languages::Language::TypeScript)
2136        );
2137        assert_eq!(ts.language_id, "typescript");
2138        assert!(ts.extensions.iter().any(|e| e == "ts"));
2139
2140        // Languages that exist in both syntect and tree-sitter (Rust, Python)
2141        // must appear exactly once and prefer the syntect engine.
2142        for name in ["Rust", "Python"] {
2143            let entry = registry
2144                .find_by_name(name)
2145                .unwrap_or_else(|| panic!("{} must be in the catalog", name));
2146            assert!(
2147                entry.engines.syntect.is_some(),
2148                "{} should have a syntect index",
2149                name
2150            );
2151            assert!(
2152                entry.engines.tree_sitter.is_some(),
2153                "{} should also have a tree-sitter language",
2154                name
2155            );
2156            // Only one entry with this display name (already checked above),
2157            // but also verify language_id lookup lands on the same entry.
2158            let by_id = registry
2159                .find_by_name(&entry.language_id)
2160                .expect("language_id should resolve");
2161            assert_eq!(by_id.display_name, entry.display_name);
2162        }
2163
2164        // JavaScript is deliberately routed to tree-sitter only — the
2165        // bundled syntect JavaScript grammar mishandles certain template
2166        // literals and bleeds string state into the rest of the file
2167        // (issue #899). The catalog must therefore expose a tree-sitter-only
2168        // entry, even though syntect ships a JavaScript grammar.
2169        let js = registry
2170            .find_by_name("JavaScript")
2171            .expect("JavaScript must be in the catalog");
2172        assert!(
2173            js.engines.syntect.is_none(),
2174            "JavaScript must not be routed to the syntect engine (issue #899)"
2175        );
2176        assert_eq!(
2177            js.engines.tree_sitter,
2178            Some(fresh_languages::Language::JavaScript),
2179            "JavaScript must carry the tree-sitter language"
2180        );
2181
2182        let gdscript = registry
2183            .find_by_path(Path::new("player.gd"), None)
2184            .expect("player.gd should resolve to GDScript");
2185        assert_eq!(gdscript.display_name, "GDScript");
2186        assert_eq!(gdscript.language_id, "gdscript");
2187        assert!(
2188            gdscript.engines.syntect.is_some(),
2189            "GDScript should use the embedded Syntect grammar"
2190        );
2191        assert!(
2192            gdscript.engines.tree_sitter.is_none(),
2193            "GDScript must not carry a tree-sitter parser"
2194        );
2195    }
2196
2197    #[test]
2198    fn test_catalog_find_by_path_and_extension() {
2199        let registry = GrammarRegistry::default();
2200        let ts = registry
2201            .find_by_path(Path::new("foo.ts"), None)
2202            .expect("foo.ts should resolve");
2203        assert_eq!(ts.display_name, "TypeScript");
2204        let rs = registry.find_by_extension("rs").expect("rs should resolve");
2205        assert_eq!(rs.display_name, "Rust");
2206    }
2207
2208    #[test]
2209    fn test_smali_embedded_grammar_loads_and_resolves() {
2210        let syntax = SyntaxDefinition::load_from_str(SMALI_GRAMMAR, true, Some("Smali"))
2211            .expect("Smali grammar should parse");
2212        assert!(syntax.file_extensions.iter().any(|ext| ext == "smali"));
2213
2214        let registry = GrammarRegistry::default();
2215        let entry = registry
2216            .find_by_path(Path::new("MainActivity.smali"), None)
2217            .expect("Smali files should resolve");
2218        assert_eq!(entry.display_name, "Smali");
2219        assert!(entry.engines.syntect.is_some());
2220        assert!(entry.engines.tree_sitter.is_none());
2221    }
2222
2223    /// Build a minimal LanguageConfig for tests.
2224    fn lang_cfg(
2225        grammar: &str,
2226        extensions: &[&str],
2227        filenames: &[&str],
2228    ) -> crate::config::LanguageConfig {
2229        crate::config::LanguageConfig {
2230            extensions: extensions.iter().map(|s| s.to_string()).collect(),
2231            filenames: filenames.iter().map(|s| s.to_string()).collect(),
2232            grammar: grammar.to_string(),
2233            comment_prefix: None,
2234            auto_indent: true,
2235            auto_close: None,
2236            auto_surround: None,
2237            textmate_grammar: None,
2238            show_whitespace_tabs: true,
2239            line_wrap: None,
2240            wrap_column: None,
2241            page_view: None,
2242            page_width: None,
2243            use_tabs: None,
2244            tab_size: None,
2245            formatter: None,
2246            format_on_save: false,
2247            on_save: vec![],
2248            word_characters: None,
2249            indent: None,
2250        }
2251    }
2252
2253    /// Bug #1: a user-declared config key that aliases an existing grammar
2254    /// (e.g. `[languages.mylang] grammar = "Rust"`) must resolve via
2255    /// `find_by_name("mylang")` so the language palette can select it.
2256    #[test]
2257    fn test_user_alias_resolves_via_find_by_name() {
2258        let mut registry = GrammarRegistry::default();
2259        let mut languages = std::collections::HashMap::new();
2260        languages.insert("mylang".to_string(), lang_cfg("Rust", &[], &[]));
2261        registry.apply_language_config(&languages);
2262
2263        let entry = registry
2264            .find_by_name("mylang")
2265            .expect("user-declared alias 'mylang' must resolve");
2266        assert_eq!(entry.display_name, "Rust");
2267    }
2268
2269    /// Bug #2: `register_alias` used to rebuild the catalog from scratch,
2270    /// wiping out everything `apply_language_config` had merged. Registering
2271    /// an alias afterwards must not lose user config.
2272    #[test]
2273    fn test_register_alias_preserves_applied_language_config() {
2274        let mut registry = GrammarRegistry::default();
2275        let mut languages = std::collections::HashMap::new();
2276        languages.insert(
2277            "shell-configs".to_string(),
2278            lang_cfg("bash", &["myconf"], &["*.myconf"]),
2279        );
2280        registry.apply_language_config(&languages);
2281
2282        // Sanity: config applied.
2283        assert!(registry.find_by_extension("myconf").is_some());
2284        assert!(
2285            registry
2286                .find_by_path(Path::new("foo.myconf"), None)
2287                .is_some(),
2288            "glob should match before register_alias"
2289        );
2290
2291        // Registering an alias must not erase the config we just applied.
2292        registry.register_alias("mycustom", "Rust");
2293
2294        assert!(
2295            registry.find_by_extension("myconf").is_some(),
2296            "config extension must survive register_alias"
2297        );
2298        assert!(
2299            registry
2300                .find_by_path(Path::new("foo.myconf"), None)
2301                .is_some(),
2302            "glob must survive register_alias"
2303        );
2304    }
2305
2306    /// Bug #4: `from_syntax_name` used to unconditionally overwrite the
2307    /// catalog's canonical display name with whatever the user typed (e.g.
2308    /// "BASH") — that string ended up in the status bar.
2309    #[test]
2310    fn test_from_syntax_name_preserves_canonical_display_name() {
2311        use crate::primitives::detected_language::DetectedLanguage;
2312        let registry = GrammarRegistry::default();
2313        let languages = std::collections::HashMap::new();
2314
2315        let detected = DetectedLanguage::from_syntax_name("BASH", &registry, &languages)
2316            .expect("BASH should resolve via alias");
2317        assert_eq!(
2318            detected.display_name, "Bourne Again Shell (bash)",
2319            "display_name must be canonical, not user-typed"
2320        );
2321    }
2322
2323    /// A config-only language (no matching syntect grammar) must still appear
2324    /// in the catalog so the language palette can offer it — the old
2325    /// `DetectedLanguage::from_config_language` branch was load-bearing.
2326    #[test]
2327    fn test_config_only_language_appears_in_catalog() {
2328        let mut registry = GrammarRegistry::default();
2329        let mut languages = std::collections::HashMap::new();
2330        languages.insert("elvish".to_string(), lang_cfg("elvish", &["elv"], &[]));
2331        registry.apply_language_config(&languages);
2332
2333        let entry = registry
2334            .find_by_name("elvish")
2335            .expect("elvish should be in the catalog after apply_language_config");
2336        assert!(entry.engines.syntect.is_none());
2337        assert!(entry.engines.tree_sitter.is_none());
2338        assert_eq!(entry.language_id, "elvish");
2339        assert!(entry.extensions.iter().any(|e| e == "elv"));
2340    }
2341
2342    #[test]
2343    fn test_fish_extension_resolves_to_fish_grammar_not_bash() {
2344        // Syntect's stock Bash grammar also advertises `.fish`; the catalog
2345        // strips it so only Fresh's dedicated Fish grammar owns the extension.
2346        let registry = GrammarRegistry::default();
2347        let entry = registry
2348            .find_by_extension("fish")
2349            .expect(".fish should resolve to a grammar entry");
2350
2351        assert_eq!(entry.language_id, "fish");
2352        assert_eq!(entry.display_name, "Fish");
2353        assert!(entry.engines.syntect.is_some());
2354    }
2355
2356    /// Config-declared extensions must override the built-in mapping. If the
2357    /// user says `[languages.typescript-overlay] extensions = ["js"] grammar
2358    /// = "TypeScript"`, then `foo.js` must resolve to TypeScript, not
2359    /// JavaScript.
2360    #[test]
2361    fn test_config_extension_overrides_builtin() {
2362        let mut registry = GrammarRegistry::default();
2363        // Sanity: default mapping is JavaScript.
2364        assert_eq!(
2365            registry.find_by_extension("js").unwrap().display_name,
2366            "JavaScript"
2367        );
2368
2369        let mut languages = std::collections::HashMap::new();
2370        languages.insert(
2371            "ts-overlay".to_string(),
2372            lang_cfg("TypeScript", &["js"], &[]),
2373        );
2374        registry.apply_language_config(&languages);
2375
2376        assert_eq!(
2377            registry.find_by_extension("js").unwrap().display_name,
2378            "TypeScript",
2379            "user-config extension must win over built-in"
2380        );
2381    }
2382
2383    /// Bare filenames listed by syntect grammars (e.g. "Gemfile", "Makefile",
2384    /// "Rakefile") must resolve through `find_by_path`. Syntect stores these
2385    /// in each grammar's `file_extensions` field alongside real extensions
2386    /// like "rb"; its own `find_syntax_for_file` treats them as either. The
2387    /// catalog has to do the same or `HighlightEngine::for_file` breaks for
2388    /// every extensionless config file.
2389    #[test]
2390    fn test_bare_filename_resolves_via_find_by_path() {
2391        let registry = GrammarRegistry::default();
2392        for (filename, expected_substr) in [
2393            ("Gemfile", "ruby"),
2394            ("Rakefile", "ruby"),
2395            ("Vagrantfile", "ruby"),
2396            ("Makefile", "makefile"),
2397            ("GNUmakefile", "makefile"),
2398        ] {
2399            let entry = registry
2400                .find_by_path(Path::new(filename), None)
2401                .unwrap_or_else(|| panic!("{} must resolve via catalog", filename));
2402            assert!(
2403                entry.display_name.to_lowercase().contains(expected_substr),
2404                "{} should resolve to {} grammar, got {}",
2405                filename,
2406                expected_substr,
2407                entry.display_name
2408            );
2409        }
2410    }
2411
2412    /// Languages that have both syntect and tree-sitter (e.g. JavaScript) must
2413    /// expose the union of both engines' extensions. Tree-sitter-javascript
2414    /// knows `.jsx`; syntect's JavaScript grammar does not. Both should route
2415    /// through the JavaScript catalog entry.
2416    #[test]
2417    fn test_jsx_resolves_to_javascript() {
2418        let registry = GrammarRegistry::default();
2419        let entry = registry
2420            .find_by_path(Path::new("foo.jsx"), None)
2421            .expect("foo.jsx must resolve");
2422        assert_eq!(entry.display_name, "JavaScript");
2423    }
2424
2425    /// `rebuild_catalog` must replay the last-applied language config so it
2426    /// can never silently wipe user `[languages]` rules. This is the invariant
2427    /// that keeps `register_alias`, `populate_built_in_aliases`, and any
2428    /// future rebuild callsite safe-by-construction.
2429    #[test]
2430    fn test_rebuild_catalog_replays_language_config() {
2431        let mut registry = GrammarRegistry::default();
2432        let mut languages = std::collections::HashMap::new();
2433        languages.insert(
2434            "myshell".to_string(),
2435            lang_cfg("bash", &["myext"], &["*.myglob"]),
2436        );
2437        registry.apply_language_config(&languages);
2438        assert!(registry.find_by_extension("myext").is_some());
2439        assert!(registry
2440            .find_by_path(Path::new("foo.myglob"), None)
2441            .is_some());
2442
2443        // Force a rebuild — the catalog gets wiped and re-populated from
2444        // syntect / tree-sitter, but user config must come back on top.
2445        registry.rebuild_catalog();
2446        assert!(
2447            registry.find_by_extension("myext").is_some(),
2448            "rebuild_catalog must replay applied user config"
2449        );
2450        assert!(
2451            registry
2452                .find_by_path(Path::new("foo.myglob"), None)
2453                .is_some(),
2454            "rebuild_catalog must replay user globs"
2455        );
2456    }
2457
2458    /// `apply_language_config` must be idempotent: calling it twice with the
2459    /// same config yields the same catalog state.
2460    #[test]
2461    fn test_apply_language_config_idempotent() {
2462        let mut registry = GrammarRegistry::default();
2463        let mut languages = std::collections::HashMap::new();
2464        languages.insert(
2465            "shell-cfg".to_string(),
2466            lang_cfg("bash", &["myconf"], &["*.myconf"]),
2467        );
2468
2469        registry.apply_language_config(&languages);
2470        let first_extensions = registry
2471            .find_by_name("bash")
2472            .unwrap()
2473            .extensions
2474            .iter()
2475            .filter(|e| e == &"myconf")
2476            .count();
2477        let first_globs = registry
2478            .find_by_name("bash")
2479            .unwrap()
2480            .filename_globs
2481            .iter()
2482            .filter(|g| g == &"*.myconf")
2483            .count();
2484        assert_eq!(first_extensions, 1);
2485        assert_eq!(first_globs, 1);
2486
2487        // Second call must not duplicate anything.
2488        registry.apply_language_config(&languages);
2489        let second_extensions = registry
2490            .find_by_name("bash")
2491            .unwrap()
2492            .extensions
2493            .iter()
2494            .filter(|e| e == &"myconf")
2495            .count();
2496        let second_globs = registry
2497            .find_by_name("bash")
2498            .unwrap()
2499            .filename_globs
2500            .iter()
2501            .filter(|g| g == &"*.myconf")
2502            .count();
2503        assert_eq!(second_extensions, 1, "extensions must not duplicate");
2504        assert_eq!(second_globs, 1, "globs must not duplicate");
2505    }
2506
2507    /// Julia: a single-quote after an identifier is the adjoint
2508    /// (conjugate-transpose) postfix operator, not the start of a string. The
2509    /// old grammar pushed a string context on every `'`, so `A'` swallowed
2510    /// the rest of the file until the next quote — wrecking highlighting for
2511    /// any subsequent keyword. Issue #1852.
2512    #[test]
2513    fn test_julia_adjoint_does_not_start_string() {
2514        use syntect::parsing::{ParseState, ScopeStack};
2515
2516        let registry = GrammarRegistry::default();
2517        let syntax_set = registry.syntax_set();
2518        let syntax = registry
2519            .find_syntax_by_name("Julia")
2520            .expect("Julia grammar must be loaded");
2521        let mut state = ParseState::new(syntax);
2522        let mut scopes = ScopeStack::new();
2523
2524        // Adjoint operator followed by code on later lines.
2525        let lines = ["x = A'\n", "function foo()\n", "end\n"];
2526        let mut keyword_line_in_string = false;
2527        let mut found_function_keyword = false;
2528
2529        for line in &lines {
2530            let ops = state.parse_line(line, syntax_set).unwrap();
2531            // Walk byte-by-byte, applying ops as we pass their offset.
2532            let mut op_iter = ops.iter().peekable();
2533            for (byte_idx, _) in line.char_indices() {
2534                while let Some((offset, op)) = op_iter.peek() {
2535                    if *offset <= byte_idx {
2536                        scopes.apply(op).unwrap();
2537                        op_iter.next();
2538                    } else {
2539                        break;
2540                    }
2541                }
2542                let in_string = scopes
2543                    .as_slice()
2544                    .iter()
2545                    .any(|s| s.build_string().starts_with("string."));
2546                let is_function_kw = line[byte_idx..].starts_with("function");
2547                if is_function_kw && in_string {
2548                    keyword_line_in_string = true;
2549                }
2550                if is_function_kw && !in_string {
2551                    found_function_keyword = true;
2552                }
2553            }
2554            // Drain remaining ops at end of line.
2555            for (_, op) in op_iter {
2556                scopes.apply(op).unwrap();
2557            }
2558        }
2559
2560        assert!(
2561            !keyword_line_in_string,
2562            "the `function` keyword after an adjoint operator must not be inside a string scope"
2563        );
2564        assert!(
2565            found_function_keyword,
2566            "test harness must have reached the `function` keyword"
2567        );
2568    }
2569
2570    /// Julia: `'a'` is a valid character literal. The grammar must still
2571    /// scope it as a constant/character so themes can color it. Issue #1852.
2572    #[test]
2573    fn test_julia_char_literal_is_recognized() {
2574        use syntect::parsing::{ParseState, ScopeStack};
2575
2576        let registry = GrammarRegistry::default();
2577        let syntax_set = registry.syntax_set();
2578        let syntax = registry
2579            .find_syntax_by_name("Julia")
2580            .expect("Julia grammar must be loaded");
2581        let mut state = ParseState::new(syntax);
2582        let mut scopes = ScopeStack::new();
2583
2584        let line = "x = 'a'\n";
2585        let ops = state.parse_line(line, syntax_set).unwrap();
2586        let mut saw_constant_or_string_at_quote = false;
2587        let mut op_iter = ops.iter().peekable();
2588        for (byte_idx, _) in line.char_indices() {
2589            while let Some((offset, op)) = op_iter.peek() {
2590                if *offset <= byte_idx {
2591                    scopes.apply(op).unwrap();
2592                    op_iter.next();
2593                } else {
2594                    break;
2595                }
2596            }
2597            if byte_idx == 5 {
2598                // position of 'a' (the char)
2599                let scoped = scopes.as_slice().iter().any(|s| {
2600                    let str = s.build_string();
2601                    str.starts_with("constant.") || str.starts_with("string.")
2602                });
2603                if scoped {
2604                    saw_constant_or_string_at_quote = true;
2605                }
2606            }
2607        }
2608        assert!(
2609            saw_constant_or_string_at_quote,
2610            "char literal 'a' must receive a constant/string scope"
2611        );
2612    }
2613
2614    /// `tree_sitter_for_syntect_name` handles the alias table + strict
2615    /// display-name match. The alias table catches syntect's verbose names;
2616    /// the strict match handles the common case.
2617    #[test]
2618    fn test_tree_sitter_bridge() {
2619        assert_eq!(
2620            tree_sitter_for_syntect_name("Bourne Again Shell (bash)"),
2621            Some(fresh_languages::Language::Bash)
2622        );
2623        assert_eq!(
2624            tree_sitter_for_syntect_name("Rust"),
2625            Some(fresh_languages::Language::Rust)
2626        );
2627        assert_eq!(tree_sitter_for_syntect_name("GDScript"), None);
2628        // Must NOT fuzzy-match Nushell to Bash.
2629        assert_eq!(tree_sitter_for_syntect_name("Nushell"), None);
2630        // Must NOT match arbitrary strings.
2631        assert_eq!(tree_sitter_for_syntect_name("does-not-exist"), None);
2632    }
2633}