Skip to main content

harn_hostlib/ast/
language.rs

1//! Tree-sitter language registry.
2//!
3//! The set of languages, their canonical names, and their file extensions
4//! form the hostlib AST wire contract. Adding or dropping a language
5//! requires coordinated schema, fixture, and host-bridge updates.
6//!
7//! ## Per-language onboarding contract (B.7)
8//!
9//! Each [`Language`] variant carries the full adapter contract on the enum
10//! itself — there is no separate `LanguageAdapter` object to keep in sync:
11//!
12//! 1. **grammar binding** — [`Language::ts_language`]
13//! 2. **wire name + aliases** — [`Language::name`] / [`Language::from_name`]
14//! 3. **extension detection** — [`Language::from_extension`]
15//! 4. **symbol-graph projection** (drives `rename_symbol`) —
16//!    [`Language::rename_identifier_kinds`]
17//! 5. **symbol/outline extraction** — `ast::symbols::extract`
18//! 6. **test fixture** — `tests/fixtures/ast/<name>/`
19//!
20//! Format-preserving span replacement and trivia/indentation handling are
21//! grammar-agnostic (byte-span splice + inferred indent), so they need no
22//! per-language code. The result is that adding a language is a bounded
23//! ticket: register the grammar, add the four mapping arms, drop in a
24//! fixture, and (optionally) an identifier-kind table for rename support.
25
26use tree_sitter::Language as TsLanguage;
27
/// The set of languages backed by a tree-sitter grammar.
///
/// A language is identified on the wire by its canonical name — the string
/// [`Language::name`] returns — and that is the spelling callers and the
/// JSON schemas use. The last group of variants (`Json` through
/// `Markdown`) covers data/markup/config grammars: the query-driven edit
/// primitives work on them, but they have no symbol-graph projection
/// (see [`Language::edit_capabilities`]).
// Variants are named after the languages themselves and carry no
// per-variant docs, hence the lint opt-out below.
#[allow(missing_docs)]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Language {
    // Programming languages.
    TypeScript,
    Tsx,
    JavaScript,
    Jsx,
    Python,
    Go,
    Rust,
    Java,
    C,
    Cpp,
    CSharp,
    Ruby,
    Kotlin,
    Php,
    Scala,
    Bash,
    Swift,
    Zig,
    Elixir,
    Lua,
    Haskell,
    R,

    // Data / markup / config formats (no symbol-graph projection).
    Json,
    Yaml,
    Toml,
    Css,
    Html,
    Sql,
    Markdown,
}
68
/// Suggestion handed to the agent loop when no AST-precise edit is
/// available for a file: use a text-level edit instead.
///
/// The string is surfaced verbatim as the `fallback_suggestion` field of
/// every `unsupported_*` edit response, so the loop can degrade gracefully
/// without branching per call.
pub const TEXT_PATCH_FALLBACK: &str =
    "fall back to a text-level edit (std/edit `edit_safe_text_patch`)";
75
/// The AST-precise edit primitives a given language supports.
///
/// `apply_node` and `insert_at_anchor` are query-driven, so they work with
/// any registered tree-sitter grammar and are always `true`.
/// `rename_symbol` depends on a per-language identifier-kind projection
/// (see [`Language::rename_identifier_kinds`]), and `symbols` (symbol and
/// outline extraction) depends on a per-language extractor (see
/// `ast::symbols`).
///
/// This matrix is the onboarding contract: it tells the agent loop which
/// primitive to reach for, and it is rendered into the capability-matrix
/// docs.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct EditCapabilities {
    /// Format-preserving replacement of nodes selected by a tree-sitter
    /// query.
    pub apply_node: bool,
    /// Insertion of a sibling or child relative to an anchor node.
    pub insert_at_anchor: bool,
    /// Safe cross-file rename driven by the symbol graph.
    pub rename_symbol: bool,
    /// Extraction of symbols and outlines.
    pub symbols: bool,
}
96
97impl Language {
98    /// Canonical wire name.
99    pub fn name(self) -> &'static str {
100        match self {
101            Language::TypeScript => "typescript",
102            Language::Tsx => "tsx",
103            Language::JavaScript => "javascript",
104            Language::Jsx => "jsx",
105            Language::Python => "python",
106            Language::Go => "go",
107            Language::Rust => "rust",
108            Language::Java => "java",
109            Language::C => "c",
110            Language::Cpp => "cpp",
111            Language::CSharp => "csharp",
112            Language::Ruby => "ruby",
113            Language::Kotlin => "kotlin",
114            Language::Php => "php",
115            Language::Scala => "scala",
116            Language::Bash => "bash",
117            Language::Swift => "swift",
118            Language::Zig => "zig",
119            Language::Elixir => "elixir",
120            Language::Lua => "lua",
121            Language::Haskell => "haskell",
122            Language::R => "r",
123            Language::Json => "json",
124            Language::Yaml => "yaml",
125            Language::Toml => "toml",
126            Language::Css => "css",
127            Language::Html => "html",
128            Language::Sql => "sql",
129            Language::Markdown => "markdown",
130        }
131    }
132
133    /// Tree-sitter grammar handle, or `None` when this build was not
134    /// compiled with the grammar family that backs `self`.
135    ///
136    /// Each arm is gated on its `grammar-*` family feature, so a trimmed
137    /// build only links the grammars it asked for. The `name`/extension/
138    /// detection metadata above stays complete regardless of features — a
139    /// lean build still *recognizes* a `.py` file, it just returns `None`
140    /// here and the edit primitives degrade to the text fallback. The full
141    /// (default) build enables every family, so `None` never occurs there.
142    /// Cheap when present; the underlying `LANGUAGE` constants are static.
143    pub fn ts_language(self) -> Option<TsLanguage> {
144        Some(match self {
145            #[cfg(feature = "grammar-web")]
146            Language::TypeScript => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
147            #[cfg(feature = "grammar-web")]
148            Language::Tsx => tree_sitter_typescript::LANGUAGE_TSX.into(),
149            #[cfg(feature = "grammar-web")]
150            Language::JavaScript | Language::Jsx => tree_sitter_javascript::LANGUAGE.into(),
151            #[cfg(feature = "grammar-web")]
152            Language::Html => tree_sitter_html::LANGUAGE.into(),
153            #[cfg(feature = "grammar-web")]
154            Language::Css => tree_sitter_css::LANGUAGE.into(),
155
156            #[cfg(feature = "grammar-systems")]
157            Language::Rust => tree_sitter_rust::LANGUAGE.into(),
158            #[cfg(feature = "grammar-systems")]
159            Language::C => tree_sitter_c::LANGUAGE.into(),
160            #[cfg(feature = "grammar-systems")]
161            Language::Cpp => tree_sitter_cpp::LANGUAGE.into(),
162            #[cfg(feature = "grammar-systems")]
163            Language::Go => tree_sitter_go::LANGUAGE.into(),
164            #[cfg(feature = "grammar-systems")]
165            Language::Zig => tree_sitter_zig::LANGUAGE.into(),
166
167            #[cfg(feature = "grammar-scripting")]
168            Language::Python => tree_sitter_python::LANGUAGE.into(),
169            #[cfg(feature = "grammar-scripting")]
170            Language::Ruby => tree_sitter_ruby::LANGUAGE.into(),
171            #[cfg(feature = "grammar-scripting")]
172            Language::Bash => tree_sitter_bash::LANGUAGE.into(),
173            #[cfg(feature = "grammar-scripting")]
174            Language::Lua => tree_sitter_lua::LANGUAGE.into(),
175            #[cfg(feature = "grammar-scripting")]
176            Language::Php => tree_sitter_php::LANGUAGE_PHP.into(),
177            #[cfg(feature = "grammar-scripting")]
178            Language::R => tree_sitter_r::LANGUAGE.into(),
179
180            #[cfg(feature = "grammar-jvm")]
181            Language::Java => tree_sitter_java::LANGUAGE.into(),
182            #[cfg(feature = "grammar-jvm")]
183            Language::Kotlin => tree_sitter_kotlin_ng::LANGUAGE.into(),
184            #[cfg(feature = "grammar-jvm")]
185            Language::Scala => tree_sitter_scala::LANGUAGE.into(),
186
187            #[cfg(feature = "grammar-enterprise")]
188            Language::CSharp => tree_sitter_c_sharp::LANGUAGE.into(),
189            #[cfg(feature = "grammar-enterprise")]
190            Language::Swift => tree_sitter_swift::LANGUAGE.into(),
191            #[cfg(feature = "grammar-enterprise")]
192            Language::Elixir => tree_sitter_elixir::LANGUAGE.into(),
193            #[cfg(feature = "grammar-enterprise")]
194            Language::Haskell => tree_sitter_haskell::LANGUAGE.into(),
195
196            #[cfg(feature = "grammar-data")]
197            Language::Json => tree_sitter_json::LANGUAGE.into(),
198            #[cfg(feature = "grammar-data")]
199            Language::Yaml => tree_sitter_yaml::LANGUAGE.into(),
200            #[cfg(feature = "grammar-data")]
201            Language::Toml => tree_sitter_toml_ng::LANGUAGE.into(),
202            #[cfg(feature = "grammar-data")]
203            Language::Sql => tree_sitter_sequel::LANGUAGE.into(),
204            // tree-sitter-md ships a split block/inline grammar; the block
205            // grammar is the structural tree the edit primitives operate
206            // on (headings, lists, fenced code, …).
207            #[cfg(feature = "grammar-data")]
208            Language::Markdown => tree_sitter_md::LANGUAGE.into(),
209
210            // Any language whose family was not compiled into this build.
211            // Unreachable under the default (all-families) build.
212            #[allow(unreachable_patterns)]
213            _ => return None,
214        })
215    }
216
217    /// Resolve a language from its canonical wire name. Accepts a few
218    /// historical aliases (`ts`, `js`, `c++`, …) so users don't have to
219    /// memorize the exact spelling.
220    pub fn from_name(name: &str) -> Option<Self> {
221        let normalized = name.trim().to_ascii_lowercase();
222        Some(match normalized.as_str() {
223            "typescript" | "ts" => Language::TypeScript,
224            "tsx" => Language::Tsx,
225            "javascript" | "js" => Language::JavaScript,
226            "jsx" => Language::Jsx,
227            "python" | "py" => Language::Python,
228            "go" | "golang" => Language::Go,
229            "rust" | "rs" => Language::Rust,
230            "java" => Language::Java,
231            "c" => Language::C,
232            "cpp" | "c++" | "cxx" => Language::Cpp,
233            "csharp" | "c#" | "cs" => Language::CSharp,
234            "ruby" | "rb" => Language::Ruby,
235            "kotlin" | "kt" => Language::Kotlin,
236            "php" => Language::Php,
237            "scala" => Language::Scala,
238            "bash" | "shell" | "sh" | "zsh" => Language::Bash,
239            "swift" => Language::Swift,
240            "zig" => Language::Zig,
241            "elixir" | "ex" => Language::Elixir,
242            "lua" => Language::Lua,
243            "haskell" | "hs" => Language::Haskell,
244            "r" => Language::R,
245            "json" => Language::Json,
246            "yaml" | "yml" => Language::Yaml,
247            "toml" => Language::Toml,
248            "css" => Language::Css,
249            "html" | "htm" => Language::Html,
250            "sql" => Language::Sql,
251            "markdown" | "md" => Language::Markdown,
252            _ => return None,
253        })
254    }
255
256    /// Resolve a language from a file extension.
257    pub fn from_extension(ext: &str) -> Option<Self> {
258        let normalized = ext.trim_start_matches('.').to_ascii_lowercase();
259        Some(match normalized.as_str() {
260            "ts" => Language::TypeScript,
261            "tsx" => Language::Tsx,
262            "js" | "mjs" | "cjs" => Language::JavaScript,
263            "jsx" => Language::Jsx,
264            "py" => Language::Python,
265            "go" => Language::Go,
266            "rs" => Language::Rust,
267            "java" => Language::Java,
268            "c" | "h" => Language::C,
269            "cpp" | "cc" | "cxx" | "hpp" | "hxx" | "hh" => Language::Cpp,
270            "cs" | "csx" => Language::CSharp,
271            "rb" => Language::Ruby,
272            "kt" | "kts" => Language::Kotlin,
273            "php" => Language::Php,
274            "scala" | "sc" => Language::Scala,
275            "sh" | "bash" | "zsh" => Language::Bash,
276            "swift" => Language::Swift,
277            "zig" | "zon" => Language::Zig,
278            "ex" | "exs" => Language::Elixir,
279            "lua" => Language::Lua,
280            "hs" | "lhs" => Language::Haskell,
281            "r" => Language::R,
282            "json" => Language::Json,
283            "yaml" | "yml" => Language::Yaml,
284            "toml" => Language::Toml,
285            "css" => Language::Css,
286            "html" | "htm" => Language::Html,
287            "sql" => Language::Sql,
288            "md" | "markdown" => Language::Markdown,
289            _ => return None,
290        })
291    }
292
293    /// Resolve from a file path: prefer explicit `language_hint` if
294    /// supplied, otherwise fall back to extension-based detection.
295    pub fn detect(path: &std::path::Path, language_hint: Option<&str>) -> Option<Self> {
296        if let Some(name) = language_hint.and_then(|s| (!s.is_empty()).then_some(s)) {
297            return Self::from_name(name);
298        }
299        let ext = path.extension().and_then(|s| s.to_str())?;
300        Self::from_extension(ext)
301    }
302
303    /// A representative file extension for the language (no leading dot).
304    /// Used by docs and the onboarding probe; not necessarily the only
305    /// extension [`Language::from_extension`] accepts.
306    pub fn primary_extension(self) -> &'static str {
307        match self {
308            Language::TypeScript => "ts",
309            Language::Tsx => "tsx",
310            Language::JavaScript => "js",
311            Language::Jsx => "jsx",
312            Language::Python => "py",
313            Language::Go => "go",
314            Language::Rust => "rs",
315            Language::Java => "java",
316            Language::C => "c",
317            Language::Cpp => "cpp",
318            Language::CSharp => "cs",
319            Language::Ruby => "rb",
320            Language::Kotlin => "kt",
321            Language::Php => "php",
322            Language::Scala => "scala",
323            Language::Bash => "sh",
324            Language::Swift => "swift",
325            Language::Zig => "zig",
326            Language::Elixir => "ex",
327            Language::Lua => "lua",
328            Language::Haskell => "hs",
329            Language::R => "r",
330            Language::Json => "json",
331            Language::Yaml => "yaml",
332            Language::Toml => "toml",
333            Language::Css => "css",
334            Language::Html => "html",
335            Language::Sql => "sql",
336            Language::Markdown => "md",
337        }
338    }
339
340    /// Per-language allow-list of tree-sitter node kinds that represent an
341    /// identifier token bound to a name (variables, functions, types,
342    /// fields). This is the symbol-graph projection that drives
343    /// `rename_symbol`: anything not in this table is treated as a literal
344    /// or punctuation node and left alone, which keeps a rename out of
345    /// comments and string bodies even though those *contain* identifier
346    /// substrings. `None` means the language has no rename projection yet.
347    pub fn rename_identifier_kinds(self) -> Option<&'static [&'static str]> {
348        Some(match self {
349            Language::Rust => &[
350                "identifier",
351                "type_identifier",
352                "field_identifier",
353                "shorthand_field_identifier",
354            ],
355            Language::TypeScript | Language::Tsx => &[
356                "identifier",
357                "type_identifier",
358                "property_identifier",
359                "shorthand_property_identifier",
360                "shorthand_property_identifier_pattern",
361            ],
362            Language::JavaScript | Language::Jsx => &[
363                "identifier",
364                "property_identifier",
365                "shorthand_property_identifier",
366                "shorthand_property_identifier_pattern",
367            ],
368            Language::Python => &["identifier"],
369            Language::Go => &[
370                "identifier",
371                "type_identifier",
372                "field_identifier",
373                "package_identifier",
374            ],
375            Language::Swift => &["simple_identifier", "type_identifier"],
376            _ => return None,
377        })
378    }
379
380    /// Whether `rename_symbol` can operate on this language (i.e. it has a
381    /// [`Language::rename_identifier_kinds`] projection).
382    pub fn supports_rename(self) -> bool {
383        self.rename_identifier_kinds().is_some()
384    }
385
386    /// Data / markup / config grammars that carry no nameable symbols, so
387    /// symbol + outline extraction is intentionally empty for them.
388    fn is_data_format(self) -> bool {
389        matches!(
390            self,
391            Language::Json
392                | Language::Yaml
393                | Language::Toml
394                | Language::Css
395                | Language::Html
396                | Language::Sql
397                | Language::Markdown
398        )
399    }
400
401    /// Whether `symbols`/`outline` produce meaningful results. Data/markup
402    /// grammars parse and edit fine but expose no symbol projection.
403    pub fn supports_symbol_extraction(self) -> bool {
404        !self.is_data_format()
405    }
406
407    /// The AST-precise edit capability matrix for this language. See
408    /// [`EditCapabilities`].
409    pub fn edit_capabilities(self) -> EditCapabilities {
410        EditCapabilities {
411            apply_node: true,
412            insert_at_anchor: true,
413            rename_symbol: self.supports_rename(),
414            symbols: self.supports_symbol_extraction(),
415        }
416    }
417
418    /// Every language we ship support for. Useful for tests + introspection.
419    pub fn all() -> &'static [Language] {
420        &[
421            Language::TypeScript,
422            Language::Tsx,
423            Language::JavaScript,
424            Language::Jsx,
425            Language::Python,
426            Language::Go,
427            Language::Rust,
428            Language::Java,
429            Language::C,
430            Language::Cpp,
431            Language::CSharp,
432            Language::Ruby,
433            Language::Kotlin,
434            Language::Php,
435            Language::Scala,
436            Language::Bash,
437            Language::Swift,
438            Language::Zig,
439            Language::Elixir,
440            Language::Lua,
441            Language::Haskell,
442            Language::R,
443            Language::Json,
444            Language::Yaml,
445            Language::Toml,
446            Language::Css,
447            Language::Html,
448            Language::Sql,
449            Language::Markdown,
450        ]
451    }
452}
453
454#[cfg(test)]
455mod tests {
456    use super::*;
457
458    // Only the all-families (default) build links every grammar; under a
459    // trimmed grammar set some languages intentionally resolve to `None`.
460    #[cfg(feature = "grammars-all")]
461    #[test]
462    fn every_language_is_loadable() {
463        for &lang in Language::all() {
464            // Constructing the tree-sitter Language must not panic and must
465            // produce a non-trivial grammar.
466            let ts = lang
467                .ts_language()
468                .unwrap_or_else(|| panic!("{} grammar not compiled", lang.name()));
469            assert!(ts.node_kind_count() > 0, "{} grammar is empty", lang.name());
470        }
471    }
472
473    #[test]
474    fn extension_detection_round_trips_canonical_extensions() {
475        let cases: &[(&str, Language)] = &[
476            ("ts", Language::TypeScript),
477            ("tsx", Language::Tsx),
478            ("js", Language::JavaScript),
479            ("jsx", Language::Jsx),
480            ("py", Language::Python),
481            ("rs", Language::Rust),
482            ("go", Language::Go),
483            ("java", Language::Java),
484            ("c", Language::C),
485            ("cpp", Language::Cpp),
486            ("cs", Language::CSharp),
487            ("rb", Language::Ruby),
488            ("kt", Language::Kotlin),
489            ("php", Language::Php),
490            ("scala", Language::Scala),
491            ("sh", Language::Bash),
492            ("swift", Language::Swift),
493            ("zig", Language::Zig),
494            ("ex", Language::Elixir),
495            ("lua", Language::Lua),
496            ("hs", Language::Haskell),
497            ("r", Language::R),
498            ("json", Language::Json),
499            ("yaml", Language::Yaml),
500            ("yml", Language::Yaml),
501            ("toml", Language::Toml),
502            ("css", Language::Css),
503            ("html", Language::Html),
504            ("sql", Language::Sql),
505            ("md", Language::Markdown),
506        ];
507        for (ext, want) in cases {
508            assert_eq!(Language::from_extension(ext), Some(*want), "ext {ext}");
509        }
510    }
511
512    #[test]
513    fn name_round_trips_for_every_language() {
514        for &lang in Language::all() {
515            assert_eq!(Language::from_name(lang.name()), Some(lang));
516        }
517    }
518
519    #[test]
520    fn primary_extension_resolves_back_to_the_language() {
521        for &lang in Language::all() {
522            assert_eq!(
523                Language::from_extension(lang.primary_extension()),
524                Some(lang),
525                "primary extension for {} does not round-trip",
526                lang.name()
527            );
528        }
529    }
530
531    #[test]
532    fn detect_prefers_hint_over_extension() {
533        let path = std::path::Path::new("foo.ts");
534        assert_eq!(Language::detect(path, None), Some(Language::TypeScript));
535        assert_eq!(
536            Language::detect(path, Some("javascript")),
537            Some(Language::JavaScript)
538        );
539    }
540
541    #[test]
542    fn edit_primitives_are_universal_rename_is_gated() {
543        for &lang in Language::all() {
544            let caps = lang.edit_capabilities();
545            assert!(caps.apply_node, "{} should support apply_node", lang.name());
546            assert!(
547                caps.insert_at_anchor,
548                "{} should support insert_at_anchor",
549                lang.name()
550            );
551            assert_eq!(
552                caps.rename_symbol,
553                lang.rename_identifier_kinds().is_some(),
554                "{} rename capability must match its identifier-kind table",
555                lang.name()
556            );
557        }
558        // Data/markup formats edit but carry no symbol projection.
559        assert!(!Language::Json.edit_capabilities().rename_symbol);
560        assert!(!Language::Json.edit_capabilities().symbols);
561        assert!(Language::Rust.edit_capabilities().rename_symbol);
562        assert!(Language::Rust.edit_capabilities().symbols);
563    }
564}