Skip to main content

harn_hostlib/ast/
language.rs

1//! Tree-sitter language registry.
2//!
3//! The set of languages, their canonical names, and their file extensions
4//! form the hostlib AST wire contract. Adding or dropping a language
5//! requires coordinated schema, fixture, and host-bridge updates.
6//!
7//! ## Per-language onboarding contract (B.7)
8//!
9//! Each [`Language`] variant carries the full adapter contract on the enum
10//! itself — there is no separate `LanguageAdapter` object to keep in sync:
11//!
12//! 1. **grammar binding** — [`Language::ts_language`]
13//! 2. **wire name + aliases** — [`Language::name`] / [`Language::from_name`]
14//! 3. **extension detection** — [`Language::from_extension`]
15//! 4. **symbol-graph projection** (drives `rename_symbol`) —
16//!    [`Language::rename_identifier_kinds`]
17//! 5. **symbol/outline extraction** — `ast::symbols::extract`
18//! 6. **test fixture** — `tests/fixtures/ast/<name>/`
19//!
20//! Format-preserving span replacement and trivia/indentation handling are
21//! grammar-agnostic (byte-span splice + inferred indent), so they need no
22//! per-language code. The result is that adding a language is a bounded
23//! ticket: register the grammar, add the four mapping arms, drop in a
24//! fixture, and (optionally) an identifier-kind table for rename support.
25
26use tree_sitter::Language as TsLanguage;
27
/// Languages with tree-sitter grammar support.
///
/// Each variant is identified on the wire by the string returned from
/// [`Language::name`]; callers (and the JSON schemas) refer to languages
/// by that string. Variant order is grouped, not alphabetical: general
/// purpose programming languages come first, and the trailing group
/// (`Json`..`Markdown`) are data/markup/config grammars. Those support
/// the query-driven edit primitives but have no symbol-graph projection
/// (see [`Language::edit_capabilities`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// TypeScript (wire name `typescript`).
    TypeScript,
    /// TypeScript with JSX syntax (wire name `tsx`).
    Tsx,
    /// JavaScript (wire name `javascript`).
    JavaScript,
    /// JavaScript with JSX syntax (wire name `jsx`); parsed with the same
    /// grammar as [`Language::JavaScript`].
    Jsx,
    /// Python (wire name `python`).
    Python,
    /// Go (wire name `go`).
    Go,
    /// Rust (wire name `rust`).
    Rust,
    /// Java (wire name `java`).
    Java,
    /// C (wire name `c`); also claims the `.h` header extension.
    C,
    /// C++ (wire name `cpp`).
    Cpp,
    /// C# (wire name `csharp`).
    CSharp,
    /// Ruby (wire name `ruby`).
    Ruby,
    /// Kotlin (wire name `kotlin`).
    Kotlin,
    /// PHP (wire name `php`).
    Php,
    /// Scala (wire name `scala`).
    Scala,
    /// Bash and compatible shell scripts (wire name `bash`).
    Bash,
    /// Swift (wire name `swift`).
    Swift,
    /// Zig (wire name `zig`).
    Zig,
    /// Elixir (wire name `elixir`).
    Elixir,
    /// Lua (wire name `lua`).
    Lua,
    /// Haskell (wire name `haskell`).
    Haskell,
    /// R (wire name `r`).
    R,

    // --- data / markup / config grammars: editable, no symbol graph ---
    /// JSON (wire name `json`).
    Json,
    /// YAML (wire name `yaml`).
    Yaml,
    /// TOML (wire name `toml`).
    Toml,
    /// CSS (wire name `css`).
    Css,
    /// HTML (wire name `html`).
    Html,
    /// SQL (wire name `sql`).
    Sql,
    /// Markdown (wire name `markdown`).
    Markdown,
}
68
/// Suggested text-level fallback for when no AST-precise edit is
/// available for a file.
///
/// The string is surfaced verbatim as the `fallback_suggestion` field on
/// every `unsupported_*` edit response, so the agent loop can degrade
/// gracefully without branching per call.
pub const TEXT_PATCH_FALLBACK: &str =
    "fall back to a text-level edit (std/edit `edit_safe_text_patch`)";
75
/// Per-language matrix of the AST-precise edit primitives that are
/// available.
///
/// * `apply_node` and `insert_at_anchor` are query-driven and work against
///   any registered tree-sitter grammar, so they are always `true`.
/// * `rename_symbol` requires a per-language identifier-kind projection
///   (see [`Language::rename_identifier_kinds`]).
/// * `symbols` (symbol + outline extraction) requires a per-language
///   extractor (see `ast::symbols`).
///
/// The matrix is the onboarding contract: it tells the agent loop which
/// primitive to reach for and is rendered into the capability-matrix docs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EditCapabilities {
    /// Format-preserving replacement of nodes selected by a tree-sitter
    /// query.
    pub apply_node: bool,
    /// Insertion of a sibling or child relative to an anchor node.
    pub insert_at_anchor: bool,
    /// Safe cross-file rename driven by the symbol graph.
    pub rename_symbol: bool,
    /// Extraction of symbols and the file outline.
    pub symbols: bool,
}
96
97impl Language {
98    /// Canonical wire name.
99    pub fn name(self) -> &'static str {
100        match self {
101            Language::TypeScript => "typescript",
102            Language::Tsx => "tsx",
103            Language::JavaScript => "javascript",
104            Language::Jsx => "jsx",
105            Language::Python => "python",
106            Language::Go => "go",
107            Language::Rust => "rust",
108            Language::Java => "java",
109            Language::C => "c",
110            Language::Cpp => "cpp",
111            Language::CSharp => "csharp",
112            Language::Ruby => "ruby",
113            Language::Kotlin => "kotlin",
114            Language::Php => "php",
115            Language::Scala => "scala",
116            Language::Bash => "bash",
117            Language::Swift => "swift",
118            Language::Zig => "zig",
119            Language::Elixir => "elixir",
120            Language::Lua => "lua",
121            Language::Haskell => "haskell",
122            Language::R => "r",
123            Language::Json => "json",
124            Language::Yaml => "yaml",
125            Language::Toml => "toml",
126            Language::Css => "css",
127            Language::Html => "html",
128            Language::Sql => "sql",
129            Language::Markdown => "markdown",
130        }
131    }
132
133    /// Tree-sitter grammar handle. Cheap; the underlying `LANGUAGE`
134    /// constants are static.
135    pub fn ts_language(self) -> TsLanguage {
136        match self {
137            Language::TypeScript => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
138            Language::Tsx => tree_sitter_typescript::LANGUAGE_TSX.into(),
139            Language::JavaScript | Language::Jsx => tree_sitter_javascript::LANGUAGE.into(),
140            Language::Python => tree_sitter_python::LANGUAGE.into(),
141            Language::Go => tree_sitter_go::LANGUAGE.into(),
142            Language::Rust => tree_sitter_rust::LANGUAGE.into(),
143            Language::Java => tree_sitter_java::LANGUAGE.into(),
144            Language::C => tree_sitter_c::LANGUAGE.into(),
145            Language::Cpp => tree_sitter_cpp::LANGUAGE.into(),
146            Language::CSharp => tree_sitter_c_sharp::LANGUAGE.into(),
147            Language::Ruby => tree_sitter_ruby::LANGUAGE.into(),
148            Language::Kotlin => tree_sitter_kotlin_ng::LANGUAGE.into(),
149            Language::Php => tree_sitter_php::LANGUAGE_PHP.into(),
150            Language::Scala => tree_sitter_scala::LANGUAGE.into(),
151            Language::Bash => tree_sitter_bash::LANGUAGE.into(),
152            Language::Swift => tree_sitter_swift::LANGUAGE.into(),
153            Language::Zig => tree_sitter_zig::LANGUAGE.into(),
154            Language::Elixir => tree_sitter_elixir::LANGUAGE.into(),
155            Language::Lua => tree_sitter_lua::LANGUAGE.into(),
156            Language::Haskell => tree_sitter_haskell::LANGUAGE.into(),
157            Language::R => tree_sitter_r::LANGUAGE.into(),
158            Language::Json => tree_sitter_json::LANGUAGE.into(),
159            Language::Yaml => tree_sitter_yaml::LANGUAGE.into(),
160            Language::Toml => tree_sitter_toml_ng::LANGUAGE.into(),
161            Language::Css => tree_sitter_css::LANGUAGE.into(),
162            Language::Html => tree_sitter_html::LANGUAGE.into(),
163            Language::Sql => tree_sitter_sequel::LANGUAGE.into(),
164            // tree-sitter-md ships a split block/inline grammar; the block
165            // grammar is the structural tree the edit primitives operate
166            // on (headings, lists, fenced code, …).
167            Language::Markdown => tree_sitter_md::LANGUAGE.into(),
168        }
169    }
170
171    /// Resolve a language from its canonical wire name. Accepts a few
172    /// historical aliases (`ts`, `js`, `c++`, …) so users don't have to
173    /// memorize the exact spelling.
174    pub fn from_name(name: &str) -> Option<Self> {
175        let normalized = name.trim().to_ascii_lowercase();
176        Some(match normalized.as_str() {
177            "typescript" | "ts" => Language::TypeScript,
178            "tsx" => Language::Tsx,
179            "javascript" | "js" => Language::JavaScript,
180            "jsx" => Language::Jsx,
181            "python" | "py" => Language::Python,
182            "go" | "golang" => Language::Go,
183            "rust" | "rs" => Language::Rust,
184            "java" => Language::Java,
185            "c" => Language::C,
186            "cpp" | "c++" | "cxx" => Language::Cpp,
187            "csharp" | "c#" | "cs" => Language::CSharp,
188            "ruby" | "rb" => Language::Ruby,
189            "kotlin" | "kt" => Language::Kotlin,
190            "php" => Language::Php,
191            "scala" => Language::Scala,
192            "bash" | "shell" | "sh" | "zsh" => Language::Bash,
193            "swift" => Language::Swift,
194            "zig" => Language::Zig,
195            "elixir" | "ex" => Language::Elixir,
196            "lua" => Language::Lua,
197            "haskell" | "hs" => Language::Haskell,
198            "r" => Language::R,
199            "json" => Language::Json,
200            "yaml" | "yml" => Language::Yaml,
201            "toml" => Language::Toml,
202            "css" => Language::Css,
203            "html" | "htm" => Language::Html,
204            "sql" => Language::Sql,
205            "markdown" | "md" => Language::Markdown,
206            _ => return None,
207        })
208    }
209
210    /// Resolve a language from a file extension.
211    pub fn from_extension(ext: &str) -> Option<Self> {
212        let normalized = ext.trim_start_matches('.').to_ascii_lowercase();
213        Some(match normalized.as_str() {
214            "ts" => Language::TypeScript,
215            "tsx" => Language::Tsx,
216            "js" | "mjs" | "cjs" => Language::JavaScript,
217            "jsx" => Language::Jsx,
218            "py" => Language::Python,
219            "go" => Language::Go,
220            "rs" => Language::Rust,
221            "java" => Language::Java,
222            "c" | "h" => Language::C,
223            "cpp" | "cc" | "cxx" | "hpp" | "hxx" | "hh" => Language::Cpp,
224            "cs" | "csx" => Language::CSharp,
225            "rb" => Language::Ruby,
226            "kt" | "kts" => Language::Kotlin,
227            "php" => Language::Php,
228            "scala" | "sc" => Language::Scala,
229            "sh" | "bash" | "zsh" => Language::Bash,
230            "swift" => Language::Swift,
231            "zig" | "zon" => Language::Zig,
232            "ex" | "exs" => Language::Elixir,
233            "lua" => Language::Lua,
234            "hs" | "lhs" => Language::Haskell,
235            "r" => Language::R,
236            "json" => Language::Json,
237            "yaml" | "yml" => Language::Yaml,
238            "toml" => Language::Toml,
239            "css" => Language::Css,
240            "html" | "htm" => Language::Html,
241            "sql" => Language::Sql,
242            "md" | "markdown" => Language::Markdown,
243            _ => return None,
244        })
245    }
246
247    /// Resolve from a file path: prefer explicit `language_hint` if
248    /// supplied, otherwise fall back to extension-based detection.
249    pub fn detect(path: &std::path::Path, language_hint: Option<&str>) -> Option<Self> {
250        if let Some(name) = language_hint.and_then(|s| (!s.is_empty()).then_some(s)) {
251            return Self::from_name(name);
252        }
253        let ext = path.extension().and_then(|s| s.to_str())?;
254        Self::from_extension(ext)
255    }
256
257    /// A representative file extension for the language (no leading dot).
258    /// Used by docs and the onboarding probe; not necessarily the only
259    /// extension [`Language::from_extension`] accepts.
260    pub fn primary_extension(self) -> &'static str {
261        match self {
262            Language::TypeScript => "ts",
263            Language::Tsx => "tsx",
264            Language::JavaScript => "js",
265            Language::Jsx => "jsx",
266            Language::Python => "py",
267            Language::Go => "go",
268            Language::Rust => "rs",
269            Language::Java => "java",
270            Language::C => "c",
271            Language::Cpp => "cpp",
272            Language::CSharp => "cs",
273            Language::Ruby => "rb",
274            Language::Kotlin => "kt",
275            Language::Php => "php",
276            Language::Scala => "scala",
277            Language::Bash => "sh",
278            Language::Swift => "swift",
279            Language::Zig => "zig",
280            Language::Elixir => "ex",
281            Language::Lua => "lua",
282            Language::Haskell => "hs",
283            Language::R => "r",
284            Language::Json => "json",
285            Language::Yaml => "yaml",
286            Language::Toml => "toml",
287            Language::Css => "css",
288            Language::Html => "html",
289            Language::Sql => "sql",
290            Language::Markdown => "md",
291        }
292    }
293
294    /// Per-language allow-list of tree-sitter node kinds that represent an
295    /// identifier token bound to a name (variables, functions, types,
296    /// fields). This is the symbol-graph projection that drives
297    /// `rename_symbol`: anything not in this table is treated as a literal
298    /// or punctuation node and left alone, which keeps a rename out of
299    /// comments and string bodies even though those *contain* identifier
300    /// substrings. `None` means the language has no rename projection yet.
301    pub fn rename_identifier_kinds(self) -> Option<&'static [&'static str]> {
302        Some(match self {
303            Language::Rust => &[
304                "identifier",
305                "type_identifier",
306                "field_identifier",
307                "shorthand_field_identifier",
308            ],
309            Language::TypeScript | Language::Tsx => &[
310                "identifier",
311                "type_identifier",
312                "property_identifier",
313                "shorthand_property_identifier",
314                "shorthand_property_identifier_pattern",
315            ],
316            Language::JavaScript | Language::Jsx => &[
317                "identifier",
318                "property_identifier",
319                "shorthand_property_identifier",
320                "shorthand_property_identifier_pattern",
321            ],
322            Language::Python => &["identifier"],
323            Language::Go => &[
324                "identifier",
325                "type_identifier",
326                "field_identifier",
327                "package_identifier",
328            ],
329            Language::Swift => &["simple_identifier", "type_identifier"],
330            _ => return None,
331        })
332    }
333
334    /// Whether `rename_symbol` can operate on this language (i.e. it has a
335    /// [`Language::rename_identifier_kinds`] projection).
336    pub fn supports_rename(self) -> bool {
337        self.rename_identifier_kinds().is_some()
338    }
339
340    /// Data / markup / config grammars that carry no nameable symbols, so
341    /// symbol + outline extraction is intentionally empty for them.
342    fn is_data_format(self) -> bool {
343        matches!(
344            self,
345            Language::Json
346                | Language::Yaml
347                | Language::Toml
348                | Language::Css
349                | Language::Html
350                | Language::Sql
351                | Language::Markdown
352        )
353    }
354
355    /// Whether `symbols`/`outline` produce meaningful results. Data/markup
356    /// grammars parse and edit fine but expose no symbol projection.
357    pub fn supports_symbol_extraction(self) -> bool {
358        !self.is_data_format()
359    }
360
361    /// The AST-precise edit capability matrix for this language. See
362    /// [`EditCapabilities`].
363    pub fn edit_capabilities(self) -> EditCapabilities {
364        EditCapabilities {
365            apply_node: true,
366            insert_at_anchor: true,
367            rename_symbol: self.supports_rename(),
368            symbols: self.supports_symbol_extraction(),
369        }
370    }
371
372    /// Every language we ship support for. Useful for tests + introspection.
373    pub fn all() -> &'static [Language] {
374        &[
375            Language::TypeScript,
376            Language::Tsx,
377            Language::JavaScript,
378            Language::Jsx,
379            Language::Python,
380            Language::Go,
381            Language::Rust,
382            Language::Java,
383            Language::C,
384            Language::Cpp,
385            Language::CSharp,
386            Language::Ruby,
387            Language::Kotlin,
388            Language::Php,
389            Language::Scala,
390            Language::Bash,
391            Language::Swift,
392            Language::Zig,
393            Language::Elixir,
394            Language::Lua,
395            Language::Haskell,
396            Language::R,
397            Language::Json,
398            Language::Yaml,
399            Language::Toml,
400            Language::Css,
401            Language::Html,
402            Language::Sql,
403            Language::Markdown,
404        ]
405    }
406}
407
#[cfg(test)]
mod tests {
    use super::*;

    /// Building the tree-sitter grammar for each registered language must
    /// not panic and must yield a grammar with at least one node kind.
    #[test]
    fn every_language_is_loadable() {
        for lang in Language::all().iter().copied() {
            let grammar = lang.ts_language();
            assert!(
                grammar.node_kind_count() > 0,
                "{} grammar is empty",
                lang.name()
            );
        }
    }

    /// Well-known extensions resolve to the expected language.
    #[test]
    fn extension_detection_round_trips_canonical_extensions() {
        const CASES: &[(&str, Language)] = &[
            ("ts", Language::TypeScript),
            ("tsx", Language::Tsx),
            ("js", Language::JavaScript),
            ("jsx", Language::Jsx),
            ("py", Language::Python),
            ("rs", Language::Rust),
            ("go", Language::Go),
            ("java", Language::Java),
            ("c", Language::C),
            ("cpp", Language::Cpp),
            ("cs", Language::CSharp),
            ("rb", Language::Ruby),
            ("kt", Language::Kotlin),
            ("php", Language::Php),
            ("scala", Language::Scala),
            ("sh", Language::Bash),
            ("swift", Language::Swift),
            ("zig", Language::Zig),
            ("ex", Language::Elixir),
            ("lua", Language::Lua),
            ("hs", Language::Haskell),
            ("r", Language::R),
            ("json", Language::Json),
            ("yaml", Language::Yaml),
            ("yml", Language::Yaml),
            ("toml", Language::Toml),
            ("css", Language::Css),
            ("html", Language::Html),
            ("sql", Language::Sql),
            ("md", Language::Markdown),
        ];
        for &(ext, want) in CASES {
            let got = Language::from_extension(ext);
            assert_eq!(got, Some(want), "ext {ext}");
        }
    }

    /// The canonical wire name of each language parses back to itself.
    #[test]
    fn name_round_trips_for_every_language() {
        for lang in Language::all().iter().copied() {
            let parsed = Language::from_name(lang.name());
            assert_eq!(parsed, Some(lang));
        }
    }

    /// The representative extension of each language detects as that
    /// same language.
    #[test]
    fn primary_extension_resolves_back_to_the_language() {
        for lang in Language::all().iter().copied() {
            let resolved = Language::from_extension(lang.primary_extension());
            assert_eq!(
                resolved,
                Some(lang),
                "primary extension for {} does not round-trip",
                lang.name()
            );
        }
    }

    /// An explicit hint overrides whatever the extension implies.
    #[test]
    fn detect_prefers_hint_over_extension() {
        let ts_path = std::path::Path::new("foo.ts");

        let by_extension = Language::detect(ts_path, None);
        assert_eq!(by_extension, Some(Language::TypeScript));

        let by_hint = Language::detect(ts_path, Some("javascript"));
        assert_eq!(by_hint, Some(Language::JavaScript));
    }

    /// Query-driven primitives are available everywhere; rename is only
    /// advertised where an identifier-kind table exists.
    #[test]
    fn edit_primitives_are_universal_rename_is_gated() {
        for lang in Language::all().iter().copied() {
            let EditCapabilities {
                apply_node,
                insert_at_anchor,
                rename_symbol,
                ..
            } = lang.edit_capabilities();

            assert!(apply_node, "{} should support apply_node", lang.name());
            assert!(
                insert_at_anchor,
                "{} should support insert_at_anchor",
                lang.name()
            );
            assert_eq!(
                rename_symbol,
                lang.rename_identifier_kinds().is_some(),
                "{} rename capability must match its identifier-kind table",
                lang.name()
            );
        }

        // Data/markup formats edit but carry no symbol projection.
        let json_caps = Language::Json.edit_capabilities();
        assert!(!json_caps.rename_symbol);
        assert!(!json_caps.symbols);

        let rust_caps = Language::Rust.edit_capabilities();
        assert!(rust_caps.rename_symbol);
        assert!(rust_caps.symbols);
    }
}