Skip to main content

harn_hostlib/ast/
language.rs

1//! Tree-sitter language registry.
2//!
3//! The set of languages, their canonical names, and their file extensions
4//! form the hostlib AST wire contract. Adding or dropping a language
5//! requires coordinated schema, fixture, and host-bridge updates.
6//!
7//! ## Per-language onboarding contract (B.7)
8//!
9//! Each [`Language`] variant carries the full adapter contract on the enum
10//! itself — there is no separate `LanguageAdapter` object to keep in sync:
11//!
12//! 1. **grammar binding** — [`Language::ts_language`]
13//! 2. **wire name + aliases** — [`Language::name`] / [`Language::from_name`]
14//! 3. **extension detection** — [`Language::from_extension`]
15//! 4. **symbol-graph projection** (drives `rename_symbol`) —
16//!    [`Language::rename_identifier_kinds`]
17//! 5. **symbol/outline extraction** — `ast::symbols::extract`
18//! 6. **test fixture** — `tests/fixtures/ast/<name>/`
19//!
//! Format-preserving span replacement and trivia/indentation handling are
//! grammar-agnostic (byte-span splice + inferred indent), so they need no
//! per-language code. As a result, adding a language is a bounded ticket:
//! register the grammar, add the four mapping arms (`name`, `from_name`,
//! `from_extension`, `primary_extension`), list the variant in
//! `Language::all`, drop in a fixture, and (optionally) add an
//! identifier-kind table for rename support.
25
26use tree_sitter::Language as TsLanguage;
27
/// Languages with tree-sitter grammar support.
///
/// The string returned by [`Language::name`] is the canonical wire name;
/// callers (and the JSON schemas) refer to languages by that string. The
/// trailing group (`Json`..`Markdown`) are data/markup/config grammars:
/// they support the query-driven edit primitives but have no symbol-graph
/// projection (see [`Language::edit_capabilities`]).
///
/// Every variant is documented individually, so no `missing_docs`
/// suppression is needed on the enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// Harn (wire name `harn`).
    Harn,
    /// TypeScript (wire name `typescript`).
    TypeScript,
    /// TSX (wire name `tsx`).
    Tsx,
    /// JavaScript (wire name `javascript`).
    JavaScript,
    /// JSX (wire name `jsx`); parsed with the JavaScript grammar.
    Jsx,
    /// Python (wire name `python`).
    Python,
    /// Go (wire name `go`).
    Go,
    /// Rust (wire name `rust`).
    Rust,
    /// Java (wire name `java`).
    Java,
    /// C (wire name `c`).
    C,
    /// C++ (wire name `cpp`).
    Cpp,
    /// C# (wire name `csharp`).
    CSharp,
    /// Ruby (wire name `ruby`).
    Ruby,
    /// Kotlin (wire name `kotlin`).
    Kotlin,
    /// PHP (wire name `php`).
    Php,
    /// Scala (wire name `scala`).
    Scala,
    /// Bash (wire name `bash`); also the target of the `sh`, `zsh`, and
    /// `shell` aliases.
    Bash,
    /// Swift (wire name `swift`).
    Swift,
    /// Zig (wire name `zig`).
    Zig,
    /// Elixir (wire name `elixir`).
    Elixir,
    /// Lua (wire name `lua`).
    Lua,
    /// Haskell (wire name `haskell`).
    Haskell,
    /// R (wire name `r`).
    R,

    // Data / markup / config grammars: editable, but with no symbol-graph
    // projection.
    /// JSON (wire name `json`).
    Json,
    /// YAML (wire name `yaml`).
    Yaml,
    /// TOML (wire name `toml`).
    Toml,
    /// CSS (wire name `css`).
    Css,
    /// HTML (wire name `html`).
    Html,
    /// SQL (wire name `sql`).
    Sql,
    /// Markdown (wire name `markdown`).
    Markdown,
}
69
/// Suggestion text naming the text-level fallback the agent loop should
/// reach for when an AST-precise edit is unavailable for a file.
///
/// Surfaced verbatim as the `fallback_suggestion` field on every
/// `unsupported_*` edit response so the loop can degrade gracefully
/// without per-call branching. Because the string is emitted as-is, any
/// change to its wording changes the response payload.
pub const TEXT_PATCH_FALLBACK: &str =
    "fall back to a text-level edit (std/edit `edit_safe_text_patch`)";
76
/// Which AST-precise edit primitives are available for a language.
///
/// Built by [`Language::edit_capabilities`].
///
/// `apply_node` and `insert_at_anchor` are query-driven and work against
/// any registered tree-sitter grammar, so they are always `true`.
/// `rename_symbol` needs a per-language identifier-kind projection (see
/// [`Language::rename_identifier_kinds`]); `symbols`/`outline` need a
/// per-language extractor (see `ast::symbols`). The matrix is the
/// onboarding contract: it tells the agent loop which primitive to reach
/// for and is rendered into the capability-matrix docs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EditCapabilities {
    /// Tree-sitter query → format-preserving replace.
    pub apply_node: bool,
    /// Anchored sibling/child insertion.
    pub insert_at_anchor: bool,
    /// Cross-file safe rename via the symbol graph. Mirrors
    /// [`Language::supports_rename`].
    pub rename_symbol: bool,
    /// Symbol + outline extraction. Mirrors
    /// [`Language::supports_symbol_extraction`].
    pub symbols: bool,
}
97
98impl Language {
99    /// Canonical wire name.
100    pub fn name(self) -> &'static str {
101        match self {
102            Language::Harn => "harn",
103            Language::TypeScript => "typescript",
104            Language::Tsx => "tsx",
105            Language::JavaScript => "javascript",
106            Language::Jsx => "jsx",
107            Language::Python => "python",
108            Language::Go => "go",
109            Language::Rust => "rust",
110            Language::Java => "java",
111            Language::C => "c",
112            Language::Cpp => "cpp",
113            Language::CSharp => "csharp",
114            Language::Ruby => "ruby",
115            Language::Kotlin => "kotlin",
116            Language::Php => "php",
117            Language::Scala => "scala",
118            Language::Bash => "bash",
119            Language::Swift => "swift",
120            Language::Zig => "zig",
121            Language::Elixir => "elixir",
122            Language::Lua => "lua",
123            Language::Haskell => "haskell",
124            Language::R => "r",
125            Language::Json => "json",
126            Language::Yaml => "yaml",
127            Language::Toml => "toml",
128            Language::Css => "css",
129            Language::Html => "html",
130            Language::Sql => "sql",
131            Language::Markdown => "markdown",
132        }
133    }
134
135    /// Tree-sitter grammar handle, or `None` when this build was not
136    /// compiled with the grammar family that backs `self`.
137    ///
138    /// Each arm is gated on its `grammar-*` family feature, so a trimmed
139    /// build only links the grammars it asked for. The `name`/extension/
140    /// detection metadata above stays complete regardless of features — a
141    /// lean build still *recognizes* a `.py` file, it just returns `None`
142    /// here and the edit primitives degrade to the text fallback. The full
143    /// (default) build enables every family, so `None` never occurs there.
144    /// Cheap when present; the underlying `LANGUAGE` constants are static.
145    pub fn ts_language(self) -> Option<TsLanguage> {
146        Some(match self {
147            #[cfg(feature = "grammar-harn")]
148            Language::Harn => tree_sitter_harn::LANGUAGE.into(),
149
150            #[cfg(feature = "grammar-web")]
151            Language::TypeScript => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
152            #[cfg(feature = "grammar-web")]
153            Language::Tsx => tree_sitter_typescript::LANGUAGE_TSX.into(),
154            #[cfg(feature = "grammar-web")]
155            Language::JavaScript | Language::Jsx => tree_sitter_javascript::LANGUAGE.into(),
156            #[cfg(feature = "grammar-web")]
157            Language::Html => tree_sitter_html::LANGUAGE.into(),
158            #[cfg(feature = "grammar-web")]
159            Language::Css => tree_sitter_css::LANGUAGE.into(),
160
161            #[cfg(feature = "grammar-systems")]
162            Language::Rust => tree_sitter_rust::LANGUAGE.into(),
163            #[cfg(feature = "grammar-systems")]
164            Language::C => tree_sitter_c::LANGUAGE.into(),
165            #[cfg(feature = "grammar-systems")]
166            Language::Cpp => tree_sitter_cpp::LANGUAGE.into(),
167            #[cfg(feature = "grammar-systems")]
168            Language::Go => tree_sitter_go::LANGUAGE.into(),
169            #[cfg(feature = "grammar-systems")]
170            Language::Zig => tree_sitter_zig::LANGUAGE.into(),
171
172            #[cfg(feature = "grammar-scripting")]
173            Language::Python => tree_sitter_python::LANGUAGE.into(),
174            #[cfg(feature = "grammar-scripting")]
175            Language::Ruby => tree_sitter_ruby::LANGUAGE.into(),
176            #[cfg(feature = "grammar-scripting")]
177            Language::Bash => tree_sitter_bash::LANGUAGE.into(),
178            #[cfg(feature = "grammar-scripting")]
179            Language::Lua => tree_sitter_lua::LANGUAGE.into(),
180            #[cfg(feature = "grammar-scripting")]
181            Language::Php => tree_sitter_php::LANGUAGE_PHP.into(),
182            #[cfg(feature = "grammar-scripting")]
183            Language::R => tree_sitter_r::LANGUAGE.into(),
184
185            #[cfg(feature = "grammar-jvm")]
186            Language::Java => tree_sitter_java::LANGUAGE.into(),
187            #[cfg(feature = "grammar-jvm")]
188            Language::Kotlin => tree_sitter_kotlin_ng::LANGUAGE.into(),
189            #[cfg(feature = "grammar-jvm")]
190            Language::Scala => tree_sitter_scala::LANGUAGE.into(),
191
192            #[cfg(feature = "grammar-enterprise")]
193            Language::CSharp => tree_sitter_c_sharp::LANGUAGE.into(),
194            #[cfg(feature = "grammar-enterprise")]
195            Language::Swift => tree_sitter_swift::LANGUAGE.into(),
196            #[cfg(feature = "grammar-enterprise")]
197            Language::Elixir => tree_sitter_elixir::LANGUAGE.into(),
198            #[cfg(feature = "grammar-enterprise")]
199            Language::Haskell => tree_sitter_haskell::LANGUAGE.into(),
200
201            #[cfg(feature = "grammar-data")]
202            Language::Json => tree_sitter_json::LANGUAGE.into(),
203            #[cfg(feature = "grammar-data")]
204            Language::Yaml => tree_sitter_yaml::LANGUAGE.into(),
205            #[cfg(feature = "grammar-data")]
206            Language::Toml => tree_sitter_toml_ng::LANGUAGE.into(),
207            #[cfg(feature = "grammar-data")]
208            Language::Sql => tree_sitter_sequel::LANGUAGE.into(),
209            // tree-sitter-md ships a split block/inline grammar; the block
210            // grammar is the structural tree the edit primitives operate
211            // on (headings, lists, fenced code, …).
212            #[cfg(feature = "grammar-data")]
213            Language::Markdown => tree_sitter_md::LANGUAGE.into(),
214
215            // Any language whose family was not compiled into this build.
216            // Unreachable under the default (all-families) build.
217            #[allow(unreachable_patterns)]
218            _ => return None,
219        })
220    }
221
222    /// Resolve a language from its canonical wire name. Accepts a few
223    /// historical aliases (`ts`, `js`, `c++`, …) so users don't have to
224    /// memorize the exact spelling.
225    pub fn from_name(name: &str) -> Option<Self> {
226        let normalized = name.trim().to_ascii_lowercase();
227        Some(match normalized.as_str() {
228            "harn" => Language::Harn,
229            "typescript" | "ts" => Language::TypeScript,
230            "tsx" => Language::Tsx,
231            "javascript" | "js" => Language::JavaScript,
232            "jsx" => Language::Jsx,
233            "python" | "py" => Language::Python,
234            "go" | "golang" => Language::Go,
235            "rust" | "rs" => Language::Rust,
236            "java" => Language::Java,
237            "c" => Language::C,
238            "cpp" | "c++" | "cxx" => Language::Cpp,
239            "csharp" | "c#" | "cs" => Language::CSharp,
240            "ruby" | "rb" => Language::Ruby,
241            "kotlin" | "kt" => Language::Kotlin,
242            "php" => Language::Php,
243            "scala" => Language::Scala,
244            "bash" | "shell" | "sh" | "zsh" => Language::Bash,
245            "swift" => Language::Swift,
246            "zig" => Language::Zig,
247            "elixir" | "ex" => Language::Elixir,
248            "lua" => Language::Lua,
249            "haskell" | "hs" => Language::Haskell,
250            "r" => Language::R,
251            "json" => Language::Json,
252            "yaml" | "yml" => Language::Yaml,
253            "toml" => Language::Toml,
254            "css" => Language::Css,
255            "html" | "htm" => Language::Html,
256            "sql" => Language::Sql,
257            "markdown" | "md" => Language::Markdown,
258            _ => return None,
259        })
260    }
261
262    /// Resolve a language from a file extension.
263    pub fn from_extension(ext: &str) -> Option<Self> {
264        let normalized = ext.trim_start_matches('.').to_ascii_lowercase();
265        Some(match normalized.as_str() {
266            "harn" => Language::Harn,
267            "ts" => Language::TypeScript,
268            "tsx" => Language::Tsx,
269            "js" | "mjs" | "cjs" => Language::JavaScript,
270            "jsx" => Language::Jsx,
271            "py" => Language::Python,
272            "go" => Language::Go,
273            "rs" => Language::Rust,
274            "java" => Language::Java,
275            "c" | "h" => Language::C,
276            "cpp" | "cc" | "cxx" | "hpp" | "hxx" | "hh" => Language::Cpp,
277            "cs" | "csx" => Language::CSharp,
278            "rb" => Language::Ruby,
279            "kt" | "kts" => Language::Kotlin,
280            "php" => Language::Php,
281            "scala" | "sc" => Language::Scala,
282            "sh" | "bash" | "zsh" => Language::Bash,
283            "swift" => Language::Swift,
284            "zig" | "zon" => Language::Zig,
285            "ex" | "exs" => Language::Elixir,
286            "lua" => Language::Lua,
287            "hs" | "lhs" => Language::Haskell,
288            "r" => Language::R,
289            "json" => Language::Json,
290            "yaml" | "yml" => Language::Yaml,
291            "toml" => Language::Toml,
292            "css" => Language::Css,
293            "html" | "htm" => Language::Html,
294            "sql" => Language::Sql,
295            "md" | "markdown" => Language::Markdown,
296            _ => return None,
297        })
298    }
299
300    /// Resolve from a file path: prefer explicit `language_hint` if
301    /// supplied, otherwise fall back to extension-based detection.
302    pub fn detect(path: &std::path::Path, language_hint: Option<&str>) -> Option<Self> {
303        if let Some(name) = language_hint.and_then(|s| (!s.is_empty()).then_some(s)) {
304            return Self::from_name(name);
305        }
306        let ext = path.extension().and_then(|s| s.to_str())?;
307        Self::from_extension(ext)
308    }
309
310    /// A representative file extension for the language (no leading dot).
311    /// Used by docs and the onboarding probe; not necessarily the only
312    /// extension [`Language::from_extension`] accepts.
313    pub fn primary_extension(self) -> &'static str {
314        match self {
315            Language::Harn => "harn",
316            Language::TypeScript => "ts",
317            Language::Tsx => "tsx",
318            Language::JavaScript => "js",
319            Language::Jsx => "jsx",
320            Language::Python => "py",
321            Language::Go => "go",
322            Language::Rust => "rs",
323            Language::Java => "java",
324            Language::C => "c",
325            Language::Cpp => "cpp",
326            Language::CSharp => "cs",
327            Language::Ruby => "rb",
328            Language::Kotlin => "kt",
329            Language::Php => "php",
330            Language::Scala => "scala",
331            Language::Bash => "sh",
332            Language::Swift => "swift",
333            Language::Zig => "zig",
334            Language::Elixir => "ex",
335            Language::Lua => "lua",
336            Language::Haskell => "hs",
337            Language::R => "r",
338            Language::Json => "json",
339            Language::Yaml => "yaml",
340            Language::Toml => "toml",
341            Language::Css => "css",
342            Language::Html => "html",
343            Language::Sql => "sql",
344            Language::Markdown => "md",
345        }
346    }
347
348    /// Per-language allow-list of tree-sitter node kinds that represent an
349    /// identifier token bound to a name (variables, functions, types,
350    /// fields). This is the symbol-graph projection that drives
351    /// `rename_symbol`: anything not in this table is treated as a literal
352    /// or punctuation node and left alone, which keeps a rename out of
353    /// comments and string bodies even though those *contain* identifier
354    /// substrings. `None` means the language has no rename projection yet.
355    pub fn rename_identifier_kinds(self) -> Option<&'static [&'static str]> {
356        Some(match self {
357            Language::Harn => &["identifier"],
358            Language::Rust => &[
359                "identifier",
360                "type_identifier",
361                "field_identifier",
362                "shorthand_field_identifier",
363            ],
364            Language::TypeScript | Language::Tsx => &[
365                "identifier",
366                "type_identifier",
367                "property_identifier",
368                "shorthand_property_identifier",
369                "shorthand_property_identifier_pattern",
370            ],
371            Language::JavaScript | Language::Jsx => &[
372                "identifier",
373                "property_identifier",
374                "shorthand_property_identifier",
375                "shorthand_property_identifier_pattern",
376            ],
377            Language::Python => &["identifier"],
378            Language::Go => &[
379                "identifier",
380                "type_identifier",
381                "field_identifier",
382                "package_identifier",
383            ],
384            Language::Swift => &["simple_identifier", "type_identifier"],
385            _ => return None,
386        })
387    }
388
389    /// Whether `rename_symbol` can operate on this language (i.e. it has a
390    /// [`Language::rename_identifier_kinds`] projection).
391    pub fn supports_rename(self) -> bool {
392        self.rename_identifier_kinds().is_some()
393    }
394
395    /// Data / markup / config grammars that carry no nameable symbols, so
396    /// symbol + outline extraction is intentionally empty for them.
397    fn is_data_format(self) -> bool {
398        matches!(
399            self,
400            Language::Json
401                | Language::Yaml
402                | Language::Toml
403                | Language::Css
404                | Language::Html
405                | Language::Sql
406                | Language::Markdown
407        )
408    }
409
410    /// Whether `symbols`/`outline` produce meaningful results. Data/markup
411    /// grammars parse and edit fine but expose no symbol projection.
412    pub fn supports_symbol_extraction(self) -> bool {
413        !self.is_data_format()
414    }
415
416    /// The AST-precise edit capability matrix for this language. See
417    /// [`EditCapabilities`].
418    pub fn edit_capabilities(self) -> EditCapabilities {
419        EditCapabilities {
420            apply_node: true,
421            insert_at_anchor: true,
422            rename_symbol: self.supports_rename(),
423            symbols: self.supports_symbol_extraction(),
424        }
425    }
426
427    /// Every language we ship support for. Useful for tests + introspection.
428    pub fn all() -> &'static [Language] {
429        &[
430            Language::Harn,
431            Language::TypeScript,
432            Language::Tsx,
433            Language::JavaScript,
434            Language::Jsx,
435            Language::Python,
436            Language::Go,
437            Language::Rust,
438            Language::Java,
439            Language::C,
440            Language::Cpp,
441            Language::CSharp,
442            Language::Ruby,
443            Language::Kotlin,
444            Language::Php,
445            Language::Scala,
446            Language::Bash,
447            Language::Swift,
448            Language::Zig,
449            Language::Elixir,
450            Language::Lua,
451            Language::Haskell,
452            Language::R,
453            Language::Json,
454            Language::Yaml,
455            Language::Toml,
456            Language::Css,
457            Language::Html,
458            Language::Sql,
459            Language::Markdown,
460        ]
461    }
462}
463
464#[cfg(test)]
465mod tests {
466    use super::*;
467
468    // Only the all-families (default) build links every grammar; under a
469    // trimmed grammar set some languages intentionally resolve to `None`.
470    #[cfg(feature = "grammars-all")]
471    #[test]
472    fn every_language_is_loadable() {
473        for &lang in Language::all() {
474            // Constructing the tree-sitter Language must not panic and must
475            // produce a non-trivial grammar.
476            let ts = lang
477                .ts_language()
478                .unwrap_or_else(|| panic!("{} grammar not compiled", lang.name()));
479            assert!(ts.node_kind_count() > 0, "{} grammar is empty", lang.name());
480        }
481    }
482
483    #[test]
484    fn extension_detection_round_trips_canonical_extensions() {
485        let cases: &[(&str, Language)] = &[
486            ("harn", Language::Harn),
487            ("ts", Language::TypeScript),
488            ("tsx", Language::Tsx),
489            ("js", Language::JavaScript),
490            ("jsx", Language::Jsx),
491            ("py", Language::Python),
492            ("rs", Language::Rust),
493            ("go", Language::Go),
494            ("java", Language::Java),
495            ("c", Language::C),
496            ("cpp", Language::Cpp),
497            ("cs", Language::CSharp),
498            ("rb", Language::Ruby),
499            ("kt", Language::Kotlin),
500            ("php", Language::Php),
501            ("scala", Language::Scala),
502            ("sh", Language::Bash),
503            ("swift", Language::Swift),
504            ("zig", Language::Zig),
505            ("ex", Language::Elixir),
506            ("lua", Language::Lua),
507            ("hs", Language::Haskell),
508            ("r", Language::R),
509            ("json", Language::Json),
510            ("yaml", Language::Yaml),
511            ("yml", Language::Yaml),
512            ("toml", Language::Toml),
513            ("css", Language::Css),
514            ("html", Language::Html),
515            ("sql", Language::Sql),
516            ("md", Language::Markdown),
517        ];
518        for (ext, want) in cases {
519            assert_eq!(Language::from_extension(ext), Some(*want), "ext {ext}");
520        }
521    }
522
523    #[test]
524    fn name_round_trips_for_every_language() {
525        for &lang in Language::all() {
526            assert_eq!(Language::from_name(lang.name()), Some(lang));
527        }
528    }
529
530    #[test]
531    fn primary_extension_resolves_back_to_the_language() {
532        for &lang in Language::all() {
533            assert_eq!(
534                Language::from_extension(lang.primary_extension()),
535                Some(lang),
536                "primary extension for {} does not round-trip",
537                lang.name()
538            );
539        }
540    }
541
542    #[test]
543    fn detect_prefers_hint_over_extension() {
544        let path = std::path::Path::new("foo.ts");
545        assert_eq!(Language::detect(path, None), Some(Language::TypeScript));
546        assert_eq!(
547            Language::detect(path, Some("javascript")),
548            Some(Language::JavaScript)
549        );
550    }
551
552    #[test]
553    fn edit_primitives_are_universal_rename_is_gated() {
554        for &lang in Language::all() {
555            let caps = lang.edit_capabilities();
556            assert!(caps.apply_node, "{} should support apply_node", lang.name());
557            assert!(
558                caps.insert_at_anchor,
559                "{} should support insert_at_anchor",
560                lang.name()
561            );
562            assert_eq!(
563                caps.rename_symbol,
564                lang.rename_identifier_kinds().is_some(),
565                "{} rename capability must match its identifier-kind table",
566                lang.name()
567            );
568        }
569        // Data/markup formats edit but carry no symbol projection.
570        assert!(!Language::Json.edit_capabilities().rename_symbol);
571        assert!(!Language::Json.edit_capabilities().symbols);
572        assert!(Language::Rust.edit_capabilities().rename_symbol);
573        assert!(Language::Rust.edit_capabilities().symbols);
574    }
575}