Skip to main content

harn_hostlib/ast/
language.rs

//! Tree-sitter language registry.
//!
//! The set of languages, their canonical names, and their file extensions
//! form the hostlib AST wire contract. Adding or dropping a language
//! requires coordinated schema, fixture, and host-bridge updates.
6
use tree_sitter::Language as TsLanguage;
8
/// Languages with tree-sitter symbol extraction support.
///
/// The string returned by [`Language::name`] is the canonical wire name;
/// callers (and the JSON schemas) refer to languages by that string.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// TypeScript; wire name `typescript`.
    TypeScript,
    /// TSX; wire name `tsx`.
    Tsx,
    /// JavaScript; wire name `javascript`.
    JavaScript,
    /// JSX; wire name `jsx`.
    Jsx,
    /// Python; wire name `python`.
    Python,
    /// Go; wire name `go`.
    Go,
    /// Rust; wire name `rust`.
    Rust,
    /// Java; wire name `java`.
    Java,
    /// C; wire name `c`.
    C,
    /// C++; wire name `cpp`.
    Cpp,
    /// C#; wire name `csharp`.
    CSharp,
    /// Ruby; wire name `ruby`.
    Ruby,
    /// Kotlin; wire name `kotlin`.
    Kotlin,
    /// PHP; wire name `php`.
    Php,
    /// Scala; wire name `scala`.
    Scala,
    /// Bash; wire name `bash`.
    Bash,
    /// Swift; wire name `swift`.
    Swift,
    /// Zig; wire name `zig`.
    Zig,
    /// Elixir; wire name `elixir`.
    Elixir,
    /// Lua; wire name `lua`.
    Lua,
    /// Haskell; wire name `haskell`.
    Haskell,
    /// R; wire name `r`.
    R,
}
39
40impl Language {
41    /// Canonical wire name.
42    pub fn name(self) -> &'static str {
43        match self {
44            Language::TypeScript => "typescript",
45            Language::Tsx => "tsx",
46            Language::JavaScript => "javascript",
47            Language::Jsx => "jsx",
48            Language::Python => "python",
49            Language::Go => "go",
50            Language::Rust => "rust",
51            Language::Java => "java",
52            Language::C => "c",
53            Language::Cpp => "cpp",
54            Language::CSharp => "csharp",
55            Language::Ruby => "ruby",
56            Language::Kotlin => "kotlin",
57            Language::Php => "php",
58            Language::Scala => "scala",
59            Language::Bash => "bash",
60            Language::Swift => "swift",
61            Language::Zig => "zig",
62            Language::Elixir => "elixir",
63            Language::Lua => "lua",
64            Language::Haskell => "haskell",
65            Language::R => "r",
66        }
67    }
68
69    /// Tree-sitter grammar handle. Cheap; the underlying `LANGUAGE`
70    /// constants are static.
71    pub fn ts_language(self) -> TsLanguage {
72        match self {
73            Language::TypeScript => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
74            Language::Tsx => tree_sitter_typescript::LANGUAGE_TSX.into(),
75            Language::JavaScript | Language::Jsx => tree_sitter_javascript::LANGUAGE.into(),
76            Language::Python => tree_sitter_python::LANGUAGE.into(),
77            Language::Go => tree_sitter_go::LANGUAGE.into(),
78            Language::Rust => tree_sitter_rust::LANGUAGE.into(),
79            Language::Java => tree_sitter_java::LANGUAGE.into(),
80            Language::C => tree_sitter_c::LANGUAGE.into(),
81            Language::Cpp => tree_sitter_cpp::LANGUAGE.into(),
82            Language::CSharp => tree_sitter_c_sharp::LANGUAGE.into(),
83            Language::Ruby => tree_sitter_ruby::LANGUAGE.into(),
84            Language::Kotlin => tree_sitter_kotlin_ng::LANGUAGE.into(),
85            Language::Php => tree_sitter_php::LANGUAGE_PHP.into(),
86            Language::Scala => tree_sitter_scala::LANGUAGE.into(),
87            Language::Bash => tree_sitter_bash::LANGUAGE.into(),
88            Language::Swift => tree_sitter_swift::LANGUAGE.into(),
89            Language::Zig => tree_sitter_zig::LANGUAGE.into(),
90            Language::Elixir => tree_sitter_elixir::LANGUAGE.into(),
91            Language::Lua => tree_sitter_lua::LANGUAGE.into(),
92            Language::Haskell => tree_sitter_haskell::LANGUAGE.into(),
93            Language::R => tree_sitter_r::LANGUAGE.into(),
94        }
95    }
96
97    /// Resolve a language from its canonical wire name. Accepts a few
98    /// historical aliases (`ts`, `js`, `c++`, …) so users don't have to
99    /// memorize the exact spelling.
100    pub fn from_name(name: &str) -> Option<Self> {
101        let normalized = name.trim().to_ascii_lowercase();
102        Some(match normalized.as_str() {
103            "typescript" | "ts" => Language::TypeScript,
104            "tsx" => Language::Tsx,
105            "javascript" | "js" => Language::JavaScript,
106            "jsx" => Language::Jsx,
107            "python" | "py" => Language::Python,
108            "go" | "golang" => Language::Go,
109            "rust" | "rs" => Language::Rust,
110            "java" => Language::Java,
111            "c" => Language::C,
112            "cpp" | "c++" | "cxx" => Language::Cpp,
113            "csharp" | "c#" | "cs" => Language::CSharp,
114            "ruby" | "rb" => Language::Ruby,
115            "kotlin" | "kt" => Language::Kotlin,
116            "php" => Language::Php,
117            "scala" => Language::Scala,
118            "bash" | "shell" | "sh" | "zsh" => Language::Bash,
119            "swift" => Language::Swift,
120            "zig" => Language::Zig,
121            "elixir" | "ex" => Language::Elixir,
122            "lua" => Language::Lua,
123            "haskell" | "hs" => Language::Haskell,
124            "r" => Language::R,
125            _ => return None,
126        })
127    }
128
129    /// Resolve a language from a file extension.
130    pub fn from_extension(ext: &str) -> Option<Self> {
131        let normalized = ext.trim_start_matches('.').to_ascii_lowercase();
132        Some(match normalized.as_str() {
133            "ts" => Language::TypeScript,
134            "tsx" => Language::Tsx,
135            "js" | "mjs" | "cjs" => Language::JavaScript,
136            "jsx" => Language::Jsx,
137            "py" => Language::Python,
138            "go" => Language::Go,
139            "rs" => Language::Rust,
140            "java" => Language::Java,
141            "c" | "h" => Language::C,
142            "cpp" | "cc" | "cxx" | "hpp" | "hxx" | "hh" => Language::Cpp,
143            "cs" | "csx" => Language::CSharp,
144            "rb" => Language::Ruby,
145            "kt" | "kts" => Language::Kotlin,
146            "php" => Language::Php,
147            "scala" | "sc" => Language::Scala,
148            "sh" | "bash" | "zsh" => Language::Bash,
149            "swift" => Language::Swift,
150            "zig" | "zon" => Language::Zig,
151            "ex" | "exs" => Language::Elixir,
152            "lua" => Language::Lua,
153            "hs" | "lhs" => Language::Haskell,
154            "r" => Language::R,
155            _ => return None,
156        })
157    }
158
159    /// Resolve from a file path: prefer explicit `language_hint` if
160    /// supplied, otherwise fall back to extension-based detection.
161    pub fn detect(path: &std::path::Path, language_hint: Option<&str>) -> Option<Self> {
162        if let Some(name) = language_hint.and_then(|s| (!s.is_empty()).then_some(s)) {
163            return Self::from_name(name);
164        }
165        let ext = path.extension().and_then(|s| s.to_str())?;
166        Self::from_extension(ext)
167    }
168
169    /// Every language we ship support for. Useful for tests + introspection.
170    pub fn all() -> &'static [Language] {
171        &[
172            Language::TypeScript,
173            Language::Tsx,
174            Language::JavaScript,
175            Language::Jsx,
176            Language::Python,
177            Language::Go,
178            Language::Rust,
179            Language::Java,
180            Language::C,
181            Language::Cpp,
182            Language::CSharp,
183            Language::Ruby,
184            Language::Kotlin,
185            Language::Php,
186            Language::Scala,
187            Language::Bash,
188            Language::Swift,
189            Language::Zig,
190            Language::Elixir,
191            Language::Lua,
192            Language::Haskell,
193            Language::R,
194        ]
195    }
196}
197
#[cfg(test)]
mod tests {
    use super::*;

    /// Each registered grammar must construct without panicking and expose
    /// at least one node kind.
    #[test]
    fn every_language_is_loadable() {
        for &lang in Language::all() {
            let ts = lang.ts_language();
            assert!(ts.node_kind_count() > 0, "{} grammar is empty", lang.name());
        }
    }

    /// `all()` feeds the other exhaustive tests, so it must not repeat a
    /// variant.
    #[test]
    fn all_lists_each_language_once() {
        let unique: std::collections::HashSet<Language> =
            Language::all().iter().copied().collect();
        assert_eq!(unique.len(), Language::all().len());
    }

    #[test]
    fn extension_detection_round_trips_canonical_extensions() {
        let cases: &[(&str, Language)] = &[
            ("ts", Language::TypeScript),
            ("tsx", Language::Tsx),
            ("js", Language::JavaScript),
            ("jsx", Language::Jsx),
            ("py", Language::Python),
            ("rs", Language::Rust),
            ("go", Language::Go),
            ("java", Language::Java),
            ("c", Language::C),
            ("cpp", Language::Cpp),
            ("cs", Language::CSharp),
            ("rb", Language::Ruby),
            ("kt", Language::Kotlin),
            ("php", Language::Php),
            ("scala", Language::Scala),
            ("sh", Language::Bash),
            ("swift", Language::Swift),
            ("zig", Language::Zig),
            ("ex", Language::Elixir),
            ("lua", Language::Lua),
            ("hs", Language::Haskell),
            ("r", Language::R),
        ];
        for (ext, want) in cases {
            assert_eq!(Language::from_extension(ext), Some(*want), "ext {ext}");
        }
    }

    /// Leading dots and ASCII case are normalized away; unknown extensions
    /// resolve to nothing.
    #[test]
    fn extension_detection_normalizes_input() {
        assert_eq!(Language::from_extension(".RS"), Some(Language::Rust));
        assert_eq!(Language::from_extension("HPP"), Some(Language::Cpp));
        assert_eq!(Language::from_extension("txt"), None);
    }

    #[test]
    fn name_round_trips_for_every_language() {
        for &lang in Language::all() {
            assert_eq!(Language::from_name(lang.name()), Some(lang));
        }
    }

    /// Aliases resolve, matching ignores case and surrounding whitespace,
    /// and unknown names are rejected.
    #[test]
    fn name_lookup_accepts_aliases_and_rejects_unknowns() {
        assert_eq!(Language::from_name("  TypeScript "), Some(Language::TypeScript));
        assert_eq!(Language::from_name("C++"), Some(Language::Cpp));
        assert_eq!(Language::from_name("golang"), Some(Language::Go));
        assert_eq!(Language::from_name("klingon"), None);
        assert_eq!(Language::from_name(""), None);
    }

    #[test]
    fn detect_prefers_hint_over_extension() {
        let path = std::path::Path::new("foo.ts");
        assert_eq!(Language::detect(path, None), Some(Language::TypeScript));
        assert_eq!(
            Language::detect(path, Some("javascript")),
            Some(Language::JavaScript)
        );
    }

    /// An empty hint is ignored, an unrecognized hint is not silently
    /// replaced by the extension, and paths without a known extension
    /// resolve to nothing.
    #[test]
    fn detect_handles_missing_and_unknown_inputs() {
        let path = std::path::Path::new("foo.ts");
        assert_eq!(Language::detect(path, Some("")), Some(Language::TypeScript));
        assert_eq!(Language::detect(path, Some("klingon")), None);
        assert_eq!(Language::detect(std::path::Path::new("Makefile"), None), None);
        assert_eq!(Language::detect(std::path::Path::new("notes.txt"), None), None);
    }
}