Skip to main content

codescout/ast/
mod.rs

1//! AST engine: offline, in-process code parsing via tree-sitter.
2//!
3//! Provides symbol extraction and structural queries without requiring a
4//! running language server. Used as the primary fallback when no LSP is
5//! configured, and as a complement to LSP for fast structural analysis.
6
7pub mod parser;
8
9use anyhow::Result;
10use std::path::Path;
11
12// tree-sitter grammars — used by get_ts_language
13use tree_sitter_bash;
14use tree_sitter_css;
15use tree_sitter_go;
16use tree_sitter_html;
17use tree_sitter_java;
18use tree_sitter_kotlin_ng;
19use tree_sitter_python;
20use tree_sitter_rust;
21use tree_sitter_typescript;
22
23use crate::lsp::symbols::SymbolInfo;
24pub use parser::has_syntax_errors;
25pub use parser::DocstringInfo;
26
27/// Extract top-level symbols from a file using tree-sitter.
28///
29/// Faster than an LSP round-trip and works offline — ideal for initial
30/// indexing or when an LSP is unavailable.
31pub fn extract_symbols(path: &Path) -> Result<Vec<SymbolInfo>> {
32    let source = std::fs::read_to_string(path)?;
33    let language = detect_language(path);
34    parser::extract_symbols_from_source(&source, language, path)
35}
36
37/// Extract symbols from already-loaded source text using tree-sitter.
38///
39/// Prefer this over `extract_symbols` when the file content is already in memory
40/// to avoid a second disk read.
41pub fn extract_symbols_from_text(text: &str, path: &Path) -> Result<Vec<SymbolInfo>> {
42    let language = detect_language(path);
43    parser::extract_symbols_from_source(text, language, path)
44}
45
46/// Extract docstrings/comments from a file using tree-sitter.
47pub fn extract_docstrings(path: &Path) -> Result<Vec<DocstringInfo>> {
48    let source = std::fs::read_to_string(path)?;
49    let language = detect_language(path);
50    parser::extract_docstrings_from_source(&source, language, path)
51}
52
/// Map a path's file extension to a canonical language name.
///
/// The table deliberately covers every source-file kind we recognise, including
/// languages for which there is **no tree-sitter grammar** (e.g. C, C++, Ruby,
/// PHP, Swift). That makes this the right check for "is this a source file at
/// all?".
///
/// Whether a language additionally has AST/tree-sitter support is a separate
/// question: pass the returned name to [`get_ts_language`] and test for `Some`.
///
/// Returns `None` when the path has no extension, when the extension is not
/// valid UTF-8, or when it is not in the table. Matching is exact, so the
/// comparison is case-sensitive.
pub fn detect_language(path: &Path) -> Option<&'static str> {
    let ext = path.extension().and_then(|raw| raw.to_str())?;

    let name = match ext {
        "rs" => "rust",
        "py" => "python",
        "ts" => "typescript",
        "tsx" => "tsx",
        "js" | "cjs" | "mjs" => "javascript",
        "jsx" => "jsx",
        "go" => "go",
        "java" => "java",
        "kt" | "kts" => "kotlin",
        "c" => "c",
        "cpp" | "cc" | "cxx" => "cpp",
        "cs" => "csharp",
        "rb" => "ruby",
        "html" | "htm" => "html",
        "css" => "css",
        "scss" => "scss",
        "less" => "less",
        "php" => "php",
        "swift" => "swift",
        "scala" => "scala",
        "ex" | "exs" => "elixir",
        "hs" => "haskell",
        "lua" => "lua",
        "sh" | "bash" => "bash",
        "md" | "markdown" => "markdown",
        // Anything else is not a source file we know about.
        _ => return None,
    };

    Some(name)
}
91
92/// Map a language name to its tree-sitter grammar (case-insensitive).
93///
94/// This is the single source of truth for tree-sitter language resolution.
95/// Both the AST parser and the embedding chunker use this function.
96///
97/// JavaScript and JSX reuse the TypeScript/TSX grammars respectively —
98/// TypeScript is a superset of JavaScript so the parse trees are compatible.
99/// SCSS and Less reuse the CSS grammar.
100pub(crate) fn get_ts_language(lang: &str) -> Option<tree_sitter::Language> {
101    match lang.to_ascii_lowercase().as_str() {
102        "rust" => Some(tree_sitter_rust::LANGUAGE.into()),
103        "python" => Some(tree_sitter_python::LANGUAGE.into()),
104        "go" => Some(tree_sitter_go::LANGUAGE.into()),
105        "typescript" | "javascript" => Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()),
106        "tsx" | "jsx" => Some(tree_sitter_typescript::LANGUAGE_TSX.into()),
107        "java" => Some(tree_sitter_java::LANGUAGE.into()),
108        "kotlin" => Some(tree_sitter_kotlin_ng::LANGUAGE.into()),
109        "html" => Some(tree_sitter_html::LANGUAGE.into()),
110        "css" | "scss" | "less" => Some(tree_sitter_css::LANGUAGE.into()),
111        "bash" => Some(tree_sitter_bash::LANGUAGE.into()),
112        _ => None,
113    }
114}
115
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// `detect_language` is intentionally broader than `get_ts_language`:
    /// it identifies any source file we recognise (for LSP routing, file
    /// gating, code-fence labels, …) while AST chunking is only available
    /// for the languages where we ship a tree-sitter grammar.
    ///
    /// This test pins the contract by enumerating both sets explicitly and
    /// checking that every detected language lands in exactly one of them.
    /// Add a new extension → also update this list (and add a tree-sitter
    /// crate if you intend to give it AST support).
    #[test]
    fn detect_language_vs_get_ts_language_contract() {
        // Every extension that detect_language() recognises, aliases included.
        let detected_samples: &[(&str, &str)] = &[
            ("a.rs", "rust"),
            ("a.py", "python"),
            ("a.ts", "typescript"),
            ("a.tsx", "tsx"),
            ("a.js", "javascript"),
            ("a.cjs", "javascript"),
            ("a.mjs", "javascript"),
            ("a.jsx", "jsx"),
            ("a.go", "go"),
            ("a.java", "java"),
            ("a.kt", "kotlin"),
            ("a.kts", "kotlin"),
            ("a.c", "c"),
            ("a.cpp", "cpp"),
            ("a.cc", "cpp"),
            ("a.cxx", "cpp"),
            ("a.cs", "csharp"),
            ("a.rb", "ruby"),
            ("a.html", "html"),
            ("a.htm", "html"),
            ("a.css", "css"),
            ("a.scss", "scss"),
            ("a.less", "less"),
            ("a.php", "php"),
            ("a.swift", "swift"),
            ("a.scala", "scala"),
            ("a.ex", "elixir"),
            ("a.exs", "elixir"),
            ("a.hs", "haskell"),
            ("a.lua", "lua"),
            ("a.sh", "bash"),
            ("a.bash", "bash"),
            ("a.md", "markdown"),
            ("a.markdown", "markdown"),
        ];

        for (path, expected_lang) in detected_samples {
            let actual = detect_language(Path::new(path));
            assert_eq!(
                actual,
                Some(*expected_lang),
                "detect_language({path}) should return Some({expected_lang})"
            );
        }

        // Paths with an unknown extension, or none at all, are not source files.
        let undetected: &[&str] = &["a.unknown", "Makefile"];
        for path in undetected {
            assert_eq!(
                detect_language(Path::new(path)),
                None,
                "detect_language({path}) should return None"
            );
        }

        // Languages that DO have tree-sitter (AST) support.
        let with_ast: &[&str] = &[
            "rust",
            "python",
            "typescript",
            "javascript",
            "tsx",
            "jsx",
            "go",
            "java",
            "kotlin",
            "html",
            "css",
            "scss",
            "less",
            "bash",
        ];
        for lang in with_ast {
            assert!(
                get_ts_language(lang).is_some(),
                "expected AST support for {lang}"
            );
        }

        // Languages we detect as source files but do NOT chunk via AST.
        // These fall back to plain-text chunking in the embedding pipeline,
        // and tools that require AST (e.g. structural editing) refuse them
        // with a clear error. Adding a tree-sitter crate moves an entry
        // from this list into `with_ast`.
        let without_ast: &[&str] = &[
            "c", "cpp", "csharp", "ruby", "php", "swift", "scala", "elixir", "haskell", "lua",
            "markdown",
        ];
        for lang in without_ast {
            assert!(
                get_ts_language(lang).is_none(),
                "{lang} unexpectedly has AST support — move it to with_ast"
            );
        }

        // The two lists above must together cover every language that
        // detect_language() can return; otherwise a newly added extension
        // could slip in without anyone deciding whether it has AST support.
        for (path, lang) in detected_samples {
            assert!(
                with_ast.contains(lang) || without_ast.contains(lang),
                "{lang} (detected from {path}) is missing from both with_ast and without_ast"
            );
        }
    }

    /// `get_ts_language` documents case-insensitive matching; make sure mixed
    /// and upper-case names resolve the same way as their lower-case form.
    #[test]
    fn get_ts_language_is_case_insensitive() {
        let variants: &[&str] = &["Rust", "RUST", "TypeScript", "JSX"];
        for name in variants {
            assert!(
                get_ts_language(name).is_some(),
                "expected case-insensitive match for {name}"
            );
        }
        assert!(get_ts_language("not-a-language").is_none());
        assert!(get_ts_language("").is_none());
    }
}