panproto_parse/scope_detector.rs
1//! Grammar-driven named-scope detection via tree-sitter `tags.scm` queries.
2//!
3//! Every tree-sitter grammar ships a `queries/tags.scm` file, consumed by
4//! GitHub code navigation, Helix, and the `tree-sitter tags` CLI. The capture
5//! vocabulary is standardized:
6//!
7//! - `@definition.function` / `@definition.method` / `@definition.class` /
8//! `@definition.module` / `@definition.interface` / `@definition.type` /
9//! `@definition.macro` (and more): the scope node
10//! - `@name`: the identifier within the scope node
11//!
12//! This module wraps `tree-sitter-tags` to produce a uniform [`NamedScope`]
13//! view of source code across all 248 supported languages. The walker uses
14//! the resulting scope map to drive named-scope detection without any
15//! hardcoded node-kind lists.
16
17use std::ops::Range;
18
19use tree_sitter::Language;
20use tree_sitter_tags::{TagsConfiguration, TagsContext};
21
22use crate::error::ParseError;
23
24/// A named scope discovered by the tags query.
25///
26/// Represents whatever the grammar's `tags.scm` labels with an
27/// `@definition.*` capture paired with `@name`: functions, classes,
28/// methods, modules, types, interfaces, macros, or custom definitions.
29#[derive(Debug, Clone)]
30pub struct NamedScope {
31 /// Byte range of the scope node (e.g. the whole `fn foo() { ... }`).
32 pub node_range: Range<usize>,
33 /// Byte range of the name identifier inside the scope.
34 pub name_range: Range<usize>,
35 /// The identifier text (e.g. `"foo"`), resolved from `name_range`.
36 pub name: String,
37 /// Grammar-declared kind: the `@definition.X` capture suffix
38 /// (`"function"`, `"method"`, `"class"`, `"module"`, `"interface"`,
39 /// `"type"`, `"macro"`, or any custom suffix the grammar defines).
40 pub kind: ScopeKind,
41}
42
/// Grammar-declared scope kind, parsed from the `@definition.*` capture.
///
/// Named variants cover the standard tree-sitter tags vocabulary; [`Other`]
/// holds any additional suffix a grammar defines.
///
/// The type is `Hash` as well as `Eq`, so it can be used as a key in
/// `HashMap`/`HashSet` (for example to group scopes by kind).
///
/// [`Other`]: ScopeKind::Other
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ScopeKind {
    /// `@definition.function`
    Function,
    /// `@definition.method`
    Method,
    /// `@definition.class`
    Class,
    /// `@definition.module`
    Module,
    /// `@definition.interface`
    Interface,
    /// `@definition.type`
    Type,
    /// `@definition.macro`
    Macro,
    /// Any other `@definition.X` suffix the grammar defines.
    Other(String),
}

impl ScopeKind {
    /// Construct from the `@definition.X` capture suffix.
    ///
    /// `s` is the part after `definition.` (e.g. `"function"`). The seven
    /// standard suffixes map to their named variants; the comparison is
    /// exact and case-sensitive. Anything else is preserved verbatim in
    /// [`ScopeKind::Other`].
    #[must_use]
    pub fn from_suffix(s: &str) -> Self {
        match s {
            "function" => Self::Function,
            "method" => Self::Method,
            "class" => Self::Class,
            "module" => Self::Module,
            "interface" => Self::Interface,
            "type" => Self::Type,
            "macro" => Self::Macro,
            other => Self::Other(other.to_owned()),
        }
    }

    /// The canonical capture suffix for this kind.
    ///
    /// Inverse of [`ScopeKind::from_suffix`]: named variants return their
    /// standard suffix, and [`ScopeKind::Other`] returns the stored string
    /// unchanged.
    #[must_use]
    pub fn as_suffix(&self) -> &str {
        match self {
            Self::Function => "function",
            Self::Method => "method",
            Self::Class => "class",
            Self::Module => "module",
            Self::Interface => "interface",
            Self::Type => "type",
            Self::Macro => "macro",
            Self::Other(s) => s.as_str(),
        }
    }
}
100
101/// A reusable per-language detector that runs a tags query over source bytes
102/// and yields [`NamedScope`]s.
103///
104/// Construct once per grammar (the query is compiled inside
105/// [`TagsConfiguration::new`]); reuse across many files. The internal
106/// [`TagsContext`] holds a tree-sitter `Parser` and `QueryCursor` that are
107/// reset between calls.
108pub struct ScopeDetector {
109 config: Option<TagsConfiguration>,
110 ctx: TagsContext,
111}
112
113impl std::fmt::Debug for ScopeDetector {
114 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
115 // `TagsContext` doesn't implement Debug; we only expose presence of
116 // the config. `finish_non_exhaustive` documents the omission to the
117 // reader (and to clippy's missing-fields lint).
118 f.debug_struct("ScopeDetector")
119 .field("has_config", &self.config.is_some())
120 .finish_non_exhaustive()
121 }
122}
123
124impl ScopeDetector {
125 /// Build a detector from a grammar's tags query.
126 ///
127 /// Pass `None` for `base_query` if the grammar does not ship a
128 /// `queries/tags.scm`; the detector is then a no-op (always returns an
129 /// empty scope list). Pass `Some(project_override)` to concatenate an
130 /// override query in front of the base (tree-sitter unions all patterns,
131 /// so override captures augment the grammar defaults).
132 ///
133 /// # Errors
134 ///
135 /// Returns [`ParseError::ScopeQueryCompile`] if the combined query fails
136 /// to compile against the given language (malformed S-expression, unknown
137 /// capture name outside the tags-query vocabulary, etc.).
138 pub fn new(
139 language: &Language,
140 base_query: Option<&str>,
141 project_override: Option<&str>,
142 ) -> Result<Self, ParseError> {
143 let combined = match (base_query, project_override) {
144 (None, None) => {
145 return Ok(Self {
146 config: None,
147 ctx: TagsContext::new(),
148 });
149 }
150 (Some(base), None) => base.to_owned(),
151 (None, Some(ov)) => ov.to_owned(),
152 (Some(base), Some(ov)) => format!("{ov}\n{base}"),
153 };
154
155 let config = TagsConfiguration::new(language.clone(), &combined, "").map_err(|e| {
156 ParseError::ScopeQueryCompile {
157 reason: e.to_string(),
158 }
159 })?;
160
161 Ok(Self {
162 config: Some(config),
163 ctx: TagsContext::new(),
164 })
165 }
166
167 /// True when this detector has a compiled tags query and will produce
168 /// scopes. A detector constructed from `(None, None)` is a no-op.
169 #[must_use]
170 pub const fn has_query(&self) -> bool {
171 self.config.is_some()
172 }
173
174 /// Run the tags query over `source` and return every `@definition.*`
175 /// match as a [`NamedScope`].
176 ///
177 /// Non-definition captures (`@reference.*`, etc.) are filtered out: they
178 /// describe call sites, not scopes. `@ignore` matches (used by grammars
179 /// like Elixir to suppress false positives) are handled internally by
180 /// `tree-sitter-tags`.
181 ///
182 /// Ordering mirrors tree-sitter's match order (roughly source order,
183 /// with potential reordering when patterns overlap). Callers that need a
184 /// deterministic byte-ordered index should sort the result.
185 #[must_use]
186 pub fn scopes(&mut self, source: &[u8]) -> Vec<NamedScope> {
187 let Some(config) = self.config.as_ref() else {
188 return Vec::new();
189 };
190
191 let Ok((iter, _had_parse_error)) = self.ctx.generate_tags(config, source, None) else {
192 return Vec::new();
193 };
194
195 let mut scopes = Vec::new();
196 for tag_result in iter {
197 let Ok(tag) = tag_result else { continue };
198 if !tag.is_definition {
199 continue;
200 }
201
202 let syntax = config.syntax_type_name(tag.syntax_type_id);
203 let kind = ScopeKind::from_suffix(syntax);
204
205 let Some(name_bytes) = source.get(tag.name_range.clone()) else {
206 continue;
207 };
208 let Ok(name) = std::str::from_utf8(name_bytes) else {
209 continue;
210 };
211
212 scopes.push(NamedScope {
213 node_range: tag.range,
214 name_range: tag.name_range,
215 name: name.to_owned(),
216 kind,
217 });
218 }
219 scopes
220 }
221}
222
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// A detector built with neither a base nor an override query reports
    /// no query and yields no scopes.
    #[test]
    fn no_query_is_empty() {
        // The rust grammar is looked up only to obtain a `Language` value to
        // pass to the constructor; with no config, `scopes()` returns early.
        #[cfg(feature = "grammars")]
        {
            let Some(rust) = panproto_grammars::grammars()
                .into_iter()
                .find(|g| g.name == "rust")
            else {
                return;
            };
            let mut detector = ScopeDetector::new(&rust.language, None, None).unwrap();
            assert!(!detector.has_query());
            assert!(detector.scopes(b"fn f() {}").is_empty());
        }
    }

    /// Free functions and structs in Rust source are reported by name, and
    /// a free function carries the `Function` kind.
    #[test]
    #[cfg(feature = "grammars")]
    fn rust_function_item_is_detected() {
        let Some(rust) = panproto_grammars::grammars()
            .into_iter()
            .find(|g| g.name == "rust")
        else {
            return; // rust grammar not enabled in this feature set
        };
        let Some(tags_query) = rust.tags_query else {
            return; // grammar was fetched without queries/tags.scm
        };
        let mut detector = ScopeDetector::new(&rust.language, Some(tags_query), None).unwrap();
        assert!(detector.has_query());

        let source = b"fn verify_push(token: &str) -> bool { true }\n\
                       struct Foo { x: u32 }\n";
        let found = detector.scopes(source);
        let names: Vec<&str> = found.iter().map(|scope| scope.name.as_str()).collect();
        assert!(names.contains(&"verify_push"), "got {names:?}");
        assert!(names.contains(&"Foo"), "got {names:?}");

        let verify_push = found
            .iter()
            .find(|scope| scope.name == "verify_push")
            .unwrap();
        assert_eq!(verify_push.kind, ScopeKind::Function);
    }

    /// A function inside an `impl` block is reported under its own name.
    #[test]
    #[cfg(feature = "grammars")]
    fn rust_impl_method_is_detected_as_method() {
        let Some(rust) = panproto_grammars::grammars()
            .into_iter()
            .find(|g| g.name == "rust")
        else {
            return;
        };
        let Some(tags_query) = rust.tags_query else {
            return;
        };
        let mut detector = ScopeDetector::new(&rust.language, Some(tags_query), None).unwrap();

        let scopes = detector.scopes(b"impl Foo { fn bar(&self) {} }");
        let bar = scopes.iter().find(|scope| scope.name == "bar");
        assert!(bar.is_some(), "expected bar method, got {scopes:?}");
        // Most rust tags.scm versions label impl methods as
        // @definition.method; either Method or Function is accepted to
        // tolerate upstream variation.
        let bar_kind = &bar.unwrap().kind;
        assert!(matches!(bar_kind, ScopeKind::Method | ScopeKind::Function));
    }
}