Skip to main content

dk_engine/parser/
engine.rs

1//! Generic query-driven parser engine.
2//!
3//! [`QueryDrivenParser`] uses tree-sitter's Query API to extract symbols,
4//! calls, and imports from any language. Each language supplies a
5//! [`LanguageConfig`](super::lang_config::LanguageConfig) with its grammar
6//! and S-expression queries; the engine compiles and runs them.
7
8use super::lang_config::{CommentStyle, LanguageConfig};
9use super::LanguageParser;
10use dk_core::{
11    CallKind, Error, FileAnalysis, Import, RawCallEdge, Result, Span, Symbol, TypeInfo,
12};
13use std::path::Path;
14use std::sync::Mutex;
15use streaming_iterator::StreamingIterator;
16use tree_sitter::{Node, Parser, Query, QueryCursor, Tree};
17use uuid::Uuid;
18
19/// A language-agnostic parser driven by tree-sitter queries.
20///
21/// One instance handles a single language, configured via [`LanguageConfig`].
22pub struct QueryDrivenParser {
23    config: Box<dyn LanguageConfig>,
24    parser: Mutex<Parser>,
25    symbols_query: Query,
26    calls_query: Option<Query>,
27    imports_query: Option<Query>,
28}
29
30impl QueryDrivenParser {
31    /// Create a new parser from a language configuration.
32    ///
33    /// Compiles the S-expression query strings from `config` into
34    /// [`Query`] objects. Returns [`Error::ParseError`] if compilation fails.
35    pub fn new(config: Box<dyn LanguageConfig>) -> Result<Self> {
36        let lang = config.language();
37
38        let symbols_query = Query::new(&lang, config.symbols_query()).map_err(|e| {
39            Error::ParseError(format!("Failed to compile symbols query: {e}"))
40        })?;
41
42        let calls_query = {
43            let q = config.calls_query();
44            if q.is_empty() {
45                None
46            } else {
47                Some(Query::new(&lang, q).map_err(|e| {
48                    Error::ParseError(format!("Failed to compile calls query: {e}"))
49                })?)
50            }
51        };
52
53        let imports_query = {
54            let q = config.imports_query();
55            if q.is_empty() {
56                None
57            } else {
58                Some(Query::new(&lang, q).map_err(|e| {
59                    Error::ParseError(format!("Failed to compile imports query: {e}"))
60                })?)
61            }
62        };
63
64        let mut parser = Parser::new();
65        parser
66            .set_language(&lang)
67            .map_err(|e| Error::ParseError(format!("Failed to set language: {e}")))?;
68
69        Ok(Self {
70            config,
71            parser: Mutex::new(parser),
72            symbols_query,
73            calls_query,
74            imports_query,
75        })
76    }
77
78    // ── Helpers ──
79
80    /// Parse source bytes into a tree-sitter syntax tree.
81    ///
82    /// Reuses the cached `Parser` instance to avoid repeated allocation
83    /// and language setup.
84    fn parse_tree(&self, source: &[u8]) -> Result<tree_sitter::Tree> {
85        let mut parser = self.parser.lock().unwrap_or_else(|e| e.into_inner());
86        parser
87            .parse(source, None)
88            .ok_or_else(|| Error::ParseError("tree-sitter parse returned None".into()))
89    }
90
91    /// Extract the UTF-8 text of a node from the source bytes.
92    fn node_text<'a>(node: &Node, source: &'a [u8]) -> &'a str {
93        let bytes = &source[node.start_byte()..node.end_byte()];
94        std::str::from_utf8(bytes).unwrap_or("")
95    }
96
97    /// Extract the first line of a node's text as its signature.
98    fn node_signature(node: &Node, source: &[u8]) -> Option<String> {
99        let text = Self::node_text(node, source);
100        let first_line = text.lines().next()?;
101        let trimmed = first_line.trim();
102        if trimmed.is_empty() {
103            None
104        } else {
105            Some(trimmed.to_string())
106        }
107    }
108
109    /// Collect doc-comment lines immediately preceding `node`.
110    ///
111    /// Walks backwards through previous siblings, collecting lines that
112    /// match the configured [`CommentStyle`]. Preserves the original
113    /// comment prefix (e.g. `#`, `///`, `//`, `/** */`) so that AST merge
114    /// can reconstruct valid source code.
115    ///
116    /// **Note:** Unlike the old hand-written parsers (which stripped the
117    /// prefix), `Symbol.doc_comment` now includes the raw prefix. This is
118    /// intentional — AST merge needs the prefix to reconstruct valid
119    /// source. Consumers that display doc comments should strip prefixes
120    /// at the presentation layer.
121    fn collect_doc_comments(&self, node: &Node, source: &[u8]) -> Option<String> {
122        let comment_prefix = match self.config.comment_style() {
123            CommentStyle::TripleSlash => "///",
124            CommentStyle::Hash => "#",
125            CommentStyle::SlashSlash => "//",
126        };
127
128        let mut lines = Vec::new();
129        let mut sibling = node.prev_sibling();
130
131        while let Some(prev) = sibling {
132            if prev.kind() == "line_comment" || prev.kind() == "comment" {
133                // Skip inline comments: if this comment is on the same line
134                // as a preceding non-comment sibling, it belongs to that
135                // sibling (e.g. `x = 60  # 60 seconds`), not to our node.
136                if let Some(before_comment) = prev.prev_sibling() {
137                    if before_comment.kind() != "comment"
138                        && before_comment.kind() != "line_comment"
139                        && before_comment.end_position().row == prev.start_position().row
140                    {
141                        break;
142                    }
143                }
144
145                let text = Self::node_text(&prev, source).trim();
146                if text.starts_with(comment_prefix) || text.starts_with("/*") {
147                    // Preserve the full comment text including prefix.
148                    // The `/*` branch captures JSDoc (`/** ... */`) blocks
149                    // for languages using CommentStyle::SlashSlash.
150                    lines.push(text.to_string());
151                    sibling = prev.prev_sibling();
152                    continue;
153                }
154            }
155            break;
156        }
157
158        if lines.is_empty() {
159            None
160        } else {
161            lines.reverse();
162            Some(lines.join("\n"))
163        }
164    }
165
166    /// Walk parent nodes to find the name of the enclosing function.
167    ///
168    /// Returns `"<module>"` if the node is at the top level.
169    fn enclosing_function_name(&self, node: &Node, source: &[u8]) -> String {
170        let named_function_kinds = [
171            "function_item",
172            "function_definition",
173            "function_declaration",
174            "method_definition",
175        ];
176        // Anonymous function forms whose name comes from an enclosing
177        // variable_declarator (e.g. `const fn = function() {}`)
178        let anonymous_function_kinds = ["arrow_function", "function_expression", "function"];
179
180        let mut current = node.parent();
181        while let Some(parent) = current {
182            let kind = parent.kind();
183            if named_function_kinds.contains(&kind) {
184                if let Some(name_node) = parent.child_by_field_name("name") {
185                    let name = Self::node_text(&name_node, source);
186                    if !name.is_empty() {
187                        return name.to_string();
188                    }
189                }
190            } else if anonymous_function_kinds.contains(&kind) {
191                // Check if assigned to a variable: const foo = function() {}
192                if let Some(gp) = parent.parent() {
193                    if gp.kind() == "variable_declarator" {
194                        if let Some(name_node) = gp.child_by_field_name("name") {
195                            let name = Self::node_text(&name_node, source);
196                            if !name.is_empty() {
197                                return name.to_string();
198                            }
199                        }
200                    }
201                }
202            }
203            current = parent.parent();
204        }
205        "<module>".to_string()
206    }
207}
208
209impl QueryDrivenParser {
210    /// Extract symbols from an already-parsed tree.
211    fn symbols_from_tree(
212        &self,
213        tree: &Tree,
214        source: &[u8],
215        file_path: &Path,
216    ) -> Vec<Symbol> {
217        let root = tree.root_node();
218        let capture_names = self.symbols_query.capture_names();
219
220        let mut cursor = QueryCursor::new();
221        let mut symbols = Vec::new();
222        let mut matches = cursor.matches(&self.symbols_query, root, source);
223
224        while let Some(m) = { matches.advance(); matches.get() } {
225            let mut name_text: Option<String> = None;
226            let mut definition_node: Option<Node> = None;
227            let mut kind_suffix: Option<String> = None;
228            let mut modifiers_text: Option<String> = None;
229
230            for capture in m.captures {
231                let capture_name = capture_names[capture.index as usize];
232
233                if capture_name == "name" {
234                    name_text = Some(Self::node_text(&capture.node, source).to_string());
235                } else if let Some(suffix) = capture_name.strip_prefix("definition.") {
236                    definition_node = Some(capture.node);
237                    kind_suffix = Some(suffix.to_string());
238                } else if capture_name == "modifiers" {
239                    modifiers_text = Some(Self::node_text(&capture.node, source).to_string());
240                }
241            }
242
243            // We need at least a name and a definition node with a kind suffix.
244            let name = match &name_text {
245                Some(n) if !n.is_empty() => n.as_str(),
246                _ => continue,
247            };
248            let def_node = match definition_node {
249                Some(n) => n,
250                None => continue,
251            };
252            let suffix = match &kind_suffix {
253                Some(s) => s.as_str(),
254                None => continue,
255            };
256
257            let symbol_kind = match self.config.map_capture_to_kind(suffix) {
258                Some(k) => k,
259                None => continue,
260            };
261
262            let visibility = self
263                .config
264                .resolve_visibility(modifiers_text.as_deref(), name);
265            let signature = Self::node_signature(&def_node, source);
266            let doc_comment = self.collect_doc_comments(&def_node, source);
267
268            let mut sym = Symbol {
269                id: Uuid::new_v4(),
270                name: name.to_string(),
271                qualified_name: name.to_string(),
272                kind: symbol_kind,
273                visibility,
274                file_path: file_path.to_path_buf(),
275                span: Span {
276                    start_byte: def_node.start_byte() as u32,
277                    end_byte: def_node.end_byte() as u32,
278                },
279                signature,
280                doc_comment,
281                parent: None,
282                last_modified_by: None,
283                last_modified_intent: None,
284            };
285
286            self.config.adjust_symbol(&mut sym, &def_node, source);
287            symbols.push(sym);
288        }
289
290        symbols
291    }
292
293    /// Extract call edges from an already-parsed tree.
294    fn calls_from_tree(&self, tree: &Tree, source: &[u8]) -> Vec<RawCallEdge> {
295        let calls_query = match &self.calls_query {
296            Some(q) => q,
297            None => return vec![],
298        };
299
300        let root = tree.root_node();
301        let capture_names = calls_query.capture_names();
302
303        let mut cursor = QueryCursor::new();
304        let mut calls = Vec::new();
305        let mut matches = cursor.matches(calls_query, root, source);
306
307        while let Some(m) = { matches.advance(); matches.get() } {
308            let mut callee_text: Option<String> = None;
309            let mut method_callee_text: Option<String> = None;
310            let mut call_node: Option<Node> = None;
311            let mut first_node: Option<Node> = None;
312
313            for capture in m.captures {
314                let capture_name = capture_names[capture.index as usize];
315
316                if first_node.is_none() {
317                    first_node = Some(capture.node);
318                }
319
320                match capture_name {
321                    "callee" => {
322                        callee_text =
323                            Some(Self::node_text(&capture.node, source).to_string());
324                    }
325                    "method_callee" => {
326                        method_callee_text =
327                            Some(Self::node_text(&capture.node, source).to_string());
328                    }
329                    "call" => call_node = Some(capture.node),
330                    _ => {}
331                }
332            }
333
334            // Determine call kind and callee name.
335            let (callee_name, call_kind) = if let Some(method) =
336                method_callee_text.filter(|s| !s.is_empty())
337            {
338                (method, CallKind::MethodCall)
339            } else if let Some(direct) = callee_text.filter(|s| !s.is_empty()) {
340                (direct, CallKind::DirectCall)
341            } else {
342                continue;
343            };
344
345            // Use the @call node for span, falling back to the first captured node.
346            let span_node = call_node
347                .or(first_node)
348                .expect("match has at least one capture");
349
350            let caller_name = self.enclosing_function_name(&span_node, source);
351
352            calls.push(RawCallEdge {
353                caller_name,
354                callee_name,
355                call_site: Span {
356                    start_byte: span_node.start_byte() as u32,
357                    end_byte: span_node.end_byte() as u32,
358                },
359                kind: call_kind,
360            });
361        }
362
363        calls
364    }
365
366    /// Extract imports from an already-parsed tree.
367    fn imports_from_tree(&self, tree: &Tree, source: &[u8]) -> Vec<Import> {
368        let imports_query = match &self.imports_query {
369            Some(q) => q,
370            None => return vec![],
371        };
372
373        let root = tree.root_node();
374        let capture_names = imports_query.capture_names();
375
376        let mut cursor = QueryCursor::new();
377        let mut imports = Vec::new();
378        let mut matches = cursor.matches(imports_query, root, source);
379
380        while let Some(m) = { matches.advance(); matches.get() } {
381            let mut module_text: Option<String> = None;
382            let mut import_name_text: Option<String> = None;
383            let mut alias_text: Option<String> = None;
384            let mut is_relative_import = false;
385
386            for capture in m.captures {
387                let capture_name = capture_names[capture.index as usize];
388
389                match capture_name {
390                    "module" => {
391                        let text = Self::node_text(&capture.node, source);
392                        // Strip surrounding quotes if present.
393                        module_text = Some(
394                            text.trim_matches(|c| c == '"' || c == '\'').to_string(),
395                        );
396                    }
397                    "import_name" => {
398                        import_name_text =
399                            Some(Self::node_text(&capture.node, source).to_string());
400                    }
401                    "alias" => {
402                        alias_text = Some(Self::node_text(&capture.node, source).to_string());
403                    }
404                    "_relative" => {
405                        // Marker capture: the import query flagged this as a
406                        // relative/internal import (e.g. Ruby require_relative).
407                        is_relative_import = true;
408                    }
409                    _ => {}
410                }
411            }
412
413            let module_path = match module_text {
414                Some(ref m) if !m.is_empty() => m.clone(),
415                _ => continue,
416            };
417
418            let imported_name = import_name_text
419                .filter(|s| !s.is_empty())
420                .unwrap_or_else(|| {
421                    // Derive imported name from the last segment of the module path.
422                    module_path
423                        .rsplit(['/', '.', ':'])
424                        .next()
425                        .unwrap_or(&module_path)
426                        .to_string()
427                });
428
429            let alias = alias_text.filter(|s| !s.is_empty());
430
431            // If the query flagged this as a relative import (e.g. Ruby
432            // require_relative), it is always internal regardless of path.
433            let is_external = if is_relative_import {
434                false
435            } else {
436                self.config.is_external_import(&module_path)
437            };
438
439            imports.push(Import {
440                module_path,
441                imported_name,
442                alias,
443                is_external,
444            });
445        }
446
447        imports
448    }
449}
450
451impl LanguageParser for QueryDrivenParser {
452    fn extensions(&self) -> &[&str] {
453        self.config.extensions()
454    }
455
456    fn extract_symbols(&self, source: &[u8], file_path: &Path) -> Result<Vec<Symbol>> {
457        if source.is_empty() {
458            return Ok(vec![]);
459        }
460        let tree = self.parse_tree(source)?;
461        Ok(self.symbols_from_tree(&tree, source, file_path))
462    }
463
464    fn extract_calls(&self, source: &[u8], _file_path: &Path) -> Result<Vec<RawCallEdge>> {
465        if source.is_empty() {
466            return Ok(vec![]);
467        }
468        let tree = self.parse_tree(source)?;
469        Ok(self.calls_from_tree(&tree, source))
470    }
471
472    fn extract_types(&self, _source: &[u8], _file_path: &Path) -> Result<Vec<TypeInfo>> {
473        Ok(vec![])
474    }
475
476    fn extract_imports(&self, source: &[u8], _file_path: &Path) -> Result<Vec<Import>> {
477        if source.is_empty() {
478            return Ok(vec![]);
479        }
480        let tree = self.parse_tree(source)?;
481        Ok(self.imports_from_tree(&tree, source))
482    }
483
484    /// Parse once and extract all data — avoids triple-parsing the same source.
485    fn parse_file(&self, source: &[u8], file_path: &Path) -> Result<FileAnalysis> {
486        if source.is_empty() {
487            return Ok(FileAnalysis {
488                symbols: vec![],
489                calls: vec![],
490                types: vec![],
491                imports: vec![],
492            });
493        }
494        let tree = self.parse_tree(source)?;
495        Ok(FileAnalysis {
496            symbols: self.symbols_from_tree(&tree, source, file_path),
497            calls: self.calls_from_tree(&tree, source),
498            types: vec![],
499            imports: self.imports_from_tree(&tree, source),
500        })
501    }
502}