llmcc_core/
lang_def.rs

1//! Language definition framework for multi-language AST support.
2use std::any::Any;
3
4use crate::context::{CompileCtxt, CompileUnit};
5use crate::graph_builder::BlockKind;
6use crate::ir::HirKind;
7use crate::ir::HirNode;
8use crate::scope::{Scope, ScopeStack};
9
10/// A child node paired with its field ID for efficient iteration.
11pub struct ChildWithFieldId<'a> {
12    /// The child node (boxed for trait object)
13    pub node: Box<dyn ParseNode + 'a>,
14    /// The field ID of this child within its parent (u16::MAX if none)
15    pub field_id: u16,
16}
17
18/// Generic trait for parse tree representation.
19///
20/// Implementations can wrap tree-sitter trees, custom ASTs, or other parse representations.
21/// This abstraction decouples language definitions from specific parser implementations.
22pub trait ParseTree: Send + Sync + 'static {
23    /// Type-erased access to underlying tree for downcasting
24    fn as_any(&self) -> &(dyn Any + Send + Sync);
25
26    /// Debug representation
27    fn debug_info(&self) -> String;
28
29    /// Get the root ParseNode of this tree
30    fn root_node(&self) -> Option<Box<dyn ParseNode + '_>> {
31        None
32    }
33}
34
35/// Default implementation wrapping tree-sitter Tree
36#[derive(Debug, Clone)]
37pub struct TreeSitterParseTree {
38    pub tree: ::tree_sitter::Tree,
39}
40
41impl ParseTree for TreeSitterParseTree {
42    fn as_any(&self) -> &(dyn Any + Send + Sync) {
43        self
44    }
45
46    fn debug_info(&self) -> String {
47        format!("TreeSitter(root_id: {})", self.tree.root_node().id())
48    }
49
50    fn root_node(&self) -> Option<Box<dyn ParseNode + '_>> {
51        Some(Box::new(TreeSitterParseNode::new(self.tree.root_node())))
52    }
53}
54
55/// Generic trait for parse tree nodes (individual AST nodes).
56///
57/// Implementations can wrap tree-sitter nodes, custom AST nodes, or other parse representations.
58/// This abstraction allows IR building to work with any parser backend.
59///
60/// Note: Unlike ParseTree, ParseNode can have lifetime parameters to match the lifetime
61/// of the underlying parser's borrowed nodes (e.g., tree-sitter::Node<'tree>).
62pub trait ParseNode: Send + Sync {
63    /// Get the node's kind ID (language-specific token ID)
64    fn kind_id(&self) -> u16;
65
66    /// Get the start byte offset of this node in the source
67    fn start_byte(&self) -> usize;
68
69    /// Get the end byte offset of this node in the source
70    fn end_byte(&self) -> usize;
71
72    /// Get the number of children this node has
73    fn child_count(&self) -> usize;
74
75    /// Get the child at the specified index
76    fn child(&self, index: usize) -> Option<Box<dyn ParseNode + '_>>;
77
78    /// Get the field name of the child at the specified index (if available)
79    fn child_field_name(&self, _index: usize) -> Option<&str> {
80        None
81    }
82
83    /// Get the field ID of this node within its parent (if available).
84    /// Returns None if the node has no parent or the field ID cannot be determined.
85    fn field_id(&self) -> Option<u16> {
86        None
87    }
88
89    /// Collect all children with their field IDs in a single pass.
90    /// This is more efficient than calling child() + field_id() separately
91    /// because it uses a cursor to get field_id during iteration.
92    ///
93    /// Default implementation falls back to child() + field_id() for each child.
94    fn collect_children_with_field_ids(&self) -> Vec<ChildWithFieldId<'_>> {
95        let mut result = Vec::with_capacity(self.child_count());
96        for i in 0..self.child_count() {
97            if let Some(child) = self.child(i) {
98                let field_id = child.field_id().unwrap_or(u16::MAX);
99                result.push(ChildWithFieldId {
100                    node: child,
101                    field_id,
102                });
103            }
104        }
105        result
106    }
107
108    /// Get a child by field name (if supported by the parser)
109    fn child_by_field_name(&self, field_name: &str) -> Option<Box<dyn ParseNode + '_>>;
110
111    /// Get a child by field ID (if supported by the parser)
112    fn child_by_field_id(&self, _field_id: u16) -> Option<Box<dyn ParseNode + '_>> {
113        None
114    }
115
116    /// Check if this node represents a parse error
117    fn is_error(&self) -> bool {
118        false
119    }
120
121    /// Check if this node is "extra" (typically whitespace/comments)
122    fn is_extra(&self) -> bool {
123        false
124    }
125
126    /// Check if this node is missing (e.g., implicit tokens)
127    fn is_missing(&self) -> bool {
128        false
129    }
130
131    /// Check if this node is a named token (vs anonymous)
132    fn is_named(&self) -> bool {
133        true
134    }
135
136    /// Get the parent node if available
137    fn parent(&self) -> Option<Box<dyn ParseNode + '_>> {
138        None
139    }
140
141    /// Debug representation of this node
142    fn debug_info(&self) -> String;
143
144    /// Format a label for this node suitable for debugging and rendering.
145    fn format_node_label(&self, field_name: Option<&str>) -> String {
146        // Extract kind string from debug_info
147        let debug_str = self.debug_info();
148        let kind_str = if let Some(start) = debug_str.find("kind: ") {
149            if let Some(end) = debug_str[start + 6..].find(',') {
150                &debug_str[start + 6..start + 6 + end]
151            } else if let Some(end) = debug_str[start + 6..].find(')') {
152                &debug_str[start + 6..start + 6 + end]
153            } else {
154                "unknown"
155            }
156        } else {
157            "unknown"
158        };
159
160        let kind_id = self.kind_id();
161        let mut label = String::new();
162
163        // Add field name if provided
164        if let Some(fname) = field_name {
165            label.push_str(&format!("|{fname}|_ "));
166        }
167
168        // Add kind and kind_id
169        label.push_str(&format!("{kind_str} [{kind_id}]"));
170
171        // Add status flags
172        if self.is_error() {
173            label.push_str(" [ERROR]");
174        } else if self.is_extra() {
175            label.push_str(" [EXTRA]");
176        } else if self.is_missing() {
177            label.push_str(" [MISSING]");
178        }
179
180        label
181    }
182}
183
184/// Wrapper implementation of ParseNode for tree-sitter nodes
185pub struct TreeSitterParseNode<'tree> {
186    node: ::tree_sitter::Node<'tree>,
187}
188
189impl<'tree> TreeSitterParseNode<'tree> {
190    /// Create a new wrapper around a tree-sitter node
191    pub fn new(node: ::tree_sitter::Node<'tree>) -> Self {
192        Self { node }
193    }
194}
195
196impl<'tree> ParseNode for TreeSitterParseNode<'tree> {
197    fn kind_id(&self) -> u16 {
198        self.node.kind_id()
199    }
200
201    fn start_byte(&self) -> usize {
202        self.node.start_byte()
203    }
204
205    fn end_byte(&self) -> usize {
206        self.node.end_byte()
207    }
208
209    fn child_count(&self) -> usize {
210        self.node.child_count()
211    }
212
213    fn child(&self, index: usize) -> Option<Box<dyn ParseNode + '_>> {
214        self.node
215            .child(index)
216            .map(|child| Box::new(TreeSitterParseNode::new(child)) as Box<dyn ParseNode + '_>)
217    }
218
219    fn child_field_name(&self, index: usize) -> Option<&str> {
220        self.node.field_name_for_child(index as u32)
221    }
222
223    fn field_id(&self) -> Option<u16> {
224        // Walk up to parent and find this node's field ID
225        // NOTE: This is O(n) per call - prefer collect_children_with_field_ids for bulk access
226        let parent = self.node.parent()?;
227        let mut cursor = parent.walk();
228
229        if !cursor.goto_first_child() {
230            return None;
231        }
232
233        loop {
234            if cursor.node().id() == self.node.id() {
235                return cursor.field_id().map(|id| id.get());
236            }
237            if !cursor.goto_next_sibling() {
238                break;
239            }
240        }
241
242        None
243    }
244
245    /// Efficient cursor-based collection of children with field IDs.
246    /// This avoids the O(n²) cost of calling field_id() on each child separately.
247    fn collect_children_with_field_ids(&self) -> Vec<ChildWithFieldId<'_>> {
248        let mut result = Vec::with_capacity(self.node.child_count());
249        let mut cursor = self.node.walk();
250
251        if !cursor.goto_first_child() {
252            return result;
253        }
254
255        loop {
256            let child_node = cursor.node();
257            let field_id = cursor.field_id().map(|id| id.get()).unwrap_or(u16::MAX);
258            result.push(ChildWithFieldId {
259                node: Box::new(TreeSitterParseNode::new(child_node)),
260                field_id,
261            });
262
263            if !cursor.goto_next_sibling() {
264                break;
265            }
266        }
267
268        result
269    }
270
271    fn child_by_field_name(&self, field_name: &str) -> Option<Box<dyn ParseNode + '_>> {
272        self.node
273            .child_by_field_name(field_name)
274            .map(|child| Box::new(TreeSitterParseNode::new(child)) as Box<dyn ParseNode + '_>)
275    }
276
277    fn child_by_field_id(&self, _field_id: u16) -> Option<Box<dyn ParseNode + '_>> {
278        None
279    }
280
281    fn is_error(&self) -> bool {
282        self.node.is_error()
283    }
284
285    fn is_extra(&self) -> bool {
286        self.node.is_extra()
287    }
288
289    fn is_missing(&self) -> bool {
290        self.node.is_missing()
291    }
292
293    fn is_named(&self) -> bool {
294        self.node.is_named()
295    }
296
297    fn parent(&self) -> Option<Box<dyn ParseNode + '_>> {
298        self.node
299            .parent()
300            .map(|parent| Box::new(TreeSitterParseNode::new(parent)) as Box<dyn ParseNode + '_>)
301    }
302
303    fn debug_info(&self) -> String {
304        format!(
305            "TreeSitterNode(kind: {}, kind_id: {}, bytes: {}..{})",
306            self.node.kind(),
307            self.node.kind_id(),
308            self.start_byte(),
309            self.end_byte()
310        )
311    }
312}
313
314/// Scopes trait defining language-specific AST handling.
315pub trait LanguageTrait {
316    /// Get the manifest file name for this language (e.g., "Cargo.toml", "package.json").
317    fn manifest_name() -> &'static str;
318
319    /// Get the container directories that don't add semantic meaning.
320    /// These directories are skipped in module detection (e.g., "src", "lib").
321    fn container_dirs() -> &'static [&'static str];
322
323    /// Check if a directory name is a container directory.
324    fn is_container(name: &str) -> bool {
325        Self::container_dirs().contains(&name)
326    }
327
328    /// Parse source code and return a generic parse tree.
329    fn parse(_text: impl AsRef<[u8]>) -> Option<Box<dyn ParseTree>>;
330
331    /// Map a token kind ID to its corresponding HIR kind.
332    fn hir_kind(kind_id: u16) -> HirKind;
333
334    /// Map a token kind ID to its corresponding block kind.
335    fn block_kind(kind_id: u16) -> BlockKind;
336
337    /// Map a token kind ID to its corresponding block kind with parent context.
338    /// This allows languages to create blocks based on the parent node's kind.
339    /// For example, types inside tuple struct definitions become Field blocks.
340    /// Default implementation ignores parent and delegates to block_kind.
341    fn block_kind_with_parent(kind_id: u16, field_id: u16, _parent_kind_id: u16) -> BlockKind {
342        let field_kind = Self::block_kind(field_id);
343        if field_kind != BlockKind::Undefined {
344            field_kind
345        } else {
346            Self::block_kind(kind_id)
347        }
348    }
349
350    /// Check if a parse node is a test-related attribute that should cause the next item to be skipped.
351    /// This is used to filter out test functions and modules from the HIR at build time.
352    /// Takes the parse node and source bytes to extract and check the attribute text.
353    /// Default implementation returns false (no filtering).
354    fn is_test_attribute(node: &dyn ParseNode, source: &[u8]) -> bool {
355        let _ = (node, source);
356        false
357    }
358
359    /// Get the string representation of a token ID.
360    fn token_str(kind_id: u16) -> Option<&'static str>;
361
362    /// Validate whether a kind ID corresponds to a defined token.
363    fn is_valid_token(kind_id: u16) -> bool;
364
365    /// Get the field ID that represents the "name" of a construct.
366    fn name_field() -> u16;
367
368    /// Get the field ID that represents the "type" of a construct.
369    fn type_field() -> u16;
370
371    /// Get the field ID that represents the "trait" in impl blocks.
372    /// Used for `impl Trait for Type { }` to identify the trait being implemented.
373    fn trait_field() -> u16;
374
375    /// Get the list of file extensions this language supports.
376    fn supported_extensions() -> &'static [&'static str];
377
378    fn collect_init<'tcx>(cc: &'tcx CompileCtxt<'tcx>) -> ScopeStack<'tcx>;
379
380    /// TOOD: can we remove the generics here, we could make a new crate or
381    /// bring llmcc-resolver into core to solve the cross dependency
382    fn collect_symbols<'tcx, C>(
383        unit: CompileUnit<'tcx>,
384        node: HirNode<'tcx>,
385        scope_stack: ScopeStack<'tcx>,
386        config: &C,
387    ) -> &'tcx Scope<'tcx>;
388
389    fn bind_symbols<'tcx, C>(
390        unit: CompileUnit<'tcx>,
391        node: HirNode<'tcx>,
392        globals: &'tcx Scope<'tcx>,
393        config: &C,
394    );
395}
396
397/// Extension trait for providing custom parse implementations.
398pub trait LanguageTraitImpl: LanguageTrait {
399    /// Custom parse implementation for this language.
400    fn parse_impl(text: impl AsRef<[u8]>) -> Option<Box<dyn ParseTree>>;
401
402    /// Supported file extensions for this language.
403    fn supported_extensions_impl() -> &'static [&'static str];
404
405    /// The manifest file name for this language (e.g., "Cargo.toml", "package.json").
406    fn manifest_name_impl() -> &'static str;
407
408    /// Container directories that don't add semantic meaning (e.g., "src", "lib").
409    fn container_dirs_impl() -> &'static [&'static str];
410
411    /// Language-specific block kind with parent context.
412    /// Override this to handle context-dependent block creation.
413    /// Default implementation delegates to the trait's default.
414    fn block_kind_with_parent_impl(kind_id: u16, field_id: u16, _parent_kind_id: u16) -> BlockKind {
415        // Default: use the trait's default implementation
416        let field_kind = Self::block_kind(field_id);
417        if field_kind != BlockKind::Undefined {
418            field_kind
419        } else {
420            Self::block_kind(kind_id)
421        }
422    }
423
424    fn collect_init_impl<'tcx>(cc: &'tcx CompileCtxt<'tcx>) -> ScopeStack<'tcx> {
425        ScopeStack::new(cc.arena(), &cc.interner)
426    }
427
428    /// Check if a parse node is a test attribute that should cause the next item to be skipped.
429    /// Override this for language-specific test attribute detection.
430    /// Default implementation returns false.
431    fn is_test_attribute_impl(node: &dyn ParseNode, source: &[u8]) -> bool {
432        let _ = (node, source);
433        false
434    }
435
436    fn collect_symbols_impl<'tcx, C>(
437        unit: CompileUnit<'tcx>,
438        node: HirNode<'tcx>,
439        scope_stack: ScopeStack<'tcx>,
440        config: &C,
441    ) -> &'tcx Scope<'tcx>;
442
443    fn bind_symbols_impl<'tcx, C>(
444        unit: CompileUnit<'tcx>,
445        node: HirNode<'tcx>,
446        globals: &'tcx Scope<'tcx>,
447        config: &C,
448    );
449}
450
/// Defines a language: a `Lang<Suffix>` struct with one `u16` constant per
/// token, its `LanguageTrait` implementation, and an `AstVisitor<Suffix>` trait
/// with one `visit_<const>` method per token.
///
/// Each entry has the form `(CONST, id, "text", hir_kind)` with an optional
/// trailing block kind; entries without one map to `BlockKind::Undefined`.
///
/// The generated `LanguageTrait` impl forwards to `LanguageTraitImpl`, so the
/// invoking crate must implement `LanguageTraitImpl` for `Lang<Suffix>`. The
/// token list must also define `field_name`, `field_type` and `field_trait`,
/// which back `name_field`, `type_field` and `trait_field`.
#[allow(clippy::crate_in_macro_def)]
#[macro_export]
macro_rules! define_lang {
    (
        $suffix:ident,
        $( ($const:ident, $id:expr, $str:expr, $kind:expr $(, $block:expr)? ) ),* $(,)?
    ) => {
        $crate::paste::paste! {
            /// Language Struct Definition
            #[derive(Debug)]
            pub struct [<Lang $suffix>] {}


            /// Language Constants
            #[allow(non_upper_case_globals)]
            impl [<Lang $suffix>] {
                /// Create a new Language instance
                pub fn new() -> Self {
                    Self {}
                }

                // Generate token ID constants
                $(
                    pub const $const: u16 = $id;
                )*
            }


            /// Language Trait Implementation
            impl $crate::lang_def::LanguageTrait for [<Lang $suffix>] {
                fn manifest_name() -> &'static str {
                    <Self as $crate::lang_def::LanguageTraitImpl>::manifest_name_impl()
                }

                fn container_dirs() -> &'static [&'static str] {
                    <Self as $crate::lang_def::LanguageTraitImpl>::container_dirs_impl()
                }

                /// Parse source code and return a generic parse tree.
                ///
                /// Delegates to `parse_impl` from `LanguageTraitImpl` and returns
                /// its result unchanged; there is no fallback when it yields `None`.
                fn parse(text: impl AsRef<[u8]>) -> Option<Box<dyn $crate::lang_def::ParseTree>> {
                    <Self as $crate::lang_def::LanguageTraitImpl>::parse_impl(text.as_ref())
                }

                fn collect_init<'tcx>(cc: &'tcx $crate::context::CompileCtxt<'tcx>) -> $crate::scope::ScopeStack<'tcx> {
                    <Self as $crate::lang_def::LanguageTraitImpl>::collect_init_impl(cc)
                }

                fn collect_symbols<'tcx, C>(
                    unit: $crate::context::CompileUnit<'tcx>,
                    node: $crate::ir::HirNode<'tcx>,
                    scope_stack: $crate::scope::ScopeStack<'tcx>,
                    config: &C,
                ) -> &'tcx $crate::scope::Scope<'tcx> {
                    <Self as $crate::lang_def::LanguageTraitImpl>::collect_symbols_impl(unit, node, scope_stack, config)
                }

                fn bind_symbols<'tcx, C>(
                    unit: $crate::context::CompileUnit<'tcx>,
                    node: $crate::ir::HirNode<'tcx>,
                    globals: &'tcx $crate::scope::Scope<'tcx>,
                    config: &C,
                ) {
                    <Self as $crate::lang_def::LanguageTraitImpl>::bind_symbols_impl(unit, node, globals, config);
                }

                /// Return the list of supported file extensions for this language
                fn supported_extensions() -> &'static [&'static str] {
                    <Self as $crate::lang_def::LanguageTraitImpl>::supported_extensions_impl()
                }

                /// Get the HIR kind for a given token ID
                /// (`HirKind::Internal` for IDs not in the token list)
                fn hir_kind(kind_id: u16) -> $crate::ir::HirKind {
                    match kind_id {
                        $(
                            Self::$const => $kind,
                        )*
                        _ => $crate::ir::HirKind::Internal,
                    }
                }

                /// Get the Block kind for a given token ID
                /// (`BlockKind::Undefined` for IDs not in the token list)
                fn block_kind(kind_id: u16) -> $crate::graph_builder::BlockKind {
                    match kind_id {
                        $(
                            // `$crate::` keeps the recursive call resolvable when the
                            // macro is invoked by path from another crate.
                            Self::$const => $crate::define_lang!(@unwrap_block $($block)?),
                        )*
                        _ => $crate::graph_builder::BlockKind::Undefined,
                    }
                }

                /// Get the Block kind for a given token ID with parent context
                fn block_kind_with_parent(kind_id: u16, field_id: u16, parent_kind_id: u16) -> $crate::graph_builder::BlockKind {
                    <Self as $crate::lang_def::LanguageTraitImpl>::block_kind_with_parent_impl(kind_id, field_id, parent_kind_id)
                }

                /// Check if a parse node is a test attribute that should cause the next item to be skipped
                fn is_test_attribute(node: &dyn $crate::lang_def::ParseNode, source: &[u8]) -> bool {
                    <Self as $crate::lang_def::LanguageTraitImpl>::is_test_attribute_impl(node, source)
                }

                /// Get the string representation of a token ID
                fn token_str(kind_id: u16) -> Option<&'static str> {
                    match kind_id {
                        $(
                            Self::$const => Some($str),
                        )*
                        _ => None,
                    }
                }

                /// Check if a token ID is valid
                fn is_valid_token(kind_id: u16) -> bool {
                    matches!(kind_id, $(Self::$const)|*)
                }

                fn name_field() -> u16 {
                    Self::field_name
                }

                fn type_field() -> u16 {
                    Self::field_type
                }

                fn trait_field() -> u16 {
                    Self::field_trait
                }
            }


            /// Visitor Trait Definition
            pub trait [<AstVisitor $suffix>]<'a, T> {
                /// Visit a node, dispatching to the appropriate method based on token ID.
                ///
                /// NOTE: `scopes` is the scope stack, kept for lookup convenience.
                /// `namespace` is the semantic home scope in which names are
                /// declared and mangled, independent of what is pushed on the stack.
                fn visit_node(
                    &mut self,
                    unit: &$crate::context::CompileUnit<'a>,
                    node: &$crate::ir::HirNode<'a>,
                    scopes: &mut T,
                    namespace: &'a $crate::scope::Scope<'a>,
                    parent: Option<&$crate::symbol::Symbol>,
                ) {
                    match node.kind_id() {
                        $(
                            [<Lang $suffix>]::$const => $crate::paste::paste! {{
                                self.[<visit_ $const>](unit, node, scopes, namespace, parent)
                            }},
                        )*
                        _ => self.visit_unknown(unit, node, scopes, namespace, parent),
                    }
                }

                /// Visit all children of a node
                fn visit_children(
                    &mut self,
                    unit: &$crate::context::CompileUnit<'a>,
                    node: &$crate::ir::HirNode<'a>,
                    scopes: &mut T,
                    namespace: &'a $crate::scope::Scope<'a>,
                    parent: Option<&$crate::symbol::Symbol>,
                ) {
                    // Iterate directly over child IDs to avoid Vec/SmallVec allocation
                    for &child_id in node.child_ids() {
                        let child = unit.hir_node(child_id);
                        self.visit_node(unit, &child, scopes, namespace, parent);
                    }
                }

                /// Handle unknown/unrecognized token types by visiting their children
                fn visit_unknown(
                    &mut self,
                    unit: &$crate::context::CompileUnit<'a>,
                    node: &$crate::ir::HirNode<'a>,
                    scopes: &mut T,
                    namespace: &'a $crate::scope::Scope<'a>,
                    parent: Option<&$crate::symbol::Symbol>,
                ) {
                    self.visit_children(unit, node, scopes, namespace, parent);
                }

                // Generate one `visit_<const>` method per token type; each default
                // simply visits the node's children.
                $(
                    $crate::paste::paste! {
                        fn [<visit_ $const>](
                            &mut self,
                            unit: &$crate::context::CompileUnit<'a>,
                            node: &$crate::ir::HirNode<'a>,
                            scopes: &mut T,
                            namespace: &'a $crate::scope::Scope<'a>,
                            parent: Option<&$crate::symbol::Symbol>,
                        ) {
                            self.visit_children(unit, node, scopes, namespace, parent);
                        }
                    }
                )*
            }
        }
    };

    // Internal rules: yield the entry's block kind, or `Undefined` when omitted.
    (@unwrap_block $block:expr) => { $block };
    (@unwrap_block) => { $crate::graph_builder::BlockKind::Undefined };
}