Skip to main content

big_code_analysis/
ast.rs

1// Per-language metric and AST modules deliberately consume the macro-
2// generated tree-sitter token enums via `use crate::*` and `use Foo::*`
3// inside match expressions — explicit imports would list dozens of
4// variants per arm and obscure the per-language token sets that are the
5// point of these files. Allowed at the module level rather than per
6// function so the per-language impl blocks stay readable.
7#![allow(clippy::enum_glob_use, clippy::if_not_else, clippy::wildcard_imports)]
8
9use serde::ser::{SerializeStruct, Serializer};
10use serde::{Deserialize, Serialize};
11
12use crate::*;
13
14/// Start and end positions of a node in a code in terms of rows and columns.
15///
16/// The first and second fields represent the row and column associated to
17/// the start position of a node.
18///
19/// The third and fourth fields represent the row and column associated to
20/// the end position of a node.
21pub type Span = Option<(usize, usize, usize, usize)>;
22
23/// The payload of an `Ast` request.
24#[derive(Debug, Deserialize, Serialize)]
25pub struct AstPayload {
26    /// The id associated to a request for an `AST`
27    pub id: String,
28    /// The filename associated to a source code file
29    pub file_name: String,
30    /// The code to be represented as an `AST`
31    pub code: String,
32    /// If `true`, nodes representing comments are ignored
33    pub comment: bool,
34    /// If `true`, the start and end positions of a node in a code
35    /// are considered
36    pub span: bool,
37}
38
39/// The response of an `AST` request.
40#[derive(Debug, Serialize)]
41pub struct AstResponse {
42    /// The id associated to a request for an `AST`
43    pub id: String,
44    /// The root node of an `AST`
45    ///
46    /// If `None`, an error has occurred
47    pub root: Option<AstNode>,
48}
49
50/// Information on an `AST` node.
51#[derive(Debug)]
52pub struct AstNode {
53    /// The type of node
54    pub r#type: &'static str,
55    /// The code associated to a node
56    pub value: String,
57    /// The start and end positions of a node in a code
58    pub span: Span,
59    /// Tree-sitter grammar field name through which the parent reaches
60    /// this node (e.g. `left`, `right`, `name`, `body`).
61    ///
62    /// `None` for the root node, anonymous tokens (punctuation, keywords),
63    /// and any child that does not occupy a named grammar field. Consumers
64    /// of the JSON output rely on this to distinguish structurally
65    /// equivalent children without grammar-specific positional knowledge.
66    pub field_name: Option<&'static str>,
67    /// The children of a node
68    pub children: Vec<AstNode>,
69}
70
71impl Serialize for AstNode {
72    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
73    where
74        S: Serializer,
75    {
76        let mut st = serializer.serialize_struct("Node", 5)?;
77        st.serialize_field("Type", &self.r#type)?;
78        st.serialize_field("TextValue", &self.value)?;
79        st.serialize_field("Span", &self.span)?;
80        st.serialize_field("FieldName", &self.field_name)?;
81        st.serialize_field("Children", &self.children)?;
82        st.end()
83    }
84}
85
86impl AstNode {
87    /// Builds an `AstNode` with the supplied type, value, span, and
88    /// children. The `field_name` is set to `None`; use
89    /// [`AstNode::with_field_name`] to record the tree-sitter grammar
90    /// field through which the parent reaches this node.
91    #[must_use]
92    pub fn new(r#type: &'static str, value: String, span: Span, children: Vec<AstNode>) -> Self {
93        Self::with_field_name(r#type, value, span, None, children)
94    }
95
96    /// Builds an `AstNode` carrying the tree-sitter grammar field name
97    /// (`left`, `right`, `name`, `body`, ...) through which the parent
98    /// reaches this node.
99    #[must_use]
100    pub fn with_field_name(
101        r#type: &'static str,
102        value: String,
103        span: Span,
104        field_name: Option<&'static str>,
105        children: Vec<AstNode>,
106    ) -> Self {
107        Self {
108            r#type,
109            value,
110            span,
111            field_name,
112            children,
113        }
114    }
115}
116
117fn build<T: ParserTrait>(parser: &T, span: bool, comment: bool) -> Option<AstNode> {
118    // Iterative depth-first walk that materializes `AstNode`s bottom-up.
119    // Each frame holds the pending parent node, the grammar field name
120    // through which its own parent reached it (None for the root), the
121    // already-materialized child `AstNode`s, and the next child index to
122    // descend into. The parent's `field_name_for_child(idx)` lookup is
123    // O(1) and avoids the parallel cursor walk that was required when
124    // field names had to be captured via `TreeCursor::field_name()`.
125    struct Frame<'a> {
126        node: crate::Node<'a>,
127        field: Option<&'static str>,
128        children: Vec<AstNode>,
129        next_child_index: usize,
130    }
131
132    let code = parser.get_code();
133    let root = parser.get_root();
134    let mut stack: Vec<Frame<'_>> = vec![Frame {
135        node: root,
136        field: None,
137        children: Vec::with_capacity(root.child_count()),
138        next_child_index: 0,
139    }];
140
141    loop {
142        let frame = stack
143            .last_mut()
144            .expect("stack invariant: loop only runs while stack is non-empty");
145        let child_count = frame.node.child_count();
146        if frame.next_child_index < child_count {
147            let idx = frame.next_child_index;
148            frame.next_child_index += 1;
149            // `Node::child` is O(1) (direct tree-sitter pointer
150            // arithmetic); `field_name_for_child` returns the static
151            // grammar field for that child position. Tree-sitter caps
152            // child indices at u32, so the cast is safe by invariant.
153            let child = frame
154                .node
155                .child(idx)
156                .expect("stack invariant: idx < child_count so the child exists");
157            let field = frame.node.field_name_for_child(
158                u32::try_from(idx).expect("invariant: tree-sitter caps child indices at u32::MAX"),
159            );
160            stack.push(Frame {
161                node: child,
162                field,
163                children: Vec::with_capacity(child.child_count()),
164                next_child_index: 0,
165            });
166        } else {
167            let frame = stack
168                .pop()
169                .expect("stack invariant: just observed non-empty via last_mut()");
170            let node = T::Checker::get_ast_node(
171                &frame.node,
172                code,
173                span,
174                comment,
175                frame.field,
176                frame.children,
177            );
178            match (node, stack.last_mut()) {
179                (Some(ast), Some(parent)) => parent.children.push(ast),
180                (Some(ast), None) => return Some(ast),
181                (None, None) => return None,
182                (None, Some(_)) => {}
183            }
184        }
185    }
186}
187
188/// Type tag identifying the AST extraction action; carries no data.
189pub struct AstCallback {
190    _guard: (),
191}
192
193/// Configuration options for retrieving the nodes of an `AST`.
194#[derive(Debug)]
195pub struct AstCfg {
196    /// The id associated to a request for an `AST`
197    pub id: String,
198    /// If `true`, nodes representing comments are ignored
199    pub comment: bool,
200    /// If `true`, the start and end positions of a node in a code
201    /// are considered
202    pub span: bool,
203}
204
205impl Callback for AstCallback {
206    type Res = AstResponse;
207    type Cfg = AstCfg;
208
209    fn call<T: ParserTrait>(cfg: Self::Cfg, parser: &T) -> Self::Res {
210        AstResponse {
211            id: cfg.id,
212            root: build(parser, cfg.span, cfg.comment),
213        }
214    }
215}
216
217#[cfg(test)]
218mod tests {
219    use std::path::PathBuf;
220
221    use super::*;
222
223    fn build_ast<P: ParserTrait>(code: &[u8], filename: &str) -> AstNode {
224        let path = PathBuf::from(filename);
225        let parser = P::new(code.to_vec(), &path, None);
226        let cfg = AstCfg {
227            id: String::new(),
228            comment: false,
229            span: false,
230        };
231        AstCallback::call(cfg, &parser)
232            .root
233            .expect("parser should produce a root AST node")
234    }
235
236    fn find_first<'a>(node: &'a AstNode, kind: &str) -> Option<&'a AstNode> {
237        if node.r#type == kind {
238            return Some(node);
239        }
240        node.children.iter().find_map(|c| find_first(c, kind))
241    }
242
243    fn find_child<'a>(parent: &'a AstNode, field: &str) -> Option<&'a AstNode> {
244        parent.children.iter().find(|c| c.field_name == Some(field))
245    }
246
247    #[test]
248    fn root_has_no_field_name() {
249        let root = build_ast::<crate::RustParser>(b"fn main() {}", "test.rs");
250        assert_eq!(root.field_name, None);
251    }
252
253    #[test]
254    fn rust_assignment_carries_left_and_right_field_names() {
255        // `assignment_expression` in the Rust grammar names its operands
256        // `left` and `right`. Without `FieldName` exposed in the JSON,
257        // downstream consumers cannot distinguish the two `identifier`
258        // children. This is the canonical example from issue #244.
259        let root =
260            build_ast::<crate::RustParser>(b"fn f() { let mut a = 0; a = a + 1; }", "test.rs");
261        let assign = find_first(&root, "assignment_expression")
262            .expect("expected an assignment_expression node");
263        let left = find_child(assign, "left").expect("expected a `left` child");
264        let right = find_child(assign, "right").expect("expected a `right` child");
265        assert_eq!(left.field_name, Some("left"));
266        assert_eq!(right.field_name, Some("right"));
267        // Anonymous `=` token is a child too, with no field name.
268        assert!(
269            assign
270                .children
271                .iter()
272                .any(|c| c.r#type == "=" && c.field_name.is_none()),
273            "expected the `=` token child to carry no field name; got {:?}",
274            assign
275                .children
276                .iter()
277                .map(|c| (c.r#type, c.field_name))
278                .collect::<Vec<_>>(),
279        );
280    }
281
282    #[test]
283    fn rust_function_carries_name_and_body_field_names() {
284        // `function_item` names children `name`, `parameters`, `body`.
285        // Assert the field name directly on the AstNode so a bug that
286        // misnames a field (e.g. always emits "body") fails even if
287        // the target node kinds coincidentally line up.
288        let root =
289            build_ast::<crate::RustParser>(b"fn greet(name: &str) -> &str { name }", "test.rs");
290        let func = find_first(&root, "function_item").expect("expected a function_item node");
291        let name_child = find_child(func, "name").expect("function_item should have a name child");
292        assert_eq!(name_child.field_name, Some("name"));
293        assert_eq!(name_child.r#type, "identifier");
294        let params_child =
295            find_child(func, "parameters").expect("function_item should have a parameters child");
296        assert_eq!(params_child.field_name, Some("parameters"));
297        assert_eq!(params_child.r#type, "parameters");
298        let body_child = find_child(func, "body").expect("function_item should have a body child");
299        assert_eq!(body_child.field_name, Some("body"));
300        assert_eq!(body_child.r#type, "block");
301    }
302
303    #[test]
304    fn cpp_assignment_carries_left_and_right_field_names() {
305        // Cross-language confirmation: the C/C++ grammar uses the same
306        // `left`/`right` field names for `assignment_expression`.
307        let root =
308            build_ast::<crate::CppParser>(b"int main(){ int x = 0; x = x + 1; }", "test.cpp");
309        let assign = find_first(&root, "assignment_expression")
310            .expect("expected an assignment_expression node");
311        assert_eq!(
312            find_child(assign, "left").map(|n| n.r#type),
313            Some("identifier")
314        );
315        assert_eq!(
316            find_child(assign, "right").map(|n| n.r#type),
317            Some("binary_expression")
318        );
319    }
320
321    #[test]
322    fn serialized_json_includes_field_name_key() {
323        // Regression for the Serialize impl: every node must serialize
324        // a `FieldName` key (null or string). Verifying via JSON
325        // string-match catches accidental removal of the field from
326        // the serializer.
327        let root = build_ast::<crate::RustParser>(b"fn f(){ let a = 1; }", "test.rs");
328        let json = serde_json::to_string(&root).expect("serialize");
329        assert!(
330            json.contains("\"FieldName\""),
331            "FieldName missing from JSON: {json}"
332        );
333        // The let binding's `pattern` and `value` fields should both
334        // appear as string values in the JSON.
335        assert!(
336            json.contains("\"FieldName\":\"pattern\""),
337            "expected pattern field name; got {json}"
338        );
339        assert!(
340            json.contains("\"FieldName\":\"value\""),
341            "expected value field name; got {json}"
342        );
343    }
344}