vortex_array/expr/analysis/immediate_access.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use vortex_dtype::FieldName;
5use vortex_dtype::StructFields;
6use vortex_error::VortexExpect;
7use vortex_utils::aliases::hash_set::HashSet;
8
9use crate::expr::Expression;
10use crate::expr::analysis::AnnotationFn;
11use crate::expr::analysis::Annotations;
12use crate::expr::descendent_annotations;
13use crate::expr::exprs::get_item::GetItem;
14use crate::expr::exprs::root::Root;
15use crate::expr::exprs::select::Select;
16
17pub type FieldAccesses<'a> = Annotations<'a, FieldName>;
18
19/// Returns the "free fields" for this expression node.
20///
21/// A "free field" is a top-level field from the root scope that this expression references—not
22/// nested fields within those top-level fields. For example, `root().a.b` has free field `{a}`,
23/// not `{b}`, because `a` is the top-level field being accessed from root.
24///
25/// The term "free" is borrowed from PL theory's "free variables"—variables that reference an
26/// outer scope rather than being introduced locally.
27///
28/// This is useful for column pruning, where we only need to read the top-level fields that an
29/// expression actually touches.
30///
31/// # Annotation Rules
32///
33/// - **[`Select`]**: Returns the included field names if the child is [`Root`].
34/// - **[`GetItem`] on [`Root`]**: Returns `[field_name]` if the child is [`Root`].
35/// - **[`Root`]**: Returns all field names from `scope` (conservative over-approximation).
36/// - **Everything else**: Returns empty (annotations aggregate from children automatically).
37///
38/// # Example
39///
40/// Given `scope = {a: {b: .., c: ..}, d: ..}` and `expr = root().a.b + root().d`:
41/// - `root().a` has free fields `{a}`.
42/// - `root().d` has free fields `{d}`.
43/// - The full expression has free fields `{a, d}` (not `b`, only top-level fields are tracked).
44pub fn make_free_field_annotator(
45 scope: &StructFields,
46) -> impl AnnotationFn<Annotation = FieldName> {
47 move |expr: &Expression| {
48 if let Some(selection) = expr.as_opt::<Select>() {
49 if expr.child(0).is::<Root>() {
50 return selection
51 .normalize_to_included_fields(scope.names())
52 .vortex_expect("Select fields must be valid for scope")
53 .into_iter()
54 .collect();
55 }
56 } else if let Some(field_name) = expr.as_opt::<GetItem>() {
57 if expr.child(0).is::<Root>() {
58 return vec![field_name.clone()];
59 }
60 } else if expr.is::<Root>() {
61 return scope.names().iter().cloned().collect();
62 }
63
64 vec![]
65 }
66}
67
68/// For all subexpressions in an expression, find the fields that are accessed directly from the
69/// scope, but not any fields in those fields
70/// e.g. scope = {a: {b: .., c: ..}, d: ..}, expr = root().a.b + root().d accesses {a,d} (not b).
71///
72/// Note: This is a very naive, but simple analysis to find the fields that are accessed directly on an
73/// identity node. This is combined to provide an over-approximation of the fields that are accessed
74/// by an expression.
75pub fn immediate_scope_accesses<'a>(
76 expr: &'a Expression,
77 scope: &'a StructFields,
78) -> FieldAccesses<'a> {
79 descendent_annotations(expr, make_free_field_annotator(scope))
80}
81
82/// This returns the immediate scope_access (as explained `immediate_scope_accesses`) for `expr`.
83pub fn immediate_scope_access<'a>(
84 expr: &'a Expression,
85 scope: &'a StructFields,
86) -> HashSet<FieldName> {
87 immediate_scope_accesses(expr, scope)
88 .get(expr)
89 .vortex_expect("Expression missing from scope accesses, this is a internal bug")
90 .clone()
91}