Skip to main content

uni_query/query/
df_planner.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2024-2026 Dragonscale Team
3
4//! Hybrid physical planner for DataFusion integration.
5//!
6//! This module provides [`HybridPhysicalPlanner`], which converts Cypher's
7//! [`LogicalPlan`] into a DataFusion [`ExecutionPlan`] tree. The "hybrid" nature
8//! refers to the mix of:
9//!
10//! - **Custom graph operators**: `GraphScanExec`, `GraphTraverseExec`, `GraphShortestPathExec`
11//! - **Native DataFusion operators**: `FilterExec`, `AggregateExec`, `SortExec`, etc.
12//!
13//! # Architecture
14//!
15//! ```text
//! LogicalPlan (Cypher)
//!            │
//!            ▼
//! ┌───────────────────────┐
//! │ HybridPhysicalPlanner │
//! │                       │
//! │ Graph ops → Custom    │
//! │ Rel ops → DataFusion  │
//! └───────────────────────┘
//!            │
//!            ▼
//! ExecutionPlan (DataFusion)
28//! ```
29//!
30//! # Expression Translation
31//!
32//! Cypher expressions are translated to DataFusion expressions using
33//! [`cypher_expr_to_df`] from the `df_expr` module.
34
35use crate::query::df_expr::{TranslationContext, VariableKind, cypher_expr_to_df};
36use crate::query::df_graph::ReadSetRecordingExec;
37use crate::query::df_graph::bind_fixed_path::BindFixedPathExec;
38use crate::query::df_graph::bind_zero_length_path::BindZeroLengthPathExec;
39use crate::query::df_graph::mutation_common::{
40    MutationKind, extended_schema_for_new_vars, new_create_exec, new_merge_exec,
41};
42use crate::query::df_graph::mutation_delete::new_delete_exec;
43use crate::query::df_graph::mutation_remove::new_remove_exec;
44use crate::query::df_graph::mutation_set::new_set_exec;
45use crate::query::df_graph::recursive_cte::RecursiveCTEExec;
46use crate::query::df_graph::traverse::{
47    GraphVariableLengthTraverseExec, GraphVariableLengthTraverseMainExec,
48};
49use crate::query::df_graph::{
50    GraphApplyExec, GraphExecutionContext, GraphExtIdLookupExec, GraphProcedureCallExec,
51    GraphScanExec, GraphShortestPathExec, GraphTraverseExec, GraphTraverseMainExec,
52    GraphUnwindExec, GraphVectorKnnExec, L0Context, MutationContext, MutationExec,
53    OptionalFilterExec,
54};
55use crate::query::planner::{
56    LogicalPlan, STRUCT_ONLY_SENTINEL, aggregate_column_name, collect_properties_from_plan,
57};
58use anyhow::{Result, anyhow};
59use arrow_schema::{DataType, Schema, SchemaRef};
60use datafusion::common::JoinType;
61use datafusion::execution::SessionState;
62use datafusion::logical_expr::{Expr as DfExpr, ExprSchemable, SortExpr as DfSortExpr};
63use datafusion::physical_expr::{create_physical_expr, create_physical_sort_exprs};
64use datafusion::physical_plan::ExecutionPlan;
65use datafusion::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy};
66use datafusion::physical_plan::filter::FilterExec;
67use datafusion::physical_plan::joins::NestedLoopJoinExec;
68use datafusion::physical_plan::limit::LocalLimitExec;
69use datafusion::physical_plan::placeholder_row::PlaceholderRowExec;
70use datafusion::physical_plan::projection::ProjectionExec;
71use datafusion::physical_plan::sorts::sort::SortExec;
72use datafusion::physical_plan::udaf::AggregateFunctionExpr;
73use datafusion::physical_plan::union::UnionExec;
74use datafusion::prelude::SessionContext;
75use parking_lot::RwLock;
76use std::collections::{HashMap, HashSet};
77use std::sync::Arc;
78use std::sync::atomic::{AtomicU64, Ordering};
79use uni_algo::algo::AlgorithmRegistry;
80use uni_common::core::schema::{PropertyMeta, Schema as UniSchema};
81use uni_cypher::ast::{
82    CypherLiteral, Direction as AstDirection, Expr, Pattern, PatternElement, SortItem,
83};
84use uni_store::runtime::l0::L0Buffer;
85use uni_store::runtime::property_manager::PropertyManager;
86use uni_store::storage::direction::Direction;
87use uni_store::storage::manager::StorageManager;
88use uni_xervo::runtime::ModelRuntime;
89
/// An aggregate function expression paired with its optional filter.
///
/// Tuple layout:
/// - `.0` — the physical aggregate function expression.
/// - `.1` — an optional physical predicate; `None` means no filter is attached.
type PhysicalAggregate = (
    Arc<AggregateFunctionExpr>,
    Option<Arc<dyn datafusion::physical_expr::PhysicalExpr>>,
);
95
/// Hybrid physical planner that produces DataFusion ExecutionPlan trees.
///
/// Routes graph operations to custom `ExecutionPlan` implementations
/// and relational operations to native DataFusion operators.
///
/// Optional collaborators (algorithm / procedure / plugin registries, the
/// Xervo runtime, a writer handle, and the mutation context) are attached
/// after construction through the consuming `with_*` builder methods.
///
/// # Example
///
/// ```ignore
/// let planner = HybridPhysicalPlanner::new(
///     session_ctx,
///     storage,
///     l0,
///     property_manager,
///     schema,
///     params,
/// );
///
/// let execution_plan = planner.plan(&logical_plan)?;
/// ```
pub struct HybridPhysicalPlanner {
    /// DataFusion session context.
    session_ctx: Arc<RwLock<SessionContext>>,

    /// Storage manager for dataset access.
    storage: Arc<StorageManager>,

    /// Graph execution context for custom operators.
    ///
    /// Rebuilt (and re-wrapped in a fresh `Arc`) by the `with_*` builders
    /// that attach registries, a runtime, or a writer.
    graph_ctx: Arc<GraphExecutionContext>,

    /// Schema for label/edge type lookups.
    schema: Arc<UniSchema>,

    /// Last flush version for staleness detection.
    ///
    /// Initialised to `0` by both constructors.
    last_flush_version: AtomicU64,

    /// Query parameters for expression translation.
    params: HashMap<String, uni_common::Value>,

    /// Correlated outer values from Apply input rows (for subquery correlation).
    /// These take precedence over parameters during variable resolution to prevent
    /// YIELD columns from shadowing user query parameters.
    outer_values: HashMap<String, uni_common::Value>,

    /// Mutation context for write operations (CREATE, SET, REMOVE, DELETE).
    /// Present only when the query contains write clauses.
    mutation_ctx: Option<Arc<MutationContext>>,

    /// Entity variable names from outer scopes, threaded through for nested EXISTS
    /// so the expression compiler can distinguish fresh pattern bindings from
    /// correlated references.
    outer_entity_vars: HashSet<String>,

    /// Plugin registry used to resolve Locy aggregates (and other plugin
    /// surfaces) at plan time. Defaults to a process-wide registry pre-loaded
    /// with the built-ins from `uni-plugin-builtin`; replace with
    /// [`Self::with_plugin_registry`] to use a host-supplied registry.
    plugin_registry: Arc<uni_plugin::PluginRegistry>,
}
154
155impl std::fmt::Debug for HybridPhysicalPlanner {
156    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
157        f.debug_struct("HybridPhysicalPlanner")
158            .field(
159                "last_flush_version",
160                &self.last_flush_version.load(Ordering::Relaxed),
161            )
162            .finish_non_exhaustive()
163    }
164}
165
166impl HybridPhysicalPlanner {
167    /// Create a new hybrid physical planner.
168    ///
169    /// # Arguments
170    ///
171    /// * `session_ctx` - DataFusion session context
172    /// * `storage` - Storage manager for dataset access
173    /// * `l0` - Current L0 buffer for MVCC
174    /// * `property_manager` - Property manager for lazy loading
175    /// * `schema` - Uni schema for lookups
176    pub fn new(
177        session_ctx: Arc<RwLock<SessionContext>>,
178        storage: Arc<StorageManager>,
179        l0: Arc<RwLock<L0Buffer>>,
180        property_manager: Arc<PropertyManager>,
181        schema: Arc<UniSchema>,
182        params: HashMap<String, uni_common::Value>,
183    ) -> Self {
184        let graph_ctx = Arc::new(GraphExecutionContext::new(
185            storage.clone(),
186            l0,
187            property_manager,
188        ));
189
190        Self {
191            session_ctx,
192            storage,
193            graph_ctx,
194            schema,
195            last_flush_version: AtomicU64::new(0),
196            params,
197            outer_values: HashMap::new(),
198            mutation_ctx: None,
199            outer_entity_vars: HashSet::new(),
200            plugin_registry: super::df_graph::locy_fold::default_locy_plugin_registry(),
201        }
202    }
203
204    /// Replace the plugin registry used for Locy aggregate resolution.
205    ///
206    /// The default registry contains only the built-in aggregates from
207    /// `uni-plugin-builtin`. Hosts that have registered additional Locy
208    /// aggregates should pass their full [`uni_plugin::PluginRegistry`] here
209    /// so user-declared aggregates resolve at plan time.
210    #[must_use]
211    pub fn with_plugin_registry(
212        mut self,
213        plugin_registry: Arc<uni_plugin::PluginRegistry>,
214    ) -> Self {
215        // Also propagate into the GraphExecutionContext so the
216        // native-label plugin-storage dispatcher in
217        // `columnar_scan_vertex_batch_static` (M5h.2) can reach the
218        // registered `Storage` impls.
219        let mut ctx = self.take_graph_ctx();
220        ctx = ctx.with_plugin_registry(Arc::clone(&plugin_registry));
221        self.graph_ctx = Arc::new(ctx);
222        self.plugin_registry = plugin_registry;
223        self
224    }
225
226    /// Resolve the set of property names for `variable` from the collected plan properties.
227    ///
228    /// If the property set contains `"*"`, expands to all schema-defined properties
229    /// for `schema_name` (a label or edge type name). Otherwise filters out the
230    /// wildcard sentinel and returns the explicit property names.
231    fn resolve_properties(
232        &self,
233        variable: &str,
234        schema_name: &str,
235        all_properties: &HashMap<String, HashSet<String>>,
236    ) -> Vec<String> {
237        // System columns managed by the engine — never treat as user properties.
238        const SYSTEM_COLUMNS: &[&str] =
239            &["_vid", "_labels", "_eid", "_src_vid", "_dst_vid", "_type"];
240
241        all_properties
242            .get(variable)
243            .map(|props| {
244                if props.contains("*") {
245                    let schema_props: Vec<String> = self
246                        .schema
247                        .properties
248                        .get(schema_name)
249                        .map(|p| p.keys().cloned().collect())
250                        .unwrap_or_default();
251
252                    // Collect explicit property names (non-wildcard, non-internal).
253                    // System-managed columns surfaced through Cypher functions
254                    // (e.g. `_created_at`/`_updated_at` via `created_at(n)`/
255                    // `updated_at(n)`) are kept — they are intentionally
256                    // requested even when the wildcard is also set.
257                    let explicit: Vec<String> = props
258                        .iter()
259                        .filter(|p| {
260                            *p != "*"
261                                && *p != STRUCT_ONLY_SENTINEL
262                                && (!p.starts_with('_')
263                                    || matches!(p.as_str(), "_created_at" | "_updated_at"))
264                        })
265                        .cloned()
266                        .collect();
267
268                    if schema_props.is_empty() && explicit.is_empty() {
269                        // Structural-only access, no specific properties needed
270                        return vec!["*".to_string()];
271                    }
272
273                    // Merge schema props + explicit props, dedup
274                    let mut combined: Vec<String> = schema_props;
275                    for p in explicit {
276                        if !combined.contains(&p) {
277                            combined.push(p);
278                        }
279                    }
280                    combined.retain(|p| !SYSTEM_COLUMNS.contains(&p.as_str()));
281                    combined.sort();
282                    combined
283                } else {
284                    // Sentinel-only or no structural marker: return the explicit
285                    // properties without schema expansion. The sentinel itself
286                    // is filtered. Structural projection is still applied
287                    // downstream via the `need_full` gate (which accepts the
288                    // sentinel) — it just builds a smaller struct.
289                    let mut explicit_props: Vec<String> = props
290                        .iter()
291                        .filter(|p| {
292                            *p != "*"
293                                && *p != STRUCT_ONLY_SENTINEL
294                                && !SYSTEM_COLUMNS.contains(&p.as_str())
295                        })
296                        .cloned()
297                        .collect();
298                    explicit_props.sort();
299                    explicit_props
300                }
301            })
302            .unwrap_or_default()
303    }
304
305    /// Create planner with full L0 context.
306    pub fn with_l0_context(
307        session_ctx: Arc<RwLock<SessionContext>>,
308        storage: Arc<StorageManager>,
309        l0_context: L0Context,
310        property_manager: Arc<PropertyManager>,
311        schema: Arc<UniSchema>,
312        params: HashMap<String, uni_common::Value>,
313        outer_values: HashMap<String, uni_common::Value>,
314    ) -> Self {
315        let graph_ctx = Arc::new(GraphExecutionContext::with_l0_context(
316            storage.clone(),
317            l0_context,
318            property_manager,
319        ));
320
321        Self {
322            session_ctx,
323            storage,
324            graph_ctx,
325            schema,
326            last_flush_version: AtomicU64::new(0),
327            params,
328            outer_values,
329            mutation_ctx: None,
330            outer_entity_vars: HashSet::new(),
331            plugin_registry: super::df_graph::locy_fold::default_locy_plugin_registry(),
332        }
333    }
334
335    /// Unwrap the inner `GraphExecutionContext` from its `Arc`, preserving all
336    /// existing registries. If other Arc references exist, clones the base context
337    /// and re-attaches the saved registries.
338    fn take_graph_ctx(&mut self) -> GraphExecutionContext {
339        let algo_registry = self.graph_ctx.algo_registry().cloned();
340        let procedure_registry = self.graph_ctx.procedure_registry().cloned();
341        let xervo_runtime = self.graph_ctx.xervo_runtime().cloned();
342        let plugin_registry = self.graph_ctx.plugin_registry().cloned();
343        let writer = self.graph_ctx.writer().cloned();
344
345        let new_base = |ctx: &Arc<GraphExecutionContext>| {
346            GraphExecutionContext::with_l0_context(
347                ctx.storage().clone(),
348                ctx.l0_context().clone(),
349                ctx.property_manager().clone(),
350            )
351        };
352        let placeholder = Arc::new(new_base(&self.graph_ctx));
353        let arc = std::mem::replace(&mut self.graph_ctx, placeholder);
354        let mut ctx = Arc::try_unwrap(arc).unwrap_or_else(|arc| new_base(&arc));
355
356        if let Some(registry) = algo_registry {
357            ctx = ctx.with_algo_registry(registry);
358        }
359        if let Some(registry) = procedure_registry {
360            ctx = ctx.with_procedure_registry(registry);
361        }
362        if let Some(runtime) = xervo_runtime {
363            ctx = ctx.with_xervo_runtime(runtime);
364        }
365        if let Some(registry) = plugin_registry {
366            ctx = ctx.with_plugin_registry(registry);
367        }
368        if let Some(w) = writer {
369            ctx = ctx.with_writer(w);
370        }
371        ctx
372    }
373
374    /// Attach the outer transaction's writer handle so declared
375    /// `WRITE`-mode procedures invoked through this plan can run
376    /// their Cypher bodies via the write-enabled inner-query host
377    /// (FU-1 / M11 #6).
378    #[must_use]
379    pub fn with_writer(mut self, writer: Arc<uni_store::Writer>) -> Self {
380        let ctx = self.take_graph_ctx().with_writer(writer);
381        self.graph_ctx = Arc::new(ctx);
382        self
383    }
384
385    /// Set the algorithm registry for `uni.algo.*` procedure dispatch.
386    ///
387    /// Rebuilds the inner `GraphExecutionContext` with the registry attached.
388    /// Set outer entity variable names for nested EXISTS correlated reference detection.
389    pub fn set_outer_entity_vars(&mut self, vars: HashSet<String>) {
390        self.outer_entity_vars = vars;
391    }
392
393    pub fn with_algo_registry(mut self, registry: Arc<AlgorithmRegistry>) -> Self {
394        let ctx = self.take_graph_ctx().with_algo_registry(registry);
395        self.graph_ctx = Arc::new(ctx);
396        self
397    }
398
399    /// Set the external procedure registry for test/user-defined procedures.
400    ///
401    /// Rebuilds the inner `GraphExecutionContext` with the registry attached.
402    pub fn with_procedure_registry(
403        mut self,
404        registry: Arc<crate::query::executor::procedure::ProcedureRegistry>,
405    ) -> Self {
406        let ctx = self.take_graph_ctx().with_procedure_registry(registry);
407        self.graph_ctx = Arc::new(ctx);
408        self
409    }
410
411    /// Set Uni-Xervo runtime used by query-time vector auto-embedding.
412    pub fn with_xervo_runtime(mut self, runtime: Arc<ModelRuntime>) -> Self {
413        let ctx = self.take_graph_ctx().with_xervo_runtime(runtime);
414        self.graph_ctx = Arc::new(ctx);
415        self
416    }
417
418    /// Set the mutation context for write operations.
419    pub fn with_mutation_context(mut self, ctx: Arc<MutationContext>) -> Self {
420        self.mutation_ctx = Some(ctx);
421        self
422    }
423
    /// Return the graph execution context (for columnar subplan execution).
    ///
    /// Borrows the planner's `Arc`; clone it to obtain an owned handle.
    pub fn graph_ctx(&self) -> &Arc<GraphExecutionContext> {
        &self.graph_ctx
    }
428
    /// Return the DataFusion session context (for columnar subplan execution).
    ///
    /// Borrows the planner's `Arc`; the context itself sits behind an `RwLock`.
    pub fn session_ctx(&self) -> &Arc<RwLock<SessionContext>> {
        &self.session_ctx
    }
433
    /// Return the storage manager (for columnar subplan execution).
    ///
    /// Borrows the planner's `Arc`; clone it to obtain an owned handle.
    pub fn storage(&self) -> &Arc<StorageManager> {
        &self.storage
    }
438
    /// Return the schema (for columnar subplan execution).
    ///
    /// This is the Uni schema used for label / edge type lookups, not an
    /// Arrow schema.
    pub fn schema_info(&self) -> &Arc<UniSchema> {
        &self.schema
    }
443
444    /// Get the mutation context, returning an error if not set.
445    fn require_mutation_ctx(&self) -> Result<Arc<MutationContext>> {
446        self.mutation_ctx.clone().ok_or_else(|| {
447            tracing::error!(
448                "Mutation context not set — this indicates a routing bug where a write \
449                 operation was sent to the DataFusion engine without a MutationContext"
450            );
451            anyhow!("Mutation context not set — write operations require a MutationContext")
452        })
453    }
454
455    /// Build a `TranslationContext` with variable kinds collected from a LogicalPlan.
456    ///
457    /// This is used for expression translation in filters, projections, etc.
458    /// where bare variable references need to resolve to identity columns.
459    fn translation_context_for_plan(&self, plan: &LogicalPlan) -> TranslationContext {
460        let mut variable_kinds = HashMap::new();
461        let mut variable_labels = HashMap::new();
462        let mut node_variable_hints = Vec::new();
463        let mut mutation_edge_hints = Vec::new();
464        collect_variable_kinds(plan, &mut variable_kinds);
465        collect_mutation_node_hints(plan, &mut node_variable_hints);
466        collect_mutation_edge_hints(plan, &mut mutation_edge_hints);
467        self.collect_variable_labels(plan, &mut variable_labels);
468        TranslationContext {
469            parameters: self.params.clone(),
470            outer_values: self.outer_values.clone(),
471            variable_labels,
472            variable_kinds,
473            node_variable_hints,
474            mutation_edge_hints,
475            ..Default::default()
476        }
477    }
478
479    /// Recursively collect variable-to-label/type mappings from a `LogicalPlan`.
480    ///
481    /// For node variables, maps to the first label name. For edge variables, maps
482    /// to the edge type name (when a single type is known). This is used by
483    /// `type(r)` to resolve the edge type as a string literal.
484    fn collect_variable_labels(&self, plan: &LogicalPlan, labels: &mut HashMap<String, String>) {
485        match plan {
486            LogicalPlan::Scan {
487                variable,
488                labels: scan_labels,
489                ..
490            }
491            | LogicalPlan::ScanMainByLabels {
492                variable,
493                labels: scan_labels,
494                ..
495            } => {
496                if let Some(first) = scan_labels.first() {
497                    labels.insert(variable.clone(), first.clone());
498                }
499            }
500            LogicalPlan::Traverse {
501                input,
502                step_variable,
503                edge_type_ids,
504                target_variable,
505                target_label_id,
506                ..
507            } => {
508                self.collect_variable_labels(input, labels);
509                if let Some(sv) = step_variable
510                    && edge_type_ids.len() == 1
511                    && let Some(name) = self.schema.edge_type_name_by_id(edge_type_ids[0])
512                {
513                    labels.insert(sv.clone(), name.to_string());
514                }
515                if *target_label_id != 0
516                    && let Some(name) = self.schema.label_name_by_id(*target_label_id)
517                {
518                    labels.insert(target_variable.clone(), name.to_string());
519                }
520            }
521            LogicalPlan::TraverseMainByType {
522                input,
523                step_variable,
524                type_names,
525                ..
526            } => {
527                self.collect_variable_labels(input, labels);
528                if let Some(sv) = step_variable
529                    && type_names.len() == 1
530                {
531                    labels.insert(sv.clone(), type_names[0].clone());
532                }
533            }
534            // Wrapper nodes: recurse into input(s)
535            LogicalPlan::Filter { input, .. }
536            | LogicalPlan::Project { input, .. }
537            | LogicalPlan::Sort { input, .. }
538            | LogicalPlan::Limit { input, .. }
539            | LogicalPlan::Aggregate { input, .. }
540            | LogicalPlan::Distinct { input, .. }
541            | LogicalPlan::Window { input, .. }
542            | LogicalPlan::Unwind { input, .. }
543            | LogicalPlan::Create { input, .. }
544            | LogicalPlan::CreateBatch { input, .. }
545            | LogicalPlan::Merge { input, .. }
546            | LogicalPlan::Set { input, .. }
547            | LogicalPlan::Remove { input, .. }
548            | LogicalPlan::Delete { input, .. }
549            | LogicalPlan::Foreach { input, .. }
550            | LogicalPlan::SubqueryCall { input, .. } => {
551                self.collect_variable_labels(input, labels);
552            }
553            LogicalPlan::Union { left, right, .. } | LogicalPlan::CrossJoin { left, right, .. } => {
554                self.collect_variable_labels(left, labels);
555                self.collect_variable_labels(right, labels);
556            }
557            LogicalPlan::Apply {
558                input, subquery, ..
559            } => {
560                self.collect_variable_labels(input, labels);
561                self.collect_variable_labels(subquery, labels);
562            }
563            LogicalPlan::Explain { plan } => {
564                self.collect_variable_labels(plan, labels);
565            }
566            _ => {}
567        }
568    }
569
    /// Look up property metadata for a set of edge types.
    ///
    /// Thin wrapper that forwards the planner's schema and `edge_type_ids` to
    /// `df_graph::common::merged_edge_schema_props`.
    // NOTE(review): presumably the helper unions the properties declared by
    // each listed edge type — confirm merge/conflict rules against the helper.
    fn merged_edge_type_properties(&self, edge_type_ids: &[u32]) -> HashMap<String, PropertyMeta> {
        crate::query::df_graph::common::merged_edge_schema_props(&self.schema, edge_type_ids)
    }
573
574    /// Plan a logical plan into an execution plan.
575    ///
576    /// # Arguments
577    ///
578    /// * `logical` - The logical plan to convert
579    ///
580    /// # Returns
581    ///
582    /// DataFusion ExecutionPlan ready for execution.
583    ///
584    /// # Errors
585    ///
586    /// Returns an error if planning fails (unsupported operation, schema mismatch, etc.)
587    pub fn plan(&self, logical: &LogicalPlan) -> Result<Arc<dyn ExecutionPlan>> {
588        // Pre-pass: lift UNWIND-correlated IN-list filters into the scan
589        // subtrees of any Filter(CrossJoin(L, R)) shapes. Runs as a pure
590        // logical-plan rewrite *before* any physical-plan optimization
591        // (HashJoin, VidLookupJoin, etc.) so the scan-side filters
592        // survive any downstream optimization bailout. See
593        // `merge_unwind_in_filters` for the rationale.
594        let logical_rewritten = merge_unwind_in_filters(logical, &self.params);
595
596        // Collect all properties needed anywhere in the plan tree
597        let all_properties = collect_properties_from_plan(&logical_rewritten);
598
599        // Delegate to internal planning with properties context
600        self.plan_internal(&logical_rewritten, &all_properties)
601    }
602
603    /// Plan a LogicalPlan with additional property requirements.
604    ///
605    /// Merges `extra_properties` into the auto-collected properties from the plan tree.
606    /// Used by MERGE execution to ensure structural projections are applied for
607    /// variables that need full node/edge Maps in the output.
608    pub fn plan_with_properties(
609        &self,
610        logical: &LogicalPlan,
611        extra_properties: HashMap<String, HashSet<String>>,
612    ) -> Result<Arc<dyn ExecutionPlan>> {
613        // Same pre-pass as `plan()` — see commentary there.
614        let logical_rewritten = merge_unwind_in_filters(logical, &self.params);
615        let mut all_properties = collect_properties_from_plan(&logical_rewritten);
616        for (var, props) in extra_properties {
617            all_properties.entry(var).or_default().extend(props);
618        }
619        self.plan_internal(&logical_rewritten, &all_properties)
620    }
621
622    /// Wrap a plan with optional semantics.
623    ///
624    /// If optional is true, performs a Left Outer Join with a single-row source (PlaceholderRow)
625    /// to ensure at least one row (of NULLs) is returned if the input is empty.
626    ///
627    /// Conceptually: SELECT * FROM (SELECT 1) LEFT JOIN Plan ON true
628    fn wrap_optional(
629        &self,
630        plan: Arc<dyn ExecutionPlan>,
631        optional: bool,
632    ) -> Result<Arc<dyn ExecutionPlan>> {
633        if !optional {
634            return Ok(plan);
635        }
636
637        // Create a single-row source
638        let empty_schema = Arc::new(Schema::empty());
639        let placeholder = Arc::new(PlaceholderRowExec::new(empty_schema));
640
641        // Use NestedLoopJoin with Left Outer Join type
642        // This ensures if 'plan' is empty, we get 1 row with all NULLs
643        Ok(Arc::new(NestedLoopJoinExec::try_new(
644            placeholder,
645            plan,
646            None, // No filter
647            &JoinType::Left,
648            None, // No projection
649        )?))
650    }
651
652    fn plan_internal(
653        &self,
654        logical: &LogicalPlan,
655        all_properties: &HashMap<String, HashSet<String>>,
656    ) -> Result<Arc<dyn ExecutionPlan>> {
657        match logical {
658            // === Graph Operations ===
659            // Phase 5b followup: `FusedIndexScanWrapped` is a
660            // planner-side observability wrapper around lossy
661            // operators (VectorKnn, InvertedIndexLookup). The
662            // runtime fusion happens at the `BranchedBackend`
663            // layer via Lance per-branch reads; the physical
664            // planner just unwraps and recurses on the inner node.
665            LogicalPlan::FusedIndexScanWrapped { inner, kind: _ } => {
666                self.plan_internal(inner, all_properties)
667            }
668            LogicalPlan::Scan {
669                label_id,
670                labels,
671                variable,
672                filter,
673                optional,
674            }
675            // Phase 5a-impl Step 3: decay `FusedIndexScan` to a plain
676            // `Scan` for now — preserves correctness because Lance's
677            // `base_paths` chain already covers parent-inherited
678            // indexes for forked sessions. Step 4 (VidUid) and
679            // beyond replace this fallback with type-specific fused
680            // physical operators.
681            | LogicalPlan::FusedIndexScan {
682                label_id,
683                labels,
684                variable,
685                filter,
686                optional,
687                kind: _,
688            } => {
689                if labels.len() > 1 {
690                    // Multi-label: use main table with intersection semantics
691                    self.plan_multi_label_scan(
692                        labels,
693                        variable,
694                        filter.as_ref(),
695                        *optional,
696                        all_properties,
697                    )
698                } else {
699                    // Single-label: use per-label table
700                    self.plan_scan(
701                        *label_id,
702                        variable,
703                        filter.as_ref(),
704                        *optional,
705                        all_properties,
706                    )
707                }
708            }
709
710            // ScanMainByLabels is now supported via schemaless scan
711            LogicalPlan::ScanMainByLabels {
712                labels,
713                variable,
714                filter,
715                optional,
716            } => {
717                if labels.len() > 1 {
718                    // Multi-label schemaless scan
719                    self.plan_multi_label_scan(
720                        labels,
721                        variable,
722                        filter.as_ref(),
723                        *optional,
724                        all_properties,
725                    )
726                } else if let Some(label_name) = labels.first() {
727                    // Single label schemaless scan
728                    self.plan_schemaless_scan(
729                        label_name,
730                        variable,
731                        filter.as_ref(),
732                        *optional,
733                        all_properties,
734                    )
735                } else {
736                    // Empty labels - should not happen, fallback to scan all
737                    self.plan_scan_all(variable, filter.as_ref(), *optional, all_properties)
738                }
739            }
740
741            // ScanAll is now supported via schemaless scan with empty label
742            LogicalPlan::ScanAll {
743                variable,
744                filter,
745                optional,
746            } => self.plan_scan_all(variable, filter.as_ref(), *optional, all_properties),
747
748            // TraverseMainByType is now supported via schemaless traversal
749            LogicalPlan::TraverseMainByType {
750                type_names,
751                input,
752                direction,
753                source_variable,
754                target_variable,
755                step_variable,
756                min_hops,
757                max_hops,
758                optional,
759                target_filter,
760                path_variable,
761                is_variable_length,
762                scope_match_variables,
763                optional_pattern_vars,
764                edge_filter_expr,
765                path_mode,
766                ..
767            } => {
768                if *is_variable_length {
769                    let vlp_plan = self.plan_traverse_main_by_type_vlp(
770                        input,
771                        type_names,
772                        direction.clone(),
773                        source_variable,
774                        target_variable,
775                        step_variable.as_deref(),
776                        *min_hops,
777                        *max_hops,
778                        path_variable.as_deref(),
779                        *optional,
780                        all_properties,
781                        edge_filter_expr.as_ref(),
782                        path_mode,
783                        scope_match_variables,
784                    )?;
785                    self.apply_schemaless_traverse_filter(
786                        vlp_plan,
787                        target_filter.as_ref(),
788                        source_variable,
789                        target_variable,
790                        step_variable.as_deref(),
791                        path_variable.as_deref(),
792                        true, // is_variable_length
793                        *optional,
794                        optional_pattern_vars,
795                    )
796                } else {
797                    let base_plan = self.plan_traverse_main_by_type(
798                        input,
799                        type_names,
800                        direction.clone(),
801                        source_variable,
802                        target_variable,
803                        step_variable.as_deref(),
804                        *optional,
805                        optional_pattern_vars,
806                        all_properties,
807                        scope_match_variables,
808                    )?;
809                    // Apply edge property filter first, then target node filter.
810                    // Without the target_filter, MATCH (a)-[r]->(b {prop: val}) SET r.x
811                    // would apply SET to ALL edges from a, ignoring b's properties.
812                    let edge_filtered = self.apply_schemaless_traverse_filter(
813                        base_plan,
814                        edge_filter_expr.as_ref(),
815                        source_variable,
816                        target_variable,
817                        step_variable.as_deref(),
818                        path_variable.as_deref(),
819                        false,
820                        *optional,
821                        optional_pattern_vars,
822                    )?;
823                    self.apply_schemaless_traverse_filter(
824                        edge_filtered,
825                        target_filter.as_ref(),
826                        source_variable,
827                        target_variable,
828                        step_variable.as_deref(),
829                        path_variable.as_deref(),
830                        false,
831                        *optional,
832                        optional_pattern_vars,
833                    )
834                }
835            }
836
837            LogicalPlan::Traverse {
838                input,
839                edge_type_ids,
840                direction,
841                source_variable,
842                target_variable,
843                target_label_id,
844                step_variable,
845                min_hops,
846                max_hops,
847                optional,
848                target_filter,
849                path_variable,
850                is_variable_length,
851                optional_pattern_vars,
852                scope_match_variables,
853                edge_filter_expr,
854                path_mode,
855                qpp_steps,
856                ..
857            } => self.plan_traverse(
858                input,
859                edge_type_ids,
860                direction.clone(),
861                source_variable,
862                target_variable,
863                *target_label_id,
864                step_variable.as_deref(),
865                *min_hops,
866                *max_hops,
867                path_variable.as_deref(),
868                *optional,
869                target_filter.as_ref(),
870                *is_variable_length,
871                optional_pattern_vars,
872                all_properties,
873                scope_match_variables,
874                edge_filter_expr.as_ref(),
875                path_mode,
876                qpp_steps.as_deref(),
877            ),
878
879            LogicalPlan::ShortestPath {
880                input,
881                edge_type_ids,
882                direction,
883                source_variable,
884                target_variable,
885                target_label_id: _,
886                path_variable,
887                min_hops: _,
888                max_hops: _,
889            } => self.plan_shortest_path(
890                input,
891                edge_type_ids,
892                direction.clone(),
893                source_variable,
894                target_variable,
895                path_variable,
896                false,
897                all_properties,
898            ),
899
900            // === Relational Operations ===
901            LogicalPlan::Filter {
902                input,
903                predicate,
904                optional_variables,
905            } => self.plan_filter(input, predicate, optional_variables, all_properties),
906
907            LogicalPlan::Project { input, projections } => {
908                // Build alias map for ORDER BY alias resolution
909                // When plan is Project(Limit(Sort(...))), Sort needs to know aliases
910                let alias_map: HashMap<String, Expr> = projections
911                    .iter()
912                    .filter_map(|(expr, alias)| alias.as_ref().map(|a| (a.clone(), expr.clone())))
913                    .collect();
914
915                // Check if the input chain contains a Sort and pass alias map
916                self.plan_project_with_aliases(input, projections, all_properties, &alias_map)
917            }
918
919            LogicalPlan::Aggregate {
920                input,
921                group_by,
922                aggregates,
923            } => self.plan_aggregate(input, group_by, aggregates, all_properties),
924
925            LogicalPlan::Distinct { input } => {
926                let input_plan = self.plan_internal(input, all_properties)?;
927                let schema = input_plan.schema();
928                // Group by all columns with no aggregates = deduplication
929                let group_exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> =
930                    schema
931                        .fields()
932                        .iter()
933                        .enumerate()
934                        .map(|(i, f)| {
935                            (
936                                Arc::new(datafusion::physical_expr::expressions::Column::new(
937                                    f.name(),
938                                    i,
939                                ))
940                                    as Arc<dyn datafusion::physical_expr::PhysicalExpr>,
941                                f.name().clone(),
942                            )
943                        })
944                        .collect();
945                let group_by = PhysicalGroupBy::new_single(group_exprs);
946                Ok(Arc::new(AggregateExec::try_new(
947                    AggregateMode::Single,
948                    group_by,
949                    vec![],
950                    vec![],
951                    input_plan.clone(),
952                    input_plan.schema(),
953                )?))
954            }
955
956            LogicalPlan::Sort { input, order_by } => {
957                self.plan_sort(input, order_by, all_properties, &HashMap::new())
958            }
959
960            LogicalPlan::Limit { input, skip, fetch } => {
961                self.plan_limit(input, *skip, *fetch, all_properties)
962            }
963
964            LogicalPlan::Union { left, right, all } => {
965                self.plan_union(left, right, *all, all_properties)
966            }
967
968            LogicalPlan::Empty => self.plan_empty(),
969
970            LogicalPlan::BindZeroLengthPath {
971                input,
972                node_variable,
973                path_variable,
974            } => {
975                self.plan_bind_zero_length_path(input, node_variable, path_variable, all_properties)
976            }
977
978            LogicalPlan::BindPath {
979                input,
980                node_variables,
981                edge_variables,
982                path_variable,
983            } => self.plan_bind_path(
984                input,
985                node_variables,
986                edge_variables,
987                path_variable,
988                all_properties,
989            ),
990
991            // === Mutation operators ===
992            LogicalPlan::Create { input, pattern } => {
993                tracing::debug!("Planning MutationCreateExec");
994                let child = self.plan_internal(input, all_properties)?;
995                let mutation_ctx = self.require_mutation_ctx()?;
996                Ok(Arc::new(new_create_exec(
997                    child,
998                    pattern.clone(),
999                    mutation_ctx,
1000                )))
1001            }
1002            LogicalPlan::CreateBatch { input, patterns } => {
1003                tracing::debug!(
1004                    patterns = patterns.len(),
1005                    "Planning MutationCreateExec (batch)"
1006                );
1007                let child = self.plan_internal(input, all_properties)?;
1008                let mutation_ctx = self.require_mutation_ctx()?;
1009                // Use a single MutationExec with CreateBatch to avoid N nested
1010                // operators (which cause stack overflow for large N).
1011                let output_schema = extended_schema_for_new_vars(&child.schema(), patterns);
1012                Ok(Arc::new(MutationExec::new_with_schema(
1013                    child,
1014                    MutationKind::CreateBatch {
1015                        patterns: patterns.clone(),
1016                    },
1017                    "MutationCreateExec",
1018                    mutation_ctx,
1019                    output_schema,
1020                )))
1021            }
1022            LogicalPlan::Set { input, items } => {
1023                tracing::debug!(items = items.len(), "Planning MutationSetExec");
1024                let child = self.plan_internal(input, all_properties)?;
1025                let mutation_ctx = self.require_mutation_ctx()?;
1026                Ok(Arc::new(new_set_exec(child, items.clone(), mutation_ctx)))
1027            }
1028            LogicalPlan::Remove { input, items } => {
1029                tracing::debug!(items = items.len(), "Planning MutationRemoveExec");
1030                let child = self.plan_internal(input, all_properties)?;
1031                let mutation_ctx = self.require_mutation_ctx()?;
1032                Ok(Arc::new(new_remove_exec(
1033                    child,
1034                    items.clone(),
1035                    mutation_ctx,
1036                )))
1037            }
1038            LogicalPlan::Delete {
1039                input,
1040                items,
1041                detach,
1042            } => {
1043                tracing::debug!(
1044                    items = items.len(),
1045                    detach = detach,
1046                    "Planning MutationDeleteExec"
1047                );
1048                let child = self.plan_internal(input, all_properties)?;
1049                let mutation_ctx = self.require_mutation_ctx()?;
1050                Ok(Arc::new(new_delete_exec(
1051                    child,
1052                    items.clone(),
1053                    *detach,
1054                    mutation_ctx,
1055                )))
1056            }
1057            LogicalPlan::Merge {
1058                input,
1059                pattern,
1060                on_match,
1061                on_create,
1062            } => {
1063                tracing::debug!("Planning MutationMergeExec");
1064                let child = self.plan_internal(input, all_properties)?;
1065                let mutation_ctx = self.require_mutation_ctx()?;
1066                Ok(Arc::new(new_merge_exec(
1067                    child,
1068                    pattern.clone(),
1069                    on_match.clone(),
1070                    on_create.clone(),
1071                    mutation_ctx,
1072                )))
1073            }
1074
1075            LogicalPlan::Window {
1076                input,
1077                window_exprs,
1078            } => {
1079                let input_plan = self.plan_internal(input, all_properties)?;
1080                if !window_exprs.is_empty() {
1081                    self.plan_window_functions(input_plan, window_exprs, Some(input.as_ref()))
1082                } else {
1083                    Ok(input_plan)
1084                }
1085            }
1086
1087            LogicalPlan::CrossJoin { left, right } => {
1088                let left_plan = self.plan_internal(left, all_properties)?;
1089                let right_plan = self.plan_internal(right, all_properties)?;
1090
1091                // For Locy IS-ref joins (graph scan × derived scan), strip structural
1092                // projection columns (Struct-typed bare variable columns like "a", "b")
1093                // from the graph scan output that conflict with derived scan column names.
1094                // Non-conflicting struct columns (e.g., edge "e") are preserved for
1095                // typed property access.
1096                let left_plan = if matches!(right.as_ref(), LogicalPlan::LocyDerivedScan { .. }) {
1097                    let derived_schema = right_plan.schema();
1098                    let derived_names: HashSet<&str> = derived_schema
1099                        .fields()
1100                        .iter()
1101                        .map(|f| f.name().as_str())
1102                        .collect();
1103                    strip_conflicting_structural_columns(left_plan, &derived_names)?
1104                } else {
1105                    left_plan
1106                };
1107
1108                Ok(Arc::new(
1109                    datafusion::physical_plan::joins::CrossJoinExec::new(left_plan, right_plan),
1110                ))
1111            }
1112
1113            LogicalPlan::Apply {
1114                input,
1115                subquery,
1116                input_filter,
1117            } => self.plan_apply(input, subquery, input_filter.as_ref(), all_properties),
1118
1119            LogicalPlan::Unwind {
1120                input,
1121                expr,
1122                variable,
1123            } => self.plan_unwind(
1124                input.as_ref().clone(),
1125                expr.clone(),
1126                variable.clone(),
1127                all_properties,
1128            ),
1129
1130            LogicalPlan::VectorKnn {
1131                label_id,
1132                variable,
1133                property,
1134                query,
1135                k,
1136                threshold,
1137            } => self.plan_vector_knn(
1138                *label_id,
1139                variable,
1140                property,
1141                query.clone(),
1142                *k,
1143                *threshold,
1144                all_properties,
1145            ),
1146
1147            LogicalPlan::InvertedIndexLookup { .. } => Err(anyhow!(
1148                "Full-text search not yet supported in DataFusion engine"
1149            )),
1150
1151            LogicalPlan::AllShortestPaths {
1152                input,
1153                edge_type_ids,
1154                direction,
1155                source_variable,
1156                target_variable,
1157                target_label_id: _,
1158                path_variable,
1159                min_hops: _,
1160                max_hops: _,
1161            } => self.plan_shortest_path(
1162                input,
1163                edge_type_ids,
1164                direction.clone(),
1165                source_variable,
1166                target_variable,
1167                path_variable,
1168                true,
1169                all_properties,
1170            ),
1171
1172            LogicalPlan::QuantifiedPattern { .. } => Err(anyhow!(
1173                "Quantified patterns not yet supported in DataFusion engine"
1174            )),
1175
1176            LogicalPlan::RecursiveCTE {
1177                cte_name,
1178                initial,
1179                recursive,
1180            } => self.plan_recursive_cte(cte_name, initial, recursive, all_properties),
1181
1182            LogicalPlan::ProcedureCall {
1183                procedure_name,
1184                arguments,
1185                yield_items,
1186            } => self.plan_procedure_call(procedure_name, arguments, yield_items, all_properties),
1187
1188            LogicalPlan::SubqueryCall { input, subquery } => {
1189                self.plan_apply(input, subquery, None, all_properties)
1190            }
1191
1192            LogicalPlan::ExtIdLookup {
1193                variable,
1194                ext_id,
1195                filter,
1196                optional,
1197            } => self.plan_ext_id_lookup(variable, ext_id, filter.as_ref(), *optional),
1198
1199            LogicalPlan::Foreach {
1200                input,
1201                variable,
1202                list,
1203                body,
1204            } => {
1205                tracing::debug!(variable = variable.as_str(), "Planning ForeachExec");
1206                let child = self.plan_internal(input, all_properties)?;
1207                let mutation_ctx = self.require_mutation_ctx()?;
1208                Ok(Arc::new(
1209                    super::df_graph::mutation_foreach::ForeachExec::new(
1210                        child,
1211                        variable.clone(),
1212                        list.clone(),
1213                        body.clone(),
1214                        mutation_ctx,
1215                    ),
1216                ))
1217            }
1218
1219            // Locy standalone operators
1220            LogicalPlan::LocyPriority { input, key_columns } => {
1221                let child = self.plan_internal(input, all_properties)?;
1222                let key_indices = resolve_column_indices(&child.schema(), key_columns)?;
1223                let priority_col_index = child.schema().index_of("__priority").map_err(|_| {
1224                    anyhow::anyhow!("LocyPriority input must contain __priority column")
1225                })?;
1226                Ok(Arc::new(super::df_graph::locy_priority::PriorityExec::new(
1227                    child,
1228                    key_indices,
1229                    priority_col_index,
1230                )))
1231            }
1232
1233            LogicalPlan::LocyBestBy {
1234                input,
1235                key_columns,
1236                criteria,
1237            } => {
1238                let child = self.plan_internal(input, all_properties)?;
1239                let key_indices = resolve_column_indices(&child.schema(), key_columns)?;
1240                let sort_criteria = resolve_best_by_criteria(&child.schema(), criteria)?;
1241                Ok(Arc::new(super::df_graph::locy_best_by::BestByExec::new(
1242                    child,
1243                    key_indices,
1244                    sort_criteria,
1245                    true, // LocyBestBy logical plan always uses deterministic ordering
1246                )))
1247            }
1248
1249            LogicalPlan::LocyFold {
1250                input,
1251                key_columns,
1252                fold_bindings,
1253                strict_probability_domain,
1254                probability_epsilon,
1255            } => {
1256                let child = self.plan_internal(input, all_properties)?;
1257                let key_indices = resolve_column_indices(&child.schema(), key_columns)?;
1258                let bindings =
1259                    resolve_fold_bindings(&child.schema(), fold_bindings, &self.plugin_registry)?;
1260                Ok(Arc::new(super::df_graph::locy_fold::FoldExec::new(
1261                    child,
1262                    key_indices,
1263                    bindings,
1264                    *strict_probability_domain,
1265                    *probability_epsilon,
1266                )))
1267            }
1268
1269            LogicalPlan::LocyDerivedScan {
1270                scan_index: _,
1271                data,
1272                schema,
1273            } => Ok(Arc::new(
1274                super::df_graph::locy_fixpoint::DerivedScanExec::new(
1275                    Arc::clone(data),
1276                    Arc::clone(schema),
1277                ),
1278            )),
1279
1280            LogicalPlan::LocyProject {
1281                input,
1282                projections,
1283                target_types,
1284            } => self.plan_locy_project(input, projections, target_types, all_properties),
1285
1286            LogicalPlan::LocyModelInvoke {
1287                input,
1288                invocations,
1289                classifier_registry,
1290                classifier_cache,
1291                classifier_provenance_store,
1292                path_context_handles,
1293            } => {
1294                let input_plan = self.plan_internal(input, all_properties)?;
1295                // Phase D D2 runtime: inject the Xervo embedder runtime
1296                // from graph_ctx at physical lowering. The logical plan
1297                // is graph_ctx-agnostic; the physical exec carries the
1298                // shared `Arc<ModelRuntime>` needed to embed
1299                // `semantic_match` query literals.
1300                let xervo_runtime =
1301                    super::df_graph::locy_model_invoke::XervoRuntimeHandle(
1302                        self.graph_ctx.xervo_runtime().cloned(),
1303                    );
1304                // Phase D D1 graph-structural runtime: lift registry +
1305                // storage + L0 snapshot from graph_ctx. Construction
1306                // mirrors `execute_algo_procedure` in procedure_call.rs.
1307                let graph_algo = {
1308                    let l0_ctx = self.graph_ctx.l0_context();
1309                    let l0_mgr = l0_ctx.current_l0.as_ref().map(|current| {
1310                        let mut pending = l0_ctx.pending_flush_l0s.clone();
1311                        if let Some(tx_l0) = &l0_ctx.transaction_l0 {
1312                            pending.push(tx_l0.clone());
1313                        }
1314                        Arc::new(uni_store::runtime::l0_manager::L0Manager::from_snapshot(
1315                            current.clone(),
1316                            pending,
1317                        ))
1318                    });
1319                    let l0_buffers = self.graph_ctx.l0_context().current_l0.as_ref().map(
1320                        |current| super::df_graph::locy_model_invoke::L0Buffers {
1321                            current: current.clone(),
1322                            transaction: self.graph_ctx.l0_context().transaction_l0.clone(),
1323                            pending_flush: self.graph_ctx.l0_context().pending_flush_l0s.clone(),
1324                        },
1325                    );
1326                    super::df_graph::locy_model_invoke::GraphAlgoHandle {
1327                        registry: self.graph_ctx.algo_registry().cloned(),
1328                        storage: Some(self.graph_ctx.storage().clone()),
1329                        l0_manager: l0_mgr,
1330                        property_manager: Some(self.graph_ctx.property_manager().clone()),
1331                        l0_buffers,
1332                    }
1333                };
1334                Ok(Arc::new(
1335                    super::df_graph::locy_model_invoke::LocyModelInvokeExec::new(
1336                        input_plan,
1337                        invocations.clone(),
1338                        Arc::clone(classifier_registry),
1339                        classifier_cache.as_ref().map(Arc::clone),
1340                        classifier_provenance_store.as_ref().map(Arc::clone),
1341                        path_context_handles.clone(),
1342                        xervo_runtime,
1343                        graph_algo,
1344                    ),
1345                ))
1346            }
1347
1348            LogicalPlan::LocyProgram {
1349                strata,
1350                commands,
1351                derived_scan_registry,
1352                max_iterations,
1353                timeout,
1354                max_derived_bytes,
1355                deterministic_best_by,
1356                strict_probability_domain,
1357                probability_epsilon,
1358                exact_probability,
1359                max_bdd_variables,
1360                top_k_proofs,
1361                semiring_kind,
1362                classifier_registry,
1363                classifier_cache,
1364                classifier_provenance_store,
1365            } => {
1366                let output_schema = super::df_graph::locy_program::stats_schema();
1367
1368                Ok(Arc::new(
1369                    super::df_graph::locy_program::LocyProgramExec::new_with_semiring_classifiers_and_cache(
1370                        strata.clone(),
1371                        commands.clone(),
1372                        Arc::clone(derived_scan_registry),
1373                        Arc::clone(&self.plugin_registry),
1374                        Arc::clone(&self.graph_ctx),
1375                        Arc::clone(&self.session_ctx),
1376                        Arc::clone(&self.storage),
1377                        Arc::clone(&self.schema),
1378                        self.params.clone(),
1379                        output_schema,
1380                        *max_iterations,
1381                        *timeout,
1382                        *max_derived_bytes,
1383                        *deterministic_best_by,
1384                        *strict_probability_domain,
1385                        *probability_epsilon,
1386                        *exact_probability,
1387                        *max_bdd_variables,
1388                        *top_k_proofs,
1389                        *semiring_kind,
1390                        Arc::clone(classifier_registry),
1391                        classifier_cache.as_ref().map(Arc::clone),
1392                        classifier_provenance_store.as_ref().map(Arc::clone),
1393                    ),
1394                ))
1395            }
1396
1397            // DDL operations should be handled separately
1398            LogicalPlan::CreateVectorIndex { .. }
1399            | LogicalPlan::CreateFullTextIndex { .. }
1400            | LogicalPlan::CreateScalarIndex { .. }
1401            | LogicalPlan::CreateJsonFtsIndex { .. }
1402            | LogicalPlan::DropIndex { .. }
1403            | LogicalPlan::ShowIndexes { .. }
1404            | LogicalPlan::Copy { .. }
1405            | LogicalPlan::Backup { .. }
1406            | LogicalPlan::ShowDatabase
1407            | LogicalPlan::ShowConfig
1408            | LogicalPlan::ShowStatistics
1409            | LogicalPlan::Vacuum
1410            | LogicalPlan::Checkpoint
1411            | LogicalPlan::CopyTo { .. }
1412            | LogicalPlan::CopyFrom { .. }
1413            | LogicalPlan::CreateLabel(_)
1414            | LogicalPlan::CreateEdgeType(_)
1415            | LogicalPlan::AlterLabel(_)
1416            | LogicalPlan::AlterEdgeType(_)
1417            | LogicalPlan::DropLabel(_)
1418            | LogicalPlan::DropEdgeType(_)
1419            | LogicalPlan::CreateConstraint(_)
1420            | LogicalPlan::DropConstraint(_)
1421            | LogicalPlan::ShowConstraints(_)
1422            | LogicalPlan::Explain { .. } => {
1423                Err(anyhow!("DDL/Admin operations should be handled separately"))
1424            }
1425        }
1426    }
1427
1428    /// Like `plan_internal`, but propagates alias mappings to Sort nodes.
1429    /// This is used when a Project wraps a Sort (possibly through Limit)
1430    /// so that ORDER BY can reference projection aliases.
1431    fn plan_internal_with_aliases(
1432        &self,
1433        logical: &LogicalPlan,
1434        all_properties: &HashMap<String, HashSet<String>>,
1435        alias_map: &HashMap<String, Expr>,
1436    ) -> Result<Arc<dyn ExecutionPlan>> {
1437        match logical {
1438            LogicalPlan::Sort { input, order_by } => {
1439                self.plan_sort(input, order_by, all_properties, alias_map)
1440            }
1441            LogicalPlan::Limit { input, skip, fetch } => {
1442                // Propagate aliases through Limit to reach Sort
1443                let input_plan =
1444                    self.plan_internal_with_aliases(input, all_properties, alias_map)?;
1445                if let Some(offset) = skip.filter(|&s| s > 0) {
1446                    use datafusion::physical_plan::limit::GlobalLimitExec;
1447                    Ok(Arc::new(GlobalLimitExec::new(input_plan, offset, *fetch)))
1448                } else {
1449                    Ok(Arc::new(LocalLimitExec::new(
1450                        input_plan,
1451                        fetch.unwrap_or(usize::MAX),
1452                    )))
1453                }
1454            }
1455            // For all other nodes, fall through to normal planning
1456            _ => self.plan_internal(logical, all_properties),
1457        }
1458    }
1459
    // NOTE(review): a doc paragraph that previously sat here ("Apply a node-level filter
    // to a scan or lookup plan. Wraps the input plan with a `FilterExec` if `filter` is
    // `Some`. Builds a `TranslationContext` marking `variable` as `VariableKind::Node`
    // for correct expression translation.") describes a different helper. It is kept as
    // a plain comment so it no longer shows up in this function's rustdoc.
    /// Extract VID(s) from a Cypher WHERE filter for scan-level pushdown.
    ///
    /// Looks for `_vid = lit` / `_vid IN (lit, ...)` predicates on `variable`
    /// (produced by the `id()` → `_vid` rewrite). A match enables L0
    /// short-circuit and Lance `_vid` pushdown inside the scan.
    ///
    /// Returns the list of VIDs the filter constrains for `variable`, or
    /// `None` if the filter doesn't contain a recognised predicate. A
    /// single-element vec means single-VID pushdown; a multi-element vec
    /// means IN-list pushdown. See issue #55 PR #4.
1475    fn extract_vid_from_cypher_filter(
1476        filter: Option<&Expr>,
1477        variable: &str,
1478        params: &HashMap<String, uni_common::Value>,
1479    ) -> Option<Vec<u64>> {
1480        use uni_cypher::ast::BinaryOp;
1481        let filter = filter?;
1482        match filter {
1483            Expr::BinaryOp {
1484                left,
1485                op: BinaryOp::Eq,
1486                right,
1487            } => {
1488                // Check: variable._vid = literal/param
1489                if let Expr::Property(var_expr, prop) = left.as_ref()
1490                    && let Expr::Variable(v) = var_expr.as_ref()
1491                    && v == variable
1492                    && prop == "_vid"
1493                {
1494                    return Self::resolve_vid_value(right, params).map(|v| vec![v]);
1495                }
1496                // Check: literal/param = variable._vid
1497                if let Expr::Property(var_expr, prop) = right.as_ref()
1498                    && let Expr::Variable(v) = var_expr.as_ref()
1499                    && v == variable
1500                    && prop == "_vid"
1501                {
1502                    return Self::resolve_vid_value(left, params).map(|v| vec![v]);
1503                }
1504                None
1505            }
1506            Expr::In { expr, list } => {
1507                // Check: variable._vid IN (literals)
1508                let Expr::Property(var_expr, prop) = expr.as_ref() else {
1509                    return None;
1510                };
1511                let Expr::Variable(v) = var_expr.as_ref() else {
1512                    return None;
1513                };
1514                if v != variable || prop != "_vid" {
1515                    return None;
1516                }
1517                let Expr::List(items) = list.as_ref() else {
1518                    return None;
1519                };
1520                let mut out = Vec::with_capacity(items.len());
1521                for item in items {
1522                    out.push(Self::resolve_vid_value(item, params)?);
1523                }
1524                if out.is_empty() { None } else { Some(out) }
1525            }
1526            Expr::BinaryOp {
1527                left,
1528                op: BinaryOp::And,
1529                right,
1530            } => Self::extract_vid_from_cypher_filter(Some(left), variable, params)
1531                .or_else(|| Self::extract_vid_from_cypher_filter(Some(right), variable, params)),
1532            _ => None,
1533        }
1534    }
1535
1536    /// Build a physical `_vid = literal` filter expression for scan-level
1537    /// optimization (single-VID case). For multi-VID IN-list, use
1538    /// `GraphScanExec::vid_list_filter` directly — it bypasses the
1539    /// PhysicalExpr roundtrip.
1540    fn build_vid_physical_filter(
1541        col_name: &str,
1542        vid: u64,
1543    ) -> Arc<dyn datafusion::physical_expr::PhysicalExpr> {
1544        use datafusion::physical_expr::expressions::{BinaryExpr, Column, Literal};
1545        Arc::new(BinaryExpr::new(
1546            Arc::new(Column::new(col_name, 0)),
1547            datafusion::logical_expr::Operator::Eq,
1548            Arc::new(Literal::new(datafusion::common::ScalarValue::UInt64(Some(
1549                vid,
1550            )))),
1551        ))
1552    }
1553
1554    fn resolve_vid_value(expr: &Expr, params: &HashMap<String, uni_common::Value>) -> Option<u64> {
1555        match expr {
1556            Expr::Literal(CypherLiteral::Integer(v)) if *v >= 0 => Some(*v as u64),
1557            Expr::Parameter(name) => match params.get(name) {
1558                Some(uni_common::Value::Int(v)) if *v >= 0 => Some(*v as u64),
1559                _ => None,
1560            },
1561            _ => None,
1562        }
1563    }
1564
1565    /// AND-combine a non-empty list of predicates into a single `Expr`.
1566    /// Trivial for length 0/1 (returns true / the single expr); folds left
1567    /// for length >= 2.
1568    fn and_join_predicates(mut preds: Vec<Expr>) -> Expr {
1569        if preds.is_empty() {
1570            return uni_cypher::ast::Expr::TRUE;
1571        }
1572        let mut acc = preds.remove(0);
1573        for p in preds {
1574            acc = Expr::BinaryOp {
1575                left: Box::new(acc),
1576                op: uni_cypher::ast::BinaryOp::And,
1577                right: Box::new(p),
1578            };
1579        }
1580        acc
1581    }
1582
1583    /// Build the indexed-property pushdown for a vertex scan: a Lance SQL
1584    /// filter string AND an Arrow-side `PhysicalExpr`, both derived from the
1585    /// same set of indexed-property conjuncts.
1586    ///
1587    /// - The Lance string drives an O(1) hash-index lookup against on-disk data.
1588    /// - The Arrow filter applies to the merged (Lance + L0) batch in-process,
1589    ///   so L0 rows that haven't been flushed yet are still index-bounded.
1590    ///
1591    /// Returns `None` when no indexed predicate exists or any parameter
1592    /// resolution fails — in that case the planner falls back to the regular
1593    /// post-scan `FilterExec`. See issue #57.
1594    fn build_indexed_property_pushdown(
1595        &self,
1596        filter: Option<&Expr>,
1597        variable: &str,
1598        label_id: u16,
1599        scan_schema: &SchemaRef,
1600    ) -> Option<(String, Arc<dyn datafusion::physical_expr::PhysicalExpr>)> {
1601        let filter = filter?;
1602        let analyzer = crate::query::pushdown::IndexAwareAnalyzer::new(&self.schema);
1603        let strategy = analyzer.analyze(filter, variable, label_id);
1604        if strategy.hash_index_columns.is_empty() {
1605            return None;
1606        }
1607
1608        // Collect lance_predicates that touch a hash-indexed column. Other
1609        // lance_predicates (e.g. range on non-indexed props) are deliberately
1610        // left for the outer FilterExec: pushing them inside the scan
1611        // would also filter L0 rows that match the indexed conjunct but not
1612        // the residual conjunct on the SAME row — which is fine — but the
1613        // outer FilterExec already handles them, so keeping the boundary
1614        // simple keeps the merge behaviour obvious.
1615        let label_name = self.schema.label_name_by_id(label_id)?;
1616        let label_props = self.schema.properties.get(label_name);
1617        let mut indexed_preds: Vec<Expr> = Vec::new();
1618        for pred in &strategy.lance_predicates {
1619            if let Some(col) = crate::query::pushdown::predicate_target_column(pred, variable)
1620                && strategy.hash_index_columns.iter().any(|c| c == &col)
1621            {
1622                let resolved = crate::query::pushdown::substitute_params(pred, &self.params)?;
1623                indexed_preds.push(resolved);
1624            }
1625        }
1626        if indexed_preds.is_empty() {
1627            return None;
1628        }
1629
1630        // Render the Lance SQL filter string for storage-side pushdown.
1631        let lance_str = crate::query::pushdown::LanceFilterGenerator::generate(
1632            &indexed_preds,
1633            variable,
1634            label_props,
1635        )?;
1636
1637        // Build the Arrow-side PhysicalExpr from the same predicates. The
1638        // GraphScanExec applies it to the merged (Lance+L0) batch so the
1639        // scan output is index-bounded regardless of where the data lives.
1640        let combined = Self::and_join_predicates(indexed_preds.clone());
1641        let mut variable_kinds = HashMap::new();
1642        variable_kinds.insert(variable.to_string(), VariableKind::Node);
1643        let mut variable_labels = HashMap::new();
1644        variable_labels.insert(variable.to_string(), label_name.to_string());
1645        let ctx = TranslationContext {
1646            parameters: self.params.clone(),
1647            variable_labels,
1648            variable_kinds,
1649            ..Default::default()
1650        };
1651        let df_filter = cypher_expr_to_df(&combined, Some(&ctx)).ok()?;
1652        let session = self.session_ctx.read();
1653        let physical = self
1654            .create_physical_filter_expr(&df_filter, scan_schema, &session)
1655            .ok()?;
1656        Some((lance_str, physical))
1657    }
1658
1659    /// Wraps a leaf scan plan so surviving row identities feed the SSI read-set.
1660    ///
1661    /// No-op unless the current transaction has an optimistic read-set (a
1662    /// read-write transaction begun under `UniConfig::ssi_enabled`), so the wrap
1663    /// self-gates at runtime — when SSI is off, `occ_read_set` is `None` and the
1664    /// plan is returned verbatim. Must be inserted above the residual `FilterExec`
1665    /// and below any structural projection so the `{var}._vid` / `{var}._eid`
1666    /// columns are still present.
1667    fn wrap_read_set_recording(
1668        &self,
1669        plan: Arc<dyn ExecutionPlan>,
1670        variable: &str,
1671    ) -> Arc<dyn ExecutionPlan> {
1672        let has_read_set = self
1673            .graph_ctx
1674            .l0_context()
1675            .transaction_l0
1676            .as_ref()
1677            .is_some_and(|l0| l0.read().occ_read_set.is_some());
1678        if !has_read_set {
1679            return plan;
1680        }
1681        Arc::new(ReadSetRecordingExec::new(
1682            plan,
1683            self.graph_ctx.clone(),
1684            variable,
1685        ))
1686    }
1687
1688    fn apply_scan_filter(
1689        &self,
1690        plan: Arc<dyn ExecutionPlan>,
1691        variable: &str,
1692        filter: Option<&Expr>,
1693        label_name: Option<&str>,
1694    ) -> Result<Arc<dyn ExecutionPlan>> {
1695        let Some(filter_expr) = filter else {
1696            return Ok(plan);
1697        };
1698
1699        let mut variable_kinds = HashMap::new();
1700        variable_kinds.insert(variable.to_string(), VariableKind::Node);
1701        let mut variable_labels = HashMap::new();
1702        if let Some(label) = label_name {
1703            variable_labels.insert(variable.to_string(), label.to_string());
1704        }
1705        let ctx = TranslationContext {
1706            parameters: self.params.clone(),
1707            variable_labels,
1708            variable_kinds,
1709            ..Default::default()
1710        };
1711        let df_filter = cypher_expr_to_df(filter_expr, Some(&ctx))?;
1712
1713        let schema = plan.schema();
1714
1715        let session = self.session_ctx.read();
1716        let physical_filter = self.create_physical_filter_expr(&df_filter, &schema, &session)?;
1717
1718        Ok(Arc::new(FilterExec::try_new(physical_filter, plan)?))
1719    }
1720
1721    /// Apply a filter to a schemaless traverse plan (TraverseMainByType).
1722    ///
1723    /// Builds a `TranslationContext` with the appropriate variable kinds for
1724    /// source, target, edge, and path variables, then creates and applies the
1725    /// filter. Used by both VLP (target_filter) and fixed-length (edge_filter)
1726    /// branches of TraverseMainByType planning.
1727    #[expect(clippy::too_many_arguments)]
1728    fn apply_schemaless_traverse_filter(
1729        &self,
1730        plan: Arc<dyn ExecutionPlan>,
1731        filter_expr: Option<&Expr>,
1732        source_variable: &str,
1733        target_variable: &str,
1734        step_variable: Option<&str>,
1735        path_variable: Option<&str>,
1736        is_variable_length: bool,
1737        optional: bool,
1738        optional_pattern_vars: &HashSet<String>,
1739    ) -> Result<Arc<dyn ExecutionPlan>> {
1740        let Some(filter_expr) = filter_expr else {
1741            return Ok(plan);
1742        };
1743
1744        let mut variable_kinds = HashMap::new();
1745        variable_kinds.insert(source_variable.to_string(), VariableKind::Node);
1746        variable_kinds.insert(target_variable.to_string(), VariableKind::Node);
1747        if let Some(sv) = step_variable {
1748            variable_kinds.insert(sv.to_string(), VariableKind::edge_for(is_variable_length));
1749        }
1750        if let Some(pv) = path_variable {
1751            variable_kinds.insert(pv.to_string(), VariableKind::Path);
1752        }
1753        let ctx = TranslationContext {
1754            parameters: self.params.clone(),
1755            variable_kinds,
1756            ..Default::default()
1757        };
1758        let df_filter = cypher_expr_to_df(filter_expr, Some(&ctx))?;
1759        let schema = plan.schema();
1760        let session = self.session_ctx.read();
1761        let physical_filter = self.create_physical_filter_expr(&df_filter, &schema, &session)?;
1762
1763        if optional {
1764            Ok(Arc::new(OptionalFilterExec::new(
1765                plan,
1766                physical_filter,
1767                optional_pattern_vars.clone(),
1768            )))
1769        } else {
1770            Ok(Arc::new(FilterExec::try_new(physical_filter, plan)?))
1771        }
1772    }
1773
1774    /// Plan an external ID lookup.
1775    fn plan_ext_id_lookup(
1776        &self,
1777        variable: &str,
1778        ext_id: &str,
1779        filter: Option<&Expr>,
1780        optional: bool,
1781    ) -> Result<Arc<dyn ExecutionPlan>> {
1782        // Collect properties needed from the filter
1783        let properties = if let Some(filter_expr) = filter {
1784            crate::query::df_expr::collect_properties(filter_expr)
1785                .into_iter()
1786                .filter(|(var, _)| var == variable)
1787                .map(|(_, prop)| prop)
1788                .collect()
1789        } else {
1790            vec![]
1791        };
1792
1793        let lookup_plan: Arc<dyn ExecutionPlan> = Arc::new(GraphExtIdLookupExec::new(
1794            self.graph_ctx.clone(),
1795            variable.to_string(),
1796            ext_id.to_string(),
1797            properties,
1798            optional,
1799        ));
1800
1801        self.apply_scan_filter(lookup_plan, variable, filter, None)
1802    }
1803
1804    /// Plan an UNWIND operation.
1805    ///
1806    /// UNWIND expands a list expression into multiple rows.
1807    fn plan_unwind(
1808        &self,
1809        input: LogicalPlan,
1810        expr: Expr,
1811        variable: String,
1812        all_properties: &HashMap<String, HashSet<String>>,
1813    ) -> Result<Arc<dyn ExecutionPlan>> {
1814        // Recursively plan the input
1815        let input_plan = self.plan_internal(&input, all_properties)?;
1816
1817        let unwind = GraphUnwindExec::new(input_plan, expr, variable, self.params.clone());
1818
1819        Ok(Arc::new(unwind))
1820    }
1821
1822    /// Plan a recursive CTE (`WITH RECURSIVE`).
1823    ///
1824    /// Creates a [`RecursiveCTEExec`] that stores the logical plans and
1825    /// re-plans/executes them iteratively at execution time.
1826    fn plan_recursive_cte(
1827        &self,
1828        cte_name: &str,
1829        initial: &LogicalPlan,
1830        recursive: &LogicalPlan,
1831        _all_properties: &HashMap<String, HashSet<String>>,
1832    ) -> Result<Arc<dyn ExecutionPlan>> {
1833        Ok(Arc::new(RecursiveCTEExec::new(
1834            cte_name.to_string(),
1835            initial.clone(),
1836            recursive.clone(),
1837            self.graph_ctx.clone(),
1838            self.session_ctx.clone(),
1839            self.storage.clone(),
1840            self.schema.clone(),
1841            self.params.clone(),
1842            self.mutation_ctx.clone(),
1843        )))
1844    }
1845
1846    /// Plan an Apply (correlated subquery) or SubqueryCall.
1847    fn plan_apply(
1848        &self,
1849        input: &LogicalPlan,
1850        subquery: &LogicalPlan,
1851        input_filter: Option<&Expr>,
1852        all_properties: &HashMap<String, HashSet<String>>,
1853    ) -> Result<Arc<dyn ExecutionPlan>> {
1854        use crate::query::df_graph::common::infer_logical_plan_schema;
1855
1856        // 1. Plan input physically
1857        let input_exec = self.plan_internal(input, all_properties)?;
1858        let input_schema = input_exec.schema();
1859
1860        // 1a. Unit-subquery unwrap: write-only `CALL { ... }` (no inner
1861        // RETURN) is wrapped in `Limit { fetch: Some(0), input: Set/... }` by
1862        // the planner so the subquery emits zero rows. At the physical layer,
1863        // `GlobalLimitExec(fetch=0)` short-circuits and never polls its input
1864        // — so the embedded write operator never runs. Strip that wrapper so
1865        // the side effect executes per outer row; output emptiness is still
1866        // signaled via the schema (sub_schema has no fields → unit detection
1867        // in `GraphApplyExec`).
1868        let subquery_effective = match subquery {
1869            LogicalPlan::Limit {
1870                input: inner,
1871                skip: None,
1872                fetch: Some(0),
1873            } => inner.as_ref(),
1874            _ => subquery,
1875        };
1876
1877        // 2. Infer subquery output schema from logical plan + UniSchema metadata.
1878        // Use the ORIGINAL (still-wrapped) subquery so a unit subquery resolves
1879        // to an empty schema, which `GraphApplyExec` reads as the unit signal.
1880        let sub_schema = infer_logical_plan_schema(subquery, &self.schema);
1881
1882        // 3. Merge schemas: subquery fields override input fields with the
1883        //    same name. The subquery's RETURN list is authoritative for the
1884        //    names it lists, which is what `CALL { WITH n SET n.x = ...
1885        //    RETURN n }` semantically requires — the outer plan must see the
1886        //    post-SET `n`, not the pre-SET copy carried through from the
1887        //    correlated input. For correlated subqueries that don't re-emit
1888        //    an imported variable (EXISTS, COUNT, non-SET CALLs), there is no
1889        //    name collision and behavior is unchanged.
1890        let sub_field_names: HashSet<&str> = sub_schema
1891            .fields()
1892            .iter()
1893            .map(|f| f.name().as_str())
1894            .collect();
1895        // Input columns whose name collides with a subquery RETURN field are
1896        // dropped (sub wins). Dotted columns (`v.prop`) whose base variable
1897        // `v` is overridden by the subquery are kept in the schema (so the
1898        // expr compiler resolves `v.prop` via the flat-column path) but at
1899        // data-fill time they're refreshed from the post-SET bare `v` Map
1900        // in the subquery output. See `append_cross_join_row` /
1901        // `kept_input_overrides`.
1902        let kept_input_indices: Vec<usize> = input_schema
1903            .fields()
1904            .iter()
1905            .enumerate()
1906            .filter(|(_, f)| !sub_field_names.contains(f.name().as_str()))
1907            .map(|(i, _)| i)
1908            .collect();
1909        // For each kept input column, pre-compute whether it should be
1910        // sourced from the subquery's bare entity Map instead of the input
1911        // batch. Some((var, prop)) means refresh `var.prop` from
1912        // `sub_row[var]`; None means slice from input as usual.
1913        let kept_input_overrides: Vec<Option<(String, String)>> = kept_input_indices
1914            .iter()
1915            .map(|&i| {
1916                let name = input_schema.field(i).name();
1917                if let Some(dot) = name.find('.') {
1918                    let base = &name[..dot];
1919                    if sub_field_names.contains(base) {
1920                        return Some((base.to_string(), name[dot + 1..].to_string()));
1921                    }
1922                }
1923                None
1924            })
1925            .collect();
1926        let mut fields: Vec<Arc<arrow_schema::Field>> = kept_input_indices
1927            .iter()
1928            .map(|&i| input_schema.fields()[i].clone())
1929            .collect();
1930        fields.extend(sub_schema.fields().iter().cloned());
1931        let output_schema: SchemaRef = Arc::new(Schema::new(fields));
1932
1933        Ok(Arc::new(GraphApplyExec::new(
1934            input_exec,
1935            subquery_effective.clone(),
1936            input_filter.cloned(),
1937            self.graph_ctx.clone(),
1938            self.session_ctx.clone(),
1939            self.storage.clone(),
1940            self.schema.clone(),
1941            self.params.clone(),
1942            output_schema,
1943            kept_input_indices,
1944            kept_input_overrides,
1945            self.mutation_ctx.clone(),
1946        )))
1947    }
1948
1949    /// Plan a vector KNN search.
1950    #[expect(clippy::too_many_arguments)]
1951    fn plan_vector_knn(
1952        &self,
1953        label_id: u16,
1954        variable: &str,
1955        property: &str,
1956        query_expr: Expr,
1957        k: usize,
1958        threshold: Option<f32>,
1959        all_properties: &HashMap<String, HashSet<String>>,
1960    ) -> Result<Arc<dyn ExecutionPlan>> {
1961        let label_name = self
1962            .schema
1963            .label_name_by_id(label_id)
1964            .ok_or_else(|| anyhow!("Unknown label ID: {}", label_id))?;
1965
1966        let target_properties = self.resolve_properties(variable, label_name, all_properties);
1967
1968        // M5b follow-up #4 (IndexProbeExec bridge): look up the index by
1969        // name for this `(label, property)` pair, then ask the plugin
1970        // registry whether a live `IndexHandle` has been registered under
1971        // that name. If yes, dispatch the probe through the plugin handle
1972        // via `VectorSource::Plugin`; if no, fall through to the native
1973        // `StorageManager::vector_search` path (preserves the "no behavior
1974        // change for built-ins" invariant — native vector indexes never
1975        // register a handle in this table).
1976        let plugin_source = self
1977            .schema
1978            .vector_index_for_property(label_name, property)
1979            .and_then(|cfg| {
1980                self.plugin_registry
1981                    .index_handle(&cfg.name)
1982                    .map(|entry| (cfg.name.clone(), entry))
1983            });
1984
1985        let knn = if let Some((index_name, entry)) = plugin_source {
1986            tracing::debug!(
1987                target: "uni.plugin.registry",
1988                index_kind = %entry.kind.0,
1989                index_name = %index_name,
1990                "plan_vector_knn: dispatching via plugin IndexHandle"
1991            );
1992            GraphVectorKnnExec::with_plugin_source(
1993                self.graph_ctx.clone(),
1994                label_id,
1995                label_name,
1996                variable.to_string(),
1997                property.to_string(),
1998                query_expr,
1999                k,
2000                threshold,
2001                self.params.clone(),
2002                target_properties,
2003                entry.kind,
2004                entry.handle,
2005            )
2006        } else {
2007            GraphVectorKnnExec::new(
2008                self.graph_ctx.clone(),
2009                label_id,
2010                label_name,
2011                variable.to_string(),
2012                property.to_string(),
2013                query_expr,
2014                k,
2015                threshold,
2016                self.params.clone(),
2017                target_properties,
2018            )
2019        };
2020
2021        // SSI read-set: a vector-KNN result is a set of *real* graph vertices
2022        // (the exec emits `{variable}._vid` from the native/plugin index over the
2023        // actual store), each of which a concurrent transaction can write. A
2024        // read-write antidependency through a KNN read must therefore abort, so
2025        // record the matched vids — exactly as `plan_scan` does for label scans.
2026        Ok(self.wrap_read_set_recording(Arc::new(knn), variable))
2027    }
2028
2029    /// Plan a procedure call.
2030    fn plan_procedure_call(
2031        &self,
2032        procedure_name: &str,
2033        arguments: &[Expr],
2034        yield_items: &[(String, Option<String>)],
2035        all_properties: &HashMap<String, HashSet<String>>,
2036    ) -> Result<Arc<dyn ExecutionPlan>> {
2037        use crate::query::df_graph::procedure_call::map_yield_to_canonical;
2038
2039        // Build target_properties map for node-like yields in search procedures
2040        let mut target_properties: HashMap<String, Vec<String>> = HashMap::new();
2041
2042        if crate::query::df_graph::procedure_call::is_node_yield_procedure_static(procedure_name) {
2043            for (name, alias) in yield_items {
2044                let output_name = alias.as_ref().unwrap_or(name);
2045                let canonical = map_yield_to_canonical(name);
2046                if canonical == "node" {
2047                    // Collect properties requested for this node variable
2048                    if let Some(props) = all_properties.get(output_name.as_str()) {
2049                        let prop_list: Vec<String> = props
2050                            .iter()
2051                            .filter(|p| *p != "*" && !p.starts_with('_'))
2052                            .cloned()
2053                            .collect();
2054                        target_properties.insert(output_name.clone(), prop_list);
2055                    }
2056                }
2057            }
2058        }
2059
2060        let exec = GraphProcedureCallExec::new(
2061            self.graph_ctx.clone(),
2062            procedure_name.to_string(),
2063            arguments.to_vec(),
2064            yield_items.to_vec(),
2065            self.params.clone(),
2066            self.outer_values.clone(),
2067            target_properties,
2068        );
2069
2070        Ok(Arc::new(exec))
2071    }
2072
2073    /// Plan a vertex scan.
2074    fn plan_scan(
2075        &self,
2076        label_id: u16,
2077        variable: &str,
2078        filter: Option<&Expr>,
2079        optional: bool,
2080        all_properties: &HashMap<String, HashSet<String>>,
2081    ) -> Result<Arc<dyn ExecutionPlan>> {
2082        // Virtual label: dispatch to a `CatalogVertexScanExec` that wraps
2083        // the plugin-registered `CatalogTable` (M5 follow-up #6). The
2084        // plan caches the virtual id, not the table — every execute
2085        // resolves the latest table from `PluginRegistry::virtual_label_by_id`,
2086        // so a re-registered provider naturally picks up.
2087        if uni_common::core::schema::is_virtual_label_id(label_id) {
2088            let entry = self
2089                .plugin_registry
2090                .virtual_label_by_id(label_id)
2091                .ok_or_else(|| {
2092                    anyhow!(
2093                        "Virtual label id {label_id:#x} has no registered CatalogTable; \
2094                         the originating CatalogProvider may have been deregistered \
2095                         after the plan was cached"
2096                    )
2097                })?;
2098            let label_name = entry.name.as_str();
2099            let properties = self.resolve_properties(variable, label_name, all_properties);
2100            let pushdown_filters: Vec<datafusion::logical_expr::Expr> = filter
2101                .map(|f| -> Result<Vec<_>> {
2102                    let ctx = crate::query::df_expr::TranslationContext {
2103                        parameters: self.params.clone(),
2104                        outer_values: self.outer_values.clone(),
2105                        ..Default::default()
2106                    };
2107                    let df = crate::query::df_expr::cypher_expr_to_df(f, Some(&ctx))?;
2108                    Ok(vec![df])
2109                })
2110                .transpose()?
2111                .unwrap_or_default();
2112            let exec = crate::query::df_graph::catalog_scan::CatalogVertexScanExec::try_new(
2113                entry.table,
2114                label_id,
2115                label_name.to_string(),
2116                variable.to_string(),
2117                properties,
2118                pushdown_filters,
2119                None, // limit-pushdown is applied at a higher layer for now
2120            )?;
2121            let mut plan: Arc<dyn ExecutionPlan> = Arc::new(exec);
2122            // Re-apply the Cypher filter as a top-level FilterExec for
2123            // safety (the catalog table may have ignored the pushdown).
2124            plan = self.apply_scan_filter(plan, variable, filter, Some(label_name))?;
2125            // SSI read-set: deliberately NOT recorded. A virtual (catalog-backed)
2126            // label is read-only — CREATE/SET/DELETE on it is rejected at both
2127            // planner and runtime — so no uni transaction can ever write a
2128            // virtual vertex, and a read-write antidependency through one is
2129            // impossible. Its `_vid` is also synthetic (`label_id << 48 | row`,
2130            // ≥ 0xFF00…), disjoint from real vids, so recording it could only add
2131            // never-matching keys (and risk a false abort if the spaces ever
2132            // overlapped). Excluding it is the sound choice, not a gap.
2133            return self.wrap_optional(plan, optional);
2134        }
2135
2136        let label_name = self
2137            .schema
2138            .label_name_by_id(label_id)
2139            .ok_or_else(|| anyhow!("Unknown label ID: {}", label_id))?;
2140
2141        // Resolve properties collected from the entire plan tree, expanding "*" wildcards
2142        let mut properties = self.resolve_properties(variable, label_name, all_properties);
2143
2144        // Check if any projected property is NOT in the schema (needs overflow_json)
2145        let label_props = self.schema.properties.get(label_name);
2146        let has_projection_overflow = properties.iter().any(|p| {
2147            p != "overflow_json"
2148                && !p.starts_with('_')
2149                && !label_props.is_some_and(|lp| lp.contains_key(p.as_str()))
2150        });
2151        if has_projection_overflow && !properties.iter().any(|p| p == "overflow_json") {
2152            properties.push("overflow_json".to_string());
2153        }
2154
2155        // If the filter references overflow properties (not in schema), ensure
2156        // `overflow_json` is projected so the DataFusion FilterExec can read it.
2157        if let Some(filter_expr) = filter {
2158            let filter_props = crate::query::df_expr::collect_properties(filter_expr);
2159            let has_overflow = filter_props.iter().any(|(var, prop)| {
2160                var == variable
2161                    && !prop.starts_with('_')
2162                    && label_props.is_none_or(|props| !props.contains_key(prop.as_str()))
2163            });
2164            if has_overflow && !properties.iter().any(|p| p == "overflow_json") {
2165                properties.push("overflow_json".to_string());
2166            }
2167        }
2168
2169        // Structural projection is needed if EITHER:
2170        //   - "*"            (full record requested — bare variable, REMOVE,
2171        //                    Labels/Variable/VariablePlus SET, etc.), or
2172        //   - STRUCT_ONLY_SENTINEL  (Property SET only — needs the bare struct
2173        //                    column for `row.get(var)` but not the full schema).
2174        // Only "*" pushes `_all_props` / `overflow_json` into the scan; the
2175        // sentinel deliberately skips these so wide columns (e.g. embeddings)
2176        // are NOT materialized.
2177        let var_props = all_properties.get(variable);
2178        let need_full =
2179            var_props.is_some_and(|p| p.contains("*") || p.contains(STRUCT_ONLY_SENTINEL));
2180        let need_full_record = var_props.is_some_and(|p| p.contains("*"));
2181        if need_full_record {
2182            if !properties.contains(&"_all_props".to_string()) {
2183                properties.push("_all_props".to_string());
2184            }
2185            if !properties.contains(&"overflow_json".to_string()) {
2186                properties.push("overflow_json".to_string());
2187            }
2188        }
2189
2190        // Extract VID(s) from filter for scan-level optimization (L0
2191        // short-circuit + Lance pushdown). Single-VID becomes a `_vid = N`
2192        // physical filter that GraphScanExec uses both in L0 short-circuit and
2193        // in the Lance pushdown string. Multi-VID (from
2194        // `_vid IN (literals)`) bypasses the PhysicalExpr roundtrip and goes
2195        // direct to GraphScanExec via `with_vid_list_filter` — at runtime
2196        // it becomes `_vid IN (v1, v2, ...)` for Lance pushdown. See issue #55 PR #4.
2197        let extracted_vids = Self::extract_vid_from_cypher_filter(filter, variable, &self.params);
2198        let scan_filter = extracted_vids
2199            .as_deref()
2200            .filter(|v| v.len() == 1)
2201            .map(|v| Self::build_vid_physical_filter(&format!("{variable}._vid"), v[0]));
2202        let mut scan_exec = GraphScanExec::new_vertex_scan(
2203            self.graph_ctx.clone(),
2204            label_name.to_string(),
2205            variable.to_string(),
2206            properties.clone(),
2207            scan_filter,
2208        );
2209        if let Some(vids) = extracted_vids
2210            && vids.len() > 1
2211        {
2212            scan_exec = scan_exec.with_vid_list_filter(vids);
2213        }
2214
2215        // Indexed-property pushdown — issue #57. Detect equality / IN
2216        // predicates against hash-indexed properties on (label, prop), resolve
2217        // any parameters at plan time, render BOTH a Lance SQL filter (for
2218        // on-disk index lookup) and an Arrow PhysicalExpr (for in-process
2219        // L0 filtering). The redundant FilterExec on top (added by
2220        // `apply_scan_filter` below) is harmless and keeps residual conjuncts
2221        // (e.g. non-indexed multi-property AND) correct.
2222        let scan_schema_for_idx = scan_exec.schema();
2223        if let Some((lance_str, runtime_filter)) =
2224            self.build_indexed_property_pushdown(filter, variable, label_id, &scan_schema_for_idx)
2225        {
2226            scan_exec = scan_exec
2227                .with_extra_lance_filter(lance_str)
2228                .with_extra_runtime_filter(runtime_filter);
2229        }
2230        let mut scan_plan: Arc<dyn ExecutionPlan> = Arc::new(scan_exec);
2231
2232        // Apply filter BEFORE structural projection so that the schema is
2233        // unambiguous (no duplicate `variable._vid` from both flat column and
2234        // struct field). This prevents "Ambiguous reference" errors when
2235        // comparing `_vid` (UInt64) against Int64 literals in type coercion.
2236        scan_plan = self.apply_scan_filter(scan_plan, variable, filter, Some(label_name))?;
2237
2238        // Record surviving (post-filter) row ids into the SSI read-set so keyed
2239        // matches conflict only with writers touching the same rows.
2240        scan_plan = self.wrap_read_set_recording(scan_plan, variable);
2241
2242        if need_full {
2243            // Filter sentinel markers and overflow_json from the structural
2244            // projection. Keep _all_props so properties()/keys() UDFs can use it.
2245            let struct_props: Vec<String> = properties
2246                .iter()
2247                .filter(|p| *p != "overflow_json" && *p != "*" && *p != STRUCT_ONLY_SENTINEL)
2248                .cloned()
2249                .collect();
2250            scan_plan = self.add_structural_projection(scan_plan, variable, &struct_props)?;
2251        }
2252
2253        self.wrap_optional(scan_plan, optional)
2254    }
2255
2256    /// Plan a schemaless vertex scan using the main vertices table.
2257    ///
2258    /// Used for labels that aren't in the schema - queries the main table
2259    /// with `array_contains(labels, 'X')` filter and extracts properties from `props_json`.
2260    /// Add a structural projection for a variable if wildcard access ("*") is needed.
2261    ///
2262    /// Derives the property list from the plan's output schema (columns with the
2263    /// variable prefix) and wraps them into a Struct column via `add_structural_projection`.
2264    fn add_wildcard_structural_projection(
2265        &self,
2266        plan: Arc<dyn ExecutionPlan>,
2267        variable: &str,
2268        all_properties: &HashMap<String, HashSet<String>>,
2269    ) -> Result<Arc<dyn ExecutionPlan>> {
2270        if !all_properties
2271            .get(variable)
2272            .is_some_and(|p| p.contains("*") || p.contains(STRUCT_ONLY_SENTINEL))
2273        {
2274            return Ok(plan);
2275        }
2276        let prefix = format!("{}.", variable);
2277        let struct_props: Vec<String> = plan
2278            .schema()
2279            .fields()
2280            .iter()
2281            .filter_map(|f| {
2282                f.name()
2283                    .strip_prefix(&prefix)
2284                    .filter(|prop| !prop.starts_with('_') || *prop == "_all_props")
2285                    .map(|prop| prop.to_string())
2286            })
2287            .collect();
2288        self.add_structural_projection(plan, variable, &struct_props)
2289    }
2290
2291    /// Detect whether a target variable is already bound in the input plan's schema.
2292    ///
2293    /// Returns `Some("{target_variable}._vid")` when the column is present.
2294    fn detect_bound_target(input_schema: &SchemaRef, target_variable: &str) -> Option<String> {
2295        // Standard: {var}._vid from ScanNodes output
2296        let col = format!("{}._vid", target_variable);
2297        if input_schema.column_with_name(&col).is_some() {
2298            return Some(col);
2299        }
2300        // Fallback: bare variable name if it's a numeric (VID) column.
2301        // This handles EXISTS subquery contexts where imported variables are
2302        // projected as Parameter("{var}") → bare VID column.
2303        // VIDs are UInt64 in Arrow, but may become Int64 after parameter
2304        // round-tripping through Value::Integer → ScalarValue::Int64.
2305        if let Ok(field) = input_schema.field_with_name(target_variable)
2306            && matches!(
2307                field.data_type(),
2308                datafusion::arrow::datatypes::DataType::UInt64
2309                    | datafusion::arrow::datatypes::DataType::Int64
2310            )
2311        {
2312            return Some(target_variable.to_string());
2313        }
2314        None
2315    }
2316
2317    /// Resolve the property list and wildcard flag for a schemaless vertex scan.
2318    ///
2319    /// Filters out `"*"` and the structural-only sentinel, ensures `_all_props`
2320    /// is present (schemaless backend requirement — properties live in a JSON
2321    /// blob), and returns `(properties, need_full)` where `need_full`
2322    /// indicates structural access (either marker triggers it).
2323    fn resolve_schemaless_properties(
2324        variable: &str,
2325        all_properties: &HashMap<String, HashSet<String>>,
2326    ) -> (Vec<String>, bool) {
2327        let mut properties: Vec<String> = all_properties
2328            .get(variable)
2329            .map(|s| {
2330                s.iter()
2331                    .filter(|p| *p != "*" && *p != STRUCT_ONLY_SENTINEL)
2332                    .cloned()
2333                    .collect()
2334            })
2335            .unwrap_or_default();
2336        let need_full = all_properties
2337            .get(variable)
2338            .is_some_and(|p| p.contains("*") || p.contains(STRUCT_ONLY_SENTINEL));
2339        if !properties.iter().any(|p| p == "_all_props") {
2340            properties.push("_all_props".to_string());
2341        }
2342        (properties, need_full)
2343    }
2344
2345    /// Collect edge columns (`._eid` and `__eid_to_*`) from a schema, filtered to the
2346    /// current MATCH scope. Optionally excludes a specific column (for rebound edge patterns).
2347    fn collect_used_edge_columns(
2348        schema: &SchemaRef,
2349        scope_match_variables: &HashSet<String>,
2350        exclude_col: Option<&str>,
2351    ) -> Vec<String> {
2352        schema
2353            .fields()
2354            .iter()
2355            .filter_map(|f| {
2356                let name = f.name();
2357                if exclude_col.is_some_and(|exc| name == exc) {
2358                    None
2359                } else if name.ends_with("._eid") {
2360                    let var_name = name.trim_end_matches("._eid");
2361                    scope_match_variables
2362                        .contains(var_name)
2363                        .then(|| name.clone())
2364                } else if name.starts_with("__eid_to_") {
2365                    let var_name = name.trim_start_matches("__eid_to_");
2366                    scope_match_variables
2367                        .contains(var_name)
2368                        .then(|| name.clone())
2369                } else {
2370                    None
2371                }
2372            })
2373            .collect()
2374    }
2375
2376    /// Conditionally add edge structural projection when the edge variable has wildcard access.
2377    /// Skips if `skip_if_vlp` is true (VLP step variables are already `List<Edge>`).
2378    fn maybe_add_edge_structural_projection(
2379        &self,
2380        plan: Arc<dyn ExecutionPlan>,
2381        step_variable: Option<&str>,
2382        source_variable: &str,
2383        target_variable: &str,
2384        all_properties: &HashMap<String, HashSet<String>>,
2385        skip_if_vlp: bool,
2386    ) -> Result<Arc<dyn ExecutionPlan>> {
2387        if skip_if_vlp {
2388            return Ok(plan);
2389        }
2390        let Some(edge_var) = step_variable else {
2391            return Ok(plan);
2392        };
2393        if !all_properties
2394            .get(edge_var)
2395            .is_some_and(|p| p.contains("*") || p.contains(STRUCT_ONLY_SENTINEL))
2396        {
2397            return Ok(plan);
2398        }
2399        // Derive edge properties from the plan's output schema
2400        let prefix = format!("{}.", edge_var);
2401        let edge_props: Vec<String> = plan
2402            .schema()
2403            .fields()
2404            .iter()
2405            .filter_map(|f| {
2406                f.name()
2407                    .strip_prefix(&prefix)
2408                    .filter(|prop| !prop.starts_with('_') && *prop != "overflow_json")
2409                    .map(|prop| prop.to_string())
2410            })
2411            .collect();
2412        self.add_edge_structural_projection(
2413            plan,
2414            edge_var,
2415            &edge_props,
2416            source_variable,
2417            target_variable,
2418        )
2419    }
2420
2421    /// Apply filter, optional structural projection, and optional wrapping to a schemaless scan.
2422    fn finalize_schemaless_scan(
2423        &self,
2424        scan_plan: Arc<dyn ExecutionPlan>,
2425        variable: &str,
2426        filter: Option<&Expr>,
2427        optional: bool,
2428        properties: &[String],
2429        need_full: bool,
2430    ) -> Result<Arc<dyn ExecutionPlan>> {
2431        // Apply filter BEFORE structural projection to avoid ambiguous column
2432        // references (flat `var._vid` vs struct `var._vid` field).
2433        let mut plan = self.apply_scan_filter(scan_plan, variable, filter, None)?;
2434
2435        // Record surviving (post-filter) row ids into the SSI read-set so keyed
2436        // matches conflict only with writers touching the same rows.
2437        plan = self.wrap_read_set_recording(plan, variable);
2438
2439        // If we need the full object (structural access), build a struct with _labels + properties.
2440        // This enables labels(n)/keys(n) UDFs which expect a Struct column with a _labels field.
2441        if need_full {
2442            // Filter out "*" (wildcard marker) and the structural-only sentinel
2443            // from struct_props. Keep "_all_props" so that keys()/properties()
2444            // UDFs can extract property names at runtime from the CypherValue
2445            // blob.
2446            let struct_props: Vec<String> = properties
2447                .iter()
2448                .filter(|p| *p != "*" && *p != STRUCT_ONLY_SENTINEL)
2449                .cloned()
2450                .collect();
2451            plan = self.add_structural_projection(plan, variable, &struct_props)?;
2452        }
2453
2454        self.wrap_optional(plan, optional)
2455    }
2456
2457    fn plan_schemaless_scan(
2458        &self,
2459        label_name: &str,
2460        variable: &str,
2461        filter: Option<&Expr>,
2462        optional: bool,
2463        all_properties: &HashMap<String, HashSet<String>>,
2464    ) -> Result<Arc<dyn ExecutionPlan>> {
2465        let (properties, need_full) = Self::resolve_schemaless_properties(variable, all_properties);
2466        let scan_plan: Arc<dyn ExecutionPlan> =
2467            Arc::new(GraphScanExec::new_schemaless_vertex_scan(
2468                self.graph_ctx.clone(),
2469                label_name.to_string(),
2470                variable.to_string(),
2471                properties.clone(),
2472                None,
2473            ));
2474        self.finalize_schemaless_scan(
2475            scan_plan,
2476            variable,
2477            filter,
2478            optional,
2479            &properties,
2480            need_full,
2481        )
2482    }
2483
2484    /// Split a label list into `(virtual_labels, native_labels)` against the plugin registry.
2485    ///
2486    /// A label is virtual when `PluginRegistry::virtual_label_by_name` returns
2487    /// a registered id; otherwise it is treated as native. Used by both the
2488    /// single- and multi-label scan paths to decide whether to dispatch a
2489    /// `CatalogVertexScanExec`, a `GraphScanExec`, or a join of the two.
2490    fn classify_labels(
2491        registry: &uni_plugin::PluginRegistry,
2492        labels: &[String],
2493    ) -> (Vec<(String, u16)>, Vec<String>) {
2494        let mut virtual_labels: Vec<(String, u16)> = Vec::new();
2495        let mut native_labels: Vec<String> = Vec::new();
2496        for label in labels {
2497            if let Some(id) = registry.virtual_label_by_name(label) {
2498                virtual_labels.push((label.clone(), id));
2499            } else {
2500                native_labels.push(label.clone());
2501            }
2502        }
2503        (virtual_labels, native_labels)
2504    }
2505
2506    /// Plan a multi-label vertex scan using the main vertices table.
2507    ///
2508    /// For patterns like `(n:A:B)`, scans vertices that carry ALL labels
2509    /// (intersection semantics). When some labels are plugin-registered
2510    /// virtual labels and others are native, builds a `CatalogVertexScanExec`
2511    /// for the virtual side, a `GraphScanExec` for the native side, and a
2512    /// `LeftSemi` `HashJoinExec` keyed on `{variable}._vid` so the catalog
2513    /// rows are filtered by native presence (and the output schema stays
2514    /// clean — only the catalog side's columns flow through).
2515    ///
2516    /// # Errors
2517    ///
2518    /// Returns an error if a virtual-label entry is missing at plan time
2519    /// (a `CatalogProvider` was deregistered after the plan was cached)
2520    /// or if the underlying scan / join construction fails.
2521    fn plan_multi_label_scan(
2522        &self,
2523        labels: &[String],
2524        variable: &str,
2525        filter: Option<&Expr>,
2526        optional: bool,
2527        all_properties: &HashMap<String, HashSet<String>>,
2528    ) -> Result<Arc<dyn ExecutionPlan>> {
2529        let (virtual_labels, native_labels) = Self::classify_labels(&self.plugin_registry, labels);
2530
2531        // All-native: keep the legacy schemaless multi-label scan.
2532        if virtual_labels.is_empty() {
2533            let (properties, need_full) =
2534                Self::resolve_schemaless_properties(variable, all_properties);
2535            let scan_plan: Arc<dyn ExecutionPlan> =
2536                Arc::new(GraphScanExec::new_multi_label_vertex_scan(
2537                    self.graph_ctx.clone(),
2538                    labels.to_vec(),
2539                    variable.to_string(),
2540                    properties.clone(),
2541                    None,
2542                ));
2543            return self.finalize_schemaless_scan(
2544                scan_plan,
2545                variable,
2546                filter,
2547                optional,
2548                &properties,
2549                need_full,
2550            );
2551        }
2552
2553        // Build the virtual side: one `CatalogVertexScanExec` per virtual
2554        // label, unioned when there's more than one. The union is per the
2555        // plan-doc contract ("union if >1"); each catalog table contributes
2556        // its own vid space (encoded with the per-label id), so the unioned
2557        // stream is well-formed.
2558        let virtual_side =
2559            self.build_virtual_union_scan(&virtual_labels, variable, filter, all_properties)?;
2560
2561        // All-virtual: no native filter to apply.
2562        if native_labels.is_empty() {
2563            // Re-apply the Cypher filter as a top-level FilterExec for safety
2564            // (catalog tables may ignore pushdowns). The per-leaf scans already
2565            // ran the filter; this is harmless and keeps semantics consistent
2566            // with `plan_scan`'s single-virtual branch.
2567            let plan = self.apply_scan_filter(virtual_side, variable, filter, None)?;
2568            return self.wrap_optional(plan, optional);
2569        }
2570
2571        // Mixed: build the native side (schemaless multi-label scan projecting
2572        // only `_vid`) and `LeftSemi`-join the virtual side against it. The
2573        // semi-join shape mirrors the plan-doc's "inner on _vid" intent but
2574        // emits only the left (catalog) columns, so downstream consumers see
2575        // a clean `{variable}.{prop}` schema instead of duplicate vid columns.
2576        let native_properties: Vec<String> = vec!["_all_props".to_string()];
2577        let native_scan: Arc<dyn ExecutionPlan> =
2578            Arc::new(GraphScanExec::new_multi_label_vertex_scan(
2579                self.graph_ctx.clone(),
2580                native_labels,
2581                variable.to_string(),
2582                native_properties,
2583                None,
2584            ));
2585
2586        let joined = self.semi_join_on_vid(virtual_side, native_scan, variable)?;
2587        let plan = self.apply_scan_filter(joined, variable, filter, None)?;
2588        self.wrap_optional(plan, optional)
2589    }
2590
2591    /// Build the virtual-side scan: a single `CatalogVertexScanExec` for one
2592    /// virtual label, or a `UnionExec` of one-per-label scans when several.
2593    /// SSI note: like the single virtual scan, the catalog scans built here are
2594    /// deliberately NOT wrapped in read-set recording — virtual labels are
2595    /// read-only with synthetic vids, so no antidependency is possible. See the
2596    /// rationale at the single-label virtual scan in `plan_scan`.
2597    fn build_virtual_union_scan(
2598        &self,
2599        virtual_labels: &[(String, u16)],
2600        variable: &str,
2601        filter: Option<&Expr>,
2602        all_properties: &HashMap<String, HashSet<String>>,
2603    ) -> Result<Arc<dyn ExecutionPlan>> {
2604        let pushdown_filters: Vec<DfExpr> = filter
2605            .map(|f| -> Result<Vec<_>> {
2606                let ctx = crate::query::df_expr::TranslationContext {
2607                    parameters: self.params.clone(),
2608                    outer_values: self.outer_values.clone(),
2609                    ..Default::default()
2610                };
2611                let df = crate::query::df_expr::cypher_expr_to_df(f, Some(&ctx))?;
2612                Ok(vec![df])
2613            })
2614            .transpose()?
2615            .unwrap_or_default();
2616
2617        let mut scans: Vec<Arc<dyn ExecutionPlan>> = Vec::with_capacity(virtual_labels.len());
2618        for (label_name, label_id) in virtual_labels {
2619            let entry = self
2620                .plugin_registry
2621                .virtual_label_by_id(*label_id)
2622                .ok_or_else(|| {
2623                    anyhow!(
2624                        "Virtual label `{label_name}` (id {label_id:#x}) has no \
2625                             registered CatalogTable; the originating CatalogProvider \
2626                             may have been deregistered after the plan was cached"
2627                    )
2628                })?;
2629            let properties = self.resolve_properties(variable, label_name, all_properties);
2630            let exec = crate::query::df_graph::catalog_scan::CatalogVertexScanExec::try_new(
2631                entry.table,
2632                *label_id,
2633                label_name.clone(),
2634                variable.to_string(),
2635                properties,
2636                pushdown_filters.clone(),
2637                None,
2638            )?;
2639            scans.push(Arc::new(exec));
2640        }
2641
2642        if scans.len() == 1 {
2643            Ok(scans.pop().expect("len == 1 implies non-empty"))
2644        } else {
2645            UnionExec::try_new(scans).map_err(|e| anyhow!("UnionExec construction failed: {e}"))
2646        }
2647    }
2648
2649    /// Build a `LeftSemi` `HashJoinExec` keyed on `{variable}._vid` between
2650    /// `left` (the catalog side carrying the row data) and `right` (the
2651    /// native side acting as a presence filter).
2652    fn semi_join_on_vid(
2653        &self,
2654        left: Arc<dyn ExecutionPlan>,
2655        right: Arc<dyn ExecutionPlan>,
2656        variable: &str,
2657    ) -> Result<Arc<dyn ExecutionPlan>> {
2658        use datafusion::common::NullEquality;
2659        use datafusion::physical_plan::expressions::Column;
2660        use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode};
2661
2662        let vid_col = format!("{variable}._vid");
2663        let left_idx = left
2664            .schema()
2665            .index_of(&vid_col)
2666            .map_err(|e| anyhow!("virtual scan output missing `{vid_col}`: {e}"))?;
2667        let right_idx = right
2668            .schema()
2669            .index_of(&vid_col)
2670            .map_err(|e| anyhow!("native scan output missing `{vid_col}`: {e}"))?;
2671        let on: Vec<(
2672            Arc<dyn datafusion::physical_plan::PhysicalExpr>,
2673            Arc<dyn datafusion::physical_plan::PhysicalExpr>,
2674        )> = vec![(
2675            Arc::new(Column::new(&vid_col, left_idx)),
2676            Arc::new(Column::new(&vid_col, right_idx)),
2677        )];
2678        let join = HashJoinExec::try_new(
2679            left,
2680            right,
2681            on,
2682            None,
2683            &JoinType::LeftSemi,
2684            None,
2685            PartitionMode::CollectLeft,
2686            NullEquality::NullEqualsNothing,
2687            false,
2688        )?;
2689        Ok(Arc::new(join))
2690    }
2691
2692    /// Inner-join the traverse output (carrying `{target}._vid`) with a
2693    /// `CatalogVertexScanExec` for a virtual destination label, projecting
2694    /// away the duplicate `_vid` column from the catalog side.
2695    ///
2696    /// Used by `plan_traverse` and `plan_traverse_main_by_type` when the
2697    /// destination label is plugin-registered. The catalog side contributes
2698    /// `{target}._labels` and `{target}.<prop>` for every requested
2699    /// property; the traverse side contributes everything else (source
2700    /// vid/properties, edge columns, the destination vid we join on).
2701    ///
2702    /// # Errors
2703    ///
2704    /// Returns an error if the virtual label entry has been deregistered
2705    /// since plan time, if either side of the join is missing
2706    /// `{target}._vid`, or if the underlying DataFusion plan construction
2707    /// fails.
2708    fn hydrate_virtual_target_from_catalog(
2709        &self,
2710        traverse_plan: Arc<dyn ExecutionPlan>,
2711        target_label_id: u16,
2712        target_variable: &str,
2713        all_properties: &HashMap<String, HashSet<String>>,
2714    ) -> Result<Arc<dyn ExecutionPlan>> {
2715        use datafusion::common::NullEquality;
2716        use datafusion::physical_expr::expressions::{Column, col as col_expr};
2717        use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode};
2718
2719        let entry = self
2720            .plugin_registry
2721            .virtual_label_by_id(target_label_id)
2722            .ok_or_else(|| {
2723                anyhow!(
2724                    "Virtual label id {target_label_id:#x} for target `{target_variable}` has no \
2725                     registered CatalogTable; the originating CatalogProvider may have been \
2726                     deregistered after the plan was cached"
2727                )
2728            })?;
2729        let label_name = entry.name.as_str();
2730        let properties = self.resolve_properties(target_variable, label_name, all_properties);
2731        // The catalog provider may ignore pushdown predicates, but the
2732        // traverse output already constrains rows by `_vid`, so we don't
2733        // need to forward the original target-filter again here. The
2734        // outer `target_filter` FilterExec at the end of `plan_traverse`
2735        // will re-apply.
2736        let catalog_exec = crate::query::df_graph::catalog_scan::CatalogVertexScanExec::try_new(
2737            entry.table,
2738            target_label_id,
2739            label_name.to_string(),
2740            target_variable.to_string(),
2741            properties,
2742            Vec::new(),
2743            None,
2744        )?;
2745        let catalog_plan: Arc<dyn ExecutionPlan> = Arc::new(catalog_exec);
2746
2747        let vid_col_name = format!("{target_variable}._vid");
2748        let left_idx = traverse_plan
2749            .schema()
2750            .index_of(&vid_col_name)
2751            .map_err(|e| anyhow!("traverse plan missing `{vid_col_name}` for hydration: {e}"))?;
2752        let right_idx = catalog_plan
2753            .schema()
2754            .index_of(&vid_col_name)
2755            .map_err(|e| anyhow!("catalog scan missing `{vid_col_name}`: {e}"))?;
2756        let on: Vec<(
2757            Arc<dyn datafusion::physical_plan::PhysicalExpr>,
2758            Arc<dyn datafusion::physical_plan::PhysicalExpr>,
2759        )> = vec![(
2760            Arc::new(Column::new(&vid_col_name, left_idx)),
2761            Arc::new(Column::new(&vid_col_name, right_idx)),
2762        )];
2763        let join = HashJoinExec::try_new(
2764            traverse_plan,
2765            catalog_plan,
2766            on,
2767            None,
2768            &JoinType::Inner,
2769            None,
2770            PartitionMode::CollectLeft,
2771            NullEquality::NullEqualsNothing,
2772            false,
2773        )?;
2774        let join_plan: Arc<dyn ExecutionPlan> = Arc::new(join);
2775
2776        // Project away the duplicate `{target}._vid` from the catalog side.
2777        // HashJoinExec emits left columns followed by right columns; the
2778        // left already has `{target}._vid` from the traverse, so we drop
2779        // the right-side copy (which sits at left_schema_len + right_idx
2780        // before re-ordering — DataFusion's HashJoinExec preserves the
2781        // left/right column order, so the duplicate is in the right
2782        // section).
2783        let join_schema = join_plan.schema();
2784        let mut projection_exprs: Vec<(Arc<dyn datafusion::physical_plan::PhysicalExpr>, String)> =
2785            Vec::with_capacity(join_schema.fields().len() - 1);
2786        let mut seen_vid = false;
2787        for field in join_schema.fields().iter() {
2788            if field.name() == &vid_col_name {
2789                if seen_vid {
2790                    continue;
2791                }
2792                seen_vid = true;
2793            }
2794            let expr = col_expr(field.name(), &join_schema)
2795                .map_err(|e| anyhow!("hydrate_virtual_target_from_catalog projection: {e}"))?;
2796            projection_exprs.push((expr, field.name().clone()));
2797        }
2798        let projected = ProjectionExec::try_new(projection_exprs, join_plan)
2799            .map_err(|e| anyhow!("hydrate_virtual_target_from_catalog projection: {e}"))?;
2800        Ok(Arc::new(projected))
2801    }
2802
2803    /// M5b.3 — physical plan for `MATCH (a)-[r:VirtualEdge]->(b)` where the
2804    /// relationship type is plugin-registered.
2805    ///
2806    /// Builds: `HashJoin(input × CatalogEdgeScanExec)` keyed on
2807    /// `{source}._vid = {step}._src_vid`, then a `ProjectionExec` that
2808    /// renames `{step}._dst_vid` -> `{target}._vid` and drops the
2809    /// duplicate join-key column from the right side. If the destination
2810    /// label is itself virtual, the postlude layers
2811    /// `hydrate_virtual_target_from_catalog` on top.
2812    ///
2813    /// SSI note: the `CatalogEdgeScanExec` and any virtual target are NOT
2814    /// read-set recorded — virtual edges/vertices are read-only with synthetic
2815    /// ids, so no antidependency is possible (see the rationale in `plan_scan`).
2816    /// The *real* source vertex `{source}._vid` entering the join was already
2817    /// recorded by whatever scan produced `input_plan`.
2818    #[expect(
2819        clippy::too_many_arguments,
2820        reason = "mirrors plan_traverse's argument set"
2821    )]
2822    fn plan_traverse_virtual_edge(
2823        &self,
2824        input_plan: Arc<dyn ExecutionPlan>,
2825        source_col: String,
2826        source_variable: &str,
2827        virtual_edge_type_id: u32,
2828        direction: AstDirection,
2829        target_variable: &str,
2830        target_label_id: u16,
2831        step_variable: Option<&str>,
2832        all_properties: &HashMap<String, HashSet<String>>,
2833        target_filter: Option<&Expr>,
2834        optional: bool,
2835        optional_pattern_vars: &HashSet<String>,
2836    ) -> Result<Arc<dyn ExecutionPlan>> {
2837        use datafusion::common::NullEquality;
2838        use datafusion::physical_expr::expressions::{Column, col as col_expr};
2839        use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode};
2840
2841        let entry = self
2842            .plugin_registry
2843            .virtual_edge_type_by_id(virtual_edge_type_id)
2844            .ok_or_else(|| {
2845                anyhow!(
2846                    "Virtual edge-type id {virtual_edge_type_id:#x} for `{target_variable}` has \
2847                     no registered CatalogTable; the originating CatalogProvider may have been \
2848                     deregistered after the plan was cached"
2849                )
2850            })?;
2851        let type_name = entry.name.as_str();
2852        let edge_var = step_variable
2853            .map(str::to_string)
2854            .unwrap_or_else(|| format!("__anon_edge_{target_variable}"));
2855
2856        let edge_properties: Vec<String> = step_variable
2857            .and_then(|sv| all_properties.get(sv))
2858            .map(|props| {
2859                props
2860                    .iter()
2861                    .filter(|p| !p.starts_with('_') && *p != "*")
2862                    .cloned()
2863                    .collect()
2864            })
2865            .unwrap_or_default();
2866
2867        let catalog_exec = crate::query::df_graph::catalog_scan::CatalogEdgeScanExec::try_new(
2868            entry.table,
2869            virtual_edge_type_id,
2870            type_name.to_string(),
2871            edge_var.clone(),
2872            edge_properties,
2873            Vec::new(),
2874            None,
2875        )?;
2876        let catalog_plan: Arc<dyn ExecutionPlan> = Arc::new(catalog_exec);
2877
2878        let edge_src_col = format!("{edge_var}._src_vid");
2879        let edge_dst_col = format!("{edge_var}._dst_vid");
2880        let (right_key, target_src_col) = match direction {
2881            AstDirection::Outgoing => (edge_src_col.clone(), edge_dst_col.clone()),
2882            AstDirection::Incoming => (edge_dst_col.clone(), edge_src_col.clone()),
2883            AstDirection::Both => (edge_src_col.clone(), edge_dst_col.clone()),
2884        };
2885
2886        let left_idx = input_plan
2887            .schema()
2888            .index_of(&source_col)
2889            .map_err(|e| anyhow!("input plan missing source vid column `{source_col}`: {e}"))?;
2890        let right_idx = catalog_plan
2891            .schema()
2892            .index_of(&right_key)
2893            .map_err(|e| anyhow!("CatalogEdgeScanExec missing `{right_key}`: {e}"))?;
2894        let on: Vec<(
2895            Arc<dyn datafusion::physical_plan::PhysicalExpr>,
2896            Arc<dyn datafusion::physical_plan::PhysicalExpr>,
2897        )> = vec![(
2898            Arc::new(Column::new(&source_col, left_idx)),
2899            Arc::new(Column::new(&right_key, right_idx)),
2900        )];
2901        let join = HashJoinExec::try_new(
2902            input_plan,
2903            catalog_plan,
2904            on,
2905            None,
2906            &JoinType::Inner,
2907            None,
2908            PartitionMode::CollectLeft,
2909            NullEquality::NullEqualsNothing,
2910            false,
2911        )?;
2912        let join_plan: Arc<dyn ExecutionPlan> = Arc::new(join);
2913
2914        let join_schema = join_plan.schema();
2915        let target_vid_name = format!("{target_variable}._vid");
2916        let mut projection_exprs: Vec<(Arc<dyn datafusion::physical_plan::PhysicalExpr>, String)> =
2917            Vec::with_capacity(join_schema.fields().len());
2918        for field in join_schema.fields() {
2919            let name = field.name();
2920            if name == &right_key {
2921                continue;
2922            }
2923            let expr = col_expr(name, &join_schema)
2924                .map_err(|e| anyhow!("plan_traverse_virtual_edge projection: {e}"))?;
2925            let out_name = if name == &target_src_col {
2926                target_vid_name.clone()
2927            } else {
2928                name.clone()
2929            };
2930            projection_exprs.push((expr, out_name));
2931        }
2932        let projected: Arc<dyn ExecutionPlan> = Arc::new(
2933            ProjectionExec::try_new(projection_exprs, join_plan)
2934                .map_err(|e| anyhow!("plan_traverse_virtual_edge projection: {e}"))?,
2935        );
2936
2937        let mut plan = if uni_common::core::schema::is_virtual_label_id(target_label_id) {
2938            self.hydrate_virtual_target_from_catalog(
2939                projected,
2940                target_label_id,
2941                target_variable,
2942                all_properties,
2943            )?
2944        } else {
2945            projected
2946        };
2947
2948        plan = self.add_wildcard_structural_projection(plan, target_variable, all_properties)?;
2949        plan = self.maybe_add_edge_structural_projection(
2950            plan,
2951            step_variable,
2952            source_variable,
2953            target_variable,
2954            all_properties,
2955            false,
2956        )?;
2957
2958        if let Some(filter_expr) = target_filter {
2959            let mut variable_kinds = HashMap::new();
2960            variable_kinds.insert(source_variable.to_string(), VariableKind::Node);
2961            variable_kinds.insert(target_variable.to_string(), VariableKind::Node);
2962            if let Some(sv) = step_variable {
2963                variable_kinds.insert(sv.to_string(), VariableKind::edge_for(false));
2964            }
2965            let ctx = TranslationContext {
2966                parameters: self.params.clone(),
2967                variable_kinds,
2968                ..Default::default()
2969            };
2970            let df_filter = cypher_expr_to_df(filter_expr, Some(&ctx))?;
2971            let schema = plan.schema();
2972            let session = self.session_ctx.read();
2973            let physical_filter =
2974                self.create_physical_filter_expr(&df_filter, &schema, &session)?;
2975            plan = if optional {
2976                Arc::new(OptionalFilterExec::new(
2977                    plan,
2978                    physical_filter,
2979                    optional_pattern_vars.clone(),
2980                ))
2981            } else {
2982                Arc::new(FilterExec::try_new(physical_filter, plan)?)
2983            };
2984        } else {
2985            let _ = optional_pattern_vars;
2986        }
2987        Ok(plan)
2988    }
2989
2990    /// Plan a scan of all vertices regardless of label.
2991    ///
2992    /// This is used for `MATCH (n)` without a label filter.
2993    fn plan_scan_all(
2994        &self,
2995        variable: &str,
2996        filter: Option<&Expr>,
2997        optional: bool,
2998        all_properties: &HashMap<String, HashSet<String>>,
2999    ) -> Result<Arc<dyn ExecutionPlan>> {
3000        let (properties, need_full) = Self::resolve_schemaless_properties(variable, all_properties);
3001        // Extract VID(s) from filter for scan-level optimization. See the
3002        // detailed comment at the per-label scan site (issue #55 PR #4).
3003        let extracted_vids = Self::extract_vid_from_cypher_filter(filter, variable, &self.params);
3004        let scan_filter = extracted_vids
3005            .as_deref()
3006            .filter(|v| v.len() == 1)
3007            .map(|v| Self::build_vid_physical_filter(&format!("{variable}._vid"), v[0]));
3008        let mut scan_exec = GraphScanExec::new_schemaless_all_scan(
3009            self.graph_ctx.clone(),
3010            variable.to_string(),
3011            properties.clone(),
3012            scan_filter,
3013        );
3014        if let Some(vids) = extracted_vids
3015            && vids.len() > 1
3016        {
3017            scan_exec = scan_exec.with_vid_list_filter(vids);
3018        }
3019        let scan_plan: Arc<dyn ExecutionPlan> = Arc::new(scan_exec);
3020        self.finalize_schemaless_scan(
3021            scan_plan,
3022            variable,
3023            filter,
3024            optional,
3025            &properties,
3026            need_full,
3027        )
3028    }
3029
3030    /// Plan a graph traversal.
3031    #[expect(
3032        clippy::too_many_arguments,
3033        reason = "Graph traversal requires many parameters"
3034    )]
3035    fn plan_traverse(
3036        &self,
3037        input: &LogicalPlan,
3038        edge_type_ids: &[u32],
3039        direction: AstDirection,
3040        source_variable: &str,
3041        target_variable: &str,
3042        target_label_id: u16,
3043        step_variable: Option<&str>,
3044        min_hops: usize,
3045        max_hops: usize,
3046        path_variable: Option<&str>,
3047        optional: bool,
3048        target_filter: Option<&Expr>,
3049        is_variable_length: bool,
3050        optional_pattern_vars: &HashSet<String>,
3051        all_properties: &HashMap<String, HashSet<String>>,
3052        scope_match_variables: &HashSet<String>,
3053        edge_filter_expr: Option<&Expr>,
3054        path_mode: &crate::query::df_graph::nfa::PathMode,
3055        qpp_steps: Option<&[crate::query::planner::QppStepInfo]>,
3056    ) -> Result<Arc<dyn ExecutionPlan>> {
3057        let input_plan = self.plan_internal(input, all_properties)?;
3058
3059        let adj_direction = convert_direction(direction.clone());
3060        let (input_plan, source_col) = Self::resolve_source_vid_col(input_plan, source_variable)?;
3061
3062        // M5b.3 — virtual edge-type dispatch. When the relationship type
3063        // is plugin-registered (`is_virtual_edge_type_id`), there are no
3064        // native adjacencies: the rows live in a `CatalogTable` accessed
3065        // via `CatalogEdgeScanExec`. The all-virtual single-hop case
3066        // dispatches to `plan_traverse_virtual_edge`; mixed
3067        // native+virtual and VLP-with-virtual continue through the legacy
3068        // `GraphTraverseExec` branch (yielding zero rows for the virtual
3069        // portion, matching the pre-M5b.3 baseline).
3070        if !is_variable_length
3071            && !edge_type_ids.is_empty()
3072            && edge_type_ids.len() == 1
3073            && edge_type_ids
3074                .iter()
3075                .all(|eid| uni_common::core::edge_type::is_virtual_edge_type(*eid))
3076        {
3077            return self.plan_traverse_virtual_edge(
3078                input_plan,
3079                source_col,
3080                source_variable,
3081                edge_type_ids[0],
3082                direction,
3083                target_variable,
3084                target_label_id,
3085                step_variable,
3086                all_properties,
3087                target_filter,
3088                optional,
3089                optional_pattern_vars,
3090            );
3091        }
3092
3093        let traverse_plan: Arc<dyn ExecutionPlan> = if !is_variable_length {
3094            // Extract edge properties for pushdown hydration, expanding "*" wildcards
3095            let mut edge_properties: Vec<String> = if let Some(edge_var) = step_variable {
3096                let has_wildcard = all_properties
3097                    .get(edge_var)
3098                    .is_some_and(|props| props.contains("*"));
3099                if has_wildcard {
3100                    // Expand to all schema-defined properties across all matching edge types
3101                    let mut schema_props: Vec<String> = edge_type_ids
3102                        .iter()
3103                        .filter_map(|eid| self.schema.edge_type_name_by_id(*eid))
3104                        .flat_map(|name| {
3105                            self.schema
3106                                .properties
3107                                .get(name)
3108                                .map(|p| p.keys().cloned().collect::<Vec<_>>())
3109                                .unwrap_or_default()
3110                        })
3111                        .collect();
3112
3113                    // Also include explicitly referenced properties (non-wildcard, non-internal)
3114                    // that may be overflow properties not in the schema. System-managed
3115                    // timestamp columns (`_created_at`, `_updated_at`) requested via
3116                    // `created_at(r)` / `updated_at(r)` are kept too.
3117                    if let Some(props) = all_properties.get(edge_var) {
3118                        for p in props {
3119                            let passthrough = !p.starts_with('_')
3120                                || matches!(p.as_str(), "_created_at" | "_updated_at");
3121                            if p != "*" && passthrough && !schema_props.contains(p) {
3122                                schema_props.push(p.clone());
3123                            }
3124                        }
3125                    }
3126                    schema_props
3127                } else {
3128                    all_properties
3129                        .get(edge_var)
3130                        .map(|props| props.iter().filter(|p| *p != "*").cloned().collect())
3131                        .unwrap_or_default()
3132                }
3133            } else {
3134                Vec::new()
3135            };
3136
3137            // Check if any edge property is NOT in the schema (needs overflow_json)
3138            if let Some(edge_var) = step_variable {
3139                let has_wildcard = all_properties
3140                    .get(edge_var)
3141                    .is_some_and(|props| props.contains("*"));
3142                let edge_type_props = self.merged_edge_type_properties(edge_type_ids);
3143                let has_overflow_edge_props = edge_properties.iter().any(|p| {
3144                    p != "overflow_json"
3145                        && !p.starts_with('_')
3146                        && !edge_type_props.contains_key(p.as_str())
3147                });
3148                // Add overflow_json if:
3149                // 1. Wildcard was used AND edge_properties is empty (no schema props for this edge type)
3150                // 2. OR there are overflow properties explicitly referenced
3151                let needs_overflow =
3152                    (has_wildcard && edge_properties.is_empty()) || has_overflow_edge_props;
3153                if needs_overflow && !edge_properties.contains(&"overflow_json".to_string()) {
3154                    edge_properties.push("overflow_json".to_string());
3155                }
3156
3157                // Add _all_props for L0 edge property visibility: schemaless edges
3158                // store properties by name in L0, not as overflow_json blobs, so we
3159                // need _all_props to surface them through the DataFusion path.
3160                if has_wildcard && !edge_properties.contains(&"_all_props".to_string()) {
3161                    edge_properties.push("_all_props".to_string());
3162                }
3163            }
3164
3165            // Extract target vertex properties, expanding "*" wildcards
3166            let target_label_name_str = self.schema.label_name_by_id(target_label_id).unwrap_or("");
3167            let mut target_properties =
3168                self.resolve_properties(target_variable, target_label_name_str, all_properties);
3169
3170            // Filter out "*" and the structural-only sentinel from
3171            // target_properties — they are used for structural projection
3172            // (bare variable access like `RETURN t`, or SET t.prop) but must
3173            // not be passed to GraphTraverseExec as actual property column
3174            // names.
3175            target_properties.retain(|p| p != "*" && p != STRUCT_ONLY_SENTINEL);
3176
3177            // When wildcard access was requested but no specific properties resolved,
3178            // add _all_props to ensure properties are loaded (mirrors plan_scan_all behavior).
3179            let target_has_wildcard = all_properties
3180                .get(target_variable)
3181                .is_some_and(|p| p.contains("*"));
3182            if target_has_wildcard && target_properties.is_empty() {
3183                target_properties.push("_all_props".to_string());
3184            }
3185
3186            // Check for non-schema properties that need CypherValue extraction.
3187            // For the traverse path, always use _all_props (not overflow_json) as
3188            // the CypherValue source since get_property_value handles _all_props directly.
3189            let target_label_props = if !target_label_name_str.is_empty() {
3190                self.schema.properties.get(target_label_name_str)
3191            } else {
3192                None
3193            };
3194            let has_non_schema_props = target_properties.iter().any(|p| {
3195                p != "overflow_json"
3196                    && p != "_all_props"
3197                    && !p.starts_with('_')
3198                    && !target_label_props.is_some_and(|lp| lp.contains_key(p.as_str()))
3199            });
3200            if has_non_schema_props && !target_properties.iter().any(|p| p == "_all_props") {
3201                target_properties.push("_all_props".to_string());
3202            }
3203            // Also check the filter for non-schema property references
3204            if let Some(filter_expr) = target_filter {
3205                let filter_props = crate::query::df_expr::collect_properties(filter_expr);
3206                let has_overflow_filter = filter_props.iter().any(|(var, prop)| {
3207                    var == target_variable
3208                        && !prop.starts_with('_')
3209                        && !target_label_props
3210                            .is_some_and(|props| props.contains_key(prop.as_str()))
3211                });
3212                if has_overflow_filter && !target_properties.iter().any(|p| p == "_all_props") {
3213                    target_properties.push("_all_props".to_string());
3214                }
3215            }
3216            // For schema-defined labels that also have overflow properties, add overflow_json
3217            // for the scan path compatibility (Lance storage has overflow_json column).
3218            if !target_label_name_str.is_empty()
3219                && has_non_schema_props
3220                && !target_properties.iter().any(|p| p == "overflow_json")
3221            {
3222                target_properties.push("overflow_json".to_string());
3223            }
3224
3225            // Resolve target label name for property type lookups
3226            let target_label_name = if target_label_name_str.is_empty() {
3227                None
3228            } else {
3229                Some(target_label_name_str.to_string())
3230            };
3231
3232            // Single-hop traversal
3233            // Note: target_label_id is not passed here because VIDs no longer embed label info.
3234            // Label filtering for traversals is handled via the fallback executor when DataFusion
3235            // cannot handle the query, or via explicit filter predicates.
3236
3237            // Check if target variable is already bound (for cycle patterns like n-->k<--n)
3238            let bound_target_column =
3239                Self::detect_bound_target(&input_plan.schema(), target_variable);
3240
3241            // Collect edge ID columns from previous hops for relationship uniqueness.
3242            // Look for both explicit edge variables (ending in "._eid") and
3243            // internal tracking columns (starting with "__eid_to_").
3244            //
3245            // Rebound edge patterns (e.g. OPTIONAL MATCH ()-[r]->() where `r` is already bound)
3246            // use a temporary edge variable `__rebound_{r}` for traversal and then filter on eid.
3247            // Do not treat the already-bound `{r}._eid` as "used" here, otherwise the only
3248            // candidate edge is filtered out before rebound matching.
3249            // Handle rebound struct variables from WITH + aggregation.
3250            // When edge or target variables have passed through aggregation, they become
3251            // struct columns. Extract ALL fields as flat columns so that:
3252            // 1. {edge}._eid is available for uniqueness checking
3253            // 2. {edge}.{property} is available for downstream RETURN/WHERE
3254            // 3. {target}._vid is available for the bound target filter
3255            // 4. {target}.{property} is available for downstream RETURN/WHERE
3256            let mut input_plan = input_plan;
3257            for rebound_var in [
3258                step_variable.and_then(|sv| sv.strip_prefix("__rebound_")),
3259                target_variable.strip_prefix("__rebound_"),
3260            ]
3261            .into_iter()
3262            .flatten()
3263            {
3264                if input_plan
3265                    .schema()
3266                    .field_with_name(rebound_var)
3267                    .ok()
3268                    .is_some_and(|f| {
3269                        matches!(
3270                            f.data_type(),
3271                            datafusion::arrow::datatypes::DataType::Struct(_)
3272                        )
3273                    })
3274                {
3275                    input_plan = Self::extract_all_struct_fields(input_plan, rebound_var)?;
3276                }
3277            }
3278
3279            let rebound_bound_edge_col = step_variable
3280                .and_then(|sv| sv.strip_prefix("__rebound_"))
3281                .map(|bound| format!("{}._eid", bound));
3282
3283            let used_edge_columns = Self::collect_used_edge_columns(
3284                &input_plan.schema(),
3285                scope_match_variables,
3286                rebound_bound_edge_col.as_deref(),
3287            );
3288
3289            Arc::new(GraphTraverseExec::new(
3290                input_plan,
3291                source_col,
3292                edge_type_ids.to_vec(),
3293                adj_direction,
3294                target_variable.to_string(),
3295                step_variable.map(|s| s.to_string()),
3296                edge_properties,
3297                target_properties,
3298                target_label_name,
3299                None, // VIDs don't embed label - use VidLabelsIndex instead
3300                self.graph_ctx.clone(),
3301                optional,
3302                optional_pattern_vars.clone(),
3303                bound_target_column,
3304                used_edge_columns,
3305            ))
3306        } else {
3307            // Variable-length traversal
3308            if edge_type_ids.is_empty() {
3309                // No edge types - for min_hops=0, we can still emit zero-length paths
3310                // Use BindZeroLengthPath to create path with just the source node
3311                if let (0, Some(path_var)) = (min_hops, path_variable) {
3312                    return Ok(Arc::new(BindZeroLengthPathExec::new(
3313                        input_plan,
3314                        source_variable.to_string(),
3315                        path_var.to_string(),
3316                        self.graph_ctx.clone(),
3317                    )));
3318                } else if min_hops == 0 && step_variable.is_none() {
3319                    // min_hops=0 but no path variable - just return input as-is
3320                    // (the target is the same as source for zero-length)
3321                    return Ok(input_plan);
3322                }
3323            }
3324            {
3325                // Resolve target properties for VLP (same logic as single-hop above)
3326                let vlp_target_label_name_str =
3327                    self.schema.label_name_by_id(target_label_id).unwrap_or("");
3328                let vlp_target_properties_raw = self.resolve_properties(
3329                    target_variable,
3330                    vlp_target_label_name_str,
3331                    all_properties,
3332                );
3333                let target_has_wildcard = all_properties
3334                    .get(target_variable)
3335                    .is_some_and(|p| p.contains("*"));
3336                let vlp_target_label_props: Option<HashSet<String>> =
3337                    if vlp_target_label_name_str.is_empty() {
3338                        None
3339                    } else {
3340                        self.schema
3341                            .properties
3342                            .get(vlp_target_label_name_str)
3343                            .map(|props| props.keys().cloned().collect())
3344                    };
3345                let mut vlp_target_properties = sanitize_vlp_target_properties(
3346                    vlp_target_properties_raw,
3347                    target_has_wildcard,
3348                    vlp_target_label_props.as_ref(),
3349                );
3350                let vlp_target_label_name = if vlp_target_label_name_str.is_empty() {
3351                    None
3352                } else {
3353                    Some(vlp_target_label_name_str.to_string())
3354                };
3355
3356                // Check if target variable is already bound (for patterns where target is in scope)
3357                let bound_target_column =
3358                    Self::detect_bound_target(&input_plan.schema(), target_variable);
3359                if bound_target_column.is_some() {
3360                    // For correlated patterns with bound target, traversal only needs reachability.
3361                    // Reuse existing bound target columns from input and avoid re-hydrating props.
3362                    vlp_target_properties.clear();
3363                }
3364
3365                // VLP: compile edge predicates to Lance SQL for bitmap preselection
3366                let edge_lance_filter: Option<String> = edge_filter_expr.and_then(|expr| {
3367                    let edge_var_name = step_variable.unwrap_or("__anon_edge");
3368                    crate::query::pushdown::LanceFilterGenerator::generate(
3369                        std::slice::from_ref(expr),
3370                        edge_var_name,
3371                        None,
3372                    )
3373                });
3374
3375                // VLP: extract simple property equality conditions for L0 checking
3376                let edge_property_conditions = edge_filter_expr
3377                    .map(Self::extract_edge_property_conditions)
3378                    .unwrap_or_default();
3379
3380                // VLP: collect used edge columns for cross-pattern relationship uniqueness
3381                let used_edge_columns = Self::collect_used_edge_columns(
3382                    &input_plan.schema(),
3383                    scope_match_variables,
3384                    None,
3385                );
3386
3387                // VLP: determine output mode based on bound variables
3388                let output_mode = if step_variable.is_some() {
3389                    crate::query::df_graph::nfa::VlpOutputMode::StepVariable
3390                } else if path_variable.is_some() {
3391                    crate::query::df_graph::nfa::VlpOutputMode::FullPath
3392                } else {
3393                    crate::query::df_graph::nfa::VlpOutputMode::EndpointsOnly
3394                };
3395
3396                // Compile QPP NFA if multi-step pattern, otherwise let exec compile VLP NFA
3397                let qpp_nfa = qpp_steps.map(|steps| {
3398                    use crate::query::df_graph::nfa::{QppStep, VertexConstraint};
3399                    let hops_per_iter = steps.len();
3400                    let min_iter = min_hops / hops_per_iter;
3401                    let max_iter = max_hops / hops_per_iter;
3402                    let nfa_steps: Vec<QppStep> = steps
3403                        .iter()
3404                        .map(|s| QppStep {
3405                            edge_type_ids: s.edge_type_ids.clone(),
3406                            direction: convert_direction(s.direction.clone()),
3407                            target_constraint: s
3408                                .target_label
3409                                .as_ref()
3410                                .map(|l| VertexConstraint::Label(l.clone())),
3411                        })
3412                        .collect();
3413                    crate::query::df_graph::nfa::PathNfa::from_qpp(nfa_steps, min_iter, max_iter)
3414                });
3415
3416                Arc::new(GraphVariableLengthTraverseExec::new(
3417                    input_plan,
3418                    source_col,
3419                    edge_type_ids.to_vec(),
3420                    adj_direction,
3421                    min_hops,
3422                    max_hops,
3423                    target_variable.to_string(),
3424                    step_variable.map(|s| s.to_string()),
3425                    path_variable.map(|s| s.to_string()),
3426                    vlp_target_properties,
3427                    vlp_target_label_name,
3428                    self.graph_ctx.clone(),
3429                    optional,
3430                    bound_target_column,
3431                    edge_lance_filter,
3432                    edge_property_conditions,
3433                    used_edge_columns,
3434                    path_mode.clone(),
3435                    output_mode,
3436                    qpp_nfa,
3437                ))
3438            }
3439        };
3440
3441        // Add structural projections for bare variable access (RETURN t, labels(t), etc.)
3442        let mut traverse_plan = traverse_plan;
3443
3444        // M5b.3 — Native↔virtual joins mid-pattern. When the destination
3445        // label of the traversal is a plugin-registered virtual label, the
3446        // graph operator above has produced `{target}._vid` against the
3447        // native adjacency (so this only makes sense when host storage
3448        // contains edges whose destination vid is the virtual encoding).
3449        // Hydrate target properties from the corresponding `CatalogTable`
3450        // by inner-joining a `CatalogVertexScanExec` on `{target}._vid`.
3451        // The catalog scan side carries `_vid`, `_labels`, and the
3452        // requested properties — we drop its `_vid` after the join so the
3453        // output schema stays unambiguous for downstream consumers.
3454        if uni_common::core::schema::is_virtual_label_id(target_label_id) {
3455            traverse_plan = self.hydrate_virtual_target_from_catalog(
3456                traverse_plan,
3457                target_label_id,
3458                target_variable,
3459                all_properties,
3460            )?;
3461        }
3462
3463        // Structural projection for target variable
3464        traverse_plan = self.add_wildcard_structural_projection(
3465            traverse_plan,
3466            target_variable,
3467            all_properties,
3468        )?;
3469
3470        // Structural projection for edge variable
3471        // Only for single-hop traversals; VLP step variables are already List<Edge>
3472        traverse_plan = self.maybe_add_edge_structural_projection(
3473            traverse_plan,
3474            step_variable,
3475            source_variable,
3476            target_variable,
3477            all_properties,
3478            is_variable_length,
3479        )?;
3480
3481        // Apply target filter if present
3482        if let Some(filter_expr) = target_filter {
3483            // Build context with variable kinds for this traverse
3484            let mut variable_kinds = HashMap::new();
3485            variable_kinds.insert(source_variable.to_string(), VariableKind::Node);
3486            variable_kinds.insert(target_variable.to_string(), VariableKind::Node);
3487            if let Some(sv) = step_variable {
3488                variable_kinds.insert(sv.to_string(), VariableKind::edge_for(is_variable_length));
3489            }
3490            if let Some(pv) = path_variable {
3491                variable_kinds.insert(pv.to_string(), VariableKind::Path);
3492            }
3493            let mut variable_labels = HashMap::new();
3494            if let Some(sv) = step_variable
3495                && edge_type_ids.len() == 1
3496                && let Some(name) = self.schema.edge_type_name_by_id(edge_type_ids[0])
3497            {
3498                variable_labels.insert(sv.to_string(), name.to_string());
3499            }
3500            let target_label_name_str = self.schema.label_name_by_id(target_label_id).unwrap_or("");
3501            if !target_label_name_str.is_empty() {
3502                variable_labels.insert(
3503                    target_variable.to_string(),
3504                    target_label_name_str.to_string(),
3505                );
3506            }
3507            let ctx = TranslationContext {
3508                parameters: self.params.clone(),
3509                variable_labels,
3510                variable_kinds,
3511                ..Default::default()
3512            };
3513            let df_filter = cypher_expr_to_df(filter_expr, Some(&ctx))?;
3514            let schema = traverse_plan.schema();
3515            let session = self.session_ctx.read();
3516            let physical_filter =
3517                self.create_physical_filter_expr(&df_filter, &schema, &session)?;
3518
3519            if optional {
3520                Ok(Arc::new(OptionalFilterExec::new(
3521                    traverse_plan,
3522                    physical_filter,
3523                    optional_pattern_vars.clone(),
3524                )))
3525            } else {
3526                Ok(Arc::new(FilterExec::try_new(
3527                    physical_filter,
3528                    traverse_plan,
3529                )?))
3530            }
3531        } else {
3532            Ok(traverse_plan)
3533        }
3534    }
3535
3536    /// Plan a schemaless edge traversal (TraverseMainByType).
3537    ///
3538    /// This is used for edges without a schema-defined type that must query the main edges table.
3539    /// Supports OR relationship types like `[:KNOWS|HATES]` via multiple type_names.
3540    #[expect(clippy::too_many_arguments)]
3541    fn plan_traverse_main_by_type(
3542        &self,
3543        input: &LogicalPlan,
3544        type_names: &[String],
3545        direction: AstDirection,
3546        source_variable: &str,
3547        target_variable: &str,
3548        step_variable: Option<&str>,
3549        optional: bool,
3550        optional_pattern_vars: &HashSet<String>,
3551        all_properties: &HashMap<String, HashSet<String>>,
3552        scope_match_variables: &HashSet<String>,
3553    ) -> Result<Arc<dyn ExecutionPlan>> {
3554        let input_plan = self.plan_internal(input, all_properties)?;
3555
3556        let adj_direction = convert_direction(direction);
3557        let (input_plan, source_col) = Self::resolve_source_vid_col(input_plan, source_variable)?;
3558
3559        // Check if target variable is already bound (for patterns where target is in scope)
3560        let bound_target_column = Self::detect_bound_target(&input_plan.schema(), target_variable);
3561
3562        // Extract edge properties for schemaless edges (all treated as Utf8/JSON)
3563        let mut edge_properties: Vec<String> = if let Some(edge_var) = step_variable {
3564            all_properties
3565                .get(edge_var)
3566                .map(|props| props.iter().filter(|p| *p != "*").cloned().collect())
3567                .unwrap_or_default()
3568        } else {
3569            Vec::new()
3570        };
3571
3572        // If edge has wildcard, include _all_props for keys()/properties() support
3573        if let Some(edge_var) = step_variable
3574            && all_properties
3575                .get(edge_var)
3576                .is_some_and(|props| props.contains("*"))
3577            && !edge_properties.iter().any(|p| p == "_all_props")
3578        {
3579            edge_properties.push("_all_props".to_string());
3580        }
3581
3582        // Extract target vertex properties
3583        let mut target_properties: Vec<String> = all_properties
3584            .get(target_variable)
3585            .map(|props| props.iter().filter(|p| *p != "*").cloned().collect())
3586            .unwrap_or_default();
3587
3588        // Always include _all_props so post-traverse filters can rewrite
3589        // property accesses to json_get_* calls against the CypherValue blob.
3590        // Also include it when wildcard access was requested (RETURN n) even if empty.
3591        let target_has_wildcard = all_properties
3592            .get(target_variable)
3593            .is_some_and(|p| p.contains("*"));
3594        if (target_has_wildcard || !target_properties.is_empty())
3595            && !target_properties.iter().any(|p| p == "_all_props")
3596        {
3597            target_properties.push("_all_props".to_string());
3598        }
3599        if bound_target_column.is_some() {
3600            // Target already comes from outer scope; avoid redundant property materialization.
3601            target_properties.clear();
3602        }
3603
3604        // Compute used_edge_columns for relationship uniqueness (same logic as Traverse).
3605        // Exclude the rebound edge's own column so the BFS can match the bound edge.
3606        let rebound_bound_edge_col = step_variable
3607            .and_then(|sv| sv.strip_prefix("__rebound_"))
3608            .map(|bound| format!("{}._eid", bound));
3609        let used_edge_columns = Self::collect_used_edge_columns(
3610            &input_plan.schema(),
3611            scope_match_variables,
3612            rebound_bound_edge_col.as_deref(),
3613        );
3614
3615        // Create the schemaless traversal execution plan
3616        let traverse_plan: Arc<dyn ExecutionPlan> = Arc::new(GraphTraverseMainExec::new(
3617            input_plan,
3618            source_col,
3619            type_names.to_vec(),
3620            adj_direction,
3621            target_variable.to_string(),
3622            step_variable.map(|s| s.to_string()),
3623            edge_properties.clone(),
3624            target_properties,
3625            self.graph_ctx.clone(),
3626            optional,
3627            optional_pattern_vars.clone(),
3628            bound_target_column,
3629            used_edge_columns,
3630        ));
3631
3632        let mut result_plan = traverse_plan;
3633
3634        // Structural projection for target variable (RETURN t, labels(t), etc.)
3635        result_plan =
3636            self.add_wildcard_structural_projection(result_plan, target_variable, all_properties)?;
3637
3638        // Structural projection for edge variable (type(r), RETURN r, etc.)
3639        result_plan = self.maybe_add_edge_structural_projection(
3640            result_plan,
3641            step_variable,
3642            source_variable,
3643            target_variable,
3644            all_properties,
3645            false, // not variable-length
3646        )?;
3647
3648        Ok(result_plan)
3649    }
3650
3651    /// Plan a schemaless edge traversal with variable-length paths (TraverseMainByType VLP).
3652    ///
3653    /// This is used for VLP patterns on edges without a schema-defined type that must query the main edges table.
3654    /// Supports OR relationship types like `[:KNOWS|HATES]` via multiple type_names.
3655    #[expect(clippy::too_many_arguments)]
3656    fn plan_traverse_main_by_type_vlp(
3657        &self,
3658        input: &LogicalPlan,
3659        type_names: &[String],
3660        direction: AstDirection,
3661        source_variable: &str,
3662        target_variable: &str,
3663        step_variable: Option<&str>,
3664        min_hops: usize,
3665        max_hops: usize,
3666        path_variable: Option<&str>,
3667        optional: bool,
3668        all_properties: &HashMap<String, HashSet<String>>,
3669        edge_filter_expr: Option<&Expr>,
3670        path_mode: &crate::query::df_graph::nfa::PathMode,
3671        scope_match_variables: &HashSet<String>,
3672    ) -> Result<Arc<dyn ExecutionPlan>> {
3673        let input_plan = self.plan_internal(input, all_properties)?;
3674
3675        let adj_direction = convert_direction(direction);
3676        let (input_plan, source_col) = Self::resolve_source_vid_col(input_plan, source_variable)?;
3677
3678        // Check if target variable is already bound (for patterns where target is in scope)
3679        let bound_target_column = Self::detect_bound_target(&input_plan.schema(), target_variable);
3680
3681        // Extract target vertex properties
3682        let mut target_properties: Vec<String> = all_properties
3683            .get(target_variable)
3684            .map(|props| props.iter().filter(|p| *p != "*").cloned().collect())
3685            .unwrap_or_default();
3686
3687        // Always include _all_props so post-traverse filters can rewrite
3688        // property accesses to json_get_* calls against the CypherValue blob.
3689        // Also include it when wildcard access was requested (RETURN n) even if empty.
3690        let target_has_wildcard = all_properties
3691            .get(target_variable)
3692            .is_some_and(|p| p.contains("*"));
3693        if (target_has_wildcard || !target_properties.is_empty())
3694            && !target_properties.iter().any(|p| p == "_all_props")
3695        {
3696            target_properties.push("_all_props".to_string());
3697        }
3698        if bound_target_column.is_some() {
3699            // Correlated EXISTS only requires reachability; keep bound target columns from input.
3700            target_properties.clear();
3701        }
3702
3703        // VLP: compile edge predicates to Lance SQL for bitmap preselection
3704        let edge_lance_filter: Option<String> = edge_filter_expr.and_then(|expr| {
3705            let edge_var_name = step_variable.unwrap_or("__anon_edge");
3706            crate::query::pushdown::LanceFilterGenerator::generate(
3707                std::slice::from_ref(expr),
3708                edge_var_name,
3709                None,
3710            )
3711        });
3712
3713        // VLP: extract edge property conditions for BFS-level filtering
3714        let edge_property_conditions = edge_filter_expr
3715            .map(Self::extract_edge_property_conditions)
3716            .unwrap_or_default();
3717
3718        // VLP: collect used edge columns for cross-pattern relationship uniqueness
3719        let used_edge_columns =
3720            Self::collect_used_edge_columns(&input_plan.schema(), scope_match_variables, None);
3721
3722        // VLP: determine output mode based on bound variables
3723        let output_mode = if step_variable.is_some() {
3724            crate::query::df_graph::nfa::VlpOutputMode::StepVariable
3725        } else if path_variable.is_some() {
3726            crate::query::df_graph::nfa::VlpOutputMode::FullPath
3727        } else {
3728            crate::query::df_graph::nfa::VlpOutputMode::EndpointsOnly
3729        };
3730
3731        let traverse_plan = Arc::new(GraphVariableLengthTraverseMainExec::new(
3732            input_plan,
3733            source_col,
3734            type_names.to_vec(),
3735            adj_direction,
3736            min_hops,
3737            max_hops,
3738            target_variable.to_string(),
3739            step_variable.map(|s| s.to_string()),
3740            path_variable.map(|s| s.to_string()),
3741            target_properties,
3742            self.graph_ctx.clone(),
3743            optional,
3744            bound_target_column,
3745            edge_lance_filter,
3746            edge_property_conditions,
3747            used_edge_columns,
3748            path_mode.clone(),
3749            output_mode,
3750        ));
3751
3752        Ok(traverse_plan)
3753    }
3754
3755    /// Plan a shortest path computation.
3756    #[expect(clippy::too_many_arguments)]
3757    fn plan_shortest_path(
3758        &self,
3759        input: &LogicalPlan,
3760        edge_type_ids: &[u32],
3761        direction: AstDirection,
3762        source_variable: &str,
3763        target_variable: &str,
3764        path_variable: &str,
3765        all_shortest: bool,
3766        all_properties: &HashMap<String, HashSet<String>>,
3767    ) -> Result<Arc<dyn ExecutionPlan>> {
3768        let input_plan = self.plan_internal(input, all_properties)?;
3769
3770        let adj_direction = convert_direction(direction);
3771        let source_col = format!("{}._vid", source_variable);
3772        let target_col = format!("{}._vid", target_variable);
3773
3774        Ok(Arc::new(GraphShortestPathExec::new(
3775            input_plan,
3776            source_col,
3777            target_col,
3778            edge_type_ids.to_vec(),
3779            adj_direction,
3780            path_variable.to_string(),
3781            self.graph_ctx.clone(),
3782            all_shortest,
3783        )))
3784    }
3785
3786    /// Plan a filter operation.
3787    ///
3788    /// When `optional_variables` is non-empty, applies OPTIONAL MATCH WHERE semantics:
3789    /// rows where all optional variables are NULL are preserved regardless of the predicate.
3790    fn plan_filter(
3791        &self,
3792        input: &LogicalPlan,
3793        predicate: &Expr,
3794        optional_variables: &HashSet<String>,
3795        all_properties: &HashMap<String, HashSet<String>>,
3796    ) -> Result<Arc<dyn ExecutionPlan>> {
3797        // Optimization (issue #53): when input is a CrossJoin and the predicate
3798        // contains equi-join conditions across the two sides, emit HashJoinExec
3799        // instead of FilterExec(CrossJoinExec). Issue #54 extends this to
3800        // OPTIONAL MATCH (LeftOuter/RightOuter HashJoin) when the predicate is
3801        // a pure equi-join — see try_plan_cross_join_as_hash_join for the
3802        // safety conditions.
3803        if let LogicalPlan::CrossJoin { left, right } = input
3804            && let Some(plan) = self.try_plan_cross_join_as_hash_join(
3805                left,
3806                right,
3807                predicate,
3808                optional_variables,
3809                all_properties,
3810            )?
3811        {
3812            return Ok(plan);
3813        }
3814
3815        let input_plan = self.plan_internal(input, all_properties)?;
3816        let schema = input_plan.schema();
3817
3818        // Use CypherPhysicalExprCompiler for all filters (handles both schema-typed
3819        // and schemaless LargeBinary/CypherValue columns without coercion failures).
3820        let ctx = self.translation_context_for_plan(input);
3821        let session = self.session_ctx.read();
3822        let state = session.state();
3823        let compiler = crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler::new(
3824            &state,
3825            Some(&ctx),
3826        )
3827        .with_subquery_ctx(
3828            self.graph_ctx.clone(),
3829            self.schema.clone(),
3830            self.session_ctx.clone(),
3831            self.storage.clone(),
3832            self.params.clone(),
3833            self.outer_entity_vars.clone(),
3834        );
3835        let physical_predicate = compiler.compile(predicate, &schema)?;
3836
3837        // For OPTIONAL MATCH: use OptionalFilterExec for proper NULL row preservation.
3838        if !optional_variables.is_empty() {
3839            return Ok(Arc::new(OptionalFilterExec::new(
3840                input_plan,
3841                physical_predicate,
3842                optional_variables.clone(),
3843            )));
3844        }
3845
3846        Ok(Arc::new(FilterExec::try_new(
3847            physical_predicate,
3848            input_plan,
3849        )?))
3850    }
3851
3852    /// Issue #53 optimization: try to convert Filter(CrossJoin(L, R), pred) into
3853    /// HashJoinExec when `pred` contains an equi-join condition across the two
3854    /// sides. Returns `Ok(None)` (fall through to FilterExec) when the pattern
3855    /// doesn't apply or when join key types can't be unified.
3856    ///
3857    /// Left/right-only conjuncts are pushed into a wrapper `Filter` over each
3858    /// subtree before planning, so nested CrossJoins re-trigger the same
3859    /// optimization recursively via `plan_internal`.
3860    fn try_plan_cross_join_as_hash_join(
3861        &self,
3862        left: &LogicalPlan,
3863        right: &LogicalPlan,
3864        predicate: &Expr,
3865        optional_variables: &HashSet<String>,
3866        all_properties: &HashMap<String, HashSet<String>>,
3867    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
3868        use datafusion::common::NullEquality;
3869        use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode};
3870
3871        let left_vars = collect_plan_variables(left);
3872        let right_vars = collect_plan_variables(right);
3873        let cls = classify_join_predicate(predicate, &left_vars, &right_vars);
3874
3875        if cls.equi_pairs.is_empty() {
3876            return Ok(None);
3877        }
3878
3879        // Determine join type from optional_variables.
3880        //
3881        // OPTIONAL MATCH semantics (per OptionalFilterExec) require that for
3882        // each "source group" (rows of the required side), if all rows fail
3883        // the predicate we still emit one row with the optional side NULLed.
3884        // A LeftOuter HashJoin gives the same behavior **only when** the
3885        // predicate is a pure equi-join across the required and optional
3886        // sides — any non-equi conjunct (left_only, right_only, residual) on
3887        // either side could drop a row that OPTIONAL semantics would have
3888        // NULL-preserved. So for the OPTIONAL path we accept only pure
3889        // equi-joins; everything else falls back to OptionalFilterExec.
3890        let left_optional: HashSet<&String> = optional_variables
3891            .iter()
3892            .filter(|v| left_vars.contains(*v))
3893            .collect();
3894        let right_optional: HashSet<&String> = optional_variables
3895            .iter()
3896            .filter(|v| right_vars.contains(*v))
3897            .collect();
3898
3899        let join_type = match (left_optional.is_empty(), right_optional.is_empty()) {
3900            (true, true) => JoinType::Inner,
3901            (true, false) => JoinType::Left,
3902            (false, true) => JoinType::Right,
3903            (false, false) => return Ok(None), // optional vars on both sides — bail
3904        };
3905
3906        // For outer joins: only safe when the predicate is purely equi-joins
3907        // (no left_only/right_only/residual conjuncts).
3908        if !matches!(join_type, JoinType::Inner)
3909            && (!cls.left_only.is_empty() || !cls.right_only.is_empty() || cls.residual.is_some())
3910        {
3911            return Ok(None);
3912        }
3913
3914        // UNWIND IN-list scan pushdown (issue #54 part 3) is now handled
3915        // by the standalone `merge_unwind_in_filters` pre-pass at
3916        // `HybridPhysicalPlanner::plan`. That pass walks the LogicalPlan
3917        // tree BEFORE any physical-plan optimization can bail (e.g.,
3918        // `unify_join_key_types` failing on Utf8 ↔ LargeBinary), so the
3919        // scan-side filters always survive — regardless of whether this
3920        // function emits HashJoinExec or falls back to FilterExec(CrossJoin).
3921        //
3922        // Left-only / right-only conjuncts (from `classify_join_predicate`)
3923        // remain handled here because they're predicate-decomposition
3924        // concerns specific to HashJoin emission, not UNWIND-IN-list
3925        // pushdown. They flow into wrap_with_filter below.
3926        tracing::debug!(
3927            target: "uni_query::cross_join_in_pushdown",
3928            equi_pairs = cls.equi_pairs.len(),
3929            left_only = cls.left_only.len(),
3930            right_only = cls.right_only.len(),
3931            has_residual = cls.residual.is_some(),
3932            "try_plan_cross_join_as_hash_join: classified predicate"
3933        );
3934
3935        let left_filters: Vec<Expr> = cls.left_only.clone();
3936        let right_filters: Vec<Expr> = cls.right_only.clone();
3937        let left_with_filter = wrap_with_filter(left.clone(), &left_filters);
3938        let right_with_filter = wrap_with_filter(right.clone(), &right_filters);
3939        let left_plan = self.plan_internal(&left_with_filter, all_properties)?;
3940        let right_plan = self.plan_internal(&right_with_filter, all_properties)?;
3941
3942        // Compile each (l_expr, r_expr) pair, wrapping both sides in tointeger
3943        // for type unification (handles UInt64 _vid vs LargeBinary CV property).
3944        // If any pair can't be unified, fall through to FilterExec.
3945        let left_schema = left_plan.schema();
3946        let right_schema = right_plan.schema();
3947        let left_ctx = self.translation_context_for_plan(&left_with_filter);
3948        let right_ctx = self.translation_context_for_plan(&right_with_filter);
3949
3950        // Build join keys: compile each side's expression and wrap in tointeger
3951        // for type unification (handles UInt64 _vid vs LargeBinary CV property).
3952        // Drop the session lock between this scope and HashJoinExec construction.
3953        let on: Vec<(
3954            Arc<dyn datafusion::physical_plan::PhysicalExpr>,
3955            Arc<dyn datafusion::physical_plan::PhysicalExpr>,
3956        )> = {
3957            let session = self.session_ctx.read();
3958            let state = session.state();
3959
3960            let left_compiler =
3961                crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler::new(
3962                    &state,
3963                    Some(&left_ctx),
3964                )
3965                .with_subquery_ctx(
3966                    self.graph_ctx.clone(),
3967                    self.schema.clone(),
3968                    self.session_ctx.clone(),
3969                    self.storage.clone(),
3970                    self.params.clone(),
3971                    self.outer_entity_vars.clone(),
3972                );
3973            let right_compiler =
3974                crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler::new(
3975                    &state,
3976                    Some(&right_ctx),
3977                )
3978                .with_subquery_ctx(
3979                    self.graph_ctx.clone(),
3980                    self.schema.clone(),
3981                    self.session_ctx.clone(),
3982                    self.storage.clone(),
3983                    self.params.clone(),
3984                    self.outer_entity_vars.clone(),
3985                );
3986
3987            let mut pairs: Vec<(
3988                Arc<dyn datafusion::physical_plan::PhysicalExpr>,
3989                Arc<dyn datafusion::physical_plan::PhysicalExpr>,
3990            )> = Vec::with_capacity(cls.equi_pairs.len());
3991
3992            for (l_expr, r_expr) in &cls.equi_pairs {
3993                let l_phys = left_compiler.compile(l_expr, &left_schema)?;
3994                let r_phys = right_compiler.compile(r_expr, &right_schema)?;
3995                let Some((l_key, r_key)) =
3996                    unify_join_key_types(l_phys, r_phys, &left_schema, &right_schema, &state)
3997                else {
3998                    return Ok(None);
3999                };
4000                pairs.push((l_key, r_key));
4001            }
4002            pairs
4003        };
4004
4005        // Issue #55 PR #5+#6: cross-MATCH dynamic VID-filter pushdown.
4006        // When the equi-pairs include exactly one anchor pair on the
4007        // probe-side `_vid`, and the probe-side planned subtree is a
4008        // fresh `GraphScanExec`, replace `HashJoinExec{build, full_scan}`
4009        // with `VidLookupJoinExec`. Supports INNER and LEFT outer; falls
4010        // through to HashJoinExec for RIGHT outer, non-Scan probes, or
4011        // computed (non-Column) join keys.
4012        if matches!(join_type, JoinType::Inner | JoinType::Left)
4013            && cls.residual.is_none()
4014            && let Some(plan) = self.try_emit_vid_lookup_join(
4015                &cls.equi_pairs,
4016                join_type,
4017                &left_plan,
4018                &right_plan,
4019                &left_with_filter,
4020                &right_with_filter,
4021            )?
4022        {
4023            return Ok(Some(plan));
4024        }
4025
4026        let join: Arc<dyn ExecutionPlan> = Arc::new(HashJoinExec::try_new(
4027            left_plan,
4028            right_plan,
4029            on,
4030            None,
4031            &join_type,
4032            None,
4033            PartitionMode::CollectLeft,
4034            NullEquality::NullEqualsNothing,
4035            false,
4036        )?);
4037
4038        // Apply mixed-non-equi residual (predicates referencing both sides
4039        // that aren't equi-joins) as a post-join FilterExec.
4040        if let Some(residual) = cls.residual {
4041            let join_schema = join.schema();
4042            let crossjoin_for_ctx = LogicalPlan::CrossJoin {
4043                left: Box::new(left_with_filter.clone()),
4044                right: Box::new(right_with_filter.clone()),
4045            };
4046            let merged_ctx = self.translation_context_for_plan(&crossjoin_for_ctx);
4047            let session = self.session_ctx.read();
4048            let state = session.state();
4049            let compiler = crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler::new(
4050                &state,
4051                Some(&merged_ctx),
4052            )
4053            .with_subquery_ctx(
4054                self.graph_ctx.clone(),
4055                self.schema.clone(),
4056                self.session_ctx.clone(),
4057                self.storage.clone(),
4058                self.params.clone(),
4059                self.outer_entity_vars.clone(),
4060            );
4061            let physical_residual = compiler.compile(&residual, &join_schema)?;
4062            return Ok(Some(Arc::new(FilterExec::try_new(
4063                physical_residual,
4064                join,
4065            )?)));
4066        }
4067
4068        Ok(Some(join))
4069    }
4070
4071    /// Issue #55 PR #5+#6: detect the cross-MATCH dynamic VID-filter pushdown
4072    /// pattern and emit `VidLookupJoinExec` instead of `HashJoinExec`.
4073    /// Returns `Ok(None)` for any pattern that doesn't match — the caller
4074    /// falls through to the standard HashJoin emission.
4075    ///
4076    /// Pattern recognised:
4077    ///   * One equi-pair (the *anchor*) has the probe side equal to
4078    ///     `Property(Variable(scan_var), "_vid")`. Its values drive the
4079    ///     IN-list pushdown.
4080    ///   * Other equi-pairs (if any) compile to `Column` references on
4081    ///     both sides; they're applied in-memory as post-match filters.
4082    ///   * The probe-side planned subtree is a top-level `GraphScanExec`.
4083    ///   * The anchor build column is UInt64 (a VID).
4084    ///   * Join is INNER or LEFT outer (RIGHT outer rejected — we can't
4085    ///     produce probe rows that don't match any build VID).
4086    fn try_emit_vid_lookup_join(
4087        &self,
4088        equi_pairs: &[(Expr, Expr)],
4089        join_type: JoinType,
4090        left_plan: &Arc<dyn ExecutionPlan>,
4091        right_plan: &Arc<dyn ExecutionPlan>,
4092        left_logical: &LogicalPlan,
4093        right_logical: &LogicalPlan,
4094    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
4095        use crate::query::df_graph::scan::GraphScanExec;
4096        use crate::query::df_graph::vid_lookup_join::{
4097            EquiPair, ProbeSide, VidJoinKind, VidLookupJoinExec,
4098        };
4099        use datafusion::physical_expr::expressions::Column;
4100
4101        if equi_pairs.is_empty() {
4102            return Ok(None);
4103        }
4104
4105        // 1. Find the anchor pair: the one where the probe side is
4106        // `Property(Variable(_), "_vid")`. The classifier's invariant is
4107        // that `l_expr` references LEFT subtree variables and `r_expr`
4108        // references RIGHT subtree variables, so detecting `_vid` on
4109        // `l_expr` means the probe is on the left.
4110        let mut anchor_idx: Option<(usize, ProbeSide)> = None;
4111        for (i, (l_expr, r_expr)) in equi_pairs.iter().enumerate() {
4112            if expr_is_vid_property(l_expr) {
4113                anchor_idx = Some((i, ProbeSide::Left));
4114                break;
4115            }
4116            if expr_is_vid_property(r_expr) {
4117                anchor_idx = Some((i, ProbeSide::Right));
4118                break;
4119            }
4120        }
4121        let Some((anchor_pair_idx, probe_side)) = anchor_idx else {
4122            return Ok(None);
4123        };
4124
4125        let probe_plan = match probe_side {
4126            ProbeSide::Left => left_plan,
4127            ProbeSide::Right => right_plan,
4128        };
4129        let build_plan = match probe_side {
4130            ProbeSide::Left => right_plan,
4131            ProbeSide::Right => left_plan,
4132        };
4133        let build_logical = match probe_side {
4134            ProbeSide::Left => right_logical,
4135            ProbeSide::Right => left_logical,
4136        };
4137
4138        // 2. Probe-side plan must be a top-level GraphScanExec.
4139        //
4140        // We deliberately do NOT peek through an SSI `ReadSetRecordingExec`
4141        // here. That wrapper is only inserted for read-write transactions with
4142        // an active read-set, and `VidLookupJoinExec` drives the probe scan via
4143        // `execute_with_vid_filter`, bypassing the wrapper — which would silently
4144        // skip read-set capture for the probe rows. Letting the wrapper mask the
4145        // scan makes this rewrite bail to `HashJoinExec`, which executes the
4146        // wrapper normally and records the reads. Non-SSI / read-only contexts
4147        // have no wrapper, so the optimization still fires there.
4148        if probe_plan
4149            .as_any()
4150            .downcast_ref::<GraphScanExec>()
4151            .is_none()
4152        {
4153            return Ok(None);
4154        }
4155
4156        // 3. Compile every equi-pair's expressions against their respective
4157        // schemas, requiring each side to resolve to a Column. The anchor
4158        // pair additionally requires the build side to be UInt64.
4159        let left_schema = left_plan.schema();
4160        let right_schema = right_plan.schema();
4161        let left_ctx = self.translation_context_for_plan(left_logical);
4162        let right_ctx = self.translation_context_for_plan(right_logical);
4163        let _ = build_logical; // contexts already covered by left/right_ctx
4164
4165        let session = self.session_ctx.read();
4166        let state = session.state();
4167        let left_compiler = crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler::new(
4168            &state,
4169            Some(&left_ctx),
4170        )
4171        .with_subquery_ctx(
4172            self.graph_ctx.clone(),
4173            self.schema.clone(),
4174            self.session_ctx.clone(),
4175            self.storage.clone(),
4176            self.params.clone(),
4177            self.outer_entity_vars.clone(),
4178        );
4179        let right_compiler =
4180            crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler::new(
4181                &state,
4182                Some(&right_ctx),
4183            )
4184            .with_subquery_ctx(
4185                self.graph_ctx.clone(),
4186                self.schema.clone(),
4187                self.session_ctx.clone(),
4188                self.storage.clone(),
4189                self.params.clone(),
4190                self.outer_entity_vars.clone(),
4191            );
4192
4193        let mut compiled: Vec<EquiPair> = Vec::with_capacity(equi_pairs.len());
4194        for (l_expr, r_expr) in equi_pairs {
4195            let l_phys = left_compiler.compile(l_expr, &left_schema)?;
4196            let r_phys = right_compiler.compile(r_expr, &right_schema)?;
4197            let (Some(l_col), Some(r_col)) = (
4198                l_phys.as_any().downcast_ref::<Column>(),
4199                r_phys.as_any().downcast_ref::<Column>(),
4200            ) else {
4201                // Computed expression on either side → bail to HashJoinExec.
4202                return Ok(None);
4203            };
4204            compiled.push(EquiPair {
4205                left_col_idx: l_col.index(),
4206                right_col_idx: r_col.index(),
4207            });
4208        }
4209
4210        // 4. Anchor build column must be UInt64.
4211        let anchor = compiled[anchor_pair_idx];
4212        let anchor_build_idx = match probe_side {
4213            ProbeSide::Left => anchor.right_col_idx,
4214            ProbeSide::Right => anchor.left_col_idx,
4215        };
4216        let build_schema = build_plan.schema();
4217        if !matches!(
4218            build_schema.field(anchor_build_idx).data_type(),
4219            datafusion::arrow::datatypes::DataType::UInt64
4220        ) {
4221            return Ok(None);
4222        }
4223
4224        // 5. Reorder so the anchor pair is at index 0 (operator's invariant).
4225        if anchor_pair_idx != 0 {
4226            compiled.swap(0, anchor_pair_idx);
4227        }
4228
4229        // 6. Translate join_type. RIGHT outer is rejected — we can't
4230        // produce probe rows that don't match any build VID, since our
4231        // probe scan only fetches rows whose `_vid` is in the build set.
4232        let join_kind = match join_type {
4233            JoinType::Inner => VidJoinKind::Inner,
4234            JoinType::Left => VidJoinKind::Left,
4235            _ => return Ok(None),
4236        };
4237
4238        drop(session);
4239
4240        Ok(Some(Arc::new(VidLookupJoinExec::try_new(
4241            left_plan.clone(),
4242            right_plan.clone(),
4243            probe_side,
4244            compiled,
4245            join_kind,
4246        )?)))
4247    }
4248
4249    /// Plan a projection, passing alias map through to Sort nodes in the input chain.
4250    fn plan_project_with_aliases(
4251        &self,
4252        input: &LogicalPlan,
4253        projections: &[(Expr, Option<String>)],
4254        all_properties: &HashMap<String, HashSet<String>>,
4255        alias_map: &HashMap<String, Expr>,
4256    ) -> Result<Arc<dyn ExecutionPlan>> {
4257        // Route through plan_internal_with_aliases to propagate aliases to Sort
4258        let input_plan = self.plan_internal_with_aliases(input, all_properties, alias_map)?;
4259        self.plan_project_from_input(input_plan, projections, Some(input))
4260    }
4261
4262    /// Build projection expressions from an already-planned input.
4263    fn plan_project_from_input(
4264        &self,
4265        input_plan: Arc<dyn ExecutionPlan>,
4266        projections: &[(Expr, Option<String>)],
4267        context_plan: Option<&LogicalPlan>,
4268    ) -> Result<Arc<dyn ExecutionPlan>> {
4269        let schema = input_plan.schema();
4270
4271        let session = self.session_ctx.read();
4272        let state = session.state();
4273
4274        // Build translation context with variable kinds if we have a logical plan
4275        let ctx = context_plan.map(|p| self.translation_context_for_plan(p));
4276
4277        let mut exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> = Vec::new();
4278
4279        for (expr, alias) in projections {
4280            // Handle whole-node/relationship projection: RETURN n
4281            // The scan layer materializes the variable as either:
4282            //   - A Struct column (registered labels via add_structural_projection)
4283            //   - A LargeBinary/CypherValue column aliased as the variable (schemaless via add_alias_projection)
4284            // Project that column directly, plus _vid/_labels helpers for post-processing.
4285            if let Expr::Variable(var_name) = expr {
4286                if schema.column_with_name(var_name).is_some() {
4287                    let (col_idx, _) = schema.column_with_name(var_name).unwrap();
4288                    let col_expr: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
4289                        datafusion::physical_expr::expressions::Column::new(var_name, col_idx),
4290                    );
4291                    let name = alias.clone().unwrap_or_else(|| var_name.clone());
4292                    exprs.push((col_expr, name));
4293
4294                    // Include _vid and _labels as helper columns for post-processing
4295                    let vid_col = format!("{}._vid", var_name);
4296                    let labels_col = format!("{}._labels", var_name);
4297                    if let Some((vi, _)) = schema.column_with_name(&vid_col) {
4298                        let ve: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
4299                            datafusion::physical_expr::expressions::Column::new(&vid_col, vi),
4300                        );
4301                        exprs.push((ve, vid_col.clone()));
4302                    }
4303                    if let Some((li, _)) = schema.column_with_name(&labels_col) {
4304                        let le: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
4305                            datafusion::physical_expr::expressions::Column::new(&labels_col, li),
4306                        );
4307                        exprs.push((le, labels_col.clone()));
4308                    }
4309
4310                    // Carry through all {var}.{prop} columns so downstream
4311                    // operators (e.g. RETURN n.name after WITH n) can find them.
4312                    let prefix = format!("{}.", var_name);
4313                    for (idx, field) in schema.fields().iter().enumerate() {
4314                        let fname = field.name();
4315                        if fname.starts_with(&prefix)
4316                            && fname != &vid_col
4317                            && fname != &labels_col
4318                            && !exprs.iter().any(|(_, n)| n == fname)
4319                        {
4320                            let prop_expr: Arc<dyn datafusion::physical_expr::PhysicalExpr> =
4321                                Arc::new(datafusion::physical_expr::expressions::Column::new(
4322                                    fname, idx,
4323                                ));
4324                            exprs.push((prop_expr, fname.clone()));
4325                        }
4326                    }
4327                    continue;
4328                }
4329
4330                // No materialized column — build a struct from expanded dot-columns
4331                // This handles traversal targets that have b._vid, b.name, etc. but no b column
4332                let prefix = format!("{}.", var_name);
4333                let expanded_fields: Vec<(usize, String)> = schema
4334                    .fields()
4335                    .iter()
4336                    .enumerate()
4337                    .filter(|(_, f)| f.name().starts_with(&prefix))
4338                    .map(|(i, f)| (i, f.name().clone()))
4339                    .collect();
4340
4341                if !expanded_fields.is_empty() {
4342                    use datafusion::functions::expr_fn::named_struct;
4343                    use datafusion::logical_expr::lit;
4344
4345                    // Build named_struct args: pairs of (field_name_literal, column_ref)
4346                    let mut struct_args = Vec::new();
4347                    for (_, field_name) in &expanded_fields {
4348                        let prop_name = &field_name[prefix.len()..];
4349                        struct_args.push(lit(prop_name.to_string()));
4350                        // Use Column::from_name to avoid dot-parsing (b._vid != table b, col _vid)
4351                        struct_args.push(DfExpr::Column(datafusion::common::Column::from_name(
4352                            field_name.as_str(),
4353                        )));
4354                    }
4355
4356                    let struct_expr = named_struct(struct_args);
4357                    let df_schema =
4358                        datafusion::common::DFSchema::try_from(schema.as_ref().clone())?;
4359                    let session = self.session_ctx.read();
4360                    let state_ref = session.state();
4361                    let resolved_expr = Self::resolve_udfs(&struct_expr, &state_ref)?;
4362
4363                    use datafusion::physical_planner::PhysicalPlanner;
4364                    let phys_planner =
4365                        datafusion::physical_planner::DefaultPhysicalPlanner::default();
4366                    let physical_struct_expr = phys_planner.create_physical_expr(
4367                        &resolved_expr,
4368                        &df_schema,
4369                        &state_ref,
4370                    )?;
4371
4372                    let name = alias.clone().unwrap_or_else(|| var_name.clone());
4373                    exprs.push((physical_struct_expr, name));
4374
4375                    // Also include _vid and _labels helpers
4376                    let vid_col = format!("{}._vid", var_name);
4377                    let labels_col = format!("{}._labels", var_name);
4378                    if let Some((vi, _)) = schema.column_with_name(&vid_col) {
4379                        let ve: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
4380                            datafusion::physical_expr::expressions::Column::new(&vid_col, vi),
4381                        );
4382                        exprs.push((ve, vid_col.clone()));
4383                    }
4384                    if let Some((li, _)) = schema.column_with_name(&labels_col) {
4385                        let le: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
4386                            datafusion::physical_expr::expressions::Column::new(&labels_col, li),
4387                        );
4388                        exprs.push((le, labels_col.clone()));
4389                    }
4390
4391                    // Carry through remaining {var}.{prop} columns not already
4392                    // included by the struct projection above.
4393                    for (idx, field) in schema.fields().iter().enumerate() {
4394                        let fname = field.name();
4395                        if fname.starts_with(&prefix)
4396                            && fname != &vid_col
4397                            && fname != &labels_col
4398                            && !exprs.iter().any(|(_, n)| n == fname)
4399                        {
4400                            let prop_expr: Arc<dyn datafusion::physical_expr::PhysicalExpr> =
4401                                Arc::new(datafusion::physical_expr::expressions::Column::new(
4402                                    fname, idx,
4403                                ));
4404                            exprs.push((prop_expr, fname.clone()));
4405                        }
4406                    }
4407                    continue;
4408                }
4409                // Fall through to normal expression compilation if no matching columns at all
4410            }
4411
4412            // Handle RETURN * (wildcard) — expand to all input columns
4413            if matches!(expr, Expr::Wildcard) {
4414                for (col_idx, field) in schema.fields().iter().enumerate() {
4415                    let col_expr: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
4416                        datafusion::physical_expr::expressions::Column::new(field.name(), col_idx),
4417                    );
4418                    exprs.push((col_expr, field.name().clone()));
4419                }
4420                continue;
4421            }
4422
4423            let compiler = crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler::new(
4424                &state,
4425                ctx.as_ref(),
4426            )
4427            .with_subquery_ctx(
4428                self.graph_ctx.clone(),
4429                self.schema.clone(),
4430                self.session_ctx.clone(),
4431                self.storage.clone(),
4432                self.params.clone(),
4433                self.outer_entity_vars.clone(),
4434            );
4435            let physical_expr = compiler.compile(expr, &schema)?;
4436
4437            let name = alias.clone().unwrap_or_else(|| expr.to_string_repr());
4438            exprs.push((physical_expr, name));
4439        }
4440
4441        Ok(Arc::new(ProjectionExec::try_new(exprs, input_plan)?))
4442    }
4443
4444    /// Plan a compact Locy YIELD projection — emits ONLY the listed expressions,
4445    /// without carrying through helper/property columns.
4446    ///
4447    /// Node variables are projected as their `._vid` column (UInt64).
4448    /// Other expressions are compiled normally, then CAST to target type if needed.
4449    fn plan_locy_project(
4450        &self,
4451        input: &LogicalPlan,
4452        projections: &[(Expr, Option<String>)],
4453        target_types: &[DataType],
4454        all_properties: &HashMap<String, HashSet<String>>,
4455    ) -> Result<Arc<dyn ExecutionPlan>> {
4456        use datafusion::physical_expr::expressions::Column;
4457
4458        let input_plan = self.plan_internal(input, all_properties)?;
4459        let schema = input_plan.schema();
4460
4461        let session = self.session_ctx.read();
4462        let state = session.state();
4463
4464        let ctx = self.translation_context_for_plan(input);
4465
4466        let mut exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> = Vec::new();
4467
4468        for (i, (expr, alias)) in projections.iter().enumerate() {
4469            let target_type = target_types.get(i);
4470
4471            // Handle node/relationship variables: extract ._vid column
4472            if let Expr::Variable(var_name) = expr {
4473                // Check if this is a graph-expanded node variable ({var}._vid exists)
4474                let vid_col_name = format!("{}._vid", var_name);
4475                let vid_col_match = schema
4476                    .fields()
4477                    .iter()
4478                    .enumerate()
4479                    .find(|(_, f)| f.name() == &vid_col_name);
4480
4481                if let Some((vid_idx, _)) = vid_col_match {
4482                    // Node variable → extract VID (UInt64)
4483                    let col_expr: Arc<dyn datafusion::physical_expr::PhysicalExpr> =
4484                        Arc::new(Column::new(&vid_col_name, vid_idx));
4485                    let name = alias.clone().unwrap_or_else(|| var_name.clone());
4486                    exprs.push((col_expr, name));
4487                    continue;
4488                }
4489
4490                // Direct column (e.g. from derived scan)
4491                if let Some((col_idx, _)) = schema.column_with_name(var_name) {
4492                    let col_expr: Arc<dyn datafusion::physical_expr::PhysicalExpr> =
4493                        Arc::new(Column::new(var_name, col_idx));
4494                    let name = alias.clone().unwrap_or_else(|| var_name.clone());
4495                    exprs.push((col_expr, name));
4496                    continue;
4497                }
4498                // Fall through to generic expression compilation
4499            }
4500
4501            // Generic expression compilation (property access, literals, etc.)
4502            let compiler = crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler::new(
4503                &state,
4504                Some(&ctx),
4505            )
4506            .with_subquery_ctx(
4507                self.graph_ctx.clone(),
4508                self.schema.clone(),
4509                self.session_ctx.clone(),
4510                self.storage.clone(),
4511                self.params.clone(),
4512                self.outer_entity_vars.clone(),
4513            );
4514            let physical_expr = compiler.compile(expr, &schema)?;
4515
4516            // CAST if the compiled expression's output type doesn't match target.
4517            // Skip coercion when actual is a string type but target is numeric
4518            // (or vice versa) — this means `infer_expr_type` guessed wrong
4519            // (e.g. defaulting Property to Float64 for a string column).
4520            let physical_expr = if let Some(target_dt) = target_type {
4521                let actual_dt = physical_expr
4522                    .data_type(schema.as_ref())
4523                    .unwrap_or(DataType::LargeUtf8);
4524                let is_string = |dt: &DataType| matches!(dt, DataType::Utf8 | DataType::LargeUtf8);
4525                let is_numeric = |dt: &DataType| {
4526                    matches!(dt, DataType::Int64 | DataType::Float64 | DataType::UInt64)
4527                };
4528                let cross_domain = (is_string(&actual_dt) && is_numeric(target_dt))
4529                    || (is_numeric(&actual_dt) && is_string(target_dt));
4530                if actual_dt != *target_dt && !cross_domain {
4531                    coerce_physical_expr(physical_expr, &actual_dt, target_dt, schema.as_ref())
4532                } else {
4533                    physical_expr
4534                }
4535            } else {
4536                physical_expr
4537            };
4538
4539            let name = alias.clone().unwrap_or_else(|| expr.to_string_repr());
4540            exprs.push((physical_expr, name));
4541        }
4542
4543        Ok(Arc::new(ProjectionExec::try_new(exprs, input_plan)?))
4544    }
4545
4546    /// Plan an aggregation.
4547    fn plan_aggregate(
4548        &self,
4549        input: &LogicalPlan,
4550        group_by: &[Expr],
4551        aggregates: &[Expr],
4552        all_properties: &HashMap<String, HashSet<String>>,
4553    ) -> Result<Arc<dyn ExecutionPlan>> {
4554        let input_plan = self.plan_internal(input, all_properties)?;
4555        let schema = input_plan.schema();
4556
4557        let session = self.session_ctx.read();
4558        let state = session.state();
4559
4560        // Build translation context with variable kinds from the input plan
4561        let ctx = self.translation_context_for_plan(input);
4562
4563        // Translate group by expressions
4564        use crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler;
4565        let mut group_exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> =
4566            Vec::new();
4567        for expr in group_by {
4568            let name = expr.to_string_repr();
4569
4570            // Entity variables (Node/Edge) from traversals may not have a direct
4571            // column — only expanded property columns like "other._vid",
4572            // "other.name", etc. Skip them here; the property expansion loop
4573            // below adds those columns to the group-by instead.
4574            if let Expr::Variable(var_name) = expr
4575                && schema.column_with_name(var_name).is_none()
4576            {
4577                let prefix = format!("{}.", var_name);
4578                let has_expanded = schema
4579                    .fields()
4580                    .iter()
4581                    .any(|f| f.name().starts_with(&prefix));
4582                if has_expanded {
4583                    continue;
4584                }
4585            }
4586
4587            let physical_expr = if CypherPhysicalExprCompiler::contains_custom_expr(expr) {
4588                // Custom expressions (quantifiers, list comprehensions, reduce, etc.)
4589                // cannot be translated via cypher_expr_to_df; compile them directly.
4590                let compiler = CypherPhysicalExprCompiler::new(&state, Some(&ctx))
4591                    .with_subquery_ctx(
4592                        self.graph_ctx.clone(),
4593                        self.schema.clone(),
4594                        self.session_ctx.clone(),
4595                        self.storage.clone(),
4596                        self.params.clone(),
4597                        self.outer_entity_vars.clone(),
4598                    );
4599                compiler.compile(expr, &schema)?
4600            } else {
4601                // DateTime/Time struct grouping: group by UTC-normalized values
4602                // Two DateTimes with same UTC instant but different offsets should group together
4603                let df_schema_ref =
4604                    datafusion::common::DFSchema::try_from(schema.as_ref().clone())?;
4605                let df_expr = cypher_expr_to_df(expr, Some(&ctx))?;
4606                let df_expr = Self::resolve_udfs(&df_expr, &state)?;
4607                let df_expr = crate::query::df_expr::apply_type_coercion(&df_expr, &df_schema_ref)?;
4608                let mut df_expr = Self::resolve_udfs(&df_expr, &state)?;
4609                if let Ok(expr_type) = df_expr.get_type(&df_schema_ref) {
4610                    if uni_common::core::schema::is_datetime_struct(&expr_type) {
4611                        // Group by UTC instant (nanos_since_epoch)
4612                        df_expr = crate::query::df_expr::extract_datetime_nanos(df_expr);
4613                    } else if uni_common::core::schema::is_time_struct(&expr_type) {
4614                        // Group by UTC-normalized time
4615                        // extract_time_nanos does: nanos_since_midnight - (offset_seconds * 1e9)
4616                        df_expr = crate::query::df_expr::extract_time_nanos(df_expr);
4617                    }
4618                }
4619
4620                // Convert logical expression to physical
4621                create_physical_expr(&df_expr, &df_schema_ref, state.execution_props())?
4622            };
4623            group_exprs.push((physical_expr, name));
4624        }
4625
4626        // For entity variables (Node/Edge) in group_by, also include their
4627        // property columns. Properties are functionally dependent on the entity,
4628        // so grouping by them is semantically correct and ensures they survive
4629        // the aggregation for downstream property access (e.g. RETURN a.name
4630        // after WITH a, min(...) AS m).
4631        for expr in group_by {
4632            if let Expr::Variable(var_name) = expr
4633                && matches!(
4634                    ctx.variable_kinds.get(var_name),
4635                    Some(VariableKind::Node) | Some(VariableKind::Edge)
4636                )
4637            {
4638                let prefix = format!("{}.", var_name);
4639                for (idx, field) in schema.fields().iter().enumerate() {
4640                    if field.name().starts_with(&prefix) {
4641                        let prop_col: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
4642                            datafusion::physical_expr::expressions::Column::new(field.name(), idx),
4643                        );
4644                        group_exprs.push((prop_col, field.name().clone()));
4645                    }
4646                }
4647            }
4648        }
4649
4650        let physical_group_by = PhysicalGroupBy::new_single(group_exprs);
4651
4652        // Pre-compute pattern comprehensions in aggregate arguments
4653        let (input_plan, schema, rewritten_aggregates) =
4654            self.precompute_custom_aggregate_args(input_plan, &schema, aggregates, &state, &ctx)?;
4655
4656        // Translate aggregates and their associated filter expressions
4657        // (e.g. collect() uses a filter to exclude null values per Cypher spec)
4658        let (aggr_exprs, filter_exprs): (Vec<_>, Vec<_>) = self
4659            .translate_aggregates(&rewritten_aggregates, &schema, &state, &ctx)?
4660            .into_iter()
4661            .unzip();
4662        let num_aggregates = aggr_exprs.len();
4663
4664        let agg_exec = Arc::new(AggregateExec::try_new(
4665            AggregateMode::Single,
4666            physical_group_by,
4667            aggr_exprs,
4668            filter_exprs,
4669            input_plan,
4670            schema,
4671        )?);
4672
4673        // DataFusion's AggregateExec auto-generates column names from physical
4674        // expressions (e.g. `count(Int32(1))`), but the logical plan's projection
4675        // expects names like `COUNT(n)`. Add a renaming projection to bridge this.
4676        let agg_schema = agg_exec.schema();
4677        // Use actual expanded group-by count (includes entity property columns)
4678        // rather than logical group_by.len() which doesn't account for expansion.
4679        let num_group_by = agg_schema.fields().len() - num_aggregates;
4680        let mut proj_exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> =
4681            Vec::new();
4682
4683        for (i, field) in agg_schema.fields().iter().enumerate() {
4684            let col_expr: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
4685                datafusion::physical_expr::expressions::Column::new(field.name(), i),
4686            );
4687            let name = if i >= num_group_by {
4688                // Rename aggregate column to expected Cypher name
4689                aggregate_column_name(&aggregates[i - num_group_by])
4690            } else {
4691                field.name().clone()
4692            };
4693            proj_exprs.push((col_expr, name));
4694        }
4695
4696        Ok(Arc::new(ProjectionExec::try_new(proj_exprs, agg_exec)?))
4697    }
4698
4699    /// Wrap a temporal aggregate argument with `get_field(arg, "nanos_since_epoch")` or
4700    /// `get_field(arg, "nanos_since_midnight")` when the argument is a DateTime/Time struct.
4701    ///
4702    /// Returns the argument unchanged for non-temporal types.
4703    fn wrap_temporal_sort_key(
4704        arg: datafusion::logical_expr::Expr,
4705        schema: &SchemaRef,
4706    ) -> Result<datafusion::logical_expr::Expr> {
4707        use datafusion::logical_expr::ScalarUDF;
4708        if let Ok(arg_type) = arg.get_type(&datafusion::common::DFSchema::try_from(
4709            schema.as_ref().clone(),
4710        )?) {
4711            if uni_common::core::schema::is_datetime_struct(&arg_type) {
4712                return Ok(datafusion::logical_expr::Expr::ScalarFunction(
4713                    datafusion::logical_expr::expr::ScalarFunction::new_udf(
4714                        Arc::new(ScalarUDF::from(
4715                            datafusion::functions::core::getfield::GetFieldFunc::new(),
4716                        )),
4717                        vec![arg, datafusion::logical_expr::lit("nanos_since_epoch")],
4718                    ),
4719                ));
4720            } else if uni_common::core::schema::is_time_struct(&arg_type) {
4721                return Ok(datafusion::logical_expr::Expr::ScalarFunction(
4722                    datafusion::logical_expr::expr::ScalarFunction::new_udf(
4723                        Arc::new(ScalarUDF::from(
4724                            datafusion::functions::core::getfield::GetFieldFunc::new(),
4725                        )),
4726                        vec![arg, datafusion::logical_expr::lit("nanos_since_midnight")],
4727                    ),
4728                ));
4729            }
4730        }
4731        Ok(arg)
4732    }
4733
4734    /// Translate Cypher aggregate expressions to DataFusion.
4735    fn translate_aggregates(
4736        &self,
4737        aggregates: &[Expr],
4738        schema: &SchemaRef,
4739        state: &SessionState,
4740        ctx: &TranslationContext,
4741    ) -> Result<Vec<PhysicalAggregate>> {
4742        use datafusion::functions_aggregate::expr_fn::{avg, count, max, min, sum};
4743
4744        let mut result: Vec<PhysicalAggregate> = Vec::new();
4745
4746        for agg_expr in aggregates {
4747            let Expr::FunctionCall {
4748                name,
4749                args,
4750                distinct,
4751                ..
4752            } = agg_expr
4753            else {
4754                return Err(anyhow!("Expected aggregate function, got: {:?}", agg_expr));
4755            };
4756
4757            let name_lower = name.to_lowercase();
4758
4759            // Helper to get required first argument
4760            let get_arg = || -> Result<DfExpr> {
4761                if args.is_empty() {
4762                    return Err(anyhow!("{}() requires an argument", name_lower));
4763                }
4764                cypher_expr_to_df(&args[0], Some(ctx))
4765            };
4766
4767            let df_agg = match name_lower.as_str() {
4768                "count" if args.is_empty() => count(datafusion::logical_expr::lit(1)),
4769                "count" => {
4770                    // For count(*) or count(variable) where variable is a node/edge
4771                    // (not a property), translate to count(lit(1)) since the variable
4772                    // itself has no column in the scan schema.
4773                    // Exception: COUNT(DISTINCT variable) needs the actual column
4774                    // reference so that null rows (from OPTIONAL MATCH) are excluded.
4775                    if matches!(args.first(), Some(uni_cypher::ast::Expr::Wildcard)) {
4776                        count(datafusion::logical_expr::lit(1))
4777                    } else if matches!(args.first(), Some(uni_cypher::ast::Expr::Variable(_))) {
4778                        if *distinct {
4779                            count(get_arg()?)
4780                        } else {
4781                            count(datafusion::logical_expr::lit(1))
4782                        }
4783                    } else {
4784                        count(get_arg()?)
4785                    }
4786                }
4787                "sum" => {
4788                    let arg = get_arg()?;
4789                    if self.is_large_binary_col(&arg, schema) {
4790                        let udaf = Arc::new(crate::query::df_udfs::create_cypher_sum_udaf());
4791                        udaf.call(vec![arg])
4792                    } else {
4793                        // Widen small integers to Int64 (DataFusion doesn't support Int32 sum).
4794                        // Float columns pass through unchanged so SUM preserves float type.
4795                        use datafusion::logical_expr::Cast;
4796                        let is_float = if let DfExpr::Column(col) = &arg
4797                            && let Ok(field) = schema.field_with_name(&col.name)
4798                        {
4799                            matches!(
4800                                field.data_type(),
4801                                datafusion::arrow::datatypes::DataType::Float32
4802                                    | datafusion::arrow::datatypes::DataType::Float64
4803                            )
4804                        } else {
4805                            false
4806                        };
4807                        if is_float {
4808                            sum(DfExpr::Cast(Cast::new(
4809                                Box::new(arg),
4810                                datafusion::arrow::datatypes::DataType::Float64,
4811                            )))
4812                        } else {
4813                            sum(DfExpr::Cast(Cast::new(
4814                                Box::new(arg),
4815                                datafusion::arrow::datatypes::DataType::Int64,
4816                            )))
4817                        }
4818                    }
4819                }
4820                "avg" => {
4821                    let arg = get_arg()?;
4822                    if self.is_large_binary_col(&arg, schema) {
4823                        let coerced = crate::query::df_udfs::cypher_to_float64_expr(arg);
4824                        avg(coerced)
4825                    } else {
4826                        use datafusion::logical_expr::Cast;
4827                        avg(DfExpr::Cast(Cast::new(
4828                            Box::new(arg),
4829                            datafusion::arrow::datatypes::DataType::Float64,
4830                        )))
4831                    }
4832                }
4833                "min" => {
4834                    // Use Cypher-aware min for LargeBinary columns (mixed types)
4835                    let arg = Self::wrap_temporal_sort_key(get_arg()?, schema)?;
4836
4837                    if self.is_large_binary_col(&arg, schema) {
4838                        let udaf = Arc::new(crate::query::df_udfs::create_cypher_min_udaf());
4839                        udaf.call(vec![arg])
4840                    } else {
4841                        min(arg)
4842                    }
4843                }
4844                "max" => {
4845                    // Use Cypher-aware max for LargeBinary columns (mixed types)
4846                    let arg = Self::wrap_temporal_sort_key(get_arg()?, schema)?;
4847
4848                    if self.is_large_binary_col(&arg, schema) {
4849                        let udaf = Arc::new(crate::query::df_udfs::create_cypher_max_udaf());
4850                        udaf.call(vec![arg])
4851                    } else {
4852                        max(arg)
4853                    }
4854                }
4855                "percentiledisc" => {
4856                    if args.len() != 2 {
4857                        return Err(anyhow!("percentileDisc() requires exactly 2 arguments"));
4858                    }
4859                    let expr_arg = cypher_expr_to_df(&args[0], Some(ctx))?;
4860                    let pct_arg = cypher_expr_to_df(&args[1], Some(ctx))?;
4861                    let coerced = crate::query::df_udfs::cypher_to_float64_expr(expr_arg);
4862                    let udaf =
4863                        Arc::new(crate::query::df_udfs::create_cypher_percentile_disc_udaf());
4864                    udaf.call(vec![coerced, pct_arg])
4865                }
4866                "percentilecont" => {
4867                    if args.len() != 2 {
4868                        return Err(anyhow!("percentileCont() requires exactly 2 arguments"));
4869                    }
4870                    let expr_arg = cypher_expr_to_df(&args[0], Some(ctx))?;
4871                    let pct_arg = cypher_expr_to_df(&args[1], Some(ctx))?;
4872                    let coerced = crate::query::df_udfs::cypher_to_float64_expr(expr_arg);
4873                    let udaf =
4874                        Arc::new(crate::query::df_udfs::create_cypher_percentile_cont_udaf());
4875                    udaf.call(vec![coerced, pct_arg])
4876                }
4877                "collect" => {
4878                    // Use custom Cypher collect UDAF that filters nulls and returns
4879                    // empty list (not null) when all inputs are null.
4880                    let arg = get_arg()?;
4881                    crate::query::df_udfs::create_cypher_collect_expr(arg, *distinct)
4882                }
4883                "btic_min" => {
4884                    let arg = get_arg()?;
4885                    let udaf = Arc::new(crate::query::df_udfs::create_btic_min_udaf());
4886                    udaf.call(vec![arg])
4887                }
4888                "btic_max" => {
4889                    let arg = get_arg()?;
4890                    let udaf = Arc::new(crate::query::df_udfs::create_btic_max_udaf());
4891                    udaf.call(vec![arg])
4892                }
4893                "btic_span_agg" => {
4894                    let arg = get_arg()?;
4895                    let udaf = Arc::new(crate::query::df_udfs::create_btic_span_agg_udaf());
4896                    udaf.call(vec![arg])
4897                }
4898                "btic_count_at" => {
4899                    if args.len() != 2 {
4900                        return Err(anyhow!("btic_count_at() requires exactly 2 arguments"));
4901                    }
4902                    let btic_arg = cypher_expr_to_df(&args[0], Some(ctx))?;
4903                    let point_arg = cypher_expr_to_df(&args[1], Some(ctx))?;
4904                    let udaf = Arc::new(crate::query::df_udfs::create_btic_count_at_udaf());
4905                    udaf.call(vec![btic_arg, point_arg])
4906                }
4907                _ => {
4908                    // Fall through to plugin-registry lookup. User
4909                    // aggregates registered via
4910                    // `PluginRegistrar::aggregate_fn` (M9
4911                    // `uni.plugin.declareAggregate` is the primary
4912                    // user) dispatch through the
4913                    // `PluginAggregateUdaf` adapter.
4914                    if let Some((ns, local)) = name_lower.split_once('.')
4915                        && let Some(entry) = self
4916                            .plugin_registry
4917                            .aggregate(&uni_plugin::QName::new(ns, local))
4918                    {
4919                        let arg_exprs: Vec<DfExpr> = args
4920                            .iter()
4921                            .map(|a| cypher_expr_to_df(a, Some(ctx)))
4922                            .collect::<Result<Vec<_>>>()?;
4923                        let udaf = Arc::new(datafusion::logical_expr::AggregateUDF::from(
4924                            crate::query::df_udaf_plugin::PluginAggregateUdaf::new(
4925                                uni_plugin::QName::new(ns, local),
4926                                Arc::clone(&self.plugin_registry),
4927                                entry.signature.clone(),
4928                            ),
4929                        ));
4930                        udaf.call(arg_exprs)
4931                    } else {
4932                        return Err(anyhow!("Unsupported aggregate function: {}", name));
4933                    }
4934                }
4935            };
4936
4937            // Apply DISTINCT if needed (collect/percentile handle their own distinct)
4938            let df_agg = if *distinct
4939                && !matches!(
4940                    name_lower.as_str(),
4941                    "collect" | "percentiledisc" | "percentilecont"
4942                ) {
4943                use datafusion::prelude::ExprFunctionExt;
4944                df_agg.distinct().build().map_err(|e| anyhow!("{}", e))?
4945            } else {
4946                df_agg
4947            };
4948
4949            // Resolve UDFs and apply type coercion inside aggregate arguments
4950            let df_schema = datafusion::common::DFSchema::try_from(schema.as_ref().clone())?;
4951            let df_agg = Self::resolve_udfs(&df_agg, state)?;
4952            let df_agg = crate::query::df_expr::apply_type_coercion(&df_agg, &df_schema)?;
4953            let df_agg = Self::resolve_udfs(&df_agg, state)?;
4954
4955            // Convert to physical aggregate
4956            let agg_and_filter = self.create_physical_aggregate(&df_agg, schema, state)?;
4957            result.push(agg_and_filter);
4958        }
4959
4960        Ok(result)
4961    }
4962
4963    /// Pre-compute pattern comprehensions in aggregate arguments.
4964    ///
4965    /// Scans aggregate expressions for pattern comprehensions, compiles them as
4966    /// physical expressions, adds them as projected columns, and rewrites the
4967    /// aggregate expressions to reference the pre-computed columns.
4968    fn precompute_custom_aggregate_args(
4969        &self,
4970        input_plan: Arc<dyn ExecutionPlan>,
4971        schema: &SchemaRef,
4972        aggregates: &[Expr],
4973        state: &SessionState,
4974        ctx: &TranslationContext,
4975    ) -> Result<(Arc<dyn ExecutionPlan>, SchemaRef, Vec<Expr>)> {
4976        use crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler;
4977
4978        let mut needs_projection = false;
4979        let mut proj_exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> =
4980            Vec::new();
4981        let mut rewritten_aggregates = Vec::new();
4982        let mut col_counter = 0;
4983
4984        // First pass: copy all existing columns
4985        for (i, field) in schema.fields().iter().enumerate() {
4986            let col_expr: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
4987                datafusion::physical_expr::expressions::Column::new(field.name(), i),
4988            );
4989            proj_exprs.push((col_expr, field.name().clone()));
4990        }
4991
4992        // Second pass: scan aggregates for custom expressions in arguments
4993        for agg_expr in aggregates {
4994            let Expr::FunctionCall {
4995                name,
4996                args,
4997                distinct,
4998                window_spec,
4999            } = agg_expr
5000            else {
5001                rewritten_aggregates.push(agg_expr.clone());
5002                continue;
5003            };
5004
5005            let mut rewritten_args = Vec::new();
5006            let mut agg_needs_rewrite = false;
5007
5008            for arg in args {
5009                if CypherPhysicalExprCompiler::contains_custom_expr(arg) {
5010                    // Compile the custom expression
5011                    let compiler = CypherPhysicalExprCompiler::new(state, Some(ctx))
5012                        .with_subquery_ctx(
5013                            self.graph_ctx.clone(),
5014                            self.schema.clone(),
5015                            self.session_ctx.clone(),
5016                            self.storage.clone(),
5017                            self.params.clone(),
5018                            self.outer_entity_vars.clone(),
5019                        );
5020                    let physical_expr = compiler.compile(arg, schema)?;
5021
5022                    // Add it as a projected column
5023                    let col_name = format!("__pc_{}", col_counter);
5024                    col_counter += 1;
5025                    proj_exprs.push((physical_expr, col_name.clone()));
5026
5027                    // Rewrite aggregate to reference the column
5028                    rewritten_args.push(Expr::Variable(col_name));
5029                    agg_needs_rewrite = true;
5030                    needs_projection = true;
5031                } else {
5032                    rewritten_args.push(arg.clone());
5033                }
5034            }
5035
5036            if agg_needs_rewrite {
5037                rewritten_aggregates.push(Expr::FunctionCall {
5038                    name: name.clone(),
5039                    args: rewritten_args,
5040                    distinct: *distinct,
5041                    window_spec: window_spec.clone(),
5042                });
5043            } else {
5044                rewritten_aggregates.push(agg_expr.clone());
5045            }
5046        }
5047
5048        if needs_projection {
5049            let projection_exec = Arc::new(
5050                datafusion::physical_plan::projection::ProjectionExec::try_new(
5051                    proj_exprs, input_plan,
5052                )?,
5053            );
5054            let new_schema = projection_exec.schema();
5055            Ok((projection_exec, new_schema, rewritten_aggregates))
5056        } else {
5057            Ok((input_plan, schema.clone(), aggregates.to_vec()))
5058        }
5059    }
5060
5061    /// Plan a sort operation.
5062    ///
5063    /// The `alias_map` provides a mapping from alias names to underlying expressions.
5064    /// This is needed because ORDER BY expressions may reference aliases defined in
5065    /// a parent Project node (e.g., `ORDER BY friend_count` where `friend_count`
5066    /// is an alias for `COUNT(r)`).
5067    fn plan_sort(
5068        &self,
5069        input: &LogicalPlan,
5070        order_by: &[SortItem],
5071        all_properties: &HashMap<String, HashSet<String>>,
5072        alias_map: &HashMap<String, Expr>,
5073    ) -> Result<Arc<dyn ExecutionPlan>> {
5074        let input_plan = self.plan_internal(input, all_properties)?;
5075        let schema = input_plan.schema();
5076
5077        let session = self.session_ctx.read();
5078
5079        // Build translation context with variable kinds from the input plan
5080        let ctx = self.translation_context_for_plan(input);
5081
5082        // Build DFSchema once for type coercion and physical expression conversion
5083        let df_schema = datafusion::common::DFSchema::try_from(schema.as_ref().clone())?;
5084
5085        // Translate sort expressions to DataFusion's SortExpr (a.k.a. Sort struct)
5086        // SortItem has `ascending: bool`, so use it directly
5087        // Default nulls_first to false for ASC, true for DESC
5088        use crate::query::df_graph::expr_compiler::CypherPhysicalExprCompiler;
5089
5090        let mut df_sort_exprs = Vec::new();
5091        let mut custom_physical_overrides: Vec<(
5092            usize,
5093            Arc<dyn datafusion::physical_expr::PhysicalExpr>,
5094        )> = Vec::new();
5095        for item in order_by {
5096            let mut sort_expr = item.expr.clone();
5097
5098            // If the sort expression is a variable that matches an alias,
5099            // replace it with the underlying expression
5100            if let Expr::Variable(ref name) = sort_expr {
5101                // Check if this name exists in the input schema
5102                let col_name = name.as_str();
5103                let exists_in_schema = schema.fields().iter().any(|f| f.name() == col_name);
5104
5105                if !exists_in_schema && let Some(aliased_expr) = alias_map.get(col_name) {
5106                    sort_expr = aliased_expr.clone();
5107                }
5108            }
5109
5110            let asc = item.ascending;
5111            let nulls_first = !asc; // Standard SQL behavior: nulls last for ASC, first for DESC
5112
5113            // Custom expressions (similar_to, comprehensions, etc.) cannot be
5114            // translated via cypher_expr_to_df. Compile with the custom compiler
5115            // and save as an override for the physical sort expression.
5116            if CypherPhysicalExprCompiler::contains_custom_expr(&sort_expr) {
5117                let sort_state = session.state();
5118                let compiler = CypherPhysicalExprCompiler::new(&sort_state, Some(&ctx))
5119                    .with_subquery_ctx(
5120                        self.graph_ctx.clone(),
5121                        self.schema.clone(),
5122                        self.session_ctx.clone(),
5123                        self.storage.clone(),
5124                        self.params.clone(),
5125                        self.outer_entity_vars.clone(),
5126                    );
5127                let inner_physical = compiler.compile(&sort_expr, &schema)?;
5128
5129                // Use a dummy column reference for the logical sort expression
5130                // (we'll replace the physical expression below).
5131                let first_col = schema
5132                    .fields()
5133                    .first()
5134                    .map(|f| f.name().clone())
5135                    .unwrap_or_else(|| "_dummy_".to_string());
5136                let dummy_expr = DfExpr::Column(datafusion::common::Column::from_name(&first_col));
5137                let sort_key_udf = crate::query::df_udfs::create_cypher_sort_key_udf();
5138                let sort_key_expr = sort_key_udf.call(vec![dummy_expr]);
5139                custom_physical_overrides.push((df_sort_exprs.len(), inner_physical));
5140                df_sort_exprs.push(DfSortExpr::new(sort_key_expr, asc, nulls_first));
5141                continue;
5142            }
5143
5144            let df_expr = cypher_expr_to_df(&sort_expr, Some(&ctx))?;
5145            let df_expr = Self::resolve_udfs(&df_expr, &session.state())?;
5146            let df_expr = crate::query::df_expr::apply_type_coercion(&df_expr, &df_schema)?;
5147            // Resolve UDFs again: apply_type_coercion may create new dummy UDF
5148            // placeholders (e.g. _cv_to_bool, _cypher_add) that need resolution.
5149            let df_expr = Self::resolve_udfs(&df_expr, &session.state())?;
5150
5151            // Single order-preserving sort key: _cypher_sort_key(expr) -> LargeBinary
5152            // The UDF handles all Cypher ordering semantics (cross-type ranks,
5153            // within-type comparisons, temporal normalization, NaN/null placement)
5154            // so memcmp of the resulting bytes gives correct Cypher ORDER BY.
5155            let sort_key_udf = crate::query::df_udfs::create_cypher_sort_key_udf();
5156            let sort_key_expr = sort_key_udf.call(vec![df_expr]);
5157            df_sort_exprs.push(DfSortExpr::new(sort_key_expr, asc, nulls_first));
5158        }
5159
5160        let mut physical_sort_exprs = create_physical_sort_exprs(
5161            &df_sort_exprs,
5162            &df_schema,
5163            session.state().execution_props(),
5164        )?;
5165
5166        // Replace the inner expression for custom sort expressions.
5167        // The _cypher_sort_key UDF wrapper is already in place; we just need
5168        // to swap the dummy column reference with the actual custom physical expr.
5169        for (idx, custom_inner) in custom_physical_overrides {
5170            if idx < physical_sort_exprs.len() {
5171                let phys = &physical_sort_exprs[idx];
5172                // The physical sort expression wraps _cypher_sort_key(dummy_col).
5173                // We need to replace the inner arg with our custom expression.
5174                // ScalarFunctionExpr wraps the UDF; rebuild it with the correct child.
5175                let sort_key_udf = Arc::new(crate::query::df_udfs::create_cypher_sort_key_udf());
5176                let config_options = Arc::new(datafusion::config::ConfigOptions::default());
5177                let udf_name = sort_key_udf.name().to_string();
5178                let new_sort_key = datafusion::physical_expr::ScalarFunctionExpr::new(
5179                    &udf_name,
5180                    sort_key_udf,
5181                    vec![custom_inner],
5182                    Arc::new(arrow_schema::Field::new(
5183                        "_cypher_sort_key",
5184                        DataType::LargeBinary,
5185                        true,
5186                    )),
5187                    config_options,
5188                );
5189                physical_sort_exprs[idx] = datafusion::physical_expr::PhysicalSortExpr {
5190                    expr: Arc::new(new_sort_key),
5191                    options: phys.options,
5192                };
5193            }
5194        }
5195
5196        // Convert Vec<PhysicalSortExpr> to LexOrdering
5197        // LexOrdering::new returns None for empty vector, so handle that case
5198        let lex_ordering = datafusion::physical_expr::LexOrdering::new(physical_sort_exprs)
5199            .ok_or_else(|| anyhow!("ORDER BY must have at least one sort expression"))?;
5200
5201        Ok(Arc::new(SortExec::new(lex_ordering, input_plan)))
5202    }
5203
5204    /// Plan a limit operation.
5205    fn plan_limit(
5206        &self,
5207        input: &LogicalPlan,
5208        skip: Option<usize>,
5209        fetch: Option<usize>,
5210        all_properties: &HashMap<String, HashSet<String>>,
5211    ) -> Result<Arc<dyn ExecutionPlan>> {
5212        let input_plan = self.plan_internal(input, all_properties)?;
5213
5214        // Handle SKIP via GlobalLimitExec (LocalLimitExec doesn't support offset)
5215        if let Some(offset) = skip.filter(|&s| s > 0) {
5216            use datafusion::physical_plan::limit::GlobalLimitExec;
5217            return Ok(Arc::new(GlobalLimitExec::new(input_plan, offset, fetch)));
5218        }
5219
5220        if let Some(limit) = fetch {
5221            Ok(Arc::new(LocalLimitExec::new(input_plan, limit)))
5222        } else {
5223            // No limit, return input as-is
5224            Ok(input_plan)
5225        }
5226    }
5227
5228    /// Plan a union operation.
5229    fn plan_union(
5230        &self,
5231        left: &LogicalPlan,
5232        right: &LogicalPlan,
5233        all: bool,
5234        all_properties: &HashMap<String, HashSet<String>>,
5235    ) -> Result<Arc<dyn ExecutionPlan>> {
5236        let left_plan = self.plan_internal(left, all_properties)?;
5237        let right_plan = self.plan_internal(right, all_properties)?;
5238
5239        // Guard against schema mismatches reaching DataFusion's
5240        // `union_schema`, which panics with `index out of bounds` rather
5241        // than returning `Err` when branch widths or per-position types
5242        // differ (issue rustic-ai/uni-db#62). With the planner-level
5243        // fallback in place for label disjunction this should be
5244        // unreachable, but a typed error here protects any future
5245        // logical-Union path against the same process-aborting panic.
5246        //
5247        // We only compare field count and per-position **type**; the
5248        // user-facing Cypher `UNION` clause routinely produces branches
5249        // whose per-position field *names* differ (e.g. `MATCH (a:A)
5250        // RETURN a AS a UNION MATCH (b:B) RETURN b AS a` — both branches
5251        // alias their pattern variable to `a`, but internal namespaced
5252        // columns like `a._vid` vs `b._vid` differ). DataFusion handles
5253        // that case fine by adopting left names; only width/type
5254        // mismatches are the panic source.
5255        let left_schema = left_plan.schema();
5256        let right_schema = right_plan.schema();
5257        if left_schema.fields().len() != right_schema.fields().len()
5258            || left_schema
5259                .fields()
5260                .iter()
5261                .zip(right_schema.fields().iter())
5262                .any(|(l, r)| l.data_type() != r.data_type())
5263        {
5264            let fmt = |s: &Schema| {
5265                s.fields()
5266                    .iter()
5267                    .map(|f| format!("{}: {:?}", f.name(), f.data_type()))
5268                    .collect::<Vec<_>>()
5269                    .join(", ")
5270            };
5271            return Err(anyhow!(
5272                "Plan: cannot UNION branches with mismatched schemas — \
5273                 left=[{}], right=[{}]. This is a planner bug; please file \
5274                 an issue.",
5275                fmt(left_schema.as_ref()),
5276                fmt(right_schema.as_ref()),
5277            ));
5278        }
5279
5280        let union_plan = UnionExec::try_new(vec![left_plan, right_plan])?;
5281
5282        // UNION (without ALL) requires deduplication
5283        if !all {
5284            use datafusion::physical_plan::aggregates::{
5285                AggregateExec, AggregateMode, PhysicalGroupBy,
5286            };
5287            use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
5288
5289            // First, coalesce all partitions into one to ensure global deduplication
5290            let coalesced = Arc::new(CoalescePartitionsExec::new(union_plan));
5291
5292            // Create group by all columns to deduplicate
5293            let schema = coalesced.schema();
5294            let group_by_exprs: Vec<_> = (0..schema.fields().len())
5295                .map(|i| {
5296                    (
5297                        Arc::new(datafusion::physical_plan::expressions::Column::new(
5298                            schema.field(i).name(),
5299                            i,
5300                        ))
5301                            as Arc<dyn datafusion::physical_expr::PhysicalExpr>,
5302                        schema.field(i).name().clone(),
5303                    )
5304                })
5305                .collect();
5306
5307            let group_by = PhysicalGroupBy::new_single(group_by_exprs);
5308
5309            Ok(Arc::new(AggregateExec::try_new(
5310                AggregateMode::Single,
5311                group_by,
5312                vec![], // No aggregate functions, just grouping for distinct
5313                vec![], // No filters
5314                coalesced,
5315                schema,
5316            )?))
5317        } else {
5318            // UNION ALL - just return the union
5319            Ok(union_plan)
5320        }
5321    }
5322
5323    /// Plan all window functions (aggregate and manual) using DataFusion's WindowAggExec.
5324    ///
5325    /// Translates Cypher window expressions to DataFusion's window function execution plan.
5326    /// Supports both aggregate window functions (SUM, AVG, etc.) via AggregateUDF and
5327    /// manual window functions (ROW_NUMBER, RANK, LAG, etc.) via WindowUDF.
5328    fn plan_window_functions(
5329        &self,
5330        input: Arc<dyn ExecutionPlan>,
5331        window_exprs: &[Expr],
5332        context_plan: Option<&LogicalPlan>,
5333    ) -> Result<Arc<dyn ExecutionPlan>> {
5334        use datafusion::functions_aggregate::average::avg_udaf;
5335        use datafusion::functions_aggregate::count::count_udaf;
5336        use datafusion::functions_aggregate::min_max::{max_udaf, min_udaf};
5337        use datafusion::functions_aggregate::sum::sum_udaf;
5338        use datafusion::functions_window::lead_lag::{lag_udwf, lead_udwf};
5339        use datafusion::functions_window::nth_value::{
5340            first_value_udwf, last_value_udwf, nth_value_udwf,
5341        };
5342        use datafusion::functions_window::ntile::ntile_udwf;
5343        use datafusion::functions_window::rank::{dense_rank_udwf, rank_udwf};
5344        use datafusion::functions_window::row_number::row_number_udwf;
5345        use datafusion::logical_expr::{WindowFrame, WindowFunctionDefinition};
5346        use datafusion::physical_expr::LexOrdering;
5347        use datafusion::physical_plan::sorts::sort::SortExec;
5348        use datafusion::physical_plan::windows::{WindowAggExec, create_window_expr};
5349
5350        let input_schema = input.schema();
5351        let df_schema = datafusion::common::DFSchema::try_from(input_schema.as_ref().clone())?;
5352
5353        let session = self.session_ctx.read();
5354        let state = session.state();
5355
5356        // Build translation context with variable kinds if we have a logical plan
5357        let tx_ctx = context_plan.map(|p| self.translation_context_for_plan(p));
5358        let mut window_expr_list = Vec::new();
5359
5360        for expr in window_exprs {
5361            let Expr::FunctionCall {
5362                name,
5363                args,
5364                distinct,
5365                window_spec: Some(window_spec),
5366            } = expr
5367            else {
5368                return Err(anyhow!("Expected window function call with OVER clause"));
5369            };
5370
5371            let name_lower = name.to_lowercase();
5372
5373            // Resolve the window function definition: either AggregateUDF or WindowUDF
5374            let (window_fn_def, is_aggregate) = match name_lower.as_str() {
5375                // Aggregate window functions → AggregateUDF
5376                "count" => (WindowFunctionDefinition::AggregateUDF(count_udaf()), true),
5377                "sum" => (WindowFunctionDefinition::AggregateUDF(sum_udaf()), true),
5378                "avg" => (WindowFunctionDefinition::AggregateUDF(avg_udaf()), true),
5379                "min" => (WindowFunctionDefinition::AggregateUDF(min_udaf()), true),
5380                "max" => (WindowFunctionDefinition::AggregateUDF(max_udaf()), true),
5381                // Manual window functions → WindowUDF
5382                "row_number" => (
5383                    WindowFunctionDefinition::WindowUDF(row_number_udwf()),
5384                    false,
5385                ),
5386                "rank" => (WindowFunctionDefinition::WindowUDF(rank_udwf()), false),
5387                "dense_rank" => (
5388                    WindowFunctionDefinition::WindowUDF(dense_rank_udwf()),
5389                    false,
5390                ),
5391                "lag" => (WindowFunctionDefinition::WindowUDF(lag_udwf()), false),
5392                "lead" => (WindowFunctionDefinition::WindowUDF(lead_udwf()), false),
5393                "ntile" => {
5394                    // Validate NTILE bucket count: must be positive
5395                    if let Some(Expr::Literal(CypherLiteral::Integer(n))) = args.first()
5396                        && *n <= 0
5397                    {
5398                        return Err(anyhow!("NTILE bucket count must be positive, got: {}", n));
5399                    }
5400                    (WindowFunctionDefinition::WindowUDF(ntile_udwf()), false)
5401                }
5402                "first_value" => (
5403                    WindowFunctionDefinition::WindowUDF(first_value_udwf()),
5404                    false,
5405                ),
5406                "last_value" => (
5407                    WindowFunctionDefinition::WindowUDF(last_value_udwf()),
5408                    false,
5409                ),
5410                "nth_value" => (WindowFunctionDefinition::WindowUDF(nth_value_udwf()), false),
5411                other => return Err(anyhow!("Unsupported window function: {}", other)),
5412            };
5413
5414            // Translate argument expressions to physical expressions
5415            let physical_args: Vec<Arc<dyn datafusion::physical_expr::PhysicalExpr>> =
5416                if args.is_empty() || matches!(args.as_slice(), [Expr::Wildcard]) {
5417                    // COUNT(*) or zero-arg functions (row_number, rank, dense_rank)
5418                    if is_aggregate {
5419                        vec![create_physical_expr(
5420                            &datafusion::logical_expr::lit(1),
5421                            &df_schema,
5422                            state.execution_props(),
5423                        )?]
5424                    } else {
5425                        // Manual window functions with no args (row_number, rank, dense_rank)
5426                        vec![]
5427                    }
5428                } else {
5429                    args.iter()
5430                        .map(|arg| {
5431                            let mut df_expr = cypher_expr_to_df(arg, tx_ctx.as_ref())?;
5432
5433                            // Cast numeric types only for SUM/AVG aggregate functions:
5434                            // SUM needs Int64 to avoid overflow, AVG needs Float64
5435                            if is_aggregate {
5436                                let cast_type = match name_lower.as_str() {
5437                                    "sum" => Some(datafusion::arrow::datatypes::DataType::Int64),
5438                                    "avg" => Some(datafusion::arrow::datatypes::DataType::Float64),
5439                                    _ => None,
5440                                };
5441                                if let Some(target_type) = cast_type {
5442                                    df_expr = DfExpr::Cast(datafusion::logical_expr::Cast::new(
5443                                        Box::new(df_expr),
5444                                        target_type,
5445                                    ));
5446                                }
5447                            }
5448
5449                            create_physical_expr(&df_expr, &df_schema, state.execution_props())
5450                                .map_err(|e| anyhow!("Failed to create physical expr: {}", e))
5451                        })
5452                        .collect::<Result<Vec<_>>>()?
5453                };
5454
5455            // Translate PARTITION BY expressions to physical expressions
5456            let partition_by_physical: Vec<Arc<dyn datafusion::physical_expr::PhysicalExpr>> =
5457                window_spec
5458                    .partition_by
5459                    .iter()
5460                    .map(|e| {
5461                        let df_expr = cypher_expr_to_df(e, tx_ctx.as_ref())?;
5462                        create_physical_expr(&df_expr, &df_schema, state.execution_props())
5463                            .map_err(|e| anyhow!("Failed to create physical expr: {}", e))
5464                    })
5465                    .collect::<Result<Vec<_>>>()?;
5466
5467            // Translate ORDER BY expressions to physical sort expressions
5468            let mut order_by_physical: Vec<datafusion::physical_expr::PhysicalSortExpr> =
5469                window_spec
5470                    .order_by
5471                    .iter()
5472                    .map(|sort_item| {
5473                        let df_expr = cypher_expr_to_df(&sort_item.expr, tx_ctx.as_ref())?;
5474                        let physical_expr =
5475                            create_physical_expr(&df_expr, &df_schema, state.execution_props())
5476                                .map_err(|e| anyhow!("Failed to create physical expr: {}", e))?;
5477                        Ok(datafusion::physical_expr::PhysicalSortExpr {
5478                            expr: physical_expr,
5479                            options: datafusion::arrow::compute::SortOptions {
5480                                descending: !sort_item.ascending,
5481                                nulls_first: !sort_item.ascending, // SQL standard: nulls last for ASC
5482                            },
5483                        })
5484                    })
5485                    .collect::<Result<Vec<_>>>()?;
5486
5487            // DataFusion requires partition columns to have an ordering.
5488            // If ORDER BY is empty but PARTITION BY is not, add partition columns to ordering.
5489            if order_by_physical.is_empty() && !partition_by_physical.is_empty() {
5490                for partition_expr in &partition_by_physical {
5491                    order_by_physical.push(datafusion::physical_expr::PhysicalSortExpr {
5492                        expr: Arc::clone(partition_expr),
5493                        options: datafusion::arrow::compute::SortOptions {
5494                            descending: false,
5495                            nulls_first: false,
5496                        },
5497                    });
5498                }
5499            }
5500
5501            // Create window frame based on function type:
5502            // - Aggregate functions: cumulative when ORDER BY present, full partition when absent
5503            // - Manual window functions: always full partition (frame is irrelevant for ranking,
5504            //   and value functions like last_value/first_value expect full-partition semantics)
5505            let window_frame = if is_aggregate {
5506                if window_spec.order_by.is_empty() {
5507                    // No ORDER BY: aggregate over entire partition
5508                    use datafusion::logical_expr::{WindowFrameBound, WindowFrameUnits};
5509                    Arc::new(WindowFrame::new_bounds(
5510                        WindowFrameUnits::Rows,
5511                        WindowFrameBound::Preceding(datafusion::common::ScalarValue::UInt64(None)),
5512                        WindowFrameBound::Following(datafusion::common::ScalarValue::UInt64(None)),
5513                    ))
5514                } else {
5515                    // With ORDER BY: cumulative from partition start to current row
5516                    Arc::new(WindowFrame::new(Some(false)))
5517                }
5518            } else {
5519                // Manual window functions: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
5520                use datafusion::logical_expr::{WindowFrameBound, WindowFrameUnits};
5521                Arc::new(WindowFrame::new_bounds(
5522                    WindowFrameUnits::Rows,
5523                    WindowFrameBound::Preceding(datafusion::common::ScalarValue::UInt64(None)),
5524                    WindowFrameBound::Following(datafusion::common::ScalarValue::UInt64(None)),
5525                ))
5526            };
5527
5528            // Get the output name
5529            let alias = expr.to_string_repr();
5530
5531            // Create the window expression using DataFusion's create_window_expr
5532            let window_expr = create_window_expr(
5533                &window_fn_def,
5534                alias,
5535                &physical_args,
5536                &partition_by_physical,
5537                &order_by_physical,
5538                window_frame,
5539                input_schema.clone(),
5540                false, // ignore_nulls
5541                *distinct,
5542                None, // filter
5543            )?;
5544
5545            window_expr_list.push(window_expr);
5546        }
5547
5548        // WindowAggExec requires input to be sorted by partition columns + order by columns.
5549        // Create a SortExec to ensure proper ordering.
5550        let mut sort_exprs = Vec::new();
5551
5552        // Add partition columns to sort (must be sorted by partition first)
5553        for expr in window_exprs {
5554            if let Expr::FunctionCall {
5555                window_spec: Some(window_spec),
5556                ..
5557            } = expr
5558            {
5559                for partition_expr in &window_spec.partition_by {
5560                    let df_expr = cypher_expr_to_df(partition_expr, tx_ctx.as_ref())?;
5561                    let physical_expr =
5562                        create_physical_expr(&df_expr, &df_schema, state.execution_props())?;
5563
5564                    // Only add if not already in sort list
5565                    // Use display comparison as proxy for equality since PhysicalExpr doesn't implement Eq
5566                    if !sort_exprs
5567                        .iter()
5568                        .any(|s: &datafusion::physical_expr::PhysicalSortExpr| {
5569                            s.expr.to_string() == physical_expr.to_string()
5570                        })
5571                    {
5572                        sort_exprs.push(datafusion::physical_expr::PhysicalSortExpr {
5573                            expr: physical_expr,
5574                            options: datafusion::arrow::compute::SortOptions {
5575                                descending: false,
5576                                nulls_first: false,
5577                            },
5578                        });
5579                    }
5580                }
5581
5582                // Then add order by columns
5583                for sort_item in &window_spec.order_by {
5584                    let df_expr = cypher_expr_to_df(&sort_item.expr, tx_ctx.as_ref())?;
5585                    let physical_expr =
5586                        create_physical_expr(&df_expr, &df_schema, state.execution_props())?;
5587
5588                    sort_exprs.push(datafusion::physical_expr::PhysicalSortExpr {
5589                        expr: physical_expr,
5590                        options: datafusion::arrow::compute::SortOptions {
5591                            descending: !sort_item.ascending,
5592                            nulls_first: !sort_item.ascending,
5593                        },
5594                    });
5595                }
5596            }
5597        }
5598
5599        // Add SortExec before WindowAggExec if we have partition or order by columns
5600        let sorted_input = if !sort_exprs.is_empty() {
5601            let lex_ordering = LexOrdering::new(sort_exprs)
5602                .ok_or_else(|| anyhow!("Failed to create LexOrdering for window function"))?;
5603            Arc::new(SortExec::new(lex_ordering, input)) as Arc<dyn ExecutionPlan>
5604        } else {
5605            input
5606        };
5607
5608        // Create WindowAggExec
5609        let window_agg_exec = WindowAggExec::try_new(
5610            window_expr_list,
5611            sorted_input,
5612            false, // can_repartition - keep data on current partitions
5613        )?;
5614
5615        Ok(Arc::new(window_agg_exec))
5616    }
5617
5618    /// Plan an empty input that produces exactly one row.
5619    ///
5620    /// In Cypher, `RETURN 1` (without MATCH) expects a single row to project from.
5621    /// This matches the fallback executor behavior which returns `vec![HashMap::new()]`.
5622    fn plan_empty(&self) -> Result<Arc<dyn ExecutionPlan>> {
5623        let schema = Arc::new(Schema::empty());
5624        // Use PlaceholderRowExec to produce exactly one row (like SQL's "SELECT 1").
5625        // EmptyExec produces 0 rows, which breaks `RETURN 1 AS num`.
5626        Ok(Arc::new(PlaceholderRowExec::new(schema)))
5627    }
5628
5629    /// Plan a zero-length path binding.
5630    /// Converts a single node pattern `p = (a)` into a Path with one node and zero edges.
5631    fn plan_bind_zero_length_path(
5632        &self,
5633        input: &LogicalPlan,
5634        node_variable: &str,
5635        path_variable: &str,
5636        all_properties: &HashMap<String, HashSet<String>>,
5637    ) -> Result<Arc<dyn ExecutionPlan>> {
5638        let input_plan = self.plan_internal(input, all_properties)?;
5639        Ok(Arc::new(BindZeroLengthPathExec::new(
5640            input_plan,
5641            node_variable.to_string(),
5642            path_variable.to_string(),
5643            self.graph_ctx.clone(),
5644        )))
5645    }
5646
5647    /// Plan a fixed-length path binding.
5648    /// Synthesizes a path struct from existing node and edge columns.
5649    fn plan_bind_path(
5650        &self,
5651        input: &LogicalPlan,
5652        node_variables: &[String],
5653        edge_variables: &[String],
5654        path_variable: &str,
5655        all_properties: &HashMap<String, HashSet<String>>,
5656    ) -> Result<Arc<dyn ExecutionPlan>> {
5657        let input_plan = self.plan_internal(input, all_properties)?;
5658        Ok(Arc::new(BindFixedPathExec::new(
5659            input_plan,
5660            node_variables.to_vec(),
5661            edge_variables.to_vec(),
5662            path_variable.to_string(),
5663            self.graph_ctx.clone(),
5664        )))
5665    }
5666
5667    /// Extract simple property equality conditions from a Cypher expression tree.
5668    ///
5669    /// Handles patterns generated by `properties_to_expr`:
5670    /// - `variable.prop = literal` → `(prop, value)`
5671    /// - `cond1 AND cond2` → recursive extraction
5672    ///
5673    /// Returns `Vec<(property_name, expected_value)>` for use in L0 edge property
5674    /// checking during VLP BFS.
5675    fn extract_edge_property_conditions(expr: &Expr) -> Vec<(String, uni_common::Value)> {
5676        match expr {
5677            Expr::BinaryOp {
5678                left,
5679                op: uni_cypher::ast::BinaryOp::Eq,
5680                right,
5681            } => {
5682                // Pattern: variable.prop = literal
5683                if let Expr::Property(inner, prop_name) = left.as_ref()
5684                    && matches!(inner.as_ref(), Expr::Variable(_))
5685                    && let Expr::Literal(lit) = right.as_ref()
5686                {
5687                    return vec![(prop_name.clone(), lit.to_value())];
5688                }
5689                // Reverse: literal = variable.prop
5690                if let Expr::Literal(lit) = left.as_ref()
5691                    && let Expr::Property(inner, prop_name) = right.as_ref()
5692                    && matches!(inner.as_ref(), Expr::Variable(_))
5693                {
5694                    return vec![(prop_name.clone(), lit.to_value())];
5695                }
5696                vec![]
5697            }
5698            Expr::BinaryOp {
5699                left,
5700                op: uni_cypher::ast::BinaryOp::And,
5701                right,
5702            } => {
5703                let mut result = Self::extract_edge_property_conditions(left);
5704                result.extend(Self::extract_edge_property_conditions(right));
5705                result
5706            }
5707            _ => vec![],
5708        }
5709    }
5710
5711    /// Create a physical filter expression from a DataFusion logical expression.
5712    ///
5713    /// Applies type coercion to resolve mismatches like Int32 vs Int64
5714    /// before creating the physical expression.
5715    fn create_physical_filter_expr(
5716        &self,
5717        expr: &DfExpr,
5718        schema: &SchemaRef,
5719        session: &SessionContext,
5720    ) -> Result<Arc<dyn datafusion::physical_expr::PhysicalExpr>> {
5721        let df_schema = datafusion::common::DFSchema::try_from(schema.as_ref().clone())?;
5722        let state = session.state();
5723
5724        // Replace DummyUdf placeholders with registered UDFs
5725        let resolved_expr = Self::resolve_udfs(expr, &state)?;
5726
5727        // Apply type coercion to resolve Int32/Int64, Float32/Float64 mismatches
5728        let coerced_expr = crate::query::df_expr::apply_type_coercion(&resolved_expr, &df_schema)?;
5729
5730        // Re-resolve UDFs after coercion (coercion may introduce new dummy UDF calls)
5731        let coerced_expr = Self::resolve_udfs(&coerced_expr, &state)?;
5732
5733        // Use SessionState's create_physical_expr to properly resolve UDFs
5734        use datafusion::physical_planner::PhysicalPlanner;
5735        let planner = datafusion::physical_planner::DefaultPhysicalPlanner::default();
5736        let physical = planner.create_physical_expr(&coerced_expr, &df_schema, &state)?;
5737
5738        Ok(physical)
5739    }
5740
5741    /// Resolve DummyUdf placeholders to actual registered UDFs from SessionState.
5742    ///
5743    /// Uses DataFusion's TreeNode API to traverse the entire expression tree,
5744    /// replacing any ScalarFunction nodes whose UDF name matches a registered UDF.
5745    fn resolve_udfs(expr: &DfExpr, state: &datafusion::execution::SessionState) -> Result<DfExpr> {
5746        use datafusion::common::tree_node::{Transformed, TreeNode};
5747        use datafusion::logical_expr::Expr as DfExpr;
5748
5749        let result = expr
5750            .clone()
5751            .transform_up(|node| {
5752                if let DfExpr::ScalarFunction(ref func) = node {
5753                    let udf_name = func.func.name();
5754                    if let Some(registered_udf) = state.scalar_functions().get(udf_name) {
5755                        return Ok(Transformed::yes(DfExpr::ScalarFunction(
5756                            datafusion::logical_expr::expr::ScalarFunction {
5757                                func: registered_udf.clone(),
5758                                args: func.args.clone(),
5759                            },
5760                        )));
5761                    }
5762                }
5763                Ok(Transformed::no(node))
5764            })
5765            .map_err(|e| anyhow::anyhow!("Failed to resolve UDFs: {}", e))?;
5766
5767        Ok(result.data)
5768    }
5769
5770    /// Add a structural projection on top of an execution plan to create a Struct column
5771    /// for a Node or Edge variable.
5772    fn add_structural_projection(
5773        &self,
5774        input: Arc<dyn ExecutionPlan>,
5775        variable: &str,
5776        properties: &[String],
5777    ) -> Result<Arc<dyn ExecutionPlan>> {
5778        use datafusion::functions::expr_fn::named_struct;
5779        use datafusion::logical_expr::lit;
5780        use datafusion::physical_plan::projection::ProjectionExec;
5781
5782        let input_schema = input.schema();
5783        let mut proj_exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> =
5784            Vec::new();
5785
5786        // 1. Keep all existing columns
5787        for (i, field) in input_schema.fields().iter().enumerate() {
5788            let col_expr = Arc::new(datafusion::physical_expr::expressions::Column::new(
5789                field.name(),
5790                i,
5791            ));
5792            proj_exprs.push((col_expr, field.name().clone()));
5793        }
5794
5795        // 2. Add the named_struct AS variable
5796        let mut struct_args = Vec::with_capacity(properties.len() * 2 + 4);
5797
5798        // Add _vid field for identity access
5799        struct_args.push(lit("_vid"));
5800        struct_args.push(DfExpr::Column(datafusion::common::Column::from_name(
5801            format!("{}._vid", variable),
5802        )));
5803
5804        // Add _labels field for labels() function support
5805        struct_args.push(lit("_labels"));
5806        struct_args.push(DfExpr::Column(datafusion::common::Column::from_name(
5807            format!("{}._labels", variable),
5808        )));
5809
5810        for prop in properties {
5811            struct_args.push(lit(prop.clone()));
5812            struct_args.push(DfExpr::Column(datafusion::common::Column::from_name(
5813                format!("{}.{}", variable, prop),
5814            )));
5815        }
5816
5817        // If no properties, still create an empty struct to represent the entity
5818        let struct_expr = named_struct(struct_args);
5819
5820        let df_schema = datafusion::common::DFSchema::try_from(input_schema.as_ref().clone())?;
5821        let session = self.session_ctx.read();
5822        let state = session.state();
5823
5824        // Resolve DummyUdf placeholders
5825        let resolved_expr = Self::resolve_udfs(&struct_expr, &state)?;
5826
5827        use datafusion::physical_planner::PhysicalPlanner;
5828        let planner = datafusion::physical_planner::DefaultPhysicalPlanner::default();
5829        let physical_struct_expr =
5830            planner.create_physical_expr(&resolved_expr, &df_schema, &state)?;
5831
5832        proj_exprs.push((physical_struct_expr, variable.to_string()));
5833
5834        Ok(Arc::new(ProjectionExec::try_new(proj_exprs, input)?))
5835    }
5836
5837    /// Add a structural projection for an edge variable (builds a Struct with _eid, _type, _src, _dst + properties).
5838    fn add_edge_structural_projection(
5839        &self,
5840        input: Arc<dyn ExecutionPlan>,
5841        variable: &str,
5842        properties: &[String],
5843        source_variable: &str,
5844        target_variable: &str,
5845    ) -> Result<Arc<dyn ExecutionPlan>> {
5846        use datafusion::functions::expr_fn::named_struct;
5847        use datafusion::logical_expr::lit;
5848        use datafusion::physical_plan::projection::ProjectionExec;
5849
5850        let input_schema = input.schema();
5851        let mut proj_exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> =
5852            Vec::new();
5853
5854        // 1. Keep all existing columns
5855        for (i, field) in input_schema.fields().iter().enumerate() {
5856            let col_expr = Arc::new(datafusion::physical_expr::expressions::Column::new(
5857                field.name(),
5858                i,
5859            ));
5860            proj_exprs.push((col_expr, field.name().clone()));
5861        }
5862
5863        // 2. Build named_struct with system fields + properties
5864        let mut struct_args = Vec::with_capacity(properties.len() * 2 + 10);
5865
5866        // Add _eid field for identity access
5867        struct_args.push(lit("_eid"));
5868        struct_args.push(DfExpr::Column(datafusion::common::Column::from_name(
5869            format!("{}._eid", variable),
5870        )));
5871
5872        struct_args.push(lit("_type"));
5873        struct_args.push(DfExpr::Column(datafusion::common::Column::from_name(
5874            format!("{}._type", variable),
5875        )));
5876
5877        // Add _src and _dst from source/target variable VIDs so the result
5878        // normalizer can detect this as an edge.
5879        // Use {var}._vid when available, falling back to bare {var} column
5880        // (e.g., in EXISTS subqueries where the source is a parameter VID).
5881        let resolve_vid_col = |var: &str| -> String {
5882            let vid_col = format!("{}._vid", var);
5883            if input_schema.column_with_name(&vid_col).is_some() {
5884                vid_col
5885            } else {
5886                var.to_string()
5887            }
5888        };
5889        let src_col_name = resolve_vid_col(source_variable);
5890        let dst_col_name = resolve_vid_col(target_variable);
5891        struct_args.push(lit("_src"));
5892        struct_args.push(DfExpr::Column(datafusion::common::Column::from_name(
5893            src_col_name,
5894        )));
5895
5896        struct_args.push(lit("_dst"));
5897        struct_args.push(DfExpr::Column(datafusion::common::Column::from_name(
5898            dst_col_name,
5899        )));
5900
5901        // Include _all_props if present (for keys()/properties() on schemaless edges)
5902        let all_props_col = format!("{}._all_props", variable);
5903        if input_schema.column_with_name(&all_props_col).is_some() {
5904            struct_args.push(lit("_all_props"));
5905            struct_args.push(DfExpr::Column(datafusion::common::Column::from_name(
5906                all_props_col,
5907            )));
5908        }
5909
5910        for prop in properties {
5911            struct_args.push(lit(prop.clone()));
5912            struct_args.push(DfExpr::Column(datafusion::common::Column::from_name(
5913                format!("{}.{}", variable, prop),
5914            )));
5915        }
5916
5917        let struct_expr = named_struct(struct_args);
5918
5919        let df_schema = datafusion::common::DFSchema::try_from(input_schema.as_ref().clone())?;
5920        let session = self.session_ctx.read();
5921        let state = session.state();
5922
5923        let resolved_expr = Self::resolve_udfs(&struct_expr, &state)?;
5924
5925        use datafusion::physical_planner::PhysicalPlanner;
5926        let planner = datafusion::physical_planner::DefaultPhysicalPlanner::default();
5927        let physical_struct_expr =
5928            planner.create_physical_expr(&resolved_expr, &df_schema, &state)?;
5929
5930        proj_exprs.push((physical_struct_expr, variable.to_string()));
5931
5932        Ok(Arc::new(ProjectionExec::try_new(proj_exprs, input)?))
5933    }
5934
5935    /// Create a physical aggregate expression.
5936    fn create_physical_aggregate(
5937        &self,
5938        expr: &DfExpr,
5939        schema: &SchemaRef,
5940        state: &SessionState,
5941    ) -> Result<PhysicalAggregate> {
5942        use datafusion::physical_planner::create_aggregate_expr_and_maybe_filter;
5943
5944        // Build a DFSchema from the Arrow schema for the function call
5945        let df_schema = datafusion::common::DFSchema::try_from(schema.as_ref().clone())?;
5946
5947        // The function returns (AggregateFunctionExpr, Option<filter>, Vec<ordering>)
5948        let (agg_expr, filter, _ordering) = create_aggregate_expr_and_maybe_filter(
5949            expr,
5950            &df_schema,
5951            schema.as_ref(),
5952            state.execution_props(),
5953        )?;
5954        Ok((agg_expr, filter))
5955    }
5956
5957    /// Resolve the source VID column for traversal, adding a struct field extraction
5958    /// projection if the source variable is a struct column (e.g., after WITH aggregation).
5959    ///
5960    /// Returns the (possibly modified) input plan and the column name to use as the source VID.
5961    fn resolve_source_vid_col(
5962        input_plan: Arc<dyn ExecutionPlan>,
5963        source_variable: &str,
5964    ) -> Result<(Arc<dyn ExecutionPlan>, String)> {
5965        let source_vid_col = format!("{}._vid", source_variable);
5966        if input_plan
5967            .schema()
5968            .column_with_name(&source_vid_col)
5969            .is_some()
5970        {
5971            return Ok((input_plan, source_vid_col));
5972        }
5973        // Check if the variable is a struct column (entity after WITH aggregation).
5974        // If so, add a projection to extract _vid from the struct.
5975        if let Ok(field) = input_plan.schema().field_with_name(source_variable)
5976            && matches!(
5977                field.data_type(),
5978                datafusion::arrow::datatypes::DataType::Struct(_)
5979            )
5980        {
5981            let enriched = Self::extract_struct_identity_columns(input_plan, source_variable)?;
5982            return Ok((enriched, format!("{}._vid", source_variable)));
5983        }
5984        Ok((input_plan, source_variable.to_string()))
5985    }
5986
5987    /// Add a projection that extracts `{variable}._vid` and `{variable}._labels` from
5988    /// a struct column named `{variable}`. This is needed when an entity variable
5989    /// has been passed through a WITH + aggregation and exists as a struct rather
5990    /// than flat columns.
5991    fn extract_struct_identity_columns(
5992        input: Arc<dyn ExecutionPlan>,
5993        variable: &str,
5994    ) -> Result<Arc<dyn ExecutionPlan>> {
5995        use datafusion::common::ScalarValue;
5996        use datafusion::physical_plan::projection::ProjectionExec;
5997
5998        let schema = input.schema();
5999        let mut proj_exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> =
6000            Vec::new();
6001
6002        // Keep all existing columns
6003        for (i, field) in schema.fields().iter().enumerate() {
6004            let col_expr = Arc::new(datafusion::physical_expr::expressions::Column::new(
6005                field.name(),
6006                i,
6007            ));
6008            proj_exprs.push((col_expr, field.name().clone()));
6009        }
6010
6011        // Find the struct column and extract identity fields using get_field UDF
6012        if let Some((struct_idx, struct_field)) = schema
6013            .fields()
6014            .iter()
6015            .enumerate()
6016            .find(|(_, f)| f.name() == variable)
6017            && let datafusion::arrow::datatypes::DataType::Struct(fields) = struct_field.data_type()
6018        {
6019            let struct_col: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
6020                datafusion::physical_expr::expressions::Column::new(variable, struct_idx),
6021            );
6022            let get_field_udf: Arc<datafusion::logical_expr::ScalarUDF> =
6023                Arc::new(datafusion::logical_expr::ScalarUDF::from(
6024                    datafusion::functions::core::getfield::GetFieldFunc::new(),
6025                ));
6026
6027            // Extract _vid field
6028            if fields.iter().any(|f| f.name() == "_vid") {
6029                let field_name: Arc<dyn datafusion::physical_expr::PhysicalExpr> =
6030                    Arc::new(datafusion::physical_expr::expressions::Literal::new(
6031                        ScalarValue::Utf8(Some("_vid".to_string())),
6032                    ));
6033                let vid_expr = Arc::new(datafusion::physical_expr::ScalarFunctionExpr::try_new(
6034                    get_field_udf.clone(),
6035                    vec![struct_col.clone(), field_name],
6036                    schema.as_ref(),
6037                    Arc::new(datafusion::common::config::ConfigOptions::default()),
6038                )?);
6039                proj_exprs.push((vid_expr, format!("{}._vid", variable)));
6040            }
6041
6042            // Extract _labels field
6043            if fields.iter().any(|f| f.name() == "_labels") {
6044                let field_name: Arc<dyn datafusion::physical_expr::PhysicalExpr> =
6045                    Arc::new(datafusion::physical_expr::expressions::Literal::new(
6046                        ScalarValue::Utf8(Some("_labels".to_string())),
6047                    ));
6048                let labels_expr = Arc::new(datafusion::physical_expr::ScalarFunctionExpr::try_new(
6049                    get_field_udf,
6050                    vec![struct_col, field_name],
6051                    schema.as_ref(),
6052                    Arc::new(datafusion::common::config::ConfigOptions::default()),
6053                )?);
6054                proj_exprs.push((labels_expr, format!("{}._labels", variable)));
6055            }
6056        }
6057
6058        Ok(Arc::new(ProjectionExec::try_new(proj_exprs, input)?))
6059    }
6060
6061    /// Add a projection that extracts ALL fields from a struct column named `{variable}`
6062    /// as flat `{variable}.{field_name}` columns. Used when a variable that passed through
6063    /// WITH + aggregation (and became a struct) is referenced by property access downstream.
6064    fn extract_all_struct_fields(
6065        input: Arc<dyn ExecutionPlan>,
6066        variable: &str,
6067    ) -> Result<Arc<dyn ExecutionPlan>> {
6068        use datafusion::common::ScalarValue;
6069        use datafusion::physical_plan::projection::ProjectionExec;
6070
6071        let schema = input.schema();
6072        let mut proj_exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> =
6073            Vec::new();
6074
6075        // Keep all existing columns
6076        for (i, field) in schema.fields().iter().enumerate() {
6077            let col_expr = Arc::new(datafusion::physical_expr::expressions::Column::new(
6078                field.name(),
6079                i,
6080            ));
6081            proj_exprs.push((col_expr, field.name().clone()));
6082        }
6083
6084        // Find the struct column and extract ALL fields
6085        if let Some((struct_idx, struct_field)) = schema
6086            .fields()
6087            .iter()
6088            .enumerate()
6089            .find(|(_, f)| f.name() == variable)
6090            && let datafusion::arrow::datatypes::DataType::Struct(fields) = struct_field.data_type()
6091        {
6092            let struct_col: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
6093                datafusion::physical_expr::expressions::Column::new(variable, struct_idx),
6094            );
6095            let get_field_udf: Arc<datafusion::logical_expr::ScalarUDF> =
6096                Arc::new(datafusion::logical_expr::ScalarUDF::from(
6097                    datafusion::functions::core::getfield::GetFieldFunc::new(),
6098                ));
6099
6100            for field in fields.iter() {
6101                let flat_name = format!("{}.{}", variable, field.name());
6102                // Skip if already exists as a flat column
6103                if schema.column_with_name(&flat_name).is_some() {
6104                    continue;
6105                }
6106                let field_lit: Arc<dyn datafusion::physical_expr::PhysicalExpr> =
6107                    Arc::new(datafusion::physical_expr::expressions::Literal::new(
6108                        ScalarValue::Utf8(Some(field.name().to_string())),
6109                    ));
6110                let extract_expr =
6111                    Arc::new(datafusion::physical_expr::ScalarFunctionExpr::try_new(
6112                        get_field_udf.clone(),
6113                        vec![struct_col.clone(), field_lit],
6114                        schema.as_ref(),
6115                        Arc::new(datafusion::common::config::ConfigOptions::default()),
6116                    )?);
6117                proj_exprs.push((extract_expr, flat_name));
6118            }
6119        }
6120
6121        Ok(Arc::new(ProjectionExec::try_new(proj_exprs, input)?))
6122    }
6123
6124    /// Check if a DataFusion expression refers to a LargeBinary column in the schema.
6125    fn is_large_binary_col(&self, expr: &DfExpr, schema: &SchemaRef) -> bool {
6126        if let DfExpr::Column(col) = expr
6127            && let Ok(field) = schema.field_with_name(&col.name)
6128        {
6129            return matches!(
6130                field.data_type(),
6131                datafusion::arrow::datatypes::DataType::LargeBinary
6132            );
6133        }
6134        // For any other expression type, conservatively return true
6135        // since schemaless properties are stored as LargeBinary
6136        true
6137    }
6138}
6139
6140// ---------------------------------------------------------------------------
6141// Locy operator helpers
6142// ---------------------------------------------------------------------------
6143
6144/// Resolve column names to indices in a schema.
6145/// Strip structural projection columns from a physical plan.
6146///
6147/// Graph scans add `named_struct` columns for node/edge variables (e.g., column `a`
6148/// of type `Struct{_vid, _labels, _all_props}`). When CrossJoined with a derived scan
6149/// Coerce a physical expression from `actual_dt` to `target_dt`.
6150///
6151/// Arrow's CastExpr cannot handle LargeBinary→Float64 because LargeBinary holds
6152/// serialized CypherValue bytes. For these cases, use the `_cypher_to_float64` UDF
6153/// which deserializes properly. For standard numeric coercions (Int64→Float64 etc.)
6154/// we use Arrow's built-in CastExpr.
6155fn coerce_physical_expr(
6156    expr: Arc<dyn datafusion::physical_expr::PhysicalExpr>,
6157    actual_dt: &DataType,
6158    target_dt: &DataType,
6159    schema: &arrow_schema::Schema,
6160) -> Arc<dyn datafusion::physical_expr::PhysicalExpr> {
6161    use datafusion::physical_expr::expressions::CastExpr;
6162
6163    match (actual_dt, target_dt) {
6164        // LargeBinary → Float64: use Cypher value deserializer UDF
6165        (DataType::LargeBinary, DataType::Float64) => wrap_cypher_to_float64(expr, schema),
6166        // LargeBinary → Int64: cast through Float64 first (extract number, then truncate)
6167        (DataType::LargeBinary, DataType::Int64) => {
6168            let float_expr = wrap_cypher_to_float64(expr, schema);
6169            Arc::new(CastExpr::new(float_expr, DataType::Int64, None))
6170        }
6171        // Standard Arrow casts (Int64→Float64, Float64→Int64, etc.)
6172        _ => Arc::new(CastExpr::new(expr, target_dt.clone(), None)),
6173    }
6174}
6175
6176/// Wrap a physical expression with `_cypher_to_float64` UDF.
6177fn wrap_cypher_to_float64(
6178    expr: Arc<dyn datafusion::physical_expr::PhysicalExpr>,
6179    schema: &arrow_schema::Schema,
6180) -> Arc<dyn datafusion::physical_expr::PhysicalExpr> {
6181    let udf = Arc::new(super::df_udfs::cypher_to_float64_udf());
6182    let config = Arc::new(datafusion::common::config::ConfigOptions::default());
6183    Arc::new(
6184        datafusion::physical_expr::ScalarFunctionExpr::try_new(udf, vec![expr], schema, config)
6185            .expect("CypherToFloat64Udf accepts Any(1) signature"),
6186    )
6187}
6188
6189/// Strip structural projection columns from a physical plan that conflict with
6190/// derived scan column names.
6191///
6192/// Graph scans add `named_struct` columns for node/edge variables (e.g., column `a`
6193/// of type `Struct{_vid, _labels, _all_props}`). When CrossJoined with a derived scan
6194/// that also has a column `a` (UInt64 VID), the duplicate name causes ambiguous
6195/// column resolution. This function removes ONLY those Struct-typed columns whose
6196/// names collide with derived scan columns, preserving non-conflicting struct columns
6197/// (like edge structs) that are needed for typed property access.
6198fn strip_conflicting_structural_columns(
6199    input: Arc<dyn datafusion::physical_plan::ExecutionPlan>,
6200    derived_col_names: &HashSet<&str>,
6201) -> anyhow::Result<Arc<dyn datafusion::physical_plan::ExecutionPlan>> {
6202    use datafusion::physical_plan::projection::ProjectionExec;
6203
6204    let schema = input.schema();
6205    let proj_exprs: Vec<(Arc<dyn datafusion::physical_expr::PhysicalExpr>, String)> = schema
6206        .fields()
6207        .iter()
6208        .enumerate()
6209        .filter(|(_, f)| {
6210            // Remove Struct columns whose names conflict with derived scan columns.
6211            !(matches!(f.data_type(), arrow_schema::DataType::Struct(_))
6212                && derived_col_names.contains(f.name().as_str()))
6213        })
6214        .map(|(i, f)| {
6215            let col: Arc<dyn datafusion::physical_expr::PhysicalExpr> = Arc::new(
6216                datafusion::physical_expr::expressions::Column::new(f.name(), i),
6217            );
6218            (col, f.name().clone())
6219        })
6220        .collect();
6221
6222    if proj_exprs.len() == schema.fields().len() {
6223        // No conflicting structural columns
6224        return Ok(input);
6225    }
6226
6227    Ok(Arc::new(ProjectionExec::try_new(proj_exprs, input)?))
6228}
6229
6230fn resolve_column_indices(
6231    schema: &arrow_schema::SchemaRef,
6232    column_names: &[String],
6233) -> anyhow::Result<Vec<usize>> {
6234    column_names
6235        .iter()
6236        .map(|name| {
6237            schema
6238                .index_of(name)
6239                .map_err(|_| anyhow::anyhow!("Column '{}' not found in schema", name))
6240        })
6241        .collect()
6242}
6243
6244/// Resolve BEST BY criteria from `(Expr, ascending)` pairs to `SortCriterion` values.
6245fn resolve_best_by_criteria(
6246    schema: &arrow_schema::SchemaRef,
6247    criteria: &[(Expr, bool)],
6248) -> anyhow::Result<Vec<super::df_graph::locy_best_by::SortCriterion>> {
6249    criteria
6250        .iter()
6251        .map(|(expr, ascending)| {
6252            // Extract candidate column names — try property name first (short),
6253            // then full "var.prop" form, then variable name.
6254            let candidates: Vec<String> = match expr {
6255                Expr::Property(base, prop) => {
6256                    if let Expr::Variable(var) = base.as_ref() {
6257                        vec![prop.clone(), format!("{}.{}", var, prop)]
6258                    } else {
6259                        vec![prop.clone()]
6260                    }
6261                }
6262                Expr::Variable(name) => {
6263                    let short = name.rsplit('.').next().unwrap_or(name).to_string();
6264                    if short != *name {
6265                        vec![short, name.clone()]
6266                    } else {
6267                        vec![name.clone()]
6268                    }
6269                }
6270                _ => {
6271                    return Err(anyhow::anyhow!(
6272                        "BEST BY criteria must be variable or property access"
6273                    ));
6274                }
6275            };
6276            let col_index = candidates
6277                .iter()
6278                .find_map(|name| schema.index_of(name).ok())
6279                .ok_or_else(|| {
6280                    anyhow::anyhow!(
6281                        "BEST BY column '{}' not found",
6282                        candidates.first().unwrap_or(&String::new())
6283                    )
6284                })?;
6285            Ok(super::df_graph::locy_best_by::SortCriterion {
6286                col_index,
6287                ascending: *ascending,
6288                nulls_first: false, // NULLS LAST is Locy default
6289            })
6290        })
6291        .collect()
6292}
6293
6294/// Resolve fold bindings from `(output_name, aggregate_expr)` to `FoldBinding` values.
6295///
6296/// Normalizes grammar aliases to canonical names and resolves each aggregate
6297/// against `plugin_registry` so the runtime engine receives a pre-bound
6298/// [`uni_plugin::traits::locy::LocyAggregate`] trait object.
6299fn resolve_fold_bindings(
6300    schema: &arrow_schema::SchemaRef,
6301    fold_bindings: &[(String, Expr)],
6302    plugin_registry: &uni_plugin::PluginRegistry,
6303) -> anyhow::Result<Vec<super::df_graph::locy_fold::FoldBinding>> {
6304    use super::df_graph::locy_fold::resolve_locy_aggregate;
6305    fold_bindings
6306        .iter()
6307        .map(|(output_name, expr)| {
6308            // Parse aggregate expression: FunctionCall { name, args }
6309            match expr {
6310                Expr::FunctionCall { name, args, .. } => {
6311                    let upper = name.to_uppercase();
6312                    let is_count = matches!(upper.as_str(), "COUNT" | "MCOUNT");
6313
6314                    let canonical: smol_str::SmolStr = if is_count && args.is_empty() {
6315                        smol_str::SmolStr::new_static("COUNTALL")
6316                    } else {
6317                        match upper.as_str() {
6318                            "SUM" | "MSUM" => smol_str::SmolStr::new_static("SUM"),
6319                            "COUNT" | "MCOUNT" => smol_str::SmolStr::new_static("COUNT"),
6320                            "MAX" | "MMAX" => smol_str::SmolStr::new_static("MAX"),
6321                            "MIN" | "MMIN" => smol_str::SmolStr::new_static("MIN"),
6322                            "AVG" => smol_str::SmolStr::new_static("AVG"),
6323                            "COLLECT" => smol_str::SmolStr::new_static("COLLECT"),
6324                            "MNOR" => smol_str::SmolStr::new_static("MNOR"),
6325                            "MPROD" => smol_str::SmolStr::new_static("MPROD"),
6326                            other => {
6327                                return Err(anyhow::anyhow!(
6328                                    "Unsupported FOLD aggregate function: {}",
6329                                    other
6330                                ));
6331                            }
6332                        }
6333                    };
6334
6335                    let entry = resolve_locy_aggregate(plugin_registry, canonical.as_str())
6336                        .ok_or_else(|| {
6337                            anyhow::anyhow!(
6338                                "Locy aggregate '{canonical}' is not registered in the plugin registry"
6339                            )
6340                        })?;
6341                    let aggregate = Arc::clone(&entry.aggregate);
6342
6343                    // COUNTALL has no input column.
6344                    if canonical.as_str() == "COUNTALL" {
6345                        return Ok(super::df_graph::locy_fold::FoldBinding {
6346                            output_name: output_name.clone(),
6347                            name: canonical,
6348                            aggregate,
6349                            input_col_index: 0,
6350                            input_col_name: None,
6351                        });
6352                    }
6353
6354                    // The LocyProject aliases the aggregate input expression to the
6355                    // fold output name, so look up the output name in the schema.
6356                    let input_col_index = schema
6357                        .index_of(output_name)
6358                        .or_else(|_| {
6359                            // Fallback: try the raw argument column name
6360                            let col_name = match args.first() {
6361                                Some(Expr::Variable(name)) => Some(name.clone()),
6362                                Some(Expr::Property(base, prop)) => {
6363                                    if let Expr::Variable(var) = base.as_ref() {
6364                                        Some(format!("{}.{}", var, prop))
6365                                    } else {
6366                                        None
6367                                    }
6368                                }
6369                                _ => None,
6370                            };
6371                            col_name
6372                                .and_then(|n| schema.index_of(&n).ok())
6373                                .ok_or_else(|| {
6374                                    arrow_schema::ArrowError::SchemaError(format!(
6375                                        "FOLD column '{}' not found",
6376                                        output_name
6377                                    ))
6378                                })
6379                        })
6380                        .map_err(|_| anyhow::anyhow!("FOLD column '{}' not found", output_name))?;
6381                    Ok(super::df_graph::locy_fold::FoldBinding {
6382                        output_name: output_name.clone(),
6383                        name: canonical,
6384                        aggregate,
6385                        input_col_index,
6386                        input_col_name: Some(output_name.clone()),
6387                    })
6388                }
6389                _ => Err(anyhow::anyhow!(
6390                    "FOLD binding must be an aggregate function call"
6391                )),
6392            }
6393        })
6394        .collect()
6395}
6396
6397/// Recursively collect variable kinds (node, edge, path) from a LogicalPlan.
6398///
6399/// This information is used by the expression translator to resolve bare variable
6400/// references to their identity columns (e.g., `n` → `n._vid` for nodes).
6401fn collect_variable_kinds(plan: &LogicalPlan, kinds: &mut HashMap<String, VariableKind>) {
6402    match plan {
6403        // Phase 5b followup: recurse into the wrapped node so the
6404        // wrapped operator's variable still gets collected.
6405        LogicalPlan::FusedIndexScanWrapped { inner, .. } => {
6406            collect_variable_kinds(inner, kinds);
6407        }
6408        LogicalPlan::Scan { variable, .. }
6409        | LogicalPlan::FusedIndexScan { variable, .. }
6410        | LogicalPlan::ExtIdLookup { variable, .. }
6411        | LogicalPlan::ScanAll { variable, .. }
6412        | LogicalPlan::ScanMainByLabels { variable, .. }
6413        | LogicalPlan::VectorKnn { variable, .. }
6414        | LogicalPlan::InvertedIndexLookup { variable, .. } => {
6415            kinds.insert(variable.clone(), VariableKind::Node);
6416        }
6417        LogicalPlan::Traverse {
6418            input,
6419            source_variable,
6420            target_variable,
6421            step_variable,
6422            path_variable,
6423            is_variable_length,
6424            ..
6425        }
6426        | LogicalPlan::TraverseMainByType {
6427            input,
6428            source_variable,
6429            target_variable,
6430            step_variable,
6431            path_variable,
6432            is_variable_length,
6433            ..
6434        } => {
6435            collect_variable_kinds(input, kinds);
6436            kinds.insert(source_variable.clone(), VariableKind::Node);
6437            kinds.insert(target_variable.clone(), VariableKind::Node);
6438            if let Some(sv) = step_variable {
6439                kinds.insert(sv.clone(), VariableKind::edge_for(*is_variable_length));
6440            }
6441            if let Some(pv) = path_variable {
6442                kinds.insert(pv.clone(), VariableKind::Path);
6443            }
6444        }
6445        LogicalPlan::ShortestPath {
6446            input,
6447            source_variable,
6448            target_variable,
6449            path_variable,
6450            ..
6451        }
6452        | LogicalPlan::AllShortestPaths {
6453            input,
6454            source_variable,
6455            target_variable,
6456            path_variable,
6457            ..
6458        } => {
6459            collect_variable_kinds(input, kinds);
6460            kinds.insert(source_variable.clone(), VariableKind::Node);
6461            kinds.insert(target_variable.clone(), VariableKind::Node);
6462            kinds.insert(path_variable.clone(), VariableKind::Path);
6463        }
6464        LogicalPlan::QuantifiedPattern {
6465            input,
6466            pattern_plan,
6467            path_variable,
6468            start_variable,
6469            binding_variable,
6470            ..
6471        } => {
6472            collect_variable_kinds(input, kinds);
6473            collect_variable_kinds(pattern_plan, kinds);
6474            kinds.insert(start_variable.clone(), VariableKind::Node);
6475            kinds.insert(binding_variable.clone(), VariableKind::Node);
6476            if let Some(pv) = path_variable {
6477                kinds.insert(pv.clone(), VariableKind::Path);
6478            }
6479        }
6480        LogicalPlan::BindZeroLengthPath {
6481            input,
6482            node_variable,
6483            path_variable,
6484        } => {
6485            collect_variable_kinds(input, kinds);
6486            kinds.insert(node_variable.clone(), VariableKind::Node);
6487            kinds.insert(path_variable.clone(), VariableKind::Path);
6488        }
6489        LogicalPlan::BindPath {
6490            input,
6491            node_variables,
6492            edge_variables,
6493            path_variable,
6494        } => {
6495            collect_variable_kinds(input, kinds);
6496            for nv in node_variables {
6497                kinds.insert(nv.clone(), VariableKind::Node);
6498            }
6499            for ev in edge_variables {
6500                kinds.insert(ev.clone(), VariableKind::Edge);
6501            }
6502            kinds.insert(path_variable.clone(), VariableKind::Path);
6503        }
6504        // Wrapper nodes: recurse into input(s)
6505        LogicalPlan::Filter { input, .. }
6506        | LogicalPlan::Project { input, .. }
6507        | LogicalPlan::Sort { input, .. }
6508        | LogicalPlan::Limit { input, .. }
6509        | LogicalPlan::Aggregate { input, .. }
6510        | LogicalPlan::Distinct { input, .. }
6511        | LogicalPlan::Window { input, .. }
6512        | LogicalPlan::Unwind { input, .. }
6513        | LogicalPlan::Create { input, .. }
6514        | LogicalPlan::CreateBatch { input, .. }
6515        | LogicalPlan::Merge { input, .. }
6516        | LogicalPlan::Set { input, .. }
6517        | LogicalPlan::Remove { input, .. }
6518        | LogicalPlan::Delete { input, .. }
6519        | LogicalPlan::Foreach { input, .. }
6520        | LogicalPlan::SubqueryCall { input, .. } => {
6521            collect_variable_kinds(input, kinds);
6522        }
6523        LogicalPlan::Union { left, right, .. } | LogicalPlan::CrossJoin { left, right, .. } => {
6524            collect_variable_kinds(left, kinds);
6525            collect_variable_kinds(right, kinds);
6526        }
6527        LogicalPlan::Apply {
6528            input, subquery, ..
6529        } => {
6530            collect_variable_kinds(input, kinds);
6531            collect_variable_kinds(subquery, kinds);
6532        }
6533        LogicalPlan::RecursiveCTE {
6534            initial, recursive, ..
6535        } => {
6536            collect_variable_kinds(initial, kinds);
6537            collect_variable_kinds(recursive, kinds);
6538        }
6539        LogicalPlan::Explain { plan } => {
6540            collect_variable_kinds(plan, kinds);
6541        }
6542        LogicalPlan::ProcedureCall {
6543            procedure_name,
6544            yield_items,
6545            ..
6546        } => {
6547            use crate::query::df_graph::procedure_call::{
6548                is_node_yield_procedure_static, map_yield_to_canonical,
6549            };
6550            for (name, alias) in yield_items {
6551                let var = alias.as_ref().unwrap_or(name);
6552                if is_node_yield_procedure_static(procedure_name.as_str()) {
6553                    let canonical = map_yield_to_canonical(name);
6554                    if canonical == "node" {
6555                        kinds.insert(var.clone(), VariableKind::Node);
6556                    }
6557                    // Scalar yields (distance, score, vid) don't need VariableKind
6558                }
6559                // For schema procedures, yields are all scalars — no entry needed
6560            }
6561        }
6562        // Locy operators — no variable kinds to collect
6563        LogicalPlan::LocyProgram { .. }
6564        | LogicalPlan::LocyFold { .. }
6565        | LogicalPlan::LocyBestBy { .. }
6566        | LogicalPlan::LocyPriority { .. }
6567        | LogicalPlan::LocyDerivedScan { .. }
6568        | LogicalPlan::LocyProject { .. }
6569        | LogicalPlan::LocyModelInvoke { .. } => {}
6570        // Leaf nodes with no variables or not applicable
6571        LogicalPlan::Empty
6572        | LogicalPlan::CreateVectorIndex { .. }
6573        | LogicalPlan::CreateFullTextIndex { .. }
6574        | LogicalPlan::CreateScalarIndex { .. }
6575        | LogicalPlan::CreateJsonFtsIndex { .. }
6576        | LogicalPlan::DropIndex { .. }
6577        | LogicalPlan::ShowIndexes { .. }
6578        | LogicalPlan::Copy { .. }
6579        | LogicalPlan::Backup { .. }
6580        | LogicalPlan::ShowDatabase
6581        | LogicalPlan::ShowConfig
6582        | LogicalPlan::ShowStatistics
6583        | LogicalPlan::Vacuum
6584        | LogicalPlan::Checkpoint
6585        | LogicalPlan::CopyTo { .. }
6586        | LogicalPlan::CopyFrom { .. }
6587        | LogicalPlan::CreateLabel(_)
6588        | LogicalPlan::CreateEdgeType(_)
6589        | LogicalPlan::AlterLabel(_)
6590        | LogicalPlan::AlterEdgeType(_)
6591        | LogicalPlan::DropLabel(_)
6592        | LogicalPlan::DropEdgeType(_)
6593        | LogicalPlan::CreateConstraint(_)
6594        | LogicalPlan::DropConstraint(_)
6595        | LogicalPlan::ShowConstraints(_) => {}
6596    }
6597}
6598
6599/// Collect node variable names from CREATE/MERGE patterns for startNode/endNode UDFs.
6600///
6601/// These hints are used alongside `variable_kinds` to identify node variables
6602/// in mutation contexts for startNode/endNode resolution.
6603fn collect_mutation_node_hints(plan: &LogicalPlan, hints: &mut Vec<String>) {
6604    match plan {
6605        LogicalPlan::Create { input, pattern } => {
6606            collect_node_names_from_pattern(pattern, hints);
6607            collect_mutation_node_hints(input, hints);
6608        }
6609        LogicalPlan::CreateBatch { input, patterns } => {
6610            for pattern in patterns {
6611                collect_node_names_from_pattern(pattern, hints);
6612            }
6613            collect_mutation_node_hints(input, hints);
6614        }
6615        LogicalPlan::Merge { input, pattern, .. } => {
6616            collect_node_names_from_pattern(pattern, hints);
6617            collect_mutation_node_hints(input, hints);
6618        }
6619        // For all other nodes, recurse into inputs
6620        LogicalPlan::Traverse { input, .. }
6621        | LogicalPlan::TraverseMainByType { input, .. }
6622        | LogicalPlan::Filter { input, .. }
6623        | LogicalPlan::Project { input, .. }
6624        | LogicalPlan::Sort { input, .. }
6625        | LogicalPlan::Limit { input, .. }
6626        | LogicalPlan::Aggregate { input, .. }
6627        | LogicalPlan::Distinct { input, .. }
6628        | LogicalPlan::Window { input, .. }
6629        | LogicalPlan::Unwind { input, .. }
6630        | LogicalPlan::Set { input, .. }
6631        | LogicalPlan::Remove { input, .. }
6632        | LogicalPlan::Delete { input, .. }
6633        | LogicalPlan::Foreach { input, .. }
6634        | LogicalPlan::SubqueryCall { input, .. }
6635        | LogicalPlan::ShortestPath { input, .. }
6636        | LogicalPlan::AllShortestPaths { input, .. }
6637        | LogicalPlan::QuantifiedPattern { input, .. }
6638        | LogicalPlan::BindZeroLengthPath { input, .. }
6639        | LogicalPlan::BindPath { input, .. } => {
6640            collect_mutation_node_hints(input, hints);
6641        }
6642        LogicalPlan::Union { left, right, .. } | LogicalPlan::CrossJoin { left, right, .. } => {
6643            collect_mutation_node_hints(left, hints);
6644            collect_mutation_node_hints(right, hints);
6645        }
6646        LogicalPlan::Apply {
6647            input, subquery, ..
6648        } => {
6649            collect_mutation_node_hints(input, hints);
6650            collect_mutation_node_hints(subquery, hints);
6651        }
6652        LogicalPlan::RecursiveCTE {
6653            initial, recursive, ..
6654        } => {
6655            collect_mutation_node_hints(initial, hints);
6656            collect_mutation_node_hints(recursive, hints);
6657        }
6658        LogicalPlan::Explain { plan } => {
6659            collect_mutation_node_hints(plan, hints);
6660        }
6661        // Leaf nodes — nothing to collect
6662        _ => {}
6663    }
6664}
6665
6666/// Extract node variable names from a single Cypher pattern.
6667fn collect_node_names_from_pattern(pattern: &Pattern, hints: &mut Vec<String>) {
6668    for path in &pattern.paths {
6669        for element in &path.elements {
6670            match element {
6671                PatternElement::Node(n) => {
6672                    if let Some(ref v) = n.variable
6673                        && !hints.contains(v)
6674                    {
6675                        hints.push(v.clone());
6676                    }
6677                }
6678                PatternElement::Parenthesized { pattern, .. } => {
6679                    let sub = Pattern {
6680                        paths: vec![pattern.as_ref().clone()],
6681                    };
6682                    collect_node_names_from_pattern(&sub, hints);
6683                }
6684                _ => {}
6685            }
6686        }
6687    }
6688}
6689
6690/// Collect edge (relationship) variable names from CREATE/MERGE patterns.
6691///
6692/// Used by `id()` to resolve edge identity as `_eid` instead of `_vid`.
6693fn collect_mutation_edge_hints(plan: &LogicalPlan, hints: &mut Vec<String>) {
6694    match plan {
6695        LogicalPlan::Create { input, pattern } | LogicalPlan::Merge { input, pattern, .. } => {
6696            collect_edge_names_from_pattern(pattern, hints);
6697            collect_mutation_edge_hints(input, hints);
6698        }
6699        LogicalPlan::CreateBatch { input, patterns } => {
6700            for pattern in patterns {
6701                collect_edge_names_from_pattern(pattern, hints);
6702            }
6703            collect_mutation_edge_hints(input, hints);
6704        }
6705        // For all other nodes, recurse into inputs
6706        LogicalPlan::Traverse { input, .. }
6707        | LogicalPlan::TraverseMainByType { input, .. }
6708        | LogicalPlan::Filter { input, .. }
6709        | LogicalPlan::Project { input, .. }
6710        | LogicalPlan::Sort { input, .. }
6711        | LogicalPlan::Limit { input, .. }
6712        | LogicalPlan::Aggregate { input, .. }
6713        | LogicalPlan::Distinct { input, .. }
6714        | LogicalPlan::Window { input, .. }
6715        | LogicalPlan::Unwind { input, .. }
6716        | LogicalPlan::Set { input, .. }
6717        | LogicalPlan::Remove { input, .. }
6718        | LogicalPlan::Delete { input, .. }
6719        | LogicalPlan::Foreach { input, .. }
6720        | LogicalPlan::SubqueryCall { input, .. }
6721        | LogicalPlan::ShortestPath { input, .. }
6722        | LogicalPlan::AllShortestPaths { input, .. }
6723        | LogicalPlan::QuantifiedPattern { input, .. }
6724        | LogicalPlan::BindZeroLengthPath { input, .. }
6725        | LogicalPlan::BindPath { input, .. } => {
6726            collect_mutation_edge_hints(input, hints);
6727        }
6728        LogicalPlan::Union { left, right, .. } | LogicalPlan::CrossJoin { left, right, .. } => {
6729            collect_mutation_edge_hints(left, hints);
6730            collect_mutation_edge_hints(right, hints);
6731        }
6732        LogicalPlan::Apply {
6733            input, subquery, ..
6734        } => {
6735            collect_mutation_edge_hints(input, hints);
6736            collect_mutation_edge_hints(subquery, hints);
6737        }
6738        LogicalPlan::RecursiveCTE {
6739            initial, recursive, ..
6740        } => {
6741            collect_mutation_edge_hints(initial, hints);
6742            collect_mutation_edge_hints(recursive, hints);
6743        }
6744        LogicalPlan::Explain { plan } => {
6745            collect_mutation_edge_hints(plan, hints);
6746        }
6747        _ => {}
6748    }
6749}
6750
6751/// Extract edge (relationship) variable names from a single Cypher pattern.
6752fn collect_edge_names_from_pattern(pattern: &Pattern, hints: &mut Vec<String>) {
6753    for path in &pattern.paths {
6754        for element in &path.elements {
6755            match element {
6756                PatternElement::Relationship(r) => {
6757                    if let Some(ref v) = r.variable
6758                        && !hints.contains(v)
6759                    {
6760                        hints.push(v.clone());
6761                    }
6762                }
6763                PatternElement::Parenthesized { pattern, .. } => {
6764                    let sub = Pattern {
6765                        paths: vec![pattern.as_ref().clone()],
6766                    };
6767                    collect_edge_names_from_pattern(&sub, hints);
6768                }
6769                _ => {}
6770            }
6771        }
6772    }
6773}
6774
6775/// Convert AST Direction to adjacency cache Direction.
6776fn convert_direction(ast_dir: AstDirection) -> Direction {
6777    match ast_dir {
6778        AstDirection::Outgoing => Direction::Outgoing,
6779        AstDirection::Incoming => Direction::Incoming,
6780        AstDirection::Both => Direction::Both,
6781    }
6782}
6783
/// Normalize the VLP target property list produced by planner property collection.
///
/// Steps, in order:
///
/// 1. Every `"*"` entry is removed — it is a wildcard sentinel, not a real
///    property.
/// 2. `_all_props` is appended when either
///    - `target_has_wildcard` is set and nothing else remains, or
///    - some remaining property is neither reserved (`overflow_json` or a
///      name starting with `_`) nor listed in `target_label_props`, and
///      `_all_props` is not already present.
///
/// With `target_label_props == None`, no property counts as declared on the
/// label. The relative order of the surviving properties is preserved, and
/// `_all_props` is appended at most once.
fn sanitize_vlp_target_properties(
    mut properties: Vec<String>,
    target_has_wildcard: bool,
    target_label_props: Option<&HashSet<String>>,
) -> Vec<String> {
    const WILDCARD: &str = "*";
    const ALL_PROPS: &str = "_all_props";

    properties.retain(|name| name != WILDCARD);

    let only_wildcard_requested = target_has_wildcard && properties.is_empty();

    let declared_on_label =
        |name: &str| target_label_props.is_some_and(|declared| declared.contains(name));
    let needs_schemaless_lookup = properties.iter().any(|name| {
        let reserved = name == ALL_PROPS || name == "overflow_json" || name.starts_with('_');
        !reserved && !declared_on_label(name.as_str())
    });
    let already_requested = properties.iter().any(|name| name == ALL_PROPS);

    // The two triggers are mutually exclusive: a wildcard-only request leaves
    // the list empty, so it cannot also contain a non-schema property.
    if only_wildcard_requested || (needs_schemaless_lookup && !already_requested) {
        properties.push(ALL_PROPS.to_string());
    }

    properties
}
6811
6812// ---------------------------------------------------------------------------
6813// Issue #53: helpers for the CrossJoin+Filter → HashJoinExec optimization.
6814// ---------------------------------------------------------------------------
6815
/// Classification of a Filter predicate sitting above a CrossJoin, used to
/// decide whether (and how) to rewrite it as a HashJoin.
///
/// The fields sort the predicate's conjuncts by which side(s) of the join
/// their variables belong to.
struct JoinPredicateClassification {
    /// Equi-join conditions: each `(left_expr, right_expr)` pair has
    /// `left_expr` referencing only LEFT-side variables and `right_expr`
    /// referencing only RIGHT-side variables.
    equi_pairs: Vec<(Expr, Expr)>,
    /// Conjuncts referencing ONLY left-side variables. Pushed into a Filter
    /// wrapped around the LEFT subtree before planning.
    left_only: Vec<Expr>,
    /// Conjuncts referencing ONLY right-side variables. Pushed into a Filter
    /// wrapped around the RIGHT subtree before planning.
    right_only: Vec<Expr>,
    /// Conjuncts referencing both sides but NOT in equi-join form. Applied as
    /// a post-join FilterExec. `None` when there is no such conjunct
    /// (presumably multiple ones are combined into one expression — confirm
    /// against the classifier).
    residual: Option<Expr>,
}
6833
6834/// Walk a LogicalPlan subtree and collect all variable names produced by it
6835/// (Scans, Unwind targets, Traverse targets, etc.). Used to classify which
6836/// side of a CrossJoin a predicate's variables belong to.
6837fn collect_plan_variables(plan: &LogicalPlan) -> HashSet<String> {
6838    let mut out = HashSet::new();
6839    collect_plan_variables_into(plan, &mut out);
6840    out
6841}
6842
6843fn collect_plan_variables_into(plan: &LogicalPlan, out: &mut HashSet<String>) {
6844    match plan {
6845        LogicalPlan::Scan { variable, .. }
6846        | LogicalPlan::ExtIdLookup { variable, .. }
6847        | LogicalPlan::ScanAll { variable, .. }
6848        | LogicalPlan::ScanMainByLabels { variable, .. } => {
6849            out.insert(variable.clone());
6850        }
6851        LogicalPlan::Unwind {
6852            input, variable, ..
6853        } => {
6854            out.insert(variable.clone());
6855            collect_plan_variables_into(input, out);
6856        }
6857        LogicalPlan::Traverse {
6858            input,
6859            source_variable,
6860            target_variable,
6861            step_variable,
6862            path_variable,
6863            ..
6864        } => {
6865            collect_plan_variables_into(input, out);
6866            out.insert(source_variable.clone());
6867            out.insert(target_variable.clone());
6868            if let Some(s) = step_variable {
6869                out.insert(s.clone());
6870            }
6871            if let Some(p) = path_variable {
6872                out.insert(p.clone());
6873            }
6874        }
6875        LogicalPlan::TraverseMainByType {
6876            input,
6877            source_variable,
6878            target_variable,
6879            step_variable,
6880            path_variable,
6881            ..
6882        } => {
6883            collect_plan_variables_into(input, out);
6884            out.insert(source_variable.clone());
6885            out.insert(target_variable.clone());
6886            if let Some(s) = step_variable {
6887                out.insert(s.clone());
6888            }
6889            if let Some(p) = path_variable {
6890                out.insert(p.clone());
6891            }
6892        }
6893        LogicalPlan::Union { left, right, .. } | LogicalPlan::CrossJoin { left, right } => {
6894            collect_plan_variables_into(left, out);
6895            collect_plan_variables_into(right, out);
6896        }
6897        LogicalPlan::Apply {
6898            input, subquery, ..
6899        } => {
6900            collect_plan_variables_into(input, out);
6901            collect_plan_variables_into(subquery, out);
6902        }
6903        LogicalPlan::Filter { input, .. }
6904        | LogicalPlan::Project { input, .. }
6905        | LogicalPlan::Sort { input, .. }
6906        | LogicalPlan::Limit { input, .. }
6907        | LogicalPlan::Aggregate { input, .. }
6908        | LogicalPlan::Distinct { input }
6909        | LogicalPlan::Window { input, .. }
6910        | LogicalPlan::Create { input, .. }
6911        | LogicalPlan::CreateBatch { input, .. }
6912        | LogicalPlan::Merge { input, .. }
6913        | LogicalPlan::Set { input, .. }
6914        | LogicalPlan::Remove { input, .. }
6915        | LogicalPlan::Delete { input, .. }
6916        | LogicalPlan::Foreach { input, .. }
6917        | LogicalPlan::SubqueryCall { input, .. } => {
6918            collect_plan_variables_into(input, out);
6919        }
6920        // Leaf or unsupported: no variables collected.
6921        _ => {}
6922    }
6923}
6924
6925/// Recursively collect variable names referenced in an expression.
6926fn collect_expr_variables_set(expr: &Expr) -> HashSet<String> {
6927    let mut out = HashSet::new();
6928    collect_expr_variables_into(expr, &mut out);
6929    out
6930}
6931
6932fn collect_expr_variables_into(expr: &Expr, out: &mut HashSet<String>) {
6933    use uni_cypher::ast::Expr as E;
6934    match expr {
6935        E::Variable(v) => {
6936            out.insert(v.clone());
6937        }
6938        E::Property(base, _) => collect_expr_variables_into(base, out),
6939        E::BinaryOp { left, right, .. } => {
6940            collect_expr_variables_into(left, out);
6941            collect_expr_variables_into(right, out);
6942        }
6943        E::UnaryOp { expr, .. } | E::IsNull(expr) | E::IsNotNull(expr) | E::IsUnique(expr) => {
6944            collect_expr_variables_into(expr, out)
6945        }
6946        E::FunctionCall { args, .. } => {
6947            for a in args {
6948                collect_expr_variables_into(a, out);
6949            }
6950        }
6951        E::List(items) => {
6952            for it in items {
6953                collect_expr_variables_into(it, out);
6954            }
6955        }
6956        E::In { expr, list } => {
6957            collect_expr_variables_into(expr, out);
6958            collect_expr_variables_into(list, out);
6959        }
6960        E::Case {
6961            expr,
6962            when_then,
6963            else_expr,
6964        } => {
6965            if let Some(e) = expr {
6966                collect_expr_variables_into(e, out);
6967            }
6968            for (w, t) in when_then {
6969                collect_expr_variables_into(w, out);
6970                collect_expr_variables_into(t, out);
6971            }
6972            if let Some(e) = else_expr {
6973                collect_expr_variables_into(e, out);
6974            }
6975        }
6976        E::Map(entries) => {
6977            for (_, v) in entries {
6978                collect_expr_variables_into(v, out);
6979            }
6980        }
6981        E::LabelCheck { expr, .. } => collect_expr_variables_into(expr, out),
6982        E::ArrayIndex { array, index } => {
6983            collect_expr_variables_into(array, out);
6984            collect_expr_variables_into(index, out);
6985        }
6986        E::ArraySlice { array, start, end } => {
6987            collect_expr_variables_into(array, out);
6988            if let Some(s) = start {
6989                collect_expr_variables_into(s, out);
6990            }
6991            if let Some(e) = end {
6992                collect_expr_variables_into(e, out);
6993            }
6994        }
6995        // Skip Quantifier/Reduce/ListComprehension/PatternComprehension —
6996        // they introduce local bindings not in outer scope.
6997        _ => {}
6998    }
6999}
7000
7001/// Split a predicate at top-level AND-conjuncts.
7002fn split_and_conjuncts(predicate: &Expr) -> Vec<Expr> {
7003    use uni_cypher::ast::BinaryOp;
7004    let mut out = Vec::new();
7005    fn walk(e: &Expr, out: &mut Vec<Expr>) {
7006        if let Expr::BinaryOp {
7007            left,
7008            op: BinaryOp::And,
7009            right,
7010        } = e
7011        {
7012            walk(left, out);
7013            walk(right, out);
7014        } else {
7015            out.push(e.clone());
7016        }
7017    }
7018    walk(predicate, &mut out);
7019    out
7020}
7021
7022/// AND-combine multiple expressions into one (or None for empty input).
7023fn and_combine(exprs: Vec<Expr>) -> Option<Expr> {
7024    use uni_cypher::ast::BinaryOp;
7025    let mut iter = exprs.into_iter();
7026    let first = iter.next()?;
7027    Some(iter.fold(first, |acc, e| Expr::BinaryOp {
7028        left: Box::new(acc),
7029        op: BinaryOp::And,
7030        right: Box::new(e),
7031    }))
7032}
7033
7034/// Classify each AND-conjunct of `predicate` according to which side(s) of a
7035/// CrossJoin its variables come from.
7036fn classify_join_predicate(
7037    predicate: &Expr,
7038    left_vars: &HashSet<String>,
7039    right_vars: &HashSet<String>,
7040) -> JoinPredicateClassification {
7041    use uni_cypher::ast::BinaryOp;
7042
7043    let mut equi_pairs = Vec::new();
7044    let mut left_only = Vec::new();
7045    let mut right_only = Vec::new();
7046    let mut residual_parts: Vec<Expr> = Vec::new();
7047
7048    for conjunct in split_and_conjuncts(predicate) {
7049        // Try equi-join: BinaryOp::Eq with one side referencing only left vars
7050        // and the other only right vars.
7051        if let Expr::BinaryOp {
7052            left,
7053            op: BinaryOp::Eq,
7054            right,
7055        } = &conjunct
7056        {
7057            let lv = collect_expr_variables_set(left);
7058            let rv = collect_expr_variables_set(right);
7059            let l_in_left = !lv.is_empty() && lv.is_subset(left_vars);
7060            let r_in_right = !rv.is_empty() && rv.is_subset(right_vars);
7061            let l_in_right = !lv.is_empty() && lv.is_subset(right_vars);
7062            let r_in_left = !rv.is_empty() && rv.is_subset(left_vars);
7063            if l_in_left && r_in_right {
7064                equi_pairs.push(((**left).clone(), (**right).clone()));
7065                continue;
7066            }
7067            if l_in_right && r_in_left {
7068                equi_pairs.push(((**right).clone(), (**left).clone()));
7069                continue;
7070            }
7071        }
7072
7073        // Not an equi-join — classify by which sides its variables belong to.
7074        let vars = collect_expr_variables_set(&conjunct);
7075        let touches_left = vars.iter().any(|v| left_vars.contains(v));
7076        let touches_right = vars.iter().any(|v| right_vars.contains(v));
7077        match (touches_left, touches_right) {
7078            (true, false) => left_only.push(conjunct),
7079            (false, true) => right_only.push(conjunct),
7080            // Both sides (mixed-non-equi) or neither (constant) → residual.
7081            _ => residual_parts.push(conjunct),
7082        }
7083    }
7084
7085    JoinPredicateClassification {
7086        equi_pairs,
7087        left_only,
7088        right_only,
7089        residual: and_combine(residual_parts),
7090    }
7091}
7092
/// Maximum static UNWIND list size for IN-list scan pushdown. Beyond this,
/// the cost of injecting a giant `IN` filter outweighs the savings vs. the
/// HashJoin alone, so we skip the pushdown.
///
/// The bound is inclusive: the materializers reject a source only when its
/// length is strictly greater than this value.
const MAX_UNWIND_IN_PUSHDOWN_VALUES: usize = 10_000;
7097
7098/// Convert a `uni_common::Value` primitive into a `CypherLiteral` for use in
7099/// AST `Expr::List` items. Returns `None` for non-primitive Values (lists,
7100/// maps, nodes, etc.) — those don't make sense as `IN` list elements anyway.
7101/// One-shot `tracing::warn!` when a literal-list UNWIND that *looks* like
7102/// it should be pushable to a scan-side IN-list filter fails one of the
7103/// content gates (missing field, non-literal value at field, oversized
7104/// list). Surfaces the gap so diagnostic users and CI catch "I wrote an
7105/// inlined UNWIND for a test and got silent full-scan" patterns; in
7106/// production these would have pushed if rewritten as `UNWIND $param AS u`.
7107///
7108/// Deduped via a single `AtomicBool` to avoid log spam on long-running
7109/// processes; one warning per process across all reasons.
7110fn warn_unpushable_unwind_once(reason: &'static str) {
7111    use std::sync::atomic::{AtomicBool, Ordering};
7112    static WARNED: AtomicBool = AtomicBool::new(false);
7113    if WARNED.swap(true, Ordering::Relaxed) {
7114        return;
7115    }
7116    tracing::warn!(
7117        target: "uni_query::cross_join_in_pushdown",
7118        reason,
7119        "Inlined UNWIND of map literals failed pushdown — falling back \
7120         to FilterExec over a full scan. Rewrite as `UNWIND $param AS u` \
7121         with the param bound as a List<Map<...>> to guarantee pushdown."
7122    );
7123}
7124
7125fn value_to_cypher_literal(v: &uni_common::Value) -> Option<CypherLiteral> {
7126    use uni_common::Value;
7127    match v {
7128        Value::Null => Some(CypherLiteral::Null),
7129        Value::Bool(b) => Some(CypherLiteral::Bool(*b)),
7130        Value::Int(n) => Some(CypherLiteral::Integer(*n)),
7131        Value::Float(f) => Some(CypherLiteral::Float(*f)),
7132        Value::String(s) => Some(CypherLiteral::String(s.clone())),
7133        _ => None,
7134    }
7135}
7136
7137/// Walk a logical-plan subtree looking for `LogicalPlan::Unwind { variable, expr, .. }`
7138/// where `variable == target_var`, and return the bound list of values **if**
7139/// the UNWIND source is statically resolvable at plan time:
7140///
7141/// - `Expr::List(items)` where every item is an `Expr::Literal(_)` → use them directly.
7142/// - `Expr::Parameter(name)` where `params[name]` is `Value::List(...)` → convert
7143///   each primitive element into an `Expr::Literal`.
7144///
7145/// Returns `None` for any other source (sub-MATCH, correlated, runtime-only),
7146/// or when the list contains non-primitive values, or exceeds
7147/// `MAX_UNWIND_IN_PUSHDOWN_VALUES`.
7148/// Walk a chain of UNWIND/Filter/Project/CrossJoin nodes looking for the
7149/// `Unwind` binding `target_var`. When found, `extract` is invoked on that
7150/// UNWIND's source expression; the first `Some` result wins.
7151///
7152/// Both `extract_static_unwind_values` and `extract_static_unwind_field_values`
7153/// share this traversal — they differ only in what `extract` returns.
7154/// Touching the set of recognized plan nodes (e.g. adding `Distinct`) only
7155/// needs to happen here.
7156fn walk_static_unwind_chain<F, T>(
7157    plan: &LogicalPlan,
7158    target_var: &str,
7159    extract: &mut F,
7160) -> Option<T>
7161where
7162    F: FnMut(&Expr) -> Option<T>,
7163{
7164    match plan {
7165        LogicalPlan::Unwind {
7166            input,
7167            expr,
7168            variable,
7169        } if variable == target_var => {
7170            extract(expr).or_else(|| walk_static_unwind_chain(input, target_var, extract))
7171        }
7172        // Single-input plan nodes: recurse into the input.
7173        LogicalPlan::Filter { input, .. }
7174        | LogicalPlan::Project { input, .. }
7175        | LogicalPlan::Unwind { input, .. } => walk_static_unwind_chain(input, target_var, extract),
7176        // CrossJoin: search both subtrees. The UNWIND of `target_var` lives in
7177        // exactly one side; the other returns None.
7178        LogicalPlan::CrossJoin { left, right } => {
7179            walk_static_unwind_chain(left, target_var, extract)
7180                .or_else(|| walk_static_unwind_chain(right, target_var, extract))
7181        }
7182        _ => None,
7183    }
7184}
7185
7186fn extract_static_unwind_values(
7187    plan: &LogicalPlan,
7188    target_var: &str,
7189    params: &HashMap<String, uni_common::Value>,
7190) -> Option<Vec<Expr>> {
7191    walk_static_unwind_chain(plan, target_var, &mut |expr| {
7192        materialize_unwind_source(expr, params)
7193    })
7194}
7195
7196/// Variant of [`extract_static_unwind_values`] that projects a named `field`
7197/// out of each map element in the UNWIND source. See issue #55 (PR #4).
7198fn extract_static_unwind_field_values(
7199    plan: &LogicalPlan,
7200    target_var: &str,
7201    field: &str,
7202    params: &HashMap<String, uni_common::Value>,
7203) -> Option<Vec<Expr>> {
7204    walk_static_unwind_chain(plan, target_var, &mut |expr| {
7205        materialize_unwind_source_field(expr, params, field)
7206    })
7207}
7208
7209/// Materialize a UNWIND source `Expr` into a vec of literal `Expr`s if possible.
7210fn materialize_unwind_source(
7211    expr: &Expr,
7212    params: &HashMap<String, uni_common::Value>,
7213) -> Option<Vec<Expr>> {
7214    match expr {
7215        Expr::List(items) => {
7216            if items.len() > MAX_UNWIND_IN_PUSHDOWN_VALUES {
7217                return None;
7218            }
7219            let mut out = Vec::with_capacity(items.len());
7220            for item in items {
7221                match item {
7222                    Expr::Literal(_) => out.push(item.clone()),
7223                    _ => return None,
7224                }
7225            }
7226            Some(out)
7227        }
7228        Expr::Parameter(name) => match params.get(name)? {
7229            uni_common::Value::List(values) => {
7230                if values.len() > MAX_UNWIND_IN_PUSHDOWN_VALUES {
7231                    return None;
7232                }
7233                let mut out = Vec::with_capacity(values.len());
7234                for v in values {
7235                    out.push(Expr::Literal(value_to_cypher_literal(v)?));
7236                }
7237                Some(out)
7238            }
7239            _ => None,
7240        },
7241        _ => None,
7242    }
7243}
7244
7245/// Materialize a UNWIND source `Expr` into a vec of literal `Expr`s, projecting
7246/// `field` out of each map element. Handles the common case where the UNWIND
7247/// source is a list of maps and we want to push down on a specific field —
7248/// e.g. `UNWIND $edges AS e ... WHERE id(a) = e.src` with `$edges` bound to
7249/// `List<Map<src, dst>>` returns the list of `src` values as literals.
7250///
7251/// Returns `None` if the source isn't a statically-resolvable list of maps
7252/// or any element lacks `field` or has a non-primitive value at `field`.
7253/// See issue #55 (PR #4).
7254fn materialize_unwind_source_field(
7255    expr: &Expr,
7256    params: &HashMap<String, uni_common::Value>,
7257    field: &str,
7258) -> Option<Vec<Expr>> {
7259    match expr {
7260        Expr::List(items) => {
7261            if items.len() > MAX_UNWIND_IN_PUSHDOWN_VALUES {
7262                warn_unpushable_unwind_once("UNWIND list exceeds MAX_UNWIND_IN_PUSHDOWN_VALUES");
7263                return None;
7264            }
7265            // Inlined map literals at plan time: each item must be an
7266            // `Expr::Map(entries)` whose entry at `field` is itself an
7267            // `Expr::Literal(_)`. Extract the literals directly — we
7268            // already have them as Expr, no Value↔Literal conversion
7269            // needed (unlike the Parameter branch below).
7270            //
7271            // Non-map items return None silently (they're a type
7272            // mismatch the planner will flag elsewhere). Maps with a
7273            // missing or non-literal value at `field` emit a one-shot
7274            // warn — those shapes would have pushed if rewritten as
7275            // `UNWIND $param AS u` (where parameter resolution makes
7276            // every value a primitive Value).
7277            let mut out = Vec::with_capacity(items.len());
7278            for item in items {
7279                let entries = match item {
7280                    Expr::Map(entries) => entries,
7281                    _ => return None,
7282                };
7283                let Some((_, value_expr)) = entries.iter().find(|(k, _)| k == field) else {
7284                    warn_unpushable_unwind_once(
7285                        "UNWIND map literal is missing the field referenced by the join predicate",
7286                    );
7287                    return None;
7288                };
7289                let Expr::Literal(_) = value_expr else {
7290                    warn_unpushable_unwind_once(
7291                        "UNWIND map literal has a non-literal value at the joined field \
7292                         (e.g., a parameter or function call) — substitute with a literal \
7293                         or rewrite as `UNWIND $param AS u` with the param bound at runtime",
7294                    );
7295                    return None;
7296                };
7297                out.push(value_expr.clone());
7298            }
7299            Some(out)
7300        }
7301        Expr::Parameter(name) => match params.get(name)? {
7302            uni_common::Value::List(values) => {
7303                if values.len() > MAX_UNWIND_IN_PUSHDOWN_VALUES {
7304                    return None;
7305                }
7306                let mut out = Vec::with_capacity(values.len());
7307                for v in values {
7308                    let map = match v {
7309                        uni_common::Value::Map(m) => m,
7310                        _ => return None,
7311                    };
7312                    let inner = map.get(field)?;
7313                    out.push(Expr::Literal(value_to_cypher_literal(inner)?));
7314                }
7315                Some(out)
7316            }
7317            _ => None,
7318        },
7319        _ => None,
7320    }
7321}
7322
7323/// If `unwind_side_expr` is bound to a variable produced by a static UNWIND
7324/// in `unwind_subplan`, and `scan_side_expr` is a property of a scan variable,
7325/// build an `Expr::In { expr: scan_side_expr, list: [literals...] }` to inject
7326/// as a scan-side filter. Returns `None` if any condition fails.
7327///
7328/// Accepts two forms on the unwind side:
7329/// - `Variable(v)` — direct list element (e.g. `UNWIND $names AS n ... = n`).
7330/// - `Property(Variable(v), _)` — list of maps (e.g. `UNWIND $rows AS r ... = r.k`).
7331///   Property form requires the parameter list to be a list of `Value::Map`s,
7332///   so we conservatively skip it here (the materializer rejects non-primitive
7333///   values anyway).
7334fn build_in_pushdown(
7335    unwind_side_expr: &Expr,
7336    scan_side_expr: &Expr,
7337    unwind_subplan: &LogicalPlan,
7338    params: &HashMap<String, uni_common::Value>,
7339) -> Option<Expr> {
7340    // Identify the UNWIND variable (and optional field) on the unwind side.
7341    let (unwind_var, field) = match unwind_side_expr {
7342        Expr::Variable(v) => (v.as_str(), None),
7343        Expr::Property(box_var, f) => match box_var.as_ref() {
7344            Expr::Variable(v) => (v.as_str(), Some(f.as_str())),
7345            _ => {
7346                tracing::debug!(
7347                    target: "uni_query::cross_join_in_pushdown",
7348                    reason = "unwind side Property inner is not Variable",
7349                    "build_in_pushdown rejected"
7350                );
7351                return None;
7352            }
7353        },
7354        _ => {
7355            tracing::debug!(
7356                target: "uni_query::cross_join_in_pushdown",
7357                reason = "unwind side is not Variable or Property",
7358                unwind_kind = std::any::type_name_of_val(&unwind_side_expr),
7359                "build_in_pushdown rejected"
7360            );
7361            return None;
7362        }
7363    };
7364
7365    // Scan side must be `Property(Variable(_), _)` so that `is_pushable`
7366    // (which accepts `Property(Variable(scan_var), prop)` on the LHS of an IN)
7367    // will push the filter into the scan.
7368    let Expr::Property(scan_box_var, _scan_field) = scan_side_expr else {
7369        tracing::debug!(
7370            target: "uni_query::cross_join_in_pushdown",
7371            reason = "scan side is not Property",
7372            "build_in_pushdown rejected"
7373        );
7374        return None;
7375    };
7376    if !matches!(scan_box_var.as_ref(), Expr::Variable(_)) {
7377        tracing::debug!(
7378            target: "uni_query::cross_join_in_pushdown",
7379            reason = "scan side Property inner is not Variable",
7380            "build_in_pushdown rejected"
7381        );
7382        return None;
7383    }
7384
7385    // Resolve the IN-list values from the UNWIND source. The two cases are:
7386    //   * `UNWIND $list AS e ... = e`           → primitive list at $list
7387    //   * `UNWIND $list AS e ... = e.field`     → list of maps at $list,
7388    //                                              project `field` per element
7389    let values = match field {
7390        None => match extract_static_unwind_values(unwind_subplan, unwind_var, params) {
7391            Some(v) => v,
7392            None => {
7393                tracing::debug!(
7394                    target: "uni_query::cross_join_in_pushdown",
7395                    reason = "extract_static_unwind_values returned None",
7396                    unwind_var,
7397                    "build_in_pushdown rejected"
7398                );
7399                return None;
7400            }
7401        },
7402        Some(f) => {
7403            match extract_static_unwind_field_values(unwind_subplan, unwind_var, f, params) {
7404                Some(v) => v,
7405                None => {
7406                    tracing::debug!(
7407                        target: "uni_query::cross_join_in_pushdown",
7408                        reason = "extract_static_unwind_field_values returned None \
7409                                  (UNWIND source is not Expr::Parameter, or param is not \
7410                                  Value::List<Value::Map>, or a map element lacks field, \
7411                                  or list size exceeded MAX_UNWIND_IN_PUSHDOWN_VALUES)",
7412                        unwind_var,
7413                        field = f,
7414                        "build_in_pushdown rejected"
7415                    );
7416                    return None;
7417                }
7418            }
7419        }
7420    };
7421    if values.is_empty() {
7422        tracing::debug!(
7423            target: "uni_query::cross_join_in_pushdown",
7424            reason = "extracted value list is empty",
7425            unwind_var,
7426            ?field,
7427            "build_in_pushdown rejected"
7428        );
7429        return None;
7430    }
7431
7432    tracing::debug!(
7433        target: "uni_query::cross_join_in_pushdown",
7434        unwind_var,
7435        ?field,
7436        values_count = values.len(),
7437        "build_in_pushdown extracted IN-list"
7438    );
7439    Some(Expr::In {
7440        expr: Box::new(scan_side_expr.clone()),
7441        list: Box::new(Expr::List(values)),
7442    })
7443}
7444
7445/// Wrap `plan` with a `LogicalPlan::Filter` AND-combining `filters` if any.
7446/// Returns true if `expr` is `Property(Variable(_), "_vid")`. Used by
7447/// [`try_emit_vid_lookup_join`] (issue #55 PR #5) to identify the probe side
7448/// of an inner-equi-join. `id(x)` is lowered to this shape during AST→
7449/// logical-plan translation, so we don't need a separate `FunctionCall`
7450/// arm here.
7451fn expr_is_vid_property(expr: &Expr) -> bool {
7452    matches!(
7453        expr,
7454        Expr::Property(inner, prop)
7455            if prop == "_vid" && matches!(inner.as_ref(), Expr::Variable(_))
7456    )
7457}
7458
7459fn wrap_with_filter(plan: LogicalPlan, filters: &[Expr]) -> LogicalPlan {
7460    if filters.is_empty() {
7461        return plan;
7462    }
7463    let predicate = and_combine(filters.to_vec()).expect("non-empty filters");
7464    // Critical for issue #55: when `plan` is a Scan node, we MUST merge the
7465    // predicate into the Scan's own `filter` field. Wrapping the Scan in a
7466    // Filter LogicalPlan node would route through `plan_filter`, which builds
7467    // a FilterExec on top of GraphScanExec — that runs Lance's full-table
7468    // scan first and only filters in DataFusion afterwards, defeating the
7469    // pushdown. Merging into Scan.filter lets `plan_scan` /
7470    // `plan_schemaless_scan` extract the IN-list and push it to Lance.
7471    match plan {
7472        LogicalPlan::Scan {
7473            label_id,
7474            labels,
7475            variable,
7476            filter: existing,
7477            optional,
7478        } => LogicalPlan::Scan {
7479            label_id,
7480            labels,
7481            variable,
7482            filter: merge_filter(existing, predicate),
7483            optional,
7484        },
7485        LogicalPlan::ScanMainByLabels {
7486            labels,
7487            variable,
7488            filter: existing,
7489            optional,
7490        } => LogicalPlan::ScanMainByLabels {
7491            labels,
7492            variable,
7493            filter: merge_filter(existing, predicate),
7494            optional,
7495        },
7496        LogicalPlan::ScanAll {
7497            variable,
7498            filter: existing,
7499            optional,
7500        } => LogicalPlan::ScanAll {
7501            variable,
7502            filter: merge_filter(existing, predicate),
7503            optional,
7504        },
7505        // For any other shape (CrossJoin, nested Filter, etc.) keep the
7506        // historical wrap-in-Filter behavior. plan_internal will recurse and
7507        // any inner Scan-wrapped subtree will benefit from the merge above.
7508        other => LogicalPlan::Filter {
7509            input: Box::new(other),
7510            predicate,
7511            optional_variables: HashSet::new(),
7512        },
7513    }
7514}
7515
7516/// AND-merge an optional existing filter with a new predicate.
7517///
7518/// Idempotent: if `existing == predicate`, the existing filter is
7519/// returned unchanged (no `Expr::BinaryOp(And, X, X)` duplication).
7520/// This makes the `merge_unwind_in_filters` rewrite pass safely
7521/// re-runnable and keeps Scan filters minimal across the planner's
7522/// recursive descent.
7523fn merge_filter(existing: Option<Expr>, predicate: Expr) -> Option<Expr> {
7524    match existing {
7525        Some(prev) if prev == predicate => Some(prev),
7526        Some(prev) => and_combine(vec![prev, predicate]),
7527        None => Some(predicate),
7528    }
7529}
7530
7531/// Pre-physical-plan rewrite: walk a [`LogicalPlan`] tree and, at every
7532/// `Filter(CrossJoin(L, R), pred)` shape, lift IN-list filters extracted
7533/// from UNWIND-correlated equi-pairs into the appropriate `Scan.filter`
7534/// field of L or R.
7535///
7536/// **Why this lives outside `try_plan_cross_join_as_hash_join`**:
7537///
7538/// Historically the merge happened inside `try_plan_cross_join_as_hash_join`
7539/// before the HashJoin attempt. When join-key type unification failed (e.g.
7540/// `Utf8 ↔ LargeBinary CV` — see `unify_join_key_types` line ~6995), the
7541/// function returned `Ok(None)` and the caller (`plan_filter`) re-planned
7542/// the **original** CrossJoin from scratch, discarding the merged-filter
7543/// subtrees. The Hash-index pushdown silently vanished.
7544///
7545/// Separating the rewrite as an independent logical-plan pass that runs
7546/// **before** any physical-plan optimization closes that class of bugs at
7547/// the source: regardless of whether `HashJoinExec`, `VidLookupJoinExec`,
7548/// or a future optimization succeeds or bails, the scan-side filters are
7549/// already in the LogicalPlan and propagate to the eventual physical
7550/// plan via the normal `plan_scan` → `build_indexed_property_pushdown`
7551/// path.
7552///
7553/// **What this pass does NOT do**:
7554///
7555///  - It does not push `left_only` / `right_only` predicate conjuncts
7556///    into the subtrees. Those are predicate-decomposition concerns
7557///    handled by `classify_join_predicate` + the residual logic inside
7558///    `try_plan_cross_join_as_hash_join`. Decomposition is part of
7559///    HashJoin emission and conceptually belongs with it.
7560///  - It does not touch non-CrossJoin nodes. Filters on other inputs
7561///    (Scan, Traverse, Apply, etc.) already merge correctly via
7562///    `wrap_with_filter` when needed.
7563///
7564/// **Idempotence**: running the pass twice produces the same result.
7565/// The IN-list filters merged on the first pass are not equi-join
7566/// predicates against the (now-already-filtered) subtree's UNWIND, so
7567/// the second pass extracts nothing new.
7568fn merge_unwind_in_filters(
7569    plan: &LogicalPlan,
7570    params: &HashMap<String, uni_common::Value>,
7571) -> LogicalPlan {
7572    match plan {
7573        // Target shape: Filter wrapping a CrossJoin — try IN-list pushdown.
7574        LogicalPlan::Filter {
7575            input,
7576            predicate,
7577            optional_variables,
7578        } if matches!(input.as_ref(), LogicalPlan::CrossJoin { .. }) => {
7579            // Safe: matches! above guarantees this destructure succeeds.
7580            let LogicalPlan::CrossJoin { left, right } = input.as_ref() else {
7581                unreachable!("matches! above guarantees CrossJoin")
7582            };
7583
7584            // Recurse into the subtrees first to catch nested CrossJoins.
7585            let left_rewritten = merge_unwind_in_filters(left, params);
7586            let right_rewritten = merge_unwind_in_filters(right, params);
7587
7588            let left_vars = collect_plan_variables(&left_rewritten);
7589            let right_vars = collect_plan_variables(&right_rewritten);
7590            let cls = classify_join_predicate(predicate, &left_vars, &right_vars);
7591
7592            let rebuild_unmodified = |l: LogicalPlan, r: LogicalPlan| LogicalPlan::Filter {
7593                input: Box::new(LogicalPlan::CrossJoin {
7594                    left: Box::new(l),
7595                    right: Box::new(r),
7596                }),
7597                predicate: predicate.clone(),
7598                optional_variables: optional_variables.clone(),
7599            };
7600
7601            if cls.equi_pairs.is_empty() {
7602                return rebuild_unmodified(left_rewritten, right_rewritten);
7603            }
7604
7605            // Build IN-list filters for each equi-pair × subtree orientation.
7606            // See `build_in_pushdown` for the gating; `materialize_unwind_source_*`
7607            // returns None for shapes we can't statically resolve.
7608            let mut left_extra_in: Vec<Expr> = Vec::new();
7609            let mut right_extra_in: Vec<Expr> = Vec::new();
7610            for (l_expr, r_expr) in &cls.equi_pairs {
7611                if let Some(in_filter) = build_in_pushdown(l_expr, r_expr, &left_rewritten, params)
7612                {
7613                    right_extra_in.push(in_filter);
7614                    continue;
7615                }
7616                if let Some(in_filter) = build_in_pushdown(r_expr, l_expr, &left_rewritten, params)
7617                {
7618                    right_extra_in.push(in_filter);
7619                    continue;
7620                }
7621                if let Some(in_filter) = build_in_pushdown(l_expr, r_expr, &right_rewritten, params)
7622                {
7623                    left_extra_in.push(in_filter);
7624                    continue;
7625                }
7626                if let Some(in_filter) = build_in_pushdown(r_expr, l_expr, &right_rewritten, params)
7627                {
7628                    left_extra_in.push(in_filter);
7629                }
7630            }
7631
7632            tracing::debug!(
7633                target: "uni_query::cross_join_in_pushdown",
7634                left_in_filters = left_extra_in.len(),
7635                right_in_filters = right_extra_in.len(),
7636                "merge_unwind_in_filters: IN-pushdown result"
7637            );
7638
7639            if left_extra_in.is_empty() && right_extra_in.is_empty() {
7640                return rebuild_unmodified(left_rewritten, right_rewritten);
7641            }
7642
7643            let left_merged = wrap_with_filter(left_rewritten, &left_extra_in);
7644            let right_merged = wrap_with_filter(right_rewritten, &right_extra_in);
7645            rebuild_unmodified(left_merged, right_merged)
7646        }
7647        // Pass through Filter wrapping non-CrossJoin.
7648        LogicalPlan::Filter {
7649            input,
7650            predicate,
7651            optional_variables,
7652        } => LogicalPlan::Filter {
7653            input: Box::new(merge_unwind_in_filters(input, params)),
7654            predicate: predicate.clone(),
7655            optional_variables: optional_variables.clone(),
7656        },
7657        // Single-input wrappers: recurse on `input`.
7658        LogicalPlan::Project { input, projections } => LogicalPlan::Project {
7659            input: Box::new(merge_unwind_in_filters(input, params)),
7660            projections: projections.clone(),
7661        },
7662        LogicalPlan::Sort { input, order_by } => LogicalPlan::Sort {
7663            input: Box::new(merge_unwind_in_filters(input, params)),
7664            order_by: order_by.clone(),
7665        },
7666        LogicalPlan::Limit { input, skip, fetch } => LogicalPlan::Limit {
7667            input: Box::new(merge_unwind_in_filters(input, params)),
7668            skip: *skip,
7669            fetch: *fetch,
7670        },
7671        LogicalPlan::Distinct { input } => LogicalPlan::Distinct {
7672            input: Box::new(merge_unwind_in_filters(input, params)),
7673        },
7674        LogicalPlan::Unwind {
7675            input,
7676            expr,
7677            variable,
7678        } => LogicalPlan::Unwind {
7679            input: Box::new(merge_unwind_in_filters(input, params)),
7680            expr: expr.clone(),
7681            variable: variable.clone(),
7682        },
7683        // Mutation nodes wrap a MATCH-side input — recurse so that
7684        // `UNWIND $list AS u MATCH (n:Label) WHERE n.k = u.k SET ...` /
7685        // REMOVE / DELETE / CREATE-with-MATCH / MERGE all benefit from
7686        // the rewrite. The mutation operation itself isn't touched.
7687        LogicalPlan::Set { input, items } => LogicalPlan::Set {
7688            input: Box::new(merge_unwind_in_filters(input, params)),
7689            items: items.clone(),
7690        },
7691        LogicalPlan::Remove { input, items } => LogicalPlan::Remove {
7692            input: Box::new(merge_unwind_in_filters(input, params)),
7693            items: items.clone(),
7694        },
7695        LogicalPlan::Delete {
7696            input,
7697            items,
7698            detach,
7699        } => LogicalPlan::Delete {
7700            input: Box::new(merge_unwind_in_filters(input, params)),
7701            items: items.clone(),
7702            detach: *detach,
7703        },
7704        LogicalPlan::Create { input, pattern } => LogicalPlan::Create {
7705            input: Box::new(merge_unwind_in_filters(input, params)),
7706            pattern: pattern.clone(),
7707        },
7708        LogicalPlan::CreateBatch { input, patterns } => LogicalPlan::CreateBatch {
7709            input: Box::new(merge_unwind_in_filters(input, params)),
7710            patterns: patterns.clone(),
7711        },
7712        LogicalPlan::Merge {
7713            input,
7714            pattern,
7715            on_match,
7716            on_create,
7717        } => LogicalPlan::Merge {
7718            input: Box::new(merge_unwind_in_filters(input, params)),
7719            pattern: pattern.clone(),
7720            on_match: on_match.clone(),
7721            on_create: on_create.clone(),
7722        },
7723        LogicalPlan::Foreach {
7724            input,
7725            variable,
7726            list,
7727            body,
7728        } => LogicalPlan::Foreach {
7729            input: Box::new(merge_unwind_in_filters(input, params)),
7730            variable: variable.clone(),
7731            list: list.clone(),
7732            body: body
7733                .iter()
7734                .map(|b| merge_unwind_in_filters(b, params))
7735                .collect(),
7736        },
7737        // Aggregation and windowing nodes wrap an input — recurse.
7738        LogicalPlan::Aggregate {
7739            input,
7740            group_by,
7741            aggregates,
7742        } => LogicalPlan::Aggregate {
7743            input: Box::new(merge_unwind_in_filters(input, params)),
7744            group_by: group_by.clone(),
7745            aggregates: aggregates.clone(),
7746        },
7747        LogicalPlan::Window {
7748            input,
7749            window_exprs,
7750        } => LogicalPlan::Window {
7751            input: Box::new(merge_unwind_in_filters(input, params)),
7752            window_exprs: window_exprs.clone(),
7753        },
7754        LogicalPlan::SubqueryCall { input, subquery } => LogicalPlan::SubqueryCall {
7755            input: Box::new(merge_unwind_in_filters(input, params)),
7756            subquery: Box::new(merge_unwind_in_filters(subquery, params)),
7757        },
7758        // Two-input nodes: recurse on both.
7759        LogicalPlan::CrossJoin { left, right } => LogicalPlan::CrossJoin {
7760            left: Box::new(merge_unwind_in_filters(left, params)),
7761            right: Box::new(merge_unwind_in_filters(right, params)),
7762        },
7763        LogicalPlan::Union { left, right, all } => LogicalPlan::Union {
7764            left: Box::new(merge_unwind_in_filters(left, params)),
7765            right: Box::new(merge_unwind_in_filters(right, params)),
7766            all: *all,
7767        },
7768        // Apply has input + correlated subquery; recurse on both.
7769        LogicalPlan::Apply {
7770            input,
7771            subquery,
7772            input_filter,
7773        } => LogicalPlan::Apply {
7774            input: Box::new(merge_unwind_in_filters(input, params)),
7775            subquery: Box::new(merge_unwind_in_filters(subquery, params)),
7776            input_filter: input_filter.clone(),
7777        },
7778        // Leaf / unsupported / nodes whose internals don't currently
7779        // benefit from this rewrite: pass through unchanged. Adding
7780        // recursion for other variants (Aggregate, Window, Traverse,
7781        // mutation nodes, etc.) is safe but unnecessary — the
7782        // CrossJoin shape only appears under inputs we already recurse
7783        // into above.
7784        _ => plan.clone(),
7785    }
7786}
7787
7788/// Returns `true` if `dt` is hashable directly by Arrow's HashJoinExec without
7789/// any value transformation. When both join keys share such a dtype, we can
7790/// skip the `tointeger` / `_cypher_sort_key` wrap entirely.
7791fn is_hashable_native_dtype(dt: &DataType) -> bool {
7792    matches!(
7793        dt,
7794        DataType::Boolean
7795            | DataType::Int8
7796            | DataType::Int16
7797            | DataType::Int32
7798            | DataType::Int64
7799            | DataType::UInt8
7800            | DataType::UInt16
7801            | DataType::UInt32
7802            | DataType::UInt64
7803            | DataType::Float32
7804            | DataType::Float64
7805            | DataType::Utf8
7806            | DataType::LargeUtf8
7807            | DataType::Binary
7808            | DataType::LargeBinary
7809            | DataType::Date32
7810            | DataType::Date64
7811    )
7812}
7813
7814/// Returns `true` if `dt` is one of the types `tointeger` UDF accepts as input
7815/// (numeric primitives plus CV-encoded `LargeBinary`).
7816fn tointeger_accepts_dtype(dt: &DataType) -> bool {
7817    matches!(
7818        dt,
7819        DataType::Int8
7820            | DataType::Int16
7821            | DataType::Int32
7822            | DataType::Int64
7823            | DataType::UInt8
7824            | DataType::UInt16
7825            | DataType::UInt32
7826            | DataType::UInt64
7827            | DataType::Float32
7828            | DataType::Float64
7829            | DataType::LargeBinary
7830    )
7831}
7832
7833/// Wrap `expr` with a 1-arg scalar UDF that returns `return_dt`.
7834fn wrap_with_unary_udf(
7835    expr: Arc<dyn datafusion::physical_plan::PhysicalExpr>,
7836    udf: Arc<datafusion::logical_expr::ScalarUDF>,
7837    return_dt: DataType,
7838) -> Arc<dyn datafusion::physical_plan::PhysicalExpr> {
7839    let config_options = Arc::new(datafusion::config::ConfigOptions::default());
7840    let udf_name = udf.name().to_string();
7841    let return_field = Arc::new(arrow_schema::Field::new(&udf_name, return_dt, true));
7842    Arc::new(datafusion::physical_expr::ScalarFunctionExpr::new(
7843        &udf_name,
7844        udf,
7845        vec![expr],
7846        return_field,
7847        config_options,
7848    ))
7849}
7850
7851/// Bilateral type unification for a HashJoin equi-pair.
7852///
7853/// Strategy (in order of preference):
7854/// 1. Same dtype + natively hashable → return both unchanged (fast path,
7855///    e.g. `Utf8 = Utf8`, `Int64 = Int64`).
7856/// 2. Both dtypes accepted by `tointeger` (numeric or CV-encoded
7857///    `LargeBinary`) → wrap both in `tointeger` to unify on `Int64`. This is
7858///    the original issue #53 behavior.
7859/// 3. Otherwise (mixed string/CV/other Cypher types) → wrap both in
7860///    `_cypher_sort_key`, which produces an order-preserving `LargeBinary`
7861///    encoding that hashes equal iff the underlying Cypher values are equal.
7862///
7863/// Returns `None` only when the required UDFs aren't registered or a side's
7864/// dtype can't be inferred — the caller falls back to FilterExec+CrossJoin.
7865fn unify_join_key_types(
7866    left: Arc<dyn datafusion::physical_plan::PhysicalExpr>,
7867    right: Arc<dyn datafusion::physical_plan::PhysicalExpr>,
7868    left_schema: &Schema,
7869    right_schema: &Schema,
7870    state: &SessionState,
7871) -> Option<(
7872    Arc<dyn datafusion::physical_plan::PhysicalExpr>,
7873    Arc<dyn datafusion::physical_plan::PhysicalExpr>,
7874)> {
7875    let l_dt = left.data_type(left_schema).ok()?;
7876    let r_dt = right.data_type(right_schema).ok()?;
7877
7878    if l_dt == r_dt && is_hashable_native_dtype(&l_dt) {
7879        return Some((left, right));
7880    }
7881
7882    if tointeger_accepts_dtype(&l_dt) && tointeger_accepts_dtype(&r_dt) {
7883        let udf = state.scalar_functions().get("tointeger")?.clone();
7884        return Some((
7885            wrap_with_unary_udf(left, udf.clone(), DataType::Int64),
7886            wrap_with_unary_udf(right, udf, DataType::Int64),
7887        ));
7888    }
7889
7890    // Cross-domain unification (e.g. Utf8 ↔ LargeBinary CV-encoded) is not yet
7891    // implemented at the HashJoin layer — fall through to FilterExec, which
7892    // handles these via Cypher-aware comparison UDFs.
7893    None
7894}
7895
7896#[cfg(test)]
7897mod tests {
7898    use super::*;
7899
7900    #[test]
7901    fn test_convert_direction() {
7902        assert!(matches!(
7903            convert_direction(AstDirection::Outgoing),
7904            Direction::Outgoing
7905        ));
7906        assert!(matches!(
7907            convert_direction(AstDirection::Incoming),
7908            Direction::Incoming
7909        ));
7910        assert!(matches!(
7911            convert_direction(AstDirection::Both),
7912            Direction::Both
7913        ));
7914    }
7915
7916    #[test]
7917    fn test_sanitize_vlp_target_properties_removes_wildcard() {
7918        let props = vec!["*".to_string(), "name".to_string()];
7919        let label_props = HashSet::from(["name".to_string()]);
7920        let sanitized = sanitize_vlp_target_properties(props, true, Some(&label_props));
7921
7922        assert_eq!(sanitized, vec!["name".to_string()]);
7923    }
7924
7925    #[test]
7926    fn test_sanitize_vlp_target_properties_adds_all_props_for_wildcard_empty() {
7927        let props = vec!["*".to_string()];
7928        let sanitized = sanitize_vlp_target_properties(props, true, None);
7929
7930        assert_eq!(sanitized, vec!["_all_props".to_string()]);
7931    }
7932
7933    #[test]
7934    fn test_sanitize_vlp_target_properties_adds_all_props_for_non_schema() {
7935        let props = vec!["custom_prop".to_string()];
7936        let label_props = HashSet::from(["name".to_string()]);
7937        let sanitized = sanitize_vlp_target_properties(props, false, Some(&label_props));
7938
7939        assert_eq!(
7940            sanitized,
7941            vec!["custom_prop".to_string(), "_all_props".to_string()]
7942        );
7943    }
7944
7945    // -----------------------------------------------------------------
7946    // UNWIND IN-list pushdown — `materialize_unwind_source_field`
7947    //
7948    // Background: an inlined `UNWIND [{nid: 64}, {nid: 65}] AS u
7949    // MATCH (n:Entity) WHERE id(n) = u.nid` should be pushable to a
7950    // `_vid IN (64, 65)` scan filter — identical observable result to
7951    // the param-bound form `UNWIND $updates AS u`. The Parameter branch
7952    // (df_planner.rs:6515-6532) handles parameter-bound lists of maps;
7953    // the literal-list branch must handle the equivalent inlined form.
7954    // -----------------------------------------------------------------
7955
7956    use uni_cypher::ast::CypherLiteral;
7957
7958    fn int_lit(n: i64) -> Expr {
7959        Expr::Literal(CypherLiteral::Integer(n))
7960    }
7961
7962    fn str_lit(s: &str) -> Expr {
7963        Expr::Literal(CypherLiteral::String(s.to_string()))
7964    }
7965
7966    fn map_entry(k: &str, v: Expr) -> (String, Expr) {
7967        (k.to_string(), v)
7968    }
7969
7970    #[test]
7971    fn materialize_unwind_field_accepts_inlined_map_literals() {
7972        // `UNWIND [{nid: 64, x: 1}, {nid: 65, x: 2}] AS u ... = u.nid`
7973        let unwind_expr = Expr::List(vec![
7974            Expr::Map(vec![
7975                map_entry("nid", int_lit(64)),
7976                map_entry("x", int_lit(1)),
7977            ]),
7978            Expr::Map(vec![
7979                map_entry("nid", int_lit(65)),
7980                map_entry("x", int_lit(2)),
7981            ]),
7982        ]);
7983        let params = HashMap::new();
7984        let result = materialize_unwind_source_field(&unwind_expr, &params, "nid");
7985        let values = result.expect("literal-map UNWIND should produce an IN-list");
7986        assert_eq!(values.len(), 2);
7987        assert!(matches!(
7988            &values[0],
7989            Expr::Literal(CypherLiteral::Integer(64))
7990        ));
7991        assert!(matches!(
7992            &values[1],
7993            Expr::Literal(CypherLiteral::Integer(65))
7994        ));
7995    }
7996
7997    #[test]
7998    fn materialize_unwind_field_handles_mixed_primitive_field_types() {
7999        // String field — should also work since value_to_cypher_literal
8000        // accepts strings.
8001        let unwind_expr = Expr::List(vec![
8002            Expr::Map(vec![map_entry("k", str_lit("a"))]),
8003            Expr::Map(vec![map_entry("k", str_lit("b"))]),
8004        ]);
8005        let params = HashMap::new();
8006        let values = materialize_unwind_source_field(&unwind_expr, &params, "k")
8007            .expect("literal-map UNWIND should produce an IN-list");
8008        assert_eq!(values.len(), 2);
8009    }
8010
8011    #[test]
8012    fn materialize_unwind_field_rejects_non_literal_value_at_target_field() {
8013        // `UNWIND [{nid: $p}, ...]` — value is a Parameter, not a Literal.
8014        // Should bail conservatively (we don't substitute parameters
8015        // inside inlined map literals at plan time).
8016        let unwind_expr = Expr::List(vec![Expr::Map(vec![map_entry(
8017            "nid",
8018            Expr::Parameter("p".to_string()),
8019        )])]);
8020        let params = HashMap::new();
8021        let result = materialize_unwind_source_field(&unwind_expr, &params, "nid");
8022        assert!(result.is_none(), "non-literal value at field should bail");
8023    }
8024
8025    #[test]
8026    fn materialize_unwind_field_rejects_when_target_field_missing() {
8027        // `UNWIND [{other: 64}, ...] ... = u.nid` — no `nid` entry.
8028        let unwind_expr = Expr::List(vec![Expr::Map(vec![map_entry("other", int_lit(64))])]);
8029        let params = HashMap::new();
8030        let result = materialize_unwind_source_field(&unwind_expr, &params, "nid");
8031        assert!(
8032            result.is_none(),
8033            "map missing the requested field should bail"
8034        );
8035    }
8036
8037    #[test]
8038    fn materialize_unwind_field_rejects_non_map_list_item() {
8039        // `UNWIND [64, 65] AS u ... = u.nid` — items are bare ints, not
8040        // maps. We're projecting `.nid` from a non-map.
8041        let unwind_expr = Expr::List(vec![int_lit(64), int_lit(65)]);
8042        let params = HashMap::new();
8043        let result = materialize_unwind_source_field(&unwind_expr, &params, "nid");
8044        assert!(
8045            result.is_none(),
8046            "non-map list items can't be field-projected"
8047        );
8048    }
8049
8050    #[test]
8051    fn materialize_unwind_field_rejects_oversized_list() {
8052        // Guard against the `MAX_UNWIND_IN_PUSHDOWN_VALUES` ceiling.
8053        let oversized = MAX_UNWIND_IN_PUSHDOWN_VALUES + 1;
8054        let items: Vec<Expr> = (0..oversized)
8055            .map(|i| Expr::Map(vec![map_entry("nid", int_lit(i as i64))]))
8056            .collect();
8057        let unwind_expr = Expr::List(items);
8058        let params = HashMap::new();
8059        let result = materialize_unwind_source_field(&unwind_expr, &params, "nid");
8060        assert!(result.is_none(), "oversized list should bail");
8061    }
8062
8063    #[test]
8064    fn materialize_unwind_field_param_form_still_works() {
8065        // Regression guard: the param branch must still work after the
8066        // literal branch change.
8067        let mut params = HashMap::new();
8068        params.insert(
8069            "updates".to_string(),
8070            uni_common::Value::List(vec![
8071                uni_common::Value::Map({
8072                    let mut m = HashMap::new();
8073                    m.insert("nid".to_string(), uni_common::Value::Int(64));
8074                    m
8075                }),
8076                uni_common::Value::Map({
8077                    let mut m = HashMap::new();
8078                    m.insert("nid".to_string(), uni_common::Value::Int(65));
8079                    m
8080                }),
8081            ]),
8082        );
8083        let unwind_expr = Expr::Parameter("updates".to_string());
8084        let values = materialize_unwind_source_field(&unwind_expr, &params, "nid")
8085            .expect("parameter form should produce IN-list");
8086        assert_eq!(values.len(), 2);
8087    }
8088
8089    // -----------------------------------------------------------------
8090    // `merge_unwind_in_filters` rewrite pass — lifts IN-list filters
8091    // from `Filter(CrossJoin(Unwind, Scan))` predicates into `Scan.filter`
8092    // BEFORE physical-plan optimizations can bail and discard the merge.
8093    // Closes the systemic class where HashJoin emission failure (e.g.,
8094    // Utf8 ↔ LargeBinary key unification) caused scan-side pushdowns to
8095    // silently vanish.
8096    // -----------------------------------------------------------------
8097
8098    /// Build `Filter(CrossJoin(Unwind, Scan), n.name = u)` — the
8099    /// canonical shape the pass targets.
8100    fn make_filter_crossjoin_scan(
8101        unwind_source: Expr,
8102        unwind_var: &str,
8103        scan_label_id: u16,
8104        scan_label: &str,
8105        scan_var: &str,
8106        predicate: Expr,
8107    ) -> LogicalPlan {
8108        let unwind = LogicalPlan::Unwind {
8109            input: Box::new(LogicalPlan::Project {
8110                input: Box::new(LogicalPlan::Scan {
8111                    label_id: scan_label_id,
8112                    labels: vec![scan_label.to_string()],
8113                    variable: "__dummy__".to_string(),
8114                    filter: None,
8115                    optional: false,
8116                }),
8117                projections: vec![],
8118            }),
8119            expr: unwind_source,
8120            variable: unwind_var.to_string(),
8121        };
8122        let scan = LogicalPlan::Scan {
8123            label_id: scan_label_id,
8124            labels: vec![scan_label.to_string()],
8125            variable: scan_var.to_string(),
8126            filter: None,
8127            optional: false,
8128        };
8129        LogicalPlan::Filter {
8130            input: Box::new(LogicalPlan::CrossJoin {
8131                left: Box::new(unwind),
8132                right: Box::new(scan),
8133            }),
8134            predicate,
8135            optional_variables: HashSet::new(),
8136        }
8137    }
8138
8139    /// `n.scan_var.field = u.unwind_var` predicate, for use as the
8140    /// join predicate in the rewrite-pass tests.
8141    fn eq_property_predicate(scan_var: &str, prop: &str, unwind_var: &str) -> Expr {
8142        Expr::BinaryOp {
8143            left: Box::new(Expr::Property(
8144                Box::new(Expr::Variable(scan_var.to_string())),
8145                prop.to_string(),
8146            )),
8147            op: uni_cypher::ast::BinaryOp::Eq,
8148            right: Box::new(Expr::Variable(unwind_var.to_string())),
8149        }
8150    }
8151
8152    fn assert_scan_filter_is_in_list(plan: &LogicalPlan, expected_label: &str) {
8153        // Find the right subtree of the top-level CrossJoin and assert
8154        // its Scan node has a filter containing an IN-list.
8155        let LogicalPlan::Filter { input, .. } = plan else {
8156            panic!("expected top-level Filter, got {plan:?}");
8157        };
8158        let LogicalPlan::CrossJoin { right, .. } = input.as_ref() else {
8159            panic!("expected CrossJoin under Filter, got {input:?}");
8160        };
8161        let LogicalPlan::Scan { labels, filter, .. } = right.as_ref() else {
8162            panic!("expected Scan as right subtree, got {right:?}");
8163        };
8164        assert_eq!(labels, &vec![expected_label.to_string()]);
8165        let filter_expr = filter
8166            .as_ref()
8167            .expect("Scan.filter must be Some after pass");
8168        assert!(
8169            matches!(filter_expr, Expr::In { .. }),
8170            "Scan.filter should be Expr::In, got {filter_expr:?}"
8171        );
8172    }
8173
8174    #[test]
8175    fn merge_pass_pushes_in_list_into_scan_filter() {
8176        // UNWIND ['a', 'b'] AS u MATCH (n:Item) WHERE n.name = u
8177        let unwind_source = Expr::List(vec![str_lit("a"), str_lit("b")]);
8178        let plan = make_filter_crossjoin_scan(
8179            unwind_source,
8180            "u",
8181            1,
8182            "Item",
8183            "n",
8184            eq_property_predicate("n", "name", "u"),
8185        );
8186        let params = HashMap::new();
8187        let rewritten = merge_unwind_in_filters(&plan, &params);
8188        assert_scan_filter_is_in_list(&rewritten, "Item");
8189    }
8190
8191    #[test]
8192    fn merge_pass_idempotent() {
8193        // Running the pass twice should produce a structurally equivalent
8194        // plan to the single-pass result. We assert the scan filter is
8195        // an IN-list both times (not nested ANDs from re-extraction).
8196        let unwind_source = Expr::List(vec![str_lit("a"), str_lit("b")]);
8197        let plan = make_filter_crossjoin_scan(
8198            unwind_source,
8199            "u",
8200            1,
8201            "Item",
8202            "n",
8203            eq_property_predicate("n", "name", "u"),
8204        );
8205        let params = HashMap::new();
8206        let pass1 = merge_unwind_in_filters(&plan, &params);
8207        let pass2 = merge_unwind_in_filters(&pass1, &params);
8208
8209        // The second pass should leave the merged filter as-is (its
8210        // walker doesn't recurse into Scan.filter, so the IN-list is
8211        // not re-extracted and re-ANDed). Verify the scan.filter
8212        // structure remains `Expr::In`, not `Expr::BinaryOp(And, ...)`.
8213        let LogicalPlan::Filter { input, .. } = &pass2 else {
8214            panic!("expected Filter");
8215        };
8216        let LogicalPlan::CrossJoin { right, .. } = input.as_ref() else {
8217            panic!("expected CrossJoin");
8218        };
8219        let LogicalPlan::Scan { filter, .. } = right.as_ref() else {
8220            panic!("expected Scan");
8221        };
8222        let filter_expr = filter.as_ref().expect("Scan.filter must be Some");
8223        assert!(
8224            matches!(filter_expr, Expr::In { .. }),
8225            "After 2 passes the filter should still be a single Expr::In, \
8226             not ANDed with a duplicate; got {filter_expr:?}"
8227        );
8228    }
8229
8230    #[test]
8231    fn merge_pass_leaves_non_pushable_predicates_alone() {
8232        // Filter with a non-equi predicate (e.g., n.name STARTS WITH "x")
8233        // shouldn't trigger any pushdown — classify_join_predicate
8234        // produces no equi-pairs, so the pass leaves the plan unchanged.
8235        let unwind_source = Expr::List(vec![str_lit("a")]);
8236        let starts_with = Expr::BinaryOp {
8237            left: Box::new(Expr::Property(
8238                Box::new(Expr::Variable("n".to_string())),
8239                "name".to_string(),
8240            )),
8241            op: uni_cypher::ast::BinaryOp::StartsWith,
8242            right: Box::new(str_lit("x")),
8243        };
8244        let plan = make_filter_crossjoin_scan(unwind_source, "u", 1, "Item", "n", starts_with);
8245        let params = HashMap::new();
8246        let rewritten = merge_unwind_in_filters(&plan, &params);
8247
8248        // The Scan's filter should remain None (no equi-pair → no
8249        // IN-list lifted).
8250        let LogicalPlan::Filter { input, .. } = &rewritten else {
8251            panic!("expected Filter");
8252        };
8253        let LogicalPlan::CrossJoin { right, .. } = input.as_ref() else {
8254            panic!("expected CrossJoin");
8255        };
8256        let LogicalPlan::Scan { filter, .. } = right.as_ref() else {
8257            panic!("expected Scan");
8258        };
8259        assert!(
8260            filter.is_none(),
8261            "no equi-pair → no pushdown; Scan.filter should remain None, got {filter:?}"
8262        );
8263    }
8264
8265    #[test]
8266    fn merge_pass_handles_nested_crossjoin() {
8267        // `Filter(CrossJoin(Unwind, CrossJoin(Scan_A, Scan_B)), n.name = u)` —
8268        // The pass should recurse and lift the IN-list into Scan_A
8269        // (which is the side that owns the joined variable "n").
8270        //
8271        // To make the test self-contained, build:
8272        //   Outer: Filter(predicate=`n.name=u`, CrossJoin(L=Unwind(u), R=CrossJoin(Scan(Item,n), Scan(Other,m))))
8273        // The pass walks the outer Filter, recurses into the inner CrossJoin
8274        // first, finds no Filter wrapping it (so leaves it), then handles
8275        // the outer Filter+CrossJoin and lifts the IN-list into the
8276        // appropriate Scan via wrap_with_filter, which recurses into the
8277        // inner CrossJoin to find the matching Scan.
8278        let unwind_source = Expr::List(vec![str_lit("a")]);
8279        let unwind = LogicalPlan::Unwind {
8280            input: Box::new(LogicalPlan::Project {
8281                input: Box::new(LogicalPlan::Scan {
8282                    label_id: 0,
8283                    labels: vec!["__".to_string()],
8284                    variable: "__".to_string(),
8285                    filter: None,
8286                    optional: false,
8287                }),
8288                projections: vec![],
8289            }),
8290            expr: unwind_source,
8291            variable: "u".to_string(),
8292        };
8293        let inner_cross = LogicalPlan::CrossJoin {
8294            left: Box::new(LogicalPlan::Scan {
8295                label_id: 1,
8296                labels: vec!["Item".to_string()],
8297                variable: "n".to_string(),
8298                filter: None,
8299                optional: false,
8300            }),
8301            right: Box::new(LogicalPlan::Scan {
8302                label_id: 2,
8303                labels: vec!["Other".to_string()],
8304                variable: "m".to_string(),
8305                filter: None,
8306                optional: false,
8307            }),
8308        };
8309        let plan = LogicalPlan::Filter {
8310            input: Box::new(LogicalPlan::CrossJoin {
8311                left: Box::new(unwind),
8312                right: Box::new(inner_cross),
8313            }),
8314            predicate: eq_property_predicate("n", "name", "u"),
8315            optional_variables: HashSet::new(),
8316        };
8317        let params = HashMap::new();
8318        let rewritten = merge_unwind_in_filters(&plan, &params);
8319
8320        // Navigate to the Item scan (via outer Filter → CrossJoin.right
8321        // → CrossJoin (or Filter wrapping it) → leftmost Scan). The
8322        // wrap_with_filter helper merges into the right subtree of the
8323        // top-level CrossJoin; that subtree was the inner CrossJoin,
8324        // which isn't a Scan — so wrap_with_filter fell through to its
8325        // "wrap in Filter" branch.
8326        let LogicalPlan::Filter { input, .. } = &rewritten else {
8327            panic!("expected outer Filter");
8328        };
8329        let LogicalPlan::CrossJoin { right, .. } = input.as_ref() else {
8330            panic!("expected outer CrossJoin");
8331        };
8332        // wrap_with_filter wrapped the inner CrossJoin in a Filter
8333        // because it's not a Scan-shape. The IN-list ended up on top
8334        // of the inner CrossJoin, not inside Scan.filter.
8335        match right.as_ref() {
8336            LogicalPlan::Filter { predicate, .. } => {
8337                assert!(
8338                    matches!(predicate, Expr::In { .. }),
8339                    "expected Expr::In wrapping inner CrossJoin, got {predicate:?}"
8340                );
8341            }
8342            other => panic!(
8343                "expected Filter wrapping inner CrossJoin, got {other:?}. \
8344                 This is acceptable behaviour — the IN-list is preserved \
8345                 above the inner join — but the test should be updated if \
8346                 wrap_with_filter changes to descend through CrossJoins."
8347            ),
8348        }
8349    }
8350}