Struct deltalake::datafusion::physical_plan::joins::HashJoinExec

source ·

pub struct HashJoinExec {
    pub left: Arc<dyn ExecutionPlan>,
    pub right: Arc<dyn ExecutionPlan>,
    pub on: Vec<(Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>)>,
    pub filter: Option<JoinFilter>,
    pub join_type: JoinType,
    pub mode: PartitionMode,
    pub projection: Option<Vec<usize>>,
    pub null_equals_null: bool,
    /* private fields */
}

Expand description

Join execution plan: evaluates equijoin predicates in parallel on multiple partitions using a hash table, plus an optional list of filters to apply after the join.

§Join Expressions

This implementation is optimized for evaluating equijoin predicates (<col1> = <col2>), which are represented as a list of Columns in Self::on.

Non-equality predicates, which cannot be pushed down to the join inputs (e.g. <col1> != <col2>), are known as “filter expressions” and are evaluated after the equijoin predicates.

§“Build Side” vs “Probe Side”

HashJoin takes two inputs, which are referred to as the “build” and the “probe”. The build side is the first child, and the probe side is the second child.

The two inputs are treated differently and it is VERY important that the smaller input is placed on the build side to minimize the work of creating the hash table.

         ┌───────────┐
         │ HashJoin  │
         │           │
         └───────────┘
             │   │
       ┌─────┘   └─────┐
       ▼               ▼
┌────────────┐  ┌─────────────┐
│   Input    │  │    Input    │
│    [0]     │  │     [1]     │
└────────────┘  └─────────────┘

 "build side"    "probe side"

Execution proceeds in 2 stages:

the build phase creates a hash table from the tuples of the build side, and a single concatenated batch containing data from all fetched record batches. The resulting hash table stores the hashed join-key fields for each row as a key, and the indices of the corresponding rows in the concatenated batch as values.

The hash join uses a LIFO data structure as its hash table. In order to retain the original build-side input order while obtaining data during the probe phase, the hash table is updated by iterating over the batch sequence in reverse order – this keeps rows with smaller indices “on the top” of the hash table while still maintaining correct indexing for the concatenated build-side data batch.

Example of build phase for 3 record batches:


 Original build-side data   Inserting build-side values into hashmap    Concatenated build-side batch
                                                                        ┌───────────────────────────┐
                            hashmap.insert(row-hash, row-idx + offset)  │                      idx  │
           ┌───────┐                                                    │          ┌───────┐        │
           │ Row 1 │        1) update_hash for batch 3 with offset 0    │          │ Row 6 │    0   │
  Batch 1  │       │           - hashmap.insert(Row 7, idx 1)           │ Batch 3  │       │        │
           │ Row 2 │           - hashmap.insert(Row 6, idx 0)           │          │ Row 7 │    1   │
           └───────┘                                                    │          └───────┘        │
                                                                        │                           │
           ┌───────┐                                                    │          ┌───────┐        │
           │ Row 3 │        2) update_hash for batch 2 with offset 2    │          │ Row 3 │    2   │
           │       │           - hashmap.insert(Row 5, idx 4)           │          │       │        │
  Batch 2  │ Row 4 │           - hashmap.insert(Row 4, idx 3)           │ Batch 2  │ Row 4 │    3   │
           │       │           - hashmap.insert(Row 3, idx 2)           │          │       │        │
           │ Row 5 │                                                    │          │ Row 5 │    4   │
           └───────┘                                                    │          └───────┘        │
                                                                        │                           │
           ┌───────┐                                                    │          ┌───────┐        │
           │ Row 6 │        3) update_hash for batch 1 with offset 5    │          │ Row 1 │    5   │
  Batch 3  │       │           - hashmap.insert(Row 2, idx 6)           │ Batch 1  │       │        │
           │ Row 7 │           - hashmap.insert(Row 1, idx 5)           │          │ Row 2 │    6   │
           └───────┘                                                    │          └───────┘        │
                                                                        │                           │
                                                                        └───────────────────────────┘

the probe phase, in which the tuples of the probe side are streamed through and checked for matches of the join keys in the hash table.

                ┌────────────────┐          ┌────────────────┐
                │ ┌─────────┐    │          │ ┌─────────┐    │
                │ │  Hash   │    │          │ │  Hash   │    │
                │ │  Table  │    │          │ │  Table  │    │
                │ │(keys are│    │          │ │(keys are│    │
                │ │equi join│    │          │ │equi join│    │  Stage 2: batches from
 Stage 1: the   │ │columns) │    │          │ │columns) │    │    the probe side are
*entire* build  │ │         │    │          │ │         │    │  streamed through, and
 side is read   │ └─────────┘    │          │ └─────────┘    │   checked against the
into the hash   │      ▲         │          │          ▲     │   contents of the hash
    table       │       HashJoin │          │  HashJoin      │          table
                └──────┼─────────┘          └──────────┼─────┘
            ─ ─ ─ ─ ─ ─                                 ─ ─ ─ ─ ─ ─ ─
           │                                                         │

           │                                                         │
    ┌────────────┐                                            ┌────────────┐
    │RecordBatch │                                            │RecordBatch │
    └────────────┘                                            └────────────┘
    ┌────────────┐                                            ┌────────────┐
    │RecordBatch │                                            │RecordBatch │
    └────────────┘                                            └────────────┘
          ...                                                       ...
    ┌────────────┐                                            ┌────────────┐
    │RecordBatch │                                            │RecordBatch │
    └────────────┘                                            └────────────┘

       build side                                                probe side

§Example “Optimal” Plans

The differences in the inputs mean that for a classic “Star Schema Query”, the optimal plan will be a “Right Deep Tree”. A Star Schema Query is one where there is one large table and several smaller “dimension” tables, joined on Foreign Key = Primary Key predicates.

A “Right Deep Tree” looks like this, with the large table as the probe side on the lowest join:

            ┌───────────┐
            │ HashJoin  │
            │           │
            └───────────┘
                │   │
        ┌───────┘   └──────────┐
        ▼                      ▼
┌───────────────┐        ┌───────────┐
│ small table 1 │        │ HashJoin  │
│  "dimension"  │        │           │
└───────────────┘        └───┬───┬───┘
                  ┌──────────┘   └───────┐
                  │                      │
                  ▼                      ▼
          ┌───────────────┐        ┌───────────┐
          │ small table 2 │        │ HashJoin  │
          │  "dimension"  │        │           │
          └───────────────┘        └───┬───┬───┘
                              ┌────────┘   └────────┐
                              │                     │
                              ▼                     ▼
                      ┌───────────────┐     ┌───────────────┐
                      │ small table 3 │     │  large table  │
                      │  "dimension"  │     │    "fact"     │
                      └───────────────┘     └───────────────┘

Fields§

§left: Arc<dyn ExecutionPlan>

left (build) side which gets hashed

§right: Arc<dyn ExecutionPlan>

right (probe) side, which is filtered by the hash table

§on: Vec<(Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>)>

Set of equijoin columns from the relations: (left_col, right_col)

§filter: Option<JoinFilter>

Filters which are applied while finding matching rows

§join_type: JoinType

How the join is performed (OUTER, INNER, etc)

§mode: PartitionMode

Partitioning mode to use

§projection: Option<Vec<usize>>

The projection indices of the columns in the output schema of the join

§null_equals_null: bool

Null matching behavior: If null_equals_null is true, rows that have nulls in both left and right equijoin columns will be matched. Otherwise, rows that have nulls in the join columns will not be matched and thus will not appear in the output.

Struct deltalake::datafusion::physical_plan::joins::HashJoinExec (Copy item path)

§Join Expressions

§“Build Side” vs “Probe Side”

§Example “Optimal” Plans

Fields§

Implementations§

impl HashJoinExec

§Error

pub fn left(&self) -> &Arc<dyn ExecutionPlan>

pub fn right(&self) -> &Arc<dyn ExecutionPlan>

pub fn on(&self) -> &[(Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>)]

pub fn filter(&self) -> Option<&JoinFilter>

pub fn join_type(&self) -> &JoinType

pub fn partition_mode(&self) -> &PartitionMode

pub fn null_equals_null(&self) -> bool

pub fn probe_side() -> JoinSide

pub fn contain_projection(&self) -> bool

pub fn with_projection( &self, projection: Option<Vec<usize>> ) -> Result<HashJoinExec, DataFusionError>

Trait Implementations§

impl Debug for HashJoinExec

fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error>

impl DisplayAs for HashJoinExec

fn fmt_as( &self, t: DisplayFormatType, f: &mut Formatter<'_> ) -> Result<(), Error>

impl ExecutionPlan for HashJoinExec

fn name(&self) -> &'static str

fn as_any(&self) -> &(dyn Any + 'static)

fn properties(&self) -> &PlanProperties

fn required_input_distribution(&self) -> Vec<Distribution>

fn maintains_input_order(&self) -> Vec<bool>

fn children(&self) -> Vec<Arc<dyn ExecutionPlan>>

fn with_new_children( self: Arc<HashJoinExec>, children: Vec<Arc<dyn ExecutionPlan>> ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError>

fn execute( &self, partition: usize, context: Arc<TaskContext> ) -> Result<Pin<Box<dyn RecordBatchStream<Item = Result<RecordBatch, DataFusionError>> + Send>>, DataFusionError>

fn metrics(&self) -> Option<MetricsSet>

fn statistics(&self) -> Result<Statistics, DataFusionError>

fn schema(&self) -> Arc<Schema>

fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirement>>>

fn benefits_from_input_partitioning(&self) -> Vec<bool>

fn repartitioned( &self, _target_partitions: usize, _config: &ConfigOptions ) -> Result<Option<Arc<dyn ExecutionPlan>>, DataFusionError>

Auto Trait Implementations§

impl !Freeze for HashJoinExec

impl !RefUnwindSafe for HashJoinExec

impl Send for HashJoinExec

impl Sync for HashJoinExec

impl Unpin for HashJoinExec

impl !UnwindSafe for HashJoinExec

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<Unshared, Shared> IntoShared<Shared> for Unshared where Shared: FromUnshared<Unshared>,

fn into_shared(self) -> Shared

impl<T> Same for T

type Output = T

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

impl<T> Ungil for T where T: Send,

Struct deltalake::datafusion::physical_plan::joins::HashJoinExec

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<Unshared, Shared> IntoShared<Shared> for Unshared
where Shared: FromUnshared<Unshared>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

impl<T> Ungil for T
where T: Send,