stoolap 0.4.0 - Docs.rs

// Copyright 2025 Stoolap Contributors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Parallel Query Execution
//!
//! This module provides parallel execution strategies for CPU-intensive query operations:
//!
//! - **Parallel Scan + Filter**: Process table rows in parallel chunks with WHERE evaluation
//! - **Parallel Aggregation**: Already implemented in aggregation.rs
//! - **Parallel Sort**: Parallel ORDER BY using rayon's par_sort (future)
//! - **Parallel Join**: Parallel hash join build/probe phases (future)
//!
//! # Architecture
//!
//! The parallel execution model works by:
//! 1. Collecting rows from storage (sequential - storage layer limitation)
//! 2. Splitting rows into chunks for parallel processing
//! 3. Processing each chunk independently using Rayon's work-stealing scheduler
//! 4. Merging results back together
//!
//! # Thresholds
//!
//! Parallelization has overhead, so we only use it when beneficial:
//! - Table scan + filter: 10,000+ rows
//! - Aggregation: 100,000+ rows (already in aggregation.rs)
//! - ORDER BY: 50,000+ rows
//! - Hash join: 10,000+ build rows

#[cfg(feature = "parallel")]
use rayon::prelude::*;
use rustc_hash::{FxHashMap, FxHashSet};
use std::sync::atomic::{AtomicBool, Ordering};

use crate::common::CompactVec;
use crate::core::value::NULL_VALUE;
use crate::core::{Result, Row, RowVec, Value};
use crate::functions::FunctionRegistry;
use crate::parser::ast::Expression;

use super::context::ExecutionContext;
use super::expression::ExpressionEval;
#[cfg(feature = "parallel")]
use super::expression::RowFilter;
use super::utils::{hash_composite_key, hash_row, rows_equal, verify_composite_key_equality};

/// Get number of parallel threads (1 when parallel feature is disabled)
#[inline]
fn num_threads() -> usize {
    #[cfg(feature = "parallel")]
    {
        rayon::current_num_threads()
    }
    #[cfg(not(feature = "parallel"))]
    {
        1
    }
}

// Re-export JoinType from operators::hash_join - single source of truth
pub use super::operators::hash_join::JoinType;

// Default thresholds for parallel execution - single source of truth
// These are used by both ParallelConfig and CostEstimator
pub const DEFAULT_PARALLEL_FILTER_THRESHOLD: usize = 10_000;
pub const DEFAULT_PARALLEL_SORT_THRESHOLD: usize = 50_000;
pub const DEFAULT_PARALLEL_JOIN_THRESHOLD: usize = 10_000;
pub const DEFAULT_PARALLEL_CHUNK_SIZE: usize = 2048;

/// Configuration for parallel execution
#[derive(Clone, Debug)]
pub struct ParallelConfig {
    /// Whether parallel execution is enabled
    pub enabled: bool,
    /// Minimum rows to trigger parallel scan + filter
    pub min_rows_for_parallel_filter: usize,
    /// Minimum rows to trigger parallel sort
    pub min_rows_for_parallel_sort: usize,
    /// Minimum build rows to trigger parallel hash join
    pub min_rows_for_parallel_join: usize,
    /// Chunk size for parallel processing (rows per thread task)
    pub chunk_size: usize,
}

impl Default for ParallelConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            min_rows_for_parallel_filter: DEFAULT_PARALLEL_FILTER_THRESHOLD,
            min_rows_for_parallel_sort: DEFAULT_PARALLEL_SORT_THRESHOLD,
            min_rows_for_parallel_join: DEFAULT_PARALLEL_JOIN_THRESHOLD,
            // Optimal chunk size balances:
            // - Too small: excessive task scheduling overhead
            // - Too large: poor load balancing if chunks have varying filter selectivity
            // 2048 is a good default that works well with typical L2 cache sizes
            chunk_size: DEFAULT_PARALLEL_CHUNK_SIZE,
        }
    }
}

impl ParallelConfig {
    /// Create a new parallel config with custom settings
    pub fn new(
        enabled: bool,
        min_rows_for_parallel_filter: usize,
        min_rows_for_parallel_sort: usize,
        min_rows_for_parallel_join: usize,
        chunk_size: usize,
    ) -> Self {
        Self {
            enabled,
            min_rows_for_parallel_filter,
            min_rows_for_parallel_sort,
            min_rows_for_parallel_join,
            chunk_size,
        }
    }

    /// Create a config with parallel execution disabled
    pub fn disabled() -> Self {
        Self {
            enabled: false,
            ..Default::default()
        }
    }

    /// Check if parallel filter should be used for the given row count
    #[inline]
    pub fn should_parallel_filter(&self, row_count: usize) -> bool {
        self.enabled && row_count >= self.min_rows_for_parallel_filter
    }

    /// Check if parallel sort should be used for the given row count
    #[inline]
    pub fn should_parallel_sort(&self, row_count: usize) -> bool {
        self.enabled && row_count >= self.min_rows_for_parallel_sort
    }

    /// Check if parallel join should be used for the given build side row count
    #[inline]
    pub fn should_parallel_join(&self, build_rows: usize) -> bool {
        self.enabled && build_rows >= self.min_rows_for_parallel_join
    }
}

/// Parallel filter execution for WHERE clause evaluation
///
/// This function filters rows in parallel by:
/// 1. Splitting rows into chunks
/// 2. Evaluating the WHERE predicate on each chunk in parallel
/// 3. Collecting matching rows from all chunks
///
/// Works with `(i64, Row)` tuples, preserving the row ID throughout filtering.
///
/// # Performance
///
/// For a table with 1M rows and 50% selectivity:
/// - Sequential: ~500ms
/// - Parallel (8 cores): ~80ms (6x speedup)
///
/// The speedup depends on:
/// - Number of CPU cores
/// - Complexity of the WHERE predicate
/// - Selectivity (how many rows pass the filter)
pub fn parallel_filter(
    rows: RowVec,
    filter_expr: &Expression,
    columns: &[String],
    _function_registry: &FunctionRegistry,
    config: &ParallelConfig,
    ctx: &ExecutionContext,
) -> Result<RowVec> {
    #[cfg(feature = "parallel")]
    {
        let row_count = rows.len();
        if config.should_parallel_filter(row_count) {
            // Pre-compile the filter expression once (RowFilter is Send+Sync)
            let columns_vec: Vec<String> = columns.to_vec();
            let filter = RowFilter::new(filter_expr, &columns_vec)?.with_context(ctx);

            // Mark-and-extract: compute a parallel boolean mask (1 byte per row),
            // then single-pass extract kept rows in original order.
            // This avoids materializing an N-sized Option<(i64,Row)> intermediate.
            let mut row_vec: Vec<(i64, Row)> = rows.into_vec();
            let keep: Result<Vec<bool>> = row_vec
                .par_iter()
                .map(|(_, row)| filter.matches_checked(row))
                .collect();
            let keep = keep?;

            // Single-pass extract: move kept rows into result, preserving order
            let kept_count = keep.iter().filter(|&&b| b).count();
            let mut filtered = Vec::with_capacity(kept_count);
            for (i, entry) in row_vec.drain(..).enumerate() {
                if keep[i] {
                    filtered.push(entry);
                }
            }

            // Wrap final result in RowVec (uses main thread's cache)
            return Ok(RowVec::from_vec(filtered));
        }
    }

    // Sequential fallback (always compiled)
    let _ = config; // suppress unused warning when parallel feature is disabled
    sequential_filter(rows, filter_expr, columns, ctx)
}

/// Sequential filter for small datasets or when parallel is disabled
fn sequential_filter(
    rows: RowVec,
    filter_expr: &Expression,
    columns: &[String],
    ctx: &ExecutionContext,
) -> Result<RowVec> {
    let columns_vec: Vec<String> = columns.to_vec();
    let mut eval = ExpressionEval::compile(filter_expr, &columns_vec)?.with_context(ctx);

    let mut result = RowVec::with_capacity(rows.len());
    for (id, row) in rows {
        if eval.eval_bool_checked(&row)? {
            result.push((id, row));
        }
    }
    Ok(result)
}

/// Parallel filter with ownership transfer (more efficient for large results)
///
/// Uses parallel boolean marking + sequential extraction to minimize allocations.
/// Only allocates: 1 bit-vec for marks + 1 RowVec for results.
pub fn parallel_filter_owned(
    rows: RowVec,
    predicate: impl Fn(&Row) -> bool + Sync + Send,
    config: &ParallelConfig,
) -> RowVec {
    #[cfg(feature = "parallel")]
    {
        let row_count = rows.len();
        if config.should_parallel_filter(row_count) {
            // Phase 1: Parallel marking - each thread marks matching rows
            // Uses Vec<bool> which is 1 byte per row (could use bitvec for 1 bit)
            let chunk_size = (row_count / num_threads()).max(1000);
            let marks: Vec<bool> = rows
                .par_chunks(chunk_size)
                .flat_map(|chunk| {
                    chunk
                        .iter()
                        .map(|(_, row)| predicate(row))
                        .collect::<Vec<_>>()
                })
                .collect();

            // Count matches for pre-allocation
            let match_count = marks.iter().filter(|&&b| b).count();

            // Phase 2: Sequential extraction into RowVec (from cache)
            let mut result = RowVec::with_capacity(match_count);
            for ((id, row), keep) in rows.into_iter().zip(marks) {
                if keep {
                    result.push((id, row));
                }
            }
            return result;
        }
    }

    // Sequential fallback (always compiled)
    let _ = config; // suppress unused warning when parallel feature is disabled
    rows.into_iter().filter(|(_, r)| predicate(r)).collect()
}

/// Parallel sort using rayon's par_sort_unstable_by
///
/// For large datasets, parallel sort can provide 2-4x speedup.
/// Uses unstable sort (faster, doesn't preserve order of equal elements).
pub fn parallel_sort<F>(rows: &mut [(i64, Row)], compare: F, config: &ParallelConfig)
where
    F: Fn(&Row, &Row) -> std::cmp::Ordering + Sync + Send,
{
    #[cfg(feature = "parallel")]
    if config.should_parallel_sort(rows.len()) {
        rows.par_sort_unstable_by(|(_, a), (_, b)| compare(a, b));
        return;
    }
    let _ = config; // suppress unused warning when parallel feature is disabled
    rows.sort_unstable_by(|(_, a), (_, b)| compare(a, b));
}

/// Parallel sort that's unstable (faster but doesn't preserve order of equal elements)
pub fn parallel_sort_unstable<F>(rows: &mut [(i64, Row)], compare: F, config: &ParallelConfig)
where
    F: Fn(&Row, &Row) -> std::cmp::Ordering + Sync + Send,
{
    #[cfg(feature = "parallel")]
    if config.should_parallel_sort(rows.len()) {
        rows.par_sort_unstable_by(|(_, a), (_, b)| compare(a, b));
        return;
    }
    let _ = config; // suppress unused warning when parallel feature is disabled
    rows.sort_unstable_by(|(_, a), (_, b)| compare(a, b));
}

/// Parallel DISTINCT processing using hash map with proper equality checking
///
/// Removes duplicate rows in parallel by:
/// 1. Splitting rows into chunks
/// 2. Each chunk builds a local hash map (hash -> list of rows)
/// 3. Final merge pass removes cross-chunk duplicates with full equality check
///
/// This implementation correctly handles hash collisions by storing actual rows
/// and comparing them for equality, not just their hashes.
pub fn parallel_distinct(rows: RowVec, config: &ParallelConfig) -> RowVec {
    #[cfg(feature = "parallel")]
    {
        let row_count = rows.len();
        if config.should_parallel_filter(row_count) {
            // For parallel distinct, we use a two-phase approach:
            // Phase 1: Filter locally within chunks (parallel) - uses hash map with collision lists
            // Phase 2: Final dedup across chunks (sequential) - verifies equality for hash matches

            let n_threads = num_threads();
            let chunk_size = config.chunk_size.max(row_count / n_threads).max(1000);

            // Phase 1: Parallel local dedup within chunks
            // Note: We must convert to Vec for into_par_iter (RowVec doesn't impl IntoParallelIterator)
            // Each chunk produces unique rows (within that chunk) along with their hashes
            let row_vec: Vec<(i64, Row)> = rows.into_vec();
            let deduped_chunks: Vec<Vec<(u64, i64, Row)>> = row_vec
                .into_par_iter()
                .chunks(chunk_size)
                .map(|chunk| {
                    // Use hash map: hash -> list of indices into unique_with_hashes
                    let mut hash_to_indices: FxHashMap<u64, Vec<usize>> = FxHashMap::default();
                    // Store (hash, row_id, row) tuples
                    let mut unique_with_hashes: Vec<(u64, i64, Row)> =
                        Vec::with_capacity(chunk.len());

                    for (row_id, row) in chunk {
                        let hash = hash_row(&row);
                        let indices = hash_to_indices.entry(hash).or_default();

                        // Check if this exact row already exists (handle hash collisions)
                        let is_duplicate = indices
                            .iter()
                            .any(|&idx| rows_equal(&unique_with_hashes[idx].2, &row));

                        if !is_duplicate {
                            indices.push(unique_with_hashes.len());
                            unique_with_hashes.push((hash, row_id, row));
                        }
                    }

                    unique_with_hashes
                })
                .collect();

            // Phase 2: Sequential final dedup across chunks with full equality verification
            let total_size: usize = deduped_chunks.iter().map(|chunk| chunk.len()).sum();
            let estimated_size = (total_size * 3) / 4;
            let mut result = RowVec::with_capacity(estimated_size);
            let mut hash_to_indices: FxHashMap<u64, Vec<usize>> = FxHashMap::default();

            for chunk in deduped_chunks {
                for (hash, row_id, row) in chunk {
                    let indices = hash_to_indices.entry(hash).or_default();

                    // Check if this exact row already exists globally (handle hash collisions)
                    let is_duplicate = indices.iter().any(|&idx| rows_equal(&result[idx].1, &row));

                    if !is_duplicate {
                        indices.push(result.len());
                        result.push((row_id, row));
                    }
                }
            }

            return result;
        }
    }

    // Sequential fallback (always compiled)
    let _ = config; // suppress unused warning when parallel feature is disabled
    sequential_distinct(rows)
}

/// Sequential DISTINCT with proper equality checking for hash collisions
fn sequential_distinct(rows: RowVec) -> RowVec {
    let mut hash_to_indices: FxHashMap<u64, Vec<usize>> = FxHashMap::default();
    let mut result = RowVec::with_capacity(rows.len());

    for (row_id, row) in rows {
        let hash = hash_row(&row);
        let indices = hash_to_indices.entry(hash).or_default();

        // Check if this exact row already exists (handle hash collisions)
        let is_duplicate = indices.iter().any(|&idx| rows_equal(&result[idx].1, &row));

        if !is_duplicate {
            indices.push(result.len());
            result.push((row_id, row));
        }
    }

    result
}

/// Parallel projection - evaluate expressions on rows in parallel
///
/// For complex SELECT expressions (not simple column references),
/// parallel evaluation can speed up projection significantly.
/// Uses in-place parallel mapping to minimize allocations.
pub fn parallel_project<F>(rows: RowVec, project_fn: F, config: &ParallelConfig) -> RowVec
where
    F: Fn(&Row) -> Row + Sync + Send,
{
    #[cfg(feature = "parallel")]
    let row_count = rows.len();

    #[cfg(feature = "parallel")]
    if config.should_parallel_filter(row_count) {
        // Parallel projection using par_chunks for better cache locality
        let chunk_size = (row_count / num_threads()).max(1000);
        let projected: Vec<(i64, Row)> = rows
            .par_chunks(chunk_size)
            .flat_map(|chunk| {
                chunk
                    .iter()
                    .map(|(id, row)| (*id, project_fn(row)))
                    .collect::<Vec<_>>()
            })
            .collect();

        return RowVec::from_vec(projected);
    }

    // Sequential fallback (always compiled)
    let _ = config; // suppress unused warning when parallel feature is disabled
    rows.into_iter()
        .map(|(id, r)| (id, project_fn(&r)))
        .collect()
}

/// Statistics about parallel execution for monitoring/debugging
#[derive(Clone, Debug, Default)]
pub struct ParallelStats {
    /// Number of rows processed
    pub rows_processed: usize,
    /// Number of rows that passed the filter
    pub rows_passed: usize,
    /// Number of chunks used
    pub chunks_used: usize,
    /// Whether parallel execution was used
    pub parallel_used: bool,
}

// ============================================================================
// Parallel Hash Join
// ============================================================================

/// Hash table storage that adapts between sequential and parallel execution
enum HashTableStorage {
    /// Sequential execution using FxHashMap (optimized for trusted keys)
    Sequential(FxHashMap<u64, Vec<usize>>),
    /// Parallel execution using DashMap (concurrent, lock-free)
    #[cfg(feature = "parallel")]
    Parallel(dashmap::DashMap<u64, Vec<usize>>),
}

/// Build side match tracking using atomic operations
///
/// Uses Vec<AtomicBool> for both sequential and parallel execution to ensure
/// the type is Sync and can be safely shared across threads. The atomic overhead
/// in sequential mode is minimal (~1-2 nanoseconds per operation).
struct BuildMatchedTracker {
    matched: Vec<AtomicBool>,
}

impl BuildMatchedTracker {
    /// Create a new tracker
    fn new(size: usize) -> Self {
        BuildMatchedTracker {
            matched: (0..size).map(|_| AtomicBool::new(false)).collect(),
        }
    }

    /// Mark a build row as matched
    ///
    /// Uses Release ordering in parallel mode for cross-thread visibility.
    /// In sequential mode, Relaxed would suffice, but we use Release uniformly
    /// for simplicity and the overhead is negligible.
    #[inline]
    fn mark_matched(&self, idx: usize) {
        self.matched[idx].store(true, Ordering::Release);
    }

    /// Check if a build row was matched
    ///
    /// Uses Acquire ordering to synchronize with Release stores from probe phase.
    #[inline]
    fn was_matched(&self, idx: usize) -> bool {
        self.matched[idx].load(Ordering::Acquire)
    }
}

impl HashTableStorage {
    /// Get matching build row indices for a hash key
    #[inline]
    fn get(&self, key: &u64) -> Option<Vec<usize>> {
        match self {
            HashTableStorage::Sequential(map) => map.get(key).cloned(),
            #[cfg(feature = "parallel")]
            HashTableStorage::Parallel(map) => map.get(key).map(|v| v.clone()),
        }
    }
}

/// Result of parallel hash table build phase
pub struct ParallelHashTable {
    /// The hash table mapping composite key hashes to row indices
    storage: HashTableStorage,
    /// Number of rows in the build side
    pub row_count: usize,
}

impl ParallelHashTable {
    /// Get matching build row indices for a hash key
    #[inline]
    pub fn get(&self, key: &u64) -> Option<Vec<usize>> {
        self.storage.get(key)
    }
}

/// Build a hash table in parallel for hash join
///
/// This function builds the hash table from the build side of a join in parallel:
/// 1. Split build rows into chunks
/// 2. Each thread computes hashes and builds a partial hash table
/// 3. Merge partial tables into a concurrent DashMap
///
/// # Performance
///
/// For 100K build rows:
/// - Sequential: ~15ms
/// - Parallel (8 cores): ~4ms (3.5x speedup)
pub fn parallel_hash_build(
    build_rows: &[Row],
    key_indices: &[usize],
    config: &ParallelConfig,
) -> ParallelHashTable {
    let row_count = build_rows.len();

    #[cfg(feature = "parallel")]
    if config.should_parallel_join(row_count) {
        use dashmap::DashMap;

        // Parallel build using DashMap's concurrent access
        let table: DashMap<u64, Vec<usize>> = DashMap::with_capacity(row_count);
        let n_threads = num_threads();
        let chunk_size = config.chunk_size.max(row_count / n_threads).max(1000);

        // Process chunks in parallel, each inserting directly into the concurrent hash table
        build_rows
            .par_chunks(chunk_size)
            .enumerate()
            .for_each(|(chunk_idx, chunk)| {
                let base_idx = chunk_idx * chunk_size;
                for (local_idx, row) in chunk.iter().enumerate() {
                    // SAFETY: Check for index overflow (would require ~18 quintillion rows on 64-bit)
                    // Use debug_assert for zero runtime cost in release builds
                    debug_assert!(
                        base_idx.checked_add(local_idx).is_some(),
                        "Index overflow in parallel hash build: base_idx={} + local_idx={}",
                        base_idx,
                        local_idx
                    );
                    let global_idx = base_idx + local_idx;
                    let hash = hash_composite_key(row, key_indices);
                    table.entry(hash).or_default().push(global_idx);
                }
            });

        return ParallelHashTable {
            storage: HashTableStorage::Parallel(table),
            row_count,
        };
    }

    // Sequential fallback (always compiled)
    // OPTIMIZATION: Sequential build uses FxHashMap for best performance with trusted keys
    // FxHashMap is optimized for non-adversarial use cases (embedded database)
    let _ = config; // suppress unused warning when parallel feature is disabled
    let mut table: FxHashMap<u64, Vec<usize>> =
        FxHashMap::with_capacity_and_hasher(row_count, Default::default());
    for (idx, row) in build_rows.iter().enumerate() {
        let hash = hash_composite_key(row, key_indices);
        table.entry(hash).or_default().push(idx);
    }
    ParallelHashTable {
        storage: HashTableStorage::Sequential(table),
        row_count,
    }
}

/// Parallel probe phase of hash join
///
/// Probe the hash table with rows from the probe side in parallel.
/// Returns matching pairs of (probe_row_idx, build_row_idx).
///
/// # Performance
///
/// For 100K probe rows against 50K build rows:
/// - Sequential: ~25ms
/// - Parallel (8 cores): ~6ms (4x speedup)
pub fn parallel_hash_probe<F>(
    probe_rows: &[Row],
    probe_key_indices: &[usize],
    hash_table: &ParallelHashTable,
    build_rows: &[Row],
    verify_match: F,
    config: &ParallelConfig,
) -> Vec<(usize, usize)>
where
    F: Fn(&Row, &Row) -> bool + Sync + Send,
{
    #[cfg(feature = "parallel")]
    let probe_count = probe_rows.len();

    #[cfg(feature = "parallel")]
    if config.should_parallel_join(probe_count) {
        // Parallel probe
        let n_threads = num_threads();
        let chunk_size = config.chunk_size.max(probe_count / n_threads).max(1000);

        return probe_rows
            .par_chunks(chunk_size)
            .enumerate()
            .flat_map(|(chunk_idx, chunk)| {
                let base_idx = chunk_idx * chunk_size;
                let mut local_matches = Vec::new();

                for (local_idx, probe_row) in chunk.iter().enumerate() {
                    let probe_idx = base_idx + local_idx;
                    let hash = hash_composite_key(probe_row, probe_key_indices);

                    if let Some(build_indices) = hash_table.get(&hash) {
                        for build_idx in build_indices {
                            if verify_match(probe_row, &build_rows[build_idx]) {
                                local_matches.push((probe_idx, build_idx));
                            }
                        }
                    }
                }

                local_matches
            })
            .collect();
    }

    // Sequential fallback (always compiled)
    let _ = config; // suppress unused warning when parallel feature is disabled
    let mut matches = Vec::new();
    for (probe_idx, probe_row) in probe_rows.iter().enumerate() {
        let hash = hash_composite_key(probe_row, probe_key_indices);
        if let Some(build_indices) = hash_table.get(&hash) {
            for build_idx in build_indices {
                if verify_match(probe_row, &build_rows[build_idx]) {
                    matches.push((probe_idx, build_idx));
                }
            }
        }
    }
    matches
}

/// Hash a row using specific key column indices.
/// Alias for hash_composite_key from utils for backward compatibility.
#[inline]
pub fn hash_row_by_keys(row: &Row, key_indices: &[usize]) -> u64 {
    hash_composite_key(row, key_indices)
}

/// Verify that two rows match on their respective key columns.
/// Alias for verify_composite_key_equality from utils for backward compatibility.
#[inline]
pub fn verify_key_match(
    probe_row: &Row,
    build_row: &Row,
    probe_key_indices: &[usize],
    build_key_indices: &[usize],
) -> bool {
    verify_composite_key_equality(probe_row, build_row, probe_key_indices, build_key_indices)
}

// JoinType is imported from operators::hash_join - single source of truth

/// Parallel hash join result
pub struct ParallelJoinResult {
    /// The joined rows
    pub rows: Vec<Row>,
    /// Whether parallel execution was used
    pub parallel_used: bool,
    /// Number of probe rows processed
    pub probe_rows_processed: usize,
    /// Number of build rows
    pub build_rows_count: usize,
    /// Number of matches found
    pub matches_found: usize,
}

/// Sequential probe phase for hash join (used when parallel is disabled or dataset is small)
#[allow(clippy::too_many_arguments)]
fn sequential_probe(
    probe_rows: &[Row],
    build_rows: &[Row],
    hash_table: &ParallelHashTable,
    probe_key_indices: &[usize],
    build_key_indices: &[usize],
    join_type: &JoinType,
    probe_col_count: usize,
    build_col_count: usize,
    swapped: bool,
    build_matched: &Option<BuildMatchedTracker>,
) -> (Vec<Row>, Vec<Row>) {
    let mut matched_rows = Vec::new();
    let needs_unmatched_probe = join_type.needs_unmatched_probe(swapped);

    for probe_row in probe_rows.iter() {
        let hash = hash_row_by_keys(probe_row, probe_key_indices);
        let mut matched = false;

        if let Some(build_indices) = hash_table.get(&hash) {
            for build_idx in build_indices {
                let build_row = &build_rows[build_idx];
                if verify_key_match(probe_row, build_row, probe_key_indices, build_key_indices) {
                    matched = true;
                    if let Some(ref tracker) = build_matched {
                        tracker.mark_matched(build_idx);
                    }
                    let combined = combine_join_rows(
                        probe_row,
                        build_row,
                        probe_col_count,
                        build_col_count,
                        swapped,
                    );
                    matched_rows.push(Row::from_compact_vec(combined));
                }
            }
        }

        if !matched && needs_unmatched_probe {
            let values = combine_with_nulls(probe_row, probe_col_count, build_col_count, swapped);
            matched_rows.push(Row::from_compact_vec(values));
        }
    }

    (matched_rows, Vec::new())
}

/// Execute a complete parallel hash join
///
/// This is the main entry point for parallel hash join execution. It handles:
/// - Parallel hash table build
/// - Parallel probe phase (for INNER joins)
/// - Sequential probe with atomic tracking (for OUTER joins)
/// - Proper NULL handling for unmatched rows
///
/// # Arguments
/// * `probe_rows` - Rows from the probe side (typically larger)
/// * `build_rows` - Rows from the build side (typically smaller)
/// * `probe_key_indices` - Column indices for join keys in probe rows
/// * `build_key_indices` - Column indices for join keys in build rows
/// * `join_type` - Type of join (INNER, LEFT, RIGHT, FULL)
/// * `probe_col_count` - Number of columns in probe rows
/// * `build_col_count` - Number of columns in build rows
/// * `swapped` - Whether left/right were swapped for build side optimization
/// * `config` - Parallel execution configuration
#[allow(clippy::too_many_arguments)]
pub fn parallel_hash_join(
    probe_rows: &[Row],
    build_rows: &[Row],
    probe_key_indices: &[usize],
    build_key_indices: &[usize],
    join_type: JoinType,
    probe_col_count: usize,
    build_col_count: usize,
    swapped: bool,
    config: &ParallelConfig,
) -> ParallelJoinResult {
    let probe_count = probe_rows.len();
    let build_count = build_rows.len();

    // Determine if we should use parallel execution
    #[cfg(feature = "parallel")]
    let use_parallel =
        config.should_parallel_join(build_count) || config.should_parallel_join(probe_count);
    #[cfg(not(feature = "parallel"))]
    let use_parallel = false;

    // Build phase: Create hash table (parallel if large enough)
    let hash_table = parallel_hash_build(build_rows, build_key_indices, config);

    // For OUTER joins, we need to track which build rows were matched
    // Uses Vec<AtomicBool> for both sequential and parallel execution (minimal overhead)
    let build_matched: Option<BuildMatchedTracker> = if join_type.needs_unmatched_build(swapped) {
        Some(BuildMatchedTracker::new(build_count))
    } else {
        None
    };

    // Probe phase
    #[allow(unused_variables)]
    let (matched_rows, unmatched_probe_rows) = {
        #[cfg(feature = "parallel")]
        {
            if use_parallel && join_type == JoinType::Inner {
                // For INNER joins, we can fully parallelize the probe phase
                let matches: Vec<Row> = probe_rows
                    .par_chunks(config.chunk_size.max(1000))
                    .flat_map(|chunk| {
                        let mut local_results = Vec::new();
                        for probe_row in chunk {
                            let hash = hash_row_by_keys(probe_row, probe_key_indices);
                            if let Some(build_indices) = hash_table.get(&hash) {
                                for build_idx in build_indices {
                                    let build_row = &build_rows[build_idx];
                                    if verify_key_match(
                                        probe_row,
                                        build_row,
                                        probe_key_indices,
                                        build_key_indices,
                                    ) {
                                        let combined = combine_join_rows(
                                            probe_row,
                                            build_row,
                                            probe_col_count,
                                            build_col_count,
                                            swapped,
                                        );
                                        local_results.push(Row::from_compact_vec(combined));
                                    }
                                }
                            }
                        }
                        local_results
                    })
                    .collect();
                (matches, Vec::new())
            } else if use_parallel {
                // For OUTER joins with parallel execution, use atomic tracking for build side
                // and collect unmatched probe rows directly in parallel
                let needs_unmatched_probe = join_type.needs_unmatched_probe(swapped);

                // Each chunk returns: (matched_rows, unmatched_probe_rows)
                let chunk_results: Vec<(Vec<Row>, Vec<Row>)> = probe_rows
                    .par_chunks(config.chunk_size.max(1000))
                    .map(|chunk| {
                        let mut matched_results = Vec::new();
                        let mut unmatched_results = Vec::new();

                        for probe_row in chunk.iter() {
                            let mut matched = false;
                            let hash = hash_row_by_keys(probe_row, probe_key_indices);

                            if let Some(build_indices) = hash_table.get(&hash) {
                                for build_idx in build_indices {
                                    let build_row = &build_rows[build_idx];
                                    if verify_key_match(
                                        probe_row,
                                        build_row,
                                        probe_key_indices,
                                        build_key_indices,
                                    ) {
                                        matched = true;
                                        if let Some(ref tracker) = build_matched {
                                            tracker.mark_matched(build_idx);
                                        }
                                        let combined = combine_join_rows(
                                            probe_row,
                                            build_row,
                                            probe_col_count,
                                            build_col_count,
                                            swapped,
                                        );
                                        matched_results.push(Row::from_compact_vec(combined));
                                    }
                                }
                            }

                            if !matched && needs_unmatched_probe {
                                let values = combine_with_nulls(
                                    probe_row,
                                    probe_col_count,
                                    build_col_count,
                                    swapped,
                                );
                                unmatched_results.push(Row::from_compact_vec(values));
                            }
                        }

                        (matched_results, unmatched_results)
                    })
                    .collect();

                let total_matched: usize = chunk_results.iter().map(|(m, _)| m.len()).sum();
                let total_unmatched: usize = chunk_results.iter().map(|(_, u)| u.len()).sum();

                let mut matched_rows = Vec::with_capacity(total_matched);
                let mut unmatched_rows = Vec::with_capacity(total_unmatched);

                for (matched, unmatched) in chunk_results {
                    matched_rows.extend(matched);
                    unmatched_rows.extend(unmatched);
                }

                // CRITICAL: Acquire fence for cross-thread visibility of build_matched[] writes.
                // Parallel probe stores use Release ordering; this fence ensures all stores
                // are visible before the sequential scan of unmatched build rows below.
                std::sync::atomic::fence(Ordering::Acquire);

                (matched_rows, unmatched_rows)
            } else {
                // Sequential execution for small datasets
                sequential_probe(
                    probe_rows,
                    build_rows,
                    &hash_table,
                    probe_key_indices,
                    build_key_indices,
                    &join_type,
                    probe_col_count,
                    build_col_count,
                    swapped,
                    &build_matched,
                )
            }
        }
        #[cfg(not(feature = "parallel"))]
        {
            sequential_probe(
                probe_rows,
                build_rows,
                &hash_table,
                probe_key_indices,
                build_key_indices,
                &join_type,
                probe_col_count,
                build_col_count,
                swapped,
                &build_matched,
            )
        }
    };

    let mut result_rows = matched_rows;
    result_rows.extend(unmatched_probe_rows);

    // Handle unmatched build rows for OUTER joins
    // The Acquire fence at line 794 ensures all parallel stores are visible
    if let Some(ref tracker) = build_matched {
        for (build_idx, build_row) in build_rows.iter().enumerate() {
            if !tracker.was_matched(build_idx) {
                let values =
                    combine_build_with_nulls(build_row, build_col_count, probe_col_count, swapped);
                result_rows.push(Row::from_compact_vec(values));
            }
        }
    }

    let matches_found = result_rows.len();

    ParallelJoinResult {
        rows: result_rows,
        parallel_used: use_parallel,
        probe_rows_processed: probe_count,
        build_rows_count: build_count,
        matches_found,
    }
}

/// Combine probe and build rows into a single row, respecting swap order
#[inline]
fn combine_join_rows(
    probe_row: &Row,
    build_row: &Row,
    probe_col_count: usize,
    build_col_count: usize,
    swapped: bool,
) -> CompactVec<Value> {
    // Use CompactVec directly to avoid Vec→CompactVec conversion in Row::from_values
    let mut combined: CompactVec<Value> =
        CompactVec::with_capacity(probe_col_count + build_col_count);
    if swapped {
        // Build was originally left, probe was originally right
        for i in 0..build_col_count {
            combined.push(build_row.get(i).cloned().unwrap_or(NULL_VALUE));
        }
        for i in 0..probe_col_count {
            combined.push(probe_row.get(i).cloned().unwrap_or(NULL_VALUE));
        }
    } else {
        // Probe is left, build is right
        for i in 0..probe_col_count {
            combined.push(probe_row.get(i).cloned().unwrap_or(NULL_VALUE));
        }
        for i in 0..build_col_count {
            combined.push(build_row.get(i).cloned().unwrap_or(NULL_VALUE));
        }
    }
    combined
}

/// Combine probe row with NULLs for unmatched probe side in OUTER joins
#[inline]
fn combine_with_nulls(
    probe_row: &Row,
    probe_col_count: usize,
    build_col_count: usize,
    swapped: bool,
) -> CompactVec<Value> {
    // Use CompactVec directly to avoid Vec→CompactVec conversion in Row::from_values
    let mut combined: CompactVec<Value> =
        CompactVec::with_capacity(probe_col_count + build_col_count);
    if swapped {
        // Build (left) is NULL, probe (right) has values
        combined.extend(std::iter::repeat_n(NULL_VALUE, build_col_count));
        for i in 0..probe_col_count {
            combined.push(probe_row.get(i).cloned().unwrap_or(NULL_VALUE));
        }
    } else {
        // Probe (left) has values, build (right) is NULL
        for i in 0..probe_col_count {
            combined.push(probe_row.get(i).cloned().unwrap_or(NULL_VALUE));
        }
        combined.extend(std::iter::repeat_n(NULL_VALUE, build_col_count));
    }
    combined
}

/// Combine build row with NULLs for unmatched build side in OUTER joins
#[inline]
fn combine_build_with_nulls(
    build_row: &Row,
    build_col_count: usize,
    probe_col_count: usize,
    swapped: bool,
) -> CompactVec<Value> {
    // Use CompactVec directly to avoid Vec→CompactVec conversion in Row::from_values
    let mut combined: CompactVec<Value> =
        CompactVec::with_capacity(probe_col_count + build_col_count);
    if swapped {
        // Build (left) has values, probe (right) is NULL
        for i in 0..build_col_count {
            combined.push(build_row.get(i).cloned().unwrap_or(NULL_VALUE));
        }
        combined.extend(std::iter::repeat_n(NULL_VALUE, probe_col_count));
    } else {
        // Probe (left) is NULL, build (right) has values
        combined.extend(std::iter::repeat_n(NULL_VALUE, probe_col_count));
        for i in 0..build_col_count {
            combined.push(build_row.get(i).cloned().unwrap_or(NULL_VALUE));
        }
    }
    combined
}

// ============================================================================
// Parallel ORDER BY
// ============================================================================

/// Sort direction for ORDER BY
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SortDirection {
    Ascending,
    Descending,
}

/// Sort specification for a single column
#[derive(Clone, Debug)]
pub struct SortSpec {
    /// Column index to sort by
    pub column_index: usize,
    /// Sort direction (ASC or DESC)
    pub direction: SortDirection,
    /// NULLS FIRST or NULLS LAST
    pub nulls_first: bool,
}

/// Parallel ORDER BY execution
///
/// Sorts rows in parallel using rayon's par_sort when the dataset is large enough.
/// Supports multi-column sorting with mixed ASC/DESC directions and NULL handling.
///
/// # Performance
///
/// For 100K rows:
/// - Sequential: ~50ms
/// - Parallel (8 cores): ~15ms (3x speedup)
pub fn parallel_order_by(
    rows: &mut [(i64, Row)],
    sort_specs: &[SortSpec],
    config: &ParallelConfig,
) {
    let compare = |(_, a): &(i64, Row), (_, b): &(i64, Row)| -> std::cmp::Ordering {
        for spec in sort_specs {
            let a_val = a.get(spec.column_index);
            let b_val = b.get(spec.column_index);

            // Check for NULL values (either missing column or Value::Null)
            let a_is_null = a_val.map(|v| v.is_null()).unwrap_or(true);
            let b_is_null = b_val.map(|v| v.is_null()).unwrap_or(true);

            let ordering = match (a_is_null, b_is_null) {
                (true, true) => std::cmp::Ordering::Equal,
                (true, false) => {
                    if spec.nulls_first {
                        std::cmp::Ordering::Less
                    } else {
                        std::cmp::Ordering::Greater
                    }
                }
                (false, true) => {
                    if spec.nulls_first {
                        std::cmp::Ordering::Greater
                    } else {
                        std::cmp::Ordering::Less
                    }
                }
                (false, false) => {
                    // Both non-null, compare values
                    let a_v = a_val.unwrap();
                    let b_v = b_val.unwrap();
                    a_v.partial_cmp(b_v).unwrap_or(std::cmp::Ordering::Equal)
                }
            };

            let ordering = if spec.direction == SortDirection::Descending {
                ordering.reverse()
            } else {
                ordering
            };

            if ordering != std::cmp::Ordering::Equal {
                return ordering;
            }
        }
        std::cmp::Ordering::Equal
    };

    #[cfg(feature = "parallel")]
    if config.should_parallel_sort(rows.len()) {
        rows.par_sort_unstable_by(compare);
        return;
    }
    let _ = config; // suppress unused warning when parallel feature is disabled
    rows.sort_unstable_by(compare);
}

/// Parallel ORDER BY with a custom comparator function
///
/// More flexible version that accepts any comparison function.
/// Uses unstable sort for better performance.
pub fn parallel_order_by_fn<F>(rows: &mut [(i64, Row)], compare: F, config: &ParallelConfig)
where
    F: Fn(&Row, &Row) -> std::cmp::Ordering + Sync + Send,
{
    #[cfg(feature = "parallel")]
    if config.should_parallel_sort(rows.len()) {
        rows.par_sort_unstable_by(|(_, a), (_, b)| compare(a, b));
        return;
    }
    let _ = config; // suppress unused warning when parallel feature is disabled
    rows.sort_unstable_by(|(_, a), (_, b)| compare(a, b));
}

/// Parallel ORDER BY with unstable sort (faster, but doesn't preserve order of equal elements)
pub fn parallel_order_by_unstable<F>(rows: &mut [(i64, Row)], compare: F, config: &ParallelConfig)
where
    F: Fn(&Row, &Row) -> std::cmp::Ordering + Sync + Send,
{
    #[cfg(feature = "parallel")]
    if config.should_parallel_sort(rows.len()) {
        rows.par_sort_unstable_by(|(_, a), (_, b)| compare(a, b));
        return;
    }
    let _ = config; // suppress unused warning when parallel feature is disabled
    rows.sort_unstable_by(|(_, a), (_, b)| compare(a, b));
}

/// Parallel filter with statistics collection
///
/// CRITICAL: This function now returns Result to properly propagate compilation errors.
pub fn parallel_filter_with_stats(
    rows: RowVec,
    filter_expr: &Expression,
    columns: &[String],
    function_registry: &FunctionRegistry,
    config: &ParallelConfig,
    ctx: &ExecutionContext,
) -> Result<(RowVec, ParallelStats)> {
    let row_count = rows.len();
    let parallel_used = config.should_parallel_filter(row_count);

    let n_threads = num_threads();
    let chunk_size = if parallel_used {
        config.chunk_size.max(row_count / (n_threads * 4)).max(512)
    } else {
        row_count // Single chunk for sequential
    };

    let chunks_used = row_count.div_ceil(chunk_size);

    // CRITICAL: Propagate errors with ? instead of silently swallowing them
    let result = parallel_filter(rows, filter_expr, columns, function_registry, config, ctx)?;

    let stats = ParallelStats {
        rows_processed: row_count,
        rows_passed: result.len(),
        chunks_used,
        parallel_used,
    };

    Ok((result, stats))
}

/// Distance metric for vector search
#[derive(Debug, Clone, Copy)]
pub enum DistanceMetric {
    L2,
    Cosine,
    InnerProduct,
}

/// Parallel brute-force k-NN vector search
///
/// Fuses distance computation + top-K heap selection into parallel chunks,
/// then merges chunk heaps. Zero-copy on Extension(Vector) values.
///
/// Returns (row_id, row, distance) sorted by distance (ascending).
pub fn parallel_topn_vector_search(
    rows: RowVec,
    vector_col_idx: usize,
    query_bytes: &[u8],
    k: usize,
    metric: DistanceMetric,
    config: &ParallelConfig,
) -> Vec<(i64, Row, f64)> {
    use std::collections::BinaryHeap;

    if k == 0 || rows.is_empty() {
        return Vec::new();
    }

    let distance_fn: fn(&[u8], &[u8]) -> f64 = match metric {
        DistanceMetric::L2 => crate::functions::scalar::vector::l2_distance_bytes,
        DistanceMetric::Cosine => crate::functions::scalar::vector::cosine_distance_bytes,
        DistanceMetric::InnerProduct => crate::functions::scalar::vector::ip_distance_bytes,
    };

    // Extract vector bytes from a row value — zero-copy for Extension(Vector)
    #[inline]
    fn get_vector_bytes(row: &Row, col_idx: usize) -> Option<&[u8]> {
        match row.get(col_idx)? {
            Value::Extension(data)
                if data.first() == Some(&(crate::core::DataType::Vector as u8)) =>
            {
                Some(&data[1..])
            }
            _ => None,
        }
    }

    /// Max-heap entry: highest distance at top so we can efficiently evict the worst
    struct HeapEntry {
        distance: f64,
        idx: usize, // index into the chunk's collected (row_id, row) vec
    }

    impl PartialEq for HeapEntry {
        fn eq(&self, other: &Self) -> bool {
            self.distance == other.distance
        }
    }
    impl Eq for HeapEntry {}
    impl PartialOrd for HeapEntry {
        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
            Some(self.cmp(other))
        }
    }
    impl Ord for HeapEntry {
        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
            self.distance.total_cmp(&other.distance)
        }
    }

    let row_vec: Vec<(i64, Row)> = rows.into_vec();

    #[cfg(feature = "parallel")]
    let use_parallel = config.should_parallel_filter(row_vec.len());
    #[cfg(not(feature = "parallel"))]
    let use_parallel = false;
    let _ = config;

    if use_parallel {
        #[cfg(feature = "parallel")]
        {
            let n_threads = num_threads();
            let chunk_size = (row_vec.len() / (n_threads * 4)).max(1024);

            // Each chunk computes top-K locally, returns (row_id, row, distance)
            let chunk_results: Vec<Vec<(i64, Row, f64)>> = row_vec
                .into_par_iter()
                .chunks(chunk_size)
                .map(|chunk| {
                    let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::with_capacity(k + 1);
                    let mut entries: Vec<(i64, Row, f64)> = Vec::with_capacity(k + 1);

                    for (row_id, row) in chunk {
                        let dist = if let Some(vec_bytes) = get_vector_bytes(&row, vector_col_idx) {
                            if vec_bytes.len() == query_bytes.len() {
                                distance_fn(vec_bytes, query_bytes)
                            } else {
                                f64::INFINITY // Dimension mismatch → sort to end
                            }
                        } else {
                            f64::INFINITY // No vector → sort to end
                        };
                        if entries.len() < k {
                            let idx = entries.len();
                            entries.push((row_id, row, dist));
                            heap.push(HeapEntry {
                                distance: dist,
                                idx,
                            });
                        } else if let Some(worst) = heap.peek() {
                            if dist < worst.distance {
                                let evict_idx = worst.idx;
                                heap.pop();
                                entries[evict_idx] = (row_id, row, dist);
                                heap.push(HeapEntry {
                                    distance: dist,
                                    idx: evict_idx,
                                });
                            }
                        }
                    }

                    // Return only the live entries (some slots may have been reused)
                    let live_indices: FxHashSet<usize> = heap.into_iter().map(|e| e.idx).collect();
                    entries
                        .into_iter()
                        .enumerate()
                        .filter_map(|(i, e)| {
                            if live_indices.contains(&i) {
                                Some(e)
                            } else {
                                None
                            }
                        })
                        .collect()
                })
                .collect();

            // Merge: collect all chunk results, take global top-K
            let mut merged: Vec<(i64, Row, f64)> = chunk_results.into_iter().flatten().collect();
            merged.sort_unstable_by(|a, b| a.2.total_cmp(&b.2));
            merged.truncate(k);
            return merged;
        }
    }

    // Sequential fallback
    let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::with_capacity(k + 1);
    let mut entries: Vec<(i64, Row, f64)> = Vec::with_capacity(k + 1);

    for (row_id, row) in row_vec {
        let dist = if let Some(vec_bytes) = get_vector_bytes(&row, vector_col_idx) {
            if vec_bytes.len() == query_bytes.len() {
                distance_fn(vec_bytes, query_bytes)
            } else {
                f64::INFINITY // Dimension mismatch → sort to end
            }
        } else {
            f64::INFINITY // No vector → sort to end
        };
        if entries.len() < k {
            let idx = entries.len();
            entries.push((row_id, row, dist));
            heap.push(HeapEntry {
                distance: dist,
                idx,
            });
        } else if let Some(worst) = heap.peek() {
            if dist < worst.distance {
                let evict_idx = worst.idx;
                heap.pop();
                entries[evict_idx] = (row_id, row, dist);
                heap.push(HeapEntry {
                    distance: dist,
                    idx: evict_idx,
                });
            }
        }
    }

    let live_indices: FxHashSet<usize> = heap.into_iter().map(|e| e.idx).collect();
    let mut result: Vec<(i64, Row, f64)> = entries
        .into_iter()
        .enumerate()
        .filter_map(|(i, e)| {
            if live_indices.contains(&i) {
                Some(e)
            } else {
                None
            }
        })
        .collect();
    result.sort_unstable_by(|a, b| a.2.total_cmp(&b.2));
    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::Value;

    fn make_test_rows(count: usize) -> RowVec {
        (0..count)
            .map(|i| {
                (
                    i as i64,
                    Row::from_values(vec![
                        Value::Integer(i as i64),
                        Value::Integer(i as i64 % 10),
                    ]),
                )
            })
            .collect()
    }

    #[test]
    fn test_parallel_distinct() {
        // Create rows with duplicates
        let mut rows = RowVec::new();
        for i in 0i64..1000 {
            // Each value appears 10 times
            rows.push((i, Row::from_values(vec![Value::Integer(i % 100)])));
        }

        let config = ParallelConfig {
            min_rows_for_parallel_filter: 100, // Lower threshold for test
            ..Default::default()
        };

        let result = parallel_distinct(rows, &config);

        // Should have 100 unique values
        assert_eq!(result.len(), 100);
    }

    #[test]
    fn test_parallel_sort() {
        let mut rows: RowVec = (0..1000)
            .rev()
            .map(|i| (i, Row::from_values(vec![Value::Integer(i)])))
            .collect();

        let config = ParallelConfig {
            min_rows_for_parallel_sort: 100,
            ..Default::default()
        };

        parallel_sort(
            &mut rows,
            |a, b| {
                let a_val = a.get(0).and_then(|v| v.as_int64()).unwrap_or(0);
                let b_val = b.get(0).and_then(|v| v.as_int64()).unwrap_or(0);
                a_val.cmp(&b_val)
            },
            &config,
        );

        // Verify sorted
        for (i, (_, row)) in rows.iter().enumerate() {
            assert_eq!(row.get(0), Some(&Value::Integer(i as i64)));
        }
    }

    #[test]
    fn test_parallel_config_thresholds() {
        let config = ParallelConfig::default();

        assert!(!config.should_parallel_filter(1000)); // Below threshold
        assert!(config.should_parallel_filter(20_000)); // Above threshold

        assert!(!config.should_parallel_sort(10_000)); // Below threshold
        assert!(config.should_parallel_sort(100_000)); // Above threshold

        let disabled = ParallelConfig::disabled();
        assert!(!disabled.should_parallel_filter(1_000_000)); // Always false when disabled
    }

    #[test]
    fn test_parallel_filter_owned() {
        let rows = make_test_rows(50_000);
        let config = ParallelConfig::default();

        // Filter rows where second column (value) < 5
        let result = parallel_filter_owned(
            rows,
            |row| {
                if let Some(Value::Integer(v)) = row.get(1) {
                    *v < 5
                } else {
                    false
                }
            },
            &config,
        );

        // 50% selectivity: values 0-4 pass out of 0-9
        assert_eq!(result.len(), 25_000);
    }

    #[test]
    fn test_sequential_fallback_small_dataset() {
        let rows = make_test_rows(100); // Below threshold
        let config = ParallelConfig::default();

        // Should use sequential path
        let result = parallel_filter_owned(
            rows,
            |row| {
                if let Some(Value::Integer(v)) = row.get(1) {
                    *v < 5
                } else {
                    false
                }
            },
            &config,
        );

        // 50% selectivity
        assert_eq!(result.len(), 50);
    }

    #[test]
    fn test_parallel_hash_build() {
        // Build side: 10K rows with id as key
        let build_rows: Vec<Row> = (0..10_000)
            .map(|i| {
                Row::from_values(vec![
                    Value::Integer(i),
                    Value::Text(format!("build_{}", i).into()),
                ])
            })
            .collect();

        let config = ParallelConfig {
            min_rows_for_parallel_join: 1000,
            ..Default::default()
        };

        let hash_table = parallel_hash_build(&build_rows, &[0], &config);

        assert_eq!(hash_table.row_count, 10_000);
        // Verify some lookups work
        let test_hash = hash_row_by_keys(&build_rows[500], &[0]);
        assert!(hash_table.get(&test_hash).is_some());
    }

    #[test]
    fn test_parallel_hash_probe() {
        // Build side: 5K rows
        let build_rows: Vec<Row> = (0..5_000)
            .map(|i| {
                Row::from_values(vec![
                    Value::Integer(i),
                    Value::Text(format!("build_{}", i).into()),
                ])
            })
            .collect();

        // Probe side: 10K rows, half will match (even numbers only)
        let probe_rows: Vec<Row> = (0..10_000)
            .map(|i| {
                Row::from_values(vec![
                    Value::Integer(i * 2 % 5_000), // Maps to 0-4999 with duplicates
                    Value::Text(format!("probe_{}", i).into()),
                ])
            })
            .collect();

        let config = ParallelConfig {
            min_rows_for_parallel_join: 1000,
            ..Default::default()
        };

        // Build hash table
        let hash_table = parallel_hash_build(&build_rows, &[0], &config);

        // Probe
        let matches = parallel_hash_probe(
            &probe_rows,
            &[0],
            &hash_table,
            &build_rows,
            |probe, build| {
                // Verify actual equality
                probe.get(0) == build.get(0)
            },
            &config,
        );

        // Each probe row should match exactly one build row
        // 10K probes, each maps to one of 5K build rows
        assert_eq!(matches.len(), 10_000);
    }

    #[test]
    fn test_verify_key_match() {
        let row1 = Row::from_values(vec![Value::Integer(1), Value::Text("a".to_string().into())]);
        let row2 = Row::from_values(vec![Value::Integer(1), Value::Text("b".to_string().into())]);
        let row3 = Row::from_values(vec![Value::Integer(2), Value::Text("a".to_string().into())]);

        // Same key column 0
        assert!(verify_key_match(&row1, &row2, &[0], &[0]));

        // Different key column 0
        assert!(!verify_key_match(&row1, &row3, &[0], &[0]));

        // Same value in column 1
        assert!(verify_key_match(&row1, &row3, &[1], &[1]));
    }

    #[test]
    fn test_parallel_order_by() {
        // Create rows with random-ish order
        let mut rows: RowVec = (0..1000)
            .map(|i| {
                (
                    i,
                    Row::from_values(vec![
                        Value::Integer((i * 7 + 13) % 1000), // Scrambled order
                        Value::Text(format!("row_{}", i).into()),
                    ]),
                )
            })
            .collect();

        let config = ParallelConfig {
            min_rows_for_parallel_sort: 100,
            ..Default::default()
        };

        let sort_specs = vec![SortSpec {
            column_index: 0,
            direction: SortDirection::Ascending,
            nulls_first: false,
        }];

        parallel_order_by(&mut rows, &sort_specs, &config);

        // Verify sorted ascending
        for i in 1..rows.len() {
            let prev = rows[i - 1].1.get(0).and_then(|v| v.as_int64()).unwrap();
            let curr = rows[i].1.get(0).and_then(|v| v.as_int64()).unwrap();
            assert!(prev <= curr, "Row {} should be <= row {}", i - 1, i);
        }
    }

    #[test]
    fn test_parallel_order_by_descending() {
        let mut rows: RowVec = (0..500)
            .map(|i| (i, Row::from_values(vec![Value::Integer(i)])))
            .collect();

        let config = ParallelConfig {
            min_rows_for_parallel_sort: 100,
            ..Default::default()
        };

        let sort_specs = vec![SortSpec {
            column_index: 0,
            direction: SortDirection::Descending,
            nulls_first: false,
        }];

        parallel_order_by(&mut rows, &sort_specs, &config);

        // Verify sorted descending
        for i in 1..rows.len() {
            let prev = rows[i - 1].1.get(0).and_then(|v| v.as_int64()).unwrap();
            let curr = rows[i].1.get(0).and_then(|v| v.as_int64()).unwrap();
            assert!(prev >= curr, "Row {} should be >= row {}", i - 1, i);
        }
    }

    #[test]
    fn test_parallel_order_by_with_nulls() {
        let mut rows = RowVec::new();
        rows.push((0, Row::from_values(vec![Value::Integer(3)])));
        rows.push((1, Row::from_values(vec![Value::null_unknown()])));
        rows.push((2, Row::from_values(vec![Value::Integer(1)])));
        rows.push((3, Row::from_values(vec![Value::null_unknown()])));
        rows.push((4, Row::from_values(vec![Value::Integer(2)])));

        let config = ParallelConfig {
            min_rows_for_parallel_sort: 1, // Force parallel for test
            ..Default::default()
        };

        // NULLS FIRST
        let sort_specs = vec![SortSpec {
            column_index: 0,
            direction: SortDirection::Ascending,
            nulls_first: true,
        }];

        parallel_order_by(&mut rows, &sort_specs, &config);

        // First two should be NULL
        assert!(rows[0].1.get(0).map(|v| v.is_null()).unwrap_or(false));
        assert!(rows[1].1.get(0).map(|v| v.is_null()).unwrap_or(false));
        // Then 1, 2, 3
        assert_eq!(rows[2].1.get(0), Some(&Value::Integer(1)));
        assert_eq!(rows[3].1.get(0), Some(&Value::Integer(2)));
        assert_eq!(rows[4].1.get(0), Some(&Value::Integer(3)));
    }

    // ========================================================================
    // Edge Case Tests for Hash Collisions and OUTER Joins
    // ========================================================================

    /// Test that DISTINCT handles hash collisions correctly
    /// Two different rows with the same hash should both be preserved
    #[test]
    fn test_distinct_hash_collision_handling() {
        // Create rows that might have hash collisions by having different values
        // that could theoretically hash to the same value
        let mut rows = RowVec::new();
        rows.push((
            0,
            Row::from_values(vec![Value::Integer(1), Value::Text("a".into())]),
        ));
        rows.push((
            1,
            Row::from_values(vec![Value::Integer(1), Value::Text("b".into())]),
        )); // Different from above
        rows.push((
            2,
            Row::from_values(vec![Value::Integer(1), Value::Text("a".into())]),
        )); // Duplicate of first
        rows.push((
            3,
            Row::from_values(vec![Value::Integer(2), Value::Text("a".into())]),
        )); // Different from all
        rows.push((
            4,
            Row::from_values(vec![Value::Integer(2), Value::Text("a".into())]),
        )); // Duplicate

        let config = ParallelConfig {
            min_rows_for_parallel_filter: 1, // Force parallel path
            ..Default::default()
        };

        let result = parallel_distinct(rows, &config);

        // Should have 3 unique rows:
        // (1, "a"), (1, "b"), (2, "a")
        assert_eq!(result.len(), 3, "Should have 3 unique rows");

        // Verify all three distinct combinations are present
        let has_1_a = result.iter().any(|(_, r)| {
            r.get(0) == Some(&Value::Integer(1)) && r.get(1) == Some(&Value::Text("a".into()))
        });
        let has_1_b = result.iter().any(|(_, r)| {
            r.get(0) == Some(&Value::Integer(1)) && r.get(1) == Some(&Value::Text("b".into()))
        });
        let has_2_a = result.iter().any(|(_, r)| {
            r.get(0) == Some(&Value::Integer(2)) && r.get(1) == Some(&Value::Text("a".into()))
        });

        assert!(has_1_a, "Should contain (1, 'a')");
        assert!(has_1_b, "Should contain (1, 'b')");
        assert!(has_2_a, "Should contain (2, 'a')");
    }

    /// Test sequential distinct also handles hash collisions
    #[test]
    fn test_sequential_distinct_hash_collision() {
        let mut rows = RowVec::new();
        rows.push((0, Row::from_values(vec![Value::Integer(100)])));
        rows.push((1, Row::from_values(vec![Value::Integer(200)])));
        rows.push((2, Row::from_values(vec![Value::Integer(100)]))); // Duplicate
        rows.push((3, Row::from_values(vec![Value::Integer(300)])));

        // Use high threshold to force sequential path
        let config = ParallelConfig {
            min_rows_for_parallel_filter: 10000,
            ..Default::default()
        };

        let result = parallel_distinct(rows, &config);
        assert_eq!(
            result.len(),
            3,
            "Should have 3 unique values: 100, 200, 300"
        );
    }

    /// Test parallel hash join with hash collisions on join keys
    #[test]
    fn test_parallel_hash_join_collision_handling() {
        // Build side: rows with varying second columns but same join key
        let build_rows: Vec<Row> = vec![
            Row::from_values(vec![Value::Integer(1), Value::Text("build_a".into())]),
            Row::from_values(vec![Value::Integer(2), Value::Text("build_b".into())]),
            Row::from_values(vec![Value::Integer(3), Value::Text("build_c".into())]),
        ];

        // Probe side: rows that should match build side
        let probe_rows: Vec<Row> = vec![
            Row::from_values(vec![Value::Integer(1), Value::Text("probe_x".into())]),
            Row::from_values(vec![Value::Integer(2), Value::Text("probe_y".into())]),
            Row::from_values(vec![Value::Integer(4), Value::Text("probe_z".into())]), // No match
        ];

        let config = ParallelConfig {
            min_rows_for_parallel_join: 1, // Force parallel path
            ..Default::default()
        };

        // INNER JOIN on first column
        let result = parallel_hash_join(
            &probe_rows,
            &build_rows,
            &[0], // probe key
            &[0], // build key
            JoinType::Inner,
            2, // probe col count
            2, // build col count
            false,
            &config,
        );

        // Should have 2 matches (id=1 and id=2)
        assert_eq!(result.rows.len(), 2, "INNER JOIN should have 2 matches");

        // Verify the joined rows have correct values
        for row in &result.rows {
            // Combined row should have 4 columns (2 from probe + 2 from build)
            assert_eq!(row.len(), 4);
        }
    }

    /// Test LEFT OUTER join with unmatched probe rows
    #[test]
    fn test_parallel_left_join_unmatched() {
        let build_rows: Vec<Row> = vec![Row::from_values(vec![
            Value::Integer(1),
            Value::Text("match".into()),
        ])];

        let probe_rows: Vec<Row> = vec![
            Row::from_values(vec![Value::Integer(1), Value::Text("p1".into())]), // Matches
            Row::from_values(vec![Value::Integer(2), Value::Text("p2".into())]), // No match
            Row::from_values(vec![Value::Integer(3), Value::Text("p3".into())]), // No match
        ];

        let config = ParallelConfig {
            min_rows_for_parallel_join: 1,
            ..Default::default()
        };

        let result = parallel_hash_join(
            &probe_rows,
            &build_rows,
            &[0],
            &[0],
            JoinType::Left,
            2,
            2,
            false,
            &config,
        );

        // Should have 3 rows: 1 matched + 2 unmatched with NULL build columns
        assert_eq!(result.rows.len(), 3, "LEFT JOIN should have 3 rows");

        // Count rows with NULL in build columns (last 2 columns)
        let null_count = result
            .rows
            .iter()
            .filter(|r| {
                r.get(2).map(|v| v.is_null()).unwrap_or(false)
                    && r.get(3).map(|v| v.is_null()).unwrap_or(false)
            })
            .count();
        assert_eq!(
            null_count, 2,
            "Should have 2 unmatched rows with NULL build columns"
        );
    }

    /// Test RIGHT OUTER join with unmatched build rows
    #[test]
    fn test_parallel_right_join_unmatched() {
        let build_rows: Vec<Row> = vec![
            Row::from_values(vec![Value::Integer(1), Value::Text("b1".into())]), // Matches
            Row::from_values(vec![Value::Integer(2), Value::Text("b2".into())]), // No match
            Row::from_values(vec![Value::Integer(3), Value::Text("b3".into())]), // No match
        ];

        let probe_rows: Vec<Row> = vec![Row::from_values(vec![
            Value::Integer(1),
            Value::Text("p1".into()),
        ])];

        let config = ParallelConfig {
            min_rows_for_parallel_join: 1,
            ..Default::default()
        };

        let result = parallel_hash_join(
            &probe_rows,
            &build_rows,
            &[0],
            &[0],
            JoinType::Right,
            2,
            2,
            false,
            &config,
        );

        // Should have 3 rows: 1 matched + 2 unmatched with NULL probe columns
        assert_eq!(result.rows.len(), 3, "RIGHT JOIN should have 3 rows");

        // Count rows with NULL in probe columns (first 2 columns)
        let null_count = result
            .rows
            .iter()
            .filter(|r| {
                r.get(0).map(|v| v.is_null()).unwrap_or(false)
                    && r.get(1).map(|v| v.is_null()).unwrap_or(false)
            })
            .count();
        assert_eq!(
            null_count, 2,
            "Should have 2 unmatched rows with NULL probe columns"
        );
    }

    /// Test FULL OUTER join with unmatched rows on both sides
    #[test]
    fn test_parallel_full_outer_join() {
        let build_rows: Vec<Row> = vec![
            Row::from_values(vec![Value::Integer(1), Value::Text("b1".into())]), // Matches
            Row::from_values(vec![Value::Integer(3), Value::Text("b3".into())]), // No match
        ];

        let probe_rows: Vec<Row> = vec![
            Row::from_values(vec![Value::Integer(1), Value::Text("p1".into())]), // Matches
            Row::from_values(vec![Value::Integer(2), Value::Text("p2".into())]), // No match
        ];

        let config = ParallelConfig {
            min_rows_for_parallel_join: 1,
            ..Default::default()
        };

        let result = parallel_hash_join(
            &probe_rows,
            &build_rows,
            &[0],
            &[0],
            JoinType::Full,
            2,
            2,
            false,
            &config,
        );

        // Should have 3 rows:
        // 1 matched (id=1)
        // 1 unmatched probe (id=2, build NULL)
        // 1 unmatched build (id=3, probe NULL)
        assert_eq!(result.rows.len(), 3, "FULL OUTER JOIN should have 3 rows");
    }

    /// Test join with empty tables
    #[test]
    fn test_parallel_join_empty_tables() {
        let config = ParallelConfig::default();

        // Empty probe
        let result = parallel_hash_join(
            &[],
            &[Row::from_values(vec![Value::Integer(1)])],
            &[0],
            &[0],
            JoinType::Inner,
            1,
            1,
            false,
            &config,
        );
        assert_eq!(
            result.rows.len(),
            0,
            "Empty probe should give empty result for INNER"
        );

        // Empty build
        let result = parallel_hash_join(
            &[Row::from_values(vec![Value::Integer(1)])],
            &[],
            &[0],
            &[0],
            JoinType::Inner,
            1,
            1,
            false,
            &config,
        );
        assert_eq!(
            result.rows.len(),
            0,
            "Empty build should give empty result for INNER"
        );

        // LEFT JOIN with empty build should preserve all probe rows
        let result = parallel_hash_join(
            &[
                Row::from_values(vec![Value::Integer(1)]),
                Row::from_values(vec![Value::Integer(2)]),
            ],
            &[],
            &[0],
            &[0],
            JoinType::Left,
            1,
            1,
            false,
            &config,
        );
        assert_eq!(
            result.rows.len(),
            2,
            "LEFT JOIN with empty build should have all probe rows"
        );
    }

    /// Test join with swapped build/probe sides
    /// When swapped=true, the roles of probe and build are swapped, but the join type
    /// semantics stay the same. LEFT JOIN with swapped=true means:
    /// - Build side is actually "left" in the original query
    /// - Probe side is "right"
    /// - LEFT JOIN needs unmatched LEFT (build) rows, not probe rows
    #[test]
    fn test_parallel_join_swapped() {
        // When swapped=true:
        // - build_rows (1 row) = original left side
        // - probe_rows (2 rows) = original right side
        let build_rows: Vec<Row> = vec![
            Row::from_values(vec![Value::Integer(1), Value::Text("b1".into())]),
            Row::from_values(vec![Value::Integer(3), Value::Text("b3".into())]), // Unmatched left
        ];

        let probe_rows: Vec<Row> = vec![
            Row::from_values(vec![Value::Integer(1), Value::Text("p1".into())]),
            Row::from_values(vec![Value::Integer(2), Value::Text("p2".into())]), // Unmatched right
        ];

        let config = ParallelConfig {
            min_rows_for_parallel_join: 1,
            ..Default::default()
        };

        // LEFT JOIN with swapped=true:
        // - Build is "left", so unmatched build rows need NULL right columns
        // - Probe is "right", so unmatched probe rows are NOT included (LEFT JOIN only keeps left)
        let result = parallel_hash_join(
            &probe_rows,
            &build_rows,
            &[0],
            &[0],
            JoinType::Left,
            2,
            2,
            true, // swapped: build=left, probe=right
            &config,
        );

        // Should have 2 rows:
        // 1 matched row (id=1)
        // 1 unmatched build row (id=3) with NULL probe columns
        assert_eq!(result.rows.len(), 2, "LEFT JOIN swapped should have 2 rows");

        // Verify column order is correct (build columns first when swapped)
        // Row structure: [build_col0, build_col1, probe_col0, probe_col1]
        let matched_row = result
            .rows
            .iter()
            .find(|r| r.get(0) == Some(&Value::Integer(1)) && r.get(2) == Some(&Value::Integer(1)));
        assert!(matched_row.is_some(), "Should have a matched row with id=1");

        // Verify unmatched build row has NULL in probe columns
        let unmatched_row = result
            .rows
            .iter()
            .find(|r| r.get(0) == Some(&Value::Integer(3)));
        assert!(
            unmatched_row.is_some(),
            "Should have unmatched build row with id=3"
        );
        let unmatched = unmatched_row.unwrap();
        assert!(
            unmatched.get(2).map(|v| v.is_null()).unwrap_or(false),
            "Probe col should be NULL"
        );
        assert!(
            unmatched.get(3).map(|v| v.is_null()).unwrap_or(false),
            "Probe col should be NULL"
        );
    }

    /// Test rows_equal function directly
    #[test]
    fn test_rows_equal() {
        let row1 = Row::from_values(vec![Value::Integer(1), Value::Text("a".into())]);
        let row2 = Row::from_values(vec![Value::Integer(1), Value::Text("a".into())]);
        let row3 = Row::from_values(vec![Value::Integer(1), Value::Text("b".into())]);
        let row4 = Row::from_values(vec![Value::Integer(1)]);

        assert!(rows_equal(&row1, &row2), "Identical rows should be equal");
        assert!(
            !rows_equal(&row1, &row3),
            "Different values should not be equal"
        );
        assert!(
            !rows_equal(&row1, &row4),
            "Different lengths should not be equal"
        );
    }

    /// Test rows_equal with NULL values
    #[test]
    fn test_rows_equal_with_nulls() {
        let row_with_null1 = Row::from_values(vec![Value::Integer(1), Value::null_unknown()]);
        let row_with_null2 = Row::from_values(vec![Value::Integer(1), Value::null_unknown()]);
        let row_no_null = Row::from_values(vec![Value::Integer(1), Value::Integer(2)]);

        // NULL == NULL in this context (for deduplication purposes)
        assert!(
            rows_equal(&row_with_null1, &row_with_null2),
            "Rows with same NULL positions should be equal"
        );
        assert!(
            !rows_equal(&row_with_null1, &row_no_null),
            "NULL should not equal non-NULL"
        );
    }
}