buoyant_kernel 0.21.101

Buoyant Data distribution of delta-kernel
//! Functionality to create and execute scans (reads) over data stored in a delta table

use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, LazyLock};
use std::time::Instant;

use delta_kernel_derive::internal_api;
use itertools::Itertools;
use tracing::{debug, info};
use url::Url;

use crate::metrics::MetricId;
use crate::scan::metrics::ScanMetrics;
use crate::utils::IteratorExt;

use self::data_skipping::as_checkpoint_skipping_predicate;
use self::log_replay::get_scan_metadata_transform_expr;
use crate::actions::deletion_vector::{
    deletion_treemap_to_bools, split_vector, DeletionVectorDescriptor,
};
use crate::actions::{get_commit_schema, Add, ADD_NAME, REMOVE_NAME};
use crate::engine_data::FilteredEngineData;
use crate::expressions::{ColumnName, ExpressionRef, Predicate, PredicateRef, Scalar};
use crate::kernel_predicates::{
    DefaultKernelPredicateEvaluator, EmptyColumnResolver, KernelPredicateEvaluator as _,
};
use crate::log_replay::{ActionsBatch, HasSelectionVector};
use crate::log_segment::{ActionsWithCheckpointInfo, CheckpointReadInfo, LogSegment};
use crate::log_segment_files::LogSegmentFiles;
use crate::metrics::ScanType;
use crate::parallel::sequential_phase::SequentialPhase;
use crate::scan::log_replay::ScanLogReplayProcessor;
use crate::scan::log_replay::{
    BASE_ROW_ID_NAME, CLUSTERING_PROVIDER_NAME, DEFAULT_ROW_COMMIT_VERSION_NAME,
};
use crate::scan::state_info::StateInfo;
use crate::schema::{
    ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, StructField, StructType,
    ToSchema as _,
};
use crate::table_features::{ColumnMappingMode, Operation};
use crate::transforms::{ExpressionTransform, SchemaTransform};
use crate::{DeltaResult, Engine, EngineData, Error, FileMeta, SnapshotRef, Version};

use self::log_replay::scan_action_iter;

pub(crate) mod data_skipping;
pub(crate) mod field_classifiers;
pub mod log_replay;
pub(crate) mod metrics;
pub mod state;
pub(crate) mod state_info;
pub(crate) mod transform_spec;

#[cfg(test)]
pub(crate) mod test_utils;

#[cfg(test)]
mod tests;

// safety: we define get_commit_schema() and _know_ it contains ADD_NAME and REMOVE_NAME
#[allow(clippy::unwrap_used)]
pub(crate) static COMMIT_READ_SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
    get_commit_schema()
        .project(&[ADD_NAME, REMOVE_NAME])
        .unwrap()
});
// safety: we define get_commit_schema() and _know_ it contains ADD_NAME
#[allow(clippy::unwrap_used)]
pub(crate) static CHECKPOINT_READ_SCHEMA: LazyLock<SchemaRef> =
    LazyLock::new(|| get_commit_schema().project(&[ADD_NAME]).unwrap());

/// Checkpoint schema WITHOUT stats for column projection pushdown.
/// When skip_stats is enabled, we use this schema to avoid reading the stats column from parquet.
pub(crate) static CHECKPOINT_READ_SCHEMA_NO_STATS: LazyLock<SchemaRef> = LazyLock::new(|| {
    let add_schema = Add::to_schema();
    let fields_no_stats: Vec<_> = add_schema
        .fields()
        .filter(|f| f.name() != "stats")
        .cloned()
        .collect();
    let add_no_stats = StructType::new_unchecked(fields_no_stats);
    Arc::new(StructType::new_unchecked([StructField::nullable(
        ADD_NAME,
        add_no_stats,
    )]))
});

#[allow(unused)]
pub use crate::parallel::parallel_scan_metadata::{
    AfterSequentialScanMetadata, ParallelScanMetadata, ParallelState, SequentialScanMetadata,
};

/// Controls how file statistics are handled during a scan.
///
/// This enum determines whether and which statistics columns appear in scan metadata output,
/// and whether internal data skipping is enabled.
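///
/// A hedged sketch of how each mode is selected through [`ScanBuilder`] (the builder methods
/// are defined in this module; the `snapshot` setup is assumed):
///
/// ```ignore
/// // AllColumns: emit every eligible stats column in `stats_parsed`.
/// let scan = snapshot.clone().scan_builder().include_all_stats_columns().build()?;
///
/// // Columns: emit stats only for the named columns.
/// let scan = snapshot
///     .clone()
///     .scan_builder()
///     .with_stats_columns(vec![ColumnName::new(["id"])])
///     .build()?;
///
/// // Skip: never read stats; internal data skipping is disabled.
/// let scan = snapshot.scan_builder().with_skip_stats(true).build()?;
/// ```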
#[derive(Debug, Clone)]
pub enum StatsOutputMode {
    /// Output all table stats columns in `stats_parsed`.
    AllColumns,
    /// Output stats for specific columns. An empty list means no stats output, but
    /// predicate-based data skipping still works internally.
    Columns(Vec<ColumnName>),
    /// Skip reading stats entirely. Disables data skipping.
    Skip,
}

impl Default for StatsOutputMode {
    fn default() -> Self {
        StatsOutputMode::Columns(Vec::new())
    }
}

/// Builder to scan a snapshot of a table.
pub struct ScanBuilder {
    snapshot: SnapshotRef,
    schema: Option<SchemaRef>,
    predicate: Option<PredicateRef>,
    stats_output_mode: StatsOutputMode,
}

impl std::fmt::Debug for ScanBuilder {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        f.debug_struct("ScanBuilder")
            .field("schema", &self.schema)
            .field("predicate", &self.predicate)
            .field("stats_output_mode", &self.stats_output_mode)
            .finish()
    }
}

impl ScanBuilder {
    /// Create a new [`ScanBuilder`] instance.
    pub fn new(snapshot: impl Into<SnapshotRef>) -> Self {
        Self {
            snapshot: snapshot.into(),
            schema: None,
            predicate: None,
            stats_output_mode: StatsOutputMode::default(),
        }
    }

    /// Provide a [`Schema`] for columns to select from the [`Snapshot`].
    ///
    /// A table with columns `[a, b, c]` could have a scan which reads only the first
    /// two columns by using the schema `[a, b]`.
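    ///
    /// A minimal sketch of such a projection (the `snapshot` and the column types are
    /// assumed):
    ///
    /// ```ignore
    /// let schema = Arc::new(StructType::new_unchecked(vec![
    ///     StructField::nullable("a", DataType::LONG),
    ///     StructField::nullable("b", DataType::STRING),
    /// ]));
    /// let scan = snapshot.scan_builder().with_schema(schema).build()?;
    /// ```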
    ///
    /// [`Schema`]: crate::schema::Schema
    /// [`Snapshot`]: crate::snapshot::Snapshot
    pub fn with_schema(mut self, schema: SchemaRef) -> Self {
        self.schema = Some(schema);
        self
    }

    /// Optionally provide a [`SchemaRef`] for columns to select from the [`Snapshot`]. See
    /// [`ScanBuilder::with_schema`] for details. If `schema_opt` is `None` this is a no-op.
    ///
    /// [`Snapshot`]: crate::Snapshot
    pub fn with_schema_opt(self, schema_opt: Option<SchemaRef>) -> Self {
        match schema_opt {
            Some(schema) => self.with_schema(schema),
            None => self,
        }
    }

    /// Optionally provide an expression to filter rows. For example, the predicate `x < 4`
    /// restricts the scan to the subset of rows which satisfy the filter. If `predicate`
    /// resolves to `None`, this is a no-op.
    ///
    /// NOTE: The filtering is best-effort and can produce false positives (rows that should
    /// have been filtered out but were kept).
    ///
    /// This method can be combined with [`include_all_stats_columns`]. When both are used, the kernel
    /// performs data skipping internally using the predicate AND outputs parsed statistics to the
    /// engine via the `stats_parsed` column in scan metadata.
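    ///
    /// A hedged sketch of supplying the predicate `x < 4` (the `column_expr!` macro and the
    /// `Predicate::lt`/`Expression::literal` constructors are assumed to be available from this
    /// crate's expressions module):
    ///
    /// ```ignore
    /// let predicate = Arc::new(Predicate::lt(column_expr!("x"), Expression::literal(4)));
    /// let scan = snapshot.scan_builder().with_predicate(predicate).build()?;
    /// ```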
    ///
    /// [`include_all_stats_columns`]: ScanBuilder::include_all_stats_columns
    pub fn with_predicate(mut self, predicate: impl Into<Option<PredicateRef>>) -> Self {
        self.predicate = predicate.into();
        self
    }

    /// Include all parsed statistics in scan metadata.
    ///
    /// When enabled, the scan will include a `stats_parsed` column in the scan metadata
    /// containing pre-parsed file statistics (minValues, maxValues, nullCount, numRecords)
    /// that integrations can use for their own data skipping logic.
    ///
    /// The statistics schema is determined by the table's configuration
    /// (`delta.dataSkippingStatsColumns` or `delta.dataSkippingNumIndexedCols`). In the future,
    /// a requested columns filter may limit which columns appear in the output without
    /// affecting the table-level column counting.
    ///
    /// This method can be combined with [`with_predicate`]. When both are used, the kernel
    /// performs data skipping internally using the predicate AND outputs parsed statistics to the
    /// engine via the `stats_parsed` column in scan metadata.
    ///
    /// [`with_predicate`]: ScanBuilder::with_predicate
    pub fn include_all_stats_columns(mut self) -> Self {
        self.stats_output_mode = StatsOutputMode::AllColumns;
        self
    }

    /// Include parsed statistics for specific columns in the scan metadata.
    ///
    /// When `columns` is non-empty, only those columns' statistics appear in `stats_parsed`.
    /// When `columns` is empty, no stats are output (equivalent to the default behavior), but
    /// predicate-based data skipping still works internally.
    pub fn with_stats_columns(mut self, columns: Vec<ColumnName>) -> Self {
        self.stats_output_mode = StatsOutputMode::Columns(columns);
        self
    }

    /// Skip reading file statistics from checkpoint files.
    ///
    /// When enabled:
    /// - Parquet checkpoint reads use column projection to skip the stats column
    /// - The `stats` field in scan results will be `None`
    /// - Columnar data skipping is disabled (no stats-based or partition-value-based pruning),
    ///   but row-level partition filtering still applies
    ///
    /// If called after [`include_all_stats_columns`] or [`with_stats_columns`], the last call wins.
    ///
    /// Use this when data skipping is handled externally (e.g., by the query engine).
    ///
    /// [`include_all_stats_columns`]: ScanBuilder::include_all_stats_columns
    /// [`with_stats_columns`]: ScanBuilder::with_stats_columns
    pub fn with_skip_stats(mut self, skip_stats: bool) -> Self {
        if skip_stats {
            self.stats_output_mode = StatsOutputMode::Skip;
        }
        self
    }

    /// Build the [`Scan`].
    ///
    /// This does not scan the table at this point, but it does do some work to ensure that the
    /// provided schema makes sense, and to prepare some metadata that the scan will need. The
    /// [`Scan`] type itself can be used to fetch the files and associated metadata required to
    /// perform actual data reads.
    pub fn build(self) -> DeltaResult<Scan> {
        // if no schema is provided, use snapshot's entire schema (e.g. SELECT *)
        let logical_schema = self.schema.unwrap_or_else(|| self.snapshot.schema());

        self.snapshot
            .table_configuration()
            .ensure_operation_supported(Operation::Scan)?;

        let state_info = StateInfo::try_new(
            logical_schema,
            self.snapshot.table_configuration(),
            self.predicate,
            self.stats_output_mode.clone(),
            (), // No classifier, default is for scans
        )?;

        Ok(Scan {
            snapshot: self.snapshot,
            state_info: Arc::new(state_info),
            stats_output_mode: self.stats_output_mode,
        })
    }
}

#[derive(Clone, Debug, PartialEq)]
pub(crate) enum PhysicalPredicate {
    Some(PredicateRef, SchemaRef),
    StaticSkipAll,
    None,
}

impl PhysicalPredicate {
    /// If we have a predicate, verify the columns it references and apply column mapping. First, get
    /// the set of references; use that to filter the schema to only the columns of interest (and
    /// verify that all referenced columns exist); then use the resulting logical/physical mappings
    /// to rewrite the expression with physical column names.
    ///
    /// NOTE: It is possible the predicate resolves to FALSE even ignoring column references,
    /// e.g. `col > 10 AND FALSE`. Such predicates can statically skip the whole query.
    pub(crate) fn try_new(
        predicate: &Predicate,
        logical_schema: &Schema,
        column_mapping_mode: ColumnMappingMode,
    ) -> DeltaResult<PhysicalPredicate> {
        if can_statically_skip_all_files(predicate) {
            return Ok(PhysicalPredicate::StaticSkipAll);
        }
        let unresolved_references = predicate.references();
        // Group predicate references by case-folded path so that multiple references to the
        // same column with different casings (e.g., `col > 5 AND COL < 10`) all resolve
        // correctly instead of one being silently dropped.
        let mut folded_references: HashMap<Vec<String>, Vec<&ColumnName>> = HashMap::new();
        for r in &unresolved_references {
            let folded: Vec<String> = r.iter().map(|s| s.to_lowercase()).collect();
            folded_references.entry(folded).or_default().push(r);
        }
        let mut get_referenced_fields = GetReferencedFields {
            unresolved_references,
            folded_references,
            column_mappings: HashMap::new(),
            logical_path: vec![],
            folded_logical_path: vec![],
            physical_path: vec![],
            column_mapping_mode,
        };
        let schema_opt = get_referenced_fields.transform_struct(logical_schema);
        let mut unresolved = get_referenced_fields.unresolved_references.into_iter();
        if let Some(unresolved) = unresolved.next() {
            // Schema traversal failed to resolve at least one column referenced by the predicate.
            //
            // NOTE: It's a pretty serious engine bug if we got this far with a query whose WHERE
            // clause has invalid column references. Data skipping is best-effort and the predicate
            // still needs to be evaluated against every row of data -- which is impossible if the
            // columns are missing/invalid. Just blow up instead of trying to handle it gracefully.
            return Err(Error::missing_column(format!(
                "Predicate references unknown column: {unresolved}"
            )));
        }
        let Some(schema) = schema_opt else {
            // The predicate doesn't statically skip all files, and it doesn't reference any columns
            // that could dynamically change its behavior, so it's useless for data skipping.
            return Ok(PhysicalPredicate::None);
        };
        let mut apply_mappings = ApplyColumnMappings {
            column_mappings: get_referenced_fields.column_mappings,
        };
        if let Some(predicate) = apply_mappings.transform_pred(predicate) {
            Ok(PhysicalPredicate::Some(
                Arc::new(predicate.into_owned()),
                Arc::new(schema.into_owned()),
            ))
        } else {
            Ok(PhysicalPredicate::None)
        }
    }
}

// Evaluates a static data skipping predicate, ignoring any column references, and returns true if
// the predicate allows us to statically skip all files. Since this is direct evaluation (not an
// expression rewrite), we use a `DefaultKernelPredicateEvaluator` with an empty column resolver.
fn can_statically_skip_all_files(predicate: &Predicate) -> bool {
    let evaluator = DefaultKernelPredicateEvaluator::from(EmptyColumnResolver);
    evaluator.eval_sql_where(predicate) == Some(false)
}

// Build the stats read schema by filtering the table schema to keep only skipping-eligible
// leaf fields that the skipping expression actually references. Also extract physical name
// mappings so we can access the correct physical stats column for each logical column.
struct GetReferencedFields<'a> {
    unresolved_references: HashSet<&'a ColumnName>,
    /// Case-folded (lowercased) column path -> all predicate column names that fold to it,
    /// for O(1) case-insensitive matching. Grouped as a `Vec` so that multiple references to
    /// the same column with different casings all resolve correctly.
    folded_references: HashMap<Vec<String>, Vec<&'a ColumnName>>,
    column_mappings: HashMap<ColumnName, ColumnName>,
    logical_path: Vec<String>,
    /// Case-folded version of `logical_path`, maintained incrementally via push/pop to avoid
    /// re-folding the entire path at every leaf field.
    folded_logical_path: Vec<String>,
    physical_path: Vec<String>,
    column_mapping_mode: ColumnMappingMode,
}
impl<'a> SchemaTransform<'a> for GetReferencedFields<'a> {
    // Capture the path mapping for this leaf field
    fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option<Cow<'a, PrimitiveType>> {
        // Record the physical name mappings for all referenced leaf columns. Delta column names
        // are case-insensitive, so we probe the case-folded lookup map for O(1) matching.
        let pred_cols = self
            .folded_references
            .remove(self.folded_logical_path.as_slice())?;
        let physical = ColumnName::new(&self.physical_path);
        for pred_col in pred_cols {
            self.unresolved_references.remove(pred_col);
            // Use the predicate's column name as key so ApplyColumnMappings can look it up
            // by the exact name used in the predicate expression.
            self.column_mappings
                .insert(pred_col.clone(), physical.clone());
        }
        Some(Cow::Borrowed(ptype))
    }

    // array and map fields are not eligible for data skipping, so filter them out.
    fn transform_array(&mut self, _: &'a ArrayType) -> Option<Cow<'a, ArrayType>> {
        None
    }
    fn transform_map(&mut self, _: &'a MapType) -> Option<Cow<'a, MapType>> {
        None
    }

    fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
        let physical_name = field.physical_name(self.column_mapping_mode);
        self.logical_path.push(field.name.clone());
        self.folded_logical_path.push(field.name.to_lowercase());
        self.physical_path.push(physical_name.to_string());
        let field = self.recurse_into_struct_field(field);
        self.logical_path.pop();
        self.folded_logical_path.pop();
        self.physical_path.pop();
        Some(Cow::Owned(field?.with_name(physical_name)))
    }
}

/// Prefixes all column references in a predicate with a fixed path.
/// Transforms data-skipping predicates (e.g., `minValues.x > 100`) into
/// checkpoint/sidecar-compatible predicates (e.g., `add.stats_parsed.minValues.x > 100`).
struct PrefixColumns {
    prefix: ColumnName,
}

impl<'a> ExpressionTransform<'a> for PrefixColumns {
    fn transform_expr_column(&mut self, name: &'a ColumnName) -> Option<Cow<'a, ColumnName>> {
        Some(Cow::Owned(self.prefix.join(name)))
    }
}

struct ApplyColumnMappings {
    column_mappings: HashMap<ColumnName, ColumnName>,
}
impl<'a> ExpressionTransform<'a> for ApplyColumnMappings {
    // NOTE: We already verified all column references. But if the map probe ever did fail, the
    // transform would just delete any expression(s) that reference the invalid column.
    fn transform_expr_column(&mut self, name: &'a ColumnName) -> Option<Cow<'a, ColumnName>> {
        self.column_mappings
            .get(name)
            .map(|physical_name| Cow::Owned(physical_name.clone()))
    }
}

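/// Schema used to re-shape previously returned scan metadata rows back into `add`-action form so
/// they can be fed through log replay again (see [`Scan::scan_metadata_from`]).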
static RESTORED_ADD_SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
    let partition_values = MapType::new(DataType::STRING, DataType::STRING, true);
    StructType::new_unchecked(vec![StructField::nullable(
        "add",
        StructType::new_unchecked(vec![
            StructField::not_null("path", DataType::STRING),
            StructField::not_null("partitionValues", partition_values),
            StructField::not_null("size", DataType::LONG),
            StructField::nullable("modificationTime", DataType::LONG),
            StructField::nullable("stats", DataType::STRING),
            StructField::nullable(
                "tags",
                MapType::new(DataType::STRING, DataType::STRING, true),
            ),
            StructField::nullable("deletionVector", DeletionVectorDescriptor::to_schema()),
            StructField::nullable(BASE_ROW_ID_NAME, DataType::LONG),
            StructField::nullable(DEFAULT_ROW_COMMIT_VERSION_NAME, DataType::LONG),
            StructField::nullable(CLUSTERING_PROVIDER_NAME, DataType::STRING),
        ]),
    )])
    .into()
});

pub(crate) fn restored_add_schema() -> &'static SchemaRef {
    &RESTORED_ADD_SCHEMA
}

/// Utility function making it easy to get the transform for a particular row. Returns `None` if
/// the requested row is outside the range of the passed slice; otherwise returns the element at
/// the index of the specified row.
pub fn get_transform_for_row(
    row: usize,
    transforms: &[Option<ExpressionRef>],
) -> Option<ExpressionRef> {
    transforms.get(row).cloned().flatten()
}

/// [`ScanMetadata`] contains (1) a batch of [`FilteredEngineData`] specifying data files to be scanned
/// and (2) a vector of transforms (one transform per scan file) that must be applied to the data read
/// from those files.
pub struct ScanMetadata {
    /// Filtered engine data with one row per file to scan (and only selected rows should be scanned)
    pub scan_files: FilteredEngineData,

    /// Row-level transformations to apply to data read from files.
    ///
    /// Each entry in this vector corresponds to a row in the `scan_files` data. The entry is an
    /// optional expression that must be applied to convert the file's data into the logical schema
    /// expected by the scan:
    ///
    /// - `Some(expr)`: Apply this expression to transform the data to match
    ///   [`Scan::logical_schema()`].
    /// - `None`: No transformation is needed; the data is already in the correct logical form.
    ///
    /// Note: This vector can be indexed by row number, because rows masked out by the selection
    /// vector still have corresponding entries, which are simply `None`.
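    ///
    /// A small sketch of looking up one row's transform with [`get_transform_for_row`] (the
    /// `metadata` value and `num_rows` are assumed):
    ///
    /// ```ignore
    /// for row in 0..num_rows {
    ///     if let Some(expr) = get_transform_for_row(row, &metadata.scan_file_transforms) {
    ///         // apply `expr` to the data read from this row's file
    ///     }
    /// }
    /// ```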
    pub scan_file_transforms: Vec<Option<ExpressionRef>>,
}

impl ScanMetadata {
    fn try_new(
        data: Box<dyn EngineData>,
        selection_vector: Vec<bool>,
        scan_file_transforms: Vec<Option<ExpressionRef>>,
    ) -> DeltaResult<Self> {
        Ok(Self {
            scan_files: FilteredEngineData::try_new(data, selection_vector)?,
            scan_file_transforms,
        })
    }
}

impl HasSelectionVector for ScanMetadata {
    fn has_selected_rows(&self) -> bool {
        self.scan_files.selection_vector().contains(&true)
    }
}

/// The result of building a scan over a table. This can be used to get the actual data from
/// scanning the table.
pub struct Scan {
    snapshot: SnapshotRef,
    state_info: Arc<StateInfo>,
    stats_output_mode: StatsOutputMode,
}

impl std::fmt::Debug for Scan {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        f.debug_struct("Scan")
            .field("schema", &self.state_info.logical_schema)
            .field("predicate", &self.state_info.physical_predicate)
            .field("stats_output_mode", &self.stats_output_mode)
            .finish()
    }
}

impl Scan {
    /// Whether stats reading is entirely skipped, disabling data skipping.
    fn skip_stats(&self) -> bool {
        matches!(self.stats_output_mode, StatsOutputMode::Skip)
    }

    /// The table's root URL. Any relative paths returned from [`Self::scan_metadata`] (or in a
    /// callback from [`ScanMetadata::visit_scan_files`]) must be resolved against this root to get
    /// the actual path to the file.
    ///
    /// [`ScanMetadata::visit_scan_files`]: crate::scan::ScanMetadata::visit_scan_files
    // NOTE: this is obviously included in the snapshot, just re-exposed here for convenience.
    pub fn table_root(&self) -> &Url {
        self.snapshot.table_root()
    }

    /// Get a shared reference to the [`Snapshot`] of this scan.
    ///
    /// [`Snapshot`]: crate::Snapshot
    pub fn snapshot(&self) -> &SnapshotRef {
        &self.snapshot
    }

    /// Get a shared reference to the logical [`Schema`] of the scan (i.e. the output schema of the
    /// scan). Note that the logical schema can differ from the physical schema due to e.g.
    /// partition columns which are present in the logical schema but not in the physical schema.
    ///
    /// [`Schema`]: crate::schema::Schema
    pub fn logical_schema(&self) -> &SchemaRef {
        &self.state_info.logical_schema
    }

    /// Get a shared reference to the physical [`Schema`] of the scan. This represents the schema
    /// of the underlying data files which must be read from storage.
    ///
    /// [`Schema`]: crate::schema::Schema
    pub fn physical_schema(&self) -> &SchemaRef {
        &self.state_info.physical_schema
    }

    /// Get the predicate [`PredicateRef`] of the scan.
    pub fn physical_predicate(&self) -> Option<PredicateRef> {
        if let PhysicalPredicate::Some(ref predicate, _) = self.state_info.physical_predicate {
            Some(predicate.clone())
        } else {
            None
        }
    }

    /// Get an iterator of [`ScanMetadata`]s that should be used to facilitate a scan. This handles
    /// log-replay, reconciling Add and Remove actions, and applying data skipping (if possible).
    ///
    /// Reports metrics: [`MetricEvent::ScanMetadataCompleted`] when the returned iterator is
    /// fully exhausted.
    ///
    /// [`MetricEvent::ScanMetadataCompleted`]: crate::metrics::MetricEvent::ScanMetadataCompleted
    ///
    /// Each item in the returned iterator is a struct of:
    /// - `Box<dyn EngineData>`: Data in engine format, where each row represents a file to be
    ///   scanned. The schema for each row can be obtained by calling [`scan_row_schema`].
    /// - `Vec<bool>`: A selection vector. If a row is at index `i` and this vector is `false` at
    ///   index `i`, then that row should *not* be processed (i.e. it is filtered out). If the vector
    ///   is `true` at index `i` the row *should* be processed. If the selection vector is *shorter*
    ///   than the number of rows returned, missing elements are considered `true`, i.e. included in
    ///   the query. NB: If you are using the default engine and plan to call arrow's
    ///   `filter_record_batch`, you _need_ to extend this vector to the full length of the batch or
    ///   arrow will drop the extra rows.
    /// - `Vec<Option<Expression>>`: Transformation expressions that need to be applied. For each
    ///   row at index `i` in the above data, if an expression exists at index `i` in the `Vec`,
    ///   the associated expression _must_ be applied to the data read from the file specified by
    ///   the row. The resultant schema for this expression is guaranteed to be
    ///   [`Self::logical_schema()`]. If the item at index `i` in this `Vec` is `None`, or if the
    ///   `Vec` contains `i` or fewer elements, no expression need be applied and the data read
    ///   from disk is already in the correct logical state.
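    ///
    /// A hedged sketch of consuming the iterator (the `engine` setup is assumed):
    ///
    /// ```ignore
    /// for res in scan.scan_metadata(engine.as_ref())? {
    ///     let metadata = res?;
    ///     let selection = metadata.scan_files.selection_vector();
    ///     // read each selected row's file, applying the matching entry in
    ///     // `metadata.scan_file_transforms` to the data that was read
    /// }
    /// ```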
    pub fn scan_metadata(
        &self,
        engine: &dyn Engine,
    ) -> DeltaResult<impl Iterator<Item = DeltaResult<ScanMetadata>>> {
        let actions_with_checkpoint_info = self.replay_for_scan_metadata(engine)?;
        self.scan_metadata_inner(engine, actions_with_checkpoint_info)
    }

    /// Get an updated iterator of [`ScanMetadata`]s based on an existing iterator of [`EngineData`]s.
    ///
    /// The existing iterator is assumed to contain data from a previous call to `scan_metadata`.
    /// Engines may decide to cache the results of `scan_metadata` to avoid additional IO operations
    /// required to replay the log.
    ///
    /// As such the new scan's predicate must "contain" the previous scan's predicate. That is, the new
    /// scan's predicate MUST skip all files the previous scan's predicate skipped. The new scan's
    /// predicate is also allowed to skip files the previous predicate kept. For example, if the previous
    /// scan predicate was
    /// ```sql
    /// WHERE a < 42 AND b = 10
    /// ```
    /// then it is legal for the new scan to use predicates such as the following:
    /// ```sql
    /// WHERE a = 30 AND b = 10
    /// WHERE a < 10 AND b = 10
    /// WHERE a < 42 AND b = 10 AND c = 20
    /// ```
    /// but it is NOT legal for the new scan to use predicates like these:
    /// ```sql
    /// WHERE a < 42
    /// WHERE a = 50 AND b = 10
    /// WHERE a < 42 AND b <= 10
    /// WHERE a < 42 OR b = 10
    /// ```
    ///
    /// <div class="warning">
    ///
    /// The current implementation does not yet validate the existing
    /// predicate against the current predicate. Until this is implemented,
    /// the caller must ensure that the existing predicate is compatible with
    /// the current predicate.
    ///
    /// </div>
    ///
    /// # Parameters
    ///
    /// * `existing_version` - Table version the provided data was read from.
    /// * `existing_data` - Existing processed scan metadata with all selection vectors applied.
    /// * `existing_predicate` - The predicate used by the previous scan.
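    ///
    /// A hedged sketch of re-using cached batches (the `cached_batches`, `old_version`, and
    /// `old_predicate` values are assumed):
    ///
    /// ```ignore
    /// let iter = scan.scan_metadata_from(engine, old_version, cached_batches, old_predicate)?;
    /// for res in iter {
    ///     let metadata = res?;
    ///     // process updated scan metadata...
    /// }
    /// ```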
    #[allow(unused)]
    #[internal_api]
    pub(crate) fn scan_metadata_from(
        &self,
        engine: &dyn Engine,
        existing_version: Version,
        existing_data: impl IntoIterator<Item = Box<dyn EngineData>> + 'static,
        _existing_predicate: Option<PredicateRef>,
    ) -> DeltaResult<Box<dyn Iterator<Item = DeltaResult<ScanMetadata>>>> {
        // TODO(#966): validate that the current predicate is compatible with the hint predicate.

        if existing_version > self.snapshot.version() {
            return Err(Error::Generic(format!(
                "existing_version {} is greater than current version {}",
                existing_version,
                self.snapshot.version()
            )));
        }

        // In order to be processed by our log replay, we must re-shape the existing scan metadata
        // back into the shape in which we read it from the log. Since it is already reconciled
        // data, we treat it as if it originated from a checkpoint.
        let transform = engine.evaluation_handler().new_expression_evaluator(
            scan_row_schema(),
            get_scan_metadata_transform_expr(),
            restored_add_schema().clone().into(),
        )?;
        let apply_transform = move |data: Box<dyn EngineData>| {
            Ok(ActionsBatch::new(transform.evaluate(data.as_ref())?, false))
        };

        let log_segment = self.snapshot.log_segment();

        // If the snapshot version corresponds to the hint version, we process the existing data
        // to apply file skipping and provide the required transformations.
        // Since we're only processing existing data (no checkpoint), we use the base schema
        // and no stats_parsed optimization.
        if existing_version == self.snapshot.version() {
            let actions_with_checkpoint_info = ActionsWithCheckpointInfo {
                actions: existing_data.into_iter().map(apply_transform),
                checkpoint_info: CheckpointReadInfo {
                    has_stats_parsed: false,
                    has_partition_values_parsed: false,
                    checkpoint_read_schema: restored_add_schema().clone(),
                },
            };
            return Ok(Box::new(
                self.scan_metadata_inner(engine, actions_with_checkpoint_info)?,
            ));
        }

        // If the current log segment contains a checkpoint newer than the hint version,
        // we disregard the existing data hint and perform a full scan. The current log segment
        // only has deltas after the checkpoint, so we cannot update from prior versions.
        // TODO: we may be able to apply heuristics or other logic to try and fetch missing deltas
        // from the log.
        if matches!(log_segment.checkpoint_version, Some(v) if v > existing_version) {
            return Ok(Box::new(self.scan_metadata(engine)?));
        }

        // create a new log segment containing only the commits added after the version hint.
        let mut ascending_commit_files = log_segment.listed.ascending_commit_files.clone();
        ascending_commit_files.retain(|f| f.version > existing_version);
        let log_segment_files = LogSegmentFiles {
            ascending_commit_files,
            latest_commit_file: log_segment.listed.latest_commit_file.clone(),
            ..Default::default()
        };
        let new_log_segment = LogSegment::try_new(
            log_segment_files,
            log_segment.log_root.clone(),
            Some(log_segment.end_version),
            None, // No checkpoint in this incremental segment
        )?;

        // For incremental reads, new_log_segment has no checkpoint, but we still supply a
        // checkpoint read schema for consistency.
        let (checkpoint_schema, meta_predicate) = if self.skip_stats() {
            (CHECKPOINT_READ_SCHEMA_NO_STATS.clone(), None)
        } else {
            (
                CHECKPOINT_READ_SCHEMA.clone(),
                self.build_actions_meta_predicate(),
            )
        };
        let result = new_log_segment.read_actions_with_projected_checkpoint_actions(
            engine,
            COMMIT_READ_SCHEMA.clone(),
            checkpoint_schema,
            meta_predicate,
            self.state_info
                .physical_stats_schema
                .as_ref()
                .map(|s| s.as_ref()),
            None,
        )?;
        let actions_with_checkpoint_info = ActionsWithCheckpointInfo {
            actions: result
                .actions
                .chain(existing_data.into_iter().map(apply_transform)),
            checkpoint_info: result.checkpoint_info,
        };

        Ok(Box::new(self.scan_metadata_inner(
            engine,
            actions_with_checkpoint_info,
        )?))
    }

    fn scan_metadata_inner(
        &self,
        engine: &dyn Engine,
        actions_with_checkpoint_info: ActionsWithCheckpointInfo<
            impl Iterator<Item = DeltaResult<ActionsBatch>>,
        >,
    ) -> DeltaResult<impl Iterator<Item = DeltaResult<ScanMetadata>>> {
        let start = Instant::now();
        let reporter = engine.get_metrics_reporter();
        let operation_id = MetricId::new();

        let (iter, metrics) = match self.state_info.physical_predicate {
            PhysicalPredicate::StaticSkipAll => {
                info!("Predicate statically evaluated to false; skipping all files");
                (None, Arc::new(ScanMetrics::default()))
            }
            _ => {
                let (it, m) = scan_action_iter(
                    engine,
                    actions_with_checkpoint_info.actions,
                    self.state_info.clone(),
                    actions_with_checkpoint_info.checkpoint_info,
                    self.skip_stats(),
                )?;
                (Some(it), m)
            }
        };

        let on_complete = move || {
            let event = metrics.to_event(operation_id, ScanType::Full, start.elapsed());
            info!(%event);
            if let Some(r) = reporter {
                r.report(event);
            }
        };
        Ok(iter.into_iter().flatten().on_complete(on_complete))
    }

    // Factored out to facilitate testing
    fn replay_for_scan_metadata(
        &self,
        engine: &dyn Engine,
    ) -> DeltaResult<
        ActionsWithCheckpointInfo<impl Iterator<Item = DeltaResult<ActionsBatch>> + Send>,
    > {
        let (checkpoint_schema, meta_predicate) = if self.skip_stats() {
            (CHECKPOINT_READ_SCHEMA_NO_STATS.clone(), None)
        } else {
            (
                CHECKPOINT_READ_SCHEMA.clone(),
                self.build_actions_meta_predicate(),
            )
        };
        self.snapshot
            .log_segment()
            .read_actions_with_projected_checkpoint_actions(
                engine,
                COMMIT_READ_SCHEMA.clone(),
                checkpoint_schema,
                meta_predicate,
                self.state_info
                    .physical_stats_schema
                    .as_ref()
                    .map(|s| s.as_ref()),
                self.state_info
                    .physical_partition_schema
                    .as_ref()
                    .map(|s| s.as_ref()),
            )
    }

    /// Builds a predicate for row group skipping in checkpoint and sidecar parquet files.
    ///
    /// The scan predicate is first transformed into a data-skipping form with IS NULL guards
    /// (e.g., `x > 100` becomes `OR(maxValues.x IS NULL, maxValues.x > 100)`), then column
    /// references are prefixed with `add.stats_parsed` to match the physical column layout
    /// of checkpoint/sidecar files. The parquet reader's row group filter can then use
    /// parquet-level statistics on these nested columns to skip entire row groups that cannot
    /// contain matching files.
    ///
    /// The IS NULL guards are necessary because parquet footer min/max statistics ignore null
    /// values. Without them, row groups containing files with missing stats (null stat columns)
    /// could be incorrectly pruned, since the footer min/max wouldn't reflect those files.
    ///
    /// Returns `None` if the scan has no predicate, no stats schema, or if the predicate is a
    /// bare unsupported expression (e.g. column-column comparison). Junctions with unsupported
    /// arms replace them with TRUE to conservatively prevent pruning.
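    ///
    /// Schematically, for a scan predicate `x > 100`:
    ///
    /// ```text
    /// scan predicate:        x > 100
    /// data-skipping form:    OR(maxValues.x IS NULL, maxValues.x > 100)
    /// checkpoint predicate:  OR(add.stats_parsed.maxValues.x IS NULL,
    ///                           add.stats_parsed.maxValues.x > 100)
    /// ```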
    fn build_actions_meta_predicate(&self) -> Option<PredicateRef> {
        let PhysicalPredicate::Some(ref predicate, _) = self.state_info.physical_predicate else {
            return None;
        };
        self.state_info.physical_stats_schema.as_ref()?;

        let partition_columns = self
            .snapshot
            .table_configuration()
            .metadata()
            .partition_columns();
        let skipping_pred = as_checkpoint_skipping_predicate(predicate, partition_columns)?;

        let mut prefixer = PrefixColumns {
            prefix: ColumnName::new(["add", "stats_parsed"]),
        };
        let prefixed = prefixer.transform_pred(&skipping_pred)?;
        Some(Arc::new(prefixed.into_owned()))
    }

    /// Start a parallel scan metadata processing for the table.
    ///
    /// This method returns a [`SequentialScanMetadata`] iterator that processes commits and
    /// checkpoint manifests sequentially. After exhausting this iterator, call `finish()`
    /// to determine if a distributed phase is needed.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use std::sync::Arc;
    /// # use buoyant_kernel as delta_kernel;
    /// # use delta_kernel::{Engine, DeltaResult};
    /// # use delta_kernel::scan::{AfterSequentialScanMetadata, ParallelScanMetadata};
    /// # use delta_kernel::Snapshot;
    /// # use url::Url;
    /// # use delta_kernel::engine::default::DefaultEngineBuilder;
    /// # use delta_kernel::object_store::local::LocalFileSystem;
    /// # fn main() -> DeltaResult<()> {
    /// let engine = Arc::new(DefaultEngineBuilder::new(Arc::new(LocalFileSystem::new())).build());
    /// let table_root = Url::parse("file:///path/to/table")?;
    ///
    /// // Build a snapshot
    /// let snapshot = Snapshot::builder_for(table_root.clone())
    ///     .at_version(5) // Optional: specify a time-travel version (default is latest version)
    ///     .build(engine.as_ref())?;
    /// let scan = snapshot.scan_builder().build()?;
    /// let mut sequential = scan.parallel_scan_metadata(engine.clone())?;
    ///
    /// // Process sequential phase
    /// for result in sequential.by_ref() {
    ///     let scan_metadata = result?;
    ///     // Process scan metadata...
    /// }
    ///
    /// // Check if distributed phase is needed
    /// match sequential.finish()? {
    ///     AfterSequentialScanMetadata::Done => {
    ///         // All processing complete
    ///     }
    ///     AfterSequentialScanMetadata::Parallel { state, files } => {
    ///         // Distribute files for parallel processing (e.g., one file per worker)
    ///         let state = Arc::new(*state);
    ///         for file in files {
    ///             let parallel = ParallelScanMetadata::try_new(
    ///                 engine.clone(),
    ///                 state.clone(),
    ///                 vec![file],
    ///             )?;
    ///             for result in parallel {
    ///                 let scan_metadata = result?;
    ///                 // Process scan metadata...
    ///             }
    ///         }
    ///     }
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub fn parallel_scan_metadata(
        &self,
        engine: Arc<dyn Engine>,
    ) -> DeltaResult<SequentialScanMetadata> {
        // For the sequential/parallel phase approach, we use a conservative checkpoint_info
        // since SequentialPhase reads checkpoints via CheckpointManifestReader which doesn't
        // currently support stats_parsed optimization.
        let checkpoint_read_schema = if self.skip_stats() {
            CHECKPOINT_READ_SCHEMA_NO_STATS.clone()
        } else {
            CHECKPOINT_READ_SCHEMA.clone()
        };
        let checkpoint_info = CheckpointReadInfo {
            has_stats_parsed: false,
            has_partition_values_parsed: false,
            checkpoint_read_schema,
        };
        let processor = ScanLogReplayProcessor::new(
            engine.as_ref(),
            self.state_info.clone(),
            checkpoint_info,
            self.skip_stats(),
        )?;
        let sequential =
            SequentialPhase::try_new(processor, self.snapshot.log_segment(), engine.clone())?;

        Ok(SequentialScanMetadata::new(sequential))
    }

    /// Perform an "all in one" scan. This will use the provided `engine` to read and process all
    /// the data for the query. Each [`EngineData`] in the resultant iterator is a portion of the
    /// final table data. Generally connectors/engines will want to use [`Scan::scan_metadata`] so
    /// they can have more control over the execution of the scan.
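    ///
    /// A hedged sketch of a full read (the `engine` and `snapshot` setup is assumed):
    ///
    /// ```ignore
    /// let scan = snapshot.scan_builder().build()?;
    /// for batch in scan.execute(engine.clone())? {
    ///     let data: Box<dyn EngineData> = batch?;
    ///     // hand `data` to the engine for downstream processing
    /// }
    /// ```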
    // This calls [`Scan::scan_metadata`] to get an iterator of `ScanMetadata` actions for the scan,
    // and then uses the `engine`'s [`crate::ParquetHandler`] to read the actual table data.
    pub fn execute(
        &self,
        engine: Arc<dyn Engine>,
    ) -> DeltaResult<impl Iterator<Item = DeltaResult<Box<dyn EngineData>>>> {
        fn scan_metadata_callback(batches: &mut Vec<state::ScanFile>, file: state::ScanFile) {
            batches.push(file);
        }

        debug!(
            "Executing scan with logical schema {:#?} and physical schema {:#?}",
            self.state_info.logical_schema, self.state_info.physical_schema
        );

        let table_root = self.snapshot.table_root().clone();

        let scan_metadata_iter = self.scan_metadata(engine.as_ref())?;
        let scan_files_iter = scan_metadata_iter
            .map(|res| {
                let scan_metadata = res?;
                let scan_files = vec![];
                scan_metadata.visit_scan_files(scan_files, scan_metadata_callback)
            })
            // Iterator<DeltaResult<Vec<ScanFile>>> to Iterator<DeltaResult<ScanFile>>
            .flatten_ok();

        let physical_schema = self.physical_schema().clone();
        let logical_schema = self.logical_schema().clone();
        let result = scan_files_iter
            .map(move |scan_file| -> DeltaResult<_> {
                let scan_file = scan_file?;
                let file_path = table_root.join(&scan_file.path)?;
                let mut selection_vector = scan_file
                    .dv_info
                    .get_selection_vector(engine.as_ref(), &table_root)?;
                let meta = FileMeta {
                    last_modified: 0,
                    size: scan_file.size.try_into().map_err(|_| {
                        Error::generic("Unable to convert scan file size into FileSize")
                    })?,
                    location: file_path,
                };

                // WARNING: We validated the physical predicate against a schema that includes
                // partition columns, but the read schema we use here does _NOT_ include partition
                // columns. So we cannot safely assume that all column references are valid. See
                // https://github.com/delta-io/delta-kernel-rs/issues/434 for more details.
                //
                // TODO(#860): we disable predicate pushdown until we support row indexes.
                let read_result_iter = engine.parquet_handler().read_parquet_files(
                    &[meta],
                    physical_schema.clone(),
                    None,
                )?;

                let mut read_result_iter = read_result_iter.peekable();

                // Only flag an empty iterator as a connector bug when stats are present and report
                // a positive row count. When stats are absent we cannot distinguish a legitimate
                // 0-row file from a buggy connector, so we conservatively allow it.
                let expect_data = scan_file.stats.as_ref().is_some_and(|s| s.num_records > 0);
                if expect_data && read_result_iter.peek().is_none() {
                    return Err(Error::internal_error(format!(
                        "ParquetHandler returned no data for file '{}'. This is likely a connector \
                         bug -- the handler's read_parquet_files must return at least one batch for \
                         each requested file that contains rows.",
                        scan_file.path
                    )));
                }

                let engine = engine.clone(); // Arc clone
                let physical_schema_inner = physical_schema.clone();
                let logical_schema_inner = logical_schema.clone();
                Ok(read_result_iter.map(move |read_result| -> DeltaResult<_> {
                    let read_result = read_result?;
                    // transform the physical data into the correct logical form
                    let logical = state::transform_to_logical(
                        engine.as_ref(),
                        read_result,
                        &physical_schema_inner,
                        &logical_schema_inner,
                        scan_file.transform.clone(), // Arc clone
                    );
                    let len = logical.as_ref().map_or(0, |res| res.len());
                    // need to split the dv_mask. what's left in dv_mask covers this result, and rest
                    // will cover the following results. we `take()` out of `selection_vector` to avoid
                    // trying to return a captured variable. We're going to reassign `selection_vector`
                    // to `rest` in a moment anyway
                    let mut sv = selection_vector.take();
                    let rest = split_vector(sv.as_mut(), len, None);
                    let result = match sv {
                        Some(sv) => logical.and_then(|data| data.apply_selection_vector(sv)),
                        None => logical,
                    };
                    selection_vector = rest;
                    result
                }))
            })
            // Iterator<DeltaResult<Iterator<DeltaResult<Box<dyn EngineData>>>>> to Iterator<DeltaResult<DeltaResult<Box<dyn EngineData>>>>
            .flatten_ok()
            // Iterator<DeltaResult<DeltaResult<Box<dyn EngineData>>>> to Iterator<DeltaResult<Box<dyn EngineData>>>
            .map(|x| x?);
        Ok(result)
    }
}

/// Get the schema that scan rows (from [`Scan::scan_metadata`]) will be returned with.
///
/// It is:
/// ```text
/// {
///    path: string,
///    size: long,
///    modificationTime: long,
///    stats: string,
///    deletionVector: {
///      storageType: string,
///      pathOrInlineDv: string,
///      offset: int,
///      sizeInBytes: int,
///      cardinality: long,
///    },
///    fileConstantValues: {
///      partitionValues: map<string, string>,
///      tags: map<string, string>,
///      baseRowId: long,
///      defaultRowCommitVersion: long,
///      clusteringProvider: string,
///    }
/// }
/// ```
pub fn scan_row_schema() -> SchemaRef {
    log_replay::SCAN_ROW_SCHEMA.clone()
}

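/// Materialize the deletion vector described by `descriptor` into a boolean row mask.
///
/// The descriptor is read from storage (resolved against `table_root`) and expanded so that rows
/// recorded in the deletion vector come back `false` (deleted) while all other rows are `true`.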
pub fn selection_vector(
    engine: &dyn Engine,
    descriptor: &DeletionVectorDescriptor,
    table_root: &Url,
) -> DeltaResult<Vec<bool>> {
    let storage = engine.storage_handler();
    let dv_treemap = descriptor.read(storage, table_root)?;
    Ok(deletion_treemap_to_bools(dv_treemap))
}