Skip to main content

lance/dataset/
scanner.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::collections::HashSet;
5
6use datafusion::config::ConfigOptions;
7use std::ops::Range;
8use std::pin::Pin;
9use std::sync::{Arc, LazyLock};
10use std::task::{Context, Poll};
11
12use crate::index::DatasetIndexExt;
13use arrow::array::AsArray;
14use arrow_array::{Array, Float32Array, Int64Array, RecordBatch};
15use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef, SortOptions};
16use arrow_select::concat::concat_batches;
17use async_recursion::async_recursion;
18use chrono::Utc;
19use datafusion::common::{DFSchema, JoinType, NullEquality, SchemaExt, exec_datafusion_err};
20use datafusion::functions_aggregate;
21use datafusion::logical_expr::{Expr, ScalarUDF, col, lit};
22use datafusion::physical_expr::PhysicalSortExpr;
23#[allow(deprecated)]
24use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
25use datafusion::physical_plan::expressions;
26use datafusion::physical_plan::projection::ProjectionExec as DFProjectionExec;
27use datafusion::physical_plan::sorts::sort::SortExec;
28use datafusion::physical_plan::{
29    ExecutionPlan, SendableRecordBatchStream,
30    aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy},
31    display::DisplayableExecutionPlan,
32    limit::GlobalLimitExec,
33    repartition::RepartitionExec,
34    union::UnionExec,
35};
36use datafusion::scalar::ScalarValue;
37use datafusion_expr::ExprSchemable;
38use datafusion_expr::execution_props::ExecutionProps;
39use datafusion_functions::core::getfield::GetFieldFunc;
40use datafusion_physical_expr::expressions::Column;
41use datafusion_physical_expr::{LexOrdering, Partitioning, PhysicalExpr, create_physical_expr};
42use datafusion_physical_plan::joins::PartitionMode;
43use datafusion_physical_plan::projection::ProjectionExec;
44use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
45use datafusion_physical_plan::{empty::EmptyExec, joins::HashJoinExec};
46use futures::future::BoxFuture;
47use futures::stream::{Stream, StreamExt};
48use futures::{FutureExt, TryStreamExt};
49use lance_arrow::floats::{FloatType, coerce_float_vector};
50use lance_arrow::{DataTypeExt, SchemaExt as ArrowSchemaExt};
51use lance_core::datatypes::{
52    BlobHandling, Field, OnMissing, Projection, escape_field_path_for_project, format_field_path,
53};
54use lance_core::error::LanceOptionExt;
55use lance_core::utils::address::RowAddress;
56use lance_core::utils::mask::{RowAddrMask, RowAddrTreeMap};
57use lance_core::utils::tokio::get_num_compute_intensive_cpus;
58use lance_core::{ROW_ADDR, ROW_ID, ROW_OFFSET};
59use lance_datafusion::aggregate::Aggregate;
60use lance_datafusion::exec::{
61    LanceExecutionOptions, OneShotExec, StrictBatchSizeExec, analyze_plan, execute_plan,
62};
63use lance_datafusion::expr::safe_coerce_scalar;
64use lance_datafusion::projection::ProjectionPlan;
65use lance_file::reader::FileReaderOptions;
66use lance_index::IndexCriteria;
67use lance_index::scalar::FullTextSearchQuery;
68use lance_index::scalar::expression::ScalarIndexExpr;
69use lance_index::scalar::expression::{INDEX_EXPR_RESULT_SCHEMA, IndexExprResult, PlannerIndexExt};
70use lance_index::scalar::inverted::query::{
71    FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery, PhraseQuery, fill_fts_query_column,
72};
73use lance_index::scalar::inverted::{SCORE_COL, SCORE_FIELD};
74use lance_index::vector::{DEFAULT_QUERY_PARALLELISM, DIST_COL, Query};
75use lance_index::{metrics::NoOpMetricsCollector, scalar::inverted::FTS_SCHEMA};
76use lance_io::stream::RecordBatchStream;
77use lance_linalg::distance::MetricType;
78use lance_table::format::{Fragment, IndexMetadata};
79use roaring::RoaringBitmap;
80use tracing::{Span, info_span, instrument};
81use uuid::Uuid;
82
83use super::Dataset;
84use crate::dataset::row_offsets_to_row_addresses;
85use crate::dataset::utils::SchemaAdapter;
86use crate::index::DatasetIndexInternalExt;
87use crate::index::scalar::inverted::{load_segment_details, load_segments};
88use crate::index::scalar_logical::scalar_index_fragment_bitmap;
89use crate::index::vector::utils::{
90    default_distance_type_for, get_vector_dim, get_vector_type, validate_distance_type_for,
91};
92use crate::io::exec::filtered_read::{FilteredReadExec, FilteredReadOptions};
93use crate::io::exec::fts::{
94    BoostQueryExec, FlatMatchFilterExec, FlatMatchQueryExec, MatchQueryExec, PhraseQueryExec,
95};
96use crate::io::exec::knn::MultivectorScoringExec;
97use crate::io::exec::scalar_index::{MaterializeIndexExec, ScalarIndexExec};
98use crate::io::exec::{
99    AddRowAddrExec, FilterPlan as ExprFilterPlan, KNNVectorDistanceExec, LancePushdownScanExec,
100    LanceScanExec, Planner, PreFilterSource, ScanConfig, TakeExec,
101    knn::{KNN_INDEX_SCHEMA, new_knn_exec},
102    project,
103};
104use crate::io::exec::{AddRowOffsetExec, LanceFilterExec, LanceScanConfig, get_physical_optimizer};
105use crate::{Error, Result};
106use crate::{
107    datatypes::Schema,
108    io::exec::fts::{BoolSlot, BooleanQueryExec, build_boolean_query_children},
109};
110
111pub use lance_datafusion::exec::{ExecutionStatsCallback, ExecutionSummaryCounts};
112#[cfg(feature = "substrait")]
113use lance_datafusion::substrait::parse_substrait;
114
/// Fallback batch size.
///
/// Within this section it is only handed to `parse_env_var` (via
/// `get_default_batch_size`) as the default value quoted in the warning that is logged
/// when `LANCE_DEFAULT_BATCH_SIZE` cannot be parsed.
///
/// NOTE(review): presumably callers elsewhere apply this value (a row count) when no
/// batch size is configured — confirm against the call sites.
pub(crate) const BATCH_SIZE_FALLBACK: usize = 8192;
116
117/// Parse an environment variable as a specific type, logging a warning on parse failure.
118fn parse_env_var<T: std::str::FromStr>(env_var_name: &str, default_val: &str) -> Option<T>
119where
120    T::Err: std::fmt::Display,
121{
122    std::env::var(env_var_name)
123        .ok()
124        .and_then(|val| match val.parse() {
125            Ok(value) => Some(value),
126            Err(e) => {
127                log::warn!(
128                    "Failed to parse the environment variable {}='{}': {}, the default value is: {}.",
129                    env_var_name,
130                    val,
131                    e,
132                    default_val
133                );
134                None
135            }
136        })
137}
138
139// For backwards compatibility / historical reasons we re-calculate the default batch size
140// on each call
141pub fn get_default_batch_size() -> Option<usize> {
142    parse_env_var("LANCE_DEFAULT_BATCH_SIZE", &BATCH_SIZE_FALLBACK.to_string())
143}
144
145pub const LEGACY_DEFAULT_FRAGMENT_READAHEAD: usize = 4;
146
147pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock<Option<usize>> = LazyLock::new(|| {
148    parse_env_var(
149        "LANCE_DEFAULT_FRAGMENT_READAHEAD",
150        &LEGACY_DEFAULT_FRAGMENT_READAHEAD.to_string(),
151    )
152});
153
154const DEFAULT_XTR_OVERFETCH_VALUE: u32 = 10;
155
156pub static DEFAULT_XTR_OVERFETCH: LazyLock<u32> = LazyLock::new(|| {
157    parse_env_var(
158        "LANCE_XTR_OVERFETCH",
159        &DEFAULT_XTR_OVERFETCH_VALUE.to_string(),
160    )
161    .unwrap_or(DEFAULT_XTR_OVERFETCH_VALUE)
162});
163
164// We want to support ~256 concurrent reads to maximize throughput on cloud storage systems
165// Our typical page size is 8MiB (though not all reads are this large yet due to offset buffers, validity buffers, etc.)
166// So we want to support 256 * 8MiB ~= 2GiB of queued reads
167const DEFAULT_IO_BUFFER_SIZE_VALUE: u64 = 2 * 1024 * 1024 * 1024;
168
169pub static DEFAULT_IO_BUFFER_SIZE: LazyLock<u64> = LazyLock::new(|| {
170    parse_env_var(
171        "LANCE_DEFAULT_IO_BUFFER_SIZE",
172        &DEFAULT_IO_BUFFER_SIZE_VALUE.to_string(),
173    )
174    .unwrap_or(DEFAULT_IO_BUFFER_SIZE_VALUE)
175});
176
177/// The user-set value of `LANCE_DEFAULT_IO_BUFFER_SIZE`, or `None` if the env var
178/// is unset or unparsable. Consult this from paths that have a sensible non-fixed
179/// default (e.g. `SchedulerConfig::max_bandwidth`) so the env var still takes
180/// precedence over that default. Re-reads the env var on each call so tests can
181/// mutate it.
182pub fn get_default_io_buffer_size_override() -> Option<u64> {
183    parse_env_var(
184        "LANCE_DEFAULT_IO_BUFFER_SIZE",
185        &DEFAULT_IO_BUFFER_SIZE_VALUE.to_string(),
186    )
187}
188
/// Sort specification for one column.
///
/// Floats are sorted using the IEEE 754 total ordering.
/// Strings are sorted using UTF-8 lexicographic order (i.e. we sort the binary).
#[derive(Debug, Clone)]
pub struct ColumnOrdering {
    /// `true` for ascending order, `false` for descending order
    pub ascending: bool,
    /// `true` to place nulls before non-null values, `false` to place them after
    pub nulls_first: bool,
    /// Name of the column this ordering applies to
    pub column_name: String,
}

impl ColumnOrdering {
    /// Ascending order with nulls placed first.
    pub fn asc_nulls_first(column_name: String) -> Self {
        Self {
            column_name,
            ascending: true,
            nulls_first: true,
        }
    }

    /// Ascending order with nulls placed last.
    pub fn asc_nulls_last(column_name: String) -> Self {
        Self {
            column_name,
            ascending: true,
            nulls_first: false,
        }
    }

    /// Descending order with nulls placed first.
    pub fn desc_nulls_first(column_name: String) -> Self {
        Self {
            column_name,
            ascending: false,
            nulls_first: true,
        }
    }

    /// Descending order with nulls placed last.
    pub fn desc_nulls_last(column_name: String) -> Self {
        Self {
            column_name,
            ascending: false,
            nulls_first: false,
        }
    }
}
233
/// Materialization style for the scanner
///
/// This only affects columns that are not used in a filter
///
/// Early materialization will fetch the entire column and throw
/// away the rows that are not needed.  This fetches more data but
/// uses fewer I/O requests.
///
/// Late materialization will only fetch the rows that are needed.
/// This fetches less data but uses more I/O requests.
///
/// This parameter only affects scans.  Vector search and full text search
/// always use late materialization.
#[derive(Clone)]
pub enum MaterializationStyle {
    /// Heuristic-based materialization style
    ///
    /// The default approach depends on the type of object storage.  For
    /// cloud storage (e.g. S3, GCS, etc.) we only use late materialization
    /// for columns that are more than 1000 bytes in size.
    ///
    /// For local storage we use late materialization for columns that are
    /// more than 10 bytes in size.
    ///
    /// These values are based on experimentation and the assumption that a
    /// filter will be selecting ~0.1% of the rows in a column.
    Heuristic,
    /// All columns will be fetched with late materialization where possible
    AllLate,
    /// All columns will be fetched with early materialization where possible
    AllEarly,
    /// All columns will be fetched with early materialization except for the specified
    /// columns, which are identified by field id (see `all_early_except`).
    ///
    /// NOTE(review): this wording follows the variant name.  The previous comment said
    /// "late materialization except for the specified columns", which contradicts the
    /// name — confirm against the planner code that consumes this variant.
    AllEarlyExcept(Vec<u32>),
}
268
269impl MaterializationStyle {
270    pub fn all_early_except(columns: &[impl AsRef<str>], schema: &Schema) -> Result<Self> {
271        let field_ids = schema
272            .project(columns)?
273            .field_ids()
274            .into_iter()
275            .map(|id| id as u32)
276            .collect();
277        Ok(Self::AllEarlyExcept(field_ids))
278    }
279}
280
/// An execution plan for a filtered scan together with flags describing what was
/// pushed down.
///
/// NOTE(review): the code that builds and consumes this struct is outside this section;
/// the field notes below are inferred from the field names — confirm.
#[derive(Debug)]
struct PlannedFilteredScan {
    // The planned execution plan.
    plan: Arc<dyn ExecutionPlan>,
    // Presumably true when the limit was already applied inside `plan` — TODO confirm.
    limit_pushed_down: bool,
    // Presumably true when the filter was already applied inside `plan` — TODO confirm.
    filter_pushed_down: bool,
}
287
/// Combines an optional query filter (full text search or vector search) with an
/// expression filter plan.
pub struct FilterPlan {
    // Query filter plan
    query_filter: Option<QueryFilter>,
    // When true, `refine_filter` re-applies `query_filter` on top of its input plan and
    // `refine_columns` reports the columns that step needs.  Starts as false in `new`,
    // is enabled by `make_refine_only` and cleared by `disable_refine`.
    refine_query_filter: bool,
    // Expr filter plan
    expr_filter_plan: ExprFilterPlan,
}
295
296impl FilterPlan {
297    pub fn new(query_filter: Option<QueryFilter>, expr_filter_plan: ExprFilterPlan) -> Self {
298        Self {
299            query_filter,
300            refine_query_filter: false,
301            expr_filter_plan,
302        }
303    }
304
305    pub fn disable_refine(&mut self) {
306        self.expr_filter_plan = ExprFilterPlan::default();
307        self.refine_query_filter = false;
308    }
309
310    pub fn make_refine_only(&mut self) {
311        self.expr_filter_plan.make_refine_only();
312        self.refine_query_filter = true;
313    }
314
315    pub fn fts_filter(&self) -> Option<FullTextSearchQuery> {
316        match &self.query_filter {
317            Some(QueryFilter::Fts(query)) => Some(query.clone()),
318            _ => None,
319        }
320    }
321
322    pub fn vector_filter(&self) -> Option<Query> {
323        match &self.query_filter {
324            Some(QueryFilter::Vector(query)) => Some(query.clone()),
325            _ => None,
326        }
327    }
328
329    pub fn has_refine(&self) -> bool {
330        self.expr_filter_plan.has_refine() || self.refine_query_filter
331    }
332
333    pub async fn refine_columns(&self, dataset: &Arc<Dataset>) -> Result<Vec<String>> {
334        let mut columns = vec![];
335
336        if self.expr_filter_plan.has_refine() {
337            columns.extend(self.expr_filter_plan.refine_columns());
338        }
339
340        if self.refine_query_filter {
341            match &self.query_filter {
342                Some(QueryFilter::Fts(fts_query)) => {
343                    let cols = if fts_query.columns().is_empty() {
344                        let indexed_columns = fts_indexed_columns(dataset.clone()).await?;
345                        let q = fill_fts_query_column(&fts_query.query, &indexed_columns, false)?;
346                        q.columns()
347                    } else {
348                        fts_query.columns()
349                    };
350
351                    // Add refine column for match query since it supports `FlatMatchQueryExec`.
352                    // Other fts query use join so we don't need to add refine column.
353                    if let FtsQuery::Match(_) = &fts_query.query {
354                        columns.extend(cols.iter().cloned().collect::<Vec<_>>());
355                    }
356                }
357                Some(QueryFilter::Vector(vector_query)) => {
358                    columns.push(vector_query.column.clone());
359                }
360                None => {}
361            }
362        }
363
364        Ok(columns)
365    }
366
367    pub async fn refine_filter(
368        &self,
369        input: Arc<dyn ExecutionPlan>,
370        scanner: &Scanner,
371    ) -> Result<Arc<dyn ExecutionPlan>> {
372        let mut plan = input;
373
374        if self.refine_query_filter {
375            match &self.query_filter {
376                Some(QueryFilter::Fts(fts_query)) => {
377                    plan = scanner.flat_fts_filter(plan, fts_query).await?;
378                }
379                Some(QueryFilter::Vector(vector_query)) => {
380                    plan = scanner.flat_knn(plan, vector_query)?;
381                }
382                None => {}
383            }
384        }
385
386        if let Some(refine_expr) = &self.expr_filter_plan.refine_expr {
387            // We create a new planner specific to the node's schema, since
388            // physical expressions reference column by index rather than by name.
389            plan = Arc::new(LanceFilterExec::try_new(refine_expr.clone(), plan)?);
390        }
391
392        Ok(plan)
393    }
394}
395
396#[derive(Debug, Clone, Default)]
397pub struct LanceFilter {
398    query_filter: Option<QueryFilter>,
399    expr_filter: Option<ExprFilter>,
400}
401
402impl LanceFilter {
403    pub fn is_none(&self) -> bool {
404        self.query_filter.is_none() && self.expr_filter.is_none()
405    }
406}
407
/// Query filter for filtering rows
///
/// Unlike [`ExprFilter`], the filter is expressed as a search query rather than as an
/// expression.
#[derive(Debug, Clone)]
pub enum QueryFilter {
    /// The filter is a full text search query
    Fts(FullTextSearchQuery),
    /// The filter is a vector search query
    Vector(Query),
}
414
/// Expr filter for filtering rows
///
/// Every variant can be converted to a DataFusion expression with
/// `ExprFilter::to_datafusion`.
#[derive(Debug, Clone)]
pub enum ExprFilter {
    /// The filter is an SQL string
    Sql(String),
    /// The filter is a Substrait expression, held as bytes.  Converting it requires the
    /// `substrait` feature.
    Substrait(Vec<u8>),
    /// The filter is a Datafusion expression
    Datafusion(Expr),
}
425
426impl ExprFilter {
427    /// Converts the filter to a Datafusion expression
428    ///
429    /// The schema for this conversion should be the full schema available to
430    /// the filter (`full_schema`).  However, due to a limitation in the way
431    /// we do Substrait conversion today we can only do Substrait conversion with
432    /// the dataset schema (`dataset_schema`).  This means that Substrait will
433    /// not be able to access columns that are not in the dataset schema (e.g.
434    /// _rowid, _rowaddr, etc.)
435    #[allow(unused)]
436    #[instrument(level = "trace", name = "filter_to_df", skip_all)]
437    pub fn to_datafusion(&self, dataset_schema: &Schema, full_schema: &Schema) -> Result<Expr> {
438        match self {
439            Self::Sql(sql) => {
440                let schema = Arc::new(ArrowSchema::from(full_schema));
441                let planner = Planner::new(schema.clone());
442                let filter = planner.parse_filter(sql)?;
443
444                let df_schema = DFSchema::try_from(schema)?;
445                let ret_field = filter.to_field(&df_schema)?.1;
446                let ret_type = ret_field.data_type();
447                if ret_type != &DataType::Boolean {
448                    return Err(Error::invalid_input_source(
449                        format!("The filter {} does not return a boolean", filter).into(),
450                    ));
451                }
452
453                let optimized = planner.optimize_expr(filter).map_err(|e| {
454                    Error::invalid_input(format!("Error optimizing sql filter: {sql} ({e})"))
455                })?;
456                Ok(optimized)
457            }
458            #[cfg(feature = "substrait")]
459            Self::Substrait(expr) => {
460                use lance_datafusion::exec::{LanceExecutionOptions, get_session_context};
461
462                let ctx = get_session_context(&LanceExecutionOptions::default());
463                let state = ctx.state();
464                let schema = Arc::new(ArrowSchema::from(dataset_schema));
465                let expr = parse_substrait(expr, schema.clone(), &ctx.state())
466                    .now_or_never()
467                    .expect("could not parse the Substrait filter in a synchronous fashion")?;
468                let planner = Planner::new(schema);
469                planner.optimize_expr(expr.clone()).map_err(|e| {
470                    Error::invalid_input(format!(
471                        "Error optimizing substrait filter: {expr:?} ({e})"
472                    ))
473                })
474            }
475            #[cfg(not(feature = "substrait"))]
476            Self::Substrait(_) => Err(Error::not_supported_source(
477                "Substrait filter is not supported in this build".into(),
478            )),
479            Self::Datafusion(expr) => Ok(expr.clone()),
480        }
481    }
482}
483
/// Aggregate expression from Substrait or DataFusion.
#[derive(Debug, Clone)]
pub enum AggregateExpr {
    /// Substrait Plan bytes.  Only available with the `substrait` feature.
    #[cfg(feature = "substrait")]
    Substrait(Vec<u8>),
    /// DataFusion expressions
    Datafusion {
        /// Expressions to group by
        group_by: Vec<Expr>,
        /// Aggregate expressions; `.alias()` on an expression sets its output column name
        aggregates: Vec<Expr>,
    },
}
494
495impl AggregateExpr {
496    /// Create a new builder for aggregate expressions.
497    ///
498    /// # Example
499    /// ```ignore
500    /// let agg = AggregateExpr::builder()
501    ///     .group_by("category")
502    ///     .count_star().alias("total_count")
503    ///     .sum("amount").alias("total_amount")
504    ///     .avg("price")
505    ///     .build();
506    /// scanner.aggregate(agg);
507    /// ```
508    pub fn builder() -> AggregateExprBuilder<false> {
509        AggregateExprBuilder::new()
510    }
511
512    /// Create from Substrait Plan bytes.
513    #[cfg(feature = "substrait")]
514    pub fn substrait(bytes: impl Into<Vec<u8>>) -> Self {
515        Self::Substrait(bytes.into())
516    }
517
518    /// Create from DataFusion expressions.
519    /// Use `.alias()` on expressions to set output column names.
520    pub fn datafusion(group_by: Vec<Expr>, aggregates: Vec<Expr>) -> Self {
521        Self::Datafusion {
522            group_by,
523            aggregates,
524        }
525    }
526
527    /// Parse into a unified Aggregate structure.
528    ///
529    /// For Substrait, this parses the bytes into DataFusion expressions.
530    /// For DataFusion, this just wraps the expressions.
531    ///
532    /// The schema is used to resolve field references in Substrait expressions.
533    fn parse(self, #[allow(unused_variables)] schema: Arc<ArrowSchema>) -> Result<Aggregate> {
534        match self {
535            #[cfg(feature = "substrait")]
536            Self::Substrait(bytes) => {
537                use lance_datafusion::exec::{LanceExecutionOptions, get_session_context};
538                use lance_datafusion::substrait::parse_substrait_aggregate;
539
540                let ctx = get_session_context(&LanceExecutionOptions::default());
541                parse_substrait_aggregate(&bytes, schema, &ctx.state())
542                    .now_or_never()
543                    .expect("could not parse the Substrait aggregate in a synchronous fashion")
544            }
545            Self::Datafusion {
546                group_by,
547                aggregates,
548            } => Ok(Aggregate::new(group_by, aggregates)),
549        }
550    }
551}
552
553/// Builder for creating aggregate expressions without using DataFusion or Substrait directly.
554///
555/// The const generic `HAS_PENDING` tracks whether there's a pending aggregate that can be aliased.
556/// When `HAS_PENDING` is `true`, the last item in `aggregates` is the pending aggregate.
557#[derive(Debug, Clone)]
558pub struct AggregateExprBuilder<const HAS_PENDING: bool> {
559    group_by: Vec<Expr>,
560    aggregates: Vec<Expr>,
561}
562
563impl Default for AggregateExprBuilder<false> {
564    fn default() -> Self {
565        Self {
566            group_by: Vec::new(),
567            aggregates: Vec::new(),
568        }
569    }
570}
571
572impl AggregateExprBuilder<false> {
573    /// Create a new builder.
574    pub fn new() -> Self {
575        Self::default()
576    }
577
578    /// Build the aggregate expression.
579    pub fn build(self) -> AggregateExpr {
580        AggregateExpr::Datafusion {
581            group_by: self.group_by,
582            aggregates: self.aggregates,
583        }
584    }
585}
586
587impl<const HAS_PENDING: bool> AggregateExprBuilder<HAS_PENDING> {
588    /// Add a column to group by.
589    ///
590    /// Multiple invocations will add to the list (not replace it).
591    /// E.g. `.group_by("x").group_by("y")` will group by both `x` and `y`.
592    pub fn group_by(mut self, column: impl Into<String>) -> AggregateExprBuilder<false> {
593        self.group_by.push(col(column.into()));
594        AggregateExprBuilder {
595            group_by: self.group_by,
596            aggregates: self.aggregates,
597        }
598    }
599
600    /// Add multiple columns to group by.
601    ///
602    /// Multiple invocations will add to the list (not replace it).
603    /// E.g. `.group_by("x").group_by_columns(["y", "z"])` will group by `x`, `y`, and `z`.
604    pub fn group_by_columns(
605        mut self,
606        columns: impl IntoIterator<Item = impl Into<String>>,
607    ) -> AggregateExprBuilder<false> {
608        for column in columns {
609            self.group_by.push(col(column.into()));
610        }
611        AggregateExprBuilder {
612            group_by: self.group_by,
613            aggregates: self.aggregates,
614        }
615    }
616
617    /// Add COUNT(*) aggregate that counts all rows.
618    pub fn count_star(mut self) -> AggregateExprBuilder<true> {
619        self.aggregates
620            .push(functions_aggregate::count::count(lit(1)));
621        AggregateExprBuilder {
622            group_by: self.group_by,
623            aggregates: self.aggregates,
624        }
625    }
626
627    /// Add COUNT(column) aggregate.
628    ///
629    /// Unlike `count_star`, this will only count the number of rows where `column`
630    /// is not NULL.
631    pub fn count(mut self, column: impl Into<String>) -> AggregateExprBuilder<true> {
632        self.aggregates
633            .push(functions_aggregate::count::count(col(column.into())));
634        AggregateExprBuilder {
635            group_by: self.group_by,
636            aggregates: self.aggregates,
637        }
638    }
639
640    /// Add SUM(column) aggregate.
641    pub fn sum(mut self, column: impl Into<String>) -> AggregateExprBuilder<true> {
642        self.aggregates
643            .push(functions_aggregate::sum::sum(col(column.into())));
644        AggregateExprBuilder {
645            group_by: self.group_by,
646            aggregates: self.aggregates,
647        }
648    }
649
650    /// Add AVG(column) aggregate.
651    pub fn avg(mut self, column: impl Into<String>) -> AggregateExprBuilder<true> {
652        self.aggregates
653            .push(functions_aggregate::average::avg(col(column.into())));
654        AggregateExprBuilder {
655            group_by: self.group_by,
656            aggregates: self.aggregates,
657        }
658    }
659
660    /// Add MIN(column) aggregate.
661    pub fn min(mut self, column: impl Into<String>) -> AggregateExprBuilder<true> {
662        self.aggregates
663            .push(functions_aggregate::min_max::min(col(column.into())));
664        AggregateExprBuilder {
665            group_by: self.group_by,
666            aggregates: self.aggregates,
667        }
668    }
669
670    /// Add MAX(column) aggregate.
671    pub fn max(mut self, column: impl Into<String>) -> AggregateExprBuilder<true> {
672        self.aggregates
673            .push(functions_aggregate::min_max::max(col(column.into())));
674        AggregateExprBuilder {
675            group_by: self.group_by,
676            aggregates: self.aggregates,
677        }
678    }
679}
680
681impl AggregateExprBuilder<true> {
682    /// Set an alias for the pending aggregate (the last added aggregate).
683    pub fn alias(mut self, name: impl Into<String>) -> AggregateExprBuilder<false> {
684        let pending = self.aggregates.pop().expect("pending aggregate must exist");
685        self.aggregates.push(pending.alias(name.into()));
686        AggregateExprBuilder {
687            group_by: self.group_by,
688            aggregates: self.aggregates,
689        }
690    }
691
692    /// Build the aggregate expression.
693    pub fn build(self) -> AggregateExpr {
694        AggregateExpr::Datafusion {
695            group_by: self.group_by,
696            aggregates: self.aggregates,
697        }
698    }
699}
700
/// Dataset Scanner
///
/// Holds the configuration of a scan over a dataset (projection, filters, batching,
/// ordering, search queries, ...).
///
/// ```rust,ignore
/// let dataset = Dataset::open(uri).await.unwrap();
/// let stream = dataset.scan()
///     .project(&["col", "col2.subfield"]).unwrap()
///     .limit(10)
///     .into_stream();
/// stream
///   .map(|batch| batch.num_rows())
///   .buffered(16)
///   .sum()
/// ```
#[derive(Clone)]
pub struct Scanner {
    /// The dataset being scanned
    dataset: Arc<Dataset>,

    /// The projection plan for the scanner
    ///
    /// This includes
    /// - The physical projection that must be read from the dataset
    /// - Dynamic expressions that are evaluated after the physical projection
    /// - The names of the output columns
    projection_plan: ProjectionPlan,
    /// How blob columns are handled
    ///
    /// NOTE(review): the semantics of `BlobHandling` are defined in `lance_core` and are
    /// not visible here.
    blob_handling: BlobHandling,

    /// If true then the filter will be applied before an index scan
    prefilter: bool,

    /// Materialization style controls when columns are fetched
    materialization_style: MaterializationStyle,

    /// Filter.
    filter: LanceFilter,

    /// Optional full text search query
    full_text_query: Option<FullTextSearchQuery>,

    /// The batch size controls the maximum number of rows to return for each read.
    batch_size: Option<usize>,

    /// If set, the scanner will produce batches whose total size in bytes
    /// is approximately this value, overriding the row-based `batch_size`.
    batch_size_bytes: Option<u64>,

    /// Number of batches to prefetch
    batch_readahead: usize,

    /// Number of fragments to read concurrently
    fragment_readahead: Option<usize>,

    /// Number of bytes to allow to queue up in the I/O buffer
    io_buffer_size: Option<u64>,

    /// Optional limit
    ///
    /// NOTE(review): presumably the maximum number of rows to return — confirm.
    limit: Option<i64>,
    /// Optional offset
    ///
    /// NOTE(review): presumably the number of rows to skip before returning results —
    /// confirm.
    offset: Option<i64>,

    /// If Some then results will be ordered by the provided ordering
    ///
    /// If there are multiple columns the results will first be ordered
    /// by the first column.  Then, any values whose first column is equal
    /// will be sorted by the next column, and so on.
    ///
    /// If this is Some then the value of `ordered` is ignored.  The scan
    /// will always be unordered since we are just going to reorder it anyways.
    ordering: Option<Vec<ColumnOrdering>>,

    /// Optional vector search query
    nearest: Option<Query>,

    /// If false, do not use any scalar indices for the scan
    ///
    /// This can be used to pick a more efficient plan for certain queries where
    /// scalar indices do not work well (though we should also improve our planning
    /// to handle this better in the future as well)
    use_scalar_index: bool,

    /// Whether to use statistics to optimize the scan (default: true)
    ///
    /// This is used for debugging or benchmarking purposes.
    use_stats: bool,

    /// Whether to scan in deterministic order (default: true)
    ///
    /// This field is ignored if `ordering` is defined
    ordered: bool,

    /// If set, this scanner serves only these fragments.
    fragments: Option<Vec<Fragment>>,

    /// If set, this scanner will only search the specified vector index segments.
    index_segments: Option<Vec<Uuid>>,

    /// Only search data that has already been indexed (weak consistency search).
    ///
    /// Default value is false.
    ///
    /// This is essentially a weak consistency search. Users can run index or optimize index
    /// to make the index catch up with the latest data.
    fast_search: bool,

    /// If true, the scanner will emit deleted rows
    include_deleted_rows: bool,

    /// If set, this callback will be called after the scan with summary statistics
    scan_stats_callback: Option<ExecutionStatsCallback>,

    /// Whether the result returned by the scanner must be of the size of the batch_size.
    /// By default, it is false.
    /// Mainly, if the result is returned strictly according to the batch_size,
    /// batching and waiting are required, and the performance will decrease.
    strict_batch_size: bool,

    /// File reader options to use when reading data files.
    file_reader_options: Option<FileReaderOptions>,

    /// Optional aggregate to apply
    ///
    /// NOTE(review): where in the plan the aggregate is applied is decided by planning
    /// code outside this section.
    aggregate: Option<Aggregate>,

    // Legacy fields to help migrate some old projection behavior to new behavior
    //
    // There are two behaviors we are moving away from:
    //
    // First, the old behavior used methods like with_row_id and with_row_addr to add
    // "system" columns.  The new behavior is to specify them in the projection like any
    // other column.  The only difference between a system column and a regular column is
    // that system columns are not returned in the schema and are not returned by default
    // (i.e. "SELECT *")
    //
    // Second, the old behavior would _always_ add the _score or _distance columns to the
    // output and there was no way for the user to opt out.  The new behavior treats the
    // _score and _distance as regular output columns of the "search table function".  If
    // the user does not specify a projection (i.e. "SELECT *") then we will add the _score
    // and _distance columns to the end.  If the user does specify a projection then they
    // must request those columns for them to show up.
    //
    // --------------------------------------------------------------------------
    /// Whether the user wants the row id on top of the projection, will always come last
    /// except possibly before _rowaddr
    legacy_with_row_id: bool,
    /// Whether the user wants the row address on top of the projection, will always come last
    legacy_with_row_addr: bool,
    /// Whether the user explicitly requested a projection.  If they did then we will warn them
    /// if they do not specify _score / _distance unless legacy_projection_behavior is set to false
    ///
    /// NOTE(review): this struct has no `legacy_projection_behavior` field; the sentence
    /// above presumably refers to `autoproject_scoring_columns` — confirm.
    explicit_projection: bool,
    /// Whether the user wants to use the legacy projection behavior.
    ///
    /// NOTE(review): going by the field name and the migration notes above, this
    /// presumably controls whether the _score / _distance columns are added to the
    /// output automatically — confirm.
    autoproject_scoring_columns: bool,
}
847
/// A take (row lookup) operation requested by the user
///
/// Each variant carries the values to look up; the variant decides how those
/// values are interpreted.
#[derive(Debug, Clone)]
pub enum TakeOperation {
    /// Look up rows by row id
    RowIds(Vec<u64>),
    /// Look up rows by row address
    RowAddrs(Vec<u64>),
    /// Look up rows by row offset
    ///
    /// A row offset is the position of the row within the dataset.  Offsets can
    /// be translated into row addresses using the fragment sizes.
    RowOffsets(Vec<u64>),
}
861
862impl TakeOperation {
863    fn extract_u64_list(list: &[Expr]) -> Option<Vec<u64>> {
864        let mut u64s = Vec::with_capacity(list.len());
865        for expr in list {
866            if let Expr::Literal(lit, _) = expr {
867                if let Some(ScalarValue::UInt64(Some(val))) =
868                    safe_coerce_scalar(lit, &DataType::UInt64)
869                {
870                    u64s.push(val);
871                } else {
872                    return None;
873                }
874            } else {
875                return None;
876            }
877        }
878        Some(u64s)
879    }
880
881    fn merge(self, other: Self) -> Option<Self> {
882        match (self, other) {
883            (Self::RowIds(mut left), Self::RowIds(right)) => {
884                left.extend(right);
885                Some(Self::RowIds(left))
886            }
887            (Self::RowAddrs(mut left), Self::RowAddrs(right)) => {
888                left.extend(right);
889                Some(Self::RowAddrs(left))
890            }
891            (Self::RowOffsets(mut left), Self::RowOffsets(right)) => {
892                left.extend(right);
893                Some(Self::RowOffsets(left))
894            }
895            _ => None,
896        }
897    }
898
899    /// Attempts to create a take operation from an expression.  This will succeed if the expression
900    /// has one of the following forms:
901    ///  - `_rowid = 10`
902    ///  - `_rowid = 10 OR _rowid = 20 OR _rowid = 30`
903    ///  - `_rowid IN (10, 20, 30)`
904    ///  - `_rowaddr = 10`
905    ///  - `_rowaddr = 10 OR _rowaddr = 20 OR _rowaddr = 30`
906    ///  - `_rowaddr IN (10, 20, 30)`
907    ///  - `_rowoffset = 10`
908    ///  - `_rowoffset = 10 OR _rowoffset = 20 OR _rowoffset = 30`
909    ///  - `_rowoffset IN (10, 20, 30)`
910    ///
911    /// The _rowid / _rowaddr / _rowoffset determine if we are taking by row id, address, or offset.
912    ///
913    /// If a take expression is combined with some other filter via an AND then the remainder will be
914    /// returned as well.  For example, `_rowid = 10` will return (take_op, None) and
915    /// `_rowid = 10 AND x > 70` will return (take_op, Some(x > 70)).
916    fn try_from_expr(expr: &Expr) -> Option<(Self, Option<Expr>)> {
917        if let Expr::BinaryExpr(binary) = expr {
918            match binary.op {
919                datafusion_expr::Operator::And => {
920                    let left_take = Self::try_from_expr(&binary.left);
921                    let right_take = Self::try_from_expr(&binary.right);
922                    match (left_take, right_take) {
923                        (Some(_), Some(_)) => {
924                            // This is something like...
925                            //
926                            // _rowid = 10 AND _rowid = 20
927                            //
928                            // ...which is kind of nonsensical.  Better to just return None.
929                            return None;
930                        }
931                        (Some((left_op, left_rem)), None) => {
932                            let remainder = match left_rem {
933                                // If there is a remainder on the left side we combine it.  This _should_
934                                // be something like converting (_rowid = 10 AND x > 70) AND y > 80
935                                // to (_rowid = 10) AND (x > 70 AND y > 80) which should be valid
936                                Some(expr) => Expr::and(expr, binary.right.as_ref().clone()),
937                                None => binary.right.as_ref().clone(),
938                            };
939                            return Some((left_op, Some(remainder)));
940                        }
941                        (None, Some((right_op, right_rem))) => {
942                            let remainder = match right_rem {
943                                Some(expr) => Expr::and(expr, binary.left.as_ref().clone()),
944                                None => binary.left.as_ref().clone(),
945                            };
946                            return Some((right_op, Some(remainder)));
947                        }
948                        (None, None) => {
949                            return None;
950                        }
951                    }
952                }
953                datafusion_expr::Operator::Eq => {
954                    // Check for _rowid = literal
955                    if let (Expr::Column(col), Expr::Literal(lit, _)) =
956                        (binary.left.as_ref(), binary.right.as_ref())
957                        && let Some(ScalarValue::UInt64(Some(val))) =
958                            safe_coerce_scalar(lit, &DataType::UInt64)
959                    {
960                        if col.name == ROW_ID {
961                            return Some((Self::RowIds(vec![val]), None));
962                        } else if col.name == ROW_ADDR {
963                            return Some((Self::RowAddrs(vec![val]), None));
964                        } else if col.name == ROW_OFFSET {
965                            return Some((Self::RowOffsets(vec![val]), None));
966                        }
967                    }
968                }
969                datafusion_expr::Operator::Or => {
970                    let left_take = Self::try_from_expr(&binary.left);
971                    let right_take = Self::try_from_expr(&binary.right);
972                    if let (Some(left), Some(right)) = (left_take, right_take) {
973                        if left.1.is_some() || right.1.is_some() {
974                            // This would be something like...
975                            //
976                            // (_rowid = 10 AND x > 70) OR _rowid = 20
977                            //
978                            // I don't think it's correct to convert this into a take operation
979                            // which would give us (_rowid = 10 OR _rowid = 20) AND x > 70
980                            return None;
981                        }
982                        return left.0.merge(right.0).map(|op| (op, None));
983                    }
984                }
985                _ => {}
986            }
987        } else if let Expr::InList(in_expr) = expr
988            && let Expr::Column(col) = in_expr.expr.as_ref()
989            && let Some(u64s) = Self::extract_u64_list(&in_expr.list)
990        {
991            if col.name == ROW_ID {
992                return Some((Self::RowIds(u64s), None));
993            } else if col.name == ROW_ADDR {
994                return Some((Self::RowAddrs(u64s), None));
995            } else if col.name == ROW_OFFSET {
996                return Some((Self::RowOffsets(u64s), None));
997            }
998        }
999        None
1000    }
1001}
1002
1003impl Scanner {
1004    pub fn new(dataset: Arc<Dataset>) -> Self {
1005        let projection_plan = ProjectionPlan::full(dataset.clone()).unwrap();
1006        let file_reader_options = dataset.file_reader_options.clone();
1007        let mut scanner = Self {
1008            dataset,
1009            projection_plan,
1010            blob_handling: BlobHandling::default(),
1011            prefilter: false,
1012            materialization_style: MaterializationStyle::Heuristic,
1013            filter: LanceFilter::default(),
1014            full_text_query: None,
1015            batch_size: None,
1016            batch_size_bytes: None,
1017            batch_readahead: get_num_compute_intensive_cpus(),
1018            fragment_readahead: None,
1019            io_buffer_size: None,
1020            limit: None,
1021            offset: None,
1022            ordering: None,
1023            nearest: None,
1024            use_stats: true,
1025            ordered: true,
1026            fragments: None,
1027            index_segments: None,
1028            fast_search: false,
1029            use_scalar_index: true,
1030            include_deleted_rows: false,
1031            scan_stats_callback: None,
1032            strict_batch_size: false,
1033            file_reader_options,
1034            aggregate: None,
1035            legacy_with_row_addr: false,
1036            legacy_with_row_id: false,
1037            explicit_projection: false,
1038            autoproject_scoring_columns: true,
1039        };
1040        scanner.apply_blob_handling();
1041        scanner
1042    }
1043
1044    fn apply_blob_handling(&mut self) {
1045        let projection = self
1046            .projection_plan
1047            .physical_projection
1048            .clone()
1049            .with_blob_handling(self.blob_handling.clone());
1050        self.projection_plan.physical_projection = projection;
1051    }
1052
1053    pub fn blob_handling(&mut self, blob_handling: BlobHandling) -> &mut Self {
1054        self.blob_handling = blob_handling;
1055        self.apply_blob_handling();
1056        self
1057    }
1058
1059    pub fn from_fragment(dataset: Arc<Dataset>, fragment: Fragment) -> Self {
1060        Self {
1061            fragments: Some(vec![fragment]),
1062            ..Self::new(dataset)
1063        }
1064    }
1065
1066    /// Set which fragments should be scanned.
1067    ///
1068    /// If scan_in_order is set to true, the fragments will be scanned in the order of the vector.
1069    pub fn with_fragments(&mut self, fragments: Vec<Fragment>) -> &mut Self {
1070        self.fragments = Some(fragments);
1071        self
1072    }
1073
1074    /// Restrict vector index search to the specified index segments.
1075    ///
1076    /// This setting is only supported for vector search.
1077    ///
1078    /// If [`Self::with_fragments`] is also set then rows from those fragments that are not covered
1079    /// by the selected index segments will still be searched with flat KNN. Otherwise, unindexed
1080    /// fragments outside the selected index segments are not searched.
1081    pub fn with_index_segments(&mut self, segments: Vec<Uuid>) -> Result<&mut Self> {
1082        if segments.is_empty() {
1083            return Err(Error::invalid_input(
1084                "with_index_segments does not accept an empty segment list".to_string(),
1085            ));
1086        }
1087        self.index_segments = Some(segments);
1088        Ok(self)
1089    }
1090
1091    fn get_batch_size(&self) -> usize {
1092        // Default batch size to be large enough so that a i32 column can be
1093        // read in a single range request. For the object store default of
1094        // 64KB, this is 16K rows. For local file systems, the default block size
1095        // is just 4K, which would mean only 1K rows, which might be a little small.
1096        // So we use a default minimum of 8K rows.
1097        get_default_batch_size().unwrap_or_else(|| {
1098            self.batch_size.unwrap_or_else(|| {
1099                std::cmp::max(
1100                    self.dataset.object_store.as_ref().block_size() / 4,
1101                    BATCH_SIZE_FALLBACK,
1102                )
1103            })
1104        })
1105    }
1106
1107    fn ensure_not_fragment_scan(&self) -> Result<()> {
1108        if self.is_fragment_scan() {
1109            Err(Error::not_supported(
1110                "This operation is not supported for fragment scan".to_string(),
1111            ))
1112        } else {
1113            Ok(())
1114        }
1115    }
1116
1117    fn is_fragment_scan(&self) -> bool {
1118        self.fragments.is_some()
1119    }
1120
1121    /// Empty Projection (useful for count queries)
1122    ///
1123    /// The row_address will be scanned (no I/O required) but not included in the output
1124    pub fn empty_project(&mut self) -> Result<&mut Self> {
1125        self.project(&[] as &[&str])
1126    }
1127
1128    /// Projection.
1129    ///
1130    /// Only select the specified columns. If not specified, all columns will be scanned.
1131    pub fn project<T: AsRef<str>>(&mut self, columns: &[T]) -> Result<&mut Self> {
1132        let transformed_columns: Vec<(&str, String)> = columns
1133            .iter()
1134            .map(|c| (c.as_ref(), escape_field_path_for_project(c.as_ref())))
1135            .collect();
1136
1137        self.project_with_transform(&transformed_columns)
1138    }
1139
1140    /// Projection with transform
1141    ///
1142    /// Only select the specified columns with the given transform.
1143    pub fn project_with_transform(
1144        &mut self,
1145        columns: &[(impl AsRef<str>, impl AsRef<str>)],
1146    ) -> Result<&mut Self> {
1147        self.explicit_projection = true;
1148        self.projection_plan = ProjectionPlan::from_expressions(self.dataset.clone(), columns)?;
1149        if self.legacy_with_row_id {
1150            self.projection_plan.include_row_id();
1151        }
1152        if self.legacy_with_row_addr {
1153            self.projection_plan.include_row_addr();
1154        }
1155        self.apply_blob_handling();
1156        Ok(self)
1157    }
1158
1159    /// Should the filter run before the vector index is applied
1160    ///
1161    /// If true then the filter will be applied before the vector index.  This
1162    /// means the results will be accurate but the overall query may be more expensive.
1163    ///
1164    /// If false then the filter will be applied to the nearest results.  This means
1165    /// you may get back fewer results than you ask for (or none at all) if the closest
1166    /// results do not match the filter.
1167    pub fn prefilter(&mut self, should_prefilter: bool) -> &mut Self {
1168        self.prefilter = should_prefilter;
1169        self
1170    }
1171
1172    /// Set the callback to be called after the scan with summary statistics
1173    pub fn scan_stats_callback(&mut self, callback: ExecutionStatsCallback) -> &mut Self {
1174        self.scan_stats_callback = Some(callback);
1175        self
1176    }
1177
1178    /// Set the materialization style for the scan
1179    ///
1180    /// This controls when columns are fetched from storage.  The default should work
1181    /// well for most cases.
1182    ///
1183    /// If you know (in advance) a query will return relatively few results (less than
1184    /// 0.1% of the rows) then you may want to experiment with applying late materialization
1185    /// to more (or all) columns.
1186    ///
1187    /// If you know a query is going to return many rows then you may want to experiment
1188    /// with applying early materialization to more (or all) columns.
1189    pub fn materialization_style(&mut self, style: MaterializationStyle) -> &mut Self {
1190        self.materialization_style = style;
1191        self
1192    }
1193
1194    /// Apply filters
1195    ///
1196    /// The filters can be presented as the string, as in WHERE clause in SQL.
1197    ///
1198    /// ```rust,ignore
1199    /// let dataset = Dataset::open(uri).await.unwrap();
1200    /// let stream = dataset.scan()
1201    ///     .project(&["col", "col2.subfield"]).unwrap()
1202    ///     .filter("a > 10 AND b < 200").unwrap()
1203    ///     .limit(10)
1204    ///     .into_stream();
1205    /// ```
1206    ///
1207    /// Once the filter is applied, Lance will create an optimized I/O plan for filtering.
1208    ///
1209    pub fn filter(&mut self, filter: &str) -> Result<&mut Self> {
1210        self.filter.expr_filter = Some(ExprFilter::Sql(filter.to_string()));
1211        Ok(self)
1212    }
1213
1214    /// Apply fts/vector query as filter.
1215    ///
1216    /// * Vector query filter can only be applied to full text search.
1217    /// * Fts query filter can only be applied to vector search.
1218    /// * Query filter couldn't be applied to normal query.
1219    ///
1220    /// ```rust,ignore
1221    /// let dataset = Dataset::open(uri).await.unwrap();
1222    /// let query_vector = Float32Array::from(vec![300f32, 300f32, 300f32, 300f32]);
1223    /// let stream = dataset.scan()
1224    ///     .nearest("vector", &query_vector, 5)
1225    ///     .project(&["col", "col2.subfield"]).unwrap()
1226    ///     .query_filter(QueryFilter::Fts(FullTextSearchQuery::new(
1227    ///       "hello".to_string(),
1228    ///     ))).unwrap()
1229    ///     .limit(10)
1230    ///     .into_stream();
1231    /// ```
1232    pub fn filter_query(&mut self, filter: QueryFilter) -> Result<&mut Self> {
1233        self.filter.query_filter = Some(filter);
1234        Ok(self)
1235    }
1236
1237    /// Filter by full text search
1238    /// The column must be a string column.
1239    /// The query is a string to search for.
1240    /// The search is case-insensitive, BM25 scoring is used.
1241    ///
1242    /// ```rust,ignore
1243    /// let dataset = Dataset::open(uri).await.unwrap();
1244    /// let stream = dataset.scan()
1245    ///    .project(&["col", "col2.subfield"]).unwrap()
1246    ///    .full_text_search("col", "query").unwrap()
1247    ///    .limit(10)
1248    ///    .into_stream();
1249    /// ```
1250    pub fn full_text_search(&mut self, query: FullTextSearchQuery) -> Result<&mut Self> {
1251        let fields = query.columns();
1252        if !fields.is_empty() {
1253            for field in fields.iter() {
1254                if self.dataset.schema().field(field).is_none() {
1255                    return Err(Error::invalid_input(format!("Column {} not found", field)));
1256                }
1257            }
1258        }
1259
1260        self.full_text_query = Some(query);
1261        Ok(self)
1262    }
1263
1264    /// Set a filter using a Substrait ExtendedExpression message
1265    ///
1266    /// The message must contain exactly one expression and that expression
1267    /// must be a scalar expression whose return type is boolean.
1268    pub fn filter_substrait(&mut self, filter: &[u8]) -> Result<&mut Self> {
1269        self.filter.expr_filter = Some(ExprFilter::Substrait(filter.to_vec()));
1270        Ok(self)
1271    }
1272
1273    pub fn filter_expr(&mut self, filter: Expr) -> &mut Self {
1274        self.filter.expr_filter = Some(ExprFilter::Datafusion(filter));
1275        self
1276    }
1277
1278    /// Set aggregation.
1279    ///
1280    /// The aggregate expression is parsed immediately using the dataset schema.
1281    /// For Substrait aggregates, this converts them to DataFusion expressions.
1282    pub fn aggregate(&mut self, aggregate: AggregateExpr) -> Result<&mut Self> {
1283        let schema: Arc<ArrowSchema> = Arc::new(self.dataset.schema().into());
1284        let parsed = aggregate.parse(schema)?;
1285        self.aggregate = Some(parsed);
1286        Ok(self)
1287    }
1288
1289    /// Set the maximum number of rows per batch.
1290    ///
1291    /// Note: this can be overridden by [`Self::batch_size_bytes`] or by a dataset-level
1292    /// `batch_size_bytes` set via [`ReadParams::file_reader_options`](crate::dataset::ReadParams::file_reader_options).  When a byte-based
1293    /// batch size is active, the row-based batch size is used only as an initial estimate.
1294    pub fn batch_size(&mut self, batch_size: usize) -> &mut Self {
1295        self.batch_size = Some(batch_size);
1296        self
1297    }
1298
1299    /// Set the target batch size in bytes.
1300    ///
1301    /// When set, the scanner will produce batches whose total size in bytes
1302    /// is approximately this value, overriding the row-based `batch_size`.
1303    ///
1304    /// This can also be configured at the dataset level via
1305    /// [`ReadParams::file_reader_options`](crate::dataset::ReadParams::file_reader_options).  A scanner-level setting takes
1306    /// precedence over the dataset-level default.
1307    pub fn batch_size_bytes(&mut self, batch_size_bytes: u64) -> &mut Self {
1308        self.batch_size_bytes = Some(batch_size_bytes);
1309        self
1310    }
1311
1312    /// Include deleted rows
1313    ///
1314    /// These are rows that have been deleted from the dataset but are still present in the
1315    /// underlying storage.  These rows will have the `_rowid` column set to NULL.  The other columns
1316    /// (include _rowaddr) will be set to their deleted values.
1317    ///
1318    /// This can be useful for generating aligned fragments or debugging
1319    ///
1320    /// Note: when entire fragments are deleted, the scanner will not emit any rows for that fragment
1321    /// since the fragment is no longer present in the dataset.
1322    pub fn include_deleted_rows(&mut self) -> &mut Self {
1323        self.include_deleted_rows = true;
1324        self
1325    }
1326
1327    /// Set the I/O buffer size
1328    ///
1329    /// This is the amount of RAM that will be reserved for holding I/O received from
1330    /// storage before it is processed.  This is used to control the amount of memory
1331    /// used by the scanner.  If the buffer is full then the scanner will block until
1332    /// the buffer is processed.
1333    ///
1334    /// Generally this should scale with the number of concurrent I/O threads.  The
1335    /// default is 2GiB which comfortably provides enough space for somewhere between
1336    /// 32 and 256 concurrent I/O threads.
1337    ///
1338    /// This value is not a hard cap on the amount of RAM the scanner will use.  Some
1339    /// space is used for the compute (which can be controlled by the batch size) and
1340    /// Lance does not keep track of memory after it is returned to the user.
1341    ///
1342    /// Currently, if there is a single batch of data which is larger than the io buffer
1343    /// size then the scanner will deadlock.  This is a known issue and will be fixed in
1344    /// a future release.
1345    pub fn io_buffer_size(&mut self, size: u64) -> &mut Self {
1346        self.io_buffer_size = Some(size);
1347        self
1348    }
1349
1350    /// Set the prefetch size.
1351    /// Ignored in v2 and newer format
1352    pub fn batch_readahead(&mut self, nbatches: usize) -> &mut Self {
1353        self.batch_readahead = nbatches;
1354        self
1355    }
1356
1357    /// Set the fragment readahead.
1358    ///
1359    /// This is only used if ``scan_in_order`` is set to false.
1360    pub fn fragment_readahead(&mut self, nfragments: usize) -> &mut Self {
1361        self.fragment_readahead = Some(nfragments);
1362        self
1363    }
1364
1365    /// Set whether to read data in order (default: true)
1366    ///
1367    /// A scan will always read from the disk concurrently.  If this property
1368    /// is true then a ready batch (a batch that has been read from disk) will
1369    /// only be returned if it is the next batch in the sequence.  Otherwise,
1370    /// the batch will be held until the stream catches up.  This means the
1371    /// sequence is returned in order but there may be slightly less parallelism.
1372    ///
1373    /// If this is false, then batches will be returned as soon as they are
1374    /// available, potentially increasing throughput slightly
1375    ///
1376    /// If an ordering is defined (using [Self::order_by]) then the scan will
1377    /// always scan in parallel and any value set here will be ignored.
1378    pub fn scan_in_order(&mut self, ordered: bool) -> &mut Self {
1379        self.ordered = ordered;
1380        self
1381    }
1382
1383    /// Set whether to use scalar index.
1384    ///
1385    /// By default, scalar indices will be used to optimize a query if available.
1386    /// However, in some corner cases, scalar indices may not be the best choice.
1387    /// This option allows users to disable scalar indices for a query.
1388    pub fn use_scalar_index(&mut self, use_scalar_index: bool) -> &mut Self {
1389        self.use_scalar_index = use_scalar_index;
1390        self
1391    }
1392
1393    /// Set whether to use strict batch size.
1394    ///
1395    /// If this is true then output batches (except the last batch) will have exactly `batch_size` rows.
1396    /// By default, this is False and output batches are allowed to have fewer than `batch_size` rows
1397    /// Setting this to True will require us to merge batches, incurring a data copy, for a minor performance
1398    /// penalty.
1399    pub fn strict_batch_size(&mut self, strict_batch_size: bool) -> &mut Self {
1400        self.strict_batch_size = strict_batch_size;
1401        self
1402    }
1403
1404    /// Set limit and offset.
1405    ///
1406    /// If offset is set, the first offset rows will be skipped. If limit is set,
1407    /// only the provided number of rows will be returned. These can be set
1408    /// independently. For example, setting offset to 10 and limit to None will
1409    /// skip the first 10 rows and return the rest of the rows in the dataset.
1410    pub fn limit(&mut self, limit: Option<i64>, offset: Option<i64>) -> Result<&mut Self> {
1411        if limit.unwrap_or_default() < 0 {
1412            return Err(Error::invalid_input(
1413                "Limit must be non-negative".to_string(),
1414            ));
1415        }
1416        if let Some(off) = offset
1417            && off < 0
1418        {
1419            return Err(Error::invalid_input(
1420                "Offset must be non-negative".to_string(),
1421            ));
1422        }
1423        self.limit = limit;
1424        self.offset = offset;
1425        Ok(self)
1426    }
1427
1428    /// Find k-nearest neighbor within the vector column.
1429    /// the query can be a Float16Array, Float32Array, Float64Array, UInt8Array,
1430    /// or a ListArray/FixedSizeListArray of the above types.
1431    pub fn nearest(&mut self, column: &str, q: &dyn Array, k: usize) -> Result<&mut Self> {
1432        if !self.prefilter {
1433            // We can allow fragment scan if the input to nearest is a prefilter.
1434            // The fragment scan will be performed by the prefilter.
1435            self.ensure_not_fragment_scan()?;
1436        }
1437
1438        if k == 0 {
1439            return Err(Error::invalid_input("k must be positive".to_string()));
1440        }
1441        if q.is_empty() {
1442            return Err(Error::invalid_input(
1443                "Query vector must have non-zero length".to_string(),
1444            ));
1445        }
1446        // make sure the field exists
1447        let (vector_type, element_type) = get_vector_type(self.dataset.schema(), column)?;
1448        let dim = get_vector_dim(self.dataset.schema(), column)?;
1449
1450        let q = match q.data_type() {
1451            DataType::List(_) | DataType::FixedSizeList(_, _) => {
1452                if !matches!(vector_type, DataType::List(_)) {
1453                    return Err(Error::invalid_input(format!(
1454                        "Query is multivector but column {}({})is not multivector",
1455                        column, vector_type,
1456                    )));
1457                }
1458
1459                if let Some(list_array) = q.as_list_opt::<i32>() {
1460                    for i in 0..list_array.len() {
1461                        let vec = list_array.value(i);
1462                        if vec.len() != dim {
1463                            return Err(Error::invalid_input(format!(
1464                                "query dim({}) doesn't match the column {} vector dim({})",
1465                                vec.len(),
1466                                column,
1467                                dim,
1468                            )));
1469                        }
1470                    }
1471                    list_array.values().clone()
1472                } else {
1473                    let fsl = q.as_fixed_size_list();
1474                    if fsl.value_length() as usize != dim {
1475                        return Err(Error::invalid_input(format!(
1476                            "query dim({}) doesn't match the column {} vector dim({})",
1477                            fsl.value_length(),
1478                            column,
1479                            dim,
1480                        )));
1481                    }
1482                    fsl.values().clone()
1483                }
1484            }
1485            _ => {
1486                if q.len() != dim {
1487                    return Err(Error::invalid_input(format!(
1488                        "query dim({}) doesn't match the column {} vector dim({})",
1489                        q.len(),
1490                        column,
1491                        dim,
1492                    )));
1493                }
1494                q.slice(0, q.len())
1495            }
1496        };
1497
1498        let key = match &element_type {
1499            dt if dt == q.data_type() => q,
1500            dt if dt.is_floating() => coerce_float_vector(
1501                q.as_any().downcast_ref::<Float32Array>().unwrap(),
1502                FloatType::try_from(dt)?,
1503            )?,
1504            _ => {
1505                return Err(Error::invalid_input(format!(
1506                    "Column {} has element type {} and the query vector is {}",
1507                    column,
1508                    element_type,
1509                    q.data_type(),
1510                )));
1511            }
1512        };
1513
1514        self.nearest = Some(Query {
1515            column: column.to_string(),
1516            key,
1517            k,
1518            lower_bound: None,
1519            upper_bound: None,
1520            minimum_nprobes: 1,
1521            maximum_nprobes: None,
1522            ef: None,
1523            refine_factor: None,
1524            metric_type: None,
1525            use_index: true,
1526            query_parallelism: DEFAULT_QUERY_PARALLELISM,
1527            dist_q_c: 0.0,
1528        });
1529        Ok(self)
1530    }
1531
1532    #[cfg(test)]
1533    fn nearest_mut(&mut self) -> Option<&mut Query> {
1534        self.nearest.as_mut()
1535    }
1536
1537    /// Set the distance thresholds for the nearest neighbor search.
1538    pub fn distance_range(
1539        &mut self,
1540        lower_bound: Option<f32>,
1541        upper_bound: Option<f32>,
1542    ) -> &mut Self {
1543        if let Some(q) = self.nearest.as_mut() {
1544            q.lower_bound = lower_bound;
1545            q.upper_bound = upper_bound;
1546        }
1547        self
1548    }
1549
1550    /// Configures how many partititions will be searched in the vector index.
1551    ///
1552    /// This method is a convenience method that sets both [Self::minimum_nprobes] and
1553    /// [Self::maximum_nprobes] to the same value.
1554    pub fn nprobes(&mut self, n: usize) -> &mut Self {
1555        if let Some(q) = self.nearest.as_mut() {
1556            q.minimum_nprobes = n;
1557            q.maximum_nprobes = Some(n);
1558        } else {
1559            log::warn!("nprobes is not set because nearest has not been called yet");
1560        }
1561        self
1562    }
1563
1564    /// Configures how many partititions will be searched in the vector index.
1565    ///
1566    /// This method is a convenience method that sets both [Self::minimum_nprobes] and
1567    /// [Self::maximum_nprobes] to the same value.
1568    #[deprecated(note = "Use nprobes instead")]
1569    pub fn nprobs(&mut self, n: usize) -> &mut Self {
1570        if let Some(q) = self.nearest.as_mut() {
1571            q.minimum_nprobes = n;
1572            q.maximum_nprobes = Some(n);
1573        } else {
1574            log::warn!("nprobes is not set because nearest has not been called yet");
1575        }
1576        self
1577    }
1578
1579    /// Configures the minimum number of partitions to search in the vector index.
1580    ///
1581    /// If we have found k matching results after searching this many partitions then
1582    /// the search will stop.  Increasing this number can increase recall but will increase
1583    /// latency on all queries.
1584    ///
1585    /// The default value is 1.
1586    pub fn minimum_nprobes(&mut self, n: usize) -> &mut Self {
1587        if let Some(q) = self.nearest.as_mut() {
1588            q.minimum_nprobes = n;
1589        } else {
1590            log::warn!("minimum_nprobes is not set because nearest has not been called yet");
1591        }
1592        self
1593    }
1594
1595    /// Configures the maximum number of partitions to search in the vector index.
1596    ///
1597    /// These partitions will only be searched if we have not found `k` results after
1598    /// searching the minimum number of partitions.  Setting this to None (the default)
1599    /// will search all partitions if needed.
1600    ///
1601    /// This setting only takes effect when a prefilter is in place.  In that case we
1602    /// can spend more effort to try and find results when the filter is highly selective.
1603    ///
1604    /// If there is no prefilter, or the results are not highly selective, this value will
1605    /// have no effect.
1606    pub fn maximum_nprobes(&mut self, n: usize) -> &mut Self {
1607        if let Some(q) = self.nearest.as_mut() {
1608            q.maximum_nprobes = Some(n);
1609        } else {
1610            log::warn!("maximum_nprobes is not set because nearest has not been called yet");
1611        }
1612        self
1613    }
1614
1615    pub fn ef(&mut self, ef: usize) -> &mut Self {
1616        if let Some(q) = self.nearest.as_mut() {
1617            q.ef = Some(ef);
1618        }
1619        self
1620    }
1621
1622    /// Only search the data being indexed.
1623    ///
1624    /// Default value is false.
1625    ///
1626    /// This is essentially a weak consistency search, only on the indexed data.
1627    pub fn fast_search(&mut self) -> &mut Self {
1628        if let Some(q) = self.nearest.as_mut() {
1629            q.use_index = true;
1630        }
1631        self.fast_search = true;
1632        self.projection_plan.include_row_id(); // fast search requires _rowid
1633        self
1634    }
1635
1636    /// Apply a refine step to the vector search.
1637    ///
1638    /// A refine improves query accuracy but also makes search slower, by reading extra elements
1639    /// and using the original vector values to re-rank the distances.
1640    ///
1641    /// * `factor` - the factor of extra elements to read.  For example, if factor is 2, then
1642    ///   the search will read 2x more elements than the requested k before performing
1643    ///   the re-ranking. Note: even if the factor is 1, the  results will still be
1644    ///   re-ranked without fetching additional elements.
1645    pub fn refine(&mut self, factor: u32) -> &mut Self {
1646        if let Some(q) = self.nearest.as_mut() {
1647            q.refine_factor = Some(factor)
1648        };
1649        self
1650    }
1651
1652    /// Change the distance [MetricType], i.e, L2 or Cosine distance.
1653    pub fn distance_metric(&mut self, metric_type: MetricType) -> &mut Self {
1654        if let Some(q) = self.nearest.as_mut() {
1655            q.metric_type = Some(metric_type)
1656        }
1657        self
1658    }
1659
1660    /// Sort the results of the scan by one or more columns
1661    ///
1662    /// If Some, then the resulting stream will be sorted according to the given ordering.
1663    /// This may increase the latency of the first result since all data must be read before
1664    /// the first batch can be returned.
1665    pub fn order_by(&mut self, ordering: Option<Vec<ColumnOrdering>>) -> Result<&mut Self> {
1666        if let Some(ordering) = &ordering {
1667            if ordering.is_empty() {
1668                self.ordering = None;
1669                return Ok(self);
1670            }
1671            // Verify early that the fields exist
1672            for column in ordering {
1673                self.dataset
1674                    .schema()
1675                    .field(&column.column_name)
1676                    .ok_or(Error::invalid_input(format!(
1677                        "Column {} not found",
1678                        &column.column_name
1679                    )))?;
1680            }
1681        }
1682        self.ordering = ordering;
1683        Ok(self)
1684    }
1685
1686    /// Set whether to use the index if available
1687    pub fn use_index(&mut self, use_index: bool) -> &mut Self {
1688        if let Some(q) = self.nearest.as_mut() {
1689            q.use_index = use_index
1690        }
1691        self
1692    }
1693
1694    /// Configure partition-search concurrency for each vector query.
1695    ///
1696    /// The default is 0.
1697    /// Value 0 selects the automatic policy; today this resolves to 1 for the
1698    /// sequential fast path unless an index implementation overrides it.
1699    /// Value -1 uses the CPU pool size.
1700    /// Value 1 uses the single-worker sequential partition search path.
1701    /// Values >= 2 use the partition-parallel path and are clamped to the CPU
1702    /// pool size by the execution layer.
1703    pub fn query_parallelism(&mut self, query_parallelism: i32) -> &mut Self {
1704        if let Some(q) = self.nearest.as_mut() {
1705            q.query_parallelism = query_parallelism;
1706        } else {
1707            log::warn!("query_parallelism is not set because nearest has not been called yet");
1708        }
1709        self
1710    }
1711
1712    /// Instruct the scanner to return the `_rowid` meta column from the dataset.
1713    pub fn with_row_id(&mut self) -> &mut Self {
1714        self.legacy_with_row_id = true;
1715        self.projection_plan.include_row_id();
1716        self
1717    }
1718
1719    /// Instruct the scanner to return the `_rowaddr` meta column from the dataset.
1720    pub fn with_row_address(&mut self) -> &mut Self {
1721        self.legacy_with_row_addr = true;
1722        self.projection_plan.include_row_addr();
1723        self
1724    }
1725
1726    /// Instruct the scanner to disable automatic projection of scoring columns
1727    ///
1728    /// In the future, this will be the default behavior.  This method is useful for
1729    /// opting in to the new behavior early to avoid breaking changes (and a warning
1730    /// message)
1731    ///
1732    /// Once the default switches, the old autoprojection behavior will be removed.
1733    ///
1734    /// The autoprojection behavior (current default) includes the _score or _distance
1735    /// column even if a projection is manually specified with `[project]` or
1736    /// `[project_with_transform]`.
1737    ///
1738    /// The new behavior will only include the _score or _distance column if no projection
1739    /// is specified or if the user explicitly includes the _score or _distance column
1740    /// in the projection.
1741    pub fn disable_scoring_autoprojection(&mut self) -> &mut Self {
1742        self.autoproject_scoring_columns = false;
1743        self
1744    }
1745
1746    /// Set the file reader options to use when reading data files.
1747    pub fn with_file_reader_options(&mut self, options: FileReaderOptions) -> &mut Self {
1748        self.file_reader_options = Some(options);
1749        self
1750    }
1751
1752    /// Compute the resolved file reader options, merging the scanner's explicit
1753    /// `file_reader_options`, the dataset-level defaults, and the `batch_size_bytes`
1754    /// setting.
1755    fn resolved_file_reader_options(&self) -> Option<FileReaderOptions> {
1756        let base = self
1757            .file_reader_options
1758            .clone()
1759            .or_else(|| self.dataset.file_reader_options.clone());
1760        match (base, self.batch_size_bytes) {
1761            (Some(mut opts), Some(bsb)) => {
1762                if opts.batch_size_bytes.is_none() {
1763                    opts.batch_size_bytes = Some(bsb);
1764                }
1765                Some(opts)
1766            }
1767            (Some(opts), None) => Some(opts),
1768            (None, Some(bsb)) => Some(FileReaderOptions {
1769                batch_size_bytes: Some(bsb),
1770                ..Default::default()
1771            }),
1772            (None, None) => None,
1773        }
1774    }
1775
1776    /// Create a physical expression for a column that may be nested
1777    fn create_column_expr(
1778        column_name: &str,
1779        dataset: &Dataset,
1780        arrow_schema: &ArrowSchema,
1781    ) -> Result<Arc<dyn PhysicalExpr>> {
1782        let lance_schema = dataset.schema();
1783        let field_path = lance_schema
1784            .resolve_case_insensitive(column_name)
1785            .ok_or_else(|| {
1786                Error::invalid_input(format!("Field '{}' not found in schema", column_name))
1787            })?;
1788
1789        if field_path.len() == 1 {
1790            // Simple top-level column
1791            expressions::col(&field_path[0].name, arrow_schema).map_err(|e| {
1792                Error::internal(format!(
1793                    "Failed to create column expression for '{}': {}",
1794                    column_name, e
1795                ))
1796            })
1797        } else {
1798            // Nested field - build a chain of GetFieldFunc calls
1799            let get_field_func = ScalarUDF::from(GetFieldFunc::default());
1800
1801            // Use Expr::Column with Column::new_unqualified to preserve exact case
1802            // (col() normalizes identifiers to lowercase)
1803            let mut expr = Expr::Column(datafusion::common::Column::new_unqualified(
1804                &field_path[0].name,
1805            ));
1806            for nested_field in &field_path[1..] {
1807                expr = get_field_func.call(vec![expr, lit(&nested_field.name)]);
1808            }
1809
1810            // Convert logical to physical expression
1811            let df_schema = Arc::new(DFSchema::try_from(arrow_schema.clone())?);
1812            let execution_props = ExecutionProps::new().with_query_execution_start_time(Utc::now());
1813            create_physical_expr(&expr, &df_schema, &execution_props).map_err(|e| {
1814                Error::internal(format!(
1815                    "Failed to create physical expression for nested field '{}': {}",
1816                    column_name, e
1817                ))
1818            })
1819        }
1820    }
1821
1822    /// Set whether to use statistics to optimize the scan (default: true)
1823    ///
1824    /// This is used for debugging or benchmarking purposes.
1825    pub fn use_stats(&mut self, use_stats: bool) -> &mut Self {
1826        self.use_stats = use_stats;
1827        self
1828    }
1829
1830    /// The Arrow schema of the output, including projections and vector / _distance
1831    pub async fn schema(&self) -> Result<SchemaRef> {
1832        let plan = self.create_plan().await?;
1833        Ok(plan.schema())
1834    }
1835
1836    /// Fetches the currently set expr filter
1837    ///
1838    /// Note that this forces the filter to be evaluated and the result will depend on
1839    /// the current state of the scanner (e.g. if with_row_id has been called then _rowid
1840    /// will be available for filtering but not otherwise) and so you may want to call this
1841    /// after setting all other options.
1842    pub fn get_expr_filter(&self) -> Result<Option<Expr>> {
1843        if let Some(filter) = &self.filter.expr_filter {
1844            let filter_schema = self.filterable_schema()?;
1845            Ok(Some(filter.to_datafusion(
1846                self.dataset.schema(),
1847                filter_schema.as_ref(),
1848            )?))
1849        } else {
1850            Ok(None)
1851        }
1852    }
1853
1854    fn add_extra_columns(&self, schema: Schema) -> Result<Schema> {
1855        let mut extra_columns = vec![ArrowField::new(ROW_OFFSET, DataType::UInt64, true)];
1856
1857        if self.nearest.as_ref().is_some() {
1858            extra_columns.push(ArrowField::new(DIST_COL, DataType::Float32, true));
1859        };
1860
1861        if self.full_text_query.is_some() {
1862            extra_columns.push(ArrowField::new(SCORE_COL, DataType::Float32, true));
1863        }
1864
1865        schema.merge(&ArrowSchema::new(extra_columns))
1866    }
1867
1868    /// The full schema available to filters
1869    ///
1870    /// This is the schema of the dataset, any metadata columns like _rowid or _rowaddr
1871    /// and any extra columns like _distance or _score
1872    fn filterable_schema(&self) -> Result<Arc<Schema>> {
1873        let base_schema = Projection::full(self.dataset.clone())
1874            .with_row_id()
1875            .with_row_addr()
1876            .with_row_last_updated_at_version()
1877            .with_row_created_at_version()
1878            .to_schema();
1879
1880        Ok(Arc::new(self.add_extra_columns(base_schema)?))
1881    }
1882
1883    /// This takes the current output, and the user's requested projection, and calculates the
1884    /// final projection expression.
1885    ///
1886    /// This final expression may reorder columns, drop columns, or calculate new columns
1887    pub(crate) fn calculate_final_projection(
1888        &self,
1889        current_schema: &ArrowSchema,
1890    ) -> Result<Vec<(Arc<dyn PhysicalExpr>, String)>> {
1891        // Select the columns from the output schema based on the user's projection (or the list
1892        // of all available columns if the user did not specify a projection)
1893        let mut output_expr = self.projection_plan.to_physical_exprs(current_schema)?;
1894
1895        // Make sure _distance and _score are _always_ in the output unless user has opted out of the legacy
1896        // projection behavior
1897        if self.autoproject_scoring_columns {
1898            if self.nearest.is_some() && output_expr.iter().all(|(_, name)| name != DIST_COL) {
1899                if self.explicit_projection {
1900                    log::warn!(
1901                        "Deprecation warning, this behavior will change in the future. This search specified output columns but did not include `_distance`.  Currently the `_distance` column will be included.  In the future it will not.  Call `disable_scoring_autoprojection` to adopt the future behavior and avoid this warning"
1902                    );
1903                }
1904                let vector_expr = expressions::col(DIST_COL, current_schema)?;
1905                output_expr.push((vector_expr, DIST_COL.to_string()));
1906            }
1907            if self.full_text_query.is_some()
1908                && output_expr.iter().all(|(_, name)| name != SCORE_COL)
1909            {
1910                if self.explicit_projection {
1911                    log::warn!(
1912                        "Deprecation warning, this behavior will change in the future. This search specified output columns but did not include `_score`.  Currently the `_score` column will be included.  In the future it will not.  Call `disable_scoring_autoprojection` to adopt the future behavior and avoid this warning"
1913                    );
1914                }
1915                let score_expr = expressions::col(SCORE_COL, current_schema)?;
1916                output_expr.push((score_expr, SCORE_COL.to_string()));
1917            }
1918        }
1919
1920        if self.legacy_with_row_id {
1921            let row_id_pos = output_expr
1922                .iter()
1923                .position(|(_, name)| name == ROW_ID)
1924                .ok_or_else(|| {
1925                    Error::internal(
1926                        "user specified with_row_id but the _rowid column was not in the output"
1927                            .to_string(),
1928                    )
1929                })?;
1930            if row_id_pos != output_expr.len() - 1 {
1931                // Row id is not last column.  Need to rotate it to the last spot.
1932                let row_id_expr = output_expr.remove(row_id_pos);
1933                output_expr.push(row_id_expr);
1934            }
1935        }
1936
1937        if self.legacy_with_row_addr {
1938            let row_addr_pos = output_expr.iter().position(|(_, name)| name == ROW_ADDR).ok_or_else(|| {
1939                Error::internal("user specified with_row_address but the _rowaddr column was not in the output".to_string())
1940            })?;
1941            if row_addr_pos != output_expr.len() - 1 {
1942                // Row addr is not last column.  Need to rotate it to the last spot.
1943                let row_addr_expr = output_expr.remove(row_addr_pos);
1944                output_expr.push(row_addr_expr);
1945            }
1946        }
1947
1948        Ok(output_expr)
1949    }
1950
1951    /// Create a stream from the Scanner.
1952    #[instrument(skip_all)]
1953    pub fn try_into_stream(&self) -> BoxFuture<'_, Result<DatasetRecordBatchStream>> {
1954        // Future intentionally boxed here to avoid large futures on the stack
1955        async move {
1956            let plan = self.create_plan().await?;
1957
1958            Ok(DatasetRecordBatchStream::new(execute_plan(
1959                plan,
1960                LanceExecutionOptions {
1961                    batch_size: self.batch_size,
1962                    execution_stats_callback: self.scan_stats_callback.clone(),
1963                    ..Default::default()
1964                },
1965            )?))
1966        }
1967        .boxed()
1968    }
1969
1970    pub(crate) async fn try_into_dfstream(
1971        &self,
1972        mut options: LanceExecutionOptions,
1973    ) -> Result<SendableRecordBatchStream> {
1974        let plan = self.create_plan().await?;
1975
1976        // Use the scan stats callback if the user didn't set an execution stats callback
1977        if options.execution_stats_callback.is_none() {
1978            options.execution_stats_callback = self.scan_stats_callback.clone();
1979        }
1980
1981        execute_plan(plan, options)
1982    }
1983
1984    pub(crate) fn execution_options(&self) -> LanceExecutionOptions {
1985        LanceExecutionOptions {
1986            batch_size: self.batch_size,
1987            execution_stats_callback: self.scan_stats_callback.clone(),
1988            ..Default::default()
1989        }
1990    }
1991
1992    pub async fn try_into_batch(&self) -> Result<RecordBatch> {
1993        let stream = self.try_into_stream().await?;
1994        let schema = stream.schema();
1995        let batches = stream.try_collect::<Vec<_>>().await?;
1996        Ok(concat_batches(&schema, &batches)?)
1997    }
1998
1999    /// Scan and return the number of matching rows
2000    ///
2001    /// Note: calling [`Dataset::count_rows`] can be more efficient than calling this method
2002    /// especially if there is no filter.
2003    #[instrument(skip_all)]
2004    pub fn count_rows(&self) -> BoxFuture<'_, Result<u64>> {
2005        // Future intentionally boxed here to avoid large futures on the stack
2006        async move {
2007            let mut scanner = self.clone();
2008            scanner.aggregate(AggregateExpr::builder().count_star().build())?;
2009
2010            let plan = scanner.create_plan().await?;
2011            let mut stream = execute_plan(plan, LanceExecutionOptions::default())?;
2012
2013            // A count plan will always return a single batch with a single row.
2014            if let Some(first_batch) = stream.next().await {
2015                let batch = first_batch?;
2016                let array = batch
2017                    .column(0)
2018                    .as_any()
2019                    .downcast_ref::<Int64Array>()
2020                    .ok_or(Error::invalid_input(
2021                        "Count plan did not return an Int64Array".to_string(),
2022                    ))?;
2023                Ok(array.value(0) as u64)
2024            } else {
2025                Ok(0)
2026            }
2027        }
2028        .boxed()
2029    }
2030
2031    /// Create an execution plan with aggregation.
2032    ///
2033    /// Requires `aggregate()` to be called first.
2034    #[deprecated(note = "Use create_plan() instead, which now applies aggregate automatically")]
2035    pub fn create_aggregate_plan(&self) -> BoxFuture<'_, Result<Arc<dyn ExecutionPlan>>> {
2036        async move {
2037            if self.aggregate.is_none() {
2038                return Err(Error::invalid_input(
2039                    "create_aggregate_plan called but no aggregate was set",
2040                ));
2041            }
2042            // create_plan() now applies aggregate automatically when set
2043            self.create_plan().await
2044        }
2045        .boxed()
2046    }
2047
2048    async fn apply_aggregate(
2049        &self,
2050        plan: Arc<dyn ExecutionPlan>,
2051        agg: &Aggregate,
2052    ) -> Result<Arc<dyn ExecutionPlan>> {
2053        use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
2054
2055        let schema = plan.schema();
2056        let df_schema = DFSchema::try_from(schema.as_ref().clone())?;
2057
2058        let group_exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = agg
2059            .group_by
2060            .iter()
2061            .map(|expr| {
2062                let name = expr.schema_name().to_string();
2063                let physical_expr =
2064                    create_physical_expr(expr, &df_schema, &ExecutionProps::default())?;
2065                Ok((physical_expr, name))
2066            })
2067            .collect::<Result<_>>()?;
2068
2069        #[allow(clippy::type_complexity)]
2070        let aggr_results: Vec<(Arc<AggregateFunctionExpr>, Option<Arc<dyn PhysicalExpr>>)> = agg
2071            .aggregates
2072            .iter()
2073            .map(|expr| self.build_physical_aggregate_expr(expr, &df_schema, &schema))
2074            .collect::<Result<_>>()?;
2075
2076        let (aggr_exprs, filters): (Vec<_>, Vec<_>) = aggr_results.into_iter().unzip();
2077
2078        Ok(Arc::new(AggregateExec::try_new(
2079            AggregateMode::Single,
2080            PhysicalGroupBy::new_single(group_exprs),
2081            aggr_exprs,
2082            filters,
2083            plan,
2084            schema,
2085        )?) as Arc<dyn ExecutionPlan>)
2086    }
2087
2088    #[allow(clippy::type_complexity)]
2089    fn build_physical_aggregate_expr(
2090        &self,
2091        expr: &Expr,
2092        df_schema: &DFSchema,
2093        input_schema: &SchemaRef,
2094    ) -> Result<(
2095        Arc<datafusion_physical_expr::aggregate::AggregateFunctionExpr>,
2096        Option<Arc<dyn PhysicalExpr>>,
2097    )> {
2098        use datafusion::physical_planner::create_aggregate_expr_and_maybe_filter;
2099
2100        let coerced_expr = self.coerce_aggregate_expr(expr, df_schema)?;
2101
2102        // Note: order_by is already embedded in the AggregateFunctionExpr for ordered aggregates
2103        let (agg_expr, filter, _order_by) = create_aggregate_expr_and_maybe_filter(
2104            &coerced_expr,
2105            df_schema,
2106            input_schema.as_ref(),
2107            &ExecutionProps::default(),
2108        )?;
2109
2110        Ok((agg_expr, filter))
2111    }
2112
2113    /// Apply type coercion to aggregate arguments for UserDefined signature functions.
2114    ///
2115    /// Most aggregate functions (SUM, COUNT, MIN, MAX) have explicit type signatures that
2116    /// DataFusion handles automatically. However, some functions like AVG use UserDefined
2117    /// type signatures in the Substrait consumer, which means DataFusion doesn't know the
2118    /// expected input types and won't perform automatic coercion. We must explicitly coerce
2119    /// arguments to the types returned by `func.coerce_types()`.
2120    fn coerce_aggregate_expr(&self, expr: &Expr, schema: &DFSchema) -> Result<Expr> {
2121        Self::coerce_aggregate_expr_impl(expr, schema)
2122    }
2123
2124    fn coerce_aggregate_expr_impl(expr: &Expr, schema: &DFSchema) -> Result<Expr> {
2125        use datafusion::logical_expr::Expr;
2126        use datafusion::logical_expr::expr::AggregateFunction;
2127        use datafusion::logical_expr::type_coercion::functions::fields_with_udf;
2128
2129        match expr {
2130            Expr::AggregateFunction(agg_func) => {
2131                let func = &agg_func.func;
2132                let args = &agg_func.params.args;
2133
2134                if args.is_empty() {
2135                    return Ok(expr.clone());
2136                }
2137
2138                let current_fields: Vec<arrow_schema::FieldRef> = args
2139                    .iter()
2140                    .enumerate()
2141                    .map(|(i, e)| {
2142                        let dt = e.get_type(schema)?;
2143                        Ok(Arc::new(arrow_schema::Field::new(
2144                            format!("arg_{i}"),
2145                            dt,
2146                            true,
2147                        )))
2148                    })
2149                    .collect::<std::result::Result<_, datafusion::common::DataFusionError>>()?;
2150
2151                let coerced_fields = fields_with_udf(&current_fields, func.as_ref())?;
2152                let coerced_args: Vec<Expr> = args
2153                    .iter()
2154                    .zip(coerced_fields.iter())
2155                    .map(|(arg, target_field)| {
2156                        let arg_type = arg.get_type(schema)?;
2157                        let target_type = target_field.data_type();
2158                        if arg_type == *target_type {
2159                            Ok(arg.clone())
2160                        } else {
2161                            arg.clone().cast_to(target_type, schema)
2162                        }
2163                    })
2164                    .collect::<std::result::Result<_, _>>()?;
2165
2166                Ok(Expr::AggregateFunction(AggregateFunction::new_udf(
2167                    func.clone(),
2168                    coerced_args,
2169                    agg_func.params.distinct,
2170                    agg_func.params.filter.clone(),
2171                    agg_func.params.order_by.clone(),
2172                    agg_func.params.null_treatment,
2173                )))
2174            }
2175            Expr::Alias(alias) => {
2176                // Recursively coerce the inner expression and preserve the alias
2177                let coerced_inner = Self::coerce_aggregate_expr_impl(&alias.expr, schema)?;
2178                Ok(coerced_inner.alias(&alias.name))
2179            }
2180            other => Err(Error::invalid_input(format!(
2181                "Expected aggregate function expression, got {:?}",
2182                other.variant_name()
2183            ))),
2184        }
2185    }
2186
2187    // A "narrow" field is a field that is so small that we are better off reading the
2188    // entire column and filtering in memory rather than "take"ing the column.
2189    //
2190    // The exact threshold depends on a two factors:
2191    // 1. The number of rows returned by the filter
2192    // 2. The number of rows in the dataset
2193    // 3. The IOPS/bandwidth ratio of the storage system
2194    // 4. The size of each value in the column
2195    //
2196    // We don't (today) have a good way of knowing #1 or #4.  #2 is easy to know.  We can
2197    // combine 1 & 2 into "percentage of rows returned" but since we don't know #1 it
2198    // doesn't really help.  #3 is complex but as a rule of thumb we can use:
2199    //
2200    //   Local storage: 1 IOP for ever ten thousand bytes
2201    //   Cloud storage: 1 IOP for every million bytes
2202    //
2203    // Our current heuristic today is to assume a filter will return 0.1% of the rows in the dataset.
2204    //
2205    // This means, for cloud storage, a field is "narrow" if there are 1KB of data per row and
2206    // for local disk a field is "narrow" if there are 10 bytes of data per row.
2207    fn is_early_field(&self, field: &Field) -> bool {
2208        match self.materialization_style {
2209            MaterializationStyle::AllEarly => true,
2210            MaterializationStyle::AllLate => false,
2211            MaterializationStyle::AllEarlyExcept(ref cols) => !cols.contains(&(field.id as u32)),
2212            MaterializationStyle::Heuristic => {
2213                if field.is_blob() {
2214                    // By default, blobs are loaded as descriptions, and so should be early
2215                    //
2216                    // TODO: Once we make blob handling configurable, we should use the blob
2217                    // handling setting here.
2218                    return true;
2219                }
2220
2221                let byte_width = field.data_type().byte_width_opt();
2222                let is_cloud = self.dataset.object_store.as_ref().is_cloud();
2223                if is_cloud {
2224                    byte_width.is_some_and(|bw| bw < 1000)
2225                } else {
2226                    byte_width.is_some_and(|bw| bw < 10)
2227                }
2228            }
2229        }
2230    }
2231
2232    // If we are going to filter on `filter_plan`, then which columns are so small it is
2233    // cheaper to read the entire column and filter in memory.
2234    //
2235    // Note: only add columns that we actually need to read
2236    fn calc_eager_projection(
2237        &self,
2238        filter_plan: &ExprFilterPlan,
2239        desired_projection: &Projection,
2240    ) -> Result<Projection> {
2241        // Note: We use all_columns and not refine_columns here.  If a column is covered by an index but
2242        // the user has requested it, then we do not use it for late materialization.
2243        //
2244        // Either that column is covered by an exact filter (e.g. string with bitmap/btree) and there is no
2245        // need for late materialization or that column is covered by an inexact filter (e.g. ngram) in which
2246        // case we are going to load the column anyways for the recheck.
2247        let filter_columns = filter_plan.all_columns();
2248
2249        let filter_schema = self
2250            .dataset
2251            .empty_projection()
2252            .union_columns(filter_columns, OnMissing::Error)?
2253            .into_schema();
2254
2255        // Start with the desired fields
2256        Ok(desired_projection
2257            .clone()
2258            // Subtract columns that are expensive
2259            .subtract_predicate(|f| !self.is_early_field(f))
2260            // Add back columns that we need for filtering
2261            .union_schema(&filter_schema))
2262    }
2263
2264    fn validate_options(&self) -> Result<()> {
2265        if self.include_deleted_rows && !self.projection_plan.physical_projection.with_row_id {
2266            return Err(Error::invalid_input_source(
2267                "include_deleted_rows is set but with_row_id is false".into(),
2268            ));
2269        }
2270
2271        if self.aggregate.is_some() {
2272            if self.limit.is_some() || self.offset.is_some() {
2273                return Err(Error::invalid_input_source(
2274                    "Cannot use limit/offset with aggregate. Apply limit to the result instead."
2275                        .into(),
2276                ));
2277            }
2278            if self.ordering.is_some() {
2279                return Err(Error::invalid_input_source(
2280                    "Cannot use order_by with aggregate. Apply ordering to the result instead."
2281                        .into(),
2282                ));
2283            }
2284        }
2285
2286        if self.index_segments.is_some() && self.nearest.is_none() {
2287            return Err(Error::not_supported(
2288                "with_index_segments is only supported for vector search".to_string(),
2289            ));
2290        }
2291
2292        Ok(())
2293    }
2294
2295    async fn create_filter_plan(&self, use_scalar_index: bool) -> Result<FilterPlan> {
2296        let filter_schema = self.filterable_schema()?;
2297        let planner = Planner::new(Arc::new(filter_schema.as_ref().into()));
2298
2299        // Check expr filter
2300        let filter_plan = if let Some(filter) = self.filter.expr_filter.as_ref() {
2301            let expr = filter.to_datafusion(self.dataset.schema(), filter_schema.as_ref())?;
2302            let index_info = self.dataset.scalar_index_info().await?;
2303            let filter_plan =
2304                planner.create_filter_plan(expr.clone(), &index_info, use_scalar_index)?;
2305
2306            // This tests if any of the fragments are missing the physical_rows property (old style)
2307            // If they are then we cannot use scalar indices
2308            if filter_plan.index_query.is_some() {
2309                let fragments = if let Some(fragments) = self.fragments.as_ref() {
2310                    fragments
2311                } else {
2312                    self.dataset.fragments()
2313                };
2314                let mut has_missing_row_count = false;
2315                for frag in fragments {
2316                    if frag.physical_rows.is_none() {
2317                        has_missing_row_count = true;
2318                        break;
2319                    }
2320                }
2321                if has_missing_row_count {
2322                    // We need row counts to use scalar indices.  If we don't have them then
2323                    // fallback to a non-indexed filter
2324                    let filter_plan =
2325                        planner.create_filter_plan(expr.clone(), &index_info, false)?;
2326                    FilterPlan::new(self.filter.query_filter.clone(), filter_plan)
2327                } else {
2328                    FilterPlan::new(self.filter.query_filter.clone(), filter_plan)
2329                }
2330            } else {
2331                FilterPlan::new(self.filter.query_filter.clone(), filter_plan)
2332            }
2333        } else {
2334            FilterPlan::new(self.filter.query_filter.clone(), ExprFilterPlan::default())
2335        };
2336
2337        // Check query filter
2338        if filter_plan.query_filter.is_some()
2339            && self.nearest.is_none()
2340            && self.full_text_query.is_none()
2341        {
2342            return Err(Error::invalid_input_source(
2343                "Query filter can only be used with full text search or vector search".into(),
2344            ));
2345        }
2346        if self.nearest.is_some() && filter_plan.vector_filter().is_some() {
2347            return Err(Error::invalid_input_source(
2348                "Query filter can't be used with vector search".into(),
2349            ));
2350        }
2351        if self.full_text_query.is_some() && filter_plan.fts_filter().is_some() {
2352            return Err(Error::invalid_input_source(
2353                "Fts filter can't be used with fts search".into(),
2354            ));
2355        }
2356
2357        Ok(filter_plan)
2358    }
2359
2360    async fn get_scan_range(&self, filter_plan: &ExprFilterPlan) -> Result<Option<Range<u64>>> {
2361        if filter_plan.has_any_filter() {
2362            // If there is a filter we can't pushdown limit / offset
2363            Ok(None)
2364        } else if self.ordering.is_some() {
2365            // If there is ordering, we can't pushdown limit / offset
2366            // because we need to sort all data first before applying the limit
2367            Ok(None)
2368        } else if self.dataset.manifest.uses_stable_row_ids() {
2369            // Stable-row-id datasets can contain deleted / rewritten rows that still occupy
2370            // physical positions in older fragments while the live replacement rows are appended
2371            // to new fragments. `scan_range_before_filter` is a logical offset over visible rows,
2372            // but filtered-read planning trims fragments before the stable-row-id/deletion-aware
2373            // remapping is finished. Pushing limit / offset down here can spend the range on
2374            // tombstoned positions and skip still-live rows in later fragments.
2375            Ok(None)
2376        } else {
2377            match (self.limit, self.offset) {
2378                (None, None) => Ok(None),
2379                (Some(limit), None) => {
2380                    let num_rows = self.dataset.count_all_rows().await? as i64;
2381                    Ok(Some(0..limit.min(num_rows) as u64))
2382                }
2383                (None, Some(offset)) => {
2384                    let num_rows = self.dataset.count_all_rows().await? as i64;
2385                    Ok(Some(offset.min(num_rows) as u64..num_rows as u64))
2386                }
2387                (Some(limit), Some(offset)) => {
2388                    let num_rows = self.dataset.count_all_rows().await? as i64;
2389                    Ok(Some(
2390                        offset.min(num_rows) as u64..(offset + limit).min(num_rows) as u64,
2391                    ))
2392                }
2393            }
2394        }
2395    }
2396
2397    /// Create [`ExecutionPlan`] for Scan.
2398    ///
2399    /// An ExecutionPlan is a graph of operators that can be executed.
2400    ///
2401    /// The following plans are supported:
2402    ///
2403    ///  - **Plain scan without filter or limits.**
2404    ///
2405    ///  ```ignore
2406    ///  Scan(projections)
2407    ///  ```
2408    ///
2409    ///  - **Scan with filter and/or limits.**
2410    ///
2411    ///  ```ignore
2412    ///  Scan(filtered_cols) -> Filter(expr)
2413    ///     -> (*LimitExec(limit, offset))
2414    ///     -> Take(remaining_cols) -> Projection()
2415    ///  ```
2416    ///
2417    ///  - **Use KNN Index (with filter and/or limits)**
2418    ///
2419    /// ```ignore
2420    /// KNNIndex() -> Take(vector) -> FlatRefine()
2421    ///     -> Take(filtered_cols) -> Filter(expr)
2422    ///     -> (*LimitExec(limit, offset))
2423    ///     -> Take(remaining_cols) -> Projection()
2424    /// ```
2425    ///
2426    /// - **Use KNN flat (brute force) with filter and/or limits**
2427    ///
2428    /// ```ignore
2429    /// Scan(vector) -> FlatKNN()
2430    ///     -> Take(filtered_cols) -> Filter(expr)
2431    ///     -> (*LimitExec(limit, offset))
2432    ///     -> Take(remaining_cols) -> Projection()
2433    /// ```
2434    ///
2435    /// In general, a plan has 5 stages:
2436    ///
2437    /// 1. Source (from dataset Scan or from index, may include prefilter)
2438    /// 2. Filter
2439    /// 3. Sort
2440    /// 4. Limit / Offset
2441    /// 5. Take remaining columns / Projection
2442    #[instrument(level = "debug", skip_all)]
2443    pub async fn create_plan(&self) -> Result<Arc<dyn ExecutionPlan>> {
2444        log::trace!("creating scanner plan");
2445        self.validate_options()?;
2446
2447        // Scalar indices are only used when prefiltering
2448        let use_scalar_index = self.use_scalar_index && (self.prefilter || self.nearest.is_none());
2449        let mut filter_plan = self.create_filter_plan(use_scalar_index).await?;
2450
2451        let mut use_limit_node = true;
2452        // Source: either a (K|A)NN search, full text search, or a (full|indexed) scan
2453        let mut plan: Arc<dyn ExecutionPlan> = match (&self.nearest, &self.full_text_query) {
2454            (Some(_), None) => self.vector_search_source(&mut filter_plan).await?,
2455            (None, Some(query)) => self.fts_search_source(&mut filter_plan, query).await?,
2456            (None, None) => {
2457                if self.projection_plan.has_output_cols()
2458                    && self.projection_plan.physical_projection.is_empty()
2459                {
2460                    // This means the user is doing something like `SELECT 1 AS foo`.  We don't support this and
2461                    // I'm not sure we should.  Users should use a full SQL API to do something like this.
2462                    //
2463                    // It's also possible we get here from `SELECT does_not_exist`
2464
2465                    // Note: even though we are just going to return an error we still want to calculate the
2466                    // final projection here.  This lets us distinguish between a user doing something like:
2467                    //
2468                    // SELECT 1 FROM t (not supported error)
2469                    // SELECT non_existent_column FROM t (column not found error)
2470                    let output_expr = self.calculate_final_projection(&ArrowSchema::empty())?;
2471                    return Err(Error::not_supported_source(format!("Scans must request at least one column.  Received only dynamic expressions: {:?}", output_expr).into()));
2472                }
2473
2474                let take_op = filter_plan
2475                    .expr_filter_plan
2476                    .full_expr
2477                    .as_ref()
2478                    .and_then(TakeOperation::try_from_expr);
2479                if let Some((take_op, remainder)) = take_op {
2480                    // If there is any remainder use it as the filter (we don't even try and combine an indexed
2481                    // search on the filter with a take as that seems excessive)
2482                    filter_plan.expr_filter_plan = remainder
2483                        .map(ExprFilterPlan::new_refine_only)
2484                        .unwrap_or(ExprFilterPlan::default());
2485                    self.take_source(take_op).await?
2486                } else {
2487                    let planned_read = self
2488                        .filtered_read_source(&mut filter_plan.expr_filter_plan)
2489                        .await?;
2490                    if planned_read.limit_pushed_down {
2491                        use_limit_node = false;
2492                    }
2493                    if planned_read.filter_pushed_down {
2494                        filter_plan.disable_refine();
2495                    }
2496                    planned_read.plan
2497                }
2498            }
2499            _ => {
2500                return Err(Error::invalid_input_source(
2501                    "Cannot have both nearest and full text search".into(),
2502                ));
2503            }
2504        };
2505
2506        // Load columns needed for filter and ordering
2507        let mut pre_filter_projection = self.dataset.empty_projection();
2508
2509        // We may need to take filter columns if we are going to refine
2510        // an indexed scan.
2511        if filter_plan.has_refine() {
2512            // It's ok for some filter columns to be missing (e.g. _rowid)
2513            pre_filter_projection = pre_filter_projection.union_columns(
2514                filter_plan.refine_columns(&self.dataset).await?,
2515                OnMissing::Ignore,
2516            )?;
2517        }
2518
2519        // TODO: Does it always make sense to take the ordering columns here?  If there is a filter then
2520        // maybe we wait until after the filter to take the ordering columns?  Maybe it would be better to
2521        // grab the ordering column in the initial scan (if it is eager) and if it isn't then we should
2522        // take it after the filtering phase, if any (we already have a take there).
2523        if let Some(ordering) = &self.ordering {
2524            pre_filter_projection = pre_filter_projection.union_columns(
2525                ordering.iter().map(|col| &col.column_name),
2526                OnMissing::Error,
2527            )?;
2528        }
2529
2530        plan = self.take(plan, pre_filter_projection)?;
2531
2532        // Filter
2533        plan = filter_plan.refine_filter(plan, self).await?;
2534
2535        // Aggregate (if set, applies aggregate and returns early)
2536        if let Some(agg) = &self.aggregate {
2537            // Take only columns needed by the aggregate, not the full projection.
2538            // For COUNT(*), this is empty. For SUM(x), this is just [x].
2539            let required_columns = agg.required_columns();
2540            let agg_projection = if required_columns.is_empty() {
2541                self.dataset.empty_projection()
2542            } else {
2543                self.dataset
2544                    .empty_projection()
2545                    .union_columns(&required_columns, OnMissing::Error)?
2546            };
2547            plan = self.take(plan, agg_projection)?;
2548            plan = self.apply_aggregate(plan, agg).await?;
2549
2550            let optimizer = get_physical_optimizer();
2551            let options = Default::default();
2552            for rule in optimizer.rules {
2553                plan = rule.optimize(plan, &options)?;
2554            }
2555
2556            return Ok(plan);
2557        }
2558
2559        // Sort
2560        if let Some(ordering) = &self.ordering {
2561            let ordering_columns = ordering.iter().map(|col| &col.column_name);
2562            let projection_with_ordering = self
2563                .dataset
2564                .empty_projection()
2565                .union_columns(ordering_columns, OnMissing::Error)?;
2566            // We haven't loaded the sort column yet so take it now
2567            plan = self.take(plan, projection_with_ordering)?;
2568            let col_exprs = ordering
2569                .iter()
2570                .map(|col| {
2571                    Ok(PhysicalSortExpr {
2572                        expr: Self::create_column_expr(
2573                            &col.column_name,
2574                            &self.dataset,
2575                            plan.schema().as_ref(),
2576                        )?,
2577                        options: SortOptions {
2578                            descending: !col.ascending,
2579                            nulls_first: col.nulls_first,
2580                        },
2581                    })
2582                })
2583                .collect::<Result<Vec<_>>>()?;
2584            plan = Arc::new(SortExec::new(
2585                LexOrdering::new(col_exprs)
2586                    .ok_or(exec_datafusion_err!("Unexpected empty sort expressions"))?,
2587                plan,
2588            ));
2589        }
2590
2591        // Limit / offset
2592        if use_limit_node && (self.limit.unwrap_or(0) > 0 || self.offset.is_some()) {
2593            plan = self.limit_node(plan);
2594        }
2595
2596        // Take remaining columns required for projection
2597        plan = self.take(plan, self.projection_plan.physical_projection.clone())?;
2598
2599        // Add system columns, if requested
2600        if self.projection_plan.must_add_row_offset {
2601            plan = Arc::new(AddRowOffsetExec::try_new(plan, self.dataset.clone()).await?);
2602        }
2603
2604        // Final projection
2605        let final_projection = self.calculate_final_projection(plan.schema().as_ref())?;
2606
2607        plan = Arc::new(DFProjectionExec::try_new(final_projection, plan)?);
2608
2609        // If requested, apply a strict batch size to the final output
2610        if self.strict_batch_size {
2611            plan = Arc::new(StrictBatchSizeExec::new(plan, self.get_batch_size()));
2612        }
2613
2614        let optimizer = get_physical_optimizer();
2615        let options: ConfigOptions = Default::default();
2616        for rule in optimizer.rules {
2617            plan = rule.optimize(plan, &options)?;
2618        }
2619
2620        Ok(plan)
2621    }
2622
2623    // Check if a filter plan references version columns
2624    fn filter_references_version_columns(&self, filter_plan: &ExprFilterPlan) -> bool {
2625        use lance_core::{ROW_CREATED_AT_VERSION, ROW_LAST_UPDATED_AT_VERSION};
2626
2627        if let Some(refine_expr) = &filter_plan.refine_expr {
2628            let column_names = Planner::column_names_in_expr(refine_expr);
2629            for col_name in column_names {
2630                if col_name == ROW_CREATED_AT_VERSION || col_name == ROW_LAST_UPDATED_AT_VERSION {
2631                    return true;
2632                }
2633            }
2634        }
2635        false
2636    }
2637
2638    // Helper function for filtered_read
2639    //
2640    // Do not call this directly, use filtered_read instead
2641    //
2642    // First return value is the plan, second is whether the limit was pushed down
2643    async fn legacy_filtered_read(
2644        &self,
2645        filter_plan: &ExprFilterPlan,
2646        projection: Projection,
2647        make_deletions_null: bool,
2648        fragments: Option<Arc<Vec<Fragment>>>,
2649        scan_range: Option<Range<u64>>,
2650        is_prefilter: bool,
2651    ) -> Result<PlannedFilteredScan> {
2652        let fragments = fragments.unwrap_or(self.dataset.fragments().clone());
2653        let mut filter_pushed_down = false;
2654
2655        let plan: Arc<dyn ExecutionPlan> = if filter_plan.has_index_query() {
2656            if self.include_deleted_rows {
2657                return Err(Error::invalid_input_source(
2658                    "Cannot include deleted rows in a scalar indexed scan".into(),
2659                ));
2660            }
2661            self.scalar_indexed_scan(projection, filter_plan, fragments)
2662                .await
2663        } else if !is_prefilter
2664            && filter_plan.has_refine()
2665            && self.batch_size.is_none()
2666            && self.use_stats
2667            && !self.filter_references_version_columns(filter_plan)
2668        {
2669            filter_pushed_down = true;
2670            self.pushdown_scan(false, filter_plan)
2671        } else {
2672            let ordered = if self.ordering.is_some() || self.nearest.is_some() {
2673                // If we are sorting the results there is no need to scan in order
2674                false
2675            } else if projection.with_row_last_updated_at_version
2676                || projection.with_row_created_at_version
2677            {
2678                // Version columns require ordered scanning because version metadata
2679                // is indexed by position within each fragment
2680                true
2681            } else {
2682                self.ordered
2683            };
2684
2685            let projection = if let Some(refine_expr) = filter_plan.refine_expr.as_ref() {
2686                if is_prefilter {
2687                    let refine_cols = Planner::column_names_in_expr(refine_expr);
2688                    projection.union_columns(refine_cols, OnMissing::Error)?
2689                } else {
2690                    projection
2691                }
2692            } else {
2693                projection
2694            };
2695
2696            // Can't push down limit for legacy scan if there is a refine step
2697            let scan_range = if filter_plan.has_refine() {
2698                None
2699            } else {
2700                scan_range
2701            };
2702
2703            let scan = self.scan_fragments(
2704                projection.with_row_id,
2705                self.projection_plan.physical_projection.with_row_addr,
2706                self.projection_plan
2707                    .physical_projection
2708                    .with_row_last_updated_at_version,
2709                self.projection_plan
2710                    .physical_projection
2711                    .with_row_created_at_version,
2712                make_deletions_null,
2713                Arc::new(projection.to_bare_schema()),
2714                fragments,
2715                scan_range,
2716                ordered,
2717            );
2718
2719            if filter_plan.has_refine() && is_prefilter {
2720                Ok(Arc::new(LanceFilterExec::try_new(
2721                    filter_plan.refine_expr.clone().unwrap(),
2722                    scan,
2723                )?) as Arc<dyn ExecutionPlan>)
2724            } else {
2725                Ok(scan)
2726            }
2727        }?;
2728        Ok(PlannedFilteredScan {
2729            plan,
2730            limit_pushed_down: false,
2731            filter_pushed_down,
2732        })
2733    }
2734
2735    // Helper function for filtered_read
2736    //
2737    // Do not call this directly, use filtered_read instead
2738    async fn new_filtered_read(
2739        &self,
2740        filter_plan: &ExprFilterPlan,
2741        projection: Projection,
2742        make_deletions_null: bool,
2743        fragments: Option<Arc<Vec<Fragment>>>,
2744        scan_range: Option<Range<u64>>,
2745    ) -> Result<Arc<dyn ExecutionPlan>> {
2746        let mut read_options = FilteredReadOptions::basic_full_read(&self.dataset)
2747            .with_filter_plan(filter_plan.clone())
2748            .with_projection(projection);
2749
2750        if let Some(fragments) = fragments {
2751            read_options = read_options.with_fragments(fragments);
2752        }
2753
2754        if let Some(scan_range) = scan_range {
2755            read_options = read_options.with_scan_range_before_filter(scan_range)?;
2756        }
2757
2758        if let Some(batch_size) = self.batch_size {
2759            read_options = read_options.with_batch_size(batch_size as u32);
2760        }
2761
2762        if let Some(file_reader_options) = self.resolved_file_reader_options() {
2763            read_options = read_options.with_file_reader_options(file_reader_options);
2764        }
2765
2766        if let Some(fragment_readahead) = self.fragment_readahead {
2767            read_options = read_options.with_fragment_readahead(fragment_readahead);
2768        }
2769
2770        if make_deletions_null {
2771            read_options = read_options.with_deleted_rows()?;
2772        }
2773
2774        if let Some(io_buffer_size_bytes) = self.io_buffer_size {
2775            read_options = read_options.with_io_buffer_size(io_buffer_size_bytes);
2776        }
2777
2778        let index_input = filter_plan.index_query.clone().map(|index_query| {
2779            Arc::new(ScalarIndexExec::new(self.dataset.clone(), index_query))
2780                as Arc<dyn ExecutionPlan>
2781        });
2782
2783        Ok(Arc::new(FilteredReadExec::try_new(
2784            self.dataset.clone(),
2785            read_options,
2786            index_input,
2787        )?))
2788    }
2789
2790    // Helper function for filtered read
2791    //
2792    // Delegates to legacy or new filtered read based on dataset storage version
2793    async fn filtered_read(
2794        &self,
2795        filter_plan: &ExprFilterPlan,
2796        projection: Projection,
2797        make_deletions_null: bool,
2798        fragments: Option<Arc<Vec<Fragment>>>,
2799        scan_range: Option<Range<u64>>,
2800        is_prefilter: bool,
2801    ) -> Result<PlannedFilteredScan> {
2802        // Use legacy path if dataset uses legacy storage format
2803        if self.dataset.is_legacy_storage() {
2804            self.legacy_filtered_read(
2805                filter_plan,
2806                projection,
2807                make_deletions_null,
2808                fragments,
2809                scan_range,
2810                is_prefilter,
2811            )
2812            .await
2813        } else {
2814            let limit_pushed_down = scan_range.is_some();
2815            let plan = self
2816                .new_filtered_read(
2817                    filter_plan,
2818                    projection,
2819                    make_deletions_null,
2820                    fragments,
2821                    scan_range,
2822                )
2823                .await?;
2824            Ok(PlannedFilteredScan {
2825                filter_pushed_down: true,
2826                limit_pushed_down,
2827                plan,
2828            })
2829        }
2830    }
2831
2832    fn u64s_as_take_input(&self, u64s: Vec<u64>) -> Result<Arc<dyn ExecutionPlan>> {
2833        let row_addrs = RowAddrTreeMap::from_iter(u64s);
2834        let row_addr_mask = RowAddrMask::from_allowed(row_addrs);
2835        let index_result = IndexExprResult::Exact(row_addr_mask);
2836        let fragments_covered = self.dataset.fragment_bitmap.as_ref().clone();
2837        let batch = index_result.serialize_to_arrow(&fragments_covered)?;
2838        let stream = futures::stream::once(async move { Ok(batch) });
2839        let stream = Box::pin(RecordBatchStreamAdapter::new(
2840            INDEX_EXPR_RESULT_SCHEMA.clone(),
2841            stream,
2842        ));
2843        Ok(Arc::new(OneShotExec::new(stream)))
2844    }
2845
2846    async fn take_source(&self, take_op: TakeOperation) -> Result<Arc<dyn ExecutionPlan>> {
2847        // We generally assume that late materialization does not make sense for take operations
2848        // so we can just use the physical projection
2849        let projection = self.projection_plan.physical_projection.clone();
2850
2851        let input = match take_op {
2852            TakeOperation::RowIds(ids) => self.u64s_as_take_input(ids),
2853            TakeOperation::RowAddrs(addrs) => self.u64s_as_take_input(addrs),
2854            TakeOperation::RowOffsets(offsets) => {
2855                let mut addrs =
2856                    row_offsets_to_row_addresses(&self.dataset.get_fragments(), &offsets).await?;
2857                addrs.retain(|addr| *addr != RowAddress::TOMBSTONE_ROW);
2858                self.u64s_as_take_input(addrs)
2859            }
2860        }?;
2861
2862        let mut filtered_read_options = FilteredReadOptions::new(projection);
2863        if let Some(fragment) = self.fragments.as_ref() {
2864            filtered_read_options =
2865                filtered_read_options.with_fragments(Arc::new(fragment.clone()));
2866        }
2867
2868        Ok(Arc::new(FilteredReadExec::try_new(
2869            self.dataset.clone(),
2870            filtered_read_options,
2871            Some(input),
2872        )?))
2873    }
2874
2875    async fn filtered_read_source(
2876        &self,
2877        filter_plan: &mut ExprFilterPlan,
2878    ) -> Result<PlannedFilteredScan> {
2879        log::trace!("source is a filtered read");
2880
2881        // Compute the effective projection based on what's actually needed.
2882        // If we have an aggregate, we only need the columns referenced by the aggregate,
2883        // not all the columns from the projection plan.
2884        let effective_projection = if let Some(agg) = &self.aggregate {
2885            let required_columns = agg.required_columns();
2886            if required_columns.is_empty() {
2887                // COUNT(*) or similar - no columns needed
2888                self.dataset.empty_projection()
2889            } else {
2890                // Aggregate needs specific columns
2891                self.dataset
2892                    .empty_projection()
2893                    .union_columns(&required_columns, OnMissing::Error)?
2894            }
2895        } else {
2896            self.projection_plan.physical_projection.clone()
2897        };
2898
2899        let mut projection = if filter_plan.has_refine() {
2900            // If the filter plan has two steps (a scalar indexed portion and a refine portion) then
2901            // it makes sense to grab cheap columns during the first step to avoid taking them for
2902            // the second step.
2903            self.calc_eager_projection(filter_plan, &effective_projection)?
2904                .with_row_id()
2905        } else {
2906            // If the filter plan only has one step then we just do a filtered read of all the
2907            // columns that the user asked for.
2908            effective_projection
2909        };
2910
2911        if projection.is_empty() {
2912            // If the user is not requesting any columns then we will scan the row address which
2913            // is cheap
2914            projection.with_row_addr = true;
2915        }
2916
2917        let scan_range = if filter_plan.is_empty() {
2918            log::trace!("pushing scan_range into filtered_read");
2919            self.get_scan_range(filter_plan).await?
2920        } else {
2921            None
2922        };
2923
2924        self.filtered_read(
2925            filter_plan,
2926            projection,
2927            self.include_deleted_rows,
2928            self.fragments.clone().map(Arc::new),
2929            scan_range,
2930            /*is_prefilter= */ false,
2931        )
2932        .await
2933    }
2934
2935    async fn fts_search_source(
2936        &self,
2937        filter_plan: &mut FilterPlan,
2938        query: &FullTextSearchQuery,
2939    ) -> Result<Arc<dyn ExecutionPlan>> {
2940        log::trace!("source is an fts search");
2941        if self.include_deleted_rows {
2942            return Err(Error::invalid_input_source(
2943                "Cannot include deleted rows in an FTS search".into(),
2944            ));
2945        }
2946
2947        // The source is an FTS search
2948        if self.prefilter {
2949            let source: Arc<dyn ExecutionPlan> = match &filter_plan.vector_filter() {
2950                Some(vector_query) => {
2951                    // Perform vector search first then rerank according to BM25 scores
2952                    let vector_plan = self
2953                        .vector_search(&filter_plan.expr_filter_plan, vector_query)
2954                        .await?;
2955                    self.fts_rerank(vector_plan, query).await?
2956                }
2957                None => self.fts(&filter_plan.expr_filter_plan, query).await?,
2958            };
2959            // If we are prefiltering then the fts node will take care of the filter
2960            filter_plan.disable_refine();
2961            Ok(source)
2962        } else {
2963            // If we are postfiltering then we can't use scalar indices for the filter
2964            // and will need to run the postfilter in memory
2965            filter_plan.make_refine_only();
2966            self.fts(&ExprFilterPlan::default(), query).await
2967        }
2968    }
2969
2970    async fn vector_search_source(
2971        &self,
2972        filter_plan: &mut FilterPlan,
2973    ) -> Result<Arc<dyn ExecutionPlan>> {
2974        if self.include_deleted_rows {
2975            return Err(Error::invalid_input_source(
2976                "Cannot include deleted rows in a nearest neighbor search".into(),
2977            ));
2978        }
2979        let Some(query) = self.nearest.as_ref() else {
2980            return Err(Error::invalid_input("No nearest query".to_string()));
2981        };
2982
2983        if self.prefilter {
2984            log::trace!("source is a vector search (prefilter)");
2985            // If we are prefiltering then the ann / knn node will take care of the filter
2986            let source: Arc<dyn ExecutionPlan> = match &filter_plan.fts_filter() {
2987                Some(fts_query) => {
2988                    let fts_plan = self.fts(&filter_plan.expr_filter_plan, fts_query).await?;
2989                    let projection = self
2990                        .dataset
2991                        .empty_projection()
2992                        .union_column(&query.column, OnMissing::Error)?;
2993                    let plan = self.take(fts_plan, projection)?;
2994
2995                    self.flat_knn(plan, query)?
2996                }
2997                None => {
2998                    self.vector_search(&filter_plan.expr_filter_plan, query)
2999                        .await?
3000                }
3001            };
3002
3003            filter_plan.disable_refine();
3004            Ok(source)
3005        } else {
3006            log::trace!("source is a vector search (postfilter)");
3007            // If we are postfiltering then we can't use scalar indices for the filter
3008            // and will need to run the postfilter in memory
3009            filter_plan.make_refine_only();
3010            self.vector_search(&ExprFilterPlan::default(), query).await
3011        }
3012    }
3013
3014    async fn fragments_covered_by_fts_leaf(
3015        &self,
3016        column: &str,
3017        accum: &mut RoaringBitmap,
3018    ) -> Result<bool> {
3019        let index = self
3020            .dataset
3021            .load_scalar_index(IndexCriteria::default().for_column(column).supports_fts())
3022            .await?;
3023        match index {
3024            Some(index) => match &index.fragment_bitmap {
3025                Some(fragmap) => {
3026                    *accum |= fragmap;
3027                    Ok(true)
3028                }
3029                None => Ok(false),
3030            },
3031            None => Ok(false),
3032        }
3033    }
3034
3035    #[async_recursion]
3036    async fn fragments_covered_by_fts_query_helper(
3037        &self,
3038        query: &FtsQuery,
3039        accum: &mut RoaringBitmap,
3040    ) -> Result<bool> {
3041        match query {
3042            FtsQuery::Match(match_query) => {
3043                self.fragments_covered_by_fts_leaf(
3044                    match_query.column.as_ref().ok_or(Error::invalid_input(
3045                        "the column must be specified in the query".to_string(),
3046                    ))?,
3047                    accum,
3048                )
3049                .await
3050            }
3051            FtsQuery::Boost(boost) => Ok(self
3052                .fragments_covered_by_fts_query_helper(&boost.negative, accum)
3053                .await?
3054                & self
3055                    .fragments_covered_by_fts_query_helper(&boost.positive, accum)
3056                    .await?),
3057            FtsQuery::MultiMatch(multi_match) => {
3058                for mq in &multi_match.match_queries {
3059                    if !self
3060                        .fragments_covered_by_fts_leaf(
3061                            mq.column.as_ref().ok_or(Error::invalid_input(
3062                                "the column must be specified in the query".to_string(),
3063                            ))?,
3064                            accum,
3065                        )
3066                        .await?
3067                    {
3068                        return Ok(false);
3069                    }
3070                }
3071                Ok(true)
3072            }
3073            FtsQuery::Phrase(phrase_query) => {
3074                self.fragments_covered_by_fts_leaf(
3075                    phrase_query.column.as_ref().ok_or(Error::invalid_input(
3076                        "the column must be specified in the query".to_string(),
3077                    ))?,
3078                    accum,
3079                )
3080                .await
3081            }
3082            FtsQuery::Boolean(bool_query) => {
3083                for query in bool_query.must.iter() {
3084                    if !self
3085                        .fragments_covered_by_fts_query_helper(query, accum)
3086                        .await?
3087                    {
3088                        return Ok(false);
3089                    }
3090                }
3091                for query in &bool_query.should {
3092                    if !self
3093                        .fragments_covered_by_fts_query_helper(query, accum)
3094                        .await?
3095                    {
3096                        return Ok(false);
3097                    }
3098                }
3099                Ok(true)
3100            }
3101        }
3102    }
3103
3104    async fn fragments_covered_by_fts_query(&self, query: &FtsQuery) -> Result<RoaringBitmap> {
3105        let all_fragments = self.get_fragments_as_bitmap();
3106
3107        let mut referenced_fragments = RoaringBitmap::new();
3108        if !self
3109            .fragments_covered_by_fts_query_helper(query, &mut referenced_fragments)
3110            .await?
3111        {
3112            // One or more indices is missing the fragment bitmap, require all fragments in prefilter
3113            Ok(all_fragments)
3114        } else {
3115            // Fragments required for prefilter is intersection of index fragments and query fragments
3116            Ok(all_fragments & referenced_fragments)
3117        }
3118    }
3119
3120    // Create an execution plan to do full text search
3121    async fn fts(
3122        &self,
3123        filter_plan: &ExprFilterPlan,
3124        query: &FullTextSearchQuery,
3125    ) -> Result<Arc<dyn ExecutionPlan>> {
3126        let columns = query.columns();
3127        let mut params = query.params();
3128        if params.limit.is_none() {
3129            let search_limit = match (self.limit, self.offset) {
3130                (Some(limit), Some(offset)) => Some((limit + offset) as usize),
3131                (Some(limit), None) => Some(limit as usize),
3132                (None, Some(_)) => None, // No limit but has offset - fetch all and let limit_node handle
3133                (None, None) => None,
3134            };
3135            params = params.with_limit(search_limit);
3136        }
3137        let query = if columns.is_empty() {
3138            // the field is not specified,
3139            // try to search over all indexed fields including nested ones
3140            let indexed_columns = fts_indexed_columns(self.dataset.clone()).await?;
3141            fill_fts_query_column(&query.query, &indexed_columns, false)?
3142        } else {
3143            query.query.clone()
3144        };
3145
3146        // TODO: Could maybe walk the query here to find all the indices that will be
3147        // involved in the query to calculate a more accuarate required_fragments than
3148        // get_fragments_as_bitmap but this is safe for now.
3149        let prefilter_source = self
3150            .prefilter_source(
3151                filter_plan,
3152                self.fragments_covered_by_fts_query(&query).await?,
3153            )
3154            .await?;
3155        let fts_exec = self
3156            .plan_fts(&query, &params, filter_plan, &prefilter_source)
3157            .await?;
3158        Ok(fts_exec)
3159    }
3160
3161    async fn plan_fts(
3162        &self,
3163        query: &FtsQuery,
3164        params: &FtsSearchParams,
3165        filter_plan: &ExprFilterPlan,
3166        prefilter_source: &PreFilterSource,
3167    ) -> Result<Arc<dyn ExecutionPlan>> {
3168        let plan: Arc<dyn ExecutionPlan> = match query {
3169            FtsQuery::Match(query) => {
3170                self.plan_match_query(query, params, filter_plan, prefilter_source)
3171                    .await?
3172            }
3173            FtsQuery::Phrase(query) => {
3174                self.plan_phrase_query(query, params, prefilter_source)
3175                    .await?
3176            }
3177
3178            FtsQuery::Boost(query) => {
3179                // for boost query, we need to erase the limit so that we can find
3180                // the documents that are not in the top-k results of the positive query,
3181                // but in the final top-k results.
3182                let unlimited_params = params.clone().with_limit(None);
3183                let positive_exec = Box::pin(self.plan_fts(
3184                    &query.positive,
3185                    &unlimited_params,
3186                    filter_plan,
3187                    prefilter_source,
3188                ));
3189                let negative_exec = Box::pin(self.plan_fts(
3190                    &query.negative,
3191                    &unlimited_params,
3192                    filter_plan,
3193                    prefilter_source,
3194                ));
3195                let (positive_exec, negative_exec) =
3196                    futures::future::try_join(positive_exec, negative_exec).await?;
3197                Arc::new(BoostQueryExec::new(
3198                    query.clone(),
3199                    params.clone(),
3200                    positive_exec,
3201                    negative_exec,
3202                ))
3203            }
3204
3205            FtsQuery::MultiMatch(query) => {
3206                let mut children = Vec::with_capacity(query.match_queries.len());
3207                for match_query in &query.match_queries {
3208                    let child =
3209                        self.plan_match_query(match_query, params, filter_plan, prefilter_source);
3210                    children.push(child);
3211                }
3212                let children = futures::future::try_join_all(children).await?;
3213
3214                let schema = children[0].schema();
3215                let group_expr = vec![(
3216                    expressions::col(ROW_ID, schema.as_ref())?,
3217                    ROW_ID.to_string(),
3218                )];
3219
3220                let fts_node = UnionExec::try_new(children)?;
3221                let fts_node = Arc::new(RepartitionExec::try_new(
3222                    fts_node,
3223                    Partitioning::RoundRobinBatch(1),
3224                )?);
3225                // dedup by row_id and return the max score as final score
3226                let fts_node = Arc::new(AggregateExec::try_new(
3227                    AggregateMode::Single,
3228                    PhysicalGroupBy::new_single(group_expr),
3229                    vec![Arc::new(
3230                        datafusion_physical_expr::aggregate::AggregateExprBuilder::new(
3231                            functions_aggregate::min_max::max_udaf(),
3232                            vec![expressions::col(SCORE_COL, &schema)?],
3233                        )
3234                        .schema(schema.clone())
3235                        .alias(SCORE_COL)
3236                        .build()?,
3237                    )],
3238                    vec![None],
3239                    fts_node,
3240                    schema,
3241                )?);
3242                let sort_expr = PhysicalSortExpr {
3243                    expr: expressions::col(SCORE_COL, fts_node.schema().as_ref())?,
3244                    options: SortOptions {
3245                        descending: true,
3246                        nulls_first: false,
3247                    },
3248                };
3249
3250                Arc::new(
3251                    SortExec::new([sort_expr].into(), fts_node)
3252                        .with_fetch(self.limit.map(|l| l as usize)),
3253                )
3254            }
3255            FtsQuery::Boolean(query) => {
3256                // TODO: rewrite the query for better performance
3257
3258                // we need to remove the limit from the params,
3259                // so that we won't miss possible matches
3260                let unlimited_params = params.clone().with_limit(None);
3261
3262                let mut should = Vec::with_capacity(query.should.len());
3263                for subquery in &query.should {
3264                    should.push(
3265                        Box::pin(self.plan_fts(
3266                            subquery,
3267                            &unlimited_params,
3268                            filter_plan,
3269                            prefilter_source,
3270                        ))
3271                        .await?,
3272                    );
3273                }
3274                let mut must = Vec::with_capacity(query.must.len());
3275                for subquery in &query.must {
3276                    must.push(
3277                        Box::pin(self.plan_fts(
3278                            subquery,
3279                            &unlimited_params,
3280                            filter_plan,
3281                            prefilter_source,
3282                        ))
3283                        .await?,
3284                    );
3285                }
3286                let mut must_not = Vec::with_capacity(query.must_not.len());
3287                for subquery in &query.must_not {
3288                    must_not.push(
3289                        Box::pin(self.plan_fts(
3290                            subquery,
3291                            &unlimited_params,
3292                            filter_plan,
3293                            prefilter_source,
3294                        ))
3295                        .await?,
3296                    );
3297                }
3298
3299                let should = build_boolean_query_children(BoolSlot::Should, should)?
3300                    .expect("Should slot always returns Some");
3301                let must = build_boolean_query_children(BoolSlot::Must, must)?;
3302                let must_not = build_boolean_query_children(BoolSlot::MustNot, must_not)?
3303                    .expect("MustNot slot always returns Some");
3304
3305                if query.should.is_empty() && must.is_none() {
3306                    return Err(Error::invalid_input(
3307                        "boolean query must have at least one should/must query".to_string(),
3308                    ));
3309                }
3310
3311                Arc::new(BooleanQueryExec::new(
3312                    query.clone(),
3313                    params.clone(),
3314                    should,
3315                    must,
3316                    must_not,
3317                ))
3318            }
3319        };
3320
3321        Ok(plan)
3322    }
3323
3324    async fn plan_phrase_query(
3325        &self,
3326        query: &PhraseQuery,
3327        params: &FtsSearchParams,
3328        prefilter_source: &PreFilterSource,
3329    ) -> Result<Arc<dyn ExecutionPlan>> {
3330        let column = query.column.clone().ok_or(Error::invalid_input(
3331            "the column must be specified in the query".to_string(),
3332        ))?;
3333
3334        let segments = load_segments(&self.dataset, &column)
3335            .await?
3336            .ok_or(Error::invalid_input(format!(
3337                "No Inverted index found for column {}",
3338                column
3339            )))?;
3340        let details = load_segment_details(&self.dataset, &column, &segments).await?;
3341
3342        if !details.with_position {
3343            return Err(Error::invalid_input("position is not found but required for phrase queries, try recreating the index with position"
3344                .to_string()));
3345        }
3346
3347        Ok(Arc::new(PhraseQueryExec::new(
3348            self.dataset.clone(),
3349            query.clone(),
3350            params.clone(),
3351            prefilter_source.clone(),
3352        )))
3353    }
3354
3355    async fn plan_match_query(
3356        &self,
3357        query: &MatchQuery,
3358        params: &FtsSearchParams,
3359        filter_plan: &ExprFilterPlan,
3360        prefilter_source: &PreFilterSource,
3361    ) -> Result<Arc<dyn ExecutionPlan>> {
3362        let column = query
3363            .column
3364            .as_ref()
3365            .ok_or(Error::invalid_input(
3366                "the column must be specified in the query".to_string(),
3367            ))?
3368            .clone();
3369
3370        let index = self
3371            .dataset
3372            .load_scalar_index(IndexCriteria::default().for_column(&column).supports_fts())
3373            .await?;
3374
3375        // Get target fragments
3376        let target_fragments = self
3377            .fragments
3378            .clone()
3379            .unwrap_or_else(|| self.dataset.fragments().to_vec());
3380
3381        let (match_plan, flat_match_plan) = match &index {
3382            Some(index) => {
3383                // Get unindexed fragments and filter to target fragments
3384                let unindexed_fragments = self
3385                    .retain_target_fragments(self.dataset.unindexed_fragments(&index.name).await?);
3386
3387                // If all target fragments are unindexed, skip index entirely
3388                if unindexed_fragments.len() == target_fragments.len() {
3389                    if self.fast_search {
3390                        return Ok(Arc::new(EmptyExec::new(FTS_SCHEMA.clone())));
3391                    }
3392                    let flat_match_plan = self
3393                        .plan_flat_match_query(unindexed_fragments, query, params, filter_plan)
3394                        .await?;
3395                    return Ok(flat_match_plan);
3396                }
3397
3398                // Mixed case: use index + flat search for unindexed
3399                let match_plan: Arc<dyn ExecutionPlan> = Arc::new(MatchQueryExec::new(
3400                    self.dataset.clone(),
3401                    query.clone(),
3402                    params.clone(),
3403                    prefilter_source.clone(),
3404                ));
3405
3406                if self.fast_search || unindexed_fragments.is_empty() {
3407                    (Some(match_plan), None)
3408                } else {
3409                    let flat_match_plan = self
3410                        .plan_flat_match_query(unindexed_fragments, query, params, filter_plan)
3411                        .await?;
3412                    (Some(match_plan), Some(flat_match_plan))
3413                }
3414            }
3415            None => {
3416                if self.fast_search {
3417                    return Ok(Arc::new(EmptyExec::new(FTS_SCHEMA.clone())));
3418                }
3419                // No index: flat search all target fragments
3420                let flat_match_plan = self
3421                    .plan_flat_match_query(target_fragments.clone(), query, params, filter_plan)
3422                    .await?;
3423                (None, Some(flat_match_plan))
3424            }
3425        };
3426
3427        // Combine plans
3428        let plan = match (match_plan, flat_match_plan) {
3429            (Some(match_plan), Some(flat_match_plan)) => {
3430                let match_plan = UnionExec::try_new(vec![match_plan, flat_match_plan])?;
3431                let match_plan = Arc::new(RepartitionExec::try_new(
3432                    match_plan,
3433                    Partitioning::RoundRobinBatch(1),
3434                )?);
3435                let sort_expr = PhysicalSortExpr {
3436                    expr: expressions::col(SCORE_COL, match_plan.schema().as_ref())?,
3437                    options: SortOptions {
3438                        descending: true,
3439                        nulls_first: false,
3440                    },
3441                };
3442                Arc::new(SortExec::new([sort_expr].into(), match_plan).with_fetch(params.limit))
3443            }
3444            (Some(match_plan), None) => match_plan,
3445            (None, Some(flat_match_plan)) => flat_match_plan,
3446            (None, None) => unreachable!(),
3447        };
3448
3449        Ok(plan)
3450    }
3451
3452    /// Plan match query on unindexed fragments
3453    async fn plan_flat_match_query(
3454        &self,
3455        fragments: Vec<Fragment>,
3456        query: &MatchQuery,
3457        params: &FtsSearchParams,
3458        filter_plan: &ExprFilterPlan,
3459    ) -> Result<Arc<dyn ExecutionPlan>> {
3460        let column = query
3461            .column
3462            .as_ref()
3463            .ok_or(Error::invalid_input(
3464                "the column must be specified in the query".to_string(),
3465            ))?
3466            .clone();
3467
3468        let mut columns = vec![column];
3469        if let Some(expr) = filter_plan.full_expr.as_ref() {
3470            let filter_columns = Planner::column_names_in_expr(expr);
3471            columns.extend(filter_columns);
3472        }
3473        let flat_fts_scan_schema = Arc::new(self.dataset.schema().project(&columns).unwrap());
3474        let mut scan_node = self.scan_fragments(
3475            true,
3476            false,
3477            false,
3478            false,
3479            false,
3480            flat_fts_scan_schema,
3481            Arc::new(fragments),
3482            None,
3483            false,
3484        );
3485
3486        if let Some(expr) = filter_plan.full_expr.as_ref() {
3487            // If there is a prefilter we need to manually apply it to the new data
3488            scan_node = Arc::new(LanceFilterExec::try_new(expr.clone(), scan_node)?);
3489        }
3490
3491        let flat_match_plan = Arc::new(FlatMatchQueryExec::new(
3492            self.dataset.clone(),
3493            query.clone(),
3494            params.clone(),
3495            scan_node,
3496        ));
3497        Ok(flat_match_plan)
3498    }
3499
3500    // ANN/KNN search execution node with optional prefilter
3501    async fn vector_search(
3502        &self,
3503        filter_plan: &ExprFilterPlan,
3504        q: &Query,
3505    ) -> Result<Arc<dyn ExecutionPlan>> {
3506        let mut q = q.clone();
3507
3508        // Sanity check
3509        let (vector_type, element_type) = get_vector_type(self.dataset.schema(), &q.column)?;
3510
3511        let column_id = self.dataset.schema().field_id(q.column.as_str())?;
3512        let use_index = q.use_index;
3513        let indices = if use_index {
3514            self.dataset.load_indices().await?
3515        } else {
3516            Arc::new(vec![])
3517        };
3518        let index_and_segments = if use_index {
3519            if let Some(requested_segments) = self.index_segments.as_ref() {
3520                let requested_segment_set =
3521                    requested_segments.iter().copied().collect::<HashSet<_>>();
3522                let requested_index_segments = indices
3523                    .iter()
3524                    .filter(|idx| requested_segment_set.contains(&idx.uuid))
3525                    .cloned()
3526                    .collect::<Vec<_>>();
3527
3528                if requested_index_segments.len() != requested_segment_set.len() {
3529                    let found_segment_set = requested_index_segments
3530                        .iter()
3531                        .map(|idx| idx.uuid)
3532                        .collect::<HashSet<_>>();
3533                    let missing_segments = requested_segment_set
3534                        .difference(&found_segment_set)
3535                        .map(ToString::to_string)
3536                        .collect::<Vec<_>>();
3537                    return Err(Error::invalid_input(format!(
3538                        "with_index_segments referenced unknown index segments: {missing_segments:?}",
3539                    )));
3540                }
3541
3542                if requested_index_segments
3543                    .iter()
3544                    .any(|idx| !idx.fields.contains(&column_id))
3545                {
3546                    return Err(Error::invalid_input(format!(
3547                        "with_index_segments contained a segment that does not belong to vector column '{}'",
3548                        q.column
3549                    )));
3550                }
3551
3552                let index_name = requested_index_segments[0].name.clone();
3553                if requested_index_segments
3554                    .iter()
3555                    .any(|idx| idx.name != index_name)
3556                {
3557                    return Err(Error::invalid_input(
3558                        "with_index_segments must reference segments from a single logical index"
3559                            .to_string(),
3560                    ));
3561                }
3562
3563                let selected_index_segments =
3564                    self.retain_relevant_index_segments(requested_index_segments);
3565                if selected_index_segments.is_empty() {
3566                    None
3567                } else {
3568                    let idx = self
3569                        .dataset
3570                        .open_vector_index(
3571                            q.column.as_str(),
3572                            &selected_index_segments[0].uuid.to_string(),
3573                            &NoOpMetricsCollector,
3574                        )
3575                        .await?;
3576                    let index_metric = idx.metric_type();
3577                    let use_this_index = match q.metric_type {
3578                        Some(user_metric) => {
3579                            if user_metric == index_metric {
3580                                true
3581                            } else {
3582                                return Err(Error::invalid_input(format!(
3583                                    "with_index_segments requested metric {:?} but the selected index segments use {:?}",
3584                                    user_metric, index_metric
3585                                )));
3586                            }
3587                        }
3588                        None => true,
3589                    };
3590                    if use_this_index {
3591                        Some((index_name, selected_index_segments, index_metric))
3592                    } else {
3593                        None
3594                    }
3595                }
3596            } else if let Some(index) = indices.iter().find(|i| i.fields.contains(&column_id)) {
3597                // Try to get metric type from index metadata first (fast path for newer indices)
3598                let index_metric = if let Some(metric) =
3599                    crate::index::vector::details::metric_type_from_index_metadata(index)
3600                {
3601                    metric
3602                } else {
3603                    // Fall back to opening the index for legacy indices without details
3604                    let idx = self
3605                        .dataset
3606                        .open_vector_index(
3607                            q.column.as_str(),
3608                            &index.uuid.to_string(),
3609                            &NoOpMetricsCollector,
3610                        )
3611                        .await?;
3612                    idx.metric_type()
3613                };
3614
3615                let use_this_index = match q.metric_type {
3616                    Some(user_metric) => {
3617                        if user_metric == index_metric {
3618                            true
3619                        } else {
3620                            log::warn!(
3621                                "Requested metric {:?} is incompatible with index metric {:?}, falling back to brute-force search",
3622                                user_metric,
3623                                index_metric
3624                            );
3625                            false
3626                        }
3627                    }
3628                    None => true,
3629                };
3630
3631                if use_this_index {
3632                    let index_segments = self.retain_relevant_index_segments(
3633                        self.dataset.load_indices_by_name(&index.name).await?,
3634                    );
3635                    let index_frags = self.get_indexed_frags(&index_segments);
3636                    if !index_segments.is_empty() && !index_frags.is_empty() {
3637                        Some((index.name.clone(), index_segments, index_metric))
3638                    } else {
3639                        None
3640                    }
3641                } else {
3642                    None
3643                }
3644            } else {
3645                None
3646            }
3647        } else {
3648            None
3649        };
3650
3651        if let Some((index_name, index_segments, index_metric)) = index_and_segments {
3652            log::trace!("index found for vector search");
3653            // Use the index's metric type
3654            q.metric_type = Some(index_metric);
3655            validate_distance_type_for(index_metric, &element_type)?;
3656
3657            if matches!(q.refine_factor, Some(0)) {
3658                return Err(Error::invalid_input(
3659                    "Refine factor cannot be zero".to_string(),
3660                ));
3661            }
3662            let ann_node = match vector_type {
3663                DataType::FixedSizeList(_, _) => self.ann(&q, &index_segments, filter_plan).await?,
3664                DataType::List(_) => self.multivec_ann(&q, &index_segments, filter_plan).await?,
3665                _ => unreachable!(),
3666            };
3667
3668            let mut knn_node = if q.refine_factor.is_some() {
3669                let vector_projection = self
3670                    .dataset
3671                    .empty_projection()
3672                    .union_column(&q.column, OnMissing::Error)
3673                    .unwrap();
3674                let knn_node_with_vector = self.take(ann_node, vector_projection)?;
3675                self.flat_knn(knn_node_with_vector, &q)?
3676            } else {
3677                ann_node
3678            }; // vector, _distance, _rowid
3679
3680            if !self.fast_search {
3681                knn_node = self
3682                    .knn_combined(&q, &index_name, &index_segments, knn_node, filter_plan)
3683                    .await?;
3684            }
3685
3686            Ok(knn_node)
3687        } else {
3688            if self.fast_search {
3689                return Ok(Arc::new(EmptyExec::new(KNN_INDEX_SCHEMA.clone())));
3690            }
3691            // Resolve metric type for flat search (use default if not specified)
3692            let metric = q
3693                .metric_type
3694                .unwrap_or_else(|| default_distance_type_for(&element_type));
3695            q.metric_type = Some(metric);
3696            validate_distance_type_for(metric, &element_type)?;
3697            // No index found. use flat search.
3698            let mut columns = vec![q.column.clone()];
3699            if let Some(refine_expr) = filter_plan.refine_expr.as_ref() {
3700                columns.extend(Planner::column_names_in_expr(refine_expr));
3701            }
3702            let mut vector_scan_projection = self
3703                .dataset
3704                .empty_projection()
3705                .with_row_id()
3706                .union_columns(&columns, OnMissing::Error)?;
3707
3708            vector_scan_projection.with_row_addr =
3709                self.projection_plan.physical_projection.with_row_addr;
3710
3711            let PlannedFilteredScan { mut plan, .. } = self
3712                .filtered_read(
3713                    filter_plan,
3714                    vector_scan_projection,
3715                    /*include_deleted_rows=*/ true,
3716                    self.fragments.clone().map(Arc::new),
3717                    None,
3718                    /*is_prefilter= */ true,
3719                )
3720                .await?;
3721
3722            if let Some(refine_expr) = &filter_plan.refine_expr {
3723                plan = Arc::new(LanceFilterExec::try_new(refine_expr.clone(), plan)?);
3724            }
3725            Ok(self.flat_knn(plan, &q)?)
3726        }
3727    }
3728
3729    /// Combine ANN results with KNN results for data appended after index creation
3730    async fn knn_combined(
3731        &self,
3732        q: &Query,
3733        index_name: &str,
3734        indexed_segments: &[IndexMetadata],
3735        mut knn_node: Arc<dyn ExecutionPlan>,
3736        filter_plan: &ExprFilterPlan,
3737    ) -> Result<Arc<dyn ExecutionPlan>> {
3738        let fallback_fragments = if let Some(target_fragments) = &self.fragments {
3739            let indexed_fragments = self.get_indexed_frags(indexed_segments);
3740            target_fragments
3741                .iter()
3742                .filter(|fragment| !indexed_fragments.contains(fragment.id as u32))
3743                .cloned()
3744                .collect::<Vec<_>>()
3745        } else if self.index_segments.is_some() {
3746            Vec::new()
3747        } else {
3748            self.dataset.unindexed_fragments(index_name).await?
3749        };
3750
3751        if !fallback_fragments.is_empty() {
3752            let q = q.clone();
3753            debug_assert!(q.metric_type.is_some());
3754
3755            // If the vector column is not present, we need to take the vector column, so
3756            // that the distance value is comparable with the flat search ones.
3757            if knn_node.schema().column_with_name(&q.column).is_none() {
3758                let vector_projection = self
3759                    .dataset
3760                    .empty_projection()
3761                    .union_column(&q.column, OnMissing::Error)
3762                    .unwrap();
3763                knn_node = self.take(knn_node, vector_projection)?;
3764            }
3765
3766            let mut columns = vec![q.column.clone()];
3767            if let Some(expr) = filter_plan.full_expr.as_ref() {
3768                let filter_columns = Planner::column_names_in_expr(expr);
3769                columns.extend(filter_columns);
3770            }
3771            let vector_scan_projection = Arc::new(self.dataset.schema().project(&columns).unwrap());
3772            // Note: we could try and use the scalar indices here to reduce the scope of this scan but the
3773            // most common case is that fragments that are newer than the vector index are going to be newer
3774            // than the scalar indices anyways
3775            let mut scan_node = self.scan_fragments(
3776                true,
3777                false,
3778                false,
3779                false,
3780                false,
3781                vector_scan_projection,
3782                Arc::new(fallback_fragments),
3783                // Can't pushdown limit/offset in an ANN search
3784                None,
3785                // We are re-ordering anyways, so no need to get data in data
3786                // in a deterministic order.
3787                false,
3788            );
3789
3790            if let Some(expr) = filter_plan.full_expr.as_ref() {
3791                // If there is a prefilter we need to manually apply it to the new data
3792                scan_node = Arc::new(LanceFilterExec::try_new(expr.clone(), scan_node)?);
3793            }
3794            // first we do flat search on just the new data
3795            let topk_appended = self.flat_knn(scan_node, &q)?;
3796
3797            // To do a union, we need to make the schemas match. Right now
3798            // knn_node: _distance, _rowid, vector
3799            // topk_appended: vector, <filter columns?>, _rowid, _distance
3800            let topk_appended = project(topk_appended, knn_node.schema().as_ref())?;
3801            assert!(
3802                topk_appended
3803                    .schema()
3804                    .equivalent_names_and_types(&knn_node.schema())
3805            );
3806            // union
3807            let unioned = UnionExec::try_new(vec![Arc::new(topk_appended), knn_node])?;
3808            // Enforce only 1 partition.
3809            let unioned = RepartitionExec::try_new(
3810                unioned,
3811                datafusion::physical_plan::Partitioning::RoundRobinBatch(1),
3812            )?;
3813            // then we do a flat search on KNN(new data) + ANN(indexed data)
3814            return self.flat_knn(Arc::new(unioned), &q);
3815        }
3816
3817        Ok(knn_node)
3818    }
3819
3820    #[async_recursion]
3821    async fn fragments_covered_by_index_query(
3822        &self,
3823        index_expr: &ScalarIndexExpr,
3824    ) -> Result<RoaringBitmap> {
3825        match index_expr {
3826            ScalarIndexExpr::And(lhs, rhs) => {
3827                Ok(self.fragments_covered_by_index_query(lhs).await?
3828                    & self.fragments_covered_by_index_query(rhs).await?)
3829            }
3830            ScalarIndexExpr::Or(lhs, rhs) => Ok(self.fragments_covered_by_index_query(lhs).await?
3831                & self.fragments_covered_by_index_query(rhs).await?),
3832            ScalarIndexExpr::Not(expr) => self.fragments_covered_by_index_query(expr).await,
3833            ScalarIndexExpr::Query(search) => scalar_index_fragment_bitmap(
3834                self.dataset.as_ref(),
3835                &search.column,
3836                &search.index_name,
3837            )
3838            .await?
3839            .ok_or_else(|| {
3840                crate::Error::internal(format!(
3841                    "Index not found even though it must have been found earlier: {}",
3842                    search.index_name
3843                ))
3844            }),
3845        }
3846    }
3847
3848    /// Given an index query, split the fragments into two sets
3849    ///
3850    /// The first set is the relevant fragments, which are covered by ALL indices in the query
3851    /// The second set is the missing fragments, which are missed by at least one index
3852    ///
3853    /// There is no point in handling the case where a fragment is covered by some (but not all)
3854    /// of the indices.  If we have to do a full scan of the fragment then we do it
3855    async fn partition_frags_by_coverage(
3856        &self,
3857        index_expr: &ScalarIndexExpr,
3858        fragments: Arc<Vec<Fragment>>,
3859    ) -> Result<(Vec<Fragment>, Vec<Fragment>)> {
3860        let covered_frags = self.fragments_covered_by_index_query(index_expr).await?;
3861        let mut relevant_frags = Vec::with_capacity(fragments.len());
3862        let mut missing_frags = Vec::with_capacity(fragments.len());
3863        for fragment in fragments.iter() {
3864            if covered_frags.contains(fragment.id as u32) {
3865                relevant_frags.push(fragment.clone());
3866            } else {
3867                missing_frags.push(fragment.clone());
3868            }
3869        }
3870        Ok((relevant_frags, missing_frags))
3871    }
3872
3873    // First perform a lookup in a scalar index for ids and then perform a take on the
3874    // target fragments with those ids
3875    async fn scalar_indexed_scan(
3876        &self,
3877        projection: Projection,
3878        filter_plan: &ExprFilterPlan,
3879        fragments: Arc<Vec<Fragment>>,
3880    ) -> Result<Arc<dyn ExecutionPlan>> {
3881        log::trace!("scalar indexed scan");
3882        // One or more scalar indices cover this data and there is a filter which is
3883        // compatible with the indices.  Use that filter to perform a take instead of
3884        // a full scan.
3885
3886        // If this unwrap fails we have a bug because we shouldn't be using this function unless we've already
3887        // checked that there is an index query
3888        let index_expr = filter_plan.index_query.as_ref().unwrap();
3889
3890        let needs_recheck = index_expr.needs_recheck();
3891
3892        // Figure out which fragments are covered by ALL indices
3893        let (relevant_frags, missing_frags) = self
3894            .partition_frags_by_coverage(index_expr, fragments)
3895            .await?;
3896
3897        let mut plan: Arc<dyn ExecutionPlan> = Arc::new(MaterializeIndexExec::new(
3898            self.dataset.clone(),
3899            index_expr.clone(),
3900            Arc::new(relevant_frags),
3901        ));
3902
3903        let refine_expr = filter_plan.refine_expr.as_ref();
3904
3905        // If all we want is the row ids then we can skip the take.  However, if there is a refine
3906        // or a recheck then we still need to do a take because we need filter columns.
3907        let needs_take =
3908            needs_recheck || projection.has_data_fields() || filter_plan.refine_expr.is_some();
3909        if needs_take {
3910            let mut take_projection = projection.clone();
3911            if needs_recheck {
3912                // If we need to recheck then we need to also take the columns used for the filter
3913                let filter_expr = index_expr.to_expr();
3914                let filter_cols = Planner::column_names_in_expr(&filter_expr);
3915                take_projection = take_projection.union_columns(filter_cols, OnMissing::Error)?;
3916            }
3917            if let Some(refine_expr) = refine_expr {
3918                let refine_cols = Planner::column_names_in_expr(refine_expr);
3919                take_projection = take_projection.union_columns(refine_cols, OnMissing::Error)?;
3920            }
3921            log::trace!("need to take additional columns for scalar_indexed_scan");
3922            plan = self.take(plan, take_projection)?;
3923        }
3924
3925        let post_take_filter = match (needs_recheck, refine_expr) {
3926            (false, None) => None,
3927            (true, None) => {
3928                // If we need to recheck then we need to apply the filter to the results
3929                Some(index_expr.to_expr())
3930            }
3931            (true, Some(_)) => Some(filter_plan.full_expr.as_ref().unwrap().clone()),
3932            (false, Some(refine_expr)) => Some(refine_expr.clone()),
3933        };
3934
3935        if let Some(post_take_filter) = post_take_filter {
3936            let planner = Planner::new(plan.schema());
3937            let optimized_filter = planner.optimize_expr(post_take_filter)?;
3938
3939            log::trace!("applying post-take filter to indexed scan");
3940            plan = Arc::new(LanceFilterExec::try_new(optimized_filter, plan)?);
3941        }
3942
3943        if self.projection_plan.physical_projection.with_row_addr {
3944            plan = Arc::new(AddRowAddrExec::try_new(plan, self.dataset.clone(), 0)?);
3945        }
3946
3947        let new_data_path: Option<Arc<dyn ExecutionPlan>> = if !missing_frags.is_empty() {
3948            log::trace!(
3949                "scalar_indexed_scan will need full scan of {} missing fragments",
3950                missing_frags.len()
3951            );
3952
3953            // If there is new data then we need this:
3954            //
3955            // MaterializeIndexExec(old_frags) -> Take -> Union
3956            // Scan(new_frags) -> Filter -> Project    -|
3957            //
3958            // The project is to drop any columns we had to include
3959            // in the full scan merely for the sake of fulfilling the
3960            // filter.
3961            //
3962            // If there were no extra columns then we still need the project
3963            // because Materialize -> Take puts the row id at the left and
3964            // Scan puts the row id at the right
3965            let filter = filter_plan.full_expr.as_ref().unwrap();
3966            let filter_cols = Planner::column_names_in_expr(filter);
3967            let scan_projection = projection.union_columns(filter_cols, OnMissing::Error)?;
3968
3969            let scan_schema = Arc::new(scan_projection.to_bare_schema());
3970            let scan_arrow_schema = Arc::new(scan_schema.as_ref().into());
3971            let planner = Planner::new(scan_arrow_schema);
3972            let optimized_filter = planner.optimize_expr(filter.clone())?;
3973
3974            let new_data_scan = self.scan_fragments(
3975                true,
3976                self.projection_plan.physical_projection.with_row_addr,
3977                self.projection_plan
3978                    .physical_projection
3979                    .with_row_last_updated_at_version,
3980                self.projection_plan
3981                    .physical_projection
3982                    .with_row_created_at_version,
3983                false,
3984                scan_schema,
3985                missing_frags.into(),
3986                // No pushdown of limit/offset when doing scalar indexed scan
3987                None,
3988                false,
3989            );
3990            let filtered = Arc::new(LanceFilterExec::try_new(optimized_filter, new_data_scan)?);
3991            Some(Arc::new(project(filtered, plan.schema().as_ref())?))
3992        } else {
3993            log::trace!("scalar_indexed_scan will not need full scan of any missing fragments");
3994            None
3995        };
3996
3997        if let Some(new_data_path) = new_data_path {
3998            let unioned = UnionExec::try_new(vec![plan, new_data_path])?;
3999            // Enforce only 1 partition.
4000            let unioned = Arc::new(RepartitionExec::try_new(
4001                unioned,
4002                datafusion::physical_plan::Partitioning::RoundRobinBatch(1),
4003            )?);
4004            Ok(unioned)
4005        } else {
4006            Ok(plan)
4007        }
4008    }
4009
4010    fn get_io_buffer_size(&self) -> u64 {
4011        self.io_buffer_size.unwrap_or(*DEFAULT_IO_BUFFER_SIZE)
4012    }
4013
4014    /// Create an Execution plan with a scan node
4015    ///
4016    /// Setting `with_make_deletions_null` will use the validity of the _rowid
4017    /// column as a selection vector. Read more in [crate::io::FileReader].
4018    #[allow(clippy::too_many_arguments)]
4019    pub(crate) fn scan(
4020        &self,
4021        with_row_id: bool,
4022        with_row_address: bool,
4023        with_row_last_updated_at_version: bool,
4024        with_row_created_at_version: bool,
4025        with_make_deletions_null: bool,
4026        range: Option<Range<u64>>,
4027        projection: Arc<Schema>,
4028    ) -> Arc<dyn ExecutionPlan> {
4029        let fragments = if let Some(fragment) = self.fragments.as_ref() {
4030            Arc::new(fragment.clone())
4031        } else {
4032            self.dataset.fragments().clone()
4033        };
4034        let ordered = if self.ordering.is_some() || self.nearest.is_some() {
4035            // If we are sorting the results there is no need to scan in order
4036            false
4037        } else {
4038            self.ordered
4039        };
4040        self.scan_fragments(
4041            with_row_id,
4042            with_row_address,
4043            with_row_last_updated_at_version,
4044            with_row_created_at_version,
4045            with_make_deletions_null,
4046            projection,
4047            fragments,
4048            range,
4049            ordered,
4050        )
4051    }
4052
4053    #[allow(clippy::too_many_arguments)]
4054    fn scan_fragments(
4055        &self,
4056        with_row_id: bool,
4057        with_row_address: bool,
4058        with_row_last_updated_at_version: bool,
4059        with_row_created_at_version: bool,
4060        with_make_deletions_null: bool,
4061        projection: Arc<Schema>,
4062        fragments: Arc<Vec<Fragment>>,
4063        range: Option<Range<u64>>,
4064        ordered: bool,
4065    ) -> Arc<dyn ExecutionPlan> {
4066        log::trace!("scan_fragments covered {} fragments", fragments.len());
4067        let config = LanceScanConfig {
4068            batch_size: self.get_batch_size(),
4069            batch_readahead: self.batch_readahead,
4070            fragment_readahead: self.fragment_readahead,
4071            io_buffer_size: self.get_io_buffer_size(),
4072            with_row_id,
4073            with_row_address,
4074            with_row_last_updated_at_version,
4075            with_row_created_at_version,
4076            with_make_deletions_null,
4077            ordered_output: ordered,
4078            file_reader_options: self.resolved_file_reader_options(),
4079        };
4080        Arc::new(LanceScanExec::new(
4081            self.dataset.clone(),
4082            fragments,
4083            range,
4084            projection,
4085            config,
4086        ))
4087    }
4088
4089    fn pushdown_scan(
4090        &self,
4091        make_deletions_null: bool,
4092        filter_plan: &ExprFilterPlan,
4093    ) -> Result<Arc<dyn ExecutionPlan>> {
4094        log::trace!("pushdown_scan");
4095
4096        let config = ScanConfig {
4097            batch_readahead: self.batch_readahead,
4098            fragment_readahead: self
4099                .fragment_readahead
4100                .unwrap_or(LEGACY_DEFAULT_FRAGMENT_READAHEAD),
4101            with_row_id: self.projection_plan.physical_projection.with_row_id,
4102            with_row_address: self.projection_plan.physical_projection.with_row_addr,
4103            make_deletions_null,
4104            ordered_output: self.ordered,
4105            file_reader_options: self.resolved_file_reader_options(),
4106        };
4107
4108        let fragments = if let Some(fragment) = self.fragments.as_ref() {
4109            Arc::new(fragment.clone())
4110        } else {
4111            self.dataset.fragments().clone()
4112        };
4113
4114        Ok(Arc::new(LancePushdownScanExec::try_new(
4115            self.dataset.clone(),
4116            fragments,
4117            Arc::new(self.projection_plan.physical_projection.to_bare_schema()),
4118            filter_plan.refine_expr.clone().unwrap(),
4119            config,
4120        )?))
4121    }
4122
4123    /// Here we use a full text search as a post-filter.  Any rows that
4124    /// do not contain at least one query token are removed.
4125    ///
4126    /// Only valid (currently) for match queries.
4127    async fn flat_fts_filter(
4128        &self,
4129        input: Arc<dyn ExecutionPlan>,
4130        q: &FullTextSearchQuery,
4131    ) -> Result<Arc<dyn ExecutionPlan>> {
4132        let fts_query = if q.columns().is_empty() {
4133            let indexed_columns = fts_indexed_columns(self.dataset.clone()).await?;
4134            fill_fts_query_column(&q.query, &indexed_columns, false)?
4135        } else {
4136            q.query.clone()
4137        };
4138
4139        match &fts_query {
4140            FtsQuery::Match(match_query) => {
4141                let schema = Arc::new((input.schema()).try_with_column(SCORE_FIELD.clone())?);
4142
4143                let column = match_query
4144                    .column
4145                    .as_ref()
4146                    .ok_or(Error::invalid_input(
4147                        "the column must be specified in the query".to_string(),
4148                    ))?
4149                    .clone();
4150                let input = if schema.column_with_name(&column).is_none() {
4151                    let projection = self
4152                        .dataset
4153                        .empty_projection()
4154                        .union_column(&column, OnMissing::Error)?;
4155                    self.take(input, projection)?
4156                } else {
4157                    input
4158                };
4159
4160                Ok(Arc::new(FlatMatchFilterExec::new(
4161                    input,
4162                    self.dataset.clone(),
4163                    match_query.clone(),
4164                    q.params(),
4165                )))
4166            }
4167            _ => Err(Error::not_supported(
4168                "Only Match queries are supported currently when using FTS as a post-filter",
4169            )),
4170        }
4171    }
4172
4173    /// Here we consume all input (as unindexed) and rerank according to BM25 scores
4174    ///
4175    /// If there is an index on the column then we still use the index to determine the
4176    /// tokenizer and inform the BM25 scoring (e.g. avg doc length, token frequency, etc.)
4177    async fn fts_rerank(
4178        &self,
4179        input: Arc<dyn ExecutionPlan>,
4180        q: &FullTextSearchQuery,
4181    ) -> Result<Arc<dyn ExecutionPlan>> {
4182        let fts_query = if q.columns().is_empty() {
4183            let indexed_columns = fts_indexed_columns(self.dataset.clone()).await?;
4184            fill_fts_query_column(&q.query, &indexed_columns, false)?
4185        } else {
4186            q.query.clone()
4187        };
4188
4189        match &fts_query {
4190            FtsQuery::Match(match_query) => {
4191                let schema = Arc::new((input.schema()).try_with_column(SCORE_FIELD.clone())?);
4192
4193                let column = match_query
4194                    .column
4195                    .as_ref()
4196                    .ok_or(Error::invalid_input(
4197                        "the column must be specified in the query".to_string(),
4198                    ))?
4199                    .clone();
4200                let input = if schema.column_with_name(&column).is_none() {
4201                    let projection = self
4202                        .dataset
4203                        .empty_projection()
4204                        .union_column(&column, OnMissing::Error)?;
4205                    self.take(input, projection)?
4206                } else {
4207                    input
4208                };
4209
4210                Ok(Arc::new(FlatMatchQueryExec::new(
4211                    self.dataset.clone(),
4212                    match_query.clone(),
4213                    q.params(),
4214                    input,
4215                )))
4216            }
4217            _ => {
4218                let default_filter = ExprFilterPlan::default();
4219                let fts_plan = self.fts(&default_filter, q).await?;
4220
4221                let vector_row_id = Column::new_with_schema(ROW_ID, input.schema().as_ref())?;
4222                let fts_row_id = Column::new_with_schema(ROW_ID, fts_plan.schema().as_ref())?;
4223                let join = HashJoinExec::try_new(
4224                    input,
4225                    fts_plan,
4226                    vec![(Arc::new(vector_row_id), Arc::new(fts_row_id))],
4227                    None,
4228                    &JoinType::Inner,
4229                    None,
4230                    PartitionMode::CollectLeft,
4231                    NullEquality::NullEqualsNull,
4232                    false,
4233                )?;
4234
4235                let schema = join.schema();
4236                let mut projection_exprs = Vec::new();
4237                let mut contain_rowid = false;
4238                for field in schema.fields() {
4239                    if field.name() == ROW_ID {
4240                        if contain_rowid {
4241                            continue;
4242                        }
4243                        contain_rowid = true;
4244                    }
4245                    projection_exprs.push((
4246                        Arc::new(Column::new_with_schema(field.name(), schema.as_ref())?)
4247                            as Arc<dyn PhysicalExpr>,
4248                        field.name().clone(),
4249                    ));
4250                }
4251
4252                let projection_exec = ProjectionExec::try_new(projection_exprs, Arc::new(join))?;
4253                Ok(Arc::new(projection_exec))
4254            }
4255        }
4256    }
4257
4258    /// Add a knn search node to the input plan
4259    fn flat_knn(&self, input: Arc<dyn ExecutionPlan>, q: &Query) -> Result<Arc<dyn ExecutionPlan>> {
4260        // Resolve metric_type if not set (use default for the column's element type)
4261        let metric_type = match q.metric_type {
4262            Some(m) => m,
4263            None => {
4264                let (_, element_type) = get_vector_type(self.dataset.schema(), &q.column)?;
4265                default_distance_type_for(&element_type)
4266            }
4267        };
4268        let flat_dist = Arc::new(KNNVectorDistanceExec::try_new(
4269            input,
4270            &q.column,
4271            q.key.clone(),
4272            metric_type,
4273        )?);
4274
4275        let lower: Option<(Expr, Arc<dyn PhysicalExpr>)> = q
4276            .lower_bound
4277            .map(|v| -> Result<(Expr, Arc<dyn PhysicalExpr>)> {
4278                let logical = col(DIST_COL).gt_eq(lit(v));
4279                let schema = flat_dist.schema();
4280                let df_schema = DFSchema::try_from(schema)?;
4281                let physical = create_physical_expr(&logical, &df_schema, &ExecutionProps::new())?;
4282                Ok::<(Expr, Arc<dyn PhysicalExpr>), _>((logical, physical))
4283            })
4284            .transpose()?;
4285
4286        let upper = q
4287            .upper_bound
4288            .map(|v| -> Result<(Expr, Arc<dyn PhysicalExpr>)> {
4289                let logical = col(DIST_COL).lt(lit(v));
4290                let schema = flat_dist.schema();
4291                let df_schema = DFSchema::try_from(schema)?;
4292                let physical = create_physical_expr(&logical, &df_schema, &ExecutionProps::new())?;
4293                Ok::<(Expr, Arc<dyn PhysicalExpr>), _>((logical, physical))
4294            })
4295            .transpose()?;
4296
4297        let filter_expr = match (lower, upper) {
4298            (Some((llog, _)), Some((ulog, _))) => {
4299                let logical = llog.and(ulog);
4300                let schema = flat_dist.schema();
4301                let df_schema = DFSchema::try_from(schema)?;
4302                let physical = create_physical_expr(&logical, &df_schema, &ExecutionProps::new())?;
4303                Some((logical, physical))
4304            }
4305            (Some((llog, lphys)), None) => Some((llog, lphys)),
4306            (None, Some((ulog, uphys))) => Some((ulog, uphys)),
4307            (None, None) => None,
4308        };
4309
4310        let knn_plan: Arc<dyn ExecutionPlan> = if let Some(filter_expr) = filter_expr {
4311            Arc::new(LanceFilterExec::try_new(filter_expr.0, flat_dist)?)
4312        } else {
4313            flat_dist
4314        };
4315
4316        // Use DataFusion's [SortExec] for Top-K search
4317        let sort = SortExec::new(
4318            [
4319                PhysicalSortExpr {
4320                    expr: expressions::col(DIST_COL, knn_plan.schema().as_ref())?,
4321                    options: SortOptions {
4322                        descending: false,
4323                        nulls_first: false,
4324                    },
4325                },
4326                PhysicalSortExpr {
4327                    expr: expressions::col(ROW_ID, knn_plan.schema().as_ref())?,
4328                    options: SortOptions {
4329                        descending: false,
4330                        nulls_first: false,
4331                    },
4332                },
4333            ]
4334            .into(),
4335            knn_plan,
4336        )
4337        .with_fetch(Some(q.k));
4338
4339        let logical_not_null = col(DIST_COL).is_not_null();
4340        let not_nulls = Arc::new(LanceFilterExec::try_new(logical_not_null, Arc::new(sort))?);
4341
4342        Ok(not_nulls)
4343    }
4344
4345    fn get_fragments_as_bitmap(&self) -> RoaringBitmap {
4346        if let Some(fragments) = &self.fragments {
4347            RoaringBitmap::from_iter(fragments.iter().map(|f| f.id as u32))
4348        } else {
4349            self.dataset.fragment_bitmap.as_ref().clone()
4350        }
4351    }
4352
4353    fn retain_relevant_index_segments(
4354        &self,
4355        index_segments: Vec<IndexMetadata>,
4356    ) -> Vec<IndexMetadata> {
4357        if let Some(fragments) = &self.fragments {
4358            let target_fragments = RoaringBitmap::from_iter(fragments.iter().map(|f| f.id as u32));
4359            index_segments
4360                .into_iter()
4361                .filter(|idx| {
4362                    idx.fragment_bitmap
4363                        .as_ref()
4364                        .is_some_and(|fragmap| !(fragmap & &target_fragments).is_empty())
4365                })
4366                .collect()
4367        } else {
4368            index_segments
4369        }
4370    }
4371
4372    /// Retain only fragments that are in the user-specified fragment list.
4373    /// If no fragment list is specified, returns the fragments unchanged.
4374    fn retain_target_fragments(&self, mut fragments: Vec<Fragment>) -> Vec<Fragment> {
4375        if let Some(target) = &self.fragments {
4376            let bitmap = RoaringBitmap::from_iter(target.iter().map(|f| f.id as u32));
4377            fragments.retain(|f| bitmap.contains(f.id as u32));
4378        }
4379        fragments
4380    }
4381
4382    fn get_indexed_frags(&self, index: &[IndexMetadata]) -> RoaringBitmap {
4383        let all_fragments = self.get_fragments_as_bitmap();
4384
4385        let mut all_indexed_frags = RoaringBitmap::new();
4386        for idx in index {
4387            if let Some(fragmap) = idx.fragment_bitmap.as_ref() {
4388                all_indexed_frags |= fragmap;
4389            } else {
4390                // If any index is missing the fragment bitmap it is safest to just assume we
4391                // need all fragments
4392                return all_fragments;
4393            }
4394        }
4395
4396        all_indexed_frags & all_fragments
4397    }
4398
4399    /// Create an Execution plan to do indexed ANN search
4400    async fn ann(
4401        &self,
4402        q: &Query,
4403        index: &[IndexMetadata],
4404        filter_plan: &ExprFilterPlan,
4405    ) -> Result<Arc<dyn ExecutionPlan>> {
4406        let prefilter_source = self
4407            .prefilter_source(filter_plan, self.get_indexed_frags(index))
4408            .await?;
4409        let inner_fanout_search = new_knn_exec(self.dataset.clone(), index, q, prefilter_source)?;
4410        let sort_expr = PhysicalSortExpr {
4411            expr: expressions::col(DIST_COL, inner_fanout_search.schema().as_ref())?,
4412            options: SortOptions {
4413                descending: false,
4414                nulls_first: false,
4415            },
4416        };
4417        let sort_expr_row_id = PhysicalSortExpr {
4418            expr: expressions::col(ROW_ID, inner_fanout_search.schema().as_ref())?,
4419            options: SortOptions {
4420                descending: false,
4421                nulls_first: false,
4422            },
4423        };
4424        Ok(Arc::new(
4425            SortExec::new([sort_expr, sort_expr_row_id].into(), inner_fanout_search)
4426                .with_fetch(Some(q.k * q.refine_factor.unwrap_or(1) as usize)),
4427        ))
4428    }
4429
4430    // Create an Execution plan to do ANN over multivectors
4431    async fn multivec_ann(
4432        &self,
4433        q: &Query,
4434        index: &[IndexMetadata],
4435        filter_plan: &ExprFilterPlan,
4436    ) -> Result<Arc<dyn ExecutionPlan>> {
4437        // we split the query procedure into two steps:
4438        // 1. collect the candidates by vector searching on each query vector
4439        // 2. scoring the candidates
4440
4441        let over_fetch_factor = *DEFAULT_XTR_OVERFETCH;
4442
4443        let prefilter_source = self
4444            .prefilter_source(filter_plan, self.get_indexed_frags(index))
4445            .await?;
4446        let dim = get_vector_dim(self.dataset.schema(), &q.column)?;
4447
4448        let num_queries = q.key.len() / dim;
4449        let new_queries = (0..num_queries)
4450            .map(|i| q.key.slice(i * dim, dim))
4451            .map(|query_vec| {
4452                let mut new_query = q.clone();
4453                new_query.key = query_vec;
4454                // with XTR, we don't need to refine the result with original vectors,
4455                // but here we really need to over-fetch the candidates to reach good enough recall.
4456                // TODO: improve the recall with WARP, expose this parameter to the users.
4457                new_query.refine_factor = Some(over_fetch_factor);
4458                new_query
4459            });
4460        let mut ann_nodes = Vec::with_capacity(new_queries.len());
4461        for query in new_queries {
4462            // this produces `nprobes * k * over_fetch_factor * num_indices` candidates
4463            let ann_node = new_knn_exec(
4464                self.dataset.clone(),
4465                index,
4466                &query,
4467                prefilter_source.clone(),
4468            )?;
4469            let sort_expr = PhysicalSortExpr {
4470                expr: expressions::col(DIST_COL, ann_node.schema().as_ref())?,
4471                options: SortOptions {
4472                    descending: false,
4473                    nulls_first: false,
4474                },
4475            };
4476            let sort_expr_row_id = PhysicalSortExpr {
4477                expr: expressions::col(ROW_ID, ann_node.schema().as_ref())?,
4478                options: SortOptions {
4479                    descending: false,
4480                    nulls_first: false,
4481                },
4482            };
4483            let ann_node = Arc::new(
4484                SortExec::new([sort_expr, sort_expr_row_id].into(), ann_node)
4485                    .with_fetch(Some(q.k * over_fetch_factor as usize)),
4486            );
4487            ann_nodes.push(ann_node as Arc<dyn ExecutionPlan>);
4488        }
4489
4490        let ann_node = Arc::new(MultivectorScoringExec::try_new(ann_nodes, q.clone())?);
4491
4492        let sort_expr = PhysicalSortExpr {
4493            expr: expressions::col(DIST_COL, ann_node.schema().as_ref())?,
4494            options: SortOptions {
4495                descending: false,
4496                nulls_first: false,
4497            },
4498        };
4499        let sort_expr_row_id = PhysicalSortExpr {
4500            expr: expressions::col(ROW_ID, ann_node.schema().as_ref())?,
4501            options: SortOptions {
4502                descending: false,
4503                nulls_first: false,
4504            },
4505        };
4506        let ann_node = Arc::new(
4507            SortExec::new([sort_expr, sort_expr_row_id].into(), ann_node)
4508                .with_fetch(Some(q.k * q.refine_factor.unwrap_or(1) as usize)),
4509        );
4510
4511        Ok(ann_node)
4512    }
4513
4514    /// Create prefilter source from filter plan
4515    ///
4516    /// A prefilter is an input to a vector or fts search.  It tells us which rows are eligible
4517    /// for the search.  A prefilter is calculated by doing a filtered read of the row id column.
4518    async fn prefilter_source(
4519        &self,
4520        filter_plan: &ExprFilterPlan,
4521        required_frags: RoaringBitmap,
4522    ) -> Result<PreFilterSource> {
4523        if filter_plan.is_empty() && self.fragments.is_none() {
4524            log::trace!("no filter plan, no prefilter");
4525            return Ok(PreFilterSource::None);
4526        }
4527
4528        // get fragments covered by index
4529        let fragments: Vec<Fragment> = self
4530            .dataset
4531            .manifest
4532            .fragments
4533            .iter()
4534            .filter(|f| required_frags.contains(f.id as u32))
4535            .cloned()
4536            .collect();
4537
4538        // If explicitly specified fragments with .with_fragments(), intersect with those
4539        let fragments = Arc::new(self.retain_target_fragments(fragments));
4540
4541        // Can only use ScalarIndexExec when the scalar index is exact and we are not scanning
4542        // a subset of the fragments.
4543        //
4544        // TODO: We could enhance ScalarIndexExec with a fragment bitmap to filter out rows that
4545        // are not in the fragments we are scanning.
4546        if filter_plan.is_exact_index_search() && self.fragments.is_none() {
4547            let index_query = filter_plan.index_query.as_ref().expect_ok()?;
4548            let (_, missing_frags) = self
4549                .partition_frags_by_coverage(index_query, fragments.clone())
4550                .await?;
4551
4552            if missing_frags.is_empty() {
4553                log::trace!("prefilter entirely satisfied by exact index search");
4554                // We can only avoid materializing the index for a prefilter if:
4555                // 1. The search is indexed
4556                // 2. The index search is an exact search with no recheck or refine
4557                // 3. The indices cover at least the same fragments as the vector index
4558                return Ok(PreFilterSource::ScalarIndexQuery(Arc::new(
4559                    ScalarIndexExec::new(self.dataset.clone(), index_query.clone()),
4560                )));
4561            } else {
4562                log::trace!("exact index search did not cover all fragments");
4563            }
4564        }
4565
4566        // If one of our criteria is not met, we need to do a filtered read of just the row id column
4567        log::trace!(
4568            "prefilter is a filtered read of {} fragments",
4569            fragments.len()
4570        );
4571        let PlannedFilteredScan { plan, .. } = self
4572            .filtered_read(
4573                filter_plan,
4574                self.dataset.empty_projection().with_row_id(),
4575                false,
4576                Some(fragments),
4577                None,
4578                /*is_prefilter= */ true,
4579            )
4580            .await?;
4581        Ok(PreFilterSource::FilteredRowIds(plan))
4582    }
4583
4584    /// Take row indices produced by input plan from the dataset (with projection)
4585    #[allow(deprecated)]
4586    fn take(
4587        &self,
4588        input: Arc<dyn ExecutionPlan>,
4589        output_projection: Projection,
4590    ) -> Result<Arc<dyn ExecutionPlan>> {
4591        let coalesced = Arc::new(CoalesceBatchesExec::new(
4592            input.clone(),
4593            self.get_batch_size(),
4594        ));
4595        if let Some(take_plan) =
4596            TakeExec::try_new(self.dataset.clone(), coalesced, output_projection)?
4597        {
4598            Ok(Arc::new(take_plan))
4599        } else {
4600            // No new columns needed
4601            Ok(input)
4602        }
4603    }
4604
4605    /// Global offset-limit of the result of the input plan
4606    fn limit_node(&self, plan: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
4607        Arc::new(GlobalLimitExec::new(
4608            plan,
4609            *self.offset.as_ref().unwrap_or(&0) as usize,
4610            self.limit.map(|l| l as usize),
4611        ))
4612    }
4613
4614    #[instrument(level = "info", skip(self))]
4615    pub async fn analyze_plan(&self) -> Result<String> {
4616        let plan = self.create_plan().await?;
4617        analyze_plan(
4618            plan,
4619            LanceExecutionOptions {
4620                batch_size: self.batch_size,
4621                ..Default::default()
4622            },
4623        )
4624        .await
4625    }
4626
4627    #[instrument(level = "info", skip(self))]
4628    pub async fn explain_plan(&self, verbose: bool) -> Result<String> {
4629        let plan = self.create_plan().await?;
4630        let display = DisplayableExecutionPlan::new(plan.as_ref());
4631
4632        Ok(format!("{}", display.indent(verbose)))
4633    }
4634}
4635
4636// Search over all indexed fields including nested ones, collecting columns that have an
4637// inverted index
4638async fn fts_indexed_columns(dataset: Arc<Dataset>) -> Result<Vec<String>> {
4639    let mut indexed_columns = Vec::new();
4640    for field in dataset.schema().fields_pre_order() {
4641        // Check if this field is a string type that could have an inverted index
4642        let is_string_field = match field.data_type() {
4643            DataType::Utf8 | DataType::LargeUtf8 => true,
4644            DataType::List(inner_field) | DataType::LargeList(inner_field) => {
4645                matches!(
4646                    inner_field.data_type(),
4647                    DataType::Utf8 | DataType::LargeUtf8
4648                )
4649            }
4650            _ => false,
4651        };
4652
4653        if is_string_field {
4654            // Build the full field path for nested fields
4655            let column_path =
4656                if let Some(ancestors) = dataset.schema().field_ancestry_by_id(field.id) {
4657                    let field_refs: Vec<&str> = ancestors.iter().map(|f| f.name.as_str()).collect();
4658                    format_field_path(&field_refs)
4659                } else {
4660                    continue; // Skip if we can't find the field ancestry
4661                };
4662
4663            // Check if this field has an inverted index
4664            let has_fts_index = dataset
4665                .load_scalar_index(
4666                    IndexCriteria::default()
4667                        .for_column(&column_path)
4668                        .supports_fts(),
4669                )
4670                .await?
4671                .is_some();
4672
4673            if has_fts_index {
4674                indexed_columns.push(column_path);
4675            }
4676        }
4677    }
4678    Ok(indexed_columns)
4679}
4680
/// [`DatasetRecordBatchStream`] wraps the dataset into a [`RecordBatchStream`] for
/// consumption by the user.
#[pin_project::pin_project]
pub struct DatasetRecordBatchStream {
    /// The wrapped stream that produces the record batches.
    #[pin]
    exec_node: SendableRecordBatchStream,
    /// Tracing span that is entered each time the stream is polled.
    span: Span,
}
4690
4691impl DatasetRecordBatchStream {
4692    pub fn new(exec_node: SendableRecordBatchStream) -> Self {
4693        let schema = exec_node.schema();
4694        let adapter = SchemaAdapter::new(schema.clone());
4695        let exec_node = if SchemaAdapter::requires_logical_conversion(&schema) {
4696            adapter.to_logical_stream(exec_node)
4697        } else {
4698            exec_node
4699        };
4700
4701        let span = info_span!("DatasetRecordBatchStream");
4702        Self { exec_node, span }
4703    }
4704}
4705
impl RecordBatchStream for DatasetRecordBatchStream {
    /// Returns the schema reported by the wrapped stream.
    fn schema(&self) -> SchemaRef {
        self.exec_node.schema()
    }
}
4711
4712impl Stream for DatasetRecordBatchStream {
4713    type Item = Result<RecordBatch>;
4714
4715    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
4716        let mut this = self.project();
4717        let _guard = this.span.enter();
4718        match this.exec_node.poll_next_unpin(cx) {
4719            Poll::Ready(result) => Poll::Ready(result.map(|r| Ok(r?))),
4720            Poll::Pending => Poll::Pending,
4721        }
4722    }
4723}
4724
impl From<DatasetRecordBatchStream> for SendableRecordBatchStream {
    /// Unwraps the stream, returning the inner `exec_node` and dropping the tracing span.
    fn from(stream: DatasetRecordBatchStream) -> Self {
        stream.exec_node
    }
}
4730
4731#[cfg(test)]
4732pub mod test_dataset {
4733
4734    use super::*;
4735
4736    use std::{collections::HashMap, vec};
4737
4738    use arrow_array::{
4739        ArrayRef, FixedSizeListArray, Int32Array, RecordBatch, RecordBatchIterator, StringArray,
4740        types::Float32Type,
4741    };
4742    use arrow_schema::{ArrowError, DataType};
4743    use lance_arrow::FixedSizeListArrayExt;
4744    use lance_core::utils::tempfile::TempStrDir;
4745    use lance_file::version::LanceFileVersion;
4746    use lance_index::{
4747        IndexType,
4748        scalar::{ScalarIndexParams, inverted::tokenizer::InvertedIndexParams},
4749        vector::{
4750            ivf::IvfBuildParams,
4751            kmeans::{KMeansParams, train_kmeans},
4752        },
4753    };
4754    use lance_linalg::distance::DistanceType;
4755    use uuid::Uuid;
4756
4757    use crate::dataset::WriteParams;
4758    use crate::index::vector::VectorIndexParams;
4759
    // Test fixture: a dataset written as 5 batches where each batch has 80 rows
    //
    // The dataset has the following columns:
    //
    //  i   - i32        : [0, 1, ..., 399]
    //  s   - &str       : ["s-0", "s-1", ..., "s-399"]
    //  vec - [f32; dim] : every batch holds the same 80 vectors
    //                     [[0, ..., dim - 1], [dim, ..., 2 * dim - 1], ..., [..., (80 * dim) - 1]]
    //                     (dim is 32 when created through `new`)
    //
    // No index is built on creation; call one of the `make_*_index` methods to add one.
    pub struct TestVectorDataset {
        /// Temporary directory the dataset is written to.
        pub tmp_dir: TempStrDir,
        /// Arrow schema used to write the data (carries a `dataset = vector` metadata entry).
        pub schema: Arc<ArrowSchema>,
        /// The written dataset.
        pub dataset: Dataset,
        /// Number of values in each `vec` entry.
        dimension: u32,
    }
4775
4776    impl TestVectorDataset {
        /// Creates the test dataset with 32-dimensional vectors.
        ///
        /// Delegates to [`Self::new_with_dimension`] with `dimension = 32`.
        pub async fn new(
            data_storage_version: LanceFileVersion,
            stable_row_ids: bool,
        ) -> Result<Self> {
            Self::new_with_dimension(data_storage_version, stable_row_ids, 32).await
        }
4783
4784        pub async fn new_with_dimension(
4785            data_storage_version: LanceFileVersion,
4786            stable_row_ids: bool,
4787            dimension: u32,
4788        ) -> Result<Self> {
4789            let path = TempStrDir::default();
4790
4791            // Make sure the schema has metadata so it tests all paths that re-construct the schema along the way
4792            let metadata: HashMap<String, String> =
4793                vec![("dataset".to_string(), "vector".to_string())]
4794                    .into_iter()
4795                    .collect();
4796
4797            let schema = Arc::new(ArrowSchema::new_with_metadata(
4798                vec![
4799                    ArrowField::new("i", DataType::Int32, true),
4800                    ArrowField::new("s", DataType::Utf8, true),
4801                    ArrowField::new(
4802                        "vec",
4803                        DataType::FixedSizeList(
4804                            Arc::new(ArrowField::new("item", DataType::Float32, true)),
4805                            dimension as i32,
4806                        ),
4807                        true,
4808                    ),
4809                ],
4810                metadata,
4811            ));
4812
4813            let batches: Vec<RecordBatch> = (0..5)
4814                .map(|i| {
4815                    let vector_values: Float32Array =
4816                        (0..dimension * 80).map(|v| v as f32).collect();
4817                    let vectors =
4818                        FixedSizeListArray::try_new_from_values(vector_values, dimension as i32)
4819                            .unwrap();
4820                    RecordBatch::try_new(
4821                        schema.clone(),
4822                        vec![
4823                            Arc::new(Int32Array::from_iter_values(i * 80..(i + 1) * 80)),
4824                            Arc::new(StringArray::from_iter_values(
4825                                (i * 80..(i + 1) * 80).map(|v| format!("s-{}", v)),
4826                            )),
4827                            Arc::new(vectors),
4828                        ],
4829                    )
4830                })
4831                .collect::<std::result::Result<Vec<_>, ArrowError>>()?;
4832
4833            let params = WriteParams {
4834                max_rows_per_group: 10,
4835                max_rows_per_file: 200,
4836                data_storage_version: Some(data_storage_version),
4837                enable_stable_row_ids: stable_row_ids,
4838                ..Default::default()
4839            };
4840            let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
4841
4842            let dataset = Dataset::write(reader, &path, Some(params)).await?;
4843
4844            Ok(Self {
4845                tmp_dir: path,
4846                schema,
4847                dataset,
4848                dimension,
4849            })
4850        }
4851
4852        pub async fn make_vector_index(&mut self) -> Result<()> {
4853            let params = VectorIndexParams::ivf_pq(2, 8, 2, MetricType::L2, 2);
4854            self.dataset
4855                .create_index(
4856                    &["vec"],
4857                    IndexType::Vector,
4858                    Some("idx".to_string()),
4859                    &params,
4860                    true,
4861                )
4862                .await?;
4863            Ok(())
4864        }
4865
        /// Builds an IVF-flat index named `idx` on `vec` as one segment per fragment and
        /// commits the segments to the dataset.
        ///
        /// Returns the UUIDs of the per-fragment segments as produced by
        /// `execute_uncommitted`, collected before the segments are passed to the segment
        /// builder.
        pub async fn make_segmented_vector_index(&mut self) -> Result<Vec<Uuid>> {
            // Read back the whole `vec` column so centroids can be trained on it.
            let batch = self
                .dataset
                .scan()
                .project(&["vec"])
                .unwrap()
                .try_into_batch()
                .await?;
            let vectors = batch
                .column_by_name("vec")
                .expect("vector column should exist")
                .as_fixed_size_list();
            // Flattened f32 values of all vectors.
            let values = vectors.values().as_primitive::<Float32Type>();
            let centroids = train_kmeans::<Float32Type>(
                values,
                KMeansParams::new(None, 10, 1, DistanceType::L2),
                self.dimension as usize,
                2,
                2,
            )
            .unwrap()
            .centroids
            .as_primitive::<Float32Type>()
            .clone();
            // Re-shape the flat centroid values into one fixed-size list per centroid.
            let centroids = Arc::new(
                FixedSizeListArray::try_new_from_values(centroids, self.dimension as i32).unwrap(),
            );
            // The same pre-trained centroids are used for every segment built below.
            let params = VectorIndexParams::with_ivf_flat_params(
                DistanceType::L2,
                IvfBuildParams::try_with_centroids(2, centroids).unwrap(),
            );
            let fragment_ids = self
                .dataset
                .get_fragments()
                .iter()
                .map(|fragment| fragment.id() as u32)
                .collect::<Vec<_>>();

            // Build one uncommitted segment per fragment, all under the index name `idx`.
            let mut segments = Vec::with_capacity(fragment_ids.len());
            for fragment_id in fragment_ids {
                let mut builder =
                    self.dataset
                        .create_index_builder(&["vec"], IndexType::Vector, &params);
                builder = builder.name("idx".to_string()).fragments(vec![fragment_id]);
                segments.push(builder.execute_uncommitted().await?);
            }

            // Remember the UUIDs before `segments` is moved into the segment builder.
            let segment_ids = segments
                .iter()
                .map(|segment| segment.uuid)
                .collect::<Vec<_>>();
            let segments = self
                .dataset
                .create_index_segment_builder()
                .with_index_type(params.index_type())
                .with_segments(segments)
                .build_all()
                .await?;
            self.dataset
                .commit_existing_index_segments("idx", "vec", segments)
                .await?;
            Ok(segment_ids)
        }
4929
4930        pub async fn make_scalar_index(&mut self) -> Result<()> {
4931            self.dataset
4932                .create_index(
4933                    &["i"],
4934                    IndexType::Scalar,
4935                    None,
4936                    &ScalarIndexParams::default(),
4937                    true,
4938                )
4939                .await?;
4940            Ok(())
4941        }
4942
4943        pub async fn make_fts_index(&mut self) -> Result<()> {
4944            let params = InvertedIndexParams::default().with_position(true);
4945            self.dataset
4946                .create_index(&["s"], IndexType::Inverted, None, &params, true)
4947                .await?;
4948            Ok(())
4949        }
4950
        /// Appends 10 rows with `i` in `400..410` by delegating to
        /// [`Self::append_data_with_range`].
        pub async fn append_new_data(&mut self) -> Result<()> {
            self.append_data_with_range(400, 410).await
        }
4954
4955        pub async fn append_data_with_range(&mut self, start: i32, end: i32) -> Result<()> {
4956            let count = (end - start) as usize;
4957            let vector_values: Float32Array = (0..count)
4958                .flat_map(|i| vec![i as f32; self.dimension as usize].into_iter())
4959                .collect();
4960            let new_vectors =
4961                FixedSizeListArray::try_new_from_values(vector_values, self.dimension as i32)
4962                    .unwrap();
4963            let new_data: Vec<ArrayRef> = vec![
4964                Arc::new(Int32Array::from_iter_values(start..end)),
4965                Arc::new(StringArray::from_iter_values(
4966                    (start..end).map(|v| format!("s-{}", v)),
4967                )),
4968                Arc::new(new_vectors),
4969            ];
4970            let reader = RecordBatchIterator::new(
4971                vec![RecordBatch::try_new(self.schema.clone(), new_data).unwrap()]
4972                    .into_iter()
4973                    .map(Ok),
4974                self.schema.clone(),
4975            );
4976            self.dataset.append(reader, None).await?;
4977            Ok(())
4978        }
4979    }
4980}
4981
4982#[cfg(test)]
4983mod test {
4984
4985    use std::collections::BTreeSet;
4986    use std::time::{Duration, Instant};
4987    use std::vec;
4988
4989    use arrow::array::as_primitive_array;
4990    use arrow::datatypes::{Float64Type, Int32Type, Int64Type};
4991    use arrow_array::cast::AsArray;
4992    use arrow_array::types::{Float32Type, UInt64Type};
4993    use arrow_array::{
4994        ArrayRef, FixedSizeListArray, Float16Array, Int32Array, LargeStringArray, PrimitiveArray,
4995        RecordBatchIterator, StringArray, StructArray, UInt8Array,
4996    };
4997
4998    use arrow_ord::sort::sort_to_indices;
4999    use arrow_schema::Fields;
5000    use arrow_select::take;
5001    use datafusion::logical_expr::{col, lit};
5002    use half::f16;
5003    use lance_arrow::{FixedSizeListArrayExt, SchemaExt};
5004    use lance_core::utils::tempfile::TempStrDir;
5005    use lance_core::{ROW_CREATED_AT_VERSION, ROW_LAST_UPDATED_AT_VERSION};
5006    use lance_datagen::{
5007        ArrayGeneratorExt, BatchCount, ByteCount, Dimension, RowCount, array, gen_batch,
5008    };
5009    use lance_file::version::LanceFileVersion;
5010    use lance_index::optimize::OptimizeOptions;
5011    use lance_index::scalar::inverted::query::{MatchQuery, PhraseQuery};
5012    use lance_index::vector::hnsw::builder::HnswBuildParams;
5013    use lance_index::vector::ivf::IvfBuildParams;
5014    use lance_index::vector::pq::PQBuildParams;
5015    use lance_index::vector::sq::builder::SQBuildParams;
5016    use lance_index::{IndexType, scalar::ScalarIndexParams};
5017    use lance_io::assert_io_gt;
5018    use lance_io::object_store::ObjectStoreParams;
5019
5020    use lance_linalg::distance::DistanceType;
5021    use lance_testing::datagen::{BatchGenerator, IncrementingInt32, RandomVector};
5022    use object_store::throttle::ThrottleConfig;
5023    use rstest::rstest;
5024
5025    use super::*;
5026    use crate::dataset::WriteMode;
5027    use crate::dataset::WriteParams;
5028    use crate::dataset::optimize::{CompactionOptions, compact_files};
5029    use crate::dataset::scanner::test_dataset::TestVectorDataset;
5030    use crate::index::vector::{StageParams, VectorIndexParams};
5031    use crate::utils::test::{
5032        DatagenExt, FragmentCount, FragmentRowCount, ThrottledStoreWrapper, assert_plan_node_equals,
5033    };
5034
5035    #[test]
5036    fn test_env_var_parsing() {
5037        // Test that invalid environment variable values don't panic
5038
5039        // Test invalid LANCE_DEFAULT_BATCH_SIZE
5040        unsafe {
5041            std::env::set_var("LANCE_DEFAULT_BATCH_SIZE", "not_a_number");
5042        }
5043        let result = get_default_batch_size();
5044        assert_eq!(result, None, "Should return None for invalid batch size");
5045
5046        // Test valid LANCE_DEFAULT_BATCH_SIZE
5047        unsafe {
5048            std::env::set_var("LANCE_DEFAULT_BATCH_SIZE", "2048");
5049        }
5050        let result = get_default_batch_size();
5051        assert_eq!(result, Some(2048), "Should parse valid batch size");
5052
5053        // Test unset LANCE_DEFAULT_BATCH_SIZE
5054        unsafe {
5055            std::env::remove_var("LANCE_DEFAULT_BATCH_SIZE");
5056        }
5057        let result = get_default_batch_size();
5058        assert_eq!(result, None, "Should return None when env var is not set");
5059    }
5060
5061    #[test]
5062    fn test_parse_env_var() {
5063        // Test parse_env_var with different types to ensure full coverage
5064
5065        // Test with a unique env var name to avoid conflicts
5066        let test_var = "LANCE_TEST_PARSE_ENV_VAR_USIZE";
5067
5068        // Test valid usize parsing
5069        unsafe {
5070            std::env::set_var(test_var, "12345");
5071        }
5072        let result: Option<usize> = parse_env_var(test_var, "Using default.");
5073        assert_eq!(result, Some(12345));
5074
5075        // Test invalid usize parsing (triggers warning log)
5076        unsafe {
5077            std::env::set_var(test_var, "not_a_number");
5078        }
5079        let result: Option<usize> = parse_env_var(test_var, "Using default.");
5080        assert_eq!(result, None);
5081
5082        // Test unset env var
5083        unsafe {
5084            std::env::remove_var(test_var);
5085        }
5086        let result: Option<usize> = parse_env_var(test_var, "Using default.");
5087        assert_eq!(result, None);
5088
5089        // Test with u32 type
5090        let test_var_u32 = "LANCE_TEST_PARSE_ENV_VAR_U32";
5091        unsafe {
5092            std::env::set_var(test_var_u32, "42");
5093        }
5094        let result: Option<u32> = parse_env_var(test_var_u32, "Using default value.");
5095        assert_eq!(result, Some(42));
5096
5097        unsafe {
5098            std::env::set_var(test_var_u32, "invalid");
5099        }
5100        let result: Option<u32> = parse_env_var(test_var_u32, "Using default value.");
5101        assert_eq!(result, None);
5102
5103        unsafe {
5104            std::env::remove_var(test_var_u32);
5105        }
5106
5107        // Test with u64 type
5108        let test_var_u64 = "LANCE_TEST_PARSE_ENV_VAR_U64";
5109        unsafe {
5110            std::env::set_var(test_var_u64, "9999999999");
5111        }
5112        let result: Option<u64> = parse_env_var(test_var_u64, "Using default value.");
5113        assert_eq!(result, Some(9999999999));
5114
5115        unsafe {
5116            std::env::set_var(test_var_u64, "-1");
5117        }
5118        let result: Option<u64> = parse_env_var(test_var_u64, "Using default value.");
5119        assert_eq!(result, None);
5120
5121        unsafe {
5122            std::env::remove_var(test_var_u64);
5123        }
5124    }
5125
5126    async fn make_binary_vector_dataset() -> Result<(TempStrDir, Dataset)> {
5127        let tmp_dir = TempStrDir::default();
5128        let dim = 4;
5129        let schema = Arc::new(ArrowSchema::new(vec![
5130            ArrowField::new("id", DataType::Int32, false),
5131            ArrowField::new(
5132                "bin",
5133                DataType::FixedSizeList(
5134                    Arc::new(ArrowField::new("item", DataType::UInt8, true)),
5135                    dim,
5136                ),
5137                false,
5138            ),
5139        ]));
5140
5141        let vectors = FixedSizeListArray::try_new_from_values(
5142            UInt8Array::from(vec![
5143                0b0000_1111u8,
5144                0,
5145                0,
5146                0, //
5147                0b0000_0011u8,
5148                0,
5149                0,
5150                0, //
5151                0u8,
5152                0,
5153                0,
5154                0,
5155            ]),
5156            dim,
5157        )?;
5158        let ids = Int32Array::from(vec![0, 1, 2]);
5159
5160        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids), Arc::new(vectors)])?;
5161        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
5162        Dataset::write(reader, &tmp_dir, None).await?;
5163        let dataset = Dataset::open(&tmp_dir).await?;
5164        Ok((tmp_dir, dataset))
5165    }
5166
5167    #[tokio::test]
5168    async fn test_batch_size() {
5169        let schema = Arc::new(ArrowSchema::new(vec![
5170            ArrowField::new("i", DataType::Int32, true),
5171            ArrowField::new("s", DataType::Utf8, true),
5172        ]));
5173
5174        let batches: Vec<RecordBatch> = (0..5)
5175            .map(|i| {
5176                RecordBatch::try_new(
5177                    schema.clone(),
5178                    vec![
5179                        Arc::new(Int32Array::from_iter_values(i * 20..(i + 1) * 20)),
5180                        Arc::new(StringArray::from_iter_values(
5181                            (i * 20..(i + 1) * 20).map(|v| format!("s-{}", v)),
5182                        )),
5183                    ],
5184                )
5185                .unwrap()
5186            })
5187            .collect();
5188
5189        for use_filter in [false, true] {
5190            let test_dir = TempStrDir::default();
5191            let test_uri = &test_dir;
5192            let write_params = WriteParams {
5193                max_rows_per_file: 40,
5194                max_rows_per_group: 10,
5195                ..Default::default()
5196            };
5197            let batches =
5198                RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema.clone());
5199            Dataset::write(batches, test_uri, Some(write_params))
5200                .await
5201                .unwrap();
5202
5203            let dataset = Dataset::open(test_uri).await.unwrap();
5204            let mut builder = dataset.scan();
5205            builder.batch_size(8);
5206            if use_filter {
5207                builder.filter("i IS NOT NULL").unwrap();
5208            }
5209            let mut stream = builder.try_into_stream().await.unwrap();
5210            let mut rows_read = 0;
5211            while let Some(next) = stream.next().await {
5212                let next = next.unwrap();
5213                let expected = 8.min(100 - rows_read);
5214                assert_eq!(next.num_rows(), expected);
5215                rows_read += next.num_rows();
5216            }
5217        }
5218    }
5219
5220    #[tokio::test]
5221    async fn test_strict_batch_size() {
5222        let dataset = lance_datagen::gen_batch()
5223            .col("x", array::step::<Int32Type>())
5224            .anon_col(array::step::<Int64Type>())
5225            .into_ram_dataset(FragmentCount::from(7), FragmentRowCount::from(6))
5226            .await
5227            .unwrap();
5228
5229        let mut scan = dataset.scan();
5230        scan.batch_size(10)
5231            .strict_batch_size(true)
5232            .filter("x % 2 == 0")
5233            .unwrap();
5234
5235        let batches = scan
5236            .try_into_stream()
5237            .await
5238            .unwrap()
5239            .try_collect::<Vec<_>>()
5240            .await
5241            .unwrap();
5242
5243        let batch_sizes = batches.iter().map(|b| b.num_rows()).collect::<Vec<_>>();
5244        assert_eq!(batch_sizes, vec![10, 10, 1]);
5245    }
5246
5247    #[tokio::test]
5248    async fn test_column_not_exist() {
5249        let dataset = lance_datagen::gen_batch()
5250            .col("x", array::step::<Int32Type>())
5251            .into_ram_dataset(FragmentCount::from(7), FragmentRowCount::from(6))
5252            .await
5253            .unwrap();
5254
5255        let check_err_msg = |r: Result<DatasetRecordBatchStream>| {
5256            let Err(err) = r else {
5257                panic!(
5258                    "Expected an error to be raised saying column y is not found but got no error"
5259                )
5260            };
5261
5262            assert!(
5263                err.to_string().contains("No field named y"),
5264                "Expected error to contain 'No field named y' but got {}",
5265                err
5266            );
5267        };
5268
5269        let mut scan = dataset.scan();
5270        scan.project(&["x", "y"]).unwrap();
5271        check_err_msg(scan.try_into_stream().await);
5272
5273        let mut scan = dataset.scan();
5274        scan.project(&["y"]).unwrap();
5275        check_err_msg(scan.try_into_stream().await);
5276
5277        // This represents a query like `SELECT 1 AS foo` which we could _technically_ satisfy
5278        // but it is not supported today
5279        let mut scan = dataset.scan();
5280        scan.project_with_transform(&[("foo", "1")]).unwrap();
5281        match scan.try_into_stream().await {
5282            Ok(_) => panic!("Expected an error to be raised saying not supported"),
5283            Err(e) => {
5284                assert!(
5285                    e.to_string().contains("Received only dynamic expressions"),
5286                    "Expected error to contain 'Received only dynamic expressions' but got {}",
5287                    e
5288                );
5289            }
5290        }
5291    }
5292
5293    #[cfg(not(windows))]
5294    #[tokio::test]
5295    async fn test_local_object_store() {
5296        let schema = Arc::new(ArrowSchema::new(vec![
5297            ArrowField::new("i", DataType::Int32, true),
5298            ArrowField::new("s", DataType::Utf8, true),
5299        ]));
5300
5301        let batches: Vec<RecordBatch> = (0..5)
5302            .map(|i| {
5303                RecordBatch::try_new(
5304                    schema.clone(),
5305                    vec![
5306                        Arc::new(Int32Array::from_iter_values(i * 20..(i + 1) * 20)),
5307                        Arc::new(StringArray::from_iter_values(
5308                            (i * 20..(i + 1) * 20).map(|v| format!("s-{}", v)),
5309                        )),
5310                    ],
5311                )
5312                .unwrap()
5313            })
5314            .collect();
5315
5316        let test_dir = TempStrDir::default();
5317        let test_uri = &test_dir;
5318        let write_params = WriteParams {
5319            max_rows_per_file: 40,
5320            max_rows_per_group: 10,
5321            ..Default::default()
5322        };
5323        let batches = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema.clone());
5324        Dataset::write(batches, test_uri, Some(write_params))
5325            .await
5326            .unwrap();
5327
5328        let dataset = Dataset::open(&format!("file-object-store://{}", test_uri))
5329            .await
5330            .unwrap();
5331        let mut builder = dataset.scan();
5332        builder.batch_size(8);
5333        let mut stream = builder.try_into_stream().await.unwrap();
5334        let mut rows_read = 0;
5335        while let Some(next) = stream.next().await {
5336            let next = next.unwrap();
5337            let expected = 8.min(100 - rows_read);
5338            assert_eq!(next.num_rows(), expected);
5339            rows_read += next.num_rows();
5340        }
5341    }
5342
5343    #[tokio::test]
5344    async fn test_filter_parsing() -> Result<()> {
5345        let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false).await?;
5346        let dataset = &test_ds.dataset;
5347
5348        let mut scan = dataset.scan();
5349        assert!(scan.filter.is_none());
5350
5351        scan.filter("i > 50")?;
5352        assert_eq!(scan.get_expr_filter().unwrap(), Some(col("i").gt(lit(50))));
5353
5354        for use_stats in [false, true] {
5355            let batches = scan
5356                .project(&["s"])?
5357                .use_stats(use_stats)
5358                .try_into_stream()
5359                .await?
5360                .try_collect::<Vec<_>>()
5361                .await?;
5362            let batch = concat_batches(&batches[0].schema(), &batches)?;
5363
5364            let expected_batch = RecordBatch::try_new(
5365                // Projected just "s"
5366                Arc::new(test_ds.schema.project(&[1])?),
5367                vec![Arc::new(StringArray::from_iter_values(
5368                    (51..400).map(|v| format!("s-{}", v)),
5369                ))],
5370            )?;
5371            assert_eq!(batch, expected_batch);
5372        }
5373        Ok(())
5374    }
5375
5376    #[tokio::test]
5377    async fn test_scan_regexp_match_and_non_empty_captions() {
5378        // Build a small dataset with three Utf8 columns and verify the full
5379        // scan().filter(...) path handles regexp_match combined with non-null/non-empty checks.
5380        let schema = Arc::new(ArrowSchema::new(vec![
5381            ArrowField::new("keywords", DataType::Utf8, true),
5382            ArrowField::new("natural_caption", DataType::Utf8, true),
5383            ArrowField::new("poetic_caption", DataType::Utf8, true),
5384        ]));
5385
5386        let batch = RecordBatch::try_new(
5387            schema.clone(),
5388            vec![
5389                Arc::new(StringArray::from(vec![
5390                    Some("Liberty for all"),
5391                    Some("peace"),
5392                    Some("revolution now"),
5393                    Some("Liberty"),
5394                    Some("revolutionary"),
5395                    Some("none"),
5396                ])) as ArrayRef,
5397                Arc::new(StringArray::from(vec![
5398                    Some("a"),
5399                    Some("b"),
5400                    None,
5401                    Some(""),
5402                    Some("c"),
5403                    Some("d"),
5404                ])) as ArrayRef,
5405                Arc::new(StringArray::from(vec![
5406                    Some("x"),
5407                    Some(""),
5408                    Some("y"),
5409                    Some("z"),
5410                    None,
5411                    Some("w"),
5412                ])) as ArrayRef,
5413            ],
5414        )
5415        .unwrap();
5416
5417        let reader = RecordBatchIterator::new(vec![Ok(batch.clone())], schema.clone());
5418        let dataset = Dataset::write(reader, "memory://", None).await.unwrap();
5419
5420        let mut scan = dataset.scan();
5421        scan.filter(
5422            "regexp_match(keywords, 'Liberty|revolution') AND \
5423             (natural_caption IS NOT NULL AND natural_caption <> '' AND \
5424              poetic_caption IS NOT NULL AND poetic_caption <> '')",
5425        )
5426        .unwrap();
5427
5428        let out = scan.try_into_batch().await.unwrap();
5429        assert_eq!(out.num_rows(), 1);
5430
5431        let out_keywords = out
5432            .column_by_name("keywords")
5433            .unwrap()
5434            .as_string::<i32>()
5435            .value(0);
5436        let out_nat = out
5437            .column_by_name("natural_caption")
5438            .unwrap()
5439            .as_string::<i32>()
5440            .value(0);
5441        let out_poetic = out
5442            .column_by_name("poetic_caption")
5443            .unwrap()
5444            .as_string::<i32>()
5445            .value(0);
5446
5447        assert_eq!(out_keywords, "Liberty for all");
5448        assert_eq!(out_nat, "a");
5449        assert_eq!(out_poetic, "x");
5450    }
5451
5452    #[tokio::test]
5453    async fn test_nested_projection() {
5454        let point_fields: Fields = vec![
5455            ArrowField::new("x", DataType::Float32, true),
5456            ArrowField::new("y", DataType::Float32, true),
5457        ]
5458        .into();
5459        let metadata_fields: Fields = vec![
5460            ArrowField::new("location", DataType::Struct(point_fields), true),
5461            ArrowField::new("age", DataType::Int32, true),
5462        ]
5463        .into();
5464        let metadata_field = ArrowField::new("metadata", DataType::Struct(metadata_fields), true);
5465        let schema = Arc::new(ArrowSchema::new(vec![
5466            metadata_field,
5467            ArrowField::new("idx", DataType::Int32, true),
5468        ]));
5469        let data = lance_datagen::rand(&schema)
5470            .into_ram_dataset(FragmentCount::from(7), FragmentRowCount::from(6))
5471            .await
5472            .unwrap();
5473
5474        let mut scan = data.scan();
5475        scan.project(&["metadata.location.x", "metadata.age"])
5476            .unwrap();
5477        let batch = scan.try_into_batch().await.unwrap();
5478
5479        assert_eq!(
5480            batch.schema().as_ref(),
5481            &ArrowSchema::new(vec![
5482                ArrowField::new("metadata.location.x", DataType::Float32, true),
5483                ArrowField::new("metadata.age", DataType::Int32, true),
5484            ])
5485        );
5486
5487        // 0 - metadata
5488        // 2 - x
5489        // 4 - age
5490        let take_schema = data.schema().project_by_ids(&[0, 2, 4], false);
5491
5492        let taken = data.take_rows(&[0, 5], take_schema).await.unwrap();
5493
5494        // The expected schema drops y from the location field
5495        let part_point_fields = Fields::from(vec![ArrowField::new("x", DataType::Float32, true)]);
5496        let part_metadata_fields = Fields::from(vec![
5497            ArrowField::new("location", DataType::Struct(part_point_fields), true),
5498            ArrowField::new("age", DataType::Int32, true),
5499        ]);
5500        let part_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
5501            "metadata",
5502            DataType::Struct(part_metadata_fields),
5503            true,
5504        )]));
5505
5506        assert_eq!(taken.schema(), part_schema);
5507    }
5508
5509    #[rstest]
5510    #[tokio::test]
5511    async fn test_limit(
5512        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5513        data_storage_version: LanceFileVersion,
5514    ) -> Result<()> {
5515        let test_ds = TestVectorDataset::new(data_storage_version, false).await?;
5516        let dataset = &test_ds.dataset;
5517
5518        let full_data = dataset.scan().try_into_batch().await?.slice(19, 2);
5519
5520        let actual = dataset
5521            .scan()
5522            .limit(Some(2), Some(19))?
5523            .try_into_batch()
5524            .await?;
5525
5526        assert_eq!(actual.num_rows(), 2);
5527        assert_eq!(actual, full_data);
5528        Ok(())
5529    }
5530
5531    #[test_log::test(tokio::test)]
5532    async fn test_limit_cancel() {
5533        // If there is a filter and a limit and we can't use the index to satisfy
5534        // the filter, then we have to read until we have enough matching rows and
5535        // then cancel the scan.
5536        //
5537        // This test regresses the case where we fail to cancel the scan for whatever
5538        // reason.
5539
5540        // Make the store slow so that if we don't cancel the scan, it will take a loooong time.
5541        let throttled = Arc::new(ThrottledStoreWrapper {
5542            config: ThrottleConfig {
5543                wait_get_per_call: Duration::from_secs(1),
5544                ..Default::default()
5545            },
5546        });
5547        let write_params = WriteParams {
5548            store_params: Some(ObjectStoreParams {
5549                object_store_wrapper: Some(throttled.clone()),
5550                ..Default::default()
5551            }),
5552            max_rows_per_file: 1,
5553            ..Default::default()
5554        };
5555
5556        // Make a dataset with lots of tiny fragments, that will make it more obvious if we fail to cancel the scan.
5557        let dataset = gen_batch()
5558            .col("i", array::step::<Int32Type>().with_random_nulls(0.1))
5559            .into_ram_dataset_with_params(
5560                FragmentCount::from(2000),
5561                FragmentRowCount::from(1),
5562                Some(write_params),
5563            )
5564            .await
5565            .unwrap();
5566
5567        let mut scan = dataset.scan();
5568        scan.filter("i IS NOT NULL").unwrap();
5569        scan.limit(Some(10), None).unwrap();
5570
5571        let start = Instant::now();
5572        scan.try_into_stream()
5573            .await
5574            .unwrap()
5575            .try_collect::<Vec<_>>()
5576            .await
5577            .unwrap();
5578        let duration = start.elapsed();
5579
5580        // This test is a timing test, which is unfortunate, as it may be flaky.  I'm hoping
5581        // we have enough wiggle room here.  The failure case is 30s on my machine and the pass
5582        // case is 2-3s.
5583        assert!(duration < Duration::from_secs(10));
5584    }
5585
5586    #[rstest]
5587    #[tokio::test]
5588    async fn test_knn_nodes(
5589        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5590        data_storage_version: LanceFileVersion,
5591        #[values(false, true)] stable_row_ids: bool,
5592        #[values(false, true)] build_index: bool,
5593    ) {
5594        let mut test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5595            .await
5596            .unwrap();
5597        if build_index {
5598            test_ds.make_vector_index().await.unwrap();
5599        }
5600        let dataset = &test_ds.dataset;
5601
5602        let mut scan = dataset.scan();
5603        let key: Float32Array = (32..64).map(|v| v as f32).collect();
5604        scan.nearest("vec", &key, 5).unwrap();
5605        scan.refine(5);
5606
5607        let batch = scan.try_into_batch().await.unwrap();
5608
5609        assert_eq!(batch.num_rows(), 5);
5610        assert_eq!(
5611            batch.schema().as_ref(),
5612            &ArrowSchema::new(vec![
5613                ArrowField::new("i", DataType::Int32, true),
5614                ArrowField::new("s", DataType::Utf8, true),
5615                ArrowField::new(
5616                    "vec",
5617                    DataType::FixedSizeList(
5618                        Arc::new(ArrowField::new("item", DataType::Float32, true)),
5619                        32,
5620                    ),
5621                    true,
5622                ),
5623                ArrowField::new(DIST_COL, DataType::Float32, true),
5624            ])
5625            .with_metadata([("dataset".into(), "vector".into())].into())
5626        );
5627
5628        let expected_i = BTreeSet::from_iter(vec![1, 81, 161, 241, 321]);
5629        let column_i = batch.column_by_name("i").unwrap();
5630        let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
5631            .values()
5632            .iter()
5633            .copied()
5634            .collect();
5635        assert_eq!(expected_i, actual_i);
5636    }
5637
5638    #[rstest]
5639    #[tokio::test]
5640    async fn test_can_project_distance() {
5641        let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true)
5642            .await
5643            .unwrap();
5644        let dataset = &test_ds.dataset;
5645
5646        let mut scan = dataset.scan();
5647        let key: Float32Array = (32..64).map(|v| v as f32).collect();
5648        scan.nearest("vec", &key, 5).unwrap();
5649        scan.refine(5);
5650        scan.project(&["_distance"]).unwrap();
5651
5652        let batch = scan.try_into_batch().await.unwrap();
5653
5654        assert_eq!(batch.num_rows(), 5);
5655        assert_eq!(batch.num_columns(), 1);
5656    }
5657
5658    #[rstest]
5659    #[tokio::test]
5660    async fn test_knn_with_new_data(
5661        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5662        data_storage_version: LanceFileVersion,
5663        #[values(false, true)] stable_row_ids: bool,
5664    ) {
5665        let mut test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5666            .await
5667            .unwrap();
5668        test_ds.make_vector_index().await.unwrap();
5669        test_ds.append_new_data().await.unwrap();
5670        let dataset = &test_ds.dataset;
5671
5672        // Create a bunch of queries
5673        let key: Float32Array = [0f32; 32].into_iter().collect();
5674        // Set as larger than the number of new rows that aren't in the index to
5675        // force result sets to be combined between index and flat scan.
5676        let k = 20;
5677
5678        #[derive(Debug)]
5679        struct TestCase {
5680            filter: Option<&'static str>,
5681            limit: Option<i64>,
5682            use_index: bool,
5683        }
5684
5685        let mut cases = vec![];
5686        for filter in [Some("i > 100"), None] {
5687            for limit in [None, Some(10)] {
5688                for use_index in [true, false] {
5689                    cases.push(TestCase {
5690                        filter,
5691                        limit,
5692                        use_index,
5693                    });
5694                }
5695            }
5696        }
5697
5698        // Validate them all.
5699        for case in cases {
5700            let mut scanner = dataset.scan();
5701            scanner
5702                .nearest("vec", &key, k)
5703                .unwrap()
5704                .limit(case.limit, None)
5705                .unwrap()
5706                .refine(3)
5707                .use_index(case.use_index);
5708            if let Some(filter) = case.filter {
5709                scanner.filter(filter).unwrap();
5710            }
5711
5712            let result = scanner
5713                .try_into_stream()
5714                .await
5715                .unwrap()
5716                .try_collect::<Vec<_>>()
5717                .await
5718                .unwrap();
5719            assert!(!result.is_empty());
5720            let result = concat_batches(&result[0].schema(), result.iter()).unwrap();
5721
5722            if case.filter.is_some() {
5723                let result_rows = result.num_rows();
5724                let expected_rows = case.limit.unwrap_or(k as i64) as usize;
5725                assert!(
5726                    result_rows <= expected_rows,
5727                    "Expected less than {} rows, got {}",
5728                    expected_rows,
5729                    result_rows
5730                );
5731            } else {
5732                // Exactly equal count
5733                assert_eq!(result.num_rows(), case.limit.unwrap_or(k as i64) as usize);
5734            }
5735
5736            // Top one should be the first value of new data
5737            assert_eq!(
5738                as_primitive_array::<Int32Type>(result.column(0).as_ref()).value(0),
5739                400
5740            );
5741        }
5742    }
5743
5744    #[rstest]
5745    #[tokio::test]
5746    async fn test_knn_with_prefilter(
5747        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5748        data_storage_version: LanceFileVersion,
5749        #[values(false, true)] stable_row_ids: bool,
5750    ) {
5751        let mut test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5752            .await
5753            .unwrap();
5754        test_ds.make_vector_index().await.unwrap();
5755        let dataset = &test_ds.dataset;
5756
5757        let mut scan = dataset.scan();
5758        let key: Float32Array = (32..64).map(|v| v as f32).collect();
5759        scan.filter("i > 100").unwrap();
5760        scan.prefilter(true);
5761        scan.project(&["i", "vec"]).unwrap();
5762        scan.nearest("vec", &key, 5).unwrap();
5763        scan.use_index(false);
5764
5765        let results = scan
5766            .try_into_stream()
5767            .await
5768            .unwrap()
5769            .try_collect::<Vec<_>>()
5770            .await
5771            .unwrap();
5772
5773        assert_eq!(results.len(), 1);
5774        let batch = &results[0];
5775
5776        assert_eq!(batch.num_rows(), 5);
5777        assert_eq!(
5778            batch.schema().as_ref(),
5779            &ArrowSchema::new(vec![
5780                ArrowField::new("i", DataType::Int32, true),
5781                ArrowField::new(
5782                    "vec",
5783                    DataType::FixedSizeList(
5784                        Arc::new(ArrowField::new("item", DataType::Float32, true)),
5785                        32,
5786                    ),
5787                    true,
5788                ),
5789                ArrowField::new(DIST_COL, DataType::Float32, true),
5790            ])
5791            .with_metadata([("dataset".into(), "vector".into())].into())
5792        );
5793
5794        // These match the query exactly.  The 5 results must include these 3.
5795        let exact_i = BTreeSet::from_iter(vec![161, 241, 321]);
5796        // These also include those 1 off from the query.  The remaining 2 results must be in this set.
5797        let close_i = BTreeSet::from_iter(vec![161, 241, 321, 160, 162, 240, 242, 320, 322]);
5798        let column_i = batch.column_by_name("i").unwrap();
5799        let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
5800            .values()
5801            .iter()
5802            .copied()
5803            .collect();
5804        assert!(exact_i.is_subset(&actual_i));
5805        assert!(actual_i.is_subset(&close_i));
5806    }
5807
5808    #[rstest]
5809    #[tokio::test]
5810    async fn test_knn_filter_new_data(
5811        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5812        data_storage_version: LanceFileVersion,
5813        #[values(false, true)] stable_row_ids: bool,
5814    ) {
5815        // This test verifies that a filter (prefilter or postfilter) gets applied to the flat KNN results
5816        // in a combined KNN scan (a scan that combines results from an indexed ANN with an unindexed flat
5817        // search of new data)
5818        let mut test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5819            .await
5820            .unwrap();
5821        test_ds.make_vector_index().await.unwrap();
5822        test_ds.append_new_data().await.unwrap();
5823        let dataset = &test_ds.dataset;
5824
5825        // This query will match exactly the new row with i = 400 which should be excluded by the prefilter
5826        let key: Float32Array = [0f32; 32].into_iter().collect();
5827
5828        let mut query = dataset.scan();
5829        query.nearest("vec", &key, 20).unwrap();
5830
5831        // Sanity check that 400 is in our results
5832        let results = query
5833            .try_into_stream()
5834            .await
5835            .unwrap()
5836            .try_collect::<Vec<_>>()
5837            .await
5838            .unwrap();
5839
5840        let results_i = results[0]["i"]
5841            .as_primitive::<Int32Type>()
5842            .values()
5843            .iter()
5844            .copied()
5845            .collect::<BTreeSet<_>>();
5846
5847        assert!(results_i.contains(&400));
5848
5849        // Both prefilter and postfilter should remove 400 from our results
5850        for prefilter in [false, true] {
5851            let mut query = dataset.scan();
5852            query
5853                .filter("i != 400")
5854                .unwrap()
5855                .prefilter(prefilter)
5856                .nearest("vec", &key, 20)
5857                .unwrap();
5858
5859            let results = query
5860                .try_into_stream()
5861                .await
5862                .unwrap()
5863                .try_collect::<Vec<_>>()
5864                .await
5865                .unwrap();
5866
5867            let results_i = results[0]["i"]
5868                .as_primitive::<Int32Type>()
5869                .values()
5870                .iter()
5871                .copied()
5872                .collect::<BTreeSet<_>>();
5873
5874            assert!(!results_i.contains(&400));
5875        }
5876    }
5877
5878    #[rstest]
5879    #[tokio::test]
5880    async fn test_knn_with_filter(
5881        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5882        data_storage_version: LanceFileVersion,
5883        #[values(false, true)] stable_row_ids: bool,
5884    ) {
5885        let test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5886            .await
5887            .unwrap();
5888        let dataset = &test_ds.dataset;
5889
5890        let mut scan = dataset.scan();
5891        let key: Float32Array = (32..64).map(|v| v as f32).collect();
5892        scan.nearest("vec", &key, 5).unwrap();
5893        scan.filter("i > 100").unwrap();
5894        scan.project(&["i", "vec"]).unwrap();
5895        scan.refine(5);
5896
5897        let results = scan
5898            .try_into_stream()
5899            .await
5900            .unwrap()
5901            .try_collect::<Vec<_>>()
5902            .await
5903            .unwrap();
5904
5905        assert_eq!(results.len(), 1);
5906        let batch = &results[0];
5907
5908        assert_eq!(batch.num_rows(), 3);
5909        assert_eq!(
5910            batch.schema().as_ref(),
5911            &ArrowSchema::new(vec![
5912                ArrowField::new("i", DataType::Int32, true),
5913                ArrowField::new(
5914                    "vec",
5915                    DataType::FixedSizeList(
5916                        Arc::new(ArrowField::new("item", DataType::Float32, true)),
5917                        32,
5918                    ),
5919                    true,
5920                ),
5921                ArrowField::new(DIST_COL, DataType::Float32, true),
5922            ])
5923            .with_metadata([("dataset".into(), "vector".into())].into())
5924        );
5925
5926        let expected_i = BTreeSet::from_iter(vec![161, 241, 321]);
5927        let column_i = batch.column_by_name("i").unwrap();
5928        let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
5929            .values()
5930            .iter()
5931            .copied()
5932            .collect();
5933        assert_eq!(expected_i, actual_i);
5934    }
5935
5936    #[rstest]
5937    #[tokio::test]
5938    async fn test_refine_factor(
5939        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5940        data_storage_version: LanceFileVersion,
5941        #[values(false, true)] stable_row_ids: bool,
5942    ) {
5943        let test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5944            .await
5945            .unwrap();
5946        let dataset = &test_ds.dataset;
5947
5948        let mut scan = dataset.scan();
5949        let key: Float32Array = (32..64).map(|v| v as f32).collect();
5950        scan.nearest("vec", &key, 5).unwrap();
5951        scan.refine(5);
5952
5953        let results = scan
5954            .try_into_stream()
5955            .await
5956            .unwrap()
5957            .try_collect::<Vec<_>>()
5958            .await
5959            .unwrap();
5960
5961        assert_eq!(results.len(), 1);
5962        let batch = &results[0];
5963
5964        assert_eq!(batch.num_rows(), 5);
5965        assert_eq!(
5966            batch.schema().as_ref(),
5967            &ArrowSchema::new(vec![
5968                ArrowField::new("i", DataType::Int32, true),
5969                ArrowField::new("s", DataType::Utf8, true),
5970                ArrowField::new(
5971                    "vec",
5972                    DataType::FixedSizeList(
5973                        Arc::new(ArrowField::new("item", DataType::Float32, true)),
5974                        32,
5975                    ),
5976                    true,
5977                ),
5978                ArrowField::new(DIST_COL, DataType::Float32, true),
5979            ])
5980            .with_metadata([("dataset".into(), "vector".into())].into())
5981        );
5982
5983        let expected_i = BTreeSet::from_iter(vec![1, 81, 161, 241, 321]);
5984        let column_i = batch.column_by_name("i").unwrap();
5985        let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
5986            .values()
5987            .iter()
5988            .copied()
5989            .collect();
5990        assert_eq!(expected_i, actual_i);
5991    }
5992
5993    #[tokio::test]
5994    async fn test_binary_vectors_default_to_hamming() {
5995        let (_tmp_dir, dataset) = make_binary_vector_dataset().await.unwrap();
5996        let query = UInt8Array::from(vec![0b0000_1111u8, 0, 0, 0]);
5997
5998        let mut scan = dataset.scan();
5999        scan.nearest("bin", &query, 3).unwrap();
6000
6001        // metric_type is None initially; it will be resolved to Hamming during search
6002        assert_eq!(scan.nearest.as_ref().unwrap().metric_type, None);
6003
6004        let batch = scan.try_into_batch().await.unwrap();
6005        let ids = batch
6006            .column_by_name("id")
6007            .unwrap()
6008            .as_primitive::<Int32Type>()
6009            .values();
6010        assert_eq!(ids, &[0, 1, 2]);
6011        let distances = batch
6012            .column_by_name(DIST_COL)
6013            .unwrap()
6014            .as_primitive::<Float32Type>()
6015            .values();
6016        assert_eq!(distances, &[0.0, 2.0, 4.0]);
6017    }
6018
6019    #[tokio::test]
6020    async fn test_binary_vectors_invalid_distance_error() {
6021        let (_tmp_dir, dataset) = make_binary_vector_dataset().await.unwrap();
6022        let query = UInt8Array::from(vec![0b0000_1111u8, 0, 0, 0]);
6023
6024        let mut scan = dataset.scan();
6025        scan.nearest("bin", &query, 1).unwrap();
6026        scan.distance_metric(DistanceType::L2);
6027
6028        let err = scan.try_into_batch().await.unwrap_err();
6029        assert!(matches!(err, Error::InvalidInput { .. }));
6030        let message = err.to_string();
6031        assert!(
6032            message.contains("l2") && message.contains("UInt8"),
6033            "unexpected message: {message}"
6034        );
6035    }
6036
6037    /// Test that when query specifies a metric different from the index,
6038    /// we fall back to flat search and return correct distances.
6039    /// Regression test for https://github.com/lance-format/lance/issues/5608
6040    #[tokio::test]
6041    async fn test_knn_metric_mismatch_falls_back_to_flat_search() {
6042        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true)
6043            .await
6044            .unwrap();
6045        // Create IVF_PQ index with L2 metric
6046        test_ds.make_vector_index().await.unwrap();
6047
6048        let dataset = &test_ds.dataset;
6049        let key: Float32Array = (32..64).map(|v| v as f32).collect();
6050
6051        // Query with Dot metric (different from the L2 index)
6052        let mut scan = dataset.scan();
6053        scan.nearest("vec", &key, 5).unwrap();
6054        scan.distance_metric(DistanceType::Dot);
6055
6056        // Verify the explain plan does NOT show ANNSubIndex (should use flat search)
6057        let plan = scan.explain_plan(false).await.unwrap();
6058        assert!(
6059            !plan.contains("ANNSubIndex"),
6060            "Expected flat search, but got ANN index in plan:\n{}",
6061            plan
6062        );
6063        // Should show flat KNN with Dot metric (metric is displayed lowercase)
6064        assert!(
6065            plan.contains("KNNVectorDistance") && plan.to_lowercase().contains("dot"),
6066            "Expected flat KNN with Dot metric in plan:\n{}",
6067            plan
6068        );
6069
6070        // Also verify the distances are different from L2 results
6071        let dot_batch = dataset
6072            .scan()
6073            .nearest("vec", &key, 5)
6074            .unwrap()
6075            .distance_metric(DistanceType::Dot)
6076            .try_into_batch()
6077            .await
6078            .unwrap();
6079
6080        let l2_batch = dataset
6081            .scan()
6082            .nearest("vec", &key, 5)
6083            .unwrap()
6084            .distance_metric(DistanceType::L2)
6085            .try_into_batch()
6086            .await
6087            .unwrap();
6088
6089        let dot_distances: Vec<f32> = dot_batch
6090            .column_by_name(DIST_COL)
6091            .unwrap()
6092            .as_primitive::<Float32Type>()
6093            .values()
6094            .to_vec();
6095        let l2_distances: Vec<f32> = l2_batch
6096            .column_by_name(DIST_COL)
6097            .unwrap()
6098            .as_primitive::<Float32Type>()
6099            .values()
6100            .to_vec();
6101
6102        // Dot and L2 distances should be different (this verifies we're using the correct metric)
6103        assert_ne!(dot_distances, l2_distances);
6104    }
6105
6106    /// Test that when query does not specify a metric, we use the index's metric.
6107    /// Regression test for https://github.com/lance-format/lance/issues/5608
6108    #[tokio::test]
6109    async fn test_knn_no_metric_uses_index_metric() {
6110        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true)
6111            .await
6112            .unwrap();
6113        // Create IVF_PQ index with L2 metric
6114        test_ds.make_vector_index().await.unwrap();
6115
6116        let dataset = &test_ds.dataset;
6117        let key: Float32Array = (32..64).map(|v| v as f32).collect();
6118
6119        // Query without specifying metric
6120        let mut scan = dataset.scan();
6121        scan.nearest("vec", &key, 5).unwrap();
6122        // Don't call distance_metric() - should use index's L2
6123
6124        // Verify the explain plan shows ANNSubIndex with L2 metric
6125        let plan = scan.explain_plan(false).await.unwrap();
6126        assert!(
6127            plan.contains("ANNSubIndex") && plan.to_lowercase().contains("l2"),
6128            "Expected ANN index with L2 metric in plan:\n{}",
6129            plan
6130        );
6131    }
6132
6133    #[rstest]
6134    #[tokio::test]
6135    async fn test_only_row_id(
6136        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6137        data_storage_version: LanceFileVersion,
6138    ) {
6139        let test_ds = TestVectorDataset::new(data_storage_version, false)
6140            .await
6141            .unwrap();
6142        let dataset = &test_ds.dataset;
6143
6144        let mut scan = dataset.scan();
6145        scan.project::<&str>(&[]).unwrap().with_row_id();
6146
6147        let batch = scan.try_into_batch().await.unwrap();
6148
6149        assert_eq!(batch.num_columns(), 1);
6150        assert_eq!(batch.num_rows(), 400);
6151        let expected_schema =
6152            ArrowSchema::new(vec![ArrowField::new(ROW_ID, DataType::UInt64, true)])
6153                .with_metadata(dataset.schema().metadata.clone());
6154        assert_eq!(batch.schema().as_ref(), &expected_schema,);
6155
6156        let expected_row_ids: Vec<u64> = (0..200_u64).chain((1 << 32)..((1 << 32) + 200)).collect();
6157        let actual_row_ids: Vec<u64> = as_primitive_array::<UInt64Type>(batch.column(0).as_ref())
6158            .values()
6159            .iter()
6160            .copied()
6161            .collect();
6162        assert_eq!(expected_row_ids, actual_row_ids);
6163    }
6164
6165    #[tokio::test]
6166    async fn test_scan_unordered_with_row_id() {
6167        // This test doesn't make sense for v2 files, there is no way to get an out-of-order scan
6168        let test_ds = TestVectorDataset::new(LanceFileVersion::Legacy, false)
6169            .await
6170            .unwrap();
6171        let dataset = &test_ds.dataset;
6172
6173        let mut scan = dataset.scan();
6174        scan.with_row_id();
6175
6176        let ordered_batches = scan
6177            .try_into_stream()
6178            .await
6179            .unwrap()
6180            .try_collect::<Vec<RecordBatch>>()
6181            .await
6182            .unwrap();
6183        assert!(ordered_batches.len() > 2);
6184        let ordered_batch =
6185            concat_batches(&ordered_batches[0].schema(), ordered_batches.iter()).unwrap();
6186
6187        // Attempt to get out-of-order scan, but that might take multiple attempts.
6188        scan.scan_in_order(false);
6189        for _ in 0..10 {
6190            let unordered_batches = scan
6191                .try_into_stream()
6192                .await
6193                .unwrap()
6194                .try_collect::<Vec<RecordBatch>>()
6195                .await
6196                .unwrap();
6197            let unordered_batch =
6198                concat_batches(&unordered_batches[0].schema(), unordered_batches.iter()).unwrap();
6199
6200            assert_eq!(ordered_batch.num_rows(), unordered_batch.num_rows());
6201
6202            // If they aren't equal, they should be equal if we sort by row id
6203            if ordered_batch != unordered_batch {
6204                let sort_indices = sort_to_indices(&unordered_batch[ROW_ID], None, None).unwrap();
6205
6206                let ordered_i = ordered_batch["i"].clone();
6207                let sorted_i = take::take(&unordered_batch["i"], &sort_indices, None).unwrap();
6208
6209                assert_eq!(&ordered_i, &sorted_i);
6210
6211                break;
6212            }
6213        }
6214    }
6215
6216    #[tokio::test]
6217    async fn test_scan_with_wildcard() {
6218        let data = gen_batch()
6219            .col("x", array::step::<Float64Type>())
6220            .col("y", array::step::<Float64Type>())
6221            .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(100))
6222            .await
6223            .unwrap();
6224
6225        let check_cols = async |projection: &[&str], expected_cols: &[&str]| {
6226            let mut scan = data.scan();
6227            scan.project(projection).unwrap();
6228            let stream = scan.try_into_stream().await.unwrap();
6229            let schema = stream.schema();
6230            let field_names = schema.field_names();
6231            assert_eq!(field_names, expected_cols);
6232        };
6233
6234        check_cols(&["*"], &["x", "y"]).await;
6235        check_cols(&["x", "y"], &["x", "y"]).await;
6236        check_cols(&["x"], &["x"]).await;
6237        check_cols(&["_rowid", "*"], &["_rowid", "x", "y"]).await;
6238        check_cols(&["*", "_rowid"], &["x", "y", "_rowid"]).await;
6239        check_cols(
6240            &["_rowid", "*", "_rowoffset"],
6241            &["_rowid", "x", "y", "_rowoffset"],
6242        )
6243        .await;
6244
6245        let check_exprs = async |exprs: &[&str], expected_cols: &[&str]| {
6246            let mut scan = data.scan();
6247            let projection = exprs
6248                .iter()
6249                .map(|e| (e.to_string(), e.to_string()))
6250                .collect::<Vec<_>>();
6251            scan.project_with_transform(&projection).unwrap();
6252            let stream = scan.try_into_stream().await.unwrap();
6253            let schema = stream.schema();
6254            let field_names = schema.field_names();
6255            assert_eq!(field_names, expected_cols);
6256        };
6257
6258        // Make sure we can reference * fields in exprs and add new columns
6259        check_exprs(&["_rowid", "*", "x * 2"], &["_rowid", "x", "y", "x * 2"]).await;
6260
6261        let check_fails = |projection: &[&str]| {
6262            let mut scan = data.scan();
6263            assert!(scan.project(projection).is_err());
6264        };
6265
6266        // Would duplicate x
6267        check_fails(&["x", "*"]);
6268        check_fails(&["_rowid", "_rowid"]);
6269    }
6270
6271    #[rstest]
6272    #[tokio::test]
6273    async fn test_scan_order(
6274        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6275        data_storage_version: LanceFileVersion,
6276    ) {
6277        let test_dir = TempStrDir::default();
6278        let test_uri = &test_dir;
6279
6280        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
6281            "i",
6282            DataType::Int32,
6283            true,
6284        )]));
6285
6286        let batch1 = RecordBatch::try_new(
6287            schema.clone(),
6288            vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))],
6289        )
6290        .unwrap();
6291
6292        let batch2 = RecordBatch::try_new(
6293            schema.clone(),
6294            vec![Arc::new(Int32Array::from(vec![6, 7, 8]))],
6295        )
6296        .unwrap();
6297
6298        let params = WriteParams {
6299            mode: WriteMode::Append,
6300            data_storage_version: Some(data_storage_version),
6301            ..Default::default()
6302        };
6303
6304        let write_batch = |batch: RecordBatch| async {
6305            let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
6306            Dataset::write(reader, test_uri, Some(params)).await
6307        };
6308
6309        write_batch.clone()(batch1.clone()).await.unwrap();
6310        write_batch(batch2.clone()).await.unwrap();
6311
6312        let dataset = Arc::new(Dataset::open(test_uri).await.unwrap());
6313        let fragment1 = dataset.get_fragment(0).unwrap().metadata().clone();
6314        let fragment2 = dataset.get_fragment(1).unwrap().metadata().clone();
6315
6316        // 1 then 2
6317        let mut scanner = dataset.scan();
6318        scanner.with_fragments(vec![fragment1.clone(), fragment2.clone()]);
6319        let output = scanner
6320            .try_into_stream()
6321            .await
6322            .unwrap()
6323            .try_collect::<Vec<_>>()
6324            .await
6325            .unwrap();
6326        assert_eq!(output.len(), 2);
6327        assert_eq!(output[0], batch1);
6328        assert_eq!(output[1], batch2);
6329
6330        // 2 then 1
6331        let mut scanner = dataset.scan();
6332        scanner.with_fragments(vec![fragment2, fragment1]);
6333        let output = scanner
6334            .try_into_stream()
6335            .await
6336            .unwrap()
6337            .try_collect::<Vec<_>>()
6338            .await
6339            .unwrap();
6340        assert_eq!(output.len(), 2);
6341        assert_eq!(output[0], batch2);
6342        assert_eq!(output[1], batch1);
6343    }
6344
6345    #[rstest]
6346    #[tokio::test]
6347    async fn test_scan_sort(
6348        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6349        data_storage_version: LanceFileVersion,
6350    ) {
6351        let test_dir = TempStrDir::default();
6352        let test_uri = &test_dir;
6353
6354        let data = gen_batch()
6355            .col("int", array::cycle::<Int32Type>(vec![5, 4, 1, 2, 3]))
6356            .col(
6357                "str",
6358                array::cycle_utf8_literals(&["a", "b", "c", "e", "d"]),
6359            );
6360
6361        let sorted_by_int = gen_batch()
6362            .col("int", array::cycle::<Int32Type>(vec![1, 2, 3, 4, 5]))
6363            .col(
6364                "str",
6365                array::cycle_utf8_literals(&["c", "e", "d", "b", "a"]),
6366            )
6367            .into_batch_rows(RowCount::from(5))
6368            .unwrap();
6369
6370        let sorted_by_str = gen_batch()
6371            .col("int", array::cycle::<Int32Type>(vec![5, 4, 1, 3, 2]))
6372            .col(
6373                "str",
6374                array::cycle_utf8_literals(&["a", "b", "c", "d", "e"]),
6375            )
6376            .into_batch_rows(RowCount::from(5))
6377            .unwrap();
6378
6379        Dataset::write(
6380            data.into_reader_rows(RowCount::from(5), BatchCount::from(1)),
6381            test_uri,
6382            Some(WriteParams {
6383                data_storage_version: Some(data_storage_version),
6384                ..Default::default()
6385            }),
6386        )
6387        .await
6388        .unwrap();
6389
6390        let dataset = Arc::new(Dataset::open(test_uri).await.unwrap());
6391
6392        let batches_by_int = dataset
6393            .scan()
6394            .order_by(Some(vec![ColumnOrdering::asc_nulls_first(
6395                "int".to_string(),
6396            )]))
6397            .unwrap()
6398            .try_into_stream()
6399            .await
6400            .unwrap()
6401            .try_collect::<Vec<_>>()
6402            .await
6403            .unwrap();
6404
6405        assert_eq!(batches_by_int[0], sorted_by_int);
6406
6407        let batches_by_str = dataset
6408            .scan()
6409            .order_by(Some(vec![ColumnOrdering::asc_nulls_first(
6410                "str".to_string(),
6411            )]))
6412            .unwrap()
6413            .try_into_stream()
6414            .await
6415            .unwrap()
6416            .try_collect::<Vec<_>>()
6417            .await
6418            .unwrap();
6419
6420        assert_eq!(batches_by_str[0], sorted_by_str);
6421
6422        // Ensure an empty sort vec does not break anything (sorting is disabled)
6423        dataset
6424            .scan()
6425            .order_by(Some(vec![]))
6426            .unwrap()
6427            .try_into_stream()
6428            .await
6429            .unwrap()
6430            .try_collect::<Vec<_>>()
6431            .await
6432            .unwrap();
6433    }
6434
6435    #[rstest]
6436    #[tokio::test]
6437    async fn test_sort_multi_columns(
6438        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6439        data_storage_version: LanceFileVersion,
6440    ) {
6441        let test_dir = TempStrDir::default();
6442        let test_uri = &test_dir;
6443
6444        let data = gen_batch()
6445            .col("int", array::cycle::<Int32Type>(vec![5, 5, 1, 1, 3]))
6446            .col(
6447                "float",
6448                array::cycle::<Float32Type>(vec![7.3, -f32::NAN, f32::NAN, 4.3, f32::INFINITY]),
6449            );
6450
6451        let sorted_by_int_then_float = gen_batch()
6452            .col("int", array::cycle::<Int32Type>(vec![1, 1, 3, 5, 5]))
6453            .col(
6454                "float",
6455                // floats should be sorted using total order so -NAN is before all and NAN is after all
6456                array::cycle::<Float32Type>(vec![4.3, f32::NAN, f32::INFINITY, -f32::NAN, 7.3]),
6457            )
6458            .into_batch_rows(RowCount::from(5))
6459            .unwrap();
6460
6461        Dataset::write(
6462            data.into_reader_rows(RowCount::from(5), BatchCount::from(1)),
6463            test_uri,
6464            Some(WriteParams {
6465                data_storage_version: Some(data_storage_version),
6466                ..Default::default()
6467            }),
6468        )
6469        .await
6470        .unwrap();
6471
6472        let dataset = Arc::new(Dataset::open(test_uri).await.unwrap());
6473
6474        let batches_by_int_then_float = dataset
6475            .scan()
6476            .order_by(Some(vec![
6477                ColumnOrdering::asc_nulls_first("int".to_string()),
6478                ColumnOrdering::asc_nulls_first("float".to_string()),
6479            ]))
6480            .unwrap()
6481            .try_into_stream()
6482            .await
6483            .unwrap()
6484            .try_collect::<Vec<_>>()
6485            .await
6486            .unwrap();
6487
6488        assert_eq!(batches_by_int_then_float[0], sorted_by_int_then_float);
6489    }
6490
6491    #[rstest]
6492    #[tokio::test]
6493    async fn test_ann_prefilter(
6494        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6495        data_storage_version: LanceFileVersion,
6496        #[values(false, true)] stable_row_ids: bool,
6497        #[values(
6498            VectorIndexParams::ivf_pq(2, 8, 2, MetricType::L2, 2),
6499            VectorIndexParams::with_ivf_hnsw_sq_params(
6500                MetricType::L2,
6501                IvfBuildParams::new(2),
6502                HnswBuildParams::default(),
6503                SQBuildParams::default()
6504            )
6505        )]
6506        index_params: VectorIndexParams,
6507    ) {
6508        use lance_arrow::{FixedSizeListArrayExt, fixed_size_list_type};
6509
6510        let test_dir = TempStrDir::default();
6511        let test_uri = &test_dir;
6512
6513        let schema = Arc::new(ArrowSchema::new(vec![
6514            ArrowField::new("filterable", DataType::Int32, true),
6515            ArrowField::new("vector", fixed_size_list_type(2, DataType::Float32), true),
6516        ]));
6517
6518        let vector_values = Float32Array::from_iter_values((0..600).map(|x| x as f32));
6519
6520        let batches = vec![
6521            RecordBatch::try_new(
6522                schema.clone(),
6523                vec![
6524                    Arc::new(Int32Array::from_iter_values(0..300)),
6525                    Arc::new(FixedSizeListArray::try_new_from_values(vector_values, 2).unwrap()),
6526                ],
6527            )
6528            .unwrap(),
6529        ];
6530
6531        let write_params = WriteParams {
6532            data_storage_version: Some(data_storage_version),
6533            max_rows_per_file: 300, // At least two files to make sure stable row ids make a difference
6534            enable_stable_row_ids: stable_row_ids,
6535            ..Default::default()
6536        };
6537        let batches = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
6538        let mut dataset = Dataset::write(batches, test_uri, Some(write_params))
6539            .await
6540            .unwrap();
6541
6542        dataset
6543            .create_index(&["vector"], IndexType::Vector, None, &index_params, false)
6544            .await
6545            .unwrap();
6546
6547        let query_key = Arc::new(Float32Array::from_iter_values((0..2).map(|x| x as f32)));
6548        let mut scan = dataset.scan();
6549        scan.filter("filterable > 5").unwrap();
6550        scan.nearest("vector", query_key.as_ref(), 1).unwrap();
6551        scan.minimum_nprobes(100);
6552        scan.ef(100);
6553        scan.with_row_id();
6554
6555        let batches = scan
6556            .try_into_stream()
6557            .await
6558            .unwrap()
6559            .try_collect::<Vec<_>>()
6560            .await
6561            .unwrap();
6562
6563        assert_eq!(batches.len(), 0);
6564
6565        scan.prefilter(true);
6566
6567        let batches = scan
6568            .try_into_stream()
6569            .await
6570            .unwrap()
6571            .try_collect::<Vec<_>>()
6572            .await
6573            .unwrap();
6574        assert_eq!(batches.len(), 1);
6575
6576        let first_match = batches[0][ROW_ID].as_primitive::<UInt64Type>().values()[0];
6577
6578        // HNSW+SQ is an approximate index; this test validates *prefiltering*, so
6579        // every row failing `filterable > 5` (row ids 0..=5) must be excluded.
6580        // HNSW recall is covered by dedicated vector-index tests elsewhere.
6581        assert!(
6582            first_match > 5,
6583            "prefilter not honored: returned row id {first_match} should satisfy `filterable > 5`"
6584        );
6585    }
6586
6587    #[rstest]
6588    #[tokio::test]
6589    async fn test_filter_on_large_utf8(
6590        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6591        data_storage_version: LanceFileVersion,
6592    ) {
6593        let test_dir = TempStrDir::default();
6594        let test_uri = &test_dir;
6595
6596        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
6597            "ls",
6598            DataType::LargeUtf8,
6599            true,
6600        )]));
6601
6602        let batches = vec![
6603            RecordBatch::try_new(
6604                schema.clone(),
6605                vec![Arc::new(LargeStringArray::from_iter_values(
6606                    (0..10).map(|v| format!("s-{}", v)),
6607                ))],
6608            )
6609            .unwrap(),
6610        ];
6611
6612        let write_params = WriteParams {
6613            data_storage_version: Some(data_storage_version),
6614            ..Default::default()
6615        };
6616        let batches = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
6617        Dataset::write(batches, test_uri, Some(write_params))
6618            .await
6619            .unwrap();
6620
6621        let dataset = Dataset::open(test_uri).await.unwrap();
6622        let mut scan = dataset.scan();
6623        scan.filter("ls = 's-8'").unwrap();
6624
6625        let batches = scan
6626            .try_into_stream()
6627            .await
6628            .unwrap()
6629            .try_collect::<Vec<_>>()
6630            .await
6631            .unwrap();
6632        let batch = &batches[0];
6633
6634        let expected = RecordBatch::try_new(
6635            schema.clone(),
6636            vec![Arc::new(LargeStringArray::from_iter_values(
6637                (8..9).map(|v| format!("s-{}", v)),
6638            ))],
6639        )
6640        .unwrap();
6641
6642        assert_eq!(batch, &expected);
6643    }
6644
6645    #[rstest]
6646    #[tokio::test]
6647    async fn test_filter_with_regex(
6648        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6649        data_storage_version: LanceFileVersion,
6650    ) {
6651        let test_dir = TempStrDir::default();
6652        let test_uri = &test_dir;
6653
6654        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
6655            "ls",
6656            DataType::Utf8,
6657            true,
6658        )]));
6659
6660        let batches = vec![
6661            RecordBatch::try_new(
6662                schema.clone(),
6663                vec![Arc::new(StringArray::from_iter_values(
6664                    (0..20).map(|v| format!("s-{}", v)),
6665                ))],
6666            )
6667            .unwrap(),
6668        ];
6669
6670        let write_params = WriteParams {
6671            data_storage_version: Some(data_storage_version),
6672            ..Default::default()
6673        };
6674        let batches = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
6675        Dataset::write(batches, test_uri, Some(write_params))
6676            .await
6677            .unwrap();
6678
6679        let dataset = Dataset::open(test_uri).await.unwrap();
6680        let mut scan = dataset.scan();
6681        scan.filter("regexp_match(ls, 's-1.')").unwrap();
6682
6683        let stream = scan.try_into_stream().await.unwrap();
6684        let batches = stream.try_collect::<Vec<_>>().await.unwrap();
6685        let batch = &batches[0];
6686
6687        let expected = RecordBatch::try_new(
6688            schema.clone(),
6689            vec![Arc::new(StringArray::from_iter_values(
6690                (10..=19).map(|v| format!("s-{}", v)),
6691            ))],
6692        )
6693        .unwrap();
6694
6695        assert_eq!(batch, &expected);
6696    }
6697
6698    #[tokio::test]
6699    async fn test_filter_proj_bug() {
6700        let struct_i_field = ArrowField::new("i", DataType::Int32, true);
6701        let struct_o_field = ArrowField::new("o", DataType::Utf8, true);
6702        let schema = Arc::new(ArrowSchema::new(vec![
6703            ArrowField::new(
6704                "struct",
6705                DataType::Struct(vec![struct_i_field.clone(), struct_o_field.clone()].into()),
6706                true,
6707            ),
6708            ArrowField::new("s", DataType::Utf8, true),
6709        ]));
6710
6711        let input_batches: Vec<RecordBatch> = (0..5)
6712            .map(|i| {
6713                let struct_i_arr: Arc<Int32Array> =
6714                    Arc::new(Int32Array::from_iter_values(i * 20..(i + 1) * 20));
6715                let struct_o_arr: Arc<StringArray> = Arc::new(StringArray::from_iter_values(
6716                    (i * 20..(i + 1) * 20).map(|v| format!("o-{:02}", v)),
6717                ));
6718                RecordBatch::try_new(
6719                    schema.clone(),
6720                    vec![
6721                        Arc::new(StructArray::from(vec![
6722                            (Arc::new(struct_i_field.clone()), struct_i_arr as ArrayRef),
6723                            (Arc::new(struct_o_field.clone()), struct_o_arr as ArrayRef),
6724                        ])),
6725                        Arc::new(StringArray::from_iter_values(
6726                            (i * 20..(i + 1) * 20).map(|v| format!("s-{}", v)),
6727                        )),
6728                    ],
6729                )
6730                .unwrap()
6731            })
6732            .collect();
6733        let batches =
6734            RecordBatchIterator::new(input_batches.clone().into_iter().map(Ok), schema.clone());
6735        let test_dir = TempStrDir::default();
6736        let test_uri = &test_dir;
6737        let write_params = WriteParams {
6738            max_rows_per_file: 40,
6739            max_rows_per_group: 10,
6740            data_storage_version: Some(LanceFileVersion::Legacy),
6741            ..Default::default()
6742        };
6743        Dataset::write(batches, test_uri, Some(write_params))
6744            .await
6745            .unwrap();
6746
6747        let dataset = Dataset::open(test_uri).await.unwrap();
6748        let batches = dataset
6749            .scan()
6750            .filter("struct.i >= 20")
6751            .unwrap()
6752            .try_into_stream()
6753            .await
6754            .unwrap()
6755            .try_collect::<Vec<_>>()
6756            .await
6757            .unwrap();
6758        let batch = concat_batches(&batches[0].schema(), &batches).unwrap();
6759
6760        let expected_batch = concat_batches(&schema, &input_batches.as_slice()[1..]).unwrap();
6761        assert_eq!(batch, expected_batch);
6762
6763        // different order
6764        let batches = dataset
6765            .scan()
6766            .filter("struct.o >= 'o-20'")
6767            .unwrap()
6768            .try_into_stream()
6769            .await
6770            .unwrap()
6771            .try_collect::<Vec<_>>()
6772            .await
6773            .unwrap();
6774        let batch = concat_batches(&batches[0].schema(), &batches).unwrap();
6775        assert_eq!(batch, expected_batch);
6776
6777        // other reported bug with nested top level column access
6778        let batches = dataset
6779            .scan()
6780            .project(vec!["struct"].as_slice())
6781            .unwrap()
6782            .try_into_stream()
6783            .await
6784            .unwrap()
6785            .try_collect::<Vec<_>>()
6786            .await
6787            .unwrap();
6788        concat_batches(&batches[0].schema(), &batches).unwrap();
6789    }
6790
6791    #[rstest]
6792    #[tokio::test]
6793    async fn test_ann_with_deletion(
6794        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6795        data_storage_version: LanceFileVersion,
6796        #[values(false, true)] stable_row_ids: bool,
6797    ) {
6798        let vec_params = vec![
6799            // TODO: re-enable diskann test when we can tune to get reproducible results.
6800            // VectorIndexParams::with_diskann_params(MetricType::L2, DiskANNParams::new(10, 1.5, 10)),
6801            VectorIndexParams::ivf_pq(4, 8, 2, MetricType::L2, 2),
6802        ];
6803        for params in vec_params {
6804            use lance_arrow::FixedSizeListArrayExt;
6805
6806            let test_dir = TempStrDir::default();
6807            let test_uri = &test_dir;
6808
6809            // make dataset
6810            let schema = Arc::new(ArrowSchema::new(vec![
6811                ArrowField::new("i", DataType::Int32, true),
6812                ArrowField::new(
6813                    "vec",
6814                    DataType::FixedSizeList(
6815                        Arc::new(ArrowField::new("item", DataType::Float32, true)),
6816                        32,
6817                    ),
6818                    true,
6819                ),
6820            ]));
6821
6822            // vectors are [1, 1, 1, ...] [2, 2, 2, ...]
6823            let vector_values: Float32Array =
6824                (0..32 * 512).map(|v| (v / 32) as f32 + 1.0).collect();
6825            let vectors = FixedSizeListArray::try_new_from_values(vector_values, 32).unwrap();
6826
6827            let batches = vec![
6828                RecordBatch::try_new(
6829                    schema.clone(),
6830                    vec![
6831                        Arc::new(Int32Array::from_iter_values(0..512)),
6832                        Arc::new(vectors.clone()),
6833                    ],
6834                )
6835                .unwrap(),
6836            ];
6837
6838            let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
6839            let mut dataset = Dataset::write(
6840                reader,
6841                test_uri,
6842                Some(WriteParams {
6843                    data_storage_version: Some(data_storage_version),
6844                    enable_stable_row_ids: stable_row_ids,
6845                    ..Default::default()
6846                }),
6847            )
6848            .await
6849            .unwrap();
6850
6851            assert_eq!(dataset.index_cache_entry_count().await, 0);
6852            dataset
6853                .create_index(
6854                    &["vec"],
6855                    IndexType::Vector,
6856                    Some("idx".to_string()),
6857                    &params,
6858                    true,
6859                )
6860                .await
6861                .unwrap();
6862
6863            let mut scan = dataset.scan();
6864            // closest be i = 0..5
6865            let key: Float32Array = (0..32).map(|_v| 1.0_f32).collect();
6866            scan.nearest("vec", &key, 5).unwrap();
6867            scan.refine(100);
6868            scan.minimum_nprobes(100);
6869
6870            assert_eq!(
6871                dataset.index_cache_entry_count().await,
6872                2, // 2 for index metadata at version 1 and 2.
6873            );
6874            let results = scan
6875                .try_into_stream()
6876                .await
6877                .unwrap()
6878                .try_collect::<Vec<_>>()
6879                .await
6880                .unwrap();
6881
6882            assert_eq!(
6883                dataset.index_cache_entry_count().await,
6884                5 + dataset.versions().await.unwrap().len()
6885            );
6886            assert_eq!(results.len(), 1);
6887            let batch = &results[0];
6888
6889            let expected_i = BTreeSet::from_iter(vec![0, 1, 2, 3, 4]);
6890            let column_i = batch.column_by_name("i").unwrap();
6891            let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
6892                .values()
6893                .iter()
6894                .copied()
6895                .collect();
6896            assert_eq!(expected_i, actual_i);
6897
6898            // DELETE top result and search again
6899
6900            dataset.delete("i = 1").await.unwrap();
6901            let mut scan = dataset.scan();
6902            scan.nearest("vec", &key, 5).unwrap();
6903            scan.refine(100);
6904            scan.minimum_nprobes(100);
6905
6906            let results = scan
6907                .try_into_stream()
6908                .await
6909                .unwrap()
6910                .try_collect::<Vec<_>>()
6911                .await
6912                .unwrap();
6913
6914            assert_eq!(results.len(), 1);
6915            let batch = &results[0];
6916
6917            // i=1 was deleted, and 5 is the next best, the reset shouldn't change
6918            let expected_i = BTreeSet::from_iter(vec![0, 2, 3, 4, 5]);
6919            let column_i = batch.column_by_name("i").unwrap();
6920            let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
6921                .values()
6922                .iter()
6923                .copied()
6924                .collect();
6925            assert_eq!(expected_i, actual_i);
6926
6927            // Add a second fragment and test the case where there are no deletion
6928            // files but there are missing fragments.
6929            let batches = vec![
6930                RecordBatch::try_new(
6931                    schema.clone(),
6932                    vec![
6933                        Arc::new(Int32Array::from_iter_values(512..1024)),
6934                        Arc::new(vectors),
6935                    ],
6936                )
6937                .unwrap(),
6938            ];
6939
6940            let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
6941            let mut dataset = Dataset::write(
6942                reader,
6943                test_uri,
6944                Some(WriteParams {
6945                    mode: WriteMode::Append,
6946                    data_storage_version: Some(data_storage_version),
6947                    ..Default::default()
6948                }),
6949            )
6950            .await
6951            .unwrap();
6952            dataset
6953                .create_index(
6954                    &["vec"],
6955                    IndexType::Vector,
6956                    Some("idx".to_string()),
6957                    &params,
6958                    true,
6959                )
6960                .await
6961                .unwrap();
6962
6963            dataset.delete("i < 512").await.unwrap();
6964
6965            let mut scan = dataset.scan();
6966            scan.nearest("vec", &key, 5).unwrap();
6967            scan.refine(100);
6968            scan.minimum_nprobes(100);
6969
6970            let results = scan
6971                .try_into_stream()
6972                .await
6973                .unwrap()
6974                .try_collect::<Vec<_>>()
6975                .await
6976                .unwrap();
6977
6978            assert_eq!(results.len(), 1);
6979            let batch = &results[0];
6980
6981            // It should not pick up any results from the first fragment
6982            let expected_i = BTreeSet::from_iter(vec![512, 513, 514, 515, 516]);
6983            let column_i = batch.column_by_name("i").unwrap();
6984            let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
6985                .values()
6986                .iter()
6987                .copied()
6988                .collect();
6989            assert_eq!(expected_i, actual_i);
6990        }
6991    }
6992
6993    #[tokio::test]
6994    async fn test_projection_order() {
6995        let vec_params = VectorIndexParams::ivf_pq(4, 8, 2, MetricType::L2, 2);
6996        let mut data = gen_batch()
6997            .col("vec", array::rand_vec::<Float32Type>(Dimension::from(4)))
6998            .col("text", array::rand_utf8(ByteCount::from(10), false))
6999            .into_ram_dataset(FragmentCount::from(3), FragmentRowCount::from(100))
7000            .await
7001            .unwrap();
7002        data.create_index(&["vec"], IndexType::Vector, None, &vec_params, true)
7003            .await
7004            .unwrap();
7005
7006        let mut scan = data.scan();
7007        scan.nearest("vec", &Float32Array::from(vec![1.0, 1.0, 1.0, 1.0]), 5)
7008            .unwrap();
7009        scan.with_row_id().project(&["text"]).unwrap();
7010
7011        let results = scan
7012            .try_into_stream()
7013            .await
7014            .unwrap()
7015            .try_collect::<Vec<_>>()
7016            .await
7017            .unwrap();
7018
7019        assert_eq!(
7020            results[0].schema().field_names(),
7021            vec!["text", "_distance", "_rowid"]
7022        );
7023    }
7024
7025    #[rstest]
7026    #[tokio::test]
7027    async fn test_count_rows_with_filter(
7028        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
7029        data_storage_version: LanceFileVersion,
7030    ) {
7031        let test_dir = TempStrDir::default();
7032        let test_uri = &test_dir;
7033        let mut data_gen = BatchGenerator::new().col(Box::new(
7034            IncrementingInt32::new().named("Filter_me".to_owned()),
7035        ));
7036        Dataset::write(
7037            data_gen.batch(32),
7038            test_uri,
7039            Some(WriteParams {
7040                data_storage_version: Some(data_storage_version),
7041                ..Default::default()
7042            }),
7043        )
7044        .await
7045        .unwrap();
7046
7047        let dataset = Dataset::open(test_uri).await.unwrap();
7048        assert_eq!(32, dataset.count_rows(None).await.unwrap());
7049        assert_eq!(
7050            16,
7051            dataset
7052                .count_rows(Some("`Filter_me` > 15".to_string()))
7053                .await
7054                .unwrap()
7055        );
7056    }
7057
7058    #[rstest]
7059    #[tokio::test]
7060    async fn test_dynamic_projection(
7061        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
7062        data_storage_version: LanceFileVersion,
7063    ) {
7064        let test_dir = TempStrDir::default();
7065        let test_uri = &test_dir;
7066        let mut data_gen =
7067            BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("i".to_owned())));
7068        Dataset::write(
7069            data_gen.batch(32),
7070            test_uri,
7071            Some(WriteParams {
7072                data_storage_version: Some(data_storage_version),
7073                ..Default::default()
7074            }),
7075        )
7076        .await
7077        .unwrap();
7078
7079        let dataset = Dataset::open(test_uri).await.unwrap();
7080        assert_eq!(dataset.count_rows(None).await.unwrap(), 32);
7081
7082        let mut scanner = dataset.scan();
7083
7084        let scan_res = scanner
7085            .project_with_transform(&[("bool", "i > 15")])
7086            .unwrap()
7087            .try_into_batch()
7088            .await
7089            .unwrap();
7090
7091        assert_eq!(1, scan_res.num_columns());
7092
7093        let bool_col = scan_res
7094            .column_by_name("bool")
7095            .expect("bool column should exist");
7096        let bool_arr = bool_col.as_boolean();
7097        for i in 0..32 {
7098            if i > 15 {
7099                assert!(bool_arr.value(i));
7100            } else {
7101                assert!(!bool_arr.value(i));
7102            }
7103        }
7104    }
7105
7106    #[rstest]
7107    #[tokio::test]
7108    async fn test_column_casting_function(
7109        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
7110        data_storage_version: LanceFileVersion,
7111    ) {
7112        let test_dir = TempStrDir::default();
7113        let test_uri = &test_dir;
7114        let mut data_gen =
7115            BatchGenerator::new().col(Box::new(RandomVector::new().named("vec".to_owned())));
7116        Dataset::write(
7117            data_gen.batch(32),
7118            test_uri,
7119            Some(WriteParams {
7120                data_storage_version: Some(data_storage_version),
7121                ..Default::default()
7122            }),
7123        )
7124        .await
7125        .unwrap();
7126
7127        let dataset = Dataset::open(test_uri).await.unwrap();
7128        assert_eq!(dataset.count_rows(None).await.unwrap(), 32);
7129
7130        let mut scanner = dataset.scan();
7131
7132        let scan_res = scanner
7133            .project_with_transform(&[("f16", "_cast_list_f16(vec)")])
7134            .unwrap()
7135            .try_into_batch()
7136            .await
7137            .unwrap();
7138
7139        assert_eq!(1, scan_res.num_columns());
7140        assert_eq!(32, scan_res.num_rows());
7141        assert_eq!("f16", scan_res.schema().field(0).name());
7142
7143        let mut scanner = dataset.scan();
7144        let scan_res_original = scanner
7145            .project(&["vec"])
7146            .unwrap()
7147            .try_into_batch()
7148            .await
7149            .unwrap();
7150
7151        let f32_col: &Float32Array = scan_res_original
7152            .column_by_name("vec")
7153            .unwrap()
7154            .as_fixed_size_list()
7155            .values()
7156            .as_primitive();
7157        let f16_col: &Float16Array = scan_res
7158            .column_by_name("f16")
7159            .unwrap()
7160            .as_fixed_size_list()
7161            .values()
7162            .as_primitive();
7163
7164        for (f32_val, f16_val) in f32_col.iter().zip(f16_col.iter()) {
7165            let f32_val = f32_val.unwrap();
7166            let f16_val = f16_val.unwrap();
7167            assert_eq!(f16::from_f32(f32_val), f16_val);
7168        }
7169    }
7170
    /// Shared test fixture: one dataset with a vector index on `vector` and a scalar
    /// index on `indexed`, plus the version numbers of the dataset states that the
    /// scalar-index tests query.
    struct ScalarIndexTestFixture {
        /// Temporary directory the dataset is written to; kept in the fixture
        /// (presumably so the directory is not cleaned up while the dataset is in use).
        _test_dir: TempStrDir,
        /// Dataset handle used to check out the versions below.
        dataset: Dataset,
        /// The `vector` value of row 50 of the generated data.
        sample_query: Arc<dyn Array>,
        /// The `vector` value of row 75 of the generated data (the row that gets deleted).
        delete_query: Arc<dyn Array>,
        /// The original version of the data, two fragments, rows 0-1000
        original_version: u64,
        /// The original version of the data, 1 row deleted, compacted to a single fragment
        compact_version: u64,
        /// The original version of the data + an extra 1000 unindexed rows
        append_version: u64,
        /// The original version of the data + an extra 1000 rows, with indices updated so all rows indexed
        updated_version: u64,
        /// The original version of the data with 1 deleted row
        delete_version: u64,
        /// The original version of the data + an extra 1000 unindexed rows + 1 deleted row
        append_then_delete_version: u64,
    }
7189
    /// One combination of scan options and dataset state for the scalar-index tests.
    #[derive(Debug, PartialEq)]
    struct ScalarTestParams {
        /// Passed to `Scanner::use_index`.
        use_index: bool,
        /// When set, the scan projects only the `indexed` column.
        use_projection: bool,
        /// Select a dataset version in which the row with `not_indexed = 75` was deleted.
        use_deleted_data: bool,
        /// Select a dataset version that contains the 1000 appended rows.
        use_new_data: bool,
        /// When set, the scan also requests the row id column.
        with_row_id: bool,
        /// Select the compacted version; cannot be combined with deleted/new/updated data.
        use_compaction: bool,
        /// Select the version taken after `optimize_indices`; cannot be combined with
        /// deleted/new data or compaction.
        use_updated: bool,
    }
7200
7201    impl ScalarIndexTestFixture {
7202        async fn new(data_storage_version: LanceFileVersion, use_stable_row_ids: bool) -> Self {
7203            let test_dir = TempStrDir::default();
7204            let test_uri = &test_dir;
7205
7206            // Write 1000 rows.  Train indices.  Then write 1000 new rows with the same vector data.
7207            // Then delete a row from the trained data.
7208            //
7209            // The first row where indexed == 50 is our sample query.
7210            // The first row where indexed == 75 is our deleted row (and delete query)
7211            let data = gen_batch()
7212                .col(
7213                    "vector",
7214                    array::rand_vec::<Float32Type>(Dimension::from(32)),
7215                )
7216                .col("indexed", array::step::<Int32Type>())
7217                .col("not_indexed", array::step::<Int32Type>())
7218                .into_batch_rows(RowCount::from(1000))
7219                .unwrap();
7220
7221            // Write as two batches so we can later compact
7222            let mut dataset = Dataset::write(
7223                RecordBatchIterator::new(vec![Ok(data.clone())], data.schema().clone()),
7224                test_uri,
7225                Some(WriteParams {
7226                    max_rows_per_file: 500,
7227                    data_storage_version: Some(data_storage_version),
7228                    enable_stable_row_ids: use_stable_row_ids,
7229                    ..Default::default()
7230                }),
7231            )
7232            .await
7233            .unwrap();
7234
7235            dataset
7236                .create_index(
7237                    &["vector"],
7238                    IndexType::Vector,
7239                    None,
7240                    &VectorIndexParams::ivf_pq(2, 8, 2, MetricType::L2, 2),
7241                    false,
7242                )
7243                .await
7244                .unwrap();
7245
7246            dataset
7247                .create_index(
7248                    &["indexed"],
7249                    IndexType::Scalar,
7250                    None,
7251                    &ScalarIndexParams::default(),
7252                    false,
7253                )
7254                .await
7255                .unwrap();
7256
7257            let original_version = dataset.version().version;
7258            let sample_query = data["vector"].as_fixed_size_list().value(50);
7259            let delete_query = data["vector"].as_fixed_size_list().value(75);
7260
7261            // APPEND DATA
7262
7263            // Re-use the vector column in the new batch but add 1000 to the indexed/not_indexed columns so
7264            // they are distinct.  This makes our checks easier.
7265            let new_indexed =
7266                arrow_arith::numeric::add(&data["indexed"], &Int32Array::new_scalar(1000)).unwrap();
7267            let new_not_indexed =
7268                arrow_arith::numeric::add(&data["indexed"], &Int32Array::new_scalar(1000)).unwrap();
7269            let append_data = RecordBatch::try_new(
7270                data.schema(),
7271                vec![data["vector"].clone(), new_indexed, new_not_indexed],
7272            )
7273            .unwrap();
7274
7275            dataset
7276                .append(
7277                    RecordBatchIterator::new(vec![Ok(append_data)], data.schema()),
7278                    Some(WriteParams {
7279                        data_storage_version: Some(data_storage_version),
7280                        ..Default::default()
7281                    }),
7282                )
7283                .await
7284                .unwrap();
7285
7286            let append_version = dataset.version().version;
7287
7288            // UPDATE
7289
7290            dataset
7291                .optimize_indices(&OptimizeOptions::merge(1))
7292                .await
7293                .unwrap();
7294            let updated_version = dataset.version().version;
7295
7296            // APPEND -> DELETE
7297
7298            dataset.checkout_version(append_version).await.unwrap();
7299            dataset.restore().await.unwrap();
7300
7301            dataset.delete("not_indexed = 75").await.unwrap();
7302
7303            let append_then_delete_version = dataset.version().version;
7304
7305            // DELETE
7306
7307            let mut dataset = dataset.checkout_version(original_version).await.unwrap();
7308            dataset.restore().await.unwrap();
7309
7310            dataset.delete("not_indexed = 75").await.unwrap();
7311
7312            let delete_version = dataset.version().version;
7313
7314            // COMPACT (this should materialize the deletion)
7315            compact_files(&mut dataset, CompactionOptions::default(), None)
7316                .await
7317                .unwrap();
7318            let compact_version = dataset.version().version;
7319            dataset.checkout_version(original_version).await.unwrap();
7320            dataset.restore().await.unwrap();
7321
7322            Self {
7323                _test_dir: test_dir,
7324                dataset,
7325                sample_query,
7326                delete_query,
7327                original_version,
7328                compact_version,
7329                append_version,
7330                updated_version,
7331                delete_version,
7332                append_then_delete_version,
7333            }
7334        }
7335
        /// Returns the stored sample query vector viewed as a `Float32` primitive array.
        fn sample_query(&self) -> &PrimitiveArray<Float32Type> {
            self.sample_query.as_primitive::<Float32Type>()
        }
7339
        /// Returns the stored delete query vector viewed as a `Float32` primitive array.
        fn delete_query(&self) -> &PrimitiveArray<Float32Type> {
            self.delete_query.as_primitive::<Float32Type>()
        }
7343
7344        async fn get_dataset(&self, params: &ScalarTestParams) -> Dataset {
7345            let version = if params.use_compaction {
7346                // These combinations should not be possible
7347                if params.use_deleted_data || params.use_new_data || params.use_updated {
7348                    panic!(
7349                        "There is no test data combining new/deleted/updated data with compaction"
7350                    );
7351                } else {
7352                    self.compact_version
7353                }
7354            } else if params.use_updated {
7355                // These combinations should not be possible
7356                if params.use_deleted_data || params.use_new_data || params.use_compaction {
7357                    panic!(
7358                        "There is no test data combining updated data with new/deleted/compaction"
7359                    );
7360                } else {
7361                    self.updated_version
7362                }
7363            } else {
7364                match (params.use_new_data, params.use_deleted_data) {
7365                    (false, false) => self.original_version,
7366                    (false, true) => self.delete_version,
7367                    (true, false) => self.append_version,
7368                    (true, true) => self.append_then_delete_version,
7369                }
7370            };
7371            self.dataset.checkout_version(version).await.unwrap()
7372        }
7373
7374        async fn run_query(
7375            &self,
7376            query: &str,
7377            vector: Option<&PrimitiveArray<Float32Type>>,
7378            params: &ScalarTestParams,
7379        ) -> (String, RecordBatch) {
7380            let dataset = self.get_dataset(params).await;
7381            let mut scan = dataset.scan();
7382            if let Some(vector) = vector {
7383                scan.nearest("vector", vector, 10).unwrap();
7384            }
7385            if params.use_projection {
7386                scan.project(&["indexed"]).unwrap();
7387            }
7388            if params.with_row_id {
7389                scan.with_row_id();
7390            }
7391            scan.scan_in_order(true);
7392            scan.use_index(params.use_index);
7393            scan.filter(query).unwrap();
7394            scan.prefilter(true);
7395
7396            let plan = scan.explain_plan(true).await.unwrap();
7397            let batch = scan.try_into_batch().await.unwrap();
7398
7399            if params.use_projection {
7400                // 1 projected column
7401                let mut expected_columns = 1;
7402                if vector.is_some() {
7403                    // distance column if included always (TODO: it shouldn't)
7404                    expected_columns += 1;
7405                }
7406                if params.with_row_id {
7407                    expected_columns += 1;
7408                }
7409                assert_eq!(batch.num_columns(), expected_columns);
7410            } else {
7411                let mut expected_columns = 3;
7412                if vector.is_some() {
7413                    // distance column
7414                    expected_columns += 1;
7415                }
7416                if params.with_row_id {
7417                    expected_columns += 1;
7418                }
7419                // vector, indexed, not_indexed, _distance
7420                assert_eq!(batch.num_columns(), expected_columns);
7421            }
7422
7423            (plan, batch)
7424        }
7425
7426        fn assert_none<F: Fn(i32) -> bool>(
7427            &self,
7428            batch: &RecordBatch,
7429            predicate: F,
7430            message: &str,
7431        ) {
7432            let indexed = batch["indexed"].as_primitive::<Int32Type>();
7433            if indexed.iter().map(|val| val.unwrap()).any(predicate) {
7434                panic!("{}", message);
7435            }
7436        }
7437
7438        fn assert_one<F: Fn(i32) -> bool>(&self, batch: &RecordBatch, predicate: F, message: &str) {
7439            let indexed = batch["indexed"].as_primitive::<Int32Type>();
7440            if !indexed.iter().map(|val| val.unwrap()).any(predicate) {
7441                panic!("{}", message);
7442            }
7443        }
7444
7445        async fn check_vector_scalar_indexed_and_refine(&self, params: &ScalarTestParams) {
7446            let (query_plan, batch) = self
7447                .run_query(
7448                    "indexed != 50 AND ((not_indexed < 100) OR (not_indexed >= 1000 AND not_indexed < 1100))",
7449                    Some(self.sample_query()),
7450                    params,
7451                )
7452                .await;
7453            // Materialization is always required if there is a refine
7454            if self.dataset.is_legacy_storage() {
7455                assert!(query_plan.contains("MaterializeIndex"));
7456            }
7457            // The result should not include the sample query
7458            self.assert_none(
7459                &batch,
7460                |val| val == 50,
7461                "The query contained 50 even though it was filtered",
7462            );
7463            if !params.use_new_data {
7464                // Refine should have been applied
7465                self.assert_none(
7466                    &batch,
7467                    |val| (100..1000).contains(&val) || (val >= 1100),
7468                    "The non-indexed refine filter was not applied",
7469                );
7470            }
7471
7472            // If there is new data then the dupe of row 50 should be in the results
7473            if params.use_new_data || params.use_updated {
7474                self.assert_one(
7475                    &batch,
7476                    |val| val == 1050,
7477                    "The query did not contain 1050 from the new data",
7478                );
7479            }
7480        }
7481
7482        async fn check_vector_scalar_indexed_only(&self, params: &ScalarTestParams) {
7483            let (query_plan, batch) = self
7484                .run_query("indexed != 50", Some(self.sample_query()), params)
7485                .await;
7486            if self.dataset.is_legacy_storage() {
7487                if params.use_index {
7488                    // An ANN search whose prefilter is fully satisfied by the index should be
7489                    // able to use a ScalarIndexQuery
7490                    assert!(query_plan.contains("ScalarIndexQuery"));
7491                } else {
7492                    // A KNN search requires materialization of the index
7493                    assert!(query_plan.contains("MaterializeIndex"));
7494                }
7495            }
7496            // The result should not include the sample query
7497            self.assert_none(
7498                &batch,
7499                |val| val == 50,
7500                "The query contained 50 even though it was filtered",
7501            );
7502            // If there is new data then the dupe of row 50 should be in the results
7503            if params.use_new_data {
7504                self.assert_one(
7505                    &batch,
7506                    |val| val == 1050,
7507                    "The query did not contain 1050 from the new data",
7508                );
7509                if !params.use_new_data {
7510                    // Let's also make sure our filter can target something in the new data only
7511                    let (_, batch) = self
7512                        .run_query("indexed == 1050", Some(self.sample_query()), params)
7513                        .await;
7514                    assert_eq!(batch.num_rows(), 1);
7515                }
7516            }
7517            if params.use_deleted_data {
7518                let (_, batch) = self
7519                    .run_query("indexed == 75", Some(self.delete_query()), params)
7520                    .await;
7521                if !params.use_new_data {
7522                    assert_eq!(batch.num_rows(), 0);
7523                }
7524            }
7525        }
7526
        /// Runs both vector-search checks (indexed-only filter, then indexed + refine
        /// filter) for `params`.
        async fn check_vector_queries(&self, params: &ScalarTestParams) {
            self.check_vector_scalar_indexed_only(params).await;
            self.check_vector_scalar_indexed_and_refine(params).await;
        }
7531
7532        async fn check_simple_indexed_only(&self, params: &ScalarTestParams) {
7533            let (query_plan, batch) = self.run_query("indexed != 50", None, params).await;
7534            // Materialization is always required for non-vector search
7535            if self.dataset.is_legacy_storage() {
7536                assert!(query_plan.contains("MaterializeIndex"));
7537            } else {
7538                assert!(query_plan.contains("LanceRead"));
7539            }
7540            // The result should not include the targeted row
7541            self.assert_none(
7542                &batch,
7543                |val| val == 50,
7544                "The query contained 50 even though it was filtered",
7545            );
7546            let mut expected_num_rows = if params.use_new_data || params.use_updated {
7547                1999
7548            } else {
7549                999
7550            };
7551            if params.use_deleted_data || params.use_compaction {
7552                expected_num_rows -= 1;
7553            }
7554            assert_eq!(batch.num_rows(), expected_num_rows);
7555
7556            // Let's also make sure our filter can target something in the new data only
7557            if params.use_new_data || params.use_updated {
7558                let (_, batch) = self.run_query("indexed == 1050", None, params).await;
7559                assert_eq!(batch.num_rows(), 1);
7560            }
7561
7562            // Also make sure we don't return deleted data
7563            if params.use_deleted_data || params.use_compaction {
7564                let (_, batch) = self.run_query("indexed == 75", None, params).await;
7565                assert_eq!(batch.num_rows(), 0);
7566            }
7567        }
7568
7569        async fn check_simple_indexed_and_refine(&self, params: &ScalarTestParams) {
7570            let (query_plan, batch) = self.run_query(
7571                "indexed != 50 AND ((not_indexed < 100) OR (not_indexed >= 1000 AND not_indexed < 1100))",
7572                None,
7573                params
7574            ).await;
7575            // Materialization is always required for non-vector search
7576            if self.dataset.is_legacy_storage() {
7577                assert!(query_plan.contains("MaterializeIndex"));
7578            } else {
7579                assert!(query_plan.contains("LanceRead"));
7580            }
7581            // The result should not include the targeted row
7582            self.assert_none(
7583                &batch,
7584                |val| val == 50,
7585                "The query contained 50 even though it was filtered",
7586            );
7587            // The refine should be applied
7588            self.assert_none(
7589                &batch,
7590                |val| (100..1000).contains(&val) || (val >= 1100),
7591                "The non-indexed refine filter was not applied",
7592            );
7593
7594            let mut expected_num_rows = if params.use_new_data || params.use_updated {
7595                199
7596            } else {
7597                99
7598            };
7599            if params.use_deleted_data || params.use_compaction {
7600                expected_num_rows -= 1;
7601            }
7602            assert_eq!(batch.num_rows(), expected_num_rows);
7603        }
7604
        /// Runs both non-vector checks (indexed-only filter, then indexed + refine
        /// filter) for `params`.
        async fn check_simple_queries(&self, params: &ScalarTestParams) {
            self.check_simple_indexed_only(params).await;
            self.check_simple_indexed_and_refine(params).await;
        }
7609    }
7610
7611    // There are many different ways that a query can be run and they all have slightly different
7612    // effects on the plan that gets built.  This test attempts to run the same queries in various
7613    // different configurations to ensure that we get consistent results
7614    #[rstest]
7615    #[tokio::test]
7616    async fn test_secondary_index_scans(
7617        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
7618        data_storage_version: LanceFileVersion,
7619        #[values(false, true)] use_stable_row_ids: bool,
7620    ) {
7621        let fixture = Box::pin(ScalarIndexTestFixture::new(
7622            data_storage_version,
7623            use_stable_row_ids,
7624        ))
7625        .await;
7626
7627        for use_index in [false, true] {
7628            for use_projection in [false, true] {
7629                for use_deleted_data in [false, true] {
7630                    for use_new_data in [false, true] {
7631                        // Don't test compaction in conjunction with deletion and new data, it's too
7632                        // many combinations with no clear benefit.  Feel free to update if there is
7633                        // a need
7634                        // TODO: enable compaction for stable row id once supported.
7635                        let compaction_choices =
7636                            if use_deleted_data || use_new_data || use_stable_row_ids {
7637                                vec![false]
7638                            } else {
7639                                vec![false, true]
7640                            };
7641                        for use_compaction in compaction_choices {
7642                            let updated_choices =
7643                                if use_deleted_data || use_new_data || use_compaction {
7644                                    vec![false]
7645                                } else {
7646                                    vec![false, true]
7647                                };
7648                            for use_updated in updated_choices {
7649                                for with_row_id in [false, true] {
7650                                    let params = ScalarTestParams {
7651                                        use_index,
7652                                        use_projection,
7653                                        use_deleted_data,
7654                                        use_new_data,
7655                                        with_row_id,
7656                                        use_compaction,
7657                                        use_updated,
7658                                    };
7659                                    fixture.check_vector_queries(&params).await;
7660                                    fixture.check_simple_queries(&params).await;
7661                                }
7662                            }
7663                        }
7664                    }
7665                }
7666            }
7667        }
7668    }
7669
7670    #[tokio::test]
7671    async fn can_filter_row_id() {
7672        let dataset = lance_datagen::gen_batch()
7673            .col("x", array::step::<Int32Type>())
7674            .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(1000))
7675            .await
7676            .unwrap();
7677
7678        let mut scan = dataset.scan();
7679        scan.with_row_id();
7680        scan.project::<&str>(&[]).unwrap();
7681        scan.filter("_rowid == 50").unwrap();
7682        let batch = scan.try_into_batch().await.unwrap();
7683        assert_eq!(batch.num_rows(), 1);
7684        assert_eq!(batch.column(0).as_primitive::<UInt64Type>().values()[0], 50);
7685    }
7686
7687    #[rstest]
7688    #[tokio::test]
7689    async fn test_index_take_batch_size() {
7690        let fixture = Box::pin(ScalarIndexTestFixture::new(LanceFileVersion::Stable, false)).await;
7691        let stream = fixture
7692            .dataset
7693            .scan()
7694            .filter("indexed > 0")
7695            .unwrap()
7696            .batch_size(16)
7697            .try_into_stream()
7698            .await
7699            .unwrap();
7700        let batches = stream.collect::<Vec<_>>().await;
7701        assert_eq!(batches.len(), 1000_usize.div_ceil(16));
7702    }
7703
7704    /// Assert that the plan when formatted matches the expected string.
7705    ///
7706    /// Within expected, you can use `...` to match any number of characters.
7707    async fn assert_plan_equals(
7708        dataset: &Dataset,
7709        plan: impl Fn(&mut Scanner) -> Result<&mut Scanner>,
7710        expected: &str,
7711    ) -> Result<()> {
7712        let mut scan = dataset.scan();
7713        plan(&mut scan)?;
7714        let exec_plan = scan.create_plan().await?;
7715        assert_plan_node_equals(exec_plan, expected).await
7716    }
7717
7718    #[tokio::test]
7719    async fn test_inexact_scalar_index_plans() {
7720        let data = gen_batch()
7721            .col("ngram", array::rand_utf8(ByteCount::from(5), false))
7722            .col("exact", array::rand_type(&DataType::UInt32))
7723            .col("no_index", array::rand_type(&DataType::UInt32))
7724            .into_reader_rows(RowCount::from(1000), BatchCount::from(5));
7725
7726        let mut dataset = Dataset::write(data, "memory://test", None).await.unwrap();
7727        dataset
7728            .create_index(
7729                &["ngram"],
7730                IndexType::NGram,
7731                None,
7732                &ScalarIndexParams::default(),
7733                true,
7734            )
7735            .await
7736            .unwrap();
7737        dataset
7738            .create_index(
7739                &["exact"],
7740                IndexType::BTree,
7741                None,
7742                &ScalarIndexParams::default(),
7743                true,
7744            )
7745            .await
7746            .unwrap();
7747
7748        // Simple in-exact filter
7749        assert_plan_equals(
7750            &dataset,
7751            |scanner| scanner.filter("contains(ngram, 'test string')"),
7752            "LanceRead: uri=..., projection=[ngram, exact, no_index], num_fragments=1, \
7753             range_before=None, range_after=None, row_id=false, row_addr=false, \
7754             full_filter=contains(ngram, Utf8(\"test string\")), refine_filter=--
7755               ScalarIndexQuery: query=[contains(ngram, Utf8(\"test string\"))]@ngram_idx(NGram)",
7756        )
7757        .await
7758        .unwrap();
7759
7760        // Combined with exact filter
7761        assert_plan_equals(
7762            &dataset,
7763            |scanner| scanner.filter("contains(ngram, 'test string') and exact < 50"),
7764            "LanceRead: uri=..., projection=[ngram, exact, no_index], num_fragments=1, \
7765            range_before=None, range_after=None, row_id=false, row_addr=false, \
7766            full_filter=contains(ngram, Utf8(\"test string\")) AND exact < UInt32(50), \
7767            refine_filter=--
7768              ScalarIndexQuery: query=AND([contains(ngram, Utf8(\"test string\"))]@ngram_idx(NGram),[exact < 50]@exact_idx(BTree))",
7769        )
7770        .await
7771        .unwrap();
7772
7773        // All three filters
7774        assert_plan_equals(
7775            &dataset,
7776            |scanner| {
7777                scanner.filter("contains(ngram, 'test string') and exact < 50 AND no_index > 100")
7778            },
7779            "ProjectionExec: expr=[ngram@0 as ngram, exact@1 as exact, no_index@2 as no_index]
7780  LanceRead: uri=..., projection=[ngram, exact, no_index], num_fragments=1, range_before=None, \
7781  range_after=None, row_id=true, row_addr=false, full_filter=contains(ngram, Utf8(\"test string\")) AND exact < UInt32(50) AND no_index > UInt32(100), \
7782  refine_filter=no_index > UInt32(100)
7783    ScalarIndexQuery: query=AND([contains(ngram, Utf8(\"test string\"))]@ngram_idx(NGram),[exact < 50]@exact_idx(BTree))",
7784        )
7785        .await
7786        .unwrap();
7787    }
7788
7789    #[tokio::test]
7790    async fn test_like_prefix_with_btree_index() {
7791        // Create dataset with string data that has various prefixes
7792        // Avoid LIKE special characters (%, _) in data to keep tests simple
7793        let data = gen_batch()
7794            .col(
7795                "name",
7796                array::cycle_utf8_literals(&[
7797                    "apple",
7798                    "application",
7799                    "app",
7800                    "banana",
7801                    "band",
7802                    "testns1",
7803                    "testns2",
7804                    "test",
7805                    "testing",
7806                    "zoo",
7807                ]),
7808            )
7809            .col("id", array::step::<Int32Type>())
7810            .into_reader_rows(RowCount::from(100), BatchCount::from(1));
7811
7812        let mut dataset = Dataset::write(data, "memory://test_like", None)
7813            .await
7814            .unwrap();
7815
7816        // Create BTree index on string column
7817        dataset
7818            .create_index(
7819                &["name"],
7820                IndexType::BTree,
7821                None,
7822                &ScalarIndexParams::default(),
7823                true,
7824            )
7825            .await
7826            .unwrap();
7827
7828        // Test 1: Verify LIKE 'app%' uses scalar index and returns correct results
7829        assert_plan_equals(
7830            &dataset,
7831            |scanner| scanner.filter("name LIKE 'app%'"),
7832            "LanceRead: uri=..., projection=[name, id], num_fragments=1, \
7833             range_before=None, range_after=None, row_id=false, row_addr=false, \
7834             full_filter=name LIKE Utf8(\"app%\"), refine_filter=--
7835               ScalarIndexQuery: query=[name LIKE 'app%']@name_idx(BTree)",
7836        )
7837        .await
7838        .unwrap();
7839
7840        // Verify correct results for LIKE 'app%'
7841        let results = dataset
7842            .scan()
7843            .filter("name LIKE 'app%'")
7844            .unwrap()
7845            .try_into_batch()
7846            .await
7847            .unwrap();
7848        let names: Vec<&str> = results
7849            .column_by_name("name")
7850            .unwrap()
7851            .as_any()
7852            .downcast_ref::<StringArray>()
7853            .unwrap()
7854            .iter()
7855            .map(|s| s.unwrap())
7856            .collect();
7857        // Should match: apple, application, app (repeated in cycle)
7858        assert!(names.iter().all(|n| n.starts_with("app")));
7859        assert!(!names.is_empty());
7860
7861        // Test 2: Verify starts_with() uses scalar index (simple prefix without special chars)
7862        // Note: DataFusion optimizes starts_with() to LIKE before our index planning
7863        assert_plan_equals(
7864            &dataset,
7865            |scanner| scanner.filter("starts_with(name, 'ban')"),
7866            "LanceRead: uri=..., projection=[name, id], num_fragments=1, \
7867             range_before=None, range_after=None, row_id=false, row_addr=false, \
7868             full_filter=name LIKE Utf8(\"ban%\"), refine_filter=--
7869               ScalarIndexQuery: query=[name LIKE 'ban%']@name_idx(BTree)",
7870        )
7871        .await
7872        .unwrap();
7873
7874        // Verify correct results for starts_with
7875        let results = dataset
7876            .scan()
7877            .filter("starts_with(name, 'ban')")
7878            .unwrap()
7879            .try_into_batch()
7880            .await
7881            .unwrap();
7882        let names: Vec<&str> = results
7883            .column_by_name("name")
7884            .unwrap()
7885            .as_any()
7886            .downcast_ref::<StringArray>()
7887            .unwrap()
7888            .iter()
7889            .map(|s| s.unwrap())
7890            .collect();
7891        // Should match: banana, band
7892        assert!(names.iter().all(|n| n.starts_with("ban")));
7893        assert!(!names.is_empty());
7894
7895        // Test 3: LIKE with pattern requiring refine (e.g., 'test%2')
7896        assert_plan_equals(
7897            &dataset,
7898            |scanner| scanner.filter("name LIKE 'test%2'"),
7899            "ProjectionExec: expr=[name@0 as name, id@1 as id]
7900  LanceRead: uri=..., projection=[name, id], num_fragments=1, \
7901range_before=None, range_after=None, row_id=true, row_addr=false, \
7902full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\")
7903    ScalarIndexQuery: query=[name LIKE 'test%']@name_idx(BTree)",
7904        )
7905        .await
7906        .unwrap();
7907
7908        // Verify correct results for LIKE 'test%2' (needs refine)
7909        let results = dataset
7910            .scan()
7911            .filter("name LIKE 'test%2'")
7912            .unwrap()
7913            .try_into_batch()
7914            .await
7915            .unwrap();
7916        let names: Vec<&str> = results
7917            .column_by_name("name")
7918            .unwrap()
7919            .as_any()
7920            .downcast_ref::<StringArray>()
7921            .unwrap()
7922            .iter()
7923            .map(|s| s.unwrap())
7924            .collect();
7925        // Should match: testns2 (ends with '2')
7926        assert!(
7927            names
7928                .iter()
7929                .all(|n| n.starts_with("test") && n.ends_with("2"))
7930        );
7931
7932        // Test 4: LIKE starting with wildcard should NOT use scalar index for pruning
7933        // Verify by checking the plan does NOT have ScalarIndexQuery
7934        let mut scanner = dataset.scan();
7935        scanner.filter("name LIKE '%app%'").unwrap();
7936        let plan = scanner.create_plan().await.unwrap();
7937        let plan_str = format!("{:?}", plan);
7938        assert!(
7939            !plan_str.contains("ScalarIndexQuery"),
7940            "LIKE '%app%' should not use scalar index, but got: {}",
7941            plan_str
7942        );
7943
7944        // Verify correct results for LIKE '%app%'
7945        let results = dataset
7946            .scan()
7947            .filter("name LIKE '%app%'")
7948            .unwrap()
7949            .try_into_batch()
7950            .await
7951            .unwrap();
7952        let names: Vec<&str> = results
7953            .column_by_name("name")
7954            .unwrap()
7955            .as_any()
7956            .downcast_ref::<StringArray>()
7957            .unwrap()
7958            .iter()
7959            .map(|s| s.unwrap())
7960            .collect();
7961        // Should match: apple, application, app (contain 'app')
7962        assert!(names.iter().all(|n| n.contains("app")));
7963
7964        // Test 5: NOT LIKE should NOT use scalar index
7965        let mut scanner = dataset.scan();
7966        scanner.filter("name NOT LIKE 'app%'").unwrap();
7967        let plan = scanner.create_plan().await.unwrap();
7968        let plan_str = format!("{:?}", plan);
7969        assert!(
7970            !plan_str.contains("ScalarIndexQuery"),
7971            "NOT LIKE should not use scalar index, but got: {}",
7972            plan_str
7973        );
7974    }
7975
7976    #[tokio::test]
7977    async fn test_like_prefix_correctness_with_btree_index() {
7978        // Create dataset with deterministic string data for exact result verification
7979        let names: Vec<&str> = vec![
7980            "alpha", "alphabet", "beta", "gamma", "delta", "epsilon", "eta", "theta", "iota",
7981            "kappa",
7982        ];
7983        let data = RecordBatch::try_new(
7984            Arc::new(ArrowSchema::new(vec![
7985                ArrowField::new("name", DataType::Utf8, false),
7986                ArrowField::new("id", DataType::Int32, false),
7987            ])),
7988            vec![
7989                Arc::new(StringArray::from(names.clone())),
7990                Arc::new(Int32Array::from_iter_values(0..10)),
7991            ],
7992        )
7993        .unwrap();
7994
7995        let reader = RecordBatchIterator::new(
7996            vec![Ok(data)],
7997            Arc::new(ArrowSchema::new(vec![
7998                ArrowField::new("name", DataType::Utf8, false),
7999                ArrowField::new("id", DataType::Int32, false),
8000            ])),
8001        );
8002
8003        let mut dataset = Dataset::write(reader, "memory://test_like_correctness", None)
8004            .await
8005            .unwrap();
8006
8007        // Create BTree index
8008        dataset
8009            .create_index(
8010                &["name"],
8011                IndexType::BTree,
8012                None,
8013                &ScalarIndexParams::default(),
8014                true,
8015            )
8016            .await
8017            .unwrap();
8018
8019        // Test with index
8020        let with_index = dataset
8021            .scan()
8022            .filter("name LIKE 'alpha%'")
8023            .unwrap()
8024            .try_into_batch()
8025            .await
8026            .unwrap();
8027
8028        // Test without index (for comparison)
8029        let without_index = dataset
8030            .scan()
8031            .use_scalar_index(false)
8032            .filter("name LIKE 'alpha%'")
8033            .unwrap()
8034            .try_into_batch()
8035            .await
8036            .unwrap();
8037
8038        // Both should return same results: alpha, alphabet
8039        assert_eq!(with_index.num_rows(), without_index.num_rows());
8040        assert_eq!(with_index.num_rows(), 2);
8041
8042        let with_index_names: BTreeSet<String> = with_index
8043            .column_by_name("name")
8044            .unwrap()
8045            .as_any()
8046            .downcast_ref::<StringArray>()
8047            .unwrap()
8048            .iter()
8049            .map(|s| s.unwrap().to_string())
8050            .collect();
8051
8052        let without_index_names: BTreeSet<String> = without_index
8053            .column_by_name("name")
8054            .unwrap()
8055            .as_any()
8056            .downcast_ref::<StringArray>()
8057            .unwrap()
8058            .iter()
8059            .map(|s| s.unwrap().to_string())
8060            .collect();
8061
8062        assert_eq!(with_index_names, without_index_names);
8063        assert_eq!(
8064            with_index_names,
8065            BTreeSet::from(["alpha".to_string(), "alphabet".to_string()])
8066        );
8067
8068        // Test starts_with correctness
8069        let starts_with_result = dataset
8070            .scan()
8071            .filter("starts_with(name, 'e')")
8072            .unwrap()
8073            .try_into_batch()
8074            .await
8075            .unwrap();
8076
8077        let starts_with_names: BTreeSet<String> = starts_with_result
8078            .column_by_name("name")
8079            .unwrap()
8080            .as_any()
8081            .downcast_ref::<StringArray>()
8082            .unwrap()
8083            .iter()
8084            .map(|s| s.unwrap().to_string())
8085            .collect();
8086
8087        // Should match: epsilon, eta
8088        assert_eq!(
8089            starts_with_names,
8090            BTreeSet::from(["epsilon".to_string(), "eta".to_string()])
8091        );
8092    }
8093
8094    #[tokio::test]
8095    async fn test_like_prefix_with_zone_map() {
8096        use lance_index::scalar::BuiltinIndexType;
8097
8098        // Create dataset with string data that has various prefixes
8099        let data = gen_batch()
8100            .col(
8101                "name",
8102                array::cycle_utf8_literals(&[
8103                    "apple",
8104                    "application",
8105                    "app",
8106                    "banana",
8107                    "band",
8108                    "testns1",
8109                    "testns2",
8110                    "test",
8111                    "testing",
8112                    "zoo",
8113                ]),
8114            )
8115            .col("id", array::step::<Int32Type>())
8116            .into_reader_rows(RowCount::from(100), BatchCount::from(1));
8117
8118        let mut dataset = Dataset::write(data, "memory://test_like_zonemap", None)
8119            .await
8120            .unwrap();
8121
8122        // Create ZoneMap index on string column
8123        let params = ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap);
8124        dataset
8125            .create_index(
8126                &["name"],
8127                IndexType::Scalar,
8128                Some("name_zonemap".to_string()),
8129                &params,
8130                true,
8131            )
8132            .await
8133            .unwrap();
8134
8135        // Test 1: Verify LIKE 'app%' uses zone map index
8136        let mut scanner = dataset.scan();
8137        scanner.filter("name LIKE 'app%'").unwrap();
8138        let plan = scanner.create_plan().await.unwrap();
8139        let plan_str = format!("{:?}", plan);
8140        // Zone map uses ScalarIndexExec with LikePrefix query
8141        assert!(
8142            plan_str.contains("ScalarIndexExec") && plan_str.contains("LikePrefix"),
8143            "LIKE 'app%' should use zone map index with LikePrefix, but got: {}",
8144            plan_str
8145        );
8146
8147        // Verify correct results for LIKE 'app%'
8148        let results = dataset
8149            .scan()
8150            .filter("name LIKE 'app%'")
8151            .unwrap()
8152            .try_into_batch()
8153            .await
8154            .unwrap();
8155        let names: Vec<&str> = results
8156            .column_by_name("name")
8157            .unwrap()
8158            .as_any()
8159            .downcast_ref::<StringArray>()
8160            .unwrap()
8161            .iter()
8162            .map(|s| s.unwrap())
8163            .collect();
8164        assert!(names.iter().all(|n| n.starts_with("app")));
8165        assert!(!names.is_empty());
8166
8167        // Test 2: Verify starts_with() uses zone map index
8168        let mut scanner = dataset.scan();
8169        scanner.filter("starts_with(name, 'ban')").unwrap();
8170        let plan = scanner.create_plan().await.unwrap();
8171        let plan_str = format!("{:?}", plan);
8172        assert!(
8173            plan_str.contains("ScalarIndexExec") && plan_str.contains("LikePrefix"),
8174            "starts_with should use zone map index with LikePrefix, but got: {}",
8175            plan_str
8176        );
8177
8178        // Verify correct results
8179        let results = dataset
8180            .scan()
8181            .filter("starts_with(name, 'ban')")
8182            .unwrap()
8183            .try_into_batch()
8184            .await
8185            .unwrap();
8186        let names: Vec<&str> = results
8187            .column_by_name("name")
8188            .unwrap()
8189            .as_any()
8190            .downcast_ref::<StringArray>()
8191            .unwrap()
8192            .iter()
8193            .map(|s| s.unwrap())
8194            .collect();
8195        assert!(names.iter().all(|n| n.starts_with("ban")));
8196
8197        // Test 3: LIKE with refine pattern still uses zone map for prefix pruning
8198        let mut scanner = dataset.scan();
8199        scanner.filter("name LIKE 'test%2'").unwrap();
8200        let plan = scanner.create_plan().await.unwrap();
8201        let plan_str = format!("{:?}", plan);
8202        assert!(
8203            plan_str.contains("ScalarIndexExec") && plan_str.contains("LikePrefix"),
8204            "LIKE 'test%2' should use zone map index for prefix, but got: {}",
8205            plan_str
8206        );
8207
8208        // Test 4: LIKE starting with wildcard should NOT use zone map
8209        let mut scanner = dataset.scan();
8210        scanner.filter("name LIKE '%app%'").unwrap();
8211        let plan = scanner.create_plan().await.unwrap();
8212        let plan_str = format!("{:?}", plan);
8213        assert!(
8214            !plan_str.contains("LikePrefix"),
8215            "LIKE '%app%' should not use LikePrefix index, but got: {}",
8216            plan_str
8217        );
8218    }
8219
8220    #[tokio::test]
8221    async fn test_like_prefix_with_segmented_zone_map() {
8222        use lance_index::scalar::BuiltinIndexType;
8223
8224        let data = gen_batch()
8225            .col(
8226                "name",
8227                array::cycle_utf8_literals(&[
8228                    "apple",
8229                    "application",
8230                    "app",
8231                    "banana",
8232                    "band",
8233                    "testns1",
8234                    "testns2",
8235                    "test",
8236                    "testing",
8237                    "zoo",
8238                ]),
8239            )
8240            .col("id", array::step::<Int32Type>())
8241            .into_reader_rows(RowCount::from(150), BatchCount::from(6));
8242
8243        let write_params = WriteParams {
8244            max_rows_per_file: 25,
8245            max_rows_per_group: 10,
8246            ..Default::default()
8247        };
8248
8249        let mut dataset = Dataset::write(
8250            data,
8251            "memory://test_like_segmented_zonemap",
8252            Some(write_params),
8253        )
8254        .await
8255        .unwrap();
8256
8257        let fragments = dataset.get_fragments();
8258        assert!(fragments.len() > 1, "expected multiple fragments");
8259
8260        let params = ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap);
8261        let mut segments = Vec::with_capacity(fragments.len());
8262        for fragment in &fragments {
8263            let mut builder = dataset.create_index_builder(&["name"], IndexType::Scalar, &params);
8264            builder = builder
8265                .name("name_zonemap".to_string())
8266                .fragments(vec![fragment.id() as u32]);
8267            segments.push(builder.execute_uncommitted().await.unwrap());
8268        }
8269
8270        dataset
8271            .commit_existing_index_segments("name_zonemap", "name", segments)
8272            .await
8273            .unwrap();
8274
8275        let committed = dataset.load_indices_by_name("name_zonemap").await.unwrap();
8276        assert_eq!(committed.len(), fragments.len());
8277
8278        let mut scanner = dataset.scan();
8279        scanner.filter("name LIKE 'app%'").unwrap();
8280        let plan = scanner.create_plan().await.unwrap();
8281        let plan_str = format!("{:?}", plan);
8282        assert!(
8283            plan_str.contains("ScalarIndexExec") && plan_str.contains("LikePrefix"),
8284            "segmented zonemap should use LikePrefix pruning, but got: {}",
8285            plan_str
8286        );
8287
8288        let with_index = dataset
8289            .scan()
8290            .filter("name LIKE 'app%'")
8291            .unwrap()
8292            .try_into_batch()
8293            .await
8294            .unwrap();
8295        let without_index = dataset
8296            .scan()
8297            .use_scalar_index(false)
8298            .filter("name LIKE 'app%'")
8299            .unwrap()
8300            .try_into_batch()
8301            .await
8302            .unwrap();
8303
8304        let with_index_ids = with_index
8305            .column_by_name("id")
8306            .unwrap()
8307            .as_primitive::<Int32Type>()
8308            .values()
8309            .iter()
8310            .copied()
8311            .collect::<BTreeSet<_>>();
8312        let without_index_ids = without_index
8313            .column_by_name("id")
8314            .unwrap()
8315            .as_primitive::<Int32Type>()
8316            .values()
8317            .iter()
8318            .copied()
8319            .collect::<BTreeSet<_>>();
8320        assert_eq!(with_index_ids, without_index_ids);
8321        assert!(!with_index_ids.is_empty());
8322
8323        let names = with_index
8324            .column_by_name("name")
8325            .unwrap()
8326            .as_any()
8327            .downcast_ref::<StringArray>()
8328            .unwrap()
8329            .iter()
8330            .map(|value| value.unwrap())
8331            .collect::<Vec<_>>();
8332        assert!(names.iter().all(|name| name.starts_with("app")));
8333    }
8334
8335    #[tokio::test]
8336    async fn test_like_prefix_with_segmented_btree() {
8337        let data = gen_batch()
8338            .col(
8339                "name",
8340                array::cycle_utf8_literals(&[
8341                    "apple",
8342                    "application",
8343                    "app",
8344                    "banana",
8345                    "band",
8346                    "testns1",
8347                    "testns2",
8348                    "test",
8349                    "testing",
8350                    "zoo",
8351                ]),
8352            )
8353            .col("id", array::step::<Int32Type>())
8354            .into_reader_rows(RowCount::from(150), BatchCount::from(6));
8355
8356        let write_params = WriteParams {
8357            max_rows_per_file: 25,
8358            max_rows_per_group: 10,
8359            ..Default::default()
8360        };
8361
8362        let mut dataset = Dataset::write(
8363            data,
8364            "memory://test_like_segmented_btree",
8365            Some(write_params),
8366        )
8367        .await
8368        .unwrap();
8369
8370        let fragments = dataset.get_fragments();
8371        assert!(fragments.len() > 1, "expected multiple fragments");
8372
8373        let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree);
8374        let mut segments = Vec::with_capacity(fragments.len());
8375        for fragment in &fragments {
8376            let mut builder = dataset.create_index_builder(&["name"], IndexType::BTree, &params);
8377            builder = builder
8378                .name("name_btree".to_string())
8379                .fragments(vec![fragment.id() as u32]);
8380            segments.push(builder.execute_uncommitted().await.unwrap());
8381        }
8382
8383        dataset
8384            .commit_existing_index_segments("name_btree", "name", segments)
8385            .await
8386            .unwrap();
8387
8388        let committed = dataset.load_indices_by_name("name_btree").await.unwrap();
8389        assert_eq!(committed.len(), fragments.len());
8390
8391        let mut scanner = dataset.scan();
8392        scanner.filter("name LIKE 'app%'").unwrap();
8393        let plan = scanner.create_plan().await.unwrap();
8394        let plan_str = format!("{:?}", plan);
8395        assert!(
8396            plan_str.contains("ScalarIndexExec") && plan_str.contains("LikePrefix(Utf8(\"app\"))"),
8397            "segmented btree should use scalar index pruning, but got: {}",
8398            plan_str
8399        );
8400
8401        let with_index = dataset
8402            .scan()
8403            .filter("name LIKE 'app%'")
8404            .unwrap()
8405            .try_into_batch()
8406            .await
8407            .unwrap();
8408        let without_index = dataset
8409            .scan()
8410            .use_scalar_index(false)
8411            .filter("name LIKE 'app%'")
8412            .unwrap()
8413            .try_into_batch()
8414            .await
8415            .unwrap();
8416
8417        let with_index_ids = with_index
8418            .column_by_name("id")
8419            .unwrap()
8420            .as_primitive::<Int32Type>()
8421            .values()
8422            .iter()
8423            .copied()
8424            .collect::<BTreeSet<_>>();
8425        let without_index_ids = without_index
8426            .column_by_name("id")
8427            .unwrap()
8428            .as_primitive::<Int32Type>()
8429            .values()
8430            .iter()
8431            .copied()
8432            .collect::<BTreeSet<_>>();
8433        assert_eq!(with_index_ids, without_index_ids);
8434        assert!(!with_index_ids.is_empty());
8435
8436        let names = with_index
8437            .column_by_name("name")
8438            .unwrap()
8439            .as_any()
8440            .downcast_ref::<StringArray>()
8441            .unwrap()
8442            .iter()
8443            .map(|value| value.unwrap())
8444            .collect::<Vec<_>>();
8445        assert!(names.iter().all(|name| name.starts_with("app")));
8446    }
8447
8448    #[tokio::test]
8449    async fn test_like_prefix_correctness_with_zone_map() {
8450        use lance_index::scalar::BuiltinIndexType;
8451
8452        // Create dataset with deterministic string data for exact result verification
8453        let names: Vec<&str> = vec![
8454            "alpha", "alphabet", "beta", "gamma", "delta", "epsilon", "eta", "theta", "iota",
8455            "kappa",
8456        ];
8457        let data = RecordBatch::try_new(
8458            Arc::new(ArrowSchema::new(vec![
8459                ArrowField::new("name", DataType::Utf8, false),
8460                ArrowField::new("id", DataType::Int32, false),
8461            ])),
8462            vec![
8463                Arc::new(StringArray::from(names.clone())),
8464                Arc::new(Int32Array::from_iter_values(0..10)),
8465            ],
8466        )
8467        .unwrap();
8468
8469        let reader = RecordBatchIterator::new(
8470            vec![Ok(data)],
8471            Arc::new(ArrowSchema::new(vec![
8472                ArrowField::new("name", DataType::Utf8, false),
8473                ArrowField::new("id", DataType::Int32, false),
8474            ])),
8475        );
8476
8477        let mut dataset = Dataset::write(reader, "memory://test_like_correctness_zonemap", None)
8478            .await
8479            .unwrap();
8480
8481        // Create ZoneMap index
8482        let params = ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap);
8483        dataset
8484            .create_index(
8485                &["name"],
8486                IndexType::Scalar,
8487                Some("name_zonemap".to_string()),
8488                &params,
8489                true,
8490            )
8491            .await
8492            .unwrap();
8493
8494        // Test with zone map index
8495        let with_index = dataset
8496            .scan()
8497            .filter("name LIKE 'alpha%'")
8498            .unwrap()
8499            .try_into_batch()
8500            .await
8501            .unwrap();
8502
8503        // Test without index (for comparison)
8504        let without_index = dataset
8505            .scan()
8506            .use_scalar_index(false)
8507            .filter("name LIKE 'alpha%'")
8508            .unwrap()
8509            .try_into_batch()
8510            .await
8511            .unwrap();
8512
8513        // Both should return same results: alpha, alphabet
8514        assert_eq!(with_index.num_rows(), without_index.num_rows());
8515        assert_eq!(with_index.num_rows(), 2);
8516
8517        let with_index_names: BTreeSet<String> = with_index
8518            .column_by_name("name")
8519            .unwrap()
8520            .as_any()
8521            .downcast_ref::<StringArray>()
8522            .unwrap()
8523            .iter()
8524            .map(|s| s.unwrap().to_string())
8525            .collect();
8526
8527        let without_index_names: BTreeSet<String> = without_index
8528            .column_by_name("name")
8529            .unwrap()
8530            .as_any()
8531            .downcast_ref::<StringArray>()
8532            .unwrap()
8533            .iter()
8534            .map(|s| s.unwrap().to_string())
8535            .collect();
8536
8537        assert_eq!(with_index_names, without_index_names);
8538        assert_eq!(
8539            with_index_names,
8540            BTreeSet::from(["alpha".to_string(), "alphabet".to_string()])
8541        );
8542
8543        // Test starts_with correctness with zone map
8544        let starts_with_result = dataset
8545            .scan()
8546            .filter("starts_with(name, 'e')")
8547            .unwrap()
8548            .try_into_batch()
8549            .await
8550            .unwrap();
8551
8552        let starts_with_names: BTreeSet<String> = starts_with_result
8553            .column_by_name("name")
8554            .unwrap()
8555            .as_any()
8556            .downcast_ref::<StringArray>()
8557            .unwrap()
8558            .iter()
8559            .map(|s| s.unwrap().to_string())
8560            .collect();
8561
8562        // Should match: epsilon, eta
8563        assert_eq!(
8564            starts_with_names,
8565            BTreeSet::from(["epsilon".to_string(), "eta".to_string()])
8566        );
8567    }
8568
8569    #[rstest]
8570    #[tokio::test]
8571    async fn test_late_materialization(
8572        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
8573        data_storage_version: LanceFileVersion,
8574    ) {
8575        use lance_io::assert_io_lt;
8576        // Create a large dataset with a scalar indexed column and a sorted but not scalar
8577        // indexed column
8578        use lance_table::io::commit::RenameCommitHandler;
8579        let data = gen_batch()
8580            .col(
8581                "vector",
8582                array::rand_vec::<Float32Type>(Dimension::from(32)),
8583            )
8584            .col("indexed", array::step::<Int32Type>())
8585            .col("not_indexed", array::step::<Int32Type>())
8586            .into_reader_rows(RowCount::from(1000), BatchCount::from(20));
8587
8588        let mut dataset = Dataset::write(
8589            data,
8590            "memory://test",
8591            Some(WriteParams {
8592                commit_handler: Some(Arc::new(RenameCommitHandler)),
8593                data_storage_version: Some(data_storage_version),
8594                ..Default::default()
8595            }),
8596        )
8597        .await
8598        .unwrap();
8599        dataset
8600            .create_index(
8601                &["indexed"],
8602                IndexType::Scalar,
8603                None,
8604                &ScalarIndexParams::default(),
8605                false,
8606            )
8607            .await
8608            .unwrap();
8609
8610        // First run a full scan to get a baseline
8611        let _ = dataset.object_store.as_ref().io_stats_incremental(); // reset
8612        dataset.scan().try_into_batch().await.unwrap();
8613        let io_stats = dataset.object_store.as_ref().io_stats_incremental();
8614        let full_scan_bytes = io_stats.read_bytes;
8615
8616        // Next do a scan without pushdown, we should still see a benefit from late materialization
8617        dataset
8618            .scan()
8619            .use_stats(false)
8620            .filter("not_indexed = 50")
8621            .unwrap()
8622            .try_into_batch()
8623            .await
8624            .unwrap();
8625        let io_stats = dataset.object_store.as_ref().io_stats_incremental();
8626        assert_io_lt!(io_stats, read_bytes, full_scan_bytes);
8627        let filtered_scan_bytes = io_stats.read_bytes;
8628
8629        // Now do a scan with pushdown, the benefit should be even greater
8630        // Pushdown only works with the legacy format for now.
8631        if data_storage_version == LanceFileVersion::Legacy {
8632            dataset
8633                .scan()
8634                .filter("not_indexed = 50")
8635                .unwrap()
8636                .try_into_batch()
8637                .await
8638                .unwrap();
8639            let io_stats = dataset.object_store.as_ref().io_stats_incremental();
8640            assert_io_lt!(io_stats, read_bytes, filtered_scan_bytes);
8641        }
8642
8643        // Now do a scalar index scan, this should be better than a
8644        // full scan but since we have to load the index might be more
8645        // expensive than late / pushdown scan
8646        dataset
8647            .scan()
8648            .filter("indexed = 50")
8649            .unwrap()
8650            .try_into_batch()
8651            .await
8652            .unwrap();
8653        let io_stats = dataset.object_store.as_ref().io_stats_incremental();
8654        assert_io_lt!(io_stats, read_bytes, full_scan_bytes);
8655        let index_scan_bytes = io_stats.read_bytes;
8656
8657        // A second scalar index scan should be cheaper than the first
8658        // since we should have the index in cache
8659        dataset
8660            .scan()
8661            .filter("indexed = 50")
8662            .unwrap()
8663            .try_into_batch()
8664            .await
8665            .unwrap();
8666        let io_stats = dataset.object_store.as_ref().io_stats_incremental();
8667        assert_io_lt!(io_stats, read_bytes, index_scan_bytes);
8668    }
8669
8670    #[rstest]
8671    #[tokio::test]
8672    async fn test_project_nested(
8673        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
8674        data_storage_version: LanceFileVersion,
8675    ) -> Result<()> {
8676        let struct_i_field = ArrowField::new("i", DataType::Int32, true);
8677        let struct_o_field = ArrowField::new("o", DataType::Utf8, true);
8678        let schema = Arc::new(ArrowSchema::new(vec![
8679            ArrowField::new(
8680                "struct",
8681                DataType::Struct(vec![struct_i_field.clone(), struct_o_field.clone()].into()),
8682                true,
8683            ),
8684            ArrowField::new("s", DataType::Utf8, true),
8685        ]));
8686
8687        let input_batches: Vec<RecordBatch> = (0..5)
8688            .map(|i| {
8689                let struct_i_arr: Arc<Int32Array> =
8690                    Arc::new(Int32Array::from_iter_values(i * 20..(i + 1) * 20));
8691                let struct_o_arr: Arc<StringArray> = Arc::new(StringArray::from_iter_values(
8692                    (i * 20..(i + 1) * 20).map(|v| format!("o-{:02}", v)),
8693                ));
8694                RecordBatch::try_new(
8695                    schema.clone(),
8696                    vec![
8697                        Arc::new(StructArray::from(vec![
8698                            (Arc::new(struct_i_field.clone()), struct_i_arr as ArrayRef),
8699                            (Arc::new(struct_o_field.clone()), struct_o_arr as ArrayRef),
8700                        ])),
8701                        Arc::new(StringArray::from_iter_values(
8702                            (i * 20..(i + 1) * 20).map(|v| format!("s-{}", v)),
8703                        )),
8704                    ],
8705                )
8706                .unwrap()
8707            })
8708            .collect();
8709        let batches =
8710            RecordBatchIterator::new(input_batches.clone().into_iter().map(Ok), schema.clone());
8711        let test_dir = TempStrDir::default();
8712        let test_uri = &test_dir;
8713        let write_params = WriteParams {
8714            max_rows_per_file: 40,
8715            max_rows_per_group: 10,
8716            data_storage_version: Some(data_storage_version),
8717            ..Default::default()
8718        };
8719        Dataset::write(batches, test_uri, Some(write_params))
8720            .await
8721            .unwrap();
8722
8723        let dataset = Dataset::open(test_uri).await.unwrap();
8724
8725        let batches = dataset
8726            .scan()
8727            .project(&["struct.i"])
8728            .unwrap()
8729            .try_into_stream()
8730            .await
8731            .unwrap()
8732            .try_collect::<Vec<_>>()
8733            .await
8734            .unwrap();
8735        let batch = concat_batches(&batches[0].schema(), &batches).unwrap();
8736        assert!(batch.column_by_name("struct.i").is_some());
8737        Ok(())
8738    }
8739
8740    #[rstest]
8741    #[tokio::test]
8742    async fn test_plans(
8743        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
8744        data_storage_version: LanceFileVersion,
8745        #[values(false, true)] stable_row_id: bool,
8746    ) -> Result<()> {
8747        // Create a vector dataset
8748
8749        use lance_index::scalar::inverted::query::BoostQuery;
8750        let dim = 256;
8751        let mut dataset =
8752            TestVectorDataset::new_with_dimension(data_storage_version, stable_row_id, dim).await?;
8753        let lance_schema = dataset.dataset.schema();
8754
8755        // Scans
8756        // ---------------------------------------------------------------------
8757        // V2 writer does not use LancePushdownScan
8758        if data_storage_version == LanceFileVersion::Legacy {
8759            log::info!("Test case: Pushdown scan");
8760            assert_plan_equals(
8761                &dataset.dataset,
8762                |scan| scan.project(&["s"])?.filter("i > 10 and i < 20"),
8763                "LancePushdownScan: uri=..., projection=[s], predicate=i > Int32(10) AND i < Int32(20), row_id=false, row_addr=false, ordered=true"
8764            ).await?;
8765        }
8766
8767        log::info!("Test case: Project and filter");
8768        let expected = if data_storage_version == LanceFileVersion::Legacy {
8769            "ProjectionExec: expr=[s@2 as s]
8770  Take: columns=\"i, _rowid, (s)\"
8771    CoalesceBatchesExec: target_batch_size=8192
8772      FilterExec: i@0 > 10 AND i@0 < 20
8773        LanceScan: uri..., projection=[i], row_id=true, row_addr=false, ordered=true, range=None"
8774        } else {
8775            "ProjectionExec: expr=[s@2 as s]
8776  Take: columns=\"i, _rowid, (s)\"
8777    CoalesceBatchesExec: target_batch_size=8192
8778      LanceRead: ..., projection=[i], num_fragments=2, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10) AND i < Int32(20), refine_filter=i > Int32(10) AND i < Int32(20)"
8779        };
8780        assert_plan_equals(
8781            &dataset.dataset,
8782            |scan| {
8783                scan.use_stats(false)
8784                    .project(&["s"])?
8785                    .filter("i > 10 and i < 20")
8786            },
8787            expected,
8788        )
8789        .await?;
8790
8791        // Integer fields will be eagerly materialized while string/vec fields
8792        // are not.
8793        log::info!("Test case: Late materialization");
8794        let expected = if data_storage_version == LanceFileVersion::Legacy {
8795            "ProjectionExec: expr=[i@0 as i, s@1 as s, vec@3 as vec]
8796            Take: columns=\"i, s, _rowid, (vec)\"
8797              CoalesceBatchesExec: target_batch_size=8192
8798                FilterExec: s@1 IS NOT NULL
8799                  LanceScan: uri..., projection=[i, s], row_id=true, row_addr=false, ordered=true, range=None"
8800        } else {
8801            "ProjectionExec: expr=[i@0 as i, s@1 as s, vec@3 as vec]
8802  Take: columns=\"i, s, _rowid, (vec)\"
8803    CoalesceBatchesExec: target_batch_size=8192
8804      LanceRead: uri=..., projection=[i, s], num_fragments=2, range_before=None, range_after=None, \
8805      row_id=true, row_addr=false, full_filter=s IS NOT NULL, refine_filter=s IS NOT NULL"
8806        };
8807        assert_plan_equals(
8808            &dataset.dataset,
8809            |scan| scan.use_stats(false).filter("s IS NOT NULL"),
8810            expected,
8811        )
8812        .await?;
8813
8814        // Custom materialization
8815        log::info!("Test case: Custom materialization (all early)");
8816        let expected = if data_storage_version == LanceFileVersion::Legacy {
8817            "ProjectionExec: expr=[i@0 as i, s@1 as s, vec@2 as vec]
8818  FilterExec: s@1 IS NOT NULL
8819    LanceScan: uri..., projection=[i, s, vec], row_id=true, row_addr=false, ordered=true, range=None"
8820        } else {
8821            "ProjectionExec: expr=[i@0 as i, s@1 as s, vec@2 as vec]
8822  LanceRead: uri=..., projection=[i, s, vec], num_fragments=2, range_before=None, \
8823  range_after=None, row_id=true, row_addr=false, full_filter=s IS NOT NULL, refine_filter=s IS NOT NULL"
8824        };
8825        assert_plan_equals(
8826            &dataset.dataset,
8827            |scan| {
8828                scan.use_stats(false)
8829                    .materialization_style(MaterializationStyle::AllEarly)
8830                    .filter("s IS NOT NULL")
8831            },
8832            expected,
8833        )
8834        .await?;
8835
8836        log::info!("Test case: Custom materialization 2 (all late)");
8837        let expected = if data_storage_version == LanceFileVersion::Legacy {
8838            "ProjectionExec: expr=[i@2 as i, s@0 as s, vec@3 as vec]
8839  Take: columns=\"s, _rowid, (i), (vec)\"
8840    CoalesceBatchesExec: target_batch_size=8192
8841      FilterExec: s@0 IS NOT NULL
8842        LanceScan: uri..., projection=[s], row_id=true, row_addr=false, ordered=true, range=None"
8843        } else {
8844            "ProjectionExec: expr=[i@2 as i, s@0 as s, vec@3 as vec]
8845  Take: columns=\"s, _rowid, (i), (vec)\"
8846    CoalesceBatchesExec: target_batch_size=8192
8847      LanceRead: uri=..., projection=[s], num_fragments=2, range_before=None, \
8848      range_after=None, row_id=true, row_addr=false, full_filter=s IS NOT NULL, refine_filter=s IS NOT NULL"
8849        };
8850        assert_plan_equals(
8851            &dataset.dataset,
8852            |scan| {
8853                scan.use_stats(false)
8854                    .materialization_style(MaterializationStyle::AllLate)
8855                    .filter("s IS NOT NULL")
8856            },
8857            expected,
8858        )
8859        .await?;
8860
8861        log::info!("Test case: Custom materialization 3 (mixed)");
8862        let expected = if data_storage_version == LanceFileVersion::Legacy {
8863            "ProjectionExec: expr=[i@3 as i, s@0 as s, vec@1 as vec]
8864  Take: columns=\"s, vec, _rowid, (i)\"
8865    CoalesceBatchesExec: target_batch_size=8192
8866      FilterExec: s@0 IS NOT NULL
8867        LanceScan: uri..., projection=[s, vec], row_id=true, row_addr=false, ordered=true, range=None"
8868        } else {
8869            "ProjectionExec: expr=[i@3 as i, s@0 as s, vec@1 as vec]
8870  Take: columns=\"s, vec, _rowid, (i)\"
8871    CoalesceBatchesExec: target_batch_size=8192
8872      LanceRead: uri=..., projection=[s, vec], num_fragments=2, range_before=None, range_after=None, \
8873      row_id=true, row_addr=false, full_filter=s IS NOT NULL, refine_filter=s IS NOT NULL"
8874        };
8875        assert_plan_equals(
8876            &dataset.dataset,
8877            |scan| {
8878                scan.use_stats(false)
8879                    .materialization_style(
8880                        MaterializationStyle::all_early_except(&["i"], lance_schema).unwrap(),
8881                    )
8882                    .filter("s IS NOT NULL")
8883            },
8884            expected,
8885        )
8886        .await?;
8887
8888        log::info!("Test case: Scan out of order");
8889        let expected = if data_storage_version == LanceFileVersion::Legacy {
8890            "LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=false, range=None"
8891        } else {
8892            "LanceRead: uri=..., projection=[s], num_fragments=2, range_before=None, range_after=None, row_id=true, \
8893            row_addr=false, full_filter=--, refine_filter=--"
8894        };
8895        assert_plan_equals(
8896            &dataset.dataset,
8897            |scan| Ok(scan.project(&["s"])?.with_row_id().scan_in_order(false)),
8898            expected,
8899        )
8900        .await?;
8901
8902        // KNN
8903        // ---------------------------------------------------------------------
8904        let q: Float32Array = (32..32 + dim).map(|v| v as f32).collect();
8905        log::info!("Test case: Basic KNN");
8906        let expected = if data_storage_version == LanceFileVersion::Legacy {
8907            "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
8908  Take: columns=\"vec, _rowid, _distance, (i), (s)\"
8909    CoalesceBatchesExec: target_batch_size=8192
8910      FilterExec: _distance@2 IS NOT NULL
8911        SortExec: TopK(fetch=5), expr=...
8912          KNNVectorDistance: metric=l2
8913            LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None"
8914        } else {
8915            "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
8916  Take: columns=\"vec, _rowid, _distance, (i), (s)\"
8917    CoalesceBatchesExec: target_batch_size=8192
8918      FilterExec: _distance@2 IS NOT NULL
8919        SortExec: TopK(fetch=5), expr=...
8920          KNNVectorDistance: metric=l2
8921            LanceRead: uri=..., projection=[vec], num_fragments=2, range_before=None, range_after=None, \
8922            row_id=true, row_addr=false, full_filter=--, refine_filter=--"
8923        };
8924        assert_plan_equals(
8925            &dataset.dataset,
8926            |scan| scan.nearest("vec", &q, 5),
8927            expected,
8928        )
8929        .await?;
8930
8931        // KNN + Limit (arguably the user, or us, should fold the limit into the KNN but we don't today)
8932        // ---------------------------------------------------------------------
8933        let q: Float32Array = (32..32 + dim).map(|v| v as f32).collect();
8934        log::info!("Test case: KNN with extraneous limit");
8935        let expected = if data_storage_version == LanceFileVersion::Legacy {
8936            "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
8937  Take: columns=\"vec, _rowid, _distance, (i), (s)\"
8938    CoalesceBatchesExec: target_batch_size=8192
8939      GlobalLimitExec: skip=0, fetch=1
8940        FilterExec: _distance@2 IS NOT NULL
8941          SortExec: TopK(fetch=5), expr=...
8942            KNNVectorDistance: metric=l2
8943              LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None"
8944        } else {
8945            "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
8946  Take: columns=\"vec, _rowid, _distance, (i), (s)\"
8947    CoalesceBatchesExec: target_batch_size=8192
8948      GlobalLimitExec: skip=0, fetch=1
8949        FilterExec: _distance@2 IS NOT NULL
8950          SortExec: TopK(fetch=5), expr=...
8951            KNNVectorDistance: metric=l2
8952              LanceRead: uri=..., projection=[vec], num_fragments=2, range_before=None, range_after=None, \
8953              row_id=true, row_addr=false, full_filter=--, refine_filter=--"
8954        };
8955        assert_plan_equals(
8956            &dataset.dataset,
8957            |scan| scan.nearest("vec", &q, 5)?.limit(Some(1), None),
8958            expected,
8959        )
8960        .await?;
8961
8962        // ANN
8963        // ---------------------------------------------------------------------
8964        dataset.make_vector_index().await?;
8965        log::info!("Test case: Basic ANN");
8966        let expected =
8967            "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
8968  Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
8969    CoalesceBatchesExec: target_batch_size=8192
8970      SortExec: TopK(fetch=42), expr=...
8971        ANNSubIndex: name=..., k=42, deltas=1, metric=L2
8972          ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1";
8973        assert_plan_equals(
8974            &dataset.dataset,
8975            |scan| scan.nearest("vec", &q, 42),
8976            expected,
8977        )
8978        .await?;
8979
8980        log::info!("Test case: ANN with refine");
8981        let expected =
8982            "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
8983  Take: columns=\"_rowid, vec, _distance, (i), (s)\"
8984    CoalesceBatchesExec: target_batch_size=8192
8985      FilterExec: _distance@... IS NOT NULL
8986        SortExec: TopK(fetch=10), expr=...
8987          KNNVectorDistance: metric=l2
8988            Take: columns=\"_distance, _rowid, (vec)\"
8989              CoalesceBatchesExec: target_batch_size=8192
8990                SortExec: TopK(fetch=40), expr=...
8991                  ANNSubIndex: name=..., k=40, deltas=1, metric=L2
8992                    ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1";
8993        assert_plan_equals(
8994            &dataset.dataset,
8995            |scan| Ok(scan.nearest("vec", &q, 10)?.refine(4)),
8996            expected,
8997        )
8998        .await?;
8999
9000        // use_index = False -> same plan as KNN
9001        log::info!("Test case: ANN with index disabled");
9002        let expected = if data_storage_version == LanceFileVersion::Legacy {
9003            "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
9004  Take: columns=\"vec, _rowid, _distance, (i), (s)\"
9005    CoalesceBatchesExec: target_batch_size=8192
9006      FilterExec: _distance@... IS NOT NULL
9007        SortExec: TopK(fetch=13), expr=...
9008          KNNVectorDistance: metric=l2
9009            LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None"
9010        } else {
9011            "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
9012  Take: columns=\"vec, _rowid, _distance, (i), (s)\"
9013    CoalesceBatchesExec: target_batch_size=8192
9014      FilterExec: _distance@... IS NOT NULL
9015        SortExec: TopK(fetch=13), expr=...
9016          KNNVectorDistance: metric=l2
9017            LanceRead: uri=..., projection=[vec], num_fragments=2, range_before=None, range_after=None, \
9018            row_id=true, row_addr=false, full_filter=--, refine_filter=--"
9019        };
9020        assert_plan_equals(
9021            &dataset.dataset,
9022            |scan| Ok(scan.nearest("vec", &q, 13)?.use_index(false)),
9023            expected,
9024        )
9025        .await?;
9026
9027        log::info!("Test case: ANN with postfilter");
9028        let expected = "ProjectionExec: expr=[s@3 as s, vec@4 as vec, _distance@0 as _distance, _rowid@1 as _rowid]
9029  Take: columns=\"_distance, _rowid, i, (s), (vec)\"
9030    CoalesceBatchesExec: target_batch_size=8192
9031      FilterExec: i@2 > 10
9032        Take: columns=\"_distance, _rowid, (i)\"
9033          CoalesceBatchesExec: target_batch_size=8192
9034            SortExec: TopK(fetch=17), expr=...
9035              ANNSubIndex: name=..., k=17, deltas=1, metric=L2
9036                ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1";
9037        assert_plan_equals(
9038            &dataset.dataset,
9039            |scan| {
9040                Ok(scan
9041                    .nearest("vec", &q, 17)?
9042                    .filter("i > 10")?
9043                    .project(&["s", "vec"])?
9044                    .with_row_id())
9045            },
9046            expected,
9047        )
9048        .await?;
9049
9050        log::info!("Test case: ANN with prefilter");
9051        let expected = if data_storage_version == LanceFileVersion::Legacy {
9052            "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
9053  Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
9054    CoalesceBatchesExec: target_batch_size=8192
9055      SortExec: TopK(fetch=17), expr=...
9056        ANNSubIndex: name=..., k=17, deltas=1, metric=L2
9057          ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9058          FilterExec: i@0 > 10
9059            LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None"
9060        } else {
9061            "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
9062  Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
9063    CoalesceBatchesExec: target_batch_size=8192
9064      SortExec: TopK(fetch=17), expr=...
9065        ANNSubIndex: name=..., k=17, deltas=1, metric=L2
9066          ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9067          LanceRead: uri=..., projection=[], num_fragments=2, range_before=None, range_after=None, \
9068          row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=i > Int32(10)
9069"
9070        };
9071        assert_plan_equals(
9072            &dataset.dataset,
9073            |scan| {
9074                Ok(scan
9075                    .nearest("vec", &q, 17)?
9076                    .filter("i > 10")?
9077                    .prefilter(true))
9078            },
9079            expected,
9080        )
9081        .await?;
9082
9083        dataset.append_new_data().await?;
9084        log::info!("Test case: Combined KNN/ANN");
9085        let expected = "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9086  Take: columns=\"_rowid, vec, _distance, (i), (s)\"
9087    CoalesceBatchesExec: target_batch_size=8192
9088      FilterExec: _distance@... IS NOT NULL
9089        SortExec: TopK(fetch=6), expr=...
9090          KNNVectorDistance: metric=l2
9091            RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9092              UnionExec
9093                ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec]
9094                  FilterExec: _distance@... IS NOT NULL
9095                    SortExec: TopK(fetch=6), expr=...
9096                      KNNVectorDistance: metric=l2
9097                        LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None
9098                Take: columns=\"_distance, _rowid, (vec)\"
9099                  CoalesceBatchesExec: target_batch_size=8192
9100                    SortExec: TopK(fetch=6), expr=...
9101                      ANNSubIndex: name=..., k=6, deltas=1, metric=L2
9102                        ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1";
9103        assert_plan_equals(
9104            &dataset.dataset,
9105            |scan| scan.nearest("vec", &q, 6),
9106            // TODO: we could write an optimizer rule to eliminate the last Projection
9107            // by doing it as part of the last Take. This would likely have minimal impact though.
9108            expected,
9109        )
9110        .await?;
9111
9112        // new data and with filter
9113        log::info!("Test case: Combined KNN/ANN with postfilter");
9114        let expected = "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9115  Take: columns=\"_rowid, vec, _distance, i, (s)\"
9116    CoalesceBatchesExec: target_batch_size=8192
9117      FilterExec: i@3 > 10
9118        Take: columns=\"_rowid, vec, _distance, (i)\"
9119          CoalesceBatchesExec: target_batch_size=8192
9120            FilterExec: _distance@... IS NOT NULL
9121              SortExec: TopK(fetch=15), expr=...
9122                KNNVectorDistance: metric=l2
9123                  RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9124                    UnionExec
9125                      ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec]
9126                        FilterExec: _distance@... IS NOT NULL
9127                          SortExec: TopK(fetch=15), expr=...
9128                            KNNVectorDistance: metric=l2
9129                              LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None
9130                      Take: columns=\"_distance, _rowid, (vec)\"
9131                        CoalesceBatchesExec: target_batch_size=8192
9132                          SortExec: TopK(fetch=15), expr=...
9133                            ANNSubIndex: name=..., k=15, deltas=1, metric=L2
9134                              ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1";
9135        assert_plan_equals(
9136            &dataset.dataset,
9137            |scan| scan.nearest("vec", &q, 15)?.filter("i > 10"),
9138            expected,
9139        )
9140        .await?;
9141
9142        // new data and with prefilter
9143        log::info!("Test case: Combined KNN/ANN with prefilter");
9144        let expected = if data_storage_version == LanceFileVersion::Legacy {
9145            "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9146  Take: columns=\"_rowid, vec, _distance, (i), (s)\"
9147    CoalesceBatchesExec: target_batch_size=8192
9148      FilterExec: _distance@... IS NOT NULL
9149        SortExec: TopK(fetch=5), expr=...
9150          KNNVectorDistance: metric=l2
9151            RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9152              UnionExec
9153                ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec]
9154                  FilterExec: _distance@... IS NOT NULL
9155                    SortExec: TopK(fetch=5), expr=...
9156                      KNNVectorDistance: metric=l2
9157                        FilterExec: i@1 > 10
9158                          LanceScan: uri=..., projection=[vec, i], row_id=true, row_addr=false, ordered=false, range=None
9159                Take: columns=\"_distance, _rowid, (vec)\"
9160                  CoalesceBatchesExec: target_batch_size=8192
9161                    SortExec: TopK(fetch=5), expr=...
9162                      ANNSubIndex: name=..., k=5, deltas=1, metric=L2
9163                        ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9164                        FilterExec: i@0 > 10
9165                          LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None"
9166        } else {
9167            "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9168  Take: columns=\"_rowid, vec, _distance, (i), (s)\"
9169    CoalesceBatchesExec: target_batch_size=8192
9170      FilterExec: _distance@... IS NOT NULL
9171        SortExec: TopK(fetch=5), expr=...
9172          KNNVectorDistance: metric=l2
9173            RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9174              UnionExec
9175                ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec]
9176                  FilterExec: _distance@... IS NOT NULL
9177                    SortExec: TopK(fetch=5), expr=...
9178                      KNNVectorDistance: metric=l2
9179                        FilterExec: i@1 > 10
9180                          LanceScan: uri=..., projection=[vec, i], row_id=true, row_addr=false, ordered=false, range=None
9181                Take: columns=\"_distance, _rowid, (vec)\"
9182                  CoalesceBatchesExec: target_batch_size=8192
9183                    SortExec: TopK(fetch=5), expr=...
9184                      ANNSubIndex: name=..., k=5, deltas=1, metric=L2
9185                        ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9186                        LanceRead: uri=..., projection=[], num_fragments=2, range_before=None, range_after=None, \
9187                          row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=i > Int32(10)"
9188        };
9189        assert_plan_equals(
9190            &dataset.dataset,
9191            |scan| {
9192                Ok(scan
9193                    .nearest("vec", &q, 5)?
9194                    .filter("i > 10")?
9195                    .prefilter(true))
9196            },
9197            // TODO: i is scanned on both sides but is projected away mid-plan
9198            // only to be taken again later. We should fix this.
9199            expected,
9200        )
9201        .await?;
9202
9203        // ANN with scalar index
9204        // ---------------------------------------------------------------------
9205        // Make sure both indices are up-to-date to start
9206        dataset.make_vector_index().await?;
9207        dataset.make_scalar_index().await?;
9208
9209        log::info!("Test case: ANN with scalar index");
9210        let expected =
9211            "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
9212  Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
9213    CoalesceBatchesExec: target_batch_size=8192
9214      SortExec: TopK(fetch=5), expr=...
9215        ANNSubIndex: name=..., k=5, deltas=1, metric=L2
9216          ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9217          ScalarIndexQuery: query=[i > 10]@i_idx(BTree)";
9218        assert_plan_equals(
9219            &dataset.dataset,
9220            |scan| {
9221                Ok(scan
9222                    .nearest("vec", &q, 5)?
9223                    .filter("i > 10")?
9224                    .prefilter(true))
9225            },
9226            expected,
9227        )
9228        .await?;
9229
9230        log::info!("Test case: ANN with scalar index disabled");
9231        let expected = if data_storage_version == LanceFileVersion::Legacy {
9232            "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
9233  Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
9234    CoalesceBatchesExec: target_batch_size=8192
9235      SortExec: TopK(fetch=5), expr=...
9236        ANNSubIndex: name=..., k=5, deltas=1, metric=L2
9237          ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9238          FilterExec: i@0 > 10
9239            LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None"
9240        } else {
9241            "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
9242  Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
9243    CoalesceBatchesExec: target_batch_size=8192
9244      SortExec: TopK(fetch=5), expr=...
9245        ANNSubIndex: name=..., k=5, deltas=1, metric=L2
9246          ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9247          LanceRead: uri=..., projection=[], num_fragments=3, range_before=None, \
9248          range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=i > Int32(10)"
9249        };
9250        assert_plan_equals(
9251            &dataset.dataset,
9252            |scan| {
9253                Ok(scan
9254                    .nearest("vec", &q, 5)?
9255                    .use_scalar_index(false)
9256                    .filter("i > 10")?
9257                    .prefilter(true))
9258            },
9259            expected,
9260        )
9261        .await?;
9262
9263        dataset.append_new_data().await?;
9264
9265        log::info!("Test case: Combined KNN/ANN with scalar index");
9266        let expected = "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9267  Take: columns=\"_rowid, vec, _distance, (i), (s)\"
9268    CoalesceBatchesExec: target_batch_size=8192
9269      FilterExec: _distance@... IS NOT NULL
9270        SortExec: TopK(fetch=8), expr=...
9271          KNNVectorDistance: metric=l2
9272            RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9273              UnionExec
9274                ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec]
9275                  FilterExec: _distance@... IS NOT NULL
9276                    SortExec: TopK(fetch=8), expr=...
9277                      KNNVectorDistance: metric=l2
9278                        FilterExec: i@1 > 10
9279                          LanceScan: uri=..., projection=[vec, i], row_id=true, row_addr=false, ordered=false, range=None
9280                Take: columns=\"_distance, _rowid, (vec)\"
9281                  CoalesceBatchesExec: target_batch_size=8192
9282                    SortExec: TopK(fetch=8), expr=...
9283                      ANNSubIndex: name=..., k=8, deltas=1, metric=L2
9284                        ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9285                        ScalarIndexQuery: query=[i > 10]@i_idx(BTree)";
9286        assert_plan_equals(
9287            &dataset.dataset,
9288            |scan| {
9289                Ok(scan
9290                    .nearest("vec", &q, 8)?
9291                    .filter("i > 10")?
9292                    .prefilter(true))
9293            },
9294            expected,
9295        )
9296        .await?;
9297
9298        // Update scalar index but not vector index
9299        log::info!(
9300            "Test case: Combined KNN/ANN with updated scalar index and outdated vector index"
9301        );
9302        let expected = "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9303  Take: columns=\"_rowid, vec, _distance, (i), (s)\"
9304    CoalesceBatchesExec: target_batch_size=8192
9305      FilterExec: _distance@... IS NOT NULL
9306        SortExec: TopK(fetch=11), expr=...
9307          KNNVectorDistance: metric=l2
9308            RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9309              UnionExec
9310                ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec]
9311                  FilterExec: _distance@... IS NOT NULL
9312                    SortExec: TopK(fetch=11), expr=...
9313                      KNNVectorDistance: metric=l2
9314                        FilterExec: i@1 > 10
9315                          LanceScan: uri=..., projection=[vec, i], row_id=true, row_addr=false, ordered=false, range=None
9316                Take: columns=\"_distance, _rowid, (vec)\"
9317                  CoalesceBatchesExec: target_batch_size=8192
9318                    SortExec: TopK(fetch=11), expr=...
9319                      ANNSubIndex: name=..., k=11, deltas=1, metric=L2
9320                        ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9321                        ScalarIndexQuery: query=[i > 10]@i_idx(BTree)";
9322        dataset.make_scalar_index().await?;
9323        assert_plan_equals(
9324            &dataset.dataset,
9325            |scan| {
9326                Ok(scan
9327                    .nearest("vec", &q, 11)?
9328                    .filter("i > 10")?
9329                    .prefilter(true))
9330            },
9331            expected,
9332        )
9333        .await?;
9334
9335        // Scans with scalar index
9336        // ---------------------------------------------------------------------
9337        log::info!("Test case: Filtered read with scalar index");
9338        let expected = if data_storage_version == LanceFileVersion::Legacy {
9339            "ProjectionExec: expr=[s@1 as s]
9340  Take: columns=\"_rowid, (s)\"
9341    CoalesceBatchesExec: target_batch_size=8192
9342      MaterializeIndex: query=[i > 10]@i_idx(BTree)"
9343        } else {
9344            "LanceRead: uri=..., projection=[s], num_fragments=4, range_before=None, \
9345            range_after=None, row_id=false, row_addr=false, full_filter=i > Int32(10), refine_filter=--
9346              ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"
9347        };
9348        assert_plan_equals(
9349            &dataset.dataset,
9350            |scan| scan.project(&["s"])?.filter("i > 10"),
9351            expected,
9352        )
9353        .await?;
9354
9355        if data_storage_version != LanceFileVersion::Legacy {
9356            log::info!(
9357                "Test case: Filtered read with scalar index disabled (late materialization)"
9358            );
9359            assert_plan_equals(
9360                &dataset.dataset,
9361                |scan| {
9362                    scan.project(&["s"])?
9363                        .use_scalar_index(false)
9364                        .filter("i > 10")
9365                },
9366                "ProjectionExec: expr=[s@2 as s]
9367  Take: columns=\"i, _rowid, (s)\"
9368    CoalesceBatchesExec: target_batch_size=8192
9369      LanceRead: uri=..., projection=[i], num_fragments=4, range_before=None, \
9370      range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=i > Int32(10)",
9371            )
9372            .await?;
9373        }
9374
9375        log::info!("Test case: Empty projection");
9376        let expected = if data_storage_version == LanceFileVersion::Legacy {
9377            "ProjectionExec: expr=[_rowaddr@0 as _rowaddr]
9378  AddRowAddrExec
9379    MaterializeIndex: query=[i > 10]@i_idx(BTree)"
9380        } else {
9381            "LanceRead: uri=..., projection=[], num_fragments=4, range_before=None, \
9382            range_after=None, row_id=false, row_addr=true, full_filter=i > Int32(10), refine_filter=--
9383              ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"
9384        };
9385        assert_plan_equals(
9386            &dataset.dataset,
9387            |scan| {
9388                scan.filter("i > 10")
9389                    .unwrap()
9390                    .with_row_address()
9391                    .project::<&str>(&[])
9392            },
9393            expected,
9394        )
9395        .await?;
9396
9397        dataset.append_new_data().await?;
9398        log::info!("Test case: Combined Scalar/non-scalar filtered read");
9399        let expected = if data_storage_version == LanceFileVersion::Legacy {
9400            "ProjectionExec: expr=[s@1 as s]
9401  RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9402    UnionExec
9403      Take: columns=\"_rowid, (s)\"
9404        CoalesceBatchesExec: target_batch_size=8192
9405          MaterializeIndex: query=[i > 10]@i_idx(BTree)
9406      ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s]
9407        FilterExec: i@0 > 10
9408          LanceScan: uri=..., projection=[i, s], row_id=true, row_addr=false, ordered=false, range=None"
9409        } else {
9410            "LanceRead: uri=..., projection=[s], num_fragments=5, range_before=None, \
9411            range_after=None, row_id=false, row_addr=false, full_filter=i > Int32(10), refine_filter=--
9412              ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"
9413        };
9414        assert_plan_equals(
9415            &dataset.dataset,
9416            |scan| scan.project(&["s"])?.filter("i > 10"),
9417            expected,
9418        )
9419        .await?;
9420
9421        log::info!("Test case: Combined Scalar/non-scalar filtered read with empty projection");
9422        let expected = if data_storage_version == LanceFileVersion::Legacy {
9423            "ProjectionExec: expr=[_rowaddr@0 as _rowaddr]
9424  RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9425    UnionExec
9426      AddRowAddrExec
9427        MaterializeIndex: query=[i > 10]@i_idx(BTree)
9428      ProjectionExec: expr=[_rowaddr@2 as _rowaddr, _rowid@1 as _rowid]
9429        FilterExec: i@0 > 10
9430          LanceScan: uri=..., projection=[i], row_id=true, row_addr=true, ordered=false, range=None"
9431        } else {
9432            "LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, \
9433            range_after=None, row_id=false, row_addr=true, full_filter=i > Int32(10), refine_filter=--
9434              ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"
9435        };
9436        assert_plan_equals(
9437            &dataset.dataset,
9438            |scan| {
9439                scan.filter("i > 10")
9440                    .unwrap()
9441                    .with_row_address()
9442                    .project::<&str>(&[])
9443            },
9444            expected,
9445        )
9446        .await?;
9447
9448        // Scans with dynamic projection
9449        // When an expression is specified in the projection, the plan should include a ProjectionExec
9450        log::info!("Test case: Dynamic projection");
9451        let expected = if data_storage_version == LanceFileVersion::Legacy {
9452            "ProjectionExec: expr=[regexp_match(s@1, .*) as matches]
9453  RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9454    UnionExec
9455      Take: columns=\"_rowid, (s)\"
9456        CoalesceBatchesExec: target_batch_size=8192
9457          MaterializeIndex: query=[i > 10]@i_idx(BTree)
9458      ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s]
9459        FilterExec: i@0 > 10
9460          LanceScan: uri=..., row_id=true, row_addr=false, ordered=false, range=None"
9461        } else {
9462            "ProjectionExec: expr=[regexp_match(s@0, .*) as matches]
9463  LanceRead: uri=..., projection=[s], num_fragments=5, range_before=None, \
9464  range_after=None, row_id=false, row_addr=false, full_filter=i > Int32(10), refine_filter=--
9465    ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"
9466        };
9467        assert_plan_equals(
9468            &dataset.dataset,
9469            |scan| {
9470                scan.project_with_transform(&[("matches", "regexp_match(s, \".*\")")])?
9471                    .filter("i > 10")
9472            },
9473            expected,
9474        )
9475        .await?;
9476
9477        // FTS
9478        // ---------------------------------------------------------------------
9479        // All rows are indexed
9480        dataset.make_fts_index().await?;
9481        log::info!("Test case: Full text search (match query)");
9482        let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9483  Take: columns="_rowid, _score, (s)"
9484    CoalesceBatchesExec: target_batch_size=8192
9485      MatchQuery: column=s, query=hello"#;
9486        assert_plan_equals(
9487            &dataset.dataset,
9488            |scan| {
9489                scan.project(&["s"])?
9490                    .with_row_id()
9491                    .full_text_search(FullTextSearchQuery::new("hello".to_owned()))
9492            },
9493            expected,
9494        )
9495        .await?;
9496
9497        log::info!("Test case: Full text search (phrase query)");
9498        let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9499  Take: columns="_rowid, _score, (s)"
9500    CoalesceBatchesExec: target_batch_size=8192
9501      PhraseQuery: column=s, query=hello world"#;
9502        assert_plan_equals(
9503            &dataset.dataset,
9504            |scan| {
9505                let query = PhraseQuery::new("hello world".to_owned());
9506                scan.project(&["s"])?
9507                    .with_row_id()
9508                    .full_text_search(FullTextSearchQuery::new_query(query.into()))
9509            },
9510            expected,
9511        )
9512        .await?;
9513
9514        log::info!("Test case: Full text search (boost query)");
9515        let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9516  Take: columns="_rowid, _score, (s)"
9517    CoalesceBatchesExec: target_batch_size=8192
9518      BoostQuery: negative_boost=1
9519        MatchQuery: column=s, query=hello
9520        MatchQuery: column=s, query=world"#;
9521        assert_plan_equals(
9522            &dataset.dataset,
9523            |scan| {
9524                let positive =
9525                    MatchQuery::new("hello".to_owned()).with_column(Some("s".to_owned()));
9526                let negative =
9527                    MatchQuery::new("world".to_owned()).with_column(Some("s".to_owned()));
9528                let query = BoostQuery::new(positive.into(), negative.into(), Some(1.0));
9529                scan.project(&["s"])?
9530                    .with_row_id()
9531                    .full_text_search(FullTextSearchQuery::new_query(query.into()))
9532            },
9533            expected,
9534        )
9535        .await?;
9536
9537        log::info!("Test case: Full text search with prefilter");
9538        let expected = if data_storage_version == LanceFileVersion::Legacy {
9539            r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9540  Take: columns="_rowid, _score, (s)"
9541    CoalesceBatchesExec: target_batch_size=8192
9542      MatchQuery: column=s, query=hello
9543        RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9544          UnionExec
9545            MaterializeIndex: query=[i > 10]@i_idx(BTree)
9546            ProjectionExec: expr=[_rowid@1 as _rowid]
9547              FilterExec: i@0 > 10
9548                LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None"#
9549        } else {
9550            r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9551  Take: columns="_rowid, _score, (s)"
9552    CoalesceBatchesExec: target_batch_size=8192
9553      MatchQuery: column=s, query=hello
9554        LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=--
9555          ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"#
9556        };
9557        assert_plan_equals(
9558            &dataset.dataset,
9559            |scan| {
9560                scan.project(&["s"])?
9561                    .with_row_id()
9562                    .filter("i > 10")?
9563                    .prefilter(true)
9564                    .full_text_search(FullTextSearchQuery::new("hello".to_owned()))
9565            },
9566            expected,
9567        )
9568        .await?;
9569
9570        log::info!("Test case: Full text search with unindexed rows");
9571        let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9572  Take: columns="_rowid, _score, (s)"
9573    CoalesceBatchesExec: target_batch_size=8192
9574      SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false]
9575        RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9576          UnionExec
9577            MatchQuery: column=s, query=hello
9578            FlatMatchQuery: column=s, query=hello
9579              LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=false, range=None"#;
9580        dataset.append_new_data().await?;
9581        assert_plan_equals(
9582            &dataset.dataset,
9583            |scan| {
9584                scan.project(&["s"])?
9585                    .with_row_id()
9586                    .full_text_search(FullTextSearchQuery::new("hello".to_owned()))
9587            },
9588            expected,
9589        )
9590        .await?;
9591
9592        log::info!("Test case: Full text search with unindexed rows and fast_search");
9593        let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9594  Take: columns="_rowid, _score, (s)"
9595    CoalesceBatchesExec: target_batch_size=8192
9596      MatchQuery: column=s, query=hello"#;
9597        assert_plan_equals(
9598            &dataset.dataset,
9599            |scan| {
9600                let scan = scan
9601                    .project(&["s"])?
9602                    .with_row_id()
9603                    .full_text_search(FullTextSearchQuery::new("hello".to_owned()))?;
9604                scan.fast_search();
9605                Ok(scan)
9606            },
9607            expected,
9608        )
9609        .await?;
9610
9611        log::info!("Test case: Full text search with unindexed rows and prefilter");
9612        let expected = if data_storage_version == LanceFileVersion::Legacy {
9613            r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9614  Take: columns="_rowid, _score, (s)"
9615    CoalesceBatchesExec: target_batch_size=8192
9616      SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false]
9617        RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9618          UnionExec
9619            MatchQuery: column=s, query=hello
9620              RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9621                UnionExec
9622                  MaterializeIndex: query=[i > 10]@i_idx(BTree)
9623                  ProjectionExec: expr=[_rowid@1 as _rowid]
9624                    FilterExec: i@0 > 10
9625                      LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None
9626            FlatMatchQuery: column=s, query=hello
9627              FilterExec: i@1 > 10
9628                LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"#
9629        } else {
9630            r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9631  Take: columns="_rowid, _score, (s)"
9632    CoalesceBatchesExec: target_batch_size=8192
9633      SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false]
9634        RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9635          UnionExec
9636            MatchQuery: column=s, query=hello
9637              LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=--
9638                ScalarIndexQuery: query=[i > 10]@i_idx(BTree)
9639            FlatMatchQuery: column=s, query=hello
9640              FilterExec: i@1 > 10
9641                LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"#
9642        };
9643        assert_plan_equals(
9644            &dataset.dataset,
9645            |scan| {
9646                scan.project(&["s"])?
9647                    .with_row_id()
9648                    .filter("i > 10")?
9649                    .prefilter(true)
9650                    .full_text_search(FullTextSearchQuery::new("hello".to_owned()))
9651            },
9652            expected,
9653        )
9654        .await?;
9655
9656        Ok(())
9657    }
9658
9659    #[tokio::test]
9660    async fn test_fast_search_plan() {
9661        // Create a vector dataset
9662        let mut dataset = TestVectorDataset::new(LanceFileVersion::Stable, true)
9663            .await
9664            .unwrap();
9665        dataset.make_vector_index().await.unwrap();
9666        dataset.append_new_data().await.unwrap();
9667
9668        let q: Float32Array = (32..64).map(|v| v as f32).collect();
9669
9670        assert_plan_equals(
9671            &dataset.dataset,
9672            |scan| {
9673                scan.nearest("vec", &q, 32)?
9674                    .fast_search()
9675                    .project(&["_distance", "_rowid"])
9676            },
9677            "SortExec: TopK(fetch=32), expr=[_distance@0 ASC NULLS LAST, _rowid@1 ASC NULLS LAST]...
9678    ANNSubIndex: name=idx, k=32, deltas=1, metric=L2
9679      ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1",
9680        )
9681        .await
9682        .unwrap();
9683
9684        assert_plan_equals(
9685            &dataset.dataset,
9686            |scan| {
9687                scan.nearest("vec", &q, 33)?
9688                    .fast_search()
9689                    .with_row_id()
9690                    .project(&["_distance", "_rowid"])
9691            },
9692            "SortExec: TopK(fetch=33), expr=[_distance@0 ASC NULLS LAST, _rowid@1 ASC NULLS LAST]...
9693    ANNSubIndex: name=idx, k=33, deltas=1, metric=L2
9694      ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1",
9695        )
9696        .await
9697        .unwrap();
9698
9699        // Not `fast_scan` case
9700        assert_plan_equals(
9701            &dataset.dataset,
9702            |scan| {
9703                scan.nearest("vec", &q, 34)?
9704                    .with_row_id()
9705                    .project(&["_distance", "_rowid"])
9706            },
9707            "ProjectionExec: expr=[_distance@2 as _distance, _rowid@0 as _rowid]
9708  FilterExec: _distance@2 IS NOT NULL
9709    SortExec: TopK(fetch=34), expr=[_distance@2 ASC NULLS LAST, _rowid@0 ASC NULLS LAST]...
9710      KNNVectorDistance: metric=l2
9711        RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9712          UnionExec
9713            ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec]
9714              FilterExec: _distance@2 IS NOT NULL
9715                SortExec: TopK(fetch=34), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST]...
9716                  KNNVectorDistance: metric=l2
9717                    LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None
9718            Take: columns=\"_distance, _rowid, (vec)\"
9719              CoalesceBatchesExec: target_batch_size=8192
9720                SortExec: TopK(fetch=34), expr=[_distance@0 ASC NULLS LAST, _rowid@1 ASC NULLS LAST]...
9721                  ANNSubIndex: name=idx, k=34, deltas=1, metric=L2
9722                    ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1",
9723        )
9724        .await
9725        .unwrap();
9726    }
9727
9728    #[tokio::test]
9729    async fn test_fast_search_without_vector_index_returns_empty() {
9730        let dataset = TestVectorDataset::new(LanceFileVersion::Stable, true)
9731            .await
9732            .unwrap();
9733        let q: Float32Array = (32..64).map(|v| v as f32).collect();
9734
9735        let mut scanner = dataset.dataset.scan();
9736        scanner.nearest("vec", &q, 10).unwrap();
9737        let normal_rows = scanner.try_into_batch().await.unwrap().num_rows();
9738
9739        let mut scanner = dataset.dataset.scan();
9740        scanner.nearest("vec", &q, 10).unwrap().fast_search();
9741        let fast_rows = scanner.try_into_batch().await.unwrap().num_rows();
9742
9743        assert_eq!(normal_rows, 10);
9744        assert_eq!(fast_rows, 0);
9745    }
9746
9747    #[rstest]
9748    #[tokio::test]
9749    pub async fn test_scan_planning_io(
9750        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
9751        data_storage_version: LanceFileVersion,
9752    ) {
9753        // Create a large dataset with a scalar indexed column and a sorted but not scalar
9754        // indexed column
9755
9756        use lance_index::scalar::inverted::tokenizer::InvertedIndexParams;
9757        use lance_io::assert_io_eq;
9758        let data = gen_batch()
9759            .col(
9760                "vector",
9761                array::rand_vec::<Float32Type>(Dimension::from(32)),
9762            )
9763            .col("text", array::rand_utf8(ByteCount::from(4), false))
9764            .col("indexed", array::step::<Int32Type>())
9765            .col("not_indexed", array::step::<Int32Type>())
9766            .into_reader_rows(RowCount::from(100), BatchCount::from(5));
9767
9768        let mut dataset = Dataset::write(
9769            data,
9770            "memory://test",
9771            Some(WriteParams {
9772                data_storage_version: Some(data_storage_version),
9773                ..Default::default()
9774            }),
9775        )
9776        .await
9777        .unwrap();
9778        dataset
9779            .create_index(
9780                &["indexed"],
9781                IndexType::Scalar,
9782                None,
9783                &ScalarIndexParams::default(),
9784                false,
9785            )
9786            .await
9787            .unwrap();
9788        dataset
9789            .create_index(
9790                &["text"],
9791                IndexType::Inverted,
9792                None,
9793                &InvertedIndexParams::default(),
9794                false,
9795            )
9796            .await
9797            .unwrap();
9798        dataset
9799            .create_index(
9800                &["vector"],
9801                IndexType::Vector,
9802                None,
9803                &VectorIndexParams {
9804                    metric_type: DistanceType::L2,
9805                    stages: vec![
9806                        StageParams::Ivf(IvfBuildParams {
9807                            max_iters: 2,
9808                            num_partitions: Some(2),
9809                            sample_rate: 2,
9810                            ..Default::default()
9811                        }),
9812                        StageParams::PQ(PQBuildParams {
9813                            max_iters: 2,
9814                            num_sub_vectors: 2,
9815                            ..Default::default()
9816                        }),
9817                    ],
9818                    version: crate::index::vector::IndexFileVersion::Legacy,
9819                    skip_transpose: false,
9820                    runtime_hints: Default::default(),
9821                },
9822                false,
9823            )
9824            .await
9825            .unwrap();
9826
9827        // First planning cycle needs to do some I/O to determine what scalar indices are available
9828        dataset
9829            .scan()
9830            .prefilter(true)
9831            .filter("indexed > 10")
9832            .unwrap()
9833            .explain_plan(true)
9834            .await
9835            .unwrap();
9836
9837        // First pass will need to perform some IOPs to determine what scalar indices are available
9838        let io_stats = dataset.object_store.as_ref().io_stats_incremental();
9839        assert_io_gt!(io_stats, read_iops, 0);
9840
9841        // Second planning cycle should not perform any I/O
9842        dataset
9843            .scan()
9844            .prefilter(true)
9845            .filter("indexed > 10")
9846            .unwrap()
9847            .explain_plan(true)
9848            .await
9849            .unwrap();
9850
9851        let io_stats = dataset.object_store.as_ref().io_stats_incremental();
9852        assert_io_eq!(io_stats, read_iops, 0);
9853
9854        dataset
9855            .scan()
9856            .prefilter(true)
9857            .filter("true")
9858            .unwrap()
9859            .explain_plan(true)
9860            .await
9861            .unwrap();
9862
9863        let io_stats = dataset.object_store.as_ref().io_stats_incremental();
9864        assert_io_eq!(io_stats, read_iops, 0);
9865
9866        dataset
9867            .scan()
9868            .prefilter(true)
9869            .materialization_style(MaterializationStyle::AllEarly)
9870            .filter("true")
9871            .unwrap()
9872            .explain_plan(true)
9873            .await
9874            .unwrap();
9875
9876        let io_stats = dataset.object_store.as_ref().io_stats_incremental();
9877        assert_io_eq!(io_stats, read_iops, 0);
9878
9879        dataset
9880            .scan()
9881            .prefilter(true)
9882            .materialization_style(MaterializationStyle::AllLate)
9883            .filter("true")
9884            .unwrap()
9885            .explain_plan(true)
9886            .await
9887            .unwrap();
9888
9889        let io_stats = dataset.object_store.as_ref().io_stats_incremental();
9890        assert_io_eq!(io_stats, read_iops, 0);
9891    }
9892
9893    #[rstest]
9894    #[tokio::test]
9895    pub async fn test_row_meta_columns(
9896        #[values(
9897            (true, false),  // Test row_id only
9898            (false, true),  // Test row_address only
9899            (true, true)    // Test both
9900        )]
9901        columns: (bool, bool),
9902    ) {
9903        let (with_row_id, with_row_address) = columns;
9904        let test_dir = TempStrDir::default();
9905        let uri = &test_dir;
9906
9907        let schema = Arc::new(arrow_schema::Schema::new(vec![
9908            arrow_schema::Field::new("data_item_id", arrow_schema::DataType::Int32, false),
9909            arrow_schema::Field::new("a", arrow_schema::DataType::Int32, false),
9910        ]));
9911
9912        let data = RecordBatch::try_new(
9913            schema.clone(),
9914            vec![
9915                Arc::new(Int32Array::from(vec![1001, 1002, 1003])),
9916                Arc::new(Int32Array::from(vec![1, 2, 3])),
9917            ],
9918        )
9919        .unwrap();
9920
9921        let dataset = Dataset::write(
9922            RecordBatchIterator::new(vec![Ok(data)], schema.clone()),
9923            uri,
9924            None,
9925        )
9926        .await
9927        .unwrap();
9928
9929        // Test explicit projection
9930        let mut scanner = dataset.scan();
9931
9932        let mut projection = vec!["data_item_id".to_string()];
9933        if with_row_id {
9934            scanner.with_row_id();
9935            projection.push(ROW_ID.to_string());
9936        }
9937        if with_row_address {
9938            scanner.with_row_address();
9939            projection.push(ROW_ADDR.to_string());
9940        }
9941
9942        scanner.project(&projection).unwrap();
9943        let stream = scanner.try_into_stream().await.unwrap();
9944        let batch = stream.try_collect::<Vec<_>>().await.unwrap().pop().unwrap();
9945
9946        // Verify column existence and data type
9947        if with_row_id {
9948            let column = batch.column_by_name(ROW_ID).unwrap();
9949            assert_eq!(column.data_type(), &DataType::UInt64);
9950        }
9951        if with_row_address {
9952            let column = batch.column_by_name(ROW_ADDR).unwrap();
9953            assert_eq!(column.data_type(), &DataType::UInt64);
9954        }
9955
9956        // Test implicit inclusion
9957        let mut scanner = dataset.scan();
9958        if with_row_id {
9959            scanner.with_row_id();
9960        }
9961        if with_row_address {
9962            scanner.with_row_address();
9963        }
9964        scanner.project(&["data_item_id"]).unwrap();
9965        let stream = scanner.try_into_stream().await.unwrap();
9966        let batch = stream.try_collect::<Vec<_>>().await.unwrap().pop().unwrap();
9967        let meta_column = batch.column_by_name(if with_row_id { ROW_ID } else { ROW_ADDR });
9968        assert!(meta_column.is_some());
9969
9970        // Test error case
9971        let mut scanner = dataset.scan();
9972        if with_row_id {
9973            scanner.project(&[ROW_ID]).unwrap();
9974        } else {
9975            scanner.project(&[ROW_ADDR]).unwrap();
9976        };
9977        let stream = scanner.try_into_stream().await.unwrap();
9978        assert_eq!(stream.schema().fields().len(), 1);
9979        if with_row_id {
9980            assert!(stream.schema().field_with_name(ROW_ID).is_ok());
9981        } else {
9982            assert!(stream.schema().field_with_name(ROW_ADDR).is_ok());
9983        }
9984    }
9985
9986    async fn limit_offset_equivalency_test(scanner: &Scanner) {
9987        async fn test_one(
9988            scanner: &Scanner,
9989            full_result: &RecordBatch,
9990            limit: Option<i64>,
9991            offset: Option<i64>,
9992        ) {
9993            let mut new_scanner = scanner.clone();
9994            new_scanner.limit(limit, offset).unwrap();
9995            if let Some(nearest) = new_scanner.nearest_mut() {
9996                nearest.k = offset.unwrap_or(0).saturating_add(limit.unwrap_or(10_000)) as usize;
9997            }
9998            let result = new_scanner.try_into_batch().await.unwrap();
9999
10000            let resolved_offset = offset.unwrap_or(0).min(full_result.num_rows() as i64);
10001            let resolved_length = limit
10002                .unwrap_or(i64::MAX)
10003                .min(full_result.num_rows() as i64 - resolved_offset);
10004
10005            let expected = full_result.slice(resolved_offset as usize, resolved_length as usize);
10006
10007            if expected != result {
10008                let plan = new_scanner.analyze_plan().await.unwrap();
10009                assert_eq!(
10010                    &expected, &result,
10011                    "Limit: {:?}, Offset: {:?}, Plan: \n{}",
10012                    limit, offset, plan
10013                );
10014            }
10015        }
10016
10017        let mut scanner_full = scanner.clone();
10018        if let Some(nearest) = scanner_full.nearest_mut() {
10019            nearest.k = 500;
10020        }
10021        let full_results = scanner_full.try_into_batch().await.unwrap();
10022
10023        test_one(scanner, &full_results, Some(1), None).await;
10024        test_one(scanner, &full_results, Some(1), Some(1)).await;
10025        test_one(scanner, &full_results, Some(1), Some(2)).await;
10026        test_one(scanner, &full_results, Some(1), Some(10)).await;
10027
10028        test_one(scanner, &full_results, Some(3), None).await;
10029        test_one(scanner, &full_results, Some(3), Some(2)).await;
10030        test_one(scanner, &full_results, Some(3), Some(4)).await;
10031
10032        test_one(scanner, &full_results, None, Some(3)).await;
10033        test_one(scanner, &full_results, None, Some(10)).await;
10034    }
10035
10036    #[tokio::test]
10037    async fn test_scan_limit_offset() {
10038        let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10039            .await
10040            .unwrap();
10041        let scanner = test_ds.dataset.scan();
10042        limit_offset_equivalency_test(&scanner).await;
10043    }
10044
10045    #[tokio::test]
10046    async fn test_knn_limit_offset() {
10047        let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10048            .await
10049            .unwrap();
10050        let query_vector = Float32Array::from(vec![0.0; 32]);
10051        let mut scanner = test_ds.dataset.scan();
10052        scanner
10053            .nearest("vec", &query_vector, 5)
10054            .unwrap()
10055            .project(&["i"])
10056            .unwrap();
10057        limit_offset_equivalency_test(&scanner).await;
10058    }
10059
10060    #[tokio::test]
10061    async fn test_knn_query_parallelism_defaults_and_setter() {
10062        let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10063            .await
10064            .unwrap();
10065        let query_vector = Float32Array::from(vec![0.0; 32]);
10066        let mut scanner = test_ds.dataset.scan();
10067        scanner.nearest("vec", &query_vector, 5).unwrap();
10068        assert_eq!(
10069            scanner.nearest_mut().unwrap().query_parallelism,
10070            DEFAULT_QUERY_PARALLELISM
10071        );
10072
10073        scanner.query_parallelism(4);
10074        assert_eq!(scanner.nearest_mut().unwrap().query_parallelism, 4);
10075
10076        scanner.query_parallelism(-1);
10077        assert_eq!(scanner.nearest_mut().unwrap().query_parallelism, -1);
10078    }
10079
10080    #[tokio::test]
10081    async fn test_ivf_pq_query_parallelism_returns_same_results() {
10082        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10083            .await
10084            .unwrap();
10085        test_ds.make_vector_index().await.unwrap();
10086
10087        let query_vector = Float32Array::from(vec![0.0; 32]);
10088
10089        let mut sequential = test_ds.dataset.scan();
10090        sequential.nearest("vec", &query_vector, 50).unwrap();
10091        let sequential_results = sequential.try_into_batch().await.unwrap();
10092
10093        let mut parallel = test_ds.dataset.scan();
10094        parallel
10095            .nearest("vec", &query_vector, 50)
10096            .unwrap()
10097            .query_parallelism(4);
10098        let parallel_results = parallel.try_into_batch().await.unwrap();
10099
10100        assert_eq!(sequential_results, parallel_results);
10101    }
10102
10103    #[tokio::test]
10104    async fn test_ivf_pq_limit_offset() {
10105        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10106            .await
10107            .unwrap();
10108        test_ds.make_vector_index().await.unwrap();
10109        test_ds.append_new_data().await.unwrap();
10110        let query_vector = Float32Array::from(vec![0.0; 32]);
10111        let mut scanner = test_ds.dataset.scan();
10112        scanner.nearest("vec", &query_vector, 500).unwrap();
10113        limit_offset_equivalency_test(&scanner).await;
10114    }
10115
10116    #[tokio::test]
10117    async fn test_fts_limit_offset() {
10118        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10119            .await
10120            .unwrap();
10121        test_ds.make_fts_index().await.unwrap();
10122        test_ds.append_new_data().await.unwrap();
10123        let mut scanner = test_ds.dataset.scan();
10124        scanner
10125            .full_text_search(FullTextSearchQuery::new("4".into()))
10126            .unwrap();
10127        limit_offset_equivalency_test(&scanner).await;
10128    }
10129
10130    #[tokio::test]
10131    async fn test_fts_fast_search_excludes_unindexed_rows() {
10132        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10133            .await
10134            .unwrap();
10135        test_ds.make_fts_index().await.unwrap();
10136        // Append rows after index build so they stay unindexed.
10137        test_ds.append_data_with_range(10, 20).await.unwrap();
10138
10139        let mut scanner = test_ds.dataset.scan();
10140        scanner
10141            .full_text_search(FullTextSearchQuery::new_query(
10142                MatchQuery::new("15".to_owned())
10143                    .with_column(Some("s".to_owned()))
10144                    .into(),
10145            ))
10146            .unwrap();
10147        let normal_rows = scanner.try_into_batch().await.unwrap().num_rows();
10148
10149        let mut scanner = test_ds.dataset.scan();
10150        scanner
10151            .full_text_search(FullTextSearchQuery::new_query(
10152                MatchQuery::new("15".to_owned())
10153                    .with_column(Some("s".to_owned()))
10154                    .into(),
10155            ))
10156            .unwrap()
10157            .fast_search();
10158        let fast_rows = scanner.try_into_batch().await.unwrap().num_rows();
10159
10160        assert_eq!(normal_rows, 2);
10161        assert_eq!(fast_rows, 1);
10162    }
10163
10164    async fn test_row_offset_read_helper(
10165        ds: &Dataset,
10166        scan_builder: impl FnOnce(&mut Scanner) -> &mut Scanner,
10167        expected_cols: &[&str],
10168        expected_row_offsets: &[u64],
10169    ) {
10170        let mut scanner = ds.scan();
10171        let scanner = scan_builder(&mut scanner);
10172        let stream = scanner.try_into_stream().await.unwrap();
10173
10174        let schema = stream.schema();
10175        let actual_cols = schema
10176            .fields()
10177            .iter()
10178            .map(|f| f.name().as_str())
10179            .collect::<Vec<_>>();
10180        assert_eq!(&actual_cols, expected_cols);
10181
10182        let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10183        let batch = arrow_select::concat::concat_batches(&schema, &batches).unwrap();
10184
10185        let row_offsets = batch
10186            .column_by_name(ROW_OFFSET)
10187            .unwrap()
10188            .as_primitive::<UInt64Type>()
10189            .values();
10190        assert_eq!(row_offsets.as_ref(), expected_row_offsets);
10191    }
10192
10193    #[tokio::test]
10194    async fn test_row_offset_read() {
10195        let mut ds = lance_datagen::gen_batch()
10196            .col("idx", array::step::<Int32Type>())
10197            .into_ram_dataset(FragmentCount::from(3), FragmentRowCount::from(3))
10198            .await
10199            .unwrap();
10200        // [0, 1, 2], [3, 4, 5], [6, 7, 8]
10201
10202        // Delete [2, 3, 4, 5, 6]
10203        ds.delete("idx >= 2 AND idx <= 6").await.unwrap();
10204
10205        // Normal read, all columns plus row offset
10206        test_row_offset_read_helper(
10207            &ds,
10208            |scanner| scanner.project(&["idx", ROW_OFFSET]).unwrap(),
10209            &["idx", ROW_OFFSET],
10210            &[0, 1, 2, 3],
10211        )
10212        .await;
10213
10214        // Read with row offset only
10215        test_row_offset_read_helper(
10216            &ds,
10217            |scanner| scanner.project(&[ROW_OFFSET]).unwrap(),
10218            &[ROW_OFFSET],
10219            &[0, 1, 2, 3],
10220        )
10221        .await;
10222
10223        // Filtered read of row offset
10224        test_row_offset_read_helper(
10225            &ds,
10226            |scanner| {
10227                scanner
10228                    .filter("idx > 1")
10229                    .unwrap()
10230                    .project(&[ROW_OFFSET])
10231                    .unwrap()
10232            },
10233            &[ROW_OFFSET],
10234            &[2, 3],
10235        )
10236        .await;
10237    }
10238
10239    #[tokio::test]
10240    async fn test_filter_to_take() {
10241        let mut ds = lance_datagen::gen_batch()
10242            .col("idx", array::step::<Int32Type>())
10243            .into_ram_dataset(FragmentCount::from(3), FragmentRowCount::from(100))
10244            .await
10245            .unwrap();
10246
10247        let row_ids = ds
10248            .scan()
10249            .project(&Vec::<&str>::default())
10250            .unwrap()
10251            .with_row_id()
10252            .try_into_stream()
10253            .await
10254            .unwrap()
10255            .try_collect::<Vec<_>>()
10256            .await
10257            .unwrap();
10258        let schema = row_ids[0].schema();
10259        let row_ids = concat_batches(&schema, row_ids.iter()).unwrap();
10260        let row_ids = row_ids.column(0).as_primitive::<UInt64Type>().clone();
10261
10262        let row_addrs = ds
10263            .scan()
10264            .project(&Vec::<&str>::default())
10265            .unwrap()
10266            .with_row_address()
10267            .try_into_stream()
10268            .await
10269            .unwrap()
10270            .try_collect::<Vec<_>>()
10271            .await
10272            .unwrap();
10273        let schema = row_addrs[0].schema();
10274        let row_addrs = concat_batches(&schema, row_addrs.iter()).unwrap();
10275        let row_addrs = row_addrs.column(0).as_primitive::<UInt64Type>().clone();
10276
10277        ds.delete("idx >= 190 AND idx < 210").await.unwrap();
10278
10279        let ds_copy = ds.clone();
10280        let do_check = async move |filt: &str, expected_idx: &[i32], applies_optimization: bool| {
10281            let mut scanner = ds_copy.scan();
10282            scanner.filter(filt).unwrap();
10283            // Verify the optimization is applied
10284            let plan = scanner.explain_plan(true).await.unwrap();
10285            if applies_optimization {
10286                assert!(
10287                    plan.contains("OneShotStream"),
10288                    "expected take optimization to be applied. Filter: '{}'.  Plan:\n{}",
10289                    filt,
10290                    plan
10291                );
10292            } else {
10293                assert!(
10294                    !plan.contains("OneShotStream"),
10295                    "expected take optimization to not be applied. Filter: '{}'.  Plan:\n{}",
10296                    filt,
10297                    plan
10298                );
10299            }
10300
10301            // Verify the results
10302            let stream = scanner.try_into_stream().await.unwrap();
10303            let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10304            let idx = batches
10305                .iter()
10306                .map(|b| b.column_by_name("idx").unwrap().as_ref())
10307                .collect::<Vec<_>>();
10308
10309            if idx.is_empty() {
10310                assert!(expected_idx.is_empty());
10311                return;
10312            }
10313
10314            let idx = arrow::compute::concat(&idx).unwrap();
10315            assert_eq!(idx.as_primitive::<Int32Type>().values(), expected_idx);
10316        };
10317        let check =
10318            async |filt: &str, expected_idx: &[i32]| do_check(filt, expected_idx, true).await;
10319        let check_no_opt = async |filt: &str, expected_idx: &[i32]| {
10320            do_check(filt, expected_idx, false).await;
10321        };
10322
10323        // Simple case, no deletions yet
10324        check("_rowid = 50", &[50]).await;
10325        check("_rowaddr = 50", &[50]).await;
10326        check("_rowoffset = 50", &[50]).await;
10327
10328        check(
10329            "_rowid = 50 OR _rowid = 51 OR _rowid = 52 OR _rowid = 49",
10330            &[49, 50, 51, 52],
10331        )
10332        .await;
10333        check(
10334            "_rowaddr = 50 OR _rowaddr = 51 OR _rowaddr = 52 OR _rowaddr = 49",
10335            &[49, 50, 51, 52],
10336        )
10337        .await;
10338        check(
10339            "_rowoffset = 50 OR _rowoffset = 51 OR _rowoffset = 52 OR _rowoffset = 49",
10340            &[49, 50, 51, 52],
10341        )
10342        .await;
10343
10344        check("_rowid IN (52, 51, 50, 17)", &[17, 50, 51, 52]).await;
10345        check("_rowaddr IN (52, 51, 50, 17)", &[17, 50, 51, 52]).await;
10346        check("_rowoffset IN (52, 51, 50, 17)", &[17, 50, 51, 52]).await;
10347
10348        // Taking _rowid / _rowaddr of deleted row
10349
10350        // When using rowid / rowaddr we get an empty
10351        check(&format!("_rowid = {}", row_ids.value(190)), &[]).await;
10352        check(&format!("_rowaddr = {}", row_addrs.value(190)), &[]).await;
10353        // When using rowoffset it just skips the deleted rows (impossible to create an offset
10354        // into a deleted row)
10355        check("_rowoffset = 190", &[210]).await;
10356
10357        // Grabbing after the deleted rows
10358        check(&format!("_rowid = {}", row_ids.value(250)), &[250]).await;
10359        check(&format!("_rowaddr = {}", row_addrs.value(250)), &[250]).await;
10360        check("_rowoffset = 250", &[270]).await;
10361
10362        // Grabbing past the end
10363        check("_rowoffset = 1000", &[]).await;
10364
10365        // Combine take and filter
10366        check("_rowid IN (5, 10, 15) AND idx > 10", &[15]).await;
10367        check("_rowaddr IN (5, 10, 15) AND idx > 10", &[15]).await;
10368        check("_rowoffset IN (5, 10, 15) AND idx > 10", &[15]).await;
10369        check("idx > 10 AND _rowid IN (5, 10, 15)", &[15]).await;
10370        check("idx > 10 AND _rowaddr IN (5, 10, 15)", &[15]).await;
10371        check("idx > 10 AND _rowoffset IN (5, 10, 15)", &[15]).await;
10372        // Get's simplified into _rowid = 50 and so we catch it
10373        check("_rowid = 50 AND _rowid = 50", &[50]).await;
10374
10375        // Filters that cannot be converted into a take
10376        check_no_opt("_rowid = 50 AND _rowid = 51", &[]).await;
10377        check_no_opt("(_rowid = 50 AND idx < 100) OR _rowid = 51", &[50, 51]).await;
10378
10379        // Dynamic projection
10380        let mut scanner = ds.scan();
10381        scanner.filter("_rowoffset = 77").unwrap();
10382        scanner
10383            .project_with_transform(&[("foo", "idx * 2")])
10384            .unwrap();
10385        let stream = scanner.try_into_stream().await.unwrap();
10386        let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10387        assert_eq!(batches[0].schema().field(0).name(), "foo");
10388        let val = batches[0].column(0).as_primitive::<Int32Type>().values()[0];
10389        assert_eq!(val, 154);
10390    }
10391
10392    #[tokio::test]
10393    async fn test_nested_field_ordering() {
10394        use arrow_array::StructArray;
10395
10396        // Create test data with nested structs
10397        let id_array = Int32Array::from(vec![3, 1, 2]);
10398        let nested_values = Int32Array::from(vec![30, 10, 20]);
10399        let nested_struct = StructArray::from(vec![(
10400            Arc::new(ArrowField::new("value", DataType::Int32, false)),
10401            Arc::new(nested_values) as ArrayRef,
10402        )]);
10403
10404        let schema = Arc::new(ArrowSchema::new(vec![
10405            ArrowField::new("id", DataType::Int32, false),
10406            ArrowField::new(
10407                "nested",
10408                DataType::Struct(vec![ArrowField::new("value", DataType::Int32, false)].into()),
10409                false,
10410            ),
10411        ]));
10412
10413        let batch = RecordBatch::try_new(
10414            schema.clone(),
10415            vec![Arc::new(id_array), Arc::new(nested_struct)],
10416        )
10417        .unwrap();
10418
10419        let test_dir = TempStrDir::default();
10420        let test_uri = &test_dir;
10421        let reader = RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema.clone());
10422
10423        let dataset = Dataset::write(reader, test_uri, None).await.unwrap();
10424
10425        // Test ordering by nested field
10426        let mut scanner = dataset.scan();
10427        scanner
10428            .order_by(Some(vec![ColumnOrdering {
10429                column_name: "nested.value".to_string(),
10430                ascending: true,
10431                nulls_first: true,
10432            }]))
10433            .unwrap(); // ascending order
10434
10435        let stream = scanner.try_into_stream().await.unwrap();
10436        let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10437
10438        // Check that results are sorted by nested.value
10439        let sorted_ids = batches[0].column(0).as_primitive::<Int32Type>().values();
10440        assert_eq!(sorted_ids[0], 1); // id=1 has nested.value=10
10441        assert_eq!(sorted_ids[1], 2); // id=2 has nested.value=20
10442        assert_eq!(sorted_ids[2], 3); // id=3 has nested.value=30
10443    }
10444
10445    #[tokio::test]
10446    async fn test_limit_with_ordering_not_pushed_down() {
10447        // This test verifies the fix for a bug where limit/offset could be pushed down
10448        // even when ordering was specified. When ordering is present, we need to load
10449        // all data first to sort it before applying limits.
10450
10451        // Create test data with specific ordering
10452        let id_array = Int32Array::from(vec![5, 2, 8, 1, 3, 7, 4, 6]);
10453        let value_array = Int32Array::from(vec![50, 20, 80, 10, 30, 70, 40, 60]);
10454
10455        let schema = Arc::new(ArrowSchema::new(vec![
10456            ArrowField::new("id", DataType::Int32, false),
10457            ArrowField::new("value", DataType::Int32, false),
10458        ]));
10459
10460        let batch = RecordBatch::try_new(
10461            schema.clone(),
10462            vec![Arc::new(id_array), Arc::new(value_array)],
10463        )
10464        .unwrap();
10465
10466        let test_dir = TempStrDir::default();
10467        let test_uri = &test_dir;
10468        let reader = RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema.clone());
10469
10470        let dataset = Dataset::write(reader, test_uri, None).await.unwrap();
10471
10472        // Test 1: limit with ordering should return top N after sorting
10473        let mut scanner = dataset.scan();
10474        scanner
10475            .order_by(Some(vec![ColumnOrdering {
10476                column_name: "value".to_string(),
10477                ascending: true,
10478                nulls_first: true,
10479            }]))
10480            .unwrap();
10481        scanner.limit(Some(3), None).unwrap();
10482
10483        let stream = scanner.try_into_stream().await.unwrap();
10484        let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10485
10486        // Results should be sorted by value and limited to 3
10487        let sorted_ids = batches[0].column(0).as_primitive::<Int32Type>().values();
10488        let sorted_values = batches[0].column(1).as_primitive::<Int32Type>().values();
10489        assert_eq!(batches[0].num_rows(), 3);
10490        assert_eq!(sorted_ids[0], 1); // value=10
10491        assert_eq!(sorted_ids[1], 2); // value=20
10492        assert_eq!(sorted_ids[2], 3); // value=30
10493        assert_eq!(sorted_values[0], 10);
10494        assert_eq!(sorted_values[1], 20);
10495        assert_eq!(sorted_values[2], 30);
10496
10497        // Test 2: offset with ordering should skip first N after sorting
10498        let mut scanner = dataset.scan();
10499        scanner
10500            .order_by(Some(vec![ColumnOrdering {
10501                column_name: "value".to_string(),
10502                ascending: true,
10503                nulls_first: true,
10504            }]))
10505            .unwrap();
10506        scanner.limit(Some(3), Some(2)).unwrap(); // Skip first 2, take next 3
10507
10508        let stream = scanner.try_into_stream().await.unwrap();
10509        let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10510
10511        let sorted_ids = batches[0].column(0).as_primitive::<Int32Type>().values();
10512        let sorted_values = batches[0].column(1).as_primitive::<Int32Type>().values();
10513        assert_eq!(batches[0].num_rows(), 3);
10514        assert_eq!(sorted_ids[0], 3); // value=30 (skipped 10, 20)
10515        assert_eq!(sorted_ids[1], 4); // value=40
10516        assert_eq!(sorted_ids[2], 5); // value=50
10517        assert_eq!(sorted_values[0], 30);
10518        assert_eq!(sorted_values[1], 40);
10519        assert_eq!(sorted_values[2], 50);
10520
10521        // Test 3: without ordering, limit can be pushed down (different behavior)
10522        let mut scanner = dataset.scan();
10523        scanner.limit(Some(3), None).unwrap();
10524
10525        let stream = scanner.try_into_stream().await.unwrap();
10526        let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10527
10528        // Should get first 3 rows in storage order (not sorted)
10529        assert_eq!(batches[0].num_rows(), 3);
10530        let unsorted_values = batches[0].column(1).as_primitive::<Int32Type>().values();
10531        // These will be in original insertion order, not sorted
10532        assert_eq!(unsorted_values[0], 50);
10533        assert_eq!(unsorted_values[1], 20);
10534        assert_eq!(unsorted_values[2], 80);
10535    }
10536
10537    #[tokio::test]
10538    async fn test_scan_with_version_columns() {
10539        use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
10540        use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
10541
10542        // Create a simple dataset
10543        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
10544            "id",
10545            DataType::Int32,
10546            false,
10547        )]));
10548
10549        let batch = RecordBatch::try_new(
10550            schema.clone(),
10551            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
10552        )
10553        .unwrap();
10554
10555        let test_dir = lance_core::utils::tempfile::TempStrDir::default();
10556        let test_uri = test_dir.as_str();
10557
10558        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
10559        let write_params = WriteParams {
10560            enable_stable_row_ids: true,
10561            ..Default::default()
10562        };
10563        Dataset::write(reader, test_uri, Some(write_params))
10564            .await
10565            .unwrap();
10566
10567        let dataset = Dataset::open(test_uri).await.unwrap();
10568        let mut scanner = dataset.scan();
10569
10570        scanner
10571            .project(&[ROW_CREATED_AT_VERSION, ROW_LAST_UPDATED_AT_VERSION])
10572            .unwrap();
10573
10574        // Check that the schema includes version columns
10575        let output_schema = scanner.schema().await.unwrap();
10576        assert!(
10577            output_schema
10578                .column_with_name("_row_last_updated_at_version")
10579                .is_some(),
10580            "Schema should include _row_last_updated_at_version"
10581        );
10582        assert!(
10583            output_schema
10584                .column_with_name("_row_created_at_version")
10585                .is_some(),
10586            "Schema should include _row_created_at_version"
10587        );
10588
10589        // Actually read the data to ensure version columns are materialized
10590        let batches = scanner
10591            .try_into_stream()
10592            .await
10593            .unwrap()
10594            .try_collect::<Vec<_>>()
10595            .await
10596            .unwrap();
10597
10598        assert_eq!(batches.len(), 1);
10599        let batch = &batches[0];
10600
10601        // Verify version columns exist in the output
10602        let last_updated = batch
10603            .column_by_name("_row_last_updated_at_version")
10604            .expect("Should have _row_last_updated_at_version column");
10605        let created_at = batch
10606            .column_by_name("_row_created_at_version")
10607            .expect("Should have _row_created_at_version column");
10608
10609        // Verify they have the correct values (all rows created in version 1)
10610        let last_updated_array = last_updated
10611            .as_any()
10612            .downcast_ref::<arrow_array::UInt64Array>()
10613            .unwrap();
10614        let created_at_array = created_at
10615            .as_any()
10616            .downcast_ref::<arrow_array::UInt64Array>()
10617            .unwrap();
10618
10619        for i in 0..batch.num_rows() {
10620            assert_eq!(
10621                last_updated_array.value(i),
10622                1,
10623                "All rows last updated at version 1"
10624            );
10625            assert_eq!(
10626                created_at_array.value(i),
10627                1,
10628                "All rows created at version 1"
10629            );
10630        }
10631    }
10632
10633    #[test_log::test(test)]
10634    fn test_scan_finishes_all_tasks() {
10635        // Need to use multi-threaded runtime otherwise tasks don't run unless someone is polling somewhere
10636        let runtime = tokio::runtime::Builder::new_multi_thread()
10637            .enable_time()
10638            .build()
10639            .unwrap();
10640
10641        runtime.block_on(async move {
10642            let ds = lance_datagen::gen_batch()
10643                .col("id", lance_datagen::array::step::<Int32Type>())
10644                .into_ram_dataset(FragmentCount::from(1000), FragmentRowCount::from(10))
10645                .await
10646                .unwrap();
10647
10648            // This scan with has a small I/O buffer size and batch size to mimic a real-world situation
10649            // that required a lot of data.  Many fragments will be scheduled at low priority and the data
10650            // buffer will fill up with data reads.  When the scan is abandoned, the tasks to read the fragment
10651            // metadata were left behind and would never finish because the data was never decoded to drain the
10652            // backpressure queue.
10653            //
10654            // The fix (that this test verifies) is to ensure we close the I/O scheduler when the scan is abandoned.
10655            let mut stream = ds
10656                .scan()
10657                .fragment_readahead(1000)
10658                .batch_size(1)
10659                .io_buffer_size(1)
10660                .batch_readahead(1)
10661                .try_into_stream()
10662                .await
10663                .unwrap();
10664            stream.next().await.unwrap().unwrap();
10665        });
10666
10667        let start = Instant::now();
10668        while start.elapsed() < Duration::from_secs(10) {
10669            if runtime.handle().metrics().num_alive_tasks() == 0 {
10670                break;
10671            }
10672            std::thread::sleep(Duration::from_millis(100));
10673        }
10674
10675        assert!(
10676            runtime.handle().metrics().num_alive_tasks() == 0,
10677            "Tasks should have finished within 10 seconds but there are still {} tasks running",
10678            runtime.handle().metrics().num_alive_tasks()
10679        );
10680    }
10681
10682    fn find_filtered_read(plan: &dyn ExecutionPlan) -> Option<&FilteredReadExec> {
10683        if let Some(f) = plan.as_any().downcast_ref::<FilteredReadExec>() {
10684            return Some(f);
10685        }
10686        for child in plan.children() {
10687            if let Some(f) = find_filtered_read(child.as_ref()) {
10688                return Some(f);
10689            }
10690        }
10691        None
10692    }
10693
10694    #[tokio::test]
10695    async fn test_io_buffer_size_explicit_propagated() {
10696        // Sanity check: an explicit .io_buffer_size(N) call must reach the
10697        // FilteredReadExec options unchanged, and the absence of one must leave
10698        // io_buffer_size_bytes as None so FilteredReadExec can pick its own
10699        // fallback (env var or max_bandwidth).
10700        let data = lance_datagen::gen_batch()
10701            .col("x", lance_datagen::array::step::<Int32Type>())
10702            .into_reader_rows(RowCount::from(8), BatchCount::from(1));
10703        let dataset = Dataset::write(data, "memory://test_io_buffer_explicit", None)
10704            .await
10705            .unwrap();
10706
10707        let plan = dataset.scan().create_plan().await.unwrap();
10708        let filtered = find_filtered_read(plan.as_ref())
10709            .expect("expected a FilteredReadExec in the scan plan");
10710        assert_eq!(filtered.options().io_buffer_size_bytes, None);
10711
10712        let mut scanner = dataset.scan();
10713        scanner.io_buffer_size(7777);
10714        let plan = scanner.create_plan().await.unwrap();
10715        let filtered = find_filtered_read(plan.as_ref())
10716            .expect("expected a FilteredReadExec in the scan plan");
10717        assert_eq!(filtered.options().io_buffer_size_bytes, Some(7777));
10718    }
10719
10720    // The env var key scopes serial_test's lock so this test only blocks others
10721    // that touch LANCE_DEFAULT_IO_BUFFER_SIZE — unrelated tests still run in
10722    // parallel.
10723    #[test]
10724    #[serial_test::serial(LANCE_DEFAULT_IO_BUFFER_SIZE)]
10725    fn test_default_io_buffer_size_override_env_var() {
10726        // Force the sibling LazyLock to evaluate before we mutate the env var.
10727        // It caches forever on first read, so another test concurrently reading
10728        // *DEFAULT_IO_BUFFER_SIZE during our mutation window would otherwise
10729        // cache one of our test values and poison the rest of the suite.
10730        let _ = *DEFAULT_IO_BUFFER_SIZE;
10731
10732        // FilteredReadExec consults this when no explicit io_buffer_size was set
10733        // on the scanner, so the LANCE_DEFAULT_IO_BUFFER_SIZE env var takes
10734        // precedence over the max_bandwidth fallback.
10735        unsafe {
10736            std::env::set_var("LANCE_DEFAULT_IO_BUFFER_SIZE", "4096");
10737        }
10738        assert_eq!(get_default_io_buffer_size_override(), Some(4096));
10739
10740        unsafe {
10741            std::env::set_var("LANCE_DEFAULT_IO_BUFFER_SIZE", "not_a_number");
10742        }
10743        assert_eq!(get_default_io_buffer_size_override(), None);
10744
10745        unsafe {
10746            std::env::remove_var("LANCE_DEFAULT_IO_BUFFER_SIZE");
10747        }
10748        assert_eq!(get_default_io_buffer_size_override(), None);
10749    }
10750
10751    fn assert_values_in_range(array: &Int32Array, range: std::ops::Range<i32>, msg: &str) {
10752        assert!(!array.is_empty(), "Expected some results but got none");
10753        assert!(
10754            array
10755                .iter()
10756                .all(|v| v.is_some_and(|val| range.contains(&val))),
10757            "{msg} (expected range {range:?})"
10758        );
10759    }
10760
10761    // Helper to assert that results exist from all fragment ranges
10762    fn assert_has_all_fragments(array: &Int32Array) {
10763        assert!(
10764            array
10765                .iter()
10766                .any(|v| v.is_some_and(|val| (0..200).contains(&val)))
10767                && array
10768                    .iter()
10769                    .any(|v| v.is_some_and(|val| (200..400).contains(&val)))
10770                && array
10771                    .iter()
10772                    .any(|v| v.is_some_and(|val| (400..410).contains(&val)))
10773                && array
10774                    .iter()
10775                    .any(|v| v.is_some_and(|val| (410..420).contains(&val))),
10776            "Expected results from all fragments"
10777        );
10778    }
10779
10780    // Common test function for fragment list filtering (unindexed + indexed fragments)
10781    async fn test_fragment_list_filtering(
10782        test_ds: &TestVectorDataset,
10783        fragments: &[Fragment],
10784        mut build_scanner: impl FnMut(&Dataset) -> Scanner,
10785    ) {
10786        // Test 1: Query without fragment filter - should get results from all fragments
10787        let batch = build_scanner(&test_ds.dataset)
10788            .try_into_batch()
10789            .await
10790            .unwrap();
10791        let i_array = batch
10792            .column_by_name("i")
10793            .unwrap()
10794            .as_any()
10795            .downcast_ref::<Int32Array>()
10796            .unwrap();
10797        assert_has_all_fragments(i_array);
10798
10799        // Test 2: Query only one unindexed fragment (fragment 2), excluding fragment 3
10800        let mut scanner = build_scanner(&test_ds.dataset);
10801        scanner.with_fragments(vec![fragments[2].clone()]);
10802        let batch = scanner.try_into_batch().await.unwrap();
10803        let i_array = batch
10804            .column_by_name("i")
10805            .unwrap()
10806            .as_any()
10807            .downcast_ref::<Int32Array>()
10808            .unwrap();
10809        assert_values_in_range(i_array, 400..410, "Should only get results from fragment 2");
10810
10811        // Test 3: Query a single indexed fragment (fragment 0 only)
10812        let mut scanner = build_scanner(&test_ds.dataset);
10813        scanner.with_fragments(vec![fragments[0].clone()]);
10814        let batch = scanner.try_into_batch().await.unwrap();
10815        let i_array = batch
10816            .column_by_name("i")
10817            .unwrap()
10818            .as_any()
10819            .downcast_ref::<Int32Array>()
10820            .unwrap();
10821        assert_values_in_range(i_array, 0..200, "Should only get results from fragment 0");
10822
10823        // Test 4: Query all indexed fragments (0, 1) plus one unindexed fragment (2), excluding fragment 3
10824        let mut scanner = build_scanner(&test_ds.dataset);
10825        scanner.with_fragments(vec![
10826            fragments[0].clone(),
10827            fragments[1].clone(),
10828            fragments[2].clone(),
10829        ]);
10830        let batch = scanner.try_into_batch().await.unwrap();
10831        let i_array = batch
10832            .column_by_name("i")
10833            .unwrap()
10834            .as_any()
10835            .downcast_ref::<Int32Array>()
10836            .unwrap();
10837        assert_values_in_range(
10838            i_array,
10839            0..410,
10840            "Should get results from fragments 0, 1, and 2, excluding fragment 3",
10841        );
10842
10843        // Test 5: One indexed fragment (0) + one unindexed fragment (2), skipping indexed fragment 1 and unindexed fragment 3
10844        let mut scanner = build_scanner(&test_ds.dataset);
10845        scanner.with_fragments(vec![fragments[0].clone(), fragments[2].clone()]);
10846        let batch = scanner.try_into_batch().await.unwrap();
10847        let i_array = batch
10848            .column_by_name("i")
10849            .unwrap()
10850            .as_any()
10851            .downcast_ref::<Int32Array>()
10852            .unwrap();
10853        assert!(
10854            i_array
10855                .iter()
10856                .all(|v| v.is_some_and(|val| (0..200).contains(&val) || (400..410).contains(&val)))
10857                && i_array
10858                    .iter()
10859                    .any(|v| v.is_some_and(|val| (0..200).contains(&val)))
10860                && i_array
10861                    .iter()
10862                    .any(|v| v.is_some_and(|val| (400..410).contains(&val))),
10863            "Should only get results from fragment 0 (indexed) and fragment 2 (unindexed)"
10864        );
10865    }
10866
10867    #[tokio::test]
10868    async fn test_vector_search_respects_fragment_list() {
10869        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10870            .await
10871            .unwrap();
10872
10873        // Create one segment per indexed fragment so fragment filtering must prune ANN fan-out.
10874        test_ds.make_segmented_vector_index().await.unwrap();
10875
10876        let query: Float32Array = (0..32).map(|v| v as f32).collect();
10877
10878        // Append two more unindexed fragments
10879        test_ds.append_data_with_range(400, 410).await.unwrap();
10880        test_ds.append_data_with_range(410, 420).await.unwrap();
10881
10882        // Fragment 0: i=0..200 (indexed), Fragment 1: i=200..400 (indexed)
10883        // Fragment 2: i=400..410 (unindexed), Fragment 3: i=410..420 (unindexed)
10884        let fragments = test_ds.dataset.fragments();
10885        assert_eq!(fragments.len(), 4);
10886
10887        test_fragment_list_filtering(&test_ds, fragments, |dataset| {
10888            let mut scanner = dataset.scan();
10889            scanner.nearest("vec", &query, 420).unwrap();
10890            scanner
10891        })
10892        .await;
10893    }
10894
10895    #[tokio::test]
10896    async fn test_vector_search_fragment_filter_prunes_segment_fanout() {
10897        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10898            .await
10899            .unwrap();
10900        test_ds.make_segmented_vector_index().await.unwrap();
10901
10902        let query: Float32Array = (0..32).map(|v| v as f32).collect();
10903        test_ds.append_data_with_range(400, 410).await.unwrap();
10904        test_ds.append_data_with_range(410, 420).await.unwrap();
10905        let fragments = test_ds.dataset.fragments();
10906
10907        let mut scanner = test_ds.dataset.scan();
10908        scanner.nearest("vec", &query, 420).unwrap();
10909        let full_plan = scanner.explain_plan(true).await.unwrap();
10910        assert!(
10911            full_plan.contains("ANNSubIndex: name=idx, k=420, deltas=2, metric=L2"),
10912            "expected two ANN deltas without fragment filter, plan was:\n{full_plan}"
10913        );
10914
10915        let mut scanner = test_ds.dataset.scan();
10916        scanner
10917            .nearest("vec", &query, 420)
10918            .unwrap()
10919            .with_fragments(vec![fragments[0].clone()]);
10920        let filtered_plan = scanner.explain_plan(true).await.unwrap();
10921        assert!(
10922            filtered_plan.contains("ANNSubIndex: name=idx, k=420, deltas=1, metric=L2"),
10923            "expected one ANN delta with fragment filter, plan was:\n{filtered_plan}"
10924        );
10925    }
10926
10927    #[tokio::test]
10928    async fn test_vector_search_respects_index_segments() {
10929        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10930            .await
10931            .unwrap();
10932        let segment_ids = test_ds.make_segmented_vector_index().await.unwrap();
10933
10934        let query: Float32Array = (0..32).map(|v| v as f32).collect();
10935        test_ds.append_data_with_range(400, 410).await.unwrap();
10936        test_ds.append_data_with_range(410, 420).await.unwrap();
10937
10938        let mut scanner = test_ds.dataset.scan();
10939        scanner
10940            .nearest("vec", &query, 420)
10941            .unwrap()
10942            .with_index_segments(vec![segment_ids[0]])
10943            .unwrap();
10944        let batch = scanner.try_into_batch().await.unwrap();
10945        let i_array = batch
10946            .column_by_name("i")
10947            .unwrap()
10948            .as_any()
10949            .downcast_ref::<Int32Array>()
10950            .unwrap();
10951        assert_eq!(batch.num_rows(), 200);
10952        assert_values_in_range(
10953            i_array,
10954            0..200,
10955            "Should only get results from the selected index segment",
10956        );
10957    }
10958
10959    #[tokio::test]
10960    async fn test_vector_search_intersects_fragments_and_index_segments() {
10961        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10962            .await
10963            .unwrap();
10964        let segment_ids = test_ds.make_segmented_vector_index().await.unwrap();
10965
10966        let query: Float32Array = (0..32).map(|v| v as f32).collect();
10967        test_ds.append_data_with_range(400, 410).await.unwrap();
10968        test_ds.append_data_with_range(410, 420).await.unwrap();
10969        let fragments = test_ds.dataset.fragments();
10970
10971        let mut scanner = test_ds.dataset.scan();
10972        scanner
10973            .nearest("vec", &query, 420)
10974            .unwrap()
10975            .with_fragments(vec![fragments[0].clone(), fragments[2].clone()])
10976            .with_index_segments(vec![segment_ids[0]])
10977            .unwrap();
10978        let batch = scanner.try_into_batch().await.unwrap();
10979        let i_array = batch
10980            .column_by_name("i")
10981            .unwrap()
10982            .as_any()
10983            .downcast_ref::<Int32Array>()
10984            .unwrap();
10985        assert!(
10986            i_array
10987                .iter()
10988                .all(|v| v.is_some_and(|val| (0..200).contains(&val) || (400..410).contains(&val)))
10989                && i_array
10990                    .iter()
10991                    .any(|v| v.is_some_and(|val| (0..200).contains(&val)))
10992                && i_array
10993                    .iter()
10994                    .any(|v| v.is_some_and(|val| (400..410).contains(&val))),
10995            "Should get selected segment rows plus flat fallback for target fragments outside the selected segments"
10996        );
10997    }
10998
10999    #[tokio::test]
11000    async fn test_vector_search_rejects_unknown_index_segment() {
11001        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
11002            .await
11003            .unwrap();
11004        test_ds.make_segmented_vector_index().await.unwrap();
11005
11006        let query: Float32Array = (0..32).map(|v| v as f32).collect();
11007        let err = test_ds
11008            .dataset
11009            .scan()
11010            .nearest("vec", &query, 10)
11011            .unwrap()
11012            .with_index_segments(vec![Uuid::new_v4()])
11013            .unwrap()
11014            .try_into_batch()
11015            .await
11016            .unwrap_err();
11017        assert!(
11018            err.to_string().contains("unknown index segments"),
11019            "unexpected error: {err}"
11020        );
11021    }
11022
11023    #[tokio::test]
11024    async fn test_vector_search_rejects_metric_mismatch_for_index_segments() {
11025        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
11026            .await
11027            .unwrap();
11028        let segment_ids = test_ds.make_segmented_vector_index().await.unwrap();
11029
11030        let query: Float32Array = (0..32).map(|v| v as f32).collect();
11031        let err = test_ds
11032            .dataset
11033            .scan()
11034            .nearest("vec", &query, 10)
11035            .unwrap()
11036            .distance_metric(DistanceType::Dot)
11037            .with_index_segments(vec![segment_ids[0]])
11038            .unwrap()
11039            .try_into_batch()
11040            .await
11041            .unwrap_err();
11042        assert!(
11043            err.to_string()
11044                .contains("with_index_segments requested metric"),
11045            "unexpected error: {err}"
11046        );
11047    }
11048
11049    #[tokio::test]
11050    async fn test_with_index_segments_rejects_empty_list() {
11051        let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
11052            .await
11053            .unwrap();
11054        let query: Float32Array = (0..32).map(|v| v as f32).collect();
11055
11056        let Err(err) = test_ds
11057            .dataset
11058            .scan()
11059            .nearest("vec", &query, 10)
11060            .unwrap()
11061            .with_index_segments(vec![])
11062        else {
11063            panic!("expected empty index segments to be rejected");
11064        };
11065        assert!(
11066            err.to_string()
11067                .contains("with_index_segments does not accept an empty segment list"),
11068            "unexpected error: {err}"
11069        );
11070    }
11071
11072    #[tokio::test]
11073    async fn test_with_index_segments_rejected_for_non_vector_query() {
11074        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
11075            .await
11076            .unwrap();
11077        let segment_ids = test_ds.make_segmented_vector_index().await.unwrap();
11078
11079        let err = test_ds
11080            .dataset
11081            .scan()
11082            .project(&["i"])
11083            .unwrap()
11084            .with_index_segments(vec![segment_ids[0]])
11085            .unwrap()
11086            .try_into_batch()
11087            .await
11088            .unwrap_err();
11089        assert!(
11090            err.to_string()
11091                .contains("with_index_segments is only supported for vector search"),
11092            "unexpected error: {err}"
11093        );
11094    }
11095
11096    #[tokio::test]
11097    async fn test_fts_respects_fragment_list() {
11098        let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
11099            .await
11100            .unwrap();
11101
11102        // Create FTS index on first 2 fragments
11103        test_ds.make_fts_index().await.unwrap();
11104
11105        // Append two more unindexed fragments
11106        test_ds.append_data_with_range(400, 410).await.unwrap();
11107        test_ds.append_data_with_range(410, 420).await.unwrap();
11108
11109        // Fragment 0: i=0..200 (indexed), Fragment 1: i=200..400 (indexed)
11110        // Fragment 2: i=400..410 (unindexed), Fragment 3: i=410..420 (unindexed)
11111        let fragments = test_ds.dataset.fragments();
11112        assert_eq!(fragments.len(), 4);
11113
11114        // "s-5" matches: s-5, s-50..s-59, s-150..s-159 (frag 0), s-250..s-259, s-350..s-359 (frag 1), s-405 (frag 2), s-415 (frag 3)
11115        test_fragment_list_filtering(&test_ds, fragments, |dataset| {
11116            let mut scanner = dataset.scan();
11117            scanner
11118                .full_text_search(FullTextSearchQuery::new("s-5".into()))
11119                .unwrap();
11120            scanner
11121        })
11122        .await;
11123    }
11124}