1use std::collections::HashSet;
5
6use datafusion::config::ConfigOptions;
7use std::ops::Range;
8use std::pin::Pin;
9use std::sync::{Arc, LazyLock};
10use std::task::{Context, Poll};
11
12use crate::index::DatasetIndexExt;
13use arrow::array::AsArray;
14use arrow_array::{Array, Float32Array, Int64Array, RecordBatch};
15use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef, SortOptions};
16use arrow_select::concat::concat_batches;
17use async_recursion::async_recursion;
18use chrono::Utc;
19use datafusion::common::{DFSchema, JoinType, NullEquality, SchemaExt, exec_datafusion_err};
20use datafusion::functions_aggregate;
21use datafusion::logical_expr::{Expr, ScalarUDF, col, lit};
22use datafusion::physical_expr::PhysicalSortExpr;
23#[allow(deprecated)]
24use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
25use datafusion::physical_plan::expressions;
26use datafusion::physical_plan::projection::ProjectionExec as DFProjectionExec;
27use datafusion::physical_plan::sorts::sort::SortExec;
28use datafusion::physical_plan::{
29 ExecutionPlan, SendableRecordBatchStream,
30 aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy},
31 display::DisplayableExecutionPlan,
32 limit::GlobalLimitExec,
33 repartition::RepartitionExec,
34 union::UnionExec,
35};
36use datafusion::scalar::ScalarValue;
37use datafusion_expr::ExprSchemable;
38use datafusion_expr::execution_props::ExecutionProps;
39use datafusion_functions::core::getfield::GetFieldFunc;
40use datafusion_physical_expr::expressions::Column;
41use datafusion_physical_expr::{LexOrdering, Partitioning, PhysicalExpr, create_physical_expr};
42use datafusion_physical_plan::joins::PartitionMode;
43use datafusion_physical_plan::projection::ProjectionExec;
44use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
45use datafusion_physical_plan::{empty::EmptyExec, joins::HashJoinExec};
46use futures::future::BoxFuture;
47use futures::stream::{Stream, StreamExt};
48use futures::{FutureExt, TryStreamExt};
49use lance_arrow::floats::{FloatType, coerce_float_vector};
50use lance_arrow::{DataTypeExt, SchemaExt as ArrowSchemaExt};
51use lance_core::datatypes::{
52 BlobHandling, Field, OnMissing, Projection, escape_field_path_for_project, format_field_path,
53};
54use lance_core::error::LanceOptionExt;
55use lance_core::utils::address::RowAddress;
56use lance_core::utils::mask::{RowAddrMask, RowAddrTreeMap};
57use lance_core::utils::tokio::get_num_compute_intensive_cpus;
58use lance_core::{ROW_ADDR, ROW_ID, ROW_OFFSET};
59use lance_datafusion::aggregate::Aggregate;
60use lance_datafusion::exec::{
61 LanceExecutionOptions, OneShotExec, StrictBatchSizeExec, analyze_plan, execute_plan,
62};
63use lance_datafusion::expr::safe_coerce_scalar;
64use lance_datafusion::projection::ProjectionPlan;
65use lance_file::reader::FileReaderOptions;
66use lance_index::IndexCriteria;
67use lance_index::scalar::FullTextSearchQuery;
68use lance_index::scalar::expression::ScalarIndexExpr;
69use lance_index::scalar::expression::{INDEX_EXPR_RESULT_SCHEMA, IndexExprResult, PlannerIndexExt};
70use lance_index::scalar::inverted::query::{
71 FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery, PhraseQuery, fill_fts_query_column,
72};
73use lance_index::scalar::inverted::{SCORE_COL, SCORE_FIELD};
74use lance_index::vector::{DEFAULT_QUERY_PARALLELISM, DIST_COL, Query};
75use lance_index::{metrics::NoOpMetricsCollector, scalar::inverted::FTS_SCHEMA};
76use lance_io::stream::RecordBatchStream;
77use lance_linalg::distance::MetricType;
78use lance_table::format::{Fragment, IndexMetadata};
79use roaring::RoaringBitmap;
80use tracing::{Span, info_span, instrument};
81use uuid::Uuid;
82
83use super::Dataset;
84use crate::dataset::row_offsets_to_row_addresses;
85use crate::dataset::utils::SchemaAdapter;
86use crate::index::DatasetIndexInternalExt;
87use crate::index::scalar::inverted::{load_segment_details, load_segments};
88use crate::index::scalar_logical::scalar_index_fragment_bitmap;
89use crate::index::vector::utils::{
90 default_distance_type_for, get_vector_dim, get_vector_type, validate_distance_type_for,
91};
92use crate::io::exec::filtered_read::{FilteredReadExec, FilteredReadOptions};
93use crate::io::exec::fts::{
94 BoostQueryExec, FlatMatchFilterExec, FlatMatchQueryExec, MatchQueryExec, PhraseQueryExec,
95};
96use crate::io::exec::knn::MultivectorScoringExec;
97use crate::io::exec::scalar_index::{MaterializeIndexExec, ScalarIndexExec};
98use crate::io::exec::{
99 AddRowAddrExec, FilterPlan as ExprFilterPlan, KNNVectorDistanceExec, LancePushdownScanExec,
100 LanceScanExec, Planner, PreFilterSource, ScanConfig, TakeExec,
101 knn::{KNN_INDEX_SCHEMA, new_knn_exec},
102 project,
103};
104use crate::io::exec::{AddRowOffsetExec, LanceFilterExec, LanceScanConfig, get_physical_optimizer};
105use crate::{Error, Result};
106use crate::{
107 datatypes::Schema,
108 io::exec::fts::{BoolSlot, BooleanQueryExec, build_boolean_query_children},
109};
110
111pub use lance_datafusion::exec::{ExecutionStatsCallback, ExecutionSummaryCounts};
112#[cfg(feature = "substrait")]
113use lance_datafusion::substrait::parse_substrait;
114
/// Fallback batch size.
///
/// `Scanner::get_batch_size` uses it as the lower bound when no batch size has
/// been configured, and `get_default_batch_size` reports it as the default in
/// the warning logged when `LANCE_DEFAULT_BATCH_SIZE` cannot be parsed.
pub(crate) const BATCH_SIZE_FALLBACK: usize = 8192;
116
117fn parse_env_var<T: std::str::FromStr>(env_var_name: &str, default_val: &str) -> Option<T>
119where
120 T::Err: std::fmt::Display,
121{
122 std::env::var(env_var_name)
123 .ok()
124 .and_then(|val| match val.parse() {
125 Ok(value) => Some(value),
126 Err(e) => {
127 log::warn!(
128 "Failed to parse the environment variable {}='{}': {}, the default value is: {}.",
129 env_var_name,
130 val,
131 e,
132 default_val
133 );
134 None
135 }
136 })
137}
138
139pub fn get_default_batch_size() -> Option<usize> {
142 parse_env_var("LANCE_DEFAULT_BATCH_SIZE", &BATCH_SIZE_FALLBACK.to_string())
143}
144
/// Legacy fragment readahead value. In this part of the file it is only passed
/// to `parse_env_var` as the default named in the parse-failure warning for
/// `LANCE_DEFAULT_FRAGMENT_READAHEAD`.
pub const LEGACY_DEFAULT_FRAGMENT_READAHEAD: usize = 4;

/// Fragment readahead read from the `LANCE_DEFAULT_FRAGMENT_READAHEAD`
/// environment variable, evaluated lazily on first access.
///
/// `None` when the variable is unset or cannot be parsed as `usize`.
pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock<Option<usize>> = LazyLock::new(|| {
    parse_env_var(
        "LANCE_DEFAULT_FRAGMENT_READAHEAD",
        &LEGACY_DEFAULT_FRAGMENT_READAHEAD.to_string(),
    )
});

/// Value used for `DEFAULT_XTR_OVERFETCH` when `LANCE_XTR_OVERFETCH` is unset
/// or unparsable.
const DEFAULT_XTR_OVERFETCH_VALUE: u32 = 10;

/// XTR overfetch setting: the parsed value of the `LANCE_XTR_OVERFETCH`
/// environment variable, falling back to `DEFAULT_XTR_OVERFETCH_VALUE`.
/// Evaluated lazily on first access.
pub static DEFAULT_XTR_OVERFETCH: LazyLock<u32> = LazyLock::new(|| {
    parse_env_var(
        "LANCE_XTR_OVERFETCH",
        &DEFAULT_XTR_OVERFETCH_VALUE.to_string(),
    )
    .unwrap_or(DEFAULT_XTR_OVERFETCH_VALUE)
});

/// Value used for `DEFAULT_IO_BUFFER_SIZE` when `LANCE_DEFAULT_IO_BUFFER_SIZE`
/// is unset or unparsable (2 * 1024^3; presumably bytes, i.e. 2 GiB — confirm).
const DEFAULT_IO_BUFFER_SIZE_VALUE: u64 = 2 * 1024 * 1024 * 1024;

/// I/O buffer size: the parsed value of the `LANCE_DEFAULT_IO_BUFFER_SIZE`
/// environment variable, falling back to `DEFAULT_IO_BUFFER_SIZE_VALUE`.
/// Evaluated lazily on first access.
pub static DEFAULT_IO_BUFFER_SIZE: LazyLock<u64> = LazyLock::new(|| {
    parse_env_var(
        "LANCE_DEFAULT_IO_BUFFER_SIZE",
        &DEFAULT_IO_BUFFER_SIZE_VALUE.to_string(),
    )
    .unwrap_or(DEFAULT_IO_BUFFER_SIZE_VALUE)
});
176
177pub fn get_default_io_buffer_size_override() -> Option<u64> {
183 parse_env_var(
184 "LANCE_DEFAULT_IO_BUFFER_SIZE",
185 &DEFAULT_IO_BUFFER_SIZE_VALUE.to_string(),
186 )
187}
188
/// Sort specification for a single column: direction, null placement and the
/// column's name.
#[derive(Debug, Clone)]
pub struct ColumnOrdering {
    /// `true` for ascending order, `false` for descending.
    pub ascending: bool,
    /// `true` when nulls sort before non-null values.
    pub nulls_first: bool,
    /// Name of the column to order by.
    pub column_name: String,
}

impl ColumnOrdering {
    /// Shared constructor behind the four public convenience constructors.
    fn with_direction(column_name: String, ascending: bool, nulls_first: bool) -> Self {
        Self {
            ascending,
            nulls_first,
            column_name,
        }
    }

    /// Ascending order, nulls first.
    pub fn asc_nulls_first(column_name: String) -> Self {
        Self::with_direction(column_name, true, true)
    }

    /// Ascending order, nulls last.
    pub fn asc_nulls_last(column_name: String) -> Self {
        Self::with_direction(column_name, true, false)
    }

    /// Descending order, nulls first.
    pub fn desc_nulls_first(column_name: String) -> Self {
        Self::with_direction(column_name, false, true)
    }

    /// Descending order, nulls last.
    pub fn desc_nulls_last(column_name: String) -> Self {
        Self::with_direction(column_name, false, false)
    }
}
233
234#[derive(Clone)]
248pub enum MaterializationStyle {
249 Heuristic,
261 AllLate,
263 AllEarly,
265 AllEarlyExcept(Vec<u32>),
267}
268
269impl MaterializationStyle {
270 pub fn all_early_except(columns: &[impl AsRef<str>], schema: &Schema) -> Result<Self> {
271 let field_ids = schema
272 .project(columns)?
273 .field_ids()
274 .into_iter()
275 .map(|id| id as u32)
276 .collect();
277 Ok(Self::AllEarlyExcept(field_ids))
278 }
279}
280
/// Result of planning a filtered scan: the plan plus flags describing which
/// parts of the request were pushed into it.
#[derive(Debug)]
struct PlannedFilteredScan {
    /// The execution plan for the scan.
    plan: Arc<dyn ExecutionPlan>,
    /// Whether the limit was pushed down into `plan`
    /// (NOTE(review): producer is outside this part of the file — confirm).
    limit_pushed_down: bool,
    /// Whether the filter was pushed down into `plan`
    /// (NOTE(review): producer is outside this part of the file — confirm).
    filter_pushed_down: bool,
}
287
288pub struct FilterPlan {
289 query_filter: Option<QueryFilter>,
291 refine_query_filter: bool,
292 expr_filter_plan: ExprFilterPlan,
294}
295
296impl FilterPlan {
297 pub fn new(query_filter: Option<QueryFilter>, expr_filter_plan: ExprFilterPlan) -> Self {
298 Self {
299 query_filter,
300 refine_query_filter: false,
301 expr_filter_plan,
302 }
303 }
304
305 pub fn disable_refine(&mut self) {
306 self.expr_filter_plan = ExprFilterPlan::default();
307 self.refine_query_filter = false;
308 }
309
310 pub fn make_refine_only(&mut self) {
311 self.expr_filter_plan.make_refine_only();
312 self.refine_query_filter = true;
313 }
314
315 pub fn fts_filter(&self) -> Option<FullTextSearchQuery> {
316 match &self.query_filter {
317 Some(QueryFilter::Fts(query)) => Some(query.clone()),
318 _ => None,
319 }
320 }
321
322 pub fn vector_filter(&self) -> Option<Query> {
323 match &self.query_filter {
324 Some(QueryFilter::Vector(query)) => Some(query.clone()),
325 _ => None,
326 }
327 }
328
329 pub fn has_refine(&self) -> bool {
330 self.expr_filter_plan.has_refine() || self.refine_query_filter
331 }
332
333 pub async fn refine_columns(&self, dataset: &Arc<Dataset>) -> Result<Vec<String>> {
334 let mut columns = vec![];
335
336 if self.expr_filter_plan.has_refine() {
337 columns.extend(self.expr_filter_plan.refine_columns());
338 }
339
340 if self.refine_query_filter {
341 match &self.query_filter {
342 Some(QueryFilter::Fts(fts_query)) => {
343 let cols = if fts_query.columns().is_empty() {
344 let indexed_columns = fts_indexed_columns(dataset.clone()).await?;
345 let q = fill_fts_query_column(&fts_query.query, &indexed_columns, false)?;
346 q.columns()
347 } else {
348 fts_query.columns()
349 };
350
351 if let FtsQuery::Match(_) = &fts_query.query {
354 columns.extend(cols.iter().cloned().collect::<Vec<_>>());
355 }
356 }
357 Some(QueryFilter::Vector(vector_query)) => {
358 columns.push(vector_query.column.clone());
359 }
360 None => {}
361 }
362 }
363
364 Ok(columns)
365 }
366
367 pub async fn refine_filter(
368 &self,
369 input: Arc<dyn ExecutionPlan>,
370 scanner: &Scanner,
371 ) -> Result<Arc<dyn ExecutionPlan>> {
372 let mut plan = input;
373
374 if self.refine_query_filter {
375 match &self.query_filter {
376 Some(QueryFilter::Fts(fts_query)) => {
377 plan = scanner.flat_fts_filter(plan, fts_query).await?;
378 }
379 Some(QueryFilter::Vector(vector_query)) => {
380 plan = scanner.flat_knn(plan, vector_query)?;
381 }
382 None => {}
383 }
384 }
385
386 if let Some(refine_expr) = &self.expr_filter_plan.refine_expr {
387 plan = Arc::new(LanceFilterExec::try_new(refine_expr.clone(), plan)?);
390 }
391
392 Ok(plan)
393 }
394}
395
396#[derive(Debug, Clone, Default)]
397pub struct LanceFilter {
398 query_filter: Option<QueryFilter>,
399 expr_filter: Option<ExprFilter>,
400}
401
402impl LanceFilter {
403 pub fn is_none(&self) -> bool {
404 self.query_filter.is_none() && self.expr_filter.is_none()
405 }
406}
407
/// A search query used as a filter.
#[derive(Debug, Clone)]
pub enum QueryFilter {
    /// Filter by a full text search query.
    Fts(FullTextSearchQuery),
    /// Filter by a vector search query.
    Vector(Query),
}
414
415#[derive(Debug, Clone)]
417pub enum ExprFilter {
418 Sql(String),
420 Substrait(Vec<u8>),
422 Datafusion(Expr),
424}
425
426impl ExprFilter {
427 #[allow(unused)]
436 #[instrument(level = "trace", name = "filter_to_df", skip_all)]
437 pub fn to_datafusion(&self, dataset_schema: &Schema, full_schema: &Schema) -> Result<Expr> {
438 match self {
439 Self::Sql(sql) => {
440 let schema = Arc::new(ArrowSchema::from(full_schema));
441 let planner = Planner::new(schema.clone());
442 let filter = planner.parse_filter(sql)?;
443
444 let df_schema = DFSchema::try_from(schema)?;
445 let ret_field = filter.to_field(&df_schema)?.1;
446 let ret_type = ret_field.data_type();
447 if ret_type != &DataType::Boolean {
448 return Err(Error::invalid_input_source(
449 format!("The filter {} does not return a boolean", filter).into(),
450 ));
451 }
452
453 let optimized = planner.optimize_expr(filter).map_err(|e| {
454 Error::invalid_input(format!("Error optimizing sql filter: {sql} ({e})"))
455 })?;
456 Ok(optimized)
457 }
458 #[cfg(feature = "substrait")]
459 Self::Substrait(expr) => {
460 use lance_datafusion::exec::{LanceExecutionOptions, get_session_context};
461
462 let ctx = get_session_context(&LanceExecutionOptions::default());
463 let state = ctx.state();
464 let schema = Arc::new(ArrowSchema::from(dataset_schema));
465 let expr = parse_substrait(expr, schema.clone(), &ctx.state())
466 .now_or_never()
467 .expect("could not parse the Substrait filter in a synchronous fashion")?;
468 let planner = Planner::new(schema);
469 planner.optimize_expr(expr.clone()).map_err(|e| {
470 Error::invalid_input(format!(
471 "Error optimizing substrait filter: {expr:?} ({e})"
472 ))
473 })
474 }
475 #[cfg(not(feature = "substrait"))]
476 Self::Substrait(_) => Err(Error::not_supported_source(
477 "Substrait filter is not supported in this build".into(),
478 )),
479 Self::Datafusion(expr) => Ok(expr.clone()),
480 }
481 }
482}
483
484#[derive(Debug, Clone)]
486pub enum AggregateExpr {
487 #[cfg(feature = "substrait")]
488 Substrait(Vec<u8>),
489 Datafusion {
490 group_by: Vec<Expr>,
491 aggregates: Vec<Expr>,
492 },
493}
494
495impl AggregateExpr {
496 pub fn builder() -> AggregateExprBuilder<false> {
509 AggregateExprBuilder::new()
510 }
511
512 #[cfg(feature = "substrait")]
514 pub fn substrait(bytes: impl Into<Vec<u8>>) -> Self {
515 Self::Substrait(bytes.into())
516 }
517
518 pub fn datafusion(group_by: Vec<Expr>, aggregates: Vec<Expr>) -> Self {
521 Self::Datafusion {
522 group_by,
523 aggregates,
524 }
525 }
526
527 fn parse(self, #[allow(unused_variables)] schema: Arc<ArrowSchema>) -> Result<Aggregate> {
534 match self {
535 #[cfg(feature = "substrait")]
536 Self::Substrait(bytes) => {
537 use lance_datafusion::exec::{LanceExecutionOptions, get_session_context};
538 use lance_datafusion::substrait::parse_substrait_aggregate;
539
540 let ctx = get_session_context(&LanceExecutionOptions::default());
541 parse_substrait_aggregate(&bytes, schema, &ctx.state())
542 .now_or_never()
543 .expect("could not parse the Substrait aggregate in a synchronous fashion")
544 }
545 Self::Datafusion {
546 group_by,
547 aggregates,
548 } => Ok(Aggregate::new(group_by, aggregates)),
549 }
550 }
551}
552
553#[derive(Debug, Clone)]
558pub struct AggregateExprBuilder<const HAS_PENDING: bool> {
559 group_by: Vec<Expr>,
560 aggregates: Vec<Expr>,
561}
562
563impl Default for AggregateExprBuilder<false> {
564 fn default() -> Self {
565 Self {
566 group_by: Vec::new(),
567 aggregates: Vec::new(),
568 }
569 }
570}
571
572impl AggregateExprBuilder<false> {
573 pub fn new() -> Self {
575 Self::default()
576 }
577
578 pub fn build(self) -> AggregateExpr {
580 AggregateExpr::Datafusion {
581 group_by: self.group_by,
582 aggregates: self.aggregates,
583 }
584 }
585}
586
587impl<const HAS_PENDING: bool> AggregateExprBuilder<HAS_PENDING> {
588 pub fn group_by(mut self, column: impl Into<String>) -> AggregateExprBuilder<false> {
593 self.group_by.push(col(column.into()));
594 AggregateExprBuilder {
595 group_by: self.group_by,
596 aggregates: self.aggregates,
597 }
598 }
599
600 pub fn group_by_columns(
605 mut self,
606 columns: impl IntoIterator<Item = impl Into<String>>,
607 ) -> AggregateExprBuilder<false> {
608 for column in columns {
609 self.group_by.push(col(column.into()));
610 }
611 AggregateExprBuilder {
612 group_by: self.group_by,
613 aggregates: self.aggregates,
614 }
615 }
616
617 pub fn count_star(mut self) -> AggregateExprBuilder<true> {
619 self.aggregates
620 .push(functions_aggregate::count::count(lit(1)));
621 AggregateExprBuilder {
622 group_by: self.group_by,
623 aggregates: self.aggregates,
624 }
625 }
626
627 pub fn count(mut self, column: impl Into<String>) -> AggregateExprBuilder<true> {
632 self.aggregates
633 .push(functions_aggregate::count::count(col(column.into())));
634 AggregateExprBuilder {
635 group_by: self.group_by,
636 aggregates: self.aggregates,
637 }
638 }
639
640 pub fn sum(mut self, column: impl Into<String>) -> AggregateExprBuilder<true> {
642 self.aggregates
643 .push(functions_aggregate::sum::sum(col(column.into())));
644 AggregateExprBuilder {
645 group_by: self.group_by,
646 aggregates: self.aggregates,
647 }
648 }
649
650 pub fn avg(mut self, column: impl Into<String>) -> AggregateExprBuilder<true> {
652 self.aggregates
653 .push(functions_aggregate::average::avg(col(column.into())));
654 AggregateExprBuilder {
655 group_by: self.group_by,
656 aggregates: self.aggregates,
657 }
658 }
659
660 pub fn min(mut self, column: impl Into<String>) -> AggregateExprBuilder<true> {
662 self.aggregates
663 .push(functions_aggregate::min_max::min(col(column.into())));
664 AggregateExprBuilder {
665 group_by: self.group_by,
666 aggregates: self.aggregates,
667 }
668 }
669
670 pub fn max(mut self, column: impl Into<String>) -> AggregateExprBuilder<true> {
672 self.aggregates
673 .push(functions_aggregate::min_max::max(col(column.into())));
674 AggregateExprBuilder {
675 group_by: self.group_by,
676 aggregates: self.aggregates,
677 }
678 }
679}
680
681impl AggregateExprBuilder<true> {
682 pub fn alias(mut self, name: impl Into<String>) -> AggregateExprBuilder<false> {
684 let pending = self.aggregates.pop().expect("pending aggregate must exist");
685 self.aggregates.push(pending.alias(name.into()));
686 AggregateExprBuilder {
687 group_by: self.group_by,
688 aggregates: self.aggregates,
689 }
690 }
691
692 pub fn build(self) -> AggregateExpr {
694 AggregateExpr::Datafusion {
695 group_by: self.group_by,
696 aggregates: self.aggregates,
697 }
698 }
699}
700
/// Configurable dataset scanner.
///
/// Built with `Scanner::new` (or `Scanner::from_fragment`) and then configured
/// through `&mut self` setter methods. The defaults mentioned below are the
/// ones assigned in `Scanner::new`.
#[derive(Clone)]
pub struct Scanner {
    /// Dataset being scanned.
    dataset: Arc<Dataset>,

    /// Current projection; starts as the full dataset projection and is
    /// replaced by `project` / `project_with_transform`.
    projection_plan: ProjectionPlan,
    /// Blob handling applied to the physical projection (see `apply_blob_handling`).
    blob_handling: BlobHandling,

    /// Set by `prefilter`; `nearest` rejects fragment scans unless this is `true`.
    prefilter: bool,

    /// Set by `materialization_style`; defaults to `Heuristic`.
    materialization_style: MaterializationStyle,

    /// Query and expression filters.
    filter: LanceFilter,

    /// Full text search query set by `full_text_search`.
    full_text_query: Option<FullTextSearchQuery>,

    /// Explicit batch size set by `batch_size`; see `get_batch_size` for how it is resolved.
    batch_size: Option<usize>,

    /// Set by `batch_size_bytes`.
    batch_size_bytes: Option<u64>,

    /// Set by `batch_readahead`; defaults to `get_num_compute_intensive_cpus()`.
    batch_readahead: usize,

    /// Set by `fragment_readahead`.
    fragment_readahead: Option<usize>,

    /// Set by `io_buffer_size`.
    io_buffer_size: Option<u64>,

    /// Limit and offset set by `limit`; both are validated to be non-negative.
    limit: Option<i64>,
    offset: Option<i64>,

    /// Requested output ordering, if any.
    ordering: Option<Vec<ColumnOrdering>>,

    /// Vector search query set by `nearest`.
    nearest: Option<Query>,

    /// Set by `use_scalar_index`; defaults to `true`.
    use_scalar_index: bool,

    /// Defaults to `true`.
    use_stats: bool,

    /// Set by `scan_in_order`; defaults to `true`.
    ordered: bool,

    /// When `Some`, the scan is a fragment scan (see `is_fragment_scan`).
    fragments: Option<Vec<Fragment>>,

    /// Non-empty list of index segments set by `with_index_segments`.
    index_segments: Option<Vec<Uuid>>,

    /// Defaults to `false`.
    fast_search: bool,

    /// Set to `true` by `include_deleted_rows`; defaults to `false`.
    include_deleted_rows: bool,

    /// Callback set by `scan_stats_callback`.
    scan_stats_callback: Option<ExecutionStatsCallback>,

    /// Set by `strict_batch_size`; defaults to `false`.
    strict_batch_size: bool,

    /// Copied from the dataset's `file_reader_options` at construction.
    file_reader_options: Option<FileReaderOptions>,

    /// Parsed aggregate set by `aggregate`.
    aggregate: Option<Aggregate>,

    /// When `true`, `project_with_transform` adds the row id to the new projection.
    legacy_with_row_id: bool,
    /// When `true`, `project_with_transform` adds the row address to the new projection.
    legacy_with_row_addr: bool,
    /// Set to `true` once `project_with_transform` has been called.
    explicit_projection: bool,
    /// Defaults to `true`.
    autoproject_scoring_columns: bool,
}
847
848#[derive(Debug, Clone)]
850pub enum TakeOperation {
851 RowIds(Vec<u64>),
853 RowAddrs(Vec<u64>),
855 RowOffsets(Vec<u64>),
860}
861
862impl TakeOperation {
863 fn extract_u64_list(list: &[Expr]) -> Option<Vec<u64>> {
864 let mut u64s = Vec::with_capacity(list.len());
865 for expr in list {
866 if let Expr::Literal(lit, _) = expr {
867 if let Some(ScalarValue::UInt64(Some(val))) =
868 safe_coerce_scalar(lit, &DataType::UInt64)
869 {
870 u64s.push(val);
871 } else {
872 return None;
873 }
874 } else {
875 return None;
876 }
877 }
878 Some(u64s)
879 }
880
881 fn merge(self, other: Self) -> Option<Self> {
882 match (self, other) {
883 (Self::RowIds(mut left), Self::RowIds(right)) => {
884 left.extend(right);
885 Some(Self::RowIds(left))
886 }
887 (Self::RowAddrs(mut left), Self::RowAddrs(right)) => {
888 left.extend(right);
889 Some(Self::RowAddrs(left))
890 }
891 (Self::RowOffsets(mut left), Self::RowOffsets(right)) => {
892 left.extend(right);
893 Some(Self::RowOffsets(left))
894 }
895 _ => None,
896 }
897 }
898
899 fn try_from_expr(expr: &Expr) -> Option<(Self, Option<Expr>)> {
917 if let Expr::BinaryExpr(binary) = expr {
918 match binary.op {
919 datafusion_expr::Operator::And => {
920 let left_take = Self::try_from_expr(&binary.left);
921 let right_take = Self::try_from_expr(&binary.right);
922 match (left_take, right_take) {
923 (Some(_), Some(_)) => {
924 return None;
930 }
931 (Some((left_op, left_rem)), None) => {
932 let remainder = match left_rem {
933 Some(expr) => Expr::and(expr, binary.right.as_ref().clone()),
937 None => binary.right.as_ref().clone(),
938 };
939 return Some((left_op, Some(remainder)));
940 }
941 (None, Some((right_op, right_rem))) => {
942 let remainder = match right_rem {
943 Some(expr) => Expr::and(expr, binary.left.as_ref().clone()),
944 None => binary.left.as_ref().clone(),
945 };
946 return Some((right_op, Some(remainder)));
947 }
948 (None, None) => {
949 return None;
950 }
951 }
952 }
953 datafusion_expr::Operator::Eq => {
954 if let (Expr::Column(col), Expr::Literal(lit, _)) =
956 (binary.left.as_ref(), binary.right.as_ref())
957 && let Some(ScalarValue::UInt64(Some(val))) =
958 safe_coerce_scalar(lit, &DataType::UInt64)
959 {
960 if col.name == ROW_ID {
961 return Some((Self::RowIds(vec![val]), None));
962 } else if col.name == ROW_ADDR {
963 return Some((Self::RowAddrs(vec![val]), None));
964 } else if col.name == ROW_OFFSET {
965 return Some((Self::RowOffsets(vec![val]), None));
966 }
967 }
968 }
969 datafusion_expr::Operator::Or => {
970 let left_take = Self::try_from_expr(&binary.left);
971 let right_take = Self::try_from_expr(&binary.right);
972 if let (Some(left), Some(right)) = (left_take, right_take) {
973 if left.1.is_some() || right.1.is_some() {
974 return None;
981 }
982 return left.0.merge(right.0).map(|op| (op, None));
983 }
984 }
985 _ => {}
986 }
987 } else if let Expr::InList(in_expr) = expr
988 && let Expr::Column(col) = in_expr.expr.as_ref()
989 && let Some(u64s) = Self::extract_u64_list(&in_expr.list)
990 {
991 if col.name == ROW_ID {
992 return Some((Self::RowIds(u64s), None));
993 } else if col.name == ROW_ADDR {
994 return Some((Self::RowAddrs(u64s), None));
995 } else if col.name == ROW_OFFSET {
996 return Some((Self::RowOffsets(u64s), None));
997 }
998 }
999 None
1000 }
1001}
1002
1003impl Scanner {
1004 pub fn new(dataset: Arc<Dataset>) -> Self {
1005 let projection_plan = ProjectionPlan::full(dataset.clone()).unwrap();
1006 let file_reader_options = dataset.file_reader_options.clone();
1007 let mut scanner = Self {
1008 dataset,
1009 projection_plan,
1010 blob_handling: BlobHandling::default(),
1011 prefilter: false,
1012 materialization_style: MaterializationStyle::Heuristic,
1013 filter: LanceFilter::default(),
1014 full_text_query: None,
1015 batch_size: None,
1016 batch_size_bytes: None,
1017 batch_readahead: get_num_compute_intensive_cpus(),
1018 fragment_readahead: None,
1019 io_buffer_size: None,
1020 limit: None,
1021 offset: None,
1022 ordering: None,
1023 nearest: None,
1024 use_stats: true,
1025 ordered: true,
1026 fragments: None,
1027 index_segments: None,
1028 fast_search: false,
1029 use_scalar_index: true,
1030 include_deleted_rows: false,
1031 scan_stats_callback: None,
1032 strict_batch_size: false,
1033 file_reader_options,
1034 aggregate: None,
1035 legacy_with_row_addr: false,
1036 legacy_with_row_id: false,
1037 explicit_projection: false,
1038 autoproject_scoring_columns: true,
1039 };
1040 scanner.apply_blob_handling();
1041 scanner
1042 }
1043
1044 fn apply_blob_handling(&mut self) {
1045 let projection = self
1046 .projection_plan
1047 .physical_projection
1048 .clone()
1049 .with_blob_handling(self.blob_handling.clone());
1050 self.projection_plan.physical_projection = projection;
1051 }
1052
/// Sets how blob columns are handled and immediately applies the setting to
/// the current physical projection.
pub fn blob_handling(&mut self, blob_handling: BlobHandling) -> &mut Self {
    self.blob_handling = blob_handling;
    self.apply_blob_handling();
    self
}
1058
1059 pub fn from_fragment(dataset: Arc<Dataset>, fragment: Fragment) -> Self {
1060 Self {
1061 fragments: Some(vec![fragment]),
1062 ..Self::new(dataset)
1063 }
1064 }
1065
/// Restricts the scan to `fragments`, replacing any previous fragment list.
///
/// After this call the scanner counts as a fragment scan (see
/// `is_fragment_scan`), even if `fragments` is empty.
pub fn with_fragments(&mut self, fragments: Vec<Fragment>) -> &mut Self {
    self.fragments = Some(fragments);
    self
}
1073
1074 pub fn with_index_segments(&mut self, segments: Vec<Uuid>) -> Result<&mut Self> {
1082 if segments.is_empty() {
1083 return Err(Error::invalid_input(
1084 "with_index_segments does not accept an empty segment list".to_string(),
1085 ));
1086 }
1087 self.index_segments = Some(segments);
1088 Ok(self)
1089 }
1090
1091 fn get_batch_size(&self) -> usize {
1092 get_default_batch_size().unwrap_or_else(|| {
1098 self.batch_size.unwrap_or_else(|| {
1099 std::cmp::max(
1100 self.dataset.object_store.as_ref().block_size() / 4,
1101 BATCH_SIZE_FALLBACK,
1102 )
1103 })
1104 })
1105 }
1106
1107 fn ensure_not_fragment_scan(&self) -> Result<()> {
1108 if self.is_fragment_scan() {
1109 Err(Error::not_supported(
1110 "This operation is not supported for fragment scan".to_string(),
1111 ))
1112 } else {
1113 Ok(())
1114 }
1115 }
1116
/// `true` when an explicit fragment list has been set (via `from_fragment` or
/// `with_fragments`), regardless of how many fragments it contains.
fn is_fragment_scan(&self) -> bool {
    self.fragments.is_some()
}
1120
/// Projects no columns at all; equivalent to calling `project` with an empty
/// column list.
pub fn empty_project(&mut self) -> Result<&mut Self> {
    self.project(&[] as &[&str])
}
1127
1128 pub fn project<T: AsRef<str>>(&mut self, columns: &[T]) -> Result<&mut Self> {
1132 let transformed_columns: Vec<(&str, String)> = columns
1133 .iter()
1134 .map(|c| (c.as_ref(), escape_field_path_for_project(c.as_ref())))
1135 .collect();
1136
1137 self.project_with_transform(&transformed_columns)
1138 }
1139
1140 pub fn project_with_transform(
1144 &mut self,
1145 columns: &[(impl AsRef<str>, impl AsRef<str>)],
1146 ) -> Result<&mut Self> {
1147 self.explicit_projection = true;
1148 self.projection_plan = ProjectionPlan::from_expressions(self.dataset.clone(), columns)?;
1149 if self.legacy_with_row_id {
1150 self.projection_plan.include_row_id();
1151 }
1152 if self.legacy_with_row_addr {
1153 self.projection_plan.include_row_addr();
1154 }
1155 self.apply_blob_handling();
1156 Ok(self)
1157 }
1158
/// Sets the prefilter flag.
///
/// `nearest` only allows a vector search on a fragment scan when this flag
/// is `true`.
/// NOTE(review): presumably this also controls whether the filter runs before
/// the vector search — the consuming code is not in this part of the file.
pub fn prefilter(&mut self, should_prefilter: bool) -> &mut Self {
    self.prefilter = should_prefilter;
    self
}
1171
/// Registers `callback` as the execution stats callback, replacing any
/// previously registered one.
pub fn scan_stats_callback(&mut self, callback: ExecutionStatsCallback) -> &mut Self {
    self.scan_stats_callback = Some(callback);
    self
}
1177
/// Sets the materialization style (the default from `Scanner::new` is
/// `MaterializationStyle::Heuristic`).
pub fn materialization_style(&mut self, style: MaterializationStyle) -> &mut Self {
    self.materialization_style = style;
    self
}
1193
/// Sets a SQL expression filter, replacing any previously set expression
/// filter (SQL, Substrait or DataFusion).
///
/// The string is stored as-is and not parsed here, so this method currently
/// always returns `Ok`.
pub fn filter(&mut self, filter: &str) -> Result<&mut Self> {
    self.filter.expr_filter = Some(ExprFilter::Sql(filter.to_string()));
    Ok(self)
}
1213
/// Sets a query filter (full text or vector), replacing any previous one.
///
/// No validation is performed here, so this method currently always returns `Ok`.
pub fn filter_query(&mut self, filter: QueryFilter) -> Result<&mut Self> {
    self.filter.query_filter = Some(filter);
    Ok(self)
}
1236
1237 pub fn full_text_search(&mut self, query: FullTextSearchQuery) -> Result<&mut Self> {
1251 let fields = query.columns();
1252 if !fields.is_empty() {
1253 for field in fields.iter() {
1254 if self.dataset.schema().field(field).is_none() {
1255 return Err(Error::invalid_input(format!("Column {} not found", field)));
1256 }
1257 }
1258 }
1259
1260 self.full_text_query = Some(query);
1261 Ok(self)
1262 }
1263
/// Sets a Substrait expression filter from its serialized bytes, replacing any
/// previously set expression filter.
///
/// The bytes are copied and not decoded here, so this method currently always
/// returns `Ok`.
pub fn filter_substrait(&mut self, filter: &[u8]) -> Result<&mut Self> {
    self.filter.expr_filter = Some(ExprFilter::Substrait(filter.to_vec()));
    Ok(self)
}
1272
/// Sets a DataFusion expression as the expression filter, replacing any
/// previously set expression filter.
pub fn filter_expr(&mut self, filter: Expr) -> &mut Self {
    self.filter.expr_filter = Some(ExprFilter::Datafusion(filter));
    self
}
1277
1278 pub fn aggregate(&mut self, aggregate: AggregateExpr) -> Result<&mut Self> {
1283 let schema: Arc<ArrowSchema> = Arc::new(self.dataset.schema().into());
1284 let parsed = aggregate.parse(schema)?;
1285 self.aggregate = Some(parsed);
1286 Ok(self)
1287 }
1288
/// Sets the batch size.
///
/// Note that `get_batch_size` consults the `LANCE_DEFAULT_BATCH_SIZE`
/// environment variable first; when that variable is set and parsable it is
/// used instead of this value.
pub fn batch_size(&mut self, batch_size: usize) -> &mut Self {
    self.batch_size = Some(batch_size);
    self
}
1298
/// Sets the batch size expressed in bytes (as the parameter name indicates).
pub fn batch_size_bytes(&mut self, batch_size_bytes: u64) -> &mut Self {
    self.batch_size_bytes = Some(batch_size_bytes);
    self
}
1311
/// Turns on the `include_deleted_rows` flag. There is no argument, so this
/// method can only enable the flag, not disable it again.
pub fn include_deleted_rows(&mut self) -> &mut Self {
    self.include_deleted_rows = true;
    self
}
1326
/// Sets the I/O buffer size for this scan (presumably in bytes — confirm).
pub fn io_buffer_size(&mut self, size: u64) -> &mut Self {
    self.io_buffer_size = Some(size);
    self
}
1349
/// Sets the batch readahead to `nbatches` (the default from `Scanner::new` is
/// `get_num_compute_intensive_cpus()`).
pub fn batch_readahead(&mut self, nbatches: usize) -> &mut Self {
    self.batch_readahead = nbatches;
    self
}
1356
/// Sets the fragment readahead to `nfragments` (unset by default).
pub fn fragment_readahead(&mut self, nfragments: usize) -> &mut Self {
    self.fragment_readahead = Some(nfragments);
    self
}
1364
/// Sets whether the scan is ordered (the default from `Scanner::new` is `true`).
pub fn scan_in_order(&mut self, ordered: bool) -> &mut Self {
    self.ordered = ordered;
    self
}
1382
/// Sets whether scalar indices may be used (the default from `Scanner::new`
/// is `true`).
pub fn use_scalar_index(&mut self, use_scalar_index: bool) -> &mut Self {
    self.use_scalar_index = use_scalar_index;
    self
}
1392
/// Sets the strict batch size flag (the default from `Scanner::new` is `false`).
pub fn strict_batch_size(&mut self, strict_batch_size: bool) -> &mut Self {
    self.strict_batch_size = strict_batch_size;
    self
}
1403
1404 pub fn limit(&mut self, limit: Option<i64>, offset: Option<i64>) -> Result<&mut Self> {
1411 if limit.unwrap_or_default() < 0 {
1412 return Err(Error::invalid_input(
1413 "Limit must be non-negative".to_string(),
1414 ));
1415 }
1416 if let Some(off) = offset
1417 && off < 0
1418 {
1419 return Err(Error::invalid_input(
1420 "Offset must be non-negative".to_string(),
1421 ));
1422 }
1423 self.limit = limit;
1424 self.offset = offset;
1425 Ok(self)
1426 }
1427
1428 pub fn nearest(&mut self, column: &str, q: &dyn Array, k: usize) -> Result<&mut Self> {
1432 if !self.prefilter {
1433 self.ensure_not_fragment_scan()?;
1436 }
1437
1438 if k == 0 {
1439 return Err(Error::invalid_input("k must be positive".to_string()));
1440 }
1441 if q.is_empty() {
1442 return Err(Error::invalid_input(
1443 "Query vector must have non-zero length".to_string(),
1444 ));
1445 }
1446 let (vector_type, element_type) = get_vector_type(self.dataset.schema(), column)?;
1448 let dim = get_vector_dim(self.dataset.schema(), column)?;
1449
1450 let q = match q.data_type() {
1451 DataType::List(_) | DataType::FixedSizeList(_, _) => {
1452 if !matches!(vector_type, DataType::List(_)) {
1453 return Err(Error::invalid_input(format!(
1454 "Query is multivector but column {}({})is not multivector",
1455 column, vector_type,
1456 )));
1457 }
1458
1459 if let Some(list_array) = q.as_list_opt::<i32>() {
1460 for i in 0..list_array.len() {
1461 let vec = list_array.value(i);
1462 if vec.len() != dim {
1463 return Err(Error::invalid_input(format!(
1464 "query dim({}) doesn't match the column {} vector dim({})",
1465 vec.len(),
1466 column,
1467 dim,
1468 )));
1469 }
1470 }
1471 list_array.values().clone()
1472 } else {
1473 let fsl = q.as_fixed_size_list();
1474 if fsl.value_length() as usize != dim {
1475 return Err(Error::invalid_input(format!(
1476 "query dim({}) doesn't match the column {} vector dim({})",
1477 fsl.value_length(),
1478 column,
1479 dim,
1480 )));
1481 }
1482 fsl.values().clone()
1483 }
1484 }
1485 _ => {
1486 if q.len() != dim {
1487 return Err(Error::invalid_input(format!(
1488 "query dim({}) doesn't match the column {} vector dim({})",
1489 q.len(),
1490 column,
1491 dim,
1492 )));
1493 }
1494 q.slice(0, q.len())
1495 }
1496 };
1497
1498 let key = match &element_type {
1499 dt if dt == q.data_type() => q,
1500 dt if dt.is_floating() => coerce_float_vector(
1501 q.as_any().downcast_ref::<Float32Array>().unwrap(),
1502 FloatType::try_from(dt)?,
1503 )?,
1504 _ => {
1505 return Err(Error::invalid_input(format!(
1506 "Column {} has element type {} and the query vector is {}",
1507 column,
1508 element_type,
1509 q.data_type(),
1510 )));
1511 }
1512 };
1513
1514 self.nearest = Some(Query {
1515 column: column.to_string(),
1516 key,
1517 k,
1518 lower_bound: None,
1519 upper_bound: None,
1520 minimum_nprobes: 1,
1521 maximum_nprobes: None,
1522 ef: None,
1523 refine_factor: None,
1524 metric_type: None,
1525 use_index: true,
1526 query_parallelism: DEFAULT_QUERY_PARALLELISM,
1527 dist_q_c: 0.0,
1528 });
1529 Ok(self)
1530 }
1531
/// Test-only accessor for the configured nearest-neighbor query.
///
/// Returns a mutable reference to `self.nearest`, or `None` when no nearest
/// query has been set.
#[cfg(test)]
fn nearest_mut(&mut self) -> Option<&mut Query> {
    self.nearest.as_mut()
}
1536
1537 pub fn distance_range(
1539 &mut self,
1540 lower_bound: Option<f32>,
1541 upper_bound: Option<f32>,
1542 ) -> &mut Self {
1543 if let Some(q) = self.nearest.as_mut() {
1544 q.lower_bound = lower_bound;
1545 q.upper_bound = upper_bound;
1546 }
1547 self
1548 }
1549
1550 pub fn nprobes(&mut self, n: usize) -> &mut Self {
1555 if let Some(q) = self.nearest.as_mut() {
1556 q.minimum_nprobes = n;
1557 q.maximum_nprobes = Some(n);
1558 } else {
1559 log::warn!("nprobes is not set because nearest has not been called yet");
1560 }
1561 self
1562 }
1563
1564 #[deprecated(note = "Use nprobes instead")]
1569 pub fn nprobs(&mut self, n: usize) -> &mut Self {
1570 if let Some(q) = self.nearest.as_mut() {
1571 q.minimum_nprobes = n;
1572 q.maximum_nprobes = Some(n);
1573 } else {
1574 log::warn!("nprobes is not set because nearest has not been called yet");
1575 }
1576 self
1577 }
1578
1579 pub fn minimum_nprobes(&mut self, n: usize) -> &mut Self {
1587 if let Some(q) = self.nearest.as_mut() {
1588 q.minimum_nprobes = n;
1589 } else {
1590 log::warn!("minimum_nprobes is not set because nearest has not been called yet");
1591 }
1592 self
1593 }
1594
1595 pub fn maximum_nprobes(&mut self, n: usize) -> &mut Self {
1607 if let Some(q) = self.nearest.as_mut() {
1608 q.maximum_nprobes = Some(n);
1609 } else {
1610 log::warn!("maximum_nprobes is not set because nearest has not been called yet");
1611 }
1612 self
1613 }
1614
1615 pub fn ef(&mut self, ef: usize) -> &mut Self {
1616 if let Some(q) = self.nearest.as_mut() {
1617 q.ef = Some(ef);
1618 }
1619 self
1620 }
1621
1622 pub fn fast_search(&mut self) -> &mut Self {
1628 if let Some(q) = self.nearest.as_mut() {
1629 q.use_index = true;
1630 }
1631 self.fast_search = true;
1632 self.projection_plan.include_row_id(); self
1634 }
1635
1636 pub fn refine(&mut self, factor: u32) -> &mut Self {
1646 if let Some(q) = self.nearest.as_mut() {
1647 q.refine_factor = Some(factor)
1648 };
1649 self
1650 }
1651
1652 pub fn distance_metric(&mut self, metric_type: MetricType) -> &mut Self {
1654 if let Some(q) = self.nearest.as_mut() {
1655 q.metric_type = Some(metric_type)
1656 }
1657 self
1658 }
1659
1660 pub fn order_by(&mut self, ordering: Option<Vec<ColumnOrdering>>) -> Result<&mut Self> {
1666 if let Some(ordering) = &ordering {
1667 if ordering.is_empty() {
1668 self.ordering = None;
1669 return Ok(self);
1670 }
1671 for column in ordering {
1673 self.dataset
1674 .schema()
1675 .field(&column.column_name)
1676 .ok_or(Error::invalid_input(format!(
1677 "Column {} not found",
1678 &column.column_name
1679 )))?;
1680 }
1681 }
1682 self.ordering = ordering;
1683 Ok(self)
1684 }
1685
1686 pub fn use_index(&mut self, use_index: bool) -> &mut Self {
1688 if let Some(q) = self.nearest.as_mut() {
1689 q.use_index = use_index
1690 }
1691 self
1692 }
1693
1694 pub fn query_parallelism(&mut self, query_parallelism: i32) -> &mut Self {
1704 if let Some(q) = self.nearest.as_mut() {
1705 q.query_parallelism = query_parallelism;
1706 } else {
1707 log::warn!("query_parallelism is not set because nearest has not been called yet");
1708 }
1709 self
1710 }
1711
/// Requests the row id column in the scan output.
///
/// Sets the `legacy_with_row_id` flag and asks the projection plan to include
/// the row id.
pub fn with_row_id(&mut self) -> &mut Self {
    self.legacy_with_row_id = true;
    self.projection_plan.include_row_id();
    self
}
1718
/// Requests the row address column in the scan output.
///
/// Sets the `legacy_with_row_addr` flag and asks the projection plan to
/// include the row address.
pub fn with_row_address(&mut self) -> &mut Self {
    self.legacy_with_row_addr = true;
    self.projection_plan.include_row_addr();
    self
}
1725
/// Turns off automatic projection of scoring columns.
///
/// Clears the `autoproject_scoring_columns` flag, which
/// `calculate_final_projection` consults before appending the distance and
/// score columns to the output.
pub fn disable_scoring_autoprojection(&mut self) -> &mut Self {
    self.autoproject_scoring_columns = false;
    self
}
1745
/// Stores scanner-level file reader options, replacing any previously set.
pub fn with_file_reader_options(&mut self, options: FileReaderOptions) -> &mut Self {
    self.file_reader_options = Some(options);
    self
}
1751
1752 fn resolved_file_reader_options(&self) -> Option<FileReaderOptions> {
1756 let base = self
1757 .file_reader_options
1758 .clone()
1759 .or_else(|| self.dataset.file_reader_options.clone());
1760 match (base, self.batch_size_bytes) {
1761 (Some(mut opts), Some(bsb)) => {
1762 if opts.batch_size_bytes.is_none() {
1763 opts.batch_size_bytes = Some(bsb);
1764 }
1765 Some(opts)
1766 }
1767 (Some(opts), None) => Some(opts),
1768 (None, Some(bsb)) => Some(FileReaderOptions {
1769 batch_size_bytes: Some(bsb),
1770 ..Default::default()
1771 }),
1772 (None, None) => None,
1773 }
1774 }
1775
1776 fn create_column_expr(
1778 column_name: &str,
1779 dataset: &Dataset,
1780 arrow_schema: &ArrowSchema,
1781 ) -> Result<Arc<dyn PhysicalExpr>> {
1782 let lance_schema = dataset.schema();
1783 let field_path = lance_schema
1784 .resolve_case_insensitive(column_name)
1785 .ok_or_else(|| {
1786 Error::invalid_input(format!("Field '{}' not found in schema", column_name))
1787 })?;
1788
1789 if field_path.len() == 1 {
1790 expressions::col(&field_path[0].name, arrow_schema).map_err(|e| {
1792 Error::internal(format!(
1793 "Failed to create column expression for '{}': {}",
1794 column_name, e
1795 ))
1796 })
1797 } else {
1798 let get_field_func = ScalarUDF::from(GetFieldFunc::default());
1800
1801 let mut expr = Expr::Column(datafusion::common::Column::new_unqualified(
1804 &field_path[0].name,
1805 ));
1806 for nested_field in &field_path[1..] {
1807 expr = get_field_func.call(vec![expr, lit(&nested_field.name)]);
1808 }
1809
1810 let df_schema = Arc::new(DFSchema::try_from(arrow_schema.clone())?);
1812 let execution_props = ExecutionProps::new().with_query_execution_start_time(Utc::now());
1813 create_physical_expr(&expr, &df_schema, &execution_props).map_err(|e| {
1814 Error::internal(format!(
1815 "Failed to create physical expression for nested field '{}': {}",
1816 column_name, e
1817 ))
1818 })
1819 }
1820 }
1821
/// Sets the `use_stats` flag on this scanner.
///
/// `legacy_filtered_read` requires this flag to be set before it chooses the
/// pushdown scan.
pub fn use_stats(&mut self, use_stats: bool) -> &mut Self {
    self.use_stats = use_stats;
    self
}
1829
1830 pub async fn schema(&self) -> Result<SchemaRef> {
1832 let plan = self.create_plan().await?;
1833 Ok(plan.schema())
1834 }
1835
1836 pub fn get_expr_filter(&self) -> Result<Option<Expr>> {
1843 if let Some(filter) = &self.filter.expr_filter {
1844 let filter_schema = self.filterable_schema()?;
1845 Ok(Some(filter.to_datafusion(
1846 self.dataset.schema(),
1847 filter_schema.as_ref(),
1848 )?))
1849 } else {
1850 Ok(None)
1851 }
1852 }
1853
1854 fn add_extra_columns(&self, schema: Schema) -> Result<Schema> {
1855 let mut extra_columns = vec![ArrowField::new(ROW_OFFSET, DataType::UInt64, true)];
1856
1857 if self.nearest.as_ref().is_some() {
1858 extra_columns.push(ArrowField::new(DIST_COL, DataType::Float32, true));
1859 };
1860
1861 if self.full_text_query.is_some() {
1862 extra_columns.push(ArrowField::new(SCORE_COL, DataType::Float32, true));
1863 }
1864
1865 schema.merge(&ArrowSchema::new(extra_columns))
1866 }
1867
1868 fn filterable_schema(&self) -> Result<Arc<Schema>> {
1873 let base_schema = Projection::full(self.dataset.clone())
1874 .with_row_id()
1875 .with_row_addr()
1876 .with_row_last_updated_at_version()
1877 .with_row_created_at_version()
1878 .to_schema();
1879
1880 Ok(Arc::new(self.add_extra_columns(base_schema)?))
1881 }
1882
1883 pub(crate) fn calculate_final_projection(
1888 &self,
1889 current_schema: &ArrowSchema,
1890 ) -> Result<Vec<(Arc<dyn PhysicalExpr>, String)>> {
1891 let mut output_expr = self.projection_plan.to_physical_exprs(current_schema)?;
1894
1895 if self.autoproject_scoring_columns {
1898 if self.nearest.is_some() && output_expr.iter().all(|(_, name)| name != DIST_COL) {
1899 if self.explicit_projection {
1900 log::warn!(
1901 "Deprecation warning, this behavior will change in the future. This search specified output columns but did not include `_distance`. Currently the `_distance` column will be included. In the future it will not. Call `disable_scoring_autoprojection` to adopt the future behavior and avoid this warning"
1902 );
1903 }
1904 let vector_expr = expressions::col(DIST_COL, current_schema)?;
1905 output_expr.push((vector_expr, DIST_COL.to_string()));
1906 }
1907 if self.full_text_query.is_some()
1908 && output_expr.iter().all(|(_, name)| name != SCORE_COL)
1909 {
1910 if self.explicit_projection {
1911 log::warn!(
1912 "Deprecation warning, this behavior will change in the future. This search specified output columns but did not include `_score`. Currently the `_score` column will be included. In the future it will not. Call `disable_scoring_autoprojection` to adopt the future behavior and avoid this warning"
1913 );
1914 }
1915 let score_expr = expressions::col(SCORE_COL, current_schema)?;
1916 output_expr.push((score_expr, SCORE_COL.to_string()));
1917 }
1918 }
1919
1920 if self.legacy_with_row_id {
1921 let row_id_pos = output_expr
1922 .iter()
1923 .position(|(_, name)| name == ROW_ID)
1924 .ok_or_else(|| {
1925 Error::internal(
1926 "user specified with_row_id but the _rowid column was not in the output"
1927 .to_string(),
1928 )
1929 })?;
1930 if row_id_pos != output_expr.len() - 1 {
1931 let row_id_expr = output_expr.remove(row_id_pos);
1933 output_expr.push(row_id_expr);
1934 }
1935 }
1936
1937 if self.legacy_with_row_addr {
1938 let row_addr_pos = output_expr.iter().position(|(_, name)| name == ROW_ADDR).ok_or_else(|| {
1939 Error::internal("user specified with_row_address but the _rowaddr column was not in the output".to_string())
1940 })?;
1941 if row_addr_pos != output_expr.len() - 1 {
1942 let row_addr_expr = output_expr.remove(row_addr_pos);
1944 output_expr.push(row_addr_expr);
1945 }
1946 }
1947
1948 Ok(output_expr)
1949 }
1950
1951 #[instrument(skip_all)]
1953 pub fn try_into_stream(&self) -> BoxFuture<'_, Result<DatasetRecordBatchStream>> {
1954 async move {
1956 let plan = self.create_plan().await?;
1957
1958 Ok(DatasetRecordBatchStream::new(execute_plan(
1959 plan,
1960 LanceExecutionOptions {
1961 batch_size: self.batch_size,
1962 execution_stats_callback: self.scan_stats_callback.clone(),
1963 ..Default::default()
1964 },
1965 )?))
1966 }
1967 .boxed()
1968 }
1969
1970 pub(crate) async fn try_into_dfstream(
1971 &self,
1972 mut options: LanceExecutionOptions,
1973 ) -> Result<SendableRecordBatchStream> {
1974 let plan = self.create_plan().await?;
1975
1976 if options.execution_stats_callback.is_none() {
1978 options.execution_stats_callback = self.scan_stats_callback.clone();
1979 }
1980
1981 execute_plan(plan, options)
1982 }
1983
/// Returns the execution options derived from this scanner.
///
/// Carries the scanner's batch size and a clone of its scan-stats callback;
/// every other option is left at its default.
pub(crate) fn execution_options(&self) -> LanceExecutionOptions {
    LanceExecutionOptions {
        batch_size: self.batch_size,
        execution_stats_callback: self.scan_stats_callback.clone(),
        ..Default::default()
    }
}
1991
1992 pub async fn try_into_batch(&self) -> Result<RecordBatch> {
1993 let stream = self.try_into_stream().await?;
1994 let schema = stream.schema();
1995 let batches = stream.try_collect::<Vec<_>>().await?;
1996 Ok(concat_batches(&schema, &batches)?)
1997 }
1998
1999 #[instrument(skip_all)]
2004 pub fn count_rows(&self) -> BoxFuture<'_, Result<u64>> {
2005 async move {
2007 let mut scanner = self.clone();
2008 scanner.aggregate(AggregateExpr::builder().count_star().build())?;
2009
2010 let plan = scanner.create_plan().await?;
2011 let mut stream = execute_plan(plan, LanceExecutionOptions::default())?;
2012
2013 if let Some(first_batch) = stream.next().await {
2015 let batch = first_batch?;
2016 let array = batch
2017 .column(0)
2018 .as_any()
2019 .downcast_ref::<Int64Array>()
2020 .ok_or(Error::invalid_input(
2021 "Count plan did not return an Int64Array".to_string(),
2022 ))?;
2023 Ok(array.value(0) as u64)
2024 } else {
2025 Ok(0)
2026 }
2027 }
2028 .boxed()
2029 }
2030
2031 #[deprecated(note = "Use create_plan() instead, which now applies aggregate automatically")]
2035 pub fn create_aggregate_plan(&self) -> BoxFuture<'_, Result<Arc<dyn ExecutionPlan>>> {
2036 async move {
2037 if self.aggregate.is_none() {
2038 return Err(Error::invalid_input(
2039 "create_aggregate_plan called but no aggregate was set",
2040 ));
2041 }
2042 self.create_plan().await
2044 }
2045 .boxed()
2046 }
2047
2048 async fn apply_aggregate(
2049 &self,
2050 plan: Arc<dyn ExecutionPlan>,
2051 agg: &Aggregate,
2052 ) -> Result<Arc<dyn ExecutionPlan>> {
2053 use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
2054
2055 let schema = plan.schema();
2056 let df_schema = DFSchema::try_from(schema.as_ref().clone())?;
2057
2058 let group_exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = agg
2059 .group_by
2060 .iter()
2061 .map(|expr| {
2062 let name = expr.schema_name().to_string();
2063 let physical_expr =
2064 create_physical_expr(expr, &df_schema, &ExecutionProps::default())?;
2065 Ok((physical_expr, name))
2066 })
2067 .collect::<Result<_>>()?;
2068
2069 #[allow(clippy::type_complexity)]
2070 let aggr_results: Vec<(Arc<AggregateFunctionExpr>, Option<Arc<dyn PhysicalExpr>>)> = agg
2071 .aggregates
2072 .iter()
2073 .map(|expr| self.build_physical_aggregate_expr(expr, &df_schema, &schema))
2074 .collect::<Result<_>>()?;
2075
2076 let (aggr_exprs, filters): (Vec<_>, Vec<_>) = aggr_results.into_iter().unzip();
2077
2078 Ok(Arc::new(AggregateExec::try_new(
2079 AggregateMode::Single,
2080 PhysicalGroupBy::new_single(group_exprs),
2081 aggr_exprs,
2082 filters,
2083 plan,
2084 schema,
2085 )?) as Arc<dyn ExecutionPlan>)
2086 }
2087
2088 #[allow(clippy::type_complexity)]
2089 fn build_physical_aggregate_expr(
2090 &self,
2091 expr: &Expr,
2092 df_schema: &DFSchema,
2093 input_schema: &SchemaRef,
2094 ) -> Result<(
2095 Arc<datafusion_physical_expr::aggregate::AggregateFunctionExpr>,
2096 Option<Arc<dyn PhysicalExpr>>,
2097 )> {
2098 use datafusion::physical_planner::create_aggregate_expr_and_maybe_filter;
2099
2100 let coerced_expr = self.coerce_aggregate_expr(expr, df_schema)?;
2101
2102 let (agg_expr, filter, _order_by) = create_aggregate_expr_and_maybe_filter(
2104 &coerced_expr,
2105 df_schema,
2106 input_schema.as_ref(),
2107 &ExecutionProps::default(),
2108 )?;
2109
2110 Ok((agg_expr, filter))
2111 }
2112
/// Coerces the argument types of an aggregate expression.
///
/// Thin method wrapper around the associated function
/// `coerce_aggregate_expr_impl`, which does the actual work.
fn coerce_aggregate_expr(&self, expr: &Expr, schema: &DFSchema) -> Result<Expr> {
    Self::coerce_aggregate_expr_impl(expr, schema)
}
2123
2124 fn coerce_aggregate_expr_impl(expr: &Expr, schema: &DFSchema) -> Result<Expr> {
2125 use datafusion::logical_expr::Expr;
2126 use datafusion::logical_expr::expr::AggregateFunction;
2127 use datafusion::logical_expr::type_coercion::functions::fields_with_udf;
2128
2129 match expr {
2130 Expr::AggregateFunction(agg_func) => {
2131 let func = &agg_func.func;
2132 let args = &agg_func.params.args;
2133
2134 if args.is_empty() {
2135 return Ok(expr.clone());
2136 }
2137
2138 let current_fields: Vec<arrow_schema::FieldRef> = args
2139 .iter()
2140 .enumerate()
2141 .map(|(i, e)| {
2142 let dt = e.get_type(schema)?;
2143 Ok(Arc::new(arrow_schema::Field::new(
2144 format!("arg_{i}"),
2145 dt,
2146 true,
2147 )))
2148 })
2149 .collect::<std::result::Result<_, datafusion::common::DataFusionError>>()?;
2150
2151 let coerced_fields = fields_with_udf(¤t_fields, func.as_ref())?;
2152 let coerced_args: Vec<Expr> = args
2153 .iter()
2154 .zip(coerced_fields.iter())
2155 .map(|(arg, target_field)| {
2156 let arg_type = arg.get_type(schema)?;
2157 let target_type = target_field.data_type();
2158 if arg_type == *target_type {
2159 Ok(arg.clone())
2160 } else {
2161 arg.clone().cast_to(target_type, schema)
2162 }
2163 })
2164 .collect::<std::result::Result<_, _>>()?;
2165
2166 Ok(Expr::AggregateFunction(AggregateFunction::new_udf(
2167 func.clone(),
2168 coerced_args,
2169 agg_func.params.distinct,
2170 agg_func.params.filter.clone(),
2171 agg_func.params.order_by.clone(),
2172 agg_func.params.null_treatment,
2173 )))
2174 }
2175 Expr::Alias(alias) => {
2176 let coerced_inner = Self::coerce_aggregate_expr_impl(&alias.expr, schema)?;
2178 Ok(coerced_inner.alias(&alias.name))
2179 }
2180 other => Err(Error::invalid_input(format!(
2181 "Expected aggregate function expression, got {:?}",
2182 other.variant_name()
2183 ))),
2184 }
2185 }
2186
2187 fn is_early_field(&self, field: &Field) -> bool {
2208 match self.materialization_style {
2209 MaterializationStyle::AllEarly => true,
2210 MaterializationStyle::AllLate => false,
2211 MaterializationStyle::AllEarlyExcept(ref cols) => !cols.contains(&(field.id as u32)),
2212 MaterializationStyle::Heuristic => {
2213 if field.is_blob() {
2214 return true;
2219 }
2220
2221 let byte_width = field.data_type().byte_width_opt();
2222 let is_cloud = self.dataset.object_store.as_ref().is_cloud();
2223 if is_cloud {
2224 byte_width.is_some_and(|bw| bw < 1000)
2225 } else {
2226 byte_width.is_some_and(|bw| bw < 10)
2227 }
2228 }
2229 }
2230 }
2231
2232 fn calc_eager_projection(
2237 &self,
2238 filter_plan: &ExprFilterPlan,
2239 desired_projection: &Projection,
2240 ) -> Result<Projection> {
2241 let filter_columns = filter_plan.all_columns();
2248
2249 let filter_schema = self
2250 .dataset
2251 .empty_projection()
2252 .union_columns(filter_columns, OnMissing::Error)?
2253 .into_schema();
2254
2255 Ok(desired_projection
2257 .clone()
2258 .subtract_predicate(|f| !self.is_early_field(f))
2260 .union_schema(&filter_schema))
2262 }
2263
2264 fn validate_options(&self) -> Result<()> {
2265 if self.include_deleted_rows && !self.projection_plan.physical_projection.with_row_id {
2266 return Err(Error::invalid_input_source(
2267 "include_deleted_rows is set but with_row_id is false".into(),
2268 ));
2269 }
2270
2271 if self.aggregate.is_some() {
2272 if self.limit.is_some() || self.offset.is_some() {
2273 return Err(Error::invalid_input_source(
2274 "Cannot use limit/offset with aggregate. Apply limit to the result instead."
2275 .into(),
2276 ));
2277 }
2278 if self.ordering.is_some() {
2279 return Err(Error::invalid_input_source(
2280 "Cannot use order_by with aggregate. Apply ordering to the result instead."
2281 .into(),
2282 ));
2283 }
2284 }
2285
2286 if self.index_segments.is_some() && self.nearest.is_none() {
2287 return Err(Error::not_supported(
2288 "with_index_segments is only supported for vector search".to_string(),
2289 ));
2290 }
2291
2292 Ok(())
2293 }
2294
2295 async fn create_filter_plan(&self, use_scalar_index: bool) -> Result<FilterPlan> {
2296 let filter_schema = self.filterable_schema()?;
2297 let planner = Planner::new(Arc::new(filter_schema.as_ref().into()));
2298
2299 let filter_plan = if let Some(filter) = self.filter.expr_filter.as_ref() {
2301 let expr = filter.to_datafusion(self.dataset.schema(), filter_schema.as_ref())?;
2302 let index_info = self.dataset.scalar_index_info().await?;
2303 let filter_plan =
2304 planner.create_filter_plan(expr.clone(), &index_info, use_scalar_index)?;
2305
2306 if filter_plan.index_query.is_some() {
2309 let fragments = if let Some(fragments) = self.fragments.as_ref() {
2310 fragments
2311 } else {
2312 self.dataset.fragments()
2313 };
2314 let mut has_missing_row_count = false;
2315 for frag in fragments {
2316 if frag.physical_rows.is_none() {
2317 has_missing_row_count = true;
2318 break;
2319 }
2320 }
2321 if has_missing_row_count {
2322 let filter_plan =
2325 planner.create_filter_plan(expr.clone(), &index_info, false)?;
2326 FilterPlan::new(self.filter.query_filter.clone(), filter_plan)
2327 } else {
2328 FilterPlan::new(self.filter.query_filter.clone(), filter_plan)
2329 }
2330 } else {
2331 FilterPlan::new(self.filter.query_filter.clone(), filter_plan)
2332 }
2333 } else {
2334 FilterPlan::new(self.filter.query_filter.clone(), ExprFilterPlan::default())
2335 };
2336
2337 if filter_plan.query_filter.is_some()
2339 && self.nearest.is_none()
2340 && self.full_text_query.is_none()
2341 {
2342 return Err(Error::invalid_input_source(
2343 "Query filter can only be used with full text search or vector search".into(),
2344 ));
2345 }
2346 if self.nearest.is_some() && filter_plan.vector_filter().is_some() {
2347 return Err(Error::invalid_input_source(
2348 "Query filter can't be used with vector search".into(),
2349 ));
2350 }
2351 if self.full_text_query.is_some() && filter_plan.fts_filter().is_some() {
2352 return Err(Error::invalid_input_source(
2353 "Fts filter can't be used with fts search".into(),
2354 ));
2355 }
2356
2357 Ok(filter_plan)
2358 }
2359
2360 async fn get_scan_range(&self, filter_plan: &ExprFilterPlan) -> Result<Option<Range<u64>>> {
2361 if filter_plan.has_any_filter() {
2362 Ok(None)
2364 } else if self.ordering.is_some() {
2365 Ok(None)
2368 } else if self.dataset.manifest.uses_stable_row_ids() {
2369 Ok(None)
2376 } else {
2377 match (self.limit, self.offset) {
2378 (None, None) => Ok(None),
2379 (Some(limit), None) => {
2380 let num_rows = self.dataset.count_all_rows().await? as i64;
2381 Ok(Some(0..limit.min(num_rows) as u64))
2382 }
2383 (None, Some(offset)) => {
2384 let num_rows = self.dataset.count_all_rows().await? as i64;
2385 Ok(Some(offset.min(num_rows) as u64..num_rows as u64))
2386 }
2387 (Some(limit), Some(offset)) => {
2388 let num_rows = self.dataset.count_all_rows().await? as i64;
2389 Ok(Some(
2390 offset.min(num_rows) as u64..(offset + limit).min(num_rows) as u64,
2391 ))
2392 }
2393 }
2394 }
2395 }
2396
/// Builds the physical execution plan for this scan.
///
/// The plan is assembled in stages:
///
/// 1. validate options and plan the filters;
/// 2. pick the source: vector search, full text search, a take operation
///    extracted from the filter, or a filtered read;
/// 3. load the columns needed by the refine filter and by the ordering, then
///    apply the refine filter;
/// 4. if an aggregate is set: load its columns, apply it, run the physical
///    optimizer rules and return early;
/// 5. otherwise: sort, limit/offset, load the remaining projected columns,
///    optionally add the row offset, apply the final projection, optionally
///    enforce a strict batch size, and run the physical optimizer rules.
///
/// # Errors
///
/// Returns validation errors, an invalid-input error when both a nearest and
/// a full text query are set, a not-supported error when a plain scan requests
/// only dynamic expressions, and any error from the individual planning steps.
#[instrument(level = "debug", skip_all)]
pub async fn create_plan(&self) -> Result<Arc<dyn ExecutionPlan>> {
    log::trace!("creating scanner plan");
    self.validate_options()?;

    // With a nearest query the scalar index is only used when prefiltering.
    let use_scalar_index = self.use_scalar_index && (self.prefilter || self.nearest.is_none());
    let mut filter_plan = self.create_filter_plan(use_scalar_index).await?;

    // Cleared when the source already applied the limit itself.
    let mut use_limit_node = true;
    // Stage: choose the source of the plan.
    let mut plan: Arc<dyn ExecutionPlan> = match (&self.nearest, &self.full_text_query) {
        (Some(_), None) => self.vector_search_source(&mut filter_plan).await?,
        (None, Some(query)) => self.fts_search_source(&mut filter_plan, query).await?,
        (None, None) => {
            // Output columns were requested but no physical column backs them.
            if self.projection_plan.has_output_cols()
                && self.projection_plan.physical_projection.is_empty()
            {
                let output_expr = self.calculate_final_projection(&ArrowSchema::empty())?;
                return Err(Error::not_supported_source(format!("Scans must request at least one column. Received only dynamic expressions: {:?}", output_expr).into()));
            }

            // Try to turn the filter into a take operation plus a remainder.
            let take_op = filter_plan
                .expr_filter_plan
                .full_expr
                .as_ref()
                .and_then(TakeOperation::try_from_expr);
            if let Some((take_op, remainder)) = take_op {
                // Whatever the take did not cover stays as a refine-only filter.
                filter_plan.expr_filter_plan = remainder
                    .map(ExprFilterPlan::new_refine_only)
                    .unwrap_or(ExprFilterPlan::default());
                self.take_source(take_op).await?
            } else {
                let planned_read = self
                    .filtered_read_source(&mut filter_plan.expr_filter_plan)
                    .await?;
                if planned_read.limit_pushed_down {
                    use_limit_node = false;
                }
                if planned_read.filter_pushed_down {
                    filter_plan.disable_refine();
                }
                planned_read.plan
            }
        }
        _ => {
            return Err(Error::invalid_input_source(
                "Cannot have both nearest and full text search".into(),
            ));
        }
    };

    // Stage: collect the columns that must be present before filtering.
    let mut pre_filter_projection = self.dataset.empty_projection();

    if filter_plan.has_refine() {
        pre_filter_projection = pre_filter_projection.union_columns(
            filter_plan.refine_columns(&self.dataset).await?,
            OnMissing::Ignore,
        )?;
    }

    if let Some(ordering) = &self.ordering {
        pre_filter_projection = pre_filter_projection.union_columns(
            ordering.iter().map(|col| &col.column_name),
            OnMissing::Error,
        )?;
    }

    plan = self.take(plan, pre_filter_projection)?;

    // Stage: apply the refine filter.
    plan = filter_plan.refine_filter(plan, self).await?;

    // Stage: aggregate scans end here; no sort, limit or final projection.
    if let Some(agg) = &self.aggregate {
        let required_columns = agg.required_columns();
        let agg_projection = if required_columns.is_empty() {
            self.dataset.empty_projection()
        } else {
            self.dataset
                .empty_projection()
                .union_columns(&required_columns, OnMissing::Error)?
        };
        plan = self.take(plan, agg_projection)?;
        plan = self.apply_aggregate(plan, agg).await?;

        let optimizer = get_physical_optimizer();
        let options = Default::default();
        for rule in optimizer.rules {
            plan = rule.optimize(plan, &options)?;
        }

        return Ok(plan);
    }

    // Stage: sort by the requested ordering columns.
    if let Some(ordering) = &self.ordering {
        let ordering_columns = ordering.iter().map(|col| &col.column_name);
        let projection_with_ordering = self
            .dataset
            .empty_projection()
            .union_columns(ordering_columns, OnMissing::Error)?;
        plan = self.take(plan, projection_with_ordering)?;
        let col_exprs = ordering
            .iter()
            .map(|col| {
                Ok(PhysicalSortExpr {
                    expr: Self::create_column_expr(
                        &col.column_name,
                        &self.dataset,
                        plan.schema().as_ref(),
                    )?,
                    options: SortOptions {
                        descending: !col.ascending,
                        nulls_first: col.nulls_first,
                    },
                })
            })
            .collect::<Result<Vec<_>>>()?;
        plan = Arc::new(SortExec::new(
            LexOrdering::new(col_exprs)
                .ok_or(exec_datafusion_err!("Unexpected empty sort expressions"))?,
            plan,
        ));
    }

    // Stage: limit / offset, unless the source already handled it.
    // NOTE(review): a limit of `Some(0)` without an offset does not add a limit
    // node here — confirm that this is intended.
    if use_limit_node && (self.limit.unwrap_or(0) > 0 || self.offset.is_some()) {
        plan = self.limit_node(plan);
    }

    // Stage: load the remaining projected columns.
    plan = self.take(plan, self.projection_plan.physical_projection.clone())?;

    if self.projection_plan.must_add_row_offset {
        plan = Arc::new(AddRowOffsetExec::try_new(plan, self.dataset.clone()).await?);
    }

    // Stage: final output projection.
    let final_projection = self.calculate_final_projection(plan.schema().as_ref())?;

    plan = Arc::new(DFProjectionExec::try_new(final_projection, plan)?);

    if self.strict_batch_size {
        plan = Arc::new(StrictBatchSizeExec::new(plan, self.get_batch_size()));
    }

    // Stage: run every physical optimizer rule over the finished plan.
    let optimizer = get_physical_optimizer();
    let options: ConfigOptions = Default::default();
    for rule in optimizer.rules {
        plan = rule.optimize(plan, &options)?;
    }

    Ok(plan)
}
2622
2623 fn filter_references_version_columns(&self, filter_plan: &ExprFilterPlan) -> bool {
2625 use lance_core::{ROW_CREATED_AT_VERSION, ROW_LAST_UPDATED_AT_VERSION};
2626
2627 if let Some(refine_expr) = &filter_plan.refine_expr {
2628 let column_names = Planner::column_names_in_expr(refine_expr);
2629 for col_name in column_names {
2630 if col_name == ROW_CREATED_AT_VERSION || col_name == ROW_LAST_UPDATED_AT_VERSION {
2631 return true;
2632 }
2633 }
2634 }
2635 false
2636 }
2637
/// Plans a filtered read for datasets using legacy storage.
///
/// One of three strategies is chosen:
///
/// * the filter has an index query: a scalar indexed scan (rejected when
///   deleted rows are to be included);
/// * not a prefilter, the filter has a refine expression, no batch size is
///   set, stats are enabled and the filter does not touch the row-version
///   columns: a pushdown scan, reported as `filter_pushed_down`;
/// * otherwise: a plain fragment scan, wrapped in a filter node when this is a
///   prefilter with a refine expression.
///
/// `limit_pushed_down` is always `false` in the returned value.
async fn legacy_filtered_read(
    &self,
    filter_plan: &ExprFilterPlan,
    projection: Projection,
    make_deletions_null: bool,
    fragments: Option<Arc<Vec<Fragment>>>,
    scan_range: Option<Range<u64>>,
    is_prefilter: bool,
) -> Result<PlannedFilteredScan> {
    // Default to all fragments of the dataset.
    let fragments = fragments.unwrap_or(self.dataset.fragments().clone());
    let mut filter_pushed_down = false;

    let plan: Arc<dyn ExecutionPlan> = if filter_plan.has_index_query() {
        if self.include_deleted_rows {
            return Err(Error::invalid_input_source(
                "Cannot include deleted rows in a scalar indexed scan".into(),
            ));
        }
        self.scalar_indexed_scan(projection, filter_plan, fragments)
            .await
    } else if !is_prefilter
        && filter_plan.has_refine()
        && self.batch_size.is_none()
        && self.use_stats
        && !self.filter_references_version_columns(filter_plan)
    {
        filter_pushed_down = true;
        self.pushdown_scan(false, filter_plan)
    } else {
        // Ordered output is never requested when a sort or a nearest query
        // follows, always requested when a row-version column is projected,
        // and otherwise follows the scanner's `ordered` setting.
        let ordered = if self.ordering.is_some() || self.nearest.is_some() {
            false
        } else if projection.with_row_last_updated_at_version
            || projection.with_row_created_at_version
        {
            true
        } else {
            self.ordered
        };

        // A prefilter is evaluated right here, so its columns must be read.
        let projection = if let Some(refine_expr) = filter_plan.refine_expr.as_ref() {
            if is_prefilter {
                let refine_cols = Planner::column_names_in_expr(refine_expr);
                projection.union_columns(refine_cols, OnMissing::Error)?
            } else {
                projection
            }
        } else {
            projection
        };

        // The scan range is dropped whenever a refine filter is present.
        let scan_range = if filter_plan.has_refine() {
            None
        } else {
            scan_range
        };

        let scan = self.scan_fragments(
            projection.with_row_id,
            self.projection_plan.physical_projection.with_row_addr,
            self.projection_plan
                .physical_projection
                .with_row_last_updated_at_version,
            self.projection_plan
                .physical_projection
                .with_row_created_at_version,
            make_deletions_null,
            Arc::new(projection.to_bare_schema()),
            fragments,
            scan_range,
            ordered,
        );

        if filter_plan.has_refine() && is_prefilter {
            Ok(Arc::new(LanceFilterExec::try_new(
                filter_plan.refine_expr.clone().unwrap(),
                scan,
            )?) as Arc<dyn ExecutionPlan>)
        } else {
            Ok(scan)
        }
    }?;
    Ok(PlannedFilteredScan {
        plan,
        limit_pushed_down: false,
        filter_pushed_down,
    })
}
2734
/// Plans a filtered read using [`FilteredReadExec`].
///
/// Read options start from a basic full read of the dataset with the given
/// filter plan and projection. Each optional setting (fragments, scan range,
/// batch size, file reader options, fragment readahead, deleted rows, I/O
/// buffer size) is applied only when it is set. If the filter plan has an
/// index query, a [`ScalarIndexExec`] is passed as the index input.
async fn new_filtered_read(
    &self,
    filter_plan: &ExprFilterPlan,
    projection: Projection,
    make_deletions_null: bool,
    fragments: Option<Arc<Vec<Fragment>>>,
    scan_range: Option<Range<u64>>,
) -> Result<Arc<dyn ExecutionPlan>> {
    let mut read_options = FilteredReadOptions::basic_full_read(&self.dataset)
        .with_filter_plan(filter_plan.clone())
        .with_projection(projection);

    if let Some(fragments) = fragments {
        read_options = read_options.with_fragments(fragments);
    }

    if let Some(scan_range) = scan_range {
        read_options = read_options.with_scan_range_before_filter(scan_range)?;
    }

    if let Some(batch_size) = self.batch_size {
        read_options = read_options.with_batch_size(batch_size as u32);
    }

    if let Some(file_reader_options) = self.resolved_file_reader_options() {
        read_options = read_options.with_file_reader_options(file_reader_options);
    }

    if let Some(fragment_readahead) = self.fragment_readahead {
        read_options = read_options.with_fragment_readahead(fragment_readahead);
    }

    if make_deletions_null {
        read_options = read_options.with_deleted_rows()?;
    }

    if let Some(io_buffer_size_bytes) = self.io_buffer_size {
        read_options = read_options.with_io_buffer_size(io_buffer_size_bytes);
    }

    // The scalar index query, if any, runs as the read's input plan.
    let index_input = filter_plan.index_query.clone().map(|index_query| {
        Arc::new(ScalarIndexExec::new(self.dataset.clone(), index_query))
            as Arc<dyn ExecutionPlan>
    });

    Ok(Arc::new(FilteredReadExec::try_new(
        self.dataset.clone(),
        read_options,
        index_input,
    )?))
}
2789
2790 async fn filtered_read(
2794 &self,
2795 filter_plan: &ExprFilterPlan,
2796 projection: Projection,
2797 make_deletions_null: bool,
2798 fragments: Option<Arc<Vec<Fragment>>>,
2799 scan_range: Option<Range<u64>>,
2800 is_prefilter: bool,
2801 ) -> Result<PlannedFilteredScan> {
2802 if self.dataset.is_legacy_storage() {
2804 self.legacy_filtered_read(
2805 filter_plan,
2806 projection,
2807 make_deletions_null,
2808 fragments,
2809 scan_range,
2810 is_prefilter,
2811 )
2812 .await
2813 } else {
2814 let limit_pushed_down = scan_range.is_some();
2815 let plan = self
2816 .new_filtered_read(
2817 filter_plan,
2818 projection,
2819 make_deletions_null,
2820 fragments,
2821 scan_range,
2822 )
2823 .await?;
2824 Ok(PlannedFilteredScan {
2825 filter_pushed_down: true,
2826 limit_pushed_down,
2827 plan,
2828 })
2829 }
2830 }
2831
2832 fn u64s_as_take_input(&self, u64s: Vec<u64>) -> Result<Arc<dyn ExecutionPlan>> {
2833 let row_addrs = RowAddrTreeMap::from_iter(u64s);
2834 let row_addr_mask = RowAddrMask::from_allowed(row_addrs);
2835 let index_result = IndexExprResult::Exact(row_addr_mask);
2836 let fragments_covered = self.dataset.fragment_bitmap.as_ref().clone();
2837 let batch = index_result.serialize_to_arrow(&fragments_covered)?;
2838 let stream = futures::stream::once(async move { Ok(batch) });
2839 let stream = Box::pin(RecordBatchStreamAdapter::new(
2840 INDEX_EXPR_RESULT_SCHEMA.clone(),
2841 stream,
2842 ));
2843 Ok(Arc::new(OneShotExec::new(stream)))
2844 }
2845
2846 async fn take_source(&self, take_op: TakeOperation) -> Result<Arc<dyn ExecutionPlan>> {
2847 let projection = self.projection_plan.physical_projection.clone();
2850
2851 let input = match take_op {
2852 TakeOperation::RowIds(ids) => self.u64s_as_take_input(ids),
2853 TakeOperation::RowAddrs(addrs) => self.u64s_as_take_input(addrs),
2854 TakeOperation::RowOffsets(offsets) => {
2855 let mut addrs =
2856 row_offsets_to_row_addresses(&self.dataset.get_fragments(), &offsets).await?;
2857 addrs.retain(|addr| *addr != RowAddress::TOMBSTONE_ROW);
2858 self.u64s_as_take_input(addrs)
2859 }
2860 }?;
2861
2862 let mut filtered_read_options = FilteredReadOptions::new(projection);
2863 if let Some(fragment) = self.fragments.as_ref() {
2864 filtered_read_options =
2865 filtered_read_options.with_fragments(Arc::new(fragment.clone()));
2866 }
2867
2868 Ok(Arc::new(FilteredReadExec::try_new(
2869 self.dataset.clone(),
2870 filtered_read_options,
2871 Some(input),
2872 )?))
2873 }
2874
2875 async fn filtered_read_source(
2876 &self,
2877 filter_plan: &mut ExprFilterPlan,
2878 ) -> Result<PlannedFilteredScan> {
2879 log::trace!("source is a filtered read");
2880
2881 let effective_projection = if let Some(agg) = &self.aggregate {
2885 let required_columns = agg.required_columns();
2886 if required_columns.is_empty() {
2887 self.dataset.empty_projection()
2889 } else {
2890 self.dataset
2892 .empty_projection()
2893 .union_columns(&required_columns, OnMissing::Error)?
2894 }
2895 } else {
2896 self.projection_plan.physical_projection.clone()
2897 };
2898
2899 let mut projection = if filter_plan.has_refine() {
2900 self.calc_eager_projection(filter_plan, &effective_projection)?
2904 .with_row_id()
2905 } else {
2906 effective_projection
2909 };
2910
2911 if projection.is_empty() {
2912 projection.with_row_addr = true;
2915 }
2916
2917 let scan_range = if filter_plan.is_empty() {
2918 log::trace!("pushing scan_range into filtered_read");
2919 self.get_scan_range(filter_plan).await?
2920 } else {
2921 None
2922 };
2923
2924 self.filtered_read(
2925 filter_plan,
2926 projection,
2927 self.include_deleted_rows,
2928 self.fragments.clone().map(Arc::new),
2929 scan_range,
2930 false,
2931 )
2932 .await
2933 }
2934
2935 async fn fts_search_source(
2936 &self,
2937 filter_plan: &mut FilterPlan,
2938 query: &FullTextSearchQuery,
2939 ) -> Result<Arc<dyn ExecutionPlan>> {
2940 log::trace!("source is an fts search");
2941 if self.include_deleted_rows {
2942 return Err(Error::invalid_input_source(
2943 "Cannot include deleted rows in an FTS search".into(),
2944 ));
2945 }
2946
2947 if self.prefilter {
2949 let source: Arc<dyn ExecutionPlan> = match &filter_plan.vector_filter() {
2950 Some(vector_query) => {
2951 let vector_plan = self
2953 .vector_search(&filter_plan.expr_filter_plan, vector_query)
2954 .await?;
2955 self.fts_rerank(vector_plan, query).await?
2956 }
2957 None => self.fts(&filter_plan.expr_filter_plan, query).await?,
2958 };
2959 filter_plan.disable_refine();
2961 Ok(source)
2962 } else {
2963 filter_plan.make_refine_only();
2966 self.fts(&ExprFilterPlan::default(), query).await
2967 }
2968 }
2969
2970 async fn vector_search_source(
2971 &self,
2972 filter_plan: &mut FilterPlan,
2973 ) -> Result<Arc<dyn ExecutionPlan>> {
2974 if self.include_deleted_rows {
2975 return Err(Error::invalid_input_source(
2976 "Cannot include deleted rows in a nearest neighbor search".into(),
2977 ));
2978 }
2979 let Some(query) = self.nearest.as_ref() else {
2980 return Err(Error::invalid_input("No nearest query".to_string()));
2981 };
2982
2983 if self.prefilter {
2984 log::trace!("source is a vector search (prefilter)");
2985 let source: Arc<dyn ExecutionPlan> = match &filter_plan.fts_filter() {
2987 Some(fts_query) => {
2988 let fts_plan = self.fts(&filter_plan.expr_filter_plan, fts_query).await?;
2989 let projection = self
2990 .dataset
2991 .empty_projection()
2992 .union_column(&query.column, OnMissing::Error)?;
2993 let plan = self.take(fts_plan, projection)?;
2994
2995 self.flat_knn(plan, query)?
2996 }
2997 None => {
2998 self.vector_search(&filter_plan.expr_filter_plan, query)
2999 .await?
3000 }
3001 };
3002
3003 filter_plan.disable_refine();
3004 Ok(source)
3005 } else {
3006 log::trace!("source is a vector search (postfilter)");
3007 filter_plan.make_refine_only();
3010 self.vector_search(&ExprFilterPlan::default(), query).await
3011 }
3012 }
3013
3014 async fn fragments_covered_by_fts_leaf(
3015 &self,
3016 column: &str,
3017 accum: &mut RoaringBitmap,
3018 ) -> Result<bool> {
3019 let index = self
3020 .dataset
3021 .load_scalar_index(IndexCriteria::default().for_column(column).supports_fts())
3022 .await?;
3023 match index {
3024 Some(index) => match &index.fragment_bitmap {
3025 Some(fragmap) => {
3026 *accum |= fragmap;
3027 Ok(true)
3028 }
3029 None => Ok(false),
3030 },
3031 None => Ok(false),
3032 }
3033 }
3034
3035 #[async_recursion]
3036 async fn fragments_covered_by_fts_query_helper(
3037 &self,
3038 query: &FtsQuery,
3039 accum: &mut RoaringBitmap,
3040 ) -> Result<bool> {
3041 match query {
3042 FtsQuery::Match(match_query) => {
3043 self.fragments_covered_by_fts_leaf(
3044 match_query.column.as_ref().ok_or(Error::invalid_input(
3045 "the column must be specified in the query".to_string(),
3046 ))?,
3047 accum,
3048 )
3049 .await
3050 }
3051 FtsQuery::Boost(boost) => Ok(self
3052 .fragments_covered_by_fts_query_helper(&boost.negative, accum)
3053 .await?
3054 & self
3055 .fragments_covered_by_fts_query_helper(&boost.positive, accum)
3056 .await?),
3057 FtsQuery::MultiMatch(multi_match) => {
3058 for mq in &multi_match.match_queries {
3059 if !self
3060 .fragments_covered_by_fts_leaf(
3061 mq.column.as_ref().ok_or(Error::invalid_input(
3062 "the column must be specified in the query".to_string(),
3063 ))?,
3064 accum,
3065 )
3066 .await?
3067 {
3068 return Ok(false);
3069 }
3070 }
3071 Ok(true)
3072 }
3073 FtsQuery::Phrase(phrase_query) => {
3074 self.fragments_covered_by_fts_leaf(
3075 phrase_query.column.as_ref().ok_or(Error::invalid_input(
3076 "the column must be specified in the query".to_string(),
3077 ))?,
3078 accum,
3079 )
3080 .await
3081 }
3082 FtsQuery::Boolean(bool_query) => {
3083 for query in bool_query.must.iter() {
3084 if !self
3085 .fragments_covered_by_fts_query_helper(query, accum)
3086 .await?
3087 {
3088 return Ok(false);
3089 }
3090 }
3091 for query in &bool_query.should {
3092 if !self
3093 .fragments_covered_by_fts_query_helper(query, accum)
3094 .await?
3095 {
3096 return Ok(false);
3097 }
3098 }
3099 Ok(true)
3100 }
3101 }
3102 }
3103
3104 async fn fragments_covered_by_fts_query(&self, query: &FtsQuery) -> Result<RoaringBitmap> {
3105 let all_fragments = self.get_fragments_as_bitmap();
3106
3107 let mut referenced_fragments = RoaringBitmap::new();
3108 if !self
3109 .fragments_covered_by_fts_query_helper(query, &mut referenced_fragments)
3110 .await?
3111 {
3112 Ok(all_fragments)
3114 } else {
3115 Ok(all_fragments & referenced_fragments)
3117 }
3118 }
3119
3120 async fn fts(
3122 &self,
3123 filter_plan: &ExprFilterPlan,
3124 query: &FullTextSearchQuery,
3125 ) -> Result<Arc<dyn ExecutionPlan>> {
3126 let columns = query.columns();
3127 let mut params = query.params();
3128 if params.limit.is_none() {
3129 let search_limit = match (self.limit, self.offset) {
3130 (Some(limit), Some(offset)) => Some((limit + offset) as usize),
3131 (Some(limit), None) => Some(limit as usize),
3132 (None, Some(_)) => None, (None, None) => None,
3134 };
3135 params = params.with_limit(search_limit);
3136 }
3137 let query = if columns.is_empty() {
3138 let indexed_columns = fts_indexed_columns(self.dataset.clone()).await?;
3141 fill_fts_query_column(&query.query, &indexed_columns, false)?
3142 } else {
3143 query.query.clone()
3144 };
3145
3146 let prefilter_source = self
3150 .prefilter_source(
3151 filter_plan,
3152 self.fragments_covered_by_fts_query(&query).await?,
3153 )
3154 .await?;
3155 let fts_exec = self
3156 .plan_fts(&query, ¶ms, filter_plan, &prefilter_source)
3157 .await?;
3158 Ok(fts_exec)
3159 }
3160
3161 async fn plan_fts(
3162 &self,
3163 query: &FtsQuery,
3164 params: &FtsSearchParams,
3165 filter_plan: &ExprFilterPlan,
3166 prefilter_source: &PreFilterSource,
3167 ) -> Result<Arc<dyn ExecutionPlan>> {
3168 let plan: Arc<dyn ExecutionPlan> = match query {
3169 FtsQuery::Match(query) => {
3170 self.plan_match_query(query, params, filter_plan, prefilter_source)
3171 .await?
3172 }
3173 FtsQuery::Phrase(query) => {
3174 self.plan_phrase_query(query, params, prefilter_source)
3175 .await?
3176 }
3177
3178 FtsQuery::Boost(query) => {
3179 let unlimited_params = params.clone().with_limit(None);
3183 let positive_exec = Box::pin(self.plan_fts(
3184 &query.positive,
3185 &unlimited_params,
3186 filter_plan,
3187 prefilter_source,
3188 ));
3189 let negative_exec = Box::pin(self.plan_fts(
3190 &query.negative,
3191 &unlimited_params,
3192 filter_plan,
3193 prefilter_source,
3194 ));
3195 let (positive_exec, negative_exec) =
3196 futures::future::try_join(positive_exec, negative_exec).await?;
3197 Arc::new(BoostQueryExec::new(
3198 query.clone(),
3199 params.clone(),
3200 positive_exec,
3201 negative_exec,
3202 ))
3203 }
3204
3205 FtsQuery::MultiMatch(query) => {
3206 let mut children = Vec::with_capacity(query.match_queries.len());
3207 for match_query in &query.match_queries {
3208 let child =
3209 self.plan_match_query(match_query, params, filter_plan, prefilter_source);
3210 children.push(child);
3211 }
3212 let children = futures::future::try_join_all(children).await?;
3213
3214 let schema = children[0].schema();
3215 let group_expr = vec![(
3216 expressions::col(ROW_ID, schema.as_ref())?,
3217 ROW_ID.to_string(),
3218 )];
3219
3220 let fts_node = UnionExec::try_new(children)?;
3221 let fts_node = Arc::new(RepartitionExec::try_new(
3222 fts_node,
3223 Partitioning::RoundRobinBatch(1),
3224 )?);
3225 let fts_node = Arc::new(AggregateExec::try_new(
3227 AggregateMode::Single,
3228 PhysicalGroupBy::new_single(group_expr),
3229 vec![Arc::new(
3230 datafusion_physical_expr::aggregate::AggregateExprBuilder::new(
3231 functions_aggregate::min_max::max_udaf(),
3232 vec![expressions::col(SCORE_COL, &schema)?],
3233 )
3234 .schema(schema.clone())
3235 .alias(SCORE_COL)
3236 .build()?,
3237 )],
3238 vec![None],
3239 fts_node,
3240 schema,
3241 )?);
3242 let sort_expr = PhysicalSortExpr {
3243 expr: expressions::col(SCORE_COL, fts_node.schema().as_ref())?,
3244 options: SortOptions {
3245 descending: true,
3246 nulls_first: false,
3247 },
3248 };
3249
3250 Arc::new(
3251 SortExec::new([sort_expr].into(), fts_node)
3252 .with_fetch(self.limit.map(|l| l as usize)),
3253 )
3254 }
3255 FtsQuery::Boolean(query) => {
3256 let unlimited_params = params.clone().with_limit(None);
3261
3262 let mut should = Vec::with_capacity(query.should.len());
3263 for subquery in &query.should {
3264 should.push(
3265 Box::pin(self.plan_fts(
3266 subquery,
3267 &unlimited_params,
3268 filter_plan,
3269 prefilter_source,
3270 ))
3271 .await?,
3272 );
3273 }
3274 let mut must = Vec::with_capacity(query.must.len());
3275 for subquery in &query.must {
3276 must.push(
3277 Box::pin(self.plan_fts(
3278 subquery,
3279 &unlimited_params,
3280 filter_plan,
3281 prefilter_source,
3282 ))
3283 .await?,
3284 );
3285 }
3286 let mut must_not = Vec::with_capacity(query.must_not.len());
3287 for subquery in &query.must_not {
3288 must_not.push(
3289 Box::pin(self.plan_fts(
3290 subquery,
3291 &unlimited_params,
3292 filter_plan,
3293 prefilter_source,
3294 ))
3295 .await?,
3296 );
3297 }
3298
3299 let should = build_boolean_query_children(BoolSlot::Should, should)?
3300 .expect("Should slot always returns Some");
3301 let must = build_boolean_query_children(BoolSlot::Must, must)?;
3302 let must_not = build_boolean_query_children(BoolSlot::MustNot, must_not)?
3303 .expect("MustNot slot always returns Some");
3304
3305 if query.should.is_empty() && must.is_none() {
3306 return Err(Error::invalid_input(
3307 "boolean query must have at least one should/must query".to_string(),
3308 ));
3309 }
3310
3311 Arc::new(BooleanQueryExec::new(
3312 query.clone(),
3313 params.clone(),
3314 should,
3315 must,
3316 must_not,
3317 ))
3318 }
3319 };
3320
3321 Ok(plan)
3322 }
3323
3324 async fn plan_phrase_query(
3325 &self,
3326 query: &PhraseQuery,
3327 params: &FtsSearchParams,
3328 prefilter_source: &PreFilterSource,
3329 ) -> Result<Arc<dyn ExecutionPlan>> {
3330 let column = query.column.clone().ok_or(Error::invalid_input(
3331 "the column must be specified in the query".to_string(),
3332 ))?;
3333
3334 let segments = load_segments(&self.dataset, &column)
3335 .await?
3336 .ok_or(Error::invalid_input(format!(
3337 "No Inverted index found for column {}",
3338 column
3339 )))?;
3340 let details = load_segment_details(&self.dataset, &column, &segments).await?;
3341
3342 if !details.with_position {
3343 return Err(Error::invalid_input("position is not found but required for phrase queries, try recreating the index with position"
3344 .to_string()));
3345 }
3346
3347 Ok(Arc::new(PhraseQueryExec::new(
3348 self.dataset.clone(),
3349 query.clone(),
3350 params.clone(),
3351 prefilter_source.clone(),
3352 )))
3353 }
3354
3355 async fn plan_match_query(
3356 &self,
3357 query: &MatchQuery,
3358 params: &FtsSearchParams,
3359 filter_plan: &ExprFilterPlan,
3360 prefilter_source: &PreFilterSource,
3361 ) -> Result<Arc<dyn ExecutionPlan>> {
3362 let column = query
3363 .column
3364 .as_ref()
3365 .ok_or(Error::invalid_input(
3366 "the column must be specified in the query".to_string(),
3367 ))?
3368 .clone();
3369
3370 let index = self
3371 .dataset
3372 .load_scalar_index(IndexCriteria::default().for_column(&column).supports_fts())
3373 .await?;
3374
3375 let target_fragments = self
3377 .fragments
3378 .clone()
3379 .unwrap_or_else(|| self.dataset.fragments().to_vec());
3380
3381 let (match_plan, flat_match_plan) = match &index {
3382 Some(index) => {
3383 let unindexed_fragments = self
3385 .retain_target_fragments(self.dataset.unindexed_fragments(&index.name).await?);
3386
3387 if unindexed_fragments.len() == target_fragments.len() {
3389 if self.fast_search {
3390 return Ok(Arc::new(EmptyExec::new(FTS_SCHEMA.clone())));
3391 }
3392 let flat_match_plan = self
3393 .plan_flat_match_query(unindexed_fragments, query, params, filter_plan)
3394 .await?;
3395 return Ok(flat_match_plan);
3396 }
3397
3398 let match_plan: Arc<dyn ExecutionPlan> = Arc::new(MatchQueryExec::new(
3400 self.dataset.clone(),
3401 query.clone(),
3402 params.clone(),
3403 prefilter_source.clone(),
3404 ));
3405
3406 if self.fast_search || unindexed_fragments.is_empty() {
3407 (Some(match_plan), None)
3408 } else {
3409 let flat_match_plan = self
3410 .plan_flat_match_query(unindexed_fragments, query, params, filter_plan)
3411 .await?;
3412 (Some(match_plan), Some(flat_match_plan))
3413 }
3414 }
3415 None => {
3416 if self.fast_search {
3417 return Ok(Arc::new(EmptyExec::new(FTS_SCHEMA.clone())));
3418 }
3419 let flat_match_plan = self
3421 .plan_flat_match_query(target_fragments.clone(), query, params, filter_plan)
3422 .await?;
3423 (None, Some(flat_match_plan))
3424 }
3425 };
3426
3427 let plan = match (match_plan, flat_match_plan) {
3429 (Some(match_plan), Some(flat_match_plan)) => {
3430 let match_plan = UnionExec::try_new(vec![match_plan, flat_match_plan])?;
3431 let match_plan = Arc::new(RepartitionExec::try_new(
3432 match_plan,
3433 Partitioning::RoundRobinBatch(1),
3434 )?);
3435 let sort_expr = PhysicalSortExpr {
3436 expr: expressions::col(SCORE_COL, match_plan.schema().as_ref())?,
3437 options: SortOptions {
3438 descending: true,
3439 nulls_first: false,
3440 },
3441 };
3442 Arc::new(SortExec::new([sort_expr].into(), match_plan).with_fetch(params.limit))
3443 }
3444 (Some(match_plan), None) => match_plan,
3445 (None, Some(flat_match_plan)) => flat_match_plan,
3446 (None, None) => unreachable!(),
3447 };
3448
3449 Ok(plan)
3450 }
3451
3452 async fn plan_flat_match_query(
3454 &self,
3455 fragments: Vec<Fragment>,
3456 query: &MatchQuery,
3457 params: &FtsSearchParams,
3458 filter_plan: &ExprFilterPlan,
3459 ) -> Result<Arc<dyn ExecutionPlan>> {
3460 let column = query
3461 .column
3462 .as_ref()
3463 .ok_or(Error::invalid_input(
3464 "the column must be specified in the query".to_string(),
3465 ))?
3466 .clone();
3467
3468 let mut columns = vec![column];
3469 if let Some(expr) = filter_plan.full_expr.as_ref() {
3470 let filter_columns = Planner::column_names_in_expr(expr);
3471 columns.extend(filter_columns);
3472 }
3473 let flat_fts_scan_schema = Arc::new(self.dataset.schema().project(&columns).unwrap());
3474 let mut scan_node = self.scan_fragments(
3475 true,
3476 false,
3477 false,
3478 false,
3479 false,
3480 flat_fts_scan_schema,
3481 Arc::new(fragments),
3482 None,
3483 false,
3484 );
3485
3486 if let Some(expr) = filter_plan.full_expr.as_ref() {
3487 scan_node = Arc::new(LanceFilterExec::try_new(expr.clone(), scan_node)?);
3489 }
3490
3491 let flat_match_plan = Arc::new(FlatMatchQueryExec::new(
3492 self.dataset.clone(),
3493 query.clone(),
3494 params.clone(),
3495 scan_node,
3496 ));
3497 Ok(flat_match_plan)
3498 }
3499
3500 async fn vector_search(
3502 &self,
3503 filter_plan: &ExprFilterPlan,
3504 q: &Query,
3505 ) -> Result<Arc<dyn ExecutionPlan>> {
3506 let mut q = q.clone();
3507
3508 let (vector_type, element_type) = get_vector_type(self.dataset.schema(), &q.column)?;
3510
3511 let column_id = self.dataset.schema().field_id(q.column.as_str())?;
3512 let use_index = q.use_index;
3513 let indices = if use_index {
3514 self.dataset.load_indices().await?
3515 } else {
3516 Arc::new(vec![])
3517 };
3518 let index_and_segments = if use_index {
3519 if let Some(requested_segments) = self.index_segments.as_ref() {
3520 let requested_segment_set =
3521 requested_segments.iter().copied().collect::<HashSet<_>>();
3522 let requested_index_segments = indices
3523 .iter()
3524 .filter(|idx| requested_segment_set.contains(&idx.uuid))
3525 .cloned()
3526 .collect::<Vec<_>>();
3527
3528 if requested_index_segments.len() != requested_segment_set.len() {
3529 let found_segment_set = requested_index_segments
3530 .iter()
3531 .map(|idx| idx.uuid)
3532 .collect::<HashSet<_>>();
3533 let missing_segments = requested_segment_set
3534 .difference(&found_segment_set)
3535 .map(ToString::to_string)
3536 .collect::<Vec<_>>();
3537 return Err(Error::invalid_input(format!(
3538 "with_index_segments referenced unknown index segments: {missing_segments:?}",
3539 )));
3540 }
3541
3542 if requested_index_segments
3543 .iter()
3544 .any(|idx| !idx.fields.contains(&column_id))
3545 {
3546 return Err(Error::invalid_input(format!(
3547 "with_index_segments contained a segment that does not belong to vector column '{}'",
3548 q.column
3549 )));
3550 }
3551
3552 let index_name = requested_index_segments[0].name.clone();
3553 if requested_index_segments
3554 .iter()
3555 .any(|idx| idx.name != index_name)
3556 {
3557 return Err(Error::invalid_input(
3558 "with_index_segments must reference segments from a single logical index"
3559 .to_string(),
3560 ));
3561 }
3562
3563 let selected_index_segments =
3564 self.retain_relevant_index_segments(requested_index_segments);
3565 if selected_index_segments.is_empty() {
3566 None
3567 } else {
3568 let idx = self
3569 .dataset
3570 .open_vector_index(
3571 q.column.as_str(),
3572 &selected_index_segments[0].uuid.to_string(),
3573 &NoOpMetricsCollector,
3574 )
3575 .await?;
3576 let index_metric = idx.metric_type();
3577 let use_this_index = match q.metric_type {
3578 Some(user_metric) => {
3579 if user_metric == index_metric {
3580 true
3581 } else {
3582 return Err(Error::invalid_input(format!(
3583 "with_index_segments requested metric {:?} but the selected index segments use {:?}",
3584 user_metric, index_metric
3585 )));
3586 }
3587 }
3588 None => true,
3589 };
3590 if use_this_index {
3591 Some((index_name, selected_index_segments, index_metric))
3592 } else {
3593 None
3594 }
3595 }
3596 } else if let Some(index) = indices.iter().find(|i| i.fields.contains(&column_id)) {
3597 let index_metric = if let Some(metric) =
3599 crate::index::vector::details::metric_type_from_index_metadata(index)
3600 {
3601 metric
3602 } else {
3603 let idx = self
3605 .dataset
3606 .open_vector_index(
3607 q.column.as_str(),
3608 &index.uuid.to_string(),
3609 &NoOpMetricsCollector,
3610 )
3611 .await?;
3612 idx.metric_type()
3613 };
3614
3615 let use_this_index = match q.metric_type {
3616 Some(user_metric) => {
3617 if user_metric == index_metric {
3618 true
3619 } else {
3620 log::warn!(
3621 "Requested metric {:?} is incompatible with index metric {:?}, falling back to brute-force search",
3622 user_metric,
3623 index_metric
3624 );
3625 false
3626 }
3627 }
3628 None => true,
3629 };
3630
3631 if use_this_index {
3632 let index_segments = self.retain_relevant_index_segments(
3633 self.dataset.load_indices_by_name(&index.name).await?,
3634 );
3635 let index_frags = self.get_indexed_frags(&index_segments);
3636 if !index_segments.is_empty() && !index_frags.is_empty() {
3637 Some((index.name.clone(), index_segments, index_metric))
3638 } else {
3639 None
3640 }
3641 } else {
3642 None
3643 }
3644 } else {
3645 None
3646 }
3647 } else {
3648 None
3649 };
3650
3651 if let Some((index_name, index_segments, index_metric)) = index_and_segments {
3652 log::trace!("index found for vector search");
3653 q.metric_type = Some(index_metric);
3655 validate_distance_type_for(index_metric, &element_type)?;
3656
3657 if matches!(q.refine_factor, Some(0)) {
3658 return Err(Error::invalid_input(
3659 "Refine factor cannot be zero".to_string(),
3660 ));
3661 }
3662 let ann_node = match vector_type {
3663 DataType::FixedSizeList(_, _) => self.ann(&q, &index_segments, filter_plan).await?,
3664 DataType::List(_) => self.multivec_ann(&q, &index_segments, filter_plan).await?,
3665 _ => unreachable!(),
3666 };
3667
3668 let mut knn_node = if q.refine_factor.is_some() {
3669 let vector_projection = self
3670 .dataset
3671 .empty_projection()
3672 .union_column(&q.column, OnMissing::Error)
3673 .unwrap();
3674 let knn_node_with_vector = self.take(ann_node, vector_projection)?;
3675 self.flat_knn(knn_node_with_vector, &q)?
3676 } else {
3677 ann_node
3678 }; if !self.fast_search {
3681 knn_node = self
3682 .knn_combined(&q, &index_name, &index_segments, knn_node, filter_plan)
3683 .await?;
3684 }
3685
3686 Ok(knn_node)
3687 } else {
3688 if self.fast_search {
3689 return Ok(Arc::new(EmptyExec::new(KNN_INDEX_SCHEMA.clone())));
3690 }
3691 let metric = q
3693 .metric_type
3694 .unwrap_or_else(|| default_distance_type_for(&element_type));
3695 q.metric_type = Some(metric);
3696 validate_distance_type_for(metric, &element_type)?;
3697 let mut columns = vec![q.column.clone()];
3699 if let Some(refine_expr) = filter_plan.refine_expr.as_ref() {
3700 columns.extend(Planner::column_names_in_expr(refine_expr));
3701 }
3702 let mut vector_scan_projection = self
3703 .dataset
3704 .empty_projection()
3705 .with_row_id()
3706 .union_columns(&columns, OnMissing::Error)?;
3707
3708 vector_scan_projection.with_row_addr =
3709 self.projection_plan.physical_projection.with_row_addr;
3710
3711 let PlannedFilteredScan { mut plan, .. } = self
3712 .filtered_read(
3713 filter_plan,
3714 vector_scan_projection,
3715 true,
3716 self.fragments.clone().map(Arc::new),
3717 None,
3718 true,
3719 )
3720 .await?;
3721
3722 if let Some(refine_expr) = &filter_plan.refine_expr {
3723 plan = Arc::new(LanceFilterExec::try_new(refine_expr.clone(), plan)?);
3724 }
3725 Ok(self.flat_knn(plan, &q)?)
3726 }
3727 }
3728
3729 async fn knn_combined(
3731 &self,
3732 q: &Query,
3733 index_name: &str,
3734 indexed_segments: &[IndexMetadata],
3735 mut knn_node: Arc<dyn ExecutionPlan>,
3736 filter_plan: &ExprFilterPlan,
3737 ) -> Result<Arc<dyn ExecutionPlan>> {
3738 let fallback_fragments = if let Some(target_fragments) = &self.fragments {
3739 let indexed_fragments = self.get_indexed_frags(indexed_segments);
3740 target_fragments
3741 .iter()
3742 .filter(|fragment| !indexed_fragments.contains(fragment.id as u32))
3743 .cloned()
3744 .collect::<Vec<_>>()
3745 } else if self.index_segments.is_some() {
3746 Vec::new()
3747 } else {
3748 self.dataset.unindexed_fragments(index_name).await?
3749 };
3750
3751 if !fallback_fragments.is_empty() {
3752 let q = q.clone();
3753 debug_assert!(q.metric_type.is_some());
3754
3755 if knn_node.schema().column_with_name(&q.column).is_none() {
3758 let vector_projection = self
3759 .dataset
3760 .empty_projection()
3761 .union_column(&q.column, OnMissing::Error)
3762 .unwrap();
3763 knn_node = self.take(knn_node, vector_projection)?;
3764 }
3765
3766 let mut columns = vec![q.column.clone()];
3767 if let Some(expr) = filter_plan.full_expr.as_ref() {
3768 let filter_columns = Planner::column_names_in_expr(expr);
3769 columns.extend(filter_columns);
3770 }
3771 let vector_scan_projection = Arc::new(self.dataset.schema().project(&columns).unwrap());
3772 let mut scan_node = self.scan_fragments(
3776 true,
3777 false,
3778 false,
3779 false,
3780 false,
3781 vector_scan_projection,
3782 Arc::new(fallback_fragments),
3783 None,
3785 false,
3788 );
3789
3790 if let Some(expr) = filter_plan.full_expr.as_ref() {
3791 scan_node = Arc::new(LanceFilterExec::try_new(expr.clone(), scan_node)?);
3793 }
3794 let topk_appended = self.flat_knn(scan_node, &q)?;
3796
3797 let topk_appended = project(topk_appended, knn_node.schema().as_ref())?;
3801 assert!(
3802 topk_appended
3803 .schema()
3804 .equivalent_names_and_types(&knn_node.schema())
3805 );
3806 let unioned = UnionExec::try_new(vec![Arc::new(topk_appended), knn_node])?;
3808 let unioned = RepartitionExec::try_new(
3810 unioned,
3811 datafusion::physical_plan::Partitioning::RoundRobinBatch(1),
3812 )?;
3813 return self.flat_knn(Arc::new(unioned), &q);
3815 }
3816
3817 Ok(knn_node)
3818 }
3819
3820 #[async_recursion]
3821 async fn fragments_covered_by_index_query(
3822 &self,
3823 index_expr: &ScalarIndexExpr,
3824 ) -> Result<RoaringBitmap> {
3825 match index_expr {
3826 ScalarIndexExpr::And(lhs, rhs) => {
3827 Ok(self.fragments_covered_by_index_query(lhs).await?
3828 & self.fragments_covered_by_index_query(rhs).await?)
3829 }
3830 ScalarIndexExpr::Or(lhs, rhs) => Ok(self.fragments_covered_by_index_query(lhs).await?
3831 & self.fragments_covered_by_index_query(rhs).await?),
3832 ScalarIndexExpr::Not(expr) => self.fragments_covered_by_index_query(expr).await,
3833 ScalarIndexExpr::Query(search) => scalar_index_fragment_bitmap(
3834 self.dataset.as_ref(),
3835 &search.column,
3836 &search.index_name,
3837 )
3838 .await?
3839 .ok_or_else(|| {
3840 crate::Error::internal(format!(
3841 "Index not found even though it must have been found earlier: {}",
3842 search.index_name
3843 ))
3844 }),
3845 }
3846 }
3847
3848 async fn partition_frags_by_coverage(
3856 &self,
3857 index_expr: &ScalarIndexExpr,
3858 fragments: Arc<Vec<Fragment>>,
3859 ) -> Result<(Vec<Fragment>, Vec<Fragment>)> {
3860 let covered_frags = self.fragments_covered_by_index_query(index_expr).await?;
3861 let mut relevant_frags = Vec::with_capacity(fragments.len());
3862 let mut missing_frags = Vec::with_capacity(fragments.len());
3863 for fragment in fragments.iter() {
3864 if covered_frags.contains(fragment.id as u32) {
3865 relevant_frags.push(fragment.clone());
3866 } else {
3867 missing_frags.push(fragment.clone());
3868 }
3869 }
3870 Ok((relevant_frags, missing_frags))
3871 }
3872
3873 async fn scalar_indexed_scan(
3876 &self,
3877 projection: Projection,
3878 filter_plan: &ExprFilterPlan,
3879 fragments: Arc<Vec<Fragment>>,
3880 ) -> Result<Arc<dyn ExecutionPlan>> {
3881 log::trace!("scalar indexed scan");
3882 let index_expr = filter_plan.index_query.as_ref().unwrap();
3889
3890 let needs_recheck = index_expr.needs_recheck();
3891
3892 let (relevant_frags, missing_frags) = self
3894 .partition_frags_by_coverage(index_expr, fragments)
3895 .await?;
3896
3897 let mut plan: Arc<dyn ExecutionPlan> = Arc::new(MaterializeIndexExec::new(
3898 self.dataset.clone(),
3899 index_expr.clone(),
3900 Arc::new(relevant_frags),
3901 ));
3902
3903 let refine_expr = filter_plan.refine_expr.as_ref();
3904
3905 let needs_take =
3908 needs_recheck || projection.has_data_fields() || filter_plan.refine_expr.is_some();
3909 if needs_take {
3910 let mut take_projection = projection.clone();
3911 if needs_recheck {
3912 let filter_expr = index_expr.to_expr();
3914 let filter_cols = Planner::column_names_in_expr(&filter_expr);
3915 take_projection = take_projection.union_columns(filter_cols, OnMissing::Error)?;
3916 }
3917 if let Some(refine_expr) = refine_expr {
3918 let refine_cols = Planner::column_names_in_expr(refine_expr);
3919 take_projection = take_projection.union_columns(refine_cols, OnMissing::Error)?;
3920 }
3921 log::trace!("need to take additional columns for scalar_indexed_scan");
3922 plan = self.take(plan, take_projection)?;
3923 }
3924
3925 let post_take_filter = match (needs_recheck, refine_expr) {
3926 (false, None) => None,
3927 (true, None) => {
3928 Some(index_expr.to_expr())
3930 }
3931 (true, Some(_)) => Some(filter_plan.full_expr.as_ref().unwrap().clone()),
3932 (false, Some(refine_expr)) => Some(refine_expr.clone()),
3933 };
3934
3935 if let Some(post_take_filter) = post_take_filter {
3936 let planner = Planner::new(plan.schema());
3937 let optimized_filter = planner.optimize_expr(post_take_filter)?;
3938
3939 log::trace!("applying post-take filter to indexed scan");
3940 plan = Arc::new(LanceFilterExec::try_new(optimized_filter, plan)?);
3941 }
3942
3943 if self.projection_plan.physical_projection.with_row_addr {
3944 plan = Arc::new(AddRowAddrExec::try_new(plan, self.dataset.clone(), 0)?);
3945 }
3946
3947 let new_data_path: Option<Arc<dyn ExecutionPlan>> = if !missing_frags.is_empty() {
3948 log::trace!(
3949 "scalar_indexed_scan will need full scan of {} missing fragments",
3950 missing_frags.len()
3951 );
3952
3953 let filter = filter_plan.full_expr.as_ref().unwrap();
3966 let filter_cols = Planner::column_names_in_expr(filter);
3967 let scan_projection = projection.union_columns(filter_cols, OnMissing::Error)?;
3968
3969 let scan_schema = Arc::new(scan_projection.to_bare_schema());
3970 let scan_arrow_schema = Arc::new(scan_schema.as_ref().into());
3971 let planner = Planner::new(scan_arrow_schema);
3972 let optimized_filter = planner.optimize_expr(filter.clone())?;
3973
3974 let new_data_scan = self.scan_fragments(
3975 true,
3976 self.projection_plan.physical_projection.with_row_addr,
3977 self.projection_plan
3978 .physical_projection
3979 .with_row_last_updated_at_version,
3980 self.projection_plan
3981 .physical_projection
3982 .with_row_created_at_version,
3983 false,
3984 scan_schema,
3985 missing_frags.into(),
3986 None,
3988 false,
3989 );
3990 let filtered = Arc::new(LanceFilterExec::try_new(optimized_filter, new_data_scan)?);
3991 Some(Arc::new(project(filtered, plan.schema().as_ref())?))
3992 } else {
3993 log::trace!("scalar_indexed_scan will not need full scan of any missing fragments");
3994 None
3995 };
3996
3997 if let Some(new_data_path) = new_data_path {
3998 let unioned = UnionExec::try_new(vec![plan, new_data_path])?;
3999 let unioned = Arc::new(RepartitionExec::try_new(
4001 unioned,
4002 datafusion::physical_plan::Partitioning::RoundRobinBatch(1),
4003 )?);
4004 Ok(unioned)
4005 } else {
4006 Ok(plan)
4007 }
4008 }
4009
4010 fn get_io_buffer_size(&self) -> u64 {
4011 self.io_buffer_size.unwrap_or(*DEFAULT_IO_BUFFER_SIZE)
4012 }
4013
4014 #[allow(clippy::too_many_arguments)]
4019 pub(crate) fn scan(
4020 &self,
4021 with_row_id: bool,
4022 with_row_address: bool,
4023 with_row_last_updated_at_version: bool,
4024 with_row_created_at_version: bool,
4025 with_make_deletions_null: bool,
4026 range: Option<Range<u64>>,
4027 projection: Arc<Schema>,
4028 ) -> Arc<dyn ExecutionPlan> {
4029 let fragments = if let Some(fragment) = self.fragments.as_ref() {
4030 Arc::new(fragment.clone())
4031 } else {
4032 self.dataset.fragments().clone()
4033 };
4034 let ordered = if self.ordering.is_some() || self.nearest.is_some() {
4035 false
4037 } else {
4038 self.ordered
4039 };
4040 self.scan_fragments(
4041 with_row_id,
4042 with_row_address,
4043 with_row_last_updated_at_version,
4044 with_row_created_at_version,
4045 with_make_deletions_null,
4046 projection,
4047 fragments,
4048 range,
4049 ordered,
4050 )
4051 }
4052
4053 #[allow(clippy::too_many_arguments)]
4054 fn scan_fragments(
4055 &self,
4056 with_row_id: bool,
4057 with_row_address: bool,
4058 with_row_last_updated_at_version: bool,
4059 with_row_created_at_version: bool,
4060 with_make_deletions_null: bool,
4061 projection: Arc<Schema>,
4062 fragments: Arc<Vec<Fragment>>,
4063 range: Option<Range<u64>>,
4064 ordered: bool,
4065 ) -> Arc<dyn ExecutionPlan> {
4066 log::trace!("scan_fragments covered {} fragments", fragments.len());
4067 let config = LanceScanConfig {
4068 batch_size: self.get_batch_size(),
4069 batch_readahead: self.batch_readahead,
4070 fragment_readahead: self.fragment_readahead,
4071 io_buffer_size: self.get_io_buffer_size(),
4072 with_row_id,
4073 with_row_address,
4074 with_row_last_updated_at_version,
4075 with_row_created_at_version,
4076 with_make_deletions_null,
4077 ordered_output: ordered,
4078 file_reader_options: self.resolved_file_reader_options(),
4079 };
4080 Arc::new(LanceScanExec::new(
4081 self.dataset.clone(),
4082 fragments,
4083 range,
4084 projection,
4085 config,
4086 ))
4087 }
4088
4089 fn pushdown_scan(
4090 &self,
4091 make_deletions_null: bool,
4092 filter_plan: &ExprFilterPlan,
4093 ) -> Result<Arc<dyn ExecutionPlan>> {
4094 log::trace!("pushdown_scan");
4095
4096 let config = ScanConfig {
4097 batch_readahead: self.batch_readahead,
4098 fragment_readahead: self
4099 .fragment_readahead
4100 .unwrap_or(LEGACY_DEFAULT_FRAGMENT_READAHEAD),
4101 with_row_id: self.projection_plan.physical_projection.with_row_id,
4102 with_row_address: self.projection_plan.physical_projection.with_row_addr,
4103 make_deletions_null,
4104 ordered_output: self.ordered,
4105 file_reader_options: self.resolved_file_reader_options(),
4106 };
4107
4108 let fragments = if let Some(fragment) = self.fragments.as_ref() {
4109 Arc::new(fragment.clone())
4110 } else {
4111 self.dataset.fragments().clone()
4112 };
4113
4114 Ok(Arc::new(LancePushdownScanExec::try_new(
4115 self.dataset.clone(),
4116 fragments,
4117 Arc::new(self.projection_plan.physical_projection.to_bare_schema()),
4118 filter_plan.refine_expr.clone().unwrap(),
4119 config,
4120 )?))
4121 }
4122
4123 async fn flat_fts_filter(
4128 &self,
4129 input: Arc<dyn ExecutionPlan>,
4130 q: &FullTextSearchQuery,
4131 ) -> Result<Arc<dyn ExecutionPlan>> {
4132 let fts_query = if q.columns().is_empty() {
4133 let indexed_columns = fts_indexed_columns(self.dataset.clone()).await?;
4134 fill_fts_query_column(&q.query, &indexed_columns, false)?
4135 } else {
4136 q.query.clone()
4137 };
4138
4139 match &fts_query {
4140 FtsQuery::Match(match_query) => {
4141 let schema = Arc::new((input.schema()).try_with_column(SCORE_FIELD.clone())?);
4142
4143 let column = match_query
4144 .column
4145 .as_ref()
4146 .ok_or(Error::invalid_input(
4147 "the column must be specified in the query".to_string(),
4148 ))?
4149 .clone();
4150 let input = if schema.column_with_name(&column).is_none() {
4151 let projection = self
4152 .dataset
4153 .empty_projection()
4154 .union_column(&column, OnMissing::Error)?;
4155 self.take(input, projection)?
4156 } else {
4157 input
4158 };
4159
4160 Ok(Arc::new(FlatMatchFilterExec::new(
4161 input,
4162 self.dataset.clone(),
4163 match_query.clone(),
4164 q.params(),
4165 )))
4166 }
4167 _ => Err(Error::not_supported(
4168 "Only Match queries are supported currently when using FTS as a post-filter",
4169 )),
4170 }
4171 }
4172
4173 async fn fts_rerank(
4178 &self,
4179 input: Arc<dyn ExecutionPlan>,
4180 q: &FullTextSearchQuery,
4181 ) -> Result<Arc<dyn ExecutionPlan>> {
4182 let fts_query = if q.columns().is_empty() {
4183 let indexed_columns = fts_indexed_columns(self.dataset.clone()).await?;
4184 fill_fts_query_column(&q.query, &indexed_columns, false)?
4185 } else {
4186 q.query.clone()
4187 };
4188
4189 match &fts_query {
4190 FtsQuery::Match(match_query) => {
4191 let schema = Arc::new((input.schema()).try_with_column(SCORE_FIELD.clone())?);
4192
4193 let column = match_query
4194 .column
4195 .as_ref()
4196 .ok_or(Error::invalid_input(
4197 "the column must be specified in the query".to_string(),
4198 ))?
4199 .clone();
4200 let input = if schema.column_with_name(&column).is_none() {
4201 let projection = self
4202 .dataset
4203 .empty_projection()
4204 .union_column(&column, OnMissing::Error)?;
4205 self.take(input, projection)?
4206 } else {
4207 input
4208 };
4209
4210 Ok(Arc::new(FlatMatchQueryExec::new(
4211 self.dataset.clone(),
4212 match_query.clone(),
4213 q.params(),
4214 input,
4215 )))
4216 }
4217 _ => {
4218 let default_filter = ExprFilterPlan::default();
4219 let fts_plan = self.fts(&default_filter, q).await?;
4220
4221 let vector_row_id = Column::new_with_schema(ROW_ID, input.schema().as_ref())?;
4222 let fts_row_id = Column::new_with_schema(ROW_ID, fts_plan.schema().as_ref())?;
4223 let join = HashJoinExec::try_new(
4224 input,
4225 fts_plan,
4226 vec![(Arc::new(vector_row_id), Arc::new(fts_row_id))],
4227 None,
4228 &JoinType::Inner,
4229 None,
4230 PartitionMode::CollectLeft,
4231 NullEquality::NullEqualsNull,
4232 false,
4233 )?;
4234
4235 let schema = join.schema();
4236 let mut projection_exprs = Vec::new();
4237 let mut contain_rowid = false;
4238 for field in schema.fields() {
4239 if field.name() == ROW_ID {
4240 if contain_rowid {
4241 continue;
4242 }
4243 contain_rowid = true;
4244 }
4245 projection_exprs.push((
4246 Arc::new(Column::new_with_schema(field.name(), schema.as_ref())?)
4247 as Arc<dyn PhysicalExpr>,
4248 field.name().clone(),
4249 ));
4250 }
4251
4252 let projection_exec = ProjectionExec::try_new(projection_exprs, Arc::new(join))?;
4253 Ok(Arc::new(projection_exec))
4254 }
4255 }
4256 }
4257
4258 fn flat_knn(&self, input: Arc<dyn ExecutionPlan>, q: &Query) -> Result<Arc<dyn ExecutionPlan>> {
4260 let metric_type = match q.metric_type {
4262 Some(m) => m,
4263 None => {
4264 let (_, element_type) = get_vector_type(self.dataset.schema(), &q.column)?;
4265 default_distance_type_for(&element_type)
4266 }
4267 };
4268 let flat_dist = Arc::new(KNNVectorDistanceExec::try_new(
4269 input,
4270 &q.column,
4271 q.key.clone(),
4272 metric_type,
4273 )?);
4274
4275 let lower: Option<(Expr, Arc<dyn PhysicalExpr>)> = q
4276 .lower_bound
4277 .map(|v| -> Result<(Expr, Arc<dyn PhysicalExpr>)> {
4278 let logical = col(DIST_COL).gt_eq(lit(v));
4279 let schema = flat_dist.schema();
4280 let df_schema = DFSchema::try_from(schema)?;
4281 let physical = create_physical_expr(&logical, &df_schema, &ExecutionProps::new())?;
4282 Ok::<(Expr, Arc<dyn PhysicalExpr>), _>((logical, physical))
4283 })
4284 .transpose()?;
4285
4286 let upper = q
4287 .upper_bound
4288 .map(|v| -> Result<(Expr, Arc<dyn PhysicalExpr>)> {
4289 let logical = col(DIST_COL).lt(lit(v));
4290 let schema = flat_dist.schema();
4291 let df_schema = DFSchema::try_from(schema)?;
4292 let physical = create_physical_expr(&logical, &df_schema, &ExecutionProps::new())?;
4293 Ok::<(Expr, Arc<dyn PhysicalExpr>), _>((logical, physical))
4294 })
4295 .transpose()?;
4296
4297 let filter_expr = match (lower, upper) {
4298 (Some((llog, _)), Some((ulog, _))) => {
4299 let logical = llog.and(ulog);
4300 let schema = flat_dist.schema();
4301 let df_schema = DFSchema::try_from(schema)?;
4302 let physical = create_physical_expr(&logical, &df_schema, &ExecutionProps::new())?;
4303 Some((logical, physical))
4304 }
4305 (Some((llog, lphys)), None) => Some((llog, lphys)),
4306 (None, Some((ulog, uphys))) => Some((ulog, uphys)),
4307 (None, None) => None,
4308 };
4309
4310 let knn_plan: Arc<dyn ExecutionPlan> = if let Some(filter_expr) = filter_expr {
4311 Arc::new(LanceFilterExec::try_new(filter_expr.0, flat_dist)?)
4312 } else {
4313 flat_dist
4314 };
4315
4316 let sort = SortExec::new(
4318 [
4319 PhysicalSortExpr {
4320 expr: expressions::col(DIST_COL, knn_plan.schema().as_ref())?,
4321 options: SortOptions {
4322 descending: false,
4323 nulls_first: false,
4324 },
4325 },
4326 PhysicalSortExpr {
4327 expr: expressions::col(ROW_ID, knn_plan.schema().as_ref())?,
4328 options: SortOptions {
4329 descending: false,
4330 nulls_first: false,
4331 },
4332 },
4333 ]
4334 .into(),
4335 knn_plan,
4336 )
4337 .with_fetch(Some(q.k));
4338
4339 let logical_not_null = col(DIST_COL).is_not_null();
4340 let not_nulls = Arc::new(LanceFilterExec::try_new(logical_not_null, Arc::new(sort))?);
4341
4342 Ok(not_nulls)
4343 }
4344
4345 fn get_fragments_as_bitmap(&self) -> RoaringBitmap {
4346 if let Some(fragments) = &self.fragments {
4347 RoaringBitmap::from_iter(fragments.iter().map(|f| f.id as u32))
4348 } else {
4349 self.dataset.fragment_bitmap.as_ref().clone()
4350 }
4351 }
4352
4353 fn retain_relevant_index_segments(
4354 &self,
4355 index_segments: Vec<IndexMetadata>,
4356 ) -> Vec<IndexMetadata> {
4357 if let Some(fragments) = &self.fragments {
4358 let target_fragments = RoaringBitmap::from_iter(fragments.iter().map(|f| f.id as u32));
4359 index_segments
4360 .into_iter()
4361 .filter(|idx| {
4362 idx.fragment_bitmap
4363 .as_ref()
4364 .is_some_and(|fragmap| !(fragmap & &target_fragments).is_empty())
4365 })
4366 .collect()
4367 } else {
4368 index_segments
4369 }
4370 }
4371
4372 fn retain_target_fragments(&self, mut fragments: Vec<Fragment>) -> Vec<Fragment> {
4375 if let Some(target) = &self.fragments {
4376 let bitmap = RoaringBitmap::from_iter(target.iter().map(|f| f.id as u32));
4377 fragments.retain(|f| bitmap.contains(f.id as u32));
4378 }
4379 fragments
4380 }
4381
4382 fn get_indexed_frags(&self, index: &[IndexMetadata]) -> RoaringBitmap {
4383 let all_fragments = self.get_fragments_as_bitmap();
4384
4385 let mut all_indexed_frags = RoaringBitmap::new();
4386 for idx in index {
4387 if let Some(fragmap) = idx.fragment_bitmap.as_ref() {
4388 all_indexed_frags |= fragmap;
4389 } else {
4390 return all_fragments;
4393 }
4394 }
4395
4396 all_indexed_frags & all_fragments
4397 }
4398
4399 async fn ann(
4401 &self,
4402 q: &Query,
4403 index: &[IndexMetadata],
4404 filter_plan: &ExprFilterPlan,
4405 ) -> Result<Arc<dyn ExecutionPlan>> {
4406 let prefilter_source = self
4407 .prefilter_source(filter_plan, self.get_indexed_frags(index))
4408 .await?;
4409 let inner_fanout_search = new_knn_exec(self.dataset.clone(), index, q, prefilter_source)?;
4410 let sort_expr = PhysicalSortExpr {
4411 expr: expressions::col(DIST_COL, inner_fanout_search.schema().as_ref())?,
4412 options: SortOptions {
4413 descending: false,
4414 nulls_first: false,
4415 },
4416 };
4417 let sort_expr_row_id = PhysicalSortExpr {
4418 expr: expressions::col(ROW_ID, inner_fanout_search.schema().as_ref())?,
4419 options: SortOptions {
4420 descending: false,
4421 nulls_first: false,
4422 },
4423 };
4424 Ok(Arc::new(
4425 SortExec::new([sort_expr, sort_expr_row_id].into(), inner_fanout_search)
4426 .with_fetch(Some(q.k * q.refine_factor.unwrap_or(1) as usize)),
4427 ))
4428 }
4429
4430 async fn multivec_ann(
4432 &self,
4433 q: &Query,
4434 index: &[IndexMetadata],
4435 filter_plan: &ExprFilterPlan,
4436 ) -> Result<Arc<dyn ExecutionPlan>> {
4437 let over_fetch_factor = *DEFAULT_XTR_OVERFETCH;
4442
4443 let prefilter_source = self
4444 .prefilter_source(filter_plan, self.get_indexed_frags(index))
4445 .await?;
4446 let dim = get_vector_dim(self.dataset.schema(), &q.column)?;
4447
4448 let num_queries = q.key.len() / dim;
4449 let new_queries = (0..num_queries)
4450 .map(|i| q.key.slice(i * dim, dim))
4451 .map(|query_vec| {
4452 let mut new_query = q.clone();
4453 new_query.key = query_vec;
4454 new_query.refine_factor = Some(over_fetch_factor);
4458 new_query
4459 });
4460 let mut ann_nodes = Vec::with_capacity(new_queries.len());
4461 for query in new_queries {
4462 let ann_node = new_knn_exec(
4464 self.dataset.clone(),
4465 index,
4466 &query,
4467 prefilter_source.clone(),
4468 )?;
4469 let sort_expr = PhysicalSortExpr {
4470 expr: expressions::col(DIST_COL, ann_node.schema().as_ref())?,
4471 options: SortOptions {
4472 descending: false,
4473 nulls_first: false,
4474 },
4475 };
4476 let sort_expr_row_id = PhysicalSortExpr {
4477 expr: expressions::col(ROW_ID, ann_node.schema().as_ref())?,
4478 options: SortOptions {
4479 descending: false,
4480 nulls_first: false,
4481 },
4482 };
4483 let ann_node = Arc::new(
4484 SortExec::new([sort_expr, sort_expr_row_id].into(), ann_node)
4485 .with_fetch(Some(q.k * over_fetch_factor as usize)),
4486 );
4487 ann_nodes.push(ann_node as Arc<dyn ExecutionPlan>);
4488 }
4489
4490 let ann_node = Arc::new(MultivectorScoringExec::try_new(ann_nodes, q.clone())?);
4491
4492 let sort_expr = PhysicalSortExpr {
4493 expr: expressions::col(DIST_COL, ann_node.schema().as_ref())?,
4494 options: SortOptions {
4495 descending: false,
4496 nulls_first: false,
4497 },
4498 };
4499 let sort_expr_row_id = PhysicalSortExpr {
4500 expr: expressions::col(ROW_ID, ann_node.schema().as_ref())?,
4501 options: SortOptions {
4502 descending: false,
4503 nulls_first: false,
4504 },
4505 };
4506 let ann_node = Arc::new(
4507 SortExec::new([sort_expr, sort_expr_row_id].into(), ann_node)
4508 .with_fetch(Some(q.k * q.refine_factor.unwrap_or(1) as usize)),
4509 );
4510
4511 Ok(ann_node)
4512 }
4513
4514 async fn prefilter_source(
4519 &self,
4520 filter_plan: &ExprFilterPlan,
4521 required_frags: RoaringBitmap,
4522 ) -> Result<PreFilterSource> {
4523 if filter_plan.is_empty() && self.fragments.is_none() {
4524 log::trace!("no filter plan, no prefilter");
4525 return Ok(PreFilterSource::None);
4526 }
4527
4528 let fragments: Vec<Fragment> = self
4530 .dataset
4531 .manifest
4532 .fragments
4533 .iter()
4534 .filter(|f| required_frags.contains(f.id as u32))
4535 .cloned()
4536 .collect();
4537
4538 let fragments = Arc::new(self.retain_target_fragments(fragments));
4540
4541 if filter_plan.is_exact_index_search() && self.fragments.is_none() {
4547 let index_query = filter_plan.index_query.as_ref().expect_ok()?;
4548 let (_, missing_frags) = self
4549 .partition_frags_by_coverage(index_query, fragments.clone())
4550 .await?;
4551
4552 if missing_frags.is_empty() {
4553 log::trace!("prefilter entirely satisfied by exact index search");
4554 return Ok(PreFilterSource::ScalarIndexQuery(Arc::new(
4559 ScalarIndexExec::new(self.dataset.clone(), index_query.clone()),
4560 )));
4561 } else {
4562 log::trace!("exact index search did not cover all fragments");
4563 }
4564 }
4565
4566 log::trace!(
4568 "prefilter is a filtered read of {} fragments",
4569 fragments.len()
4570 );
4571 let PlannedFilteredScan { plan, .. } = self
4572 .filtered_read(
4573 filter_plan,
4574 self.dataset.empty_projection().with_row_id(),
4575 false,
4576 Some(fragments),
4577 None,
4578 true,
4579 )
4580 .await?;
4581 Ok(PreFilterSource::FilteredRowIds(plan))
4582 }
4583
4584 #[allow(deprecated)]
4586 fn take(
4587 &self,
4588 input: Arc<dyn ExecutionPlan>,
4589 output_projection: Projection,
4590 ) -> Result<Arc<dyn ExecutionPlan>> {
4591 let coalesced = Arc::new(CoalesceBatchesExec::new(
4592 input.clone(),
4593 self.get_batch_size(),
4594 ));
4595 if let Some(take_plan) =
4596 TakeExec::try_new(self.dataset.clone(), coalesced, output_projection)?
4597 {
4598 Ok(Arc::new(take_plan))
4599 } else {
4600 Ok(input)
4602 }
4603 }
4604
4605 fn limit_node(&self, plan: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
4607 Arc::new(GlobalLimitExec::new(
4608 plan,
4609 *self.offset.as_ref().unwrap_or(&0) as usize,
4610 self.limit.map(|l| l as usize),
4611 ))
4612 }
4613
4614 #[instrument(level = "info", skip(self))]
4615 pub async fn analyze_plan(&self) -> Result<String> {
4616 let plan = self.create_plan().await?;
4617 analyze_plan(
4618 plan,
4619 LanceExecutionOptions {
4620 batch_size: self.batch_size,
4621 ..Default::default()
4622 },
4623 )
4624 .await
4625 }
4626
4627 #[instrument(level = "info", skip(self))]
4628 pub async fn explain_plan(&self, verbose: bool) -> Result<String> {
4629 let plan = self.create_plan().await?;
4630 let display = DisplayableExecutionPlan::new(plan.as_ref());
4631
4632 Ok(format!("{}", display.indent(verbose)))
4633 }
4634}
4635
4636async fn fts_indexed_columns(dataset: Arc<Dataset>) -> Result<Vec<String>> {
4639 let mut indexed_columns = Vec::new();
4640 for field in dataset.schema().fields_pre_order() {
4641 let is_string_field = match field.data_type() {
4643 DataType::Utf8 | DataType::LargeUtf8 => true,
4644 DataType::List(inner_field) | DataType::LargeList(inner_field) => {
4645 matches!(
4646 inner_field.data_type(),
4647 DataType::Utf8 | DataType::LargeUtf8
4648 )
4649 }
4650 _ => false,
4651 };
4652
4653 if is_string_field {
4654 let column_path =
4656 if let Some(ancestors) = dataset.schema().field_ancestry_by_id(field.id) {
4657 let field_refs: Vec<&str> = ancestors.iter().map(|f| f.name.as_str()).collect();
4658 format_field_path(&field_refs)
4659 } else {
4660 continue; };
4662
4663 let has_fts_index = dataset
4665 .load_scalar_index(
4666 IndexCriteria::default()
4667 .for_column(&column_path)
4668 .supports_fts(),
4669 )
4670 .await?
4671 .is_some();
4672
4673 if has_fts_index {
4674 indexed_columns.push(column_path);
4675 }
4676 }
4677 }
4678 Ok(indexed_columns)
4679}
4680
/// A record batch stream that wraps a [`SendableRecordBatchStream`]
/// together with a tracing span.
///
/// The span is entered every time the stream is polled.
#[pin_project::pin_project]
pub struct DatasetRecordBatchStream {
    /// The wrapped stream that batches are pulled from.
    #[pin]
    exec_node: SendableRecordBatchStream,
    /// Tracing span entered for the duration of each poll.
    span: Span,
}
4690
4691impl DatasetRecordBatchStream {
4692 pub fn new(exec_node: SendableRecordBatchStream) -> Self {
4693 let schema = exec_node.schema();
4694 let adapter = SchemaAdapter::new(schema.clone());
4695 let exec_node = if SchemaAdapter::requires_logical_conversion(&schema) {
4696 adapter.to_logical_stream(exec_node)
4697 } else {
4698 exec_node
4699 };
4700
4701 let span = info_span!("DatasetRecordBatchStream");
4702 Self { exec_node, span }
4703 }
4704}
4705
impl RecordBatchStream for DatasetRecordBatchStream {
    /// Returns the schema of the wrapped stream.
    fn schema(&self) -> SchemaRef {
        self.exec_node.schema()
    }
}
4711
4712impl Stream for DatasetRecordBatchStream {
4713 type Item = Result<RecordBatch>;
4714
4715 fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
4716 let mut this = self.project();
4717 let _guard = this.span.enter();
4718 match this.exec_node.poll_next_unpin(cx) {
4719 Poll::Ready(result) => Poll::Ready(result.map(|r| Ok(r?))),
4720 Poll::Pending => Poll::Pending,
4721 }
4722 }
4723}
4724
impl From<DatasetRecordBatchStream> for SendableRecordBatchStream {
    /// Unwraps the inner stream, discarding the tracing span.
    fn from(stream: DatasetRecordBatchStream) -> Self {
        stream.exec_node
    }
}
4730
4731#[cfg(test)]
4732pub mod test_dataset {
4733
4734 use super::*;
4735
4736 use std::{collections::HashMap, vec};
4737
4738 use arrow_array::{
4739 ArrayRef, FixedSizeListArray, Int32Array, RecordBatch, RecordBatchIterator, StringArray,
4740 types::Float32Type,
4741 };
4742 use arrow_schema::{ArrowError, DataType};
4743 use lance_arrow::FixedSizeListArrayExt;
4744 use lance_core::utils::tempfile::TempStrDir;
4745 use lance_file::version::LanceFileVersion;
4746 use lance_index::{
4747 IndexType,
4748 scalar::{ScalarIndexParams, inverted::tokenizer::InvertedIndexParams},
4749 vector::{
4750 ivf::IvfBuildParams,
4751 kmeans::{KMeansParams, train_kmeans},
4752 },
4753 };
4754 use lance_linalg::distance::DistanceType;
4755 use uuid::Uuid;
4756
4757 use crate::dataset::WriteParams;
4758 use crate::index::vector::VectorIndexParams;
4759
/// Test fixture: a dataset with an Int32 column `i`, a string column `s`
/// and a fixed-size-list Float32 vector column `vec`.
pub struct TestVectorDataset {
    /// Temporary directory the dataset was written to.
    pub tmp_dir: TempStrDir,
    /// Arrow schema the dataset was written with.
    pub schema: Arc<ArrowSchema>,
    /// The dataset itself.
    pub dataset: Dataset,
    /// Number of elements in each `vec` value.
    dimension: u32,
}
4775
4776 impl TestVectorDataset {
/// Creates the test dataset with a vector dimension of 32.
///
/// See `new_with_dimension` for the meaning of the other arguments.
pub async fn new(
    data_storage_version: LanceFileVersion,
    stable_row_ids: bool,
) -> Result<Self> {
    Self::new_with_dimension(data_storage_version, stable_row_ids, 32).await
}
4783
4784 pub async fn new_with_dimension(
4785 data_storage_version: LanceFileVersion,
4786 stable_row_ids: bool,
4787 dimension: u32,
4788 ) -> Result<Self> {
4789 let path = TempStrDir::default();
4790
4791 let metadata: HashMap<String, String> =
4793 vec![("dataset".to_string(), "vector".to_string())]
4794 .into_iter()
4795 .collect();
4796
4797 let schema = Arc::new(ArrowSchema::new_with_metadata(
4798 vec![
4799 ArrowField::new("i", DataType::Int32, true),
4800 ArrowField::new("s", DataType::Utf8, true),
4801 ArrowField::new(
4802 "vec",
4803 DataType::FixedSizeList(
4804 Arc::new(ArrowField::new("item", DataType::Float32, true)),
4805 dimension as i32,
4806 ),
4807 true,
4808 ),
4809 ],
4810 metadata,
4811 ));
4812
4813 let batches: Vec<RecordBatch> = (0..5)
4814 .map(|i| {
4815 let vector_values: Float32Array =
4816 (0..dimension * 80).map(|v| v as f32).collect();
4817 let vectors =
4818 FixedSizeListArray::try_new_from_values(vector_values, dimension as i32)
4819 .unwrap();
4820 RecordBatch::try_new(
4821 schema.clone(),
4822 vec![
4823 Arc::new(Int32Array::from_iter_values(i * 80..(i + 1) * 80)),
4824 Arc::new(StringArray::from_iter_values(
4825 (i * 80..(i + 1) * 80).map(|v| format!("s-{}", v)),
4826 )),
4827 Arc::new(vectors),
4828 ],
4829 )
4830 })
4831 .collect::<std::result::Result<Vec<_>, ArrowError>>()?;
4832
4833 let params = WriteParams {
4834 max_rows_per_group: 10,
4835 max_rows_per_file: 200,
4836 data_storage_version: Some(data_storage_version),
4837 enable_stable_row_ids: stable_row_ids,
4838 ..Default::default()
4839 };
4840 let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
4841
4842 let dataset = Dataset::write(reader, &path, Some(params)).await?;
4843
4844 Ok(Self {
4845 tmp_dir: path,
4846 schema,
4847 dataset,
4848 dimension,
4849 })
4850 }
4851
4852 pub async fn make_vector_index(&mut self) -> Result<()> {
4853 let params = VectorIndexParams::ivf_pq(2, 8, 2, MetricType::L2, 2);
4854 self.dataset
4855 .create_index(
4856 &["vec"],
4857 IndexType::Vector,
4858 Some("idx".to_string()),
4859 ¶ms,
4860 true,
4861 )
4862 .await?;
4863 Ok(())
4864 }
4865
4866 pub async fn make_segmented_vector_index(&mut self) -> Result<Vec<Uuid>> {
4867 let batch = self
4868 .dataset
4869 .scan()
4870 .project(&["vec"])
4871 .unwrap()
4872 .try_into_batch()
4873 .await?;
4874 let vectors = batch
4875 .column_by_name("vec")
4876 .expect("vector column should exist")
4877 .as_fixed_size_list();
4878 let values = vectors.values().as_primitive::<Float32Type>();
4879 let centroids = train_kmeans::<Float32Type>(
4880 values,
4881 KMeansParams::new(None, 10, 1, DistanceType::L2),
4882 self.dimension as usize,
4883 2,
4884 2,
4885 )
4886 .unwrap()
4887 .centroids
4888 .as_primitive::<Float32Type>()
4889 .clone();
4890 let centroids = Arc::new(
4891 FixedSizeListArray::try_new_from_values(centroids, self.dimension as i32).unwrap(),
4892 );
4893 let params = VectorIndexParams::with_ivf_flat_params(
4894 DistanceType::L2,
4895 IvfBuildParams::try_with_centroids(2, centroids).unwrap(),
4896 );
4897 let fragment_ids = self
4898 .dataset
4899 .get_fragments()
4900 .iter()
4901 .map(|fragment| fragment.id() as u32)
4902 .collect::<Vec<_>>();
4903
4904 let mut segments = Vec::with_capacity(fragment_ids.len());
4905 for fragment_id in fragment_ids {
4906 let mut builder =
4907 self.dataset
4908 .create_index_builder(&["vec"], IndexType::Vector, ¶ms);
4909 builder = builder.name("idx".to_string()).fragments(vec![fragment_id]);
4910 segments.push(builder.execute_uncommitted().await?);
4911 }
4912
4913 let segment_ids = segments
4914 .iter()
4915 .map(|segment| segment.uuid)
4916 .collect::<Vec<_>>();
4917 let segments = self
4918 .dataset
4919 .create_index_segment_builder()
4920 .with_index_type(params.index_type())
4921 .with_segments(segments)
4922 .build_all()
4923 .await?;
4924 self.dataset
4925 .commit_existing_index_segments("idx", "vec", segments)
4926 .await?;
4927 Ok(segment_ids)
4928 }
4929
4930 pub async fn make_scalar_index(&mut self) -> Result<()> {
4931 self.dataset
4932 .create_index(
4933 &["i"],
4934 IndexType::Scalar,
4935 None,
4936 &ScalarIndexParams::default(),
4937 true,
4938 )
4939 .await?;
4940 Ok(())
4941 }
4942
4943 pub async fn make_fts_index(&mut self) -> Result<()> {
4944 let params = InvertedIndexParams::default().with_position(true);
4945 self.dataset
4946 .create_index(&["s"], IndexType::Inverted, None, ¶ms, true)
4947 .await?;
4948 Ok(())
4949 }
4950
/// Appends ten rows with `i` in `400..410` to the dataset.
pub async fn append_new_data(&mut self) -> Result<()> {
    self.append_data_with_range(400, 410).await
}
4954
4955 pub async fn append_data_with_range(&mut self, start: i32, end: i32) -> Result<()> {
4956 let count = (end - start) as usize;
4957 let vector_values: Float32Array = (0..count)
4958 .flat_map(|i| vec![i as f32; self.dimension as usize].into_iter())
4959 .collect();
4960 let new_vectors =
4961 FixedSizeListArray::try_new_from_values(vector_values, self.dimension as i32)
4962 .unwrap();
4963 let new_data: Vec<ArrayRef> = vec![
4964 Arc::new(Int32Array::from_iter_values(start..end)),
4965 Arc::new(StringArray::from_iter_values(
4966 (start..end).map(|v| format!("s-{}", v)),
4967 )),
4968 Arc::new(new_vectors),
4969 ];
4970 let reader = RecordBatchIterator::new(
4971 vec![RecordBatch::try_new(self.schema.clone(), new_data).unwrap()]
4972 .into_iter()
4973 .map(Ok),
4974 self.schema.clone(),
4975 );
4976 self.dataset.append(reader, None).await?;
4977 Ok(())
4978 }
4979 }
4980}
4981
4982#[cfg(test)]
4983mod test {
4984
4985 use std::collections::BTreeSet;
4986 use std::time::{Duration, Instant};
4987 use std::vec;
4988
4989 use arrow::array::as_primitive_array;
4990 use arrow::datatypes::{Float64Type, Int32Type, Int64Type};
4991 use arrow_array::cast::AsArray;
4992 use arrow_array::types::{Float32Type, UInt64Type};
4993 use arrow_array::{
4994 ArrayRef, FixedSizeListArray, Float16Array, Int32Array, LargeStringArray, PrimitiveArray,
4995 RecordBatchIterator, StringArray, StructArray, UInt8Array,
4996 };
4997
4998 use arrow_ord::sort::sort_to_indices;
4999 use arrow_schema::Fields;
5000 use arrow_select::take;
5001 use datafusion::logical_expr::{col, lit};
5002 use half::f16;
5003 use lance_arrow::{FixedSizeListArrayExt, SchemaExt};
5004 use lance_core::utils::tempfile::TempStrDir;
5005 use lance_core::{ROW_CREATED_AT_VERSION, ROW_LAST_UPDATED_AT_VERSION};
5006 use lance_datagen::{
5007 ArrayGeneratorExt, BatchCount, ByteCount, Dimension, RowCount, array, gen_batch,
5008 };
5009 use lance_file::version::LanceFileVersion;
5010 use lance_index::optimize::OptimizeOptions;
5011 use lance_index::scalar::inverted::query::{MatchQuery, PhraseQuery};
5012 use lance_index::vector::hnsw::builder::HnswBuildParams;
5013 use lance_index::vector::ivf::IvfBuildParams;
5014 use lance_index::vector::pq::PQBuildParams;
5015 use lance_index::vector::sq::builder::SQBuildParams;
5016 use lance_index::{IndexType, scalar::ScalarIndexParams};
5017 use lance_io::assert_io_gt;
5018 use lance_io::object_store::ObjectStoreParams;
5019
5020 use lance_linalg::distance::DistanceType;
5021 use lance_testing::datagen::{BatchGenerator, IncrementingInt32, RandomVector};
5022 use object_store::throttle::ThrottleConfig;
5023 use rstest::rstest;
5024
5025 use super::*;
5026 use crate::dataset::WriteMode;
5027 use crate::dataset::WriteParams;
5028 use crate::dataset::optimize::{CompactionOptions, compact_files};
5029 use crate::dataset::scanner::test_dataset::TestVectorDataset;
5030 use crate::index::vector::{StageParams, VectorIndexParams};
5031 use crate::utils::test::{
5032 DatagenExt, FragmentCount, FragmentRowCount, ThrottledStoreWrapper, assert_plan_node_equals,
5033 };
5034
/// Checks `get_default_batch_size` against three states of the
/// `LANCE_DEFAULT_BATCH_SIZE` environment variable: unparsable, valid, and
/// unset.
#[test]
fn test_env_var_parsing() {
    // An unparsable value yields `None`.
    // SAFETY: NOTE(review): mutating the process environment is only sound
    // while no other thread reads or writes it concurrently — confirm this
    // holds for the test harness.
    unsafe {
        std::env::set_var("LANCE_DEFAULT_BATCH_SIZE", "not_a_number");
    }
    let result = get_default_batch_size();
    assert_eq!(result, None, "Should return None for invalid batch size");

    // A valid number is parsed.
    // SAFETY: see the note on the first `set_var` above.
    unsafe {
        std::env::set_var("LANCE_DEFAULT_BATCH_SIZE", "2048");
    }
    let result = get_default_batch_size();
    assert_eq!(result, Some(2048), "Should parse valid batch size");

    // Removing the variable yields `None` again. Note that the variable is
    // left unset afterwards rather than restored to any prior value.
    // SAFETY: see the note on the first `set_var` above.
    unsafe {
        std::env::remove_var("LANCE_DEFAULT_BATCH_SIZE");
    }
    let result = get_default_batch_size();
    assert_eq!(result, None, "Should return None when env var is not set");
}
5060
/// Exercises `parse_env_var` for `usize`, `u32` and `u64` targets, using
/// environment variable names dedicated to this test.
#[test]
fn test_parse_env_var() {
    let test_var = "LANCE_TEST_PARSE_ENV_VAR_USIZE";

    // usize: a valid value is parsed.
    // SAFETY: NOTE(review): mutating the process environment is only sound
    // while no other thread reads or writes it concurrently — confirm this
    // holds for the test harness. The same applies to every `unsafe` block
    // in this test.
    unsafe {
        std::env::set_var(test_var, "12345");
    }
    let result: Option<usize> = parse_env_var(test_var, "Using default.");
    assert_eq!(result, Some(12345));

    // usize: an unparsable value yields `None`.
    unsafe {
        std::env::set_var(test_var, "not_a_number");
    }
    let result: Option<usize> = parse_env_var(test_var, "Using default.");
    assert_eq!(result, None);

    // usize: an unset variable yields `None`.
    unsafe {
        std::env::remove_var(test_var);
    }
    let result: Option<usize> = parse_env_var(test_var, "Using default.");
    assert_eq!(result, None);

    let test_var_u32 = "LANCE_TEST_PARSE_ENV_VAR_U32";
    // u32: a valid value is parsed.
    unsafe {
        std::env::set_var(test_var_u32, "42");
    }
    let result: Option<u32> = parse_env_var(test_var_u32, "Using default value.");
    assert_eq!(result, Some(42));

    // u32: an unparsable value yields `None`.
    unsafe {
        std::env::set_var(test_var_u32, "invalid");
    }
    let result: Option<u32> = parse_env_var(test_var_u32, "Using default value.");
    assert_eq!(result, None);

    // Clean up the u32 variable.
    unsafe {
        std::env::remove_var(test_var_u32);
    }

    let test_var_u64 = "LANCE_TEST_PARSE_ENV_VAR_U64";
    // u64: a value that does not fit in 32 bits is parsed.
    unsafe {
        std::env::set_var(test_var_u64, "9999999999");
    }
    let result: Option<u64> = parse_env_var(test_var_u64, "Using default value.");
    assert_eq!(result, Some(9999999999));

    // u64: a negative value yields `None`.
    unsafe {
        std::env::set_var(test_var_u64, "-1");
    }
    let result: Option<u64> = parse_env_var(test_var_u64, "Using default value.");
    assert_eq!(result, None);

    // Clean up the u64 variable.
    unsafe {
        std::env::remove_var(test_var_u64);
    }
}
5125
5126 async fn make_binary_vector_dataset() -> Result<(TempStrDir, Dataset)> {
5127 let tmp_dir = TempStrDir::default();
5128 let dim = 4;
5129 let schema = Arc::new(ArrowSchema::new(vec![
5130 ArrowField::new("id", DataType::Int32, false),
5131 ArrowField::new(
5132 "bin",
5133 DataType::FixedSizeList(
5134 Arc::new(ArrowField::new("item", DataType::UInt8, true)),
5135 dim,
5136 ),
5137 false,
5138 ),
5139 ]));
5140
5141 let vectors = FixedSizeListArray::try_new_from_values(
5142 UInt8Array::from(vec![
5143 0b0000_1111u8,
5144 0,
5145 0,
5146 0, 0b0000_0011u8,
5148 0,
5149 0,
5150 0, 0u8,
5152 0,
5153 0,
5154 0,
5155 ]),
5156 dim,
5157 )?;
5158 let ids = Int32Array::from(vec![0, 1, 2]);
5159
5160 let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids), Arc::new(vectors)])?;
5161 let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
5162 Dataset::write(reader, &tmp_dir, None).await?;
5163 let dataset = Dataset::open(&tmp_dir).await?;
5164 Ok((tmp_dir, dataset))
5165 }
5166
5167 #[tokio::test]
5168 async fn test_batch_size() {
5169 let schema = Arc::new(ArrowSchema::new(vec![
5170 ArrowField::new("i", DataType::Int32, true),
5171 ArrowField::new("s", DataType::Utf8, true),
5172 ]));
5173
5174 let batches: Vec<RecordBatch> = (0..5)
5175 .map(|i| {
5176 RecordBatch::try_new(
5177 schema.clone(),
5178 vec![
5179 Arc::new(Int32Array::from_iter_values(i * 20..(i + 1) * 20)),
5180 Arc::new(StringArray::from_iter_values(
5181 (i * 20..(i + 1) * 20).map(|v| format!("s-{}", v)),
5182 )),
5183 ],
5184 )
5185 .unwrap()
5186 })
5187 .collect();
5188
5189 for use_filter in [false, true] {
5190 let test_dir = TempStrDir::default();
5191 let test_uri = &test_dir;
5192 let write_params = WriteParams {
5193 max_rows_per_file: 40,
5194 max_rows_per_group: 10,
5195 ..Default::default()
5196 };
5197 let batches =
5198 RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema.clone());
5199 Dataset::write(batches, test_uri, Some(write_params))
5200 .await
5201 .unwrap();
5202
5203 let dataset = Dataset::open(test_uri).await.unwrap();
5204 let mut builder = dataset.scan();
5205 builder.batch_size(8);
5206 if use_filter {
5207 builder.filter("i IS NOT NULL").unwrap();
5208 }
5209 let mut stream = builder.try_into_stream().await.unwrap();
5210 let mut rows_read = 0;
5211 while let Some(next) = stream.next().await {
5212 let next = next.unwrap();
5213 let expected = 8.min(100 - rows_read);
5214 assert_eq!(next.num_rows(), expected);
5215 rows_read += next.num_rows();
5216 }
5217 }
5218 }
5219
5220 #[tokio::test]
5221 async fn test_strict_batch_size() {
5222 let dataset = lance_datagen::gen_batch()
5223 .col("x", array::step::<Int32Type>())
5224 .anon_col(array::step::<Int64Type>())
5225 .into_ram_dataset(FragmentCount::from(7), FragmentRowCount::from(6))
5226 .await
5227 .unwrap();
5228
5229 let mut scan = dataset.scan();
5230 scan.batch_size(10)
5231 .strict_batch_size(true)
5232 .filter("x % 2 == 0")
5233 .unwrap();
5234
5235 let batches = scan
5236 .try_into_stream()
5237 .await
5238 .unwrap()
5239 .try_collect::<Vec<_>>()
5240 .await
5241 .unwrap();
5242
5243 let batch_sizes = batches.iter().map(|b| b.num_rows()).collect::<Vec<_>>();
5244 assert_eq!(batch_sizes, vec![10, 10, 1]);
5245 }
5246
5247 #[tokio::test]
5248 async fn test_column_not_exist() {
5249 let dataset = lance_datagen::gen_batch()
5250 .col("x", array::step::<Int32Type>())
5251 .into_ram_dataset(FragmentCount::from(7), FragmentRowCount::from(6))
5252 .await
5253 .unwrap();
5254
5255 let check_err_msg = |r: Result<DatasetRecordBatchStream>| {
5256 let Err(err) = r else {
5257 panic!(
5258 "Expected an error to be raised saying column y is not found but got no error"
5259 )
5260 };
5261
5262 assert!(
5263 err.to_string().contains("No field named y"),
5264 "Expected error to contain 'No field named y' but got {}",
5265 err
5266 );
5267 };
5268
5269 let mut scan = dataset.scan();
5270 scan.project(&["x", "y"]).unwrap();
5271 check_err_msg(scan.try_into_stream().await);
5272
5273 let mut scan = dataset.scan();
5274 scan.project(&["y"]).unwrap();
5275 check_err_msg(scan.try_into_stream().await);
5276
5277 let mut scan = dataset.scan();
5280 scan.project_with_transform(&[("foo", "1")]).unwrap();
5281 match scan.try_into_stream().await {
5282 Ok(_) => panic!("Expected an error to be raised saying not supported"),
5283 Err(e) => {
5284 assert!(
5285 e.to_string().contains("Received only dynamic expressions"),
5286 "Expected error to contain 'Received only dynamic expressions' but got {}",
5287 e
5288 );
5289 }
5290 }
5291 }
5292
5293 #[cfg(not(windows))]
5294 #[tokio::test]
5295 async fn test_local_object_store() {
5296 let schema = Arc::new(ArrowSchema::new(vec![
5297 ArrowField::new("i", DataType::Int32, true),
5298 ArrowField::new("s", DataType::Utf8, true),
5299 ]));
5300
5301 let batches: Vec<RecordBatch> = (0..5)
5302 .map(|i| {
5303 RecordBatch::try_new(
5304 schema.clone(),
5305 vec![
5306 Arc::new(Int32Array::from_iter_values(i * 20..(i + 1) * 20)),
5307 Arc::new(StringArray::from_iter_values(
5308 (i * 20..(i + 1) * 20).map(|v| format!("s-{}", v)),
5309 )),
5310 ],
5311 )
5312 .unwrap()
5313 })
5314 .collect();
5315
5316 let test_dir = TempStrDir::default();
5317 let test_uri = &test_dir;
5318 let write_params = WriteParams {
5319 max_rows_per_file: 40,
5320 max_rows_per_group: 10,
5321 ..Default::default()
5322 };
5323 let batches = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema.clone());
5324 Dataset::write(batches, test_uri, Some(write_params))
5325 .await
5326 .unwrap();
5327
5328 let dataset = Dataset::open(&format!("file-object-store://{}", test_uri))
5329 .await
5330 .unwrap();
5331 let mut builder = dataset.scan();
5332 builder.batch_size(8);
5333 let mut stream = builder.try_into_stream().await.unwrap();
5334 let mut rows_read = 0;
5335 while let Some(next) = stream.next().await {
5336 let next = next.unwrap();
5337 let expected = 8.min(100 - rows_read);
5338 assert_eq!(next.num_rows(), expected);
5339 rows_read += next.num_rows();
5340 }
5341 }
5342
5343 #[tokio::test]
5344 async fn test_filter_parsing() -> Result<()> {
5345 let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false).await?;
5346 let dataset = &test_ds.dataset;
5347
5348 let mut scan = dataset.scan();
5349 assert!(scan.filter.is_none());
5350
5351 scan.filter("i > 50")?;
5352 assert_eq!(scan.get_expr_filter().unwrap(), Some(col("i").gt(lit(50))));
5353
5354 for use_stats in [false, true] {
5355 let batches = scan
5356 .project(&["s"])?
5357 .use_stats(use_stats)
5358 .try_into_stream()
5359 .await?
5360 .try_collect::<Vec<_>>()
5361 .await?;
5362 let batch = concat_batches(&batches[0].schema(), &batches)?;
5363
5364 let expected_batch = RecordBatch::try_new(
5365 Arc::new(test_ds.schema.project(&[1])?),
5367 vec![Arc::new(StringArray::from_iter_values(
5368 (51..400).map(|v| format!("s-{}", v)),
5369 ))],
5370 )?;
5371 assert_eq!(batch, expected_batch);
5372 }
5373 Ok(())
5374 }
5375
5376 #[tokio::test]
5377 async fn test_scan_regexp_match_and_non_empty_captions() {
5378 let schema = Arc::new(ArrowSchema::new(vec![
5381 ArrowField::new("keywords", DataType::Utf8, true),
5382 ArrowField::new("natural_caption", DataType::Utf8, true),
5383 ArrowField::new("poetic_caption", DataType::Utf8, true),
5384 ]));
5385
5386 let batch = RecordBatch::try_new(
5387 schema.clone(),
5388 vec![
5389 Arc::new(StringArray::from(vec![
5390 Some("Liberty for all"),
5391 Some("peace"),
5392 Some("revolution now"),
5393 Some("Liberty"),
5394 Some("revolutionary"),
5395 Some("none"),
5396 ])) as ArrayRef,
5397 Arc::new(StringArray::from(vec![
5398 Some("a"),
5399 Some("b"),
5400 None,
5401 Some(""),
5402 Some("c"),
5403 Some("d"),
5404 ])) as ArrayRef,
5405 Arc::new(StringArray::from(vec![
5406 Some("x"),
5407 Some(""),
5408 Some("y"),
5409 Some("z"),
5410 None,
5411 Some("w"),
5412 ])) as ArrayRef,
5413 ],
5414 )
5415 .unwrap();
5416
5417 let reader = RecordBatchIterator::new(vec![Ok(batch.clone())], schema.clone());
5418 let dataset = Dataset::write(reader, "memory://", None).await.unwrap();
5419
5420 let mut scan = dataset.scan();
5421 scan.filter(
5422 "regexp_match(keywords, 'Liberty|revolution') AND \
5423 (natural_caption IS NOT NULL AND natural_caption <> '' AND \
5424 poetic_caption IS NOT NULL AND poetic_caption <> '')",
5425 )
5426 .unwrap();
5427
5428 let out = scan.try_into_batch().await.unwrap();
5429 assert_eq!(out.num_rows(), 1);
5430
5431 let out_keywords = out
5432 .column_by_name("keywords")
5433 .unwrap()
5434 .as_string::<i32>()
5435 .value(0);
5436 let out_nat = out
5437 .column_by_name("natural_caption")
5438 .unwrap()
5439 .as_string::<i32>()
5440 .value(0);
5441 let out_poetic = out
5442 .column_by_name("poetic_caption")
5443 .unwrap()
5444 .as_string::<i32>()
5445 .value(0);
5446
5447 assert_eq!(out_keywords, "Liberty for all");
5448 assert_eq!(out_nat, "a");
5449 assert_eq!(out_poetic, "x");
5450 }
5451
5452 #[tokio::test]
5453 async fn test_nested_projection() {
5454 let point_fields: Fields = vec![
5455 ArrowField::new("x", DataType::Float32, true),
5456 ArrowField::new("y", DataType::Float32, true),
5457 ]
5458 .into();
5459 let metadata_fields: Fields = vec![
5460 ArrowField::new("location", DataType::Struct(point_fields), true),
5461 ArrowField::new("age", DataType::Int32, true),
5462 ]
5463 .into();
5464 let metadata_field = ArrowField::new("metadata", DataType::Struct(metadata_fields), true);
5465 let schema = Arc::new(ArrowSchema::new(vec![
5466 metadata_field,
5467 ArrowField::new("idx", DataType::Int32, true),
5468 ]));
5469 let data = lance_datagen::rand(&schema)
5470 .into_ram_dataset(FragmentCount::from(7), FragmentRowCount::from(6))
5471 .await
5472 .unwrap();
5473
5474 let mut scan = data.scan();
5475 scan.project(&["metadata.location.x", "metadata.age"])
5476 .unwrap();
5477 let batch = scan.try_into_batch().await.unwrap();
5478
5479 assert_eq!(
5480 batch.schema().as_ref(),
5481 &ArrowSchema::new(vec![
5482 ArrowField::new("metadata.location.x", DataType::Float32, true),
5483 ArrowField::new("metadata.age", DataType::Int32, true),
5484 ])
5485 );
5486
5487 let take_schema = data.schema().project_by_ids(&[0, 2, 4], false);
5491
5492 let taken = data.take_rows(&[0, 5], take_schema).await.unwrap();
5493
5494 let part_point_fields = Fields::from(vec![ArrowField::new("x", DataType::Float32, true)]);
5496 let part_metadata_fields = Fields::from(vec![
5497 ArrowField::new("location", DataType::Struct(part_point_fields), true),
5498 ArrowField::new("age", DataType::Int32, true),
5499 ]);
5500 let part_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
5501 "metadata",
5502 DataType::Struct(part_metadata_fields),
5503 true,
5504 )]));
5505
5506 assert_eq!(taken.schema(), part_schema);
5507 }
5508
5509 #[rstest]
5510 #[tokio::test]
5511 async fn test_limit(
5512 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5513 data_storage_version: LanceFileVersion,
5514 ) -> Result<()> {
5515 let test_ds = TestVectorDataset::new(data_storage_version, false).await?;
5516 let dataset = &test_ds.dataset;
5517
5518 let full_data = dataset.scan().try_into_batch().await?.slice(19, 2);
5519
5520 let actual = dataset
5521 .scan()
5522 .limit(Some(2), Some(19))?
5523 .try_into_batch()
5524 .await?;
5525
5526 assert_eq!(actual.num_rows(), 2);
5527 assert_eq!(actual, full_data);
5528 Ok(())
5529 }
5530
5531 #[test_log::test(tokio::test)]
5532 async fn test_limit_cancel() {
5533 let throttled = Arc::new(ThrottledStoreWrapper {
5542 config: ThrottleConfig {
5543 wait_get_per_call: Duration::from_secs(1),
5544 ..Default::default()
5545 },
5546 });
5547 let write_params = WriteParams {
5548 store_params: Some(ObjectStoreParams {
5549 object_store_wrapper: Some(throttled.clone()),
5550 ..Default::default()
5551 }),
5552 max_rows_per_file: 1,
5553 ..Default::default()
5554 };
5555
5556 let dataset = gen_batch()
5558 .col("i", array::step::<Int32Type>().with_random_nulls(0.1))
5559 .into_ram_dataset_with_params(
5560 FragmentCount::from(2000),
5561 FragmentRowCount::from(1),
5562 Some(write_params),
5563 )
5564 .await
5565 .unwrap();
5566
5567 let mut scan = dataset.scan();
5568 scan.filter("i IS NOT NULL").unwrap();
5569 scan.limit(Some(10), None).unwrap();
5570
5571 let start = Instant::now();
5572 scan.try_into_stream()
5573 .await
5574 .unwrap()
5575 .try_collect::<Vec<_>>()
5576 .await
5577 .unwrap();
5578 let duration = start.elapsed();
5579
5580 assert!(duration < Duration::from_secs(10));
5584 }
5585
5586 #[rstest]
5587 #[tokio::test]
5588 async fn test_knn_nodes(
5589 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5590 data_storage_version: LanceFileVersion,
5591 #[values(false, true)] stable_row_ids: bool,
5592 #[values(false, true)] build_index: bool,
5593 ) {
5594 let mut test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5595 .await
5596 .unwrap();
5597 if build_index {
5598 test_ds.make_vector_index().await.unwrap();
5599 }
5600 let dataset = &test_ds.dataset;
5601
5602 let mut scan = dataset.scan();
5603 let key: Float32Array = (32..64).map(|v| v as f32).collect();
5604 scan.nearest("vec", &key, 5).unwrap();
5605 scan.refine(5);
5606
5607 let batch = scan.try_into_batch().await.unwrap();
5608
5609 assert_eq!(batch.num_rows(), 5);
5610 assert_eq!(
5611 batch.schema().as_ref(),
5612 &ArrowSchema::new(vec![
5613 ArrowField::new("i", DataType::Int32, true),
5614 ArrowField::new("s", DataType::Utf8, true),
5615 ArrowField::new(
5616 "vec",
5617 DataType::FixedSizeList(
5618 Arc::new(ArrowField::new("item", DataType::Float32, true)),
5619 32,
5620 ),
5621 true,
5622 ),
5623 ArrowField::new(DIST_COL, DataType::Float32, true),
5624 ])
5625 .with_metadata([("dataset".into(), "vector".into())].into())
5626 );
5627
5628 let expected_i = BTreeSet::from_iter(vec![1, 81, 161, 241, 321]);
5629 let column_i = batch.column_by_name("i").unwrap();
5630 let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
5631 .values()
5632 .iter()
5633 .copied()
5634 .collect();
5635 assert_eq!(expected_i, actual_i);
5636 }
5637
5638 #[rstest]
5639 #[tokio::test]
5640 async fn test_can_project_distance() {
5641 let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true)
5642 .await
5643 .unwrap();
5644 let dataset = &test_ds.dataset;
5645
5646 let mut scan = dataset.scan();
5647 let key: Float32Array = (32..64).map(|v| v as f32).collect();
5648 scan.nearest("vec", &key, 5).unwrap();
5649 scan.refine(5);
5650 scan.project(&["_distance"]).unwrap();
5651
5652 let batch = scan.try_into_batch().await.unwrap();
5653
5654 assert_eq!(batch.num_rows(), 5);
5655 assert_eq!(batch.num_columns(), 1);
5656 }
5657
5658 #[rstest]
5659 #[tokio::test]
5660 async fn test_knn_with_new_data(
5661 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5662 data_storage_version: LanceFileVersion,
5663 #[values(false, true)] stable_row_ids: bool,
5664 ) {
5665 let mut test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5666 .await
5667 .unwrap();
5668 test_ds.make_vector_index().await.unwrap();
5669 test_ds.append_new_data().await.unwrap();
5670 let dataset = &test_ds.dataset;
5671
5672 let key: Float32Array = [0f32; 32].into_iter().collect();
5674 let k = 20;
5677
5678 #[derive(Debug)]
5679 struct TestCase {
5680 filter: Option<&'static str>,
5681 limit: Option<i64>,
5682 use_index: bool,
5683 }
5684
5685 let mut cases = vec![];
5686 for filter in [Some("i > 100"), None] {
5687 for limit in [None, Some(10)] {
5688 for use_index in [true, false] {
5689 cases.push(TestCase {
5690 filter,
5691 limit,
5692 use_index,
5693 });
5694 }
5695 }
5696 }
5697
5698 for case in cases {
5700 let mut scanner = dataset.scan();
5701 scanner
5702 .nearest("vec", &key, k)
5703 .unwrap()
5704 .limit(case.limit, None)
5705 .unwrap()
5706 .refine(3)
5707 .use_index(case.use_index);
5708 if let Some(filter) = case.filter {
5709 scanner.filter(filter).unwrap();
5710 }
5711
5712 let result = scanner
5713 .try_into_stream()
5714 .await
5715 .unwrap()
5716 .try_collect::<Vec<_>>()
5717 .await
5718 .unwrap();
5719 assert!(!result.is_empty());
5720 let result = concat_batches(&result[0].schema(), result.iter()).unwrap();
5721
5722 if case.filter.is_some() {
5723 let result_rows = result.num_rows();
5724 let expected_rows = case.limit.unwrap_or(k as i64) as usize;
5725 assert!(
5726 result_rows <= expected_rows,
5727 "Expected less than {} rows, got {}",
5728 expected_rows,
5729 result_rows
5730 );
5731 } else {
5732 assert_eq!(result.num_rows(), case.limit.unwrap_or(k as i64) as usize);
5734 }
5735
5736 assert_eq!(
5738 as_primitive_array::<Int32Type>(result.column(0).as_ref()).value(0),
5739 400
5740 );
5741 }
5742 }
5743
5744 #[rstest]
5745 #[tokio::test]
5746 async fn test_knn_with_prefilter(
5747 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5748 data_storage_version: LanceFileVersion,
5749 #[values(false, true)] stable_row_ids: bool,
5750 ) {
5751 let mut test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5752 .await
5753 .unwrap();
5754 test_ds.make_vector_index().await.unwrap();
5755 let dataset = &test_ds.dataset;
5756
5757 let mut scan = dataset.scan();
5758 let key: Float32Array = (32..64).map(|v| v as f32).collect();
5759 scan.filter("i > 100").unwrap();
5760 scan.prefilter(true);
5761 scan.project(&["i", "vec"]).unwrap();
5762 scan.nearest("vec", &key, 5).unwrap();
5763 scan.use_index(false);
5764
5765 let results = scan
5766 .try_into_stream()
5767 .await
5768 .unwrap()
5769 .try_collect::<Vec<_>>()
5770 .await
5771 .unwrap();
5772
5773 assert_eq!(results.len(), 1);
5774 let batch = &results[0];
5775
5776 assert_eq!(batch.num_rows(), 5);
5777 assert_eq!(
5778 batch.schema().as_ref(),
5779 &ArrowSchema::new(vec![
5780 ArrowField::new("i", DataType::Int32, true),
5781 ArrowField::new(
5782 "vec",
5783 DataType::FixedSizeList(
5784 Arc::new(ArrowField::new("item", DataType::Float32, true)),
5785 32,
5786 ),
5787 true,
5788 ),
5789 ArrowField::new(DIST_COL, DataType::Float32, true),
5790 ])
5791 .with_metadata([("dataset".into(), "vector".into())].into())
5792 );
5793
5794 let exact_i = BTreeSet::from_iter(vec![161, 241, 321]);
5796 let close_i = BTreeSet::from_iter(vec![161, 241, 321, 160, 162, 240, 242, 320, 322]);
5798 let column_i = batch.column_by_name("i").unwrap();
5799 let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
5800 .values()
5801 .iter()
5802 .copied()
5803 .collect();
5804 assert!(exact_i.is_subset(&actual_i));
5805 assert!(actual_i.is_subset(&close_i));
5806 }
5807
5808 #[rstest]
5809 #[tokio::test]
5810 async fn test_knn_filter_new_data(
5811 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5812 data_storage_version: LanceFileVersion,
5813 #[values(false, true)] stable_row_ids: bool,
5814 ) {
5815 let mut test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5819 .await
5820 .unwrap();
5821 test_ds.make_vector_index().await.unwrap();
5822 test_ds.append_new_data().await.unwrap();
5823 let dataset = &test_ds.dataset;
5824
5825 let key: Float32Array = [0f32; 32].into_iter().collect();
5827
5828 let mut query = dataset.scan();
5829 query.nearest("vec", &key, 20).unwrap();
5830
5831 let results = query
5833 .try_into_stream()
5834 .await
5835 .unwrap()
5836 .try_collect::<Vec<_>>()
5837 .await
5838 .unwrap();
5839
5840 let results_i = results[0]["i"]
5841 .as_primitive::<Int32Type>()
5842 .values()
5843 .iter()
5844 .copied()
5845 .collect::<BTreeSet<_>>();
5846
5847 assert!(results_i.contains(&400));
5848
5849 for prefilter in [false, true] {
5851 let mut query = dataset.scan();
5852 query
5853 .filter("i != 400")
5854 .unwrap()
5855 .prefilter(prefilter)
5856 .nearest("vec", &key, 20)
5857 .unwrap();
5858
5859 let results = query
5860 .try_into_stream()
5861 .await
5862 .unwrap()
5863 .try_collect::<Vec<_>>()
5864 .await
5865 .unwrap();
5866
5867 let results_i = results[0]["i"]
5868 .as_primitive::<Int32Type>()
5869 .values()
5870 .iter()
5871 .copied()
5872 .collect::<BTreeSet<_>>();
5873
5874 assert!(!results_i.contains(&400));
5875 }
5876 }
5877
5878 #[rstest]
5879 #[tokio::test]
5880 async fn test_knn_with_filter(
5881 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5882 data_storage_version: LanceFileVersion,
5883 #[values(false, true)] stable_row_ids: bool,
5884 ) {
5885 let test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5886 .await
5887 .unwrap();
5888 let dataset = &test_ds.dataset;
5889
5890 let mut scan = dataset.scan();
5891 let key: Float32Array = (32..64).map(|v| v as f32).collect();
5892 scan.nearest("vec", &key, 5).unwrap();
5893 scan.filter("i > 100").unwrap();
5894 scan.project(&["i", "vec"]).unwrap();
5895 scan.refine(5);
5896
5897 let results = scan
5898 .try_into_stream()
5899 .await
5900 .unwrap()
5901 .try_collect::<Vec<_>>()
5902 .await
5903 .unwrap();
5904
5905 assert_eq!(results.len(), 1);
5906 let batch = &results[0];
5907
5908 assert_eq!(batch.num_rows(), 3);
5909 assert_eq!(
5910 batch.schema().as_ref(),
5911 &ArrowSchema::new(vec![
5912 ArrowField::new("i", DataType::Int32, true),
5913 ArrowField::new(
5914 "vec",
5915 DataType::FixedSizeList(
5916 Arc::new(ArrowField::new("item", DataType::Float32, true)),
5917 32,
5918 ),
5919 true,
5920 ),
5921 ArrowField::new(DIST_COL, DataType::Float32, true),
5922 ])
5923 .with_metadata([("dataset".into(), "vector".into())].into())
5924 );
5925
5926 let expected_i = BTreeSet::from_iter(vec![161, 241, 321]);
5927 let column_i = batch.column_by_name("i").unwrap();
5928 let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
5929 .values()
5930 .iter()
5931 .copied()
5932 .collect();
5933 assert_eq!(expected_i, actual_i);
5934 }
5935
5936 #[rstest]
5937 #[tokio::test]
5938 async fn test_refine_factor(
5939 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
5940 data_storage_version: LanceFileVersion,
5941 #[values(false, true)] stable_row_ids: bool,
5942 ) {
5943 let test_ds = TestVectorDataset::new(data_storage_version, stable_row_ids)
5944 .await
5945 .unwrap();
5946 let dataset = &test_ds.dataset;
5947
5948 let mut scan = dataset.scan();
5949 let key: Float32Array = (32..64).map(|v| v as f32).collect();
5950 scan.nearest("vec", &key, 5).unwrap();
5951 scan.refine(5);
5952
5953 let results = scan
5954 .try_into_stream()
5955 .await
5956 .unwrap()
5957 .try_collect::<Vec<_>>()
5958 .await
5959 .unwrap();
5960
5961 assert_eq!(results.len(), 1);
5962 let batch = &results[0];
5963
5964 assert_eq!(batch.num_rows(), 5);
5965 assert_eq!(
5966 batch.schema().as_ref(),
5967 &ArrowSchema::new(vec![
5968 ArrowField::new("i", DataType::Int32, true),
5969 ArrowField::new("s", DataType::Utf8, true),
5970 ArrowField::new(
5971 "vec",
5972 DataType::FixedSizeList(
5973 Arc::new(ArrowField::new("item", DataType::Float32, true)),
5974 32,
5975 ),
5976 true,
5977 ),
5978 ArrowField::new(DIST_COL, DataType::Float32, true),
5979 ])
5980 .with_metadata([("dataset".into(), "vector".into())].into())
5981 );
5982
5983 let expected_i = BTreeSet::from_iter(vec![1, 81, 161, 241, 321]);
5984 let column_i = batch.column_by_name("i").unwrap();
5985 let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
5986 .values()
5987 .iter()
5988 .copied()
5989 .collect();
5990 assert_eq!(expected_i, actual_i);
5991 }
5992
5993 #[tokio::test]
5994 async fn test_binary_vectors_default_to_hamming() {
5995 let (_tmp_dir, dataset) = make_binary_vector_dataset().await.unwrap();
5996 let query = UInt8Array::from(vec![0b0000_1111u8, 0, 0, 0]);
5997
5998 let mut scan = dataset.scan();
5999 scan.nearest("bin", &query, 3).unwrap();
6000
6001 assert_eq!(scan.nearest.as_ref().unwrap().metric_type, None);
6003
6004 let batch = scan.try_into_batch().await.unwrap();
6005 let ids = batch
6006 .column_by_name("id")
6007 .unwrap()
6008 .as_primitive::<Int32Type>()
6009 .values();
6010 assert_eq!(ids, &[0, 1, 2]);
6011 let distances = batch
6012 .column_by_name(DIST_COL)
6013 .unwrap()
6014 .as_primitive::<Float32Type>()
6015 .values();
6016 assert_eq!(distances, &[0.0, 2.0, 4.0]);
6017 }
6018
6019 #[tokio::test]
6020 async fn test_binary_vectors_invalid_distance_error() {
6021 let (_tmp_dir, dataset) = make_binary_vector_dataset().await.unwrap();
6022 let query = UInt8Array::from(vec![0b0000_1111u8, 0, 0, 0]);
6023
6024 let mut scan = dataset.scan();
6025 scan.nearest("bin", &query, 1).unwrap();
6026 scan.distance_metric(DistanceType::L2);
6027
6028 let err = scan.try_into_batch().await.unwrap_err();
6029 assert!(matches!(err, Error::InvalidInput { .. }));
6030 let message = err.to_string();
6031 assert!(
6032 message.contains("l2") && message.contains("UInt8"),
6033 "unexpected message: {message}"
6034 );
6035 }
6036
6037 #[tokio::test]
6041 async fn test_knn_metric_mismatch_falls_back_to_flat_search() {
6042 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true)
6043 .await
6044 .unwrap();
6045 test_ds.make_vector_index().await.unwrap();
6047
6048 let dataset = &test_ds.dataset;
6049 let key: Float32Array = (32..64).map(|v| v as f32).collect();
6050
6051 let mut scan = dataset.scan();
6053 scan.nearest("vec", &key, 5).unwrap();
6054 scan.distance_metric(DistanceType::Dot);
6055
6056 let plan = scan.explain_plan(false).await.unwrap();
6058 assert!(
6059 !plan.contains("ANNSubIndex"),
6060 "Expected flat search, but got ANN index in plan:\n{}",
6061 plan
6062 );
6063 assert!(
6065 plan.contains("KNNVectorDistance") && plan.to_lowercase().contains("dot"),
6066 "Expected flat KNN with Dot metric in plan:\n{}",
6067 plan
6068 );
6069
6070 let dot_batch = dataset
6072 .scan()
6073 .nearest("vec", &key, 5)
6074 .unwrap()
6075 .distance_metric(DistanceType::Dot)
6076 .try_into_batch()
6077 .await
6078 .unwrap();
6079
6080 let l2_batch = dataset
6081 .scan()
6082 .nearest("vec", &key, 5)
6083 .unwrap()
6084 .distance_metric(DistanceType::L2)
6085 .try_into_batch()
6086 .await
6087 .unwrap();
6088
6089 let dot_distances: Vec<f32> = dot_batch
6090 .column_by_name(DIST_COL)
6091 .unwrap()
6092 .as_primitive::<Float32Type>()
6093 .values()
6094 .to_vec();
6095 let l2_distances: Vec<f32> = l2_batch
6096 .column_by_name(DIST_COL)
6097 .unwrap()
6098 .as_primitive::<Float32Type>()
6099 .values()
6100 .to_vec();
6101
6102 assert_ne!(dot_distances, l2_distances);
6104 }
6105
6106 #[tokio::test]
6109 async fn test_knn_no_metric_uses_index_metric() {
6110 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true)
6111 .await
6112 .unwrap();
6113 test_ds.make_vector_index().await.unwrap();
6115
6116 let dataset = &test_ds.dataset;
6117 let key: Float32Array = (32..64).map(|v| v as f32).collect();
6118
6119 let mut scan = dataset.scan();
6121 scan.nearest("vec", &key, 5).unwrap();
6122 let plan = scan.explain_plan(false).await.unwrap();
6126 assert!(
6127 plan.contains("ANNSubIndex") && plan.to_lowercase().contains("l2"),
6128 "Expected ANN index with L2 metric in plan:\n{}",
6129 plan
6130 );
6131 }
6132
6133 #[rstest]
6134 #[tokio::test]
6135 async fn test_only_row_id(
6136 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6137 data_storage_version: LanceFileVersion,
6138 ) {
6139 let test_ds = TestVectorDataset::new(data_storage_version, false)
6140 .await
6141 .unwrap();
6142 let dataset = &test_ds.dataset;
6143
6144 let mut scan = dataset.scan();
6145 scan.project::<&str>(&[]).unwrap().with_row_id();
6146
6147 let batch = scan.try_into_batch().await.unwrap();
6148
6149 assert_eq!(batch.num_columns(), 1);
6150 assert_eq!(batch.num_rows(), 400);
6151 let expected_schema =
6152 ArrowSchema::new(vec![ArrowField::new(ROW_ID, DataType::UInt64, true)])
6153 .with_metadata(dataset.schema().metadata.clone());
6154 assert_eq!(batch.schema().as_ref(), &expected_schema,);
6155
6156 let expected_row_ids: Vec<u64> = (0..200_u64).chain((1 << 32)..((1 << 32) + 200)).collect();
6157 let actual_row_ids: Vec<u64> = as_primitive_array::<UInt64Type>(batch.column(0).as_ref())
6158 .values()
6159 .iter()
6160 .copied()
6161 .collect();
6162 assert_eq!(expected_row_ids, actual_row_ids);
6163 }
6164
    // Checks that row ids stay attached to the right rows when the scan is allowed to
    // return batches out of order: sorting an unordered result by row id must reproduce
    // the `i` column of the ordered result.
    #[tokio::test]
    async fn test_scan_unordered_with_row_id() {
        let test_ds = TestVectorDataset::new(LanceFileVersion::Legacy, false)
            .await
            .unwrap();
        let dataset = &test_ds.dataset;

        let mut scan = dataset.scan();
        scan.with_row_id();

        // Reference result from the default (ordered) scan.
        let ordered_batches = scan
            .try_into_stream()
            .await
            .unwrap()
            .try_collect::<Vec<RecordBatch>>()
            .await
            .unwrap();
        // More than two batches are needed for reordering to be observable at all.
        assert!(ordered_batches.len() > 2);
        let ordered_batch =
            concat_batches(&ordered_batches[0].schema(), ordered_batches.iter()).unwrap();

        scan.scan_in_order(false);
        // An unordered scan may still come back in order, so retry up to 10 times until a
        // reordered result is seen. If none is seen the test passes without the check below.
        for _ in 0..10 {
            let unordered_batches = scan
                .try_into_stream()
                .await
                .unwrap()
                .try_collect::<Vec<RecordBatch>>()
                .await
                .unwrap();
            let unordered_batch =
                concat_batches(&unordered_batches[0].schema(), unordered_batches.iter()).unwrap();

            assert_eq!(ordered_batch.num_rows(), unordered_batch.num_rows());

            if ordered_batch != unordered_batch {
                // Sort the unordered rows by row id and compare the `i` column.
                let sort_indices = sort_to_indices(&unordered_batch[ROW_ID], None, None).unwrap();

                let ordered_i = ordered_batch["i"].clone();
                let sorted_i = take::take(&unordered_batch["i"], &sort_indices, None).unwrap();

                assert_eq!(&ordered_i, &sorted_i);

                // One reordered result is enough.
                break;
            }
        }
    }
6215
    // Exercises the `*` wildcard in projections, alone and mixed with explicit columns,
    // the `_rowid` / `_rowoffset` columns and computed expressions.
    #[tokio::test]
    async fn test_scan_with_wildcard() {
        let data = gen_batch()
            .col("x", array::step::<Float64Type>())
            .col("y", array::step::<Float64Type>())
            .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(100))
            .await
            .unwrap();

        // Projects `projection` and asserts the output stream's field names, in order.
        let check_cols = async |projection: &[&str], expected_cols: &[&str]| {
            let mut scan = data.scan();
            scan.project(projection).unwrap();
            let stream = scan.try_into_stream().await.unwrap();
            let schema = stream.schema();
            let field_names = schema.field_names();
            assert_eq!(field_names, expected_cols);
        };

        check_cols(&["*"], &["x", "y"]).await;
        check_cols(&["x", "y"], &["x", "y"]).await;
        check_cols(&["x"], &["x"]).await;
        // The wildcard expands in place, so its position relative to `_rowid` is kept.
        check_cols(&["_rowid", "*"], &["_rowid", "x", "y"]).await;
        check_cols(&["*", "_rowid"], &["x", "y", "_rowid"]).await;
        check_cols(
            &["_rowid", "*", "_rowoffset"],
            &["_rowid", "x", "y", "_rowoffset"],
        )
        .await;

        // Same check through `project_with_transform`, using each expression text as
        // both the output name and the expression.
        let check_exprs = async |exprs: &[&str], expected_cols: &[&str]| {
            let mut scan = data.scan();
            let projection = exprs
                .iter()
                .map(|e| (e.to_string(), e.to_string()))
                .collect::<Vec<_>>();
            scan.project_with_transform(&projection).unwrap();
            let stream = scan.try_into_stream().await.unwrap();
            let schema = stream.schema();
            let field_names = schema.field_names();
            assert_eq!(field_names, expected_cols);
        };

        check_exprs(&["_rowid", "*", "x * 2"], &["_rowid", "x", "y", "x * 2"]).await;

        // Asserts that `project` itself rejects the projection.
        let check_fails = |projection: &[&str]| {
            let mut scan = data.scan();
            assert!(scan.project(projection).is_err());
        };

        // Presumably rejected because a column would appear twice in the output.
        check_fails(&["x", "*"]);
        check_fails(&["_rowid", "_rowid"]);
    }
6270
    // Checks that `with_fragments` controls the order in which fragments are scanned:
    // the output batches follow the order of the fragment list passed in.
    #[rstest]
    #[tokio::test]
    async fn test_scan_order(
        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
        data_storage_version: LanceFileVersion,
    ) {
        let test_dir = TempStrDir::default();
        let test_uri = &test_dir;

        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
            "i",
            DataType::Int32,
            true,
        )]));

        let batch1 = RecordBatch::try_new(
            schema.clone(),
            vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))],
        )
        .unwrap();

        let batch2 = RecordBatch::try_new(
            schema.clone(),
            vec![Arc::new(Int32Array::from(vec![6, 7, 8]))],
        )
        .unwrap();

        let params = WriteParams {
            mode: WriteMode::Append,
            data_storage_version: Some(data_storage_version),
            ..Default::default()
        };

        // Writes one batch to the dataset at `test_uri` using the append-mode params.
        let write_batch = |batch: RecordBatch| async {
            let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
            Dataset::write(reader, test_uri, Some(params)).await
        };

        // Two separate writes; the assertions below rely on each one ending up in its
        // own fragment (ids 0 and 1).
        write_batch.clone()(batch1.clone()).await.unwrap();
        write_batch(batch2.clone()).await.unwrap();

        let dataset = Arc::new(Dataset::open(test_uri).await.unwrap());
        let fragment1 = dataset.get_fragment(0).unwrap().metadata().clone();
        let fragment2 = dataset.get_fragment(1).unwrap().metadata().clone();

        // Fragments in their natural order.
        let mut scanner = dataset.scan();
        scanner.with_fragments(vec![fragment1.clone(), fragment2.clone()]);
        let output = scanner
            .try_into_stream()
            .await
            .unwrap()
            .try_collect::<Vec<_>>()
            .await
            .unwrap();
        assert_eq!(output.len(), 2);
        assert_eq!(output[0], batch1);
        assert_eq!(output[1], batch2);

        // Fragments in reverse order: the output must be reversed as well.
        let mut scanner = dataset.scan();
        scanner.with_fragments(vec![fragment2, fragment1]);
        let output = scanner
            .try_into_stream()
            .await
            .unwrap()
            .try_collect::<Vec<_>>()
            .await
            .unwrap();
        assert_eq!(output.len(), 2);
        assert_eq!(output[0], batch2);
        assert_eq!(output[1], batch1);
    }
6344
6345 #[rstest]
6346 #[tokio::test]
6347 async fn test_scan_sort(
6348 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6349 data_storage_version: LanceFileVersion,
6350 ) {
6351 let test_dir = TempStrDir::default();
6352 let test_uri = &test_dir;
6353
6354 let data = gen_batch()
6355 .col("int", array::cycle::<Int32Type>(vec![5, 4, 1, 2, 3]))
6356 .col(
6357 "str",
6358 array::cycle_utf8_literals(&["a", "b", "c", "e", "d"]),
6359 );
6360
6361 let sorted_by_int = gen_batch()
6362 .col("int", array::cycle::<Int32Type>(vec![1, 2, 3, 4, 5]))
6363 .col(
6364 "str",
6365 array::cycle_utf8_literals(&["c", "e", "d", "b", "a"]),
6366 )
6367 .into_batch_rows(RowCount::from(5))
6368 .unwrap();
6369
6370 let sorted_by_str = gen_batch()
6371 .col("int", array::cycle::<Int32Type>(vec![5, 4, 1, 3, 2]))
6372 .col(
6373 "str",
6374 array::cycle_utf8_literals(&["a", "b", "c", "d", "e"]),
6375 )
6376 .into_batch_rows(RowCount::from(5))
6377 .unwrap();
6378
6379 Dataset::write(
6380 data.into_reader_rows(RowCount::from(5), BatchCount::from(1)),
6381 test_uri,
6382 Some(WriteParams {
6383 data_storage_version: Some(data_storage_version),
6384 ..Default::default()
6385 }),
6386 )
6387 .await
6388 .unwrap();
6389
6390 let dataset = Arc::new(Dataset::open(test_uri).await.unwrap());
6391
6392 let batches_by_int = dataset
6393 .scan()
6394 .order_by(Some(vec![ColumnOrdering::asc_nulls_first(
6395 "int".to_string(),
6396 )]))
6397 .unwrap()
6398 .try_into_stream()
6399 .await
6400 .unwrap()
6401 .try_collect::<Vec<_>>()
6402 .await
6403 .unwrap();
6404
6405 assert_eq!(batches_by_int[0], sorted_by_int);
6406
6407 let batches_by_str = dataset
6408 .scan()
6409 .order_by(Some(vec![ColumnOrdering::asc_nulls_first(
6410 "str".to_string(),
6411 )]))
6412 .unwrap()
6413 .try_into_stream()
6414 .await
6415 .unwrap()
6416 .try_collect::<Vec<_>>()
6417 .await
6418 .unwrap();
6419
6420 assert_eq!(batches_by_str[0], sorted_by_str);
6421
6422 dataset
6424 .scan()
6425 .order_by(Some(vec![]))
6426 .unwrap()
6427 .try_into_stream()
6428 .await
6429 .unwrap()
6430 .try_collect::<Vec<_>>()
6431 .await
6432 .unwrap();
6433 }
6434
6435 #[rstest]
6436 #[tokio::test]
6437 async fn test_sort_multi_columns(
6438 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6439 data_storage_version: LanceFileVersion,
6440 ) {
6441 let test_dir = TempStrDir::default();
6442 let test_uri = &test_dir;
6443
6444 let data = gen_batch()
6445 .col("int", array::cycle::<Int32Type>(vec![5, 5, 1, 1, 3]))
6446 .col(
6447 "float",
6448 array::cycle::<Float32Type>(vec![7.3, -f32::NAN, f32::NAN, 4.3, f32::INFINITY]),
6449 );
6450
6451 let sorted_by_int_then_float = gen_batch()
6452 .col("int", array::cycle::<Int32Type>(vec![1, 1, 3, 5, 5]))
6453 .col(
6454 "float",
6455 array::cycle::<Float32Type>(vec![4.3, f32::NAN, f32::INFINITY, -f32::NAN, 7.3]),
6457 )
6458 .into_batch_rows(RowCount::from(5))
6459 .unwrap();
6460
6461 Dataset::write(
6462 data.into_reader_rows(RowCount::from(5), BatchCount::from(1)),
6463 test_uri,
6464 Some(WriteParams {
6465 data_storage_version: Some(data_storage_version),
6466 ..Default::default()
6467 }),
6468 )
6469 .await
6470 .unwrap();
6471
6472 let dataset = Arc::new(Dataset::open(test_uri).await.unwrap());
6473
6474 let batches_by_int_then_float = dataset
6475 .scan()
6476 .order_by(Some(vec![
6477 ColumnOrdering::asc_nulls_first("int".to_string()),
6478 ColumnOrdering::asc_nulls_first("float".to_string()),
6479 ]))
6480 .unwrap()
6481 .try_into_stream()
6482 .await
6483 .unwrap()
6484 .try_collect::<Vec<_>>()
6485 .await
6486 .unwrap();
6487
6488 assert_eq!(batches_by_int_then_float[0], sorted_by_int_then_float);
6489 }
6490
6491 #[rstest]
6492 #[tokio::test]
6493 async fn test_ann_prefilter(
6494 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6495 data_storage_version: LanceFileVersion,
6496 #[values(false, true)] stable_row_ids: bool,
6497 #[values(
6498 VectorIndexParams::ivf_pq(2, 8, 2, MetricType::L2, 2),
6499 VectorIndexParams::with_ivf_hnsw_sq_params(
6500 MetricType::L2,
6501 IvfBuildParams::new(2),
6502 HnswBuildParams::default(),
6503 SQBuildParams::default()
6504 )
6505 )]
6506 index_params: VectorIndexParams,
6507 ) {
6508 use lance_arrow::{FixedSizeListArrayExt, fixed_size_list_type};
6509
6510 let test_dir = TempStrDir::default();
6511 let test_uri = &test_dir;
6512
6513 let schema = Arc::new(ArrowSchema::new(vec![
6514 ArrowField::new("filterable", DataType::Int32, true),
6515 ArrowField::new("vector", fixed_size_list_type(2, DataType::Float32), true),
6516 ]));
6517
6518 let vector_values = Float32Array::from_iter_values((0..600).map(|x| x as f32));
6519
6520 let batches = vec![
6521 RecordBatch::try_new(
6522 schema.clone(),
6523 vec![
6524 Arc::new(Int32Array::from_iter_values(0..300)),
6525 Arc::new(FixedSizeListArray::try_new_from_values(vector_values, 2).unwrap()),
6526 ],
6527 )
6528 .unwrap(),
6529 ];
6530
6531 let write_params = WriteParams {
6532 data_storage_version: Some(data_storage_version),
6533 max_rows_per_file: 300, enable_stable_row_ids: stable_row_ids,
6535 ..Default::default()
6536 };
6537 let batches = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
6538 let mut dataset = Dataset::write(batches, test_uri, Some(write_params))
6539 .await
6540 .unwrap();
6541
6542 dataset
6543 .create_index(&["vector"], IndexType::Vector, None, &index_params, false)
6544 .await
6545 .unwrap();
6546
6547 let query_key = Arc::new(Float32Array::from_iter_values((0..2).map(|x| x as f32)));
6548 let mut scan = dataset.scan();
6549 scan.filter("filterable > 5").unwrap();
6550 scan.nearest("vector", query_key.as_ref(), 1).unwrap();
6551 scan.minimum_nprobes(100);
6552 scan.ef(100);
6553 scan.with_row_id();
6554
6555 let batches = scan
6556 .try_into_stream()
6557 .await
6558 .unwrap()
6559 .try_collect::<Vec<_>>()
6560 .await
6561 .unwrap();
6562
6563 assert_eq!(batches.len(), 0);
6564
6565 scan.prefilter(true);
6566
6567 let batches = scan
6568 .try_into_stream()
6569 .await
6570 .unwrap()
6571 .try_collect::<Vec<_>>()
6572 .await
6573 .unwrap();
6574 assert_eq!(batches.len(), 1);
6575
6576 let first_match = batches[0][ROW_ID].as_primitive::<UInt64Type>().values()[0];
6577
6578 assert!(
6582 first_match > 5,
6583 "prefilter not honored: returned row id {first_match} should satisfy `filterable > 5`"
6584 );
6585 }
6586
6587 #[rstest]
6588 #[tokio::test]
6589 async fn test_filter_on_large_utf8(
6590 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6591 data_storage_version: LanceFileVersion,
6592 ) {
6593 let test_dir = TempStrDir::default();
6594 let test_uri = &test_dir;
6595
6596 let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
6597 "ls",
6598 DataType::LargeUtf8,
6599 true,
6600 )]));
6601
6602 let batches = vec![
6603 RecordBatch::try_new(
6604 schema.clone(),
6605 vec![Arc::new(LargeStringArray::from_iter_values(
6606 (0..10).map(|v| format!("s-{}", v)),
6607 ))],
6608 )
6609 .unwrap(),
6610 ];
6611
6612 let write_params = WriteParams {
6613 data_storage_version: Some(data_storage_version),
6614 ..Default::default()
6615 };
6616 let batches = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
6617 Dataset::write(batches, test_uri, Some(write_params))
6618 .await
6619 .unwrap();
6620
6621 let dataset = Dataset::open(test_uri).await.unwrap();
6622 let mut scan = dataset.scan();
6623 scan.filter("ls = 's-8'").unwrap();
6624
6625 let batches = scan
6626 .try_into_stream()
6627 .await
6628 .unwrap()
6629 .try_collect::<Vec<_>>()
6630 .await
6631 .unwrap();
6632 let batch = &batches[0];
6633
6634 let expected = RecordBatch::try_new(
6635 schema.clone(),
6636 vec![Arc::new(LargeStringArray::from_iter_values(
6637 (8..9).map(|v| format!("s-{}", v)),
6638 ))],
6639 )
6640 .unwrap();
6641
6642 assert_eq!(batch, &expected);
6643 }
6644
6645 #[rstest]
6646 #[tokio::test]
6647 async fn test_filter_with_regex(
6648 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6649 data_storage_version: LanceFileVersion,
6650 ) {
6651 let test_dir = TempStrDir::default();
6652 let test_uri = &test_dir;
6653
6654 let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
6655 "ls",
6656 DataType::Utf8,
6657 true,
6658 )]));
6659
6660 let batches = vec![
6661 RecordBatch::try_new(
6662 schema.clone(),
6663 vec![Arc::new(StringArray::from_iter_values(
6664 (0..20).map(|v| format!("s-{}", v)),
6665 ))],
6666 )
6667 .unwrap(),
6668 ];
6669
6670 let write_params = WriteParams {
6671 data_storage_version: Some(data_storage_version),
6672 ..Default::default()
6673 };
6674 let batches = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
6675 Dataset::write(batches, test_uri, Some(write_params))
6676 .await
6677 .unwrap();
6678
6679 let dataset = Dataset::open(test_uri).await.unwrap();
6680 let mut scan = dataset.scan();
6681 scan.filter("regexp_match(ls, 's-1.')").unwrap();
6682
6683 let stream = scan.try_into_stream().await.unwrap();
6684 let batches = stream.try_collect::<Vec<_>>().await.unwrap();
6685 let batch = &batches[0];
6686
6687 let expected = RecordBatch::try_new(
6688 schema.clone(),
6689 vec![Arc::new(StringArray::from_iter_values(
6690 (10..=19).map(|v| format!("s-{}", v)),
6691 ))],
6692 )
6693 .unwrap();
6694
6695 assert_eq!(batch, &expected);
6696 }
6697
6698 #[tokio::test]
6699 async fn test_filter_proj_bug() {
6700 let struct_i_field = ArrowField::new("i", DataType::Int32, true);
6701 let struct_o_field = ArrowField::new("o", DataType::Utf8, true);
6702 let schema = Arc::new(ArrowSchema::new(vec![
6703 ArrowField::new(
6704 "struct",
6705 DataType::Struct(vec![struct_i_field.clone(), struct_o_field.clone()].into()),
6706 true,
6707 ),
6708 ArrowField::new("s", DataType::Utf8, true),
6709 ]));
6710
6711 let input_batches: Vec<RecordBatch> = (0..5)
6712 .map(|i| {
6713 let struct_i_arr: Arc<Int32Array> =
6714 Arc::new(Int32Array::from_iter_values(i * 20..(i + 1) * 20));
6715 let struct_o_arr: Arc<StringArray> = Arc::new(StringArray::from_iter_values(
6716 (i * 20..(i + 1) * 20).map(|v| format!("o-{:02}", v)),
6717 ));
6718 RecordBatch::try_new(
6719 schema.clone(),
6720 vec![
6721 Arc::new(StructArray::from(vec![
6722 (Arc::new(struct_i_field.clone()), struct_i_arr as ArrayRef),
6723 (Arc::new(struct_o_field.clone()), struct_o_arr as ArrayRef),
6724 ])),
6725 Arc::new(StringArray::from_iter_values(
6726 (i * 20..(i + 1) * 20).map(|v| format!("s-{}", v)),
6727 )),
6728 ],
6729 )
6730 .unwrap()
6731 })
6732 .collect();
6733 let batches =
6734 RecordBatchIterator::new(input_batches.clone().into_iter().map(Ok), schema.clone());
6735 let test_dir = TempStrDir::default();
6736 let test_uri = &test_dir;
6737 let write_params = WriteParams {
6738 max_rows_per_file: 40,
6739 max_rows_per_group: 10,
6740 data_storage_version: Some(LanceFileVersion::Legacy),
6741 ..Default::default()
6742 };
6743 Dataset::write(batches, test_uri, Some(write_params))
6744 .await
6745 .unwrap();
6746
6747 let dataset = Dataset::open(test_uri).await.unwrap();
6748 let batches = dataset
6749 .scan()
6750 .filter("struct.i >= 20")
6751 .unwrap()
6752 .try_into_stream()
6753 .await
6754 .unwrap()
6755 .try_collect::<Vec<_>>()
6756 .await
6757 .unwrap();
6758 let batch = concat_batches(&batches[0].schema(), &batches).unwrap();
6759
6760 let expected_batch = concat_batches(&schema, &input_batches.as_slice()[1..]).unwrap();
6761 assert_eq!(batch, expected_batch);
6762
6763 let batches = dataset
6765 .scan()
6766 .filter("struct.o >= 'o-20'")
6767 .unwrap()
6768 .try_into_stream()
6769 .await
6770 .unwrap()
6771 .try_collect::<Vec<_>>()
6772 .await
6773 .unwrap();
6774 let batch = concat_batches(&batches[0].schema(), &batches).unwrap();
6775 assert_eq!(batch, expected_batch);
6776
6777 let batches = dataset
6779 .scan()
6780 .project(vec!["struct"].as_slice())
6781 .unwrap()
6782 .try_into_stream()
6783 .await
6784 .unwrap()
6785 .try_collect::<Vec<_>>()
6786 .await
6787 .unwrap();
6788 concat_batches(&batches[0].schema(), &batches).unwrap();
6789 }
6790
6791 #[rstest]
6792 #[tokio::test]
6793 async fn test_ann_with_deletion(
6794 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
6795 data_storage_version: LanceFileVersion,
6796 #[values(false, true)] stable_row_ids: bool,
6797 ) {
6798 let vec_params = vec![
6799 VectorIndexParams::ivf_pq(4, 8, 2, MetricType::L2, 2),
6802 ];
6803 for params in vec_params {
6804 use lance_arrow::FixedSizeListArrayExt;
6805
6806 let test_dir = TempStrDir::default();
6807 let test_uri = &test_dir;
6808
6809 let schema = Arc::new(ArrowSchema::new(vec![
6811 ArrowField::new("i", DataType::Int32, true),
6812 ArrowField::new(
6813 "vec",
6814 DataType::FixedSizeList(
6815 Arc::new(ArrowField::new("item", DataType::Float32, true)),
6816 32,
6817 ),
6818 true,
6819 ),
6820 ]));
6821
6822 let vector_values: Float32Array =
6824 (0..32 * 512).map(|v| (v / 32) as f32 + 1.0).collect();
6825 let vectors = FixedSizeListArray::try_new_from_values(vector_values, 32).unwrap();
6826
6827 let batches = vec![
6828 RecordBatch::try_new(
6829 schema.clone(),
6830 vec![
6831 Arc::new(Int32Array::from_iter_values(0..512)),
6832 Arc::new(vectors.clone()),
6833 ],
6834 )
6835 .unwrap(),
6836 ];
6837
6838 let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
6839 let mut dataset = Dataset::write(
6840 reader,
6841 test_uri,
6842 Some(WriteParams {
6843 data_storage_version: Some(data_storage_version),
6844 enable_stable_row_ids: stable_row_ids,
6845 ..Default::default()
6846 }),
6847 )
6848 .await
6849 .unwrap();
6850
6851 assert_eq!(dataset.index_cache_entry_count().await, 0);
6852 dataset
6853 .create_index(
6854 &["vec"],
6855 IndexType::Vector,
6856 Some("idx".to_string()),
6857 ¶ms,
6858 true,
6859 )
6860 .await
6861 .unwrap();
6862
6863 let mut scan = dataset.scan();
6864 let key: Float32Array = (0..32).map(|_v| 1.0_f32).collect();
6866 scan.nearest("vec", &key, 5).unwrap();
6867 scan.refine(100);
6868 scan.minimum_nprobes(100);
6869
6870 assert_eq!(
6871 dataset.index_cache_entry_count().await,
6872 2, );
6874 let results = scan
6875 .try_into_stream()
6876 .await
6877 .unwrap()
6878 .try_collect::<Vec<_>>()
6879 .await
6880 .unwrap();
6881
6882 assert_eq!(
6883 dataset.index_cache_entry_count().await,
6884 5 + dataset.versions().await.unwrap().len()
6885 );
6886 assert_eq!(results.len(), 1);
6887 let batch = &results[0];
6888
6889 let expected_i = BTreeSet::from_iter(vec![0, 1, 2, 3, 4]);
6890 let column_i = batch.column_by_name("i").unwrap();
6891 let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
6892 .values()
6893 .iter()
6894 .copied()
6895 .collect();
6896 assert_eq!(expected_i, actual_i);
6897
6898 dataset.delete("i = 1").await.unwrap();
6901 let mut scan = dataset.scan();
6902 scan.nearest("vec", &key, 5).unwrap();
6903 scan.refine(100);
6904 scan.minimum_nprobes(100);
6905
6906 let results = scan
6907 .try_into_stream()
6908 .await
6909 .unwrap()
6910 .try_collect::<Vec<_>>()
6911 .await
6912 .unwrap();
6913
6914 assert_eq!(results.len(), 1);
6915 let batch = &results[0];
6916
6917 let expected_i = BTreeSet::from_iter(vec![0, 2, 3, 4, 5]);
6919 let column_i = batch.column_by_name("i").unwrap();
6920 let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
6921 .values()
6922 .iter()
6923 .copied()
6924 .collect();
6925 assert_eq!(expected_i, actual_i);
6926
6927 let batches = vec![
6930 RecordBatch::try_new(
6931 schema.clone(),
6932 vec![
6933 Arc::new(Int32Array::from_iter_values(512..1024)),
6934 Arc::new(vectors),
6935 ],
6936 )
6937 .unwrap(),
6938 ];
6939
6940 let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
6941 let mut dataset = Dataset::write(
6942 reader,
6943 test_uri,
6944 Some(WriteParams {
6945 mode: WriteMode::Append,
6946 data_storage_version: Some(data_storage_version),
6947 ..Default::default()
6948 }),
6949 )
6950 .await
6951 .unwrap();
6952 dataset
6953 .create_index(
6954 &["vec"],
6955 IndexType::Vector,
6956 Some("idx".to_string()),
6957 ¶ms,
6958 true,
6959 )
6960 .await
6961 .unwrap();
6962
6963 dataset.delete("i < 512").await.unwrap();
6964
6965 let mut scan = dataset.scan();
6966 scan.nearest("vec", &key, 5).unwrap();
6967 scan.refine(100);
6968 scan.minimum_nprobes(100);
6969
6970 let results = scan
6971 .try_into_stream()
6972 .await
6973 .unwrap()
6974 .try_collect::<Vec<_>>()
6975 .await
6976 .unwrap();
6977
6978 assert_eq!(results.len(), 1);
6979 let batch = &results[0];
6980
6981 let expected_i = BTreeSet::from_iter(vec![512, 513, 514, 515, 516]);
6983 let column_i = batch.column_by_name("i").unwrap();
6984 let actual_i: BTreeSet<i32> = as_primitive_array::<Int32Type>(column_i.as_ref())
6985 .values()
6986 .iter()
6987 .copied()
6988 .collect();
6989 assert_eq!(expected_i, actual_i);
6990 }
6991 }
6992
6993 #[tokio::test]
6994 async fn test_projection_order() {
6995 let vec_params = VectorIndexParams::ivf_pq(4, 8, 2, MetricType::L2, 2);
6996 let mut data = gen_batch()
6997 .col("vec", array::rand_vec::<Float32Type>(Dimension::from(4)))
6998 .col("text", array::rand_utf8(ByteCount::from(10), false))
6999 .into_ram_dataset(FragmentCount::from(3), FragmentRowCount::from(100))
7000 .await
7001 .unwrap();
7002 data.create_index(&["vec"], IndexType::Vector, None, &vec_params, true)
7003 .await
7004 .unwrap();
7005
7006 let mut scan = data.scan();
7007 scan.nearest("vec", &Float32Array::from(vec![1.0, 1.0, 1.0, 1.0]), 5)
7008 .unwrap();
7009 scan.with_row_id().project(&["text"]).unwrap();
7010
7011 let results = scan
7012 .try_into_stream()
7013 .await
7014 .unwrap()
7015 .try_collect::<Vec<_>>()
7016 .await
7017 .unwrap();
7018
7019 assert_eq!(
7020 results[0].schema().field_names(),
7021 vec!["text", "_distance", "_rowid"]
7022 );
7023 }
7024
7025 #[rstest]
7026 #[tokio::test]
7027 async fn test_count_rows_with_filter(
7028 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
7029 data_storage_version: LanceFileVersion,
7030 ) {
7031 let test_dir = TempStrDir::default();
7032 let test_uri = &test_dir;
7033 let mut data_gen = BatchGenerator::new().col(Box::new(
7034 IncrementingInt32::new().named("Filter_me".to_owned()),
7035 ));
7036 Dataset::write(
7037 data_gen.batch(32),
7038 test_uri,
7039 Some(WriteParams {
7040 data_storage_version: Some(data_storage_version),
7041 ..Default::default()
7042 }),
7043 )
7044 .await
7045 .unwrap();
7046
7047 let dataset = Dataset::open(test_uri).await.unwrap();
7048 assert_eq!(32, dataset.count_rows(None).await.unwrap());
7049 assert_eq!(
7050 16,
7051 dataset
7052 .count_rows(Some("`Filter_me` > 15".to_string()))
7053 .await
7054 .unwrap()
7055 );
7056 }
7057
7058 #[rstest]
7059 #[tokio::test]
7060 async fn test_dynamic_projection(
7061 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
7062 data_storage_version: LanceFileVersion,
7063 ) {
7064 let test_dir = TempStrDir::default();
7065 let test_uri = &test_dir;
7066 let mut data_gen =
7067 BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("i".to_owned())));
7068 Dataset::write(
7069 data_gen.batch(32),
7070 test_uri,
7071 Some(WriteParams {
7072 data_storage_version: Some(data_storage_version),
7073 ..Default::default()
7074 }),
7075 )
7076 .await
7077 .unwrap();
7078
7079 let dataset = Dataset::open(test_uri).await.unwrap();
7080 assert_eq!(dataset.count_rows(None).await.unwrap(), 32);
7081
7082 let mut scanner = dataset.scan();
7083
7084 let scan_res = scanner
7085 .project_with_transform(&[("bool", "i > 15")])
7086 .unwrap()
7087 .try_into_batch()
7088 .await
7089 .unwrap();
7090
7091 assert_eq!(1, scan_res.num_columns());
7092
7093 let bool_col = scan_res
7094 .column_by_name("bool")
7095 .expect("bool column should exist");
7096 let bool_arr = bool_col.as_boolean();
7097 for i in 0..32 {
7098 if i > 15 {
7099 assert!(bool_arr.value(i));
7100 } else {
7101 assert!(!bool_arr.value(i));
7102 }
7103 }
7104 }
7105
7106 #[rstest]
7107 #[tokio::test]
7108 async fn test_column_casting_function(
7109 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
7110 data_storage_version: LanceFileVersion,
7111 ) {
7112 let test_dir = TempStrDir::default();
7113 let test_uri = &test_dir;
7114 let mut data_gen =
7115 BatchGenerator::new().col(Box::new(RandomVector::new().named("vec".to_owned())));
7116 Dataset::write(
7117 data_gen.batch(32),
7118 test_uri,
7119 Some(WriteParams {
7120 data_storage_version: Some(data_storage_version),
7121 ..Default::default()
7122 }),
7123 )
7124 .await
7125 .unwrap();
7126
7127 let dataset = Dataset::open(test_uri).await.unwrap();
7128 assert_eq!(dataset.count_rows(None).await.unwrap(), 32);
7129
7130 let mut scanner = dataset.scan();
7131
7132 let scan_res = scanner
7133 .project_with_transform(&[("f16", "_cast_list_f16(vec)")])
7134 .unwrap()
7135 .try_into_batch()
7136 .await
7137 .unwrap();
7138
7139 assert_eq!(1, scan_res.num_columns());
7140 assert_eq!(32, scan_res.num_rows());
7141 assert_eq!("f16", scan_res.schema().field(0).name());
7142
7143 let mut scanner = dataset.scan();
7144 let scan_res_original = scanner
7145 .project(&["vec"])
7146 .unwrap()
7147 .try_into_batch()
7148 .await
7149 .unwrap();
7150
7151 let f32_col: &Float32Array = scan_res_original
7152 .column_by_name("vec")
7153 .unwrap()
7154 .as_fixed_size_list()
7155 .values()
7156 .as_primitive();
7157 let f16_col: &Float16Array = scan_res
7158 .column_by_name("f16")
7159 .unwrap()
7160 .as_fixed_size_list()
7161 .values()
7162 .as_primitive();
7163
7164 for (f32_val, f16_val) in f32_col.iter().zip(f16_col.iter()) {
7165 let f32_val = f32_val.unwrap();
7166 let f16_val = f16_val.unwrap();
7167 assert_eq!(f16::from_f32(f32_val), f16_val);
7168 }
7169 }
7170
    // Shared fixture for the scalar/vector index scan tests. `new` writes a
    // dataset, builds a vector index and a scalar index, and then records the
    // version number reached after each subsequent mutation so that tests can
    // check out the exact dataset state they need.
    struct ScalarIndexTestFixture {
        // Temporary directory the dataset was written to; never read after
        // construction (presumably held so the directory outlives the fixture's
        // dataset — confirm against `TempStrDir`'s drop behavior).
        _test_dir: TempStrDir,
        // Dataset handle left over from the setup steps in `new`.
        dataset: Dataset,
        // The "vector" value of row 50 of the initial data.
        sample_query: Arc<dyn Array>,
        // The "vector" value of row 75 of the initial data (the row targeted by
        // the `not_indexed = 75` deletes in `new`).
        delete_query: Arc<dyn Array>,
        // Version recorded right after both indices were created on the
        // initial 1000 rows.
        original_version: u64,
        // Version recorded after `compact_files` ran on the dataset that had
        // row 75 deleted.
        compact_version: u64,
        // Version recorded after 1000 additional rows were appended.
        append_version: u64,
        // Version recorded after `optimize_indices` ran following the append.
        updated_version: u64,
        // Version recorded after deleting `not_indexed = 75` following the
        // checkout/restore of the original version.
        delete_version: u64,
        // Version recorded after deleting `not_indexed = 75` following the
        // append and index-optimization steps.
        append_then_delete_version: u64,
    }
7189
    // Switches describing one scan configuration exercised against
    // `ScalarIndexTestFixture`.
    #[derive(Debug, PartialEq)]
    struct ScalarTestParams {
        // Passed to the scanner's `use_index`.
        use_index: bool,
        // When set, the scan projects only the "indexed" column.
        use_projection: bool,
        // Select a dataset version in which row 75 has been deleted.
        use_deleted_data: bool,
        // Select a dataset version that includes the appended rows.
        use_new_data: bool,
        // When set, the scan requests the row id column.
        with_row_id: bool,
        // Select the compacted dataset version (cannot be combined with
        // new/deleted/updated data).
        use_compaction: bool,
        // Select the version recorded after `optimize_indices` (cannot be
        // combined with new/deleted/compacted data).
        use_updated: bool,
    }
7200
7201 impl ScalarIndexTestFixture {
        // Builds the fixture.
        //
        // Writes 1000 rows (columns "vector", "indexed", "not_indexed"), creates
        // a vector index on "vector" and a scalar index on "indexed", then
        // applies a sequence of mutations (append, index optimization, delete,
        // compaction), recording the dataset version after each step.
        //
        // `data_storage_version` and `use_stable_row_ids` are forwarded to the
        // write parameters of the initial write; the append only forwards
        // `data_storage_version`.
        async fn new(data_storage_version: LanceFileVersion, use_stable_row_ids: bool) -> Self {
            let test_dir = TempStrDir::default();
            let test_uri = &test_dir;

            // Initial data: 1000 rows with a random 32-dimensional vector and two
            // stepping integer columns.
            let data = gen_batch()
                .col(
                    "vector",
                    array::rand_vec::<Float32Type>(Dimension::from(32)),
                )
                .col("indexed", array::step::<Int32Type>())
                .col("not_indexed", array::step::<Int32Type>())
                .into_batch_rows(RowCount::from(1000))
                .unwrap();

            // `max_rows_per_file: 500` caps each data file at 500 rows.
            let mut dataset = Dataset::write(
                RecordBatchIterator::new(vec![Ok(data.clone())], data.schema().clone()),
                test_uri,
                Some(WriteParams {
                    max_rows_per_file: 500,
                    data_storage_version: Some(data_storage_version),
                    enable_stable_row_ids: use_stable_row_ids,
                    ..Default::default()
                }),
            )
            .await
            .unwrap();

            // Vector index on "vector".
            dataset
                .create_index(
                    &["vector"],
                    IndexType::Vector,
                    None,
                    &VectorIndexParams::ivf_pq(2, 8, 2, MetricType::L2, 2),
                    false,
                )
                .await
                .unwrap();

            // Scalar index on "indexed"; "not_indexed" deliberately gets none.
            dataset
                .create_index(
                    &["indexed"],
                    IndexType::Scalar,
                    None,
                    &ScalarIndexParams::default(),
                    false,
                )
                .await
                .unwrap();

            let original_version = dataset.version().version;
            // Vectors of rows 50 and 75 serve as the sample and delete queries.
            let sample_query = data["vector"].as_fixed_size_list().value(50);
            let delete_query = data["vector"].as_fixed_size_list().value(75);

            // Appended batch: same vectors, integer columns shifted by 1000.
            // Both new columns are derived from "indexed".
            let new_indexed =
                arrow_arith::numeric::add(&data["indexed"], &Int32Array::new_scalar(1000)).unwrap();
            let new_not_indexed =
                arrow_arith::numeric::add(&data["indexed"], &Int32Array::new_scalar(1000)).unwrap();
            let append_data = RecordBatch::try_new(
                data.schema(),
                vec![data["vector"].clone(), new_indexed, new_not_indexed],
            )
            .unwrap();

            dataset
                .append(
                    RecordBatchIterator::new(vec![Ok(append_data)], data.schema()),
                    Some(WriteParams {
                        data_storage_version: Some(data_storage_version),
                        ..Default::default()
                    }),
                )
                .await
                .unwrap();

            let append_version = dataset.version().version;

            // Optimize the indices after the append and record the version.
            dataset
                .optimize_indices(&OptimizeOptions::merge(1))
                .await
                .unwrap();
            let updated_version = dataset.version().version;

            // NOTE(review): the value returned by `checkout_version` is discarded
            // here, unlike the `let mut dataset = ...` form used further down —
            // confirm that the following `restore()` acts on the intended version.
            dataset.checkout_version(append_version).await.unwrap();
            dataset.restore().await.unwrap();

            dataset.delete("not_indexed = 75").await.unwrap();

            let append_then_delete_version = dataset.version().version;

            // Go back to the original version and delete the same row there.
            let mut dataset = dataset.checkout_version(original_version).await.unwrap();
            dataset.restore().await.unwrap();

            dataset.delete("not_indexed = 75").await.unwrap();

            let delete_version = dataset.version().version;

            // Compact the dataset that has the deleted row.
            compact_files(&mut dataset, CompactionOptions::default(), None)
                .await
                .unwrap();
            let compact_version = dataset.version().version;
            // NOTE(review): as above, the checked-out dataset is discarded before
            // `restore()` — confirm this is intentional.
            dataset.checkout_version(original_version).await.unwrap();
            dataset.restore().await.unwrap();

            Self {
                _test_dir: test_dir,
                dataset,
                sample_query,
                delete_query,
                original_version,
                compact_version,
                append_version,
                updated_version,
                delete_version,
                append_then_delete_version,
            }
        }
7335
7336 fn sample_query(&self) -> &PrimitiveArray<Float32Type> {
7337 self.sample_query.as_primitive::<Float32Type>()
7338 }
7339
7340 fn delete_query(&self) -> &PrimitiveArray<Float32Type> {
7341 self.delete_query.as_primitive::<Float32Type>()
7342 }
7343
7344 async fn get_dataset(&self, params: &ScalarTestParams) -> Dataset {
7345 let version = if params.use_compaction {
7346 if params.use_deleted_data || params.use_new_data || params.use_updated {
7348 panic!(
7349 "There is no test data combining new/deleted/updated data with compaction"
7350 );
7351 } else {
7352 self.compact_version
7353 }
7354 } else if params.use_updated {
7355 if params.use_deleted_data || params.use_new_data || params.use_compaction {
7357 panic!(
7358 "There is no test data combining updated data with new/deleted/compaction"
7359 );
7360 } else {
7361 self.updated_version
7362 }
7363 } else {
7364 match (params.use_new_data, params.use_deleted_data) {
7365 (false, false) => self.original_version,
7366 (false, true) => self.delete_version,
7367 (true, false) => self.append_version,
7368 (true, true) => self.append_then_delete_version,
7369 }
7370 };
7371 self.dataset.checkout_version(version).await.unwrap()
7372 }
7373
7374 async fn run_query(
7375 &self,
7376 query: &str,
7377 vector: Option<&PrimitiveArray<Float32Type>>,
7378 params: &ScalarTestParams,
7379 ) -> (String, RecordBatch) {
7380 let dataset = self.get_dataset(params).await;
7381 let mut scan = dataset.scan();
7382 if let Some(vector) = vector {
7383 scan.nearest("vector", vector, 10).unwrap();
7384 }
7385 if params.use_projection {
7386 scan.project(&["indexed"]).unwrap();
7387 }
7388 if params.with_row_id {
7389 scan.with_row_id();
7390 }
7391 scan.scan_in_order(true);
7392 scan.use_index(params.use_index);
7393 scan.filter(query).unwrap();
7394 scan.prefilter(true);
7395
7396 let plan = scan.explain_plan(true).await.unwrap();
7397 let batch = scan.try_into_batch().await.unwrap();
7398
7399 if params.use_projection {
7400 let mut expected_columns = 1;
7402 if vector.is_some() {
7403 expected_columns += 1;
7405 }
7406 if params.with_row_id {
7407 expected_columns += 1;
7408 }
7409 assert_eq!(batch.num_columns(), expected_columns);
7410 } else {
7411 let mut expected_columns = 3;
7412 if vector.is_some() {
7413 expected_columns += 1;
7415 }
7416 if params.with_row_id {
7417 expected_columns += 1;
7418 }
7419 assert_eq!(batch.num_columns(), expected_columns);
7421 }
7422
7423 (plan, batch)
7424 }
7425
7426 fn assert_none<F: Fn(i32) -> bool>(
7427 &self,
7428 batch: &RecordBatch,
7429 predicate: F,
7430 message: &str,
7431 ) {
7432 let indexed = batch["indexed"].as_primitive::<Int32Type>();
7433 if indexed.iter().map(|val| val.unwrap()).any(predicate) {
7434 panic!("{}", message);
7435 }
7436 }
7437
7438 fn assert_one<F: Fn(i32) -> bool>(&self, batch: &RecordBatch, predicate: F, message: &str) {
7439 let indexed = batch["indexed"].as_primitive::<Int32Type>();
7440 if !indexed.iter().map(|val| val.unwrap()).any(predicate) {
7441 panic!("{}", message);
7442 }
7443 }
7444
7445 async fn check_vector_scalar_indexed_and_refine(&self, params: &ScalarTestParams) {
7446 let (query_plan, batch) = self
7447 .run_query(
7448 "indexed != 50 AND ((not_indexed < 100) OR (not_indexed >= 1000 AND not_indexed < 1100))",
7449 Some(self.sample_query()),
7450 params,
7451 )
7452 .await;
7453 if self.dataset.is_legacy_storage() {
7455 assert!(query_plan.contains("MaterializeIndex"));
7456 }
7457 self.assert_none(
7459 &batch,
7460 |val| val == 50,
7461 "The query contained 50 even though it was filtered",
7462 );
7463 if !params.use_new_data {
7464 self.assert_none(
7466 &batch,
7467 |val| (100..1000).contains(&val) || (val >= 1100),
7468 "The non-indexed refine filter was not applied",
7469 );
7470 }
7471
7472 if params.use_new_data || params.use_updated {
7474 self.assert_one(
7475 &batch,
7476 |val| val == 1050,
7477 "The query did not contain 1050 from the new data",
7478 );
7479 }
7480 }
7481
7482 async fn check_vector_scalar_indexed_only(&self, params: &ScalarTestParams) {
7483 let (query_plan, batch) = self
7484 .run_query("indexed != 50", Some(self.sample_query()), params)
7485 .await;
7486 if self.dataset.is_legacy_storage() {
7487 if params.use_index {
7488 assert!(query_plan.contains("ScalarIndexQuery"));
7491 } else {
7492 assert!(query_plan.contains("MaterializeIndex"));
7494 }
7495 }
7496 self.assert_none(
7498 &batch,
7499 |val| val == 50,
7500 "The query contained 50 even though it was filtered",
7501 );
7502 if params.use_new_data {
7504 self.assert_one(
7505 &batch,
7506 |val| val == 1050,
7507 "The query did not contain 1050 from the new data",
7508 );
7509 if !params.use_new_data {
7510 let (_, batch) = self
7512 .run_query("indexed == 1050", Some(self.sample_query()), params)
7513 .await;
7514 assert_eq!(batch.num_rows(), 1);
7515 }
7516 }
7517 if params.use_deleted_data {
7518 let (_, batch) = self
7519 .run_query("indexed == 75", Some(self.delete_query()), params)
7520 .await;
7521 if !params.use_new_data {
7522 assert_eq!(batch.num_rows(), 0);
7523 }
7524 }
7525 }
7526
        // Runs both vector-search checks for `params`: first the indexed-only
        // filter, then the indexed-plus-refine filter.
        async fn check_vector_queries(&self, params: &ScalarTestParams) {
            self.check_vector_scalar_indexed_only(params).await;
            self.check_vector_scalar_indexed_and_refine(params).await;
        }
7531
7532 async fn check_simple_indexed_only(&self, params: &ScalarTestParams) {
7533 let (query_plan, batch) = self.run_query("indexed != 50", None, params).await;
7534 if self.dataset.is_legacy_storage() {
7536 assert!(query_plan.contains("MaterializeIndex"));
7537 } else {
7538 assert!(query_plan.contains("LanceRead"));
7539 }
7540 self.assert_none(
7542 &batch,
7543 |val| val == 50,
7544 "The query contained 50 even though it was filtered",
7545 );
7546 let mut expected_num_rows = if params.use_new_data || params.use_updated {
7547 1999
7548 } else {
7549 999
7550 };
7551 if params.use_deleted_data || params.use_compaction {
7552 expected_num_rows -= 1;
7553 }
7554 assert_eq!(batch.num_rows(), expected_num_rows);
7555
7556 if params.use_new_data || params.use_updated {
7558 let (_, batch) = self.run_query("indexed == 1050", None, params).await;
7559 assert_eq!(batch.num_rows(), 1);
7560 }
7561
7562 if params.use_deleted_data || params.use_compaction {
7564 let (_, batch) = self.run_query("indexed == 75", None, params).await;
7565 assert_eq!(batch.num_rows(), 0);
7566 }
7567 }
7568
7569 async fn check_simple_indexed_and_refine(&self, params: &ScalarTestParams) {
7570 let (query_plan, batch) = self.run_query(
7571 "indexed != 50 AND ((not_indexed < 100) OR (not_indexed >= 1000 AND not_indexed < 1100))",
7572 None,
7573 params
7574 ).await;
7575 if self.dataset.is_legacy_storage() {
7577 assert!(query_plan.contains("MaterializeIndex"));
7578 } else {
7579 assert!(query_plan.contains("LanceRead"));
7580 }
7581 self.assert_none(
7583 &batch,
7584 |val| val == 50,
7585 "The query contained 50 even though it was filtered",
7586 );
7587 self.assert_none(
7589 &batch,
7590 |val| (100..1000).contains(&val) || (val >= 1100),
7591 "The non-indexed refine filter was not applied",
7592 );
7593
7594 let mut expected_num_rows = if params.use_new_data || params.use_updated {
7595 199
7596 } else {
7597 99
7598 };
7599 if params.use_deleted_data || params.use_compaction {
7600 expected_num_rows -= 1;
7601 }
7602 assert_eq!(batch.num_rows(), expected_num_rows);
7603 }
7604
7605 async fn check_simple_queries(&self, params: &ScalarTestParams) {
7606 self.check_simple_indexed_only(params).await;
7607 self.check_simple_indexed_and_refine(params).await;
7608 }
7609 }
7610
7611 #[rstest]
7615 #[tokio::test]
7616 async fn test_secondary_index_scans(
7617 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
7618 data_storage_version: LanceFileVersion,
7619 #[values(false, true)] use_stable_row_ids: bool,
7620 ) {
7621 let fixture = Box::pin(ScalarIndexTestFixture::new(
7622 data_storage_version,
7623 use_stable_row_ids,
7624 ))
7625 .await;
7626
7627 for use_index in [false, true] {
7628 for use_projection in [false, true] {
7629 for use_deleted_data in [false, true] {
7630 for use_new_data in [false, true] {
7631 let compaction_choices =
7636 if use_deleted_data || use_new_data || use_stable_row_ids {
7637 vec![false]
7638 } else {
7639 vec![false, true]
7640 };
7641 for use_compaction in compaction_choices {
7642 let updated_choices =
7643 if use_deleted_data || use_new_data || use_compaction {
7644 vec![false]
7645 } else {
7646 vec![false, true]
7647 };
7648 for use_updated in updated_choices {
7649 for with_row_id in [false, true] {
7650 let params = ScalarTestParams {
7651 use_index,
7652 use_projection,
7653 use_deleted_data,
7654 use_new_data,
7655 with_row_id,
7656 use_compaction,
7657 use_updated,
7658 };
7659 fixture.check_vector_queries(¶ms).await;
7660 fixture.check_simple_queries(¶ms).await;
7661 }
7662 }
7663 }
7664 }
7665 }
7666 }
7667 }
7668 }
7669
7670 #[tokio::test]
7671 async fn can_filter_row_id() {
7672 let dataset = lance_datagen::gen_batch()
7673 .col("x", array::step::<Int32Type>())
7674 .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(1000))
7675 .await
7676 .unwrap();
7677
7678 let mut scan = dataset.scan();
7679 scan.with_row_id();
7680 scan.project::<&str>(&[]).unwrap();
7681 scan.filter("_rowid == 50").unwrap();
7682 let batch = scan.try_into_batch().await.unwrap();
7683 assert_eq!(batch.num_rows(), 1);
7684 assert_eq!(batch.column(0).as_primitive::<UInt64Type>().values()[0], 50);
7685 }
7686
7687 #[rstest]
7688 #[tokio::test]
7689 async fn test_index_take_batch_size() {
7690 let fixture = Box::pin(ScalarIndexTestFixture::new(LanceFileVersion::Stable, false)).await;
7691 let stream = fixture
7692 .dataset
7693 .scan()
7694 .filter("indexed > 0")
7695 .unwrap()
7696 .batch_size(16)
7697 .try_into_stream()
7698 .await
7699 .unwrap();
7700 let batches = stream.collect::<Vec<_>>().await;
7701 assert_eq!(batches.len(), 1000_usize.div_ceil(16));
7702 }
7703
7704 async fn assert_plan_equals(
7708 dataset: &Dataset,
7709 plan: impl Fn(&mut Scanner) -> Result<&mut Scanner>,
7710 expected: &str,
7711 ) -> Result<()> {
7712 let mut scan = dataset.scan();
7713 plan(&mut scan)?;
7714 let exec_plan = scan.create_plan().await?;
7715 assert_plan_node_equals(exec_plan, expected).await
7716 }
7717
7718 #[tokio::test]
7719 async fn test_inexact_scalar_index_plans() {
7720 let data = gen_batch()
7721 .col("ngram", array::rand_utf8(ByteCount::from(5), false))
7722 .col("exact", array::rand_type(&DataType::UInt32))
7723 .col("no_index", array::rand_type(&DataType::UInt32))
7724 .into_reader_rows(RowCount::from(1000), BatchCount::from(5));
7725
7726 let mut dataset = Dataset::write(data, "memory://test", None).await.unwrap();
7727 dataset
7728 .create_index(
7729 &["ngram"],
7730 IndexType::NGram,
7731 None,
7732 &ScalarIndexParams::default(),
7733 true,
7734 )
7735 .await
7736 .unwrap();
7737 dataset
7738 .create_index(
7739 &["exact"],
7740 IndexType::BTree,
7741 None,
7742 &ScalarIndexParams::default(),
7743 true,
7744 )
7745 .await
7746 .unwrap();
7747
7748 assert_plan_equals(
7750 &dataset,
7751 |scanner| scanner.filter("contains(ngram, 'test string')"),
7752 "LanceRead: uri=..., projection=[ngram, exact, no_index], num_fragments=1, \
7753 range_before=None, range_after=None, row_id=false, row_addr=false, \
7754 full_filter=contains(ngram, Utf8(\"test string\")), refine_filter=--
7755 ScalarIndexQuery: query=[contains(ngram, Utf8(\"test string\"))]@ngram_idx(NGram)",
7756 )
7757 .await
7758 .unwrap();
7759
7760 assert_plan_equals(
7762 &dataset,
7763 |scanner| scanner.filter("contains(ngram, 'test string') and exact < 50"),
7764 "LanceRead: uri=..., projection=[ngram, exact, no_index], num_fragments=1, \
7765 range_before=None, range_after=None, row_id=false, row_addr=false, \
7766 full_filter=contains(ngram, Utf8(\"test string\")) AND exact < UInt32(50), \
7767 refine_filter=--
7768 ScalarIndexQuery: query=AND([contains(ngram, Utf8(\"test string\"))]@ngram_idx(NGram),[exact < 50]@exact_idx(BTree))",
7769 )
7770 .await
7771 .unwrap();
7772
7773 assert_plan_equals(
7775 &dataset,
7776 |scanner| {
7777 scanner.filter("contains(ngram, 'test string') and exact < 50 AND no_index > 100")
7778 },
7779 "ProjectionExec: expr=[ngram@0 as ngram, exact@1 as exact, no_index@2 as no_index]
7780 LanceRead: uri=..., projection=[ngram, exact, no_index], num_fragments=1, range_before=None, \
7781 range_after=None, row_id=true, row_addr=false, full_filter=contains(ngram, Utf8(\"test string\")) AND exact < UInt32(50) AND no_index > UInt32(100), \
7782 refine_filter=no_index > UInt32(100)
7783 ScalarIndexQuery: query=AND([contains(ngram, Utf8(\"test string\"))]@ngram_idx(NGram),[exact < 50]@exact_idx(BTree))",
7784 )
7785 .await
7786 .unwrap();
7787 }
7788
7789 #[tokio::test]
7790 async fn test_like_prefix_with_btree_index() {
7791 let data = gen_batch()
7794 .col(
7795 "name",
7796 array::cycle_utf8_literals(&[
7797 "apple",
7798 "application",
7799 "app",
7800 "banana",
7801 "band",
7802 "testns1",
7803 "testns2",
7804 "test",
7805 "testing",
7806 "zoo",
7807 ]),
7808 )
7809 .col("id", array::step::<Int32Type>())
7810 .into_reader_rows(RowCount::from(100), BatchCount::from(1));
7811
7812 let mut dataset = Dataset::write(data, "memory://test_like", None)
7813 .await
7814 .unwrap();
7815
7816 dataset
7818 .create_index(
7819 &["name"],
7820 IndexType::BTree,
7821 None,
7822 &ScalarIndexParams::default(),
7823 true,
7824 )
7825 .await
7826 .unwrap();
7827
7828 assert_plan_equals(
7830 &dataset,
7831 |scanner| scanner.filter("name LIKE 'app%'"),
7832 "LanceRead: uri=..., projection=[name, id], num_fragments=1, \
7833 range_before=None, range_after=None, row_id=false, row_addr=false, \
7834 full_filter=name LIKE Utf8(\"app%\"), refine_filter=--
7835 ScalarIndexQuery: query=[name LIKE 'app%']@name_idx(BTree)",
7836 )
7837 .await
7838 .unwrap();
7839
7840 let results = dataset
7842 .scan()
7843 .filter("name LIKE 'app%'")
7844 .unwrap()
7845 .try_into_batch()
7846 .await
7847 .unwrap();
7848 let names: Vec<&str> = results
7849 .column_by_name("name")
7850 .unwrap()
7851 .as_any()
7852 .downcast_ref::<StringArray>()
7853 .unwrap()
7854 .iter()
7855 .map(|s| s.unwrap())
7856 .collect();
7857 assert!(names.iter().all(|n| n.starts_with("app")));
7859 assert!(!names.is_empty());
7860
7861 assert_plan_equals(
7864 &dataset,
7865 |scanner| scanner.filter("starts_with(name, 'ban')"),
7866 "LanceRead: uri=..., projection=[name, id], num_fragments=1, \
7867 range_before=None, range_after=None, row_id=false, row_addr=false, \
7868 full_filter=name LIKE Utf8(\"ban%\"), refine_filter=--
7869 ScalarIndexQuery: query=[name LIKE 'ban%']@name_idx(BTree)",
7870 )
7871 .await
7872 .unwrap();
7873
7874 let results = dataset
7876 .scan()
7877 .filter("starts_with(name, 'ban')")
7878 .unwrap()
7879 .try_into_batch()
7880 .await
7881 .unwrap();
7882 let names: Vec<&str> = results
7883 .column_by_name("name")
7884 .unwrap()
7885 .as_any()
7886 .downcast_ref::<StringArray>()
7887 .unwrap()
7888 .iter()
7889 .map(|s| s.unwrap())
7890 .collect();
7891 assert!(names.iter().all(|n| n.starts_with("ban")));
7893 assert!(!names.is_empty());
7894
7895 assert_plan_equals(
7897 &dataset,
7898 |scanner| scanner.filter("name LIKE 'test%2'"),
7899 "ProjectionExec: expr=[name@0 as name, id@1 as id]
7900 LanceRead: uri=..., projection=[name, id], num_fragments=1, \
7901range_before=None, range_after=None, row_id=true, row_addr=false, \
7902full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\")
7903 ScalarIndexQuery: query=[name LIKE 'test%']@name_idx(BTree)",
7904 )
7905 .await
7906 .unwrap();
7907
7908 let results = dataset
7910 .scan()
7911 .filter("name LIKE 'test%2'")
7912 .unwrap()
7913 .try_into_batch()
7914 .await
7915 .unwrap();
7916 let names: Vec<&str> = results
7917 .column_by_name("name")
7918 .unwrap()
7919 .as_any()
7920 .downcast_ref::<StringArray>()
7921 .unwrap()
7922 .iter()
7923 .map(|s| s.unwrap())
7924 .collect();
7925 assert!(
7927 names
7928 .iter()
7929 .all(|n| n.starts_with("test") && n.ends_with("2"))
7930 );
7931
7932 let mut scanner = dataset.scan();
7935 scanner.filter("name LIKE '%app%'").unwrap();
7936 let plan = scanner.create_plan().await.unwrap();
7937 let plan_str = format!("{:?}", plan);
7938 assert!(
7939 !plan_str.contains("ScalarIndexQuery"),
7940 "LIKE '%app%' should not use scalar index, but got: {}",
7941 plan_str
7942 );
7943
7944 let results = dataset
7946 .scan()
7947 .filter("name LIKE '%app%'")
7948 .unwrap()
7949 .try_into_batch()
7950 .await
7951 .unwrap();
7952 let names: Vec<&str> = results
7953 .column_by_name("name")
7954 .unwrap()
7955 .as_any()
7956 .downcast_ref::<StringArray>()
7957 .unwrap()
7958 .iter()
7959 .map(|s| s.unwrap())
7960 .collect();
7961 assert!(names.iter().all(|n| n.contains("app")));
7963
7964 let mut scanner = dataset.scan();
7966 scanner.filter("name NOT LIKE 'app%'").unwrap();
7967 let plan = scanner.create_plan().await.unwrap();
7968 let plan_str = format!("{:?}", plan);
7969 assert!(
7970 !plan_str.contains("ScalarIndexQuery"),
7971 "NOT LIKE should not use scalar index, but got: {}",
7972 plan_str
7973 );
7974 }
7975
7976 #[tokio::test]
7977 async fn test_like_prefix_correctness_with_btree_index() {
7978 let names: Vec<&str> = vec![
7980 "alpha", "alphabet", "beta", "gamma", "delta", "epsilon", "eta", "theta", "iota",
7981 "kappa",
7982 ];
7983 let data = RecordBatch::try_new(
7984 Arc::new(ArrowSchema::new(vec![
7985 ArrowField::new("name", DataType::Utf8, false),
7986 ArrowField::new("id", DataType::Int32, false),
7987 ])),
7988 vec![
7989 Arc::new(StringArray::from(names.clone())),
7990 Arc::new(Int32Array::from_iter_values(0..10)),
7991 ],
7992 )
7993 .unwrap();
7994
7995 let reader = RecordBatchIterator::new(
7996 vec![Ok(data)],
7997 Arc::new(ArrowSchema::new(vec![
7998 ArrowField::new("name", DataType::Utf8, false),
7999 ArrowField::new("id", DataType::Int32, false),
8000 ])),
8001 );
8002
8003 let mut dataset = Dataset::write(reader, "memory://test_like_correctness", None)
8004 .await
8005 .unwrap();
8006
8007 dataset
8009 .create_index(
8010 &["name"],
8011 IndexType::BTree,
8012 None,
8013 &ScalarIndexParams::default(),
8014 true,
8015 )
8016 .await
8017 .unwrap();
8018
8019 let with_index = dataset
8021 .scan()
8022 .filter("name LIKE 'alpha%'")
8023 .unwrap()
8024 .try_into_batch()
8025 .await
8026 .unwrap();
8027
8028 let without_index = dataset
8030 .scan()
8031 .use_scalar_index(false)
8032 .filter("name LIKE 'alpha%'")
8033 .unwrap()
8034 .try_into_batch()
8035 .await
8036 .unwrap();
8037
8038 assert_eq!(with_index.num_rows(), without_index.num_rows());
8040 assert_eq!(with_index.num_rows(), 2);
8041
8042 let with_index_names: BTreeSet<String> = with_index
8043 .column_by_name("name")
8044 .unwrap()
8045 .as_any()
8046 .downcast_ref::<StringArray>()
8047 .unwrap()
8048 .iter()
8049 .map(|s| s.unwrap().to_string())
8050 .collect();
8051
8052 let without_index_names: BTreeSet<String> = without_index
8053 .column_by_name("name")
8054 .unwrap()
8055 .as_any()
8056 .downcast_ref::<StringArray>()
8057 .unwrap()
8058 .iter()
8059 .map(|s| s.unwrap().to_string())
8060 .collect();
8061
8062 assert_eq!(with_index_names, without_index_names);
8063 assert_eq!(
8064 with_index_names,
8065 BTreeSet::from(["alpha".to_string(), "alphabet".to_string()])
8066 );
8067
8068 let starts_with_result = dataset
8070 .scan()
8071 .filter("starts_with(name, 'e')")
8072 .unwrap()
8073 .try_into_batch()
8074 .await
8075 .unwrap();
8076
8077 let starts_with_names: BTreeSet<String> = starts_with_result
8078 .column_by_name("name")
8079 .unwrap()
8080 .as_any()
8081 .downcast_ref::<StringArray>()
8082 .unwrap()
8083 .iter()
8084 .map(|s| s.unwrap().to_string())
8085 .collect();
8086
8087 assert_eq!(
8089 starts_with_names,
8090 BTreeSet::from(["epsilon".to_string(), "eta".to_string()])
8091 );
8092 }
8093
8094 #[tokio::test]
8095 async fn test_like_prefix_with_zone_map() {
8096 use lance_index::scalar::BuiltinIndexType;
8097
8098 let data = gen_batch()
8100 .col(
8101 "name",
8102 array::cycle_utf8_literals(&[
8103 "apple",
8104 "application",
8105 "app",
8106 "banana",
8107 "band",
8108 "testns1",
8109 "testns2",
8110 "test",
8111 "testing",
8112 "zoo",
8113 ]),
8114 )
8115 .col("id", array::step::<Int32Type>())
8116 .into_reader_rows(RowCount::from(100), BatchCount::from(1));
8117
8118 let mut dataset = Dataset::write(data, "memory://test_like_zonemap", None)
8119 .await
8120 .unwrap();
8121
8122 let params = ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap);
8124 dataset
8125 .create_index(
8126 &["name"],
8127 IndexType::Scalar,
8128 Some("name_zonemap".to_string()),
8129 ¶ms,
8130 true,
8131 )
8132 .await
8133 .unwrap();
8134
8135 let mut scanner = dataset.scan();
8137 scanner.filter("name LIKE 'app%'").unwrap();
8138 let plan = scanner.create_plan().await.unwrap();
8139 let plan_str = format!("{:?}", plan);
8140 assert!(
8142 plan_str.contains("ScalarIndexExec") && plan_str.contains("LikePrefix"),
8143 "LIKE 'app%' should use zone map index with LikePrefix, but got: {}",
8144 plan_str
8145 );
8146
8147 let results = dataset
8149 .scan()
8150 .filter("name LIKE 'app%'")
8151 .unwrap()
8152 .try_into_batch()
8153 .await
8154 .unwrap();
8155 let names: Vec<&str> = results
8156 .column_by_name("name")
8157 .unwrap()
8158 .as_any()
8159 .downcast_ref::<StringArray>()
8160 .unwrap()
8161 .iter()
8162 .map(|s| s.unwrap())
8163 .collect();
8164 assert!(names.iter().all(|n| n.starts_with("app")));
8165 assert!(!names.is_empty());
8166
8167 let mut scanner = dataset.scan();
8169 scanner.filter("starts_with(name, 'ban')").unwrap();
8170 let plan = scanner.create_plan().await.unwrap();
8171 let plan_str = format!("{:?}", plan);
8172 assert!(
8173 plan_str.contains("ScalarIndexExec") && plan_str.contains("LikePrefix"),
8174 "starts_with should use zone map index with LikePrefix, but got: {}",
8175 plan_str
8176 );
8177
8178 let results = dataset
8180 .scan()
8181 .filter("starts_with(name, 'ban')")
8182 .unwrap()
8183 .try_into_batch()
8184 .await
8185 .unwrap();
8186 let names: Vec<&str> = results
8187 .column_by_name("name")
8188 .unwrap()
8189 .as_any()
8190 .downcast_ref::<StringArray>()
8191 .unwrap()
8192 .iter()
8193 .map(|s| s.unwrap())
8194 .collect();
8195 assert!(names.iter().all(|n| n.starts_with("ban")));
8196
8197 let mut scanner = dataset.scan();
8199 scanner.filter("name LIKE 'test%2'").unwrap();
8200 let plan = scanner.create_plan().await.unwrap();
8201 let plan_str = format!("{:?}", plan);
8202 assert!(
8203 plan_str.contains("ScalarIndexExec") && plan_str.contains("LikePrefix"),
8204 "LIKE 'test%2' should use zone map index for prefix, but got: {}",
8205 plan_str
8206 );
8207
8208 let mut scanner = dataset.scan();
8210 scanner.filter("name LIKE '%app%'").unwrap();
8211 let plan = scanner.create_plan().await.unwrap();
8212 let plan_str = format!("{:?}", plan);
8213 assert!(
8214 !plan_str.contains("LikePrefix"),
8215 "LIKE '%app%' should not use LikePrefix index, but got: {}",
8216 plan_str
8217 );
8218 }
8219
8220 #[tokio::test]
8221 async fn test_like_prefix_with_segmented_zone_map() {
8222 use lance_index::scalar::BuiltinIndexType;
8223
8224 let data = gen_batch()
8225 .col(
8226 "name",
8227 array::cycle_utf8_literals(&[
8228 "apple",
8229 "application",
8230 "app",
8231 "banana",
8232 "band",
8233 "testns1",
8234 "testns2",
8235 "test",
8236 "testing",
8237 "zoo",
8238 ]),
8239 )
8240 .col("id", array::step::<Int32Type>())
8241 .into_reader_rows(RowCount::from(150), BatchCount::from(6));
8242
8243 let write_params = WriteParams {
8244 max_rows_per_file: 25,
8245 max_rows_per_group: 10,
8246 ..Default::default()
8247 };
8248
8249 let mut dataset = Dataset::write(
8250 data,
8251 "memory://test_like_segmented_zonemap",
8252 Some(write_params),
8253 )
8254 .await
8255 .unwrap();
8256
8257 let fragments = dataset.get_fragments();
8258 assert!(fragments.len() > 1, "expected multiple fragments");
8259
8260 let params = ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap);
8261 let mut segments = Vec::with_capacity(fragments.len());
8262 for fragment in &fragments {
8263 let mut builder = dataset.create_index_builder(&["name"], IndexType::Scalar, ¶ms);
8264 builder = builder
8265 .name("name_zonemap".to_string())
8266 .fragments(vec![fragment.id() as u32]);
8267 segments.push(builder.execute_uncommitted().await.unwrap());
8268 }
8269
8270 dataset
8271 .commit_existing_index_segments("name_zonemap", "name", segments)
8272 .await
8273 .unwrap();
8274
8275 let committed = dataset.load_indices_by_name("name_zonemap").await.unwrap();
8276 assert_eq!(committed.len(), fragments.len());
8277
8278 let mut scanner = dataset.scan();
8279 scanner.filter("name LIKE 'app%'").unwrap();
8280 let plan = scanner.create_plan().await.unwrap();
8281 let plan_str = format!("{:?}", plan);
8282 assert!(
8283 plan_str.contains("ScalarIndexExec") && plan_str.contains("LikePrefix"),
8284 "segmented zonemap should use LikePrefix pruning, but got: {}",
8285 plan_str
8286 );
8287
8288 let with_index = dataset
8289 .scan()
8290 .filter("name LIKE 'app%'")
8291 .unwrap()
8292 .try_into_batch()
8293 .await
8294 .unwrap();
8295 let without_index = dataset
8296 .scan()
8297 .use_scalar_index(false)
8298 .filter("name LIKE 'app%'")
8299 .unwrap()
8300 .try_into_batch()
8301 .await
8302 .unwrap();
8303
8304 let with_index_ids = with_index
8305 .column_by_name("id")
8306 .unwrap()
8307 .as_primitive::<Int32Type>()
8308 .values()
8309 .iter()
8310 .copied()
8311 .collect::<BTreeSet<_>>();
8312 let without_index_ids = without_index
8313 .column_by_name("id")
8314 .unwrap()
8315 .as_primitive::<Int32Type>()
8316 .values()
8317 .iter()
8318 .copied()
8319 .collect::<BTreeSet<_>>();
8320 assert_eq!(with_index_ids, without_index_ids);
8321 assert!(!with_index_ids.is_empty());
8322
8323 let names = with_index
8324 .column_by_name("name")
8325 .unwrap()
8326 .as_any()
8327 .downcast_ref::<StringArray>()
8328 .unwrap()
8329 .iter()
8330 .map(|value| value.unwrap())
8331 .collect::<Vec<_>>();
8332 assert!(names.iter().all(|name| name.starts_with("app")));
8333 }
8334
8335 #[tokio::test]
8336 async fn test_like_prefix_with_segmented_btree() {
8337 let data = gen_batch()
8338 .col(
8339 "name",
8340 array::cycle_utf8_literals(&[
8341 "apple",
8342 "application",
8343 "app",
8344 "banana",
8345 "band",
8346 "testns1",
8347 "testns2",
8348 "test",
8349 "testing",
8350 "zoo",
8351 ]),
8352 )
8353 .col("id", array::step::<Int32Type>())
8354 .into_reader_rows(RowCount::from(150), BatchCount::from(6));
8355
8356 let write_params = WriteParams {
8357 max_rows_per_file: 25,
8358 max_rows_per_group: 10,
8359 ..Default::default()
8360 };
8361
8362 let mut dataset = Dataset::write(
8363 data,
8364 "memory://test_like_segmented_btree",
8365 Some(write_params),
8366 )
8367 .await
8368 .unwrap();
8369
8370 let fragments = dataset.get_fragments();
8371 assert!(fragments.len() > 1, "expected multiple fragments");
8372
8373 let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree);
8374 let mut segments = Vec::with_capacity(fragments.len());
8375 for fragment in &fragments {
8376 let mut builder = dataset.create_index_builder(&["name"], IndexType::BTree, ¶ms);
8377 builder = builder
8378 .name("name_btree".to_string())
8379 .fragments(vec![fragment.id() as u32]);
8380 segments.push(builder.execute_uncommitted().await.unwrap());
8381 }
8382
8383 dataset
8384 .commit_existing_index_segments("name_btree", "name", segments)
8385 .await
8386 .unwrap();
8387
8388 let committed = dataset.load_indices_by_name("name_btree").await.unwrap();
8389 assert_eq!(committed.len(), fragments.len());
8390
8391 let mut scanner = dataset.scan();
8392 scanner.filter("name LIKE 'app%'").unwrap();
8393 let plan = scanner.create_plan().await.unwrap();
8394 let plan_str = format!("{:?}", plan);
8395 assert!(
8396 plan_str.contains("ScalarIndexExec") && plan_str.contains("LikePrefix(Utf8(\"app\"))"),
8397 "segmented btree should use scalar index pruning, but got: {}",
8398 plan_str
8399 );
8400
8401 let with_index = dataset
8402 .scan()
8403 .filter("name LIKE 'app%'")
8404 .unwrap()
8405 .try_into_batch()
8406 .await
8407 .unwrap();
8408 let without_index = dataset
8409 .scan()
8410 .use_scalar_index(false)
8411 .filter("name LIKE 'app%'")
8412 .unwrap()
8413 .try_into_batch()
8414 .await
8415 .unwrap();
8416
8417 let with_index_ids = with_index
8418 .column_by_name("id")
8419 .unwrap()
8420 .as_primitive::<Int32Type>()
8421 .values()
8422 .iter()
8423 .copied()
8424 .collect::<BTreeSet<_>>();
8425 let without_index_ids = without_index
8426 .column_by_name("id")
8427 .unwrap()
8428 .as_primitive::<Int32Type>()
8429 .values()
8430 .iter()
8431 .copied()
8432 .collect::<BTreeSet<_>>();
8433 assert_eq!(with_index_ids, without_index_ids);
8434 assert!(!with_index_ids.is_empty());
8435
8436 let names = with_index
8437 .column_by_name("name")
8438 .unwrap()
8439 .as_any()
8440 .downcast_ref::<StringArray>()
8441 .unwrap()
8442 .iter()
8443 .map(|value| value.unwrap())
8444 .collect::<Vec<_>>();
8445 assert!(names.iter().all(|name| name.starts_with("app")));
8446 }
8447
8448 #[tokio::test]
8449 async fn test_like_prefix_correctness_with_zone_map() {
8450 use lance_index::scalar::BuiltinIndexType;
8451
8452 let names: Vec<&str> = vec![
8454 "alpha", "alphabet", "beta", "gamma", "delta", "epsilon", "eta", "theta", "iota",
8455 "kappa",
8456 ];
8457 let data = RecordBatch::try_new(
8458 Arc::new(ArrowSchema::new(vec![
8459 ArrowField::new("name", DataType::Utf8, false),
8460 ArrowField::new("id", DataType::Int32, false),
8461 ])),
8462 vec![
8463 Arc::new(StringArray::from(names.clone())),
8464 Arc::new(Int32Array::from_iter_values(0..10)),
8465 ],
8466 )
8467 .unwrap();
8468
8469 let reader = RecordBatchIterator::new(
8470 vec![Ok(data)],
8471 Arc::new(ArrowSchema::new(vec![
8472 ArrowField::new("name", DataType::Utf8, false),
8473 ArrowField::new("id", DataType::Int32, false),
8474 ])),
8475 );
8476
8477 let mut dataset = Dataset::write(reader, "memory://test_like_correctness_zonemap", None)
8478 .await
8479 .unwrap();
8480
8481 let params = ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap);
8483 dataset
8484 .create_index(
8485 &["name"],
8486 IndexType::Scalar,
8487 Some("name_zonemap".to_string()),
8488 ¶ms,
8489 true,
8490 )
8491 .await
8492 .unwrap();
8493
8494 let with_index = dataset
8496 .scan()
8497 .filter("name LIKE 'alpha%'")
8498 .unwrap()
8499 .try_into_batch()
8500 .await
8501 .unwrap();
8502
8503 let without_index = dataset
8505 .scan()
8506 .use_scalar_index(false)
8507 .filter("name LIKE 'alpha%'")
8508 .unwrap()
8509 .try_into_batch()
8510 .await
8511 .unwrap();
8512
8513 assert_eq!(with_index.num_rows(), without_index.num_rows());
8515 assert_eq!(with_index.num_rows(), 2);
8516
8517 let with_index_names: BTreeSet<String> = with_index
8518 .column_by_name("name")
8519 .unwrap()
8520 .as_any()
8521 .downcast_ref::<StringArray>()
8522 .unwrap()
8523 .iter()
8524 .map(|s| s.unwrap().to_string())
8525 .collect();
8526
8527 let without_index_names: BTreeSet<String> = without_index
8528 .column_by_name("name")
8529 .unwrap()
8530 .as_any()
8531 .downcast_ref::<StringArray>()
8532 .unwrap()
8533 .iter()
8534 .map(|s| s.unwrap().to_string())
8535 .collect();
8536
8537 assert_eq!(with_index_names, without_index_names);
8538 assert_eq!(
8539 with_index_names,
8540 BTreeSet::from(["alpha".to_string(), "alphabet".to_string()])
8541 );
8542
8543 let starts_with_result = dataset
8545 .scan()
8546 .filter("starts_with(name, 'e')")
8547 .unwrap()
8548 .try_into_batch()
8549 .await
8550 .unwrap();
8551
8552 let starts_with_names: BTreeSet<String> = starts_with_result
8553 .column_by_name("name")
8554 .unwrap()
8555 .as_any()
8556 .downcast_ref::<StringArray>()
8557 .unwrap()
8558 .iter()
8559 .map(|s| s.unwrap().to_string())
8560 .collect();
8561
8562 assert_eq!(
8564 starts_with_names,
8565 BTreeSet::from(["epsilon".to_string(), "eta".to_string()])
8566 );
8567 }
8568
8569 #[rstest]
8570 #[tokio::test]
8571 async fn test_late_materialization(
8572 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
8573 data_storage_version: LanceFileVersion,
8574 ) {
8575 use lance_io::assert_io_lt;
8576 use lance_table::io::commit::RenameCommitHandler;
8579 let data = gen_batch()
8580 .col(
8581 "vector",
8582 array::rand_vec::<Float32Type>(Dimension::from(32)),
8583 )
8584 .col("indexed", array::step::<Int32Type>())
8585 .col("not_indexed", array::step::<Int32Type>())
8586 .into_reader_rows(RowCount::from(1000), BatchCount::from(20));
8587
8588 let mut dataset = Dataset::write(
8589 data,
8590 "memory://test",
8591 Some(WriteParams {
8592 commit_handler: Some(Arc::new(RenameCommitHandler)),
8593 data_storage_version: Some(data_storage_version),
8594 ..Default::default()
8595 }),
8596 )
8597 .await
8598 .unwrap();
8599 dataset
8600 .create_index(
8601 &["indexed"],
8602 IndexType::Scalar,
8603 None,
8604 &ScalarIndexParams::default(),
8605 false,
8606 )
8607 .await
8608 .unwrap();
8609
8610 let _ = dataset.object_store.as_ref().io_stats_incremental(); dataset.scan().try_into_batch().await.unwrap();
8613 let io_stats = dataset.object_store.as_ref().io_stats_incremental();
8614 let full_scan_bytes = io_stats.read_bytes;
8615
8616 dataset
8618 .scan()
8619 .use_stats(false)
8620 .filter("not_indexed = 50")
8621 .unwrap()
8622 .try_into_batch()
8623 .await
8624 .unwrap();
8625 let io_stats = dataset.object_store.as_ref().io_stats_incremental();
8626 assert_io_lt!(io_stats, read_bytes, full_scan_bytes);
8627 let filtered_scan_bytes = io_stats.read_bytes;
8628
8629 if data_storage_version == LanceFileVersion::Legacy {
8632 dataset
8633 .scan()
8634 .filter("not_indexed = 50")
8635 .unwrap()
8636 .try_into_batch()
8637 .await
8638 .unwrap();
8639 let io_stats = dataset.object_store.as_ref().io_stats_incremental();
8640 assert_io_lt!(io_stats, read_bytes, filtered_scan_bytes);
8641 }
8642
8643 dataset
8647 .scan()
8648 .filter("indexed = 50")
8649 .unwrap()
8650 .try_into_batch()
8651 .await
8652 .unwrap();
8653 let io_stats = dataset.object_store.as_ref().io_stats_incremental();
8654 assert_io_lt!(io_stats, read_bytes, full_scan_bytes);
8655 let index_scan_bytes = io_stats.read_bytes;
8656
8657 dataset
8660 .scan()
8661 .filter("indexed = 50")
8662 .unwrap()
8663 .try_into_batch()
8664 .await
8665 .unwrap();
8666 let io_stats = dataset.object_store.as_ref().io_stats_incremental();
8667 assert_io_lt!(io_stats, read_bytes, index_scan_bytes);
8668 }
8669
8670 #[rstest]
8671 #[tokio::test]
8672 async fn test_project_nested(
8673 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
8674 data_storage_version: LanceFileVersion,
8675 ) -> Result<()> {
8676 let struct_i_field = ArrowField::new("i", DataType::Int32, true);
8677 let struct_o_field = ArrowField::new("o", DataType::Utf8, true);
8678 let schema = Arc::new(ArrowSchema::new(vec![
8679 ArrowField::new(
8680 "struct",
8681 DataType::Struct(vec![struct_i_field.clone(), struct_o_field.clone()].into()),
8682 true,
8683 ),
8684 ArrowField::new("s", DataType::Utf8, true),
8685 ]));
8686
8687 let input_batches: Vec<RecordBatch> = (0..5)
8688 .map(|i| {
8689 let struct_i_arr: Arc<Int32Array> =
8690 Arc::new(Int32Array::from_iter_values(i * 20..(i + 1) * 20));
8691 let struct_o_arr: Arc<StringArray> = Arc::new(StringArray::from_iter_values(
8692 (i * 20..(i + 1) * 20).map(|v| format!("o-{:02}", v)),
8693 ));
8694 RecordBatch::try_new(
8695 schema.clone(),
8696 vec![
8697 Arc::new(StructArray::from(vec![
8698 (Arc::new(struct_i_field.clone()), struct_i_arr as ArrayRef),
8699 (Arc::new(struct_o_field.clone()), struct_o_arr as ArrayRef),
8700 ])),
8701 Arc::new(StringArray::from_iter_values(
8702 (i * 20..(i + 1) * 20).map(|v| format!("s-{}", v)),
8703 )),
8704 ],
8705 )
8706 .unwrap()
8707 })
8708 .collect();
8709 let batches =
8710 RecordBatchIterator::new(input_batches.clone().into_iter().map(Ok), schema.clone());
8711 let test_dir = TempStrDir::default();
8712 let test_uri = &test_dir;
8713 let write_params = WriteParams {
8714 max_rows_per_file: 40,
8715 max_rows_per_group: 10,
8716 data_storage_version: Some(data_storage_version),
8717 ..Default::default()
8718 };
8719 Dataset::write(batches, test_uri, Some(write_params))
8720 .await
8721 .unwrap();
8722
8723 let dataset = Dataset::open(test_uri).await.unwrap();
8724
8725 let batches = dataset
8726 .scan()
8727 .project(&["struct.i"])
8728 .unwrap()
8729 .try_into_stream()
8730 .await
8731 .unwrap()
8732 .try_collect::<Vec<_>>()
8733 .await
8734 .unwrap();
8735 let batch = concat_batches(&batches[0].schema(), &batches).unwrap();
8736 assert!(batch.column_by_name("struct.i").is_some());
8737 Ok(())
8738 }
8739
8740 #[rstest]
8741 #[tokio::test]
8742 async fn test_plans(
8743 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
8744 data_storage_version: LanceFileVersion,
8745 #[values(false, true)] stable_row_id: bool,
8746 ) -> Result<()> {
8747 use lance_index::scalar::inverted::query::BoostQuery;
8750 let dim = 256;
8751 let mut dataset =
8752 TestVectorDataset::new_with_dimension(data_storage_version, stable_row_id, dim).await?;
8753 let lance_schema = dataset.dataset.schema();
8754
8755 if data_storage_version == LanceFileVersion::Legacy {
8759 log::info!("Test case: Pushdown scan");
8760 assert_plan_equals(
8761 &dataset.dataset,
8762 |scan| scan.project(&["s"])?.filter("i > 10 and i < 20"),
8763 "LancePushdownScan: uri=..., projection=[s], predicate=i > Int32(10) AND i < Int32(20), row_id=false, row_addr=false, ordered=true"
8764 ).await?;
8765 }
8766
8767 log::info!("Test case: Project and filter");
8768 let expected = if data_storage_version == LanceFileVersion::Legacy {
8769 "ProjectionExec: expr=[s@2 as s]
8770 Take: columns=\"i, _rowid, (s)\"
8771 CoalesceBatchesExec: target_batch_size=8192
8772 FilterExec: i@0 > 10 AND i@0 < 20
8773 LanceScan: uri..., projection=[i], row_id=true, row_addr=false, ordered=true, range=None"
8774 } else {
8775 "ProjectionExec: expr=[s@2 as s]
8776 Take: columns=\"i, _rowid, (s)\"
8777 CoalesceBatchesExec: target_batch_size=8192
8778 LanceRead: ..., projection=[i], num_fragments=2, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10) AND i < Int32(20), refine_filter=i > Int32(10) AND i < Int32(20)"
8779 };
8780 assert_plan_equals(
8781 &dataset.dataset,
8782 |scan| {
8783 scan.use_stats(false)
8784 .project(&["s"])?
8785 .filter("i > 10 and i < 20")
8786 },
8787 expected,
8788 )
8789 .await?;
8790
8791 log::info!("Test case: Late materialization");
8794 let expected = if data_storage_version == LanceFileVersion::Legacy {
8795 "ProjectionExec: expr=[i@0 as i, s@1 as s, vec@3 as vec]
8796 Take: columns=\"i, s, _rowid, (vec)\"
8797 CoalesceBatchesExec: target_batch_size=8192
8798 FilterExec: s@1 IS NOT NULL
8799 LanceScan: uri..., projection=[i, s], row_id=true, row_addr=false, ordered=true, range=None"
8800 } else {
8801 "ProjectionExec: expr=[i@0 as i, s@1 as s, vec@3 as vec]
8802 Take: columns=\"i, s, _rowid, (vec)\"
8803 CoalesceBatchesExec: target_batch_size=8192
8804 LanceRead: uri=..., projection=[i, s], num_fragments=2, range_before=None, range_after=None, \
8805 row_id=true, row_addr=false, full_filter=s IS NOT NULL, refine_filter=s IS NOT NULL"
8806 };
8807 assert_plan_equals(
8808 &dataset.dataset,
8809 |scan| scan.use_stats(false).filter("s IS NOT NULL"),
8810 expected,
8811 )
8812 .await?;
8813
8814 log::info!("Test case: Custom materialization (all early)");
8816 let expected = if data_storage_version == LanceFileVersion::Legacy {
8817 "ProjectionExec: expr=[i@0 as i, s@1 as s, vec@2 as vec]
8818 FilterExec: s@1 IS NOT NULL
8819 LanceScan: uri..., projection=[i, s, vec], row_id=true, row_addr=false, ordered=true, range=None"
8820 } else {
8821 "ProjectionExec: expr=[i@0 as i, s@1 as s, vec@2 as vec]
8822 LanceRead: uri=..., projection=[i, s, vec], num_fragments=2, range_before=None, \
8823 range_after=None, row_id=true, row_addr=false, full_filter=s IS NOT NULL, refine_filter=s IS NOT NULL"
8824 };
8825 assert_plan_equals(
8826 &dataset.dataset,
8827 |scan| {
8828 scan.use_stats(false)
8829 .materialization_style(MaterializationStyle::AllEarly)
8830 .filter("s IS NOT NULL")
8831 },
8832 expected,
8833 )
8834 .await?;
8835
8836 log::info!("Test case: Custom materialization 2 (all late)");
8837 let expected = if data_storage_version == LanceFileVersion::Legacy {
8838 "ProjectionExec: expr=[i@2 as i, s@0 as s, vec@3 as vec]
8839 Take: columns=\"s, _rowid, (i), (vec)\"
8840 CoalesceBatchesExec: target_batch_size=8192
8841 FilterExec: s@0 IS NOT NULL
8842 LanceScan: uri..., projection=[s], row_id=true, row_addr=false, ordered=true, range=None"
8843 } else {
8844 "ProjectionExec: expr=[i@2 as i, s@0 as s, vec@3 as vec]
8845 Take: columns=\"s, _rowid, (i), (vec)\"
8846 CoalesceBatchesExec: target_batch_size=8192
8847 LanceRead: uri=..., projection=[s], num_fragments=2, range_before=None, \
8848 range_after=None, row_id=true, row_addr=false, full_filter=s IS NOT NULL, refine_filter=s IS NOT NULL"
8849 };
8850 assert_plan_equals(
8851 &dataset.dataset,
8852 |scan| {
8853 scan.use_stats(false)
8854 .materialization_style(MaterializationStyle::AllLate)
8855 .filter("s IS NOT NULL")
8856 },
8857 expected,
8858 )
8859 .await?;
8860
8861 log::info!("Test case: Custom materialization 3 (mixed)");
8862 let expected = if data_storage_version == LanceFileVersion::Legacy {
8863 "ProjectionExec: expr=[i@3 as i, s@0 as s, vec@1 as vec]
8864 Take: columns=\"s, vec, _rowid, (i)\"
8865 CoalesceBatchesExec: target_batch_size=8192
8866 FilterExec: s@0 IS NOT NULL
8867 LanceScan: uri..., projection=[s, vec], row_id=true, row_addr=false, ordered=true, range=None"
8868 } else {
8869 "ProjectionExec: expr=[i@3 as i, s@0 as s, vec@1 as vec]
8870 Take: columns=\"s, vec, _rowid, (i)\"
8871 CoalesceBatchesExec: target_batch_size=8192
8872 LanceRead: uri=..., projection=[s, vec], num_fragments=2, range_before=None, range_after=None, \
8873 row_id=true, row_addr=false, full_filter=s IS NOT NULL, refine_filter=s IS NOT NULL"
8874 };
8875 assert_plan_equals(
8876 &dataset.dataset,
8877 |scan| {
8878 scan.use_stats(false)
8879 .materialization_style(
8880 MaterializationStyle::all_early_except(&["i"], lance_schema).unwrap(),
8881 )
8882 .filter("s IS NOT NULL")
8883 },
8884 expected,
8885 )
8886 .await?;
8887
8888 log::info!("Test case: Scan out of order");
8889 let expected = if data_storage_version == LanceFileVersion::Legacy {
8890 "LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=false, range=None"
8891 } else {
8892 "LanceRead: uri=..., projection=[s], num_fragments=2, range_before=None, range_after=None, row_id=true, \
8893 row_addr=false, full_filter=--, refine_filter=--"
8894 };
8895 assert_plan_equals(
8896 &dataset.dataset,
8897 |scan| Ok(scan.project(&["s"])?.with_row_id().scan_in_order(false)),
8898 expected,
8899 )
8900 .await?;
8901
8902 let q: Float32Array = (32..32 + dim).map(|v| v as f32).collect();
8905 log::info!("Test case: Basic KNN");
8906 let expected = if data_storage_version == LanceFileVersion::Legacy {
8907 "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
8908 Take: columns=\"vec, _rowid, _distance, (i), (s)\"
8909 CoalesceBatchesExec: target_batch_size=8192
8910 FilterExec: _distance@2 IS NOT NULL
8911 SortExec: TopK(fetch=5), expr=...
8912 KNNVectorDistance: metric=l2
8913 LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None"
8914 } else {
8915 "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
8916 Take: columns=\"vec, _rowid, _distance, (i), (s)\"
8917 CoalesceBatchesExec: target_batch_size=8192
8918 FilterExec: _distance@2 IS NOT NULL
8919 SortExec: TopK(fetch=5), expr=...
8920 KNNVectorDistance: metric=l2
8921 LanceRead: uri=..., projection=[vec], num_fragments=2, range_before=None, range_after=None, \
8922 row_id=true, row_addr=false, full_filter=--, refine_filter=--"
8923 };
8924 assert_plan_equals(
8925 &dataset.dataset,
8926 |scan| scan.nearest("vec", &q, 5),
8927 expected,
8928 )
8929 .await?;
8930
8931 let q: Float32Array = (32..32 + dim).map(|v| v as f32).collect();
8934 log::info!("Test case: KNN with extraneous limit");
8935 let expected = if data_storage_version == LanceFileVersion::Legacy {
8936 "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
8937 Take: columns=\"vec, _rowid, _distance, (i), (s)\"
8938 CoalesceBatchesExec: target_batch_size=8192
8939 GlobalLimitExec: skip=0, fetch=1
8940 FilterExec: _distance@2 IS NOT NULL
8941 SortExec: TopK(fetch=5), expr=...
8942 KNNVectorDistance: metric=l2
8943 LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None"
8944 } else {
8945 "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
8946 Take: columns=\"vec, _rowid, _distance, (i), (s)\"
8947 CoalesceBatchesExec: target_batch_size=8192
8948 GlobalLimitExec: skip=0, fetch=1
8949 FilterExec: _distance@2 IS NOT NULL
8950 SortExec: TopK(fetch=5), expr=...
8951 KNNVectorDistance: metric=l2
8952 LanceRead: uri=..., projection=[vec], num_fragments=2, range_before=None, range_after=None, \
8953 row_id=true, row_addr=false, full_filter=--, refine_filter=--"
8954 };
8955 assert_plan_equals(
8956 &dataset.dataset,
8957 |scan| scan.nearest("vec", &q, 5)?.limit(Some(1), None),
8958 expected,
8959 )
8960 .await?;
8961
8962 dataset.make_vector_index().await?;
8965 log::info!("Test case: Basic ANN");
8966 let expected =
8967 "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
8968 Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
8969 CoalesceBatchesExec: target_batch_size=8192
8970 SortExec: TopK(fetch=42), expr=...
8971 ANNSubIndex: name=..., k=42, deltas=1, metric=L2
8972 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1";
8973 assert_plan_equals(
8974 &dataset.dataset,
8975 |scan| scan.nearest("vec", &q, 42),
8976 expected,
8977 )
8978 .await?;
8979
8980 log::info!("Test case: ANN with refine");
8981 let expected =
8982 "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
8983 Take: columns=\"_rowid, vec, _distance, (i), (s)\"
8984 CoalesceBatchesExec: target_batch_size=8192
8985 FilterExec: _distance@... IS NOT NULL
8986 SortExec: TopK(fetch=10), expr=...
8987 KNNVectorDistance: metric=l2
8988 Take: columns=\"_distance, _rowid, (vec)\"
8989 CoalesceBatchesExec: target_batch_size=8192
8990 SortExec: TopK(fetch=40), expr=...
8991 ANNSubIndex: name=..., k=40, deltas=1, metric=L2
8992 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1";
8993 assert_plan_equals(
8994 &dataset.dataset,
8995 |scan| Ok(scan.nearest("vec", &q, 10)?.refine(4)),
8996 expected,
8997 )
8998 .await?;
8999
9000 log::info!("Test case: ANN with index disabled");
9002 let expected = if data_storage_version == LanceFileVersion::Legacy {
9003 "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
9004 Take: columns=\"vec, _rowid, _distance, (i), (s)\"
9005 CoalesceBatchesExec: target_batch_size=8192
9006 FilterExec: _distance@... IS NOT NULL
9007 SortExec: TopK(fetch=13), expr=...
9008 KNNVectorDistance: metric=l2
9009 LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None"
9010 } else {
9011 "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@0 as vec, _distance@2 as _distance]
9012 Take: columns=\"vec, _rowid, _distance, (i), (s)\"
9013 CoalesceBatchesExec: target_batch_size=8192
9014 FilterExec: _distance@... IS NOT NULL
9015 SortExec: TopK(fetch=13), expr=...
9016 KNNVectorDistance: metric=l2
9017 LanceRead: uri=..., projection=[vec], num_fragments=2, range_before=None, range_after=None, \
9018 row_id=true, row_addr=false, full_filter=--, refine_filter=--"
9019 };
9020 assert_plan_equals(
9021 &dataset.dataset,
9022 |scan| Ok(scan.nearest("vec", &q, 13)?.use_index(false)),
9023 expected,
9024 )
9025 .await?;
9026
9027 log::info!("Test case: ANN with postfilter");
9028 let expected = "ProjectionExec: expr=[s@3 as s, vec@4 as vec, _distance@0 as _distance, _rowid@1 as _rowid]
9029 Take: columns=\"_distance, _rowid, i, (s), (vec)\"
9030 CoalesceBatchesExec: target_batch_size=8192
9031 FilterExec: i@2 > 10
9032 Take: columns=\"_distance, _rowid, (i)\"
9033 CoalesceBatchesExec: target_batch_size=8192
9034 SortExec: TopK(fetch=17), expr=...
9035 ANNSubIndex: name=..., k=17, deltas=1, metric=L2
9036 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1";
9037 assert_plan_equals(
9038 &dataset.dataset,
9039 |scan| {
9040 Ok(scan
9041 .nearest("vec", &q, 17)?
9042 .filter("i > 10")?
9043 .project(&["s", "vec"])?
9044 .with_row_id())
9045 },
9046 expected,
9047 )
9048 .await?;
9049
9050 log::info!("Test case: ANN with prefilter");
9051 let expected = if data_storage_version == LanceFileVersion::Legacy {
9052 "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
9053 Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
9054 CoalesceBatchesExec: target_batch_size=8192
9055 SortExec: TopK(fetch=17), expr=...
9056 ANNSubIndex: name=..., k=17, deltas=1, metric=L2
9057 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9058 FilterExec: i@0 > 10
9059 LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None"
9060 } else {
9061 "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
9062 Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
9063 CoalesceBatchesExec: target_batch_size=8192
9064 SortExec: TopK(fetch=17), expr=...
9065 ANNSubIndex: name=..., k=17, deltas=1, metric=L2
9066 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9067 LanceRead: uri=..., projection=[], num_fragments=2, range_before=None, range_after=None, \
9068 row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=i > Int32(10)
9069"
9070 };
9071 assert_plan_equals(
9072 &dataset.dataset,
9073 |scan| {
9074 Ok(scan
9075 .nearest("vec", &q, 17)?
9076 .filter("i > 10")?
9077 .prefilter(true))
9078 },
9079 expected,
9080 )
9081 .await?;
9082
9083 dataset.append_new_data().await?;
9084 log::info!("Test case: Combined KNN/ANN");
9085 let expected = "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9086 Take: columns=\"_rowid, vec, _distance, (i), (s)\"
9087 CoalesceBatchesExec: target_batch_size=8192
9088 FilterExec: _distance@... IS NOT NULL
9089 SortExec: TopK(fetch=6), expr=...
9090 KNNVectorDistance: metric=l2
9091 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9092 UnionExec
9093 ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec]
9094 FilterExec: _distance@... IS NOT NULL
9095 SortExec: TopK(fetch=6), expr=...
9096 KNNVectorDistance: metric=l2
9097 LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None
9098 Take: columns=\"_distance, _rowid, (vec)\"
9099 CoalesceBatchesExec: target_batch_size=8192
9100 SortExec: TopK(fetch=6), expr=...
9101 ANNSubIndex: name=..., k=6, deltas=1, metric=L2
9102 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1";
9103 assert_plan_equals(
9104 &dataset.dataset,
9105 |scan| scan.nearest("vec", &q, 6),
9106 expected,
9109 )
9110 .await?;
9111
9112 log::info!("Test case: Combined KNN/ANN with postfilter");
9114 let expected = "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9115 Take: columns=\"_rowid, vec, _distance, i, (s)\"
9116 CoalesceBatchesExec: target_batch_size=8192
9117 FilterExec: i@3 > 10
9118 Take: columns=\"_rowid, vec, _distance, (i)\"
9119 CoalesceBatchesExec: target_batch_size=8192
9120 FilterExec: _distance@... IS NOT NULL
9121 SortExec: TopK(fetch=15), expr=...
9122 KNNVectorDistance: metric=l2
9123 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9124 UnionExec
9125 ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec]
9126 FilterExec: _distance@... IS NOT NULL
9127 SortExec: TopK(fetch=15), expr=...
9128 KNNVectorDistance: metric=l2
9129 LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None
9130 Take: columns=\"_distance, _rowid, (vec)\"
9131 CoalesceBatchesExec: target_batch_size=8192
9132 SortExec: TopK(fetch=15), expr=...
9133 ANNSubIndex: name=..., k=15, deltas=1, metric=L2
9134 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1";
9135 assert_plan_equals(
9136 &dataset.dataset,
9137 |scan| scan.nearest("vec", &q, 15)?.filter("i > 10"),
9138 expected,
9139 )
9140 .await?;
9141
9142 log::info!("Test case: Combined KNN/ANN with prefilter");
9144 let expected = if data_storage_version == LanceFileVersion::Legacy {
9145 "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9146 Take: columns=\"_rowid, vec, _distance, (i), (s)\"
9147 CoalesceBatchesExec: target_batch_size=8192
9148 FilterExec: _distance@... IS NOT NULL
9149 SortExec: TopK(fetch=5), expr=...
9150 KNNVectorDistance: metric=l2
9151 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9152 UnionExec
9153 ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec]
9154 FilterExec: _distance@... IS NOT NULL
9155 SortExec: TopK(fetch=5), expr=...
9156 KNNVectorDistance: metric=l2
9157 FilterExec: i@1 > 10
9158 LanceScan: uri=..., projection=[vec, i], row_id=true, row_addr=false, ordered=false, range=None
9159 Take: columns=\"_distance, _rowid, (vec)\"
9160 CoalesceBatchesExec: target_batch_size=8192
9161 SortExec: TopK(fetch=5), expr=...
9162 ANNSubIndex: name=..., k=5, deltas=1, metric=L2
9163 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9164 FilterExec: i@0 > 10
9165 LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None"
9166 } else {
9167 "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9168 Take: columns=\"_rowid, vec, _distance, (i), (s)\"
9169 CoalesceBatchesExec: target_batch_size=8192
9170 FilterExec: _distance@... IS NOT NULL
9171 SortExec: TopK(fetch=5), expr=...
9172 KNNVectorDistance: metric=l2
9173 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9174 UnionExec
9175 ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec]
9176 FilterExec: _distance@... IS NOT NULL
9177 SortExec: TopK(fetch=5), expr=...
9178 KNNVectorDistance: metric=l2
9179 FilterExec: i@1 > 10
9180 LanceScan: uri=..., projection=[vec, i], row_id=true, row_addr=false, ordered=false, range=None
9181 Take: columns=\"_distance, _rowid, (vec)\"
9182 CoalesceBatchesExec: target_batch_size=8192
9183 SortExec: TopK(fetch=5), expr=...
9184 ANNSubIndex: name=..., k=5, deltas=1, metric=L2
9185 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9186 LanceRead: uri=..., projection=[], num_fragments=2, range_before=None, range_after=None, \
9187 row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=i > Int32(10)"
9188 };
9189 assert_plan_equals(
9190 &dataset.dataset,
9191 |scan| {
9192 Ok(scan
9193 .nearest("vec", &q, 5)?
9194 .filter("i > 10")?
9195 .prefilter(true))
9196 },
9197 expected,
9200 )
9201 .await?;
9202
9203 dataset.make_vector_index().await?;
9207 dataset.make_scalar_index().await?;
9208
9209 log::info!("Test case: ANN with scalar index");
9210 let expected =
9211 "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
9212 Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
9213 CoalesceBatchesExec: target_batch_size=8192
9214 SortExec: TopK(fetch=5), expr=...
9215 ANNSubIndex: name=..., k=5, deltas=1, metric=L2
9216 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9217 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)";
9218 assert_plan_equals(
9219 &dataset.dataset,
9220 |scan| {
9221 Ok(scan
9222 .nearest("vec", &q, 5)?
9223 .filter("i > 10")?
9224 .prefilter(true))
9225 },
9226 expected,
9227 )
9228 .await?;
9229
9230 log::info!("Test case: ANN with scalar index disabled");
9231 let expected = if data_storage_version == LanceFileVersion::Legacy {
9232 "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
9233 Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
9234 CoalesceBatchesExec: target_batch_size=8192
9235 SortExec: TopK(fetch=5), expr=...
9236 ANNSubIndex: name=..., k=5, deltas=1, metric=L2
9237 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9238 FilterExec: i@0 > 10
9239 LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None"
9240 } else {
9241 "ProjectionExec: expr=[i@2 as i, s@3 as s, vec@4 as vec, _distance@0 as _distance]
9242 Take: columns=\"_distance, _rowid, (i), (s), (vec)\"
9243 CoalesceBatchesExec: target_batch_size=8192
9244 SortExec: TopK(fetch=5), expr=...
9245 ANNSubIndex: name=..., k=5, deltas=1, metric=L2
9246 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9247 LanceRead: uri=..., projection=[], num_fragments=3, range_before=None, \
9248 range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=i > Int32(10)"
9249 };
9250 assert_plan_equals(
9251 &dataset.dataset,
9252 |scan| {
9253 Ok(scan
9254 .nearest("vec", &q, 5)?
9255 .use_scalar_index(false)
9256 .filter("i > 10")?
9257 .prefilter(true))
9258 },
9259 expected,
9260 )
9261 .await?;
9262
9263 dataset.append_new_data().await?;
9264
9265 log::info!("Test case: Combined KNN/ANN with scalar index");
9266 let expected = "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9267 Take: columns=\"_rowid, vec, _distance, (i), (s)\"
9268 CoalesceBatchesExec: target_batch_size=8192
9269 FilterExec: _distance@... IS NOT NULL
9270 SortExec: TopK(fetch=8), expr=...
9271 KNNVectorDistance: metric=l2
9272 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9273 UnionExec
9274 ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec]
9275 FilterExec: _distance@... IS NOT NULL
9276 SortExec: TopK(fetch=8), expr=...
9277 KNNVectorDistance: metric=l2
9278 FilterExec: i@1 > 10
9279 LanceScan: uri=..., projection=[vec, i], row_id=true, row_addr=false, ordered=false, range=None
9280 Take: columns=\"_distance, _rowid, (vec)\"
9281 CoalesceBatchesExec: target_batch_size=8192
9282 SortExec: TopK(fetch=8), expr=...
9283 ANNSubIndex: name=..., k=8, deltas=1, metric=L2
9284 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9285 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)";
9286 assert_plan_equals(
9287 &dataset.dataset,
9288 |scan| {
9289 Ok(scan
9290 .nearest("vec", &q, 8)?
9291 .filter("i > 10")?
9292 .prefilter(true))
9293 },
9294 expected,
9295 )
9296 .await?;
9297
9298 log::info!(
9300 "Test case: Combined KNN/ANN with updated scalar index and outdated vector index"
9301 );
9302 let expected = "ProjectionExec: expr=[i@3 as i, s@4 as s, vec@1 as vec, _distance@2 as _distance]
9303 Take: columns=\"_rowid, vec, _distance, (i), (s)\"
9304 CoalesceBatchesExec: target_batch_size=8192
9305 FilterExec: _distance@... IS NOT NULL
9306 SortExec: TopK(fetch=11), expr=...
9307 KNNVectorDistance: metric=l2
9308 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9309 UnionExec
9310 ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec]
9311 FilterExec: _distance@... IS NOT NULL
9312 SortExec: TopK(fetch=11), expr=...
9313 KNNVectorDistance: metric=l2
9314 FilterExec: i@1 > 10
9315 LanceScan: uri=..., projection=[vec, i], row_id=true, row_addr=false, ordered=false, range=None
9316 Take: columns=\"_distance, _rowid, (vec)\"
9317 CoalesceBatchesExec: target_batch_size=8192
9318 SortExec: TopK(fetch=11), expr=...
9319 ANNSubIndex: name=..., k=11, deltas=1, metric=L2
9320 ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1
9321 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)";
9322 dataset.make_scalar_index().await?;
9323 assert_plan_equals(
9324 &dataset.dataset,
9325 |scan| {
9326 Ok(scan
9327 .nearest("vec", &q, 11)?
9328 .filter("i > 10")?
9329 .prefilter(true))
9330 },
9331 expected,
9332 )
9333 .await?;
9334
9335 log::info!("Test case: Filtered read with scalar index");
9338 let expected = if data_storage_version == LanceFileVersion::Legacy {
9339 "ProjectionExec: expr=[s@1 as s]
9340 Take: columns=\"_rowid, (s)\"
9341 CoalesceBatchesExec: target_batch_size=8192
9342 MaterializeIndex: query=[i > 10]@i_idx(BTree)"
9343 } else {
9344 "LanceRead: uri=..., projection=[s], num_fragments=4, range_before=None, \
9345 range_after=None, row_id=false, row_addr=false, full_filter=i > Int32(10), refine_filter=--
9346 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"
9347 };
9348 assert_plan_equals(
9349 &dataset.dataset,
9350 |scan| scan.project(&["s"])?.filter("i > 10"),
9351 expected,
9352 )
9353 .await?;
9354
9355 if data_storage_version != LanceFileVersion::Legacy {
9356 log::info!(
9357 "Test case: Filtered read with scalar index disabled (late materialization)"
9358 );
9359 assert_plan_equals(
9360 &dataset.dataset,
9361 |scan| {
9362 scan.project(&["s"])?
9363 .use_scalar_index(false)
9364 .filter("i > 10")
9365 },
9366 "ProjectionExec: expr=[s@2 as s]
9367 Take: columns=\"i, _rowid, (s)\"
9368 CoalesceBatchesExec: target_batch_size=8192
9369 LanceRead: uri=..., projection=[i], num_fragments=4, range_before=None, \
9370 range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=i > Int32(10)",
9371 )
9372 .await?;
9373 }
9374
9375 log::info!("Test case: Empty projection");
9376 let expected = if data_storage_version == LanceFileVersion::Legacy {
9377 "ProjectionExec: expr=[_rowaddr@0 as _rowaddr]
9378 AddRowAddrExec
9379 MaterializeIndex: query=[i > 10]@i_idx(BTree)"
9380 } else {
9381 "LanceRead: uri=..., projection=[], num_fragments=4, range_before=None, \
9382 range_after=None, row_id=false, row_addr=true, full_filter=i > Int32(10), refine_filter=--
9383 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"
9384 };
9385 assert_plan_equals(
9386 &dataset.dataset,
9387 |scan| {
9388 scan.filter("i > 10")
9389 .unwrap()
9390 .with_row_address()
9391 .project::<&str>(&[])
9392 },
9393 expected,
9394 )
9395 .await?;
9396
9397 dataset.append_new_data().await?;
9398 log::info!("Test case: Combined Scalar/non-scalar filtered read");
9399 let expected = if data_storage_version == LanceFileVersion::Legacy {
9400 "ProjectionExec: expr=[s@1 as s]
9401 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9402 UnionExec
9403 Take: columns=\"_rowid, (s)\"
9404 CoalesceBatchesExec: target_batch_size=8192
9405 MaterializeIndex: query=[i > 10]@i_idx(BTree)
9406 ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s]
9407 FilterExec: i@0 > 10
9408 LanceScan: uri=..., projection=[i, s], row_id=true, row_addr=false, ordered=false, range=None"
9409 } else {
9410 "LanceRead: uri=..., projection=[s], num_fragments=5, range_before=None, \
9411 range_after=None, row_id=false, row_addr=false, full_filter=i > Int32(10), refine_filter=--
9412 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"
9413 };
9414 assert_plan_equals(
9415 &dataset.dataset,
9416 |scan| scan.project(&["s"])?.filter("i > 10"),
9417 expected,
9418 )
9419 .await?;
9420
9421 log::info!("Test case: Combined Scalar/non-scalar filtered read with empty projection");
9422 let expected = if data_storage_version == LanceFileVersion::Legacy {
9423 "ProjectionExec: expr=[_rowaddr@0 as _rowaddr]
9424 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9425 UnionExec
9426 AddRowAddrExec
9427 MaterializeIndex: query=[i > 10]@i_idx(BTree)
9428 ProjectionExec: expr=[_rowaddr@2 as _rowaddr, _rowid@1 as _rowid]
9429 FilterExec: i@0 > 10
9430 LanceScan: uri=..., projection=[i], row_id=true, row_addr=true, ordered=false, range=None"
9431 } else {
9432 "LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, \
9433 range_after=None, row_id=false, row_addr=true, full_filter=i > Int32(10), refine_filter=--
9434 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"
9435 };
9436 assert_plan_equals(
9437 &dataset.dataset,
9438 |scan| {
9439 scan.filter("i > 10")
9440 .unwrap()
9441 .with_row_address()
9442 .project::<&str>(&[])
9443 },
9444 expected,
9445 )
9446 .await?;
9447
9448 log::info!("Test case: Dynamic projection");
9451 let expected = if data_storage_version == LanceFileVersion::Legacy {
9452 "ProjectionExec: expr=[regexp_match(s@1, .*) as matches]
9453 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9454 UnionExec
9455 Take: columns=\"_rowid, (s)\"
9456 CoalesceBatchesExec: target_batch_size=8192
9457 MaterializeIndex: query=[i > 10]@i_idx(BTree)
9458 ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s]
9459 FilterExec: i@0 > 10
9460 LanceScan: uri=..., row_id=true, row_addr=false, ordered=false, range=None"
9461 } else {
9462 "ProjectionExec: expr=[regexp_match(s@0, .*) as matches]
9463 LanceRead: uri=..., projection=[s], num_fragments=5, range_before=None, \
9464 range_after=None, row_id=false, row_addr=false, full_filter=i > Int32(10), refine_filter=--
9465 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"
9466 };
9467 assert_plan_equals(
9468 &dataset.dataset,
9469 |scan| {
9470 scan.project_with_transform(&[("matches", "regexp_match(s, \".*\")")])?
9471 .filter("i > 10")
9472 },
9473 expected,
9474 )
9475 .await?;
9476
9477 dataset.make_fts_index().await?;
9481 log::info!("Test case: Full text search (match query)");
9482 let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9483 Take: columns="_rowid, _score, (s)"
9484 CoalesceBatchesExec: target_batch_size=8192
9485 MatchQuery: column=s, query=hello"#;
9486 assert_plan_equals(
9487 &dataset.dataset,
9488 |scan| {
9489 scan.project(&["s"])?
9490 .with_row_id()
9491 .full_text_search(FullTextSearchQuery::new("hello".to_owned()))
9492 },
9493 expected,
9494 )
9495 .await?;
9496
9497 log::info!("Test case: Full text search (phrase query)");
9498 let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9499 Take: columns="_rowid, _score, (s)"
9500 CoalesceBatchesExec: target_batch_size=8192
9501 PhraseQuery: column=s, query=hello world"#;
9502 assert_plan_equals(
9503 &dataset.dataset,
9504 |scan| {
9505 let query = PhraseQuery::new("hello world".to_owned());
9506 scan.project(&["s"])?
9507 .with_row_id()
9508 .full_text_search(FullTextSearchQuery::new_query(query.into()))
9509 },
9510 expected,
9511 )
9512 .await?;
9513
9514 log::info!("Test case: Full text search (boost query)");
9515 let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9516 Take: columns="_rowid, _score, (s)"
9517 CoalesceBatchesExec: target_batch_size=8192
9518 BoostQuery: negative_boost=1
9519 MatchQuery: column=s, query=hello
9520 MatchQuery: column=s, query=world"#;
9521 assert_plan_equals(
9522 &dataset.dataset,
9523 |scan| {
9524 let positive =
9525 MatchQuery::new("hello".to_owned()).with_column(Some("s".to_owned()));
9526 let negative =
9527 MatchQuery::new("world".to_owned()).with_column(Some("s".to_owned()));
9528 let query = BoostQuery::new(positive.into(), negative.into(), Some(1.0));
9529 scan.project(&["s"])?
9530 .with_row_id()
9531 .full_text_search(FullTextSearchQuery::new_query(query.into()))
9532 },
9533 expected,
9534 )
9535 .await?;
9536
9537 log::info!("Test case: Full text search with prefilter");
9538 let expected = if data_storage_version == LanceFileVersion::Legacy {
9539 r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9540 Take: columns="_rowid, _score, (s)"
9541 CoalesceBatchesExec: target_batch_size=8192
9542 MatchQuery: column=s, query=hello
9543 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9544 UnionExec
9545 MaterializeIndex: query=[i > 10]@i_idx(BTree)
9546 ProjectionExec: expr=[_rowid@1 as _rowid]
9547 FilterExec: i@0 > 10
9548 LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None"#
9549 } else {
9550 r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9551 Take: columns="_rowid, _score, (s)"
9552 CoalesceBatchesExec: target_batch_size=8192
9553 MatchQuery: column=s, query=hello
9554 LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=--
9555 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"#
9556 };
9557 assert_plan_equals(
9558 &dataset.dataset,
9559 |scan| {
9560 scan.project(&["s"])?
9561 .with_row_id()
9562 .filter("i > 10")?
9563 .prefilter(true)
9564 .full_text_search(FullTextSearchQuery::new("hello".to_owned()))
9565 },
9566 expected,
9567 )
9568 .await?;
9569
9570 log::info!("Test case: Full text search with unindexed rows");
9571 let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9572 Take: columns="_rowid, _score, (s)"
9573 CoalesceBatchesExec: target_batch_size=8192
9574 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false]
9575 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9576 UnionExec
9577 MatchQuery: column=s, query=hello
9578 FlatMatchQuery: column=s, query=hello
9579 LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=false, range=None"#;
9580 dataset.append_new_data().await?;
9581 assert_plan_equals(
9582 &dataset.dataset,
9583 |scan| {
9584 scan.project(&["s"])?
9585 .with_row_id()
9586 .full_text_search(FullTextSearchQuery::new("hello".to_owned()))
9587 },
9588 expected,
9589 )
9590 .await?;
9591
9592 log::info!("Test case: Full text search with unindexed rows and fast_search");
9593 let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9594 Take: columns="_rowid, _score, (s)"
9595 CoalesceBatchesExec: target_batch_size=8192
9596 MatchQuery: column=s, query=hello"#;
9597 assert_plan_equals(
9598 &dataset.dataset,
9599 |scan| {
9600 let scan = scan
9601 .project(&["s"])?
9602 .with_row_id()
9603 .full_text_search(FullTextSearchQuery::new("hello".to_owned()))?;
9604 scan.fast_search();
9605 Ok(scan)
9606 },
9607 expected,
9608 )
9609 .await?;
9610
9611 log::info!("Test case: Full text search with unindexed rows and prefilter");
9612 let expected = if data_storage_version == LanceFileVersion::Legacy {
9613 r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9614 Take: columns="_rowid, _score, (s)"
9615 CoalesceBatchesExec: target_batch_size=8192
9616 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false]
9617 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9618 UnionExec
9619 MatchQuery: column=s, query=hello
9620 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9621 UnionExec
9622 MaterializeIndex: query=[i > 10]@i_idx(BTree)
9623 ProjectionExec: expr=[_rowid@1 as _rowid]
9624 FilterExec: i@0 > 10
9625 LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None
9626 FlatMatchQuery: column=s, query=hello
9627 FilterExec: i@1 > 10
9628 LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"#
9629 } else {
9630 r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
9631 Take: columns="_rowid, _score, (s)"
9632 CoalesceBatchesExec: target_batch_size=8192
9633 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false]
9634 RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
9635 UnionExec
9636 MatchQuery: column=s, query=hello
9637 LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=--
9638 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)
9639 FlatMatchQuery: column=s, query=hello
9640 FilterExec: i@1 > 10
9641 LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"#
9642 };
9643 assert_plan_equals(
9644 &dataset.dataset,
9645 |scan| {
9646 scan.project(&["s"])?
9647 .with_row_id()
9648 .filter("i > 10")?
9649 .prefilter(true)
9650 .full_text_search(FullTextSearchQuery::new("hello".to_owned()))
9651 },
9652 expected,
9653 )
9654 .await?;
9655
9656 Ok(())
9657 }
9658
/// Checks the physical plans generated for vector search on a dataset that has
/// a vector index and has had more data appended after the index was built.
///
/// Three plans are compared against expected text:
/// 1. `fast_search()` without `with_row_id()`: only the ANN index nodes appear.
/// 2. `fast_search()` with `with_row_id()`: the same shape (with `k = 33`).
/// 3. No `fast_search()`: the expected plan unions a `KNNVectorDistance` branch
///    over a `LanceScan` with the ANN index branch.
// NOTE(review): the expected-plan literals below are matched textually by
// `assert_plan_equals`; the indentation inside them looks significant — confirm
// before reformatting.
#[tokio::test]
async fn test_fast_search_plan() {
    let mut dataset = TestVectorDataset::new(LanceFileVersion::Stable, true)
        .await
        .unwrap();
    dataset.make_vector_index().await.unwrap();
    // Data appended after the index is created; presumably not covered by the
    // index, which is what makes the fast/non-fast plans differ — TODO confirm.
    dataset.append_new_data().await.unwrap();

    // 32-element query vector with values 32.0..64.0.
    let q: Float32Array = (32..64).map(|v| v as f32).collect();

    // Case 1: fast search, no explicit row id.
    assert_plan_equals(
        &dataset.dataset,
        |scan| {
            scan.nearest("vec", &q, 32)?
                .fast_search()
                .project(&["_distance", "_rowid"])
        },
        "SortExec: TopK(fetch=32), expr=[_distance@0 ASC NULLS LAST, _rowid@1 ASC NULLS LAST]...
  ANNSubIndex: name=idx, k=32, deltas=1, metric=L2
    ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1",
    )
    .await
    .unwrap();

    // Case 2: fast search with `with_row_id()`; the expected plan has the same shape.
    assert_plan_equals(
        &dataset.dataset,
        |scan| {
            scan.nearest("vec", &q, 33)?
                .fast_search()
                .with_row_id()
                .project(&["_distance", "_rowid"])
        },
        "SortExec: TopK(fetch=33), expr=[_distance@0 ASC NULLS LAST, _rowid@1 ASC NULLS LAST]...
  ANNSubIndex: name=idx, k=33, deltas=1, metric=L2
    ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1",
    )
    .await
    .unwrap();

    // Case 3: no fast search; the expected plan additionally contains a flat
    // KNN branch over a `LanceScan`, unioned with the ANN branch.
    assert_plan_equals(
        &dataset.dataset,
        |scan| {
            scan.nearest("vec", &q, 34)?
                .with_row_id()
                .project(&["_distance", "_rowid"])
        },
        "ProjectionExec: expr=[_distance@2 as _distance, _rowid@0 as _rowid]
  FilterExec: _distance@2 IS NOT NULL
    SortExec: TopK(fetch=34), expr=[_distance@2 ASC NULLS LAST, _rowid@0 ASC NULLS LAST]...
      KNNVectorDistance: metric=l2
        RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2
          UnionExec
            ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec]
              FilterExec: _distance@2 IS NOT NULL
                SortExec: TopK(fetch=34), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST]...
                  KNNVectorDistance: metric=l2
                    LanceScan: uri=..., projection=[vec], row_id=true, row_addr=false, ordered=false, range=None
            Take: columns=\"_distance, _rowid, (vec)\"
              CoalesceBatchesExec: target_batch_size=8192
                SortExec: TopK(fetch=34), expr=[_distance@0 ASC NULLS LAST, _rowid@1 ASC NULLS LAST]...
                  ANNSubIndex: name=idx, k=34, deltas=1, metric=L2
                    ANNIvfPartition: uuid=..., minimum_nprobes=1, maximum_nprobes=None, deltas=1",
    )
    .await
    .unwrap();
}
9727
9728 #[tokio::test]
9729 async fn test_fast_search_without_vector_index_returns_empty() {
9730 let dataset = TestVectorDataset::new(LanceFileVersion::Stable, true)
9731 .await
9732 .unwrap();
9733 let q: Float32Array = (32..64).map(|v| v as f32).collect();
9734
9735 let mut scanner = dataset.dataset.scan();
9736 scanner.nearest("vec", &q, 10).unwrap();
9737 let normal_rows = scanner.try_into_batch().await.unwrap().num_rows();
9738
9739 let mut scanner = dataset.dataset.scan();
9740 scanner.nearest("vec", &q, 10).unwrap().fast_search();
9741 let fast_rows = scanner.try_into_batch().await.unwrap().num_rows();
9742
9743 assert_eq!(normal_rows, 10);
9744 assert_eq!(fast_rows, 0);
9745 }
9746
9747 #[rstest]
9748 #[tokio::test]
9749 pub async fn test_scan_planning_io(
9750 #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
9751 data_storage_version: LanceFileVersion,
9752 ) {
9753 use lance_index::scalar::inverted::tokenizer::InvertedIndexParams;
9757 use lance_io::assert_io_eq;
9758 let data = gen_batch()
9759 .col(
9760 "vector",
9761 array::rand_vec::<Float32Type>(Dimension::from(32)),
9762 )
9763 .col("text", array::rand_utf8(ByteCount::from(4), false))
9764 .col("indexed", array::step::<Int32Type>())
9765 .col("not_indexed", array::step::<Int32Type>())
9766 .into_reader_rows(RowCount::from(100), BatchCount::from(5));
9767
9768 let mut dataset = Dataset::write(
9769 data,
9770 "memory://test",
9771 Some(WriteParams {
9772 data_storage_version: Some(data_storage_version),
9773 ..Default::default()
9774 }),
9775 )
9776 .await
9777 .unwrap();
9778 dataset
9779 .create_index(
9780 &["indexed"],
9781 IndexType::Scalar,
9782 None,
9783 &ScalarIndexParams::default(),
9784 false,
9785 )
9786 .await
9787 .unwrap();
9788 dataset
9789 .create_index(
9790 &["text"],
9791 IndexType::Inverted,
9792 None,
9793 &InvertedIndexParams::default(),
9794 false,
9795 )
9796 .await
9797 .unwrap();
9798 dataset
9799 .create_index(
9800 &["vector"],
9801 IndexType::Vector,
9802 None,
9803 &VectorIndexParams {
9804 metric_type: DistanceType::L2,
9805 stages: vec![
9806 StageParams::Ivf(IvfBuildParams {
9807 max_iters: 2,
9808 num_partitions: Some(2),
9809 sample_rate: 2,
9810 ..Default::default()
9811 }),
9812 StageParams::PQ(PQBuildParams {
9813 max_iters: 2,
9814 num_sub_vectors: 2,
9815 ..Default::default()
9816 }),
9817 ],
9818 version: crate::index::vector::IndexFileVersion::Legacy,
9819 skip_transpose: false,
9820 runtime_hints: Default::default(),
9821 },
9822 false,
9823 )
9824 .await
9825 .unwrap();
9826
9827 dataset
9829 .scan()
9830 .prefilter(true)
9831 .filter("indexed > 10")
9832 .unwrap()
9833 .explain_plan(true)
9834 .await
9835 .unwrap();
9836
9837 let io_stats = dataset.object_store.as_ref().io_stats_incremental();
9839 assert_io_gt!(io_stats, read_iops, 0);
9840
9841 dataset
9843 .scan()
9844 .prefilter(true)
9845 .filter("indexed > 10")
9846 .unwrap()
9847 .explain_plan(true)
9848 .await
9849 .unwrap();
9850
9851 let io_stats = dataset.object_store.as_ref().io_stats_incremental();
9852 assert_io_eq!(io_stats, read_iops, 0);
9853
9854 dataset
9855 .scan()
9856 .prefilter(true)
9857 .filter("true")
9858 .unwrap()
9859 .explain_plan(true)
9860 .await
9861 .unwrap();
9862
9863 let io_stats = dataset.object_store.as_ref().io_stats_incremental();
9864 assert_io_eq!(io_stats, read_iops, 0);
9865
9866 dataset
9867 .scan()
9868 .prefilter(true)
9869 .materialization_style(MaterializationStyle::AllEarly)
9870 .filter("true")
9871 .unwrap()
9872 .explain_plan(true)
9873 .await
9874 .unwrap();
9875
9876 let io_stats = dataset.object_store.as_ref().io_stats_incremental();
9877 assert_io_eq!(io_stats, read_iops, 0);
9878
9879 dataset
9880 .scan()
9881 .prefilter(true)
9882 .materialization_style(MaterializationStyle::AllLate)
9883 .filter("true")
9884 .unwrap()
9885 .explain_plan(true)
9886 .await
9887 .unwrap();
9888
9889 let io_stats = dataset.object_store.as_ref().io_stats_incremental();
9890 assert_io_eq!(io_stats, read_iops, 0);
9891 }
9892
9893 #[rstest]
9894 #[tokio::test]
9895 pub async fn test_row_meta_columns(
9896 #[values(
9897 (true, false), (false, true), (true, true) )]
9901 columns: (bool, bool),
9902 ) {
9903 let (with_row_id, with_row_address) = columns;
9904 let test_dir = TempStrDir::default();
9905 let uri = &test_dir;
9906
9907 let schema = Arc::new(arrow_schema::Schema::new(vec![
9908 arrow_schema::Field::new("data_item_id", arrow_schema::DataType::Int32, false),
9909 arrow_schema::Field::new("a", arrow_schema::DataType::Int32, false),
9910 ]));
9911
9912 let data = RecordBatch::try_new(
9913 schema.clone(),
9914 vec![
9915 Arc::new(Int32Array::from(vec![1001, 1002, 1003])),
9916 Arc::new(Int32Array::from(vec![1, 2, 3])),
9917 ],
9918 )
9919 .unwrap();
9920
9921 let dataset = Dataset::write(
9922 RecordBatchIterator::new(vec![Ok(data)], schema.clone()),
9923 uri,
9924 None,
9925 )
9926 .await
9927 .unwrap();
9928
9929 let mut scanner = dataset.scan();
9931
9932 let mut projection = vec!["data_item_id".to_string()];
9933 if with_row_id {
9934 scanner.with_row_id();
9935 projection.push(ROW_ID.to_string());
9936 }
9937 if with_row_address {
9938 scanner.with_row_address();
9939 projection.push(ROW_ADDR.to_string());
9940 }
9941
9942 scanner.project(&projection).unwrap();
9943 let stream = scanner.try_into_stream().await.unwrap();
9944 let batch = stream.try_collect::<Vec<_>>().await.unwrap().pop().unwrap();
9945
9946 if with_row_id {
9948 let column = batch.column_by_name(ROW_ID).unwrap();
9949 assert_eq!(column.data_type(), &DataType::UInt64);
9950 }
9951 if with_row_address {
9952 let column = batch.column_by_name(ROW_ADDR).unwrap();
9953 assert_eq!(column.data_type(), &DataType::UInt64);
9954 }
9955
9956 let mut scanner = dataset.scan();
9958 if with_row_id {
9959 scanner.with_row_id();
9960 }
9961 if with_row_address {
9962 scanner.with_row_address();
9963 }
9964 scanner.project(&["data_item_id"]).unwrap();
9965 let stream = scanner.try_into_stream().await.unwrap();
9966 let batch = stream.try_collect::<Vec<_>>().await.unwrap().pop().unwrap();
9967 let meta_column = batch.column_by_name(if with_row_id { ROW_ID } else { ROW_ADDR });
9968 assert!(meta_column.is_some());
9969
9970 let mut scanner = dataset.scan();
9972 if with_row_id {
9973 scanner.project(&[ROW_ID]).unwrap();
9974 } else {
9975 scanner.project(&[ROW_ADDR]).unwrap();
9976 };
9977 let stream = scanner.try_into_stream().await.unwrap();
9978 assert_eq!(stream.schema().fields().len(), 1);
9979 if with_row_id {
9980 assert!(stream.schema().field_with_name(ROW_ID).is_ok());
9981 } else {
9982 assert!(stream.schema().field_with_name(ROW_ADDR).is_ok());
9983 }
9984 }
9985
9986 async fn limit_offset_equivalency_test(scanner: &Scanner) {
9987 async fn test_one(
9988 scanner: &Scanner,
9989 full_result: &RecordBatch,
9990 limit: Option<i64>,
9991 offset: Option<i64>,
9992 ) {
9993 let mut new_scanner = scanner.clone();
9994 new_scanner.limit(limit, offset).unwrap();
9995 if let Some(nearest) = new_scanner.nearest_mut() {
9996 nearest.k = offset.unwrap_or(0).saturating_add(limit.unwrap_or(10_000)) as usize;
9997 }
9998 let result = new_scanner.try_into_batch().await.unwrap();
9999
10000 let resolved_offset = offset.unwrap_or(0).min(full_result.num_rows() as i64);
10001 let resolved_length = limit
10002 .unwrap_or(i64::MAX)
10003 .min(full_result.num_rows() as i64 - resolved_offset);
10004
10005 let expected = full_result.slice(resolved_offset as usize, resolved_length as usize);
10006
10007 if expected != result {
10008 let plan = new_scanner.analyze_plan().await.unwrap();
10009 assert_eq!(
10010 &expected, &result,
10011 "Limit: {:?}, Offset: {:?}, Plan: \n{}",
10012 limit, offset, plan
10013 );
10014 }
10015 }
10016
10017 let mut scanner_full = scanner.clone();
10018 if let Some(nearest) = scanner_full.nearest_mut() {
10019 nearest.k = 500;
10020 }
10021 let full_results = scanner_full.try_into_batch().await.unwrap();
10022
10023 test_one(scanner, &full_results, Some(1), None).await;
10024 test_one(scanner, &full_results, Some(1), Some(1)).await;
10025 test_one(scanner, &full_results, Some(1), Some(2)).await;
10026 test_one(scanner, &full_results, Some(1), Some(10)).await;
10027
10028 test_one(scanner, &full_results, Some(3), None).await;
10029 test_one(scanner, &full_results, Some(3), Some(2)).await;
10030 test_one(scanner, &full_results, Some(3), Some(4)).await;
10031
10032 test_one(scanner, &full_results, None, Some(3)).await;
10033 test_one(scanner, &full_results, None, Some(10)).await;
10034 }
10035
10036 #[tokio::test]
10037 async fn test_scan_limit_offset() {
10038 let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10039 .await
10040 .unwrap();
10041 let scanner = test_ds.dataset.scan();
10042 limit_offset_equivalency_test(&scanner).await;
10043 }
10044
10045 #[tokio::test]
10046 async fn test_knn_limit_offset() {
10047 let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10048 .await
10049 .unwrap();
10050 let query_vector = Float32Array::from(vec![0.0; 32]);
10051 let mut scanner = test_ds.dataset.scan();
10052 scanner
10053 .nearest("vec", &query_vector, 5)
10054 .unwrap()
10055 .project(&["i"])
10056 .unwrap();
10057 limit_offset_equivalency_test(&scanner).await;
10058 }
10059
10060 #[tokio::test]
10061 async fn test_knn_query_parallelism_defaults_and_setter() {
10062 let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10063 .await
10064 .unwrap();
10065 let query_vector = Float32Array::from(vec![0.0; 32]);
10066 let mut scanner = test_ds.dataset.scan();
10067 scanner.nearest("vec", &query_vector, 5).unwrap();
10068 assert_eq!(
10069 scanner.nearest_mut().unwrap().query_parallelism,
10070 DEFAULT_QUERY_PARALLELISM
10071 );
10072
10073 scanner.query_parallelism(4);
10074 assert_eq!(scanner.nearest_mut().unwrap().query_parallelism, 4);
10075
10076 scanner.query_parallelism(-1);
10077 assert_eq!(scanner.nearest_mut().unwrap().query_parallelism, -1);
10078 }
10079
10080 #[tokio::test]
10081 async fn test_ivf_pq_query_parallelism_returns_same_results() {
10082 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10083 .await
10084 .unwrap();
10085 test_ds.make_vector_index().await.unwrap();
10086
10087 let query_vector = Float32Array::from(vec![0.0; 32]);
10088
10089 let mut sequential = test_ds.dataset.scan();
10090 sequential.nearest("vec", &query_vector, 50).unwrap();
10091 let sequential_results = sequential.try_into_batch().await.unwrap();
10092
10093 let mut parallel = test_ds.dataset.scan();
10094 parallel
10095 .nearest("vec", &query_vector, 50)
10096 .unwrap()
10097 .query_parallelism(4);
10098 let parallel_results = parallel.try_into_batch().await.unwrap();
10099
10100 assert_eq!(sequential_results, parallel_results);
10101 }
10102
10103 #[tokio::test]
10104 async fn test_ivf_pq_limit_offset() {
10105 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10106 .await
10107 .unwrap();
10108 test_ds.make_vector_index().await.unwrap();
10109 test_ds.append_new_data().await.unwrap();
10110 let query_vector = Float32Array::from(vec![0.0; 32]);
10111 let mut scanner = test_ds.dataset.scan();
10112 scanner.nearest("vec", &query_vector, 500).unwrap();
10113 limit_offset_equivalency_test(&scanner).await;
10114 }
10115
10116 #[tokio::test]
10117 async fn test_fts_limit_offset() {
10118 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10119 .await
10120 .unwrap();
10121 test_ds.make_fts_index().await.unwrap();
10122 test_ds.append_new_data().await.unwrap();
10123 let mut scanner = test_ds.dataset.scan();
10124 scanner
10125 .full_text_search(FullTextSearchQuery::new("4".into()))
10126 .unwrap();
10127 limit_offset_equivalency_test(&scanner).await;
10128 }
10129
10130 #[tokio::test]
10131 async fn test_fts_fast_search_excludes_unindexed_rows() {
10132 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10133 .await
10134 .unwrap();
10135 test_ds.make_fts_index().await.unwrap();
10136 test_ds.append_data_with_range(10, 20).await.unwrap();
10138
10139 let mut scanner = test_ds.dataset.scan();
10140 scanner
10141 .full_text_search(FullTextSearchQuery::new_query(
10142 MatchQuery::new("15".to_owned())
10143 .with_column(Some("s".to_owned()))
10144 .into(),
10145 ))
10146 .unwrap();
10147 let normal_rows = scanner.try_into_batch().await.unwrap().num_rows();
10148
10149 let mut scanner = test_ds.dataset.scan();
10150 scanner
10151 .full_text_search(FullTextSearchQuery::new_query(
10152 MatchQuery::new("15".to_owned())
10153 .with_column(Some("s".to_owned()))
10154 .into(),
10155 ))
10156 .unwrap()
10157 .fast_search();
10158 let fast_rows = scanner.try_into_batch().await.unwrap().num_rows();
10159
10160 assert_eq!(normal_rows, 2);
10161 assert_eq!(fast_rows, 1);
10162 }
10163
10164 async fn test_row_offset_read_helper(
10165 ds: &Dataset,
10166 scan_builder: impl FnOnce(&mut Scanner) -> &mut Scanner,
10167 expected_cols: &[&str],
10168 expected_row_offsets: &[u64],
10169 ) {
10170 let mut scanner = ds.scan();
10171 let scanner = scan_builder(&mut scanner);
10172 let stream = scanner.try_into_stream().await.unwrap();
10173
10174 let schema = stream.schema();
10175 let actual_cols = schema
10176 .fields()
10177 .iter()
10178 .map(|f| f.name().as_str())
10179 .collect::<Vec<_>>();
10180 assert_eq!(&actual_cols, expected_cols);
10181
10182 let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10183 let batch = arrow_select::concat::concat_batches(&schema, &batches).unwrap();
10184
10185 let row_offsets = batch
10186 .column_by_name(ROW_OFFSET)
10187 .unwrap()
10188 .as_primitive::<UInt64Type>()
10189 .values();
10190 assert_eq!(row_offsets.as_ref(), expected_row_offsets);
10191 }
10192
10193 #[tokio::test]
10194 async fn test_row_offset_read() {
10195 let mut ds = lance_datagen::gen_batch()
10196 .col("idx", array::step::<Int32Type>())
10197 .into_ram_dataset(FragmentCount::from(3), FragmentRowCount::from(3))
10198 .await
10199 .unwrap();
10200 ds.delete("idx >= 2 AND idx <= 6").await.unwrap();
10204
10205 test_row_offset_read_helper(
10207 &ds,
10208 |scanner| scanner.project(&["idx", ROW_OFFSET]).unwrap(),
10209 &["idx", ROW_OFFSET],
10210 &[0, 1, 2, 3],
10211 )
10212 .await;
10213
10214 test_row_offset_read_helper(
10216 &ds,
10217 |scanner| scanner.project(&[ROW_OFFSET]).unwrap(),
10218 &[ROW_OFFSET],
10219 &[0, 1, 2, 3],
10220 )
10221 .await;
10222
10223 test_row_offset_read_helper(
10225 &ds,
10226 |scanner| {
10227 scanner
10228 .filter("idx > 1")
10229 .unwrap()
10230 .project(&[ROW_OFFSET])
10231 .unwrap()
10232 },
10233 &[ROW_OFFSET],
10234 &[2, 3],
10235 )
10236 .await;
10237 }
10238
/// Exercises filters on `_rowid`, `_rowaddr` and `_rowoffset`, checking both
/// the rows returned and whether the explained plan contains `OneShotStream`,
/// which this test treats as the marker that the "take" optimization was
/// applied.
///
/// The dataset has 3 fragments of 100 rows (`idx` is a step column), and the
/// rows with `idx` in `190..210` are deleted before any filter is checked.
#[tokio::test]
async fn test_filter_to_take() {
    let mut ds = lance_datagen::gen_batch()
        .col("idx", array::step::<Int32Type>())
        .into_ram_dataset(FragmentCount::from(3), FragmentRowCount::from(100))
        .await
        .unwrap();

    // Capture every row id (empty projection + row id) before deleting rows.
    let row_ids = ds
        .scan()
        .project(&Vec::<&str>::default())
        .unwrap()
        .with_row_id()
        .try_into_stream()
        .await
        .unwrap()
        .try_collect::<Vec<_>>()
        .await
        .unwrap();
    let schema = row_ids[0].schema();
    let row_ids = concat_batches(&schema, row_ids.iter()).unwrap();
    let row_ids = row_ids.column(0).as_primitive::<UInt64Type>().clone();

    // Same for row addresses.
    let row_addrs = ds
        .scan()
        .project(&Vec::<&str>::default())
        .unwrap()
        .with_row_address()
        .try_into_stream()
        .await
        .unwrap()
        .try_collect::<Vec<_>>()
        .await
        .unwrap();
    let schema = row_addrs[0].schema();
    let row_addrs = concat_batches(&schema, row_addrs.iter()).unwrap();
    let row_addrs = row_addrs.column(0).as_primitive::<UInt64Type>().clone();

    // Remove 20 rows that straddle the boundary between the 2nd and 3rd fragment.
    ds.delete("idx >= 190 AND idx < 210").await.unwrap();

    let ds_copy = ds.clone();
    // Runs `filt`, asserts whether `OneShotStream` appears in the plan, and
    // compares the returned `idx` values against `expected_idx`.
    let do_check = async move |filt: &str, expected_idx: &[i32], applies_optimization: bool| {
        let mut scanner = ds_copy.scan();
        scanner.filter(filt).unwrap();
        let plan = scanner.explain_plan(true).await.unwrap();
        if applies_optimization {
            assert!(
                plan.contains("OneShotStream"),
                "expected take optimization to be applied. Filter: '{}'. Plan:\n{}",
                filt,
                plan
            );
        } else {
            assert!(
                !plan.contains("OneShotStream"),
                "expected take optimization to not be applied. Filter: '{}'. Plan:\n{}",
                filt,
                plan
            );
        }

        let stream = scanner.try_into_stream().await.unwrap();
        let batches = stream.try_collect::<Vec<_>>().await.unwrap();
        let idx = batches
            .iter()
            .map(|b| b.column_by_name("idx").unwrap().as_ref())
            .collect::<Vec<_>>();

        // No batches at all is only acceptable when no rows were expected.
        if idx.is_empty() {
            assert!(expected_idx.is_empty());
            return;
        }

        let idx = arrow::compute::concat(&idx).unwrap();
        assert_eq!(idx.as_primitive::<Int32Type>().values(), expected_idx);
    };
    // `check` expects the optimization to apply; `check_no_opt` expects it not to.
    let check =
        async |filt: &str, expected_idx: &[i32]| do_check(filt, expected_idx, true).await;
    let check_no_opt = async |filt: &str, expected_idx: &[i32]| {
        do_check(filt, expected_idx, false).await;
    };

    // Single-row equality on each meta column.
    check("_rowid = 50", &[50]).await;
    check("_rowaddr = 50", &[50]).await;
    check("_rowoffset = 50", &[50]).await;

    // OR-chains of equalities; results are expected in ascending `idx` order.
    check(
        "_rowid = 50 OR _rowid = 51 OR _rowid = 52 OR _rowid = 49",
        &[49, 50, 51, 52],
    )
    .await;
    check(
        "_rowaddr = 50 OR _rowaddr = 51 OR _rowaddr = 52 OR _rowaddr = 49",
        &[49, 50, 51, 52],
    )
    .await;
    check(
        "_rowoffset = 50 OR _rowoffset = 51 OR _rowoffset = 52 OR _rowoffset = 49",
        &[49, 50, 51, 52],
    )
    .await;

    // IN lists; results are expected in ascending `idx` order regardless of list order.
    check("_rowid IN (52, 51, 50, 17)", &[17, 50, 51, 52]).await;
    check("_rowaddr IN (52, 51, 50, 17)", &[17, 50, 51, 52]).await;
    check("_rowoffset IN (52, 51, 50, 17)", &[17, 50, 51, 52]).await;

    // The row originally at position 190 was deleted: looking it up by id or
    // address yields nothing, while offset 190 now lands on `idx` 210.
    check(&format!("_rowid = {}", row_ids.value(190)), &[]).await;
    check(&format!("_rowaddr = {}", row_addrs.value(190)), &[]).await;
    check("_rowoffset = 190", &[210]).await;

    // A surviving row after the deleted range: id/address still resolve to
    // `idx` 250, while offset 250 is shifted by the 20 deleted rows.
    check(&format!("_rowid = {}", row_ids.value(250)), &[250]).await;
    check(&format!("_rowaddr = {}", row_addrs.value(250)), &[250]).await;
    check("_rowoffset = 250", &[270]).await;

    // Offset beyond the 280 remaining rows.
    check("_rowoffset = 1000", &[]).await;

    // Meta-column predicate ANDed with a regular predicate, in either order.
    check("_rowid IN (5, 10, 15) AND idx > 10", &[15]).await;
    check("_rowaddr IN (5, 10, 15) AND idx > 10", &[15]).await;
    check("_rowoffset IN (5, 10, 15) AND idx > 10", &[15]).await;
    check("idx > 10 AND _rowid IN (5, 10, 15)", &[15]).await;
    check("idx > 10 AND _rowaddr IN (5, 10, 15)", &[15]).await;
    check("idx > 10 AND _rowoffset IN (5, 10, 15)", &[15]).await;
    // A duplicated conjunct is still expected to use the optimization.
    check("_rowid = 50 AND _rowid = 50", &[50]).await;

    // Filters for which the optimization is expected NOT to be applied.
    check_no_opt("_rowid = 50 AND _rowid = 51", &[]).await;
    check_no_opt("(_rowid = 50 AND idx < 100) OR _rowid = 51", &[50, 51]).await;

    // A `_rowoffset` filter combined with a computed projection: the row at
    // offset 77 has `idx` 77, so `idx * 2` is 154.
    let mut scanner = ds.scan();
    scanner.filter("_rowoffset = 77").unwrap();
    scanner
        .project_with_transform(&[("foo", "idx * 2")])
        .unwrap();
    let stream = scanner.try_into_stream().await.unwrap();
    let batches = stream.try_collect::<Vec<_>>().await.unwrap();
    assert_eq!(batches[0].schema().field(0).name(), "foo");
    let val = batches[0].column(0).as_primitive::<Int32Type>().values()[0];
    assert_eq!(val, 154);
}
10391
10392 #[tokio::test]
10393 async fn test_nested_field_ordering() {
10394 use arrow_array::StructArray;
10395
10396 let id_array = Int32Array::from(vec![3, 1, 2]);
10398 let nested_values = Int32Array::from(vec![30, 10, 20]);
10399 let nested_struct = StructArray::from(vec![(
10400 Arc::new(ArrowField::new("value", DataType::Int32, false)),
10401 Arc::new(nested_values) as ArrayRef,
10402 )]);
10403
10404 let schema = Arc::new(ArrowSchema::new(vec![
10405 ArrowField::new("id", DataType::Int32, false),
10406 ArrowField::new(
10407 "nested",
10408 DataType::Struct(vec![ArrowField::new("value", DataType::Int32, false)].into()),
10409 false,
10410 ),
10411 ]));
10412
10413 let batch = RecordBatch::try_new(
10414 schema.clone(),
10415 vec![Arc::new(id_array), Arc::new(nested_struct)],
10416 )
10417 .unwrap();
10418
10419 let test_dir = TempStrDir::default();
10420 let test_uri = &test_dir;
10421 let reader = RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema.clone());
10422
10423 let dataset = Dataset::write(reader, test_uri, None).await.unwrap();
10424
10425 let mut scanner = dataset.scan();
10427 scanner
10428 .order_by(Some(vec![ColumnOrdering {
10429 column_name: "nested.value".to_string(),
10430 ascending: true,
10431 nulls_first: true,
10432 }]))
10433 .unwrap(); let stream = scanner.try_into_stream().await.unwrap();
10436 let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10437
10438 let sorted_ids = batches[0].column(0).as_primitive::<Int32Type>().values();
10440 assert_eq!(sorted_ids[0], 1); assert_eq!(sorted_ids[1], 2); assert_eq!(sorted_ids[2], 3); }
10444
10445 #[tokio::test]
10446 async fn test_limit_with_ordering_not_pushed_down() {
10447 let id_array = Int32Array::from(vec![5, 2, 8, 1, 3, 7, 4, 6]);
10453 let value_array = Int32Array::from(vec![50, 20, 80, 10, 30, 70, 40, 60]);
10454
10455 let schema = Arc::new(ArrowSchema::new(vec![
10456 ArrowField::new("id", DataType::Int32, false),
10457 ArrowField::new("value", DataType::Int32, false),
10458 ]));
10459
10460 let batch = RecordBatch::try_new(
10461 schema.clone(),
10462 vec![Arc::new(id_array), Arc::new(value_array)],
10463 )
10464 .unwrap();
10465
10466 let test_dir = TempStrDir::default();
10467 let test_uri = &test_dir;
10468 let reader = RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema.clone());
10469
10470 let dataset = Dataset::write(reader, test_uri, None).await.unwrap();
10471
10472 let mut scanner = dataset.scan();
10474 scanner
10475 .order_by(Some(vec![ColumnOrdering {
10476 column_name: "value".to_string(),
10477 ascending: true,
10478 nulls_first: true,
10479 }]))
10480 .unwrap();
10481 scanner.limit(Some(3), None).unwrap();
10482
10483 let stream = scanner.try_into_stream().await.unwrap();
10484 let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10485
10486 let sorted_ids = batches[0].column(0).as_primitive::<Int32Type>().values();
10488 let sorted_values = batches[0].column(1).as_primitive::<Int32Type>().values();
10489 assert_eq!(batches[0].num_rows(), 3);
10490 assert_eq!(sorted_ids[0], 1); assert_eq!(sorted_ids[1], 2); assert_eq!(sorted_ids[2], 3); assert_eq!(sorted_values[0], 10);
10494 assert_eq!(sorted_values[1], 20);
10495 assert_eq!(sorted_values[2], 30);
10496
10497 let mut scanner = dataset.scan();
10499 scanner
10500 .order_by(Some(vec![ColumnOrdering {
10501 column_name: "value".to_string(),
10502 ascending: true,
10503 nulls_first: true,
10504 }]))
10505 .unwrap();
10506 scanner.limit(Some(3), Some(2)).unwrap(); let stream = scanner.try_into_stream().await.unwrap();
10509 let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10510
10511 let sorted_ids = batches[0].column(0).as_primitive::<Int32Type>().values();
10512 let sorted_values = batches[0].column(1).as_primitive::<Int32Type>().values();
10513 assert_eq!(batches[0].num_rows(), 3);
10514 assert_eq!(sorted_ids[0], 3); assert_eq!(sorted_ids[1], 4); assert_eq!(sorted_ids[2], 5); assert_eq!(sorted_values[0], 30);
10518 assert_eq!(sorted_values[1], 40);
10519 assert_eq!(sorted_values[2], 50);
10520
10521 let mut scanner = dataset.scan();
10523 scanner.limit(Some(3), None).unwrap();
10524
10525 let stream = scanner.try_into_stream().await.unwrap();
10526 let batches = stream.try_collect::<Vec<_>>().await.unwrap();
10527
10528 assert_eq!(batches[0].num_rows(), 3);
10530 let unsorted_values = batches[0].column(1).as_primitive::<Int32Type>().values();
10531 assert_eq!(unsorted_values[0], 50);
10533 assert_eq!(unsorted_values[1], 20);
10534 assert_eq!(unsorted_values[2], 80);
10535 }
10536
10537 #[tokio::test]
10538 async fn test_scan_with_version_columns() {
10539 use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
10540 use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
10541
10542 let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
10544 "id",
10545 DataType::Int32,
10546 false,
10547 )]));
10548
10549 let batch = RecordBatch::try_new(
10550 schema.clone(),
10551 vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
10552 )
10553 .unwrap();
10554
10555 let test_dir = lance_core::utils::tempfile::TempStrDir::default();
10556 let test_uri = test_dir.as_str();
10557
10558 let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
10559 let write_params = WriteParams {
10560 enable_stable_row_ids: true,
10561 ..Default::default()
10562 };
10563 Dataset::write(reader, test_uri, Some(write_params))
10564 .await
10565 .unwrap();
10566
10567 let dataset = Dataset::open(test_uri).await.unwrap();
10568 let mut scanner = dataset.scan();
10569
10570 scanner
10571 .project(&[ROW_CREATED_AT_VERSION, ROW_LAST_UPDATED_AT_VERSION])
10572 .unwrap();
10573
10574 let output_schema = scanner.schema().await.unwrap();
10576 assert!(
10577 output_schema
10578 .column_with_name("_row_last_updated_at_version")
10579 .is_some(),
10580 "Schema should include _row_last_updated_at_version"
10581 );
10582 assert!(
10583 output_schema
10584 .column_with_name("_row_created_at_version")
10585 .is_some(),
10586 "Schema should include _row_created_at_version"
10587 );
10588
10589 let batches = scanner
10591 .try_into_stream()
10592 .await
10593 .unwrap()
10594 .try_collect::<Vec<_>>()
10595 .await
10596 .unwrap();
10597
10598 assert_eq!(batches.len(), 1);
10599 let batch = &batches[0];
10600
10601 let last_updated = batch
10603 .column_by_name("_row_last_updated_at_version")
10604 .expect("Should have _row_last_updated_at_version column");
10605 let created_at = batch
10606 .column_by_name("_row_created_at_version")
10607 .expect("Should have _row_created_at_version column");
10608
10609 let last_updated_array = last_updated
10611 .as_any()
10612 .downcast_ref::<arrow_array::UInt64Array>()
10613 .unwrap();
10614 let created_at_array = created_at
10615 .as_any()
10616 .downcast_ref::<arrow_array::UInt64Array>()
10617 .unwrap();
10618
10619 for i in 0..batch.num_rows() {
10620 assert_eq!(
10621 last_updated_array.value(i),
10622 1,
10623 "All rows last updated at version 1"
10624 );
10625 assert_eq!(
10626 created_at_array.value(i),
10627 1,
10628 "All rows created at version 1"
10629 );
10630 }
10631 }
10632
10633 #[test_log::test(test)]
10634 fn test_scan_finishes_all_tasks() {
10635 let runtime = tokio::runtime::Builder::new_multi_thread()
10637 .enable_time()
10638 .build()
10639 .unwrap();
10640
10641 runtime.block_on(async move {
10642 let ds = lance_datagen::gen_batch()
10643 .col("id", lance_datagen::array::step::<Int32Type>())
10644 .into_ram_dataset(FragmentCount::from(1000), FragmentRowCount::from(10))
10645 .await
10646 .unwrap();
10647
10648 let mut stream = ds
10656 .scan()
10657 .fragment_readahead(1000)
10658 .batch_size(1)
10659 .io_buffer_size(1)
10660 .batch_readahead(1)
10661 .try_into_stream()
10662 .await
10663 .unwrap();
10664 stream.next().await.unwrap().unwrap();
10665 });
10666
10667 let start = Instant::now();
10668 while start.elapsed() < Duration::from_secs(10) {
10669 if runtime.handle().metrics().num_alive_tasks() == 0 {
10670 break;
10671 }
10672 std::thread::sleep(Duration::from_millis(100));
10673 }
10674
10675 assert!(
10676 runtime.handle().metrics().num_alive_tasks() == 0,
10677 "Tasks should have finished within 10 seconds but there are still {} tasks running",
10678 runtime.handle().metrics().num_alive_tasks()
10679 );
10680 }
10681
10682 fn find_filtered_read(plan: &dyn ExecutionPlan) -> Option<&FilteredReadExec> {
10683 if let Some(f) = plan.as_any().downcast_ref::<FilteredReadExec>() {
10684 return Some(f);
10685 }
10686 for child in plan.children() {
10687 if let Some(f) = find_filtered_read(child.as_ref()) {
10688 return Some(f);
10689 }
10690 }
10691 None
10692 }
10693
10694 #[tokio::test]
10695 async fn test_io_buffer_size_explicit_propagated() {
10696 let data = lance_datagen::gen_batch()
10701 .col("x", lance_datagen::array::step::<Int32Type>())
10702 .into_reader_rows(RowCount::from(8), BatchCount::from(1));
10703 let dataset = Dataset::write(data, "memory://test_io_buffer_explicit", None)
10704 .await
10705 .unwrap();
10706
10707 let plan = dataset.scan().create_plan().await.unwrap();
10708 let filtered = find_filtered_read(plan.as_ref())
10709 .expect("expected a FilteredReadExec in the scan plan");
10710 assert_eq!(filtered.options().io_buffer_size_bytes, None);
10711
10712 let mut scanner = dataset.scan();
10713 scanner.io_buffer_size(7777);
10714 let plan = scanner.create_plan().await.unwrap();
10715 let filtered = find_filtered_read(plan.as_ref())
10716 .expect("expected a FilteredReadExec in the scan plan");
10717 assert_eq!(filtered.options().io_buffer_size_bytes, Some(7777));
10718 }
10719
/// Exercises `get_default_io_buffer_size_override` against the
/// `LANCE_DEFAULT_IO_BUFFER_SIZE` environment variable.
///
/// Expected results, as asserted below:
/// * `"4096"` yields `Some(4096)`,
/// * `"not_a_number"` yields `None`,
/// * an unset variable yields `None`.
///
/// Whatever value the variable had before the test is put back when the test
/// ends, including when an assertion panics, so the process environment is
/// not left modified for tests that run afterwards.
#[test]
#[serial_test::serial(LANCE_DEFAULT_IO_BUFFER_SIZE)]
fn test_default_io_buffer_size_override_env_var() {
    const VAR: &str = "LANCE_DEFAULT_IO_BUFFER_SIZE";

    /// Restores `VAR` to its pre-test state when dropped (normal exit or unwind).
    struct RestoreEnv {
        previous: Option<std::ffi::OsString>,
    }

    impl Drop for RestoreEnv {
        fn drop(&mut self) {
            // SAFETY: same reasoning as for the mutations in the test body;
            // the guard is dropped before the serialized test returns.
            unsafe {
                match self.previous.take() {
                    Some(value) => std::env::set_var(VAR, value),
                    None => std::env::remove_var(VAR),
                }
            }
        }
    }

    // Touch the default before the environment is changed.
    // NOTE(review): presumably this forces a lazily initialised value so it
    // cannot observe the values set below; confirm against its definition.
    let _ = *DEFAULT_IO_BUFFER_SIZE;

    let _restore = RestoreEnv {
        previous: std::env::var_os(VAR),
    };

    // SAFETY: modifying the environment is only sound while no other thread
    // accesses it. The `serial` attribute keeps tests sharing this key from
    // overlapping. NOTE(review): tests outside that group could still read
    // the environment concurrently; confirm this is acceptable.
    unsafe {
        std::env::set_var(VAR, "4096");
    }
    assert_eq!(get_default_io_buffer_size_override(), Some(4096));

    // SAFETY: see above.
    unsafe {
        std::env::set_var(VAR, "not_a_number");
    }
    assert_eq!(get_default_io_buffer_size_override(), None);

    // SAFETY: see above.
    unsafe {
        std::env::remove_var(VAR);
    }
    assert_eq!(get_default_io_buffer_size_override(), None);
}
10750
10751 fn assert_values_in_range(array: &Int32Array, range: std::ops::Range<i32>, msg: &str) {
10752 assert!(!array.is_empty(), "Expected some results but got none");
10753 assert!(
10754 array
10755 .iter()
10756 .all(|v| v.is_some_and(|val| range.contains(&val))),
10757 "{msg} (expected range {range:?})"
10758 );
10759 }
10760
10761 fn assert_has_all_fragments(array: &Int32Array) {
10763 assert!(
10764 array
10765 .iter()
10766 .any(|v| v.is_some_and(|val| (0..200).contains(&val)))
10767 && array
10768 .iter()
10769 .any(|v| v.is_some_and(|val| (200..400).contains(&val)))
10770 && array
10771 .iter()
10772 .any(|v| v.is_some_and(|val| (400..410).contains(&val)))
10773 && array
10774 .iter()
10775 .any(|v| v.is_some_and(|val| (410..420).contains(&val))),
10776 "Expected results from all fragments"
10777 );
10778 }
10779
10780 async fn test_fragment_list_filtering(
10782 test_ds: &TestVectorDataset,
10783 fragments: &[Fragment],
10784 mut build_scanner: impl FnMut(&Dataset) -> Scanner,
10785 ) {
10786 let batch = build_scanner(&test_ds.dataset)
10788 .try_into_batch()
10789 .await
10790 .unwrap();
10791 let i_array = batch
10792 .column_by_name("i")
10793 .unwrap()
10794 .as_any()
10795 .downcast_ref::<Int32Array>()
10796 .unwrap();
10797 assert_has_all_fragments(i_array);
10798
10799 let mut scanner = build_scanner(&test_ds.dataset);
10801 scanner.with_fragments(vec![fragments[2].clone()]);
10802 let batch = scanner.try_into_batch().await.unwrap();
10803 let i_array = batch
10804 .column_by_name("i")
10805 .unwrap()
10806 .as_any()
10807 .downcast_ref::<Int32Array>()
10808 .unwrap();
10809 assert_values_in_range(i_array, 400..410, "Should only get results from fragment 2");
10810
10811 let mut scanner = build_scanner(&test_ds.dataset);
10813 scanner.with_fragments(vec![fragments[0].clone()]);
10814 let batch = scanner.try_into_batch().await.unwrap();
10815 let i_array = batch
10816 .column_by_name("i")
10817 .unwrap()
10818 .as_any()
10819 .downcast_ref::<Int32Array>()
10820 .unwrap();
10821 assert_values_in_range(i_array, 0..200, "Should only get results from fragment 0");
10822
10823 let mut scanner = build_scanner(&test_ds.dataset);
10825 scanner.with_fragments(vec![
10826 fragments[0].clone(),
10827 fragments[1].clone(),
10828 fragments[2].clone(),
10829 ]);
10830 let batch = scanner.try_into_batch().await.unwrap();
10831 let i_array = batch
10832 .column_by_name("i")
10833 .unwrap()
10834 .as_any()
10835 .downcast_ref::<Int32Array>()
10836 .unwrap();
10837 assert_values_in_range(
10838 i_array,
10839 0..410,
10840 "Should get results from fragments 0, 1, and 2, excluding fragment 3",
10841 );
10842
10843 let mut scanner = build_scanner(&test_ds.dataset);
10845 scanner.with_fragments(vec![fragments[0].clone(), fragments[2].clone()]);
10846 let batch = scanner.try_into_batch().await.unwrap();
10847 let i_array = batch
10848 .column_by_name("i")
10849 .unwrap()
10850 .as_any()
10851 .downcast_ref::<Int32Array>()
10852 .unwrap();
10853 assert!(
10854 i_array
10855 .iter()
10856 .all(|v| v.is_some_and(|val| (0..200).contains(&val) || (400..410).contains(&val)))
10857 && i_array
10858 .iter()
10859 .any(|v| v.is_some_and(|val| (0..200).contains(&val)))
10860 && i_array
10861 .iter()
10862 .any(|v| v.is_some_and(|val| (400..410).contains(&val))),
10863 "Should only get results from fragment 0 (indexed) and fragment 2 (unindexed)"
10864 );
10865 }
10866
10867 #[tokio::test]
10868 async fn test_vector_search_respects_fragment_list() {
10869 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10870 .await
10871 .unwrap();
10872
10873 test_ds.make_segmented_vector_index().await.unwrap();
10875
10876 let query: Float32Array = (0..32).map(|v| v as f32).collect();
10877
10878 test_ds.append_data_with_range(400, 410).await.unwrap();
10880 test_ds.append_data_with_range(410, 420).await.unwrap();
10881
10882 let fragments = test_ds.dataset.fragments();
10885 assert_eq!(fragments.len(), 4);
10886
10887 test_fragment_list_filtering(&test_ds, fragments, |dataset| {
10888 let mut scanner = dataset.scan();
10889 scanner.nearest("vec", &query, 420).unwrap();
10890 scanner
10891 })
10892 .await;
10893 }
10894
10895 #[tokio::test]
10896 async fn test_vector_search_fragment_filter_prunes_segment_fanout() {
10897 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10898 .await
10899 .unwrap();
10900 test_ds.make_segmented_vector_index().await.unwrap();
10901
10902 let query: Float32Array = (0..32).map(|v| v as f32).collect();
10903 test_ds.append_data_with_range(400, 410).await.unwrap();
10904 test_ds.append_data_with_range(410, 420).await.unwrap();
10905 let fragments = test_ds.dataset.fragments();
10906
10907 let mut scanner = test_ds.dataset.scan();
10908 scanner.nearest("vec", &query, 420).unwrap();
10909 let full_plan = scanner.explain_plan(true).await.unwrap();
10910 assert!(
10911 full_plan.contains("ANNSubIndex: name=idx, k=420, deltas=2, metric=L2"),
10912 "expected two ANN deltas without fragment filter, plan was:\n{full_plan}"
10913 );
10914
10915 let mut scanner = test_ds.dataset.scan();
10916 scanner
10917 .nearest("vec", &query, 420)
10918 .unwrap()
10919 .with_fragments(vec![fragments[0].clone()]);
10920 let filtered_plan = scanner.explain_plan(true).await.unwrap();
10921 assert!(
10922 filtered_plan.contains("ANNSubIndex: name=idx, k=420, deltas=1, metric=L2"),
10923 "expected one ANN delta with fragment filter, plan was:\n{filtered_plan}"
10924 );
10925 }
10926
10927 #[tokio::test]
10928 async fn test_vector_search_respects_index_segments() {
10929 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10930 .await
10931 .unwrap();
10932 let segment_ids = test_ds.make_segmented_vector_index().await.unwrap();
10933
10934 let query: Float32Array = (0..32).map(|v| v as f32).collect();
10935 test_ds.append_data_with_range(400, 410).await.unwrap();
10936 test_ds.append_data_with_range(410, 420).await.unwrap();
10937
10938 let mut scanner = test_ds.dataset.scan();
10939 scanner
10940 .nearest("vec", &query, 420)
10941 .unwrap()
10942 .with_index_segments(vec![segment_ids[0]])
10943 .unwrap();
10944 let batch = scanner.try_into_batch().await.unwrap();
10945 let i_array = batch
10946 .column_by_name("i")
10947 .unwrap()
10948 .as_any()
10949 .downcast_ref::<Int32Array>()
10950 .unwrap();
10951 assert_eq!(batch.num_rows(), 200);
10952 assert_values_in_range(
10953 i_array,
10954 0..200,
10955 "Should only get results from the selected index segment",
10956 );
10957 }
10958
10959 #[tokio::test]
10960 async fn test_vector_search_intersects_fragments_and_index_segments() {
10961 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
10962 .await
10963 .unwrap();
10964 let segment_ids = test_ds.make_segmented_vector_index().await.unwrap();
10965
10966 let query: Float32Array = (0..32).map(|v| v as f32).collect();
10967 test_ds.append_data_with_range(400, 410).await.unwrap();
10968 test_ds.append_data_with_range(410, 420).await.unwrap();
10969 let fragments = test_ds.dataset.fragments();
10970
10971 let mut scanner = test_ds.dataset.scan();
10972 scanner
10973 .nearest("vec", &query, 420)
10974 .unwrap()
10975 .with_fragments(vec![fragments[0].clone(), fragments[2].clone()])
10976 .with_index_segments(vec![segment_ids[0]])
10977 .unwrap();
10978 let batch = scanner.try_into_batch().await.unwrap();
10979 let i_array = batch
10980 .column_by_name("i")
10981 .unwrap()
10982 .as_any()
10983 .downcast_ref::<Int32Array>()
10984 .unwrap();
10985 assert!(
10986 i_array
10987 .iter()
10988 .all(|v| v.is_some_and(|val| (0..200).contains(&val) || (400..410).contains(&val)))
10989 && i_array
10990 .iter()
10991 .any(|v| v.is_some_and(|val| (0..200).contains(&val)))
10992 && i_array
10993 .iter()
10994 .any(|v| v.is_some_and(|val| (400..410).contains(&val))),
10995 "Should get selected segment rows plus flat fallback for target fragments outside the selected segments"
10996 );
10997 }
10998
10999 #[tokio::test]
11000 async fn test_vector_search_rejects_unknown_index_segment() {
11001 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
11002 .await
11003 .unwrap();
11004 test_ds.make_segmented_vector_index().await.unwrap();
11005
11006 let query: Float32Array = (0..32).map(|v| v as f32).collect();
11007 let err = test_ds
11008 .dataset
11009 .scan()
11010 .nearest("vec", &query, 10)
11011 .unwrap()
11012 .with_index_segments(vec![Uuid::new_v4()])
11013 .unwrap()
11014 .try_into_batch()
11015 .await
11016 .unwrap_err();
11017 assert!(
11018 err.to_string().contains("unknown index segments"),
11019 "unexpected error: {err}"
11020 );
11021 }
11022
11023 #[tokio::test]
11024 async fn test_vector_search_rejects_metric_mismatch_for_index_segments() {
11025 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
11026 .await
11027 .unwrap();
11028 let segment_ids = test_ds.make_segmented_vector_index().await.unwrap();
11029
11030 let query: Float32Array = (0..32).map(|v| v as f32).collect();
11031 let err = test_ds
11032 .dataset
11033 .scan()
11034 .nearest("vec", &query, 10)
11035 .unwrap()
11036 .distance_metric(DistanceType::Dot)
11037 .with_index_segments(vec![segment_ids[0]])
11038 .unwrap()
11039 .try_into_batch()
11040 .await
11041 .unwrap_err();
11042 assert!(
11043 err.to_string()
11044 .contains("with_index_segments requested metric"),
11045 "unexpected error: {err}"
11046 );
11047 }
11048
11049 #[tokio::test]
11050 async fn test_with_index_segments_rejects_empty_list() {
11051 let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
11052 .await
11053 .unwrap();
11054 let query: Float32Array = (0..32).map(|v| v as f32).collect();
11055
11056 let Err(err) = test_ds
11057 .dataset
11058 .scan()
11059 .nearest("vec", &query, 10)
11060 .unwrap()
11061 .with_index_segments(vec![])
11062 else {
11063 panic!("expected empty index segments to be rejected");
11064 };
11065 assert!(
11066 err.to_string()
11067 .contains("with_index_segments does not accept an empty segment list"),
11068 "unexpected error: {err}"
11069 );
11070 }
11071
11072 #[tokio::test]
11073 async fn test_with_index_segments_rejected_for_non_vector_query() {
11074 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
11075 .await
11076 .unwrap();
11077 let segment_ids = test_ds.make_segmented_vector_index().await.unwrap();
11078
11079 let err = test_ds
11080 .dataset
11081 .scan()
11082 .project(&["i"])
11083 .unwrap()
11084 .with_index_segments(vec![segment_ids[0]])
11085 .unwrap()
11086 .try_into_batch()
11087 .await
11088 .unwrap_err();
11089 assert!(
11090 err.to_string()
11091 .contains("with_index_segments is only supported for vector search"),
11092 "unexpected error: {err}"
11093 );
11094 }
11095
11096 #[tokio::test]
11097 async fn test_fts_respects_fragment_list() {
11098 let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false)
11099 .await
11100 .unwrap();
11101
11102 test_ds.make_fts_index().await.unwrap();
11104
11105 test_ds.append_data_with_range(400, 410).await.unwrap();
11107 test_ds.append_data_with_range(410, 420).await.unwrap();
11108
11109 let fragments = test_ds.dataset.fragments();
11112 assert_eq!(fragments.len(), 4);
11113
11114 test_fragment_list_filtering(&test_ds, fragments, |dataset| {
11116 let mut scanner = dataset.scan();
11117 scanner
11118 .full_text_search(FullTextSearchQuery::new("s-5".into()))
11119 .unwrap();
11120 scanner
11121 })
11122 .await;
11123 }
11124}