// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Lance Columnar Data Format
//!
//! Lance columnar data format is an alternative to Parquet. It provides 100x faster
//! random access, automatic versioning, and is optimized for computer vision,
//! bioinformatics, spatial, and ML data. It is compatible with
//! [Apache Arrow](https://arrow.apache.org/) and DuckDB.
//!
//! # Create a Dataset
//!
//! ```rust
//! # use std::sync::Arc;
//! # use tokio::runtime::Runtime;
//! # use arrow_array::{RecordBatch, RecordBatchIterator};
//! # use arrow_schema::{Schema, Field, DataType};
//! use lance::{dataset::WriteParams, Dataset};
//!
//! # let rt = Runtime::new().unwrap();
//! # rt.block_on(async {
//! #
//! # let test_dir = tempfile::tempdir().unwrap();
//! # let uri = test_dir.path().to_str().unwrap().to_string();
//! let schema = Arc::new(Schema::new(vec![Field::new("test", DataType::Int64, false)]));
//! let batches = vec![RecordBatch::new_empty(schema.clone())];
//! let reader = RecordBatchIterator::new(
//!     batches.into_iter().map(Ok), schema
//! );
//!
//! let write_params = WriteParams::default();
//! Dataset::write(reader, &uri, Some(write_params)).await.unwrap();
//! # })
//! ```
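//!
//! To add rows to an existing dataset, the same [`Dataset::write`] entry point
//! accepts an append mode. A minimal sketch (assuming the `WriteMode` enum in
//! `lance::dataset`; `uri` points at the dataset created above):
//!
//! ```rust,no_run
//! # use std::sync::Arc;
//! # use arrow_array::{RecordBatch, RecordBatchIterator};
//! # use arrow_schema::{Schema, Field, DataType};
//! # use lance::{dataset::{WriteMode, WriteParams}, Dataset};
//! # async fn append(uri: &str) -> lance::Result<()> {
//! let schema = Arc::new(Schema::new(vec![Field::new("test", DataType::Int64, false)]));
//! let batches = vec![RecordBatch::new_empty(schema.clone())];
//! let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema);
//!
//! // Append to the existing dataset rather than creating a new one.
//! let write_params = WriteParams {
//!     mode: WriteMode::Append,
//!     ..Default::default()
//! };
//! Dataset::write(reader, uri, Some(write_params)).await?;
//! # Ok(())
//! # }
//! ```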
//!
//! # Scan a Dataset
//!
//! ```rust
//! # use std::sync::Arc;
//! # use arrow_array::{RecordBatch, Int32Array, RecordBatchIterator, ArrayRef};
//! # use tokio::runtime::Runtime;
//! use futures::StreamExt;
//! use lance::Dataset;
//! # use lance::dataset::WriteParams;
//!
//! # let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
//! # let batches = vec![RecordBatch::try_from_iter(vec![("test", array)]).unwrap()];
//! # let test_dir = tempfile::tempdir().unwrap();
//! # let path = test_dir.path().to_str().unwrap().to_string();
//! # let schema = batches[0].schema();
//! # let rt = Runtime::new().unwrap();
//! # rt.block_on(async {
//! #   let write_params = WriteParams::default();
//! #   let reader = RecordBatchIterator::new(
//! #       batches.into_iter().map(Ok), schema
//! #   );
//! #   Dataset::write(reader, &path, Some(write_params)).await.unwrap();
//! let dataset = Dataset::open(&path).await.unwrap();
//! let mut scanner = dataset.scan();
//! let batches: Vec<RecordBatch> = scanner
//!     .try_into_stream()
//!     .await
//!     .unwrap()
//!     .map(|b| b.unwrap())
//!     .collect::<Vec<RecordBatch>>()
//!     .await;
//! # })
//! ```
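//!
//! The scanner can also push a projection and filter down into the read. A
//! minimal sketch (assuming the scanner's `project` and `filter` methods;
//! `path` points at an existing dataset):
//!
//! ```rust,no_run
//! # use arrow_array::RecordBatch;
//! # use futures::StreamExt;
//! # use lance::Dataset;
//! # async fn scan_filtered(path: &str) -> lance::Result<()> {
//! let dataset = Dataset::open(path).await?;
//! let mut scanner = dataset.scan();
//! // Read only the `test` column, and only rows where it is positive.
//! scanner.project(&["test"])?.filter("test > 0")?;
//! let batches: Vec<RecordBatch> = scanner
//!     .try_into_stream()
//!     .await?
//!     .map(|b| b.unwrap())
//!     .collect::<Vec<_>>()
//!     .await;
//! println!("read {} batches", batches.len());
//! # Ok(())
//! # }
//! ```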
//!

use arrow_schema::DataType;
use dataset::builder::DatasetBuilder;
pub use lance_core::{datatypes, error};
pub use lance_core::{Error, Result};

pub mod arrow;
pub mod datafusion;
pub mod dataset;
pub mod index;
pub mod io;
pub mod session;
pub mod table;
pub mod utils;

pub use dataset::Dataset;
use lance_index::vector::DIST_COL;

/// Opens and loads a [`Dataset`] from the given path.
/// The storage backend to use is inferred from the scheme of the given table URI.
///
/// For more advanced configurations use [`DatasetBuilder`].
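///
/// A minimal sketch (`"./data/my_table.lance"` below is a placeholder URI for
/// illustration):
///
/// ```rust,no_run
/// # async fn example() -> lance::Result<()> {
/// let dataset = lance::open_dataset("./data/my_table.lance").await?;
/// println!("{:?}", dataset.schema());
/// # Ok(())
/// # }
/// ```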
pub async fn open_dataset<T: AsRef<str>>(table_uri: T) -> Result<Dataset> {
    DatasetBuilder::from_uri(table_uri.as_ref()).load().await
}

lazy_static::lazy_static! {
    /// Schema field for the distance column ([`DIST_COL`]) that vector search
    /// appends to its result batches.
    pub static ref DIST_FIELD: arrow_schema::Field =
        arrow_schema::Field::new(DIST_COL, DataType::Float32, true);
}