lance/lib.rs
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Lance Columnar Data Format
//!
//! The Lance columnar data format is an alternative to Parquet. It provides
//! 100x faster random access, automatic versioning, and optimizations for
//! computer vision, bioinformatics, spatial, and ML data, and it is
//! compatible with [Apache Arrow](https://arrow.apache.org/) and DuckDB.
//!
//! # Create a Dataset
//!
//! ```rust
//! # use std::sync::Arc;
//! # use tokio::runtime::Runtime;
//! # use arrow_array::{RecordBatch, RecordBatchIterator};
//! # use arrow_schema::{Schema, Field, DataType};
//! use lance::{dataset::WriteParams, Dataset};
//!
//! # let rt = Runtime::new().unwrap();
//! # rt.block_on(async {
//! #
//! # let test_dir = tempfile::tempdir().unwrap();
//! # let uri = test_dir.path().to_str().unwrap().to_string();
//! let schema = Arc::new(Schema::new(vec![Field::new("test", DataType::Int64, false)]));
//! let batches = vec![RecordBatch::new_empty(schema.clone())];
//! let reader = RecordBatchIterator::new(
//!     batches.into_iter().map(Ok), schema
//! );
//!
//! let write_params = WriteParams::default();
//! Dataset::write(reader, &uri, Some(write_params)).await.unwrap();
//! # })
//! ```
//!
//! # Scan a Dataset
//!
//! ```rust
//! # use std::sync::Arc;
//! # use arrow_array::{RecordBatch, Int32Array, RecordBatchIterator, ArrayRef};
//! # use tokio::runtime::Runtime;
//! use futures::StreamExt;
//! use lance::Dataset;
//! # use lance::dataset::WriteParams;
//!
//! # let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
//! # let batches = vec![RecordBatch::try_from_iter(vec![("test", array)]).unwrap()];
//! # let test_dir = tempfile::tempdir().unwrap();
//! # let path = test_dir.path().to_str().unwrap().to_string();
//! # let schema = batches[0].schema();
//! # let rt = Runtime::new().unwrap();
//! # rt.block_on(async {
//! # let write_params = WriteParams::default();
//! # let reader = RecordBatchIterator::new(
//! #     batches.into_iter().map(Ok), schema
//! # );
//! # Dataset::write(reader, &path, Some(write_params)).await.unwrap();
//! let dataset = Dataset::open(&path).await.unwrap();
//! let mut scanner = dataset.scan();
//! let batches: Vec<RecordBatch> = scanner
//!     .try_into_stream()
//!     .await
//!     .unwrap()
//!     .map(|b| b.unwrap())
//!     .collect::<Vec<RecordBatch>>()
//!     .await;
//! # })
//! ```

use arrow_schema::DataType;
use dataset::builder::DatasetBuilder;
pub use lance_core::datatypes;
pub use lance_core::{Error, Result};
use std::sync::LazyLock;

pub mod arrow;
pub mod datafusion;
pub mod dataset;
pub mod index;
pub mod io;
pub mod session;
pub mod table;
pub mod utils;

pub use dataset::Dataset;
use lance_index::vector::DIST_COL;

/// Creates and loads a [`Dataset`] from the given path.
///
/// Infers the storage backend to use from the scheme in the given table path.
/// For more advanced configurations, use [`DatasetBuilder`].
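///
/// # Examples
///
/// A minimal sketch; the URI below is illustrative, so the example is
/// compiled but not run:
///
/// ```rust,no_run
/// # async fn example() -> lance::Result<()> {
/// // Backend (here S3) is inferred from the URI scheme.
/// let dataset = lance::open_dataset("s3://bucket/my_table.lance").await?;
/// println!("schema: {:?}", dataset.schema());
/// # Ok(())
/// # }
/// ```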
pub async fn open_dataset<T: AsRef<str>>(table_uri: T) -> Result<Dataset> {
    DatasetBuilder::from_uri(table_uri.as_ref()).load().await
}

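/// Schema field for the vector-search distance column: a nullable `Float32`
/// field named after `DIST_COL` (from `lance_index::vector`).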
pub static DIST_FIELD: LazyLock<arrow_schema::Field> =
    LazyLock::new(|| arrow_schema::Field::new(DIST_COL, DataType::Float32, true));

/// Re-exports of third-party dependencies used in Lance public APIs.
///
/// Users that only need these dependencies to communicate with Lance APIs
/// can use these re-exports to ensure they are always pinned to the same
/// version that Lance is using.
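///
/// # Examples
///
/// A minimal sketch of building an Arrow schema through the re-exports
/// (the field name is illustrative):
///
/// ```rust
/// use lance::deps::arrow_schema::{DataType, Field, Schema};
///
/// // Uses the same arrow_schema version lance was compiled with,
/// // so the types are guaranteed to match lance's public APIs.
/// let schema = Schema::new(vec![Field::new("id", DataType::Int64, false)]);
/// assert_eq!(schema.fields().len(), 1);
/// ```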
pub mod deps {
    pub use arrow_array;
    pub use arrow_schema;
    pub use datafusion;
}