Struct lance::dataset::Dataset

source ·
pub struct Dataset { /* private fields */ }
Expand description

Lance Dataset

Implementations§

source§

impl Dataset

source

pub async fn open(uri: &str) -> Result<Self>

Open an existing dataset.

source

pub async fn open_with_params(uri: &str, params: &ReadParams) -> Result<Self>

Open a dataset with read params.

source

pub async fn checkout(uri: &str, version: u64) -> Result<Self>

Check out a version of the dataset.

source

pub async fn checkout_with_params( uri: &str, version: u64, params: &ReadParams ) -> Result<Self>

Check out a version of the dataset with read params.

source

pub async fn checkout_version(&self, version: u64) -> Result<Self>

Check out the specified version of this dataset

source

pub async fn write( batches: impl RecordBatchReader + Send + 'static, uri: &str, params: Option<WriteParams> ) -> Result<Self>

Write to or create a Dataset with a stream of RecordBatches.

Returns the newly created Dataset, or an Error if the dataset already exists.

source

pub async fn append( &mut self, batches: impl RecordBatchReader + Send + 'static, params: Option<WriteParams> ) -> Result<()>

Append to an existing Dataset with a stream of RecordBatches.

Returns an empty result on success, or an Error on failure.

source

pub async fn latest_manifest(&self) -> Result<Manifest>

source

pub async fn restore(&mut self, write_params: Option<WriteParams>) -> Result<()>

Restore the currently checked out version of the dataset as the latest version.

Currently, write_params is just used to get additional store params. Other options are ignored.

source

pub fn cleanup_old_versions( &self, older_than: Duration, delete_unverified: Option<bool> ) -> BoxFuture<'_, Result<RemovalStats>>

Removes old versions of the dataset from disk

This function will remove all versions of the dataset that are older than the provided duration. This function will not remove the current version of the dataset.

Once a version is removed it can no longer be checked out or restored. Any data unique to that version will be lost.

Arguments
  • older_than - Versions older than this will be deleted.
  • delete_unverified - If false (the default) then files will only be deleted if they are listed in at least one manifest. Otherwise these files will be kept since they cannot be distinguished from an in-progress transaction. Set to true to delete these files if you are sure there are no other in-progress dataset operations.
Returns
  • RemovalStats - Statistics about the removal operation
source

pub async fn commit( base_uri: &str, operation: Operation, read_version: Option<u64>, store_params: Option<ObjectStoreParams> ) -> Result<Self>

Commit changes to the dataset

This operation is not needed if you are using append/write/delete to manipulate the dataset. It is used to commit changes to the dataset that are made externally. For example, a bulk import tool may import large amounts of new data and write the appropriate lance files directly instead of using the write function.

This method can be used to commit this change to the dataset’s manifest. This method will not verify that the provided fragments exist and are correct; that is the caller’s responsibility.

If this commit is a change to an existing dataset then it will often need to be based on an existing version of the dataset. For example, if this change is a delete operation then the caller will have read in the existing data (at some version) to determine which fragments need to be deleted. The base version that the caller used should be supplied as the read_version parameter. Some operations (e.g. Overwrite) do not depend on a previous version and read_version can be None. An error will be returned if the read_version is needed for an operation and it is not specified.

All operations except Overwrite will fail if the dataset does not already exist.

Arguments
  • base_uri - The base URI of the dataset
  • operation - A description of the change to commit
  • read_version - The version of the dataset that this change is based on
  • store_params - Parameters controlling object store access to the manifest
source

pub async fn merge( &mut self, stream: impl RecordBatchReader + Send + 'static, left_on: &str, right_on: &str ) -> Result<()>

source

pub fn scan(&self) -> Scanner

Create a Scanner to scan the dataset.

source

pub async fn count_rows(&self) -> Result<usize>

Count the number of rows in the dataset.

It offers a fast path that counts rows using only metadata.

source

pub async fn take( &self, row_indices: &[u64], projection: &Schema ) -> Result<RecordBatch>

source

pub async fn take_rows( &self, row_ids: &[u64], projection: &Schema ) -> Result<RecordBatch>

Take rows by the internal ROW ids.

source

pub fn take_scan( &self, row_ranges: Pin<Box<dyn Stream<Item = Result<Range<u64>>> + Send>>, projection: Arc<Schema>, batch_readahead: usize ) -> DatasetRecordBatchStream

Get a stream of batches based on iterator of ranges of row numbers.

This is an experimental API. It may change at any time.

source

pub async fn delete(&mut self, predicate: &str) -> Result<()>

Delete rows based on a predicate.

source

pub async fn count_deleted_rows(&self) -> Result<usize>

source

pub fn version(&self) -> Version

source

pub async fn versions(&self) -> Result<Vec<Version>>

Get all versions.

source

pub async fn latest_version_id(&self) -> Result<u64>

Get the latest version of the dataset. This is meant to be a fast path for checking if a dataset has changed. This is why we don’t return the full version struct.

source

pub fn count_fragments(&self) -> usize

source

pub fn schema(&self) -> &Schema

source

pub fn get_fragments(&self) -> Vec<FileFragment>

Get fragments.

If filter is provided, only fragments with the given name will be returned.

source

pub fn get_fragment(&self, fragment_id: usize) -> Option<FileFragment>

source

pub async fn load_indices(&self) -> Result<Vec<Index>>

Read all indices of this Dataset version.

source

pub async fn load_index(&self, uuid: &str) -> Option<Index>

Loads a specific index with the given id

source

pub async fn load_index_by_name(&self, name: &str) -> Option<Index>

source

pub async fn index_statistics(&self, index_name: &str) -> Result<Option<String>>

Find index with a given index_name and return its serialized statistics.

source

pub async fn count_unindexed_rows( &self, index_uuid: &str ) -> Result<Option<usize>>

source

pub async fn count_indexed_rows( &self, index_uuid: &str ) -> Result<Option<usize>>

source

pub async fn num_small_files(&self, max_rows_per_group: usize) -> usize

Gets the number of files that are so small they don’t even have a full group. These are considered too small because reading many of them is much less efficient than reading a single file: the separate files split up what would otherwise be single IO requests into multiple.

source

pub async fn validate(&self) -> Result<()>

Trait Implementations§

source§

impl Clone for Dataset

source§

fn clone(&self) -> Dataset

Returns a copy of the value. Read more
1.0.0 · source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
source§

impl DatasetIndexExt for Dataset

source§

fn create_index<'life0, 'life1, 'life2, 'life3, 'async_trait>( &'life0 mut self, columns: &'life1 [&'life2 str], index_type: IndexType, name: Option<String>, params: &'life3 dyn IndexParams, replace: bool ) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait, 'life3: 'async_trait,

Create indices on columns. Read more
source§

fn optimize_indices<'life0, 'async_trait>( &'life0 mut self ) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait,

Optimize indices.
source§

impl Debug for Dataset

source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
source§

impl TableProvider for Dataset

source§

fn as_any(&self) -> &dyn Any

Returns the table provider as Any so that it can be downcast to a specific implementation.
source§

fn schema(&self) -> Arc<ArrowSchema>

Get a reference to the schema for this table
source§

fn table_type(&self) -> TableType

Get the type of this table for metadata/catalog purposes.
source§

fn get_table_definition(&self) -> Option<&str>

Get the create statement used to create this table, if available.
source§

fn get_logical_plan(&self) -> Option<&LogicalPlan>

Get the Logical Plan of this table, if available.
source§

fn scan<'life0, 'life1, 'life2, 'life3, 'async_trait>( &'life0 self, _: &'life1 SessionState, projection: Option<&'life2 Vec<usize>>, _: &'life3 [Expr], limit: Option<usize> ) -> Pin<Box<dyn Future<Output = DatafusionResult<Arc<dyn ExecutionPlan>>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait, 'life3: 'async_trait,

Create an ExecutionPlan that will scan the table. The table provider will usually be responsible for grouping the source data into partitions that can be efficiently parallelized or distributed.
source§

fn constraints(&self) -> Option<&Constraints>

Get a reference to the constraints of the table.
source§

fn supports_filter_pushdown( &self, _filter: &Expr ) -> Result<TableProviderFilterPushDown, DataFusionError>

👎Deprecated since 20.0.0: use supports_filters_pushdown instead
Tests whether the table provider can make use of a filter expression to optimise data retrieval.
source§

fn supports_filters_pushdown( &self, filters: &[&Expr] ) -> Result<Vec<TableProviderFilterPushDown>, DataFusionError>

Tests whether the table provider can make use of any or all filter expressions to optimise data retrieval.
source§

fn statistics(&self) -> Option<Statistics>

Get statistics for this table, if available
source§

fn insert_into<'life0, 'life1, 'async_trait>( &'life0 self, _state: &'life1 SessionState, _input: Arc<dyn ExecutionPlan>, _overwrite: bool ) -> Pin<Box<dyn Future<Output = Result<Arc<dyn ExecutionPlan>, DataFusionError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

Return an [ExecutionPlan] to insert data into this table, if supported. Read more

Auto Trait Implementations§

Blanket Implementations§

source§

impl<T> Any for T
where T: 'static + ?Sized,

source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
source§

impl<T> Borrow<T> for T
where T: ?Sized,

source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
source§

impl<T> From<T> for T

source§

fn from(t: T) -> T

Returns the argument unchanged.

source§

impl<T> Instrument for T

source§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more
source§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
source§

impl<T, U> Into<U> for T
where U: From<T>,

source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

§

impl<T> Pointable for T

§

const ALIGN: usize = _

The alignment of pointer.
§

type Init = T

The type for initializers.
§

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a value with the given initializer. Read more
§

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more
§

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more
§

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more
source§

impl<T> Same for T

§

type Output = T

Should always be Self
source§

impl<T> ToOwned for T
where T: Clone,

§

type Owned = T

The resulting type after obtaining ownership.
source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

§

type Error = Infallible

The type returned in the event of a conversion error.
source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

§

fn vzip(self) -> V

source§

impl<T> WithSubscriber for T

source§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more
source§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more