pub struct Dataset { /* private fields */ }
Expand description
Lance Dataset
Implementations§
source§impl Dataset
impl Dataset
sourcepub async fn open_with_params(uri: &str, params: &ReadParams) -> Result<Self>
pub async fn open_with_params(uri: &str, params: &ReadParams) -> Result<Self>
Open a dataset with read params.
sourcepub async fn checkout(uri: &str, version: u64) -> Result<Self>
pub async fn checkout(uri: &str, version: u64) -> Result<Self>
Check out a version of the dataset.
sourcepub async fn checkout_with_params(
uri: &str,
version: u64,
params: &ReadParams
) -> Result<Self>
pub async fn checkout_with_params( uri: &str, version: u64, params: &ReadParams ) -> Result<Self>
Check out a version of the dataset with read params.
sourcepub async fn checkout_version(&self, version: u64) -> Result<Self>
pub async fn checkout_version(&self, version: u64) -> Result<Self>
Check out the specified version of this dataset
sourcepub async fn write(
batches: impl RecordBatchReader + Send + 'static,
uri: &str,
params: Option<WriteParams>
) -> Result<Self>
pub async fn write( batches: impl RecordBatchReader + Send + 'static, uri: &str, params: Option<WriteParams> ) -> Result<Self>
Write to or create a Dataset with a stream of RecordBatches.
Returns the newly created Dataset.
Or returns an error if the dataset already exists.
sourcepub async fn append(
&mut self,
batches: impl RecordBatchReader + Send + 'static,
params: Option<WriteParams>
) -> Result<()>
pub async fn append( &mut self, batches: impl RecordBatchReader + Send + 'static, params: Option<WriteParams> ) -> Result<()>
Append to an existing Dataset with a stream of RecordBatches.
Returns an empty result on success, or an error on failure.
pub async fn latest_manifest(&self) -> Result<Manifest>
sourcepub async fn restore(&mut self, write_params: Option<WriteParams>) -> Result<()>
pub async fn restore(&mut self, write_params: Option<WriteParams>) -> Result<()>
Restore the currently checked out version of the dataset as the latest version.
Currently, write_params is just used to get additional store params.
Other options are ignored.
sourcepub fn cleanup_old_versions(
&self,
older_than: Duration,
delete_unverified: Option<bool>
) -> BoxFuture<'_, Result<RemovalStats>>
pub fn cleanup_old_versions( &self, older_than: Duration, delete_unverified: Option<bool> ) -> BoxFuture<'_, Result<RemovalStats>>
Removes old versions of the dataset from disk
This function will remove all versions of the dataset that are older than the provided timestamp. This function will not remove the current version of the dataset.
Once a version is removed it can no longer be checked out or restored. Any data unique to that version will be lost.
Arguments
older_than - Versions older than this will be deleted.
delete_unverified - If false (the default) then files will only be deleted if they are listed in at least one manifest. Otherwise these files will be kept since they cannot be distinguished from an in-progress transaction. Set to true to delete these files if you are sure there are no other in-progress dataset operations.
Returns
RemovalStats- Statistics about the removal operation
sourcepub async fn commit(
base_uri: &str,
operation: Operation,
read_version: Option<u64>,
store_params: Option<ObjectStoreParams>
) -> Result<Self>
pub async fn commit( base_uri: &str, operation: Operation, read_version: Option<u64>, store_params: Option<ObjectStoreParams> ) -> Result<Self>
Commit changes to the dataset
This operation is not needed if you are using append/write/delete to manipulate the dataset. It is used to commit changes to the dataset that are made externally. For example, a bulk import tool may import large amounts of new data and write the appropriate lance files directly instead of using the write function.
This method can be used to commit this change to the dataset’s manifest. This method will not verify that the provided fragments exist and are correct; that is the caller’s responsibility.
If this commit is a change to an existing dataset then it will often need to be based on an
existing version of the dataset. For example, if this change is a delete operation then
the caller will have read in the existing data (at some version) to determine which fragments
need to be deleted. The base version that the caller used should be supplied as the read_version
parameter. Some operations (e.g. Overwrite) do not depend on a previous version and read_version
can be None. An error will be returned if the read_version is needed for an operation and
it is not specified.
All operations except Overwrite will fail if the dataset does not already exist.
Arguments
base_uri - The base URI of the dataset
operation - A description of the change to commit
read_version - The version of the dataset that this change is based on
store_params - Parameters controlling object store access to the manifest
pub async fn merge( &mut self, stream: impl RecordBatchReader + Send + 'static, left_on: &str, right_on: &str ) -> Result<()>
sourcepub async fn count_rows(&self) -> Result<usize>
pub async fn count_rows(&self) -> Result<usize>
Count the number of rows in the dataset.
It offers a fast path of counting rows by just computing via metadata.
pub async fn take( &self, row_indices: &[u64], projection: &Schema ) -> Result<RecordBatch>
sourcepub async fn take_rows(
&self,
row_ids: &[u64],
projection: &Schema
) -> Result<RecordBatch>
pub async fn take_rows( &self, row_ids: &[u64], projection: &Schema ) -> Result<RecordBatch>
Take rows by the internal ROW ids.
sourcepub fn take_scan(
&self,
row_ranges: Pin<Box<dyn Stream<Item = Result<Range<u64>>> + Send>>,
projection: Arc<Schema>,
batch_readahead: usize
) -> DatasetRecordBatchStream
pub fn take_scan( &self, row_ranges: Pin<Box<dyn Stream<Item = Result<Range<u64>>> + Send>>, projection: Arc<Schema>, batch_readahead: usize ) -> DatasetRecordBatchStream
Get a stream of batches based on iterator of ranges of row numbers.
This is an experimental API. It may change at any time.
sourcepub async fn delete(&mut self, predicate: &str) -> Result<()>
pub async fn delete(&mut self, predicate: &str) -> Result<()>
Delete rows based on a predicate.
pub async fn count_deleted_rows(&self) -> Result<usize>
pub fn version(&self) -> Version
sourcepub async fn latest_version_id(&self) -> Result<u64>
pub async fn latest_version_id(&self) -> Result<u64>
Get the latest version of the dataset. This is meant to be a fast path for checking if a dataset has changed; this is why we don’t return the full version struct.
pub fn count_fragments(&self) -> usize
pub fn schema(&self) -> &Schema
sourcepub fn get_fragments(&self) -> Vec<FileFragment>
pub fn get_fragments(&self) -> Vec<FileFragment>
Get fragments.
If filter is provided, only fragments with the given name will be returned.
pub fn get_fragment(&self, fragment_id: usize) -> Option<FileFragment>
sourcepub async fn load_indices(&self) -> Result<Vec<Index>>
pub async fn load_indices(&self) -> Result<Vec<Index>>
Read all indices of this Dataset version.
sourcepub async fn load_index(&self, uuid: &str) -> Option<Index>
pub async fn load_index(&self, uuid: &str) -> Option<Index>
Loads a specific index with the given id
pub async fn load_index_by_name(&self, name: &str) -> Option<Index>
sourcepub async fn index_statistics(&self, index_name: &str) -> Result<Option<String>>
pub async fn index_statistics(&self, index_name: &str) -> Result<Option<String>>
Find index with a given index_name and return its serialized statistics.
pub async fn count_unindexed_rows( &self, index_uuid: &str ) -> Result<Option<usize>>
pub async fn count_indexed_rows( &self, index_uuid: &str ) -> Result<Option<usize>>
sourcepub async fn num_small_files(&self, max_rows_per_group: usize) -> usize
pub async fn num_small_files(&self, max_rows_per_group: usize) -> usize
Gets the number of files that are so small they don’t even have a full group. These are considered too small because reading many of them is much less efficient than reading a single file because the separate files split up what would otherwise be single IO requests into multiple.
pub async fn validate(&self) -> Result<()>
Trait Implementations§
source§impl DatasetIndexExt for Dataset
impl DatasetIndexExt for Dataset
source§fn create_index<'life0, 'life1, 'life2, 'life3, 'async_trait>(
&'life0 mut self,
columns: &'life1 [&'life2 str],
index_type: IndexType,
name: Option<String>,
params: &'life3 dyn IndexParams,
replace: bool
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
'life2: 'async_trait,
'life3: 'async_trait,
fn create_index<'life0, 'life1, 'life2, 'life3, 'async_trait>(
&'life0 mut self,
columns: &'life1 [&'life2 str],
index_type: IndexType,
name: Option<String>,
params: &'life3 dyn IndexParams,
replace: bool
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
'life2: 'async_trait,
'life3: 'async_trait,
source§impl TableProvider for Dataset
impl TableProvider for Dataset
source§fn as_any(&self) -> &dyn Any
fn as_any(&self) -> &dyn Any
Returns the table provider as Any so that it can be
downcast to a specific implementation.
source§fn schema(&self) -> Arc<ArrowSchema>
fn schema(&self) -> Arc<ArrowSchema>
source§fn table_type(&self) -> TableType
fn table_type(&self) -> TableType
source§fn get_table_definition(&self) -> Option<&str>
fn get_table_definition(&self) -> Option<&str>
source§fn get_logical_plan(&self) -> Option<&LogicalPlan>
fn get_logical_plan(&self) -> Option<&LogicalPlan>
source§fn scan<'life0, 'life1, 'life2, 'life3, 'async_trait>(
&'life0 self,
_: &'life1 SessionState,
projection: Option<&'life2 Vec<usize>>,
_: &'life3 [Expr],
limit: Option<usize>
) -> Pin<Box<dyn Future<Output = DatafusionResult<Arc<dyn ExecutionPlan>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
'life2: 'async_trait,
'life3: 'async_trait,
fn scan<'life0, 'life1, 'life2, 'life3, 'async_trait>(
&'life0 self,
_: &'life1 SessionState,
projection: Option<&'life2 Vec<usize>>,
_: &'life3 [Expr],
limit: Option<usize>
) -> Pin<Box<dyn Future<Output = DatafusionResult<Arc<dyn ExecutionPlan>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
'life2: 'async_trait,
'life3: 'async_trait,
source§fn constraints(&self) -> Option<&Constraints>
fn constraints(&self) -> Option<&Constraints>
source§fn supports_filter_pushdown(
&self,
_filter: &Expr
) -> Result<TableProviderFilterPushDown, DataFusionError>
fn supports_filter_pushdown( &self, _filter: &Expr ) -> Result<TableProviderFilterPushDown, DataFusionError>
source§fn supports_filters_pushdown(
&self,
filters: &[&Expr]
) -> Result<Vec<TableProviderFilterPushDown>, DataFusionError>
fn supports_filters_pushdown( &self, filters: &[&Expr] ) -> Result<Vec<TableProviderFilterPushDown>, DataFusionError>
source§fn statistics(&self) -> Option<Statistics>
fn statistics(&self) -> Option<Statistics>
source§fn insert_into<'life0, 'life1, 'async_trait>(
&'life0 self,
_state: &'life1 SessionState,
_input: Arc<dyn ExecutionPlan>,
_overwrite: bool
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn ExecutionPlan>, DataFusionError>> + Send + 'async_trait>>where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
fn insert_into<'life0, 'life1, 'async_trait>(
&'life0 self,
_state: &'life1 SessionState,
_input: Arc<dyn ExecutionPlan>,
_overwrite: bool
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn ExecutionPlan>, DataFusionError>> + Send + 'async_trait>>where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
Return an [ExecutionPlan] to insert data into this table, if
supported. Read more