vortex_file/read/mod.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
use std::collections::BTreeSet;
use std::fmt::Debug;
use vortex_array::ArrayData;
use vortex_error::VortexResult;
pub mod builder;
mod cache;
mod context;
mod expr_project;
mod filtering;
pub mod layouts;
mod mask;
pub mod projection;
mod recordbatchreader;
mod splits;
mod stream;
pub use builder::initial_read::*;
pub use builder::VortexReadBuilder;
pub use cache::*;
pub use context::*;
pub use filtering::RowFilter;
pub use projection::Projection;
pub use recordbatchreader::{AsyncRuntime, VortexRecordBatchReader};
pub use stream::VortexFileArrayStream;
use vortex_expr::ExprRef;
use vortex_ipc::stream_writer::ByteRange;
pub use crate::read::mask::RowMask;
// Recommended read-size according to the AWS performance guide
pub const INITIAL_READ_SIZE: usize = 8 * 1024 * 1024;
/// Operation to apply to data returned by the layout
#[derive(Debug, Clone)]
pub struct Scan {
expr: Option<ExprRef>,
}
impl Scan {
pub fn new(expr: Option<ExprRef>) -> Self {
Self { expr }
}
}
/// Unique identifier for a message within a layout
pub type LayoutPartId = u16;
/// Path through layout tree to given message
pub type MessageId = Vec<LayoutPartId>;
/// A unique locator for a message, including its ID and byte range containing
/// the message contents.
#[derive(Debug, Clone)]
pub struct MessageLocator(pub MessageId, pub ByteRange);
#[derive(Debug)]
pub enum BatchRead {
ReadMore(Vec<MessageLocator>),
Batch(ArrayData),
}
/// A reader for a layout, a serialized sequence of Vortex arrays.
///
/// Some layouts are _horizontally divisble_: they can read a sub-sequence of rows independently of
/// other sub-sequences. A layout advertises its sub-divisions in its [add_splits][Self::add_splits]
/// method. Any layout which is or contains a chunked layout is horizontally divisble.
///
/// The [read_selection][Self::read_selection] method accepts and applies a [RowMask], reading only
/// the sub-divisions which contain the selected (i.e. masked) rows.
pub trait LayoutReader: Debug + Send {
/// Register all horizontal row boundaries of this layout.
///
/// Layout should register all indivisible absolute row boundaries of the data stored in itself and its children.
/// `row_offset` gives the relative row position of this layout to the beginning of the file.
fn add_splits(&self, row_offset: usize, splits: &mut BTreeSet<usize>) -> VortexResult<()>;
/// Reads the data from the underlying layout within given selection
///
/// Layout is required to return all data for given selection in one batch. Layout can either
/// return a batch of data (i.e., an Array) or ask for more layout messages to be read. When
/// requesting messages to be read the caller should populate the message cache used when
/// creating the invoked instance of this trait and then call back into this function.
///
/// The layout is finished producing data for selection when it returns None
fn read_selection(&self, selector: &RowMask) -> VortexResult<Option<BatchRead>>;
}