vortex_serde/file/read/mod.rs
1use std::collections::BTreeSet;
2use std::fmt::Debug;
3use std::sync::Arc;
4
5use vortex_array::Array;
6use vortex_error::VortexResult;
7
8mod buffered;
9pub mod builder;
10mod cache;
11mod column_batch;
12mod context;
13mod expr_project;
14mod filtering;
15pub mod layouts;
16mod mask;
17mod recordbatchreader;
18mod stream;
19
20pub use builder::initial_read::*;
21pub use builder::VortexReadBuilder;
22pub use cache::*;
23pub use context::*;
24pub use filtering::RowFilter;
25pub use recordbatchreader::{AsyncRuntime, VortexRecordBatchReader};
26pub use stream::VortexFileArrayStream;
27use vortex_expr::VortexExpr;
28pub use vortex_schema::projection::Projection;
29pub use vortex_schema::Schema;
30
31pub use crate::file::read::mask::RowMask;
32use crate::stream_writer::ByteRange;
33
/// Size in bytes (8 MiB) of the initial read, chosen to match the read size recommended by the
/// AWS performance guide.
pub const INITIAL_READ_SIZE: usize = 8 << 20;
36
37/// Operation to apply to data returned by the layout
38#[derive(Debug, Clone)]
39pub struct Scan {
40 expr: Option<Arc<dyn VortexExpr>>,
41}
42
43impl Scan {
44 pub fn new(expr: Option<Arc<dyn VortexExpr>>) -> Self {
45 Self { expr }
46 }
47}
48
/// Unique identifier for a message within a layout.
pub type LayoutPartId = u16;
/// Path through the layout tree to a given message, as a sequence of [LayoutPartId]s.
pub type MessageId = Vec<LayoutPartId>;
/// ID and byte range of an atomic element of the file.
pub type Message = (MessageId, ByteRange);
55
/// Result of asking a layout to read a selection: the layout either asks for more messages to be
/// read or returns a batch of data.
#[derive(Debug)]
pub enum BatchRead {
    /// Messages the layout asks to have read.
    ReadMore(Vec<Message>),
    /// A batch of data, as an [Array].
    Batch(Array),
}
61
/// A reader for a layout, a serialized sequence of Vortex arrays.
///
/// Some layouts are _horizontally divisible_: they can read a sub-sequence of rows independently
/// of other sub-sequences. A layout advertises its sub-divisions in its
/// [add_splits][Self::add_splits] method. Any layout which is or contains a chunked layout is
/// horizontally divisible.
///
/// The [read_selection][Self::read_selection] method accepts and applies a [RowMask], reading only
/// the sub-divisions which contain the selected (i.e. masked) rows.
pub trait LayoutReader: Debug + Send {
    /// Register all horizontal row boundaries of this layout.
    ///
    /// A layout should register all indivisible absolute row boundaries of the data stored in
    /// itself and its children. `row_offset` gives the row position of this layout relative to
    /// the beginning of the file.
    fn add_splits(&self, row_offset: usize, splits: &mut BTreeSet<usize>) -> VortexResult<()>;

    /// Reads the data from the underlying layout within the given selection.
    ///
    /// The layout is required to return all data for the given selection in one batch. It can
    /// either return a batch of data (i.e., an Array) or ask for more layout messages to be read.
    /// When messages are requested, the caller should populate the message cache used when
    /// creating the invoked instance of this trait and then call back into this function.
    ///
    /// The layout is finished producing data for the selection when it returns `None`.
    fn read_selection(&mut self, selector: &RowMask) -> VortexResult<Option<BatchRead>>;
}