vortex_serde/file/mod.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
//! Read and write Vortex layouts, a serialization of Vortex arrays.
//!
//! A layout is a serialized array which is stored in some linear and contiguous block of
//! memory. Layouts are recursively defined in terms of one of three kinds:
//!
//! 1. The [flat layout][layouts::FlatLayoutSpec]. A contiguously serialized array using the [Vortex
//! flatbuffer Batch message][vortex_flatbuffers::message].
//!
//! 2. The [columnar layout][layouts::ColumnarLayoutSpec]. Each column of a
//! [StructArray][vortex_array::array::StructArray] is sequentially laid out at known
//! offsets. This permits reading a subset of columns in time linear in the number of kept
//! columns.
//!
//! 3. The [chunked layout][layouts::ChunkedLayoutSpec]. Each chunk of a
//! [ChunkedArray][vortex_array::array::ChunkedArray] is sequentially laid out at known
//! offsets. This permits reading a subset of rows in time linear in the number of kept rows.
//!
//! A layout, alone, is _not_ a standalone Vortex file because layouts are not self-describing. They
//! neither contain a description of the kind of layout (e.g. flat, column of flat, chunked of
//! column of flat) nor a [data type][vortex_dtype::DType]. A standalone Vortex file comprises seven
//! sections, the first of which is the serialized array bytes. The interpretation of those bytes,
//! i.e. which particular layout was used, is given in the fourth section: the footer.
//!
//! <table>
//! <thead>
//! <tr>
//! <th>Section</th>
//! <th>Size</th>
//! <th>Description</th>
//! </tr>
//! </thead>
//! <tr>
//! <td>
//! Data
//! </td>
//! <td>
//! In the Footer.
//! </td>
//! <td>
//! The serialized arrays.
//! </td>
//! </tr><tr>
//! <td>
//! Metadata
//! </td>
//! <td>
//! In the Footer.
//! </td>
//! <td>
//! A table per column with a row per chunk. Contains statistics.
//! </td>
//! </tr><tr>
//! <td>
//! Schema
//! </td>
//! <td>
//! In the Postscript.
//! </td>
//! <td>
//! A serialized data type.
//! </td>
//! </tr><tr>
//! <td>
//! Footer
//! </td>
//! <td>
//! In the Postscript.
//! </td>
//! <td>
//! A recursive description of the layout including the number of rows.
//! </td>
//! </tr><tr>
//! <td>
//! Postscript
//! </td>
//! <td>
//! 32 bytes
//! </td>
//! <td>
//! Two 64-bit offsets pointing at schema and the footer.
//! </td>
//! </tr><tr>
//! <td>
//! Version
//! </td>
//! <td>
//! 4 bytes
//! </td>
//! <td>
//! The file format version.
//! </td>
//! </tr><tr>
//! <td>
//! Magic bytes
//! </td>
//! <td>
//! 4 bytes
//! </td>
//! <td>
//! The ASCII bytes "VTXF" (86, 84, 88, 70; 0x56545846).
//! </td>
//! </tr>
//! </table>
//!
//! A Parquet-style file format is realized by using a chunked layout containing column layouts
//! containing chunked layouts containing flat layouts. The outer chunked layout represents row
//! groups. The inner chunked layout represents pages.
//!
//! All the chunks of a chunked layout and all the columns of a column layout need not use the same
//! layout.
//!
//! Anything implementing [VortexReadAt][crate::io::VortexReadAt], for example local files, byte
//! buffers, and [cloud storage][crate::io::ObjectStoreReadAt], can be used as the "linear and
//! contiguous memory".
//!
//! # Reading
//!
//! Layout reading is implemented by [VortexFileArrayStream]. The VortexFileArrayStream should be
//! constructed by a [VortexReadBuilder], which first uses an [InitialRead] to read the footer (schema,
//! layout, postscript, version, and magic bytes). In most cases, the entire footer can be read by
//! a single read of the suffix of the file.
//!
//! A VortexFileArrayStream internally contains a [LayoutMessageCache] which is shared by its layout
//! reader and the layout reader's descendants. The cache permits the reading system to "read" the
//! bytes of a layout multiple times without triggering reads to the underlying storage. For
//! example, the VortexFileArrayStream reads an array, evaluates the row filter, and then reads the
//! array again with the filter mask.
//!
//! [`read_layout_from_initial`] produces a [LayoutReader] which assembles one or more Vortex arrays
//! by reading the serialized data and metadata.
//!
//! # Apache Arrow
//!
//! If you ultimately seek Arrow arrays, [VortexRecordBatchReader] converts a [VortexFileArrayStream]
//! into a RecordBatchReader.
mod read;
mod write;
mod pruning;
#[cfg(test)]
mod tests;
/// Size, in bytes, of the footer as written by version 1 of the Vortex file format.
pub const V1_FOOTER_FBS_SIZE: usize = 32;

/// Version number of the Vortex file format currently in use.
pub const VERSION: u16 = 1;
/// Constants that will never change (i.e., doing so would break backwards compatibility).
///
/// Every constant declared in this module is pinned by the `never_change_these_constants`
/// test at the bottom of the module. When adding a constant here, add a matching assertion
/// there so that an accidental edit fails the test suite.
mod forever_constant {
    use super::*;

    /// The extension for Vortex files
    pub const VORTEX_FILE_EXTENSION: &str = "vortex";

    /// The maximum length of a Vortex footer in bytes (`u16::MAX - 8`, i.e. 65527).
    ///
    // NOTE(review): the 8 subtracted here equals `EOF_SIZE`; presumably it reserves room for
    // the EOF marker -- confirm before relying on that relationship.
    pub const MAX_FOOTER_SIZE: u16 = u16::MAX - 8;

    /// The magic bytes for a Vortex file: the ASCII bytes `VTXF`
    pub const MAGIC_BYTES: [u8; 4] = *b"VTXF";

    /// The size of the EOF marker in bytes
    pub const EOF_SIZE: usize = 8;

    /// The layout ID for a flat layout
    pub const FLAT_LAYOUT_ID: LayoutId = LayoutId(1);

    /// The layout ID for a chunked layout
    pub const CHUNKED_LAYOUT_ID: LayoutId = LayoutId(2);

    /// The layout ID for a column layout
    pub const COLUMNAR_LAYOUT_ID: LayoutId = LayoutId(3);

    /// The layout ID for an inline schema layout
    pub const INLINE_SCHEMA_LAYOUT_ID: LayoutId = LayoutId(4);

    #[cfg(test)]
    mod test {
        use super::*;

        /// Guards the on-disk format: each assertion spells out the literal value so that
        /// changing a constant above cannot silently pass.
        #[test]
        fn never_change_these_constants() {
            // Previously unguarded even though it lives in this module.
            assert_eq!(VORTEX_FILE_EXTENSION, "vortex");
            assert_eq!(V1_FOOTER_FBS_SIZE, 32);
            assert_eq!(MAX_FOOTER_SIZE, 65527);
            assert_eq!(MAGIC_BYTES, *b"VTXF");
            assert_eq!(EOF_SIZE, 8);
            assert_eq!(FLAT_LAYOUT_ID, LayoutId(1));
            assert_eq!(CHUNKED_LAYOUT_ID, LayoutId(2));
            assert_eq!(COLUMNAR_LAYOUT_ID, LayoutId(3));
            assert_eq!(INLINE_SCHEMA_LAYOUT_ID, LayoutId(4));
        }
    }
}
pub use forever_constant::*;
pub use read::*;
pub use write::*;