vortex_file/lib.rs
1#![allow(clippy::cast_possible_truncation)]
2#![doc(html_logo_url = "/vortex/docs/_static/vortex_spiral_logo.svg")]
3//! Read and write Vortex layouts, a serialization of Vortex arrays.
4//!
5//! A layout is a serialized array which is stored in some linear and contiguous block of
//! memory. Layouts are recursive, and there are currently four types:
7//!
8//! 1. The [`FlatLayout`](vortex_layout::layouts::flat::FlatLayout). A contiguously serialized array of buffers, with a specific in-memory [`Alignment`](vortex_buffer::Alignment).
9//!
10//! 2. The [`StructLayout`](vortex_layout::layouts::struct_::StructLayout). Each column of a
11//! [`StructArray`][vortex_array::arrays::StructArray] is sequentially laid out at known offsets.
12//! This permits reading a subset of columns in linear time, as well as constant-time random
13//! access to any column.
14//!
15//! 3. The [`ChunkedLayout`](vortex_layout::layouts::chunked::ChunkedLayout). Each chunk of a
16//! [`ChunkedArray`](vortex_array::arrays::ChunkedArray) is sequentially laid out at known
//! offsets. Finding the chunks containing a row range is an `Nlog(N)` operation of searching the
18//! offsets.
19//!
20//! 4. The [`StatsLayout`](vortex_layout::layouts::stats::StatsLayout).
21//!
22//! A layout, alone, is _not_ a standalone Vortex file because layouts are not self-describing. They
23//! neither contain a description of the kind of layout (e.g. flat, column of flat, chunked of
24//! column of flat) nor a data type ([`DType`](vortex_dtype::DType)).
25//!
26//! # Reading
27//!
28//! Vortex files are read using [`VortexOpenOptions`], which can be provided with information about the file's
29//! structure to save on IO before the actual data read. Once the file is open and has done the initial IO work to understand its own structure,
30//! it can be turned into a stream by calling [`VortexFile::scan`].
31//!
//! The file manages IO-oriented work and CPU-oriented work on two different underlying runtimes, which are configurable and pluggable with multiple provided implementations (Tokio, Rayon, etc.).
//! It also caches buffers between stages of the scan, saving on duplicate IO. The cache can also be reused between scans of the same file (see [`SegmentCache`](`crate::segments::SegmentCache`)).
34//!
35//! # File Format
36//!
37//! Succinctly, the file format specification is as follows:
38//!
39//! 1. Data is written first, in a form that is describable by a Layout (typically Array IPC Messages).
40//! 1. To allow for more efficient IO & pruning, our writer implementation first writes the "data" arrays,
41//! and then writes the "metadata" arrays (i.e., per-column statistics)
42//! 2. We write what is collectively referred to as the "Footer", which contains:
43//! 1. An optional Schema, which if present is a valid flatbuffer representing a message::Schema
//! 2. The Layout, which is a valid footer::Layout flatbuffer, and describes the physical byte ranges & relationships amongst
//! those byte ranges that we wrote in part 1.
46//! 3. The Postscript, which is a valid footer::Postscript flatbuffer, containing the absolute start offsets of the Schema & Layout
47//! flatbuffers within the file.
48//! 4. The End-of-File marker, which is 8 bytes, and contains the u16 version, u16 postscript length, and 4 magic bytes.
49//!
50//! ## Illustrated File Format
51//! ```text
52//! ┌────────────────────────────┐
53//! │ │
54//! │ Data │
55//! │ (Array IPC Messages) │
56//! │ │
57//! ├────────────────────────────┤
58//! │ │
59//! │ Per-Column Statistics │
60//! │ │
61//! ├────────────────────────────┤
62//! │ │
63//! │ Schema Flatbuffer │
64//! │ │
65//! ├────────────────────────────┤
66//! │ │
67//! │ Layout Flatbuffer │
68//! │ │
69//! ├────────────────────────────┤
70//! │ │
71//! │ Postscript Flatbuffer │
72//! │ (Schema & Layout Offsets) │
73//! │ │
74//! ├────────────────────────────┤
75//! │ 8-byte End of File │
76//! │(Version, Postscript Length,│
77//! │ Magic Bytes) │
78//! └────────────────────────────┘
79//! ```
80//!
81//! A Parquet-style file format is realized by using a chunked layout containing column layouts
82//! containing chunked layouts containing flat layouts. The outer chunked layout represents row
83//! groups. The inner chunked layout represents pages.
84//!
85//! Layouts are adaptive, and the writer is free to build arbitrarily complex layouts to suit their
86//! goals of locality or parallelism. For example, one may write a column in a Struct Layout with
87//! or without chunking, or completely elide statistics to save space or if they are not needed, for
88//! example if the metadata is being stored in an external index.
89//!
90//! Anything implementing [`VortexReadAt`](vortex_io::VortexReadAt), for example local files, byte
91//! buffers, and [cloud storage](vortex_io::ObjectStoreReadAt), can be used as the backing store.
92
93mod driver;
94mod file;
95mod footer;
96mod generic;
97mod memory;
98mod open;
99pub mod segments;
100mod strategy;
101#[cfg(test)]
102mod tests;
103mod writer;
104
105use std::sync::{Arc, LazyLock};
106
107pub use file::*;
108pub use footer::{Footer, SegmentSpec};
109pub use forever_constant::*;
110pub use generic::*;
111pub use memory::*;
112pub use open::*;
113pub use strategy::*;
114use vortex_alp::{ALPEncoding, ALPRDEncoding};
115use vortex_array::{ArrayRegistry, Encoding};
116use vortex_bytebool::ByteBoolEncoding;
117use vortex_datetime_parts::DateTimePartsEncoding;
118use vortex_dict::DictEncoding;
119use vortex_fastlanes::{BitPackedEncoding, DeltaEncoding, FoREncoding};
120use vortex_fsst::FSSTEncoding;
121pub use vortex_layout::scan;
122use vortex_runend::RunEndEncoding;
123use vortex_sparse::SparseEncoding;
124use vortex_zigzag::ZigZagEncoding;
125pub use writer::*;
126
/// The current version of the Vortex file format.
pub const VERSION: u16 = 1;
/// The size of the footer in bytes in Vortex version 1.
pub const V1_FOOTER_FBS_SIZE: usize = 32;

/// Constants that will never change (i.e., doing so would break backwards compatibility).
///
/// Every constant declared here is pinned by the `never_change_these_constants` test below, so
/// an accidental edit fails the test suite instead of silently changing the file format.
mod forever_constant {
    /// The extension for Vortex files (without a leading dot).
    pub const VORTEX_FILE_EXTENSION: &str = "vortex";

    /// The maximum length of a Vortex footer in bytes (`u16::MAX - 8`, i.e. 65527).
    pub const MAX_FOOTER_SIZE: u16 = u16::MAX - 8;
    /// The magic bytes for a Vortex file: ASCII `VTXF`.
    pub const MAGIC_BYTES: [u8; 4] = *b"VTXF";
    /// The size of the EOF marker in bytes.
    ///
    /// As described in the crate-level docs, the marker holds a `u16` version, a `u16`
    /// postscript length, and the 4 magic bytes.
    pub const EOF_SIZE: usize = 8;

    #[cfg(test)]
    mod test {
        use super::*;
        use crate::*;

        // Guard test: each literal below is intentionally duplicated from the definition so
        // that changing a constant requires consciously changing this test as well.
        #[test]
        fn never_change_these_constants() {
            assert_eq!(VORTEX_FILE_EXTENSION, "vortex");
            assert_eq!(V1_FOOTER_FBS_SIZE, 32);
            assert_eq!(MAX_FOOTER_SIZE, 65527);
            assert_eq!(MAGIC_BYTES, *b"VTXF");
            assert_eq!(EOF_SIZE, 8);
        }
    }
}
158
159/// A default registry containing the built-in Vortex encodings and layouts.
160pub static DEFAULT_REGISTRY: LazyLock<Arc<ArrayRegistry>> = LazyLock::new(|| {
161 // Register the compressed encodings that Vortex ships with.
162 let mut registry = ArrayRegistry::canonical_only();
163 registry.register_many([
164 ALPEncoding.vtable(),
165 ALPRDEncoding.vtable(),
166 BitPackedEncoding.vtable(),
167 ByteBoolEncoding.vtable(),
168 DateTimePartsEncoding.vtable(),
169 DeltaEncoding.vtable(),
170 DictEncoding.vtable(),
171 FoREncoding.vtable(),
172 FSSTEncoding.vtable(),
173 RunEndEncoding.vtable(),
174 SparseEncoding.vtable(),
175 ZigZagEncoding.vtable(),
176 ]);
177 Arc::new(registry)
178});