Skip to main content

vortex_file/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4#![allow(clippy::cast_possible_truncation)]
5#![doc(html_logo_url = "/vortex/docs/_static/vortex_spiral_logo.svg")]
6//! Read and write Vortex layouts, a serialization of Vortex arrays.
7//!
8//! A layout is a serialized array which is stored in some linear and contiguous block of
9//! memory. Layouts are recursive, and there are currently four types:
10//!
11//! 1. The [`FlatLayout`](vortex_layout::layouts::flat::FlatLayout). A contiguously serialized array of buffers, with a specific in-memory [`Alignment`](vortex_buffer::Alignment).
12//!
13//! 2. The [`StructLayout`](vortex_layout::layouts::struct_::StructLayout). Each column of a
14//!    [`StructArray`][vortex_array::arrays::StructArray] is sequentially laid out at known offsets.
15//!    This permits reading a subset of columns in linear time, as well as constant-time random
16//!    access to any column.
17//!
18//! 3. The [`ChunkedLayout`](vortex_layout::layouts::chunked::ChunkedLayout). Each chunk of a
19//!    [`ChunkedArray`](vortex_array::arrays::ChunkedArray) is sequentially laid out at known
20//!    offsets. Finding the chunks containing a row range is an `Nlog(N)` operation of searching the
21//!    offsets.
22//!
23//! 4. The [`ZonedLayout`](vortex_layout::layouts::zoned::ZonedLayout).
24//!
25//! A layout, alone, is _not_ a standalone Vortex file because layouts are not self-describing. They
26//! neither contain a description of the kind of layout (e.g. flat, column of flat, chunked of
27//! column of flat) nor a data type ([`DType`](vortex_array::dtype::DType)).
28//!
29//! # Reading
30//!
31//! Vortex files are read using [`VortexOpenOptions`], which can be provided with information about the file's
32//! structure to save on IO before the actual data read. Once the file is open and has done the initial IO work to understand its own structure,
33//! it can be turned into a stream by calling [`VortexFile::scan`].
34//!
35//! The file manages IO-oriented work and CPU-oriented work on two different underlying runtimes, which are configurable and pluggable with multiple provided implementations (Tokio, Rayon etc.).
36//!
37//! # File Format
38//!
39//! Succinctly, the file format specification is as follows:
40//!
41//! 1. Data is written first, in a form that is describable by a Layout (typically Array IPC Messages).
42//!    1. To allow for more efficient IO & pruning, our writer implementation first writes the "data" arrays,
43//!       and then writes the "metadata" arrays (i.e., per-column statistics)
44//! 2. We write what is collectively referred to as the "Footer", which contains:
45//!    1. An optional Schema, which if present is a valid flatbuffer representing a message::Schema
46//!    2. The Layout, which is a valid footer::Layout flatbuffer, and describes the physical byte ranges & relationships amongst
47//!       those byte ranges that we wrote in part 1.
48//!    3. The Postscript, which is a valid footer::Postscript flatbuffer, containing the absolute start offsets of the Schema & Layout
49//!       flatbuffers within the file.
50//!    4. The End-of-File marker, which is 8 bytes, and contains the u16 version, u16 postscript length, and 4 magic bytes.
51//!
52//! ## Illustrated File Format
53//! ```text
54//! ┌────────────────────────────┐
55//! │                            │
56//! │            Data            │
57//! │    (Array IPC Messages)    │
58//! │                            │
59//! ├────────────────────────────┤
60//! │                            │
61//! │   Per-Column Statistics    │
62//! │                            │
63//! ├────────────────────────────┤
64//! │                            │
65//! │     Schema Flatbuffer      │
66//! │                            │
67//! ├────────────────────────────┤
68//! │                            │
69//! │     Layout Flatbuffer      │
70//! │                            │
71//! ├────────────────────────────┤
72//! │                            │
73//! │    Postscript Flatbuffer   │
74//! │  (Schema & Layout Offsets) │
75//! │                            │
76//! ├────────────────────────────┤
77//! │     8-byte End of File     │
78//! │(Version, Postscript Length,│
79//! │       Magic Bytes)         │
80//! └────────────────────────────┘
81//! ```
82//!
83//! A Parquet-style file format is realized by using a chunked layout containing column layouts
84//! containing chunked layouts containing flat layouts. The outer chunked layout represents row
85//! groups. The inner chunked layout represents pages.
86//!
87//! Layouts are adaptive, and the writer is free to build arbitrarily complex layouts to suit their
88//! goals of locality or parallelism. For example, one may write a column in a Struct Layout with
89//! or without chunking, or completely elide statistics to save space or if they are not needed, for
90//! example if the metadata is being stored in an external index.
91//!
92
93mod counting;
94mod file;
95mod footer;
96pub mod multi;
97mod open;
98mod pruning;
99mod read;
100pub mod segments;
101mod strategy;
102#[cfg(test)]
103mod tests;
104pub mod v2;
105mod writer;
106
107pub use file::*;
108pub use footer::*;
109pub use forever_constant::*;
110pub use open::*;
111pub use strategy::*;
112use vortex_alp::ALPRDVTable;
113use vortex_alp::ALPVTable;
114use vortex_array::arrays::DictVTable;
115use vortex_array::session::ArraySessionExt;
116use vortex_bytebool::ByteBoolVTable;
117use vortex_datetime_parts::DateTimePartsVTable;
118use vortex_decimal_byte_parts::DecimalBytePartsVTable;
119use vortex_fastlanes::BitPackedVTable;
120use vortex_fastlanes::DeltaVTable;
121use vortex_fastlanes::FoRVTable;
122use vortex_fastlanes::RLEVTable;
123use vortex_fsst::FSSTVTable;
124use vortex_pco::PcoVTable;
125use vortex_sequence::SequenceVTable;
126use vortex_session::VortexSession;
127use vortex_sparse::SparseVTable;
128use vortex_zigzag::ZigZagVTable;
129pub use writer::*;
130
/// The current version of the Vortex file format.
///
/// The crate-level docs describe the version as a `u16` stored in the 8-byte end-of-file marker.
pub const VERSION: u16 = 1;
/// The size of the footer in bytes in Vortex version 1.
///
/// This value is pinned by the `never_change_these_constants` test in `forever_constant`.
// NOTE(review): the `FBS` in the name presumably refers to a flatbuffer — confirm.
pub const V1_FOOTER_FBS_SIZE: usize = 32;
135
/// Values that are frozen forever: altering any of them would break backwards compatibility.
mod forever_constant {
    /// Magic bytes identifying a Vortex file.
    pub const MAGIC_BYTES: [u8; 4] = *b"VTXF";

    /// Size, in bytes, of the end-of-file marker.
    pub const EOF_SIZE: usize = 8;

    /// Upper bound, in bytes, on the length of a Vortex postscript.
    // NOTE(review): the `- 8` presumably reserves room for the 8-byte EOF marker — confirm.
    pub const MAX_POSTSCRIPT_SIZE: u16 = u16::MAX - 8;

    /// File extension used for Vortex files.
    pub const VORTEX_FILE_EXTENSION: &str = "vortex";

    #[cfg(test)]
    mod test {
        use super::*;
        use crate::*;

        /// Guards against accidental edits to the frozen values.
        #[test]
        fn never_change_these_constants() {
            assert_eq!(EOF_SIZE, 8);
            assert_eq!(MAGIC_BYTES, *b"VTXF");
            assert_eq!(MAX_POSTSCRIPT_SIZE, 65527);
            assert_eq!(V1_FOOTER_FBS_SIZE, 32);
        }
    }
}
162
163/// Register the default encodings use in Vortex files with the provided session.
164///
165/// NOTE: this function will be changed in the future to encapsulate logic for using different
166/// Vortex "Editions" that may support different sets of encodings.
167pub fn register_default_encodings(session: &mut VortexSession) {
168    {
169        let arrays = session.arrays();
170        arrays.register(ALPVTable::ID, ALPVTable);
171        arrays.register(ALPRDVTable::ID, ALPRDVTable);
172        arrays.register(BitPackedVTable::ID, BitPackedVTable);
173        arrays.register(ByteBoolVTable::ID, ByteBoolVTable);
174        arrays.register(DateTimePartsVTable::ID, DateTimePartsVTable);
175        arrays.register(DecimalBytePartsVTable::ID, DecimalBytePartsVTable);
176        arrays.register(DeltaVTable::ID, DeltaVTable);
177        arrays.register(DictVTable::ID, DictVTable);
178        arrays.register(FSSTVTable::ID, FSSTVTable);
179        arrays.register(FoRVTable::ID, FoRVTable);
180        arrays.register(PcoVTable::ID, PcoVTable);
181        arrays.register(RLEVTable::ID, RLEVTable);
182        arrays.register(SequenceVTable::ID, SequenceVTable);
183        arrays.register(SparseVTable::ID, SparseVTable);
184        arrays.register(ZigZagVTable::ID, ZigZagVTable);
185        #[cfg(feature = "zstd")]
186        arrays.register(vortex_zstd::ZstdVTable::ID, vortex_zstd::ZstdVTable);
187        #[cfg(all(feature = "zstd", feature = "unstable_encodings"))]
188        arrays.register(
189            vortex_zstd::ZstdBuffersVTable::ID,
190            vortex_zstd::ZstdBuffersVTable,
191        );
192    }
193
194    // Eventually all encodings crates should expose an initialize function. For now it's only
195    // a few of them.
196    vortex_runend::initialize(session)
197}