vortex_file/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4#![allow(clippy::cast_possible_truncation)]
5#![doc(html_logo_url = "/vortex/docs/_static/vortex_spiral_logo.svg")]
6//! Read and write Vortex layouts, a serialization of Vortex arrays.
7//!
//! A layout is a serialized array which is stored in some linear and contiguous block of
//! memory. Layouts are recursive, and there are currently four types:
10//!
11//! 1. The [`FlatLayout`](vortex_layout::layouts::flat::FlatLayout). A contiguously serialized array of buffers, with a specific in-memory [`Alignment`](vortex_buffer::Alignment).
12//!
13//! 2. The [`StructLayout`](vortex_layout::layouts::struct_::StructLayout). Each column of a
14//!    [`StructArray`][vortex_array::arrays::StructArray] is sequentially laid out at known offsets.
15//!    This permits reading a subset of columns in linear time, as well as constant-time random
16//!    access to any column.
17//!
//! 3. The [`ChunkedLayout`](vortex_layout::layouts::chunked::ChunkedLayout). Each chunk of a
//!    [`ChunkedArray`](vortex_array::arrays::ChunkedArray) is sequentially laid out at known
//!    offsets. Finding the chunks containing a row range is an `O(log N)` search of the
//!    offsets.
22//!
23//! 4. The [`ZonedLayout`](vortex_layout::layouts::zoned::ZonedLayout).
24//!
25//! A layout, alone, is _not_ a standalone Vortex file because layouts are not self-describing. They
26//! neither contain a description of the kind of layout (e.g. flat, column of flat, chunked of
27//! column of flat) nor a data type ([`DType`](vortex_dtype::DType)).
28//!
29//! # Reading
30//!
31//! Vortex files are read using [`VortexOpenOptions`], which can be provided with information about the file's
32//! structure to save on IO before the actual data read. Once the file is open and has done the initial IO work to understand its own structure,
33//! it can be turned into a stream by calling [`VortexFile::scan`].
34//!
35//! The file manages IO-oriented work and CPU-oriented work on two different underlying runtimes, which are configurable and pluggable with multiple provided implementations (Tokio, Rayon etc.).
36//! It also caches buffers between stages of the scan, saving on duplicate IO. The cache can also be reused between scans of the same file (See [`SegmentCache`](`crate::segments::SegmentCache`)).
37//!
38//! # File Format
39//!
40//! Succinctly, the file format specification is as follows:
41//!
42//! 1. Data is written first, in a form that is describable by a Layout (typically Array IPC Messages).
43//!    1. To allow for more efficient IO & pruning, our writer implementation first writes the "data" arrays,
44//!       and then writes the "metadata" arrays (i.e., per-column statistics)
45//! 2. We write what is collectively referred to as the "Footer", which contains:
46//!    1. An optional Schema, which if present is a valid flatbuffer representing a message::Schema
//!    2. The Layout, which is a valid footer::Layout flatbuffer, and describes the physical byte ranges & relationships amongst
//!       those byte ranges that we wrote in part 1.
49//!    3. The Postscript, which is a valid footer::Postscript flatbuffer, containing the absolute start offsets of the Schema & Layout
50//!       flatbuffers within the file.
51//!    4. The End-of-File marker, which is 8 bytes, and contains the u16 version, u16 postscript length, and 4 magic bytes.
52//!
53//! ## Illustrated File Format
54//! ```text
55//! ┌────────────────────────────┐
56//! │                            │
57//! │            Data            │
58//! │    (Array IPC Messages)    │
59//! │                            │
60//! ├────────────────────────────┤
61//! │                            │
62//! │   Per-Column Statistics    │
63//! │                            │
64//! ├────────────────────────────┤
65//! │                            │
66//! │     Schema Flatbuffer      │
67//! │                            │
68//! ├────────────────────────────┤
69//! │                            │
70//! │     Layout Flatbuffer      │
71//! │                            │
72//! ├────────────────────────────┤
73//! │                            │
74//! │    Postscript Flatbuffer   │
75//! │  (Schema & Layout Offsets) │
76//! │                            │
77//! ├────────────────────────────┤
78//! │     8-byte End of File     │
79//! │(Version, Postscript Length,│
80//! │       Magic Bytes)         │
81//! └────────────────────────────┘
82//! ```
83//!
84//! A Parquet-style file format is realized by using a chunked layout containing column layouts
85//! containing chunked layouts containing flat layouts. The outer chunked layout represents row
86//! groups. The inner chunked layout represents pages.
87//!
//! Layouts are adaptive, and the writer is free to build arbitrarily complex layouts to suit their
//! goals of locality or parallelism. For example, one may write a column in a Struct Layout with
//! or without chunking, or completely elide statistics, either to save space or because they are
//! not needed (e.g. when the metadata is being stored in an external index).
92//!
93//! Anything implementing [`VortexReadAt`](vortex_io::VortexReadAt), for example local files, byte
94//! buffers, and [cloud storage](vortex_io::ObjectStoreReadAt), can be used as the backing store.
95
96mod driver;
97mod file;
98mod footer;
99mod generic;
100mod memory;
101mod open;
102mod pruning;
103pub mod segments;
104mod strategy;
105#[cfg(test)]
106mod tests;
107mod writer;
108
109use std::sync::{Arc, LazyLock};
110
111pub use file::*;
112pub use footer::{Footer, SegmentSpec};
113pub use forever_constant::*;
114pub use generic::*;
115pub use memory::*;
116pub use open::*;
117pub use strategy::*;
118use vortex_alp::{ALPEncoding, ALPRDEncoding};
119use vortex_array::{ArrayRegistry, EncodingRef};
120use vortex_bytebool::ByteBoolEncoding;
121use vortex_datetime_parts::DateTimePartsEncoding;
122use vortex_decimal_byte_parts::DecimalBytePartsEncoding;
123use vortex_dict::DictEncoding;
124use vortex_fastlanes::{BitPackedEncoding, DeltaEncoding, FoREncoding};
125use vortex_fsst::FSSTEncoding;
126use vortex_pco::PcoEncoding;
127use vortex_runend::RunEndEncoding;
128use vortex_sequence::SequenceEncoding;
129use vortex_sparse::SparseEncoding;
130use vortex_zigzag::ZigZagEncoding;
131pub use writer::*;
132
/// The current version of the Vortex file format.
///
/// This is the `u16` version written into the 8-byte End-of-File marker described in the
/// crate-level documentation.
pub const VERSION: u16 = 1;
/// The size of the footer in bytes in Vortex version 1.
///
/// This value is pinned by the `never_change_these_constants` test in `forever_constant`.
pub const V1_FOOTER_FBS_SIZE: usize = 32;
137
/// Constants that will never change (i.e., doing so would break backwards compatibility).
///
/// The values are pinned by the `never_change_these_constants` test at the bottom of this module,
/// so an accidental edit fails the test suite.
mod forever_constant {
    /// The extension for Vortex files
    pub const VORTEX_FILE_EXTENSION: &str = "vortex";

    /// The maximum length of a Vortex footer in bytes (evaluates to 65527).
    // NOTE(review): the `8` presumably leaves room for the `EOF_SIZE`-byte marker — confirm.
    pub const MAX_FOOTER_SIZE: u16 = u16::MAX - 8;
    /// The magic bytes for a Vortex file
    pub const MAGIC_BYTES: [u8; 4] = *b"VTXF";
    /// The size of the EOF marker in bytes.
    ///
    /// Per the crate-level documentation the marker holds the `u16` version, the `u16`
    /// postscript length, and the 4 magic bytes.
    pub const EOF_SIZE: usize = 8;

    #[cfg(test)]
    mod test {
        use super::*;
        // `V1_FOOTER_FBS_SIZE` lives at the crate root rather than in this module.
        use crate::*;

        /// Guards the on-disk constants: every assertion uses a literal so that changing a
        /// constant cannot silently change the expectation along with it.
        #[test]
        fn never_change_these_constants() {
            assert_eq!(V1_FOOTER_FBS_SIZE, 32);
            assert_eq!(VORTEX_FILE_EXTENSION, "vortex");
            assert_eq!(MAX_FOOTER_SIZE, 65527);
            assert_eq!(MAGIC_BYTES, *b"VTXF");
            assert_eq!(EOF_SIZE, 8);
        }
    }
}
164
165/// A default registry containing the built-in Vortex encodings and layouts.
166pub static DEFAULT_REGISTRY: LazyLock<Arc<ArrayRegistry>> = LazyLock::new(|| {
167    // Register the compressed encodings that Vortex ships with.
168    let mut registry = ArrayRegistry::canonical_only();
169    registry.register_many([
170        EncodingRef::new_ref(ALPEncoding.as_ref()),
171        EncodingRef::new_ref(ALPRDEncoding.as_ref()),
172        EncodingRef::new_ref(BitPackedEncoding.as_ref()),
173        EncodingRef::new_ref(ByteBoolEncoding.as_ref()),
174        EncodingRef::new_ref(DateTimePartsEncoding.as_ref()),
175        EncodingRef::new_ref(DecimalBytePartsEncoding.as_ref()),
176        EncodingRef::new_ref(DeltaEncoding.as_ref()),
177        EncodingRef::new_ref(DictEncoding.as_ref()),
178        EncodingRef::new_ref(FSSTEncoding.as_ref()),
179        EncodingRef::new_ref(FoREncoding.as_ref()),
180        EncodingRef::new_ref(PcoEncoding.as_ref()),
181        EncodingRef::new_ref(RunEndEncoding.as_ref()),
182        EncodingRef::new_ref(SequenceEncoding.as_ref()),
183        EncodingRef::new_ref(SparseEncoding.as_ref()),
184        EncodingRef::new_ref(ZigZagEncoding.as_ref()),
185        #[cfg(feature = "zstd")]
186        EncodingRef::new_ref(vortex_zstd::ZstdEncoding.as_ref()),
187    ]);
188    Arc::new(registry)
189});