vortex_array/arrays/patched/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! An array that partially "patches" another array with new values.
5//!
6//! # Background
7//!
8//! This is meant to be the foundation of a fully data-parallel patching strategy, based on the
9//! work published in ["G-ALP" from Hepkema et al.](https://ir.cwi.nl/pub/35205/35205.pdf)
10//!
11//! Patching is common when an encoding almost completely covers an array save a few exceptions.
12//! In that case, rather than avoid the encoding entirely, it's preferable to
13//!
14//! * Replace unencodable values with fillers (zeros, frequent values, nulls, etc.)
15//! * Wrap the array with a `PatchedArray` signaling that when the original array is executed,
16//! some of the decoded values must be overwritten.
17//!
18//! In Vortex, the FastLanes bit-packing encoding is often the terminal node in an encoding tree,
19//! and FastLanes has an intrinsic chunking of 1024 elements. Thus, 1024 elements is pervasively
20//! a useful unit of chunking throughout Vortex, and so we use 1024 as a chunk size here
21//! as well.
22//!
23//! # Details
24//!
25//! To patch an array, we first divide it into a set of chunks of length 1024, and then within
26//! each chunk, we assign each position to a lane. The number of lanes depends on the width of
27//! the underlying type.
28//!
29//! Thus, rather than sorting patch indices and values by their global offset, they are sorted
30//! primarily by their chunk, and then subsequently by their lanes.
31//!
32//! The Patched array layout has 4 children
33//!
34//! * `inner`: the inner array is the one containing encoded values, including the filler values
35//! that need to be patched over at execution time
36//! * `lane_offsets`: this is an indexing buffer that allows you to see into ranges of the other
37//! two children
38//! * `indices`: An array of `u16` chunk indices, indicating where within the chunk should the value
39//! be overwritten by the patch value
40//! * `values`: The child array containing the patch values, which should be inserted over
41//! the values of the `inner` at the locations provided by `indices`
42//!
43//! `indices` and `values` are aligned and accessed together.
44//!
45//! ```text
46//!
47//! chunk 0 chunk 0 chunk 0 chunk 0 chunk 0 chunk 0
48//! lane 0 lane 1 lane 2 lane 3 lane 4 lane 5
49//! ┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
50//! lane_offsets │ 0 │ 0 │ 2 │ 2 │ 3 │ 5 │ ...
51//! └─────┬──────┴─────┬──────┴─────┬──────┴──────┬─────┴──────┬─────┴──────┬─────┘
52//! │ │ │ │ │ │
53//! │ │ │ │ │ │
54//! ┌─────┴────────────┘ └──────┬──────┘ ┌──────┘ └─────┐
55//! │ │ │ │
56//! │ │ │ │
57//! │ │ │ │
58//! ▼────────────┬────────────┬────────────▼────────────▼────────────┬────────────▼
59//! indices │ │ │ │ │ │ │
60//! │ │ │ │ │ │ │
61//! ├────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
62//! values │ │ │ │ │ │ │
63//! │ │ │ │ │ │ │
64//! └────────────┴────────────┴────────────┴────────────┴────────────┴────────────┘
65//! ```
66//!
67//! It turns out that this layout is optimal for executing patching on GPUs, because the
68//! `lane_offsets` allows each thread in a warp to seek to its patches in constant time.
69
70mod array;
71mod compute;
72mod vtable;
73
74use std::env;
75use std::sync::LazyLock;
76
77pub use array::*;
78use vortex_buffer::ByteBuffer;
79pub use vtable::*;
80
81/// Patches that have been transposed into GPU format.
82struct TransposedPatches {
83 n_lanes: usize,
84 lane_offsets: ByteBuffer,
85 indices: ByteBuffer,
86 values: ByteBuffer,
87}
88
89/// Number of lanes used at patch time for a value of type `V`.
90///
91/// This is *NOT* equal to the number of FastLanes lanes for the type `V`, rather this is going to
92/// correspond to how many "lanes" we will end up copying data on.
93///
94/// When applied on the CPU, this configuration doesn't really matter. On the GPU, it is based
95/// on the number of patches involved here.
96const fn patch_lanes<V: Sized>() -> usize {
97 // For types 32-bits or smaller, we use a 32 lane configuration, and for 64-bit we use 16 lanes.
98 // This matches up with the number of lanes we use to execute copying results from bit-unpacking
99 // from shared to global memory.
100 if size_of::<V>() < 8 { 32 } else { 16 }
101}
102
103/// Flag indicating if experimental patched array support is enabled.
104///
105/// This is set using the environment variable `VORTEX_EXPERIMENTAL_PATCHED_ARRAY`.
106///
107/// When this is true, any arrays with interior `Patches` will be read as a `Patched`
108/// array, and eliminate the interior patches.
109///
110/// The builtin compressor will also generate Patched arrays.
111pub fn use_experimental_patches() -> bool {
112 static USE_EXPERIMENTAL_PATCHES: LazyLock<bool> =
113 LazyLock::new(|| env::var("VORTEX_EXPERIMENTAL_PATCHED_ARRAY").is_ok_and(|v| v == "1"));
114 *USE_EXPERIMENTAL_PATCHES
115}