vortex_compressor/stats/cache.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Per-compression-site statistics cache and the [`ArrayAndStats`] bundle.
5
6use std::any::Any;
7use std::any::TypeId;
8
9use vortex_array::ArrayRef;
10use vortex_array::ToCanonical;
11use vortex_array::arrays::Primitive;
12use vortex_array::arrays::PrimitiveArray;
13use vortex_array::arrays::VarBinView;
14use vortex_array::arrays::VarBinViewArray;
15use vortex_error::VortexExpect;
16
17use super::BoolStats;
18use super::FloatStats;
19use super::GenerateStatsOptions;
20use super::IntegerStats;
21use super::StringStats;
22
/// Cache for compression statistics, keyed by concrete type.
///
/// Holds at most one value per statistics type: `get_or_insert_with` looks an entry up by the
/// [`TypeId`] of the requested type and only appends a new entry when none is found.
struct StatsCache {
    // TODO(connor): We could further optimize this with a `SmallVec` here.
    /// The cached values, each paired with the [`TypeId`] of its concrete type.
    ///
    /// The number of distinct statistics types stored here is expected to be small, so lookups
    /// do a linear scan over a vector instead of hashing into a map.
    entries: Vec<(TypeId, Box<dyn Any>)>,
}
32
33impl StatsCache {
34 /// Creates a new empty cache.
35 fn new() -> Self {
36 Self {
37 entries: Vec::new(),
38 }
39 }
40
41 /// Returns a cached value, computing it on first access.
42 fn get_or_insert_with<T: 'static>(&mut self, f: impl FnOnce() -> T) -> &T {
43 let type_id = TypeId::of::<T>();
44 let pos = self.entries.iter().position(|(id, _)| *id == type_id);
45
46 if let Some(pos) = pos {
47 self.entries[pos]
48 .1
49 .downcast_ref::<T>()
50 .vortex_expect("we just checked the TypeID")
51 } else {
52 self.entries.push((type_id, Box::new(f())));
53 self.entries
54 .last()
55 .vortex_expect("just pushed")
56 .1
57 .downcast_ref::<T>()
58 .vortex_expect("we just checked the TypeID")
59 }
60 }
61}
62
/// An array bundled with its lazily-computed statistics cache.
///
/// The cache is guaranteed to correspond to the array. When a scheme creates a derived array (e.g.
/// FoR bias subtraction), it must create a new [`ArrayAndStats`] so that stale stats from the
/// original array are not reused.
///
/// Built-in stats are accessed via typed methods (`bool_stats`, `integer_stats`, `float_stats`,
/// `string_stats`) which generate stats lazily on first access. The integer, float, and string
/// accessors pass the stored [`GenerateStatsOptions`] to the generator; `bool_stats` takes no
/// options.
///
/// Extension schemes can use `get_or_insert_with` for custom stats types.
pub struct ArrayAndStats {
    /// The array. This is always in canonical form ([`ArrayAndStats::new`] asserts it).
    array: ArrayRef,
    /// The stats cache, holding the statistics computed so far for `array`.
    cache: StatsCache,
    /// The stats generation options handed to the generators that accept them.
    opts: GenerateStatsOptions,
}
81
82impl ArrayAndStats {
83 /// Creates a new bundle with the given stats generation options.
84 ///
85 /// Stats are generated lazily on first access via the typed accessor methods.
86 ///
87 /// # Panics
88 ///
89 /// Panics if the array is not canonical.
90 pub fn new(array: ArrayRef, opts: GenerateStatsOptions) -> Self {
91 assert!(
92 array.is_canonical(),
93 "ArrayAndStats should only be created with canonical arrays"
94 );
95
96 Self {
97 array,
98 cache: StatsCache::new(),
99 opts,
100 }
101 }
102
103 /// Returns a reference to the array.
104 pub fn array(&self) -> &ArrayRef {
105 &self.array
106 }
107
108 // TODO(connor): This should return an `ArrayView<Primitive>` once more vtable changes land.
109 /// Returns the array as a [`PrimitiveArray`].
110 ///
111 /// # Panics
112 ///
113 /// Panics if the array is not a primitive array.
114 pub fn array_as_primitive(&self) -> PrimitiveArray {
115 self.array
116 .as_opt::<Primitive>()
117 .vortex_expect("the array is guaranteed to already be canonical by construction")
118 .into_owned()
119 }
120
121 // TODO(connor): This should return an `ArrayView<VarBinView>` once more vtable changes land.
122 /// Returns the array as a [`VarBinViewArray`].
123 ///
124 /// # Panics
125 ///
126 /// Panics if the array is not a UTF-8 string array.
127 pub fn array_as_utf8(&self) -> VarBinViewArray {
128 self.array
129 .as_opt::<VarBinView>()
130 .vortex_expect("the array is guaranteed to already be canonical by construction")
131 .into_owned()
132 }
133
134 /// Consumes the bundle and returns the array.
135 pub fn into_array(self) -> ArrayRef {
136 self.array
137 }
138
139 /// Returns the length of the array.
140 pub fn array_len(&self) -> usize {
141 self.array.len()
142 }
143
144 /// Returns bool stats, generating them lazily on first access.
145 pub fn bool_stats(&mut self) -> &BoolStats {
146 let array = self.array.clone();
147
148 self.cache.get_or_insert_with::<BoolStats>(|| {
149 BoolStats::generate(&array.to_bool()).vortex_expect("BoolStats shouldn't fail")
150 })
151 }
152
153 // TODO(connor): These should all have interior mutability instead!!!
154
155 /// Returns integer stats, generating them lazily on first access.
156 pub fn integer_stats(&mut self) -> &IntegerStats {
157 let array = self.array.clone();
158 let opts = self.opts;
159
160 self.cache.get_or_insert_with::<IntegerStats>(|| {
161 IntegerStats::generate_opts(&array.to_primitive(), opts)
162 })
163 }
164
165 /// Returns float stats, generating them lazily on first access.
166 pub fn float_stats(&mut self) -> &FloatStats {
167 let array = self.array.clone();
168 let opts = self.opts;
169
170 self.cache.get_or_insert_with::<FloatStats>(|| {
171 FloatStats::generate_opts(&array.to_primitive(), opts)
172 })
173 }
174
175 /// Returns string stats, generating them lazily on first access.
176 pub fn string_stats(&mut self) -> &StringStats {
177 let array = self.array.clone();
178 let opts = self.opts;
179
180 self.cache.get_or_insert_with::<StringStats>(|| {
181 StringStats::generate_opts(&array.to_varbinview(), opts)
182 })
183 }
184
185 /// For extension schemes with custom stats types.
186 pub fn get_or_insert_with<T: 'static>(&mut self, f: impl FnOnce() -> T) -> &T {
187 self.cache.get_or_insert_with::<T>(f)
188 }
189}