Skip to main content

vortex_compressor/stats/
cache.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Per-compression-site statistics cache and the [`ArrayAndStats`] bundle.
5
6use std::any::Any;
7use std::any::TypeId;
8
9use vortex_array::ArrayRef;
10use vortex_array::ToCanonical;
11use vortex_array::arrays::Primitive;
12use vortex_array::arrays::PrimitiveArray;
13use vortex_array::arrays::VarBinView;
14use vortex_array::arrays::VarBinViewArray;
15use vortex_error::VortexExpect;
16
17use super::BoolStats;
18use super::FloatStats;
19use super::GenerateStatsOptions;
20use super::IntegerStats;
21use super::StringStats;
22
23/// Cache for compression statistics, keyed by concrete type.
24struct StatsCache {
25    // TODO(connor): We could further optimize this with a `SmallVec` here.
26    /// The cache entries, keyed by [`TypeId`].
27    ///
28    /// The total number of statistics types in this stats should be relatively small, so we use a
29    /// vector instead of a hash map.
30    entries: Vec<(TypeId, Box<dyn Any>)>,
31}
32
33impl StatsCache {
34    /// Creates a new empty cache.
35    fn new() -> Self {
36        Self {
37            entries: Vec::new(),
38        }
39    }
40
41    /// Returns a cached value, computing it on first access.
42    fn get_or_insert_with<T: 'static>(&mut self, f: impl FnOnce() -> T) -> &T {
43        let type_id = TypeId::of::<T>();
44        let pos = self.entries.iter().position(|(id, _)| *id == type_id);
45
46        if let Some(pos) = pos {
47            self.entries[pos]
48                .1
49                .downcast_ref::<T>()
50                .vortex_expect("we just checked the TypeID")
51        } else {
52            self.entries.push((type_id, Box::new(f())));
53            self.entries
54                .last()
55                .vortex_expect("just pushed")
56                .1
57                .downcast_ref::<T>()
58                .vortex_expect("we just checked the TypeID")
59        }
60    }
61}
62
63/// An array bundled with its lazily-computed statistics cache.
64///
65/// The cache is guaranteed to correspond to the array. When a scheme creates a derived array (e.g.
66/// FoR bias subtraction), it must create a new [`ArrayAndStats`] so that stale stats from the
67/// original array are not reused.
68///
69/// Built-in stats are accessed via typed methods (`integer_stats`, `float_stats`, `string_stats`)
70/// which generate stats lazily on first access using the stored [`GenerateStatsOptions`].
71///
72/// Extension schemes can use `get_or_insert_with` for custom stats types.
73pub struct ArrayAndStats {
74    /// The array. This is always in canonical form.
75    array: ArrayRef,
76    /// The stats cache.
77    cache: StatsCache,
78    /// The stats generation options.
79    opts: GenerateStatsOptions,
80}
81
82impl ArrayAndStats {
83    /// Creates a new bundle with the given stats generation options.
84    ///
85    /// Stats are generated lazily on first access via the typed accessor methods.
86    ///
87    /// # Panics
88    ///
89    /// Panics if the array is not canonical.
90    pub fn new(array: ArrayRef, opts: GenerateStatsOptions) -> Self {
91        assert!(
92            array.is_canonical(),
93            "ArrayAndStats should only be created with canonical arrays"
94        );
95
96        Self {
97            array,
98            cache: StatsCache::new(),
99            opts,
100        }
101    }
102
103    /// Returns a reference to the array.
104    pub fn array(&self) -> &ArrayRef {
105        &self.array
106    }
107
108    // TODO(connor): This should return an `ArrayView<Primitive>` once more vtable changes land.
109    /// Returns the array as a [`PrimitiveArray`].
110    ///
111    /// # Panics
112    ///
113    /// Panics if the array is not a primitive array.
114    pub fn array_as_primitive(&self) -> PrimitiveArray {
115        self.array
116            .as_opt::<Primitive>()
117            .vortex_expect("the array is guaranteed to already be canonical by construction")
118            .into_owned()
119    }
120
121    // TODO(connor): This should return an `ArrayView<VarBinView>` once more vtable changes land.
122    /// Returns the array as a [`VarBinViewArray`].
123    ///
124    /// # Panics
125    ///
126    /// Panics if the array is not a UTF-8 string array.
127    pub fn array_as_utf8(&self) -> VarBinViewArray {
128        self.array
129            .as_opt::<VarBinView>()
130            .vortex_expect("the array is guaranteed to already be canonical by construction")
131            .into_owned()
132    }
133
134    /// Consumes the bundle and returns the array.
135    pub fn into_array(self) -> ArrayRef {
136        self.array
137    }
138
139    /// Returns the length of the array.
140    pub fn array_len(&self) -> usize {
141        self.array.len()
142    }
143
144    /// Returns bool stats, generating them lazily on first access.
145    pub fn bool_stats(&mut self) -> &BoolStats {
146        let array = self.array.clone();
147
148        self.cache.get_or_insert_with::<BoolStats>(|| {
149            BoolStats::generate(&array.to_bool()).vortex_expect("BoolStats shouldn't fail")
150        })
151    }
152
153    // TODO(connor): These should all have interior mutability instead!!!
154
155    /// Returns integer stats, generating them lazily on first access.
156    pub fn integer_stats(&mut self) -> &IntegerStats {
157        let array = self.array.clone();
158        let opts = self.opts;
159
160        self.cache.get_or_insert_with::<IntegerStats>(|| {
161            IntegerStats::generate_opts(&array.to_primitive(), opts)
162        })
163    }
164
165    /// Returns float stats, generating them lazily on first access.
166    pub fn float_stats(&mut self) -> &FloatStats {
167        let array = self.array.clone();
168        let opts = self.opts;
169
170        self.cache.get_or_insert_with::<FloatStats>(|| {
171            FloatStats::generate_opts(&array.to_primitive(), opts)
172        })
173    }
174
175    /// Returns string stats, generating them lazily on first access.
176    pub fn string_stats(&mut self) -> &StringStats {
177        let array = self.array.clone();
178        let opts = self.opts;
179
180        self.cache.get_or_insert_with::<StringStats>(|| {
181            StringStats::generate_opts(&array.to_varbinview(), opts)
182        })
183    }
184
185    /// For extension schemes with custom stats types.
186    pub fn get_or_insert_with<T: 'static>(&mut self, f: impl FnOnce() -> T) -> &T {
187        self.cache.get_or_insert_with::<T>(f)
188    }
189}