Skip to main content

vortex_compressor/stats/
cache.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Per-compression-site statistics cache and the [`ArrayAndStats`] bundle.
5
6use std::any::Any;
7use std::any::TypeId;
8
9use vortex_array::ArrayRef;
10use vortex_array::ArrayView;
11use vortex_array::ToCanonical;
12use vortex_array::arrays::Primitive;
13use vortex_array::arrays::VarBinView;
14use vortex_error::VortexExpect;
15
16use super::BoolStats;
17use super::FloatStats;
18use super::GenerateStatsOptions;
19use super::IntegerStats;
20use super::StringStats;
21
/// A small type-indexed store of lazily computed compression statistics.
///
/// Values are type-erased behind [`Any`] and recovered by downcasting to the requested type.
struct StatsCache {
    // TODO(connor): We could further optimize this with a `SmallVec` here.
    /// `(type id, boxed value)` pairs, looked up by [`TypeId`].
    ///
    /// Only a handful of distinct statistics types are expected per cache, so a plain vector is
    /// used instead of a hash map.
    entries: Vec<(TypeId, Box<dyn Any>)>,
}
31
32impl StatsCache {
33    /// Creates a new empty cache.
34    fn new() -> Self {
35        Self {
36            entries: Vec::new(),
37        }
38    }
39
40    /// Returns a cached value, computing it on first access.
41    fn get_or_insert_with<T: 'static>(&mut self, f: impl FnOnce() -> T) -> &T {
42        let type_id = TypeId::of::<T>();
43        let pos = self.entries.iter().position(|(id, _)| *id == type_id);
44
45        if let Some(pos) = pos {
46            self.entries[pos]
47                .1
48                .downcast_ref::<T>()
49                .vortex_expect("we just checked the TypeID")
50        } else {
51            self.entries.push((type_id, Box::new(f())));
52            self.entries
53                .last()
54                .vortex_expect("just pushed")
55                .1
56                .downcast_ref::<T>()
57                .vortex_expect("we just checked the TypeID")
58        }
59    }
60}
61
62/// An array bundled with its lazily-computed statistics cache.
63///
64/// The cache is guaranteed to correspond to the array. When a scheme creates a derived array (e.g.
65/// FoR bias subtraction), it must create a new [`ArrayAndStats`] so that stale stats from the
66/// original array are not reused.
67///
68/// Built-in stats are accessed via typed methods (`integer_stats`, `float_stats`, `string_stats`)
69/// which generate stats lazily on first access using the stored [`GenerateStatsOptions`].
70///
71/// Extension schemes can use `get_or_insert_with` for custom stats types.
72pub struct ArrayAndStats {
73    /// The array. This is always in canonical form.
74    array: ArrayRef,
75    /// The stats cache.
76    cache: StatsCache,
77    /// The stats generation options.
78    opts: GenerateStatsOptions,
79}
80
81impl ArrayAndStats {
82    /// Creates a new bundle with the given stats generation options.
83    ///
84    /// Stats are generated lazily on first access via the typed accessor methods.
85    ///
86    /// # Panics
87    ///
88    /// Panics if the array is not canonical.
89    pub fn new(array: ArrayRef, opts: GenerateStatsOptions) -> Self {
90        assert!(
91            array.is_canonical(),
92            "ArrayAndStats should only be created with canonical arrays"
93        );
94
95        Self {
96            array,
97            cache: StatsCache::new(),
98            opts,
99        }
100    }
101
102    /// Returns a reference to the array.
103    pub fn array(&self) -> &ArrayRef {
104        &self.array
105    }
106
107    /// Returns the array as an [`ArrayView<Primitive>`].
108    ///
109    /// # Panics
110    ///
111    /// Panics if the array is not a primitive array.
112    pub fn array_as_primitive(&self) -> ArrayView<'_, Primitive> {
113        self.array
114            .as_opt::<Primitive>()
115            .vortex_expect("the array is guaranteed to already be canonical by construction")
116    }
117
118    /// Returns the array as an [`ArrayView<VarBinView>`].
119    ///
120    /// # Panics
121    ///
122    /// Panics if the array is not a UTF-8 string array.
123    pub fn array_as_utf8(&self) -> ArrayView<'_, VarBinView> {
124        self.array
125            .as_opt::<VarBinView>()
126            .vortex_expect("the array is guaranteed to already be canonical by construction")
127    }
128
129    /// Consumes the bundle and returns the array.
130    pub fn into_array(self) -> ArrayRef {
131        self.array
132    }
133
134    /// Returns the length of the array.
135    pub fn array_len(&self) -> usize {
136        self.array.len()
137    }
138
139    /// Returns bool stats, generating them lazily on first access.
140    pub fn bool_stats(&mut self) -> &BoolStats {
141        let array = self.array.clone();
142
143        self.cache.get_or_insert_with::<BoolStats>(|| {
144            BoolStats::generate(&array.to_bool()).vortex_expect("BoolStats shouldn't fail")
145        })
146    }
147
148    // TODO(connor): These should all have interior mutability instead!!!
149
150    /// Returns integer stats, generating them lazily on first access.
151    pub fn integer_stats(&mut self) -> &IntegerStats {
152        let array = self.array.clone();
153        let opts = self.opts;
154
155        self.cache.get_or_insert_with::<IntegerStats>(|| {
156            IntegerStats::generate_opts(&array.to_primitive(), opts)
157        })
158    }
159
160    /// Returns float stats, generating them lazily on first access.
161    pub fn float_stats(&mut self) -> &FloatStats {
162        let array = self.array.clone();
163        let opts = self.opts;
164
165        self.cache.get_or_insert_with::<FloatStats>(|| {
166            FloatStats::generate_opts(&array.to_primitive(), opts)
167        })
168    }
169
170    /// Returns string stats, generating them lazily on first access.
171    pub fn string_stats(&mut self) -> &StringStats {
172        let array = self.array.clone();
173        let opts = self.opts;
174
175        self.cache.get_or_insert_with::<StringStats>(|| {
176            StringStats::generate_opts(&array.to_varbinview(), opts)
177        })
178    }
179
180    /// For extension schemes with custom stats types.
181    pub fn get_or_insert_with<T: 'static>(&mut self, f: impl FnOnce() -> T) -> &T {
182        self.cache.get_or_insert_with::<T>(f)
183    }
184}