Skip to main content

formualizer_eval/engine/
lookup_index_cache.rs

1use std::hash::{Hash, Hasher};
2use std::sync::atomic::{AtomicUsize, Ordering};
3use std::sync::{Arc, RwLock};
4
5use formualizer_common::{ExcelError, LiteralValue, SheetId};
6use rustc_hash::FxHashMap;
7use smallvec::SmallVec;
8
9use crate::builtins::lookup::lookup_utils::cmp_for_lookup;
10use crate::engine::range_view::RangeView;
11
12#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
13pub struct LookupIndexKey {
14    pub(crate) sheet_id: SheetId,
15    pub(crate) start_row: u32,
16    pub(crate) start_col: u32,
17    pub(crate) end_row: u32,
18    pub(crate) end_col: u32,
19    pub(crate) axis: LookupAxis,
20    pub(crate) snapshot_id: u64,
21}
22
23#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
24pub enum LookupAxis {
25    ColumnInView(usize),
26    RowInView(usize),
27}
28
29#[derive(Debug, Eq, PartialEq)]
30pub enum LookupHashKey {
31    Number(u64),
32    Text(Box<str>),
33    Boolean(bool),
34    Empty,
35}
36
37impl Hash for LookupHashKey {
38    fn hash<H: Hasher>(&self, state: &mut H) {
39        match self {
40            Self::Number(bits) => {
41                0u8.hash(state);
42                bits.hash(state);
43            }
44            Self::Text(text) => {
45                1u8.hash(state);
46                text.hash(state);
47            }
48            Self::Boolean(value) => {
49                2u8.hash(state);
50                value.hash(state);
51            }
52            Self::Empty => {
53                3u8.hash(state);
54            }
55        }
56    }
57}
58
59impl LookupHashKey {
60    pub(crate) fn from_literal(value: &LiteralValue) -> Option<Self> {
61        match value {
62            LiteralValue::Number(n) => Some(Self::Number(normalize_f64_bits(*n))),
63            LiteralValue::Int(i) => Some(Self::Number(normalize_f64_bits(*i as f64))),
64            LiteralValue::Text(s) => Some(Self::Text(s.to_lowercase().into_boxed_str())),
65            LiteralValue::Boolean(b) => Some(Self::Boolean(*b)),
66            LiteralValue::Empty => Some(Self::Empty),
67            LiteralValue::Error(_)
68            | LiteralValue::Array(_)
69            | LiteralValue::Date(_)
70            | LiteralValue::DateTime(_)
71            | LiteralValue::Time(_)
72            | LiteralValue::Duration(_)
73            | LiteralValue::Pending => None,
74        }
75    }
76}
77
78fn normalize_f64_bits(n: f64) -> u64 {
79    if n.is_nan() {
80        return f64::NAN.to_bits();
81    }
82    let rounded = n.round();
83    if (n - rounded).abs() < 1e-12 {
84        rounded.to_bits()
85    } else {
86        n.to_bits()
87    }
88}
89
90#[derive(Debug, Clone, Default)]
91pub struct DuplicateIndices {
92    pub(crate) first: usize,
93    pub(crate) last: usize,
94    pub(crate) all: SmallVec<[usize; 1]>,
95}
96
97pub struct LookupIndex {
98    pub(crate) len: usize,
99    pub(crate) bytes: usize,
100    pub(crate) entries: FxHashMap<LookupHashKey, DuplicateIndices>,
101    pub(crate) cell_values: Box<[LiteralValue]>,
102    pub(crate) first_empty: Option<usize>,
103}
104
105impl LookupIndex {
106    pub(crate) fn build(
107        view: &RangeView<'_>,
108        axis: LookupAxis,
109    ) -> Result<BuildOutcome, ExcelError> {
110        let (rows, cols) = view.dims();
111        let len = match axis {
112            LookupAxis::ColumnInView(col) => {
113                if col >= cols {
114                    return Ok(BuildOutcome::Degenerate);
115                }
116                rows
117            }
118            LookupAxis::RowInView(row) => {
119                if row >= rows {
120                    return Ok(BuildOutcome::Degenerate);
121                }
122                cols
123            }
124        };
125        if len == 0 {
126            return Ok(BuildOutcome::Degenerate);
127        }
128
129        let mut entries: FxHashMap<LookupHashKey, DuplicateIndices> = FxHashMap::default();
130        let mut cell_values = Vec::with_capacity(len);
131        let mut first_empty = None;
132        let mut error_count = 0usize;
133
134        for idx in 0..len {
135            let value = match axis {
136                LookupAxis::ColumnInView(col) => view.get_cell(idx, col),
137                LookupAxis::RowInView(row) => view.get_cell(row, idx),
138            };
139            if matches!(value, LiteralValue::Error(_)) {
140                error_count += 1;
141            }
142            if matches!(value, LiteralValue::Empty) && first_empty.is_none() {
143                first_empty = Some(idx);
144            }
145            if let Some(key) = LookupHashKey::from_literal(&value) {
146                let dups = entries.entry(key).or_insert_with(|| DuplicateIndices {
147                    first: idx,
148                    last: idx,
149                    all: SmallVec::new(),
150                });
151                if dups.all.is_empty() {
152                    dups.first = idx;
153                }
154                dups.last = idx;
155                dups.all.push(idx);
156            }
157            cell_values.push(value);
158        }
159
160        if error_count > 0 {
161            return Ok(BuildOutcome::ErrorInLookupAxis);
162        }
163
164        let bytes = estimate_bytes(len, entries.len());
165        Ok(BuildOutcome::Built(Self {
166            len,
167            bytes,
168            entries,
169            cell_values: cell_values.into_boxed_slice(),
170            first_empty,
171        }))
172    }
173
174    pub(crate) fn find_first_exact(&self, needle: &LiteralValue) -> Option<usize> {
175        let hash_key = LookupHashKey::from_literal(needle)?;
176        if let Some(dups) = self.entries.get(&hash_key) {
177            for &idx in &dups.all {
178                if cmp_for_lookup(needle, &self.cell_values[idx]) == Some(0) {
179                    return Some(idx);
180                }
181            }
182        }
183        if let Some(n) = numeric_zero_candidate(needle)
184            && n.abs() < 1e-12
185        {
186            return self.first_empty;
187        }
188        None
189    }
190
191    pub(crate) fn find_last_exact(&self, needle: &LiteralValue) -> Option<usize> {
192        let hash_key = LookupHashKey::from_literal(needle)?;
193        if let Some(dups) = self.entries.get(&hash_key) {
194            for &idx in dups.all.iter().rev() {
195                if cmp_for_lookup(needle, &self.cell_values[idx]) == Some(0) {
196                    return Some(idx);
197                }
198            }
199        }
200        if let Some(n) = numeric_zero_candidate(needle)
201            && n.abs() < 1e-12
202        {
203            return self.first_empty;
204        }
205        None
206    }
207}
208
209fn numeric_zero_candidate(needle: &LiteralValue) -> Option<f64> {
210    match needle {
211        LiteralValue::Number(n) => Some(*n),
212        LiteralValue::Int(i) => Some(*i as f64),
213        _ => None,
214    }
215}
216
217pub(crate) fn estimate_bytes(len: usize, entries: usize) -> usize {
218    len.saturating_mul(std::mem::size_of::<LiteralValue>().saturating_add(8))
219        .saturating_add(entries.saturating_mul(96))
220        .saturating_add(256)
221}
222
223pub(crate) enum BuildOutcome {
224    Built(LookupIndex),
225    ErrorInLookupAxis,
226    Degenerate,
227}
228
229const LOOKUP_INDEX_BUILD_THRESHOLD: u32 = 3;
230const CALL_COUNT_PRUNE_LIMIT: usize = 4096;
231
232#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
233pub struct LookupIndexCacheReport {
234    pub(crate) builds: usize,
235    pub(crate) hits: usize,
236    pub(crate) misses: usize,
237    pub(crate) skipped_volatile: usize,
238    pub(crate) skipped_error: usize,
239    pub(crate) skipped_tiny: usize,
240    pub(crate) skipped_cap: usize,
241    pub(crate) skipped_below_threshold: usize,
242    pub(crate) bytes_in_cache: usize,
243    pub(crate) entries_count: usize,
244}
245
246pub struct LookupIndexCache {
247    inner: RwLock<FxHashMap<LookupIndexKey, Arc<LookupIndex>>>,
248    call_counts: RwLock<FxHashMap<LookupIndexKey, u32>>,
249    volatile_keys: RwLock<FxHashMap<LookupIndexKey, ()>>,
250    build_threshold: u32,
251    bytes_in_use: AtomicUsize,
252    max_bytes: usize,
253    builds: AtomicUsize,
254    hits: AtomicUsize,
255    misses: AtomicUsize,
256    skipped_volatile: AtomicUsize,
257    skipped_error: AtomicUsize,
258    skipped_tiny: AtomicUsize,
259    skipped_cap: AtomicUsize,
260    skipped_below_threshold: AtomicUsize,
261}
262
263fn volatile_key(mut key: LookupIndexKey) -> LookupIndexKey {
264    key.snapshot_id = 0;
265    key
266}
267
268impl LookupIndexCache {
269    pub(crate) fn new(max_bytes: usize) -> Self {
270        Self {
271            inner: RwLock::new(FxHashMap::default()),
272            call_counts: RwLock::new(FxHashMap::default()),
273            volatile_keys: RwLock::new(FxHashMap::default()),
274            build_threshold: LOOKUP_INDEX_BUILD_THRESHOLD,
275            bytes_in_use: AtomicUsize::new(0),
276            max_bytes,
277            builds: AtomicUsize::new(0),
278            hits: AtomicUsize::new(0),
279            misses: AtomicUsize::new(0),
280            skipped_volatile: AtomicUsize::new(0),
281            skipped_error: AtomicUsize::new(0),
282            skipped_tiny: AtomicUsize::new(0),
283            skipped_cap: AtomicUsize::new(0),
284            skipped_below_threshold: AtomicUsize::new(0),
285        }
286    }
287
288    pub(crate) fn get(&self, key: &LookupIndexKey) -> Option<Arc<LookupIndex>> {
289        let found = self
290            .inner
291            .read()
292            .ok()
293            .and_then(|guard| guard.get(key).cloned());
294        if found.is_some() {
295            self.hits.fetch_add(1, Ordering::Relaxed);
296        } else {
297            self.misses.fetch_add(1, Ordering::Relaxed);
298        }
299        found
300    }
301
302    pub(crate) fn should_build(&self, key: LookupIndexKey) -> bool {
303        let Ok(mut guard) = self.call_counts.write() else {
304            self.skipped_below_threshold.fetch_add(1, Ordering::Relaxed);
305            return false;
306        };
307        if guard.len() > CALL_COUNT_PRUNE_LIMIT {
308            guard.retain(|existing_key, _| existing_key.snapshot_id == key.snapshot_id);
309        }
310        let count = guard.entry(key).or_insert(0);
311        *count = count.saturating_add(1);
312        if *count <= self.build_threshold {
313            self.skipped_below_threshold.fetch_add(1, Ordering::Relaxed);
314            return false;
315        }
316        true
317    }
318
319    pub(crate) fn would_exceed_cap(&self, bytes: usize) -> bool {
320        self.bytes_in_use
321            .load(Ordering::Relaxed)
322            .saturating_add(bytes)
323            > self.max_bytes
324    }
325
326    pub(crate) fn is_known_volatile(&self, key: &LookupIndexKey) -> bool {
327        let volatile_key = volatile_key(*key);
328        self.volatile_keys
329            .read()
330            .map(|guard| guard.contains_key(&volatile_key))
331            .unwrap_or(false)
332    }
333
334    pub(crate) fn note_volatile_key(&self, key: LookupIndexKey) {
335        if let Ok(mut guard) = self.volatile_keys.write() {
336            if guard.len() > CALL_COUNT_PRUNE_LIMIT {
337                guard.clear();
338            }
339            guard.insert(volatile_key(key), ());
340        }
341    }
342
343    pub(crate) fn insert_if_room(
344        &self,
345        key: LookupIndexKey,
346        index: LookupIndex,
347    ) -> Option<Arc<LookupIndex>> {
348        let bytes = index.bytes;
349        let current = self.bytes_in_use.load(Ordering::Relaxed);
350        if current.saturating_add(bytes) > self.max_bytes {
351            self.skipped_cap.fetch_add(1, Ordering::Relaxed);
352            return None;
353        }
354        let index = Arc::new(index);
355        if let Ok(mut guard) = self.inner.write() {
356            if let Some(existing) = guard.get(&key) {
357                self.hits.fetch_add(1, Ordering::Relaxed);
358                return Some(existing.clone());
359            }
360            guard.insert(key, index.clone());
361            self.bytes_in_use.fetch_add(bytes, Ordering::Relaxed);
362            self.builds.fetch_add(1, Ordering::Relaxed);
363            Some(index)
364        } else {
365            None
366        }
367    }
368
369    pub(crate) fn note_skipped_volatile(&self) {
370        self.skipped_volatile.fetch_add(1, Ordering::Relaxed);
371    }
372
373    pub(crate) fn note_skipped_error(&self) {
374        self.skipped_error.fetch_add(1, Ordering::Relaxed);
375    }
376
377    pub(crate) fn note_skipped_tiny(&self) {
378        self.skipped_tiny.fetch_add(1, Ordering::Relaxed);
379    }
380
381    pub(crate) fn note_skipped_cap(&self) {
382        self.skipped_cap.fetch_add(1, Ordering::Relaxed);
383    }
384
385    pub(crate) fn reset_counters(&self) {
386        self.builds.store(0, Ordering::Relaxed);
387        self.hits.store(0, Ordering::Relaxed);
388        self.misses.store(0, Ordering::Relaxed);
389        self.skipped_volatile.store(0, Ordering::Relaxed);
390        self.skipped_error.store(0, Ordering::Relaxed);
391        self.skipped_tiny.store(0, Ordering::Relaxed);
392        self.skipped_cap.store(0, Ordering::Relaxed);
393        self.skipped_below_threshold.store(0, Ordering::Relaxed);
394    }
395
396    pub(crate) fn report(&self) -> LookupIndexCacheReport {
397        let entries_count = self
398            .inner
399            .read()
400            .map(|guard| guard.len())
401            .unwrap_or_default();
402        LookupIndexCacheReport {
403            builds: self.builds.load(Ordering::Relaxed),
404            hits: self.hits.load(Ordering::Relaxed),
405            misses: self.misses.load(Ordering::Relaxed),
406            skipped_volatile: self.skipped_volatile.load(Ordering::Relaxed),
407            skipped_error: self.skipped_error.load(Ordering::Relaxed),
408            skipped_tiny: self.skipped_tiny.load(Ordering::Relaxed),
409            skipped_cap: self.skipped_cap.load(Ordering::Relaxed),
410            skipped_below_threshold: self.skipped_below_threshold.load(Ordering::Relaxed),
411            bytes_in_cache: self.bytes_in_use.load(Ordering::Relaxed),
412            entries_count,
413        }
414    }
415}