casc_storage/index/
combined_index.rs

1//! Combined index for efficient lookups across multiple bucket indices
2
3use crate::types::{ArchiveLocation, EKey};
4use dashmap::DashMap;
5use std::collections::BTreeMap;
6use std::sync::Arc;
7use tracing::{debug, trace};
8
9/// A combined index that maintains both per-bucket indices and a global lookup table
10pub struct CombinedIndex {
11    /// Per-bucket indices using BTreeMap for O(log n) lookups
12    bucket_indices: DashMap<u8, BTreeMap<EKey, ArchiveLocation>>,
13    /// Global index for direct O(1) bucket lookup
14    global_index: DashMap<EKey, u8>,
15    /// Optional bloom filter for existence checks (future enhancement)
16    #[allow(dead_code)]
17    bloom_filter: Option<Arc<BloomFilter>>,
18}
19
20impl CombinedIndex {
21    /// Create a new combined index
22    pub fn new() -> Self {
23        Self {
24            bucket_indices: DashMap::new(),
25            global_index: DashMap::new(),
26            bloom_filter: None,
27        }
28    }
29
30    /// Add an entry to the index
31    pub fn insert(&self, ekey: EKey, location: ArchiveLocation) {
32        let bucket = ekey.bucket_index();
33
34        // Update bucket index
35        self.bucket_indices
36            .entry(bucket)
37            .or_default()
38            .insert(ekey, location);
39
40        // Update global index
41        self.global_index.insert(ekey, bucket);
42
43        trace!("Inserted {} into bucket {:02x}", ekey, bucket);
44    }
45
46    /// Perform an optimized lookup using bucket hash
47    pub fn lookup(&self, ekey: &EKey) -> Option<ArchiveLocation> {
48        // First check if we know which bucket this key is in
49        if let Some(bucket_ref) = self.global_index.get(ekey) {
50            let bucket = *bucket_ref;
51
52            // Direct lookup in the specific bucket - O(log n)
53            if let Some(bucket_index) = self.bucket_indices.get(&bucket) {
54                return bucket_index.get(ekey).copied();
55            }
56        }
57
58        // Fallback: compute bucket and try direct lookup
59        let computed_bucket = ekey.bucket_index();
60        if let Some(bucket_index) = self.bucket_indices.get(&computed_bucket) {
61            if let Some(location) = bucket_index.get(ekey) {
62                // Update global index for next time
63                self.global_index.insert(*ekey, computed_bucket);
64                return Some(*location);
65            }
66        }
67
68        // Last resort: search all buckets (should rarely happen)
69        debug!("Falling back to full search for {}", ekey);
70        for bucket_ref in self.bucket_indices.iter() {
71            if let Some(location) = bucket_ref.value().get(ekey) {
72                // Update global index for next time
73                self.global_index.insert(*ekey, *bucket_ref.key());
74                return Some(*location);
75            }
76        }
77
78        None
79    }
80
81    /// Batch lookup for multiple keys - optimized for bulk operations
82    pub fn lookup_batch(&self, ekeys: &[EKey]) -> Vec<Option<ArchiveLocation>> {
83        ekeys.iter().map(|ekey| self.lookup(ekey)).collect()
84    }
85
86    /// Get all entries from a specific bucket
87    pub fn get_bucket(&self, bucket: u8) -> Option<Vec<(EKey, ArchiveLocation)>> {
88        self.bucket_indices
89            .get(&bucket)
90            .map(|index| index.iter().map(|(k, v)| (*k, *v)).collect())
91    }
92
93    /// Get total number of entries across all buckets
94    pub fn total_entries(&self) -> usize {
95        self.bucket_indices
96            .iter()
97            .map(|bucket| bucket.value().len())
98            .sum()
99    }
100
101    /// Get number of buckets
102    pub fn bucket_count(&self) -> usize {
103        self.bucket_indices.len()
104    }
105
106    /// Clear all indices
107    pub fn clear(&self) {
108        self.bucket_indices.clear();
109        self.global_index.clear();
110    }
111
112    /// Rebuild global index from bucket indices (for migration)
113    pub fn rebuild_global_index(&self) {
114        self.global_index.clear();
115
116        for bucket_ref in self.bucket_indices.iter() {
117            let bucket = *bucket_ref.key();
118            for ekey in bucket_ref.value().keys() {
119                self.global_index.insert(*ekey, bucket);
120            }
121        }
122
123        debug!(
124            "Rebuilt global index with {} entries",
125            self.global_index.len()
126        );
127    }
128
129    /// Get statistics about the index
130    pub fn stats(&self) -> IndexStats {
131        let mut min_bucket_size = usize::MAX;
132        let mut max_bucket_size = 0;
133        let mut total_entries = 0;
134
135        for bucket_ref in self.bucket_indices.iter() {
136            let size = bucket_ref.value().len();
137            min_bucket_size = min_bucket_size.min(size);
138            max_bucket_size = max_bucket_size.max(size);
139            total_entries += size;
140        }
141
142        IndexStats {
143            total_entries,
144            bucket_count: self.bucket_indices.len(),
145            min_bucket_size: if min_bucket_size == usize::MAX {
146                0
147            } else {
148                min_bucket_size
149            },
150            max_bucket_size,
151            avg_bucket_size: if self.bucket_indices.is_empty() {
152                0.0
153            } else {
154                total_entries as f64 / self.bucket_indices.len() as f64
155            },
156            global_index_size: self.global_index.len(),
157        }
158    }
159}
160
161impl Default for CombinedIndex {
162    fn default() -> Self {
163        Self::new()
164    }
165}
166
167/// Statistics about the combined index
168#[derive(Debug, Clone)]
169pub struct IndexStats {
170    pub total_entries: usize,
171    pub bucket_count: usize,
172    pub min_bucket_size: usize,
173    pub max_bucket_size: usize,
174    pub avg_bucket_size: f64,
175    pub global_index_size: usize,
176}
177
178/// Simple bloom filter for existence checks (stub for future implementation)
179struct BloomFilter {
180    // Future: implement a proper bloom filter for fast negative lookups
181}
182
183#[cfg(test)]
184mod tests {
185    use super::*;
186
187    #[test]
188    fn test_combined_index_lookup() {
189        let index = CombinedIndex::new();
190
191        // Create test entries
192        let key1 = EKey::new([0x10; 16]);
193        let key2 = EKey::new([0x20; 16]);
194        let key3 = EKey::new([0x30; 16]);
195
196        let loc1 = ArchiveLocation {
197            archive_id: 1,
198            offset: 100,
199            size: 50,
200        };
201
202        let loc2 = ArchiveLocation {
203            archive_id: 2,
204            offset: 200,
205            size: 60,
206        };
207
208        // Insert entries
209        index.insert(key1, loc1);
210        index.insert(key2, loc2);
211
212        // Test lookups
213        assert_eq!(index.lookup(&key1), Some(loc1));
214        assert_eq!(index.lookup(&key2), Some(loc2));
215        assert_eq!(index.lookup(&key3), None);
216
217        // Test stats
218        let stats = index.stats();
219        assert_eq!(stats.total_entries, 2);
220        assert_eq!(stats.global_index_size, 2);
221    }
222
223    #[test]
224    fn test_batch_lookup() {
225        let index = CombinedIndex::new();
226
227        // Insert test data
228        for i in 0..10u8 {
229            let mut key_bytes = [0u8; 16];
230            key_bytes[0] = i;
231            let key = EKey::new(key_bytes);
232            let loc = ArchiveLocation {
233                archive_id: i as u16,
234                offset: (i as u64) * 100,
235                size: 50,
236            };
237            index.insert(key, loc);
238        }
239
240        // Batch lookup
241        let keys: Vec<_> = (0..5u8)
242            .map(|i| {
243                let mut key_bytes = [0u8; 16];
244                key_bytes[0] = i;
245                EKey::new(key_bytes)
246            })
247            .collect();
248
249        let results = index.lookup_batch(&keys);
250        assert_eq!(results.len(), 5);
251        assert!(results.iter().all(|r| r.is_some()));
252    }
253}