ruvector_graph/optimization/
index_compression.rs

1//! Compressed index structures for massive space savings
2//!
3//! This module provides:
4//! - Roaring bitmaps for label indexes
5//! - Delta encoding for sorted ID lists
6//! - Dictionary encoding for string properties
7
8use parking_lot::RwLock;
9use roaring::RoaringBitmap;
10use std::collections::HashMap;
11use std::sync::Arc;
12
13/// Compressed index using multiple encoding strategies
14pub struct CompressedIndex {
15    /// Bitmap indexes for labels
16    label_indexes: Arc<RwLock<HashMap<String, RoaringBitmap>>>,
17    /// Delta-encoded sorted ID lists
18    sorted_indexes: Arc<RwLock<HashMap<String, DeltaEncodedList>>>,
19    /// Dictionary encoding for string properties
20    string_dict: Arc<RwLock<StringDictionary>>,
21}
22
23impl CompressedIndex {
24    pub fn new() -> Self {
25        Self {
26            label_indexes: Arc::new(RwLock::new(HashMap::new())),
27            sorted_indexes: Arc::new(RwLock::new(HashMap::new())),
28            string_dict: Arc::new(RwLock::new(StringDictionary::new())),
29        }
30    }
31
32    /// Add node to label index
33    pub fn add_to_label_index(&self, label: &str, node_id: u64) {
34        let mut indexes = self.label_indexes.write();
35        indexes
36            .entry(label.to_string())
37            .or_insert_with(RoaringBitmap::new)
38            .insert(node_id as u32);
39    }
40
41    /// Get all nodes with a specific label
42    pub fn get_nodes_by_label(&self, label: &str) -> Vec<u64> {
43        self.label_indexes
44            .read()
45            .get(label)
46            .map(|bitmap| bitmap.iter().map(|id| id as u64).collect())
47            .unwrap_or_default()
48    }
49
50    /// Check if node has label (fast bitmap lookup)
51    pub fn has_label(&self, label: &str, node_id: u64) -> bool {
52        self.label_indexes
53            .read()
54            .get(label)
55            .map(|bitmap| bitmap.contains(node_id as u32))
56            .unwrap_or(false)
57    }
58
59    /// Count nodes with label
60    pub fn count_label(&self, label: &str) -> u64 {
61        self.label_indexes
62            .read()
63            .get(label)
64            .map(|bitmap| bitmap.len())
65            .unwrap_or(0)
66    }
67
68    /// Intersect multiple labels (efficient bitmap AND)
69    pub fn intersect_labels(&self, labels: &[&str]) -> Vec<u64> {
70        let indexes = self.label_indexes.read();
71
72        if labels.is_empty() {
73            return Vec::new();
74        }
75
76        let mut result = indexes
77            .get(labels[0])
78            .cloned()
79            .unwrap_or_else(RoaringBitmap::new);
80
81        for &label in &labels[1..] {
82            if let Some(bitmap) = indexes.get(label) {
83                result &= bitmap;
84            } else {
85                return Vec::new();
86            }
87        }
88
89        result.iter().map(|id| id as u64).collect()
90    }
91
92    /// Union multiple labels (efficient bitmap OR)
93    pub fn union_labels(&self, labels: &[&str]) -> Vec<u64> {
94        let indexes = self.label_indexes.read();
95        let mut result = RoaringBitmap::new();
96
97        for &label in labels {
98            if let Some(bitmap) = indexes.get(label) {
99                result |= bitmap;
100            }
101        }
102
103        result.iter().map(|id| id as u64).collect()
104    }
105
106    /// Encode string using dictionary
107    pub fn encode_string(&self, s: &str) -> u32 {
108        self.string_dict.write().encode(s)
109    }
110
111    /// Decode string from dictionary
112    pub fn decode_string(&self, id: u32) -> Option<String> {
113        self.string_dict.read().decode(id)
114    }
115}
116
117impl Default for CompressedIndex {
118    fn default() -> Self {
119        Self::new()
120    }
121}
122
123/// Roaring bitmap index for efficient set operations
124pub struct RoaringBitmapIndex {
125    bitmap: RoaringBitmap,
126}
127
128impl RoaringBitmapIndex {
129    pub fn new() -> Self {
130        Self {
131            bitmap: RoaringBitmap::new(),
132        }
133    }
134
135    pub fn insert(&mut self, id: u64) {
136        self.bitmap.insert(id as u32);
137    }
138
139    pub fn contains(&self, id: u64) -> bool {
140        self.bitmap.contains(id as u32)
141    }
142
143    pub fn remove(&mut self, id: u64) {
144        self.bitmap.remove(id as u32);
145    }
146
147    pub fn len(&self) -> u64 {
148        self.bitmap.len()
149    }
150
151    pub fn is_empty(&self) -> bool {
152        self.bitmap.is_empty()
153    }
154
155    pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
156        self.bitmap.iter().map(|id| id as u64)
157    }
158
159    /// Intersect with another bitmap
160    pub fn intersect(&self, other: &Self) -> Self {
161        Self {
162            bitmap: &self.bitmap & &other.bitmap,
163        }
164    }
165
166    /// Union with another bitmap
167    pub fn union(&self, other: &Self) -> Self {
168        Self {
169            bitmap: &self.bitmap | &other.bitmap,
170        }
171    }
172
173    /// Serialize to bytes
174    pub fn serialize(&self) -> Vec<u8> {
175        let mut bytes = Vec::new();
176        self.bitmap
177            .serialize_into(&mut bytes)
178            .expect("Failed to serialize bitmap");
179        bytes
180    }
181
182    /// Deserialize from bytes
183    pub fn deserialize(bytes: &[u8]) -> Result<Self, Box<dyn std::error::Error>> {
184        let bitmap = RoaringBitmap::deserialize_from(bytes)?;
185        Ok(Self { bitmap })
186    }
187}
188
189impl Default for RoaringBitmapIndex {
190    fn default() -> Self {
191        Self::new()
192    }
193}
194
195/// Delta encoding for sorted ID lists
196/// Stores differences between consecutive IDs for better compression
197pub struct DeltaEncodedList {
198    /// Base value (first ID)
199    base: u64,
200    /// Delta values
201    deltas: Vec<u32>,
202}
203
204impl DeltaEncodedList {
205    pub fn new() -> Self {
206        Self {
207            base: 0,
208            deltas: Vec::new(),
209        }
210    }
211
212    /// Encode a sorted list of IDs
213    pub fn encode(ids: &[u64]) -> Self {
214        if ids.is_empty() {
215            return Self::new();
216        }
217
218        let base = ids[0];
219        let deltas = ids
220            .windows(2)
221            .map(|pair| (pair[1] - pair[0]) as u32)
222            .collect();
223
224        Self { base, deltas }
225    }
226
227    /// Decode to original ID list
228    pub fn decode(&self) -> Vec<u64> {
229        if self.deltas.is_empty() {
230            if self.base == 0 {
231                return Vec::new();
232            }
233            return vec![self.base];
234        }
235
236        let mut result = Vec::with_capacity(self.deltas.len() + 1);
237        result.push(self.base);
238
239        let mut current = self.base;
240        for &delta in &self.deltas {
241            current += delta as u64;
242            result.push(current);
243        }
244
245        result
246    }
247
248    /// Get compression ratio
249    pub fn compression_ratio(&self) -> f64 {
250        let original_size = (self.deltas.len() + 1) * 8; // u64s
251        let compressed_size = 8 + self.deltas.len() * 4; // base + u32 deltas
252        original_size as f64 / compressed_size as f64
253    }
254}
255
256impl Default for DeltaEncodedList {
257    fn default() -> Self {
258        Self::new()
259    }
260}
261
262/// Delta encoder utility
263pub struct DeltaEncoder;
264
265impl DeltaEncoder {
266    /// Encode sorted u64 slice to delta-encoded format
267    pub fn encode(values: &[u64]) -> Vec<u8> {
268        if values.is_empty() {
269            return Vec::new();
270        }
271
272        let mut result = Vec::new();
273
274        // Write base value
275        result.extend_from_slice(&values[0].to_le_bytes());
276
277        // Write deltas
278        for window in values.windows(2) {
279            let delta = (window[1] - window[0]) as u32;
280            result.extend_from_slice(&delta.to_le_bytes());
281        }
282
283        result
284    }
285
286    /// Decode delta-encoded format back to u64 values
287    pub fn decode(bytes: &[u8]) -> Vec<u64> {
288        if bytes.len() < 8 {
289            return Vec::new();
290        }
291
292        let mut result = Vec::new();
293
294        // Read base value
295        let base = u64::from_le_bytes(bytes[0..8].try_into().unwrap());
296        result.push(base);
297
298        // Read deltas
299        let mut current = base;
300        for chunk in bytes[8..].chunks(4) {
301            if chunk.len() == 4 {
302                let delta = u32::from_le_bytes(chunk.try_into().unwrap());
303                current += delta as u64;
304                result.push(current);
305            }
306        }
307
308        result
309    }
310}
311
312/// String dictionary for deduplication and compression
313struct StringDictionary {
314    /// String to ID mapping
315    string_to_id: HashMap<String, u32>,
316    /// ID to string mapping
317    id_to_string: HashMap<u32, String>,
318    /// Next available ID
319    next_id: u32,
320}
321
322impl StringDictionary {
323    fn new() -> Self {
324        Self {
325            string_to_id: HashMap::new(),
326            id_to_string: HashMap::new(),
327            next_id: 0,
328        }
329    }
330
331    fn encode(&mut self, s: &str) -> u32 {
332        if let Some(&id) = self.string_to_id.get(s) {
333            return id;
334        }
335
336        let id = self.next_id;
337        self.next_id += 1;
338
339        self.string_to_id.insert(s.to_string(), id);
340        self.id_to_string.insert(id, s.to_string());
341
342        id
343    }
344
345    fn decode(&self, id: u32) -> Option<String> {
346        self.id_to_string.get(&id).cloned()
347    }
348
349    fn len(&self) -> usize {
350        self.string_to_id.len()
351    }
352}
353
354#[cfg(test)]
355mod tests {
356    use super::*;
357
358    #[test]
359    fn test_compressed_index() {
360        let index = CompressedIndex::new();
361
362        index.add_to_label_index("Person", 1);
363        index.add_to_label_index("Person", 2);
364        index.add_to_label_index("Person", 3);
365        index.add_to_label_index("Employee", 2);
366        index.add_to_label_index("Employee", 3);
367
368        let persons = index.get_nodes_by_label("Person");
369        assert_eq!(persons.len(), 3);
370
371        let intersection = index.intersect_labels(&["Person", "Employee"]);
372        assert_eq!(intersection.len(), 2);
373
374        let union = index.union_labels(&["Person", "Employee"]);
375        assert_eq!(union.len(), 3);
376    }
377
378    #[test]
379    fn test_roaring_bitmap() {
380        let mut bitmap = RoaringBitmapIndex::new();
381
382        bitmap.insert(1);
383        bitmap.insert(100);
384        bitmap.insert(1000);
385
386        assert!(bitmap.contains(1));
387        assert!(bitmap.contains(100));
388        assert!(!bitmap.contains(50));
389
390        assert_eq!(bitmap.len(), 3);
391    }
392
393    #[test]
394    fn test_delta_encoding() {
395        let ids = vec![100, 102, 105, 110, 120];
396        let encoded = DeltaEncodedList::encode(&ids);
397        let decoded = encoded.decode();
398
399        assert_eq!(ids, decoded);
400        assert!(encoded.compression_ratio() > 1.0);
401    }
402
403    #[test]
404    fn test_delta_encoder() {
405        let values = vec![1000, 1005, 1010, 1020, 1030];
406        let encoded = DeltaEncoder::encode(&values);
407        let decoded = DeltaEncoder::decode(&encoded);
408
409        assert_eq!(values, decoded);
410
411        // Encoded size should be smaller
412        assert!(encoded.len() < values.len() * 8);
413    }
414
415    #[test]
416    fn test_string_dictionary() {
417        let index = CompressedIndex::new();
418
419        let id1 = index.encode_string("hello");
420        let id2 = index.encode_string("world");
421        let id3 = index.encode_string("hello"); // Duplicate
422
423        assert_eq!(id1, id3); // Same string gets same ID
424        assert_ne!(id1, id2);
425
426        assert_eq!(index.decode_string(id1), Some("hello".to_string()));
427        assert_eq!(index.decode_string(id2), Some("world".to_string()));
428    }
429}