Skip to main content

grafeo_core/storage/
runlength.rs

1//! Run-length encoding for highly repetitive data.
2//!
3//! Run-length encoding (RLE) compresses sequences with consecutive repeated
4//! values by storing each unique value once along with its run length.
5//!
6//! | Data pattern | Compression ratio |
7//! | ------------ | ----------------- |
8//! | Constant value | ~100x |
9//! | Few distinct values, long runs | 10-50x |
10//! | Many short runs | 2-5x |
11//! | Random data | < 1x (expansion) |
12//!
13//! # Example
14//!
15//! ```ignore
16//! use grafeo_core::storage::RunLengthEncoding;
17//!
18//! // Compress data with many repeated values
19//! let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
20//! let encoded = RunLengthEncoding::encode(&values);
21//!
22//! println!("Runs: {}", encoded.run_count()); // 3 runs
23//! println!("Compression: {:.1}x", encoded.compression_ratio()); // ~3.3x
24//!
25//! // Decode back to original
26//! let decoded = encoded.decode();
27//! assert_eq!(values, decoded);
28//! ```
29
30use std::io::{self, Read};
31
32/// A run in run-length encoding: a value and how many times it repeats.
33#[derive(Debug, Clone, PartialEq, Eq)]
34pub struct Run<T> {
35    /// The value for this run.
36    pub value: T,
37    /// Number of consecutive occurrences.
38    pub length: u64,
39}
40
41impl<T> Run<T> {
42    /// Creates a new run.
43    #[must_use]
44    pub fn new(value: T, length: u64) -> Self {
45        Self { value, length }
46    }
47}
48
49/// Run-length encoded data for u64 values.
50///
51/// Stores sequences of (value, count) pairs. Achieves excellent compression
52/// when data has long runs of repeated values.
53#[derive(Debug, Clone)]
54pub struct RunLengthEncoding {
55    /// The runs: each is (value, count).
56    runs: Vec<Run<u64>>,
57    /// Total number of values (sum of all run lengths).
58    total_count: usize,
59}
60
61impl<'a> IntoIterator for &'a RunLengthEncoding {
62    type Item = u64;
63    type IntoIter = RunLengthIterator<'a>;
64
65    fn into_iter(self) -> Self::IntoIter {
66        self.iter()
67    }
68}
69
70impl RunLengthEncoding {
71    /// Encodes a slice of u64 values using run-length encoding.
72    ///
73    /// # Example
74    /// ```ignore
75    /// let values = vec![1, 1, 1, 2, 2, 3];
76    /// let encoded = RunLengthEncoding::encode(&values);
77    /// // Results in 3 runs: (1, 3), (2, 2), (3, 1)
78    /// ```
79    #[must_use]
80    pub fn encode(values: &[u64]) -> Self {
81        if values.is_empty() {
82            return Self {
83                runs: Vec::new(),
84                total_count: 0,
85            };
86        }
87
88        let mut runs = Vec::new();
89        let mut current_value = values[0];
90        let mut current_length = 1u64;
91
92        for &value in &values[1..] {
93            if value == current_value {
94                current_length += 1;
95            } else {
96                runs.push(Run::new(current_value, current_length));
97                current_value = value;
98                current_length = 1;
99            }
100        }
101
102        // Don't forget the last run
103        runs.push(Run::new(current_value, current_length));
104
105        Self {
106            runs,
107            total_count: values.len(),
108        }
109    }
110
111    /// Creates a run-length encoding from pre-built runs.
112    #[must_use]
113    pub fn from_runs(runs: Vec<Run<u64>>) -> Self {
114        let total_count = runs.iter().map(|r| r.length as usize).sum();
115        Self { runs, total_count }
116    }
117
118    /// Decodes back to the original values.
119    #[must_use]
120    pub fn decode(&self) -> Vec<u64> {
121        let mut values = Vec::with_capacity(self.total_count);
122
123        for run in &self.runs {
124            for _ in 0..run.length {
125                values.push(run.value);
126            }
127        }
128
129        values
130    }
131
132    /// Returns the number of runs.
133    #[must_use]
134    pub fn run_count(&self) -> usize {
135        self.runs.len()
136    }
137
138    /// Returns the total number of values represented.
139    #[must_use]
140    pub fn total_count(&self) -> usize {
141        self.total_count
142    }
143
144    /// Returns the runs.
145    #[must_use]
146    pub fn runs(&self) -> &[Run<u64>] {
147        &self.runs
148    }
149
150    /// Returns true if there are no values.
151    #[must_use]
152    pub fn is_empty(&self) -> bool {
153        self.total_count == 0
154    }
155
156    /// Returns the compression ratio (original size / encoded size).
157    ///
158    /// Values > 1.0 indicate compression, < 1.0 indicate expansion.
159    #[must_use]
160    pub fn compression_ratio(&self) -> f64 {
161        if self.runs.is_empty() {
162            return 1.0;
163        }
164
165        // Original: total_count * 8 bytes
166        // Encoded: runs.len() * 16 bytes (8 for value, 8 for length)
167        let original_size = self.total_count * 8;
168        let encoded_size = self.runs.len() * 16;
169
170        if encoded_size == 0 {
171            return 1.0;
172        }
173
174        original_size as f64 / encoded_size as f64
175    }
176
177    /// Returns true if run-length encoding is beneficial for this data.
178    ///
179    /// Returns true when compression ratio > 1.0 (actual compression achieved).
180    #[must_use]
181    pub fn is_beneficial(&self) -> bool {
182        self.compression_ratio() > 1.0
183    }
184
185    /// Returns the memory size in bytes of the encoded representation.
186    #[must_use]
187    pub fn encoded_size(&self) -> usize {
188        // Each run: 8 bytes value + 8 bytes length
189        self.runs.len() * 16
190    }
191
192    /// Serializes the run-length encoding to bytes.
193    pub fn to_bytes(&self) -> Vec<u8> {
194        let mut bytes = Vec::with_capacity(8 + self.runs.len() * 16);
195
196        // Write run count
197        bytes.extend_from_slice(&(self.runs.len() as u64).to_le_bytes());
198
199        // Write each run
200        for run in &self.runs {
201            bytes.extend_from_slice(&run.value.to_le_bytes());
202            bytes.extend_from_slice(&run.length.to_le_bytes());
203        }
204
205        bytes
206    }
207
208    /// Deserializes run-length encoding from bytes.
209    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
210        let mut cursor = io::Cursor::new(bytes);
211
212        // Read run count
213        let mut buf = [0u8; 8];
214        cursor.read_exact(&mut buf)?;
215        let run_count = u64::from_le_bytes(buf) as usize;
216
217        // Read runs
218        let mut runs = Vec::with_capacity(run_count);
219        for _ in 0..run_count {
220            cursor.read_exact(&mut buf)?;
221            let value = u64::from_le_bytes(buf);
222
223            cursor.read_exact(&mut buf)?;
224            let length = u64::from_le_bytes(buf);
225
226            runs.push(Run::new(value, length));
227        }
228
229        Ok(Self::from_runs(runs))
230    }
231
232    /// Gets the value at a specific index without full decompression.
233    ///
234    /// Returns None if index is out of bounds.
235    #[must_use]
236    pub fn get(&self, index: usize) -> Option<u64> {
237        if index >= self.total_count {
238            return None;
239        }
240
241        let mut offset = 0usize;
242        for run in &self.runs {
243            let run_end = offset + run.length as usize;
244            if index < run_end {
245                return Some(run.value);
246            }
247            offset = run_end;
248        }
249
250        None
251    }
252
253    /// Returns an iterator over the decoded values.
254    pub fn iter(&self) -> RunLengthIterator<'_> {
255        RunLengthIterator {
256            runs: &self.runs,
257            run_index: 0,
258            within_run: 0,
259        }
260    }
261}
262
263/// Iterator over run-length encoded values.
264pub struct RunLengthIterator<'a> {
265    runs: &'a [Run<u64>],
266    run_index: usize,
267    within_run: u64,
268}
269
270impl Iterator for RunLengthIterator<'_> {
271    type Item = u64;
272
273    fn next(&mut self) -> Option<Self::Item> {
274        while self.run_index < self.runs.len() {
275            let run = &self.runs[self.run_index];
276            if self.within_run < run.length {
277                self.within_run += 1;
278                return Some(run.value);
279            }
280            self.run_index += 1;
281            self.within_run = 0;
282        }
283        None
284    }
285
286    fn size_hint(&self) -> (usize, Option<usize>) {
287        let remaining: u64 = self.runs[self.run_index..]
288            .iter()
289            .map(|r| r.length)
290            .sum::<u64>()
291            - self.within_run;
292        (remaining as usize, Some(remaining as usize))
293    }
294}
295
296impl ExactSizeIterator for RunLengthIterator<'_> {}
297
298/// Run-length encoding for signed integers.
299///
300/// Uses zigzag encoding internally for efficient storage.
301#[derive(Debug, Clone)]
302pub struct SignedRunLengthEncoding {
303    inner: RunLengthEncoding,
304}
305
306impl SignedRunLengthEncoding {
307    /// Encodes signed integers using run-length encoding.
308    #[must_use]
309    pub fn encode(values: &[i64]) -> Self {
310        let unsigned: Vec<u64> = values.iter().map(|&v| zigzag_encode(v)).collect();
311        Self {
312            inner: RunLengthEncoding::encode(&unsigned),
313        }
314    }
315
316    /// Decodes back to signed integers.
317    #[must_use]
318    pub fn decode(&self) -> Vec<i64> {
319        self.inner.decode().into_iter().map(zigzag_decode).collect()
320    }
321
322    /// Returns the compression ratio.
323    #[must_use]
324    pub fn compression_ratio(&self) -> f64 {
325        self.inner.compression_ratio()
326    }
327
328    /// Returns the number of runs.
329    #[must_use]
330    pub fn run_count(&self) -> usize {
331        self.inner.run_count()
332    }
333
334    /// Serializes to bytes.
335    pub fn to_bytes(&self) -> Vec<u8> {
336        self.inner.to_bytes()
337    }
338
339    /// Deserializes from bytes.
340    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
341        Ok(Self {
342            inner: RunLengthEncoding::from_bytes(bytes)?,
343        })
344    }
345}
346
347/// Zigzag encodes a signed integer to unsigned.
348#[inline]
349#[must_use]
350pub fn zigzag_encode(n: i64) -> u64 {
351    ((n << 1) ^ (n >> 63)) as u64
352}
353
354/// Zigzag decodes an unsigned integer to signed.
355#[inline]
356#[must_use]
357pub fn zigzag_decode(n: u64) -> i64 {
358    ((n >> 1) as i64) ^ -((n & 1) as i64)
359}
360
361/// Analyzes data to determine if run-length encoding is beneficial.
362pub struct RunLengthAnalyzer;
363
364impl RunLengthAnalyzer {
365    /// Estimates the compression ratio without actually encoding.
366    ///
367    /// This is faster than encoding for decision-making.
368    #[must_use]
369    pub fn estimate_ratio(values: &[u64]) -> f64 {
370        if values.is_empty() {
371            return 1.0;
372        }
373
374        // Count runs
375        let mut run_count = 1usize;
376        for i in 1..values.len() {
377            if values[i] != values[i - 1] {
378                run_count += 1;
379            }
380        }
381
382        // Original: values.len() * 8 bytes
383        // Encoded: run_count * 16 bytes
384        let original = values.len() * 8;
385        let encoded = run_count * 16;
386
387        if encoded == 0 {
388            return 1.0;
389        }
390
391        original as f64 / encoded as f64
392    }
393
394    /// Returns true if run-length encoding would be beneficial.
395    #[must_use]
396    pub fn is_beneficial(values: &[u64]) -> bool {
397        Self::estimate_ratio(values) > 1.0
398    }
399
400    /// Returns the average run length in the data.
401    #[must_use]
402    pub fn average_run_length(values: &[u64]) -> f64 {
403        if values.is_empty() {
404            return 0.0;
405        }
406
407        let mut run_count = 1usize;
408        for i in 1..values.len() {
409            if values[i] != values[i - 1] {
410                run_count += 1;
411            }
412        }
413
414        values.len() as f64 / run_count as f64
415    }
416}
417
418#[cfg(test)]
419mod tests {
420    use super::*;
421
422    #[test]
423    fn test_encode_decode_basic() {
424        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
425        let encoded = RunLengthEncoding::encode(&values);
426
427        assert_eq!(encoded.run_count(), 3);
428        assert_eq!(encoded.total_count(), 10);
429
430        let decoded = encoded.decode();
431        assert_eq!(values, decoded);
432    }
433
434    #[test]
435    fn test_encode_empty() {
436        let values: Vec<u64> = vec![];
437        let encoded = RunLengthEncoding::encode(&values);
438
439        assert_eq!(encoded.run_count(), 0);
440        assert_eq!(encoded.total_count(), 0);
441        assert!(encoded.is_empty());
442
443        let decoded = encoded.decode();
444        assert!(decoded.is_empty());
445    }
446
447    #[test]
448    fn test_encode_single() {
449        let values = vec![42];
450        let encoded = RunLengthEncoding::encode(&values);
451
452        assert_eq!(encoded.run_count(), 1);
453        assert_eq!(encoded.total_count(), 1);
454
455        let decoded = encoded.decode();
456        assert_eq!(values, decoded);
457    }
458
459    #[test]
460    fn test_encode_all_same() {
461        let values = vec![7u64; 1000];
462        let encoded = RunLengthEncoding::encode(&values);
463
464        assert_eq!(encoded.run_count(), 1);
465        assert_eq!(encoded.total_count(), 1000);
466
467        // Should compress very well
468        let ratio = encoded.compression_ratio();
469        assert!(ratio > 50.0, "Expected ratio > 50, got {}", ratio);
470
471        let decoded = encoded.decode();
472        assert_eq!(values, decoded);
473    }
474
475    #[test]
476    fn test_encode_all_different() {
477        let values: Vec<u64> = (0..100).collect();
478        let encoded = RunLengthEncoding::encode(&values);
479
480        assert_eq!(encoded.run_count(), 100);
481        assert_eq!(encoded.total_count(), 100);
482
483        // Should not compress (expand by 2x)
484        let ratio = encoded.compression_ratio();
485        assert!(ratio < 1.0, "Expected ratio < 1, got {}", ratio);
486
487        let decoded = encoded.decode();
488        assert_eq!(values, decoded);
489    }
490
491    #[test]
492    fn test_compression_ratio() {
493        // Perfect case: all same values
494        let all_same = vec![1u64; 100];
495        let encoded = RunLengthEncoding::encode(&all_same);
496        assert!(encoded.compression_ratio() > 1.0);
497        assert!(encoded.is_beneficial());
498
499        // Bad case: all different values
500        let all_diff: Vec<u64> = (0..100).collect();
501        let encoded = RunLengthEncoding::encode(&all_diff);
502        assert!(encoded.compression_ratio() < 1.0);
503        assert!(!encoded.is_beneficial());
504    }
505
506    #[test]
507    fn test_serialization() {
508        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
509        let encoded = RunLengthEncoding::encode(&values);
510
511        let bytes = encoded.to_bytes();
512        let decoded_encoding = RunLengthEncoding::from_bytes(&bytes).unwrap();
513
514        assert_eq!(encoded.run_count(), decoded_encoding.run_count());
515        assert_eq!(encoded.decode(), decoded_encoding.decode());
516    }
517
518    #[test]
519    fn test_get_index() {
520        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
521        let encoded = RunLengthEncoding::encode(&values);
522
523        assert_eq!(encoded.get(0), Some(1));
524        assert_eq!(encoded.get(2), Some(1));
525        assert_eq!(encoded.get(3), Some(2));
526        assert_eq!(encoded.get(4), Some(2));
527        assert_eq!(encoded.get(5), Some(3));
528        assert_eq!(encoded.get(9), Some(3));
529        assert_eq!(encoded.get(10), None);
530    }
531
532    #[test]
533    fn test_iterator() {
534        let values = vec![1, 1, 1, 2, 2, 3];
535        let encoded = RunLengthEncoding::encode(&values);
536
537        let iterated: Vec<u64> = encoded.iter().collect();
538        assert_eq!(values, iterated);
539    }
540
541    #[test]
542    fn test_signed_integers() {
543        let values = vec![-5, -5, -5, 0, 0, 10, 10, 10, 10];
544        let encoded = SignedRunLengthEncoding::encode(&values);
545
546        assert_eq!(encoded.run_count(), 3);
547
548        let decoded = encoded.decode();
549        assert_eq!(values, decoded);
550    }
551
552    #[test]
553    fn test_signed_serialization() {
554        let values = vec![-100, -100, 0, 0, 0, 100];
555        let encoded = SignedRunLengthEncoding::encode(&values);
556
557        let bytes = encoded.to_bytes();
558        let decoded_encoding = SignedRunLengthEncoding::from_bytes(&bytes).unwrap();
559
560        assert_eq!(encoded.decode(), decoded_encoding.decode());
561    }
562
563    #[test]
564    fn test_zigzag() {
565        assert_eq!(zigzag_encode(0), 0);
566        assert_eq!(zigzag_encode(-1), 1);
567        assert_eq!(zigzag_encode(1), 2);
568        assert_eq!(zigzag_encode(-2), 3);
569        assert_eq!(zigzag_encode(2), 4);
570
571        for i in -1000i64..1000 {
572            assert_eq!(zigzag_decode(zigzag_encode(i)), i);
573        }
574    }
575
576    #[test]
577    fn test_analyzer_estimate() {
578        let all_same = vec![1u64; 100];
579        let ratio = RunLengthAnalyzer::estimate_ratio(&all_same);
580        assert!(ratio > 1.0);
581        assert!(RunLengthAnalyzer::is_beneficial(&all_same));
582
583        let all_diff: Vec<u64> = (0..100).collect();
584        let ratio = RunLengthAnalyzer::estimate_ratio(&all_diff);
585        assert!(ratio < 1.0);
586        assert!(!RunLengthAnalyzer::is_beneficial(&all_diff));
587    }
588
589    #[test]
590    fn test_analyzer_average_run_length() {
591        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3]; // 3 runs, 10 values
592        let avg = RunLengthAnalyzer::average_run_length(&values);
593        assert!((avg - 3.33).abs() < 0.1);
594
595        let all_same = vec![1u64; 100];
596        let avg = RunLengthAnalyzer::average_run_length(&all_same);
597        assert!((avg - 100.0).abs() < 0.1);
598    }
599
600    #[test]
601    fn test_from_runs() {
602        let runs = vec![Run::new(1, 3), Run::new(2, 2), Run::new(3, 5)];
603        let encoded = RunLengthEncoding::from_runs(runs);
604
605        assert_eq!(encoded.run_count(), 3);
606        assert_eq!(encoded.total_count(), 10);
607        assert_eq!(encoded.decode(), vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3]);
608    }
609}