Skip to main content

grafeo_core/codec/
runlength.rs

1//! Run-length encoding for highly repetitive data.
2//!
3//! Run-length encoding (RLE) compresses sequences with consecutive repeated
4//! values by storing each unique value once along with its run length.
5//!
6//! | Data pattern | Compression ratio |
7//! | ------------ | ----------------- |
8//! | Constant value | ~100x |
9//! | Few distinct values, long runs | 10-50x |
10//! | Many short runs | 2-5x |
11//! | Random data | < 1x (expansion) |
12//!
13//! # Example
14//!
15//! ```no_run
16//! use grafeo_core::codec::RunLengthEncoding;
17//!
18//! // Compress data with many repeated values
19//! let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
20//! let encoded = RunLengthEncoding::encode(&values);
21//!
22//! println!("Runs: {}", encoded.run_count()); // 3 runs
23//! println!("Compression: {:.1}x", encoded.compression_ratio()); // ~3.3x
24//!
25//! // Decode back to original
26//! let decoded = encoded.decode();
27//! assert_eq!(values, decoded);
28//! ```
29
30use std::io::{self, Read};
31
32/// A run in run-length encoding: a value and how many times it repeats.
33#[derive(Debug, Clone, PartialEq, Eq)]
34pub struct Run<T> {
35    /// The value for this run.
36    pub value: T,
37    /// Number of consecutive occurrences.
38    pub length: u64,
39}
40
41impl<T> Run<T> {
42    /// Creates a new run.
43    #[must_use]
44    pub fn new(value: T, length: u64) -> Self {
45        Self { value, length }
46    }
47}
48
49/// Run-length encoded data for u64 values.
50///
51/// Stores sequences of (value, count) pairs. Achieves excellent compression
52/// when data has long runs of repeated values.
53#[derive(Debug, Clone)]
54pub struct RunLengthEncoding {
55    /// The runs: each is (value, count).
56    runs: Vec<Run<u64>>,
57    /// Total number of values (sum of all run lengths).
58    total_count: usize,
59}
60
61impl<'a> IntoIterator for &'a RunLengthEncoding {
62    type Item = u64;
63    type IntoIter = RunLengthIterator<'a>;
64
65    fn into_iter(self) -> Self::IntoIter {
66        self.iter()
67    }
68}
69
70impl RunLengthEncoding {
71    /// Encodes a slice of u64 values using run-length encoding.
72    ///
73    /// # Example
74    /// ```no_run
75    /// # use grafeo_core::codec::runlength::RunLengthEncoding;
76    /// let values = vec![1, 1, 1, 2, 2, 3];
77    /// let encoded = RunLengthEncoding::encode(&values);
78    /// // Results in 3 runs: (1, 3), (2, 2), (3, 1)
79    /// ```
80    #[must_use]
81    pub fn encode(values: &[u64]) -> Self {
82        if values.is_empty() {
83            return Self {
84                runs: Vec::new(),
85                total_count: 0,
86            };
87        }
88
89        let mut runs = Vec::new();
90        let mut current_value = values[0];
91        let mut current_length = 1u64;
92
93        for &value in &values[1..] {
94            if value == current_value {
95                current_length += 1;
96            } else {
97                runs.push(Run::new(current_value, current_length));
98                current_value = value;
99                current_length = 1;
100            }
101        }
102
103        // Don't forget the last run
104        runs.push(Run::new(current_value, current_length));
105
106        Self {
107            runs,
108            total_count: values.len(),
109        }
110    }
111
112    /// Creates a run-length encoding from pre-built runs.
113    #[must_use]
114    pub fn from_runs(runs: Vec<Run<u64>>) -> Self {
115        let total_count = runs.iter().map(|r| r.length as usize).sum();
116        Self { runs, total_count }
117    }
118
119    /// Decodes back to the original values.
120    #[must_use]
121    pub fn decode(&self) -> Vec<u64> {
122        let mut values = Vec::with_capacity(self.total_count);
123
124        for run in &self.runs {
125            for _ in 0..run.length {
126                values.push(run.value);
127            }
128        }
129
130        values
131    }
132
133    /// Returns the number of runs.
134    #[must_use]
135    pub fn run_count(&self) -> usize {
136        self.runs.len()
137    }
138
139    /// Returns the total number of values represented.
140    #[must_use]
141    pub fn total_count(&self) -> usize {
142        self.total_count
143    }
144
145    /// Returns the runs.
146    #[must_use]
147    pub fn runs(&self) -> &[Run<u64>] {
148        &self.runs
149    }
150
151    /// Returns true if there are no values.
152    #[must_use]
153    pub fn is_empty(&self) -> bool {
154        self.total_count == 0
155    }
156
157    /// Returns the compression ratio (original size / encoded size).
158    ///
159    /// Values > 1.0 indicate compression, < 1.0 indicate expansion.
160    #[must_use]
161    pub fn compression_ratio(&self) -> f64 {
162        if self.runs.is_empty() {
163            return 1.0;
164        }
165
166        // Original: total_count * 8 bytes
167        // Encoded: runs.len() * 16 bytes (8 for value, 8 for length)
168        let original_size = self.total_count * 8;
169        let encoded_size = self.runs.len() * 16;
170
171        if encoded_size == 0 {
172            return 1.0;
173        }
174
175        original_size as f64 / encoded_size as f64
176    }
177
178    /// Returns true if run-length encoding is beneficial for this data.
179    ///
180    /// Returns true when compression ratio > 1.0 (actual compression achieved).
181    #[must_use]
182    pub fn is_beneficial(&self) -> bool {
183        self.compression_ratio() > 1.0
184    }
185
186    /// Returns the memory size in bytes of the encoded representation.
187    #[must_use]
188    pub fn encoded_size(&self) -> usize {
189        // Each run: 8 bytes value + 8 bytes length
190        self.runs.len() * 16
191    }
192
193    /// Serializes the run-length encoding to bytes.
194    pub fn to_bytes(&self) -> Vec<u8> {
195        let mut bytes = Vec::with_capacity(8 + self.runs.len() * 16);
196
197        // Write run count
198        bytes.extend_from_slice(&(self.runs.len() as u64).to_le_bytes());
199
200        // Write each run
201        for run in &self.runs {
202            bytes.extend_from_slice(&run.value.to_le_bytes());
203            bytes.extend_from_slice(&run.length.to_le_bytes());
204        }
205
206        bytes
207    }
208
209    /// Deserializes run-length encoding from bytes.
210    ///
211    /// # Errors
212    ///
213    /// Returns `Err` if the byte slice is too short or contains invalid run data.
214    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
215        let mut cursor = io::Cursor::new(bytes);
216
217        // Read run count
218        let mut buf = [0u8; 8];
219        cursor.read_exact(&mut buf)?;
220        let run_count = u64::from_le_bytes(buf) as usize;
221
222        // Read runs
223        let mut runs = Vec::with_capacity(run_count);
224        for _ in 0..run_count {
225            cursor.read_exact(&mut buf)?;
226            let value = u64::from_le_bytes(buf);
227
228            cursor.read_exact(&mut buf)?;
229            let length = u64::from_le_bytes(buf);
230
231            runs.push(Run::new(value, length));
232        }
233
234        Ok(Self::from_runs(runs))
235    }
236
237    /// Gets the value at a specific index without full decompression.
238    ///
239    /// Returns None if index is out of bounds.
240    #[must_use]
241    pub fn get(&self, index: usize) -> Option<u64> {
242        if index >= self.total_count {
243            return None;
244        }
245
246        let mut offset = 0usize;
247        for run in &self.runs {
248            let run_end = offset + run.length as usize;
249            if index < run_end {
250                return Some(run.value);
251            }
252            offset = run_end;
253        }
254
255        None
256    }
257
258    /// Returns an iterator over the decoded values.
259    pub fn iter(&self) -> RunLengthIterator<'_> {
260        RunLengthIterator {
261            runs: &self.runs,
262            run_index: 0,
263            within_run: 0,
264        }
265    }
266}
267
268/// Iterator over run-length encoded values.
269pub struct RunLengthIterator<'a> {
270    runs: &'a [Run<u64>],
271    run_index: usize,
272    within_run: u64,
273}
274
275impl Iterator for RunLengthIterator<'_> {
276    type Item = u64;
277
278    fn next(&mut self) -> Option<Self::Item> {
279        while self.run_index < self.runs.len() {
280            let run = &self.runs[self.run_index];
281            if self.within_run < run.length {
282                self.within_run += 1;
283                return Some(run.value);
284            }
285            self.run_index += 1;
286            self.within_run = 0;
287        }
288        None
289    }
290
291    fn size_hint(&self) -> (usize, Option<usize>) {
292        let remaining: u64 = self.runs[self.run_index..]
293            .iter()
294            .map(|r| r.length)
295            .sum::<u64>()
296            - self.within_run;
297        (remaining as usize, Some(remaining as usize))
298    }
299}
300
301impl ExactSizeIterator for RunLengthIterator<'_> {}
302
303/// Run-length encoding for signed integers.
304///
305/// Uses zigzag encoding internally for efficient storage.
306#[derive(Debug, Clone)]
307pub struct SignedRunLengthEncoding {
308    inner: RunLengthEncoding,
309}
310
311impl SignedRunLengthEncoding {
312    /// Encodes signed integers using run-length encoding.
313    #[must_use]
314    pub fn encode(values: &[i64]) -> Self {
315        let unsigned: Vec<u64> = values.iter().map(|&v| zigzag_encode(v)).collect();
316        Self {
317            inner: RunLengthEncoding::encode(&unsigned),
318        }
319    }
320
321    /// Decodes back to signed integers.
322    #[must_use]
323    pub fn decode(&self) -> Vec<i64> {
324        self.inner.decode().into_iter().map(zigzag_decode).collect()
325    }
326
327    /// Returns the compression ratio.
328    #[must_use]
329    pub fn compression_ratio(&self) -> f64 {
330        self.inner.compression_ratio()
331    }
332
333    /// Returns the number of runs.
334    #[must_use]
335    pub fn run_count(&self) -> usize {
336        self.inner.run_count()
337    }
338
339    /// Serializes to bytes.
340    pub fn to_bytes(&self) -> Vec<u8> {
341        self.inner.to_bytes()
342    }
343
344    /// Deserializes from bytes.
345    ///
346    /// # Errors
347    ///
348    /// Returns `Err` if the byte slice is too short or contains invalid run data.
349    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
350        Ok(Self {
351            inner: RunLengthEncoding::from_bytes(bytes)?,
352        })
353    }
354}
355
356/// Zigzag encodes a signed integer to unsigned.
357#[inline]
358#[must_use]
359pub fn zigzag_encode(n: i64) -> u64 {
360    ((n << 1) ^ (n >> 63)) as u64
361}
362
363/// Zigzag decodes an unsigned integer to signed.
364#[inline]
365#[must_use]
366pub fn zigzag_decode(n: u64) -> i64 {
367    ((n >> 1) as i64) ^ -((n & 1) as i64)
368}
369
370/// Analyzes data to determine if run-length encoding is beneficial.
371pub struct RunLengthAnalyzer;
372
373impl RunLengthAnalyzer {
374    /// Estimates the compression ratio without actually encoding.
375    ///
376    /// This is faster than encoding for decision-making.
377    #[must_use]
378    pub fn estimate_ratio(values: &[u64]) -> f64 {
379        if values.is_empty() {
380            return 1.0;
381        }
382
383        // Count runs
384        let mut run_count = 1usize;
385        for i in 1..values.len() {
386            if values[i] != values[i - 1] {
387                run_count += 1;
388            }
389        }
390
391        // Original: values.len() * 8 bytes
392        // Encoded: run_count * 16 bytes
393        let original = values.len() * 8;
394        let encoded = run_count * 16;
395
396        if encoded == 0 {
397            return 1.0;
398        }
399
400        original as f64 / encoded as f64
401    }
402
403    /// Returns true if run-length encoding would be beneficial.
404    #[must_use]
405    pub fn is_beneficial(values: &[u64]) -> bool {
406        Self::estimate_ratio(values) > 1.0
407    }
408
409    /// Returns the average run length in the data.
410    #[must_use]
411    pub fn average_run_length(values: &[u64]) -> f64 {
412        if values.is_empty() {
413            return 0.0;
414        }
415
416        let mut run_count = 1usize;
417        for i in 1..values.len() {
418            if values[i] != values[i - 1] {
419                run_count += 1;
420            }
421        }
422
423        values.len() as f64 / run_count as f64
424    }
425}
426
427#[cfg(test)]
428mod tests {
429    use super::*;
430
431    #[test]
432    fn test_encode_decode_basic() {
433        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
434        let encoded = RunLengthEncoding::encode(&values);
435
436        assert_eq!(encoded.run_count(), 3);
437        assert_eq!(encoded.total_count(), 10);
438
439        let decoded = encoded.decode();
440        assert_eq!(values, decoded);
441    }
442
443    #[test]
444    fn test_encode_empty() {
445        let values: Vec<u64> = vec![];
446        let encoded = RunLengthEncoding::encode(&values);
447
448        assert_eq!(encoded.run_count(), 0);
449        assert_eq!(encoded.total_count(), 0);
450        assert!(encoded.is_empty());
451
452        let decoded = encoded.decode();
453        assert!(decoded.is_empty());
454    }
455
456    #[test]
457    fn test_encode_single() {
458        let values = vec![42];
459        let encoded = RunLengthEncoding::encode(&values);
460
461        assert_eq!(encoded.run_count(), 1);
462        assert_eq!(encoded.total_count(), 1);
463
464        let decoded = encoded.decode();
465        assert_eq!(values, decoded);
466    }
467
468    #[test]
469    fn test_encode_all_same() {
470        let values = vec![7u64; 1000];
471        let encoded = RunLengthEncoding::encode(&values);
472
473        assert_eq!(encoded.run_count(), 1);
474        assert_eq!(encoded.total_count(), 1000);
475
476        // Should compress very well
477        let ratio = encoded.compression_ratio();
478        assert!(ratio > 50.0, "Expected ratio > 50, got {}", ratio);
479
480        let decoded = encoded.decode();
481        assert_eq!(values, decoded);
482    }
483
484    #[test]
485    fn test_encode_all_different() {
486        let values: Vec<u64> = (0..100).collect();
487        let encoded = RunLengthEncoding::encode(&values);
488
489        assert_eq!(encoded.run_count(), 100);
490        assert_eq!(encoded.total_count(), 100);
491
492        // Should not compress (expand by 2x)
493        let ratio = encoded.compression_ratio();
494        assert!(ratio < 1.0, "Expected ratio < 1, got {}", ratio);
495
496        let decoded = encoded.decode();
497        assert_eq!(values, decoded);
498    }
499
500    #[test]
501    fn test_compression_ratio() {
502        // Perfect case: all same values
503        let all_same = vec![1u64; 100];
504        let encoded = RunLengthEncoding::encode(&all_same);
505        assert!(encoded.compression_ratio() > 1.0);
506        assert!(encoded.is_beneficial());
507
508        // Bad case: all different values
509        let all_diff: Vec<u64> = (0..100).collect();
510        let encoded = RunLengthEncoding::encode(&all_diff);
511        assert!(encoded.compression_ratio() < 1.0);
512        assert!(!encoded.is_beneficial());
513    }
514
515    #[test]
516    fn test_serialization() {
517        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
518        let encoded = RunLengthEncoding::encode(&values);
519
520        let bytes = encoded.to_bytes();
521        let decoded_encoding = RunLengthEncoding::from_bytes(&bytes).unwrap();
522
523        assert_eq!(encoded.run_count(), decoded_encoding.run_count());
524        assert_eq!(encoded.decode(), decoded_encoding.decode());
525    }
526
527    #[test]
528    fn test_get_index() {
529        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
530        let encoded = RunLengthEncoding::encode(&values);
531
532        assert_eq!(encoded.get(0), Some(1));
533        assert_eq!(encoded.get(2), Some(1));
534        assert_eq!(encoded.get(3), Some(2));
535        assert_eq!(encoded.get(4), Some(2));
536        assert_eq!(encoded.get(5), Some(3));
537        assert_eq!(encoded.get(9), Some(3));
538        assert_eq!(encoded.get(10), None);
539    }
540
541    #[test]
542    fn test_iterator() {
543        let values = vec![1, 1, 1, 2, 2, 3];
544        let encoded = RunLengthEncoding::encode(&values);
545
546        let iterated: Vec<u64> = encoded.iter().collect();
547        assert_eq!(values, iterated);
548    }
549
550    #[test]
551    fn test_signed_integers() {
552        let values = vec![-5, -5, -5, 0, 0, 10, 10, 10, 10];
553        let encoded = SignedRunLengthEncoding::encode(&values);
554
555        assert_eq!(encoded.run_count(), 3);
556
557        let decoded = encoded.decode();
558        assert_eq!(values, decoded);
559    }
560
561    #[test]
562    fn test_signed_serialization() {
563        let values = vec![-100, -100, 0, 0, 0, 100];
564        let encoded = SignedRunLengthEncoding::encode(&values);
565
566        let bytes = encoded.to_bytes();
567        let decoded_encoding = SignedRunLengthEncoding::from_bytes(&bytes).unwrap();
568
569        assert_eq!(encoded.decode(), decoded_encoding.decode());
570    }
571
572    #[test]
573    fn test_zigzag() {
574        assert_eq!(zigzag_encode(0), 0);
575        assert_eq!(zigzag_encode(-1), 1);
576        assert_eq!(zigzag_encode(1), 2);
577        assert_eq!(zigzag_encode(-2), 3);
578        assert_eq!(zigzag_encode(2), 4);
579
580        for i in -1000i64..1000 {
581            assert_eq!(zigzag_decode(zigzag_encode(i)), i);
582        }
583    }
584
585    #[test]
586    fn test_analyzer_estimate() {
587        let all_same = vec![1u64; 100];
588        let ratio = RunLengthAnalyzer::estimate_ratio(&all_same);
589        assert!(ratio > 1.0);
590        assert!(RunLengthAnalyzer::is_beneficial(&all_same));
591
592        let all_diff: Vec<u64> = (0..100).collect();
593        let ratio = RunLengthAnalyzer::estimate_ratio(&all_diff);
594        assert!(ratio < 1.0);
595        assert!(!RunLengthAnalyzer::is_beneficial(&all_diff));
596    }
597
598    #[test]
599    fn test_analyzer_average_run_length() {
600        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3]; // 3 runs, 10 values
601        let avg = RunLengthAnalyzer::average_run_length(&values);
602        assert!((avg - 3.33).abs() < 0.1);
603
604        let all_same = vec![1u64; 100];
605        let avg = RunLengthAnalyzer::average_run_length(&all_same);
606        assert!((avg - 100.0).abs() < 0.1);
607    }
608
609    #[test]
610    fn test_from_runs() {
611        let runs = vec![Run::new(1, 3), Run::new(2, 2), Run::new(3, 5)];
612        let encoded = RunLengthEncoding::from_runs(runs);
613
614        assert_eq!(encoded.run_count(), 3);
615        assert_eq!(encoded.total_count(), 10);
616        assert_eq!(encoded.decode(), vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3]);
617    }
618}