Skip to main content

grafeo_core/codec/
runlength.rs

1//! Run-length encoding for highly repetitive data.
2//!
3//! Run-length encoding (RLE) compresses sequences with consecutive repeated
4//! values by storing each unique value once along with its run length.
5//!
6//! | Data pattern | Compression ratio |
7//! | ------------ | ----------------- |
8//! | Constant value | ~100x |
9//! | Few distinct values, long runs | 10-50x |
10//! | Many short runs | 2-5x |
11//! | Random data | < 1x (expansion) |
12//!
13//! # Example
14//!
15//! ```no_run
16//! use grafeo_core::codec::RunLengthEncoding;
17//!
18//! // Compress data with many repeated values
19//! let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
20//! let encoded = RunLengthEncoding::encode(&values);
21//!
22//! println!("Runs: {}", encoded.run_count()); // 3 runs
23//! println!("Compression: {:.1}x", encoded.compression_ratio()); // ~3.3x
24//!
25//! // Decode back to original
26//! let decoded = encoded.decode();
27//! assert_eq!(values, decoded);
28//! ```
29
30use std::io::{self, Read};
31
32/// A run in run-length encoding: a value and how many times it repeats.
33#[derive(Debug, Clone, PartialEq, Eq)]
34pub struct Run<T> {
35    /// The value for this run.
36    pub value: T,
37    /// Number of consecutive occurrences.
38    pub length: u64,
39}
40
41impl<T> Run<T> {
42    /// Creates a new run.
43    #[must_use]
44    pub fn new(value: T, length: u64) -> Self {
45        Self { value, length }
46    }
47}
48
49/// Run-length encoded data for u64 values.
50///
51/// Stores sequences of (value, count) pairs. Achieves excellent compression
52/// when data has long runs of repeated values.
53#[derive(Debug, Clone)]
54pub struct RunLengthEncoding {
55    /// The runs: each is (value, count).
56    runs: Vec<Run<u64>>,
57    /// Total number of values (sum of all run lengths).
58    total_count: usize,
59}
60
61impl<'a> IntoIterator for &'a RunLengthEncoding {
62    type Item = u64;
63    type IntoIter = RunLengthIterator<'a>;
64
65    fn into_iter(self) -> Self::IntoIter {
66        self.iter()
67    }
68}
69
70impl RunLengthEncoding {
71    /// Encodes a slice of u64 values using run-length encoding.
72    ///
73    /// # Example
74    /// ```no_run
75    /// # use grafeo_core::codec::runlength::RunLengthEncoding;
76    /// let values = vec![1, 1, 1, 2, 2, 3];
77    /// let encoded = RunLengthEncoding::encode(&values);
78    /// // Results in 3 runs: (1, 3), (2, 2), (3, 1)
79    /// ```
80    #[must_use]
81    pub fn encode(values: &[u64]) -> Self {
82        if values.is_empty() {
83            return Self {
84                runs: Vec::new(),
85                total_count: 0,
86            };
87        }
88
89        let mut runs = Vec::new();
90        let mut current_value = values[0];
91        let mut current_length = 1u64;
92
93        for &value in &values[1..] {
94            if value == current_value {
95                current_length += 1;
96            } else {
97                runs.push(Run::new(current_value, current_length));
98                current_value = value;
99                current_length = 1;
100            }
101        }
102
103        // Don't forget the last run
104        runs.push(Run::new(current_value, current_length));
105
106        Self {
107            runs,
108            total_count: values.len(),
109        }
110    }
111
112    /// Creates a run-length encoding from pre-built runs.
113    #[must_use]
114    pub fn from_runs(runs: Vec<Run<u64>>) -> Self {
115        let total_count = runs
116            .iter()
117            .map(|r| {
118                // reason: run lengths are bounded by practical data sizes
119                #[allow(clippy::cast_possible_truncation)]
120                let len = r.length as usize;
121                len
122            })
123            .sum();
124        Self { runs, total_count }
125    }
126
127    /// Decodes back to the original values.
128    #[must_use]
129    pub fn decode(&self) -> Vec<u64> {
130        let mut values = Vec::with_capacity(self.total_count);
131
132        for run in &self.runs {
133            for _ in 0..run.length {
134                values.push(run.value);
135            }
136        }
137
138        values
139    }
140
141    /// Returns the number of runs.
142    #[must_use]
143    pub fn run_count(&self) -> usize {
144        self.runs.len()
145    }
146
147    /// Returns the total number of values represented.
148    #[must_use]
149    pub fn total_count(&self) -> usize {
150        self.total_count
151    }
152
153    /// Returns the runs.
154    #[must_use]
155    pub fn runs(&self) -> &[Run<u64>] {
156        &self.runs
157    }
158
159    /// Returns true if there are no values.
160    #[must_use]
161    pub fn is_empty(&self) -> bool {
162        self.total_count == 0
163    }
164
165    /// Returns the compression ratio (original size / encoded size).
166    ///
167    /// Values > 1.0 indicate compression, < 1.0 indicate expansion.
168    #[must_use]
169    pub fn compression_ratio(&self) -> f64 {
170        if self.runs.is_empty() {
171            return 1.0;
172        }
173
174        // Original: total_count * 8 bytes
175        // Encoded: runs.len() * 16 bytes (8 for value, 8 for length)
176        let original_size = self.total_count * 8;
177        let encoded_size = self.runs.len() * 16;
178
179        if encoded_size == 0 {
180            return 1.0;
181        }
182
183        original_size as f64 / encoded_size as f64
184    }
185
186    /// Returns true if run-length encoding is beneficial for this data.
187    ///
188    /// Returns true when compression ratio > 1.0 (actual compression achieved).
189    #[must_use]
190    pub fn is_beneficial(&self) -> bool {
191        self.compression_ratio() > 1.0
192    }
193
194    /// Returns the memory size in bytes of the encoded representation.
195    #[must_use]
196    pub fn encoded_size(&self) -> usize {
197        // Each run: 8 bytes value + 8 bytes length
198        self.runs.len() * 16
199    }
200
201    /// Serializes the run-length encoding to bytes.
202    pub fn to_bytes(&self) -> Vec<u8> {
203        let mut bytes = Vec::with_capacity(8 + self.runs.len() * 16);
204
205        // Write run count
206        bytes.extend_from_slice(&(self.runs.len() as u64).to_le_bytes());
207
208        // Write each run
209        for run in &self.runs {
210            bytes.extend_from_slice(&run.value.to_le_bytes());
211            bytes.extend_from_slice(&run.length.to_le_bytes());
212        }
213
214        bytes
215    }
216
217    /// Deserializes run-length encoding from bytes.
218    ///
219    /// # Errors
220    ///
221    /// Returns `Err` if the byte slice is too short or contains invalid run data.
222    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
223        let mut cursor = io::Cursor::new(bytes);
224
225        // Read run count
226        let mut buf = [0u8; 8];
227        cursor.read_exact(&mut buf)?;
228        // reason: run count is bounded by practical data sizes
229        #[allow(clippy::cast_possible_truncation)]
230        let run_count = u64::from_le_bytes(buf) as usize;
231
232        // Read runs
233        let mut runs = Vec::with_capacity(run_count);
234        for _ in 0..run_count {
235            cursor.read_exact(&mut buf)?;
236            let value = u64::from_le_bytes(buf);
237
238            cursor.read_exact(&mut buf)?;
239            let length = u64::from_le_bytes(buf);
240
241            runs.push(Run::new(value, length));
242        }
243
244        Ok(Self::from_runs(runs))
245    }
246
247    /// Gets the value at a specific index without full decompression.
248    ///
249    /// Returns None if index is out of bounds.
250    #[must_use]
251    pub fn get(&self, index: usize) -> Option<u64> {
252        if index >= self.total_count {
253            return None;
254        }
255
256        let mut offset = 0usize;
257        for run in &self.runs {
258            // reason: run length is bounded by total_count which is usize
259            #[allow(clippy::cast_possible_truncation)]
260            let run_end = offset + run.length as usize;
261            if index < run_end {
262                return Some(run.value);
263            }
264            offset = run_end;
265        }
266
267        None
268    }
269
270    /// Returns an iterator over the decoded values.
271    pub fn iter(&self) -> RunLengthIterator<'_> {
272        RunLengthIterator {
273            runs: &self.runs,
274            run_index: 0,
275            within_run: 0,
276        }
277    }
278}
279
280/// Iterator over run-length encoded values.
281pub struct RunLengthIterator<'a> {
282    runs: &'a [Run<u64>],
283    run_index: usize,
284    within_run: u64,
285}
286
287impl Iterator for RunLengthIterator<'_> {
288    type Item = u64;
289
290    fn next(&mut self) -> Option<Self::Item> {
291        while self.run_index < self.runs.len() {
292            let run = &self.runs[self.run_index];
293            if self.within_run < run.length {
294                self.within_run += 1;
295                return Some(run.value);
296            }
297            self.run_index += 1;
298            self.within_run = 0;
299        }
300        None
301    }
302
303    fn size_hint(&self) -> (usize, Option<usize>) {
304        let remaining: u64 = self.runs[self.run_index..]
305            .iter()
306            .map(|r| r.length)
307            .sum::<u64>()
308            - self.within_run;
309        // reason: remaining count is bounded by total data length
310        #[allow(clippy::cast_possible_truncation)]
311        (remaining as usize, Some(remaining as usize))
312    }
313}
314
315impl ExactSizeIterator for RunLengthIterator<'_> {}
316
317/// Run-length encoding for signed integers.
318///
319/// Uses zigzag encoding internally for efficient storage.
320#[derive(Debug, Clone)]
321pub struct SignedRunLengthEncoding {
322    inner: RunLengthEncoding,
323}
324
325impl SignedRunLengthEncoding {
326    /// Encodes signed integers using run-length encoding.
327    #[must_use]
328    pub fn encode(values: &[i64]) -> Self {
329        let unsigned: Vec<u64> = values.iter().map(|&v| zigzag_encode(v)).collect();
330        Self {
331            inner: RunLengthEncoding::encode(&unsigned),
332        }
333    }
334
335    /// Decodes back to signed integers.
336    #[must_use]
337    pub fn decode(&self) -> Vec<i64> {
338        self.inner.decode().into_iter().map(zigzag_decode).collect()
339    }
340
341    /// Returns the compression ratio.
342    #[must_use]
343    pub fn compression_ratio(&self) -> f64 {
344        self.inner.compression_ratio()
345    }
346
347    /// Returns the number of runs.
348    #[must_use]
349    pub fn run_count(&self) -> usize {
350        self.inner.run_count()
351    }
352
353    /// Serializes to bytes.
354    pub fn to_bytes(&self) -> Vec<u8> {
355        self.inner.to_bytes()
356    }
357
358    /// Deserializes from bytes.
359    ///
360    /// # Errors
361    ///
362    /// Returns `Err` if the byte slice is too short or contains invalid run data.
363    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
364        Ok(Self {
365            inner: RunLengthEncoding::from_bytes(bytes)?,
366        })
367    }
368}
369
370/// Zigzag encodes a signed integer to unsigned.
371#[inline]
372#[must_use]
373// reason: zig-zag encoding intentionally reinterprets bits, no data loss
374#[allow(clippy::cast_sign_loss)]
375pub fn zigzag_encode(n: i64) -> u64 {
376    ((n << 1) ^ (n >> 63)) as u64
377}
378
379/// Zigzag decodes an unsigned integer to signed.
380#[inline]
381#[must_use]
382// reason: zig-zag decoding intentionally reinterprets bits, inverse of encode
383#[allow(clippy::cast_possible_wrap)]
384pub fn zigzag_decode(n: u64) -> i64 {
385    ((n >> 1) as i64) ^ -((n & 1) as i64)
386}
387
388/// Analyzes data to determine if run-length encoding is beneficial.
389pub struct RunLengthAnalyzer;
390
391impl RunLengthAnalyzer {
392    /// Estimates the compression ratio without actually encoding.
393    ///
394    /// This is faster than encoding for decision-making.
395    #[must_use]
396    pub fn estimate_ratio(values: &[u64]) -> f64 {
397        if values.is_empty() {
398            return 1.0;
399        }
400
401        // Count runs
402        let mut run_count = 1usize;
403        for i in 1..values.len() {
404            if values[i] != values[i - 1] {
405                run_count += 1;
406            }
407        }
408
409        // Original: values.len() * 8 bytes
410        // Encoded: run_count * 16 bytes
411        let original = values.len() * 8;
412        let encoded = run_count * 16;
413
414        if encoded == 0 {
415            return 1.0;
416        }
417
418        original as f64 / encoded as f64
419    }
420
421    /// Returns true if run-length encoding would be beneficial.
422    #[must_use]
423    pub fn is_beneficial(values: &[u64]) -> bool {
424        Self::estimate_ratio(values) > 1.0
425    }
426
427    /// Returns the average run length in the data.
428    #[must_use]
429    pub fn average_run_length(values: &[u64]) -> f64 {
430        if values.is_empty() {
431            return 0.0;
432        }
433
434        let mut run_count = 1usize;
435        for i in 1..values.len() {
436            if values[i] != values[i - 1] {
437                run_count += 1;
438            }
439        }
440
441        values.len() as f64 / run_count as f64
442    }
443}
444
445#[cfg(test)]
446mod tests {
447    use super::*;
448
449    #[test]
450    fn test_encode_decode_basic() {
451        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
452        let encoded = RunLengthEncoding::encode(&values);
453
454        assert_eq!(encoded.run_count(), 3);
455        assert_eq!(encoded.total_count(), 10);
456
457        let decoded = encoded.decode();
458        assert_eq!(values, decoded);
459    }
460
461    #[test]
462    fn test_encode_empty() {
463        let values: Vec<u64> = vec![];
464        let encoded = RunLengthEncoding::encode(&values);
465
466        assert_eq!(encoded.run_count(), 0);
467        assert_eq!(encoded.total_count(), 0);
468        assert!(encoded.is_empty());
469
470        let decoded = encoded.decode();
471        assert!(decoded.is_empty());
472    }
473
474    #[test]
475    fn test_encode_single() {
476        let values = vec![42];
477        let encoded = RunLengthEncoding::encode(&values);
478
479        assert_eq!(encoded.run_count(), 1);
480        assert_eq!(encoded.total_count(), 1);
481
482        let decoded = encoded.decode();
483        assert_eq!(values, decoded);
484    }
485
486    #[test]
487    fn test_encode_all_same() {
488        let values = vec![7u64; 1000];
489        let encoded = RunLengthEncoding::encode(&values);
490
491        assert_eq!(encoded.run_count(), 1);
492        assert_eq!(encoded.total_count(), 1000);
493
494        // Should compress very well
495        let ratio = encoded.compression_ratio();
496        assert!(ratio > 50.0, "Expected ratio > 50, got {}", ratio);
497
498        let decoded = encoded.decode();
499        assert_eq!(values, decoded);
500    }
501
502    #[test]
503    fn test_encode_all_different() {
504        let values: Vec<u64> = (0..100).collect();
505        let encoded = RunLengthEncoding::encode(&values);
506
507        assert_eq!(encoded.run_count(), 100);
508        assert_eq!(encoded.total_count(), 100);
509
510        // Should not compress (expand by 2x)
511        let ratio = encoded.compression_ratio();
512        assert!(ratio < 1.0, "Expected ratio < 1, got {}", ratio);
513
514        let decoded = encoded.decode();
515        assert_eq!(values, decoded);
516    }
517
518    #[test]
519    fn test_compression_ratio() {
520        // Perfect case: all same values
521        let all_same = vec![1u64; 100];
522        let encoded = RunLengthEncoding::encode(&all_same);
523        assert!(encoded.compression_ratio() > 1.0);
524        assert!(encoded.is_beneficial());
525
526        // Bad case: all different values
527        let all_diff: Vec<u64> = (0..100).collect();
528        let encoded = RunLengthEncoding::encode(&all_diff);
529        assert!(encoded.compression_ratio() < 1.0);
530        assert!(!encoded.is_beneficial());
531    }
532
533    #[test]
534    fn test_serialization() {
535        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
536        let encoded = RunLengthEncoding::encode(&values);
537
538        let bytes = encoded.to_bytes();
539        let decoded_encoding = RunLengthEncoding::from_bytes(&bytes).unwrap();
540
541        assert_eq!(encoded.run_count(), decoded_encoding.run_count());
542        assert_eq!(encoded.decode(), decoded_encoding.decode());
543    }
544
545    #[test]
546    fn test_get_index() {
547        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3];
548        let encoded = RunLengthEncoding::encode(&values);
549
550        assert_eq!(encoded.get(0), Some(1));
551        assert_eq!(encoded.get(2), Some(1));
552        assert_eq!(encoded.get(3), Some(2));
553        assert_eq!(encoded.get(4), Some(2));
554        assert_eq!(encoded.get(5), Some(3));
555        assert_eq!(encoded.get(9), Some(3));
556        assert_eq!(encoded.get(10), None);
557    }
558
559    #[test]
560    fn test_iterator() {
561        let values = vec![1, 1, 1, 2, 2, 3];
562        let encoded = RunLengthEncoding::encode(&values);
563
564        let iterated: Vec<u64> = encoded.iter().collect();
565        assert_eq!(values, iterated);
566    }
567
568    #[test]
569    fn test_signed_integers() {
570        let values = vec![-5, -5, -5, 0, 0, 10, 10, 10, 10];
571        let encoded = SignedRunLengthEncoding::encode(&values);
572
573        assert_eq!(encoded.run_count(), 3);
574
575        let decoded = encoded.decode();
576        assert_eq!(values, decoded);
577    }
578
579    #[test]
580    fn test_signed_serialization() {
581        let values = vec![-100, -100, 0, 0, 0, 100];
582        let encoded = SignedRunLengthEncoding::encode(&values);
583
584        let bytes = encoded.to_bytes();
585        let decoded_encoding = SignedRunLengthEncoding::from_bytes(&bytes).unwrap();
586
587        assert_eq!(encoded.decode(), decoded_encoding.decode());
588    }
589
590    #[test]
591    fn test_zigzag() {
592        assert_eq!(zigzag_encode(0), 0);
593        assert_eq!(zigzag_encode(-1), 1);
594        assert_eq!(zigzag_encode(1), 2);
595        assert_eq!(zigzag_encode(-2), 3);
596        assert_eq!(zigzag_encode(2), 4);
597
598        for i in -1000i64..1000 {
599            assert_eq!(zigzag_decode(zigzag_encode(i)), i);
600        }
601    }
602
603    #[test]
604    fn test_analyzer_estimate() {
605        let all_same = vec![1u64; 100];
606        let ratio = RunLengthAnalyzer::estimate_ratio(&all_same);
607        assert!(ratio > 1.0);
608        assert!(RunLengthAnalyzer::is_beneficial(&all_same));
609
610        let all_diff: Vec<u64> = (0..100).collect();
611        let ratio = RunLengthAnalyzer::estimate_ratio(&all_diff);
612        assert!(ratio < 1.0);
613        assert!(!RunLengthAnalyzer::is_beneficial(&all_diff));
614    }
615
616    #[test]
617    fn test_analyzer_average_run_length() {
618        let values = vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3]; // 3 runs, 10 values
619        let avg = RunLengthAnalyzer::average_run_length(&values);
620        assert!((avg - 3.33).abs() < 0.1);
621
622        let all_same = vec![1u64; 100];
623        let avg = RunLengthAnalyzer::average_run_length(&all_same);
624        assert!((avg - 100.0).abs() < 0.1);
625    }
626
627    #[test]
628    fn test_from_runs() {
629        let runs = vec![Run::new(1, 3), Run::new(2, 2), Run::new(3, 5)];
630        let encoded = RunLengthEncoding::from_runs(runs);
631
632        assert_eq!(encoded.run_count(), 3);
633        assert_eq!(encoded.total_count(), 10);
634        assert_eq!(encoded.decode(), vec![1, 1, 1, 2, 2, 3, 3, 3, 3, 3]);
635    }
636}