rust-gd 0.2.3

Generalized Deduplication based on Error-Correcting Codes
Documentation
mod dict;
mod error;
mod gd_bit_unit;
mod gd_byte_unit;
mod separator;

use async_trait::async_trait;
use dict::BasisDict;
use error::*;
use gd_bit_unit::BitGD;
use gd_byte_unit::ByteGD;
use libecc::{types::*, *};

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#[derive(Debug, Clone)]
pub enum GD {
  ReedSolomon(usize, usize),
  Hamming(usize),
}
impl GD {
  pub async fn setup(&self, dict_size: usize) -> Result<GDInner> {
    // TODO: consider parallelization using async
    match self {
      GD::ReedSolomon(a, b) => Ok(GDInner::ReedSolomon(ByteGD {
        code: ReedSolomon::new(*a, *b).await?,
        basis_dict: BasisDict::<U8VRep>::new(dict_size),
        chunk_bytelen: *a,
      })),

      GD::Hamming(a) => {
        let code = Hamming::new(*a as u32)?;
        ensure!(code.code_bit_len >= 8, "Insufficient code length");
        let chunk_bytelen = (code.code_bit_len - code.code_bit_len % 8) / 8;
        Ok(GDInner::Hamming(BitGD {
          code,
          basis_dict: BasisDict::<BVRep>::new(dict_size),
          chunk_bytelen,
        }))
      }
    }
  }
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#[derive(Debug, Clone)]
pub enum GDInner {
  ReedSolomon(ByteGD<ReedSolomon>),
  Hamming(BitGD<Hamming>),
}

impl GDInner {
  pub fn unit_check(&self) {
    match &self {
      GDInner::Hamming(x) => x.unit_check(),
      GDInner::ReedSolomon(x) => x.unit_check(),
    }
  }
  // Asynchronous APIs
  // TODO: consider some parallelization only for 'decoding' operation to split chunk into base and deviation.
  // TODO: also consider for 'encoding' as well
  pub async fn dedup(&mut self, buf: &U8SRep) -> Result<Deduped> {
    match self {
      GDInner::Hamming(x) => x.dedup(buf).await,
      GDInner::ReedSolomon(x) => x.dedup(buf).await,
    }
  }

  pub async fn dup(&mut self, deduped: &Deduped) -> Result<U8VRep> {
    match self {
      GDInner::Hamming(x) => x.dup(deduped).await,
      GDInner::ReedSolomon(x) => x.dup(deduped).await,
    }
  }
  pub async fn set_error_alignment(&mut self, trans: &[U8VRep]) -> Result<()> {
    match self {
      GDInner::Hamming(_) => Err(anyhow!("No such method for Hamming codes")),
      GDInner::ReedSolomon(x) => x.set_error_alignment(trans).await,
    }
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#[async_trait]
pub trait GDTrait {
  fn unit_check(&self);
  async fn dedup(&mut self, buf: &U8SRep) -> Result<Deduped>;
  async fn dup(&mut self, deduped: &Deduped) -> Result<U8VRep>;
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#[derive(Debug, Clone)]
pub struct Deduped {
  pub data: U8VRep,
  pub last_chunk_pad_bytelen: usize,
}
/////////////////////////////////////////

#[cfg(test)]
mod tests {
  use super::*;
  use rand::Rng;

  // const WORD_STR: &str = "寿限無(じゅげむ)寿限無(じゅげむ)五劫(ごこう)のすりきれ海砂利(かいじゃり)水魚(すいぎょ)の水行末(すいぎょうまつ) ";
  // const WORD_STR: &str = "寿限無(じゅげむ)寿限無(じゅげむ)五劫(ごこう)のすりきれ海砂利(かいじゃり)水魚(すいぎょ)の水行末(すいぎょうまつ) 雲来末(うんらいまつ) 風来末(ふうらいまつ)食(く)う寝(ね)るところに住(す)むところやぶらこうじのぶらこうじパイポパイポパイポのシューリンガンシューリンガンのグーリンダイグーリンダイのポンポコピーのポンポコナの長久命(ちょうきゅうめい)の長助(ちょうすけ)";
  const WORD_STR: &str =
    "寿限無(じゅげむ)寿限無(じゅげむ)五劫(ごこう)のすりきれ海砂利(かいじゃり)padpadpadpadpadpadpadpad"; // Byte alignment is quire needed...

  #[tokio::test]
  async fn hamming_works() {
    let words = WORD_STR.to_string().repeat(128).into_bytes();

    for hamming_deg in 4..11 {
      let hamming_dict_size = 511;

      let mut gd_dedup = GD::Hamming(hamming_deg)
        .setup(hamming_dict_size)
        .await
        .unwrap();
      let mut gd_dup = GD::Hamming(hamming_deg)
        .setup(hamming_dict_size)
        .await
        .unwrap();
      // gd_dedup.unit_check();

      // println!("Hamimng code deg = {}", hamming_deg);
      // println!("> org size: {} bits", words.len() * 8);
      let x = gd_dedup.dedup(&words).await.unwrap();
      // println!("> deduped size {} bits", x.data.len());
      let y = gd_dup.dup(&x).await.unwrap();
      // println!("> duped size {} bits", y.len() * 8);
      assert_eq!(y, words);
      println!(
        "Hamimng code deg = {} > Deduped rate: {:.2} %",
        hamming_deg,
        100.0 * (x.data.len() as f32) / (y.len() as f32)
      );
      // println!()
    }
  }

  const RS_MAX_DICT_BITS: usize = 8;
  const RS_DICT_PARAM: usize = 2;
  const RS_REPEAT: usize = 1024;

  #[tokio::test]
  async fn rs_works() {
    let mut rng = rand::thread_rng();
    let words_org = WORD_STR.to_string().into_bytes().repeat(RS_REPEAT);

    for code_len in vec![128].into_iter() {
      for msg_len in 2isize.max(code_len as isize - 8) as usize..code_len {
        let dict_size = (1 << ((code_len - msg_len) * RS_DICT_PARAM).min(RS_MAX_DICT_BITS)) - 1;

        let mut gd_dedup = GD::ReedSolomon(code_len, msg_len)
          .setup(dict_size)
          .await
          .unwrap();
        let mut gd_dup = GD::ReedSolomon(code_len, msg_len)
          .setup(dict_size)
          .await
          .unwrap();
        // gd_dedup.unit_check();

        let words: Vec<u8> = words_org
          .clone()
          .into_iter()
          .enumerate()
          .map(|(idx, b)| {
            if idx % code_len < msg_len {
              b
            } else {
              let random_pad: u8 = rng.gen();
              b ^ random_pad
            }
          })
          .collect();

        // println!("RS code ({}, {}) over GF(256)", code_len, msg_len);
        // println!("> org size: {} bits", words.len() * 8);
        let x = gd_dedup.dedup(&words).await.unwrap();
        // println!("> deduped size {} bits", x.data.len());
        let y = gd_dup.dup(&x).await.unwrap();
        // println!("> duped size {} bits", y.len() * 8);
        assert_eq!(y, words);
        // println!("{:?}", gd);
        println!(
          "RS code ({}, {}) over GF(256) of dict size {} > Deduped rate: {:.2} %",
          code_len,
          msg_len,
          dict_size,
          100.0 * (x.data.len() as f32) / (y.len() as f32)
        );
        // println!()
      }
    }
  }

  #[tokio::test]
  async fn rs_align_error_works() {
    let trans: Vec<Vec<u8>> = vec![
      vec![1u8, 0, 0, 0],
      vec![1u8, 1, 1, 4],
      vec![1u8, 1, 3, 0],
      vec![1u8, 2, 0, 0],
    ];
    let dict_size = 15;
    let code_len = 4;
    let msg_len = 3;

    let mut gd_dedup = GD::ReedSolomon(code_len, msg_len)
      .setup(dict_size)
      .await
      .unwrap();
    let mut gd_dup = GD::ReedSolomon(code_len, msg_len)
      .setup(dict_size)
      .await
      .unwrap();
    let res_dedup = gd_dedup.set_error_alignment(&trans).await;
    let res_dup = gd_dup.set_error_alignment(&trans).await;
    assert!(res_dedup.is_ok());
    assert!(res_dup.is_ok());

    let words = WORD_STR.to_string().into_bytes().repeat(RS_REPEAT);

    // println!("RS code ({}, {}) over GF(256)", code_len, msg_len);
    // println!("> org size: {} bits", words.len() * 8);
    let x = gd_dedup.dedup(&words).await.unwrap();
    // println!("> deduped size {} bits", x.data.len());
    let y = gd_dup.dup(&x).await.unwrap();
    // println!("> duped size {} bits", y.len() * 8);
    assert_eq!(y, words);
    // println!("{:?}", gd);
    println!(
      "RS code ({}, {}) over GF(256) of dict size {} > Deduped rate: {:.2} %",
      code_len,
      msg_len,
      dict_size,
      100.0 * (x.data.len() as f32) / (y.len() as f32)
    );
  }
}