// rav1e 0.4.0-alpha

// Copyright (c) 2019, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::context::av1_get_coded_tx_size;
use crate::cpu_features::CpuFeatureLevel;
use crate::quantize::*;
use crate::transform::TxSize;
use crate::util::*;

/// Signature of the SIMD dequantize kernels stored in `DEQUANTIZE_FNS`.
///
/// The kernels work on raw `i16` pointers: `coeffs_ptr` is the quantized
/// input and `rcoeffs_ptr` the dequantized output. The remaining arguments
/// are the ones `dequantize` receives and are passed through unchanged.
/// The pointer is `unsafe` to call; the requirements on the pointers are
/// those of the kernel it refers to.
type DequantizeFn = unsafe fn(
  qindex: u8,
  coeffs_ptr: *const i16,
  _eob: usize,
  rcoeffs_ptr: *mut i16,
  tx_size: TxSize,
  bit_depth: usize,
  dc_delta_q: i8,
  ac_delta_q: i8,
);

// Dequantize kernels keyed by CPU feature level; `dequantize` indexes this
// table with `cpu.as_index()`. The entry registered for AVX2 is
// `dequantize_avx2` and the declared default is `None`, in which case
// `dequantize` falls back to the Rust implementation.
// NOTE(review): how levels other than AVX2 are filled is decided by
// `cpu_function_lookup_table!`, which is defined elsewhere — confirm there.
cpu_function_lookup_table!(
  DEQUANTIZE_FNS: [Option<DequantizeFn>],
  default: None,
  [(AVX2, Some(dequantize_avx2))]
);

#[inline(always)]
pub fn dequantize<T: Coefficient>(
  qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
  bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel,
) {
  let call_rust = |rcoeffs: &mut [T]| {
    crate::quantize::rust::dequantize(
      qindex, coeffs, eob, rcoeffs, tx_size, bit_depth, dc_delta_q,
      ac_delta_q, cpu,
    );
  };

  #[cfg(any(feature = "check_asm", test))]
  let ref_rcoeffs = {
    let area = av1_get_coded_tx_size(tx_size).area();
    let mut copy = vec![T::cast_from(0); area];
    copy[..].copy_from_slice(&rcoeffs[..area]);
    call_rust(&mut copy);
    copy
  };

  match T::Pixel::type_enum() {
    PixelType::U8 => {
      if let Some(func) = DEQUANTIZE_FNS[cpu.as_index()] {
        unsafe {
          (func)(
            qindex,
            coeffs.as_ptr() as *const _,
            eob,
            rcoeffs.as_mut_ptr() as *mut _,
            tx_size,
            bit_depth,
            dc_delta_q,
            ac_delta_q,
          )
        }
      } else {
        call_rust(rcoeffs)
      }
    }
    PixelType::U16 => call_rust(rcoeffs),
  }

  #[cfg(any(feature = "check_asm", test))]
  {
    let area = av1_get_coded_tx_size(tx_size).area();
    assert_eq!(&rcoeffs[..area], &ref_rcoeffs[..]);
  }
}

/// AVX2 dequantize kernel for 16-bit coefficients.
///
/// Walks the coded area of `tx_size` sixteen coefficients at a time. The
/// magnitude of each coefficient is multiplied by its quantizer (the DC
/// quantizer for the very first coefficient, the AC quantizer for all
/// others), shifted right by the transform's log scale, narrowed back to
/// 16 bits and finally given the sign of the input coefficient.
///
/// `_eob` is not used; the whole coded area is always processed.
///
/// # Safety
///
/// - The CPU must support AVX2.
/// - `coeffs_ptr` must be valid for reads, and `rcoeffs_ptr` for writes, of
///   as many `i16` values as the coded area of `tx_size`.
/// - Both pointers must be 32-byte aligned, because aligned 256-bit loads
///   and stores are used.
///
/// # Panics
///
/// Panics if the coded area is smaller than 16 coefficients.
#[target_feature(enable = "avx2")]
unsafe fn dequantize_avx2(
  qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16,
  tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
) {
  let shift = _mm256_set1_epi32(get_log_tx_scale(tx_size) as i32);

  // AC quantizer broadcast to every 32-bit element.
  let quants_ac =
    _mm256_set1_epi32(ac_q(qindex, ac_delta_q, bit_depth) as i32);
  // Same vector, but with the DC quantizer in element 0. Element 0 lines up
  // with the first coefficient of a chunk, so this is only right for the
  // very first chunk.
  let quants_dc_first = _mm256_insert_epi32(
    quants_ac,
    dc_q(qindex, dc_delta_q, bit_depth) as i32,
    0,
  );

  let area: usize = av1_get_coded_tx_size(tx_size).area();
  // One 256-bit register holds 16 coefficients of 16 bits each.
  let step: usize = 16;
  assert!(area >= step);

  let zero = _mm256_setzero_si256();
  // Quantizers applied to the low half of each lane; starts out with the DC
  // quantizer in front and becomes all-AC after the first chunk.
  let mut quants_lo = quants_dc_first;
  let mut offset: usize = 0;
  while offset < area {
    let input = _mm256_load_si256(coeffs_ptr.add(offset) as *const _);
    let magnitude = _mm256_abs_epi16(input);

    // TODO: log_tx_scale is at most 2 and taking the absolute value frees up
    // one bit (except for a coefficient of -(2^15)), so it might be possible
    // to use a 16-bit multiply and recover the highest bit by comparing the
    // coefficients to (1<<16) / quant. This would cost 1 compare, 1 blend
    // and 1 add, but would get rid of 1 pack, 2 unpacks, 1 shift and 1
    // multiply.

    // Widen the low and the high half of each lane to 32 bits.
    let wide_lo = _mm256_unpacklo_epi16(magnitude, zero);
    let wide_hi = _mm256_unpackhi_epi16(magnitude, zero);

    // (abs_coeff * quant) >> log_tx_scale
    let scaled_lo =
      _mm256_srlv_epi32(_mm256_madd_epi16(quants_lo, wide_lo), shift);
    let scaled_hi =
      _mm256_srlv_epi32(_mm256_madd_epi16(quants_ac, wide_hi), shift);

    // Narrow back to 16 bits and restore the sign of the input.
    let narrowed = _mm256_packs_epi32(scaled_lo, scaled_hi);
    let output = _mm256_sign_epi16(narrowed, input);
    _mm256_store_si256(rcoeffs_ptr.add(offset) as *mut _, output);

    // The DC quantizer only applies to the first chunk.
    quants_lo = quants_ac;
    offset += step;
  }
}

#[cfg(test)]
mod test {
  use super::*;
  use rand::{thread_rng, Rng};

  /// Runs `dequantize` for every transform size with a random quantizer
  /// index, a mix of end-of-block positions and random coefficients.
  ///
  /// There are no assertions in the test itself: in test builds
  /// `dequantize` compares its own output with the Rust reference
  /// implementation and panics on a mismatch.
  #[test]
  fn dequantize_test() {
    use TxSize::*;

    let bd: usize = 8;
    let mut rng = thread_rng();

    let tx_sizes = [
      TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, TX_8X16,
      TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, TX_16X4,
      TX_8X32, TX_32X8, TX_16X64, TX_64X16,
    ];

    for tx_size in tx_sizes.iter().copied() {
      let qindex: u8 = rng.gen_range(MINQ as u8, MAXQ as u8);
      let dc_quant: i16 = dc_q(qindex, 0, bd);
      let ac_quant: i16 = ac_q(qindex, 0, bd);

      // Always cover the smallest eob (0) and the largest one (the whole
      // coded area); the remaining slots are random.
      let area: usize = av1_get_coded_tx_size(tx_size).area();
      let mut eobs = [0usize; 16];
      eobs[1] = area;
      for slot in &mut eobs[2..] {
        *slot = rng.gen_range(0, area);
      }

      for eob in eobs.iter().copied() {
        let mut qcoeffs = Aligned::new([0i16; 32 * 32]);
        let mut rcoeffs = Aligned::new([0i16; 32 * 32]);

        // Random quantized coefficients up to the eob: the first one is
        // scaled down by the DC quantizer, all others by the AC quantizer.
        if let Some((first, rest)) = qcoeffs.data[..eob].split_first_mut() {
          *first = rng.gen::<i16>() / dc_quant;
          for qcoeff in rest {
            *qcoeff = rng.gen::<i16>() / ac_quant;
          }
        }

        // The result is verified inside `dequantize` itself.
        dequantize(
          qindex,
          &qcoeffs.data,
          eob,
          &mut rcoeffs.data,
          tx_size,
          bd,
          0,
          0,
          CpuFeatureLevel::default(),
        );
      }
    }
  }
}