rav1e/
rdo.rs

1// Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
2// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
3//
4// This source code is subject to the terms of the BSD 2 Clause License and
5// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6// was not distributed with this source code in the LICENSE file, you can
7// obtain it at www.aomedia.org/license/software. If the Alliance for Open
8// Media Patent License 1.0 was not distributed with this source code in the
9// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
11#![allow(non_camel_case_types)]
12
13use std::fmt;
14use std::mem::MaybeUninit;
15
16use arrayvec::*;
17use itertools::izip;
18
19use crate::api::*;
20use crate::cdef::*;
21use crate::context::*;
22use crate::cpu_features::CpuFeatureLevel;
23use crate::deblock::*;
24use crate::dist::*;
25use crate::ec::{Writer, WriterCounter, OD_BITRES};
26use crate::encode_block_with_modes;
27use crate::encoder::{FrameInvariants, IMPORTANCE_BLOCK_SIZE};
28use crate::frame::*;
29use crate::header::ReferenceMode;
30use crate::lrf::*;
31use crate::mc::MotionVector;
32use crate::me::estimate_motion;
33use crate::me::MVSamplingMode;
34use crate::me::MotionSearchResult;
35use crate::motion_compensate;
36use crate::partition::PartitionType::*;
37use crate::partition::RefType::*;
38use crate::partition::*;
39use crate::predict::{
40  luma_ac, AngleDelta, IntraEdgeFilterParameters, IntraParam, PredictionMode,
41  RAV1E_INTER_COMPOUND_MODES, RAV1E_INTER_MODES_MINIMAL, RAV1E_INTRA_MODES,
42};
43use crate::rdo_tables::*;
44use crate::tiling::*;
45use crate::transform::{TxSet, TxSize, TxType, RAV1E_TX_TYPES};
46use crate::util::{init_slice_repeat_mut, Aligned, Pixel};
47use crate::write_tx_blocks;
48use crate::write_tx_tree;
49use crate::Tune;
50use crate::{encode_block_post_cdef, encode_block_pre_cdef};
51
/// Selects which distortion measure and which rate measure RDO uses.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum RDOType {
  /// Pixel-domain distortion and exact ec rate
  PixelDistRealRate,
  /// Tx-domain distortion and exact ec rate
  TxDistRealRate,
  /// Tx-domain distortion and txdist-based rate
  TxDistEstRate,
}

impl RDOType {
  /// Returns `true` for the variants that measure distortion in the
  /// transform domain.
  #[inline]
  pub const fn needs_tx_dist(self) -> bool {
    match self {
      Self::PixelDistRealRate => false,
      Self::TxDistRealRate | Self::TxDistEstRate => true,
    }
  }

  /// Returns `true` for the variants that use the exact ec rate, and
  /// `false` for the variant whose rate is txdist-based.
  #[inline]
  pub const fn needs_coeff_rate(self) -> bool {
    match self {
      Self::PixelDistRealRate | Self::TxDistRealRate => true,
      Self::TxDistEstRate => false,
    }
  }
}
80
/// RD result for one partition type together with the parameters of the
/// blocks it is made of.
#[derive(Clone)]
pub struct PartitionGroupParameters {
  /// RD cost associated with this partition choice.
  pub rd_cost: f64,
  /// The partition type these parameters belong to.
  pub part_type: PartitionType,
  /// Per-block parameters; the container holds at most four entries.
  pub part_modes: ArrayVec<PartitionParameters, 4>,
}
87
/// Coding parameters chosen for a single block, along with the RD cost that
/// led to them.
#[derive(Clone, Debug)]
pub struct PartitionParameters {
  /// Best RD cost found so far; `f64::MAX` in the `Default` value.
  pub rd_cost: f64,
  /// Block offset within the tile.
  pub bo: TileBlockOffset,
  /// Block size.
  pub bsize: BlockSize,
  /// Luma prediction mode.
  pub pred_mode_luma: PredictionMode,
  /// Chroma prediction mode.
  pub pred_mode_chroma: PredictionMode,
  /// Chroma-from-luma parameters.
  pub pred_cfl_params: CFLParams,
  /// Angle delta used with the prediction modes.
  pub angle_delta: AngleDelta,
  /// Reference frames; the `Default` value is
  /// `[INTRA_FRAME, NONE_FRAME]`.
  pub ref_frames: [RefType; 2],
  /// Motion vectors, one per entry of `ref_frames`.
  pub mvs: [MotionVector; 2],
  /// Whether the block is coded with the skip flag set.
  pub skip: bool,
  /// Whether the block has coefficients, as reported by the encode call
  /// that produced these parameters.
  pub has_coeff: bool,
  /// Transform size.
  pub tx_size: TxSize,
  /// Transform type.
  pub tx_type: TxType,
  /// Segmentation index.
  pub sidx: u8,
}
105
106impl Default for PartitionParameters {
107  fn default() -> Self {
108    PartitionParameters {
109      rd_cost: f64::MAX,
110      bo: TileBlockOffset::default(),
111      bsize: BlockSize::BLOCK_32X32,
112      pred_mode_luma: PredictionMode::default(),
113      pred_mode_chroma: PredictionMode::default(),
114      pred_cfl_params: CFLParams::default(),
115      angle_delta: AngleDelta::default(),
116      ref_frames: [RefType::INTRA_FRAME, RefType::NONE_FRAME],
117      mvs: [MotionVector::default(); 2],
118      skip: false,
119      has_coeff: true,
120      tx_size: TxSize::TX_4X4,
121      tx_type: TxType::DCT_DCT,
122      sidx: 0,
123    }
124  }
125}
126
127pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 {
128  let bs_index = ts as usize;
129  let q_bin_idx = (qindex as usize) / RDO_QUANT_DIV;
130  let bin_idx_down =
131    ((fast_distortion) / RATE_EST_BIN_SIZE).min((RDO_NUM_BINS - 2) as u64);
132  let bin_idx_up = (bin_idx_down + 1).min((RDO_NUM_BINS - 1) as u64);
133  let x0 = (bin_idx_down * RATE_EST_BIN_SIZE) as i64;
134  let x1 = (bin_idx_up * RATE_EST_BIN_SIZE) as i64;
135  let y0 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_down as usize] as i64;
136  let y1 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_up as usize] as i64;
137  let slope = ((y1 - y0) << 8) / (x1 - x0);
138  (y0 + (((fast_distortion as i64 - x0) * slope) >> 8)).max(0) as u64
139}
140
141#[allow(unused)]
142pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
143  src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
144  bit_depth: usize, compute_bias: F, cpu: CpuFeatureLevel,
145) -> Distortion {
146  debug_assert!(src1.plane_cfg.xdec == 0);
147  debug_assert!(src1.plane_cfg.ydec == 0);
148  debug_assert!(src2.plane_cfg.xdec == 0);
149  debug_assert!(src2.plane_cfg.ydec == 0);
150
151  let mut sum = Distortion::zero();
152  for y in (0..h).step_by(8) {
153    for x in (0..w).step_by(8) {
154      let kernel_h = (h - y).min(8);
155      let kernel_w = (w - x).min(8);
156      let area = Area::StartingAt { x: x as isize, y: y as isize };
157
158      let value = RawDistortion(cdef_dist_kernel(
159        &src1.subregion(area),
160        &src2.subregion(area),
161        kernel_w,
162        kernel_h,
163        bit_depth,
164        cpu,
165      ) as u64);
166
167      // cdef is always called on non-subsampled planes, so BLOCK_8X8 is
168      // correct here.
169      sum += value * compute_bias(area, BlockSize::BLOCK_8X8);
170    }
171  }
172  sum
173}
174
175/// Sum of Squared Error for a wxh block
176/// Currently limited to w and h of valid blocks
177pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
178  src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
179  compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel,
180) -> Distortion {
181  // See get_weighted_sse in src/dist.rs.
182  // Provide a scale to get_weighted_sse for each square region of this size.
183  const CHUNK_SIZE: usize = IMPORTANCE_BLOCK_SIZE >> 1;
184
185  // To bias the distortion correctly, compute it in blocks up to the size
186  // importance block size in a non-subsampled plane.
187  let imp_block_w = CHUNK_SIZE << src1.plane_cfg.xdec;
188  let imp_block_h = CHUNK_SIZE << src1.plane_cfg.ydec;
189
190  let imp_bsize = BlockSize::from_width_and_height(imp_block_w, imp_block_h);
191
192  let n_imp_blocks_w = w.div_ceil(CHUNK_SIZE);
193  let n_imp_blocks_h = h.div_ceil(CHUNK_SIZE);
194
195  // TODO: Copying biases into a buffer is slow. It would be best if biases were
196  // passed directly. To do this, we would need different versions of the
197  // weighted sse function for decimated/subsampled data. Also requires
198  // eliminating use of unbiased sse.
199  // It should also be noted that the current copy code does not auto-vectorize.
200
201  // Copy biases into a buffer.
202  let mut buf_storage = Aligned::new(
203    [MaybeUninit::<u32>::uninit(); 128 / CHUNK_SIZE * 128 / CHUNK_SIZE],
204  );
205  let buf_stride = n_imp_blocks_w.next_power_of_two();
206  let buf = init_slice_repeat_mut(
207    &mut buf_storage.data[..buf_stride * n_imp_blocks_h],
208    0,
209  );
210
211  for block_y in 0..n_imp_blocks_h {
212    for block_x in 0..n_imp_blocks_w {
213      let block = Area::StartingAt {
214        x: (block_x * CHUNK_SIZE) as isize,
215        y: (block_y * CHUNK_SIZE) as isize,
216      };
217      buf[block_y * buf_stride + block_x] = compute_bias(block, imp_bsize).0;
218    }
219  }
220
221  Distortion(get_weighted_sse(
222    src1, src2, buf, buf_stride, w, h, bit_depth, cpu,
223  ))
224}
225
226// TODO consider saturating_sub later
227#[allow(clippy::implicit_saturating_sub)]
228pub const fn clip_visible_bsize(
229  frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize,
230) -> (usize, usize) {
231  let blk_w = bsize.width();
232  let blk_h = bsize.height();
233
234  let visible_w: usize = if x + blk_w <= frame_w {
235    blk_w
236  } else if x >= frame_w {
237    0
238  } else {
239    frame_w - x
240  };
241
242  let visible_h: usize = if y + blk_h <= frame_h {
243    blk_h
244  } else if y >= frame_h {
245    0
246  } else {
247    frame_h - y
248  };
249
250  (visible_w, visible_h)
251}
252
253// Compute the pixel-domain distortion for an encode
254fn compute_distortion<T: Pixel>(
255  fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
256  is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool,
257) -> ScaledDistortion {
258  let area = Area::BlockStartingAt { bo: tile_bo.0 };
259  let input_region = ts.input_tile.planes[0].subregion(area);
260  let rec_region = ts.rec.planes[0].subregion(area);
261
262  // clip a block to have visible pixles only
263  let frame_bo = ts.to_frame_block_offset(tile_bo);
264  let (visible_w, visible_h) = clip_visible_bsize(
265    fi.width,
266    fi.height,
267    bsize,
268    frame_bo.0.x << MI_SIZE_LOG2,
269    frame_bo.0.y << MI_SIZE_LOG2,
270  );
271
272  if visible_w == 0 || visible_h == 0 {
273    return ScaledDistortion::zero();
274  }
275
276  let mut distortion = match fi.config.tune {
277    Tune::Psychovisual => cdef_dist_wxh(
278      &input_region,
279      &rec_region,
280      visible_w,
281      visible_h,
282      fi.sequence.bit_depth,
283      |bias_area, bsize| {
284        distortion_scale(
285          fi,
286          input_region.subregion(bias_area).frame_block_offset(),
287          bsize,
288        )
289      },
290      fi.cpu_feature_level,
291    ),
292    Tune::Psnr => sse_wxh(
293      &input_region,
294      &rec_region,
295      visible_w,
296      visible_h,
297      |bias_area, bsize| {
298        distortion_scale(
299          fi,
300          input_region.subregion(bias_area).frame_block_offset(),
301          bsize,
302        )
303      },
304      fi.sequence.bit_depth,
305      fi.cpu_feature_level,
306    ),
307  } * fi.dist_scale[0];
308
309  if is_chroma_block
310    && !luma_only
311    && fi.sequence.chroma_sampling != ChromaSampling::Cs400
312  {
313    let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
314    let chroma_w = if bsize.width() >= 8 || xdec == 0 {
315      (visible_w + xdec) >> xdec
316    } else {
317      (4 + visible_w + xdec) >> xdec
318    };
319    let chroma_h = if bsize.height() >= 8 || ydec == 0 {
320      (visible_h + ydec) >> ydec
321    } else {
322      (4 + visible_h + ydec) >> ydec
323    };
324
325    for p in 1..3 {
326      let input_region = ts.input_tile.planes[p].subregion(area);
327      let rec_region = ts.rec.planes[p].subregion(area);
328      distortion += sse_wxh(
329        &input_region,
330        &rec_region,
331        chroma_w,
332        chroma_h,
333        |bias_area, bsize| {
334          distortion_scale(
335            fi,
336            input_region.subregion(bias_area).frame_block_offset(),
337            bsize,
338          )
339        },
340        fi.sequence.bit_depth,
341        fi.cpu_feature_level,
342      ) * fi.dist_scale[p];
343    }
344  }
345  distortion
346}
347
348// Compute the transform-domain distortion for an encode
349fn compute_tx_distortion<T: Pixel>(
350  fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
351  is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion,
352  skip: bool, luma_only: bool,
353) -> ScaledDistortion {
354  assert!(fi.config.tune == Tune::Psnr);
355  let area = Area::BlockStartingAt { bo: tile_bo.0 };
356  let input_region = ts.input_tile.planes[0].subregion(area);
357  let rec_region = ts.rec.planes[0].subregion(area);
358
359  let (visible_w, visible_h) = if !skip {
360    (bsize.width(), bsize.height())
361  } else {
362    let frame_bo = ts.to_frame_block_offset(tile_bo);
363    clip_visible_bsize(
364      fi.width,
365      fi.height,
366      bsize,
367      frame_bo.0.x << MI_SIZE_LOG2,
368      frame_bo.0.y << MI_SIZE_LOG2,
369    )
370  };
371
372  if visible_w == 0 || visible_h == 0 {
373    return ScaledDistortion::zero();
374  }
375
376  let mut distortion = if skip {
377    sse_wxh(
378      &input_region,
379      &rec_region,
380      visible_w,
381      visible_h,
382      |bias_area, bsize| {
383        distortion_scale(
384          fi,
385          input_region.subregion(bias_area).frame_block_offset(),
386          bsize,
387        )
388      },
389      fi.sequence.bit_depth,
390      fi.cpu_feature_level,
391    ) * fi.dist_scale[0]
392  } else {
393    tx_dist
394  };
395
396  if is_chroma_block
397    && !luma_only
398    && skip
399    && fi.sequence.chroma_sampling != ChromaSampling::Cs400
400  {
401    let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
402    let chroma_w = if bsize.width() >= 8 || xdec == 0 {
403      (visible_w + xdec) >> xdec
404    } else {
405      (4 + visible_w + xdec) >> xdec
406    };
407    let chroma_h = if bsize.height() >= 8 || ydec == 0 {
408      (visible_h + ydec) >> ydec
409    } else {
410      (4 + visible_h + ydec) >> ydec
411    };
412
413    for p in 1..3 {
414      let input_region = ts.input_tile.planes[p].subregion(area);
415      let rec_region = ts.rec.planes[p].subregion(area);
416      distortion += sse_wxh(
417        &input_region,
418        &rec_region,
419        chroma_w,
420        chroma_h,
421        |bias_area, bsize| {
422          distortion_scale(
423            fi,
424            input_region.subregion(bias_area).frame_block_offset(),
425            bsize,
426          )
427        },
428        fi.sequence.bit_depth,
429        fi.cpu_feature_level,
430      ) * fi.dist_scale[p];
431    }
432  }
433  distortion
434}
435
436/// Compute a scaling factor to multiply the distortion of a block by,
437/// this factor is determined using temporal RDO.
438///
439/// # Panics
440///
441/// - If called with `bsize` of 8x8 or smaller
442/// - If the coded frame data doesn't exist on the `FrameInvariants`
443pub fn distortion_scale<T: Pixel>(
444  fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize,
445) -> DistortionScale {
446  if !fi.config.temporal_rdo() {
447    return DistortionScale::default();
448  }
449  // EncoderConfig::temporal_rdo() should always return false in situations
450  // where distortion is computed on > 8x8 blocks, so we should never hit this
451  // assert.
452  assert!(bsize <= BlockSize::BLOCK_8X8);
453
454  let x = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
455  let y = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
456
457  let coded_data = fi.coded_frame_data.as_ref().unwrap();
458  coded_data.distortion_scales[y * coded_data.w_in_imp_b + x]
459}
460
461/// # Panics
462///
463/// - If the coded frame data doesn't exist on the `FrameInvariants`
464pub fn spatiotemporal_scale<T: Pixel>(
465  fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize,
466) -> DistortionScale {
467  if !fi.config.temporal_rdo() && fi.config.tune != Tune::Psychovisual {
468    return DistortionScale::default();
469  }
470
471  let coded_data = fi.coded_frame_data.as_ref().unwrap();
472
473  let x0 = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
474  let y0 = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
475  let x1 = (x0 + bsize.width_imp_b()).min(coded_data.w_in_imp_b);
476  let y1 = (y0 + bsize.height_imp_b()).min(coded_data.h_in_imp_b);
477  let den = (((x1 - x0) * (y1 - y0)) as u64) << DistortionScale::SHIFT;
478
479  // calling this on each slice individually improves autovectorization
480  // compared to using `Iterator::take`
481  #[inline(always)]
482  fn take_slice<T>(slice: &[T], n: usize) -> &[T] {
483    slice.get(..n).unwrap_or(slice)
484  }
485
486  let mut sum = 0;
487  for y in y0..y1 {
488    sum += take_slice(
489      &coded_data.distortion_scales[y * coded_data.w_in_imp_b..][x0..x1],
490      MAX_SB_IN_IMP_B,
491    )
492    .iter()
493    .zip(
494      take_slice(
495        &coded_data.activity_scales[y * coded_data.w_in_imp_b..][x0..x1],
496        MAX_SB_IN_IMP_B,
497      )
498      .iter(),
499    )
500    .map(|(d, a)| d.0 as u64 * a.0 as u64)
501    .sum::<u64>();
502  }
503  DistortionScale(((sum + (den >> 1)) / den) as u32)
504}
505
506pub fn distortion_scale_for(
507  propagate_cost: f64, intra_cost: f64,
508) -> DistortionScale {
509  // The mbtree paper \cite{mbtree} uses the following formula:
510  //
511  //     QP_delta = -strength * log2(1 + (propagate_cost / intra_cost))
512  //
513  // Since this is H.264, this corresponds to the following quantizer:
514  //
515  //     Q' = Q * 2^(QP_delta/6)
516  //
517  // Since lambda is proportial to Q^2, this means we want to minimize:
518  //
519  //     D + lambda' * R
520  //   = D + 2^(QP_delta / 3) * lambda * R
521  //
522  // If we want to keep lambda fixed, we can instead scale distortion and
523  // minimize:
524  //
525  //     D * scale + lambda * R
526  //
527  // where:
528  //
529  //     scale = 2^(QP_delta / -3)
530  //           = (1 + (propagate_cost / intra_cost))^(strength / 3)
531  //
532  //  The original paper empirically chooses strength = 2.0, but strength = 1.0
533  //  seems to work best in rav1e currently, this may have something to do with
534  //  the fact that they use 16x16 blocks whereas our "importance blocks" are
535  //  8x8, but everything should be scale invariant here so that's weird.
536  //
537  // @article{mbtree,
538  //   title={A novel macroblock-tree algorithm for high-performance
539  //    optimization of dependent video coding in H.264/AVC},
540  //   author={Garrett-Glaser, Jason},
541  //   journal={Tech. Rep.},
542  //   year={2009},
543  //   url={https://pdfs.semanticscholar.org/032f/1ab7d9db385780a02eb2d579af8303b266d2.pdf}
544  // }
545
546  if intra_cost == 0. {
547    return DistortionScale::default(); // no scaling
548  }
549
550  let strength = 1.0; // empirical, see comment above
551  let frac = (intra_cost + propagate_cost) / intra_cost;
552  frac.powf(strength / 3.0).into()
553}
554
/// Fixed point arithmetic version of distortion scale
///
/// The wrapped value has `DistortionScale::SHIFT` bits past the radix point.
#[repr(transparent)]
#[derive(Copy, Clone)]
pub struct DistortionScale(pub u32);

/// A distortion value that has not yet been multiplied by any
/// `DistortionScale`. Multiplying it by one yields a `Distortion`.
#[repr(transparent)]
pub struct RawDistortion(u64);

/// A distortion value. Multiplying it by a `DistortionScale` yields a
/// `ScaledDistortion`.
#[repr(transparent)]
pub struct Distortion(pub u64);

/// The result of multiplying a `Distortion` by a `DistortionScale`; this is
/// the distortion type consumed by `compute_rd_cost`.
#[repr(transparent)]
pub struct ScaledDistortion(u64);
568
569impl DistortionScale {
570  /// Bits past the radix point
571  const SHIFT: u32 = 14;
572  /// Number of bits used. Determines the max value.
573  /// 28 bits is quite excessive.
574  const BITS: u32 = 28;
575  /// Maximum internal value
576  const MAX: u64 = (1 << Self::BITS) - 1;
577
578  #[inline]
579  pub const fn new(num: u64, den: u64) -> Self {
580    let raw = (num << Self::SHIFT).saturating_add(den / 2) / den;
581    let mask = (raw <= Self::MAX) as u64;
582    Self((mask * raw + (1 - mask) * Self::MAX) as u32)
583  }
584
585  pub fn inv_mean(slice: &[Self]) -> Self {
586    use crate::util::{bexp64, blog32_q11};
587    let sum = slice.iter().map(|&s| blog32_q11(s.0) as i64).sum::<i64>();
588    let log_inv_mean_q11 =
589      (Self::SHIFT << 11) as i64 - sum / slice.len() as i64;
590    Self(
591      bexp64((log_inv_mean_q11 + (Self::SHIFT << 11) as i64) << (57 - 11))
592        .clamp(1, (1 << Self::BITS) - 1) as u32,
593    )
594  }
595
596  /// Binary logarithm in Q11
597  #[inline]
598  pub const fn blog16(self) -> i16 {
599    use crate::util::blog32_q11;
600    (blog32_q11(self.0) - ((Self::SHIFT as i32) << 11)) as i16
601  }
602
603  /// Binary logarithm in Q57
604  #[inline]
605  pub const fn blog64(self) -> i64 {
606    use crate::util::{blog64, q57};
607    blog64(self.0 as i64) - q57(Self::SHIFT as i32)
608  }
609
610  /// Multiply, round and shift
611  /// Internal implementation, so don't use multiply trait.
612  #[inline]
613  pub const fn mul_u64(self, dist: u64) -> u64 {
614    (self.0 as u64 * dist + (1 << Self::SHIFT >> 1)) >> Self::SHIFT
615  }
616}
617
618impl std::ops::Mul for DistortionScale {
619  type Output = Self;
620
621  /// Multiply, round and shift
622  #[inline]
623  fn mul(self, rhs: Self) -> Self {
624    Self(
625      (((self.0 as u64 * rhs.0 as u64) + (1 << (Self::SHIFT - 1)))
626        >> Self::SHIFT)
627        .clamp(1, (1 << Self::BITS) - 1) as u32,
628    )
629  }
630}
631
632impl std::ops::MulAssign for DistortionScale {
633  fn mul_assign(&mut self, rhs: Self) {
634    *self = *self * rhs;
635  }
636}
637
638// Default value for DistortionScale is a fixed point 1
639impl Default for DistortionScale {
640  #[inline]
641  fn default() -> Self {
642    Self(1 << Self::SHIFT)
643  }
644}
645
646impl fmt::Debug for DistortionScale {
647  fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
648    write!(f, "{}", f64::from(*self))
649  }
650}
651
652impl From<f64> for DistortionScale {
653  #[inline]
654  fn from(scale: f64) -> Self {
655    let den = 1 << (Self::SHIFT + 1);
656    Self::new((scale * den as f64) as u64, den)
657  }
658}
659
660impl From<DistortionScale> for f64 {
661  #[inline]
662  fn from(scale: DistortionScale) -> Self {
663    scale.0 as f64 / (1 << DistortionScale::SHIFT) as f64
664  }
665}
666
667impl RawDistortion {
668  #[inline]
669  pub const fn new(dist: u64) -> Self {
670    Self(dist)
671  }
672}
673
674impl std::ops::Mul<DistortionScale> for RawDistortion {
675  type Output = Distortion;
676  #[inline]
677  fn mul(self, rhs: DistortionScale) -> Distortion {
678    Distortion(rhs.mul_u64(self.0))
679  }
680}
681
682impl Distortion {
683  #[inline]
684  pub const fn zero() -> Self {
685    Self(0)
686  }
687}
688
689impl std::ops::Mul<DistortionScale> for Distortion {
690  type Output = ScaledDistortion;
691  #[inline]
692  fn mul(self, rhs: DistortionScale) -> ScaledDistortion {
693    ScaledDistortion(rhs.mul_u64(self.0))
694  }
695}
696
697impl std::ops::AddAssign for Distortion {
698  #[inline]
699  fn add_assign(&mut self, other: Self) {
700    self.0 += other.0;
701  }
702}
703
704impl ScaledDistortion {
705  #[inline]
706  pub const fn zero() -> Self {
707    Self(0)
708  }
709}
710
711impl std::ops::AddAssign for ScaledDistortion {
712  #[inline]
713  fn add_assign(&mut self, other: Self) {
714    self.0 += other.0;
715  }
716}
717
718pub fn compute_rd_cost<T: Pixel>(
719  fi: &FrameInvariants<T>, rate: u32, distortion: ScaledDistortion,
720) -> f64 {
721  let rate_in_bits = (rate as f64) / ((1 << OD_BITRES) as f64);
722  fi.lambda.mul_add(rate_in_bits, distortion.0 as f64)
723}
724
725pub fn rdo_tx_size_type<T: Pixel>(
726  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
727  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
728  luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
729  skip: bool,
730) -> (TxSize, TxType) {
731  let is_inter = !luma_mode.is_intra();
732  let mut tx_size = max_txsize_rect_lookup[bsize as usize];
733
734  if fi.enable_inter_txfm_split && is_inter && !skip {
735    tx_size = sub_tx_size_map[tx_size as usize]; // Always choose one level split size
736  }
737
738  let mut best_tx_type = TxType::DCT_DCT;
739  let mut best_tx_size = tx_size;
740  let mut best_rd = f64::MAX;
741
742  let do_rdo_tx_size = fi.tx_mode_select
743    && fi.config.speed_settings.transform.rdo_tx_decision
744    && !is_inter;
745  let rdo_tx_depth = if do_rdo_tx_size { 2 } else { 0 };
746  let mut cw_checkpoint: Option<ContextWriterCheckpoint> = None;
747
748  for _ in 0..=rdo_tx_depth {
749    let tx_set = get_tx_set(tx_size, is_inter, fi.use_reduced_tx_set);
750
751    let do_rdo_tx_type = tx_set > TxSet::TX_SET_DCTONLY
752      && fi.config.speed_settings.transform.rdo_tx_decision
753      && !is_inter
754      && !skip;
755
756    if !do_rdo_tx_size && !do_rdo_tx_type {
757      return (best_tx_size, best_tx_type);
758    };
759
760    let tx_types =
761      if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] };
762
763    // Luma plane transform type decision
764    let (tx_type, rd_cost) = rdo_tx_type_decision(
765      fi,
766      ts,
767      cw,
768      &mut cw_checkpoint,
769      luma_mode,
770      ref_frames,
771      mvs,
772      bsize,
773      tile_bo,
774      tx_size,
775      tx_set,
776      tx_types,
777      best_rd,
778    );
779
780    if rd_cost < best_rd {
781      best_tx_size = tx_size;
782      best_tx_type = tx_type;
783      best_rd = rd_cost;
784    }
785
786    debug_assert!(tx_size.width_log2() <= bsize.width_log2());
787    debug_assert!(tx_size.height_log2() <= bsize.height_log2());
788    debug_assert!(
789      tx_size.sqr() <= TxSize::TX_32X32 || tx_type == TxType::DCT_DCT
790    );
791
792    let next_tx_size = sub_tx_size_map[tx_size as usize];
793
794    if next_tx_size == tx_size {
795      break;
796    } else {
797      tx_size = next_tx_size;
798    };
799  }
800
801  (best_tx_size, best_tx_type)
802}
803
804#[inline]
805const fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool {
806  let diff_row = mv.row as i32 - ref_mv.row as i32;
807  let diff_col = mv.col as i32 - ref_mv.col as i32;
808  diff_row >= MV_LOW
809    && diff_row <= MV_UPP
810    && diff_col >= MV_LOW
811    && diff_col <= MV_UPP
812}
813
/// Evaluates one luma prediction mode against every chroma mode in
/// `mode_set_chroma` and records the cheapest combination in `best`.
///
/// For each segment index and chroma mode, the block is trial-encoded into
/// a `WriterCounter`, its rate and distortion are measured, and `cw` is
/// rolled back to `cw_checkpoint`. `best` is only overwritten when a
/// candidate's RD cost is strictly lower than `best.rd_cost`.
///
/// Inter modes are tried with skip first; the non-skip pass is omitted when
/// the last candidate accepted by the skip pass had zero distortion. Intra
/// modes are only tried without skip.
///
/// Returns without evaluating anything when the mode codes a new motion
/// vector whose difference from the reference motion vector is out of range.
#[inline]
#[profiling::function]
fn luma_chroma_mode_rdo<T: Pixel>(
  luma_mode: PredictionMode, fi: &FrameInvariants<T>, bsize: BlockSize,
  tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>,
  cw: &mut ContextWriter, rdo_type: RDOType,
  cw_checkpoint: &ContextWriterCheckpoint, best: &mut PartitionParameters,
  mvs: [MotionVector; 2], ref_frames: [RefType; 2],
  mode_set_chroma: &[PredictionMode], luma_mode_is_intra: bool,
  mode_context: usize, mv_stack: &ArrayVec<CandidateMV, 9>,
  angle_delta: AngleDelta,
) {
  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;

  let is_chroma_block =
    has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);

  if !luma_mode_is_intra {
    // Reference MVs come from the first MV stack entry, or are zero when
    // the stack is empty.
    let ref_mvs = if mv_stack.is_empty() {
      [MotionVector::default(); 2]
    } else {
      [mv_stack[0].this_mv, mv_stack[0].comp_mv]
    };

    // Modes that code a new first MV: reject when its delta is out of range.
    if (luma_mode == PredictionMode::NEWMV
      || luma_mode == PredictionMode::NEW_NEWMV
      || luma_mode == PredictionMode::NEW_NEARESTMV)
      && !dmv_in_range(mvs[0], ref_mvs[0])
    {
      return;
    }

    // Modes that code a new second MV: same check on the second MV.
    if (luma_mode == PredictionMode::NEW_NEWMV
      || luma_mode == PredictionMode::NEAREST_NEWMV)
      && !dmv_in_range(mvs[1], ref_mvs[1])
    {
      return;
    }
  }

  // Find the best chroma prediction mode for the current luma prediction mode.
  // Returns whether the last candidate written to `best` by this call had
  // zero distortion (`false` if this call never updated `best`).
  let mut chroma_rdo = |skip: bool| -> bool {
    use crate::segmentation::select_segment;

    let mut zero_distortion = false;

    for sidx in select_segment(fi, ts, tile_bo, bsize, skip) {
      cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx);

      // The transform choice is made once per segment and reused for every
      // chroma mode below.
      let (tx_size, tx_type) = rdo_tx_size_type(
        fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip,
      );
      for &chroma_mode in mode_set_chroma.iter() {
        let wr = &mut WriterCounter::new();
        let tell = wr.tell_frac();

        // Include the cost of signalling PARTITION_NONE for square blocks
        // of at least 8x8.
        if bsize >= BlockSize::BLOCK_8X8 && bsize.is_sqr() {
          cw.write_partition(
            wr,
            tile_bo,
            PartitionType::PARTITION_NONE,
            bsize,
          );
        }

        // TODO(yushin): luma and chroma would have different decision based on chroma format
        let need_recon_pixel =
          luma_mode_is_intra && tx_size.block_size() != bsize;

        encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip);
        let (has_coeff, tx_dist) = encode_block_post_cdef(
          fi,
          ts,
          cw,
          wr,
          luma_mode,
          chroma_mode,
          angle_delta,
          ref_frames,
          mvs,
          bsize,
          tile_bo,
          skip,
          CFLParams::default(),
          tx_size,
          tx_type,
          mode_context,
          mv_stack,
          rdo_type,
          need_recon_pixel,
          None,
        );

        // Rate is everything written to `wr` since `tell` was taken.
        let rate = wr.tell_frac() - tell;
        // Tx-domain distortion is only used when it is enabled and
        // `need_recon_pixel` is false.
        let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel {
          compute_tx_distortion(
            fi,
            ts,
            bsize,
            is_chroma_block,
            tile_bo,
            tx_dist,
            skip,
            false,
          )
        } else {
          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false)
        };
        let is_zero_dist = distortion.0 == 0;
        let rd = compute_rd_cost(fi, rate, distortion);
        if rd < best.rd_cost {
          best.rd_cost = rd;
          best.pred_mode_luma = luma_mode;
          best.pred_mode_chroma = chroma_mode;
          best.angle_delta = angle_delta;
          best.ref_frames = ref_frames;
          best.mvs = mvs;
          best.skip = skip;
          best.has_coeff = has_coeff;
          best.tx_size = tx_size;
          best.tx_type = tx_type;
          best.sidx = sidx;
          zero_distortion = is_zero_dist;
        }

        // Undo the trial encode before evaluating the next candidate.
        cw.rollback(cw_checkpoint);
      }
    }

    zero_distortion
  };

  // Don't skip when using intra modes
  let zero_distortion =
    if !luma_mode_is_intra { chroma_rdo(true) } else { false };
  // early skip
  if !zero_distortion {
    chroma_rdo(false);
  }
}
955
956/// RDO-based mode decision
957///
958/// # Panics
959///
960/// - If the best RD found is negative.
961///   This should never happen and indicates a development error.
962#[profiling::function]
963pub fn rdo_mode_decision<T: Pixel>(
964  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
965  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
966  inter_cfg: &InterConfig,
967) -> PartitionParameters {
968  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
969  let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
970
971  let rdo_type = if fi.use_tx_domain_rate {
972    RDOType::TxDistEstRate
973  } else if fi.use_tx_domain_distortion {
974    RDOType::TxDistRealRate
975  } else {
976    RDOType::PixelDistRealRate
977  };
978
979  let mut best = if fi.frame_type.has_inter() {
980    assert!(fi.frame_type != FrameType::KEY);
981
982    inter_frame_rdo_mode_decision(
983      fi,
984      ts,
985      cw,
986      bsize,
987      tile_bo,
988      inter_cfg,
989      &cw_checkpoint,
990      rdo_type,
991    )
992  } else {
993    PartitionParameters::default()
994  };
995
996  let is_chroma_block =
997    has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
998
999  if !best.skip {
1000    best = intra_frame_rdo_mode_decision(
1001      fi,
1002      ts,
1003      cw,
1004      bsize,
1005      tile_bo,
1006      &cw_checkpoint,
1007      rdo_type,
1008      best,
1009      is_chroma_block,
1010    );
1011  }
1012
1013  if best.pred_mode_luma.is_intra() && is_chroma_block && bsize.cfl_allowed() {
1014    cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, best.sidx);
1015
1016    let chroma_mode = PredictionMode::UV_CFL_PRED;
1017    let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
1018    let mut wr = WriterCounter::new();
1019    let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 };
1020
1021    write_tx_blocks(
1022      fi,
1023      ts,
1024      cw,
1025      &mut wr,
1026      best.pred_mode_luma,
1027      best.pred_mode_luma,
1028      angle_delta,
1029      tile_bo,
1030      bsize,
1031      best.tx_size,
1032      best.tx_type,
1033      false,
1034      CFLParams::default(),
1035      true,
1036      rdo_type,
1037      true,
1038    );
1039    cw.rollback(&cw_checkpoint);
1040    if fi.sequence.chroma_sampling != ChromaSampling::Cs400 {
1041      if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) {
1042        let mut wr = WriterCounter::new();
1043        let tell = wr.tell_frac();
1044
1045        encode_block_pre_cdef(
1046          &fi.sequence,
1047          ts,
1048          cw,
1049          &mut wr,
1050          bsize,
1051          tile_bo,
1052          best.skip,
1053        );
1054        let (has_coeff, _) = encode_block_post_cdef(
1055          fi,
1056          ts,
1057          cw,
1058          &mut wr,
1059          best.pred_mode_luma,
1060          chroma_mode,
1061          angle_delta,
1062          best.ref_frames,
1063          best.mvs,
1064          bsize,
1065          tile_bo,
1066          best.skip,
1067          cfl,
1068          best.tx_size,
1069          best.tx_type,
1070          0,
1071          &[],
1072          rdo_type,
1073          true, // For CFL, luma should be always reconstructed.
1074          None,
1075        );
1076
1077        let rate = wr.tell_frac() - tell;
1078
1079        // For CFL, tx-domain distortion is not an option.
1080        let distortion =
1081          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false);
1082        let rd = compute_rd_cost(fi, rate, distortion);
1083        if rd < best.rd_cost {
1084          best.rd_cost = rd;
1085          best.pred_mode_chroma = chroma_mode;
1086          best.angle_delta = angle_delta;
1087          best.has_coeff = has_coeff;
1088          best.pred_cfl_params = cfl;
1089        }
1090
1091        cw.rollback(&cw_checkpoint);
1092      }
1093    }
1094  }
1095
1096  cw.bc.blocks.set_mode(tile_bo, bsize, best.pred_mode_luma);
1097  cw.bc.blocks.set_ref_frames(tile_bo, bsize, best.ref_frames);
1098  cw.bc.blocks.set_motion_vectors(tile_bo, bsize, best.mvs);
1099
1100  assert!(best.rd_cost >= 0_f64);
1101
1102  PartitionParameters {
1103    bo: tile_bo,
1104    bsize,
1105    pred_mode_luma: best.pred_mode_luma,
1106    pred_mode_chroma: best.pred_mode_chroma,
1107    pred_cfl_params: best.pred_cfl_params,
1108    angle_delta: best.angle_delta,
1109    ref_frames: best.ref_frames,
1110    mvs: best.mvs,
1111    rd_cost: best.rd_cost,
1112    skip: best.skip,
1113    has_coeff: best.has_coeff,
1114    tx_size: best.tx_size,
1115    tx_type: best.tx_type,
1116    sidx: best.sidx,
1117  }
1118}
1119
1120#[profiling::function]
1121fn inter_frame_rdo_mode_decision<T: Pixel>(
1122  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1123  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1124  inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint,
1125  rdo_type: RDOType,
1126) -> PartitionParameters {
1127  let mut best = PartitionParameters::default();
1128
1129  // we can never have more than 7 reference frame sets
1130  let mut ref_frames_set = ArrayVec::<_, 7>::new();
1131  // again, max of 7 ref slots
1132  let mut ref_slot_set = ArrayVec::<_, 7>::new();
1133  // our implementation never returns more than 3 at the moment
1134  let mut mvs_from_me = ArrayVec::<_, 3>::new();
1135  let mut fwdref = None;
1136  let mut bwdref = None;
1137
1138  for i in inter_cfg.allowed_ref_frames().iter().copied() {
1139    // Don't search LAST3 since it's used only for probs
1140    if i == LAST3_FRAME {
1141      continue;
1142    }
1143
1144    if !ref_slot_set.contains(&fi.ref_frames[i.to_index()]) {
1145      if fwdref.is_none() && i.is_fwd_ref() {
1146        fwdref = Some(ref_frames_set.len());
1147      }
1148      if bwdref.is_none() && i.is_bwd_ref() {
1149        bwdref = Some(ref_frames_set.len());
1150      }
1151      ref_frames_set.push([i, NONE_FRAME]);
1152      let slot_idx = fi.ref_frames[i.to_index()];
1153      ref_slot_set.push(slot_idx);
1154    }
1155  }
1156  assert!(!ref_frames_set.is_empty());
1157
1158  let mut inter_mode_set = ArrayVec::<(PredictionMode, usize), 20>::new();
1159  let mut mvs_set = ArrayVec::<[MotionVector; 2], 20>::new();
1160  let mut satds = ArrayVec::<u32, 20>::new();
1161  let mut mv_stacks = ArrayVec::<_, 20>::new();
1162  let mut mode_contexts = ArrayVec::<_, 7>::new();
1163
1164  for (i, &ref_frames) in ref_frames_set.iter().enumerate() {
1165    let mut mv_stack = ArrayVec::<CandidateMV, 9>::new();
1166    mode_contexts.push(cw.find_mvrefs(
1167      tile_bo,
1168      ref_frames,
1169      &mut mv_stack,
1170      bsize,
1171      fi,
1172      false,
1173    ));
1174
1175    let mut pmv = [MotionVector::default(); 2];
1176    if !mv_stack.is_empty() {
1177      pmv[0] = mv_stack[0].this_mv;
1178    }
1179    if mv_stack.len() > 1 {
1180      pmv[1] = mv_stack[1].this_mv;
1181    }
1182
1183    let res = estimate_motion(
1184      fi,
1185      ts,
1186      bsize.width(),
1187      bsize.height(),
1188      tile_bo,
1189      ref_frames[0],
1190      Some(pmv),
1191      MVSamplingMode::CORNER { right: true, bottom: true },
1192      false,
1193      0,
1194      None,
1195    )
1196    .unwrap_or_else(MotionSearchResult::empty);
1197    let b_me = res.mv;
1198
1199    mvs_from_me.push([b_me, MotionVector::default()]);
1200
1201    for &x in RAV1E_INTER_MODES_MINIMAL {
1202      inter_mode_set.push((x, i));
1203    }
1204    if !mv_stack.is_empty() {
1205      inter_mode_set.push((PredictionMode::NEAR0MV, i));
1206    }
1207    if mv_stack.len() >= 2 {
1208      inter_mode_set.push((PredictionMode::GLOBALMV, i));
1209    }
1210    let include_near_mvs = fi.config.speed_settings.motion.include_near_mvs;
1211    if include_near_mvs {
1212      if mv_stack.len() >= 3 {
1213        inter_mode_set.push((PredictionMode::NEAR1MV, i));
1214      }
1215      if mv_stack.len() >= 4 {
1216        inter_mode_set.push((PredictionMode::NEAR2MV, i));
1217      }
1218    }
1219    let same_row_col = |x: &CandidateMV| {
1220      x.this_mv.row == mvs_from_me[i][0].row
1221        && x.this_mv.col == mvs_from_me[i][0].col
1222    };
1223    if !mv_stack
1224      .iter()
1225      .take(if include_near_mvs { 4 } else { 2 })
1226      .any(same_row_col)
1227      && (mvs_from_me[i][0].row != 0 || mvs_from_me[i][0].col != 0)
1228    {
1229      inter_mode_set.push((PredictionMode::NEWMV, i));
1230    }
1231
1232    mv_stacks.push(mv_stack);
1233  }
1234
1235  let sz = bsize.width_mi().min(bsize.height_mi());
1236
1237  // To use non single reference modes, block width and height must be greater than 4.
1238  if fi.reference_mode != ReferenceMode::SINGLE && sz >= 2 {
1239    // Adding compound candidate
1240    if let Some(r0) = fwdref {
1241      if let Some(r1) = bwdref {
1242        let ref_frames = [ref_frames_set[r0][0], ref_frames_set[r1][0]];
1243        ref_frames_set.push(ref_frames);
1244        let mv0 = mvs_from_me[r0][0];
1245        let mv1 = mvs_from_me[r1][0];
1246        mvs_from_me.push([mv0, mv1]);
1247        let mut mv_stack = ArrayVec::<CandidateMV, 9>::new();
1248        mode_contexts.push(cw.find_mvrefs(
1249          tile_bo,
1250          ref_frames,
1251          &mut mv_stack,
1252          bsize,
1253          fi,
1254          true,
1255        ));
1256        for &x in RAV1E_INTER_COMPOUND_MODES {
1257          // exclude any NEAR mode based on speed setting
1258          if fi.config.speed_settings.motion.include_near_mvs
1259            || !x.has_nearmv()
1260          {
1261            let mv_stack_idx = ref_frames_set.len() - 1;
1262            // exclude NEAR modes if the mv_stack is too short
1263            if !(x.has_nearmv() && x.ref_mv_idx() >= mv_stack.len()) {
1264              inter_mode_set.push((x, mv_stack_idx));
1265            }
1266          }
1267        }
1268        mv_stacks.push(mv_stack);
1269      }
1270    }
1271  }
1272
1273  let num_modes_rdo = if fi.config.speed_settings.prediction.prediction_modes
1274    >= PredictionModesSetting::ComplexAll
1275  {
1276    inter_mode_set.len()
1277  } else {
1278    9 // This number is determined by AWCY test
1279  };
1280
1281  inter_mode_set.iter().for_each(|&(luma_mode, i)| {
1282    let mvs = match luma_mode {
1283      PredictionMode::NEWMV | PredictionMode::NEW_NEWMV => mvs_from_me[i],
1284      PredictionMode::NEARESTMV | PredictionMode::NEAREST_NEARESTMV => {
1285        if !mv_stacks[i].is_empty() {
1286          [mv_stacks[i][0].this_mv, mv_stacks[i][0].comp_mv]
1287        } else {
1288          [MotionVector::default(); 2]
1289        }
1290      }
1291      PredictionMode::NEAR0MV | PredictionMode::NEAR_NEAR0MV => {
1292        if mv_stacks[i].len() > 1 {
1293          [mv_stacks[i][1].this_mv, mv_stacks[i][1].comp_mv]
1294        } else {
1295          [MotionVector::default(); 2]
1296        }
1297      }
1298      PredictionMode::NEAR1MV
1299      | PredictionMode::NEAR2MV
1300      | PredictionMode::NEAR_NEAR1MV
1301      | PredictionMode::NEAR_NEAR2MV => [
1302        mv_stacks[i][luma_mode.ref_mv_idx()].this_mv,
1303        mv_stacks[i][luma_mode.ref_mv_idx()].comp_mv,
1304      ],
1305      PredictionMode::NEAREST_NEWMV => {
1306        [mv_stacks[i][0].this_mv, mvs_from_me[i][1]]
1307      }
1308      PredictionMode::NEW_NEARESTMV => {
1309        [mvs_from_me[i][0], mv_stacks[i][0].comp_mv]
1310      }
1311      PredictionMode::GLOBALMV | PredictionMode::GLOBAL_GLOBALMV => {
1312        [MotionVector::default(); 2]
1313      }
1314      _ => {
1315        unimplemented!();
1316      }
1317    };
1318    mvs_set.push(mvs);
1319
1320    // Calculate SATD for each mode
1321    if num_modes_rdo != inter_mode_set.len() {
1322      let tile_rect = ts.tile_rect();
1323      let rec = &mut ts.rec.planes[0];
1324      let po = tile_bo.plane_offset(rec.plane_cfg);
1325      let mut rec_region =
1326        rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1327
1328      luma_mode.predict_inter(
1329        fi,
1330        tile_rect,
1331        0,
1332        po,
1333        &mut rec_region,
1334        bsize.width(),
1335        bsize.height(),
1336        ref_frames_set[i],
1337        mvs,
1338        &mut ts.inter_compound_buffers,
1339      );
1340
1341      let plane_org = ts.input_tile.planes[0]
1342        .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
1343      let plane_ref = rec_region.as_const();
1344
1345      let satd = get_satd(
1346        &plane_org,
1347        &plane_ref,
1348        bsize.width(),
1349        bsize.height(),
1350        fi.sequence.bit_depth,
1351        fi.cpu_feature_level,
1352      );
1353      satds.push(satd);
1354    } else {
1355      satds.push(0);
1356    }
1357  });
1358
1359  let mut sorted =
1360    izip!(inter_mode_set, mvs_set, satds).collect::<ArrayVec<_, 20>>();
1361  if num_modes_rdo != sorted.len() {
1362    sorted.sort_by_key(|((_mode, _i), _mvs, satd)| *satd);
1363  }
1364
1365  sorted.iter().take(num_modes_rdo).for_each(
1366    |&((luma_mode, i), mvs, _satd)| {
1367      let mode_set_chroma = ArrayVec::from([luma_mode]);
1368
1369      luma_chroma_mode_rdo(
1370        luma_mode,
1371        fi,
1372        bsize,
1373        tile_bo,
1374        ts,
1375        cw,
1376        rdo_type,
1377        cw_checkpoint,
1378        &mut best,
1379        mvs,
1380        ref_frames_set[i],
1381        &mode_set_chroma,
1382        false,
1383        mode_contexts[i],
1384        &mv_stacks[i],
1385        AngleDelta::default(),
1386      );
1387    },
1388  );
1389
1390  best
1391}
1392
1393#[profiling::function]
1394fn intra_frame_rdo_mode_decision<T: Pixel>(
1395  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1396  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1397  cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType,
1398  mut best: PartitionParameters, is_chroma_block: bool,
1399) -> PartitionParameters {
1400  let mut modes = ArrayVec::<_, INTRA_MODES>::new();
1401
1402  // Reduce number of prediction modes at higher speed levels
1403  let num_modes_rdo = if (fi.frame_type == FrameType::KEY
1404    && fi.config.speed_settings.prediction.prediction_modes
1405      >= PredictionModesSetting::ComplexKeyframes)
1406    || (fi.frame_type.has_inter()
1407      && fi.config.speed_settings.prediction.prediction_modes
1408        >= PredictionModesSetting::ComplexAll)
1409  {
1410    7
1411  } else {
1412    3
1413  };
1414
1415  let intra_mode_set = RAV1E_INTRA_MODES;
1416
1417  // Find mode with lowest rate cost
1418  {
1419    use crate::ec::cdf_to_pdf;
1420
1421    let probs_all = cdf_to_pdf(if fi.frame_type.has_inter() {
1422      cw.get_cdf_intra_mode(bsize)
1423    } else {
1424      cw.get_cdf_intra_mode_kf(tile_bo)
1425    });
1426
1427    modes.try_extend_from_slice(intra_mode_set).unwrap();
1428    modes.sort_by_key(|&a| !probs_all[a as usize]);
1429  }
1430
1431  // If tx partition (i.e. fi.tx_mode_select) is enabled, the below intra prediction screening
1432  // may be improved by emulating prediction for each tx block.
1433  {
1434    let satds = {
1435      // FIXME: If tx partition is used, this whole sads block should be fixed
1436      let tx_size = bsize.tx_size();
1437      let mut edge_buf = Aligned::uninit_array();
1438      let edge_buf = {
1439        let rec = &ts.rec.planes[0].as_const();
1440        let po = tile_bo.plane_offset(rec.plane_cfg);
1441        // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block
1442        get_intra_edges(
1443          &mut edge_buf,
1444          rec,
1445          tile_bo,
1446          0,
1447          0,
1448          bsize,
1449          po,
1450          tx_size,
1451          fi.sequence.bit_depth,
1452          None,
1453          fi.sequence.enable_intra_edge_filter,
1454          IntraParam::None,
1455        )
1456      };
1457
1458      let ief_params = if fi.sequence.enable_intra_edge_filter {
1459        let above_block_info = ts.above_block_info(tile_bo, 0, 0);
1460        let left_block_info = ts.left_block_info(tile_bo, 0, 0);
1461        Some(IntraEdgeFilterParameters::new(
1462          0,
1463          above_block_info,
1464          left_block_info,
1465        ))
1466      } else {
1467        None
1468      };
1469
1470      let mut satds_all = [0; INTRA_MODES];
1471      for &luma_mode in modes.iter().skip(num_modes_rdo / 2) {
1472        let tile_rect = ts.tile_rect();
1473        let rec = &mut ts.rec.planes[0];
1474        let mut rec_region =
1475          rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1476        // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block
1477        luma_mode.predict_intra(
1478          tile_rect,
1479          &mut rec_region,
1480          tx_size,
1481          fi.sequence.bit_depth,
1482          &[0i16; 2],
1483          IntraParam::None,
1484          if luma_mode.is_directional() { ief_params } else { None },
1485          &edge_buf,
1486          fi.cpu_feature_level,
1487        );
1488
1489        let plane_org = ts.input_tile.planes[0]
1490          .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
1491        let plane_ref = rec_region.as_const();
1492
1493        satds_all[luma_mode as usize] = get_satd(
1494          &plane_org,
1495          &plane_ref,
1496          tx_size.width(),
1497          tx_size.height(),
1498          fi.sequence.bit_depth,
1499          fi.cpu_feature_level,
1500        );
1501      }
1502      satds_all
1503    };
1504
1505    modes[num_modes_rdo / 2..].sort_by_key(|&a| satds[a as usize]);
1506  }
1507
1508  debug_assert!(num_modes_rdo >= 1);
1509
1510  modes.iter().take(num_modes_rdo).for_each(|&luma_mode| {
1511    let mvs = [MotionVector::default(); 2];
1512    let ref_frames = [INTRA_FRAME, NONE_FRAME];
1513    let mut mode_set_chroma = ArrayVec::<_, 2>::new();
1514    mode_set_chroma.push(luma_mode);
1515    if is_chroma_block && luma_mode != PredictionMode::DC_PRED {
1516      mode_set_chroma.push(PredictionMode::DC_PRED);
1517    }
1518    luma_chroma_mode_rdo(
1519      luma_mode,
1520      fi,
1521      bsize,
1522      tile_bo,
1523      ts,
1524      cw,
1525      rdo_type,
1526      cw_checkpoint,
1527      &mut best,
1528      mvs,
1529      ref_frames,
1530      &mode_set_chroma,
1531      true,
1532      0,
1533      &ArrayVec::<CandidateMV, 9>::new(),
1534      AngleDelta::default(),
1535    );
1536  });
1537
1538  if fi.config.speed_settings.prediction.fine_directional_intra
1539    && bsize >= BlockSize::BLOCK_8X8
1540  {
1541    // Find the best angle delta for the current best prediction mode
1542    let luma_deltas = best.pred_mode_luma.angle_delta_count();
1543    let chroma_deltas = best.pred_mode_chroma.angle_delta_count();
1544
1545    let mvs = [MotionVector::default(); 2];
1546    let ref_frames = [INTRA_FRAME, NONE_FRAME];
1547    let mode_set_chroma = [best.pred_mode_chroma];
1548    let mv_stack = ArrayVec::<_, 9>::new();
1549    let mut best_angle_delta = best.angle_delta;
1550    let mut angle_delta_rdo = |y, uv| -> AngleDelta {
1551      if best.angle_delta.y != y || best.angle_delta.uv != uv {
1552        luma_chroma_mode_rdo(
1553          best.pred_mode_luma,
1554          fi,
1555          bsize,
1556          tile_bo,
1557          ts,
1558          cw,
1559          rdo_type,
1560          cw_checkpoint,
1561          &mut best,
1562          mvs,
1563          ref_frames,
1564          &mode_set_chroma,
1565          true,
1566          0,
1567          &mv_stack,
1568          AngleDelta { y, uv },
1569        );
1570      }
1571      best.angle_delta
1572    };
1573
1574    for i in 0..luma_deltas {
1575      let angle_delta_y =
1576        if luma_deltas == 1 { 0 } else { i - MAX_ANGLE_DELTA as i8 };
1577      best_angle_delta = angle_delta_rdo(angle_delta_y, best_angle_delta.uv);
1578    }
1579    for j in 0..chroma_deltas {
1580      let angle_delta_uv =
1581        if chroma_deltas == 1 { 0 } else { j - MAX_ANGLE_DELTA as i8 };
1582      best_angle_delta = angle_delta_rdo(best_angle_delta.y, angle_delta_uv);
1583    }
1584  }
1585
1586  best
1587}
1588
1589/// # Panics
1590///
1591/// - If the block size is invalid for subsampling.
1592#[profiling::function]
1593pub fn rdo_cfl_alpha<T: Pixel>(
1594  ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize,
1595  luma_tx_size: TxSize, fi: &FrameInvariants<T>,
1596) -> Option<CFLParams> {
1597  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
1598  let uv_tx_size = bsize.largest_chroma_tx_size(xdec, ydec);
1599  debug_assert!(
1600    bsize.subsampled_size(xdec, ydec).unwrap() == uv_tx_size.block_size()
1601  );
1602
1603  let frame_bo = ts.to_frame_block_offset(tile_bo);
1604  let (visible_tx_w, visible_tx_h) = clip_visible_bsize(
1605    (fi.width + xdec) >> xdec,
1606    (fi.height + ydec) >> ydec,
1607    uv_tx_size.block_size(),
1608    (frame_bo.0.x << MI_SIZE_LOG2) >> xdec,
1609    (frame_bo.0.y << MI_SIZE_LOG2) >> ydec,
1610  );
1611
1612  if visible_tx_w == 0 || visible_tx_h == 0 {
1613    return None;
1614  };
1615  let mut ac = Aligned::<[MaybeUninit<i16>; 32 * 32]>::uninit_array();
1616  let ac = luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi);
1617  let best_alpha: ArrayVec<i16, 2> = (1..3)
1618    .map(|p| {
1619      let &PlaneConfig { xdec, ydec, .. } = ts.rec.planes[p].plane_cfg;
1620      let tile_rect = ts.tile_rect().decimated(xdec, ydec);
1621      let rec = &mut ts.rec.planes[p];
1622      let input = &ts.input_tile.planes[p];
1623      let po = tile_bo.plane_offset(rec.plane_cfg);
1624      let mut edge_buf = Aligned::uninit_array();
1625      let edge_buf = get_intra_edges(
1626        &mut edge_buf,
1627        &rec.as_const(),
1628        tile_bo,
1629        0,
1630        0,
1631        bsize,
1632        po,
1633        uv_tx_size,
1634        fi.sequence.bit_depth,
1635        Some(PredictionMode::UV_CFL_PRED),
1636        fi.sequence.enable_intra_edge_filter,
1637        IntraParam::None,
1638      );
1639      let mut alpha_cost = |alpha: i16| -> u64 {
1640        let mut rec_region =
1641          rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1642        PredictionMode::UV_CFL_PRED.predict_intra(
1643          tile_rect,
1644          &mut rec_region,
1645          uv_tx_size,
1646          fi.sequence.bit_depth,
1647          ac,
1648          IntraParam::Alpha(alpha),
1649          None,
1650          &edge_buf,
1651          fi.cpu_feature_level,
1652        );
1653        sse_wxh(
1654          &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }),
1655          &rec_region.as_const(),
1656          visible_tx_w,
1657          visible_tx_h,
1658          |_, _| DistortionScale::default(), // We're not doing RDO here.
1659          fi.sequence.bit_depth,
1660          fi.cpu_feature_level,
1661        )
1662        .0
1663      };
1664      let mut best = (alpha_cost(0), 0);
1665      let mut count = 2;
1666      for alpha in 1i16..=16i16 {
1667        let cost = (alpha_cost(alpha), alpha_cost(-alpha));
1668        if cost.0 < best.0 {
1669          best = (cost.0, alpha);
1670          count += 2;
1671        }
1672        if cost.1 < best.0 {
1673          best = (cost.1, -alpha);
1674          count += 2;
1675        }
1676        if count < alpha {
1677          break;
1678        }
1679      }
1680      best.1
1681    })
1682    .collect();
1683
1684  if best_alpha[0] == 0 && best_alpha[1] == 0 {
1685    None
1686  } else {
1687    Some(CFLParams::from_alpha(best_alpha[0], best_alpha[1]))
1688  }
1689}
1690
1691/// RDO-based transform type decision
1692/// If `cw_checkpoint` is `None`, a checkpoint for cw's (`ContextWriter`) current
1693/// state is created and stored for later use.
1694///
1695/// # Panics
1696///
1697/// - If a writer checkpoint is never created before or within the function.
1698///   This should never happen and indicates a development error.
1699/// - If the best RD found is negative.
1700///   This should never happen and indicates a development error.
1701pub fn rdo_tx_type_decision<T: Pixel>(
1702  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1703  cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>,
1704  mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
1705  bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet,
1706  tx_types: &[TxType], cur_best_rd: f64,
1707) -> (TxType, f64) {
1708  let mut best_type = TxType::DCT_DCT;
1709  let mut best_rd = f64::MAX;
1710
1711  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
1712  let is_chroma_block =
1713    has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
1714
1715  let is_inter = !mode.is_intra();
1716
1717  if cw_checkpoint.is_none() {
1718    // Only run the first call
1719    // Prevents creating multiple checkpoints for own version of cw
1720    *cw_checkpoint =
1721      Some(cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling));
1722  }
1723
1724  let rdo_type = if fi.use_tx_domain_distortion {
1725    RDOType::TxDistRealRate
1726  } else {
1727    RDOType::PixelDistRealRate
1728  };
1729  let need_recon_pixel = tx_size.block_size() != bsize && !is_inter;
1730
1731  let mut first_iteration = true;
1732  for &tx_type in tx_types {
1733    // Skip unsupported transform types
1734    if av1_tx_used[tx_set as usize][tx_type as usize] == 0 {
1735      continue;
1736    }
1737
1738    if is_inter {
1739      motion_compensate(
1740        fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true,
1741      );
1742    }
1743
1744    let mut wr = WriterCounter::new();
1745    let tell = wr.tell_frac();
1746    let (_, tx_dist) = if is_inter {
1747      write_tx_tree(
1748        fi,
1749        ts,
1750        cw,
1751        &mut wr,
1752        mode,
1753        0,
1754        tile_bo,
1755        bsize,
1756        tx_size,
1757        tx_type,
1758        false,
1759        true,
1760        rdo_type,
1761        need_recon_pixel,
1762      )
1763    } else {
1764      write_tx_blocks(
1765        fi,
1766        ts,
1767        cw,
1768        &mut wr,
1769        mode,
1770        mode,
1771        AngleDelta::default(),
1772        tile_bo,
1773        bsize,
1774        tx_size,
1775        tx_type,
1776        false,
1777        CFLParams::default(), // Unused.
1778        true,
1779        rdo_type,
1780        need_recon_pixel,
1781      )
1782    };
1783
1784    let rate = wr.tell_frac() - tell;
1785    let distortion = if fi.use_tx_domain_distortion {
1786      compute_tx_distortion(
1787        fi,
1788        ts,
1789        bsize,
1790        is_chroma_block,
1791        tile_bo,
1792        tx_dist,
1793        false,
1794        true,
1795      )
1796    } else {
1797      compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true)
1798    };
1799    cw.rollback(cw_checkpoint.as_ref().unwrap());
1800
1801    let rd = compute_rd_cost(fi, rate, distortion);
1802
1803    if first_iteration {
1804      // We use an optimization to early exit after testing the first
1805      // transform type if the cost is higher than the existing best.
1806      // The idea is that if this transform size is not better than he
1807      // previous size, it is not worth testing remaining modes for this size.
1808      if rd > cur_best_rd {
1809        break;
1810      }
1811      first_iteration = false;
1812    }
1813
1814    if rd < best_rd {
1815      best_rd = rd;
1816      best_type = tx_type;
1817    }
1818  }
1819
1820  assert!(best_rd >= 0_f64);
1821
1822  (best_type, best_rd)
1823}
1824
1825pub fn get_sub_partitions(
1826  four_partitions: &[TileBlockOffset; 4], partition: PartitionType,
1827) -> ArrayVec<TileBlockOffset, 4> {
1828  let mut partition_offsets = ArrayVec::<TileBlockOffset, 4>::new();
1829
1830  partition_offsets.push(four_partitions[0]);
1831
1832  if partition == PARTITION_NONE {
1833    return partition_offsets;
1834  }
1835  if partition == PARTITION_VERT || partition == PARTITION_SPLIT {
1836    partition_offsets.push(four_partitions[1]);
1837  };
1838  if partition == PARTITION_HORZ || partition == PARTITION_SPLIT {
1839    partition_offsets.push(four_partitions[2]);
1840  };
1841  if partition == PARTITION_SPLIT {
1842    partition_offsets.push(four_partitions[3]);
1843  };
1844
1845  partition_offsets
1846}
1847
1848#[inline(always)]
1849fn rdo_partition_none<T: Pixel>(
1850  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1851  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1852  inter_cfg: &InterConfig, child_modes: &mut ArrayVec<PartitionParameters, 4>,
1853) -> f64 {
1854  debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
1855
1856  let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
1857  let cost = mode.rd_cost;
1858
1859  child_modes.push(mode);
1860
1861  cost
1862}
1863
1864// VERTICAL, HORIZONTAL or simple SPLIT
1865#[inline(always)]
1866fn rdo_partition_simple<T: Pixel, W: Writer>(
1867  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1868  cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
1869  bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig,
1870  partition: PartitionType, rdo_type: RDOType, best_rd: f64,
1871  child_modes: &mut ArrayVec<PartitionParameters, 4>,
1872) -> Option<f64> {
1873  debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
1874  let subsize = bsize.subsize(partition).unwrap();
1875
1876  let cost = if bsize >= BlockSize::BLOCK_8X8 {
1877    let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
1878    let tell = w.tell_frac();
1879    cw.write_partition(w, tile_bo, partition, bsize);
1880    compute_rd_cost(fi, w.tell_frac() - tell, ScaledDistortion::zero())
1881  } else {
1882    0.0
1883  };
1884
1885  let hbsw = subsize.width_mi(); // Half the block size width in blocks
1886  let hbsh = subsize.height_mi(); // Half the block size height in blocks
1887  let four_partitions = [
1888    tile_bo,
1889    TileBlockOffset(BlockOffset { x: tile_bo.0.x + hbsw, y: tile_bo.0.y }),
1890    TileBlockOffset(BlockOffset { x: tile_bo.0.x, y: tile_bo.0.y + hbsh }),
1891    TileBlockOffset(BlockOffset {
1892      x: tile_bo.0.x + hbsw,
1893      y: tile_bo.0.y + hbsh,
1894    }),
1895  ];
1896
1897  let partitions = get_sub_partitions(&four_partitions, partition);
1898
1899  let mut rd_cost_sum = 0.0;
1900
1901  for offset in partitions {
1902    let hbs = subsize.width_mi() >> 1;
1903    let has_cols = offset.0.x + hbs < ts.mi_width;
1904    let has_rows = offset.0.y + hbs < ts.mi_height;
1905
1906    if has_cols && has_rows {
1907      let mode_decision =
1908        rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg);
1909
1910      rd_cost_sum += mode_decision.rd_cost;
1911
1912      if fi.enable_early_exit && rd_cost_sum > best_rd {
1913        return None;
1914      }
1915      if subsize >= BlockSize::BLOCK_8X8 && subsize.is_sqr() {
1916        let w: &mut W =
1917          if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
1918        cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize);
1919      }
1920      encode_block_with_modes(
1921        fi,
1922        ts,
1923        cw,
1924        w_pre_cdef,
1925        w_post_cdef,
1926        subsize,
1927        offset,
1928        &mode_decision,
1929        rdo_type,
1930        None,
1931      );
1932      child_modes.push(mode_decision);
1933    } else {
1934      //rd_cost_sum += f64::MAX;
1935      return None;
1936    }
1937  }
1938
1939  Some(cost + rd_cost_sum)
1940}
1941
1942/// RDO-based single level partitioning decision
1943///
1944/// # Panics
1945///
1946/// - If the best RD found is negative.
1947///   This should never happen, and indicates a development error.
1948#[profiling::function]
1949pub fn rdo_partition_decision<T: Pixel, W: Writer>(
1950  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1951  cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
1952  bsize: BlockSize, tile_bo: TileBlockOffset,
1953  cached_block: &PartitionGroupParameters, partition_types: &[PartitionType],
1954  rdo_type: RDOType, inter_cfg: &InterConfig,
1955) -> PartitionGroupParameters {
1956  let mut best_partition = cached_block.part_type;
1957  let mut best_rd = cached_block.rd_cost;
1958  let mut best_pred_modes = cached_block.part_modes.clone();
1959
1960  let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
1961  let w_pre_checkpoint = w_pre_cdef.checkpoint();
1962  let w_post_checkpoint = w_post_cdef.checkpoint();
1963
1964  for &partition in partition_types {
1965    // Do not re-encode results we already have
1966    if partition == cached_block.part_type {
1967      continue;
1968    }
1969
1970    let mut child_modes = ArrayVec::<_, 4>::new();
1971
1972    let cost = match partition {
1973      PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => {
1974        Some(rdo_partition_none(
1975          fi,
1976          ts,
1977          cw,
1978          bsize,
1979          tile_bo,
1980          inter_cfg,
1981          &mut child_modes,
1982        ))
1983      }
1984      PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => {
1985        rdo_partition_simple(
1986          fi,
1987          ts,
1988          cw,
1989          w_pre_cdef,
1990          w_post_cdef,
1991          bsize,
1992          tile_bo,
1993          inter_cfg,
1994          partition,
1995          rdo_type,
1996          best_rd,
1997          &mut child_modes,
1998        )
1999      }
2000      _ => {
2001        unreachable!();
2002      }
2003    };
2004
2005    if let Some(rd) = cost {
2006      if rd < best_rd {
2007        best_rd = rd;
2008        best_partition = partition;
2009        best_pred_modes.clone_from(&child_modes);
2010      }
2011    }
2012    cw.rollback(&cw_checkpoint);
2013    w_pre_cdef.rollback(&w_pre_checkpoint);
2014    w_post_cdef.rollback(&w_post_checkpoint);
2015  }
2016
2017  assert!(best_rd >= 0_f64);
2018
2019  PartitionGroupParameters {
2020    rd_cost: best_rd,
2021    part_type: best_partition,
2022    part_modes: best_pred_modes,
2023  }
2024}
2025
2026#[profiling::function]
2027fn rdo_loop_plane_error<T: Pixel>(
2028  base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset,
2029  sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>,
2030  blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize,
2031) -> ScaledDistortion {
2032  let sb_w_blocks =
2033    if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_w;
2034  let sb_h_blocks =
2035    if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_h;
2036  // Each direction block is 8x8 in y, potentially smaller if subsampled in chroma
2037  // accumulating in-frame and unpadded
2038  let mut err = Distortion::zero();
2039  for by in 0..sb_h_blocks {
2040    for bx in 0..sb_w_blocks {
2041      let loop_bo = offset_sbo.block_offset(bx << 1, by << 1);
2042      if loop_bo.0.x < blocks.cols() && loop_bo.0.y < blocks.rows() {
2043        let src_plane = &src.planes[pli];
2044        let test_plane = &test.planes[pli];
2045        let PlaneConfig { xdec, ydec, .. } = *src_plane.plane_cfg;
2046        debug_assert_eq!(xdec, test_plane.cfg.xdec);
2047        debug_assert_eq!(ydec, test_plane.cfg.ydec);
2048
2049        // Unfortunately, our distortion biases are only available via
2050        // Frame-absolute addressing, so we need a block offset
2051        // relative to the full frame origin (not the tile or analysis
2052        // area)
2053        let frame_bo = (base_sbo + offset_sbo).block_offset(bx << 1, by << 1);
2054        let bias = distortion_scale(
2055          fi,
2056          ts.to_frame_block_offset(frame_bo),
2057          BlockSize::BLOCK_8X8,
2058        );
2059
2060        let src_region =
2061          src_plane.subregion(Area::BlockStartingAt { bo: loop_bo.0 });
2062        let test_region =
2063          test_plane.region(Area::BlockStartingAt { bo: loop_bo.0 });
2064
2065        err += if pli == 0 {
2066          // For loop filters, We intentionally use cdef_dist even with
2067          // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a
2068          // significant negative impact on other metrics and visual quality.
2069          RawDistortion(cdef_dist_kernel(
2070            &src_region,
2071            &test_region,
2072            8,
2073            8,
2074            fi.sequence.bit_depth,
2075            fi.cpu_feature_level,
2076          ) as u64)
2077            * bias
2078        } else {
2079          sse_wxh(
2080            &src_region,
2081            &test_region,
2082            8 >> xdec,
2083            8 >> ydec,
2084            |_, _| bias,
2085            fi.sequence.bit_depth,
2086            fi.cpu_feature_level,
2087          )
2088        };
2089      }
2090    }
2091  }
2092  err * fi.dist_scale[pli]
2093}
2094
2095/// Passed in a superblock offset representing the upper left corner of
2096/// the LRU area we're optimizing.  This area covers the largest LRU in
2097/// any of the present planes, but may consist of a number of
2098/// superblocks and full, smaller LRUs in the other planes
2099///
2100/// # Panics
2101///
2102/// - If both CDEF and LRF are disabled.
2103#[profiling::function]
2104pub fn rdo_loop_decision<T: Pixel, W: Writer>(
2105  base_sbo: TileSuperBlockOffset, fi: &FrameInvariants<T>,
2106  ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W,
2107  deblock_p: bool,
2108) {
2109  let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 {
2110    1
2111  } else {
2112    MAX_PLANES
2113  };
2114  assert!(fi.sequence.enable_cdef || fi.sequence.enable_restoration);
2115  // Determine area of optimization: Which plane has the largest LRUs?
2116  // How many LRUs for each?
2117  let mut sb_w = 1; // how many superblocks wide the largest LRU
2118                    // is/how many SBs we're processing (same thing)
2119  let mut sb_h = 1; // how many superblocks wide the largest LRU
2120                    // is/how many SBs we're processing (same thing)
2121  let mut lru_w = [0; MAX_PLANES]; // how many LRUs we're processing
2122  let mut lru_h = [0; MAX_PLANES]; // how many LRUs we're processing
2123  for pli in 0..planes {
2124    let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
2125    let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
2126    if sb_w < (1 << sb_h_shift) {
2127      sb_w = 1 << sb_h_shift;
2128    }
2129    if sb_h < (1 << sb_v_shift) {
2130      sb_h = 1 << sb_v_shift;
2131    }
2132  }
2133  for pli in 0..planes {
2134    let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
2135    let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
2136    lru_w[pli] = sb_w / (1 << sb_h_shift);
2137    lru_h[pli] = sb_h / (1 << sb_v_shift);
2138  }
2139
2140  // The superblock width/height determinations may be calling for us
2141  // to compute over superblocks that do not actually exist in the
2142  // frame (off the right or lower edge).  Trim sb width/height down
2143  // to actual superblocks.  Note that these last superblocks on the
2144  // right/bottom may themselves still span the edge of the frame, but
2145  // they do hold at least some visible pixels.
2146  sb_w = sb_w.min(ts.sb_width - base_sbo.0.x);
2147  sb_h = sb_h.min(ts.sb_height - base_sbo.0.y);
2148
2149  // We have need to know the Y visible pixel limits as well (the
2150  // sb_w/sb_h figures above can be used to determine how many
2151  // allocated pixels, possibly beyond the visible frame, exist).
2152  let crop_w =
2153    fi.width - ((ts.sbo.0.x + base_sbo.0.x) << SUPERBLOCK_TO_PLANE_SHIFT);
2154  let crop_h =
2155    fi.height - ((ts.sbo.0.y + base_sbo.0.y) << SUPERBLOCK_TO_PLANE_SHIFT);
2156  let pixel_w = crop_w.min(sb_w << SUPERBLOCK_TO_PLANE_SHIFT);
2157  let pixel_h = crop_h.min(sb_h << SUPERBLOCK_TO_PLANE_SHIFT);
2158
2159  // Based on `RestorationState::new`
2160  const MAX_SB_SHIFT: usize = 4;
2161  const MAX_SB_SIZE: usize = 1 << MAX_SB_SHIFT;
2162  const MAX_LRU_SIZE: usize = MAX_SB_SIZE;
2163
2164  // Static allocation relies on the "minimal LRU area for all N planes" invariant.
2165  let mut best_index = [-1; MAX_SB_SIZE * MAX_SB_SIZE];
2166  let mut best_lrf =
2167    [[RestorationFilter::None; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];
2168
2169  // due to imprecision in the reconstruction parameter solver, we
2170  // need to make sure we don't fall into a limit cycle.  Track our
2171  // best cost at LRF so that we can break if we get a solution that doesn't
2172  // improve at the reconstruction stage.
2173  let mut best_lrf_cost = [[-1.0; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];
2174
2175  // sub-setted region of the TileBlocks for our working frame area.
2176  // Note that the size of this subset is what signals CDEF as to the
2177  // actual coded size.
2178  let mut tileblocks_subset = cw.bc.blocks.subregion_mut(
2179    base_sbo.block_offset(0, 0).0.x,
2180    base_sbo.block_offset(0, 0).0.y,
2181    sb_w << SUPERBLOCK_TO_BLOCK_SHIFT,
2182    sb_h << SUPERBLOCK_TO_BLOCK_SHIFT,
2183  );
2184
2185  // cdef doesn't run on superblocks that are completely skipped.
2186  // Determine which super blocks are marked as skipped so we can avoid running
2187  // them. If all blocks are skipped, we can avoid some of the overhead related
2188  // to setting up for cdef.
2189  let mut cdef_skip = [true; MAX_SB_SIZE * MAX_SB_SIZE];
2190  let mut cdef_skip_all = true;
2191  if fi.sequence.enable_cdef {
2192    for sby in 0..sb_h {
2193      for sbx in 0..sb_w {
2194        let blocks = tileblocks_subset.subregion(16 * sbx, 16 * sby, 16, 16);
2195        let mut skip = true;
2196        for y in 0..blocks.rows() {
2197          for block in blocks[y].iter() {
2198            skip &= block.skip;
2199          }
2200        }
2201        cdef_skip[sby * MAX_SB_SIZE + sbx] = skip;
2202        cdef_skip_all &= skip;
2203      }
2204    }
2205  }
2206
2207  // Unlike cdef, loop restoration will run regardless of whether blocks are
2208  // skipped or not. At the same time, the most significant improvement will
2209  // generally be from un-skipped blocks, so lru is only performed if there are
2210  // un-skipped blocks.
2211  // This should be the same as `cdef_skip_all`, except when cdef is disabled.
2212  let mut lru_skip_all = true;
2213  let mut lru_skip = [[true; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];
2214  if fi.sequence.enable_restoration {
2215    if fi.config.speed_settings.lru_on_skip {
2216      lru_skip_all = false;
2217      lru_skip = [[false; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];
2218    } else {
2219      for pli in 0..planes {
2220        // width, in sb, of an LRU in this plane
2221        let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift;
2222        // height, in sb, of an LRU in this plane
2223        let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift;
2224        for lru_y in 0..lru_h[pli] {
2225          // number of LRUs vertically
2226          for lru_x in 0..lru_w[pli] {
2227            // number of LRUs horizontally
2228
2229            let loop_sbo = TileSuperBlockOffset(SuperBlockOffset {
2230              x: lru_x * lru_sb_w,
2231              y: lru_y * lru_sb_h,
2232            });
2233
2234            if !ts.restoration.has_restoration_unit(
2235              base_sbo + loop_sbo,
2236              pli,
2237              false,
2238            ) {
2239              continue;
2240            }
2241
2242            let start = loop_sbo.block_offset(0, 0).0;
2243            let size = TileSuperBlockOffset(SuperBlockOffset {
2244              x: lru_sb_w,
2245              y: lru_sb_h,
2246            })
2247            .block_offset(0, 0)
2248            .0;
2249
2250            let blocks =
2251              tileblocks_subset.subregion(start.x, start.y, size.x, size.y);
2252            let mut skip = true;
2253            for y in 0..blocks.rows() {
2254              for block in blocks[y].iter() {
2255                skip &= block.skip;
2256              }
2257            }
2258            lru_skip[lru_y * MAX_LRU_SIZE + lru_x][pli] = skip;
2259            lru_skip_all &= skip;
2260          }
2261        }
2262      }
2263    }
2264  }
2265
2266  // Return early if all blocks are skipped for lru and cdef.
2267  if lru_skip_all && cdef_skip_all {
2268    return;
2269  }
2270
2271  // Loop filter RDO is an iterative process and we need temporary
2272  // scratch data to hold the results of deblocking, cdef, and the
2273  // loop reconstruction filter so that each can be partially updated
2274  // without recomputing the entire stack.  Construct
2275  // largest-LRU-sized frames for each, accounting for padding
2276  // required by deblocking, cdef and [optionally] LR.
2277  let mut rec_subset = ts
2278    .rec
2279    .subregion(Area::BlockRect {
2280      bo: base_sbo.block_offset(0, 0).0,
2281      width: (pixel_w + 7) >> 3 << 3,
2282      height: (pixel_h + 7) >> 3 << 3,
2283    })
2284    .scratch_copy();
2285
2286  // const, no need to copy, just need the subregion (but do zero the
2287  // origin to match the other copies/new backing frames).
2288  let src_subset = ts
2289    .input_tile
2290    .subregion(Area::BlockRect {
2291      bo: base_sbo.block_offset(0, 0).0,
2292      width: (pixel_w + 7) >> 3 << 3,
2293      height: (pixel_h + 7) >> 3 << 3,
2294    })
2295    .home();
2296
2297  if deblock_p {
2298    // Find a good deblocking filter solution for the passed in area.
2299    // This is not RDO of deblocking itself, merely a solution to get
2300    // better results from CDEF/LRF RDO.
2301    let deblock_levels = deblock_filter_optimize(
2302      fi,
2303      &rec_subset.as_tile(),
2304      &src_subset,
2305      &tileblocks_subset.as_const(),
2306      crop_w,
2307      crop_h,
2308    );
2309
2310    // Deblock the contents of our reconstruction copy.
2311    if deblock_levels[0] != 0 || deblock_levels[1] != 0 {
2312      // copy ts.deblock because we need to set some of our own values here
2313      let mut deblock_copy = *ts.deblock;
2314      deblock_copy.levels = deblock_levels;
2315
2316      // finally, deblock the temp frame
2317      deblock_filter_frame(
2318        &deblock_copy,
2319        &mut rec_subset.as_tile_mut(),
2320        &tileblocks_subset.as_const(),
2321        crop_w,
2322        crop_h,
2323        fi.sequence.bit_depth,
2324        planes,
2325      );
2326    }
2327  }
2328
2329  let mut cdef_work =
2330    if !cdef_skip_all { Some(rec_subset.clone()) } else { None };
2331  let mut lrf_work = if !lru_skip_all {
2332    Some(Frame {
2333      planes: {
2334        let new_plane = |pli: usize| {
2335          let PlaneConfig { xdec, ydec, width, height, .. } =
2336            rec_subset.planes[pli].cfg;
2337          Plane::new(width, height, xdec, ydec, 0, 0)
2338        };
2339        [new_plane(0), new_plane(1), new_plane(2)]
2340      },
2341    })
2342  } else {
2343    None
2344  };
2345
2346  // Precompute directional analysis for CDEF
2347  let cdef_data = {
2348    if cdef_work.is_some() {
2349      Some((
2350        &rec_subset,
2351        cdef_analyze_superblock_range(
2352          fi,
2353          &rec_subset,
2354          &tileblocks_subset.as_const(),
2355          sb_w,
2356          sb_h,
2357        ),
2358      ))
2359    } else {
2360      None
2361    }
2362  };
2363
2364  // CDEF/LRF decision iteration
2365  // Start with a default of CDEF 0 and RestorationFilter::None
2366  // Try all CDEF options for each sb with current LRF; if new CDEF+LRF choice is better, select it.
2367  // Then try all LRF options with current CDEFs; if new CDEFs+LRF choice is better, select it.
2368  // If LRF choice changed for any plane, repeat until no changes
2369  // Limit iterations and where we break based on speed setting (in the TODO list ;-)
2370  let mut cdef_change = true;
2371  let mut lrf_change = true;
2372  while cdef_change || lrf_change {
2373    // search for improved cdef indices, superblock by superblock, if cdef is enabled.
2374    if let (Some((rec_copy, cdef_dirs)), Some(cdef_ref)) =
2375      (&cdef_data, &mut cdef_work.as_mut())
2376    {
2377      for sby in 0..sb_h {
2378        for sbx in 0..sb_w {
2379          // determine whether this superblock can be skipped
2380          if cdef_skip[sby * MAX_SB_SIZE + sbx] {
2381            continue;
2382          }
2383
2384          let prev_best_index = best_index[sby * sb_w + sbx];
2385          let mut best_cost = -1.;
2386          let mut best_new_index = -1i8;
2387
2388          /* offset of the superblock we're currently testing within the larger
2389          analysis area */
2390          let loop_sbo =
2391            TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby });
2392
2393          /* cdef index testing loop */
2394          for cdef_index in 0..(1 << fi.cdef_bits) {
2395            let mut err = ScaledDistortion::zero();
2396            let mut rate = 0;
2397
2398            cdef_filter_superblock(
2399              fi,
2400              &rec_subset,
2401              &mut cdef_ref.as_tile_mut(),
2402              &tileblocks_subset.as_const(),
2403              loop_sbo,
2404              cdef_index,
2405              &cdef_dirs[sby * sb_w + sbx],
2406            );
2407            // apply LRF if any
2408            for pli in 0..planes {
2409              // We need the cropped-to-visible-frame area of this SB
2410              let wh =
2411                if fi.sequence.use_128x128_superblock { 128 } else { 64 };
2412              let PlaneConfig { xdec, ydec, .. } = cdef_ref.planes[pli].cfg;
2413              let vis_width = (wh >> xdec).min(
2414                (crop_w >> xdec)
2415                  - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).x
2416                    as usize,
2417              );
2418              let vis_height = (wh >> ydec).min(
2419                (crop_h >> ydec)
2420                  - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).y
2421                    as usize,
2422              );
2423              // which LRU are we currently testing against?
2424              if let (Some((lru_x, lru_y)), Some(lrf_ref)) = {
2425                let rp = &ts.restoration.planes[pli];
2426                (
2427                  rp.restoration_unit_offset(base_sbo, loop_sbo, false),
2428                  &mut lrf_work,
2429                )
2430              } {
2431                // We have a valid LRU, apply LRF, compute error
2432                match best_lrf[lru_y * lru_w[pli] + lru_x][pli] {
2433                  RestorationFilter::None => {
2434                    err += rdo_loop_plane_error(
2435                      base_sbo,
2436                      loop_sbo,
2437                      1,
2438                      1,
2439                      fi,
2440                      ts,
2441                      &tileblocks_subset.as_const(),
2442                      cdef_ref,
2443                      &src_subset,
2444                      pli,
2445                    );
2446                    rate += if fi.sequence.enable_restoration {
2447                      cw.fc.count_lrf_switchable(
2448                        w,
2449                        &ts.restoration.as_const(),
2450                        best_lrf[lru_y * lru_w[pli] + lru_x][pli],
2451                        pli,
2452                      )
2453                    } else {
2454                      0 // no relative cost differeneces to different
2455                        // CDEF params.  If cdef is on, it's a wash.
2456                    };
2457                  }
2458                  RestorationFilter::Sgrproj { set, xqd } => {
2459                    // only run on this single superblock
2460                    let loop_po =
2461                      loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg);
2462                    // todo: experiment with borrowing border pixels
2463                    // rather than edge-extending. Right now this is
2464                    // hard-clipping to the superblock boundary.
2465                    setup_integral_image(
2466                      &mut ts.integral_buffer,
2467                      SOLVE_IMAGE_STRIDE,
2468                      vis_width,
2469                      vis_height,
2470                      vis_width,
2471                      vis_height,
2472                      &cdef_ref.planes[pli].slice(loop_po),
2473                      &cdef_ref.planes[pli].slice(loop_po),
2474                    );
2475                    sgrproj_stripe_filter(
2476                      set,
2477                      xqd,
2478                      fi,
2479                      &ts.integral_buffer,
2480                      SOLVE_IMAGE_STRIDE,
2481                      &cdef_ref.planes[pli].slice(loop_po),
2482                      &mut lrf_ref.planes[pli].region_mut(Area::Rect {
2483                        x: loop_po.x,
2484                        y: loop_po.y,
2485                        width: vis_width,
2486                        height: vis_height,
2487                      }),
2488                    );
2489                    err += rdo_loop_plane_error(
2490                      base_sbo,
2491                      loop_sbo,
2492                      1,
2493                      1,
2494                      fi,
2495                      ts,
2496                      &tileblocks_subset.as_const(),
2497                      lrf_ref,
2498                      &src_subset,
2499                      pli,
2500                    );
2501                    rate += cw.fc.count_lrf_switchable(
2502                      w,
2503                      &ts.restoration.as_const(),
2504                      best_lrf[lru_y * lru_w[pli] + lru_x][pli],
2505                      pli,
2506                    );
2507                  }
2508                  RestorationFilter::Wiener { .. } => unreachable!(), // coming soon
2509                }
2510              } else {
2511                // No actual LRU here, compute error directly from CDEF output.
2512                err += rdo_loop_plane_error(
2513                  base_sbo,
2514                  loop_sbo,
2515                  1,
2516                  1,
2517                  fi,
2518                  ts,
2519                  &tileblocks_subset.as_const(),
2520                  cdef_ref,
2521                  &src_subset,
2522                  pli,
2523                );
2524                // no relative cost differeneces to different
2525                // CDEF params.  If cdef is on, it's a wash.
2526                // rate += 0;
2527              }
2528            }
2529
2530            let cost = compute_rd_cost(fi, rate, err);
2531            if best_cost < 0. || cost < best_cost {
2532              best_cost = cost;
2533              best_new_index = cdef_index as i8;
2534            }
2535          }
2536
2537          // Did we change any preexisting choices?
2538          if best_new_index != prev_best_index {
2539            cdef_change = true;
2540            best_index[sby * sb_w + sbx] = best_new_index;
2541            tileblocks_subset.set_cdef(loop_sbo, best_new_index as u8);
2542          }
2543
2544          let mut cdef_ref_tm = TileMut::new(
2545            cdef_ref,
2546            TileRect {
2547              x: 0,
2548              y: 0,
2549              width: cdef_ref.planes[0].cfg.width,
2550              height: cdef_ref.planes[0].cfg.height,
2551            },
2552          );
2553
2554          // Keep cdef output up to date; we need it for restoration
2555          // both below and above (padding)
2556          cdef_filter_superblock(
2557            fi,
2558            rec_copy,
2559            &mut cdef_ref_tm,
2560            &tileblocks_subset.as_const(),
2561            loop_sbo,
2562            best_index[sby * sb_w + sbx] as u8,
2563            &cdef_dirs[sby * sb_w + sbx],
2564          );
2565        }
2566      }
2567    }
2568
2569    if !cdef_change {
2570      break;
2571    }
2572    cdef_change = false;
2573    lrf_change = false;
2574
2575    // search for improved restoration filter parameters if restoration is enabled
2576    if let Some(lrf_ref) = &mut lrf_work.as_mut() {
2577      let lrf_input = if cdef_work.is_some() {
2578        // When CDEF is enabled, we pull from the CDEF output
2579        cdef_work.as_ref().unwrap()
2580      } else {
2581        // When CDEF is disabled, we pull from the [optionally
2582        // deblocked] reconstruction
2583        &rec_subset
2584      };
2585      for pli in 0..planes {
2586        // Nominal size of LRU in pixels before clipping to visible frame
2587        let unit_size = ts.restoration.planes[pli].rp_cfg.unit_size;
2588        // width, in sb, of an LRU in this plane
2589        let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift;
2590        // height, in sb, of an LRU in this plane
2591        let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift;
2592        let PlaneConfig { xdec, ydec, .. } = lrf_ref.planes[pli].cfg;
2593        for lru_y in 0..lru_h[pli] {
2594          // number of LRUs vertically
2595          for lru_x in 0..lru_w[pli] {
2596            // number of LRUs horizontally
2597
2598            // determine whether this lru should be skipped
2599            if lru_skip[lru_y * MAX_LRU_SIZE + lru_x][pli] {
2600              continue;
2601            }
2602
2603            let loop_sbo = TileSuperBlockOffset(SuperBlockOffset {
2604              x: lru_x * lru_sb_w,
2605              y: lru_y * lru_sb_h,
2606            });
2607            if ts.restoration.has_restoration_unit(
2608              base_sbo + loop_sbo,
2609              pli,
2610              false,
2611            ) {
2612              let src_plane = &src_subset.planes[pli]; // uncompressed input for reference
2613              let lrf_in_plane = &lrf_input.planes[pli];
2614              let lrf_po = loop_sbo.plane_offset(src_plane.plane_cfg);
2615              let mut best_new_lrf = best_lrf[lru_y * lru_w[pli] + lru_x][pli];
2616              let mut best_cost =
2617                best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli];
2618
2619              // Check the no filter option
2620              {
2621                let err = rdo_loop_plane_error(
2622                  base_sbo,
2623                  loop_sbo,
2624                  lru_sb_w,
2625                  lru_sb_h,
2626                  fi,
2627                  ts,
2628                  &tileblocks_subset.as_const(),
2629                  lrf_input,
2630                  &src_subset,
2631                  pli,
2632                );
2633                let rate = cw.fc.count_lrf_switchable(
2634                  w,
2635                  &ts.restoration.as_const(),
2636                  best_new_lrf,
2637                  pli,
2638                );
2639
2640                let cost = compute_rd_cost(fi, rate, err);
2641                // Was this choice actually an improvement?
2642                if best_cost < 0. || cost < best_cost {
2643                  best_cost = cost;
2644                  best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
2645                  best_new_lrf = RestorationFilter::None;
2646                }
2647              }
2648
2649              // Look for a self guided filter
2650              // We need the cropped-to-visible-frame computation area of this LRU
2651              let vis_width = unit_size.min(
2652                (crop_w >> xdec)
2653                  - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).x as usize,
2654              );
2655              let vis_height = unit_size.min(
2656                (crop_h >> ydec)
2657                  - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).y as usize,
2658              );
2659
2660              // todo: experiment with borrowing border pixels
2661              // rather than edge-extending. Right now this is
2662              // hard-clipping to the superblock boundary.
2663              setup_integral_image(
2664                &mut ts.integral_buffer,
2665                SOLVE_IMAGE_STRIDE,
2666                vis_width,
2667                vis_height,
2668                vis_width,
2669                vis_height,
2670                &lrf_in_plane.slice(lrf_po),
2671                &lrf_in_plane.slice(lrf_po),
2672              );
2673
2674              for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity)
2675              {
2676                let (xqd0, xqd1) = sgrproj_solve(
2677                  set,
2678                  fi,
2679                  &ts.integral_buffer,
2680                  &src_plane
2681                    .subregion(Area::StartingAt { x: lrf_po.x, y: lrf_po.y }),
2682                  &lrf_in_plane.slice(lrf_po),
2683                  vis_width,
2684                  vis_height,
2685                );
2686                let current_lrf =
2687                  RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] };
2688                if let RestorationFilter::Sgrproj { set, xqd } = current_lrf {
2689                  sgrproj_stripe_filter(
2690                    set,
2691                    xqd,
2692                    fi,
2693                    &ts.integral_buffer,
2694                    SOLVE_IMAGE_STRIDE,
2695                    &lrf_in_plane.slice(lrf_po),
2696                    &mut lrf_ref.planes[pli].region_mut(Area::Rect {
2697                      x: lrf_po.x,
2698                      y: lrf_po.y,
2699                      width: vis_width,
2700                      height: vis_height,
2701                    }),
2702                  );
2703                }
2704                let err = rdo_loop_plane_error(
2705                  base_sbo,
2706                  loop_sbo,
2707                  lru_sb_w,
2708                  lru_sb_h,
2709                  fi,
2710                  ts,
2711                  &tileblocks_subset.as_const(),
2712                  lrf_ref,
2713                  &src_subset,
2714                  pli,
2715                );
2716                let rate = cw.fc.count_lrf_switchable(
2717                  w,
2718                  &ts.restoration.as_const(),
2719                  current_lrf,
2720                  pli,
2721                );
2722                let cost = compute_rd_cost(fi, rate, err);
2723                if cost < best_cost {
2724                  best_cost = cost;
2725                  best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
2726                  best_new_lrf = current_lrf;
2727                }
2728              }
2729
2730              if best_lrf[lru_y * lru_w[pli] + lru_x][pli]
2731                .notequal(best_new_lrf)
2732              {
2733                best_lrf[lru_y * lru_w[pli] + lru_x][pli] = best_new_lrf;
2734                lrf_change = true;
2735                if let Some(ru) = ts.restoration.planes[pli]
2736                  .restoration_unit_mut(base_sbo + loop_sbo)
2737                {
2738                  ru.filter = best_new_lrf;
2739                }
2740              }
2741            }
2742          }
2743        }
2744      }
2745    }
2746  }
2747}
2748
/// Pins `estimate_rate(0, TxSize::TX_4X4, 0)` to `RDO_RATE_TABLE[0][0][0]`.
#[test]
fn estimate_rate_test() {
  assert_eq!(estimate_rate(0, TxSize::TX_4X4, 0), RDO_RATE_TABLE[0][0][0]);
}
// Source file: rav1e/rdo.rs