rav1e/transform/
inverse.rs

1// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved
2//
3// This source code is subject to the terms of the BSD 2 Clause License and
4// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5// was not distributed with this source code in the LICENSE file, you can
6// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7// Media Patent License 1.0 was not distributed with this source code in the
8// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
// Pick the inverse-transform implementation at compile time: hand-written
// x86-64 (nasm) or aarch64 NEON assembly when the corresponding cfg is set,
// otherwise the portable Rust fallback in `self::rust`.
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::transform::inverse::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::transform::inverse::*;
  } else {
    pub use self::rust::*;
  }
}
19
20use crate::tiling::PlaneRegionMut;
21use crate::util::*;
22
23// TODO: move 1d txfm code to rust module.
24
25use super::clamp_value;
26use super::consts::*;
27use super::get_1d_tx_types;
28use super::half_btf;
29use super::TxSize;
30use super::TxType;
31
/// 4-point inverse Walsh–Hadamard transform (lossless path).
///
/// # Panics
///
/// - If `input` or `output` have fewer than 4 items.
pub fn av1_iwht4(input: &[i32], output: &mut [i32], _range: usize) {
  assert!(input.len() >= 4);
  assert!(output.len() >= 4);

  // <https://aomediacodec.github.io/av1-spec/#inverse-walsh-hadamard-transform-process>
  let [a, b, c, d] = [input[0], input[1], input[2], input[3]];
  let sum = a + b;
  let diff = c - d;
  // Shared intermediate of the butterfly.
  let half = (sum - diff) >> 1;
  let t1 = half - d;
  let t2 = half - b;
  output[0] = sum - t1;
  output[1] = t1;
  output[2] = t2;
  output[3] = diff + t2;
}
54
// Cosine table in Q12 fixed point: COSPI_INV[i] == round(4096 * cos(i*pi/128)).
// (Entry 0 is 4096 == 1.0; entry 32 is 2896 == 1/sqrt(2).) Indexed by the
// butterfly angles in the `half_btf` calls below.
static COSPI_INV: [i32; 64] = [
  4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973, 3948,
  3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564, 3513, 3461,
  3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896, 2824, 2751, 2675,
  2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019, 1931, 1842, 1751, 1660,
  1567, 1474, 1380, 1285, 1189, 1092, 995, 897, 799, 700, 601, 501, 401, 301,
  201, 101,
];

// Sine constants for the 4-point ADST, Q12 fixed point.
// NOTE(review): values match the reference `sinpi` table at 12-bit precision
// used by `av1_iadst4` below — confirm against libaom's `sinpi_arr`.
static SINPI_INV: [i32; 5] = [0, 1321, 2482, 3344, 3803];

// Number of fractional bits in the fixed-point tables above; passed to every
// `half_btf` butterfly as its rounding shift.
const INV_COS_BIT: usize = 12;
67
68/// # Panics
69///
70/// - If `input` or `output` have fewer than 4 items.
71pub fn av1_idct4(input: &[i32], output: &mut [i32], range: usize) {
72  assert!(input.len() >= 4);
73  assert!(output.len() >= 4);
74
75  // stage 1
76  let stg1 = [input[0], input[2], input[1], input[3]];
77
78  // stage 2
79  let stg2 = [
80    half_btf(COSPI_INV[32], stg1[0], COSPI_INV[32], stg1[1], INV_COS_BIT),
81    half_btf(COSPI_INV[32], stg1[0], -COSPI_INV[32], stg1[1], INV_COS_BIT),
82    half_btf(COSPI_INV[48], stg1[2], -COSPI_INV[16], stg1[3], INV_COS_BIT),
83    half_btf(COSPI_INV[16], stg1[2], COSPI_INV[48], stg1[3], INV_COS_BIT),
84  ];
85
86  // stage 3
87  output[0] = clamp_value(stg2[0] + stg2[3], range);
88  output[1] = clamp_value(stg2[1] + stg2[2], range);
89  output[2] = clamp_value(stg2[1] - stg2[2], range);
90  output[3] = clamp_value(stg2[0] - stg2[3], range);
91}
92
93pub fn av1_iflipadst4(input: &[i32], output: &mut [i32], range: usize) {
94  av1_iadst4(input, output, range);
95  output[..4].reverse();
96}
97
98/// # Panics
99///
100/// - If `input` or `output` have fewer than 4 items.
101#[inline(always)]
102pub fn av1_iadst4(input: &[i32], output: &mut [i32], _range: usize) {
103  assert!(input.len() >= 4);
104  assert!(output.len() >= 4);
105
106  let bit = 12;
107
108  let x0 = input[0];
109  let x1 = input[1];
110  let x2 = input[2];
111  let x3 = input[3];
112
113  // stage 1
114  let s0 = SINPI_INV[1] * x0;
115  let s1 = SINPI_INV[2] * x0;
116  let s2 = SINPI_INV[3] * x1;
117  let s3 = SINPI_INV[4] * x2;
118  let s4 = SINPI_INV[1] * x2;
119  let s5 = SINPI_INV[2] * x3;
120  let s6 = SINPI_INV[4] * x3;
121
122  // stage 2
123  let s7 = (x0 - x2) + x3;
124
125  // stage 3
126  let s0 = s0 + s3;
127  let s1 = s1 - s4;
128  let s3 = s2;
129  let s2 = SINPI_INV[3] * s7;
130
131  // stage 4
132  let s0 = s0 + s5;
133  let s1 = s1 - s6;
134
135  // stage 5
136  let x0 = s0 + s3;
137  let x1 = s1 + s3;
138  let x2 = s2;
139  let x3 = s0 + s1;
140
141  // stage 6
142  let x3 = x3 - s3;
143
144  output[0] = round_shift(x0, bit);
145  output[1] = round_shift(x1, bit);
146  output[2] = round_shift(x2, bit);
147  output[3] = round_shift(x3, bit);
148}
149
150pub fn av1_iidentity4(input: &[i32], output: &mut [i32], _range: usize) {
151  output[..4]
152    .iter_mut()
153    .zip(input[..4].iter())
154    .for_each(|(outp, inp)| *outp = round_shift(SQRT2 * *inp, 12));
155}
156
/// 8-point inverse DCT.
///
/// The even-indexed inputs are handled by recursing into [`av1_idct4`]; the
/// odd-indexed inputs go through the butterfly stages below, and the two
/// halves are combined (add/subtract, mirrored) in the final stage.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 8 items.
pub fn av1_idct8(input: &[i32], output: &mut [i32], range: usize) {
  assert!(input.len() >= 8);
  assert!(output.len() >= 8);

  // call idct4 on the even-indexed coefficients (half-length DCT).
  let temp_in = [input[0], input[2], input[4], input[6]];
  let mut temp_out: [i32; 4] = [0; 4];
  av1_idct4(&temp_in, &mut temp_out, range);

  // stage 0

  // stage 1: permute the odd-indexed coefficients.
  let stg1 = [input[1], input[5], input[3], input[7]];

  // stage 2: rotation butterflies on the odd half.
  let stg2 = [
    half_btf(COSPI_INV[56], stg1[0], -COSPI_INV[8], stg1[3], INV_COS_BIT),
    half_btf(COSPI_INV[24], stg1[1], -COSPI_INV[40], stg1[2], INV_COS_BIT),
    half_btf(COSPI_INV[40], stg1[1], COSPI_INV[24], stg1[2], INV_COS_BIT),
    half_btf(COSPI_INV[8], stg1[0], COSPI_INV[56], stg1[3], INV_COS_BIT),
  ];

  // stage 3: add/subtract with clamping to the working bit range.
  let stg3 = [
    clamp_value(stg2[0] + stg2[1], range),
    clamp_value(stg2[0] - stg2[1], range),
    clamp_value(-stg2[2] + stg2[3], range),
    clamp_value(stg2[2] + stg2[3], range),
  ];

  // stage 4: middle pair rotated by pi/4 (cospi[32]); outer pair passes through.
  let stg4 = [
    stg3[0],
    half_btf(-COSPI_INV[32], stg3[1], COSPI_INV[32], stg3[2], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg3[1], COSPI_INV[32], stg3[2], INV_COS_BIT),
    stg3[3],
  ];

  // stage 5: combine the idct4 half with the odd half, mirrored.
  output[0] = clamp_value(temp_out[0] + stg4[3], range);
  output[1] = clamp_value(temp_out[1] + stg4[2], range);
  output[2] = clamp_value(temp_out[2] + stg4[1], range);
  output[3] = clamp_value(temp_out[3] + stg4[0], range);
  output[4] = clamp_value(temp_out[3] - stg4[0], range);
  output[5] = clamp_value(temp_out[2] - stg4[1], range);
  output[6] = clamp_value(temp_out[1] - stg4[2], range);
  output[7] = clamp_value(temp_out[0] - stg4[3], range);
}
208
209pub fn av1_iflipadst8(input: &[i32], output: &mut [i32], range: usize) {
210  av1_iadst8(input, output, range);
211  output[..8].reverse();
212}
213
/// 8-point inverse ADST, computed as a 7-stage butterfly lattice.
///
/// Each stage is either a set of `half_btf` rotations or a clamped
/// add/subtract layer; the final stage reorders the results and flips the
/// signs of alternating outputs.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 8 items.
#[inline(always)]
pub fn av1_iadst8(input: &[i32], output: &mut [i32], range: usize) {
  assert!(input.len() >= 8);
  assert!(output.len() >= 8);

  // stage 1: input permutation.
  let stg1 = [
    input[7], input[0], input[5], input[2], input[3], input[4], input[1],
    input[6],
  ];

  // stage 2: four rotation butterflies.
  let stg2 = [
    half_btf(COSPI_INV[4], stg1[0], COSPI_INV[60], stg1[1], INV_COS_BIT),
    half_btf(COSPI_INV[60], stg1[0], -COSPI_INV[4], stg1[1], INV_COS_BIT),
    half_btf(COSPI_INV[20], stg1[2], COSPI_INV[44], stg1[3], INV_COS_BIT),
    half_btf(COSPI_INV[44], stg1[2], -COSPI_INV[20], stg1[3], INV_COS_BIT),
    half_btf(COSPI_INV[36], stg1[4], COSPI_INV[28], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[28], stg1[4], -COSPI_INV[36], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[52], stg1[6], COSPI_INV[12], stg1[7], INV_COS_BIT),
    half_btf(COSPI_INV[12], stg1[6], -COSPI_INV[52], stg1[7], INV_COS_BIT),
  ];

  // stage 3: clamped add/subtract between the two halves.
  let stg3 = [
    clamp_value(stg2[0] + stg2[4], range),
    clamp_value(stg2[1] + stg2[5], range),
    clamp_value(stg2[2] + stg2[6], range),
    clamp_value(stg2[3] + stg2[7], range),
    clamp_value(stg2[0] - stg2[4], range),
    clamp_value(stg2[1] - stg2[5], range),
    clamp_value(stg2[2] - stg2[6], range),
    clamp_value(stg2[3] - stg2[7], range),
  ];

  // stage 4: rotations on the lower half; upper half passes through.
  let stg4 = [
    stg3[0],
    stg3[1],
    stg3[2],
    stg3[3],
    half_btf(COSPI_INV[16], stg3[4], COSPI_INV[48], stg3[5], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg3[4], -COSPI_INV[16], stg3[5], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg3[6], COSPI_INV[16], stg3[7], INV_COS_BIT),
    half_btf(COSPI_INV[16], stg3[6], COSPI_INV[48], stg3[7], INV_COS_BIT),
  ];

  // stage 5: clamped add/subtract within each half.
  let stg5 = [
    clamp_value(stg4[0] + stg4[2], range),
    clamp_value(stg4[1] + stg4[3], range),
    clamp_value(stg4[0] - stg4[2], range),
    clamp_value(stg4[1] - stg4[3], range),
    clamp_value(stg4[4] + stg4[6], range),
    clamp_value(stg4[5] + stg4[7], range),
    clamp_value(stg4[4] - stg4[6], range),
    clamp_value(stg4[5] - stg4[7], range),
  ];

  // stage 6: pi/4 rotations (cospi[32]) on alternating pairs.
  let stg6 = [
    stg5[0],
    stg5[1],
    half_btf(COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[3], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg5[2], -COSPI_INV[32], stg5[3], INV_COS_BIT),
    stg5[4],
    stg5[5],
    half_btf(COSPI_INV[32], stg5[6], COSPI_INV[32], stg5[7], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg5[6], -COSPI_INV[32], stg5[7], INV_COS_BIT),
  ];

  // stage 7: output permutation with sign flips on odd outputs.
  output[0] = stg6[0];
  output[1] = -stg6[4];
  output[2] = stg6[6];
  output[3] = -stg6[2];
  output[4] = stg6[3];
  output[5] = -stg6[7];
  output[6] = stg6[5];
  output[7] = -stg6[1];
}
298
/// 8-point inverse identity transform: scale each coefficient by 2.
pub fn av1_iidentity8(input: &[i32], output: &mut [i32], _range: usize) {
  for i in 0..8 {
    output[i] = input[i] * 2;
  }
}
305
/// 16-point inverse DCT.
///
/// The even-indexed inputs are handled by recursing into [`av1_idct8`]; the
/// odd-indexed inputs go through the butterfly stages below, and the two
/// halves are combined (add/subtract, mirrored) in the final stage.
fn av1_idct16(input: &[i32], output: &mut [i32], range: usize) {
  assert!(input.len() >= 16);
  assert!(output.len() >= 16);

  // call idct8 on the even-indexed coefficients (half-length DCT).
  let temp_in = [
    input[0], input[2], input[4], input[6], input[8], input[10], input[12],
    input[14],
  ];
  let mut temp_out: [i32; 8] = [0; 8];
  av1_idct8(&temp_in, &mut temp_out, range);

  // stage 1: permute the odd-indexed coefficients.
  let stg1 = [
    input[1], input[9], input[5], input[13], input[3], input[11], input[7],
    input[15],
  ];

  // stage 2: rotation butterflies on the odd half.
  let stg2 = [
    half_btf(COSPI_INV[60], stg1[0], -COSPI_INV[4], stg1[7], INV_COS_BIT),
    half_btf(COSPI_INV[28], stg1[1], -COSPI_INV[36], stg1[6], INV_COS_BIT),
    half_btf(COSPI_INV[44], stg1[2], -COSPI_INV[20], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[12], stg1[3], -COSPI_INV[52], stg1[4], INV_COS_BIT),
    half_btf(COSPI_INV[52], stg1[3], COSPI_INV[12], stg1[4], INV_COS_BIT),
    half_btf(COSPI_INV[20], stg1[2], COSPI_INV[44], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[36], stg1[1], COSPI_INV[28], stg1[6], INV_COS_BIT),
    half_btf(COSPI_INV[4], stg1[0], COSPI_INV[60], stg1[7], INV_COS_BIT),
  ];

  // stage 3: clamped add/subtract on adjacent pairs.
  let stg3 = [
    clamp_value(stg2[0] + stg2[1], range),
    clamp_value(stg2[0] - stg2[1], range),
    clamp_value(-stg2[2] + stg2[3], range),
    clamp_value(stg2[2] + stg2[3], range),
    clamp_value(stg2[4] + stg2[5], range),
    clamp_value(stg2[4] - stg2[5], range),
    clamp_value(-stg2[6] + stg2[7], range),
    clamp_value(stg2[6] + stg2[7], range),
  ];

  // stage 4: inner rotations; outer values pass through.
  let stg4 = [
    stg3[0],
    half_btf(-COSPI_INV[16], stg3[1], COSPI_INV[48], stg3[6], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg3[2], -COSPI_INV[16], stg3[5], INV_COS_BIT),
    stg3[3],
    stg3[4],
    half_btf(-COSPI_INV[16], stg3[2], COSPI_INV[48], stg3[5], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg3[1], COSPI_INV[16], stg3[6], INV_COS_BIT),
    stg3[7],
  ];

  // stage 5: clamped add/subtract, mirrored within each quad.
  let stg5 = [
    clamp_value(stg4[0] + stg4[3], range),
    clamp_value(stg4[1] + stg4[2], range),
    clamp_value(stg4[1] - stg4[2], range),
    clamp_value(stg4[0] - stg4[3], range),
    clamp_value(-stg4[4] + stg4[7], range),
    clamp_value(-stg4[5] + stg4[6], range),
    clamp_value(stg4[5] + stg4[6], range),
    clamp_value(stg4[4] + stg4[7], range),
  ];

  // stage 6: pi/4 rotations (cospi[32]) on the middle four values.
  let stg6 = [
    stg5[0],
    stg5[1],
    half_btf(-COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[5], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg5[3], COSPI_INV[32], stg5[4], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg5[3], COSPI_INV[32], stg5[4], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[5], INV_COS_BIT),
    stg5[6],
    stg5[7],
  ];

  // stage 7: combine the idct8 half with the odd half, mirrored.
  output[0] = clamp_value(temp_out[0] + stg6[7], range);
  output[1] = clamp_value(temp_out[1] + stg6[6], range);
  output[2] = clamp_value(temp_out[2] + stg6[5], range);
  output[3] = clamp_value(temp_out[3] + stg6[4], range);
  output[4] = clamp_value(temp_out[4] + stg6[3], range);
  output[5] = clamp_value(temp_out[5] + stg6[2], range);
  output[6] = clamp_value(temp_out[6] + stg6[1], range);
  output[7] = clamp_value(temp_out[7] + stg6[0], range);
  output[8] = clamp_value(temp_out[7] - stg6[0], range);
  output[9] = clamp_value(temp_out[6] - stg6[1], range);
  output[10] = clamp_value(temp_out[5] - stg6[2], range);
  output[11] = clamp_value(temp_out[4] - stg6[3], range);
  output[12] = clamp_value(temp_out[3] - stg6[4], range);
  output[13] = clamp_value(temp_out[2] - stg6[5], range);
  output[14] = clamp_value(temp_out[1] - stg6[6], range);
  output[15] = clamp_value(temp_out[0] - stg6[7], range);
}
402
403pub fn av1_iflipadst16(input: &[i32], output: &mut [i32], range: usize) {
404  av1_iadst16(input, output, range);
405  output[..16].reverse();
406}
407
/// 16-point inverse ADST, computed as a 9-stage butterfly lattice.
///
/// Each stage is either a set of `half_btf` rotations or a clamped
/// add/subtract layer; the final stage reorders the results and flips the
/// signs of alternating outputs.
#[inline(always)]
fn av1_iadst16(input: &[i32], output: &mut [i32], range: usize) {
  assert!(input.len() >= 16);
  assert!(output.len() >= 16);

  // stage 1: input permutation.
  let stg1 = [
    input[15], input[0], input[13], input[2], input[11], input[4], input[9],
    input[6], input[7], input[8], input[5], input[10], input[3], input[12],
    input[1], input[14],
  ];

  // stage 2: eight rotation butterflies.
  let stg2 = [
    half_btf(COSPI_INV[2], stg1[0], COSPI_INV[62], stg1[1], INV_COS_BIT),
    half_btf(COSPI_INV[62], stg1[0], -COSPI_INV[2], stg1[1], INV_COS_BIT),
    half_btf(COSPI_INV[10], stg1[2], COSPI_INV[54], stg1[3], INV_COS_BIT),
    half_btf(COSPI_INV[54], stg1[2], -COSPI_INV[10], stg1[3], INV_COS_BIT),
    half_btf(COSPI_INV[18], stg1[4], COSPI_INV[46], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[46], stg1[4], -COSPI_INV[18], stg1[5], INV_COS_BIT),
    half_btf(COSPI_INV[26], stg1[6], COSPI_INV[38], stg1[7], INV_COS_BIT),
    half_btf(COSPI_INV[38], stg1[6], -COSPI_INV[26], stg1[7], INV_COS_BIT),
    half_btf(COSPI_INV[34], stg1[8], COSPI_INV[30], stg1[9], INV_COS_BIT),
    half_btf(COSPI_INV[30], stg1[8], -COSPI_INV[34], stg1[9], INV_COS_BIT),
    half_btf(COSPI_INV[42], stg1[10], COSPI_INV[22], stg1[11], INV_COS_BIT),
    half_btf(COSPI_INV[22], stg1[10], -COSPI_INV[42], stg1[11], INV_COS_BIT),
    half_btf(COSPI_INV[50], stg1[12], COSPI_INV[14], stg1[13], INV_COS_BIT),
    half_btf(COSPI_INV[14], stg1[12], -COSPI_INV[50], stg1[13], INV_COS_BIT),
    half_btf(COSPI_INV[58], stg1[14], COSPI_INV[6], stg1[15], INV_COS_BIT),
    half_btf(COSPI_INV[6], stg1[14], -COSPI_INV[58], stg1[15], INV_COS_BIT),
  ];

  // stage 3: clamped add/subtract between the two halves.
  let stg3 = [
    clamp_value(stg2[0] + stg2[8], range),
    clamp_value(stg2[1] + stg2[9], range),
    clamp_value(stg2[2] + stg2[10], range),
    clamp_value(stg2[3] + stg2[11], range),
    clamp_value(stg2[4] + stg2[12], range),
    clamp_value(stg2[5] + stg2[13], range),
    clamp_value(stg2[6] + stg2[14], range),
    clamp_value(stg2[7] + stg2[15], range),
    clamp_value(stg2[0] - stg2[8], range),
    clamp_value(stg2[1] - stg2[9], range),
    clamp_value(stg2[2] - stg2[10], range),
    clamp_value(stg2[3] - stg2[11], range),
    clamp_value(stg2[4] - stg2[12], range),
    clamp_value(stg2[5] - stg2[13], range),
    clamp_value(stg2[6] - stg2[14], range),
    clamp_value(stg2[7] - stg2[15], range),
  ];

  // stage 4: rotations on the lower half; upper half passes through.
  let stg4 = [
    stg3[0],
    stg3[1],
    stg3[2],
    stg3[3],
    stg3[4],
    stg3[5],
    stg3[6],
    stg3[7],
    half_btf(COSPI_INV[8], stg3[8], COSPI_INV[56], stg3[9], INV_COS_BIT),
    half_btf(COSPI_INV[56], stg3[8], -COSPI_INV[8], stg3[9], INV_COS_BIT),
    half_btf(COSPI_INV[40], stg3[10], COSPI_INV[24], stg3[11], INV_COS_BIT),
    half_btf(COSPI_INV[24], stg3[10], -COSPI_INV[40], stg3[11], INV_COS_BIT),
    half_btf(-COSPI_INV[56], stg3[12], COSPI_INV[8], stg3[13], INV_COS_BIT),
    half_btf(COSPI_INV[8], stg3[12], COSPI_INV[56], stg3[13], INV_COS_BIT),
    half_btf(-COSPI_INV[24], stg3[14], COSPI_INV[40], stg3[15], INV_COS_BIT),
    half_btf(COSPI_INV[40], stg3[14], COSPI_INV[24], stg3[15], INV_COS_BIT),
  ];

  // stage 5: clamped add/subtract within each half.
  let stg5 = [
    clamp_value(stg4[0] + stg4[4], range),
    clamp_value(stg4[1] + stg4[5], range),
    clamp_value(stg4[2] + stg4[6], range),
    clamp_value(stg4[3] + stg4[7], range),
    clamp_value(stg4[0] - stg4[4], range),
    clamp_value(stg4[1] - stg4[5], range),
    clamp_value(stg4[2] - stg4[6], range),
    clamp_value(stg4[3] - stg4[7], range),
    clamp_value(stg4[8] + stg4[12], range),
    clamp_value(stg4[9] + stg4[13], range),
    clamp_value(stg4[10] + stg4[14], range),
    clamp_value(stg4[11] + stg4[15], range),
    clamp_value(stg4[8] - stg4[12], range),
    clamp_value(stg4[9] - stg4[13], range),
    clamp_value(stg4[10] - stg4[14], range),
    clamp_value(stg4[11] - stg4[15], range),
  ];

  // stage 6: rotations on the second and fourth quads.
  let stg6 = [
    stg5[0],
    stg5[1],
    stg5[2],
    stg5[3],
    half_btf(COSPI_INV[16], stg5[4], COSPI_INV[48], stg5[5], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg5[4], -COSPI_INV[16], stg5[5], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg5[6], COSPI_INV[16], stg5[7], INV_COS_BIT),
    half_btf(COSPI_INV[16], stg5[6], COSPI_INV[48], stg5[7], INV_COS_BIT),
    stg5[8],
    stg5[9],
    stg5[10],
    stg5[11],
    half_btf(COSPI_INV[16], stg5[12], COSPI_INV[48], stg5[13], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg5[12], -COSPI_INV[16], stg5[13], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg5[14], COSPI_INV[16], stg5[15], INV_COS_BIT),
    half_btf(COSPI_INV[16], stg5[14], COSPI_INV[48], stg5[15], INV_COS_BIT),
  ];

  // stage 7: clamped add/subtract within each quad.
  let stg7 = [
    clamp_value(stg6[0] + stg6[2], range),
    clamp_value(stg6[1] + stg6[3], range),
    clamp_value(stg6[0] - stg6[2], range),
    clamp_value(stg6[1] - stg6[3], range),
    clamp_value(stg6[4] + stg6[6], range),
    clamp_value(stg6[5] + stg6[7], range),
    clamp_value(stg6[4] - stg6[6], range),
    clamp_value(stg6[5] - stg6[7], range),
    clamp_value(stg6[8] + stg6[10], range),
    clamp_value(stg6[9] + stg6[11], range),
    clamp_value(stg6[8] - stg6[10], range),
    clamp_value(stg6[9] - stg6[11], range),
    clamp_value(stg6[12] + stg6[14], range),
    clamp_value(stg6[13] + stg6[15], range),
    clamp_value(stg6[12] - stg6[14], range),
    clamp_value(stg6[13] - stg6[15], range),
  ];

  // stage 8: pi/4 rotations (cospi[32]) on alternating pairs.
  let stg8 = [
    stg7[0],
    stg7[1],
    half_btf(COSPI_INV[32], stg7[2], COSPI_INV[32], stg7[3], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg7[2], -COSPI_INV[32], stg7[3], INV_COS_BIT),
    stg7[4],
    stg7[5],
    half_btf(COSPI_INV[32], stg7[6], COSPI_INV[32], stg7[7], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg7[6], -COSPI_INV[32], stg7[7], INV_COS_BIT),
    stg7[8],
    stg7[9],
    half_btf(COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[11], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg7[10], -COSPI_INV[32], stg7[11], INV_COS_BIT),
    stg7[12],
    stg7[13],
    half_btf(COSPI_INV[32], stg7[14], COSPI_INV[32], stg7[15], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg7[14], -COSPI_INV[32], stg7[15], INV_COS_BIT),
  ];

  // stage 9: output permutation with sign flips on odd outputs.
  output[0] = stg8[0];
  output[1] = -stg8[8];
  output[2] = stg8[12];
  output[3] = -stg8[4];
  output[4] = stg8[6];
  output[5] = -stg8[14];
  output[6] = stg8[10];
  output[7] = -stg8[2];
  output[8] = stg8[3];
  output[9] = -stg8[11];
  output[10] = stg8[15];
  output[11] = -stg8[7];
  output[12] = stg8[5];
  output[13] = -stg8[13];
  output[14] = stg8[9];
  output[15] = -stg8[1];
}
578
579fn av1_iidentity16(input: &[i32], output: &mut [i32], _range: usize) {
580  output[..16]
581    .iter_mut()
582    .zip(input[..16].iter())
583    .for_each(|(outp, inp)| *outp = round_shift(SQRT2 * 2 * *inp, 12));
584}
585
586fn av1_idct32(input: &[i32], output: &mut [i32], range: usize) {
587  assert!(input.len() >= 32);
588  assert!(output.len() >= 32);
589
590  // stage 1;
591  let stg1 = [
592    input[0], input[16], input[8], input[24], input[4], input[20], input[12],
593    input[28], input[2], input[18], input[10], input[26], input[6], input[22],
594    input[14], input[30], input[1], input[17], input[9], input[25], input[5],
595    input[21], input[13], input[29], input[3], input[19], input[11],
596    input[27], input[7], input[23], input[15], input[31],
597  ];
598
599  // stage 2
600  let stg2 = [
601    stg1[0],
602    stg1[1],
603    stg1[2],
604    stg1[3],
605    stg1[4],
606    stg1[5],
607    stg1[6],
608    stg1[7],
609    stg1[8],
610    stg1[9],
611    stg1[10],
612    stg1[11],
613    stg1[12],
614    stg1[13],
615    stg1[14],
616    stg1[15],
617    half_btf(COSPI_INV[62], stg1[16], -COSPI_INV[2], stg1[31], INV_COS_BIT),
618    half_btf(COSPI_INV[30], stg1[17], -COSPI_INV[34], stg1[30], INV_COS_BIT),
619    half_btf(COSPI_INV[46], stg1[18], -COSPI_INV[18], stg1[29], INV_COS_BIT),
620    half_btf(COSPI_INV[14], stg1[19], -COSPI_INV[50], stg1[28], INV_COS_BIT),
621    half_btf(COSPI_INV[54], stg1[20], -COSPI_INV[10], stg1[27], INV_COS_BIT),
622    half_btf(COSPI_INV[22], stg1[21], -COSPI_INV[42], stg1[26], INV_COS_BIT),
623    half_btf(COSPI_INV[38], stg1[22], -COSPI_INV[26], stg1[25], INV_COS_BIT),
624    half_btf(COSPI_INV[6], stg1[23], -COSPI_INV[58], stg1[24], INV_COS_BIT),
625    half_btf(COSPI_INV[58], stg1[23], COSPI_INV[6], stg1[24], INV_COS_BIT),
626    half_btf(COSPI_INV[26], stg1[22], COSPI_INV[38], stg1[25], INV_COS_BIT),
627    half_btf(COSPI_INV[42], stg1[21], COSPI_INV[22], stg1[26], INV_COS_BIT),
628    half_btf(COSPI_INV[10], stg1[20], COSPI_INV[54], stg1[27], INV_COS_BIT),
629    half_btf(COSPI_INV[50], stg1[19], COSPI_INV[14], stg1[28], INV_COS_BIT),
630    half_btf(COSPI_INV[18], stg1[18], COSPI_INV[46], stg1[29], INV_COS_BIT),
631    half_btf(COSPI_INV[34], stg1[17], COSPI_INV[30], stg1[30], INV_COS_BIT),
632    half_btf(COSPI_INV[2], stg1[16], COSPI_INV[62], stg1[31], INV_COS_BIT),
633  ];
634
635  // stage 3
636  let stg3 = [
637    stg2[0],
638    stg2[1],
639    stg2[2],
640    stg2[3],
641    stg2[4],
642    stg2[5],
643    stg2[6],
644    stg2[7],
645    half_btf(COSPI_INV[60], stg2[8], -COSPI_INV[4], stg2[15], INV_COS_BIT),
646    half_btf(COSPI_INV[28], stg2[9], -COSPI_INV[36], stg2[14], INV_COS_BIT),
647    half_btf(COSPI_INV[44], stg2[10], -COSPI_INV[20], stg2[13], INV_COS_BIT),
648    half_btf(COSPI_INV[12], stg2[11], -COSPI_INV[52], stg2[12], INV_COS_BIT),
649    half_btf(COSPI_INV[52], stg2[11], COSPI_INV[12], stg2[12], INV_COS_BIT),
650    half_btf(COSPI_INV[20], stg2[10], COSPI_INV[44], stg2[13], INV_COS_BIT),
651    half_btf(COSPI_INV[36], stg2[9], COSPI_INV[28], stg2[14], INV_COS_BIT),
652    half_btf(COSPI_INV[4], stg2[8], COSPI_INV[60], stg2[15], INV_COS_BIT),
653    clamp_value(stg2[16] + stg2[17], range),
654    clamp_value(stg2[16] - stg2[17], range),
655    clamp_value(-stg2[18] + stg2[19], range),
656    clamp_value(stg2[18] + stg2[19], range),
657    clamp_value(stg2[20] + stg2[21], range),
658    clamp_value(stg2[20] - stg2[21], range),
659    clamp_value(-stg2[22] + stg2[23], range),
660    clamp_value(stg2[22] + stg2[23], range),
661    clamp_value(stg2[24] + stg2[25], range),
662    clamp_value(stg2[24] - stg2[25], range),
663    clamp_value(-stg2[26] + stg2[27], range),
664    clamp_value(stg2[26] + stg2[27], range),
665    clamp_value(stg2[28] + stg2[29], range),
666    clamp_value(stg2[28] - stg2[29], range),
667    clamp_value(-stg2[30] + stg2[31], range),
668    clamp_value(stg2[30] + stg2[31], range),
669  ];
670
671  // stage 4
672  let stg4 = [
673    stg3[0],
674    stg3[1],
675    stg3[2],
676    stg3[3],
677    half_btf(COSPI_INV[56], stg3[4], -COSPI_INV[8], stg3[7], INV_COS_BIT),
678    half_btf(COSPI_INV[24], stg3[5], -COSPI_INV[40], stg3[6], INV_COS_BIT),
679    half_btf(COSPI_INV[40], stg3[5], COSPI_INV[24], stg3[6], INV_COS_BIT),
680    half_btf(COSPI_INV[8], stg3[4], COSPI_INV[56], stg3[7], INV_COS_BIT),
681    clamp_value(stg3[8] + stg3[9], range),
682    clamp_value(stg3[8] - stg3[9], range),
683    clamp_value(-stg3[10] + stg3[11], range),
684    clamp_value(stg3[10] + stg3[11], range),
685    clamp_value(stg3[12] + stg3[13], range),
686    clamp_value(stg3[12] - stg3[13], range),
687    clamp_value(-stg3[14] + stg3[15], range),
688    clamp_value(stg3[14] + stg3[15], range),
689    stg3[16],
690    half_btf(-COSPI_INV[8], stg3[17], COSPI_INV[56], stg3[30], INV_COS_BIT),
691    half_btf(-COSPI_INV[56], stg3[18], -COSPI_INV[8], stg3[29], INV_COS_BIT),
692    stg3[19],
693    stg3[20],
694    half_btf(-COSPI_INV[40], stg3[21], COSPI_INV[24], stg3[26], INV_COS_BIT),
695    half_btf(-COSPI_INV[24], stg3[22], -COSPI_INV[40], stg3[25], INV_COS_BIT),
696    stg3[23],
697    stg3[24],
698    half_btf(-COSPI_INV[40], stg3[22], COSPI_INV[24], stg3[25], INV_COS_BIT),
699    half_btf(COSPI_INV[24], stg3[21], COSPI_INV[40], stg3[26], INV_COS_BIT),
700    stg3[27],
701    stg3[28],
702    half_btf(-COSPI_INV[8], stg3[18], COSPI_INV[56], stg3[29], INV_COS_BIT),
703    half_btf(COSPI_INV[56], stg3[17], COSPI_INV[8], stg3[30], INV_COS_BIT),
704    stg3[31],
705  ];
706
707  // stage 5
708  let stg5 = [
709    half_btf(COSPI_INV[32], stg4[0], COSPI_INV[32], stg4[1], INV_COS_BIT),
710    half_btf(COSPI_INV[32], stg4[0], -COSPI_INV[32], stg4[1], INV_COS_BIT),
711    half_btf(COSPI_INV[48], stg4[2], -COSPI_INV[16], stg4[3], INV_COS_BIT),
712    half_btf(COSPI_INV[16], stg4[2], COSPI_INV[48], stg4[3], INV_COS_BIT),
713    clamp_value(stg4[4] + stg4[5], range),
714    clamp_value(stg4[4] - stg4[5], range),
715    clamp_value(-stg4[6] + stg4[7], range),
716    clamp_value(stg4[6] + stg4[7], range),
717    stg4[8],
718    half_btf(-COSPI_INV[16], stg4[9], COSPI_INV[48], stg4[14], INV_COS_BIT),
719    half_btf(-COSPI_INV[48], stg4[10], -COSPI_INV[16], stg4[13], INV_COS_BIT),
720    stg4[11],
721    stg4[12],
722    half_btf(-COSPI_INV[16], stg4[10], COSPI_INV[48], stg4[13], INV_COS_BIT),
723    half_btf(COSPI_INV[48], stg4[9], COSPI_INV[16], stg4[14], INV_COS_BIT),
724    stg4[15],
725    clamp_value(stg4[16] + stg4[19], range),
726    clamp_value(stg4[17] + stg4[18], range),
727    clamp_value(stg4[17] - stg4[18], range),
728    clamp_value(stg4[16] - stg4[19], range),
729    clamp_value(-stg4[20] + stg4[23], range),
730    clamp_value(-stg4[21] + stg4[22], range),
731    clamp_value(stg4[21] + stg4[22], range),
732    clamp_value(stg4[20] + stg4[23], range),
733    clamp_value(stg4[24] + stg4[27], range),
734    clamp_value(stg4[25] + stg4[26], range),
735    clamp_value(stg4[25] - stg4[26], range),
736    clamp_value(stg4[24] - stg4[27], range),
737    clamp_value(-stg4[28] + stg4[31], range),
738    clamp_value(-stg4[29] + stg4[30], range),
739    clamp_value(stg4[29] + stg4[30], range),
740    clamp_value(stg4[28] + stg4[31], range),
741  ];
742
743  // stage 6
744  let stg6 = [
745    clamp_value(stg5[0] + stg5[3], range),
746    clamp_value(stg5[1] + stg5[2], range),
747    clamp_value(stg5[1] - stg5[2], range),
748    clamp_value(stg5[0] - stg5[3], range),
749    stg5[4],
750    half_btf(-COSPI_INV[32], stg5[5], COSPI_INV[32], stg5[6], INV_COS_BIT),
751    half_btf(COSPI_INV[32], stg5[5], COSPI_INV[32], stg5[6], INV_COS_BIT),
752    stg5[7],
753    clamp_value(stg5[8] + stg5[11], range),
754    clamp_value(stg5[9] + stg5[10], range),
755    clamp_value(stg5[9] - stg5[10], range),
756    clamp_value(stg5[8] - stg5[11], range),
757    clamp_value(-stg5[12] + stg5[15], range),
758    clamp_value(-stg5[13] + stg5[14], range),
759    clamp_value(stg5[13] + stg5[14], range),
760    clamp_value(stg5[12] + stg5[15], range),
761    stg5[16],
762    stg5[17],
763    half_btf(-COSPI_INV[16], stg5[18], COSPI_INV[48], stg5[29], INV_COS_BIT),
764    half_btf(-COSPI_INV[16], stg5[19], COSPI_INV[48], stg5[28], INV_COS_BIT),
765    half_btf(-COSPI_INV[48], stg5[20], -COSPI_INV[16], stg5[27], INV_COS_BIT),
766    half_btf(-COSPI_INV[48], stg5[21], -COSPI_INV[16], stg5[26], INV_COS_BIT),
767    stg5[22],
768    stg5[23],
769    stg5[24],
770    stg5[25],
771    half_btf(-COSPI_INV[16], stg5[21], COSPI_INV[48], stg5[26], INV_COS_BIT),
772    half_btf(-COSPI_INV[16], stg5[20], COSPI_INV[48], stg5[27], INV_COS_BIT),
773    half_btf(COSPI_INV[48], stg5[19], COSPI_INV[16], stg5[28], INV_COS_BIT),
774    half_btf(COSPI_INV[48], stg5[18], COSPI_INV[16], stg5[29], INV_COS_BIT),
775    stg5[30],
776    stg5[31],
777  ];
778
779  // stage 7
780  let stg7 = [
781    clamp_value(stg6[0] + stg6[7], range),
782    clamp_value(stg6[1] + stg6[6], range),
783    clamp_value(stg6[2] + stg6[5], range),
784    clamp_value(stg6[3] + stg6[4], range),
785    clamp_value(stg6[3] - stg6[4], range),
786    clamp_value(stg6[2] - stg6[5], range),
787    clamp_value(stg6[1] - stg6[6], range),
788    clamp_value(stg6[0] - stg6[7], range),
789    stg6[8],
790    stg6[9],
791    half_btf(-COSPI_INV[32], stg6[10], COSPI_INV[32], stg6[13], INV_COS_BIT),
792    half_btf(-COSPI_INV[32], stg6[11], COSPI_INV[32], stg6[12], INV_COS_BIT),
793    half_btf(COSPI_INV[32], stg6[11], COSPI_INV[32], stg6[12], INV_COS_BIT),
794    half_btf(COSPI_INV[32], stg6[10], COSPI_INV[32], stg6[13], INV_COS_BIT),
795    stg6[14],
796    stg6[15],
797    clamp_value(stg6[16] + stg6[23], range),
798    clamp_value(stg6[17] + stg6[22], range),
799    clamp_value(stg6[18] + stg6[21], range),
800    clamp_value(stg6[19] + stg6[20], range),
801    clamp_value(stg6[19] - stg6[20], range),
802    clamp_value(stg6[18] - stg6[21], range),
803    clamp_value(stg6[17] - stg6[22], range),
804    clamp_value(stg6[16] - stg6[23], range),
805    clamp_value(-stg6[24] + stg6[31], range),
806    clamp_value(-stg6[25] + stg6[30], range),
807    clamp_value(-stg6[26] + stg6[29], range),
808    clamp_value(-stg6[27] + stg6[28], range),
809    clamp_value(stg6[27] + stg6[28], range),
810    clamp_value(stg6[26] + stg6[29], range),
811    clamp_value(stg6[25] + stg6[30], range),
812    clamp_value(stg6[24] + stg6[31], range),
813  ];
814
815  // stage 8
816  let stg8 = [
817    clamp_value(stg7[0] + stg7[15], range),
818    clamp_value(stg7[1] + stg7[14], range),
819    clamp_value(stg7[2] + stg7[13], range),
820    clamp_value(stg7[3] + stg7[12], range),
821    clamp_value(stg7[4] + stg7[11], range),
822    clamp_value(stg7[5] + stg7[10], range),
823    clamp_value(stg7[6] + stg7[9], range),
824    clamp_value(stg7[7] + stg7[8], range),
825    clamp_value(stg7[7] - stg7[8], range),
826    clamp_value(stg7[6] - stg7[9], range),
827    clamp_value(stg7[5] - stg7[10], range),
828    clamp_value(stg7[4] - stg7[11], range),
829    clamp_value(stg7[3] - stg7[12], range),
830    clamp_value(stg7[2] - stg7[13], range),
831    clamp_value(stg7[1] - stg7[14], range),
832    clamp_value(stg7[0] - stg7[15], range),
833    stg7[16],
834    stg7[17],
835    stg7[18],
836    stg7[19],
837    half_btf(-COSPI_INV[32], stg7[20], COSPI_INV[32], stg7[27], INV_COS_BIT),
838    half_btf(-COSPI_INV[32], stg7[21], COSPI_INV[32], stg7[26], INV_COS_BIT),
839    half_btf(-COSPI_INV[32], stg7[22], COSPI_INV[32], stg7[25], INV_COS_BIT),
840    half_btf(-COSPI_INV[32], stg7[23], COSPI_INV[32], stg7[24], INV_COS_BIT),
841    half_btf(COSPI_INV[32], stg7[23], COSPI_INV[32], stg7[24], INV_COS_BIT),
842    half_btf(COSPI_INV[32], stg7[22], COSPI_INV[32], stg7[25], INV_COS_BIT),
843    half_btf(COSPI_INV[32], stg7[21], COSPI_INV[32], stg7[26], INV_COS_BIT),
844    half_btf(COSPI_INV[32], stg7[20], COSPI_INV[32], stg7[27], INV_COS_BIT),
845    stg7[28],
846    stg7[29],
847    stg7[30],
848    stg7[31],
849  ];
850
851  // stage 9
852  output[0] = clamp_value(stg8[0] + stg8[31], range);
853  output[1] = clamp_value(stg8[1] + stg8[30], range);
854  output[2] = clamp_value(stg8[2] + stg8[29], range);
855  output[3] = clamp_value(stg8[3] + stg8[28], range);
856  output[4] = clamp_value(stg8[4] + stg8[27], range);
857  output[5] = clamp_value(stg8[5] + stg8[26], range);
858  output[6] = clamp_value(stg8[6] + stg8[25], range);
859  output[7] = clamp_value(stg8[7] + stg8[24], range);
860  output[8] = clamp_value(stg8[8] + stg8[23], range);
861  output[9] = clamp_value(stg8[9] + stg8[22], range);
862  output[10] = clamp_value(stg8[10] + stg8[21], range);
863  output[11] = clamp_value(stg8[11] + stg8[20], range);
864  output[12] = clamp_value(stg8[12] + stg8[19], range);
865  output[13] = clamp_value(stg8[13] + stg8[18], range);
866  output[14] = clamp_value(stg8[14] + stg8[17], range);
867  output[15] = clamp_value(stg8[15] + stg8[16], range);
868  output[16] = clamp_value(stg8[15] - stg8[16], range);
869  output[17] = clamp_value(stg8[14] - stg8[17], range);
870  output[18] = clamp_value(stg8[13] - stg8[18], range);
871  output[19] = clamp_value(stg8[12] - stg8[19], range);
872  output[20] = clamp_value(stg8[11] - stg8[20], range);
873  output[21] = clamp_value(stg8[10] - stg8[21], range);
874  output[22] = clamp_value(stg8[9] - stg8[22], range);
875  output[23] = clamp_value(stg8[8] - stg8[23], range);
876  output[24] = clamp_value(stg8[7] - stg8[24], range);
877  output[25] = clamp_value(stg8[6] - stg8[25], range);
878  output[26] = clamp_value(stg8[5] - stg8[26], range);
879  output[27] = clamp_value(stg8[4] - stg8[27], range);
880  output[28] = clamp_value(stg8[3] - stg8[28], range);
881  output[29] = clamp_value(stg8[2] - stg8[29], range);
882  output[30] = clamp_value(stg8[1] - stg8[30], range);
883  output[31] = clamp_value(stg8[0] - stg8[31], range);
884}
885
/// 32-point inverse identity transform: each of the first 32 input
/// coefficients is scaled by 4 and written to `output`. The `_range`
/// argument is unused; it exists only so this matches the common 1-D
/// transform signature used by the dispatch tables.
///
/// Panics if `input` or `output` has fewer than 32 elements.
fn av1_iidentity32(input: &[i32], output: &mut [i32], _range: usize) {
  // Slice up front so the original bounds checks are preserved: this
  // panics when either buffer holds fewer than 32 values.
  let src = &input[..32];
  let dst = &mut output[..32];
  for (d, &s) in dst.iter_mut().zip(src.iter()) {
    // Keep the multiply (rather than a shift) so overflow semantics in
    // debug builds are identical to the original.
    *d = 4 * s;
  }
}
892
893fn av1_idct64(input: &[i32], output: &mut [i32], range: usize) {
894  assert!(input.len() >= 64);
895  assert!(output.len() >= 64);
896
897  // stage 1;
898  let stg1 = [
899    input[0], input[32], input[16], input[48], input[8], input[40], input[24],
900    input[56], input[4], input[36], input[20], input[52], input[12],
901    input[44], input[28], input[60], input[2], input[34], input[18],
902    input[50], input[10], input[42], input[26], input[58], input[6],
903    input[38], input[22], input[54], input[14], input[46], input[30],
904    input[62], input[1], input[33], input[17], input[49], input[9], input[41],
905    input[25], input[57], input[5], input[37], input[21], input[53],
906    input[13], input[45], input[29], input[61], input[3], input[35],
907    input[19], input[51], input[11], input[43], input[27], input[59],
908    input[7], input[39], input[23], input[55], input[15], input[47],
909    input[31], input[63],
910  ];
911
912  // stage 2
913  let stg2 = [
914    stg1[0],
915    stg1[1],
916    stg1[2],
917    stg1[3],
918    stg1[4],
919    stg1[5],
920    stg1[6],
921    stg1[7],
922    stg1[8],
923    stg1[9],
924    stg1[10],
925    stg1[11],
926    stg1[12],
927    stg1[13],
928    stg1[14],
929    stg1[15],
930    stg1[16],
931    stg1[17],
932    stg1[18],
933    stg1[19],
934    stg1[20],
935    stg1[21],
936    stg1[22],
937    stg1[23],
938    stg1[24],
939    stg1[25],
940    stg1[26],
941    stg1[27],
942    stg1[28],
943    stg1[29],
944    stg1[30],
945    stg1[31],
946    half_btf(COSPI_INV[63], stg1[32], -COSPI_INV[1], stg1[63], INV_COS_BIT),
947    half_btf(COSPI_INV[31], stg1[33], -COSPI_INV[33], stg1[62], INV_COS_BIT),
948    half_btf(COSPI_INV[47], stg1[34], -COSPI_INV[17], stg1[61], INV_COS_BIT),
949    half_btf(COSPI_INV[15], stg1[35], -COSPI_INV[49], stg1[60], INV_COS_BIT),
950    half_btf(COSPI_INV[55], stg1[36], -COSPI_INV[9], stg1[59], INV_COS_BIT),
951    half_btf(COSPI_INV[23], stg1[37], -COSPI_INV[41], stg1[58], INV_COS_BIT),
952    half_btf(COSPI_INV[39], stg1[38], -COSPI_INV[25], stg1[57], INV_COS_BIT),
953    half_btf(COSPI_INV[7], stg1[39], -COSPI_INV[57], stg1[56], INV_COS_BIT),
954    half_btf(COSPI_INV[59], stg1[40], -COSPI_INV[5], stg1[55], INV_COS_BIT),
955    half_btf(COSPI_INV[27], stg1[41], -COSPI_INV[37], stg1[54], INV_COS_BIT),
956    half_btf(COSPI_INV[43], stg1[42], -COSPI_INV[21], stg1[53], INV_COS_BIT),
957    half_btf(COSPI_INV[11], stg1[43], -COSPI_INV[53], stg1[52], INV_COS_BIT),
958    half_btf(COSPI_INV[51], stg1[44], -COSPI_INV[13], stg1[51], INV_COS_BIT),
959    half_btf(COSPI_INV[19], stg1[45], -COSPI_INV[45], stg1[50], INV_COS_BIT),
960    half_btf(COSPI_INV[35], stg1[46], -COSPI_INV[29], stg1[49], INV_COS_BIT),
961    half_btf(COSPI_INV[3], stg1[47], -COSPI_INV[61], stg1[48], INV_COS_BIT),
962    half_btf(COSPI_INV[61], stg1[47], COSPI_INV[3], stg1[48], INV_COS_BIT),
963    half_btf(COSPI_INV[29], stg1[46], COSPI_INV[35], stg1[49], INV_COS_BIT),
964    half_btf(COSPI_INV[45], stg1[45], COSPI_INV[19], stg1[50], INV_COS_BIT),
965    half_btf(COSPI_INV[13], stg1[44], COSPI_INV[51], stg1[51], INV_COS_BIT),
966    half_btf(COSPI_INV[53], stg1[43], COSPI_INV[11], stg1[52], INV_COS_BIT),
967    half_btf(COSPI_INV[21], stg1[42], COSPI_INV[43], stg1[53], INV_COS_BIT),
968    half_btf(COSPI_INV[37], stg1[41], COSPI_INV[27], stg1[54], INV_COS_BIT),
969    half_btf(COSPI_INV[5], stg1[40], COSPI_INV[59], stg1[55], INV_COS_BIT),
970    half_btf(COSPI_INV[57], stg1[39], COSPI_INV[7], stg1[56], INV_COS_BIT),
971    half_btf(COSPI_INV[25], stg1[38], COSPI_INV[39], stg1[57], INV_COS_BIT),
972    half_btf(COSPI_INV[41], stg1[37], COSPI_INV[23], stg1[58], INV_COS_BIT),
973    half_btf(COSPI_INV[9], stg1[36], COSPI_INV[55], stg1[59], INV_COS_BIT),
974    half_btf(COSPI_INV[49], stg1[35], COSPI_INV[15], stg1[60], INV_COS_BIT),
975    half_btf(COSPI_INV[17], stg1[34], COSPI_INV[47], stg1[61], INV_COS_BIT),
976    half_btf(COSPI_INV[33], stg1[33], COSPI_INV[31], stg1[62], INV_COS_BIT),
977    half_btf(COSPI_INV[1], stg1[32], COSPI_INV[63], stg1[63], INV_COS_BIT),
978  ];
979
980  // stage 3
981  let stg3 = [
982    stg2[0],
983    stg2[1],
984    stg2[2],
985    stg2[3],
986    stg2[4],
987    stg2[5],
988    stg2[6],
989    stg2[7],
990    stg2[8],
991    stg2[9],
992    stg2[10],
993    stg2[11],
994    stg2[12],
995    stg2[13],
996    stg2[14],
997    stg2[15],
998    half_btf(COSPI_INV[62], stg2[16], -COSPI_INV[2], stg2[31], INV_COS_BIT),
999    half_btf(COSPI_INV[30], stg2[17], -COSPI_INV[34], stg2[30], INV_COS_BIT),
1000    half_btf(COSPI_INV[46], stg2[18], -COSPI_INV[18], stg2[29], INV_COS_BIT),
1001    half_btf(COSPI_INV[14], stg2[19], -COSPI_INV[50], stg2[28], INV_COS_BIT),
1002    half_btf(COSPI_INV[54], stg2[20], -COSPI_INV[10], stg2[27], INV_COS_BIT),
1003    half_btf(COSPI_INV[22], stg2[21], -COSPI_INV[42], stg2[26], INV_COS_BIT),
1004    half_btf(COSPI_INV[38], stg2[22], -COSPI_INV[26], stg2[25], INV_COS_BIT),
1005    half_btf(COSPI_INV[6], stg2[23], -COSPI_INV[58], stg2[24], INV_COS_BIT),
1006    half_btf(COSPI_INV[58], stg2[23], COSPI_INV[6], stg2[24], INV_COS_BIT),
1007    half_btf(COSPI_INV[26], stg2[22], COSPI_INV[38], stg2[25], INV_COS_BIT),
1008    half_btf(COSPI_INV[42], stg2[21], COSPI_INV[22], stg2[26], INV_COS_BIT),
1009    half_btf(COSPI_INV[10], stg2[20], COSPI_INV[54], stg2[27], INV_COS_BIT),
1010    half_btf(COSPI_INV[50], stg2[19], COSPI_INV[14], stg2[28], INV_COS_BIT),
1011    half_btf(COSPI_INV[18], stg2[18], COSPI_INV[46], stg2[29], INV_COS_BIT),
1012    half_btf(COSPI_INV[34], stg2[17], COSPI_INV[30], stg2[30], INV_COS_BIT),
1013    half_btf(COSPI_INV[2], stg2[16], COSPI_INV[62], stg2[31], INV_COS_BIT),
1014    clamp_value(stg2[32] + stg2[33], range),
1015    clamp_value(stg2[32] - stg2[33], range),
1016    clamp_value(-stg2[34] + stg2[35], range),
1017    clamp_value(stg2[34] + stg2[35], range),
1018    clamp_value(stg2[36] + stg2[37], range),
1019    clamp_value(stg2[36] - stg2[37], range),
1020    clamp_value(-stg2[38] + stg2[39], range),
1021    clamp_value(stg2[38] + stg2[39], range),
1022    clamp_value(stg2[40] + stg2[41], range),
1023    clamp_value(stg2[40] - stg2[41], range),
1024    clamp_value(-stg2[42] + stg2[43], range),
1025    clamp_value(stg2[42] + stg2[43], range),
1026    clamp_value(stg2[44] + stg2[45], range),
1027    clamp_value(stg2[44] - stg2[45], range),
1028    clamp_value(-stg2[46] + stg2[47], range),
1029    clamp_value(stg2[46] + stg2[47], range),
1030    clamp_value(stg2[48] + stg2[49], range),
1031    clamp_value(stg2[48] - stg2[49], range),
1032    clamp_value(-stg2[50] + stg2[51], range),
1033    clamp_value(stg2[50] + stg2[51], range),
1034    clamp_value(stg2[52] + stg2[53], range),
1035    clamp_value(stg2[52] - stg2[53], range),
1036    clamp_value(-stg2[54] + stg2[55], range),
1037    clamp_value(stg2[54] + stg2[55], range),
1038    clamp_value(stg2[56] + stg2[57], range),
1039    clamp_value(stg2[56] - stg2[57], range),
1040    clamp_value(-stg2[58] + stg2[59], range),
1041    clamp_value(stg2[58] + stg2[59], range),
1042    clamp_value(stg2[60] + stg2[61], range),
1043    clamp_value(stg2[60] - stg2[61], range),
1044    clamp_value(-stg2[62] + stg2[63], range),
1045    clamp_value(stg2[62] + stg2[63], range),
1046  ];
1047
1048  // stage 4
1049  let stg4 = [
1050    stg3[0],
1051    stg3[1],
1052    stg3[2],
1053    stg3[3],
1054    stg3[4],
1055    stg3[5],
1056    stg3[6],
1057    stg3[7],
1058    half_btf(COSPI_INV[60], stg3[8], -COSPI_INV[4], stg3[15], INV_COS_BIT),
1059    half_btf(COSPI_INV[28], stg3[9], -COSPI_INV[36], stg3[14], INV_COS_BIT),
1060    half_btf(COSPI_INV[44], stg3[10], -COSPI_INV[20], stg3[13], INV_COS_BIT),
1061    half_btf(COSPI_INV[12], stg3[11], -COSPI_INV[52], stg3[12], INV_COS_BIT),
1062    half_btf(COSPI_INV[52], stg3[11], COSPI_INV[12], stg3[12], INV_COS_BIT),
1063    half_btf(COSPI_INV[20], stg3[10], COSPI_INV[44], stg3[13], INV_COS_BIT),
1064    half_btf(COSPI_INV[36], stg3[9], COSPI_INV[28], stg3[14], INV_COS_BIT),
1065    half_btf(COSPI_INV[4], stg3[8], COSPI_INV[60], stg3[15], INV_COS_BIT),
1066    clamp_value(stg3[16] + stg3[17], range),
1067    clamp_value(stg3[16] - stg3[17], range),
1068    clamp_value(-stg3[18] + stg3[19], range),
1069    clamp_value(stg3[18] + stg3[19], range),
1070    clamp_value(stg3[20] + stg3[21], range),
1071    clamp_value(stg3[20] - stg3[21], range),
1072    clamp_value(-stg3[22] + stg3[23], range),
1073    clamp_value(stg3[22] + stg3[23], range),
1074    clamp_value(stg3[24] + stg3[25], range),
1075    clamp_value(stg3[24] - stg3[25], range),
1076    clamp_value(-stg3[26] + stg3[27], range),
1077    clamp_value(stg3[26] + stg3[27], range),
1078    clamp_value(stg3[28] + stg3[29], range),
1079    clamp_value(stg3[28] - stg3[29], range),
1080    clamp_value(-stg3[30] + stg3[31], range),
1081    clamp_value(stg3[30] + stg3[31], range),
1082    stg3[32],
1083    half_btf(-COSPI_INV[4], stg3[33], COSPI_INV[60], stg3[62], INV_COS_BIT),
1084    half_btf(-COSPI_INV[60], stg3[34], -COSPI_INV[4], stg3[61], INV_COS_BIT),
1085    stg3[35],
1086    stg3[36],
1087    half_btf(-COSPI_INV[36], stg3[37], COSPI_INV[28], stg3[58], INV_COS_BIT),
1088    half_btf(-COSPI_INV[28], stg3[38], -COSPI_INV[36], stg3[57], INV_COS_BIT),
1089    stg3[39],
1090    stg3[40],
1091    half_btf(-COSPI_INV[20], stg3[41], COSPI_INV[44], stg3[54], INV_COS_BIT),
1092    half_btf(-COSPI_INV[44], stg3[42], -COSPI_INV[20], stg3[53], INV_COS_BIT),
1093    stg3[43],
1094    stg3[44],
1095    half_btf(-COSPI_INV[52], stg3[45], COSPI_INV[12], stg3[50], INV_COS_BIT),
1096    half_btf(-COSPI_INV[12], stg3[46], -COSPI_INV[52], stg3[49], INV_COS_BIT),
1097    stg3[47],
1098    stg3[48],
1099    half_btf(-COSPI_INV[52], stg3[46], COSPI_INV[12], stg3[49], INV_COS_BIT),
1100    half_btf(COSPI_INV[12], stg3[45], COSPI_INV[52], stg3[50], INV_COS_BIT),
1101    stg3[51],
1102    stg3[52],
1103    half_btf(-COSPI_INV[20], stg3[42], COSPI_INV[44], stg3[53], INV_COS_BIT),
1104    half_btf(COSPI_INV[44], stg3[41], COSPI_INV[20], stg3[54], INV_COS_BIT),
1105    stg3[55],
1106    stg3[56],
1107    half_btf(-COSPI_INV[36], stg3[38], COSPI_INV[28], stg3[57], INV_COS_BIT),
1108    half_btf(COSPI_INV[28], stg3[37], COSPI_INV[36], stg3[58], INV_COS_BIT),
1109    stg3[59],
1110    stg3[60],
1111    half_btf(-COSPI_INV[4], stg3[34], COSPI_INV[60], stg3[61], INV_COS_BIT),
1112    half_btf(COSPI_INV[60], stg3[33], COSPI_INV[4], stg3[62], INV_COS_BIT),
1113    stg3[63],
1114  ];
1115
1116  // stage 5
1117  let stg5 = [
1118    stg4[0],
1119    stg4[1],
1120    stg4[2],
1121    stg4[3],
1122    half_btf(COSPI_INV[56], stg4[4], -COSPI_INV[8], stg4[7], INV_COS_BIT),
1123    half_btf(COSPI_INV[24], stg4[5], -COSPI_INV[40], stg4[6], INV_COS_BIT),
1124    half_btf(COSPI_INV[40], stg4[5], COSPI_INV[24], stg4[6], INV_COS_BIT),
1125    half_btf(COSPI_INV[8], stg4[4], COSPI_INV[56], stg4[7], INV_COS_BIT),
1126    clamp_value(stg4[8] + stg4[9], range),
1127    clamp_value(stg4[8] - stg4[9], range),
1128    clamp_value(-stg4[10] + stg4[11], range),
1129    clamp_value(stg4[10] + stg4[11], range),
1130    clamp_value(stg4[12] + stg4[13], range),
1131    clamp_value(stg4[12] - stg4[13], range),
1132    clamp_value(-stg4[14] + stg4[15], range),
1133    clamp_value(stg4[14] + stg4[15], range),
1134    stg4[16],
1135    half_btf(-COSPI_INV[8], stg4[17], COSPI_INV[56], stg4[30], INV_COS_BIT),
1136    half_btf(-COSPI_INV[56], stg4[18], -COSPI_INV[8], stg4[29], INV_COS_BIT),
1137    stg4[19],
1138    stg4[20],
1139    half_btf(-COSPI_INV[40], stg4[21], COSPI_INV[24], stg4[26], INV_COS_BIT),
1140    half_btf(-COSPI_INV[24], stg4[22], -COSPI_INV[40], stg4[25], INV_COS_BIT),
1141    stg4[23],
1142    stg4[24],
1143    half_btf(-COSPI_INV[40], stg4[22], COSPI_INV[24], stg4[25], INV_COS_BIT),
1144    half_btf(COSPI_INV[24], stg4[21], COSPI_INV[40], stg4[26], INV_COS_BIT),
1145    stg4[27],
1146    stg4[28],
1147    half_btf(-COSPI_INV[8], stg4[18], COSPI_INV[56], stg4[29], INV_COS_BIT),
1148    half_btf(COSPI_INV[56], stg4[17], COSPI_INV[8], stg4[30], INV_COS_BIT),
1149    stg4[31],
1150    clamp_value(stg4[32] + stg4[35], range),
1151    clamp_value(stg4[33] + stg4[34], range),
1152    clamp_value(stg4[33] - stg4[34], range),
1153    clamp_value(stg4[32] - stg4[35], range),
1154    clamp_value(-stg4[36] + stg4[39], range),
1155    clamp_value(-stg4[37] + stg4[38], range),
1156    clamp_value(stg4[37] + stg4[38], range),
1157    clamp_value(stg4[36] + stg4[39], range),
1158    clamp_value(stg4[40] + stg4[43], range),
1159    clamp_value(stg4[41] + stg4[42], range),
1160    clamp_value(stg4[41] - stg4[42], range),
1161    clamp_value(stg4[40] - stg4[43], range),
1162    clamp_value(-stg4[44] + stg4[47], range),
1163    clamp_value(-stg4[45] + stg4[46], range),
1164    clamp_value(stg4[45] + stg4[46], range),
1165    clamp_value(stg4[44] + stg4[47], range),
1166    clamp_value(stg4[48] + stg4[51], range),
1167    clamp_value(stg4[49] + stg4[50], range),
1168    clamp_value(stg4[49] - stg4[50], range),
1169    clamp_value(stg4[48] - stg4[51], range),
1170    clamp_value(-stg4[52] + stg4[55], range),
1171    clamp_value(-stg4[53] + stg4[54], range),
1172    clamp_value(stg4[53] + stg4[54], range),
1173    clamp_value(stg4[52] + stg4[55], range),
1174    clamp_value(stg4[56] + stg4[59], range),
1175    clamp_value(stg4[57] + stg4[58], range),
1176    clamp_value(stg4[57] - stg4[58], range),
1177    clamp_value(stg4[56] - stg4[59], range),
1178    clamp_value(-stg4[60] + stg4[63], range),
1179    clamp_value(-stg4[61] + stg4[62], range),
1180    clamp_value(stg4[61] + stg4[62], range),
1181    clamp_value(stg4[60] + stg4[63], range),
1182  ];
1183
1184  // stage 6
1185  let stg6 = [
1186    half_btf(COSPI_INV[32], stg5[0], COSPI_INV[32], stg5[1], INV_COS_BIT),
1187    half_btf(COSPI_INV[32], stg5[0], -COSPI_INV[32], stg5[1], INV_COS_BIT),
1188    half_btf(COSPI_INV[48], stg5[2], -COSPI_INV[16], stg5[3], INV_COS_BIT),
1189    half_btf(COSPI_INV[16], stg5[2], COSPI_INV[48], stg5[3], INV_COS_BIT),
1190    clamp_value(stg5[4] + stg5[5], range),
1191    clamp_value(stg5[4] - stg5[5], range),
1192    clamp_value(-stg5[6] + stg5[7], range),
1193    clamp_value(stg5[6] + stg5[7], range),
1194    stg5[8],
1195    half_btf(-COSPI_INV[16], stg5[9], COSPI_INV[48], stg5[14], INV_COS_BIT),
1196    half_btf(-COSPI_INV[48], stg5[10], -COSPI_INV[16], stg5[13], INV_COS_BIT),
1197    stg5[11],
1198    stg5[12],
1199    half_btf(-COSPI_INV[16], stg5[10], COSPI_INV[48], stg5[13], INV_COS_BIT),
1200    half_btf(COSPI_INV[48], stg5[9], COSPI_INV[16], stg5[14], INV_COS_BIT),
1201    stg5[15],
1202    clamp_value(stg5[16] + stg5[19], range),
1203    clamp_value(stg5[17] + stg5[18], range),
1204    clamp_value(stg5[17] - stg5[18], range),
1205    clamp_value(stg5[16] - stg5[19], range),
1206    clamp_value(-stg5[20] + stg5[23], range),
1207    clamp_value(-stg5[21] + stg5[22], range),
1208    clamp_value(stg5[21] + stg5[22], range),
1209    clamp_value(stg5[20] + stg5[23], range),
1210    clamp_value(stg5[24] + stg5[27], range),
1211    clamp_value(stg5[25] + stg5[26], range),
1212    clamp_value(stg5[25] - stg5[26], range),
1213    clamp_value(stg5[24] - stg5[27], range),
1214    clamp_value(-stg5[28] + stg5[31], range),
1215    clamp_value(-stg5[29] + stg5[30], range),
1216    clamp_value(stg5[29] + stg5[30], range),
1217    clamp_value(stg5[28] + stg5[31], range),
1218    stg5[32],
1219    stg5[33],
1220    half_btf(-COSPI_INV[8], stg5[34], COSPI_INV[56], stg5[61], INV_COS_BIT),
1221    half_btf(-COSPI_INV[8], stg5[35], COSPI_INV[56], stg5[60], INV_COS_BIT),
1222    half_btf(-COSPI_INV[56], stg5[36], -COSPI_INV[8], stg5[59], INV_COS_BIT),
1223    half_btf(-COSPI_INV[56], stg5[37], -COSPI_INV[8], stg5[58], INV_COS_BIT),
1224    stg5[38],
1225    stg5[39],
1226    stg5[40],
1227    stg5[41],
1228    half_btf(-COSPI_INV[40], stg5[42], COSPI_INV[24], stg5[53], INV_COS_BIT),
1229    half_btf(-COSPI_INV[40], stg5[43], COSPI_INV[24], stg5[52], INV_COS_BIT),
1230    half_btf(-COSPI_INV[24], stg5[44], -COSPI_INV[40], stg5[51], INV_COS_BIT),
1231    half_btf(-COSPI_INV[24], stg5[45], -COSPI_INV[40], stg5[50], INV_COS_BIT),
1232    stg5[46],
1233    stg5[47],
1234    stg5[48],
1235    stg5[49],
1236    half_btf(-COSPI_INV[40], stg5[45], COSPI_INV[24], stg5[50], INV_COS_BIT),
1237    half_btf(-COSPI_INV[40], stg5[44], COSPI_INV[24], stg5[51], INV_COS_BIT),
1238    half_btf(COSPI_INV[24], stg5[43], COSPI_INV[40], stg5[52], INV_COS_BIT),
1239    half_btf(COSPI_INV[24], stg5[42], COSPI_INV[40], stg5[53], INV_COS_BIT),
1240    stg5[54],
1241    stg5[55],
1242    stg5[56],
1243    stg5[57],
1244    half_btf(-COSPI_INV[8], stg5[37], COSPI_INV[56], stg5[58], INV_COS_BIT),
1245    half_btf(-COSPI_INV[8], stg5[36], COSPI_INV[56], stg5[59], INV_COS_BIT),
1246    half_btf(COSPI_INV[56], stg5[35], COSPI_INV[8], stg5[60], INV_COS_BIT),
1247    half_btf(COSPI_INV[56], stg5[34], COSPI_INV[8], stg5[61], INV_COS_BIT),
1248    stg5[62],
1249    stg5[63],
1250  ];
1251
1252  // stage 7
1253  let stg7 = [
1254    clamp_value(stg6[0] + stg6[3], range),
1255    clamp_value(stg6[1] + stg6[2], range),
1256    clamp_value(stg6[1] - stg6[2], range),
1257    clamp_value(stg6[0] - stg6[3], range),
1258    stg6[4],
1259    half_btf(-COSPI_INV[32], stg6[5], COSPI_INV[32], stg6[6], INV_COS_BIT),
1260    half_btf(COSPI_INV[32], stg6[5], COSPI_INV[32], stg6[6], INV_COS_BIT),
1261    stg6[7],
1262    clamp_value(stg6[8] + stg6[11], range),
1263    clamp_value(stg6[9] + stg6[10], range),
1264    clamp_value(stg6[9] - stg6[10], range),
1265    clamp_value(stg6[8] - stg6[11], range),
1266    clamp_value(-stg6[12] + stg6[15], range),
1267    clamp_value(-stg6[13] + stg6[14], range),
1268    clamp_value(stg6[13] + stg6[14], range),
1269    clamp_value(stg6[12] + stg6[15], range),
1270    stg6[16],
1271    stg6[17],
1272    half_btf(-COSPI_INV[16], stg6[18], COSPI_INV[48], stg6[29], INV_COS_BIT),
1273    half_btf(-COSPI_INV[16], stg6[19], COSPI_INV[48], stg6[28], INV_COS_BIT),
1274    half_btf(-COSPI_INV[48], stg6[20], -COSPI_INV[16], stg6[27], INV_COS_BIT),
1275    half_btf(-COSPI_INV[48], stg6[21], -COSPI_INV[16], stg6[26], INV_COS_BIT),
1276    stg6[22],
1277    stg6[23],
1278    stg6[24],
1279    stg6[25],
1280    half_btf(-COSPI_INV[16], stg6[21], COSPI_INV[48], stg6[26], INV_COS_BIT),
1281    half_btf(-COSPI_INV[16], stg6[20], COSPI_INV[48], stg6[27], INV_COS_BIT),
1282    half_btf(COSPI_INV[48], stg6[19], COSPI_INV[16], stg6[28], INV_COS_BIT),
1283    half_btf(COSPI_INV[48], stg6[18], COSPI_INV[16], stg6[29], INV_COS_BIT),
1284    stg6[30],
1285    stg6[31],
1286    clamp_value(stg6[32] + stg6[39], range),
1287    clamp_value(stg6[33] + stg6[38], range),
1288    clamp_value(stg6[34] + stg6[37], range),
1289    clamp_value(stg6[35] + stg6[36], range),
1290    clamp_value(stg6[35] - stg6[36], range),
1291    clamp_value(stg6[34] - stg6[37], range),
1292    clamp_value(stg6[33] - stg6[38], range),
1293    clamp_value(stg6[32] - stg6[39], range),
1294    clamp_value(-stg6[40] + stg6[47], range),
1295    clamp_value(-stg6[41] + stg6[46], range),
1296    clamp_value(-stg6[42] + stg6[45], range),
1297    clamp_value(-stg6[43] + stg6[44], range),
1298    clamp_value(stg6[43] + stg6[44], range),
1299    clamp_value(stg6[42] + stg6[45], range),
1300    clamp_value(stg6[41] + stg6[46], range),
1301    clamp_value(stg6[40] + stg6[47], range),
1302    clamp_value(stg6[48] + stg6[55], range),
1303    clamp_value(stg6[49] + stg6[54], range),
1304    clamp_value(stg6[50] + stg6[53], range),
1305    clamp_value(stg6[51] + stg6[52], range),
1306    clamp_value(stg6[51] - stg6[52], range),
1307    clamp_value(stg6[50] - stg6[53], range),
1308    clamp_value(stg6[49] - stg6[54], range),
1309    clamp_value(stg6[48] - stg6[55], range),
1310    clamp_value(-stg6[56] + stg6[63], range),
1311    clamp_value(-stg6[57] + stg6[62], range),
1312    clamp_value(-stg6[58] + stg6[61], range),
1313    clamp_value(-stg6[59] + stg6[60], range),
1314    clamp_value(stg6[59] + stg6[60], range),
1315    clamp_value(stg6[58] + stg6[61], range),
1316    clamp_value(stg6[57] + stg6[62], range),
1317    clamp_value(stg6[56] + stg6[63], range),
1318  ];
1319
1320  // stage 8
1321  let stg8 = [
1322    clamp_value(stg7[0] + stg7[7], range),
1323    clamp_value(stg7[1] + stg7[6], range),
1324    clamp_value(stg7[2] + stg7[5], range),
1325    clamp_value(stg7[3] + stg7[4], range),
1326    clamp_value(stg7[3] - stg7[4], range),
1327    clamp_value(stg7[2] - stg7[5], range),
1328    clamp_value(stg7[1] - stg7[6], range),
1329    clamp_value(stg7[0] - stg7[7], range),
1330    stg7[8],
1331    stg7[9],
1332    half_btf(-COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[13], INV_COS_BIT),
1333    half_btf(-COSPI_INV[32], stg7[11], COSPI_INV[32], stg7[12], INV_COS_BIT),
1334    half_btf(COSPI_INV[32], stg7[11], COSPI_INV[32], stg7[12], INV_COS_BIT),
1335    half_btf(COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[13], INV_COS_BIT),
1336    stg7[14],
1337    stg7[15],
1338    clamp_value(stg7[16] + stg7[23], range),
1339    clamp_value(stg7[17] + stg7[22], range),
1340    clamp_value(stg7[18] + stg7[21], range),
1341    clamp_value(stg7[19] + stg7[20], range),
1342    clamp_value(stg7[19] - stg7[20], range),
1343    clamp_value(stg7[18] - stg7[21], range),
1344    clamp_value(stg7[17] - stg7[22], range),
1345    clamp_value(stg7[16] - stg7[23], range),
1346    clamp_value(-stg7[24] + stg7[31], range),
1347    clamp_value(-stg7[25] + stg7[30], range),
1348    clamp_value(-stg7[26] + stg7[29], range),
1349    clamp_value(-stg7[27] + stg7[28], range),
1350    clamp_value(stg7[27] + stg7[28], range),
1351    clamp_value(stg7[26] + stg7[29], range),
1352    clamp_value(stg7[25] + stg7[30], range),
1353    clamp_value(stg7[24] + stg7[31], range),
1354    stg7[32],
1355    stg7[33],
1356    stg7[34],
1357    stg7[35],
1358    half_btf(-COSPI_INV[16], stg7[36], COSPI_INV[48], stg7[59], INV_COS_BIT),
1359    half_btf(-COSPI_INV[16], stg7[37], COSPI_INV[48], stg7[58], INV_COS_BIT),
1360    half_btf(-COSPI_INV[16], stg7[38], COSPI_INV[48], stg7[57], INV_COS_BIT),
1361    half_btf(-COSPI_INV[16], stg7[39], COSPI_INV[48], stg7[56], INV_COS_BIT),
1362    half_btf(-COSPI_INV[48], stg7[40], -COSPI_INV[16], stg7[55], INV_COS_BIT),
1363    half_btf(-COSPI_INV[48], stg7[41], -COSPI_INV[16], stg7[54], INV_COS_BIT),
1364    half_btf(-COSPI_INV[48], stg7[42], -COSPI_INV[16], stg7[53], INV_COS_BIT),
1365    half_btf(-COSPI_INV[48], stg7[43], -COSPI_INV[16], stg7[52], INV_COS_BIT),
1366    stg7[44],
1367    stg7[45],
1368    stg7[46],
1369    stg7[47],
1370    stg7[48],
1371    stg7[49],
1372    stg7[50],
1373    stg7[51],
1374    half_btf(-COSPI_INV[16], stg7[43], COSPI_INV[48], stg7[52], INV_COS_BIT),
1375    half_btf(-COSPI_INV[16], stg7[42], COSPI_INV[48], stg7[53], INV_COS_BIT),
1376    half_btf(-COSPI_INV[16], stg7[41], COSPI_INV[48], stg7[54], INV_COS_BIT),
1377    half_btf(-COSPI_INV[16], stg7[40], COSPI_INV[48], stg7[55], INV_COS_BIT),
1378    half_btf(COSPI_INV[48], stg7[39], COSPI_INV[16], stg7[56], INV_COS_BIT),
1379    half_btf(COSPI_INV[48], stg7[38], COSPI_INV[16], stg7[57], INV_COS_BIT),
1380    half_btf(COSPI_INV[48], stg7[37], COSPI_INV[16], stg7[58], INV_COS_BIT),
1381    half_btf(COSPI_INV[48], stg7[36], COSPI_INV[16], stg7[59], INV_COS_BIT),
1382    stg7[60],
1383    stg7[61],
1384    stg7[62],
1385    stg7[63],
1386  ];
1387
1388  // stage 9
1389  let stg9 = [
1390    clamp_value(stg8[0] + stg8[15], range),
1391    clamp_value(stg8[1] + stg8[14], range),
1392    clamp_value(stg8[2] + stg8[13], range),
1393    clamp_value(stg8[3] + stg8[12], range),
1394    clamp_value(stg8[4] + stg8[11], range),
1395    clamp_value(stg8[5] + stg8[10], range),
1396    clamp_value(stg8[6] + stg8[9], range),
1397    clamp_value(stg8[7] + stg8[8], range),
1398    clamp_value(stg8[7] - stg8[8], range),
1399    clamp_value(stg8[6] - stg8[9], range),
1400    clamp_value(stg8[5] - stg8[10], range),
1401    clamp_value(stg8[4] - stg8[11], range),
1402    clamp_value(stg8[3] - stg8[12], range),
1403    clamp_value(stg8[2] - stg8[13], range),
1404    clamp_value(stg8[1] - stg8[14], range),
1405    clamp_value(stg8[0] - stg8[15], range),
1406    stg8[16],
1407    stg8[17],
1408    stg8[18],
1409    stg8[19],
1410    half_btf(-COSPI_INV[32], stg8[20], COSPI_INV[32], stg8[27], INV_COS_BIT),
1411    half_btf(-COSPI_INV[32], stg8[21], COSPI_INV[32], stg8[26], INV_COS_BIT),
1412    half_btf(-COSPI_INV[32], stg8[22], COSPI_INV[32], stg8[25], INV_COS_BIT),
1413    half_btf(-COSPI_INV[32], stg8[23], COSPI_INV[32], stg8[24], INV_COS_BIT),
1414    half_btf(COSPI_INV[32], stg8[23], COSPI_INV[32], stg8[24], INV_COS_BIT),
1415    half_btf(COSPI_INV[32], stg8[22], COSPI_INV[32], stg8[25], INV_COS_BIT),
1416    half_btf(COSPI_INV[32], stg8[21], COSPI_INV[32], stg8[26], INV_COS_BIT),
1417    half_btf(COSPI_INV[32], stg8[20], COSPI_INV[32], stg8[27], INV_COS_BIT),
1418    stg8[28],
1419    stg8[29],
1420    stg8[30],
1421    stg8[31],
1422    clamp_value(stg8[32] + stg8[47], range),
1423    clamp_value(stg8[33] + stg8[46], range),
1424    clamp_value(stg8[34] + stg8[45], range),
1425    clamp_value(stg8[35] + stg8[44], range),
1426    clamp_value(stg8[36] + stg8[43], range),
1427    clamp_value(stg8[37] + stg8[42], range),
1428    clamp_value(stg8[38] + stg8[41], range),
1429    clamp_value(stg8[39] + stg8[40], range),
1430    clamp_value(stg8[39] - stg8[40], range),
1431    clamp_value(stg8[38] - stg8[41], range),
1432    clamp_value(stg8[37] - stg8[42], range),
1433    clamp_value(stg8[36] - stg8[43], range),
1434    clamp_value(stg8[35] - stg8[44], range),
1435    clamp_value(stg8[34] - stg8[45], range),
1436    clamp_value(stg8[33] - stg8[46], range),
1437    clamp_value(stg8[32] - stg8[47], range),
1438    clamp_value(-stg8[48] + stg8[63], range),
1439    clamp_value(-stg8[49] + stg8[62], range),
1440    clamp_value(-stg8[50] + stg8[61], range),
1441    clamp_value(-stg8[51] + stg8[60], range),
1442    clamp_value(-stg8[52] + stg8[59], range),
1443    clamp_value(-stg8[53] + stg8[58], range),
1444    clamp_value(-stg8[54] + stg8[57], range),
1445    clamp_value(-stg8[55] + stg8[56], range),
1446    clamp_value(stg8[55] + stg8[56], range),
1447    clamp_value(stg8[54] + stg8[57], range),
1448    clamp_value(stg8[53] + stg8[58], range),
1449    clamp_value(stg8[52] + stg8[59], range),
1450    clamp_value(stg8[51] + stg8[60], range),
1451    clamp_value(stg8[50] + stg8[61], range),
1452    clamp_value(stg8[49] + stg8[62], range),
1453    clamp_value(stg8[48] + stg8[63], range),
1454  ];
1455
1456  // stage 10
1457  let stg10 = [
1458    clamp_value(stg9[0] + stg9[31], range),
1459    clamp_value(stg9[1] + stg9[30], range),
1460    clamp_value(stg9[2] + stg9[29], range),
1461    clamp_value(stg9[3] + stg9[28], range),
1462    clamp_value(stg9[4] + stg9[27], range),
1463    clamp_value(stg9[5] + stg9[26], range),
1464    clamp_value(stg9[6] + stg9[25], range),
1465    clamp_value(stg9[7] + stg9[24], range),
1466    clamp_value(stg9[8] + stg9[23], range),
1467    clamp_value(stg9[9] + stg9[22], range),
1468    clamp_value(stg9[10] + stg9[21], range),
1469    clamp_value(stg9[11] + stg9[20], range),
1470    clamp_value(stg9[12] + stg9[19], range),
1471    clamp_value(stg9[13] + stg9[18], range),
1472    clamp_value(stg9[14] + stg9[17], range),
1473    clamp_value(stg9[15] + stg9[16], range),
1474    clamp_value(stg9[15] - stg9[16], range),
1475    clamp_value(stg9[14] - stg9[17], range),
1476    clamp_value(stg9[13] - stg9[18], range),
1477    clamp_value(stg9[12] - stg9[19], range),
1478    clamp_value(stg9[11] - stg9[20], range),
1479    clamp_value(stg9[10] - stg9[21], range),
1480    clamp_value(stg9[9] - stg9[22], range),
1481    clamp_value(stg9[8] - stg9[23], range),
1482    clamp_value(stg9[7] - stg9[24], range),
1483    clamp_value(stg9[6] - stg9[25], range),
1484    clamp_value(stg9[5] - stg9[26], range),
1485    clamp_value(stg9[4] - stg9[27], range),
1486    clamp_value(stg9[3] - stg9[28], range),
1487    clamp_value(stg9[2] - stg9[29], range),
1488    clamp_value(stg9[1] - stg9[30], range),
1489    clamp_value(stg9[0] - stg9[31], range),
1490    stg9[32],
1491    stg9[33],
1492    stg9[34],
1493    stg9[35],
1494    stg9[36],
1495    stg9[37],
1496    stg9[38],
1497    stg9[39],
1498    half_btf(-COSPI_INV[32], stg9[40], COSPI_INV[32], stg9[55], INV_COS_BIT),
1499    half_btf(-COSPI_INV[32], stg9[41], COSPI_INV[32], stg9[54], INV_COS_BIT),
1500    half_btf(-COSPI_INV[32], stg9[42], COSPI_INV[32], stg9[53], INV_COS_BIT),
1501    half_btf(-COSPI_INV[32], stg9[43], COSPI_INV[32], stg9[52], INV_COS_BIT),
1502    half_btf(-COSPI_INV[32], stg9[44], COSPI_INV[32], stg9[51], INV_COS_BIT),
1503    half_btf(-COSPI_INV[32], stg9[45], COSPI_INV[32], stg9[50], INV_COS_BIT),
1504    half_btf(-COSPI_INV[32], stg9[46], COSPI_INV[32], stg9[49], INV_COS_BIT),
1505    half_btf(-COSPI_INV[32], stg9[47], COSPI_INV[32], stg9[48], INV_COS_BIT),
1506    half_btf(COSPI_INV[32], stg9[47], COSPI_INV[32], stg9[48], INV_COS_BIT),
1507    half_btf(COSPI_INV[32], stg9[46], COSPI_INV[32], stg9[49], INV_COS_BIT),
1508    half_btf(COSPI_INV[32], stg9[45], COSPI_INV[32], stg9[50], INV_COS_BIT),
1509    half_btf(COSPI_INV[32], stg9[44], COSPI_INV[32], stg9[51], INV_COS_BIT),
1510    half_btf(COSPI_INV[32], stg9[43], COSPI_INV[32], stg9[52], INV_COS_BIT),
1511    half_btf(COSPI_INV[32], stg9[42], COSPI_INV[32], stg9[53], INV_COS_BIT),
1512    half_btf(COSPI_INV[32], stg9[41], COSPI_INV[32], stg9[54], INV_COS_BIT),
1513    half_btf(COSPI_INV[32], stg9[40], COSPI_INV[32], stg9[55], INV_COS_BIT),
1514    stg9[56],
1515    stg9[57],
1516    stg9[58],
1517    stg9[59],
1518    stg9[60],
1519    stg9[61],
1520    stg9[62],
1521    stg9[63],
1522  ];
1523
1524  // stage 11
1525  output[0] = clamp_value(stg10[0] + stg10[63], range);
1526  output[1] = clamp_value(stg10[1] + stg10[62], range);
1527  output[2] = clamp_value(stg10[2] + stg10[61], range);
1528  output[3] = clamp_value(stg10[3] + stg10[60], range);
1529  output[4] = clamp_value(stg10[4] + stg10[59], range);
1530  output[5] = clamp_value(stg10[5] + stg10[58], range);
1531  output[6] = clamp_value(stg10[6] + stg10[57], range);
1532  output[7] = clamp_value(stg10[7] + stg10[56], range);
1533  output[8] = clamp_value(stg10[8] + stg10[55], range);
1534  output[9] = clamp_value(stg10[9] + stg10[54], range);
1535  output[10] = clamp_value(stg10[10] + stg10[53], range);
1536  output[11] = clamp_value(stg10[11] + stg10[52], range);
1537  output[12] = clamp_value(stg10[12] + stg10[51], range);
1538  output[13] = clamp_value(stg10[13] + stg10[50], range);
1539  output[14] = clamp_value(stg10[14] + stg10[49], range);
1540  output[15] = clamp_value(stg10[15] + stg10[48], range);
1541  output[16] = clamp_value(stg10[16] + stg10[47], range);
1542  output[17] = clamp_value(stg10[17] + stg10[46], range);
1543  output[18] = clamp_value(stg10[18] + stg10[45], range);
1544  output[19] = clamp_value(stg10[19] + stg10[44], range);
1545  output[20] = clamp_value(stg10[20] + stg10[43], range);
1546  output[21] = clamp_value(stg10[21] + stg10[42], range);
1547  output[22] = clamp_value(stg10[22] + stg10[41], range);
1548  output[23] = clamp_value(stg10[23] + stg10[40], range);
1549  output[24] = clamp_value(stg10[24] + stg10[39], range);
1550  output[25] = clamp_value(stg10[25] + stg10[38], range);
1551  output[26] = clamp_value(stg10[26] + stg10[37], range);
1552  output[27] = clamp_value(stg10[27] + stg10[36], range);
1553  output[28] = clamp_value(stg10[28] + stg10[35], range);
1554  output[29] = clamp_value(stg10[29] + stg10[34], range);
1555  output[30] = clamp_value(stg10[30] + stg10[33], range);
1556  output[31] = clamp_value(stg10[31] + stg10[32], range);
1557  output[32] = clamp_value(stg10[31] - stg10[32], range);
1558  output[33] = clamp_value(stg10[30] - stg10[33], range);
1559  output[34] = clamp_value(stg10[29] - stg10[34], range);
1560  output[35] = clamp_value(stg10[28] - stg10[35], range);
1561  output[36] = clamp_value(stg10[27] - stg10[36], range);
1562  output[37] = clamp_value(stg10[26] - stg10[37], range);
1563  output[38] = clamp_value(stg10[25] - stg10[38], range);
1564  output[39] = clamp_value(stg10[24] - stg10[39], range);
1565  output[40] = clamp_value(stg10[23] - stg10[40], range);
1566  output[41] = clamp_value(stg10[22] - stg10[41], range);
1567  output[42] = clamp_value(stg10[21] - stg10[42], range);
1568  output[43] = clamp_value(stg10[20] - stg10[43], range);
1569  output[44] = clamp_value(stg10[19] - stg10[44], range);
1570  output[45] = clamp_value(stg10[18] - stg10[45], range);
1571  output[46] = clamp_value(stg10[17] - stg10[46], range);
1572  output[47] = clamp_value(stg10[16] - stg10[47], range);
1573  output[48] = clamp_value(stg10[15] - stg10[48], range);
1574  output[49] = clamp_value(stg10[14] - stg10[49], range);
1575  output[50] = clamp_value(stg10[13] - stg10[50], range);
1576  output[51] = clamp_value(stg10[12] - stg10[51], range);
1577  output[52] = clamp_value(stg10[11] - stg10[52], range);
1578  output[53] = clamp_value(stg10[10] - stg10[53], range);
1579  output[54] = clamp_value(stg10[9] - stg10[54], range);
1580  output[55] = clamp_value(stg10[8] - stg10[55], range);
1581  output[56] = clamp_value(stg10[7] - stg10[56], range);
1582  output[57] = clamp_value(stg10[6] - stg10[57], range);
1583  output[58] = clamp_value(stg10[5] - stg10[58], range);
1584  output[59] = clamp_value(stg10[4] - stg10[59], range);
1585  output[60] = clamp_value(stg10[3] - stg10[60], range);
1586  output[61] = clamp_value(stg10[2] - stg10[61], range);
1587  output[62] = clamp_value(stg10[1] - stg10[62], range);
1588  output[63] = clamp_value(stg10[0] - stg10[63], range);
1589}
1590
/// Signature shared by every 1-D inverse transform kernel in this module:
/// consumes `input`, writes `output`, clamping intermediates to `range` bits.
type InvTxfmFn = fn(input: &[i32], output: &mut [i32], range: usize);

/// Dispatch table for the 1-D inverse transforms, indexed as
/// `INV_TXFM_FNS[1-D tx type][log2(size) - 2]` (columns cover sizes
/// 4, 8, 16, 32, 64).
///
/// Row order is DCT, ADST, flip-ADST, identity, WHT — presumably matching
/// the discriminants of the 1-D type returned by `get_1d_tx_types`
/// (TODO(review): confirm against the enum definition).
///
/// Size/type combinations that AV1 never signals (e.g. ADST beyond 16
/// points, WHT beyond 4 points) are filled with `unimplemented!()` stubs
/// so an out-of-spec lookup panics loudly instead of computing garbage.
static INV_TXFM_FNS: [[InvTxfmFn; 5]; 5] = [
  [av1_idct4, av1_idct8, av1_idct16, av1_idct32, av1_idct64],
  [
    av1_iadst4,
    av1_iadst8,
    av1_iadst16,
    |_, _, _| unimplemented!(),
    |_, _, _| unimplemented!(),
  ],
  [
    av1_iflipadst4,
    av1_iflipadst8,
    av1_iflipadst16,
    |_, _, _| unimplemented!(),
    |_, _, _| unimplemented!(),
  ],
  [
    av1_iidentity4,
    av1_iidentity8,
    av1_iidentity16,
    av1_iidentity32,
    |_, _, _| unimplemented!(),
  ],
  [
    av1_iwht4,
    |_, _, _| unimplemented!(),
    |_, _, _| unimplemented!(),
    |_, _, _| unimplemented!(),
    |_, _, _| unimplemented!(),
  ],
];
1624
pub(crate) mod rust {
  use super::*;
  use crate::cpu_features::CpuFeatureLevel;

  use simd_helpers::cold_for_target_arch;
  use std::cmp;

  /// Applies the 2-D inverse transform to `input` coefficients and adds the
  /// residual to the pixels already in `output` (reconstruction =
  /// prediction + residual), clamping each pixel to the `bd`-bit range.
  ///
  /// Implements the AV1 2-D inverse transform process as two 1-D passes:
  /// a transform over every row, an intermediate rounding shift, then a
  /// transform over every column, with intermediates clamped to
  /// bit-depth-dependent ranges between stages.
  ///
  /// `_eob` and `_cpu` are unused here; they exist so this fallback shares
  /// a signature with the SIMD implementations selected at the top of the
  /// file.
  #[cold_for_target_arch("x86_64", "aarch64")]
  pub fn inverse_transform_add<T: Pixel>(
    input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: u16,
    tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
  ) {
    let width: usize = tx_size.width();
    let height: usize = tx_size.height();

    // Only use at most 32 columns and 32 rows of input coefficients.
    // (AV1 signals only the top-left 32x32 coeffs of 64-point transforms.)
    let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];

    // For 64 point transforms, rely on the last 32 columns being initialized
    //   to zero for filling out missing input coeffs.
    let mut buffer = vec![0i32; width * height].into_boxed_slice();
    let rect_type = tx_size.rect_ratio_log2();
    let tx_types_1d = get_1d_tx_types(tx_type);
    // WHT is the lossless path: it skips the spec's rounding shifts below.
    let lossless = tx_type == TxType::WHT_WHT;

    // perform inv txfm on every row
    let range = bd + 8;
    // tx_types_1d.1 selects the horizontal (row) 1-D transform.
    let txfm_fn =
      INV_TXFM_FNS[tx_types_1d.1 as usize][tx_size.width_log2() - 2];
    // 64 point transforms only signal 32 coeffs. We only take chunks of 32
    //   and skip over the last 32 transforms here.
    for (r, buffer_slice) in (0..height.min(32)).zip(buffer.chunks_mut(width))
    {
      // For 64 point transforms, rely on the last 32 elements being
      //   initialized to zero for filling out the missing coeffs.
      let mut temp_in: [i32; 64] = [0; 64];
      // Gather row `r`: the stride of height.min(32) implies `input` is
      // stored column-major — NOTE(review): confirm against the callers.
      for (raw, clamped) in input[r..]
        .iter()
        .map(|a| i32::cast_from(*a))
        .step_by(height.min(32))
        .zip(temp_in.iter_mut())
      {
        // Rectangular (2:1 / 1:2) transforms scale coeffs by 1/sqrt(2);
        // the lossless WHT instead pre-shifts by 2 per the spec.
        let val = if rect_type.abs() == 1 {
          round_shift(raw * INV_SQRT2, SQRT2_BITS)
        } else if lossless {
          raw >> 2
        } else {
          raw
        };
        *clamped = clamp_value(val, range);
      }
      txfm_fn(&temp_in, buffer_slice, range);
    }

    // perform inv txfm on every col
    let range = cmp::max(bd + 6, 16);
    // tx_types_1d.0 selects the vertical (column) 1-D transform.
    let txfm_fn =
      INV_TXFM_FNS[tx_types_1d.0 as usize][tx_size.height_log2() - 2];
    for c in 0..width {
      let mut temp_in: [i32; 64] = [0; 64];
      let mut temp_out: [i32; 64] = [0; 64];
      // Gather column `c` from the row-pass results (row-major `buffer`),
      // applying the spec's intermediate shift for this transform size.
      for (raw, clamped) in
        buffer[c..].iter().step_by(width).zip(temp_in.iter_mut())
      {
        *clamped = clamp_value(
          round_shift(*raw, INV_INTERMEDIATE_SHIFTS[tx_size as usize]),
          range,
        );
      }
      txfm_fn(&temp_in, &mut temp_out, range);
      // Add the residual into the destination plane, clamping to [0, 2^bd).
      for (temp, out) in temp_out
        .iter()
        .zip(output.rows_iter_mut().map(|row| &mut row[c]).take(height))
      {
        let v: i32 = (*out).as_();
        // Final downshift of 4 is skipped on the lossless (WHT) path.
        let r = if lossless { *temp } else { round_shift(*temp, 4) };
        let v = clamp(v + r, 0, (1 << bd) - 1);
        *out = T::cast_from(v);
      }
    }
  }

  /* From AV1 Spec.
  https://aomediacodec.github.io/av1-spec/#2d-inverse-transform-process
  */
  // Per-TxSize shift applied between the row and column passes,
  // indexed by the `TxSize` discriminant.
  const INV_INTERMEDIATE_SHIFTS: [usize; TxSize::TX_SIZES_ALL] =
    [0, 1, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2];
}