#![allow(
    clippy::needless_range_loop,
    reason = "keep the scalar code comparable to the vectorized code"
)]
use core::{
    fmt::{Debug, Display},
    ops::Add,
};

use generic_array::typenum::U3;
#[allow(unused_imports)]
use generic_array::{
    ArrayLength, GenericArray,
    sequence::Flatten,
    typenum::{B1, U0, U1, U2, U4, U16, U127, U128, U256, U512, Unsigned},
};

#[cfg_attr(not(feature = "reference-rug"), allow(unused_imports))]
use num_traits::{FromPrimitive, NumCast, Signed, ToPrimitive, float::FloatCore};

pub use generic_array;
use sealing::Sealed;
use type_traits::{DivisibleBy8, EvaluateHardwareFeature, SquareOf, Term};

use crate::alignment::DefaultPaddedArray;

pub mod constants;

#[cfg(target_arch = "x86_64")]
pub mod x86;

pub mod router;

#[cfg(feature = "reference-rug")]
pub mod float128;

#[cfg(feature = "portable-simd")]
pub mod portable_simd;

pub mod threshold;

pub mod type_traits;

include!(concat!(env!("OUT_DIR"), "/dct_matrix.rs"));
include!(concat!(env!("OUT_DIR"), "/tent_filter_weights.rs"));
include!(concat!(env!("OUT_DIR"), "/convolution_offset.rs"));

// Fallback used when the specialized x86 kernels cannot run: the portable-SIMD
// kernel when that feature is enabled, otherwise the scalar kernel (padded to a
// 128x128 intermediate buffer on x86_64 so its buffer shape matches the
// vectorized kernels it is routed alongside).
#[cfg(feature = "portable-simd")]
pub type FallbackKernel = portable_simd::PortableSimdF32Kernel<8>;

#[cfg(all(not(feature = "portable-simd"), not(target_arch = "x86_64")))]
pub type FallbackKernel = DefaultKernelNoPadding;

#[cfg(all(not(feature = "portable-simd"), target_arch = "x86_64"))]
pub type FallbackKernel = DefaultKernelPadXYTo128;

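/// Construct the most capable [`Kernel`] available for the current build
/// configuration, falling back to slower implementations when the required
/// CPU features are not detected at runtime.
///
/// A minimal usage sketch (illustrative only; the full hashing pipeline and
/// buffer preparation are elided):
///
/// ```ignore
/// let kernel = smart_kernel();
/// println!("selected kernel: {}", kernel.ident());
/// ```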
#[inline(always)]
#[must_use]
pub fn smart_kernel() -> impl Kernel<
    RequiredHardwareFeature = impl EvaluateHardwareFeature<EnabledStatic = B1>,
    InputDimension = U512,
    OutputDimension = U16,
    InternalFloat = f32,
> + Clone {
    smart_kernel_impl()
}

#[cfg(target_arch = "x86_64")]
#[cfg(feature = "avx512")]
#[cfg(any(feature = "prefer-x86-intrinsics", not(feature = "portable-simd")))]
pub(crate) type SmartKernelConcreteType = router::KernelRouter<
    x86::Avx512F32Kernel,
    router::KernelRouter<x86::Avx2F32Kernel, FallbackKernel>,
>;

#[cfg(target_arch = "x86_64")]
#[cfg(feature = "avx512")]
#[cfg(all(not(feature = "prefer-x86-intrinsics"), feature = "portable-simd"))]
pub(crate) type SmartKernelConcreteType =
    router::KernelRouter<x86::Avx512F32Kernel, portable_simd::PortableSimdF32Kernel<8>>;

#[cfg(target_arch = "x86_64")]
#[cfg(feature = "avx512")]
#[inline(always)]
pub(crate) fn smart_kernel_impl() -> SmartKernelConcreteType {
    #[allow(unused_imports)]
    use router::KernelRouter;

    #[cfg(any(feature = "prefer-x86-intrinsics", not(feature = "portable-simd")))]
    {
        KernelRouter::new(x86::Avx2F32Kernel, FallbackKernel::default())
            .layer_on_top(x86::Avx512F32Kernel)
    }

    #[cfg(all(not(feature = "prefer-x86-intrinsics"), feature = "portable-simd"))]
    {
        KernelRouter::new(
            x86::Avx512F32Kernel,
            portable_simd::PortableSimdF32Kernel::<8>,
        )
    }
}

#[cfg(target_arch = "x86_64")]
#[cfg(not(feature = "avx512"))]
#[cfg(any(feature = "prefer-x86-intrinsics", not(feature = "portable-simd")))]
pub(crate) type SmartKernelConcreteType = router::KernelRouter<x86::Avx2F32Kernel, FallbackKernel>;

#[cfg(target_arch = "x86_64")]
#[cfg(not(feature = "avx512"))]
#[cfg(all(not(feature = "prefer-x86-intrinsics"), feature = "portable-simd"))]
pub(crate) type SmartKernelConcreteType = portable_simd::PortableSimdF32Kernel<8>;

#[cfg(target_arch = "x86_64")]
#[cfg(not(feature = "avx512"))]
#[inline(always)]
pub(crate) fn smart_kernel_impl() -> SmartKernelConcreteType {
    #[allow(unused_imports)]
    use router::KernelRouter;

    #[cfg(any(feature = "prefer-x86-intrinsics", not(feature = "portable-simd")))]
    {
        KernelRouter::new(x86::Avx2F32Kernel, FallbackKernel::default())
    }

    #[cfg(all(not(feature = "prefer-x86-intrinsics"), feature = "portable-simd"))]
    {
        portable_simd::PortableSimdF32Kernel::<8>::default()
    }
}

#[cfg(not(target_arch = "x86_64"))]
pub(crate) type SmartKernelConcreteType = FallbackKernel;

#[cfg(not(target_arch = "x86_64"))]
#[inline(always)]
pub(crate) fn smart_kernel_impl() -> SmartKernelConcreteType {
    FallbackKernel::default()
}

mod sealing {
    pub trait Sealed {}
}

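/// Extension trait for reinterpreting a flat `GenericArray` of `L * L` elements
/// as an `L x L` square matrix without copying.
///
/// A small sketch of the intended use (illustrative; `U16`/`U256` stand in for
/// any side length and its square):
///
/// ```ignore
/// use generic_array::{GenericArray, typenum::{U16, U256}};
///
/// let flat: GenericArray<f32, U256> = GenericArray::default();
/// let square: &GenericArray<GenericArray<f32, U16>, U16> = flat.unflatten_square_ref();
/// assert_eq!(square[3][5], 0.0);
/// ```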
pub trait SquareGenericArrayExt<I, L: SquareOf>: Sized + Sealed {
    /// Reinterpret the flat array as an `L x L` matrix by value.
    fn unflatten_square(self) -> GenericArray<GenericArray<I, L>, L> {
        // SAFETY: a `GenericArray<I, L * L>` and a `GenericArray<GenericArray<I, L>, L>`
        // have identical size and layout; only the nesting changes.
        unsafe { generic_array::const_transmute(self) }
    }

    /// Reinterpret the flat array as an `L x L` matrix by shared reference.
    fn unflatten_square_ref(&self) -> &GenericArray<GenericArray<I, L>, L> {
        // SAFETY: see `unflatten_square`.
        unsafe { generic_array::const_transmute(self) }
    }

    /// Reinterpret the flat array as an `L x L` matrix by mutable reference.
    fn unflatten_square_mut(&mut self) -> &mut GenericArray<GenericArray<I, L>, L> {
        // SAFETY: see `unflatten_square`.
        unsafe { generic_array::const_transmute(self) }
    }
}

impl<I, L: ArrayLength> Sealed for GenericArray<I, L> {}

impl<I, L: SquareOf> SquareGenericArrayExt<I, L> for GenericArray<I, <L as SquareOf>::Output> {}

/// Marker trait implemented by the reference-precision kernels.
pub trait PreciseKernel: Kernel {}

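// Torben's median-finding method: locate the median of the 16x16 matrix without
// sorting or mutating it, by repeatedly bisecting the [min, max] value range and
// counting how many elements fall below, above, and on the current guess.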
#[allow(clippy::all, clippy::pedantic)]
pub(crate) fn torben_median<F: FloatCore>(m: &GenericArray<GenericArray<F, U16>, U16>) -> F {
    let mut min = m.iter().flatten().cloned().reduce(F::min).unwrap();
    let mut max = m.iter().flatten().cloned().reduce(F::max).unwrap();

    // Target rank for the median in the 16 * 16 = 256-element matrix.
    let half = (16 * 16 + 1) / 2;
    loop {
        let guess = (min + max) / F::from(2).unwrap();
        let mut less = 0;
        let mut greater = 0;
        let mut equal = 0;
        let mut maxltguess = min;
        let mut mingtguess = max;
        // Count elements on each side of the guess and remember the closest
        // values below and above it.
        for val in m.iter().flatten() {
            if *val < guess {
                less += 1;
                if *val > maxltguess {
                    maxltguess = *val;
                }
            } else if *val > guess {
                greater += 1;
                if *val < mingtguess {
                    mingtguess = *val;
                }
            } else {
                equal += 1;
            }
        }
        if less <= half && greater <= half {
            return if less >= half {
                maxltguess
            } else if less + equal >= half {
                guess
            } else {
                mingtguess
            };
        } else if less > greater {
            max = maxltguess;
        } else {
            min = mingtguess;
        }
    }
}

/// Divisor applied in [`Kernel::adjust_quality`] before the result is clamped
/// to at most 1.0.
pub const QUALITY_ADJUST_DIVISOR: usize = 180;

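/// The set of operations a PDQ-style hashing backend provides: luma conversion,
/// Jarosz-filter downsampling, a 2D DCT, gradient-based quality estimation, and
/// quantization of the 16x16 DCT block into hash bits.
///
/// A rough sketch of how the stages fit together (illustrative only; buffer
/// setup is elided and the actual driver lives elsewhere in the crate):
///
/// ```ignore
/// kernel.jarosz_compress(&luma, &mut intermediate);         // 512x512 luma -> intermediate buffer
/// kernel.dct2d(&intermediate, &mut scratch_row, &mut dct);  // intermediate -> 16x16 DCT block
/// kernel.quantize(&dct, &mut threshold, &mut hash);         // 16x16 block -> 256 hash bits
/// ```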
pub trait Kernel {
    /// Width (row length) of the intermediate buffer produced by `jarosz_compress`.
    type Buffer1WidthX: ArrayLength;
    /// Height (number of rows) of the intermediate buffer.
    type Buffer1LengthY: ArrayLength;
    /// Side length of the square input image.
    type InputDimension: ArrayLength + SquareOf;
    /// Side length of the square output (DCT) matrix.
    type OutputDimension: ArrayLength + SquareOf;
    /// Hardware features this kernel requires before it can run.
    type RequiredHardwareFeature: EvaluateHardwareFeature;

    /// Identifier type returned by [`Kernel::ident`].
    type Ident: Debug + Display + Clone + Copy + 'static + PartialEq;

    /// Floating-point type used for intermediate computation.
    type InternalFloat: num_traits::float::TotalOrder
        + num_traits::FromPrimitive
        + num_traits::NumAssign
        + num_traits::bounds::Bounded
        + num_traits::NumCast
        + num_traits::identities::Zero
        + num_traits::identities::One
        + num_traits::Signed
        + PartialOrd
        + Clone
        + Display
        + Debug
        + Default
        + Send
        + Sync;

    /// Convert `InputDimension` packed RGB8 pixels to `f32` luma values. The luma
    /// coefficients are `f32` values passed through the `u32` const generics as
    /// native-endian bit patterns.
    fn cvt_rgb8_to_luma8f<const R_COEFF: u32, const G_COEFF: u32, const B_COEFF: u32>(
        &mut self,
        input: &GenericArray<GenericArray<u8, U3>, Self::InputDimension>,
        output: &mut GenericArray<f32, Self::InputDimension>,
    ) where
        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
    {
        for (input_pixel, output_pixel) in input.iter().zip(output.iter_mut()) {
            let r = input_pixel[0] as f32;
            let g = input_pixel[1] as f32;
            let b = input_pixel[2] as f32;
            // Reinterpret each coefficient's bit pattern back into an f32.
            let luma = f32::from_ne_bytes(R_COEFF.to_ne_bytes()) * r
                + f32::from_ne_bytes(G_COEFF.to_ne_bytes()) * g
                + f32::from_ne_bytes(B_COEFF.to_ne_bytes()) * b;
            *output_pixel = luma;
        }
    }

    /// Convert `InputDimension` packed RGBA8 pixels to `f32` luma values; the
    /// alpha channel is ignored.
    fn cvt_rgba8_to_luma8f<const R_COEFF: u32, const G_COEFF: u32, const B_COEFF: u32>(
        &mut self,
        input: &GenericArray<GenericArray<u8, U4>, Self::InputDimension>,
        output: &mut GenericArray<f32, Self::InputDimension>,
    ) where
        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
    {
        for (input_pixel, output_pixel) in input.iter().zip(output.iter_mut()) {
            let r = input_pixel[0] as f32;
            let g = input_pixel[1] as f32;
            let b = input_pixel[2] as f32;
            let luma = f32::from_ne_bytes(R_COEFF.to_ne_bytes()) * r
                + f32::from_ne_bytes(G_COEFF.to_ne_bytes()) * g
                + f32::from_ne_bytes(B_COEFF.to_ne_bytes()) * b;
            *output_pixel = luma;
        }
    }

    /// Transpose the `OutputDimension x OutputDimension` matrix in place.
    fn pdqf_t(
        &mut self,
        input: &mut GenericArray<
            GenericArray<Self::InternalFloat, Self::OutputDimension>,
            Self::OutputDimension,
        >,
    ) where
        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
    {
        for i in 0..Self::OutputDimension::USIZE {
            for j in 0..Self::OutputDimension::USIZE {
                if i < j {
                    (input[j][i], input[i][j]) = (input[i][j].clone(), input[j][i].clone());
                }
            }
        }
    }

    /// Negate every other column in place, starting from column 1 when `NEGATE`
    /// is true and from column 0 otherwise.
    fn pdqf_negate_alt_cols<const NEGATE: bool>(
        &mut self,
        input: &mut GenericArray<
            GenericArray<Self::InternalFloat, Self::OutputDimension>,
            Self::OutputDimension,
        >,
    ) where
        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
    {
        for i in 0..Self::OutputDimension::USIZE {
            for j in ((if NEGATE { 1 } else { 0 })..Self::OutputDimension::USIZE).step_by(2) {
                input[i][j] = -input[i][j].clone();
            }
        }
    }

    /// Negate every other row in place, starting from row 1 when `NEGATE` is true
    /// and from row 0 otherwise.
    fn pdqf_negate_alt_rows<const NEGATE: bool>(
        &mut self,
        input: &mut GenericArray<
            GenericArray<Self::InternalFloat, Self::OutputDimension>,
            Self::OutputDimension,
        >,
    ) where
        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
    {
        for i in ((if NEGATE { 1 } else { 0 })..Self::OutputDimension::USIZE).step_by(2) {
            for j in 0..Self::OutputDimension::USIZE {
                input[i][j] = -input[i][j].clone();
            }
        }
    }

    /// Negate every element whose row and column indices have different parity
    /// (a checkerboard pattern over the matrix).
    fn pdqf_negate_off_diagonals(
        &mut self,
        input: &mut GenericArray<
            GenericArray<Self::InternalFloat, Self::OutputDimension>,
            Self::OutputDimension,
        >,
    ) {
        for i in 0..Self::OutputDimension::USIZE {
            for j in 0..Self::OutputDimension::USIZE {
                if j.wrapping_sub(i) % 2 == 1 {
                    input[i][j] = -input[i][j].clone();
                }
            }
        }
    }

    /// A human-readable identifier for this kernel implementation.
    fn ident(&self) -> Self::Ident;

    /// Whether the hardware features this kernel requires are available at runtime.
    #[must_use]
    fn required_hardware_features_met() -> bool {
        Self::RequiredHardwareFeature::met_runtime()
    }

    /// Downsample the square input luma buffer into the intermediate buffer using
    /// a Jarosz (tent) filter.
    fn jarosz_compress(
        &mut self,
        _buffer: &GenericArray<GenericArray<f32, Self::InputDimension>, Self::InputDimension>,
        _output: &mut GenericArray<
            GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
            Self::Buffer1LengthY,
        >,
    ) where
        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>;

    /// Threshold the DCT output against its median, writing the median to
    /// `_threshold` and the resulting hash bits to `_output`.
    fn quantize(
        &mut self,
        _input: &GenericArray<
            GenericArray<Self::InternalFloat, Self::OutputDimension>,
            Self::OutputDimension,
        >,
        _threshold: &mut Self::InternalFloat,
        _output: &mut GenericArray<
            GenericArray<u8, <Self::OutputDimension as DivisibleBy8>::Output>,
            Self::OutputDimension,
        >,
    ) where
        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
        <Self as Kernel>::OutputDimension: DivisibleBy8;

    /// Sum of absolute differences between vertically and horizontally adjacent
    /// elements of `_input`.
    #[must_use]
    fn sum_of_gradients(
        &mut self,
        _input: &GenericArray<
            GenericArray<Self::InternalFloat, Self::OutputDimension>,
            Self::OutputDimension,
        >,
    ) -> Self::InternalFloat
    where
        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>;

    /// Scale a gradient sum into a quality score, clamped to at most 1.0.
    #[must_use]
    fn adjust_quality(_input: Self::InternalFloat) -> f32;

    /// Two-dimensional DCT: transform the intermediate buffer into the
    /// `OutputDimension x OutputDimension` output matrix, using `_tmp_row_buffer`
    /// as scratch space.
    fn dct2d(
        &mut self,
        _buffer: &GenericArray<
            GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
            Self::Buffer1LengthY,
        >,
        _tmp_row_buffer: &mut GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
        _output: &mut GenericArray<
            GenericArray<Self::InternalFloat, Self::OutputDimension>,
            Self::OutputDimension,
        >,
    ) where
        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>;
}

/// Scalar (auto-vectorizable) kernel. The two type parameters add optional
/// padding to the base 127-element intermediate buffer dimensions.
#[derive(Clone, Copy)]
pub struct DefaultKernel<
    PadIntermediateX: ArrayLength + Add<U127> = U0,
    PadIntermediateY: ArrayLength + Add<U127> = U0,
> where
    <PadIntermediateX as Add<U127>>::Output: ArrayLength,
    <PadIntermediateY as Add<U127>>::Output: ArrayLength,
{
    _marker: core::marker::PhantomData<(PadIntermediateX, PadIntermediateY)>,
}

/// `DefaultKernel` operating on an unpadded 127x127 intermediate buffer.
pub type DefaultKernelNoPadding = DefaultKernel<U0, U0>;
/// `DefaultKernel` with the intermediate buffer padded to 128 columns.
pub type DefaultKernelPadXTo128 = DefaultKernel<U1, U0>;
/// `DefaultKernel` with the intermediate buffer padded to 128 rows.
pub type DefaultKernelPadYTo128 = DefaultKernel<U0, U1>;
/// `DefaultKernel` with the intermediate buffer padded to 128x128.
pub type DefaultKernelPadXYTo128 = DefaultKernel<U1, U1>;

impl<PadIntermediateX: ArrayLength + Add<U127>, PadIntermediateY: ArrayLength + Add<U127>> Default
    for DefaultKernel<PadIntermediateX, PadIntermediateY>
where
    <PadIntermediateX as Add<U127>>::Output: ArrayLength,
    <PadIntermediateY as Add<U127>>::Output: ArrayLength,
{
    fn default() -> Self {
        Self {
            _marker: core::marker::PhantomData,
        }
    }
}

impl<PadIntermediateX: ArrayLength + Add<U127>, PadIntermediateY: ArrayLength + Add<U127>>
    DefaultKernel<PadIntermediateX, PadIntermediateY>
where
    <PadIntermediateX as Add<U127>>::Output: ArrayLength,
    <PadIntermediateY as Add<U127>>::Output: ArrayLength,
{
    #[inline(always)]
    pub(crate) fn dct2d_impl<P: ArrayLength>(
        dct_matrix_rmajor: &DefaultPaddedArray<f32, DctMatrixNumElements, P>,
        buffer: &GenericArray<
            GenericArray<f32, <PadIntermediateX as Add<U127>>::Output>,
            <PadIntermediateY as Add<U127>>::Output,
        >,
        tmp_row_buffer: &mut GenericArray<f32, <PadIntermediateX as Add<U127>>::Output>,
        output: &mut GenericArray<GenericArray<f32, U16>, U16>,
    ) {
        for k in 0..16 {
            // First pass: dot row k of the DCT matrix with every column of the
            // buffer. Four partial sums break the serial dependency of the
            // reduction so the loop can auto-vectorize.
            for j in 0..127 {
                let mut sumks = [0.0; 4];
                for (k2, sumk) in (0..127).zip((0..4).cycle()) {
                    sumks[sumk] +=
                        dct_matrix_rmajor[k * DctMatrixNumCols::USIZE + k2] * buffer[k2][j];
                }

                tmp_row_buffer[j] = sumks[0] + sumks[1] + sumks[2] + sumks[3];
            }

            // Second pass: multiply the temporary row by the DCT matrix again to
            // produce row k of the 16x16 output.
            for j in 0..(DctMatrixNumRows::USIZE) {
                let mut sumks = [0.0; 4];
                for (m, sumk) in (0..DctMatrixNumCols::USIZE).zip((0..4).cycle()) {
                    sumks[sumk] +=
                        tmp_row_buffer[m] * dct_matrix_rmajor[j * DctMatrixNumCols::USIZE + m];
                }
                output[k][j] = sumks[0] + sumks[1] + sumks[2] + sumks[3];
            }
        }
    }
}

impl<PadIntermediateX: ArrayLength + Add<U127>, PadIntermediateY: ArrayLength + Add<U127>> Kernel
    for DefaultKernel<PadIntermediateX, PadIntermediateY>
where
    <PadIntermediateX as Add<U127>>::Output: ArrayLength,
    <PadIntermediateY as Add<U127>>::Output: ArrayLength,
{
    type Buffer1WidthX = <PadIntermediateX as Add<U127>>::Output;
    type Buffer1LengthY = <PadIntermediateY as Add<U127>>::Output;
    type InternalFloat = f32;
    type InputDimension = U512;
    type OutputDimension = U16;
    type RequiredHardwareFeature = Term;
    type Ident = &'static str;

    fn ident(&self) -> &'static str {
        "default_scalar_autovectorized_f32"
    }

    fn quantize(
        &mut self,
        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
        threshold: &mut Self::InternalFloat,
        output: &mut GenericArray<GenericArray<u8, U2>, U16>,
    ) {
        let median = torben_median(input);
        *threshold = median;
        let output = output.flatten();
        output.fill(0);
        for (i, j) in input.iter().flatten().enumerate() {
            output[32 - 1 - i / 8] += if *j > median { 1 << (i % 8) } else { 0 };
        }
    }

    #[allow(clippy::cast_precision_loss)]
    fn adjust_quality(input: Self::InternalFloat) -> f32 {
        let scaled = input.to_f32().unwrap() / (QUALITY_ADJUST_DIVISOR as f32);

        scaled.min(1.0)
    }

    fn sum_of_gradients(
        &mut self,
        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
    ) -> Self::InternalFloat {
        let mut gradient_sum = Default::default();

        for i in 0..(16 - 1) {
            for j in 0..16 {
                let u = input[i][j];
                let v = input[i + 1][j];
                let d = u - v;
                gradient_sum += d.abs();
            }
        }

        for i in 0..16 {
            for j in 0..(16 - 1) {
                let u = input[i][j];
                let v = input[i][j + 1];
                let d = u - v;
                gradient_sum += d.abs();
            }
        }

        gradient_sum
    }

    fn jarosz_compress(
        &mut self,
        buffer: &GenericArray<GenericArray<f32, U512>, U512>,
        output: &mut GenericArray<
            GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
            Self::Buffer1LengthY,
        >,
    ) {
        for outi in 0..127 {
            let in_i = CONVOLUTION_OFFSET_512_TO_127[outi] - TENT_FILTER_COLUMN_OFFSET;
            for outj in 0..127 {
                let in_j = CONVOLUTION_OFFSET_512_TO_127[outj] - TENT_FILTER_COLUMN_OFFSET;
                let mut sum = 0.0;
                for di in 0..TENT_FILTER_EFFECTIVE_ROWS {
                    for dj in 0..TENT_FILTER_EFFECTIVE_COLS {
                        sum += TENT_FILTER_WEIGHTS[di * TENT_FILTER_EFFECTIVE_COLS + dj]
                            * buffer[in_i + di][in_j + dj];
                    }
                }
                output[outi][outj] = sum;
            }
        }
    }

    fn dct2d(
        &mut self,
        buffer: &GenericArray<GenericArray<f32, Self::Buffer1WidthX>, Self::Buffer1LengthY>,
        tmp_row_buffer: &mut GenericArray<f32, Self::Buffer1WidthX>,
        output: &mut GenericArray<GenericArray<f32, U16>, U16>,
    ) {
        Self::dct2d_impl(&DCT_MATRIX_RMAJOR, buffer, tmp_row_buffer, output);
    }
}

#[cfg(any(feature = "std", test))]
/// Scalar reference implementation, generic over the internal float type; used
/// for testing and cross-validating the optimized kernels.
#[derive(Default)]
pub struct ReferenceKernel<N: num_traits::FromPrimitive = f32> {
    _marker: core::marker::PhantomData<N>,
}

#[cfg(any(feature = "std", test))]
impl PreciseKernel for ReferenceKernel<f32> {}

#[cfg(any(feature = "std", test))]
impl PreciseKernel for ReferenceKernel<f64> {}

#[cfg(any(feature = "std", test))]
impl Kernel for ReferenceKernel<f32> {
    type Buffer1WidthX = U127;
    type Buffer1LengthY = U127;
    type InternalFloat = f32;
    type InputDimension = U512;
    type OutputDimension = U16;
    type RequiredHardwareFeature = Term;
    type Ident = &'static str;

    fn ident(&self) -> Self::Ident {
        "reference_scalar_autovectorized"
    }

    #[allow(clippy::cast_precision_loss)]
    fn adjust_quality(input: Self::InternalFloat) -> f32 {
        let scaled = input / (QUALITY_ADJUST_DIVISOR as f32);

        scaled.min(1.0)
    }

    fn dct2d(
        &mut self,
        buffer: &GenericArray<GenericArray<f32, Self::Buffer1WidthX>, Self::Buffer1LengthY>,
        tmp_row_buffer: &mut GenericArray<f32, Self::Buffer1WidthX>,
        output: &mut GenericArray<GenericArray<f32, U16>, U16>,
    ) {
        for i in 0..16 {
            for j in 0..127 {
                let mut sumk = 0.0;
                for k in 0..127 {
                    sumk += DCT_MATRIX_RMAJOR[i * DctMatrixNumCols::USIZE + k] * buffer[k][j];
                }

                tmp_row_buffer[j] = sumk;
            }

            for j in 0..16 {
                let mut sumk = 0.0;
                for k in 0..127 {
                    sumk += tmp_row_buffer[k] * DCT_MATRIX_RMAJOR[j * DctMatrixNumCols::USIZE + k];
                }
                output[i][j] = sumk;
            }
        }
    }

    fn sum_of_gradients(
        &mut self,
        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
    ) -> Self::InternalFloat {
        let mut gradient_sum = Default::default();

        for i in 0..(16 - 1) {
            for j in 0..16 {
                let u = input[i][j];
                let v = input[i + 1][j];
                let d = u - v;
                gradient_sum += d.abs();
            }
        }

        for i in 0..16 {
            for j in 0..(16 - 1) {
                let u = input[i][j];
                let v = input[i][j + 1];
                let d = u - v;
                gradient_sum += d.abs();
            }
        }

        gradient_sum
    }

    fn quantize(
        &mut self,
        input: &GenericArray<GenericArray<f32, U16>, U16>,
        threshold: &mut Self::InternalFloat,
        output: &mut GenericArray<GenericArray<u8, U2>, U16>,
    ) {
        let median = torben_median(input);
        *threshold = median;

        let output = output.flatten();

        for i in 0..32 {
            let mut byte = 0;
            for j in 0..8 {
                let offset = i * 8 + j;
                let offset_i = offset / 16;
                let offset_j = offset % 16;
                let val = input[offset_i][offset_j];
                if val > median {
                    byte |= 1 << j;
                }
            }
            output[32 - i - 1] = byte;
        }
    }

    fn jarosz_compress(
        &mut self,
        buffer: &GenericArray<GenericArray<f32, U512>, U512>,
        output: &mut GenericArray<GenericArray<f32, Self::Buffer1WidthX>, Self::Buffer1LengthY>,
    ) {
        #[allow(missing_docs, dead_code)]
        mod reference {
            include!("ref.rs");
        }

        let window_size = reference::compute_jarosz_filter_window_size(512, 127);
        let mut buffer = buffer.flatten().to_vec();
        reference::jarosz_filter_float(
            buffer.as_mut_slice().try_into().unwrap(),
            512,
            512,
            window_size,
            window_size,
            2,
        );

        for outi in 0..127 {
            let ini = ((outi * 2 + 1) * 512) / (127 * 2);

            for outj in 0..127 {
                let inj = ((outj * 2 + 1) * 512) / (127 * 2);

                output[outi][outj] = buffer[ini * 512 + inj];
            }
        }
    }
}

#[cfg(any(feature = "std", test))]
impl Kernel for ReferenceKernel<f64> {
    type Buffer1WidthX = U127;
    type Buffer1LengthY = U127;
    type InternalFloat = f64;
    type InputDimension = U512;
    type OutputDimension = U16;
    type RequiredHardwareFeature = Term;
    type Ident = &'static str;

    fn ident(&self) -> Self::Ident {
        "reference_scalar_autovectorized_f64"
    }

    #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
    fn adjust_quality(input: Self::InternalFloat) -> f32 {
        let scaled = input / (QUALITY_ADJUST_DIVISOR as f64);

        scaled.min(1.0) as f32
    }

    fn dct2d(
        &mut self,
        buffer: &GenericArray<GenericArray<f64, Self::Buffer1WidthX>, Self::Buffer1LengthY>,
        tmp_row_buffer: &mut GenericArray<f64, Self::Buffer1WidthX>,
        output: &mut GenericArray<GenericArray<f64, U16>, U16>,
    ) {
        for i in 0..16 {
            for j in 0..127 {
                let mut sumk = 0.0;
                for k in 0..127 {
                    sumk += DCT_MATRIX_RMAJOR_64[i * DctMatrixNumCols::USIZE + k] * buffer[k][j];
                }

                tmp_row_buffer[j] = sumk;
            }

            for j in 0..16 {
                let mut sumk = 0.0;
                for k in 0..127 {
                    sumk +=
                        tmp_row_buffer[k] * DCT_MATRIX_RMAJOR_64[j * DctMatrixNumCols::USIZE + k];
                }
                output[i][j] = sumk;
            }
        }
    }

    fn sum_of_gradients(
        &mut self,
        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
    ) -> Self::InternalFloat {
        let mut gradient_sum = Default::default();

        for i in 0..(16 - 1) {
            for j in 0..16 {
                let u = input[i][j];
                let v = input[i + 1][j];
                let d = u - v;
                gradient_sum += d.abs();
            }
        }

        for i in 0..16 {
            for j in 0..(16 - 1) {
                let u = input[i][j];
                let v = input[i][j + 1];
                let d = u - v;
                gradient_sum += d.abs();
            }
        }

        gradient_sum
    }

    fn quantize(
        &mut self,
        input: &GenericArray<GenericArray<f64, U16>, U16>,
        threshold: &mut Self::InternalFloat,
        output: &mut GenericArray<GenericArray<u8, U2>, U16>,
    ) {
        let median = torben_median(input);
        *threshold = median;

        let output = output.flatten();

        for i in 0..32 {
            let mut byte = 0;
            for j in 0..8 {
                let offset = i * 8 + j;
                let offset_i = offset / 16;
                let offset_j = offset % 16;
                let val = input[offset_i][offset_j];
                if val > median {
                    byte |= 1 << j;
                }
            }
            output[32 - i - 1] = byte;
        }
    }

    fn jarosz_compress(
        &mut self,
        buffer: &GenericArray<GenericArray<f32, U512>, U512>,
        output: &mut GenericArray<GenericArray<f64, Self::Buffer1WidthX>, Self::Buffer1LengthY>,
    ) {
        #[allow(missing_docs, dead_code)]
        mod reference {
            include!("ref.rs");
        }

        let window_size = reference::compute_jarosz_filter_window_size(512, 127);
        let mut buffer = buffer
            .flatten()
            .into_iter()
            .map(|s| From::from(*s))
            .collect::<Vec<f64>>();
        reference::jarosz_filter_float(
            buffer.as_mut_slice().try_into().unwrap(),
            512,
            512,
            window_size,
            window_size,
            2,
        );

        for outi in 0..127 {
            let ini = ((outi * 2 + 1) * 512) / (127 * 2);

            for outj in 0..127 {
                let inj = ((outj * 2 + 1) * 512) / (127 * 2);

                output[outi][outj] = buffer[ini * 512 + inj];
            }
        }
    }
}

#[cfg(feature = "reference-rug")]
impl<const C: u32> Kernel for ReferenceKernel<float128::ArbFloat<C>> {
    type Buffer1WidthX = U127;
    type Buffer1LengthY = U127;
    type InternalFloat = float128::ArbFloat<C>;
    type InputDimension = U512;
    type OutputDimension = U16;
    type RequiredHardwareFeature = Term;
    type Ident = &'static str;

    fn ident(&self) -> &'static str {
        "reference_scalar_autovectorized_arb_float"
    }

    fn adjust_quality(input: Self::InternalFloat) -> f32 {
        let scaled = input / (float128::ArbFloat::from_usize(QUALITY_ADJUST_DIVISOR).unwrap());

        let one = float128::ArbFloat::from_f32(1.0).unwrap();
        if scaled > one { 1.0 } else { scaled.to_f32() }
    }

    fn sum_of_gradients(
        &mut self,
        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
    ) -> Self::InternalFloat {
        let mut gradient_sum = float128::ArbFloat::default();

        for i in 0..(16 - 1) {
            for j in 0..16 {
                let u = input[i][j].clone();
                let v = input[i + 1][j].clone();
                let d = u - v;
                gradient_sum += d.abs();
            }
        }

        for i in 0..16 {
            for j in 0..(16 - 1) {
                let u = input[i][j].clone();
                let v = input[i][j + 1].clone();
                let d = u - v;
                gradient_sum += d.abs();
            }
        }

        gradient_sum
    }

    fn dct2d(
        &mut self,
        buffer: &GenericArray<
            GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
            Self::Buffer1LengthY,
        >,
        _tmp_row_buffer: &mut GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
        output: &mut GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
    ) {
        let d_value = |i: i32, j: i32, n: i32| {
            let n1: Self::InternalFloat = NumCast::from(n).unwrap();
            let i1: Self::InternalFloat = NumCast::from(i).unwrap();
            let j1: Self::InternalFloat = NumCast::from(j).unwrap();
            let one: Self::InternalFloat = NumCast::from(1).unwrap();
            let two: Self::InternalFloat = one.clone() + one.clone();
            let pi: Self::InternalFloat = float128::ArbFloat::<C>::pi();

            (two.clone() / n1.clone()).sqrt()
                * (pi / (two.clone() * n1) * i1 * (two * j1 + one)).cos()
        };

        // DCT basis rows 1..=16 (the DC row is skipped), each 127 entries long.
        let mut d_table = Vec::with_capacity(127 * 16);
        for i in 1..=16 {
            for j in 0..127 {
                d_table.push(d_value(i, j, 127));
            }
        }

        let mut intermediate_matrix = Vec::<Self::InternalFloat>::with_capacity(127);
        intermediate_matrix.resize(127, Self::InternalFloat::default());

        for i in 0..16 {
            for j in 0..127 {
                let mut sumk = Self::InternalFloat::default();
                for k in 0..127 {
                    let d = d_table[i * 127 + k].clone();
                    sumk += d * buffer[k][j].clone();
                }

                intermediate_matrix[j] = sumk;
            }

            for j in 0..16 {
                let mut sumk = Self::InternalFloat::default();
                for k in 0..127 {
                    let d = d_table[j * 127 + k].clone();
                    sumk += intermediate_matrix[k].clone() * d;
                }
                output[i][j] = sumk;
            }
        }
    }

    fn quantize(
        &mut self,
        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
        threshold: &mut Self::InternalFloat,
        output: &mut GenericArray<GenericArray<u8, U2>, U16>,
    ) {
        // Exact median of the 256 values: average of the two middle elements
        // after sorting.
        let half = Self::InternalFloat::from_f64(0.5).unwrap();
        let mut flattened = GenericArray::<Self::InternalFloat, U256>::default();
        for i in 0..16 {
            for j in 0..16 {
                flattened[i * 16 + j] = input[i][j].clone();
            }
        }
        flattened.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
        let median = (flattened[127].clone() + flattened[128].clone()) * half;
        *threshold = median.clone();

        let output = output.flatten();
        for i in 0..32 {
            let mut byte = 0;
            for j in 0..8 {
                let offset = i * 8 + j;
                let offset_i = offset / 16;
                let offset_j = offset % 16;
                let val = input[offset_i][offset_j].clone();
                if val > median {
                    byte |= 1 << j;
                }
            }
            output[32 - i - 1] = byte;
        }
    }

    fn jarosz_compress(
        &mut self,
        buffer: &GenericArray<GenericArray<f32, U512>, U512>,
        output: &mut GenericArray<
            GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
            Self::Buffer1LengthY,
        >,
    ) {
        #[allow(missing_docs, dead_code)]
        mod reference {
            include!("ref.rs");
        }

        let window_size = reference::compute_jarosz_filter_window_size(512, 127);
        let mut buffer = buffer
            .flatten()
            .iter()
            .map(|s| float128::ArbFloat::<C>::from_f32(*s).unwrap())
            .collect::<Vec<_>>();
        reference::jarosz_filter_float(
            buffer.as_mut_slice().try_into().unwrap(),
            512,
            512,
            window_size,
            window_size,
            2,
        );

        for outi in 0..127 {
            let ini = ((outi * 2 + 1) * 512) / (127 * 2);

            for outj in 0..127 {
                let inj = ((outj * 2 + 1) * 512) / (127 * 2);

                output[outi][outj] = buffer[ini * 512 + inj].clone();
            }
        }
    }
}

#[cfg(feature = "reference-rug")]
impl<const C: u32> PreciseKernel for ReferenceKernel<float128::ArbFloat<C>> {}

#[cfg(test)]
mod tests {
    use core::ops::Mul;

    use num_traits::NumCast;
    use rand::Rng;

    use super::*;

    extern crate alloc;

    #[allow(dead_code)]
    fn test_gradient_impl<OD: ArrayLength, K: Kernel<OutputDimension = OD>>(kernel: &mut K)
    where
        ReferenceKernel<<K as Kernel>::InternalFloat>:
            Kernel<InternalFloat = <K as Kernel>::InternalFloat, OutputDimension = OD>,
        OD: Mul<OD>,
        <OD as Mul<OD>>::Output: ArrayLength,
        K::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
        ReferenceKernel<K::InternalFloat>:
            Kernel<InternalFloat = K::InternalFloat, OutputDimension = OD>,
        <ReferenceKernel<K::InternalFloat> as Kernel>::RequiredHardwareFeature:
            EvaluateHardwareFeature<EnabledStatic = B1>,
    {
        let mut rng = rand::rng();
        let mut input: GenericArray<GenericArray<<K as Kernel>::InternalFloat, OD>, OD> =
            GenericArray::default();
        input.iter_mut().for_each(|row| {
            row.iter_mut()
                .for_each(|val| *val = NumCast::from(rng.random_range(0.0..1.0)).unwrap());
        });

        let gradient_ref = ReferenceKernel::<K::InternalFloat>::default().sum_of_gradients(&input);
        let gradient = kernel.sum_of_gradients(&input);

        let diff = (gradient.clone() - NumCast::from(gradient_ref.clone()).unwrap()).abs();
        let diff_ref = diff / NumCast::from(gradient_ref.clone()).unwrap();

        assert!(
            diff_ref < NumCast::from(0.05).unwrap(),
            "gradient: {}, gradient_ref: {}, diff: {} %",
            gradient,
            gradient_ref,
            diff_ref * NumCast::from(100.0).unwrap(),
        );
    }

    fn test_dct64_impl<K: Kernel>(kernel: &mut K, eps: K::InternalFloat)
    where
        <K as Kernel>::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
        ReferenceKernel<K::InternalFloat>: Kernel<InternalFloat = K::InternalFloat>,
        <ReferenceKernel<K::InternalFloat> as Kernel>::RequiredHardwareFeature:
            EvaluateHardwareFeature<EnabledStatic = B1>,
    {
        let mut rng = rand::rng();
        let mut input_ref: GenericArray<GenericArray<<K as Kernel>::InternalFloat, _>, _> =
            GenericArray::default();
        input_ref.iter_mut().enumerate().for_each(|(i, row)| {
            row.iter_mut().enumerate().for_each(|(j, val)| {
                *val = if (i + 2 * j) % 20 < 10 {
                    NumCast::from(rng.random_range(0.0..1.0)).unwrap()
                } else {
                    NumCast::from(rng.random_range(-1.0..0.0)).unwrap()
                };
            });
        });
        let mut input: GenericArray<
            GenericArray<K::InternalFloat, K::Buffer1WidthX>,
            K::Buffer1LengthY,
        > = GenericArray::default();
        for i in 0..127 {
            for j in 0..127 {
                input[i][j] = input_ref[i][j].clone();
            }
        }

        let mut output = GenericArray::default();
        let mut output_ref = GenericArray::default();
        ReferenceKernel::<K::InternalFloat>::default().dct2d(
            &input_ref,
            &mut GenericArray::default(),
            &mut output_ref,
        );

        kernel.dct2d(&input, &mut GenericArray::default(), &mut output);

        output
            .iter()
            .flatten()
            .zip(output_ref.iter().flatten())
            .for_each(|(a, b)| {
                let diff = (a.clone() - NumCast::from(b.clone()).unwrap()).abs();
                assert!(diff <= eps, "difference: {:?} (tolerance: {:?})", diff, eps);
            });
    }

    #[test]
    fn test_dct64_impl_base() {
        let mut kernel = DefaultKernelNoPadding::default();
        test_dct64_impl(&mut kernel, NumCast::from(f32::EPSILON * 32.00).unwrap());
    }

    #[test]
    fn test_dct64_impl_smart_kernel() {
        let mut kernel = smart_kernel();
        test_dct64_impl(&mut kernel, NumCast::from(f32::EPSILON * 32.00).unwrap());
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_dct64_impl_avx2_equivalence() {
        use generic_array::typenum::U128;

        let mut dct_matrix =
            DefaultPaddedArray::<_, _, U16>::new(*GenericArray::from_slice(&[1.0; 16 * 127]));

        let mut input_buffer_127: Box<GenericArray<GenericArray<f32, U127>, U127>> = Box::default();
        let mut input_buffer_128: Box<GenericArray<GenericArray<f32, U128>, U128>> = Box::default();

        input_buffer_127
            .iter_mut()
            .enumerate()
            .for_each(|(i, row)| {
                row.iter_mut().enumerate().for_each(|(j, val)| {
                    input_buffer_128[i][j] = (i * 127 + j) as f32 * 0.00001;
                    *val = (i * 127 + j) as f32 * 0.00001;
                });
            });
        let mut output = GenericArray::default();
        let mut output_ref = GenericArray::default();
        DefaultKernelNoPadding::dct2d_impl(
            &dct_matrix,
            &input_buffer_127,
            &mut GenericArray::default(),
            &mut output_ref,
        );

        #[allow(unused_unsafe)]
        unsafe {
            x86::Avx2F32Kernel::dct2d_impl(
                &dct_matrix,
                &input_buffer_128,
                &mut GenericArray::default(),
                &mut output,
            );
        }

        for i in 0..16 {
            for j in 0..16 {
                let diff = (output_ref[i][j] - output[i][j]).abs();
                let diff_ref = diff / output_ref[i][j];
                assert!(
                    diff_ref < 4e-7,
                    "difference: {:?} (relative: {:?})",
                    diff,
                    diff_ref
                );
            }
        }

        input_buffer_127
            .iter_mut()
            .enumerate()
            .for_each(|(i, row)| {
                row.iter_mut().enumerate().for_each(|(j, val)| {
                    input_buffer_128[i][j] = 0.00001;
                    *val = 0.00001;
                });
            });

        dct_matrix
            .iter_mut()
            .enumerate()
            .for_each(|(idx, val)| *val = idx as f32);

        DefaultKernelNoPadding::dct2d_impl(
            &dct_matrix,
            &input_buffer_127,
            &mut GenericArray::default(),
            &mut output_ref,
        );

        #[allow(unused_unsafe)]
        unsafe {
            x86::Avx2F32Kernel::dct2d_impl(
                &dct_matrix,
                &input_buffer_128,
                &mut GenericArray::default(),
                &mut output,
            );
        }

        for i in 0..16 {
            for j in 0..16 {
                let diff = (output_ref[i][j] - output[i][j]).abs();
                let diff_ref = diff / output_ref[i][j];
                assert!(
                    diff_ref < 8e-7,
                    "difference: {:?} (relative: {:?})",
                    diff,
                    diff_ref
                );
            }
        }
    }

    #[cfg(all(target_arch = "x86_64", feature = "avx512", target_feature = "avx512f"))]
    #[test]
    fn test_dct64_impl_avx512() {
        let mut kernel = x86::Avx512F32Kernel;
        test_gradient_impl(&mut kernel);
        test_dct64_impl(&mut kernel, NumCast::from(5e-6).unwrap());
    }
}
1359}