yume_pdq/kernel/
mod.rs

1#![allow(
2    clippy::needless_range_loop,
3    reason = "keep the scalar code comparable to the vectorized code"
4)]
5/*
6 * Copyright (c) 2025 Yumechi <yume@yumechi.jp>
7 *
8 * Created on Saturday, March 22, 2025
9 * Author: Yumechi <yume@yumechi.jp>
10 *
11 * SPDX-License-Identifier: Apache-2.0
12 *
13 * Licensed under the Apache License, Version 2.0 (the "License");
14 * you may not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
16 *
17 * http://www.apache.org/licenses/LICENSE-2.0
18 *
19 * Unless required by applicable law or agreed to in writing, software
20 * distributed under the License is distributed on an "AS IS" BASIS,
21 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 * See the License for the specific language governing permissions and
23 * limitations under the License.
24 */
25
26use core::{
27    fmt::{Debug, Display},
28    ops::Add,
29};
30
31use generic_array::typenum::U3;
32#[allow(unused_imports)]
33use generic_array::{
34    ArrayLength, GenericArray,
35    sequence::Flatten,
36    typenum::{B1, U0, U1, U2, U4, U16, U127, U128, U256, U512, Unsigned},
37};
38
39#[cfg_attr(not(feature = "reference-rug"), allow(unused_imports))]
40use num_traits::{FromPrimitive, NumCast, Signed, ToPrimitive, float::FloatCore};
41
42pub use generic_array;
43use sealing::Sealed;
44use type_traits::{DivisibleBy8, EvaluateHardwareFeature, SquareOf, Term};
45
46use crate::alignment::DefaultPaddedArray;
47
48/// Constants for conversions.
49pub mod constants;
50
51/// Kernels based on x86-64 intrinsics.
52#[cfg(target_arch = "x86_64")]
53pub mod x86;
54
55/// A fallback router for kernels.
56pub mod router;
57
58#[cfg(feature = "reference-rug")]
59/// 128-bit floating point type.
60pub mod float128;
61
62#[cfg(feature = "portable-simd")]
63/// Portable SIMD implementation.
64pub mod portable_simd;
65
66// potentially incorrect, don't use it for now
67// pub mod dihedral;
68
69/// Threshold/quantization methods.
70pub mod threshold;
71
72/// Type traits.
73pub mod type_traits;
74
75include!(concat!(env!("OUT_DIR"), "/dct_matrix.rs"));
76include!(concat!(env!("OUT_DIR"), "/tent_filter_weights.rs"));
77include!(concat!(env!("OUT_DIR"), "/convolution_offset.rs"));
78
79#[cfg(feature = "portable-simd")]
80/// The worst-case fallback kernel.
81pub type FallbackKernel = portable_simd::PortableSimdF32Kernel<8>;
82
83#[cfg(all(not(feature = "portable-simd"), not(target_arch = "x86_64")))]
84/// The worst-case fallback kernel.
85pub type FallbackKernel = DefaultKernelNoPadding;
86
87#[cfg(all(not(feature = "portable-simd"), target_arch = "x86_64"))]
88/// The worst-case fallback kernel.
89pub type FallbackKernel = DefaultKernelPadXYTo128;
90
91/// Return an opaque kernel object that is likely what you want, based on your enabled feature flags.
92///
93/// We will generally make every effort to expose new kernels through this function, so you do not need to care about the underlying implementation.
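///
/// # Example
///
/// A minimal sketch; the kernel actually selected depends on compile-time features and runtime CPU detection:
///
/// ```ignore
/// use yume_pdq::kernel::{smart_kernel, Kernel};
///
/// let kernel = smart_kernel();
/// // Report which concrete kernel would be dispatched to on this machine.
/// println!("dispatching to: {}", kernel.ident());
/// ```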
94#[inline(always)]
95#[must_use]
96pub fn smart_kernel() -> impl Kernel<
97    RequiredHardwareFeature = impl EvaluateHardwareFeature<EnabledStatic = B1>,
98    InputDimension = U512,
99    OutputDimension = U16,
100    InternalFloat = f32,
101> + Clone {
102    smart_kernel_impl()
103}
104
105#[cfg(target_arch = "x86_64")]
106#[cfg(feature = "avx512")]
107#[cfg(any(feature = "prefer-x86-intrinsics", not(feature = "portable-simd")))]
108pub(crate) type SmartKernelConcreteType = router::KernelRouter<
109    x86::Avx512F32Kernel,
110    router::KernelRouter<x86::Avx2F32Kernel, FallbackKernel>,
111>;
112
113#[cfg(target_arch = "x86_64")]
114#[cfg(feature = "avx512")]
115#[cfg(all(not(feature = "prefer-x86-intrinsics"), feature = "portable-simd"))]
116pub(crate) type SmartKernelConcreteType =
117    router::KernelRouter<x86::Avx512F32Kernel, portable_simd::PortableSimdF32Kernel<8>>;
118
119#[cfg(target_arch = "x86_64")]
120#[cfg(feature = "avx512")]
121#[inline(always)]
122pub(crate) fn smart_kernel_impl() -> SmartKernelConcreteType {
123    #[allow(unused_imports)]
124    use router::KernelRouter;
125
126    #[cfg(any(feature = "prefer-x86-intrinsics", not(feature = "portable-simd")))]
127    {
128        KernelRouter::new(x86::Avx2F32Kernel, FallbackKernel::default())
129            .layer_on_top(x86::Avx512F32Kernel)
130    }
131
132    #[cfg(all(not(feature = "prefer-x86-intrinsics"), feature = "portable-simd"))]
133    {
134        KernelRouter::new(
135            x86::Avx512F32Kernel,
136            portable_simd::PortableSimdF32Kernel::<8>,
137        )
138    }
139}
140
141#[cfg(target_arch = "x86_64")]
142#[cfg(not(feature = "avx512"))]
143#[cfg(any(feature = "prefer-x86-intrinsics", not(feature = "portable-simd")))]
144pub(crate) type SmartKernelConcreteType = router::KernelRouter<x86::Avx2F32Kernel, FallbackKernel>;
145
146#[cfg(target_arch = "x86_64")]
147#[cfg(not(feature = "avx512"))]
148#[cfg(all(not(feature = "prefer-x86-intrinsics"), feature = "portable-simd"))]
149pub(crate) type SmartKernelConcreteType = portable_simd::PortableSimdF32Kernel<8>;
150
151#[cfg(target_arch = "x86_64")]
152#[cfg(not(feature = "avx512"))]
153#[inline(always)]
154pub(crate) fn smart_kernel_impl() -> SmartKernelConcreteType {
155    #[allow(unused_imports)]
156    use router::KernelRouter;
157
158    #[cfg(any(feature = "prefer-x86-intrinsics", not(feature = "portable-simd")))]
159    {
160        KernelRouter::new(x86::Avx2F32Kernel, FallbackKernel::default())
161    }
162
163    #[cfg(all(not(feature = "prefer-x86-intrinsics"), feature = "portable-simd"))]
164    {
165        portable_simd::PortableSimdF32Kernel::<8>::default()
166    }
167}
168
169#[cfg(not(target_arch = "x86_64"))]
170pub(crate) type SmartKernelConcreteType = FallbackKernel;
171
172#[cfg(not(target_arch = "x86_64"))]
173#[inline(always)]
174pub(crate) fn smart_kernel_impl() -> SmartKernelConcreteType {
175    FallbackKernel::default()
176}
177
178mod sealing {
179    /// Private sealing trait.
180    pub trait Sealed {}
181}
182
183/// Extension trait for square `GenericArray`s.
184///
185/// This trait is implemented for all non-empty square `GenericArray`s up to 1024x1024.
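///
/// A sketch of unflattening a flat 256-element array into a 16x16 matrix:
///
/// ```ignore
/// use generic_array::{GenericArray, typenum::{U16, U256}};
/// use yume_pdq::kernel::SquareGenericArrayExt;
///
/// let flat: GenericArray<f32, U256> = GenericArray::default();
/// let square: GenericArray<GenericArray<f32, U16>, U16> = flat.unflatten_square();
/// ```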
186pub trait SquareGenericArrayExt<I, L: SquareOf>: Sized + Sealed {
187    /// Unflatten a [`GenericArray`] into a square matrix.
188    fn unflatten_square(self) -> GenericArray<GenericArray<I, L>, L> {
189        unsafe { generic_array::const_transmute(self) }
190    }
191
192    /// Unflatten a [`GenericArray`] into a square matrix by reference.
193    fn unflatten_square_ref(&self) -> &GenericArray<GenericArray<I, L>, L> {
194        unsafe { generic_array::const_transmute(self) }
195    }
196
197    /// Unflatten a [`GenericArray`] into a square matrix by mutable reference.
198    fn unflatten_square_mut(&mut self) -> &mut GenericArray<GenericArray<I, L>, L> {
199        unsafe { generic_array::const_transmute(self) }
200    }
201}
202
203impl<I, L: ArrayLength> Sealed for GenericArray<I, L> {}
204
205impl<I, L: SquareOf> SquareGenericArrayExt<I, L> for GenericArray<I, <L as SquareOf>::Output> {}
206
207/// Marker trait to indicate that this kernel produces precise results (i.e. no lossy LUTs).
208pub trait PreciseKernel: Kernel {}
209
210// Copied verbatim from [darwinium-com/pdqhash](https://github.com/darwinium-com/pdqhash/blob/main/src/lib.rs).
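// Torben's method: repeatedly bisect the value range [min, max], count how many elements
// fall below/above the midpoint, and narrow the range until the median of the 256 values
// is pinned down. It never reorders the input, so no scratch buffer is required.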
211#[allow(clippy::all, clippy::pedantic)]
212pub(crate) fn torben_median<F: FloatCore>(m: &GenericArray<GenericArray<F, U16>, U16>) -> F {
213    let mut min = m.iter().flatten().cloned().reduce(F::min).unwrap();
214    let mut max = m.iter().flatten().cloned().reduce(F::max).unwrap();
215
216    let half = (16 * 16 + 1) / 2;
217    loop {
218        let guess = (min + max) / F::from(2).unwrap();
219        let mut less = 0;
220        let mut greater = 0;
221        let mut equal = 0;
222        let mut maxltguess = min;
223        let mut mingtguess = max;
224        for val in m.into_iter().flatten() {
225            if *val < guess {
226                less += 1;
227                if *val > maxltguess {
228                    maxltguess = *val;
229                }
230            } else if *val > guess {
231                greater += 1;
232                if *val < mingtguess {
233                    mingtguess = *val;
234                }
235            } else {
236                equal += 1;
237            }
238        }
239        if less <= half && greater <= half {
240            return if less >= half {
241                maxltguess
242            } else if less + equal >= half {
243                guess
244            } else {
245                mingtguess
246            };
247        } else if less > greater {
248            max = maxltguess;
249        } else {
250            min = mingtguess;
251        }
252    }
253}
254
255/// The divisor used to scale the raw quality metric (see [`Kernel::adjust_quality`]).
256/// cbindgen:ignore
257pub const QUALITY_ADJUST_DIVISOR: usize = 180;
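// For the scalar kernels this means, e.g., a raw gradient sum of 90 maps to a quality of
// 0.5, and any sum >= 180 saturates at 1.0 (see the `adjust_quality` implementations below).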
258
259// reference: https://raw.githubusercontent.com/facebook/ThreatExchange/main/hashing/hashing.pdf
260
261/// A compute kernel for heavy-duty transformations.
262///
263/// Kernels not marked with [`PreciseKernel`] use LUTs extensively and are thus less accurate, albeit still well within the official docs' tolerance for a "correct" implementation (10 bits different).
264///
265/// A typical matching threshold is a distance of <=31 bits out of 256, which leaves ample headroom for this error margin.
266///
267/// A scalar (auto-vectorized) implementation is provided in [`DefaultKernel`].
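///
/// # Example
///
/// A minimal sketch of the hashing pipeline (hypothetical variable names, setup omitted);
/// the crate's actual driver code may differ:
///
/// ```ignore
/// use generic_array::{GenericArray, typenum::U512};
/// use yume_pdq::kernel::{smart_kernel, Kernel};
///
/// let mut kernel = smart_kernel();
/// // `luma` is a 512x512 f32 luminance image produced elsewhere
/// // (e.g. via `cvt_rgb8_to_luma8f`, one row at a time).
/// let luma: Box<GenericArray<GenericArray<f32, U512>, U512>> = todo!();
///
/// let mut compressed = Box::default(); // tent-filtered 127x127 (possibly padded) buffer
/// kernel.jarosz_compress(&luma, &mut compressed);
///
/// let mut tmp_row = GenericArray::default();
/// let mut pdqf = GenericArray::default(); // 16x16 DCT-II output ("PDQF")
/// kernel.dct2d(&compressed, &mut tmp_row, &mut pdqf);
///
/// let mut threshold = 0.0f32;
/// let mut hash = GenericArray::default(); // 16 rows x 2 bytes = a 256-bit hash
/// kernel.quantize(&pdqf, &mut threshold, &mut hash);
/// // The quality metric is derived separately via `sum_of_gradients` and `adjust_quality`.
/// ```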
268pub trait Kernel {
269    /// The width of the first stage (compression) buffer.
270    type Buffer1WidthX: ArrayLength;
271    /// The length of the first stage (compression) buffer.
272    type Buffer1LengthY: ArrayLength;
273    /// The width and height of the input image.
274    type InputDimension: ArrayLength + SquareOf;
275    /// The width and height of the output hash.
276    type OutputDimension: ArrayLength + SquareOf;
277    /// The hardware features required to run this kernel.
278    type RequiredHardwareFeature: EvaluateHardwareFeature;
279
280    /// Identification token.
281    type Ident: Debug + Display + Clone + Copy + 'static + PartialEq;
282
283    /// The internal floating point type used for intermediate calculations.
284    type InternalFloat: num_traits::float::TotalOrder
285        + num_traits::FromPrimitive
286        + num_traits::NumAssign
287        + num_traits::bounds::Bounded
288        + num_traits::NumCast
289        + num_traits::identities::Zero
290        + num_traits::identities::One
291        + num_traits::Signed
292        + PartialOrd
293        + Clone
294        + Display
295        + Debug
296        + Default
297        + Send
298        + Sync;
299
300    /// Convert one row of input from RGB8 to LUMA8 floating point.
301    ///
302    /// A scalar implementation is provided by default.
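    ///
    /// The `R_COEFF`/`G_COEFF`/`B_COEFF` const parameters are the raw bit patterns of the
    /// `f32` channel weights (cf. the [`constants`] module). A sketch with hypothetical
    /// BT.601 weights, `rgb_row` being a hypothetical input row:
    ///
    /// ```ignore
    /// let mut kernel = smart_kernel();
    /// let mut luma = GenericArray::default();
    /// kernel.cvt_rgb8_to_luma8f::<
    ///     { 0.299f32.to_bits() },
    ///     { 0.587f32.to_bits() },
    ///     { 0.114f32.to_bits() },
    /// >(&rgb_row, &mut luma);
    /// ```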
303    fn cvt_rgb8_to_luma8f<const R_COEFF: u32, const G_COEFF: u32, const B_COEFF: u32>(
304        &mut self,
305        input: &GenericArray<GenericArray<u8, U3>, Self::InputDimension>,
306        output: &mut GenericArray<f32, Self::InputDimension>,
307    ) where
308        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
309    {
310        for (input_pixel, output_pixel) in input.iter().zip(output.iter_mut()) {
311            let r = input_pixel[0] as f32;
312            let g = input_pixel[1] as f32;
313            let b = input_pixel[2] as f32;
314            let luma = f32::from_ne_bytes(R_COEFF.to_ne_bytes()) * r
315                + f32::from_ne_bytes(G_COEFF.to_ne_bytes()) * g
316                + f32::from_ne_bytes(B_COEFF.to_ne_bytes()) * b;
317            *output_pixel = luma;
318        }
319    }
320
321    /// Convert one row of input from RGBA8 to LUMA8 floating point. The alpha channel is ignored.
322    ///
323    /// A scalar implementation is provided by default.
324    fn cvt_rgba8_to_luma8f<const R_COEFF: u32, const G_COEFF: u32, const B_COEFF: u32>(
325        &mut self,
326        input: &GenericArray<GenericArray<u8, U4>, Self::InputDimension>,
327        output: &mut GenericArray<f32, Self::InputDimension>,
328    ) where
329        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
330    {
331        for (input_pixel, output_pixel) in input.iter().zip(output.iter_mut()) {
332            let r = input_pixel[0] as f32;
333            let g = input_pixel[1] as f32;
334            let b = input_pixel[2] as f32;
335            let luma = f32::from_ne_bytes(R_COEFF.to_ne_bytes()) * r
336                + f32::from_ne_bytes(G_COEFF.to_ne_bytes()) * g
337                + f32::from_ne_bytes(B_COEFF.to_ne_bytes()) * b;
338            *output_pixel = luma;
339        }
340    }
341
342    /// Transpose a PDQF matrix in place. Output is equivalent to PDQF(t(image)).
343    ///
344    /// A scalar implementation is provided by default.
345    fn pdqf_t(
346        &mut self,
347        input: &mut GenericArray<
348            GenericArray<Self::InternalFloat, Self::OutputDimension>,
349            Self::OutputDimension,
350        >,
351    ) where
352        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
353    {
354        for i in 0..Self::OutputDimension::USIZE {
355            for j in 0..Self::OutputDimension::USIZE {
356                if i < j {
357                    (input[j][i], input[i][j]) = (input[i][j].clone(), input[j][i].clone());
358                }
359            }
360        }
361    }
362
363    /// Negate alternate columns of the PDQF matrix in place. Equivalent to PDQF(flop(image)).
364    ///
365    /// `NEGATE` means start from odd indices (even columns) instead, which is useful when composing consecutive flips.
366    ///
367    /// A scalar implementation is provided by default.
368    fn pdqf_negate_alt_cols<const NEGATE: bool>(
369        &mut self,
370        input: &mut GenericArray<
371            GenericArray<Self::InternalFloat, Self::OutputDimension>,
372            Self::OutputDimension,
373        >,
374    ) where
375        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
376    {
377        for i in 0..Self::OutputDimension::USIZE {
378            for j in ((if NEGATE { 1 } else { 0 })..Self::OutputDimension::USIZE).step_by(2) {
379                input[i][j] = -input[i][j].clone();
380            }
381        }
382    }
383
384    /// Negate alternate rows of the PDQF matrix in place. Equivalent to PDQF(flip(image)).
385    ///
386    /// `NEGATE` means start from odd indices (even rows) instead, which is useful when composing consecutive flips.
387    ///
388    /// A scalar implementation is provided by default.
389    ///
390    /// You usually should not have to override this as it is trivially vectorizable.
391    fn pdqf_negate_alt_rows<const NEGATE: bool>(
392        &mut self,
393        input: &mut GenericArray<
394            GenericArray<Self::InternalFloat, Self::OutputDimension>,
395            Self::OutputDimension,
396        >,
397    ) where
398        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
399    {
400        for i in ((if NEGATE { 1 } else { 0 })..Self::OutputDimension::USIZE).step_by(2) {
401            for j in 0..Self::OutputDimension::USIZE {
402                input[i][j] = -input[i][j].clone();
403            }
404        }
405    }
406
407    /// Negate the off-diagonals of the PDQF matrix in place. Equivalent to PDQF(rotate180(image)).
408    ///
409    /// If you also need the intermediate result,
410    /// this is usually less efficient than simply applying [`Self::pdqf_negate_alt_cols`] and [`Self::pdqf_negate_alt_rows`] in sequence.
411    ///
412    /// A scalar implementation is provided by default.
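    ///
    /// Equivalently, a sketch using the two single-axis negations on a 16x16 `pdqf` buffer
    /// (both with `NEGATE = false`):
    ///
    /// ```ignore
    /// kernel.pdqf_negate_alt_cols::<false>(&mut pdqf);
    /// kernel.pdqf_negate_alt_rows::<false>(&mut pdqf);
    /// // `pdqf` now matches what `pdqf_negate_off_diagonals` would have produced.
    /// ```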
413    fn pdqf_negate_off_diagonals(
414        &mut self,
415        input: &mut GenericArray<
416            GenericArray<Self::InternalFloat, Self::OutputDimension>,
417            Self::OutputDimension,
418        >,
419    ) {
420        for i in 0..Self::OutputDimension::USIZE {
421            for j in 0..Self::OutputDimension::USIZE {
422                if j.wrapping_sub(i) % 2 == 1 {
423                    input[i][j] = -input[i][j].clone();
424                }
425            }
426        }
427    }
428
429    /// Return an identification token for the kernel. Composite kernels that route between multiple kernels report the kernel that would actually be executed.
430    fn ident(&self) -> Self::Ident;
431
432    /// Shorthand for `<<K as Kernel>::RequiredHardwareFeature as EvaluateHardwareFeature>::met_runtime()`.
433    ///
434    /// Downstream applications should usually be generic over this trait and perform a final runtime check to dispatch application logic to a suitable kernel.
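    ///
    /// A sketch of that dispatch pattern (the hypothetical `process` stands in for your application logic):
    ///
    /// ```ignore
    /// fn process<K: Kernel>(kernel: &mut K) { /* ... */ }
    ///
    /// if x86::Avx2F32Kernel::required_hardware_features_met() {
    ///     process(&mut x86::Avx2F32Kernel);
    /// } else {
    ///     process(&mut DefaultKernelNoPadding::default());
    /// }
    /// ```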
435    #[must_use]
436    fn required_hardware_features_met() -> bool {
437        Self::RequiredHardwareFeature::met_runtime()
438    }
439
440    /// Apply a tent-filter average to every 8x8 sub-block of the input buffer and write the result of each sub-block to the output buffer.
441    fn jarosz_compress(
442        &mut self,
443        _buffer: &GenericArray<GenericArray<f32, Self::InputDimension>, Self::InputDimension>,
444        _output: &mut GenericArray<
445            GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
446            Self::Buffer1LengthY,
447        >,
448    ) where
449        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>;
450
451    /// Convert the input to a binary hash by thresholding at the median.
452    fn quantize(
453        &mut self,
454        _input: &GenericArray<
455            GenericArray<Self::InternalFloat, Self::OutputDimension>,
456            Self::OutputDimension,
457        >,
458        _threshold: &mut Self::InternalFloat,
459        _output: &mut GenericArray<
460            GenericArray<u8, <Self::OutputDimension as DivisibleBy8>::Output>,
461            Self::OutputDimension,
462        >,
463    ) where
464        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
465        <Self as Kernel>::OutputDimension: DivisibleBy8;
466
467    /// Compute the sum of gradients of the input buffer in both horizontal and vertical directions.
468    #[must_use]
469    fn sum_of_gradients(
470        &mut self,
471        _input: &GenericArray<
472            GenericArray<Self::InternalFloat, Self::OutputDimension>,
473            Self::OutputDimension,
474        >,
475    ) -> Self::InternalFloat
476    where
477        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>;
478
479    /// Adjust the quality metric to be between 0 and 1.
480    #[must_use]
481    fn adjust_quality(_input: Self::InternalFloat) -> f32;
482
483    /// Apply a 2D DCT-II transformation to the input buffer and write the result to the output buffer.
484    fn dct2d(
485        &mut self,
486        _buffer: &GenericArray<
487            GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
488            Self::Buffer1LengthY,
489        >,
490        _tmp_row_buffer: &mut GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
491        _output: &mut GenericArray<
492            GenericArray<Self::InternalFloat, Self::OutputDimension>,
493            Self::OutputDimension,
494        >,
495    ) where
496        Self::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>;
497}
498
499/// A pure-Rust implementation of the `Kernel` trait.
500#[derive(Clone, Copy)]
501pub struct DefaultKernel<
502    PadIntermediateX: ArrayLength + Add<U127> = U0,
503    PadIntermediateY: ArrayLength + Add<U127> = U0,
504> where
505    <PadIntermediateX as Add<U127>>::Output: ArrayLength,
506    <PadIntermediateY as Add<U127>>::Output: ArrayLength,
507{
508    _marker: core::marker::PhantomData<(PadIntermediateX, PadIntermediateY)>,
509}
510
511/// A default kernel with no padding on the intermediate 127x127 matrix.
512pub type DefaultKernelNoPadding = DefaultKernel<U0, U0>;
513/// A default kernel that pads the X axis of the intermediate 127x127 matrix to 128.
514/// This does not make it faster; it is only useful for sharing buffers with vectorized kernels.
515pub type DefaultKernelPadXTo128 = DefaultKernel<U1, U0>;
516/// A default kernel that pads the Y axis of the intermediate 127x127 matrix to 128.
517/// This does not make it faster; it is only useful for sharing buffers with vectorized kernels.
518pub type DefaultKernelPadYTo128 = DefaultKernel<U0, U1>;
519/// A default kernel that pads both the X and Y axes of the intermediate 127x127 matrix to 128.
520/// This does not make it faster; it is only useful for sharing buffers with vectorized kernels.
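///
/// For example, its intermediate buffer rows are 128 elements wide (a compile-time fact, sketched here):
///
/// ```ignore
/// use generic_array::typenum::Unsigned;
/// assert_eq!(<DefaultKernelPadXYTo128 as Kernel>::Buffer1WidthX::USIZE, 128);
/// ```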
521pub type DefaultKernelPadXYTo128 = DefaultKernel<U1, U1>;
522
523impl<PadIntermediateX: ArrayLength + Add<U127>, PadIntermediateY: ArrayLength + Add<U127>> Default
524    for DefaultKernel<PadIntermediateX, PadIntermediateY>
525where
526    <PadIntermediateX as Add<U127>>::Output: ArrayLength,
527    <PadIntermediateY as Add<U127>>::Output: ArrayLength,
528{
529    fn default() -> Self {
530        Self {
531            _marker: core::marker::PhantomData,
532        }
533    }
534}
535
536impl<PadIntermediateX: ArrayLength + Add<U127>, PadIntermediateY: ArrayLength + Add<U127>>
537    DefaultKernel<PadIntermediateX, PadIntermediateY>
538where
539    <PadIntermediateX as Add<U127>>::Output: ArrayLength,
540    <PadIntermediateY as Add<U127>>::Output: ArrayLength,
541{
542    #[inline(always)]
543    pub(crate) fn dct2d_impl<P: ArrayLength>(
544        dct_matrix_rmajor: &DefaultPaddedArray<f32, DctMatrixNumElements, P>,
545        buffer: &GenericArray<
546            GenericArray<f32, <PadIntermediateX as Add<U127>>::Output>,
547            <PadIntermediateY as Add<U127>>::Output,
548        >,
549        tmp_row_buffer: &mut GenericArray<f32, <PadIntermediateX as Add<U127>>::Output>,
550        output: &mut GenericArray<GenericArray<f32, U16>, U16>,
551    ) {
552        // crate::testing::dump_image("dct2d_input_scalar.ppm", buffer);
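        // Accumulate each dot product into four interleaved partial sums so the loop
        // auto-vectorizes well and the floating-point dependency chain stays short;
        // the partial sums are combined after each inner loop.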
553        for k in 0..16 {
554            for j in 0..127 {
555                let mut sumks = [0.0; 4];
556                for (k2, sumk) in (0..127).zip((0..4).cycle()) {
557                    sumks[sumk] +=
558                        dct_matrix_rmajor[k * DctMatrixNumCols::USIZE + k2] * buffer[k2][j];
559                }
560
561                tmp_row_buffer[j] = sumks[0] + sumks[1] + sumks[2] + sumks[3];
562            }
563
564            for j in 0..(DctMatrixNumRows::USIZE) {
565                let mut sumks = [0.0; 4];
566                for (m, sumk) in (0..DctMatrixNumCols::USIZE).zip((0..4).cycle()) {
567                    sumks[sumk] +=
568                        tmp_row_buffer[m] * dct_matrix_rmajor[j * DctMatrixNumCols::USIZE + m];
569                }
570                output[k][j] = sumks[0] + sumks[1] + sumks[2] + sumks[3];
571            }
572        }
573        // crate::testing::dump_image("dct2d_output_scalar.ppm", output);
574    }
575}
576
577impl<PadIntermediateX: ArrayLength + Add<U127>, PadIntermediateY: ArrayLength + Add<U127>> Kernel
578    for DefaultKernel<PadIntermediateX, PadIntermediateY>
579where
580    <PadIntermediateX as Add<U127>>::Output: ArrayLength,
581    <PadIntermediateY as Add<U127>>::Output: ArrayLength,
582{
583    type Buffer1WidthX = <PadIntermediateX as Add<U127>>::Output;
584    type Buffer1LengthY = <PadIntermediateY as Add<U127>>::Output;
585    type InternalFloat = f32;
586    type InputDimension = U512;
587    type OutputDimension = U16;
588    type RequiredHardwareFeature = Term;
589    type Ident = &'static str;
590
591    fn ident(&self) -> &'static str {
592        "default_scalar_autovectorized_f32"
593    }
594
595    fn quantize(
596        &mut self,
597        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
598        threshold: &mut Self::InternalFloat,
599        output: &mut GenericArray<GenericArray<u8, U2>, U16>,
600    ) {
601        let median = torben_median(input);
602        *threshold = median;
603        let output = output.flatten();
604        output.fill(0);
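        // Pack the 256 comparison bits into 32 bytes: element `i` (row-major) lands in
        // byte `31 - i / 8`, bit `i % 8`, matching the reference implementation's layout.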
605        for (i, j) in input.iter().flatten().enumerate() {
606            output[32 - 1 - i / 8] += if *j > median { 1 << (i % 8) } else { 0 };
607        }
608    }
609
610    #[allow(clippy::cast_precision_loss)]
611    fn adjust_quality(input: Self::InternalFloat) -> f32 {
612        let scaled = input.to_f32().unwrap() / (QUALITY_ADJUST_DIVISOR as f32);
613
614        scaled.min(1.0)
615    }
616
617    fn sum_of_gradients(
618        &mut self,
619        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
620    ) -> Self::InternalFloat {
621        let mut gradient_sum = Default::default();
622
623        for i in 0..(16 - 1) {
624            for j in 0..16 {
625                let u = input[i][j];
626                let v = input[i + 1][j];
627                let d = u - v;
628                gradient_sum += d.abs();
629            }
630        }
631
632        for i in 0..16 {
633            for j in 0..(16 - 1) {
634                let u = input[i][j];
635                let v = input[i][j + 1];
636                let d = u - v;
637                gradient_sum += d.abs();
638            }
639        }
640
641        gradient_sum
642    }
643
644    fn jarosz_compress(
645        &mut self,
646        buffer: &GenericArray<GenericArray<f32, U512>, U512>,
647        output: &mut GenericArray<
648            GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
649            Self::Buffer1LengthY,
650        >,
651    ) {
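        // For each of the 127x127 output cells, look up the starting row/column of its
        // window in the 512x512 input (offsets are precomputed at build time) and apply
        // the precomputed tent-filter weights over that window.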
652        for outi in 0..127 {
653            let in_i = CONVOLUTION_OFFSET_512_TO_127[outi] - TENT_FILTER_COLUMN_OFFSET;
654            for outj in 0..127 {
655                let in_j = CONVOLUTION_OFFSET_512_TO_127[outj] - TENT_FILTER_COLUMN_OFFSET;
656                let mut sum = 0.0;
657                for di in 0..TENT_FILTER_EFFECTIVE_ROWS {
658                    for dj in 0..TENT_FILTER_EFFECTIVE_COLS {
659                        sum += TENT_FILTER_WEIGHTS[di * TENT_FILTER_EFFECTIVE_COLS + dj]
660                            * buffer[in_i + di][in_j + dj];
661                    }
662                }
663                output[outi][outj] = sum;
664            }
665        }
666    }
667
668    fn dct2d(
669        &mut self,
670        buffer: &GenericArray<GenericArray<f32, Self::Buffer1WidthX>, Self::Buffer1LengthY>,
671        tmp_row_buffer: &mut GenericArray<f32, Self::Buffer1WidthX>,
672        output: &mut GenericArray<GenericArray<f32, U16>, U16>,
673    ) {
674        Self::dct2d_impl(&DCT_MATRIX_RMAJOR, buffer, tmp_row_buffer, output);
675    }
676}
677
678/// A reference implementation of the `Kernel` trait, copied verbatim from the officially endorsed implementation by [darwinium-com](https://github.com/darwinium-com/pdqhash).
679#[cfg(any(feature = "std", test))]
680#[derive(Default)]
681pub struct ReferenceKernel<N: num_traits::FromPrimitive = f32> {
682    _marker: core::marker::PhantomData<N>,
683}
684
685#[cfg(any(feature = "std", test))]
686impl PreciseKernel for ReferenceKernel<f32> {}
687
688#[cfg(any(feature = "std", test))]
689impl PreciseKernel for ReferenceKernel<f64> {}
690#[cfg(any(feature = "std", test))]
691impl Kernel for ReferenceKernel<f32> {
692    type Buffer1WidthX = U127;
693    type Buffer1LengthY = U127;
694    type InternalFloat = f32;
695    type InputDimension = U512;
696    type OutputDimension = U16;
697    type RequiredHardwareFeature = Term;
698    type Ident = &'static str;
699
700    fn ident(&self) -> Self::Ident {
701        "reference_scalar_autovectorized"
702    }
703
704    #[allow(clippy::cast_precision_loss)]
705    fn adjust_quality(input: Self::InternalFloat) -> f32 {
706        let scaled = input / (QUALITY_ADJUST_DIVISOR as f32);
707
708        scaled.min(1.0)
709    }
710
711    fn dct2d(
712        &mut self,
713        buffer: &GenericArray<GenericArray<f32, Self::Buffer1WidthX>, Self::Buffer1LengthY>,
714        tmp_row_buffer: &mut GenericArray<f32, Self::Buffer1WidthX>,
715        output: &mut GenericArray<GenericArray<f32, U16>, U16>,
716    ) {
717        for i in 0..16 {
718            for j in 0..127 {
719                let mut sumk = 0.0;
720                for k in 0..127 {
721                    sumk += DCT_MATRIX_RMAJOR[i * DctMatrixNumCols::USIZE + k] * buffer[k][j];
722                }
723
724                tmp_row_buffer[j] = sumk;
725            }
726
727            for j in 0..16 {
728                let mut sumk = 0.0;
729                for k in 0..127 {
730                    sumk += tmp_row_buffer[k] * DCT_MATRIX_RMAJOR[j * DctMatrixNumCols::USIZE + k];
731                }
732                output[i][j] = sumk;
733            }
734        }
735    }
736
737    fn sum_of_gradients(
738        &mut self,
739        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
740    ) -> Self::InternalFloat {
741        let mut gradient_sum = Default::default();
742
743        for i in 0..(16 - 1) {
744            for j in 0..16 {
745                let u = input[i][j];
746                let v = input[i + 1][j];
747                let d = u - v;
748                gradient_sum += d.abs();
749            }
750        }
751
752        for i in 0..16 {
753            for j in 0..(16 - 1) {
754                let u = input[i][j];
755                let v = input[i][j + 1];
756                let d = u - v;
757                gradient_sum += d.abs();
758            }
759        }
760
761        gradient_sum
762    }
763
764    fn quantize(
765        &mut self,
766        input: &GenericArray<GenericArray<f32, U16>, U16>,
767        threshold: &mut Self::InternalFloat,
768        output: &mut GenericArray<GenericArray<u8, U2>, U16>,
769    ) {
770        let median = torben_median(input);
771        *threshold = median;
772
773        let output = output.flatten();
774
775        for i in 0..32 {
776            let mut byte = 0;
777            for j in 0..8 {
778                let offset = i * 8 + j;
779                let offset_i = offset / 16;
780                let offset_j = offset % 16;
781                let val = input[offset_i][offset_j];
782                if val > median {
783                    byte |= 1 << j;
784                }
785            }
786            output[32 - i - 1] = byte;
787        }
788    }
789
790    fn jarosz_compress(
791        &mut self,
792        buffer: &GenericArray<GenericArray<f32, U512>, U512>,
793        output: &mut GenericArray<GenericArray<f32, Self::Buffer1WidthX>, Self::Buffer1LengthY>,
794    ) {
795        #[allow(missing_docs, dead_code)]
796        mod reference {
797            include!("ref.rs");
798        }
799
800        let window_size = reference::compute_jarosz_filter_window_size(512, 127);
801        let mut buffer = buffer.flatten().to_vec();
802        reference::jarosz_filter_float(
803            buffer.as_mut_slice().try_into().unwrap(),
804            512,
805            512,
806            window_size,
807            window_size,
808            2,
809        );
810
811        for outi in 0..127 {
812            let ini = ((outi * 2 + 1) * 512) / (127 * 2);
813
814            for outj in 0..127 {
815                let inj = ((outj * 2 + 1) * 512) / (127 * 2);
816
817                output[outi][outj] = buffer[ini * 512 + inj];
818            }
819        }
820    }
821}
822
823#[cfg(any(feature = "std", test))]
824impl Kernel for ReferenceKernel<f64> {
825    type Buffer1WidthX = U127;
826    type Buffer1LengthY = U127;
827    type InternalFloat = f64;
828    type InputDimension = U512;
829    type OutputDimension = U16;
830    type RequiredHardwareFeature = Term;
831    type Ident = &'static str;
832
833    fn ident(&self) -> Self::Ident {
834        "reference_scalar_autovectorized_f64"
835    }
836
837    #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
838    fn adjust_quality(input: Self::InternalFloat) -> f32 {
839        let scaled = input / (QUALITY_ADJUST_DIVISOR as f64);
840
841        scaled.min(1.0) as f32
842    }
843
844    fn dct2d(
845        &mut self,
846        buffer: &GenericArray<GenericArray<f64, Self::Buffer1WidthX>, Self::Buffer1LengthY>,
847        tmp_row_buffer: &mut GenericArray<f64, Self::Buffer1WidthX>,
848        output: &mut GenericArray<GenericArray<f64, U16>, U16>,
849    ) {
850        for i in 0..16 {
851            for j in 0..127 {
852                let mut sumk = 0.0;
853                for k in 0..127 {
854                    sumk += DCT_MATRIX_RMAJOR_64[i * DctMatrixNumCols::USIZE + k] * buffer[k][j];
855                }
856
857                tmp_row_buffer[j] = sumk;
858            }
859
860            for j in 0..16 {
861                let mut sumk = 0.0;
862                for k in 0..127 {
863                    sumk +=
864                        tmp_row_buffer[k] * DCT_MATRIX_RMAJOR_64[j * DctMatrixNumCols::USIZE + k];
865                }
866                output[i][j] = sumk;
867            }
868        }
869    }
870
871    fn sum_of_gradients(
872        &mut self,
873        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
874    ) -> Self::InternalFloat {
875        let mut gradient_sum = Default::default();
876
877        for i in 0..(16 - 1) {
878            for j in 0..16 {
879                let u = input[i][j];
880                let v = input[i + 1][j];
881                let d = u - v;
882                gradient_sum += d.abs();
883            }
884        }
885
886        for i in 0..16 {
887            for j in 0..(16 - 1) {
888                let u = input[i][j];
889                let v = input[i][j + 1];
890                let d = u - v;
891                gradient_sum += d.abs();
892            }
893        }
894
895        gradient_sum
896    }
897
898    fn quantize(
899        &mut self,
900        input: &GenericArray<GenericArray<f64, U16>, U16>,
901        threshold: &mut Self::InternalFloat,
902        output: &mut GenericArray<GenericArray<u8, U2>, U16>,
903    ) {
904        let median = torben_median(input);
905        *threshold = median;
906
907        let output = output.flatten();
908
909        for i in 0..32 {
910            let mut byte = 0;
911            for j in 0..8 {
912                let offset = i * 8 + j;
913                let offset_i = offset / 16;
914                let offset_j = offset % 16;
915                let val = input[offset_i][offset_j];
916                if val > median {
917                    byte |= 1 << j;
918                }
919            }
920            output[32 - i - 1] = byte;
921        }
922    }
923
924    fn jarosz_compress(
925        &mut self,
926        buffer: &GenericArray<GenericArray<f32, U512>, U512>,
927        output: &mut GenericArray<GenericArray<f64, Self::Buffer1WidthX>, Self::Buffer1LengthY>,
928    ) {
929        #[allow(missing_docs, dead_code)]
930        mod reference {
931            include!("ref.rs");
932        }
933
934        let window_size = reference::compute_jarosz_filter_window_size(512, 127);
935        let mut buffer = buffer
936            .flatten()
937            .into_iter()
938            .map(|s| From::from(*s))
939            .collect::<Vec<f64>>();
940        reference::jarosz_filter_float(
941            buffer.as_mut_slice().try_into().unwrap(),
942            512,
943            512,
944            window_size,
945            window_size,
946            2,
947        );
948
949        for outi in 0..127 {
950            let ini = ((outi * 2 + 1) * 512) / (127 * 2);
951
952            for outj in 0..127 {
953                let inj = ((outj * 2 + 1) * 512) / (127 * 2);
954
955                output[outi][outj] = buffer[ini * 512 + inj];
956            }
957        }
958    }
959}
960
961#[cfg(feature = "reference-rug")]
962impl<const C: u32> Kernel for ReferenceKernel<float128::ArbFloat<C>> {
963    type Buffer1WidthX = U127;
964    type Buffer1LengthY = U127;
965    type InternalFloat = float128::ArbFloat<C>;
966    type InputDimension = U512;
967    type OutputDimension = U16;
968    type RequiredHardwareFeature = Term;
969    type Ident = &'static str;
970
971    fn ident(&self) -> &'static str {
972        "reference_scalar_autovectorized_arb_float"
973    }
974
975    fn adjust_quality(input: Self::InternalFloat) -> f32 {
976        let scaled = input / (float128::ArbFloat::from_usize(QUALITY_ADJUST_DIVISOR).unwrap());
977
978        let one = float128::ArbFloat::from_f32(1.0).unwrap();
979        if scaled > one { 1.0 } else { scaled.to_f32() }
980    }
981
982    fn sum_of_gradients(
983        &mut self,
984        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
985    ) -> Self::InternalFloat {
986        let mut gradient_sum = float128::ArbFloat::default();
987
988        for i in 0..(16 - 1) {
989            for j in 0..16 {
990                let u = input[i][j].clone();
991                let v = input[i + 1][j].clone();
992                let d = u - v;
993                gradient_sum += d.abs();
994            }
995        }
996
997        for i in 0..16 {
998            for j in 0..(16 - 1) {
999                let u = input[i][j].clone();
1000                let v = input[i][j + 1].clone();
1001                let d = u - v;
1002                gradient_sum += d.abs();
1003            }
1004        }
1005
1006        gradient_sum
1007    }
1008
1009    fn dct2d(
1010        &mut self,
1011        buffer: &GenericArray<
1012            GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
1013            Self::Buffer1LengthY,
1014        >,
1015        _tmp_row_buffer: &mut GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
1016        output: &mut GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
1017    ) {
1018        let d_value = |i: i32, j: i32, n: i32| {
1019            let n1: Self::InternalFloat = NumCast::from(n).unwrap();
1020            let i1: Self::InternalFloat = NumCast::from(i).unwrap();
1021            let j1: Self::InternalFloat = NumCast::from(j).unwrap();
1022            let one: Self::InternalFloat = NumCast::from(1).unwrap();
1023            let two: Self::InternalFloat = one.clone() + one.clone();
1024            let pi: Self::InternalFloat = float128::ArbFloat::<C>::pi();
1025
1026            (two.clone() / n1.clone()).sqrt()
1027                * (pi / (two.clone() * n1) * i1 * (two * j1 + one)).cos()
1028        };
1029
1030        let mut d_table = Vec::with_capacity(127 * 16);
1031        for i in 1..=16 {
1032            for j in 0..127 {
1033                d_table.push(d_value(i, j, 127));
1034            }
1035        }
1036
1037        let mut intermediate_matrix = Vec::<Self::InternalFloat>::with_capacity(127);
1038        intermediate_matrix.resize(127, Self::InternalFloat::default());
1039
1040        for i in 0..16 {
1041            for j in 0..127 {
1042                let mut sumk = Self::InternalFloat::default();
1043                for k in 0..127 {
1044                    let d = d_table[i * 127 + k].clone();
1045                    sumk += d * buffer[k][j].clone();
1046                }
1047
1048                intermediate_matrix[j] = sumk;
1049            }
1050
1051            for j in 0..16 {
1052                let mut sumk = Self::InternalFloat::default();
1053                for k in 0..127 {
1054                    let d = d_table[j * 127 + k].clone();
1055                    sumk += intermediate_matrix[k].clone() * d;
1056                }
1057                output[i][j] = sumk;
1058            }
1059        }
1060    }
1061
1062    fn quantize(
1063        &mut self,
1064        input: &GenericArray<GenericArray<Self::InternalFloat, U16>, U16>,
1065        threshold: &mut Self::InternalFloat,
1066        output: &mut GenericArray<GenericArray<u8, U2>, U16>,
1067    ) {
1068        let half = Self::InternalFloat::from_f64(0.5).unwrap();
1069        let mut flattened = GenericArray::<Self::InternalFloat, U256>::default();
1070        for i in 0..16 {
1071            for j in 0..16 {
1072                flattened[i * 16 + j] = input[i][j].clone();
1073            }
1074        }
1075        flattened.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
1076        let median = (flattened[127].clone() + flattened[128].clone()) * half;
1077        *threshold = median.clone();
1078
1079        let output = output.flatten();
1080        for i in 0..32 {
1081            let mut byte = 0;
1082            for j in 0..8 {
1083                let offset = i * 8 + j;
1084                let offset_i = offset / 16;
1085                let offset_j = offset % 16;
1086                let val = input[offset_i][offset_j].clone();
1087                if val > median {
1088                    byte |= 1 << j;
1089                }
1090            }
1091            output[32 - i - 1] = byte;
1092        }
1093    }
1094
1095    fn jarosz_compress(
1096        &mut self,
1097        buffer: &GenericArray<GenericArray<f32, U512>, U512>,
1098        output: &mut GenericArray<
1099            GenericArray<Self::InternalFloat, Self::Buffer1WidthX>,
1100            Self::Buffer1LengthY,
1101        >,
1102    ) {
1103        #[allow(missing_docs, dead_code)]
1104        mod reference {
1105            include!("ref.rs");
1106        }
1107
1108        let window_size = reference::compute_jarosz_filter_window_size(512, 127);
1109        let mut buffer = buffer
1110            .flatten()
1111            .iter()
1112            .map(|s| float128::ArbFloat::<C>::from_f32(*s).unwrap())
1113            .collect::<Vec<_>>();
1114        reference::jarosz_filter_float(
1115            buffer.as_mut_slice().try_into().unwrap(),
1116            512,
1117            512,
1118            window_size,
1119            window_size,
1120            2,
1121        );
1122
1123        // target centers not corners:
1124
1125        for outi in 0..127 {
1126            let ini = ((outi * 2 + 1) * 512) / (127 * 2);
1127
1128            for outj in 0..127 {
1129                let inj = ((outj * 2 + 1) * 512) / (127 * 2);
1130
1131                output[outi][outj] = buffer[ini * 512 + inj].clone();
1132            }
1133        }
1134    }
1135}
1136
1137#[cfg(feature = "reference-rug")]
1138impl<const C: u32> PreciseKernel for ReferenceKernel<float128::ArbFloat<C>> {}
1139
1140#[cfg(test)]
1141mod tests {
1142    use core::ops::Mul;
1143
1144    use num_traits::NumCast;
1145    use rand::Rng;
1146
1147    use super::*;
1148
1149    extern crate alloc;
1150
1151    #[allow(dead_code)]
1152    fn test_gradient_impl<OD: ArrayLength, K: Kernel<OutputDimension = OD>>(kernel: &mut K)
1153    where
1154        ReferenceKernel<<K as Kernel>::InternalFloat>:
1155            Kernel<InternalFloat = <K as Kernel>::InternalFloat, OutputDimension = OD>,
1156        OD: Mul<OD>,
1157        <OD as Mul<OD>>::Output: ArrayLength,
1158        K::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
1159        ReferenceKernel<K::InternalFloat>:
1160            Kernel<InternalFloat = K::InternalFloat, OutputDimension = OD>,
1161        <ReferenceKernel<K::InternalFloat> as Kernel>::RequiredHardwareFeature:
1162            EvaluateHardwareFeature<EnabledStatic = B1>,
1163    {
1164        let mut rng = rand::rng();
1165        let mut input: GenericArray<GenericArray<<K as Kernel>::InternalFloat, OD>, OD> =
1166            GenericArray::default();
1167        input.iter_mut().for_each(|row| {
1168            row.iter_mut()
1169                .for_each(|val| *val = NumCast::from(rng.random_range(0.0..1.0)).unwrap());
1170        });
1171
1172        let gradient_ref = ReferenceKernel::<K::InternalFloat>::default().sum_of_gradients(&input);
1173        let gradient = kernel.sum_of_gradients(&input);
1174
1175        let diff = (gradient.clone() - NumCast::from(gradient_ref.clone()).unwrap()).abs();
1176        let diff_ref = diff / NumCast::from(gradient_ref.clone()).unwrap();
1177
1178        assert!(
1179            diff_ref < NumCast::from(0.05).unwrap(),
1180            "gradient: {}, gradient_ref: {}, diff: {} %",
1181            gradient,
1182            gradient_ref,
1183            diff_ref * NumCast::from(100.0).unwrap(),
1184        );
1185    }
1186
1187    fn test_dct64_impl<K: Kernel>(kernel: &mut K, eps: K::InternalFloat)
1188    where
1189        <K as Kernel>::RequiredHardwareFeature: EvaluateHardwareFeature<EnabledStatic = B1>,
1190        ReferenceKernel<K::InternalFloat>: Kernel<InternalFloat = K::InternalFloat>,
1191        <ReferenceKernel<K::InternalFloat> as Kernel>::RequiredHardwareFeature:
1192            EvaluateHardwareFeature<EnabledStatic = B1>,
1193    {
1194        let mut rng = rand::rng();
1195        let mut input_ref: GenericArray<GenericArray<<K as Kernel>::InternalFloat, _>, _> =
1196            GenericArray::default();
1197        input_ref.iter_mut().enumerate().for_each(|(i, row)| {
1198            row.iter_mut().enumerate().for_each(|(j, val)| {
1199                *val = if (i + 2 * j) % 20 < 10 {
1200                    NumCast::from(rng.random_range(0.0..1.0)).unwrap()
1201                } else {
1202                    NumCast::from(rng.random_range(-1.0..0.0)).unwrap()
1203                };
1204            });
1205        });
1206        let mut input: GenericArray<
1207            GenericArray<K::InternalFloat, K::Buffer1WidthX>,
1208            K::Buffer1LengthY,
1209        > = GenericArray::default();
1210        for i in 0..127 {
1211            for j in 0..127 {
1212                input[i][j] = input_ref[i][j].clone();
1213            }
1214        }
1215
1216        let mut output = GenericArray::default();
1217        let mut output_ref = GenericArray::default();
1218        ReferenceKernel::<K::InternalFloat>::default().dct2d(
1219            &input_ref,
1220            &mut GenericArray::default(),
1221            &mut output_ref,
1222        );
1223
1224        kernel.dct2d(&input, &mut GenericArray::default(), &mut output);
1225
1226        output
1227            .iter()
1228            .flatten()
1229            .zip(output_ref.iter().flatten())
1230            .for_each(|(a, b)| {
1231                let diff = (a.clone() - NumCast::from(b.clone()).unwrap()).abs();
1232                assert!(diff <= eps, "difference: {:?} (tolerance: {:?})", diff, eps);
1233            });
1234    }
1235
1236    #[test]
1237    fn test_dct64_impl_base() {
1238        let mut kernel = DefaultKernelNoPadding::default();
1239        test_dct64_impl(&mut kernel, NumCast::from(f32::EPSILON * 32.00).unwrap());
1240    }
1241
1242    #[test]
1243    fn test_dct64_impl_smart_kernel() {
1244        let mut kernel = smart_kernel();
1245        test_dct64_impl(&mut kernel, NumCast::from(f32::EPSILON * 32.00).unwrap());
1246    }
1247
1248    #[test]
1249    #[cfg(target_arch = "x86_64")]
1250    fn test_dct64_impl_avx2_equivalence() {
1251        // do an empirical equivalence check between the default and AVX2 implementations
1252
1253        use generic_array::typenum::U128;
1254
1255        let mut dct_matrix =
1256            DefaultPaddedArray::<_, _, U16>::new(*GenericArray::from_slice(&[1.0; 16 * 127]));
1257
1258        let mut input_buffer_127: Box<GenericArray<GenericArray<f32, U127>, U127>> = Box::default();
1259        let mut input_buffer_128: Box<GenericArray<GenericArray<f32, U128>, U128>> = Box::default();
1260
1261        // part 1: hold the DCT matrix constant as 1 and fill the input buffer sequentially
1262        input_buffer_127
1263            .iter_mut()
1264            .enumerate()
1265            .for_each(|(i, row)| {
1266                row.iter_mut().enumerate().for_each(|(j, val)| {
1267                    input_buffer_128[i][j] = (i * 127 + j) as f32 * 0.00001;
1268                    *val = (i * 127 + j) as f32 * 0.00001;
1269                });
1270            });
1271        let mut output = GenericArray::default();
1272        let mut output_ref = GenericArray::default();
1273        DefaultKernelNoPadding::dct2d_impl(
1274            &dct_matrix,
1275            &input_buffer_127,
1276            &mut GenericArray::default(),
1277            &mut output_ref,
1278        );
1279
1280        #[allow(unused_unsafe)]
1281        unsafe {
1282            x86::Avx2F32Kernel::dct2d_impl(
1283                &dct_matrix,
1284                &input_buffer_128,
1285                &mut GenericArray::default(),
1286                &mut output,
1287            );
1288        }
1289
1290        for i in 0..16 {
1291            for j in 0..16 {
1292                let diff = (output_ref[i][j] - output[i][j]).abs();
1293                let diff_ref = diff / output_ref[i][j];
1294                // accept up to 0.00004% rounding error (4e-7), almost f32::EPSILON
1295                assert!(
1296                    diff_ref < 4e-7,
1297                    "difference: {:?} (relative: {:?})",
1298                    diff,
1299                    diff_ref
1300                );
1301            }
1302        }
1303
1304        // part 2: hold the input buffer constant and fill the DCT matrix sequentially
1305        input_buffer_127
1306            .iter_mut()
1307            .enumerate()
1308            .for_each(|(i, row)| {
1309                row.iter_mut().enumerate().for_each(|(j, val)| {
1310                    input_buffer_128[i][j] = 0.00001;
1311                    *val = 0.00001;
1312                });
1313            });
1314
1315        dct_matrix
1316            .iter_mut()
1317            .enumerate()
1318            .for_each(|(idx, val)| *val = idx as f32);
1319
1320        DefaultKernelNoPadding::dct2d_impl(
1321            &dct_matrix,
1322            &input_buffer_127,
1323            &mut GenericArray::default(),
1324            &mut output_ref,
1325        );
1326
1327        #[allow(unused_unsafe)]
1328        unsafe {
1329            x86::Avx2F32Kernel::dct2d_impl(
1330                &dct_matrix,
1331                &input_buffer_128,
1332                &mut GenericArray::default(),
1333                &mut output,
1334            );
1335        }
1336
1337        for i in 0..16 {
1338            for j in 0..16 {
1339                let diff = (output_ref[i][j] - output[i][j]).abs();
1340                let diff_ref = diff / output_ref[i][j];
1341                // accept up to 0.00008% rounding error (8e-7), almost f32::EPSILON
1342                assert!(
1343                    diff_ref < 8e-7,
1344                    "difference: {:?} (relative: {:?})",
1345                    diff,
1346                    diff_ref
1347                );
1348            }
1349        }
1350    }
1351
1352    #[cfg(all(target_arch = "x86_64", feature = "avx512", target_feature = "avx512f"))]
1353    #[test]
1354    fn test_dct64_impl_avx512() {
1355        let mut kernel = x86::Avx512F32Kernel;
1356        test_gradient_impl(&mut kernel);
1357        test_dct64_impl(&mut kernel, NumCast::from(5e-6).unwrap());
1358    }
1359}