yuvutils_rs/
from_identity_f16.rs

1/*
2 * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification,
5 * are permitted provided that the following conditions are met:
6 *
7 * 1.  Redistributions of source code must retain the above copyright notice, this
8 * list of conditions and the following disclaimer.
9 *
10 * 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * this list of conditions and the following disclaimer in the documentation
12 * and/or other materials provided with the distribution.
13 *
14 * 3.  Neither the name of the copyright holder nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::numerics::qrshr;
30use crate::yuv_error::check_rgba_destination;
31use crate::yuv_support::{get_yuv_range, YuvSourceChannels};
32use crate::{YuvChromaSubsampling, YuvError, YuvPlanarImage, YuvRange};
33use core::f16;
34use num_traits::AsPrimitive;
35#[cfg(feature = "rayon")]
36use rayon::iter::{IndexedParallelIterator, ParallelIterator};
37#[cfg(feature = "rayon")]
38use rayon::prelude::{ParallelSlice, ParallelSliceMut};
39use std::fmt::Debug;
40use std::marker::PhantomData;
41use std::mem::size_of;
42use std::ops::Sub;
43
44trait FullRowHandle<V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16> {
45    unsafe fn process_row(&self, dst: &mut [f16], y_src: &[V], u_src: &[V], v_src: &[V]);
46}
47
48trait CastableToF16 {
49    fn cast_to_f16<const BIT_DEPTH: usize>(self) -> f16;
50}
51
52impl CastableToF16 for u16 {
53    fn cast_to_f16<const BIT_DEPTH: usize>(self) -> f16 {
54        if BIT_DEPTH == 16 {
55            (self as i32) as f16
56        } else {
57            (self as i16) as f16
58        }
59    }
60}
61
62trait LimitedRowHandle<
63    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync,
64    J: Copy + Sub<Output = J> + AsPrimitive<i32>,
65>
66{
67    unsafe fn process_row(
68        &self,
69        dst: &mut [f16],
70        y_src: &[V],
71        u_src: &[V],
72        v_src: &[V],
73        y_bias: J,
74        y_coef: i16,
75    );
76}
77
// Full-range row kernel: scales every G/B/R sample by `1 / (2^bit_depth - 1)` and stores
// the result as interleaved `f16`. `$y_src` holds green, `$u_src` blue and `$v_src` red.
// When the layout has an alpha channel it is written as 1.0.
macro_rules! exec_cv_full {
    ($dst: expr, $y_src: expr, $u_src: expr, $v_src: expr, $cn: expr, $bit_depth: expr) => {
        let max_value = (1 << $bit_depth) - 1;
        let max_value_f16 = 1f32 as f16;
        let rgb_chunks = $dst.chunks_exact_mut($cn.get_channels_count());
        let scale = (1f32 / max_value as f32) as f16;

        for (((&y_src, &u_src), &v_src), rgb_dst) in
            $y_src.iter().zip($u_src).zip($v_src).zip(rgb_chunks)
        {
            let r = v_src.cast_to_f16::<$bit_depth>() * scale;
            let g = y_src.cast_to_f16::<$bit_depth>() * scale;
            let b = u_src.cast_to_f16::<$bit_depth>() * scale;
            // Integers of 65520 and above are not representable in `f16` and convert to
            // `+inf`, which would stay `+inf` after scaling. Saturate at 1.0 instead,
            // which is also the correctly rounded result for those 16-bit samples.
            rgb_dst[$cn.get_r_channel_offset()] = if r > max_value_f16 {
                max_value_f16
            } else {
                r
            };
            rgb_dst[$cn.get_g_channel_offset()] = if g > max_value_f16 {
                max_value_f16
            } else {
                g
            };
            rgb_dst[$cn.get_b_channel_offset()] = if b > max_value_f16 {
                max_value_f16
            } else {
                b
            };
            if $cn.has_alpha() {
                rgb_dst[$cn.get_a_channel_offset()] = max_value_f16;
            }
        }
    };
}
97
// Limited-range row kernel: subtracts `$y_bias` from every G/B/R sample, expands the
// range with the fixed-point multiplier `$y_coef` (shifted back by `$precision` bits
// through `qrshr`) and stores the normalised result as interleaved `f16`.
// `$y_src` holds green, `$u_src` blue and `$v_src` red. When the layout has an alpha
// channel it is written as 1.0.
macro_rules! exec_cv_limited {
    ($dst: expr, $y_src: expr, $u_src: expr, $v_src: expr, $cn: expr, $bit_depth: expr, $y_bias: expr, $y_coef: expr, $precision: expr) => {
        let max_value = (1 << $bit_depth) - 1;
        let max_value_f16 = 1f32 as f16;
        let rgb_chunks = $dst.chunks_exact_mut($cn.get_channels_count());
        // Normalisation is done in `f32` and narrowed to `f16` once at the end.
        // Casting the integer straight to `f16` turns 16-bit results of 65520 and above
        // into `+inf` and rounds integers above 2048 before they are scaled.
        let scale = 1f32 / max_value as f32;

        for (((&y_src, &u_src), &v_src), rgb_dst) in
            $y_src.iter().zip($u_src).zip($v_src).zip(rgb_chunks)
        {
            rgb_dst[$cn.get_r_channel_offset()] =
                (qrshr::<$precision, $bit_depth>((v_src.as_() - $y_bias).as_() * $y_coef as i32)
                    as f32
                    * scale) as f16;
            rgb_dst[$cn.get_g_channel_offset()] =
                (qrshr::<$precision, $bit_depth>((y_src.as_() - $y_bias).as_() * $y_coef as i32)
                    as f32
                    * scale) as f16;
            rgb_dst[$cn.get_b_channel_offset()] =
                (qrshr::<$precision, $bit_depth>((u_src.as_() - $y_bias).as_() * $y_coef as i32)
                    as f32
                    * scale) as f16;
            if $cn.has_alpha() {
                rgb_dst[$cn.get_a_channel_offset()] = max_value_f16;
            }
        }
    };
}
126
127#[derive(Default)]
128struct DefaultFullRowHandle<
129    V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
130    const CHANNELS: u8,
131    const BIT_DEPTH: usize,
132> {
133    _phantom: PhantomData<V>,
134}
135
136impl<
137        V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
138        const CHANNELS: u8,
139        const BIT_DEPTH: usize,
140    > FullRowHandle<V> for DefaultFullRowHandle<V, CHANNELS, BIT_DEPTH>
141{
142    unsafe fn process_row(&self, dst: &mut [f16], y_src: &[V], u_src: &[V], v_src: &[V]) {
143        let cn: YuvSourceChannels = CHANNELS.into();
144        exec_cv_full!(dst, y_src, u_src, v_src, cn, BIT_DEPTH);
145    }
146}
147
/// Full-range row handler whose kernel is compiled with the aarch64 `fp16` target
/// feature enabled.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[derive(Default)]
struct DefaultFullRowHandleNeonFp16<V, const CHANNELS: u8, const BIT_DEPTH: usize>
where
    V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
{
    _phantom: PhantomData<V>,
}
157
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
impl<V, const CHANNELS: u8, const BIT_DEPTH: usize> FullRowHandle<V>
    for DefaultFullRowHandleNeonFp16<V, CHANNELS, BIT_DEPTH>
where
    V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
{
    /// Converts one row using the shared full-range kernel, built with `fp16` enabled.
    ///
    /// # Safety
    ///
    /// The running CPU must support the `fp16` feature.
    #[target_feature(enable = "fp16")]
    unsafe fn process_row(&self, dst: &mut [f16], y_src: &[V], u_src: &[V], v_src: &[V]) {
        let layout: YuvSourceChannels = CHANNELS.into();
        exec_cv_full!(dst, y_src, u_src, v_src, layout, BIT_DEPTH);
    }
}
171
/// Full-range row handler whose kernel is compiled with the x86 `avx2` and `f16c`
/// target features enabled.
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
#[derive(Default)]
struct DefaultFullRowHandleAvxFp16c<V, const CHANNELS: u8, const BIT_DEPTH: usize>
where
    V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
{
    _phantom: PhantomData<V>,
}
181
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
impl<V, const CHANNELS: u8, const BIT_DEPTH: usize> FullRowHandle<V>
    for DefaultFullRowHandleAvxFp16c<V, CHANNELS, BIT_DEPTH>
where
    V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
{
    /// Converts one row using the shared full-range kernel, built with `avx2` and
    /// `f16c` enabled.
    ///
    /// # Safety
    ///
    /// The running CPU must support both `avx2` and `f16c`.
    #[target_feature(enable = "avx2", enable = "f16c")]
    unsafe fn process_row(&self, dst: &mut [f16], y_src: &[V], u_src: &[V], v_src: &[V]) {
        let layout: YuvSourceChannels = CHANNELS.into();
        exec_cv_full!(dst, y_src, u_src, v_src, layout, BIT_DEPTH);
    }
}
195
196#[derive(Default)]
197struct DefaultLimitedRowHandle<
198    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync + Default,
199    J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default,
200    const CHANNELS: u8,
201    const BIT_DEPTH: usize,
202    const PRECISION: i32,
203> {
204    _phantom: PhantomData<V>,
205    _phantom2: PhantomData<J>,
206}
207
208impl<
209        V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync + Default,
210        J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
211        const CHANNELS: u8,
212        const BIT_DEPTH: usize,
213        const PRECISION: i32,
214    > LimitedRowHandle<V, J> for DefaultLimitedRowHandle<V, J, CHANNELS, BIT_DEPTH, PRECISION>
215{
216    unsafe fn process_row(
217        &self,
218        dst: &mut [f16],
219        y_src: &[V],
220        u_src: &[V],
221        v_src: &[V],
222        y_bias: J,
223        y_coef: i16,
224    ) {
225        let cn: YuvSourceChannels = CHANNELS.into();
226        exec_cv_limited!(dst, y_src, u_src, v_src, cn, BIT_DEPTH, y_bias, y_coef, PRECISION);
227    }
228}
229
/// Limited-range row handler whose kernel is compiled with the aarch64 `fp16` target
/// feature enabled.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[derive(Default)]
struct DefaultLimitedRowHandleNeonFp16<
    V,
    J,
    const CHANNELS: u8,
    const BIT_DEPTH: usize,
    const PRECISION: i32,
> where
    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync,
    J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
{
    _phantom: PhantomData<V>,
    _phantom2: PhantomData<J>,
}
242
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
impl<V, J, const CHANNELS: u8, const BIT_DEPTH: usize, const PRECISION: i32>
    LimitedRowHandle<V, J>
    for DefaultLimitedRowHandleNeonFp16<V, J, CHANNELS, BIT_DEPTH, PRECISION>
where
    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync,
    J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
{
    /// Converts one row using the shared limited-range kernel, built with `fp16`
    /// enabled.
    ///
    /// # Safety
    ///
    /// The running CPU must support the `fp16` feature.
    #[target_feature(enable = "fp16")]
    unsafe fn process_row(
        &self,
        dst: &mut [f16],
        y_src: &[V],
        u_src: &[V],
        v_src: &[V],
        y_bias: J,
        y_coef: i16,
    ) {
        let layout: YuvSourceChannels = CHANNELS.into();
        exec_cv_limited!(dst, y_src, u_src, v_src, layout, BIT_DEPTH, y_bias, y_coef, PRECISION);
    }
}
267
/// Limited-range row handler whose kernel is compiled with the x86 `avx2` and `f16c`
/// target features enabled.
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
#[derive(Default)]
struct DefaultLimitedRowHandleAvxFp16c<
    V,
    J,
    const CHANNELS: u8,
    const BIT_DEPTH: usize,
    const PRECISION: i32,
> where
    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync,
    J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
{
    _phantom: PhantomData<V>,
    _phantom2: PhantomData<J>,
}
280
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
impl<V, J, const CHANNELS: u8, const BIT_DEPTH: usize, const PRECISION: i32>
    LimitedRowHandle<V, J>
    for DefaultLimitedRowHandleAvxFp16c<V, J, CHANNELS, BIT_DEPTH, PRECISION>
where
    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync,
    J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
{
    /// Converts one row using the shared limited-range kernel, built with `avx2` and
    /// `f16c` enabled.
    ///
    /// # Safety
    ///
    /// The running CPU must support both `avx2` and `f16c`.
    #[target_feature(enable = "avx2", enable = "f16c")]
    unsafe fn process_row(
        &self,
        dst: &mut [f16],
        y_src: &[V],
        u_src: &[V],
        v_src: &[V],
        y_bias: J,
        y_coef: i16,
    ) {
        let layout: YuvSourceChannels = CHANNELS.into();
        exec_cv_limited!(dst, y_src, u_src, v_src, layout, BIT_DEPTH, y_bias, y_coef, PRECISION);
    }
}
305
306#[inline]
307fn gbr_to_rgbx_f16_impl<
308    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
309    J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
310    const CHANNELS: u8,
311    const BIT_DEPTH: usize,
312>(
313    image: &YuvPlanarImage<V>,
314    rgba: &mut [f16],
315    rgba_stride: u32,
316    yuv_range: YuvRange,
317) -> Result<(), YuvError>
318where
319    u32: AsPrimitive<J>,
320{
321    let cn: YuvSourceChannels = CHANNELS.into();
322    let channels = cn.get_channels_count();
323    assert!(
324        channels == 3 || channels == 4,
325        "GBR -> RGB is implemented only on 3 and 4 channels"
326    );
327    assert!(
328        (8..=16).contains(&BIT_DEPTH),
329        "Invalid bit depth is provided"
330    );
331    assert!(
332        if BIT_DEPTH > 8 {
333            size_of::<V>() == 2
334        } else {
335            size_of::<V>() == 1
336        },
337        "Unsupported bit depth and data type combination"
338    );
339    let y_plane = image.y_plane;
340    let u_plane = image.u_plane;
341    let v_plane = image.v_plane;
342    let y_stride = image.y_stride as usize;
343    let u_stride = image.u_stride as usize;
344    let v_stride = image.v_stride as usize;
345    let height = image.height;
346
347    image.check_constraints(YuvChromaSubsampling::Yuv444)?;
348    check_rgba_destination(rgba, rgba_stride, image.width, height, channels)?;
349
350    let y_iter;
351    let rgb_iter;
352    let u_iter;
353    let v_iter;
354
355    #[cfg(feature = "rayon")]
356    {
357        y_iter = y_plane.par_chunks_exact(y_stride);
358        rgb_iter = rgba.par_chunks_exact_mut(rgba_stride as usize);
359        u_iter = u_plane.par_chunks_exact(u_stride);
360        v_iter = v_plane.par_chunks_exact(v_stride);
361    }
362    #[cfg(not(feature = "rayon"))]
363    {
364        y_iter = y_plane.chunks_exact(y_stride);
365        rgb_iter = rgba.chunks_exact_mut(rgba_stride as usize);
366        u_iter = u_plane.chunks_exact(u_stride);
367        v_iter = v_plane.chunks_exact(v_stride);
368    }
369
370    match yuv_range {
371        YuvRange::Limited => {
372            const PRECISION: i32 = 13;
373            // All channels on identity should use Y range
374            let range = get_yuv_range(BIT_DEPTH as u32, yuv_range);
375            let range_rgba = (1 << BIT_DEPTH) - 1;
376            let y_coef =
377                ((range_rgba as f32 / range.range_y as f32) * (1 << PRECISION) as f32) as i16;
378            let y_bias = range.bias_y.as_();
379
380            let iter = y_iter.zip(u_iter).zip(v_iter).zip(rgb_iter);
381
382            iter.for_each(|(((y_src, u_src), v_src), rgb)| {
383                let y_src = &y_src[0..image.width as usize];
384                let mut _row_processor: Box<dyn LimitedRowHandle<V, J> + Send + Sync> =
385                    Box::new(DefaultLimitedRowHandle::<
386                        V,
387                        J,
388                        CHANNELS,
389                        BIT_DEPTH,
390                        PRECISION,
391                    >::default());
392
393                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
394                if std::arch::is_aarch64_feature_detected!("fp16") {
395                    _row_processor = Box::new(DefaultLimitedRowHandleNeonFp16::<
396                        V,
397                        J,
398                        CHANNELS,
399                        BIT_DEPTH,
400                        PRECISION,
401                    >::default());
402                }
403
404                #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
405                {
406                    if std::arch::is_x86_feature_detected!("avx2")
407                        && std::arch::is_x86_feature_detected!("f16c")
408                    {
409                        _row_processor = Box::new(DefaultLimitedRowHandleAvxFp16c::<
410                            V,
411                            J,
412                            CHANNELS,
413                            BIT_DEPTH,
414                            PRECISION,
415                        >::default());
416                    }
417                }
418
419                unsafe {
420                    _row_processor.process_row(rgb, y_src, u_src, v_src, y_bias, y_coef);
421                }
422            });
423        }
424        YuvRange::Full => {
425            let iter = y_iter.zip(u_iter).zip(v_iter).zip(rgb_iter);
426
427            let mut _row_processor: Box<dyn FullRowHandle<V> + Send + Sync> =
428                Box::new(DefaultFullRowHandle::<V, CHANNELS, BIT_DEPTH>::default());
429
430            #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
431            if std::arch::is_aarch64_feature_detected!("fp16") {
432                _row_processor =
433                    Box::new(DefaultFullRowHandleNeonFp16::<V, CHANNELS, BIT_DEPTH>::default());
434            }
435
436            #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
437            {
438                if std::arch::is_x86_feature_detected!("avx2")
439                    && std::arch::is_x86_feature_detected!("f16c")
440                {
441                    _row_processor =
442                        Box::new(DefaultFullRowHandleAvxFp16c::<V, CHANNELS, BIT_DEPTH>::default());
443                }
444            }
445
446            iter.for_each(|(((y_src, u_src), v_src), rgb)| {
447                let y_src = &y_src[0..image.width as usize];
448                unsafe {
449                    _row_processor.process_row(rgb, y_src, u_src, v_src);
450                }
451            });
452        }
453    }
454
455    Ok(())
456}
457
// Generates a public `GBR<depth> -> RGB(A) f16` entry point over `YuvPlanarImage<u16>`.
//
// `$method` is the function name, `$px_fmt` the destination `YuvSourceChannels` layout,
// `$bit_depth` the source bit depth, `$rgb_name` the layout name used in the docs,
// `$dst_name` / `$stride_name` the names of the destination parameters and `$tr` the
// integer type used for the limited-range bias subtraction.
macro_rules! d_cv {
    ($method: ident, $px_fmt: expr, $bit_depth: expr, $rgb_name: expr, $dst_name: ident, $stride_name: ident, $tr: ident) => {
        #[doc = concat!("Convert GBR", stringify!($bit_depth), " to ", $rgb_name, "F16, IEEE float16 format.

This function takes planar GBR data with ", stringify!($bit_depth), " bit precision
and converts it to interleaved ", $rgb_name, "F16 IEEE float16 data.

# Arguments

* `image` - Source GBR", stringify!($bit_depth), " image.
* `", stringify!($dst_name), "` - A slice to store the ", $rgb_name, "F16 data.
* `", stringify!($stride_name), "` - The stride (components per row) for the ", $rgb_name, "F16 data.
* `range` - Range of the source values.

# Errors

Returns a `YuvError` if the source image planes or the destination ", $rgb_name, "F16
slice fail validation against the specified width, height and strides.")]
        pub fn $method(
            image: &YuvPlanarImage<u16>,
            $dst_name: &mut [f16],
            $stride_name: u32,
            range: YuvRange,
        ) -> Result<(), YuvError> {
            gbr_to_rgbx_f16_impl::<u16, $tr, { $px_fmt as u8 }, $bit_depth>(
                image, $dst_name, $stride_name, range,
            )
        }
    };
}
488
489d_cv!(
490    gb10_to_rgba_f16,
491    YuvSourceChannels::Rgba,
492    10,
493    "RGBA",
494    rgba,
495    rgba_stride,
496    i16
497);
498d_cv!(
499    gb12_to_rgba_f16,
500    YuvSourceChannels::Rgba,
501    12,
502    "RGBA",
503    rgba,
504    rgba_stride,
505    i16
506);
507d_cv!(
508    gb14_to_rgba_f16,
509    YuvSourceChannels::Rgba,
510    14,
511    "RGBA",
512    rgba,
513    rgba_stride,
514    i16
515);
516d_cv!(
517    gb16_to_rgba_f16,
518    YuvSourceChannels::Rgba,
519    16,
520    "RGBA",
521    rgba,
522    rgba_stride,
523    i32
524);
525
526d_cv!(
527    gb10_to_rgb_f16,
528    YuvSourceChannels::Rgb,
529    10,
530    "RGB",
531    rgb,
532    rgb_stride,
533    i16
534);
535d_cv!(
536    gb12_to_rgb_f16,
537    YuvSourceChannels::Rgb,
538    12,
539    "RGB",
540    rgb,
541    rgb_stride,
542    i16
543);
544d_cv!(
545    gb14_to_rgb_f16,
546    YuvSourceChannels::Rgb,
547    14,
548    "RGB",
549    rgb,
550    rgb_stride,
551    i16
552);
553d_cv!(
554    gb16_to_rgb_f16,
555    YuvSourceChannels::Rgb,
556    16,
557    "RGB",
558    rgb,
559    rgb_stride,
560    i32
561);