yuvutils_rs/
yuv_nv_p16_to_rgb16.rs

1/*
2 * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification,
5 * are permitted provided that the following conditions are met:
6 *
7 * 1.  Redistributions of source code must retain the above copyright notice, this
8 * list of conditions and the following disclaimer.
9 *
10 * 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * this list of conditions and the following disclaimer in the documentation
12 * and/or other materials provided with the distribution.
13 *
14 * 3.  Neither the name of the copyright holder nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::internals::ProcessedOffset;
30#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
31use crate::neon::neon_yuv_nv_p16_to_rgba_row;
32use crate::numerics::{qrshr_n, to_ne};
33use crate::yuv_error::check_rgba_destination;
34use crate::yuv_support::*;
35use crate::{YuvBiPlanarImage, YuvError};
36#[cfg(feature = "rayon")]
37use rayon::iter::{IndexedParallelIterator, ParallelIterator};
38#[cfg(feature = "rayon")]
39use rayon::prelude::{ParallelSlice, ParallelSliceMut};
40
41fn yuv_nv_p16_to_image_impl<
42    const DESTINATION_CHANNELS: u8,
43    const NV_ORDER: u8,
44    const SAMPLING: u8,
45    const ENDIANNESS: u8,
46    const BYTES_POSITION: u8,
47    const BIT_DEPTH: usize,
48>(
49    image: &YuvBiPlanarImage<u16>,
50    bgra: &mut [u16],
51    bgra_stride: u32,
52    range: YuvRange,
53    matrix: YuvStandardMatrix,
54) -> Result<(), YuvError> {
55    let dst_chans: YuvSourceChannels = DESTINATION_CHANNELS.into();
56    let channels = dst_chans.get_channels_count();
57    let uv_order: YuvNVOrder = NV_ORDER.into();
58    let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into();
59    let chroma_range = get_yuv_range(BIT_DEPTH as u32, range);
60    let kr_kb = matrix.get_kr_kb();
61    let max_range = ((1u32 << (BIT_DEPTH as u32)) - 1u32) as i32;
62
63    image.check_constraints(chroma_subsampling)?;
64    check_rgba_destination(bgra, bgra_stride, image.width, image.height, channels)?;
65
66    const PRECISION: i32 = 13;
67    let i_transform = search_inverse_transform(
68        PRECISION,
69        BIT_DEPTH as u32,
70        range,
71        matrix,
72        chroma_range,
73        kr_kb,
74    );
75    let cr_coef = i_transform.cr_coef;
76    let cb_coef = i_transform.cb_coef;
77    let y_coef = i_transform.y_coef;
78    let g_coef_1 = i_transform.g_coeff_1;
79    let g_coef_2 = i_transform.g_coeff_2;
80
81    let bias_y = chroma_range.bias_y as i32;
82    let bias_uv = chroma_range.bias_uv as i32;
83
84    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
85    let mut _use_sse = std::arch::is_x86_feature_detected!("sse4.1");
86    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
87    let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
88    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
89    let neon_wide_row_handler = if is_rdm_available && BIT_DEPTH <= 12 {
90        #[cfg(feature = "rdm")]
91        {
92            use crate::neon::neon_yuv_nv_p16_to_rgba_row_rdm;
93            neon_yuv_nv_p16_to_rgba_row_rdm::<
94                DESTINATION_CHANNELS,
95                NV_ORDER,
96                SAMPLING,
97                ENDIANNESS,
98                BYTES_POSITION,
99                BIT_DEPTH,
100                PRECISION,
101            >
102        }
103        #[cfg(not(feature = "rdm"))]
104        {
105            neon_yuv_nv_p16_to_rgba_row::<
106                DESTINATION_CHANNELS,
107                NV_ORDER,
108                SAMPLING,
109                ENDIANNESS,
110                BYTES_POSITION,
111                BIT_DEPTH,
112                PRECISION,
113            >
114        }
115    } else {
116        neon_yuv_nv_p16_to_rgba_row::<
117            DESTINATION_CHANNELS,
118            NV_ORDER,
119            SAMPLING,
120            ENDIANNESS,
121            BYTES_POSITION,
122            BIT_DEPTH,
123            PRECISION,
124        >
125    };
126
127    let process_wide_row = |_rgba: &mut [u16], _y_src: &[u16], _uv_src: &[u16]| {
128        let mut _offset = ProcessedOffset { cx: 0, ux: 0 };
129        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
130        {
131            #[cfg(feature = "sse")]
132            if _use_sse {
133                use crate::sse::sse_yuv_nv_p16_to_rgba_row;
134                unsafe {
135                    let processed = sse_yuv_nv_p16_to_rgba_row::<
136                        DESTINATION_CHANNELS,
137                        NV_ORDER,
138                        SAMPLING,
139                        ENDIANNESS,
140                        BYTES_POSITION,
141                        BIT_DEPTH,
142                        PRECISION,
143                    >(
144                        _y_src,
145                        _uv_src,
146                        _rgba,
147                        image.width,
148                        &chroma_range,
149                        &i_transform,
150                        _offset.cx,
151                        _offset.ux,
152                    );
153                    _offset = processed;
154                }
155            }
156        }
157
158        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
159        {
160            unsafe {
161                let processed = neon_wide_row_handler(
162                    _y_src,
163                    _uv_src,
164                    _rgba,
165                    image.width,
166                    &chroma_range,
167                    &i_transform,
168                    0,
169                    0,
170                );
171                _offset = processed;
172            }
173        }
174        _offset
175    };
176
177    let msb_shift = (16 - BIT_DEPTH) as i32;
178    let width = image.width;
179
180    let process_halved_chroma_row = |y_src: &[u16], uv_src: &[u16], rgba: &mut [u16]| {
181        let processed = process_wide_row(rgba, y_src, uv_src);
182
183        for ((rgba, y_src), uv_src) in rgba
184            .chunks_exact_mut(channels * 2)
185            .zip(y_src.chunks_exact(2))
186            .zip(uv_src.chunks_exact(2))
187            .skip(processed.cx / 2)
188        {
189            let y_vl0 = to_ne::<ENDIANNESS, BYTES_POSITION>(y_src[0], msb_shift) as i32;
190            let mut cb_value =
191                to_ne::<ENDIANNESS, BYTES_POSITION>(uv_src[uv_order.get_u_position()], msb_shift)
192                    as i32;
193            let mut cr_value =
194                to_ne::<ENDIANNESS, BYTES_POSITION>(uv_src[uv_order.get_v_position()], msb_shift)
195                    as i32;
196
197            let y_value0: i32 = (y_vl0 - bias_y) * y_coef;
198
199            cb_value -= bias_uv;
200            cr_value -= bias_uv;
201
202            let r_p0 = qrshr_n::<PRECISION>(y_value0 + cr_coef * cr_value, max_range);
203            let b_p0 = qrshr_n::<PRECISION>(y_value0 + cb_coef * cb_value, max_range);
204            let g_p0 = qrshr_n::<PRECISION>(
205                y_value0 - g_coef_1 * cr_value - g_coef_2 * cb_value,
206                max_range,
207            );
208
209            let rgba0 = &mut rgba[0..channels];
210
211            rgba0[dst_chans.get_b_channel_offset()] = b_p0 as u16;
212            rgba0[dst_chans.get_g_channel_offset()] = g_p0 as u16;
213            rgba0[dst_chans.get_r_channel_offset()] = r_p0 as u16;
214
215            if dst_chans.has_alpha() {
216                rgba0[dst_chans.get_a_channel_offset()] = max_range as u16;
217            }
218
219            let y_vl1 = to_ne::<ENDIANNESS, BYTES_POSITION>(y_src[1], msb_shift) as i32;
220
221            let y_value1: i32 = (y_vl1 - bias_y) * y_coef;
222
223            let r_p1 = qrshr_n::<PRECISION>(y_value1 + cr_coef * cr_value, max_range);
224            let b_p1 = qrshr_n::<PRECISION>(y_value1 + cb_coef * cb_value, max_range);
225            let g_p1 = qrshr_n::<PRECISION>(
226                y_value1 - g_coef_1 * cr_value - g_coef_2 * cb_value,
227                max_range,
228            );
229
230            let rgba1 = &mut rgba[channels..channels * 2];
231
232            rgba1[dst_chans.get_b_channel_offset()] = b_p1 as u16;
233            rgba1[dst_chans.get_g_channel_offset()] = g_p1 as u16;
234            rgba1[dst_chans.get_r_channel_offset()] = r_p1 as u16;
235
236            if dst_chans.has_alpha() {
237                rgba1[dst_chans.get_a_channel_offset()] = max_range as u16;
238            }
239        }
240
241        if width & 1 != 0 {
242            let rgba = rgba.chunks_exact_mut(channels * 2).into_remainder();
243            let rgba = &mut rgba[0..channels];
244            let uv_src = uv_src.chunks_exact(2).last().unwrap();
245            let y_src = y_src.chunks_exact(2).remainder();
246
247            let y_vl0 = to_ne::<ENDIANNESS, BYTES_POSITION>(y_src[0], msb_shift) as i32;
248            let y_value0: i32 = (y_vl0 - bias_y) * y_coef;
249            let mut cb_value =
250                to_ne::<ENDIANNESS, BYTES_POSITION>(uv_src[uv_order.get_u_position()], msb_shift)
251                    as i32;
252            let mut cr_value =
253                to_ne::<ENDIANNESS, BYTES_POSITION>(uv_src[uv_order.get_v_position()], msb_shift)
254                    as i32;
255
256            cb_value -= bias_uv;
257            cr_value -= bias_uv;
258
259            let r_p0 = qrshr_n::<PRECISION>(y_value0 + cr_coef * cr_value, max_range);
260            let b_p0 = qrshr_n::<PRECISION>(y_value0 + cb_coef * cb_value, max_range);
261            let g_p0 = qrshr_n::<PRECISION>(
262                y_value0 - g_coef_1 * cr_value - g_coef_2 * cb_value,
263                max_range,
264            );
265
266            rgba[dst_chans.get_b_channel_offset()] = b_p0 as u16;
267            rgba[dst_chans.get_g_channel_offset()] = g_p0 as u16;
268            rgba[dst_chans.get_r_channel_offset()] = r_p0 as u16;
269
270            if dst_chans.has_alpha() {
271                rgba[dst_chans.get_a_channel_offset()] = max_range as u16;
272            }
273        }
274    };
275
276    let y_stride = image.y_stride;
277    let uv_stride = image.uv_stride;
278    let y_plane = image.y_plane;
279    let uv_plane = image.uv_plane;
280
281    if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
282        let iter;
283        #[cfg(feature = "rayon")]
284        {
285            iter = y_plane
286                .par_chunks_exact(y_stride as usize)
287                .zip(uv_plane.par_chunks_exact(uv_stride as usize))
288                .zip(bgra.par_chunks_exact_mut(bgra_stride as usize));
289        }
290        #[cfg(not(feature = "rayon"))]
291        {
292            iter = y_plane
293                .chunks_exact(y_stride as usize)
294                .zip(uv_plane.chunks_exact(uv_stride as usize))
295                .zip(bgra.chunks_exact_mut(bgra_stride as usize));
296        }
297        iter.for_each(|((y_src, uv_src), rgba)| {
298            let y_src = &y_src[0..image.width as usize];
299            let processed = process_wide_row(rgba, y_src, uv_src);
300
301            for ((rgba, &y_src), uv_src) in rgba
302                .chunks_exact_mut(channels)
303                .zip(y_src.iter())
304                .zip(uv_src.chunks_exact(2))
305                .skip(processed.cx)
306            {
307                let y_vl = to_ne::<ENDIANNESS, BYTES_POSITION>(y_src, msb_shift) as i32;
308                let mut cb_value = to_ne::<ENDIANNESS, BYTES_POSITION>(
309                    uv_src[uv_order.get_u_position()],
310                    msb_shift,
311                ) as i32;
312                let mut cr_value = to_ne::<ENDIANNESS, BYTES_POSITION>(
313                    uv_src[uv_order.get_v_position()],
314                    msb_shift,
315                ) as i32;
316
317                let y_value: i32 = (y_vl - bias_y) * y_coef;
318
319                cb_value -= bias_uv;
320                cr_value -= bias_uv;
321
322                let r_p16 = qrshr_n::<PRECISION>(y_value + cr_coef * cr_value, max_range);
323                let b_p16 = qrshr_n::<PRECISION>(y_value + cb_coef * cb_value, max_range);
324                let g_p16 = qrshr_n::<PRECISION>(
325                    y_value - g_coef_1 * cr_value - g_coef_2 * cb_value,
326                    max_range,
327                );
328
329                let rgba0 = &mut rgba[0..channels];
330
331                rgba0[dst_chans.get_b_channel_offset()] = b_p16 as u16;
332                rgba0[dst_chans.get_g_channel_offset()] = g_p16 as u16;
333                rgba0[dst_chans.get_r_channel_offset()] = r_p16 as u16;
334
335                if dst_chans.has_alpha() {
336                    rgba0[dst_chans.get_a_channel_offset()] = max_range as u16;
337                }
338            }
339        });
340    } else if chroma_subsampling == YuvChromaSubsampling::Yuv422 {
341        let iter;
342        #[cfg(feature = "rayon")]
343        {
344            iter = y_plane
345                .par_chunks_exact(y_stride as usize)
346                .zip(uv_plane.par_chunks_exact(uv_stride as usize))
347                .zip(bgra.par_chunks_exact_mut(bgra_stride as usize));
348        }
349        #[cfg(not(feature = "rayon"))]
350        {
351            iter = y_plane
352                .chunks_exact(y_stride as usize)
353                .zip(uv_plane.chunks_exact(uv_stride as usize))
354                .zip(bgra.chunks_exact_mut(bgra_stride as usize));
355        }
356        iter.for_each(|((y_src, uv_src), rgba)| {
357            process_halved_chroma_row(
358                &y_src[0..image.width as usize],
359                &uv_src[0..(image.width as usize).div_ceil(2) * 2],
360                &mut rgba[0..image.width as usize * channels],
361            );
362        });
363    } else if chroma_subsampling == YuvChromaSubsampling::Yuv420 {
364        let iter;
365        #[cfg(feature = "rayon")]
366        {
367            iter = y_plane
368                .par_chunks_exact(y_stride as usize * 2)
369                .zip(uv_plane.par_chunks_exact(uv_stride as usize))
370                .zip(bgra.par_chunks_exact_mut(bgra_stride as usize * 2));
371        }
372        #[cfg(not(feature = "rayon"))]
373        {
374            iter = y_plane
375                .chunks_exact(y_stride as usize * 2)
376                .zip(uv_plane.chunks_exact(uv_stride as usize))
377                .zip(bgra.chunks_exact_mut(bgra_stride as usize * 2));
378        }
379        iter.for_each(|((y_src, uv_src), rgba)| {
380            for (y_src, rgba) in y_src
381                .chunks_exact(y_stride as usize)
382                .zip(rgba.chunks_exact_mut(bgra_stride as usize))
383            {
384                process_halved_chroma_row(
385                    &y_src[0..image.width as usize],
386                    &uv_src[0..(image.width as usize).div_ceil(2) * 2],
387                    &mut rgba[0..image.width as usize * channels],
388                );
389            }
390        });
391        if image.height & 1 != 0 {
392            let y_src = y_plane.chunks_exact(y_stride as usize * 2).remainder();
393            let uv_src = uv_plane.chunks_exact(uv_stride as usize).last().unwrap();
394            let rgba = bgra
395                .chunks_exact_mut(bgra_stride as usize * 2)
396                .into_remainder();
397            process_halved_chroma_row(
398                &y_src[0..image.width as usize],
399                &uv_src[0..(image.width as usize).div_ceil(2) * 2],
400                &mut rgba[0..image.width as usize * channels],
401            );
402        }
403    } else {
404        unreachable!();
405    }
406    Ok(())
407}
408
// Generates one public bi-planar YUV -> RGB(A) conversion function.
//
// * `$method` - name of the generated function.
// * `$px_fmt` - destination channel layout, passed as the `DESTINATION_CHANNELS` const argument.
// * `$subsampling` - chroma subsampling, passed as the `SAMPLING` const argument.
// * `$yuv_name`, `$px_name` - format names used only in the generated documentation.
// * `$bit_precision` - bit depth, passed as the `BIT_DEPTH` const argument and used in the docs.
//
// Every generated function fixes the chroma order to `YuvNVOrder::UV`, the endianness to
// `YuvEndianness::LittleEndian` and the packing to `YuvBytesPacking::MostSignificantBytes`.
macro_rules! d_cnv {
    ($method: ident, $px_fmt: expr, $subsampling: expr, $yuv_name: expr, $px_name: expr, $bit_precision: expr) => {
        #[doc = concat!("Convert ", $yuv_name," format to ", $px_name, stringify!($bit_precision)," format.

This function takes ", $yuv_name," data with ", stringify!($bit_precision),"-bit precision
and converts it to ", $px_name, stringify!($bit_precision)," format with ", stringify!($bit_precision)," bit-depth precision.

# Arguments

* `bi_planar_image` - Source ", stringify!($bit_precision)," bit-depth ", $yuv_name," image.
* `rgba` - A mutable slice to store the converted ", $px_name," ", stringify!($bit_precision)," bit-depth data.
* `rgba_stride` - The stride (components per row) for the ", $px_name," image data.
* `range` - range of YUV, see [YuvRange] for more info.
* `matrix` - The YUV standard matrix (BT.601 or BT.709 or BT.2020 or other).

# Errors

Returns a [YuvError] if validation of the source ", $yuv_name," image or of the destination
", $px_name," buffer (checked against the image width, height and `rgba_stride`) fails.")]
        pub fn $method(
            bi_planar_image: &YuvBiPlanarImage<u16>,
            rgba: &mut [u16],
            rgba_stride: u32,
            range: YuvRange,
            matrix: YuvStandardMatrix,
        ) -> Result<(), YuvError> {
            let dispatcher = yuv_nv_p16_to_image_impl::<
                { $px_fmt as u8 },
                { YuvNVOrder::UV as u8 },
                { $subsampling as u8 },
                { YuvEndianness::LittleEndian as u8 },
                { YuvBytesPacking::MostSignificantBytes as u8 },
                $bit_precision,
            >;
            dispatcher(bi_planar_image, rgba, rgba_stride, range, matrix)
        }
    };
}
447
// 10-bit conversions: P010 (4:2:0), P210 (4:2:2) and P410 (4:4:4),
// each generated for both RGBA and RGB destinations.
d_cnv!(
    p010_to_rgba10,
    YuvSourceChannels::Rgba,
    YuvChromaSubsampling::Yuv420,
    "P010",
    "RGBA",
    10
);
d_cnv!(
    p010_to_rgb10,
    YuvSourceChannels::Rgb,
    YuvChromaSubsampling::Yuv420,
    "P010",
    "RGB",
    10
);
d_cnv!(
    p210_to_rgba10,
    YuvSourceChannels::Rgba,
    YuvChromaSubsampling::Yuv422,
    "P210",
    "RGBA",
    10
);
d_cnv!(
    p210_to_rgb10,
    YuvSourceChannels::Rgb,
    YuvChromaSubsampling::Yuv422,
    "P210",
    "RGB",
    10
);
d_cnv!(
    p410_to_rgba10,
    YuvSourceChannels::Rgba,
    YuvChromaSubsampling::Yuv444,
    "P410",
    "RGBA",
    10
);
d_cnv!(
    p410_to_rgb10,
    YuvSourceChannels::Rgb,
    YuvChromaSubsampling::Yuv444,
    "P410",
    "RGB",
    10
);

// 12-bit conversions: P012 (4:2:0), P212 (4:2:2) and P412 (4:4:4),
// each generated for both RGBA and RGB destinations.
d_cnv!(
    p012_to_rgba12,
    YuvSourceChannels::Rgba,
    YuvChromaSubsampling::Yuv420,
    "P012",
    "RGBA",
    12
);
d_cnv!(
    p012_to_rgb12,
    YuvSourceChannels::Rgb,
    YuvChromaSubsampling::Yuv420,
    "P012",
    "RGB",
    12
);
d_cnv!(
    p212_to_rgba12,
    YuvSourceChannels::Rgba,
    YuvChromaSubsampling::Yuv422,
    "P212",
    "RGBA",
    12
);
d_cnv!(
    p212_to_rgb12,
    YuvSourceChannels::Rgb,
    YuvChromaSubsampling::Yuv422,
    "P212",
    "RGB",
    12
);
d_cnv!(
    p412_to_rgba12,
    YuvSourceChannels::Rgba,
    YuvChromaSubsampling::Yuv444,
    "P412",
    "RGBA",
    12
);
d_cnv!(
    p412_to_rgb12,
    YuvSourceChannels::Rgb,
    YuvChromaSubsampling::Yuv444,
    "P412",
    "RGB",
    12
);