use crate::numerics::qrshr;
use crate::yuv_error::check_rgba_destination;
use crate::yuv_support::{get_yuv_range, YuvSourceChannels};
use crate::{YuvChromaSubsampling, YuvError, YuvPlanarImage, YuvRange};
use core::f16;
use num_traits::AsPrimitive;
#[cfg(feature = "rayon")]
use rayon::iter::{IndexedParallelIterator, ParallelIterator};
#[cfg(feature = "rayon")]
use rayon::prelude::{ParallelSlice, ParallelSliceMut};
use std::fmt::Debug;
use std::marker::PhantomData;
use std::mem::size_of;
use std::ops::Sub;

/// Converts one row of full-range GBR planes into interleaved `f16` RGB(A).
trait FullRowHandle<V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16> {
    unsafe fn process_row(&self, dst: &mut [f16], y_src: &[V], u_src: &[V], v_src: &[V]);
}

/// Converts an integer sample to `f16`, specialized on the source bit depth.
trait CastableToF16 {
    fn cast_to_f16<const BIT_DEPTH: usize>(self) -> f16;
}

impl CastableToF16 for u16 {
    fn cast_to_f16<const BIT_DEPTH: usize>(self) -> f16 {
        if BIT_DEPTH == 16 {
            // 16-bit samples may exceed i16::MAX, so widen through i32 to avoid wrapping.
            (self as i32) as f16
        } else {
            (self as i16) as f16
        }
    }
}

/// Converts one row of limited (TV) range GBR planes into interleaved `f16` RGB(A);
/// `y_bias` and `y_coef` describe the range expansion applied to every plane.
trait LimitedRowHandle<
    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync,
    J: Copy + Sub<Output = J> + AsPrimitive<i32>,
>
{
    unsafe fn process_row(
        &self,
        dst: &mut [f16],
        y_src: &[V],
        u_src: &[V],
        v_src: &[V],
        y_bias: J,
        y_coef: i16,
    );
}

// Full-range path: samples are only normalized to [0, 1].
// GBR plane order maps the Y slot to G, the U slot to B, and the V slot to R.
macro_rules! exec_cv_full {
    ($dst: expr, $y_src: expr, $u_src: expr, $v_src: expr, $cn: expr, $bit_depth: expr) => {
        let max_value = (1 << $bit_depth) - 1;
        let max_value_f16 = 1f32 as f16;
        let rgb_chunks = $dst.chunks_exact_mut($cn.get_channels_count());
        let scale = (1f32 / max_value as f32) as f16;

        for (((&y_src, &u_src), &v_src), rgb_dst) in
            $y_src.iter().zip($u_src).zip($v_src).zip(rgb_chunks)
        {
            rgb_dst[$cn.get_r_channel_offset()] = v_src.cast_to_f16::<$bit_depth>() * scale;
            rgb_dst[$cn.get_g_channel_offset()] = y_src.cast_to_f16::<$bit_depth>() * scale;
            rgb_dst[$cn.get_b_channel_offset()] = u_src.cast_to_f16::<$bit_depth>() * scale;
            if $cn.has_alpha() {
                rgb_dst[$cn.get_a_channel_offset()] = max_value_f16;
            }
        }
    };
}
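
// Worked example for the full-range macro above (illustrative figures only):
// with BIT_DEPTH = 10 the scale is 1/1023, so a green sample of 512 stored in
// the first ("Y") plane lands in the G slot as roughly 512/1023 ≈ 0.50
// (subject to f16 rounding), and the alpha slot, when present, is set to 1.0.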

macro_rules! exec_cv_limited {
    ($dst: expr, $y_src: expr, $u_src: expr, $v_src: expr, $cn: expr, $bit_depth: expr, $y_bias: expr, $y_coef: expr, $precision: expr) => {
        let max_value = (1 << $bit_depth) - 1;
        let max_value_f16 = 1f32 as f16;
        let rgb_chunks = $dst.chunks_exact_mut($cn.get_channels_count());
        let scale = (1f32 / max_value as f32) as f16;

        for (((&y_src, &u_src), &v_src), rgb_dst) in
            $y_src.iter().zip($u_src).zip($v_src).zip(rgb_chunks)
        {
            rgb_dst[$cn.get_r_channel_offset()] =
                qrshr::<$precision, $bit_depth>((v_src.as_() - $y_bias).as_() * $y_coef as i32)
                    as f16
                    * scale;
            rgb_dst[$cn.get_g_channel_offset()] =
                qrshr::<$precision, $bit_depth>((y_src.as_() - $y_bias).as_() * $y_coef as i32)
                    as f16
                    * scale;
            rgb_dst[$cn.get_b_channel_offset()] =
                qrshr::<$precision, $bit_depth>((u_src.as_() - $y_bias).as_() * $y_coef as i32)
                    as f16
                    * scale;
            if $cn.has_alpha() {
                rgb_dst[$cn.get_a_channel_offset()] = max_value_f16;
            }
        }
    };
}
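
// Illustrative arithmetic for the limited-range macro above, assuming `qrshr`
// performs a rounding right shift by `$precision` and clamps the result to the
// bit-depth maximum, and that `get_yuv_range` reports the usual 10-bit studio
// swing (bias 64, luma range 876): y_coef = (1023 / 876) * 2^13 ≈ 9566, so a
// peak luma sample of 940 expands as (940 - 64) * 9566 ≈ 8_379_816, which the
// rounding shift brings down to 1023, and the final scaling maps it to 1.0.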

#[derive(Default)]
struct DefaultFullRowHandle<
    V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
    const CHANNELS: u8,
    const BIT_DEPTH: usize,
> {
    _phantom: PhantomData<V>,
}

impl<
        V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
        const CHANNELS: u8,
        const BIT_DEPTH: usize,
    > FullRowHandle<V> for DefaultFullRowHandle<V, CHANNELS, BIT_DEPTH>
{
    unsafe fn process_row(&self, dst: &mut [f16], y_src: &[V], u_src: &[V], v_src: &[V]) {
        let cn: YuvSourceChannels = CHANNELS.into();
        exec_cv_full!(dst, y_src, u_src, v_src, cn, BIT_DEPTH);
    }
}

// Same row body as `DefaultFullRowHandle`, but compiled with the aarch64 `fp16`
// feature enabled so f16 conversions can use hardware instructions.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[derive(Default)]
struct DefaultFullRowHandleNeonFp16<
    V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
    const CHANNELS: u8,
    const BIT_DEPTH: usize,
> {
    _phantom: PhantomData<V>,
}

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
impl<
        V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
        const CHANNELS: u8,
        const BIT_DEPTH: usize,
    > FullRowHandle<V> for DefaultFullRowHandleNeonFp16<V, CHANNELS, BIT_DEPTH>
{
    #[target_feature(enable = "fp16")]
    unsafe fn process_row(&self, dst: &mut [f16], y_src: &[V], u_src: &[V], v_src: &[V]) {
        let cn: YuvSourceChannels = CHANNELS.into();
        exec_cv_full!(dst, y_src, u_src, v_src, cn, BIT_DEPTH);
    }
}

// Same row body again, compiled with AVX2 + F16C enabled on x86/x86_64.
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
#[derive(Default)]
struct DefaultFullRowHandleAvxFp16c<
    V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
    const CHANNELS: u8,
    const BIT_DEPTH: usize,
> {
    _phantom: PhantomData<V>,
}

#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
impl<
        V: Copy + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
        const CHANNELS: u8,
        const BIT_DEPTH: usize,
    > FullRowHandle<V> for DefaultFullRowHandleAvxFp16c<V, CHANNELS, BIT_DEPTH>
{
    #[target_feature(enable = "avx2", enable = "f16c")]
    unsafe fn process_row(&self, dst: &mut [f16], y_src: &[V], u_src: &[V], v_src: &[V]) {
        let cn: YuvSourceChannels = CHANNELS.into();
        exec_cv_full!(dst, y_src, u_src, v_src, cn, BIT_DEPTH);
    }
}

#[derive(Default)]
struct DefaultLimitedRowHandle<
    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync + Default,
    J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default,
    const CHANNELS: u8,
    const BIT_DEPTH: usize,
    const PRECISION: i32,
> {
    _phantom: PhantomData<V>,
    _phantom2: PhantomData<J>,
}

impl<
        V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync + Default,
        J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
        const CHANNELS: u8,
        const BIT_DEPTH: usize,
        const PRECISION: i32,
    > LimitedRowHandle<V, J> for DefaultLimitedRowHandle<V, J, CHANNELS, BIT_DEPTH, PRECISION>
{
    unsafe fn process_row(
        &self,
        dst: &mut [f16],
        y_src: &[V],
        u_src: &[V],
        v_src: &[V],
        y_bias: J,
        y_coef: i16,
    ) {
        let cn: YuvSourceChannels = CHANNELS.into();
        exec_cv_limited!(dst, y_src, u_src, v_src, cn, BIT_DEPTH, y_bias, y_coef, PRECISION);
    }
}

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[derive(Default)]
struct DefaultLimitedRowHandleNeonFp16<
    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync,
    J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
    const CHANNELS: u8,
    const BIT_DEPTH: usize,
    const PRECISION: i32,
> {
    _phantom: PhantomData<V>,
    _phantom2: PhantomData<J>,
}

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
impl<
        V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync,
        J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
        const CHANNELS: u8,
        const BIT_DEPTH: usize,
        const PRECISION: i32,
    > LimitedRowHandle<V, J>
    for DefaultLimitedRowHandleNeonFp16<V, J, CHANNELS, BIT_DEPTH, PRECISION>
{
    #[target_feature(enable = "fp16")]
    unsafe fn process_row(
        &self,
        dst: &mut [f16],
        y_src: &[V],
        u_src: &[V],
        v_src: &[V],
        y_bias: J,
        y_coef: i16,
    ) {
        let cn: YuvSourceChannels = CHANNELS.into();
        exec_cv_limited!(dst, y_src, u_src, v_src, cn, BIT_DEPTH, y_bias, y_coef, PRECISION);
    }
}

#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
#[derive(Default)]
struct DefaultLimitedRowHandleAvxFp16c<
    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync,
    J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
    const CHANNELS: u8,
    const BIT_DEPTH: usize,
    const PRECISION: i32,
> {
    _phantom: PhantomData<V>,
    _phantom2: PhantomData<J>,
}

#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
impl<
        V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync,
        J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
        const CHANNELS: u8,
        const BIT_DEPTH: usize,
        const PRECISION: i32,
    > LimitedRowHandle<V, J>
    for DefaultLimitedRowHandleAvxFp16c<V, J, CHANNELS, BIT_DEPTH, PRECISION>
{
    #[target_feature(enable = "avx2", enable = "f16c")]
    unsafe fn process_row(
        &self,
        dst: &mut [f16],
        y_src: &[V],
        u_src: &[V],
        v_src: &[V],
        y_bias: J,
        y_coef: i16,
    ) {
        let cn: YuvSourceChannels = CHANNELS.into();
        exec_cv_limited!(dst, y_src, u_src, v_src, cn, BIT_DEPTH, y_bias, y_coef, PRECISION);
    }
}

#[inline]
fn gbr_to_rgbx_f16_impl<
    V: Copy + AsPrimitive<J> + 'static + Sized + Debug + Send + Sync + Default + CastableToF16,
    J: Copy + Sub<Output = J> + AsPrimitive<i32> + Default + Send + Sync,
    const CHANNELS: u8,
    const BIT_DEPTH: usize,
>(
    image: &YuvPlanarImage<V>,
    rgba: &mut [f16],
    rgba_stride: u32,
    yuv_range: YuvRange,
) -> Result<(), YuvError>
where
    u32: AsPrimitive<J>,
{
    let cn: YuvSourceChannels = CHANNELS.into();
    let channels = cn.get_channels_count();
    assert!(
        channels == 3 || channels == 4,
        "GBR -> RGB is implemented only for 3 and 4 channels"
    );
    assert!(
        (8..=16).contains(&BIT_DEPTH),
        "Invalid bit depth provided"
    );
    assert!(
        if BIT_DEPTH > 8 {
            size_of::<V>() == 2
        } else {
            size_of::<V>() == 1
        },
        "Unsupported bit depth and data type combination"
    );
    let y_plane = image.y_plane;
    let u_plane = image.u_plane;
    let v_plane = image.v_plane;
    let y_stride = image.y_stride as usize;
    let u_stride = image.u_stride as usize;
    let v_stride = image.v_stride as usize;
    let height = image.height;

    image.check_constraints(YuvChromaSubsampling::Yuv444)?;
    check_rgba_destination(rgba, rgba_stride, image.width, height, channels)?;

    let y_iter;
    let rgb_iter;
    let u_iter;
    let v_iter;

    #[cfg(feature = "rayon")]
    {
        y_iter = y_plane.par_chunks_exact(y_stride);
        rgb_iter = rgba.par_chunks_exact_mut(rgba_stride as usize);
        u_iter = u_plane.par_chunks_exact(u_stride);
        v_iter = v_plane.par_chunks_exact(v_stride);
    }
    #[cfg(not(feature = "rayon"))]
    {
        y_iter = y_plane.chunks_exact(y_stride);
        rgb_iter = rgba.chunks_exact_mut(rgba_stride as usize);
        u_iter = u_plane.chunks_exact(u_stride);
        v_iter = v_plane.chunks_exact(v_stride);
    }

    match yuv_range {
        YuvRange::Limited => {
            const PRECISION: i32 = 13;
            let range = get_yuv_range(BIT_DEPTH as u32, yuv_range);
            let range_rgba = (1 << BIT_DEPTH) - 1;
            let y_coef =
                ((range_rgba as f32 / range.range_y as f32) * (1 << PRECISION) as f32) as i16;
            let y_bias = range.bias_y.as_();

            let iter = y_iter.zip(u_iter).zip(v_iter).zip(rgb_iter);

            iter.for_each(|(((y_src, u_src), v_src), rgb)| {
                let y_src = &y_src[0..image.width as usize];
                let mut _row_processor: Box<dyn LimitedRowHandle<V, J> + Send + Sync> =
                    Box::new(DefaultLimitedRowHandle::<
                        V,
                        J,
                        CHANNELS,
                        BIT_DEPTH,
                        PRECISION,
                    >::default());

                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
                if std::arch::is_aarch64_feature_detected!("fp16") {
                    _row_processor = Box::new(DefaultLimitedRowHandleNeonFp16::<
                        V,
                        J,
                        CHANNELS,
                        BIT_DEPTH,
                        PRECISION,
                    >::default());
                }

                #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
                {
                    if std::arch::is_x86_feature_detected!("avx2")
                        && std::arch::is_x86_feature_detected!("f16c")
                    {
                        _row_processor = Box::new(DefaultLimitedRowHandleAvxFp16c::<
                            V,
                            J,
                            CHANNELS,
                            BIT_DEPTH,
                            PRECISION,
                        >::default());
                    }
                }

                unsafe {
                    _row_processor.process_row(rgb, y_src, u_src, v_src, y_bias, y_coef);
                }
            });
        }
        YuvRange::Full => {
            let iter = y_iter.zip(u_iter).zip(v_iter).zip(rgb_iter);

            let mut _row_processor: Box<dyn FullRowHandle<V> + Send + Sync> =
                Box::new(DefaultFullRowHandle::<V, CHANNELS, BIT_DEPTH>::default());

            #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
            if std::arch::is_aarch64_feature_detected!("fp16") {
                _row_processor =
                    Box::new(DefaultFullRowHandleNeonFp16::<V, CHANNELS, BIT_DEPTH>::default());
            }

            #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx"))]
            {
                if std::arch::is_x86_feature_detected!("avx2")
                    && std::arch::is_x86_feature_detected!("f16c")
                {
                    _row_processor =
                        Box::new(DefaultFullRowHandleAvxFp16c::<V, CHANNELS, BIT_DEPTH>::default());
                }
            }

            iter.for_each(|(((y_src, u_src), v_src), rgb)| {
                let y_src = &y_src[0..image.width as usize];
                unsafe {
                    _row_processor.process_row(rgb, y_src, u_src, v_src);
                }
            });
        }
    }

    Ok(())
}

macro_rules! d_cv {
    ($method: ident, $px_fmt: expr, $bit_depth: expr, $rgb_name: expr, $dst_name: ident, $stride_name: ident, $tr: ident) => {
        #[doc = concat!("Convert GBR", $bit_depth," to ", $rgb_name,"F16, IEEE float16 format.

This function takes GBR planar format data with ", stringify!($bit_depth) ," bit precision,
and converts it to ", $rgb_name,"F16 IEEE float16 format.

# Arguments

* `image` - Source GBR", stringify!($bit_depth)," planar image.
* `", stringify!($dst_name),"` - A slice to store the ",$rgb_name,"F16 data.
* `", stringify!($stride_name), "` - The stride (components per row) for the ", $rgb_name,"F16 data.
* `range` - YUV values range.

# Panics

This function panics if the lengths of the planes or the input ",$rgb_name," data are not valid for
the specified width, height, and strides.")]
        pub fn $method(
            image: &YuvPlanarImage<u16>,
            $dst_name: &mut [f16],
            $stride_name: u32,
            range: YuvRange,
        ) -> Result<(), YuvError> {
            gbr_to_rgbx_f16_impl::<u16, $tr, { $px_fmt as u8 }, $bit_depth>(
                image, $dst_name, $stride_name, range,
            )
        }
    };
}
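
// For orientation, the first invocation below expands to roughly this shape
// (a sketch of the macro output, not additional code):
//
// pub fn gb10_to_rgba_f16(
//     image: &YuvPlanarImage<u16>,
//     rgba: &mut [f16],
//     rgba_stride: u32,
//     range: YuvRange,
// ) -> Result<(), YuvError> {
//     gbr_to_rgbx_f16_impl::<u16, i16, { YuvSourceChannels::Rgba as u8 }, 10>(
//         image, rgba, rgba_stride, range,
//     )
// }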

d_cv!(
    gb10_to_rgba_f16,
    YuvSourceChannels::Rgba,
    10,
    "RGBA",
    rgba,
    rgba_stride,
    i16
);
d_cv!(
    gb12_to_rgba_f16,
    YuvSourceChannels::Rgba,
    12,
    "RGBA",
    rgba,
    rgba_stride,
    i16
);
d_cv!(
    gb14_to_rgba_f16,
    YuvSourceChannels::Rgba,
    14,
    "RGBA",
    rgba,
    rgba_stride,
    i16
);
d_cv!(
    gb16_to_rgba_f16,
    YuvSourceChannels::Rgba,
    16,
    "RGBA",
    rgba,
    rgba_stride,
    i32
);

d_cv!(
    gb10_to_rgb_f16,
    YuvSourceChannels::Rgb,
    10,
    "RGB",
    rgb,
    rgb_stride,
    i16
);
d_cv!(
    gb12_to_rgb_f16,
    YuvSourceChannels::Rgb,
    12,
    "RGB",
    rgb,
    rgb_stride,
    i16
);
d_cv!(
    gb14_to_rgb_f16,
    YuvSourceChannels::Rgb,
    14,
    "RGB",
    rgb,
    rgb_stride,
    i16
);
d_cv!(
    gb16_to_rgb_f16,
    YuvSourceChannels::Rgb,
    16,
    "RGB",
    rgb,
    rgb_stride,
    i32
);
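
// A minimal usage sketch, not part of the original module: it assumes
// `YuvPlanarImage` is a plain struct exposing the public fields that
// `gbr_to_rgbx_f16_impl` above reads (`y_plane`, `y_stride`, `u_plane`,
// `u_stride`, `v_plane`, `v_stride`, `width`, `height`); adapt the
// construction if the type provides a dedicated constructor instead.
#[cfg(test)]
mod gbr_to_f16_usage {
    use super::*;

    #[test]
    fn converts_a_2x2_full_range_gbr10_image() {
        // Two rows, two columns of 10-bit GBR data: G in the "Y" plane,
        // B in the "U" plane, R in the "V" plane; stride of 2 samples per row.
        let g_plane: [u16; 4] = [0, 256, 512, 1023];
        let b_plane: [u16; 4] = [1023, 512, 256, 0];
        let r_plane: [u16; 4] = [512, 512, 512, 512];

        let image = YuvPlanarImage {
            y_plane: &g_plane,
            y_stride: 2,
            u_plane: &b_plane,
            u_stride: 2,
            v_plane: &r_plane,
            v_stride: 2,
            width: 2,
            height: 2,
        };

        // 4 components (RGBA) per pixel, 2 pixels per row => stride of 8 components.
        let mut rgba = [0.0 as f16; 2 * 2 * 4];
        gb10_to_rgba_f16(&image, &mut rgba, 8, YuvRange::Full).unwrap();

        // Full range only normalizes by 1/1023; alpha is forced to 1.0.
        assert_eq!(rgba[3] as f32, 1.0); // alpha of the first pixel
        assert_eq!(rgba[1] as f32, 0.0); // G of the first pixel was 0
        assert!(((rgba[0] as f32) - 512.0 / 1023.0).abs() < 1e-3); // R ≈ 0.5
    }
}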