// zenpixels_convert/convert.rs
1//! Row-level pixel conversion kernels.
2//!
3//! Each kernel converts one row of `width` pixels from a source format to
4//! a destination format. Individual step kernels are pure functions with
5//! no allocation. Multi-step plans use [`ConvertScratch`] ping-pong
6//! buffers to avoid per-row heap allocation in streaming loops.
7
8use alloc::vec;
9use alloc::vec::Vec;
10use core::cmp::min;
11
12use crate::policy::{AlphaPolicy, ConvertOptions, DepthPolicy};
13use crate::{
14    AlphaMode, ChannelLayout, ChannelType, ColorPrimaries, ConvertError, PixelDescriptor,
15    TransferFunction,
16};
17use whereat::{At, ResultAtExt};
18
/// Pre-computed conversion plan.
///
/// Stores the chain of steps needed to convert from one format to another.
/// Created once, applied to every row.
#[derive(Clone, Debug)]
pub struct ConvertPlan {
    // Source pixel format this plan converts from.
    pub(crate) from: PixelDescriptor,
    // Destination pixel format this plan converts to.
    pub(crate) to: PixelDescriptor,
    // Ordered kernel steps, applied left-to-right to each row.
    // Always non-empty: identity plans hold a single `ConvertStep::Identity`.
    pub(crate) steps: Vec<ConvertStep>,
}
29
/// A single conversion step.
///
/// Not `Copy` — some variants (e.g., [`ExternalTransform`]) carry an
/// `Arc`. Peephole rewrites must use `.clone()` or index assignment with
/// pattern matching instead of `*step` dereferences.
///
/// NOTE(review): no `ExternalTransform` (or any `Arc`-carrying) variant is
/// visible in the enum as written below — it may be feature-gated elsewhere
/// or this paragraph may be stale. Confirm before deriving `Copy`.
///
/// [`ExternalTransform`]: ConvertStep::ExternalTransform
#[derive(Clone)]
pub(crate) enum ConvertStep {
    /// No-op (identity).
    Identity,
    /// BGRA → RGBA byte swizzle (or vice versa).
    SwizzleBgraRgba,
    /// Fused RGB → BGRA: byte swap + add opaque alpha in a single SIMD pass.
    /// Equivalent to `[AddAlpha, SwizzleBgraRgba]` but writes the destination
    /// once instead of twice.
    RgbToBgra,
    /// Add alpha channel (3ch → 4ch), filling with opaque.
    AddAlpha,
    /// Drop alpha channel (4ch → 3ch).
    DropAlpha,
    /// Composite onto solid matte color, then drop alpha (4ch → 3ch).
    ///
    /// Blends in linear light: src and matte are converted from sRGB to linear,
    /// alpha-blended, then converted back. For f32 data, pixel values are
    /// assumed already linear; only the sRGB u8 matte is linearized.
    MatteComposite { r: u8, g: u8, b: u8 },
    /// Gray → RGB (replicate gray to all 3 channels).
    GrayToRgb,
    /// Gray → RGBA (replicate + opaque alpha).
    GrayToRgba,
    /// RGB → Gray (BT.709 luma).
    RgbToGray,
    /// RGBA → Gray (BT.709 luma, drop alpha).
    RgbaToGray,
    /// GrayAlpha → RGBA (replicate gray, keep alpha).
    GrayAlphaToRgba,
    /// GrayAlpha → RGB (replicate gray, drop alpha).
    GrayAlphaToRgb,
    /// Gray → GrayAlpha (add opaque alpha).
    GrayToGrayAlpha,
    /// GrayAlpha → Gray (drop alpha).
    GrayAlphaToGray,
    /// sRGB u8 → linear f32 (per channel, EOTF).
    SrgbU8ToLinearF32,
    /// Linear f32 → sRGB u8 (per channel, OETF).
    LinearF32ToSrgbU8,
    /// Naive u8 → f32 (v / 255.0, no gamma).
    NaiveU8ToF32,
    /// Naive f32 → u8 (clamp * 255 + 0.5, no gamma).
    NaiveF32ToU8,
    /// u16 → u8 ((v * 255 + 32768) >> 16).
    U16ToU8,
    /// u8 → u16 (v * 257).
    U8ToU16,
    /// u16 → f32 (v / 65535.0).
    U16ToF32,
    /// f32 → u16 (clamp * 65535 + 0.5).
    F32ToU16,
    /// PQ (SMPTE ST 2084) u16 → linear f32 (EOTF).
    PqU16ToLinearF32,
    /// Linear f32 → PQ u16 (inverse EOTF / OETF).
    LinearF32ToPqU16,
    /// PQ f32 [0,1] → linear f32 (EOTF, no depth change).
    PqF32ToLinearF32,
    /// Linear f32 → PQ f32 [0,1] (OETF, no depth change).
    LinearF32ToPqF32,
    /// HLG (ARIB STD-B67) u16 → linear f32 (EOTF).
    HlgU16ToLinearF32,
    /// Linear f32 → HLG u16 (OETF).
    LinearF32ToHlgU16,
    /// HLG f32 [0,1] → linear f32 (EOTF, no depth change).
    HlgF32ToLinearF32,
    /// Linear f32 → HLG f32 [0,1] (OETF, no depth change).
    LinearF32ToHlgF32,
    /// sRGB f32 [0,1] → linear f32 (EOTF, no depth change). Clamps input.
    SrgbF32ToLinearF32,
    /// Linear f32 → sRGB f32 [0,1] (OETF, no depth change). Clamps output.
    LinearF32ToSrgbF32,
    /// sRGB f32 → linear f32 (EOTF, sign-preserving extended range).
    /// Emitted when `ConvertOptions::clip_out_of_gamut == false`.
    SrgbF32ToLinearF32Extended,
    /// Linear f32 → sRGB f32 (OETF, sign-preserving extended range).
    LinearF32ToSrgbF32Extended,
    /// BT.709 f32 [0,1] → linear f32 (EOTF, no depth change).
    Bt709F32ToLinearF32,
    /// Linear f32 → BT.709 f32 [0,1] (OETF, no depth change).
    LinearF32ToBt709F32,
    /// Gamma 2.2 (Adobe RGB 1998) f32 [0,1] → linear f32 (EOTF, no depth change).
    /// Uses the Adobe RGB 1998 canonical exponent 563/256 ≈ 2.19921875.
    Gamma22F32ToLinearF32,
    /// Linear f32 → Gamma 2.2 (Adobe RGB 1998) f32 [0,1] (OETF, no depth change).
    LinearF32ToGamma22F32,
    /// Straight → Premultiplied alpha.
    StraightToPremul,
    /// Premultiplied → Straight alpha.
    PremulToStraight,
    /// Linear RGB f32 → Oklab f32 (3-channel color model change).
    LinearRgbToOklab,
    /// Oklab f32 → Linear RGB f32 (3-channel color model change).
    OklabToLinearRgb,
    /// Linear RGBA f32 → Oklaba f32 (4-channel, alpha preserved).
    LinearRgbaToOklaba,
    /// Oklaba f32 → Linear RGBA f32 (4-channel, alpha preserved).
    OklabaToLinearRgba,
    /// Apply a 3×3 gamut matrix to linear RGB f32 (3 channels per pixel).
    ///
    /// Used for color primaries conversion (e.g., BT.709 ↔ Display P3 ↔ BT.2020).
    /// Data must be in linear light. The matrix is row-major `[[f32; 3]; 3]`
    /// flattened to `[f32; 9]`.
    GamutMatrixRgbF32([f32; 9]),
    /// Apply a 3×3 gamut matrix to linear RGBA f32 (4 channels, alpha passthrough).
    GamutMatrixRgbaF32([f32; 9]),
    /// Fused u8-sRGB RGB primaries conversion: LUT linearize → SIMD matrix →
    /// SIMD f32→i32 → LUT encode, in one pass. Replaces the 3-step sequence
    /// `[SrgbU8ToLinearF32, GamutMatrixRgbF32(m), LinearF32ToSrgbU8]`.
    FusedSrgbU8GamutRgb([f32; 9]),
    /// Fused u8-sRGB RGBA primaries conversion (alpha passthrough).
    FusedSrgbU8GamutRgba([f32; 9]),
    /// Fused u16-sRGB RGB primaries conversion via 65K-entry LUTs.
    FusedSrgbU16GamutRgb([f32; 9]),
    /// Fused u8-sRGB → linear-f32 RGB primaries conversion (cross-depth).
    /// Output preserves extended range (no clamp).
    FusedSrgbU8ToLinearF32Rgb([f32; 9]),
    /// Fused linear-f32 → u8-sRGB RGB primaries conversion (cross-depth).
    /// Always clamps since u8 can't represent out-of-gamut values.
    FusedLinearF32ToSrgbU8Rgb([f32; 9]),
}
158
159impl core::fmt::Debug for ConvertStep {
160    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
161        match self {
162            Self::Identity => f.write_str("Identity"),
163            Self::SwizzleBgraRgba => f.write_str("SwizzleBgraRgba"),
164            Self::RgbToBgra => f.write_str("RgbToBgra"),
165            Self::AddAlpha => f.write_str("AddAlpha"),
166            Self::DropAlpha => f.write_str("DropAlpha"),
167            Self::MatteComposite { r, g, b } => f
168                .debug_struct("MatteComposite")
169                .field("r", r)
170                .field("g", g)
171                .field("b", b)
172                .finish(),
173            Self::GrayToRgb => f.write_str("GrayToRgb"),
174            Self::GrayToRgba => f.write_str("GrayToRgba"),
175            Self::RgbToGray => f.write_str("RgbToGray"),
176            Self::RgbaToGray => f.write_str("RgbaToGray"),
177            Self::GrayAlphaToRgba => f.write_str("GrayAlphaToRgba"),
178            Self::GrayAlphaToRgb => f.write_str("GrayAlphaToRgb"),
179            Self::GrayToGrayAlpha => f.write_str("GrayToGrayAlpha"),
180            Self::GrayAlphaToGray => f.write_str("GrayAlphaToGray"),
181            Self::SrgbU8ToLinearF32 => f.write_str("SrgbU8ToLinearF32"),
182            Self::LinearF32ToSrgbU8 => f.write_str("LinearF32ToSrgbU8"),
183            Self::NaiveU8ToF32 => f.write_str("NaiveU8ToF32"),
184            Self::NaiveF32ToU8 => f.write_str("NaiveF32ToU8"),
185            Self::U16ToU8 => f.write_str("U16ToU8"),
186            Self::U8ToU16 => f.write_str("U8ToU16"),
187            Self::U16ToF32 => f.write_str("U16ToF32"),
188            Self::F32ToU16 => f.write_str("F32ToU16"),
189            Self::PqU16ToLinearF32 => f.write_str("PqU16ToLinearF32"),
190            Self::LinearF32ToPqU16 => f.write_str("LinearF32ToPqU16"),
191            Self::PqF32ToLinearF32 => f.write_str("PqF32ToLinearF32"),
192            Self::LinearF32ToPqF32 => f.write_str("LinearF32ToPqF32"),
193            Self::HlgU16ToLinearF32 => f.write_str("HlgU16ToLinearF32"),
194            Self::LinearF32ToHlgU16 => f.write_str("LinearF32ToHlgU16"),
195            Self::HlgF32ToLinearF32 => f.write_str("HlgF32ToLinearF32"),
196            Self::LinearF32ToHlgF32 => f.write_str("LinearF32ToHlgF32"),
197            Self::SrgbF32ToLinearF32 => f.write_str("SrgbF32ToLinearF32"),
198            Self::LinearF32ToSrgbF32 => f.write_str("LinearF32ToSrgbF32"),
199            Self::SrgbF32ToLinearF32Extended => f.write_str("SrgbF32ToLinearF32Extended"),
200            Self::LinearF32ToSrgbF32Extended => f.write_str("LinearF32ToSrgbF32Extended"),
201            Self::Bt709F32ToLinearF32 => f.write_str("Bt709F32ToLinearF32"),
202            Self::LinearF32ToBt709F32 => f.write_str("LinearF32ToBt709F32"),
203            Self::Gamma22F32ToLinearF32 => f.write_str("Gamma22F32ToLinearF32"),
204            Self::LinearF32ToGamma22F32 => f.write_str("LinearF32ToGamma22F32"),
205            Self::StraightToPremul => f.write_str("StraightToPremul"),
206            Self::PremulToStraight => f.write_str("PremulToStraight"),
207            Self::LinearRgbToOklab => f.write_str("LinearRgbToOklab"),
208            Self::OklabToLinearRgb => f.write_str("OklabToLinearRgb"),
209            Self::LinearRgbaToOklaba => f.write_str("LinearRgbaToOklaba"),
210            Self::OklabaToLinearRgba => f.write_str("OklabaToLinearRgba"),
211            Self::GamutMatrixRgbF32(m) => f.debug_tuple("GamutMatrixRgbF32").field(m).finish(),
212            Self::GamutMatrixRgbaF32(m) => f.debug_tuple("GamutMatrixRgbaF32").field(m).finish(),
213            Self::FusedSrgbU8GamutRgb(m) => f.debug_tuple("FusedSrgbU8GamutRgb").field(m).finish(),
214            Self::FusedSrgbU8GamutRgba(m) => {
215                f.debug_tuple("FusedSrgbU8GamutRgba").field(m).finish()
216            }
217            Self::FusedSrgbU16GamutRgb(m) => {
218                f.debug_tuple("FusedSrgbU16GamutRgb").field(m).finish()
219            }
220            Self::FusedSrgbU8ToLinearF32Rgb(m) => {
221                f.debug_tuple("FusedSrgbU8ToLinearF32Rgb").field(m).finish()
222            }
223            Self::FusedLinearF32ToSrgbU8Rgb(m) => {
224                f.debug_tuple("FusedLinearF32ToSrgbU8Rgb").field(m).finish()
225            }
226        }
227    }
228}
229
230/// Assert that a descriptor is not CMYK.
231///
232/// CMYK is device-dependent and cannot be converted by zenpixels-convert.
233/// Use a CMS (e.g., moxcms) with an ICC profile for CMYK↔RGB conversion.
234fn assert_not_cmyk(desc: &PixelDescriptor) {
235    assert!(
236        desc.color_model() != crate::ColorModel::Cmyk,
237        "CMYK pixel data cannot be processed by zenpixels-convert. \
238         Use a CMS (e.g., moxcms) with an ICC profile for CMYK↔RGB conversion."
239    );
240}
241
242impl ConvertPlan {
243    /// Create a conversion plan from `from` to `to`.
244    ///
245    /// Returns `Err` if no conversion path exists.
246    ///
247    /// # Panics
248    ///
249    /// Panics if either `from` or `to` uses [`ColorModel::Cmyk`](zenpixels::ColorModel::Cmyk).
250    /// CMYK requires a CMS with an ICC profile for conversion.
251    #[track_caller]
252    pub fn new(from: PixelDescriptor, to: PixelDescriptor) -> Result<Self, At<ConvertError>> {
253        assert_not_cmyk(&from);
254        assert_not_cmyk(&to);
255        if from == to {
256            return Ok(Self {
257                from,
258                to,
259                steps: vec![ConvertStep::Identity],
260            });
261        }
262
263        let mut steps = Vec::with_capacity(3);
264
265        // Step 1: Layout conversion (within same depth class).
266        // Step 2: Depth conversion.
267        // Step 3: Alpha mode conversion.
268        //
269        // For cross-depth conversions, we convert layout at the source depth
270        // first, then change depth. This minimizes the number of channels
271        // we need to depth-convert.
272
273        let need_depth_change = from.channel_type() != to.channel_type();
274        let need_layout_change = from.layout() != to.layout();
275        let need_alpha_change =
276            from.alpha() != to.alpha() && from.alpha().is_some() && to.alpha().is_some();
277
278        // Depth/TF steps are needed when depth changes, or when transfer
279        // functions differ (at any depth — integer TF changes route through
280        // an F32 linear intermediate, handled in `depth_steps`).
281        let need_depth_or_tf = need_depth_change || from.transfer() != to.transfer();
282
283        // If we need to change depth AND layout, plan the optimal order.
284        if need_layout_change {
285            // When going to fewer channels, convert layout first (less depth work).
286            // When going to more channels, convert depth first (less layout work).
287            //
288            // Exception: Oklab layout steps require f32 data. When the source
289            // is integer (U8/U16) and the layout change involves Oklab, we must
290            // convert depth first regardless of channel count.
291            let src_ch = from.layout().channels();
292            let dst_ch = to.layout().channels();
293            let involves_oklab =
294                matches!(from.layout(), ChannelLayout::Oklab | ChannelLayout::OklabA)
295                    || matches!(to.layout(), ChannelLayout::Oklab | ChannelLayout::OklabA);
296
297            // Oklab conversion requires known primaries for the RGB→LMS matrix.
298            if involves_oklab && from.primaries == ColorPrimaries::Unknown {
299                return Err(whereat::at!(ConvertError::NoPath { from, to }));
300            }
301
302            let depth_first = need_depth_or_tf
303                && (dst_ch > src_ch || (involves_oklab && from.channel_type() != ChannelType::F32));
304
305            if depth_first {
306                // Depth first, then layout.
307                steps.extend(
308                    depth_steps(
309                        from.channel_type(),
310                        to.channel_type(),
311                        from.transfer(),
312                        to.transfer(),
313                    )
314                    .map_err(|e| whereat::at!(e))?,
315                );
316                steps.extend(layout_steps(from.layout(), to.layout()));
317            } else {
318                // Layout first, then depth.
319                steps.extend(layout_steps(from.layout(), to.layout()));
320                if need_depth_or_tf {
321                    steps.extend(
322                        depth_steps(
323                            from.channel_type(),
324                            to.channel_type(),
325                            from.transfer(),
326                            to.transfer(),
327                        )
328                        .map_err(|e| whereat::at!(e))?,
329                    );
330                }
331            }
332        } else if need_depth_or_tf {
333            steps.extend(
334                depth_steps(
335                    from.channel_type(),
336                    to.channel_type(),
337                    from.transfer(),
338                    to.transfer(),
339                )
340                .map_err(|e| whereat::at!(e))?,
341            );
342        }
343
344        // Alpha mode conversion (if both have alpha and modes differ).
345        if need_alpha_change {
346            match (from.alpha(), to.alpha()) {
347                (Some(AlphaMode::Straight), Some(AlphaMode::Premultiplied)) => {
348                    steps.push(ConvertStep::StraightToPremul);
349                }
350                (Some(AlphaMode::Premultiplied), Some(AlphaMode::Straight)) => {
351                    steps.push(ConvertStep::PremulToStraight);
352                }
353                _ => {}
354            }
355        }
356
357        // Primaries conversion: if source and destination have different known
358        // primaries, inject a gamut matrix in linear f32 space.
359        let need_primaries = from.primaries != to.primaries
360            && from.primaries != ColorPrimaries::Unknown
361            && to.primaries != ColorPrimaries::Unknown;
362
363        if need_primaries
364            && let Some(matrix) = crate::gamut::conversion_matrix(from.primaries, to.primaries)
365        {
366            // Flatten the 3×3 matrix for storage in the step enum.
367            let flat = [
368                matrix[0][0],
369                matrix[0][1],
370                matrix[0][2],
371                matrix[1][0],
372                matrix[1][1],
373                matrix[1][2],
374                matrix[2][0],
375                matrix[2][1],
376                matrix[2][2],
377            ];
378
379            // The gamut matrix must be applied in linear f32 space.
380            // Check if the existing steps already go through linear f32.
381            let mut goes_through_linear = false;
382            {
383                let mut desc = from;
384                for step in &steps {
385                    desc = intermediate_desc(desc, step);
386                    if desc.channel_type() == ChannelType::F32
387                        && desc.transfer() == TransferFunction::Linear
388                    {
389                        goes_through_linear = true;
390                    }
391                }
392            }
393
394            if goes_through_linear {
395                // Insert the gamut matrix right after the first step that
396                // produces linear f32. All subsequent steps encode to the
397                // target format.
398                let mut insert_pos = 0;
399                let mut desc = from;
400                for (i, step) in steps.iter().enumerate() {
401                    desc = intermediate_desc(desc, step);
402                    if desc.channel_type() == ChannelType::F32
403                        && desc.transfer() == TransferFunction::Linear
404                    {
405                        insert_pos = i + 1;
406                        break;
407                    }
408                }
409                let gamut_step = if desc.layout().has_alpha() {
410                    ConvertStep::GamutMatrixRgbaF32(flat)
411                } else {
412                    ConvertStep::GamutMatrixRgbF32(flat)
413                };
414                steps.insert(insert_pos, gamut_step);
415            } else {
416                // No existing linear f32 step — we must add linearize → gamut → delinearize.
417                // Determine layout for the gamut step.
418                let has_alpha = from.layout().has_alpha() || to.layout().has_alpha();
419                // Use the layout at the current point in the plan.
420                let mut desc = from;
421                for step in &steps {
422                    desc = intermediate_desc(desc, step);
423                }
424                let gamut_step = if desc.layout().has_alpha() || has_alpha {
425                    ConvertStep::GamutMatrixRgbaF32(flat)
426                } else {
427                    ConvertStep::GamutMatrixRgbF32(flat)
428                };
429
430                // Insert linearize → gamut → encode-to-target-tf at the end,
431                // before any alpha mode steps.
432                let linearize = match desc.transfer() {
433                    TransferFunction::Srgb => ConvertStep::SrgbF32ToLinearF32,
434                    TransferFunction::Bt709 => ConvertStep::Bt709F32ToLinearF32,
435                    TransferFunction::Pq => ConvertStep::PqF32ToLinearF32,
436                    TransferFunction::Hlg => ConvertStep::HlgF32ToLinearF32,
437                    TransferFunction::Gamma22 => ConvertStep::Gamma22F32ToLinearF32,
438                    TransferFunction::Linear => ConvertStep::Identity,
439                    _ => ConvertStep::SrgbF32ToLinearF32, // assume sRGB for Unknown
440                };
441                let to_target_tf = match to.transfer() {
442                    TransferFunction::Srgb => ConvertStep::LinearF32ToSrgbF32,
443                    TransferFunction::Bt709 => ConvertStep::LinearF32ToBt709F32,
444                    TransferFunction::Pq => ConvertStep::LinearF32ToPqF32,
445                    TransferFunction::Hlg => ConvertStep::LinearF32ToHlgF32,
446                    TransferFunction::Gamma22 => ConvertStep::LinearF32ToGamma22F32,
447                    TransferFunction::Linear => ConvertStep::Identity,
448                    _ => ConvertStep::LinearF32ToSrgbF32, // assume sRGB for Unknown
449                };
450
451                // Need to be in f32 first. If current is integer, add naive conversion.
452                let mut gamut_steps = Vec::new();
453                // Direct fused-step emissions for common cases.
454                if desc.channel_type() == ChannelType::U16
455                    && desc.transfer() == TransferFunction::Srgb
456                    && to.channel_type() == ChannelType::U16
457                    && to.transfer() == TransferFunction::Srgb
458                    && !desc.layout().has_alpha()
459                    && !to.layout().has_alpha()
460                {
461                    // u16 sRGB → u16 sRGB RGB: single-step matlut.
462                    gamut_steps.push(ConvertStep::FusedSrgbU16GamutRgb(flat));
463                    steps.extend(gamut_steps);
464                    if steps.is_empty() {
465                        steps.push(ConvertStep::Identity);
466                    }
467                    fuse_matlut_patterns(&mut steps);
468                    return Ok(Self { from, to, steps });
469                }
470                if desc.channel_type() == ChannelType::U8
471                    && matches!(desc.transfer(), TransferFunction::Srgb)
472                    && to.channel_type() == ChannelType::F32
473                    && to.transfer() == TransferFunction::Linear
474                    && !desc.layout().has_alpha()
475                    && !to.layout().has_alpha()
476                {
477                    // u8 sRGB → linear f32 RGB: cross-depth matlut.
478                    gamut_steps.push(ConvertStep::FusedSrgbU8ToLinearF32Rgb(flat));
479                    steps.extend(gamut_steps);
480                    if steps.is_empty() {
481                        steps.push(ConvertStep::Identity);
482                    }
483                    fuse_matlut_patterns(&mut steps);
484                    return Ok(Self { from, to, steps });
485                }
486                if desc.channel_type() == ChannelType::F32
487                    && desc.transfer() == TransferFunction::Linear
488                    && to.channel_type() == ChannelType::U8
489                    && to.transfer() == TransferFunction::Srgb
490                    && !desc.layout().has_alpha()
491                    && !to.layout().has_alpha()
492                {
493                    // linear f32 → u8 sRGB RGB: cross-depth matlut.
494                    gamut_steps.push(ConvertStep::FusedLinearF32ToSrgbU8Rgb(flat));
495                    steps.extend(gamut_steps);
496                    if steps.is_empty() {
497                        steps.push(ConvertStep::Identity);
498                    }
499                    fuse_matlut_patterns(&mut steps);
500                    return Ok(Self { from, to, steps });
501                }
502                if desc.channel_type() != ChannelType::F32 {
503                    // Use the fused sRGB u8→linear f32 if applicable.
504                    if desc.channel_type() == ChannelType::U8
505                        && matches!(
506                            desc.transfer(),
507                            TransferFunction::Srgb
508                                | TransferFunction::Bt709
509                                | TransferFunction::Unknown
510                        )
511                    {
512                        gamut_steps.push(ConvertStep::SrgbU8ToLinearF32);
513                        // Already linear, skip separate linearize.
514                        gamut_steps.push(gamut_step);
515                        gamut_steps.push(ConvertStep::LinearF32ToSrgbU8);
516                    } else if desc.channel_type() == ChannelType::U16
517                        && desc.transfer() == TransferFunction::Pq
518                    {
519                        gamut_steps.push(ConvertStep::PqU16ToLinearF32);
520                        gamut_steps.push(gamut_step);
521                        gamut_steps.push(ConvertStep::LinearF32ToPqU16);
522                    } else if desc.channel_type() == ChannelType::U16
523                        && desc.transfer() == TransferFunction::Hlg
524                    {
525                        gamut_steps.push(ConvertStep::HlgU16ToLinearF32);
526                        gamut_steps.push(gamut_step);
527                        gamut_steps.push(ConvertStep::LinearF32ToHlgU16);
528                    } else {
529                        // Generic: naive to f32, linearize, gamut, delinearize, naive back
530                        gamut_steps.push(ConvertStep::NaiveU8ToF32);
531                        if !matches!(linearize, ConvertStep::Identity) {
532                            gamut_steps.push(linearize);
533                        }
534                        gamut_steps.push(gamut_step);
535                        if !matches!(to_target_tf, ConvertStep::Identity) {
536                            gamut_steps.push(to_target_tf);
537                        }
538                        gamut_steps.push(ConvertStep::NaiveF32ToU8);
539                    }
540                } else {
541                    // Already f32, just linearize → gamut → encode
542                    if !matches!(linearize, ConvertStep::Identity) {
543                        gamut_steps.push(linearize);
544                    }
545                    gamut_steps.push(gamut_step);
546                    if !matches!(to_target_tf, ConvertStep::Identity) {
547                        gamut_steps.push(to_target_tf);
548                    }
549                }
550
551                steps.extend(gamut_steps);
552            }
553        }
554
555        if steps.is_empty() {
556            // Transfer-only difference or alpha-mode-only: identity path.
557            steps.push(ConvertStep::Identity);
558        }
559
560        // Peephole fusion: collapse common 3-step patterns into single fused
561        // kernels that avoid scratch-buffer round-trips.
562        fuse_matlut_patterns(&mut steps);
563
564        Ok(Self { from, to, steps })
565    }
566
    /// Create a conversion plan with explicit policy enforcement.
    ///
    /// Validates that the planned conversion steps are allowed by the given
    /// policies before creating the plan. Returns an error if a forbidden
    /// operation would be required.
    ///
    /// # Panics
    ///
    /// Panics if either `from` or `to` uses [`ColorModel::Cmyk`](zenpixels::ColorModel::Cmyk).
    /// CMYK requires a CMS with an ICC profile for conversion.
    #[track_caller]
    pub fn new_explicit(
        from: PixelDescriptor,
        to: PixelDescriptor,
        options: &ConvertOptions,
    ) -> Result<Self, At<ConvertError>> {
        assert_not_cmyk(&from);
        assert_not_cmyk(&to);
        // Check alpha removal policy.
        let drops_alpha = from.alpha().is_some() && to.alpha().is_none();
        if drops_alpha && options.alpha_policy == AlphaPolicy::Forbid {
            return Err(whereat::at!(ConvertError::AlphaRemovalForbidden));
        }

        // Check depth reduction policy.
        let reduces_depth = from.channel_type().byte_size() > to.channel_type().byte_size();
        if reduces_depth && options.depth_policy == DepthPolicy::Forbid {
            return Err(whereat::at!(ConvertError::DepthReductionForbidden));
        }

        // Check RGB→Gray requires luma coefficients.
        let src_is_rgb = matches!(
            from.layout(),
            ChannelLayout::Rgb | ChannelLayout::Rgba | ChannelLayout::Bgra
        );
        let dst_is_gray = matches!(to.layout(), ChannelLayout::Gray | ChannelLayout::GrayAlpha);
        if src_is_rgb && dst_is_gray && options.luma.is_none() {
            return Err(whereat::at!(ConvertError::RgbToGray));
        }

        // All policy checks passed — build the unconstrained plan, then
        // post-process its steps below according to the options.
        let mut plan = Self::new(from, to).at()?;

        // Replace DropAlpha with MatteComposite when policy is CompositeOnto.
        //
        // The `matte_composite` kernel uses the straight-alpha over operator
        // `fg*a + bg*(1-a)` after decoding to linear light. If the source is
        // premultiplied (our library's convention is encoded-space premul,
        // per Canvas 2D), feeding its bytes into the straight kernel would
        // multiply by `a` twice, producing `straight*a² + bg*(1-a)`.
        // Fix: un-premultiply first (in the source byte space, matching how
        // our StraightToPremul/PremulToStraight kernels operate).
        if drops_alpha && let AlphaPolicy::CompositeOnto { r, g, b } = options.alpha_policy {
            let src_is_premul = from.alpha() == Some(AlphaMode::Premultiplied);
            // Manual index loop: the Vec is mutated (insert) while scanning,
            // which an iterator would not allow. `idx` is advanced past any
            // inserted step so it is never re-examined.
            let mut idx = 0;
            while idx < plan.steps.len() {
                if matches!(plan.steps[idx], ConvertStep::DropAlpha) {
                    plan.steps[idx] = ConvertStep::MatteComposite { r, g, b };
                    if src_is_premul {
                        // Insert the un-premultiply immediately before the
                        // composite; bump idx so it points at the composite
                        // again and the outer `idx += 1` moves past it.
                        plan.steps.insert(idx, ConvertStep::PremulToStraight);
                        idx += 1;
                    }
                }
                idx += 1;
            }
        }

        // When the caller opts out of clipping, swap pure-f32 sRGB transfer
        // steps for their sign-preserving extended-range counterparts.
        // Fused u8/u16 matlut steps are unaffected (integer I/O can't
        // represent extended range anyway).
        if !options.clip_out_of_gamut {
            for step in &mut plan.steps {
                match step {
                    ConvertStep::SrgbF32ToLinearF32 => {
                        *step = ConvertStep::SrgbF32ToLinearF32Extended;
                    }
                    ConvertStep::LinearF32ToSrgbF32 => {
                        *step = ConvertStep::LinearF32ToSrgbF32Extended;
                    }
                    _ => {}
                }
            }
        }

        Ok(plan)
    }
653
654    /// Create a shell plan that records from/to but has no conversion steps.
655    ///
656    /// Used when an external CMS transform handles the conversion — the
657    /// plan exists only for `from()`/`to()` metadata; the actual row
658    /// work is driven by the external transform stored on `RowConverter`.
659    pub(crate) fn identity(from: PixelDescriptor, to: PixelDescriptor) -> Self {
660        Self {
661            from,
662            to,
663            steps: vec![ConvertStep::Identity],
664        }
665    }
666
667    /// Compose two plans into one: apply `self` then `other`.
668    ///
669    /// The composed plan executes both conversions in a single `convert_row`
670    /// call, using one intermediate buffer instead of two. Adjacent inverse
671    /// steps are cancelled (e.g., `SrgbU8ToLinearF32` + `LinearF32ToSrgbU8`
672    /// → identity).
673    ///
674    /// Returns `None` if `self.to` != `other.from` (incompatible plans).
675    pub fn compose(&self, other: &Self) -> Option<Self> {
676        if self.to != other.from {
677            return None;
678        }
679
680        let mut steps = self.steps.clone();
681
682        // Append other's steps, skipping its Identity if present.
683        for step in &other.steps {
684            if matches!(step, ConvertStep::Identity) {
685                continue;
686            }
687            steps.push(step.clone());
688        }
689
690        // Peephole: cancel adjacent inverse pairs.
691        let mut changed = true;
692        while changed {
693            changed = false;
694            let mut i = 0;
695            while i + 1 < steps.len() {
696                if are_inverse(&steps[i], &steps[i + 1]) {
697                    steps.remove(i + 1);
698                    steps.remove(i);
699                    changed = true;
700                    // Don't advance — check the new adjacent pair.
701                } else {
702                    i += 1;
703                }
704            }
705        }
706
707        // If everything cancelled, produce identity.
708        if steps.is_empty() {
709            steps.push(ConvertStep::Identity);
710        }
711
712        // Remove leading/trailing Identity if there are real steps.
713        if steps.len() > 1 {
714            steps.retain(|s| !matches!(s, ConvertStep::Identity));
715            if steps.is_empty() {
716                steps.push(ConvertStep::Identity);
717            }
718        }
719
720        Some(Self {
721            from: self.from,
722            to: other.to,
723            steps,
724        })
725    }
726
727    /// True if conversion is a no-op.
728    #[must_use]
729    pub fn is_identity(&self) -> bool {
730        self.steps.len() == 1 && matches!(self.steps[0], ConvertStep::Identity)
731    }
732
733    /// Maximum bytes-per-pixel across all intermediate formats in the plan.
734    ///
735    /// Used to pre-allocate scratch buffers for streaming conversion.
736    pub(crate) fn max_intermediate_bpp(&self) -> usize {
737        let mut desc = self.from;
738        let mut max_bpp = desc.bytes_per_pixel();
739        for step in &self.steps {
740            desc = intermediate_desc(desc, step);
741            max_bpp = max_bpp.max(desc.bytes_per_pixel());
742        }
743        max_bpp
744    }
745
    /// Source (input) descriptor this plan converts from.
    pub fn from(&self) -> PixelDescriptor {
        self.from
    }
750
    /// Target (output) descriptor this plan converts to.
    pub fn to(&self) -> PixelDescriptor {
        self.to
    }
755}
756
/// Determine the layout conversion step(s).
///
/// Some layout conversions require multiple steps (e.g., BGRA -> RGB needs
/// swizzle + drop alpha; Oklab <-> BGRA needs three). Returns up to 3 steps.
///
/// Returns an empty `Vec` both when `from == to` (nothing to do) and for
/// unsupported layout pairs — callers cannot distinguish the two cases from
/// the return value alone.
fn layout_steps(from: ChannelLayout, to: ChannelLayout) -> Vec<ConvertStep> {
    if from == to {
        return Vec::new();
    }
    match (from, to) {
        (ChannelLayout::Bgra, ChannelLayout::Rgba) | (ChannelLayout::Rgba, ChannelLayout::Bgra) => {
            vec![ConvertStep::SwizzleBgraRgba]
        }
        (ChannelLayout::Rgb, ChannelLayout::Rgba) => vec![ConvertStep::AddAlpha],
        (ChannelLayout::Rgb, ChannelLayout::Bgra) => {
            // Single fused SIMD pass (garb::bytes::rgb_to_bgra). For non-u8
            // channel types `apply_step_u8` falls back to AddAlpha+Swizzle.
            vec![ConvertStep::RgbToBgra]
        }
        (ChannelLayout::Rgba, ChannelLayout::Rgb) => vec![ConvertStep::DropAlpha],
        (ChannelLayout::Bgra, ChannelLayout::Rgb) => {
            // BGRA -> RGBA -> RGB: swizzle then drop alpha.
            vec![ConvertStep::SwizzleBgraRgba, ConvertStep::DropAlpha]
        }
        (ChannelLayout::Gray, ChannelLayout::Rgb) => vec![ConvertStep::GrayToRgb],
        (ChannelLayout::Gray, ChannelLayout::Rgba) => vec![ConvertStep::GrayToRgba],
        (ChannelLayout::Gray, ChannelLayout::Bgra) => {
            // Gray -> RGBA -> BGRA: expand then swizzle.
            vec![ConvertStep::GrayToRgba, ConvertStep::SwizzleBgraRgba]
        }
        (ChannelLayout::Rgb, ChannelLayout::Gray) => vec![ConvertStep::RgbToGray],
        (ChannelLayout::Rgba, ChannelLayout::Gray) => vec![ConvertStep::RgbaToGray],
        (ChannelLayout::Bgra, ChannelLayout::Gray) => {
            // BGRA -> RGBA -> Gray: swizzle then to gray.
            vec![ConvertStep::SwizzleBgraRgba, ConvertStep::RgbaToGray]
        }
        (ChannelLayout::GrayAlpha, ChannelLayout::Rgba) => vec![ConvertStep::GrayAlphaToRgba],
        (ChannelLayout::GrayAlpha, ChannelLayout::Bgra) => {
            // GrayAlpha -> RGBA -> BGRA: expand then swizzle.
            vec![ConvertStep::GrayAlphaToRgba, ConvertStep::SwizzleBgraRgba]
        }
        (ChannelLayout::GrayAlpha, ChannelLayout::Rgb) => vec![ConvertStep::GrayAlphaToRgb],
        (ChannelLayout::Gray, ChannelLayout::GrayAlpha) => vec![ConvertStep::GrayToGrayAlpha],
        (ChannelLayout::GrayAlpha, ChannelLayout::Gray) => vec![ConvertStep::GrayAlphaToGray],

        // Oklab ↔ RGB conversions (via linear RGB).
        (ChannelLayout::Rgb, ChannelLayout::Oklab) => vec![ConvertStep::LinearRgbToOklab],
        (ChannelLayout::Oklab, ChannelLayout::Rgb) => vec![ConvertStep::OklabToLinearRgb],
        (ChannelLayout::Rgba, ChannelLayout::OklabA) => vec![ConvertStep::LinearRgbaToOklaba],
        (ChannelLayout::OklabA, ChannelLayout::Rgba) => vec![ConvertStep::OklabaToLinearRgba],

        // Oklab ↔ RGB with alpha add/drop.
        (ChannelLayout::Rgb, ChannelLayout::OklabA) => {
            vec![ConvertStep::AddAlpha, ConvertStep::LinearRgbaToOklaba]
        }
        (ChannelLayout::OklabA, ChannelLayout::Rgb) => {
            vec![ConvertStep::OklabaToLinearRgba, ConvertStep::DropAlpha]
        }
        (ChannelLayout::Oklab, ChannelLayout::Rgba) => {
            vec![ConvertStep::OklabToLinearRgb, ConvertStep::AddAlpha]
        }
        (ChannelLayout::Rgba, ChannelLayout::Oklab) => {
            vec![ConvertStep::DropAlpha, ConvertStep::LinearRgbToOklab]
        }

        // Oklab ↔ BGRA (swizzle to/from RGBA, then Oklab). Three steps max.
        (ChannelLayout::Bgra, ChannelLayout::OklabA) => {
            vec![
                ConvertStep::SwizzleBgraRgba,
                ConvertStep::LinearRgbaToOklaba,
            ]
        }
        (ChannelLayout::OklabA, ChannelLayout::Bgra) => {
            vec![
                ConvertStep::OklabaToLinearRgba,
                ConvertStep::SwizzleBgraRgba,
            ]
        }
        (ChannelLayout::Bgra, ChannelLayout::Oklab) => {
            vec![
                ConvertStep::SwizzleBgraRgba,
                ConvertStep::DropAlpha,
                ConvertStep::LinearRgbToOklab,
            ]
        }
        (ChannelLayout::Oklab, ChannelLayout::Bgra) => {
            vec![
                ConvertStep::OklabToLinearRgb,
                ConvertStep::AddAlpha,
                ConvertStep::SwizzleBgraRgba,
            ]
        }

        // Gray ↔ Oklab (expand gray to RGB first).
        (ChannelLayout::Gray, ChannelLayout::Oklab) => {
            vec![ConvertStep::GrayToRgb, ConvertStep::LinearRgbToOklab]
        }
        (ChannelLayout::Oklab, ChannelLayout::Gray) => {
            vec![ConvertStep::OklabToLinearRgb, ConvertStep::RgbToGray]
        }
        (ChannelLayout::Gray, ChannelLayout::OklabA) => {
            vec![ConvertStep::GrayToRgba, ConvertStep::LinearRgbaToOklaba]
        }
        (ChannelLayout::OklabA, ChannelLayout::Gray) => {
            vec![ConvertStep::OklabaToLinearRgba, ConvertStep::RgbaToGray]
        }
        (ChannelLayout::GrayAlpha, ChannelLayout::OklabA) => {
            vec![
                ConvertStep::GrayAlphaToRgba,
                ConvertStep::LinearRgbaToOklaba,
            ]
        }
        (ChannelLayout::OklabA, ChannelLayout::GrayAlpha) => {
            // Drop alpha from OklabA→Oklab, convert to RGB, then to GrayAlpha.
            // Alpha is lost; this is inherently lossy.
            vec![
                ConvertStep::OklabaToLinearRgba,
                ConvertStep::RgbaToGray,
                ConvertStep::GrayToGrayAlpha,
            ]
        }
        (ChannelLayout::GrayAlpha, ChannelLayout::Oklab) => {
            vec![ConvertStep::GrayAlphaToRgb, ConvertStep::LinearRgbToOklab]
        }
        (ChannelLayout::Oklab, ChannelLayout::GrayAlpha) => {
            vec![
                ConvertStep::OklabToLinearRgb,
                ConvertStep::RgbToGray,
                ConvertStep::GrayToGrayAlpha,
            ]
        }

        // Oklab ↔ alpha variants.
        (ChannelLayout::Oklab, ChannelLayout::OklabA) => vec![ConvertStep::AddAlpha],
        (ChannelLayout::OklabA, ChannelLayout::Oklab) => vec![ConvertStep::DropAlpha],

        _ => Vec::new(), // Unsupported layout conversion — same return as `from == to`.
    }
}
895
896/// F32→F32 linearize step for a transfer function, or `None` if the TF is
897/// already linear (or Unknown — caller decides how to handle Unknown).
898fn f32_linearize_step(tf: TransferFunction) -> Option<ConvertStep> {
899    match tf {
900        TransferFunction::Linear => None,
901        TransferFunction::Srgb => Some(ConvertStep::SrgbF32ToLinearF32),
902        TransferFunction::Bt709 => Some(ConvertStep::Bt709F32ToLinearF32),
903        TransferFunction::Pq => Some(ConvertStep::PqF32ToLinearF32),
904        TransferFunction::Hlg => Some(ConvertStep::HlgF32ToLinearF32),
905        TransferFunction::Gamma22 => Some(ConvertStep::Gamma22F32ToLinearF32),
906        TransferFunction::Unknown => None,
907        _ => None,
908    }
909}
910
911/// F32→F32 OETF step for a transfer function, or `None` if the TF is linear
912/// (or Unknown).
913fn f32_encode_step(tf: TransferFunction) -> Option<ConvertStep> {
914    match tf {
915        TransferFunction::Linear => None,
916        TransferFunction::Srgb => Some(ConvertStep::LinearF32ToSrgbF32),
917        TransferFunction::Bt709 => Some(ConvertStep::LinearF32ToBt709F32),
918        TransferFunction::Pq => Some(ConvertStep::LinearF32ToPqF32),
919        TransferFunction::Hlg => Some(ConvertStep::LinearF32ToHlgF32),
920        TransferFunction::Gamma22 => Some(ConvertStep::LinearF32ToGamma22F32),
921        TransferFunction::Unknown => None,
922        _ => None,
923    }
924}
925
926/// F32→F32 TF-change steps: linearize (if not already linear) then encode
927/// (if target is not linear).
928///
929/// Returns empty when `from == to`, or when either side is `Unknown` — when
930/// one side's TF is unknown we can't mechanically compute a correct
931/// conversion, so we preserve bytes as-is. Addressing the Unknown ambiguity
932/// via explicit opt-in API is tracked as issue #19 [C]/[D] (deprecate-and-add).
933fn f32_tf_pair_steps(from: TransferFunction, to: TransferFunction) -> Vec<ConvertStep> {
934    if from == to || from == TransferFunction::Unknown || to == TransferFunction::Unknown {
935        return Vec::new();
936    }
937    let mut steps = Vec::with_capacity(2);
938    if let Some(s) = f32_linearize_step(from) {
939        steps.push(s);
940    }
941    if let Some(s) = f32_encode_step(to) {
942        steps.push(s);
943    }
944    steps
945}
946
947/// Integer→F32 scale step for a given integer channel type. Panics for F32
948/// (caller must check); CMYK is rejected upstream by `assert_not_cmyk`.
949fn int_to_f32_step(ct: ChannelType) -> ConvertStep {
950    match ct {
951        ChannelType::U8 => ConvertStep::NaiveU8ToF32,
952        ChannelType::U16 => ConvertStep::U16ToF32,
953        _ => unreachable!("int_to_f32_step called with non-integer channel type"),
954    }
955}
956
957/// F32→integer scale step.
958fn f32_to_int_step(ct: ChannelType) -> ConvertStep {
959    match ct {
960        ChannelType::U8 => ConvertStep::NaiveF32ToU8,
961        ChannelType::U16 => ConvertStep::F32ToU16,
962        _ => unreachable!("f32_to_int_step called with non-integer channel type"),
963    }
964}
965
/// Determine the depth conversion step(s), considering transfer functions.
///
/// Returns one or more steps. Multi-step conversions route through an F32
/// linear intermediate (e.g. PQ U16 → sRGB U8 goes PQ U16 → Linear F32 →
/// sRGB U8), and same-depth integer TF changes route through an F32 linear
/// intermediate too: passing integer bytes through unchanged under a new
/// TF label produces wrong pixels.
fn depth_steps(
    from: ChannelType,
    to: ChannelType,
    from_tf: TransferFunction,
    to_tf: TransferFunction,
) -> Result<Vec<ConvertStep>, ConvertError> {
    // Nothing to do: same depth, same transfer curve.
    if from == to && from_tf == to_tf {
        return Ok(Vec::new());
    }

    // Same depth, F32: apply EOTF/OETF in place.
    if from == to && from == ChannelType::F32 {
        return Ok(f32_tf_pair_steps(from_tf, to_tf));
    }

    // Same depth, integer: TF change requires re-encoding. Route through F32
    // linear intermediate — passing bytes through labeled as a different TF
    // produces wrong pixels.
    //
    // Exception: if either TF is Unknown, we don't know the correct conversion.
    // Preserve bytes exactly (no F32 round-trip — that would introduce U8/U16
    // rounding error for no semantic benefit). Addressed properly by issue
    // #19 [C]/[D] via opt-in deprecate-and-add.
    if from == to && from != ChannelType::F32 {
        if from_tf == TransferFunction::Unknown || to_tf == TransferFunction::Unknown {
            return Ok(Vec::new());
        }
        let mut steps = Vec::with_capacity(4);
        steps.push(int_to_f32_step(from));
        steps.extend(f32_tf_pair_steps(from_tf, to_tf));
        steps.push(f32_to_int_step(to));
        return Ok(steps);
    }

    match (from, to) {
        (ChannelType::U8, ChannelType::F32) => {
            // Fused sRGB EOTF kernel — sRGB only. BT.709 uses a different EOTF
            // (~17% linear-light error at mid-gray if we routed it through the
            // sRGB kernel) and must compose through the F32 BT.709 EOTF step.
            if from_tf == TransferFunction::Srgb && to_tf == TransferFunction::Linear {
                Ok(vec![ConvertStep::SrgbU8ToLinearF32])
            } else if from_tf == to_tf {
                Ok(vec![ConvertStep::NaiveU8ToF32])
            } else {
                // Cross-depth + cross-TF: linearize/encode after the U8→F32 scale.
                // Previously dropped the TF math and returned bytes labeled with
                // the target TF — silent wrong pixels for any TF pair other than
                // {Srgb,Bt709}→Linear.
                let mut steps = Vec::with_capacity(3);
                steps.push(ConvertStep::NaiveU8ToF32);
                steps.extend(f32_tf_pair_steps(from_tf, to_tf));
                Ok(steps)
            }
        }
        (ChannelType::F32, ChannelType::U8) => {
            // Fused sRGB OETF kernel — sRGB only (same reason as above).
            if from_tf == TransferFunction::Linear && to_tf == TransferFunction::Srgb {
                Ok(vec![ConvertStep::LinearF32ToSrgbU8])
            } else if from_tf == to_tf {
                Ok(vec![ConvertStep::NaiveF32ToU8])
            } else {
                // Linearize/encode in F32 first, then compress to U8.
                let mut steps = f32_tf_pair_steps(from_tf, to_tf);
                steps.push(ConvertStep::NaiveF32ToU8);
                Ok(steps)
            }
        }
        (ChannelType::U16, ChannelType::F32) => {
            // PQ/HLG U16 → Linear F32: apply EOTF during conversion.
            match (from_tf, to_tf) {
                (TransferFunction::Pq, TransferFunction::Linear) => {
                    Ok(vec![ConvertStep::PqU16ToLinearF32])
                }
                (TransferFunction::Hlg, TransferFunction::Linear) => {
                    Ok(vec![ConvertStep::HlgU16ToLinearF32])
                }
                // Same TF on both sides: plain scale, no TF math.
                (a, b) if a == b => Ok(vec![ConvertStep::U16ToF32]),
                _ => {
                    let mut steps = Vec::with_capacity(3);
                    steps.push(ConvertStep::U16ToF32);
                    steps.extend(f32_tf_pair_steps(from_tf, to_tf));
                    Ok(steps)
                }
            }
        }
        (ChannelType::F32, ChannelType::U16) => {
            // Linear F32 → PQ/HLG U16: apply OETF during conversion.
            match (from_tf, to_tf) {
                (TransferFunction::Linear, TransferFunction::Pq) => {
                    Ok(vec![ConvertStep::LinearF32ToPqU16])
                }
                (TransferFunction::Linear, TransferFunction::Hlg) => {
                    Ok(vec![ConvertStep::LinearF32ToHlgU16])
                }
                // Same TF on both sides: plain scale, no TF math.
                (a, b) if a == b => Ok(vec![ConvertStep::F32ToU16]),
                _ => {
                    let mut steps = f32_tf_pair_steps(from_tf, to_tf);
                    steps.push(ConvertStep::F32ToU16);
                    Ok(steps)
                }
            }
        }
        (ChannelType::U16, ChannelType::U8) => {
            // HDR U16 → SDR U8: go through linear F32 with proper EOTF → OETF.
            if from_tf == TransferFunction::Pq && to_tf == TransferFunction::Srgb {
                Ok(vec![
                    ConvertStep::PqU16ToLinearF32,
                    ConvertStep::LinearF32ToSrgbU8,
                ])
            } else if from_tf == TransferFunction::Hlg && to_tf == TransferFunction::Srgb {
                Ok(vec![
                    ConvertStep::HlgU16ToLinearF32,
                    ConvertStep::LinearF32ToSrgbU8,
                ])
            } else if from_tf == to_tf {
                Ok(vec![ConvertStep::U16ToU8])
            } else {
                let mut steps = Vec::with_capacity(4);
                steps.push(ConvertStep::U16ToF32);
                steps.extend(f32_tf_pair_steps(from_tf, to_tf));
                steps.push(ConvertStep::NaiveF32ToU8);
                Ok(steps)
            }
        }
        (ChannelType::U8, ChannelType::U16) => {
            if from_tf == to_tf {
                Ok(vec![ConvertStep::U8ToU16])
            } else {
                let mut steps = Vec::with_capacity(4);
                steps.push(ConvertStep::NaiveU8ToF32);
                steps.extend(f32_tf_pair_steps(from_tf, to_tf));
                steps.push(ConvertStep::F32ToU16);
                Ok(steps)
            }
        }
        _ => Err(ConvertError::NoPath {
            // Rgb layout / no alpha are placeholders: this helper only sees
            // channel types and TFs, but `NoPath` carries full descriptors.
            from: PixelDescriptor::new(from, ChannelLayout::Rgb, None, from_tf),
            to: PixelDescriptor::new(to, ChannelLayout::Rgb, None, to_tf),
        }),
    }
}
1114
1115// ---------------------------------------------------------------------------
1116// Row conversion kernels
1117// ---------------------------------------------------------------------------
1118
/// Pre-allocated scratch buffer for multi-step row conversions.
///
/// Eliminates per-row heap allocation by reusing two ping-pong halves
/// of a single buffer across calls. Create once per [`ConvertPlan`],
/// then pass to `convert_row_buffered` for each row.
pub(crate) struct ConvertScratch {
    /// Single allocation split into two halves via `split_at_mut`.
    /// Stored as `Vec<u32>` to guarantee 4-byte alignment, which lets
    /// garb and bytemuck use fast aligned paths instead of unaligned fallbacks.
    /// Only ever grown by `ensure_capacity`, never shrunk, so capacity is
    /// monotonic across rows and plans.
    buf: Vec<u32>,
}
1130
1131impl ConvertScratch {
1132    /// Create empty scratch (buffer grows on first use).
1133    pub(crate) fn new() -> Self {
1134        Self { buf: Vec::new() }
1135    }
1136
1137    /// Ensure the buffer is large enough for two halves of the max
1138    /// intermediate format at the given width.
1139    fn ensure_capacity(&mut self, plan: &ConvertPlan, width: u32) {
1140        let half_bytes = (width as usize) * plan.max_intermediate_bpp();
1141        let total_u32 = (half_bytes * 2).div_ceil(4);
1142        if self.buf.len() < total_u32 {
1143            self.buf.resize(total_u32, 0);
1144        }
1145    }
1146}
1147
1148impl core::fmt::Debug for ConvertScratch {
1149    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1150        f.debug_struct("ConvertScratch")
1151            .field("capacity", &self.buf.capacity())
1152            .finish()
1153    }
1154}
1155
1156/// Convert one row of `width` pixels using a pre-computed plan.
1157///
1158/// `src` and `dst` must be sized for `width` pixels in their respective formats.
1159/// For multi-step plans, an internal scratch buffer is allocated per call.
1160/// Prefer [`RowConverter`](crate::RowConverter) in hot loops (reuses scratch buffers).
1161pub fn convert_row(plan: &ConvertPlan, src: &[u8], dst: &mut [u8], width: u32) {
1162    if plan.is_identity() {
1163        let len = min(src.len(), dst.len());
1164        dst[..len].copy_from_slice(&src[..len]);
1165        return;
1166    }
1167
1168    if plan.steps.len() == 1 {
1169        apply_step_u8(&plan.steps[0], src, dst, width, plan.from, plan.to);
1170        return;
1171    }
1172
1173    // Allocating fallback for one-off calls.
1174    let mut scratch = ConvertScratch::new();
1175    convert_row_buffered(plan, src, dst, width, &mut scratch);
1176}
1177
/// Convert one row of `width` pixels, reusing pre-allocated scratch buffers.
///
/// For multi-step plans this avoids per-row heap allocation by ping-ponging
/// between two halves of a scratch buffer. Single-step plans bypass scratch.
pub(crate) fn convert_row_buffered(
    plan: &ConvertPlan,
    src: &[u8],
    dst: &mut [u8],
    width: u32,
    scratch: &mut ConvertScratch,
) {
    // Fast paths mirror `convert_row`: identity copy and single-step direct.
    if plan.is_identity() {
        let len = min(src.len(), dst.len());
        dst[..len].copy_from_slice(&src[..len]);
        return;
    }

    if plan.steps.len() == 1 {
        apply_step_u8(&plan.steps[0], src, dst, width, plan.from, plan.to);
        return;
    }

    scratch.ensure_capacity(plan, width);

    // Two equally-sized halves of one allocation serve as the ping-pong pair.
    let buf_bytes: &mut [u8] = bytemuck::cast_slice_mut(&mut scratch.buf);
    let half = buf_bytes.len() / 2;
    let (buf_a, buf_b) = buf_bytes.split_at_mut(half);

    let num_steps = plan.steps.len();
    let mut current_desc = plan.from;

    for (i, step) in plan.steps.iter().enumerate() {
        let is_last = i == num_steps - 1;
        // The final step must land exactly on the plan's target descriptor;
        // intermediate steps use the descriptor inferred from the step.
        let next_desc = if is_last {
            plan.to
        } else {
            intermediate_desc(current_desc, step)
        };

        let next_len = (width as usize) * next_desc.bytes_per_pixel();
        let curr_len = (width as usize) * current_desc.bytes_per_pixel();

        // Ping-pong: even steps read src/buf_b and write buf_a;
        // odd steps read buf_a and write buf_b. Each branch only
        // borrows each half in one mode, satisfying the borrow checker.
        if i % 2 == 0 {
            let input = if i == 0 { src } else { &buf_b[..curr_len] };
            if is_last {
                apply_step_u8(step, input, dst, width, current_desc, next_desc);
            } else {
                apply_step_u8(
                    step,
                    input,
                    &mut buf_a[..next_len],
                    width,
                    current_desc,
                    next_desc,
                );
            }
        } else {
            let input = &buf_a[..curr_len];
            if is_last {
                apply_step_u8(step, input, dst, width, current_desc, next_desc);
            } else {
                apply_step_u8(
                    step,
                    input,
                    &mut buf_b[..next_len],
                    width,
                    current_desc,
                    next_desc,
                );
            }
        }

        current_desc = next_desc;
    }
}
1256
1257/// Check if two steps are inverses that cancel each other.
1258/// Collapse `[SrgbU8ToLinearF32, GamutMatrix*F32(m), LinearF32ToSrgbU8]`
1259/// into a single fused matlut step. Mutates in place.
1260fn fuse_matlut_patterns(steps: &mut Vec<ConvertStep>) {
1261    let mut i = 0;
1262    while i + 2 < steps.len() {
1263        let rewrite = match (&steps[i], &steps[i + 1], &steps[i + 2]) {
1264            (
1265                ConvertStep::SrgbU8ToLinearF32,
1266                ConvertStep::GamutMatrixRgbF32(m),
1267                ConvertStep::LinearF32ToSrgbU8,
1268            ) => Some(ConvertStep::FusedSrgbU8GamutRgb(*m)),
1269            (
1270                ConvertStep::SrgbU8ToLinearF32,
1271                ConvertStep::GamutMatrixRgbaF32(m),
1272                ConvertStep::LinearF32ToSrgbU8,
1273            ) => Some(ConvertStep::FusedSrgbU8GamutRgba(*m)),
1274            _ => None,
1275        };
1276        if let Some(fused) = rewrite {
1277            steps[i] = fused;
1278            steps.drain(i + 1..i + 3);
1279            continue;
1280        }
1281        i += 1;
1282    }
1283}
1284
/// True if step `a` immediately followed by step `b` cancels to a no-op,
/// so the pair can be removed by the peephole pass in `ConvertPlan::compose`.
///
/// "Inverse" is deliberately loose: the f32↔f32 pairs are exact inverses in
/// float, while the cross-depth pairs are near-lossless round-trips (they
/// quantize through U8/U16), as noted on each group below.
fn are_inverse(a: &ConvertStep, b: &ConvertStep) -> bool {
    matches!(
        (a, b),
        // Self-inverse
        (ConvertStep::SwizzleBgraRgba, ConvertStep::SwizzleBgraRgba)
        // Layout inverses (lossless for opaque data; note DropAlpha→AddAlpha
        // is intentionally absent — that direction loses alpha)
        | (ConvertStep::AddAlpha, ConvertStep::DropAlpha)
        // Transfer function f32↔f32 (exact inverses in float)
        | (ConvertStep::SrgbF32ToLinearF32, ConvertStep::LinearF32ToSrgbF32)
        | (ConvertStep::LinearF32ToSrgbF32, ConvertStep::SrgbF32ToLinearF32)
        | (ConvertStep::PqF32ToLinearF32, ConvertStep::LinearF32ToPqF32)
        | (ConvertStep::LinearF32ToPqF32, ConvertStep::PqF32ToLinearF32)
        | (ConvertStep::HlgF32ToLinearF32, ConvertStep::LinearF32ToHlgF32)
        | (ConvertStep::LinearF32ToHlgF32, ConvertStep::HlgF32ToLinearF32)
        | (ConvertStep::Bt709F32ToLinearF32, ConvertStep::LinearF32ToBt709F32)
        | (ConvertStep::LinearF32ToBt709F32, ConvertStep::Bt709F32ToLinearF32)
        | (ConvertStep::Gamma22F32ToLinearF32, ConvertStep::LinearF32ToGamma22F32)
        | (ConvertStep::LinearF32ToGamma22F32, ConvertStep::Gamma22F32ToLinearF32)
        // Alpha mode (exact inverses in float)
        | (ConvertStep::StraightToPremul, ConvertStep::PremulToStraight)
        | (ConvertStep::PremulToStraight, ConvertStep::StraightToPremul)
        // Color model (exact inverses in float)
        | (ConvertStep::LinearRgbToOklab, ConvertStep::OklabToLinearRgb)
        | (ConvertStep::OklabToLinearRgb, ConvertStep::LinearRgbToOklab)
        | (ConvertStep::LinearRgbaToOklaba, ConvertStep::OklabaToLinearRgba)
        | (ConvertStep::OklabaToLinearRgba, ConvertStep::LinearRgbaToOklaba)
        // Cross-depth pairs (near-lossless for same depth class)
        | (ConvertStep::NaiveU8ToF32, ConvertStep::NaiveF32ToU8)
        | (ConvertStep::NaiveF32ToU8, ConvertStep::NaiveU8ToF32)
        | (ConvertStep::U8ToU16, ConvertStep::U16ToU8)
        | (ConvertStep::U16ToU8, ConvertStep::U8ToU16)
        | (ConvertStep::U16ToF32, ConvertStep::F32ToU16)
        | (ConvertStep::F32ToU16, ConvertStep::U16ToF32)
        // Cross-depth with transfer (near-lossless roundtrip)
        | (ConvertStep::SrgbU8ToLinearF32, ConvertStep::LinearF32ToSrgbU8)
        | (ConvertStep::LinearF32ToSrgbU8, ConvertStep::SrgbU8ToLinearF32)
        | (ConvertStep::PqU16ToLinearF32, ConvertStep::LinearF32ToPqU16)
        | (ConvertStep::LinearF32ToPqU16, ConvertStep::PqU16ToLinearF32)
        | (ConvertStep::HlgU16ToLinearF32, ConvertStep::LinearF32ToHlgU16)
        | (ConvertStep::LinearF32ToHlgU16, ConvertStep::HlgU16ToLinearF32)
        // Extended-range sRGB f32 pairs
        | (ConvertStep::SrgbF32ToLinearF32Extended, ConvertStep::LinearF32ToSrgbF32Extended)
        | (ConvertStep::LinearF32ToSrgbF32Extended, ConvertStep::SrgbF32ToLinearF32Extended)
    )
}
1330
1331/// Compute the descriptor after applying one step.
1332fn intermediate_desc(current: PixelDescriptor, step: &ConvertStep) -> PixelDescriptor {
1333    match step {
1334        ConvertStep::Identity => current,
1335        ConvertStep::SwizzleBgraRgba => {
1336            let new_layout = match current.layout() {
1337                ChannelLayout::Bgra => ChannelLayout::Rgba,
1338                ChannelLayout::Rgba => ChannelLayout::Bgra,
1339                other => other,
1340            };
1341            PixelDescriptor::new(
1342                current.channel_type(),
1343                new_layout,
1344                current.alpha(),
1345                current.transfer(),
1346            )
1347        }
1348        ConvertStep::AddAlpha => PixelDescriptor::new(
1349            current.channel_type(),
1350            ChannelLayout::Rgba,
1351            Some(AlphaMode::Straight),
1352            current.transfer(),
1353        ),
1354        ConvertStep::RgbToBgra => PixelDescriptor::new(
1355            current.channel_type(),
1356            ChannelLayout::Bgra,
1357            Some(AlphaMode::Straight),
1358            current.transfer(),
1359        ),
1360        ConvertStep::DropAlpha | ConvertStep::MatteComposite { .. } => PixelDescriptor::new(
1361            current.channel_type(),
1362            ChannelLayout::Rgb,
1363            None,
1364            current.transfer(),
1365        ),
1366        ConvertStep::GrayToRgb => PixelDescriptor::new(
1367            current.channel_type(),
1368            ChannelLayout::Rgb,
1369            None,
1370            current.transfer(),
1371        ),
1372        ConvertStep::GrayToRgba => PixelDescriptor::new(
1373            current.channel_type(),
1374            ChannelLayout::Rgba,
1375            Some(AlphaMode::Straight),
1376            current.transfer(),
1377        ),
1378        ConvertStep::RgbToGray | ConvertStep::RgbaToGray => PixelDescriptor::new(
1379            current.channel_type(),
1380            ChannelLayout::Gray,
1381            None,
1382            current.transfer(),
1383        ),
1384        ConvertStep::GrayAlphaToRgba => PixelDescriptor::new(
1385            current.channel_type(),
1386            ChannelLayout::Rgba,
1387            current.alpha(),
1388            current.transfer(),
1389        ),
1390        ConvertStep::GrayAlphaToRgb => PixelDescriptor::new(
1391            current.channel_type(),
1392            ChannelLayout::Rgb,
1393            None,
1394            current.transfer(),
1395        ),
1396        ConvertStep::GrayToGrayAlpha => PixelDescriptor::new(
1397            current.channel_type(),
1398            ChannelLayout::GrayAlpha,
1399            Some(AlphaMode::Straight),
1400            current.transfer(),
1401        ),
1402        ConvertStep::GrayAlphaToGray => PixelDescriptor::new(
1403            current.channel_type(),
1404            ChannelLayout::Gray,
1405            None,
1406            current.transfer(),
1407        ),
1408        ConvertStep::SrgbU8ToLinearF32
1409        | ConvertStep::NaiveU8ToF32
1410        | ConvertStep::U16ToF32
1411        | ConvertStep::PqU16ToLinearF32
1412        | ConvertStep::HlgU16ToLinearF32
1413        | ConvertStep::PqF32ToLinearF32
1414        | ConvertStep::HlgF32ToLinearF32
1415        | ConvertStep::SrgbF32ToLinearF32
1416        | ConvertStep::SrgbF32ToLinearF32Extended
1417        | ConvertStep::Bt709F32ToLinearF32
1418        | ConvertStep::Gamma22F32ToLinearF32 => PixelDescriptor::new(
1419            ChannelType::F32,
1420            current.layout(),
1421            current.alpha(),
1422            TransferFunction::Linear,
1423        ),
1424        ConvertStep::LinearF32ToSrgbU8 | ConvertStep::NaiveF32ToU8 | ConvertStep::U16ToU8 => {
1425            PixelDescriptor::new(
1426                ChannelType::U8,
1427                current.layout(),
1428                current.alpha(),
1429                TransferFunction::Srgb,
1430            )
1431        }
1432        ConvertStep::U8ToU16 => PixelDescriptor::new(
1433            ChannelType::U16,
1434            current.layout(),
1435            current.alpha(),
1436            current.transfer(),
1437        ),
1438        ConvertStep::F32ToU16 | ConvertStep::LinearF32ToPqU16 | ConvertStep::LinearF32ToHlgU16 => {
1439            let tf = match step {
1440                ConvertStep::LinearF32ToPqU16 => TransferFunction::Pq,
1441                ConvertStep::LinearF32ToHlgU16 => TransferFunction::Hlg,
1442                _ => current.transfer(),
1443            };
1444            PixelDescriptor::new(ChannelType::U16, current.layout(), current.alpha(), tf)
1445        }
1446        ConvertStep::LinearF32ToPqF32 => PixelDescriptor::new(
1447            ChannelType::F32,
1448            current.layout(),
1449            current.alpha(),
1450            TransferFunction::Pq,
1451        ),
1452        ConvertStep::LinearF32ToHlgF32 => PixelDescriptor::new(
1453            ChannelType::F32,
1454            current.layout(),
1455            current.alpha(),
1456            TransferFunction::Hlg,
1457        ),
1458        ConvertStep::LinearF32ToSrgbF32 | ConvertStep::LinearF32ToSrgbF32Extended => {
1459            PixelDescriptor::new(
1460                ChannelType::F32,
1461                current.layout(),
1462                current.alpha(),
1463                TransferFunction::Srgb,
1464            )
1465        }
1466        ConvertStep::LinearF32ToBt709F32 => PixelDescriptor::new(
1467            ChannelType::F32,
1468            current.layout(),
1469            current.alpha(),
1470            TransferFunction::Bt709,
1471        ),
1472        ConvertStep::LinearF32ToGamma22F32 => PixelDescriptor::new(
1473            ChannelType::F32,
1474            current.layout(),
1475            current.alpha(),
1476            TransferFunction::Gamma22,
1477        ),
1478        ConvertStep::StraightToPremul => PixelDescriptor::new(
1479            current.channel_type(),
1480            current.layout(),
1481            Some(AlphaMode::Premultiplied),
1482            current.transfer(),
1483        ),
1484        ConvertStep::PremulToStraight => PixelDescriptor::new(
1485            current.channel_type(),
1486            current.layout(),
1487            Some(AlphaMode::Straight),
1488            current.transfer(),
1489        ),
1490        ConvertStep::LinearRgbToOklab => PixelDescriptor::new(
1491            ChannelType::F32,
1492            ChannelLayout::Oklab,
1493            None,
1494            TransferFunction::Unknown,
1495        )
1496        .with_primaries(current.primaries),
1497        ConvertStep::OklabToLinearRgb => PixelDescriptor::new(
1498            ChannelType::F32,
1499            ChannelLayout::Rgb,
1500            None,
1501            TransferFunction::Linear,
1502        )
1503        .with_primaries(current.primaries),
1504        ConvertStep::LinearRgbaToOklaba => PixelDescriptor::new(
1505            ChannelType::F32,
1506            ChannelLayout::OklabA,
1507            Some(AlphaMode::Straight),
1508            TransferFunction::Unknown,
1509        )
1510        .with_primaries(current.primaries),
1511        ConvertStep::OklabaToLinearRgba => PixelDescriptor::new(
1512            ChannelType::F32,
1513            ChannelLayout::Rgba,
1514            current.alpha(),
1515            TransferFunction::Linear,
1516        )
1517        .with_primaries(current.primaries),
1518
1519        // Gamut matrix: same depth/layout/TF, but primaries change.
1520        // The actual target primaries are embedded in the matrix, not tracked
1521        // here — we mark them as Unknown since the step doesn't carry that info.
1522        // The final plan.to descriptor has the correct primaries.
1523        ConvertStep::GamutMatrixRgbF32(_) => PixelDescriptor::new(
1524            ChannelType::F32,
1525            current.layout(),
1526            current.alpha(),
1527            TransferFunction::Linear,
1528        ),
1529        ConvertStep::GamutMatrixRgbaF32(_) => PixelDescriptor::new(
1530            ChannelType::F32,
1531            current.layout(),
1532            current.alpha(),
1533            TransferFunction::Linear,
1534        ),
1535        // Fused steps: u8 sRGB in, u8 sRGB out (same layout, same alpha).
1536        ConvertStep::FusedSrgbU8GamutRgb(_) | ConvertStep::FusedSrgbU8GamutRgba(_) => {
1537            PixelDescriptor::new(
1538                ChannelType::U8,
1539                current.layout(),
1540                current.alpha(),
1541                TransferFunction::Srgb,
1542            )
1543        }
1544        ConvertStep::FusedSrgbU16GamutRgb(_) => PixelDescriptor::new(
1545            ChannelType::U16,
1546            current.layout(),
1547            current.alpha(),
1548            TransferFunction::Srgb,
1549        ),
1550        ConvertStep::FusedSrgbU8ToLinearF32Rgb(_) => PixelDescriptor::new(
1551            ChannelType::F32,
1552            current.layout(),
1553            current.alpha(),
1554            TransferFunction::Linear,
1555        ),
1556        ConvertStep::FusedLinearF32ToSrgbU8Rgb(_) => PixelDescriptor::new(
1557            ChannelType::U8,
1558            current.layout(),
1559            current.alpha(),
1560            TransferFunction::Srgb,
1561        ),
1562    }
1563}
1564
// The kernel implementations live in a sibling file, `convert_kernels.rs`,
// compiled here as a private submodule. The `#[path]` attribute overrides
// the default module-file lookup (`convert_kernels/mod.rs` or a child
// directory of this module), letting the file sit next to `convert.rs`.
#[path = "convert_kernels.rs"]
mod convert_kernels;
use convert_kernels::apply_step_u8;
// Re-export the PQ/HLG transfer-function primitives at `pub(crate)`
// visibility so other modules in the crate can use them without reaching
// into the private submodule directly.
pub(crate) use convert_kernels::{hlg_eotf, hlg_oetf, pq_eotf, pq_oetf};