ctt 0.3.0

Compress images to GPU texture formats
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
//! sRGB-encoded u8 loaders.
//!
//! Shared state:
//! * [`EOTF_LUT`] — 256-entry scalar lookup table (exact sRGB EOTF).
//! * [`SRGB_MINIMAX_A`]/[`SRGB_MINIMAX_B`]/[`SRGB_MINIMAX_C`] — piecewise
//!   minimax approximation used by the SIMD fast paths.

use std::sync::LazyLock;

use crate::error::Result;
use crate::surface::Surface;

use super::{Buffer, read_pixels_f32};

/// sRGB EOTF lookup table: entry `i` is the linear-light f32 value of the
/// sRGB-encoded byte `i` (treated as `i / 255`). Built once, on first access.
static EOTF_LUT: LazyLock<[f32; 256]> =
    LazyLock::new(|| std::array::from_fn(|byte| srgb_eotf(byte as f32 / 255.0)));

/// Convert one sRGB-encoded value to linear light using the sRGB EOTF.
///
/// Inputs at or below the `0.04045` breakpoint fall on the linear segment
/// (`c / 12.92`); everything above follows the power curve
/// `((c + 0.055) / 1.055)^2.4`.
fn srgb_eotf(c: f32) -> f32 {
    const BREAKPOINT: f32 = 0.04045;
    const LINEAR_DIVISOR: f32 = 12.92;

    if c <= BREAKPOINT {
        return c / LINEAR_DIVISOR;
    }

    let base = (c + 0.055) / 1.055;
    base.powf(2.4)
}

// Piecewise minimax approximation of the sRGB EOTF over the u8 byte domain.
// With `x = byte / 255`, the curve branch fits `((x + 0.055) / 1.055)^2.4`
// with max abs error ≈ 1.28e-4 — well inside ±0.5/255, so u8 round-trip
// stays bit-exact versus the LUT path. See `srgb-opt.py`.
//
// The SIMD kernels evaluate the fit as `(A·x + B)^2 * (C·x + sqrt(x))`.
// All three coefficients are gated to x86_64 because only the x86_64 SIMD
// code in this file reads them.

/// Slope `A` of the squared factor `(A·x + B)^2`.
#[cfg(target_arch = "x86_64")]
const SRGB_MINIMAX_A: f32 = -0.983_177_1;
/// Offset `B` of the squared factor `(A·x + B)^2`.
#[cfg(target_arch = "x86_64")]
const SRGB_MINIMAX_B: f32 = -0.083_670_19;
/// Slope `C` of the second factor `(C·x + sqrt(x))`.
#[cfg(target_arch = "x86_64")]
const SRGB_MINIMAX_C: f32 = -0.121_285_7;

/// Decode an sRGB-encoded 8-bit surface with `channels` channels per pixel
/// into linear f32.
///
/// The first three channels of each pixel go through the sRGB EOTF; any
/// further channel (alpha) is scaled by `1 / 255` without a transfer curve.
///
/// On x86_64, 4-channel surfaces are handed to the AVX2+FMA loader when the
/// CPU supports it, otherwise to the SSE4.1 loader. Those fast paths use the
/// minimax approximation instead of [`EOTF_LUT`]. Everything else takes the
/// scalar LUT path.
pub fn load_srgb8_f32(surface: &Surface, channels: usize) -> Result<Buffer<f32>> {
    profiling::scope!("load_srgb8_f32");

    #[cfg(target_arch = "x86_64")]
    {
        if channels == 4 {
            if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
                // SAFETY: runtime check confirms avx2 + fma are available.
                return unsafe { load_srgb8_rgba_f32_avx2_fma(surface) };
            }
            if is_x86_feature_detected!("sse4.1") {
                // SAFETY: runtime check confirms sse4.1 is available.
                return unsafe { load_srgb8_rgba_f32_sse4_1(surface) };
            }
        }
    }

    let table = &*EOTF_LUT;
    read_pixels_f32(surface, channels, 1, |src, dst| {
        for (index, (out, &value)) in dst.iter_mut().zip(src).enumerate() {
            *out = match index {
                // Colour channels: exact EOTF via the lookup table.
                0..=2 => table[value as usize],
                // Alpha (and anything past it) stays linear.
                _ => value as f32 / 255.0,
            };
        }
    })
}

/// Decode a 4-channel sRGB surface stored in B, G, R, A byte order into
/// linear f32 pixels in R, G, B, A order.
///
/// Colour bytes go through [`EOTF_LUT`]; alpha is scaled by `1 / 255`.
pub fn load_bgra8_srgb_f32(surface: &Surface) -> Result<Buffer<f32>> {
    profiling::scope!("load_bgra8_srgb_f32");
    let table = &*EOTF_LUT;
    read_pixels_f32(surface, 4, 1, |src, dst| {
        let [blue, green, red, alpha] = <[u8; 4]>::try_from(src).expect("4-byte pixel");
        // Swizzle BGRA → RGBA while linearising the colour channels.
        dst[0] = table[usize::from(red)];
        dst[1] = table[usize::from(green)];
        dst[2] = table[usize::from(blue)];
        dst[3] = f32::from(alpha) / 255.0;
    })
}

/// Decode a 3-channel sRGB surface stored in B, G, R byte order into linear
/// f32 lanes in R, G, B order, using [`EOTF_LUT`] for every channel.
pub fn load_bgr8_srgb_f32(surface: &Surface) -> Result<Buffer<f32>> {
    profiling::scope!("load_bgr8_srgb_f32");
    let table = &*EOTF_LUT;
    read_pixels_f32(surface, 3, 1, |src, dst| {
        let [blue, green, red] = <[u8; 3]>::try_from(src).expect("3-byte pixel");
        // Swizzle BGR → RGB while linearising.
        dst[0] = table[usize::from(red)];
        dst[1] = table[usize::from(green)];
        dst[2] = table[usize::from(blue)];
    })
}

/// Turn one packed sRGB RGBA pixel (4 bytes) into a vector of linear f32
/// lanes laid out as `[R, G, B, A]`.
///
/// Used by the SSE4.1 loader for every pixel and by the AVX2 loader for an
/// odd-width tail pixel.
///
/// With `x = byte / 255`, each colour lane is computed as:
/// * `byte <= 10`: `x / 12.92` (linear segment of the sRGB spec)
/// * `byte >= 11`: `(a·x + b)^2 * (c·x + sqrt(x))` — minimax fit of
///   `((x + 0.055) / 1.055)^2.4` with max abs error ≈ 1.28e-4.
///
/// The alpha lane is simply `x`.
///
/// # Safety
/// * The SSE4.1 feature must be available (enforced by `target_feature`).
/// * `bytes_ptr` must be valid for a 4-byte read.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
#[inline]
unsafe fn decode_srgb_pixel_sse4_1(bytes_ptr: *const u8) -> std::arch::x86_64::__m128 {
    use std::arch::x86_64::*;

    // Pull the four bytes in as one u32, then widen each byte to its own lane.
    // SAFETY: caller guarantees 4 valid bytes at bytes_ptr.
    let word = unsafe { bytes_ptr.cast::<u32>().read_unaligned() };
    let lanes_i32 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(word as i32));
    let lanes_f32 = _mm_cvtepi32_ps(lanes_i32);

    // Both candidate results are computed for every lane; blends pick later.
    let normalized = _mm_mul_ps(lanes_f32, _mm_set1_ps(1.0 / 255.0));
    let linear_segment = _mm_mul_ps(lanes_f32, _mm_set1_ps(1.0 / (255.0 * 12.92)));

    let root = _mm_sqrt_ps(normalized);
    let affine = _mm_add_ps(
        _mm_mul_ps(normalized, _mm_set1_ps(SRGB_MINIMAX_A)),
        _mm_set1_ps(SRGB_MINIMAX_B),
    );
    let root_term = _mm_add_ps(_mm_mul_ps(normalized, _mm_set1_ps(SRGB_MINIMAX_C)), root);
    let curve_segment = _mm_mul_ps(_mm_mul_ps(affine, affine), root_term);

    // Bytes strictly above 10 sit on the curve; the rest are on the linear segment.
    let on_curve = _mm_castsi128_ps(_mm_cmpgt_epi32(lanes_i32, _mm_set1_epi32(10)));
    let colour = _mm_blendv_ps(linear_segment, curve_segment, on_curve);

    // Lane 3 is alpha in the [R,G,B,A] layout: overwrite it with plain byte / 255.
    let alpha_only = _mm_castsi128_ps(_mm_setr_epi32(0, 0, 0, -1));
    _mm_blendv_ps(colour, normalized, alpha_only)
}

/// SSE4.1 path for `R8G8B8A8_SRGB` (and equivalent 4-channel sRGB layouts).
///
/// Processes one pixel (4 bytes → 4 f32) per iteration via
/// [`decode_srgb_pixel_sse4_1`]. See that helper for the piecewise form and
/// accuracy guarantees.
///
/// Rows are read `stride` bytes apart; only the first `width * 4` bytes of
/// each row are decoded, so trailing row padding is ignored.
///
/// # Errors
/// Propagates any error returned by `validate_surface`.
///
/// # Panics
/// Panics if `surface.data` does not contain `height` rows of at least
/// `width * 4` bytes each. The output length is only set after the decoded
/// element count has been checked, in release builds too.
///
/// # Safety
/// The CPU must support SSE4.1.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
unsafe fn load_srgb8_rgba_f32_sse4_1(surface: &Surface) -> Result<Buffer<f32>> {
    use std::arch::x86_64::*;

    profiling::scope!("load_srgb8_rgba_f32_sse4_1");
    super::validate_surface(surface, 4)?;

    let w = surface.width as usize;
    let h = surface.height as usize;
    let stride = surface.stride as usize;
    let row_bytes = w * 4;
    let total_pixels = w * h;

    // Reserve the full output up front; elements are written through a raw
    // pointer and the length is fixed up once everything is initialized.
    let mut pixels: Vec<[f32; 4]> = Vec::with_capacity(total_pixels);
    let out_base = pixels.as_mut_ptr() as *mut f32;

    // SAFETY: every intrinsic and pointer op below runs with sse4.1 enabled
    // (target_feature on the enclosing fn) and within the capacity reserved
    // for `pixels`; validate_surface has already bounded the input slice.
    unsafe {
        let mut out_f32 = 0usize;

        for row_region in surface.data.chunks(stride).take(h) {
            // Drop any stride padding; panics if the row is short.
            let row = &row_region[..row_bytes];
            let mut x = 0usize;

            // 1 pixel (4 input bytes, 4 output f32s) per iteration.
            while x + 4 <= row_bytes {
                let result = decode_srgb_pixel_sse4_1(row.as_ptr().add(x));
                _mm_storeu_ps(out_base.add(out_f32), result);
                out_f32 += 4;
                x += 4;
            }
        }

        // `set_len` is only sound if every element was written, so this check
        // must also run in release builds (a `debug_assert!` would vanish).
        assert_eq!(out_f32, total_pixels * 4);
        pixels.set_len(total_pixels);
    }

    Ok(Buffer {
        pixels,
        width: surface.width,
        height: surface.height,
    })
}

/// AVX2 + FMA path for `R8G8B8A8_SRGB` (and equivalent 4-channel sRGB layouts).
///
/// Processes two pixels (8 bytes → 8 f32) per iteration; any 1-pixel odd-width
/// tail is handled by the SSE4.1 [`decode_srgb_pixel_sse4_1`] helper so the tail
/// stays vectorized and consistent with the rest of the SIMD fast paths. See
/// that helper for the piecewise form and accuracy guarantees.
///
/// Rows are read `stride` bytes apart; only the first `width * 4` bytes of
/// each row are decoded, so trailing row padding is ignored.
///
/// # Errors
/// Propagates any error returned by `validate_surface`.
///
/// # Panics
/// Panics if `surface.data` does not contain `height` rows of at least
/// `width * 4` bytes each. The output length is only set after the decoded
/// element count has been checked, in release builds too.
///
/// # Safety
/// The CPU must support AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn load_srgb8_rgba_f32_avx2_fma(surface: &Surface) -> Result<Buffer<f32>> {
    use std::arch::x86_64::*;

    profiling::scope!("load_srgb8_rgba_f32_avx2_fma");
    super::validate_surface(surface, 4)?;

    let w = surface.width as usize;
    let h = surface.height as usize;
    let stride = surface.stride as usize;
    let row_bytes = w * 4;
    let total_pixels = w * h;

    // Reserve the full output up front; elements are written through a raw
    // pointer and the length is fixed up once everything is initialized.
    let mut pixels: Vec<[f32; 4]> = Vec::with_capacity(total_pixels);
    let out_base = pixels.as_mut_ptr() as *mut f32;

    // SAFETY: every intrinsic and pointer op below runs with avx2+fma enabled
    // (target_feature on the enclosing fn) and within the capacity reserved
    // for `pixels`; validate_surface has already bounded the input slice.
    unsafe {
        let coeff_a = _mm256_set1_ps(SRGB_MINIMAX_A);
        let coeff_b = _mm256_set1_ps(SRGB_MINIMAX_B);
        let coeff_c = _mm256_set1_ps(SRGB_MINIMAX_C);
        let inv_255 = _mm256_set1_ps(1.0 / 255.0);
        let inv_255_12_92 = _mm256_set1_ps(1.0 / (255.0 * 12.92));
        // Lanes 3 and 7 are the alpha channel in the [R,G,B,A,R,G,B,A] layout.
        let alpha_lane_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0, 0, 0, -1, 0, 0, 0, -1));
        // Bytes strictly above 10 take the curve branch; 0..=10 stay linear.
        let curve_threshold = _mm256_set1_epi32(10);

        let mut out_f32 = 0usize;

        for row_region in surface.data.chunks(stride).take(h) {
            // Drop any stride padding; panics if the row is short.
            let row = &row_region[..row_bytes];
            let mut x = 0usize;

            // 2 pixels (8 input bytes, 8 output f32s) per iteration.
            while x + 8 <= row_bytes {
                // Load 8 bytes and widen each one to its own i32 / f32 lane.
                let bytes = _mm_loadl_epi64(row.as_ptr().add(x) as *const __m128i);
                let as_i32 = _mm256_cvtepu8_epi32(bytes);
                let as_f32 = _mm256_cvtepi32_ps(as_i32);

                let x_norm = _mm256_mul_ps(as_f32, inv_255);
                let linear = _mm256_mul_ps(as_f32, inv_255_12_92);

                // Curve branch: (a·x + b)^2 * (c·x + sqrt(x)), using fused
                // multiply-adds where the SSE4.1 helper uses mul + add.
                let t = _mm256_sqrt_ps(x_norm);
                let u = _mm256_fmadd_ps(x_norm, coeff_a, coeff_b);
                let v = _mm256_fmadd_ps(x_norm, coeff_c, t);
                let curve = _mm256_mul_ps(_mm256_mul_ps(u, u), v);

                // Pick linear vs. curve per lane, then overwrite the alpha
                // lanes with the plain normalized value.
                let use_curve = _mm256_castsi256_ps(_mm256_cmpgt_epi32(as_i32, curve_threshold));
                let rgb = _mm256_blendv_ps(linear, curve, use_curve);
                let result = _mm256_blendv_ps(rgb, x_norm, alpha_lane_mask);

                _mm256_storeu_ps(out_base.add(out_f32), result);
                out_f32 += 8;
                x += 8;
            }

            // Odd-width tail: at most one 4-byte pixel left. Hand it to the
            // SSE4.1 helper so the tail stays vectorized and algorithmically
            // consistent with the SSE4.1 fast path.
            if x < row_bytes {
                let result = decode_srgb_pixel_sse4_1(row.as_ptr().add(x));
                _mm_storeu_ps(out_base.add(out_f32), result);
                out_f32 += 4;
            }
        }

        // `set_len` is only sound if every element was written, so this check
        // must also run in release builds (a `debug_assert!` would vanish).
        assert_eq!(out_f32, total_pixels * 4);
        pixels.set_len(total_pixels);
    }

    Ok(Buffer {
        pixels,
        width: surface.width,
        height: surface.height,
    })
}

/// Tests for the x86_64 SIMD loaders. Each test returns early when the CPU
/// running the tests lacks the required feature set.
#[cfg(all(test, target_arch = "x86_64"))]
mod simd_tests {
    use super::*;
    use crate::alpha::AlphaMode;
    use crate::surface::{ColorSpace, Surface};

    /// True when the running CPU can execute the AVX2 + FMA loader.
    fn has_avx2_fma() -> bool {
        is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")
    }

    /// Wrap raw bytes in an opaque `R8G8B8A8_SRGB` surface.
    fn srgb_surface(data: Vec<u8>, width: u32, height: u32, stride: u32) -> Surface {
        Surface {
            format: ktx2::Format::R8G8B8A8_SRGB,
            color_space: ColorSpace::Srgb,
            alpha: AlphaMode::Opaque,
            width,
            height,
            stride,
            data,
        }
    }

    /// Build a 2-row × 256-pixel test surface that covers every u8 input byte
    /// on every RGBA channel.
    fn full_domain_surface() -> Surface {
        const WIDTH: u32 = 256;
        const HEIGHT: u32 = 2;

        let row_len = WIDTH as usize * 4;
        let mut data = vec![0u8; row_len * HEIGHT as usize];
        let (top, bottom) = data.split_at_mut(row_len);

        let pixel_pairs = top.chunks_exact_mut(4).zip(bottom.chunks_exact_mut(4));
        for (x, (first, second)) in pixel_pairs.enumerate() {
            let value = x as u8;
            // Row 0: a different permutation of 0..=255 on each colour channel.
            first.copy_from_slice(&[value, (255 - x) as u8, ((x * 7) & 0xff) as u8, value]);
            // Row 1: grey ramp with opaque alpha.
            second.copy_from_slice(&[value, value, value, 255]);
        }

        srgb_surface(data, WIDTH, HEIGHT, WIDTH * 4)
    }

    /// Check decoded pixels against the LUT reference: colour channels within
    /// half a u8 step, alpha within 1e-6 of `byte / 255`.
    fn assert_within_u8_tolerance(pixels: &[[f32; 4]], source: &[u8]) {
        let lut = &*EOTF_LUT;
        let tol = 0.5 / 255.0;
        for (i, px) in pixels.iter().enumerate() {
            let encoded = &source[i * 4..i * 4 + 4];
            let (rb, gb, bb, ab) = (encoded[0], encoded[1], encoded[2], encoded[3]);
            assert!(
                (px[0] - lut[rb as usize]).abs() < tol,
                "R byte {rb}: {}",
                px[0]
            );
            assert!(
                (px[1] - lut[gb as usize]).abs() < tol,
                "G byte {gb}: {}",
                px[1]
            );
            assert!(
                (px[2] - lut[bb as usize]).abs() < tol,
                "B byte {bb}: {}",
                px[2]
            );
            assert!(
                (px[3] - ab as f32 / 255.0).abs() < 1e-6,
                "A byte {ab}: {}",
                px[3]
            );
        }
    }

    /// 2×2 pixel image with 4 bytes of junk padding on every row.
    fn padded_2x2_surface() -> Surface {
        let rows = [
            [10u8, 20, 30, 40, 50, 60, 70, 80],
            [90, 100, 110, 120, 130, 140, 150, 160],
        ];
        let junk = [0xFEu8; 4];
        let data: Vec<u8> = rows
            .iter()
            .flat_map(|row| row.iter().chain(&junk).copied())
            .collect();

        let (width, height) = (2u32, 2u32);
        srgb_surface(data, width, height, width * 4 + 4)
    }

    /// Assert that decoding [`padded_2x2_surface`] produced exactly the four
    /// real pixels and none of the padding bytes.
    fn assert_padding_skipped(pixels: &[[f32; 4]]) {
        assert_eq!(pixels.len(), 4);
        // The 0xFE junk must not show up — alpha of the first pixel is
        // 40/255 and of the last pixel 160/255, never 254/255.
        assert!((pixels[0][3] - 40.0 / 255.0).abs() < 1e-6);
        assert!((pixels[3][3] - 160.0 / 255.0).abs() < 1e-6);
    }

    #[test]
    fn avx2_srgb_matches_lut_within_u8_tolerance() {
        if !has_avx2_fma() {
            return;
        }
        let surface = full_domain_surface();
        // SAFETY: avx2 + fma availability was checked above.
        let decoded = unsafe { load_srgb8_rgba_f32_avx2_fma(&surface).unwrap() };
        assert_within_u8_tolerance(&decoded.pixels, &surface.data);
    }

    #[test]
    fn sse4_srgb_matches_lut_within_u8_tolerance() {
        if !is_x86_feature_detected!("sse4.1") {
            return;
        }
        let surface = full_domain_surface();
        // SAFETY: sse4.1 availability was checked above.
        let decoded = unsafe { load_srgb8_rgba_f32_sse4_1(&surface).unwrap() };
        assert_within_u8_tolerance(&decoded.pixels, &surface.data);
    }

    #[test]
    fn avx2_srgb_odd_width_tail_matches_sse4_path() {
        if !has_avx2_fma() {
            return;
        }

        // 3 px wide exercises one AVX2 iteration (2 px) + one SSE4.1 tail (1 px).
        let data: Vec<u8> = [
            [0u8, 10, 11, 255],
            [128, 200, 255, 64],
            [17, 42, 99, 200],
        ]
        .concat();
        let tail_bytes = &data[8..];

        let wide_surface = srgb_surface(data.clone(), 3, 1, 12);
        // SAFETY: avx2 + fma availability was checked above.
        let wide = unsafe { load_srgb8_rgba_f32_avx2_fma(&wide_surface).unwrap() };

        // The 1-pixel tail runs through the same SSE4.1 helper as the SSE4 path,
        // so a dedicated 1×1 SSE4 decode of the same bytes must match bit-exactly.
        let narrow_surface = srgb_surface(tail_bytes.to_vec(), 1, 1, 4);
        // SAFETY: the avx2 check above passed; the original test relies on
        // that alone before calling the sse4.1 loader.
        let narrow = unsafe { load_srgb8_rgba_f32_sse4_1(&narrow_surface).unwrap() };
        assert_eq!(wide.pixels[2], narrow.pixels[0]);

        // And the tail still lands inside u8 tolerance of the LUT reference.
        assert_within_u8_tolerance(&wide.pixels[2..], tail_bytes);
    }

    #[test]
    fn avx2_srgb_stride_padding_is_skipped() {
        if !has_avx2_fma() {
            return;
        }
        let surface = padded_2x2_surface();
        // SAFETY: avx2 + fma availability was checked above.
        let decoded = unsafe { load_srgb8_rgba_f32_avx2_fma(&surface).unwrap() };
        assert_padding_skipped(&decoded.pixels);
    }

    #[test]
    fn sse4_srgb_stride_padding_is_skipped() {
        if !is_x86_feature_detected!("sse4.1") {
            return;
        }
        let surface = padded_2x2_surface();
        // SAFETY: sse4.1 availability was checked above.
        let decoded = unsafe { load_srgb8_rgba_f32_sse4_1(&surface).unwrap() };
        assert_padding_skipped(&decoded.pixels);
    }
}