jxl-encoder 0.3.0

JPEG XL encoder in pure Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
// Copyright (c) Imazen LLC and the JPEG XL Project Authors.
// Algorithms and constants derived from libjxl (BSD-3-Clause).
// Licensed under AGPL-3.0-or-later. Commercial licenses at https://www.imazen.io/pricing

//! Forward DCT transforms for sizes 32x32 and larger.

use super::constants::*;
use super::forward::dct1d_16;
use super::inverse::{idct1d_4, idct1d_8};
use crate::vardct::common::{as_array_mut, as_array_ref};

pub fn dct1d_32(mem: &mut [f32]) {
    let mut tmp = [0.0f32; 32];

    // AddReverse for first half
    for i in 0..16 {
        tmp[i] = mem[i] + mem[31 - i];
    }
    // SubReverse for second half
    for i in 0..16 {
        tmp[16 + i] = mem[i] - mem[31 - i];
    }

    // DCT on first half
    dct1d_16(&mut tmp[0..16]);

    // Multiply second half by WcMultipliers
    for i in 0..16 {
        tmp[16 + i] *= WC_MULTIPLIERS_32[i];
    }

    // DCT on second half
    dct1d_16(&mut tmp[16..32]);

    // B transform on second half
    tmp[16] = SQRT2 * tmp[16] + tmp[17];
    for i in 1..15 {
        tmp[16 + i] += tmp[16 + i + 1];
    }

    // InverseEvenOdd: interleave
    for i in 0..16 {
        mem[2 * i] = tmp[i];
        mem[2 * i + 1] = tmp[16 + i];
    }
}

/// Compute scaled 32x32 DCT (32 rows, 32 columns).
///
/// Input: 32x32 block in row-major order (1024 floats)
/// Output: 32x32 DCT coefficients
///
/// Like `dct_8x8()` and `dct_16x16()`, there is NO final transpose for square blocks.
/// C++ `ComputeScaledDCT<32,32>` takes the ROWS >= COLS branch (no final transpose).
pub fn dct_32x32(input: &[f32; 1024], output: &mut [f32; 1024]) {
    jxl_simd::dct_32x32(input, output);
}

/// Extract DC values from 32x32 DCT coefficients.
/// Returns 16 DC values (for the 16 covered 8x8 blocks) in row-major 4x4 order.
///
/// The LLF region is 4x4 coefficients at positions `[r*32+c]` for r,c in 0..4
/// in the 32x32 layout (stride 32). We apply `DCTTotalResampleScale<32, 4>` to
/// each dimension, then a 4x4 IDCT to get the 16 DC values.
///
/// C++ uses `ReinterpretingIDCT<32, 32, 4, 4, 4, 4>` with the ROWS >= COLS branch
/// (since ROWS=COLS=4).
pub fn dc_from_dct_32x32(coeffs: &[f32; 1024]) -> [f32; 16] {
    // Step 1: Extract 4x4 LLF and apply resample scales.
    // The forward DCT32x32 scaled by 1/1024. The 4x4 IDCT will apply 4*4=16 scaling,
    // so we need an additional 1024/16 = 64 to get back to spatial values.
    let mut block = [0.0f32; 16];
    for iy in 0..4 {
        for ix in 0..4 {
            block[iy * 4 + ix] = coeffs[iy * 32 + ix]
                * DCT_RESAMPLE_SCALE_32_TO_4[iy]
                * DCT_RESAMPLE_SCALE_32_TO_4[ix]
                * 16.0; // Compensate for forward/inverse scaling mismatch
        }
    }

    // Step 2: 4x4 IDCT matching C++ ComputeScaledIDCT<4,4> (ROWS >= COLS):
    //   IDCT rows → transpose → IDCT rows.
    // Using matched idct1d_4 that exactly reverses our forward dct1d_4.

    // IDCT rows (in-place)
    for iy in 0..4 {
        idct1d_4(&mut block[iy * 4..(iy + 1) * 4]);
    }

    // Transpose 4x4
    let mut transposed = [0.0f32; 16];
    for iy in 0..4 {
        for ix in 0..4 {
            transposed[ix * 4 + iy] = block[iy * 4 + ix];
        }
    }

    // IDCT rows (on transposed data = columns of original)
    for iy in 0..4 {
        idct1d_4(&mut transposed[iy * 4..(iy + 1) * 4]);
    }

    transposed
}

// =============================================================================
// DCT32x16 and DCT16x32 support
// =============================================================================

/// Compute scaled 32x16 DCT (32 rows, 16 columns).
///
/// Input: 32x16 block in row-major order (512 floats)
/// Output: 32x16 DCT coefficients in 16×32 layout (stride 32)
///
/// C++ `ComputeScaledDCT<32,16>` takes the ROWS >= COLS branch (no final transpose).
pub fn dct_32x16(input: &[f32; 512], output: &mut [f32; 512]) {
    jxl_simd::dct_32x16(input, output);
}

/// Compute scaled 16x32 DCT (16 rows, 32 columns).
///
/// Input: 16x32 block in row-major order (512 floats)
/// Output: 16x32 DCT coefficients
///
/// C++ `ComputeScaledDCT<16,32>` takes the ROWS < COLS branch (includes final transpose).
pub fn dct_16x32(input: &[f32; 512], output: &mut [f32; 512]) {
    jxl_simd::dct_16x32(input, output);
}

/// Extract DC values from 32x16 DCT coefficients.
/// Returns 8 DC values (for the 8 covered 8x8 blocks) in row-major 4x2 order.
///
/// DCT32x16 output is 16×32 (stride 32, no final transpose):
///   row = horiz freq (16-dim, 2 blocks), col = vert freq (32-dim, 4 blocks)
/// LLF region is 2×4 at positions `[r*32+c]` for r in 0..2, c in 0..4.
/// We apply `DCTTotalResampleScale<16, 2>` to rows and `<32, 4>` to columns,
/// then a 2×4 IDCT (skipping final transpose to produce 4×2 spatial output).
pub fn dc_from_dct_32x16(coeffs: &[f32; 512]) -> [f32; 8] {
    // Extract 2×4 LLF and apply resample scales.
    // Compensation factor = 4.0: idct1d_4 has DC gain 1/4, inline 2-point has DC gain 1.
    // Total LLF IDCT DC gain = 1/4, so we compensate with ×4.
    let mut block = [0.0f32; 8];
    for iy in 0..2 {
        for ix in 0..4 {
            block[iy * 4 + ix] = coeffs[iy * 32 + ix]
                * DCT_RESAMPLE_SCALE_16_TO_2[iy]
                * DCT_RESAMPLE_SCALE_32_TO_4[ix]
                * 4.0;
        }
    }

    // 2×4 IDCT (ROWS=2 < COLS=4): IDCT rows → transpose → IDCT rows.
    // Skip final transpose: the 2×4 LLF has rows=horiz freq, cols=vert freq.
    // After IDCT without final transpose, result is 4×2 = [block_row][block_col].

    // IDCT on 4-element rows (2 rows) — converts vert freq → spatial block row
    idct1d_4(&mut block[0..4]);
    idct1d_4(&mut block[4..8]);

    // Transpose 2×4 → 4×2
    let mut transposed = [0.0f32; 8];
    for iy in 0..2 {
        for ix in 0..4 {
            transposed[ix * 2 + iy] = block[iy * 4 + ix];
        }
    }

    // IDCT on 2-element rows (4 rows) — converts horiz freq → spatial block col
    for iy in 0..4 {
        let a = transposed[iy * 2];
        let b = transposed[iy * 2 + 1];
        transposed[iy * 2] = a + b;
        transposed[iy * 2 + 1] = a - b;
    }

    // Result is 4×2 = [block_row * 2 + block_col], matching caller
    transposed
}

/// Extract DC values from 16x32 DCT coefficients.
/// Returns 8 DC values (for the 8 covered 8x8 blocks) in row-major 2x4 order.
///
/// The LLF region is 2x4 coefficients. We apply `DCTTotalResampleScale<16, 2>` to rows
/// and `DCTTotalResampleScale<32, 4>` to columns, then a 2x4 IDCT.
pub fn dc_from_dct_16x32(coeffs: &[f32; 512]) -> [f32; 8] {
    // Extract 2×4 LLF and apply resample scales.
    // Compensation factor = 4.0: idct1d_4 has DC gain 1/4, inline 2-point has DC gain 1.
    // Total LLF IDCT DC gain = 1/4, so we compensate with ×4.
    let mut block = [0.0f32; 8];
    for iy in 0..2 {
        for ix in 0..4 {
            block[iy * 4 + ix] = coeffs[iy * 32 + ix]
                * DCT_RESAMPLE_SCALE_16_TO_2[iy]
                * DCT_RESAMPLE_SCALE_32_TO_4[ix]
                * 4.0;
        }
    }

    // 2×4 IDCT: Since ROWS=2 < COLS=4, this is ROWS < COLS branch
    // IDCT rows -> transpose -> IDCT rows -> transpose back

    // IDCT on 4-element rows (2 rows)
    idct1d_4(&mut block[0..4]);
    idct1d_4(&mut block[4..8]);

    // Transpose 2x4 -> 4x2
    let mut transposed = [0.0f32; 8];
    for iy in 0..2 {
        for ix in 0..4 {
            transposed[ix * 2 + iy] = block[iy * 4 + ix];
        }
    }

    // IDCT on 2-element rows (4 rows)
    for iy in 0..4 {
        let a = transposed[iy * 2];
        let b = transposed[iy * 2 + 1];
        transposed[iy * 2] = a + b;
        transposed[iy * 2 + 1] = a - b;
    }

    // Transpose back 4x2 -> 2x4
    let mut result = [0.0f32; 8];
    for iy in 0..4 {
        for ix in 0..2 {
            result[ix * 4 + iy] = transposed[iy * 2 + ix];
        }
    }

    result
}

pub fn dct1d_64(mem: &mut [f32]) {
    let mut tmp = [0.0f32; 64];

    // AddReverse for first half
    for i in 0..32 {
        tmp[i] = mem[i] + mem[63 - i];
    }
    // SubReverse for second half
    for i in 0..32 {
        tmp[32 + i] = mem[i] - mem[63 - i];
    }

    // DCT on first half
    dct1d_32(&mut tmp[0..32]);

    // Multiply second half by WcMultipliers
    for i in 0..32 {
        tmp[32 + i] *= WC_MULTIPLIERS_64[i];
    }

    // DCT on second half
    dct1d_32(&mut tmp[32..64]);

    // B transform on second half
    tmp[32] = SQRT2 * tmp[32] + tmp[33];
    for i in 1..31 {
        tmp[32 + i] += tmp[32 + i + 1];
    }

    // InverseEvenOdd: interleave
    for i in 0..32 {
        mem[2 * i] = tmp[i];
        mem[2 * i + 1] = tmp[32 + i];
    }
}

/// Compute scaled 64x64 DCT (64 rows, 64 columns).
///
/// Input: 64x64 block in row-major order (4096 floats)
/// Output: 64x64 DCT coefficients
///
/// NO final transpose for square blocks (ROWS >= COLS branch).
pub fn dct_64x64(input: &[f32], output: &mut [f32]) {
    debug_assert!(input.len() >= 4096);
    debug_assert!(output.len() >= 4096);

    jxl_simd::dct_64x64(as_array_ref(input, 0), as_array_mut(output, 0));
}

/// Compute scaled 64x32 DCT (64 rows, 32 columns).
///
/// Input: 64x32 block in row-major order (2048 floats)
/// Output: DCT coefficients in 32x64 layout (stride 64)
///
/// C++ `ComputeScaledDCT<64,32>` takes the ROWS >= COLS branch (no final transpose).
pub fn dct_64x32(input: &[f32], output: &mut [f32]) {
    debug_assert!(input.len() >= 2048);
    debug_assert!(output.len() >= 2048);

    jxl_simd::dct_64x32(as_array_ref(input, 0), as_array_mut(output, 0));
}

/// Compute scaled 32x64 DCT (32 rows, 64 columns).
///
/// Input: 32x64 block in row-major order (2048 floats)
/// Output: DCT coefficients
///
/// C++ `ComputeScaledDCT<32,64>` takes the ROWS < COLS branch (WITH final transpose).
pub fn dct_32x64(input: &[f32], output: &mut [f32]) {
    debug_assert!(input.len() >= 2048);
    debug_assert!(output.len() >= 2048);

    jxl_simd::dct_32x64(as_array_ref(input, 0), as_array_mut(output, 0));
}

/// Extract DC values from 64x64 DCT coefficients.
/// Returns 64 DC values (for the 64 covered 8x8 blocks) in row-major 8x8 order.
///
/// The LLF region is 8x8 coefficients at positions `[r*64+c]` for r,c in 0..8
/// in the 64x64 layout (stride 64). We apply `DCTResampleScale<64, 8>` to
/// each dimension, then an 8x8 IDCT.
pub fn dc_from_dct_64x64(coeffs: &[f32]) -> [f32; 64] {
    debug_assert!(coeffs.len() >= 4096);

    // Step 1: Extract 8x8 LLF and apply resample scales.
    // Forward DCT64x64 scaled by 1/4096. The 8x8 IDCT will apply 8*8=64 scaling,
    // so we need 4096/64 = 64 factor.
    let mut block = [0.0f32; 64];
    for iy in 0..8 {
        for ix in 0..8 {
            block[iy * 8 + ix] = coeffs[iy * 64 + ix]
                * DCT_RESAMPLE_SCALE_64_TO_8[iy]
                * DCT_RESAMPLE_SCALE_64_TO_8[ix];
        }
    }

    // Step 2: 8x8 IDCT matching ComputeScaledIDCT<8,8> (ROWS >= COLS):
    //   IDCT rows → transpose → IDCT rows.

    // IDCT rows
    for iy in 0..8 {
        idct1d_8(&mut block[iy * 8..(iy + 1) * 8]);
    }

    // Transpose 8x8
    let mut transposed = [0.0f32; 64];
    for iy in 0..8 {
        for ix in 0..8 {
            transposed[ix * 8 + iy] = block[iy * 8 + ix];
        }
    }

    // IDCT rows
    for iy in 0..8 {
        idct1d_8(&mut transposed[iy * 8..(iy + 1) * 8]);
    }

    transposed
}

/// Extract DC values from 64x32 DCT coefficients.
/// Returns 32 DC values (for the 32 covered 8x8 blocks) in row-major 8x4 order.
///
/// DCT64x32 output is 32×64 (stride 64, no final transpose):
///   row = horiz freq (32-dim, 4 blocks), col = vert freq (64-dim, 8 blocks)
/// LLF region is 4×8 at positions `[r*64+c]` for r in 0..4, c in 0..8.
/// We apply `DCTTotalResampleScale<32, 4>` to rows and `<64, 8>` to columns,
/// then a 4×8 IDCT (skipping final transpose to produce 8×4 spatial output).
///
/// Coverage: 8 block rows × 4 block cols. DC output is 8 rows × 4 cols.
pub fn dc_from_dct_64x32(coeffs: &[f32]) -> [f32; 32] {
    debug_assert!(coeffs.len() >= 2048);

    // Extract 4×8 LLF from the 32×64 layout (stride 64)
    let mut block = [0.0f32; 32];
    for iy in 0..4 {
        for ix in 0..8 {
            block[iy * 8 + ix] = coeffs[iy * 64 + ix]
                * DCT_RESAMPLE_SCALE_32_TO_4[iy]
                * DCT_RESAMPLE_SCALE_64_TO_8[ix]
                * 4.0;
        }
    }

    // 4×8 IDCT (ROWS=4 < COLS=8): IDCT rows → transpose → IDCT rows.
    // Skip final transpose: the 4×8 LLF has rows=horiz freq, cols=vert freq.
    // After IDCT without final transpose, result is 8×4 = [block_row][block_col].

    // IDCT on 8-element rows (4 rows) — converts vert freq → spatial block row
    for iy in 0..4 {
        idct1d_8(&mut block[iy * 8..(iy + 1) * 8]);
    }

    // Transpose 4×8 → 8×4
    let mut transposed = [0.0f32; 32];
    for iy in 0..4 {
        for ix in 0..8 {
            transposed[ix * 4 + iy] = block[iy * 8 + ix];
        }
    }

    // IDCT on 4-element rows (8 rows) — converts horiz freq → spatial block col
    for iy in 0..8 {
        idct1d_4(&mut transposed[iy * 4..(iy + 1) * 4]);
    }

    // Result is 8×4 = [block_row * 4 + block_col], matching caller
    transposed
}

/// Extract DC values from 32x64 DCT coefficients.
/// Returns 32 DC values (for the 32 covered 8x8 blocks) in row-major 8x4 order.
///
/// Coverage: 8 cols × 4 rows of 8x8 blocks. DC output is 4 rows × 8 cols.
/// After dct_32x64's final transpose, coefficients are in stride-64 layout.
/// CoefficientLayout: cx=8 >= cy=4, so stride = cx*8 = 64.
pub fn dc_from_dct_32x64(coeffs: &[f32]) -> [f32; 32] {
    debug_assert!(coeffs.len() >= 2048);

    // Extract 4x8 LLF from stride-64 layout
    let mut block = [0.0f32; 32];
    for iy in 0..4 {
        for ix in 0..8 {
            block[iy * 8 + ix] = coeffs[iy * 64 + ix]
                * DCT_RESAMPLE_SCALE_32_TO_4[iy]
                * DCT_RESAMPLE_SCALE_64_TO_8[ix]
                * 4.0;
        }
    }

    // 4x8 IDCT: ROWS=4 < COLS=8, so ROWS < COLS branch:
    // IDCT rows -> transpose -> IDCT rows -> transpose back

    // IDCT on 8-element rows (4 rows)
    for iy in 0..4 {
        idct1d_8(&mut block[iy * 8..(iy + 1) * 8]);
    }

    // Transpose 4x8 -> 8x4
    let mut transposed = [0.0f32; 32];
    for iy in 0..4 {
        for ix in 0..8 {
            transposed[ix * 4 + iy] = block[iy * 8 + ix];
        }
    }

    // IDCT on 4-element rows (8 rows)
    for iy in 0..8 {
        idct1d_4(&mut transposed[iy * 4..(iy + 1) * 4]);
    }

    // Transpose back 8x4 -> 4x8
    let mut result = [0.0f32; 32];
    for iy in 0..8 {
        for ix in 0..4 {
            result[ix * 8 + iy] = transposed[iy * 4 + ix];
        }
    }

    result
}