mlxrs 0.1.0

Safe Rust bindings for Apple's MLX array framework, with LM, VLM, audio, and embeddings support
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
//! Model-agnostic video *preprocessing* math: frame-count sampling,
//! pixel-budget-aware target sizing, and per-frame image preparation.
//!
//! Ported from
//! [`mlx-vlm/mlx_vlm/video_generate.py`](https://github.com/Blaizzy/mlx-vlm/blob/main/mlx_vlm/video_generate.py)
//! (`round_by_factor` / `ceil_by_factor` / `floor_by_factor`,
//! [`smart_resize`], [`smart_nframes`], and the per-frame resize+stack
//! body of `fetch_video`) and cross-checked against
//! [`mlx-swift-lm/Libraries/MLXVLM/Models/QwenVL.swift`](https://github.com/ml-explore/mlx-swift-lm/blob/main/Libraries/MLXVLM/Models/QwenVL.swift)
//! `QwenVL.targetSize` (the swift mirror of `image_processing_qwen2_vl.
//! smart_resize`) and `MediaProcessing.asProcessedSequence`
//! (uniform/fps frame sampling).
//!
//! ## Scope
//! - **In scope (this module):** the portable, codec-free preprocessing
//!   *math* every Qwen-VL-class video pipeline shares —
//!     - [`smart_resize`]: compute the factor-aligned `(H, W)` that keeps
//!       the aspect ratio while landing the pixel count inside
//!       `[min_pixels, max_pixels]`.
//!     - [`smart_nframes`]: compute how many frames to sample from the
//!       fps / min-frames / max-frames / total-frames budget (with the
//!       reference's clamp + factor-rounding edges), plus
//!       [`sample_frame_indices`] for the `np.linspace(0, total-1,
//!       n).round()` index picks.
//!     - [`process_frames`]: apply the existing cross-model image
//!       [`preprocess`] to a
//!       caller-provided sequence of already-decoded frames and stack the
//!       result into the model input layout.
//! - **Out of scope — DECODING IS THE CALLER'S RESPONSIBILITY.** This
//!   module deliberately takes **already-decoded frames** (a
//!   `&[image::DynamicImage]`) and never opens a container. The python
//!   reference decodes mp4 via `cv2.VideoCapture` (`video_generate.py:
//!   207-226`) and the swift reference via `AVAssetImageGenerator`
//!   (`MediaProcessing.swift:288-330`); both pull in a platform codec.
//!   Wiring a Rust container demuxer/decoder (ffmpeg / `symphonia`-video /
//!   `re_mp4` / etc.) is a documented **follow-up** that needs a codec
//!   dependency and is intentionally not part of this portable-math port.
//!   The valuable, dependency-free part — the sampling + resize + prep
//!   arithmetic — is what lives here. Callers decode frames however they
//!   like (their own ffmpeg binding, a test fixture, pre-extracted PNGs)
//!   and hand the resulting `&[DynamicImage]` to [`process_frames`], using
//!   [`smart_nframes`] + [`sample_frame_indices`] to choose which source
//!   frames to decode in the first place.
//!
//! ## Channel layout (intentional divergence from python, parity with
//! [`crate::vlm::image`])
//! The python `fetch_video` returns planar `(T, C, H, W)` f32
//! (`video_generate.py:282`). We instead stack **channel-last**
//! `[T, H, W, 3]` because [`process_frames`] composes the existing
//! cross-model [`preprocess`], whose
//! documented output is channel-last `[H, W, 3]` (see that module's
//! `Conventions > Channel layout` block). Per the project's
//! no-per-model-arch rule, the planar `[T, C, H, W]` (or the Qwen
//! flattened-patch `[grid_t*grid_h*grid_w, ...]`) layout is a per-model
//! contract the per-model video processor owns; it is one lazy
//! `transpose_axes(&[0, 3, 1, 2])` away from the channel-last stack this
//! module emits (zero data movement until `eval`).
//!
//! ## No implicit eval
//! [`process_frames`] composes lazily and returns an un-evaluated
//! `Array`; callers `eval()` (or use a data accessor) to materialize.

use crate::{
  array::Array,
  error::{
    EmptyInputPayload, Error, InvariantViolationPayload, NonFiniteScalarPayload, OutOfRangePayload,
    Result, UnknownEnumValuePayload,
  },
  ops::shape::stack,
  vlm::image::{ImageProcessorConfig, Layout, preprocess},
};

/// `image_processing_qwen2_vl.IMAGE_FACTOR` (`video_generate.py:33`):
/// every resized side is aligned to a multiple of this. 28 = a 14×14 ViT
/// patch × the 2×2 spatial merge Qwen-VL applies.
pub const IMAGE_FACTOR: i64 = 28;
/// `MIN_PIXELS` (`video_generate.py:34`) — `4 * 28 * 28`.
pub const MIN_PIXELS: i64 = 4 * 28 * 28;
/// `MAX_PIXELS` (`video_generate.py:35`) — `16384 * 28 * 28`.
pub const MAX_PIXELS: i64 = 16384 * 28 * 28;
/// `MAX_RATIO` (`video_generate.py:36`): the absolute aspect-ratio ceiling
/// `smart_resize` rejects above.
pub const MAX_RATIO: i64 = 200;

/// `VIDEO_MIN_PIXELS` (`video_generate.py:38`) — `128 * 28 * 28`.
pub const VIDEO_MIN_PIXELS: i64 = 128 * 28 * 28;
/// `VIDEO_MAX_PIXELS` (`video_generate.py:39`) — `768 * 28 * 28`.
pub const VIDEO_MAX_PIXELS: i64 = 768 * 28 * 28;
/// `FRAME_FACTOR` (`video_generate.py:40`): sampled frame counts are
/// aligned to a multiple of this (Qwen-VL's temporal patch size).
pub const FRAME_FACTOR: i64 = 2;
/// `FPS` (`video_generate.py:41`): default frames-per-second to sample at.
pub const FPS: f64 = 2.0;
/// `FPS_MIN_FRAMES` (`video_generate.py:42`): floor on the fps-derived
/// frame count.
pub const FPS_MIN_FRAMES: i64 = 4;
/// `FPS_MAX_FRAMES` (`video_generate.py:43`): ceiling on the fps-derived
/// frame count.
pub const FPS_MAX_FRAMES: i64 = 768;

/// Largest `i64` magnitude that survives an `i64 → f64 → i64` round-trip
/// without precision loss: f64's mantissa holds 53 bits, so every integer in
/// `[-2^53, 2^53]` is exactly representable but beyond it consecutive
/// integers collapse onto the same f64. The `*_by_factor` family below does
/// `op(number / factor) * factor` in f64; for an `|number| > 2^53` the very
/// first `number as f64` is already a *different* value, so the result is
/// silently wrong (not merely overflowing). We reject such inputs up front
/// rather than emit a plausible-looking but corrupt size — every real image
/// dimension / frame count is many orders of magnitude below this.
const F64_EXACT_INT_MAX: i64 = 1 << 53;

/// `i64::MAX + 1 == 2^63` as the smallest `f64` strictly above the `i64`
/// range. `i64::MAX` itself is not f64-representable (`i64::MAX as f64`
/// rounds UP to `2^63`), so an `f64 < this` bound guarantees the subsequent
/// `as i64` cast is exact and in range.
const F64_TWO_POW_63: f64 = 9_223_372_036_854_775_808.0;

/// Multiply a (already factor-divided + rounded, integral) `quotient` float
/// by an `i64` `factor`, returning [`Error::OutOfRange`] instead of
/// panicking (debug) or silently wrapping (release) on overflow.
///
/// The `*_by_factor` family computes `op(number / factor) * factor`. The
/// final `quotient * factor` can blow past `i64` for a legitimately large
/// float `number` (the `_f` scale variants pass `height * beta` etc.). We
/// funnel every such product through this one checked path so NO caller can
/// panic or wrap — overflow surfaces recoverably, naming the operands.
///
/// `quotient` arrives as the post-`round`/`ceil`/`floor` `f64`, so it is
/// integral (or `±inf`/`NaN` on a degenerate input). We reject non-finite
/// quotients and any quotient outside the exactly-representable `i64` range
/// before the cast, then `checked_mul` (== an i128 product bounds-checked to
/// fit `i64`) the validated integer by `factor`.
fn factor_product(quotient: f64, factor: i64, op: &'static str) -> Result<i64> {
  if !quotient.is_finite() {
    return Err(Error::NonFiniteScalar(NonFiniteScalarPayload::new(
      op, quotient,
    )));
  }
  if quotient >= F64_TWO_POW_63 || quotient < i64::MIN as f64 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      op,
      "must be in i64 range before scaling by factor",
      format!("quotient={quotient}, factor={factor}"),
    )));
  }
  let q_i64 = quotient as i64;
  q_i64.checked_mul(factor).ok_or_else(|| {
    // OutOfRange (not ArithmeticOverflow) — the latter casts both operands to
    // `u64`, which misrepresents signed values: a legitimately reachable
    // negative `q_i64` (the user passed a negative `number`) and / or a
    // negative `factor` would surface as huge unsigned numbers. SmolStr-text
    // preserves the signed values exactly. The public `*_by_factor` entries
    // additionally reject `factor <= 0` up front (see `check_positive_factor`),
    // so only `factor >= 1` reaches here; the residual overflow path is
    // `q_i64 < 0` (a negative `number` input scaled past `i64::MIN`).
    Error::OutOfRange(OutOfRangePayload::new(
      op,
      "i64 multiplication overflowed",
      format!("quotient={q_i64}, factor={factor}"),
    ))
  })
}

/// Reject a non-positive `factor` BEFORE the f64 division inside the public
/// `*_by_factor` family. A `factor <= 0` is both a numerical-correctness bug
/// (floor / ceil / round semantics require a positive factor) and an overflow
/// trap: `floor_by_factor(1, i64::MIN)` would otherwise reach
/// `factor_product`'s `q_i64 * factor` with `q_i64 = -1`, `factor = i64::MIN`,
/// and a downstream payload would lose the sign of `factor` when cast to
/// `u64`. Surfacing the signed value here via `SmolStr` keeps the error
/// faithful.
fn check_positive_factor(factor: i64, op: &'static str) -> Result<()> {
  if factor <= 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      op,
      "factor must be > 0",
      format!("factor={factor}"),
    )));
  }
  Ok(())
}

/// Guard an integer-input `*_by_factor` call before its lossy f64 division:
/// reject a `number` whose magnitude exceeds [`F64_EXACT_INT_MAX`] (the f64
/// round-trip would corrupt it) — see that constant. Normal frame-sized
/// inputs pass untouched.
fn check_factor_input(number: i64, op: &'static str) -> Result<()> {
  // `unsigned_abs` (not `i64::abs`) — `i64::MIN.abs()` itself overflows/panics,
  // and a hostile caller can pass `i64::MIN` (e.g. `FrameSampling::Fixed`).
  if number.unsigned_abs() > F64_EXACT_INT_MAX as u64 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      op,
      "must be within the f64 exact-integer range (|number| <= 2^53; the float division would \
         lose precision and corrupt the result)",
      format!("{number}"),
    )));
  }
  Ok(())
}

/// Bound the `smart_resize` `beta` (sqrt) scale path to the domain where the
/// f64 ratio is provably bit-exact with the python reference.
///
/// Python computes `beta`'s ratio as an EXACT arbitrary-precision `int / int`
/// (`(height * width) / max_pixels` for scale-down, `min_pixels / (height *
/// width)` for scale-up) and *then* takes `math.sqrt`. For huge operands that
/// `int / int -> float` is correctly-rounded straight from the true rational,
/// whereas a Rust `f64 / f64` double-rounds — each integer rounds to f64 first,
/// then the division rounds again — and can differ in the last ULP. Reproducing
/// python's single-rounding for arbitrary magnitudes would need a
/// correctly-rounded big-rational→f64 divider (risky, and out of scope per the
/// match-the-reference rule).
///
/// Instead we BOUND THE DOMAIN per branch: every integer in `[0, 2^53]` is
/// exactly f64-representable, so when BOTH operands of the *actual* division on
/// this branch are `<= 2^53` the `as f64` casts lose nothing and IEEE-754
/// division is correctly-rounded — i.e. the f64 ratio EQUALS python's
/// `int / int -> float` to the bit. Above `2^53` we reject with a recoverable
/// [`Error::OutOfRange`] naming the values rather than emit an
/// imprecisely-rounded size.
///
/// **Branch-specific operands** (this is why the helper is per-branch — a
/// single combined guard over-rejects):
/// - Scale-DOWN: `beta = sqrt((height * width) / max_pixels)`. The ratio
///   operands are `height * width` (numerator) and `max_pixels` (denominator);
///   `min_pixels` does NOT appear. Use [`check_beta_domain_down`].
/// - Scale-UP: `beta = sqrt(min_pixels / (height * width))`. The ratio operands
///   are `min_pixels` (numerator) and `height * width` (denominator);
///   `max_pixels` does NOT appear. Use [`check_beta_domain_up`]. A caller with
///   a small image, a huge `max_pixels` sentinel, and a normal `min_pixels`
///   (e.g. `smart_resize(1, 1, 28, 3136, i64::MAX)`) is correctly admitted to
///   the scale-up branch under this split — under a `max_pixels`-keyed guard
///   it would be over-rejected even though python returns `(56, 56)`.
///
/// The realistic video domain is unaffected: frame dims run in the thousands,
/// so `height * width` is in the millions — twenty-plus orders of magnitude
/// below `2^53` — and `max_pixels` / `min_pixels` are the
/// `16384 * 28 * 28 ≈ 1.3e7` / `4 * 28 * 28` budgets. Both guards therefore
/// always pass (and match python exactly) for real inputs; each fires only on
/// the pathological dims/budgets where its branch's f64 path could silently
/// diverge.
fn check_beta_domain_down(height: i64, width: i64, max_pixels: i64) -> Result<()> {
  // EXACT i128 product — `height`/`width` are validated positive `i64`, so
  // `height * width` can exceed `i64` (and is inexact in f64); i128 holds it
  // overflow-free and the comparison is exact.
  let area = (height as i128) * (width as i128);
  if area > F64_EXACT_INT_MAX as i128 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize (scale-down): height*width area (beta sqrt ratio operand)",
      "must be <= 2^53 (F64_EXACT_INT_MAX) so the f64 ratio is bit-exact with python's int/int",
      format!("{area}"),
    )));
  }
  if max_pixels > F64_EXACT_INT_MAX {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize (scale-down): max_pixels (beta sqrt ratio operand)",
      "must be <= 2^53 (F64_EXACT_INT_MAX) so the f64 ratio is bit-exact with python's int/int",
      format!("{max_pixels}"),
    )));
  }
  Ok(())
}

/// Scale-UP companion to [`check_beta_domain_down`]: the scale-up `beta` ratio
/// is `min_pixels / (height * width)`, so the operands to bound are
/// `min_pixels` and `area`, NOT `max_pixels`. See [`check_beta_domain_down`]'s
/// doc for the precision-rationale and the over-rejection case this split
/// avoids.
fn check_beta_domain_up(height: i64, width: i64, min_pixels: i64) -> Result<()> {
  let area = (height as i128) * (width as i128);
  if area > F64_EXACT_INT_MAX as i128 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize (scale-up): height*width area (beta sqrt ratio operand)",
      "must be <= 2^53 (F64_EXACT_INT_MAX) so the f64 ratio is bit-exact with python's int/int",
      format!("{area}"),
    )));
  }
  if min_pixels > F64_EXACT_INT_MAX {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize (scale-up): min_pixels (beta sqrt ratio operand)",
      "must be <= 2^53 (F64_EXACT_INT_MAX) so the f64 ratio is bit-exact with python's int/int",
      format!("{min_pixels}"),
    )));
  }
  Ok(())
}

/// Closest integer to `number / factor`, times `factor`.
///
/// Ports python `round_by_factor` (`video_generate.py:51-53`):
/// `round(number / factor) * factor`. Python's `round` is **round-half-to
/// -even** (banker's rounding), and this port reproduces that exactly via
/// an internal `round_half_to_even` helper so a `.5` quotient (e.g.
/// `round_by_factor(14, 28)` → `round(0.5) = 0`) matches the reference
/// rather than diverging like the swift `Int(round(...))`
/// (round-half-away-from-zero) would.
///
/// # Errors
/// [`Error::OutOfRange`] for an adversarial near-`i64::MAX` `number`: the
/// f64 division/`* factor` cannot represent it faithfully (precision loss
/// past ±2^53 or `i64` product overflow). The reference assumes frame-sized
/// inputs and never trips this; we surface it recoverably rather than panic
/// (debug) or silently wrap to a small valid-looking size (release).
pub fn round_by_factor(number: i64, factor: i64) -> Result<i64> {
  check_positive_factor(factor, "round")?;
  check_factor_input(number, "round")?;
  factor_product(
    round_half_to_even_f(number as f64 / factor as f64),
    factor,
    "round",
  )
}

/// Smallest multiple of `factor` that is `>= number`.
///
/// Ports python `ceil_by_factor` (`video_generate.py:56-58`):
/// `ceil(number / factor) * factor`.
///
/// # Errors
/// [`Error::OutOfRange`] when `number` cannot be faithfully scaled in f64
/// (precision loss past ±2^53 or `i64` product overflow) — see
/// [`round_by_factor`].
pub fn ceil_by_factor(number: i64, factor: i64) -> Result<i64> {
  check_positive_factor(factor, "ceil")?;
  check_factor_input(number, "ceil")?;
  factor_product((number as f64 / factor as f64).ceil(), factor, "ceil")
}

/// Largest multiple of `factor` that is `<= number`.
///
/// Ports python `floor_by_factor` (`video_generate.py:61-63`):
/// `floor(number / factor) * factor`.
///
/// # Errors
/// [`Error::OutOfRange`] when `number` cannot be faithfully scaled in f64
/// (precision loss past ±2^53 or `i64` product overflow) — see
/// [`round_by_factor`].
pub fn floor_by_factor(number: i64, factor: i64) -> Result<i64> {
  check_positive_factor(factor, "floor")?;
  check_factor_input(number, "floor")?;
  factor_product((number as f64 / factor as f64).floor(), factor, "floor")
}

/// Float-input variant of [`ceil_by_factor`]: `ceil(number / factor) *
/// factor` with a real-valued `number`. Used by [`smart_resize`]'s
/// scale-up branch, where python calls `ceil_by_factor(height * beta,
/// factor)` with the FLOAT `height * beta` (`video_generate.py:92`).
/// Kept separate from the integer entry so the public `ceil_by_factor`
/// stays the faithful 1:1 of the integer call sites
/// (`smart_nframes`/`fetch_video`). Product overflow is caught by
/// [`factor_product`].
fn ceil_by_factor_f(number: f64, factor: i64) -> Result<i64> {
  factor_product((number / factor as f64).ceil(), factor, "ceil")
}

/// Float-input variant of [`floor_by_factor`]: `floor(number / factor) *
/// factor` with a real-valued `number`. Used by [`smart_resize`]'s
/// scale-down branch (python `floor_by_factor(height / beta, factor)`
/// with the FLOAT `height / beta`, `video_generate.py:88`). Product overflow
/// is caught by [`factor_product`].
fn floor_by_factor_f(number: f64, factor: i64) -> Result<i64> {
  factor_product((number / factor as f64).floor(), factor, "floor")
}

/// Round half to even (banker's rounding) — the behavior of python's
/// built-in `round()` on a single float, which `round_by_factor` relies
/// on. `round(0.5) == 0`, `round(1.5) == 2`, `round(2.5) == 2`,
/// `round(-0.5) == 0`. Rust's `f64::round` rounds half *away from zero*
/// (`0.5_f64.round() == 1.0`), so we cannot use it directly for parity.
///
/// Returns the rounded value as an `f64` (still integral) so callers that
/// must multiply by a factor can range-check before the `i64` cast — see
/// [`factor_product`]. The exactly-`.5` tie-break inspects the floor's
/// integer parity; for a non-finite or out-of-`i64`-range floor that parity
/// is meaningless, so we treat such inputs as "not a tie" and round toward
/// the floor, letting [`factor_product`]'s range-check reject them.
fn round_half_to_even_f(x: f64) -> f64 {
  let floor = x.floor();
  let diff = x - floor;
  if diff < 0.5 {
    floor
  } else if diff > 0.5 {
    floor + 1.0
  } else {
    // Exactly .5 — pick the even neighbor. Parity is only meaningful for a
    // floor that fits i64; outside that, fall through to `floor` (the value
    // is rejected downstream by `factor_product` anyway).
    if floor.is_finite() && floor >= i64::MIN as f64 && floor < F64_TWO_POW_63 {
      let floor_i = floor as i64;
      if floor_i % 2 == 0 { floor } else { floor + 1.0 }
    } else {
      floor
    }
  }
}

/// `i64`-returning banker's rounding for the frame-index picker, where the
/// input is always a bounded `linspace` point in `[0, total_frames - 1]`
/// (no factor multiply, so no overflow path). Thin wrapper over
/// [`round_half_to_even_f`].
fn round_half_to_even(x: f64) -> i64 {
  round_half_to_even_f(x) as i64
}

/// Compute a factor-aligned `(height, width)` that preserves aspect ratio
/// while keeping the total pixel count inside `[min_pixels, max_pixels]`.
///
/// Ports python `smart_resize` (`video_generate.py:66-94`); cross-checked
/// against swift `QwenVL.targetSize` (`QwenVL.swift:123-173`). The math:
///
/// 1. Reject when `max(h, w) / min(h, w) > MAX_RATIO` (integer division,
///    matching the python `max(...) / min(...)` truncation only where both
///    references agree — see below).
/// 2. `h_bar = max(factor, round_by_factor(h, factor))`,
///    `w_bar = max(factor, round_by_factor(w, factor))`.
/// 3. If `h_bar * w_bar > max_pixels`, scale **down** by
///    `beta = sqrt(h*w / max_pixels)` and `floor_by_factor`.
/// 4. Else if `h_bar * w_bar < min_pixels`, scale **up** by
///    `beta = sqrt(min_pixels / (h*w))` and `ceil_by_factor`.
///
/// # `beta` (sqrt) path exactness — bounded domain (branch-specific)
/// Python forms each `beta` ratio as an EXACT arbitrary-precision `int / int`
/// (then `math.sqrt`). This port's `beta` is **bit-exact** with the reference
/// **only while the operands of *that branch's* division are `<= 2^53`** (every
/// operand is then f64-representable and IEEE division is correctly-rounded, so
/// the f64 ratio equals python's `int / int -> float` to the bit):
/// - Scale-DOWN bound: `height * width <= 2^53` AND `max_pixels <= 2^53`
///   (operands of `(height * width) / max_pixels`).
/// - Scale-UP bound: `height * width <= 2^53` AND `min_pixels <= 2^53`
///   (operands of `min_pixels / (height * width)`).
///
/// The bounds are **per-branch on purpose**: a single combined `max_pixels`
/// guard would over-reject scale-up — e.g. `smart_resize(1, 1, 28, 3136,
/// i64::MAX)` (small image + huge `max_pixels` sentinel + normal `min_pixels`)
/// enters scale-up, where the actual ratio is `3136 / 1` (both far below
/// `2^53`), so it must succeed and return `(56, 56)` like python. Inputs above
/// the relevant bound are **rejected** with [`Error::OutOfRange`] rather
/// than computed imprecisely — a naive `f64 / f64` would double-round and
/// could diverge in the last bit there, and a correctly-rounded big-rational
/// divider is out of scope. Every realistic frame size (dims in the thousands,
/// area in the millions) is far below both bounds and so always matches python
/// exactly.
///
/// # Aspect-ratio check fidelity
/// The python reference computes `max(h, w) / min(h, w)` in **float**
/// (true division) and compares to `MAX_RATIO`. The swift reference uses
/// **integer** division (`max / min > 200`). We follow the python float
/// comparison (this is an mlx-vlm port) so a ratio of, say, `200.5`
/// is correctly rejected where integer division would truncate it to
/// `200` and pass.
///
/// # Errors
/// - [`Error::OutOfRange`] if `height <= 0` or `width <= 0` (the
///   reference assumes positive dims; we guard so the `min`/`max` and the
///   `sqrt` domain are well-defined).
/// - [`Error::OutOfRange`] if `factor <= 0` (division by / alignment to
///   a non-positive factor is undefined).
/// - [`Error::OutOfRange`] if `max_pixels <= 0` or `min_pixels < 0`;
///   [`Error::InvariantViolation`] if `min_pixels > max_pixels`
///   (the reference's pixel budget must be a valid non-empty interval;
///   the `sqrt` arguments must be non-negative).
/// - [`Error::OutOfRange`] if `max(h, w) / min(h, w) > MAX_RATIO`
///   (mirrors the python `raise ValueError`).
/// - [`Error::OutOfRange`] if the budget **cannot contain a positive
///   factor-aligned size** — i.e. `max_pixels < factor * factor` (the
///   smallest legal output, a `factor × factor` square, already overflows
///   the budget) or the scale-down branch floors a side to a non-positive
///   multiple of `factor`. The python reference omits this guard and
///   silently returns `(0, 0)` for an impossible budget such as
///   `smart_resize(28, 28, 28, 1, 1)` (the `min`/`max` math has no positive
///   factor-aligned solution); we surface it recoverably, naming the
///   budget, rather than emit a zero dimension. A *positive* result whose
///   area lands a single factor-step outside `[min_pixels, max_pixels]` is
///   **kept**, not rejected: the reference's `floor_by_factor` /
///   `ceil_by_factor` do not re-clamp, so re-clamping here would diverge
///   from the python output.
/// - [`Error::OutOfRange`] if a rescale (`beta`) is needed but the operands
///   of *that branch's* division exceed `2^53` — see the *`beta` (sqrt) path
///   exactness* note above for the per-branch bounds (scale-down: area /
///   `max_pixels`; scale-up: area / `min_pixels`). The f64 ratio cannot be
///   guaranteed bit-exact with python's `int / int` past that range, so the
///   value is rejected rather than computed imprecisely. (Only reached on the
///   scale-up / scale-down branches; a within-budget input that needs no
///   rescale never trips it.)
pub fn smart_resize(
  height: i64,
  width: i64,
  factor: i64,
  min_pixels: i64,
  max_pixels: i64,
) -> Result<(i64, i64)> {
  if height <= 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize: height",
      "must be > 0",
      format!("{height}"),
    )));
  }
  if width <= 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize: width",
      "must be > 0",
      format!("{width}"),
    )));
  }
  if factor <= 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize: factor",
      "must be > 0",
      format!("{factor}"),
    )));
  }
  if max_pixels <= 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize: max_pixels",
      "must be > 0",
      format!("{max_pixels}"),
    )));
  }
  if min_pixels < 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize: min_pixels",
      "must be >= 0",
      format!("{min_pixels}"),
    )));
  }
  if min_pixels > max_pixels {
    return Err(Error::InvariantViolation(InvariantViolationPayload::new(
      "smart_resize: min_pixels vs max_pixels",
      "min_pixels must be <= max_pixels",
    )));
  }
  // The smallest legal output is a `factor × factor` square (both sides are
  // pinned to at least `factor` by the `max(factor, ...)` guards below).
  // If even that minimal cell overflows `max_pixels`, no positive
  // factor-aligned size fits the budget; the python reference would scale
  // down to `(0, 0)`. Reject up front, naming the budget.
  //
  // EXACT i128 arithmetic — NOT f64. `factor` can be any positive `i64`, so
  // `factor * factor` can overflow `i64` (debug panic / release wrap) and is
  // *inexact* in f64 (the 53-bit mantissa collapses adjacent values: e.g.
  // `factor=3_037_000_499` and `max_pixels=factor*factor-1` both round to the
  // same f64, silently bypassing the guard and returning an over-budget size).
  // `i128` represents every `i64²` product exactly and overflow-free, so the
  // comparison and the error message are both exact for any positive `i64`.
  let min_cell = (factor as i128) * (factor as i128);
  if min_cell > max_pixels as i128 {
    // EXACT i128 telemetry: `min_cell` can be up to `i64::MAX * i64::MAX`
    // (~2^126), which truncates to garbage if cast to u64 — e.g. `factor =
    // 1 << 32` yields `min_cell = 2^64`, and `as u64` collapses it to 0.
    // Carry both values through `OutOfRangePayload`'s `SmolStr` so the
    // typed observed value preserves the i128 magnitude exactly.
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize: smallest factor-aligned size (factor x factor) vs max_pixels budget",
      "factor*factor must be <= max_pixels",
      format!("min_cell={min_cell}, cap={max_pixels}"),
    )));
  }

  let hi = height.max(width) as f64;
  let lo = height.min(width) as f64;
  if hi / lo > MAX_RATIO as f64 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_resize: absolute aspect ratio (max(h,w) / min(h,w))",
      "must be < MAX_RATIO",
      format!("{} (MAX_RATIO={MAX_RATIO})", hi / lo),
    )));
  }

  // Compute the factor-aligned bars BEFORE the branch decision. The
  // `*_by_factor` helpers are overflow-safe (`?`-propagating
  // [`Error::OutOfRange`]): an adversarial near-`i64::MAX` `height`/
  // `width` with `factor=28` would otherwise panic (debug) or wrap negative
  // (release) inside `round_by_factor`, after which `factor.max(..)` would
  // promote the wrapped value to a small valid-looking size and the late
  // `checked_mul` would never see the overflow. Surfacing it here, before
  // any `h_bar * w_bar` comparison, closes that silent-corruption path.
  let mut h_bar = factor.max(round_by_factor(height, factor)?);
  let mut w_bar = factor.max(round_by_factor(width, factor)?);

  // python branches on `h_bar * w_bar > max_pixels` / `< min_pixels` with
  // arbitrary-precision ints — i.e. EXACT integer comparisons. Mirror that
  // with `i128` (NOT f64): both bars are positive multiples of `factor` here,
  // so their product can exceed `i64` and is inexact in f64 (the 53-bit
  // mantissa collapses adjacent integers, which could pick the wrong branch
  // or skip a needed rescale on an adversarial budget). `i128` holds every
  // `i64 * i64` exactly and overflow-free.
  let bar_area = (h_bar as i128) * (w_bar as i128);
  if bar_area > max_pixels as i128 {
    // SCALE DOWN — python: beta = sqrt((height * width) / max_pixels), then
    // floor_by_factor(height / beta, factor). `beta`'s sqrt is the only float
    // math, mirroring python's `math.sqrt(...)`. Bound the f64 domain of THIS
    // branch's actual operands — `height * width` (numerator) and `max_pixels`
    // (denominator); `min_pixels` does not appear in the scale-down ratio — so
    // the division is bit-exact with the reference (see
    // `check_beta_domain_down`).
    check_beta_domain_down(height, width, max_pixels)?;
    let hw = height as f64 * width as f64;
    let beta = (hw / max_pixels as f64).sqrt();
    // python: floor_by_factor(height / beta, factor) where the argument is
    // the FLOAT `height / beta` — use the float-input helper so we don't
    // truncate before the `floor(_ / factor)` (a double-floor would diverge).
    h_bar = floor_by_factor_f(height as f64 / beta, factor)?;
    w_bar = floor_by_factor_f(width as f64 / beta, factor)?;
  } else if bar_area < min_pixels as i128 {
    // SCALE UP — python: beta = sqrt(min_pixels / (height * width)), then
    // ceil_by_factor(height * beta, factor). Bound the f64 domain of THIS
    // branch's actual operands — `min_pixels` (numerator) and `height * width`
    // (denominator); `max_pixels` does not appear in the scale-up ratio, so a
    // huge `max_pixels` sentinel must NOT over-reject here (e.g.
    // `smart_resize(1, 1, 28, 3136, i64::MAX)` is legitimate scale-up that
    // python returns `(56, 56)` for). See `check_beta_domain_up`.
    check_beta_domain_up(height, width, min_pixels)?;
    let hw = height as f64 * width as f64;
    let beta = (min_pixels as f64 / hw).sqrt();
    h_bar = ceil_by_factor_f(height as f64 * beta, factor)?;
    w_bar = ceil_by_factor_f(width as f64 * beta, factor)?;
  }

  // Validate the result is a *positive* factor-aligned size whose pixel
  // product is representable (overflow-safe `checked_mul`). The `*_by_factor`
  // helpers keep both sides multiples of `factor`, but a tight `max_pixels`
  // paired with an extreme aspect ratio can still floor one side to 0 in the
  // scale-down branch (e.g. height=537, width=4962, factor=28, max=1646 ->
  // (0, 112)), which the early `min_cell` guard does not catch. A non-positive
  // dimension is the degenerate `(0, 0)`-style result an impossible budget
  // produces, so reject it instead of returning a zero / out-of-budget dim.
  //
  // We deliberately do NOT reject a positive result whose area merely sits a
  // rounding step *outside* `[min_pixels, max_pixels]`: the reference applies
  // `floor_by_factor` / `ceil_by_factor` *without* re-clamping, so on many
  // satisfiable budgets it faithfully returns a positive size whose product is
  // marginally below `min_pixels` (scale-down floor) or above `max_pixels`
  // (scale-up ceil). Erroring on those would diverge from the python output we
  // are porting (verified: ~0.2% of valid inputs land just outside the band by
  // one factor step). Faithfulness requires accepting them.
  let representable = h_bar.checked_mul(w_bar).is_some();
  if h_bar <= 0 || w_bar <= 0 || !representable {
    return Err(Error::InvariantViolation(InvariantViolationPayload::new(
      "smart_resize: computed (h_bar, w_bar)",
      "no positive factor-aligned size fits budget [min_pixels, max_pixels] for the given input \
         (either an axis floored to <= 0 in the scale-down branch or h_bar*w_bar overflows i64)",
    )));
  }
  Ok((h_bar, w_bar))
}

/// How to choose the number of frames to sample: either a fixed count or
/// an fps-driven budget. Mirrors the two mutually-exclusive branches of
/// python `smart_nframes` (`video_generate.py:161-192`), which asserts
/// `not ("fps" in ele and "nframes" in ele)`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum FrameSampling {
  /// Fixed frame count — python `ele["nframes"]` branch
  /// (`video_generate.py:173-174`). Rounded to a multiple of
  /// [`FRAME_FACTOR`].
  Fixed {
    /// Requested frame count (pre factor-rounding).
    nframes: i64,
  },
  /// fps-driven budget — python `ele.get("fps", FPS)` branch
  /// (`video_generate.py:176-187`).
  Fps {
    /// Target sampling rate (python `ele.get("fps", FPS)`). Use [`FPS`]
    /// for the reference default.
    fps: f64,
    /// Floor on the result before factor-rounding (python
    /// `ele.get("min_frames", FPS_MIN_FRAMES)`, then `ceil_by_factor`).
    /// Use [`FPS_MIN_FRAMES`] for the reference default.
    min_frames: i64,
    /// Ceiling on the result before factor-rounding (python
    /// `ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames))`, then
    /// `floor_by_factor`). `None` selects the reference default
    /// `min(FPS_MAX_FRAMES, total_frames)`.
    max_frames: Option<i64>,
  },
}

impl Default for FrameSampling {
  /// The python default path: fps-driven with [`FPS`] /
  /// [`FPS_MIN_FRAMES`] and the `max_frames` default
  /// (`min(FPS_MAX_FRAMES, total_frames)`).
  fn default() -> Self {
    Self::Fps {
      fps: FPS,
      min_frames: FPS_MIN_FRAMES,
      max_frames: None,
    }
  }
}

/// Compute how many frames to sample from a video as model input.
///
/// Ports python `smart_nframes` (`video_generate.py:161-192`) and is
/// cross-checked against the swift `estimatedFrames = round(fps *
/// duration); min(_, maxFrames); max(_, 1)` clamp in
/// `MediaProcessing._asProcessedSequence` (`MediaProcessing.swift:
/// 423-480`).
///
/// - [`FrameSampling::Fixed`]: `round_by_factor(nframes, FRAME_FACTOR)`.
/// - [`FrameSampling::Fps`]: `nframes = total_frames / video_fps * fps`,
///   then clamped to `[min_frames, max_frames]` (each factor-rounded —
///   `min` via `ceil_by_factor`, `max` via `floor_by_factor`) **and** to
///   `total_frames`, then `floor_by_factor(_, FRAME_FACTOR)`.
///
/// Both branches finally assert `FRAME_FACTOR <= nframes <= total_frames`
/// (python raises `ValueError`; we return [`Error::OutOfRange`]).
///
/// # Errors
/// - [`Error::OutOfRange`] if `total_frames <= 0` or `video_fps <= 0`
///   (the python flow assumes a real, non-empty clip; a zero `video_fps`
///   would divide-by-zero in the fps branch).
/// - [`Error::OutOfRange`] if the `fps` is non-positive (the budget
///   `total/video_fps*fps` would be non-positive).
/// - [`Error::OutOfRange`] if the resulting count falls outside
///   `[FRAME_FACTOR, total_frames]` — mirrors the python final check.
pub fn smart_nframes(sampling: FrameSampling, total_frames: i64, video_fps: f64) -> Result<i64> {
  if total_frames <= 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_nframes: total_frames",
      "must be > 0",
      format!("{total_frames}"),
    )));
  }
  let nframes = match sampling {
    // Overflow-safe `?`: an adversarial near-`i64::MAX` `nframes` would
    // otherwise panic (debug) / wrap (release) inside `round_by_factor`.
    FrameSampling::Fixed { nframes } => round_by_factor(nframes, FRAME_FACTOR)?,
    FrameSampling::Fps {
      fps,
      min_frames,
      max_frames,
    } => {
      // Reject non-positive AND non-finite (NaN/inf) rates: a zero
      // `video_fps` divides-by-zero and a non-positive/NaN budget would
      // break the clamp + the final `[FRAME_FACTOR, total_frames]` check.
      if !video_fps.is_finite() {
        return Err(Error::NonFiniteScalar(NonFiniteScalarPayload::new(
          "smart_nframes: video_fps",
          video_fps,
        )));
      }
      if video_fps <= 0.0 {
        return Err(Error::OutOfRange(OutOfRangePayload::new(
          "smart_nframes: video_fps",
          "must be > 0",
          format!("{video_fps}"),
        )));
      }
      if !fps.is_finite() {
        return Err(Error::NonFiniteScalar(NonFiniteScalarPayload::new(
          "smart_nframes: fps",
          fps,
        )));
      }
      if fps <= 0.0 {
        return Err(Error::OutOfRange(OutOfRangePayload::new(
          "smart_nframes: fps",
          "must be > 0",
          format!("{fps}"),
        )));
      }
      // Overflow-safe `?` on every factor-rounding (the clamp bounds and the
      // final round): an adversarial `min_frames`/`max_frames`/derived count
      // near `i64::MAX` would otherwise panic/wrap in the `*_by_factor` math.
      let min_frames = ceil_by_factor(min_frames, FRAME_FACTOR)?;
      // python: max_frames default = min(FPS_MAX_FRAMES, total_frames).
      let max_frames_raw = max_frames.unwrap_or_else(|| FPS_MAX_FRAMES.min(total_frames));
      let max_frames = floor_by_factor(max_frames_raw, FRAME_FACTOR)?;
      // python: nframes = total_frames / video_fps * fps   (float)
      let raw = total_frames as f64 / video_fps * fps;
      // python: min(min(max(nframes, min_frames), max_frames), total_frames)
      // performed in float, THEN floor_by_factor. The clamp pins the result
      // into [min_frames, total_frames] before the (overflow-checked) round.
      let clamped = raw
        .max(min_frames as f64)
        .min(max_frames as f64)
        .min(total_frames as f64);
      floor_by_factor(clamped.floor() as i64, FRAME_FACTOR)?
    }
  };
  if !(FRAME_FACTOR <= nframes && nframes <= total_frames) {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "smart_nframes: nframes",
      "must be in [FRAME_FACTOR, total_frames]",
      format!("nframes={nframes} (FRAME_FACTOR={FRAME_FACTOR}, total_frames={total_frames})"),
    )));
  }
  Ok(nframes)
}

/// The `nframes` source-frame indices to sample, evenly spread across
/// `[0, total_frames - 1]`.
///
/// Ports python `np.linspace(0, total_frames - 1, nframes).round().
/// astype(int)` (`video_generate.py:217`). This is the index picker the
/// caller uses to decide *which* decoded frames to hand to
/// [`process_frames`]; it is codec-free (decoding stays the caller's job).
///
/// numpy computes `linspace` as `step = (stop - start) / (num - 1)` FIRST,
/// then point `i` as `start + step * i`, and finally (because the default
/// `endpoint=True`) overwrites the LAST sample with `stop` exactly. We
/// replicate that operation order bit-for-bit rather than the algebraically
/// equal `(n-1) * i / (k-1)`: the intermediate `step` changes which float
/// ties land exactly on `.5` before banker's rounding, so the orders are
/// NOT interchangeable. Example: `total_frames = 26, nframes = 23`, the
/// midpoint `i = 11` is `12.500000000000002` the numpy way (rounds to 13)
/// but exactly `12.5` the fused way (banker's-rounds to 12) — a silently
/// wrong frame. The single-sample `k == 1` case yields `[0]` (numpy's
/// `linspace` with `num=1` returns just the start). Each value is rounded
/// **half-to-even** (numpy's `ndarray.round`) and clamped into `[0,
/// total_frames - 1]` for safety against float-edge overshoot.
///
/// # Errors
/// - [`Error::OutOfRange`] if `total_frames <= 0` or `nframes <= 0`.
/// - [`Error::OutOfMemory`] if the `nframes`-long index buffer cannot be
///   allocated (request-scaled; surfaced recoverably rather than aborting).
pub fn sample_frame_indices(total_frames: i64, nframes: i64) -> Result<Vec<i64>> {
  if total_frames <= 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "sample_frame_indices: total_frames",
      "must be > 0",
      format!("{total_frames}"),
    )));
  }
  if nframes <= 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "sample_frame_indices: nframes",
      "must be > 0",
      format!("{nframes}"),
    )));
  }
  let n = nframes as usize;
  let mut out = crate::error::try_with_capacity::<i64>(n)?;
  let stop = (total_frames - 1) as f64;
  if nframes == 1 {
    // numpy linspace(start, stop, num=1) -> [start]
    out.push(0);
    return Ok(out);
  }
  // numpy: step = (stop - start) / (num - 1), computed BEFORE the per-point
  // multiply (start == 0 here). Reusing this single `step` for every point
  // reproduces numpy's float ties exactly (see the doc comment / the
  // `total_frames=26, nframes=23` regression).
  let step = stop / (nframes - 1) as f64;
  for i in 0..nframes {
    // endpoint=True: numpy overwrites the final sample with `stop` exactly
    // rather than trusting `step * (n-1)` to land on it.
    let v = if i == nframes - 1 {
      stop
    } else {
      step * (i as f64)
    };
    let idx = round_half_to_even(v).clamp(0, total_frames - 1);
    out.push(idx);
  }
  Ok(out)
}

/// Apply the cross-model image [`preprocess`]
/// to each already-decoded `frame` and stack into `[T, H, W, 3]`.
///
/// Ports the per-frame resize+stack body of python `fetch_video`
/// (`video_generate.py:270-282`): each frame is resized + converted to a
/// float tensor and the frames are `np.stack(..., axis=0)`'d. We reuse the
/// existing cross-model [`preprocess`] (so
/// the resize/rescale/normalize is byte-identical to single-image VLM
/// preprocessing) and [`stack`] along a new
/// leading time axis.
///
/// **Decoding is the caller's responsibility** (see the module doc): pass
/// the frames you have already decoded. To choose *which* source frames to
/// decode, use [`smart_nframes`] + [`sample_frame_indices`]; to choose the
/// resize target, use [`smart_resize`] and set `cfg.size` accordingly
/// before calling.
///
/// **Layout:** channel-last `[T, H, W, 3]` (the natural stack of the
/// channel-last `[H, W, 3]` [`preprocess`]
/// output). The python planar `[T, C, H, W]` (`video_generate.py:282`) is
/// a per-model contract reachable in one lazy `transpose_axes(&[0, 3, 1,
/// 2])`; see the module `Channel layout` block.
///
/// **`cfg.layout` constraint — only [`Layout::Hwc`] is currently
/// supported.** `process_frames` produces a channel-last `[T, H, W, 3]`
/// stack and the *video-tensor* analogues of [`Layout::Chw`] /
/// [`Layout::Bchw`] (i.e. whether a video tensor should be
/// `[T, 3, H, W]`, `[1, T, 3, H, W]`, or something else) are not yet
/// pinned to a per-model contract, so applying [`Layout`] **per-frame**
/// here would silently break the stack contract above. Passing
/// `cfg.layout != Layout::Hwc` therefore returns
/// [`Error::Backend`] rather than producing a misleading shape. Callers
/// that need a planar video tensor can post-process the returned
/// `[T, H, W, 3]` themselves (one lazy `transpose_axes(&[0, 3, 1, 2])`)
/// until a future PR defines first-class video-layout semantics.
///
/// **No implicit eval:** every frame composes lazily; the returned
/// `Array` is un-evaluated.
///
/// # Errors
/// - [`Error::EmptyInput`] if `frames` is empty (python `np.stack`
///   raises on an empty sequence; the swift `_asProcessedSequence` has a
///   `precondition(videoFrames.isEmpty == false)`).
/// - [`Error::Backend`] if `cfg.layout != Layout::Hwc` (see the
///   `cfg.layout` constraint above).
/// - Any error from [`preprocess`] on a
///   frame (dtype/shape/allocation) is propagated.
/// - [`Error::OutOfMemory`] if the per-frame `Array` handle vector cannot
///   be allocated (request-scaled).
pub fn process_frames(
  frames: &[::image::DynamicImage],
  cfg: &ImageProcessorConfig,
) -> Result<Array> {
  if frames.is_empty() {
    return Err(Error::EmptyInput(EmptyInputPayload::new(
      "process_frames: frames",
    )));
  }
  // Reject non-Hwc per-frame layouts. Applying a planar `Layout` per
  // frame here would silently break the documented `[T, H, W, 3]` stack
  // contract (e.g. `Layout::Chw` would yield `[T, 3, H, W]` and
  // `Layout::Bchw` would yield a rank-5 `[T, 1, 3, H, W]` that matches
  // no standard video layout). Video-tensor layout semantics are not
  // yet defined; callers wanting planar output should post-process the
  // returned `[T, H, W, 3]` themselves.
  if cfg.layout() != Layout::Hwc {
    return Err(Error::UnknownEnumValue(UnknownEnumValuePayload::new(
      "process_frames: cfg.layout (video processing currently only supports Layout::Hwc; \
         video-tensor layout for Chw/Bchw is not yet defined — use Layout::Hwc and post-process \
         the returned [T, H, W, 3] if a planar shape is needed)",
      cfg.layout().as_str(),
      &["hwc"],
    )));
  }
  // Preprocess every frame to a channel-last [H, W, 3] f32 Array. The
  // owned `Array`s must outlive the `&Array` borrow slice handed to
  // `stack`, so collect them first (request-scaled → recoverable alloc).
  let mut processed = crate::error::try_with_capacity::<Array>(frames.len())?;
  for frame in frames {
    processed.push(preprocess(frame, cfg)?);
  }
  // `stack` borrows each Array; build the borrow slice (request-scaled).
  let mut refs = crate::error::try_with_capacity::<&Array>(processed.len())?;
  for a in &processed {
    refs.push(a);
  }
  stack(&refs)
}