lfm 0.1.0

Rust ONNX inference for LiquidAI LFM2.5-VL (vision-language) models — implements the engine-agnostic llmtask::Task contract via llguidance for schema-constrained sampling
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
//! Image preprocessing for LFM2.5-VL. Wasm-compatible.
//!
//! Per spec §6.4: vision_encoder.onnx takes
//! pre-patchified `[N_batch, num_patches, 768]` (NOT image-shaped).
//! Patch-level padding via attention_mask, NOT pixel-level.

#[cfg(all(feature = "decoders", not(target_arch = "wasm32")))]
use std::path::Path;

use image::DynamicImage;

use crate::{
  error::{Error, Result},
  options::ImageBudget,
};

pub mod tile_grid;
pub use tile_grid::TileGrid;
mod target_ratios;
use tile_grid::{PATCH_SIZE, TILE_PIXEL_UNIT};

/// Image preprocessor for LFM2.5-VL.
///
/// Cheap to copy (`Copy`); holds only the tiling/token budget. All heavy
/// work happens per call in [`Preprocessor::preprocess`].
#[derive(Debug, Clone, Copy)]
pub struct Preprocessor {
  // Budget consulted by `pick_tile_grid` on every preprocess call.
  budget: ImageBudget,
}

impl Preprocessor {
  /// Construct with the given image budget.
  pub fn new(budget: ImageBudget) -> Self {
    Self { budget }
  }

  /// Returns the budget this preprocessor was constructed with.
  pub fn budget(&self) -> &ImageBudget {
    &self.budget
  }

  /// Single-image preprocess.
  pub fn preprocess(&self, image: &DynamicImage) -> Result<PreprocessedImage> {
    self.budget.validate()?;
    let grid = tile_grid::pick_tile_grid(image.width(), image.height(), &self.budget)?;
    flatten_to_patches(image, &grid)
  }

  /// Multi-image convenience.
  pub fn preprocess_batch(&self, images: &[DynamicImage]) -> Result<Vec<PreprocessedImage>> {
    images.iter().map(|i| self.preprocess(i)).collect()
  }

  /// Path-based convenience with EXIF orientation correction.
  #[cfg(all(feature = "decoders", not(target_arch = "wasm32")))]
  #[cfg_attr(
    docsrs,
    doc(cfg(all(feature = "decoders", not(target_arch = "wasm32"))))
  )]
  pub fn preprocess_path(&self, path: &Path) -> Result<PreprocessedImage> {
    let img = decode_with_orientation(path)?;
    self.preprocess(&img)
  }
}

/// Output of `Preprocessor::preprocess` — directly fed to `vision_encoder.run`.
///
/// LAYOUT:
/// - `pixel_values`: `[N_batch, num_patches, 768]` flattened (NOT image-shaped).
///   768 = 16² × 3 = patch_size² × channels.
/// - `pixel_attention_mask`: `[N_batch, num_patches]` — 1 = valid, 0 = padded.
/// - `spatial_shapes`: `[N_batch, 2]` — (h_patches, w_patches) per entry.
#[derive(Debug, Clone)]
pub struct PreprocessedImage {
  // Flattened `[N_batch * num_patches * 768]` f32s, normalized to [-1, 1].
  pixel_values: Vec<f32>,
  // Flattened `[N_batch * num_patches]`; 1 = real patch, 0 = padding.
  pixel_attention_mask: Vec<i64>,
  // Flattened `[N_batch * 2]`: (h_patches, w_patches) per batch entry.
  spatial_shapes: Vec<i64>,
  // Number of batch entries = main tiles + optional thumbnail.
  batch_size: usize,
  // Padded per-entry patch count (second tensor dim), = max_h * max_w.
  patches_per_entry: usize,
  // Main tile-grid dimensions (rows × cols of tiles).
  rows: u32,
  cols: u32,
  // Pixel size (h, w) of one main tile.
  main_tile_h: u32,
  main_tile_w: u32,
  // Pixel size (h, w) of the thumbnail tile, if one was appended.
  thumbnail_size: Option<(u32, u32)>,
  // `<image>` tokens emitted per main tile in the chat template.
  tokens_per_main_tile: usize,
  // `<image>` tokens for the thumbnail (None when no thumbnail).
  thumbnail_tokens: Option<usize>,
}

impl PreprocessedImage {
  /// Pre-patchified pixel values `[N_batch * num_patches * 768]`.
  pub fn pixel_values(&self) -> &[f32] {
    self.pixel_values.as_slice()
  }

  /// Per-patch attention mask `[N_batch * num_patches]` (1=valid, 0=padded).
  pub fn pixel_attention_mask(&self) -> &[i64] {
    self.pixel_attention_mask.as_slice()
  }

  /// Per-entry (h_patches, w_patches) `[N_batch * 2]`.
  pub fn spatial_shapes(&self) -> &[i64] {
    self.spatial_shapes.as_slice()
  }

  /// Number of batch entries (= number of tiles incl. thumbnail).
  pub fn batch_size(&self) -> usize {
    self.batch_size
  }

  /// Per-batch-entry padded num_patches (the tensor's second dim).
  pub fn patches_per_entry(&self) -> usize {
    self.patches_per_entry
  }

  /// Total number of tiles (main + thumbnail).
  pub fn num_tiles(&self) -> usize {
    let main = (self.rows * self.cols) as usize;
    if self.thumbnail_size.is_some() {
      main + 1
    } else {
      main
    }
  }

  /// Main tile-grid rows (1 in single-tile path).
  pub fn rows(&self) -> usize {
    self.rows as usize
  }

  /// Main tile-grid cols (1 in single-tile path).
  pub fn cols(&self) -> usize {
    self.cols as usize
  }

  /// (h, w) of one main tile.
  pub fn main_tile_size(&self) -> (usize, usize) {
    (self.main_tile_h as usize, self.main_tile_w as usize)
  }

  /// (h, w) of the thumbnail tile if present.
  pub fn thumbnail_size(&self) -> Option<(usize, usize)> {
    match self.thumbnail_size {
      Some((h, w)) => Some((h as usize, w as usize)),
      None => None,
    }
  }

  /// Tokens per main tile (256 in multi-tile path; dynamic in single-tile).
  pub fn tokens_per_main_tile(&self) -> usize {
    self.tokens_per_main_tile
  }

  /// Tokens for the thumbnail tile (None when no thumbnail).
  pub fn thumbnail_tokens(&self) -> Option<usize> {
    self.thumbnail_tokens
  }

  /// Total `<image>` tokens in the chat template after expansion.
  pub fn num_image_tokens(&self) -> usize {
    let main = self.rows() * self.cols() * self.tokens_per_main_tile;
    match self.thumbnail_tokens {
      Some(t) => main + t,
      None => main,
    }
  }

  /// Build an [`crate::chat_template::ImagePlaceholderInfo`] for use with
  /// `expand_image_placeholders` — bridges this preproc output to the
  /// chat-template module's grid-info struct.
  pub fn to_placeholder_info(&self) -> crate::chat_template::ImagePlaceholderInfo {
    let (rows, cols) = (self.rows(), self.cols());
    crate::chat_template::ImagePlaceholderInfo::new(
      rows,
      cols,
      self.tokens_per_main_tile,
      self.thumbnail_tokens,
    )
  }
}

/// Decode an image from disk applying EXIF orientation. Mirrors siglip2 idiom.
#[cfg(all(feature = "decoders", not(target_arch = "wasm32")))]
#[cfg_attr(
  docsrs,
  doc(cfg(all(feature = "decoders", not(target_arch = "wasm32"))))
)]
pub fn decode_with_orientation(path: &Path) -> Result<DynamicImage> {
  use image::{ImageDecoder, ImageReader};
  let reader = ImageReader::open(path)?.with_guessed_format()?;
  let mut dec = reader.into_decoder()?;
  // Install width/height + allocation caps BEFORE the full decode so a
  // decompression-bomb header (e.g. a 100k×100k PNG whose RGB buffer
  // would exceed 30 GB) fails fast with Err(LimitError) instead of
  // OOMing the process — set_limits rejects oversized dims without
  // ever allocating the pixel buffer.
  dec.set_limits(decode_limits())?;
  let orient = dec.orientation()?;
  let mut decoded = DynamicImage::from_decoder(dec)?;
  decoded.apply_orientation(orient);
  Ok(decoded)
}

/// In-memory variant of [`decode_with_orientation`].
#[cfg(feature = "decoders")]
#[cfg_attr(docsrs, doc(cfg(feature = "decoders")))]
pub fn decode_bytes_with_orientation(bytes: &[u8]) -> Result<DynamicImage> {
  use image::{ImageDecoder, ImageReader};
  use std::io::Cursor;
  let reader = ImageReader::new(Cursor::new(bytes)).with_guessed_format()?;
  let mut dec = reader.into_decoder()?;
  // Same decompression-bomb caps as the path-based variant; see the
  // comment in decode_with_orientation above.
  dec.set_limits(decode_limits())?;
  let orient = dec.orientation()?;
  let mut decoded = DynamicImage::from_decoder(dec)?;
  decoded.apply_orientation(orient);
  Ok(decoded)
}

/// Strict resource limits for image decoding. Caps source width and
/// height at 16 384 px each (4× 4K, generous for legitimate use) and
/// total decoder allocation at 256 MiB (half the image-crate default
/// of 512 MiB).
///
/// 16384² × 4 bytes ≈ 1 GiB raw RGBA — well above max_alloc, so the
/// width/height check fires first for square decompression bombs.
/// For asymmetric bombs (e.g., 1×1_000_000) the height limit catches
/// them before max_alloc would.
#[cfg(feature = "decoders")]
fn decode_limits() -> image::Limits {
  let mut caps = image::Limits::default();
  caps.max_image_width = Some(16_384);
  caps.max_image_height = Some(16_384);
  caps.max_alloc = Some(256 << 20); // 256 MiB
  caps
}

/// Same source-dim caps as [`decode_limits`], exposed for the
/// header-only `image_dimensions` path in `generate.rs` — the
/// header-read must reject decompression bombs at the same
/// threshold the full decode does.
// Only `generate.rs` consumes this, and generate.rs is gated on
// `inference + decoders`. Under `--features decoders` (without
// `inference`), the function is dead code and `-D dead_code`
// would fail clippy. Gate accordingly.
#[cfg(all(feature = "decoders", feature = "inference"))]
pub(crate) fn header_decode_limits() -> image::Limits {
  // Deliberately a straight alias: keeping one source of truth means
  // the header check and the full decode can never drift apart.
  decode_limits()
}

/// PIL-compatible bilinear-with-antialias resize. Upstream
/// `Lfm2VlImageProcessorFast` resizes via torchvision
/// `F.resize(..., interpolation=BILINEAR, antialias=True)`, which
/// matches PIL's `Image.resize(..., Image.BILINEAR)` — a separable
/// triangle filter with kernel SUPPORT scaled by max(2/ratio, 2)
/// for each downscaled axis (i.e., a low-pass prefilter that the
/// `image` crate's `FilterType::Triangle` does NOT apply).
///
/// `fast_image_resize`'s `Convolution(FilterType::Bilinear)` is
/// explicitly designed to match Pillow's bilinear (the same target
/// torchvision aims for). Using it removes the silent algorithmic
/// divergence between our preprocessing and the upstream encoder's
/// training distribution.
///
/// Returns an `RgbImage` of size `(dst_w, dst_h)`. Errors only on
/// structurally-impossible conditions (buffer mis-sizing); for our
/// callers (validated `RgbImage` source, validated dst dims) the
/// failure path is unreachable but we still propagate rather than
/// panic.
fn pil_bilinear_resize(src: &image::RgbImage, dst_w: u32, dst_h: u32) -> Result<image::RgbImage> {
  use fast_image_resize::{
    FilterType, PixelType, ResizeAlg, ResizeOptions, Resizer, images::Image as FirImage,
  };

  let (sw, sh) = (src.width(), src.height());
  let src_fir = FirImage::from_vec_u8(sw, sh, src.as_raw().to_vec(), PixelType::U8x3)
    .map_err(|_| Error::InvalidRequest("pil_bilinear_resize: source buffer mis-sized"))?;
  let mut dst_fir = FirImage::new(dst_w, dst_h, PixelType::U8x3);
  let mut resizer = Resizer::new();
  let opts = ResizeOptions::new().resize_alg(ResizeAlg::Convolution(FilterType::Bilinear));
  resizer
    .resize(&src_fir, &mut dst_fir, &opts)
    .map_err(|_| Error::InvalidRequest("pil_bilinear_resize: resize failed"))?;
  image::RgbImage::from_raw(dst_w, dst_h, dst_fir.into_vec()).ok_or(Error::InvalidRequest(
    "pil_bilinear_resize: output buffer mis-sized",
  ))
}

/// Convert source image into the patch-flattened tensor layout
/// `vision_encoder.onnx` expects.
///
/// Returns the pixel tensor, per-patch attention mask, and spatial
/// shapes bundled as a [`PreprocessedImage`]. Errors propagate only
/// from the resize path.
fn flatten_to_patches(src: &DynamicImage, grid: &TileGrid) -> Result<PreprocessedImage> {
  use image::imageops;

  // 1. Resize source to (cols × tile_w, rows × tile_h). When the source
  //    already has the target dimensions, borrow it directly — the old
  //    code cloned the entire RGB buffer here for nothing.
  let target_w = grid.cols() * grid.tile_w();
  let target_h = grid.rows() * grid.tile_h();
  let src_rgb = src.to_rgb8();
  let resized_storage;
  let resized: &image::RgbImage =
    if src_rgb.width() == target_w && src_rgb.height() == target_h {
      &src_rgb
    } else {
      resized_storage = pil_bilinear_resize(&src_rgb, target_w, target_h)?;
      &resized_storage
    };

  // 2. Build per-tile RGB blocks (row-major).
  let mut tiles: Vec<image::RgbImage> = Vec::with_capacity(grid.num_tiles());
  for r in 0..grid.rows() {
    for c in 0..grid.cols() {
      let crop = imageops::crop_imm(
        resized,
        c * grid.tile_w(),
        r * grid.tile_h(),
        grid.tile_w(),
        grid.tile_h(),
      )
      .to_image();
      tiles.push(crop);
    }
  }

  // 3. Append thumbnail (if present) — smart-resize the WHOLE source.
  if let Some((th, tw)) = grid.thumbnail() {
    let thumb = pil_bilinear_resize(&src_rgb, tw, th)?;
    tiles.push(thumb);
  }

  // 4. Per-tile: 16×16 RGB patches → flatten to 768-vec each → normalize px/255 → 2*px-1.
  // Pad each batch entry to `max_h * max_w` patches (per-axis maxes
  // across the batch, NOT max(h*w) per entry). The vision encoder's
  // SigLIP2 NaFlex `pos_embed` reduces `spatial_shapes` with
  // `ReduceMax(axis=0)` per axis to choose the Resize target
  // `(max_h, max_w)`; it then pads the resulting `[max_h * max_w,
  // dim]` positions out to `pixel_values.shape[1]` by repeating the
  // first-position embedding. So `pixel_values.shape[1]` must equal
  // `max_h * max_w` (or be larger), otherwise the position-embedding
  // tensor and the patch-embedding tensor disagree on axis 1 and the
  // first Add inside the encoder fails to broadcast.
  let max_h = tiles
    .iter()
    .map(|t| (t.height() / PATCH_SIZE) as usize)
    .max()
    .unwrap_or(0);
  let max_w = tiles
    .iter()
    .map(|t| (t.width() / PATCH_SIZE) as usize)
    .max()
    .unwrap_or(0);
  let max_patches = max_h * max_w;
  let n_batch = tiles.len();
  let mut pixel_values = vec![0f32; n_batch * max_patches * 768];
  let mut attn_mask = vec![0i64; n_batch * max_patches];
  let mut spatial = Vec::with_capacity(n_batch * 2);

  // Scratch buffer: one tile's worth of u8 bytes in patchified order (py, px, dy, dx, ch).
  //
  // Past reviews flagged this as a possible HWC-vs-CHW mismatch
  // ("preprocessor_config says channels_first but we emit HWC").
  // FALSE POSITIVE — explained:
  //
  // The "channels_first" data_format in preprocessor_config.json refers
  // to the LAYOUT OF THE RESIZED IMAGE FED TO `convert_image_to_patches`
  // — i.e., torch tensor shape (B, C, H, W). Inside that function,
  // upstream does:
  //   patched = images.reshape(B, C, n_h, ps, n_w, ps)
  //   patched = patched.permute(0, 2, 4, 3, 5, 1)  # → (B, n_h, n_w, ps, ps, C)
  //   patched = patched.reshape(B, n_h * n_w, -1)
  // The final `.reshape(..., -1)` collapses (ps, ps, C) into 768 bytes
  // in HWC order (last dim is C). So the actual ENCODER input is HWC
  // per-patch despite the upstream pipeline starting from a CHW image.
  //
  // Our `(dy, dx, ch)` byte order IS HWC and IS what upstream produces.
  // The multi_image_ordering_proof.json fixture captures upstream
  // pixel_values bit-for-bit and our code passes it.
  //
  // Strategy: access tiles via `as_raw()` (avoids get_pixel's per-call bounds check overhead),
  // assemble bytes in patch-traversal order into a contiguous u8 scratch buffer, then run a
  // single flat iterator-zip to convert u8 → f32 — the form the compiler vectorizes best.
  let max_tile_pixels = tiles
    .iter()
    .map(|t| (t.width() * t.height()) as usize * 3)
    .max()
    .unwrap_or(0);
  let mut raw_patch_bytes: Vec<u8> = Vec::with_capacity(max_tile_pixels);

  for (i, tile) in tiles.iter().enumerate() {
    let (tw, th) = (tile.width(), tile.height());
    let h_patches = th / PATCH_SIZE;
    let w_patches = tw / PATCH_SIZE;
    spatial.push(h_patches as i64);
    spatial.push(w_patches as i64);

    // Access raw pixel buffer: row-major (y * width + x) * 3, channels R/G/B.
    // Using the raw slice eliminates per-pixel bounds checks from get_pixel.
    let raw: &[u8] = tile.as_raw();
    let stride = tw as usize * 3; // bytes per row

    let n_valid = (h_patches * w_patches) as usize;
    raw_patch_bytes.clear();
    // Preserve same traversal order as old impl: outer (py, px), inner (dy, dx, ch).
    for py in 0..h_patches as usize {
      for px in 0..w_patches as usize {
        for dy in 0..PATCH_SIZE as usize {
          let row_start = (py * PATCH_SIZE as usize + dy) * stride + px * PATCH_SIZE as usize * 3;
          // Push the 16 pixels (48 bytes) of this row-within-patch in one extend.
          raw_patch_bytes.extend_from_slice(&raw[row_start..row_start + PATCH_SIZE as usize * 3]);
        }
      }
    }

    // Normalize: single flat u8 → f32 iterator, no division/modulo in the hot path.
    // Each patch's 768 bytes are contiguous in raw_patch_bytes at patch_idx*768, and
    // the destination in pixel_values is also at dst_base + patch_idx*768.
    let dst_base = i * max_patches * 768;
    let dst = &mut pixel_values[dst_base..dst_base + n_valid * 768];
    for (dst_el, &b) in dst.iter_mut().zip(raw_patch_bytes.iter()) {
      *dst_el = (b as f32 / 255.0) * 2.0 - 1.0;
    }
    // Mark valid patches in the attention mask; slice fill avoids the
    // per-element bounds check an indexed loop would pay.
    attn_mask[i * max_patches..i * max_patches + n_valid].fill(1);
  }

  let tokens_per_main =
    ((grid.tile_h() / TILE_PIXEL_UNIT) * (grid.tile_w() / TILE_PIXEL_UNIT)) as usize;
  let thumbnail_tokens = grid
    .thumbnail()
    .map(|(th, tw)| ((th / TILE_PIXEL_UNIT) * (tw / TILE_PIXEL_UNIT)) as usize);

  Ok(PreprocessedImage {
    pixel_values,
    pixel_attention_mask: attn_mask,
    spatial_shapes: spatial,
    batch_size: n_batch,
    patches_per_entry: max_patches,
    rows: grid.rows(),
    cols: grid.cols(),
    main_tile_h: grid.tile_h(),
    main_tile_w: grid.tile_w(),
    thumbnail_size: grid.thumbnail(),
    tokens_per_main_tile: tokens_per_main,
    thumbnail_tokens,
  })
}

#[cfg(test)]
mod tests {
  use super::*;
  use image::{ImageBuffer, Rgb};

  // Smoke test: a small (single-tile-path) square image must yield at
  // least one batch entry and a positive token count.
  #[test]
  fn preprocess_small_square_succeeds() {
    let img = DynamicImage::ImageRgb8(ImageBuffer::from_pixel(256, 256, Rgb([128, 128, 128])));
    let p = Preprocessor::new(ImageBudget::new());
    let out = p.preprocess(&img).unwrap();
    assert!(out.batch_size() >= 1);
    assert!(out.num_image_tokens() > 0);
  }

  // A large square must route through the multi-tile path: ≥4 tiles and
  // the fixed 256 tokens per main tile.
  #[test]
  fn preprocess_large_square_routes_multi_tile() {
    let img = DynamicImage::ImageRgb8(ImageBuffer::from_pixel(1024, 1024, Rgb([128, 128, 128])));
    let p = Preprocessor::new(ImageBudget::new());
    let out = p.preprocess(&img).unwrap();
    assert!(out.num_tiles() >= 4);
    assert_eq!(out.tokens_per_main_tile(), 256);
  }

  // Verifies the px/255 → 2*px-1 normalization endpoints: byte 255 maps
  // to +1.0 and byte 0 maps to -1.0.
  #[test]
  fn pixel_values_normalized_minus_one_to_one() {
    let img = DynamicImage::ImageRgb8(ImageBuffer::from_pixel(256, 256, Rgb([255, 0, 0])));
    let p = Preprocessor::new(ImageBudget::new());
    let out = p.preprocess(&img).unwrap();
    let pv = out.pixel_values();
    assert!((pv[0] - 1.0).abs() < 1e-5); // R = 255 → 1.0
    assert!((pv[1] + 1.0).abs() < 1e-5); // G = 0 → -1.0
    assert!((pv[2] + 1.0).abs() < 1e-5); // B = 0 → -1.0
  }

  // preprocess_batch must keep output entries in input order.
  #[test]
  fn batch_preserves_order() {
    let p = Preprocessor::new(ImageBudget::new());
    let red = DynamicImage::ImageRgb8(ImageBuffer::from_pixel(256, 256, Rgb([255, 0, 0])));
    let blue = DynamicImage::ImageRgb8(ImageBuffer::from_pixel(256, 256, Rgb([0, 0, 255])));
    let batch = p.preprocess_batch(&[red, blue]).unwrap();
    assert_eq!(batch.len(), 2);
    // First image's first pixel R should be 1.0 (red); second's B should be 1.0 (blue).
    assert!((batch[0].pixel_values()[0] - 1.0).abs() < 1e-5);
    assert!((batch[1].pixel_values()[2] - 1.0).abs() < 1e-5);
  }

  /// N5 byte-identity test: ensures the vectorized flat-u8→f32 normalization
  /// produces bit-for-bit identical output to the scalar formula `(b as f32 / 255.0) * 2.0 - 1.0`
  /// applied to raw pixel bytes in patchified traversal order.
  #[test]
  fn normalization_byte_identical_to_scalar_reference() {
    // Use a non-uniform image so different pixel values are exercised.
    // Pattern: x selects R (0–255 gradient), y selects G (0–255 gradient), B=128 constant.
    let w: u32 = 64;
    let h: u32 = 64;
    let img = DynamicImage::ImageRgb8(ImageBuffer::from_fn(w, h, |x, y| {
      Rgb([((x * 4) % 256) as u8, ((y * 4) % 256) as u8, 128u8])
    }));

    let p = Preprocessor::new(ImageBudget::new());
    let out = p.preprocess(&img).unwrap();
    let pv = out.pixel_values();

    // Reconstruct the same patchified traversal as flatten_to_patches to get expected values.
    // This is the scalar reference: collect bytes in patch order, then apply the formula.
    use image::imageops;
    let budget = ImageBudget::new();
    let grid = tile_grid::pick_tile_grid(w, h, &budget).unwrap();
    let target_w = grid.cols() * grid.tile_w();
    let target_h = grid.rows() * grid.tile_h();
    let src_rgb = img.to_rgb8();
    let resized = if src_rgb.width() == target_w && src_rgb.height() == target_h {
      src_rgb.clone()
    } else {
      pil_bilinear_resize(&src_rgb, target_w, target_h).unwrap()
    };

    let mut expected: Vec<f32> = Vec::with_capacity(pv.len());
    // Compute max_patches (same as flatten_to_patches).
    let h_patches_main = grid.tile_h() / tile_grid::PATCH_SIZE;
    let w_patches_main = grid.tile_w() / tile_grid::PATCH_SIZE;
    let main_patches = (h_patches_main * w_patches_main) as usize;
    let thumb_patches = grid
      .thumbnail()
      .map(|(th, tw)| ((th / tile_grid::PATCH_SIZE) * (tw / tile_grid::PATCH_SIZE)) as usize)
      .unwrap_or(0);
    // All main tiles have the same patch count; thumbnail (if any) may differ.
    let max_patches = if grid.thumbnail().is_some() {
      main_patches.max(thumb_patches)
    } else {
      main_patches
    };

    // Build expected: main tiles. Uses the slow get_pixel scalar path on
    // purpose — the independence from the optimized as_raw() path is
    // exactly what gives this test its value.
    for r in 0..grid.rows() {
      for c in 0..grid.cols() {
        let crop = imageops::crop_imm(
          &resized,
          c * grid.tile_w(),
          r * grid.tile_h(),
          grid.tile_w(),
          grid.tile_h(),
        )
        .to_image();
        let mut patch_vals = vec![0f32; max_patches * 768];
        for py in 0..h_patches_main {
          for px in 0..w_patches_main {
            let pidx = (py * w_patches_main + px) as usize;
            for dy in 0..tile_grid::PATCH_SIZE {
              for dx in 0..tile_grid::PATCH_SIZE {
                let pix = crop.get_pixel(
                  px * tile_grid::PATCH_SIZE + dx,
                  py * tile_grid::PATCH_SIZE + dy,
                );
                for ch in 0..3usize {
                  // k = flat offset of (dy, dx, ch) within the 768-byte patch (HWC order).
                  let k = dy * tile_grid::PATCH_SIZE * 3 + dx * 3 + ch as u32;
                  patch_vals[pidx * 768 + k as usize] = (pix[ch] as f32 / 255.0) * 2.0 - 1.0;
                }
              }
            }
          }
        }
        expected.extend_from_slice(&patch_vals);
      }
    }
    // Thumbnail (if any): same scalar traversal over the resized whole image.
    if let Some((th, tw)) = grid.thumbnail() {
      let thumb = pil_bilinear_resize(&src_rgb, tw, th).unwrap();
      let th_h_patches = th / tile_grid::PATCH_SIZE;
      let th_w_patches = tw / tile_grid::PATCH_SIZE;
      let mut patch_vals = vec![0f32; max_patches * 768];
      for py in 0..th_h_patches {
        for px in 0..th_w_patches {
          let pidx = (py * th_w_patches + px) as usize;
          for dy in 0..tile_grid::PATCH_SIZE {
            for dx in 0..tile_grid::PATCH_SIZE {
              let pix = thumb.get_pixel(
                px * tile_grid::PATCH_SIZE + dx,
                py * tile_grid::PATCH_SIZE + dy,
              );
              for ch in 0..3usize {
                let k = dy * tile_grid::PATCH_SIZE * 3 + dx * 3 + ch as u32;
                patch_vals[pidx * 768 + k as usize] = (pix[ch] as f32 / 255.0) * 2.0 - 1.0;
              }
            }
          }
        }
      }
      expected.extend_from_slice(&patch_vals);
    }

    assert_eq!(pv.len(), expected.len(), "pixel_values length mismatch");
    // Compare via to_bits(): bit-for-bit identity, not epsilon closeness.
    for (idx, (&got, &exp)) in pv.iter().zip(expected.iter()).enumerate() {
      assert_eq!(
        got.to_bits(),
        exp.to_bits(),
        "pixel_values[{idx}] mismatch: got {got} vs ref {exp}"
      );
    }
  }

  // ImagePlaceholderInfo built via to_placeholder_info must agree with
  // the PreprocessedImage it came from on every grid/token accessor.
  #[test]
  fn to_placeholder_info_round_trip() {
    let img = DynamicImage::ImageRgb8(ImageBuffer::from_pixel(1024, 1024, Rgb([128, 128, 128])));
    let p = Preprocessor::new(ImageBudget::new());
    let pre = p.preprocess(&img).unwrap();
    let info = pre.to_placeholder_info();
    assert_eq!(info.rows(), pre.rows());
    assert_eq!(info.cols(), pre.cols());
    assert_eq!(info.tokens_per_main_tile(), pre.tokens_per_main_tile());
    assert_eq!(info.num_image_tokens(), pre.num_image_tokens());
  }
}