lfm 0.1.0

Rust ONNX inference for LiquidAI LFM2.5-VL (vision-language) models — implements the engine-agnostic llmtask::Task contract via llguidance for schema-constrained sampling
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
//! Public Engine API. Holds runtime sessions + tokenizer + lazy
//! ParserFactory for schema-constrained sampling.
//!
//! The Engine is single-threaded (`&mut self` on every mutating method)
//! because the underlying ORT sessions are not `Sync`. Multi-engine
//! pools are out of scope for v0.1.
//!
//! ## llguidance 1.7.x setup (verified in Task 13)
//!
//! **TokenizerEnv adapter:**
//! `toktrie_hf_tokenizers::ByteTokenizer::from_file(tokenizer_path)?`
//! loads the tokenizer from `tokenizer.json` and extracts a byte-per-token representation.
//! `.into_tok_env(None)?` turns it into a `toktrie::TokEnv` (`Arc<dyn TokenizerEnv>`).
//!
//! *Why `from_file` instead of `from_tokenizer`?* `lfm` uses `tokenizers` v0.23 while
//! `toktrie_hf_tokenizers` 1.7 depends on v0.21. The two `Tokenizer` types are
//! incompatible at the type level (different crate versions → different nominal types).
//! Re-reading from the same path via `from_file` avoids the version boundary.
//!
//! **Factory construction:**
//! `ParserFactory::new_simple(&tok_env)?` — uses `InferenceCapabilities::default()`
//! (no ff_tokens) and `SlicedBiasComputer::general_slices()` (standard regex slices).
//!
//! **Constraint construction per call:**
//! `TopLevelGrammar::from_json_schema(serde_json::from_str(schema_json)?)` builds the
//! grammar from the task's JSON schema `Value`.  `factory.create_parser(grammar)?`
//! returns a `TokenParser`; `Constraint::new(parser)` wraps it for the sampling loop.
//!
//! **Note on ff_tokens:** `InferenceCapabilities::default()` disables fast-forward tokens;
//! `ConstrainedSampler` handles the `sample_mask = None` case defensively (see sampler.rs).

use std::{
  path::{Path, PathBuf},
  sync::Arc,
};

use tokenizers::Tokenizer;

use crate::{
  ChatMessage, ContentPart, ImageInput,
  chat_template::{
    BOS, BOS_TOKEN_ID, EOS_TOKEN_ID, IM_END, IM_START, IM_START_TOKEN_ID, IMAGE_END,
    IMAGE_END_TOKEN_ID, IMAGE_START, IMAGE_START_TOKEN_ID, IMAGE_THUMBNAIL,
    IMAGE_THUMBNAIL_TOKEN_ID, IMAGE_TOKEN, IMAGE_TOKEN_ID, IMG_ROW_COL_BASE_ID,
  },
  error::{Error, Result},
  generate::{GenerateInputs, generate},
  options::{Options, RequestOptions},
  preproc::Preprocessor,
  runtime::{
    decoder::Decoder,
    embed_tokens::EmbedTokens,
    sampler::{ConstrainedSampler, FreeSampler},
    vision::VisionEncoder,
  },
};

use llguidance::{Constraint, ParserFactory, api::TopLevelGrammar};
use toktrie::TokEnv;

/// Public engine for LFM2.5-VL inference.
///
/// Construct via [`Engine::from_dir`] for the standard HuggingFace download
/// layout, or via [`Engine::from_paths`] for unusual file arrangements.
///
/// Mutating methods take `&mut self`; per the module docs the underlying
/// ORT sessions are not `Sync`, so one `Engine` serves one thread.
pub struct Engine {
  /// Image preprocessor, configured from the engine's image budget.
  preproc: Preprocessor,
  /// ONNX vision-encoder session.
  vision: VisionEncoder,
  /// ONNX token-embedding session.
  embed: EmbedTokens,
  /// ONNX merged-decoder session.
  decoder: Decoder,
  /// HuggingFace tokenizer, built from `tokenizer_bytes` at construction.
  tokenizer: Tokenizer,
  /// Bytes of `tokenizer.json` captured at construction. Storing
  /// a `tokenizer_path` and re-reading lazily inside
  /// `parser_factory()` would let a file replaced between Engine
  /// construction and the first schema-constrained `run` cause
  /// silent schema-vs-model mismatch — llguidance would mask token
  /// IDs from the new file while embedding/detokenization continued
  /// to use the originally-validated `Tokenizer`. Capturing the
  /// bytes once ties both loads to the same content.
  tokenizer_bytes: Vec<u8>,
  /// Cached ParserFactory; lazily initialized on first schema-constrained call.
  /// `Arc` so each `run` can hold a handle without cloning the trie.
  parser_factory: Option<Arc<ParserFactory>>,
  /// The `<|im_end|>` token id, cross-checked against `EOS_TOKEN_ID` (7)
  /// at construction.
  eos_token_id: u32,
  /// Per-call sampler seed; advances every `generate`/`run`. Initialized
  /// from system-time nanoseconds so two engines on the same machine
  /// don't return identical sequences.
  next_seed: u64,
}

impl Engine {
  /// Construct from a directory containing the ONNX model files.
  ///
  /// Expected layout (matches HuggingFace download):
  /// ```text
  /// {model_dir}/
  ///   onnx/
  ///     vision_encoder.onnx
  ///     embed_tokens.onnx
  ///     decoder_model_merged.onnx
  ///   tokenizer.json
  ///   preprocessor_config.json
  /// ```
  ///
  /// **Strict constructor.** Validates `preprocessor_config.json`
  /// matches our hardcoded preprocessing constants AND validates the
  /// supplied `tokenizer.json` byte-matches the bundled blob — a
  /// custom tokenizer whose normal vocabulary drifts from what the
  /// embedding table expects would silently corrupt prompts.
  /// Requires the `bundled` feature so the
  /// byte-compare reference is available; without it, use
  /// [`Engine::from_paths`] (the unchecked escape hatch for advanced
  /// callers pairing custom tokenizers with custom ONNX).
  ///
  /// # Errors
  /// Fails if any drift check rejects the directory (preprocessor
  /// config, tokenizer bytes, chat template, context length), or if
  /// the delegated [`Engine::from_paths`] construction fails.
  #[cfg(feature = "bundled")]
  #[cfg_attr(docsrs, doc(cfg(feature = "bundled")))]
  pub fn from_dir<P: AsRef<Path>>(model_dir: P, opts: Options) -> Result<Self> {
    let dir: PathBuf = model_dir.as_ref().to_path_buf();
    // Validate preprocessor_config.json
    // matches our hardcoded algorithm constants. A model directory
    // with compatible ONNX shapes but a drifted preprocessing config
    // (different tile_size, patch_size, normalization) would
    // otherwise produce wrong visual embeddings without a clear
    // load-time error. Only runs in from_dir / from_onnx_dir where
    // we have access to the model's config files; from_paths users
    // explicitly opted out of this check.
    validate_preprocessor_config(&dir.join("preprocessor_config.json"))?;
    // The tokenizer special-token contract verifies
    // BOS/IM_START/image/row-col IDs, but a tokenizer with
    // those IDs unchanged AND a drifted NORMAL vocabulary (different
    // BPE merges, swapped subword IDs, etc.) would still pass. Such
    // a tokenizer would encode the same text into different token
    // IDs that no longer match the model's embedding table, silently
    // corrupting every prompt. For from_dir (strict constructor),
    // require the supplied tokenizer.json to byte-match the bundled
    // blob. from_paths remains the unchecked escape hatch for
    // advanced callers pairing custom tokenizers with custom ONNX.
    // (No inner cfg gate needed — from_dir itself is gated on `bundled`.)
    validate_tokenizer_matches_bundled(&dir.join("tokenizer.json"))?;
    // Validate the model directory's chat
    // template byte-equals our bundled jinja. The renderer always
    // uses BUNDLED_CHAT_TEMPLATE_JINJA at run-time; a model revision
    // can ship a byte-identical tokenizer.json yet a different chat
    // template (different role envelope, different image-block
    // wrapping) — and from_dir would silently load it while we
    // render with the wrong template, producing semantically wrong
    // prompts whose `<image>` count still happens to line up. The
    // file is required for from_dir; absence is treated the same
    // way as a content mismatch (use from_paths to bypass).
    validate_chat_template_matches_bundled(&dir.join("chat_template.jinja"))?;
    // Validate the model's
    // text_config.max_position_embeddings matches the hard-coded
    // MODEL_CONTEXT_TOKENS used by generate's admission gates. A
    // model directory with the same tokenizer/template/preprocessor
    // but a smaller-context decoder export would otherwise load
    // successfully, and requests up to 128 K tokens would pass
    // admission then fail late or generate with invalid position
    // state. Same theme as the chat_template drift check.
    validate_config_context_matches_bundled(&dir.join("config.json"))?;
    let onnx = dir.join("onnx");
    Self::from_paths(
      EnginePaths::new(
        onnx.join("vision_encoder.onnx"),
        onnx.join("embed_tokens.onnx"),
        onnx.join("decoder_model_merged.onnx"),
        dir.join("tokenizer.json"),
      ),
      opts,
    )
  }

  /// Construct from a directory that contains **only the ONNX files**, using
  /// the tokenizer + configs that were bundled into this crate at compile time.
  ///
  /// Use this when you've downloaded only the ONNX artifacts from the upstream
  /// HuggingFace repo and don't want to also fetch the tokenizer / JSON configs.
  /// The bundled `tokenizer.json` is written to a per-process temp file (required
  /// by `toktrie_hf_tokenizers::ByteTokenizer::from_file`) and reused across all
  /// schema-constrained calls within the same `Engine` instance.
  ///
  /// Expected ONNX directory layout:
  /// ```text
  /// {onnx_dir}/
  ///   vision_encoder.onnx
  ///   embed_tokens.onnx
  ///   decoder_model_merged.onnx
  /// ```
  ///
  /// The tokenizer written to the temp directory (`$TMPDIR/lfm-bundled-<PID>/`)
  /// is never explicitly deleted; the OS cleans it up on next boot (standard
  /// behaviour for `std::env::temp_dir()`).
  #[cfg(feature = "bundled")]
  #[cfg_attr(docsrs, doc(cfg(feature = "bundled")))]
  pub fn from_onnx_dir<P: AsRef<Path>>(onnx_dir: P, opts: Options) -> Result<Self> {
    let onnx = onnx_dir.as_ref();
    // No drift checks here: bundled assets match our constants by construction.
    let tmp_tokenizer = write_bundled_tokenizer()?;
    Self::from_paths(
      EnginePaths::new(
        onnx.join("vision_encoder.onnx"),
        onnx.join("embed_tokens.onnx"),
        onnx.join("decoder_model_merged.onnx"),
        tmp_tokenizer,
      ),
      opts,
    )
  }

  /// Construct from explicit paths (for non-standard layouts).
  ///
  /// Performs NO drift checks on configs/templates — callers opting in
  /// here are expected to supply mutually consistent files. The tokenizer
  /// special-token ID contract is still enforced below.
  pub fn from_paths(paths: EnginePaths, opts: Options) -> Result<Self> {
    // Validate budget BEFORE any expensive work. validate_image_tokenizer_contract
    // performs an O(max_tiles²) nested scan; without this guard, an invalid
    // budget like `with_max_tiles(usize::MAX)` would hang construction
    // indefinitely. Validate() also caps max_tiles at MAX_TOKENIZER_TILE_DIM=10
    // so the scan is provably bounded after this returns Ok.
    opts.image_budget().validate()?;
    let preproc = Preprocessor::new(*opts.image_budget());
    let vision = VisionEncoder::from_path(paths.vision(), &opts)?;
    let embed = EmbedTokens::from_path(paths.embed(), &opts)?;
    let decoder = Decoder::from_path(paths.decoder(), &opts)?;
    // NOTE(review): an earlier comment here claimed the *path* is stored for
    // the lazy ParserFactory; the factory is actually built from
    // `tokenizer_bytes` (see parser_factory()). The clone just gives us an
    // owned path to read from.
    let tokenizer_path = paths.tokenizer().clone();
    // Read bytes ONCE, then build the
    // `tokenizers::Tokenizer` from those exact bytes. The same
    // bytes are stored on the Engine and reused by the lazy
    // ParserFactory — guaranteeing the schema matcher and the
    // tokenizer/embedding stack agree, regardless of any later
    // file changes at `tokenizer_path`.
    let tokenizer_bytes = std::fs::read(&tokenizer_path).map_err(Error::Io)?;
    let tokenizer = Tokenizer::from_bytes(&tokenizer_bytes).map_err(Error::tokenizer)?;

    // EOS token: LFM2.5-VL chat models use <|im_end|> (id 7).
    // IM_END / EOS_TOKEN_ID are consts in chat_template.rs; cross-check
    // detects tokenizer.json drift (model rev mismatch, custom tokenizer).
    let eos_token_id = tokenizer
      .token_to_id(IM_END)
      .ok_or(Error::InvalidRequest("tokenizer missing <|im_end|> token"))?;
    if eos_token_id != EOS_TOKEN_ID {
      return Err(Error::InvalidRequest(
        "tokenizer <|im_end|> token id differs from expected EOS_TOKEN_ID (7) — wrong tokenizer.json?",
      ));
    }

    // Validate every special token that expand_image_placeholders can
    // emit. Without this, a tokenizer that's missing
    // <|image_start|>/<|image_end|>/<|img_thumbnail|> or any
    // <|img_row_R_col_C|> marker reachable under max_tiles loads
    // successfully — then tokenization at run-time silently splits
    // those markers into byte-level tokens while the <image>-token
    // count still matches, corrupting position-token embeddings on
    // every multi-tile prompt with no error reported.
    validate_image_tokenizer_contract(&tokenizer, opts.image_budget().max_tiles())?;

    // Seed from wall-clock nanos; the constant fallback only applies when
    // the system clock reads before UNIX_EPOCH.
    let next_seed = std::time::SystemTime::now()
      .duration_since(std::time::UNIX_EPOCH)
      .map(|d| d.as_nanos() as u64)
      .unwrap_or(0xC0_FFEE);

    Ok(Self {
      preproc,
      vision,
      embed,
      decoder,
      tokenizer,
      tokenizer_bytes,
      parser_factory: None,
      eos_token_id,
      next_seed,
    })
  }

  /// Free-form generation (no schema constraint).
  ///
  /// Uses an unconstrained sampler (greedy or min-p with repetition penalty).
  ///
  /// # Errors
  /// Fails on invalid `req`, or on any error from the underlying
  /// `generate` pipeline.
  pub fn generate(
    &mut self,
    messages: &[ChatMessage],
    images: &[ImageInput<'_>],
    req: &RequestOptions,
  ) -> Result<String> {
    req.validate()?;
    let seed = self.draw_seed();
    let mut sampler = FreeSampler::new(*req, seed, self.tokenizer.get_vocab_size(true) as u32);
    generate(
      &self.preproc,
      &mut self.vision,
      &mut self.embed,
      &mut self.decoder,
      &self.tokenizer,
      &mut sampler,
      GenerateInputs::new(messages, images, req, self.eos_token_id),
    )
  }

  /// Schema-constrained generation driven by a [`llmtask::Task`].
  ///
  /// 1. Builds the user message from the supplied images plus
  ///    `task.prompt()` — the caller does not pass `messages`. This
  ///    guarantees `task.prompt()` is always present, so the schema-
  ///    valid output reflects the task's grounding rules and not just
  ///    its JSON shape.
  /// 2. Compiles the task's grammar (`task.grammar()`) into an
  ///    llguidance `Constraint`.
  /// 3. Runs the generation loop with a constraint-driven sampler.
  /// 4. Passes the raw text to `task.parse(raw)` for typed deserialization.
  ///
  /// The `ParserFactory` is constructed once and cached across calls.
  pub fn run<T: llmtask::Task>(
    &mut self,
    task: &T,
    images: &[ImageInput<'_>],
    req: &RequestOptions,
  ) -> Result<T::Output>
  where
    Error: From<T::ParseError>,
  {
    req.validate()?;
    // Preflight image-count bounds BEFORE
    // allocating one ContentPart per image. Without this, a request
    // with millions of ImageInput entries would force a giant
    // Vec<ContentPart> allocation before generate() could reject
    // via its own admission checks. Mirror those checks here on
    // stack-only state.
    if images.len().saturating_add(1) > crate::generate::MAX_TOTAL_CONTENT_PARTS {
      return Err(Error::InvalidRequest(
        "too many images per request (request-shape DoS guard)",
      ));
    }
    crate::generate::check_image_count_lower_bound(
      images.len(),
      self
        .preproc
        .budget()
        .min_image_tokens()
        .saturating_add(crate::generate::IMAGE_BLOCK_WRAPPER_TOKENS),
      req.max_new_tokens(),
    )?;

    // Build a single user message: N image parts followed by the task
    // prompt text. This locks in the contract that task.prompt() is
    // always sent with the images — callers can't accidentally drop it.
    let mut parts: Vec<ContentPart> = Vec::with_capacity(images.len() + 1);
    for _ in 0..images.len() {
      parts.push(ContentPart::Image);
    }
    parts.push(ContentPart::Text(task.prompt().to_owned()));
    let messages = [ChatMessage::parts(
      smol_str::SmolStr::new_static("user"),
      parts,
    )];

    let factory = self.parser_factory()?;
    let constraint = build_constraint(&factory, &task.grammar())?;
    let seed = self.draw_seed();
    let mut sampler = ConstrainedSampler::new(
      constraint,
      *req,
      seed,
      self.tokenizer.get_vocab_size(true) as u32,
    );
    let text = generate(
      &self.preproc,
      &mut self.vision,
      &mut self.embed,
      &mut self.decoder,
      &self.tokenizer,
      &mut sampler,
      GenerateInputs::new(&messages, images, req, self.eos_token_id),
    )?;
    task.parse(&text).map_err(Error::from)
  }

  // ===== internal =====

  /// Return the current seed and advance the counter. Each call to
  /// [`Engine::generate`] / [`Engine::run`] gets a distinct seed so
  /// non-greedy sampling doesn't replay an identical sequence.
  /// Wrapping add: the counter may roll over after 2^64 calls.
  fn draw_seed(&mut self) -> u64 {
    let seed = self.next_seed;
    self.next_seed = self.next_seed.wrapping_add(1);
    seed
  }

  /// Lazily construct and cache the `ParserFactory`.
  ///
  /// Built from the `tokenizer_bytes` captured at construction (not from
  /// the file path), so the constraint vocabulary always matches the
  /// validated tokenizer. The factory is wrapped in `Arc` so it can be
  /// shared across multiple `Constraint` instances across calls without
  /// cloning the heavy trie data.
  fn parser_factory(&mut self) -> Result<Arc<ParserFactory>> {
    if let Some(f) = &self.parser_factory {
      return Ok(f.clone());
    }
    let factory = build_parser_factory(&self.tokenizer_bytes)?;
    let arc = Arc::new(factory);
    self.parser_factory = Some(arc.clone());
    Ok(arc)
  }
}

// =========================================================================
// Tokenizer contract validation
// =========================================================================

/// Validate that the tokenizer recognises every special token that
/// [`crate::chat_template::expand_image_placeholders`] can emit:
///
/// - `<image>` (with the bundled-tokenizer id as a cross-check)
/// - `<|image_start|>`, `<|image_end|>`, `<|img_thumbnail|>`
/// - `<|img_row_R_col_C|>` for every R, C in `[1, max_tiles]`
///
/// Run-time tokenization treats unknown special-token strings as raw
/// text and breaks them into byte-level pieces, while the
/// `<image>`-token count remains correct — silently corrupting
/// position-token embeddings on every multi-tile prompt. Catching
/// this at session-construction prevents the silent failure mode.
#[allow(dead_code)]
fn validate_image_tokenizer_contract(tokenizer: &Tokenizer, max_tiles: usize) -> Result<()> {
  // the embedding/decoder contract is
  // ID-based — a tokenizer with the same special-token STRINGS but
  // remapped IDs would pass a presence-only check, then embed
  // markers as the wrong tokens at run-time. Validate ID for every
  // structural special token the chat template / image expansion
  // can emit. The bundled tokenizer's IDs are listed in the const
  // table at the top of chat_template.rs.
  let id_check = |name_str: &str, expected: u32| -> Result<()> {
    let actual = tokenizer
      .token_to_id(name_str)
      .ok_or(Error::InvalidRequest(
        "tokenizer missing required special token — wrong tokenizer.json?",
      ))?;
    if actual != expected {
      return Err(Error::InvalidRequest(
        "tokenizer special-token id differs from expected — wrong tokenizer.json?",
      ));
    }
    Ok(())
  };
  id_check(BOS, BOS_TOKEN_ID)?;
  id_check(IM_START, IM_START_TOKEN_ID)?;
  id_check(IMAGE_TOKEN, IMAGE_TOKEN_ID)?;
  id_check(IMAGE_START, IMAGE_START_TOKEN_ID)?;
  id_check(IMAGE_END, IMAGE_END_TOKEN_ID)?;
  id_check(IMAGE_THUMBNAIL, IMAGE_THUMBNAIL_TOKEN_ID)?;

  // Per-tile row/col markers reachable under max_tiles. The candidate
  // grid search in find_closest_aspect_ratio enumerates (i, j) for
  // i, j in [1, max_tiles] (constrained by i*j <= max_tiles), so any
  // reachable marker has both indices in [1, max_tiles].
  // ImageBudget::validate caps max_tiles at MAX_TOKENIZER_TILE_DIM
  // (=10), so this is at most 100 lookups.
  //
  // Defense-in-depth: even though the caller (Engine::from_paths) is
  // expected to validate the budget first, refuse to scan past
  // MAX_TOKENIZER_TILE_DIM here to keep the loop provably bounded
  // even if a future caller forgets.
  if max_tiles > crate::options::MAX_TOKENIZER_TILE_DIM {
    return Err(Error::InvalidBudget(
      "max_tiles must be <= 10 (bundled tokenizer's row/col marker grid is 10x10)",
    ));
  }
  // Per-tile markers <|img_row_R_col_C|> for R, C in [1, max_tiles].
  // Bundled IDs are contiguous: IMG_ROW_COL_BASE_ID + (R-1)*10 + (C-1)
  // for R, C in [1, 10] (so ids 397..=496). We validate both presence
  // AND id so a tokenizer with same strings but remapped ids fails
  // construction.
  for r in 1..=max_tiles as u32 {
    for c in 1..=max_tiles as u32 {
      let marker = format!("<|img_row_{r}_col_{c}|>");
      let actual = tokenizer
        .token_to_id(&marker)
        .ok_or(Error::InvalidRequest(
          "tokenizer missing one or more <|img_row_R_col_C|> markers reachable under max_tiles — wrong tokenizer.json?",
        ))?;
      let expected = IMG_ROW_COL_BASE_ID + (r - 1) * 10 + (c - 1);
      if actual != expected {
        return Err(Error::InvalidRequest(
          "tokenizer <|img_row_R_col_C|> id differs from expected (IMG_ROW_COL_BASE_ID + (R-1)*10 + (C-1)) — wrong tokenizer.json?",
        ));
      }
    }
  }

  Ok(())
}

// =========================================================================
// Preprocessor-config drift detector (from_dir only)
// =========================================================================

/// Validate the model's `preprocessor_config.json` matches our hardcoded
/// preprocessing-algorithm constants (patch_size, downsample_factor,
/// tile_size, image_mean, image_std). Drift between any of these and
/// the values our `flatten_to_patches` / `smart_resize` / `pick_tile_grid`
/// rely on would produce visually-wrong embeddings without an obvious
/// runtime error.
///
/// Budget-tunable fields (min/max image_tokens, min/max tiles,
/// max_pixels_tolerance, use_thumbnail) are deliberately NOT checked
/// here — callers can override them via `Options::image_budget()`.
///
/// Used only by `from_dir` (where the model directory has the config
/// alongside the ONNX files). `from_onnx_dir` uses bundled assets and
/// our own constants by construction, so no drift is possible.
///
/// # Errors
/// `Error::InvalidRequest` on a missing file, missing/ill-typed field,
/// or any value that differs from the crate's hardcoded constants;
/// `Error::Io` / parse errors on unreadable or malformed JSON.
#[cfg_attr(not(feature = "bundled"), allow(dead_code))]
fn validate_preprocessor_config(path: &Path) -> Result<()> {
  // Fail closed on missing config. The
  // strict drift detector is the whole point of this check; allowing
  // its absence to skip validation defeats it. If a caller has a
  // stripped-down model directory without preprocessor_config.json,
  // they should use `from_paths` (which explicitly opts out) or
  // `from_onnx_dir` (which uses bundled assets).
  if !path.exists() {
    return Err(Error::InvalidRequest(
      "model directory missing preprocessor_config.json — use from_paths to bypass strict drift checks",
    ));
  }
  let raw = std::fs::read_to_string(path).map_err(Error::Io)?;
  let cfg: serde_json::Value = serde_json::from_str(&raw)
    .map_err(|e| Error::tokenizer(format!("preprocessor_config.json parse failure: {e}")))?;

  // Typed field readers over the parsed JSON. Each treats a missing
  // key the same as a wrong-typed value: both mean the config doesn't
  // match the revision this crate was written against.
  let read_u64 = |key: &'static str| -> Result<u64> {
    cfg
      .get(key)
      .and_then(|v| v.as_u64())
      .ok_or(Error::InvalidRequest(
        "preprocessor_config.json missing required integer field — wrong model revision?",
      ))
  };
  let read_bool = |key: &'static str| -> Result<bool> {
    cfg
      .get(key)
      .and_then(|v| v.as_bool())
      .ok_or(Error::InvalidRequest(
        "preprocessor_config.json missing required boolean field — wrong model revision?",
      ))
  };
  let read_str = |key: &'static str| -> Result<&str> {
    cfg
      .get(key)
      .and_then(|v| v.as_str())
      .ok_or(Error::InvalidRequest(
        "preprocessor_config.json missing required string field — wrong model revision?",
      ))
  };
  let read_f64 = |key: &'static str| -> Result<f64> {
    cfg
      .get(key)
      .and_then(|v| v.as_f64())
      .ok_or(Error::InvalidRequest(
        "preprocessor_config.json missing required number field — wrong model revision?",
      ))
  };
  // Reads a 3-element numeric array (used for image_mean / image_std).
  let read_f32_array3 = |key: &'static str| -> Result<[f32; 3]> {
    let arr = cfg
      .get(key)
      .and_then(|v| v.as_array())
      .ok_or(Error::InvalidRequest(
        "preprocessor_config.json missing required [f32; 3] field — wrong model revision?",
      ))?;
    if arr.len() != 3 {
      return Err(Error::InvalidRequest(
        "preprocessor_config.json field is not a 3-element array — wrong model revision?",
      ));
    }
    let mut out = [0f32; 3];
    for (i, v) in arr.iter().enumerate() {
      out[i] = v.as_f64().ok_or(Error::InvalidRequest(
        "preprocessor_config.json array element is not a number — wrong model revision?",
      ))? as f32;
    }
    Ok(out)
  };

  // Model-fixed dimensional constants (mirrored in preproc::tile_grid).
  if read_u64("encoder_patch_size")? != crate::preproc::tile_grid::PATCH_SIZE as u64 {
    return Err(Error::InvalidRequest(
      "preprocessor_config.json encoder_patch_size != 16 (lfm crate hardcoded) — wrong model revision?",
    ));
  }
  if read_u64("downsample_factor")? != crate::preproc::tile_grid::DOWNSAMPLE_FACTOR as u64 {
    return Err(Error::InvalidRequest(
      "preprocessor_config.json downsample_factor != 2 (lfm crate hardcoded) — wrong model revision?",
    ));
  }
  if read_u64("tile_size")? != crate::preproc::tile_grid::FULL_TILE_SIZE as u64 {
    return Err(Error::InvalidRequest(
      "preprocessor_config.json tile_size != 512 (lfm crate hardcoded) — wrong model revision?",
    ));
  }

  // Also validate every preprocessing
  // semantic the Rust code hardcodes. Any of these flipped vs the
  // model's training-time config would produce wrong embeddings.
  for (key, expected) in [
    ("do_resize", true),
    ("do_rescale", true),
    ("do_normalize", true),
    ("do_pad", true),
    ("do_image_splitting", true),
  ] {
    if read_bool(key)? != expected {
      return Err(Error::InvalidRequest(
        "preprocessor_config.json boolean preprocessing flag differs from lfm crate hardcoded value — wrong model revision?",
      ));
    }
  }

  // data_format: must be channels_first. Our flatten_to_patches
  // produces (C, H, W) order — see PATCH_SIZE × PATCH_SIZE × 3 unfold.
  if read_str("data_format")? != "channels_first" {
    return Err(Error::InvalidRequest(
      "preprocessor_config.json data_format != channels_first — wrong model revision?",
    ));
  }

  // resample: 2 = PIL BILINEAR. We use image::imageops::FilterType::Triangle
  // which is bilinear; matches.
  if read_u64("resample")? != 2 {
    return Err(Error::InvalidRequest(
      "preprocessor_config.json resample != 2 (BILINEAR) — wrong model revision?",
    ));
  }

  // rescale_factor: 1/255 = 0.003921568627... — our flatten_to_patches
  // computes (px / 255.0) * 2.0 - 1.0, where /255.0 IS the rescale.
  // Tolerance 1e-9 absorbs JSON float round-tripping.
  let rf = read_f64("rescale_factor")?;
  if (rf - (1.0 / 255.0)).abs() > 1e-9 {
    return Err(Error::InvalidRequest(
      "preprocessor_config.json rescale_factor != 1/255 — wrong model revision?",
    ));
  }

  // size = {height: 512, width: 512}. Mirrors tile_size but checked
  // separately because some HF processors use `size` independently.
  let size = cfg
    .get("size")
    .and_then(|v| v.as_object())
    .ok_or(Error::InvalidRequest(
      "preprocessor_config.json missing size object — wrong model revision?",
    ))?;
  for (key, expected) in [("height", 512u64), ("width", 512u64)] {
    if size.get(key).and_then(|v| v.as_u64()) != Some(expected) {
      return Err(Error::InvalidRequest(
        "preprocessor_config.json size.{height,width} != 512 — wrong model revision?",
      ));
    }
  }

  // Normalization: our flatten_to_patches does (px/255)*2 - 1, which
  // is equivalent to (px/255 - 0.5) / 0.5 = subtract 0.5 then divide
  // by 0.5. So image_mean and image_std must both be [0.5, 0.5, 0.5].
  for (key, expected) in [("image_mean", [0.5f32; 3]), ("image_std", [0.5f32; 3])] {
    let got = read_f32_array3(key)?;
    for (g, e) in got.iter().zip(expected.iter()) {
      if (g - e).abs() > 1e-4 {
        return Err(Error::InvalidRequest(
          "preprocessor_config.json image_mean/image_std differs from [0.5, 0.5, 0.5] (lfm crate hardcoded normalization) — wrong model revision?",
        ));
      }
    }
  }
  Ok(())
}

// =========================================================================
// Bundled-asset drift detectors: config context, chat template,
// tokenizer bytes (bundled feature only)
// =========================================================================

/// Validate the model's `config.json` exposes a
/// `text_config.max_position_embeddings` (or top-level
/// `max_position_embeddings`) that matches our hard-coded
/// [`crate::options::MODEL_CONTEXT_TOKENS`].
///
/// Rationale: generate()'s admission gates trust this constant; a
/// model exported with a smaller positional embedding range would
/// pass byte-identical tokenizer/template/preprocessor checks
/// (the static assets) but quietly accept prompts past its real
/// limit and either fail late or produce invalid position state.
#[cfg(feature = "bundled")]
fn validate_config_context_matches_bundled(path: &Path) -> Result<()> {
  // Fail closed on a missing file: from_dir is the strict constructor.
  if !path.exists() {
    return Err(Error::InvalidRequest(
      "model directory missing config.json — use from_paths to bypass strict context-length drift checks (advanced: requires matching ONNX embedding table)",
    ));
  }
  let supplied = std::fs::read(path).map_err(Error::Io)?;
  let v: serde_json::Value = serde_json::from_slice(&supplied)
    .map_err(|_| Error::InvalidRequest("config.json is not valid JSON"))?;
  // LFM2.5-VL's config nests text params under `text_config`; older
  // single-modality configs put `max_position_embeddings` at the
  // top level. Accept either layout.
  let max_pos = v
    .get("text_config")
    .and_then(|tc| tc.get("max_position_embeddings"))
    .or_else(|| v.get("max_position_embeddings"))
    .and_then(|n| n.as_u64())
    .ok_or(Error::InvalidRequest(
      "config.json missing text_config.max_position_embeddings (or top-level max_position_embeddings)",
    ))?;
  if max_pos != crate::options::MODEL_CONTEXT_TOKENS as u64 {
    return Err(Error::InvalidRequest(
      "config.json max_position_embeddings differs from crate's MODEL_CONTEXT_TOKENS (128_000) — admission gates would accept requests past the loaded model's real position limit",
    ));
  }
  Ok(())
}

/// Validate that the model directory's `chat_template.jinja` is
/// byte-for-byte identical to the bundled jinja used at render time.
///
/// The renderer always uses `chat_template::BUNDLED_CHAT_TEMPLATE_JINJA`
/// regardless of what the model directory ships. A model revision that
/// changes its template (e.g., role wrapper, image-block layout) while
/// keeping `tokenizer.json` byte-identical would therefore be accepted
/// by `from_dir` yet rendered with the bundled template — a semantically
/// wrong prompt even though `<image>` token counts still match. Fail
/// closed on any mismatch.
///
/// A missing file is also rejected: from_dir is the strict constructor.
/// Callers with stripped-down model directories should use
/// `from_paths` (which explicitly opts out) or `from_onnx_dir`
/// (which uses bundled assets, so no drift is possible).
#[cfg(feature = "bundled")]
fn validate_chat_template_matches_bundled(path: &Path) -> Result<()> {
  if !path.exists() {
    // Fail-closed: no silent fallback to the bundled template here.
    return Err(Error::InvalidRequest(
      "model directory missing chat_template.jinja — use from_paths to bypass strict prompt-template drift checks (advanced: requires matching ONNX embedding table)",
    ));
  }
  let on_disk = std::fs::read(path).map_err(Error::Io)?;
  if on_disk == crate::bundled::CHAT_TEMPLATE_JINJA {
    Ok(())
  } else {
    Err(Error::InvalidRequest(
      "supplied chat_template.jinja bytes do not match the bundled chat template — engine renders with bundled template; mismatched model template would produce semantically wrong prompts even when <image> counts line up",
    ))
  }
}

/// Verify the supplied `tokenizer.json` byte-matches the bundled blob.
///
/// A tokenizer with the same special-token IDs but a drifted normal
/// vocabulary would pass the per-token contract check yet still
/// encode normal text into different IDs that don't match the
/// embedding table — silent global prompt corruption. Byte equality
/// closes that hole.
///
/// Called from `from_dir` (strict constructor) only; `from_paths`
/// remains the explicit escape hatch for callers intentionally
/// pairing custom tokenizers with custom ONNX.
#[cfg(feature = "bundled")]
fn validate_tokenizer_matches_bundled(path: &Path) -> Result<()> {
  let supplied = std::fs::read(path).map_err(Error::Io)?;
  if supplied != crate::bundled::TOKENIZER_JSON {
    return Err(Error::InvalidRequest(
      "supplied tokenizer.json bytes do not match the bundled tokenizer — use Engine::from_paths to bypass strict tokenizer-identity check (advanced: requires matching ONNX embedding table)",
    ));
  }
  Ok(())
}

// =========================================================================
// Bundled-tokenizer helper (bundled feature only)
// =========================================================================

/// Write the bundled `tokenizer.json` bytes to a content-addressed
/// temp file and return its path. Hardening measures:
///
/// - **Content-addressed path**: `$TMPDIR/lfm-bundled-<fnv1a_16hex>/`
///   (16 hex chars of FNV-1a via [`simple_hash_hex`], not sha256) so
///   different lfm versions (or different bundled bytes) get
///   distinct paths. PID-based paths could reuse a stale file from a
///   previous process whose PID got recycled.
/// - **Initialize once**: a per-process `Mutex<Option<PathBuf>>`
///   serializes all writers for a given target path within this
///   process. Cross-process races are still possible — see atomic
///   write below.
/// - **Atomic write via tempfile + rename**: write to a sibling
///   `<pid>.<tid>.tmp` file then rename to the final name. Concurrent
///   readers see either no file or the fully-written file — never
///   a partial write.
/// - **Content verification on reuse**: if the target file already
///   exists (or is cached from an earlier call), verify its bytes
///   match the bundled blob before reusing.
///   Mismatch → rewrite (same atomic dance).
#[cfg(feature = "bundled")]
fn write_bundled_tokenizer() -> Result<PathBuf> {
  use std::sync::Mutex;
  // History: an earlier OnceLock-based version of this hardening
  // still had a TOCTOU race between the OnceLock check and the
  // temp-file write. Two threads both calling Engine::from_onnx_dir
  // simultaneously could observe an empty cache and both write to
  // the same tokenizer.json.<PID>.tmp; one rename would then remove
  // the other's temp before the second rename ran, causing a
  // spurious failure.
  //
  // Fix: serialize the entire init under a Mutex<Option<PathBuf>>.
  // Fast path (cache hit) is just a lock + clone. Slow path holds
  // the lock during the FS work, but that's fine — Engine
  // construction is rare and the work is bounded (~5 MB write +
  // rename). Also add a thread-id to the temp filename as
  // belt-and-suspenders against any future concurrent code.
  static CACHE: Mutex<Option<PathBuf>> = Mutex::new(None);
  let mut guard = CACHE
    .lock()
    .expect("write_bundled_tokenizer mutex poisoned");
  if let Some(p) = guard.as_ref() {
    // Tamper check on every cache hit: re-read the file and verify
    // it still matches the bundled blob. If a process (ours or
    // another) has modified the cached temp file between calls,
    // returning the stale path unvalidated would let from_paths
    // consume the tampered tokenizer (its structural-ID validation
    // can't catch normal-vocab drift). Re-validation forces a
    // rewrite on tamper.
    match std::fs::read(p) {
      Ok(existing) if existing == crate::bundled::TOKENIZER_JSON => return Ok(p.clone()),
      _ => {
        // Tampered or removed — drop the cache and fall through
        // to the rewrite path below.
        *guard = None;
      }
    }
  }

  // Content hash: 16 hex chars of FNV-1a over the bundled bytes
  // (not crypto, just enough entropy to namespace by content).
  let hash = simple_hash_hex(crate::bundled::TOKENIZER_JSON);
  let dir = std::env::temp_dir().join(format!("lfm-bundled-{hash}"));
  std::fs::create_dir_all(&dir).map_err(Error::Io)?;
  let path = dir.join("tokenizer.json");

  // If file already exists, verify content matches before reuse.
  let needs_write = match std::fs::read(&path) {
    Ok(existing) if existing == crate::bundled::TOKENIZER_JSON => false,
    Ok(_) => true,
    Err(_) => true,
  };
  if needs_write {
    // Per-thread + per-process unique temp filename. Even though
    // the Mutex serializes within a process, the thread id makes
    // cross-process collisions on the temp filename impossible.
    let tid = std::thread::current().id();
    let tmp = dir.join(format!(
      "tokenizer.json.{}.{:?}.tmp",
      std::process::id(),
      tid
    ));
    std::fs::write(&tmp, crate::bundled::TOKENIZER_JSON).map_err(Error::Io)?;
    // Windows recovery: rename can fail on Windows if the
    // destination already exists (another process won the race
    // between our needs_write check and our rename). Recover by
    // re-reading the destination — if its bytes match the bundled
    // blob, the other process produced a correct file and we can
    // accept it. If the rename fails for any other reason, or if
    // the bytes still don't match, propagate the original error.
    if let Err(rename_err) = std::fs::rename(&tmp, &path) {
      let _ = std::fs::remove_file(&tmp); // clean up our temp
      match std::fs::read(&path) {
        Ok(existing) if existing == crate::bundled::TOKENIZER_JSON => {
          // Another process beat us; their file is correct. Accept.
        }
        _ => return Err(Error::Io(rename_err)),
      }
    }
  }

  *guard = Some(path.clone());
  Ok(path)
}

/// Simple content hash producing a 16-char hex string. Not crypto;
/// just enough entropy to namespace bundled bytes by content.
#[cfg(feature = "bundled")]
fn simple_hash_hex(bytes: &[u8]) -> String {
  // 64-bit FNV-1a: xor each byte into the state, then multiply by
  // the FNV prime; wrapping_mul matches the algorithm's mod-2^64
  // arithmetic. Rendered as fixed-width lowercase hex.
  const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
  const FNV_PRIME: u64 = 0x100000001b3;
  let digest = bytes
    .iter()
    .fold(FNV_OFFSET_BASIS, |state, &byte| {
      (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });
  format!("{digest:016x}")
}

/// Paths to the four model files used by [`Engine::from_paths`].
pub struct EnginePaths {
  /// Path to `vision_encoder.onnx`.
  vision: PathBuf,
  /// Path to `embed_tokens.onnx`.
  embed: PathBuf,
  /// Path to `decoder_model_merged.onnx`.
  decoder: PathBuf,
  /// Path to `tokenizer.json`.
  tokenizer: PathBuf,
}

impl EnginePaths {
  /// Construct a new `EnginePaths` from the four file paths.
  pub fn new(vision: PathBuf, embed: PathBuf, decoder: PathBuf, tokenizer: PathBuf) -> Self {
    Self {
      vision,
      embed,
      decoder,
      tokenizer,
    }
  }

  /// Borrow the `vision_encoder.onnx` path.
  pub fn vision(&self) -> &PathBuf {
    &self.vision
  }

  /// Borrow the `embed_tokens.onnx` path.
  pub fn embed(&self) -> &PathBuf {
    &self.embed
  }

  /// Borrow the `decoder_model_merged.onnx` path.
  pub fn decoder(&self) -> &PathBuf {
    &self.decoder
  }

  /// Borrow the `tokenizer.json` path.
  pub fn tokenizer(&self) -> &PathBuf {
    &self.tokenizer
  }

  /// Replace the vision encoder path in place.
  pub fn set_vision(&mut self, vision: PathBuf) {
    self.vision = vision;
  }

  /// Replace the embed tokens path in place.
  pub fn set_embed(&mut self, embed: PathBuf) {
    self.embed = embed;
  }

  /// Replace the decoder path in place.
  pub fn set_decoder(&mut self, decoder: PathBuf) {
    self.decoder = decoder;
  }

  /// Replace the tokenizer path in place.
  pub fn set_tokenizer(&mut self, tokenizer: PathBuf) {
    self.tokenizer = tokenizer;
  }

  /// Builder: replace the vision encoder path (chainable).
  pub fn with_vision(self, vision: PathBuf) -> Self {
    Self { vision, ..self }
  }

  /// Builder: replace the embed tokens path (chainable).
  pub fn with_embed(self, embed: PathBuf) -> Self {
    Self { embed, ..self }
  }

  /// Builder: replace the decoder path (chainable).
  pub fn with_decoder(self, decoder: PathBuf) -> Self {
    Self { decoder, ..self }
  }

  /// Builder: replace the tokenizer path (chainable).
  pub fn with_tokenizer(self, tokenizer: PathBuf) -> Self {
    Self { tokenizer, ..self }
  }
}

// =========================================================================
// llguidance wiring helpers (inference feature only)
// =========================================================================

/// Build a `ParserFactory` from the tokenizer JSON bytes.
///
/// Steps:
/// 1. `ByteTokenizer::from_json_bytes(bytes)` — loads from in-memory
///    bytes captured at Engine construction (eliminating any
///    path-reload TOCTOU). Uses `toktrie_hf_tokenizers`'s
///    own `tokenizers` dependency (v0.21), avoiding a type-incompatibility
///    with the `tokenizers` v0.23 used elsewhere in `lfm`. This is safe:
///    both versions read the same `tokenizer.json` format.
/// 2. `.into_tok_env(None)` — builds a `TokTrie` and wraps it in
///    `Arc<dyn TokenizerEnv>` (`TokEnv`).
/// 3. `ParserFactory::new_simple(&tok_env)` — compiles with
///    `InferenceCapabilities::default()` (ff_tokens disabled) and
///    `SlicedBiasComputer::general_slices()`.
fn build_parser_factory(tokenizer_bytes: &[u8]) -> Result<ParserFactory> {
  let byte_tok = toktrie_hf_tokenizers::ByteTokenizer::from_json_bytes(tokenizer_bytes)
    .map_err(Error::llguidance)?;
  let tok_env: TokEnv = byte_tok.into_tok_env(None).map_err(Error::llguidance)?;
  ParserFactory::new_simple(&tok_env).map_err(Error::llguidance)
}

/// Build a `Constraint` for one generation call from any
/// [`llmtask::Grammar`] variant.
///
/// llguidance natively supports JSON Schema, Lark, and Regex —
/// all three [`Grammar`] variants this crate's [`llmtask::Task`]
/// can produce. Each variant routes to its corresponding
/// `TopLevelGrammar` constructor.
fn build_constraint(factory: &ParserFactory, grammar: &llmtask::Grammar) -> Result<Constraint> {
  let top = match grammar {
    llmtask::Grammar::JsonSchema(schema) => TopLevelGrammar::from_json_schema(schema.clone()),
    llmtask::Grammar::Lark(src) => TopLevelGrammar::from_lark(src.to_string()),
    // Grammar::Regex wraps a private RegexGrammar with both the
    // source pattern and a default-options compiled regex —
    // forcing default options prevents `RegexBuilder::
    // case_insensitive(true)`-smuggled regexes from diverging
    // between local validation and engine constraint. Borrow the
    // source pattern via `pattern()` and hand it to llguidance.
    llmtask::Grammar::Regex(rg) => TopLevelGrammar::from_regex(rg.pattern()),
    // Grammar is #[non_exhaustive]; future variants (e.g., raw
    // CFG, GBNF) would land here. lfm via llguidance can support
    // most of them but they need a per-variant routing change.
    _ => {
      return Err(Error::InvalidRequest(
        "llmtask::Grammar variant unsupported by lfm — please open an issue (lfm uses llguidance and can extend support)",
      ));
    }
  };
  let parser = factory.create_parser(top).map_err(Error::llguidance)?;
  Ok(Constraint::new(parser))
}

// =========================================================================
// Tests
// =========================================================================

#[cfg(test)]
mod tests {
  use super::*;

  /// Constructor + borrowing accessors round-trip.
  #[test]
  fn engine_paths_accessors() {
    let ep = EnginePaths::new(
      PathBuf::from("v.onnx"),
      PathBuf::from("e.onnx"),
      PathBuf::from("d.onnx"),
      PathBuf::from("t.json"),
    );
    assert_eq!(ep.vision(), &PathBuf::from("v.onnx"));
    assert_eq!(ep.tokenizer(), &PathBuf::from("t.json"));
  }

  #[test]
  fn validate_image_tokenizer_contract_caps_max_tiles() {
    // Defense-in-depth: even if a caller forgot ImageBudget::validate(),
    // the contract validator must refuse to scan past
    // MAX_TOKENIZER_TILE_DIM. Without this guard, max_tiles=usize::MAX
    // would loop ~∞ in the nested R×C scan and hang Engine
    // construction (a startup-DoS path).
    //
    // Exercise via the bundled tokenizer (always available under the
    // `bundled` feature, which gates this whole test file via the
    // `inference + decoders` mod gate that engine.rs lives under).
    #[cfg(feature = "bundled")]
    {
      let path = write_bundled_tokenizer().expect("write bundled tokenizer");
      let tokenizer = Tokenizer::from_file(&path).expect("load tokenizer");
      let r =
        validate_image_tokenizer_contract(&tokenizer, crate::options::MAX_TOKENIZER_TILE_DIM + 1);
      assert!(
        matches!(r, Err(Error::InvalidBudget(_))),
        "must reject max_tiles above the cap, got {r:?}"
      );
      // Sanity: at the cap it succeeds.
      assert!(
        validate_image_tokenizer_contract(&tokenizer, crate::options::MAX_TOKENIZER_TILE_DIM)
          .is_ok()
      );
    }
  }

  #[test]
  #[cfg(feature = "bundled")]
  fn validate_tokenizer_matches_bundled_rejects_drift() {
    // A tokenizer with valid special-token IDs but
    // any drift in normal vocabulary must be rejected by the strict
    // constructor. Reproducer: write a 1-byte mutation of the
    // bundled tokenizer.json to a temp file and verify the helper
    // rejects it. (We can't easily craft a "valid JSON tokenizer
    // with one normal-token ID swapped" without a tokenizer-aware
    // mutation library, but ANY byte difference must trip the
    // byte-equality check, which is the whole point of the helper.)
    let dir = std::env::temp_dir().join(format!("lfm-test-drift-{}", std::process::id()));
    std::fs::create_dir_all(&dir).unwrap();
    let drift_path = dir.join("tokenizer-drift.json");
    let mut bytes = crate::bundled::TOKENIZER_JSON.to_vec();
    // Mutate a single byte deep in the file (vocab section) so we
    // don't accidentally produce something that's still valid by
    // coincidence. Last byte is safest.
    let last = bytes.len() - 1;
    bytes[last] = bytes[last].wrapping_add(1);
    std::fs::write(&drift_path, &bytes).unwrap();
    let result = validate_tokenizer_matches_bundled(&drift_path);
    assert!(
      matches!(result, Err(Error::InvalidRequest(_))),
      "drifted tokenizer must be rejected, got {result:?}"
    );

    // Sanity: writing the unmodified bundled bytes passes.
    let ok_path = dir.join("tokenizer-ok.json");
    std::fs::write(&ok_path, crate::bundled::TOKENIZER_JSON).unwrap();
    assert!(validate_tokenizer_matches_bundled(&ok_path).is_ok());

    // Cleanup on success so repeated runs don't accumulate ~5 MB
    // fixtures in $TMPDIR (a panic above skips this, which is fine —
    // the next passing run of the same PID-suffixed dir removes it).
    let _ = std::fs::remove_dir_all(&dir);
  }

  #[test]
  #[cfg(feature = "bundled")]
  fn validate_config_context_matches_bundled_accepts_correct_and_rejects_drift() {
    // from_dir's strict drift check for
    // text_config.max_position_embeddings vs MODEL_CONTEXT_TOKENS.
    let dir = std::env::temp_dir().join(format!("lfm-test-config-drift-{}", std::process::id()));
    std::fs::create_dir_all(&dir).unwrap();

    // Missing → reject.
    let missing = dir.join("config-missing.json");
    let _ = std::fs::remove_file(&missing);
    assert!(matches!(
      validate_config_context_matches_bundled(&missing),
      Err(Error::InvalidRequest(_))
    ));

    // Wrong context → reject.
    let drift = dir.join("config-drift.json");
    std::fs::write(
      &drift,
      r#"{"text_config":{"max_position_embeddings":4096}}"#,
    )
    .unwrap();
    assert!(matches!(
      validate_config_context_matches_bundled(&drift),
      Err(Error::InvalidRequest(_))
    ));

    // Correct nested layout → ok.
    let ok_nested = dir.join("config-ok-nested.json");
    std::fs::write(
      &ok_nested,
      r#"{"text_config":{"max_position_embeddings":128000}}"#,
    )
    .unwrap();
    assert!(validate_config_context_matches_bundled(&ok_nested).is_ok());

    // Correct top-level layout → ok (older single-modality configs).
    let ok_flat = dir.join("config-ok-flat.json");
    std::fs::write(&ok_flat, r#"{"max_position_embeddings":128000}"#).unwrap();
    assert!(validate_config_context_matches_bundled(&ok_flat).is_ok());

    // Bundled config.json → ok.
    let ok_bundled = dir.join("config-ok-bundled.json");
    std::fs::write(&ok_bundled, crate::bundled::CONFIG_JSON).unwrap();
    assert!(validate_config_context_matches_bundled(&ok_bundled).is_ok());

    // Invalid JSON → reject.
    let bad_json = dir.join("config-bad.json");
    std::fs::write(&bad_json, b"{not json").unwrap();
    assert!(matches!(
      validate_config_context_matches_bundled(&bad_json),
      Err(Error::InvalidRequest(_))
    ));

    // Cleanup on success; see tokenizer drift test for rationale.
    let _ = std::fs::remove_dir_all(&dir);
  }

  #[test]
  #[cfg(feature = "bundled")]
  fn validate_chat_template_matches_bundled_rejects_drift_and_missing() {
    // from_dir's strict drift check for
    // the model directory's chat_template.jinja. A model rev that
    // changes the template (role envelope, image-block layout)
    // while keeping tokenizer.json byte-identical must be rejected.
    let dir = std::env::temp_dir().join(format!("lfm-test-tmpl-drift-{}", std::process::id()));
    std::fs::create_dir_all(&dir).unwrap();

    // Missing file → reject (fail-closed for strict constructor).
    let missing = dir.join("chat_template-missing.jinja");
    let _ = std::fs::remove_file(&missing);
    assert!(matches!(
      validate_chat_template_matches_bundled(&missing),
      Err(Error::InvalidRequest(_))
    ));

    // Drifted bytes → reject. Mutate the last byte.
    let drift = dir.join("chat_template-drift.jinja");
    let mut bytes = crate::bundled::CHAT_TEMPLATE_JINJA.to_vec();
    let last = bytes.len() - 1;
    bytes[last] = bytes[last].wrapping_add(1);
    std::fs::write(&drift, &bytes).unwrap();
    assert!(matches!(
      validate_chat_template_matches_bundled(&drift),
      Err(Error::InvalidRequest(_))
    ));

    // Sanity: bundled bytes pass.
    let ok = dir.join("chat_template-ok.jinja");
    std::fs::write(&ok, crate::bundled::CHAT_TEMPLATE_JINJA).unwrap();
    assert!(validate_chat_template_matches_bundled(&ok).is_ok());

    // Cleanup on success; see tokenizer drift test for rationale.
    let _ = std::fs::remove_dir_all(&dir);
  }
}