1use crate::models::llama_370m::Llama370MConfig;
23use crate::train::pretrain::{CheckpointFn, EpochArtifact, StepFn, ValFn};
24use crate::train::transformer_trainer::{LMBatch, TransformerTrainConfig, TransformerTrainer};
25use crate::transformer::{ModelArchitecture, Transformer, TransformerConfig};
26use crate::Tensor;
27use std::cell::RefCell;
28use std::collections::BTreeMap;
29use std::path::Path;
30use std::rc::Rc;
31
/// Trainer handle shared by the step, validation, and checkpoint callbacks in this file.
///
/// `Rc<RefCell<..>>` provides single-threaded shared ownership with borrow rules
/// checked at runtime.
pub type SharedTrainer = Rc<RefCell<TransformerTrainer>>;
36
37pub fn load_init_tensors_from_apr(
65 path: impl AsRef<Path>,
66) -> Result<BTreeMap<String, (Vec<f32>, Vec<usize>)>, String> {
67 let path_ref = path.as_ref();
68 aprender::format::converter::load_model_tensors(path_ref).map_err(|e| {
69 format!(
70 "FALSIFY-APR-PRETRAIN-INIT-006: failed to load init tensors from APR file {}: {e}",
71 path_ref.display()
72 )
73 })
74}
75
76pub fn validate_pretrain_init_arch_compatible(cfg: &TransformerConfig) -> Result<(), String> {
98 match cfg.architecture {
99 ModelArchitecture::Decoder => Ok(()),
100 ModelArchitecture::Encoder => Err(format!(
101 "FALSIFY-APR-PRETRAIN-ARCH-007: --init checkpoint has architecture=Encoder \
102 (e.g., BERT/RoBERTa/CodeBERT) but the pretrain trainer is decoder-only \
103 (Llama/Qwen-class causal LMs). Loading encoder weights into a decoder \
104 trainer would produce nonsense gradients. Architectural details: \
105 hidden_size={}, num_layers={}, vocab_size={}, hf_architecture={:?}",
106 cfg.hidden_size, cfg.num_hidden_layers, cfg.vocab_size, cfg.hf_architecture
107 )),
108 }
109}
110
111pub fn populate_trainer_from_init_tensors(
142 transformer: &mut Transformer,
143 init_tensors: &BTreeMap<String, (Vec<f32>, Vec<usize>)>,
144) -> Result<usize, String> {
145 let expected: Vec<(String, usize)> =
146 transformer.named_parameters().into_iter().map(|(name, t)| (name, t.len())).collect();
147 let mut populated = 0usize;
148 let mut errors: Vec<String> = Vec::new();
149
150 for (name, expected_len) in &expected {
151 match init_tensors.get(name) {
152 Some((data, _shape)) => {
153 if data.len() != *expected_len {
154 errors.push(format!(
155 "{name}: init length {} != trainer expected {expected_len}",
156 data.len()
157 ));
158 continue;
159 }
160 let tensor = Tensor::from_vec(data.clone(), true);
161 if !transformer.set_named_parameter(name, tensor) {
162 errors.push(format!("{name}: set_named_parameter rejected the assignment"));
163 continue;
164 }
165 populated += 1;
166 }
167 None => {
168 errors.push(format!("{name}: not present in init APR tensors"));
169 }
170 }
171 }
172
173 if !errors.is_empty() {
174 let total = errors.len();
175 let head = errors.iter().take(5).cloned().collect::<Vec<_>>().join("; ");
176 return Err(format!(
177 "FALSIFY-APR-PRETRAIN-INIT-007: populate_trainer_from_init_tensors \
178 failed for {total} parameter(s); first {} of {total}: {head}",
179 errors.len().min(5)
180 ));
181 }
182
183 Ok(populated)
184}
185
186pub fn llama_370m_transformer_config() -> TransformerConfig {
189 TransformerConfig {
190 hidden_size: Llama370MConfig::HIDDEN_DIM,
191 num_attention_heads: Llama370MConfig::NUM_HEADS,
192 num_kv_heads: Llama370MConfig::NUM_KV_HEADS,
193 intermediate_size: Llama370MConfig::INTERMEDIATE_DIM,
194 num_hidden_layers: Llama370MConfig::NUM_LAYERS,
195 vocab_size: Llama370MConfig::VOCAB_SIZE,
196 max_position_embeddings: Llama370MConfig::MAX_POSITION_EMBEDDINGS,
197 rms_norm_eps: Llama370MConfig::RMS_NORM_EPS,
198 rope_theta: Llama370MConfig::ROPE_THETA,
199 use_bias: false,
200 head_dim_override: None,
201 architecture: ModelArchitecture::Decoder,
202 hf_architecture: Some("LlamaForCausalLM".into()),
203 hf_model_type: Some("llama".into()),
204 tie_word_embeddings: true,
205 }
206}
207
208pub fn build_transformer_config(init: Option<&TransformerConfig>) -> TransformerConfig {
226 match init {
227 None => llama_370m_transformer_config(),
228 Some(cfg) => cfg.clone(),
229 }
230}
231
232pub fn llama_370m_train_config(lr: f32, seq_length: usize, seed: u64) -> TransformerTrainConfig {
235 let model_cfg = llama_370m_transformer_config();
236 let mut cfg = TransformerTrainConfig::new(model_cfg);
237 cfg.lr = lr;
238 cfg.max_seq_len = seq_length;
239 cfg.seed = seed;
240 cfg
241}
242
243pub struct RealStepFn {
247 trainer: SharedTrainer,
248 batches: Box<dyn Iterator<Item = LMBatch>>,
249}
250
251impl RealStepFn {
252 pub fn new(trainer: SharedTrainer, batches: Box<dyn Iterator<Item = LMBatch>>) -> Self {
253 Self { trainer, batches }
254 }
255}
256
257impl StepFn for RealStepFn {
258 fn step(&mut self, _step: u64, _lr: f32, _batch_tokens: u64) -> (f32, f32) {
259 let Some(batch) = self.batches.next() else {
265 return (1.0, 1.0);
266 };
267 let mut trainer = self.trainer.borrow_mut();
268 let loss = trainer.train_batch(&batch);
269 let grad_norm = 1.0_f32;
274 (loss, grad_norm)
275 }
276
277 fn optimizer_state_sha256(&self) -> Option<String> {
279 Some(self.trainer.borrow().optimizer_state_sha256())
280 }
281}
282
283pub struct RealValFn {
286 trainer: SharedTrainer,
287 held_out: Vec<LMBatch>,
288}
289
290impl RealValFn {
291 pub fn new(trainer: SharedTrainer, held_out: Vec<LMBatch>) -> Self {
292 Self { trainer, held_out }
293 }
294}
295
296impl ValFn for RealValFn {
297 fn validate(&mut self, _epoch: usize) -> f32 {
298 if self.held_out.is_empty() {
299 return f32::NAN;
300 }
301 let trainer = self.trainer.borrow();
302 let mut total_loss = 0.0_f32;
303 let mut total_items = 0_usize;
304 for batch in &self.held_out {
305 for i in 0..batch.batch_size {
306 let Some(inp) = batch.get_input(i) else {
307 continue;
308 };
309 let Some(tgt) = batch.get_target(i) else {
310 continue;
311 };
312 let (loss_val, _loss_tensor, _logits) = trainer.forward_single(inp, tgt);
313 total_loss += loss_val;
314 total_items += 1;
315 }
316 }
317 if total_items == 0 {
318 f32::NAN
319 } else {
320 total_loss / total_items as f32
321 }
322 }
323}
324
325pub struct AprCheckpointFn {
331 trainer: SharedTrainer,
332 model_name: String,
333 architecture: String,
334}
335
336impl AprCheckpointFn {
337 pub fn new(
338 trainer: SharedTrainer,
339 model_name: impl Into<String>,
340 architecture: impl Into<String>,
341 ) -> Self {
342 Self { trainer, model_name: model_name.into(), architecture: architecture.into() }
343 }
344}
345
346impl CheckpointFn for AprCheckpointFn {
347 fn save(&mut self, _epoch: usize, artifact: &EpochArtifact) -> Result<(), String> {
348 let trainer = self.trainer.borrow();
349 trainer
350 .save_apr(&artifact.checkpoint_path, &self.model_name, &self.architecture)
351 .map_err(|e| format!("save_apr failed: {e}"))
352 }
353}
354
355pub fn build_shared_trainer(lr: f32, seq_length: usize, seed: u64) -> SharedTrainer {
358 let cfg = llama_370m_train_config(lr, seq_length, seed);
359 let trainer = TransformerTrainer::new(cfg);
360 #[cfg(debug_assertions)]
365 {
366 let param_count: usize = trainer.model().parameters().iter().map(|t| t.len()).sum();
367 debug_assert!(
368 (366_000_000..=374_000_000).contains(¶m_count),
369 "INV-ARCH-370M-001: parameter count {param_count} outside [366M, 374M] band",
370 );
371 }
372 Rc::new(RefCell::new(trainer))
373}
374
375pub fn build_shared_trainer_with_init(
402 lr: f32,
403 seq_length: usize,
404 seed: u64,
405 init_arch: Option<&TransformerConfig>,
406 init_path: Option<&Path>,
407) -> Result<SharedTrainer, String> {
408 if init_arch.is_some() != init_path.is_some() {
409 return Err(format!(
410 "build_shared_trainer_with_init: init_arch and init_path must both be Some \
411 or both None (caller bug; init_arch.is_some()={}, init_path.is_some()={})",
412 init_arch.is_some(),
413 init_path.is_some()
414 ));
415 }
416
417 if let Some(cfg) = init_arch {
418 validate_pretrain_init_arch_compatible(cfg)?;
419 }
420
421 let model_cfg = build_transformer_config(init_arch);
422 let mut train_cfg = TransformerTrainConfig::new(model_cfg);
423 train_cfg.lr = lr;
424 train_cfg.max_seq_len = seq_length;
425 train_cfg.seed = seed;
426 let mut trainer = TransformerTrainer::new(train_cfg);
427
428 if let Some(path) = init_path {
435 let tensors = load_init_tensors_from_apr(path)?;
436 populate_trainer_from_init_tensors(trainer.model_mut(), &tensors)?;
437 }
438
439 Ok(Rc::new(RefCell::new(trainer)))
440}
441
442#[cfg(test)]
443mod tests {
444 use super::*;
445 use crate::train::transformer_trainer::LMBatch;
446
/// A nonexistent APR path must yield an error that names both the falsifier
/// id and the missing file.
#[test]
fn load_init_tensors_missing_file_errors_with_falsifier_id() {
    let scratch = tempfile::TempDir::new().expect("tempdir");
    let absent = scratch.path().join("does-not-exist.apr");

    let err =
        load_init_tensors_from_apr(&absent).expect_err("missing init APR file MUST fail-fast");

    let cites_id = err.contains("FALSIFY-APR-PRETRAIN-INIT-006");
    assert!(cites_id, "error must cite falsifier id (auditability): {err}");

    let names_path = err.contains("does-not-exist.apr");
    assert!(names_path, "error must name the missing path (operator-experience): {err}");
}
467
/// Compile-time pin of the loader's signature: it must be callable with a
/// `&Path` and return the name -> `(values, shape)` map or a `String` error.
#[test]
fn load_init_tensors_signature_compile_bind() {
    // Never does anything at runtime; the bound on `F` is the whole check.
    fn _check_signature<
        F: Fn(&Path) -> Result<BTreeMap<String, (Vec<f32>, Vec<usize>)>, String>,
    >(
        _f: F,
    ) {
    }

    _check_signature(|apr_path| load_init_tensors_from_apr(apr_path));
}
488
/// The baseline config must mirror the `Llama370MConfig` constants field by field.
#[test]
fn transformer_config_matches_llama_370m_constants() {
    let built = llama_370m_transformer_config();

    assert_eq!(built.hidden_size, Llama370MConfig::HIDDEN_DIM);
    assert_eq!(built.num_hidden_layers, Llama370MConfig::NUM_LAYERS);
    assert_eq!(built.num_attention_heads, Llama370MConfig::NUM_HEADS);
    assert_eq!(built.num_kv_heads, Llama370MConfig::NUM_KV_HEADS);
    assert_eq!(built.intermediate_size, Llama370MConfig::INTERMEDIATE_DIM);
    assert_eq!(built.vocab_size, Llama370MConfig::VOCAB_SIZE);

    // Floats are compared with a tolerance rather than `==`.
    let rope_delta = (built.rope_theta - Llama370MConfig::ROPE_THETA).abs();
    assert!(rope_delta < f32::EPSILON);
    let eps_delta = (built.rms_norm_eps - Llama370MConfig::RMS_NORM_EPS).abs();
    assert!(eps_delta < f32::EPSILON);

    assert!(!built.use_bias, "INV-ARCH-370M-008: no bias");
    assert!(built.tie_word_embeddings, "INV-ARCH-370M-004: tied embeddings");
}
503
/// `build_transformer_config(None)` must be field-for-field equal to the
/// Llama-370M baseline.
#[test]
fn build_transformer_config_no_init_matches_llama370m() {
    let reference = llama_370m_transformer_config();
    let built = build_transformer_config(None);

    assert_eq!(built.hidden_size, reference.hidden_size);
    assert_eq!(built.num_attention_heads, reference.num_attention_heads);
    assert_eq!(built.num_kv_heads, reference.num_kv_heads);
    assert_eq!(built.intermediate_size, reference.intermediate_size);
    assert_eq!(built.num_hidden_layers, reference.num_hidden_layers);
    assert_eq!(built.vocab_size, reference.vocab_size);
    assert_eq!(built.max_position_embeddings, reference.max_position_embeddings);

    let eps_delta = (built.rms_norm_eps - reference.rms_norm_eps).abs();
    assert!(eps_delta < f32::EPSILON);
    let rope_delta = (built.rope_theta - reference.rope_theta).abs();
    assert!(rope_delta < f32::EPSILON);

    assert_eq!(built.use_bias, reference.use_bias);
    assert_eq!(built.tie_word_embeddings, reference.tie_word_embeddings);
    assert_eq!(built.architecture, reference.architecture);
    assert_eq!(built.hf_architecture, reference.hf_architecture);
    assert_eq!(built.hf_model_type, reference.hf_model_type);
}
528
/// `build_transformer_config(Some(cfg))` must hand back the supplied config
/// unchanged, including its 7:1 attention-to-KV head ratio.
#[test]
fn build_transformer_config_qwen_init_matches_input() {
    let source = TransformerConfig::qwen2_0_5b();
    let built = build_transformer_config(Some(&source));

    assert_eq!(built.hidden_size, source.hidden_size, "hidden_size");
    assert_eq!(built.num_attention_heads, source.num_attention_heads, "num_attention_heads");
    assert_eq!(built.num_kv_heads, source.num_kv_heads, "num_kv_heads");
    assert_eq!(built.intermediate_size, source.intermediate_size, "intermediate_size");
    assert_eq!(built.num_hidden_layers, source.num_hidden_layers, "num_hidden_layers");
    assert_eq!(built.vocab_size, source.vocab_size, "vocab_size");
    assert_eq!(
        built.max_position_embeddings, source.max_position_embeddings,
        "max_position_embeddings"
    );
    assert_eq!(built.use_bias, source.use_bias, "use_bias");
    assert_eq!(built.tie_word_embeddings, source.tie_word_embeddings, "tie_word_embeddings");
    assert_eq!(built.architecture, source.architecture, "architecture");

    let gqa_ratio = built.num_attention_heads / built.num_kv_heads;
    assert_eq!(gqa_ratio, 7, "GQA ratio must preserve as 7:1 (Qwen2.5-0.5B canonical)");
}
559
/// The `None` and `Some` branches must produce visibly different configs.
#[test]
fn build_transformer_config_dispatch_mutually_exclusive() {
    let qwen_cfg = TransformerConfig::qwen2_0_5b();
    let from_none = build_transformer_config(None);
    let from_some = build_transformer_config(Some(&qwen_cfg));

    assert_ne!(
        from_none.hidden_size, from_some.hidden_size,
        "dispatch must differentiate None vs Some — Llama370M hidden=1024 vs Qwen=896"
    );
    assert_ne!(
        from_none.vocab_size, from_some.vocab_size,
        "dispatch must differentiate None vs Some — Llama370M vocab=50257 vs Qwen=151936"
    );
}
581
/// A decoder-family config (the Qwen preset) must pass the architecture gate.
#[test]
fn validate_pretrain_init_arch_accepts_decoder() {
    let decoder_cfg = TransformerConfig::qwen2_0_5b();
    // Precondition: the preset really is tagged as a decoder.
    assert_eq!(decoder_cfg.architecture, ModelArchitecture::Decoder);

    let verdict = validate_pretrain_init_arch_compatible(&decoder_cfg);
    verdict.expect("decoder-family config (Qwen2.5-0.5B) MUST pass arch-compat gate");
}
593
/// An encoder-family config must be rejected with an error that cites the
/// falsifier id, the family, the reason, and the offending `hf_architecture`.
#[test]
fn validate_pretrain_init_arch_rejects_encoder() {
    // Hand-built RoBERTa-style encoder config.
    let encoder_cfg = TransformerConfig {
        architecture: ModelArchitecture::Encoder,
        hf_architecture: Some("RobertaModel".to_string()),
        hf_model_type: Some("roberta".to_string()),
        vocab_size: 50265,
        hidden_size: 768,
        intermediate_size: 3072,
        num_hidden_layers: 12,
        num_attention_heads: 12,
        num_kv_heads: 12,
        head_dim_override: None,
        max_position_embeddings: 514,
        rope_theta: 10_000.0,
        rms_norm_eps: 1e-12,
        use_bias: true,
        tie_word_embeddings: false,
    };

    let err = validate_pretrain_init_arch_compatible(&encoder_cfg).expect_err(
        "encoder-family config (CodeBERT/RoBERTa) MUST fail arch-compat gate — \
         silent acceptance would corrupt §49 fine-tune trajectory before any \
         FALSIFY-006 check could measure it",
    );

    let cites_id = err.contains("FALSIFY-APR-PRETRAIN-ARCH-007");
    assert!(cites_id, "error must cite falsifier id: {err}");

    assert!(err.contains("Encoder"), "error must name the architecture family: {err}");

    let explains = err.contains("decoder-only");
    assert!(explains, "error must explain why this is wrong (decoder trainer): {err}");

    let names_hf = err.contains("RobertaModel");
    assert!(names_hf, "error must name the offending hf_architecture: {err}");
}
640
/// The Llama-370M baseline is a decoder and must pass the architecture gate.
#[test]
fn validate_pretrain_init_arch_accepts_llama370m_baseline() {
    let baseline = llama_370m_transformer_config();
    assert_eq!(
        baseline.architecture,
        ModelArchitecture::Decoder,
        "Llama370M baseline MUST be Decoder (regression-free)"
    );

    let verdict = validate_pretrain_init_arch_compatible(&baseline);
    verdict.expect("Llama370M baseline (Decoder) MUST pass arch-compat gate");
}
655
/// Stepping with an empty batch iterator must return finite, non-negative
/// placeholder values.
#[test]
fn real_step_fn_exhausted_iterator_returns_finite_placeholder() {
    // Shrink the llama2_7b preset so constructing the trainer stays cheap.
    let tiny = TransformerConfig {
        hidden_size: 64,
        num_attention_heads: 4,
        num_kv_heads: 4,
        num_hidden_layers: 2,
        intermediate_size: 128,
        vocab_size: 256,
        ..TransformerConfig::llama2_7b()
    };
    let trainer = Rc::new(RefCell::new(TransformerTrainer::new(TransformerTrainConfig::new(tiny))));
    let no_batches: Box<dyn Iterator<Item = LMBatch>> = Box::new(std::iter::empty::<LMBatch>());

    let mut step_fn = RealStepFn::new(trainer, no_batches);
    let (loss, grad_norm) = step_fn.step(0, 1.0e-4, 128);

    assert!(loss.is_finite(), "exhausted iter must return finite loss");
    assert!(grad_norm.is_finite(), "grad_norm must be finite");
    assert!(grad_norm >= 0.0, "INV-TRAIN-008: grad_norm non-negative");
}
681
/// Validating with no held-out batches must return `NaN`.
#[test]
fn real_val_fn_empty_held_out_returns_nan() {
    // Shrink the llama2_7b preset so constructing the trainer stays cheap.
    let tiny = TransformerConfig {
        hidden_size: 64,
        num_attention_heads: 4,
        num_kv_heads: 4,
        num_hidden_layers: 2,
        intermediate_size: 128,
        vocab_size: 256,
        ..TransformerConfig::llama2_7b()
    };
    let trainer = Rc::new(RefCell::new(TransformerTrainer::new(TransformerTrainConfig::new(tiny))));

    let mut val_fn = RealValFn::new(trainer, Vec::new());
    let loss = val_fn.validate(0);

    assert!(loss.is_nan(), "empty held_out must surface as NaN to the guard");
}
697
698 fn tiny_test_transformer() -> Transformer {
702 let mut tiny = TransformerConfig::llama2_7b();
703 tiny.hidden_size = 32;
704 tiny.num_attention_heads = 2;
705 tiny.num_kv_heads = 2;
706 tiny.num_hidden_layers = 2;
707 tiny.intermediate_size = 64;
708 tiny.vocab_size = 16;
709 Transformer::new(&tiny)
710 }
711
712 fn tensors_map_from_transformer(
716 transformer: &Transformer,
717 ) -> BTreeMap<String, (Vec<f32>, Vec<usize>)> {
718 let mut map = BTreeMap::new();
719 for (name, t) in transformer.named_parameters() {
720 let len = t.len();
721 let data: Vec<f32> = (0..len).map(|i| i as f32 * 0.001).collect();
722 map.insert(name, (data, vec![len]));
723 }
724 map
725 }
726
/// With a complete, correctly sized init map every parameter is populated.
#[test]
fn populate_trainer_from_init_tensors_happy_path() {
    let mut model = tiny_test_transformer();
    let init_map = tensors_map_from_transformer(&model);
    let expected_count = model.named_parameters().len();

    let result = populate_trainer_from_init_tensors(&mut model, &init_map);

    assert!(result.is_ok(), "happy-path populate must succeed: {result:?}");
    let populated = result.unwrap();
    assert_eq!(
        populated, expected_count,
        "populated count must equal named_parameters().len()"
    );
}
742
/// Init entries that match no model parameter must not cause a failure.
#[test]
fn populate_trainer_from_init_tensors_extra_entries_silently_ignored() {
    let mut model = tiny_test_transformer();
    let mut init_map = tensors_map_from_transformer(&model);
    // A key that no parameter of the tiny model carries.
    let stray_key = "model.layers.999.fictitious.weight".to_string();
    init_map.insert(stray_key, (vec![0.0; 4], vec![4]));
    let expected_count = model.named_parameters().len();

    let result = populate_trainer_from_init_tensors(&mut model, &init_map);

    assert!(result.is_ok(), "extra init entries must NOT cause Err: {result:?}");
    let populated = result.unwrap();
    assert_eq!(populated, expected_count);
}
759
/// An init tensor with the wrong element count must be rejected, and the
/// error must name the parameter and the offending length.
#[test]
fn populate_trainer_from_init_tensors_rejects_length_mismatch() {
    let mut model = tiny_test_transformer();
    let mut init_map = tensors_map_from_transformer(&model);
    let victim = model.named_parameters()[0].0.clone();
    // Overwrite the first parameter's entry with a 7-element tensor.
    init_map.insert(victim.clone(), (vec![0.0; 7], vec![7]));

    let result = populate_trainer_from_init_tensors(&mut model, &init_map);

    assert!(result.is_err(), "length-mismatch must Err, not silently truncate");
    let err = result.unwrap_err();

    let cites_id = err.contains("FALSIFY-APR-PRETRAIN-INIT-007");
    assert!(cites_id, "error must cite falsifier id; got: {err}");

    assert!(err.contains(&victim), "error must name the offending parameter; got: {err}");

    let reports_len = err.contains("init length 7");
    assert!(reports_len, "error must report the actual init length; got: {err}");
}
783
/// A model parameter with no init entry must be rejected, and the error must
/// name the missing parameter.
#[test]
fn populate_trainer_from_init_tensors_rejects_missing_required_param() {
    let mut model = tiny_test_transformer();
    let mut init_map = tensors_map_from_transformer(&model);
    let victim = model.named_parameters()[0].0.clone();
    // Drop the first parameter's entry from the init map.
    init_map.remove(&victim);

    let result = populate_trainer_from_init_tensors(&mut model, &init_map);

    assert!(result.is_err(), "missing-required must Err, not silently leave random init");
    let err = result.unwrap_err();

    let cites_id = err.contains("FALSIFY-APR-PRETRAIN-INIT-007");
    assert!(cites_id, "error must cite falsifier id; got: {err}");

    assert!(err.contains(&victim), "error must name the missing parameter; got: {err}");

    let says_missing = err.contains("not present in init APR");
    assert!(says_missing, "error must say what was missing; got: {err}");
}
809
/// With no init, the first named parameter must have `VOCAB_SIZE * HIDDEN_DIM`
/// elements, i.e. the Llama-370M embedding size.
#[test]
fn build_shared_trainer_with_init_none_uses_llama370m_shape() {
    let shared = build_shared_trainer_with_init(1.0e-4, 128, 42, None, None)
        .expect("None case must succeed");
    let trainer = shared.borrow();

    // The test treats the first named parameter as the token embedding.
    let first_param_len = trainer.model().named_parameters()[0].1.len();
    let wanted_len = Llama370MConfig::VOCAB_SIZE * Llama370MConfig::HIDDEN_DIM;

    assert_eq!(
        first_param_len,
        wanted_len,
        "init=None must produce Llama370M-shaped embedding (vocab={} × hidden={})",
        Llama370MConfig::VOCAB_SIZE,
        Llama370MConfig::HIDDEN_DIM
    );
}
833
/// Supplying only one of `init_arch` / `init_path` must be rejected in both
/// directions.
#[test]
fn build_shared_trainer_with_init_rejects_unpaired_args() {
    let qwen_cfg = TransformerConfig::qwen2_0_5b();

    // Architecture without a path.
    let arch_only = build_shared_trainer_with_init(1.0e-4, 128, 42, Some(&qwen_cfg), None);
    assert!(arch_only.is_err(), "unpaired (arch=Some, path=None) must Err");

    // Path without an architecture.
    let dummy_path = std::path::PathBuf::from("/dev/null");
    let path_only = build_shared_trainer_with_init(1.0e-4, 128, 42, None, Some(&dummy_path));
    assert!(path_only.is_err(), "unpaired (arch=None, path=Some) must Err");
}
848
/// An encoder-tagged init config must fail with the ARCH-007 error even
/// though the supplied path does not exist.
#[test]
fn build_shared_trainer_with_init_rejects_encoder_family() {
    let encoder_cfg = TransformerConfig {
        architecture: ModelArchitecture::Encoder,
        ..TransformerConfig::qwen2_0_5b()
    };
    let dummy_path = std::path::PathBuf::from("/nonexistent/encoder.apr");

    let result =
        build_shared_trainer_with_init(1.0e-4, 128, 42, Some(&encoder_cfg), Some(&dummy_path));
    let Err(err) = result else {
        panic!("encoder family must be rejected before tensor load");
    };

    let cites_id = err.contains("FALSIFY-APR-PRETRAIN-ARCH-007");
    assert!(cites_id, "error must cite falsifier id; got: {err}");
}
868
/// A decoder init config with a nonexistent path must get past the
/// architecture gate and fail at tensor loading (INIT-006, not ARCH-007).
#[test]
fn build_shared_trainer_with_init_decoder_family_proceeds_to_tensor_load() {
    let decoder_cfg = TransformerConfig::qwen2_0_5b();
    let dummy_path = std::path::PathBuf::from("/nonexistent/decoder.apr");

    let result =
        build_shared_trainer_with_init(1.0e-4, 128, 42, Some(&decoder_cfg), Some(&dummy_path));
    let Err(err) = result else {
        panic!("missing tensor path must Err");
    };

    let from_loader = err.contains("FALSIFY-APR-PRETRAIN-INIT-006");
    assert!(
        from_loader,
        "decoder family proceeds to tensor load; failure cites INIT-006 not ARCH-007; got: {err}"
    );

    let from_arch_gate = err.contains("FALSIFY-APR-PRETRAIN-ARCH-007");
    assert!(!from_arch_gate, "decoder family must NOT trigger encoder-rejection; got: {err}");
}
891
/// Host-gated sanity check on the tensors loaded from a local Qwen 0.5B APR file.
///
/// The test returns early (printing a skip note to stderr) when neither of the
/// two hard-coded model paths exists. Otherwise it loads all tensors, prints
/// summary statistics for a selection of them, and asserts that the embedding
/// weights are centered with a plausible spread and that the final norm
/// weights have a plausible mean and bounded variance.
#[test]
fn falsify_h4_init_stats_qwen_embed_norm_sensible() {
    let fresh = std::path::Path::new("/mnt/nvme-raid0/models/qwen2.5-coder-0.5b-fresh.apr");
    let legacy =
        std::path::Path::new("/mnt/nvme-raid0/models/qwen2.5-coder-0.5b-instruct-fp16.apr");
    // Whichever path is selected here is already known to exist, so no second
    // existence check is needed afterwards.
    let path = if fresh.exists() {
        fresh
    } else if legacy.exists() {
        legacy
    } else {
        eprintln!("[falsify-h4-init-stats-001] skipping: host lacks Qwen 0.5B APR");
        return;
    };

    // Diagnostic only: print the dtype and shape recorded in the file for two tensors.
    {
        use aprender::format::v2::AprV2Reader;
        let bytes = std::fs::read(path).expect("read APR");
        let reader = AprV2Reader::from_bytes(&bytes).expect("parse APR v2");
        for name in ["model.layers.0.self_attn.q_proj.bias", "model.norm.weight"] {
            if let Some(entry) = reader.get_tensor(name) {
                eprintln!(
                    "[h4-init-dtype] {name}: dtype={:?} shape={:?}",
                    entry.dtype, entry.shape
                );
            }
        }
    }

    let tensors = match load_init_tensors_from_apr(path) {
        Ok(t) => t,
        Err(e) => {
            panic!("FALSIFY-H4-INIT-STATS-001: load_init_tensors_from_apr failed: {e}");
        }
    };

    // These two tensors are required by the assertions below.
    let embed = tensors
        .get("model.embed_tokens.weight")
        .unwrap_or_else(|| panic!("missing model.embed_tokens.weight in init APR"));
    let norm = tensors
        .get("model.norm.weight")
        .unwrap_or_else(|| panic!("missing model.norm.weight in init APR"));

    // Prints and returns (mean, population std, min, max) of `data`.
    let stats = |name: &str, data: &[f32]| -> (f64, f64, f32, f32) {
        let n = data.len() as f64;
        let mean = data.iter().map(|&v| v as f64).sum::<f64>() / n;
        let var = data
            .iter()
            .map(|&v| {
                let d = v as f64 - mean;
                d * d
            })
            .sum::<f64>()
            / n;
        let std = var.sqrt();
        let min = data.iter().copied().fold(f32::INFINITY, f32::min);
        let max = data.iter().copied().fold(f32::NEG_INFINITY, f32::max);
        eprintln!(
            "[h4-init-stats] {name}: n={n} mean={mean:.5} std={std:.5} min={min:.4} max={max:.4}"
        );
        (mean, std, min, max)
    };

    // Diagnostic only: dump the leading values of two tensors. The bias lookup
    // is tolerant (like every other lookup of that key in this test) and the
    // slices are clamped, so this section cannot panic on its own.
    {
        if let Some(q) = tensors.get("model.layers.0.self_attn.q_proj.bias") {
            eprintln!(
                "[h4-dtype-mislabel] q_proj.bias L0[0..6] (aprender F16-decoded): {:?}",
                &q.0[..q.0.len().min(6)]
            );
        }
        eprintln!(
            "[h4-dtype-mislabel] model.norm.weight[0..6] (aprender F16-decoded): {:?}",
            &norm.0[..norm.0.len().min(6)]
        );
    }

    let (em, es, _, _) = stats("model.embed_tokens.weight", &embed.0);
    let (nm, ns, _, _) = stats("model.norm.weight", &norm.0);

    // Diagnostic only: per-layer norm weights for a sample of layers, when present.
    for layer_idx in [0_usize, 5, 11, 23] {
        for kind in ["input_layernorm", "post_attention_layernorm"] {
            let key = format!("model.layers.{layer_idx}.{kind}.weight");
            if let Some(t) = tensors.get(&key) {
                stats(&key, &t.0);
            }
        }
    }
    // Diagnostic only: a few layer-0 projection tensors, when present.
    for kind in [
        "model.layers.0.self_attn.q_proj.weight",
        "model.layers.0.self_attn.q_proj.bias",
        "model.layers.0.mlp.gate_proj.weight",
        "model.layers.0.mlp.down_proj.weight",
    ] {
        if let Some(t) = tensors.get(kind) {
            stats(kind, &t.0);
        }
    }

    assert!(
        em.abs() < 0.05,
        "FALSIFY-H4-INIT-STATS-001: embed mean={em} > 0.05; weights are not centered. \
         Possible f16→f32 sign-bit corruption or wrong byte-order."
    );
    assert!(
        (0.005..=0.5).contains(&es),
        "FALSIFY-H4-INIT-STATS-001: embed std={es} outside [0.005, 0.5]; weights are not \
         distributed like trained transformer init. Possible f16 mantissa misread or \
         scale corruption."
    );

    assert!(
        nm > 0.01 && nm < 100.0,
        "FALSIFY-H4-INIT-STATS-001: norm mean={nm} outside [0.01, 100]; RMSNorm scale \
         load is corrupt. Trained pretrained values are typically near 1.0."
    );
    assert!(
        ns < 100.0,
        "FALSIFY-H4-INIT-STATS-001: norm std={ns} > 100; RMSNorm has explosive variance. \
         Tensor load is corrupt."
    );
}
1055
/// Host-gated sanity check on a single-token forward pass of the Qwen 0.5B
/// preset populated from a local APR file.
///
/// The test returns early (printing a skip note to stderr) when neither
/// hard-coded model path exists. Otherwise it asserts that the logits contain
/// no NaN/Inf, are not near-constant, have a clear peak, and that the argmax
/// index lies inside the vocabulary.
#[test]
fn falsify_h4_cpu_forward_qwen_logits_sensible() {
    let fresh = std::path::Path::new("/mnt/nvme-raid0/models/qwen2.5-coder-0.5b-fresh.apr");
    let legacy =
        std::path::Path::new("/mnt/nvme-raid0/models/qwen2.5-coder-0.5b-instruct-fp16.apr");
    let path = if fresh.exists() {
        fresh
    } else if legacy.exists() {
        legacy
    } else {
        eprintln!("[falsify-h4-cpu-forward-001] skipping: host lacks Qwen 0.5B APR");
        return;
    };

    let tensors = load_init_tensors_from_apr(path).expect("load_init_tensors_from_apr");
    let cfg = TransformerConfig::qwen2_0_5b();
    let mut transformer = Transformer::new(&cfg);
    let populated = populate_trainer_from_init_tensors(&mut transformer, &tensors)
        .expect("populate_trainer_from_init_tensors");
    eprintln!("[falsify-h4-cpu-forward-001] populated {populated} tensors");

    // Forward a single token (id 100).
    let token_ids = vec![100_u32];
    let logits = transformer.forward(&token_ids);
    let data = logits.data();
    let values = data.as_slice().expect("logits contiguous");

    let mut nan_count = 0usize;
    let mut inf_count = 0usize;
    let mut min = f32::INFINITY;
    let mut max = f32::NEG_INFINITY;
    let mut sum = 0.0_f64;
    let mut sum_sq = 0.0_f64;
    let mut argmax_idx = 0_usize;
    for (i, &v) in values.iter().enumerate() {
        // Non-finite values are counted and kept out of the running statistics.
        if v.is_nan() {
            nan_count += 1;
            continue;
        }
        if v.is_infinite() {
            inf_count += 1;
            continue;
        }
        if v < min {
            min = v;
        }
        if v > max {
            max = v;
            argmax_idx = i;
        }
        let wide = f64::from(v);
        sum += wide;
        sum_sq += wide * wide;
    }

    // Mean and std are taken over the full slice length.
    let n = values.len() as f64;
    let mean = sum / n;
    let std = (sum_sq / n - mean * mean).sqrt();

    eprintln!(
        "[falsify-h4-cpu-forward-001] token=100 logits: n={} nan={nan_count} inf={inf_count} \
         min={min:.4} max={max:.4} mean={mean:.4} std={std:.4} argmax={argmax_idx}",
        values.len()
    );

    assert_eq!(nan_count, 0, "logits contain NaN — forward corruption");
    assert_eq!(inf_count, 0, "logits contain Inf — forward corruption");
    assert!(
        std > 0.01,
        "FALSIFY-H4-CPU-FORWARD-001: logits std={std} < 0.01 — essentially constant"
    );

    // Distance of the peak from the mean, in units of std (floored to avoid division by ~0).
    let peak_to_mean = (max as f64 - mean).abs() / std.max(1e-9);
    assert!(
        peak_to_mean > 1.5,
        "FALSIFY-H4-CPU-FORWARD-001: peak-to-mean ratio = {peak_to_mean} < 1.5 — \
         logits are essentially uniform"
    );
    assert!(
        (argmax_idx as u32) < cfg.vocab_size as u32,
        "FALSIFY-H4-CPU-FORWARD-001: argmax_idx={argmax_idx} >= vocab_size={}",
        cfg.vocab_size
    );
}
1140}