aprender-core 0.29.1

Next-generation machine learning library in pure Rust
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
//! APR Import Pipeline
//! PMAT-197: Extracted from mod.rs

use crate::error::{AprenderError, Result};
use crate::format::converter_types::{
    Architecture, ImportError, ImportOptions, QuantizationType, Source, TensorExpectation,
    ValidationConfig,
};
use crate::format::gguf::{
    load_gguf_raw, load_gguf_with_tokenizer, GgufModelConfig, GgufRawTensor, GgufTokenizer,
};
use crate::format::layout_contract::{contract, enforce_import_contract};
use crate::format::sharded::ShardIndex;
use crate::format::validation::{AprValidator, TensorStats, ValidationReport};
use crate::serialization::safetensors::{MappedSafeTensors, UserMetadata};
use provable_contracts_macros::{ensures, requires};
use std::collections::BTreeMap;

// Import write functions and helpers from parent module
use super::{validate_tensor_values, write_apr_file, write_apr_file_raw};
use std::fs;
use std::path::{Path, PathBuf};

#[cfg(feature = "hf-hub-integration")]
use crate::format::converter_types::parse_import_error;

/// Import a model from external format (GGUF, SafeTensors, HF) into APR format.
///
/// Provable contract: source is non-empty, output path is writable.
///
/// Pipeline, in the order the body executes it:
/// 1. Parse `source` and resolve it to a local path (`resolve_source`).
/// 2. If the file is GGUF, try the raw import that preserves quantization; on
///    success its report is returned immediately.
/// 3. If the resolved file name ends in `.index.json`, hand off to the
///    streaming sharded importer and return its result.
/// 4. Otherwise load tensors as f32, determine the effective architecture,
///    map tensor names, split fused QKV for GPT-2 / GPT-NeoX, and (for the
///    GGUF dequant fallback) apply the import shape contract.
/// 5. Validate against the layout contract, run the architecture completeness
///    gate, validate tensor values, and write the APR file.
///
/// # Errors
/// Propagates any error returned by the parsing, resolution, loading,
/// validation, gating and writing helpers called below.
///
/// # Panics
/// Panics if `enforce_import_contract` reports `needs_data_transpose == true`
/// on the GGUF fallback path (treated as a contract bug by the `assert!`).
#[requires(!source.is_empty())]
pub fn apr_import<P: AsRef<Path>>(
    source: &str,
    output: P,
    options: ImportOptions,
) -> Result<ValidationReport> {
    let parsed_source = Source::parse(source)?;
    let output_path = output.as_ref();

    // Step 1: Resolve source to local path
    let local_path = resolve_source(&parsed_source, options.cache)?;

    // Step 2: Check if GGUF - use raw import path to preserve quantization
    // PMAT-271: Use magic bytes first, extension fallback for extensionless HF cache blobs
    // (the extension is only consulted when `from_magic` returns an error).
    let is_gguf = crate::format::rosetta::FormatType::from_magic(&local_path)
        .map(|f| matches!(f, crate::format::rosetta::FormatType::Gguf))
        .unwrap_or_else(|_| local_path.extension().and_then(|e| e.to_str()) == Some("gguf"));
    if is_gguf {
        // PMAT-103: Use raw GGUF loading to preserve Q4_K/Q6_K quantization
        // GH-375: Falls back to dequant→requant for unsupported dtypes (Q4_0, Q5_0, Q8_0)
        // `Ok(None)` from the helper means "fall through to the f32 path below".
        if let Some(report) = try_gguf_raw_import(&local_path, output_path, &options)? {
            return Ok(report);
        }
    }

    // realizar#136: Streaming import for sharded SafeTensors (>10B params).
    // Writes tensors to disk incrementally — peak RAM = 1 shard (~5 GB) instead of full model.
    let is_sharded_safetensors = local_path
        .file_name()
        .is_some_and(|n| n.to_string_lossy().ends_with(".index.json"));
    if is_sharded_safetensors {
        return streaming_sharded_import(&local_path, output_path, &options);
    }

    // Non-GGUF path: Load tensors as f32, apply quantization during write
    let mut load_result = load_source_tensors(&local_path, &options)?;

    // PMAT-SAFETENSORS-TOK-001: For HuggingFace SafeTensors imports, try to find
    // tokenizer.json from the same repo if not found as sibling file
    resolve_hf_tokenizer_fallback(&mut load_result, &parsed_source);

    // model-metadata-bounds-v1.yaml: warn on out-of-bounds config values at import time
    if let Some(config) = load_result.model_config.as_ref() {
        config.warn_out_of_bounds();
    }

    // PMAT-224: Warn about unverified architectures before proceeding
    let metadata_arch = infer_architecture(
        &options.architecture,
        load_result
            .model_config
            .as_ref()
            .and_then(|c| c.architecture.as_deref()),
    );
    // CONTRACT: Tensor evidence overrides metadata claims
    // GH-576: Pass user_specified flag so explicit --arch is respected
    let user_specified = options.architecture != Architecture::Auto;
    let effective_arch = verify_architecture_from_tensor_evidence(
        metadata_arch,
        load_result.tensors.keys().map(String::as_str),
        user_specified,
    );
    warn_unverified_architecture(&effective_arch, options.strict)?;

    // Step 3: Map tensor names to canonical APR names
    let mut mapped_tensors = map_tensor_names(&load_result.tensors, effective_arch);

    // GH-233: Split fused QKV tensors for GPT-2 after name mapping
    if effective_arch == Architecture::Gpt2 {
        Architecture::split_gpt2_fused_qkv(&mut mapped_tensors);
    }
    // GH-311: Split fused QKV tensors for GPT-NeoX after name mapping
    if effective_arch == Architecture::GptNeoX {
        Architecture::split_neox_fused_qkv(&mut mapped_tensors);
    }

    // BUG-IMPORT-001 FIX: When GGUF falls back to dequant→requant (GH-375),
    // load_source_tensors returns GGUF col-major shapes [ne0, ne1].
    // We must apply enforce_import_contract to reverse them to APR row-major [ne1, ne0].
    // Without this, lm_head.weight gets shape [hidden, vocab] instead of [vocab, hidden],
    // producing corrupt APR files (Grade F, density violations on re-quantization).
    if is_gguf {
        // Missing config values are passed to the contract as 0 on this path
        // (no hard error here, unlike the raw GGUF path).
        let vocab_size_for_contract = load_result
            .model_config
            .as_ref()
            .and_then(|c| c.vocab_size)
            .unwrap_or(0);
        let hidden_dim_for_contract = load_result
            .model_config
            .as_ref()
            .and_then(|c| c.hidden_size)
            .unwrap_or(0);
        // Only the shape is replaced; tensor data is carried over untouched.
        mapped_tensors = mapped_tensors
            .into_iter()
            .map(|(name, (data, shape))| {
                let (apr_shape, needs_data_transpose) =
                    enforce_import_contract(&name, &shape, vocab_size_for_contract, hidden_dim_for_contract);
                assert!(
                    !needs_data_transpose,
                    "CONTRACT BUG: enforce_import_contract returned needs_data_transpose=true for '{}'. \
                     GGUF→APR NEVER needs data transpose. See GH-208, BUG-IMPORT-001.",
                    name
                );
                (name, (data, apr_shape))
            })
            .collect();
        let transformed_count = mapped_tensors.len();
        eprintln!(
            "[BUG-IMPORT-001] Applied GGUF→APR shape contract to {} tensors (dequant fallback path)",
            transformed_count
        );
    }

    // GH-205 + GH-353: Also map F16/BF16 raw tensor names for passthrough
    // (bytes and shapes are cloned; only the key is renamed).
    let mapped_f16_raw: BTreeMap<String, (Vec<u8>, Vec<usize>, bool)> = load_result
        .f16_raw_tensors
        .iter()
        .map(|(name, (bytes, shape, is_bf16))| {
            let mapped_name = effective_arch.map_name(name);
            (mapped_name, (bytes.clone(), shape.clone(), *is_bf16))
        })
        .collect();

    // Step 4: ENFORCE CONTRACT (P0 - contracts/tensor-layout-v1.yaml)
    // The contract is the SOURCE OF TRUTH for tensor shapes.
    let layout_contract = contract();
    // Same config lookups as in the GGUF fallback block above; 0 when absent.
    let vocab_size = load_result
        .model_config
        .as_ref()
        .and_then(|c| c.vocab_size)
        .unwrap_or(0);
    let hidden_dim = load_result
        .model_config
        .as_ref()
        .and_then(|c| c.hidden_size)
        .unwrap_or(0);

    validate_contract_f32(
        &layout_contract,
        &mapped_tensors,
        vocab_size,
        hidden_dim,
        options.strict,
    )?;

    // GH-279: Architecture completeness gate for SafeTensors path
    // PMAT-296: Gate MUST run even without model_config — infer num_layers from tensor names
    if let Some(config) = load_result.model_config.as_ref() {
        enforce_arch_completeness_gate_f32(&effective_arch, &mapped_tensors, config)?;
    } else {
        enforce_arch_completeness_gate_inferred(&effective_arch, &mapped_tensors)?;
    }

    // Step 5: Validate tensors (inline validation)
    let validation_result = validate_tensors(&mapped_tensors, &options)?;

    // Step 6: Write APR format (with tokenizer AND model config - CRITICAL for inference)
    // Note: Quantization (fp16/int8/int4) is applied during write for true packed storage
    // PMAT-223: Pass user metadata for preservation in APR custom field
    // GH-205: Pass F16 raw tensors for passthrough
    write_apr_file(
        &mapped_tensors,
        &mapped_f16_raw,
        output_path,
        &options,
        load_result.tokenizer.as_ref(),
        load_result.model_config.as_ref(),
        &load_result.user_metadata,
    )?;

    // The report produced by the validation step is the function's result.
    Ok(validation_result)
}

/// GH-375: Attempt the quantization-preserving raw GGUF import.
///
/// Outcomes:
/// - `Ok(Some(report))` — the raw import succeeded.
/// - `Ok(None)` — the raw import failed with an error whose text contains
///   "cannot represent exactly" or "not yet supported"; the caller should
///   fall through to the dequant→requant path.
/// - `Err(e)` — any other failure, propagated unchanged.
fn try_gguf_raw_import(
    path: &Path,
    output_path: &Path,
    options: &ImportOptions,
) -> Result<Option<ValidationReport>> {
    let failure = match apr_import_gguf_raw(path, output_path, options) {
        Ok(report) => return Ok(Some(report)),
        Err(failure) => failure,
    };

    // Recoverability is decided from the rendered error text.
    let rendered = failure.to_string();
    let recoverable = ["cannot represent exactly", "not yet supported"]
        .iter()
        .any(|needle| rendered.contains(*needle));
    if !recoverable {
        return Err(failure);
    }

    // Only the first line of the error is echoed to keep the log compact.
    let headline = rendered.lines().next().unwrap_or("unsupported dtype");
    eprintln!(
        "[GH-375] Raw import failed ({}), falling back to dequant→requant path",
        headline
    );
    Ok(None)
}

/// Import a GGUF file while keeping its original quantization (Q4_K, Q6_K, etc.).
///
/// This is the preferred GGUF path: tensor bytes are carried through as
/// loaded by `load_gguf_raw`, only names and shapes are rewritten.
///
/// Steps: load raw tensors, warn on out-of-bounds config values, resolve the
/// tokenizer, resolve and verify the architecture, map names and enforce the
/// layout contract, run the completeness gate, then write the APR file.
///
/// The returned report is a fresh `ValidationReport` whose `total_score` is
/// set to the fixed value 85.
///
/// # Errors
/// Propagates errors from loading, tokenizer resolution, architecture
/// resolution, contract enforcement, the completeness gate and the writer.
#[requires(gguf_path.exists())]
pub(crate) fn apr_import_gguf_raw(
    gguf_path: &Path,
    output_path: &Path,
    options: &ImportOptions,
) -> Result<ValidationReport> {
    let gguf = load_gguf_raw(gguf_path)?;

    // model-metadata-bounds-v1.yaml: surface suspicious config values early.
    gguf.model_config.warn_out_of_bounds();

    let tokenizer = resolve_gguf_tokenizer(
        &gguf.tokenizer,
        gguf_path,
        options.tokenizer_path.as_deref(),
    )?;

    let declared_arch = resolve_and_log_architecture(
        &options.architecture,
        gguf.model_config.architecture.as_deref(),
        options.strict,
    )?;

    // CONTRACT: tensor evidence may override what the metadata claims
    // (e.g., bartowski Qwen3 GGUFs claim "qwen2").
    // GH-576: an explicit, non-Auto architecture option is flagged as user-specified.
    let arch = verify_architecture_from_tensor_evidence(
        declared_arch,
        gguf.tensors.keys().map(String::as_str),
        options.architecture != Architecture::Auto,
    );

    // Consumes the tensor map; the config is still borrowed afterwards.
    let tensors = map_and_enforce_raw_tensors(gguf.tensors, &arch, &gguf.model_config)?;

    // GH-279: refuse to write a model that is missing required tensors.
    enforce_arch_completeness_gate(&arch, &tensors, &gguf.model_config)?;

    let mut report = ValidationReport::new();
    report.total_score = 85;

    write_apr_file_raw(
        &tensors,
        output_path,
        options,
        Some(&tokenizer),
        Some(&gguf.model_config),
    )?;

    Ok(report)
}

/// Determine the architecture from the user option and the GGUF metadata,
/// log it, and run the unverified-architecture warning.
///
/// * `user_arch` — architecture from the import options.
/// * `gguf_arch` — architecture string from the GGUF config, if any.
/// * `strict` — forwarded to `warn_unverified_architecture`.
///
/// Returns the architecture produced by `infer_architecture`.
///
/// # Errors
/// Propagates any error from `warn_unverified_architecture`.
fn resolve_and_log_architecture(
    user_arch: &Architecture,
    gguf_arch: Option<&str>,
    strict: bool,
) -> Result<Architecture> {
    let arch = infer_architecture(user_arch, gguf_arch);

    // NOTE(review): the message says "Auto-detected" but the condition only
    // checks that the result is not `Auto`; presumably it also fires when the
    // user passed an explicit architecture — confirm against `infer_architecture`.
    let is_concrete = arch != Architecture::Auto;
    if is_concrete {
        eprintln!(
            "[PMAT-222] Auto-detected architecture: {:?} (tensor names will be mapped)",
            arch
        );
    }

    warn_unverified_architecture(&arch, strict)?;
    Ok(arch)
}

/// Map tensor names, split GPT-2 QKV if needed, and enforce layout contract.
#[requires(!tensors.is_empty())]
#[ensures(ret.is_ok())]
fn map_and_enforce_raw_tensors(
    tensors: BTreeMap<String, GgufRawTensor>,
    effective_arch: &Architecture,
    model_config: &crate::format::gguf::GgufModelConfig,
) -> Result<BTreeMap<String, GgufRawTensor>> {
    use crate::format::layout_contract::enforce_import_contract;

    // Stage 1: Name mapping
    let mut mapped: BTreeMap<String, GgufRawTensor> = tensors
        .into_iter()
        .map(|(name, tensor)| (effective_arch.map_name(&name), tensor))
        .collect();

    // Stage 2: GPT-2 / GPT-NeoX QKV splitting
    if *effective_arch == Architecture::Gpt2 {
        Architecture::split_gpt2_fused_qkv_raw(&mut mapped);
    }
    if *effective_arch == Architecture::GptNeoX {
        Architecture::split_neox_fused_qkv_raw(&mut mapped);
    }

    // Stage 3: Contract enforcement (GH-208)
    let vocab_size = model_config.vocab_size.unwrap_or(0);
    let hidden_dim = model_config.hidden_size.unwrap_or(0);

    if vocab_size == 0 || hidden_dim == 0 {
        return Err(AprenderError::FormatError {
            message: format!(
                "CONTRACT ENFORCEMENT FAILED: Missing vocab_size ({}) or hidden_dim ({}). \
                 Cannot validate tensor layouts without model config. \
                 This GGUF file may be malformed.",
                vocab_size, hidden_dim
            ),
        });
    }

    let mapped: BTreeMap<String, GgufRawTensor> = mapped
        .into_iter()
        .map(|(name, mut tensor)| {
            let (apr_shape, needs_data_transpose) =
                enforce_import_contract(&name, &tensor.shape, vocab_size, hidden_dim);
            assert!(
                !needs_data_transpose,
                "CONTRACT BUG: enforce_import_contract returned needs_data_transpose=true for '{}'. \
                 GGUF→APR NEVER needs data transpose. See GH-208.",
                name
            );
            tensor.shape = apr_shape;
            (name, tensor)
        })
        .collect();

    eprintln!(
        "[CONTRACT-ENFORCED] {} tensors transformed via tensor-layout-v1.yaml (vocab={}, hidden={})",
        mapped.len(),
        vocab_size,
        hidden_dim
    );

    Ok(mapped)
}

/// GH-279: Completeness gate for the raw GGUF import path.
///
/// Passes the tensor names, the architecture's completeness key and the
/// configured layer count to `layout_contract::enforce_architecture_completeness`
/// so that an incomplete model is rejected before the APR file is written.
///
/// The gate is skipped (returns `Ok(())`) when any of these holds:
/// - `arch.completeness_key()` is `None`,
/// - `config.num_layers` is `None`,
/// - no tensor name contains `model.layers.` or `blk.` (a config may describe
///   layers that are not present in this particular file).
///
/// # Errors
/// Returns `AprenderError::FormatError` wrapping the completeness failure.
fn enforce_arch_completeness_gate(
    arch: &Architecture,
    tensors: &BTreeMap<String, GgufRawTensor>,
    config: &GgufModelConfig,
) -> Result<()> {
    // Both the architecture key and the layer count are needed to check anything.
    let (Some(arch_key), Some(num_layers)) = (arch.completeness_key(), config.num_layers) else {
        return Ok(());
    };

    let is_layer_tensor =
        |name: &String| name.contains("model.layers.") || name.contains("blk.");
    if !tensors.keys().any(is_layer_tensor) {
        return Ok(());
    }

    let names: Vec<&str> = tensors.keys().map(String::as_str).collect();
    let outcome = crate::format::layout_contract::enforce_architecture_completeness(
        &names, arch_key, num_layers,
    );
    outcome.map_err(|e| AprenderError::FormatError {
        message: format!("GH-279 architecture completeness gate: {e}"),
    })
}

/// GH-279: Completeness gate for the f32 tensor map (SafeTensors path).
///
/// Same checks as the raw GGUF gate, but over `(data, shape)` tensor entries.
/// Returns `Ok(())` without checking when the architecture has no completeness
/// key, when `config.num_layers` is absent, or when no tensor name contains
/// `model.layers.` or `blk.`.
///
/// # Errors
/// Returns `AprenderError::FormatError` wrapping the completeness failure.
fn enforce_arch_completeness_gate_f32(
    arch: &Architecture,
    tensors: &BTreeMap<String, (Vec<f32>, Vec<usize>)>,
    config: &GgufModelConfig,
) -> Result<()> {
    let arch_key = match arch.completeness_key() {
        Some(key) => key,
        None => return Ok(()),
    };
    let num_layers = match config.num_layers {
        Some(count) => count,
        None => return Ok(()),
    };

    let names: Vec<&str> = tensors.keys().map(String::as_str).collect();

    // A config.json may describe layers that aren't present in this particular
    // file (e.g., standalone embeddings); nothing to verify in that case.
    let has_layers = names
        .iter()
        .any(|n| n.contains("model.layers.") || n.contains("blk."));
    if !has_layers {
        return Ok(());
    }

    crate::format::layout_contract::enforce_architecture_completeness(&names, arch_key, num_layers)
        .map_err(|e| {
            let message = format!("GH-279 architecture completeness gate: {e}");
            AprenderError::FormatError { message }
        })
}

/// PMAT-296: Completeness gate used when no model config is available.
///
/// The layer count is derived from the tensor names via
/// `infer_num_layers_from_tensor_names`, so SafeTensors imports without a
/// config.json are still checked (closes the GAP-1 bypass).
///
/// Returns `Ok(())` without checking when the architecture has no
/// completeness key or when the inferred layer count is 0.
///
/// # Errors
/// Returns `AprenderError::FormatError` wrapping the completeness failure.
fn enforce_arch_completeness_gate_inferred(
    arch: &Architecture,
    tensors: &BTreeMap<String, (Vec<f32>, Vec<usize>)>,
) -> Result<()> {
    let arch_key = match arch.completeness_key() {
        Some(key) => key,
        None => return Ok(()),
    };

    let num_layers = infer_num_layers_from_tensor_names(tensors.keys().map(String::as_str));
    if num_layers == 0 {
        // No layer tensors → skip (e.g., standalone embeddings)
        return Ok(());
    }

    let names: Vec<&str> = tensors.keys().map(String::as_str).collect();
    let outcome = crate::format::layout_contract::enforce_architecture_completeness(
        &names, arch_key, num_layers,
    );
    outcome.map_err(|e| AprenderError::FormatError {
        message: format!("GH-279 architecture completeness gate (inferred): {e}"),
    })
}

/// Infer the number of transformer layers from tensor name patterns.
///
/// A name contributes a layer index when it *starts with* `blk.{N}.` or
/// `model.layers.{N}.`, where `{N}` (the text up to the next `.`) parses as
/// `usize`. Names with any other prefix, or with a non-numeric index, are
/// ignored.
///
/// Returns `max(N) + 1`, or `0` when no name carries a layer index. The
/// addition saturates, so a (malformed or hostile) name carrying index
/// `usize::MAX` yields `usize::MAX` instead of overflowing — previously that
/// panicked in debug builds and wrapped to `0` in release builds, which made
/// the caller treat the model as having no layers.
fn infer_num_layers_from_tensor_names<'a>(names: impl Iterator<Item = &'a str>) -> usize {
    names
        .filter_map(|name| {
            // `blk.` takes precedence; `model.layers.` is only tried when the
            // name does not start with `blk.`.
            let rest = name
                .strip_prefix("blk.")
                .or_else(|| name.strip_prefix("model.layers."))?;
            // The index is the first dot-separated component after the prefix.
            rest.split('.').next()?.parse::<usize>().ok()
        })
        .max()
        .map_or(0, |max_layer| max_layer.saturating_add(1))
}

/// PMAT-SAFETENSORS-TOK-001: Fill in a missing tokenizer from the HF cache.
///
/// Does nothing when `load_result` already has a tokenizer or when `source`
/// is not a HuggingFace source. Otherwise looks up `tokenizer.json` for the
/// same org/repo via `find_in_cache` and, if found, stores whatever
/// `load_tokenizer_from_json` returns for it.
fn resolve_hf_tokenizer_fallback(load_result: &mut SourceLoadResult, source: &Source) {
    let Source::HuggingFace { org, repo, .. } = source else {
        return;
    };
    if load_result.tokenizer.is_some() {
        return;
    }
    let Some(cached_tokenizer) = find_in_cache(org, repo, "tokenizer.json") else {
        return;
    };
    load_result.tokenizer = load_tokenizer_from_json(&cached_tokenizer);
}

/// Turn a parsed [`Source`] into a path on the local filesystem.
///
/// Dispatches on the source kind: local paths, HuggingFace references
/// (`cache` controls whether the cache is consulted first) and URLs each have
/// their own resolver.
///
/// # Errors
/// Propagates the error of whichever resolver handles the source.
pub(crate) fn resolve_source(source: &Source, cache: bool) -> Result<PathBuf> {
    match source {
        Source::Url(address) => resolve_url_source(address),
        Source::HuggingFace { org, repo, file } => {
            let requested_file = file.as_ref();
            resolve_hf_source(org, repo, requested_file, cache)
        }
        Source::Local(local_path) => resolve_local_source(local_path),
    }
}

/// Resolve a local file or directory to the model file to import.
///
/// - Missing path → `ImportError::NotFound` (converted to `AprenderError`).
/// - Directory → delegated to `resolve_local_directory` (GH-218, sharded
///   SafeTensors directories).
/// - Anything else → the path itself.
fn resolve_local_source(path: &Path) -> Result<PathBuf> {
    if !path.exists() {
        // GH-129: ImportError carries an actionable message.
        // Status 0 marks a local file rather than an HTTP response.
        Err(AprenderError::from(ImportError::NotFound {
            resource: path.display().to_string(),
            status: 0,
        }))
    } else if path.is_dir() {
        resolve_local_directory(path)
    } else {
        Ok(path.to_path_buf())
    }
}

/// Pick the model file to import from a local directory.
///
/// Candidates are tried in order and the first one that exists wins:
/// 1. `model.safetensors.index.json` (sharded model index)
/// 2. `model.safetensors` (single-file model)
///
/// # Errors
/// Returns `AprenderError::FormatError` when neither file exists.
fn resolve_local_directory(path: &Path) -> Result<PathBuf> {
    ["model.safetensors.index.json", "model.safetensors"]
        .iter()
        .map(|file_name| path.join(file_name))
        .find(|candidate| candidate.exists())
        .ok_or_else(|| AprenderError::FormatError {
            message: format!(
                "Directory {} contains no model.safetensors.index.json or model.safetensors",
                path.display()
            ),
        })
}

/// Resolve a HuggingFace source: look in the cache first (when `cache` is
/// set), then download if the `hf-hub-integration` feature is enabled.
///
/// * `file` — explicit file name inside the repo; when `None` a default is
///   chosen from the repo name (`model.gguf` if the lowercased name contains
///   "gguf", otherwise `model.safetensors`).
///
/// # Errors
/// With `hf-hub-integration`: whatever `download_from_hf` returns.
/// Without it: `AprenderError::FormatError` with manual download instructions.
fn resolve_hf_source(org: &str, repo: &str, file: Option<&String>, cache: bool) -> Result<PathBuf> {
    // PMAT-168: Smart default filename based on repo type.
    // GGUF repos are detected by name convention; the concrete quantized file
    // names (e.g. qwen2.5-coder-1.5b-instruct-q4_k_m.gguf) are tried by
    // find_hf_in_cache, so "model.gguf" is only the nominal default here.
    let filename: &str = match file {
        Some(explicit) => explicit.as_str(),
        None if repo.to_lowercase().contains("gguf") => "model.gguf",
        None => "model.safetensors",
    };

    // Check standard cache locations first
    if cache {
        let cached = find_hf_in_cache(org, repo, file, filename).or_else(|| {
            // GH-279-2: Sharded SafeTensors models (e.g. Qwen3-8B) ship
            // model.safetensors.index.json instead of a single model.safetensors.
            let wants_default_safetensors = file.is_none() && filename == "model.safetensors";
            if wants_default_safetensors {
                find_in_cache(org, repo, "model.safetensors.index.json")
            } else {
                None
            }
        });
        if let Some(path) = cached {
            return Ok(path);
        }
    }

    // Try to download using hf-hub if feature is enabled (GH-129: proper error handling)
    #[cfg(feature = "hf-hub-integration")]
    {
        let repo_id = format!("{org}/{repo}");
        // Tail expression of the function when the feature is enabled
        download_from_hf(&repo_id, filename)
    }

    // Only reach here if hf-hub-integration feature is disabled
    #[cfg(not(feature = "hf-hub-integration"))]
    Err(AprenderError::FormatError {
        message: format!(
            "HuggingFace model not found in cache. Download manually:\n\
             huggingface-cli download {org}/{repo} {filename}\n\
             Or provide a local path to the SafeTensors/GGUF file.",
        ),
    })
}

/// Look up a model file in the HuggingFace cache.
///
/// When no explicit `file` was requested and the lowercased repo name
/// contains "gguf" (PMAT-168), a list of common GGUF file names derived from
/// the repo name is tried first, in this order: `-q4_k_m`, `-q4_k`, `-q8_0`,
/// then `model.gguf`. If none is cached — or the repo is not a GGUF repo —
/// `filename` itself is looked up.
fn find_hf_in_cache(
    org: &str,
    repo: &str,
    file: Option<&String>,
    filename: &str,
) -> Option<PathBuf> {
    let repo_lower = repo.to_lowercase();
    let try_gguf_defaults = repo_lower.contains("gguf") && file.is_none();

    if try_gguf_defaults {
        // Base name = repo name without the "-gguf" / "_gguf" marker.
        let stem = repo_lower.replace("-gguf", "").replace("_gguf", "");
        let candidates = [
            format!("{stem}-q4_k_m.gguf"),
            format!("{stem}-q4_k.gguf"),
            format!("{stem}-q8_0.gguf"),
            "model.gguf".to_string(),
        ];
        let hit = candidates
            .iter()
            .find_map(|candidate| find_in_cache(org, repo, candidate));
        if hit.is_some() {
            return hit;
        }
    }

    find_in_cache(org, repo, filename)
}

/// Resolve a URL source.
///
/// Downloading from URLs is not implemented: this always returns
/// `AprenderError::FormatError` naming the requested `url`.
fn resolve_url_source(url: &str) -> Result<PathBuf> {
    let message = format!("URL download not yet implemented: {url}");
    Err(AprenderError::FormatError { message })
}

/// GH-478: Write one tensor through the streaming writer using the requested
/// quantization.
///
/// Mirrors `dispatch_quantize()` in write.rs but operates on `AprV2StreamingWriter`.
///
/// Dispatch rules:
/// - `None` → stored as F32.
/// - `Fp16` → stored as F16 (the skip check is not consulted).
/// - `Int8` / `Int4` / `Q4K` → stored as F32 when
///   `should_skip_quantization(name, data.len())` is true (norms, biases,
///   small tensors), otherwise quantized to Q8 / Q4 / Q4_K respectively.
///
/// # Errors
/// Returns whatever `V2FormatError` the writer method reports.
fn streaming_dispatch_quantize(
    writer: &mut crate::format::v2::AprV2StreamingWriter,
    name: &str,
    data: &[f32],
    shape: Vec<usize>,
    quantize: Option<QuantizationType>,
) -> std::result::Result<(), crate::format::v2::V2FormatError> {
    let keep_f32 = super::should_skip_quantization(name, data.len());

    // No quantization requested → store as F32
    let Some(kind) = quantize else {
        return writer.add_f32_tensor(name, shape, data);
    };

    // GH-439 (poka-yoke): the match has no wildcard arm, so adding a new
    // QuantizationType variant forces a compile error here.
    match (kind, keep_f32) {
        (QuantizationType::Fp16, _) => writer.add_f16_tensor(name, shape, data),
        // Sensitive tensors keep full precision
        (QuantizationType::Int8 | QuantizationType::Int4 | QuantizationType::Q4K, true) => {
            writer.add_f32_tensor(name, shape, data)
        }
        (QuantizationType::Int8, false) => writer.add_q8_tensor(name, shape, data),
        (QuantizationType::Int4, false) => writer.add_q4_tensor(name, shape, data),
        (QuantizationType::Q4K, false) => {
            let packed = super::quantize_q4_k_matrix(data, &shape);
            writer.add_q4k_raw_tensor(name, shape, &packed)
        }
    }
}

/// realizar#136: Streaming import for sharded SafeTensors models.
///
/// Processes one shard at a time, writing tensors to disk immediately via
/// `AprV2StreamingWriter`. Peak RAM = 1 shard's mmap (~5 GB) + streaming
/// writer's index entries (~180 KB for 1811 tensors).
///
/// For Qwen3.5-35B-A3B (67 GB, 14 shards, 1811 tensors), this uses ~5 GB
/// peak RAM instead of ~134 GB with the non-streaming path.
///
/// Steps performed:
/// 1. Parse the shard index at `index_path` and resolve the shard directory.
/// 2. Load the model config and tokenizer; a missing config is an error unless
///    `options.allow_no_config` is set.
/// 3. For every shard, write each tensor (names starting with `__` are skipped).
///    F16/BF16 tensors are copied as raw bytes when `options.quantize` is `None`
///    or `Fp16`; everything else is read as F32 and routed through
///    `streaming_dispatch_quantize`.
/// 4. If no `lm_head.weight` / `output.weight` was seen, the embedding tensor is
///    written a second time as `lm_head.weight`, using the same passthrough rule
///    as step 3 so the tied copy has the same storage type as any other tensor.
/// 5. Finalize the writer into `output_path`.
///
/// Progress is reported on stderr.
///
/// # Errors
/// Returns `AprenderError::FormatError` (or whatever `ShardIndex::from_json`
/// returns) when the index cannot be read or parsed, lists no shards, has no
/// parent directory, the config is missing and not allowed to be, a shard file
/// is absent or cannot be mapped, a tensor cannot be extracted or written, or
/// finalization fails.
fn streaming_sharded_import(
    index_path: &Path,
    output_path: &Path,
    options: &ImportOptions,
) -> Result<ValidationReport> {
    use crate::format::v2::{AprV2Metadata, AprV2StreamingWriter};

    let content = fs::read_to_string(index_path).map_err(|e| AprenderError::FormatError {
        message: format!("Failed to read shard index {}: {e}", index_path.display()),
    })?;
    let index = ShardIndex::from_json(&content)?;

    if index.shard_count() == 0 {
        return Err(AprenderError::FormatError {
            message: "Shard index contains no shard files".to_string(),
        });
    }

    // Best effort: fall back to the path as given when it cannot be canonicalized.
    let canonical_index =
        std::fs::canonicalize(index_path).unwrap_or_else(|_| index_path.to_path_buf());
    let base_dir = canonical_index
        .parent()
        .ok_or_else(|| AprenderError::FormatError {
            message: format!(
                "Cannot determine parent directory of {}",
                index_path.display()
            ),
        })?;

    // Load config.json and tokenizer
    let sibling_path = base_dir.join("model.safetensors.index.json");
    let model_config = load_model_config_from_json(&sibling_path);
    let tokenizer = load_tokenizer_from_json(&sibling_path);

    if model_config.is_none() && !options.allow_no_config {
        return Err(AprenderError::FormatError {
            message: format!(
                "config.json not found at {}. Use --allow-no-config to proceed without it.",
                base_dir.join("config.json").display()
            ),
        });
    }

    // Infer architecture
    let metadata_arch = infer_architecture(
        &options.architecture,
        model_config
            .as_ref()
            .and_then(|c| c.architecture.as_deref()),
    );

    // Build metadata for APR file
    let param_count = 0u64; // Computed during finalize from tensor shapes

    // GH-478: Embed tokenizer in metadata (was previously discarded in streaming path)
    let mut custom = std::collections::HashMap::new();
    if let Some(ref tok) = tokenizer {
        super::write::insert_f32_tokenizer_metadata(tok, &mut custom);
    }

    let metadata = AprV2Metadata {
        model_type: format!("{metadata_arch:?}"),
        name: Some(
            output_path
                .file_stem()
                .and_then(|s| s.to_str())
                .unwrap_or("model")
                .to_string(),
        ),
        param_count,
        custom,
        architecture: model_config.as_ref().and_then(|c| c.architecture.clone()),
        hidden_size: model_config.as_ref().and_then(|c| c.hidden_size),
        num_layers: model_config.as_ref().and_then(|c| c.num_layers),
        num_heads: model_config.as_ref().and_then(|c| c.num_heads),
        num_kv_heads: model_config.as_ref().and_then(|c| c.num_kv_heads),
        vocab_size: model_config.as_ref().and_then(|c| c.vocab_size),
        intermediate_size: model_config.as_ref().and_then(|c| c.intermediate_size),
        max_position_embeddings: model_config
            .as_ref()
            .and_then(|c| c.max_position_embeddings),
        rope_theta: model_config.as_ref().and_then(|c| c.rope_theta),
        rope_type: model_config.as_ref().and_then(|c| c.rope_type),
        rms_norm_eps: model_config.as_ref().and_then(|c| c.rms_norm_eps),
        ..Default::default()
    };

    eprintln!(
        "[realizar#136] Streaming import: {} shards, {} tensors → {}",
        index.shard_count(),
        index.tensor_count(),
        output_path.display(),
    );

    let mut writer =
        AprV2StreamingWriter::new(metadata).map_err(|e| AprenderError::FormatError {
            message: format!("Failed to create streaming writer: {e}"),
        })?;

    // GH-478: Only passthrough F16/BF16 when no quantization (or Fp16) is requested.
    // When user requests Int4/Int8/Q4K, dequantize to F32 and dispatch quantization.
    // The request is the same for every tensor, so it is evaluated once here.
    let half_passthrough = matches!(options.quantize, None | Some(QuantizationType::Fp16));

    // Process each shard: mmap → extract tensors → write to streaming writer → drop mmap
    let mut total_tensors = 0usize;
    let mut f16_passthrough = 0usize;

    // GH-478: Track weight tying — if no lm_head.weight found, duplicate embedding
    let mut has_lm_head = false;
    let mut embed_info: Option<(String, String)> = None; // (shard_file, original_name)

    for shard_file in index.shard_files() {
        let shard_path = base_dir.join(shard_file);
        if !shard_path.exists() {
            return Err(AprenderError::FormatError {
                message: format!(
                    "Shard file {} not found at {}",
                    shard_file,
                    shard_path.display()
                ),
            });
        }

        let mapped =
            MappedSafeTensors::open(&shard_path).map_err(|e| AprenderError::FormatError {
                message: format!("Failed to mmap shard {shard_file}: {e}"),
            })?;

        let names: Vec<String> = mapped
            .tensor_names()
            .iter()
            .map(|&s| (*s).to_string())
            .collect();

        // Count only tensors actually written, so skipped `__`-prefixed entries
        // do not inflate the per-shard report and the shard figures add up to
        // the totals printed at the end.
        let mut shard_tensors = 0usize;
        let mut shard_f16 = 0usize;

        for name in &names {
            if name.starts_with("__") {
                continue;
            }

            let meta = mapped
                .get_metadata(name)
                .ok_or_else(|| AprenderError::FormatError {
                    message: format!("Tensor metadata not found for '{name}'"),
                })?;

            // Map tensor name to canonical APR name
            let mapped_name = metadata_arch.map_name(name);

            // GH-478: Track lm_head and embedding for weight tying
            if mapped_name == "lm_head.weight" || mapped_name == "output.weight" {
                has_lm_head = true;
            }
            if mapped_name.contains("embed_tokens.weight")
                || mapped_name == "token_embd.weight"
                || mapped_name == "wte.weight"
            {
                embed_info = Some((shard_file.clone(), name.clone()));
            }

            let is_bf16 = meta.dtype == "BF16";
            let is_f16 = meta.dtype == "F16" || is_bf16;

            if is_f16 && half_passthrough {
                // Falls through to the F32 path when the raw bytes are unavailable.
                if let Some(raw_bytes) = mapped.get_tensor_bytes(name) {
                    writer
                        .add_raw_f16_tensor(&mapped_name, meta.shape.clone(), raw_bytes, is_bf16)
                        .map_err(|e| AprenderError::FormatError {
                            message: format!("Failed to write tensor '{mapped_name}': {e}"),
                        })?;
                    shard_f16 += 1;
                    shard_tensors += 1;
                    continue;
                }
            }

            // Dequantize to F32 (handles F16/BF16→F32 and native F32)
            let data = mapped
                .get_tensor(name)
                .map_err(|e| AprenderError::FormatError {
                    message: format!("Failed to extract tensor '{name}': {e}"),
                })?;

            // GH-478: Dispatch quantization for streaming imports
            streaming_dispatch_quantize(
                &mut writer,
                &mapped_name,
                &data,
                meta.shape.clone(),
                options.quantize,
            )
            .map_err(|e| AprenderError::FormatError {
                message: format!("Failed to write tensor '{mapped_name}': {e}"),
            })?;
            shard_tensors += 1;
        }

        total_tensors += shard_tensors;
        f16_passthrough += shard_f16;
        let shard_quantized = shard_tensors - shard_f16;
        eprintln!(
            "[realizar#136] Shard {shard_file}: {} tensors ({shard_f16} F16 passthrough, {shard_quantized} quantized)",
            shard_tensors,
        );

        // mapped (mmap) dropped here — OS reclaims virtual address space
    }

    // GH-478: Weight tying — if no lm_head.weight, duplicate embedding as lm_head
    if !has_lm_head {
        if let Some((embed_shard, embed_name)) = &embed_info {
            let shard_path = base_dir.join(embed_shard);
            let mapped =
                MappedSafeTensors::open(&shard_path).map_err(|e| AprenderError::FormatError {
                    message: format!("Failed to re-mmap shard for weight tying: {e}"),
                })?;
            let meta =
                mapped
                    .get_metadata(embed_name)
                    .ok_or_else(|| AprenderError::FormatError {
                        message: "Embedding tensor metadata not found for weight tying".to_string(),
                    })?;

            // Apply the same passthrough rule as the main loop, so the tied copy
            // is stored as raw F16/BF16 whenever the main loop would have stored
            // it that way, instead of being widened to F32.
            let is_bf16 = meta.dtype == "BF16";
            let is_f16 = meta.dtype == "F16" || is_bf16;
            let raw_half = if is_f16 && half_passthrough {
                mapped.get_tensor_bytes(embed_name)
            } else {
                None
            };

            if let Some(raw_bytes) = raw_half {
                writer
                    .add_raw_f16_tensor("lm_head.weight", meta.shape.clone(), raw_bytes, is_bf16)
                    .map_err(|e| AprenderError::FormatError {
                        message: format!("Failed to write tied lm_head.weight: {e}"),
                    })?;
                f16_passthrough += 1;
            } else {
                let data = mapped
                    .get_tensor(embed_name)
                    .map_err(|e| AprenderError::FormatError {
                        message: format!("Failed to extract embedding for weight tying: {e}"),
                    })?;
                streaming_dispatch_quantize(
                    &mut writer,
                    "lm_head.weight",
                    &data,
                    meta.shape.clone(),
                    options.quantize,
                )
                .map_err(|e| AprenderError::FormatError {
                    message: format!("Failed to write tied lm_head.weight: {e}"),
                })?;
            }
            total_tensors += 1;
            eprintln!("[realizar#136] Weight tying: duplicated {embed_name} → lm_head.weight");
        }
    }

    let total_quantized = total_tensors - f16_passthrough;
    eprintln!(
        "[realizar#136] Streaming write complete: {} tensors ({} F16 passthrough, {} quantized), {:.1} GB data",
        total_tensors,
        f16_passthrough,
        total_quantized,
        writer.data_bytes_written() as f64 / 1_073_741_824.0,
    );

    // Finalize: write header + metadata + index + data from temp file
    writer
        .finalize(output_path)
        .map_err(|e| AprenderError::FormatError {
            message: format!("Failed to finalize APR file: {e}"),
        })?;

    // Size is informational only; report 0 if the metadata cannot be read.
    let file_size = fs::metadata(output_path).map(|m| m.len()).unwrap_or(0);
    eprintln!(
        "[realizar#136] Written {} ({:.1} GB)",
        output_path.display(),
        file_size as f64 / 1_073_741_824.0,
    );

    // Return a basic validation report
    Ok(ValidationReport::new())
}

include!("import_include_01.rs");