1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/// Fold a single tensor's name and shape into the running inferred config.
///
/// Tensors are recognised by substring match on `name` (layer counting uses a
/// prefix match instead) together with the rank of `shape`:
///
/// - `input_layernorm.weight`, rank 1: its length becomes `hidden_size`.
/// - `embed_tokens.weight`, rank 2 (`[vocab_size, hidden_dim]`): the row count
///   becomes `vocab_size`; the column count fills `hidden_size` only when no
///   value has been recorded yet.
/// - names starting with `model.layers.<n>.`: `num_layers` is raised to at
///   least `n + 1`, so the final value is the highest layer index plus one.
/// - `k_proj.weight`, rank 2 (`[kv_dim, hidden_dim]`): `kv_dim / 64` becomes
///   `num_kv_heads`, but only once `hidden_size` is already known.
/// - `q_proj.weight`, rank 2 (`[q_dim, hidden_dim]`): `q_dim / 64` becomes
///   `num_heads`.
/// - `gate_proj.weight`, rank 2 (`[intermediate, hidden]`): the row count
///   becomes `intermediate_size`.
///
/// Both head counts assume a fixed head dimension of 64 (for Qwen2-0.5B:
/// kv_dim = 128 gives 2 KV heads). Tensors that match none of the rules leave
/// `cfg` untouched.
fn infer_q4k_single_tensor(cfg: &mut InferredQ4kConfig, name: &str, shape: &[usize]) {
    // Head dimension assumed when turning projection widths into head counts.
    const ASSUMED_HEAD_DIM: usize = 64;

    // Norm weights are 1-D vectors whose length is the hidden dimension.
    if let &[width] = shape {
        if name.contains("input_layernorm.weight") {
            cfg.hidden_size = Some(width);
        }
    }

    // The embedding table gives the vocabulary size directly; its width is
    // only a fallback for the hidden size, never an override.
    if let &[vocab, width] = shape {
        if name.contains("embed_tokens.weight") {
            cfg.vocab_size = Some(vocab);
            cfg.hidden_size = cfg.hidden_size.or(Some(width));
        }
    }

    // Track the highest layer index seen so far (stored as a count).
    let layer_index = name
        .strip_prefix("model.layers.")
        .and_then(|rest| rest.split('.').next())
        .and_then(|digits| digits.parse::<usize>().ok());
    if let Some(index) = layer_index {
        let layers_seen = index + 1;
        cfg.num_layers = Some(match cfg.num_layers {
            Some(previous) => previous.max(layers_seen),
            None => layers_seen,
        });
    }

    // The remaining rules all read the leading dimension of a 2-D weight.
    if let &[rows, _] = shape {
        // KV heads are only recorded after the hidden size has been seen.
        if name.contains("k_proj.weight") && cfg.hidden_size.is_some() {
            cfg.num_kv_heads = Some(rows / ASSUMED_HEAD_DIM);
        }
        if name.contains("q_proj.weight") {
            cfg.num_heads = Some(rows / ASSUMED_HEAD_DIM);
        }
        if name.contains("gate_proj.weight") {
            cfg.intermediate_size = Some(rows);
        }
    }
}
/// Assemble the APR v2 metadata record for a Q4K-quantized model.
///
/// The architecture dimensions (`hidden_size`, `num_layers`, `num_heads`,
/// `num_kv_heads`, `vocab_size`, `intermediate_size`) are copied straight from
/// `cfg`, and `param_count` is stored as given. Everything else is fixed:
/// model type and architecture are both `"qwen2"`, the quantization descriptor
/// is `q4_k` (4 bits, block size 256, not symmetric), and the RoPE, norm
/// epsilon, and position-embedding fields carry Qwen2 defaults. Optional
/// fields with no known value are `None`, `total_size` is 0, and `custom`
/// starts out empty.
fn build_q4k_metadata(cfg: &InferredQ4kConfig, param_count: u64) -> AprV2Metadata {
    // One tag serves as both the model type and the architecture name.
    let family = String::from("qwen2");

    // Descriptor for the Q4_K weight encoding.
    let quantization = QuantizationMetadata {
        quant_type: String::from("q4_k"),
        bits: 4,
        block_size: Some(256),
        symmetric: false,
    };

    AprV2Metadata {
        // Identity and provenance.
        model_type: family.clone(),
        architecture: Some(family),
        name: Some(String::from("Quantized Model")),
        description: Some(String::from("Q4K quantized from SafeTensors")),
        version: Some(String::from("1.0.0")),
        original_format: Some(String::from("safetensors")),
        author: None,
        license: None,
        data_source: None,
        data_license: None,
        source: None,
        created_at: None,

        // Size and encoding.
        total_size: 0,
        param_count,
        quantization: Some(quantization),
        sharding: None,

        // Chat and tokenizer hints are not known at this point.
        chat_template: None,
        chat_format: None,
        special_tokens: None,

        // Dimensions inferred from the tensor set.
        hidden_size: cfg.hidden_size,
        num_layers: cfg.num_layers,
        num_heads: cfg.num_heads,
        num_kv_heads: cfg.num_kv_heads,
        vocab_size: cfg.vocab_size,
        intermediate_size: cfg.intermediate_size,
        head_dim: None,

        // Qwen2 defaults.
        max_position_embeddings: Some(32768),
        rope_theta: Some(1_000_000.0),
        rope_type: Some(2), // NEOX style for Qwen2 (PMAT-114)
        rms_norm_eps: Some(1e-6),

        // No mixture-of-experts settings.
        num_experts: None,
        num_experts_per_tok: None,
        moe_intermediate_size: None,

        custom: std::collections::HashMap::new(),
    }
}
/// Decide whether a tensor gets Q4K-quantized or stays F32.
///
/// A tensor qualifies when it has at least two dimensions, holds at least 256
/// elements (`data_len`, the minimum for one Q4K super-block), and its name
/// contains none of `bias`, `norm`, `scale`, or `embed`. Embeddings are kept
/// as F32 for now.
fn should_quantize_tensor(name: &str, shape: &[usize], data_len: usize) -> bool {
    // Minimum element count for Q4K (one super-block).
    const MIN_Q4K_ELEMENTS: usize = 256;
    // Name fragments that mark a tensor to be left unquantized.
    const KEEP_F32_MARKERS: [&str; 4] = ["bias", "norm", "scale", "embed"];

    if shape.len() < 2 || data_len < MIN_Q4K_ELEMENTS {
        return false;
    }

    !KEEP_F32_MARKERS
        .iter()
        .any(|&marker| name.contains(marker))
}
/// Serialize the writer's contents and store the resulting bytes at `output`.
///
/// # Errors
///
/// Returns `AprenderError::FormatError` when serialization fails, when the
/// output file cannot be created, or when writing the bytes fails. The message
/// names the failing step and includes the underlying error.
fn write_q4k_apr_file(mut writer: AprV2Writer, output: &Path) -> Result<()> {
    use std::io::Write as _;

    // Every failure in this function is reported as a format error.
    let format_error = |message: String| AprenderError::FormatError { message };

    let serialized = writer
        .write()
        .map_err(|err| format_error(format!("Failed to serialize APR format: {err}")))?;

    let mut destination = fs::File::create(output)
        .map_err(|err| format_error(format!("Failed to create output file: {err}")))?;

    destination
        .write_all(&serialized)
        .map_err(|err| format_error(format!("Failed to write APR file: {err}")))?;

    Ok(())
}
/// Write a tensor map to `output` in APR format, quantizing eligible weights to Q4K.
///
/// The model configuration is inferred from the tensor names and shapes, and
/// the parameter count is the total number of elements across all tensors.
/// Tensors accepted by `should_quantize_tensor` are stored as Q4K; all others
/// (biases, norms, and the like) are stored as F32.
///
/// **PMAT-154 (P0 fix):** when `gguf_tokenizer` is provided it is embedded
/// into the APR metadata via `insert_f32_tokenizer_metadata()`. It was
/// previously omitted, producing APR files that failed inference with
/// "Tokenizer encode failed".
///
/// # Errors
///
/// Propagates any error from serializing or writing the APR file.
fn save_model_tensors_q4k(
    tensors: &BTreeMap<String, (Vec<f32>, Vec<usize>)>,
    output: &Path,
    gguf_tokenizer: Option<&GgufTokenizer>,
) -> Result<()> {
    let inferred = infer_q4k_config(tensors);
    let total_params = tensors
        .values()
        .map(|(values, _)| values.len() as u64)
        .sum::<u64>();
    let mut metadata = build_q4k_metadata(&inferred, total_params);

    // PMAT-154: APR files must be self-contained, so the tokenizer travels
    // inside the metadata whenever one is available.
    if let Some(tokenizer) = gguf_tokenizer {
        write::insert_f32_tokenizer_metadata(tokenizer, &mut metadata.custom);
        eprintln!(
            "[PMAT-154] Embedded tokenizer ({} vocab, {} merges) into Q4K APR",
            tokenizer.vocabulary.len(),
            tokenizer.merges.len()
        );
    }

    let mut writer = AprV2Writer::new(metadata);

    for (tensor_name, (values, dims)) in tensors {
        if !should_quantize_tensor(tensor_name, dims, values.len()) {
            // Not eligible for Q4K: keep as F32.
            writer.add_f32_tensor(tensor_name, dims.clone(), values);
            continue;
        }

        // GH-202: quantize with the matrix-aware routine so blocks follow row
        // boundaries. The flat `quantize_q4_k` produces wrong block boundaries
        // when the row width is not a multiple of 256.
        let packed = quantize_q4_k_matrix(values, dims);
        writer.add_q4k_raw_tensor(tensor_name, dims.clone(), packed);
    }

    write_q4k_apr_file(writer, output)
}
// ============================================================================
// Write functions extracted to write.rs (PMAT-197)
mod write;
pub(crate) use write::{write_apr_file, write_apr_file_raw};
// Import pipeline extracted to import.rs (PMAT-197)
mod import;
pub use import::apr_import;
// Export functionality extracted to export.rs (PMAT-197)
mod export;
pub use export::{apr_export, ExportFormat, ExportOptions, ExportReport};
// Merge functionality extracted to merge.rs (PMAT-197)
mod merge;
pub use merge::{apr_merge, MergeOptions, MergeReport, MergeStrategy};
// Evolutionary merge optimization (GH-444)
pub mod evolutionary_merge;
pub use evolutionary_merge::{
evolutionary_merge as evolutionary_merge_fn, EvolutionaryMergeConfig,
EvolutionaryMergeResult,
};
// For tests
// Tests extracted to tests.rs (PMAT-197)
#[cfg(test)]
mod tests;