use native_neural_network::modules::{
activations::ActivationKind,
benchmark::{
decode_benchmark_blob, encode_benchmark_blob, encoded_size_benchmark_blob, BenchmarkMetrics,
},
layers::{LayerDesc, LayerSpec},
trainer::{sgd_step, SgdConfig, SgdScratch, TrainError},
};
use native_neural_network::rnn_api::{
build_f32_into, build_f64_into, read_rnn, ActivationConfig, PackBuffers, RnnApiError,
};
use std::fs;
use std::fs::OpenOptions;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::thread::sleep;
use std::time::{Duration, Instant};
/// Shape applied to one synthetic input sample; chosen per step by
/// `TrainingOrder::sample_variant` and consumed by `synthetic_sample_in_place`.
#[derive(Clone, Copy)]
pub enum SampleVariant {
    /// Raw phase signal, unmodified.
    Baseline,
    /// Phase blended with a second, slower phase and squashed via tanh.
    Harmonic,
    /// Periodic positive offset applied to a subset of elements.
    Spike,
    /// Sign flip on odd-indexed elements.
    Mirror,
}
/// Selects one of four fixed variant rotation schedules (0..=3; values above 3
/// share the last schedule — see `sample_variant`).
#[derive(Clone, Copy)]
pub struct TrainingOrder(pub u8);
impl TrainingOrder {
pub fn sample_variant(self, seed: usize) -> SampleVariant {
match self.0 {
0 => match seed % 4 {
0 => SampleVariant::Baseline,
1 => SampleVariant::Harmonic,
2 => SampleVariant::Spike,
_ => SampleVariant::Mirror,
},
1 => match seed % 4 {
0 => SampleVariant::Mirror,
1 => SampleVariant::Spike,
2 => SampleVariant::Harmonic,
_ => SampleVariant::Baseline,
},
2 => match seed % 4 {
0 => SampleVariant::Spike,
1 => SampleVariant::Baseline,
2 => SampleVariant::Harmonic,
_ => SampleVariant::Mirror,
},
_ => match seed % 4 {
0 => SampleVariant::Harmonic,
1 => SampleVariant::Baseline,
2 => SampleVariant::Mirror,
_ => SampleVariant::Spike,
},
}
}
}
/// Fills `input` and `target` in place with a deterministic synthetic sample.
///
/// Each input element starts from a fixed phase pattern derived from `step`
/// and the element index, is reshaped per `variant`, perturbed by
/// `centered_noise` scaled with `input_noise_amplitude`, and clamped to
/// [-1.0, 1.0]. Each target element mixes the finished input's mean, a
/// position-weighted sum, and its RMS with a per-head phase, a per-variant
/// bias, and a small noise term, then squashes through tanh.
///
/// NOTE(review): `centered_noise` is defined elsewhere in this file; comments
/// here assume it returns a deterministic, zero-centered value — confirm.
pub fn synthetic_sample_in_place(
    step: usize,
    variant: SampleVariant,
    input_noise_amplitude: f32,
    input: &mut [f32],
    target: &mut [f32],
) {
    for (i, slot) in input.iter_mut().enumerate() {
        // Base phase in [0, 1), period 257 over the step/index mix; rescaled
        // to roughly [-1, 1] on the next line.
        let phase = ((step.wrapping_mul(17).wrapping_add(i.wrapping_mul(31))) % 257) as f32 / 256.0;
        let mut value = phase * 2.0 - 1.0;
        value = match variant {
            SampleVariant::Baseline => value,
            SampleVariant::Harmonic => {
                // Blend in a slower, 41-period phase and squash to stay bounded.
                let h = (((step + i * 3) % 41) as f32 / 40.0) * 2.0 - 1.0;
                (0.65 * value + 0.35 * h).tanh()
            }
            SampleVariant::Spike => {
                // Every 11th (step + index) gets a positive spike, clamped.
                if (step + i).is_multiple_of(11) {
                    (value + 0.85).clamp(-1.0, 1.0)
                } else {
                    value
                }
            }
            SampleVariant::Mirror => {
                // Flip the sign of odd-indexed elements.
                if i % 2 == 0 {
                    value
                } else {
                    -value
                }
            }
        };
        let noise = centered_noise(step, i, 97) * input_noise_amplitude;
        *slot = (value + noise).clamp(-1.0, 1.0);
    }
    // Summary statistics of the finished input drive the target heads below.
    let mean = if input.is_empty() {
        0.0
    } else {
        input.iter().copied().sum::<f32>() / input.len() as f32
    };
    // Root-mean-square of the input (named l2 here).
    let l2 = if input.is_empty() {
        0.0
    } else {
        let s = input.iter().map(|v| v * v).sum::<f32>() / input.len() as f32;
        s.sqrt()
    };
    // Position-weighted average; weights cycle through {-0.3, -0.15, 0, 0.15, 0.3}.
    let weighted = if input.is_empty() {
        0.0
    } else {
        let mut acc = 0.0f32;
        for (i, value) in input.iter().enumerate() {
            let w = ((i % 5) as f32 - 2.0) * 0.15;
            acc += *value * w;
        }
        acc / input.len() as f32
    };
    let variant_bias = match variant {
        SampleVariant::Baseline => 0.0,
        SampleVariant::Harmonic => 0.08,
        SampleVariant::Spike => 0.12,
        SampleVariant::Mirror => -0.05,
    };
    for (o, slot) in target.iter_mut().enumerate() {
        // Per-head phase in [0, 1) with period 19; recentered by the -0.5 below.
        let phase = ((step + o * 7) % 19) as f32 / 19.0;
        let head_noise = centered_noise(step, o, 313) * 0.03;
        let mixed =
            0.45 * mean + 0.25 * weighted + 0.20 * l2 + (phase - 0.5) + variant_bias + head_noise;
        *slot = mixed.tanh();
    }
}
/// Accumulates `cycle_compute_elapsed` into `compute_elapsed`, then sleeps
/// just long enough that compute time divided by wall time (since `start`)
/// approaches `cpu_target_utilization`.
///
/// Values of `cpu_target_utilization` outside the open interval (0.0, 1.0)
/// disable pacing: the time is still accumulated but no sleep occurs.
pub fn pace_cpu_target_utilization(
    start: Instant,
    compute_elapsed: &mut Duration,
    cycle_compute_elapsed: Duration,
    cpu_target_utilization: f64,
) {
    *compute_elapsed = compute_elapsed.saturating_add(cycle_compute_elapsed);
    if !(cpu_target_utilization > 0.0 && cpu_target_utilization < 1.0) {
        return;
    }
    // Wall-clock duration that would land exactly on the target utilization.
    let desired_wall =
        Duration::from_secs_f64(compute_elapsed.as_secs_f64() / cpu_target_utilization);
    // Sleep only for the positive remainder; `checked_sub` is None when the
    // wall clock is already past the desired point.
    if let Some(remaining) = desired_wall.checked_sub(start.elapsed()) {
        if !remaining.is_zero() {
            sleep(remaining);
        }
    }
}
/// One input/target pair produced by the factored sample pipeline
/// (`build_parallel_factored_batch`).
pub struct FactoredSample {
    pub input: Vec<f32>,
    pub target: Vec<f32>,
}
/// Per-model tuning knobs for the factorized sample kernel; produced by
/// `kernel_config_for_model` and consumed by `apply_factorized_kernel`,
/// `attempt_training_corruption`, and `synthetic_sample_in_place`.
#[derive(Clone, Copy)]
pub struct FactorizedKernelConfig {
    // Scale applied to the per-element input noise.
    pub input_noise_amplitude: f32,
    // Number of factor buckets (capped by the input length at the use site).
    pub factor_rank: usize,
    // Stride used when picking each element's secondary factor.
    pub vector_stride: usize,
    // Blend weight pulling inputs toward their factors.
    pub kernel_gain: f32,
    // Blend weight pulling targets toward their factors.
    pub target_feedback: f32,
    // Distinguishes models so their factor assignments differ.
    pub model_kernel_id: usize,
    // Corruption fires when the trigger seed is a multiple of this; 0 disables.
    pub corruption_every_n: usize,
    // Magnitude of each corruption perturbation.
    pub corruption_strength: f32,
}
/// Returns the factorized-kernel tuning profile for a model file name.
///
/// Unknown names fall back to the small profile with `model_kernel_id` 0.
/// Every profile shares the same input noise amplitude (0.06).
pub fn kernel_config_for_model(model_file: &str) -> FactorizedKernelConfig {
    // (rank, stride, gain, feedback, kernel id, corruption period, corruption strength)
    let (
        factor_rank,
        vector_stride,
        kernel_gain,
        target_feedback,
        model_kernel_id,
        corruption_every_n,
        corruption_strength,
    ) = match model_file {
        "small.rnn" => (8, 3, 0.18, 0.12, 1, 9, 0.08),
        "medium.rnn" => (16, 5, 0.24, 0.16, 2, 11, 0.10),
        "large.rnn" => (32, 7, 0.30, 0.20, 3, 13, 0.12),
        "enormous.rnn" => (64, 11, 0.36, 0.24, 4, 15, 0.14),
        _ => (8, 3, 0.18, 0.12, 0, 9, 0.08),
    };
    FactorizedKernelConfig {
        input_noise_amplitude: 0.06,
        factor_rank,
        vector_stride,
        kernel_gain,
        target_feedback,
        model_kernel_id,
        corruption_every_n,
        corruption_strength,
    }
}
/// Everything needed to describe one finished benchmark run; consumed by
/// `build_benchmark_blob` and the binary/JSON/YAML writers.
pub struct BenchmarkRecord<'a> {
    // Destination file for the writers; ignored by `build_benchmark_blob`.
    pub path: PathBuf,
    pub model_file: &'a str,
    // Layer widths, input first; parameter counts are derived from this.
    pub topology: &'a [usize],
    pub precision_label: &'a str,
    // Bytes per weight/bias element (4 for f32, 8 for f64 packing).
    pub element_size_bytes: u64,
    pub iterations: usize,
    pub elapsed: Duration,
    pub avg_loss: f32,
    pub last_loss: f32,
    // Size in bytes of the packed model output.
    pub output_bytes: usize,
    pub train_samples_per_cycle: usize,
}
/// Appends periodic progress rows to a CSV file during training; construct
/// via `LiveBenchmarkSnapshot::new` or `create_live_snapshot`.
pub struct LiveBenchmarkSnapshot {
    // CSV file the rows are appended to.
    path: PathBuf,
    // Minimum gap between emitted rows (clamped to >= 1 ms in `new`).
    interval_ms: u64,
    // Elapsed-time threshold (ms) at which the next row becomes due.
    next_emit_ms: u64,
}
/// One progress observation passed to `LiveBenchmarkSnapshot::maybe_write`.
pub struct LiveSnapshotPoint<'a> {
    pub model_file: &'a str,
    pub precision_label: &'a str,
    // Wall-clock time since the run started.
    pub elapsed: Duration,
    pub iterations: usize,
    pub avg_loss: f32,
    pub last_loss: f32,
    // Multiplied by `iterations` to report total training samples.
    pub train_samples_per_cycle: usize,
}
/// Progress recovered from a previous run's live CSV; `load_resume_cursor`
/// returns a zeroed cursor when no usable row exists.
#[derive(Clone, Copy, Debug)]
pub struct ResumeCursor {
    pub iterations: usize,
    pub elapsed: Duration,
}
/// Inputs to `finalize_benchmark_outputs`: run statistics plus the paths the
/// trained model and benchmark exports are written to.
pub struct BenchmarkFinalizeRequest<'a> {
    pub model_file: &'a str,
    // File stem used for the `.bmk`, `.json`, and `.yaml` export names.
    pub stem: &'a str,
    pub topology: &'a [usize],
    pub precision_label: &'a str,
    pub element_size_bytes: u64,
    pub iterations: usize,
    pub elapsed: Duration,
    pub avg_loss: f32,
    pub last_loss: f32,
    pub train_samples_per_cycle: usize,
    // Output size fed into the first packing pass; the second pass replaces
    // it with the measured packed size.
    pub baseline_output_bytes: usize,
    // The packed model bytes are appended to this file.
    pub trained_path: PathBuf,
    pub benchmark_dir: PathBuf,
}
impl LiveBenchmarkSnapshot {
    /// Prepares the live snapshot file at `path` (writing the CSV header if
    /// the file does not exist yet) and schedules the first emission one
    /// interval from time zero. `interval_ms` is clamped to at least 1 ms.
    pub fn new(path: PathBuf, interval_ms: u64) -> Result<Self, String> {
        let interval_ms = interval_ms.max(1);
        if !path.exists() {
            fs::write(
                &path,
                "model,precision,elapsed_ms,iterations,train_samples,avg_loss,last_loss,iterations_per_sec,samples_per_sec\n",
            )
            .map_err(|e| format!("failed to initialize {}: {e}", path.display()))?;
        }
        Ok(Self {
            path,
            interval_ms,
            next_emit_ms: interval_ms,
        })
    }
    /// Appends one CSV row when the snapshot is due (or `force` is set), then
    /// advances the schedule past the current elapsed time.
    pub fn maybe_write(&mut self, point: LiveSnapshotPoint<'_>, force: bool) -> Result<(), String> {
        let elapsed_ms = point.elapsed.as_millis() as u64;
        let due = force || elapsed_ms >= self.next_emit_ms;
        if !due {
            return Ok(());
        }
        let train_samples = point
            .iterations
            .saturating_mul(point.train_samples_per_cycle) as u64;
        let elapsed_secs = point.elapsed.as_secs_f32();
        // Both throughput columns report 0 until some wall time has accrued.
        let (iterations_per_sec, samples_per_sec) = if elapsed_secs > 0.0 {
            (
                point.iterations as f32 / elapsed_secs,
                train_samples as f32 / elapsed_secs,
            )
        } else {
            (0.0, 0.0)
        };
        let row = format!(
            "\"{}\",\"{}\",{},{},{},{},{},{},{}\n",
            point.model_file,
            point.precision_label,
            elapsed_ms,
            point.iterations,
            train_samples,
            point.avg_loss,
            point.last_loss,
            iterations_per_sec,
            samples_per_sec,
        );
        let mut file = OpenOptions::new()
            .append(true)
            .open(&self.path)
            .map_err(|e| format!("failed to open {}: {e}", self.path.display()))?;
        file.write_all(row.as_bytes())
            .map_err(|e| format!("failed to append {}: {e}", self.path.display()))?;
        if force {
            // Forced writes restart the cadence relative to "now".
            self.next_emit_ms = elapsed_ms.saturating_add(self.interval_ms);
        } else {
            // Skip any intervals that fully elapsed while we were computing.
            while self.next_emit_ms <= elapsed_ms {
                self.next_emit_ms = self.next_emit_ms.saturating_add(self.interval_ms);
            }
        }
        Ok(())
    }
}
/// Recovers resume progress from `<benchmark_dir>/<stem>.csv`.
///
/// Scans the live CSV from its newest line backwards and returns the first
/// row whose elapsed/iteration columns parse; header lines, blanks, and
/// malformed rows are skipped. A missing file (or no usable row) yields a
/// zeroed cursor.
///
/// # Errors
/// Returns an error only when the file exists but cannot be read.
pub fn load_resume_cursor(benchmark_dir: &Path, stem: &str) -> Result<ResumeCursor, String> {
    let path = benchmark_dir.join(format!("{}.csv", stem));
    let fresh = ResumeCursor {
        iterations: 0,
        elapsed: Duration::ZERO,
    };
    if !path.exists() {
        return Ok(fresh);
    }
    let text =
        fs::read_to_string(&path).map_err(|e| format!("failed to read {}: {e}", path.display()))?;
    let cursor = text.lines().rev().find_map(|line| {
        let trimmed = line.trim();
        if trimmed.is_empty() || trimmed.starts_with("model,precision,") {
            return None;
        }
        // Columns: model, precision, elapsed_ms, iterations, ...
        let mut cols = trimmed.split(',');
        let elapsed_col = cols.nth(2)?;
        let iterations_col = cols.next()?;
        let elapsed_ms = elapsed_col.trim().parse::<u64>().ok()?;
        let iterations = iterations_col.trim().parse::<usize>().ok()?;
        Some(ResumeCursor {
            iterations,
            elapsed: Duration::from_millis(elapsed_ms),
        })
    });
    Ok(cursor.unwrap_or(fresh))
}
/// Ensures a decodable source model exists at `<trained_dir>/<model_file>`
/// and returns its path.
///
/// Resolution order:
/// 1. an existing file that still contains a decodable RNN snapshot;
/// 2. a copy of `sample_model/<precision_folder>/sample.rnn`, if decodable;
/// 3. a freshly bootstrapped model with a small deterministic weight pattern.
///
/// # Errors
/// Returns a message on any filesystem failure, or when even the freshly
/// bootstrapped model cannot be decoded back.
pub fn ensure_source_model(
    trained_dir: &Path,
    precision_folder: &str,
    model_file: &str,
) -> Result<PathBuf, String> {
    fs::create_dir_all(trained_dir)
        .map_err(|e| format!("failed to create {}: {e}", trained_dir.display()))?;
    let source_path = trained_dir.join(model_file);
    if source_path.exists() {
        let bytes = fs::read(&source_path)
            .map_err(|e| format!("failed to read {}: {e}", source_path.display()))?;
        // Keep the existing file only if it still holds a decodable snapshot.
        if extract_last_rnn_snapshot(&bytes).is_ok() {
            return Ok(source_path);
        }
    }
    let fallback = Path::new("sample_model")
        .join(precision_folder)
        .join("sample.rnn");
    if fallback.exists() {
        fs::copy(&fallback, &source_path).map_err(|e| {
            format!(
                "failed to bootstrap {} from {}: {e}",
                source_path.display(),
                fallback.display()
            )
        })?;
        // Re-read the copy to confirm it decodes; if not, fall through to the
        // synthetic bootstrap below.
        let copied = fs::read(&source_path)
            .map_err(|e| format!("failed to read {}: {e}", source_path.display()))?;
        if extract_last_rnn_snapshot(&copied).is_ok() {
            return Ok(source_path);
        }
    }
    // Last resort: synthesize a model with deterministic, small-magnitude weights.
    let topology = bootstrap_topology_for_model(model_file);
    let hidden_activation = ActivationKind::Relu;
    let output_activation = ActivationKind::Identity;
    let mut weights_len = 0usize;
    let mut biases_len = 0usize;
    for pair in topology.windows(2) {
        weights_len = weights_len.saturating_add(pair[0].saturating_mul(pair[1]));
        biases_len = biases_len.saturating_add(pair[1]);
    }
    let out_bytes = if precision_folder == "f64" {
        // Weight pattern: (31*i + 7) mod 200 mapped into [-1, 1), scaled to
        // [-0.05, 0.05); biases start at zero.
        let weights: Vec<f64> = (0..weights_len)
            .map(|i| {
                let base = ((i.wrapping_mul(31).wrapping_add(7)) % 200) as f64 / 100.0 - 1.0;
                base * 0.05
            })
            .collect();
        let biases = vec![0.0f64; biases_len];
        pack_with_benchmark_f64(
            topology,
            hidden_activation,
            output_activation,
            &weights,
            &biases,
            &[],
        )?
    } else {
        let weights: Vec<f32> = (0..weights_len)
            .map(|i| {
                let base = ((i.wrapping_mul(31).wrapping_add(7)) % 200) as f32 / 100.0 - 1.0;
                base * 0.05
            })
            .collect();
        let biases = vec![0.0f32; biases_len];
        pack_with_benchmark_f32(
            topology,
            hidden_activation,
            output_activation,
            &weights,
            &biases,
            &[],
        )?
    };
    fs::write(&source_path, &out_bytes)
        .map_err(|e| format!("failed to write {}: {e}", source_path.display()))?;
    // Sanity check: the freshly written bytes must round-trip through the reader.
    if extract_last_rnn_snapshot(&out_bytes).is_err() {
        return Err(format!(
            "missing source model for in-place training: {} (unable to build bootstrap model)",
            source_path.display()
        ));
    }
    Ok(source_path)
}
/// Returns the fixed 4-layer dense topology used when bootstrapping a model
/// file from scratch; unknown names get a small default topology.
fn bootstrap_topology_for_model(model_file: &str) -> &'static [usize] {
    const KNOWN: [(&str, &[usize]); 4] = [
        ("small.rnn", &[128, 256, 128, 64]),
        ("medium.rnn", &[256, 384, 256, 128]),
        ("large.rnn", &[384, 512, 384, 192]),
        ("enormous.rnn", &[512, 768, 512, 256]),
    ];
    const DEFAULT: &[usize] = &[64, 128, 64, 32];
    KNOWN
        .iter()
        .find(|(name, _)| *name == model_file)
        .map(|(_, topology)| *topology)
        .unwrap_or(DEFAULT)
}
/// Confirms that `bytes` decode as an RNN image whose scan reports an
/// embedded benchmark blob.
///
/// # Errors
/// Returns a message when the bytes fail to parse or when the scan does not
/// positively report a benchmark blob.
pub fn validate_rnn_has_benchmark(bytes: &[u8]) -> Result<(), String> {
    let readable = read_rnn(bytes, None)
        .map_err(|e| format!("invalid rnn bytes: benchmark parse error: {e:?}"))?;
    match readable.scan.has_benchmark {
        Some(true) => Ok(()),
        // None and Some(false) are both treated as "missing".
        _ => Err("invalid rnn bytes: missing benchmark blob".to_string()),
    }
}
/// Borrowed (layer_meta, weights, biases) blobs from a decoded RNN image.
type DenseCoreBlobs<'a> = (&'a [u8], &'a [u8], &'a [u8]);
/// Decodes `bytes` and returns the three core dense blobs, failing on the
/// first missing section (layer_meta, then weights, then biases).
pub fn extract_core_dense_blobs(bytes: &[u8]) -> Result<DenseCoreBlobs<'_>, String> {
    let readable =
        read_rnn(bytes, None).map_err(|e| format!("invalid rnn bytes: parse error: {e:?}"))?;
    // Arm order preserves the original error priority: layer_meta is reported
    // missing before weights, weights before biases.
    match (readable.layer_meta, readable.weights, readable.biases) {
        (Some(layer_meta), Some(weights), Some(biases)) => Ok((layer_meta, weights, biases)),
        (None, _, _) => Err("missing layer_meta blob".to_string()),
        (_, None, _) => Err("missing weights blob".to_string()),
        (_, _, None) => Err("missing biases blob".to_string()),
    }
}
/// Packs the final model bytes with an embedded benchmark blob, appends them
/// to `request.trained_path`, and writes the `.bmk`/`.json`/`.yaml` exports.
///
/// Packing runs twice: the first pass uses `baseline_output_bytes` just to
/// learn the true packed size; the second pass re-encodes the blob with that
/// measured size so the embedded `output_bytes` metric is self-consistent.
/// On success the three exports are cross-validated against each other.
///
/// # Errors
/// Propagates any packing, encoding, filesystem, or validation failure as a
/// descriptive message.
pub fn finalize_benchmark_outputs<F>(
    request: BenchmarkFinalizeRequest<'_>,
    mut pack_with_benchmark: F,
) -> Result<Vec<u8>, String>
where
    F: FnMut(&[u8]) -> Result<Vec<u8>, String>,
{
    // All four records below are identical except for the destination path
    // and the reported output size; build them from one place.
    let record = |path: PathBuf, output_bytes: usize| BenchmarkRecord {
        path,
        model_file: request.model_file,
        topology: request.topology,
        precision_label: request.precision_label,
        element_size_bytes: request.element_size_bytes,
        iterations: request.iterations,
        elapsed: request.elapsed,
        avg_loss: request.avg_loss,
        last_loss: request.last_loss,
        output_bytes,
        train_samples_per_cycle: request.train_samples_per_cycle,
    };
    // Pass 1: pack with the baseline size to measure the real packed size.
    let provisional_benchmark_blob =
        build_benchmark_blob(&record(PathBuf::new(), request.baseline_output_bytes))?;
    let provisional_out_bytes = pack_with_benchmark(&provisional_benchmark_blob)?;
    // Pass 2: re-encode with the measured size and pack the final bytes.
    let benchmark_blob = build_benchmark_blob(&record(PathBuf::new(), provisional_out_bytes.len()))?;
    let out_bytes = pack_with_benchmark(&benchmark_blob)?;
    validate_rnn_has_benchmark(&out_bytes)?;
    let mut trained_file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&request.trained_path)
        .map_err(|e| format!("failed to open {}: {e}", request.trained_path.display()))?;
    trained_file
        .write_all(&out_bytes)
        .map_err(|e| format!("failed to append {}: {e}", request.trained_path.display()))?;
    let benchmark_path = request.benchmark_dir.join(format!("{}.bmk", request.stem));
    write_benchmark(&record(benchmark_path.clone(), out_bytes.len()))?;
    let json_path = request.benchmark_dir.join(format!("{}.json", request.stem));
    write_benchmark_text_exports(&record(json_path.clone(), out_bytes.len()))?;
    let yaml_path = json_path.with_extension("yaml");
    parse_and_validate_benchmark_exports_paths(&benchmark_path, &json_path, &yaml_path)?;
    Ok(out_bytes)
}
/// Returns the newest decodable RNN snapshot contained in `bytes`.
///
/// Trained files are append-only, so one file may hold several concatenated
/// snapshots; candidates are located by their "RNN\0" magic and tried from
/// the last occurrence backwards. When no magic is present (or no candidate
/// decodes), the whole byte slice is tried as a final fallback.
///
/// # Errors
/// Returns a message when the input is too short, has no magic and does not
/// decode, or contains no decodable snapshot at all.
pub fn extract_last_rnn_snapshot(bytes: &[u8]) -> Result<Vec<u8>, String> {
    const MAGIC: &[u8] = b"RNN\x00";
    if bytes.len() < MAGIC.len() {
        return Err("invalid rnn bytes: too short".to_string());
    }
    let magic_offsets: Vec<usize> = bytes
        .windows(MAGIC.len())
        .enumerate()
        .filter_map(|(offset, window)| (window == MAGIC).then_some(offset))
        .collect();
    if magic_offsets.is_empty() {
        return if read_rnn(bytes, None).is_ok() {
            Ok(bytes.to_vec())
        } else {
            Err("invalid rnn bytes: bad magic".to_string())
        };
    }
    // Newest snapshot wins: try candidates from the last magic backwards.
    for offset in magic_offsets.into_iter().rev() {
        let candidate = &bytes[offset..];
        if read_rnn(candidate, None).is_ok() {
            return Ok(candidate.to_vec());
        }
    }
    if read_rnn(bytes, None).is_ok() {
        return Ok(bytes.to_vec());
    }
    Err("invalid rnn bytes: no decodable snapshot found".to_string())
}
/// Serializes an f32 dense network (`topology`, activations, flat `weights`
/// and `biases`) into RNN bytes, growing scratch buffers on demand.
///
/// The `_benchmark_blob` parameter is currently unused here: the call to
/// `build_f32_into` below does not take it.
///
/// Buffer strategy: start from size estimates, then retry with doubled
/// capacities whenever the packer reports `CapacityTooSmall`.
pub fn pack_with_benchmark_f32(
    topology: &[usize],
    hidden_activation: ActivationKind,
    output_activation: ActivationKind,
    weights: &[f32],
    biases: &[f32],
    _benchmark_blob: &[u8],
) -> Result<Vec<u8>, String> {
    let layer_count = topology.len().saturating_sub(1);
    // Placeholder specs; the packer fills in the real layer layouts.
    let mut layer_specs_scratch = vec![
        LayerSpec::Dense(LayerDesc {
            input_size: 1,
            output_size: 1,
            weight_offset: 0,
            bias_offset: 0,
            activation: ActivationKind::Identity,
        });
        layer_count.max(2)
    ];
    // Estimate: 64 bytes of slack + 4 bytes per f32 element, at least 1 KiB.
    let mut rmd1_scratch = vec![
        0u8;
        64usize
            .saturating_add(weights.len().saturating_mul(4))
            .saturating_add(biases.len().saturating_mul(4))
            .max(1024)
    ];
    let mut metadata_scratch = vec![0u8; 1024];
    let mut out = vec![
        0u8;
        rmd1_scratch
            .len()
            .saturating_mul(2)
            .saturating_add(8192)
            .max(8192)
    ];
    let activations = ActivationConfig {
        hidden: hidden_activation,
        output: output_activation,
    };
    loop {
        let buffers = PackBuffers {
            layer_specs_scratch: &mut layer_specs_scratch,
            rmd1_scratch: &mut rmd1_scratch,
            metadata_scratch: &mut metadata_scratch,
            out_bytes: &mut out,
        };
        match build_f32_into(topology, activations, weights, biases, buffers) {
            Ok(used) => {
                // `used` is the number of bytes actually written.
                out.truncate(used);
                return Ok(out);
            }
            Err(RnnApiError::CapacityTooSmall) => {
                // Double every buffer (with floors) and retry; saturating math
                // avoids overflow on pathological sizes.
                layer_specs_scratch.resize(
                    layer_specs_scratch.len().saturating_mul(2).max(2),
                    LayerSpec::Dense(LayerDesc {
                        input_size: 1,
                        output_size: 1,
                        weight_offset: 0,
                        bias_offset: 0,
                        activation: ActivationKind::Identity,
                    }),
                );
                rmd1_scratch.resize(rmd1_scratch.len().saturating_mul(2).max(2048), 0);
                metadata_scratch.resize(metadata_scratch.len().saturating_mul(2).max(2048), 0);
                out.resize(out.len().saturating_mul(2).max(8192), 0);
            }
            Err(e) => {
                return Err(format!(
                    "failed to encode rnn model with benchmark (f32): {e:?}"
                ))
            }
        }
    }
}
/// f64 twin of `pack_with_benchmark_f32`: serializes an f64 dense network
/// into RNN bytes, growing scratch buffers on demand.
///
/// The `_benchmark_blob` parameter is currently unused here: the call to
/// `build_f64_into` below does not take it.
pub fn pack_with_benchmark_f64(
    topology: &[usize],
    hidden_activation: ActivationKind,
    output_activation: ActivationKind,
    weights: &[f64],
    biases: &[f64],
    _benchmark_blob: &[u8],
) -> Result<Vec<u8>, String> {
    let layer_count = topology.len().saturating_sub(1);
    // Placeholder specs; the packer fills in the real layer layouts.
    let mut layer_specs_scratch = vec![
        LayerSpec::Dense(LayerDesc {
            input_size: 1,
            output_size: 1,
            weight_offset: 0,
            bias_offset: 0,
            activation: ActivationKind::Identity,
        });
        layer_count.max(2)
    ];
    // Estimate: 64 bytes of slack + 8 bytes per f64 element, at least 1 KiB.
    let mut rmd1_scratch = vec![
        0u8;
        64usize
            .saturating_add(weights.len().saturating_mul(8))
            .saturating_add(biases.len().saturating_mul(8))
            .max(1024)
    ];
    let mut metadata_scratch = vec![0u8; 1024];
    let mut out = vec![
        0u8;
        rmd1_scratch
            .len()
            .saturating_mul(2)
            .saturating_add(8192)
            .max(8192)
    ];
    let activations = ActivationConfig {
        hidden: hidden_activation,
        output: output_activation,
    };
    loop {
        let buffers = PackBuffers {
            layer_specs_scratch: &mut layer_specs_scratch,
            rmd1_scratch: &mut rmd1_scratch,
            metadata_scratch: &mut metadata_scratch,
            out_bytes: &mut out,
        };
        match build_f64_into(topology, activations, weights, biases, buffers) {
            Ok(used) => {
                // `used` is the number of bytes actually written.
                out.truncate(used);
                return Ok(out);
            }
            Err(RnnApiError::CapacityTooSmall) => {
                // Double every buffer (with floors) and retry; saturating math
                // avoids overflow on pathological sizes.
                layer_specs_scratch.resize(
                    layer_specs_scratch.len().saturating_mul(2).max(2),
                    LayerSpec::Dense(LayerDesc {
                        input_size: 1,
                        output_size: 1,
                        weight_offset: 0,
                        bias_offset: 0,
                        activation: ActivationKind::Identity,
                    }),
                );
                rmd1_scratch.resize(rmd1_scratch.len().saturating_mul(2).max(2048), 0);
                metadata_scratch.resize(metadata_scratch.len().saturating_mul(2).max(2048), 0);
                out.resize(out.len().saturating_mul(2).max(8192), 0);
            }
            Err(e) => {
                return Err(format!(
                    "failed to encode rnn model with benchmark (f64): {e:?}"
                ))
            }
        }
    }
}
/// Runs one SGD step on a dense network described by `layers`, updating
/// `weights` and `biases` in place and returning the step's loss.
///
/// All scratch slices are caller-provided so hot training loops perform no
/// allocations here; they are simply bundled into an `SgdScratch` for the
/// underlying `sgd_step` kernel.
#[allow(clippy::too_many_arguments)]
pub fn train_step(
    layers: &[usize],
    weights: &mut [f32],
    biases: &mut [f32],
    input: &[f32],
    target: &[f32],
    layer_specs_scratch: &mut [LayerSpec],
    activations_scratch: &mut [f32],
    deltas_scratch: &mut [f32],
    config: SgdConfig,
) -> Result<f32, TrainError> {
    sgd_step(
        layers,
        weights,
        biases,
        input,
        target,
        &mut SgdScratch {
            layer_specs_scratch,
            activations_scratch,
            deltas_scratch,
        },
        config,
    )
}
/// Pure delegation to `pack_with_benchmark_f32`; kept so callers can use the
/// "dense native" name without depending on the packing implementation.
pub fn pack_dense_native_with_benchmark_f32(
    topology: &[usize],
    hidden_activation: ActivationKind,
    output_activation: ActivationKind,
    weights: &[f32],
    biases: &[f32],
    benchmark_blob: &[u8],
) -> Result<Vec<u8>, String> {
    pack_with_benchmark_f32(
        topology,
        hidden_activation,
        output_activation,
        weights,
        biases,
        benchmark_blob,
    )
}
/// Pure delegation to `pack_with_benchmark_f64`; f64 twin of
/// `pack_dense_native_with_benchmark_f32`.
pub fn pack_dense_native_with_benchmark_f64(
    topology: &[usize],
    hidden_activation: ActivationKind,
    output_activation: ActivationKind,
    weights: &[f64],
    biases: &[f64],
    benchmark_blob: &[u8],
) -> Result<Vec<u8>, String> {
    pack_with_benchmark_f64(
        topology,
        hidden_activation,
        output_activation,
        weights,
        biases,
        benchmark_blob,
    )
}
/// Pure delegation to `train_step`; kept so callers can use the "dense" name
/// for one SGD update on a dense network.
#[allow(clippy::too_many_arguments)]
pub fn train_dense_step(
    layers: &[usize],
    weights: &mut [f32],
    biases: &mut [f32],
    input: &[f32],
    target: &[f32],
    layer_specs_scratch: &mut [LayerSpec],
    activations_scratch: &mut [f32],
    deltas_scratch: &mut [f32],
    config: SgdConfig,
) -> Result<f32, TrainError> {
    train_step(
        layers,
        weights,
        biases,
        input,
        target,
        layer_specs_scratch,
        activations_scratch,
        deltas_scratch,
        config,
    )
}
/// Convenience constructor: opens a live benchmark snapshot that appends to
/// `<benchmark_dir>/<stem>.csv` at the given emission interval.
pub fn create_live_snapshot(
    benchmark_dir: &Path,
    stem: &str,
    interval_ms: u64,
) -> Result<LiveBenchmarkSnapshot, String> {
    let csv_path = benchmark_dir.join(format!("{}.csv", stem));
    LiveBenchmarkSnapshot::new(csv_path, interval_ms)
}
/// Builds `train_samples_per_cycle` factored samples, one scoped thread per
/// lane; each lane seeds with `base_iteration + lane`, so a batch is fully
/// deterministic for a given base iteration.
///
/// # Errors
/// Returns an error if any worker thread panics.
pub fn build_parallel_factored_batch(
    base_iteration: usize,
    input_size: usize,
    output_size: usize,
    train_samples_per_cycle: usize,
    training_order: TrainingOrder,
    kernel_cfg: FactorizedKernelConfig,
) -> Result<Vec<FactoredSample>, String> {
    // Scoped threads let each worker capture the loop-local `lane` and the
    // Copy config by move without requiring 'static lifetimes.
    std::thread::scope(|scope| {
        let mut handles = Vec::with_capacity(train_samples_per_cycle);
        for lane in 0..train_samples_per_cycle {
            handles.push(scope.spawn(move || {
                let sample_seed = base_iteration.saturating_add(lane);
                let variant = training_order.sample_variant(sample_seed);
                let mut input = vec![0.0f32; input_size];
                let mut target = vec![0.0f32; output_size];
                synthetic_sample_in_place(
                    sample_seed,
                    variant,
                    kernel_cfg.input_noise_amplitude,
                    &mut input,
                    &mut target,
                );
                apply_factorized_kernel(sample_seed, lane, &mut input, &mut target, kernel_cfg);
                FactoredSample { input, target }
            }));
        }
        // Join in lane order so the output batch is ordered deterministically.
        let mut out = Vec::with_capacity(train_samples_per_cycle);
        for handle in handles {
            let sample = handle
                .join()
                .map_err(|_| "parallel sample worker panicked".to_string())?;
            out.push(sample);
        }
        Ok(out)
    })
}
/// Mixes low-rank "factor" statistics back into `input` and `target` so each
/// lane gets correlated structure on top of the raw synthetic sample, then
/// runs the periodic corruption pass.
///
/// Factors are tanh-squashed bucket means of input elements (bucket
/// assignment rotates with `sample_seed` and the model kernel id). Inputs
/// blend toward two per-element factors; targets blend toward factors plus
/// the mixed input's RMS energy. A no-op for empty input or target slices.
fn apply_factorized_kernel(
    sample_seed: usize,
    lane: usize,
    input: &mut [f32],
    target: &mut [f32],
    kernel_cfg: FactorizedKernelConfig,
) {
    if input.is_empty() || target.is_empty() {
        return;
    }
    // Clamp the rank into 1..=input.len(): a configured factor_rank of 0
    // would otherwise make every `% rank` below panic (remainder by zero).
    let rank = kernel_cfg.factor_rank.clamp(1, input.len());
    let mut factors = vec![0.0f32; rank];
    let mut counts = vec![0usize; rank];
    for (idx, value) in input.iter().copied().enumerate() {
        let factor_idx = (idx + sample_seed + kernel_cfg.model_kernel_id * 5) % rank;
        factors[factor_idx] += value;
        counts[factor_idx] = counts[factor_idx].saturating_add(1);
    }
    for idx in 0..rank {
        // Bucket mean (count floored at 1 to guard empty buckets), squashed.
        let count = counts[idx].max(1) as f32;
        factors[idx] = (factors[idx] / count).tanh();
    }
    for (idx, slot) in input.iter_mut().enumerate() {
        let a = factors[idx % rank];
        let b = factors[(idx
            .wrapping_mul(kernel_cfg.vector_stride)
            .wrapping_add(lane + kernel_cfg.model_kernel_id))
            % rank];
        let mixed = (1.0 - kernel_cfg.kernel_gain) * *slot
            + (kernel_cfg.kernel_gain * 0.65) * a
            + (kernel_cfg.kernel_gain * 0.35) * b;
        *slot = mixed.tanh();
    }
    // RMS energy of the mixed input feeds into every target head.
    let mut energy = 0.0f32;
    for value in input.iter().copied() {
        energy += value * value;
    }
    energy = (energy / input.len() as f32).sqrt();
    for (idx, slot) in target.iter_mut().enumerate() {
        let latent = factors[(idx + lane + kernel_cfg.model_kernel_id) % rank];
        let harmonic =
            factors[(idx.wrapping_mul(kernel_cfg.vector_stride + 1) + sample_seed) % rank];
        let mixed = 0.72 * *slot
            + kernel_cfg.target_feedback * latent
            + (kernel_cfg.target_feedback * 0.5) * harmonic
            + 0.08 * energy;
        *slot = mixed.tanh();
    }
    attempt_training_corruption(sample_seed, lane, input, target, kernel_cfg);
}
/// Periodically perturbs a sample in place; a no-op unless the combined
/// trigger seed lands on a multiple of `corruption_every_n` (and a guaranteed
/// no-op when that field is 0).
///
/// NOTE(review): relies on `centered_noise` (defined elsewhere in this file)
/// only for the sign of each perturbation — confirm it is zero-centered.
fn attempt_training_corruption(
    sample_seed: usize,
    lane: usize,
    input: &mut [f32],
    target: &mut [f32],
    kernel_cfg: FactorizedKernelConfig,
) {
    if kernel_cfg.corruption_every_n == 0 {
        return;
    }
    // Mix seed, lane, and kernel id so different lanes/models corrupt on
    // different iterations.
    let trigger_seed = sample_seed
        .wrapping_add(lane.wrapping_mul(13))
        .wrapping_add(kernel_cfg.model_kernel_id.wrapping_mul(17));
    if !trigger_seed.is_multiple_of(kernel_cfg.corruption_every_n) {
        return;
    }
    // Stride is >= 1 by construction, so step_by below cannot panic.
    let stride = (kernel_cfg.vector_stride.max(1) + kernel_cfg.model_kernel_id).max(1);
    for idx in (0..input.len()).step_by(stride) {
        // Noise only determines the direction of the fixed-size perturbation.
        let direction = if centered_noise(sample_seed, idx, 701) >= 0.0 {
            1.0
        } else {
            -1.0
        };
        let corrupted = input[idx] + direction * kernel_cfg.corruption_strength;
        input[idx] = corrupted.clamp(-1.0, 1.0);
    }
    // Targets are corrupted on a denser stride and damped toward the noise.
    let target_stride = (stride / 2).max(1);
    for idx in (0..target.len()).step_by(target_stride) {
        let sign = if centered_noise(sample_seed, idx, 911) >= 0.0 {
            1.0
        } else {
            -1.0
        };
        let blended = 0.80 * target[idx] + sign * kernel_cfg.corruption_strength * 0.75;
        target[idx] = blended.clamp(-1.0, 1.0);
    }
}
/// Appends the record to `record.path` as one framed binary entry:
/// 4-byte magic "BMKF", little-endian u32 payload length, then the payload
/// produced by `build_benchmark_blob`.
///
/// # Errors
/// Returns a message when the blob cannot be built, is too large to frame in
/// a u32, or any file operation fails. (The file may already have been
/// created/opened when the length check fails.)
pub fn write_benchmark(record: &BenchmarkRecord<'_>) -> Result<(), String> {
    let content = build_benchmark_blob(record)?;
    let mut file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&record.path)
        .map_err(|e| format!("failed to open {}: {e}", record.path.display()))?;
    let len = u32::try_from(content.len())
        .map_err(|_| format!("benchmark blob too large for framing: {}", content.len()))?;
    // All three writes report failures with the same append message.
    let append_err = |e: std::io::Error| format!("failed to append {}: {e}", record.path.display());
    file.write_all(b"BMKF").map_err(append_err)?;
    file.write_all(&len.to_le_bytes()).map_err(append_err)?;
    file.write_all(&content).map_err(append_err)
}
/// Appends the record as a JSON object to `record.path` and as a
/// `---`-separated YAML document to the same path with a `.yaml` extension.
///
/// Both files are opened in append mode, so repeated runs accumulate one
/// document per run. Note: appending multiple JSON objects to one file
/// yields a stream of objects rather than a single valid JSON document.
pub fn write_benchmark_text_exports(record: &BenchmarkRecord<'_>) -> Result<(), String> {
    let metrics = to_metrics(record);
    let topology_str = format_topology(record.topology);
    let json_path = record.path.clone();
    let yaml_path = json_path.with_extension("yaml");
    // Both bodies carry the same fields in the same order; only the syntax
    // differs.
    let json = format!(
        "{{\n \"model\": \"{}\",\n \"precision\": \"{}\",\n \"topology\": {},\n \"elapsed_ms\": {},\n \"iterations\": {},\n \"train_samples\": {},\n \"train_samples_per_cycle\": {},\n \"element_size_bytes\": {},\n \"avg_loss\": {},\n \"last_loss\": {},\n \"min_loss\": {},\n \"max_loss\": {},\n \"loss_stddev\": {},\n \"output_bytes\": {},\n \"total_params\": {},\n \"layer_count\": {},\n \"input_dim\": {},\n \"output_dim\": {},\n \"benchmark_flags\": {},\n \"weights_bytes\": {},\n \"biases_bytes\": {},\n \"iterations_per_sec\": {},\n \"samples_per_sec\": {}\n}}\n",
        metrics.model_name,
        metrics.precision,
        topology_str,
        metrics.elapsed_ms,
        metrics.iterations,
        metrics.train_samples,
        record.train_samples_per_cycle,
        record.element_size_bytes,
        metrics.avg_loss,
        metrics.last_loss,
        metrics.min_loss,
        metrics.max_loss,
        metrics.loss_stddev,
        metrics.output_bytes,
        metrics.total_params,
        metrics.layer_count,
        metrics.input_dim,
        metrics.output_dim,
        metrics.benchmark_flags,
        metrics.weights_bytes,
        metrics.biases_bytes,
        metrics.iterations_per_sec,
        metrics.samples_per_sec,
    );
    let yaml = format!(
        "model: {}\nprecision: {}\ntopology: {}\nelapsed_ms: {}\niterations: {}\ntrain_samples: {}\ntrain_samples_per_cycle: {}\nelement_size_bytes: {}\navg_loss: {}\nlast_loss: {}\nmin_loss: {}\nmax_loss: {}\nloss_stddev: {}\noutput_bytes: {}\ntotal_params: {}\nlayer_count: {}\ninput_dim: {}\noutput_dim: {}\nbenchmark_flags: {}\nweights_bytes: {}\nbiases_bytes: {}\niterations_per_sec: {}\nsamples_per_sec: {}\n",
        metrics.model_name,
        metrics.precision,
        topology_str,
        metrics.elapsed_ms,
        metrics.iterations,
        metrics.train_samples,
        record.train_samples_per_cycle,
        record.element_size_bytes,
        metrics.avg_loss,
        metrics.last_loss,
        metrics.min_loss,
        metrics.max_loss,
        metrics.loss_stddev,
        metrics.output_bytes,
        metrics.total_params,
        metrics.layer_count,
        metrics.input_dim,
        metrics.output_dim,
        metrics.benchmark_flags,
        metrics.weights_bytes,
        metrics.biases_bytes,
        metrics.iterations_per_sec,
        metrics.samples_per_sec,
    );
    let mut json_file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&json_path)
        .map_err(|e| format!("failed to open {}: {e}", json_path.display()))?;
    json_file
        .write_all(json.as_bytes())
        .map_err(|e| format!("failed to append {}: {e}", json_path.display()))?;
    let mut yaml_file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&yaml_path)
        .map_err(|e| format!("failed to open {}: {e}", yaml_path.display()))?;
    // Each appended YAML run starts with a document separator.
    yaml_file
        .write_all(b"---\n")
        .map_err(|e| format!("failed to append {}: {e}", yaml_path.display()))?;
    yaml_file
        .write_all(yaml.as_bytes())
        .map_err(|e| format!("failed to append {}: {e}", yaml_path.display()))?;
    Ok(())
}
/// Encodes the record's metrics into a benchmark blob sized exactly to its
/// encoded length.
///
/// # Errors
/// Returns a message when the encoded size cannot be computed or encoding
/// fails.
pub fn build_benchmark_blob(record: &BenchmarkRecord<'_>) -> Result<Vec<u8>, String> {
    let metrics = to_metrics(record);
    let capacity = encoded_size_benchmark_blob(&metrics)
        .ok_or_else(|| "failed to compute benchmark blob size".to_string())?;
    let mut blob = vec![0u8; capacity];
    let written = encode_benchmark_blob(&metrics, &mut blob)
        .map_err(|e| format!("failed to encode benchmark blob: {e:?}"))?;
    // Trim in case the encoder used less than the computed capacity.
    blob.truncate(written);
    Ok(blob)
}
/// Derives the full `BenchmarkMetrics` view from a `BenchmarkRecord`,
/// computing parameter counts, byte sizes, throughput rates, and loss
/// spread statistics.
fn to_metrics<'a>(record: &'a BenchmarkRecord<'a>) -> BenchmarkMetrics<'a> {
    let (weight_count, bias_count) = count_params(record.topology);
    let train_samples = record
        .iterations
        .saturating_mul(record.train_samples_per_cycle) as u64;
    let elapsed_secs = record.elapsed.as_secs_f32();
    // Rates report 0 until any wall time has accrued.
    let per_second = |count: f32| {
        if elapsed_secs > 0.0 {
            count / elapsed_secs
        } else {
            0.0
        }
    };
    // f32::min/max keep the original NaN-propagation behavior.
    let min_loss = record.avg_loss.min(record.last_loss);
    let max_loss = record.avg_loss.max(record.last_loss);
    BenchmarkMetrics {
        model_name: record.model_file,
        precision: record.precision_label,
        elapsed_ms: record.elapsed.as_millis() as u64,
        iterations: record.iterations as u64,
        train_samples,
        avg_loss: record.avg_loss,
        last_loss: record.last_loss,
        output_bytes: record.output_bytes,
        total_params: weight_count.saturating_add(bias_count) as u64,
        layer_count: record.topology.len().saturating_sub(1) as u32,
        input_dim: record.topology.first().copied().unwrap_or(0) as u32,
        output_dim: record.topology.last().copied().unwrap_or(0) as u32,
        benchmark_flags: 0,
        weights_bytes: (weight_count as u64).saturating_mul(record.element_size_bytes),
        biases_bytes: (bias_count as u64).saturating_mul(record.element_size_bytes),
        min_loss,
        max_loss,
        // Only two loss samples are tracked, so the "stddev" is approximated
        // as half their spread.
        loss_stddev: (max_loss - min_loss) * 0.5,
        iterations_per_sec: per_second(record.iterations as f32),
        samples_per_sec: per_second(train_samples as f32),
    }
}
/// Counts total dense weights and biases for a layer-width topology:
/// weights = sum of in*out over adjacent layer pairs, biases = sum of
/// output widths. Saturating math guards absurd topologies.
fn count_params(topology: &[usize]) -> (usize, usize) {
    topology
        .windows(2)
        .fold((0usize, 0usize), |(weights, biases), pair| {
            (
                weights.saturating_add(pair[0].saturating_mul(pair[1])),
                biases.saturating_add(pair[1]),
            )
        })
}
/// Renders a topology as a bracketed, comma-separated list, e.g. "[1,2,3]"
/// (and "[]" for an empty slice).
fn format_topology(topology: &[usize]) -> String {
    let inner = topology
        .iter()
        .map(|v| v.to_string())
        .collect::<Vec<_>>()
        .join(",");
    format!("[{inner}]")
}
/// Core fields shared by all three benchmark export formats; used only to
/// cross-check that the binary, JSON, and YAML exports agree.
#[derive(Clone, Debug)]
struct ParsedBenchmarkCore {
    model_name: String,
    precision: String,
    elapsed_ms: u64,
    iterations: u64,
    train_samples: u64,
    avg_loss: f32,
    last_loss: f32,
    output_bytes: usize,
}
/// Cross-checks the binary `.bmk`, JSON, and YAML exports for one benchmark
/// run and removes legacy `.final.csv` / `.finall.csv` leftovers on success.
///
/// # Errors
/// Returns an error when any file fails to parse or when the exports
/// disagree on the core fields (see `ensure_core_match`).
pub fn parse_and_validate_benchmark_exports_paths(
    bench_path: &Path,
    json_path: &Path,
    yaml_path: &Path,
) -> Result<(), String> {
    let parsed_bench = parse_bench_file(bench_path)?;
    let parsed_json = parse_json_file(json_path)?;
    let parsed_yaml = parse_yaml_file(yaml_path)?;
    // The binary export is the reference; both text exports must agree with it.
    ensure_core_match(&parsed_bench, &parsed_json, "bench", "json")?;
    ensure_core_match(&parsed_bench, &parsed_yaml, "bench", "yaml")?;
    let benchmark_dir = bench_path.parent().unwrap_or_else(|| Path::new("."));
    let stem = bench_path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("benchmark");
    // Best-effort cleanup of outputs from older versions; removal errors are
    // deliberately ignored.
    let old_final = benchmark_dir.join(format!("{}.final.csv", stem));
    if old_final.exists() {
        let _ = fs::remove_file(&old_final);
    }
    let old_finall = benchmark_dir.join(format!("{}.finall.csv", stem));
    if old_finall.exists() {
        let _ = fs::remove_file(&old_finall);
    }
    Ok(())
}
/// Verifies two parsed exports agree: exact equality on the identity and
/// counter fields, and agreement within 1e-4 on the two loss fields.
///
/// # Errors
/// Returns a "benchmark mismatch" message naming both sides on any
/// disagreement.
fn ensure_core_match(
    left: &ParsedBenchmarkCore,
    right: &ParsedBenchmarkCore,
    left_name: &str,
    right_name: &str,
) -> Result<(), String> {
    const LOSS_TOLERANCE: f32 = 1e-4;
    // The `> tolerance` direction (rather than `<=` for a match) preserves
    // the original handling of NaN losses.
    let mismatch = left.model_name != right.model_name
        || left.precision != right.precision
        || left.elapsed_ms != right.elapsed_ms
        || left.iterations != right.iterations
        || left.train_samples != right.train_samples
        || left.output_bytes != right.output_bytes
        || (left.avg_loss - right.avg_loss).abs() > LOSS_TOLERANCE
        || (left.last_loss - right.last_loss).abs() > LOSS_TOLERANCE;
    if mismatch {
        Err(format!(
            "benchmark mismatch between {} and {}",
            left_name, right_name
        ))
    } else {
        Ok(())
    }
}
/// Recovers the core benchmark fields from a `.bmk` file written by
/// `write_benchmark`.
///
/// Decode strategy, in order:
/// 1. scan backwards for the newest well-framed "BMKF" record;
/// 2. try the whole file as one raw benchmark blob;
/// 3. fall back to the last occurrence of the "BMK\x01" magic.
fn parse_bench_file(path: &Path) -> Result<ParsedBenchmarkCore, String> {
    let bytes = fs::read(path).map_err(|e| format!("failed to read {}: {e}", path.display()))?;
    if bytes.len() >= 8 {
        // Newest frame wins: iterate candidate offsets from the end.
        for idx in (0..=bytes.len() - 8).rev() {
            if &bytes[idx..idx + 4] != b"BMKF" {
                continue;
            }
            // Frame layout: 4-byte magic, little-endian u32 length, payload.
            let len = u32::from_le_bytes([
                bytes[idx + 4],
                bytes[idx + 5],
                bytes[idx + 6],
                bytes[idx + 7],
            ]) as usize;
            let start = idx + 8;
            let end = start.saturating_add(len);
            if end > bytes.len() {
                // Truncated frame or a spurious magic match — keep scanning.
                continue;
            }
            if let Ok(view) = decode_benchmark_blob(&bytes[start..end]) {
                return Ok(ParsedBenchmarkCore {
                    model_name: view.model_name.to_string(),
                    precision: view.precision.to_string(),
                    elapsed_ms: view.elapsed_ms,
                    iterations: view.iterations,
                    train_samples: view.train_samples,
                    avg_loss: view.avg_loss,
                    last_loss: view.last_loss,
                    output_bytes: view.output_bytes,
                });
            }
        }
    }
    // Unframed fallback: the whole file may be one raw blob.
    if let Ok(view) = decode_benchmark_blob(&bytes) {
        return Ok(ParsedBenchmarkCore {
            model_name: view.model_name.to_string(),
            precision: view.precision.to_string(),
            elapsed_ms: view.elapsed_ms,
            iterations: view.iterations,
            train_samples: view.train_samples,
            avg_loss: view.avg_loss,
            last_loss: view.last_loss,
            output_bytes: view.output_bytes,
        });
    }
    // Final fallback: decode from the last "BMK\x01" magic in the file.
    let mut found = None;
    if bytes.len() >= 4 {
        for idx in 0..=bytes.len() - 4 {
            if &bytes[idx..idx + 4] == b"BMK\x01" {
                found = Some(idx);
            }
        }
    }
    let Some(last_idx) = found else {
        return Err(format!(
            "failed to decode {}: benchmark magic not found",
            path.display()
        ));
    };
    let view = decode_benchmark_blob(&bytes[last_idx..])
        .map_err(|e| format!("failed to decode {}: {e:?}", path.display()))?;
    Ok(ParsedBenchmarkCore {
        model_name: view.model_name.to_string(),
        precision: view.precision.to_string(),
        elapsed_ms: view.elapsed_ms,
        iterations: view.iterations,
        train_samples: view.train_samples,
        avg_loss: view.avg_loss,
        last_loss: view.last_loss,
        output_bytes: view.output_bytes,
    })
}
/// Reads the JSON benchmark export at `path` and pulls out the core fields
/// with the line-oriented field scanners.
///
/// # Errors
/// Returns a message if the file is unreadable or any field is missing or
/// malformed.
fn parse_json_file(path: &Path) -> Result<ParsedBenchmarkCore, String> {
    let text = fs::read_to_string(path)
        .map_err(|err| format!("failed to read {}: {err}", path.display()))?;
    let core = ParsedBenchmarkCore {
        model_name: parse_json_string_field(&text, "model")?,
        precision: parse_json_string_field(&text, "precision")?,
        elapsed_ms: parse_json_u64_field(&text, "elapsed_ms")?,
        iterations: parse_json_u64_field(&text, "iterations")?,
        train_samples: parse_json_u64_field(&text, "train_samples")?,
        avg_loss: parse_json_f32_field(&text, "avg_loss")?,
        last_loss: parse_json_f32_field(&text, "last_loss")?,
        output_bytes: parse_json_usize_field(&text, "output_bytes")?,
    };
    Ok(core)
}
/// Reads the YAML benchmark export at `path` and pulls out the core fields
/// with the line-oriented field scanners.
///
/// # Errors
/// Returns a message if the file is unreadable or any field is missing or
/// malformed.
fn parse_yaml_file(path: &Path) -> Result<ParsedBenchmarkCore, String> {
    let text = fs::read_to_string(path)
        .map_err(|err| format!("failed to read {}: {err}", path.display()))?;
    let core = ParsedBenchmarkCore {
        model_name: parse_yaml_string_field(&text, "model")?,
        precision: parse_yaml_string_field(&text, "precision")?,
        elapsed_ms: parse_yaml_u64_field(&text, "elapsed_ms")?,
        iterations: parse_yaml_u64_field(&text, "iterations")?,
        train_samples: parse_yaml_u64_field(&text, "train_samples")?,
        avg_loss: parse_yaml_f32_field(&text, "avg_loss")?,
        last_loss: parse_yaml_f32_field(&text, "last_loss")?,
        output_bytes: parse_yaml_usize_field(&text, "output_bytes")?,
    };
    Ok(core)
}
/// Extracts `key` from the JSON text and requires its raw token to be a
/// double-quoted string; returns the unquoted contents.
fn parse_json_string_field(text: &str, key: &str) -> Result<String, String> {
    let raw = parse_json_raw_field(text, key)?;
    match raw.strip_prefix('"').and_then(|inner| inner.strip_suffix('"')) {
        Some(inner) => Ok(inner.to_string()),
        None => Err(format!("json field {} is not a string", key)),
    }
}
/// Extracts `key` from the JSON text and parses its raw token as a `u64`.
fn parse_json_u64_field(text: &str, key: &str) -> Result<u64, String> {
    parse_json_raw_field(text, key)?
        .parse()
        .map_err(|e| format!("json field {} parse error: {e}", key))
}
/// Extracts `key` from the JSON text and parses its raw token as a `usize`.
fn parse_json_usize_field(text: &str, key: &str) -> Result<usize, String> {
    parse_json_raw_field(text, key)?
        .parse()
        .map_err(|e| format!("json field {} parse error: {e}", key))
}
/// Extracts `key` from the JSON text and parses its raw token as an `f32`.
fn parse_json_f32_field(text: &str, key: &str) -> Result<f32, String> {
    parse_json_raw_field(text, key)?
        .parse()
        .map_err(|e| format!("json field {} parse error: {e}", key))
}
/// Line-oriented scan for `"key": value` in minimal exporter-produced JSON.
///
/// Keeps the LAST occurrence so a re-appended export overrides earlier
/// records in the same file. The returned token is trimmed and stripped of
/// trailing commas but otherwise raw (string values keep their quotes).
///
/// # Errors
/// Reports a malformed line (no `:`) or a missing key.
fn parse_json_raw_field<'a>(text: &'a str, key: &str) -> Result<&'a str, String> {
    let needle = format!("\"{}\"", key);
    let mut latest: Option<&'a str> = None;
    for raw_line in text.lines() {
        let line = raw_line.trim();
        if !line.starts_with(&needle) {
            continue;
        }
        match line.split_once(':') {
            Some((_, rhs)) => latest = Some(rhs.trim().trim_end_matches(',')),
            None => return Err(format!("json malformed field {}", key)),
        }
    }
    latest.ok_or_else(|| format!("missing json field {}", key))
}
/// Extracts `key` from the YAML text; the scalar is taken verbatim (no quote
/// handling is performed for YAML values).
fn parse_yaml_string_field(text: &str, key: &str) -> Result<String, String> {
    parse_yaml_raw_field(text, key).map(|value| value.to_string())
}
/// Extracts `key` from the YAML text and parses its scalar as a `u64`.
fn parse_yaml_u64_field(text: &str, key: &str) -> Result<u64, String> {
    let raw = parse_yaml_raw_field(text, key)?;
    raw.parse()
        .map_err(|e| format!("yaml field {} parse error: {e}", key))
}
/// Extracts `key` from the YAML text and parses its scalar as a `usize`.
fn parse_yaml_usize_field(text: &str, key: &str) -> Result<usize, String> {
    let raw = parse_yaml_raw_field(text, key)?;
    raw.parse()
        .map_err(|e| format!("yaml field {} parse error: {e}", key))
}
/// Extracts `key` from the YAML text and parses its scalar as an `f32`.
fn parse_yaml_f32_field(text: &str, key: &str) -> Result<f32, String> {
    let raw = parse_yaml_raw_field(text, key)?;
    raw.parse()
        .map_err(|e| format!("yaml field {} parse error: {e}", key))
}
/// Line-oriented scan for `key: value` in minimal exporter-produced YAML.
///
/// Keeps the LAST occurrence so a re-appended export overrides earlier
/// records in the same file; the returned scalar is whitespace-trimmed.
///
/// # Errors
/// Reports a missing key.
fn parse_yaml_raw_field<'a>(text: &'a str, key: &str) -> Result<&'a str, String> {
    let needle = format!("{}:", key);
    let mut latest = None;
    for raw_line in text.lines() {
        if let Some(rest) = raw_line.trim().strip_prefix(&needle) {
            latest = Some(rest.trim());
        }
    }
    latest.ok_or_else(|| format!("missing yaml field {}", key))
}
/// Deterministic pseudo-noise in `[-1, 1]` derived from `(step, idx, salt)`.
///
/// The seed is combined with large odd multipliers and run through a
/// SplitMix64-style xor-shift/multiply finalizer; the low 16 bits of the
/// mixed word are then mapped linearly onto the centered unit interval.
fn centered_noise(step: usize, idx: usize, salt: usize) -> f32 {
    let seed = (step as u64)
        .wrapping_mul(0x9E37_79B9_7F4A_7C15)
        .wrapping_add((idx as u64).wrapping_mul(0xBF58_476D_1CE4_E5B9))
        .wrapping_add(salt as u64);
    let mut mixed = seed;
    mixed = (mixed ^ (mixed >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    mixed = (mixed ^ (mixed >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    mixed ^= mixed >> 31;
    // Low 16 bits -> [0, 1] -> [-1, 1]; identical operation order to the
    // original so the f32 rounding matches bit-for-bit.
    ((mixed & 0xFFFF) as f32) / 65535.0 * 2.0 - 1.0
}