use native_neural_network::modules::{
activations::ActivationKind,
benchmark::{
decode_benchmark_blob, encode_benchmark_blob, encoded_size_benchmark_blob, BenchmarkMetrics,
},
layers::{LayerDesc, LayerSpec},
trainer::{sgd_step, SgdConfig, SgdScratch, TrainError},
};
use native_neural_network::rnn_api::{
build_f32_into, build_f64_into, read_rnn, ActivationConfig, PackBuffers, RnnApiError,
};
use std::fs;
use std::fs::OpenOptions;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::thread::sleep;
use std::time::{Duration, Instant};
/// Shape applied to one synthetic input sample; chosen per step by
/// `TrainingOrder::sample_variant` and consumed by `synthetic_sample_in_place`.
#[derive(Clone, Copy)]
pub enum SampleVariant {
    /// Raw phase signal, unmodified.
    Baseline,
    /// Phase blended with a second, slower phase and squashed via tanh.
    Harmonic,
    /// Periodic positive offset applied to a subset of elements.
    Spike,
    /// Sign flip on odd-indexed elements.
    Mirror,
}
/// Selects one of four fixed variant rotation schedules (0..=3; values above 3
/// share the last schedule — see `sample_variant`).
#[derive(Clone, Copy)]
pub struct TrainingOrder(pub u8);
impl TrainingOrder {
pub fn sample_variant(self, seed: usize) -> SampleVariant {
match self.0 {
0 => match seed % 4 {
0 => SampleVariant::Baseline,
1 => SampleVariant::Harmonic,
2 => SampleVariant::Spike,
_ => SampleVariant::Mirror,
},
1 => match seed % 4 {
0 => SampleVariant::Mirror,
1 => SampleVariant::Spike,
2 => SampleVariant::Harmonic,
_ => SampleVariant::Baseline,
},
2 => match seed % 4 {
0 => SampleVariant::Spike,
1 => SampleVariant::Baseline,
2 => SampleVariant::Harmonic,
_ => SampleVariant::Mirror,
},
_ => match seed % 4 {
0 => SampleVariant::Harmonic,
1 => SampleVariant::Baseline,
2 => SampleVariant::Mirror,
_ => SampleVariant::Spike,
},
}
}
}
/// Fills `input` and `target` in place with a deterministic synthetic sample.
///
/// Each input element starts from a fixed phase pattern derived from `step`
/// and the element index, is reshaped per `variant`, perturbed by
/// `centered_noise` scaled with `input_noise_amplitude`, and clamped to
/// [-1.0, 1.0]. Each target element mixes the finished input's mean, a
/// position-weighted sum, and its RMS with a per-head phase, a per-variant
/// bias, and a small noise term, then squashes through tanh.
///
/// NOTE(review): `centered_noise` is defined elsewhere in this file; comments
/// here assume it returns a deterministic, zero-centered value — confirm.
pub fn synthetic_sample_in_place(
    step: usize,
    variant: SampleVariant,
    input_noise_amplitude: f32,
    input: &mut [f32],
    target: &mut [f32],
) {
    for (i, slot) in input.iter_mut().enumerate() {
        // Base phase in [0, 1), period 257 over the step/index mix; rescaled
        // to roughly [-1, 1] on the next line.
        let phase = ((step.wrapping_mul(17).wrapping_add(i.wrapping_mul(31))) % 257) as f32 / 256.0;
        let mut value = phase * 2.0 - 1.0;
        value = match variant {
            SampleVariant::Baseline => value,
            SampleVariant::Harmonic => {
                // Blend in a slower, 41-period phase and squash to stay bounded.
                let h = (((step + i * 3) % 41) as f32 / 40.0) * 2.0 - 1.0;
                (0.65 * value + 0.35 * h).tanh()
            }
            SampleVariant::Spike => {
                // Every 11th (step + index) gets a positive spike, clamped.
                if (step + i).is_multiple_of(11) {
                    (value + 0.85).clamp(-1.0, 1.0)
                } else {
                    value
                }
            }
            SampleVariant::Mirror => {
                // Flip the sign of odd-indexed elements.
                if i % 2 == 0 {
                    value
                } else {
                    -value
                }
            }
        };
        let noise = centered_noise(step, i, 97) * input_noise_amplitude;
        *slot = (value + noise).clamp(-1.0, 1.0);
    }
    // Summary statistics of the finished input drive the target heads below.
    let mean = if input.is_empty() {
        0.0
    } else {
        input.iter().copied().sum::<f32>() / input.len() as f32
    };
    // Root-mean-square of the input (named l2 here).
    let l2 = if input.is_empty() {
        0.0
    } else {
        let s = input.iter().map(|v| v * v).sum::<f32>() / input.len() as f32;
        s.sqrt()
    };
    // Position-weighted average; weights cycle through {-0.3, -0.15, 0, 0.15, 0.3}.
    let weighted = if input.is_empty() {
        0.0
    } else {
        let mut acc = 0.0f32;
        for (i, value) in input.iter().enumerate() {
            let w = ((i % 5) as f32 - 2.0) * 0.15;
            acc += *value * w;
        }
        acc / input.len() as f32
    };
    let variant_bias = match variant {
        SampleVariant::Baseline => 0.0,
        SampleVariant::Harmonic => 0.08,
        SampleVariant::Spike => 0.12,
        SampleVariant::Mirror => -0.05,
    };
    for (o, slot) in target.iter_mut().enumerate() {
        // Per-head phase in [0, 1) with period 19; recentered by the -0.5 below.
        let phase = ((step + o * 7) % 19) as f32 / 19.0;
        let head_noise = centered_noise(step, o, 313) * 0.03;
        let mixed =
            0.45 * mean + 0.25 * weighted + 0.20 * l2 + (phase - 0.5) + variant_bias + head_noise;
        *slot = mixed.tanh();
    }
}
/// Accumulates `cycle_compute_elapsed` into `compute_elapsed`, then sleeps
/// just long enough that compute time divided by wall time (since `start`)
/// approaches `cpu_target_utilization`.
///
/// Values of `cpu_target_utilization` outside the open interval (0.0, 1.0)
/// disable pacing: the time is still accumulated but no sleep occurs.
pub fn pace_cpu_target_utilization(
    start: Instant,
    compute_elapsed: &mut Duration,
    cycle_compute_elapsed: Duration,
    cpu_target_utilization: f64,
) {
    *compute_elapsed = compute_elapsed.saturating_add(cycle_compute_elapsed);
    if !(cpu_target_utilization > 0.0 && cpu_target_utilization < 1.0) {
        return;
    }
    // Wall-clock duration that would land exactly on the target utilization.
    let desired_wall =
        Duration::from_secs_f64(compute_elapsed.as_secs_f64() / cpu_target_utilization);
    // Sleep only for the positive remainder; `checked_sub` is None when the
    // wall clock is already past the desired point.
    if let Some(remaining) = desired_wall.checked_sub(start.elapsed()) {
        if !remaining.is_zero() {
            sleep(remaining);
        }
    }
}
/// One input/target pair produced by the factored sample pipeline
/// (`build_parallel_factored_batch`).
pub struct FactoredSample {
    pub input: Vec<f32>,
    pub target: Vec<f32>,
}
/// Per-model tuning knobs for the factorized sample kernel; produced by
/// `kernel_config_for_model` and consumed by `apply_factorized_kernel`,
/// `attempt_training_corruption`, and `synthetic_sample_in_place`.
#[derive(Clone, Copy)]
pub struct FactorizedKernelConfig {
    // Scale applied to the per-element input noise.
    pub input_noise_amplitude: f32,
    // Number of factor buckets (capped by the input length at the use site).
    pub factor_rank: usize,
    // Stride used when picking each element's secondary factor.
    pub vector_stride: usize,
    // Blend weight pulling inputs toward their factors.
    pub kernel_gain: f32,
    // Blend weight pulling targets toward their factors.
    pub target_feedback: f32,
    // Distinguishes models so their factor assignments differ.
    pub model_kernel_id: usize,
    // Corruption fires when the trigger seed is a multiple of this; 0 disables.
    pub corruption_every_n: usize,
    // Magnitude of each corruption perturbation.
    pub corruption_strength: f32,
}
/// Returns the factorized-kernel tuning profile for a model file name.
///
/// Unknown names fall back to the small profile with `model_kernel_id` 0.
/// Every profile shares the same input noise amplitude (0.06).
pub fn kernel_config_for_model(model_file: &str) -> FactorizedKernelConfig {
    // (rank, stride, gain, feedback, kernel id, corruption period, corruption strength)
    let (
        factor_rank,
        vector_stride,
        kernel_gain,
        target_feedback,
        model_kernel_id,
        corruption_every_n,
        corruption_strength,
    ) = match model_file {
        "small.rnn" => (8, 3, 0.18, 0.12, 1, 9, 0.08),
        "medium.rnn" => (16, 5, 0.24, 0.16, 2, 11, 0.10),
        "large.rnn" => (32, 7, 0.30, 0.20, 3, 13, 0.12),
        "enormous.rnn" => (64, 11, 0.36, 0.24, 4, 15, 0.14),
        _ => (8, 3, 0.18, 0.12, 0, 9, 0.08),
    };
    FactorizedKernelConfig {
        input_noise_amplitude: 0.06,
        factor_rank,
        vector_stride,
        kernel_gain,
        target_feedback,
        model_kernel_id,
        corruption_every_n,
        corruption_strength,
    }
}
/// Everything needed to describe one finished benchmark run; consumed by
/// `build_benchmark_blob` and the binary/JSON/YAML writers.
pub struct BenchmarkRecord<'a> {
    // Destination file for the writers; ignored by `build_benchmark_blob`.
    pub path: PathBuf,
    pub model_file: &'a str,
    // Layer widths, input first; parameter counts are derived from this.
    pub topology: &'a [usize],
    pub precision_label: &'a str,
    // Bytes per weight/bias element (4 for f32, 8 for f64 packing).
    pub element_size_bytes: u64,
    pub iterations: usize,
    pub elapsed: Duration,
    pub avg_loss: f32,
    pub last_loss: f32,
    // Size in bytes of the packed model output.
    pub output_bytes: usize,
    pub train_samples_per_cycle: usize,
}
/// Appends periodic progress rows to a CSV file during training; construct
/// via `LiveBenchmarkSnapshot::new` or `create_live_snapshot`.
pub struct LiveBenchmarkSnapshot {
    // CSV file the rows are appended to.
    path: PathBuf,
    // Minimum gap between emitted rows (clamped to >= 1 ms in `new`).
    interval_ms: u64,
    // Elapsed-time threshold (ms) at which the next row becomes due.
    next_emit_ms: u64,
}
/// One progress observation passed to `LiveBenchmarkSnapshot::maybe_write`.
pub struct LiveSnapshotPoint<'a> {
    pub model_file: &'a str,
    pub precision_label: &'a str,
    // Wall-clock time since the run started.
    pub elapsed: Duration,
    pub iterations: usize,
    pub avg_loss: f32,
    pub last_loss: f32,
    // Multiplied by `iterations` to report total training samples.
    pub train_samples_per_cycle: usize,
}
/// Progress recovered from a previous run's live CSV; `load_resume_cursor`
/// returns a zeroed cursor when no usable row exists.
#[derive(Clone, Copy, Debug)]
pub struct ResumeCursor {
    pub iterations: usize,
    pub elapsed: Duration,
}
/// Inputs to `finalize_benchmark_outputs`: run statistics plus the paths the
/// trained model and benchmark exports are written to.
pub struct BenchmarkFinalizeRequest<'a> {
    pub model_file: &'a str,
    // File stem used for the `.bmk`, `.json`, and `.yaml` export names.
    pub stem: &'a str,
    pub topology: &'a [usize],
    pub precision_label: &'a str,
    pub element_size_bytes: u64,
    pub iterations: usize,
    pub elapsed: Duration,
    pub avg_loss: f32,
    pub last_loss: f32,
    pub train_samples_per_cycle: usize,
    // Output size fed into the first packing pass; the second pass replaces
    // it with the measured packed size.
    pub baseline_output_bytes: usize,
    // The packed model bytes are appended to this file.
    pub trained_path: PathBuf,
    pub benchmark_dir: PathBuf,
}
impl LiveBenchmarkSnapshot {
    /// Prepares the live snapshot file at `path` (writing the CSV header if
    /// the file does not exist yet) and schedules the first emission one
    /// interval from time zero. `interval_ms` is clamped to at least 1 ms.
    pub fn new(path: PathBuf, interval_ms: u64) -> Result<Self, String> {
        let interval_ms = interval_ms.max(1);
        if !path.exists() {
            fs::write(
                &path,
                "model,precision,elapsed_ms,iterations,train_samples,avg_loss,last_loss,iterations_per_sec,samples_per_sec\n",
            )
            .map_err(|e| format!("failed to initialize {}: {e}", path.display()))?;
        }
        Ok(Self {
            path,
            interval_ms,
            next_emit_ms: interval_ms,
        })
    }
    /// Appends one CSV row when the snapshot is due (or `force` is set), then
    /// advances the schedule past the current elapsed time.
    pub fn maybe_write(&mut self, point: LiveSnapshotPoint<'_>, force: bool) -> Result<(), String> {
        let elapsed_ms = point.elapsed.as_millis() as u64;
        let due = force || elapsed_ms >= self.next_emit_ms;
        if !due {
            return Ok(());
        }
        let train_samples = point
            .iterations
            .saturating_mul(point.train_samples_per_cycle) as u64;
        let elapsed_secs = point.elapsed.as_secs_f32();
        // Both throughput columns report 0 until some wall time has accrued.
        let (iterations_per_sec, samples_per_sec) = if elapsed_secs > 0.0 {
            (
                point.iterations as f32 / elapsed_secs,
                train_samples as f32 / elapsed_secs,
            )
        } else {
            (0.0, 0.0)
        };
        let row = format!(
            "\"{}\",\"{}\",{},{},{},{},{},{},{}\n",
            point.model_file,
            point.precision_label,
            elapsed_ms,
            point.iterations,
            train_samples,
            point.avg_loss,
            point.last_loss,
            iterations_per_sec,
            samples_per_sec,
        );
        let mut file = OpenOptions::new()
            .append(true)
            .open(&self.path)
            .map_err(|e| format!("failed to open {}: {e}", self.path.display()))?;
        file.write_all(row.as_bytes())
            .map_err(|e| format!("failed to append {}: {e}", self.path.display()))?;
        if force {
            // Forced writes restart the cadence relative to "now".
            self.next_emit_ms = elapsed_ms.saturating_add(self.interval_ms);
        } else {
            // Skip any intervals that fully elapsed while we were computing.
            while self.next_emit_ms <= elapsed_ms {
                self.next_emit_ms = self.next_emit_ms.saturating_add(self.interval_ms);
            }
        }
        Ok(())
    }
}
/// Recovers resume progress from `<benchmark_dir>/<stem>.csv`.
///
/// Scans the live CSV from its newest line backwards and returns the first
/// row whose elapsed/iteration columns parse; header lines, blanks, and
/// malformed rows are skipped. A missing file (or no usable row) yields a
/// zeroed cursor.
///
/// # Errors
/// Returns an error only when the file exists but cannot be read.
pub fn load_resume_cursor(benchmark_dir: &Path, stem: &str) -> Result<ResumeCursor, String> {
    let path = benchmark_dir.join(format!("{}.csv", stem));
    let fresh = ResumeCursor {
        iterations: 0,
        elapsed: Duration::ZERO,
    };
    if !path.exists() {
        return Ok(fresh);
    }
    let text =
        fs::read_to_string(&path).map_err(|e| format!("failed to read {}: {e}", path.display()))?;
    let cursor = text.lines().rev().find_map(|line| {
        let trimmed = line.trim();
        if trimmed.is_empty() || trimmed.starts_with("model,precision,") {
            return None;
        }
        // Columns: model, precision, elapsed_ms, iterations, ...
        let mut cols = trimmed.split(',');
        let elapsed_col = cols.nth(2)?;
        let iterations_col = cols.next()?;
        let elapsed_ms = elapsed_col.trim().parse::<u64>().ok()?;
        let iterations = iterations_col.trim().parse::<usize>().ok()?;
        Some(ResumeCursor {
            iterations,
            elapsed: Duration::from_millis(elapsed_ms),
        })
    });
    Ok(cursor.unwrap_or(fresh))
}
/// Ensures a decodable source model exists at `<trained_dir>/<model_file>`
/// and returns its path.
///
/// Resolution order:
/// 1. an existing file that still contains a decodable RNN snapshot;
/// 2. a copy of `sample_model/<precision_folder>/sample.rnn`, if decodable;
/// 3. a freshly bootstrapped model with a small deterministic weight pattern.
///
/// # Errors
/// Returns a message on any filesystem failure, or when even the freshly
/// bootstrapped model cannot be decoded back.
pub fn ensure_source_model(
    trained_dir: &Path,
    precision_folder: &str,
    model_file: &str,
) -> Result<PathBuf, String> {
    fs::create_dir_all(trained_dir)
        .map_err(|e| format!("failed to create {}: {e}", trained_dir.display()))?;
    let source_path = trained_dir.join(model_file);
    if source_path.exists() {
        let bytes = fs::read(&source_path)
            .map_err(|e| format!("failed to read {}: {e}", source_path.display()))?;
        // Keep the existing file only if it still holds a decodable snapshot.
        if extract_last_rnn_snapshot(&bytes).is_ok() {
            return Ok(source_path);
        }
    }
    let fallback = Path::new("sample_model")
        .join(precision_folder)
        .join("sample.rnn");
    if fallback.exists() {
        fs::copy(&fallback, &source_path).map_err(|e| {
            format!(
                "failed to bootstrap {} from {}: {e}",
                source_path.display(),
                fallback.display()
            )
        })?;
        // Re-read the copy to confirm it decodes; if not, fall through to the
        // synthetic bootstrap below.
        let copied = fs::read(&source_path)
            .map_err(|e| format!("failed to read {}: {e}", source_path.display()))?;
        if extract_last_rnn_snapshot(&copied).is_ok() {
            return Ok(source_path);
        }
    }
    // Last resort: synthesize a model with deterministic, small-magnitude weights.
    let topology = bootstrap_topology_for_model(model_file);
    let hidden_activation = ActivationKind::Relu;
    let output_activation = ActivationKind::Identity;
    let mut weights_len = 0usize;
    let mut biases_len = 0usize;
    for pair in topology.windows(2) {
        weights_len = weights_len.saturating_add(pair[0].saturating_mul(pair[1]));
        biases_len = biases_len.saturating_add(pair[1]);
    }
    let out_bytes = if precision_folder == "f64" {
        // Weight pattern: (31*i + 7) mod 200 mapped into [-1, 1), scaled to
        // [-0.05, 0.05); biases start at zero.
        let weights: Vec<f64> = (0..weights_len)
            .map(|i| {
                let base = ((i.wrapping_mul(31).wrapping_add(7)) % 200) as f64 / 100.0 - 1.0;
                base * 0.05
            })
            .collect();
        let biases = vec![0.0f64; biases_len];
        pack_with_benchmark_f64(
            topology,
            hidden_activation,
            output_activation,
            &weights,
            &biases,
            &[],
        )?
    } else {
        let weights: Vec<f32> = (0..weights_len)
            .map(|i| {
                let base = ((i.wrapping_mul(31).wrapping_add(7)) % 200) as f32 / 100.0 - 1.0;
                base * 0.05
            })
            .collect();
        let biases = vec![0.0f32; biases_len];
        pack_with_benchmark_f32(
            topology,
            hidden_activation,
            output_activation,
            &weights,
            &biases,
            &[],
        )?
    };
    fs::write(&source_path, &out_bytes)
        .map_err(|e| format!("failed to write {}: {e}", source_path.display()))?;
    // Sanity check: the freshly written bytes must round-trip through the reader.
    if extract_last_rnn_snapshot(&out_bytes).is_err() {
        return Err(format!(
            "missing source model for in-place training: {} (unable to build bootstrap model)",
            source_path.display()
        ));
    }
    Ok(source_path)
}
/// Returns the fixed 4-layer dense topology used when bootstrapping a model
/// file from scratch; unknown names get a small default topology.
fn bootstrap_topology_for_model(model_file: &str) -> &'static [usize] {
    const KNOWN: [(&str, &[usize]); 4] = [
        ("small.rnn", &[128, 256, 128, 64]),
        ("medium.rnn", &[256, 384, 256, 128]),
        ("large.rnn", &[384, 512, 384, 192]),
        ("enormous.rnn", &[512, 768, 512, 256]),
    ];
    const DEFAULT: &[usize] = &[64, 128, 64, 32];
    KNOWN
        .iter()
        .find(|(name, _)| *name == model_file)
        .map(|(_, topology)| *topology)
        .unwrap_or(DEFAULT)
}
/// Confirms that `bytes` decode as an RNN image whose scan reports an
/// embedded benchmark blob.
///
/// # Errors
/// Returns a message when the bytes fail to parse or when the scan does not
/// positively report a benchmark blob.
pub fn validate_rnn_has_benchmark(bytes: &[u8]) -> Result<(), String> {
    let readable = read_rnn(bytes, None)
        .map_err(|e| format!("invalid rnn bytes: benchmark parse error: {e:?}"))?;
    match readable.scan.has_benchmark {
        Some(true) => Ok(()),
        // None and Some(false) are both treated as "missing".
        _ => Err("invalid rnn bytes: missing benchmark blob".to_string()),
    }
}
/// Borrowed (layer_meta, weights, biases) blobs from a decoded RNN image.
type DenseCoreBlobs<'a> = (&'a [u8], &'a [u8], &'a [u8]);
/// Decodes `bytes` and returns the three core dense blobs, failing on the
/// first missing section (layer_meta, then weights, then biases).
pub fn extract_core_dense_blobs(bytes: &[u8]) -> Result<DenseCoreBlobs<'_>, String> {
    let readable =
        read_rnn(bytes, None).map_err(|e| format!("invalid rnn bytes: parse error: {e:?}"))?;
    // Arm order preserves the original error priority: layer_meta is reported
    // missing before weights, weights before biases.
    match (readable.layer_meta, readable.weights, readable.biases) {
        (Some(layer_meta), Some(weights), Some(biases)) => Ok((layer_meta, weights, biases)),
        (None, _, _) => Err("missing layer_meta blob".to_string()),
        (_, None, _) => Err("missing weights blob".to_string()),
        (_, _, None) => Err("missing biases blob".to_string()),
    }
}
/// Packs the final model bytes with an embedded benchmark blob, appends them
/// to `request.trained_path`, and writes the `.bmk`/`.json`/`.yaml` exports.
///
/// Packing runs twice: the first pass uses `baseline_output_bytes` just to
/// learn the true packed size; the second pass re-encodes the blob with that
/// measured size so the embedded `output_bytes` metric is self-consistent.
/// On success the three exports are cross-validated against each other.
///
/// # Errors
/// Propagates any packing, encoding, filesystem, or validation failure as a
/// descriptive message.
pub fn finalize_benchmark_outputs<F>(
    request: BenchmarkFinalizeRequest<'_>,
    mut pack_with_benchmark: F,
) -> Result<Vec<u8>, String>
where
    F: FnMut(&[u8]) -> Result<Vec<u8>, String>,
{
    // All four records below are identical except for the destination path
    // and the reported output size; build them from one place.
    let record = |path: PathBuf, output_bytes: usize| BenchmarkRecord {
        path,
        model_file: request.model_file,
        topology: request.topology,
        precision_label: request.precision_label,
        element_size_bytes: request.element_size_bytes,
        iterations: request.iterations,
        elapsed: request.elapsed,
        avg_loss: request.avg_loss,
        last_loss: request.last_loss,
        output_bytes,
        train_samples_per_cycle: request.train_samples_per_cycle,
    };
    // Pass 1: pack with the baseline size to measure the real packed size.
    let provisional_benchmark_blob =
        build_benchmark_blob(&record(PathBuf::new(), request.baseline_output_bytes))?;
    let provisional_out_bytes = pack_with_benchmark(&provisional_benchmark_blob)?;
    // Pass 2: re-encode with the measured size and pack the final bytes.
    let benchmark_blob = build_benchmark_blob(&record(PathBuf::new(), provisional_out_bytes.len()))?;
    let out_bytes = pack_with_benchmark(&benchmark_blob)?;
    validate_rnn_has_benchmark(&out_bytes)?;
    let mut trained_file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&request.trained_path)
        .map_err(|e| format!("failed to open {}: {e}", request.trained_path.display()))?;
    trained_file
        .write_all(&out_bytes)
        .map_err(|e| format!("failed to append {}: {e}", request.trained_path.display()))?;
    let benchmark_path = request.benchmark_dir.join(format!("{}.bmk", request.stem));
    write_benchmark(&record(benchmark_path.clone(), out_bytes.len()))?;
    let json_path = request.benchmark_dir.join(format!("{}.json", request.stem));
    write_benchmark_text_exports(&record(json_path.clone(), out_bytes.len()))?;
    let yaml_path = json_path.with_extension("yaml");
    parse_and_validate_benchmark_exports_paths(&benchmark_path, &json_path, &yaml_path)?;
    Ok(out_bytes)
}
/// Returns the newest decodable RNN snapshot contained in `bytes`.
///
/// Trained files are append-only, so one file may hold several concatenated
/// snapshots; candidates are located by their "RNN\0" magic and tried from
/// the last occurrence backwards. When no magic is present (or no candidate
/// decodes), the whole byte slice is tried as a final fallback.
///
/// # Errors
/// Returns a message when the input is too short, has no magic and does not
/// decode, or contains no decodable snapshot at all.
pub fn extract_last_rnn_snapshot(bytes: &[u8]) -> Result<Vec<u8>, String> {
    const MAGIC: &[u8] = b"RNN\x00";
    if bytes.len() < MAGIC.len() {
        return Err("invalid rnn bytes: too short".to_string());
    }
    let magic_offsets: Vec<usize> = bytes
        .windows(MAGIC.len())
        .enumerate()
        .filter_map(|(offset, window)| (window == MAGIC).then_some(offset))
        .collect();
    if magic_offsets.is_empty() {
        return if read_rnn(bytes, None).is_ok() {
            Ok(bytes.to_vec())
        } else {
            Err("invalid rnn bytes: bad magic".to_string())
        };
    }
    // Newest snapshot wins: try candidates from the last magic backwards.
    for offset in magic_offsets.into_iter().rev() {
        let candidate = &bytes[offset..];
        if read_rnn(candidate, None).is_ok() {
            return Ok(candidate.to_vec());
        }
    }
    if read_rnn(bytes, None).is_ok() {
        return Ok(bytes.to_vec());
    }
    Err("invalid rnn bytes: no decodable snapshot found".to_string())
}
/// Serializes an f32 dense network (`topology`, activations, flat `weights`
/// and `biases`) into RNN bytes, growing scratch buffers on demand.
///
/// The `_benchmark_blob` parameter is currently unused here: the call to
/// `build_f32_into` below does not take it.
///
/// Buffer strategy: start from size estimates, then retry with doubled
/// capacities whenever the packer reports `CapacityTooSmall`.
pub fn pack_with_benchmark_f32(
    topology: &[usize],
    hidden_activation: ActivationKind,
    output_activation: ActivationKind,
    weights: &[f32],
    biases: &[f32],
    _benchmark_blob: &[u8],
) -> Result<Vec<u8>, String> {
    let layer_count = topology.len().saturating_sub(1);
    // Placeholder specs; the packer fills in the real layer layouts.
    let mut layer_specs_scratch = vec![
        LayerSpec::Dense(LayerDesc {
            input_size: 1,
            output_size: 1,
            weight_offset: 0,
            bias_offset: 0,
            activation: ActivationKind::Identity,
        });
        layer_count.max(2)
    ];
    // Estimate: 64 bytes of slack + 4 bytes per f32 element, at least 1 KiB.
    let mut rmd1_scratch = vec![
        0u8;
        64usize
            .saturating_add(weights.len().saturating_mul(4))
            .saturating_add(biases.len().saturating_mul(4))
            .max(1024)
    ];
    let mut metadata_scratch = vec![0u8; 1024];
    let mut out = vec![
        0u8;
        rmd1_scratch
            .len()
            .saturating_mul(2)
            .saturating_add(8192)
            .max(8192)
    ];
    let activations = ActivationConfig {
        hidden: hidden_activation,
        output: output_activation,
    };
    loop {
        let buffers = PackBuffers {
            layer_specs_scratch: &mut layer_specs_scratch,
            rmd1_scratch: &mut rmd1_scratch,
            metadata_scratch: &mut metadata_scratch,
            out_bytes: &mut out,
        };
        match build_f32_into(topology, activations, weights, biases, buffers) {
            Ok(used) => {
                // `used` is the number of bytes actually written.
                out.truncate(used);
                return Ok(out);
            }
            Err(RnnApiError::CapacityTooSmall) => {
                // Double every buffer (with floors) and retry; saturating math
                // avoids overflow on pathological sizes.
                layer_specs_scratch.resize(
                    layer_specs_scratch.len().saturating_mul(2).max(2),
                    LayerSpec::Dense(LayerDesc {
                        input_size: 1,
                        output_size: 1,
                        weight_offset: 0,
                        bias_offset: 0,
                        activation: ActivationKind::Identity,
                    }),
                );
                rmd1_scratch.resize(rmd1_scratch.len().saturating_mul(2).max(2048), 0);
                metadata_scratch.resize(metadata_scratch.len().saturating_mul(2).max(2048), 0);
                out.resize(out.len().saturating_mul(2).max(8192), 0);
            }
            Err(e) => {
                return Err(format!(
                    "failed to encode rnn model with benchmark (f32): {e:?}"
                ))
            }
        }
    }
}
/// f64 twin of `pack_with_benchmark_f32`: serializes an f64 dense network
/// into RNN bytes, growing scratch buffers on demand.
///
/// The `_benchmark_blob` parameter is currently unused here: the call to
/// `build_f64_into` below does not take it.
pub fn pack_with_benchmark_f64(
    topology: &[usize],
    hidden_activation: ActivationKind,
    output_activation: ActivationKind,
    weights: &[f64],
    biases: &[f64],
    _benchmark_blob: &[u8],
) -> Result<Vec<u8>, String> {
    let layer_count = topology.len().saturating_sub(1);
    // Placeholder specs; the packer fills in the real layer layouts.
    let mut layer_specs_scratch = vec![
        LayerSpec::Dense(LayerDesc {
            input_size: 1,
            output_size: 1,
            weight_offset: 0,
            bias_offset: 0,
            activation: ActivationKind::Identity,
        });
        layer_count.max(2)
    ];
    // Estimate: 64 bytes of slack + 8 bytes per f64 element, at least 1 KiB.
    let mut rmd1_scratch = vec![
        0u8;
        64usize
            .saturating_add(weights.len().saturating_mul(8))
            .saturating_add(biases.len().saturating_mul(8))
            .max(1024)
    ];
    let mut metadata_scratch = vec![0u8; 1024];
    let mut out = vec![
        0u8;
        rmd1_scratch
            .len()
            .saturating_mul(2)
            .saturating_add(8192)
            .max(8192)
    ];
    let activations = ActivationConfig {
        hidden: hidden_activation,
        output: output_activation,
    };
    loop {
        let buffers = PackBuffers {
            layer_specs_scratch: &mut layer_specs_scratch,
            rmd1_scratch: &mut rmd1_scratch,
            metadata_scratch: &mut metadata_scratch,
            out_bytes: &mut out,
        };
        match build_f64_into(topology, activations, weights, biases, buffers) {
            Ok(used) => {
                // `used` is the number of bytes actually written.
                out.truncate(used);
                return Ok(out);
            }
            Err(RnnApiError::CapacityTooSmall) => {
                // Double every buffer (with floors) and retry; saturating math
                // avoids overflow on pathological sizes.
                layer_specs_scratch.resize(
                    layer_specs_scratch.len().saturating_mul(2).max(2),
                    LayerSpec::Dense(LayerDesc {
                        input_size: 1,
                        output_size: 1,
                        weight_offset: 0,
                        bias_offset: 0,
                        activation: ActivationKind::Identity,
                    }),
                );
                rmd1_scratch.resize(rmd1_scratch.len().saturating_mul(2).max(2048), 0);
                metadata_scratch.resize(metadata_scratch.len().saturating_mul(2).max(2048), 0);
                out.resize(out.len().saturating_mul(2).max(8192), 0);
            }
            Err(e) => {
                return Err(format!(
                    "failed to encode rnn model with benchmark (f64): {e:?}"
                ))
            }
        }
    }
}
/// Runs one SGD step on a dense network described by `layers`, updating
/// `weights` and `biases` in place and returning the step's loss.
///
/// All scratch slices are caller-provided so hot training loops perform no
/// allocations here; they are simply bundled into an `SgdScratch` for the
/// underlying `sgd_step` kernel.
#[allow(clippy::too_many_arguments)]
pub fn train_step(
    layers: &[usize],
    weights: &mut [f32],
    biases: &mut [f32],
    input: &[f32],
    target: &[f32],
    layer_specs_scratch: &mut [LayerSpec],
    activations_scratch: &mut [f32],
    deltas_scratch: &mut [f32],
    config: SgdConfig,
) -> Result<f32, TrainError> {
    sgd_step(
        layers,
        weights,
        biases,
        input,
        target,
        &mut SgdScratch {
            layer_specs_scratch,
            activations_scratch,
            deltas_scratch,
        },
        config,
    )
}
/// Pure delegation to `pack_with_benchmark_f32`; kept so callers can use the
/// "dense native" name without depending on the packing implementation.
pub fn pack_dense_native_with_benchmark_f32(
    topology: &[usize],
    hidden_activation: ActivationKind,
    output_activation: ActivationKind,
    weights: &[f32],
    biases: &[f32],
    benchmark_blob: &[u8],
) -> Result<Vec<u8>, String> {
    pack_with_benchmark_f32(
        topology,
        hidden_activation,
        output_activation,
        weights,
        biases,
        benchmark_blob,
    )
}
/// Pure delegation to `pack_with_benchmark_f64`; f64 twin of
/// `pack_dense_native_with_benchmark_f32`.
pub fn pack_dense_native_with_benchmark_f64(
    topology: &[usize],
    hidden_activation: ActivationKind,
    output_activation: ActivationKind,
    weights: &[f64],
    biases: &[f64],
    benchmark_blob: &[u8],
) -> Result<Vec<u8>, String> {
    pack_with_benchmark_f64(
        topology,
        hidden_activation,
        output_activation,
        weights,
        biases,
        benchmark_blob,
    )
}
/// Pure delegation to `train_step`; kept so callers can use the "dense" name
/// for one SGD update on a dense network.
#[allow(clippy::too_many_arguments)]
pub fn train_dense_step(
    layers: &[usize],
    weights: &mut [f32],
    biases: &mut [f32],
    input: &[f32],
    target: &[f32],
    layer_specs_scratch: &mut [LayerSpec],
    activations_scratch: &mut [f32],
    deltas_scratch: &mut [f32],
    config: SgdConfig,
) -> Result<f32, TrainError> {
    train_step(
        layers,
        weights,
        biases,
        input,
        target,
        layer_specs_scratch,
        activations_scratch,
        deltas_scratch,
        config,
    )
}
/// Convenience constructor: opens a live benchmark snapshot that appends to
/// `<benchmark_dir>/<stem>.csv` at the given emission interval.
pub fn create_live_snapshot(
    benchmark_dir: &Path,
    stem: &str,
    interval_ms: u64,
) -> Result<LiveBenchmarkSnapshot, String> {
    let csv_path = benchmark_dir.join(format!("{}.csv", stem));
    LiveBenchmarkSnapshot::new(csv_path, interval_ms)
}
/// Builds `train_samples_per_cycle` factored samples, one scoped thread per
/// lane; each lane seeds with `base_iteration + lane`, so a batch is fully
/// deterministic for a given base iteration.
///
/// # Errors
/// Returns an error if any worker thread panics.
pub fn build_parallel_factored_batch(
    base_iteration: usize,
    input_size: usize,
    output_size: usize,
    train_samples_per_cycle: usize,
    training_order: TrainingOrder,
    kernel_cfg: FactorizedKernelConfig,
) -> Result<Vec<FactoredSample>, String> {
    // Scoped threads let each worker capture the loop-local `lane` and the
    // Copy config by move without requiring 'static lifetimes.
    std::thread::scope(|scope| {
        let mut handles = Vec::with_capacity(train_samples_per_cycle);
        for lane in 0..train_samples_per_cycle {
            handles.push(scope.spawn(move || {
                let sample_seed = base_iteration.saturating_add(lane);
                let variant = training_order.sample_variant(sample_seed);
                let mut input = vec![0.0f32; input_size];
                let mut target = vec![0.0f32; output_size];
                synthetic_sample_in_place(
                    sample_seed,
                    variant,
                    kernel_cfg.input_noise_amplitude,
                    &mut input,
                    &mut target,
                );
                apply_factorized_kernel(sample_seed, lane, &mut input, &mut target, kernel_cfg);
                FactoredSample { input, target }
            }));
        }
        // Join in lane order so the output batch is ordered deterministically.
        let mut out = Vec::with_capacity(train_samples_per_cycle);
        for handle in handles {
            let sample = handle
                .join()
                .map_err(|_| "parallel sample worker panicked".to_string())?;
            out.push(sample);
        }
        Ok(out)
    })
}
/// Mixes low-rank "factor" statistics back into `input` and `target` so each
/// lane gets correlated structure on top of the raw synthetic sample, then
/// runs the periodic corruption pass.
///
/// Factors are tanh-squashed bucket means of input elements (bucket
/// assignment rotates with `sample_seed` and the model kernel id). Inputs
/// blend toward two per-element factors; targets blend toward factors plus
/// the mixed input's RMS energy. A no-op for empty input or target slices.
fn apply_factorized_kernel(
    sample_seed: usize,
    lane: usize,
    input: &mut [f32],
    target: &mut [f32],
    kernel_cfg: FactorizedKernelConfig,
) {
    if input.is_empty() || target.is_empty() {
        return;
    }
    // Clamp the rank into 1..=input.len(): a configured factor_rank of 0
    // would otherwise make every `% rank` below panic (remainder by zero).
    let rank = kernel_cfg.factor_rank.clamp(1, input.len());
    let mut factors = vec![0.0f32; rank];
    let mut counts = vec![0usize; rank];
    for (idx, value) in input.iter().copied().enumerate() {
        let factor_idx = (idx + sample_seed + kernel_cfg.model_kernel_id * 5) % rank;
        factors[factor_idx] += value;
        counts[factor_idx] = counts[factor_idx].saturating_add(1);
    }
    for idx in 0..rank {
        // Bucket mean (count floored at 1 to guard empty buckets), squashed.
        let count = counts[idx].max(1) as f32;
        factors[idx] = (factors[idx] / count).tanh();
    }
    for (idx, slot) in input.iter_mut().enumerate() {
        let a = factors[idx % rank];
        let b = factors[(idx
            .wrapping_mul(kernel_cfg.vector_stride)
            .wrapping_add(lane + kernel_cfg.model_kernel_id))
            % rank];
        let mixed = (1.0 - kernel_cfg.kernel_gain) * *slot
            + (kernel_cfg.kernel_gain * 0.65) * a
            + (kernel_cfg.kernel_gain * 0.35) * b;
        *slot = mixed.tanh();
    }
    // RMS energy of the mixed input feeds into every target head.
    let mut energy = 0.0f32;
    for value in input.iter().copied() {
        energy += value * value;
    }
    energy = (energy / input.len() as f32).sqrt();
    for (idx, slot) in target.iter_mut().enumerate() {
        let latent = factors[(idx + lane + kernel_cfg.model_kernel_id) % rank];
        let harmonic =
            factors[(idx.wrapping_mul(kernel_cfg.vector_stride + 1) + sample_seed) % rank];
        let mixed = 0.72 * *slot
            + kernel_cfg.target_feedback * latent
            + (kernel_cfg.target_feedback * 0.5) * harmonic
            + 0.08 * energy;
        *slot = mixed.tanh();
    }
    attempt_training_corruption(sample_seed, lane, input, target, kernel_cfg);
}
/// Periodically perturbs a sample in place; a no-op unless the combined
/// trigger seed lands on a multiple of `corruption_every_n` (and a guaranteed
/// no-op when that field is 0).
///
/// NOTE(review): relies on `centered_noise` (defined elsewhere in this file)
/// only for the sign of each perturbation — confirm it is zero-centered.
fn attempt_training_corruption(
    sample_seed: usize,
    lane: usize,
    input: &mut [f32],
    target: &mut [f32],
    kernel_cfg: FactorizedKernelConfig,
) {
    if kernel_cfg.corruption_every_n == 0 {
        return;
    }
    // Mix seed, lane, and kernel id so different lanes/models corrupt on
    // different iterations.
    let trigger_seed = sample_seed
        .wrapping_add(lane.wrapping_mul(13))
        .wrapping_add(kernel_cfg.model_kernel_id.wrapping_mul(17));
    if !trigger_seed.is_multiple_of(kernel_cfg.corruption_every_n) {
        return;
    }
    // Stride is >= 1 by construction, so step_by below cannot panic.
    let stride = (kernel_cfg.vector_stride.max(1) + kernel_cfg.model_kernel_id).max(1);
    for idx in (0..input.len()).step_by(stride) {
        // Noise only determines the direction of the fixed-size perturbation.
        let direction = if centered_noise(sample_seed, idx, 701) >= 0.0 {
            1.0
        } else {
            -1.0
        };
        let corrupted = input[idx] + direction * kernel_cfg.corruption_strength;
        input[idx] = corrupted.clamp(-1.0, 1.0);
    }
    // Targets are corrupted on a denser stride and damped toward the noise.
    let target_stride = (stride / 2).max(1);
    for idx in (0..target.len()).step_by(target_stride) {
        let sign = if centered_noise(sample_seed, idx, 911) >= 0.0 {
            1.0
        } else {
            -1.0
        };
        let blended = 0.80 * target[idx] + sign * kernel_cfg.corruption_strength * 0.75;
        target[idx] = blended.clamp(-1.0, 1.0);
    }
}
/// Appends the record to `record.path` as one framed binary entry:
/// 4-byte magic "BMKF", little-endian u32 payload length, then the payload
/// produced by `build_benchmark_blob`.
///
/// # Errors
/// Returns a message when the blob cannot be built, is too large to frame in
/// a u32, or any file operation fails. (The file may already have been
/// created/opened when the length check fails.)
pub fn write_benchmark(record: &BenchmarkRecord<'_>) -> Result<(), String> {
    let content = build_benchmark_blob(record)?;
    let mut file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&record.path)
        .map_err(|e| format!("failed to open {}: {e}", record.path.display()))?;
    let len = u32::try_from(content.len())
        .map_err(|_| format!("benchmark blob too large for framing: {}", content.len()))?;
    // All three writes report failures with the same append message.
    let append_err = |e: std::io::Error| format!("failed to append {}: {e}", record.path.display());
    file.write_all(b"BMKF").map_err(append_err)?;
    file.write_all(&len.to_le_bytes()).map_err(append_err)?;
    file.write_all(&content).map_err(append_err)
}
/// Appends the record as a JSON object to `record.path` and as a
/// `---`-separated YAML document to the same path with a `.yaml` extension.
///
/// Both files are opened in append mode, so repeated runs accumulate one
/// document per run. Note: appending multiple JSON objects to one file
/// yields a stream of objects rather than a single valid JSON document.
pub fn write_benchmark_text_exports(record: &BenchmarkRecord<'_>) -> Result<(), String> {
    let metrics = to_metrics(record);
    let topology_str = format_topology(record.topology);
    let json_path = record.path.clone();
    let yaml_path = json_path.with_extension("yaml");
    // Both bodies carry the same fields in the same order; only the syntax
    // differs.
    let json = format!(
        "{{\n \"model\": \"{}\",\n \"precision\": \"{}\",\n \"topology\": {},\n \"elapsed_ms\": {},\n \"iterations\": {},\n \"train_samples\": {},\n \"train_samples_per_cycle\": {},\n \"element_size_bytes\": {},\n \"avg_loss\": {},\n \"last_loss\": {},\n \"min_loss\": {},\n \"max_loss\": {},\n \"loss_stddev\": {},\n \"output_bytes\": {},\n \"total_params\": {},\n \"layer_count\": {},\n \"input_dim\": {},\n \"output_dim\": {},\n \"benchmark_flags\": {},\n \"weights_bytes\": {},\n \"biases_bytes\": {},\n \"iterations_per_sec\": {},\n \"samples_per_sec\": {}\n}}\n",
        metrics.model_name,
        metrics.precision,
        topology_str,
        metrics.elapsed_ms,
        metrics.iterations,
        metrics.train_samples,
        record.train_samples_per_cycle,
        record.element_size_bytes,
        metrics.avg_loss,
        metrics.last_loss,
        metrics.min_loss,
        metrics.max_loss,
        metrics.loss_stddev,
        metrics.output_bytes,
        metrics.total_params,
        metrics.layer_count,
        metrics.input_dim,
        metrics.output_dim,
        metrics.benchmark_flags,
        metrics.weights_bytes,
        metrics.biases_bytes,
        metrics.iterations_per_sec,
        metrics.samples_per_sec,
    );
    let yaml = format!(
        "model: {}\nprecision: {}\ntopology: {}\nelapsed_ms: {}\niterations: {}\ntrain_samples: {}\ntrain_samples_per_cycle: {}\nelement_size_bytes: {}\navg_loss: {}\nlast_loss: {}\nmin_loss: {}\nmax_loss: {}\nloss_stddev: {}\noutput_bytes: {}\ntotal_params: {}\nlayer_count: {}\ninput_dim: {}\noutput_dim: {}\nbenchmark_flags: {}\nweights_bytes: {}\nbiases_bytes: {}\niterations_per_sec: {}\nsamples_per_sec: {}\n",
        metrics.model_name,
        metrics.precision,
        topology_str,
        metrics.elapsed_ms,
        metrics.iterations,
        metrics.train_samples,
        record.train_samples_per_cycle,
        record.element_size_bytes,
        metrics.avg_loss,
        metrics.last_loss,
        metrics.min_loss,
        metrics.max_loss,
        metrics.loss_stddev,
        metrics.output_bytes,
        metrics.total_params,
        metrics.layer_count,
        metrics.input_dim,
        metrics.output_dim,
        metrics.benchmark_flags,
        metrics.weights_bytes,
        metrics.biases_bytes,
        metrics.iterations_per_sec,
        metrics.samples_per_sec,
    );
    let mut json_file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&json_path)
        .map_err(|e| format!("failed to open {}: {e}", json_path.display()))?;
    json_file
        .write_all(json.as_bytes())
        .map_err(|e| format!("failed to append {}: {e}", json_path.display()))?;
    let mut yaml_file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&yaml_path)
        .map_err(|e| format!("failed to open {}: {e}", yaml_path.display()))?;
    // Each appended YAML run starts with a document separator.
    yaml_file
        .write_all(b"---\n")
        .map_err(|e| format!("failed to append {}: {e}", yaml_path.display()))?;
    yaml_file
        .write_all(yaml.as_bytes())
        .map_err(|e| format!("failed to append {}: {e}", yaml_path.display()))?;
    Ok(())
}
/// Encodes the record's metrics into a benchmark blob sized exactly to its
/// encoded length.
///
/// # Errors
/// Returns a message when the encoded size cannot be computed or encoding
/// fails.
pub fn build_benchmark_blob(record: &BenchmarkRecord<'_>) -> Result<Vec<u8>, String> {
    let metrics = to_metrics(record);
    let capacity = encoded_size_benchmark_blob(&metrics)
        .ok_or_else(|| "failed to compute benchmark blob size".to_string())?;
    let mut blob = vec![0u8; capacity];
    let written = encode_benchmark_blob(&metrics, &mut blob)
        .map_err(|e| format!("failed to encode benchmark blob: {e:?}"))?;
    // Trim in case the encoder used less than the computed capacity.
    blob.truncate(written);
    Ok(blob)
}
/// Derives the full `BenchmarkMetrics` view from a `BenchmarkRecord`,
/// computing parameter counts, byte sizes, throughput rates, and loss
/// spread statistics.
fn to_metrics<'a>(record: &'a BenchmarkRecord<'a>) -> BenchmarkMetrics<'a> {
    let (weight_count, bias_count) = count_params(record.topology);
    let train_samples = record
        .iterations
        .saturating_mul(record.train_samples_per_cycle) as u64;
    let elapsed_secs = record.elapsed.as_secs_f32();
    // Rates report 0 until any wall time has accrued.
    let per_second = |count: f32| {
        if elapsed_secs > 0.0 {
            count / elapsed_secs
        } else {
            0.0
        }
    };
    // f32::min/max keep the original NaN-propagation behavior.
    let min_loss = record.avg_loss.min(record.last_loss);
    let max_loss = record.avg_loss.max(record.last_loss);
    BenchmarkMetrics {
        model_name: record.model_file,
        precision: record.precision_label,
        elapsed_ms: record.elapsed.as_millis() as u64,
        iterations: record.iterations as u64,
        train_samples,
        avg_loss: record.avg_loss,
        last_loss: record.last_loss,
        output_bytes: record.output_bytes,
        total_params: weight_count.saturating_add(bias_count) as u64,
        layer_count: record.topology.len().saturating_sub(1) as u32,
        input_dim: record.topology.first().copied().unwrap_or(0) as u32,
        output_dim: record.topology.last().copied().unwrap_or(0) as u32,
        benchmark_flags: 0,
        weights_bytes: (weight_count as u64).saturating_mul(record.element_size_bytes),
        biases_bytes: (bias_count as u64).saturating_mul(record.element_size_bytes),
        min_loss,
        max_loss,
        // Only two loss samples are tracked, so the "stddev" is approximated
        // as half their spread.
        loss_stddev: (max_loss - min_loss) * 0.5,
        iterations_per_sec: per_second(record.iterations as f32),
        samples_per_sec: per_second(train_samples as f32),
    }
}
/// Counts total dense weights and biases for a layer-width topology:
/// weights = sum of in*out over adjacent layer pairs, biases = sum of
/// output widths. Saturating math guards absurd topologies.
fn count_params(topology: &[usize]) -> (usize, usize) {
    topology
        .windows(2)
        .fold((0usize, 0usize), |(weights, biases), pair| {
            (
                weights.saturating_add(pair[0].saturating_mul(pair[1])),
                biases.saturating_add(pair[1]),
            )
        })
}
/// Renders a topology as a bracketed, comma-separated list, e.g. "[1,2,3]"
/// (and "[]" for an empty slice).
fn format_topology(topology: &[usize]) -> String {
    let inner = topology
        .iter()
        .map(|v| v.to_string())
        .collect::<Vec<_>>()
        .join(",");
    format!("[{inner}]")
}
/// Core fields shared by all three benchmark export formats; used only to
/// cross-check that the binary, JSON, and YAML exports agree.
#[derive(Clone, Debug)]
struct ParsedBenchmarkCore {
    model_name: String,
    precision: String,
    elapsed_ms: u64,
    iterations: u64,
    train_samples: u64,
    avg_loss: f32,
    last_loss: f32,
    output_bytes: usize,
}
/// Cross-checks the binary `.bmk`, JSON, and YAML exports for one benchmark
/// run and removes legacy `.final.csv` / `.finall.csv` leftovers on success.
///
/// # Errors
/// Returns an error when any file fails to parse or when the exports
/// disagree on the core fields (see `ensure_core_match`).
pub fn parse_and_validate_benchmark_exports_paths(
    bench_path: &Path,
    json_path: &Path,
    yaml_path: &Path,
) -> Result<(), String> {
    let parsed_bench = parse_bench_file(bench_path)?;
    let parsed_json = parse_json_file(json_path)?;
    let parsed_yaml = parse_yaml_file(yaml_path)?;
    // The binary export is the reference; both text exports must agree with it.
    ensure_core_match(&parsed_bench, &parsed_json, "bench", "json")?;
    ensure_core_match(&parsed_bench, &parsed_yaml, "bench", "yaml")?;
    let benchmark_dir = bench_path.parent().unwrap_or_else(|| Path::new("."));
    let stem = bench_path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("benchmark");
    // Best-effort cleanup of outputs from older versions; removal errors are
    // deliberately ignored.
    let old_final = benchmark_dir.join(format!("{}.final.csv", stem));
    if old_final.exists() {
        let _ = fs::remove_file(&old_final);
    }
    let old_finall = benchmark_dir.join(format!("{}.finall.csv", stem));
    if old_finall.exists() {
        let _ = fs::remove_file(&old_finall);
    }
    Ok(())
}
/// Verifies two parsed exports agree: exact equality on the identity and
/// counter fields, and agreement within 1e-4 on the two loss fields.
///
/// # Errors
/// Returns a "benchmark mismatch" message naming both sides on any
/// disagreement.
fn ensure_core_match(
    left: &ParsedBenchmarkCore,
    right: &ParsedBenchmarkCore,
    left_name: &str,
    right_name: &str,
) -> Result<(), String> {
    const LOSS_TOLERANCE: f32 = 1e-4;
    // The `> tolerance` direction (rather than `<=` for a match) preserves
    // the original handling of NaN losses.
    let mismatch = left.model_name != right.model_name
        || left.precision != right.precision
        || left.elapsed_ms != right.elapsed_ms
        || left.iterations != right.iterations
        || left.train_samples != right.train_samples
        || left.output_bytes != right.output_bytes
        || (left.avg_loss - right.avg_loss).abs() > LOSS_TOLERANCE
        || (left.last_loss - right.last_loss).abs() > LOSS_TOLERANCE;
    if mismatch {
        Err(format!(
            "benchmark mismatch between {} and {}",
            left_name, right_name
        ))
    } else {
        Ok(())
    }
}
/// Recovers the core benchmark fields from a `.bmk` file written by
/// `write_benchmark`.
///
/// Decode strategy, in order:
/// 1. scan backwards for the newest well-framed "BMKF" record;
/// 2. try the whole file as one raw benchmark blob;
/// 3. fall back to the last occurrence of the "BMK\x01" magic.
fn parse_bench_file(path: &Path) -> Result<ParsedBenchmarkCore, String> {
    let bytes = fs::read(path).map_err(|e| format!("failed to read {}: {e}", path.display()))?;
    if bytes.len() >= 8 {
        // Newest frame wins: iterate candidate offsets from the end.
        for idx in (0..=bytes.len() - 8).rev() {
            if &bytes[idx..idx + 4] != b"BMKF" {
                continue;
            }
            // Frame layout: 4-byte magic, little-endian u32 length, payload.
            let len = u32::from_le_bytes([
                bytes[idx + 4],
                bytes[idx + 5],
                bytes[idx + 6],
                bytes[idx + 7],
            ]) as usize;
            let start = idx + 8;
            let end = start.saturating_add(len);
            if end > bytes.len() {
                // Truncated frame or a spurious magic match — keep scanning.
                continue;
            }
            if let Ok(view) = decode_benchmark_blob(&bytes[start..end]) {
                return Ok(ParsedBenchmarkCore {
                    model_name: view.model_name.to_string(),
                    precision: view.precision.to_string(),
                    elapsed_ms: view.elapsed_ms,
                    iterations: view.iterations,
                    train_samples: view.train_samples,
                    avg_loss: view.avg_loss,
                    last_loss: view.last_loss,
                    output_bytes: view.output_bytes,
                });
            }
        }
    }
    // Unframed fallback: the whole file may be one raw blob.
    if let Ok(view) = decode_benchmark_blob(&bytes) {
        return Ok(ParsedBenchmarkCore {
            model_name: view.model_name.to_string(),
            precision: view.precision.to_string(),
            elapsed_ms: view.elapsed_ms,
            iterations: view.iterations,
            train_samples: view.train_samples,
            avg_loss: view.avg_loss,
            last_loss: view.last_loss,
            output_bytes: view.output_bytes,
        });
    }
    // Final fallback: decode from the last "BMK\x01" magic in the file.
    let mut found = None;
    if bytes.len() >= 4 {
        for idx in 0..=bytes.len() - 4 {
            if &bytes[idx..idx + 4] == b"BMK\x01" {
                found = Some(idx);
            }
        }
    }
    let Some(last_idx) = found else {
        return Err(format!(
            "failed to decode {}: benchmark magic not found",
            path.display()
        ));
    };
    let view = decode_benchmark_blob(&bytes[last_idx..])
        .map_err(|e| format!("failed to decode {}: {e:?}", path.display()))?;
    Ok(ParsedBenchmarkCore {
        model_name: view.model_name.to_string(),
        precision: view.precision.to_string(),
        elapsed_ms: view.elapsed_ms,
        iterations: view.iterations,
        train_samples: view.train_samples,
        avg_loss: view.avg_loss,
        last_loss: view.last_loss,
        output_bytes: view.output_bytes,
    })
}
/// Reads the JSON benchmark export at `path` and pulls out the core fields
/// with the line-oriented field scanners.
///
/// # Errors
/// Returns a message if the file is unreadable or any field is missing or
/// malformed.
fn parse_json_file(path: &Path) -> Result<ParsedBenchmarkCore, String> {
    let text = fs::read_to_string(path)
        .map_err(|err| format!("failed to read {}: {err}", path.display()))?;
    let core = ParsedBenchmarkCore {
        model_name: parse_json_string_field(&text, "model")?,
        precision: parse_json_string_field(&text, "precision")?,
        elapsed_ms: parse_json_u64_field(&text, "elapsed_ms")?,
        iterations: parse_json_u64_field(&text, "iterations")?,
        train_samples: parse_json_u64_field(&text, "train_samples")?,
        avg_loss: parse_json_f32_field(&text, "avg_loss")?,
        last_loss: parse_json_f32_field(&text, "last_loss")?,
        output_bytes: parse_json_usize_field(&text, "output_bytes")?,
    };
    Ok(core)
}
/// Reads the YAML benchmark export at `path` and pulls out the core fields
/// with the line-oriented field scanners.
///
/// # Errors
/// Returns a message if the file is unreadable or any field is missing or
/// malformed.
fn parse_yaml_file(path: &Path) -> Result<ParsedBenchmarkCore, String> {
    let text = fs::read_to_string(path)
        .map_err(|err| format!("failed to read {}: {err}", path.display()))?;
    let core = ParsedBenchmarkCore {
        model_name: parse_yaml_string_field(&text, "model")?,
        precision: parse_yaml_string_field(&text, "precision")?,
        elapsed_ms: parse_yaml_u64_field(&text, "elapsed_ms")?,
        iterations: parse_yaml_u64_field(&text, "iterations")?,
        train_samples: parse_yaml_u64_field(&text, "train_samples")?,
        avg_loss: parse_yaml_f32_field(&text, "avg_loss")?,
        last_loss: parse_yaml_f32_field(&text, "last_loss")?,
        output_bytes: parse_yaml_usize_field(&text, "output_bytes")?,
    };
    Ok(core)
}
/// Extracts `key` from the JSON text and requires its raw token to be a
/// double-quoted string; returns the unquoted contents.
fn parse_json_string_field(text: &str, key: &str) -> Result<String, String> {
    let raw = parse_json_raw_field(text, key)?;
    match raw.strip_prefix('"').and_then(|inner| inner.strip_suffix('"')) {
        Some(inner) => Ok(inner.to_string()),
        None => Err(format!("json field {} is not a string", key)),
    }
}
/// Extracts `key` from the JSON text and parses its raw token as a `u64`.
fn parse_json_u64_field(text: &str, key: &str) -> Result<u64, String> {
    parse_json_raw_field(text, key)?
        .parse()
        .map_err(|e| format!("json field {} parse error: {e}", key))
}
/// Extracts `key` from the JSON text and parses its raw token as a `usize`.
fn parse_json_usize_field(text: &str, key: &str) -> Result<usize, String> {
    parse_json_raw_field(text, key)?
        .parse()
        .map_err(|e| format!("json field {} parse error: {e}", key))
}
/// Extracts `key` from the JSON text and parses its raw token as an `f32`.
fn parse_json_f32_field(text: &str, key: &str) -> Result<f32, String> {
    parse_json_raw_field(text, key)?
        .parse()
        .map_err(|e| format!("json field {} parse error: {e}", key))
}
/// Line-oriented scan for `"key": value` in minimal exporter-produced JSON.
///
/// Keeps the LAST occurrence so a re-appended export overrides earlier
/// records in the same file. The returned token is trimmed and stripped of
/// trailing commas but otherwise raw (string values keep their quotes).
///
/// # Errors
/// Reports a malformed line (no `:`) or a missing key.
fn parse_json_raw_field<'a>(text: &'a str, key: &str) -> Result<&'a str, String> {
    let needle = format!("\"{}\"", key);
    let mut latest: Option<&'a str> = None;
    for raw_line in text.lines() {
        let line = raw_line.trim();
        if !line.starts_with(&needle) {
            continue;
        }
        match line.split_once(':') {
            Some((_, rhs)) => latest = Some(rhs.trim().trim_end_matches(',')),
            None => return Err(format!("json malformed field {}", key)),
        }
    }
    latest.ok_or_else(|| format!("missing json field {}", key))
}
/// Extracts `key` from the YAML text; the scalar is taken verbatim (no quote
/// handling is performed for YAML values).
fn parse_yaml_string_field(text: &str, key: &str) -> Result<String, String> {
    parse_yaml_raw_field(text, key).map(|value| value.to_string())
}
/// Extracts `key` from the YAML text and parses its scalar as a `u64`.
fn parse_yaml_u64_field(text: &str, key: &str) -> Result<u64, String> {
    let raw = parse_yaml_raw_field(text, key)?;
    raw.parse()
        .map_err(|e| format!("yaml field {} parse error: {e}", key))
}
/// Extracts `key` from the YAML text and parses its scalar as a `usize`.
fn parse_yaml_usize_field(text: &str, key: &str) -> Result<usize, String> {
    let raw = parse_yaml_raw_field(text, key)?;
    raw.parse()
        .map_err(|e| format!("yaml field {} parse error: {e}", key))
}
/// Extracts `key` from the YAML text and parses its scalar as an `f32`.
fn parse_yaml_f32_field(text: &str, key: &str) -> Result<f32, String> {
    let raw = parse_yaml_raw_field(text, key)?;
    raw.parse()
        .map_err(|e| format!("yaml field {} parse error: {e}", key))
}
/// Line-oriented scan for `key: value` in minimal exporter-produced YAML.
///
/// Keeps the LAST occurrence so a re-appended export overrides earlier
/// records in the same file; the returned scalar is whitespace-trimmed.
///
/// # Errors
/// Reports a missing key.
fn parse_yaml_raw_field<'a>(text: &'a str, key: &str) -> Result<&'a str, String> {
    let needle = format!("{}:", key);
    let mut latest = None;
    for raw_line in text.lines() {
        if let Some(rest) = raw_line.trim().strip_prefix(&needle) {
            latest = Some(rest.trim());
        }
    }
    latest.ok_or_else(|| format!("missing yaml field {}", key))
}
/// Deterministic pseudo-noise in `[-1, 1]` derived from `(step, idx, salt)`.
///
/// The seed is combined with large odd multipliers and run through a
/// SplitMix64-style xor-shift/multiply finalizer; the low 16 bits of the
/// mixed word are then mapped linearly onto the centered unit interval.
fn centered_noise(step: usize, idx: usize, salt: usize) -> f32 {
    let seed = (step as u64)
        .wrapping_mul(0x9E37_79B9_7F4A_7C15)
        .wrapping_add((idx as u64).wrapping_mul(0xBF58_476D_1CE4_E5B9))
        .wrapping_add(salt as u64);
    let mut mixed = seed;
    mixed = (mixed ^ (mixed >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    mixed = (mixed ^ (mixed >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    mixed ^= mixed >> 31;
    // Low 16 bits -> [0, 1] -> [-1, 1]; identical operation order to the
    // original so the f32 rounding matches bit-for-bit.
    ((mixed & 0xFFFF) as f32) / 65535.0 * 2.0 - 1.0
}