dreamwell-intelligence 1.0.0

// QCT — Quantum Causal Transformer.
//
// Stack of quantum attention blocks. Each block: embed → evolve → dephase → measure → project.
// The full transformer: tokenize → [QCT block × N] → readout → loss.
//
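// Clean Compute goal: all buffers pre-allocated per block, no dynamic allocation in the forward pass.
// NOTE: `QCT::forward` in this file still allocates its state, value and logit vectors on every call.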

use crate::attention::{attention_project, quantum_causal_attention, AttentionOutput};
use crate::density_matrix::DensityMatrixN;
use crate::embed::QuantumEmbedding;
use crate::hamiltonian::LearnedHamiltonian;

/// Configuration for a QCT model.
///
/// Consumed by `QCT::new`, which sizes the embedding, the block stack and the
/// output projection from these fields.
#[derive(Clone, Debug)]
pub struct QCTConfig {
    /// Vocabulary size (number of unique tokens). Also the width of every logit row
    /// produced by `QCT::forward`.
    pub vocab_size: usize,
    /// Model dimension (density matrix size per token).
    pub dim: usize,
    /// Number of QCT blocks (layers).
    pub num_blocks: usize,
    /// Random seed for initialization. The value and output weights are derived
    /// deterministically from it; it is also handed to the embedding and Hamiltonian
    /// constructors.
    pub seed: u64,
}

impl Default for QCTConfig {
    fn default() -> Self {
        Self {
            vocab_size: 65, // ASCII printable (nanoGPT Shakespeare default)
            dim: 5,         // 5-mode density matrix (matches our toy models)
            num_blocks: 2,  // 2 blocks for proof of concept
            seed: 42,
        }
    }
}

/// A single QCT block: quantum attention + value projection.
///
/// NOTE(review): `value_weights` is initialized in `QCTBlock::new` and counted by
/// `num_params`, but `QCTBlock::forward` in this file never reads it — confirm whether
/// the projection is applied elsewhere.
#[derive(Clone)]
pub struct QCTBlock {
    /// Learned Hamiltonian handed to the attention routine in `forward`; its `dim`
    /// field is also passed to the value projection.
    pub hamiltonian: LearnedHamiltonian,
    /// Value projection weights [dim × dim]. Maps populations → output features.
    pub value_weights: Vec<f32>,
}

/// Reciprocal of the golden ratio, 1/φ ≈ 0.618. Used as the value-weight init scale.
const PHI_INV: f32 = 0.618033988;

impl QCTBlock {
    /// Build a block for `dim`-dimensional states with deterministic, seed-derived weights.
    ///
    /// The `dim * dim` value weights are spread over `[-1/φ, 1/φ)`; the Hamiltonian is
    /// constructed from the same `dim` and `seed`.
    pub fn new(dim: usize, seed: u64) -> Self {
        // Each weight mixes (seed, index) with a wrapping 64-bit multiply, folds the result
        // into 2000 buckets mapped onto [-1, 1), and scales by 1/φ.
        let value_weights: Vec<f32> = (0..dim * dim)
            .map(|idx| {
                let mixed = seed
                    .wrapping_add((idx + 1000) as u64)
                    .wrapping_mul(0x94d049bb133111eb);
                PHI_INV * ((mixed % 2000) as f32 / 1000.0 - 1.0)
            })
            .collect();

        Self {
            hamiltonian: LearnedHamiltonian::new(dim, seed),
            value_weights,
        }
    }

    /// Run one block over a sequence.
    ///
    /// Takes the per-token density matrices and value vectors, computes attention with
    /// this block's Hamiltonian, and projects `values` through it. Returns the attention
    /// output together with the projected value vectors. `value_weights` is not
    /// consulted here.
    pub fn forward(&self, states: &[DensityMatrixN], values: &[Vec<f32>]) -> (AttentionOutput, Vec<Vec<f32>>) {
        let attention = quantum_causal_attention(states, &self.hamiltonian);
        let mixed_values = attention_project(&attention, values, self.hamiltonian.dim);
        (attention, mixed_values)
    }

    /// Learnable parameter count: Hamiltonian parameters plus value weights.
    pub fn num_params(&self) -> usize {
        let hamiltonian_params = self.hamiltonian.num_params();
        hamiltonian_params + self.value_weights.len()
    }
}

/// The full Quantum Causal Transformer.
///
/// Holds the token embedding, the stack of blocks and the final projection to
/// vocabulary logits.
#[derive(Clone)]
pub struct QCT {
    /// Configuration the model was built from.
    pub config: QCTConfig,
    /// Token embedding; `forward` uses it to turn each token id into a density matrix.
    pub embedding: QuantumEmbedding,
    /// QCT blocks, applied in order by `forward`.
    pub blocks: Vec<QCTBlock>,
    /// Output projection: dim → vocab_size (logits).
    /// Flat buffer of `dim * vocab_size` weights; `forward` reads the weight for
    /// feature `d` and token `v` at index `d * vocab_size + v`.
    pub output_weights: Vec<f32>,
}

impl QCT {
    pub fn new(config: QCTConfig) -> Self {
        let embedding = QuantumEmbedding::new(config.vocab_size, config.dim, config.seed);

        let mut blocks = Vec::with_capacity(config.num_blocks);
        for i in 0..config.num_blocks {
            blocks.push(QCTBlock::new(config.dim, config.seed.wrapping_add(i as u64 * 1000)));
        }

        // Output projection: dim → vocab_size.
        // Scale: 1/φ⁵ ≈ 0.090 — matches evolution dt and Hamiltonian bias range.
        // Output is the final projection to logits; small init prevents saturation
        // while maintaining the φ-chain through the entire parameter space.
        let out_scale = 0.090169944_f32; // 1/φ⁵
        let mut output_weights = Vec::with_capacity(config.dim * config.vocab_size);
        for i in 0..(config.dim * config.vocab_size) {
            let s = config
                .seed
                .wrapping_add((i + 5000) as u64)
                .wrapping_mul(0x517cc1b727220a95);
            output_weights.push(out_scale * ((s % 2000) as f32 / 1000.0 - 1.0));
        }

        Self {
            config,
            embedding,
            blocks,
            output_weights,
        }
    }

    /// Forward pass: tokens → logits.
    ///
    /// Returns `(logits, free_energy)`:
    /// - `logits[i]` holds `vocab_size` logits for input position `i`;
    /// - `free_energy` is the attention free energy summed over every block and
    ///   position, divided by the number of tokens.
    ///
    /// An empty `tokens` slice yields `(vec![], 0.0)` instead of a NaN free energy.
    pub fn forward(&self, tokens: &[usize]) -> (Vec<Vec<f32>>, f32) {
        let t = tokens.len();
        if t == 0 {
            // Nothing to attend over; also avoids the 0.0 / 0.0 = NaN average below.
            return (Vec::new(), 0.0);
        }
        let dim = self.config.dim;
        let vocab = self.config.vocab_size;

        // 1. Embed tokens as density matrices.
        let mut current_states: Vec<DensityMatrixN> =
            tokens.iter().map(|&tok| self.embedding.embed(tok)).collect();

        // 2. Initial value vectors = populations of the embedded states.
        let mut values: Vec<Vec<f32>> = current_states.iter().map(|s| s.populations()).collect();

        // Inter-block dephasing: ε = 1/(φ³ × num_blocks), identical for every block, so
        // it is computed once. Surviving coherence after all blocks: exp(-1/φ³) ≈ 79%.
        let eps_block = 0.236 / self.blocks.len().max(1) as f32;

        // 3. Pass through the QCT blocks.
        let mut total_free_energy = 0.0f32;
        for block in &self.blocks {
            let (attn, new_values) = block.forward(&current_states, &values);

            // Accumulate free energy from attention.
            total_free_energy += attn.free_energies.iter().sum::<f32>();

            // Values are replaced; states carry through (coherences persist between blocks)
            // and are only dephased.
            values = new_values;
            for state in &mut current_states {
                state.dephase(eps_block);
            }
        }

        // 4. Output projection: values → logits over the vocabulary.
        // `output_weights` is walked one contiguous row (fixed `d`) at a time; each logit
        // still accumulates its terms in d = 0..dim order, so results are unchanged.
        let logits: Vec<Vec<f32>> = values[..t]
            .iter()
            .map(|value| {
                let mut token_logits = vec![0.0f32; vocab];
                for (d, &x) in value[..dim].iter().enumerate() {
                    let row = &self.output_weights[d * vocab..(d + 1) * vocab];
                    for (logit, &w) in token_logits.iter_mut().zip(row) {
                        *logit += x * w;
                    }
                }
                token_logits
            })
            .collect();

        (logits, total_free_energy / t as f32)
    }

    /// Total number of learnable parameters.
    pub fn num_params(&self) -> usize {
        let embed_params = self.embedding.num_params();
        let block_params: usize = self.blocks.iter().map(|b| b.num_params()).sum();
        let output_params = self.output_weights.len();
        embed_params + block_params + output_params
    }

    /// Flatten ALL model parameters into a single Vec.
    /// Order: [embedding_angles | block_0_hamiltonian | block_0_values | ... | output_weights]
    pub fn all_params(&self) -> Vec<f32> {
        let mut p = Vec::with_capacity(self.num_params());
        p.extend_from_slice(&self.embedding.angles);
        for block in &self.blocks {
            p.extend_from_slice(&block.hamiltonian.params());
            p.extend_from_slice(&block.value_weights);
        }
        p.extend_from_slice(&self.output_weights);
        p
    }

    /// Set ALL model parameters from a flat slice.
    /// Order: [embedding_angles | block_0_hamiltonian | block_0_values | ... | output_weights]
    ///
    /// Elements beyond the required length are ignored.
    ///
    /// # Panics
    ///
    /// Panics if `params` is shorter than the model needs. The length is checked before
    /// anything is written, so a too-short slice never leaves the model half-updated.
    pub fn set_all_params(&mut self, params: &[f32]) {
        let embed_len = self.embedding.angles.len();
        let out_len = self.output_weights.len();
        let blocks_len: usize = self
            .blocks
            .iter()
            .map(|b| b.hamiltonian.num_params() + b.value_weights.len())
            .sum();
        let required = embed_len + blocks_len + out_len;
        assert!(
            params.len() >= required,
            "set_all_params: expected at least {required} parameters, got {}",
            params.len()
        );

        // Peel each section off the front of the remaining slice, in layout order.
        let (embed, mut rest) = params.split_at(embed_len);
        self.embedding.angles.copy_from_slice(embed);

        for block in &mut self.blocks {
            let (h_params, tail) = rest.split_at(block.hamiltonian.num_params());
            block.hamiltonian.set_params(h_params);

            let (v_params, tail) = tail.split_at(block.value_weights.len());
            block.value_weights.copy_from_slice(v_params);

            rest = tail;
        }

        self.output_weights.copy_from_slice(&rest[..out_len]);
    }

    /// Apply gradient update in-place: θ ← θ - lr * grad * scale.
    /// Zero allocation — updates parameters directly without all_params()/set_all_params().
    ///
    /// `grad` is read in flat order: embedding angles, then for each block
    /// (bias, couplings, dephasing rate, temperature, value weights), then output weights.
    /// When `grad` is shorter than the model, parameters past its end are left unchanged.
    /// Dephasing rate and temperature are clamped after their step.
    ///
    /// Returns the number of parameters updated (the model's parameter count, capped
    /// at `grad.len()`).
    pub fn apply_gradient_update(&mut self, grad: &[f32], lr: f32, scale: f32) -> usize {
        // Subtract `step * g` from each parameter, pairing parameters with gradient
        // entries until either side runs out.
        fn descend(params: &mut [f32], grads: &[f32], step: f32) {
            for (param, g) in params.iter_mut().zip(grads) {
                *param -= step * *g;
            }
        }

        // Gradient entries from `start` onward; empty once `start` is past the end.
        fn grads_from(grad: &[f32], start: usize) -> &[f32] {
            grad.get(start..).unwrap_or(&[])
        }

        let step = lr * scale;
        let mut cursor = 0;

        // Embedding angles
        descend(&mut self.embedding.angles, grads_from(grad, cursor), step);
        cursor += self.embedding.angles.len();

        // Block parameters
        for block in &mut self.blocks {
            let ham = &mut block.hamiltonian;

            // Hamiltonian bias: the first `dim` entries of the block's segment.
            let bias_grads = grads_from(grad, cursor);
            let live = ham.dim.min(bias_grads.len());
            descend(&mut ham.bias[..live], bias_grads, step);
            cursor += ham.dim;

            // Hamiltonian couplings
            descend(&mut ham.couplings, grads_from(grad, cursor), step);
            cursor += ham.couplings.len();

            // Hamiltonian dephasing_rate, then temperature: one scalar each, clamped.
            // The original comment says these bounds match set_params().
            // NOTE(review): 0.013155617 is 1/φ⁹, although the original comment labelled it
            // 1/φ⁸ (≈ 0.0213) — confirm which bound set_params() actually uses.
            if let Some(&g) = grad.get(cursor) {
                ham.dephasing_rate = (ham.dephasing_rate - step * g).clamp(0.013155617, 1.0);
            }
            cursor += 1;
            if let Some(&g) = grad.get(cursor) {
                // [1/φ⁵, φ⁵]
                ham.temperature = (ham.temperature - step * g).clamp(0.090169944, 11.09017);
            }
            cursor += 1;

            // Value weights
            descend(&mut block.value_weights, grads_from(grad, cursor), step);
            cursor += block.value_weights.len();
        }

        // Output weights
        descend(&mut self.output_weights, grads_from(grad, cursor), step);
        cursor += self.output_weights.len();

        cursor.min(grad.len())
    }

    /// Compute the next-token cross-entropy loss (plus free-energy term) for `tokens`.
    ///
    /// Returns 0.0 for fewer than two tokens. Otherwise runs a full forward pass over
    /// every token except the last and scores the result with `loss_from_logits()`.
    ///
    /// NOTE: This recomputes the full forward pass. Training loops that already hold
    /// logits (the original note mentions `forward_with_cache()`, defined outside this
    /// view) should call `loss_from_logits()` directly.
    pub fn loss(&self, tokens: &[usize]) -> f32 {
        // The final token only ever serves as a target, so it is not fed to the model.
        match tokens.split_last() {
            Some((_, context)) if !context.is_empty() => {
                let (logits, avg_free_energy) = self.forward(context);
                Self::loss_from_logits(&logits, tokens, avg_free_energy)
            }
            _ => 0.0,
        }
    }

    /// Compute loss from pre-computed logits. Zero-cost — no forward pass.
    ///
    /// `logits[i]` is scored against `tokens[i + 1]` with softmax cross-entropy. Logit
    /// rows that have no following token are ignored, and the cross-entropy is averaged
    /// over the positions actually scored, so extra trailing rows do not dilute the mean.
    ///
    /// Returns `mean_cross_entropy + 0.146 * avg_free_energy`, or 0.0 when there are
    /// fewer than two tokens or no logits.
    ///
    /// # Panics
    ///
    /// Panics if a target token id is not a valid index into its logit row.
    pub fn loss_from_logits(logits: &[Vec<f32>], tokens: &[usize], avg_free_energy: f32) -> f32 {
        if tokens.len() < 2 || logits.is_empty() {
            return 0.0;
        }

        // Position i predicts tokens[i + 1]; `zip` stops at whichever side is shorter.
        let targets = &tokens[1..];
        let scored = logits.len().min(targets.len());

        let mut total_ce = 0.0f32;
        for (token_logits, &target) in logits.iter().zip(targets) {
            // Log-softmax with the max subtracted for numerical stability.
            let max_logit = token_logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
            let exp_sum: f32 = token_logits.iter().map(|&l| (l - max_logit).exp()).sum();
            let log_prob = (token_logits[target] - max_logit) - exp_sum.ln();
            total_ce -= log_prob;
        }

        // `scored` is at least 1 here: both `logits` and `targets` are non-empty.
        let avg_ce = total_ce / scored as f32;
        // λ = 1/φ⁴ ≈ 0.146 — free energy gets real weight in the loss.
        // The model optimizes for prediction accuracy AND coherent structure.
        avg_ce + 0.146 * avg_free_energy
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `forward` emits one logit row of width `vocab_size` per token and a finite
    /// free energy.
    #[test]
    fn qct_forward_produces_logits() {
        let config = QCTConfig::default();
        let vocab_size = config.vocab_size;
        let model = QCT::new(config);
        let tokens: Vec<usize> = (0..8).collect();

        let (logits, free_energy) = model.forward(&tokens);

        assert_eq!(logits.len(), tokens.len());
        for (i, l) in logits.iter().enumerate() {
            assert_eq!(l.len(), vocab_size, "token {i}: logit dim should be vocab_size");
        }
        assert!(free_energy.is_finite(), "free energy should be finite");
    }

    /// The loss of a freshly initialized model is finite and strictly positive.
    #[test]
    fn qct_loss_finite() {
        let tokens: Vec<usize> = (0..6).collect();
        let loss = QCT::new(QCTConfig::default()).loss(&tokens);

        assert!(loss.is_finite(), "loss should be finite: {loss}");
        assert!(loss > 0.0, "loss should be positive: {loss}");
    }

    /// The model reports a non-zero parameter count.
    ///
    /// Expected breakdown for this configuration (printed, not asserted):
    ///   embedding 65 * 5 = 325
    ///   per block 5 bias + 10 couplings + 2 (dephasing + temperature) + 25 value weights = 42
    ///   two blocks = 84
    ///   output 5 * 65 = 325
    ///   total 325 + 84 + 325 = 734
    #[test]
    fn qct_param_count() {
        let model = QCT::new(QCTConfig {
            vocab_size: 65,
            dim: 5,
            num_blocks: 2,
            seed: 42,
        });

        let params = model.num_params();

        assert!(params > 0, "should have parameters: {params}");
        eprintln!("QCT parameter count: {params}");
    }

    /// Two evaluations of the same model on the same tokens agree exactly.
    #[test]
    fn qct_deterministic() {
        let model = QCT::new(QCTConfig::default());
        let tokens = vec![10, 20, 30, 40, 50];

        let first = model.loss(&tokens);
        let second = model.loss(&tokens);

        assert_eq!(first, second, "QCT should be deterministic");
    }
}