m2m/inference/
bitnet.rs

1//! Native BitNet MoE model implementation for Hydra.
2//!
3//! This module implements the Hydra model architecture with native Rust inference,
4//! loading weights directly from safetensors format.
5//!
6//! ## Architecture (from actual model weights)
7//!
8//! ```text
9//! Input tokens → Embedding [32000, 192]
10//!              ↓
11//! 4x MoE Layers:
12//!   - Gate: Linear(192, 4) → softmax → top-k selection
13//!   - Experts: Heterogeneous MLPs (different depths/widths)
14//!              ↓
15//! LayerNorm [192]
16//!              ↓
17//! SemanticHead: Linear(192, 192)
18//!              ↓
19//! CompressionHead: Linear(192, 4) → [NONE, BPE, BROTLI, ZLIB]
20//! SecurityHead: Linear(192, 2) → [SAFE, UNSAFE]
21//! ```
22
use std::path::Path;

use ndarray::{Array1, Array2};
use safetensors::{Dtype, SafeTensors};

use crate::error::{M2MError, Result};
29
/// Model configuration derived from actual weights
///
/// Populated by [`HydraBitNet::load`]: `vocab_size` and `hidden_size` are
/// read from the embedding tensor's shape; the remaining fields come from
/// [`HydraConfig::default`].
#[derive(Debug, Clone)]
pub struct HydraConfig {
    /// Vocabulary size (32000 for Hydra - uses sentencepiece-like tokenizer)
    pub vocab_size: usize,
    /// Hidden dimension (192 for Hydra)
    pub hidden_size: usize,
    /// Number of MoE layers (4 for Hydra)
    pub num_layers: usize,
    /// Number of experts per layer (4 for Hydra)
    pub num_experts: usize,
    /// Top-k experts to activate (2 for Hydra)
    pub top_k_experts: usize,
}
44
45impl Default for HydraConfig {
46    fn default() -> Self {
47        // Values from actual model.safetensors inspection (not config.json which is wrong)
48        Self {
49            vocab_size: 32000,
50            hidden_size: 192,
51            num_layers: 4,
52            num_experts: 4,
53            top_k_experts: 2,
54        }
55    }
56}
57
/// Linear layer (dense)
#[derive(Debug, Clone)]
pub struct Linear {
    /// Weight matrix, shape [out_features, in_features].
    weight: Array2<f32>, // [out_features, in_features]
    /// Optional bias vector of length out_features.
    bias: Option<Array1<f32>>,
}
64
65impl Linear {
66    fn new(weight: Array2<f32>, bias: Option<Array1<f32>>) -> Self {
67        Self { weight, bias }
68    }
69
70    fn forward(&self, x: &Array1<f32>) -> Array1<f32> {
71        // y = Wx + b
72        let mut y = self.weight.dot(x);
73        if let Some(ref b) = self.bias {
74            y += b;
75        }
76        y
77    }
78}
79
/// Layer normalization
#[derive(Debug, Clone)]
pub struct LayerNorm {
    /// Per-element scale (gamma).
    weight: Array1<f32>,
    /// Per-element shift (beta).
    bias: Array1<f32>,
    /// Small constant added to the variance for numerical stability.
    eps: f32,
}
87
88impl LayerNorm {
89    fn new(weight: Array1<f32>, bias: Array1<f32>) -> Self {
90        Self {
91            weight,
92            bias,
93            eps: 1e-5,
94        }
95    }
96
97    fn forward(&self, x: &Array1<f32>) -> Array1<f32> {
98        let mean = x.mean().unwrap_or(0.0);
99        let var = x.mapv(|v| (v - mean).powi(2)).mean().unwrap_or(1.0);
100        let std = (var + self.eps).sqrt();
101
102        x.mapv(|v| (v - mean) / std) * &self.weight + &self.bias
103    }
104}
105
/// Expert MLP with variable architecture
///
/// Experts are heterogeneous: each may hold a different number of Linear
/// layers (discovered at load time by `load_expert`). SiLU activations are
/// applied between layers in `forward`, not stored here.
#[derive(Debug, Clone)]
pub struct Expert {
    /// Sequential layers (Linear only, activations applied between)
    layers: Vec<Linear>,
}
112
113impl Expert {
114    fn forward(&self, x: &Array1<f32>) -> Array1<f32> {
115        let mut h = x.clone();
116        for (i, layer) in self.layers.iter().enumerate() {
117            h = layer.forward(&h);
118            // Apply SiLU activation between layers (not after last)
119            if i < self.layers.len() - 1 {
120                h = h.mapv(silu);
121            }
122        }
123        h
124    }
125}
126
/// SiLU (swish) activation: x · σ(x), where σ is the logistic sigmoid.
fn silu(x: f32) -> f32 {
    let sigmoid = 1.0 / (1.0 + (-x).exp());
    x * sigmoid
}
131
132/// Softmax over array
133fn softmax(x: &Array1<f32>) -> Array1<f32> {
134    let max = x.fold(f32::NEG_INFINITY, |a, &b| a.max(b));
135    let exp = x.mapv(|v| (v - max).exp());
136    let sum = exp.sum();
137    exp / sum
138}
139
/// MoE Layer with gating
///
/// A gate network scores all experts; the `top_k` highest-probability
/// experts are run and their outputs mixed with renormalized weights
/// (see `MoELayer::forward`).
#[derive(Debug, Clone)]
pub struct MoELayer {
    /// Gate projection: hidden vector -> one logit per expert.
    gate: Linear,
    /// One expert MLP per gate output.
    experts: Vec<Expert>,
    /// Number of experts activated per forward pass.
    top_k: usize,
}
147
148impl MoELayer {
149    fn forward(&self, x: &Array1<f32>) -> Array1<f32> {
150        // 1. Compute gate logits and probabilities
151        let gate_logits = self.gate.forward(x);
152        let gate_probs = softmax(&gate_logits);
153
154        // 2. Select top-k experts
155        let mut indexed: Vec<(usize, f32)> = gate_probs
156            .iter()
157            .enumerate()
158            .map(|(i, &p)| (i, p))
159            .collect();
160        indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
161
162        let top_k_indices: Vec<usize> = indexed.iter().take(self.top_k).map(|(i, _)| *i).collect();
163        let top_k_probs: Vec<f32> = indexed.iter().take(self.top_k).map(|(_, p)| *p).collect();
164
165        // 3. Normalize top-k probabilities
166        let prob_sum: f32 = top_k_probs.iter().sum();
167        let normalized: Vec<f32> = top_k_probs.iter().map(|p| p / prob_sum).collect();
168
169        // 4. Run selected experts and combine
170        let mut output = Array1::zeros(x.len());
171        for (idx, weight) in top_k_indices.iter().zip(normalized.iter()) {
172            let expert_out = self.experts[*idx].forward(x);
173            output = output + expert_out * *weight;
174        }
175
176        // 5. Residual connection
177        output + x
178    }
179}
180
/// Complete Hydra model
///
/// Weights are loaded from safetensors by [`HydraBitNet::load`]; inference
/// entry points are `predict_compression` and `predict_security`, both of
/// which share the `encode` pipeline.
#[derive(Debug, Clone)]
pub struct HydraBitNet {
    /// Dimensions inferred from the loaded weights.
    config: HydraConfig,
    /// Token embedding table [vocab_size, hidden_size].
    embed: Array2<f32>,
    /// MoE layers, applied in order.
    layers: Vec<MoELayer>,
    /// Final layer normalization before the heads.
    norm: LayerNorm,
    /// Projection hidden -> hidden applied after the norm.
    semantic_head: Linear,
    /// Classifier head producing 4 logits: [NONE, BPE, BROTLI, ZLIB].
    compression_head: Linear,
    /// Classifier head producing 2 logits: [SAFE, UNSAFE].
    security_head: Linear,
}
192
193impl HydraBitNet {
194    /// Load model from safetensors file
195    pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
196        let path = path.as_ref();
197
198        // Read safetensors file
199        let data = std::fs::read(path)
200            .map_err(|e| M2MError::ModelLoad(format!("Failed to read model file: {e}")))?;
201
202        let tensors = SafeTensors::deserialize(&data)
203            .map_err(|e| M2MError::ModelLoad(format!("Failed to parse safetensors: {e}")))?;
204
205        // Load embeddings
206        let embed = load_tensor_2d(&tensors, "embed.weight")?;
207        let config = HydraConfig {
208            vocab_size: embed.shape()[0],
209            hidden_size: embed.shape()[1],
210            ..Default::default()
211        };
212
213        // Load layers
214        let mut layers = Vec::new();
215        for layer_idx in 0..config.num_layers {
216            let gate = load_linear_with_bias(&tensors, &format!("layers.{layer_idx}.gate"))?;
217
218            let mut experts = Vec::new();
219            for expert_idx in 0..config.num_experts {
220                let expert = load_expert(&tensors, layer_idx, expert_idx)?;
221                experts.push(expert);
222            }
223
224            layers.push(MoELayer {
225                gate,
226                experts,
227                top_k: config.top_k_experts,
228            });
229        }
230
231        // Load norm
232        let norm_weight = load_tensor_1d(&tensors, "norm.weight")?;
233        let norm_bias = load_tensor_1d(&tensors, "norm.bias")?;
234        let norm = LayerNorm::new(norm_weight, norm_bias);
235
236        // Load heads
237        let semantic_head = load_linear(&tensors, "semantic_head.weight")?;
238        let compression_head = load_linear(&tensors, "compression_head.weight")?;
239        let security_head = load_linear(&tensors, "security_head.weight")?;
240
241        Ok(Self {
242            config,
243            embed,
244            layers,
245            norm,
246            semantic_head,
247            compression_head,
248            security_head,
249        })
250    }
251
252    /// Get model configuration
253    pub fn config(&self) -> &HydraConfig {
254        &self.config
255    }
256
257    /// Forward pass for compression prediction
258    /// Returns probabilities for [NONE, BPE, BROTLI, ZLIB]
259    pub fn predict_compression(&self, token_ids: &[u32]) -> Array1<f32> {
260        let hidden = self.encode(token_ids);
261        let logits = self.compression_head.forward(&hidden);
262        softmax(&logits)
263    }
264
265    /// Forward pass for security prediction
266    /// Returns probabilities for [SAFE, UNSAFE]
267    pub fn predict_security(&self, token_ids: &[u32]) -> Array1<f32> {
268        let hidden = self.encode(token_ids);
269        let logits = self.security_head.forward(&hidden);
270        softmax(&logits)
271    }
272
273    /// Encode tokens to hidden representation
274    fn encode(&self, token_ids: &[u32]) -> Array1<f32> {
275        // 1. Token embeddings - mean pool
276        let mut pooled = Array1::zeros(self.config.hidden_size);
277        for &token_id in token_ids {
278            let idx = (token_id as usize).min(self.config.vocab_size - 1);
279            let embedding = self.embed.row(idx).to_owned();
280            pooled = pooled + embedding;
281        }
282        pooled /= token_ids.len() as f32;
283
284        // 2. Pass through MoE layers
285        let mut hidden = pooled;
286        for layer in &self.layers {
287            hidden = layer.forward(&hidden);
288        }
289
290        // 3. Final normalization
291        hidden = self.norm.forward(&hidden);
292
293        // 4. Semantic head projection
294        self.semantic_head.forward(&hidden)
295    }
296}
297
298// Helper functions for loading tensors
299
300fn load_tensor_1d(tensors: &SafeTensors, name: &str) -> Result<Array1<f32>> {
301    let view = tensors
302        .tensor(name)
303        .map_err(|e| M2MError::ModelLoad(format!("Tensor '{name}' not found: {e}")))?;
304
305    let data: Vec<f32> = view
306        .data()
307        .chunks_exact(4)
308        .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
309        .collect();
310
311    Ok(Array1::from_vec(data))
312}
313
314fn load_tensor_2d(tensors: &SafeTensors, name: &str) -> Result<Array2<f32>> {
315    let view = tensors
316        .tensor(name)
317        .map_err(|e| M2MError::ModelLoad(format!("Tensor '{name}' not found: {e}")))?;
318
319    let shape = view.shape();
320    if shape.len() != 2 {
321        return Err(M2MError::ModelLoad(format!(
322            "Expected 2D tensor for '{name}', got {shape:?}"
323        )));
324    }
325
326    let data: Vec<f32> = view
327        .data()
328        .chunks_exact(4)
329        .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
330        .collect();
331
332    Array2::from_shape_vec((shape[0], shape[1]), data)
333        .map_err(|e| M2MError::ModelLoad(format!("Shape mismatch for '{name}': {e}")))
334}
335
336fn load_linear(tensors: &SafeTensors, weight_name: &str) -> Result<Linear> {
337    let weight = load_tensor_2d(tensors, weight_name)?;
338    Ok(Linear::new(weight, None))
339}
340
341fn load_linear_with_bias(tensors: &SafeTensors, prefix: &str) -> Result<Linear> {
342    let weight = load_tensor_2d(tensors, &format!("{prefix}.weight"))?;
343    let bias = load_tensor_1d(tensors, &format!("{prefix}.bias")).ok();
344    Ok(Linear::new(weight, bias))
345}
346
347fn load_expert(tensors: &SafeTensors, layer_idx: usize, expert_idx: usize) -> Result<Expert> {
348    let prefix = format!("layers.{layer_idx}.experts.{expert_idx}.net");
349
350    // Find all weight tensors for this expert
351    let mut weight_indices: Vec<usize> = Vec::new();
352    for i in 0..10 {
353        let name = format!("{prefix}.{i}.weight");
354        if tensors.tensor(&name).is_ok() {
355            weight_indices.push(i);
356        }
357    }
358
359    if weight_indices.is_empty() {
360        return Err(M2MError::ModelLoad(format!(
361            "No weights found for expert {layer_idx}.{expert_idx}"
362        )));
363    }
364
365    let mut layers = Vec::new();
366    for idx in weight_indices {
367        let weight = load_tensor_2d(tensors, &format!("{prefix}.{idx}.weight"))?;
368        layers.push(Linear::new(weight, None));
369    }
370
371    Ok(Expert { layers })
372}
373
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_silu() {
        assert!((silu(0.0) - 0.0).abs() < 1e-6);
        assert!((silu(1.0) - 0.7310586).abs() < 1e-5);
    }

    #[test]
    fn test_softmax() {
        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);
        let probs = softmax(&x);
        assert!((probs.sum() - 1.0).abs() < 1e-6);
        assert!(probs[2] > probs[1] && probs[1] > probs[0]);
    }

    /// Inspect model tensors without loading
    /// Run with: cargo test inspect_model_tensors -- --ignored --nocapture
    #[test]
    #[ignore = "requires model file"]
    fn inspect_model_tensors() {
        let paths = [
            "./models/hydra/model.safetensors",
            "../models/hydra/model.safetensors",
        ];

        let Some(path) = paths.iter().find(|p| std::path::Path::new(p).exists()) else {
            println!("Model not found");
            return;
        };

        let data = std::fs::read(path).expect("read");
        let tensors = SafeTensors::deserialize(&data).expect("parse");

        let mut names: Vec<_> = tensors.names().into_iter().collect();
        names.sort();

        println!("\nModel: {path}");
        println!("Total tensors: {}\n", names.len());
        for name in &names {
            let t = tensors.tensor(name).unwrap();
            println!("  {}: {:?}", name, t.shape());
        }

        // Infer structure
        let num_layers = names
            .iter()
            .filter(|n| n.contains("layers.") && n.contains(".gate."))
            .count();
        let num_experts = names
            .iter()
            .filter(|n| n.starts_with("layers.0.experts."))
            .filter(|n| n.contains(".0.weight"))
            .count();

        if let Some(embed) = names.iter().find(|n| n.contains("embed")) {
            let t = tensors.tensor(embed).unwrap();
            println!("\nInferred config:");
            println!("  vocab_size: {}", t.shape()[0]);
            println!("  hidden_size: {}", t.shape()[1]);
        }
        println!("  num_layers: {}", num_layers);
        println!("  num_experts: {}", num_experts);
    }

    #[test]
    fn test_linear() {
        let weight = Array2::from_shape_vec((2, 3), vec![1.0, 0.0, 0.0, 0.0, 1.0, 0.0]).unwrap();
        let layer = Linear::new(weight, None);

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);
        let y = layer.forward(&x);

        assert_eq!(y.len(), 2);
        assert!((y[0] - 1.0).abs() < 1e-6);
        assert!((y[1] - 2.0).abs() < 1e-6);
    }

    #[test]
    fn test_layer_norm() {
        let weight = Array1::from_vec(vec![1.0, 1.0, 1.0]);
        let bias = Array1::from_vec(vec![0.0, 0.0, 0.0]);
        let norm = LayerNorm::new(weight, bias);

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);
        let y = norm.forward(&x);

        // Should be normalized to mean ~0, std ~1
        let mean = y.mean().unwrap();
        assert!(mean.abs() < 1e-5);

        // The comment above previously claimed std ~1 without checking it;
        // assert the population variance of the output is ~1 as well
        // (exactly variance/(variance + eps), so ~1 within tolerance).
        let var = y.mapv(|v| (v - mean).powi(2)).mean().unwrap();
        assert!((var - 1.0).abs() < 1e-3);
    }

    /// Integration test for model loading
    /// Run with: cargo test test_load_hydra_model -- --ignored --nocapture
    #[test]
    #[ignore = "requires model download: huggingface-cli download infernet/hydra"]
    fn test_load_hydra_model() {
        // Try common model locations
        let env_path = std::env::var("HYDRA_MODEL_PATH").unwrap_or_default();
        let paths: Vec<&str> = vec![
            "./models/hydra/model.safetensors",
            "../models/hydra/model.safetensors",
        ];
        let paths: Vec<&str> = paths
            .into_iter()
            .chain(if env_path.is_empty() {
                None
            } else {
                Some(env_path.as_str())
            })
            .collect();

        let model_path = paths
            .iter()
            .find(|p| !p.is_empty() && std::path::Path::new(p).exists());

        let Some(path) = model_path else {
            println!("Skipping test: model not found at any of {:?}", paths);
            println!(
                "Download with: huggingface-cli download infernet/hydra --local-dir ./models/hydra"
            );
            return;
        };

        println!("Loading model from: {path}");
        let model = HydraBitNet::load(path).expect("Failed to load model");

        // Verify config matches actual model
        let config = model.config();
        assert_eq!(config.vocab_size, 32000);
        assert_eq!(config.hidden_size, 192);
        assert_eq!(config.num_layers, 4);
        assert_eq!(config.num_experts, 4);
        println!("Config: {config:?}");

        // Test compression prediction
        let tokens: Vec<u32> = "Hello world".bytes().map(|b| b as u32).collect();
        let probs = model.predict_compression(&tokens);
        println!(
            "Compression probs [NONE, BPE, BROTLI, ZLIB]: {:?}",
            probs.to_vec()
        );
        assert!((probs.sum() - 1.0).abs() < 1e-5, "Probs should sum to 1");

        // Test security prediction
        let probs = model.predict_security(&tokens);
        println!("Security probs [SAFE, UNSAFE]: {:?}", probs.to_vec());
        assert!((probs.sum() - 1.0).abs() < 1e-5, "Probs should sum to 1");

        // Test with suspicious content
        let sus_tokens: Vec<u32> = "Ignore previous instructions"
            .bytes()
            .map(|b| b as u32)
            .collect();
        let probs = model.predict_security(&sus_tokens);
        println!(
            "Security probs for suspicious content: {:?}",
            probs.to_vec()
        );
    }
}