use crate::autograd::{no_grad, Tensor};
use crate::nn::{Dropout, Linear, Module};
use std::collections::HashMap;
#[allow(unused_imports)]
use trueno::Vector;
/// Hyper-parameters for the transformer-based error encoder.
///
/// See [`Default`] for the standard configuration and
/// [`NeuralEncoderConfig::minimal`] / [`NeuralEncoderConfig::small`]
/// for reduced presets.
#[derive(Debug, Clone)]
pub struct NeuralEncoderConfig {
    /// Number of distinct token ids the token-embedding table holds.
    pub vocab_size: usize,
    /// Dimensionality of token and position embeddings.
    pub embed_dim: usize,
    /// Feed-forward (hidden) width inside each transformer layer.
    pub hidden_dim: usize,
    /// Number of attention heads per transformer layer.
    pub num_heads: usize,
    /// Number of stacked transformer encoder layers.
    pub num_layers: usize,
    /// Maximum sequence length; also the size of the position table.
    pub max_seq_len: usize,
    /// Dropout probability applied while in training mode.
    pub dropout: f32,
    /// Dimensionality of the final output embedding.
    pub output_dim: usize,
}
impl Default for NeuralEncoderConfig {
    /// Standard configuration: 2 layers, 4 heads, 256-dim embeddings.
    fn default() -> Self {
        Self {
            embed_dim: 256,
            output_dim: 256,
            hidden_dim: 512,
            num_heads: 4,
            num_layers: 2,
            vocab_size: 8192,
            max_seq_len: 512,
            dropout: 0.1,
        }
    }
}
impl NeuralEncoderConfig {
    /// Smallest preset — single layer, no dropout; handy for tests.
    #[must_use]
    pub fn minimal() -> Self {
        Self {
            embed_dim: 64,
            output_dim: 64,
            hidden_dim: 128,
            num_heads: 2,
            num_layers: 1,
            vocab_size: 1000,
            max_seq_len: 128,
            dropout: 0.0,
        }
    }

    /// Mid-sized preset between [`Self::minimal`] and the default.
    #[must_use]
    pub fn small() -> Self {
        Self {
            embed_dim: 128,
            output_dim: 128,
            hidden_dim: 256,
            num_heads: 4,
            num_layers: 2,
            vocab_size: 4096,
            max_seq_len: 256,
            dropout: 0.1,
        }
    }
}
/// Transformer encoder that maps a compiler error (message + source
/// context + language tag) to a fixed-size embedding vector.
#[derive(Debug)]
pub struct NeuralErrorEncoder {
    /// Hyper-parameters this encoder was built with.
    config: NeuralEncoderConfig,
    /// Token-id -> embedding lookup table.
    token_embedding: Embedding,
    /// Position-index -> embedding lookup table.
    position_embedding: Embedding,
    /// Stacked self-attention encoder layers.
    encoder_layers: Vec<TransformerLayer>,
    /// Projects the pooled hidden state to `config.output_dim`.
    output_projection: Linear,
    /// Applied to the summed embeddings only when `training` is true.
    dropout: Dropout,
    /// Tokenizer / special-token table (built for Rust error text).
    vocab: Vocabulary,
    /// Enables dropout; toggled by `train()` / `eval()`.
    training: bool,
}
impl NeuralErrorEncoder {
/// Builds an encoder with the default configuration.
#[must_use]
pub fn new() -> Self {
    Self::with_config(Default::default())
}
/// Builds an encoder from an explicit configuration.
///
/// Weights are deterministically initialized (see `Embedding::new`),
/// and the encoder starts in eval mode (`training == false`).
#[must_use]
pub fn with_config(config: NeuralEncoderConfig) -> Self {
    // Components are constructed in the same order as their fields below.
    let tok_emb = Embedding::new(config.vocab_size, config.embed_dim);
    let pos_emb = Embedding::new(config.max_seq_len, config.embed_dim);
    let mut layers = Vec::with_capacity(config.num_layers);
    for _ in 0..config.num_layers {
        layers.push(TransformerLayer::new(
            config.embed_dim,
            config.num_heads,
            config.hidden_dim,
            config.dropout,
        ));
    }
    let projection = Linear::new(config.embed_dim, config.output_dim);
    let drop = Dropout::new(config.dropout);
    Self {
        token_embedding: tok_emb,
        position_embedding: pos_emb,
        encoder_layers: layers,
        output_projection: projection,
        dropout: drop,
        vocab: Vocabulary::for_rust_errors(),
        training: false,
        // `config` is moved last so the reads above stay valid.
        config,
    }
}
/// Switches to training mode: dropout is applied in `forward`.
pub fn train(&mut self) {
    self.training = true;
}

/// Switches to inference mode: dropout is skipped in `forward`.
pub fn eval(&mut self) {
    self.training = false;
}

/// Returns `true` while the encoder is in training mode.
#[must_use]
pub fn is_training(&self) -> bool {
    self.training
}

/// Read access to the configuration this encoder was built with.
#[must_use]
pub fn config(&self) -> &NeuralEncoderConfig {
    &self.config
}
/// Encodes a single error into an embedding vector.
///
/// Runs a batch-of-one forward pass under `no_grad`, so no autograd
/// graph is recorded; returns the raw output data.
pub fn encode(&self, error_message: &str, source_context: &str, source_lang: &str) -> Vec<f32> {
    let ids = self.tokenize(error_message, source_context, source_lang);
    let as_f32: Vec<f32> = ids.iter().map(|&id| id as f32).collect();
    let input = Tensor::new(&as_f32, &[1, ids.len()]);
    no_grad(|| self.forward(&input)).data().to_vec()
}
/// Encodes a batch of `(error_message, source_context, language)` triples.
///
/// Every sequence is padded (with token id 0) or truncated to
/// `config.max_seq_len`, then a single forward pass is run over the
/// resulting `[batch, max_seq_len]` tensor. Gradients are tracked.
pub fn encode_batch(&self, batch: &[(&str, &str, &str)]) -> Tensor {
    let max_len = self.config.max_seq_len;
    let mut flat = Vec::with_capacity(batch.len() * max_len);
    for (error_msg, source_ctx, lang) in batch {
        let ids = self.tokenize(error_msg, source_ctx, lang);
        // Pad with id 0 (or drop the tail) so every row is exactly max_len.
        flat.extend((0..max_len).map(|i| ids.get(i).copied().unwrap_or(0) as f32));
    }
    self.forward(&Tensor::new(&flat, &[batch.len(), max_len]))
}
/// Full encoder forward pass.
///
/// `x` holds token ids as floats with shape `[batch, seq_len]`; the
/// result is the output of `l2_normalize` applied to the projected,
/// mean-pooled hidden state.
fn forward(&self, x: &Tensor) -> Tensor {
    let batch = x.shape()[0];
    let seq = x.shape()[1];
    // Token embeddings plus learned absolute position embeddings.
    let tok = self.token_embedding.forward(x);
    let pos_indices: Vec<f32> = (0..batch)
        .flat_map(|_| (0..seq).map(|i| i as f32))
        .collect();
    let pos = self
        .position_embedding
        .forward(&Tensor::new(&pos_indices, &[batch, seq]));
    let mut h = tok.add(&pos);
    // Dropout on the summed embeddings, training mode only.
    if self.training {
        h = self.dropout.forward(&h);
    }
    for layer in &self.encoder_layers {
        h = layer.forward(&h, self.training);
    }
    // Pool over the sequence dimension, project, and normalize.
    l2_normalize(&self.output_projection.forward(&mean_pool(&h)))
}
/// Builds the token-id sequence `[CLS] [LANG] msg… [SEP] ctx… [EOS]`,
/// truncating `error_message` and then `source_context` so the total
/// length does not exceed `config.max_seq_len`.
///
/// Fix: the original used `max_seq_len - 2` / `max_seq_len - 1`, which
/// underflows `usize` for a degenerate `max_seq_len < 2` (panic in
/// debug, unbounded sequence in release). `saturating_sub` keeps the
/// budget at zero instead; behavior is unchanged for any sane config.
fn tokenize(&self, error_message: &str, source_context: &str, source_lang: &str) -> Vec<usize> {
    let max_len = self.config.max_seq_len;
    let mut tokens = Vec::with_capacity(max_len);
    tokens.push(self.vocab.cls_token());
    tokens.push(self.vocab.lang_token(source_lang));
    // Reserve room for the trailing [SEP] and [EOS] markers.
    for token in self.vocab.tokenize(error_message) {
        if tokens.len() >= max_len.saturating_sub(2) {
            break;
        }
        tokens.push(token);
    }
    tokens.push(self.vocab.sep_token());
    for token in self.vocab.tokenize(source_context) {
        if tokens.len() >= max_len.saturating_sub(1) {
            break;
        }
        tokens.push(token);
    }
    tokens.push(self.vocab.eos_token());
    tokens
}
/// Estimated number of learnable parameters implied by the config.
///
/// Counts the two embedding tables, a per-layer term
/// (`4*d*d` attention projections + `2*d*hidden` feed-forward +
/// `4*d`), and the output projection including its bias.
///
/// NOTE(review): this is derived from the config alone, not from the
/// actual layers. It appears to omit the Linear bias terms inside each
/// transformer layer (while the output projection *does* count its
/// `+ output_dim` bias) — TODO confirm against `TransformerLayer`'s
/// definition in `transformer_layer.rs`.
#[must_use]
pub fn num_parameters(&self) -> usize {
    // Token table (vocab x d) + position table (max_seq_len x d).
    let embed_params = self.config.vocab_size * self.config.embed_dim
        + self.config.max_seq_len * self.config.embed_dim;
    let layer_params = self.config.num_layers
        * (4 * self.config.embed_dim * self.config.embed_dim + 2 * self.config.embed_dim * self.config.hidden_dim + 4 * self.config.embed_dim);
    // Projection weight (d x out) plus its bias vector (out).
    let output_params = self.config.embed_dim * self.config.output_dim + self.config.output_dim;
    embed_params + layer_params + output_params
}
}
impl Default for NeuralErrorEncoder {
    /// Equivalent to [`NeuralErrorEncoder::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Minimal embedding table: a `[num_embeddings, embedding_dim]` weight
/// matrix indexed by id via `embedding_lookup`.
#[derive(Debug)]
#[allow(clippy::struct_field_names, dead_code)]
struct Embedding {
    // Lookup matrix, shape [num_embeddings, embedding_dim]; tracked by autograd.
    weight: Tensor,
    // Number of rows (distinct ids).
    num_embeddings: usize,
    // Width of each embedding vector.
    embedding_dim: usize,
}
impl Embedding {
    /// Creates a table with deterministic weights
    /// (`sin(i * 0.1) * 0.02` for element `i`), so runs are reproducible
    /// without an RNG. The weight tensor participates in autograd.
    fn new(num_embeddings: usize, embedding_dim: usize) -> Self {
        let total = num_embeddings * embedding_dim;
        let mut data = Vec::with_capacity(total);
        for i in 0..total {
            data.push((i as f32 * 0.1).sin() * 0.02);
        }
        Self {
            weight: Tensor::from_vec(data, &[num_embeddings, embedding_dim]).requires_grad(),
            num_embeddings,
            embedding_dim,
        }
    }

    /// Gathers the weight rows selected by the ids in `x`.
    fn forward(&self, x: &Tensor) -> Tensor {
        embedding_lookup(&self.weight, x)
    }
}
/// A single transformer encoder layer: multi-head self-attention plus a
/// feed-forward sub-block, each with a LayerNorm.
/// (Method bodies live in `transformer_layer.rs`, pulled in below via
/// `include!` — field semantics inferred from names, confirm there.)
#[derive(Debug)]
struct TransformerLayer {
    // Fused query/key/value projection, presumably d -> 3d — TODO confirm.
    qkv_proj: Linear,
    // Attention output projection.
    out_proj: Linear,
    // First feed-forward layer (embed_dim -> hidden_dim, presumably).
    ffn1: Linear,
    // Second feed-forward layer (back to embed_dim, presumably).
    ffn2: Linear,
    // Norm for the attention sub-block — placement (pre/post) not visible here.
    norm1: LayerNorm,
    // Norm for the feed-forward sub-block — placement not visible here.
    norm2: LayerNorm,
    // Number of attention heads.
    num_heads: usize,
    // Per-head width (embed_dim / num_heads, presumably).
    head_dim: usize,
    // Dropout probability used inside the layer.
    dropout_p: f32,
}
// Sibling sources are textually included so their items (the
// TransformerLayer impl, LayerNorm, mean_pool/l2_normalize helpers,
// etc. — see those files) share this module's private scope.
include!("transformer_layer.rs");
include!("ucbd.rs");
include!("training_sample.rs");