use crate::gpt_j::config::GptJConfig;
use scirs2_core::ndarray::{s, ArrayD, Axis, IxDyn};
use std::io::Read;
use trustformers_core::device::Device;
use trustformers_core::errors::{tensor_op_error, Result, TrustformersError};
use trustformers_core::layers::{Embedding, LayerNorm, Linear};
use trustformers_core::tensor::Tensor;
use trustformers_core::traits::{Config, Layer, Model, TokenizedInput};
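/// Rotary position embedding (RoPE) as used by GPT-J: channel pairs are
/// rotated by an angle `pos * base^(-2i/dim)`, so attention scores depend
/// only on relative position. GPT-J applies this to the first `rotary_dim`
/// channels of each attention head (partial rotary).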
#[derive(Debug, Clone)]
pub struct GptJRotaryEmbedding {
    pub dim: usize,
    pub max_seq_len: usize,
    pub base: f32,
}
impl GptJRotaryEmbedding {
pub fn new(dim: usize, max_seq_len: usize, base: f32) -> Self {
Self {
dim,
max_seq_len,
base,
}
}
    pub fn apply_rotary_emb(
        &self,
        q: &Tensor,
        k: &Tensor,
        position_ids: &[usize],
    ) -> Result<(Tensor, Tensor)> {
        match (q, k) {
            (Tensor::F32(q_arr), Tensor::F32(k_arr)) => {
                let mut rotated_q = q_arr.clone();
                let mut rotated_k = k_arr.clone();
                self.rotate_in_place(&mut rotated_q, position_ids)?;
                self.rotate_in_place(&mut rotated_k, position_ids)?;
                Ok((Tensor::F32(rotated_q), Tensor::F32(rotated_k)))
            },
            _ => Err(tensor_op_error(
                "tensor_operation",
                "Unsupported tensor types for GPT-J RoPE",
            )),
        }
    }
    /// Rotates the first `self.dim` channels along the last axis in
    /// interleaved pairs (GPT-J's "rotate every two"), leaving remaining
    /// channels untouched. Assumes the second-to-last axis indexes sequence
    /// positions; lanes are visited in row-major order, so
    /// `lane_idx % seq_len` recovers the position for both [seq, dim] and
    /// [batch, seq, dim] layouts.
    fn rotate_in_place(&self, arr: &mut ArrayD<f32>, position_ids: &[usize]) -> Result<()> {
        let ndim = arr.ndim();
        if ndim < 2 || arr.shape()[ndim - 1] < self.dim {
            return Err(tensor_op_error(
                "tensor_operation",
                "GPT-J RoPE expects >=2 dims and a last axis >= rotary dim",
            ));
        }
        let seq_len = arr.shape()[ndim - 2];
        if position_ids.len() < seq_len {
            return Err(tensor_op_error(
                "tensor_operation",
                "position_ids must cover every sequence position",
            ));
        }
        for (lane_idx, mut lane) in arr.lanes_mut(Axis(ndim - 1)).into_iter().enumerate() {
            let pos = position_ids[lane_idx % seq_len] as f32;
            for i in 0..(self.dim / 2) {
                let freq = 1.0 / self.base.powf(2.0 * i as f32 / self.dim as f32);
                let (sin_val, cos_val) = (pos * freq).sin_cos();
                let a = lane[2 * i];
                let b = lane[2 * i + 1];
                lane[2 * i] = a * cos_val - b * sin_val;
                lane[2 * i + 1] = a * sin_val + b * cos_val;
            }
        }
        Ok(())
    }
}
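/// The GPT-J transformer backbone: token embeddings (`wte`), `n_layer`
/// decoder blocks, and a final LayerNorm (`ln_f`). There is no learned
/// position embedding; positional information enters through the rotary
/// embeddings inside attention.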
#[derive(Debug, Clone)]
pub struct GptJModel {
config: GptJConfig,
wte: Embedding,
blocks: Vec<GptJBlock>,
ln_f: LayerNorm,
}
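/// A single GPT-J decoder block. Unlike GPT-2, the attention and MLP
/// branches run in parallel from one shared LayerNorm (`ln_1`), so there
/// is no second per-block LayerNorm.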
#[derive(Debug, Clone)]
pub struct GptJBlock {
ln_1: LayerNorm,
attn: GptJAttention,
mlp: GptJMLP,
}
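/// Multi-head self-attention with separate bias-free q/k/v/out projections
/// and partial rotary embeddings over the first `rotary_dim` channels of
/// each head.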
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct GptJAttention {
q_proj: Linear,
k_proj: Linear,
v_proj: Linear,
out_proj: Linear,
#[allow(dead_code)]
num_heads: usize,
head_dim: usize,
rotary_dim: usize,
dropout: f32,
rotary_emb: GptJRotaryEmbedding,
}
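/// The GPT-J feed-forward block: `fc_in` expands the hidden size to
/// 4 * n_embd, the configured activation (GELU by default) is applied,
/// and `fc_out` projects back down to n_embd.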
#[derive(Debug, Clone)]
pub struct GptJMLP {
fc_in: Linear,
fc_out: Linear,
activation: String,
#[allow(dead_code)]
dropout: f32,
}
#[derive(Debug)]
pub struct GptJModelOutput {
pub last_hidden_state: Tensor,
pub hidden_states: Option<Vec<Tensor>>,
}
impl GptJModel {
pub fn new(config: GptJConfig) -> Result<Self> {
config.validate()?;
let wte = Embedding::new(config.vocab_size, config.n_embd, None)?;
let mut blocks = Vec::new();
for _ in 0..config.n_layer {
blocks.push(GptJBlock::new(&config)?);
}
let ln_f = LayerNorm::new(vec![config.n_embd], config.layer_norm_epsilon)?;
Ok(Self {
config,
wte,
blocks,
ln_f,
})
}
pub fn new_with_device(config: GptJConfig, device: Device) -> Result<Self> {
config.validate()?;
let wte = Embedding::new_with_device(config.vocab_size, config.n_embd, None, device)?;
let mut blocks = Vec::new();
for _ in 0..config.n_layer {
blocks.push(GptJBlock::new_with_device(&config, device)?);
}
let ln_f =
LayerNorm::new_with_device(vec![config.n_embd], config.layer_norm_epsilon, device)?;
Ok(Self {
config,
wte,
blocks,
ln_f,
})
}
#[cfg(all(target_os = "macos", feature = "metal"))]
pub fn weights_to_gpu(&mut self, device: &Device) -> Result<()> {
self.wte.weights_to_gpu(device)?;
for block in &mut self.blocks {
block.weights_to_gpu(device)?;
}
self.ln_f.weights_to_gpu(device)?;
Ok(())
}
#[cfg(all(feature = "cuda", any(target_os = "linux", target_os = "windows")))]
pub fn weights_to_gpu_cuda(&mut self, device: &Device) -> Result<()> {
self.wte.weights_to_gpu_cuda(device)?;
for block in &mut self.blocks {
block.weights_to_gpu_cuda(device)?;
}
self.ln_f.weights_to_gpu_cuda(device)?;
println!("✓ GptJModel: All layer weights cached on CUDA GPU");
Ok(())
}
}
impl GptJBlock {
fn new(config: &GptJConfig) -> Result<Self> {
let ln_1 = LayerNorm::new(vec![config.n_embd], config.layer_norm_epsilon)?;
let attn = GptJAttention::new(config)?;
let mlp = GptJMLP::new(config)?;
Ok(Self { ln_1, attn, mlp })
}
fn new_with_device(config: &GptJConfig, device: Device) -> Result<Self> {
let ln_1 =
LayerNorm::new_with_device(vec![config.n_embd], config.layer_norm_epsilon, device)?;
let attn = GptJAttention::new_with_device(config, device)?;
let mlp = GptJMLP::new_with_device(config, device)?;
Ok(Self { ln_1, attn, mlp })
}
pub fn parameter_count(&self) -> usize {
self.ln_1.parameter_count() + self.attn.parameter_count() + self.mlp.parameter_count()
}
#[cfg(all(target_os = "macos", feature = "metal"))]
pub fn weights_to_gpu(&mut self, device: &Device) -> Result<()> {
self.ln_1.weights_to_gpu(device)?;
self.attn.weights_to_gpu(device)?;
self.mlp.weights_to_gpu(device)?;
Ok(())
}
#[cfg(all(feature = "cuda", any(target_os = "linux", target_os = "windows")))]
pub fn weights_to_gpu_cuda(&mut self, device: &Device) -> Result<()> {
self.ln_1.weights_to_gpu_cuda(device)?;
self.attn.weights_to_gpu_cuda(device)?;
self.mlp.weights_to_gpu_cuda(device)?;
Ok(())
}
    fn forward(&self, hidden_states: Tensor) -> Result<Tensor> {
        // GPT-J's parallel residual: attention and MLP both read the same
        // LayerNorm output, and both results are added to the residual stream.
        let normed_hidden_states = self.ln_1.forward(hidden_states.clone())?;
        let attn_output = self.attn.forward(normed_hidden_states.clone())?;
        let mlp_output = self.mlp.forward(normed_hidden_states)?;
        let hidden_states = hidden_states.add(&attn_output)?;
        let hidden_states = hidden_states.add(&mlp_output)?;
        Ok(hidden_states)
    }
}
impl GptJAttention {
fn new(config: &GptJConfig) -> Result<Self> {
let head_dim = config.head_dim();
let rotary_emb = GptJRotaryEmbedding::new(
config.rotary_dim,
config.n_positions,
            10000.0,
        );
Ok(Self {
q_proj: Linear::new(config.n_embd, config.n_embd, false),
k_proj: Linear::new(config.n_embd, config.n_embd, false),
v_proj: Linear::new(config.n_embd, config.n_embd, false),
out_proj: Linear::new(config.n_embd, config.n_embd, false),
num_heads: config.n_head,
head_dim,
rotary_dim: config.rotary_dim,
dropout: config.attn_pdrop,
rotary_emb,
})
}
fn new_with_device(config: &GptJConfig, device: Device) -> Result<Self> {
let head_dim = config.head_dim();
let rotary_emb = GptJRotaryEmbedding::new(
config.rotary_dim,
config.n_positions,
            10000.0,
        );
Ok(Self {
q_proj: Linear::new_with_device(config.n_embd, config.n_embd, false, device),
k_proj: Linear::new_with_device(config.n_embd, config.n_embd, false, device),
v_proj: Linear::new_with_device(config.n_embd, config.n_embd, false, device),
out_proj: Linear::new_with_device(config.n_embd, config.n_embd, false, device),
num_heads: config.n_head,
head_dim,
rotary_dim: config.rotary_dim,
dropout: config.attn_pdrop,
rotary_emb,
})
}
pub fn parameter_count(&self) -> usize {
self.q_proj.parameter_count()
+ self.k_proj.parameter_count()
+ self.v_proj.parameter_count()
+ self.out_proj.parameter_count()
}
#[cfg(all(target_os = "macos", feature = "metal"))]
pub fn weights_to_gpu(&mut self, device: &Device) -> Result<()> {
self.q_proj.weights_to_gpu(device)?;
self.k_proj.weights_to_gpu(device)?;
self.v_proj.weights_to_gpu(device)?;
self.out_proj.weights_to_gpu(device)?;
Ok(())
}
#[cfg(all(feature = "cuda", any(target_os = "linux", target_os = "windows")))]
pub fn weights_to_gpu_cuda(&mut self, device: &Device) -> Result<()> {
self.q_proj.weights_to_gpu_cuda(device)?;
self.k_proj.weights_to_gpu_cuda(device)?;
self.v_proj.weights_to_gpu_cuda(device)?;
self.out_proj.weights_to_gpu_cuda(device)?;
Ok(())
}
    fn forward(&self, hidden_states: Tensor) -> Result<Tensor> {
        let q = self.q_proj.forward(hidden_states.clone())?;
        let k = self.k_proj.forward(hidden_states.clone())?;
        let v = self.v_proj.forward(hidden_states)?;
        let seq_len = q.shape()[1];
        let position_ids: Vec<usize> = (0..seq_len).collect();
        // Rotary embeddings are computed for q/k, but the scaled dot-product
        // attention itself is not wired up yet; this simplified path projects
        // the values directly.
        let (_q_rotated, _k_rotated) = self.rotary_emb.apply_rotary_emb(&q, &k, &position_ids)?;
        let output = self.out_proj.forward(v)?;
        Ok(output)
    }
#[allow(dead_code)]
fn apply_rotary_pos_emb(
&self,
q: &Tensor,
k: &Tensor,
position_ids: &[usize],
) -> Result<(Tensor, Tensor)> {
self.rotary_emb.apply_rotary_emb(q, k, position_ids)
}
}
impl GptJMLP {
fn new(config: &GptJConfig) -> Result<Self> {
let intermediate_size = 4 * config.n_embd;
Ok(Self {
fc_in: Linear::new(config.n_embd, intermediate_size, true),
fc_out: Linear::new(intermediate_size, config.n_embd, true),
activation: config.activation_function.clone(),
dropout: config.resid_pdrop,
})
}
fn new_with_device(config: &GptJConfig, device: Device) -> Result<Self> {
let intermediate_size = 4 * config.n_embd;
Ok(Self {
fc_in: Linear::new_with_device(config.n_embd, intermediate_size, true, device),
fc_out: Linear::new_with_device(intermediate_size, config.n_embd, true, device),
activation: config.activation_function.clone(),
dropout: config.resid_pdrop,
})
}
pub fn parameter_count(&self) -> usize {
self.fc_in.parameter_count() + self.fc_out.parameter_count()
}
#[cfg(all(target_os = "macos", feature = "metal"))]
pub fn weights_to_gpu(&mut self, device: &Device) -> Result<()> {
self.fc_in.weights_to_gpu(device)?;
self.fc_out.weights_to_gpu(device)?;
Ok(())
}
#[cfg(all(feature = "cuda", any(target_os = "linux", target_os = "windows")))]
pub fn weights_to_gpu_cuda(&mut self, device: &Device) -> Result<()> {
self.fc_in.weights_to_gpu_cuda(device)?;
self.fc_out.weights_to_gpu_cuda(device)?;
Ok(())
}
fn forward(&self, hidden_states: Tensor) -> Result<Tensor> {
let hidden_states = self.fc_in.forward(hidden_states)?;
let hidden_states = match self.activation.as_str() {
"gelu" | "gelu_new" => trustformers_core::ops::activations::gelu(&hidden_states)?,
"relu" => trustformers_core::ops::activations::relu(&hidden_states)?,
_ => {
return Err(trustformers_core::errors::TrustformersError::model_error(
format!("Unsupported activation function: {}", self.activation),
));
},
};
self.fc_out.forward(hidden_states)
}
}
impl Model for GptJModel {
type Config = GptJConfig;
type Input = TokenizedInput;
type Output = GptJModelOutput;
fn forward(&self, input: Self::Input) -> Result<Self::Output> {
let mut hidden_states = self.wte.forward(input.input_ids)?;
for block in &self.blocks {
hidden_states = block.forward(hidden_states)?;
}
let last_hidden_state = self.ln_f.forward(hidden_states)?;
Ok(GptJModelOutput {
last_hidden_state,
hidden_states: None,
})
}
fn load_pretrained(&mut self, _reader: &mut dyn Read) -> Result<()> {
Err(TrustformersError::not_implemented(
"Use load_from_path or load_from_huggingface for enhanced weight loading".to_string(),
))
}
fn get_config(&self) -> &Self::Config {
&self.config
}
    fn num_parameters(&self) -> usize {
        let mut total = self.wte.parameter_count();
        for block in &self.blocks {
            total += block.parameter_count();
        }
        total + self.ln_f.parameter_count()
    }
}
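/// GPT-J with a language-modeling head: the transformer backbone plus an
/// `lm_head` projection from hidden states to vocabulary logits. In GPT-J
/// the head is a separate matrix, not tied to the input embeddings.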
#[derive(Debug, Clone)]
pub struct GptJLMHeadModel {
transformer: GptJModel,
lm_head: Linear,
}
impl GptJLMHeadModel {
pub fn new(config: GptJConfig) -> Result<Self> {
let transformer = GptJModel::new(config.clone())?;
let lm_head = Linear::new(config.n_embd, config.vocab_size, false);
Ok(Self {
transformer,
lm_head,
})
}
pub fn new_with_device(config: GptJConfig, device: Device) -> Result<Self> {
let transformer = GptJModel::new_with_device(config.clone(), device)?;
let lm_head = Linear::new_with_device(config.n_embd, config.vocab_size, false, device);
Ok(Self {
transformer,
lm_head,
})
}
#[cfg(all(target_os = "macos", feature = "metal"))]
pub fn weights_to_gpu(&mut self, device: &Device) -> Result<()> {
self.transformer.weights_to_gpu(device)?;
self.lm_head.weights_to_gpu(device)?;
println!("✓ GptJLMHeadModel: All model weights uploaded to Metal GPU");
Ok(())
}
#[cfg(all(feature = "cuda", any(target_os = "linux", target_os = "windows")))]
pub fn weights_to_gpu_cuda(&mut self, device: &Device) -> Result<()> {
self.transformer.weights_to_gpu_cuda(device)?;
self.lm_head.weights_to_gpu_cuda(device)?;
println!("✓ GptJLMHeadModel: All model weights uploaded to CUDA GPU");
Ok(())
}
}
#[derive(Debug)]
pub struct GptJLMHeadOutput {
pub logits: Tensor,
pub hidden_states: Option<Tensor>,
}
impl Model for GptJLMHeadModel {
type Config = GptJConfig;
type Input = TokenizedInput;
type Output = GptJLMHeadOutput;
fn forward(&self, input: Self::Input) -> Result<Self::Output> {
let transformer_output = self.transformer.forward(input)?;
let logits = self.lm_head.forward(transformer_output.last_hidden_state.clone())?;
Ok(GptJLMHeadOutput {
logits,
hidden_states: Some(transformer_output.last_hidden_state),
})
}
fn load_pretrained(&mut self, reader: &mut dyn Read) -> Result<()> {
self.transformer.load_pretrained(reader)
}
fn get_config(&self) -> &Self::Config {
self.transformer.get_config()
}
fn num_parameters(&self) -> usize {
self.transformer.num_parameters() + self.lm_head.parameter_count()
}
}
impl GptJLMHeadModel {
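    /// Loads weights from a local checkpoint using HuggingFace GPT-J tensor
    /// names (`transformer.wte.weight`, `transformer.h.{i}.attn.q_proj.weight`,
    /// ..., `lm_head.weight`); tensors that are absent are skipped so partial
    /// checkpoints still load.
    ///
    /// A minimal usage sketch (hypothetical path, ignored by doctests):
    /// ```ignore
    /// let mut model = GptJLMHeadModel::new(GptJConfig::gpt_j_6b())?;
    /// model.load_from_path("/path/to/gpt-j-6b")?;
    /// ```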
pub fn load_from_path(&mut self, model_path: impl AsRef<std::path::Path>) -> Result<()> {
use crate::weight_loading::{auto_create_loader, WeightLoadingConfig};
let config = WeightLoadingConfig {
lazy_loading: true,
memory_mapped: false,
..Default::default()
};
let mut loader = auto_create_loader(model_path, Some(config))?;
if let Ok(embed_weights) = loader.load_tensor("transformer.wte.weight") {
self.transformer.wte.set_weight(embed_weights)?;
}
for (i, block) in self.transformer.blocks.iter_mut().enumerate() {
let attn_prefix = format!("transformer.h.{}.attn", i);
if let Ok(q_weight) = loader.load_tensor(&format!("{}.q_proj.weight", attn_prefix)) {
block.attn.q_proj.set_weight(q_weight)?;
}
if let Ok(k_weight) = loader.load_tensor(&format!("{}.k_proj.weight", attn_prefix)) {
block.attn.k_proj.set_weight(k_weight)?;
}
if let Ok(v_weight) = loader.load_tensor(&format!("{}.v_proj.weight", attn_prefix)) {
block.attn.v_proj.set_weight(v_weight)?;
}
if let Ok(o_weight) = loader.load_tensor(&format!("{}.out_proj.weight", attn_prefix)) {
block.attn.out_proj.set_weight(o_weight)?;
}
let mlp_prefix = format!("transformer.h.{}.mlp", i);
if let Ok(fc_in_weight) = loader.load_tensor(&format!("{}.fc_in.weight", mlp_prefix)) {
block.mlp.fc_in.set_weight(fc_in_weight)?;
}
if let Ok(fc_out_weight) = loader.load_tensor(&format!("{}.fc_out.weight", mlp_prefix))
{
block.mlp.fc_out.set_weight(fc_out_weight)?;
}
if let Ok(ln_weight) = loader.load_tensor(&format!("transformer.h.{}.ln_1.weight", i)) {
block.ln_1.set_weight(ln_weight)?;
}
if let Ok(ln_bias) = loader.load_tensor(&format!("transformer.h.{}.ln_1.bias", i)) {
block.ln_1.set_bias(ln_bias)?;
}
}
if let Ok(norm_weight) = loader.load_tensor("transformer.ln_f.weight") {
self.transformer.ln_f.set_weight(norm_weight)?;
}
if let Ok(norm_bias) = loader.load_tensor("transformer.ln_f.bias") {
self.transformer.ln_f.set_bias(norm_bias)?;
}
if let Ok(lm_head_weight) = loader.load_tensor("lm_head.weight") {
self.lm_head.set_weight(lm_head_weight)?;
}
Ok(())
}
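    /// Loads a model by name, reusing the local HuggingFace hub cache
    /// (`HUGGINGFACE_HUB_CACHE`, then `$HF_HOME/hub`, then
    /// `~/.cache/huggingface/hub`) and downloading the files first if the
    /// model is not cached.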
pub fn load_from_huggingface(&mut self, model_name: &str) -> Result<()> {
        // Resolve the HuggingFace hub cache: explicit HUGGINGFACE_HUB_CACHE,
        // then $HF_HOME/hub, then the default ~/.cache/huggingface/hub.
        let cache_dir = std::env::var("HUGGINGFACE_HUB_CACHE").unwrap_or_else(|_| {
            let hf_home = std::env::var("HF_HOME").unwrap_or_else(|_| {
                std::env::var("HOME").unwrap_or_else(|_| ".".to_string()) + "/.cache/huggingface"
            });
            hf_home + "/hub"
        });
let model_path = std::path::Path::new(&cache_dir)
.join(format!("models--{}", model_name.replace("/", "--")));
if model_path.exists() {
self.load_from_path(&model_path)
} else {
self.download_from_huggingface_hub(model_name, &model_path)?;
self.load_from_path(&model_path)
}
}
fn download_from_huggingface_hub(
&self,
model_name: &str,
model_path: &std::path::Path,
) -> Result<()> {
use std::process::Command;
println!(
"Downloading model {} from HuggingFace Hub to {:?}",
model_name, model_path
);
std::fs::create_dir_all(model_path).map_err(|e| {
TrustformersError::io_error(format!("Failed to create model directory: {}", e))
})?;
        let essential_files = vec![
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "pytorch_model.bin",
            "model.safetensors",
        ];
let base_url = format!("https://huggingface.co/{}/resolve/main", model_name);
        let mut weights_available = false;
        for file_name in &essential_files {
            let file_url = format!("{}/{}", base_url, file_name);
            let file_path = model_path.join(file_name);
            println!("Attempting to download {}", file_url);
            let file_path_str = file_path.to_str().ok_or_else(|| {
                TrustformersError::io_error("Invalid file path encoding".to_string())
            })?;
            // Try curl first, falling back to wget if curl is missing or fails.
            let mut downloaded = false;
            let curl_result = Command::new("curl")
                .args(["-L", "-f", "-o", file_path_str, &file_url])
                .output();
            match curl_result {
                Ok(output) if output.status.success() => {
                    println!("Successfully downloaded {}", file_name);
                    downloaded = true;
                },
                Ok(output) => {
                    eprintln!(
                        "Failed to download {} with curl: {}",
                        file_name,
                        String::from_utf8_lossy(&output.stderr)
                    );
                },
                Err(e) => {
                    eprintln!("curl not available: {}", e);
                },
            }
            if !downloaded {
                let wget_result =
                    Command::new("wget").args(["-O", file_path_str, &file_url]).output();
                match wget_result {
                    Ok(output) if output.status.success() => {
                        println!("Successfully downloaded {} with wget", file_name);
                        downloaded = true;
                    },
                    Ok(output) => {
                        eprintln!(
                            "Failed to download {} with wget: {}",
                            file_name,
                            String::from_utf8_lossy(&output.stderr)
                        );
                    },
                    Err(e) => {
                        eprintln!("wget not available: {}", e);
                    },
                }
            }
            if downloaded {
                if matches!(file_name, &"pytorch_model.bin" | &"model.safetensors") {
                    weights_available = true;
                }
                continue;
            }
            // config.json is always required; for weights, either format
            // suffices, so a missing pytorch_model.bin is acceptable when
            // model.safetensors downloads.
            if *file_name == "config.json" {
                return Err(TrustformersError::io_error(format!(
                    "Failed to download essential file {} for model {}. Please ensure curl or wget is installed and you have internet access.",
                    file_name, model_name
                )));
            }
        }
        if !weights_available {
            return Err(TrustformersError::io_error(format!(
                "Failed to download model weights (pytorch_model.bin or model.safetensors) for model {}.",
                model_name
            )));
        }
println!(
"Successfully downloaded model {} from HuggingFace Hub",
model_name
);
Ok(())
}
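    /// Like [`load_from_path`](Self::load_from_path), but enables
    /// memory-mapped lazy loading so tensors are paged in on demand instead
    /// of being read eagerly.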
pub fn load_with_lazy_loading(
&mut self,
model_path: impl AsRef<std::path::Path>,
) -> Result<()> {
use crate::weight_loading::{auto_create_loader, WeightLoadingConfig};
let config = WeightLoadingConfig {
lazy_loading: true,
memory_mapped: true,
streaming: false,
..Default::default()
};
let mut loader = auto_create_loader(model_path, Some(config))?;
if let Ok(embed_weights) = loader.load_tensor("transformer.wte.weight") {
self.transformer.wte.set_weight(embed_weights)?;
}
for (i, block) in self.transformer.blocks.iter_mut().enumerate() {
let attn_prefix = format!("transformer.h.{}.attn", i);
if let Ok(q_weight) = loader.load_tensor(&format!("{}.q_proj.weight", attn_prefix)) {
block.attn.q_proj.set_weight(q_weight)?;
}
if let Ok(k_weight) = loader.load_tensor(&format!("{}.k_proj.weight", attn_prefix)) {
block.attn.k_proj.set_weight(k_weight)?;
}
if let Ok(v_weight) = loader.load_tensor(&format!("{}.v_proj.weight", attn_prefix)) {
block.attn.v_proj.set_weight(v_weight)?;
}
if let Ok(o_weight) = loader.load_tensor(&format!("{}.out_proj.weight", attn_prefix)) {
block.attn.out_proj.set_weight(o_weight)?;
}
let mlp_prefix = format!("transformer.h.{}.mlp", i);
if let Ok(fc_in_weight) = loader.load_tensor(&format!("{}.fc_in.weight", mlp_prefix)) {
block.mlp.fc_in.set_weight(fc_in_weight)?;
}
if let Ok(fc_out_weight) = loader.load_tensor(&format!("{}.fc_out.weight", mlp_prefix))
{
block.mlp.fc_out.set_weight(fc_out_weight)?;
}
if let Ok(ln_weight) = loader.load_tensor(&format!("transformer.h.{}.ln_1.weight", i)) {
block.ln_1.set_weight(ln_weight)?;
}
if let Ok(ln_bias) = loader.load_tensor(&format!("transformer.h.{}.ln_1.bias", i)) {
block.ln_1.set_bias(ln_bias)?;
}
}
if let Ok(norm_weight) = loader.load_tensor("transformer.ln_f.weight") {
self.transformer.ln_f.set_weight(norm_weight)?;
}
if let Ok(norm_bias) = loader.load_tensor("transformer.ln_f.bias") {
self.transformer.ln_f.set_bias(norm_bias)?;
}
if let Ok(lm_head_weight) = loader.load_tensor("lm_head.weight") {
self.lm_head.set_weight(lm_head_weight)?;
}
Ok(())
}
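    /// Autoregressive decoding: each step runs a forward pass, takes the
    /// logits of the last position, applies temperature scaling, optional
    /// top-k and top-p (nucleus) filtering, and samples the next token.
    /// Generation stops at `max_length` or at the end-of-text token (50256).
    ///
    /// A minimal usage sketch (hypothetical prompt ids, ignored by doctests):
    /// ```ignore
    /// let tokens = model.generate(vec![464, 3290], 32, 0.8, Some(50), Some(0.95))?;
    /// ```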
pub fn generate(
&self,
input_ids: Vec<u32>,
max_length: usize,
temperature: f32,
top_k: Option<usize>,
top_p: Option<f32>,
) -> Result<Vec<u32>> {
        let mut generated = input_ids;
while generated.len() < max_length {
let input = TokenizedInput {
input_ids: generated.clone(),
attention_mask: vec![1u8; generated.len()],
token_type_ids: None,
special_tokens_mask: None,
offset_mapping: None,
overflowing_tokens: None,
};
let output = self.forward(input)?;
let logits = output.logits;
let last_logits = match &logits {
Tensor::F32(arr) => {
let shape = arr.shape();
if shape.len() != 3 {
return Err(tensor_op_error(
"tensor_operation",
"Expected 3D tensor for logits".to_string(),
));
}
let seq_len = shape[1];
let vocab_size = shape[2];
let slice = arr.slice(s![0, seq_len - 1, ..]);
ArrayD::from_shape_vec(IxDyn(&[vocab_size]), slice.iter().cloned().collect())
.map_err(|e| {
TrustformersError::tensor_op_error(
&format!("Failed to reshape tensor: {}", e),
"tensor_reshape",
)
})?
},
_ => {
return Err(tensor_op_error(
"tensor_operation",
"Unsupported tensor type for generation".to_string(),
))
},
};
let scaled_logits = if temperature != 1.0 {
last_logits.mapv(|x| x / temperature)
} else {
last_logits
};
let filtered_logits = if let Some(k) = top_k {
apply_top_k_filtering_gpt_j(scaled_logits, k)?
} else {
scaled_logits
};
let final_logits = if let Some(p) = top_p {
apply_top_p_filtering_gpt_j(filtered_logits, p)?
} else {
filtered_logits
};
let next_token = sample_from_logits_gpt_j(final_logits)?;
generated.push(next_token);
            // 50256 is the GPT-2/GPT-J end-of-text token id.
            if next_token == 50256 {
                break;
            }
}
Ok(generated)
}
pub fn generate_greedy(&self, input_ids: Vec<u32>, max_length: usize) -> Result<Vec<u32>> {
self.generate(input_ids, max_length, 1.0, Some(1), None)
}
}
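/// Top-k filtering: keeps the `k` largest logits and masks the rest with
/// negative infinity so they receive zero probability after softmax.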
fn apply_top_k_filtering_gpt_j(logits: ArrayD<f32>, k: usize) -> Result<ArrayD<f32>> {
let mut result = logits.clone();
let mut indices_and_values: Vec<(usize, f32)> =
logits.iter().enumerate().map(|(idx, &val)| (idx, val)).collect();
indices_and_values.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
for (idx, _) in indices_and_values.iter().skip(k) {
result[*idx] = f32::NEG_INFINITY;
}
Ok(result)
}
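/// Top-p (nucleus) filtering: sorts tokens by probability and keeps the
/// smallest prefix whose cumulative probability exceeds `p`, masking the
/// rest with negative infinity.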
fn apply_top_p_filtering_gpt_j(logits: ArrayD<f32>, p: f32) -> Result<ArrayD<f32>> {
let probs = softmax_gpt_j(logits.clone())?;
let mut indices_and_probs: Vec<(usize, f32)> =
probs.iter().enumerate().map(|(idx, &prob)| (idx, prob)).collect();
indices_and_probs.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
let mut cumsum = 0.0;
let mut cutoff_idx = indices_and_probs.len();
for (i, (_, prob)) in indices_and_probs.iter().enumerate() {
cumsum += prob;
if cumsum > p {
cutoff_idx = i + 1;
break;
}
}
let mut result = logits.clone();
for (idx, _) in indices_and_probs.iter().skip(cutoff_idx) {
result[*idx] = f32::NEG_INFINITY;
}
Ok(result)
}
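/// Softmaxes the logits and draws one token index from the resulting
/// categorical distribution.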
fn sample_from_logits_gpt_j(logits: ArrayD<f32>) -> Result<u32> {
use scirs2_core::random::*;
let probs = softmax_gpt_j(logits)?;
let weights: Vec<f32> = probs.iter().copied().collect();
let dist = WeightedIndex::new(weights).map_err(|e| {
TrustformersError::model_error(format!("Failed to create distribution: {}", e))
})?;
    let mut rng = thread_rng();
    Ok(rng.sample(&dist) as u32)
}
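/// Numerically stable softmax: subtracts the maximum logit before
/// exponentiating to avoid overflow.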
fn softmax_gpt_j(logits: ArrayD<f32>) -> Result<ArrayD<f32>> {
let max_val = logits.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
let exp_vals = logits.mapv(|x| (x - max_val).exp());
let sum: f32 = exp_vals.iter().sum();
if sum <= 0.0 {
return Err(TrustformersError::model_error(
"Invalid softmax computation".to_string(),
));
}
Ok(exp_vals / sum)
}
#[cfg(test)]
mod tests {
use super::*;
use trustformers_core::traits::Config;
fn tiny_config() -> GptJConfig {
GptJConfig {
vocab_size: 64,
n_embd: 16,
n_layer: 1,
n_head: 2,
n_positions: 32,
            rotary_dim: 4,
            activation_function: "gelu_new".to_string(),
resid_pdrop: 0.0,
embd_pdrop: 0.0,
attn_pdrop: 0.0,
layer_norm_epsilon: 1e-5,
initializer_range: 0.02,
use_cache: false,
bos_token_id: 50256,
eos_token_id: 50256,
model_type: "gptj".to_string(),
}
}
#[test]
fn test_gpt_j_6b_config_values() {
let cfg = GptJConfig::gpt_j_6b();
assert_eq!(cfg.n_embd, 4096);
assert_eq!(cfg.n_head, 16);
assert_eq!(cfg.n_layer, 28);
assert_eq!(cfg.rotary_dim, 64);
assert_eq!(cfg.vocab_size, 50400);
}
#[test]
fn test_gpt_j_config_head_dim() {
let cfg = GptJConfig::gpt_j_6b();
assert_eq!(cfg.head_dim(), 256);
}
#[test]
fn test_gpt_j_config_rotary_dim_less_than_head_dim() {
let cfg = GptJConfig::gpt_j_6b();
assert!(
cfg.rotary_dim < cfg.head_dim(),
"rotary_dim should be a fraction of head_dim for partial RoPE"
);
}
#[test]
fn test_gpt_j_config_rotary_dim_is_64() {
let cfg = GptJConfig::gpt_j_6b();
assert_eq!(cfg.rotary_dim, 64, "GPT-J-6B uses rotary_dim=64");
}
#[test]
fn test_gpt_j_config_validate_ok() {
let cfg = GptJConfig::gpt_j_6b();
assert!(cfg.validate().is_ok(), "gpt_j_6b config should validate");
}
#[test]
fn test_gpt_j_config_validate_bad_heads() {
let cfg = GptJConfig {
            n_embd: 15,
            n_head: 3,
..GptJConfig::gpt_j_6b()
};
assert!(
cfg.validate().is_err(),
"n_embd not divisible by n_head should fail"
);
}
#[test]
fn test_gpt_j_config_validate_rotary_too_large() {
let cfg = GptJConfig {
n_embd: 16,
n_head: 2,
            rotary_dim: 16,
            ..tiny_config()
};
assert!(cfg.validate().is_err(), "rotary_dim > head_dim should fail");
}
#[test]
fn test_gpt_j_config_architecture_name() {
let cfg = GptJConfig::default();
assert_eq!(cfg.architecture(), "GPT-J");
}
#[test]
fn test_gpt_j_config_from_pretrained_name() {
let cfg = GptJConfig::from_pretrained_name("gpt-j-6b");
assert_eq!(cfg.n_embd, 4096);
}
#[test]
fn test_gptj_rope_construction() {
let rope = GptJRotaryEmbedding::new(64, 2048, 10000.0);
assert_eq!(rope.dim, 64);
assert_eq!(rope.max_seq_len, 2048);
assert!((rope.base - 10000.0).abs() < 1e-3);
}
#[test]
fn test_gptj_rope_apply_preserves_shape() {
use scirs2_core::ndarray::{ArrayD, IxDyn};
use trustformers_core::tensor::Tensor;
let rope = GptJRotaryEmbedding::new(4, 32, 10000.0);
        let q_data = vec![0.1f32; 2 * 4];
        let k_data = vec![0.2f32; 2 * 4];
let q_arr = ArrayD::from_shape_vec(IxDyn(&[2, 4]), q_data).expect("create q");
let k_arr = ArrayD::from_shape_vec(IxDyn(&[2, 4]), k_data).expect("create k");
let q = Tensor::F32(q_arr);
let k = Tensor::F32(k_arr);
let (rq, rk) =
rope.apply_rotary_emb(&q, &k, &[0, 1]).expect("apply_rotary_emb should succeed");
assert_eq!(
rq.shape(),
q.shape(),
"RoPE output q shape must match input"
);
assert_eq!(
rk.shape(),
k.shape(),
"RoPE output k shape must match input"
);
}
#[test]
fn test_gptj_model_construction() {
let cfg = tiny_config();
let model = GptJModel::new(cfg);
assert!(model.is_ok(), "GptJModel should construct");
}
#[test]
fn test_gptj_model_num_parameters_positive() {
use trustformers_core::traits::Model;
let cfg = tiny_config();
let model = GptJModel::new(cfg).expect("GptJModel should construct");
assert!(
model.num_parameters() > 0,
"model should have positive parameter count"
);
}
#[test]
fn test_gptj_model_blocks_count() {
let cfg = tiny_config();
let model = GptJModel::new(cfg.clone()).expect("GptJModel should construct");
assert_eq!(
model.blocks.len(),
cfg.n_layer,
"blocks count must match n_layer"
);
}
#[test]
fn test_gptj_lm_head_construction() {
let cfg = tiny_config();
let model = GptJLMHeadModel::new(cfg);
assert!(model.is_ok(), "GptJLMHeadModel should construct");
}
#[test]
fn test_gptj_lm_head_num_params_larger_than_base() {
use trustformers_core::traits::Model;
let cfg = tiny_config();
let base = GptJModel::new(cfg.clone()).expect("GptJModel construct");
let lm = GptJLMHeadModel::new(cfg).expect("GptJLMHeadModel construct");
assert!(
lm.num_parameters() > base.num_parameters(),
"LM head model should have more params than base"
);
}
#[test]
fn test_gptj_model_forward_output_shape() {
use trustformers_core::traits::{Model, TokenizedInput};
let cfg = tiny_config();
let model = GptJModel::new(cfg.clone()).expect("GptJModel should construct");
let input = TokenizedInput {
input_ids: vec![1u32, 2, 3],
attention_mask: vec![1u8, 1, 1],
token_type_ids: None,
special_tokens_mask: None,
offset_mapping: None,
overflowing_tokens: None,
};
let output = model.forward(input).expect("GptJModel forward should succeed");
let shape = output.last_hidden_state.shape();
assert!(!shape.is_empty(), "output shape should not be empty");
let last_dim = *shape.last().expect("shape should have dimensions");
assert_eq!(last_dim, cfg.n_embd, "last dim must match n_embd");
}
#[test]
fn test_gptj_block_parallel_attn_mlp() {
let cfg = tiny_config();
let model = GptJModel::new(cfg).expect("GptJModel should construct");
        assert!(
            model.blocks.iter().all(|b| b.parameter_count() > 0),
            "every block should have parameters"
        );
}
#[test]
    fn test_gptj_6b_uses_no_dropout() {
let cfg = GptJConfig::gpt_j_6b();
assert_eq!(cfg.resid_pdrop, 0.0, "GPT-J-6B uses no residual dropout");
assert_eq!(cfg.attn_pdrop, 0.0, "GPT-J-6B uses no attention dropout");
}
#[test]
fn test_gptj_n_positions_is_2048() {
let cfg = GptJConfig::gpt_j_6b();
assert_eq!(cfg.n_positions, 2048, "GPT-J context length should be 2048");
}
}