Expand description
§Hermes LLM
A Rust library for training and running Large Language Models from scratch.
§Features
- Model Architecture Language (MAL): Define any transformer architecture using a composable DSL
- Training: Distributed training with NCCL, gradient accumulation, checkpointing
- Generation: Text generation with temperature and top-k sampling
- Tokenization: BPE tokenizer training and inference
- DPO: Direct Preference Optimization for preference-based alignment (an RL-free alternative to RLHF)
§Quick Start
ⓘ
use hermes_llm::{Transformer, Trainer, get_builtin_model};
// Load a predefined model architecture
let model_def = get_builtin_model("tiny").unwrap();
// Or parse from MAL file
let model_def = hermes_llm::parse_mal_file("model.mal").unwrap();
§Model Architecture Language (MAL)
MAL allows defining transformer architectures in a readable, composable format:
attention my_attn {
num_heads: 32
num_kv_heads: 8
}
ffn my_ffn {
hidden_dim: 4096
activation: swiglu
}
block my_block {
attention: my_attn
ffn: my_ffn
norm: rmsnorm { eps: 1e-5 }
norm_position: pre
}
model my_model {
vocab_size: 32000
hidden_size: 1024
num_layers: 32
block: my_block
}
§Re-exports
pub use config::TrainingConfig;
pub use model::Transformer;
pub use training::Trainer;
pub use training::TrainingState;
pub use training::create_progress_bar;
pub use generate::TextGenerator;
pub use generate::get_lr_with_warmup;
pub use distributed::DistributedConfig;
pub use distributed::NcclCommunicator;
pub use mal::Activation;
pub use mal::AttentionDef;
pub use mal::BlockDef;
pub use mal::FfnDef;
pub use mal::MalFile;
pub use mal::ModelDef;
pub use mal::NormPosition;
pub use mal::NormType;
pub use mal::PositionEncoding;
pub use mal::get_builtin_model;
pub use mal::get_wellknown_mal;
pub use mal::list_wellknown_models;
pub use mal::parse_mal;
pub use mal::parse_mal_file;
pub use mal::parse_mal_full;
pub use data::DataLoader;
pub use data::Dataset;
pub use tokenizer::BPETrainer;
pub use tokenizer::Tokenizer;