hot-loop 0.2.1

Run GGUF chat models in pure Rust, using the Candle backend
Documentation
<div align="center">
    <h1>⚡ Hot-Loop</h1>
    <p><strong>
    High-Level 🦀 Pure-Rust Crate for Running GGUF Chat Models,
    Using the Candle 🕯️ Backend
    </strong></p>
</div>

---

<div align="center">
    <p><strong>
    This project is currently in beta; the API is subject to change.
    </strong></p>
</div>

---

[![Crates.io](https://img.shields.io/crates/v/hot-loop.svg)](https://crates.io/crates/hot-loop)

# Quick Start


### Install: `cargo add hot-loop`


```rust
use std::fs::{File, read};
use std::io::{stdout, Write};

use hot_loop::{
    Model,
    models::Qwen3,
    Device,
    Error,
};

fn main() -> Result<(), Error> {
    let mut model_file = File::open("models/Qwen3-4B-it-Q4_K_M.gguf").unwrap();
    let tokenizer_bytes = read("models/tokenizer.json").unwrap();

    // the loaded model is read-only and can be shared across sessions
    let model = Qwen3::load(&mut model_file, &tokenizer_bytes, Device::Cpu)?;

    let mut session = model.new_session();
    // and more sessions!
    // let mut session2 = model.new_session();
    // let mut session3 = model.new_session();

    let mut generate = session.generate("Hello!")?;

    while let Some(chunk) = generate.next_chunk()? {
        print!("{chunk}");
        stdout().flush().unwrap();
    }

    Ok(())
}
```

---

# Boost Your Code 🚀🦀


## Use `target-cpu=native` to boost generation speed!


```
your-project/
├── .cargo/
│   └── config.toml
├── src/
│   └── main.rs
└── Cargo.toml
```

## .cargo/config.toml:


```toml
[build]
rustflags = ["-C", "target-cpu=native"]
```

## Cargo.toml:


```toml
[profile.release]
lto = "fat"
opt-level = 3
strip = true
codegen-units = 1
panic = "abort"
```

---

# Enable CUDA 🔋


## Cargo.toml:

```toml
[dependencies]
hot-loop = { version = "...", features = ["cuda"] }
```

## Rust Code

```rust
Qwen3::load(..., ..., Device::new_cuda(0)?)?
```

---

# Typing


```rust
use std::fs::{File, read};

use hot_loop::{
    models::Qwen3,
    session::{Session, Generation},
    Model, // trait
    Device,
    Error
};

fn func1(_model: &impl Model) {}

fn func2(_session: &mut Session<impl Model>) {}

fn func3(_generation: &mut Generation<impl Model>) {}

fn main() -> Result<(), Error> {
    let mut model_file = File::open("Qwen3.gguf").unwrap();
    let tokenizer_bytes = read("tokenizer.json").unwrap();

    let model = Qwen3::load(&mut model_file, &tokenizer_bytes, Device::Cpu)?;
    func1(&model);

    let mut session: Session<Qwen3> = model.new_session();
    func2(&mut session);

    let mut generation: Generation<Qwen3> = session.generate("Hello")?;
    func3(&mut generation);

    Ok(())
}
```

---

# Session Settings


```rust
use std::fs::{File, read};

use hot_loop::{
    Model,
    models::Qwen3,
    Device,
    Error,
    settings::{Settings, Seed},
};

fn main() -> Result<(), Error> {
    let mut model_file = File::open("Qwen3.gguf").unwrap();
    let tokenizer_bytes = read("tokenizer.json").unwrap();

    let model = Qwen3::load(&mut model_file, &tokenizer_bytes, Device::Cpu)?;

    let settings = Settings::default()
        .with_temperature(0.7)
        .with_sample_len(512)
        .with_seed(Seed::Custom(12345)) // or Seed::Default
        .with_top_p(Some(0.5))
        .with_top_k(Some(40))
        .with_repeat_penalty(1.1)
        .with_repeat_last_n(64);

    let mut session = model.new_session()
        .with_settings(settings); // set settings
    
    // OR
    
    session.set_settings(Settings::default()); // set settings

    Ok(())
}
```

---

# Session System-prompt


```rust
use std::fs::{File, read};

use hot_loop::{
    Model,
    models::Qwen3,
    Device,
    Error,
};

fn main() -> Result<(), Error> {
    let mut model_file = File::open("Qwen3.gguf").unwrap();
    let tokenizer_bytes = read("tokenizer.json").unwrap();

    let model = Qwen3::load(&mut model_file, &tokenizer_bytes, Device::Cpu)?;

    let sys_prompt = "always answer in json!";

    let mut session = model.new_session()
        .with_system_prompt(sys_prompt)?; // set system prompt

    // OR
    session.set_system_prompt_and_clear_history(sys_prompt)?;
    
    
    session.clear_system_prompt_and_history(); // clear system prompt

    Ok(())
}
```

---

# Session History


```rust
use std::fs::{File, read};

use hot_loop::{
    Model,
    models::Qwen3,
    Device,
    Error,
};

fn main() -> Result<(), Error> {
    let mut model_file = File::open("Qwen3.gguf").unwrap();
    let tokenizer_bytes = read("tokenizer.json").unwrap();

    let model = Qwen3::load(&mut model_file, &tokenizer_bytes, Device::Cpu)?;
    let mut session = model.new_session();

    let questions = ["Hello!", "what can you do?", "ok"];

    for question in questions {
        let mut generate = session.generate(question)?;
        while let Some(_) = generate.next_chunk()? {
            // model answers
        }
    }

    session.clear_history(); // clear history

    Ok(())
}
```

---

# Thread Safety

### Run generation in parallel across independent sessions


```rust
use std::fs::{File, read};
use std::thread;
use hot_loop::{
    Model,
    models::{Qwen3},
    session::Session,
    Device,
    Error,
};

fn generate<M: Model>(mut session: Session<M>, prompt: &str) -> Result<String, Error> {
    let mut answer = String::new();
    let mut generate = session.generate(prompt)?;

    while let Some(chunk) = generate.next_chunk()? {
        answer.push_str(&chunk);
    }
    Ok(answer)
}

// one question per thread
const QUESTIONS: [&str; 4] = [
    "hello!",
    "what can you do?",
    "what is the book war and peace about?",
    "make a plan for my trip to Shanghai"
];

fn main() -> Result<(), Error> {
    let mut model_file = File::open("models/Qwen3-4B-it-Q4_K_M.gguf").unwrap();
    let tokenizer_bytes = read("models/tokenizer.json").unwrap();

    let model = Qwen3::load(&mut model_file, &tokenizer_bytes, Device::Cpu)?;

    let mut handles = Vec::new();

    for prompt in QUESTIONS {
        let session = model.new_session();

        let handle = thread::spawn(move || {
            generate(session, prompt)
        });
        handles.push(handle);
    }

    for handle in handles {
        let answer = handle.join().unwrap()?;
        println!("{answer}");
    }

    Ok(())
}
```

---