realizar 0.8.4

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
//! Token generation methods for CUDA-accelerated inference
//!
//! This module contains all generation loop implementations:
//! - `generate_cuda`: Basic CUDA generation
//! - `generate_cuda_with_cache`: Generation with KV cache
//! - `generate_full_cuda_with_cache`: Full GPU generation with cache
//! - `generate_gpu_resident`: GPU-resident generation (minimal transfers)
//! - `generate_gpu_resident_streaming`: Streaming generation with callback
//! - `generate_batch_gpu_resident`: Batch generation for multiple prompts

// Model definition lives two modules up; the quantized-model types and the
// generation config come from the parent module.
use super::super::model::OwnedQuantizedModel;
use super::{OwnedQuantizedKVCache, OwnedQuantizedModelCuda, QuantizedGenerateConfig};
use crate::error::{RealizarError, Result};

// NOTE(review): textual `include!` (not `mod`) splices these files directly into
// this module's scope — presumably so the generation impls in the included files
// can extend the same `impl` context / share this module's private items without
// re-declaring visibility. The included files are not visible here; confirm they
// contain the generation methods listed in the module docs above.
include!("generate_1.rs");
include!("generate_batched_streaming.rs");