use std::collections::HashMap;
use std::sync::Arc;
use trustformers_core::traits::Tokenizer;
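/// Tokenizer wrapper that offloads batch tokenization to a GPU backend when
/// one is available, falling back to the wrapped CPU tokenizer otherwise.
///
/// Note: the GPU paths in this module are currently simulated. Device
/// handles, kernel function pointers, and memory transfers are placeholders
/// that model the intended control flow without issuing real driver calls.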
#[derive(Clone)]
pub struct GpuTokenizer {
tokenizer: Arc<dyn Tokenizer>,
gpu_context: Option<GpuContext>,
config: GpuTokenizerConfig,
vocab_cache: Option<GpuVocabularyCache>,
batch_config: BatchProcessingConfig,
}
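// Manual Debug impl: the wrapped `dyn Tokenizer` carries no `Debug` bound,
// so that field is omitted from the output.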
impl std::fmt::Debug for GpuTokenizer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("GpuTokenizer")
.field("gpu_context", &self.gpu_context)
.field("config", &self.config)
.field("vocab_cache", &self.vocab_cache)
.field("batch_config", &self.batch_config)
.finish_non_exhaustive()
}
}
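/// Per-device state for a selected backend: an optional async stream handle,
/// a memory pool, and a cache of pre-configured tokenization kernels.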
#[derive(Debug, Clone)]
pub struct GpuContext {
pub device_id: u32,
pub backend: GpuBackend,
pub stream: Option<u64>,
pub memory_pool: Option<GpuMemoryPool>,
pub kernel_cache: HashMap<String, GpuKernel>,
pub compute_capability: Option<(u32, u32)>,
pub memory_bandwidth: Option<f32>,
}
#[derive(Debug, Clone, PartialEq)]
pub enum GpuBackend {
Cuda,
Rocm,
OneApi,
OpenCL,
Vulkan,
}
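/// Tuning knobs for GPU tokenization. `Default` enables the GPU, auto-detects
/// the best available backend, and picks conservative batch and sequence limits.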
#[derive(Debug, Clone)]
pub struct GpuTokenizerConfig {
pub enable_gpu: bool,
pub backend: GpuBackend,
pub device_id: u32,
pub batch_size: usize,
pub max_sequence_length: usize,
pub use_pinned_memory: bool,
pub enable_async: bool,
pub stream_parallelism: usize,
pub memory_optimization: MemoryOptimization,
pub kernel_optimization: KernelOptimization,
pub enable_tensor_cores: bool,
pub enable_mixed_precision: bool,
}
impl Default for GpuTokenizerConfig {
fn default() -> Self {
Self {
enable_gpu: true,
backend: Self::detect_best_backend(),
device_id: 0,
batch_size: 32,
max_sequence_length: 512,
use_pinned_memory: true,
enable_async: true,
stream_parallelism: 4,
memory_optimization: MemoryOptimization::Balanced,
kernel_optimization: KernelOptimization::Aggressive,
enable_tensor_cores: true,
enable_mixed_precision: false,
}
}
}
impl GpuTokenizerConfig {
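/// Picks a backend by probing environment variables and well-known install
/// paths (heuristics only; no driver is loaded). Vulkan is the unconditional
/// fallback when nothing else is detected.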
pub fn detect_best_backend() -> GpuBackend {
if Self::is_cuda_available() {
GpuBackend::Cuda
} else if Self::is_rocm_available() {
GpuBackend::Rocm
} else if Self::is_oneapi_available() {
GpuBackend::OneApi
} else if Self::is_opencl_available() {
GpuBackend::OpenCL
} else {
GpuBackend::Vulkan
}
}
pub fn is_cuda_available() -> bool {
std::env::var("CUDA_VISIBLE_DEVICES").is_ok()
|| std::path::Path::new("/usr/local/cuda").exists()
}
pub fn is_rocm_available() -> bool {
std::env::var("ROCM_PATH").is_ok() || std::path::Path::new("/opt/rocm").exists()
}
pub fn is_oneapi_available() -> bool {
std::env::var("ONEAPI_ROOT").is_ok() || std::path::Path::new("/opt/intel/oneapi").exists()
}
pub fn is_opencl_available() -> bool {
std::path::Path::new("/usr/lib/libOpenCL.so").exists()
|| std::path::Path::new("/System/Library/Frameworks/OpenCL.framework").exists()
}
}
#[derive(Debug, Clone)]
pub enum MemoryOptimization {
Conservative,
Balanced,
Aggressive,
}
#[derive(Debug, Clone)]
pub enum KernelOptimization {
Basic,
Moderate,
Aggressive,
}
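/// Vocabulary tables laid out for GPU upload: an open-addressed hash map for
/// token -> id lookups and a dense array for id -> token lookups.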
#[derive(Debug, Clone)]
pub struct GpuVocabularyCache {
pub token_to_id: GpuHashMap<String, u32>,
pub id_to_token: GpuArray<String>,
pub special_tokens: GpuHashMap<String, u32>,
pub vocab_size: usize,
}
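/// Flat open-addressed hash table (linear probing) intended to be copied to
/// device memory as-is.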
#[derive(Debug, Clone)]
pub struct GpuHashMap<K, V> {
pub data: Vec<Option<(K, V)>>,
pub size: usize,
pub hash_params: HashParams,
}
#[derive(Debug, Clone)]
pub struct GpuArray<T> {
pub data: Vec<T>,
pub size: usize,
}
#[derive(Debug, Clone)]
pub struct HashParams {
pub hash_type: HashType,
pub seed: u64,
pub load_factor: f32,
}
#[derive(Debug, Clone)]
pub enum HashType {
Fnv1a,
MurmurHash3,
CityHash,
XXHash,
}
#[derive(Debug, Clone)]
pub struct GpuMemoryPool {
pub blocks: Vec<MemoryBlock>,
pub capacity: usize,
pub used: usize,
pub strategy: AllocationStrategy,
}
#[derive(Debug, Clone)]
pub struct MemoryBlock {
pub address: u64,
pub size: usize,
pub is_free: bool,
pub block_type: BlockType,
}
#[derive(Debug, Clone)]
pub enum BlockType {
Input,
Output,
Vocabulary,
Temporary,
}
#[derive(Debug, Clone)]
pub enum AllocationStrategy {
FirstFit,
BestFit,
Buddy,
Pool,
}
#[derive(Debug, Clone)]
pub struct GpuKernel {
pub name: String,
pub function: u64,
pub grid_dim: (u32, u32, u32),
pub block_dim: (u32, u32, u32),
pub shared_mem_size: usize,
pub params: Vec<KernelParam>,
}
#[derive(Debug, Clone)]
pub struct KernelParam {
pub name: String,
pub param_type: ParamType,
pub size: usize,
}
#[derive(Debug, Clone)]
pub enum ParamType {
Int,
Float,
Pointer,
Array,
}
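/// Controls how incoming texts are grouped into GPU batches, padded, and
/// optionally packed into contiguous sequences.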
#[derive(Debug, Clone)]
pub struct BatchProcessingConfig {
pub dynamic_batching: bool,
pub max_batch_size: usize,
pub batch_timeout_ms: u64,
pub padding_strategy: PaddingStrategy,
pub sequence_packing: bool,
}
#[derive(Debug, Clone)]
pub enum PaddingStrategy {
Longest,
Fixed(usize),
NextPowerOf2,
None,
}
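/// Output of a batch tokenization call, including simple timing and memory
/// accounting for benchmarking.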
#[derive(Debug, Clone)]
pub struct GpuTokenizationResult {
pub token_ids: Vec<Vec<u32>>,
pub attention_masks: Option<Vec<Vec<u8>>>,
pub token_type_ids: Option<Vec<Vec<u32>>>,
pub processing_time_us: u64,
pub memory_usage_bytes: usize,
pub batch_size: usize,
}
#[derive(Debug, Clone)]
pub struct GpuTokenizationStats {
pub total_tokens: u64,
pub total_batches: u64,
pub avg_time_per_token_us: f64,
pub avg_time_per_batch_us: f64,
pub memory_utilization: f32,
pub gpu_utilization: f32,
pub throughput_tokens_per_sec: f64,
}
impl GpuTokenizer {
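/// Creates a GPU tokenizer with the default configuration, auto-detecting
/// the best available backend.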
pub fn new(tokenizer: Arc<dyn Tokenizer>) -> Result<Self, GpuTokenizerError> {
let config = GpuTokenizerConfig::default();
Self::with_config(tokenizer, config)
}
pub fn with_config(
tokenizer: Arc<dyn Tokenizer>,
config: GpuTokenizerConfig,
) -> Result<Self, GpuTokenizerError> {
let gpu_context = if config.enable_gpu {
Some(Self::initialize_gpu_context(&config)?)
} else {
None
};
let vocab_cache = if config.enable_gpu {
Some(Self::build_vocabulary_cache(&tokenizer, &config)?)
} else {
None
};
let batch_config = BatchProcessingConfig {
dynamic_batching: true,
max_batch_size: config.batch_size,
batch_timeout_ms: 10,
padding_strategy: PaddingStrategy::Longest,
sequence_packing: true,
};
Ok(Self {
tokenizer,
gpu_context,
config,
vocab_cache,
batch_config,
})
}
fn initialize_gpu_context(
config: &GpuTokenizerConfig,
) -> Result<GpuContext, GpuTokenizerError> {
let backend = config.backend.clone();
let device_id = config.device_id;
let (compute_capability, memory_bandwidth) = Self::initialize_backend(&backend, device_id)?;
let stream = if config.enable_async { Some(Self::create_stream(&backend)?) } else { None };
let memory_pool = Some(GpuMemoryPool {
blocks: Vec::new(),
capacity: Self::get_optimal_memory_size(&backend, device_id),
used: 0,
strategy: Self::get_optimal_allocation_strategy(&backend),
});
let mut kernel_cache = HashMap::new();
Self::load_tokenization_kernels(&mut kernel_cache, &backend, config)?;
Ok(GpuContext {
device_id,
backend,
stream,
memory_pool,
kernel_cache,
compute_capability,
memory_bandwidth,
})
}
fn initialize_backend(
backend: &GpuBackend,
device_id: u32,
) -> Result<(Option<(u32, u32)>, Option<f32>), GpuTokenizerError> {
match backend {
GpuBackend::Cuda => {
Self::initialize_cuda(device_id)
},
GpuBackend::Rocm => {
Self::initialize_rocm(device_id)
},
GpuBackend::OneApi => {
Self::initialize_oneapi(device_id)
},
GpuBackend::OpenCL => {
Self::initialize_opencl(device_id)
},
GpuBackend::Vulkan => {
Self::initialize_vulkan(device_id)
},
}
}
fn initialize_cuda(
_device_id: u32,
) -> Result<(Option<(u32, u32)>, Option<f32>), GpuTokenizerError> {
// Placeholder values pending a real device query: compute capability 8.6
// and ~936 GB/s correspond to an RTX 3090-class Ampere GPU.
let compute_capability = Some((8, 6));
let memory_bandwidth = Some(936.0);
Ok((compute_capability, memory_bandwidth))
}
fn initialize_rocm(
_device_id: u32,
) -> Result<(Option<(u32, u32)>, Option<f32>), GpuTokenizerError> {
// ROCm exposes no CUDA-style compute capability; the bandwidth figure is a
// placeholder for an HBM-class accelerator.
let compute_capability = None;
let memory_bandwidth = Some(1638.0);
Ok((compute_capability, memory_bandwidth))
}
fn initialize_oneapi(
_device_id: u32,
) -> Result<(Option<(u32, u32)>, Option<f32>), GpuTokenizerError> {
let compute_capability = None;
let memory_bandwidth = Some(560.0); // placeholder; no device query yet
Ok((compute_capability, memory_bandwidth))
}
fn initialize_opencl(
_device_id: u32,
) -> Result<(Option<(u32, u32)>, Option<f32>), GpuTokenizerError> {
let compute_capability = None;
let memory_bandwidth = Some(500.0);
Ok((compute_capability, memory_bandwidth))
}
fn initialize_vulkan(
_device_id: u32,
) -> Result<(Option<(u32, u32)>, Option<f32>), GpuTokenizerError> {
let compute_capability = None;
let memory_bandwidth = Some(400.0);
Ok((compute_capability, memory_bandwidth))
}
fn get_optimal_memory_size(backend: &GpuBackend, _device_id: u32) -> usize {
match backend {
// Placeholder pool sizes; a real implementation would query free device memory.
GpuBackend::Cuda => 2 * 1024 * 1024 * 1024, // 2 GiB
GpuBackend::Rocm => 4 * 1024 * 1024 * 1024, // 4 GiB
GpuBackend::OneApi => 1024 * 1024 * 1024, // 1 GiB
_ => 1024 * 1024 * 1024, // 1 GiB
}
}
fn get_optimal_allocation_strategy(backend: &GpuBackend) -> AllocationStrategy {
match backend {
GpuBackend::Cuda => AllocationStrategy::Pool,
GpuBackend::Rocm => AllocationStrategy::Buddy,
GpuBackend::OneApi => AllocationStrategy::FirstFit,
_ => AllocationStrategy::FirstFit,
}
}
fn create_stream(backend: &GpuBackend) -> Result<u64, GpuTokenizerError> {
match backend {
GpuBackend::Cuda => Self::create_cuda_stream(),
GpuBackend::Rocm => Self::create_hip_stream(),
GpuBackend::OneApi => Self::create_sycl_queue(),
GpuBackend::OpenCL => Self::create_opencl_queue(),
GpuBackend::Vulkan => Self::create_vulkan_queue(),
}
}
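// The stream/queue constructors below return fixed opaque handles; they stand
// in for real driver calls (cudaStreamCreate, hipStreamCreate, ...) until
// backend bindings are wired up.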
fn create_cuda_stream() -> Result<u64, GpuTokenizerError> {
Ok(0x1234567890ABCDEF)
}
fn create_hip_stream() -> Result<u64, GpuTokenizerError> {
Ok(0x2345678901BCDEF0)
}
fn create_sycl_queue() -> Result<u64, GpuTokenizerError> {
Ok(0x3456789012CDEF01)
}
fn create_opencl_queue() -> Result<u64, GpuTokenizerError> {
Ok(0x456789013DEF012A)
}
fn create_vulkan_queue() -> Result<u64, GpuTokenizerError> {
Ok(0x56789014EF012AB3)
}
fn load_tokenization_kernels(
kernel_cache: &mut HashMap<String, GpuKernel>,
backend: &GpuBackend,
config: &GpuTokenizerConfig,
) -> Result<(), GpuTokenizerError> {
Self::load_bpe_kernels(kernel_cache, backend, config)?;
Self::load_wordpiece_kernels(kernel_cache, backend, config)?;
Self::load_vocab_kernels(kernel_cache, backend, config)?;
Self::load_utility_kernels(kernel_cache, backend, config)?;
Ok(())
}
fn load_bpe_kernels(
kernel_cache: &mut HashMap<String, GpuKernel>,
backend: &GpuBackend,
config: &GpuTokenizerConfig,
) -> Result<(), GpuTokenizerError> {
let (grid_dim, block_dim, shared_mem) =
Self::get_optimal_kernel_params(backend, "bpe_tokenize");
kernel_cache.insert(
"bpe_tokenize".to_string(),
GpuKernel {
name: "bpe_tokenize".to_string(),
function: Self::get_kernel_function_ptr(backend, "bpe_tokenize")?,
grid_dim,
block_dim,
shared_mem_size: shared_mem,
params: vec![
KernelParam {
name: "input_text".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "output_tokens".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "vocab_table".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "merge_table".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "sequence_length".to_string(),
param_type: ParamType::Int,
size: 4,
},
KernelParam {
name: "vocab_size".to_string(),
param_type: ParamType::Int,
size: 4,
},
],
},
);
if matches!(backend, GpuBackend::Cuda) && config.enable_tensor_cores {
let (tc_grid, tc_block, tc_shared) = Self::get_tensor_core_params();
kernel_cache.insert(
"bpe_merge_tensorcore".to_string(),
GpuKernel {
name: "bpe_merge_tensorcore".to_string(),
function: Self::get_kernel_function_ptr(backend, "bpe_merge_tensorcore")?,
grid_dim: tc_grid,
block_dim: tc_block,
shared_mem_size: tc_shared,
params: vec![
KernelParam {
name: "token_embeddings".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "merge_scores".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "output_merges".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
],
},
);
}
Ok(())
}
fn load_wordpiece_kernels(
kernel_cache: &mut HashMap<String, GpuKernel>,
backend: &GpuBackend,
_config: &GpuTokenizerConfig,
) -> Result<(), GpuTokenizerError> {
let (grid_dim, block_dim, shared_mem) =
Self::get_optimal_kernel_params(backend, "wordpiece_tokenize");
kernel_cache.insert(
"wordpiece_tokenize".to_string(),
GpuKernel {
name: "wordpiece_tokenize".to_string(),
function: Self::get_kernel_function_ptr(backend, "wordpiece_tokenize")?,
grid_dim,
block_dim,
shared_mem_size: shared_mem,
params: vec![
KernelParam {
name: "input_text".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "output_tokens".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "vocab_table".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "prefix_scores".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
],
},
);
Ok(())
}
fn load_vocab_kernels(
kernel_cache: &mut HashMap<String, GpuKernel>,
backend: &GpuBackend,
_config: &GpuTokenizerConfig,
) -> Result<(), GpuTokenizerError> {
let (grid_dim, block_dim, shared_mem) =
Self::get_optimal_kernel_params(backend, "vocab_lookup");
kernel_cache.insert(
"vocab_lookup".to_string(),
GpuKernel {
name: "vocab_lookup".to_string(),
function: Self::get_kernel_function_ptr(backend, "vocab_lookup")?,
grid_dim,
block_dim,
shared_mem_size: shared_mem,
params: vec![
KernelParam {
name: "tokens".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "token_ids".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "vocab_table".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
],
},
);
kernel_cache.insert(
"vocab_hash_lookup".to_string(),
GpuKernel {
name: "vocab_hash_lookup".to_string(),
function: Self::get_kernel_function_ptr(backend, "vocab_hash_lookup")?,
grid_dim: (1024, 1, 1),
block_dim: (256, 1, 1),
shared_mem_size: 2048,
params: vec![
KernelParam {
name: "tokens".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "token_ids".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "hash_table".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "hash_params".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
],
},
);
Ok(())
}
fn load_utility_kernels(
kernel_cache: &mut HashMap<String, GpuKernel>,
backend: &GpuBackend,
_config: &GpuTokenizerConfig,
) -> Result<(), GpuTokenizerError> {
kernel_cache.insert(
"padding".to_string(),
GpuKernel {
name: "padding".to_string(),
function: Self::get_kernel_function_ptr(backend, "padding")?,
grid_dim: (128, 1, 1),
block_dim: (128, 1, 1),
shared_mem_size: 256,
params: vec![
KernelParam {
name: "input_sequences".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "output_sequences".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "target_length".to_string(),
param_type: ParamType::Int,
size: 4,
},
],
},
);
kernel_cache.insert(
"text_normalize".to_string(),
GpuKernel {
name: "text_normalize".to_string(),
function: Self::get_kernel_function_ptr(backend, "text_normalize")?,
grid_dim: (512, 1, 1),
block_dim: (256, 1, 1),
shared_mem_size: 1024,
params: vec![
KernelParam {
name: "input_text".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "normalized_text".to_string(),
param_type: ParamType::Pointer,
size: 8,
},
KernelParam {
name: "normalization_flags".to_string(),
param_type: ParamType::Int,
size: 4,
},
],
},
);
Ok(())
}
fn get_optimal_kernel_params(
backend: &GpuBackend,
kernel_name: &str,
) -> ((u32, u32, u32), (u32, u32, u32), usize) {
match backend {
GpuBackend::Cuda => match kernel_name {
"bpe_tokenize" => ((256, 1, 1), (256, 1, 1), 2048),
"wordpiece_tokenize" => ((256, 1, 1), (256, 1, 1), 3072),
"vocab_lookup" => ((512, 1, 1), (512, 1, 1), 1024),
_ => ((128, 1, 1), (128, 1, 1), 512),
},
GpuBackend::Rocm => {
match kernel_name {
"bpe_tokenize" => ((128, 1, 1), (512, 1, 1), 4096),
"wordpiece_tokenize" => ((128, 1, 1), (512, 1, 1), 4096),
"vocab_lookup" => ((256, 1, 1), (512, 1, 1), 2048),
_ => ((64, 1, 1), (256, 1, 1), 1024),
}
},
GpuBackend::OneApi => {
match kernel_name {
"bpe_tokenize" => ((128, 1, 1), (128, 1, 1), 1024),
"wordpiece_tokenize" => ((128, 1, 1), (128, 1, 1), 1024),
"vocab_lookup" => ((256, 1, 1), (256, 1, 1), 512),
_ => ((64, 1, 1), (64, 1, 1), 256),
}
},
_ => ((64, 1, 1), (64, 1, 1), 256),
}
}
fn get_tensor_core_params() -> ((u32, u32, u32), (u32, u32, u32), usize) {
// Warp-aligned launch shape (32 x 8 threads) with 16 KiB of shared memory
// for tensor-core tiles.
((128, 1, 1), (32, 8, 1), 16384)
}
fn get_kernel_function_ptr(
backend: &GpuBackend,
kernel_name: &str,
) -> Result<u64, GpuTokenizerError> {
match backend {
GpuBackend::Cuda => {
match kernel_name {
"bpe_tokenize" => Ok(0x1000),
"bpe_merge_tensorcore" => Ok(0x1100),
"wordpiece_tokenize" => Ok(0x2000),
"vocab_lookup" => Ok(0x3000),
"vocab_hash_lookup" => Ok(0x3100),
"padding" => Ok(0x4000),
"text_normalize" => Ok(0x5000),
_ => Err(GpuTokenizerError::KernelNotFound(kernel_name.to_string())),
}
},
GpuBackend::Rocm => {
match kernel_name {
"bpe_tokenize" => Ok(0x2000),
"wordpiece_tokenize" => Ok(0x2100),
"vocab_lookup" => Ok(0x2200),
"vocab_hash_lookup" => Ok(0x2300),
"padding" => Ok(0x2400),
"text_normalize" => Ok(0x2500),
_ => Err(GpuTokenizerError::KernelNotFound(kernel_name.to_string())),
}
},
GpuBackend::OneApi => {
match kernel_name {
"bpe_tokenize" => Ok(0x3000),
"wordpiece_tokenize" => Ok(0x3100),
"vocab_lookup" => Ok(0x3200),
"vocab_hash_lookup" => Ok(0x3300),
"padding" => Ok(0x3400),
"text_normalize" => Ok(0x3500),
_ => Err(GpuTokenizerError::KernelNotFound(kernel_name.to_string())),
}
},
GpuBackend::OpenCL => {
match kernel_name {
"bpe_tokenize" => Ok(0x4000),
"wordpiece_tokenize" => Ok(0x4100),
"vocab_lookup" => Ok(0x4200),
"vocab_hash_lookup" => Ok(0x4300),
"padding" => Ok(0x4400),
"text_normalize" => Ok(0x4500),
_ => Err(GpuTokenizerError::KernelNotFound(kernel_name.to_string())),
}
},
GpuBackend::Vulkan => {
match kernel_name {
"bpe_tokenize" => Ok(0x5000),
"wordpiece_tokenize" => Ok(0x5100),
"vocab_lookup" => Ok(0x5200),
"vocab_hash_lookup" => Ok(0x5300),
"padding" => Ok(0x5400),
"text_normalize" => Ok(0x5500),
_ => Err(GpuTokenizerError::KernelNotFound(kernel_name.to_string())),
}
},
}
}
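/// Builds the GPU-resident vocabulary cache: inserts every token into an
/// open-addressed table via FNV-1a hashing with linear probing, and mirrors
/// the id -> token mapping into a dense array.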
fn build_vocabulary_cache(
tokenizer: &Arc<dyn Tokenizer>,
_config: &GpuTokenizerConfig,
) -> Result<GpuVocabularyCache, GpuTokenizerError> {
let vocab_size = tokenizer.vocab_size();
// Size the open-addressed table at twice the vocabulary (load factor 0.5)
// so the linear-probing insertion below always finds a free slot.
let mut token_to_id_data = vec![None; vocab_size * 2];
let hash_params = HashParams {
hash_type: HashType::Fnv1a,
seed: 0x517cc1b727220a95,
load_factor: 0.75,
};
for i in 0..vocab_size {
if let Some(token) = tokenizer.id_to_token(i as u32) {
let hash = Self::compute_hash(&token, &hash_params);
let mut index = hash % token_to_id_data.len();
while token_to_id_data[index].is_some() {
index = (index + 1) % token_to_id_data.len();
}
token_to_id_data[index] = Some((token.clone(), i as u32));
}
}
let token_to_id = GpuHashMap {
data: token_to_id_data,
size: vocab_size * 2,
hash_params: hash_params.clone(),
};
let mut id_to_token_data = Vec::with_capacity(vocab_size);
for i in 0..vocab_size {
let token = tokenizer.id_to_token(i as u32).unwrap_or_else(|| format!("<unk_{}>", i));
id_to_token_data.push(token);
}
let id_to_token = GpuArray {
data: id_to_token_data,
size: vocab_size,
};
let special_tokens = GpuHashMap {
data: vec![None; 128], // fixed 128-slot table; special tokens not yet populated
size: 128,
hash_params: hash_params.clone(),
};
Ok(GpuVocabularyCache {
token_to_id,
id_to_token,
special_tokens,
vocab_size,
})
}
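/// Hashes a token with the configured algorithm. Only FNV-1a is implemented;
/// the remaining hash types currently fall back to a naive character sum and
/// should not be relied on for distribution quality.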
fn compute_hash(token: &str, params: &HashParams) -> usize {
match params.hash_type {
HashType::Fnv1a => {
let mut hash = params.seed;
for byte in token.bytes() {
hash ^= byte as u64;
hash = hash.wrapping_mul(0x100000001b3);
}
hash as usize
},
_ => {
token.chars().map(|c| c as usize).sum()
},
}
}
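/// Tokenizes a batch of texts, dispatching to the GPU path when enabled and
/// initialized, and to the wrapped CPU tokenizer otherwise.
///
/// A minimal usage sketch (the surrounding setup is assumed; `tokenizer` is
/// any `Arc<dyn Tokenizer>`):
///
/// ```ignore
/// use std::sync::Arc;
///
/// let gpu_tokenizer = GpuTokenizer::new(tokenizer)?;
/// let texts = vec!["Hello world".to_string(), "Another input".to_string()];
/// let result = gpu_tokenizer.tokenize_batch(&texts)?;
/// println!(
///     "{} sequences tokenized in {} us",
///     result.batch_size, result.processing_time_us
/// );
/// ```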
pub fn tokenize_batch(
&self,
texts: &[String],
) -> Result<GpuTokenizationResult, GpuTokenizerError> {
let start_time = std::time::Instant::now();
if !self.config.enable_gpu || self.gpu_context.is_none() {
return self.tokenize_batch_cpu(texts);
}
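// Note: only the first `config.batch_size` texts are processed in this path;
// callers should chunk larger inputs themselves.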
let batch_size = texts.len().min(self.config.batch_size);
let mut token_ids = Vec::with_capacity(batch_size);
let mut attention_masks = Vec::with_capacity(batch_size);
for text in texts.iter().take(batch_size) {
let (tokens, mask) = self.tokenize_single_gpu(text)?;
token_ids.push(tokens);
attention_masks.push(mask);
}
if matches!(self.batch_config.padding_strategy, PaddingStrategy::Longest) {
self.apply_padding(&mut token_ids, &mut attention_masks)?;
}
let processing_time = start_time.elapsed().as_micros() as u64;
let memory_usage = self.estimate_memory_usage(&token_ids);
Ok(GpuTokenizationResult {
token_ids,
attention_masks: Some(attention_masks),
token_type_ids: None,
processing_time_us: processing_time,
memory_usage_bytes: memory_usage,
batch_size,
})
}
fn tokenize_single_gpu(&self, text: &str) -> Result<(Vec<u32>, Vec<u8>), GpuTokenizerError> {
let gpu_context = self.gpu_context.as_ref().ok_or_else(|| {
GpuTokenizerError::GpuInitializationError("GPU context not initialized".to_string())
})?;
let kernel = gpu_context
.kernel_cache
.get("bpe_tokenize")
.ok_or_else(|| GpuTokenizerError::KernelNotFound("bpe_tokenize".to_string()))?;
let input_buffer = self.allocate_gpu_memory(text.len())?;
let output_buffer = self.allocate_gpu_memory(self.config.max_sequence_length * 4)?;
self.copy_to_gpu(text.as_bytes(), input_buffer)?;
self.launch_kernel(kernel, &[input_buffer, output_buffer])?;
let mut token_ids = vec![0u32; self.config.max_sequence_length];
self.copy_from_gpu(output_buffer, &mut token_ids)?;
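// The simulated copy above zero-fills past the real tokens, so token id 0
// is treated as an end-of-sequence sentinel here.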
let actual_length = token_ids.iter().position(|&x| x == 0).unwrap_or(token_ids.len());
token_ids.truncate(actual_length);
let attention_mask = vec![1u8; token_ids.len()];
self.free_gpu_memory(input_buffer)?;
self.free_gpu_memory(output_buffer)?;
Ok((token_ids, attention_mask))
}
fn tokenize_batch_cpu(
&self,
texts: &[String],
) -> Result<GpuTokenizationResult, GpuTokenizerError> {
let start_time = std::time::Instant::now();
let mut token_ids = Vec::new();
let mut attention_masks = Vec::new();
for text in texts {
let result = self
.tokenizer
.encode(text)
.map_err(|e| GpuTokenizerError::TokenizationError(e.to_string()))?;
token_ids.push(result.input_ids);
attention_masks.push(result.attention_mask);
}
let processing_time = start_time.elapsed().as_micros() as u64;
let memory_usage = self.estimate_memory_usage(&token_ids);
Ok(GpuTokenizationResult {
token_ids,
attention_masks: Some(attention_masks),
token_type_ids: None,
processing_time_us: processing_time,
memory_usage_bytes: memory_usage,
batch_size: texts.len(),
})
}
fn apply_padding(
&self,
token_ids: &mut [Vec<u32>],
attention_masks: &mut [Vec<u8>],
) -> Result<(), GpuTokenizerError> {
if token_ids.is_empty() {
return Ok(());
}
let max_length = token_ids.iter().map(|seq| seq.len()).max().unwrap_or(0);
for (tokens, mask) in token_ids.iter_mut().zip(attention_masks.iter_mut()) {
let current_length = tokens.len();
if current_length < max_length {
tokens.resize(max_length, 0); // pad with token id 0
mask.resize(max_length, 0); // padded positions get attention weight 0
}
}
Ok(())
}
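// The helpers below simulate device memory management and kernel launches:
// addresses are synthetic, copies are no-ops, and `copy_from_gpu` fabricates
// a short token sequence for testing.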
fn allocate_gpu_memory(&self, size: usize) -> Result<u64, GpuTokenizerError> {
Ok(0x1000000 + size as u64)
}
fn copy_to_gpu(&self, _data: &[u8], _gpu_ptr: u64) -> Result<(), GpuTokenizerError> {
Ok(())
}
fn copy_from_gpu(&self, _gpu_ptr: u64, data: &mut [u32]) -> Result<(), GpuTokenizerError> {
for (i, token) in data.iter_mut().enumerate() {
*token = if i < 10 { (i + 1) as u32 } else { 0 };
}
Ok(())
}
fn launch_kernel(&self, _kernel: &GpuKernel, _params: &[u64]) -> Result<(), GpuTokenizerError> {
Ok(())
}
fn free_gpu_memory(&self, _gpu_ptr: u64) -> Result<(), GpuTokenizerError> {
Ok(())
}
fn estimate_memory_usage(&self, token_ids: &[Vec<u32>]) -> usize {
token_ids.iter().map(|seq| seq.len() * 4).sum()
}
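/// Returns aggregate tokenization statistics. Stat collection is not yet
/// wired up, so all fields are currently zero.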
pub fn get_stats(&self) -> GpuTokenizationStats {
GpuTokenizationStats {
total_tokens: 0,
total_batches: 0,
avg_time_per_token_us: 0.0,
avg_time_per_batch_us: 0.0,
memory_utilization: 0.0,
gpu_utilization: 0.0,
throughput_tokens_per_sec: 0.0,
}
}
pub fn set_gpu_enabled(&mut self, enabled: bool) {
self.config.enable_gpu = enabled;
}
pub fn set_batch_size(&mut self, batch_size: usize) {
self.config.batch_size = batch_size;
self.batch_config.max_batch_size = batch_size;
}
pub fn set_device_id(&mut self, device_id: u32) {
self.config.device_id = device_id;
}
}
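/// Errors produced by GPU tokenization, covering initialization, memory,
/// kernel, and configuration failures.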
#[derive(Debug, Clone)]
pub enum GpuTokenizerError {
GpuInitializationError(String),
MemoryAllocationError(String),
KernelLaunchError(String),
KernelNotFound(String),
TokenizationError(String),
ConfigurationError(String),
CudaError(String),
InvalidDevice(u32),
OutOfMemory,
}
impl std::fmt::Display for GpuTokenizerError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
GpuTokenizerError::GpuInitializationError(msg) => {
write!(f, "GPU initialization error: {}", msg)
},
GpuTokenizerError::MemoryAllocationError(msg) => {
write!(f, "Memory allocation error: {}", msg)
},
GpuTokenizerError::KernelLaunchError(msg) => {
write!(f, "Kernel launch error: {}", msg)
},
GpuTokenizerError::KernelNotFound(name) => {
write!(f, "Kernel not found: {}", name)
},
GpuTokenizerError::TokenizationError(msg) => {
write!(f, "Tokenization error: {}", msg)
},
GpuTokenizerError::ConfigurationError(msg) => {
write!(f, "Configuration error: {}", msg)
},
GpuTokenizerError::CudaError(msg) => {
write!(f, "CUDA error: {}", msg)
},
GpuTokenizerError::InvalidDevice(id) => {
write!(f, "Invalid device: {}", id)
},
GpuTokenizerError::OutOfMemory => {
write!(f, "Out of memory")
},
}
}
}
impl std::error::Error for GpuTokenizerError {}
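/// Benchmarks a set of `GpuTokenizerConfig`s against shared test texts and
/// records per-config throughput and memory usage.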
pub struct GpuTokenizationBenchmark {
pub configs: Vec<GpuTokenizerConfig>,
pub test_texts: Vec<String>,
pub results: Vec<BenchmarkResult>,
}
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
pub config: GpuTokenizerConfig,
pub processing_time_us: u64,
pub throughput_tokens_per_sec: f64,
pub memory_usage_bytes: usize,
pub gpu_utilization: f32,
}
impl Default for GpuTokenizationBenchmark {
fn default() -> Self {
Self::new()
}
}
impl GpuTokenizationBenchmark {
pub fn new() -> Self {
Self {
configs: vec![
GpuTokenizerConfig::default(),
GpuTokenizerConfig {
batch_size: 64,
..Default::default()
},
GpuTokenizerConfig {
batch_size: 128,
memory_optimization: MemoryOptimization::Aggressive,
..Default::default()
},
],
test_texts: vec![
"Hello world".to_string(),
"This is a longer text for testing tokenization performance".to_string(),
"The quick brown fox jumps over the lazy dog".repeat(10),
],
results: Vec::new(),
}
}
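/// Runs every configured benchmark, measuring wall-clock tokenization time
/// and deriving tokens-per-second throughput for each config.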
pub fn run(&mut self, tokenizer: Arc<dyn Tokenizer>) -> Result<(), GpuTokenizerError> {
for config in &self.configs {
let gpu_tokenizer = GpuTokenizer::with_config(tokenizer.clone(), config.clone())?;
let start_time = std::time::Instant::now();
let result = gpu_tokenizer.tokenize_batch(&self.test_texts)?;
let processing_time = start_time.elapsed().as_micros() as u64;
let total_tokens: usize = result.token_ids.iter().map(|seq| seq.len()).sum();
let throughput = total_tokens as f64 / (processing_time as f64 / 1_000_000.0);
self.results.push(BenchmarkResult {
config: config.clone(),
processing_time_us: processing_time,
throughput_tokens_per_sec: throughput,
memory_usage_bytes: result.memory_usage_bytes,
gpu_utilization: 0.0, // not yet measured; placeholder
});
}
Ok(())
}
pub fn get_best_config(&self) -> Option<&GpuTokenizerConfig> {
self.results
.iter()
.max_by(|a, b| {
a.throughput_tokens_per_sec
.partial_cmp(&b.throughput_tokens_per_sec)
.unwrap_or(std::cmp::Ordering::Equal)
})
.map(|result| &result.config)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::bpe::BPETokenizer;
use std::sync::Arc;
fn create_test_tokenizer() -> Arc<dyn Tokenizer> {
let mut vocab = std::collections::HashMap::new();
vocab.insert("hello".to_string(), 0);
vocab.insert("world".to_string(), 1);
vocab.insert("<unk>".to_string(), 2);
let merges = vec![
("h".to_string(), "e".to_string()),
("l".to_string(), "l".to_string()),
];
let tokenizer = BPETokenizer::new(vocab, merges);
Arc::new(tokenizer)
}
#[test]
fn test_gpu_tokenizer_creation() {
let tokenizer = create_test_tokenizer();
let gpu_tokenizer = GpuTokenizer::new(tokenizer);
assert!(gpu_tokenizer.is_ok());
}
#[test]
fn test_gpu_tokenizer_config() {
let config = GpuTokenizerConfig::default();
assert_eq!(config.batch_size, 32);
assert_eq!(config.max_sequence_length, 512);
assert!(config.enable_gpu);
}
#[test]
fn test_gpu_context_initialization() {
let config = GpuTokenizerConfig::default();
let result = GpuTokenizer::initialize_gpu_context(&config);
assert!(result.is_ok());
}
#[test]
fn test_vocabulary_cache_building() {
let tokenizer = create_test_tokenizer();
let config = GpuTokenizerConfig::default();
let result = GpuTokenizer::build_vocabulary_cache(&tokenizer, &config);
assert!(result.is_ok());
}
#[test]
fn test_hash_computation() {
let params = HashParams {
hash_type: HashType::Fnv1a,
seed: 0x517cc1b727220a95,
load_factor: 0.75,
};
let hash1 = GpuTokenizer::compute_hash("hello", &params);
let hash2 = GpuTokenizer::compute_hash("world", &params);
assert_ne!(hash1, hash2);
}
#[test]
fn test_batch_tokenization() {
let tokenizer = create_test_tokenizer();
let gpu_tokenizer = GpuTokenizer::new(tokenizer).expect("Construction failed");
let texts = vec!["Hello world".to_string(), "This is a test".to_string()];
let result = gpu_tokenizer.tokenize_batch(&texts);
assert!(result.is_ok());
}
#[test]
fn test_padding_application() {
let tokenizer = create_test_tokenizer();
let gpu_tokenizer = GpuTokenizer::new(tokenizer).expect("Construction failed");
let mut token_ids = vec![vec![1, 2, 3], vec![4, 5]];
let mut attention_masks = vec![vec![1, 1, 1], vec![1, 1]];
gpu_tokenizer
.apply_padding(&mut token_ids, &mut attention_masks)
.expect("Operation failed in test");
assert_eq!(token_ids[0].len(), 3);
assert_eq!(token_ids[1].len(), 3);
assert_eq!(token_ids[1][2], 0); // padding token appended
assert_eq!(attention_masks[1][2], 0); // mask marks the padded position
}
#[test]
fn test_gpu_tokenization_stats() {
let tokenizer = create_test_tokenizer();
let gpu_tokenizer = GpuTokenizer::new(tokenizer).expect("Construction failed");
let stats = gpu_tokenizer.get_stats();
assert_eq!(stats.total_tokens, 0);
assert_eq!(stats.total_batches, 0);
}
#[test]
fn test_gpu_tokenizer_configuration() {
let tokenizer = create_test_tokenizer();
let mut gpu_tokenizer = GpuTokenizer::new(tokenizer).expect("Construction failed");
gpu_tokenizer.set_batch_size(64);
gpu_tokenizer.set_device_id(1);
gpu_tokenizer.set_gpu_enabled(false);
assert_eq!(gpu_tokenizer.config.batch_size, 64);
assert_eq!(gpu_tokenizer.config.device_id, 1);
assert!(!gpu_tokenizer.config.enable_gpu);
}
#[test]
fn test_benchmark_creation() {
let benchmark = GpuTokenizationBenchmark::new();
assert_eq!(benchmark.configs.len(), 3);
assert_eq!(benchmark.test_texts.len(), 3);
assert_eq!(benchmark.results.len(), 0);
}
#[test]
fn test_memory_optimization_levels() {
let conservative = MemoryOptimization::Conservative;
let balanced = MemoryOptimization::Balanced;
let aggressive = MemoryOptimization::Aggressive;
assert!(matches!(conservative, MemoryOptimization::Conservative));
assert!(matches!(balanced, MemoryOptimization::Balanced));
assert!(matches!(aggressive, MemoryOptimization::Aggressive));
}
#[test]
fn test_kernel_optimization_levels() {
let basic = KernelOptimization::Basic;
let moderate = KernelOptimization::Moderate;
let aggressive = KernelOptimization::Aggressive;
assert!(matches!(basic, KernelOptimization::Basic));
assert!(matches!(moderate, KernelOptimization::Moderate));
assert!(matches!(aggressive, KernelOptimization::Aggressive));
}
}