realizar 0.8.5

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
//! Batch Scheduling & Async Processing (PMAT-802)
//!
//! M25: Token Batching & Speculative Decoding
//! M26: Async I/O & Event-Driven Processing
//! M27: Request Scheduling & Resource Management

// =============================================================================
// M25: Token Batching & Speculative Decoding (Phase 16)
// =============================================================================

/// Token batch accumulator for batched processing (M25 - IMP-058)
///
/// Accumulates tokens until the batch is full, then hands them back for
/// processing. Improves throughput by processing multiple tokens together.
///
/// A batch counts as full once it holds at least `capacity` tokens, so a
/// batch created with capacity 0 is always full and every `push` immediately
/// returns the single token that was pushed.
#[derive(Debug)]
pub struct TokenBatch {
    /// Tokens accumulated since the last flush
    tokens: Vec<usize>,
    /// Number of tokens at which the batch is considered full
    capacity: usize,
}

impl TokenBatch {
    /// Create a new token batch with given capacity
    ///
    /// The internal buffer is pre-sized to `capacity` so filling a batch
    /// does not reallocate.
    #[must_use]
    pub fn new(capacity: usize) -> Self {
        Self {
            tokens: Vec::with_capacity(capacity),
            capacity,
        }
    }

    /// Get the batch capacity
    #[must_use]
    pub fn capacity(&self) -> usize {
        self.capacity
    }

    /// Get current number of tokens in batch
    #[must_use]
    pub fn len(&self) -> usize {
        self.tokens.len()
    }

    /// Check if batch is empty
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }

    /// Check if batch is full
    ///
    /// Returns `true` when the number of held tokens is at least the capacity.
    #[must_use]
    pub fn is_full(&self) -> bool {
        self.tokens.len() >= self.capacity
    }

    /// Push a token to the batch
    ///
    /// Returns `Some(tokens)` when batch becomes full, `None` otherwise.
    /// When `Some` is returned the batch has already been flushed and is
    /// empty again.
    pub fn push(&mut self, token: usize) -> Option<Vec<usize>> {
        self.tokens.push(token);
        if self.is_full() {
            Some(self.flush())
        } else {
            None
        }
    }

    /// Flush and return all tokens, clearing the batch
    ///
    /// Returns the accumulated tokens in insertion order; an empty vector
    /// if nothing has been pushed since the last flush.
    pub fn flush(&mut self) -> Vec<usize> {
        if self.tokens.is_empty() {
            // Nothing to hand over: keep the current buffer rather than
            // allocating a replacement for no benefit.
            return Vec::new();
        }
        // Swap in a buffer pre-sized like the one from `new`. `mem::take`
        // would leave a zero-capacity Vec behind, forcing every batch after
        // the first to regrow from scratch.
        std::mem::replace(&mut self.tokens, Vec::with_capacity(self.capacity))
    }
}

/// A single speculatively generated token awaiting verification
#[derive(Debug, Clone)]
struct SpeculativeCandidate {
    /// The proposed token
    token: usize,
    /// Confidence score (not read yet; stored for future use in acceptance thresholds)
    #[allow(dead_code)]
    confidence: f32,
}

/// Speculative token buffer for speculative decoding (M25 - IMP-059)
///
/// Holds a bounded list of speculatively generated candidate tokens so they
/// can be compared against the tokens the model actually produced and then
/// accepted or rejected.
#[derive(Debug)]
pub struct SpeculativeBuffer {
    /// Candidates in the order they were added
    candidates: Vec<SpeculativeCandidate>,
    /// Maximum number of candidates the buffer will hold
    capacity: usize,
}

impl SpeculativeBuffer {
    /// Build an empty buffer that holds at most `capacity` candidates
    #[must_use]
    pub fn new(capacity: usize) -> Self {
        let candidates = Vec::with_capacity(capacity);
        Self {
            candidates,
            capacity,
        }
    }

    /// Maximum number of candidates the buffer accepts
    #[must_use]
    pub fn capacity(&self) -> usize {
        self.capacity
    }

    /// Number of candidates currently buffered
    #[must_use]
    pub fn len(&self) -> usize {
        self.candidates.len()
    }

    /// Whether no candidates are buffered
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Append a candidate token together with its confidence score
    ///
    /// The candidate is silently dropped when the buffer already holds
    /// `capacity` entries.
    pub fn add_candidate(&mut self, token: usize, confidence: f32) {
        if self.candidates.len() >= self.capacity {
            return;
        }
        let candidate = SpeculativeCandidate { token, confidence };
        self.candidates.push(candidate);
    }

    /// Compare the buffered candidates with the actual tokens
    ///
    /// Returns `(num_accepted, rejection_index)`. `num_accepted` is the
    /// length of the matching prefix. `rejection_index` is the index of the
    /// first candidate that did not match (or had no actual token to compare
    /// against), or `None` when every candidate matched.
    #[must_use]
    pub fn verify(&self, actual_tokens: &[usize]) -> (usize, Option<usize>) {
        // Length of the leading run where candidate and actual token agree.
        // `zip` stops at the shorter side, so running out of actual tokens
        // ends the run exactly like a mismatch does.
        let matched = self
            .candidates
            .iter()
            .zip(actual_tokens)
            .take_while(|(candidate, actual)| candidate.token == **actual)
            .count();

        if matched < self.candidates.len() {
            (matched, Some(matched))
        } else {
            (matched, None)
        }
    }

    /// Remove the first `n` candidates from the buffer
    ///
    /// Values of `n` at or beyond the current length empty the buffer.
    pub fn accept(&mut self, n: usize) {
        let upto = n.min(self.candidates.len());
        self.candidates.drain(..upto);
    }

    /// Discard every remaining candidate
    pub fn reject(&mut self) {
        self.candidates.clear();
    }
}

/// Identifier assigned to each submitted inference batch
pub type BatchId = u64;

/// Inference batch scheduler for coordinating batched processing (M25 - IMP-060)
///
/// Keeps submitted batches in a pending map and finished batches in a FIFO
/// queue, so submission and result retrieval can happen at different times.
#[derive(Debug)]
pub struct InferenceBatchScheduler {
    /// ID that the next submitted batch will receive
    next_id: BatchId,
    /// Submitted batches that have not been completed yet
    pending: std::collections::HashMap<BatchId, Vec<usize>>,
    /// Completed batches in completion order
    completed: std::collections::VecDeque<(BatchId, Vec<usize>)>,
}

impl InferenceBatchScheduler {
    /// Build an empty scheduler; the first submitted batch gets ID 0
    #[must_use]
    pub fn new() -> Self {
        Self {
            next_id: 0,
            pending: Default::default(),
            completed: Default::default(),
        }
    }

    /// Number of batches submitted but not yet completed
    #[must_use]
    pub fn pending_count(&self) -> usize {
        self.pending.len()
    }

    /// Number of completed batches waiting to be polled or drained
    #[must_use]
    pub fn completed_count(&self) -> usize {
        self.completed.len()
    }

    /// Queue a batch of tokens for processing
    ///
    /// Returns the ID assigned to the batch; IDs count up from 0.
    pub fn submit(&mut self, tokens: Vec<usize>) -> BatchId {
        let assigned = self.next_id;
        self.next_id = assigned + 1;
        self.pending.insert(assigned, tokens);
        assigned
    }

    /// Record the results for a batch
    ///
    /// Removes `batch_id` from the pending set if it is there and appends
    /// `(batch_id, results)` to the completed queue. The ID is not required
    /// to have been pending.
    pub fn complete(&mut self, batch_id: BatchId, results: Vec<usize>) {
        let _ = self.pending.remove(&batch_id);
        let finished = (batch_id, results);
        self.completed.push_back(finished);
    }

    /// Take the oldest completed batch, if any
    ///
    /// Returns `Some((batch_id, results))` if a batch is ready, `None` otherwise.
    pub fn poll(&mut self) -> Option<(BatchId, Vec<usize>)> {
        self.completed.pop_front()
    }

    /// Take every completed batch, oldest first
    pub fn drain(&mut self) -> Vec<(BatchId, Vec<usize>)> {
        let mut finished = Vec::with_capacity(self.completed.len());
        finished.extend(self.completed.drain(..));
        finished
    }
}

impl Default for InferenceBatchScheduler {
    /// Same as [`InferenceBatchScheduler::new`]
    fn default() -> Self {
        Self::new()
    }
}

// =============================================================================
// M26: Async I/O & Event-Driven Processing (Phase 17)
// =============================================================================

/// Async request queue for non-blocking request handling (M26 - IMP-061)
///
/// A bounded FIFO queue. Both ends use try-style operations that return
/// immediately, which lets producers observe backpressure when the queue
/// is full.
#[derive(Debug)]
pub struct AsyncRequestQueue<T> {
    /// Queued items, oldest at the front
    items: std::collections::VecDeque<T>,
    /// Maximum number of items the queue will hold
    capacity: usize,
}

impl<T> AsyncRequestQueue<T> {
    /// Build an empty queue that holds at most `capacity` items
    #[must_use]
    pub fn new(capacity: usize) -> Self {
        let items = std::collections::VecDeque::with_capacity(capacity);
        Self { items, capacity }
    }

    /// Maximum number of items the queue accepts
    #[must_use]
    pub fn capacity(&self) -> usize {
        self.capacity
    }

    /// Number of items currently queued
    #[must_use]
    pub fn len(&self) -> usize {
        self.items.len()
    }

    /// Whether the queue holds no items
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Whether the queue has reached its capacity
    #[must_use]
    pub fn is_full(&self) -> bool {
        self.capacity <= self.items.len()
    }

    /// Try to append an item at the back of the queue
    ///
    /// Returns `true` if the item was queued, `false` if the queue is full
    /// (backpressure). A rejected item is dropped.
    pub fn try_push(&mut self, item: T) -> bool {
        let has_room = !self.is_full();
        if has_room {
            self.items.push_back(item);
        }
        has_room
    }

    /// Try to take the oldest item from the queue
    ///
    /// Returns `Some(item)` if available, `None` if queue is empty.
    pub fn try_pop(&mut self) -> Option<T> {
        self.items.pop_front()
    }
}

/// Boxed callback invoked with `(request_id, output_tokens)` on completion
pub type InferenceCompletionHandler = Box<dyn Fn(u64, &[usize]) + Send + Sync>;

/// Event notifier for inference completion (M26 - IMP-062)
///
/// Stores completion handlers and calls each of them, in registration
/// order, whenever `notify` is invoked.
pub struct InferenceEventNotifier {
    /// Registered handlers in registration order
    handlers: Vec<InferenceCompletionHandler>,
}

// Written by hand because the boxed closures are not `Debug`; only the
// number of handlers is reported.
impl std::fmt::Debug for InferenceEventNotifier {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let handler_count = self.handlers.len();
        formatter
            .debug_struct("InferenceEventNotifier")
            .field("handler_count", &handler_count)
            .finish()
    }
}

impl InferenceEventNotifier {
    /// Build a notifier with no handlers registered
    #[must_use]
    pub fn new() -> Self {
        let handlers = Vec::new();
        Self { handlers }
    }

    /// Number of handlers currently registered
    #[must_use]
    pub fn handler_count(&self) -> usize {
        self.handlers.len()
    }

    /// Add a completion handler
    ///
    /// The handler is called with `(request_id, output_tokens)` on every
    /// subsequent `notify`.
    pub fn register(&mut self, handler: InferenceCompletionHandler) {
        self.handlers.push(handler);
    }

    /// Report a completed inference to every registered handler
    ///
    /// Handlers run synchronously, one after another, in the order they
    /// were registered.
    pub fn notify(&self, request_id: u64, tokens: &[usize]) {
        self.handlers
            .iter()
            .for_each(|callback| callback(request_id, tokens));
    }

    /// Remove every registered handler
    pub fn clear(&mut self) {
        self.handlers.clear();
    }
}

impl Default for InferenceEventNotifier {
    /// Same as [`InferenceEventNotifier::new`]
    fn default() -> Self {
        Self::new()
    }
}

/// Identifier of a request whose deadline is being tracked
pub type RequestId = u64;

/// Timeout manager for request deadline tracking (M26 - IMP-063)
///
/// Maps request IDs to absolute deadlines and reports which requests have
/// passed theirs.
#[derive(Debug)]
pub struct TimeoutManager {
    /// Deadline per tracked request
    deadlines: std::collections::HashMap<RequestId, std::time::Instant>,
}

impl TimeoutManager {
    /// Build a manager with no deadlines registered
    #[must_use]
    pub fn new() -> Self {
        let deadlines = std::collections::HashMap::new();
        Self { deadlines }
    }

    /// Number of requests that currently have a deadline registered
    #[must_use]
    pub fn active_count(&self) -> usize {
        self.deadlines.len()
    }

    /// Register a deadline for a request
    ///
    /// `deadline` is the absolute time at which the request should time out.
    /// Registering the same ID again replaces its previous deadline.
    pub fn register(&mut self, request_id: RequestId, deadline: std::time::Instant) {
        let _previous = self.deadlines.insert(request_id, deadline);
    }

    /// Stop tracking a request
    ///
    /// Use when a request completes before its deadline. Unknown IDs are
    /// ignored.
    pub fn remove(&mut self, request_id: RequestId) {
        let _removed = self.deadlines.remove(&request_id);
    }

    /// Collect and stop tracking every request whose deadline has passed
    ///
    /// A request is expired when the current time is at or after its
    /// deadline. Returns the expired request IDs in no particular order.
    pub fn check_expired(&mut self) -> Vec<RequestId> {
        let now = std::time::Instant::now();
        let mut timed_out = Vec::new();

        // Keep only the entries that are still live, recording the IDs of
        // the ones being dropped.
        self.deadlines.retain(|&request_id, deadline| {
            let still_live = now < *deadline;
            if !still_live {
                timed_out.push(request_id);
            }
            still_live
        });

        timed_out
    }
}

impl Default for TimeoutManager {
    /// Same as [`TimeoutManager::new`]
    fn default() -> Self {
        Self::new()
    }
}

// =============================================================================
// M27: Request Scheduling & Resource Management (Phase 18)
// =============================================================================

/// Priority level type (higher = more important)
pub type Priority = u32;

/// Priority request wrapper for priority queue (M27 - IMP-064)
///
/// Bundles a payload of type `T` with a priority level and a sequence
/// number. All fields are private to this module.
// NOTE(review): the code that constructs and orders these values is not in
// this file's visible portion — presumably in the included
// `priority_queue.rs`; confirm there before relying on ordering details.
#[derive(Debug, Clone)]
pub struct PriorityRequest<T> {
    /// Priority level of this request (higher = more important)
    priority: Priority,
    /// Sequence number used for FIFO ordering among requests that share
    /// the same priority
    sequence: u64, // For FIFO ordering within same priority
    /// The wrapped request payload
    data: T,
}

include!("priority_queue.rs");
include!("batch_scheduling_token_rate.rs");