xybrid-core 0.1.0-rc4

Core runtime for hybrid cloud-edge AI inference: model execution, pipeline orchestration, and routing primitives.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
//! LLM types and adapter - shared abstractions for local LLM inference.
//!
//! This module provides:
//! - `LlmBackend` trait - interface for LLM backends (mistral, llama_cpp)
//! - `LlmConfig` / `GenerationConfig` - configuration types
//! - `LlmRuntimeAdapter` - RuntimeAdapter implementation that uses backends
//!
//! # Architecture
//!
//! ```text
//! LlmRuntimeAdapter (RuntimeAdapter impl)
//!     │
//!     └── LlmBackend (trait - swappable)
//!             │
//!             ├── MistralBackend (mistral.rs - desktop)
//!             └── LlamaCppBackend (llama.cpp - Android + fallback)
//! ```

use crate::ir::{Envelope, EnvelopeKind};
use crate::runtime_adapter::{
    AdapterError, AdapterResult, ModelMetadata, RuntimeAdapter, RuntimeAdapterExt,
};
use std::collections::HashMap;
use std::path::Path;

// Re-export types from the always-available types module
pub use super::types::{
    ChatMessage, GenerationConfig, LlmConfig, PartialToken, StreamingCallback, StreamingError,
};

// =============================================================================
// Result Type
// =============================================================================

/// Result type for LLM backend operations.
///
/// Every fallible [`LlmBackend`] method returns this alias. The error side is
/// [`AdapterError`], so the adapter in this file can hand backend results
/// straight back to its own callers (see `generate_with_config`).
pub type LlmResult<T> = Result<T, AdapterError>;

/// Map a backend name to the execution-provider label for the local LLM path.
///
/// The label is attached to the inference span via `add_metadata` and is
/// hoisted to the wire payload by the SDK, so the analytics backend can
/// attribute latency to the engine that actually ran (Metal vs CPU vs CUDA).
///
/// The answer is fixed at compile time because backend acceleration is
/// selected by build flags: `llm-mistral-metal` and `llm-mistral-cuda` switch
/// mistralrs to the corresponding accelerator; `llm-llamacpp` on macOS builds
/// with `GGML_METAL=ON` by default (see `build.rs`).
///
/// Recognised names are `"llama-cpp"` and `"mistral"`. Any other name
/// (unknown or mock backends) yields `"cpu"` rather than a guess — wrong is
/// worse than missing on a diagnostic field.
pub(crate) fn local_execution_provider(backend_name: &str) -> &'static str {
    if backend_name == "llama-cpp" {
        return llamacpp_execution_provider();
    }
    if backend_name == "mistral" {
        return mistral_execution_provider();
    }
    // Unrecognised backend: report the conservative default.
    "cpu"
}

/// Execution-provider label for the llama.cpp backend, fixed at compile time.
///
/// - llama.cpp compiled in on an Apple target (macOS / iOS): `"metal"`,
///   because build.rs sets GGML_METAL=ON for both.
/// - llama.cpp compiled in elsewhere (Linux / Windows / Android): `"cpu"`;
///   those builds currently omit GGML_CUDA. When a CUDA build flag is added,
///   switch on it here.
/// - Compiled without llama.cpp: `"cpu"`. Should be unreachable from a
///   LlamaCppBackend instance, but is a safe fallback for tests.
fn llamacpp_execution_provider() -> &'static str {
    let metal_build = cfg!(all(
        feature = "llm-llamacpp",
        any(target_os = "macos", target_os = "ios")
    ));
    if metal_build {
        "metal"
    } else {
        "cpu"
    }
}

/// Execution-provider label for the mistral.rs backend, fixed at compile time.
///
/// Precedence mirrors the build flags: `llm-mistral-metal` wins and yields
/// `"metal"`; otherwise `llm-mistral-cuda` yields `"cuda"`; with neither
/// accelerator feature enabled the label is `"cpu"`.
fn mistral_execution_provider() -> &'static str {
    if cfg!(feature = "llm-mistral-metal") {
        "metal"
    } else if cfg!(feature = "llm-mistral-cuda") {
        "cuda"
    } else {
        "cpu"
    }
}

// =============================================================================
// Generation Output
// =============================================================================

/// Generation output from LLM inference.
///
/// Returned by every generation method on `LlmBackend` (`generate`,
/// `generate_raw`, `generate_streaming`). The text and the basic counters are
/// always present; the latency and throughput breakdowns are `Option`s that a
/// backend fills only when it measures them. `LlmRuntimeAdapter::execute`
/// forwards each optional metric to the response envelope and the tracing
/// span only when it is `Some`.
#[derive(Debug, Clone)]
pub struct GenerationOutput {
    /// Generated text
    pub text: String,
    /// Number of tokens generated
    pub tokens_generated: usize,
    /// Generation time in milliseconds
    pub generation_time_ms: u64,
    /// Tokens per second (wallclock — includes prefill + decode)
    pub tokens_per_second: f32,
    /// Finish reason: "stop", "length", "error"
    pub finish_reason: String,
    /// Time to first emitted chunk (ms). `None` for non-streaming backends.
    pub ttft_ms: Option<u64>,
    /// Mean inter-token latency (ms), computed from inter-chunk gaps.
    pub mean_itl_ms: Option<f32>,
    /// p95 inter-token latency (ms).
    pub p95_itl_ms: Option<u32>,
    /// Number of chunks emitted (≈ tokens when 1 token/chunk).
    pub emitted_chunks: Option<u32>,
    /// Per-gap inter-chunk latencies (ms), for histogram analysis if desired.
    /// Not forwarded to envelope or span metadata by the adapter in this file.
    pub inter_chunk_ms: Vec<u32>,
    /// Decode-only tokens/sec (excludes prefill time). Sourced from
    /// mistralrs's terminal `Usage.avg_compl_tok_per_sec`. Steady-state
    /// throughput, not wallclock.
    pub decode_tps: Option<f32>,
    /// Prefill tokens/sec. Sourced from `Usage.avg_prompt_tok_per_sec`.
    pub prefill_tps: Option<f32>,
}

// =============================================================================
// LLM Backend Trait
// =============================================================================

/// Common interface implemented by every local LLM inference engine.
///
/// `LlmRuntimeAdapter` owns a `Box<dyn LlmBackend>` and talks to it only
/// through this trait, so engines (mistral.rs, llama.cpp, etc.) can be
/// swapped without touching the adapter. Implementors must be `Send + Sync`.
pub trait LlmBackend: Send + Sync {
    /// Name of this backend (e.g., "mistral", "llama-cpp").
    fn name(&self) -> &str;

    /// Model file formats this backend can load.
    fn supported_formats(&self) -> Vec<&'static str>;

    /// Load the model described by `config`.
    fn load(&mut self, config: &LlmConfig) -> LlmResult<()>;

    /// Whether a model is currently loaded.
    fn is_loaded(&self) -> bool;

    /// Unload the current model, freeing its resources.
    fn unload(&mut self) -> LlmResult<()>;

    /// Generate text from a list of chat messages.
    fn generate(
        &self,
        messages: &[ChatMessage],
        config: &GenerationConfig,
    ) -> LlmResult<GenerationOutput>;

    /// Generate text from a raw prompt (no chat template).
    fn generate_raw(&self, prompt: &str, config: &GenerationConfig) -> LlmResult<GenerationOutput>;

    /// Generate text while reporting progress through `on_token`.
    ///
    /// The callback is handed a `PartialToken` per emitted token. When the
    /// callback returns an error, that error is converted with
    /// `AdapterError::from_streaming_callback_error` and returned.
    ///
    /// # Default Implementation
    ///
    /// The provided body does not stream: it runs `generate()` to completion
    /// and then invokes the callback exactly once with the whole output as a
    /// single "token" (`index` 0, no `token_id`, `finish_reason` set).
    /// Backends with real streaming should override this method.
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// backend.generate_streaming(&messages, &config, Box::new(|token| {
    ///     print!("{}", token.token);
    ///     std::io::stdout().flush()?;
    ///     Ok(())
    /// }))?;
    /// ```
    fn generate_streaming(
        &self,
        messages: &[ChatMessage],
        config: &GenerationConfig,
        on_token: StreamingCallback<'_>,
    ) -> LlmResult<GenerationOutput> {
        // Non-streaming fallback: finish the whole generation first.
        let output = self.generate(messages, config)?;

        // Rebind mutably so the callback can be invoked.
        let mut emit = on_token;

        // One synthetic token carrying the complete text.
        let delivered = emit(PartialToken {
            token: output.text.clone(),
            token_id: None,
            index: 0,
            cumulative_text: output.text.clone(),
            finish_reason: Some(output.finish_reason.clone()),
        });

        match delivered {
            Ok(_) => Ok(output),
            Err(callback_err) => Err(AdapterError::from_streaming_callback_error(callback_err)),
        }
    }

    /// Whether this backend streams for real.
    ///
    /// Backends that override `generate_streaming` should return `true`;
    /// the default is `false`.
    fn supports_streaming(&self) -> bool {
        false
    }

    /// Approximate memory usage in bytes, if the backend can report it.
    fn memory_usage(&self) -> Option<u64> {
        None
    }

    /// Maximum context length of the loaded model, if known.
    fn context_length(&self) -> Option<usize> {
        None
    }

    /// Prompt tokens served from the backend's local KV cache on the most
    /// recent `generate*` call, for backends that reuse cache across turns.
    ///
    /// - `None` (the default): the backend keeps no cross-call cache state —
    ///   covers cloud, mistralrs, and mock test backends.
    /// - `Some(0)`: the backend does keep cache state but the latest call
    ///   shared no prefix (a fresh first turn, or a totally divergent prompt).
    ///
    /// Telemetry uses this to surface `prompt_cached_tokens` on inference
    /// events — the local-LLM mirror of the cloud-side
    /// `cache_read_input_tokens` field. Multi-turn workloads with a stable
    /// system prompt typically see 70-90% cache hits on every call after the
    /// first, which is the difference between a 7B model feeling responsive
    /// and feeling sluggish.
    fn last_cached_prefix_len(&self) -> Option<usize> {
        None
    }

    /// Canonical wire label for cost-attribution telemetry.
    ///
    /// This is the closed-set `backend` value the inner `llm_inference` span
    /// should carry on the wire (`"llamacpp"` / `"mistralrs"` / future
    /// additions). The default is `None` so mock and test backends don't
    /// claim a real runtime identity.
    ///
    /// Real LLM backends override this, and the inner-span sites call it
    /// *after* backend selection so the label reflects the runtime that
    /// actually executes the model — not the template-derived default from
    /// `backend_label_from_template`. This matters because the runtime is
    /// chosen by cargo feature (see [`LlmRuntimeAdapter::new`] precedence),
    /// so non-default-features builds (e.g. `llm-mistral` only, or
    /// `llm-mistral` + `llm-llamacpp` where the mistral arm wins) would
    /// otherwise mislabel unannotated GGUF bundles as `"llamacpp"` when
    /// mistral.rs is the executing backend.
    fn wire_label(&self) -> Option<&'static str> {
        None
    }
}

/// Factory function type for creating LLM backends.
///
/// A plain, argument-less `fn` pointer that returns a boxed [`LlmBackend`]
/// on success or an `AdapterError` (via [`LlmResult`]) when construction
/// fails.
pub type BackendFactory = fn() -> LlmResult<Box<dyn LlmBackend>>;

// =============================================================================
// LLM Runtime Adapter
// =============================================================================

/// LLM Runtime Adapter.
///
/// Provides local LLM inference through the RuntimeAdapter interface.
/// Uses a pluggable backend for actual inference.
///
/// The adapter tracks a single model at a time: `metadata` and
/// `current_model_path` are filled in together by `load_model_with_config`
/// and cleared together by `unload_model`.
pub struct LlmRuntimeAdapter {
    /// The underlying LLM backend
    backend: Box<dyn LlmBackend>,
    /// Model metadata; `None` until a load succeeds and again after unload.
    metadata: Option<ModelMetadata>,
    /// Current model path; set and cleared alongside `metadata`.
    current_model_path: Option<String>,
    /// Default generation config; the base that per-request envelope
    /// metadata overrides in `parse_generation_config`.
    default_generation_config: GenerationConfig,
}

impl LlmRuntimeAdapter {
    /// Create a new LLM Runtime Adapter with the default backend.
    ///
    /// The default backend depends on feature flags:
    /// - `llm-mistral`: Uses MistralBackend (desktop, not Android)
    /// - `llm-llamacpp`: Uses LlamaCppBackend (Android compatible)
    ///
    /// If both are enabled, prefers MistralBackend on non-Android platforms
    /// and LlamaCppBackend on Android.
    #[cfg(feature = "llm-mistral")]
    pub fn new() -> AdapterResult<Self> {
        use crate::runtime_adapter::mistral::MistralBackend;
        let backend = MistralBackend::new()?;
        Ok(Self::with_backend(Box::new(backend)))
    }

    /// Create a new LLM Runtime Adapter with LlamaCppBackend.
    #[cfg(all(feature = "llm-llamacpp", not(feature = "llm-mistral")))]
    pub fn new() -> AdapterResult<Self> {
        use crate::runtime_adapter::llama_cpp::LlamaCppBackend;
        let backend = LlamaCppBackend::new()?;
        Ok(Self::with_backend(Box::new(backend)))
    }

    /// Create a new LLM Runtime Adapter with MistralBackend explicitly.
    ///
    /// Use this when you need to force MistralBackend regardless of metadata hints.
    #[cfg(feature = "llm-mistral")]
    pub fn with_mistral() -> AdapterResult<Self> {
        use crate::runtime_adapter::mistral::MistralBackend;
        let backend = MistralBackend::new()?;
        Ok(Self::with_backend(Box::new(backend)))
    }

    /// Create a new LLM Runtime Adapter with LlamaCppBackend explicitly.
    ///
    /// Use this when you need to force LlamaCppBackend (e.g., for models that
    /// require it like Gemma 3 which isn't supported by mistral.rs GGUF).
    #[cfg(feature = "llm-llamacpp")]
    pub fn with_llamacpp() -> AdapterResult<Self> {
        use crate::runtime_adapter::llama_cpp::LlamaCppBackend;
        let backend = LlamaCppBackend::new()?;
        Ok(Self::with_backend(Box::new(backend)))
    }

    /// Create a new LLM Runtime Adapter based on a backend hint.
    ///
    /// If the hint is "llamacpp" and the feature is available, uses LlamaCppBackend.
    /// Otherwise falls back to the default backend for the platform.
    #[cfg(any(feature = "llm-mistral", feature = "llm-llamacpp"))]
    pub fn with_backend_hint(hint: Option<&str>) -> AdapterResult<Self> {
        match hint {
            #[cfg(feature = "llm-llamacpp")]
            Some("llamacpp") => Self::with_llamacpp(),
            #[cfg(feature = "llm-mistral")]
            Some("mistral") => Self::with_mistral(),
            _ => Self::new(),
        }
    }

    /// Create a new adapter with a custom backend.
    pub fn with_backend(backend: Box<dyn LlmBackend>) -> Self {
        Self {
            backend,
            metadata: None,
            current_model_path: None,
            default_generation_config: GenerationConfig::default(),
        }
    }

    /// Set the default generation configuration.
    pub fn set_generation_config(&mut self, config: GenerationConfig) {
        self.default_generation_config = config;
    }

    /// Get the current generation configuration.
    pub fn generation_config(&self) -> &GenerationConfig {
        &self.default_generation_config
    }

    /// Get memory usage in bytes (if available).
    pub fn memory_usage(&self) -> Option<u64> {
        self.backend.memory_usage()
    }

    /// Get the context length of the loaded model.
    pub fn context_length(&self) -> Option<usize> {
        self.backend.context_length()
    }

    /// Canonical wire label of the underlying backend (see
    /// [`LlmBackend::wire_label`]). Inner LLM span sites call this to
    /// stamp the runtime that actually executed, overwriting the
    /// template-derived default from `backend_label_from_template`.
    pub fn wire_label(&self) -> Option<&'static str> {
        self.backend.wire_label()
    }

    /// Get a reference to the underlying backend.
    ///
    /// This is useful for accessing backend-specific features like streaming.
    pub fn backend(&self) -> &dyn LlmBackend {
        self.backend.as_ref()
    }

    /// Generate with custom configuration.
    pub fn generate_with_config(
        &self,
        prompt: &str,
        system: Option<&str>,
        config: &GenerationConfig,
    ) -> AdapterResult<GenerationOutput> {
        let mut messages = Vec::new();

        if let Some(sys) = system {
            messages.push(ChatMessage::system(sys));
        }
        messages.push(ChatMessage::user(prompt));

        self.backend.generate(&messages, config)
    }

    /// Extract model ID from path.
    fn extract_model_id(&self, path: &str) -> String {
        Path::new(path)
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or("unknown")
            .to_string()
    }

    /// Parse generation config from envelope metadata.
    fn parse_generation_config(&self, metadata: &HashMap<String, String>) -> GenerationConfig {
        let mut config = self.default_generation_config.clone();

        if let Some(max_tokens) = metadata.get("max_tokens").and_then(|s| s.parse().ok()) {
            config.max_tokens = max_tokens;
        }
        if let Some(temperature) = metadata.get("temperature").and_then(|s| s.parse().ok()) {
            config.temperature = temperature;
        }
        if let Some(top_p) = metadata.get("top_p").and_then(|s| s.parse().ok()) {
            config.top_p = top_p;
        }
        if let Some(min_p) = metadata.get("min_p").and_then(|s| s.parse().ok()) {
            config.min_p = min_p;
        }
        if let Some(top_k) = metadata.get("top_k").and_then(|s| s.parse().ok()) {
            config.top_k = top_k;
        }

        config
    }
}

impl LlmRuntimeAdapter {
    /// Load a model with a full LlmConfig, preserving context_length and other settings.
    pub fn load_model_with_config(&mut self, config: &LlmConfig) -> AdapterResult<()> {
        let path = &config.model_path;
        let model_path = Path::new(path);

        if !model_path.exists() {
            return Err(AdapterError::ModelNotFound(path.to_string()));
        }

        self.backend.load(config)?;

        let model_id = self.extract_model_id(path);

        self.metadata = Some(ModelMetadata {
            model_id: model_id.clone(),
            version: "1.0.0".to_string(),
            runtime_type: self.backend.name().to_string(),
            model_path: path.to_string(),
            input_schema: {
                let mut schema = HashMap::new();
                schema.insert("text".to_string(), vec![1]);
                schema
            },
            output_schema: {
                let mut schema = HashMap::new();
                schema.insert("text".to_string(), vec![1]);
                schema
            },
        });

        self.current_model_path = Some(path.to_string());
        Ok(())
    }
}

impl RuntimeAdapter for LlmRuntimeAdapter {
    fn name(&self) -> &str {
        "llm"
    }

    fn supported_formats(&self) -> Vec<&'static str> {
        self.backend.supported_formats()
    }

    fn load_model(&mut self, path: &str) -> AdapterResult<()> {
        self.load_model_with_config(&LlmConfig::new(path))
    }

    fn execute(&self, input: &Envelope) -> AdapterResult<Envelope> {
        if !self.backend.is_loaded() {
            return Err(AdapterError::ModelNotLoaded(
                "No model loaded. Call load_model() first.".to_string(),
            ));
        }

        match &input.kind {
            EnvelopeKind::Text(prompt) => {
                let system = input.metadata.get("system_prompt").map(|s| s.as_str());
                let config = self.parse_generation_config(&input.metadata);

                let mut messages = Vec::new();
                if let Some(sys) = system {
                    messages.push(ChatMessage::system(sys));
                }
                messages.push(ChatMessage::user(prompt));

                let output = self.backend.generate(&messages, &config)?;

                let mut response_metadata = HashMap::new();
                response_metadata.insert(
                    "tokens_generated".to_string(),
                    output.tokens_generated.to_string(),
                );
                response_metadata.insert(
                    "generation_time_ms".to_string(),
                    output.generation_time_ms.to_string(),
                );
                response_metadata.insert(
                    "tokens_per_second".to_string(),
                    format!("{:.2}", output.tokens_per_second),
                );
                response_metadata.insert("finish_reason".to_string(), output.finish_reason.clone());

                // Streaming-path metrics — the canonical keys the platform's
                // span-extraction layer reads. Route to both the envelope
                // metadata (so downstream stages can see them) and
                // `xybrid_core::tracing::add_metadata` (so the span-level
                // wire path picks them up — see plan phase 0.4).
                if let Some(v) = output.ttft_ms {
                    let s = v.to_string();
                    response_metadata.insert("ttft_ms".to_string(), s.clone());
                    crate::tracing::add_metadata("ttft_ms", &s);
                }
                if let Some(v) = output.mean_itl_ms {
                    let s = format!("{:.4}", v);
                    response_metadata.insert("mean_itl_ms".to_string(), s.clone());
                    crate::tracing::add_metadata("mean_itl_ms", &s);
                }
                if let Some(v) = output.p95_itl_ms {
                    let s = v.to_string();
                    response_metadata.insert("p95_itl_ms".to_string(), s.clone());
                    crate::tracing::add_metadata("p95_itl_ms", &s);
                }
                if let Some(v) = output.emitted_chunks {
                    let s = v.to_string();
                    response_metadata.insert("emitted_chunks".to_string(), s.clone());
                    crate::tracing::add_metadata("emitted_chunks", &s);
                }
                if let Some(v) = output.decode_tps {
                    let s = format!("{:.4}", v);
                    response_metadata.insert("decode_tps".to_string(), s.clone());
                    crate::tracing::add_metadata("decode_tps", &s);
                }
                if let Some(v) = output.prefill_tps {
                    let s = format!("{:.4}", v);
                    response_metadata.insert("prefill_tps".to_string(), s.clone());
                    crate::tracing::add_metadata("prefill_tps", &s);
                }
                // Mirror the always-present scalars onto the span metadata
                // too so they reach the platform without relying on the
                // envelope metadata being serialized downstream.
                crate::tracing::add_metadata(
                    "tokens_generated",
                    output.tokens_generated.to_string(),
                );
                // Canonical `tokens_out` key for the analytics backend's
                // span extractor. Equal to `tokens_generated` by
                // construction — the SDK hoist lifts this onto the outer
                // payload so trace dashboards can render the column.
                crate::tracing::add_metadata("tokens_out", output.tokens_generated.to_string());
                crate::tracing::add_metadata(
                    "generation_time_ms",
                    output.generation_time_ms.to_string(),
                );
                crate::tracing::add_metadata(
                    "tokens_per_second",
                    format!("{:.2}", output.tokens_per_second),
                );
                crate::tracing::add_metadata("finish_reason", &output.finish_reason);
                // span_kind tells the console swim-lanes renderer which
                // bar color to use. llama.cpp with Metal compiled in runs
                // the forward pass on the GPU; otherwise it's CPU.
                #[cfg(all(
                    any(feature = "llm-mistral-metal", feature = "llm-llamacpp"),
                    target_os = "macos"
                ))]
                crate::tracing::add_metadata("span_kind", "gpu");
                #[cfg(not(all(
                    any(feature = "llm-mistral-metal", feature = "llm-llamacpp"),
                    target_os = "macos"
                )))]
                crate::tracing::add_metadata("span_kind", "cpu");

                // Resolved execution provider: which on-device engine
                // path actually ran. Cost-attribution telemetry needs
                // this to explain latency variance on the same chip +
                // model. Cloud LLMs get attribution from the `provider`
                // field on their adapter span instead, so we omit here.
                crate::tracing::add_metadata(
                    "execution_provider",
                    local_execution_provider(self.backend.name()),
                );

                // Local KV cache hits: only emit when the backend
                // actually reused a prefix (n > 0). `Some(0)` means
                // first-turn / divergent prompt and we want telemetry
                // to read like a non-cached call. `None` means the
                // backend doesn't track prefix reuse at all.
                if let Some(n) = self.backend.last_cached_prefix_len() {
                    if n > 0 {
                        crate::tracing::add_metadata("prompt_cached_tokens", n.to_string());
                    }
                }

                Ok(Envelope {
                    kind: EnvelopeKind::Text(output.text),
                    metadata: response_metadata,
                })
            }
            EnvelopeKind::Audio(_) => Err(AdapterError::InvalidInput(
                "LLM adapter expects Text input, not Audio".to_string(),
            )),
            EnvelopeKind::Embedding(_) => Err(AdapterError::InvalidInput(
                "LLM adapter expects Text input, not Embedding".to_string(),
            )),
        }
    }
}

impl RuntimeAdapterExt for LlmRuntimeAdapter {
    /// Whether the backend has a model loaded.
    ///
    /// The adapter tracks a single model, so `_model_id` is ignored.
    fn is_loaded(&self, _model_id: &str) -> bool {
        self.backend.is_loaded()
    }

    /// Metadata recorded by the last successful load.
    ///
    /// `_model_id` is ignored. Returns `AdapterError::ModelNotLoaded` when
    /// no metadata is recorded.
    fn get_metadata(&self, _model_id: &str) -> AdapterResult<&ModelMetadata> {
        match self.metadata.as_ref() {
            Some(meta) => Ok(meta),
            None => Err(AdapterError::ModelNotLoaded("No model loaded".to_string())),
        }
    }

    /// Run inference; identical to `execute` (`_model_id` is ignored).
    fn infer(&self, _model_id: &str, input: &Envelope) -> AdapterResult<Envelope> {
        self.execute(input)
    }

    /// Unload the backend's model and clear the recorded metadata and path.
    ///
    /// If the backend fails to unload, the error is returned and the
    /// adapter's bookkeeping is left as it was.
    fn unload_model(&mut self, _model_id: &str) -> AdapterResult<()> {
        self.backend.unload()?;
        self.current_model_path = None;
        self.metadata = None;
        Ok(())
    }

    /// IDs of loaded models: at most one entry, and empty when the backend
    /// reports nothing loaded or no metadata is recorded.
    fn list_loaded_models(&self) -> Vec<String> {
        if !self.backend.is_loaded() {
            return Vec::new();
        }
        self.metadata
            .iter()
            .map(|meta| meta.model_id.clone())
            .collect()
    }
}

// =============================================================================
// Tests
// =============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    // Note: Tests for PartialToken, ChatMessage, GenerationConfig, and LlmConfig
    // are in runtime_adapter/types.rs since those types are now defined there.

    #[test]
    fn local_execution_provider_maps_known_backends() {
        // Shape-only check for the backend names we ship: each one must come
        // back from the helper as a non-empty label. The concrete label
        // varies with build flags and target, so it is not pinned here —
        // per-cfg coverage is left to the `#[cfg]` arms at compile time.
        for backend_name in ["llama-cpp", "mistral"] {
            let label = local_execution_provider(backend_name);
            assert!(!label.is_empty(), "{} must map to a label", backend_name);
        }
    }

    #[test]
    fn local_execution_provider_unknown_backend_falls_back_to_cpu() {
        // Names the helper does not recognise (test mocks, the empty
        // string) must report "cpu": on a diagnostic field a wrong guess is
        // worse than a conservative default.
        let unrecognised = ["mock", ""];
        for backend_name in unrecognised {
            assert_eq!(local_execution_provider(backend_name), "cpu");
        }
    }

    /// Builds a `GenerationOutput` by hand and reads the core fields back.
    #[test]
    fn test_generation_output_structure() {
        let sample = GenerationOutput {
            // Core fields, checked below.
            text: "Hello world".to_string(),
            tokens_generated: 3,
            generation_time_ms: 100,
            tokens_per_second: 30.0,
            finish_reason: "stop".to_string(),
            // Remaining fields are left unset / empty.
            ttft_ms: None,
            mean_itl_ms: None,
            p95_itl_ms: None,
            emitted_chunks: None,
            inter_chunk_ms: Vec::new(),
            decode_tps: None,
            prefill_tps: None,
        };

        assert_eq!(sample.text, "Hello world");
        assert_eq!(sample.finish_reason, "stop");
        assert_eq!(sample.tokens_generated, 3);
        assert_eq!(sample.generation_time_ms, 100);

        // Floats are compared by distance rather than exact equality.
        let throughput_delta = (sample.tokens_per_second - 30.0).abs();
        assert!(throughput_delta < f32::EPSILON);
    }

    /// Checks that the trait-provided `generate_streaming` hands the whole
    /// generated text to the callback as a single, final token.
    #[test]
    fn test_default_streaming_implementation() {
        // Minimal backend that supplies only the required methods, so the
        // default `generate_streaming` / `supports_streaming` are exercised.
        struct MockBackend;

        impl LlmBackend for MockBackend {
            fn name(&self) -> &str {
                "mock"
            }
            fn supported_formats(&self) -> Vec<&'static str> {
                vec!["test"]
            }
            fn is_loaded(&self) -> bool {
                true
            }
            fn load(&mut self, _config: &LlmConfig) -> LlmResult<()> {
                Ok(())
            }
            fn unload(&mut self) -> LlmResult<()> {
                Ok(())
            }
            fn generate_raw(
                &self,
                prompt: &str,
                config: &GenerationConfig,
            ) -> LlmResult<GenerationOutput> {
                // Raw prompts are wrapped as a single user message.
                let wrapped = [ChatMessage::user(prompt)];
                self.generate(&wrapped, config)
            }
            fn generate(
                &self,
                _messages: &[ChatMessage],
                _config: &GenerationConfig,
            ) -> LlmResult<GenerationOutput> {
                let canned = GenerationOutput {
                    text: "Test response".to_string(),
                    tokens_generated: 2,
                    generation_time_ms: 50,
                    tokens_per_second: 40.0,
                    finish_reason: "stop".to_string(),
                    ttft_ms: None,
                    mean_itl_ms: None,
                    p95_itl_ms: None,
                    emitted_chunks: None,
                    inter_chunk_ms: Vec::new(),
                    decode_tps: None,
                    prefill_tps: None,
                };
                Ok(canned)
            }
            // `generate_streaming` intentionally not overridden.
        }

        let backend = MockBackend;
        // The default `supports_streaming` is expected to report false.
        assert!(!backend.supports_streaming());

        let config = GenerationConfig::default();
        let messages = vec![ChatMessage::user("test")];

        // Collect everything the callback is handed.
        let mut received_tokens: Vec<PartialToken> = Vec::new();
        let result = backend.generate_streaming(
            &messages,
            &config,
            Box::new(|partial| {
                received_tokens.push(partial);
                Ok(())
            }),
        );

        assert!(result.is_ok());
        let output = result.unwrap();
        assert_eq!(output.text, "Test response");

        // Exactly one callback invocation, carrying the full text and the
        // finish reason.
        assert_eq!(received_tokens.len(), 1);
        let only_token = &received_tokens[0];
        assert_eq!(only_token.token, "Test response");
        assert_eq!(only_token.cumulative_text, "Test response");
        assert_eq!(only_token.finish_reason.as_deref(), Some("stop"));
        assert!(only_token.is_final());
    }

    /// Checks that an error returned by the streaming callback surfaces as
    /// an error from the default `generate_streaming`.
    #[test]
    fn test_default_streaming_callback_error() {
        // Minimal backend relying on the trait's default streaming path.
        struct MockBackend;

        impl LlmBackend for MockBackend {
            fn name(&self) -> &str {
                "mock"
            }
            fn supported_formats(&self) -> Vec<&'static str> {
                vec!["test"]
            }
            fn is_loaded(&self) -> bool {
                true
            }
            fn load(&mut self, _config: &LlmConfig) -> LlmResult<()> {
                Ok(())
            }
            fn unload(&mut self) -> LlmResult<()> {
                Ok(())
            }
            fn generate_raw(
                &self,
                prompt: &str,
                config: &GenerationConfig,
            ) -> LlmResult<GenerationOutput> {
                // Raw prompts are wrapped as a single user message.
                let wrapped = [ChatMessage::user(prompt)];
                self.generate(&wrapped, config)
            }
            fn generate(
                &self,
                _messages: &[ChatMessage],
                _config: &GenerationConfig,
            ) -> LlmResult<GenerationOutput> {
                let canned = GenerationOutput {
                    text: "Test".to_string(),
                    tokens_generated: 1,
                    generation_time_ms: 10,
                    tokens_per_second: 100.0,
                    finish_reason: "stop".to_string(),
                    ttft_ms: None,
                    mean_itl_ms: None,
                    p95_itl_ms: None,
                    emitted_chunks: None,
                    inter_chunk_ms: Vec::new(),
                    decode_tps: None,
                    prefill_tps: None,
                };
                Ok(canned)
            }
        }

        let backend = MockBackend;
        let config = GenerationConfig::default();
        let messages = vec![ChatMessage::user("test")];

        // The callback rejects the very first token it is given.
        let result = backend.generate_streaming(
            &messages,
            &config,
            Box::new(|_token| Err("User cancelled".into())),
        );

        assert!(result.is_err());

        // Accept either a wrapper message or the callback's own text.
        let err_msg = result.unwrap_err().to_string();
        let accepted_fragments = ["callback error", "User cancelled"];
        assert!(accepted_fragments
            .iter()
            .any(|fragment| err_msg.contains(*fragment)));
    }
}