brainos-cortex 0.5.0

LLM provider abstraction, context assembly, and action dispatch for Brain OS
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
//! LLM client — hybrid provider with trait-based adapter.
//!
//! `LlmProvider` trait with multiple implementations:
//! - `OllamaProvider` — local Ollama server
//! - `OpenAiProvider` — OpenAI-compatible APIs

use std::pin::Pin;

use futures::Stream;
use serde::{Deserialize, Serialize};
use thiserror::Error;

mod ollama;
mod openai;

#[cfg(test)]
mod tests;

pub use ollama::OllamaProvider;
pub use openai::OpenAiProvider;

mod failover;

// ─── Errors ─────────────────────────────────────────────────────────────────

/// Errors from the LLM layer.
///
/// Returned by the provider trait methods and by the factory / selection
/// functions in this module. The `Display` text of each variant comes from
/// its `#[error(...)]` attribute.
#[derive(Debug, Error)]
pub enum LlmError {
    /// Failure reported by `reqwest`. `#[from]` generates
    /// `From<reqwest::Error>`, so `?` converts it automatically.
    #[error("HTTP request failed: {0}")]
    Http(#[from] reqwest::Error),

    /// The server answered with a non-success status. `ensure_ok` builds
    /// this from the numeric status code and the response body text.
    #[error("API error: {status} - {message}")]
    Api { status: u16, message: String },

    /// A streaming failure; the payload is a human-readable description.
    #[error("Stream error: {0}")]
    Stream(String),

    /// A response that could not be interpreted; the payload is a
    /// human-readable description.
    #[error("Invalid response format: {0}")]
    InvalidFormat(String),

    /// The provider cannot be used: in this module, HTTP client
    /// construction failed, no `base_url` could be resolved, or no
    /// providers are configured at all.
    #[error("Provider not available: {0}")]
    ProviderUnavailable(String),

    /// The provider refused the request because of rate limiting.
    /// NOTE(review): not constructed in this file — presumably mapped from
    /// HTTP 429 by the provider submodules; confirm there.
    #[error("Rate limited")]
    RateLimited,

    /// The request timed out.
    /// NOTE(review): not constructed in this file — confirm where the
    /// provider submodules produce it.
    #[error("Timeout")]
    Timeout,
}

// ─── Types ──────────────────────────────────────────────────────────────────

/// A message in the conversation.
///
/// Plain text turns set only `role` + `content`; the two extra fields carry
/// tool-use protocol state and stay empty otherwise. Prefer the
/// constructors ([`Message::user`], [`Message::tool_result`], …) over a
/// struct literal so the tool-use fields default correctly.
///
/// When serialised, `tool_calls` and `tool_call_id` are omitted if empty /
/// `None`, and both fall back to their defaults when missing on
/// deserialisation.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct Message {
    /// Who authored this turn. The derived `Default` yields [`Role::User`].
    pub role: Role,
    /// Text of the turn. May be empty for an assistant turn that only
    /// proposes tool calls.
    pub content: String,
    /// Tool calls an assistant turn proposed. Replayed verbatim to the
    /// provider so the following [`Role::Tool`] result turns resolve
    /// against them. Empty for every non-assistant or plain-text message.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tool_calls: Vec<ProposedToolCall>,
    /// For a [`Role::Tool`] result turn: the id of the proposed call this
    /// answers (links the result to the assistant's `tool_calls`). `None`
    /// for every other role.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
}

impl Message {
    /// Build a system-prompt turn.
    pub fn system(content: impl Into<String>) -> Self {
        Self::plain(Role::System, content)
    }

    /// Build a user turn.
    pub fn user(content: impl Into<String>) -> Self {
        Self::plain(Role::User, content)
    }

    /// Build an assistant turn that is plain text only (no proposed tool
    /// calls).
    pub fn assistant(content: impl Into<String>) -> Self {
        Self::plain(Role::Assistant, content)
    }

    /// Build an assistant turn that carries proposed tool calls. `content`
    /// may be empty, since a pure tool-call turn has no prose.
    pub fn assistant_with_tool_calls(
        content: impl Into<String>,
        tool_calls: Vec<ProposedToolCall>,
    ) -> Self {
        Self {
            role: Role::Assistant,
            content: content.into(),
            tool_calls,
            ..Self::default()
        }
    }

    /// Build a tool-result turn that answers the proposed call identified
    /// by `tool_call_id`.
    pub fn tool_result(tool_call_id: impl Into<String>, content: impl Into<String>) -> Self {
        Self {
            role: Role::Tool,
            content: content.into(),
            tool_call_id: Some(tool_call_id.into()),
            ..Self::default()
        }
    }

    /// Shared constructor for turns without tool-use state: the remaining
    /// fields take their `Default` values (no tool calls, no call id).
    fn plain(role: Role, content: impl Into<String>) -> Self {
        Self {
            role,
            content: content.into(),
            ..Self::default()
        }
    }
}

/// Message roles.
///
/// Serialised by serde as the lowercase variant name
/// (`rename_all = "lowercase"`). The derived `Default` is [`Role::User`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
#[serde(rename_all = "lowercase")]
pub enum Role {
    /// A system-prompt turn.
    System,
    /// A user turn; this is the default role.
    #[default]
    User,
    /// An assistant turn, optionally carrying proposed tool calls on its
    /// [`Message`].
    Assistant,
    /// A tool-call result fed back to the model. Carries a `tool_call_id`
    /// on its [`Message`].
    Tool,
}

impl Role {
    /// Wire-format string used by every OpenAI-shaped chat API (OpenAI,
    /// OpenRouter, Ollama in chat mode, etc.). Centralised here so each
    /// provider's `convert_messages` body stays a one-line `.map(...)`.
    pub fn as_wire_str(&self) -> &'static str {
        match self {
            Role::System => "system",
            Role::User => "user",
            Role::Assistant => "assistant",
            Role::Tool => "tool",
        }
    }
}

// ─── Shared HTTP helpers (used by openai + ollama submodules) ───────────────

/// Construct a `reqwest::Client` configured with `timeout`.
///
/// # Errors
///
/// A builder failure is reported as [`LlmError::ProviderUnavailable`] with
/// the underlying error embedded in the message.
pub(crate) fn build_http_client(timeout: std::time::Duration) -> Result<reqwest::Client, LlmError> {
    let built = reqwest::Client::builder().timeout(timeout).build();
    match built {
        Ok(client) => Ok(client),
        Err(e) => Err(LlmError::ProviderUnavailable(format!(
            "Failed to create HTTP client: {e}"
        ))),
    }
}

/// Gate a response on its HTTP status.
///
/// A successful response is handed back untouched so the caller can still
/// `.json()` / `.bytes_stream()` it. Otherwise the body is drained and
/// returned inside [`LlmError::Api`] together with the numeric status; a
/// body that cannot be read becomes an empty message.
pub(crate) async fn ensure_ok(resp: reqwest::Response) -> Result<reqwest::Response, LlmError> {
    let status = resp.status();
    if !status.is_success() {
        let message = resp.text().await.unwrap_or_default();
        return Err(LlmError::Api {
            status: status.as_u16(),
            message,
        });
    }
    Ok(resp)
}

/// LLM response chunk (for streaming).
///
/// This is the item type of the stream returned by
/// [`LlmProvider::generate_stream`].
#[derive(Debug, Clone)]
pub struct ResponseChunk {
    /// Text carried by this chunk.
    pub content: String,
    /// Completion flag.
    /// NOTE(review): presumably `true` only on the final chunk of a
    /// stream — confirm against the provider implementations.
    pub is_done: bool,
}

/// A tool the model may call, in the provider-agnostic shape the kernel
/// hands down the tools channel. `parameters` is a JSON Schema object
/// describing the call arguments (the same `input_schema` a
/// [`intent::ToolDescriptor`](intent) carries). Producers route any
/// untrusted `description` through `intent::sanitization` before it
/// reaches a provider.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolDef {
    /// Name of the tool.
    pub name: String,
    /// Description of the tool. Untrusted text must already have been
    /// sanitised by the producer (see the type-level note).
    pub description: String,
    /// JSON Schema object describing the call arguments.
    pub parameters: serde_json::Value,
}

/// A tool call the model proposed in its response. Awareness ≠ permission:
/// a proposed call is *not* executed here — the caller resolves it to a
/// route and runs it through the consent/audit path.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ProposedToolCall {
    /// Provider-assigned call id (OpenAI sets one; Ollama may not).
    /// Omitted from the serialised form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub id: Option<String>,
    /// Name of the tool the model wants to call.
    pub name: String,
    /// Parsed call arguments. Providers send these as a JSON string; we
    /// parse to a [`serde_json::Value`] so the caller never re-parses.
    pub arguments: serde_json::Value,
}

/// Complete LLM response.
///
/// The derived `Default` is an empty answer: empty `content`, no `usage`,
/// no tool calls.
#[derive(Debug, Clone, Default)]
pub struct Response {
    /// Text of the answer.
    pub content: String,
    /// Token usage statistics for this response, when available.
    pub usage: Option<Usage>,
    /// Tool calls the model proposed this turn. Empty for a plain text
    /// answer or for any provider without a tools channel.
    pub tool_calls: Vec<ProposedToolCall>,
}

impl Response {
    /// Build a text-only response: `content` and `usage` are taken from
    /// the arguments and `tool_calls` is left empty. This is the common
    /// case for providers and mocks that don't use the tools channel.
    pub fn text(content: impl Into<String>, usage: Option<Usage>) -> Self {
        let content = content.into();
        Self {
            content,
            usage,
            ..Self::default()
        }
    }
}

/// Token usage statistics.
#[derive(Debug, Clone)]
pub struct Usage {
    /// Tokens counted for the prompt.
    pub prompt_tokens: u32,
    /// Tokens counted for the completion.
    pub completion_tokens: u32,
    /// Total token count.
    /// NOTE(review): presumably `prompt_tokens + completion_tokens` as
    /// reported by the provider — nothing in this file enforces that.
    pub total_tokens: u32,
}

// ─── Provider Trait ─────────────────────────────────────────────────────────

/// Trait for LLM providers.
///
/// Implementations must be `Send + Sync`; this module passes them around
/// as `Box<dyn LlmProvider>`. `#[async_trait]` is what allows the `async fn`
/// methods on a trait used as a trait object.
#[async_trait::async_trait]
pub trait LlmProvider: Send + Sync {
    /// Generate a complete response (non-streaming).
    async fn generate(&self, messages: &[Message]) -> Result<Response, LlmError>;

    /// Generate with an optional tools channel. Providers that support
    /// function-calling override this to advertise `tools` and surface any
    /// proposed calls in [`Response::tool_calls`]; the default ignores
    /// `tools` and delegates to [`generate`](LlmProvider::generate), so a
    /// chat turn degrades gracefully to a plain text answer on a provider
    /// (or mock) without a tools channel.
    async fn generate_with_tools(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
    ) -> Result<Response, LlmError> {
        // Explicitly discard `tools`: the default implementation has no
        // tools channel, and this keeps the parameter from being flagged
        // as unused.
        let _ = tools;
        self.generate(messages).await
    }

    /// Generate a streaming response.
    ///
    /// On success returns a pinned, boxed, `Send` stream whose items are
    /// either a [`ResponseChunk`] or an [`LlmError`]; the outer `Result`
    /// reports a failure to start the stream at all.
    async fn generate_stream(
        &self,
        messages: &[Message],
    ) -> Result<Pin<Box<dyn Stream<Item = Result<ResponseChunk, LlmError>> + Send>>, LlmError>;

    /// Check if the provider is available.
    async fn health_check(&self) -> bool;

    /// Get the provider name.
    fn name(&self) -> &str;

    /// Get the active model name.
    fn model(&self) -> &str;

    /// List models available from this provider. Used by `select_provider`
    /// to probe reachability and match `preferred_models` during startup.
    async fn list_models(&self) -> Result<Vec<String>, LlmError>;

    /// Probe the provider for the active model's context window (in tokens).
    /// Returns `None` when the provider doesn't expose this information.
    /// Providers that advertise `context_length` in their API (OpenRouter,
    /// Ollama) override this for accurate detection; all providers get a
    /// model-name heuristics fallback via [`known_context_window`].
    async fn fetch_context_window(&self) -> Option<usize> {
        // Default: no API probe, only the name-based heuristic on the
        // active model.
        known_context_window(self.model())
    }
}

// ─── Provider Factory ───────────────────────────────────────────────────────

/// Configuration for LLM provider selection.
///
/// Consumed by [`create_provider`], which decides which concrete provider
/// to build from `provider` and passes the remaining fields to it.
#[derive(Debug, Clone)]
pub struct ProviderConfig {
    /// Provider kind: `"ollama"`, `"openai_compat"`, or the name of a
    /// built-in preset known to `crate::presets`.
    pub provider: String,
    /// Base URL of the provider's API. For the OpenAI-compatible path an
    /// empty string means "use the preset's default".
    pub base_url: String,
    /// API key, if any. Only passed to the OpenAI-compatible provider.
    pub api_key: Option<String>,
    /// Model name to use.
    pub model: String,
    /// Temperature passed to the provider constructor.
    pub temperature: f64,
    /// Token limit passed to the provider constructor.
    pub max_tokens: i32,
}

impl Default for ProviderConfig {
    /// Defaults point at an Ollama server on `localhost:11434` with no API
    /// key, the `qwen2.5-coder:7b` model, temperature `0.7`, and a
    /// 4096-token limit.
    fn default() -> Self {
        let provider = String::from("ollama");
        let base_url = String::from("http://localhost:11434");
        let model = String::from("qwen2.5-coder:7b");
        Self {
            provider,
            base_url,
            api_key: None,
            model,
            temperature: 0.7,
            max_tokens: 4096,
        }
    }
}

/// Build a boxed LLM provider from `config`.
///
/// How `config.provider` is resolved:
/// 1. `ollama` → `OllamaProvider`. If construction fails, the error is
///    logged and the default Ollama configuration is used instead.
/// 2. `openai_compat`, or any built-in preset name (openai, openrouter,
///    groq, deepseek, together, gemini-compat) → OpenAI-compatible
///    provider. A non-empty `base_url` takes precedence over the preset's
///    default URL.
/// 3. Anything else → a warning is logged and the default Ollama
///    configuration is used.
///
/// # Errors
///
/// Returns [`LlmError::ProviderUnavailable`] when the OpenAI-compatible
/// path has neither an explicit nor a preset `base_url`, and propagates
/// any error from the provider constructors.
pub fn create_provider(config: &ProviderConfig) -> Result<Box<dyn LlmProvider>, LlmError> {
    if config.provider == "ollama" {
        let attempt = OllamaProvider::new(
            &config.base_url,
            &config.model,
            config.temperature,
            config.max_tokens,
        );
        let ollama = match attempt {
            Ok(built) => built,
            Err(e) => {
                tracing::error!(error = %e, "Failed to create Ollama provider, falling back to default");
                OllamaProvider::default_config()?
            }
        };
        return Ok(Box::new(ollama));
    }

    let preset_base = crate::presets::resolve(&config.provider).map(|p| p.base_url);
    let openai_shaped = config.provider == "openai_compat" || preset_base.is_some();

    if !openai_shaped {
        tracing::warn!(
            provider = %config.provider,
            "Unknown LLM provider, falling back to default Ollama"
        );
        return Ok(Box::new(OllamaProvider::default_config()?));
    }

    // An explicit URL wins; otherwise the preset's default; otherwise there
    // is nothing to connect to.
    let base_url = match preset_base {
        _ if !config.base_url.is_empty() => config.base_url.as_str(),
        Some(preset_url) => preset_url,
        None => {
            return Err(LlmError::ProviderUnavailable(format!(
                "provider `{}` has no base_url configured",
                config.provider
            )));
        }
    };

    let openai = OpenAiProvider::new(
        base_url,
        config.api_key.as_deref(),
        &config.model,
        config.temperature,
        Some(config.max_tokens),
    )?;
    Ok(Box::new(openai))
}

// ─── Multi-provider selection ───────────────────────────────────────────────

/// Turn a `brain::ProviderEntry` plus the shared `temperature` /
/// `max_tokens` into a `ProviderConfig`. When `model_override` is given it
/// replaces the entry's own model, which is how `select_provider` swaps in
/// a preferred model discovered via `list_models`.
fn provider_config_from_entry(
    entry: &brain::ProviderEntry,
    temperature: f64,
    max_tokens: i32,
    model_override: Option<&str>,
) -> ProviderConfig {
    // Issue 125: the key stored in `api_key_file` takes precedence over the
    // inline `api_key`. An unreadable or empty file is only logged and the
    // inline key is used instead — a typo in one entry must not disable an
    // unrelated working entry, and the provider call itself will report a
    // missing key more clearly.
    let inline_key = || entry.api_key.trim().to_string();

    let api_key = match entry.api_key_file.as_ref() {
        None => inline_key(),
        Some(path) => match std::fs::read_to_string(path) {
            Ok(raw) if !raw.trim().is_empty() => raw.trim().to_string(),
            Ok(_) => {
                tracing::warn!(
                    provider = %entry.name,
                    path = %path.display(),
                    "llm.providers[].api_key_file is empty; falling back to inline api_key"
                );
                inline_key()
            }
            Err(e) => {
                tracing::warn!(
                    provider = %entry.name,
                    path = %path.display(),
                    error = %e,
                    "llm.providers[].api_key_file unreadable; falling back to inline api_key"
                );
                inline_key()
            }
        },
    };

    let model = match model_override {
        Some(chosen) => chosen.to_string(),
        None => entry.model.to_string(),
    };

    ProviderConfig {
        provider: entry.kind.clone(),
        base_url: entry.base_url.clone(),
        // A blank key is treated as "no key".
        api_key: Some(api_key).filter(|key| !key.is_empty()),
        model,
        temperature,
        max_tokens,
    }
}

/// Walk the configured providers in order and return the first one that
/// answers `list_models`, using the first of its `preferred_models` that
/// the live list contains (or the entry's own model if none match).
///
/// With an empty `llm.providers`, a single entry is synthesised from the
/// legacy `llm.provider`/`model`/`base_url`/`api_key` fields, so existing
/// configs keep working unchanged.
///
/// Fail-safe: when no provider answers `list_models`, the first entry is
/// still returned as a best effort; the real problem then surfaces on the
/// first generate call.
///
/// # Errors
///
/// Returns [`LlmError::ProviderUnavailable`] when there are no entries at
/// all, and propagates construction errors for the entry finally chosen.
pub async fn select_provider(llm: &brain::LlmConfig) -> Result<Box<dyn LlmProvider>, LlmError> {
    let entries = synthesise_entries(llm);
    let first = match entries.first() {
        Some(entry) => entry,
        None => {
            return Err(LlmError::ProviderUnavailable(
                "no LLM providers configured".into(),
            ));
        }
    };
    let max_tokens = llm.max_tokens as i32;
    let temperature = llm.temperature;

    for entry in &entries {
        let probe_cfg = provider_config_from_entry(entry, temperature, max_tokens, None);
        let probe = match create_provider(&probe_cfg) {
            Ok(provider) => provider,
            Err(e) => {
                tracing::warn!(name = %entry.name, error = %e, "skipping provider — construction failed");
                continue;
            }
        };

        let models = match probe.list_models().await {
            Ok(models) => models,
            Err(e) => {
                tracing::warn!(
                    name = %entry.name,
                    error = %e,
                    "provider unreachable — trying next"
                );
                continue;
            }
        };

        // Reachable: rebuild the provider with the model we settled on.
        let chosen = pick_model(&entry.preferred_models, &models, &entry.model);
        tracing::info!(
            name = %entry.name,
            kind = %entry.kind,
            model = %chosen,
            "LLM provider selected"
        );
        let selected_cfg =
            provider_config_from_entry(entry, temperature, max_tokens, Some(chosen.as_str()));
        return create_provider(&selected_cfg);
    }

    // Nothing answered — keep startup going with the first entry and let
    // the caller hit the real failure on its first generate().
    tracing::warn!(
        name = %first.name,
        "no provider answered list_models — falling back to first entry"
    );
    let fallback_cfg = provider_config_from_entry(first, temperature, max_tokens, None);
    create_provider(&fallback_cfg)
}

/// Assemble a failover chain covering every configured provider.
///
/// Ordering: the provider that wins the startup probe comes first, and the
/// other entries follow in config order as fallbacks (built without being
/// probed). At request time the chain tries each in order whenever the
/// current provider returns a retriable error (429 / 5xx / unavailable /
/// timeout).
///
/// # Errors
///
/// Returns [`LlmError::ProviderUnavailable`] when no providers are
/// configured, and propagates a construction failure of the primary.
/// Fallbacks that fail to construct are logged and skipped.
pub async fn build_failover_chain(
    llm: &brain::LlmConfig,
) -> Result<failover::FailoverProvider, LlmError> {
    let entries = synthesise_entries(llm);
    if entries.is_empty() {
        return Err(LlmError::ProviderUnavailable(
            "no LLM providers configured".into(),
        ));
    }
    let max_tokens = llm.max_tokens as i32;
    let temperature = llm.temperature;

    // Probe in config order for the primary (same logic as select_provider).
    let mut winner: Option<(usize, String)> = None;
    for (idx, entry) in entries.iter().enumerate() {
        let probe_cfg = provider_config_from_entry(entry, temperature, max_tokens, None);
        let probe = match create_provider(&probe_cfg) {
            Ok(provider) => provider,
            Err(e) => {
                tracing::warn!(name = %entry.name, error = %e, "skipping provider — construction failed");
                continue;
            }
        };
        let models = match probe.list_models().await {
            Ok(models) => models,
            Err(e) => {
                tracing::warn!(name = %entry.name, error = %e, "provider unreachable — trying next");
                continue;
            }
        };
        let chosen = pick_model(&entry.preferred_models, &models, &entry.model);
        tracing::info!(
            name = %entry.name,
            kind = %entry.kind,
            model = %chosen,
            "LLM provider selected"
        );
        winner = Some((idx, chosen));
        break;
    }

    // Best effort when every probe failed: entry 0 with its own model.
    let (primary_i, primary_model) = match winner {
        Some(found) => found,
        None => {
            tracing::warn!("no provider answered list_models — using first entry as primary");
            (0, entries[0].model.clone())
        }
    };

    // The primary must construct; its failure aborts the whole chain.
    let primary_cfg = provider_config_from_entry(
        &entries[primary_i],
        temperature,
        max_tokens,
        Some(primary_model.as_str()),
    );
    let primary = create_provider(&primary_cfg)?;

    let mut providers: Vec<Box<dyn LlmProvider>> = Vec::with_capacity(entries.len());
    providers.push(primary);

    // Everything else follows in config order.
    let fallbacks = entries
        .iter()
        .enumerate()
        .filter(|(idx, _)| *idx != primary_i);
    for (_, entry) in fallbacks {
        let cfg = provider_config_from_entry(entry, temperature, max_tokens, None);
        match create_provider(&cfg) {
            Ok(provider) => {
                tracing::info!(name = %entry.name, "registered as fallback provider");
                providers.push(provider);
            }
            Err(e) => {
                tracing::warn!(name = %entry.name, error = %e, "fallback provider construction failed — skipping");
            }
        }
    }

    Ok(failover::FailoverProvider::new(providers))
}

/// Return the provider entries to work with: a clone of `llm.providers`
/// when it is non-empty, otherwise one entry named `"default"` built from
/// the legacy single-provider fields.
fn synthesise_entries(llm: &brain::LlmConfig) -> Vec<brain::ProviderEntry> {
    if llm.providers.is_empty() {
        // Single-provider fallback path — a legitimate use of the
        // deprecated legacy `llm.{provider,model,base_url,api_key}` fields
        // (Issue 40). The startup warning fires when both shapes are set;
        // with `providers[]` empty, these fields are the only way to know
        // which transport to talk to.
        #[allow(deprecated)]
        let legacy = brain::ProviderEntry {
            name: "default".to_string(),
            kind: llm.provider.clone(),
            base_url: llm.base_url.clone(),
            api_key: llm.api_key.clone(),
            api_key_file: llm.api_key_file.clone(),
            model: llm.model.clone(),
            preferred_models: Vec::new(),
        };
        return vec![legacy];
    }
    llm.providers.clone()
}

/// Fallback context-window heuristics based on model name patterns.
/// Used by providers whose API doesn't expose `context_length` (OpenAI,
/// Groq, DeepSeek, Together, etc.) and as a second-chance fallback after
/// API-based detection fails.
///
/// Matching is case-insensitive (ASCII) and the rules are tried top to
/// bottom, so the order of the branches matters. Returns the assumed
/// window in tokens, or `None` when no rule matches.
pub(crate) fn known_context_window(model: &str) -> Option<usize> {
    let lower = model.to_ascii_lowercase();
    let lower = lower.as_str();

    // Gemini 1.5/2.0/2.5 — treated as 1M token windows.
    // `gemini-2.0-flash-lite` is excluded here and falls through to the
    // generic rules below.
    if lower.contains("gemini") && !lower.contains("gemini-2.0-flash-lite") {
        return Some(1_000_000);
    }

    // Claude — Opus, Sonnet, Haiku and any unrecognised variant all map to
    // 200K, so one check covers them.
    if lower.contains("claude") {
        return Some(200_000);
    }

    // GPT-4o / GPT-4.5 / GPT-4-turbo — all 128K. Must precede the generic
    // gpt-4 branch below, which would otherwise swallow these.
    if lower.contains("gpt-4o") || lower.contains("gpt-4.5") || lower.contains("gpt-4-turbo") {
        return Some(128_000);
    }
    // GPT-3.5 — 16K.
    if lower.contains("gpt-3.5") {
        return Some(16_000);
    }
    // GPT-4 (non-4o, non-turbo) — 32K safe fallback (base gpt-4 is 8K, but
    // 32K is the common denominator for modern gpt-4-* and the heuristic is
    // generous).
    if lower.contains("gpt-4") {
        return Some(32_000);
    }
    // o1 / o3 reasoning models — one shared value of 200K for both prefixes.
    if lower.starts_with("o1") || lower.starts_with("o3") {
        return Some(200_000);
    }

    // DeepSeek V2/V3/R1 — all 128K.
    if lower.contains("deepseek") {
        return Some(128_000);
    }

    // Qwen 2.5 — 128K default; smaller quantised variants keep it.
    if lower.contains("qwen") {
        return Some(128_000);
    }

    // Llama 3.x — 128K; every other Llama falls back to 8K. The generation
    // digit must sit right after "llama": a bare `contains("3")` would also
    // fire on parameter counts such as `llama2:13b` or `codellama-34b`.
    if lower.contains("llama") {
        return Some(if mentions_llama_gen3(lower) {
            128_000
        } else {
            8_192
        });
    }

    // Mistral / Mixtral — Large/Nemo/Codestral = 128K; others = 32K.
    if lower.contains("mistral") || lower.contains("mixtral") {
        if lower.contains("large") || lower.contains("nemo") || lower.contains("codestral") {
            return Some(128_000);
        }
        return Some(32_000);
    }

    // Command R / R+ (Cohere) — 128K. "command-r" is a prefix of
    // "command-r+", so a single check covers both.
    if lower.contains("command-r") {
        return Some(128_000);
    }

    // DBRX / MPT — 32K.
    if lower.contains("dbrx") || lower.contains("mpt") {
        return Some(32_000);
    }

    // OpenRouter community / open-source models — typically 128K.
    // Model IDs like "openai/gpt-oss-120b:free" contain "oss", "120b", etc.
    // These don't match the commercial model patterns above, so catch
    // them by looking for explicit context indicators in the name.
    if lower.contains("128k") || lower.contains("131k") || lower.contains("131072") {
        return Some(131_072);
    }
    if lower.contains("200k") {
        return Some(200_000);
    }
    if lower.contains("1m") || lower.contains("1000k") {
        return Some(1_000_000);
    }

    // Models with "70b", "120b", "180b", "240b" in the name are large
    // open-source models that almost always use 128K context windows.
    if lower.contains("70b")
        || lower.contains("120b")
        || lower.contains("180b")
        || lower.contains("240b")
    {
        return Some(131_072);
    }

    // "oss" (open-source) models on OpenRouter — 128K.
    if lower.contains("/oss") || lower.contains("oss-") || lower.contains("-oss") {
        return Some(131_072);
    }

    None
}

/// True when some occurrence of "llama" in the already-lowercased name is
/// directly followed (after optional `-`, `_`, space or `v` separators) by
/// a generation digit `3`, e.g. `llama3.1`, `llama-3-8b`, `llama-v3p1`.
///
/// A `3` that starts a larger number (`llama-34b`, `llama-30b`) or a size
/// suffix (`llama-3b`) is not a generation marker.
fn mentions_llama_gen3(lower: &str) -> bool {
    lower.match_indices("llama").any(|(idx, needle)| {
        let rest = lower[idx + needle.len()..]
            .trim_start_matches(|c: char| matches!(c, '-' | '_' | ' ' | 'v'));
        let mut following = rest.chars();
        following.next() == Some('3')
            && !matches!(following.next(), Some(c) if c.is_ascii_digit() || c == 'b')
    })
}

/// Choose the model to run: the first entry of `preferred` (in preference
/// order) that also appears verbatim in `available`, or `fallback` when
/// none of them do.
fn pick_model(preferred: &[String], available: &[String], fallback: &str) -> String {
    preferred
        .iter()
        .find(|candidate| available.contains(*candidate))
        .cloned()
        .unwrap_or_else(|| fallback.to_owned())
}

/// Extract a JSON object from an LLM response string.
///
/// LLMs sometimes wrap JSON in markdown fences or explanatory text.
/// This tries a direct parse of the trimmed input first; if that fails it
/// parses the span from the first `{` to the last `}`.
///
/// Returns `None` when neither attempt deserialises into `T`, when the
/// input has no `{` or no `}`, or when the last `}` comes before the first
/// `{`. Never panics on malformed input.
pub fn extract_json_from_response<T: serde::de::DeserializeOwned>(raw: &str) -> Option<T> {
    let trimmed = raw.trim();
    if let Ok(parsed) = serde_json::from_str::<T>(trimmed) {
        return Some(parsed);
    }
    let start = trimmed.find('{')?;
    let end = trimmed.rfind('}')?;
    // Checked slicing: with a stray `}` ahead of the first `{` (e.g.
    // "} text {") the range is inverted, and `&trimmed[start..=end]` would
    // panic on model output we do not control.
    let candidate = trimmed.get(start..=end)?;
    serde_json::from_str::<T>(candidate).ok()
}